Python lockInfo Exemples, utils.lockInfo Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : lockor.py Projet : hufnagel/WmAgentScripts

# Excerpt from the lockor script: read configuration, collect per-campaign
# custodial overrides, then lock every dataset named in the ad-hoc lock list
# before starting the per-status scan.
CI = campaignInfo()
tier_no_custodial = UC.get('tiers_with_no_custodial')
tiers_keep_on_disk = UC.get("tiers_keep_on_disk")

# NOTE(review): time.mktime() interprets its argument as *local* time while
# time.gmtime() returns UTC, so `now` differs from the true epoch time by the
# local UTC offset -- confirm this is what downstream comparisons expect.
now = time.mktime(time.gmtime())

## Collect the 'custodial_override' setting of every campaign that defines one,
## keyed by campaign name.
## (open question from the author: can we catch the datasets that actually
## should go to tape?)
custodial_override = {}
for c in CI.campaigns:
    if 'custodial_override' in CI.campaigns[c]:
        custodial_override[c] = CI.campaigns[c]['custodial_override']

# Datasets locked during this run (filled below by the ad-hoc list).
newly_locking = set()
# Not filled in this excerpt -- presumably populated further down; verify.
also_locking_from_reqmgr = set()

LI = lockInfo()

## Ad-hoc list of things to lock, read as JSON from addhoc_lock.json under
## base_eos_dir. Emptying this list would result in unlocking later.
addHocLocks = json.loads(eosRead('%s/addhoc_lock.json' % base_eos_dir))

time_point("Starting addhoc")

for item in addHocLocks:
    # Keep only the part before the first '#'
    # (presumably reducing a block name to its dataset name -- TODO confirm).
    ds = item.split('#')[0]
    LI.lock(ds, reason='addhoc lock')
    newly_locking.add(ds)

time_point("Starting reversed statuses check")

for status in statuses:
    # NOTE(review): the timestamp comes from time.gmtime() (UTC) but the
    # message labels it "CEST" -- confirm which one is intended.
    print time.asctime(time.gmtime()), "CEST, fetching", status

Exemple #2

0

Afficher le fichier

Fichier : assignor.py Projet : prozober/WmAgentScripts

def assignor(url ,specific = None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    #SI = siteInfo()
    SI = global_SI()
    #NLI = newLockInfo()
    #if not NLI.free() and not options.go: return
    LI = lockInfo()
    if not LI.free() and not options.go: return

    n_assigned = 0
    n_stalled = 0

    wfos=[]
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered','staging'])
    if specific:
        fetch_from.extend(['considered-tried'])
    
    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from",fetch_from

    for status in fetch_from:
        wfos.extend(session.query(Workflow).filter(Workflow.status==status).all())

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    dataset_endpoints = json.loads(open('%s/dataset_endpoints.json'%monitor_dir).read())
    aaa_mapping = json.loads(open('%s/equalizor.json'%monitor_pub_dir).read())['mapping']

    all_stuck = set()
    all_stuck.update( json.loads( open('%s/stuck_transfers.json'%monitor_dir).read() ))
    all_stuck.update( getAllStuckDataset()) 

    max_per_round = UC.get('max_per_round').get('assignor',None)
    max_cpuh_block = UC.get('max_cpuh_block')
    random.shuffle( wfos )
    for wfo in wfos:
        
        if options.limit and (n_stalled+n_assigned)>options.limit:
            break

        if max_per_round and (n_stalled+n_assigned)>max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo( url, wfo.name)

        if options.priority and int(wfh.request['RequestPriority']) < options.priority:
            continue

        options_text=""
        if options.early: options_text+=", early option is ON"
        if options.partial: 
            options_text+=", partial option is ON"
            options_text+=", good fraction is %.2f"%options.good_enough
        


        wfh.sendLog('assignor',"%s to be assigned%s"%(wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput,primary,parent,secondary, sites_allowed) = wfh.getSiteWhiteList()
        output_tiers = list(set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))
        
        
        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',"%s are stuck input"%(','.join( is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = False
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update( CI.campaigns[campaign] )

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update( CI.campaigns[campaign]['secondaries'] )
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[campaign]:
                banned_tier = list(set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers))
                if banned_tier:
                    no_go=True
                    wfh.sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)))
                    sendLog('assignor','These data tiers %s are not allowed'%(','.join( banned_tier)), level='critical')

        if secondary and check_secondary:
            if (set(secondary)&set(allowed_secondary.keys())!=set(secondary)):
                wfh.sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))))
                sendLog('assignor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))), level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:# and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update( allowed_secondary[sec] )

        if no_go:
            n_stalled+=1 
            continue


            
        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] !='assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',"setting %s away and skipping"%wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name,wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version=wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor',"cannot decide on version number")
                n_stalled+=1
                wfo.status = 'trouble'
                session.commit()
                continue


        original_sites_allowed = copy.deepcopy( sites_allowed )
        wfh.sendLog('assignor',"Site white list %s"%sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'], 'SecondaryLocation', [])

        blocks = []
        if 'BlockWhitelist' in wfh.request:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) ))

        wfh.sendLog('assignor',"Allowed %s"%sorted(sites_allowed))
        secondary_locations=None

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False #options.good_enough if options.partial else 0

        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns:
            assign_parameters.update( CI.campaigns[wfh.request['Campaign']] )

        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']
        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
            print "Could do partial disk copy assignment"
            if is_stuck or options.partial:
                do_partial = assign_parameters['partial_copy']
                wfh.sendLog('assignor',"Overiding partial copy assignment to %.2f fraction"% do_partial)
                #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name))
            
        do_partial = options.good_enough if options.partial else do_partial


        for sec in list(secondary):
            if override_sec_location: 
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                #sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass
            if secondary_aaa:
                #just continue without checking
                continue

            presence = getDatasetPresence( url, sec )
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations==None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations]
            
        wfh.sendLog('assignor',"From secondary requirement, now Allowed%s"%sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy( sites_allowed ) ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy( sites_allowed )
        sites_with_data = copy.deepcopy( sites_allowed )
        sites_with_any_data = copy.deepcopy( sites_allowed )
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc' ## by default
        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor",dataset_endpoints[prim]
                endpoints.update( dataset_endpoints[prim] )
            set_lfn = getLFNbase( prim )
            presence = getDatasetPresence( url, prim , only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] =  getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks)
            #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]]
            sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]]
            sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()]
            wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))))
            if primary_locations==None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(set(primary_locations) & set(presence.keys() ))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites=[]
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            elif primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            else:
                opportunistic_sites = []
            wfh.sendLog('assignor',"We could be running in addition at %s"% sorted(opportunistic_sites))
            if any([osite in SI.sites_not_ready for osite in opportunistic_sites]):
                wfh.sendLog('assignor',"One of the usable site is in downtime %s"%([osite for osite in opportunistic_sites if osite in SI.sites_not_ready]))
                down_time = True
                ## should this be send back to considered ?
                

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted,cpuh = wfh.getNCopies()
        wfh.sendLog('assignor',"we need %s CPUh"%cpuh)
        if cpuh>max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog('assignor','%s requires a large numbr of CPUh %s , not assigning, please check with requester'%( wfo.name, cpuh), level='critical')
            wfh.sendLog('assignor',"Requiring a large number of CPUh %s, not assigning"%cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)
        
        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(1,copies_wanted-less_copies_than_requested) # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',"needed availability fraction %s"% copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor',"Allowed sites :%s"% sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_all_data:
                wfh.sendLog('assignor',"Overiding the primary on AAA setting to Off")
                primary_aaa=False
            else:
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update( aaa_mapping.get(site,[]) )
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog('assignor',"Selected to read primary through xrootd %s"%sorted(sites_allowed))
                
        if not primary_aaa:
            sites_allowed = sites_with_any_data
            wfh.sendLog('assignor',"Selected for any data %s"%sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready for osite in end_sites):
                    print "Flip the status of downtime, since our destinations are good"
                    down_time = False
                print "with added endpoints",sorted(end_sites)
            else:
                print "Cannot do partial assignment without knowin the endpoints"
                n_stalled+=1
                continue
            
            
        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue


        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog('assignor',"The workflow can run at %s under low pressure currently"%( ','.join( allowed_and_low )))
            copies_wanted = max(1., copies_wanted-1.)


        if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]):
            not_even_once = not all([available>=1. for available in available_fractions.values()])
            above_good = all([available >= do_partial for available in available_fractions.values()])
            wfh.sendLog('assignor',"The input dataset is not available %s times, only %s"%( copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog('assignor',"sending back to considered because of site downtime, instead of waiting")
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog('assignor','%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'%( wfo.name ), level='delay')
                n_stalled+=1
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (do_partial and above_good):
                    wfh.sendLog('assignor',"cannot be assigned, %s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append( wfo.name )
                    open('cannot_assign.json','w').write(json.dumps( known, indent=2))
                
                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor',"setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is",wfo.status
                if do_partial and above_good:
                    print "Will move on with partial locations"
                else:
                    n_stalled+=1
                    continue

        if not len(sites_allowed):
            if not options.early:
                wfh.sendLog('assignor',"cannot be assign with no matched sites")
                sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
            n_stalled+=1
            continue


        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]
            
            
        wfh.sendLog('assignor',"Placing the output on %s"%sites_out)
        parameters={
            'SiteWhitelist' : sites_allowed,
            'NonCustodialSites' : sites_out,
            'AutoApproveSubscriptionSites' : list(set(sites_out)),
            'AcquisitionEra' : wfh.acquisitionEra(),
            'ProcessingString' : wfh.processingString(),
            'MergedLFNBase' : set_lfn,
            'ProcessingVersion' : version,
            }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog('assignor',"Reading primary through xrootd at %s"%sorted(sites_allowed))            

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog('assignor',"Reading secondary through xrootd at %s"%sorted(sites_allowed))            


        if 'parameters' in assign_parameters:
            parameters.update( assign_parameters['parameters'] )

        ## plain assignment here
        team='production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team

        if False and 'T2_CH_CERN' in parameters['SiteWhitelist']:
            ## add some check on 
            ### the amount pending to HLT
            ### the size of the request
            ### the priority of the request (maybe not if we decide to overflow during runs)
            parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT']
            team = 'hlt'
            ## reduce the splitting by factor of 4, regardless of type of splitting
            sendEmail("sending work to HLT","%s was assigned to HLT"%wfo.name)
            

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v=getattr(options,key)
                if v!=None:
                    if type(v)==str and ',' in v: 
                        parameters[key] = filter(None,v.split(','))
                    else: 
                        parameters[key] = v

        if lheinput:
            ## throttle reading LHE article 
            wfh.sendLog('assignor', 'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        ## pick up campaign specific assignment parameters
        #parameters.update( CI.parameters(wfh.request['Campaign']) )
        parameters.update( assign_parameters.get('parameters',{}) )

        if not options.test:
            parameters['execute'] = True

        split_check = wfh.checkWorkflowSplitting()
        if split_check!=True:
            parameters.update( split_check )
            if 'NoGo' in split_check.values():
                wfh.sendLog('assignor', "Failing splitting check")
                sendLog('assignor','the workflow %s is failing the splitting check. Verify in the logs'% wfo.name, level='critical')
                n_stalled+=1
                continue

            if 'EventBased' in split_check.values():
                wfh.sendLog('assignor', "Falling back to event splitting.")
                #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name)
                sendLog('assignor','the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting ?'%wfo.name, level='critical')
                ## we have a problem here, that EventBased should never be used as a backup
                if not options.go:  
                    n_stalled+=1
                    continue
                continue ## skip all together
            elif 'EventsPerJob' in split_check.values():
                wfh.sendLog('assignor', "Modifying the number of events per job")
                #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name)
                sendLog('assignor',"the workflow %s is too heavy in number of jobs explosion"%wfo.name, level='critical')
            elif 'EventsPerLumi' in split_check.values():
                wfh.sendLog('assignor', "Modifying the number of events per lumi to be able to process this")

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents/(reqJobs*1.4))
                lumisPerJob = int(eventsPerJob/eventsPerLumi)
                if lumisPerJob==0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob), level='critical')
                    wfh.sendLog('assignor', "%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl['avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob), level='critical')
                        wfh.sendLog('assignor',"%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog('assignor',"leaving splitting untouched for %s, please check on %s"%( pstring, wfo.name), level='critical')
                        wfh.sendLog('assignor',"leaving splitting untouched for PU_RD*, please check.")


        
        
        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)


        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned+=1
                wfh.sendLog('assignor',"Properly assigned\n%s"%(json.dumps( parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo( url, wfo.name)
                    (_,prim,_,sec) = new_wfi.getIO()
                    for secure in list(prim)+list(sec)+new_wfi.request['OutputDatasets']:
                        ## lock all outputs flat
                        #NLI.lock( secure )
                        LI.lock( secure, reason = 'assigning')
                    #for site in [SI.CE_to_SE(site) for site in sites_allowed]:
                    #    for output in new_wfi.request['OutputDatasets']:
                    #        LI.lock( output, site, 'dataset in production')
                    #    for primary in prim:
                    #        LI.lock( primary, site, 'dataset used in input')
                    #    for secondary in sec:
                    #        LI.lock( secondary, site, 'required for mixing' )

                except Exception as e:
                    print "fail in locking output"
                    
                    print str(e)
                    sendEmail("failed locking of output",str(e))


            else:
                wfh.sendLog('assignor',"Failed to assign. Please check the logs")
                print "ERROR could not assign",wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor',"Assigned %d Stalled %s"%(n_assigned, n_stalled))

Exemple #3

0

Afficher le fichier

Fichier : transferor.py Projet : vkuznet/WmAgentScripts

def transferor(url, specific=None, talk=True, options=None):
    if userLock(): return
    mlock = moduleLock()
    if mlock(): return

    use_mcm = True
    up = componentInfo(soft=['mcm', 'wtc', 'jira'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    #NLI = newLockInfo()
    #if not NLI.free(): return
    LI = lockInfo()
    if not LI.free(): return

    mcm = McMClient(dev=False)
    dss = DSS()

    #allowed_secondary = UC.get('')
    print "counting all being handled..."
    being_handled = len(
        session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('stag')).all())
    being_transfered = len(
        session.query(Workflow).filter(Workflow.status == 'staging').all())
    #being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance-')).filter(
                ~Workflow.status.contains('custodial')).all())

    max_to_handle = options.maxworkflows
    max_to_transfer = options.maxstaging

    allowed_to_handle = max(0, max_to_handle - being_handled)
    allowed_to_transfer = max(0, max_to_transfer - being_transfered)
    wf_buffer = 5
    if allowed_to_handle <= wf_buffer:  ## buffer for having several wf per transfer
        print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer"
    else:
        print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer"

    if allowed_to_transfer <= wf_buffer:
        print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer"
    else:
        print being_transfered, "already being transfered", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer"

    print "... done"

    all_transfers = defaultdict(list)
    workflow_dependencies = defaultdict(
        set)  ## list of wf.id per input dataset
    wfs_and_wfh = []
    max_per_round = UC.get('max_per_round').get('transferor', None)

    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    all_to_include = session.query(Workflow).filter(
        Workflow.status.startswith('considered')).all()
    if len(cache) > 2000:
        max_to_include = max_per_round
        random.shuffle(cache)  ## randomize first by wf name
        cache = sorted(cache, key=lambda r: r['RequestPriority'],
                       reverse=True)  ## order by prio
        highest = [r['RequestName'] for r in cache[:max_to_include]]
        all_to_include = [wfo for wfo in all_to_include if wfo.name in highest]
        print "limiting what to consider to", max_to_include, "because there is too much stuff going on. Got", len(
            all_to_include)

    for wfo in all_to_include:
        print "\t", wfo.name
        if specific and not specific in wfo.name: continue
        cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append((wfo,
                                workflowInfo(url,
                                             wfo.name,
                                             spec=False,
                                             request=cache_r[0])))
        else:
            wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False)))
    print "... done"

    transfers_per_sites = defaultdict(int)
    input_sizes = defaultdict(float)
    ignored_input_sizes = defaultdict(float)
    input_cput = {}
    input_st = {}
    ## list the size of those in transfer already
    in_transfer_priority = None
    min_transfer_priority = None
    print "getting all wf in staging ..."
    #stucks = json.loads(open('%s/stuck_transfers.json'%monitor_pub_dir).read())
    stucks = json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir))

    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfh = workflowInfo(url, wfo.name, spec=False)
        #(lheinput,primary,parent,secondary) = wfh.getIO()
        #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        print wfo.name, "staging"
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        for site in sites_allowed:  ## we should get the actual transfer destination instead of the full white list
            transfers_per_sites[site] += 1
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        blocks = wfh.getBlocks()
        for prim in primary:
            ds_s = dss.get(prim, blocks=blocks)
            if prim in stucks:
                wfh.sendLog(
                    'transferor',
                    "%s appears stuck, so not counting it %s [GB]" %
                    (prim, ds_s))
                ignored_input_sizes[prim] = max(ds_s,
                                                ignored_input_sizes[prim])
            else:
                input_sizes[prim] = max(ds_s, input_sizes[prim])
                wfh.sendLog('transferor',
                            "%s needs %s [GB]" % (wfo.name, ds_s))
        if in_transfer_priority == None:
            in_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            in_transfer_priority = max(in_transfer_priority,
                                       int(wfh.request['RequestPriority']))
        if min_transfer_priority == None:
            min_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            min_transfer_priority = min(min_transfer_priority,
                                        int(wfh.request['RequestPriority']))

    try:
        print "Ignored input sizes"
        ignored_values = list(ignored_input_sizes.items())
        ignored_values.sort(key=lambda i: i[1])
        print "\n".join(map(str, ignored_values))
        print "Considered input sizes"
        considered_values = list(input_sizes.items())
        considered_values.sort(key=lambda i: i[1])
        print "\n".join(map(str, considered_values))
    except Exception as e:
        print "trying to print the summary of input size"
        print str(e)

    print "... done"
    print "Max priority in transfer already", in_transfer_priority
    print "Min priority in transfer already", min_transfer_priority
    print "transfers per sites"
    print json.dumps(transfers_per_sites, indent=2)
    in_transfer_already = sum(input_sizes.values())
    cput_in_transfer_already = sum(input_cput.values())
    st_in_transfer_already = sum(input_st.values())

    ## list the size of all inputs
    primary_input_per_workflow_gb = defaultdict(float)
    print "getting all input sizes ..."
    input_blocks = {}
    for (wfo, wfh) in wfs_and_wfh:
        (_, primary, _, _) = wfh.getIO()
        blocks = wfh.getBlocks()
        input_blocks[wfo.name] = blocks
        for prim in primary:
            ## do not count it if it appears stalled !
            prim_size = dss.get(prim, blocks=blocks)
            input_sizes[prim] = max(prim_size, input_sizes[prim])
            primary_input_per_workflow_gb[wfo.name] += prim_size
    print "... done"

    # shuffle first by name
    random.shuffle(wfs_and_wfh)

    # Sort smallest transfers first; this lets us transfer as many workflows as possible.
    def prio_and_size(i, j):
        """cmp-style comparator on priority first, then on primary input size.

        `i` and `j` are pairs: element 0 exposes `.name`, element 1 exposes a
        `.request` mapping holding 'RequestPriority'.

        Returns the usual negative / zero / positive cmp() result:
          * different priorities: ordered by the integer priority;
          * same priority: ordered by the integer-truncated size recorded in
            primary_input_per_workflow_gb (0 when the name is missing), with
            the arguments swapped, so the larger input compares as "smaller".
        """
        prio_i = int(i[1].request['RequestPriority'])
        prio_j = int(j[1].request['RequestPriority'])
        if prio_i != prio_j:
            return cmp(prio_i, prio_j)

        # Tie on priority: compare sizes in reversed (j, i) order on purpose.
        size_j = int(primary_input_per_workflow_gb.get(j[0].name, 0))
        size_i = int(primary_input_per_workflow_gb.get(i[0].name, 0))
        return cmp(size_j, size_i)

    #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True)
    #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) ))
    #sort by priority higher first
    wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[
        'RequestPriority']), int(j[1].request['RequestPriority'])),
                     reverse=True)

    if min_transfer_priority == None or in_transfer_priority == None:
        print "nothing is lining up for transfer"
        sendLog(
            "transferor",
            "No request in staging, using first request to set priority limit")
        if len(wfs_and_wfh):
            min_transfer_priority = wfs_and_wfh[0][1].request[
                'RequestPriority']
            in_transfer_priority = wfs_and_wfh[0][1].request['RequestPriority']
        else:
            return

    cput_grand_total = sum(input_cput.values())
    cput_to_transfer = cput_grand_total - cput_in_transfer_already
    st_grand_total = sum(input_st.values())
    st_to_transfer = st_grand_total - st_in_transfer_already
    print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already
    print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer
    print "%15.4f [h] worth of absolute system time in transfer" % (
        cput_in_transfer_already / SI.availableSlots())
    print "%15.4f [h] worth of absolute system time is the current requested transfer load" % (
        cput_to_transfer / SI.availableSlots())
    print "%15.4f [h] worth of theoritical system time in transfer" % (
        st_in_transfer_already)
    print "%15.4f [h] worth of theoritical system time is the current requested transfer load" % (
        st_to_transfer)

    grand_total = sum(input_sizes.values())
    to_transfer = grand_total - in_transfer_already
    grand_transfer_limit = options.maxtransfer
    #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB

    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered" % in_transfer_already
    print "%15.4f GB is the current requested transfer load" % to_transfer
    print "%15.4f GB is the global transfer limit" % grand_transfer_limit
    print "%15.4f GB is the available limit" % transfer_limit

    max_staging_per_site = options.maxstagingpersite

    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer = 0  ## so that we can count'em
    passing_along = 0
    transfer_sizes = defaultdict(float)
    went_over_budget = False
    destination_cache = {}
    no_goes = set()

    if max_per_round and not spec:
        wfs_and_wfh = wfs_and_wfh[:max_per_round]

    for (wfo, wfh) in wfs_and_wfh:
        print wfo.name, "to be transfered with priority", wfh.request[
            'RequestPriority']

        if wfh.request['RequestStatus'] != 'assignment-approved':
            if wfh.request['RequestStatus'] in [
                    'aborted', 'rejected', 'rejected-archived',
                    'aborted-archived'
            ]:
                if wfh.isRelval():
                    wfo.status = 'forget'
                else:
                    wfo.status = 'trouble'  ## so that we look or a replacement
            else:
                wfo.status = 'away'
            wfh.sendLog(
                'transferor', '%s in status %s, setting %s' %
                (wfo.name, wfh.request['RequestStatus'], wfo.status))
            continue

        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        blocks = input_blocks.get(wfo.name, wfh.getBlocks())
        if blocks:
            print "Reading only", len(blocks), "blocks in input"
        this_load = sum([dss.get(prim, blocks=blocks) for prim in primary])
        no_budget = False
        if (this_load
                and (sum(transfer_sizes.values()) + this_load > transfer_limit
                     or went_over_budget)):
            if went_over_budget:
                wfh.sendLog('transferor', "Transfer has gone over bubget.")
            else:
                wfh.sendLog('transferor', "Transfer will go over bubget.")
            wfh.sendLog(
                'transferor',
                "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"
                % (this_load, sum(transfer_sizes.values()), transfer_limit))
            #if sum(transfer_sizes.values()) > transfer_limit:
            went_over_budget = True
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(
                        wfh.request['RequestPriority']
                ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over budget" %
                        (wfh.request['RequestPriority'], in_transfer_priority))
                else:
                    if not options.go:
                        wfh.sendLog(
                            'transferor',
                            "%s minimum priority %s < %s : stop" %
                            (min_transfer_priority,
                             wfh.request['RequestPriority'],
                             in_transfer_priority))
                        no_budget = True

        ## throttle by campaign go
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True
            no_goes.add(wfo.name)

        allowed_secondary = {}
        overide_parameters = {}
        check_secondary = (not wfh.isRelval())
        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                overide_parameters.update(CI.campaigns[campaign])
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'transferor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('transferor',
                            'These data tiers %s are not allowed in %s' %
                            (','.join(banned_tier), wfo.name),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                msg = '%s is not an allowed secondary' % (
                    ', '.join(set(secondary) - set(allowed_secondary.keys())))
                wfh.sendLog('transferor', msg)
                critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format(
                    wfh.getPrepIDs()[0])
                sendLog('transferor', critical_msg, level='critical')
                if not options.go:
                    no_go = True
            for sec in secondary:
                if sec in allowed_secondary:
                    overide_parameters.update(allowed_secondary[sec])

        if 'SiteWhitelist' in overide_parameters:
            sites_allowed = list(
                set(sites_allowed) & set(overide_parameters['SiteWhitelist']))
            wfh.sendLog(
                'transferor',
                'Intersecting with the overriding whitelist parameters, allowed sites become {}'
                .format(sites_allowed))

        if no_go:
            continue

        if passing_along >= allowed_to_handle:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(wfh.request['RequestPriority']
                       ) >= in_transfer_priority and int(
                           wfh.request['RequestPriority']
                       ) != min_transfer_priority:
                    ## higher priority, and not only this priority being transfered
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over %s" %
                        (wfh.request['RequestPriority'], in_transfer_priority,
                         max_to_handle))
                else:
                    wfh.sendLog(
                        'transferor',
                        " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"
                        % (max_to_handle, being_handled, passing_along))
                    if not options.go:
                        ## should not allow to jump that fence
                        break

        if this_load and needs_transfer >= allowed_to_transfer:
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(wfh.request['RequestPriority']
                       ) >= in_transfer_priority and int(
                           wfh.request['RequestPriority']
                       ) != min_transfer_priority:
                    ## higher priority, and not only this priority being transfered
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over %s" %
                        (wfh.request['RequestPriority'], in_transfer_priority,
                         max_to_transfer))
                else:
                    wfh.sendLog(
                        'transferor',
                        "Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s"
                        % (max_to_transfer, being_transfered, needs_transfer))
                    if not options.go:
                        no_budget = True

        if no_budget:
            continue
        #    break ## try this for a while to make things faster

        ## the site white list considers site, campaign, memory and core information
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')

        for dataset in list(primary) + list(parent) + list(secondary):
            LI.lock(dataset, reason='staging')

        if not sites_allowed:
            wfh.sendLog('transferor', "not possible site to run at")
            #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            sendLog('transferor',
                    "%s has no possible sites to run at" % (wfo.name),
                    level='critical')
            continue

        can_go = True
        staging = False
        allowed = True
        primary_destinations = set()
        if primary:

            copies_needed_from_CPUh, CPUh = wfh.getNCopies()

            if talk:
                print wfo.name, 'reads', ', '.join(primary), 'in primary'
            ## chop the primary dataset
            for prim in primary:
                ## keep track of what needs what
                workflow_dependencies[prim].add(wfo.id)

                max_priority[prim] = max(max_priority[prim],
                                         int(wfh.request['RequestPriority']))

                wfh.sendLog(
                    'transferor', "Would make %s  from cpu requirement %s" %
                    (copies_needed_from_CPUh, CPUh))
                copies_needed = copies_needed_from_CPUh

                if 'Campaign' in wfh.request and wfh.request[
                        'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                            wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[
                        wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign,
                                        copies_needed)

                    wfh.sendLog(
                        'transferor',
                        "Maxed to %s by campaign configuration %s" %
                        (copies_needed, wfh.request['Campaign']))

                if blocks:
                    print "limiting to blocks", "\n".join(sorted(blocks))
                ### new ways of making the whole thing
                destinations, all_block_names = getDatasetDestinations(
                    url,
                    prim,
                    within_sites=[SI.CE_to_SE(site) for site in sites_allowed],
                    only_blocks=blocks)
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [
                    site for (site, info) in destinations.items()
                    if info['completion'] == 100 and info['data_fraction'] == 1
                ]
                ## the rest is places it is going to be
                #prim_destination = [site for site in destinations.keys() if not site in prim_location]
                prim_destination = [
                    site for (site, info) in destinations.items()
                    if info['data_fraction'] == 1 and info['completion'] != 100
                ]
                ## veto the site with no current disk space, for things that are not relval
                prim_destination = [
                    site for site in prim_destination
                    if (SI.disk[site] or wfh.isRelval())
                ]

                if len(prim_location) >= copies_needed:
                    wfh.sendLog(
                        'transferor',
                        "The input is all fully in place at %s sites %s" %
                        (len(prim_location), sorted(prim_location)))
                    continue
                copies_needed = max(0, copies_needed - len(prim_location))
                wfh.sendLog(
                    'transferor',
                    "Counting existing copies ; now need %s" % copies_needed)
                copies_being_made = [
                    sum([
                        info['blocks'].keys().count(block)
                        for site, info in destinations.items()
                        if site in prim_destination
                    ]) for block in all_block_names
                ]

                latching_on_transfers = set()
                [
                    latching_on_transfers.update(info['blocks'].values())
                    for site, info in destinations.items()
                    if site in prim_destination
                ]
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [
                    site for site in sites_allowed
                    if not SI.CE_to_SE(site) in prim_location
                ]
                prim_to_distribute = [
                    site for site in prim_to_distribute
                    if not SI.CE_to_SE(site) in prim_destination
                ]
                ## take out the ones that cannot receive transfers
                potential_destinations = len(prim_to_distribute)
                #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer]
                prim_to_distribute = [
                    site for site in prim_to_distribute
                    if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval())
                ]

                ## do we want to restrict transfers if the amount of site in vetoe are too large ?

                wfh.sendLog(
                    'transferor',
                    "Could be going to: %s" % sorted(prim_to_distribute))
                if not prim_to_distribute or any([
                        transfers_per_sites[site] < max_staging_per_site
                        for site in prim_to_distribute
                ]):
                    ## means there is openings let me go
                    print "There are transfer slots available:", [
                        (site, transfers_per_sites[site])
                        for site in prim_to_distribute
                    ]
                else:
                    if int(
                            wfh.request['RequestPriority']
                    ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                        wfh.sendLog(
                            'transferor',
                            "Higher priority sample %s >= %s go-on over transfer slots available"
                            % (wfh.request['RequestPriority'],
                               in_transfer_priority))
                    else:
                        wfh.sendLog(
                            'transferor',
                            "Not allowed to transfer more than %s per site at a time. Going overboard for %s"
                            % (max_staging_per_site,
                               sorted([
                                   site for site in prim_to_distribute
                                   if transfers_per_sites[site] >=
                                   max_staging_per_site
                               ])))
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:

                    existings = session.query(TransferImp).filter(
                        TransferImp.phedexid == int(latching)).filter(
                            TransferImp.workflow_id == wfo.id).all()
                    if not existings:
                        tri = TransferImp(phedexid=int(latching), workflow=wfo)
                        print "adding", wfo.id, "with phedexid", latching
                        session.add(tri)
                    else:
                        for existing in existings:
                            existing.active = True

                    session.flush()

                    can_go = False
                    transfer_sizes[prim] = max(this_load, transfer_sizes[prim])
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0, copies_needed - min(copies_being_made))
                wfh.sendLog(
                    'transferor',
                    "Counting the copies being made ; then need %s" %
                    copies_needed)
                if copies_needed == 0:
                    wfh.sendLog(
                        'transferor',
                        "The input is either fully in place or getting in full somewhere with %s"
                        % latching_on_transfers)
                    can_go = True
                    continue
                elif len(prim_to_distribute) == 0:
                    wfh.sendLog(
                        'transferor',
                        "We are going to need extra copies of %s, but no destinations seems available"
                        % (prim))
                    sendLog(
                        'transferor',
                        "We are going to need extra copies of %s, but no destinations seems available"
                        % (prim),
                        level='critical')

                    print json.dumps(prim_to_distribute, indent=2)
                    print json.dumps(prim_location, indent=2)
                    print json.dumps(prim_destination, indent=2)

                    prim_to_distribute = [
                        site for site in sites_allowed
                        if not SI.CE_to_SE(site) in prim_location
                    ]
                    #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer ]
                    prim_to_distribute = [
                        site for site in prim_to_distribute
                        if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval())
                    ]

                    print "changed to"
                    print json.dumps(prim_to_distribute, indent=2)

                if len(
                        prim_to_distribute
                ) > 0:  ## maybe that a parameter we can play with to limit the
                    if not options or options.chop:
                        ### hard include the tape disk andpoint ?
                        #tapes = [site for site in  getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')]
                        chops, sizes = getDatasetChops(
                            prim,
                            chop_threshold=options.chopsize,
                            only_blocks=blocks)
                        spreading = distributeToSites(chops,
                                                      prim_to_distribute,
                                                      n_copies=copies_needed,
                                                      weights=SI.cpu_pledges,
                                                      sizes=sizes)
                        ## prune the blocks/destination that are already in the making, so that subscription don't overlap
                        for site in spreading:
                            for block in list(spreading[site]):
                                if site in destinations and block in destinations[
                                        site]['blocks'].keys():
                                    ## prune it
                                    spreading[site].remove(block)

                        transfer_sizes[prim] = sum(sizes)
                        if not spreading:
                            sendLog(
                                'transferor',
                                'cannot send %s to any site, it cannot fit anywhere'
                                % prim,
                                level='critical')
                            wfh.sendLog(
                                'transferor',
                                "cannot send to any site. %s cannot seem to fit anywhere"
                                % (prim))
                            staging = False
                            can_go = False

                    else:
                        spreading = {}
                        for site in prim_to_distribute:
                            if blocks:
                                spreading[site] = blocks
                            else:
                                spreading[site] = [prim]
                        transfer_sizes[prim] = max(this_load,
                                                   transfer_sizes[prim])
                    can_go = False
                    wfh.sendLog(
                        'transferor', "selected CE destinations %s" %
                        (sorted(spreading.keys())))
                    for (site, items) in spreading.items():
                        all_transfers[site].extend(items)
                        transfers_per_sites[site] += 1
                        primary_destinations.add(site)
                else:
                    can_go = False
                    allowed = False

        if not allowed:
            wfh.sendLog('transferor', "Not allowed to move on with")
            continue

        if secondary:

            override_sec_destination = []
            if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]:
                override_sec_destination = CI.campaigns[
                    wfh.request['Campaign']]['SecondaryLocation']
            if 'SecondaryLocation' in overide_parameters:
                override_sec_destination = overide_parameters[
                    'SecondaryLocation']
            print wfo.name, 'reads', ', '.join(secondary), 'in secondary'
            for sec in secondary:

                workflow_dependencies[sec].add(wfo.id)

                if True:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist
                        destination_cache[sec], _ = getDatasetDestinations(
                            url, sec)  ## NO SITE WHITE LIST ADDED
                        #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])

                    ## limit to the site whitelist NOW
                    se_allowed = set(
                        [SI.CE_to_SE(site) for site in sites_allowed])
                    destinations = dict([
                        (k, v) for (k, v) in destination_cache[sec].items()
                        if k in se_allowed
                    ])
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [
                        destinations.pop(site)
                        for (site, info) in destinations.items()
                        if info['data_fraction'] < 0.9
                    ]
                    print sec, json.dumps(destinations, indent=2)
                    sec_location = [
                        site for (site, info) in destinations.items()
                        if info['completion'] >= 95
                    ]
                    sec_destination = [
                        site for site in destinations.keys()
                        if not site in sec_location
                    ]  ## this is in SE
                else:
                    ## old style
                    presence = getDatasetPresence(url, sec)
                    sec_location = [
                        site for site, pres in presence.items()
                        if pres[1] > 90.
                    ]  ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions(url, sec)
                    sec_destination = [site for site in subscriptions]

                ## how to make unified understand that it has to wait for the secondary if the sec_destination and

                #sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [
                    site for site in sites_allowed
                    if not SI.CE_to_SE(site) in sec_location
                ]
                #sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [
                    site for site in sec_to_distribute
                    if not SI.CE_to_SE(site) in sec_destination
                ]
                presitespace_sec_to_distribute = copy.deepcopy(
                    sec_to_distribute)
                #sec_to_distribute = [site for site in sec_to_distribute if not  any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                #sec_to_distribute = [site for site in sec_to_distribute if not  SI.CE_to_SE(site) in SI.sites_veto_transfer]
                sec_to_distribute = [
                    site for site in sec_to_distribute
                    if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval())
                ]
                ## at this point you have a problem
                if len(sec_to_distribute) == 0 and len(
                        presitespace_sec_to_distribute):
                    sendLog(
                        'transferor',
                        '%s is getting no possible destinations because of lack of space. To be decided what to do in general'
                        % (sec),
                        level='critical')

                if override_sec_destination:
                    ## intersect with where we want the PU to be
                    not_needed_anymore = list(
                        set(sec_to_distribute) - set(override_sec_destination))
                    #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sendLog(
                        'transferor',
                        "the dataset %s could be removed from %s" %
                        (sec, not_needed_anymore))
                    sec_to_distribute = list(
                        set(sec_to_distribute) & set(override_sec_destination))

                if len(sec_to_distribute) > 0:
                    print "secondary could go to", sorted(sec_to_distribute)
                    sec_size = dss.get(sec)
                    for site in sec_to_distribute:
                        site_se = SI.CE_to_SE(site)
                        if (SI.disk[site_se] *
                                1024.) > sec_size or wfh.isRelval():
                            wfh.sendLog('transferor',
                                        'Sending %s to %s' % (sec, site))
                            all_transfers[site].append(sec)
                            can_go = False
                        else:
                            print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[
                                site_se] * 1024, "GB need", sec_size
                            if primary_destinations and site in primary_destinations:
                                #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))
                                sendLog(
                                    'transferor',
                                    '%s is too big (%s) for %s (%s). %s will not be able to run there.'
                                    % (sec, sec_size, site_se,
                                       SI.disk[site_se] * 1024, wfo.name),
                                    level='critical')
                                wfh.sendLog(
                                    'transferor',
                                    '%s is too big (%s) for %s (%s). will not be able to run there.'
                                    % (sec, sec_size, site_se,
                                       SI.disk[site_se] * 1024))
                else:
                    ## this is bas overall
                    print "the secondary input does not have to be send to site"

        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                wfh.sendLog(
                    'transferor',
                    "latches on existing transfers, and nothing else, settin staging"
                )
                wfo.status = 'staging'
                needs_transfer += 1
            else:
                wfh.sendLog(
                    'transferor', "should just be assigned now to %s" %
                    sorted(sites_allowed))
                wfo.status = 'staged'
            passing_along += 1
            wfh.sendLog('transferor',
                        "setting %s status to %s" % (wfo.name, wfo.status))
            #session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                wfh.sendLog('transferor', "latches on existing transfers")
                if not options.test:
                    wfo.status = 'staging'
                    wfh.sendLog(
                        'transferor',
                        "setting %s status to %s" % (wfo.name, wfo.status))
                    #session.commit()
            wfh.sendLog('transferor', "needs a transfer")
            needs_transfer += 1
            passing_along += 1

    if no_goes:
        #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes ))
        sendLog('transferor',
                "No go for \n" + "\n".join(sorted(no_goes)),
                level='critical')

    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id = -1
    wf_id_in_prestaging = set()

    for (site, items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## site that do not want input datasets
        #if site in SI.sites_veto_transfer:
        #    print site,"does not want transfers"
        #    continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        block_datasets = list(set([it.split('#')[0] for it in blocks]))
        datasets = [it for it in items_to_transfer if not '#' in it]

        details_text = "Making a replica to %s (CE) %s (SE) for" % (site,
                                                                    site_se)

        #print "\t",len(blocks),"blocks"
        ## remove blocks if full dataset is send out
        blocks = [
            block for block in blocks if not block.split('#')[0] in datasets
        ]
        #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        #print "\t",len(datasets),"datasets"
        #print "\t",datasets
        details_text += '\n\t%d blocks' % len(blocks)
        details_text += '\n\t%d needed blocks for %s' % (
            len(blocks),
            sorted(list(set([block.split('#')[0] for block in blocks]))))
        details_text += '\n\t%d datasets' % len(datasets)
        details_text += '\n\t%s' % sorted(datasets)

        items_to_transfer = blocks + datasets

        if execute:
            sendLog('transferor', details_text)
        else:
            print "Would make a replica to", site, "(CE)", site_se, "(SE) for"
            print details_text

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y', 'yes', 'go']:
                continue
        transfered_items = defaultdict(set)
        if execute:
            priority = 'normal'
            cds = [
                ds for ds in set(datasets + block_datasets)
                if ds in max_priority
            ]
            ## bucketize the transfers by priority of workflows
            prioritized_items = defaultdict(set)
            for item in items_to_transfer:
                d = item.split('#')[0]
                p = max_priority.get(d, 80000)
                q = 'normal'
                if p > 100000:
                    q = 'reserved'
                elif p < 70000:
                    q = 'low'
                prioritized_items[q].add(item)

            for priority, items in prioritized_items.items():
                result = makeReplicaRequest(url,
                                            site_se,
                                            list(items),
                                            'prestaging',
                                            priority=priority,
                                            approve=True)
                if result:
                    these_transfers = [
                        o['id'] for o in result['phedex']['request_created']
                    ]
                    #phedexids.extend( these_transfers )
                    for ph in these_transfers:
                        transfered_items[ph].update(items)
                else:
                    sendLog(
                        'transferor',
                        'Could not make a replica request for items %s to site %s'
                        % (items, site_se),
                        level='critical')

            #result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority, approve=True)
            #phedexids = [o['id'] for o in result['phedex']['request_created']]:
        #else:
        #    #result= {'phedex':{'request_created' : []}}
        #    phedexids = []
        #    fake_id-=1

        if not transfered_items:
            sendLog(
                'transferor',
                'Could not make a replica request for items %s to site %s' %
                (items_to_transfer, site),
                level='critical')
            continue
        for phedexid, items in transfered_items.items():
            print phedexid, "transfer created"
            for transfering in list(
                    set(map(lambda it: it.split('#')[0], items))):
                for wfid in workflow_dependencies[transfering]:
                    new_transfer = session.query(TransferImp).filter(
                        TransferImp.phedexid == int(phedexid)).filter(
                            TransferImp.workflow_id == wfid).first()
                    if not new_transfer:
                        new_transfer = TransferImp(
                            phedexid=phedexid,
                            workflow=session.query(Workflow).get(wfid))
                        session.add(new_transfer)
                    else:
                        new_transfer.active = True

                    wf_id_in_prestaging.add(wfid)
            #session.commit()

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status != 'staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting", tr_wf.name, "to staging"
        #session.commit()

    ## one big session commit at the end that everything went fine
    session.commit()

Exemple #4

0

Afficher le fichier

def assignor(url, specific=None, talk=True, options=None):
    if userLock() and not options.manual: return
    mlock = moduleLock()
    if mlock() and not options.manual: return
    if not componentInfo().check() and not options.manual: return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = siteInfo()
    SI = global_SI()
    ###NLI = newLockInfo()
    ###if not NLI.free() and not options.go: return
    LI = lockInfo()
    #if not LI.free() and not options.go and not options.manual: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    aaa_mapping = json.loads(eosRead('%s/equalizor.json' %
                                     monitor_pub_dir))['mapping']
    all_stuck = set()
    all_stuck.update(
        json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)))

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    # Temporarily switch off prioritization
    random.shuffle(wfos)
    ##order by priority instead of random
    """
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key = lambda r : r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]
        def rank( wfn ):
            return cache.index( wfn ) if wfn in cache else 0

        wfos = sorted(wfos, key = lambda wfo : rank( wfo.name ),reverse=True)
        print "10 first",[wfo.name for wfo in wfos[:10]]
        print "10 last",[wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle( wfos )
    """

    for wfo in wfos:

        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue

        if not options.manual and 'rucio' in (wfo.name).lower(): continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed', 'aborted-archived',
                'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early: options_text += ", early option is ON"

        wfh.sendLog('assignor',
                    "%s to be assigned %s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary, sites_allowed,
         sites_not_allowed) = wfh.getSiteWhiteList()

        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                msg = '%s is not an allowed secondary' % (
                    ', '.join(set(secondary) - set(allowed_secondary.keys())))
                wfh.sendLog('assignor', msg)
                critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format(
                    wfh.getPrepIDs()[0])
                sendLog('assignor', critical_msg, level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))

        blocks = wfh.getBlocks()
        if blocks:
            wfh.sendLog(
                'assignor',
                "Needs {} blocks in input {}".format(len(blocks),
                                                     '\n'.join(blocks)))
        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters and primary:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']

        wfh.sendLog(
            'assignor',
            "Initial values for primary_AAA=%s and secondary_AAA=%s" %
            (primary_aaa, secondary_aaa))

        if primary_aaa:
            if "T2_CH_CERN_HLT" in sites_allowed:
                sites_allowed.remove("T2_CH_CERN_HLT")
            if "T2_CH_CERN_HLT" not in sites_not_allowed:
                sites_not_allowed.append("T2_CH_CERN_HLT")

        ## keep track of this, after secondary input location restriction : that's how you want to operate it
        initial_sites_allowed = copy.deepcopy(sites_allowed)

        set_lfn = '/store/mc'  ## by default

        for prim in list(primary):
            set_lfn = getLFNbase(prim)
            ## if they are requested for processing, they should bbe all closed already
            # FIXME: remove this closeAllBlocks
            #closeAllBlocks(url, prim, blocks)

        ## should be 2 but for the time-being let's lower it to get things going
        _copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large numbr of CPUh %s , not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        # TODO Alan on 1/april/2020: keep the AAA functionality
        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_allowed:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_allowed)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFN is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFN is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if isStoreResults:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        if not len(sites_allowed) and not options.SiteWhitelist:
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assign with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        t1t2_only = [
            ce for ce in sites_allowed
            if [ce.startswith('T1') or ce.startswith('T2')]
        ]
        if t1t2_only:
            # try to pick from T1T2 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1t2_only])]
            # then pick any otherwise
        else:
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        print "available=", SI.disk[sites_out[0]]
        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'SiteBlacklist': sites_not_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor',
                "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            # Do not set TrustPUSitelist to True if there is no secondary
            if secondary:
                parameters['TrustPUSitelists'] = True
                wfh.sendLog(
                    'assignor', "Reading secondary through xrootd at %s" %
                    sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            """Copy command-line assignment options into `parameters`.

            For every key listed in ``reqMgrClient.assignWorkflow.keys`` the
            attribute of the same name is read from `options`; when it is not
            None it is stored under that key in `parameters`, which is mutated
            in place. A string value containing a comma is split on ',' and
            stored as a list with the empty pieces dropped.

            Nothing is done when `options` is falsy. Returns None.
            """
            if not options:
                return
            for key in reqMgrClient.assignWorkflow.keys:
                v = getattr(options, key)
                if v is None:
                    # option not set: leave any existing value untouched
                    continue
                if isinstance(v, str) and ',' in v:
                    # build a real list (not a lazy filter object) so the value
                    # is the same kind of object on Python 2 and Python 3
                    parameters[key] = [piece for piece in v.split(',') if piece]
                else:
                    parameters[key] = v

        def pick_campaign(assign_parameters, parameters):
            """Merge the campaign-specific 'parameters' mapping into `parameters`.

            `parameters` is updated in place; when `assign_parameters` has no
            'parameters' entry nothing changes. Returns None.
            """
            campaign_overrides = assign_parameters.get('parameters', {})
            parameters.update(campaign_overrides)

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check == None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    # FIXME: decide which of the lines below needs to remain...
                    eventsPerJobEstimated = spl[
                        'events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl[
                        'avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))
        result = reqMgrClient.assignWorkflow(
            url, wfo.name, None,
            parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                if wfh.producePremix() and (not wfh.isRelval()):
                    title = "Heavy workflow assigned to {}".format(
                        parameters['SiteWhitelist'])
                    body = "Workflow name: {}".format(
                        wfh.request['RequestName'])
                    body += "\nOutput dataset(s): {}".format(
                        wfh.request['OutputDatasets'])
                    body += "\nAssigned to: {}".format(
                        parameters['SiteWhitelist'])
                    sendEmail(
                        title,
                        body,
                        destination=[
                            '*****@*****.**'
                        ])

                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')

Exemple #5

0

Afficher le fichier

Fichier : outcleanor.py Projet : thongonary/WmAgentScripts

def outcleanor(url, options):
    print "Deprecated"
    return 

    if duplicateLock(): return 

    do_not_autoapprove = []#'T2_FR_CCIN2P3']
    LI = lockInfo()


    sites_and_datasets = defaultdict(list)
    our_copies = defaultdict(list)
    wf_cleaned = {}
    
    wfs = []
    for fetch in options.fetch.split(','):
        wfs.extend(session.query(Workflow).filter(Workflow.status==fetch).all())

    random.shuffle( wfs )
    last_answer = None
    for wfo in wfs :
        if options.number and len(wf_cleaned)>= options.number:
            print "Reached",options.number,"cleaned"
            break
        print '-'*100
        wfi = workflowInfo(url, wfo.name)
        goes = {} # boolean per output
        for dataset in wfi.request['OutputDatasets']:
            goes[dataset] = False
            keep_one_out = False ## change to no copy kept, since this is DDM handled
            status = getDatasetStatus( dataset )
            print "\n\tLooking at",dataset,status,"\n"
            vetoes = None
            if status == 'INVALID':
                vetoes = ['Export','Buffer'] ## can take themselves out
                keep_one_out = False # just wipe clean

            elif status == None:
                print dataset,"actually does not exist. skip"
                goes[dataset] = True
                continue

            elif status in ['PRODUCTION','VALID'] and wfo.status in ['forget','trouble']:
                print dataset,"should probably be invalidated. (",wfo.status,") skip"
                keep_one_out = False # just wipe clean
                continue ## you are not sure. just skip it for the time being

            elif status == 'PRODUCTION' and wfo.status in ['clean']:
                print dataset,"should probably be set valid .skip"
                continue ## you are not sure. just skip it for the time being

            if status == 'VALID' and dataset.startswith('/MinBias'):
                print "This is a /MinBias. skip"
                continue

            if '/DQM' in dataset:
                keep_one_out = False

            custodials = findCustodialLocation(url, dataset)
            if not len(custodials):
                print dataset,"has no custodial site yet, excluding from cleaning"
                continue

            total_size = getDatasetSize( dataset )
            
            our_presence = getDatasetPresence(url, dataset, complete=None, group="DataOps", vetoes=vetoes)
            also_our_presence = getDatasetPresence(url, dataset, complete=None, group="", vetoes=vetoes)
            
            ## merge in one unique dict
            for site in also_our_presence:
                if site in our_presence:
                    there,frac = our_presence[site]
                    other,ofrac = also_our_presence[site]
                    our_presence[site] = (max(there,other),max(frac,ofrac))
                else:
                    our_presence[site] = also_our_presence[site]
                
            if our_presence: print our_presence

            ## analysis ops copies need to be taken into account
            anaops_presence = getDatasetPresence(url, dataset, complete=None, group="AnalysisOps")
            own_by_anaops = anaops_presence.keys()
            
            ## all our copies
            to_be_cleaned = our_presence.keys()
            if not len(to_be_cleaned):
                print "nowhere to be found of ours,",len(own_by_anaops),"in analysi ops pool"
                goes[dataset] = True
                continue

            print "Where we own bits of dataset"
            print to_be_cleaned
     

            if len(own_by_anaops):
                ## remove site with the anaops copies
                to_be_cleaned = list(set(to_be_cleaned) - set(own_by_anaops))
                keep_one_out = False ## in that case, just remove our copies
                print "Own by anaops (therefore not keep a copy of ours)"
                print own_by_anaops
            else:
                ## we should not be looking at anything that was not passed to DDM, otherwise we'll be cutting the grass under our feet
                using_the_same = getWorkflowByInput(url, dataset, details=True)
                conflict = False
                for other in using_the_same:
                    if other['RequestName'] == wfo.name: continue
                    if other['RequestType'] == 'Resubmission': continue
                    if not other['RequestStatus'] in ['announced','normal-archived','aborted','rejected','aborted-archived','rejected-archived','closed-out','None',None]:
                        print other['RequestName'],'is in status',other['RequestStatus'],'preventing from cleaning',dataset
                        conflict=True
                        break
                if conflict:
                    continue

                ## not being used. a bit less dangerous to clean-out
                ## keep one full copy out there
                full_copies = [site for (site,(there,fract)) in our_presence.items() if there]
                if keep_one_out:
                    if not len(full_copies):
                        print "we do not own a full copy of",dataset,status,wfo.status,".skip"
                        continue
                    t1_full_copies = [ site for site in full_copies if site.startswith('T1')]
                    if t1_full_copies:
                        stay_there = random.choice( t1_full_copies ) #at a place own by ops
                    else:
                        stay_there = random.choice( full_copies ) #at a place own by ops
                    print "Where we keep a full copy", stay_there
                    to_be_cleaned.remove( stay_there )
                    our_copies[stay_there].append( dataset )
                    LI.release_except( dataset, stay_there, 'cleanup of output after production')            
                else:
                    print "We do not want to keep a copy of ",dataset,status,wfo.status
                    LI.release_everywhere( dataset, 'cleanup of output after production')

            if len(to_be_cleaned):
                print "Where we can clean"
                print to_be_cleaned
                for site in to_be_cleaned:
                    sites_and_datasets[site].append( (dataset, total_size*our_presence[site][1]/100., status) )
                goes[dataset] = True
            else:
                print "no cleaning to be done"
                goes[dataset] = True

        print wfo.name,"scrutinized"
        if all(goes.values()):
            print "\t",wfo.name,"can toggle -out"
        def ask():
            global last_answer
            last_answer = raw_input('go on ?')
            return last_answer
        if options.auto or ask() in ['y','']:
            if all(goes.values()):
                wfo.status = wfo.status+'-out'
                wf_cleaned[wfo.name] = wfo.status
            continue
        elif last_answer in ['q','n']:
            break
        else:
            return 

    if options.auto:
        pass
    elif last_answer in ['q']:
        return

    print "Potential cleanups"
    for (site,items) in sites_and_datasets.items():
        cleanup = sum([size for (_,size,_) in items])
        print "\n\t potential cleanup of","%8.4f"%cleanup,"GB at ",site
        print "\n".join([ds+" "+st for ds,_,st in items])
        datasets = [ ds for ds,_,st in items]

    print "Copies and bits we are going to delete"
    print json.dumps( sites_and_datasets, indent=2)

    print "Copies we are keeping"
    print json.dumps( our_copies, indent=2 )     

    print "Workflows cleaned for output"
    print json.dumps( wf_cleaned, indent=2 )
    #stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
    #open('outcleaning_%s.json'%stamp,'w').write( json.dumps( sites_and_datasets, indent=2))
    #open('keepcopies_%s.json'%stamp,'w').write( json.dumps( our_copies, indent=2))
    #open('wfcleanout_%s.json'%stamp,'w').write( json.dumps( wf_cleaned, indent=2))


    if (not options.test) and (options.auto or raw_input("Satisfied ? (y will trigger status change and deletion requests)") in ['y']):
        for (site,items) in sites_and_datasets.items():
            #datasets = [ ds for ds,_,st in items]
            #is_tape = any([v in site for v in ['MSS','Export','Buffer'] ])
            #comments="Cleanup output after production. DataOps will take care of approving it."
            #if is_tape:
            #    comments="Cleanup output after production."
            #print "making deletion to",site
            #result = makeDeleteRequest(url, site, datasets, comments=comments)
            #print result
            ## approve it right away ?
            #for did in [item['id'] for item in result['phedex']['request_created']]:
            #    if not is_tape:
            #        print "auto-approving to",site,"?"
            #        if not site in do_not_autoapprove:
            #            approveSubscription(url, did, nodes = [site], comments = 'Production cleaning by data ops, auto-approved')
            #        pass
            pass
        session.commit()
    else:
        print "Not making the deletion and changing statuses"

Exemple #6

0

Afficher le fichier

def assignor(url, specific=None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    #SI = siteInfo()
    SI = global_SI()
    #NLI = newLockInfo()
    #if not NLI.free() and not options.go: return
    LI = lockInfo()
    if not LI.free() and not options.go: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    dataset_endpoints = json.loads(
        open('%s/dataset_endpoints.json' % monitor_dir).read())
    aaa_mapping = json.loads(
        open('%s/equalizor.json' % monitor_pub_dir).read())['mapping']

    all_stuck = set()
    all_stuck.update(
        json.loads(open('%s/stuck_transfers.json' % monitor_pub_dir).read()))
    all_stuck.update(getAllStuckDataset())

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    ##order by priority instead of random
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True),
                       key=lambda r: r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]

        def rank(wfn):
            """Return the position of ``wfn`` in ``cache``, or 0 when absent."""
            try:
                return cache.index(wfn)
            except ValueError:
                ## name not present in the cached list
                return 0

        wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True)
        print "10 first", [wfo.name for wfo in wfos[:10]]
        print "10 last", [wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle(wfos)

    for wfo in wfos:

        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed', 'aborted-archived',
                'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early: options_text += ", early option is ON"
        if options.partial:
            options_text += ", partial option is ON"
            options_text += ", good fraction is %.2f" % options.good_enough

        wfh.sendLog('assignor',
                    "%s to be assigned%s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                wfh.sendLog(
                    'assignor', '%s is not an allowed secondary' %
                    (', '.join(set(secondary) -
                               set(allowed_secondary.keys()))))
                sendLog(
                    'assignor',
                    '%s is not an allowed secondary' %
                    (', '.join(set(secondary) -
                               set(allowed_secondary.keys()))),
                    level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'],
                                       'SecondaryLocation', [])

        blocks = wfh.getBlockWhiteList()
        rwl = wfh.getRunWhiteList()
        if rwl:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set(blocks +
                                  getDatasetBlocks(dataset, runs=rwl)))
        lwl = wfh.getLumiWhiteList()
        if lwl:
            ## augment with lumi white list
            for dataset in primary:
                blocks = list(
                    set(blocks + getDatasetBlocks(dataset, lumis=lwl)))

        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False  #options.good_enough if options.partial else 0

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']
        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
            print "Could do partial disk copy assignment"
            if is_stuck or options.partial:
                do_partial = assign_parameters['partial_copy']
                wfh.sendLog(
                    'assignor',
                    "Overiding partial copy assignment to %.2f fraction" %
                    do_partial)
                #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name))

        do_partial = options.good_enough if options.partial else do_partial

        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                #sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass

            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items()
                if frac > 98.
            ]

            if secondary_aaa:
                if not one_secondary_locations:
                    sec_availability = getDatasetBlocksFraction(url, sec)
                    if sec_availability >= 1. and options.go:
                        ## there is at least one copy of each block on disk. We should go ahead and let it go.
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is available %s times on disk, and usable"
                            % (sec, sec_availability))
                    else:
                        ## not even a copy on disk anywhere !!!!
                        sites_allowed = []  ## will block the assignment
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is nowhere on disk" % sec)
                #just continue without checking
                continue

            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [
                site for site in sites_allowed
                if SI.CE_to_SE(site) in one_secondary_locations
            ]

        wfh.sendLog(
            'assignor', "From/after secondary requirement, now Allowed%s" %
            sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy(
            sites_allowed
        )  ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc'  ## by default

        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor", dataset_endpoints[prim]
                endpoints.update(dataset_endpoints[prim])
            set_lfn = getLFNbase(prim)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(
                url,
                prim,
                sites=[SI.CE_to_SE(site) for site in sites_allowed],
                only_blocks=blocks)
            if primary_aaa:
                available_fractions[prim] = getDatasetBlocksFraction(
                    url, prim, only_blocks=blocks)

            sites_all_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in [
                    psite for (psite, (there, frac)) in presence.items()
                    if there
                ]
            ]
            if primary_aaa:
                sites_all_data = list(
                    set([
                        SI.SE_to_CE(psite)
                        for (psite, (there, frac)) in presence.items() if there
                    ]))
            sites_with_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in
                [psite for (psite, frac) in presence.items() if frac[1] > 90.]
            ]
            sites_with_any_data = [
                site for site in sites_with_any_data
                if SI.CE_to_SE(site) in presence.keys()
            ]
            if primary_aaa:
                sites_with_any_data = list(
                    set([SI.SE_to_CE(psite) for psite in presence.keys()]))

            wfh.sendLog(
                'assignor', "Holding the data but not allowed %s" % sorted(
                    list(
                        set([
                            se_site for se_site in presence.keys()
                            if not SI.SE_to_CE(se_site) in sites_allowed
                        ]))))
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in
                    list((set(secondary_locations) & set(primary_locations)) -
                         set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            elif primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in list(
                        set(primary_locations) -
                        set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            else:
                opportunistic_sites = []
            wfh.sendLog(
                'assignor', "We could be running in addition at %s" %
                sorted(opportunistic_sites))
            if any(
                [osite in SI.sites_not_ready
                 for osite in opportunistic_sites]):
                wfh.sendLog(
                    'assignor', "One of the usable site is in downtime %s" % ([
                        osite for osite in opportunistic_sites
                        if osite in SI.sites_not_ready
                    ]))
                down_time = True
                ## should this be send back to considered ?

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large numbr of CPUh %s , not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                    wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[
                wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(
                1, copies_wanted -
                less_copies_than_requested)  # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',
                    "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable localy if not in having the data
            if not sites_all_data:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFN is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFN is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if not isStoreResults:
                sites_allowed = sites_with_any_data
            else:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
                available_fractions = {}
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready
                                         for osite in end_sites):
                    print "Flip the status of downtime, since our destinations are good"
                    down_time = False
                print "with added endpoints", sorted(end_sites)
            else:
                print "Cannot do partial assignment without knowin the endpoints"
                n_stalled += 1
                continue

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the site allowed is low pressure : reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog(
                'assignor',
                "The workflow can run at %s under low pressure currently" %
                (','.join(allowed_and_low)))
            copies_wanted = max(1., copies_wanted - 1.)

        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            not_even_once = not all([
                available >= 1. for available in available_fractions.values()
            ])
            above_good = all([
                available >= do_partial
                for available in available_fractions.values()
            ])
            wfh.sendLog(
                'assignor',
                "The input dataset is not available %s times, only %s" %
                (copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog(
                    'assignor',
                    "sending back to considered because of site downtime, instead of waiting"
                )
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog(
                    'assignor',
                    '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'
                    % (wfo.name),
                    level='delay')
                n_stalled += 1
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (
                        do_partial and above_good):
                    wfh.sendLog(
                        'assignor',
                        "cannot be assigned, %s is not sufficiently available.\n %s"
                        % (wfo.name, json.dumps(available_fractions)))
                    #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append(wfo.name)
                    open('cannot_assign.json',
                         'w').write(json.dumps(known, indent=2))

                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor', "setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is", wfo.status
                if do_partial and above_good:
                    print "Will move on with partial locations"
                else:
                    n_stalled += 1
                    continue

        if not len(sites_allowed):
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assign with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor',
                "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog(
                'assignor', "Reading secondary through xrootd at %s" %
                sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            """Copy the assignment settings entered on the command line into ``parameters``.

            options    -- parsed command line options (or a false value, in which
                          case nothing is done); one attribute is read per key
                          listed in reqMgrClient.assignWorkflow.keys
            parameters -- dict of assignment parameters, updated in place

            Options left at None are ignored. A string holding commas is turned
            into the list of its non-empty comma-separated tokens.
            """
            if not options:
                return
            for key in reqMgrClient.assignWorkflow.keys:
                v = getattr(options, key)
                if v is None:
                    ## option not provided
                    continue
                if isinstance(v, str) and ',' in v:
                    ## build a real list (not a lazy filter object) so the value can be serialized
                    parameters[key] = [token for token in v.split(',') if token]
                else:
                    parameters[key] = v

        def pick_campaign(assign_parameters, parameters):
            """Overlay the campaign specific assignment parameters onto ``parameters``, in place.

            assign_parameters -- mapping whose optional 'parameters' entry holds the overrides
            parameters        -- dict of assignment parameters to update
            """
            campaign_overrides = assign_parameters.get('parameters', {})
            parameters.update(campaign_overrides)

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check == None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl[
                        'events_per_job'] if 'events_per_job' in spl else None
                    eventsPerJobEstimated = spl[
                        'avg_events_per_job'] if 'avg_events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        if isHEPCloudReady(url) and wfh.isGoodForNERSC():
            parameters['Team'] = 'hepcloud'
            parameters['SiteWhitelist'] = ['T3_US_NERSC']
            if primary:
                parameters['TrustSitelists'] = True
            if secondary:
                parameters['TrustPUSitelists'] = True
            sendEmail("sending work to hepcloud",
                      "pleasse check on %s" % wfh.request['RequestName'],
                      destination=['*****@*****.**'])

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))

        result = reqMgrClient.assignWorkflow(
            url, wfo.name, None,
            parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')

Exemple #7

0

Afficher le fichier

Fichier : closor.py Projet : julianbadillo/WmAgentScripts

def closor(url, specific=None):
    if not componentInfo().check(): return

    CI = campaignInfo()
    LI = lockInfo()

    ## manually closed-out workflows should get to close with checkor
    for wfo in session.query(Workflow).filter(Workflow.status=='close').all():

        if specific and not specific in wfo.name: continue

        ## what is the expected #lumis 
        wl = getWorkLoad(url, wfo.name)
        wfo.wm_status = wl['RequestStatus']

        if wl['RequestStatus'] in  ['announced','normal-archived']:
            ## manually announced ??
            wfo.status = 'done'
            wfo.wm_status = wl['RequestStatus']
            print wfo.name,"is announced already",wfo.wm_status

        session.commit()


        expected_lumis = 1
        if not 'TotalInputLumis' in wl:
            print wfo.name,"has not been assigned yet, or the database is corrupted"
        else:
            expected_lumis = wl['TotalInputLumis']

        ## what are the outputs
        outputs = wl['OutputDatasets']
        ## check whether the number of lumis is as expected for each
        all_OK = []
        #print outputs
        if len(outputs): 
            print wfo.name,wl['RequestStatus']
        for out in outputs:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=out)
            odb = session.query(Output).filter(Output.datasetname==out).first()
            if not odb:
                print "adding an output object",out
                odb = Output( datasetname = out )
                odb.workflow = wfo
                session.add( odb )
            odb.nlumis = lumi_count
            odb.nevents = event_count
            odb.workfow_id = wfo.id
            if odb.expectedlumis < expected_lumis:
                odb.expectedlumis = expected_lumis
            else:
                expected_lumis = odb.expectedlumis
            odb.date = time.mktime(time.gmtime())
            session.commit()

            print "\t%60s %d/%d = %3.2f%%"%(out,lumi_count,expected_lumis,lumi_count/float(expected_lumis)*100.)
            #print wfo.fraction_for_closing, lumi_count, expected_lumis
            fraction = wfo.fraction_for_closing
            fraction = 0.0
            all_OK.append((float(lumi_count) > float(expected_lumis*fraction)))


        ## only that status can let me go into announced
        if wl['RequestStatus'] in ['closed-out']:
            print wfo.name,"to be announced"

            results=[]#'dummy']
            if not results:
                for (io,out) in enumerate(outputs):
                    if all_OK[io]:
                        results.append(setDatasetStatus(out, 'VALID'))
                        tier = out.split('/')[-1]
                        to_DDM = (wl['RequestType'] == 'ReDigi' and not ('DQM' in tier))
                        campaign = None
                        try:
                            campaign = out.split('/')[2].split('-')[0]
                        except:
                            if 'Campaign' in wl and wl['Campaign']:
                                campaign = wl['Campaign']
                        if campaign and campaign in CI.campaigns and 'toDDM' in CI.campaigns[campaign] and tier in CI.campaigns[campaign]['toDDM']:
                            to_DDM = True
                            
                        ## inject to DDM when necessary
                        passed_to_DDM=True
                        if to_DDM:
                            #print "Sending",out," to DDM"
                            status = subprocess.call(['python','assignDatasetToSite.py','--nCopies=2','--dataset='+out,'--exec'])
                            if status!=0:
                                print "Failed DDM, retrying a second time"
                                status = subprocess.call(['python','assignDatasetToSite.py','--nCopies=2','--dataset='+out,'--exec'])
                                if status!=0:
                                    results.append("Failed DDM for %s"% out)
                                    sendEmail("failed DDM injection","could not add "+out+" to DDM pool. check closor logs.")
                                    passed_to_DDM=False
                            if passed_to_DDM:
                                ## make a lock release
                                LI.release_everywhere( out, reason = 'global unlock after passing to DDM')                                
                                pass

                    else:
                        print wfo.name,"no stats for announcing",out
                        results.append('No Stats')

                if all(map(lambda result : result in ['None',None,True],results)):
                    ## only announce if all previous are fine
                    results.append(reqMgrClient.announceWorkflowCascade(url, wfo.name))
                                
            #print results
            if all(map(lambda result : result in ['None',None,True],results)):
                wfo.status = 'done'
                session.commit()
                print wfo.name,"is announced"
            else:
                print "ERROR with ",wfo.name,"to be announced",json.dumps( results )
        else:
            print wfo.name,"not good for announcing:",wl['RequestStatus']

Exemple #8

0

Afficher le fichier

Fichier : lockor.py Projet : CMSCompOps/WmAgentScripts

## Start of the lockor script: read the configuration, then lock the ad-hoc list of datasets.
CI = campaignInfo()
tier_no_custodial = UC.get('tiers_with_no_custodial')
tiers_keep_on_disk = UC.get("tiers_keep_on_disk")

## NOTE(review): time.mktime() interprets its argument as local time, so this is a true
## UTC epoch only if the host runs in UTC -- confirm before comparing with other timestamps
now = time.mktime( time.gmtime())

## can we catch the datasets that actually should go to tape ?
## collect, per campaign, the 'custodial_override' setting when there is one
custodial_override = {}
for c in CI.campaigns:
    if 'custodial_override' in CI.campaigns[c]:
        custodial_override[c] = CI.campaigns[c]['custodial_override']

newly_locking = set()
also_locking_from_reqmgr = set()

LI = lockInfo()

## add an ad-hoc list of things to lock. emptying this list would result in unlocking later,
## so when the list cannot be read we stop here instead of going on with an empty one
try:
    addHocLocks = json.loads( eosRead('%s/addhoc_lock.json'%base_eos_dir))
except Exception as e:
    ## report why we give up; the exit code is kept at 0 as before
    print("Could not read the addhoc lock list, stopping here: %s" % e)
    sys.exit(0)

time_point("Starting addhoc")

for item in addHocLocks:
    ## only the part before '#' of each item is locked
    ds = item.split('#')[0]
    LI.lock( ds , reason='addhoc lock')
    newly_locking.add( ds )

Exemple #9

0

Afficher le fichier

Fichier : lock.py Projet : vkuznet/WmAgentScripts

#!/usr/bin/env python
import sys
from utils import lockInfo

## Lock a single item from the command line: lock.py <item> [reason]
if len(sys.argv) < 2:
    ## a missing argument gets a usage message instead of a bare IndexError
    sys.exit("usage: %s <item-to-lock> [reason]" % sys.argv[0])

item = sys.argv[1]
## the reason is optional and defaults to an empty string
reason = sys.argv[2] if len(sys.argv) > 2 else ''

## the lock store is only created once the arguments are known to be valid
LI = lockInfo(andwrite=False)
LI.lock(item, reason=reason)

Exemple #10

0

Afficher le fichier

Fichier : transferor.py Projet : prozober/WmAgentScripts

def transferor(url ,specific = None, talk=True, options=None):
    if userLock():   return
    if duplicateLock():  return

    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    #NLI = newLockInfo()
    #if not NLI.free(): return
    LI = lockInfo()
    if not LI.free(): return

    mcm = McMClient(dev=False)
    dss = DSS()

    #allowed_secondary = UC.get('')
    print "counting all being handled..."
    being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all())
    being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all())

    max_to_handle = options.maxworkflows
    max_to_transfer = options.maxstaging

    allowed_to_handle = max(0,max_to_handle - being_handled)
    allowed_to_transfer = max(0,max_to_transfer - being_transfered)
    wf_buffer = 5
    if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer
        print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer"
    else:
        print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer"

    if allowed_to_transfer <= wf_buffer:
        print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer"
    else:
        print being_transfered,"already being transfered",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer"

    print "... done"

    all_transfers=defaultdict(list)
    workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset
    wfs_and_wfh=[]
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(Workflow.status.startswith('considered')).all():
        print "\t",wfo.name
        if specific and not specific in wfo.name: continue
        cache_r =filter(lambda d:d['RequestName']==wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) )
        else:
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) )
    print "... done"

    transfers_per_sites = defaultdict(int)
    input_sizes = {}
    ignored_input_sizes = {}
    input_cput = {}
    input_st = {}
    ## list the size of those in transfer already
    in_transfer_priority=None
    min_transfer_priority=None
    print "getting all wf in staging ..."
    stucks = json.loads(open('%s/stuck_transfers.json'%monitor_dir).read())
    
    for wfo in session.query(Workflow).filter(Workflow.status=='staging').all():
        wfh = workflowInfo( url, wfo.name, spec=False)
        #(lheinput,primary,parent,secondary) = wfh.getIO()
        #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        print wfo.name,"staging"
        (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList()
        for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list
            transfers_per_sites[site] += 1 
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:  
            ds_s = dss.get( prim )
            if prim in stucks: 
                wfh.sendLog('transferor', "%s appears stuck, so not counting it %s [GB]"%( prim, ds_s))
                ignored_input_sizes[prim] = ds_s
            else:
                input_sizes[prim] = ds_s
                wfh.sendLog('transferor', "%s needs %s [GB]"%( wfo.name, ds_s))
        if in_transfer_priority==None:
            in_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority']))
        if min_transfer_priority==None:
            min_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority']))



    try:
        print "Ignored input sizes"
        ignored_values = list(ignored_input_sizes.items())
        ignored_values.sort( key = lambda i : i[1] )
        print "\n".join( map(str, ignored_values ) )
        print "Considered input sizes"
        considered_values = list(input_sizes.items())
        considered_values.sort( key = lambda i : i[1] )
        print "\n".join( map(str, considered_values) )
    except Exception as e:
        print "trying to print the summary of input size"
        print str(e)

    print "... done"
    print "Max priority in transfer already",in_transfer_priority
    print "Min priority in transfer already",min_transfer_priority
    print "transfers per sites"
    print json.dumps( transfers_per_sites, indent=2)
    in_transfer_already = sum(input_sizes.values())
    cput_in_transfer_already = sum(input_cput.values())
    st_in_transfer_already = sum(input_st.values())

    ## list the size of all inputs
    primary_input_per_workflow_gb = defaultdict(float)
    print "getting all input sizes ..."
    for (wfo,wfh) in wfs_and_wfh:
        (_,primary,_,_) = wfh.getIO()
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:
            ## do not count it if it appears stalled !
            prim_size = dss.get( prim )
            input_sizes[prim] = prim_size
            primary_input_per_workflow_gb[wfo.name] += prim_size
    print "... done"

    # shuffle first by name
    random.shuffle( wfs_and_wfh )
    # Sort smallest transfers first; allows us to transfer as many as possible workflows.
    def prio_and_size( i, j):
        if int(i[1].request['RequestPriority']) == int(j[1].request['RequestPriority']):
            return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0)) )
        else:
            return cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']))

    #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True)
    #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) ))
    #sort by priority higher first
    wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True)

    if min_transfer_priority==None or in_transfer_priority ==None:
        print "nothing is lining up for transfer"
        sendLog("transferor","No request in staging, using first request to set priority limit")
        if len(wfs_and_wfh):
            min_transfer_priority = wfs_and_wfh[0][1].request['RequestPriority']
            in_transfer_priority = wfs_and_wfh[0][1].request['RequestPriority']
        else:
            return

    cput_grand_total = sum(input_cput.values())
    cput_to_transfer = cput_grand_total - cput_in_transfer_already
    st_grand_total = sum(input_st.values())
    st_to_transfer = st_grand_total - st_in_transfer_already
    print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already
    print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer
    print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots())
    print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots())
    print "%15.4f [h] worth of theoritical system time in transfer"%( st_in_transfer_already )
    print "%15.4f [h] worth of theoritical system time is the current requested transfer load"%( st_to_transfer )


    grand_total =  sum(input_sizes.values()) 
    to_transfer = grand_total  - in_transfer_already
    grand_transfer_limit = options.maxtransfer
    #grand_transfer_limit = SI.total_disk()*0.25*1024## half of the free sapce in TB->GB
    
    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered"%in_transfer_already
    print "%15.4f GB is the current requested transfer load"%to_transfer
    print "%15.4f GB is the global transfer limit"%grand_transfer_limit
    print "%15.4f GB is the available limit"%transfer_limit


    max_staging_per_site = options.maxstagingpersite
                    
    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer=0 ## so that we can count'em
    passing_along = 0
    transfer_sizes={}
    went_over_budget=False
    destination_cache = {}
    no_goes = set()

    max_per_round = UC.get('max_per_round').get('transferor',None)
    if max_per_round and not spec:
        wfs_and_wfh = wfs_and_wfh[:max_per_round]
    
    for (wfo,wfh) in wfs_and_wfh:
        print wfo.name,"to be transfered with priority",wfh.request['RequestPriority']

        if wfh.request['RequestStatus']!='assignment-approved':
            if wfh.request['RequestStatus'] in ['aborted','rejected','rejected-archived','aborted-archived']:
                wfo.status = 'trouble' ## so that we look or a replacement
            else:
                wfo.status = 'away'
            wfh.sendLog('transferor', '%s in status %s, setting %s'%( wfo.name,wfh.request['RequestStatus'],wfo.status))
            continue

        (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList()
        #(_,primary,_,_) = wfh.getIO()
        this_load=sum([input_sizes[prim] for prim in primary])
        no_budget = False
        if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ):
            if went_over_budget:
                wfh.sendLog('transferor', "Transfer has gone over bubget.")
            else:
                wfh.sendLog('transferor', "Transfer will go over bubget.")
            wfh.sendLog('transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"%(this_load, sum(transfer_sizes.values()), transfer_limit))
            #if sum(transfer_sizes.values()) > transfer_limit:
            went_over_budget = True
            if in_transfer_priority!=None and min_transfer_priority!=None:
                if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                    wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over budget"%( wfh.request['RequestPriority'], in_transfer_priority))
                else:
                    if not options.go: 
                        wfh.sendLog('transferor',"%s minimum priority %s < %s : stop"%( min_transfer_priority,wfh.request['RequestPriority'],in_transfer_priority))
                        no_budget = True

        ## throttle by campaign go
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True
            no_goes.add( wfo.name )
            
        allowed_secondary = {}
        overide_parameters = {}
        check_secondary = False
        output_tiers = list(set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                overide_parameters.update( CI.campaigns[campaign] )
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update( CI.campaigns[campaign]['secondaries'] )
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[campaign]:
                banned_tier = list(set(CI.campaigns[campaign]['banned_tier']) & set(output_tiers))
                if banned_tier:
                    no_go=True
                    wfh.sendLog('transferor','These data tiers %s are not allowed'%(','.join( banned_tier)))
                    sendLog('transferor','These data tiers %s are not allowed in %s'%(','.join( banned_tier), wfo.name), level='critical')

        if secondary and check_secondary:
            if (set(secondary)&set(allowed_secondary.keys())!=set(secondary)):
                wfh.sendLog('transferor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))))
                sendLog('transferor','%s is not an allowed secondary'%(', '.join(set(secondary)-set(allowed_secondary.keys()))), level='critical')
                if not options.go: 
                    no_go = True
            for sec in secondary:
                if sec in allowed_secondary:
                    overide_parameters.update( allowed_secondary[sec] )

        if 'SiteWhitelist' in overide_parameters:
            sites_allowed = list(set(sites_allowed) & set(overide_parameters['SiteWhitelist']))

        if no_go:
            continue
        ## check if the batch is announced

        def check_mcm(wfn):
            """Look up the McM batches mentioning a workflow.

            wfn: name of the workflow to search for; it is used both for the
                 'pdmvserv' prefix test and for the 'contains=' batch query.

            Returns a tuple (announced, is_real):
              - announced is True when one of the returned batches has
                status 'announced';
              - is_real is True when the name does not start with 'pdmvserv',
                or when the query returned at least one batch.

            The query goes through `mcm`, which comes from the enclosing scope.
            It is attempted at most twice; if both attempts raise, a message is
            printed and whatever was established so far is returned.
            """
            announced = False
            # names not starting with 'pdmvserv' are taken as real up-front
            is_real = not wfn.startswith('pdmvserv')
            max_attempts = 2
            for attempt in range(max_attempts):
                try:
                    # use the argument, not the enclosing loop variable, so the
                    # helper answers for the workflow it was asked about
                    for b in mcm.getA('batches', query='contains=%s' % wfn):
                        is_real = True
                        if b['status'] == 'announced':
                            announced = True
                            break
                    # the query went through: no retry needed
                    break
                except Exception:
                    # best effort: retry once, then give up with a message.
                    # KeyboardInterrupt/SystemExit are no longer swallowed here.
                    if attempt == max_attempts - 1:
                        print("could not get mcm batch announcement, assuming not real")
            return announced, is_real

        if not use_mcm:
            announced,is_real = False,True
        else:
            if wfh.request['RequestType'] in ['ReReco']:
                announced,is_real = True,True
            else:
                announced,is_real = check_mcm( wfo.name )

        if not announced:
            wfh.sendLog('transferor', "does not look announced.")

            
        if not is_real:
            wfh.sendLog('transferor', "does not appear to be genuine.")

            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue

        ## check on a grace period
        injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.)
        now = time.mktime(time.gmtime()) / (60.*60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced: 
                wfh.sendLog('transferor', "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time))
                continue


        if passing_along >= allowed_to_handle:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if in_transfer_priority!=None and min_transfer_priority!=None:
                if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority:
                    ## higher priority, and not only this priority being transfered
                    wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%( wfh.request['RequestPriority'], in_transfer_priority, max_to_handle))
                else:
                    wfh.sendLog('transferor'," Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"%( max_to_handle, being_handled, passing_along))
                    if not options.go: 
                        ## should not allow to jump that fence
                        break

        if this_load and needs_transfer >= allowed_to_transfer:
            if in_transfer_priority!=None and min_transfer_priority!=None:
                if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority:
                    ## higher priority, and not only this priority being transfered
                    wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%(wfh.request['RequestPriority'], in_transfer_priority,max_to_transfer))
                else:
                    wfh.sendLog('transferor',"Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s"%( max_to_transfer, being_transfered, needs_transfer))
                    if not options.go: 
                        no_budget = True


        if no_budget:
            break ## try this for a while to make things faster
            #continue

        ## the site white list considers site, campaign, memory and core information
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')


        for dataset in list(primary)+list(parent)+list(secondary):
            ## lock everything flat
            #NLI.lock( dataset )
            LI.lock( dataset , reason='staging' )

        if not sites_allowed:
            wfh.sendLog('transferor',"not possible site to run at")
            #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            sendLog('transferor',"%s has no possible sites to run at"%( wfo.name ),level='critical')
            continue

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) ))
        if 'LumiList' in wfh.request and wfh.request['LumiList']:
            ## augment with the lumi white list
            blocks = list(set( blocks + getDatasetBlocks( dataset, lumis= wfh.request['LumiList'] ) ))

        if blocks:
            print "Reading",len(blocks),"in block whitelist"

        can_go = True
        staging=False
        allowed=True
        primary_destinations = set()
        if primary:
            
            copies_needed_from_CPUh,CPUh = wfh.getNCopies()

            if talk:
                print wfo.name,'reads',', '.join(primary),'in primary'
            ## chop the primary dataset
            for prim in primary:
                ## keep track of what needs what
                workflow_dependencies[prim].add( wfo.id )

                max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority']))

                wfh.sendLog('transferor',"Would make %s  from cpu requirement %s"%( copies_needed_from_CPUh, CPUh))
                copies_needed = copies_needed_from_CPUh

                if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign, copies_needed)
                    
                    wfh.sendLog('transferor',"Maxed to %s by campaign configuration %s"%( copies_needed, wfh.request['Campaign']))


                ### new ways of making the whole thing
                destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks )
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1]
                ## the rest is places it is going to be
                prim_destination = [site for site in destinations.keys() if not site in prim_location]
                ## veto the site with no current disk space
                prim_destination = [site for site in prim_destination if SI.disk[site]]


                if len(prim_location) >= copies_needed:
                    wfh.sendLog('transferor',"The input is all fully in place at %s sites %s"%( len(prim_location), sorted(prim_location)))
                    continue
                copies_needed = max(0,copies_needed - len(prim_location))
                wfh.sendLog('transferor',"not counting existing copies ; now need %s"% copies_needed)
                copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names]

                latching_on_transfers = set()
                [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination]
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location]
                prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination]
                ## take out the ones that cannot receive transfers
                potential_destinations = len(prim_to_distribute)
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]

                ## do we want to restrict transfers if the number of vetoed sites is too large ?
                

                wfh.sendLog('transferor',"Could be going to: %s"% sorted( prim_to_distribute))
                if not prim_to_distribute or any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]):
                    ## means there is openings let me go
                    print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute]
                    #for site in sites_allowed:
                    #    #increment accross the board, regardless of real destination: could be changed
                    #    transfers_per_sites[site] += 1
                else:
                    if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                        wfh.sendLog('transferor', "Higher priority sample %s >= %s go-on over transfer slots available"%(wfh.request['RequestPriority'], in_transfer_priority))
                    else:
                        wfh.sendLog('transferor',"Not allowed to transfer more than %s per site at a time. Going overboard for %s"%( max_staging_per_site, sorted([site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site])))
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(Transfer.phedexid == int(latching)).first()
                    if not tfo:
                        tfo = session.query(Transfer).filter(Transfer.phedexid == -int(latching)).first()
                        
                    if not tfo:
                        tfo = Transfer( phedexid = latching)
                        tfo.workflows_id = []
                        session.add(tfo)
                    else:
                        tfo.phedexid = latching ## make it positive ever

                    if not wfo.id in tfo.workflows_id:
                        print "adding",wfo.id,"to",tfo.id,"with phedexid",latching
                        l = copy.deepcopy( tfo.workflows_id )
                        l.append( wfo.id )
                        tfo.workflows_id = l
                    if not options.test:
                        #session.commit()
                        pass
                    else:
                        session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0,copies_needed - min(copies_being_made))
                wfh.sendLog('transferor', "Not counting the copies being made ; then need %s"% copies_needed)                    
                if copies_needed == 0:
                    wfh.sendLog('transferor', "The input is either fully in place or getting in full somewhere with %s"% latching_on_transfers)
                    can_go = True
                    continue
                elif len(prim_to_distribute)==0:
                    wfh.sendLog('transferor', "We are going to need extra copies of %s, but no destinations seems available"%(prim))
                    sendLog('transferor', "We are going to need extra copies of %s, but no destinations seems available"%(prim),level='critical')
                    prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location]
                    prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                    

                if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the 
                    if not options or options.chop:
                        ### hard include the tape disk endpoint ?
                        #tapes = [site for site in  getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')]
                        chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks)
                        spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes)
                        transfer_sizes[prim] = sum(sizes)
                        if not spreading:
                            sendLog('transferor','cannot send %s to any site, it cannot fit anywhere'% prim, level='critical')
                            wfh.sendLog('transferor', "cannot send to any site. %s cannot seem to fit anywhere"%(prim))
                            staging=False
                            can_go = False
                    
                    else:
                        spreading = {} 
                        for site in prim_to_distribute: 
                            if blocks:
                                spreading[site]=blocks
                            else:
                                spreading[site]=[prim]
                        transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified
                    can_go = False
                    wfh.sendLog('transferor', "selected CE destinations %s"%(sorted( spreading.keys())))
                    for (site,items) in spreading.items():
                        all_transfers[site].extend( items )
                        transfers_per_sites[site] += 1
                        primary_destinations.add( site ) 
                else:
                    can_go = False
                    allowed = False

        if not allowed:
            wfh.sendLog('transferor', "Not allowed to move on with")
            continue


        if secondary:

            override_sec_destination = []
            if 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]:
                override_sec_destination  = CI.campaigns[wfh.request['Campaign']]['SecondaryLocation']
            if 'SecondaryLocation' in overide_parameters:
                override_sec_destination  = overide_parameters['SecondaryLocation']
            print wfo.name,'reads',', '.join(secondary),'in secondary'
            for sec in secondary:

                workflow_dependencies[sec].add( wfo.id )

                if True:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbaric, and does not show the correct picture workflow by workflow when they have different whitelists
                        destination_cache[sec],_ = getDatasetDestinations(url, sec) ## NO SITE WHITE LIST ADDED
                        #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])

                    ## limit to the site whitelist NOW
                    se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
                    destinations = dict([(k,v) for (k,v) in destination_cache[sec].items() if site in se_allowed])
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9]
                    sec_location = [site for (site,info) in destinations.items() if info['completion']>=95]
                    sec_destination = [site for site in destinations.keys() if not site in sec_location]
                else:
                    ## old style
                    presence = getDatasetPresence( url, sec )
                    sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions( url ,sec )
                    sec_destination = [site for site in subscriptions] 


                sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [site for site in sec_to_distribute if not  any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                if override_sec_destination:
                    ## intersect with where we want the PU to be
                    not_needed_anymore = list(set(sec_to_distribute) - set(override_sec_destination))
                    #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sendLog('transferor', "the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sec_to_distribute = list(set(sec_to_distribute) & set(override_sec_destination))

                if len( sec_to_distribute )>0:
                    print "secondary could go to",sorted(sec_to_distribute)
                    sec_size = dss.get( sec )
                    for site in sec_to_distribute:
                        site_se =SI.CE_to_SE(site)
                        if (SI.disk[site_se]*1024.) > sec_size:
                            wfh.sendLog('transferor', 'Sending %s to %s'%( sec, site ))
                            all_transfers[site].append( sec )
                            can_go = False
                        else:
                            print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size
                            if primary_destinations and site in primary_destinations:
                                #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))
                                sendLog('transferor', '%s is too big (%s) for %s (%s). %s will not be able to run there.'%( sec, sec_size, site_se, SI.disk[site_se]*1024, wfo.name), level='critical')
                                wfh.sendLog('transferor', '%s is too big (%s) for %s (%s). will not be able to run there.'%( sec, sec_size, site_se, SI.disk[site_se]*1024))
                else:
                    print "the secondary input does not have to be send to site"

        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                wfh.sendLog('transferor', "latches on existing transfers, and nothing else, settin staging")
                wfo.status = 'staging'
                needs_transfer+=1
            else:
                wfh.sendLog('transferor', "should just be assigned now to %s"%sorted(sites_allowed))
                wfo.status = 'staged'
            passing_along+=1
            wfh.sendLog('transferor', "setting %s status to %s"%(wfo.name,wfo.status))
            #session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                wfh.sendLog('transferor', "latches on existing transfers")
                if not options.test:
                    wfo.status = 'staging'
                    wfh.sendLog('transferor', "setting %s status to %s"%(wfo.name,wfo.status))
                    #session.commit()
            wfh.sendLog('transferor',"needs a transfer")
            needs_transfer+=1
            passing_along+=1

    if no_goes:
        #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes ))
        sendLog('transferor', "No go for \n"+"\n".join( sorted(no_goes) ), level='critical')

    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id=-1
    wf_id_in_prestaging=set()

    for (site,items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## site that do not want input datasets
        if site in SI.sites_veto_transfer: 
            print site,"does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        block_datasets = list(set([it.split('#')[0] for it in blocks]))
        datasets = [it for it in items_to_transfer if not '#' in it]

        details_text = "Making a replica to %s (CE) %s (SE) for"%( site, site_se)
        

        #print "\t",len(blocks),"blocks"
        ## remove blocks if full dataset is send out
        blocks = [block for block in blocks if not block.split('#')[0] in datasets]
        #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        #print "\t",len(datasets),"datasets"
        #print "\t",datasets
        details_text += '\n\t%d blocks'%len(blocks)
        details_text += '\n\t%d needed blocks for %s'%( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks]))))
        details_text += '\n\t%d datasets'% len(datasets)
        details_text += '\n\t%s'%sorted(datasets)
        
        items_to_transfer = blocks + datasets

        if execute:
            sendLog('transferor', details_text)
        else:
            print "Would make a replica to",site,"(CE)",site_se,"(SE) for"
            print details_text

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y','yes','go']:
                continue

        if execute:
            priority = 'normal'
            cds = [ds for ds in datasets+block_datasets if ds in max_priority]
            if cds and False: ## I don't think this is working. subscription should be updated on the fly and regularly for raising the priority if needed
                ## decide on an overall priority : that's a bit too large though
                if any([max_priority[ds]>=90000 for ds in cds]):
                    priority = 'high'
                elif all([max_priority[ds]<80000 for ds in cds]):
                    priority = 'low'
                
            result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority, approve=True)
        else:
            result= {'phedex':{'request_created' : []}}
            fake_id-=1



        if not result:
            sendLog('transferor','Could not make a replica request for items %s to site %s'%(items_to_transfer,site),level='critical')
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(Transfer.phedexid == int(phedexid)).first()
            if not new_transfer:
                new_transfer = session.query(Transfer).filter(Transfer.phedexid == -int(phedexid)).first()
            print phedexid,"transfer created"
            if not new_transfer:
                new_transfer = Transfer( phedexid = phedexid)
                session.add( new_transfer )                
            else:
                new_transfer.phedexid = phedexid ## make it positive again

            new_transfer.workflows_id = set()
            for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update( workflow_dependencies[transfering] )
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            #session.commit()
            ## auto approve it
            if execute:
                #approved = approveSubscription(url, phedexid, [site_se])
                ## it's been auto-approved above
                pass

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status!='staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting",tr_wf.name,"to staging"
        #session.commit()

    ## one big session commit at the end that everything went fine
    session.commit()

Exemple #11

0

Afficher le fichier

Fichier : assignor.py Projet : julianbadillo/WmAgentScripts

def assignor(url ,specific = None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return
    #if notRunningBefore( 'stagor' ): return
    if not componentInfo().check(): return

    CI = campaignInfo()
    SI = siteInfo()
    LI = lockInfo()
    NLI = newLockInfo()

    n_assigned = 0
    n_stalled = 0

    wfos=[]
    if specific:
        wfos = session.query(Workflow).filter(Workflow.name==specific).all()
    if not wfos:
        if specific:
            wfos = session.query(Workflow).filter(Workflow.status=='considered').all()
            wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all())
        wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all())

    for wfo in wfos:
        if specific:
            if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue
            #if not specific in wfo.name: continue
        print "\n\n",wfo.name,"\n\tto be assigned"
        wfh = workflowInfo( url, wfo.name)


        ## check if by configuration we gave it a GO
        if not CI.go( wfh.request['Campaign'] ) and not options.go:
            print "No go for",wfh.request['Campaign']
            n_stalled+=1
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] !='assignment-approved':
            if not options.test:
                print wfo.name,wfh.request['RequestStatus'],"setting away and skipping"
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name,wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version=wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                print "cannot decide on version number"
                n_stalled+=1
                continue

        (lheinput,primary,parent,secondary) = wfh.getIO()
        sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )

        print "Site white list",sorted(sites_allowed)

        if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist']

        if 'SiteBlacklist' in CI.parameters(wfh.request['Campaign']):
            print "Reducing the whitelist due to black list in campaign configuration"
            print "Removing",CI.parameters(wfh.request['Campaign'])['SiteBlacklist']
            sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfh.request['Campaign'])['SiteBlacklist']))

        blocks = []
        if 'BlockWhitelist' in wfh.request:
            blocks = wfh.request['BlockWhitelist']

        memory_allowed = SI.sitesByMemory( wfh.request['Memory'] )
        if memory_allowed!=None:
            print "sites allowing", wfh.request['Memory'],"are",sorted(memory_allowed)
            sites_allowed = list(set(sites_allowed) & set(memory_allowed))

        print "Allowed",sorted(sites_allowed)
        secondary_locations=None
        for sec in list(secondary):
            presence = getDatasetPresence( url, sec )
            print sec
            print json.dumps(presence, indent=2)
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>90.]
            one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations==None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations]
            
        print "From secondary requirement, now Allowed",sorted(sites_allowed)
        sites_all_data = copy.deepcopy( sites_allowed )
        sites_with_data = copy.deepcopy( sites_allowed )
        sites_with_any_data = copy.deepcopy( sites_allowed )
        primary_locations = None
        available_fractions = {}
        for prim in list(primary):
            presence = getDatasetPresence( url, prim , only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] =  getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks)
            #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]]
            sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]]
            sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()]
            print "Holding the data but not allowed",list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))
            if primary_locations==None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(set(primary_locations) & set(presence.keys() ))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites=[]
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            elif primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            else:
                opportunistic_sites = []
            print "We could be running at",sorted(opportunistic_sites),"in addition"
            if any([osite in SI.sites_not_ready for osite in opportunistic_sites]):
                print "One of the destination site is in downtime"
                down_time = True
                ## should this be send back to considered ?
                

        """
        if available_fractions and not all([available>=1. for available in available_fractions.values()]):
            print "The input dataset is not located in full over sites"
            print json.dumps(available_fractions)
            if not options.test and not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known:
                    sendEmail( "cannot be assigned","%s is not full over sites \n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append( wfo.name )
                    open('cannot_assign.json','w').write(json.dumps( known, indent=2))
                n_stalled+=1
                continue ## skip skip skip
        """

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted,cpuh = wfh.getNCopies()
        
        if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]):
            print "The input dataset is not available",copies_wanted,"times, only",available_fractions.values()
            if down_time:
                wfo.status = 'considered'
                session.commit()
                print "sending back to considered because of site downtime, instead of waiting"
                sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                continue
                #pass

            print json.dumps(available_fractions)
            if not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known:
                    sendEmail( "cannot be assigned","%s is not sufficiently available. Probably phedex information lagging behind. \n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append( wfo.name )
                    open('cannot_assign.json','w').write(json.dumps( known, indent=2))
                n_stalled+=1
                continue

        ## default back to white list to original white list with any data
        print "Allowed",sites_allowed
        sites_allowed = sites_with_any_data
        print "Selected for any data",sites_allowed

        if options.restrict:
            print "Allowed",sites_allowed
            sites_allowed = sites_with_any_data
            print "Selected",sites_allowed
        else:
            if set(sites_with_data) != set(sites_allowed):
                ## the data is not everywhere we wanted to run at : enable aaa
                print "Sites with 90% data not matching site white list (block choping!)"
                print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?"
                print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data))
                #options.useSiteListAsLocation = True
                #print "Not commissioned yet"
                #continue
            #print "We could be running at",opportunistic_sites,"in addition"
            ##sites_allowed = list(set(sites_allowed+ opportunistic_sites))

        if not len(sites_allowed):
            print wfo.name,"cannot be assign with no matched sites"
            sendEmail( "cannot be assigned","%s has no whitelist"%(wfo.name))
            n_stalled+=1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]


        print "Placing the output on", sites_out
        parameters={
            'SiteWhitelist' : sites_allowed,
            #'CustodialSites' : sites_custodial,
            'NonCustodialSites' : sites_out,
            'AutoApproveSubscriptionSites' : list(set(sites_out)),
            'AcquisitionEra' : wfh.acquisitionEra(),
            'ProcessingString' : wfh.processingString(),
            'MergedLFNBase' : '/store/mc', ## to be figured out
            'ProcessingVersion' : version,
            }


        ## plain assignment here
        team='production'
        if options and options.team:
            team = options.team

        if "T2_US_UCSD" in sites_with_data and random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]):
            ## consider SDSC
            parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC']
            parameters['useSiteListAsLocation'] = True
            team = 'allocation-based'
            sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**'])
            
        if wfh.request['Campaign']=='RunIIWinter15GS' and random.random() < -1.0:
            parameters['SiteWhitelist'] = ['T3_US_SDSC']
            team = 'allocation-based'
            sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**'])
        

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v=getattr(options,key)
                if v!=None:
                    if ',' in v: parameters[key] = filter(None,v.split(','))
                    else: parameters[key] = v

        ## pick up campaign specific assignment parameters
        parameters.update( CI.parameters(wfh.request['Campaign']) )

        if not options.test:
            parameters['execute'] = True

        if not wfh.checkWorkflowSplitting():
            print "Falling back to event splitting."
            parameters['SplittingAlgorithm'] = 'EventBased'
            sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name)
            ## needs to go to event based ? fail for now
            #print "Falling back to event splitting ?"
            #sendEmail("Cannot assign","the workflow %s is too heavy to be processed as it is. Could fallback to EventBased splitting"%wfo.name)
            #continue

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents/(reqJobs*1.4))
                lumisPerJob = int(eventsPerJob/eventsPerLumi)
                if lumisPerJob==0:
                    print "There is no go for assigning that request without event splitting"
                    sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    print "need to go down to",eventsPerJob,"events per job"
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        print "need to go down to",lumisPerJob,"in assignment"
                        sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        print "the regular splitting should work for",pstring
                        sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)

        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)


        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned+=1
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo( url, wfo.name)
                    (_,prim,_,sec) = new_wfi.getIO()
                    for output in new_wfi.request['OutputDatasets']:
                        ## lock all outputs flat
                        NLI.lock( output )
                    for site in [SI.CE_to_SE(site) for site in sites_allowed]:
                        for output in new_wfi.request['OutputDatasets']:
                            LI.lock( output, site, 'dataset in production')
                        for primary in prim:
                            LI.lock( primary, site, 'dataset used in input')
                        for secondary in sec:
                            LI.lock( secondary, site, 'required for mixing' )

                except Exception as e:
                    print "fail in locking output"
                    print str(e)
                    sendEmail("failed locking of output",str(e))


            else:
                print "ERROR could not assign",wfo.name
        else:
            pass
    print "Assignment summary:"
    print "Assigned",n_assigned
    print "Stalled",n_stalled

Exemple #12

0

Afficher le fichier

Fichier : transferor.py Projet : julianbadillo/WmAgentScripts

def transferor(url ,specific = None, talk=True, options=None):
    if userLock():   return
    if duplicateLock():  return

    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    LI = lockInfo()
    NLI = newLockInfo()
    mcm = McMClient(dev=False)
    dss = DSS()

    print "counting all being handled..."
    being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all())
    being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all())

    max_to_handle = options.maxworkflows
    max_to_transfer = options.maxstaging

    allowed_to_handle = max(0,max_to_handle - being_handled)
    allowed_to_transfer = max(0,max_to_transfer - being_transfered)
    wf_buffer = 5
    if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer
        print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer"
    else:
        print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer"

    if allowed_to_transfer <= wf_buffer:
        print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer"
    else:
        print being_transfered,"already being transfered",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer"

    print "... done"

    all_transfers=defaultdict(list)
    needing_locks=defaultdict(list)
    workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset
    wfs_and_wfh=[]
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(Workflow.status=='considered').all():
        print "\t",wfo.name
        if specific and not specific in wfo.name: continue
        cache_r =filter(lambda d:d['RequestName']==wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) )
        else:
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) )
    print "... done"

    transfers_per_sites = defaultdict(int)
    input_sizes = {}
    input_cput = {}
    input_st = {}
    ## list the size of those in transfer already
    in_transfer_priority=0
    min_transfer_priority=100000000
    print "getting all wf in staging ..."
    for wfo in session.query(Workflow).filter(Workflow.status=='staging').all():
        wfh = workflowInfo( url, wfo.name, spec=False)
        (lheinput,primary,parent,secondary) = wfh.getIO()
        sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list
            transfers_per_sites[site] += 1 
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:  
            input_sizes[prim] = dss.get( prim )
            print "\t",wfo.name,"needs",input_sizes[prim],"GB"
        in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority']))
        min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority']))

    print "... done"
    print "Max priority in transfer already",in_transfer_priority
    print "Min priority in transfer already",min_transfer_priority
    print "transfers per sites"
    print json.dumps( transfers_per_sites, indent=2)
    in_transfer_already = sum(input_sizes.values())
    cput_in_transfer_already = sum(input_cput.values())
    st_in_transfer_already = sum(input_st.values())
    # shuffle first by name
    random.shuffle( wfs_and_wfh )
    #sort by priority higher first
    wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True)
    

    ## list the size of all inputs
    print "getting all input sizes ..."
    for (wfo,wfh) in wfs_and_wfh:
        (_,primary,_,_) = wfh.getIO()
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:
            input_sizes[prim] = dss.get( prim )
    print "... done"

    cput_grand_total = sum(input_cput.values())
    cput_to_transfer = cput_grand_total - cput_in_transfer_already
    st_grand_total = sum(input_st.values())
    st_to_transfer = st_grand_total - st_in_transfer_already
    print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already
    print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer
    print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots())
    print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots())
    print "%15.4f [h] worth of theoritical system time in transfer"%( st_in_transfer_already )
    print "%15.4f [h] worth of theoritical system time is the current requested transfer load"%( st_to_transfer )


    grand_total =  sum(input_sizes.values()) 
    to_transfer = grand_total  - in_transfer_already
    grand_transfer_limit = options.maxtransfer 
    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered"%in_transfer_already
    print "%15.4f GB is the current requested transfer load"%to_transfer
    print "%15.4f GB is the global transfer limit"%grand_transfer_limit
    print "%15.4f GB is the available limit"%transfer_limit


    max_staging_per_site = options.maxstagingpersite
                    
    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer=0 ## so that we can count'em
    passing_along = 0
    transfer_sizes={}
    went_over_budget=False
    destination_cache = {}
    for (wfo,wfh) in wfs_and_wfh:
        print wfh.request['RequestPriority']
        print wfo.name,"to be transfered"
        #wfh = workflowInfo( url, wfo.name)

        (_,primary,_,_) = wfh.getIO()
        this_load=sum([input_sizes[prim] for prim in primary])
        if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ):
            if went_over_budget:
                print "Transfer has gone over bubget."
            else:
                print "Transfer will go over bubget."
            print "%15.4f GB this load"%this_load
            print "%15.4f GB already this round"%sum(transfer_sizes.values())
            print "%15.4f GB is the available limit"%transfer_limit
            went_over_budget=True
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget"
            else:
                if not options.go: 
                    print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop"
                    continue


        ## throtlle by campaign go
        if not CI.go( wfh.request['Campaign'] ):
            print "No go for",wfh.request['Campaign']
            if not options.go: 
                sendEmail("no go for managing","No go for "+wfh.request['Campaign'])
                continue

        ## check if the batch is announced

        def check_mcm(wfn):
            announced=False
            is_real=False
            if not wfn.startswith('pdmvserv'):
                is_real = True
            try:
                for b in mcm.getA('batches',query='contains=%s'% wfo.name):
                    is_real = True
                    if b['status']=='announced': 
                        announced=True 
                        break
            except:
                try:
                    for b in mcm.getA('batches',query='contains=%s'% wfo.name):
                        is_real = True
                        if b['status']=='announced': 
                            announced=True 
                            break
                except:
                    print "could not get mcm batch announcement, assuming not real"
            return announced,is_real

        if not use_mcm:
            announced,is_real = False,True
        else:
            announced,is_real = check_mcm( wfo.name )

        if not announced:
            print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?"
            
        if not is_real:
            print wfo.name,"does not appear to be genuine."
            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue

        ## check on a grace period
        injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.)
        now = time.mktime(time.gmtime()) / (60.*60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced: 
                print "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time)
                continue


        if passing_along >= allowed_to_handle:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority:
                ## higher priority, and not only this priority being transfered
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle
            else:
                print "Not allowed to pass more than",max_to_handle,"at a time. Currently",being_handled,"handled, and adding",passing_along
                if not options.go: break

        if this_load and needs_transfer >= allowed_to_transfer:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority:
                ## higher priority, and not only this priority being transfered
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_transfer
            else:
                print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"transfering, and adding",needs_transfer
                if not options.go: continue


        (lheinput,primary,parent,secondary) = wfh.getIO()
        for dataset in list(primary)+list(parent)+list(secondary):
            ## lock everything flat
            NLI.lock( dataset )

        if options and options.tosites:
            sites_allowed = options.tosites.split(',')
        else:
            sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )

        if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist']

        if 'SiteBlacklist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfh.request['Campaign'])['SiteBlacklist']))

        ## reduce right away to sites in case of memory limitation
        memory_allowed = SI.sitesByMemory( wfh.request['Memory'] )
        if memory_allowed!=None:
            print "sites allowing", wfh.request['Memory'],"are",memory_allowed
            sites_allowed = list(set(sites_allowed) & set(memory_allowed))

        if not sites_allowed:
            print wfo.name,"has no possible sites to run at"
            print "available for",wfh.request['Memory'],"are",memory_allowed
            sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            continue

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## should make the block selection here
            pass

        if 'LumiList' in wfh.request and wfh.request['LumiList']:
            ## same, we could be doing the white list here too
            pass


        if blocks:
            print "Reading",len(blocks),"in whitelist"

        can_go = True
        staging=False
        allowed=True
        if primary:
            
            copies_needed_from_CPUh,CPUh = wfh.getNCopies()

            if talk:
                print wfo.name,'reads',', '.join(primary),'in primary'
            ## chope the primary dataset 
            for prim in primary:
                max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority']))
                sites_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                print "Sites allowed minus the vetoed transfer"
                print sorted(sites_allowed)

                copies_needed_from_site = int(0.35*len(sites_allowed))+1 ## should just go for a fixed number based if the white list grows that big
                print "Would make",copies_needed_from_site,"copies from site white list"
                copies_needed = copies_needed_from_site

                print "Would make",copies_needed_from_CPUh,"from cpu requirement",CPUh
                copies_needed = copies_needed_from_CPUh

                if options.maxcopy>0:
                    ## stop maxing things out ??
                    #copies_needed = min(options.maxcopy,copies_needed)
                    #print "Maxed to",copies_needed
                    if copies_needed_from_CPUh > options.maxcopy:
                        sendEmail('An example of more than three copies','for %s it could have been beneficial to make %s copies'%( wfo.name, copies_needed_from_CPUh))

                
                if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign,copies_needed_from_site)
                    print "Maxed to",copies_needed,"by campaign configuration",wfh.request['Campaign']

                ## remove the sites that do not want transfers                
                workflow_dependencies[prim].add( wfo.id )

                #####################################
                ###### JR 3/8/15 #### deprecating this
                """
                presence = getDatasetPresence( url, prim , within_sites = [SI.CE_to_SE(site) for site in sites_allowed])
                prim_location = [site for site,pres in presence.items() if pres[0]==True]
                prim_parts = [site for site,pres in presence.items() if pres[0]==False]
                if len(prim_location) >= copies_needed:
                    print "The output is all fully in place at",len(prim_location),"sites"
                    continue
                # reduce the number of copies required by existing full copies
                copies_needed = max(0,copies_needed - len(prim_location))
                print "now need",copies_needed
                subscriptions = listSubscriptions( url , prim , sites_allowed )
                prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])]))
                ## remove the subscription where the dataset is in parts at
                #prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']]) and not site in prim_parts]))
                ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place
                prim_destination = [site for site in prim_destination if not site in prim_location]
                ## add transfer dependencies
                latching_on_transfers =  list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])]))
                print latching_on_transfers
                """
                ###### JR 3/8/15 #### deprecating this
                #####################################


                ### new ways of making the whole thing
                destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks )
                #destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='DataOps')
                #anaops_destinations,anaops_all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='AnalysisOps' )
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1]
                ## the rest is places it is going to be
                prim_destination = [site for site in destinations.keys() if not site in prim_location]
                ## need to take out the transfer veto
                prim_destination = [site for site in prim_destination if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                for dsite in prim_destination:
                    needing_locks[dsite].append( prim )

                if len(prim_location) >= copies_needed:
                    print "The output is all fully in place at",len(prim_location),"sites",prim_location
                    continue
                copies_needed = max(0,copies_needed - len(prim_location))
                print "now need",copies_needed
                
                copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names]

                latching_on_transfers = set()
                [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination]
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location]
                prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]

                if any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]):
                    ## means there is openings let me go
                    print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute]
                    for site in sites_allowed:
                        #increment accross the board, regardless of real destination: could be changed
                        transfers_per_sites[site] += 1
                else:
                    if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                        print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over transfer slots available"
                    else:
                        print "Not allowed to transfer more than",max_staging_per_site," per site at a time. Going overboard for",[site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site]
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first()
                    if not tfo:
                        tfo = Transfer( phedexid = latching)
                        tfo.workflows_id = []
                        session.add(tfo)
                            
                    if not wfo.id in tfo.workflows_id:
                        print "adding",wfo.id,"to",tfo.id,"with phedexid",latching
                        l = copy.deepcopy( tfo.workflows_id )
                        l.append( wfo.id )
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0,copies_needed - min(copies_being_made))
                print "then need",copies_needed
                if copies_needed == 0:
                    print "The output is either fully in place or getting in full somewhere with",latching_on_transfers
                    can_go = True
                    continue

                if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the 
                    if not options or options.chop:
                        chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks)
                        spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes)
                        transfer_sizes[prim] = sum(sizes)
                    else:
                        spreading = {} 
                        for site in prim_to_distribute: 
                            if blocks:
                                spreading[site]=blocks
                            else:
                                spreading[site]=[prim]
                        transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified
                    can_go = False
                    print "selected CE destinations",spreading.keys()
                    for (site,items) in spreading.items():
                        all_transfers[site].extend( items )

        if not allowed:
            print "Not allowed to move on with",wfo.name
            continue


        if secondary:
            if talk:
                print wfo.name,'reads',', '.join(secondary),'in secondary'
            for sec in secondary:
                workflow_dependencies[sec].add( wfo.id )

                if False:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbbaric, and does not show the correct picture on workflow by workflow with different whitelist
                        destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])
                    destinations = destination_cache[sec]
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9]
                    sec_location = [site for (site,info) in destinations.items() if info['completion']>=95]
                    sec_destination = [site for site in destinations.keys() if not site in sec_location]
                else:
                    ## old style
                    presence = getDatasetPresence( url, sec )
                    sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions( url ,sec )
                    sec_destination = [site for site in subscriptions] 

                for site in sec_location:
                    needing_locks[site].append( sec )
                for site in sec_destination:
                    needing_locks[site].append( sec )

                sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [site for site in sec_to_distribute if not  any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                if len( sec_to_distribute )>0:
                    sec_size = dss.get( sec )
                    for site in sec_to_distribute:
                        site_se =SI.CE_to_SE(site)
                        if (SI.disk[site_se]*1024.) > sec_size:
                            all_transfers[site].append( sec )
                            can_go = False
                        else:
                            print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size
                            #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))

        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                print wfo.name,"latches on existing transfers, and nothing else"
                wfo.status = 'staging'
                needs_transfer+=1
            else:
                print wfo.name,"should just be assigned NOW to",sites_allowed
                wfo.status = 'staged'
            passing_along+=1
            print "setting status to",wfo.status
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                print wfo.name,"latches on existing transfers"
                if not options.test:
                    wfo.status = 'staging'
                    print "setting status to",wfo.status
                    session.commit()
            print wfo.name,"needs a transfer"
            needs_transfer+=1
            passing_along+=1

    print "accumulated locks of dataset in place"
    print json.dumps(needing_locks, indent=2)
    for site,items in needing_locks.items():
        for item in items:
            LI.lock( item, SI.CE_to_SE(site), 'usable input')
        
    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id=-1
    wf_id_in_prestaging=set()

    for (site,items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## site that do not want input datasets
        if site in SI.sites_veto_transfer: 
            print site,"does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        datasets = [it for it in items_to_transfer if not '#' in it]

        if execute:
            print "Making a replica to",site,"(CE)",site_se,"(SE) for"
        else:
            print "Would make a replica to",site,"(CE)",site_se,"(SE) for"

        print "\t",len(blocks),"blocks"
        ## remove blocks if full dataset is send out
        blocks = [block for block in blocks if not block.split('#')[0] in datasets]
        print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        print "\t",len(datasets),"datasets"
        print "\t",datasets
        items_to_transfer = blocks + datasets

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y','yes','go']:
                continue

        if execute:
            result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal')
            ## make use of max_priority dataset:priority to set the subscriptions priority
            """
            ## does not function
            once = True
            for item in items_to_transfer:
                bds = item.split('#')[0]
                if max_priority[bds] >= 90000:
                    if once:
                        w=10
                        print "waiting",w,"s before raising priority"
                        time.sleep(w)
                        once=False
                    ## raise it to high priority
                    print item,"subscription priority raised to high at",site_se
                    #print "This does not work yet properly it seems"
                    print updateSubscription(url, site_se, item, priority='high')
            """
            #for item in list(set([it.split('#')[0] for it in items_to_transfer])):
            for item in items_to_transfer:
                LI.lock( item, site_se, 'pre-staging')
        else:
            #result= {'phedex':{'request_created' : [{'id' : fake_id}]}}
            result= {'phedex':{'request_created' : []}}
            fake_id-=1



        if not result:
            print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first()
            print phedexid,"transfer created"
            if not new_transfer:
                new_transfer = Transfer( phedexid = phedexid)
                session.add( new_transfer )                
            new_transfer.workflows_id = set()
            for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update( workflow_dependencies[transfering] )
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status!='staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting",tr_wf.name,"to staging"
        session.commit()

Exemple #13

0

Afficher le fichier

def cleanor(url, specific=None):
    """Deprecated input-dataset cleanup pass; currently a no-op.

    The function prints "Deprecated" and returns immediately, so every
    statement after the first ``return`` is unreachable. The dead body is
    kept (and documented) for reference only.

    What the unreachable body does, as far as this section shows:

    * loops over Workflow rows whose status is 'done' (optionally limited
      to names containing ``specific``);
    * skips workflows whose campaign sets 'clean-in' to False, that
      finished less than 2 days ago, whose input has no custodial location,
      or whose input is still used by another non-final request;
    * moves workflows with no input dataset, or with a 'MinBias' input,
      straight to status 'clean';
    * collects, per site, the (dataset, size) pairs that could be removed,
      excluding sites where AnalysisOps holds the dataset and, for
      'keep-one' campaigns, one randomly chosen full copy;
    * prints a per-site summary and releases the corresponding locks
      through ``LI.release``.

    Parameters:
        url: passed through to the request-manager / data-location helpers
            (getWorkLoad, getDatasetPresence, findCustodialLocation,
            getWorkflowByInput).
        specific: optional substring; when set, only workflows whose name
            contains it are considered.

    Returns:
        None.
    """
    print "Deprecated"
    return

    ## ---- everything below this point is unreachable (see the return above) ----

    if duplicateLock() : return 

    ## site -> list of (dataset, total_size) tuples that could be deleted there
    delete_per_site = {}
    ## not referenced anywhere else in the visible part of this function
    do_not_autoapprove = []#'T2_FR_CCIN2P3']
    SI = siteInfo()
    CI = campaignInfo()
    LI = lockInfo()

    ## number of workflows that reached the presence lookup; only incremented, the cut-off using it is commented out
    counts=0
    for wfo in session.query(Workflow).filter(Workflow.status == 'done').all():
        keep_a_copy = False
        if specific and not specific in wfo.name: continue
        ## what was in input 
        wl = getWorkLoad(url,  wfo.name )

        ## campaign configuration can opt out of input cleaning altogether
        if 'Campaign' in wl and wl['Campaign'] in CI.campaigns and 'clean-in' in CI.campaigns[wl['Campaign']] and CI.campaigns[wl['Campaign']]['clean-in']==False:
            print "Skipping cleaning on input for campaign",wl['Campaign'], "as per campaign configuration"
            continue

        dataset= 'N/A'
        if 'InputDataset' in wl:
            dataset = wl['InputDataset']

        print dataset,"in input"
        #print json.dumps(wl, indent=2)
        ## keep only the transitions into a "finished" status; the last one is used as the completion time
        announced_log = filter(lambda change : change["Status"] in ["closed-out","normal-archived","announced"],wl['RequestTransition'])
        if not announced_log: 
            print "Cannot figure out when",wfo.name,"was finished"
            continue
        ## both timestamps are divided by the number of seconds in a day, i.e. expressed in days
        ## NOTE(review): time.mktime() interprets its argument as local time, so feeding it
        ## time.gmtime() shifts `now` by the local UTC offset -- confirm this is acceptable
        now = time.mktime(time.gmtime()) / (60*60*24.)
        ## presumably 'UpdateTime' is in epoch seconds -- TODO confirm
        then = announced_log[-1]['UpdateTime'] / (60.*60.*24.)
        ## grace period: do not touch inputs of workflows finished less than 2 days ago
        if (now-then) <2:
            print "workflow",wfo.name, "finished",now-then,"days ago. Too fresh to clean"
            continue
        else:
            print "workflow",wfo.name,"has finished",now-then,"days ago."

        if not 'InputDataset' in wl: 
            ## should we set status = clean ? or something even further
            print "passing along",wfo.name,"with no input"
            wfo.status = 'clean'
            session.commit()
            continue

        ## inputs whose name contains 'MinBias' are never cleaned; the workflow is just moved on
        if 'MinBias' in dataset:
            print "Should not clean anything using",dataset,"setting status further"
            wfo.status = 'clean'
            session.commit()
            continue

        total_size = getDatasetSize( dataset ) ## in Gb        
        #if counts> 20:            break
        counts+=1
        ## find any location it is at
        ## two lookups: one for group "DataOps" and one with an empty group string
        our_presence = getDatasetPresence(url, dataset, complete=None, group="DataOps")
        also_our_presence = getDatasetPresence(url, dataset, complete=None, group="")

        ## is there a custodial !!!
        custodials = findCustodialLocation(url, dataset)
        if not len(custodials):
            print dataset,"has no custodial site yet, excluding from cleaning"
            continue

        ## find out whether it is still in use
        using_the_same = getWorkflowByInput(url, dataset, details=True)
        conflict=False
        for other in using_the_same:
            ## the workflow itself and resubmissions do not count as other users
            if other['RequestName'] == wfo.name: continue
            if other['RequestType'] == 'Resubmission': continue
            ## any request whose status is outside this list blocks the cleaning
            if not other['RequestStatus'] in ['announced','normal-archived','aborted','rejected','aborted-archived','aborted-completed','rejected-archived','closed-out','None',None,'new']:
                print other['RequestName'],'is in status',other['RequestStatus'],'preventing from cleaning',dataset
                conflict=True
                break
            ## another user belonging to a campaign with 'clean-in' False also blocks the cleaning
            if 'Campaign' in other and other['Campaign'] in CI.campaigns and 'clean-in' in CI.campaigns[other['Campaign']] and CI.campaigns[other['Campaign']]['clean-in']==False:
                print other['RequestName'],'is in campaign',other['Campaign']
                conflict = True
                break
        if conflict: continue
        print "other statuses:",[other['RequestStatus'] for other in using_the_same if other['RequestName'] != wfo.name]


        ## find all disks
        ## a site qualifies when its name starts with 'T2' or ends with 'Disk'
        ## NOTE(review): a site present in both lookups ends up twice in this list
        to_be_cleaned = filter(lambda site : site.startswith('T2') or site.endswith('Disk') ,our_presence.keys())
        to_be_cleaned.extend( filter(lambda site : site.startswith('T2') or site.endswith('Disk') ,also_our_presence.keys()))
        print to_be_cleaned,"for",total_size,"GB"

        anaops_presence = getDatasetPresence(url, dataset, complete=None, group="AnalysisOps")
        own_by_anaops = anaops_presence.keys()
        print "Own by analysis ops and vetoing"
        print own_by_anaops
        ## need to black list the sites where there is a copy of analysis ops
        to_be_cleaned = [site for site in to_be_cleaned if not site in own_by_anaops ]

        ## keep one copy out there
        if 'Campaign' in wl and wl['Campaign'] in CI.campaigns and 'keep-one' in CI.campaigns[wl['Campaign']] and CI.campaigns[wl['Campaign']]['keep-one']==True:
            print "Keeping a copy of input for",wl['Campaign']
            keep_a_copy = True
            
        if keep_a_copy:
            keep_at = None
            ## presence values are unpacked as 2-element pairs; an entry counts as a full copy
            ## when its first element is truthy. T1 sites are preferred, T2 sites are the fallback
            full_copies = [site for (site,(there,_)) in our_presence.items() if there and site.startswith('T1')]
            full_copies.extend( [site for (site,(there,_)) in also_our_presence.items() if there and site.startswith('T1')] )
            if not full_copies:
                full_copies = [site for (site,(there,_)) in our_presence.items() if there and site.startswith('T2')]
                full_copies.extend( [site for (site,(there,_)) in also_our_presence.items() if there and site.startswith('T2')] )

            if full_copies:
                keep_at = random.choice( full_copies )
                
            if not keep_at:
                print "We are enable to find a place to keep a full copy of",dataset,"skipping"
                continue
            else:
                ## keeping that copy !
                print "Keeping a full copy of",dataset,"at",keep_at,"not setting the status further"
                ## NOTE(review): list.remove() raises ValueError when keep_at is not in
                ## to_be_cleaned (e.g. a T1 name not ending in 'Disk', or a site vetoed by the
                ## AnalysisOps filter above) and only drops the first of any duplicate entries,
                ## so the kept site could still be queued for release below -- confirm
                to_be_cleaned.remove( keep_at )
        else:
            wfo.status = 'clean'

        ## collect delete request per site
        for site in to_be_cleaned :
            if not site in delete_per_site: delete_per_site[site] = []
            ## avoid listing the same dataset twice for one site
            if not dataset in [existing[0] for existing in delete_per_site[site]]:
                delete_per_site[site].append( (dataset, total_size) )
        
        ## persists the status change made above, if any
        session.commit()

    #open('deletes.json','w').write( json.dumps(delete_per_site,indent=2) )

    print json.dumps(delete_per_site, indent=2)
    print "\n\n ------- \n\n"
    ## unroll the deletion per site
    ## maybe find the optimum site/dataset dataset/site to limit the number of ph requests
    for site in delete_per_site:
        dataset_list = [info[0] for info in delete_per_site[site]]
        ## sizes were collected in GB (see total_size); dividing by 1024 gives the TB figure printed below
        size_removal = sum([info[1] for info in delete_per_site[site]]) / 1024.
        if site in SI.disk:
            free = SI.disk[site]
            print site,"has",size_removal,"TB of potential cleanup.",free,"TB available."
        else:
            print site,"has",size_removal,"TB of potential cleanup. no info on available."

        print "\t",','.join(dataset_list)
    
    ## make deletion requests
    ## in the visible code no deletion request is issued: only the locks are released
    for site in delete_per_site:
        site_datasets = [info[0] for info in delete_per_site[site]]
        ## computed but not used in the visible part of this function
        is_tape = any([v in site for v in ['MSS','Export','Buffer'] ])
        #comments="Cleanup input after production. DataOps will take care of approving it."
        #if is_tape:
        #    comments="Cleanup input after production."
        for item in site_datasets:
            LI.release( item, site, 'cleanup of input after production')