Example #1
def summary():
    ## not used anymore IMO
    RDI = remainingDatasetInfo()
    si = siteInfo()
    remainings = {}
    for site in RDI.sites():
        load = RDI.get(site)
        if si.disk[site]: continue  ## only report sites with no free disk left
        print site,si.disk[site],"[TB] free",si.quota[site],"[TB] quota"

        if not load: continue
        tags = ['pilup','input','output','lock','unlock','tape','stuck-tape','missing-tape']
        for tag in tags:
            v = sum([ info['size'] for ds,info in load.items() if tag in info['reasons']]) / 1024.
            print "\t %10f [TB] remaining because of %s"%(v,tag)
Example #2
def parse(options):
    RDI = remainingDatasetInfo()
    UC = unifiedConfiguration()

    spec_site = filter(None, options.site.split(','))

    ## fetching global information
    locks = [
        l.item.split('#')[0]
        for l in session.query(Lock).filter(Lock.lock == True).all()
    ]
    waiting = {}
    stuck = {}
    missing = {}
    si = siteInfo()
    sis = si.disk.keys()
    random.shuffle(sis)
    n_site = options.nsites
    i_site = 0
    threads = []
    for site in sis:
        if spec_site and not site in spec_site:
            continue
        space = si.disk[site]
        if space and not spec_site:
            continue
        if n_site and i_site > n_site:
            break
        i_site += 1

        print site, "has", space, "[TB] left out of", si.quota[site]
        threads.append(
            SiteBuster(site=site,
                       UC=UC,
                       RDI=RDI,
                       SI=si,
                       locks=copy.deepcopy(locks),
                       waiting=copy.deepcopy(waiting),
                       stuck=copy.deepcopy(stuck),
                       missing=copy.deepcopy(missing),
                       options=copy.deepcopy(options)))
    run_threads = ThreadHandler(threads=threads,
                                label='Site Threads',
                                n_threads=5,
                                start_wait=0,
                                timeout=None,
                                verbose=True)
    run_threads.run()
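parse() fans each eligible site out to a SiteBuster thread, handing every worker deep copies of the shared bookkeeping structures (locks, waiting, stuck, missing) so that no two threads mutate the same objects. Below is a simplified stand-in for that pattern using only the standard threading module; SiteBuster and ThreadHandler belong to the Unified toolkit and are not reproduced, and the names here are illustrative:

import copy
import threading

def bust_site(site, waiting):
    ## each worker mutates only its own private copy of the dict
    waiting[site] = 'scanned'
    print site, waiting

shared_waiting = {}
workers = [threading.Thread(target=bust_site,
                            args=(site, copy.deepcopy(shared_waiting)))
           for site in ['T1_US_FNAL', 'T2_CH_CERN']]
for t in workers:
    t.start()
for t in workers:
    t.join()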
Example #3
def main():
    url = 'cmsweb.cern.ch'
    url_tb = 'cmsweb-testbed.cern.ch'
    
    # Example: python assign.py -w amaltaro_RVZTT_120404_163607_6269
    # -t testbed-relval -s T1_US_FNAL -e CMSSW_6_0_0_pre1_FS_TEST_WMA -p v1 -a
    # relval -l /store/backfill/1
    usage = "usage: %prog [options] [WORKFLOW]"
    
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-t', '--team', help='Type of Requests', dest='team', default='production')
    parser.add_option('-s', '--sites', help=' "t1" for Tier-1\'s and "t2" for Tier-2\'s', dest='sites')
    parser.add_option('--special',  help='Use it for special workflows. You also have to change the code according to the type of WF', dest='special')
    parser.add_option('-r', '--replica', action='store_true', dest='replica', default=False, help='Adds a _Disk Non-Custodial Replica parameter')
    parser.add_option('-p', '--procversion', help='Processing Version, if empty it will leave the processing version that comes by default in the request', dest='procversion')
    parser.add_option('-a', '--activity', help='Dashboard Activity (reprocessing, production or test), if empty will set reprocessing as default', dest='activity')
    parser.add_option( '--xrootd', help='Assign with TrustSitelists=True (allows xrootd capabilities)',
                      action='store_true', dest='xrootd')
    parser.add_option('--no_xrootd', help='Assign with TrustSitelists=False',
                      action='store_false', dest='xrootd')
    parser.add_option('--secondary_xrootd', help='Assign with TrustPUSitelists=True (allows xrootd capabilities)',
                      action='store_true', dest='secondary_xrootd')
    parser.add_option('--no_secondary_xrootd', help='Assign with TrustPUSitelists=False',
                      action='store_false', dest='secondary_xrootd')
    parser.add_option('-l', '--lfn', help='Merged LFN base', dest='lfn')
    parser.add_option('-v', '--verbose', help='Verbose', action='store_true', default=False, dest='verbose')
    parser.add_option('--testbed', help='Assign in testbed', action='store_true', default=False, dest='testbed')
    parser.add_option('--test', action="store_true", help='Nothing is injected, only print information about workflow and Era', dest='test')
    parser.add_option('-f', '--file', help='Text file with a list of workflows. If this option is used, the same settings will be applied to all workflows', dest='file')
    parser.add_option('-w', '--workflow', help='Workflow name, or comma-separated list', dest='workflow')
    parser.add_option('-m', '--memory', help='Set the Memory parameter to the workflow', dest='memory', default=None)
    parser.add_option('--lumisperjob',help='Set the number of lumis per job', default=None, type=int)
    parser.add_option('--maxmergeevents',help='Set the number of event to merge at max', default=None, type=int)
    parser.add_option('-c', '--multicore', help='Set the multicore parameter to the workflow', dest='multicore', default=None)
    parser.add_option('-e', '--era', help='Acquisition era', dest='era')
    parser.add_option("--procstr", dest="procstring", help="Overrides Processing String with a single string")
    parser.add_option('--checksite', default=False,action='store_true')
    (options, args) = parser.parse_args()
    
    if options.testbed:
        url = url_tb

    # parse input workflows and files. If both -w and -f options are used, then only the -w inputs are considered.
    if not options.workflow:
        if args:
            wfs = args
        elif options.file:
            wfs = [l.strip() for l in open(options.file) if l.strip()]
        else:
            parser.error("Input a workflow name or a file to read them")
            sys.exit(0)
    else:
        wfs = options.workflow.split(',')

    #Default values
    era = {}
    procversion = 1
    procstring = {}
    memory = None
    multicore = None
    replica = False
    sites = []
    specialStr = ''
    taskchain = False
    xrootd = False
    secondary_xrootd = False

    SI = siteInfo()
    getRandomDiskSite.T1 = SI.sites_T1s
    # Handling the parameters given in the command line
    # parse site list
    if options.sites:
        if options.sites.lower() == "t1":
            sites = SI.sites_T1s
        elif options.sites.lower() == "t2":
            sites = SI.sites_T2s
        elif options.sites.lower() in ["all","t1+t2","t2+t1"] :
            sites = SI.sites_T2s+SI.sites_T1s
        elif options.sites.lower() == "mcore":
            sites = SI.sites_mcore_ready
        elif hasattr(SI,options.sites):
            sites = getattr(SI,options.sites)
        #elif options.sites.lower() == 'acdc':
        #    sites = []
        else: 
            sites = [site for site in options.sites.split(',')]
    else: 
        sites = SI.sites_T1s + SI.sites_T2s

    if options.replica:
        replica = True

    for wfn in wfs:
        # Getting the original dictionary
        wfi = workflowInfo( url, wfn )
        schema = wfi.request
        if 'OriginalRequestName' in schema:
            print "Original workflow is:",schema['OriginalRequestName']
            original_wf = workflowInfo(url, schema['OriginalRequestName'])            
            ancestor_wf = workflowInfo(url, schema['OriginalRequestName'])
            ## go back up as far as possible
            while ancestor_wf.request['RequestType'] == 'Resubmission':
                if 'OriginalRequestName' not in ancestor_wf.request:
                    ancestor_wf = None
                    break
                ancestor_wf = workflowInfo(url, ancestor_wf.request['OriginalRequestName'])
        else:
            original_wf = None
            ancestor_wf = None

        is_resubmission = (schema['RequestType'] == 'Resubmission')

        if options.sites and options.sites.lower() == 'original' and original_wf:
            sites = original_wf.request['SiteWhitelist']
            print "Using",sorted(sites),"from the original request",original_wf.request['RequestName']

        #print json.dumps( schema, indent=2 )
        wf_name = wfn
        wf_info = schema

        # WF must be in assignment-approved in order to be assigned
        if (schema["RequestStatus"] != "assignment-approved") and not options.test:
            print("The workflow '" + wf_name + "' you are trying to assign is not in assignment-approved")
            sys.exit(1)

        #Check to see if the workflow is a task chain or an ACDC of a taskchain
        taskchain = (schema["RequestType"] == "TaskChain") or (ancestor_wf and ancestor_wf.request["RequestType"] == "TaskChain")

        # Adding the special string - in case it was provided in the command line
        if options.special:
            specialStr = '_' + str(options.special)
            for key, value in procstring.items():
                procstring[key] = value + specialStr

        # Override if a value is given using the procstring command
        if options.procstring:
            procstring = options.procstring
        elif is_resubmission:
            procstring = ancestor_wf.processingString()
        else:
            procstring = wfi.processingString()

        if options.era:
            era = options.era
        elif is_resubmission:
            era = ancestor_wf.acquisitionEra()
        else:
            era = wfi.acquisitionEra()
        #Dealing with era and proc string
        if (not era or not procstring) or (taskchain and (type(era)!=dict or type(procstring)!=dict)):
            print "We do not have a valid AcquisitionEra and ProcessingString"
            sys.exit(1)

        # Must use --lfn option, otherwise workflow won't be assigned
        if options.lfn:
            lfn = options.lfn
        elif "MergedLFNBase" in wf_info:
            lfn = wf_info['MergedLFNBase']
        elif ancestor_wf and "MergedLFNBase" in ancestor_wf.request:
            lfn = ancestor_wf.request['MergedLFNBase']
        else:
            print "Can't assign the workflow! Please include workflow lfn using --lfn option."
            sys.exit(0)
        # activity: production by default for taskchains, reprocessing by default for other workflows
        if options.activity:
            activity = options.activity
        elif taskchain:
            activity = 'production'
        else:
            activity = 'reprocessing'

        if options.memory:
            memory = options.memory

        if options.multicore:
            multicore = options.multicore

        # given or default processing version
        if options.procversion:
            procversion = int(options.procversion)
        else:
            if is_resubmission:
                procversion = ancestor_wf.request['ProcessingVersion']
            else:
                procversion = wf_info["ProcessingVersion"]

        # reading xrootd and secondary_xrootd values
        if options.xrootd is not None:
            xrootd = options.xrootd
        elif original_wf:
            xrootd= original_wf.request["TrustSitelists"]

        if options.secondary_xrootd is not None:
            secondary_xrootd = options.secondary_xrootd
        elif original_wf:
            secondary_xrootd= original_wf.request["TrustPUSitelists"]

        # Check for output dataset existence, and abort if output datasets already exist!
        # Don't perform this check for ACDC's
        datasets = schema["OutputDatasets"]
        i = 0
        if not is_resubmission:
            exist = False
            maxv = 1
            for key, value in schema.items():
                if type(value) is dict and key.startswith("Task"):
                    dbsapi = DbsApi(url=dbs3_url)
                    
                    # list all datasets with same name but different version
                    # numbers
                    datasets = dbsapi.listDatasets(acquisition_era_name=value['AcquisitionEra'], primary_ds_name=value['PrimaryDataset'], detail=True, dataset_access_type='*')
                    processedName = value['AcquisitionEra'] + '-' + value['ProcessingString'] + "-v\\d+"
                    # see if any of the dataset names is a match
                    for ds in datasets:
                        if re.match(processedName, ds['processed_ds_name']):
                            print "Existing dset:", ds['dataset'], "(%s)" % ds['dataset_access_type']
                            maxv = max(maxv, ds['processing_version'])
                            exist = True
                    i += 1
            # suggest max version
            if exist and procversion <= maxv:
                print "Some output datasets exist, its advised to assign with v ==", maxv + 1
                sys.exit(0)
        else:
            ## this is a resubmission !
            print "The taks in resubmission is:",schema['InitialTaskPath']
            ## pick up the sites from acdc
            if options.sites and options.sites.lower() == 'acdc':
                where_to_run, _,_ =  original_wf.getRecoveryInfo()
                task = schema['InitialTaskPath']
                sites = list(set([SI.SE_to_CE(site) for site in where_to_run[task]]) & set(SI.all_sites))
                print "Found",sorted(sites),"as sites where to run the ACDC at, from the acdc doc of ",original_wf.request['RequestName']

        if options.checksite:
            ## check that the sites are all compatible and up
            check_mem = schema['Memory'] if not memory else memory
            ncores = wfi.getMulticore() if not multicore else multicore
            memory_allowed = SI.sitesByMemory( float(check_mem), maxCore=ncores)
            not_ready = sorted(set(sites) & set(SI.sites_not_ready))
            not_existing = sorted(set(sites) - set(SI.all_sites))
            not_matching = sorted((set(sites) - set(memory_allowed) - set(not_ready) - set(not_existing)))
            previously_used = []
            if schema['SiteWhitelist']: previously_used = schema['SiteWhitelist']
            if original_wf: previously_used = original_wf.request['SiteWhitelist']
            if previously_used: not_matching = sorted(set(not_matching) & set(previously_used))
            
            sites = sorted( set(sites) - set(not_matching) - set(not_existing))
            
            print sorted(memory_allowed),"to allow",check_mem,ncores
            if not_ready:
                print not_ready,"is/are not ready"
                sys.exit(0)
            if not_matching:
                print "The memory requirement",check_mem,"is too much for",not_matching
                sys.exit(0)


        ## need to play with memory setting
        if taskchain:
            if memory:
                ## transform into a dictionary
                increase = set_to = None
                tasks, set_to = memory.split(':') if ':' in memory else ("", memory)
                tasks = tasks.split(',') if tasks else []
                if set_to.startswith('+'):
                    increase = int(set_to[1:])
                    set_to = None  ## keep only the relative increase; otherwise the raw "+N" string would be assigned as Memory below
                else:
                    set_to = int(set_to)
                it = 1
                memory_dict = {}
                while True:
                    t = 'Task%d'%it
                    it += 1
                    if t in schema:
                        tname = schema[t]['TaskName']
                        if tasks and not tname in tasks:
                            print tname,"not concerned"
                            memory_dict[tname] = schema[t]['Memory']
                            continue
                        if set_to:
                            memory_dict[tname] = set_to
                        else:
                            memory_dict[tname] = schema[t]['Memory'] + increase
                    else:
                        break
                memory = memory_dict
                print memory_dict
            ## need to play with multicore setting
            if multicore:
                tasks,set_to = multicore.split(':') if ':' in multicore else ("",multicore)
                tasks = tasks.split(',') if tasks else []
                set_to = int(set_to)
                multicore_dict = {}
                timeperevent_dict = {}
                it=1
                while True:
                    t = 'Task%d'%it
                    it += 1
                    if t in schema:
                        tname = schema[t]['TaskName']
                        mcore = schema[t]['Multicore']
                        if tasks and not tname in tasks:
                            print tname,"not concerned"
                            multicore_dict[tname] = schema[t]['Multicore']
                            timeperevent_dict[tname] = schema[t]['TimePerEvent']
                            continue
                        mem = memory[tname]
                        factor = (set_to / float(mcore))
                        fraction_constant = 0.4
                        mem_per_core_c = int((1-fraction_constant) * mem / float(mcore))
                        print "mem per core", mem_per_core_c
                        print "base mem", mem
                        ## need to adjust the memory at the same time
                        ## will crash if --mem was not set as an argument: FINE
                        memory[tname] = mem + (set_to-mcore)*mem_per_core_c
                        print "final mem",memory[tname]
                        timeperevent_dict[tname] = schema[t]['TimePerEvent']/factor
                        print "setting mcore",set_to
                        multicore_dict[tname] = set_to
                    else:
                        break
                multicore = multicore_dict
                print multicore
                print timeperevent_dict,"cannot be used yet."
        # If the --test argument was provided, then just print the information
        # gathered so far and abort the assignment
        print wf_name
        print "Era:",era
        print "ProcStr:",procstring
        print "ProcVer:",procversion
        print "LFN:",lfn
        print "Team:",options.team
        print "Site:",sites
        print "Taskchain? ", str(taskchain)
        print "Activity:", activity
        print "ACDC:", str(is_resubmission)
        print "Xrootd:", str(xrootd)
        print "Secondary_xrootd:", str(secondary_xrootd)
        #if options.test:            continue
        
        # Really assigning the workflow now
        #print wf_name, '\tEra:', era, '\tProcStr:', procstring, '\tProcVer:', procversion, '\tTeam:', team, '\tSite:', sites
        assignRequest(url, 
                      workflow = wf_name,
                      team = options.team,
                      sites = sites,
                      era = era, 
                      procversion = procversion,
                      activity = activity,
                      lfn = lfn,
                      procstring = procstring, 
                      trust_site = xrootd, 
                      replica = options.replica, 
                      verbose = options.test, 
                      taskchain = taskchain, 
                      trust_secondary_site = secondary_xrootd,
                      memory=memory,
                      multicore=multicore,
                      lumisperjob = options.lumisperjob,
                      maxmergeevents = options.maxmergeevents
                      )
    
    sys.exit(0)
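The TaskChain memory handling in the middle of main() accepts a flat value (-m 4000), a per-task restriction (-m Task1,Task2:4000), or a relative bump (-m +500 or -m Task2:+500). Here is that parsing reduced to a standalone sketch against a mock request schema; parse_memory_option is a hypothetical helper, not part of assign.py:

def parse_memory_option(memory, schema):
    increase = set_to = None
    tasks, set_to = memory.split(':') if ':' in memory else ("", memory)
    tasks = tasks.split(',') if tasks else []
    if set_to.startswith('+'):
        increase = int(set_to[1:])
        set_to = None
    else:
        set_to = int(set_to)
    memory_dict = {}
    it = 1
    while ('Task%d' % it) in schema:
        task = schema['Task%d' % it]
        it += 1
        tname = task['TaskName']
        if tasks and tname not in tasks:
            ## task not targeted: keep its current memory
            memory_dict[tname] = task['Memory']
        elif set_to:
            memory_dict[tname] = set_to
        else:
            memory_dict[tname] = task['Memory'] + increase
    return memory_dict

schema = {'Task1': {'TaskName': 'GS', 'Memory': 2300},
          'Task2': {'TaskName': 'DR', 'Memory': 4000}}
print parse_memory_option('DR:+500', schema)  # {'GS': 2300, 'DR': 4500}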
Example #4
def stagor(url, specific=None, options=None):

    if not componentInfo().check(): return
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()

    TS = transferStatuses()
    cached_transfer_statuses = TS.content()
    transfer_statuses = {}

    done_by_wf_id = {}
    done_by_input = {}
    completion_by_input = {}
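    ## transfers count as done only at 100% per-node completion (see the >= good_enough checks below)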
    good_enough = 100.0

    lost_blocks = json.loads(
        eosRead('%s/lost_blocks_datasets.json' % monitor_dir))
    lost_files = json.loads(
        eosRead('%s/lost_files_datasets.json' % monitor_dir))
    known_lost_blocks = {}
    known_lost_files = {}
    for dataset in set(lost_blocks.keys() + lost_files.keys()):
        b, f = findLostBlocksFiles(url, dataset)
        if dataset in lost_blocks and not b:
            print dataset, "has no really lost blocks"
        else:
            known_lost_blocks[dataset] = [i['name'] for i in b]

        if dataset in lost_files and not f:
            print dataset, "has no really lost files"
        else:
            known_lost_files[dataset] = [i['name'] for i in f]

    def time_point(label="", sub_lap=False):
        now = time.mktime(time.gmtime())
        nows = time.asctime(time.gmtime())

        print "Time check (%s) point at : %s" % (label, nows)
        print "Since start: %s [s]" % (now - time_point.start)
        if sub_lap:
            print "Sub Lap : %s [s]" % (now - time_point.sub_lap)
            time_point.sub_lap = now
        else:
            print "Lap : %s [s]" % (now - time_point.lap)
            time_point.lap = now
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(
        time.gmtime())

    time_point("Check cached transfer")

    ## collect all datasets that are needed for wf in staging, correcting the status of those that are not really in staging
    wfois = []
    needs = defaultdict(list)
    needs_by_priority = defaultdict(list)
    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfi = workflowInfo(url, wfo.name)
        if wfi.request['RequestStatus'] in [
                'running-open', 'running-closed', 'completed', 'assigned',
                'acquired'
        ]:
            wfi.sendLog('stagor',
                        "is in status %s" % wfi.request['RequestStatus'])
            wfo.status = 'away'
            session.commit()
            continue
        if not wfi.request['RequestStatus'] in ['assignment-approved']:
            ## should be setting 'away' too
            ## that usually happens for relvals
            if wfi.request['RequestStatus'] in [
                    'rejected', 'aborted', 'aborted-completed',
                    'aborted-archived', 'rejected-archived'
            ] and wfi.isRelval():
                wfo.status = 'forget'
                session.commit()
                continue
            else:
                print wfo.name, "is", wfi.request['RequestStatus']
                #sendEmail("wrong status in staging. debug","%s is in %s, should set away."%(wfo.name,wfi.request['RequestStatus']))
                sendLog("stagor",
                        "%s is in %s, set away" %
                        (wfo.name, wfi.request['RequestStatus']),
                        level='critical')
                wfo.status = 'away'
                session.commit()
                continue

        wfois.append((wfo, wfi))
        _, primaries, _, secondaries = wfi.getIO()
        for dataset in list(primaries) + list(secondaries):
            needs[wfo.name].append(dataset)
            done_by_input[dataset] = {}
            completion_by_input[dataset] = {}
            needs_by_priority[wfi.request['RequestPriority']].append(dataset)
            wfi.sendLog('stagor', '%s needs %s' % (wfo.name, dataset))

    time_point("Check staging workflows")

    open('%s/dataset_requirements.json' % monitor_dir,
         'w').write(json.dumps(needs, indent=2))
    for prio in needs_by_priority:
        needs_by_priority[prio] = list(set(needs_by_priority[prio]))
    open('%s/dataset_priorities.json' % monitor_dir,
         'w').write(json.dumps(needs_by_priority, indent=2))

    dataset_endpoints = defaultdict(set)
    endpoint_in_downtime = defaultdict(set)
    #endpoint_completed = defaultdict(set)
    endpoint_incompleted = defaultdict(set)
    #endpoint = defaultdict(set)
    send_back_to_considered = set()

    ## first check if anything is inactive
    all_actives = set([
        transfer.phedexid for transfer in session.query(TransferImp).filter(
            TransferImp.active).all()
    ])
    for active_phedexid in all_actives:
        skip = True
        transfers_phedexid = session.query(TransferImp).filter(
            TransferImp.phedexid == active_phedexid).all()
        for imp in transfers_phedexid:
            if imp.workflow.status == 'staging':
                skip = False
                sendLog(
                    'stagor', "\t%s is staging for %s" %
                    (imp.phedexid, imp.workflow.name))
        if skip:
            sendLog('stagor', "setting %s inactive" % active_phedexid)
            for imp in transfers_phedexid:
                imp.active = False
        session.commit()

    all_actives = sorted(
        set([
            transfer.phedexid for transfer in session.query(
                TransferImp).filter(TransferImp.active).all()
        ]))
    for phedexid in all_actives:

        if specific: continue

        ## check on transfer completion
        not_cached = False
        if phedexid in cached_transfer_statuses:
            ### use a cache for transfer that already looked done
            sendLog('stagor', "read %s from cache" % phedexid)
            checks = cached_transfer_statuses[phedexid]
        else:
            not_cached = True
            ## I would actually like to avoid this as much as I can
            sendLog('stagor',
                    'Performing spurious transfer check on %s' % phedexid,
                    level='critical')
            checks = checkTransferStatus(url, phedexid, nocollapse=True)
            try:
                print json.dumps(checks, indent=2)
            except:
                print checks

            if not checks:
                ## this is going to bias quite heavily the rest of the code. we should abort here
                #sendLog('stagor','Ending stagor because of skewed input from checkTransferStatus', level='critical')
                #return False
                sendLog(
                    'stagor',
                    'Stagor has got a skewed input from checkTransferStatus',
                    level='critical')
                checks = {}
                pass
            else:
                TS.add(phedexid, checks)

        time_point("Check transfer status %s" % phedexid, sub_lap=True)

        if not specific:
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname] = {}
                if not dsname in completion_by_input:
                    completion_by_input[dsname] = {}
                done_by_input[dsname][phedexid] = all(
                    map(lambda i: i >= good_enough, checks[dsname].values()))
                completion_by_input[dsname][phedexid] = checks[dsname].values()
        if checks:
            sendLog(
                'stagor', "Checks for %s are %s" %
                (phedexid, [node.values() for node in checks.values()]))
            done = all(
                map(
                    lambda i: i >= good_enough,
                    list(
                        itertools.chain.from_iterable(
                            [node.values() for node in checks.values()]))))
        else:
            ## it is empty, is that a sign that all is done and away ?
            if not_cached:
                print "Transfer status was not cached"
            else:
                print "ERROR with the scubscriptions API of ", phedexid
                print "Most likely something else is overiding the transfer request. Need to work on finding the replacement automatically, if the replacement exists"
            done = False

        transfers_phedexid = session.query(TransferImp).filter(
            TransferImp.phedexid == phedexid).all()
        for imp in transfers_phedexid:
            tr_wf = imp.workflow
            if tr_wf:  # and tr_wf.status == 'staging':
                if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id] = {}
                done_by_wf_id[tr_wf.id][phedexid] = done
            if done:
                imp.active = False
                session.commit()

        for ds in checks:
            for s, v in checks[ds].items():
                dataset_endpoints[ds].add(s)

        if done:
            sendLog('stagor', "%s is done" % phedexid)
            TS.add(phedexid, checks)
        else:
            sendLog(
                'stagor',
                "%s is not finished %s" % (phedexid, pprint.pformat(checks)))
            ##pprint.pprint( checks )
            ## check if the destination is in down-time
            for ds in checks:
                sites_incomplete = [
                    SI.SE_to_CE(s) for s, v in checks[ds].items()
                    if v < good_enough
                ]
                sites_incomplete_down = [
                    s for s in sites_incomplete if not s in SI.sites_ready
                ]
                ## no space means no transfer should go there : NO, it does not work in the long run
                #sites_incomplete_down = [SI.SE_to_CE(s) for s,v in checks[ds].items() if (v<good_enough and (SI.disk[s]==0 or (not SI.SE_to_CE(s) in SI.sites_ready)))]

                if sites_incomplete_down:
                    sendLog(
                        'stagor',
                        "%s are in downtime, while waiting for %s to get there"
                        % (",".join(sites_incomplete_down), ds))
                    endpoint_in_downtime[ds].update(sites_incomplete_down)
                if sites_incomplete:
                    endpoint_incompleted[ds].update(sites_incomplete)

    time_point("Check on-going transfers")

    print "End points"
    for k in dataset_endpoints:
        dataset_endpoints[k] = list(dataset_endpoints[k])
    print json.dumps(dataset_endpoints, indent=2)

    print "End point in down time"
    for k in endpoint_in_downtime:
        endpoint_in_downtime[k] = list(endpoint_in_downtime[k])
    print json.dumps(endpoint_in_downtime, indent=2)

    print "End point incomplete in down time"
    for k in endpoint_incompleted:
        endpoint_incompleted[k] = list(endpoint_incompleted[k])
    print json.dumps(endpoint_incompleted, indent=2)

    #open('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( transfer_statuses, indent=2))
    eosFile('%s/transfer_statuses.json' % monitor_dir,
            'w').write(json.dumps(TS.content(), indent=2)).close()
    eosFile('%s/dataset_endpoints.json' % monitor_dir,
            'w').write(json.dumps(dataset_endpoints, indent=2)).close()

    already_stuck = json.loads(
        eosRead('%s/stuck_transfers.json' % monitor_pub_dir)).keys()
    already_stuck.extend(getAllStuckDataset())

    missing_in_action = defaultdict(list)

    print "-" * 10, "Checking on workflows in staging", "-" * 10
    #forget_about = ['/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM']
    #for what in forget_about:
    #    if not done_by_input[what]:
    #        done_by_input[what] = {'fake':True}

    ## come back to workflows and check if they can go
    available_cache = defaultdict(lambda: defaultdict(float))
    presence_cache = defaultdict(dict)

    time_point("Preparing for more")
    for wfo, wfi in wfois:
        print "#" * 30
        time_point("Forward checking %s" % wfo.name, sub_lap=True)
        ## the site white list takes site, campaign, memory and core information
        (_, primaries, _, secondaries,
         sites_allowed) = wfi.getSiteWhiteList(verbose=False)
        se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
        se_allowed.sort()
        se_allowed_key = ','.join(se_allowed)
        readys = {}
        for need in list(primaries) + list(secondaries):
            if not need in done_by_input:
                wfi.sendLog('stagor', "missing transfer report for %s" % need)
                readys[need] = False
                ## should warn someone about this !!!
                ## it cannot happen, by construction
                sendEmail('missing transfer report',
                          '%s does not have a transfer report' % (need))
                continue

            if not done_by_input[need] and need in list(secondaries):
                wfi.sendLog(
                    'stagor',
                    "assuming it is OK for secondary %s to have no attached transfers"
                    % need)
                readys[need] = True
                done_by_input[need] = {"fake": True}
                continue

            if len(done_by_input[need]) and all(done_by_input[need].values()):
                wfi.sendLog('stagor', "%s is ready" % need)
                print json.dumps(done_by_input[need], indent=2)
                readys[need] = True
            else:
                wfi.sendLog(
                    'stagor', "%s is not ready \n%s" %
                    (need, json.dumps(done_by_input[need], indent=2)))
                readys[need] = False

        if readys and all(readys.values()):
            if wfo.status == 'staging':
                wfi.sendLog('stagor',
                            "all needs are fullfilled, setting staged")
                wfo.status = 'staged'
                session.commit()
            else:
                wfi.sendLog('stagor', "all needs are fullfilled, already")
                print json.dumps(readys, indent=2)
        else:
            wfi.sendLog('stagor', "missing requirements")
            copies_needed, _ = wfi.getNCopies()
            jump_ahead = False
            re_transfer = False
            ## there is missing input let's do something more elaborated
            for need in list(primaries):  #+list(secondaries):
                if endpoint_in_downtime[need] and endpoint_in_downtime[
                        need] == endpoint_incompleted[need]:
                    #print need,"is going to an end point in downtime"
                    wfi.sendLog(
                        'stagor',
                        "%s has only incomplete endpoint in downtime\n%s" %
                        (need, endpoint_in_downtime[need]))
                    re_transfer = True

                if not se_allowed_key in available_cache[need]:
                    available_cache[need][
                        se_allowed_key] = getDatasetBlocksFraction(
                            url, need, sites=se_allowed)
                    if available_cache[need][se_allowed_key] >= copies_needed:
                        wfi.sendLog(
                            'stagor',
                            "assuming it is OK to move on like this already for %s"
                            % need)
                        jump_ahead = True
                    else:
                        wfi.sendLog(
                            'stagor', "Available %s times" %
                            available_cache[need][se_allowed_key])
                        missing_and_downtime = list(
                            set(endpoint_in_downtime[need])
                            & set(endpoint_incompleted[need]))
                        if missing_and_downtime:
                            wfi.sendLog(
                                'stagor',
                                "%s is incomplete at %s which is in downtime, trying to move along"
                                % (need, ','.join(missing_and_downtime)))
                            jump_ahead = True
                        else:
                            wfi.sendLog(
                                'stagor',
                                "continue waiting for transfers for optimum production performance."
                            )

            ## compute a time since staging to filter jump starting ?
            # check whether the inputs is already in the stuck list ...
            for need in list(primaries) + list(secondaries):
                if need in already_stuck:
                    wfi.sendLog('stagor',
                                "%s is stuck, so try to jump ahead" % need)
                    jump_ahead = True

            if jump_ahead or re_transfer:
                details_text = "checking on availability for %s to jump ahead" % wfo.name
                details_text += '\n%s wants %s copies' % (wfo.name,
                                                          copies_needed)
                copies_needed = max(1, copies_needed - 1)
                details_text += '\nlowering by one unit to %s' % copies_needed
                wfi.sendLog('stagor', details_text)
                all_check = True

                prim_where = set()
                for need in list(primaries):
                    if not se_allowed_key in presence_cache[need]:
                        presence_cache[need][
                            se_allowed_key] = getDatasetPresence(
                                url, need, within_sites=se_allowed)
                    presence = presence_cache[need][se_allowed_key]
                    prim_where.update(presence.keys())
                    available = available_cache[need][se_allowed_key]
                    this_check = (available >= copies_needed)
                    wfi.sendLog(
                        'stagor', "%s is available %s times (%s), at %s" %
                        (need, available, this_check, se_allowed_key))
                    all_check &= this_check
                    if not all_check: break

                for need in list(secondaries):
                    ## I do not want to check on the secondary
                    ## this below does not function because the primary could be all available, and the secondary not complete at a certain site that does not matter at that point
                    this_check = all(done_by_input[need].values())
                    wfi.sendLog(
                        'stagor', "%s is this much transfered %s" %
                        (need, json.dumps(done_by_input[need], indent=2)))
                    all_check &= this_check
                    #if not se_allowed_key in presence_cache[need]:
                    #    presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed)

                    ## restrict to where the primary is
                    #presence = dict([(k,v) for (k,v) in presence_cache[need][se_allowed_key].items() if k in prim_where])
                    #this_check = all([there for (there,frac) in presence.values()])
                    #print need,"is present at all sites:",this_check
                    #all_check&= this_check

                if all_check and not re_transfer:
                    wfi.sendLog(
                        'stagor',
                        "needs are sufficiently fullfilled, setting staged")
                    wfo.status = 'staged'
                    session.commit()
                else:
                    print wfo.name, "has to wait a bit more"
                    wfi.sendLog('stagor', "needs to wait a bit more")
            else:
                wfi.sendLog('stagor', "not checking availability")

            if re_transfer:
                wfi.sendLog(
                    'stagor',
                    "Sending back to considered because of endpoint in downtime"
                )
                if wfo.status == 'staging':
                    wfo.status = 'considered'
                    session.commit()
                    send_back_to_considered.add(wfo.name)

    time_point("Checked affected workflows")

    if send_back_to_considered:
        #sendEmail("transfer to endpoint in downtime","sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)))
        sendLog('stagor',
                "sending back to considered the following workflows \n%s" %
                ('\n'.join(send_back_to_considered)),
                level='critical')

    print "-" * 10, "Checking on non-available datasets", "-" * 10
    ## now check on those that are not fully available

    for dsname in available_cache.keys():
        ## squash the se_allowed_key key
        available_cache[dsname] = min(available_cache[dsname].values())

    really_stuck_dataset = set()

    for dsname, available in available_cache.items():
        using_its = getWorkflowByInput(url, dsname)
        #print using_its
        using_wfos = []
        for using_it in using_its:
            wf = session.query(Workflow).filter(
                Workflow.name == using_it).first()
            if wf:
                using_wfos.append(wf)

        if not len(done_by_input[dsname]):
            print "For dataset", dsname, "there are no transfer report. That's an issue."
            for wf in using_wfos:
                if wf.status == 'staging':
                    if UC.get("stagor_sends_back"):
                        print "sending", wf.name, "back to considered"
                        wf.status = 'considered'
                        session.commit()
                        #sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name)
                        sendLog('stagor',
                                "%s was sent back and might be trouble" %
                                wf.name,
                                level='critical')
                    else:
                        print "would send", wf.name, "back to considered"
                        #sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name)
                        sendLog(
                            'stagor',
                            "subscriptions to get %s running are not appearing in phedex. I would have sent it back to considered but that's not good."
                            % wf.name,
                            level='critical')
            continue

        ## not compatible with checking on secondary availability
        #if all([wf.status != 'staging' for wf in using_wfos]):
        #    ## means despite all checks that input is not needed
        #    continue

        if available < 1.:
            print "incomplete", dsname
            ## there is a problem in the method below that it does not account for files stuck in T1*Buffer only
            lost_blocks, lost_files = findLostBlocksFiles(
                url, dsname) if (not dsname.endswith('/RAW')) else ([], [])
            lost_block_names = [item['name'] for item in lost_blocks]
            lost_file_names = [item['name'] for item in lost_files]

            if lost_blocks:
                #print json.dumps( lost , indent=2 )
                ## estimate for how much !
                fraction_loss, _, n_missing = getDatasetBlockFraction(
                    dsname, lost_block_names)
                print "We have lost", len(
                    lost_block_names
                ), "blocks", lost_block_names, "for %f%%" % (100. *
                                                             fraction_loss)
                if fraction_loss > 0.05:  ## 95% completion mark
                    #sendEmail('we have lost too many blocks','%s is missing %d blocks, for %d events, %f %% loss'%(dsname, len(lost_block_names), n_missing, fraction_loss))
                    sendLog(
                        'stagor',
                        '%s is missing %d blocks, for %d events, %3.2f %% loss'
                        % (dsname, len(lost_block_names), n_missing,
                           100 * fraction_loss),
                        level='critical')
                    ## the workflow should be rejected !
                    for wf in using_wfos:
                        if wf.status == 'staging':
                            print wf.name, "is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                            sendLog(
                                'stagor',
                                '%s has too much loss on the input dataset %s. Missing  %d blocks, for %d events, %3.2f %% loss'
                                % (wf.name, dsname, len(lost_block_names),
                                   n_missing, 100 * fraction_loss),
                                level='critical')
                else:
                    ## probably enough to make a ggus and remove
                    if not dsname in known_lost_blocks:
                        #sendEmail('we have lost a few blocks', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ))
                        sendLog(
                            'stagor',
                            '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'
                            % (dsname, len(lost_block_names), n_missing,
                               fraction_loss, '\n'.join(lost_block_names)),
                            level='critical')
                        known_lost_blocks[dsname] = [
                            i['name'] for i in lost_blocks
                        ]
                really_stuck_dataset.add(dsname)

            if lost_files:
                fraction_loss, _, n_missing = getDatasetFileFraction(
                    dsname, lost_file_names)
                print "We have lost", len(
                    lost_file_names
                ), "files", lost_file_names, "for %f%%" % fraction_loss

                if fraction_loss > 0.05:
                    #sendEmail('we have lost too many files','%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss))
                    sendLog(
                        'stagor',
                        '%s is missing %d files, for %d events, %f %% loss' %
                        (dsname, len(lost_file_names), n_missing,
                         fraction_loss),
                        level='critical')
                    for wf in using_wfos:
                        if wf.status == 'staging':
                            print wf.name, "is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                else:
                    ## probably enough to make a ggus and remove
                    if not dsname in known_lost_files:
                        #sendEmail('we have lost a few files','%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)))
                        sendLog(
                            'stagor',
                            '%s is missing %d files, for %d events, %f %% loss\n\n%s'
                            % (dsname, len(lost_file_names), n_missing,
                               fraction_loss, '\n'.join(lost_file_names)),
                            level='critical')
                        known_lost_files[dsname] = [
                            i['name'] for i in lost_files
                        ]

                ## should the status be changed to held-staging, pending a ticket

            missings = [
                pid for (pid, d) in done_by_input[dsname].items() if not d
            ]
            print "\t", done_by_input[dsname]
            print "\tneeds", len(done_by_input[dsname])
            print "\tgot", done_by_input[dsname].values().count(True)
            print "\tmissing", missings
            missing_in_action[dsname].extend(missings)

    rr = eosFile('%s/lost_blocks_datasets.json' % monitor_dir, 'w')
    rr.write(json.dumps(known_lost_blocks, indent=2))
    rr.close()

    rr = eosFile('%s/lost_files_datasets.json' % monitor_dir, 'w')
    rr.write(json.dumps(known_lost_files, indent=2))
    rr.close()

    eosFile('%s/incomplete_transfers.json' % monitor_dir,
            'w').write(json.dumps(missing_in_action, indent=2)).close()
    print "Stuck transfers and datasets"
    print json.dumps(missing_in_action, indent=2)

    TD = transferDataset()
    datasets_by_phid = defaultdict(set)
    for dataset in missing_in_action:
        for phid in missing_in_action[dataset]:
            #print dataset,"stuck through",phid
            datasets_by_phid[phid].add(dataset)

    for k in datasets_by_phid:
        #datasets_by_phid[k] = list(datasets_by_phid[k])
        TD.add(k, list(datasets_by_phid[k]))

    #eosFile('%s/datasets_by_phid.json'%base_eos_dir,'w').write( json.dumps(datasets_by_phid, indent=2 )).close()

    eosFile('%s/really_stuck_dataset.json' % base_eos_dir,
            'w').write(json.dumps(list(really_stuck_dataset),
                                  indent=2)).close()
    print '\n' * 2, "Datasets really stuck"
    print '\n'.join(really_stuck_dataset)

    #############
    ## not going further for what matters
    #############
    return
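A small idiom worth noting in stagor: the nested time_point() helper keeps its lap timestamps as attributes on the function object itself instead of in globals. Reduced to its essentials, as a sketch that drops the sub-lap bookkeeping:

import time

def time_point(label=""):
    now = time.mktime(time.gmtime())
    print "Time check (%s): %.0f [s] since start, %.0f [s] since last lap" % (
        label, now - time_point.start, now - time_point.lap)
    time_point.lap = now

time_point.start = time_point.lap = time.mktime(time.gmtime())
time_point("begin")
time_point("after some work")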
Example #5
def completor(url, specific):
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']
    if use_mcm:
        mcm = McMClient(dev=False)

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()

    wfs = []
    wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
    wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() )

    ## just take it in random order so that not always the same is seen
    random.shuffle( wfs )

    max_per_round = UC.get('max_per_round').get('completor',None)
    if max_per_round and not specific: wfs = wfs[:max_per_round]

    ## by workflow a list of fraction / timestamps
    completions = json.loads( open('%s/completions.json'%monitor_dir).read())
    
    good_fractions = {}
    timeout = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']
        if 'force-timeout' in CI.campaigns[c]:
            timeout[c] = CI.campaigns[c]['force-timeout']

    long_lasting = {}

    overrides = getForceCompletes()
    if use_mcm:    
        ## add all workflow that mcm wants to get force completed
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        ## assuming this will be a list of actual prepids
        overrides['mcm'] = mcm_force

    print "can force complete on"
    print json.dumps( good_fractions ,indent=2)
    print json.dumps( overrides, indent=2)
    max_force = UC.get("max_force_complete")
    
    #wfs_no_location_in_GQ = set()
    #block_locations = defaultdict(lambda : defaultdict(list))
    #wfs_no_location_in_GQ = defaultdict(list)

    set_force_complete = set()


    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at",wfo.name
        ## get all of the same
        wfi = workflowInfo(url, wfo.name)
        pids = wfi.getPrepIDs()
        skip=False
        if not any([c in wfo.name for c in good_fractions]): skip=True
        for user,spec in overrides.items():

            if wfi.request['RequestStatus']!='force-complete':
                if any(s in wfo.name for s in spec) or (wfo.name in spec) or any(pid in spec for pid in pids) or any(s in pids for s in spec):
                    sendEmail('force-complete requested','%s is asking for %s to be force complete'%(user,wfo.name))
                    wfi = workflowInfo(url, wfo.name)
                    forceComplete(url , wfi )
                    skip=True
                    wfi.notifyRequestor("The workflow %s was force completed by request of %s"%(wfo.name,user), do_batch=False)
                    wfi.sendLog('completor','%s is asking for %s to be force complete'%(user,wfo.name))
                    break
    
        if wfo.status.startswith('assistance'): skip = True

        if skip: 
            continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        if not wfi.request['RequestStatus'] in ['acquired','running-open','running-closed']: continue

        c = wfi.request['Campaign']
        if not c in good_fractions: continue
        good_fraction = good_fractions[c]
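        ## completion can exceed 100% of the request, so 2.0 (200%) marks "surely done, just wait" below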
        ignore_fraction = 2.
        
        lumi_expected = None
        event_expected = None
        if not 'TotalInputEvents' in wfi.request: 
            if 'RequestNumEvents' in wfi.request:
                event_expected = wfi.request['RequestNumEvents']
            else:
                print "truncated, cannot do anything"
                continue
        else:
            lumi_expected = wfi.request['TotalInputLumis']
            event_expected = wfi.request['TotalInputEvents']

        now = time.mktime(time.gmtime()) / (60*60*24.)

        running_log = filter(lambda change : change["Status"] in ["running-open","running-closed"],wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            # cannot figure out when the thing started running
            continue
        then = running_log[-1]['UpdateTime'] / (60.*60.*24.)
        delay = now - then ## in days

        (w,d) = divmod(delay, 7 )
        print "\t"*int(w)+"Running since",delay,"[days] priority=",priority

        monitor_delay = 7
        allowed_delay = 14
        if c in timeout:
            allowed_delay = timeout[c]
            
        monitor_delay = min(monitor_delay, allowed_delay)
        ### just skip if too early
        if delay <= monitor_delay: continue

        long_lasting[wfo.name] = { "delay" : delay }

        percent_completions = {}
        for output in wfi.request['OutputDatasets']:
            if "/DQM" in output: continue ## that does not count
            if not output in completions: completions[output] = { 'injected' : None, 'checkpoints' : [], 'workflow' : wfo.name}
            ## get completion fraction
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            lumi_completion=0.
            event_completion=0.
            if lumi_expected:
                lumi_completion = lumi_count / float( lumi_expected )
            if event_expected:
                event_completion = event_count / float( event_expected )

            #take the less optimistic
            percent_completions[output] = min( lumi_completion, event_completion )
            completions[output]['checkpoints'].append( (now, event_completion ) )

        if all([percent_completions[out] >= good_fraction for out in percent_completions]):
            wfi.sendLog('completor', "all is above %s \n%s"%( good_fraction, 
                                                              json.dumps( percent_completions, indent=2 )
                                                              ))
        else:
            long_lasting[wfo.name].update({
                    'completion': sum(percent_completions.values()) / len(percent_completions),
                    'completions' : percent_completions
                    })
            
            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            wfi.sendLog('completor', "%s not over bound %s\n%s"%(percent_completions.values(), good_fraction,
                                                                 json.dumps( long_lasting[wfo.name]['agents'], indent=2) ))
            continue

        if all([percent_completions[out] >= ignore_fraction for out in percent_completions]):
            print "all is done, just wait a bit"
            continue

        for output in  percent_completions:
            completions[output]['injected'] = then

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')

        ran_at = wfi.request['SiteWhitelist']
                        
        wfi.sendLog('completor',"Required %s, time spend %s"%( cpuh, delay))
                    
        ##### WILL FORCE COMPLETE BELOW
        # only really force complete after n days

        if delay <= allowed_delay: continue
        ## find ACDCs that might be running
        if max_force>0:
            forceComplete(url, wfi )
            set_force_complete.add( wfo.name )
            print "going for force-complete of",wfo.name
            wfi.sendLog('completor','going for force completing')
            wfi.notifyRequestor("The workflow %s was force completed for running too long"% wfo.name)
            max_force -=1
        else:
            wfi.sendLog('completor',"too many completion this round, cannot force complete")

        ## do it once only for testing
        #break
    
    if set_force_complete:
        sendLog('completor','The following were set force-complete \n%s'%('\n'.join(set_force_complete)))
        #sendEmail('set force-complete', 'The following were set force-complete \n%s'%('\n'.join(set_force_complete)))
    
    open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2))
    text="These have been running for long"
    
    open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 ))

    for wf,info in sorted(long_lasting.items(), key=lambda tp:tp[1]['delay'], reverse=True):
        delay = info['delay']
        text += "\n %s : %s days"% (wf, delay)
        if 'completion' in info:
            text += " %d%%"%( info['completion']*100 )

    #if wfs_no_location_in_GQ:
    #    sendEmail('workflow with no location in GQ',"there won't be able to run anytime soon\n%s"%( '\n'.join(wfs_no_location_in_GQ)))

    #sendEmail("long lasting workflow",text)
    ## you can check the log
    print text
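
A minimal, self-contained sketch of the "days since running" computation used
above, assuming only that 'RequestTransition' is a list of dicts with 'Status'
and an epoch-seconds 'UpdateTime' (field names from the snippet; the helper
itself is hypothetical):

def days_in_running(request, now=None):
    ## days since the last transition into a running state, or None
    import time
    if now is None:
        now = time.mktime(time.gmtime())
    running = [t for t in request.get('RequestTransition', [])
               if t.get('Status') in ('running-open', 'running-closed')]
    if not running:
        return None
    return (now - running[-1]['UpdateTime']) / (60. * 60. * 24.)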
Ejemplo n.º 8
0
def recoveror(url, specific, options=None):
    if userLock('recoveror'): return

    up = componentInfo(soft=['mcm', 'wtc'])
    if not up.check(): return

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    use_recoveror = UC.get('use_recoveror')

    if not use_recoveror and not options.go:
        print "We are told not to run recoveror"
        return

    def make_int_keys(d):
        ## JSON configuration keys come back as strings; convert them to ints.
        ## Iterate over a copy of the keys to avoid mutating the dict while
        ## iterating over it.
        for code in list(d.keys()):
            d[int(code)] = d.pop(code)

    error_codes_to_recover = UC.get('error_codes_to_recover')
    error_codes_to_block = UC.get('error_codes_to_block')
    error_codes_to_notify = UC.get('error_codes_to_notify')
    make_int_keys(error_codes_to_recover)
    make_int_keys(error_codes_to_block)
    make_int_keys(error_codes_to_notify)

    #wfs = session.query(Workflow).filter(Workflow.status == 'assistance-recovery').all()
    wfs = session.query(Workflow).filter(
        Workflow.status.contains('recovery')).all()
    if specific:
        wfs.extend(
            session.query(Workflow).filter(
                Workflow.status == 'assistance-manual').all())

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        if not specific and 'manual' in wfo.status: continue

        wfi = workflowInfo(url, wfo.name)

        ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves

        all_errors = {}
        try:
            ## this is clearly very truncated and should be changed completely
            wfi.getSummary()
            all_errors = wfi.summary['errors']
        except:
            pass

        print '-' * 100
        print "Looking at", wfo.name, "for recovery options"

        recover = True

        if 'MergedLFNBase' not in wfi.request:
            print "malformed request: missing MergedLFNBase"
            sendEmail('missing lfn', '%s workload cache is missing MergedLFNBase' % wfo.name)
            recover = False

        if not len(all_errors):
            print "\tno error for", wfo.name
            recover = False

        task_to_recover = defaultdict(list)
        message_to_ops = ""
        message_to_user = ""

        if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']:
            ## we do not try to recover pLHE
            recover = False

        if wfi.request['RequestType'] in ['MonteCarlo', 'ReReco']:
            recover = False

        if 'Campaign' in wfi.request:
            c = wfi.request['Campaign']
            if c in CI.campaigns and 'recover' in CI.campaigns[c]:
                recover = CI.campaigns[c]['recover']

        for task, errors in all_errors.items():
            print "\tTask", task
            ## collect all error codes and #jobs regardless of the step at which they occurred
            all_codes = []
            for name, codes in errors.items():
                if type(codes) == int: continue
                all_codes.extend([
                    (int(code), info['jobs'], name,
                     list(set([e['type'] for e in info['errors']])),
                     list(set([e['details'] for e in info['errors']])))
                    for code, info in codes.items()
                ])

            all_codes.sort(key=lambda i: i[1], reverse=True)
            sum_failed = sum([l[1] for l in all_codes])
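            ## each all_codes entry is a tuple of
            ##   (error code, #failed jobs, step name, error types, error details),
            ## sorted by job count so the dominant failure mode comes first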

            for errorCode, njobs, name, types, details in all_codes:
                rate = 100 * njobs / float(sum_failed)
                #print ("\t\t %10d (%6s%%) failures with error code %10d (%"+str(max_legend)+"s) at stage %s")%(njobs, "%4.2f"%rate, errorCode, legend, name)
                print "\t\t %10d (%6s%%) failures with error code %10d (%30s) at stage %s" % (
                    njobs, "%4.2f" % rate, errorCode, ','.join(types), name)

                added_in_recover = False

                #if options.go:
                # force the recovery of any task with error ?

                if errorCode in error_codes_to_recover:
                    ## the error code is registered
                    for case in error_codes_to_recover[errorCode]:
                        match = case['details']
                        matched = (match is None)
                        if not matched:
                            matched = False
                            for detail in details:
                                if match in detail:
                                    print "[recover] Could find keyword", match, "in"
                                    print 50 * "#"
                                    print detail
                                    print 50 * "#"
                                    matched = True
                                    break
                        if matched and rate > case['rate']:
                            print "\t\t => we should be able to recover that", case['legend']
                            ## use the loop's error code; the bare 'code' here
                            ## was a stale leak from the comprehension above
                            task_to_recover[task].append((errorCode, case))
                            added_in_recover = True
                            message_to_user = ""
                        else:
                            print "\t\t recoverable but not frequent enough, needs", case[
                                'rate']

                if errorCode in error_codes_to_block:
                    for case in error_codes_to_block[errorCode]:
                        match = case['details']
                        matched = (match is None)
                        if not matched:
                            matched = False
                            for detail in details:
                                if match in detail:
                                    print "[block] Could find keyword", match, "in"
                                    print 50 * "#"
                                    print detail
                                    print 50 * "#"
                                    matched = True
                                    break
                        if matched and rate > case['rate']:
                            print "\t\t => that error means no ACDC on that workflow", case[
                                'legend']
                            if not options.go:
                                message_to_ops += "%s has an error %s blocking an ACDC.\n%s\n " % (
                                    wfo.name, errorCode, '#' * 50)
                                recover = False
                                added_in_recover = False

                if errorCode in error_codes_to_notify and not added_in_recover:
                    print "\t\t => we should notify people on this"
                    message_to_user += "%s has an error %s in processing.\n%s\n" % (
                        wfo.name, errorCode, '#' * 50)

        if message_to_user:
            print wfo.name, "to be notified to user(DUMMY)", message_to_user

        if message_to_ops:
            #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**'])
            sendLog('recoveror', message_to_ops, level='warning')

        if len(task_to_recover) != len(all_errors):
            print "Should not be doing partial ACDC. skipping"
            #sendEmail('recoveror','do not want to make partial acdc on %s'%wfo.name)
            sendLog('recoveror',
                    'do not want to make partial acdc on %s' % wfo.name,
                    level='warning')
            recover = False

        if task_to_recover and recover:
            print "Initiating recovery"
            print ', '.join(task_to_recover.keys()), "to be recovered"

            recovering = set()
            for task in task_to_recover:
                print "Will be making a recovery workflow for", task

                ## from here you can fetch known solutions, to known error codes
                actions = list(
                    set([
                        case['solution']
                        for code, case in task_to_recover[task]
                    ]))
                acdc = singleRecovery(url,
                                      task,
                                      wfi.request,
                                      actions,
                                      do=options.do)

                if not acdc:
                    if options.do:
                        if recovering:
                            print wfo.name, "has been partially ACDCed. Needs manual attention"
                            #sendEmail( "failed ACDC partial recovery","%s has had %s/%s recoveries %s only"%( wfo.name, len(recovering), len(task_to_recover), list(recovering)), destination=['*****@*****.**'])
                            sendLog('recoveror',
                                    "%s has had %s/%s recoveries %s only" %
                                    (wfo.name, len(recovering),
                                     len(task_to_recover), list(recovering)),
                                    level='critical')
                            continue
                        else:
                            print wfo.name, "failed recovery once"
                            #break
                            continue
                    else:
                        print "no action to take further"
                        sendLog('recoveror',
                                "ACDC for %s can be done automatically" %
                                wfo.name,
                                level='critical')
                        continue

                ## and assign it ?
                team = wfi.request['Team']
                #assign_to_sites = set(SI.sites_ready) ## that needs to be massaged to prevent assigning to something out.
                assign_to_sites = set(SI.all_sites)
                parameters = {
                    #'SiteWhitelist' : wfi.request['SiteWhitelist'],
                    'SiteWhitelist': sorted(assign_to_sites),
                    'AcquisitionEra': wfi.acquisitionEra(),
                    'ProcessingString': wfi.processingString(),
                    'MergedLFNBase': wfi.request['MergedLFNBase'],
                    'ProcessingVersion': wfi.request['ProcessingVersion'],
                }
                ## hackery for ACDC merge assignment
                if wfi.request[
                        'RequestType'] == 'TaskChain' and 'Merge' in task.split(
                            '/')[-1]:
                    parameters['AcquisitionEra'] = None
                    parameters['ProcessingString'] = None

                ## xrootd settings on primary and secondary
                if 'TrustSitelists' in wfi.request and wfi.request[
                        'TrustSitelists']:
                    parameters['TrustSitelists'] = True
                if 'TrustPUSitelists' in wfi.request and wfi.request[
                        'TrustPUSitelists']:
                    parameters['TrustPUSitelists'] = True

                if options.ass:
                    print "really doing the assignment of the ACDC", acdc
                    parameters['execute'] = True
                    wfi.sendLog('recoveror',
                                "%s  was assigned for recovery" % acdc)
                else:
                    print "no assignment done with this ACDC", acdc
                    sendLog('recoveror',
                            "%s needs to be assigned" % (acdc),
                            level='critical')

                result = reqMgrClient.assignWorkflow(url, acdc, team,
                                                     parameters)
                if not result:
                    print acdc, "was not asigned"
                    sendLog('recoveror',
                            "%s needs to be assigned" % (acdc),
                            level='critical')
                else:
                    recovering.add(acdc)

            current = None
            if recovering:
                #if all went well, set the status to -recovering
                current = wfo.status
                if options.ass:
                    current = current.replace('recovery', 'recovering')
                else:
                    current = 'assistance-manual'
                print 'created ACDC: ' + ', '.join(recovering)
            else:
                ## was set to be recovered, and no acdc was made
                current = 'assistance-manual'

            if current:
                print wfo.name, "setting the status to", current
                wfo.status = current
                session.commit()
        else:
            ## this workflow should be handled manually at that point
            print wfo.name, "needs manual intervention"
            wfo.status = 'assistance-manual'
            session.commit()
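
A minimal sketch of the error-matching rule applied above: a configured case
fires when its 'details' keyword (None matches anything) occurs in one of the
observed detail strings and the failure rate exceeds the case's 'rate'
threshold (field names from the snippet; the standalone helper is hypothetical):

def case_matches(case, details, rate):
    ## 'details': observed error detail strings for one error code
    ## 'rate': percentage of failed jobs carrying that error code
    match = case['details']
    matched = (match is None) or any(match in d for d in details)
    return matched and rate > case['rate']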
Ejemplo n.º 9
0
def transferor(url, specific=None, talk=True, options=None):
    if userLock(): return
    mlock = moduleLock()
    if mlock(): return

    use_mcm = True
    up = componentInfo(soft=['mcm', 'wtc', 'jira'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()  ## used below for max_per_round; missing from this snippet
    #NLI = newLockInfo()
    #if not NLI.free(): return
    LI = lockInfo()
    if not LI.free(): return

    mcm = McMClient(dev=False)
    dss = DSS()

    #allowed_secondary = UC.get('')
    print "counting all being handled..."
    being_handled = len(
        session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('stag')).all())
    being_transfered = len(
        session.query(Workflow).filter(Workflow.status == 'staging').all())
    #being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance-')).filter(
                ~Workflow.status.contains('custodial')).all())

    max_to_handle = options.maxworkflows
    max_to_transfer = options.maxstaging

    allowed_to_handle = max(0, max_to_handle - being_handled)
    allowed_to_transfer = max(0, max_to_transfer - being_transfered)
    wf_buffer = 5
    if allowed_to_handle <= wf_buffer:  ## buffer for having several wf per transfer
        print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer"
    else:
        print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer"

    if allowed_to_transfer <= wf_buffer:
        print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transfered, "and", wf_buffer, "buffer"
    else:
        print being_transfered, "already being transfered", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer"

    print "... done"

    all_transfers = defaultdict(list)
    workflow_dependencies = defaultdict(
        set)  ## list of wf.id per input dataset
    wfs_and_wfh = []
    max_per_round = UC.get('max_per_round').get('transferor', None)

    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    all_to_include = session.query(Workflow).filter(
        Workflow.status.startswith('considered')).all()
    if len(cache) > 2000:
        max_to_include = max_per_round
        random.shuffle(cache)  ## randomize first by wf name
        cache = sorted(cache, key=lambda r: r['RequestPriority'],
                       reverse=True)  ## order by prio
        highest = [r['RequestName'] for r in cache[:max_to_include]]
        all_to_include = [wfo for wfo in all_to_include if wfo.name in highest]
        print "limiting what to consider to", max_to_include, "because there is too much stuff going on. Got", len(
            all_to_include)

    for wfo in all_to_include:
        print "\t", wfo.name
        if specific and not specific in wfo.name: continue
        cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append((wfo,
                                workflowInfo(url,
                                             wfo.name,
                                             spec=False,
                                             request=cache_r[0])))
        else:
            wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False)))
    print "... done"

    transfers_per_sites = defaultdict(int)
    input_sizes = defaultdict(float)
    ignored_input_sizes = defaultdict(float)
    input_cput = {}
    input_st = {}
    ## list the size of those in transfer already
    in_transfer_priority = None
    min_transfer_priority = None
    print "getting all wf in staging ..."
    #stucks = json.loads(open('%s/stuck_transfers.json'%monitor_pub_dir).read())
    stucks = json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir))

    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfh = workflowInfo(url, wfo.name, spec=False)
        #(lheinput,primary,parent,secondary) = wfh.getIO()
        #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        print wfo.name, "staging"
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        for site in sites_allowed:  ## we should get the actual transfer destination instead of the full white list
            transfers_per_sites[site] += 1
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        blocks = wfh.getBlocks()
        for prim in primary:
            ds_s = dss.get(prim, blocks=blocks)
            if prim in stucks:
                wfh.sendLog(
                    'transferor',
                    "%s appears stuck, so not counting it %s [GB]" %
                    (prim, ds_s))
                ignored_input_sizes[prim] = max(ds_s,
                                                ignored_input_sizes[prim])
            else:
                input_sizes[prim] = max(ds_s, input_sizes[prim])
                wfh.sendLog('transferor',
                            "%s needs %s [GB]" % (wfo.name, ds_s))
        if in_transfer_priority is None:
            in_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            in_transfer_priority = max(in_transfer_priority,
                                       int(wfh.request['RequestPriority']))
        if min_transfer_priority is None:
            min_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            min_transfer_priority = min(min_transfer_priority,
                                        int(wfh.request['RequestPriority']))

    try:
        print "Ignored input sizes"
        ignored_values = list(ignored_input_sizes.items())
        ignored_values.sort(key=lambda i: i[1])
        print "\n".join(map(str, ignored_values))
        print "Considered input sizes"
        considered_values = list(input_sizes.items())
        considered_values.sort(key=lambda i: i[1])
        print "\n".join(map(str, considered_values))
    except Exception as e:
        print "trying to print the summary of input size"
        print str(e)

    print "... done"
    print "Max priority in transfer already", in_transfer_priority
    print "Min priority in transfer already", min_transfer_priority
    print "transfers per sites"
    print json.dumps(transfers_per_sites, indent=2)
    in_transfer_already = sum(input_sizes.values())
    cput_in_transfer_already = sum(input_cput.values())
    st_in_transfer_already = sum(input_st.values())

    ## list the size of all inputs
    primary_input_per_workflow_gb = defaultdict(float)
    print "getting all input sizes ..."
    input_blocks = {}
    for (wfo, wfh) in wfs_and_wfh:
        (_, primary, _, _) = wfh.getIO()
        blocks = wfh.getBlocks()
        input_blocks[wfo.name] = blocks
        for prim in primary:
            ## do not count it if it appears stalled !
            prim_size = dss.get(prim, blocks=blocks)
            input_sizes[prim] = max(prim_size, input_sizes[prim])
            primary_input_per_workflow_gb[wfo.name] += prim_size
    print "... done"

    # shuffle first by name
    random.shuffle(wfs_and_wfh)

    # Sort smallest transfers first; this lets us transfer as many workflows as possible.
    def prio_and_size(i, j):
        if int(i[1].request['RequestPriority']) == int(
                j[1].request['RequestPriority']):
            return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)),
                       int(primary_input_per_workflow_gb.get(i[0].name, 0)))
        else:
            return cmp(int(i[1].request['RequestPriority']),
                       int(j[1].request['RequestPriority']))

    #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True)
    #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) ))
    #sort by priority higher first
    wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[
        'RequestPriority']), int(j[1].request['RequestPriority'])),
                     reverse=True)
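    ## a cmp-free equivalent (illustrative, also valid on Python 3):
    ##   wfs_and_wfh.sort(key=lambda t: int(t[1].request['RequestPriority']),
    ##                    reverse=True)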

    if min_transfer_priority is None or in_transfer_priority is None:
        print "nothing is lining up for transfer"
        sendLog(
            "transferor",
            "No request in staging, using first request to set priority limit")
        if len(wfs_and_wfh):
            min_transfer_priority = wfs_and_wfh[0][1].request[
                'RequestPriority']
            in_transfer_priority = wfs_and_wfh[0][1].request['RequestPriority']
        else:
            return

    cput_grand_total = sum(input_cput.values())
    cput_to_transfer = cput_grand_total - cput_in_transfer_already
    st_grand_total = sum(input_st.values())
    st_to_transfer = st_grand_total - st_in_transfer_already
    print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already
    print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer
    print "%15.4f [h] worth of absolute system time in transfer" % (
        cput_in_transfer_already / SI.availableSlots())
    print "%15.4f [h] worth of absolute system time is the current requested transfer load" % (
        cput_to_transfer / SI.availableSlots())
    print "%15.4f [h] worth of theoritical system time in transfer" % (
        st_in_transfer_already)
    print "%15.4f [h] worth of theoritical system time is the current requested transfer load" % (
        st_to_transfer)

    grand_total = sum(input_sizes.values())
    to_transfer = grand_total - in_transfer_already
    grand_transfer_limit = options.maxtransfer
    #grand_transfer_limit = SI.total_disk()*0.25*1024 ## a quarter of the total disk space in TB->GB

    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered" % in_transfer_already
    print "%15.4f GB is the current requested transfer load" % to_transfer
    print "%15.4f GB is the global transfer limit" % grand_transfer_limit
    print "%15.4f GB is the available limit" % transfer_limit

    max_staging_per_site = options.maxstagingpersite

    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer = 0  ## so that we can count them
    passing_along = 0
    transfer_sizes = defaultdict(float)
    went_over_budget = False
    destination_cache = {}
    no_goes = set()

    if max_per_round and not specific:
        wfs_and_wfh = wfs_and_wfh[:max_per_round]

    for (wfo, wfh) in wfs_and_wfh:
        print wfo.name, "to be transfered with priority", wfh.request[
            'RequestPriority']

        if wfh.request['RequestStatus'] != 'assignment-approved':
            if wfh.request['RequestStatus'] in [
                    'aborted', 'rejected', 'rejected-archived',
                    'aborted-archived'
            ]:
                if wfh.isRelval():
                    wfo.status = 'forget'
                else:
                    wfo.status = 'trouble'  ## so that we look for a replacement
            else:
                wfo.status = 'away'
            wfh.sendLog(
                'transferor', '%s in status %s, setting %s' %
                (wfo.name, wfh.request['RequestStatus'], wfo.status))
            continue

        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        blocks = input_blocks.get(wfo.name, wfh.getBlocks())
        if blocks:
            print "Reading only", len(blocks), "blocks in input"
        this_load = sum([dss.get(prim, blocks=blocks) for prim in primary])
        no_budget = False
        if (this_load
                and (sum(transfer_sizes.values()) + this_load > transfer_limit
                     or went_over_budget)):
            if went_over_budget:
                wfh.sendLog('transferor', "Transfer has gone over budget.")
            else:
                wfh.sendLog('transferor', "Transfer will go over budget.")
            wfh.sendLog(
                'transferor',
                "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"
                % (this_load, sum(transfer_sizes.values()), transfer_limit))
            #if sum(transfer_sizes.values()) > transfer_limit:
            went_over_budget = True
            if in_transfer_priority is not None and min_transfer_priority is not None:
                if int(
                        wfh.request['RequestPriority']
                ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over budget" %
                        (wfh.request['RequestPriority'], in_transfer_priority))
                else:
                    if not options.go:
                        wfh.sendLog(
                            'transferor',
                            "minimum priority in transfer %s; this request %s < %s : stop" %
                            (min_transfer_priority,
                             wfh.request['RequestPriority'],
                             in_transfer_priority))
                        no_budget = True

        ## throttle by campaign go
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True
            no_goes.add(wfo.name)

        allowed_secondary = {}
        overide_parameters = {}
        check_secondary = (not wfh.isRelval())
        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                overide_parameters.update(CI.campaigns[campaign])
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'transferor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('transferor',
                            'These data tiers %s are not allowed in %s' %
                            (','.join(banned_tier), wfo.name),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                msg = '%s is not an allowed secondary' % (
                    ', '.join(set(secondary) - set(allowed_secondary.keys())))
                wfh.sendLog('transferor', msg)
                critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format(
                    wfh.getPrepIDs()[0])
                sendLog('transferor', critical_msg, level='critical')
                if not options.go:
                    no_go = True
            for sec in secondary:
                if sec in allowed_secondary:
                    overide_parameters.update(allowed_secondary[sec])

        if 'SiteWhitelist' in overide_parameters:
            sites_allowed = list(
                set(sites_allowed) & set(overide_parameters['SiteWhitelist']))
            wfh.sendLog(
                'transferor',
                'Intersecting with the overriding whitelist parameters, allowed sites become {}'
                .format(sites_allowed))

        if no_go:
            continue

        if passing_along >= allowed_to_handle:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if in_transfer_priority is not None and min_transfer_priority is not None:
                if int(wfh.request['RequestPriority']
                       ) >= in_transfer_priority and int(
                           wfh.request['RequestPriority']
                       ) != min_transfer_priority:
                    ## higher priority, and not only this priority being transferred
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over %s" %
                        (wfh.request['RequestPriority'], in_transfer_priority,
                         max_to_handle))
                else:
                    wfh.sendLog(
                        'transferor',
                        " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"
                        % (max_to_handle, being_handled, passing_along))
                    if not options.go:
                        ## should not allow to jump that fence
                        break

        if this_load and needs_transfer >= allowed_to_transfer:
            if in_transfer_priority is not None and min_transfer_priority is not None:
                if int(wfh.request['RequestPriority']
                       ) >= in_transfer_priority and int(
                           wfh.request['RequestPriority']
                       ) != min_transfer_priority:
                    ## higher priority, and not only this priority being transferred
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s go-on over %s" %
                        (wfh.request['RequestPriority'], in_transfer_priority,
                         max_to_transfer))
                else:
                    wfh.sendLog(
                        'transferor',
                        "Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s"
                        % (max_to_transfer, being_transfered, needs_transfer))
                    if not options.go:
                        no_budget = True

        if no_budget:
            continue
        #    break ## try this for a while to make things faster

        ## the site white list considers site, campaign, memory and core information
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')

        for dataset in list(primary) + list(parent) + list(secondary):
            LI.lock(dataset, reason='staging')

        if not sites_allowed:
            wfh.sendLog('transferor', "not possible site to run at")
            #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            sendLog('transferor',
                    "%s has no possible sites to run at" % (wfo.name),
                    level='critical')
            continue

        can_go = True
        staging = False
        allowed = True
        primary_destinations = set()
        if primary:

            copies_needed_from_CPUh, CPUh = wfh.getNCopies()

            if talk:
                print wfo.name, 'reads', ', '.join(primary), 'in primary'
            ## chop up the primary dataset
            for prim in primary:
                ## keep track of what needs what
                workflow_dependencies[prim].add(wfo.id)

                max_priority[prim] = max(max_priority[prim],
                                         int(wfh.request['RequestPriority']))

                wfh.sendLog(
                    'transferor', "Would make %s copies from cpu requirement %s" %
                    (copies_needed_from_CPUh, CPUh))
                copies_needed = copies_needed_from_CPUh

                if 'Campaign' in wfh.request and wfh.request[
                        'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                            wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[
                        wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign,
                                        copies_needed)

                    wfh.sendLog(
                        'transferor',
                        "Maxed to %s by campaign configuration %s" %
                        (copies_needed, wfh.request['Campaign']))
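                    ## e.g. (hypothetical values) a CPU-based estimate of 3
                    ## copies capped by a campaign 'maxcopies' of 2 leaves
                    ## copies_needed = 2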

                if blocks:
                    print "limiting to blocks", "\n".join(sorted(blocks))
                ### new ways of making the whole thing
                destinations, all_block_names = getDatasetDestinations(
                    url,
                    prim,
                    within_sites=[SI.CE_to_SE(site) for site in sites_allowed],
                    only_blocks=blocks)
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [
                    site for (site, info) in destinations.items()
                    if info['completion'] == 100 and info['data_fraction'] == 1
                ]
                ## the rest is places it is going to be
                #prim_destination = [site for site in destinations.keys() if not site in prim_location]
                prim_destination = [
                    site for (site, info) in destinations.items()
                    if info['data_fraction'] == 1 and info['completion'] != 100
                ]
                ## veto the site with no current disk space, for things that are not relval
                prim_destination = [
                    site for site in prim_destination
                    if (SI.disk[site] or wfh.isRelval())
                ]

                if len(prim_location) >= copies_needed:
                    wfh.sendLog(
                        'transferor',
                        "The input is all fully in place at %s sites %s" %
                        (len(prim_location), sorted(prim_location)))
                    continue
                copies_needed = max(0, copies_needed - len(prim_location))
                wfh.sendLog(
                    'transferor',
                    "Counting existing copies ; now need %s" % copies_needed)
                copies_being_made = [
                    sum([
                        info['blocks'].keys().count(block)
                        for site, info in destinations.items()
                        if site in prim_destination
                    ]) for block in all_block_names
                ]
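                ## for each block, count how many in-flight destinations already
                ## hold it; min(copies_being_made) below is then the number of
                ## full extra copies effectively in the making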

                latching_on_transfers = set()
                [
                    latching_on_transfers.update(info['blocks'].values())
                    for site, info in destinations.items()
                    if site in prim_destination
                ]
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [
                    site for site in sites_allowed
                    if not SI.CE_to_SE(site) in prim_location
                ]
                prim_to_distribute = [
                    site for site in prim_to_distribute
                    if not SI.CE_to_SE(site) in prim_destination
                ]
                ## take out the ones that cannot receive transfers
                potential_destinations = len(prim_to_distribute)
                #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer]
                prim_to_distribute = [
                    site for site in prim_to_distribute
                    if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval())
                ]

                ## do we want to restrict transfers if the number of vetoed sites is too large ?

                wfh.sendLog(
                    'transferor',
                    "Could be going to: %s" % sorted(prim_to_distribute))
                if not prim_to_distribute or any([
                        transfers_per_sites[site] < max_staging_per_site
                        for site in prim_to_distribute
                ]):
                    ## there are open transfer slots; proceed
                    print "There are transfer slots available:", [
                        (site, transfers_per_sites[site])
                        for site in prim_to_distribute
                    ]
                else:
                    if int(
                            wfh.request['RequestPriority']
                    ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                        wfh.sendLog(
                            'transferor',
                            "Higher priority sample %s >= %s go-on over transfer slots available"
                            % (wfh.request['RequestPriority'],
                               in_transfer_priority))
                    else:
                        wfh.sendLog(
                            'transferor',
                            "Not allowed to transfer more than %s per site at a time. Going overboard for %s"
                            % (max_staging_per_site,
                               sorted([
                                   site for site in prim_to_distribute
                                   if transfers_per_sites[site] >=
                                   max_staging_per_site
                               ])))
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:

                    existings = session.query(TransferImp).filter(
                        TransferImp.phedexid == int(latching)).filter(
                            TransferImp.workflow_id == wfo.id).all()
                    if not existings:
                        tri = TransferImp(phedexid=int(latching), workflow=wfo)
                        print "adding", wfo.id, "with phedexid", latching
                        session.add(tri)
                    else:
                        for existing in existings:
                            existing.active = True

                    session.flush()

                    can_go = False
                    transfer_sizes[prim] = max(this_load, transfer_sizes[prim])
                    staging = True

                # reduce the required copies by those already being made in full; open question: how to latch onto and wait for them
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0, copies_needed - min(copies_being_made))
                wfh.sendLog(
                    'transferor',
                    "Counting the copies being made ; then need %s" %
                    copies_needed)
                if copies_needed == 0:
                    wfh.sendLog(
                        'transferor',
                        "The input is either fully in place or getting in full somewhere with %s"
                        % latching_on_transfers)
                    can_go = True
                    continue
                elif len(prim_to_distribute) == 0:
                    wfh.sendLog(
                        'transferor',
                        "We are going to need extra copies of %s, but no destination seems available"
                        % (prim))
                    sendLog(
                        'transferor',
                        "We are going to need extra copies of %s, but no destination seems available"
                        % (prim),
                        level='critical')

                    print json.dumps(prim_to_distribute, indent=2)
                    print json.dumps(prim_location, indent=2)
                    print json.dumps(prim_destination, indent=2)

                    prim_to_distribute = [
                        site for site in sites_allowed
                        if not SI.CE_to_SE(site) in prim_location
                    ]
                    #prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in SI.sites_veto_transfer ]
                    prim_to_distribute = [
                        site for site in prim_to_distribute
                        if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval())
                    ]

                    print "changed to"
                    print json.dumps(prim_to_distribute, indent=2)

                if len(
                        prim_to_distribute
                ) > 0:  ## maybe that a parameter we can play with to limit the
                    if not options or options.chop:
                        ### hard-include the tape disk endpoint ?
                        #tapes = [site for site in  getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')]
                        chops, sizes = getDatasetChops(
                            prim,
                            chop_threshold=options.chopsize,
                            only_blocks=blocks)
                        spreading = distributeToSites(chops,
                                                      prim_to_distribute,
                                                      n_copies=copies_needed,
                                                      weights=SI.cpu_pledges,
                                                      sizes=sizes)
                        ## prune the blocks/destinations already in the making, so that subscriptions don't overlap
                        for site in spreading:
                            for block in list(spreading[site]):
                                if site in destinations and block in destinations[
                                        site]['blocks'].keys():
                                    ## prune it
                                    spreading[site].remove(block)

                        transfer_sizes[prim] = sum(sizes)
                        if not spreading:
                            sendLog(
                                'transferor',
                                'cannot send %s to any site, it cannot fit anywhere'
                                % prim,
                                level='critical')
                            wfh.sendLog(
                                'transferor',
                                "cannot send to any site. %s cannot seem to fit anywhere"
                                % (prim))
                            staging = False
                            can_go = False

                    else:
                        spreading = {}
                        for site in prim_to_distribute:
                            if blocks:
                                spreading[site] = blocks
                            else:
                                spreading[site] = [prim]
                        transfer_sizes[prim] = max(this_load,
                                                   transfer_sizes[prim])
                    can_go = False
                    wfh.sendLog(
                        'transferor', "selected CE destinations %s" %
                        (sorted(spreading.keys())))
                    for (site, items) in spreading.items():
                        all_transfers[site].extend(items)
                        transfers_per_sites[site] += 1
                        primary_destinations.add(site)
                else:
                    can_go = False
                    allowed = False

        if not allowed:
            wfh.sendLog('transferor', "Not allowed to move on with")
            continue

        if secondary:

            override_sec_destination = []
            if wfh.request['Campaign'] in CI.campaigns and 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]:
                override_sec_destination = CI.campaigns[
                    wfh.request['Campaign']]['SecondaryLocation']
            if 'SecondaryLocation' in overide_parameters:
                override_sec_destination = overide_parameters[
                    'SecondaryLocation']
            print wfo.name, 'reads', ', '.join(secondary), 'in secondary'
            for sec in secondary:

                workflow_dependencies[sec].add(wfo.id)

                if True:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbaric: it does not show the correct per-workflow picture when whitelists differ
                        destination_cache[sec], _ = getDatasetDestinations(
                            url, sec)  ## NO SITE WHITE LIST ADDED
                        #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])

                    ## limit to the site whitelist NOW
                    se_allowed = set(
                        [SI.CE_to_SE(site) for site in sites_allowed])
                    destinations = dict([
                        (k, v) for (k, v) in destination_cache[sec].items()
                        if k in se_allowed
                    ])
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [
                        destinations.pop(site)
                        for (site, info) in destinations.items()
                        if info['data_fraction'] < 0.9
                    ]
                    print sec, json.dumps(destinations, indent=2)
                    sec_location = [
                        site for (site, info) in destinations.items()
                        if info['completion'] >= 95
                    ]
                    sec_destination = [
                        site for site in destinations.keys()
                        if not site in sec_location
                    ]  ## this is in SE
                else:
                    ## old style
                    presence = getDatasetPresence(url, sec)
                    sec_location = [
                        site for site, pres in presence.items()
                        if pres[1] > 90.
                    ]  ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions(url, sec)
                    sec_destination = [site for site in subscriptions]

                ## how to make unified understand that it has to wait for the secondary if the sec_destination and

                #sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [
                    site for site in sites_allowed
                    if not SI.CE_to_SE(site) in sec_location
                ]
                #sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [
                    site for site in sec_to_distribute
                    if not SI.CE_to_SE(site) in sec_destination
                ]
                presitespace_sec_to_distribute = copy.deepcopy(
                    sec_to_distribute)
                #sec_to_distribute = [site for site in sec_to_distribute if not  any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                #sec_to_distribute = [site for site in sec_to_distribute if not  SI.CE_to_SE(site) in SI.sites_veto_transfer]
                sec_to_distribute = [
                    site for site in sec_to_distribute
                    if (SI.disk[SI.CE_to_SE(site)] or wfh.isRelval())
                ]
                ## at this point you have a problem
                if len(sec_to_distribute) == 0 and len(
                        presitespace_sec_to_distribute):
                    sendLog(
                        'transferor',
                        '%s is getting no possible destinations because of lack of space. To be decided what to do in general'
                        % (sec),
                        level='critical')

                if override_sec_destination:
                    ## intersect with where we want the PU to be
                    not_needed_anymore = list(
                        set(sec_to_distribute) - set(override_sec_destination))
                    #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sendLog(
                        'transferor',
                        "the dataset %s could be removed from %s" %
                        (sec, not_needed_anymore))
                    sec_to_distribute = list(
                        set(sec_to_distribute) & set(override_sec_destination))

                if len(sec_to_distribute) > 0:
                    print "secondary could go to", sorted(sec_to_distribute)
                    sec_size = dss.get(sec)
                    for site in sec_to_distribute:
                        site_se = SI.CE_to_SE(site)
                        if (SI.disk[site_se] *
                                1024.) > sec_size or wfh.isRelval():
                            wfh.sendLog('transferor',
                                        'Sending %s to %s' % (sec, site))
                            all_transfers[site].append(sec)
                            can_go = False
                        else:
                            print "could not send the secondary input to", site_se, "because it is too big for the available disk", SI.disk[
                                site_se] * 1024, "GB need", sec_size
                            if primary_destinations and site in primary_destinations:
                                #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))
                                sendLog(
                                    'transferor',
                                    '%s is too big (%s) for %s (%s). %s will not be able to run there.'
                                    % (sec, sec_size, site_se,
                                       SI.disk[site_se] * 1024, wfo.name),
                                    level='critical')
                                wfh.sendLog(
                                    'transferor',
                                    '%s is too big (%s) for %s (%s). will not be able to run there.'
                                    % (sec, sec_size, site_se,
                                       SI.disk[site_se] * 1024))
                else:
                    ## this is bad overall
                    print "the secondary input does not have to be sent to any site"

        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                wfh.sendLog(
                    'transferor',
                    "latches on existing transfers, and nothing else, setting staging"
                )
                wfo.status = 'staging'
                needs_transfer += 1
            else:
                wfh.sendLog(
                    'transferor', "should just be assigned now to %s" %
                    sorted(sites_allowed))
                wfo.status = 'staged'
            passing_along += 1
            wfh.sendLog('transferor',
                        "setting %s status to %s" % (wfo.name, wfo.status))
            #session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                wfh.sendLog('transferor', "latches on existing transfers")
                if not options.test:
                    wfo.status = 'staging'
                    wfh.sendLog(
                        'transferor',
                        "setting %s status to %s" % (wfo.name, wfo.status))
                    #session.commit()
            wfh.sendLog('transferor', "needs a transfer")
            needs_transfer += 1
            passing_along += 1

    if no_goes:
        #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes ))
        sendLog('transferor',
                "No go for \n" + "\n".join(sorted(no_goes)),
                level='critical')

    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id = -1
    wf_id_in_prestaging = set()

    for (site, items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## sites that do not want input datasets
        #if site in SI.sites_veto_transfer:
        #    print site,"does not want transfers"
        #    continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        block_datasets = list(set([it.split('#')[0] for it in blocks]))
        datasets = [it for it in items_to_transfer if not '#' in it]

        details_text = "Making a replica to %s (CE) %s (SE) for" % (site,
                                                                    site_se)

        #print "\t",len(blocks),"blocks"
        ## remove blocks if the full dataset is sent out
        blocks = [
            block for block in blocks if not block.split('#')[0] in datasets
        ]
        #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        #print "\t",len(datasets),"datasets"
        #print "\t",datasets
        details_text += '\n\t%d blocks' % len(blocks)
        details_text += '\n\t%d needed blocks for %s' % (
            len(blocks),
            sorted(list(set([block.split('#')[0] for block in blocks]))))
        details_text += '\n\t%d datasets' % len(datasets)
        details_text += '\n\t%s' % sorted(datasets)

        items_to_transfer = blocks + datasets

        if execute:
            sendLog('transferor', details_text)
        else:
            print "Would make a replica to", site, "(CE)", site_se, "(SE) for"
            print details_text

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y', 'yes', 'go']:
                continue
        transfered_items = defaultdict(set)
        if execute:
            priority = 'normal'
            cds = [
                ds for ds in set(datasets + block_datasets)
                if ds in max_priority
            ]
            ## bucketize the transfers by priority of workflows
            prioritized_items = defaultdict(set)
            for item in items_to_transfer:
                d = item.split('#')[0]
                p = max_priority.get(d, 80000)
                q = 'normal'
                if p > 100000:
                    q = 'reserved'
                elif p < 70000:
                    q = 'low'
                prioritized_items[q].add(item)

            for priority, items in prioritized_items.items():
                result = makeReplicaRequest(url,
                                            site_se,
                                            list(items),
                                            'prestaging',
                                            priority=priority,
                                            approve=True)
                if result:
                    these_transfers = [
                        o['id'] for o in result['phedex']['request_created']
                    ]
                    #phedexids.extend( these_transfers )
                    for ph in these_transfers:
                        transfered_items[ph].update(items)
                else:
                    sendLog(
                        'transferor',
                        'Could not make a replica request for items %s to site %s'
                        % (items, site_se),
                        level='critical')

            #result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority, approve=True)
            #phedexids = [o['id'] for o in result['phedex']['request_created']]:
        #else:
        #    #result= {'phedex':{'request_created' : []}}
        #    phedexids = []
        #    fake_id-=1

        ## only alarm when we actually tried to create requests
        if execute and not transfered_items:
            sendLog(
                'transferor',
                'Could not make a replica request for items %s to site %s' %
                (items_to_transfer, site),
                level='critical')
            continue
        for phedexid, items in transfered_items.items():
            print phedexid, "transfer created"
            for transfering in list(
                    set(map(lambda it: it.split('#')[0], items))):
                for wfid in workflow_dependencies[transfering]:
                    new_transfer = session.query(TransferImp).filter(
                        TransferImp.phedexid == int(phedexid)).filter(
                            TransferImp.workflow_id == wfid).first()
                    if not new_transfer:
                        new_transfer = TransferImp(
                            phedexid=phedexid,
                            workflow=session.query(Workflow).get(wfid))
                        session.add(new_transfer)
                    else:
                        new_transfer.active = True

                    wf_id_in_prestaging.add(wfid)
            #session.commit()

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status != 'staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting", tr_wf.name, "to staging"
        #session.commit()

    ## one big session commit at the end once everything went fine
    session.commit()
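
The bucketing above decides which PhEDEx queue each item lands in from the highest priority of the workflows that depend on its dataset. A minimal standalone sketch of that mapping, with the 70000/100000 thresholds taken from the code above; the helper name bucketize_by_priority is ours, not part of the toolkit:

from collections import defaultdict

def bucketize_by_priority(items_to_transfer, max_priority, default=80000):
    ## group items (datasets or blocks) into PhEDEx priority queues,
    ## keyed by the highest workflow priority attached to the parent dataset
    prioritized_items = defaultdict(set)
    for item in items_to_transfer:
        d = item.split('#')[0]           ## a block maps to its parent dataset
        p = max_priority.get(d, default)
        q = 'normal'
        if p > 100000:
            q = 'reserved'
        elif p < 70000:
            q = 'low'
        prioritized_items[q].add(item)
    return prioritized_items

buckets = bucketize_by_priority(['/A/B/RAW#1', '/C/D/AOD'], {'/A/B/RAW': 110000})
print dict((k, sorted(v)) for (k, v) in buckets.items())
## {'reserved': ['/A/B/RAW#1'], 'normal': ['/C/D/AOD']}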
Ejemplo n.º 10
0
def spawn_harvesting(url, wfi, in_full):
    SI = siteInfo()

    all_OK = {}
    requests = []
    outputs = wfi.request['OutputDatasets']
    if ('EnableHarvesting' in wfi.request and wfi.request['EnableHarvesting']
        ) or ('DQMConfigCacheID' in wfi.request
              and wfi.request['DQMConfigCacheID']):
        if not 'MergedLFNBase' in wfi.request:
            print "f****d up"
            sendEmail('screwed up wl cache',
                      '%s wl cache is bad' % (wfi.request['RequestName']))
            all_OK['fake'] = False
            return all_OK, requests

        wfi = workflowInfo(url, wfi.request['RequestName'])
        dqms = [out for out in outputs if '/DQM' in out]
        if not all([in_full[dqm_input] for dqm_input in dqms]):
            wfi.sendLog(
                'closor',
                "will not be able to assign the harvesting: holding up")
            for dqm_input in dqms:
                all_OK[dqm_input] = False
            return all_OK, requests

        for dqm_input in dqms:
            ## handle it properly
            harvesting_schema = {
                'Requestor': os.getenv('USER'),
                'RequestType': 'DQMHarvest',
                'Group': 'DATAOPS'
            }
            copy_over = [
                'ProcessingString',
                'DQMUploadUrl',
                'CMSSWVersion',
                'CouchDBName',
                'CouchWorkloadDBName',
                'CouchURL',
                'DbsUrl',
                'inputMode',
                'DQMConfigCacheID',
                'OpenRunningTimeout',
                'ScramArch',
                'CMSSWVersion',
                'Campaign',
                'Memory',  #dummy
                'SizePerEvent',  #dummy
                'GlobalTag',  #dummy
            ]
            for item in copy_over:
                harvesting_schema[item] = copy.deepcopy(wfi.request[item])
            harvesting_schema['InputDataset'] = dqm_input
            harvesting_schema['TimePerEvent'] = 1
            harvesting_schema['PrepID'] = 'Harvest-' + wfi.request['PrepID']
            harvesting_schema[
                'RequestString'] = 'HARVEST-' + wfi.request['RequestString']
            harvesting_schema['DQMHarvestUnit'] = 'byRun'
            harvesting_schema['ConfigCacheUrl'] = harvesting_schema[
                'CouchURL']  ## uhm, how stupid is that ?
            harvesting_schema[
                'RequestPriority'] = wfi.request['RequestPriority'] * 10

            harvest_request = reqMgrClient.submitWorkflow(
                url, harvesting_schema)
            if not harvest_request:
                print "Error in making harvesting for", wfo.name
                print "schema"
                print json.dumps(harvesting_schema, indent=2)
                harvest_request = reqMgrClient.submitWorkflow(
                    url, harvesting_schema)
                if not harvest_request:
                    print "Error twice in harvesting for", wfo.name
                    print "schema"
                    print json.dumps(harvesting_schema, indent=2)

            if harvest_request:
                requests.append(harvest_request)
                ## should we protect for setting approved ? no, it's notified below, assignment will fail, likely
                data = reqMgrClient.setWorkflowApproved(url, harvest_request)
                print "created", harvest_request, "for harvesting of", dqm_input
                wfi.sendLog(
                    'closor', "created %s for harvesting of %s" %
                    (harvest_request, dqm_input))
                ## assign it directly
                team = wfi.request['Teams'][0]
                parameters = {
                    'SiteWhitelist': [
                        SI.SE_to_CE(se)
                        for se in wfi.request['NonCustodialSites']
                    ],
                    'AcquisitionEra':
                    wfi.acquisitionEra(),
                    'ProcessingString':
                    wfi.processingString(),
                    'MergedLFNBase':
                    wfi.request['MergedLFNBase'],
                    'ProcessingVersion':
                    wfi.request['ProcessingVersion'],
                    'execute':
                    True
                }
                if in_full[dqm_input]:
                    print "using full copy at", in_full[dqm_input]
                    parameters['SiteWhitelist'] = [
                        SI.SE_to_CE(se) for se in in_full[dqm_input]
                    ]
                else:
                    print "cannot do anything if not having a full copy somewhere"
                    all_OK[dqm_input] = False
                    continue

                result = reqMgrClient.assignWorkflow(url, harvest_request,
                                                     team, parameters)
                if not result:
                    #sendEmail('harvesting request created','%s was created at announcement of %s in %s, failed to assign'%(harvest_request, dqm_input, wfi.request['RequestName']), destination=[wfi.request['Requestor']+'@cern.ch'])
                    wfi.sendLog(
                        'closor',
                        '%s was created at announcement of %s in %s, failed to assign'
                        % (harvest_request, dqm_input,
                           wfi.request['RequestName']))
                    sendLog(
                        'closor',
                        '%s was created at announcement of %s in %s, failed to assign'
                        % (harvest_request, dqm_input,
                           wfi.request['RequestName']),
                        level='critical')
                else:
                    #sendEmail('harvesting request assigned','%s was created at announcement of %s in %s, and assigned'%(harvest_request, dqm_input, wfi.request['RequestName']), destination=[wfi.request['Requestor']+'@cern.ch'])
                    wfi.sendLog(
                        'closor',
                        '%s was created at announcement of %s in %s, and assigned'
                        % (harvest_request, dqm_input,
                           wfi.request['RequestName']))

            else:
                #print "could not make the harvesting for",wfo.name,"not announcing"
                wfi.sendLog('closor', "could not make the harvesting request")
                sendLog('closor',
                        "could not make the harvesting request for %s" %
                        wfi.request['RequestName'],
                        level='critical')
                all_OK[dqm_input] = False
    return (all_OK, requests)
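
spawn_harvesting assembles the DQMHarvest schema by copying a fixed list of fields from the parent request and overriding the few that are specific to harvesting. A reduced sketch of that pattern, with an abridged field list and a stand-in parent dict (make_harvesting_schema is our name):

import copy

def make_harvesting_schema(parent, dqm_input, copy_over):
    ## static part of the schema
    schema = {'RequestType': 'DQMHarvest', 'Group': 'DATAOPS'}
    ## inherit configuration from the parent request
    for item in copy_over:
        schema[item] = copy.deepcopy(parent[item])
    ## override what is specific to the harvesting step
    schema['InputDataset'] = dqm_input
    schema['PrepID'] = 'Harvest-' + parent['PrepID']
    schema['RequestString'] = 'HARVEST-' + parent['RequestString']
    schema['RequestPriority'] = parent['RequestPriority'] * 10
    return schema

parent = {'CMSSWVersion': 'CMSSW_7_6_0', 'PrepID': 'XYZ-123',
          'RequestString': 'Foo', 'RequestPriority': 85000}
print make_harvesting_schema(parent, '/A/B/DQMIO', ['CMSSWVersion'])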
Ejemplo n.º 11
0
def transferor(url ,specific = None, talk=True, options=None):
    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()

    all_transfers=defaultdict(list)
    workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset
    data_to_wf = {}
    for wfo in session.query(Workflow).filter(Workflow.status=='considered').all():
        if specific and not specific in wfo.name: continue

        print wfo.name,"to be transfered"
        wfh = workflowInfo( url, wfo.name)

        #injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.)
        #now = time.mktime(time.gmtime()) / (60.*60.)
        #if float(now - injection_time) < 4.:
        #    print "It is too soon to transfer", now, injection_time
        #    continue

        (lheinput,primary,parent,secondary) = wfh.getIO()
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')
        else:
            sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist']

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']

        can_go = True
        if primary:
            if talk:
                print wfo.name,'reads',', '.join(primary),'in primary'
                ## chop the primary dataset
            for prim in primary:
                workflow_dependencies[prim].add( wfo.id )
                presence = getDatasetPresence( url, prim )
                prim_location = [site for site,pres in presence.items() if pres[0]==True]
                subscriptions = listSubscriptions( url , prim )
                prim_destination = [site for site in subscriptions]
                prim_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in prim_location])]
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in prim_destination])]
                if len(prim_to_distribute)>0: ## maybe that's a parameter we can play with to limit the number of transfers
                    if not options or options.chop:
                        spreading = distributeToSites( [[prim]]+getDatasetChops(prim), prim_to_distribute, n_copies = 3, weights=SI.cpu_pledges)
                    else:
                        spreading = {} 
                        for site in prim_to_distribute: spreading[site]=[prim]
                    can_go = False
                    for (site,items) in spreading.items():
                        all_transfers[site].extend( items )



        if secondary:
            if talk:
                print wfo.name,'reads',', '.join(secondary),'in secondary'
            for sec in secondary:
                workflow_dependencies[sec].add( wfo.id )
                presence = getDatasetPresence( url, sec )
                sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites
                subscriptions = listSubscriptions( url ,sec )
                sec_destination = [site for site in subscriptions] 
                sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                if len( sec_to_distribute )>0:
                    for site in sec_to_distribute:
                        all_transfers[site].append( sec )
                        can_go = False
        
        ## is it possible to do something more
        if can_go:
            print wfo.name,"should just be assigned NOW to",sites_allowed
            wfo.status = 'staged'
            session.commit()
            continue
        else:
            print wfo.name,"needs a transfer"

    #print json.dumps(all_transfers)
    fake_id=-1
    wf_id_in_prestaging=set()

    for (site,items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))
        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        datasets = [it for it in items_to_transfer if not '#' in it]

        print "Making a replica to",site,"(CE)",site_se,"(SE) for"
        print "\t",len(blocks),"blocks"
        ## remove blocks if the full dataset is sent out
        blocks = [block for block in blocks if not block.split('#')[0] in datasets]
        print "\t",len(blocks),"needed blocks"
        print "\t",len(datasets),"datasets"
        print "\t",datasets
        items_to_transfer = blocks + datasets

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y','yes','go']:
                continue

        if execute:
            result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging')
        else:
            #result= {'phedex':{'request_created' : [{'id' : fake_id}]}}
            result= {'phedex':{'request_created' : []}}
            fake_id-=1



        if not result:
            print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first()
            if not new_transfer:
                new_transfer = Transfer( phedexid = phedexid)
                session.add( new_transfer )                
            new_transfer.workflows_id = set()
            for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update( workflow_dependencies[transfering] )
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status!='staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting",tr_wf.name,"to staging"
        session.commit()
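
The item massaging is the same in every transferor variant: split the transfer list into blocks and datasets, then drop any block whose full dataset is being sent anyway. As a pure function (massage_items is our name):

def massage_items(items_to_transfer):
    ## blocks carry a '#', datasets do not
    blocks = [it for it in items_to_transfer if '#' in it]
    datasets = [it for it in items_to_transfer if not '#' in it]
    ## a block is redundant if its whole dataset is transferred as well
    blocks = [b for b in blocks if not b.split('#')[0] in datasets]
    return blocks + datasets

print massage_items(['/A/B/RAW#1', '/A/B/RAW', '/C/D/AOD#2'])
## ['/C/D/AOD#2', '/A/B/RAW']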
Ejemplo n.º 12
0
def assignor(url ,specific = None, talk=True, options=None):
    if userLock('assignor'): return

    CI = campaignInfo()
    SI = siteInfo()

    wfos=[]
    if specific:
        wfos = session.query(Workflow).filter(Workflow.name==specific).all()
    if not wfos:
        if specific:
            wfos = session.query(Workflow).filter(Workflow.status=='considered').all()
            wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all())
        wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all())

    for wfo in wfos:
        if specific:
            if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue
            #if not specific in wfo.name: continue
        print wfo.name,"to be assigned"
        wfh = workflowInfo( url, wfo.name)


        ## check if by configuration we gave it a GO
        if not CI.go( wfh.request['Campaign'] ) and not options.go:
            print "No go for",wfh.request['Campaign']
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] !='assignment-approved':
            print wfo.name,wfh.request['RequestStatus'],"skipping"
            if not options.test:
                continue

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version=wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                print "cannot decide on version number"
                continue

        (lheinput,primary,parent,secondary) = wfh.getIO()
        sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        print "Allowed",sites_allowed
        sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]
        sites_custodial = []
        if len(sites_custodial)==0:
            print "No custodial, it's fine, it's covered in close-out"

        if len(sites_custodial)>1:
            print "more than one custodial for",wfo.name
            sys.exit(36)

        secondary_locations=None
        for sec in list(secondary):
            presence = getDatasetPresence( url, sec )
            print sec
            print json.dumps(presence, indent=2)
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>90.]
            one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations==None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to sites with the secondary only
            sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            

        sites_all_data = copy.deepcopy( sites_allowed )
        sites_with_data = copy.deepcopy( sites_allowed )
        sites_with_any_data = copy.deepcopy( sites_allowed )
        primary_locations = None
        available_fractions = {}
        for prim in list(primary):
            presence = getDatasetPresence( url, prim )
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] =  getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] )
            sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_with_any_data = [site for site in sites_with_any_data if any([osite.startswith(site) for osite in presence.keys()])]
            if primary_locations==None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(set(primary_locations) & set(presence.keys() ))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites=[]
        ## opportunistic running where any piece of data is available
        if secondary_locations and primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set(sites_allowed))]
            print "We could be running at",opportunistic_sites,"in addition"

        if available_fractions and not all([available>=1. for available in available_fractions.values()]):
            print "The input dataset is not located in full at any site"
            print json.dumps(available_fractions)
            if not options.test and not options.go: continue ## skip skip skip
        copies_wanted = 2.
        if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]):
            print "The input dataset is not available",copies_wanted,"times, only",available_fractions.values()
            if not options.go:
                continue

        ## default the white list back to the original white list with any data
        print "Allowed",sites_allowed
        sites_allowed = sites_with_any_data
        print "Selected for any data",sites_allowed

        if options.restrict:
            print "Allowed",sites_allowed
            sites_allowed = sites_with_any_data
            print "Selected",sites_allowed
        else:
            if set(sites_with_data) != set(sites_allowed):
                ## the data is not everywhere we wanted to run at : enable aaa
                print "Sites with 90% data not matching site white list (block choping!)"
                print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?"
                print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data))
                #options.useSiteListAsLocation = True
                #print "Not commissioned yet"
                #continue
            #print "We could be running at",opportunistic_sites,"in addition"
            ##sites_allowed = list(set(sites_allowed+ opportunistic_sites))

        if not len(sites_allowed):
            print wfo.name,"cannot be assign with no matched sites"
            continue

        parameters={
            'SiteWhitelist' : sites_allowed,
            'CustodialSites' : sites_custodial,
            'NonCustodialSites' : sites_out,
            'AutoApproveSubscriptionSites' : list(set(sites_out)),
            'AcquisitionEra' : wfh.acquisitionEra(),
            'ProcessingString' : wfh.processingString(),
            'MergedLFNBase' : '/store/mc', ## to be figured out ! from Hi shit
            'ProcessingVersion' : version,
            }

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v=getattr(options,key)
                if v!=None:
                    if ',' in v: parameters[key] = filter(None,v.split(','))
                    else: parameters[key] = v

        ## pick up campaign specific assignment parameters
        parameters.update( CI.parameters(wfh.request['Campaign']) )

        if not options.test:
            parameters['execute'] = True

        if not wfh.checkWorkflowSplitting():
            ## needs to go to event based ? fail for now
            print "Falling back to event splitting ?"
            #parameters['SplittingAlgorithm'] = 'EventBased'
            continue

        ## plain assignment here
        team='production'
        if options and options.team:
            team = options.team
        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
            else:
                print "ERROR could not assign",wfo.name
        else:
            pass
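
assignor repeatedly shrinks the site white list against dataset locations with a prefix match, because presence is reported per storage element (e.g. T1_US_FNAL_Disk) while the white list holds compute elements (e.g. T1_US_FNAL). A sketch of that reduction (restrict_to_locations is our name):

def restrict_to_locations(sites_allowed, locations):
    ## keep a CE if any reported SE name starts with it,
    ## e.g. 'T1_US_FNAL' matches 'T1_US_FNAL_Disk'
    return [site for site in sites_allowed
            if any([osite.startswith(site) for osite in locations])]

print restrict_to_locations(['T1_US_FNAL', 'T2_DE_DESY'], ['T1_US_FNAL_Disk'])
## ['T1_US_FNAL']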
Ejemplo n.º 13
0
def equalizor(url, specific=None, options=None):
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return

    if not specific:
        workflows = getWorkflows(url, status='running-closed', details=True)
        workflows.extend(getWorkflows(url, status='running-open',
                                      details=True))

    ## start from scratch
    modifications = defaultdict(dict)
    ## define regionality: site => allowed fallbacks. feed on an SSB metric ??
    mapping = defaultdict(list)
    reversed_mapping = defaultdict(list)
    regions = defaultdict(list)
    SI = siteInfo()
    for site in SI.sites_ready:
        region = site.split('_')[1]
        if not region in ['US', 'DE', 'IT']: continue
        regions[region] = [region]

    def site_in_depletion(s):
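        ## note: short-circuited to always return True, so every ready site
        ## in the region currently qualifies as a fallback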
        return True
        if s in SI.sites_pressure:
            (m, r, pressure) = SI.sites_pressure[s]
            if float(m) < float(r):
                print s, m, r, "lacking pressure"
                return True
            else:
                print s, m, r, "pressure"
                pass

        return False

    for site in SI.sites_ready:
        region = site.split('_')[1]
        ## fall back to the region, to sites with on-going low pressure
        mapping[site] = [
            fb for fb in SI.sites_ready
            if any([('_%s_' %
                     (reg) in fb and fb != site and site_in_depletion(fb))
                    for reg in regions[region]])
        ]

    use_T0 = False
    if options.augment: use_T0 = True

    use_HLT = False
    if options.augment: use_HLT = True

    if use_HLT:
        mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT')
    #mapping['T2_IT_Legnaro'].append('T1_IT_CNAF')
    for reg in ['IT', 'DE', 'UK']:
        mapping['T2_CH_CERN'].extend(
            [fb for fb in SI.sites_ready if '_%s_' % reg in fb])

    for site, fallbacks in mapping.items():
        for fb in fallbacks:
            reversed_mapping[fb].append(site)

    ## this is the fallback mapping
    print json.dumps(mapping, indent=2)
    #print json.dumps( reversed_mapping, indent=2)

    altered_tasks = set()

    def running_idle(wfi, task_name):
        gmon = wfi.getGlideMon()
        #print gmon
        if not gmon: return (0, 0)
        if not task_name in gmon: return (0, 0)
        return (gmon[task_name]['Running'], gmon[task_name]['Idle'])

    def needs_action(wfi, task, min_idled=100, pressure=0.2):
        task_name = task.pathName.split('/')[-1]
        running, idled = running_idle(wfi, task_name)
        ## act only when the backlog is meaningful: enough idled jobs, and
        ## a pending/running ratio above the pressure threshold
        if not idled and not running:
            go = False
        elif idled < min_idled:
            go = False
        elif (not running and idled) or (running and
                                         (idled / float(running) > pressure)):
            go = True
        else:
            go = False
        return go, task_name, running, idled
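
    ## worked example for needs_action: with pressure=0.2 and min_idled=100,
    ## running=200, idled=100 gives 100/200 = 0.5 > 0.2, so action is taken;
    ## running=1000, idled=100 gives 0.1 < 0.2, so the task is left alone;
    ## anything with fewer than min_idled pending jobs is never touched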

    def getcampaign(task):
        taskname = task.pathName.split('/')[-1]
        if hasattr(task, 'prepID'):
            return task.prepID.split('-')[1]
        elif taskname.count('-') >= 1:
            return taskname.split('-')[1]
        else:
            return None

    def close(interface):
        open('%s/equalizor.json.new' % monitor_dir,
             'w').write(json.dumps(interface, indent=2))
        os.system('mv %s/equalizor.json.new %s/equalizor.json' %
                  (monitor_dir, monitor_dir))
        os.system('cp %s/equalizor.json %s/logs/equalizor/equalizor.%s.json' %
                  (monitor_dir, monitor_dir, time.mktime(time.gmtime())))

    interface = {'reversed_mapping': reversed_mapping, 'modifications': {}}
    if options.augment or options.remove:
        interface['modifications'] = json.loads(
            open('%s/equalizor.json' % monitor_dir).read())['modifications']

    if options.remove:
        if specific in interface['modifications']:
            print "poping", specific
            interface['modifications'].pop(specific)
            close(interface)
        return

    PU_locations = {}
    PU_overflow = {
        #'RunIISpring15PrePremix' : {
        #    'sites' : ["T1_US_FNAL", "T1_DE_KIT" , "T1_IT_CNAF", "T1_RU_JINR" ,"T2_CH_CERN"],
        #    'max' : 20000,
        #    'pending' : 0
        #    },
        'RunIIFall15DR76': {
            'sites': [
                'T1_ES_PIC', 'T2_US_Purdue', 'T2_UK_SGrid_RALPP', 'T2_BE_IIHE',
                'T2_DE_DESY', 'T2_IT_Legnaro', 'T2_US_Caltech', 'T1_DE_KIT',
                'T2_UK_London_Brunel', 'T2_IT_Pisa', 'T1_US_FNAL',
                'T2_IT_Rome', 'T2_US_Florida', 'T1_IT_CNAF', 'T1_RU_JINR',
                'T2_UK_London_IC', 'T2_US_Nebraska', 'T2_FR_CCIN2P3',
                'T2_US_UCSD', 'T2_ES_CIEMAT', 'T1_FR_CCIN2P3',
                'T2_US_Wisconsin', 'T2_US_MIT', 'T2_DE_RWTH', 'T1_UK_RAL',
                'T2_US_Vanderbilt', 'T2_CH_CERN'
            ],
            'max':
            20000,
            'pending':
            0
        },
        'RunIISpring16DR80': {
            'sites': [
                'T1_ES_PIC', 'T2_US_Purdue', 'T2_UK_SGrid_RALPP', 'T2_BE_IIHE',
                'T2_DE_DESY', 'T2_IT_Legnaro', 'T2_US_Caltech', 'T1_DE_KIT',
                'T2_UK_London_Brunel', 'T2_IT_Pisa', 'T1_US_FNAL',
                'T2_IT_Rome', 'T2_US_Florida', 'T1_IT_CNAF', 'T1_RU_JINR',
                'T2_UK_London_IC', 'T2_US_Nebraska', 'T2_FR_CCIN2P3',
                'T2_US_UCSD', 'T2_ES_CIEMAT', 'T1_FR_CCIN2P3',
                'T2_US_Wisconsin', 'T2_US_MIT', 'T2_DE_RWTH', 'T1_UK_RAL',
                'T2_US_Vanderbilt', 'T2_CH_CERN'
            ],
            'max':
            20000,
            'pending':
            0,
            'force':
            True
        },
        'RunIISpring15DR74': {
            'sites': [
                'T1_ES_PIC', 'T1_DE_KIT', 'T1_US_FNAL', 'T1_IT_CNAF',
                'T1_RU_JINR', 'T1_FR_CCIN2P3', 'T1_UK_RAL', 'T2_CH_CERN'
            ],
            'max':
            20000,
            'pending':
            0
        }
    }

    set_to = SI.sites_AAA
    LHE_overflow = {
        'RunIIWinter15GS': set_to,
        'RunIISummer15GS': set_to,
        'Summer12': set_to,
        'Summer11Leg': set_to
        #'RunIIFall15MiniAODv2' : set_to,
    }

    pending_HLT = 0
    max_HLT = 60000
    pending_T0 = 0
    max_T0 = 60000
    try:
        gmon = json.loads(
            os.popen(
                'curl -s http://cms-gwmsmon.cern.ch/prodview/json/T2_CH_CERN_HLT'
            ).read())
        pending_HLT += gmon["Running"]
        pending_HLT += gmon["MatchingIdle"]
    except:
        pass

    t0_special = [
        'vlimant_BPH-RunIISummer15GS-00030_00212_v0__160129_135314_9755',
        'pdmvserv_TSG-RunIISummer15GS-00044_00240_v0__160210_121223_8582'
    ]
    no_routing = [
        #'vlimant_BPH-RunIISummer15GS-00030_00212_v0__160129_135314_9755',
        #'pdmvserv_TOP-RunIIWinter15GS-00074_00187_v0__160207_162312_1992',
    ]

    stay_within_site_whitelist = False
    specific_task = None
    if specific and ":" in specific:
        specific, specific_task = specific.split(':')

    if specific:
        wfs = session.query(Workflow).filter(
            Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'away').all()

    random.shuffle(wfs)
    for wfo in wfs:
        if wfo.name in no_routing and not options.augment:
            continue

        if specific and not specific in wfo.name:
            continue
        if specific:
            wfi = workflowInfo(url, wfo.name)
        else:
            cached = filter(lambda d: d['RequestName'] == wfo.name, workflows)
            if not cached: continue
            wfi = workflowInfo(url, wfo.name, request=cached[0])

        ## only running should get re-routed
        if not wfi.request['RequestStatus'] in [
                'running-open', 'running-closed'
        ] and not specific:
            continue

        tasks_and_campaigns = []
        for task in wfi.getWorkTasks():
            tasks_and_campaigns.append((task, getcampaign(task)))

        _, _, _, sec = wfi.getIO()

        ## check needs override
        needs_overide = False
        if not needs_overide and options.augment: needs_overide = True

        def overide_from_agent(wfi, needs_overide):
            bad_agents = []  #'http://cmssrv219.fnal.gov:5984']
            if not bad_agents: return needs_overide
            if needs_overide: return True
            agents = wfi.getAgents()

            wqss = ['Running', 'Acquired']
            if any([
                    agent in agents.get(wqs, {}).keys()
                    for wqs, agent in itertools.product(wqss, bad_agents)
            ]):
                print "overriding the need for bad agent"
                needs_overide = True
            return needs_overide

        ## now parse this for action
        for i_task, (task, campaign) in enumerate(tasks_and_campaigns):
            if options.augment:
                print task.pathName
                print campaign

            ### rule to avoid the issue of taskchain secondary jobs being stuck at sites processing the initial step
            if campaign in LHE_overflow:
                if task.taskType in ['Processing']:
                    needs, task_name, running, idled = needs_action(wfi, task)
                    needs_overide = overide_from_agent(wfi, needs_overide)
                    extend_to = copy.deepcopy(LHE_overflow[campaign])
                    if stay_within_site_whitelist:
                        extend_to = list(
                            set(extend_to) & set(wfi.request['SiteWhitelist'])
                        )  ## restrict to stupid-site-whitelist

                    if extend_to and (needs or needs_overide):
                        print "\t", task_name, "of", wfo.name, "running", running, "and pending", idled, "taking action : ReplaceSiteWhitelist"
                        modifications[wfo.name][task.pathName] = {
                            "ReplaceSiteWhitelist": extend_to,
                            "Running":
                            running,
                            "Pending":
                            idled,
                            "Priority":
                            wfi.request['RequestPriority']
                        }
                        #print json.dumps( modifications[wfo.name][task.pathName]['ReplaceSiteWhitelist']
                        altered_tasks.add(task.pathName)
                    else:
                        print task_name, "of", wfo.name, "running", running, "and pending", idled

            ### overflow the 76 digi-reco to the sites holding the pileup
            if campaign in PU_overflow:
                force = PU_overflow[campaign][
                    'force'] if 'force' in PU_overflow[campaign] else False
                secondary_locations = set(SI.sites_ready)
                for s in sec:
                    if not s in PU_locations:
                        presence = getDatasetPresence(url, s)
                        #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
                        one_secondary_locations = [
                            site for (site, (there, frac)) in presence.items()
                            if frac > 98.
                        ]
                        PU_locations[s] = one_secondary_locations
                    print "secondary is at", sorted(PU_locations[s])
                    secondary_locations = set(
                        PU_locations[s]) & secondary_locations

                ## we should add all sites that hold the secondary input if any
                secondary_locations = list(
                    set(PU_overflow[campaign]['sites']) & set(SI.sites_ready))
                if any([
                        task.pathName.endswith(finish)
                        for finish in ['_0', 'StepOneProc', 'Production']
                ]):
                    needs, task_name, running, idled = needs_action(wfi, task)
                    ## removing the ones in the site whitelist already since they encode the primary input location
                    if stay_within_site_whitelist:
                        original_site_in_use = set(
                            wfi.request['SiteWhitelist'])
                    else:
                        original_site_in_use = set(secondary_locations)
                    ## remove the sites that already have running jobs
                    gmon = wfi.getGlideMon()
                    if gmon and task_name in gmon and 'Sites' in gmon[
                            task_name]:
                        site_in_use = set(gmon[task_name]['Sites'])
                        ## that determines where you want to run in addition
                        #augment_by = list((set(secondary_locations)- site_in_use))
                        augment_by = list(
                            (set(secondary_locations) - site_in_use)
                            & original_site_in_use
                        )  ## restrict to stupid-site-whitelist
                    else:
                        augment_by = list(original_site_in_use)

                    needs_overide = overide_from_agent(wfi, needs_overide)
                    if augment_by and (
                            needs or needs_overide
                            or force) and PU_overflow[campaign][
                                'pending'] < PU_overflow[campaign]['max']:
                        PU_overflow[campaign]['pending'] += idled
                        print "raising overflow to", PU_overflow[campaign][
                            'pending'], "for", PU_overflow[campaign]['max']
                        ## the step with an input ought to be the digi part : make this one go anywhere
                        modifications[wfo.name][task.pathName] = {
                            "AddWhitelist": augment_by,
                            "Running": running,
                            "Pending": idled,
                            "Priority": wfi.request['RequestPriority']
                        }
                        altered_tasks.add(task.pathName)
                        print "\t", task_name, "of", wfo.name, "running", running, "and pending", idled, "taking action : AddWhitelist"
                        #print json.dumps( augment_by, indent=2 )
                    else:
                        print task_name, "of", wfo.name, "running", running, "and pending", idled

            ### overflow the skims back to multi-core
            if campaign in ['Run2015D', 'Run2015C_25ns'
                            ] and task.taskType == 'Skim':
                original_swl = wfi.request['SiteWhitelist']
                needs, task_name, running, idled = needs_action(wfi, task)
                if (needs or needs_overide):
                    modifications[wfo.name][task.pathName] = {
                        'AddWhitelist': original_swl,
                        "Running": running,
                        "Pending": idled,
                        "Priority": wfi.request['RequestPriority']
                    }
                    altered_tasks.add(task.pathName)
                    print "\t", task_name, "of", wfo.name, "running", running, "and pending", idled, "taking action : AddWhitelist"

            if options.augment:
                print sorted(wfi.request['SiteWhitelist']), i_task, use_HLT
            ### add the HLT as partner of CERN
            if 'T2_CH_CERN' in wfi.request[
                    'SiteWhitelist'] and i_task == 0 and use_HLT:
                needs, task_name, running, idled = needs_action(wfi, task)
                if options.augment: needs = True
                needs = True
                ##needs = random.random()<0.40 remove the random, just add up to a limit
                if (needs or needs_overide) and pending_HLT < max_HLT:
                    pending_HLT += idled
                    if task.pathName in modifications[
                            wfo.name] and 'AddWhitelist' in modifications[
                                wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName][
                            "AddWhitelist"].append("T2_CH_CERN_HLT")
                        print "\t", wfo.name, "adding addHLT up to", pending_HLT, "for", max_HLT
                        print task.pathName
                    ## this Replace does not work at all for HLT
                    #elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                    #modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T2_CH_CERN_HLT" )
                    #print "\t",wfo.name,"adding replace HLT up to",pending_HLT,"for",max_HLT
                    else:
                        modifications[wfo.name][task.pathName] = {
                            "AddWhitelist": ["T2_CH_CERN_HLT"],
                            "Priority": wfi.request['RequestPriority'],
                            "Running": running,
                            "Pending": idled
                        }
                        print "\t", wfo.name, "adding HLT up to", pending_HLT, "for", max_HLT
                        print task.pathName

            if i_task == 0 and not sec and use_T0:
                needs, task_name, running, idled = needs_action(wfi, task)

                if options.augment: needs = True
                #needs = True
                #if not (wfo.name in t0_special) and not options.augment: needs = False
                if not wfi.request['RequestType'] in [
                        'MonteCarlo', 'MonteCarloFromGEN'
                ] and not options.augment:
                    needs = False

                ##needs = random.random()<0.40 remove the random, just add up to a limit
                if (needs or needs_overide):
                    pending_T0 += idled
                    if task.pathName in modifications[
                            wfo.name] and 'AddWhitelist' in modifications[
                                wfo.name][task.pathName]:
                        modifications[wfo.name][
                            task.pathName]["AddWhitelist"].append("T0_CH_CERN")
                        print "\t", wfo.name, "adding addT0 up to", pending_T0, "for", max_T0
                        print task.pathName
                    elif task.pathName in modifications[
                            wfo.
                            name] and 'ReplaceSiteWhitelist' in modifications[
                                wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName][
                            "ReplaceSiteWhitelist"].append("T0_CH_CERN")
                        print "\t", wfo.name, "adding replace T0 up to", pending_T0, "for", max_T0
                    else:
                        modifications[wfo.name][task.pathName] = {
                            "AddWhitelist": ["T0_CH_CERN"],
                            "Priority": wfi.request['RequestPriority'],
                            "Running": running,
                            "Pending": idled
                        }
                        print "\t", wfo.name, "adding T0 up to", pending_T0, "for", max_T0
                        print task.pathName

    interface['modifications'].update(modifications)

    ## temporary core management
    interface['cores'] = {
        'T2_CH_CERN_HLT': {
            'min': 4,
            'max': 16
        },
        'default': {
            'min': 1,
            'max': 4
        }
    }
    #interface['max_cores']={'T2_CH_CERN_HLT': 16, 'default': 4}
    #interface['min_cores']={'T2_CH_CERN_HLT': 4, 'default': 1}
    #interface['resize_subtasks'] = 'RunIISpring16DR80'
    interface['resizes'] = ['RunIISpring16DR80', 'NotACampaign']

    ## close and save
    close(interface)
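
The close() helper publishes equalizor.json with a write-then-rename, so consumers never see a half-written file, and keeps a timestamped copy for the logs. The same pattern without the shell round-trip (a sketch; publish_atomically is our name and it assumes the logs/equalizor directory exists):

import json, os, shutil, time

def publish_atomically(interface, monitor_dir):
    target = '%s/equalizor.json' % monitor_dir
    tmp = target + '.new'
    with open(tmp, 'w') as f:
        f.write(json.dumps(interface, indent=2))
    os.rename(tmp, target)  ## atomic within one filesystem on POSIX
    ## keep a timestamped copy for post-mortem
    stamp = int(time.mktime(time.gmtime()))
    shutil.copy(target, '%s/logs/equalizor/equalizor.%s.json' % (monitor_dir, stamp))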
Ejemplo n.º 14
0
def transferor(url ,specific = None, talk=True, options=None):
    if userLock():   return
    if duplicateLock():  return

    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    LI = lockInfo()
    NLI = newLockInfo()
    mcm = McMClient(dev=False)
    dss = DSS()

    print "counting all being handled..."
    being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all())
    being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all())

    max_to_handle = options.maxworkflows
    max_to_transfer = options.maxstaging

    allowed_to_handle = max(0,max_to_handle - being_handled)
    allowed_to_transfer = max(0,max_to_transfer - being_transfered)
    wf_buffer = 5
    if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer
        print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer"
    else:
        print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer"

    if allowed_to_transfer <= wf_buffer:
        print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer"
    else:
        print being_transfered,"already being transferred",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer"

    print "... done"

    all_transfers=defaultdict(list)
    needing_locks=defaultdict(list)
    workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset
    wfs_and_wfh=[]
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(Workflow.status=='considered').all():
        print "\t",wfo.name
        if specific and not specific in wfo.name: continue
        cache_r =filter(lambda d:d['RequestName']==wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) )
        else:
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) )
    print "... done"

    transfers_per_sites = defaultdict(int)
    input_sizes = {}
    input_cput = {}
    input_st = {}
    ## list the size of those in transfer already
    in_transfer_priority=0
    min_transfer_priority=100000000
    print "getting all wf in staging ..."
    for wfo in session.query(Workflow).filter(Workflow.status=='staging').all():
        wfh = workflowInfo( url, wfo.name, spec=False)
        (lheinput,primary,parent,secondary) = wfh.getIO()
        sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list
            transfers_per_sites[site] += 1 
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:  
            input_sizes[prim] = dss.get( prim )
            print "\t",wfo.name,"needs",input_sizes[prim],"GB"
        in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority']))
        min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority']))

    print "... done"
    print "Max priority in transfer already",in_transfer_priority
    print "Min priority in transfer already",min_transfer_priority
    print "transfers per sites"
    print json.dumps( transfers_per_sites, indent=2)
    in_transfer_already = sum(input_sizes.values())
    cput_in_transfer_already = sum(input_cput.values())
    st_in_transfer_already = sum(input_st.values())
    ## shuffle first so that workflows of equal priority come out in random order
    random.shuffle( wfs_and_wfh )
    ## then sort by priority, higher first
    wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True)
    

    ## list the size of all inputs
    print "getting all input sizes ..."
    for (wfo,wfh) in wfs_and_wfh:
        (_,primary,_,_) = wfh.getIO()
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:
            input_sizes[prim] = dss.get( prim )
    print "... done"

    cput_grand_total = sum(input_cput.values())
    cput_to_transfer = cput_grand_total - cput_in_transfer_already
    st_grand_total = sum(input_st.values())
    st_to_transfer = st_grand_total - st_in_transfer_already
    print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already
    print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer
    print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots())
    print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots())
    print "%15.4f [h] worth of theoritical system time in transfer"%( st_in_transfer_already )
    print "%15.4f [h] worth of theoritical system time is the current requested transfer load"%( st_to_transfer )


    grand_total =  sum(input_sizes.values()) 
    to_transfer = grand_total  - in_transfer_already
    grand_transfer_limit = options.maxtransfer 
    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered"%in_transfer_already
    print "%15.4f GB is the current requested transfer load"%to_transfer
    print "%15.4f GB is the global transfer limit"%grand_transfer_limit
    print "%15.4f GB is the available limit"%transfer_limit


    max_staging_per_site = options.maxstagingpersite
                    
    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer=0 ## so that we can count'em
    passing_along = 0
    transfer_sizes={}
    went_over_budget=False
    destination_cache = {}
    for (wfo,wfh) in wfs_and_wfh:
        print wfh.request['RequestPriority']
        print wfo.name,"to be transfered"
        #wfh = workflowInfo( url, wfo.name)

        (_,primary,_,_) = wfh.getIO()
        this_load=sum([input_sizes[prim] for prim in primary])
        if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ):
            if went_over_budget:
                print "Transfer has gone over bubget."
            else:
                print "Transfer will go over bubget."
            print "%15.4f GB this load"%this_load
            print "%15.4f GB already this round"%sum(transfer_sizes.values())
            print "%15.4f GB is the available limit"%transfer_limit
            went_over_budget=True
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget"
            else:
                if not options.go: 
                    print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop"
                    continue


        ## throttle by campaign go
        if not CI.go( wfh.request['Campaign'] ):
            print "No go for",wfh.request['Campaign']
            if not options.go: 
                sendEmail("no go for managing","No go for "+wfh.request['Campaign'])
                continue

        ## check if the batch is announced

        def check_mcm(wfn):
            announced=False
            is_real=False
            if not wfn.startswith('pdmvserv'):
                is_real = True
            try:
                for b in mcm.getA('batches',query='contains=%s'% wfn):
                    is_real = True
                    if b['status']=='announced': 
                        announced=True 
                        break
            except:
                try:
                    for b in mcm.getA('batches',query='contains=%s'% wfn):
                        is_real = True
                        if b['status']=='announced': 
                            announced=True 
                            break
                except:
                    print "could not get mcm batch announcement, assuming not real"
            return announced,is_real

        if not use_mcm:
            announced,is_real = False,True
        else:
            announced,is_real = check_mcm( wfo.name )

        if not announced:
            print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?"
            
        if not is_real:
            print wfo.name,"does not appear to be genuine."
            ## prevent any duplication: skip if the wf is not mentioned in any batch, regardless of its status
            continue

        ## check on a grace period
        injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.)
        now = time.mktime(time.gmtime()) / (60.*60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced: 
                print "It is too soon to start transfer: %3.2fH remaining"%( 4. - (now - injection_time) )
                continue


        if passing_along >= allowed_to_handle:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority:
                ## higher priority, and not only this priority being transferred
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle
            else:
                print "Not allowed to pass more than",max_to_handle,"at a time. Currently",being_handled,"handled, and adding",passing_along
                if not options.go: break

        if this_load and needs_transfer >= allowed_to_transfer:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority:
                ## higher priority, and not only this priority being transferred
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_transfer
            else:
                print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"transferring, and adding",needs_transfer
                if not options.go: continue


        (lheinput,primary,parent,secondary) = wfh.getIO()
        for dataset in list(primary)+list(parent)+list(secondary):
            ## lock everything flat
            NLI.lock( dataset )

        if options and options.tosites:
            sites_allowed = options.tosites.split(',')
        else:
            sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )

        if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist']

        if 'SiteBlacklist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfh.request['Campaign'])['SiteBlacklist']))

        ## reduce right away to sites in case of memory limitation
        memory_allowed = SI.sitesByMemory( wfh.request['Memory'] )
        if memory_allowed!=None:
            print "sites allowing", wfh.request['Memory'],"are",memory_allowed
            sites_allowed = list(set(sites_allowed) & set(memory_allowed))

        if not sites_allowed:
            print wfo.name,"has no possible sites to run at"
            print "available for",wfh.request['Memory'],"are",memory_allowed
            sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            continue

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## should make the block selection here
            pass

        if 'LumiList' in wfh.request and wfh.request['LumiList']:
            ## same, we could be doing the white list here too
            pass


        if blocks:
            print "Reading",len(blocks),"in whitelist"

        can_go = True
        staging=False
        allowed=True
        if primary:
            
            copies_needed_from_CPUh,CPUh = wfh.getNCopies()

            if talk:
                print wfo.name,'reads',', '.join(primary),'in primary'
            ## chop the primary dataset
            for prim in primary:
                max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority']))
                sites_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                print "Sites allowed, minus those vetoed for transfers"
                print sorted(sites_allowed)

                copies_needed_from_site = int(0.35*len(sites_allowed))+1 ## should switch to a fixed number if the white list grows that big
                print "Would make",copies_needed_from_site,"copies from site white list"
                copies_needed = copies_needed_from_site

                print "Would make",copies_needed_from_CPUh,"from cpu requirement",CPUh
                copies_needed = copies_needed_from_CPUh

                if options.maxcopy>0:
                    ## stop maxing things out ??
                    #copies_needed = min(options.maxcopy,copies_needed)
                    #print "Maxed to",copies_needed
                    if copies_needed_from_CPUh > options.maxcopy:
                        sendEmail('An example of more than three copies','for %s it could have been beneficial to make %s copies'%( wfo.name, copies_needed_from_CPUh))

                
                if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign,copies_needed_from_site)
                    print "Maxed to",copies_needed,"by campaign configuration",wfh.request['Campaign']

                ## remove the sites that do not want transfers                
                workflow_dependencies[prim].add( wfo.id )

                #####################################
                ###### JR 3/8/15 #### deprecating this
                """
                presence = getDatasetPresence( url, prim , within_sites = [SI.CE_to_SE(site) for site in sites_allowed])
                prim_location = [site for site,pres in presence.items() if pres[0]==True]
                prim_parts = [site for site,pres in presence.items() if pres[0]==False]
                if len(prim_location) >= copies_needed:
                    print "The output is all fully in place at",len(prim_location),"sites"
                    continue
                # reduce the number of copies required by existing full copies
                copies_needed = max(0,copies_needed - len(prim_location))
                print "now need",copies_needed
                subscriptions = listSubscriptions( url , prim , sites_allowed )
                prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])]))
                ## remove the subscription where the dataset is in parts at
                #prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']]) and not site in prim_parts]))
                ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place
                prim_destination = [site for site in prim_destination if not site in prim_location]
                ## add transfer dependencies
                latching_on_transfers =  list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])]))
                print latching_on_transfers
                """
                ###### JR 3/8/15 #### deprecating this
                #####################################


                ### new ways of making the whole thing
                destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks )
                #destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='DataOps')
                #anaops_destinations,anaops_all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks, group='AnalysisOps' )
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1]
                ## the rest are places it is going to be
                prim_destination = [site for site in destinations.keys() if not site in prim_location]
                ## need to take out the transfer veto
                prim_destination = [site for site in prim_destination if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                for dsite in prim_destination:
                    needing_locks[dsite].append( prim )

                if len(prim_location) >= copies_needed:
                    print "The output is all fully in place at",len(prim_location),"sites",prim_location
                    continue
                copies_needed = max(0,copies_needed - len(prim_location))
                print "now need",copies_needed
                
                copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names]

                latching_on_transfers = set()
                [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination]
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location]
                prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]

                if any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]):
                    ## means there are openings, let it go
                    print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute]
                    for site in sites_allowed:
                        ## increment across the board, regardless of real destination: could be changed
                        transfers_per_sites[site] += 1
                else:
                    if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                        print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over transfer slots available"
                    else:
                        print "Not allowed to transfer more than",max_staging_per_site,"per site at a time. Going overboard for",[site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site]
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first()
                    if not tfo:
                        tfo = Transfer( phedexid = latching)
                        tfo.workflows_id = []
                        session.add(tfo)
                            
                    if not wfo.id in tfo.workflows_id:
                        print "adding",wfo.id,"to",tfo.id,"with phedexid",latching
                        l = copy.deepcopy( tfo.workflows_id )
                        l.append( wfo.id )
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush() ## regardless of the commit later on, we need to let the next wf feeding on this transfer see it in a query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                ## reduce the number of copies required by the ongoing full transfers: how do we bootstrap on waiting for them ??
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0,copies_needed - min(copies_being_made))
                print "then need",copies_needed
                if copies_needed == 0:
                    print "The output is either fully in place or getting in full somewhere with",latching_on_transfers
                    can_go = True
                    continue

                if len(prim_to_distribute)>0: ## maybe that's a parameter we can play with to limit the number of destinations
                    if not options or options.chop:
                        chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks)
                        spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes)
                        transfer_sizes[prim] = sum(sizes)
                    else:
                        spreading = {} 
                        for site in prim_to_distribute: 
                            if blocks:
                                spreading[site]=blocks
                            else:
                                spreading[site]=[prim]
                        transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified
                    can_go = False
                    print "selected CE destinations",spreading.keys()
                    for (site,items) in spreading.items():
                        all_transfers[site].extend( items )
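
        ## at this point all_transfers maps destination CE -> datasets/blocks
        ## to request; the actual replica requests are placed further below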

        if not allowed:
            print "Not allowed to move on with",wfo.name
            continue


        if secondary:
            if talk:
                print wfo.name,'reads',', '.join(secondary),'in secondary'
            for sec in secondary:
                workflow_dependencies[sec].add( wfo.id )

                if False:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbaric, and does not show the correct picture workflow by workflow with different whitelists
                        destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])
                    destinations = destination_cache[sec]
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9]
                    sec_location = [site for (site,info) in destinations.items() if info['completion']>=95]
                    sec_destination = [site for site in destinations.keys() if not site in sec_location]
                else:
                    ## old style
                    presence = getDatasetPresence( url, sec )
                    sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions( url ,sec )
                    sec_destination = [site for site in subscriptions] 

                for site in sec_location:
                    needing_locks[site].append( sec )
                for site in sec_destination:
                    needing_locks[site].append( sec )

                sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [site for site in sec_to_distribute if not  any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                if len( sec_to_distribute )>0:
                    sec_size = dss.get( sec )
                    for site in sec_to_distribute:
                        site_se = SI.CE_to_SE(site)
                        if (SI.disk[site_se]*1024.) > sec_size:
                            all_transfers[site].append( sec )
                            can_go = False
                        else:
                            print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB; need",sec_size
                            #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))

        ## is it possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                print wfo.name,"latches on existing transfers, and nothing else"
                wfo.status = 'staging'
                needs_transfer+=1
            else:
                print wfo.name,"should just be assigned NOW to",sites_allowed
                wfo.status = 'staged'
            passing_along+=1
            print "setting status to",wfo.status
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                print wfo.name,"latches on existing transfers"
                if not options.test:
                    wfo.status = 'staging'
                    print "setting status to",wfo.status
                    session.commit()
            print wfo.name,"needs a transfer"
            needs_transfer+=1
            passing_along+=1

    print "accumulated locks of datasets in place"
    print json.dumps(needing_locks, indent=2)
    for site,items in needing_locks.items():
        for item in items:
            LI.lock( item, SI.CE_to_SE(site), 'usable input')
        
    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id=-1
    wf_id_in_prestaging=set()

    for (site,items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## sites that do not want input datasets
        if site in SI.sites_veto_transfer: 
            print site,"does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage the items a bit
        blocks = [it for it in items_to_transfer if '#' in it]
        datasets = [it for it in items_to_transfer if not '#' in it]

        if execute:
            print "Making a replica to",site,"(CE)",site_se,"(SE) for"
        else:
            print "Would make a replica to",site,"(CE)",site_se,"(SE) for"

        print "\t",len(blocks),"blocks"
        ## remove blocks if the full dataset is sent out
        blocks = [block for block in blocks if not block.split('#')[0] in datasets]
        print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        print "\t",len(datasets),"datasets"
        print "\t",datasets
        items_to_transfer = blocks + datasets

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y','yes','go']:
                continue

        if execute:
            result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal')
            ## make use of max_priority dataset:priority to set the subscriptions priority
            """
            ## does not function
            once = True
            for item in items_to_transfer:
                bds = item.split('#')[0]
                if max_priority[bds] >= 90000:
                    if once:
                        w=10
                        print "waiting",w,"s before raising priority"
                        time.sleep(w)
                        once=False
                    ## raise it to high priority
                    print item,"subscription priority raised to high at",site_se
                    #print "This does not work yet properly it seems"
                    print updateSubscription(url, site_se, item, priority='high')
            """
            #for item in list(set([it.split('#')[0] for it in items_to_transfer])):
            for item in items_to_transfer:
                LI.lock( item, site_se, 'pre-staging')
        else:
            #result= {'phedex':{'request_created' : [{'id' : fake_id}]}}
            result= {'phedex':{'request_created' : []}}
            fake_id-=1



        if not result:
            print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first()
            print phedexid,"transfer created"
            if not new_transfer:
                new_transfer = Transfer( phedexid = phedexid)
                session.add( new_transfer )                
            new_transfer.workflows_id = set()
            for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update( workflow_dependencies[transfering] )
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status!='staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting",tr_wf.name,"to staging"
        session.commit()
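
The budget logic near the top of this example accumulates the size of each candidate transfer against a global limit, and only lets high-priority requests go over it. A minimal standalone sketch of that accumulate-and-compare pattern follows; the throttle helper, the dataset names and the numeric thresholds are illustrative assumptions, not part of the original code.

## A minimal sketch of the budget throttling pattern used above; names,
## sizes and thresholds are illustrative only.
def throttle(loads, transfer_limit, priorities, high_priority=85000):
    """loads: {dataset: size in GB}; returns the datasets allowed to go."""
    accepted = []
    used = 0.
    ## walk the requests from highest to lowest priority
    for ds, size in sorted(loads.items(), key=lambda kv: -priorities.get(kv[0], 0)):
        if used + size > transfer_limit and priorities.get(ds, 0) < high_priority:
            print "%s would push the total over %.1f GB, skipping" % (ds, transfer_limit)
            continue
        used += size
        accepted.append(ds)
    return accepted

print throttle({'/A/B/RAW': 900., '/C/D/AOD': 400.}, 1000., {'/A/B/RAW': 90000})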
Example No. 15
0
def cleanor(url, specific=None):
    print "Deprecated"
    return

    if duplicateLock() : return 

    delete_per_site = {}
    do_not_autoapprove = []#'T2_FR_CCIN2P3']
    SI = siteInfo()
    CI = campaignInfo()
    LI = lockInfo()

    counts=0
    for wfo in session.query(Workflow).filter(Workflow.status == 'done').all():
        keep_a_copy = False
        if specific and not specific in wfo.name: continue
        ## what was in input 
        wl = getWorkLoad(url,  wfo.name )

        if 'Campaign' in wl and wl['Campaign'] in CI.campaigns and 'clean-in' in CI.campaigns[wl['Campaign']] and CI.campaigns[wl['Campaign']]['clean-in']==False:
            print "Skipping cleaning on input for campaign",wl['Campaign'], "as per campaign configuration"
            continue

        dataset= 'N/A'
        if 'InputDataset' in wl:
            dataset = wl['InputDataset']

        print dataset,"in input"
        #print json.dumps(wl, indent=2)
        announced_log = filter(lambda change : change["Status"] in ["closed-out","normal-archived","announced"],wl['RequestTransition'])
        if not announced_log: 
            print "Cannot figure out when",wfo.name,"was finished"
            continue
        now = time.mktime(time.gmtime()) / (60*60*24.)
        then = announced_log[-1]['UpdateTime'] / (60.*60.*24.)
        if (now-then) <2:
            print "workflow",wfo.name, "finished",now-then,"days ago. Too fresh to clean"
            continue
        else:
            print "workflow",wfo.name,"has finished",now-then,"days ago."

        if not 'InputDataset' in wl: 
            ## should we set status = clean ? or something even further
            print "passing along",wfo.name,"with no input"
            wfo.status = 'clean'
            session.commit()
            continue

        if 'MinBias' in dataset:
            print "Should not clean anything using",dataset,"setting status further"
            wfo.status = 'clean'
            session.commit()
            continue

        total_size = getDatasetSize( dataset ) ## in GB
        #if counts> 20:            break
        counts+=1
        ## find any location it is at
        our_presence = getDatasetPresence(url, dataset, complete=None, group="DataOps")
        also_our_presence = getDatasetPresence(url, dataset, complete=None, group="")

        ## is there a custodial !!!
        custodials = findCustodialLocation(url, dataset)
        if not len(custodials):
            print dataset,"has no custodial site yet, excluding from cleaning"
            continue

        ## find out whether it is still in use
        using_the_same = getWorkflowByInput(url, dataset, details=True)
        conflict=False
        for other in using_the_same:
            if other['RequestName'] == wfo.name: continue
            if other['RequestType'] == 'Resubmission': continue
            if not other['RequestStatus'] in ['announced','normal-archived','aborted','rejected','aborted-archived','aborted-completed','rejected-archived','closed-out','None',None,'new']:
                print other['RequestName'],'is in status',other['RequestStatus'],'preventing from cleaning',dataset
                conflict=True
                break
            if 'Campaign' in other and other['Campaign'] in CI.campaigns and 'clean-in' in CI.campaigns[other['Campaign']] and CI.campaigns[other['Campaign']]['clean-in']==False:
                print other['RequestName'],'is in campaign',other['Campaign']
                conflict = True
                break
        if conflict: continue
        print "other statuses:",[other['RequestStatus'] for other in using_the_same if other['RequestName'] != wfo.name]


        ## find all disks
        to_be_cleaned = filter(lambda site : site.startswith('T2') or site.endswith('Disk') ,our_presence.keys())
        to_be_cleaned.extend( filter(lambda site : site.startswith('T2') or site.endswith('Disk') ,also_our_presence.keys()))
        print to_be_cleaned,"for",total_size,"GB"

        anaops_presence = getDatasetPresence(url, dataset, complete=None, group="AnalysisOps")
        own_by_anaops = anaops_presence.keys()
        print "Owned by AnalysisOps, vetoing these sites"
        print own_by_anaops
        ## need to blacklist the sites where there is an AnalysisOps copy
        to_be_cleaned = [site for site in to_be_cleaned if not site in own_by_anaops ]

        ## keep one copy out there
        if 'Campaign' in wl and wl['Campaign'] in CI.campaigns and 'keep-one' in CI.campaigns[wl['Campaign']] and CI.campaigns[wl['Campaign']]['keep-one']==True:
            print "Keeping a copy of input for",wl['Campaign']
            keep_a_copy = True
            
        if keep_a_copy:
            keep_at = None
            full_copies = [site for (site,(there,_)) in our_presence.items() if there and site.startswith('T1')]
            full_copies.extend( [site for (site,(there,_)) in also_our_presence.items() if there and site.startswith('T1')] )
            if not full_copies:
                full_copies = [site for (site,(there,_)) in our_presence.items() if there and site.startswith('T2')]
                full_copies.extend( [site for (site,(there,_)) in also_our_presence.items() if there and site.startswith('T2')] )

            if full_copies:
                keep_at = random.choice( full_copies )
                
            if not keep_at:
                print "We are unable to find a place to keep a full copy of",dataset,"skipping"
                continue
            else:
                ## keeping that copy !
                print "Keeping a full copy of",dataset,"at",keep_at,"not setting the status further"
                to_be_cleaned.remove( keep_at )
        else:
            wfo.status = 'clean'

        ## collect delete request per site
        for site in to_be_cleaned :
            if not site in delete_per_site: delete_per_site[site] = []
            if not dataset in [existing[0] for existing in delete_per_site[site]]:
                delete_per_site[site].append( (dataset, total_size) )
        
        session.commit()

    #open('deletes.json','w').write( json.dumps(delete_per_site,indent=2) )

    print json.dumps(delete_per_site, indent=2)
    print "\n\n ------- \n\n"
    ## unroll the deletion per site
    ## maybe find the optimal site/dataset pairing to limit the number of phedex requests
    for site in delete_per_site:
        dataset_list = [info[0] for info in delete_per_site[site]]
        size_removal = sum([info[1] for info in delete_per_site[site]]) / 1024.
        if site in SI.disk:
            free = SI.disk[site]
            print site,"has",size_removal,"TB of potential cleanup.",free,"TB available."
        else:
            print site,"has",size_removal,"TB of potential cleanup. No info on availability."

        print "\t",','.join(dataset_list)
    
    ## make deletion requests
    for site in delete_per_site:
        site_datasets = [info[0] for info in delete_per_site[site]]
        is_tape = any([v in site for v in ['MSS','Export','Buffer'] ])
        #comments="Cleanup input after production. DataOps will take care of approving it."
        #if is_tape:
        #    comments="Cleanup input after production."
        for item in site_datasets:
            LI.release( item, site, 'cleanup of input after production')
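
The keep-one-copy branch above picks a random full copy, preferring T1 sites and falling back to T2, before pruning the cleanup list. A small hedged sketch of that tier-ordered choice follows; pick_keep_site and the presence dict are made-up stand-ins for the {site: (complete, fraction)} shape returned by getDatasetPresence.

## Sketch of the tier-ordered keep-one-copy choice; pick_keep_site and the
## presence dict below are illustrative stand-ins, not the original helpers.
import random

def pick_keep_site(presence):
    full_copies = [site for site, (there, _) in presence.items() if there]
    for tier in ('T1', 'T2'):
        candidates = [site for site in full_copies if site.startswith(tier)]
        if candidates:
            return random.choice(candidates)
    return None

presence = {'T1_US_FNAL_Disk': (True, 100.),
            'T2_CH_CERN': (True, 100.),
            'T2_DE_DESY': (False, 40.)}
print "keeping a full copy at", pick_keep_site(presence)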
Example No. 16
0
def recoveror(url,specific,options=None):
    if userLock('recoveror'): return

    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()

    def make_int_keys( d ):
        ## take a snapshot of the keys: we mutate the dict while converting
        for code in d.keys():
            d[int(code)] = d.pop(code)

    error_codes_to_recover = UC.get('error_codes_to_recover')
    error_codes_to_block = UC.get('error_codes_to_block')
    error_codes_to_notify = UC.get('error_codes_to_notify')
    make_int_keys( error_codes_to_recover )
    make_int_keys( error_codes_to_block )
    make_int_keys( error_codes_to_notify )

    #wfs = session.query(Workflow).filter(Workflow.status == 'assistance-recovery').all()
    wfs = session.query(Workflow).filter(Workflow.status.contains('recovery')).all()
    if specific:
        wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-manual').all() )

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        if not specific and 'manual' in wfo.status: continue
        
        wfi = workflowInfo(url, wfo.name)

        ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves

        all_errors = {}
        try:
            wfi.getSummary()
            all_errors = wfi.summary['errors']
        except:
            pass

        print '-'*100        
        print "Looking at",wfo.name,"for recovery options"

        recover = True       

        if not 'MergedLFNBase' in wfi.request:
            print "f****d up"
            sendEmail('missing lfn','%s wl cache is screwed up'%wfo.name)
            recover = False
 
        if not len(all_errors): 
            print "\tno error for",wfo.name
            recover = False

        task_to_recover = defaultdict(list)
        message_to_ops = ""
        message_to_user = ""

        if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']:
            ## we do not try to recover pLHE
            recover = False

        if wfi.request['RequestType'] in  ['MonteCarlo','ReReco']:
            recover = False

        if 'Campaign' in wfi.request:
            c = wfi.request['Campaign']
            if c in CI.campaigns and 'recover' in CI.campaigns[c]:
                recover=CI.campaigns[c]['recover']

        for task,errors in all_errors.items():
            print "\tTask",task
            ## collect all error codes and #jobs regardless of the step at which they occurred
            all_codes = []
            for name, codes in errors.items():
                if type(codes)==int: continue
                all_codes.extend( [(int(code),info['jobs'],name,list(set([e['type'] for e in info['errors']])),list(set([e['details'] for e in info['errors']])) ) for code,info in codes.items()] )

            all_codes.sort(key=lambda i:i[1], reverse=True)
            sum_failed = sum([l[1] for l in all_codes])

            for errorCode,njobs,name,types,details in all_codes:
                rate = 100*njobs/float(sum_failed)
                #print ("\t\t %10d (%6s%%) failures with error code %10d (%"+str(max_legend)+"s) at stage %s")%(njobs, "%4.2f"%rate, errorCode, legend, name)
                print ("\t\t %10d (%6s%%) failures with error code %10d (%30s) at stage %s")%(njobs, "%4.2f"%rate, errorCode, ','.join(types), name)
                    
                added_in_recover=False

                #if options.go:
                # force the recovery of any task with error ?

                if errorCode in error_codes_to_recover:
                    ## the error code is registered
                    for case in error_codes_to_recover[errorCode]:
                        match = case['details']
                        matched= (match==None)
                        if not matched:
                            matched=False
                            for detail in details:
                                if match in detail:
                                    print "[recover] Found keyword",match,"in"
                                    print 50*"#"
                                    print detail
                                    print 50*"#"
                                    matched = True
                                    break
                        if matched and rate > case['rate']:
                            print "\t\t => we should be able to recover that", case['legend']
                            task_to_recover[task].append( (errorCode,case) )
                            added_in_recover=True
                            message_to_user = ""
                        else:
                            print "\t\t recoverable but not frequent enough, needs",case['rate']

                if errorCode in error_codes_to_block:
                    for case in error_codes_to_block[errorCode]:
                        match = case['details']
                        matched= (match==None)
                        if not matched:
                            matched=False
                            for detail in details:
                                if match in detail:
                                    print "[block] Found keyword",match,"in"
                                    print 50*"#"
                                    print detail
                                    print 50*"#"
                                    matched = True
                                    break
                        if matched and rate > case['rate']:
                            print "\t\t => that error means no ACDC on that workflow", case['legend']
                            if not options.go:
                                message_to_ops += "%s has an error %s blocking an ACDC.\n%s\n "%( wfo.name, errorCode, '#'*50 )
                                recover = False
                                added_in_recover=False

                            
                
                if errorCode in error_codes_to_notify and not added_in_recover:
                    print "\t\t => we should notify people on this"
                    message_to_user += "%s has an error %s in processing.\n%s\n" %( wfo.name, errorCode, '#'*50 )



        if message_to_user:
            print wfo.name,"to be notified to user(DUMMY)",message_to_user

        if message_to_ops:
            sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**'])

        if len(task_to_recover) != len(all_errors):
            print "Should not be doing partial ACDC. skipping"
            #sendEmail('recoveror','do not want to make partial acdc on %s'%wfo.name)
            sendLog('recoveror','do not want to make partial acdc on %s'%wfo.name, level='warning')
            recover = False

        if task_to_recover and recover:
            print "Initiating recovery"
            print ', '.join(task_to_recover.keys()),"to be recovered"

            recovering=set()
            for task in task_to_recover:
                print "Will be making a recovery workflow for",task

                ## from here you can fetch known solutions, to known error codes
                actions = list(set([case['solution'] for code,case in task_to_recover[task]  ]))
                acdc = singleRecovery(url, task, wfi.request , actions, do = options.do)

                if not acdc:
                    if options.do:
                        if recovering:
                            print wfo.name,"has been partially ACDCed. Needs manual attention"
                            #sendEmail( "failed ACDC partial recovery","%s has had %s/%s recoveries %s only"%( wfo.name, len(recovering), len(task_to_recover), list(recovering)), destination=['*****@*****.**'])
                            sendLog('recoveror', "%s has had %s/%s recoveries %s only"%( wfo.name, len(recovering), len(task_to_recover), list(recovering)), level='critical')
                            continue
                        else:
                            print wfo.name,"failed recovery once"
                            #break
                            continue
                    else:
                        print "no action to take further"
                        #sendEmail("an ACDC that can be done automatically","please check https://cmst2.web.cern.ch/cmst2/unified/logs/recoveror/last.log for details", destination=['*****@*****.**'])
                        sendLog('recoveror', "ACDC for %s can be done automatically"% wfo.name, level='critical')
                        continue
                        
                
                ## and assign it ?
                team = wfi.request['Teams'][0]
                parameters={
                    #'SiteWhitelist' : wfi.request['SiteWhitelist'],
                    'SiteWhitelist' : SI.sites_ready,
                    'AcquisitionEra' : wfi.acquisitionEra(),
                    'ProcessingString' :  wfi.processingString(),
                    'MergedLFNBase' : wfi.request['MergedLFNBase'],
                    'ProcessingVersion' : wfi.request['ProcessingVersion'],
                    }
                ## hackery for ACDC merge assignment
                if wfi.request['RequestType'] == 'TaskChain' and 'Merge' in task.split('/')[-1]:
                    parameters['AcquisitionEra'] = None
                    parameters['ProcessingString'] = None

                if options.ass:
                    print "really doing the assignment of the ACDC",acdc
                    parameters['execute']=True
                    #sendEmail("an ACDC was done and WAS assigned", "%s  was assigned, please check https://cmst2.web.cern.ch/cmst2/unified/logs/recoveror/last.log for details"%( acdc ), destination=['*****@*****.**'])
                    wfi.sendLog('recoveror',"%s  was assigned for recovery"% acdc)
                else:
                    print "no assignment done with this ACDC",acdc
                    #sendEmail("an ACDC was done and need to be assigned", "%s needs to be assigned, please check https://cmst2.web.cern.ch/cmst2/unified/logs/recoveror/last.log for details"%( acdc ), destination=['*****@*****.**'])
                    sendLog('recoveror',"%s needs to be assigned"%(acdc), level='critical')


                result = reqMgrClient.assignWorkflow(url, acdc, team, parameters)
                if not result:
                    print acdc,"was not assigned"
                    #sendEmail("an ACDC was done and need to be assigned","%s needs to be assigned, please check https://cmst2.web.cern.ch/cmst2/unified/logs/recoveror/last.log for details"%( acdc ), destination=['*****@*****.**'])
                    sendLog('recoveror',"%s needs to be assigned"%(acdc), level='critical')
                else:
                    recovering.add( acdc )

            current = None
            if recovering:
                #if all went well, set the status to -recovering 
                current = wfo.status 
                if options.ass:
                    current = current.replace('recovery','recovering')
                else:
                    current = 'assistance-manual'
                print 'created ACDC: '+', '.join( recovering )
            else:
                ## was set to be recovered, and no acdc was made
                current = 'assistance-manual'

            if current:
                print wfo.name,"setting the status to",current
                wfo.status = current
                session.commit()
        else:
            ## this workflow should be handled manually at that point
            print wfo.name,"needs manual intervention"
            wfo.status = 'assistance-manual'
            session.commit()
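
The recovery decision above rests on three per-error tests: the error code is registered in the configuration, an optional keyword appears in the error details, and the failure rate clears the per-case threshold. A condensed sketch of that matching follows; matching_cases and the configuration entry are hypothetical, shaped like error_codes_to_recover.

## Condensed sketch of the code/keyword/rate matching in recoveror;
## matching_cases and the configuration entry below are hypothetical.
def matching_cases(error_code, rate, details, config):
    cases = []
    for case in config.get(error_code, []):
        keyword = case['details']
        matched = keyword is None or any(keyword in d for d in details)
        if matched and rate > case['rate']:
            cases.append(case)
    return cases

config = {8021: [{'details': 'FileReadError', 'rate': 20.,
                  'solution': 'acdc', 'legend': 'read errors'}]}
print matching_cases(8021, 35., ['FileReadError on an input file'], config)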
Example No. 17
0
def stagor(url,specific =None, options=None):
    
    if not componentInfo().check(): return
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()

    done_by_wf_id = {}
    done_by_input = {}
    completion_by_input = {}
    good_enough = 100.0
    
    lost_blocks = json.loads(open('%s/lost_blocks_datasets.json'%monitor_dir).read())
    lost_files = json.loads(open('%s/lost_files_datasets.json'%monitor_dir).read())
    known_lost_blocks = {}
    known_lost_files = {}
    for dataset in set(lost_blocks.keys()+lost_files.keys()):
        b,f = findLostBlocksFiles(url, dataset)
        if dataset in lost_blocks and not b:
            print dataset,"has no actually lost blocks"
        else:
            known_lost_blocks[dataset] = [i['name'] for i in b]

        if dataset in lost_files and not f: 
            print dataset,"has no actually lost files"
        else:
            known_lost_files[dataset] = [i['name'] for i in f]

    try:
        cached_transfer_statuses = json.loads(open('cached_transfer_statuses.json').read())
    except:
        print "no existing transfer statuses; starting fresh"
        cached_transfer_statuses = {}

    transfer_statuses = {}

    ## pop all that are now in negative values
    for phedexid in cached_transfer_statuses.keys():
        transfers = session.query(Transfer).filter(Transfer.phedexid==int(phedexid)).all()
        if not transfers:
            print phedexid,"no longer looks relevant to keep in cache. popping"
            print cached_transfer_statuses.pop( phedexid )


            
    ## collect all datasets that are needed for wf in staging, correcting the status of those that are not really in staging
    wfois = []
    needs = defaultdict(list)
    for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all():
        wfi = workflowInfo(url, wfo.name)
        if wfi.request['RequestStatus'] in ['running-open','running-closed','completed','assigned','acquired']:
            wfi.sendLog('stagor', "is in status %s"%wfi.request['RequestStatus'])
            wfo.status='away'
            session.commit()
            continue
        if not wfi.request['RequestStatus'] in ['assignment-approved']:
            ## should be setting 'away' too
            print wfo.name,"is",wfi.request['RequestStatus']
            sendEmail("wrong status in staging. debug","%s is in %s, should set away."%(wfo.name,wfi.request['RequestStatus']))
        wfois.append( (wfo,wfi) )            
        _,primaries,_,secondaries = wfi.getIO()
        for dataset in list(primaries)+list(secondaries):
            needs[wfo.name].append( dataset)
            done_by_input[dataset] = {}
            completion_by_input[dataset] = {}
            wfi.sendLog('stagor', '%s needs %s'%( wfo.name, dataset))

    open('%s/dataset_requirements.json'%monitor_dir,'w').write( json.dumps( needs, indent=2))

    dataset_endpoints = defaultdict(set)
    endpoint_in_downtime = defaultdict(set)
    #endpoint_completed = defaultdict(set)
    endpoint_incompleted = defaultdict(set)
    #endpoint = defaultdict(set)
    send_back_to_considered = set()
    ## phedexid are set negative when not relevant anymore
    ## probably there is a db schema that would allow a much faster and simpler query
    for transfer in session.query(Transfer).filter(Transfer.phedexid>0).all():
        if specific  and str(transfer.phedexid)!=str(specific): continue

        skip=True
        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf: 
                if tr_wf.status == 'staging':
                    sendLog('stagor',"\t%s is staging for %s"%(transfer.phedexid, tr_wf.name))
                    skip=False

        if skip: 
            sendLog('stagor',"setting %s to negative value"%transfer.phedexid)
            transfer.phedexid = -transfer.phedexid
            session.commit()
            continue
        if transfer.phedexid<0: continue

        ## check the status of transfers
        checks = checkTransferApproval(url,  transfer.phedexid)
        approved = all(checks.values())
        if not approved:
            sendLog('stagor', "%s is not yet approved"%transfer.phedexid)
            approveSubscription(url, transfer.phedexid)
            continue

        ## check on transfer completion
        if str(transfer.phedexid) in cached_transfer_statuses:
            ### use a cache for transfer that already looked done
            sendLog('stagor',"read %s from cache"%transfer.phedexid)
            checks = cached_transfer_statuses[str(transfer.phedexid)]
        else:
            checks = checkTransferStatus(url, transfer.phedexid, nocollapse=True)
        ## just write this out
        transfer_statuses[str(transfer.phedexid)] = copy.deepcopy(checks)

        if not specific:
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname]={}
                if not dsname in completion_by_input: completion_by_input[dsname] = {}
                done_by_input[dsname][transfer.phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values()))
                completion_by_input[dsname][transfer.phedexid]=checks[dsname].values()
        if checks:
            sendLog('stagor',"Checks for %s are %s"%( transfer.phedexid, [node.values() for node in checks.values()]))
            done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()]))))
        else:
            ## it is empty, is that a sign that all is done and away ?
            print "ERROR with the subscriptions API of",transfer.phedexid
            print "Most likely something else is overriding the transfer request. Need to work on finding the replacement automatically, if the replacement exists"
            done = False

        ## the thing above is NOT giving the right number
        #done = False

        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf:# and tr_wf.status == 'staging':  
                if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={}
                done_by_wf_id[tr_wf.id][transfer.phedexid]=done
            ## for those that are in staging, and the destination site is in drain
            #if not done and tr_wf.status == 'staging':
                

        for ds in checks:
            for s,v in checks[ds].items():
                dataset_endpoints[ds].add( s )

        if done:
            ## transfer.status = 'done'
            sendLog('stagor',"%s is done"%transfer.phedexid)
            cached_transfer_statuses[str(transfer.phedexid)] = copy.deepcopy(checks)
        else:
            sendLog('stagor',"%s is not finished %s"%(transfer.phedexid, pprint.pformat( checks )))
            pprint.pprint( checks )
            ## check if the destination is in down-time
            for ds in checks:
                sites_incomplete = [SI.SE_to_CE(s) for s,v in checks[ds].items() if v<good_enough]
                sites_incomplete_down = [s for s in sites_incomplete if not s in SI.sites_ready]
                if sites_incomplete_down:
                    sendLog('stagor',"%s are in downtime, while waiting for %s to get there"%( ",".join(sites_incomplete_down), ds))
                #sites_complete = [SI.SE_to_CE(s) for s,v in checks[ds].items() if v>=good_enough]
                #endpoint[ds].update( sites_complete )
                #endpoint[ds].update( sites_incomplete )
                #endpoint_completed[ds].update( sites_complete )
                endpoint_incompleted[ds].update( sites_incomplete )
                endpoint_in_downtime[ds].update( sites_incomplete_down )
            


    print "Endpoints in downtime"
    for k in endpoint_in_downtime: endpoint_in_downtime[k] = list(endpoint_in_downtime[k])
    for k in dataset_endpoints: dataset_endpoints[k] = list(dataset_endpoints[k])
    print json.dumps( endpoint_in_downtime , indent=2)


    open('cached_transfer_statuses.json','w').write( json.dumps( cached_transfer_statuses, indent=2))
    open('%s/transfer_statuses.json'%monitor_dir,'w').write( json.dumps( transfer_statuses, indent=2))
    open('%s/dataset_endpoints.json'%monitor_dir,'w').write( json.dumps(dataset_endpoints, indent=2))

    already_stuck = json.loads( open('%s/stuck_transfers.json'%monitor_dir).read() )
    missing_in_action = defaultdict(list)


    print "-"*10,"Checking on workflows in staging","-"*10
    #forget_about = ['/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM']
    #for what in forget_about:
    #    if not done_by_input[what]:
    #        done_by_input[what] = {'fake':True}

    ## come back to workflows and check if they can go
    available_cache = defaultdict(lambda : defaultdict(float))
    presence_cache = defaultdict(dict)
    for wfo,wfi in wfois:
        print "#"*30
        ## the site white list takes site, campaign, memory and core information
        (_,primaries,_,secondaries,sites_allowed) = wfi.getSiteWhiteList(verbose=False)
        se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
        se_allowed.sort()
        se_allowed_key = ','.join(se_allowed)
        readys={}
        for need in list(primaries)+list(secondaries):
            if not need in done_by_input:
                wfi.sendLog('stagor',"missing transfer report for %s"%need)
                readys[need] = False      
                ## should warn someone about this !!!
                ## it cannot happen, by construction
                sendEmail('missing transfer report','%s does not have a transfer report'%(need))
                continue

            if not done_by_input[need] and need in list(secondaries):
                wfi.sendLog('stagor',"assuming it is OK for secondary %s to have no attached transfers"% need)
                readys[need] = True
                done_by_input[need] = { "fake" : True }
                continue

            if len(done_by_input[need]) and all(done_by_input[need].values()):
                wfi.sendLog('stagor',"%s is ready"%need)
                print json.dumps( done_by_input[need] , indent=2)
                readys[need] = True
            else:
                wfi.sendLog('stagor',"%s is not ready"%need)
                print json.dumps( done_by_input[need] , indent=2)
                readys[need] = False

        if readys and all(readys.values()):
            if wfo.status == 'staging':
                wfi.sendLog('stagor',"all needs are fulfilled, setting staged")
                wfo.status = 'staged'
                session.commit()
            else:
                wfi.sendLog('stagor',"all needs are already fulfilled")
                print json.dumps( readys, indent=2 )
        else:
            wfi.sendLog('stagor',"missing requirements")
            copies_needed,_ = wfi.getNCopies()
            jump_ahead = False
            re_transfer = False
            ## there is missing input let's do something more elaborated
            for need in list(primaries):#+list(secondaries):
                if endpoint_in_downtime[need] == endpoint_incompleted[need]:
                    #print need,"is going to an end point in downtime"
                    wfi.sendLog('stagor',"all incomplete endpoints for %s are in downtime"%need)
                    re_transfer=True

                if not se_allowed_key in available_cache[need]:
                    available_cache[need][se_allowed_key]  = getDatasetBlocksFraction( url , need, sites=se_allowed )
                    if available_cache[need][se_allowed_key] >= copies_needed:
                        wfi.sendLog('stagor',"assuming it is OK to move on like this already for %s"%need)
                        jump_ahead = True

            ## compute a time since staging to filter jump starting ?                    
            # check whether the inputs is already in the stuck list ...
            for need in list(primaries)+list(secondaries):
                if need in already_stuck: 
                    wfi.sendLog('stagor',"%s is stuck, so try to jump ahead"%need)
                    jump_ahead = True
                    
            if jump_ahead or re_transfer:
                details_text = "checking on availability for %s to jump ahead"%wfo.name
                details_text += '\n%s wants %s copies'%(wfo.name,copies_needed)
                copies_needed = max(1,copies_needed-1)
                details_text += '\nlowering by one unit to %s'%copies_needed
                wfi.sendLog('stagor', details_text)
                all_check = True
                
                prim_where = set()
                for need in list(primaries):
                    if not se_allowed_key in presence_cache[need]:
                        presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed)
                    presence = presence_cache[need][se_allowed_key]
                    prim_where.update( presence.keys() )
                    available = available_cache[need][se_allowed_key]
                    this_check = (available >= copies_needed)
                    wfi.sendLog('stagor', "%s is available %s times %s"%( need, available, this_check))
                    all_check &= this_check
                    if not all_check: break

                for need in list(secondaries):
                    ## I do not want to check on the secondary presence
                    this_check = all(done_by_input[need].values())
                    wfi.sendLog('stagor',"%s is all transferred %s"%(need, json.dumps(done_by_input[need], indent=2)))
                    all_check&= this_check
                    #if not se_allowed_key in presence_cache[need]:
                    #    presence_cache[need][se_allowed_key] = getDatasetPresence( url, need , within_sites=se_allowed)

                    ## restrict to where the primary is
                    #presence = dict([(k,v) for (k,v) in presence_cache[need][se_allowed_key].items() if k in prim_where])
                    #this_check = all([there for (there,frac) in presence.values()])
                    #print need,"is present at all sites:",this_check
                    #all_check&= this_check

                if all_check:    
                    wfi.sendLog('stagor',"needs are sufficiently fulfilled, setting staged")
                    wfo.status = 'staged'
                    session.commit()
                else:
                    print wfo.name,"has to wait a bit more"
                    wfi.sendLog('stagor',"needs to wait a bit more")
            else:
                wfi.sendLog('stagor',"not checking availability")
            if re_transfer:
                wfi.sendLog('stagor',"Sending back to considered because of endpoint in downtime")
                if wfo.status == 'staging':
                    wfo.status = 'considered'
                    session.commit()
                    send_back_to_considered.add( wfo.name )



    if send_back_to_considered:
        #sendEmail("transfer to endpoint in downtime","sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)))
        sendLog('stagor', "sending back to considered the following workflows \n%s"%('\n'.join( send_back_to_considered)), level='critical')

    print "-"*10,"Checking on non-available datasets","-"*10    
    ## now check on those that are not fully available
    
    for dsname in available_cache.keys():
        ## collapse the per-se_allowed_key cache down to the worst-case availability
        available_cache[dsname] = min( available_cache[dsname].values() )
            
    for dsname,available in available_cache.items():
        using_its = getWorkflowByInput(url, dsname)
        #print using_its
        using_wfos = []
        for using_it in using_its:
            wf = session.query(Workflow).filter(Workflow.name == using_it).first()
            if wf:
                using_wfos.append( wf )

        if not len(done_by_input[dsname]):
            print "For dataset",dsname,"there are no transfer report. That's an issue."
            for wf in using_wfos:
                if wf.status == 'staging':
                    if UC.get("stagor_sends_back"):
                        print "sending",wf.name,"back to considered"
                        wf.status = 'considered'
                        session.commit()
                        #sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name)
                        sendLog('stagor', "%s was send back and might be trouble"% wf.name, level='critical')
                    else:
                        print "would send",wf.name,"back to considered"
                        #sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name)
                        sendLog('stagor', "susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name, level='critical')
            continue

        ## not compatible with checking on secondary availability
        #if all([wf.status != 'staging' for wf in using_wfos]):
        #    ## means despite all checks that input is not needed
        #    continue

        if available < 1.:
            print "incomplete",dsname
            ## the method below has a problem: it does not account for files stuck only in T1*Buffer
            lost_blocks,lost_files = findLostBlocksFiles( url, dsname )
            lost_block_names = [item['name'] for item in lost_blocks]
            lost_file_names = [item['name'] for item in lost_files]

            if lost_blocks:
                #print json.dumps( lost , indent=2 )
                ## estimate for how much !
                fraction_loss,_,n_missing = getDatasetBlockFraction(dsname, lost_block_names)
                print "We have lost",len(lost_block_names),"blocks",lost_block_names,"for %f%%"%(100.*fraction_loss)
                if fraction_loss > 0.05: ## 95% completion mark
                    #sendEmail('we have lost too many blocks','%s is missing %d blocks, for %d events, %f %% loss'%(dsname, len(lost_block_names), n_missing, fraction_loss))
                    sendLog('stagor', '%s is missing %d blocks, for %d events, %3.2f %% loss'%(dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='warning')
                    ## the workflow should be rejected !
                    for wf in using_wfos: 
                        if wf.status == 'staging':
                            print wf.name,"is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                            #sendEmail('doomed workflow','%s has too much loss on the input dataset %s. please check on stagor logs https://cmst2.web.cern.ch/cmst2/unified/logs/stagor/last.log'%(wf.name, dsname))
                            sendLog('stagor', '%s has too much loss on the input dataset %s. Missing %d blocks, for %d events, %3.2f %% loss'%(wf.name, dsname, len(lost_block_names), n_missing, 100*fraction_loss), level='critical')
                else:
                    ## probably enough to make a ggus and remove
                    if not dsname in known_lost_blocks:
                        #sendEmail('we have lost a few blocks', '%s is missing %d blocks, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, fraction_loss, '\n'.join( lost_block_names ) ))
                        sendLog('stagor', '%s is missing %d blocks, for %d events, %3.2f %% loss\n\n%s'%(dsname, len(lost_block_names), n_missing, 100*fraction_loss, '\n'.join( lost_block_names ) ), level='critical')
                        known_lost_blocks[dsname] = [i['name'] for i in lost_blocks]
                                  
            if lost_files:
                fraction_loss,_,n_missing = getDatasetFileFraction(dsname, lost_file_names)
                print "We have lost",len(lost_file_names),"files",lost_file_names,"for %f%%"%fraction_loss
                
                if fraction_loss > 0.05:
                    #sendEmail('we have lost too many files','%s is missing %d files, for %d events, %f %% loss'%(dsname, len(lost_file_names),n_missing, fraction_loss))
                    sendLog('stagor', '%s is missing %d files, for %d events, %3.2f %% loss'%(dsname, len(lost_file_names),n_missing, 100*fraction_loss), level='critical')
                    for wf in using_wfos:
                        if wf.status == 'staging':
                            print wf.name,"is doomed. setting to trouble"
                            wf.status = 'trouble'
                            session.commit()
                else:
                    ## probably enough to make a ggus and remove    
                    if not dsname in known_lost_files:
                        #sendEmail('we have lost a few files','%s is missing %d files, for %d events, %f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, fraction_loss, '\n'.join(lost_file_names)))
                        sendLog('stagor', '%s is missing %d files, for %d events, %3.2f %% loss\n\n%s'%(dsname, len(lost_file_names),n_missing, 100*fraction_loss, '\n'.join(lost_file_names)), level='critical')
                        known_lost_files[dsname] = [i['name'] for i in lost_files]

                ## should the status be changed to held-staging, pending a ticket?



            missings = [pid for (pid,d) in done_by_input[dsname].items() if d==False] 
            print "\t",done_by_input[dsname]
            print "\tneeds",len(done_by_input[dsname])
            print "\tgot",done_by_input[dsname].values().count(True)
            print "\tmissing",missings
            missing_in_action[dsname].extend( missings )
        


    rr= open('%s/lost_blocks_datasets.json'%monitor_dir,'w')
    rr.write( json.dumps( known_lost_blocks, indent=2))
    rr.close()

    rr= open('%s/lost_files_datasets.json'%monitor_dir,'w')
    rr.write( json.dumps( known_lost_files, indent=2))
    rr.close()


    open('%s/incomplete_transfers.json'%monitor_dir,'w').write( json.dumps(missing_in_action, indent=2) )
    print "Stuck transfers and datasets"
    print json.dumps( missing_in_action, indent=2 )

    print "Going further and make a report of stuck transfers"

    datasets_by_phid = defaultdict(set)
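    ## group the stuck datasets by the PhEDEx request id (phid) they are waiting on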
    for dataset in missing_in_action:
        for phid in missing_in_action[dataset]:
            #print dataset,"stuck through",phid
            datasets_by_phid[phid].add( dataset )

    bad_destinations = defaultdict(set)
    bad_sources = defaultdict(set)
    report = ""
    really_stuck_dataset = set()
    transfer_timeout = UC.get("transfer_timeout")
    transfer_lowrate = UC.get("transfer_lowrate")
    for phid,datasets in datasets_by_phid.items():
        issues = checkTransferLag( url, phid , datasets=list(datasets) )
        for dataset in issues:
            for block in issues[dataset]:
                for destination in issues[dataset][block]:
                    (block_size,destination_size,delay,rate,dones) = issues[dataset][block][destination]
                    ## count x_Buffer and x_MSS as one source
                    redones=[]
                    for d in dones:
                        if d.endswith('Buffer') or d.endswith('Export'):
                            if d.replace('Buffer','MSS').replace('Export','MSS') in dones: 
                                continue
                            else: 
                                redones.append( d )
                        else:
                            redones.append( d )
                    dones = list(set( redones ))
                    #dones = filter(lambda s : (s.endswith('Buffer') and not s.replace('Buffer','MSS') in dones) or (not s.endswith('Buffer')) , dones)
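                    ## a block counts as stuck when it has waited longer than transfer_timeout [d] at a rate below transfer_lowrate [GB/s]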
                    if delay>transfer_timeout and rate<transfer_lowrate:
                        if len(dones)>1:
                            ## it's the destination that sucks
                            bad_destinations[destination].add( block )
                        else:
                            for d in dones:
                                bad_sources[d].add( block )
                        really_stuck_dataset.add( dataset )
                        print "add",dataset,"to really stuck"
                        report += "%s is not getting to %s, out of %s faster than %f [GB/s] since %f [d]\n"%(block,destination,", ".join(dones), rate, delay)
    print "\n"*2

    ## create tickets right away ?
    report+="\nbad sources "+",".join(bad_sources.keys())+"\n"
    for site,blocks in bad_sources.items():
        report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks))
    report+="\nbad destinations "+",".join(bad_destinations.keys())+"\n"
    for site,blocks in bad_destinations.items():
        report+="\n\n%s:"%site+"\n\t".join(['']+list(blocks))

    print '\n'*2,"Datasets really stuck"
    print '\n'.join( really_stuck_dataset )

    print '\n'*2,"report written at https://cmst2.web.cern.ch/cmst2/unified/logs/incomplete_transfers.log"
    print report

    stuck_transfers = dict([(k,v) for (k,v) in missing_in_action.items() if k in really_stuck_dataset])
    print '\n'*2,'Stuck dataset transfers'
    print json.dumps(stuck_transfers , indent=2)
    open('%s/stuck_transfers.json'%monitor_dir,'w').write( json.dumps(stuck_transfers , indent=2) )
    open('%s/logs/incomplete_transfers.log'%monitor_dir,'w').write( report )
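
A minimal, self-contained sketch of the source-collapsing rule used in the stuck-transfer report above: a site's x_Buffer or x_Export endpoint and its x_MSS endpoint are counted as one source. The helper name and the sample site names below are illustrative only, not part of Unified.

def collapse_endpoints(dones):
    ## count a site's Buffer/Export endpoint and its MSS endpoint as one source
    redones = []
    for d in dones:
        if d.endswith('Buffer') or d.endswith('Export'):
            ## skip the Buffer/Export alias when the MSS endpoint is also listed
            if d.replace('Buffer', 'MSS').replace('Export', 'MSS') in dones:
                continue
        redones.append(d)
    return list(set(redones))

print collapse_endpoints(['T1_US_FNAL_Buffer', 'T1_US_FNAL_MSS'])
## -> ['T1_US_FNAL_MSS'] : a single logical source
print collapse_endpoints(['T1_DE_KIT_Buffer', 'T2_CH_CERN_Export'])
## -> two distinct sources, so a slow block would be blamed on the destination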
Example No. 18
def assignor(url, specific=None, talk=True, options=None):
    if userLock() and not options.manual: return
    mlock = moduleLock()
    if mlock() and not options.manual: return
    if not componentInfo().check() and not options.manual: return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = global_SI()
    ###NLI = newLockInfo()
    ###if not NLI.free() and not options.go: return
    LI = lockInfo()
    if not LI.free() and not options.go and not options.manual: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    dataset_endpoints = json.loads(
        eosRead('%s/dataset_endpoints.json' % monitor_dir))
    aaa_mapping = json.loads(eosRead('%s/equalizor.json' %
                                     monitor_pub_dir))['mapping']
    all_stuck = set()
    all_stuck.update(
        json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)))

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')
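    ## max_per_round caps how many workflows are treated per pass; max_cpuh_block vetoes very CPU-hungry requests unless options.go is set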

    ##order by priority instead of random
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True),
                       key=lambda r: r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]

        def rank(wfn):
            return cache.index(wfn) if wfn in cache else 0

        wfos = sorted(wfos, key=lambda wfo: rank(wfo.name), reverse=True)
        print "10 first", [wfo.name for wfo in wfos[:10]]
        print "10 last", [wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle(wfos)

    for wfo in wfos:

        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed', 'aborted-archived',
                'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early: options_text += ", early option is ON"
        if options.partial:
            options_text += ", partial option is ON"
            options_text += ", good fraction is %.2f" % options.good_enough

        wfh.sendLog('assignor',
                    "%s to be assigned%s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                msg = '%s is not an allowed secondary' % (
                    ', '.join(set(secondary) - set(allowed_secondary.keys())))
                wfh.sendLog('assignor', msg)
                critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format(
                    wfh.getPrepIDs()[0])
                sendLog('assignor', critical_msg, level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking it up from 'away' will do what is necessary
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'],
                                       'SecondaryLocation', [])

        blocks = wfh.getBlocks()

        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa
        do_partial = False  #options.good_enough if options.partial else 0

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']
        if 'partial_copy' in assign_parameters:
            ## can this only work if there is a stuck input ? maybe not
            ## this is a number. 0 means no
            print "Could do partial disk copy assignment"
            if is_stuck or options.partial:
                do_partial = assign_parameters['partial_copy']
                wfh.sendLog(
                    'assignor',
                    "Overiding partial copy assignment to %.2f fraction" %
                    do_partial)
                #sendEmail('stuck input to assignment','%s is stuck for assigning %s and going fractional'%(','.join( is_stuck), wfo.name))

        do_partial = options.good_enough if options.partial else do_partial

        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                #sendEmail("tempting to pass sec location check","but we cannot yet IMO")
                #pass

            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items()
                if frac > 98.
            ]

            if secondary_aaa:
                if not one_secondary_locations:
                    sec_availability = getDatasetBlocksFraction(url, sec)
                    if sec_availability >= 1. and options.go:
                        ## there is at least one copy of each block on disk. We should go ahead and let it go.
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is available %s times on disk, and usable"
                            % (sec, sec_availability))
                    else:
                        ## not even a copy on disk anywhere !!!!
                        sites_allowed = []  ## will block the assignment
                        wfh.sendLog(
                            'assignor',
                            "The secondary %s is nowhere on disk" % sec)
                #just continue without checking
                continue

            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to sites holding the secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [
                site for site in sites_allowed
                if SI.CE_to_SE(site) in one_secondary_locations
            ]

        wfh.sendLog(
            'assignor',
            "Intersecting with secondary requirement, now allowed %s" %
            sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy(
            sites_allowed
        )  ## keep track of this after the secondary-input location restriction: that is the list to operate with

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc'  ## by default

        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor", dataset_endpoints[prim]
                endpoints.update(dataset_endpoints[prim])
            set_lfn = getLFNbase(prim)
            ## if they are requested for processing, they should all be closed already
            closeAllBlocks(url, prim, blocks)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(
                url,
                prim,
                sites=[SI.CE_to_SE(site) for site in sites_allowed],
                only_blocks=blocks)
            if primary_aaa:
                available_fractions[prim] = getDatasetBlocksFraction(
                    url, prim, only_blocks=blocks)

            sites_all_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in [
                    psite for (psite, (there, frac)) in presence.items()
                    if there
                ]
            ]
            if primary_aaa:
                sites_all_data = set()
                for (psite, (there, frac)) in presence.items():
                    if there:
                        sites_all_data.update(SI.SE_to_CEs(psite))
                sites_all_data = list(sites_all_data)
                #sites_all_data = list(set([SI.SE_to_CE(psite) for (psite,(there,frac)) in presence.items() if there]))
            sites_with_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in
                [psite for (psite, frac) in presence.items() if frac[1] > 90.]
            ]
            sites_with_any_data = [
                site for site in sites_with_any_data
                if SI.CE_to_SE(site) in presence.keys()
            ]
            if primary_aaa:
                sites_with_any_data = set()
                for psite in presence.keys():
                    sites_with_any_data.update(SI.SE_to_CEs(psite))
                sites_with_any_data = list(sites_with_any_data)
                #sites_with_any_data = list(set([SI.SE_to_CE(psite) for psite in presence.keys()]))

            holding_but_not_allowed = set()
            for se_site in presence.keys():
                if not (set(SI.SE_to_CEs(se_site)) & set(sites_allowed)):
                    holding_but_not_allowed.add(se_site)
            #wfh.sendLog('assignor',"Holding the data but not allowed %s"%sorted(list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))))
            wfh.sendLog(
                'assignor', "Holding the data but not allowed %s" %
                sorted(holding_but_not_allowed))
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in
                    list((set(secondary_locations) & set(primary_locations)) -
                         set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            elif primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in list(
                        set(primary_locations) -
                        set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            else:
                opportunistic_sites = []
            wfh.sendLog(
                'assignor', "We could be running in addition at %s" %
                sorted(opportunistic_sites))
            if any(
                [osite in SI.sites_not_ready
                 for osite in opportunistic_sites]):
                wfh.sendLog(
                    'assignor', "One of the usable site is in downtime %s" % ([
                        osite for osite in opportunistic_sites
                        if osite in SI.sites_not_ready
                    ]))
                down_time = True
                ## should this be sent back to considered ?

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large number of CPUh %s, not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                    wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[
                wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(
                1, copies_wanted -
                less_copies_than_requested)  # take one out for the efficiency
        else:
            ## find out whether a site in the whitelist is lacking jobs, and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',
                    "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        if primary_aaa:
            ## remove the sites not reachable locally if not holding the data
            if not sites_all_data:
                wfh.sendLog('assignor',
                            "Overiding the primary on AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_all_data)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFN is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFNBase is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if not isStoreResults:
                sites_allowed = sites_with_any_data
            else:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhitelist set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
                available_fractions = {}
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if do_partial:
            if endpoints:
                end_sites = [SI.SE_to_CE(s) for s in endpoints]
                sites_allowed = list(set(sites_allowed + end_sites))
                if down_time and not any(osite in SI.sites_not_ready
                                         for osite in end_sites):
                    print "Flip the status of downtime, since our destinations are good"
                    down_time = False
                print "with added endpoints", sorted(end_sites)
            else:
                print "Cannot do partial assignment without knowin the endpoints"
                n_stalled += 1
                continue

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        low_pressure = SI.sites_low_pressure(0.4)
        ## if any of the allowed sites is under low pressure, reduce to 1 copy so that it gets started
        allowed_and_low = sorted(set(low_pressure) & set(sites_allowed))
        if allowed_and_low:
            wfh.sendLog(
                'assignor',
                "The workflow can run at %s under low pressure currently" %
                (','.join(allowed_and_low)))
            copies_wanted = max(1., copies_wanted - 1.)

        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            not_even_once = not all([
                available >= 1. for available in available_fractions.values()
            ])
            above_good = all([
                available >= do_partial
                for available in available_fractions.values()
            ])
            wfh.sendLog(
                'assignor',
                "The input dataset is not available %s times, only %s" %
                (copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog(
                    'assignor',
                    "sending back to considered because of site downtime, instead of waiting"
                )
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog(
                    'assignor',
                    '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'
                    % (wfo.name),
                    level='delay')
                n_stalled += 1
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not (
                        do_partial and above_good):
                    wfh.sendLog(
                        'assignor',
                        "cannot be assigned, %s is not sufficiently available.\n %s"
                        % (wfo.name, json.dumps(available_fractions)))
                    #sendEmail( "cannot be assigned","%s is not sufficiently available.\n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append(wfo.name)
                    open('cannot_assign.json',
                         'w').write(json.dumps(known, indent=2))

                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor', "setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is", wfo.status
                if do_partial and above_good:
                    print "Will move on with partial locations"
                else:
                    n_stalled += 1
                    continue

        if not len(sites_allowed) and not options.SiteWhitelist:
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assigned with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor',
                "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            parameters['TrustPUSitelists'] = True
            wfh.sendLog(
                'assignor', "Reading secondary through xrootd at %s" %
                sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            ##parse options entered in command line if any
            if options:
                for key in reqMgrClient.assignWorkflow.keys:
                    v = getattr(options, key)
                    if v != None:
                        if type(v) == str and ',' in v:
                            parameters[key] = filter(None, v.split(','))
                        else:
                            parameters[key] = v

        def pick_campaign(assign_parameters, parameters):
            ## pick up campaign specific assignment parameters
            parameters.update(assign_parameters.get('parameters', {}))

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check == None or split_check == False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl.get('events_per_job', None)
                    if eventsPerJobEstimated is None:
                        ## fall back to the average when the exact value is absent
                        eventsPerJobEstimated = spl.get('avg_events_per_job', None)
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        if isHEPCloudReady(url) and wfh.isGoodForNERSC():
            parameters['Team'] = 'hepcloud'
            parameters['SiteWhitelist'] = ['T3_US_NERSC']
            if primary:
                parameters['TrustSitelists'] = True
            if secondary:
                parameters['TrustPUSitelists'] = True
            sendEmail("sending work to hepcloud",
                      "pleasse check on %s" % wfh.request['RequestName'],
                      destination=['*****@*****.**'])

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))
        result = reqMgrClient.assignWorkflow(
            url, wfo.name, None,
            parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                if wfh.producePremix():
                    title = "Heavy workflow assigned to {}".format(
                        parameters['SiteWhitelist'])
                    body = "Workflow name: {}".format(
                        wfh.request['RequestName'])
                    body += "\nOutput dataset(s): {}".format(
                        wfh.request['OutputDatasets'])
                    body += "\nAssigned to: {}".format(
                        parameters['SiteWhitelist'])
                    sendEmail(
                        title,
                        body,
                        destination=[
                            '*****@*****.**'
                        ])

                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')
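
The pick_options/pick_campaign pair above encodes a simple precedence rule: with dict.update, whatever is applied last wins on key collisions, and force_options flips the order so that command-line options override the campaign configuration. A minimal sketch of that precedence, with made-up parameter values:

def merge(first, second):
    ## later updates win on key collisions, exactly like the two pick_* helpers above
    parameters = {}
    parameters.update(first)
    parameters.update(second)
    return parameters

cli_options = {'Team': 'production', 'LumisPerJob': 8}
campaign_parameters = {'LumisPerJob': 4}

## default path: campaign parameters are applied last and win
print merge(cli_options, campaign_parameters)['LumisPerJob']   ## 4
## force_options path: command-line options are applied last and win
print merge(campaign_parameters, cli_options)['LumisPerJob']   ## 8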
Example No. 19
    # check if there is a custodial
    # check = findCustodialLocation(url, dataset)
    check = getDatasetPresence(url, dataset,complete=None)
    if len(check):
        print "OK for dataset at",check
    else:
        print "need to pick a site and transfer"
        get_those.append( dataset )
print get_those
#res= makeReplicaRequest(url, get_those_to, get_those, "restaging because of Redigi Move custodial screw up")
#print res  ## res is only set by the commented-out request above

"""
sys.exit(1)

SI = siteInfo()

#items = getDatasetChops(dataset)
items = [['block'] for i in range(100)]
siteblacklist = ['T2_TH_CUNSTDA','T1_TW_ASGC','T2_TW_Taiwan']
sites = [s for s in json.loads(open('/afs/cern.ch/user/c/cmst2/www/mc/whitelist.json').read()) if s not in siteblacklist]
random.shuffle(sites)
sites = sites[:10]
#weights = { }
#for (i,site) in enumerate(sites):
    #weights[site]= random.random()
#    weights[site] = i

SI.cpu_pledges  ## bare access has no effect; the pledges are passed below as weights
spreading = distributeToSites( items, sites, n_per_site = 2 , weights=SI.cpu_pledges)
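
distributeToSites spreads the block chunks over the sites with a multiplicity of n_per_site, biased by the weights (here the CPU pledges). The sketch below is a hypothetical stand-in written under that assumption, not Unified's own implementation:

import random
from collections import defaultdict

def distribute_to_sites(items, sites, n_per_site=2, weights=None):
    ## every item lands on n_per_site distinct sites; higher-weight sites
    ## appear more often in the pool and are therefore picked more often
    weights = weights or {}
    pool = [s for s in sites for _ in range(max(1, int(weights.get(s, 1))))]
    spreading = defaultdict(list)
    for item in items:
        picked = set()
        while len(picked) < min(n_per_site, len(sites)):
            picked.add(random.choice(pool))
        for site in picked:
            spreading[site].append(item)
    return dict(spreading)

chunks = [['block_%d' % i] for i in range(10)]
print distribute_to_sites(chunks, ['T2_A', 'T2_B', 'T2_C'],
                          n_per_site=2, weights={'T2_A': 3, 'T2_B': 1, 'T2_C': 1})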
Example No. 20
def htmlor(caller=""):
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return

    try:
        boost = json.loads(open('%s/equalizor.json' %
                                monitor_dir).read())['modifications']
    except:
        boost = {}
    cache = getWorkflows(reqmgr_url, 'assignment-approved', details=True)
    cache.extend(getWorkflows(reqmgr_url, 'acquired', details=True))
    cache.extend(getWorkflows(reqmgr_url, 'running-open', details=True))
    cache.extend(getWorkflows(reqmgr_url, 'running-closed', details=True))

    def getWL(wfn):
        cached = filter(lambda d: d['RequestName'] == wfn, cache)
        if cached:
            wl = cached[0]
        else:
            wl = getWorkLoad(reqmgr_url, wfn)
        return wl

    def wfl(wf,
            view=False,
            p=False,
            ms=False,
            within=False,
            ongoing=False,
            status=False,
            update=False):
        wfn = wf.name
        wfs = wf.wm_status
        wl = None
        pid = None
        wl_pid = None
        pids = filter(lambda seg: seg.count('-') == 2, wf.name.split('_'))
        if len(pids):
            pids = pids[:1]
            pid = pids[0]

        if not pids:
            wl = getWL(wf.name)
            pids = getPrepIDs(wl)
            pid = pids[0]

        wl_pid = pid
        if 'task' in wf.name:
            wl_pid = 'task_' + pid

        text = ', '.join([
            #wfn,
            #'<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">%s</a> '%(wfn,wfn),
            #'<table><tr><td>%s</td></tr></table>'%(wfn),
            #'<span>%s</span>'%(wfn),
            "%s " % wfn,
            '(%s) <br>' % wfs
        ])
        text += ', '.join([
            '<a href="https://%s/reqmgr2/fetch?rid=%s" target="_blank">dts</a>'
            % (reqmgr_url, wfn),
            '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">dts-req1</a>'
            % wfn,
            #TOFIX '<a href=https://cmsweb.cern.ch/reqmgr/view/showWorkload?requestName=%s target="_blank">wkl</a>'%wfn,
            '<a href="https://%s/couchdb/reqmgr_workload_cache/%s" target="_blank">wfc</a>'
            % (reqmgr_url, wfn),
            '<a href="https://%s/reqmgr2/data/request?name=%s" target="_blank">req</a>'
            % (reqmgr_url, wfn),
            #'<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/request?requestName=%s" target="_blank">dwkc</a>'%wfn,
            #TOFIX '<a href="https://cmsweb.cern.ch/reqmgr/view/splitting/%s" target="_blank">spl</a>'%wfn,
            '<a href="https://cms-pdmv.cern.ch/stats/?RN=%s" target="_blank">vw</a>'
            % wfn,
            '<a href="https://cms-pdmv.cern.ch/stats/restapi/get_one/%s" target="_blank">vwo</a>'
            % wfn,
            '<a href="https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=full&reverse=0&reverse=1&npp=20&subtext=%s&sall=q" target="_blank">elog</a>'
            % pid,
            '<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank">pv</a>'
            % wfn,
            #deprecated '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/outputDatasetsByRequestName/%s" target="_blank">out</a>'%wfn,
            '<a href="closeout.html#%s" target="_blank">clo</a>' % wfn,
            '<a href="statuses.html#%s" target="_blank">st</a>' % wfn,
            '<a href="https://%s/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s" target="_blank">perf</a>'
            % (reqmgr_url, wfn)
        ])
        if within and (not view or wfs == 'completed'):
            wl = getWL(wfn)
            dataset = None
            if 'InputDataset' in wl:
                dataset = wl['InputDataset']
            if 'Task1' in wl and 'InputDataset' in wl['Task1']:
                dataset = wl['Task1']['InputDataset']

            if dataset:
                text += ', '.join([
                    '',
                    '<a href=https://cmsweb.cern.ch/das/request?input=%s target=_blank>input</a>'
                    % dataset,
                    '<a href=https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions#state=create_since=0;filter=%s target=_blank>sub</a>'
                    % dataset,
                    '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?dataset=%s&collapse=n target=_blank>ds</a>'
                    % dataset,
                    '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/blockreplicas?dataset=%s target=_blank>rep</a>'
                    % dataset,
                ])

        if p:
            cached = filter(lambda d: d['RequestName'] == wfn, cache)
            if cached:
                wl = cached[0]
            else:
                wl = getWorkLoad('cmsweb.cern.ch', wfn)
            text += ', (%s)' % (wl['RequestPriority'])
            pass

        if pid:
            if ms:
                mcm_s = json.loads(
                    os.popen(
                        'curl https://cms-pdmv.cern.ch/mcm/public/restapi/requests/get_status/%s --insecure'
                        % pid).read())[pid]
                text += ', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm (%s)</a>' % (
                    pid, mcm_s)
            else:
                text += ', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm</a>' % (
                    pid)
                text += ', <a href="https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s" target="_blank">ac</a>' % (
                    wl_pid)

        if status:
            if wf.status.startswith('assistance'):
                text += ', <a href="assistance.html#%s" target="_blank">assist</a>' % wfn
            text += ' : %s ' % (wf.status)

        if view and wfs != 'acquired':
            text += '<a href="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" target="_blank"><img src="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" style="height:50px"></a>' % (
                wfn.replace('_', '/'), wfn.replace('_', '/'))
        if ongoing:
            text += '<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a>' % (
                wfn, wfn)

        if ongoing:
            date1 = time.strftime(
                '%Y-%m-%d+%H:%M',
                time.gmtime(time.mktime(time.gmtime()) - (15 * 24 * 60 * 60)))
            date2 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime())
            text += '<a href="http://dashb-cms-job.cern.ch/dashboard/templates/web-job2/#table=Jobs&date1=%s&date2=%s&sortby=site&task=wmagent_%s">dashb</a>' % (
                date1, date2, wfn)

        if ongoing and wfn in boost:
            for task in boost[wfn]:
                overflow = boost[wfn][task].get('ReplaceSiteWhitelist', None)
                if not overflow:
                    overflow = boost[wfn][task].get('AddWhitelist', None)
                if overflow:
                    text += ',boost (<a href=equalizor.json>%d</a>)' % len(
                        overflow)

        #text+="<hr>"
        return text

    def phl(phid):
        text = ', '.join([
            str(phid),
            '<a href="https://cmsweb.cern.ch/phedex/prod/Request::View?request=%s" target="_blank">vw</a>'
            % phid,
            '<a href="https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions?reqfilter=%s" target="_blank">sub</a>'
            % phid,
        ])
        return text

    def ol(out):
        return '<a href="https://cmsweb.cern.ch/das/request?input=%s" target="_blank"> %s</a>' % (
            out, out)

    def lap(comment):

        l = time.mktime(time.gmtime())
        spend = l - lap.start
        lap.start = l
        print "Spend %d [s] for %s" % (spend, comment)

    lap.start = time.mktime(time.gmtime())
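    ## lap() keeps the previous timestamp as a function attribute, so each call prints the time spent since the last one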

    ## start to write it
    #html_doc = open('/afs/cern.ch/user/v/vlimant/public/ops/index.html','w')
    html_doc = open('%s/index.html.new' % monitor_dir, 'w')
    print "Updating the status page ..."

    UC = unifiedConfiguration()

    if not caller:
        try:
            #caller = sys._getframe(1).f_code.co_name
            caller = sys.argv[0].split('/')[-1].replace('.py', '')
            print "caller is"
            print caller
        except Exception as es:
            caller = 'none found'
            print "not getting frame"
            print str(es)

    html_doc.write("""
<html>
<head>
<META HTTP-EQUIV="refresh" CONTENT="900">
<script type="text/javascript">
 function showhide(id) {
    var e = document.getElementById(id);
    e.style.display = (e.style.display == 'block') ? 'none' : 'block';
 }
</script>
</head>
<body>

Last update on %s (CET), %s (GMT)
<br>
<a href=logs/ target=_blank>logs</a> <a href=logs/last.log target=_blank>last</a> <a href=statuses.html>statuses</a> <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/ target=_blank>prod mon</a> <a href=https://%s/wmstats/index.html target=_blank>wmstats</a> <a href=http://t3serv001.mit.edu/~cmsprod/IntelROCCS/Detox/SitesInfo.txt target=_blank>detox</a> <a href=locked.html>space</a> <a href=logs/subscribor/last.log target=_blank>blocks</a> <a href=logs/agents/last.log>agents</a>
<br>
<a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/CompOpsWorkflowL3Responsibilities#Automatic_Assignment_and_Unified>what am I</a> <a href=data.html>json interfaces</a> <a href=logs/addHoc/last.log>ad-hoc op</a> created from <b>%s <a href=logs/last_running>last running</a></b> <object height=20 type="text/html" data="logs/last_running"><p>backup content</p></object>
<br><br>

""" % (time.asctime(time.localtime()), time.asctime(
        time.gmtime()), reqmgr_url, caller))

    text = ""
    count = 0
    count_by_campaign = defaultdict(lambda: defaultdict(int))
    for wf in session.query(Workflow).filter(
            Workflow.status.startswith('considered')).all():
        wl = getWL(wf.name)
        count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])] += 1
        #print wf.name
        text += "<li> %s </li> \n" % wfl(wf, p=True)
        count += 1
    text_by_c = ""
    for c in count_by_campaign:
        text_by_c += "<li> %s (%d) : " % (c, sum(
            count_by_campaign[c].values()))
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c += "%d (%d), " % (p, count_by_campaign[c][p])
        text_by_c += "</li>"

    html_doc.write("""
Workflows next to handle (%d) <a href=https://cms-pdmv.cern.ch/mcm/batches?status=new&page=-1 target="_blank"> batches</a> <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/transferor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('considered')">[Click to show/hide]</a>
<br>
<div id="considered" style="display:none;">
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('considered_bywf')">[Click to show/hide]</a><div id="considered_bywf" style="display:none;">
 <ul>
 %s
 </ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('considered_bycamp')">[Click to show/hide]</a><div id="considered_bycamp" style="display:none;">
 <ul>
 %s
 </ul></div>
</ul>
</div>
""" % (count, count, text, len(count_by_campaign), text_by_c))

    lap('done with considered')
    text = ""
    count = 0
    count_by_campaign = defaultdict(lambda: defaultdict(int))
    for wf in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wl = getWL(wf.name)
        count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])] += 1
        text += "<li> %s </li> \n" % wfl(wf, within=True)
        count += 1

    text_by_c = ""
    for c in count_by_campaign:
        text_by_c += "<li> %s (%d) : " % (c, sum(
            count_by_campaign[c].values()))
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c += "%d (%d), " % (p, count_by_campaign[c][p])
        text_by_c += "</li>"

    html_doc.write("""
Workflows waiting in staging (%d) <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staging')">[Click to show/hide]</a>
<br>
<div id="staging" style="display:none;">
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('staging_bywf')">[Click to show/hide]</a><div id="staging_bywf" style="display:none;">
 <ul>
 %s
 </ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('staging_bycamp')">[Click to show/hide]</a><div id="staging_bycamp" style="display:none;">
 <ul>
 %s
 </ul></div>
</ul>
</div>
""" % (count, count, text, len(count_by_campaign), text_by_c))

    lap('done with staging')

    text = "<ul>"
    count = 0
    transfer_per_wf = defaultdict(list)
    for ts in session.query(Transfer).filter(Transfer.phedexid > 0).all():
        hide = True
        t_count = 0
        stext = ""
        for pid in ts.workflows_id:
            w = session.query(Workflow).get(pid)
            hide &= (w.status != 'staging')
            if w.status in ['considered', 'staging', 'staged']:
                stext += "<li> %s </li>\n" % (wfl(w, status=True))
                transfer_per_wf[w].append(ts.phedexid)
                t_count += 1
        stext = '<li> %s serves %d workflows<br><a href="javascript:showhide(\'%s\')">[show/hide]</a> <div id="%s" style="display:none;"><ul>\n' % (
            phl(ts.phedexid), t_count, ts.phedexid, ts.phedexid) + stext

        stext += "</ul></li>\n"
        if hide:
            #text+="<li> %s not needed anymore to start running (does not mean it went through completely)</li>"%phl(ts.phedexid)
            pass
        else:
            count += 1
            text += stext
    text += "</ul>"

    text_bywf = "<ul>"
    for wf in transfer_per_wf:
        text_bywf += "<li> %s </li>" % (wfl(wf, within=True))
        text_bywf += '<a href="javascript:showhide(\'transfer_%s\')">[Click to show/hide] %d transfers</a>' % (
            wf.name, len(transfer_per_wf[wf]))
        text_bywf += '<div id="transfer_%s" style="display:none;">' % wf.name
        text_bywf += "<ul>"
        for pid in sorted(transfer_per_wf[wf]):
            text_bywf += "<li> %s </li>" % (phl(pid))
        text_bywf += "</ul></div><hr>"
    text_bywf += '</ul>'

    html_doc.write("""
Transfer on-going (%d) <a href=http://cmstransferteam.web.cern.ch/cmstransferteam/ target=_blank>dashboard</a> <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('transfer')">[Click to show/hide]</a>
<br>
<div id="transfer" style="display:none;">
 <ul>
  <li> By Workflow
    <a href="javascript:showhide('transfer_bywf')">[Click to show/hide]</a>
    <div id="transfer_bywf" style="display:none;">
%s
    </div>
  </li>
  <li> By transfer request
    <a href="javascript:showhide('transfer_byreq')">[Click to show/hide]</a>
    <div id="transfer_byreq" style="display:none;"> 
%s
    </div>
  </li>
 </ul>
</div>
""" % (count, text_bywf, text))

    lap('done with transfers')

    text = ""
    count = 0
    count_by_campaign = defaultdict(lambda: defaultdict(int))
    for wf in session.query(Workflow).filter(
            Workflow.status == 'staged').all():
        wl = getWL(wf.name)
        count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])] += 1
        text += "<li> %s </li> \n" % wfl(wf, p=True)
        count += 1
    text_by_c = ""
    for c in count_by_campaign:
        text_by_c += "<li> %s (%d) : " % (c, sum(
            count_by_campaign[c].values()))
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c += "%d (%d), " % (p, count_by_campaign[c][p])
        text_by_c += "</li>"

    html_doc.write(
        """Worflow ready for assigning (%d) <a href=logs/stagor/last.log target=_blank>log</a> <a href=logs/assignor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staged')">[Click to show/hide]</a>
<br>
<div id="staged" style="display:none;">
<br>
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('staged_bywf')">[Click to show/hide]</a><div id="staged_bywf" style="display:none;">
 <ul>
 %s
 </ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('staged_bycamp')">[Click to show/hide]</a><div id="staged_bycamp" style="display:none;">
 <ul>
 %s
 </ul></div>
</ul>
</div>
""" % (count, count, text, len(count_by_campaign), text_by_c))

    lap('done with staged')

    lines = []
    count_by_campaign = defaultdict(lambda: defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status == 'away').all():
        wl = getWL(wf.name)
        count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])] += 1
        lines.append("<li> %s <hr></li>" % wfl(wf, view=True, ongoing=True))
    text_by_c = ""
    for c in count_by_campaign:
        text_by_c += "<li> %s (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/campaign.php?campaign=%s>mon</a> <a href=https://cms-pdmv.cern.ch/pmp/historical?r=%s target=_blank>pmp</a> " % (
            c, sum(count_by_campaign[c].values()), c, c)
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c += "%d (%d), " % (p, count_by_campaign[c][p])
        text_by_c += "</li>"

    lines.sort()
    html_doc.write("""
Workflows on-going (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests_in_production.php target=_blank>ongoing</a> <a href=https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=summary target=_blank>elog</a> <a href=http://cms-gwmsmon.cern.ch/prodview target=_blank>queues</a> <a href=logs/assignor/last.log target=_blank>log</a> <a href=logs/checkor/last.log target=_blank>postlog</a> <a href=logs/equalizor/last.log target=_blank>equ</a> <a href=logs/completor/last.log target=_blank>comp</a>
<a href="javascript:showhide('away')">[Click to show/hide]</a>
<br>
<div id="away" style="display:none;">
<ul> 
<li>By workflow (%d) </li>
<a href="javascript:showhide('away_bywf')">[Click to show/hide]</a><div id="away_bywf" style="display:none;">
<ul>
%s
</ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('away_bycamp')">[Click to show/hide]</a><div id="away_bycamp" style="display:none;">
<ul>
%s
</ul></div>
</ul>
</div>
""" % (len(lines), len(lines), '\n'.join(lines), len(count_by_campaign),
       text_by_c))

    lap('done with away')

    text = ""
    count = 0
    #for wf in session.query(Workflow).filter(Workflow.status == 'assistance-custodial').all():
    for wf in session.query(Workflow).filter(
            Workflow.status.startswith('assistance')).filter(
                Workflow.status.contains('custodial')).all():
        text += "<li> %s </li> \n" % wfl(
            wf, view=True, update=True, status=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""Worflow that are closing (%d)
<a href=closeout.html target=_blank>closeout</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('closing')">[Click to show/hide]</a>
<br>
<div id="closing" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    lap('done with closing')

    assistance_by_type = defaultdict(list)
    text = ""
    count = 0
    for wf in session.query(Workflow).filter(
            Workflow.status.startswith('assistance-')).all():
        assistance_by_type[wf.status].append(wf)
        count += 1
    for assistance_type in assistance_by_type:
        text += "<li> %s (%d) <a href=\"javascript:showhide('%s')\">[Click to show/hide]</a><br><div id=\"%s\" style=\"display:none;\"><ul>" % (
            assistance_type,
            len(assistance_by_type[assistance_type]),
            assistance_type,
            assistance_type,
        )
        for wf in assistance_by_type[assistance_type]:
            text += "<li> %s <hr></li> \n" % wfl(
                wf, view=True, within=True, status=True, update=True)
        text += "</ul></div></li>\n"
    html_doc.write("""Worflow which need assistance (%d)
<a href=assistance.html target=_blank>assistance</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/recoveror/last.log target=_blank>postlog</a>
<a href="javascript:showhide('assistance')">[Click to show/hide]</a>
<br>
<div id="assistance" style="display:none;">
<br>
<ul>
%s
</ul>
</div>
""" % (count, text))

    lap('done with assistance')

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == 'close').all():
        text += "<li> %s </li> \n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""Worflow ready to close (%d)
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('close')">[Click to show/hide]</a>
<br>
<div id="close" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    lap('done with announcing')

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(
            Workflow.status == 'trouble').all():
        text += "<li> %s </li> \n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write(
        """Worflow with issue (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/injector/last.log target=_blank>postlog</a>
<a href="javascript:showhide('trouble')">[Click to show/hide]</a>
<br>
<div id="trouble" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    lap('done with trouble')

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(
            Workflow.status == 'forget').all():
        text += "<li> %s </li> \n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Workflows to forget (%d) <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/lockor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('forget')">[Click to show/hide]</a>
<br>
<div id="forget" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    lap('done with forget')

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == 'done').all():
        text += "<li> %s </li> \n" % wfl(wf)  #,ms=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write("""
Workflows through (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/lockor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('done')">[Click to show/hide]</a>
<br>
<div id="done" style="display:none;">
<br>
<ul>
""" % count)
    html_doc.write(text)

    lap('done with done')

    wfs = session.query(Workflow).filter(
        Workflow.status.endswith('-unlock')).all()
    html_doc.write(
        " Workflows unlocked : %s <a href=logs/lockor/last.log target=_blank>log</a><br>"
        % (len(wfs)))
    lap('done with unlocked')

    text = ""
    lines_thisweek = []
    lines_lastweek = []
    now = time.mktime(time.gmtime())
    this_week = int(time.strftime("%W", time.gmtime()))
    start_time_two_weeks_ago = time.mktime(
        time.gmtime(now - (20 * 24 * 60 * 60)))  # 20
    last_week = int(time.strftime("%W", time.gmtime(now - (7 * 24 * 60 * 60))))

    all_locks = json.loads(open('%s/globallocks.json' % monitor_dir).read())
    waiting_custodial = json.loads(
        open('%s/waiting_custodial.json' % monitor_dir).read())
    all_pending_approval_custodial = dict([
        (k, item) for k, item in waiting_custodial.items() if 'nodes' in item
        and not any([node['decided'] for node in item['nodes'].values()])
    ])
    n_pending_approval = len(all_pending_approval_custodial)
    #n_pending_approval = len([item for item in waiting_custodial.values() if 'nodes' in item and not any([node['decided'] for node in item['nodes'].values() ])])
    missing_approval_custodial = json.loads(
        open('%s/missing_approval_custodial.json' % monitor_dir).read())

    stuck_custodial = json.loads(
        open('%s/stuck_custodial.json' % monitor_dir).read())
    lagging_custodial = json.loads(
        open('%s/lagging_custodial.json' % monitor_dir).read())
    if len(stuck_custodial):
        stuck_string = ', <font color=red>%d appear to be <a href=stuck_custodial.json>stuck</a></font>' % len(
            stuck_custodial)
    else:
        stuck_string = ''
    if len(missing_approval_custodial):
        long_approve_string = ', <font color=red>%d more than %d days</font>' % (
            len(missing_approval_custodial), UC.get('transfer_timeout'))
    else:
        long_approve_string = ''

    output_within_two_weeks = session.query(Output).filter(
        Output.date >= start_time_two_weeks_ago).all()
    waiting_custodial_string = ""
    waiting_custodial_strings = []
    for ds in waiting_custodial:
        out = None
        ## lots of it will be within two weeks
        of = filter(lambda odb: odb.datasetname == ds, output_within_two_weeks)
        if of:
            out = of[0]
        else:
            out = session.query(Output).filter(
                Output.datasetname == ds).first()
        if out:
            info = waiting_custodial[out.datasetname]
            action = 'going'
            if out.datasetname in all_pending_approval_custodial:
                action = '<font color=red>pending</font>'
            try:
                size = str(info['size'])
            except:
                size = "x"

            destination = ",".join(info['nodes'].keys())
            if not destination:
                destination = '<font color=red>NO SITE</font>'

            a_waiting_custodial_string = "<li>on week %s : %s %s</li>" % (
                time.strftime("%W (%x %X)", time.gmtime(
                    out.date)), ol(out.datasetname),
                ' %s [GB] %s to %s on %s (<a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/requestlist?dataset=%s&node=T*MSS">%d missing</a>)'
                % (size, action, destination,
                   time.asctime(time.gmtime(
                       info['checked'])), out.datasetname, info['nmissing']))
            waiting_custodial_strings.append(
                (out.date, a_waiting_custodial_string))

    waiting_custodial_strings.sort(key=lambda i: i[0])
    waiting_custodial_string = "\n".join(
        [i[1] for i in waiting_custodial_strings])
    #start_time_two_weeks_ago = time.mktime(time.strptime("15-0-%d"%(this_week-2), "%y-%w-%W"))
    for out in output_within_two_weeks:
        if not out.workflow:
            print "This is a problem with", out.datasetname
            continue
        if out.workflow.status in [
                'done-unlock', 'done', 'clean', 'clean-out', 'clean-unlock'
        ]:
            custodial = ''
            if out.datasetname in waiting_custodial:
                info = waiting_custodial[out.datasetname]
                try:
                    try:
                        size = str(info['size'])
                    except:
                        size = "x"
                    destination = ",".join(info['nodes'].keys())
                    if not destination:
                        destination = '<font color=red>NO SITE</font>'
                    action = 'going'
                    if out.datasetname in all_pending_approval_custodial:
                        action = '<font color=red>pending</font>'

                    custodial = ' %s [GB] %s to %s on %s (<a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/requestlist?dataset=%s&node=T*MSS">%d missing</a>)' % (
                        size, action, destination,
                        time.asctime(time.gmtime(info['checked'])),
                        out.datasetname, info['nmissing'])
                except Exception as e:
                    #print info
                    #print str(e)
                    pass
            elif out.datasetname in all_locks:
                custodial = '<font color=green>LOCKED</font>'
            out_week = int(time.strftime("%W", time.gmtime(out.date)))
            ##only show current week, and the previous.
            if last_week == out_week:
                lines_lastweek.append(
                    "<li>on week %s : %s %s</li>" %
                    (time.strftime("%W (%x %X)", time.gmtime(
                        out.date)), ol(out.datasetname), custodial))
            if this_week == out_week:

                lines_thisweek.append(
                    "<li>on week %s : %s %s</li>" %
                    (time.strftime("%W (%x %X)", time.gmtime(
                        out.date)), ol(out.datasetname), custodial))
    lines_thisweek.sort()
    lines_lastweek.sort()

    html_doc.write(
        """Output produced (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?in_disagreement=1 target=_blank>disagreements</a>
<a href="javascript:showhide('output')">[Click to show/hide]</a>
<br>
<div id="output" style="display:none;">
<br>
<ul>
<li> %d waiting to go to tape</li>
<ul>
<li> %d waiting for tape approval%s</li>
<li> %d are not completed after %d days%s</li>
<li> Full list (%d) <a href="javascript:showhide('waiting-custodial')">[Click to show/hide]</a>
<div id="waiting-custodial" style="display:none;">
<ul>
%s
</ul>
</div>
</li>
</ul>
<li> Last week (%d) </li><a href="javascript:showhide('output_lastweek')">[Click to show/hide]</a><div id="output_lastweek" style="display:none;"><ul>
%s
</ul></div>
<li> This week (%d) </li><a href="javascript:showhide('output_thisweek')">[Click to show/hide]</a><div id="output_thisweek" style="display:none;"><ul>
%s
</ul></div></div>
""" % (len(lines_lastweek) + len(lines_thisweek),
       len(waiting_custodial), n_pending_approval, long_approve_string,
       len(lagging_custodial), UC.get('transfer_timeout'), stuck_string,
       len(waiting_custodial), waiting_custodial_string, len(lines_lastweek),
       '\n'.join(lines_lastweek), len(lines_thisweek),
       '\n'.join(lines_thisweek)))

    lap('done with output')

    html_doc.write("""Job installed
<a href="javascript:showhide('acron')">[Click to show/hide]</a>
<br>
<div id="acron" style="display:none;">
<br>
<pre>
%s
</pre>
""" % (os.popen('acrontab -l | grep Unified | grep -v \#').read()))

    per_module = defaultdict(list)
    for t in filter(
            None,
            os.popen('cat %s/logs/*/*.time' % monitor_dir).read().split('\n')):
        module_name, run_time, spend = t.split(':')
        ## then do what you want with it !
        if 'cleanor' in module_name: continue

        per_module[module_name].append(int(spend))

    def display_time(sec):
        m, s = divmod(sec, 60)
        h, m = divmod(m, 60)
        dis = ""
        if h:
            dis += "%d [h] " % h
        if h or m:
            dis += "%d [m] " % m
        if h or m or s:
            dis += "%d [s]" % s

        return dis
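
    ## worked example: display_time(3723) -> "1 [h] 2 [m] 3 [s]";
    ## display_time(45) -> "45 [s]"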

    html_doc.write("Module running time<ul>\n")
    for m, spends in per_module.items():
        avg = sum(spends) / float(len(spends))
        lasttime = spends[-1]
        html_doc.write("<li>%s : last %s, avg %s</li>\n" %
                       (m, display_time(lasttime), display_time(avg)))
    html_doc.write("</ul>")

    html_doc.write(
        "Last running <pre>%s</pre><br>" %
        (os.popen("tac %s/logs/running | head -5" % monitor_dir).read()))

    html_doc.write("Order in cycle <pre>%s</pre><br>" % ('\n'.join(
        map(
            lambda l: l.split('/')[-1].replace('.py', ''),
            filter(
                lambda l: not l.startswith('#') and 'Unified' in l and 'py' in
                l.split('/')[-1],
                open('%s/WmAgentScripts/cycle.sh' %
                     base_dir).read().split('\n'))))))
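    ## the filter/map chain above extracts module names from cycle.sh: a
    ## (hypothetical) line such as "python /data/Unified/WmAgentScripts/transferor.py"
    ## passes the filter (not a comment, mentions Unified, last path element is
    ## a .py file) and is mapped to "transferor".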

    html_doc.write("</div>\n")
    lap('done with jobs')

    text = ""
    count = 0
    for (c, info) in campaignInfo().campaigns.items():
        #if 'go' in info and info['go']:
        text += "<li>%s <br> <pre>%s</pre>  </li>" % (
            c, json.dumps(info, indent=2))
        count += 1

    html_doc.write("""Campaign configuration
<a href="javascript:showhide('campaign')">[Click to show/hide]</a>
<br>
<div id="campaign" style="display:none;">
<br>
<ul>
%s
</ul></div>
""" % (text))

    text = ""
    count = 0
    n_column = 4
    SI = siteInfo()
    date1 = time.strftime(
        '%Y-%m-%d+%H:%M',
        time.gmtime(time.mktime(time.gmtime()) -
                    (15 * 24 * 60 * 60)))  ## 15 days
    date2 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime())
    for t in SI.types():
        text += "<li>%s<table border=1>" % t
        c = 0
        for site in getattr(SI, t):
            cpu = SI.cpu_pledges[site] if site in SI.cpu_pledges else 'N/A'
            disk = SI.disk[SI.CE_to_SE(site)] if SI.CE_to_SE(
                site) in SI.disk else 'N/A'
            if c == 0:
                text += "<tr>"
            if not disk:
                ht_disk = '<font color=red>Disk available: %s</font>' % disk
            else:
                ht_disk = 'Disk available: %s' % disk

            text += '<td><a href=http://dashb-ssb.cern.ch/dashboard/templates/sitePendingRunningJobs.html?site=%s>%s</a><br><a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a><br><a href="http://dashb-cms-job.cern.ch/dashboard/templates/web-job2/#user=&refresh=0&table=Jobs&p=1&records=25&activemenu=1&site=%s&submissiontool=wmagent&check=submitted&sortby=activity&scale=linear&bars=20&date1=%s&date2=%s">dashb</a><br>CPU pledge: %s<br>%s</td>' % (
                site, site, site, site, site, date1, date2, cpu, ht_disk)
            if c == n_column:
                c = 0
            else:
                c += 1
        text += "</table></li>"

    text += "<li> Sites in auto-approved transfer<ul>"
    for site in sorted(SI.sites_auto_approve):
        text += "<li>%s" % site
    text += "</ul></li>"

    text += "<li> Sites with vetoe transfer<ul>"
    for site in sorted(SI.sites_veto_transfer):
        text += "<li>%s" % site
    text += "</ul></li>"

    text += "<li> Sites banned from production<ul>"
    for site in sorted(SI.sites_banned):
        text += "<li>%s" % site
    text += "</ul></li>"

    text += "<li> Approximate Free Tape<ul>"
    for mss in SI.storage:
        waiting = 0
        try:
            waiting = float(
                os.popen(
                    "grep '%s is pending . Created since' %s/logs/lockor/last.log  -B 3 | grep size | awk '{ sum+=$6 ; print sum }' | tail -1"
                    % (mss, monitor_dir)).readline())
        except Exception as e:
            print str(e)

        oldest = ""
        os.system(
            'grep pending %s/logs/lockor/last.log | sort -u > %s/logs/pending.log'
            % (monitor_dir, monitor_dir))
        try:
            oldest = os.popen(
                "grep '%s is pending . Created since ' %s/logs/lockor/last.log | sort | awk '{print $10, $11, $12, $13, $14 }' | head -1"
                % (mss, monitor_dir)).readline()
        except Exception as e:
            print str(e)
        waiting /= 1024.
        text += "<li>%s : %d [TB]. Waiting for approval %d [TB] since %s </li>" % (
            mss, SI.storage[mss], waiting, oldest)
    text += "</ul></li>"

    lap('done with sites')

    open('%s/siteInfo.json' % monitor_dir, 'w').write(
        json.dumps(dict([(t, getattr(SI, t)) for t in SI.types()]), indent=2))

    lap('done with sites json')

    chart_data = defaultdict(list)
    for site in SI.quota:
        chart_data[site].append("""
var data_%s = google.visualization.arrayToDataTable([ 
['Overall', 'Space in TB'],
//['Quota' , %s],
['Locked' , %s],
['Free' , %s]
]);
""" % (
            site,
            SI.quota[site],
            SI.locked[site],
            SI.disk[site],
        ))
        chart_data[site].append("""
var chart_%s = new google.visualization.PieChart(document.getElementById('donutchart_%s'));
chart_%s.draw(data_%s, {title: '%s %s [TB]', pieHole:0.4, slices:{0:{color:'red'},1:{color:'green'}}});
""" % (site, site, site, site, site, SI.quota[site]))
        chart_data[site].append("""
<div id="donutchart_%s" style="height: 200px;width: 300px"></div>
""" % (site))

    ## make the locked/available donut chart
    donut_html = open('%s/locked.html' % monitor_dir, 'w')
    tables = "\n".join([info[0] for site, info in chart_data.items()])
    draws = "\n".join([info[1] for site, info in chart_data.items()])
    divs = "\n".join([info[2] for site, info in chart_data.items()])

    divs_table = "<table border=0>"
    for c, site in enumerate(sorted(chart_data.keys())):
        if c % 5 == 0:
            divs_table += "<tr>"
        divs_table += "<td>%s</td>" % (chart_data[site][2])
    divs_table += "</table>"

    donut_html.write("""
<html>
  <head>
    <script type="text/javascript" src="https://www.google.com/jsapi"></script>
    <script type="text/javascript">
      google.load("visualization", "1", {packages:["corechart"]});
      google.setOnLoadCallback(drawChart);
      function drawChart() {
%s

%s
      }
    </script>
  </head>
  <body>
%s
  </body>
</html>
""" % (tables, draws, divs_table))
    donut_html.close()

    html_doc.write("""Site configuration
<a href="javascript:showhide('site')">[Click to show/hide]</a>
<br>
<div id="site" style="display:none;">
<br>
<ul>
%s
</ul></div>
""" % (text))

    lap('done with space')

    text = ""
    for param in UC.configs:
        text += "<li>%s</li><ul>\n" % param
        for sub in sorted(UC.configs[param].keys()):
            text += "<li> %s : %s </li>\n" % (sub, UC.configs[param][sub])
        text += '</ul>\n'

    html_doc.write("""Unified configuration
<a href="javascript:showhide('config')">[Click to show/hide]</a>
<br>
<div id="config" style="display:none;">
<br>
<ul>
%s
</ul></div>
""" % (text))

    lap('done with configuration')

    print "... done with status page."
    html_doc.write("""
</body>
</html>
""")

    html_doc.close()
    ## and put the file in place
    os.system('mv %s/index.html.new %s/index.html' %
              (monitor_dir, monitor_dir))

    statuses = json.loads(open('%s/statusmon.json' % monitor_dir).read())
    s_count = defaultdict(int)
    now = time.mktime(time.gmtime())
    for wf in session.query(Workflow).all():
        s_count[wf.status] += 1
    statuses[now] = dict(s_count)
    ## remove old entries
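    ## (in Python 2, statuses.keys() returns a list copy, so popping while
    ## iterating is safe; Python 3 would need list(statuses) here)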
    for t in statuses.keys():
        if (now - float(t)) > 7 * 24 * 60 * 60:
            statuses.pop(t)
    open('%s/statusmon.json' % monitor_dir,
         'w').write(json.dumps(statuses, indent=2))

    html_doc = open('%s/statuses.html' % monitor_dir, 'w')
    html_doc.write("""<html>
<table border=1>
<thead>
<tr>
<th> workflow </th><th> status </th><th> wm status</th>
</tr>
</thead>
""")
    wfs = {}
    for wfo in session.query(Workflow).all():
        ## record (status, wm status) for every workflow
        wfs[wfo.name] = (wfo.status, wfo.wm_status)

    open('%s/statuses.json' % monitor_dir, 'w').write(json.dumps(wfs))
    for wfn in sorted(wfs.keys()):
        ## skip anything unlocked and consider it gone
        if 'unlock' in wfs[wfn][0]: continue
        html_doc.write(
            '<tr><td><a id="%s">%s</a></td><td>%s</td><td>%s</td></tr>\n' %
            (wfn, wfn, wfs[wfn][0], wfs[wfn][1]))
    html_doc.write("</table>")
    html_doc.write("<br>" * 100)
    html_doc.write("end of page</html>")
    html_doc.close()
Ejemplo n.º 21
0
#!/usr/bin/env python

import json
import os
from collections import defaultdict
from utils import lockInfo, siteInfo, getDatasetBlocksFraction, getDatasetChops, distributeToSites
url = 'cmsweb.cern.ch'


act = False

LI = lockInfo()
SI = siteInfo()

full_spread = defaultdict(set)

dss = set()
to_lock = set()

### should be provided as input
prim_to_distribute = [site for site in SI.sites_ready if site.startswith('T1')]
goods = ['T2_CH_CERN','T2_US','T2_DE']
for g in goods:
    prim_to_distribute.extend([site for site in SI.sites_ready if site.startswith(g)])
    
prim_to_distribute = [SI.SE_to_CE(site) for site in [SI.CE_to_SE(site) for site in prim_to_distribute]]
prim_to_distribute = list(set(prim_to_distribute))
    

print "will use these sites as destinations"
print sorted( prim_to_distribute )
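## the CE_to_SE/SE_to_CE round-trip above normalizes mixed site names: a
## storage element and its compute site (assuming the usual
## "T1_US_FNAL" <-> "T1_US_FNAL_Disk" style mapping in siteInfo) collapse
## onto the CE name before set() removes the duplicates.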
Ejemplo n.º 22
0
def main():
    url = 'cmsweb.cern.ch'
    url_tb = 'cmsweb-testbed.cern.ch'

    # Example: python assign.py -w amaltaro_RVZTT_120404_163607_6269
    # -t testbed-relval -s T1_US_FNAL -e CMSSW_6_0_0_pre1_FS_TEST_WMA -p v1 -a
    # relval -l /store/backfill/1
    usage = "usage: %prog [options] [WORKFLOW]"

    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-t', '--team', help='Type of Requests', dest='team')
    parser.add_option('-s',
                      '--sites',
                      help=' "t1" for Tier-1\'s and "t2" for Tier-2\'s',
                      dest='sites')
    parser.add_option(
        '--special',
        help=
        'Use it for special workflows. You also have to change the code according to the type of WF',
        dest='special')
    parser.add_option('-r',
                      '--replica',
                      action='store_true',
                      dest='replica',
                      default=False,
                      help='Adds a _Disk Non-Custodial Replica parameter')
    parser.add_option(
        '-p',
        '--procversion',
        help=
        'Processing Version, if empty it will leave the processing version that comes by default in the request',
        dest='procversion')
    parser.add_option(
        '-a',
        '--activity',
        help=
        'Dashboard Activity (reprocessing, production or test), if empty will set reprocessing as default',
        dest='activity')
    parser.add_option(
        '-x',
        '--xrootd',
        help='Assign with trustSiteLocation=True (allows xrootd capabilities)',
        action='store_true',
        default=False,
        dest='xrootd')
    parser.add_option('-l', '--lfn', help='Merged LFN base', dest='lfn')
    parser.add_option('-v',
                      '--verbose',
                      help='Verbose',
                      action='store_true',
                      default=False,
                      dest='verbose')
    parser.add_option('--testbed',
                      help='Assign in testbed',
                      action='store_true',
                      default=False,
                      dest='testbed')
    parser.add_option(
        '--test',
        action="store_true",
        help=
        'Nothing is injected, only print information about workflow and Era',
        dest='test')
    parser.add_option(
        '-f',
        '--file',
        help=
        'Text file with a list of workflows. If this option is used, the same settings will be applied to all workflows',
        dest='file')
    parser.add_option('-w',
                      '--workflow',
                      help='Workflow Name',
                      dest='workflow')
    parser.add_option('-e', '--era', help='Acquisition era', dest='era')
    parser.add_option("--procstr",
                      dest="procstring",
                      help="Overrides Processing String with a single string")

    (options, args) = parser.parse_args()

    if options.testbed:
        url = url_tb

    # parse input workflows and files. If both -w and -f options are used, then only the -w inputs are considered.
    if not options.workflow:
        if args:
            wfs = args
        elif options.file:
            wfs = [l.strip() for l in open(options.file) if l.strip()]
        else:
            parser.error("Input a workflow name or a file to read them")
    else:
        wfs = [options.workflow]

    #Default values
    era = {}
    procversion = 1
    procstring = {}
    replica = False
    sites = ALL_SITES
    specialStr = ''
    taskchain = False
    team = 'production'
    trust_site = False

    SI = siteInfo()
    # Handling the parameters given in the command line
    # parse site list
    if options.sites:
        if options.sites == "t1":
            sites = SI.sites_T1s
        elif options.sites == "t2":
            sites = SI.sites_T2s
        else:
            sites = [site for site in options.sites.split(',')]
    else:
        sites = SI.sites_T1s + SI.sites_T2s
    if options.team:
        team = options.team

    if options.xrootd:
        trust_site = True

    if options.replica:
        replica = True

    for wf in wfs:
        # Getting the original dictionary
        schema = getRequestDict(url, wf)
        wf = reqMgr.Workflow(wf, url=url)

        # WF must be in assignment-approved in order to be assigned
        if (schema["RequestStatus"] != "assignment-approved"):
            print("The workflow '" + wf.name +
                  "' you are trying to assign is not in assignment-approved")
            sys.exit(1)

        #Check to see if the workflow is a task chain or an ACDC of a taskchain
        taskchain = (schema["RequestType"] == "TaskChain") or (
            (schema["RequestType"] == "Resubmission")
            and "task" in schema["InitialTaskPath"].split("/")[1])

        #Dealing with era and proc string
        if taskchain:
            # Setting the Era and ProcStr values per Task
            for key, value in schema.items():
                if type(value) is dict and key.startswith("Task"):
                    try:
                        if 'ProcessingString' in value:
                            procstring[
                                value['TaskName']] = value['ProcessingString']
                        else:
                            procstring[
                                value['TaskName']] = schema['ProcessingString']
                        if 'AcquisitionEra' in value:
                            era[value['TaskName']] = value['AcquisitionEra']
                        else:
                            era[value['TaskName']] = schema['AcquisitionEra']
                    except KeyError:
                        print(
                            "This taskchain request has no AcquisitionEra or ProcessingString defined into the Tasks, aborting..."
                        )
                        sys.exit(1)
        # Adding the special string - in case it was provided in the command line
        if options.special:
            specialStr = '_' + str(options.special)
            for key, value in procstring.items():
                procstring[key] = value + specialStr
        # Override if a value is given using the procstring command
        if options.procstring:
            procstring = options.procstring
        elif not taskchain:
            procstring = wf.info['ProcessingString']
        if options.era:
            era = options.era
        elif not taskchain:
            era = wf.info['AcquisitionEra']
        #Set era and procstring to none for merge ACDCs inside a task chain
        if schema["RequestType"] == "Resubmission" and wf.info[
                "PrepID"].startswith("task") and "Merge" in schema[
                    "InitialTaskPath"].split("/")[-1]:
            era = None
            procstring = None

        # Must use --lfn option, otherwise workflow won't be assigned
        if options.lfn:
            lfn = options.lfn
        elif "MergedLFNBase" in wf.info:
            lfn = wf.info['MergedLFNBase']
        else:
            print "Can't assign the workflow! Please include workflow lfn using --lfn option."
            sys.exit(0)
        # activity: production by default for taskchains, reprocessing by default for other workflows
        if options.activity:
            activity = options.activity
        elif taskchain:
            activity = 'production'
        else:
            activity = 'reprocessing'

        # given or default processing version
        if options.procversion:
            procversion = int(options.procversion)
        else:
            procversion = wf.info["ProcessingVersion"]

        # Check for output dataset existence, and abort if output datasets already exist!
        # Don't perform this check for ACDC's
        datasets = schema["OutputDatasets"]
        i = 0
        if not (schema["RequestType"] == "Resubmission"):
            exist = False
            maxv = 1
            for key, value in schema.items():
                if type(value) is dict and key.startswith("Task"):
                    dbsapi = DbsApi(url=dbs3_url)

                    # list all datasets with same name but different version
                    # numbers
                    datasets = dbsapi.listDatasets(
                        acquisition_era_name=value['AcquisitionEra'],
                        primary_ds_name=value['PrimaryDataset'],
                        detail=True,
                        dataset_access_type='*')
                    processedName = value['AcquisitionEra'] + '-' + value[
                        'ProcessingString'] + "-v\\d+"
                    # see if any of the dataset names is a match
                    for ds in datasets:
                        if re.match(processedName, ds['processed_ds_name']):
                            print "Existing dset:", ds[
                                'dataset'], "(%s)" % ds['dataset_access_type']
                            maxv = max(maxv, ds['processing_version'])
                            exist = True
                        else:
                            pass
                    i += 1
            # suggest max version
            if exist and procversion <= maxv:
                print "Some output datasets exist, its advised to assign with v ==", maxv + 1
                sys.exit(0)

        # If the --test argument was provided, then just print the information
        # gathered so far and abort the assignment
        if options.test:
            print "%s \tEra: %s \tProcStr: %s \tProcVer: %s" % (
                wf.name, era, procstring, procversion)
            print "LFN: %s \tTeam: %s \tSite: %s" % (lfn, team, sites)
            print "Taskchain? " + str(taskchain)
            print "Activity:" + activity
            sys.exit(0)

        # Really assigning the workflow now
        print wf.name, '\tEra:', era, '\tProcStr:', procstring, '\tProcVer:', procversion, '\tTeam:', team, '\tSite:', sites
        assignRequest(url, wf.name, team, sites, era, procversion, activity,
                      lfn, procstring, trust_site, options.replica,
                      options.verbose, taskchain)

    sys.exit(0)
Ejemplo n.º 23
0
def transferor(url, specific=None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return

    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    NLI = newLockInfo()
    mcm = McMClient(dev=False)
    dss = DSS()

    #allowed_secondary = UC.get('')
    print "counting all being handled..."
    being_handled = len(
        session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('stag')).all())
    being_transferred = len(
        session.query(Workflow).filter(Workflow.status == 'staging').all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance-')).all())

    max_to_handle = options.maxworkflows
    max_to_transfer = options.maxstaging

    allowed_to_handle = max(0, max_to_handle - being_handled)
    allowed_to_transfer = max(0, max_to_transfer - being_transferred)
    wf_buffer = 5
    if allowed_to_handle <= wf_buffer:  ## buffer for having several wf per transfer
        print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer"
    else:
        print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer"

    if allowed_to_transfer <= wf_buffer:
        print "Not allowed to transfer more than", max_to_transfer, "at a time. Currently", being_transferred, "and", wf_buffer, "buffer"
    else:
        print being_transferred, "already being transferred", max_to_transfer, "max allowed,", allowed_to_transfer, "remaining", "and", wf_buffer, "buffer"

    print "... done"

    all_transfers = defaultdict(list)
    workflow_dependencies = defaultdict(
        set)  ## list of wf.id per input dataset
    wfs_and_wfh = []
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(
            Workflow.status.startswith('considered')).all():
        print "\t", wfo.name
        if specific and not specific in wfo.name: continue
        cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append((wfo,
                                workflowInfo(url,
                                             wfo.name,
                                             spec=False,
                                             request=cache_r[0])))
        else:
            wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False)))
    print "... done"

    transfers_per_sites = defaultdict(int)
    input_sizes = {}
    ignored_input_sizes = {}
    input_cput = {}
    input_st = {}
    ## list the size of those in transfer already
    in_transfer_priority = None
    min_transfer_priority = None
    print "getting all wf in staging ..."
    stucks = json.loads(open('%s/stuck_transfers.json' % monitor_dir).read())

    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfh = workflowInfo(url, wfo.name, spec=False)
        #(lheinput,primary,parent,secondary) = wfh.getIO()
        #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        for site in sites_allowed:  ## we should get the actual transfer destination instead of the full white list
            transfers_per_sites[site] += 1
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:
            ds_s = dss.get(prim)
            if prim in stucks:
                sendLog('transferor',
                        "%s appears stuck, so not counting it %s [GB]" %
                        (prim, ds_s),
                        wfi=wfh)
                ignored_input_sizes[prim] = ds_s
            else:
                input_sizes[prim] = ds_s
                sendLog('transferor',
                        "%s needs %s [GB]" % (wfo.name, ds_s),
                        wfi=wfh)
        if in_transfer_priority is None:
            in_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            in_transfer_priority = max(in_transfer_priority,
                                       int(wfh.request['RequestPriority']))
        if min_transfer_priority is None:
            min_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            min_transfer_priority = min(min_transfer_priority,
                                        int(wfh.request['RequestPriority']))

    if min_transfer_priority is None or in_transfer_priority is None:
        print "nothing is lining up for transfer"
        sendEmail("no request in staging", "no request in staging")
        return

    try:
        print "Ignored input sizes"
        ignored_values = list(ignored_input_sizes.items())
        ignored_values.sort(key=lambda i: i[1])
        print "\n".join(map(str, ignored_values))
        print "Considered input sizes"
        considered_values = list(input_sizes.items())
        considered_values.sort(key=lambda i: i[1])
        print "\n".join(map(str, considered_values))
    except Exception as e:
        print "trying to print the summary of input size"
        print str(e)

    print "... done"
    print "Max priority in transfer already", in_transfer_priority
    print "Min priority in transfer already", min_transfer_priority
    print "transfers per sites"
    print json.dumps(transfers_per_sites, indent=2)
    in_transfer_already = sum(input_sizes.values())
    cput_in_transfer_already = sum(input_cput.values())
    st_in_transfer_already = sum(input_st.values())

    ## list the size of all inputs
    primary_input_per_workflow_gb = defaultdict(float)
    print "getting all input sizes ..."
    for (wfo, wfh) in wfs_and_wfh:
        (_, primary, _, _) = wfh.getIO()
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:
            ## do not count it if it appears stalled !
            prim_size = dss.get(prim)
            input_sizes[prim] = prim_size
            primary_input_per_workflow_gb[wfo.name] += prim_size
    print "... done"

    # shuffle first, so that workflows with equal priority land in random order after the stable sort below
    random.shuffle(wfs_and_wfh)

    # Sort smallest transfers first; allows us to transfer as many as possible workflows.
    def prio_and_size(i, j):
        if int(i[1].request['RequestPriority']) == int(
                j[1].request['RequestPriority']):
            return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)),
                       int(primary_input_per_workflow_gb.get(i[0].name, 0)))
        else:
            return cmp(int(i[1].request['RequestPriority']),
                       int(j[1].request['RequestPriority']))
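
    ## cmp-style comparators only exist in Python 2; an equivalent key-based
    ## sort (a sketch, not wired in here) would be:
    ##   wfs_and_wfh.sort(key=lambda i: (int(i[1].request['RequestPriority']),
    ##                                   -primary_input_per_workflow_gb.get(i[0].name, 0)),
    ##                    reverse=True)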

    #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True)
    #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) ))
    #sort by priority higher first
    wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[
        'RequestPriority']), int(j[1].request['RequestPriority'])),
                     reverse=True)

    cput_grand_total = sum(input_cput.values())
    cput_to_transfer = cput_grand_total - cput_in_transfer_already
    st_grand_total = sum(input_st.values())
    st_to_transfer = st_grand_total - st_in_transfer_already
    print "%15.4f [CPU h] worth already in transfer" % cput_in_transfer_already
    print "%15.4f [CPU h] worth is the current requested transfer load" % cput_to_transfer
    print "%15.4f [h] worth of absolute system time in transfer" % (
        cput_in_transfer_already / SI.availableSlots())
    print "%15.4f [h] worth of absolute system time is the current requested transfer load" % (
        cput_to_transfer / SI.availableSlots())
    print "%15.4f [h] worth of theoritical system time in transfer" % (
        st_in_transfer_already)
    print "%15.4f [h] worth of theoritical system time is the current requested transfer load" % (
        st_to_transfer)

    grand_total = sum(input_sizes.values())
    to_transfer = grand_total - in_transfer_already
    grand_transfer_limit = options.maxtransfer
    #grand_transfer_limit = SI.total_disk()*0.25*1024  ## a quarter of the free space in TB->GB

    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered" % in_transfer_already
    print "%15.4f GB is the current requested transfer load" % to_transfer
    print "%15.4f GB is the global transfer limit" % grand_transfer_limit
    print "%15.4f GB is the available limit" % transfer_limit

    max_staging_per_site = options.maxstagingpersite

    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer = 0  ## so that we can count'em
    passing_along = 0
    transfer_sizes = {}
    went_over_budget = False
    destination_cache = {}
    no_goes = set()

    max_per_round = UC.get('max_per_round').get('transferor', None)
    if max_per_round and not specific:
        wfs_and_wfh = wfs_and_wfh[:max_per_round]

    for (wfo, wfh) in wfs_and_wfh:
        print wfo.name, "to be transfered with priority", wfh.request[
            'RequestPriority']

        if wfh.request['RequestStatus'] != 'assignment-approved':
            if wfh.request['RequestStatus'] in [
                    'aborted', 'rejected', 'rejected-archived',
                    'aborted-archived'
            ]:
                wfo.status = 'trouble'  ## so that we look for a replacement
            else:
                wfo.status = 'away'
            wfh.sendLog(
                'transferor', '%s in status %s, setting %s' %
                (wfo.name, wfh.request['RequestStatus'], wfo.status))
            continue

        (_, primary, _, secondary) = wfh.getIO()
        this_load = sum([input_sizes[prim] for prim in primary])
        no_budget = False
        if (this_load
                and (sum(transfer_sizes.values()) + this_load > transfer_limit
                     or went_over_budget)):
            if went_over_budget:
                wfh.sendLog('transferor', "Transfer has gone over budget.")
            else:
                wfh.sendLog('transferor', "Transfer will go over budget.")
            wfh.sendLog(
                'transferor',
                "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"
                % (this_load, sum(transfer_sizes.values()), transfer_limit))
            #if sum(transfer_sizes.values()) > transfer_limit:
            went_over_budget = True
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(
                        wfh.request['RequestPriority']
                ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s, going on over budget" %
                        (wfh.request['RequestPriority'], in_transfer_priority))
                else:
                    if not options.go:
                        wfh.sendLog(
                            'transferor',
                            "priority %s, threshold %s, minimum in transfer %s : stop" %
                            (wfh.request['RequestPriority'],
                             in_transfer_priority,
                             min_transfer_priority))
                        no_budget = True

        ## throttle by campaign go
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True
            no_goes.add(wfo.name)

        allowed_secondary = set()
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                allowed_secondary.update(CI.campaigns[campaign]['secondaries'])
        if secondary:
            if (secondary and allowed_secondary) and (
                    set(secondary) & allowed_secondary != set(secondary)):
                wfh.sendLog(
                    'assignor', '%s is not an allowed secondary' %
                    (', '.join(set(secondary) - allowed_secondary)))
                no_go = True

        if no_go:
            continue
        ## check if the batch is announced

        def check_mcm(wfn):
            announced = False
            is_real = False
            if not wfn.startswith('pdmvserv'):
                is_real = True
            try:
                for b in mcm.getA('batches', query='contains=%s' % wfn):
                    is_real = True
                    if b['status'] == 'announced':
                        announced = True
                        break
            except:
                ## retry once on a failed McM query before giving up
                try:
                    for b in mcm.getA('batches',
                                      query='contains=%s' % wfn):
                        is_real = True
                        if b['status'] == 'announced':
                            announced = True
                            break
                except:
                    print "could not get mcm batch announcement, assuming not real"
            return announced, is_real

        if not use_mcm:
            announced, is_real = False, True
        else:
            if wfh.request['RequestType'] in ['ReReco']:
                announced, is_real = True, True
            else:
                announced, is_real = check_mcm(wfo.name)

        if not announced:
            wfh.sendLog('transferor', "does not look announced.")

        if not is_real:
            wfh.sendLog('transferor', "does not appear to be genuine.")

            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue

        ## check on a grace period
        injection_time = time.mktime(
            time.strptime('.'.join(map(str, wfh.request['RequestDate'])),
                          "%Y.%m.%d.%H.%M.%S")) / (60. * 60.)
        now = time.mktime(time.gmtime()) / (60. * 60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced:
                wfh.sendLog(
                    'transferor',
                    "It is too soon to start transfer: only %3.2f h since injection" %
                    (now - injection_time))
                continue
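        ## illustrative: a request injected at 12:00 UTC is held until 16:00 UTC
        ## unless it is already announced or --go is passed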

        if passing_along >= allowed_to_handle:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(wfh.request['RequestPriority']
                       ) >= in_transfer_priority and int(
                           wfh.request['RequestPriority']
                       ) != min_transfer_priority:
                    ## higher priority, and not only this priority being transferred
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s, going on over %s" %
                        (wfh.request['RequestPriority'], in_transfer_priority,
                         max_to_handle))
                else:
                    wfh.sendLog(
                        'transferor',
                        " Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"
                        % (max_to_handle, being_handled, passing_along))
                    if not options.go:
                        ## should not allow to jump that fence
                        break

        if this_load and needs_transfer >= allowed_to_transfer:
            if in_transfer_priority != None and min_transfer_priority != None:
                if int(wfh.request['RequestPriority']
                       ) >= in_transfer_priority and int(
                           wfh.request['RequestPriority']
                       ) != min_transfer_priority:
                    ## higher priority, and not only this priority being transferred
                    wfh.sendLog(
                        'transferor',
                        "Higher priority sample %s >= %s, going on over %s" %
                        (wfh.request['RequestPriority'], in_transfer_priority,
                         max_to_transfer))
                else:
                    wfh.sendLog(
                        'transferor',
                        "Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s"
                        % (max_to_transfer, being_transfered, needs_transfer))
                    if not options.go:
                        no_budget = True

        if no_budget:
            continue

        ## the site white list considers site, campaign, memory and core information
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')

        for dataset in list(primary) + list(parent) + list(secondary):
            ## lock everything flat
            NLI.lock(dataset)

        if not sites_allowed:
            wfh.sendLog('transferor', "not possible site to run at")
            #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            sendLog('transferor',
                    "%s has no possible sites to run at" % (wfo.name),
                    level='critical')
            continue

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with run white list
            for dataset in primary:
                blocks = list(
                    set(blocks + getDatasetBlocks(
                        dataset, runs=wfh.request['RunWhitelist'])))
        if 'LumiList' in wfh.request and wfh.request['LumiList']:
            ## augment with the lumi white list; loop over all primaries, not just the last dataset
            for dataset in primary:
                blocks = list(
                    set(blocks + getDatasetBlocks(
                        dataset, lumis=wfh.request['LumiList'])))

        if blocks:
            print "Reading", len(blocks), "in block whitelist"

        can_go = True
        staging = False
        allowed = True
        primary_destinations = set()
        if primary:

            copies_needed_from_CPUh, CPUh = wfh.getNCopies()

            if talk:
                print wfo.name, 'reads', ', '.join(primary), 'in primary'
            ## chop the primary dataset
            for prim in primary:
                ## keep track of what needs what
                workflow_dependencies[prim].add(wfo.id)

                max_priority[prim] = max(max_priority[prim],
                                         int(wfh.request['RequestPriority']))

                wfh.sendLog(
                    'transferor', "Would make %s copies from the CPU requirement %s" %
                    (copies_needed_from_CPUh, CPUh))
                copies_needed = copies_needed_from_CPUh

                if 'Campaign' in wfh.request and wfh.request[
                        'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                            wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[
                        wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign,
                                        copies_needed)

                    wfh.sendLog(
                        'transferor',
                        "Maxed to %s by campaign configuration %s" %
                        (copies_needed, wfh.request['Campaign']))

                ### new ways of making the whole thing
                destinations, all_block_names = getDatasetDestinations(
                    url,
                    prim,
                    within_sites=[SI.CE_to_SE(site) for site in sites_allowed],
                    only_blocks=blocks)
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [
                    site for (site, info) in destinations.items()
                    if info['completion'] == 100 and info['data_fraction'] == 1
                ]
                ## the rest is places it is going to be
                prim_destination = [
                    site for site in destinations.keys()
                    if not site in prim_location
                ]

                if len(prim_location) >= copies_needed:
                    wfh.sendLog(
                        'transferor',
                        "The input is all fully in place at %s sites %s" %
                        (len(prim_location), sorted(prim_location)))
                    continue
                copies_needed = max(0, copies_needed - len(prim_location))
                wfh.sendLog(
                    'transferor',
                    "not counting existing copies ; now need %s" %
                    copies_needed)
                copies_being_made = [
                    sum([
                        info['blocks'].keys().count(block)
                        for site, info in destinations.items()
                        if site in prim_destination
                    ]) for block in all_block_names
                ]

                latching_on_transfers = set()
                [
                    latching_on_transfers.update(info['blocks'].values())
                    for site, info in destinations.items()
                    if site in prim_destination
                ]
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [
                    site for site in sites_allowed
                    if not SI.CE_to_SE(site) in prim_location
                ]
                prim_to_distribute = [
                    site for site in prim_to_distribute
                    if not SI.CE_to_SE(site) in prim_destination
                ]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [
                    site for site in prim_to_distribute if not any([
                        osite.startswith(site)
                        for osite in SI.sites_veto_transfer
                    ])
                ]

                wfh.sendLog(
                    'transferor',
                    "Could be going to: %s" % sorted(prim_to_distribute))
                if not prim_to_distribute or any([
                        transfers_per_sites[site] < max_staging_per_site
                        for site in prim_to_distribute
                ]):
                    ## means there are openings, let it go through
                    print "There are transfer slots available:", [
                        (site, transfers_per_sites[site])
                        for site in prim_to_distribute
                    ]
                    #for site in sites_allowed:
                    #    #increment across the board, regardless of real destination: could be changed
                    #    transfers_per_sites[site] += 1
                else:
                    if int(
                            wfh.request['RequestPriority']
                    ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                        wfh.sendLog(
                            'transferor',
                            "Higher priority sample %s >= %s, going on over the transfer slots available"
                            % (wfh.request['RequestPriority'],
                               in_transfer_priority))
                    else:
                        wfh.sendLog(
                            'transferor',
                            "Not allowed to transfer more than %s per site at a time. Going overboard for %s"
                            % (max_staging_per_site,
                               sorted([
                                   site for site in prim_to_distribute
                                   if transfers_per_sites[site] >=
                                   max_staging_per_site
                               ])))
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(
                        Transfer.phedexid == int(latching)).first()
                    if not tfo:
                        tfo = session.query(Transfer).filter(
                            Transfer.phedexid == -int(latching)).first()

                    if not tfo:
                        tfo = Transfer(phedexid=latching)
                        tfo.workflows_id = []
                        session.add(tfo)
                    else:
                        tfo.phedexid = latching  ## make sure it stays positive

                    if not wfo.id in tfo.workflows_id:
                        print "adding", wfo.id, "to", tfo.id, "with phedexid", latching
                        l = copy.deepcopy(tfo.workflows_id)
                        l.append(wfo.id)
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush(
                        )  ## even without the later commit, the next wf feeding on this transfer must see it in a query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                # reduce the number of copies required by the on-going full transfers; how do we bootstrap on waiting for them?
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0, copies_needed - min(copies_being_made))
                wfh.sendLog(
                    'transferor',
                    "Not counting the copies being made; still need %s" %
                    copies_needed)
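                ## illustrative: needing 2 copies while every block already has
                ## at least 1 destination (min(copies_being_made)=1) leaves copies_needed=1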
                if copies_needed == 0:
                    wfh.sendLog(
                        'transferor',
                        "The input is either fully in place or getting there in full somewhere with %s"
                        % latching_on_transfers)
                    can_go = True
                    continue
                elif len(prim_to_distribute) == 0:
                    wfh.sendLog(
                        'transferor',
                        "We are going to need extra copies, but no destination seems available"
                    )
                    prim_to_distribute = [
                        site for site in sites_allowed
                        if not SI.CE_to_SE(site) in prim_location
                    ]
                    prim_to_distribute = [
                        site for site in prim_to_distribute if not any([
                            osite.startswith(site)
                            for osite in SI.sites_veto_transfer
                        ])
                    ]

                if len(
                        prim_to_distribute
                ) > 0:  ## maybe that's a parameter we can play with
                    if not options or options.chop:
                        ### hard-include the tape endpoint ?
                        #tapes = [site for site in  getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')]
                        chops, sizes = getDatasetChops(
                            prim,
                            chop_threshold=options.chopsize,
                            only_blocks=blocks)
                        spreading = distributeToSites(chops,
                                                      prim_to_distribute,
                                                      n_copies=copies_needed,
                                                      weights=SI.cpu_pledges,
                                                      sizes=sizes)
                        transfer_sizes[prim] = sum(sizes)
                        if not spreading:
                            sendLog(
                                'transferor',
                                'cannot send %s to any site, it cannot fit anywhere'
                                % prim,
                                level='critical')
                            wfh.sendLog(
                                'transferor',
                                "cannot send to any site. %s does not seem to fit anywhere"
                                % (prim))
                            staging = False
                            can_go = False

                    else:
                        spreading = {}
                        for site in prim_to_distribute:
                            if blocks:
                                spreading[site] = blocks
                            else:
                                spreading[site] = [prim]
                        transfer_sizes[prim] = input_sizes[
                            prim]  ## this is approximate if blocks are specified
                    can_go = False
                    wfh.sendLog(
                        'transferor', "selected CE destinations %s" %
                        (sorted(spreading.keys())))
                    for (site, items) in spreading.items():
                        all_transfers[site].extend(items)
                        transfers_per_sites[site] += 1
                        primary_destinations.add(site)
        if not allowed:
            wfh.sendLog('transferor', "Not allowed to move on with")
            continue

        if secondary:

            override_sec_destination = []
            if wfh.request['Campaign'] in CI.campaigns and 'SecondaryLocation' in CI.campaigns[
                    wfh.request['Campaign']]:
                override_sec_destination = CI.campaigns[
                    wfh.request['Campaign']]['SecondaryLocation']

            print wfo.name, 'reads', ', '.join(secondary), 'in secondary'
            for sec in secondary:

                workflow_dependencies[sec].add(wfo.id)

                if True:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbaric, and does not show the correct picture workflow by workflow with different whitelists
                        destination_cache[sec], _ = getDatasetDestinations(
                            url, sec)  ## NO SITE WHITE LIST ADDED
                        #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])

                    ## limit to the site whitelist NOW
                    se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
                    destinations = dict([
                        (k, v) for (k, v) in destination_cache[sec].items()
                        if k in se_allowed
                    ])
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [
                        destinations.pop(site)
                        for (site, info) in destinations.items()
                        if info['data_fraction'] < 0.9
                    ]
                    sec_location = [
                        site for (site, info) in destinations.items()
                        if info['completion'] >= 95
                    ]
                    sec_destination = [
                        site for site in destinations.keys()
                        if not site in sec_location
                    ]
                else:
                    ## old style
                    presence = getDatasetPresence(url, sec)
                    sec_location = [
                        site for site, pres in presence.items()
                        if pres[1] > 90.
                    ]  ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions(url, sec)
                    sec_destination = [site for site in subscriptions]

                sec_to_distribute = [
                    site for site in sites_allowed if
                    not any([osite.startswith(site) for osite in sec_location])
                ]
                sec_to_distribute = [
                    site for site in sec_to_distribute if not any(
                        [osite.startswith(site) for osite in sec_destination])
                ]
                sec_to_distribute = [
                    site for site in sec_to_distribute if not any([
                        osite.startswith(site)
                        for osite in SI.sites_veto_transfer
                    ])
                ]
                if override_sec_destination:
                    ## intersect with where we want the PU to be
                    not_needed_anymore = list(
                        set(sec_to_distribute) - set(override_sec_destination))
                    #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sendLog(
                        'transferor',
                        "the dataset %s could be removed from %s" %
                        (sec, not_needed_anymore))
                    sec_to_distribute = list(
                        set(sec_to_distribute) & set(override_sec_destination))

                if len(sec_to_distribute) > 0:
                    print "secondary could go to", sorted(sec_to_distribute)
                    sec_size = dss.get(sec)
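                    ## presumably sec_size is in GB while SI.disk is in TB,
                    ## hence the *1024. conversion in the check below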
                    for site in sec_to_distribute:
                        site_se = SI.CE_to_SE(site)
                        if (SI.disk[site_se] * 1024.) > sec_size:
                            all_transfers[site].append(sec)
                            can_go = False
                        else:
                            print "could not send the secondary input to", site_se, "because it is too big for the available disk:", SI.disk[
                                site_se] * 1024, "GB available, need", sec_size
                            if primary_destinations and site in primary_destinations:
                                #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))
                                sendLog('transferor',
                                        '%s is too big (%s) for %s (%s)' %
                                        (sec, sec_size, site_se,
                                         SI.disk[site_se] * 1024),
                                        level='critical')
                else:
                    print "the secondary input does not have to be send to site"

        ## is it possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                wfh.sendLog(
                    'transferor',
                    "latches on existing transfers, and nothing else; setting staging"
                )
                wfo.status = 'staging'
                needs_transfer += 1
            else:
                wfh.sendLog(
                    'transferor', "should just be assigned now to %s" %
                    sorted(sites_allowed))
                wfo.status = 'staged'
            passing_along += 1
            wfh.sendLog('transferor', "setting status to %s" % wfo.status)
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                wfh.sendLog('transferor', "latches on existing transfers")
                if not options.test:
                    wfo.status = 'staging'
                    wfh.sendLog('transferor',
                                "setting status to %s" % wfo.status)
                    session.commit()
            wfh.sendLog('transferor', "needs a transfer")
            needs_transfer += 1
            passing_along += 1

    if no_goes:
        #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes ))
        sendLog('transferor',
                "No go for \n" + "\n".join(no_goes),
                level='critical')

    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id = -1
    wf_id_in_prestaging = set()

    for (site, items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## sites that do not want input datasets
        if site in SI.sites_veto_transfer:
            print site, "does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        block_datasets = list(set([it.split('#')[0] for it in blocks]))
        datasets = [it for it in items_to_transfer if not '#' in it]

        details_text = "Making a replica to %s (CE) %s (SE) for" % (site,
                                                                    site_se)

        #print "\t",len(blocks),"blocks"
        ## remove blocks if the full dataset is sent out
        blocks = [
            block for block in blocks if not block.split('#')[0] in datasets
        ]
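        ## e.g. a block /A/B/C#deadbeef is dropped here when the full dataset
        ## /A/B/C is already being transferred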
        #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        #print "\t",len(datasets),"datasets"
        #print "\t",datasets
        details_text += '\n\t%d blocks' % len(blocks)
        details_text += '\n\t%d needed blocks for %s' % (
            len(blocks),
            sorted(list(set([block.split('#')[0] for block in blocks]))))
        details_text += '\n\t%d datasets' % len(datasets)
        details_text += '\n\t%s' % sorted(datasets)

        items_to_transfer = blocks + datasets

        if execute:
            sendLog('transferor', details_text)
        else:
            print "Would make a replica to", site, "(CE)", site_se, "(SE) for"
            print details_text

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y', 'yes', 'go']:
                continue

        if execute:
            priority = 'normal'
            cds = [
                ds for ds in datasets + block_datasets if ds in max_priority
            ]
            if cds and False:  ## I don't think this is working. subscription should be updated on the fly and regularly for raising the priority if needed
                ## decide on an overall priority : that's a bit too large though
                if any([max_priority[ds] >= 90000 for ds in cds]):
                    priority = 'high'
                elif all([max_priority[ds] < 80000 for ds in cds]):
                    priority = 'low'

            result = makeReplicaRequest(url,
                                        site_se,
                                        items_to_transfer,
                                        'prestaging',
                                        priority=priority)
        else:
            result = {'phedex': {'request_created': []}}
            fake_id -= 1

        if not result:
            print "ERROR Could not make a replica request for", site, items_to_transfer, "pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(
                Transfer.phedexid == int(phedexid)).first()
            if not new_transfer:
                new_transfer = session.query(Transfer).filter(
                    Transfer.phedexid == -int(phedexid)).first()
            print phedexid, "transfer created"
            if not new_transfer:
                new_transfer = Transfer(phedexid=phedexid)
                session.add(new_transfer)
            else:
                new_transfer.phedexid = phedexid  ## make it positive again

            new_transfer.workflows_id = set()
            for transfering in list(
                    set(map(lambda it: it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update(
                    workflow_dependencies[transfering])
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status != 'staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting", tr_wf.name, "to staging"
        session.commit()
Ejemplo n.º 24
0
def equalizor(url , specific = None, options=None):
    up = componentInfo(mcm=False, soft=['mcm']) 
    if not up.check(): return 

    if not specific:
        workflows = getWorkflows(url, status='running-closed', details=True)
        workflows.extend(getWorkflows(url, status='running-open', details=True))

    ## start from scratch
    modifications = defaultdict(dict)
    ## define regionality site => fallback allowed. feed on an ssb metric ??
    mapping = defaultdict(list)
    reversed_mapping = defaultdict(list)
    regions = defaultdict(list)
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()
    for site in SI.sites_ready:
        region = site.split('_')[1]
        if not region in ['US','DE','IT']: continue
        regions[region] = [region] 

    def site_in_depletion(s):
        ## NB: short-circuited to always return True; the pressure-based
        ## selection below is currently disabled
        return True
        if s in SI.sites_pressure:
            (m, r, pressure) = SI.sites_pressure[s]
            if float(m) < float(r):
                print s,m,r,"lacking pressure"
                return True
            else:
                print s,m,r,"pressure"
                pass
                
        return False

    for site in SI.sites_ready:
        region = site.split('_')[1]
        ## fall back within the region, to sites with on-going low pressure
        mapping[site] = [fb for fb in SI.sites_ready if any([('_%s_'%(reg) in fb and fb!=site and site_in_depletion(fb))for reg in regions[region]]) ]
    

    use_T0 = ('T0_CH_CERN' in UC.get("site_for_overflow"))
    if options.t0: use_T0 = True
    #if options.augment : use_T0 = True

    use_HLT = ('T2_CH_CERN_HLT' in UC.get("site_for_overflow"))
    if options.hlt: use_HLT = True
    #if options.augment : use_HLT=True

    if use_HLT:
        mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT')

    if use_T0:
        mapping['T2_CH_CERN'].append('T0_CH_CERN')
        #mapping['T1_FR_CCIN2P3'].append('T0_CH_CERN')

    #mapping['T2_IT_Legnaro'].append('T1_IT_CNAF')
    for reg in ['IT','DE','UK']:
        mapping['T2_CH_CERN'].extend([fb for fb in SI.sites_ready if '_%s_'%reg in fb])


    ## make them appear as OK to use
    force_sites = []

    ## overflow CERN to underutilized T1s
    upcoming = json.loads( open('%s/GQ.json'%monitor_dir).read())
    for possible in SI.sites_T1s:
        if not possible in upcoming:
            mapping['T2_CH_CERN'].append(possible)

    ## remove ad-hoc sites from the overflow mapping
    prevent_sites = ['T2_US_Purdue']
    for prevent in prevent_sites:
        if prevent in mapping: mapping.pop( prevent )
    for src in mapping:
        for prevent in prevent_sites:
            if prevent in mapping[src]:
                mapping[src].remove( prevent )

    ## create the reverse mapping for the condor module
    for site,fallbacks in mapping.items():
        for fb in fallbacks:
            reversed_mapping[fb].append(site)
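    ## illustrative: mapping['T2_CH_CERN'] containing 'T2_CH_CERN_HLT' yields
    ## reversed_mapping['T2_CH_CERN_HLT'] = ['T2_CH_CERN']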

    ## this is the fallback mapping
    print "Direct mapping : site => overflow"
    print json.dumps( mapping, indent=2)
    print "Reverse mapping : dest <= from origin"
    print json.dumps( reversed_mapping, indent=2)

    altered_tasks = set()

    def running_idle( wfi , task_name):
        gmon = wfi.getGlideMon()
        #print gmon
        if not gmon: return (0,0)
        if not task_name in gmon: return (0,0)
        return (gmon[task_name]['Running'], gmon[task_name]['Idle'])

    def needs_action( wfi, task, min_idled = 100, pressure = 0.2):
        task_name = task.pathName.split('/')[-1]
        running, idled = running_idle( wfi, task_name)
        if not idled and not running:
            return False, task_name, running, idled
        if idled < min_idled:
            ## honour the threshold parameter; the original hard-coded 100 was
            ## silently overridden by the pressure test below
            return False, task_name, running, idled
        go = bool((not running and idled) or (running and (idled / float(running) > pressure)))
        return go, task_name, running, idled
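    ## illustrative: running=100, idled=150 gives idle/running = 1.5 > 0.2, so
    ## needs_action returns go=True; running=100, idled=50 fails the
    ## min_idled=100 threshold and returns go=False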

    def getPerf( task ):
        task = task.split('/')[1]+'/'+task.split('/')[-1]
        try:
            u = 'http://cms-gwmsmon.cern.ch/prodview/json/history/memoryusage720/%s'%task
            print u
            perf_data = json.loads(os.popen('curl -s --retry 5 %s'%u).read())
        except Exception as e:
            print str(e)
            return (None,None)
        buckets = perf_data['aggregations']["2"]['buckets']
        s_m = sum( bucket['key']*bucket['doc_count'] for bucket in buckets)
        w_m = sum( bucket['doc_count'] for bucket in buckets)
        m_m = max( bucket['key'] for bucket in buckets) if buckets else None
        
        b_m = None
        if w_m > 100:
            b_m = m_m

        try:
            perf_data = json.loads(os.popen('curl -s --retry 5 http://cms-gwmsmon.cern.ch/prodview/json/history/runtime720/%s'%task).read())
        except Exception as e:
            print str(e)
            return (b_m,None)

        buckets = perf_data['aggregations']["2"]['buckets']
        s_t = sum( bucket['key']*bucket['doc_count'] for bucket in buckets)
        w_t = sum( bucket['doc_count'] for bucket in buckets)
        m_t = max( bucket['key'] for bucket in buckets) if buckets else None
        
        b_t = None
        if w_t > 100:
            b_t = m_t

        return (b_m,b_t)
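    ## getPerf returns (peak memory, peak runtime) seen in the gwmsmon 720h
    ## history, each only when more than 100 samples back it, None otherwise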
        
    def getcampaign( task ):
        taskname = task.pathName.split('/')[-1]
        if hasattr( task, 'prepID'):
            return task.prepID.split('-')[1]
        elif taskname.count('-')>=1:
            return taskname.split('-')[1]
        else:
            return None

    def close( interface ):
        open('%s/equalizor.json.new'%monitor_dir,'w').write( json.dumps( interface, indent=2))
        os.system('mv %s/equalizor.json.new %s/equalizor.json'%(monitor_dir,monitor_dir))
        os.system('cp %s/equalizor.json %s/logs/equalizor/equalizor.%s.json'%(monitor_dir,monitor_dir,time.mktime(time.gmtime())))

    interface = {
        'reversed_mapping' : reversed_mapping,
        'modifications' : {}
        }
    if options.augment or options.remove:
        interface['modifications'] = json.loads( open('%s/equalizor.json'%monitor_dir).read())['modifications']

    if options.remove:
        if specific in interface['modifications']:
            print "poping",specific
            interface['modifications'].pop(specific)
            close( interface )
        return 


    PU_locations = {}
    PU_overflow = {}
    LHE_overflow = {}
    tune_performance = []

    pending_HLT = 0
    max_HLT = 60000
    pending_T0 = 0
    max_T0 = 60000
    try:
        gmon = json.loads(os.popen('curl -s http://cms-gwmsmon.cern.ch/prodview/json/T2_CH_CERN_HLT').read())
        pending_HLT += gmon["Running"]
        pending_HLT += gmon["MatchingIdle"]
    except:
        pass


    stay_within_site_whitelist = False
    specific_task=None
    if specific and ":" in specific:
        specific,specific_task = specific.split(':')

    if specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'away').all()
        
    performance = {}
    no_routing = [
        ]
    random.shuffle( wfs )
    for wfo in wfs:
        if wfo.name in no_routing and not options.augment:
            continue

        if specific and not specific in wfo.name: 
            continue
        if specific:
            wfi = workflowInfo(url, wfo.name)
        else:
            cached = filter(lambda d : d['RequestName']==wfo.name, workflows)
            if not cached : continue
            wfi = workflowInfo(url, wfo.name, request = cached[0])
        
        ## only running should get re-routed
        if not wfi.request['RequestStatus'] in ['running-open','running-closed'] and not specific: continue

        tasks_and_campaigns = []
        for task in wfi.getWorkTasks():
            tasks_and_campaigns.append( (task, getcampaign(task) ) )
        
        _,_,_,sec = wfi.getIO()

        ## check needs override
        needs_overide = False
        if not needs_overide and  options.augment: needs_overide=True

        def overide_from_agent( wfi, needs_overide):
            bad_agents = []  # e.g. 'http://cmssrv219.fnal.gov:5984'
            if not bad_agents: return needs_overide
            if needs_overide: return True
            agents = wfi.getAgents()

            wqss = ['Running','Acquired']
            if any([agent in agents.get(wqs,{}).keys() for wqs,agent in itertools.product( wqss, bad_agents)]):
                print "overriding the need for bad agent"
                needs_overide = True
            return needs_overide

        ## now parse this for action
        for i_task,(task,campaign) in enumerate(tasks_and_campaigns):
            if options.augment:
                print task.pathName
                print campaign
    
            
            tune = CI.get(campaign,'tune',options.tune)
            if tune and not campaign in tune_performance:
                tune_performance.append( campaign )

            overflow = CI.get(campaign,'overflow',{})
            if overflow:
                if "PU" in overflow and not campaign in PU_overflow:
                    PU_overflow[campaign] = copy.deepcopy(overflow['PU'])
                    print "adding",campaign,"to PU overflow rules"
                if "LHE" in overflow and not campaign in LHE_overflow:
                    print "adding",campaign,"to light input overflow rules"
                    site_list = overflow['LHE']['site_list']
                    LHE_overflow[campaign] = copy.deepcopy( getattr(SI,site_list) )
                    

            ### get the task performance, for further massaging.
            if campaign in tune_performance or options.tune:
                print "performance",task.taskType,task.pathName
                if task.taskType in ['Processing','Production']:
                    set_memory,set_time = getPerf( task.pathName )
                    #print "Performance %s GB %s min"%( set_memory,set_time)
                    wfi.sendLog('equalizor','Performance tuning to %s GB %s min'%( set_memory,set_time))
                    ## get values from gwmsmon
                    # massage the values : 95th percentile
                    performance[task.pathName] = {}
                    if set_memory:
                        performance[task.pathName]['memory']=set_memory
                    if set_time and False:  ## time tuning deliberately disabled
                        performance[task.pathName]['time'] = set_time
            
            ### rule to avoid the issue of taskchain secondary jobs being stuck at sites processing the initial step
            if campaign in LHE_overflow:
                if task.taskType in ['Processing']:
                    needs, task_name, running, idled = needs_action(wfi, task)
                    needs_overide = overide_from_agent( wfi, needs_overide)
                    extend_to = list(set(copy.deepcopy( LHE_overflow[campaign] )))
                    if stay_within_site_whitelist:
                        extend_to = list(set(extend_to) & set(wfi.request['SiteWhitelist'])) ## restrict to stupid-site-whitelist
                    extend_to = list(set(extend_to) & set(SI.sites_ready + force_sites))

                    if extend_to and (needs or needs_overide):

                        modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : extend_to ,"Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                        wfi.sendLog('equalizor','%s of %s is running %d and pending %d, taking action : ReplaceSiteWhitelist \n %s'%( task_name,
                                                                                                                                      wfo.name,
                                                                                                                                      running,
                                                                                                                                      idled ,
                                                                                                                                      json.dumps( sorted(modifications[wfo.name][task.pathName]['ReplaceSiteWhitelist']))))

                        altered_tasks.add( task.pathName )
                    else:
                        wfi.sendLog('equalizor','%s of %s is running %d and pending %d'%( task_name, wfo.name, running, idled))
                        


            ### overflow the 76 digi-reco to the site holding the pileup
            if campaign in PU_overflow:
                force = PU_overflow[campaign]['force'] if 'force' in PU_overflow[campaign] else False
                secondary_locations = set(SI.sites_ready + force_sites)
                for s in sec:
                    if not s in PU_locations:
                        presence = getDatasetPresence( url, s)
                        #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
                        one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]
                        PU_locations[s] = one_secondary_locations
                    print "secondary is at",sorted(PU_locations[s])
                    secondary_locations = set([SI.SE_to_CE(site) for site in PU_locations[s]]) & secondary_locations
                    
                ## we should add all sites that hold the secondary input if any
                ### given that we have the secondary location available, it is not necessary to use the ad-hoc list
                ##secondary_locations = list(set(PU_overflow[campaign]['sites']) & set( SI.sites_ready ))

                if any([task.pathName.endswith(finish) for finish in ['_0','StepOneProc','Production']]) :
                    needs, task_name, running, idled = needs_action(wfi, task)
                    ## removing the ones in the site whitelist already since they encode the primary input location
                    if stay_within_site_whitelist:
                        original_site_in_use = set(wfi.request['SiteWhitelist'])
                    else:
                        original_site_in_use = set(secondary_locations)
                    ## remove the sites that have already running jobs
                    gmon = wfi.getGlideMon()
                    if gmon and task_name in gmon and 'Sites' in gmon[task_name]:
                        site_in_use = set(gmon[task_name]['Sites'])
                        print "removing",sorted(site_in_use)
                        ## that determines where you want to run in addition
                        augment_by = list((set(secondary_locations)- site_in_use) & original_site_in_use)
                    else:
                        print "no existing running site"
                        augment_by = list(original_site_in_use)

                    needs_overide = overide_from_agent( wfi, needs_overide)
                    if augment_by and (needs or needs_overide or force) and PU_overflow[campaign]['pending'] < PU_overflow[campaign]['max']:
                        PU_overflow[campaign]['pending'] += idled
                        print "raising overflow to",PU_overflow[campaign]['pending'],"for",PU_overflow[campaign]['max']
                        ## the step with an input ought to be the digi part : make this one go anywhere
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : augment_by , "Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                        altered_tasks.add( task.pathName )
                        wfi.sendLog('equalizor','%s of %s is running %d and pending %d, taking action : AddWhitelist \n %s'%( task_name, wfo.name,
                                                                                                                              running, idled,
                                                                                                                              json.dumps( sorted(augment_by), indent=2 )))
                    else:
                        print task_name,"of",wfo.name,"running",running,"and pending",idled

            ### overflow the skims back to multi-core 
            if campaign in ['Run2015D','Run2015C_25ns'] and task.taskType =='Skim':
                original_swl = wfi.request['SiteWhitelist']
                needs, task_name, running, idled = needs_action(wfi, task)
                if (needs or needs_overide):
                    modifications[wfo.name][task.pathName] = { 'AddWhitelist' : original_swl, 
                                                               "Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                    altered_tasks.add( task.pathName )
                    wfi.sendLog('equalizor','%s of %s is running %d and pending %d, taking action : AddWhitelist \n %s'%( task_name, wfo.name,
                                                                                                                              running, idled,
                                                                                                                          json.dumps( sorted(original_swl), indent=2 )))


            if options.augment:
                print sorted(wfi.request['SiteWhitelist']),i_task,use_HLT

            ### add the HLT at partner of CERN
            if 'T2_CH_CERN' in wfi.request['SiteWhitelist'] and i_task in [0,1] and use_HLT:
                needs, task_name, running, idled = needs_action(wfi, task)
                if options.augment: needs=True
                needs = True
                ##needs = random.random()<0.40 remove the random, just add up to a limit
                if (needs or needs_overide) and pending_HLT < max_HLT:
                    pending_HLT += idled
                    if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["AddWhitelist"].append( "T2_CH_CERN_HLT" )
                        print "\t",wfo.name,"adding addHLT up to",pending_HLT,"for",max_HLT
                        print task.pathName
                    ## this Replace does not work at all for HLT
                    #elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                        #modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T2_CH_CERN_HLT" )
                        #print "\t",wfo.name,"adding replace HLT up to",pending_HLT,"for",max_HLT
                    else:
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : ["T2_CH_CERN_HLT"],
                                                                   "Priority" : wfi.request['RequestPriority'],
                                                                   "Running" : running,
                                                                   "Pending" : idled}
                        wfi.sendLog('equalizor','adding the HLT in whitelist of %s to %d for %d'%( task.pathName, pending_HLT, max_HLT))

            if i_task==0 and not sec and use_T0:
                needs, task_name, running, idled = needs_action(wfi, task)
                
                if options.augment: needs=True
                #needs = True
                good_type = wfi.request['RequestType'] in ['MonteCarlo','MonteCarloFromGEN'] 
                read_lhe = ((not 'LheInputFiles' in wfi.request) or bool(wfi.request['LheInputFiles']))
                good_type &= not read_lhe
                if not good_type and not options.augment: needs = False
                
                ##needs = random.random()<0.40 remove the random, just add up to a limit
                if (needs or needs_overide):
                    pending_T0 += idled
                    if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        if not "T0_CH_CERN" in modifications[wfo.name][task.pathName]["AddWhitelist"]:
                            modifications[wfo.name][task.pathName]["AddWhitelist"].append( "T0_CH_CERN" )
                            wfi.sendLog('equalizor','adding the T0 for %s to %d for %d'%( task.pathName, pending_T0, max_T0))
                    elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                        if not "T0_CH_CERN" in modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"]:
                            modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T0_CH_CERN" )
                            wfi.sendLog('equalizor','adding the T0 to replacement for %s to %d for %d'%( task.pathName, pending_T0, max_T0))
                    else:
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : ["T0_CH_CERN"],
                                                                   "Priority" : wfi.request['RequestPriority'],
                                                                   "Running" : running,
                                                                   "Pending" : idled}
                        wfi.sendLog('equalizor','adding the T0 for %s to %d for %d'%( task.pathName, pending_T0, max_T0))


    interface['modifications'].update( modifications )



    ###  manage the number of core and job resizing
    interface['cores']={'T2_CH_CERN_HLT': {'min':4,'max':16}, 'default': {'min':1, 'max':4}}
    interface['resizes'] = ['RunIISpring16DR80']

    ### manage the modification of the memory and target time
    interface['time'] = defaultdict(list)
    interface['memory'] = defaultdict(list)

    max_N_mem = 10
    max_N_time = 10
    ## discretize memory and time to at most 10 distinct values each
    mems = set([o['memory'] for t,o in performance.items() if 'memory' in o])
    times = set([o['time'] for t,o in performance.items() if 'time' in o])
    if len(mems)>max_N_mem:
        mem_step = int((max(mems) - min(mems))/ float(max_N_mem))
        for t in performance:
            if not 'memory' in performance[t]: continue
            (m,r) = divmod(performance[t]['memory'], mem_step)
            performance[t]['memory'] = (m+1)*mem_step
    if len(times)>max_N_time:
        time_step = int((max(times) - min(times))/float(max_N_time))
        for t in performance:
            if not 'time' in performance[t]: continue
            (m,r) = divmod(performance[t]['time'], time_step)
            performance[t]['time'] = (m+1)*time_step
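    ## illustrative: mems spanning 1000..3000 with max_N_mem=10 gives mem_step=200;
    ## divmod(2350, 200) = (11, 150), so 2350 is rounded up to 12*200 = 2400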

    for t,o in performance.items():
        if 'time' in o:
            interface['time'][str(o['time'])] .append( t )
        if 'memory' in o:
            interface['memory'][str(o['memory'])].append( t )

    ## close and save
    close( interface )
Ejemplo n.º 25
0
def stagor(url,specific =None, options=None):
    
    if not componentInfo().check(): return
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()

    done_by_wf_id = {}
    done_by_input = {}
    completion_by_input = {}
    good_enough = 100.0
    
    lost = json.loads(open('lost_blocks_datasets.json').read())
    still_lost = []
    for dataset in lost:
        l = findLostBlocks(url ,dataset)
        if not l:
            print dataset,"is not really lost"
        else:
            still_lost.append( dataset )
    open('lost_blocks_datasets.json','w').write( json.dumps( still_lost, indent=2) )

    if options.fast:
        print "doing the fast check of staged with threshold:",options.goodavailability
        for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all():
            if specific and not specific in wfo.name: continue
            wfi = workflowInfo(url, wfo.name)
            sites_allowed = getSiteWhiteList( wfi.getIO() )
            if 'SiteWhitelist' in CI.parameters(wfi.request['Campaign']):
                sites_allowed = CI.parameters(wfi.request['Campaign'])['SiteWhitelist']
            if 'SiteBlacklist' in CI.parameters(wfi.request['Campaign']):
                sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfi.request['Campaign'])['SiteBlacklist']))
            _,primaries,_,secondaries = wfi.getIO()
            se_allowed = [SI.CE_to_SE(site) for site in sites_allowed] 
            all_check = True
            for dataset in list(primaries):#+list(secondaries) ?
                #print se_allowed
                available = getDatasetBlocksFraction( url , dataset , sites=se_allowed )
                all_check &= (available >= options.goodavailability)
                if not all_check: break

            if all_check:
                print "\t\t",wfo.name,"can go staged"
                wfo.status = 'staged'
                session.commit()
            else:
                print "\t",wfo.name,"can wait a bit more"
        return 

    for wfo in session.query(Workflow).filter(Workflow.status == 'staging').all():
        wfi = workflowInfo(url, wfo.name)
        _,primaries,_,secondaries = wfi.getIO()
        for dataset in list(primaries)+list(secondaries):
            done_by_input[dataset] = {}
            completion_by_input[dataset] = {}
            print wfo.name,"needs",dataset

    for transfer in session.query(Transfer).all():
        if specific  and str(transfer.phedexid)!=str(specific): continue

        skip=True
        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf: 
                if tr_wf.status == 'staging':
                    print "\t",transfer.phedexid,"is staging for",tr_wf.name
                    skip=False

        if skip: continue
        if transfer.phedexid<0: continue

        ## check the status of transfers
        checks = checkTransferApproval(url,  transfer.phedexid)
        approved = all(checks.values())
        if not approved:
            print transfer.phedexid,"is not yet approved"
            approveSubscription(url, transfer.phedexid)
            continue

        ## check on transfer completion
        checks = checkTransferStatus(url, transfer.phedexid, nocollapse=True)

        if not specific:
            for dsname in checks:
                if not dsname in done_by_input: done_by_input[dsname]={}
                if not dsname in completion_by_input: completion_by_input[dsname] = {}
                done_by_input[dsname][transfer.phedexid]=all(map(lambda i:i>=good_enough, checks[dsname].values()))
                completion_by_input[dsname][transfer.phedexid]=checks[dsname].values()
        if checks:
            print "Checks for",transfer.phedexid,[node.values() for node in checks.values()]
            done = all(map(lambda i:i>=good_enough,list(itertools.chain.from_iterable([node.values() for node in checks.values()]))))
        else:
            ## it is empty; is that a sign that all is done and away?
            print "ERROR with the subscriptions API of",transfer.phedexid
            print "Most likely something else is overriding the transfer request. Need to work on finding the replacement automatically, if the replacement exists"
            done = False

        ## the thing above is NOT giving the right number
        #done = False

        for wfid in transfer.workflows_id:
            tr_wf = session.query(Workflow).get(wfid)
            if tr_wf:# and tr_wf.status == 'staging':  
                if not tr_wf.id in done_by_wf_id: done_by_wf_id[tr_wf.id]={}
                done_by_wf_id[tr_wf.id][transfer.phedexid]=done


        if done:
            ## transfer.status = 'done'
            print transfer.phedexid,"is done"
        else:
            print transfer.phedexid,"not finished"
            pprint.pprint( checks )

    #print done_by_input
    print "\n----\n"
    for dsname in done_by_input:
        fractions = None
        if dsname in completion_by_input:
            ## completion values for this dataset only, materialized so it
            ## can be consumed more than once below
            fractions = list(itertools.chain.from_iterable([check.values() for check in completion_by_input[dsname].values()]))
        
        ## the workflows in the waiting room for the dataset
        using_its = getWorkflowByInput(url, dsname)
        #print using_its
        using_wfos = []
        for using_it in using_its:
            wf = session.query(Workflow).filter(Workflow.name == using_it).first()
            if wf:
                using_wfos.append( wf )
        
        if not len(done_by_input[dsname]):
            print "For dataset",dsname,"there are no transfer report. That's an issue."
            for wf in using_wfos:
                if wf.status == 'staging':
                    if UC.get("stagor_sends_back"):
                        print "sending",wf.name,"back to considered"
                        wf.status = 'considered'
                        session.commit()
                        sendEmail( "send back to considered","%s was send back and might be trouble"% wf.name)
                    else:
                        print "would send",wf.name,"back to considered"
                        sendEmail( "subscription lagging behind","susbscriptions to get %s running are not appearing in phedex. I would have send it back to considered but that's not good."% wf.name)
            continue

        #need_sites = int(len(done_by_input[dsname].values())*0.7)+1
        need_sites = len(done_by_input[dsname].values())
        #if need_sites > 10:            need_sites = int(need_sites/2.)
        got = done_by_input[dsname].values().count(True)
        if all([wf.status != 'staging' for wf in using_wfos]):
            ## not a single ds-using wf is in staging => moved on already
            ## just forget about it
            print "presence of",dsname,"does not matter anymore"
            print "\t",done_by_input[dsname]
            print "\t",[wf.status for wf in using_wfos]
            print "\tneeds",need_sites
            continue #??
            
        ## should need_sites reduce with time ?
        # with dataset chopping, reducing that number might work as a block black-list.

        if len(done_by_input[dsname].values()) and all(done_by_input[dsname].values()):
            print dsname,"is everywhere we wanted"
            ## the input dataset is fully transferred, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name,"is with us. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        elif fractions and len(fractions)>1 and len(set(fractions))==1:
            print dsname,"is everywhere at the same fraction"
            print "We do not want this in the end. we want the data we asked for"
            continue
            ## the input dataset is fully transferred, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name,"is with us everywhere the same. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        elif got >= need_sites:
            print dsname,"is almost everywhere we wanted"
            #print "We do not want this in the end. we want the data we asked for"
            #continue
            ## the input dataset is fully transferred, should consider setting the corresponding wf to staged
            for wf in using_wfos:
                if wf.status == 'staging':
                    print wf.name,"is almost with us. setting staged and move on"
                    wf.status = 'staged'
                    session.commit()
        else:
            print "incomplete",dsname
            lost = findLostBlocks(url, dsname)
            try:
                known_lost = json.loads(open('lost_blocks_datasets.json').read())
            except:
                print "enable to get the known_lost from local json file"
                known_lost = []

            if lost and not dsname in known_lost:
                lost_names = [item['name'] for item in lost]
                ## make a deeper investigation of the block location to see whether it's really nowhere to be found

                print "We have lost",len(lost),"blocks",lost_names
                #print json.dumps( lost , indent=2 )
                sendEmail('we have lost a few blocks', str(len(lost))+" in total.\nDetails:\n"+json.dumps( lost , indent=2 ))
                known_lost.append(dsname)
                rr= open('lost_blocks_datasets.json','w')
                rr.write( json.dumps( known_lost, indent=2))
                rr.close()
                ## should the status be changed to held-staging and pending on a ticket



            print "\t",done_by_input[dsname]
            print "\tneeds",need_sites
            print "\tgot",got

    for wfid in done_by_wf_id:
        #print done_by_wf_id[wfid].values()
        ## ask that all related transfer get into a valid state
        if all(done_by_wf_id[wfid].values()):
            pass
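
A self-contained sketch of stagor's completion test: the per-dataset, per-node percentages returned by checkTransferStatus are flattened and every value must reach good_enough (dataset and node names below are invented for illustration):

import itertools

checks = {
    '/A/B/RAW': {'T1_XX_Site_Disk': 100.0, 'T2_YY_Site': 100.0},
    '/C/D/AOD': {'T1_XX_Site_Disk': 97.5},
}
good_enough = 100.0
fractions = list(itertools.chain.from_iterable(
    node.values() for node in checks.values()))
print all(f >= good_enough for f in fractions)  ## False: /C/D/AOD at 97.5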
Ejemplo n.º 26
0
def actor(url, options=None):

    if moduleLock(wait=False, silent=True)(): return
    if userLock('actor'): return

    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return

    # CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()

    # Need to look at the actions page https://vocms0113.cern.ch:80/getaction (can add ?days=20) and perform any actions listed
    try:
        action_list = json.loads(
            os.popen(
                'curl -s -k https://vocms0113.cern.ch:80/getaction?days=15').
            read())
        ## now we have a list of things that we can take action on
    except:
        try:
            action_list = json.loads(
                os.popen(
                    'curl -s -k https://vocms0113.cern.ch/getaction?days=15').
                read())
        except:
            print "Not able to load action list :("
            sendLog('actor', 'Not able to load action list', level='critical')
            return

    if options.actions:
        action_list = json.loads(open(options.actions).read())

    print json.dumps(action_list, indent=2)
    if not action_list:
        print "EMPTY!"
        return

    wf_list = action_list.keys()
    print json.dumps(sorted(wf_list), indent=2)
    if options.spec:
        wf_list = [wf for wf in wf_list if options.spec in wf]

    max_per_round = UC.get('max_per_round').get('actor', None)
    if max_per_round:
        random.shuffle(wf_list)
        wf_list = wf_list[:max_per_round]

    for wfname in wf_list:
        print '-' * 100
        print "Looking at", wfname, "for recovery options"

        to_clone = False
        to_acdc = False
        to_force = False
        to_hold = False
        something_to_do = False
        tasks = action_list[wfname].get('Parameters', None)
        to_acdc = action_list[wfname].get('Action', None) == 'acdc'
        to_clone = action_list[wfname].get('Action', None) == 'clone'
        to_force = action_list[wfname].get(
            'Action', None) == 'special' and action_list[wfname].get(
                'Parameters', {}).get('action', None) in ['by-pass', 'bypass']
        to_hold = action_list[wfname].get(
            'Action', None) == 'special' and action_list[wfname].get(
                'Parameters', {}).get('action', None) in ['onhold', 'on-hold']

        if not to_acdc and not to_clone and not to_force and not to_hold:
            sendLog(
                'actor',
                'Action submitted for something other than acdc, clone, bypass or hold for workflow %s'
                % wfname,
                level='critical')
            print json.dumps(action_list[wfname], indent=2)
            continue

        if not tasks and to_acdc:
            sendLog('actor',
                    'Empty action submitted for workflow %s' % wfname,
                    level='critical')
            print "Moving on. Parameters is blank for " + wfname
            continue

        wfi = workflowInfo(url, wfname)

        recover = True
        message_to_ops = ""
        message_to_user = ""

        #===========================================================
        if to_clone and options.do:
            print "Let's try kill and clone: "
            wfi.sendLog('actor', 'Going to clone %s' % wfname)
            results = []
            datasets = set(wfi.request['OutputDatasets'])

            comment = ""

            if 'comment' in tasks: comment = ", reason: " + tasks['comment']
            wfi.sendLog(
                'actor',
                "invalidating the workflow by traffic controller %s" % comment)

            #Reject all workflows in the family
            #first reject the original workflow.
            reqMgrClient.invalidateWorkflow(
                url,
                wfi.request['RequestName'],
                current_status=wfi.request['RequestStatus'],
                cascade=False)
            #Then reject any ACDCs associated with that workflow
            family = getWorkflowById(url, wfi.request['PrepID'], details=True)
            for fwl in family:
                print "rejecting", fwl['RequestName'], fwl['RequestStatus']
                wfi.sendLog(
                    'actor', "rejecting %s, previous status %s" %
                    (fwl['RequestName'], fwl['RequestStatus']))
                reqMgrClient.invalidateWorkflow(
                    url,
                    fwl['RequestName'],
                    current_status=fwl['RequestStatus'],
                    cascade=False)
                datasets.update(fwl['OutputDatasets'])
            #Invalidate all associated output datasets
            for dataset in datasets:
                results.append(setDatasetStatus(dataset, 'INVALID'))

            if all(map(lambda result: result in ['None', None, True],
                       results)):
                wfi.sendLog('actor', "%s and children are rejected" % wfname)

            cloned = None
            try:
                cloned = singleClone(url, wfname, tasks, comment, options.do)
            except Exception as e:
                sendLog(
                    'actor',
                    'Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.'
                    % wfname,
                    level='critical')
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                print str(e)
                ##let's not remove the action, otherwise the workflow goes to "trouble" and the WTC cannot set the action again
                #remove_action(wfname)
            if not cloned:
                recover = False
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                sendLog('actor',
                        'Failed to create clone for %s!' % wfname,
                        level='critical')

            else:
                wfi.sendLog('actor', "Workflow %s cloned" % wfname)

#===========================================================
        elif to_force:
            wfi.sendLog('actor',
                        'Bypassing from workflow traffic controller request')
            forcing = json.loads(
                open(
                    '/afs/cern.ch/user/v/vlimant/public/ops/forcecomplete.json'
                ).read())
            forcing.append(wfname)
            open('/afs/cern.ch/user/v/vlimant/public/ops/forcecomplete.json',
                 'w').write(json.dumps(sorted(set(forcing))))
        elif to_hold:
            wfi.sendLog('actor',
                        'Holding on workflow traffic controller request')
            holding = json.loads(
                open('/afs/cern.ch/user/v/vlimant/public/ops/onhold.json').
                read())
            holding.append(wfname)
            open('/afs/cern.ch/user/v/vlimant/public/ops/onhold.json',
                 'w').write(json.dumps(sorted(set(holding))))
#===========================================================
        elif to_acdc:
            if 'AllSteps' in tasks:
                allTasksDefaults = tasks['AllSteps']
                tasks.pop('AllSteps')
                for setting in allTasksDefaults:
                    for task in tasks:
                        ## each task entry is a dict; apply the AllSteps
                        ## default whether or not the setting is present
                        tasks[task][setting] = allTasksDefaults[setting]
            print "Tasks is "
            print json.dumps(tasks, indent=2)

            all_tasks = wfi.getAllTasks()

            ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves

            try:
                WMErr = wfi.getWMErrors()
#               print WMErr
            except:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because WMErr cannot be reached.'
                    % wfname,
                    level='critical')
                continue
            if not WMErr:
                wfi.sendLog('actor', 'WMErrors is blank for %s.' % wfname)
                print "FYI getWMErrors is blank. Presumably there are only unreported errors"
#                continue

            try:
                where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo(
                )
                print "Where to run = "
                print where_to_run
            except:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because recovery info cannot be found.'
                    % wfname,
                    level='critical')
                print "Moving on. Cannot access recovery info for " + wfname
                continue
            if not where_to_run:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because site list cannot be found.'
                    % wfname,
                    level='critical')
                print "Moving on. where to run is blank"
                continue

            message_to_ops = ""
            message_to_user = ""

            num_tasks_to_recover = 0

            if WMErr:
                for task in WMErr:
                    if 'LogCollect' in task: continue
                    if 'Cleanup' in task: continue
                    if not 'jobfailed' in WMErr[task]:
                        continue
                    else:
                        num_tasks_to_recover += 1
#                print "Task to recover: " + task

            if not num_tasks_to_recover:
                print "\tno error for", wfname
#            recover = False

            if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']:
                ## we do not try to recover pLHE
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because it is a pLHE workflow.'
                    % wfname,
                    level='critical')
                print "We don't try to recover pLHE. Moving on."
                recover = False
        #            sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname)

#        if wfi.request['RequestType'] in ['ReReco']:
#            recover= False
#            print 'cannot submit action. ReReco'
#   sendEmail('cannot submit action', '%s is request type ReReco'%wfname)

            recovering = set()
            for task in tasks:
                assign_to_sites = set()
                print "Task names is " + task
                fulltaskname = '/' + wfname + '/' + task
                #                print "Full task name is " + fulltaskname
                wrong_task = False
                for task_info in all_tasks:
                    if fulltaskname == task_info.pathName:
                        if task_info.taskType not in [
                                'Processing', 'Production', 'Merge'
                        ]:
                            wrong_task = True
                            wfi.sendLog(
                                'actor',
                                "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks"
                                % (fulltaskname, task_info.taskType))
                if wrong_task:
                    continue
                print tasks[task]
                actions = tasks[task]
                for action in actions:
                    if action.startswith('sites'):
                        if type(actions[action]) != list:
                            assign_to_sites = [SI.SE_to_CE(actions[action])]
                        else:
                            assign_to_sites = list(
                                set([
                                    SI.SE_to_CE(site)
                                    for site in actions[action]
                                ]))
#                    if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']:
#                        recover = False;
#                        print  "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname
#                        wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname)
                if not 'sites' in actions:
                    assign_to_sites = list(
                        set([SI.SE_to_CE(site)
                             for site in where_to_run[task]]))
                    print "Found", sorted(
                        assign_to_sites
                    ), "as sites where to run the ACDC at, from the acdc doc of ", wfname
                print "Going to run at", sorted(assign_to_sites)
                if recover:
                    print "Initiating recovery"
                    acdc = singleRecovery(url,
                                          fulltaskname,
                                          wfi.request,
                                          actions,
                                          do=options.do)
                    if not acdc:
                        if options.do:
                            if recovering:
                                print wfname + " has been partially ACDC'ed. Needs manual attention."
                                sendLog(
                                    'actor',
                                    "%s has had %s/%s recoveries %s only" %
                                    (wfname, len(recovering),
                                     num_tasks_to_recover, list(recovering)),
                                    level='critical')
                                wfi.sendLog(
                                    'actor',
                                    "%s has had %s/%s recoveries %s only" %
                                    (wfname, len(recovering),
                                     num_tasks_to_recover, list(recovering)))
                                break
                            else:
                                print wfname + " failed recovery once"
                                recover = False
                                break
                        else:
                            print "no action to take further"
                            #                        sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical')
                            continue

                    else:  #ACDC was made correctly. Now we have to assign it.
                        wfi.sendLog(
                            'actor',
                            'ACDC created for task %s. Actions taken \n%s' %
                            (fulltaskname, json.dumps(actions)))
                        #team = wfi.request['Teams'][0]
                        team = 'production'
                        parameters = {
                            'SiteWhitelist': sorted(assign_to_sites),
                            'AcquisitionEra': wfi.acquisitionEra(),
                            'ProcessingString': wfi.processingString(),
                            'MergedLFNBase': wfi.request['MergedLFNBase'],
                            'ProcessingVersion':
                            wfi.request['ProcessingVersion'],
                        }
                        ## hackery for ACDC merge assignment
                        if wfi.request[
                                'RequestType'] == 'TaskChain' and 'Merge' in task.split(
                                    '/')[-1]:
                            parameters['AcquisitionEra'] = None
                            parameters['ProcessingString'] = None

                        ## xrootd settings on primary and secondary
                        if 'xrootd' in actions:
                            if actions['xrootd'] == 'enabled':
                                print "Going to assign via xrootd"
                                parameters['TrustSitelists'] = True
                            elif actions['xrootd'] == 'disabled':
                                parameters['TrustSitelists'] = False
                        elif ('TrustSitelists' in wfi.request
                              and wfi.request['TrustSitelists'] == 'true'):
                            parameters['TrustSitelists'] = True
                        else:
                            parameters['TrustSitelists'] = False

                        if 'secondary' in actions:
                            if actions['secondary'] == 'enabled':
                                print 'Enabling reading the secondary input via xrootd'
                                parameters['TrustPUSitelists'] = True
                            elif actions['secondary'] == 'disabled':
                                parameters['TrustPUSitelists'] = False
                            #in case secondary is blank or not set to enabled or disabled
                            elif 'TrustPUSitelists' in wfi.request and wfi.request[
                                    'TrustPUSitelists']:
                                parameters['TrustPUSitelists'] = True
                        elif 'TrustPUSitelists' in wfi.request and wfi.request[
                                'TrustPUSitelists']:
                            parameters['TrustPUSitelists'] = True

                        if options.ass:
                            print "really doing the assignment of the ACDC", acdc
                            parameters['execute'] = True
                            wfi.sendLog('actor',
                                        "%s  was assigned for recovery" % acdc)
                        else:
                            print "no assignment done with this ACDC", acdc
                            sendLog('actor',
                                    "%s needs to be assigned" % (acdc),
                                    level='critical')
                            continue

#                       print parameters
                        result = reqMgrClient.assignWorkflow(
                            url, acdc, team, parameters)
                        if not result:
                            print acdc, "was not assigned"
                            sendLog('actor',
                                    "%s needs to be assigned" % (acdc),
                                    level='critical')
                        else:
                            recovering.add(acdc)
                        wfi.sendLog('actor', "ACDCs created for %s" % wfname)
        #===========================================================

        if recover and options.do:
            remove_action(wfname)

        if message_to_user:
            print wfname, "to be notified to user(DUMMY)", message_to_user

        if message_to_ops:
            print 'message'
            #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**'])
        #            sendLog('recoveror',message_to_ops,level='warning')

    return
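
The 'AllSteps' handling in actor broadcasts shared settings to every task entry of the action document; since each per-task entry is a dict, the merge reduces to a plain key assignment. A small sketch with hypothetical task names and settings:

def apply_all_steps(tasks):
    ## tasks maps task name -> dict of ACDC settings; the special
    ## 'AllSteps' entry carries defaults broadcast to every real task
    defaults = tasks.pop('AllSteps', {})
    for task in tasks:
        for setting, value in defaults.items():
            tasks[task][setting] = value
    return tasks

print apply_all_steps({'AllSteps': {'memory': 4000},
                       'StepOneProc': {'sites': ['T1_XX_Site']}})
## -> {'StepOneProc': {'sites': ['T1_XX_Site'], 'memory': 4000}}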
Ejemplo n.º 27
0
def transferor(url, specific=None, talk=True, options=None):
    if userLock():   return
    if duplicateLock():  return

    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration() ## used below for the max_per_round setting
    NLI = newLockInfo()
    mcm = McMClient(dev=False)
    dss = DSS()

    #allowed_secondary = UC.get('')
    print "counting all being handled..."
    being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all())
    being_transfered = len(session.query(Workflow).filter(Workflow.status == 'staging').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all())

    max_to_handle = options.maxworkflows
    max_to_transfer = options.maxstaging

    allowed_to_handle = max(0,max_to_handle - being_handled)
    allowed_to_transfer = max(0,max_to_transfer - being_transfered)
    wf_buffer = 5
    if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer
        print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer"
    else:
        print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer"

    if allowed_to_transfer <= wf_buffer:
        print "Not allowed to transfer more than",max_to_transfer,"at a time. Currently",being_transfered,"and",wf_buffer,"buffer"
    else:
        print being_transfered,"already being transfered",max_to_transfer,"max allowed,",allowed_to_transfer,"remaining","and",wf_buffer,"buffer"

    print "... done"

    all_transfers=defaultdict(list)
    workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset
    wfs_and_wfh=[]
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(Workflow.status.startswith('considered')).all():
        print "\t",wfo.name
        if specific and not specific in wfo.name: continue
        cache_r = filter(lambda d:d['RequestName']==wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) )
        else:
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) )
    print "... done"

    transfers_per_sites = defaultdict(int)
    input_sizes = {}
    ignored_input_sizes = {}
    input_cput = {}
    input_st = {}
    ## list the size of those in transfer already
    in_transfer_priority=None
    min_transfer_priority=None
    print "getting all wf in staging ..."
    stucks = json.loads(open('%s/stuck_transfers.json'%monitor_dir).read())
    
    for wfo in session.query(Workflow).filter(Workflow.status=='staging').all():
        wfh = workflowInfo( url, wfo.name, spec=False)
        #(lheinput,primary,parent,secondary) = wfh.getIO()
        #sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList()
        for site in sites_allowed: ## we should get the actual transfer destination instead of the full white list
            transfers_per_sites[site] += 1 
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:  
            ds_s = dss.get( prim )
            if prim in stucks: 
                sendLog('transferor', "%s appears stuck, so not counting it %s [GB]"%( prim, ds_s), wfi=wfh)
                ignored_input_sizes[prim] = ds_s
            else:
                input_sizes[prim] = ds_s
                sendLog('transferor', "%s needs %s [GB]"%( wfo.name, ds_s), wfi=wfh)
        if in_transfer_priority==None:
            in_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority']))
        if min_transfer_priority==None:
            min_transfer_priority = int(wfh.request['RequestPriority'])
        else:
            min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority']))

    if min_transfer_priority==None or in_transfer_priority ==None:
        print "nothing is lining up for transfer"
        sendEmail("no request in staging","no request in staging")
        return

    try:
        print "Ignored input sizes"
        ignored_values = list(ignored_input_sizes.items())
        ignored_values.sort( key = lambda i : i[1] )
        print "\n".join( map(str, ignored_values ) )
        print "Considered input sizes"
        considered_values = list(input_sizes.items())
        considered_values.sort( key = lambda i : i[1] )
        print "\n".join( map(str, considered_values) )
    except Exception as e:
        print "trying to print the summary of input size"
        print str(e)

    print "... done"
    print "Max priority in transfer already",in_transfer_priority
    print "Min priority in transfer already",min_transfer_priority
    print "transfers per sites"
    print json.dumps( transfers_per_sites, indent=2)
    in_transfer_already = sum(input_sizes.values())
    cput_in_transfer_already = sum(input_cput.values())
    st_in_transfer_already = sum(input_st.values())

    ## list the size of all inputs
    primary_input_per_workflow_gb = defaultdict(float)
    print "getting all input sizes ..."
    for (wfo,wfh) in wfs_and_wfh:
        (_,primary,_,_) = wfh.getIO()
        #input_cput[wfo.name] = wfh.getComputingTime()
        #input_st[wfo.name] = wfh.getSystemTime()
        for prim in primary:
            ## do not count it if it appears stalled !
            prim_size = dss.get( prim )
            input_sizes[prim] = prim_size
            primary_input_per_workflow_gb[wfo.name] += prim_size
    print "... done"

    # shuffle first by name
    random.shuffle( wfs_and_wfh )
    # Sort smallest transfers first; allows us to transfer as many workflows as possible.
    def prio_and_size( i, j):
        if int(i[1].request['RequestPriority']) == int(j[1].request['RequestPriority']):
            return cmp(int(primary_input_per_workflow_gb.get(j[0].name, 0)), int(primary_input_per_workflow_gb.get(i[0].name, 0)) )
        else:
            return cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']))

    #wfs_and_wfh.sort(cmp = prio_and_size, reverse=True)
    #wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(primary_input_per_workflow_gb.get(i[0].name, 0)), int(primary_input_per_workflow_gb.get(j[0].name, 0)) ))
    #sort by priority higher first
    wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True)

    cput_grand_total = sum(input_cput.values())
    cput_to_transfer = cput_grand_total - cput_in_transfer_already
    st_grand_total = sum(input_st.values())
    st_to_transfer = st_grand_total - st_in_transfer_already
    print "%15.4f [CPU h] worth already in transfer"%cput_in_transfer_already
    print "%15.4f [CPU h] worth is the current requested transfer load"%cput_to_transfer
    print "%15.4f [h] worth of absolute system time in transfer"%( cput_in_transfer_already / SI.availableSlots())
    print "%15.4f [h] worth of absolute system time is the current requested transfer load"%( cput_to_transfer / SI.availableSlots())
    print "%15.4f [h] worth of theoritical system time in transfer"%( st_in_transfer_already )
    print "%15.4f [h] worth of theoritical system time is the current requested transfer load"%( st_to_transfer )


    grand_total =  sum(input_sizes.values()) 
    to_transfer = grand_total  - in_transfer_already
    grand_transfer_limit = options.maxtransfer
    #grand_transfer_limit = SI.total_disk()*0.25*1024 ## a quarter of the free space in TB->GB
    
    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered"%in_transfer_already
    print "%15.4f GB is the current requested transfer load"%to_transfer
    print "%15.4f GB is the global transfer limit"%grand_transfer_limit
    print "%15.4f GB is the available limit"%transfer_limit


    max_staging_per_site = options.maxstagingpersite
                    
    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer=0 ## so that we can count'em
    passing_along = 0
    transfer_sizes={}
    went_over_budget=False
    destination_cache = {}
    no_goes = set()

    max_per_round = UC.get('max_per_round').get('transferor',None)
    if max_per_round and not specific:
        wfs_and_wfh = wfs_and_wfh[:max_per_round]
    
    for (wfo,wfh) in wfs_and_wfh:
        print wfo.name,"to be transfered with priority",wfh.request['RequestPriority']

        if wfh.request['RequestStatus']!='assignment-approved':
            if wfh.request['RequestStatus'] in ['aborted','rejected','rejected-archived','aborted-archived']:
                wfo.status = 'trouble' ## so that we look for a replacement
            else:
                wfo.status = 'away'
            wfh.sendLog('transferor', '%s in status %s, setting %s'%( wfo.name,wfh.request['RequestStatus'],wfo.status))
            continue

        (_,primary,_,secondary) = wfh.getIO() ## secondary is needed further down
        this_load=sum([input_sizes[prim] for prim in primary])
        no_budget = False
        if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ):
            if went_over_budget:
                wfh.sendLog('transferor', "Transfer has gone over budget.")
            else:
                wfh.sendLog('transferor', "Transfer will go over budget.")
            wfh.sendLog('transferor', "%15.4f GB this load, %15.4f GB already this round, %15.4f GB is the available limit"%(this_load, sum(transfer_sizes.values()), transfer_limit))
            #if sum(transfer_sizes.values()) > transfer_limit:
            went_over_budget = True
            if in_transfer_priority!=None and min_transfer_priority!=None:
                if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                    wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over budget"%( wfh.request['RequestPriority'], in_transfer_priority))
                else:
                    if not options.go: 
                        wfh.sendLog('transferor',"%s minimum priority %s < %s : stop"%( min_transfer_priority,wfh.request['RequestPriority'],in_transfer_priority))
                        no_budget = True

        ## throttle by campaign go
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True
            no_goes.add( wfo.name )
            
        allowed_secondary = set()
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[campaign]:
                allowed_secondary.update( CI.campaigns[campaign]['secondaries'] )
        if secondary:
            if (secondary and allowed_secondary) and (set(secondary)&allowed_secondary!=set(secondary)):
                wfh.sendLog('transferor','%s is not an allowed secondary'%(', '.join(set(secondary)-allowed_secondary)))
                no_go = True

        if no_go:
            continue
        ## check if the batch is announced

        def check_mcm(wfn):
            announced=False
            is_real=False
            if not wfn.startswith('pdmvserv'):
                is_real = True
            try:
                for b in mcm.getA('batches',query='contains=%s'% wfn):
                    is_real = True
                    if b['status']=='announced': 
                        announced=True 
                        break
            except:
                try:
                    for b in mcm.getA('batches',query='contains=%s'% wfn):
                        is_real = True
                        if b['status']=='announced': 
                            announced=True 
                            break
                except:
                    print "could not get mcm batch announcement, assuming not real"
            return announced,is_real
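        ## usage sketch: check_mcm returns an (announced, is_real) pair; a
        ## name not starting with 'pdmvserv' counts as real even without a
        ## batch, so a hypothetical 'ReReco_Run2_X' would give (False, True)
        ## until a containing mcm batch is announced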

        if not use_mcm:
            announced,is_real = False,True
        else:
            if wfh.request['RequestType'] in ['ReReco']:
                announced,is_real = True,True
            else:
                announced,is_real = check_mcm( wfo.name )

        if not announced:
            wfh.sendLog('transferor', "does not look announced.")

            
        if not is_real:
            wfh.sendLog('transferor', "does not appear to be genuine.")

            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue

        ## check on a grace period
        injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.)
        now = time.mktime(time.gmtime()) / (60.*60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced: 
                wfh.sendLog('transferor', "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time))
                continue


        if passing_along >= allowed_to_handle:
            #if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
            if in_transfer_priority!=None and min_transfer_priority!=None:
                if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority:
                    ## higher priority, and not only this priority being transferred
                    wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%( wfh.request['RequestPriority'], in_transfer_priority, max_to_handle))
                else:
                    wfh.sendLog('transferor'," Not allowed to pass more than %s at a time. Currently %s handled, and adding %s"%( max_to_handle, being_handled, passing_along))
                    if not options.go: 
                        ## should not allow to jump that fence
                        break

        if this_load and needs_transfer >= allowed_to_transfer:
            if in_transfer_priority!=None and min_transfer_priority!=None:
                if int(wfh.request['RequestPriority']) >= in_transfer_priority and int(wfh.request['RequestPriority']) !=min_transfer_priority:
                    ## higher priority, and not only this priority being transferred
                    wfh.sendLog('transferor',"Higher priority sample %s >= %s go-on over %s"%(wfh.request['RequestPriority'], in_transfer_priority,max_to_transfer))
                else:
                    wfh.sendLog('transferor',"Not allowed to transfer more than %s at a time. Currently %s transfering, and adding %s"%( max_to_transfer, being_transfered, needs_transfer))
                    if not options.go: 
                        no_budget = True


        if no_budget:
            continue

        ## the site white list considers site, campaign, memory and core information
        (lheinput,primary,parent,secondary,sites_allowed) = wfh.getSiteWhiteList()
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')


        for dataset in list(primary)+list(parent)+list(secondary):
            ## lock everything flat
            NLI.lock( dataset )

        if not sites_allowed:
            wfh.sendLog('transferor',"not possible site to run at")
            #sendEmail("no possible sites","%s has no possible sites to run at"%( wfo.name ))
            sendLog('transferor',"%s has no possible sites to run at"%( wfo.name ),level='critical')
            continue

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set( blocks + getDatasetBlocks( dataset, runs=wfh.request['RunWhitelist'] ) ))
        if 'LumiList' in wfh.request and wfh.request['LumiList']:
            ## augment with the lumi white list
            for dataset in primary:
                blocks = list(set( blocks + getDatasetBlocks( dataset, lumis= wfh.request['LumiList'] ) ))

        if blocks:
            print "Reading",len(blocks),"in block whitelist"

        can_go = True
        staging=False
        allowed=True
        primary_destinations = set()
        if primary:
            
            copies_needed_from_CPUh,CPUh = wfh.getNCopies()

            if talk:
                print wfo.name,'reads',', '.join(primary),'in primary'
            ## chop the primary dataset
            for prim in primary:
                ## keep track of what needs what
                workflow_dependencies[prim].add( wfo.id )

                max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority']))

                wfh.sendLog('transferor',"Would make %s  from cpu requirement %s"%( copies_needed_from_CPUh, CPUh))
                copies_needed = copies_needed_from_CPUh

                if 'Campaign' in wfh.request and wfh.request['Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[wfh.request['Campaign']]:
                    copies_needed_from_campaign = CI.campaigns[wfh.request['Campaign']]['maxcopies']
                    copies_needed = min(copies_needed_from_campaign, copies_needed)
                    
                    wfh.sendLog('transferor',"Maxed to %s by campaign configuration %s"%( copies_needed, wfh.request['Campaign']))


                ### new ways of making the whole thing
                destinations,all_block_names = getDatasetDestinations(url, prim, within_sites = [SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks )
                print json.dumps(destinations, indent=2)

                ## get where the dataset is in full and completed
                prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1]
                ## the rest is places it is going to be
                prim_destination = [site for site in destinations.keys() if not site in prim_location]


                if len(prim_location) >= copies_needed:
                    wfh.sendLog('transferor',"The input is all fully in place at %s sites %s"%( len(prim_location), sorted(prim_location)))
                    continue
                copies_needed = max(0,copies_needed - len(prim_location))
                wfh.sendLog('transferor',"not counting existing copies ; now need %s"% copies_needed)
                copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names]

                latching_on_transfers = set()
                [latching_on_transfers.update(info['blocks'].values()) for site,info in destinations.items() if site in prim_destination]
                latching_on_transfers = list(latching_on_transfers)
                #print latching_on_transfers

                ## figure out where all this is going to go
                prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location]
                prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]

                wfh.sendLog('transferor',"Could be going to: %s"% sorted( prim_to_distribute))
                if not prim_to_distribute or any([transfers_per_sites[site] < max_staging_per_site for site in prim_to_distribute]):
                    ## means there are openings; let it go
                    print "There are transfer slots available:",[(site,transfers_per_sites[site]) for site in prim_to_distribute]
                    #for site in sites_allowed:
                    #    #increment accross the board, regardless of real destination: could be changed
                    #    transfers_per_sites[site] += 1
                else:
                    if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                        wfh.sendLog('transferor', "Higher priority sample %s >= %s go-on over transfer slots available"%(wfh.request['RequestPriority'], in_transfer_priority))
                    else:
                        wfh.sendLog('transferor',"Not allowed to transfer more than %s per site at a time. Going overboard for %s"%( max_staging_per_site, sorted([site for site in prim_to_distribute if transfers_per_sites[site]>=max_staging_per_site])))
                        if not options.go:
                            allowed = False
                            break

                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(Transfer.phedexid == int(latching)).first()
                    if not tfo:
                        tfo = session.query(Transfer).filter(Transfer.phedexid == -int(latching)).first()
                        
                    if not tfo:
                        tfo = Transfer( phedexid = latching)
                        tfo.workflows_id = []
                        session.add(tfo)
                    else:
                        tfo.phedexid = latching ## make it positive again

                    if not wfo.id in tfo.workflows_id:
                        print "adding",wfo.id,"to",tfo.id,"with phedexid",latching
                        l = copy.deepcopy( tfo.workflows_id )
                        l.append( wfo.id )
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                #copies_needed = max(0,copies_needed - len(prim_destination))
                copies_needed = max(0,copies_needed - min(copies_being_made))
                wfh.sendLog('transferor', "Not counting the copies being made ; then need %s"% copies_needed)                    
                if copies_needed == 0:
                    wfh.sendLog('transferor', "The output is either fully in place or getting in full somewhere with %s"% latching_on_transfers)
                    can_go = True
                    continue
                elif len(prim_to_distribute)==0:
                    wfh.sendLog('transferor', "We are going to need extra copies, but no destinations seems available")
                    prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location]
                    prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]

                if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the 
                    if not options or options.chop:
                        ### hard include the tape disk endpoint ?
                        #tapes = [site for site in  getDatasetPresence( url, prim, vetos=['T0','T2','T3','Disk']) if site.endswith('MSS')]
                        chops,sizes = getDatasetChops(prim, chop_threshold = options.chopsize, only_blocks=blocks)
                        spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes)
                        transfer_sizes[prim] = sum(sizes)
                        if not spreading:
                            sendLog('transferor','cannot send %s to any site, it cannot fit anywhere'% prim, level='critical')
                            wfh.sendLog('transferor', "cannot send to any site. %s cannot seem to fit anywhere"%(prim))
                            staging=False
                            can_go = False
                    
                    else:
                        spreading = {} 
                        for site in prim_to_distribute: 
                            if blocks:
                                spreading[site]=blocks
                            else:
                                spreading[site]=[prim]
                        transfer_sizes[prim] = input_sizes[prim] ## this is approximate if blocks are specified
                    can_go = False
                    wfh.sendLog('transferor', "selected CE destinations %s"%(sorted( spreading.keys())))
                    for (site,items) in spreading.items():
                        all_transfers[site].extend( items )
                        transfers_per_sites[site] += 1
                        primary_destinations.add( site ) 
        if not allowed:
            wfh.sendLog('transferor', "Not allowed to move on with")
            continue


        if secondary:

            override_sec_destination = []
            if wfh.request['Campaign'] in CI.campaigns and 'SecondaryLocation' in CI.campaigns[wfh.request['Campaign']]:
                override_sec_destination = CI.campaigns[wfh.request['Campaign']]['SecondaryLocation']

            print wfo.name,'reads',', '.join(secondary),'in secondary'
            for sec in secondary:

                workflow_dependencies[sec].add( wfo.id )

                if True:
                    ## new style, failing on minbias
                    if not sec in destination_cache:
                        ## this is barbaric, and does not show the correct picture on workflow by workflow with different whitelist
                        destination_cache[sec],_ = getDatasetDestinations(url, sec) ## NO SITE WHITE LIST ADDED
                        #destination_cache[sec],_ = getDatasetDestinations(url, sec, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])

                    ## limit to the site whitelist NOW
                    se_allowed = [SI.CE_to_SE(site) for site in sites_allowed]
                    destinations = dict([(k,v) for (k,v) in destination_cache[sec].items() if k in se_allowed])
                    ## truncate location/destination to those making up for >90% of the dataset
                    bad_destinations = [destinations.pop(site) for (site,info) in destinations.items() if info['data_fraction']<0.9]
                    sec_location = [site for (site,info) in destinations.items() if info['completion']>=95]
                    sec_destination = [site for site in destinations.keys() if not site in sec_location]
                else:
                    ## old style
                    presence = getDatasetPresence( url, sec )
                    sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites
                    subscriptions = listSubscriptions( url ,sec )
                    sec_destination = [site for site in subscriptions] 


                sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [site for site in sec_to_distribute if not  any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                if override_sec_destination:
                    ## intersect with where we want the PU to be
                    not_needed_anymore = list(set(sec_to_distribute) - set(override_sec_destination))
                    #sendEmail("secondary superfluous","the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sendLog('transferor', "the dataset %s could be removed from %s"%( sec, not_needed_anymore ))
                    sec_to_distribute = list(set(sec_to_distribute) & set(override_sec_destination))

                if len( sec_to_distribute )>0:
                    print "secondary could go to",sorted(sec_to_distribute)
                    sec_size = dss.get( sec )
                    for site in sec_to_distribute:
                        site_se =SI.CE_to_SE(site)
                        if (SI.disk[site_se]*1024.) > sec_size:
                            all_transfers[site].append( sec )
                            can_go = False
                        else:
                            print "could not send the secondary input to",site_se,"because it is too big for the available disk",SI.disk[site_se]*1024,"GB need",sec_size
                            if primary_destinations and site in primary_destinations:
                                #sendEmail('secondary input too big','%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024))
                                sendLog('transferor', '%s is too big (%s) for %s (%s)'%( sec, sec_size, site_se, SI.disk[site_se]*1024), level='critical')
                else:
                    print "the secondary input does not have to be send to site"

        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                wfh.sendLog('transferor', "latches on existing transfers, and nothing else, settin staging")
                wfo.status = 'staging'
                needs_transfer+=1
            else:
                wfh.sendLog('transferor', "should just be assigned now to %s"%sorted(sites_allowed))
                wfo.status = 'staged'
            passing_along+=1
            wfh.sendLog('transferor', "setting status to %s"%wfo.status)
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                wfh.sendLog('transferor', "latches on existing transfers")
                if not options.test:
                    wfo.status = 'staging'
                    wfh.sendLog('transferor', "setting status to %s"%wfo.status)
                    session.commit()
            wfh.sendLog('transferor',"needs a transfer")
            needs_transfer+=1
            passing_along+=1

    if no_goes:
        #sendEmail("no go for managing","No go for \n"+"\n".join( no_goes ))
        sendLog('transferor', "No go for \n"+"\n".join( no_goes ), level='critical')

    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    fake_id=-1
    wf_id_in_prestaging=set()

    for (site,items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))

        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## sites that do not want input datasets
        if site in SI.sites_veto_transfer: 
            print site,"does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        block_datasets = list(set([it.split('#')[0] for it in blocks]))
        datasets = [it for it in items_to_transfer if not '#' in it]

        details_text = "Making a replica to %s (CE) %s (SE) for"%( site, site_se)
        

        #print "\t",len(blocks),"blocks"
        ## remove blocks if full dataset is sent out
        blocks = [block for block in blocks if not block.split('#')[0] in datasets]
        #print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        #print "\t",len(datasets),"datasets"
        #print "\t",datasets
        details_text += '\n\t%d blocks'%len(blocks)
        details_text += '\n\t%d needed blocks for %s'%( len(blocks), sorted(list(set([block.split('#')[0] for block in blocks]))))
        details_text += '\n\t%d datasets'% len(datasets)
        details_text += '\n\t%s'%sorted(datasets)
        
        items_to_transfer = blocks + datasets

        if execute:
            sendLog('transferor', details_text)
        else:
            print "Would make a replica to",site,"(CE)",site_se,"(SE) for"
            print details_text

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y','yes','go']:
                continue

        if execute:
            priority = 'normal'
            cds = [ds for ds in datasets+block_datasets if ds in max_priority]
            if cds and False: ## I don't think this is working. subscription should be updated on the fly and regularly for raising the priority if needed
                ## decide on an overall priority : that's a bit too large though
                if any([max_priority[ds]>=90000 for ds in cds]):
                    priority = 'high'
                elif all([max_priority[ds]<80000 for ds in cds]):
                    priority = 'low'
                
            result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority=priority)
        else:
            result= {'phedex':{'request_created' : []}}
            fake_id-=1



        if not result:
            print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(Transfer.phedexid == int(phedexid)).first()
            if not new_transfer:
                new_transfer = session.query(Transfer).filter(Transfer.phedexid == -int(phedexid)).first()
            print phedexid,"transfer created"
            if not new_transfer:
                new_transfer = Transfer( phedexid = phedexid)
                session.add( new_transfer )                
            else:
                new_transfer.phedexid = phedexid ## make it positive again

            new_transfer.workflows_id = set()
            for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update( workflow_dependencies[transfering] )
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status!='staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting",tr_wf.name,"to staging"
        session.commit()
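
A minimal standalone sketch (the helper name and sample items are illustrative, not part of the production code) of the block/dataset reduction performed above before a replica request is made: full datasets win over their individual blocks.

def reduce_transfer_items(items):
    ## PhEDEx-style names: blocks contain a '#', plain datasets do not
    blocks = [it for it in items if '#' in it]
    datasets = [it for it in items if '#' not in it]
    ## drop any block whose parent dataset is already shipped in full
    blocks = [b for b in blocks if b.split('#')[0] not in datasets]
    return blocks + datasets

print reduce_transfer_items(['/A/B/C#1', '/A/B/C', '/X/Y/Z#1'])
## -> ['/X/Y/Z#1', '/A/B/C'] : the lone block of /A/B/C is dropped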
Example No. 28
def assignor(url, specific=None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return
    #if notRunningBefore( 'stagor' ): return
    if not componentInfo().check(): return

    CI = campaignInfo()
    SI = siteInfo()
    LI = lockInfo()
    NLI = newLockInfo()

    n_assigned = 0
    n_stalled = 0

    wfos=[]
    if specific:
        wfos = session.query(Workflow).filter(Workflow.name==specific).all()
    if not wfos:
        if specific:
            wfos = session.query(Workflow).filter(Workflow.status=='considered').all()
            wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all())
        wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all())

    for wfo in wfos:
        if specific:
            if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue
            #if not specific in wfo.name: continue
        print "\n\n",wfo.name,"\n\tto be assigned"
        wfh = workflowInfo( url, wfo.name)


        ## check if by configuration we gave it a GO
        if not CI.go( wfh.request['Campaign'] ) and not options.go:
            print "No go for",wfh.request['Campaign']
            n_stalled+=1
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] !='assignment-approved':
            if not options.test:
                print wfo.name,wfh.request['RequestStatus'],"setting away and skipping"
                ## the module picking up from 'away' will do what is necessary with it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name,wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version=wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                print "cannot decide on version number"
                n_stalled+=1
                continue

        (lheinput,primary,parent,secondary) = wfh.getIO()
        sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )

        print "Site white list",sorted(sites_allowed)

        if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist']

        if 'SiteBlacklist' in CI.parameters(wfh.request['Campaign']):
            print "Reducing the whitelist due to black list in campaign configuration"
            print "Removing",CI.parameters(wfh.request['Campaign'])['SiteBlacklist']
            sites_allowed = list(set(sites_allowed) - set(CI.parameters(wfh.request['Campaign'])['SiteBlacklist']))

        blocks = []
        if 'BlockWhitelist' in wfh.request:
            blocks = wfh.request['BlockWhitelist']

        memory_allowed = SI.sitesByMemory( wfh.request['Memory'] )
        if memory_allowed!=None:
            print "sites allowing", wfh.request['Memory'],"are",sorted(memory_allowed)
            sites_allowed = list(set(sites_allowed) & set(memory_allowed))

        print "Allowed",sorted(sites_allowed)
        secondary_locations=None
        for sec in list(secondary):
            presence = getDatasetPresence( url, sec )
            print sec
            print json.dumps(presence, indent=2)
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>90.]
            one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations==None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to sites holding the secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations]
            
        print "From secondary requirement, now Allowed",sorted(sites_allowed)
        sites_all_data = copy.deepcopy( sites_allowed )
        sites_with_data = copy.deepcopy( sites_allowed )
        sites_with_any_data = copy.deepcopy( sites_allowed )
        primary_locations = None
        available_fractions = {}
        for prim in list(primary):
            presence = getDatasetPresence( url, prim , only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] =  getDatasetBlocksFraction(url, prim, sites = [SI.CE_to_SE(site) for site in sites_allowed] , only_blocks = blocks)
            #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_all_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,(there,frac)) in presence.items() if there]]
            sites_with_data = [site for site in sites_with_data if SI.CE_to_SE(site) in [psite for (psite,frac) in presence.items() if frac[1]>90.]]
            sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()]
            print "Holding the data but not allowed",list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))
            if primary_locations==None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(set(primary_locations) & set(presence.keys() ))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites=[]
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations)) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            elif primary_locations:
                opportunistic_sites = [SI.SE_to_CE(site) for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))]
            else:
                opportunistic_sites = []
            print "We could be running at",sorted(opportunistic_sites),"in addition"
            if any([osite in SI.sites_not_ready for osite in opportunistic_sites]):
                print "One of the destination site is in downtime"
                down_time = True
                ## should this be sent back to considered ?
                

        """
        if available_fractions and not all([available>=1. for available in available_fractions.values()]):
            print "The input dataset is not located in full over sites"
            print json.dumps(available_fractions)
            if not options.test and not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known:
                    sendEmail( "cannot be assigned","%s is not full over sites \n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append( wfo.name )
                    open('cannot_assign.json','w').write(json.dumps( known, indent=2))
                n_stalled+=1
                continue ## skip skip skip
        """

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted,cpuh = wfh.getNCopies()
        
        if available_fractions and not all([available>=copies_wanted for available in available_fractions.values()]):
            print "The input dataset is not available",copies_wanted,"times, only",available_fractions.values()
            if down_time:
                wfo.status = 'considered'
                session.commit()
                print "sending back to considered because of site downtime, instead of waiting"
                sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                continue
                #pass

            print json.dumps(available_fractions)
            if not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known:
                    sendEmail( "cannot be assigned","%s is not sufficiently available. Probably phedex information lagging behind. \n %s"%(wfo.name,json.dumps(available_fractions)))
                    known.append( wfo.name )
                    open('cannot_assign.json','w').write(json.dumps( known, indent=2))
                n_stalled+=1
                continue

        ## default the white list back to the sites holding any data
        print "Allowed",sites_allowed
        sites_allowed = sites_with_any_data
        print "Selected for any data",sites_allowed

        if options.restrict:
            print "Allowed",sites_allowed
            sites_allowed = sites_with_any_data
            print "Selected",sites_allowed
        else:
            if set(sites_with_data) != set(sites_allowed):
                ## the data is not everywhere we wanted to run at : enable aaa
                print "Sites with 90% data not matching site white list (block choping!)"
                print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data)),"?"
                print "Whitelist site with any data",list(set(sites_allowed) - set(sites_with_any_data))
                #options.useSiteListAsLocation = True
                #print "Not commissioned yet"
                #continue
            #print "We could be running at",opportunistic_sites,"in addition"
            ##sites_allowed = list(set(sites_allowed+ opportunistic_sites))

        if not len(sites_allowed):
            print wfo.name,"cannot be assign with no matched sites"
            sendEmail( "cannot be assigned","%s has no whitelist"%(wfo.name))
            n_stalled+=1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]


        print "Placing the output on", sites_out
        parameters={
            'SiteWhitelist' : sites_allowed,
            #'CustodialSites' : sites_custodial,
            'NonCustodialSites' : sites_out,
            'AutoApproveSubscriptionSites' : list(set(sites_out)),
            'AcquisitionEra' : wfh.acquisitionEra(),
            'ProcessingString' : wfh.processingString(),
            'MergedLFNBase' : '/store/mc', ## to be figured out
            'ProcessingVersion' : version,
            }


        ## plain assignment here
        team='production'
        if options and options.team:
            team = options.team

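        ## note: random.random() is never negative, so the "< -0.5" and "< -1.0"
        ## guards keep both SDSC re-route branches below disabled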
        if "T2_US_UCSD" in sites_with_data and random.random() < -0.5 and wfh.request['Campaign']=='RunIISpring15DR74' and int(wfh.getRequestNumEvents()) < 600000 and not any([out.endswith('RAW') for out in wfh.request['OutputDatasets']]):
            ## consider SDSC
            parameters['SiteWhitelist'] = ['T2_US_UCSD','T3_US_SDSC']
            parameters['useSiteListAsLocation'] = True
            team = 'allocation-based'
            sendEmail("sending work to SDSC","%s was assigned to SDSC/UCSD"% wfo.name, destination=['*****@*****.**'])
            
        if wfh.request['Campaign']=='RunIIWinter15GS' and random.random() < -1.0:
            parameters['SiteWhitelist'] = ['T3_US_SDSC']
            team = 'allocation-based'
            sendEmail("sending work to SDSC","%s was assigned to SDSC"% wfo.name, destination=['*****@*****.**'])
        

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v=getattr(options,key)
                if v!=None:
                    if ',' in v: parameters[key] = filter(None,v.split(','))
                    else: parameters[key] = v

        ## pick up campaign specific assignment parameters
        parameters.update( CI.parameters(wfh.request['Campaign']) )

        if not options.test:
            parameters['execute'] = True

        if not wfh.checkWorkflowSplitting():
            print "Falling back to event splitting."
            parameters['SplittingAlgorithm'] = 'EventBased'
            sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name)
            ## needs to go to event based ? fail for now
            #print "Falling back to event splitting ?"
            #sendEmail("Cannot assign","the workflow %s is too heavy to be processed as it is. Could fallback to EventBased splitting"%wfo.name)
            #continue

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi)/float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
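                ## aim for roughly reqJobs jobs with ~40% headroom, then convert
                ## the resulting events/job into lumis/job for assignment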
                eventsPerJob = int(numEvents/(reqJobs*1.4))
                lumisPerJob = int(eventsPerJob/eventsPerLumi)
                if lumisPerJob==0:
                    print "There is no go for assigning that request without event splitting"
                    sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    print "need to go down to",eventsPerJob,"events per job"
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        print "need to go down to",lumisPerJob,"in assignment"
                        sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        print "the regular splitting should work for",pstring
                        sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)

        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)


        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned+=1
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo( url, wfo.name)
                    (_,prim,_,sec) = new_wfi.getIO()
                    for output in new_wfi.request['OutputDatasets']:
                        ## lock all outputs flat
                        NLI.lock( output )
                    for site in [SI.CE_to_SE(site) for site in sites_allowed]:
                        for output in new_wfi.request['OutputDatasets']:
                            LI.lock( output, site, 'dataset in production')
                        for primary in prim:
                            LI.lock( primary, site, 'dataset used in input')
                        for secondary in sec:
                            LI.lock( secondary, site, 'required for mixing' )

                except Exception as e:
                    print "fail in locking output"
                    print str(e)
                    sendEmail("failed locking of output",str(e))


            else:
                print "ERROR could not assign",wfo.name
        else:
            pass
    print "Assignment summary:"
    print "Assigned",n_assigned
    print "Stalled",n_stalled
Example No. 29
def transferor(url, specific=None, talk=True, options=None):
    if userLock('transferor'): return

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False)
    dss = DSS()

    print "counting all being handled..."
    being_handled = len(
        session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('stag')).all())
    being_handled += len(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance')).all())
    max_to_handle = options.maxworkflows
    allowed_to_handle = max(0, max_to_handle - being_handled)
    wf_buffer = 5
    if allowed_to_handle <= wf_buffer:  ## buffer for having several wf per transfer
        print "Not allowed to run more than", max_to_handle, "at a time. Currently", being_handled, "and", wf_buffer, "buffer"
    else:
        print being_handled, "already being handled", max_to_handle, "max allowed,", allowed_to_handle, "remaining", "and", wf_buffer, "buffer"
    print "... done"

    all_transfers = defaultdict(list)
    workflow_dependencies = defaultdict(
        set)  ## list of wf.id per input dataset
    wfs_and_wfh = []
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(
            Workflow.status == 'considered').all():
        if specific and not specific in wfo.name: continue
        cache_r = filter(lambda d: d['RequestName'] == wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append((wfo,
                                workflowInfo(url,
                                             wfo.name,
                                             spec=False,
                                             request=cache_r[0])))
        else:
            wfs_and_wfh.append((wfo, workflowInfo(url, wfo.name, spec=False)))
    print "... done"

    input_sizes = {}
    ## list the size of those in transfer already
    in_transfer_priority = 0
    min_transfer_priority = 100000000
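    ## sentinel start values: the loop below raises in_transfer_priority to the
    ## highest, and lowers min_transfer_priority to the lowest, RequestPriority
    ## seen among workflows already in staging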
    print "getting all wf in staging ..."
    for wfo in session.query(Workflow).filter(
            Workflow.status == 'staging').all():
        wfh = workflowInfo(url, wfo.name, spec=False)
        (_, primary, _, _) = wfh.getIO()
        for prim in primary:
            input_sizes[prim] = dss.get(prim)
        in_transfer_priority = max(in_transfer_priority,
                                   int(wfh.request['RequestPriority']))
        min_transfer_priority = min(min_transfer_priority,
                                    int(wfh.request['RequestPriority']))
    print "... done"
    print "Max priority in transfer already", in_transfer_priority
    print "Min priority in transfer already", min_transfer_priority
    in_transfer_already = sum(input_sizes.values())

    #sort by priority higher first
    wfs_and_wfh.sort(cmp=lambda i, j: cmp(int(i[1].request[
        'RequestPriority']), int(j[1].request['RequestPriority'])),
                     reverse=True)

    ## list the size of all inputs
    print "getting all input sizes ..."
    for (wfo, wfh) in wfs_and_wfh:
        (_, primary, _, _) = wfh.getIO()
        for prim in primary:
            input_sizes[prim] = dss.get(prim)
    print "... done"

    grand_total = sum(input_sizes.values())
    to_transfer = grand_total - in_transfer_already
    grand_transfer_limit = options.maxtransfer
    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered" % in_transfer_already
    print "%15.4f GB is the current requested transfer load" % to_transfer
    print "%15.4f GB is the global transfer limit" % grand_transfer_limit
    print "%15.4f GB is the available limit" % transfer_limit

    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer = 0  ## so that we can count'em
    passing_along = 0
    transfer_sizes = {}
    went_over_budget = False
    for (wfo, wfh) in wfs_and_wfh:
        print wfh.request['RequestPriority']
        print wfo.name, "to be transfered"
        #wfh = workflowInfo( url, wfo.name)

        (_, primary, _, _) = wfh.getIO()
        this_load = sum([input_sizes[prim] for prim in primary])
        if (this_load
                and (sum(transfer_sizes.values()) + this_load > transfer_limit
                     or went_over_budget)):
            if went_over_budget:
                print "Transfer has gone over budget."
            else:
                print "Transfer will go over budget."
            print "%15.4f GB this load" % this_load
            print "%15.4f GB already this round" % sum(transfer_sizes.values())
            print "%15.4f GB is the available limit" % transfer_limit
            went_over_budget = True
            if int(
                    wfh.request['RequestPriority']
            ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                print "Higher priority sample", wfh.request[
                    'RequestPriority'], ">=", in_transfer_priority, "go-on over budget"
            else:
                if not options.go:
                    print min_transfer_priority, "minimum priority", wfh.request[
                        'RequestPriority'], "<", in_transfer_priority, "stop"
                    continue

        ## throttle by campaign go
        if not CI.go(wfh.request['Campaign']):
            print "No go for", wfh.request['Campaign']
            if not options.go: continue

        ## check if the batch is announced
        announced = False
        is_real = False
        for b in mcm.getA('batches', query='contains=%s' % wfo.name):
            is_real = True
            if b['status'] == 'announced':
                announced = True
                break

        if not announced:
            print wfo.name, "does not look announced."  # skipping?, rejecting?, reporting?"

        if not is_real:
            print wfo.name, "does not appear to be genuine."
            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue

        ## check on a grace period
        injection_time = time.mktime(
            time.strptime('.'.join(map(str, wfh.request['RequestDate'])),
                          "%Y.%m.%d.%H.%M.%S")) / (60. * 60.)
        now = time.mktime(time.gmtime()) / (60. * 60.)
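        ## both timestamps are expressed in hours, so the test below is a 4-hour grace period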
        if float(now - injection_time) < 4.:
            if not options.go and not announced:
                print "It is too soon to start transfer: %3.2fH remaining" % (
                    now - injection_time)
                continue

        passing_along += 1
        if passing_along >= allowed_to_handle:
            if int(
                    wfh.request['RequestPriority']
            ) >= in_transfer_priority and min_transfer_priority != in_transfer_priority:
                print "Higher priority sample", wfh.request[
                    'RequestPriority'], ">=", in_transfer_priority, "go-on over", max_to_handle
            else:
                print "Not allowed to pass more than", max_to_handle, "at a time. Currently", being_handled, "handled, and adding", passing_along
                break

        (lheinput, primary, parent, secondary) = wfh.getIO()
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')
        else:
            sites_allowed = getSiteWhiteList(
                (lheinput, primary, parent, secondary))

        if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = CI.parameters(
                wfh.request['Campaign'])['SiteWhitelist']

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']

        can_go = True
        staging = False
        if primary:
            if talk:
                print wfo.name, 'reads', ', '.join(primary), 'in primary'
            ## chop the primary dataset
            for prim in primary:
                max_priority[prim] = max(max_priority[prim],
                                         int(wfh.request['RequestPriority']))
                sites_really_allowed = [
                    site for site in sites_allowed if not any([
                        osite.startswith(site)
                        for osite in SI.sites_veto_transfer
                    ])
                ]
                print "Sites allowed minus the vetoed transfer"
                print sites_really_allowed
                copies_needed = int(
                    0.35 * len(sites_really_allowed)
                ) + 1  ## should just go for a fixed number if the white list grows that big
                print "Would make", copies_needed, "copies"
                if options.maxcopy > 0:
                    copies_needed = min(options.maxcopy, copies_needed)

                ## remove the sites that do not want transfers
                print "need", copies_needed
                workflow_dependencies[prim].add(wfo.id)
                presence = getDatasetPresence(url, prim)
                prim_location = [
                    site for site, pres in presence.items() if pres[0] == True
                ]
                if len(prim_location) >= copies_needed:
                    print "The output is all fully in place at", len(
                        prim_location), "sites"
                    continue
                # reduce the number of copies required by existing full copies
                copies_needed = max(0, copies_needed - len(prim_location))
                print "now need", copies_needed
                subscriptions = listSubscriptions(url, prim)
                prim_destination = list(
                    set([
                        site
                        for (site, (tid, decision)) in subscriptions.items()
                        if decision and not any([
                            site.endswith(veto)
                            for veto in ['MSS', 'Export', 'Buffer']
                        ])
                    ]))
                ## need to reject from that list the ones with a full copy already: i.e. the transfer corresponds to the copy in place
                prim_destination = [
                    site for site in prim_destination
                    if not site in prim_location
                ]
                ## add transfer dependencies
                latching_on_transfers = list(
                    set([
                        tid
                        for (site, (tid, decision)) in subscriptions.items()
                        if decision and site in prim_destination and not any([
                            site.endswith(veto)
                            for veto in ['MSS', 'Export', 'Buffer']
                        ])
                    ]))
                print latching_on_transfers
                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(
                        Transfer.phedexid == latching).first()
                    if not tfo:
                        tfo = Transfer(phedexid=latching)
                        tfo.workflows_id = []
                        session.add(tfo)

                    if not wfo.id in tfo.workflows_id:
                        print "adding", wfo.id, "to", tfo.id, "with phedexid", latching
                        l = copy.deepcopy(tfo.workflows_id)
                        l.append(wfo.id)
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush()  ## regardless of commit later on, let the next wf feeding on this transfer see it in the query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                copies_needed = max(0, copies_needed - len(prim_destination))
                print "then need", copies_needed
                if copies_needed == 0:
                    print "The output is either fully in place or getting in full somewhere with", latching_on_transfers
                    can_go = True
                    continue
                prim_to_distribute = [
                    site for site in sites_allowed if not any(
                        [osite.startswith(site) for osite in prim_location])
                ]
                prim_to_distribute = [
                    site for site in prim_to_distribute if not any(
                        [osite.startswith(site) for osite in prim_destination])
                ]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [
                    site for site in prim_to_distribute if not any([
                        osite.startswith(site)
                        for osite in SI.sites_veto_transfer
                    ])
                ]
                if len(
                        prim_to_distribute
                ) > 0:  ## maybe that a parameter we can play with to limit the
                    if not options or options.chop:
                        spreading = distributeToSites(getDatasetChops(prim),
                                                      prim_to_distribute,
                                                      n_copies=copies_needed,
                                                      weights=SI.cpu_pledges)
                    else:
                        spreading = {}
                        for site in prim_to_distribute:
                            spreading[site] = [prim]
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    for (site, items) in spreading.items():
                        all_transfers[site].extend(items)

        if secondary:
            if talk:
                print wfo.name, 'reads', ', '.join(secondary), 'in secondary'
            for sec in secondary:
                workflow_dependencies[sec].add(wfo.id)
                presence = getDatasetPresence(url, sec)
                sec_location = [
                    site for site, pres in presence.items() if pres[1] > 90.
                ]  ## more than 90% of the minbias at sites
                subscriptions = listSubscriptions(url, sec)
                sec_destination = [site for site in subscriptions]
                sec_to_distribute = [
                    site for site in sites_allowed if
                    not any([osite.startswith(site) for osite in sec_location])
                ]
                sec_to_distribute = [
                    site for site in sec_to_distribute if not any(
                        [osite.startswith(site) for osite in sec_destination])
                ]
                sec_to_distribute = [
                    site for site in sec_to_distribute if not any([
                        osite.startswith(site)
                        for osite in SI.sites_veto_transfer
                    ])
                ]
                if len(sec_to_distribute) > 0:
                    for site in sec_to_distribute:
                        all_transfers[site].append(sec)
                        can_go = False

        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                print wfo.name, "latches on existing transfers, and nothing else"
                wfo.status = 'staging'
            else:
                print wfo.name, "should just be assigned NOW to", sites_allowed
                wfo.status = 'staged'
            print "setting status to", wfo.status
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                print wfo.name, "latches on existing transfers"
                if not options.test:
                    wfo.status = 'staging'
                    print "setting status to", wfo.status
                    session.commit()
            print wfo.name, "needs a transfer"
            needs_transfer += 1

    #print json.dumps(all_transfers)
    fake_id = -1
    wf_id_in_prestaging = set()

    for (site, items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))
        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## sites that do not want input datasets
        if site in SI.sites_veto_transfer:
            print site, "does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        datasets = [it for it in items_to_transfer if not '#' in it]

        if execute:
            print "Making a replica to", site, "(CE)", site_se, "(SE) for"
        else:
            print "Would make a replica to", site, "(CE)", site_se, "(SE) for"

        print "\t", len(blocks), "blocks"
        ## remove blocks if full dataset is sent out
        blocks = [
            block for block in blocks if not block.split('#')[0] in datasets
        ]
        print "\t", len(blocks), "needed blocks for", list(
            set([block.split('#')[0] for block in blocks]))
        print "\t", len(datasets), "datasets"
        print "\t", datasets
        items_to_transfer = blocks + datasets

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y', 'yes', 'go']:
                continue

        if execute:
            result = makeReplicaRequest(url,
                                        site_se,
                                        items_to_transfer,
                                        'prestaging',
                                        priority='normal')
            ## make use of max_priority dataset:priority to set the subscriptions priority
            """
            ## does not function
            once = True
            for item in items_to_transfer:
                bds = item.split('#')[0]
                if max_priority[bds] >= 90000:
                    if once:
                        w=10
                        print "waiting",w,"s before raising priority"
                        time.sleep(w)
                        once=False
                    ## raise it to high priority
                    print item,"subscription priority raised to high at",site_se
                    #print "This does not work yet properly it seems"
                    print updateSubscription(url, site_se, item, priority='high')
            """
        else:
            #result= {'phedex':{'request_created' : [{'id' : fake_id}]}}
            result = {'phedex': {'request_created': []}}
            fake_id -= 1

        if not result:
            print "ERROR Could not make a replica request for", site, items_to_transfer, "pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(
                Transfer.phedexid == phedexid).first()
            print phedexid, "transfer created"
            if not new_transfer:
                new_transfer = Transfer(phedexid=phedexid)
                session.add(new_transfer)
            new_transfer.workflows_id = set()
            for transfering in list(
                    set(map(lambda it: it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update(
                    workflow_dependencies[transfering])
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status != 'staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting", tr_wf.name, "to staging"
        session.commit()
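
The over-budget handling earlier in this function admits a workflow past the transfer allowance only if it outranks everything already staging. A condensed sketch of that rule; the function name and numbers are illustrative.

def admit_transfer(this_load, scheduled, limit, priority,
                   in_transfer_priority, min_transfer_priority):
    ## under the allowance: always admit
    if scheduled + this_load <= limit:
        return True
    ## over the allowance: only samples at or above the top priority already
    ## in transfer may pass, and only if priorities are not all identical
    return (priority >= in_transfer_priority
            and min_transfer_priority != in_transfer_priority)

print admit_transfer(500., 4200., 5000., 85000, 90000, 63000)   ## True : fits in budget
print admit_transfer(1500., 4200., 5000., 95000, 90000, 63000)  ## True : outranks staging
print admit_transfer(1500., 4200., 5000., 85000, 90000, 63000)  ## False : over budget, outranked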
Example No. 30
def collector(url, specific, options):
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return

    SI = siteInfo()
    dss = DSS()
    #NL = newLockInfo()
    mcm = McMClient(dev=False)
    fetch_in_campaigns = ['RunIISummer15GS']
    mcm_statuses = ['new']  #,'validation','defined','approved']

    will_be_used = defaultdict(list)
    secondary_used = defaultdict(list)
    for campaign, status in itertools.product(fetch_in_campaigns,
                                              mcm_statuses):
        queries = []
        if campaign:
            print "getting for", campaign
            queries.append('member_of_campaign=%s' % campaign)
        if status:
            print "getting for", status
            queries.append('status=%s' % status)
        rs = mcm.getA('requests', query='&'.join(queries))
        for r in rs:
            #if r['type'] != 'Prod': continue
            dataset = r['input_dataset']
            if dataset:
                #print r['prepid'],dataset
                will_be_used[dataset].append(r)
            pileup = r['pileup_dataset_name']
            if pileup:
                secondary_used[pileup].append(r)  ## key by the pileup dataset name, not a literal

    all_transfers = defaultdict(list)
    print len(will_be_used), "datasets that can be pre-fetched"
    ## for secondary we really need to have the campaign right
    print len(secondary_used), "pileup datasets will be used"

    datasets = will_be_used.keys()
    if options.limit:
        print "Restricting to randomly picked", options.limit
        random.shuffle(datasets)
        datasets = datasets[:options.limit]

    for dataset in datasets:
        print "\tlooking at", dataset
        #presence = getDatasetPresence(url, dataset)#, within_sites=['T2_CH_CERN'])
        ## lock all those, and pre-fetch them
        #NL.lock( dataset )
        ## we could get the reqmgr dictionary from McM if it were implemented and use standard workflowInfo !!!
        for request in will_be_used[dataset]:
            print "will be used by", request['prepid']
            campaign = request['member_of_campaign']
            ## based on the campaign, pre-fetch a site list
            sites_allowed = SI.sites_T1s + SI.sites_with_goodIO
            if options.spread:
                ## pick up the number of copies from campaign
                copies_needed = 1  ## hard coded for now
            else:
                copies_needed = 1  ## hard coded for now

            print "Will look for", copies_needed, "of", dataset
            ## figure out where it is and going
            destinations, all_block_names = getDatasetDestinations(
                url,
                dataset,
                within_sites=[SI.CE_to_SE(site) for site in sites_allowed])
            print json.dumps(destinations, indent=2)
            prim_location = [
                site for (site, info) in destinations.items()
                if info['completion'] == 100 and info['data_fraction'] == 1
            ]
            prim_destination = [
                site for site in destinations.keys()
                if not site in prim_location
            ]
            prim_destination = [
                site for site in prim_destination if not any([
                    osite.startswith(site) for osite in SI.sites_veto_transfer
                ])
            ]
            copies_needed = max(0, copies_needed - len(prim_location))
            copies_being_made = [
                sum([
                    info['blocks'].keys().count(block)
                    for site, info in destinations.items()
                    if site in prim_destination
                ]) for block in all_block_names
            ]

            prim_to_distribute = [
                site for site in sites_allowed
                if not SI.CE_to_SE(site) in prim_location
            ]
            prim_to_distribute = [
                site for site in prim_to_distribute
                if not SI.CE_to_SE(site) in prim_destination
            ]
            ## take out the ones that cannot receive transfers
            prim_to_distribute = [
                site for site in prim_to_distribute if not any([
                    osite.startswith(site) for osite in SI.sites_veto_transfer
                ])
            ]
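            ## an in-flight dataset copy only counts once every block is subscribed,
            ## hence the minimum over the per-block subscription counts below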
            copies_needed = max(0, copies_needed - min(copies_being_made))
            spreading = {}
            if copies_needed:
                print "needing", copies_needed
                chops, sizes = getDatasetChops(dataset,
                                               chop_threshold=options.chopsize)
                spreading = distributeToSites(chops,
                                              prim_to_distribute,
                                              n_copies=copies_needed,
                                              weights=SI.cpu_pledges,
                                              sizes=sizes)
            else:
                print "no copy needed for", dataset
            for (site, items) in spreading.items():
                all_transfers[site].extend(items)

    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    if not options.test:
        sendEmail(
            'dataset to be fetched',
            'the following datasets and location were figured from mcm up-coming requests\n%s'
            % (json.dumps(all_transfers, indent=2)),
            destination=['*****@*****.**'])

    ## now collect and make transfer request
    for (site, items_to_transfer) in all_transfers.iteritems():
        print "Directing at", site
        items_to_transfer = list(set(items_to_transfer))

        site_se = SI.CE_to_SE(site)
        blocks = [it for it in items_to_transfer if '#' in it]
        datasets = [it for it in items_to_transfer if not '#' in it]

        print "\t", len(blocks), "blocks"
        ## remove blocks if full dataset is sent out
        blocks = [
            block for block in blocks if not block.split('#')[0] in datasets
        ]
        blocks_dataset = list(set([block.split('#')[0] for block in blocks]))
        print "\t", len(blocks), "needed blocks for", blocks_dataset
        print "\t", len(datasets), "datasets"
        print "\t", datasets
        items_to_transfer = blocks + datasets
        total_size = 0
        for dataset in datasets:
            ds_size, _ = dss.get_block_size(dataset)
            total_size += ds_size
        for dataset in blocks_dataset:
            _, bs_size = dss.get_block_size(dataset)
            total_size += sum([s for b, s in bs_size if b in blocks])

        print "For a total of", total_size, "[GB]"

        if options.test:
            result = {'phedex': {'request_created': []}}
        else:
            ##result = makeReplicaRequest(url, site_se, items_to_transfer, 'fetching pre-production', priority='normal', approve=True)
            ## should make sure there is something in it
            pass
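
A minimal sketch of the copy accounting performed above, with an illustrative helper name: full on-site copies and complete in-flight copies are both discounted before any new transfer is requested.

def copies_still_needed(wanted, n_full_copies, per_block_subscriptions):
    ## full copies already at sites count one-for-one against the target
    wanted = max(0, wanted - n_full_copies)
    ## an in-flight copy is only complete if every block is subscribed,
    ## hence the minimum over the per-block subscription counts
    if per_block_subscriptions:
        wanted = max(0, wanted - min(per_block_subscriptions))
    return wanted

print copies_still_needed(2, 1, [1, 0, 1])  ## 1 : no complete in-flight copy yet
print copies_still_needed(2, 1, [1, 1, 2])  ## 0 : one full copy plus one complete in-flight copy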
Example No. 31
def checkor(url, spec=None, options=None):
    fDB = falseDB()

    wfs=[]
    if options.fetch:
        #workflows = getWorkflows(url, status='completed')
        #for wf in workflows:
        #    wfo = session.query(Workflow).filter(Workflow.name == wf ).first()
        #    if wfo:
        #        if not wfo.status in ['away','assistance']: continue
        #        wfs.append(wfo )
        wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
        wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() )
    else:
        ## then get all in need of assistance
        wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() )


    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()

    def get_campaign(output, wfi):
        campaign = None
        try:
            campaign = output.split('/')[2].split('-')[0]
        except:
            if 'Campaign' in wfi.request:
                campaign = wfi.request['Campaign']
        return campaign

    for wfo in wfs:
        if spec and not (spec in wfo.name): continue

        ## get info
        wfi = workflowInfo(url, wfo.name)

        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            print wfo.name,"is already",wfo.wm_status
            wfo.status = 'close'
            session.commit()
            continue
        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived']:
            ## went into trouble
            wfo.status = 'trouble'
            print wfo.name,"is in trouble",wfo.wm_status
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            print wfo.name,"not running yet"
            session.commit()
            continue
        
        
        if wfo.wm_status != 'completed':
            ## only run the closeout checks once the workflow is completed
            print "no need to check on",wfo.name,"in status",wfo.wm_status
            session.commit()
            continue

        session.commit()        
        sub_assistance="" # if that string is filled, there will be need for manual assistance

        is_closing = True
        ## do the closed-out checks one by one

        # take out DQMIO/DQM
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not '/DQM' in out]

        ## anything running on acdc
        family = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        for member in family:
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestStatus'] in ['running-opened','running-closed','assignment-approved','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                #print json.dumps(member,indent=2)
                ## hook for just waiting ...
                is_closing = False

        ## completion check
        percent_completions = {}
        event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']
        fractions_pass = {}
        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.
            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )

            fractions_pass[output] = 0.95
            c = get_campaign(output, wfi)
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]['fractionpass']
                print "overriding fraction to",fractions_pass[output],"for",output
            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

        if not all([percent_completions[out] > fractions_pass[out] for out in fractions_pass]):
            print wfo.name,"is not completed"
            print json.dumps(percent_completions, indent=2)
            print json.dumps(fractions_pass, indent=2)
            ## hook for creating automatically ACDC ?
            sub_assistance+='-recovery'
            is_closing = False

        ## check for oversized lumi sections (default limit: 300 events per lumi)
        events_per_lumi = {}
        for output in wfi.request['OutputDatasets']:
            events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 300.
            campaign = get_campaign(output, wfi)
            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign
            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
            lumi_upper_limit[output] = upper_limit
        
        if any([ events_per_lumi[out] > lumi_upper_limit[out] for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            sub_assistance+='-biglumi'
            is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        vetoed_custodial_tier = ['MINIAODSIM']
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = get_campaign(output, wfi)
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"
                        break
            ## get from the parent
            if not custodial and 'InputDataset' in wfi.request:
                parents_custodial = findCustodialLocation(url, wfi.request['InputDataset'])
                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",wfi.request['InputDataset'],"does not have custodial in the first place. abort"
                    continue

            if not custodial:
                ## pick one at random
                custodial = SI.pick_SE()

            if custodial and not sub_assistance and not acdc:
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        custodials[custodial].append( output )
            else:
                print "cannot find a custodial for",wfo.name
            is_closing = False

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )

        fraction_invalid = 0.01
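        ## tolerance: up to 1% of the files known to dbs may be invalid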
        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## hook for just waiting ...
            is_closing = False

        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## needs someone to keep an eye on the invalid files
            sub_assistance+="-invalidfiles"
            is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing:
            for output in wfi.request['OutputDatasets']:
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi( output )
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi( output )
                    except:
                        print "was not possible to get the duplicate count for",output
                        is_closing=False
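            ## the call is simply retried once, since dbs hiccups are often
            ## transient; a sketch of a more generic retry (hypothetical,
            ## not part of this code):
            ##   for attempt in range(3):
            ##       try:
            ##           duplications[output] = dbs3Client.duplicateRunLumi( output )
            ##           break
            ##       except Exception:
            ##           time.sleep(5)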

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name,"has duplicates"
                print json.dumps(duplications,indent=2)
                ## hook for making file invalidation ?
                sub_assistance+='-duplicates'
                is_closing = False 



        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = len(acdc)

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting",wfo.name,"closed-out"
            if not options.test:
                reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                # set it from away/assistance* to close
                wfo.status = 'close'
                session.commit()
        else:
            print wfo.name,"needs assistance"
            ## something still needs doing: ACDC, lumi invalidation, custodial copy, you name it
            wfo.status = 'assistance'+sub_assistance
            if not options.test:
                print "setting",wfo.name,"to",wfo.status
                session.commit()

    fDB.summary()
    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ','.join(custodials[site]),'=>',site
        if not options.test:
            result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low')
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ','.join(transfers[site]),'=>',site
        if not options.test:
            result = None
            #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations
Ejemplo n.º 32
0
def htmlor():
    cache = getWorkflows('cmsweb.cern.ch','assignment-approved', details=True)
    def wfl(wf,view=False,p=False,ms=False,within=False,ongoing=False,status=False,update=False):
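        ## renders one workflow as an html snippet; the flags toggle extras:
        ## view (growth plot), p (priority), ms (mcm status), within (input
        ## dataset links), ongoing (prodview graph), status (unified status)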
        wfn = wf.name
        wfs = wf.wm_status
        pid = None
        pids=filter(lambda seg: seg.count('-')==2, wf.name.split('_'))
        if len(pids):
            pid=pids[0]
        text=', '.join([
                #wfn,
                '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">%s</a>'%(wfn,wfn),
                '(%s) <br>'%wfs])
        text+=', '.join([
                '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">dts</a>'%wfn,
                '<a href=https://cmsweb.cern.ch/reqmgr/view/showWorkload?requestName=%s target="_blank">wkl</a>'%wfn,
                '<a href="https://cmsweb.cern.ch/couchdb/reqmgr_workload_cache/%s" target="_blank">wfc</a>'%wfn,
                '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/request?requestName=%s" target="_blank">dwkc</a>'%wfn,
                '<a href="https://cmsweb.cern.ch/reqmgr/view/splitting/%s" target="_blank">spl</a>'%wfn,
                '<a href="https://cms-pdmv.cern.ch/stats/?RN=%s" target="_blank">vw</a>'%wfn,
                '<a href="https://cms-pdmv.cern.ch/stats/restapi/get_one/%s" target="_blank">vwo</a>'%wfn,
                '<a href="https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=full&reverse=0&reverse=1&npp=20&subtext=%s&sall=q" target="_blank">elog</a>'%pid,
                '<a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank">pv</a>'%wfn,
                '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/outputDatasetsByRequestName/%s" target="_blank">out</a>'%wfn,
                '<a href="closeout.html#%s" target="_blank">clo</a>'%wfn,
                '<a href="statuses.html#%s" target="_blank">st</a>'%wfn,
                '<a href="https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s" target="_blank">perf</a>'%wfn
                ])
        if within and (not view or wfs=='completed'):
            cached = filter(lambda d : d['RequestName']==wfn, cache)
            if cached:
                wl = cached[0]
            else:
                wl = getWorkLoad('cmsweb.cern.ch',wfn)
            if 'InputDataset' in wl:
                dataset = wl['InputDataset']
                text+=', '.join(['',
                                 '<a href=https://cmsweb.cern.ch/das/request?input=%s target=_blank>input</a>'%dataset,
                                 '<a href=https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions#state=create_since=0;filter=%s target=_blank>sub</a>'%dataset,
                                 '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?dataset=%s&collapse=n target=_blank>ds</a>'%dataset,
                                 '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/blockreplicas?dataset=%s target=_blank>rep</a>'%dataset,
                                 ])

        if p:
            cached = filter(lambda d : d['RequestName']==wfn, cache)
            if cached:
                wl = cached[0]
            else:
                wl = getWorkLoad('cmsweb.cern.ch',wfn)
            text+=', (%s)'%(wl['RequestPriority'])
            pass

        if pid:
            if ms:
                mcm_s = json.loads(os.popen('curl https://cms-pdmv.cern.ch/mcm/public/restapi/requests/get_status/%s --insecure'%pid).read())[pid]
                text+=', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm (%s)</a>'%(pid,mcm_s)
            else:
                text+=', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm</a>'%(pid)
                text+=', <a href="https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s" target="_blank">ac</a>'%(pid)
                
        if status:
            if wf.status.startswith('assistance'):
                text+=', <a href="assistance.html#%s" target="_blank">assist</a>'%wfn
            text+=' : %s '%(wf.status)


        if view and wfs!='acquired':
            text+='<a href="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" target="_blank"><img src="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" style="height:50px"></a>'%(wfn.replace('_','/'),wfn.replace('_','/'))
        if ongoing:
            text+='<a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank"><img src="http://hcc-briantest.unl.edu/prodview/graphs/%s/daily" style="height:50px"></a>'%(wfn,wfn)
        text+="<hr>"
        return text


    def phl(phid):
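        ## renders a phedex request id with links to its view and its subscriptions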
        text=', '.join([
                str(phid),
                '<a href="https://cmsweb.cern.ch/phedex/prod/Request::View?request=%s" target="_blank">vw</a>'%phid,
                '<a href="https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions?reqfilter=%s" target="_blank">sub</a>'%phid,
                ])
        return text
            

    def ol(out):
        return '<a href="https://cmsweb.cern.ch/das/request?input=%s" target="_blank"> %s</a>'%(out,out)

    
    ## start to write it
    #html_doc = open('/afs/cern.ch/user/v/vlimant/public/ops/index.html','w')
    html_doc = open('/afs/cern.ch/user/c/cmst2/www/unified/index.html','w')
    print "Updating the status page ..." 
    html_doc.write("""
<html>
<head>
<META HTTP-EQUIV="refresh" CONTENT="900">
<script type="text/javascript">
 function showhide(id) {
    var e = document.getElementById(id);
    e.style.display = (e.style.display == 'block') ? 'none' : 'block';
 }
</script>
</head>
<body>

Last update on %s (CET), %s (GMT), <a href=logs/ target=_blank> logs</a> <a href=logs/last.log target=_blank>last</a> <a href=statuses.html>statuses</a> <a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/CompOpsWorkflowL3Responsibilities#Automatic_Assignment_and_Unified>what am I</a> <br><br>

""" %(time.asctime(time.localtime()),
      time.asctime(time.gmtime())))

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='considered').all():
        text+="<li> %s </li> \n"%wfl(wf,p=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""
Workflows next to handle <a href=https://cms-pdmv.cern.ch/mcm/batches?status=new&page=-1 target="_blank"> batches</a> (%d) <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/transferor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('considered')">[Click to show/hide]</a>
<br>
<div id="considered" style="display:none;">
<ul>
"""%count)
    html_doc.write(text)

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='staging').all():
        text+="<li> %s </li> \n"%wfl(wf,within=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""
Workflows waiting in staging (%d) <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staging')">[Click to show/hide]</a>
<br>
<div id="staging" style="display:none;">
<ul>
"""%count)
    html_doc.write(text)

    text=""
    count=0
    for ts in session.query(Transfer).all():
        stext='<li> %s serves </li><a href="javascript:showhide(\'%s\')">[show/hide]</a> <div id="%s" style="display:none;"><ul>'%( phl(ts.phedexid), ts.phedexid, ts.phedexid )
        hide = True
        for pid in ts.workflows_id:
            w = session.query(Workflow).get(pid)
            hide &= (w.status != 'staging' )
            stext+="<li> %s </li>\n"%( wfl(w,status=True))
        stext+="</ul></div>\n"
        if hide:
            #text+="<li> %s not needed anymore to start running (does not mean it went through completely)</li>"%phl(ts.phedexid)
            pass
        else:
            count+=1
            text+=stext
    text+="</ul></div>"
    html_doc.write("""
Transfer on-going (%d) <a href=https://transferteam.web.cern.ch/transferteam/dashboard/ target=_blank>dashboard</a> <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('transfer')">[Click to show/hide]</a>
<br>
<div id="transfer" style="display:none;">
<br>
<ul>"""%count)
    html_doc.write(text)



    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='staged').all():
        text+="<li> %s </li> \n"%wfl(wf,p=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""Worlfow ready for assigning (%d) <a href=logs/stagor/last.log target=_blank>log</a> <a href=logs/assignor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staged')">[Click to show/hide]</a>
<br>
<div id="staged" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    lines=[]
    for wf in session.query(Workflow).filter(Workflow.status=='away').all():
        lines.append("<li> %s </li>"%wfl(wf,view=True,ongoing=True))
    lines.sort()
    html_doc.write("""
Workflows on-going (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests_in_production.php target=_blank>ongoing</a> <a href=https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=summary target=_blank>elog</a> <a href=http://hcc-briantest.unl.edu/prodview target=_blank>queues</a> <a href=logs/assignor/last.log target=_blank>log</a> <a href=logs/checkor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('away')">[Click to show/hide]</a>
<br>
<div id="away" style="display:none;">
<br>
<ul>
%s
</ul>
</div>
"""%(len(lines),'\n'.join(lines)))

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status == 'assistance').all():
        text+="<li> %s </li> \n"%wfl(wf,view=True,update=True,status=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""Worlfow that are closing (%d)
<a href=closeout.html target=_blank>closeout</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('closing')">[Click to show/hide]</a>
<br>
<div id="closing" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all():
        text+="<li> %s </li> \n"%wfl(wf,view=True,within=True,status=True,update=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""Worlfow which need assistance (%d)
<a href=assistance.html target=_blank>assistance</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('assistance')">[Click to show/hide]</a>
<br>
<div id="assistance" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)
    
    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status == 'close').all():
        text+="<li> %s </li> \n"%wfl(wf)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""Worlfow ready to close (%d)
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('close')">[Click to show/hide]</a>
<br>
<div id="close" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='trouble').all():
        text+="<li> %s </li> \n"%wfl(wf)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""Worlfow with issue (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/injector/last.log target=_blank>postlog</a>
<a href="javascript:showhide('trouble')">[Click to show/hide]</a>
<br>
<div id="trouble" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)



    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='forget').all():
        text+="<li> %s </li> \n"%wfl(wf)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""
Workflows to forget (%d)
<a href="javascript:showhide('forget')">[Click to show/hide]</a>
<br>
<div id="forget" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='done').all():
        text+="<li> %s </li> \n"%wfl(wf)#,ms=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""
Workflows through (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/cleanor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('done')">[Click to show/hide]</a>
<br>
<div id="done" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='clean').all():
        text+="<li> %s </li> \n"%wfl(wf)#,ms=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""
Workflows clean for input (%d) <a href=logs/cleanor/last.log target=_blank>log</a>
<a href="javascript:showhide('clean')">[Click to show/hide]</a>
<br>
<div id="clean" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)


    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status.endswith('-out')).all():
        text+="<li> %s </li> \n"%wfl(wf,status=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""
Workflows clean for output (%d) <a href=logs/outcleanor/last.log target=_blank>log</a>
<a href="javascript:showhide('cleanout')">[Click to show/hide]</a>
<br>
<div id="cleanout" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)






    text=""
    lines_thisweek=[]
    lines_lastweek=[]
    now = time.mktime(time.gmtime())
    this_week = int(time.strftime("%W",time.gmtime()))
    for out in session.query(Output).all():
        if not out.workflow: 
            print "This is a problem with",out.datasetname
            continue
        if  out.workflow.status in ['done','clean']:
            out_week = int(time.strftime("%W",time.gmtime(out.date)))
            ##only show current week, and the previous.
            if (this_week-out_week)==1:
                lines_lastweek.append("<li>on week %s : %s </li>"%(
                        time.strftime("%W (%x %X)",time.gmtime(out.date)),
                        ol(out.datasetname),
                        )
                             )
            if (this_week-out_week)==0:
                lines_thisweek.append("<li>on week %s : %s </li>"%(
                        time.strftime("%W (%x %X)",time.gmtime(out.date)),
                        ol(out.datasetname),
                        )
                             )
    lines_thisweek.sort()
    lines_lastweek.sort()

    html_doc.write("""Output produced <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?in_disagreement=1 target=_blank>disagreements</a> (%d)
<a href="javascript:showhide('output')">[Click to show/hide]</a>
<br>
<div id="output" style="display:none;">
<br>
<ul>
<li> Last week (%d) </li><a href="javascript:showhide('output_lastweek')">[Click to show/hide]</a><div id="output_lastweek" style="display:none;"><ul>
%s
</ul></div>
<li> This week (%d) </li><a href="javascript:showhide('output_thisweek')">[Click to show/hide]</a><div id="output_thisweek" style="display:none;"><ul>
%s
</ul></div></div>
"""%( len(lines_lastweek)+len(lines_thisweek),
      len(lines_lastweek),
     '\n'.join(lines_lastweek),
      len(lines_thisweek),
     '\n'.join(lines_thisweek))
                   )

    html_doc.write("""Job installed
<a href="javascript:showhide('acron')">[Click to show/hide]</a>
<br>
<div id="acron" style="display:none;">
<br>
<pre>
%s
</pre></div>
"""%(os.popen('acrontab -l | grep Unified').read()))

    text=""
    count=0
    for (c,info) in campaignInfo().campaigns.items():
        #if 'go' in info and info['go']:
        text+="<li>%s <br> <pre>%s</pre>  </li>"%( c, json.dumps( info, indent=2))
        count+=1

    html_doc.write("""Campaign configuration
<a href="javascript:showhide('campaign')">[Click to show/hide]</a>
<br>
<div id="campaign" style="display:none;">
<br>
<ul>
%s
</ul></div>
"""%(text))

    text=""
    count=0
    n_column = 4
    SI = siteInfo()
    for t in SI.types():
        #text+="<li>%s<ul>"%t
        #for site in getattr(SI,t):
        #    text+="<li><a href=http://hcc-briantest.unl.edu/prodview/%s>%s<a/> </li>"%( site, site)
        #    text+='<a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank"><img src="http://hcc-briantest.unl.edu/prodview/graphs/%s/daily" style="height:50px"></a>'%(site,site)
        #text+="</ul></li>"
        
        text+="<li>%s<table border=1>"%t
        c=0
        for site in getattr(SI,t):
            cpu = SI.cpu_pledges[site] if site in SI.cpu_pledges else 'N/A'
            disk = SI.disk[SI.CE_to_SE(site)] if SI.CE_to_SE(site) in SI.disk else 'N/A'
            if c==0:
                text+="<tr>"
            text+='<td><a href=http://dashb-ssb.cern.ch/dashboard/templates/sitePendingRunningJobs.html?site=%s>%s</a><br><a href="http://hcc-briantest.unl.edu/prodview/%s" target="_blank"><img src="http://hcc-briantest.unl.edu/prodview/graphs/%s/daily" style="height:50px"></a><br>CPU pledge: %s<br>Disk available: %s</td>'%(site,site,site,site,cpu,disk)
            if c==n_column:
                c=0
            else:
                c+=1
        text+="</table></li>"

    html_doc.write("""Site configuration
<a href="javascript:showhide('site')">[Click to show/hide]</a>
<br>
<div id="site" style="display:none;">
<br>
<ul>
%s
</ul></div>
"""%(text))

    print "... done with status page."
    html_doc.write("""
</body>
</html>
""")

    html_doc.close()

    html_doc = open('/afs/cern.ch/user/c/cmst2/www/unified/statuses.html','w')
    html_doc.write("""                                                                                                                                                                                                                                                                                                      <html>        
<table border=1>
<thead>
<tr>
<th> workflow </th><th> status </th><th> wm status</th>
</tr>
</thead>
""")
    wfs = {}
    for wfo in session.query(Workflow).all():
        wfs[wfo.name] = (wfo.status,wfo.wm_status)
    open('/afs/cern.ch/user/c/cmst2/www/unified/statuses.json','w').write(json.dumps( wfs ))
    for wfn in sorted(wfs.keys()):
        html_doc.write('<tr><td><a id="%s">%s</a></td><td>%s</td><td>%s</td></tr>\n'%( wfn, wfn, wfs[wfn][0],  wfs[wfn][1]))
    html_doc.write("</table>")
    html_doc.write("<br>"*100)
    html_doc.write("end of page</html>")
    html_doc.close()
Ejemplo n.º 33
0
def completor(url, specific):
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']
    if use_mcm:
        mcm = McMClient(dev=False)

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()

    wfs = []
    wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
    wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() )

    ## just take it in random order so that not always the same is seen
    random.shuffle( wfs )

    ## by workflow a list of fraction / timestamps
    completions = json.loads( open('%s/completions.json'%monitor_dir).read())
    
    good_fractions = {}
    timeout = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']
        if 'force-timeout' in CI.campaigns[c]:
            timeout[c] = CI.campaigns[c]['force-timeout']

    long_lasting = {}

    overrides = getForceCompletes()
    if use_mcm:    
        ## add all workflow that mcm wants to get force completed
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        ## assuming this will be a list of actual prepids
        overrides['mcm'] = mcm_force

    print "can force complete on"
    print json.dumps( good_fractions ,indent=2)
    print json.dumps( overrides, indent=2)
    max_force = UC.get("max_force_complete")
    
    #wfs_no_location_in_GQ = set()
    #block_locations = defaultdict(lambda : defaultdict(list))
    #wfs_no_location_in_GQ = defaultdict(list)

    set_force_complete = set()
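    ## names of workflows force-completed in this pass, reported at the end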

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at",wfo.name
        ## get all of the same
        wfi = workflowInfo(url, wfo.name)
        pids = wfi.getPrepIDs()
        skip=False
        if not any([c in wfo.name for c in good_fractions]): skip=True
        for user,spec in overrides.items():

            if wfi.request['RequestStatus']!='force-complete':
                if any(s in wfo.name for s in spec) or (wfo.name in spec) or any(pid in spec for pid in pids) or any(s in pids for s in spec):
                    sendEmail('force-complete requested','%s is asking for %s to be force complete'%(user,wfo.name))
                    wfi = workflowInfo(url, wfo.name)
                    forceComplete(url , wfi )
                    skip=True
                    wfi.notifyRequestor("The workflow %s was force completed by request of %s"%(wfo.name,user), do_batch=False)
                    wfi.sendLog('completor','%s is asking for %s to be force complete'%(user,wfo.name))
                    if user == 'mcm' and use_mcm:
                        for pid in wfi.getPrepIDs():
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                    #sendEmail('completor test','not force completing automatically, you have to go back to it')
                    #skip=False
                    break
    
        if wfo.status.startswith('assistance'): skip = True

        if skip: 
            continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        if not wfi.request['RequestStatus'] in ['acquired','running-open','running-closed']: continue

        c = wfi.request['Campaign']
        if not c in good_fractions: continue
        good_fraction = good_fractions[c]
        ignore_fraction = 2.
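        ## a completion fraction essentially never reaches 2.0, so the
        ## "all is done, just wait" branch further down is effectively disabled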
        
        lumi_expected = None
        event_expected = None
        if not 'TotalInputEvents' in wfi.request: 
            if 'RequestNumEvents' in wfi.request:
                event_expected = wfi.request['RequestNumEvents']
            else:
                print "truncated, cannot do anything"
                continue
        else:
            lumi_expected = wfi.request['TotalInputLumis']
            event_expected = wfi.request['TotalInputEvents']

        now = time.mktime(time.gmtime()) / (60*60*24.)

        running_log = filter(lambda change : change["Status"] in ["running-open","running-closed"],wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            # cannot figure out when the thing started running
            continue
        then = running_log[-1]['UpdateTime'] / (60.*60.*24.)
        delay = now - then ## in days

        (w,d) = divmod(delay, 7 )
        print "\t"*int(w)+"Running since",delay,"[days] priority=",priority

        monitor_delay = 7
        allowed_delay = 14
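        ## start watching after monitor_delay days, but only force-complete
        ## after allowed_delay days (a campaign 'force-timeout' overrides it)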
        if c in timeout:
            allowed_delay = timeout[c]
            
        monitor_delay = min(monitor_delay, allowed_delay)
        ### just skip if too early
        if delay <= monitor_delay: continue

        long_lasting[wfo.name] = { "delay" : delay }

        percent_completions = {}
        for output in wfi.request['OutputDatasets']:
            if "/DQM" in output: continue ## that does not count
            if not output in completions: completions[output] = { 'injected' : None, 'checkpoints' : [], 'workflow' : wfo.name}
            ## get completion fraction
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            lumi_completion=0.
            event_completion=0.
            if lumi_expected:
                lumi_completion = lumi_count / float( lumi_expected )
            if event_expected:
                event_completion = event_count / float( event_expected )

            #take the less optimistic
            percent_completions[output] = min( lumi_completion, event_completion )
            completions[output]['checkpoints'].append( (now, event_completion ) )

        if all([percent_completions[out] >= good_fraction for out in percent_completions]):
            wfi.sendLog('completor', "all is above %s \n%s"%( good_fraction, 
                                                              json.dumps( percent_completions, indent=2 )
                                                              ))
        else:
            long_lasting[wfo.name].update({
                    'completion': sum(percent_completions.values()) / len(percent_completions),
                    'completions' : percent_completions
                    })
            
            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            wfi.sendLog('completor', "%s not over bound %s\n%s"%(percent_completions.values(), good_fraction,
                                                                 json.dumps( long_lasting[wfo.name]['agents'], indent=2) ))
            continue

        if all([percent_completions[out] >= ignore_fraction for out in percent_completions]):
            print "all is done, just wait a bit"
            continue

        for output in  percent_completions:
            completions[output]['injected'] = then

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')

        ran_at = wfi.request['SiteWhitelist']
                        
        wfi.sendLog('completor',"Required %s, time spend %s"%( cpuh, delay))
                    
        ##### WILL FORCE COMPLETE BELOW
        # only really force complete after n days

        if delay <= allowed_delay: continue
        ## find ACDCs that might be running
        if max_force>0:
            forceComplete(url, wfi )
            set_force_complete.add( wfo.name )
            print "going for force-complete of",wfo.name
            wfi.sendLog('completor','going for force completing')
            wfi.notifyRequestor("The workflow %s was force completed for running too long"% wfo.name)
            max_force -=1
        else:
            wfi.sendLog('completor',"too many completion this round, cannot force complete")

        ## do it once only for testing
        #break
    
    if set_force_complete:
        sendLog('completor','The following were set force-complete \n%s'%('\n'.join(set_force_complete)))
        #sendEmail('set force-complete', 'The following were set force-complete \n%s'%('\n'.join(set_force_complete)))
    
    open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2))
    text="These have been running for long"
    
    open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 ))

    for wf,info in sorted(long_lasting.items(), key=lambda tp:tp[1]['delay'], reverse=True):
        delay = info['delay']
        text += "\n %s : %s days"% (wf, delay)
        if 'completion' in info:
            text += " %d%%"%( info['completion']*100 )

    #if wfs_no_location_in_GQ:
    #    sendEmail('workflow with no location in GQ',"there won't be able to run anytime soon\n%s"%( '\n'.join(wfs_no_location_in_GQ)))

    #sendEmail("long lasting workflow",text)
    ## you can check the log
    print text
Ejemplo n.º 34
0
def htmlor( caller = ""):
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return 
        
    try:
        boost = json.loads(open('%s/equalizor.json'%monitor_dir).read())['modifications']
    except:
        boost = {}
    cache = getWorkflows(reqmgr_url,'assignment-approved', details=True)
    cache.extend( getWorkflows(reqmgr_url,'acquired', details=True) )
    cache.extend( getWorkflows(reqmgr_url,'running-open', details=True) )
    cache.extend( getWorkflows(reqmgr_url,'running-closed', details=True) )
    def getWL( wfn ):
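        ## look the workload up in the pre-fetched cache first, and fall back
        ## to a reqmgr query only when it is not there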
        cached = filter(lambda d : d['RequestName']==wfn, cache)
        if cached:
            wl = cached[0]
        else:
            wl = getWorkLoad(reqmgr_url,wfn)
        return wl

    def wfl(wf,view=False,p=False,ms=False,within=False,ongoing=False,status=False,update=False):
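        ## same role as in the previous example: render one workflow as an
        ## html line, with flags toggling the extra links and graphs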
        wfn = wf.name
        wfs = wf.wm_status
        wl = None
        pid = None
        wl_pid = None
        pids=filter(lambda seg: seg.count('-')==2, wf.name.split('_'))
        if len(pids):
            pids = pids[:1]
            pid=pids[0]
            
        if not pids:
            wl = getWL( wf.name )
            pids = getPrepIDs( wl )
            pid = pids[0]

        wl_pid = pid
        if 'task' in wf.name:
            wl_pid = 'task_'+pid

        
        text=', '.join([
                #wfn,
                #'<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">%s</a> '%(wfn,wfn),
                #'<table><tr><td>%s</td></tr></table>'%(wfn),
                #'<span>%s</span>'%(wfn),
                "%s "%wfn,
                '(%s) <br>'%wfs])
        text+=', '.join([
                '<a href="https://%s/reqmgr2/fetch?rid=%s" target="_blank">dts</a>'%(reqmgr_url,wfn),
                '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">dts-req1</a>'%wfn,
                #TOFIX '<a href=https://cmsweb.cern.ch/reqmgr/view/showWorkload?requestName=%s target="_blank">wkl</a>'%wfn,
                '<a href="https://%s/couchdb/reqmgr_workload_cache/%s" target="_blank">wfc</a>'%(reqmgr_url,wfn),
                '<a href="https://%s/reqmgr2/data/request?name=%s" target="_blank">req</a>'%(reqmgr_url,wfn),
                #'<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/request?requestName=%s" target="_blank">dwkc</a>'%wfn,
                #TOFIX '<a href="https://cmsweb.cern.ch/reqmgr/view/splitting/%s" target="_blank">spl</a>'%wfn,
                '<a href="https://cms-pdmv.cern.ch/stats/?RN=%s" target="_blank">vw</a>'%wfn,
                '<a href="https://cms-pdmv.cern.ch/stats/restapi/get_one/%s" target="_blank">vwo</a>'%wfn,
                '<a href="https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=full&reverse=0&reverse=1&npp=20&subtext=%s&sall=q" target="_blank">elog</a>'%pid,
                '<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank">pv</a>'%wfn,
                #deprecated '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/outputDatasetsByRequestName/%s" target="_blank">out</a>'%wfn,
                '<a href="closeout.html#%s" target="_blank">clo</a>'%wfn,
                '<a href="statuses.html#%s" target="_blank">st</a>'%wfn,
                '<a href="https://%s/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s" target="_blank">perf</a>'%(reqmgr_url,wfn)
                ])
        if within and (not view or wfs=='completed'):
            wl = getWL( wfn )
            dataset =None
            if 'InputDataset' in wl:
                dataset = wl['InputDataset']                
            if 'Task1' in wl and 'InputDataset' in wl['Task1']:
                dataset = wl['Task1']['InputDataset']

            if dataset:
                text+=', '.join(['',
                                 '<a href=https://cmsweb.cern.ch/das/request?input=%s target=_blank>input</a>'%dataset,
                                 '<a href=https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions#state=create_since=0;filter=%s target=_blank>sub</a>'%dataset,
                                 '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?dataset=%s&collapse=n target=_blank>ds</a>'%dataset,
                                 '<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/blockreplicas?dataset=%s target=_blank>rep</a>'%dataset,
                                 ])

        if p:
            cached = filter(lambda d : d['RequestName']==wfn, cache)
            if cached:
                wl = cached[0]
            else:
                wl = getWorkLoad('cmsweb.cern.ch',wfn)
            text+=', (%s)'%(wl['RequestPriority'])
            pass

        if pid:
            if ms:
                mcm_s = json.loads(os.popen('curl https://cms-pdmv.cern.ch/mcm/public/restapi/requests/get_status/%s --insecure'%pid).read())[pid]
                text+=', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm (%s)</a>'%(pid,mcm_s)
            else:
                text+=', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm</a>'%(pid)
                text+=', <a href="https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s" target="_blank">ac</a>'%(wl_pid)
                
        if status:
            if wf.status.startswith('assistance'):
                text+=', <a href="assistance.html#%s" target="_blank">assist</a>'%wfn
            text+=' : %s '%(wf.status)

        if view and wfs!='acquired':
            text+='<a href="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" target="_blank"><img src="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" style="height:50px"></a>'%(wfn.replace('_','/'),wfn.replace('_','/'))
        if ongoing:
            text+='<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a>'%(wfn,wfn)

        if ongoing:
            date1 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime(time.mktime(time.gmtime())-(15*24*60*60)) )
            date2 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime())
            text+='<a href="http://dashb-cms-job.cern.ch/dashboard/templates/web-job2/#table=Jobs&date1=%s&date2=%s&sortby=site&task=wmagent_%s">dashb</a>'%( date1, date2, wfn )

        if ongoing and wfn in boost:
            for task in boost[wfn]:
                overflow = boost[wfn][task].get('ReplaceSiteWhitelist',None)
                if not overflow:
                    overflow = boost[wfn][task].get('AddWhitelist',None)
                if overflow:
                    text+=',boost (<a href=equalizor.json>%d</a>)'%len(overflow)

        #text+="<hr>"
        return text


    def phl(phid):
        text=', '.join([
                str(phid),
                '<a href="https://cmsweb.cern.ch/phedex/prod/Request::View?request=%s" target="_blank">vw</a>'%phid,
                '<a href="https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions?reqfilter=%s" target="_blank">sub</a>'%phid,
                ])
        return text
            

    def ol(out):
        return '<a href="https://cmsweb.cern.ch/das/request?input=%s" target="_blank"> %s</a>'%(out,out)


    def lap( comment ):
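        ## simple stopwatch: prints the time spent since the previous lap() call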
        
        l = time.mktime(time.gmtime())
        spend = l-lap.start
        lap.start =l 
        print "Spend %d [s] for %s"%( spend, comment )
    lap.start = time.mktime(time.gmtime())

    ## start to write it
    #html_doc = open('/afs/cern.ch/user/v/vlimant/public/ops/index.html','w')
    html_doc = open('%s/index.html.new'%monitor_dir,'w')
    print "Updating the status page ..." 

    UC = unifiedConfiguration()

    if not caller:
        try:
            #caller = sys._getframe(1).f_code.co_name
            caller = sys.argv[0].split('/')[-1].replace('.py','')
            print "caller is"
            print caller
        except Exception as es:
            caller = 'none found'
            print "not getting frame"
            print str(es)

    html_doc.write("""
<html>
<head>
<META HTTP-EQUIV="refresh" CONTENT="900">
<script type="text/javascript">
 function showhide(id) {
    var e = document.getElementById(id);
    e.style.display = (e.style.display == 'block') ? 'none' : 'block';
 }
</script>
</head>
<body>

Last update on %s (CET), %s (GMT)
<br>
<a href=logs/ target=_blank>logs</a> <a href=logs/last.log target=_blank>last</a> <a href=statuses.html>statuses</a> <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/ target=_blank>prod mon</a> <a href=https://%s/wmstats/index.html target=_blank>wmstats</a> <a href=http://t3serv001.mit.edu/~cmsprod/IntelROCCS/Detox/SitesInfo.txt target=_blank>detox</a> <a href=locked.html>space</a> <a href=logs/subscribor/last.log target=_blank>blocks</a> <a href=logs/agents/last.log>agents</a>
<br>
<a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/CompOpsWorkflowL3Responsibilities#Automatic_Assignment_and_Unified>what am I</a> <a href=data.html>json interfaces</a> <a href=logs/addHoc/last.log>ad-hoc op</a> created from <b>%s <a href=logs/last_running>last running</a></b> <object height=20 type="text/html" data="logs/last_running"><p>backup content</p></object>
<br><br>

""" %(time.asctime(time.localtime()),
      time.asctime(time.gmtime()),
      reqmgr_url,
      caller
      )
                   )
        
    text=""
    count=0
    count_by_campaign=defaultdict(lambda : defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status.startswith('considered')).all():
        wl = getWL( wf.name )
        count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])]+=1
        #print wf.name
        text+="<li> %s </li> \n"%wfl(wf,p=True)
        count+=1
    text_by_c=""
    for c in count_by_campaign:
        text_by_c+="<li> %s (%d) : "%( c, sum(count_by_campaign[c].values()) )
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c+="%d (%d), "%(p,count_by_campaign[c][p])
        text_by_c+="</li>"

    html_doc.write("""
Workflows next to handle (%d) <a href=https://cms-pdmv.cern.ch/mcm/batches?status=new&page=-1 target="_blank"> batches</a> <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/transferor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('considered')">[Click to show/hide]</a>
<br>
<div id="considered" style="display:none;">
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('considered_bywf')">[Click to show/hide]</a><div id="considered_bywf" style="display:none;">
 <ul>
 %s
 </ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('considered_bycamp')">[Click to show/hide]</a><div id="considered_bycamp" style="display:none;">
 <ul>
 %s
 </ul></div>
</ul>
</div>
"""%(count,
     count, text,
     len(count_by_campaign), text_by_c))
                   
    lap( 'done with considered' )
    text=""
    count=0
    count_by_campaign=defaultdict(lambda : defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status=='staging').all():
        wl = getWL( wf.name )
        count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])]+=1
        text+="<li> %s </li> \n"%wfl(wf,within=True)
        count+=1

    text_by_c=""
    for c in count_by_campaign:
        text_by_c+="<li> %s (%d) : "%( c, sum(count_by_campaign[c].values()) )
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c+="%d (%d), "%(p,count_by_campaign[c][p])
        text_by_c+="</li>"


    html_doc.write("""
Workflows waiting in staging (%d) <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staging')">[Click to show/hide]</a>
<br>
<div id="staging" style="display:none;">
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('staging_bywf')">[Click to show/hide]</a><div id="staging_bywf" style="display:none;">                                                                                                                                                                       
 <ul>            
 %s
 </ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('staging_bycamp')">[Click to show/hide]</a><div id="staging_bycamp" style="display:none;">                                                                                                                                                                  
 <ul>                                                                                                                                                                                                                                                                                                                      
 %s                                                                                                                                                                                                                                                                                                                        
 </ul></div>                                                                                                                                                                                                                                                                                                               
</ul>      
</div>
"""%(count, 
     count, text,
     len(count_by_campaign), text_by_c))

    lap ( 'done with staging' )

    text="<ul>"
    count=0
    transfer_per_wf = defaultdict(list)
    for ts in session.query(Transfer).filter(Transfer.phedexid>0).all():
        hide = True
        t_count = 0
        stext=""
        for pid in ts.workflows_id:
            w = session.query(Workflow).get(pid)
            hide &= (w.status != 'staging' )
            if w.status in ['considered','staging','staged']:
                stext += "<li> %s </li>\n"%( wfl(w,status=True))
                transfer_per_wf[w].append( ts.phedexid )
                t_count +=1
        stext = '<li> %s serves %d workflows<br><a href="javascript:showhide(\'%s\')">[show/hide]</a> <div id="%s" style="display:none;"><ul>\n'%( phl(ts.phedexid), t_count, ts.phedexid, ts.phedexid ) + stext
        
        stext+="</ul></li>\n"
        if hide:
            #text+="<li> %s not needed anymore to start running (does not mean it went through completely)</li>"%phl(ts.phedexid)
            pass
        else:
            count+=1
            text+=stext
    text+="</ul>"
    
    text_bywf="<ul>"
    for wf in transfer_per_wf:
        text_bywf += "<li> %s </li>"%(wfl(wf,within=True))
        text_bywf += '<a href=javascript:showhide("transfer_%s")>[Click to show/hide] %d transfers</a>'% (wf.name, len(transfer_per_wf[wf]))
        text_bywf += '<div id="transfer_%s" style="display:none;">'% wf.name
        text_bywf += "<ul>"
        for pid in sorted(transfer_per_wf[wf]):
            text_bywf += "<li> %s </li>"%(phl(pid))
        text_bywf += "</ul></div><hr>"
    text_bywf += '</ul>'

    html_doc.write("""
Transfer on-going (%d) <a href=http://cmstransferteam.web.cern.ch/cmstransferteam/ target=_blank>dashboard</a> <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('transfer')">[Click to show/hide]</a>
<br>
<div id="transfer" style="display:none;">
 <ul>
  <li> By Workflow
    <a href="javascript:showhide('transfer_bywf')">[Click to show/hide]</a>
    <div id="transfer_bywf" style="display:none;">
%s
    </div>
  </li>
  <li> By transfer request
    <a href="javascript:showhide('transfer_byreq')">[Click to show/hide]</a>
    <div id="transfer_byreq" style="display:none;"> 
%s
    </div>
  </li>
 </ul>
</div>
"""%(count,
     text_bywf,
     text))

    lap( 'done with transfers' )

    text=""
    count=0
    count_by_campaign=defaultdict(lambda : defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status=='staged').all():
        wl = getWL( wf.name )
        count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])]+=1
        text+="<li> %s </li> \n"%wfl(wf,p=True)
        count+=1
    text_by_c=""
    for c in count_by_campaign:
        text_by_c+="<li> %s (%d) : "%( c, sum(count_by_campaign[c].values()) )
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c+="%d (%d), "%(p,count_by_campaign[c][p])
        text_by_c+="</li>"

    html_doc.write("""Worflow ready for assigning (%d) <a href=logs/stagor/last.log target=_blank>log</a> <a href=logs/assignor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staged')">[Click to show/hide]</a>
<br>
<div id="staged" style="display:none;">
<br>
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('staged_bywf')">[Click to show/hide]</a><div id="staged_bywf" style="display:none;">                                                                                                                                                                             
 <ul>                                                                                                                                                                                                                                                                                                                      
 %s                                                                                                                                                                                                                                                                                                                        
 </ul></div>                                                                                                                                                                                                                                                                                                               
<li> By campaigns (%d) </li><a href="javascript:showhide('staged_bycamp')">[Click to show/hide]</a><div id="staged_bycamp" style="display:none;">                                                                                                                                                                        
 <ul>                                                                                                                                                                                                                                                                                                                      
 %s                                                                                                                                                                                                                                                                                                                        
 </ul></div>
</ul>
</div>
"""%(count, 
     count, text,
     len(count_by_campaign), text_by_c))

    lap( 'done with staged' )
    
    lines=[]
    count_by_campaign=defaultdict(lambda : defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status=='away').all():
        wl = getWL( wf.name )
        count_by_campaign[wl['Campaign']][int(wl['RequestPriority'])]+=1
        lines.append("<li> %s <hr></li>"%wfl(wf,view=True,ongoing=True))
    text_by_c=""
    for c in count_by_campaign:
        text_by_c+="<li> %s (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/campaign.php?campaign=%s>mon</a> <a href=https://cms-pdmv.cern.ch/pmp/historical?r=%s target=_blank>pmp</a> "%( c, sum(count_by_campaign[c].values()),c,c )
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c+="%d (%d), "%(p,count_by_campaign[c][p])
        text_by_c+="</li>"

    lines.sort()
    html_doc.write("""
Workflows on-going (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests_in_production.php target=_blank>ongoing</a> <a href=https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=summary target=_blank>elog</a> <a href=http://cms-gwmsmon.cern.ch/prodview target=_blank>queues</a> <a href=logs/assignor/last.log target=_blank>log</a> <a href=logs/checkor/last.log target=_blank>postlog</a> <a href=logs/equalizor/last.log target=_blank>equ</a> <a href=logs/completor/last.log target=_blank>comp</a>
<a href="javascript:showhide('away')">[Click to show/hide]</a>
<br>
<div id="away" style="display:none;">
<ul> 
<li>By workflow (%d) </li>
<a href="javascript:showhide('away_bywf')">[Click to show/hide]</a><div id="away_bywf" style="display:none;">
<ul>
%s
</ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('away_bycamp')">[Click to show/hide]</a><div id="away_bycamp" style="display:none;">
<ul>
%s
</ul></div>
</ul>
</div>
"""%(len(lines),
     len(lines),
     '\n'.join(lines),
     len(count_by_campaign),
     text_by_c
     ))


    lap ( 'done with away' )

    text=""
    count=0
    #for wf in session.query(Workflow).filter(Workflow.status == 'assistance-custodial').all():
    for wf in session.query(Workflow).filter(Workflow.status.startswith('assistance')).filter(Workflow.status.contains('custodial')).all():
        text+="<li> %s </li> \n"%wfl(wf,view=True,update=True,status=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""Worflow that are closing (%d)
<a href=closeout.html target=_blank>closeout</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('closing')">[Click to show/hide]</a>
<br>
<div id="closing" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    lap ( 'done with closing' )

    assistance_by_type = defaultdict(list)
    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all():
        assistance_by_type[wf.status].append( wf )
        count+=1
    for assistance_type in assistance_by_type:
        text += "<li> %s (%d) <a href=\"javascript:showhide('%s')\">[Click to show/hide]</a><br><div id=\"%s\" style=\"display:none;\"><ul>"%( assistance_type,
                                                                                                                                               len(assistance_by_type[assistance_type]),
                                                                                                                                               assistance_type,
                                                                                                                                               assistance_type,
                                                                                                                                               )
        for wf in assistance_by_type[assistance_type]:
            text+="<li> %s <hr></li> \n"%wfl(wf,view=True,within=True,status=True,update=True)
        text += "</ul></div></li>\n"
    html_doc.write("""Worflow which need assistance (%d)
<a href=assistance.html target=_blank>assistance</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/recoveror/last.log target=_blank>postlog</a>
<a href="javascript:showhide('assistance')">[Click to show/hide]</a>
<br>
<div id="assistance" style="display:none;">
<br>
<ul>
%s
</ul>
</div>
"""%(count, text))
    
    lap ( 'done with assistance' )

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status == 'close').all():
        text+="<li> %s </li> \n"%wfl(wf)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""Worflow ready to close (%d)
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('close')">[Click to show/hide]</a>
<br>
<div id="close" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    lap ( 'done with announcing' )

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='trouble').all():
        text+="<li> %s </li> \n"%wfl(wf)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""Worflow with issue (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/injector/last.log target=_blank>postlog</a>
<a href="javascript:showhide('trouble')">[Click to show/hide]</a>
<br>
<div id="trouble" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    lap ( 'done with trouble' )

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='forget').all():
        text+="<li> %s </li> \n"%wfl(wf)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""
Workflows to forget (%d) <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/lockor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('forget')">[Click to show/hide]</a>
<br>
<div id="forget" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    lap ( 'done with forget' )

    text=""
    count=0
    for wf in session.query(Workflow).filter(Workflow.status=='done').all():
        text+="<li> %s </li> \n"%wfl(wf)#,ms=True)
        count+=1
    text+="</ul></div>\n"
    html_doc.write("""
Workflows through (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/lockor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('done')">[Click to show/hide]</a>
<br>
<div id="done" style="display:none;">
<br>
<ul>
"""%count)
    html_doc.write(text)

    lap ( 'done with done' )


    wfs = session.query(Workflow).filter(Workflow.status.endswith('-unlock')).all()
    html_doc.write(" Workflows unlocked : %s <a href=logs/lockor/last.log target=_blank>log</a><br>"%(len(wfs)))
    lap ( 'done with unlocked' )



    text=""
    lines_thisweek=[]
    lines_lastweek=[]
    now = time.mktime(time.gmtime())
    this_week = int(time.strftime("%W",time.gmtime()))
    start_time_two_weeks_ago = time.mktime(time.gmtime(now - (20*24*60*60))) # n.b. 20 days back, despite the name
    last_week =  int(time.strftime("%W",time.gmtime(now - ( 7*24*60*60))))

    all_locks = json.loads(open('%s/globallocks.json'%monitor_dir).read())    
    waiting_custodial = json.loads(open('%s/waiting_custodial.json'%monitor_dir).read())
    all_pending_approval_custodial = dict([(k,item) for k,item in waiting_custodial.items() if 'nodes' in item and not any([node['decided'] for node in item['nodes'].values()]) ])
    n_pending_approval = len( all_pending_approval_custodial )
    #n_pending_approval = len([item for item in waiting_custodial.values() if 'nodes' in item and not any([node['decided'] for node in item['nodes'].values() ])])
    missing_approval_custodial = json.loads(open('%s/missing_approval_custodial.json'%monitor_dir).read())

    stuck_custodial = json.loads(open('%s/stuck_custodial.json'%monitor_dir).read())
    lagging_custodial = json.loads(open('%s/lagging_custodial.json'%monitor_dir).read())
    if len(stuck_custodial):
        stuck_string = ', <font color=red>%d appear to be <a href=stuck_custodial.json>stuck</a></font>'% len(stuck_custodial)
    else:
        stuck_string = ''
    if len(missing_approval_custodial):
        long_approve_string = ', <font color=red>%d more than %d days</font>'%( len(missing_approval_custodial), UC.get('transfer_timeout'))
    else:
        long_approve_string = ''
    

    output_within_two_weeks=session.query(Output).filter(Output.date>=start_time_two_weeks_ago).all()
    waiting_custodial_string=""
    waiting_custodial_strings=[]
    for ds in waiting_custodial:
        out = None
        ## lots of it will be within two weeks
        of = filter(lambda odb: odb.datasetname == ds, output_within_two_weeks)
        if of:
            out = of[0]
        else:
            out = session.query(Output).filter(Output.datasetname == ds).first()
        if out:
            info = waiting_custodial[out.datasetname]
            action = 'going'
            if out.datasetname in all_pending_approval_custodial:
                action = '<font color=red>pending</font>'
            try:
                size = str(info['size'])
            except:
                size = "x"

            destination = ",".join(info['nodes'].keys())
            if not destination:
                destination ='<font color=red>NO SITE</font>'

            a_waiting_custodial_string = "<li>on week %s : %s %s</li>"%(
                time.strftime("%W (%x %X)",time.gmtime(out.date)),
                ol(out.datasetname),
                ' %s [GB] %s to %s on %s (<a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/requestlist?dataset=%s&node=T*MSS">%d missing</a>)'%( size, action, destination, time.asctime(time.gmtime(info['checked'])), out.datasetname, info['nmissing'])
                )
            waiting_custodial_strings.append( (out.date, a_waiting_custodial_string) )

    waiting_custodial_strings.sort( key = lambda i:i[0] )
    waiting_custodial_string="\n".join( [i[1] for i in waiting_custodial_strings] )
    #start_time_two_weeks_ago = time.mktime(time.strptime("15-0-%d"%(this_week-2), "%y-%w-%W"))
    for out in output_within_two_weeks:
        if not out.workflow: 
            print "This is a problem with",out.datasetname
            continue
        if  out.workflow.status in ['done-unlock','done','clean','clean-out','clean-unlock']:
            custodial=''
            if out.datasetname in waiting_custodial:
                info = waiting_custodial[out.datasetname]
                try:
                    try:
                        size = str(info['size'])
                    except:
                        size = "x"
                    destination = ",".join(info['nodes'].keys())
                    if not destination:
                        destination ='<font color=red>NO SITE</font>'
                    action = 'going'
                    if out.datasetname in all_pending_approval_custodial:
                        action = '<font color=red>pending</font>'

                    
                    custodial=' %s [GB] %s to %s on %s (<a href="https://cmsweb.cern.ch/phedex/datasvc/xml/prod/requestlist?dataset=%s&node=T*MSS">%d missing</a>)'%( size, action, destination, time.asctime(time.gmtime(info['checked'])), out.datasetname, info['nmissing'])
                except Exception as e:
                    #print info
                    #print str(e)
                    pass
            elif out.datasetname in all_locks:
                custodial='<font color=green>LOCKED</font>'
            out_week = int(time.strftime("%W",time.gmtime(out.date)))
            ##only show current week, and the previous.
            if last_week==out_week:
                lines_lastweek.append("<li>on week %s : %s %s</li>"%(
                        time.strftime("%W (%x %X)",time.gmtime(out.date)),
                        ol(out.datasetname),
                        custodial
                        )
                             )
            if this_week==out_week:

                lines_thisweek.append("<li>on week %s : %s %s</li>"%(
                        time.strftime("%W (%x %X)",time.gmtime(out.date)),
                        ol(out.datasetname),
                        custodial
                        )
                             )
    lines_thisweek.sort()
    lines_lastweek.sort()

    html_doc.write("""Output produced (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?in_disagreement=1 target=_blank>disagreements</a>
<a href="javascript:showhide('output')">[Click to show/hide]</a>
<br>
<div id="output" style="display:none;">
<br>
<ul>
<li> %d waiting to go to tape</li>
<ul>
<li> %d waiting for tape approval%s</li>
<li> %d are not completed after %d days%s</li>
<li> Full list (%d) <a href="javascript:showhide('waiting-custodial')">[Click to show/hide]</a>
<div id="waiting-custodial" style="display:none;">
<ul>
%s
</ul>
</div>
</li>
</ul>
<li> Last week (%d) </li><a href="javascript:showhide('output_lastweek')">[Click to show/hide]</a><div id="output_lastweek" style="display:none;"><ul>
%s
</ul></div>
<li> This week (%d) </li><a href="javascript:showhide('output_thisweek')">[Click to show/hide]</a><div id="output_thisweek" style="display:none;"><ul>
%s
</ul></div></div>
"""%( len(lines_lastweek)+len(lines_thisweek),
      len(waiting_custodial),
      n_pending_approval,long_approve_string,
      len(lagging_custodial),UC.get('transfer_timeout'),stuck_string,
      len(waiting_custodial),waiting_custodial_string,
      len(lines_lastweek),
     '\n'.join(lines_lastweek),
      len(lines_thisweek),
     '\n'.join(lines_thisweek))
                   )

    lap ( 'done with output' )


    html_doc.write("""Job installed
<a href="javascript:showhide('acron')">[Click to show/hide]</a>
<br>
<div id="acron" style="display:none;">
<br>
<pre>
%s
</pre>
"""%(os.popen('acrontab -l | grep Unified | grep -v \#').read()))


    per_module = defaultdict(list)
    for t in filter(None,os.popen('cat %s/logs/*/*.time'%monitor_dir).read().split('\n')):
        module_name,run_time,spend = t.split(':')
        ## then do what you want with it !
        if 'cleanor' in module_name: continue
        
        per_module[module_name].append( int(spend) )
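    ## each "*.time" log line apparently has the form "module:run_time:seconds";
    ## only the last field (wall-clock seconds) is accumulated per module.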

    def display_time( sec ):
        m, s = divmod(sec, 60)
        h, m = divmod(m, 60)
        dis=""
        if h:
            dis += "%d [h] "%h
        if h or m:
            dis += "%d [m] "%m
        if h or m or s:
            dis += "%d [s]"%s
            
        return dis
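    ## e.g. display_time(3725) -> "1 [h] 2 [m] 5 [s]"; display_time(0) -> ""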

    html_doc.write("Module running time<ul>\n")
    for m,spends in per_module.items():
        avg = sum(spends)/float(len(spends))
        lasttime =  spends[-1]
        html_doc.write("<li>%s : last %s, avg %s</li>\n"%( m, display_time(lasttime), display_time(avg)))
    html_doc.write("</ul>")

    html_doc.write("Last running <pre>%s</pre><br>"%( os.popen("tac %s/logs/running | head -5"%monitor_dir).read() ))


    html_doc.write("Order in cycle <pre>%s</pre><br>"%( '\n'.join(map(lambda l : l.split('/')[-1].replace('.py',''), filter(lambda l : not l.startswith('#') and 'Unified' in l and 'py' in l.split('/')[-1], open('%s/WmAgentScripts/cycle.sh'%base_dir).read().split('\n')))) ))


    html_doc.write("</div>\n")
    lap ( 'done with jobs' )


    text=""
    count=0
    for (c,info) in campaignInfo().campaigns.items():
        #if 'go' in info and info['go']:
        text+="<li>%s <br> <pre>%s</pre>  </li>"%( c, json.dumps( info, indent=2))
        count+=1

    html_doc.write("""Campaign configuration
<a href="javascript:showhide('campaign')">[Click to show/hide]</a>
<br>
<div id="campaign" style="display:none;">
<br>
<ul>
%s
</ul></div>
"""%(text))

    text=""
    count=0
    n_column = 4
    SI = siteInfo()
    date1 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime(time.mktime(time.gmtime())-(15*24*60*60)) ) ## 15 days
    date2 = time.strftime('%Y-%m-%d+%H:%M', time.gmtime())
    for t in SI.types():
        text+="<li>%s<table border=1>"%t
        c=0
        for site in getattr(SI,t):
            cpu = SI.cpu_pledges[site] if site in SI.cpu_pledges else 'N/A'
            disk = SI.disk[SI.CE_to_SE(site)] if SI.CE_to_SE(site) in SI.disk else 'N/A'
            if c==0:
                text+="<tr>"
            if not disk:
                ht_disk = '<font color=red>Disk available: %s</font>'%disk
            else:
                ht_disk = 'Disk available: %s'%disk

            text+='<td><a href=http://dashb-ssb.cern.ch/dashboard/templates/sitePendingRunningJobs.html?site=%s>%s</a><br><a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a><br><a href="http://dashb-cms-job.cern.ch/dashboard/templates/web-job2/#user=&refresh=0&table=Jobs&p=1&records=25&activemenu=1&site=%s&submissiontool=wmagent&check=submitted&sortby=activity&scale=linear&bars=20&date1=%s&date2=%s">dashb</a><br>CPU pledge: %s<br>%s</td>'%(site,site,site,site,site,date1,date2,cpu,ht_disk)
            if c==n_column:
                c=0
            else:
                c+=1
        text+="</table></li>"

    text += "<li> Sites in auto-approved transfer<ul>"
    for site in sorted(SI.sites_auto_approve):
        text+="<li>%s"% site
    text += "</ul></li>"

    text += "<li> Sites with vetoe transfer<ul>"
    for site in sorted(SI.sites_veto_transfer):
        text+="<li>%s"% site
    text += "</ul></li>"

    text += "<li> Sites banned from production<ul>"
    for site in sorted(SI.sites_banned):
        text+="<li>%s"% site
    text += "</ul></li>"

    text += "<li> Approximate Free Tape<ul>"
    for mss in SI.storage:
        waiting = 0
        try:
            waiting = float(os.popen("grep '%s is pending . Created since' %s/logs/lockor/last.log  -B 3 | grep size | awk '{ sum+=$6 ; print sum }' | tail -1" % (mss,monitor_dir)).readline())
        except Exception as e:
            print str(e)

        oldest = ""
        os.system('grep pending %s/logs/lockor/last.log | sort -u > %s/logs/pending.log'%(monitor_dir,monitor_dir))
        try:
            oldest = os.popen("grep '%s is pending . Created since ' %s/logs/lockor/last.log | sort | awk '{print $10, $11, $12, $13, $14 }' | head -1"% (mss,monitor_dir)).readline()
        except Exception as e:
            print str(e)
        waiting /= 1024.
        text+="<li>%s : %d [TB]. Waiting for approval %d [TB] since %s </li>"%(mss, SI.storage[mss], waiting, oldest)
    text += "</ul></li>"

    lap ( 'done with sites' )

    open('%s/siteInfo.json'%monitor_dir,'w').write(json.dumps(dict([(t,getattr(SI,t)) for t in SI.types()]),indent=2))

    lap ( 'done with sites json' )

    chart_data = defaultdict(list)
    for site in SI.quota:
        chart_data[site].append("""
var data_%s = google.visualization.arrayToDataTable([ 
['Overall', 'Space in TB'],
//['Quota' , %s],
['Locked' , %s],
['Free' , %s]
]);
"""%( site,
      SI.quota[site], SI.locked[site], SI.disk[site],
      ))
        chart_data[site].append("""
var chart_%s = new google.visualization.PieChart(document.getElementById('donutchart_%s'));
chart_%s.draw(data_%s, {title: '%s %s [TB]', pieHole:0.4, slices:{0:{color:'red'},1:{color:'green'}}});
"""%(site,site,
     site,site,
     site,SI.quota[site]))
        chart_data[site].append("""
<div id="donutchart_%s" style="height: 200px;width: 300px"></div>
"""%(site))

        
    ## make the locked/available donut chart
    donut_html = open('%s/locked.html'%monitor_dir,'w')
    tables = "\n".join([info[0] for site,info in chart_data.items()])
    draws = "\n".join([info[1] for site,info in chart_data.items()])
    divs = "\n".join([info[2] for site,info in chart_data.items()])

    
    divs_table="<table border=0>"
    for c,site in enumerate(sorted(chart_data.keys())):
        if c%5==0:
            divs_table += "<tr>"
        divs_table += "<td>%s</td>"%(chart_data[site][2])
    divs_table += "</table>"

    donut_html.write("""
<html>
  <head>
    <script type="text/javascript" src="https://www.google.com/jsapi"></script>
    <script type="text/javascript">
      google.load("visualization", "1", {packages:["corechart"]});
      google.setOnLoadCallback(drawChart);
      function drawChart() {
%s

%s
      }
    </script>
  </head>
  <body>
%s
  </body>
</html>
"""%( tables,draws,divs_table   )
                     )
    donut_html.close()

    html_doc.write("""Site configuration
<a href="javascript:showhide('site')">[Click to show/hide]</a>
<br>
<div id="site" style="display:none;">
<br>
<ul>
%s
</ul></div>
"""%(text))

    lap ( 'done with space' )


    text = ""
    for param in UC.configs:
        text +="<li>%s</li><ul>\n"% param
        for sub in sorted(UC.configs[param].keys()):
            text +="<li> %s : %s </li>\n"%( sub, UC.configs[param][sub] )
        text += '</ul>\n'
        
    html_doc.write("""Unified configuration
<a href="javascript:showhide('config')">[Click to show/hide]</a>
<br>
<div id="config" style="display:none;">
<br>
<ul>
%s
</ul></div>
"""%(text))

    lap ( 'done with configuration' )


    print "... done with status page."
    html_doc.write("""
</body>
</html>
""")

    html_doc.close()
    ## and put the file in place
    os.system('mv %s/index.html.new %s/index.html'%(monitor_dir,monitor_dir))

        
    statuses = json.loads(open('%s/statusmon.json'%monitor_dir).read())
    s_count = defaultdict(int)
    now = time.mktime(time.gmtime())
    for wf in session.query(Workflow).all():
        s_count[wf.status]+=1
    statuses[now] = dict( s_count )
    ## remove old entries
    for t in statuses.keys():
        if (now-float(t)) > 7*24*60*60:
            statuses.pop(t)
    open('%s/statusmon.json'%monitor_dir,'w').write( json.dumps( statuses , indent=2))

    html_doc = open('%s/statuses.html'%monitor_dir,'w')
    html_doc.write("""                                                                                                                                                                                                                                                                                                      <html>        
<table border=1>
<thead>
<tr>
<th> workflow </th><th> status </th><th> wm status</th>
</tr>
</thead>
""")
    wfs = {}
    for wfo in session.query(Workflow).all():
        wfs[wfo.name] = (wfo.status,wfo.wm_status)

    open('%s/statuses.json'%monitor_dir,'w').write(json.dumps( wfs ))
    for wfn in sorted(wfs.keys()):
        ## skip anything unlocked and consider it gone
        if 'unlock' in wfs[wfn][0]: continue
        html_doc.write('<tr><td><a id="%s">%s</a></td><td>%s</td><td>%s</td></tr>\n'%( wfn, wfn, wfs[wfn][0],  wfs[wfn][1]))
    html_doc.write("</table>")
    html_doc.write("<br>"*100)
    html_doc.write("end of page</html>")
    html_doc.close()
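
A minimal consumer sketch for the statusmon.json rolling history written above
(the {unix_time: {status: count}} layout comes from this module; the file path
and everything else here is illustrative):

import json, time
statuses = json.loads(open('statusmon.json').read())
for t in sorted(statuses, key=float):
    stamp = time.asctime(time.gmtime(float(t)))
    ## most populated status at that point in time
    busiest = max(statuses[t].items(), key=lambda kv: kv[1])
    print stamp, "->", "%s (%d)" % busiest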
Ejemplo n.º 35
0
def spawn_harvesting(url, wfi , in_full):
    SI = siteInfo()
    
    all_OK = {}
    requests = []
    outputs = wfi.request['OutputDatasets'] 
    if ('EnableHarvesting' in wfi.request and wfi.request['EnableHarvesting']) or ('DQMConfigCacheID' in wfi.request and wfi.request['DQMConfigCacheID']):
        if not 'MergedLFNBase' in wfi.request:
            print "f****d up"
            sendEmail('screwed up wl cache','%s wl cache is bad'%(wfi.request['RequestName']))
            all_OK['fake'] = False
            return all_OK,requests

        wfi = workflowInfo(url, wfi.request['RequestName'])
        dqms = [out for out in outputs if '/DQM' in out]
        if not all([in_full[dqm_input] for dqm_input in dqms]):
            wfi.sendLog('closor',"will not be able to assign the harvesting: holding up")
            for dqm_input in dqms:
                all_OK[dqm_input] = False
            return all_OK,requests

        for dqm_input in dqms:
            ## handle it properly
            harvesting_schema = {
                'Requestor': os.getenv('USER'),
                'RequestType' : 'DQMHarvest',
                'Group' : 'DATAOPS'
                }
            copy_over = ['ProcessingString',
                         'DQMUploadUrl',
                         'CMSSWVersion',
                         'CouchDBName',
                         'CouchWorkloadDBName',
                         'CouchURL',
                         'DbsUrl',
                         'inputMode',
                         'DQMConfigCacheID',
                         'OpenRunningTimeout',
                         'ScramArch',
                         'Campaign',
                         'Memory', #dummy
                         'SizePerEvent', #dummy
                         'GlobalTag', #dummy
                         ]
            for item in copy_over:
                harvesting_schema[item] = copy.deepcopy(wfi.request[item])
            harvesting_schema['InputDataset'] = dqm_input
            harvesting_schema['TimePerEvent'] = 1
            harvesting_schema['PrepID'] = 'Harvest-'+wfi.request['PrepID']
            harvesting_schema['RequestString'] = 'HARVEST-'+wfi.request['RequestString']
            harvesting_schema['DQMHarvestUnit'] = 'byRun'
            harvesting_schema['ConfigCacheUrl'] = harvesting_schema['CouchURL'] ## uhm, how stupid is that ?
            harvesting_schema['RequestPriority'] = wfi.request['RequestPriority']*10

            harvest_request = reqMgrClient.submitWorkflow(url, harvesting_schema)
            if not harvest_request:
                print "Error in making harvesting for",wfo.name
                print "schema"
                print json.dumps( harvesting_schema, indent = 2)
                harvest_request = reqMgrClient.submitWorkflow(url, harvesting_schema)
                if not harvest_request:
                    print "Error twice in harvesting for",wfo.name
                    print "schema"
                    print json.dumps( harvesting_schema, indent = 2)

            if harvest_request:
                requests.append( harvest_request )
                ## should we protect for setting approved ? no, it's notified below, assignment will fail, likely
                data = reqMgrClient.setWorkflowApproved(url, harvest_request)
                print "created",harvest_request,"for harvesting of",dqm_input
                wfi.sendLog('closor',"created %s for harvesting of %s"%( harvest_request, dqm_input))
                ## assign it directly
                team = wfi.request['Teams'][0]
                parameters={
                    'SiteWhitelist' : [SI.SE_to_CE(se) for se in wfi.request['NonCustodialSites']],
                    'AcquisitionEra' : wfi.acquisitionEra(),
                    'ProcessingString' : wfi.processingString(),
                    'MergedLFNBase' : wfi.request['MergedLFNBase'], 
                    'ProcessingVersion' : wfi.request['ProcessingVersion'],
                    'execute' : True
                    }
                if in_full[dqm_input]:
                    print "using full copy at",in_full[dqm_input]
                    parameters['SiteWhitelist'] = [SI.SE_to_CE(se) for se in in_full[dqm_input]]
                else:
                    print "cannot do anything if not having a full copy somewhere"
                    all_OK[dqm_input]=False
                    continue

                result = reqMgrClient.assignWorkflow(url, harvest_request, team, parameters)
                if not result:
                    sendEmail('harvesting request created','%s was created at announcement of %s in %s, failed to assign'%(harvest_request, dqm_input, wfi.request['RequestName']), destination=[wfi.request['Requestor']+'@cern.ch'])
                else:
                    sendEmail('harvesting request assigned','%s was created at announcement of %s in %s, and assigned'%(harvest_request, dqm_input, wfi.request['RequestName']), destination=[wfi.request['Requestor']+'@cern.ch']) 
                    wfi.sendLog('closor','%s was created at announcement of %s in %s, and assigned'%(harvest_request, dqm_input, wfi.request['RequestName']))

            else:
                print "could not make the harvesting for",wfo.name,"not announcing"
                wfi.sendLog('closor',"could not make the harvesting request")
                all_OK[dqm_input]=False                    
    return (all_OK, requests)
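
A minimal call sketch for spawn_harvesting (workflowInfo comes from the
surrounding codebase; the URL, workflow name, dataset and site below are
purely illustrative):

url = 'cmsweb.cern.ch'
wfi = workflowInfo(url, 'pdmvserv_task_EXO-RunIISummer15-00001')
## in_full maps each DQM input dataset to the sites holding a full copy
in_full = {'/SingleMuon/Run2016B-v1/DQMIO': ['T2_CH_CERN']}
all_OK, requests = spawn_harvesting(url, wfi, in_full)
if not all(all_OK.values()):
    print "harvesting held up for", [k for k, ok in all_OK.items() if not ok]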
Ejemplo n.º 36
0
def checkor(url, spec=None, options=None):
    if userLock():   return
    if duplicateLock():  return


    fDB = closeoutInfo()

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    wfs=[]
    if options.new:
        ## get all in running and check

        ## you want to intersect with what is completed !
        if options.strict:
            completed_wfi = getWorkflows(url, status='completed')
            for wfo in session.query(Workflow).filter(Workflow.status == 'away').all():
                if wfo.name in completed_wfi:
                    wfs.append( wfo )
                else:
                    print wfo.name,"is not completed"
                    sendLog('checkor','%s is not completed'%( wfo.name))
        else:
            wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )

    if options.current:
        ## recheck those already there, probably to just pass them along
        wfs.extend( session.query(Workflow).filter(Workflow.status== 'assistance').all() )

    if options.old:
        ## then get all in need for assistance
        wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance-')).all() )


    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False)

    def get_campaign(output, wfi):
        campaign = None
        try:
            campaign = output.split('/')[2].split('-')[0]
        except:
            if 'Campaign' in wfi.request:
                campaign = wfi.request['Campaign']
        return campaign

    ## retrieve bypass and onhold configuration
    bypasses = []
    forcings = []
    overrides = getForceCompletes()
    holdings = []

    for bypassor,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('prozober','*****@*****.**')]:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            #sendLog('checkor','no file %s',bypass_file)
            continue
        try:
            bypasses.extend( json.loads(open(bypass_file).read()))
        except:
            sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])
        
        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            #sendLog('checkor',"no file %s"%holding_file)
            continue
        try:
            holdings.extend( json.loads(open(holding_file).read()))
        except:
            sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])

    ## once this was force-completed, you want to bypass
    for rider,email in [('vlimant','*****@*****.**'),('jen_a','*****@*****.**'),('srimanob','*****@*****.**')]:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider)
        if not os.path.isfile(rider_file):
            print "no file",rider_file
            #sendLog('checkor',"no file %s"%rider_file)
            continue
        try:
            bypasses.extend( json.loads(open( rider_file ).read() ) )
        except:
            sendLog('checkor',"cannot get force complete list from %s"%rider)
            sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email])

    if use_mcm:
        forcings = mcm.get('/restapi/requests/forcecomplete')
        if forcings:
            sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings)))


    pattern_fraction_pass = UC.get('pattern_fraction_pass')
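    ## summary of the knobs collected above: "bypasses" skip closeout checks,
    ## "forcings"/"overrides" force-complete by prepid or keyword, "holdings"
    ## park workflows in assistance-onhold, and pattern_fraction_pass carries
    ## per-dataset completion thresholds.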

    total_running_time = 5.*60. 
    sleep_time = 1
    if len(wfs):
        sleep_time = min(max(0.5, total_running_time / len(wfs)), 10)

    random.shuffle( wfs )

    print len(wfs),"to consider, pausing for",sleep_time
    max_per_round = UC.get('max_per_round').get('checkor',None)
    if max_per_round and not spec: wfs = wfs[:max_per_round]
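    ## pace the pass so a full round takes roughly total_running_time,
    ## sleeping between 0.5s and 10s per workflow, and optionally cap the
    ## number of workflows handled per round.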

    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        time.sleep( sleep_time )
        
        ## get info
        wfi = workflowInfo(url, wfo.name)
        wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status))
        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status))
            wfo.status = 'close'
            session.commit()
            continue

        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            wfi.sendLog('checkor',"%s is not running yet"%wfo.name)
            session.commit()
            continue
        
        if '-onhold' in wfo.status:
            if wfo.name in holdings and wfo.name not in bypasses:
                wfi.sendLog('checkor',"%s is on hold"%wfo.name)
                continue

        if wfo.wm_status != 'completed': #and not wfo.name in bypasses:
            ## for sure move on with closeout check if in completed
            wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue

        if wfo.name in holdings and wfo.name not in bypasses:
            wfo.status = 'assistance-onhold'
            wfi.sendLog('checkor',"setting %s on hold"%wfo.name)
            session.commit()
            continue

        session.commit()        
        #sub_assistance="" # if that string is filled, there will be need for manual assistance
        existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance
        assistance_tags = set()

        is_closing = True

        ## get it from somewhere
        bypass_checks = False

        for bypass in bypasses:
            if bypass in wfo.name:
                wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass))
                bypass_checks = True
                break
        pids = wfi.getPrepIDs()
        force_by_mcm = False
        force_by_user = False
        for force in forcings:
            if force in pids:
                wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force))
                bypass_checks = True
                force_by_mcm = True
                break
        for user in overrides:
            for force in overrides[user]:
                if force in wfo.name:
                    wfi.sendLog('checkor',"we can bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user))
                    bypass_checks = True
                    force_by_user = True
                    break
        
        tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm*
        vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco
        campaigns = {}
        expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] )
        for out in wfi.request['OutputDatasets']:
            c = get_campaign(out, wfi)
            campaigns[out] = c
            if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]:
                vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override']))
                ## add those that we need to check for custodial copy
                tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check

        check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])]
        check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check ))
        check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) )

        wfi.sendLog('checkor', check_output_text )

        ## anything running on acdc : getting the real prepid is not worth it
        family = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        forced_already=False
        acdc_bads = []
        for member in family:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if 'OriginalRequestName' in member and member['OriginalRequestName'] != wfo.name: continue
            if member['RequestStatus'] == None: continue
            if not set(member['OutputDatasets']).issubset( set(expected_outputs)):
                if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']:
                    ##this is not good at all
                    wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] )
                    acdc_bads.append( member['RequestName'] )
                    is_closing = False
                    assistance_tags.add('manual')
                continue
            if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                ## cannot be bypassed!
                is_closing = False
                assistance_tags.add('recovering')
                if (force_by_mcm or force_by_user) and not forced_already:
                    wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name)
                    wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False)
                    forceComplete(url, wfi)
                    forced_already=True
            else:
                acdc_inactive.append( member['RequestName'] )
                assistance_tags.add('recovered')
        if acdc_bads:
            sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) ))

        ## completion check
        percent_completions = {}
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
                sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical')
        else:
            event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']):
            event_expected = int(wfi.request['RequestNumEvents'])
        elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']:
            event_expected = wfi.request['Task1']['RequestNumEvents']
            for i in range(1,20):
                if 'Task%d'%i in wfi.request:
                    ## this is wrong in absolute terms
                    if 'FilterEfficiency' in wfi.request['Task%d'%i]:
                        event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency'])
            event_expected = int(event_expected)
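            ## i.e. the expected yield is Task1's RequestNumEvents scaled by
            ## the product of every task's FilterEfficiency down the chain.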

        fractions_pass = {}
        over_100_pass = True
        (lhe,prim,_,_) = wfi.getIO()
        ## the over-completion check only makes sense for from-scratch
        ## production; disable it when there is LHE or primary input
        if lhe or prim: over_100_pass = False

        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            percent_completions[output] = 0.

            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )

            if event_expected:
                wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected ))
                percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) )

            fractions_pass[output] = 0.95
            c = campaigns[output]
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                fractions_pass[output] = CI.campaigns[c]['fractionpass']
                wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output))

            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

            for key in pattern_fraction_pass:
                if key in output:
                    fractions_pass[output] = pattern_fraction_pass[key]
                    print "overriding fraction to",fractions_pass[output],"by dataset key",key
                    

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            possible_recoveries = wfi.getRecoveryDoc()
            if possible_recoveries == []:
                wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to announcement'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))
                sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name)
                bypass_checks = True
            else:
                wfi.sendLog('checkor','%s is not completed  \n%s \n%s'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))

            ## hook for creating automatically ACDC ?
            if not bypass_checks:
                assistance_tags.add('recovery')
                is_closing = False

        if over_100_pass and any([percent_completions[out] >100 for out in fractions_pass]):
            print wfo.name,"is over completed"
            print json.dumps(percent_completions, indent=2)
            if not bypass_checks:
                assistance_tags.add('over100')
                is_closing = False

        ## correct lumi < 300 event per lumi
        events_per_lumi = {}
        for output in wfi.request['OutputDatasets']:
            events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = campaigns[output]
            #if 'EventsPerLumi' in wfi.request and 'FilterEfficiency' in wfi.request:
            #    upper_limit = 1.5*wfi.request['EventsPerLumi']*wfi.request['FilterEfficiency']
            #    print "setting the upper limit of lumisize to",upper_limit,"by request configuration"

            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
                
            lumi_upper_limit[output] = upper_limit
            if wfi.request['RequestType'] in ['ReDigi']: lumi_upper_limit[output] = -1
        
        if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            if not bypass_checks:
                assistance_tags.add('biglumi')
                is_closing = False 


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )


            
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        size_worth_checking = sum([getDatasetSize(out)/1024. for out in out_worth_checking ]) ## size in TBs of all outputs
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)
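            ## custodial site selection falls through a chain: reuse another
            ## output's custodial site -> campaign configuration -> the parent
            ## dataset's custodial site -> a random SE picked by size; each
            ## candidate is dropped if it cannot hold size_worth_checking.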

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:",custodial,"because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = campaigns[output]
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the campaign configuration custodial:",custodial,"because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            use_parent_custodial = UC.get('use_parent_custodial')
            _,prim,_,_ = wfi.getIO()
            if not custodial and prim and use_parent_custodial:
                parent_dataset = prim.pop()
                ## this is terribly dangerous to assume only 
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset )
                ###parents_custodial = findCustodialLocation(url, parent_dataset)
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",parent_dataset,"does not have custodial in the first place. abort"
                    #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset)
                    ## does not work for RAWAODSIM
                    sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset)
                    ## cannot be bypassed, this is an issue to fix
                    is_closing = False
                    pick_custodial = False
                    assistance_tags.add('parentcustodial')
                                
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:",custodial,"because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)

            if not custodial:
                print "cannot find a custodial for",wfo.name
                wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                #sendEmail( "cannot find a custodial","cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical')
                
            if custodial and (is_closing or bypass_checks):
                print "picked",custodial,"for tape copy"
                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            custodials[custodial].append( output )
                            ## let's wait and see if that's needed 
                            assistance_tags.add('custodial')
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            #cannot be bypassed


            is_closing = False

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        fraction_invalid = 0.01
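        ## closeout consistency: the DBS file count must equal invalid +
        ## phedex counts, and invalid files must stay below fraction_invalid.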
        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs,phedex mismatch"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            if not 'recovering' in assistance_tags:
                assistance_tags.add('filemismatch')
                #print this for show and tell if no recovery on-going
                for out in dbs_presence:
                    _,_,missing_phedex,missing_dbs  = getDatasetFiles(url, out)
                    if missing_phedex:
                        wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex),
                                    "\n".join( missing_phedex )))
                    if missing_dbs:
                        wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs),
                                    "\n".join( missing_dbs )))

            #if not bypass_checks:
            ## I don't think we can bypass this
            is_closing = False

        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## need to be going and taking an eye
            assistance_tags.add('invalidfiles')
            if not bypass_checks:
                #sub_assistance+="-invalidfiles"
                is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        if is_closing or bypass_checks:
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                except:
                    try:
                        duplications[output] = dbs3Client.duplicateRunLumi( output , skipInvalid=True, verbose=True)
                    except:
                        print "was not possible to get the duplicate count for",output
                        is_closing=False

            if any(duplications.values()) and not options.ignoreduplicates:
                print wfo.name,"has duplicates"
                print json.dumps(duplications,indent=2)
                ## hook for making file invalidation ?
                ## it shouldn't be allowed to bypass it
                assistance_tags.add('duplicates')
                is_closing = False 



        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
            'datasets' :{},
            'name' : wfo.name,
            'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority']
        fDB.record[wfo.name]['prepid'] = wfi.request['PrepID']

        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))
            now = time.gmtime()
            rec['timestamp'] = time.mktime(now)
            rec['updated'] = time.asctime(now)+' (GMT)'

        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            print "setting",wfo.name,"closed-out"
            if not options.test:
                if wfo.wm_status in ['closed-out','announced','normal-archived']:
                    print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer",res

                if not res in ["None",None]:
                    print "try to get the current status again"
                    wfi_bis = workflowInfo(url, wfo.name)
                    if wfi_bis.request['RequestStatus'] == 'closed-out':
                        print "the request did toggle to closed-out"
                        res = None
                    
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    
                
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                    if use_mcm and force_by_mcm:
                        ## shoot large on all prepids, on closing the wf
                        for pid in pids:
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            ## full known list of assistance tags:
            #recovering      # has an active ACDC
            #recovered       # had an inactive ACDC
            #recovery        # not over the pass bar
            #over100         # over 100%
            #biglumi         # has a big lumiblock
            #parentcustodial # the parent does not have a valid subscription yet
            #custodial       # transfer made, waiting for a valid custodial subscription to appear
            #filemismatch    # there is a dbs/phedex mismatch
            #duplicates      # a lumi section is there twice

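            ## illustrative (hypothetical) outcome of the tagging below: a workflow
            ## tagged {'recovery','filemismatch'} ends up in status
            ## 'assistance-filemismatch-recovery' via the sorted join further down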
            ## manual is not added yet; it should be set by recoveror
            print wfo.name,"was tagged with :",list(assistance_tags)
            if 'recovering' in assistance_tags:
                ## if active ACDC, being under threshold, filemismatch do not matter
                assistance_tags = assistance_tags - set(['recovery','filemismatch'])
            if 'recovery' in assistance_tags and 'recovered' in assistance_tags:
                ## do not set recovery on anything that already had an ACDC
                assistance_tags = assistance_tags - set(['recovery','recovered']) 
                ## straight to manual
                assistance_tags.add('manual')


            ## that means something needs to be done: acdc, lumi invalidation, custodial transfer, you name it
            print wfo.name,"needs assistance with",",".join( assistance_tags )
            print wfo.name,"existing conditions",",".join( existing_assistance_tags )
            
            #########################################
            ##### notification to requester #########
            go_notify=False
            if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags:
                go_notify=True
            

            if go_notify:
                #if wfo.name in already_notified:
                #    print "double notification"
                #    sendEmail('double notification','please take a look at %s'%(wfo.name))                    
                #else:
                #    already_notified.append( wfo.name )

                detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s'
                perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name)
                splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name)
                ## notify templates
                messages= {
                    'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ),
                    'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink),
                    'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    'filemismatch': 'Samples completed with inconsistency in DBS/Phedex',
                    #'manual' :                     'Workflow completed and requires manual checks by Ops',
                    }
                
                content = "The request PREPID (WORKFLOW) is facing issue in production.\n"
                motive = False
                for case in messages:
                    if case in assistance_tags:
                        content+= "\n"+messages[case]+"\n"
                        motive = True
                content += "You are invited to check, while this is being taken care of by Comp-Ops.\n"
                content += "This is an automated message from Comp-Ops.\n"

                items_notified = set()
                if use_mcm and motive:
                    wfi.notifyRequestor( content , mcm = mcm)

            #########################################


            ## logic to set the status further
            if assistance_tags:
                new_status = 'assistance-'+'-'.join(sorted(assistance_tags) )
            else:
                new_status = 'assistance'

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    print "setting",wfo.name,"to",wfo.status
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"

    #open('already_notifified.json','w').write( json.dumps( already_notified , indent=2))

    fDB.html()
    if not spec:
        #sendEmail("fresh assistance status available","Fresh status are available at https://cmst2.web.cern.ch/cmst2/unified/assistance.html",destination=['*****@*****.**'])
        #it's a bit annoying
        pass

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
    for site in custodials:
        print ','.join(custodials[site]),'=>',site
        if not options.test:
            result = makeReplicaRequest(url, site, list(set(custodials[site])),"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) )
            print result

    print "Transfers"
    print json.dumps(transfers, indent=2)
    ## replicas requests
    for site in transfers:
        print ','.join(transfers[site]),'=>',site
        if not options.test:
            result = None
            #result = makeReplicaRequest(url, site, list(set(transfers[site])),"copy to disk at production close-out")
            print result

    print "File Invalidation"
    print invalidations
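The close-out block above calls reqMgrClient.closeOutWorkflowCascade, re-reads the request status to see whether the toggle actually went through despite an error, and retries once before giving up. A minimal sketch of that retry-and-verify pattern, with hypothetical callables standing in for the reqMgr client:

def close_out_with_retry(url, name, close_out, get_status):
    ## close_out and get_status are hypothetical callables standing in for
    ## reqMgrClient.closeOutWorkflowCascade and a request-status lookup
    res = close_out(url, name)
    if res not in [None, "None"]:
        ## the call reported failure, but the toggle may still have happened
        if get_status(url, name) == 'closed-out':
            return True
        ## one retry before handing the workflow back to the next pass
        res = close_out(url, name)
    return res in [None, "None"]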
Example No. 37
def actor(url,options=None):
    
    if userLock('actor'): return
    
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return
    
    #CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    
    # Need to look at the actions page https://vocms0113.cern.ch:80/getaction (can add ?days=20) and perform any actions listed
    try:
        action_list = json.loads(os.popen('curl -s -k https://vocms0113.cern.ch:80/getaction?days=15').read())
        ## now we have a list of things that we can take action on
    except:
        print "Not able to load action list :("
        sendLog('actor','Not able to load action list', level='critical')
        return

    print action_list
    if not action_list:
        print "EMPTY!"
        return

    for wfname in action_list:
        print '-'*100
        print "Looking at",wfname,"for recovery options"

        tasks = None  ## guard: 'Parameters' may be absent from the action
        to_clone = False
        to_acdc = False
        for key in action_list[wfname]:
            if key == 'Parameters':
                tasks =  action_list[wfname][key]
            elif key == 'Action' and action_list[wfname][key] == 'acdc':
                print "Going to create ACDCs for ", wfname
                to_acdc = True
            elif key == 'Action' and action_list[wfname][key] == 'clone':
                print "Going to clone ", wfname
                to_clone = True

        if not to_acdc and not to_clone:
            sendLog('actor','Action submitted for something other than acdc and clone for workflow %s'%wfname,level='critical')
            print "Can only do acdcs and clones! Skipping workflow ",wfname
            continue
        if not tasks:
            sendLog('actor','Empty action submitted for workflow %s'%wfname,level='critical')
            print "Moving on. Parameters is blank for " + wfname
            continue

        wfi = workflowInfo(url, wfname)

        recover = True
        message_to_ops = ""
        message_to_user = ""

#===========================================================
        if to_clone and options.do:
            print "Let's try kill and clone: "
            wfi.sendLog('actor','Going to clone %s'%wfname)
            results=[]
            datasets = set(wfi.request['OutputDatasets'])

            comment=""

            if 'comment' in tasks: comment = ", reason: "+ tasks['comment']
            wfi.sendLog('actor',"invalidating the workflow by traffic controller %s"%comment)

            #Reject all workflows in the family
            #first reject the original workflow.
            reqMgrClient.invalidateWorkflow(url, wfi.request['RequestName'], current_status=wfi.request['RequestStatus'], cascade=False)
            #Then reject any ACDCs associated with that workflow
            if 'ACDCs' in action_list[wfname]:
                children = action_list[wfname]['ACDCs']
                for child in children:
                    wfi.sendLog('actor',"rejecting %s"%child)
                    wfi_acdc = workflowInfo(url, child)
                    reqMgrClient.invalidateWorkflow(url, wfi_acdc.request['RequestName'], current_status=wfi_acdc.request['RequestStatus'], cascade=False)
                    datasets.update( wfi_acdc.request['OutputDatasets'] )
            #Invalidate all associated output datasets
            for dataset in datasets:
                results.append( setDatasetStatus(dataset, 'INVALID') )

            if all(map(lambda result : result in ['None',None,True],results)):
                wfi.sendLog('actor',"%s and children are rejected"%wfname)

            cloned = None
            try:    
                cloned =  singleClone(url, wfname, tasks, comment, options.do)
            except:
                sendLog('actor','Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.'%wfname,level='critical')
                wfi.sendLog('actor','Failed to create clone for %s!'%wfname)
                remove_action(wfname)
            if not cloned:
                recover = False
                wfi.sendLog('actor','Failed to create clone for %s!'%wfname)
                sendLog('actor','Failed to create clone for %s!'%wfname,level='critical')

            else:
                wfi.sendLog('actor',"Workflow %s cloned"%wfname)


#===========================================================
        elif to_acdc:
            if 'AllSteps' in tasks:
                allTasksDefaults = tasks['AllSteps']
                tasks.pop('AllSteps')
                for setting in allTasksDefaults:
                    for task in tasks:
                        ## AllSteps values apply to every task; tasks[task] is a dict,
                        ## so the former .append fallback could never have worked
                        tasks[task][setting] = allTasksDefaults[setting]
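            ## illustrative merge (hypothetical values): AllSteps = {'memory': 4000}
            ## and tasks = {'Task1': {'sites': 'T1_US_FNAL'}} leave
            ## tasks = {'Task1': {'sites': 'T1_US_FNAL', 'memory': 4000}}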
            print "Tasks is "
            print tasks

            all_tasks = wfi.getAllTasks()

            ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves
        
            try:
                WMErr = wfi.getWMErrors()
#               print WMErr
            except:
                sendLog('actor','Cannot create ACDCs for %s because WMErr cannot be reached.'%wfname,level='critical')
                continue
            if not WMErr:
                sendLog('actor','Cannot create ACDCs for %s because WMErr is blank.'%wfname,level='critical')
                print "Moving on. WMErr is blank"
                continue

            try:
                where_to_run, missing_to_run,missing_to_run_at =  wfi.getRecoveryInfo()
                print "Where to run = "
                print where_to_run
            except:
                sendLog('actor','Cannot create ACDCs for %s because recovery info cannot be found.'%wfname,level='critical')
                print "Moving on. Cannot access recovery info for " + wfname
                continue
            if not where_to_run:
                sendLog('actor','Cannot create ACDCs for %s because site list cannot be found.'%wfname,level='critical')
                print "Moving on. where to run is blank"
                continue

            message_to_ops = ""
            message_to_user = ""
        
            num_tasks_to_recover = 0
        
            for task in WMErr:
                if 'LogCollect' in task: continue
                if 'Cleanup' in task: continue
                if 'jobfailed' not in WMErr[task]:
                    continue
                num_tasks_to_recover += 1
#                print "Task to recover: " + task

            if not num_tasks_to_recover:
                print "\tno error for",wfname
#            recover = False
        
            if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']:
                ## we do not try to recover pLHE
                sendLog('actor','Cannot create ACDCs for %s because it is a pLHE workflow.'%wfname,level='critical')
                print "We don't try to recover pLHE. Moving on."
                recover = False
        #            sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname)


#        if wfi.request['RequestType'] in ['ReReco']:
#            recover= False
#            print 'cannot submit action. ReReco'
        #   sendEmail('cannot submit action', '%s is request type ReReco'%wfname)

            recovering = set()
            for task in tasks:
                assign_to_sites = set()
                print "Task names is " + task
                fulltaskname = '/' + wfname + '/' + task
#                print "Full task name is " + fulltaskname
                wrong_task = False
                for task_info in all_tasks:
                    if fulltaskname == task_info.pathName:
                        if task_info.taskType not in ['Processing','Production','Merge']:
                            wrong_task=True
                            wfi.sendLog('actor', "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks"%( fulltaskname, task_info.taskType))
                if wrong_task:
                    continue
                print tasks[task]
                actions = tasks[task]
                for action in actions:
                    if action.startswith('sites'):
                        if type(actions[action]) != list:
                            assign_to_sites=[SI.SE_to_CE(actions[action])]
                        else:
                            assign_to_sites=list(set([SI.SE_to_CE(site) for site in actions[action]]))
#                    if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']:
#                        recover = False;
#                        print  "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname
#                        wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname)
                if not 'sites' in actions:
                    assign_to_sites = list(set([SI.SE_to_CE(site) for site in where_to_run[task]]))
                    print "Found",sorted(assign_to_sites),"as sites where to run the ACDC at, from the acdc doc of ",wfname
                print "Going to run at",sorted(assign_to_sites)
                if recover:
                    print "Initiating recovery"
                    acdc = singleRecovery(url, fulltaskname, wfi.request, actions, do = options.do)
                    if not acdc:
                        if options.do:
                            if recovering:
                                print wfname + " has been partially ACDC'ed. Needs manual attention."
                                sendLog('actor', "%s has had %s/%s recoveries %s only"%( wfname, len(recovering), num_tasks_to_recover, list(recovering)), level='critical')
                                wfi.sendLog('actor', "%s has had %s/%s recoveries %s only"%( wfname, len(recovering), num_tasks_to_recover, list(recovering)))
                                break
                            else:
                                print wfname + " failed recovery once"
                                recover = False
                                break
                        else:
                            print "no action to take further"
#                        sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical')
                            continue

                    else: #ACDC was made correctly. Now we have to assign it.
                        wfi.sendLog('actor','ACDC created for task %s. Actions taken \n%s'%(fulltaskname,list(actions)))
                        team = wfi.request['Teams'][0]
                        parameters = {
                            'SiteWhitelist' : sorted(assign_to_sites),
                            'AcquisitionEra' : wfi.acquisitionEra(),
                            'ProcessingString' : wfi.processingString(),
                            'MergedLFNBase' : wfi.request['MergedLFNBase'],
                            'ProcessingVersion' : wfi.request['ProcessingVersion'],
                        }
                        ## hackery for ACDC merge assignment
                        if wfi.request['RequestType'] == 'TaskChain' and 'Merge' in task.split('/')[-1]:
                            parameters['AcquisitionEra'] = None
                            parameters['ProcessingString'] = None

                        ## xrootd settings on primary and secondary
                        if 'xrootd' in actions:
                            if actions['xrootd'] == 'enabled':
                                print "Going to assign via xrootd"
                                parameters['TrustSitelists'] = True
                            elif actions['xrootd'] == 'disabled':
                                parameters['TrustSitelists'] = False
                        elif ('TrustSitelists' in wfi.request and wfi.request['TrustSitelists']=='true'):
                            parameters['TrustSitelists'] = True
                        else:
                            parameters['TrustSitelists'] = False

                        if 'TrustPUSitelists' in wfi.request and wfi.request['TrustPUSitelists']:
                            parameters['TrustPUSitelists'] = True

                        if options.ass:
                            print "really doing the assignment of the ACDC",acdc
                            parameters['execute']=True
                            wfi.sendLog('actor',"%s  was assigned for recovery"% acdc)
                        else:
                            print "no assignment done with this ACDC",acdc
                            sendLog('actor',"%s needs to be assigned"%(acdc), level='critical')
                            continue
 #                       print parameters
                        result = reqMgrClient.assignWorkflow(url, acdc, team, parameters)
                        if not result:
                            print acdc,"was not assigned"
                            sendLog('actor',"%s needs to be assigned"%(acdc), level='critical')
                        else:
                            recovering.add( acdc )
                        wfi.sendLog('actor',"ACDCs created for %s"%wfname)
        #===========================================================
        
        
        if recover and options.do:
            remove_action(wfname)

        if message_to_user:
            print wfname,"to be notified to user(DUMMY)",message_to_user

        if message_to_ops:
            print 'message'
            #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**'])
        #            sendLog('recoveror',message_to_ops,level='warning')



    return
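actor above fetches its to-do list by shelling out to curl; below is a minimal sketch of the same fetch done natively, assuming a Python 2 runtime (to match the print statements in this code) and that the endpoint simply returns JSON. The unverified SSL context mirrors curl's -k flag and is an assumption about the server certificate, not part of the original code.

import json
import ssl
import urllib2

def fetch_action_list(days=15):
    ## return the {workflow: action} dict from the actions page, or None
    url = 'https://vocms0113.cern.ch:80/getaction?days=%d' % days
    ctx = ssl._create_unverified_context()  ## equivalent of curl -s -k (assumption)
    try:
        return json.loads(urllib2.urlopen(url, context=ctx).read())
    except Exception as e:
        print "Not able to load action list :(", str(e)
        return None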
Example No. 38
def assignor(url, specific=None, talk=True, options=None):
    if userLock(): return
    if duplicateLock(): return
    if not componentInfo().check(): return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = siteInfo()
    NLI = newLockInfo()

    n_assigned = 0
    n_stalled = 0

    wfos = []
    if specific or options.early:
        wfos.extend(
            session.query(Workflow).filter(
                Workflow.status == 'considered').all())
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == 'staging').all())
    if specific:
        wfos.extend(
            session.query(Workflow).filter(
                Workflow.status == 'considered-tried').all())
    wfos.extend(
        session.query(Workflow).filter(Workflow.status == 'staged').all())

    dataset_endpoints = json.loads(
        open('%s/dataset_endpoints.json' % monitor_dir).read())

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')
    random.shuffle(wfos)
    for wfo in wfos:
        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)
        wfh.sendLog('assignor', "%s to be assigned" % wfo.name)

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary,
         sites_allowed) = wfh.getSiteWhiteList()

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            n_stalled += 1
            no_go = True

        allowed_secondary = set()
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                allowed_secondary.update(CI.campaigns[campaign]['secondaries'])
        if (secondary and allowed_secondary) and (
                set(secondary) & allowed_secondary != set(secondary)):
            wfh.sendLog(
                'assignor', '%s is not an allowed secondary' %
                (', '.join(set(secondary) - allowed_secondary)))
            #sendEmail('secondary not allowed','%s is not an allowed secondary'%( ', '.join(set(secondary)-allowed_secondary)))
            sendLog('assignor',
                    '%s is not an allowed secondary' %
                    (', '.join(set(secondary) - allowed_secondary)),
                    level='critical')
            if not options.go:
                n_stalled += 1
                no_go = True

        if no_go:
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request['Campaign'],
                                       'SecondaryLocation', [])

        blocks = []
        if 'BlockWhitelist' in wfh.request:
            blocks = wfh.request['BlockWhitelist']
        if 'RunWhitelist' in wfh.request and wfh.request['RunWhitelist']:
            ## augment with run white list
            for dataset in primary:
                blocks = list(
                    set(blocks + getDatasetBlocks(
                        dataset, runs=wfh.request['RunWhitelist'])))

        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None

        primary_aaa = options.primary_aaa
        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'primary_AAA' in CI.campaigns[
                    wfh.request['Campaign']]:
            primary_aaa = primary_aaa or CI.campaigns[
                wfh.request['Campaign']]['primary_AAA']
        secondary_aaa = options.secondary_aaa
        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'secondary_AAA' in CI.campaigns[
                    wfh.request['Campaign']]:
            secondary_aaa = secondary_aaa or CI.campaigns[
                wfh.request['Campaign']]['secondary_AAA']

        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                sendEmail("tempting to pass sec location check",
                          "but we cannot yet IMO")
                #pass
            if secondary_aaa:
                #just continue without checking
                continue

            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items()
                if frac > 98.
            ]
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            #sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [
                site for site in sites_allowed
                if SI.CE_to_SE(site) in one_secondary_locations
            ]
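            ## illustrative intersection (hypothetical sites): if one secondary sits at
            ## {T1_US_FNAL_Disk, T2_DE_DESY} and another at {T2_DE_DESY, T2_CH_CERN},
            ## secondary_locations shrinks to {T2_DE_DESY} and sites_allowed follows suit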

        wfh.sendLog(
            'assignor', "From secondary requirement, now Allowed%s" %
            sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy(
            sites_allowed
        )  ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = '/store/mc'  ## by default
        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor", dataset_endpoints[prim]
                endpoints.update(dataset_endpoints[prim])
            set_lfn = getLFNbase(prim)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(
                url,
                prim,
                sites=[SI.CE_to_SE(site) for site in sites_allowed],
                only_blocks=blocks)
            #sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            #sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_all_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in [
                    psite for (psite, (there, frac)) in presence.items()
                    if there
                ]
            ]
            sites_with_data = [
                site for site in sites_with_data if SI.CE_to_SE(site) in
                [psite for (psite, frac) in presence.items() if frac[1] > 90.]
            ]
            sites_with_any_data = [
                site for site in sites_with_any_data
                if SI.CE_to_SE(site) in presence.keys()
            ]
            wfh.sendLog(
                'assignor', "Holding the data but not allowed %s" % sorted(
                    list(
                        set([
                            se_site for se_site in presence.keys()
                            if not SI.SE_to_CE(se_site) in sites_allowed
                        ]))))
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in
                    list((set(secondary_locations) & set(primary_locations)) -
                         set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            elif primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site) for site in list(
                        set(primary_locations) -
                        set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            else:
                opportunistic_sites = []
            wfh.sendLog(
                'assignor', "We could be running in addition at %s" %
                sorted(opportunistic_sites))
            if any(
                [osite in SI.sites_not_ready
                 for osite in opportunistic_sites]):
                wfh.sendLog(
                    'assignor', "One of the usable sites is in downtime %s" % ([
                        osite for osite in opportunistic_sites
                        if osite in SI.sites_not_ready
                    ]))
                down_time = True
                ## should this be sent back to considered ?

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large number of CPUh %s, not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns and 'maxcopies' in CI.campaigns[
                    wfh.request['Campaign']]:
            copies_needed_from_campaign = CI.campaigns[
                wfh.request['Campaign']]['maxcopies']
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(
                1, copies_wanted -
                less_copies_than_requested)  # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog('assignor',
                    "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            not_even_once = not all([
                available >= 1. for available in available_fractions.values()
            ])
            wfh.sendLog(
                'assignor',
                "The input dataset is not available %s times, only %s" %
                (copies_wanted, available_fractions.values()))
            if down_time and not options.go and not options.early:
                wfo.status = 'considered'
                session.commit()
                wfh.sendLog(
                    'assignor',
                    "sending back to considered because of site downtime, instead of waiting"
                )
                #sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog(
                    'assignor',
                    '%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered.'
                    % (wfo.name),
                    level='delay')
                continue
                #pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open('cannot_assign.json').read())
                except:
                    pass
                if not wfo.name in known and not options.limit and not options.go and not options.early and not options.partial:
                    wfh.sendLog(
                        'assignor',
                        "cannot be assigned, %s is not sufficiently available.\n %s"
                        % (wfo.name, json.dumps(available_fractions)))
                    sendEmail(
                        "cannot be assigned",
                        "%s is not sufficiently available.\n %s" %
                        (wfo.name, json.dumps(available_fractions)))
                    known.append(wfo.name)
                    open('cannot_assign.json',
                         'w').write(json.dumps(known, indent=2))
                n_stalled += 1
                if options.early:
                    if wfo.status == 'considered':
                        wfh.sendLog('assignor', "setting considered-tried")
                        wfo.status = 'considered-tried'
                        session.commit()
                    else:
                        print "tried but status is", wfo.status
                if options.partial:
                    print "Will move on with partial locations"
                else:
                    continue

        ## default the white list back to the original white list with any data
        print "Allowed", sorted(sites_allowed)

        if primary_aaa:
            sites_allowed = initial_sites_allowed
            options.TrustSitelists = True
            wfh.sendLog(
                'assignor', "Selected to read primary through xrootd %s" %
                sorted(sites_allowed))
        else:
            sites_allowed = sites_with_any_data
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        if secondary_aaa:
            options.TrustPUSitelists = True
            wfh.sendLog(
                'assignor', "Reading secondary through xrootd from %s" %
                sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if endpoints and options.partial:
            sites_allowed = list(
                set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints]))
            print "with added endpoints", sorted(sites_allowed)

        if not len(sites_allowed):
            wfh.sendLog('assignor', "cannot be assign with no matched sites")
            sendLog('assignor',
                    '%s has no whitelist' % wfo.name,
                    level='critical')
            n_stalled += 1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith('T1')]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]
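        ## illustrative pick (hypothetical sites): with sites_allowed = ['T1_US_FNAL','T2_DE_DESY'],
        ## only the T1 storage element goes into SI.pick_dSE, so output placement
        ## prefers a T1 and falls back to any allowed site only when no T1 is whitelisted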

        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team

        if False and 'T2_CH_CERN' in parameters['SiteWhitelist']:
            ## add some check on
            ### the amount pending to HLT
            ### the size of the request
            ### the priority of the request (maybe not if we decide to overflow during runs)
            parameters['SiteWhitelist'] = ['T2_CH_CERN_HLT']
            team = 'hlt'
            ## reduce the splitting by factor of 4, regardless of type of splitting
            sendEmail("sending work to HLT",
                      "%s was assigned to HLT" % wfo.name)

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v = getattr(options, key)
                if v != None:
                    if type(v) == str and ',' in v:
                        parameters[key] = filter(None, v.split(','))
                    else:
                        parameters[key] = v

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        ## pick up campaign specific assignment parameters
        parameters.update(CI.parameters(wfh.request['Campaign']))

        if not options.test:
            parameters['execute'] = True

        split_check = wfh.checkWorkflowSplitting()
        if split_check != True:
            parameters.update(split_check)
            if 'EventBased' in split_check.values():
                wfh.sendLog('assignor', "Falling back to event splitting.")
                #sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name)
                sendLog(
                    'assignor',
                    'the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting'
                    % wfo.name,
                    level='critical')
            elif 'EventsPerJob' in split_check.values():
                wfh.sendLog('assignor',
                            "Modifying the number of events per job")
                #sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name)
                sendLog(
                    'assignor',
                    "the workflow %s is too heavy: the number of jobs would explode"
                    % wfo.name,
                    level='critical')

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
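                ## worked example (hypothetical numbers): numEvents = 10M and
                ## reqJobs = 2000 give eventsPerJob = int(1e7/2800) = 3571;
                ## with eventsPerLumi = 100 that is lumisPerJob = 35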
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl['events_per_job'] if 'events_per_job' in spl else None
                    if 'avg_events_per_job' in spl:
                        ## prefer the averaged estimate when present, without
                        ## clobbering the plain one with None (as the original
                        ## pair of ternaries did)
                        eventsPerJobEstimated = spl['avg_events_per_job']
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs flat
                        NLI.lock(secure)
                    #for site in [SI.CE_to_SE(site) for site in sites_allowed]:
                    #    for output in new_wfi.request['OutputDatasets']:
                    #        LI.lock( output, site, 'dataset in production')
                    #    for primary in prim:
                    #        LI.lock( primary, site, 'dataset used in input')
                    #    for secondary in sec:
                    #        LI.lock( secondary, site, 'required for mixing' )

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
Example No. 39
def checkor(url, spec=None, options=None):
    if userLock():   return
    if duplicateLock() and not options.go:  return

    fDB = closeoutInfo()

    UC = unifiedConfiguration()
    use_mcm = True
    up = componentInfo(mcm=use_mcm, soft=['mcm'])
    if not up.check(): return
    use_mcm = up.status['mcm']

    def time_point(label="",sub_lap=False):
        now = time.mktime(time.gmtime())
        nows = time.asctime(time.gmtime())

        print "Time check (%s) point at : %s"%(label, nows)
        print "Since start: %s [s]"% ( now - time_point.start)
        if sub_lap:
            print "Sub Lap : %s [s]"% ( now - time_point.sub_lap ) 
            time_point.sub_lap = now
        else:
            print "Lap : %s [s]"% ( now - time_point.lap ) 
            time_point.lap = now            
            time_point.sub_lap = now

    time_point.sub_lap = time_point.lap = time_point.start = time.mktime(time.gmtime())
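    ## illustrative use (hypothetical labels): time_point("got workflow list")
    ## prints the lap since the previous checkpoint, while
    ## time_point("one workflow", sub_lap=True) times the inner loop only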
    
    runnings = session.query(Workflow).filter(Workflow.status == 'away').all()
    standings = session.query(Workflow).filter(Workflow.status.startswith('assistance')).all()

    ## intersect with what is actually in completed status in request manager now
    all_completed = set(getWorkflows(url, 'completed' ))

    wfs=[]

    if options.strict:
        ## the one which were running and now have completed
        print "strict option is on: checking workflows that freshly completed"
        wfs.extend( filter(lambda wfo: wfo.name in all_completed , runnings))
    if options.update:
        print "update option is on: checking workflows that have not completed yet"
        wfs.extend( filter(lambda wfo: not wfo.name in all_completed , runnings))

    if options.clear:
        print "clear option is on: checking workflows that are ready to toggle closed-out"
        wfs.extend( filter(lambda wfo: 'custodial' in wfo.status, standings))
    if options.review:
        print "review option is on: checking the workflows that needed intervention"
        wfs.extend( filter(lambda wfo: not 'custodial' in wfo.status, standings))

    ## what is left out are the wf which were running and ended up aborted/failed/...

    

    custodials = defaultdict(list) #sites : dataset list
    transfers = defaultdict(list) #sites : dataset list
    invalidations = [] #a list of files
    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False) if use_mcm else None

    def get_campaign(output, wfi):
        ## this should be a perfect matching of output->task->campaign
        campaign = None
        era = None
        wf_campaign = None
        if 'Campaign' in wfi.request:   wf_campaign = wfi.request['Campaign']
        try:
            era = output.split('/')[2].split('-')[0]
        except:
            era = None
            
        if wfi.isRelval(): 
            campaign = wf_campaign
        else:
            campaign = era if era else wf_campaign
        return campaign
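    ## illustrative mapping (hypothetical dataset): for output
    ## '/SomePD/RunIIFall17MiniAOD-Proc-v1/MINIAODSIM' the era parses to
    ## 'RunIIFall17MiniAOD', which becomes the campaign unless the wf is a relval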

    ## retrieve bypass and onhold configuration
    bypasses = []
    forcings = []
    overrides = getForceCompletes()
    holdings = []

    
    actors = UC.get('allowed_bypass')

    for bypassor,email in actors:
        bypass_file = '/afs/cern.ch/user/%s/%s/public/ops/bypass.json'%(bypassor[0],bypassor)
        if not os.path.isfile(bypass_file):
            #sendLog('checkor','no file %s',bypass_file)
            continue
        try:
            bypasses.extend( json.loads(open(bypass_file).read()))
        except:
            sendLog('checkor',"cannot get by-passes from %s for %s"%(bypass_file ,bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(bypass_file), destination=[email])
        
        holding_file = '/afs/cern.ch/user/%s/%s/public/ops/onhold.json'%(bypassor[0],bypassor)
        if not os.path.isfile(holding_file):
            #sendLog('checkor',"no file %s"%holding_file)
            continue
        try:
            extending = json.loads(open(holding_file).read())
            print bypassor,"is holding",extending
            holdings.extend( extending )
        except:
            sendLog('checkor',"cannot get holdings from %s for %s"%(holding_file, bypassor))
            sendEmail("malformated by-pass information","%s is not json readable"%(holding_file), destination=[email])

    ## once this was force-completed, you want to bypass
    for rider,email in actors:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json'%(rider[0],rider)
        if not os.path.isfile(rider_file):
            print "no file",rider_file
            #sendLog('checkor',"no file %s"%rider_file)
            continue
        try:
            bypasses.extend( json.loads(open( rider_file ).read() ) )
        except:
            sendLog('checkor',"cannot get force complete list from %s"%rider)
            sendEmail("malformated force complet file","%s is not json readable"%rider_file, destination=[email])

    if use_mcm:
        forcings = mcm.get('/restapi/requests/forcecomplete')
        #if forcings:
        #    sendEmail('force completing mechanism','please check what checkor is doing with %s'%( ','.join(forcings)))


    pattern_fraction_pass = UC.get('pattern_fraction_pass')

    total_running_time = 5.*60. 
    sleep_time = 1
    if len(wfs):
        sleep_time = min(max(0.5, total_running_time / len(wfs)), 10)

    random.shuffle( wfs )

    in_manual = 0

    ## now you have a record of which files were invalidated globally from TT
    TMDB_invalid = dataCache.get('file_invalidation') 
    #try:
    #    TMDB_invalid = set([row[3] for row in csv.reader( os.popen('curl -s "https://docs.google.com/spreadsheets/d/11fFsDOTLTtRcI4Q3gXw0GNj4ZS8IoXMoQDC3CbOo_2o/export?format=csv"'))])
    #    TMDB_invalid = map(lambda e : e.split(':')[-1], TMDB_invalid)
    #    print len(TMDB_invalid),"globally invalidated files"
    #except Exception as e:
    #    print "TMDB not fetched"
    #    print str(e)
    #    TMDB_invalid = []


    print len(wfs),"to consider, pausing for",sleep_time
    max_per_round = UC.get('max_per_round').get('checkor',None)
    if options.limit: max_per_round=options.limit
    if max_per_round and not spec: wfs = wfs[:max_per_round]



    for wfo in wfs:
        if spec and not (spec in wfo.name): continue
        
        time.sleep( sleep_time )
        
        time_point("Starting with %s"% wfo.name)

        ## get info
        wfi = workflowInfo(url, wfo.name)
        wfi.sendLog('checkor',"checking on %s %s"%( wfo.name,wfo.status))
        ## make sure the wm status is up to date.
        # and send things back/forward if necessary.
        wfo.wm_status = wfi.request['RequestStatus']
        if wfo.wm_status == 'closed-out':
            ## manually closed-out
            wfi.sendLog('checkor',"%s is already %s, setting close"%( wfo.name , wfo.wm_status))
            wfo.status = 'close'
            session.commit()
            continue

        elif wfo.wm_status in ['failed','aborted','aborted-archived','rejected','rejected-archived','aborted-completed']:
            ## went into trouble
            wfo.status = 'trouble'
            wfi.sendLog('checkor',"%s is in trouble %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue
        elif wfo.wm_status in ['assigned','acquired']:
            ## not worth checking yet
            wfi.sendLog('checkor',"%s is not running yet"%wfo.name)
            session.commit()
            continue
        
        if '-onhold' in wfo.status:
            if wfo.name in holdings and wfo.name not in bypasses:
                wfi.sendLog('checkor',"%s is on hold"%wfo.name)
                continue

        if wfo.wm_status != 'completed': #and not wfo.name in bypasses:
            ## for sure move on with closeout check if in completed
            wfi.sendLog('checkor',"no need to check on %s in status %s"%(wfo.name, wfo.wm_status))
            session.commit()
            continue

        if wfo.name in holdings and wfo.name not in bypasses:
            wfo.status = 'assistance-onhold'
            wfi.sendLog('checkor',"setting %s on hold"%wfo.name)
            session.commit()
            continue

        session.commit()        
        #sub_assistance="" # if that string is filled, there will be need for manual assistance
        existing_assistance_tags = set(wfo.status.split('-')[1:]) #[0] should be assistance
        assistance_tags = set()

        is_closing = True

        ## get it from somewhere
        bypass_checks = False

        for bypass in bypasses:
            if bypass in wfo.name:
                wfi.sendLog('checkor',"we can bypass checks on %s because of keyword %s "%( wfo.name, bypass))
                bypass_checks = True
                break
        pids = wfi.getPrepIDs()
        force_by_mcm = False
        force_by_user = False
        for force in forcings:
            if force in pids:
                wfi.sendLog('checkor',"we can bypass checks and force complete %s because of prepid %s "%( wfo.name, force))
                bypass_checks = True
                force_by_mcm = True
                break
        for user in overrides:
            for force in overrides[user]:
                if force in wfo.name:
                    wfi.sendLog('checkor',"we can bypass checks and force complete %s because of keyword %s of user %s"%( wfo.name, force, user))
                    bypass_checks = True
                    force_by_user = True
                    break
        
        tiers_with_no_check = copy.deepcopy(UC.get('tiers_with_no_check')) # dqm*
        vetoed_custodial_tier = copy.deepcopy(UC.get('tiers_with_no_custodial')) #dqm*, reco
        to_ddm_tier = copy.deepcopy(UC.get('tiers_to_DDM'))
        campaigns = {} ## this mapping of campaign per output dataset assumes era==campaign, which is not true for relval
        expected_outputs = copy.deepcopy( wfi.request['OutputDatasets'] )
        for out in wfi.request['OutputDatasets']:
            c = get_campaign(out, wfi)
            campaigns[out] = c 
            if c in CI.campaigns and 'custodial_override' in CI.campaigns[c]:
                vetoed_custodial_tier = list(set(vetoed_custodial_tier) - set(CI.campaigns[c]['custodial_override']))
                ## add those that we need to check for custodial copy
                tiers_with_no_check = list(set(tiers_with_no_check) - set(CI.campaigns[c]['custodial_override'])) ## would remove DQM from the vetoed check

        check_output_text = "Initial outputs:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        wfi.request['OutputDatasets'] = [ out for out in wfi.request['OutputDatasets'] if not any([out.split('/')[-1] == veto_tier for veto_tier in tiers_with_no_check])]
        check_output_text += "\nWill check on:"+",".join(sorted(wfi.request['OutputDatasets'] ))
        check_output_text += "\ntiers out:"+",".join( sorted(tiers_with_no_check ))
        check_output_text += "\ntiers no custodial:"+",".join( sorted(vetoed_custodial_tier) )

        wfi.sendLog('checkor', check_output_text )

        ## anything running on acdc : getting the real prepid is not worth it
        family = getWorkflowById(url, wfi.request['PrepID'], details=True)
        acdc = []
        acdc_inactive = []
        forced_already=False
        acdc_bads = []
        true_family = []
        for member in family:
            if member['RequestType'] != 'Resubmission': continue
            if member['RequestName'] == wfo.name: continue
            if member['RequestDate'] < wfi.request['RequestDate']: continue
            if member['PrepID'] != wfi.request['PrepID'] : continue
            #if 'OriginalRequestName' in member and (not 'ACDC' in member['OriginalRequestName']) and member['OriginalRequestName'] != wfo.name: continue
            if member['RequestStatus'] == None: continue

            if not set(member['OutputDatasets']).issubset( set(expected_outputs)):
                if not member['RequestStatus'] in ['rejected-archived','rejected','aborted','aborted-archived']:
                    ##this is not good at all
                    wfi.sendLog('checkor','inconsistent ACDC %s'%member['RequestName'] )
                    #sendLog('checkor','inconsistent ACDC %s'%member['RequestName'], level='critical')
                    acdc_bads.append( member['RequestName'] )
                    is_closing = False
                    assistance_tags.add('manual')
                continue

            true_familly.append( member['RequestName'] )
            #try:
            #    parse_one(url, member['RequestName'])
            #except:
            #    print "Could not make error report for",member['RequestName']

            if member['RequestStatus'] in ['running-open','running-closed','assigned','acquired']:
                print wfo.name,"still has an ACDC running",member['RequestName']
                acdc.append( member['RequestName'] )
                ## cannot be bypassed!
                is_closing = False
                assistance_tags.add('recovering')
                if (force_by_mcm or force_by_user) and not forced_already:
                    wfi.sendLog('checkor','%s is being forced completed while recovering'%wfo.name)
                    wfi.notifyRequestor("The workflow %s was force completed"% wfo.name, do_batch=False)
                    forceComplete(url, wfi)
                    forced_already=True
            else:
                acdc_inactive.append( member['RequestName'] )
                assistance_tags.add('recovered')
        if acdc_bads:
            #sendEmail('inconsistent ACDC','for %s, ACDC %s is inconsistent, preventing from closing'%( wfo.name, ','.join(acdc_bads) ))
            sendLog('checkor','For %s, ACDC %s is inconsistent; it prevents closing and would create a mess otherwise.'%( wfo.name, ','.join(acdc_bads) ), level='critical')

        time_point("checked workflow familly", sub_lap=True)


        ## completion check
        percent_completions = {}
        if not 'TotalInputEvents' in wfi.request:
            event_expected,lumi_expected = 0,0
            if not 'recovery' in wfo.status:
                #sendEmail("missing member of the request","TotalInputEvents is missing from the workload of %s"% wfo.name, destination=['*****@*****.**'])
                sendLog('checkor',"TotalInputEvents is missing from the workload of %s"% wfo.name, level='critical')
        else:
            event_expected,lumi_expected =  wfi.request['TotalInputEvents'],wfi.request['TotalInputLumis']

        if 'RequestNumEvents' in wfi.request and int(wfi.request['RequestNumEvents']):
            event_expected = int(wfi.request['RequestNumEvents'])
        elif 'Task1' in wfi.request and 'RequestNumEvents' in wfi.request['Task1']:
            event_expected = wfi.request['Task1']['RequestNumEvents']
            for i in range(1,20):
                if 'Task%d'%i in wfi.request:
                    ## this is not quite right: it chains every task's filter efficiency
                    if 'FilterEfficiency' in wfi.request['Task%d'%i]:
                        event_expected *= float(wfi.request['Task%d'%i]['FilterEfficiency'])
            event_expected = int(event_expected)
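
        ## a minimal sketch (hypothetical helper, not part of Unified) of the
        ## expected-event count computed in the Task1 branch above: the
        ## request-level event count is scaled by the product of the tasks'
        ## filter efficiencies
        def _expected_events(request, max_tasks=20):
            expected = float(request.get('Task1', {}).get('RequestNumEvents', 0))
            for i in range(1, max_tasks):
                task = request.get('Task%d' % i, {})
                expected *= float(task.get('FilterEfficiency', 1.0))
            return int(expected)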

        fractions_pass = {}
        events_per_lumi = {}

        over_100_pass = True
        (lhe,prim,_,_) = wfi.getIO()
        ## the over-completion check is meaningless with LHE or real input
        if lhe or prim: over_100_pass = False

        time_point("execpted statistics", sub_lap=True)

        for output in wfi.request['OutputDatasets']:
            event_count,lumi_count = getDatasetEventsAndLumis(dataset=output)
            events_per_lumi[output] = event_count/float(lumi_count) if lumi_count else 100
                
            percent_completions[output] = 0.

            if lumi_expected:
                percent_completions[output] = lumi_count / float( lumi_expected )

            if event_expected:
                wfi.sendLog('checkor', "event completion real %s expected %s"%(event_count, event_expected ))
                percent_completions[output] = max(percent_completions[output], float(event_count) / float( event_expected ) )
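                ## keep the more optimistic of the lumi-based and event-based
                ## completion fractions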

            default_pass = UC.get('default_fraction_pass')
            fractions_pass[output] = default_pass
            c = campaigns[output]
            if c in CI.campaigns and 'fractionpass' in CI.campaigns[c]:
                if type(CI.campaigns[c]['fractionpass']) == dict:
                    tier = output.split('/')[-1]
                    priority = str(wfi.request['RequestPriority'])
                    ## defined per tier
                    fractions_pass[output] = CI.campaigns[c]['fractionpass'].get('all', default_pass)
                    if tier in CI.campaigns[c]['fractionpass']:
                        fractions_pass[output] = CI.campaigns[c]['fractionpass'][tier]
                    if priority in CI.campaigns[c]['fractionpass']:
                        fractions_pass[output] = CI.campaigns[c]['fractionpass'][priority]
                else:
                    fractions_pass[output] = CI.campaigns[c]['fractionpass']
                wfi.sendLog('checkor', "overriding fraction to %s for %s by campaign requirement"%( fractions_pass[output], output))

            if options.fractionpass:
                fractions_pass[output] = options.fractionpass
                print "overriding fraction to",fractions_pass[output],"by command line for",output

            for key in pattern_fraction_pass:
                if key in output:
                    fractions_pass[output] = pattern_fraction_pass[key]
                    print "overriding fraction to",fractions_pass[output],"by dataset key",key
                    

        if not all([percent_completions[out] >= fractions_pass[out] for out in fractions_pass]):
            possible_recoveries = wfi.getRecoveryDoc()
            if possible_recoveries == []:
                wfi.sendLog('checkor','%s has missing statistics \n%s \n%s, but nothing is recoverable. passing through to announcement'%(
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))
                sendLog('checkor','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name, level='critical')
                #sendEmail('nothing is recoverable','%s is not completed, but has nothing to be recovered, passing along ?'%wfo.name)#,destination=['*****@*****.**'])
                ## do not bypass for now, until Alan understands why we are losing ACDC docs
                bypass_checks = True
            else:
                wfi.sendLog('checkor','%s is not completed  \n%s \n%s'%( 
                        wfo.name, json.dumps(percent_completions, indent=2), json.dumps(fractions_pass, indent=2) ))

            ## hook for creating automatically ACDC ?
            if not bypass_checks:
                assistance_tags.add('recovery')
                is_closing = False

        if over_100_pass and any([percent_completions[out] > 1. for out in fractions_pass]): ## over 100%
            print wfo.name,"is over completed"
            print json.dumps(percent_completions, indent=2)
            if not bypass_checks:
                assistance_tags.add('over100')
                is_closing = False

        time_point("checked output size", sub_lap=True)

        ## correct lumi < 300 event per lumi
        #for output in wfi.request['OutputDatasets']:
        #events_per_lumi[output] = getDatasetEventsPerLumi( output )


        lumi_upper_limit = {}
        for output in wfi.request['OutputDatasets']:
            upper_limit = 301.
            campaign = campaigns[output]

            if campaign in CI.campaigns and 'lumisize' in CI.campaigns[campaign]:
                upper_limit = CI.campaigns[campaign]['lumisize']
                print "overriding the upper lumi size to",upper_limit,"for",campaign

            if options.lumisize:
                upper_limit = options.lumisize
                print "overriding the upper lumi size to",upper_limit,"by command line"
                
            lumi_upper_limit[output] = upper_limit
            if wfi.request['RequestType'] in ['ReDigi','ReReco']: lumi_upper_limit[output] = -1
        
        if any([ (lumi_upper_limit[out]>0 and events_per_lumi[out] >= lumi_upper_limit[out]) for out in events_per_lumi]):
            print wfo.name,"has big lumisections"
            print json.dumps(events_per_lumi, indent=2)
            ## hook for rejecting the request ?
            if not bypass_checks:
                assistance_tags.add('biglumi')
                is_closing = False 
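
        ## a minimal sketch of the big-lumisection test above: a negative limit
        ## disables the check (ReDigi/ReReco); otherwise an output is flagged
        ## when its average events-per-lumi reaches the limit
        def _has_big_lumis(events_per_lumi, limits):
            return any(limits[out] > 0 and events_per_lumi[out] >= limits[out]
                       for out in events_per_lumi)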


        any_presence = {}
        for output in wfi.request['OutputDatasets']:
            any_presence[output] = getDatasetPresence(url, output, vetoes=[])

        time_point("checked dataset presence", sub_lap=True)

        ## custodial copy
        custodial_locations = {}
        custodial_presences = {}
        for output in wfi.request['OutputDatasets']:
            custodial_presences[output] = [s for s in any_presence[output] if 'MSS' in s]
            custodial_locations[output] = phedexClient.getCustodialSubscriptionRequestSite(output)

            if not custodial_locations[output]:
                custodial_locations[output] = []

        time_point("checked custodiality", sub_lap=True)

        ## presence in phedex
        phedex_presence ={}
        for output in wfi.request['OutputDatasets']:
            phedex_presence[output] = phedexClient.getFileCountDataset(url, output )

        time_point("checked phedex count", sub_lap=True)


            
        out_worth_checking = [out for out in custodial_locations.keys() if out.split('/')[-1] not in vetoed_custodial_tier]
        size_worth_checking = sum([getDatasetSize(out)/1024. for out in out_worth_checking ]) ## size in TBs of all outputs
        size_worth_going_to_ddm = sum([getDatasetSize(out)/1024. for out in out_worth_checking if out.split('/')[-1] in to_ddm_tier ]) ## size in TBs of the outputs going to DDM
        if not all(map( lambda sites : len(sites)!=0, [custodial_locations[out] for out in out_worth_checking])):
            print wfo.name,"has not all custodial location"
            print json.dumps(custodial_locations, indent=2)

            ##########
            ## hook for making a custodial replica ?
            custodial = None
            ## get from other outputs
            for output in out_worth_checking:
                if len(custodial_locations[output]): 
                    custodial = custodial_locations[output][0]
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the other output custodial:",custodial,"because of limited space"
                custodial = None

            ## try to get it from campaign configuration
            if not custodial:
                for output in out_worth_checking:
                    campaign = campaigns[output]
                    if campaign in CI.campaigns and 'custodial' in CI.campaigns[campaign]:
                        custodial = CI.campaigns[campaign]['custodial']
                        print "Setting custodial to",custodial,"from campaign configuration"

            group = None
            ## note: 'campaign' here is whichever campaign the loops above
            ## looked at last, not necessarily the one that provided 'custodial'
            if campaign in CI.campaigns and 'phedex_group' in CI.campaigns[campaign]:
                group = CI.campaigns[campaign]['phedex_group']
                print "using group",group,"for replica"

            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the campaign configuration custodial:",custodial,"because of limited space"
                custodial = None

            ## get from the parent
            pick_custodial = True
            use_parent_custodial = UC.get('use_parent_custodial')
            tape_size_limit = options.tape_size_limit if options.tape_size_limit else UC.get("tape_size_limit")
                
            _,prim,_,_ = wfi.getIO()
            if not custodial and prim and use_parent_custodial:
                parent_dataset = prim.pop()
                ## this is terribly dangerous: it assumes there is a single input dataset
                parents_custodial = phedexClient.getCustodialSubscriptionRequestSite( parent_dataset )
                ###parents_custodial = findCustodialLocation(url, parent_dataset)
                if not parents_custodial:
                    parents_custodial = []

                if len(parents_custodial):
                    custodial = parents_custodial[0]
                else:
                    print "the input dataset",parent_dataset,"does not have custodial in the first place. abort"
                    #sendEmail( "dataset has no custodial location", "Please take a look at %s in the logs of checkor"%parent_dataset)
                    ## does not work for RAWAODSIM
                    sendLog('checkor',"Please take a look at %s for missing custodial location"% parent_dataset)
                    ## cannot be bypassed, this is an issue to fix
                    is_closing = False
                    pick_custodial = False
                    assistance_tags.add('parentcustodial')
                                
            if custodial and float(SI.storage[custodial]) < size_worth_checking:
                print "cannot use the parent custodial:",custodial,"because of limited space"
                custodial = None

            if not custodial and pick_custodial:
                ## pick one at random
                custodial = SI.pick_SE(size=size_worth_checking)


            if custodial and size_worth_going_to_ddm > tape_size_limit:
                wfi.sendLog('checkor',"The output size going to tape (%s TB) is too large for the limit set (%s TB)"%( size_worth_going_to_ddm, tape_size_limit))
                custodial = None

            if not custodial:
                print "cannot find a custodial for",wfo.name
                wfi.sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking))
                sendLog('checkor',"cannot find a custodial for %s probably because of the total output size %d"%( wfo.name, size_worth_checking), level='critical')
                
            if custodial and (is_closing or bypass_checks):
                print "picked",custodial,"for tape copy"

                ## remember how much you added this round already ; this stays locally
                SI.storage[custodial] -= size_worth_checking
                ## register the custodial request, if there are no other big issues
                for output in out_worth_checking:
                    if not len(custodial_locations[output]):
                        if phedex_presence[output]>=1:
                            wfi.sendLog('checkor','Using %s as a tape destination for %s'%(custodial, output))
                            custodials[custodial].append( output )
                            if group: custodials[custodial][-1]+='@%s'%group
                            ## let's wait and see if that's needed 
                            assistance_tags.add('custodial')
                        else:
                            print "no file in phedex for",output," not good to add to custodial requests"
            #cannot be bypassed


            is_closing = False

        time_point("determined tape location", sub_lap=True)

        ## disk copy 
        disk_copies = {}
        for output in wfi.request['OutputDatasets']:
            disk_copies[output] = [s for s in any_presence[output] if (not 'MSS' in s) and (not 'Buffer' in s)]

        if not all(map( lambda sites : len(sites)!=0, disk_copies.values())):
            print wfo.name,"has not all output on disk"
            print json.dumps(disk_copies, indent=2)


        ## presence in dbs
        dbs_presence = {}
        dbs_invalid = {}
        for output in wfi.request['OutputDatasets']:
            dbs_presence[output] = dbs3Client.getFileCountDataset( output )
            dbs_invalid[output] = dbs3Client.getFileCountDataset( output, onlyInvalid=True)

        
        time_point("dbs file count", sub_lap=True)

        if not all([dbs_presence[out] == (dbs_invalid[out]+phedex_presence[out]) for out in wfi.request['OutputDatasets']]) and not options.ignorefiles:
            mismatch_notice = wfo.name+" has a dbs,phedex mismatch\n"
            mismatch_notice += "in dbs\n"+json.dumps(dbs_presence, indent=2) +"\n"
            mismatch_notice += "invalide in dbs\n"+json.dumps(dbs_invalid, indent=2) +"\n"
            mismatch_notice += "in phedex\n"+json.dumps(phedex_presence, indent=2) +"\n"

            wfi.sendLog('checkor',mismatch_notice)
            if not 'recovering' in assistance_tags:
                assistance_tags.add('filemismatch')
                #print this for show and tell if no recovery on-going
                for out in dbs_presence:
                    _,_,missing_phedex,missing_dbs  = getDatasetFiles(url, out)
                    if missing_phedex:
                        wfi.sendLog('checkor',"These %d files are missing in phedex\n%s"%(len(missing_phedex),
                                                                                          "\n".join( missing_phedex )))
                        were_invalidated = sorted(set(missing_phedex) & set(TMDB_invalid ))
                        if were_invalidated:
                            wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated),
                                                                                                  "\n".join(were_invalidated)))
                            sendLog('checkor',"These %d files were invalidated globally\n%s\nand are invalidated in dbs"%(len(were_invalidated),
                                                                                                                          "\n".join(were_invalidated)), level='critical')
                            dbs3Client.setFileStatus( were_invalidated, newstatus=0 )
                                
                    if missing_dbs:
                        wfi.sendLog('checkor',"These %d files are missing in dbs\n%s"%(len(missing_dbs),
                                    "\n".join( missing_dbs )))
                        were_invalidated = sorted(set(missing_dbs) & set(TMDB_invalid ))
                        if were_invalidated:
                            wfi.sendLog('checkor',"These %d files were invalidated globally\n%s"%(len(were_invalidated),
                                                                                                  "\n".join(were_invalidated)))
            #if not bypass_checks:
            ## I don't think we can bypass this
            is_closing = False
        
        time_point("checked file count", sub_lap=True)

        fraction_invalid = 0.20
        if not all([(dbs_invalid[out] <= int(fraction_invalid*dbs_presence[out])) for out in wfi.request['OutputDatasets']]) and not options.ignoreinvalid:
            print wfo.name,"has a dbs invalid file level too high"
            print json.dumps(dbs_presence, indent=2)
            print json.dumps(dbs_invalid, indent=2)
            print json.dumps(phedex_presence, indent=2)
            ## needs someone to keep an eye on it
            assistance_tags.add('invalidfiles')
            if not bypass_checks:
                #sub_assistance+="-invalidfiles"
                is_closing = False

        ## put that heavy part at the end
        ## duplication check
        duplications = {}
        files_per_rl = {}
        for output in wfi.request['OutputDatasets']:
            duplications[output] = "skiped"
            files_per_rl[output] = "skiped"

        time_point("checked invalidation", sub_lap=True)

        if (is_closing or bypass_checks) and (not options.ignoreduplicates):
            print "starting duplicate checker for",wfo.name
            for output in wfi.request['OutputDatasets']:
                print "\tchecking",output
                duplications[output] = True
                try:
                    duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True)
                except Exception:
                    ## DBS queries can fail transiently; retry once before
                    ## giving up and flagging the workflow
                    try:
                        duplications[output],files_per_rl[output] = dbs3Client.duplicateRunLumiFiles( output , skipInvalid=True, verbose=True)
                    except Exception as e:
                        wfi.sendLog('checkor','Not possible to check on duplicate lumi count on %s'%(output))
                        sendLog('checkor','Not possible to check on duplicate lumi count on %s\n%s'%(output,str(e)),level='critical')
                        is_closing=False

            if is_closing and any(duplications.values()) and not options.ignoreduplicates:
                duplicate_notice = ""
                duplicate_notice += "%s has duplicates\n"%wfo.name
                duplicate_notice += json.dumps( duplications,indent=2)
                duplicate_notice += '\n'
                duplicate_notice += json.dumps( files_per_rl, indent=2)
                wfi.sendLog('checkor',duplicate_notice)
                ## hook for making file invalidation ?
                ## it shouldn't be allowed to bypass it
                assistance_tags.add('duplicates')
                is_closing = False 


        time_point("checked duplicates", sub_lap=True)

        time_point("done with %s"%wfo.name)

        ## for visualization later on
        if not wfo.name in fDB.record: 
            #print "adding",wfo.name,"to close out record"
            fDB.record[wfo.name] = {
                'datasets' : {},
                'name' : wfo.name,
                'closeOutWorkflow' : None,
            }
        fDB.record[wfo.name]['closeOutWorkflow'] = is_closing
        fDB.record[wfo.name]['priority'] = wfi.request['RequestPriority']
        fDB.record[wfo.name]['prepid'] = wfi.request['PrepID']

        for output in wfi.request['OutputDatasets']:
            if not output in fDB.record[wfo.name]['datasets']: fDB.record[wfo.name]['datasets'][output] = {}
            rec = fDB.record[wfo.name]['datasets'][output]
            #rec['percentage'] = float('%.2f'%(percent_completions[output]*100))
            rec['percentage'] = math.floor(percent_completions[output]*10000)/100.## round down
            rec['duplicate'] = duplications[output] if output in duplications else 'N/A'
            rec['phedexReqs'] = float('%.2f'%any_presence[output][custodial_presences[output][0]][1]) if len(custodial_presences[output])!=0 else 'N/A'
            rec['closeOutDataset'] = is_closing
            rec['transPerc'] = float('%.2f'%any_presence[output][ disk_copies[output][0]][1]) if len(disk_copies[output])!=0 else 'N/A'
            rec['correctLumis'] = int(events_per_lumi[output]) if (events_per_lumi[output] > lumi_upper_limit[output]) else True
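            ## correctLumis is True when the average is within the limit,
            ## otherwise the offending events-per-lumi value (as an int)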
            rec['missingSubs'] = False if len(custodial_locations[output])==0 else ','.join(list(set(custodial_locations[output])))
            rec['dbsFiles'] = dbs_presence[output]
            rec['dbsInvFiles'] = dbs_invalid[output]
            rec['phedexFiles'] = phedex_presence[output]
            rec['acdc'] = "%d / %d"%(len(acdc),len(acdc+acdc_inactive))
            rec['familly'] = true_familly
            now = time.gmtime()
            rec['timestamp'] = time.mktime(now)
            rec['updated'] = time.asctime(now)+' (GMT)'

        ## make the lumi summary 
        if wfi.request['RequestType'] == 'ReReco':
            try:
                os.system('python Unified/lumi_summary.py %s 1 > /dev/null'%(wfi.request['PrepID']))
                os.system('python Unified/lumi_plot.py %s > /dev/null'%(wfi.request['PrepID']))
                wfi.sendLog('checkor','Lumi summary available at %s/datalumi/lumi.%s.html'%(unified_url,wfi.request['PrepID']))
            except Exception as e:
                print str(e)
        ## make the error report
        
    
        ## and move on
        if is_closing:
            ## toggle status to closed-out in request manager
            wfi.sendLog('checkor',"setting %s closed-out"% wfo.name)
            if not options.test:
                if wfo.wm_status in ['closed-out','announced','normal-archived']:
                    print wfo.name,"is already",wfo.wm_status,"not trying to closed-out and assuming it does"
                    res = None
                else:
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    print "close out answer",res

                if not res in ["None",None]:
                    print "try to get the current status again"
                    wfi_bis = workflowInfo(url, wfo.name)
                    if wfi_bis.request['RequestStatus'] == 'closed-out':
                        print "the request did toggle to closed-out"
                        res = None
                    
                if not res in ["None",None]:
                    print "retrying to closing out"
                    print res
                    res = reqMgrClient.closeOutWorkflowCascade(url, wfo.name)
                    
                
                if res in [None,"None"]:
                    wfo.status = 'close'
                    session.commit()
                    if use_mcm and force_by_mcm:
                        ## force-complete all prepids when closing the workflow
                        for pid in pids:
                            mcm.delete('/restapi/requests/forcecomplete/%s'%pid)
                else:
                    print "could not close out",wfo.name,"will try again next time"
        else:
            if not 'custodial' in assistance_tags or wfi.isRelval():
                ## do only the report for those
                for member in acdc+acdc_inactive+[wfo.name]:
                    try:
                        parse_one(url, member)
                    except:
                        print "Could not make error report for",member

            ## full known list
            #recovering # has active ACDC
            ##OUT #recovered #had inactive ACDC
            #recovery #not over the pass bar
            #over100 # over 100%
            #biglumi # has a big lumiblock
            #parentcustodial # the parent does not have a valid subscription yet
            #custodial # has had the transfer made, is waiting for a valid custodial subscription to appear
            #filemismatch # there is a dbs/phedex mismatch
            #duplicates #a lumi section is there twice

            ## manual is not added yet, and should be so by recoveror
            print wfo.name,"was tagged with :",list(assistance_tags)
            if 'recovering' in assistance_tags:
                ## if active ACDC, being under threshold, filemismatch do not matter
                assistance_tags = assistance_tags - set(['recovery','filemismatch'])
            if 'recovery' in assistance_tags and 'recovered' in assistance_tags:
                ## should not set -recovery to anything that had ACDC already
                assistance_tags = assistance_tags - set(['recovery','recovered']) 
                ## straight to manual
                assistance_tags.add('manual')
                in_manual += 1
            if 'recovery' in assistance_tags and 'manual' in assistance_tags:
                ## this is likely because something bad is happening, so leave it to manual
                assistance_tags = assistance_tags - set(['recovery'])
                assistance_tags.add('manual')
                in_manual += 1

            ## that means something needs to be done: ACDC, lumi invalidation, custodial transfer, you name it
            print wfo.name,"needs assistance with",",".join( assistance_tags )
            print wfo.name,"existing conditions",",".join( existing_assistance_tags )
            
            #########################################
            ##### notification to requester #########
            go_notify=False
            if assistance_tags and not 'manual' in existing_assistance_tags and existing_assistance_tags != assistance_tags:
                go_notify=True
            

            if go_notify:
                #if wfo.name in already_notified:
                #    print "double notification"
                #    sendEmail('double notification','please take a look at %s'%(wfo.name))                    
                #else:
                #    already_notified.append( wfo.name )

                ###detailslink = 'https://cmsweb.cern.ch/reqmgr/view/details/%s'
                #detailslink = 'https://cmsweb.cern.ch/reqmgr2/fetch?rid=%s'%(wfo.name)
                ###perflink = 'https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s'%(wfo.name)
                perflink = '%s/report/%s'%(unified_url,wfo.name)
                splitlink = 'https://cmsweb.cern.ch/reqmgr/view/splitting/%s'%(wfo.name)
                ## notify templates
                messages= {
                    'recovery': 'Samples completed with missing statistics:\n%s\n%s '%( '\n'.join(['%.2f %% complete for %s'%(percent_completions[output]*100, output) for output in wfi.request['OutputDatasets'] ] ), perflink ),
                    'biglumi': 'Samples completed with large luminosity blocks:\n%s\n%s '%('\n'.join(['%d > %d for %s'%(events_per_lumi[output], lumi_upper_limit[output], output) for output in wfi.request['OutputDatasets'] if (events_per_lumi[output] > lumi_upper_limit[output])]), splitlink),
                    'duplicates': 'Samples completed with duplicated luminosity blocks:\n%s\n'%( '\n'.join(['%s'%output for output in wfi.request['OutputDatasets'] if output in duplications and duplications[output] ] ) ),
                    'filemismatch': 'Samples completed with inconsistency in DBS/Phedex',
                    #'manual' :                     'Workflow completed and requires manual checks by Ops',
                    }
                
                content = "The request PREPID (WORKFLOW) is facing issue in production.\n"
                motive = False
                for case in messages:
                    if case in assistance_tags:
                        content+= "\n"+messages[case]+"\n"
                        motive = True
                content += "You are invited to check, while this is being taken care of by Comp-Ops.\n"
                content += "This is an automated message from Comp-Ops.\n"

                items_notified = set()
                if use_mcm and motive:
                    wfi.notifyRequestor( content , mcm = mcm)

            #########################################


            ## logic to set the status further
            if assistance_tags:
                new_status = 'assistance-'+'-'.join(sorted(assistance_tags) )
            else:
                new_status = 'assistance'

            ## case where the workflow was in manual from recoveror
            if not 'manual' in wfo.status or new_status!='assistance-recovery':
                wfo.status = new_status
                if not options.test:
                    wfi.sendLog('checkor','setting %s to %s'%(wfo.name, wfo.status))
                    session.commit()
            else:
                print "current status is",wfo.status,"not changing to anything"

    #open('already_notified.json','w').write( json.dumps( already_notified , indent=2))

    fDB.html()
    if not spec and in_manual!=0:
        sendEmail("fresh assistance status available","Fresh status are available at %s/assistance.html"%unified_url,destination=['*****@*****.**'])
        #it's a bit annoying
        pass

    ## custodial requests
    print "Custodials"
    print json.dumps(custodials, indent=2)
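    ## custodials maps a tape endpoint to "dataset[@group]" entries; entries
    ## without an explicit "@group" suffix default to the DataOps group below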
    for site in custodials:
        items_at = defaultdict(set)
        for i in custodials[site]:
            item, group = i.split('@') if '@' in i else (i,'DataOps')
            items_at[group].add( item )
        for group,items in items_at.items():
            print ','.join(items),'=>',site,'@',group
            if not options.test:
                result = makeReplicaRequest(url, site, sorted(items) ,"custodial copy at production close-out",custodial='y',priority='low', approve = (site in SI.sites_auto_approve) , group=group)
                print result

    print "File Invalidation"
    print invalidations
Ejemplo n.º 40
0
def htmlor(caller=""):
    cache = getWorkflows("cmsweb.cern.ch", "assignment-approved", details=True)
    cache.extend(getWorkflows("cmsweb.cern.ch", "running-open", details=True))
    cache.extend(getWorkflows("cmsweb.cern.ch", "running-closed", details=True))

    def getWL(wfn):
        cached = filter(lambda d: d["RequestName"] == wfn, cache)
        if cached:
            wl = cached[0]
        else:
            wl = getWorkLoad("cmsweb.cern.ch", wfn)
        return wl
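    ## getWL serves workloads from the pre-fetched cache when possible and only
    ## falls back to a direct reqmgr query for workflows not in the cache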

    def wfl(wf, view=False, p=False, ms=False, within=False, ongoing=False, status=False, update=False):
        wfn = wf.name
        wfs = wf.wm_status
        wl = None
        pid = None
        pids = filter(lambda seg: seg.count("-") == 2, wf.name.split("_"))
        if len(pids):
            pids = pids[:1]
            pid = pids[0]

        if not pids:
            wl = getWL(wf.name)
            pids = getPrepIDs(wl)
            pid = pids[0]

        text = ", ".join(
            [
                # wfn,
                '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">%s</a>' % (wfn, wfn),
                "(%s) <br>" % wfs,
            ]
        )
        text += ", ".join(
            [
                '<a href="https://cmsweb.cern.ch/reqmgr/view/details/%s" target="_blank">dts</a>' % wfn,
                '<a href=https://cmsweb.cern.ch/reqmgr/view/showWorkload?requestName=%s target="_blank">wkl</a>' % wfn,
                '<a href="https://cmsweb.cern.ch/couchdb/reqmgr_workload_cache/%s" target="_blank">wfc</a>' % wfn,
                '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/request?requestName=%s" target="_blank">dwkc</a>' % wfn,
                '<a href="https://cmsweb.cern.ch/reqmgr/view/splitting/%s" target="_blank">spl</a>' % wfn,
                '<a href="https://cms-pdmv.cern.ch/stats/?RN=%s" target="_blank">vw</a>' % wfn,
                '<a href="https://cms-pdmv.cern.ch/stats/restapi/get_one/%s" target="_blank">vwo</a>' % wfn,
                '<a href="https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=full&reverse=0&reverse=1&npp=20&subtext=%s&sall=q" target="_blank">elog</a>'
                % pid,
                '<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank">pv</a>' % wfn,
                '<a href="https://cmsweb.cern.ch/reqmgr/reqMgr/outputDatasetsByRequestName/%s" target="_blank">out</a>'
                % wfn,
                '<a href="closeout.html#%s" target="_blank">clo</a>' % wfn,
                '<a href="statuses.html#%s" target="_blank">st</a>' % wfn,
                '<a href="https://cmsweb.cern.ch/couchdb/workloadsummary/_design/WorkloadSummary/_show/histogramByWorkflow/%s" target="_blank">perf</a>'
                % wfn,
            ]
        )
        if within and (not view or wfs == "completed"):
            wl = getWL(wfn)
            dataset = None
            if "InputDataset" in wl:
                dataset = wl["InputDataset"]
            if "Task1" in wl and "InputDataset" in wl["Task1"]:
                dataset = wl["Task1"]["InputDataset"]

            if dataset:
                text += ", ".join(
                    [
                        "",
                        "<a href=https://cmsweb.cern.ch/das/request?input=%s target=_blank>input</a>" % dataset,
                        "<a href=https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions#state=create_since=0;filter=%s target=_blank>sub</a>"
                        % dataset,
                        "<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/subscriptions?dataset=%s&collapse=n target=_blank>ds</a>"
                        % dataset,
                        "<a href=https://cmsweb.cern.ch/phedex/datasvc/xml/prod/blockreplicas?dataset=%s target=_blank>rep</a>"
                        % dataset,
                    ]
                )

        if p:
            wl = getWL(wfn)
            text += ", (%s)" % (wl["RequestPriority"])

        if pid:
            if ms:
                mcm_s = json.loads(
                    os.popen(
                        "curl https://cms-pdmv.cern.ch/mcm/public/restapi/requests/get_status/%s --insecure" % pid
                    ).read()
                )[pid]
                text += ', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm (%s)</a>' % (
                    pid,
                    mcm_s,
                )
            else:
                text += ', <a href="https://cms-pdmv.cern.ch/mcm/requests?prepid=%s" target="_blank">mcm</a>' % (pid)
                text += (
                    ', <a href="https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=%s" target="_blank">ac</a>'
                    % (pid)
                )

        if status:
            if wf.status.startswith("assistance"):
                text += ', <a href="assistance.html#%s" target="_blank">assist</a>' % wfn
            text += " : %s " % (wf.status)

        if view and wfs != "acquired":
            text += (
                '<a href="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" target="_blank"><img src="https://cms-pdmv.web.cern.ch/cms-pdmv/stats/growth/%s.gif" style="height:50px"></a>'
                % (wfn.replace("_", "/"), wfn.replace("_", "/"))
            )
        if ongoing:
            text += (
                '<a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a>'
                % (wfn, wfn)
            )

        if ongoing:
            date1 = time.strftime("%Y-%m-%d+%H:%M", time.gmtime(time.mktime(time.gmtime()) - (15 * 24 * 60 * 60)))
            date2 = time.strftime("%Y-%m-%d+%H:%M", time.gmtime())
            text += (
                '<a href="http://dashb-cms-job.cern.ch/dashboard/templates/web-job2/#table=Jobs&date1=%s&date2=%s&sortby=site&task=wmagent_%s">dashb</a>'
                % (date1, date2, wfn)
            )

        text += "<hr>"
        return text

    def phl(phid):
        text = ", ".join(
            [
                str(phid),
                '<a href="https://cmsweb.cern.ch/phedex/prod/Request::View?request=%s" target="_blank">vw</a>' % phid,
                '<a href="https://cmsweb.cern.ch/phedex/prod/Data::Subscriptions?reqfilter=%s" target="_blank">sub</a>'
                % phid,
            ]
        )
        return text

    def ol(out):
        return '<a href="https://cmsweb.cern.ch/das/request?input=%s" target="_blank"> %s</a>' % (out, out)

    def lap(comment):

        l = time.mktime(time.gmtime())
        spend = l - lap.start
        lap.start = l
        print "Spend %d [s] for %s" % (spend, comment)

    lap.start = time.mktime(time.gmtime())
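    ## lap() keeps its previous timestamp as an attribute on the function
    ## object itself, so each call reports the wall-clock time spent since the
    ## last call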

    ## start to write it
    # html_doc = open('/afs/cern.ch/user/v/vlimant/public/ops/index.html','w')
    html_doc = open("/afs/cern.ch/user/c/cmst2/www/unified/index.html", "w")
    print "Updating the status page ..."

    if not caller:
        try:
            # caller = sys._getframe(1).f_code.co_name
            caller = sys.argv[0].split("/")[-1].replace(".py", "")
            print "caller is"
            print caller
        except Exception as es:
            caller = "none found"
            print "not getting frame"
            print str(es)

    html_doc.write(
        """
<html>
<head>
<META HTTP-EQUIV="refresh" CONTENT="900">
<script type="text/javascript">
 function showhide(id) {
    var e = document.getElementById(id);
    e.style.display = (e.style.display == 'block') ? 'none' : 'block';
 }
</script>
</head>
<body>

Last update on %s(CET), %s(GMT), <a href=logs/ target=_blank>logs</a> <a href=logs/last.log target=_blank>last</a> <a href=statuses.html>statuses</a> <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/ target=_blank>prod mon</a> <a href=https://cmsweb.cern.ch/wmstats/index.html target=_blank>wmstats</a> <a href=http://t3serv001.mit.edu/~cmsprod/IntelROCCS/Detox/SitesInfo.txt target=_blank>detox</a> <a href=locked.html>space</a> <a href=logs/subscribor/last.log target=_blank>blocks</a> <a href=https://twiki.cern.ch/twiki/bin/view/CMSPublic/CompOpsWorkflowL3Responsibilities#Automatic_Assignment_and_Unified>what am I</a> <a href=logs/addHoc/last.log>ad-hoc op</a> created from <b>%s <a href=logs/last_running>last running</a></b><br><br>

"""
        % (time.asctime(time.localtime()), time.asctime(time.gmtime()), caller)
    )

    text = ""
    count = 0
    count_by_campaign = defaultdict(lambda: defaultdict(int))
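    ## count_by_campaign[campaign][priority] -> number of workflows; the nested
    ## defaultdict lets unseen campaigns and priorities start at zero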
    for wf in session.query(Workflow).filter(Workflow.status == "considered").all():
        wl = getWL(wf.name)
        count_by_campaign[wl["Campaign"]][int(wl["RequestPriority"])] += 1
        text += "<li> %s </li> \n" % wfl(wf, p=True)
        count += 1
    text_by_c = ""
    for c in count_by_campaign:
        text_by_c += "<li> %s (%d) : " % (c, sum(count_by_campaign[c].values()))
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c += "%d (%d), " % (p, count_by_campaign[c][p])
        text_by_c += "</li>"

    html_doc.write(
        """
Workflows next to handle (%d) <a href=https://cms-pdmv.cern.ch/mcm/batches?status=new&page=-1 target="_blank"> batches</a> <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/transferor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('considered')">[Click to show/hide]</a>
<br>
<div id="considered" style="display:none;">
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('considered_bywf')">[Click to show/hide]</a><div id="considered_bywf" style="display:none;">
 <ul>
 %s
 </ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('considered_bycamp')">[Click to show/hide]</a><div id="considered_bycamp" style="display:none;">
 <ul>
 %s
 </ul></div>
</ul>
</div>
"""
        % (count, count, text, len(count_by_campaign), text_by_c)
    )

    lap("done with considered")
    text = ""
    count = 0
    count_by_campaign = defaultdict(lambda: defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status == "staging").all():
        wl = getWL(wf.name)
        count_by_campaign[wl["Campaign"]][int(wl["RequestPriority"])] += 1
        text += "<li> %s </li> \n" % wfl(wf, within=True)
        count += 1

    text_by_c = ""
    for c in count_by_campaign:
        text_by_c += "<li> %s (%d) : " % (c, sum(count_by_campaign[c].values()))
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c += "%d (%d), " % (p, count_by_campaign[c][p])
        text_by_c += "</li>"

    html_doc.write(
        """
Workflows waiting in staging (%d) <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staging')">[Click to show/hide]</a>
<br>
<div id="staging" style="display:none;">
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('staging_bywf')">[Click to show/hide]</a><div id="staging_bywf" style="display:none;">                                                                                                                                                                       
 <ul>            
 %s
 </ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('staging_bycamp')">[Click to show/hide]</a><div id="staging_bycamp" style="display:none;">                                                                                                                                                                  
 <ul>                                                                                                                                                                                                                                                                                                                      
 %s                                                                                                                                                                                                                                                                                                                        
 </ul></div>                                                                                                                                                                                                                                                                                                               
</ul>      
</div>
"""
        % (count, count, text, len(count_by_campaign), text_by_c)
    )

    lap("done with staging")

    text = ""
    count = 0
    for ts in session.query(Transfer).all():
        stext = (
            '<li> %s serves </li><a href="javascript:showhide(\'%s\')">[show/hide] relevant workflows</a> <div id="%s" style="display:none;"><ul>'
            % (phl(ts.phedexid), ts.phedexid, ts.phedexid)
        )
        hide = True
        for pid in ts.workflows_id:
            w = session.query(Workflow).get(pid)
            hide &= w.status != "staging"
            if w.status in ["considered", "staging", "staged"]:
                stext += "<li> %s </li>\n" % (wfl(w, status=True))
        stext += "</ul></div>\n"
        if hide:
            # text+="<li> %s not needed anymore to start running (does not mean it went through completely)</li>"%phl(ts.phedexid)
            pass
        else:
            count += 1
            text += stext
    text += "</ul></div>"
    html_doc.write(
        """
Transfer on-going (%d) <a href=https://transferteam.web.cern.ch/transferteam/dashboard/ target=_blank>dashboard</a> <a href=logs/transferor/last.log target=_blank>log</a> <a href=logs/stagor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('transfer')">[Click to show/hide]</a>
<br>
<div id="transfer" style="display:none;">
<br>
<ul>"""
        % count
    )
    html_doc.write(text)

    lap("done with transfers")

    text = ""
    count = 0
    count_by_campaign = defaultdict(lambda: defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status == "staged").all():
        wl = getWL(wf.name)
        count_by_campaign[wl["Campaign"]][int(wl["RequestPriority"])] += 1
        text += "<li> %s </li> \n" % wfl(wf, p=True)
        count += 1
    text_by_c = ""
    for c in count_by_campaign:
        text_by_c += "<li> %s (%d) : " % (c, sum(count_by_campaign[c].values()))
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c += "%d (%d), " % (p, count_by_campaign[c][p])
        text_by_c += "</li>"

    html_doc.write(
        """Worflow ready for assigning (%d) <a href=logs/stagor/last.log target=_blank>log</a> <a href=logs/assignor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('staged')">[Click to show/hide]</a>
<br>
<div id="staged" style="display:none;">
<br>
<ul>
<li> By workflow (%d) </li><a href="javascript:showhide('staged_bywf')">[Click to show/hide]</a><div id="staged_bywf" style="display:none;">                                                                                                                                                                             
 <ul>                                                                                                                                                                                                                                                                                                                      
 %s                                                                                                                                                                                                                                                                                                                        
 </ul></div>                                                                                                                                                                                                                                                                                                               
<li> By campaigns (%d) </li><a href="javascript:showhide('staged_bycamp')">[Click to show/hide]</a><div id="staged_bycamp" style="display:none;">                                                                                                                                                                        
 <ul>                                                                                                                                                                                                                                                                                                                      
 %s                                                                                                                                                                                                                                                                                                                        
 </ul></div>
</ul>
</div>
"""
        % (count, count, text, len(count_by_campaign), text_by_c)
    )

    lap("done with staged")

    lines = []
    count_by_campaign = defaultdict(lambda: defaultdict(int))
    for wf in session.query(Workflow).filter(Workflow.status == "away").all():
        wl = getWL(wf.name)
        count_by_campaign[wl["Campaign"]][int(wl["RequestPriority"])] += 1
        lines.append("<li> %s </li>" % wfl(wf, view=True, ongoing=True))
    text_by_c = ""
    for c in count_by_campaign:
        text_by_c += "<li> %s (%d) : " % (c, sum(count_by_campaign[c].values()))
        for p in sorted(count_by_campaign[c].keys()):
            text_by_c += "%d (%d), " % (p, count_by_campaign[c][p])
        text_by_c += "</li>"

    lines.sort()
    html_doc.write(
        """
Workflows on-going (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests_in_production.php target=_blank>ongoing</a> <a href=https://cms-logbook.cern.ch/elog/Workflow+processing/?mode=summary target=_blank>elog</a> <a href=http://cms-gwmsmon.cern.ch/prodview target=_blank>queues</a> <a href=logs/assignor/last.log target=_blank>log</a> <a href=logs/checkor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('away')">[Click to show/hide]</a>
<br>
<div id="away" style="display:none;">
<ul> 
<li>By workflow (%d) </li>
<a href="javascript:showhide('away_bywf')">[Click to show/hide]</a><div id="away_bywf" style="display:none;">
<ul>
%s
</ul></div>
<li> By campaigns (%d) </li><a href="javascript:showhide('away_bycamp')">[Click to show/hide]</a><div id="away_bycamp" style="display:none;">
<ul>
%s
</ul></div>
</ul>
</div>
"""
        % (len(lines), len(lines), "\n".join(lines), len(count_by_campaign), text_by_c)
    )

    lap("done with away")

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == "assistance").all():
        text += "<li> %s </li> \n" % wfl(wf, view=True, update=True, status=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write(
        """Worflow that are closing (%d)
<a href=closeout.html target=_blank>closeout</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('closing')">[Click to show/hide]</a>
<br>
<div id="closing" style="display:none;">
<br>
<ul>
"""
        % count
    )
    html_doc.write(text)

    lap("done with closing")

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status.startswith("assistance-")).all():
        text += "<li> %s </li> \n" % wfl(wf, view=True, within=True, status=True, update=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write(
        """Worflow which need assistance (%d)
<a href=assistance.html target=_blank>assistance</a> 
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/recoveror/last.log target=_blank>postlog</a>
<a href="javascript:showhide('assistance')">[Click to show/hide]</a>
<br>
<div id="assistance" style="display:none;">
<br>
<ul>
"""
        % count
    )
    html_doc.write(text)

    lap("done with assistance")

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == "close").all():
        text += "<li> %s </li> \n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write(
        """Worflow ready to close (%d)
<a href=logs/checkor/last.log target=_blank>log</a> <a href=logs/closor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('close')">[Click to show/hide]</a>
<br>
<div id="close" style="display:none;">
<br>
<ul>
"""
        % count
    )
    html_doc.write(text)

    lap("done with annoucing")

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == "trouble").all():
        text += "<li> %s </li> \n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write(
        """Worflow with issue (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/injector/last.log target=_blank>postlog</a>
<a href="javascript:showhide('trouble')">[Click to show/hide]</a>
<br>
<div id="trouble" style="display:none;">
<br>
<ul>
"""
        % count
    )
    html_doc.write(text)

    lap("done with trouble")

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == "forget").all():
        text += "<li> %s </li> \n" % wfl(wf)
        count += 1
    text += "</ul></div>\n"
    html_doc.write(
        """
Workflows to forget (%d) <a href=logs/injector/last.log target=_blank>log</a> <a href=logs/outcleanor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('forget')">[Click to show/hide]</a>
<br>
<div id="forget" style="display:none;">
<br>
<ul>
"""
        % count
    )
    html_doc.write(text)

    lap("done with forget")

    text = ""
    count = 0
    for wf in session.query(Workflow).filter(Workflow.status == "done").all():
        text += "<li> %s </li> \n" % wfl(wf)  # ,ms=True)
        count += 1
    text += "</ul></div>\n"
    html_doc.write(
        """
Workflows through (%d) <a href=logs/closor/last.log target=_blank>log</a> <a href=logs/cleanor/last.log target=_blank>postlog</a>
<a href="javascript:showhide('done')">[Click to show/hide]</a>
<br>
<div id="done" style="display:none;">
<br>
<ul>
"""
        % count
    )
    html_doc.write(text)

    lap("done with done")

    wfs = session.query(Workflow).filter(Workflow.status.endswith("-unlock")).all()
    html_doc.write(" Workflows unlocked : %s <br>" % (len(wfs)))
    lap("done with unlocked")

    text = ""
    lines_thisweek = []
    lines_lastweek = []
    now = time.mktime(time.gmtime())
    this_week = int(time.strftime("%W", time.gmtime()))
    start_time_two_weeks_ago = time.mktime(time.strptime("15-0-%d" % (this_week - 2), "%y-%w-%W"))
    for out in session.query(Output).filter(Output.date >= start_time_two_weeks_ago).all():
        if not out.workflow:
            print "This is a problem with", out.datasetname
            continue
        if out.workflow.status in ["done", "clean", "clean-out", "clean-unlock"]:
            out_week = int(time.strftime("%W", time.gmtime(out.date)))
            ##only show current week, and the previous.
            if (this_week - out_week) == 1:
                lines_lastweek.append(
                    "<li>on week %s : %s </li>"
                    % (time.strftime("%W (%x %X)", time.gmtime(out.date)), ol(out.datasetname))
                )
            if (this_week - out_week) == 0:
                lines_thisweek.append(
                    "<li>on week %s : %s </li>"
                    % (time.strftime("%W (%x %X)", time.gmtime(out.date)), ol(out.datasetname))
                )
    lines_thisweek.sort()
    lines_lastweek.sort()

    html_doc.write(
        """Output produced (%d) <a href=https://dmytro.web.cern.ch/dmytro/cmsprodmon/requests.php?in_disagreement=1 target=_blank>disagreements</a>
<a href="javascript:showhide('output')">[Click to show/hide]</a>
<br>
<div id="output" style="display:none;">
<br>
<ul>
<li> Last week (%d) </li><a href="javascript:showhide('output_lastweek')">[Click to show/hide]</a><div id="output_lastweek" style="display:none;"><ul>
%s
</ul></div>
<li> This week (%d) </li><a href="javascript:showhide('output_thisweek')">[Click to show/hide]</a><div id="output_thisweek" style="display:none;"><ul>
%s
</ul></div></div>
"""
        % (
            len(lines_lastweek) + len(lines_thisweek),
            len(lines_lastweek),
            "\n".join(lines_lastweek),
            len(lines_thisweek),
            "\n".join(lines_thisweek),
        )
    )

    lap("done with output")

    html_doc.write(
        """Job installed
<a href="javascript:showhide('acron')">[Click to show/hide]</a>
<br>
<div id="acron" style="display:none;">
<br>
<pre>
%s
</pre>
"""
        % (os.popen("acrontab -l | grep Unified | grep -v \#").read())
    )

    per_module = defaultdict(list)
    for t in filter(None, os.popen("cat /afs/cern.ch/user/c/cmst2/www/unified/logs/*/*.time").read().split("\n")):
        module_name, run_time, spend = t.split(":")
        ## then do what you want with it !
        per_module[module_name].append(int(spend))

    html_doc.write("Module running time<ul>\n")
    for m, spends in per_module.items():
        html_doc.write("<li>%s : last %d [s], avg %d [s]</li>\n" % (m, spends[-1], sum(spends) / float(len(spends))))
    html_doc.write("</ul>")

    html_doc.write(
        "Last running <pre>%s</pre>"
        % (os.popen("tac /afs/cern.ch/user/c/cmst2/www/unified/logs/running | head -5").read())
    )
    html_doc.write("</div>\n")
    lap("done with jobs")

    text = ""
    count = 0
    for (c, info) in campaignInfo().campaigns.items():
        # if 'go' in info and info['go']:
        text += "<li>%s <br> <pre>%s</pre>  </li>" % (c, json.dumps(info, indent=2))
        count += 1

    html_doc.write(
        """Campaign configuration
<a href="javascript:showhide('campaign')">[Click to show/hide]</a>
<br>
<div id="campaign" style="display:none;">
<br>
<ul>
%s
</ul></div>
"""
        % (text)
    )

    text = ""
    count = 0
    n_column = 4
    SI = siteInfo()
    for t in SI.types():
        text += "<li>%s<table border=1>" % t
        c = 0
        for site in getattr(SI, t):
            cpu = SI.cpu_pledges[site] if site in SI.cpu_pledges else "N/A"
            disk = SI.disk[SI.CE_to_SE(site)] if SI.CE_to_SE(site) in SI.disk else "N/A"
            if c == 0:
                text += "<tr>"
            text += (
                '<td><a href=http://dashb-ssb.cern.ch/dashboard/templates/sitePendingRunningJobs.html?site=%s>%s</a><br><a href="http://cms-gwmsmon.cern.ch/prodview/%s" target="_blank"><img src="http://cms-gwmsmon.cern.ch/prodview/graphs/%s/daily" style="height:50px"></a><br>CPU pledge: %s<br>Disk available: %s</td>'
                % (site, site, site, site, cpu, disk)
            )
            if c == n_column:
                c = 0
            else:
                c += 1
        text += "</table></li>"

    lap("done with campaigns")

    open("/afs/cern.ch/user/c/cmst2/www/unified/siteInfo.json", "w").write(
        json.dumps(dict([(t, getattr(SI, t)) for t in SI.types()]), indent=2)
    )

    lap("done with sites json")

    chart_data = defaultdict(list)
    for site in SI.quota:
        chart_data[site].append(
            """
var data_%s = google.visualization.arrayToDataTable([ 
['Overall', 'Space in TB'],
//['Quota' , %s],
['Locked' , %s],
['Free' , %s]
]);
"""
            % (site, SI.quota[site], SI.locked[site], SI.disk[site])
        )
        chart_data[site].append(
            """
var chart_%s = new google.visualization.PieChart(document.getElementById('donutchart_%s'));
chart_%s.draw(data_%s, {title: '%s %s [TB]', pieHole:0.4, slices:{0:{color:'red'},1:{color:'green'}}});
"""
            % (site, site, site, site, site, SI.quota[site])
        )
        chart_data[site].append(
            """
<div id="donutchart_%s" style="height: 200px;"></div>
"""
            % (site)
        )

    ## make the locked/available donut chart
    donut_html = open("/afs/cern.ch/user/c/cmst2/www/unified/locked.html", "w")
    tables = "\n".join([info[0] for site, info in chart_data.items()])
    draws = "\n".join([info[1] for site, info in chart_data.items()])
    divs = "\n".join([info[2] for site, info in chart_data.items()])

    divs_table = "<table border=0>"
    for c, site in enumerate(sorted(chart_data.keys())):
        if c % 6 == 0:
            divs_table += "<tr>"
        divs_table += "<td>%s</td>" % (chart_data[site][2])
    divs_table += "</table>"

    donut_html.write(
        """
<html>
  <head>
    <script type="text/javascript" src="https://www.google.com/jsapi"></script>
    <script type="text/javascript">
      google.load("visualization", "1", {packages:["corechart"]});
      google.setOnLoadCallback(drawChart);
      function drawChart() {
%s

%s
      }
    </script>
  </head>
  <body>
%s
  </body>
</html>
"""
        % (tables, draws, divs_table)
    )
    donut_html.close()

    html_doc.write(
        """Site configuration
<a href="javascript:showhide('site')">[Click to show/hide]</a>
<br>
<div id="site" style="display:none;">
<br>
<ul>
%s
</ul></div>
"""
        % (text)
    )

    lap("done with space")

    UC = unifiedConfiguration()
    text = ""
    for param in UC.configs:
        text += "<li>%s</li><ul>\n" % param
        for sub in sorted(UC.configs[param].keys()):
            text += "<li> %s : %s </li>\n" % (sub, UC.configs[param][sub])
        text += "</ul>\n"

    html_doc.write(
        """Unified configuration
<a href="javascript:showhide('config')">[Click to show/hide]</a>
<br>
<div id="config" style="display:none;">
<br>
<ul>
%s
</ul></div>
"""
        % (text)
    )

    lap("done with configuration")

    print "... done with status page."
    html_doc.write(
        """
</body>
</html>
"""
    )

    html_doc.close()

    html_doc = open("/afs/cern.ch/user/c/cmst2/www/unified/statuses.html", "w")
    html_doc.write(
        """                                                                                                                                                                                                                                                                                                      <html>        
<table border=1>
<thead>
<tr>
<th> workflow </th><th> status </th><th> wm status</th>
</tr>
</thead>
"""
    )
    wfs = {}
    for wfo in session.query(Workflow).all():
        wfs[wfo.name] = (wfo.status, wfo.wm_status)
    open("/afs/cern.ch/user/c/cmst2/www/unified/statuses.json", "w").write(json.dumps(wfs))
    for wfn in sorted(wfs.keys()):
        html_doc.write(
            '<tr><td><a id="%s">%s</a></td><td>%s</td><td>%s</td></tr>\n' % (wfn, wfn, wfs[wfn][0], wfs[wfn][1])
        )
    html_doc.write("</table>")
    html_doc.write("<br>" * 100)
    html_doc.write("end of page</html>")
    html_doc.close()
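
The pages written above call a showhide JavaScript helper for every collapsible <div>, but the helper itself is presumably emitted elsewhere and does not appear in this snippet. A minimal sketch of what it could look like, written from Python like the rest of the page generation; the function name showhide and its toggling behaviour are assumptions based on how the links above use it:

## hedged sketch, assuming a showhide(id) toggler is expected in the page header
SHOWHIDE_JS = """
<script type="text/javascript">
function showhide(id) {
    var e = document.getElementById(id);
    e.style.display = (e.style.display == 'none') ? 'block' : 'none';
}
</script>
"""

def write_page_header(html_doc):
    ## would be written once at the top of the page, before any of the showhide links
    html_doc.write("<html><head>%s</head><body>\n" % SHOWHIDE_JS)
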
Ejemplo n.º 41
0
def completor(url, specific):
    mlock = moduleLock(silent=True)
    if mlock(): return

    use_mcm = True
    up = componentInfo(soft=['mcm', 'wtc', 'jira'])
    if not up.check(): return
    use_mcm = up.status['mcm']
    if use_mcm:
        mcm = McMClient(dev=False)

    safe_mode = False

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    JC = JIRAClient() if up.status.get('jira', False) else None

    wfs = []
    wfs.extend(session.query(Workflow).filter(Workflow.status == 'away').all())
    wfs.extend(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance')).all())

    ## just take it in random order so that not always the same is seen
    random.shuffle(wfs)

    max_per_round = UC.get('max_per_round').get('completor', None)
    if max_per_round and not specific: wfs = wfs[:max_per_round]

    all_stuck = set()
    ## take into account what stagor was saying
    for itry in range(5):
        try:
            all_stuck.update(
                json.loads(eosRead('%s/stuck_transfers.json' %
                                   monitor_pub_dir)))
            break
        except:
            time.sleep(2)

    for itry in range(5):
        try:
            ## take into account the block that needed to be repositioned recently
            all_stuck.update([
                b.split('#')[0] for b in json.loads(
                    eosRead('%s/missing_blocks.json' % monitor_dir))
            ])
            break
        except:
            time.sleep(2)

    ## take into account all stuck block and dataset from transfer team
    all_stuck.update(getAllStuckDataset())

    good_fractions = {}
    overdoing_fractions = {}
    truncate_fractions = {}
    timeout = {}
    campaign_injection_delay = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']
        if 'truncate-complete' in CI.campaigns[c]:
            truncate_fractions[c] = CI.campaigns[c]['truncate-complete']
        if 'force-timeout' in CI.campaigns[c]:
            timeout[c] = CI.campaigns[c]['force-timeout']
        if 'injection-delay' in CI.campaigns[c]:
            campaign_injection_delay[c] = CI.campaigns[c]['injection-delay']
        if 'overdoing-complete' in CI.campaigns[c]:
            overdoing_fractions[c] = CI.campaigns[c]['overdoing-complete']

    long_lasting = {}

    WI = wtcInfo()
    overrides = WI.getForce()
    if use_mcm:
        ## add all workflow that mcm wants to get force completed
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        ## assuming this will be a list of actual prepids
        overrides['mcm'] = mcm_force

    print "can force complete on"
    print json.dumps(good_fractions, indent=2)
    print "can truncate complete on"
    print json.dumps(truncate_fractions, indent=2)
    print "can overide on"
    print json.dumps(overrides, indent=2)
    max_force = UC.get("max_force_complete")
    max_priority = UC.get("max_tail_priority")
    injection_delay_threshold = UC.get("injection_delay_threshold")
    injection_delay_priority = UC.get("injection_delay_priority")
    delay_priority_increase = UC.get("delay_priority_increase")
    default_fraction_overdoing = UC.get('default_fraction_overdoing')

    set_force_complete = set()

    # priority and time above which to fire a JIRA
    jira_priority_and_delays = {
        110000: 21,
        90000: 28,
        #     80000 : 60,
        #0 : 90
    }

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at", wfo.name

        ## get all of the same
        wfi = workflowInfo(url, wfo.name)
        pids = wfi.getPrepIDs()
        skip = False
        campaigns = wfi.getCampaigns()

        #if not any([c in good_fractions.keys() for c in campaigns]): skip=True
        #if not any([c in truncate_fractions.keys() for c in campaigns]): skip=True

        for user, spec in overrides.items():
            if not spec: continue
            spec = filter(None, spec)
            if not wfi.request['RequestStatus'] in [
                    'force-complete', 'completed'
            ]:
                if any(s in wfo.name
                       for s in spec) or (wfo.name in spec) or any(
                           pid in spec for pid in pids) or any(s in pids
                                                               for s in spec):

                    wfi = workflowInfo(url, wfo.name)
                    forceComplete(url, wfi)
                    skip = True
                    wfi.notifyRequestor(
                        "The workflow %s was force completed by request of %s"
                        % (wfo.name, user),
                        do_batch=False)
                    wfi.sendLog(
                        'completor',
                        '%s is asking for %s to be force complete' %
                        (user, wfo.name))
                    break

        if wfo.status.startswith('assistance'): skip = True

        if skip:
            continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        if not wfi.request['RequestStatus'] in [
                'acquired', 'running-open', 'running-closed'
        ]:
            continue

        ## until we can map the output to task ...
        output_per_task = wfi.getOutputPerTask()  ## can use that one, and follow the mapping
        good_fraction_per_out = {}
        good_fraction_nodelay_per_out = {}
        truncate_fraction_per_out = {}
        #allowed_delay_per_out = {}
        for task, outs in output_per_task.items():
            task_campaign = wfi.getCampaignPerTask(task)
            for out in outs:
                good_fraction_per_out[out] = good_fractions.get(
                    task_campaign, 1000.)
                good_fraction_nodelay_per_out[out] = overdoing_fractions.get(
                    task_campaign, default_fraction_overdoing)
                truncate_fraction_per_out[out] = truncate_fractions.get(
                    task_campaign, 1000.)
                #allowed_delay_per_out[out] = timeout.get(task_campaign, 14)

        #print "force at", json.dumps( good_fraction_per_out, indent=2)
        #print "truncate at",json.dumps( truncate_fraction_per_out, indent=2)

        now = time.mktime(time.gmtime()) / (60 * 60 * 24.)

        priority_log = filter(lambda change: change['Priority'] == priority,
                              wfi.request.get('PriorityTransition', []))
        if not priority_log:
            print "\tHas no priority log"
            priority_delay = 0
        else:
            then = max([change['UpdateTime']
                        for change in priority_log]) / (60. * 60. * 24.)
            priority_delay = now - then  ## in days
            print "priority was set to", priority, priority_delay, "[days] ago"

        running_log = filter(
            lambda change: change["Status"] in ["running-open", "running-closed"],
            wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            delay = 0
        else:
            then = max([change['UpdateTime']
                        for change in running_log]) / (60. * 60. * 24.)
            delay = now - then  ## in days

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')
        wfi.sendLog(
            'completor',
            "Requires %.1f [CPUd], running since %.1f [day], last priority change to %d done %.1f [day] ago"
            % (cpuh, delay, priority, priority_delay))
        if priority_delay != 0 and priority_delay < delay:
            ## regardless when it started running, set the delay to when priority was changed last
            delay = priority_delay

        ## this is supposed to be the very initial request date, inherited from clones
        injection_delay = None
        original = wfi
        if 'OriginalRequestName' in original.request:
            ## go up the clone chain
            original = workflowInfo(url,
                                    original.request['OriginalRequestName'])
        injected_log = filter(
            lambda change: change["Status"] in ["assignment-approved"],
            original.request['RequestTransition'])
        if injected_log:
            injected_on = injected_log[-1]['UpdateTime'] / (60. * 60. * 24.)
            injection_delay = now - injected_on

        delay_for_priority_increase = injection_delay
        #delay_for_priority_increase = delay

        (w, d) = divmod(delay, 7)
        print "\t" * int(
            w) + "Running since", delay, "[days] priority=", priority

        pop_a_jira = False
        ping_on_jira = 7 * (24 * 60 * 60)  # 7 days
        for jp, jd in jira_priority_and_delays.items():
            if priority >= jp and delay >= jd: pop_a_jira = True

        if pop_a_jira and JC:
            j, reopened, just_created = JC.create_or_last(
                prepid=wfi.request['PrepID'],
                priority=wfi.request['RequestPriority'],
                label='Late',
                reopen=True)
            last_time = JC.last_time(j)
            since_last_ping = time.mktime(time.gmtime()) - last_time
            if since_last_ping > ping_on_jira or just_created:
                j_comment = "Running since %.1f [days] at priority %d" % (
                    delay, priority)
                JC.comment(j.key, j_comment)

        if delay_for_priority_increase != None and delay_for_priority_increase > injection_delay_threshold and priority >= injection_delay_priority:
            quantized = 5000  ## quantize priority
            tail_cutting_priority = wfi.request['InitialPriority'] + int(
                (delay_priority_increase *
                 (delay_for_priority_increase - injection_delay_threshold) / 7)
                / quantized) * quantized
            tail_cutting_priority += 101  ## to signal it is from this mechanism
            tail_cutting_priority = min(
                400000, tail_cutting_priority)  ## never go above 400k priority
            tail_cutting_priority = max(
                tail_cutting_priority,
                priority)  ## never go below the current value

            if priority < tail_cutting_priority:
                if max_priority:
                    sendLog(
                        'completor',
                        "%s Injected since %s [days] priority=%s, increasing to %s"
                        % (wfo.name, delay_for_priority_increase, priority,
                           tail_cutting_priority),
                        level='critical')
                    wfi.sendLog(
                        'completor',
                        'bumping priority to %d for being injected since %s' %
                        (tail_cutting_priority, delay_for_priority_increase))

                    reqMgrClient.changePriorityWorkflow(
                        url, wfo.name, tail_cutting_priority)
                    max_priority -= 1
                else:
                    sendLog(
                        'completor',
                        "%s Injected since %s [days] priority=%s, would like to increase to %s"
                        % (wfo.name, delay_for_priority_increase, priority,
                           tail_cutting_priority),
                        level='critical')
                    wfi.sendLog(
                        'completor',
                        'would like to bump priority to %d for being injected since %s'
                        % (tail_cutting_priority, delay_for_priority_increase))

                    print "Could be changing the priority to higher value, but too many already were done"

        _, prim, _, _ = wfi.getIO()
        is_stuck = all_stuck & prim
        if is_stuck:
            wfi.sendLog('completor', '%s is stuck' % ','.join(is_stuck))

        monitor_delay = 7
        allowed_delay = max([timeout.get(c, 14) for c in campaigns])

        monitor_delay = min(monitor_delay, allowed_delay)

        ### just skip if too early, just for the sake of not computing the completion fraction just now.
        # maybe this is fast enough that we can do it for all
        if delay <= monitor_delay:
            print "not enough time has passed yet"
            continue

        long_lasting[wfo.name] = {
            "delay": delay,
            "injection_delay": injection_delay
        }

        percent_completions = wfi.getCompletionFraction(caller='completor')

        if not percent_completions:
            sendLog('completor',
                    '%s has no output at all' % wfo.name,
                    level='critical')
            continue

        is_over_allowed_delay = (all([
            percent_completions[out] >= good_fraction_per_out.get(out, 1000.)
            for out in percent_completions
        ]) and delay >= allowed_delay)
        is_over_truncation_delay = (is_stuck and (all([
            percent_completions[out] >= truncate_fraction_per_out.get(
                out, 1000.) for out in percent_completions
        ])) and delay >= allowed_delay)
        is_over_completion = (all([
            percent_completions[out] >= good_fraction_nodelay_per_out.get(
                out, 1000.) for out in percent_completions
        ]))

        if is_over_completion:
            wfi.sendLog(
                'completor', "all is over completed %s\n %s" %
                (json.dumps(good_fraction_nodelay_per_out, indent=2),
                 json.dumps(percent_completions, indent=2)))
        elif is_over_allowed_delay:
            wfi.sendLog(
                'completor', "all is above %s \n%s" %
                (json.dumps(good_fraction_per_out, indent=2),
                 json.dumps(percent_completions, indent=2)))
        elif is_over_truncation_delay:
            wfi.sendLog(
                'completor',
                "all is above %s truncation level, and the input is stuck\n%s"
                % (json.dumps(truncate_fraction_per_out, indent=2),
                   json.dumps(percent_completions, indent=2)))

        else:
            long_lasting[wfo.name].update({
                'completion':
                sum(percent_completions.values()) / len(percent_completions),
                'completions':
                percent_completions
            })

            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            wfi.sendLog(
                'completor',
                "%s not over bound \ncomplete at %s \n truncate at %s \nRunning %s"
                % (json.dumps(percent_completions, indent=2),
                   json.dumps(good_fraction_per_out, indent=2),
                   json.dumps(truncate_fraction_per_out, indent=2),
                   json.dumps(long_lasting[wfo.name]['agents'], indent=2)))
            continue

        #for output in  percent_completions:
        #    completions[output]['injected'] = then

        ran_at = wfi.request['SiteWhitelist']

        wfi.sendLog('completor', "Required %s, time spend %s" % (cpuh, delay))

        ##### WILL FORCE COMPLETE BELOW
        # only really force complete after n days

        ## find ACDCs that might be running
        if max_force > 0:
            print "going for force-complete of", wfo.name
            if not safe_mode:
                forceComplete(url, wfi)
                set_force_complete.add(wfo.name)
                wfi.sendLog('completor', 'going for force completing')
                wfi.notifyRequestor(
                    "The workflow %s was force completed for running too long"
                    % wfo.name)
                max_force -= 1
            else:
                sendEmail(
                    'completor',
                    'The workflow %s is ready for force complete, but completor is in safe mode'
                    % wfo.name)
        else:
            wfi.sendLog(
                'completor',
                "too many completion this round, cannot force complete")

    if set_force_complete:
        sendLog(
            'completor', 'The following were set force-complete \n%s' %
            ('\n'.join(set_force_complete)))

    #open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2))
    text = "These have been running for long"

    #open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 ))
    eosFile('%s/longlasting.json' % monitor_dir,
            'w').write(json.dumps(long_lasting, indent=2)).close()

    for wf, info in sorted(long_lasting.items(),
                           key=lambda tp: tp[1]['delay'],
                           reverse=True):
        delay = info['delay']
        text += "\n %s : %s days" % (wf, delay)
        if 'completion' in info:
            text += " %d%%" % (info['completion'] * 100)

    print text
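
The force-complete decision above combines three thresholds per output dataset: a good fraction (force after the allowed delay), a truncate fraction (force when the input is stuck), and an overdoing fraction (force regardless of delay). A minimal sketch distilling that logic, with illustrative names that are not part of the original module:

## hedged sketch of the three completion tests used in completor
def completion_verdict(percent_completions, good_frac, truncate_frac, overdoing_frac,
                       delay, allowed_delay, is_stuck):
    over_good = all(percent_completions[o] >= good_frac.get(o, 1000.) for o in percent_completions)
    over_trunc = all(percent_completions[o] >= truncate_frac.get(o, 1000.) for o in percent_completions)
    over_doing = all(percent_completions[o] >= overdoing_frac.get(o, 1000.) for o in percent_completions)
    if over_doing:
        return 'force-complete'  ## over-complete: no delay requirement
    if over_good and delay >= allowed_delay:
        return 'force-complete'  ## complete enough and running long enough
    if is_stuck and over_trunc and delay >= allowed_delay:
        return 'force-complete'  ## input stuck: accept the truncated fraction
    return 'keep-running'
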
Ejemplo n.º 42
0
def assignor(url ,specific = None, talk=True, options=None):
    CI = campaignInfo()
    SI = siteInfo()
    wfos=[]
    if specific:
        wfos = session.query(Workflow).filter(Workflow.name==specific).all()
    if not wfos:
        if specific:
            wfos = session.query(Workflow).filter(Workflow.status=='considered').all()
            wfos.extend( session.query(Workflow).filter(Workflow.status=='staging').all())
        wfos.extend(session.query(Workflow).filter(Workflow.status=='staged').all())

    for wfo in wfos:
        if specific:
            if not any(map(lambda sp: sp in wfo.name,specific.split(','))): continue
            #if not specific in wfo.name: continue
        print wfo.name,"to be assigned"
        wfh = workflowInfo( url, wfo.name)
        #wl = getWorkLoad(url, wfo.name )

        if not CI.go( wfh.request['Campaign'] ):
            print "No go for",wfh.request['Campaign']
            continue

        injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.)
        now = time.mktime(time.gmtime()) / (60.*60.)
        if float(now - injection_time) < 4.:
            print "It is too soon to inject: %3.2fH remaining"%(now - injection_time)
            if not options.test:
                continue

        #grace_period = 4 #days
        #if float(now - injection_time) > grace_period*24.:
        #    print "it has been",grace_period,"need to do something"
        #    options.restrict = True

        #else:
        #    print now,injection_time,now - injection_time

        #print wl
        if wfh.request['RequestStatus'] !='assignment-approved':
            print wfo.name,wfh.request['RequestStatus'],"skipping"
            if not options.test:
                continue

        version=wfh.getNextVersion()

        (lheinput,primary,parent,secondary) = wfh.getIO()
        sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )
        sites_custodial = list(set(itertools.chain.from_iterable([findCustodialLocation(url, prim) for prim in primary])))
        sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]
        if len(sites_custodial)==0:
            sites_custodial = [SI.pick_SE()]
            print "picked",sites_custodial," as custodial for",wfo.name

        if len(sites_custodial)>1:
            print "more than one custodial for",wfo.name
            sys.exit(36)

        sites_with_data = copy.deepcopy( sites_allowed )
        for prim in list(primary)+list(secondary):
            presence = getDatasetPresence( url, prim )
            if talk:
                print prim,presence
            sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
        sites_with_data = list(set(sites_with_data))

        if options.restrict:
            if talk:
                print sites_allowed
            sites_allowed = sites_with_data
        else:
            if set(sites_with_data) != set(sites_allowed):
                ## the data is not everywhere we wanted to run at : enable aaa
                print "Resorting to AAA reading for",list(set(sites_allowed) - set(sites_with_data))
                #options.useSiteListAsLocation = True
                print "Not commissioned yet"
                continue
                
        if not len(sites_allowed):
            print wfo.name,"cannot be assign with no matched sites"
            continue

        parameters={
            'SiteWhitelist' : sites_allowed,
            'CustodialSites' : sites_custodial,
            'NonCustodialSites' : sites_out,
            'AutoApproveSubscriptionSites' : list(set(sites_out+sites_custodial)),
            'AcquisitionEra' : wfh.acquisitionEra(),
            'ProcessingString' : wfh.processingString(),
            'MergedLFNBase' : '/store/mc', ## to be figured out ! from Hi shit
            'ProcessingVersion' : version,
            }

        ##parse options entered in command line if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v=getattr(options,key)
                if v!=None:
                    if ',' in v: parameters[key] = filter(None,v.split(','))
                    else: parameters[key] = v

        ## take care of a few exceptions
        if (wfh.request['Memory']*1000) > 3000000:
            parameters['MaxRSS'] = 4000000

        ## pick up campaign specific assignment parameters
        parameters.update( CI.parameters(wfh.request['Campaign']) )

        if not options.test:
            parameters['execute'] = True

        if not wfh.checkWorkflowSplitting():
            ## needs to go to event based ? fail for now
            print "Falling back to event splitting ?"
            parameters['SplittingAlgorithm'] = 'EventBased'

        ## plain assignment here
        result = reqMgrClient.assignWorkflow(url, wfo.name, 'production', parameters)

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
            else:
                print "ERROR could not assign",wfo.name
        else:
            pass
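
Both assignor above and transferor below apply the same four-hour grace period after injection, parsing the RequestDate list into an epoch time. A small sketch of that computation, assuming RequestDate is the usual [year, month, day, hour, minute, second] list:

import time

## hedged helper: hours elapsed since a request was injected
def hours_since_injection(request_date):
    injected = time.mktime(time.strptime('.'.join(map(str, request_date)), "%Y.%m.%d.%H.%M.%S"))
    return (time.mktime(time.gmtime()) - injected) / (60. * 60.)

## usage, mirroring the checks in assignor and transferor:
# if hours_since_injection(wfh.request['RequestDate']) < 4.: continue
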
Ejemplo n.º 43
0
spec = None
if len(sys.argv) > 1:
    spec = sys.argv[1]

url = 'cmsweb.cern.ch'

wfs = getWorkflows(url, 'acquired', details=True)
wfs.extend( getWorkflows(url, 'running-open', details=True) )
wfs.extend( getWorkflows(url, 'running-closed', details=True) )

jobs_for = defaultdict(lambda : defaultdict(int))
wf_for = defaultdict(lambda : defaultdict(set))
agent_for = defaultdict(lambda : defaultdict(set))
s_block_locations = {}
block_locations = defaultdict(lambda : defaultdict(list))
wfs_no_location_in_GQ = defaultdict(list)
si = siteInfo()  
#bad_blocks = defaultdict( set )
unprocessable = set()

not_runable_acdc=set()
agents_down = defaultdict(set)
failed_workflow = set()
files_locations = {}
stuck_all_done = set()
heavy_duty = {}

for wf in wfs:
    if spec and not spec in wf['RequestName']: continue

    wfi = workflowInfo(url, wf['RequestName'], request=wf)
    sitewhitelist = wfi.request['SiteWhitelist']
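
The snippet above prepares several nested defaultdicts so that per-site counters and sets can be filled without initializing keys first. A minimal illustration of the pattern, with made-up site and workflow names:

from collections import defaultdict

jobs_for = defaultdict(lambda: defaultdict(int))
wf_for = defaultdict(lambda: defaultdict(set))

jobs_for['T1_US_FNAL']['Running'] += 42               ## inner dict springs into existence
wf_for['T1_US_FNAL']['Running'].add('some_workflow')  ## same for the set-valued variant
print dict((site, dict(counts)) for site, counts in jobs_for.items())
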
Ejemplo n.º 44
0
def transferor(url ,specific = None, talk=True, options=None):
    if userLock('transferor'):   return

    if options and options.test:
        execute = False
    else:
        execute = True

    SI = siteInfo()
    CI = campaignInfo()
    mcm = McMClient(dev=False)
    dss = DSS()

    print "counting all being handled..."
    being_handled = len(session.query(Workflow).filter(Workflow.status == 'away').all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('stag')).all())
    being_handled += len(session.query(Workflow).filter(Workflow.status.startswith('assistance')).all())
    max_to_handle = options.maxworkflows
    allowed_to_handle = max(0,max_to_handle - being_handled)
    wf_buffer = 5
    if allowed_to_handle<=wf_buffer: ## buffer for having several wf per transfer
        print "Not allowed to run more than",max_to_handle,"at a time. Currently",being_handled,"and",wf_buffer,"buffer"
    else:
        print being_handled,"already being handled",max_to_handle,"max allowed,",allowed_to_handle,"remaining","and",wf_buffer,"buffer"
    print "... done"

    all_transfers=defaultdict(list)
    workflow_dependencies = defaultdict(set) ## list of wf.id per input dataset
    wfs_and_wfh=[]
    print "getting all wf to consider ..."
    cache = getWorkflows(url, 'assignment-approved', details=True)
    for wfo in session.query(Workflow).filter(Workflow.status=='considered').all():
        if specific and not specific in wfo.name: continue
        cache_r =filter(lambda d:d['RequestName']==wfo.name, cache)
        if len(cache_r):
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False, request = cache_r[0]) ) )
        else:
            wfs_and_wfh.append( (wfo, workflowInfo( url, wfo.name, spec=False) ) )
    print "... done"

    input_sizes = {}
    ## list the size of those in transfer already
    in_transfer_priority=0
    min_transfer_priority=100000000
    print "getting all wf in staging ..."
    for wfo in session.query(Workflow).filter(Workflow.status=='staging').all():
        wfh = workflowInfo( url, wfo.name, spec=False)
        (_,primary,_,_) = wfh.getIO()
        for prim in primary: 
            input_sizes[prim] = dss.get( prim )
        in_transfer_priority = max(in_transfer_priority, int(wfh.request['RequestPriority']))
        min_transfer_priority = min(min_transfer_priority, int(wfh.request['RequestPriority']))
    print "... done"
    print "Max priority in transfer already",in_transfer_priority
    print "Min priority in transfer already",min_transfer_priority
    in_transfer_already = sum(input_sizes.values())


    #sort by priority higher first
    wfs_and_wfh.sort(cmp = lambda i,j : cmp(int(i[1].request['RequestPriority']),int(j[1].request['RequestPriority']) ), reverse=True)
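    ## note: the cmp= keyword exists only in Python 2; a key-based equivalent would be
    ## wfs_and_wfh.sort(key=lambda t: int(t[1].request['RequestPriority']), reverse=True)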


    ## list the size of all inputs
    print "getting all input sizes ..."
    for (wfo,wfh) in wfs_and_wfh:
        (_,primary,_,_) = wfh.getIO()
        for prim in primary:
            input_sizes[prim] = dss.get( prim )
    print "... done"

    grand_total =  sum(input_sizes.values()) 
    to_transfer = grand_total  - in_transfer_already
    grand_transfer_limit = options.maxtransfer 
    transfer_limit = grand_transfer_limit - in_transfer_already
    print "%15.4f GB already being transfered"%in_transfer_already
    print "%15.4f GB is the current requested transfer load"%to_transfer
    print "%15.4f GB is the global transfer limit"%grand_transfer_limit
    print "%15.4f GB is the available limit"%transfer_limit

    # the max priority value per dataset.
    max_priority = defaultdict(int)
    needs_transfer=0 ## so that we can count'em
    passing_along = 0
    transfer_sizes={}
    went_over_budget=False
    for (wfo,wfh) in wfs_and_wfh:
        print wfh.request['RequestPriority']
        print wfo.name,"to be transfered"
        #wfh = workflowInfo( url, wfo.name)

        (_,primary,_,_) = wfh.getIO()
        this_load=sum([input_sizes[prim] for prim in primary])
        if ( this_load and (sum(transfer_sizes.values())+this_load > transfer_limit or went_over_budget ) ):
            if went_over_budget:
                print "Transfer has gone over bubget."
            else:
                print "Transfer will go over bubget."
            print "%15.4f GB this load"%this_load
            print "%15.4f GB already this round"%sum(transfer_sizes.values())
            print "%15.4f GB is the available limit"%transfer_limit
            went_over_budget=True
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over budget"
            else:
                if not options.go: 
                    print min_transfer_priority,"minimum priority",wfh.request['RequestPriority'],"<",in_transfer_priority,"stop"
                    continue


        ## throttle by campaign go
        if not CI.go( wfh.request['Campaign'] ):
            print "No go for",wfh.request['Campaign']
            if not options.go: continue

        ## check if the batch is announced
        announced=False
        is_real=False
        for b in mcm.getA('batches',query='contains=%s'% wfo.name):
            is_real = True
            if b['status']=='announced': 
                announced=True 
                break

        if not announced:
            print wfo.name,"does not look announced."# skipping?, rejecting?, reporting?"
            
        if not is_real:
            print wfo.name,"does not appear to be genuine."
            ## prevent any duplication. if the wf is not mentioned in any batch, regardless of status
            continue

        ## check on a grace period
        injection_time = time.mktime(time.strptime('.'.join(map(str,wfh.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.)
        now = time.mktime(time.gmtime()) / (60.*60.)
        if float(now - injection_time) < 4.:
            if not options.go and not announced: 
                print "It is too soon to start transfer: %3.2fH remaining"%(now - injection_time)
                continue


        passing_along += 1
        if passing_along >= allowed_to_handle:
            if int(wfh.request['RequestPriority']) >= in_transfer_priority and min_transfer_priority!=in_transfer_priority:
                print "Higher priority sample",wfh.request['RequestPriority'],">=",in_transfer_priority,"go-on over",max_to_handle
            else:
                print "Not allowed to pass more than",max_to_handle,"at a time. Currently",being_handled,"handled, and adding",passing_along
                break

        (lheinput,primary,parent,secondary) = wfh.getIO()
        if options and options.tosites:
            sites_allowed = options.tosites.split(',')
        else:
            sites_allowed = getSiteWhiteList( (lheinput,primary,parent,secondary) )

        if 'SiteWhitelist' in CI.parameters(wfh.request['Campaign']):
            sites_allowed = CI.parameters(wfh.request['Campaign'])['SiteWhitelist']

        blocks = []
        if 'BlockWhitelist' in wfh.request and wfh.request['BlockWhitelist']:
            blocks = wfh.request['BlockWhitelist']

        can_go = True
        staging=False
        if primary:
            if talk:
                print wfo.name,'reads',', '.join(primary),'in primary'
            ## chop up the primary dataset
            for prim in primary:
                max_priority[prim] = max(max_priority[prim],int(wfh.request['RequestPriority']))
                sites_really_allowed = [site for site in sites_allowed if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                print "Sites allowed minus the vetoed transfer"
                print sites_really_allowed
                copies_needed = int(0.35*len(sites_really_allowed))+1 ## should just go for a fixed number if the white list grows that big
                print "Would make",copies_needed,"copies"
                if options.maxcopy>0:
                    copies_needed = min(options.maxcopy,copies_needed)

                ## remove the sites that do not want transfers                
                print "need",copies_needed
                workflow_dependencies[prim].add( wfo.id )
                presence = getDatasetPresence( url, prim )
                prim_location = [site for site,pres in presence.items() if pres[0]==True]
                if len(prim_location) >= copies_needed:
                    print "The output is all fully in place at",len(prim_location),"sites"
                    continue
                # reduce the number of copies required by existing full copies
                copies_needed = max(0,copies_needed - len(prim_location))
                print "now need",copies_needed
                subscriptions = listSubscriptions( url , prim )
                prim_destination = list(set([site for (site,(tid,decision)) in subscriptions.items() if decision and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])]))
                ## need to reject from that list the ones with a full copy already: i.e the transfer corresponds to the copy in place
                prim_destination = [site for site in prim_destination if not site in prim_location]
                ## add transfer dependencies
                latching_on_transfers =  list(set([ tid for (site,(tid,decision)) in subscriptions.items() if decision and site in prim_destination and not any([site.endswith(veto) for veto in ['MSS','Export','Buffer']])]))
                print latching_on_transfers
                for latching in latching_on_transfers:
                    tfo = session.query(Transfer).filter(Transfer.phedexid == latching).first()
                    if not tfo:
                        tfo = Transfer( phedexid = latching)
                        tfo.workflows_id = []
                        session.add(tfo)
                            
                    if not wfo.id in tfo.workflows_id:
                        print "adding",wfo.id,"to",tfo.id,"with phedexid",latching
                        l = copy.deepcopy( tfo.workflows_id )
                        l.append( wfo.id )
                        tfo.workflows_id = l
                    if not options.test:
                        session.commit()
                    else:
                        session.flush() ## regardless of commit later on, we need to let the next wf feeding on this transfer to see it in query
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    staging = True

                # reduce the number of copies required by the on-going full transfer : how do we bootstrap on waiting for them ??
                copies_needed = max(0,copies_needed - len(prim_destination))
                print "then need",copies_needed
                if copies_needed == 0:
                    print "The output is either fully in place or getting in full somewhere with",latching_on_transfers
                    can_go = True
                    continue
                prim_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in prim_location])]
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in prim_destination])]
                ## take out the ones that cannot receive transfers
                prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                if len(prim_to_distribute)>0: ## maybe that a parameter we can play with to limit the 
                    if not options or options.chop:
                        spreading = distributeToSites( getDatasetChops(prim), prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges)
                    else:
                        spreading = {} 
                        for site in prim_to_distribute: spreading[site]=[prim]
                    can_go = False
                    transfer_sizes[prim] = input_sizes[prim]
                    for (site,items) in spreading.items():
                        all_transfers[site].extend( items )




        if secondary:
            if talk:
                print wfo.name,'reads',', '.join(secondary),'in secondary'
            for sec in secondary:
                workflow_dependencies[sec].add( wfo.id )
                presence = getDatasetPresence( url, sec )
                sec_location = [site for site,pres in presence.items() if pres[1]>90.] ## more than 90% of the minbias at sites
                subscriptions = listSubscriptions( url ,sec )
                sec_destination = [site for site in subscriptions] 
                sec_to_distribute = [site for site in sites_allowed if not any([osite.startswith(site) for osite in sec_location])]
                sec_to_distribute = [site for site in sec_to_distribute if not any([osite.startswith(site) for osite in sec_destination])]
                sec_to_distribute = [site for site in sec_to_distribute if not  any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
                if len( sec_to_distribute )>0:
                    for site in sec_to_distribute:
                        all_transfers[site].append( sec )
                        can_go = False
        
        ## is that possible to do something more
        if can_go:
            ## no explicit transfer required this time
            if staging:
                ## but using existing ones
                print wfo.name,"latches on existing transfers, and nothing else"
                wfo.status = 'staging'
            else:
                print wfo.name,"should just be assigned NOW to",sites_allowed
                wfo.status = 'staged'
            print "setting status to",wfo.status
            session.commit()
            continue
        else:
            ## there is an explicit transfer required
            if staging:
                ## and also using an existing one
                print wfo.name,"latches on existing transfers"
                if not options.test:
                    wfo.status = 'staging'
                    print "setting status to",wfo.status
                    session.commit()
            print wfo.name,"needs a transfer"
            needs_transfer+=1

    #print json.dumps(all_transfers)
    fake_id=-1
    wf_id_in_prestaging=set()

    for (site,items_to_transfer) in all_transfers.iteritems():
        items_to_transfer = list(set(items_to_transfer))
        ## convert to storage element
        site_se = SI.CE_to_SE(site)

        ## site that do not want input datasets
        if site in SI.sites_veto_transfer: 
            print site,"does not want transfers"
            continue

        ## throttle the transfer size to T2s ? we'd be screwed by a noPU sample properly configured.

        ## massage a bit the items
        blocks = [it for it in items_to_transfer if '#' in it]
        datasets = [it for it in items_to_transfer if not '#' in it]

        if execute:
            print "Making a replica to",site,"(CE)",site_se,"(SE) for"
        else:
            print "Would make a replica to",site,"(CE)",site_se,"(SE) for"

        print "\t",len(blocks),"blocks"
        ## remove blocks if full dataset is send out
        blocks = [block for block in blocks if not block.split('#')[0] in datasets]
        print "\t",len(blocks),"needed blocks for",list(set([block.split('#')[0] for block in blocks]))
        print "\t",len(datasets),"datasets"
        print "\t",datasets
        items_to_transfer = blocks + datasets

        ## operate the transfer
        if options and options.stop:
            ## ask to move-on
            answer = raw_input('Continue with that ?')
            if not answer.lower() in ['y','yes','go']:
                continue

        if execute:
            result = makeReplicaRequest(url, site_se, items_to_transfer, 'prestaging', priority='normal')
            ## make use of max_priority dataset:priority to set the subscriptions priority
            """
            ## does not function
            once = True
            for item in items_to_transfer:
                bds = item.split('#')[0]
                if max_priority[bds] >= 90000:
                    if once:
                        w=10
                        print "waiting",w,"s before raising priority"
                        time.sleep(w)
                        once=False
                    ## raise it to high priority
                    print item,"subscription priority raised to high at",site_se
                    #print "This does not work yet properly it seems"
                    print updateSubscription(url, site_se, item, priority='high')
            """
        else:
            #result= {'phedex':{'request_created' : [{'id' : fake_id}]}}
            result= {'phedex':{'request_created' : []}}
            fake_id-=1



        if not result:
            print "ERROR Could not make a replica request for",site,items_to_transfer,"pre-staging"
            continue
        for phedexid in [o['id'] for o in result['phedex']['request_created']]:
            new_transfer = session.query(Transfer).filter(Transfer.phedexid == phedexid).first()
            print phedexid,"transfer created"
            if not new_transfer:
                new_transfer = Transfer( phedexid = phedexid)
                session.add( new_transfer )                
            new_transfer.workflows_id = set()
            for transfering in list(set(map(lambda it : it.split('#')[0], items_to_transfer))):
                new_transfer.workflows_id.update( workflow_dependencies[transfering] )
            new_transfer.workflows_id = list(new_transfer.workflows_id)
            wf_id_in_prestaging.update(new_transfer.workflows_id)
            session.commit()
            ## auto approve it
            if execute:
                approved = approveSubscription(url, phedexid, [site_se])

    for wfid in wf_id_in_prestaging:
        tr_wf = session.query(Workflow).get(wfid)
        if tr_wf and tr_wf.status!='staging':
            if execute:
                tr_wf.status = 'staging'
                if talk:
                    print "setting",tr_wf.name,"to staging"
        session.commit()
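
Before placing the replica request, transferor de-duplicates the item list and drops any block whose whole parent dataset is being shipped anyway. That massaging step, factored into a small standalone sketch:

## hedged sketch of the block/dataset massaging done before makeReplicaRequest
def massage_items(items_to_transfer):
    items = list(set(items_to_transfer))
    blocks = [it for it in items if '#' in it]
    datasets = [it for it in items if not '#' in it]
    ## a block is redundant if its full dataset is transferred anyway
    blocks = [b for b in blocks if not b.split('#')[0] in datasets]
    return blocks + datasets

print massage_items(['/A/B/RAW#123', '/A/B/RAW', '/C/D/AOD#9'])
## -> ['/C/D/AOD#9', '/A/B/RAW'] (ordering within each group is arbitrary after the set pass)
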
Ejemplo n.º 45
0
print time.asctime(time.gmtime())

if sys.argv[1] == 'parse':
    force = False
    if len(sys.argv) > 2:
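        ## note: bool() of any non-empty string is True, so passing any second argument enables force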
        force = bool(sys.argv[2])
    locks = json.loads(open('%s/globallocks.json' % monitor_pub_dir).read())
    #waiting = json.loads(open('%s/waiting_custodial.json'%monitor_dir).read())
    #stuck = json.loads(open('%s/stuck_custodial.json'%monitor_pub_dir).read())
    #missing = json.loads(open('%s/missing_approval_custodial.json'%monitor_dir).read())

    waiting = {}
    stuck = {}
    missing = {}
    si = siteInfo()
    remainings = {}
    sis = si.disk.keys()
    random.shuffle(sis)
    for site in sis:
        space = si.disk[site]
        if space:
            continue
        print site, "has no disk space left"
        #if os.path.isfile('remaining_%s.json'%site) and not force:
        #    print site,"accounted for"
        #    continue

        remainings[site] = {}

        print site, "has", space, "[TB] left out of", si.quota[site]
Ejemplo n.º 46
0
def equalizor(url , specific = None, options=None):
    up = componentInfo(mcm=False, soft=['mcm']) 
    if not up.check(): return 

    if not specific:
        workflows = getWorkflows(url, status='running-closed', details=True)
        workflows.extend(getWorkflows(url, status='running-open', details=True))

    ## start from scratch
    modifications = defaultdict(dict)
    ## define regionality site => fallback allowed. feed on an ssb metric ??
    mapping = defaultdict(list)
    reversed_mapping = defaultdict(list)
    regions = defaultdict(list)
    SI = siteInfo()
    for site in SI.sites_ready:
        region = site.split('_')[1]
        if not region in ['US','DE','IT']: continue
        regions[region] = [region] 

    def site_in_depletion(s):
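        ## note: the unconditional return below short-circuits this check; the pressure logic after it is dead code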
        return True
        if s in SI.sites_pressure:
            (m, r, pressure) = SI.sites_pressure[s]
            if float(m) < float(r):
                print s,m,r,"lacking pressure"
                return True
            else:
                print s,m,r,"pressure"
                pass
                
        return False

    for site in SI.sites_ready:
        region = site.split('_')[1]
        ## fallback to the region, to site with on-going low pressure
        mapping[site] = [fb for fb in SI.sites_ready if any([('_%s_'%(reg) in fb and fb!=site and site_in_depletion(fb))for reg in regions[region]]) ]
    

    use_T0 = False
    if options.augment : use_T0 = True

    use_HLT = False
    if options.augment : use_HLT=True

    if use_HLT:
        mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT')
    #mapping['T2_IT_Legnaro'].append('T1_IT_CNAF')
    for reg in ['IT','DE','UK']:
        mapping['T2_CH_CERN'].extend([fb for fb in SI.sites_ready if '_%s_'%reg in fb])

    for site,fallbacks in mapping.items():
        for fb in fallbacks:
            reversed_mapping[fb].append(site)

    ## this is the fallback mapping
    print json.dumps( mapping, indent=2)
    #print json.dumps( reversed_mapping, indent=2)

    altered_tasks = set()

    def running_idle( wfi , task_name):
        gmon = wfi.getGlideMon()
        #print gmon
        if not gmon: return (0,0)
        if not task_name in gmon: return (0,0)
        return (gmon[task_name]['Running'], gmon[task_name]['Idle'])

    def needs_action( wfi, task, min_idled = 100, pressure = 0.2):
        task_name = task.pathName.split('/')[-1]
        running, idled = running_idle( wfi, task_name)
        go = True
        if not idled and not running : 
            go = False
        if idled < min_idled:
            go = False
        if (not running and idled) or (running and (idled / float(running) > pressure)):
            go = True
        else:
            go = False
        return go, task_name, running, idled

    def getcampaign( task ):
        taskname = task.pathName.split('/')[-1]
        if hasattr( task, 'prepID'):
            return task.prepID.split('-')[1]
        elif taskname.count('-')>=1:
            return taskname.split('-')[1]
        else:
            return None

    def close( interface ):
        open('%s/equalizor.json.new'%monitor_dir,'w').write( json.dumps( interface, indent=2))
        os.system('mv %s/equalizor.json.new %s/equalizor.json'%(monitor_dir,monitor_dir))
        os.system('cp %s/equalizor.json %s/logs/equalizor/equalizor.%s.json'%(monitor_dir,monitor_dir,time.mktime(time.gmtime())))

    interface = {
        'reversed_mapping' : reversed_mapping,
        'modifications' : {}
        }
    if options.augment or options.remove:
        interface['modifications'] = json.loads( open('%s/equalizor.json'%monitor_dir).read())['modifications']

    if options.remove:
        if specific in interface['modifications']:
            print "poping",specific
            interface['modifications'].pop(specific)
            close( interface )
        return 


    PU_locations = {}
    PU_overflow = {
        #'RunIISpring15PrePremix' : { 
        #    'sites' : ["T1_US_FNAL", "T1_DE_KIT" , "T1_IT_CNAF", "T1_RU_JINR" ,"T2_CH_CERN"],
        #    'max' : 20000,
        #    'pending' : 0
        #    },
        'RunIIFall15DR76' : {
            'sites':['T1_ES_PIC','T2_US_Purdue','T2_UK_SGrid_RALPP','T2_BE_IIHE','T2_DE_DESY','T2_IT_Legnaro','T2_US_Caltech','T1_DE_KIT',
                     'T2_UK_London_Brunel','T2_IT_Pisa',
                     'T1_US_FNAL',
                     'T2_IT_Rome','T2_US_Florida','T1_IT_CNAF','T1_RU_JINR','T2_UK_London_IC','T2_US_Nebraska','T2_FR_CCIN2P3','T2_US_UCSD','T2_ES_CIEMAT',
                     'T1_FR_CCIN2P3','T2_US_Wisconsin','T2_US_MIT','T2_DE_RWTH',
                     'T1_UK_RAL','T2_US_Vanderbilt','T2_CH_CERN'],
            'max': 20000,
            'pending' : 0},
        'RunIISpring16DR80' : {
            'sites':['T1_ES_PIC','T2_US_Purdue','T2_UK_SGrid_RALPP','T2_BE_IIHE','T2_DE_DESY','T2_IT_Legnaro','T2_US_Caltech','T1_DE_KIT',
                     'T2_UK_London_Brunel','T2_IT_Pisa',
                     'T1_US_FNAL',
                     'T2_IT_Rome','T2_US_Florida','T1_IT_CNAF','T1_RU_JINR','T2_UK_London_IC','T2_US_Nebraska','T2_FR_CCIN2P3','T2_US_UCSD','T2_ES_CIEMAT',
                     'T1_FR_CCIN2P3','T2_US_Wisconsin','T2_US_MIT','T2_DE_RWTH',
                     'T1_UK_RAL','T2_US_Vanderbilt','T2_CH_CERN'],
            'max': 20000,
            'pending' : 0,
            'force' : True},
        'RunIISpring15DR74' : {
            'sites' : ['T1_ES_PIC','T1_DE_KIT','T1_US_FNAL','T1_IT_CNAF','T1_RU_JINR','T1_FR_CCIN2P3','T1_UK_RAL','T2_CH_CERN'],
            'max' : 20000,
            'pending' : 0}
        }
    
    set_to = SI.sites_AAA
    LHE_overflow = {
        'RunIIWinter15GS' : set_to,
        'RunIISummer15GS' : set_to,
        'Summer12' : set_to,
        'Summer11Leg' : set_to
        #'RunIIFall15MiniAODv2' : set_to,
        }

    pending_HLT = 0
    max_HLT = 60000
    pending_T0 = 0
    max_T0 = 60000
    try:
        gmon = json.loads(os.popen('curl -s http://cms-gwmsmon.cern.ch/prodview/json/T2_CH_CERN_HLT').read())
        pending_HLT += gmon["Running"]
        pending_HLT += gmon["MatchingIdle"]
    except:
        pass

    t0_special = [
        'vlimant_BPH-RunIISummer15GS-00030_00212_v0__160129_135314_9755',
        'pdmvserv_TSG-RunIISummer15GS-00044_00240_v0__160210_121223_8582'
        ]
    no_routing = [ 
        #'vlimant_BPH-RunIISummer15GS-00030_00212_v0__160129_135314_9755',
        #'pdmvserv_TOP-RunIIWinter15GS-00074_00187_v0__160207_162312_1992',
                   ]

    stay_within_site_whitelist = False
    specific_task=None
    if specific and ":" in specific:
        specific,specific_task = specific.split(':')

    if specific:
        wfs = session.query(Workflow).filter(Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'away').all()
        
    random.shuffle( wfs )
    for wfo in wfs:
        if wfo.name in no_routing and not options.augment:
            continue

        if specific and not specific in wfo.name: 
            continue
        if specific:
            wfi = workflowInfo(url, wfo.name)
        else:
            cached = filter(lambda d : d['RequestName']==wfo.name, workflows)
            if not cached : continue
            wfi = workflowInfo(url, wfo.name, request = cached[0])
        
        ## only running should get re-routed
        if not wfi.request['RequestStatus'] in ['running-open','running-closed'] and not specific: continue

        tasks_and_campaigns = []
        for task in wfi.getWorkTasks():
            tasks_and_campaigns.append( (task, getcampaign(task) ) )
        
        _,_,_,sec = wfi.getIO()

        ## check needs override
        needs_overide = False
        if not needs_overide and  options.augment: needs_overide=True

        def overide_from_agent( wfi, needs_overide):
            bad_agents = [] ## e.g. [ 'http://cmssrv219.fnal.gov:5984' ]
            if not bad_agents: return needs_overide
            if needs_overide: return True
            agents = wfi.getAgents()

            wqss = ['Running','Acquired']
            if any([agent in agents.get(wqs,{}).keys() for wqs,agent in itertools.product( wqss, bad_agents)]):
                print "overriding the need for bad agent"
                needs_overide = True
            return needs_overide

        ## now parse this for action
        for i_task,(task,campaign) in enumerate(tasks_and_campaigns):
            if options.augment:
                print task.pathName
                print campaign

            ### rule to avoid the issue of taskchain secondary jobs being stuck at sites processing the initial step
            if campaign in LHE_overflow:
                if task.taskType in ['Processing']:
                    needs, task_name, running, idled = needs_action(wfi, task)
                    needs_overide = overide_from_agent( wfi, needs_overide)
                    extend_to = copy.deepcopy( LHE_overflow[campaign] )
                    if stay_within_site_whitelist:
                        extend_to = list(set(extend_to) & set(wfi.request['SiteWhitelist'])) ## restrict to stupid-site-whitelist

                    if extend_to and (needs or needs_overide):
                        print "\t",task_name,"of",wfo.name,"running",running,"and pending",idled,"taking action : ReplaceSiteWhitelist"
                        modifications[wfo.name][task.pathName] = { "ReplaceSiteWhitelist" : copy.deepcopy( LHE_overflow[campaign] ) ,"Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                        #print json.dumps( modifications[wfo.name][task.pathName]['ReplaceSiteWhitelist']
                        altered_tasks.add( task.pathName )
                    else:
                        print task_name,"of",wfo.name,"running",running,"and pending",idled


            ### overflow the 76 digi-reco to the site holding the pileup
            if campaign in PU_overflow:
                force = PU_overflow[campaign]['force'] if 'force' in PU_overflow[campaign] else False
                secondary_locations = set(SI.sites_ready)
                for s in sec:
                    if not s in PU_locations:
                        presence = getDatasetPresence( url, s)
                        #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
                        one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>98.]
                        PU_locations[s] = one_secondary_locations
                    print "secondary is at",sorted(PU_locations[s])
                    secondary_locations = set(PU_locations[s]) & secondary_locations
                    
                ## we should add all sites that hold the secondary input if any
                ## NB : the intersection computed above is discarded here in
                ## favor of the ad-hoc per-campaign site list
                secondary_locations = list(set(PU_overflow[campaign]['sites']) & set( SI.sites_ready ))
                if any([task.pathName.endswith(finish) for finish in ['_0','StepOneProc','Production']]):
                    needs, task_name, running, idled = needs_action(wfi, task)
                    ## removing the ones in the site whitelist already since they encode the primary input location
                    if stay_within_site_whitelist:
                        original_site_in_use = set(wfi.request['SiteWhitelist'])
                    else:
                        original_site_in_use = set(secondary_locations)
                    ## remove the sites that have already running jobs
                    gmon = wfi.getGlideMon()
                    if gmon and task_name in gmon and 'Sites' in gmon[task_name]:
                        site_in_use = set(gmon[task_name]['Sites'])
                        ## that determines where you want to run in addition
                        #augment_by = list((set(secondary_locations)- site_in_use))
                        augment_by = list((set(secondary_locations)- site_in_use) & original_site_in_use) ## restrict to stupid-site-whitelist
                    else:
                        augment_by = list(original_site_in_use)

                    needs_overide = overide_from_agent( wfi, needs_overide)
                    if augment_by and (needs or needs_overide or force) and PU_overflow[campaign]['pending'] < PU_overflow[campaign]['max']:
                        PU_overflow[campaign]['pending'] += idled
                        print "raising overflow to",PU_overflow[campaign]['pending'],"for",PU_overflow[campaign]['max']
                        ## the step with an input ought to be the digi part : make this one go anywhere
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : augment_by , "Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                        altered_tasks.add( task.pathName )
                        print "\t",task_name,"of",wfo.name,"running",running,"and pending",idled,"taking action : AddWhitelist"
                        #print json.dumps( augment_by, indent=2 )
                    else:
                        print task_name,"of",wfo.name,"running",running,"and pending",idled

            ### overflow the skims back to multi-core 
            if campaign in ['Run2015D','Run2015C_25ns'] and task.taskType =='Skim':
                original_swl = wfi.request['SiteWhitelist']
                needs, task_name, running, idled = needs_action(wfi, task)
                if (needs or needs_overide):
                    modifications[wfo.name][task.pathName] = { 'AddWhitelist' : original_swl, 
                                                               "Running" : running, "Pending" : idled, "Priority" : wfi.request['RequestPriority']}
                    altered_tasks.add( task.pathName )
                    print "\t",task_name,"of",wfo.name,"running",running,"and pending",idled,"taking action : AddWhitelist"


            if options.augment:
                print sorted(wfi.request['SiteWhitelist']),i_task,use_HLT
            ### add the HLT at partner of CERN
            if 'T2_CH_CERN' in wfi.request['SiteWhitelist'] and i_task==0 and use_HLT:
                needs, task_name, running, idled = needs_action(wfi, task)
                if options.augment: needs=True
                needs = True ## forced on : always consider adding the HLT, bounded by max_HLT below
                ## formerly needs = random.random()<0.40 ; the random is gone, we just add up to a limit
                if (needs or needs_overide) and pending_HLT < max_HLT:
                    pending_HLT += idled
                    if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["AddWhitelist"].append( "T2_CH_CERN_HLT" )
                        print "\t",wfo.name,"adding addHLT up to",pending_HLT,"for",max_HLT
                        print task.pathName
                    ## this Replace does not work at all for HLT
                    #elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                        #modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T2_CH_CERN_HLT" )
                        #print "\t",wfo.name,"adding replace HLT up to",pending_HLT,"for",max_HLT
                    else:
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : ["T2_CH_CERN_HLT"],
                                                                   "Priority" : wfi.request['RequestPriority'],
                                                                   "Running" : running,
                                                                   "Pending" : idled}
                        print "\t",wfo.name,"adding HLT up to",pending_HLT,"for",max_HLT
                        print task.pathName

            if i_task==0 and not sec and use_T0:
                needs, task_name, running, idled = needs_action(wfi, task)
                
                if options.augment: needs=True
                #needs = True
                #if not (wfo.name in t0_special) and not options.augment: needs = False
                if not wfi.request['RequestType'] in ['MonteCarlo','MonteCarloFromGEN'] and not options.augment: needs = False
                
                ## formerly needs = random.random()<0.40 ; the random is gone, we just add up to a limit
                if (needs or needs_overide):
                    pending_T0 += idled
                    if task.pathName in modifications[wfo.name] and 'AddWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["AddWhitelist"].append( "T0_CH_CERN" )
                        print "\t",wfo.name,"adding addT0 up to",pending_T0,"for",max_T0
                        print task.pathName
                    elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T0_CH_CERN" )
                        print "\t",wfo.name,"adding replace T0 up to",pending_T0,"for",max_T0
                    else:
                        modifications[wfo.name][task.pathName] = { "AddWhitelist" : ["T0_CH_CERN"],
                                                                   "Priority" : wfi.request['RequestPriority'],
                                                                   "Running" : running,
                                                                   "Pending" : idled}
                        print "\t",wfo.name,"adding T0 up to",pending_T0,"for",max_T0
                        print task.pathName


    interface['modifications'].update( modifications )


    ## temporary core managing
    interface['cores']={'T2_CH_CERN_HLT': {'min':4,'max':16}, 'default': {'min':1, 'max':4}}
    #interface['max_cores']={'T2_CH_CERN_HLT': 16, 'default': 4}
    #interface['min_cores']={'T2_CH_CERN_HLT': 4, 'default': 1}
    #interface['resize_subtasks'] = 'RunIISpring16DR80'
    interface['resizes'] = ['RunIISpring16DR80','NotACampaign']

    ## close and save
    close( interface )
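## a minimal sketch, not part of the original code, of how the equalizor.json
## written by close() above could be read back ; the top-level layout
## {'reversed_mapping': {...}, 'modifications': {wf: {task: action}}} comes from
## the interface dict built in this function, the helper name is hypothetical
def read_equalizor_modifications(monitor_dir):
    import json
    interface = json.loads(open('%s/equalizor.json' % monitor_dir).read())
    for wf, tasks in interface['modifications'].items():
        for task_path, action in tasks.items():
            ## each action carries AddWhitelist or ReplaceSiteWhitelist plus
            ## Running / Pending / Priority bookkeeping
            print wf, task_path, sorted(action.get('AddWhitelist', action.get('ReplaceSiteWhitelist', [])))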
Ejemplo n.º 47
0
def new_recoveror(url, specific, options=None):
    if userLock('recoveror'): return

    up = componentInfo(soft=['mcm', 'wtc'])
    if not up.check(): return

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()

    wfs = session.query(Workflow).filter(
        Workflow.status.contains('recovery')).all()
    if specific:
        wfs.extend(
            session.query(Workflow).filter(
                Workflow.status == 'assistance-manual').all())

    try:
        from_operator = json.loads(
            os.popen(
                'curl -s http://vocms0113.cern.ch/actions/test.json').read())
        ## now we have a list of things that we can take action on
    except:
        pass

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        if not specific and 'manual' in wfo.status: continue

        wfi = workflowInfo(url, wfo.name)

        send_recovery = False  ## will make all acdc
        send_clone = False  ## will make a clone
        send_back = False  ## should just reject. manual ?
        send_manual = False  ## will set in manual

        where_to_run, missing_to_run = wfi.getRecoveryInfo()

        task_to_recover = where_to_run.keys()

        ## if the site at which the recovery could run in drain or out ?
        for task in task_to_recover:
            not_ready = set(where_to_run[task]) - set(SI.sites_ready)
            if not_ready:
                print "the following sites are not ready for the ACDC", ",".join(
                    sorted(not_ready))
                ## do we have a way of telling if a site is going to be out for a long time ?
                # check on priority: high prio, restart
                if wfi.request['RequestPriority'] >= 85000:
                    send_clone = True
                # check on age of the request
                injection_time = time.mktime(
                    time.strptime(
                        '.'.join(map(str, wfi.request['RequestDate'])),
                        "%Y.%m.%d.%H.%M.%S")) / (60. * 60.)
                now = time.mktime(time.gmtime()) / (60. * 60.)
                if float(now - injection_time) < 14. * 24.:
                    ## injection_time and now are in hours : less than 14 days old, start over
                    send_clone = True
                else:
                    send_manual = True

        if not send_recovery:
            ## check on whether the stats is very low : placeholder, not
            ## implemented, so send_recovery stays False and the ACDC branch is inert
            pass

        if send_recovery:
            ## make acdc for all tasks
            for task in task_to_recover:
                actions = list(
                    set([
                        case['solution']
                        for code, case in task_to_recover[task]
                    ]))
                acdc = singleRecovery(url, task, wfi.request, actions, do=True)
        elif send_clone:
            ## this will get it cloned
            wfo.status = 'assistance-clone'
            session.commit()
        elif send_manual:
            wfo.status = 'assistance-manual'
            session.commit()
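## a standalone sketch (assumption, not original code) of the request-age test
## used above : RequestDate is taken to be a [year, month, day, hour, minute,
## second] list and ages are measured in hours, hence the 14 * 24 cut for 14 days
def request_age_hours(request_date):
    import time
    injection = time.mktime(time.strptime('.'.join(map(str, request_date)), "%Y.%m.%d.%H.%M.%S"))
    return (time.mktime(time.gmtime()) - injection) / 3600.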
Ejemplo n.º 48
0
def equalizor(url, specific=None, options=None):
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return

    if not specific:
        workflows = getWorkflows(url, status='running-closed', details=True)
        workflows.extend(getWorkflows(url, status='running-open',
                                      details=True))

    ## start from scratch
    modifications = defaultdict(dict)
    ## define regionality site => fallback allowed. feed on an ssb metric ??
    mapping = defaultdict(list)
    reversed_mapping = defaultdict(list)
    regions = defaultdict(list)
    SI = siteInfo()
    CI = campaignInfo()
    UC = unifiedConfiguration()
    for site in SI.sites_ready:
        region = site.split('_')[1]
        if not region in ['US', 'DE', 'IT']: continue
        regions[region] = [region]

    def site_in_depletion(s):
        return True ## short-circuited : every site currently counts as in depletion, the pressure logic below is dead code
        if s in SI.sites_pressure:
            (m, r, pressure) = SI.sites_pressure[s]
            if float(m) < float(r):
                print s, m, r, "lacking pressure"
                return True
            else:
                print s, m, r, "pressure"
                pass

        return False

    for site in SI.sites_ready:
        region = site.split('_')[1]
        ## fallback to the region, to site with on-going low pressure
        mapping[site] = [
            fb for fb in SI.sites_ready
            if any([('_%s_' %
                     (reg) in fb and fb != site and site_in_depletion(fb))
                    for reg in regions[region]])
        ]

    use_T0 = ('T0_CH_CERN' in UC.get("site_for_overflow"))
    if options.t0: use_T0 = True
    #if options.augment : use_T0 = True

    use_HLT = ('T2_CH_CERN_HLT' in UC.get("site_for_overflow"))
    if options.hlt: use_HLT = True
    #if options.augment : use_HLT=True

    if use_HLT:
        mapping['T2_CH_CERN'].append('T2_CH_CERN_HLT')

    if use_T0:
        mapping['T2_CH_CERN'].append('T0_CH_CERN')
        #mapping['T1_FR_CCIN2P3'].append('T0_CH_CERN')

    #mapping['T2_IT_Legnaro'].append('T1_IT_CNAF')
    for reg in ['IT', 'DE', 'UK']:
        mapping['T2_CH_CERN'].extend(
            [fb for fb in SI.sites_ready if '_%s_' % reg in fb])

    ## make them appear as OK to use
    force_sites = []

    ## overflow CERN to underutilized T1s
    upcoming = json.loads(open('%s/GQ.json' % monitor_dir).read())
    for possible in SI.sites_T1s:
        if not possible in upcoming:
            mapping['T2_CH_CERN'].append(possible)

    ## remove add-hoc sites from overflow mapping
    prevent_sites = ['T2_US_Purdue']
    for prevent in prevent_sites:
        if prevent in mapping: mapping.pop(prevent)
    for src in mapping:
        for prevent in prevent_sites:
            if prevent in mapping[src]:
                mapping[src].remove(prevent)

    ## create the reverse mapping for the condor module
    for site, fallbacks in mapping.items():
        for fb in fallbacks:
            reversed_mapping[fb].append(site)

    ## this is the fallback mapping
    print "Direct mapping : site => overflow"
    print json.dumps(mapping, indent=2)
    print "Reverse mapping : dest <= from origin"
    print json.dumps(reversed_mapping, indent=2)

    altered_tasks = set()

    def running_idle(wfi, task_name):
        gmon = wfi.getGlideMon()
        #print gmon
        if not gmon: return (0, 0)
        if not task_name in gmon: return (0, 0)
        return (gmon[task_name]['Running'], gmon[task_name]['Idle'])

    def needs_action(wfi, task, min_idled=100, pressure=0.2):
        task_name = task.pathName.split('/')[-1]
        running, idled = running_idle(wfi, task_name)
        if not idled and not running:
            return False, task_name, running, idled
        if idled < min_idled:
            return False, task_name, running, idled
        ## act when everything is pending, or when the idle over running ratio
        ## exceeds the pressure threshold
        go = bool((not running and idled) or
                  (running and (idled / float(running) > pressure)))
        return go, task_name, running, idled
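    ## illustration with assumed numbers : at pressure=0.2, a task with 100
    ## running and 30 idle jobs has 30/100. = 0.3 > 0.2 so needs_action fires ;
    ## with fewer than min_idled idle jobs it never fires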

    def getPerf(task):
        task = task.split('/')[1] + '/' + task.split('/')[-1]
        try:
            u = 'http://cms-gwmsmon.cern.ch/prodview/json/history/memoryusage720/%s' % task
            print u
            perf_data = json.loads(os.popen('curl -s --retry 5 %s' % u).read())
        except Exception as e:
            print str(e)
            return (None, None)
        buckets = perf_data['aggregations']["2"]['buckets']
        s_m = sum(bucket['key'] * bucket['doc_count'] for bucket in buckets)
        w_m = sum(bucket['doc_count'] for bucket in buckets)
        m_m = max(bucket['key'] for bucket in buckets) if buckets else None

        b_m = None
        if w_m > 100:
            b_m = m_m

        try:
            perf_data = json.loads(
                os.popen(
                    'curl -s --retry 5 http://cms-gwmsmon.cern.ch/prodview/json/history/runtime720/%s'
                    % task).read())
        except Exception as e:
            print str(e)
            return (b_m, None)

        buckets = perf_data['aggregations']["2"]['buckets']
        s_t = sum(bucket['key'] * bucket['doc_count'] for bucket in buckets)
        w_t = sum(bucket['doc_count'] for bucket in buckets)
        m_t = max(bucket['key'] for bucket in buckets) if buckets else None

        b_t = None
        if w_t > 100:
            b_t = m_t

        return (b_m, b_t)
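    ## the gwmsmon history endpoints are assumed to return elasticsearch-style
    ## aggregations : {'aggregations': {'2': {'buckets': [{'key': <value>,
    ## 'doc_count': <n>}, ...]}}} ; s_m / w_m would be the weighted average but
    ## only the maximum bucket key is kept, and only when backed by > 100 docs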

    def getcampaign(task):
        taskname = task.pathName.split('/')[-1]
        if hasattr(task, 'prepID'):
            return task.prepID.split('-')[1]
        elif taskname.count('-') >= 1:
            return taskname.split('-')[1]
        else:
            return None
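    ## e.g. (illustrative name) a task 'TOP-RunIIWinter15GS-00074_0' yields the
    ## campaign 'RunIIWinter15GS', the second '-'-separated field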

    def close(interface):
        open('%s/equalizor.json.new' % monitor_dir,
             'w').write(json.dumps(interface, indent=2))
        os.system('mv %s/equalizor.json.new %s/equalizor.json' %
                  (monitor_dir, monitor_dir))
        os.system('cp %s/equalizor.json %s/logs/equalizor/equalizor.%s.json' %
                  (monitor_dir, monitor_dir, time.mktime(time.gmtime())))

    interface = {'reversed_mapping': reversed_mapping, 'modifications': {}}
    if options.augment or options.remove:
        interface['modifications'] = json.loads(
            open('%s/equalizor.json' % monitor_dir).read())['modifications']

    if options.remove:
        if specific in interface['modifications']:
            print "poping", specific
            interface['modifications'].pop(specific)
            close(interface)
        return

    PU_locations = {}
    PU_overflow = {}
    LHE_overflow = {}
    tune_performance = []

    pending_HLT = 0
    max_HLT = 60000
    pending_T0 = 0
    max_T0 = 60000
    try:
        gmon = json.loads(
            os.popen(
                'curl -s http://cms-gwmsmon.cern.ch/prodview/json/T2_CH_CERN_HLT'
            ).read())
        pending_HLT += gmon["Running"]
        pending_HLT += gmon["MatchingIdle"]
    except:
        pass

    stay_within_site_whitelist = False
    specific_task = None
    if specific and ":" in specific:
        specific, specific_task = specific.split(':')

    if specific:
        wfs = session.query(Workflow).filter(
            Workflow.name.contains(specific)).all()
    else:
        wfs = session.query(Workflow).filter(Workflow.status == 'away').all()

    performance = {}
    no_routing = []
    random.shuffle(wfs)
    for wfo in wfs:
        if wfo.name in no_routing and not options.augment:
            continue

        if specific and not specific in wfo.name:
            continue
        if specific:
            wfi = workflowInfo(url, wfo.name)
        else:
            cached = filter(lambda d: d['RequestName'] == wfo.name, workflows)
            if not cached: continue
            wfi = workflowInfo(url, wfo.name, request=cached[0])

        ## only running should get re-routed
        if not wfi.request['RequestStatus'] in [
                'running-open', 'running-closed'
        ] and not specific:
            continue

        tasks_and_campaigns = []
        for task in wfi.getWorkTasks():
            tasks_and_campaigns.append((task, getcampaign(task)))

        _, _, _, sec = wfi.getIO()

        ## check needs override
        needs_overide = False
        if not needs_overide and options.augment: needs_overide = True

        def overide_from_agent(wfi, needs_overide):
            bad_agents = []  ## e.g. [ 'http://cmssrv219.fnal.gov:5984' ]
            if not bad_agents: return needs_overide
            if needs_overide: return True
            agents = wfi.getAgents()

            wqss = ['Running', 'Acquired']
            if any([
                    agent in agents.get(wqs, {}).keys()
                    for wqs, agent in itertools.product(wqss, bad_agents)
            ]):
                print "overriding the need for bad agent"
                needs_overide = True
            return needs_overide

        ## now parse this for action
        for i_task, (task, campaign) in enumerate(tasks_and_campaigns):
            if options.augment:
                print task.pathName
                print campaign

            tune = CI.get(campaign, 'tune', options.tune)
            if tune and not campaign in tune_performance:
                tune_performance.append(campaign)

            overflow = CI.get(campaign, 'overflow', {})
            if overflow:
                if "PU" in overflow and not campaign in PU_overflow:
                    PU_overflow[campaign] = copy.deepcopy(overflow['PU'])
                    print "adding", campaign, "to PU overflow rules"
                if "LHE" in overflow and not campaign in LHE_overflow:
                    print "adding", campaign, "to light input overflow rules"
                    site_list = overflow['LHE']['site_list']
                    LHE_overflow[campaign] = copy.deepcopy(
                        getattr(SI, site_list))
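            ## the campaign configuration is assumed to look like
            ## {'overflow': {'PU': {'sites': [...], 'max': ..., 'pending': 0},
            ##               'LHE': {'site_list': 'sites_AAA'}}} where site_list
            ## names an attribute of siteInfo, resolved with getattr above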

            ### get the task performance, for further massaging.
            if campaign in tune_performance or options.tune:
                print "performance", task.taskType, task.pathName
                if task.taskType in ['Processing', 'Production']:
                    set_memory, set_time = getPerf(task.pathName)
                    #print "Performance %s GB %s min"%( set_memory,set_time)
                    wfi.sendLog(
                        'equalizor', 'Performance tuning to %s GB %s min' %
                        (set_memory, set_time))
                    ## get values from gwmsmon
                    # massage the values : 95% percentile
                    performance[task.pathName] = {}
                    if set_memory:
                        performance[task.pathName]['memory'] = set_memory
                    if set_time and False:  ## time tuning deliberately disabled
                        performance[task.pathName]['time'] = set_time

            ### rule to avoid the issue of taskchain secondary jobs being stuck at sites processing the initial step
            if campaign in LHE_overflow:
                if task.taskType in ['Processing']:
                    needs, task_name, running, idled = needs_action(wfi, task)
                    needs_overide = overide_from_agent(wfi, needs_overide)
                    extend_to = list(set(copy.deepcopy(
                        LHE_overflow[campaign])))
                    if stay_within_site_whitelist:
                        extend_to = list(
                            set(extend_to) & set(wfi.request['SiteWhitelist'])
                        )  ## restrict to stupid-site-whitelist
                    extend_to = list(
                        set(extend_to) & set(SI.sites_ready + force_sites))

                    if extend_to and (needs or needs_overide):

                        modifications[wfo.name][task.pathName] = {
                            "ReplaceSiteWhitelist": extend_to,
                            "Running": running,
                            "Pending": idled,
                            "Priority": wfi.request['RequestPriority']
                        }
                        wfi.sendLog(
                            'equalizor',
                            '%s of %s is running %d and pending %d, taking action : ReplaceSiteWhitelist \n %s'
                            %
                            (task_name, wfo.name, running, idled,
                             json.dumps(
                                 sorted(modifications[wfo.name][task.pathName]
                                        ['ReplaceSiteWhitelist']))))

                        altered_tasks.add(task.pathName)
                    else:
                        wfi.sendLog(
                            'equalizor',
                            '%s of %s is running %d and pending %d' %
                            (task_name, wfo.name, running, idled))

            ### overflow the 76 digi-reco to the site holding the pileup
            if campaign in PU_overflow:
                force = PU_overflow[campaign][
                    'force'] if 'force' in PU_overflow[campaign] else False
                secondary_locations = set(SI.sites_ready + force_sites)
                for s in sec:
                    if not s in PU_locations:
                        presence = getDatasetPresence(url, s)
                        #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
                        one_secondary_locations = [
                            site for (site, (there, frac)) in presence.items()
                            if frac > 98.
                        ]
                        PU_locations[s] = one_secondary_locations
                    print "secondary is at", sorted(PU_locations[s])
                    secondary_locations = set(
                        [SI.SE_to_CE(site)
                         for site in PU_locations[s]]) & secondary_locations

                ## we should add all sites that hold the secondary input if any
                ### given that we have the secondary location available, it is not necessary to use the add-hoc list
                ##secondary_locations = list(set(PU_overflow[campaign]['sites']) & set( SI.sites_ready ))

                if any([
                        task.pathName.endswith(finish)
                        for finish in ['_0', 'StepOneProc', 'Production']
                ]):
                    needs, task_name, running, idled = needs_action(wfi, task)
                    ## removing the ones in the site whitelist already since they encode the primary input location
                    if stay_within_site_whitelist:
                        original_site_in_use = set(
                            wfi.request['SiteWhitelist'])
                    else:
                        original_site_in_use = set(secondary_locations)
                    ## remove the sites that have already running jobs
                    gmon = wfi.getGlideMon()
                    if gmon and task_name in gmon and 'Sites' in gmon[
                            task_name]:
                        site_in_use = set(gmon[task_name]['Sites'])
                        print "removing", sorted(site_in_use)
                        ## that determines where you want to run in addition
                        augment_by = list((set(secondary_locations) -
                                           site_in_use) & original_site_in_use)
                    else:
                        print "no existing running site"
                        augment_by = list(original_site_in_use)

                    needs_overide = overide_from_agent(wfi, needs_overide)
                    if augment_by and (
                            needs or needs_overide
                            or force) and PU_overflow[campaign][
                                'pending'] < PU_overflow[campaign]['max']:
                        PU_overflow[campaign]['pending'] += idled
                        print "raising overflow to", PU_overflow[campaign][
                            'pending'], "for", PU_overflow[campaign]['max']
                        ## the step with an input ought to be the digi part : make this one go anywhere
                        modifications[wfo.name][task.pathName] = {
                            "AddWhitelist": augment_by,
                            "Running": running,
                            "Pending": idled,
                            "Priority": wfi.request['RequestPriority']
                        }
                        altered_tasks.add(task.pathName)
                        wfi.sendLog(
                            'equalizor',
                            '%s of %s is running %d and pending %d, taking action : AddWhitelist \n %s'
                            % (task_name, wfo.name, running, idled,
                               json.dumps(sorted(augment_by), indent=2)))
                    else:
                        print task_name, "of", wfo.name, "running", running, "and pending", idled

            ### overflow the skims back to multi-core
            if campaign in ['Run2015D', 'Run2015C_25ns'
                            ] and task.taskType == 'Skim':
                original_swl = wfi.request['SiteWhitelist']
                needs, task_name, running, idled = needs_action(wfi, task)
                if (needs or needs_overide):
                    modifications[wfo.name][task.pathName] = {
                        'AddWhitelist': original_swl,
                        "Running": running,
                        "Pending": idled,
                        "Priority": wfi.request['RequestPriority']
                    }
                    altered_tasks.add(task.pathName)
                    wfi.sendLog(
                        'equalizor',
                        '%s of %s is running %d and pending %d, taking action : AddWhitelist \n %s'
                        % (task_name, wfo.name, running, idled,
                           json.dumps(sorted(original_swl), indent=2)))

            if options.augment:
                print sorted(wfi.request['SiteWhitelist']), i_task, use_HLT

            ### add the HLT at partner of CERN
            if 'T2_CH_CERN' in wfi.request['SiteWhitelist'] and i_task in [
                    0, 1
            ] and use_HLT:
                needs, task_name, running, idled = needs_action(wfi, task)
                if options.augment: needs = True
                needs = True  ## forced on : always consider adding the HLT, bounded by max_HLT below
                ## formerly needs = random.random()<0.40 ; the random is gone, we just add up to a limit
                if (needs or needs_overide) and pending_HLT < max_HLT:
                    pending_HLT += idled
                    if task.pathName in modifications[
                            wfo.name] and 'AddWhitelist' in modifications[
                                wfo.name][task.pathName]:
                        modifications[wfo.name][task.pathName][
                            "AddWhitelist"].append("T2_CH_CERN_HLT")
                        print "\t", wfo.name, "adding addHLT up to", pending_HLT, "for", max_HLT
                        print task.pathName
                    ## this Replace does not work at all for HLT
                    #elif task.pathName in modifications[wfo.name] and 'ReplaceSiteWhitelist' in modifications[wfo.name][task.pathName]:
                    #modifications[wfo.name][task.pathName]["ReplaceSiteWhitelist"].append( "T2_CH_CERN_HLT" )
                    #print "\t",wfo.name,"adding replace HLT up to",pending_HLT,"for",max_HLT
                    else:
                        modifications[wfo.name][task.pathName] = {
                            "AddWhitelist": ["T2_CH_CERN_HLT"],
                            "Priority": wfi.request['RequestPriority'],
                            "Running": running,
                            "Pending": idled
                        }
                        wfi.sendLog(
                            'equalizor',
                            'adding the HLT in whitelist of %s to %d for %d' %
                            (task.pathName, pending_HLT, max_HLT))

            if i_task == 0 and not sec and use_T0:
                needs, task_name, running, idled = needs_action(wfi, task)

                if options.augment: needs = True
                #needs = True
                good_type = wfi.request['RequestType'] in [
                    'MonteCarlo', 'MonteCarloFromGEN'
                ]
                read_lhe = ((not 'LheInputFiles' in wfi.request)
                            or bool(wfi.request['LheInputFiles']))
                good_type &= not read_lhe
                if not good_type and not options.augment: needs = False

                ## formerly needs = random.random()<0.40 ; the random is gone, we just add up to a limit
                if (needs or needs_overide):
                    pending_T0 += idled
                    if task.pathName in modifications[
                            wfo.name] and 'AddWhitelist' in modifications[
                                wfo.name][task.pathName]:
                        if not "T0_CH_CERN" in modifications[wfo.name][
                                task.pathName]["AddWhitelist"]:
                            modifications[wfo.name][task.pathName][
                                "AddWhitelist"].append("T0_CH_CERN")
                            wfi.sendLog(
                                'equalizor',
                                'adding the T0 for %s to %d for %d' %
                                (task.pathName, pending_T0, max_T0))
                    elif task.pathName in modifications[
                            wfo.
                            name] and 'ReplaceSiteWhitelist' in modifications[
                                wfo.name][task.pathName]:
                        if not "T0_CH_CERN" in modifications[wfo.name][
                                task.pathName]["ReplaceSiteWhitelist"]:
                            modifications[wfo.name][task.pathName][
                                "ReplaceSiteWhitelist"].append("T0_CH_CERN")
                            wfi.sendLog(
                                'equalizor',
                                'adding the T0 to replacement for %s to %d for %d'
                                % (task.pathName, pending_T0, max_T0))
                    else:
                        modifications[wfo.name][task.pathName] = {
                            "AddWhitelist": ["T0_CH_CERN"],
                            "Priority": wfi.request['RequestPriority'],
                            "Running": running,
                            "Pending": idled
                        }
                        wfi.sendLog(
                            'equalizor', 'adding the T0 for %s to %d for %d' %
                            (task.pathName, pending_T0, max_T0))

    interface['modifications'].update(modifications)

    ###  manage the number of core and job resizing
    interface['cores'] = {
        'T2_CH_CERN_HLT': {
            'min': 4,
            'max': 16
        },
        'default': {
            'min': 1,
            'max': 4
        }
    }
    interface['resizes'] = ['RunIISpring16DR80']

    ### manage the modification of the memory and target time
    interface['time'] = defaultdict(list)
    interface['memory'] = defaultdict(list)

    max_N_mem = 10
    max_N_time = 10
    ## discretize the memory to 10 at most values
    mems = set([o['memory'] for t, o in performance.items() if 'memory' in o])
    times = set([o['time'] for t, o in performance.items() if 'time' in o])
    if len(mems) > max_N_mem:
        mem_step = int((max(mems) - min(mems)) / float(max_N_mem))
        for t in performance:
            if not 'memory' in performance[t]: continue
            (m, r) = divmod(performance[t]['memory'], mem_step)
            performance[t]['memory'] = (m + 1) * mem_step
    if len(times) > max_N_time:
        time_step = int((max(times) - min(times)) / float(max_N_time))
        for t in performance:
            if not 'time' in performance[t]: continue
            (m, r) = divmod(performance[t]['time'], time_step)
            performance[t]['time'] = (m + 1) * time_step
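    ## worked example (assumed numbers) : with memory targets spanning 1000 to
    ## 3000 MB and max_N_mem = 10 the step is 200 MB, so a measured 2350 rounds
    ## up to 2400 (divmod(2350, 200) = (11, 150) -> 12 * 200) ; values already
    ## on a boundary are also pushed one step up, which errs on the safe side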

    for t, o in performance.items():
        if 'time' in o:
            interface['time'][str(o['time'])].append(t)
        if 'memory' in o:
            interface['memory'][str(o['memory'])].append(t)

    ## close and save
    close(interface)
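## for reference, the 'time' and 'memory' sections written above map each
## discretized target, keyed as a string, to the task paths to retune, e.g.
## {"memory": {"2400": ["/wf/TaskDigi"]}} (illustrative values)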
Ejemplo n.º 49
0
def assignor(url, specific=None, talk=True, options=None):
    if userLock() and not options.manual: return
    mlock = moduleLock()
    if mlock() and not options.manual: return
    if not componentInfo().check() and not options.manual: return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = siteInfo()
    SI = global_SI()  ## overrides the line above : the cached global siteInfo is the one used
    ###NLI = newLockInfo()
    ###if not NLI.free() and not options.go: return
    LI = lockInfo()
    #if not LI.free() and not options.go and not options.manual: return

    n_assigned = 0
    n_stalled = 0

    wfos = []
    fetch_from = []
    if specific or options.early:
        fetch_from.extend(['considered', 'staging'])
    if specific:
        fetch_from.extend(['considered-tried'])

    if options.early:
        print "Option Early is on"

    fetch_from.extend(['staged'])

    if options.from_status:
        fetch_from = options.from_status.split(',')
        print "Overriding to read from", fetch_from

    for status in fetch_from:
        print "getting wf in", status
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == status).all())
        print len(wfos)

    ## in case of partial, go for fetching a list from json ?
    #if options.partial and not specific:
    #    pass

    aaa_mapping = json.loads(eosRead('%s/equalizor.json' %
                                     monitor_pub_dir))['mapping']
    all_stuck = set()
    all_stuck.update(
        json.loads(eosRead('%s/stuck_transfers.json' % monitor_pub_dir)))

    max_per_round = UC.get('max_per_round').get('assignor', None)
    max_cpuh_block = UC.get('max_cpuh_block')

    # Temporarily switch off prioritization
    random.shuffle(wfos)
    ##order by priority instead of random
    """
    if options.early:
        cache = sorted(getWorkflows(url, 'assignment-approved', details=True), key = lambda r : r['RequestPriority'])
        cache = [r['RequestName'] for r in cache]
        def rank( wfn ):
            return cache.index( wfn ) if wfn in cache else 0

        wfos = sorted(wfos, key = lambda wfo : rank( wfo.name ),reverse=True)
        print "10 first",[wfo.name for wfo in wfos[:10]]
        print "10 last",[wfo.name for wfo in wfos[-10:]]
    else:
        random.shuffle( wfos )
    """

    for wfo in wfos:

        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue

        if not options.manual and 'rucio' in (wfo.name).lower(): continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)

        if wfh.request['RequestStatus'] in [
                'rejected', 'aborted', 'aborted-completed', 'aborted-archived',
                'rejected-archived'
        ] and wfh.isRelval():
            wfo.status = 'forget'
            session.commit()
            n_stalled += 1
            continue

        if options.priority and int(
                wfh.request['RequestPriority']) < options.priority:
            continue

        options_text = ""
        if options.early: options_text += ", early option is ON"

        wfh.sendLog('assignor',
                    "%s to be assigned %s" % (wfo.name, options_text))

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary, sites_allowed,
         sites_not_allowed) = wfh.getSiteWhiteList()

        output_tiers = list(
            set([o.split('/')[-1] for o in wfh.request['OutputDatasets']]))
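        ## e.g. an output dataset '/SomePD/SomeCampaign-v1/MINIAODSIM' (an
        ## illustrative name) contributes the tier 'MINIAODSIM'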

        if not output_tiers:
            n_stalled += 1
            wfh.sendLog('assignor', 'There is no output at all')
            sendLog('assignor',
                    'Workflow %s has no output at all' % (wfo.name),
                    level='critical')
            continue

        is_stuck = (all_stuck & primary)
        if is_stuck:
            wfh.sendLog('assignor',
                        "%s are stuck input" % (','.join(is_stuck)))

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            no_go = True

        allowed_secondary = {}
        assign_parameters = {}
        check_secondary = (not wfh.isRelval())
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns:
                assign_parameters.update(CI.campaigns[campaign])

            if campaign in CI.campaigns and 'secondaries' in CI.campaigns[
                    campaign]:
                if CI.campaigns[campaign]['secondaries']:
                    allowed_secondary.update(
                        CI.campaigns[campaign]['secondaries'])
                    check_secondary = True
            if campaign in CI.campaigns and 'banned_tier' in CI.campaigns[
                    campaign]:
                banned_tier = list(
                    set(CI.campaigns[campaign]['banned_tier'])
                    & set(output_tiers))
                if banned_tier:
                    no_go = True
                    wfh.sendLog(
                        'assignor', 'These data tiers %s are not allowed' %
                        (','.join(banned_tier)))
                    sendLog('assignor',
                            'These data tiers %s are not allowed' %
                            (','.join(banned_tier)),
                            level='critical')

        if secondary and check_secondary:
            if (set(secondary) & set(allowed_secondary.keys()) !=
                    set(secondary)):
                msg = '%s is not an allowed secondary' % (
                    ', '.join(set(secondary) - set(allowed_secondary.keys())))
                wfh.sendLog('assignor', msg)
                critical_msg = msg + '\nWorkflow URL: https://dmytro.web.cern.ch/dmytro/cmsprodmon/workflows.php?prep_id=task_{}'.format(
                    wfh.getPrepIDs()[0])
                sendLog('assignor', critical_msg, level='critical')
                if not options.go:
                    no_go = True
            ## then get whether there is something more to be done by secondary
            for sec in secondary:
                if sec in allowed_secondary:  # and 'parameters' in allowed_secondary[sec]:
                    assign_parameters.update(allowed_secondary[sec])

        if no_go:
            n_stalled += 1
            ## make a very loud noise if >100k priority stalled
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            if not options.test:
                wfh.sendLog('assignor',
                            "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request['RequestStatus']
                wfo.status = 'away'
                session.commit()
                continue
            else:
                print wfo.name, wfh.request['RequestStatus']

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog('assignor', "cannot decide on version number")
                n_stalled += 1
                wfo.status = 'trouble'
                session.commit()
                continue

        wfh.sendLog('assignor', "Site white list %s" % sorted(sites_allowed))

        blocks = wfh.getBlocks()
        if blocks:
            wfh.sendLog(
                'assignor',
                "Needs {} blocks in input {}".format(len(blocks),
                                                     '\n'.join(blocks)))
        wfh.sendLog('assignor', "Allowed %s" % sorted(sites_allowed))

        primary_aaa = options.primary_aaa
        secondary_aaa = options.secondary_aaa

        if 'Campaign' in wfh.request and wfh.request[
                'Campaign'] in CI.campaigns:
            assign_parameters.update(CI.campaigns[wfh.request['Campaign']])

        if 'primary_AAA' in assign_parameters and primary:
            primary_aaa = primary_aaa or assign_parameters['primary_AAA']
        if 'secondary_AAA' in assign_parameters:
            secondary_aaa = secondary_aaa or assign_parameters['secondary_AAA']

        wfh.sendLog(
            'assignor',
            "Initial values for primary_AAA=%s and secondary_AAA=%s" %
            (primary_aaa, secondary_aaa))

        if primary_aaa:
            if "T2_CH_CERN_HLT" in sites_allowed:
                sites_allowed.remove("T2_CH_CERN_HLT")
            if "T2_CH_CERN_HLT" not in sites_not_allowed:
                sites_not_allowed.append("T2_CH_CERN_HLT")

        ## keep track of this, after secondary input location restriction : that's how you want to operate it
        initial_sites_allowed = copy.deepcopy(sites_allowed)

        set_lfn = '/store/mc'  ## by default

        for prim in list(primary):
            set_lfn = getLFNbase(prim)
            ## if they are requested for processing, they should all be closed already
            # FIXME: remove this closeAllBlocks
            #closeAllBlocks(url, prim, blocks)

        ## should be 2 but for the time-being let's lower it to get things going
        _copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog('assignor', "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            #sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                'assignor',
                '%s requires a large number of CPUh %s , not assigning, please check with requester'
                % (wfo.name, cpuh),
                level='critical')
            wfh.sendLog(
                'assignor',
                "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        ## default back to white list to original white list with any data
        wfh.sendLog('assignor', "Allowed sites :%s" % sorted(sites_allowed))

        # TODO Alan on 1/april/2020: keep the AAA functionality
        if primary_aaa:
            ## if no site holds the primary input locally, drop AAA ; otherwise extend the whitelist to sites that can read it over xrootd
            if not sites_allowed:
                wfh.sendLog('assignor',
                            "Overriding the primary AAA setting to Off")
                primary_aaa = False
            else:
                aaa_grid = set(sites_allowed)
                for site in list(aaa_grid):
                    aaa_grid.update(aaa_mapping.get(site, []))
                sites_allowed = list(set(initial_sites_allowed) & aaa_grid)
                wfh.sendLog(
                    'assignor', "Selected to read primary through xrootd %s" %
                    sorted(sites_allowed))
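                ## e.g. (illustrative) : if T1_US_FNAL holds the data and the
                ## mapping lists nearby T2s as its xrootd neighbourhood, those
                ## T2s join the whitelist provided they were in
                ## initial_sites_allowed to begin with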

        isStoreResults = ('StoreResults' == wfh.request.setdefault(
            'RequestType', None))

        if isStoreResults:
            if 'MergedLFNBase' in wfh.request:
                set_lfn = wfh.request['MergedLFNBase']
            else:
                n_stalled += 1
                wfh.sendLog(
                    'assignor',
                    "Cannot assign StoreResults request because MergedLFN is missing"
                )
                sendLog(
                    'assignor',
                    'Cannot assign StoreResults request because MergedLFN is missing',
                    level='critical')
                continue

        if not primary_aaa:
            if isStoreResults:
                ## if we are dealing with a StoreResults request, we don't need to check dataset availability and
                ## should use the SiteWhiteList set in the original request
                if 'SiteWhitelist' in wfh.request:
                    sites_allowed = wfh.request['SiteWhitelist']
                else:
                    wfh.sendLog(
                        'assignor',
                        "Cannot assign StoreResults request because SiteWhitelist is missing"
                    )
                    sendLog(
                        'assignor',
                        'Cannot assign StoreResults request because SiteWhitelist is missing',
                        level='critical')
                    n_stalled += 1
                    continue
            wfh.sendLog('assignor',
                        "Selected for any data %s" % sorted(sites_allowed))

        #if not len(sites_allowed):
        #    if not options.early:
        #        wfh.sendLog('assignor',"cannot be assign with no matched sites")
        #        sendLog('assignor','%s has no whitelist'% wfo.name, level='critical')
        #    n_stalled+=1
        #    continue

        if not len(sites_allowed) and not options.SiteWhitelist:
            if not options.early:
                wfh.sendLog('assignor',
                            "cannot be assign with no matched sites")
                sendLog('assignor',
                        '%s has no whitelist' % wfo.name,
                        level='critical')
            n_stalled += 1
            continue

        t1t2_only = [
            ce for ce in sites_allowed
            if ce.startswith('T1') or ce.startswith('T2')
        ]
        if t1t2_only:
            # try to pick from T1T2 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1t2_only])]
            # then pick any otherwise
        else:
            sites_out = [
                SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])
            ]

        print "available=", SI.disk[sites_out[0]]
        wfh.sendLog('assignor', "Placing the output on %s" % sites_out)
        parameters = {
            'SiteWhitelist': sites_allowed,
            'SiteBlacklist': sites_not_allowed,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': set_lfn,
            'ProcessingVersion': version,
        }

        if primary_aaa:
            parameters['TrustSitelists'] = True
            wfh.sendLog(
                'assignor',
                "Reading primary through xrootd at %s" % sorted(sites_allowed))

        if secondary_aaa:
            # Do not set TrustPUSitelist to True if there is no secondary
            if secondary:
                parameters['TrustPUSitelists'] = True
                wfh.sendLog(
                    'assignor', "Reading secondary through xrootd at %s" %
                    sorted(sites_allowed))

        ## plain assignment here
        team = 'production'
        if os.getenv('UNIFIED_TEAM'): team = os.getenv('UNIFIED_TEAM')
        if options and options.team:
            team = options.team
        parameters['Team'] = team

        if lheinput:
            ## throttle the reading of LHE input
            wfh.sendLog('assignor',
                        'Setting the number of events per job to 500k max')
            parameters['EventsPerJob'] = 500000

        def pick_options(options, parameters):
            ##parse options entered in command line if any
            if options:
                for key in reqMgrClient.assignWorkflow.keys:
                    v = getattr(options, key)
                    if v != None:
                        if type(v) == str and ',' in v:
                            parameters[key] = filter(None, v.split(','))
                        else:
                            parameters[key] = v

        def pick_campaign(assign_parameters, parameters):
            ## pick up campaign specific assignment parameters
            parameters.update(assign_parameters.get('parameters', {}))

        if options.force_options:
            pick_campaign(assign_parameters, parameters)
            pick_options(options, parameters)
        else:
            ## campaign parameters update last
            pick_options(options, parameters)
            pick_campaign(assign_parameters, parameters)
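        ## i.e. with options.force_options set, the command-line values win over
        ## the campaign defaults ; otherwise the campaign parameters, applied
        ## last, take precedence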

        if not options.test:
            parameters['execute'] = True

        hold_split, split_check = wfh.checkSplitting()
        if hold_split and not options.go:
            if split_check:
                wfh.sendLog(
                    'assignor', 'Holding on to the change in splitting %s' %
                    ('\n\n'.join([str(i) for i in split_check])))
            else:
                wfh.sendLog('assignor', 'Change of splitting is on hold')
            n_stalled += 1
            continue

        if split_check is None or split_check is False:
            n_stalled += 1
            continue
        elif split_check:
            ## operate all recommended changes
            reqMgrClient.setWorkflowSplitting(url, wfo.name, split_check)
            wfh.sendLog(
                'assignor', 'Applying the change in splitting %s' %
                ('\n\n'.join([str(i) for i in split_check])))

        split_check = True  ## bypass completely and use the above

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if 'PU_RD' in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if 'PU_RD2' in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
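                ## worked example, assumed numbers : 1e8 events with reqJobs =
                ## 2000 gives eventsPerJob = int(1e8 / 2800.) = 35714 ; at 500
                ## events per lumi that is lumisPerJob = 71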
                if lumisPerJob == 0:
                    #sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog('assignor',
                            "%s needs to be split by event with %s per job" %
                            (wfo.name, eventsPerJob),
                            level='critical')
                    wfh.sendLog(
                        'assignor',
                        "%s needs to be split by event with %s per job" %
                        (wfo.name, eventsPerJob))
                    parameters['EventsPerJob'] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    # prefer the exact per-job estimate, fall back to the average
                    eventsPerJobEstimated = spl.get(
                        'events_per_job', spl.get('avg_events_per_job'))
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        #sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog('assignor',
                                "%s was assigned with %s lumis/job" %
                                (wfo.name, lumisPerJob),
                                level='critical')
                        wfh.sendLog(
                            'assignor', "%s was assigned with %s lumis/job" %
                            (wfo.name, lumisPerJob))
                        parameters['LumisPerJob'] = lumisPerJob
                    else:
                        #sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            'assignor',
                            "leaving splitting untouched for %s, please check on %s"
                            % (pstring, wfo.name),
                            level='critical')
                        wfh.sendLog(
                            'assignor',
                            "leaving splitting untouched for PU_RD*, please check."
                        )

        ## make sure to autoapprove all NonCustodialSites
        parameters['AutoApproveSubscriptionSites'] = list(
            set(parameters['NonCustodialSites'] +
                parameters.get('AutoApproveSubscriptionSites', [])))
        result = reqMgrClient.assignWorkflow(
            url, wfo.name, None,
            parameters)  ## team is not relevant anymore here

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
                n_assigned += 1
                wfh.sendLog(
                    'assignor', "Properly assigned\n%s" %
                    (json.dumps(parameters, indent=2)))
                if wfh.producePremix() and (not wfh.isRelval()):
                    title = "Heavy workflow assigned to {}".format(
                        parameters['SiteWhitelist'])
                    body = "Workflow name: {}".format(
                        wfh.request['RequestName'])
                    body += "\nOutput dataset(s): {}".format(
                        wfh.request['OutputDatasets'])
                    body += "\nAssigned to: {}".format(
                        parameters['SiteWhitelist'])
                    sendEmail(
                        title,
                        body,
                        destination=[
                            '*****@*****.**'
                        ])

                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(
                            sec) + new_wfi.request['OutputDatasets']:
                        ## lock all outputs
                        LI.lock(secure, reason='assigning')

                except Exception as e:
                    print "failed locking the output"
                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                wfh.sendLog(
                    'assignor',
                    "Failed to assign %s.\n%s \n Please check the logs" %
                    (wfo.name, reqMgrClient.assignWorkflow.errorMessage))
                sendLog('assignor',
                        "Failed to assign %s.\n%s \n Please check the logs" %
                        (wfo.name, reqMgrClient.assignWorkflow.errorMessage),
                        level='critical')
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog('assignor', "Assigned %d Stalled %s" % (n_assigned, n_stalled))
    if n_stalled and not options.go and not options.early:
        sendLog('assignor',
                "%s workflows cannot be assigned. Please take a look" %
                (n_stalled),
                level='critical')
Ejemplo n.º 50
0
def completor(url, specific):
    mlock = moduleLock(silent=True)
    if mlock(): return 


    use_mcm = True
    up = componentInfo(soft=['mcm','wtc','jira'])
    if not up.check(): return
    use_mcm = up.status['mcm']
    if use_mcm:
        mcm = McMClient(dev=False)

    safe_mode = False

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    JC = JIRAClient() if up.status.get('jira',False) else None

    wfs = []
    wfs.extend( session.query(Workflow).filter(Workflow.status == 'away').all() )
    wfs.extend( session.query(Workflow).filter(Workflow.status.startswith('assistance')).all() )

    ## just take it in random order so that not always the same is seen
    random.shuffle( wfs )

    max_per_round = UC.get('max_per_round').get('completor',None)
    if max_per_round and not specific: wfs = wfs[:max_per_round]

    all_stuck = set()
    ## take into account what stagor was saying
    for itry in range(5):
        try:
            all_stuck.update( json.loads( eosRead('%s/stuck_transfers.json'%monitor_pub_dir)))
            break
        except:
            time.sleep(2)
        
    for itry in range(5):
        try:
            ## take into account the blocks that needed to be repositioned recently
            all_stuck.update( [b.split('#')[0] for b in json.loads( eosRead('%s/missing_blocks.json'%monitor_dir)) ] )
            break
        except:
            time.sleep(2)

    ## take into account all stuck block and dataset from transfer team
    all_stuck.update( getAllStuckDataset()) 


    good_fractions = {}
    overdoing_fractions = {}
    truncate_fractions = {} 
    timeout = {}
    campaign_injection_delay = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']
        if 'truncate-complete' in CI.campaigns[c]:
            truncate_fractions[c] = CI.campaigns[c]['truncate-complete']
        if 'force-timeout' in CI.campaigns[c]:
            timeout[c] = CI.campaigns[c]['force-timeout']
        if 'injection-delay' in CI.campaigns[c]:
            campaign_injection_delay[c] = CI.campaigns[c]['injection-delay']
        if 'overdoing-complete' in CI.campaigns[c]:
            overdoing_fractions[c] = CI.campaigns[c]['overdoing-complete']

    long_lasting = {}

    WI = wtcInfo()
    overrides = WI.getForce()
    if use_mcm:    
        ## add all workflow that mcm wants to get force completed
        mcm_force = mcm.get('/restapi/requests/forcecomplete')
        ## assuming this will be a list of actual prepids
        overrides['mcm'] = mcm_force
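        ## expected shape (an assumption): overrides maps requester -> list of
        ## workflow names or prepids, e.g. {'mcm': ['TOP-RunIISummer15GS-00001']}
        ## (hypothetical prepid)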

    print "can force complete on"
    print json.dumps( good_fractions ,indent=2)
    print "can truncate complete on"
    print json.dumps( truncate_fractions ,indent=2)
    print "can overide on"
    print json.dumps( overrides, indent=2)
    max_force = UC.get("max_force_complete")
    max_priority = UC.get("max_tail_priority")
    injection_delay_threshold = UC.get("injection_delay_threshold")
    injection_delay_priority = UC.get("injection_delay_priority")
    delay_priority_increase = UC.get("delay_priority_increase")
    default_fraction_overdoing = UC.get('default_fraction_overdoing')

    set_force_complete = set()

    # priority and time above which to fire a JIRA
    jira_priority_and_delays = { 110000 : 21,
                                 90000 : 28,
                            #     80000 : 60,
                            #0 : 90
                             }
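    ## reading of the thresholds above (added note): e.g. a workflow at priority
    ## >= 110000 running for >= 21 days gets a JIRA opened or re-opened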

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at",wfo.name

        ## get all of the same
        wfi = workflowInfo(url, wfo.name)
        pids = wfi.getPrepIDs()
        skip=False
        campaigns = wfi.getCampaigns()

        #if not any([c in good_fractions.keys() for c in campaigns]): skip=True
        #if not any([c in truncate_fractions.keys() for c in campaigns]): skip=True

        for user,spec in overrides.items():
            if not spec: continue
            spec = filter(None, spec)
            if not wfi.request['RequestStatus'] in ['force-complete', 'completed']:
                if any(s in wfo.name for s in spec) or (wfo.name in spec) or any(pid in spec for pid in pids) or any(s in pids for s in spec):

                    wfi = workflowInfo(url, wfo.name)
                    forceComplete(url , wfi )
                    skip=True
                    wfi.notifyRequestor("The workflow %s was force completed by request of %s"%(wfo.name,user), do_batch=False)
                    wfi.sendLog('completor','%s is asking for %s to be force complete'%(user,wfo.name))
                    break
    
        if wfo.status.startswith('assistance'): skip = True

        if skip: 
            continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        if not wfi.request['RequestStatus'] in ['acquired','running-open','running-closed']: continue



        ## until we can map the output to task ...
        output_per_task = wfi.getOutputPerTask() ## can use that one, and follow mapping
        good_fraction_per_out = {}
        good_fraction_nodelay_per_out = {}
        truncate_fraction_per_out = {}
        #allowed_delay_per_out = {}
        for task,outs in output_per_task.items():
            task_campaign = wfi.getCampaignPerTask( task )
            for out in outs:
                good_fraction_per_out[out] = good_fractions.get(task_campaign,1000.)
                good_fraction_nodelay_per_out[out] = overdoing_fractions.get(task_campaign,default_fraction_overdoing)
                truncate_fraction_per_out[out] = truncate_fractions.get(task_campaign,1000.)
                #allowed_delay_per_out[out] = timeout.get(task_campaign, 14)

        #print "force at", json.dumps( good_fraction_per_out, indent=2)
        #print "truncate at",json.dumps( truncate_fraction_per_out, indent=2)

        now = time.mktime(time.gmtime()) / (60*60*24.)

        priority_log = filter(lambda change: change['Priority'] == priority,wfi.request.get('PriorityTransition',[]))
        if not priority_log:
            print "\tHas no priority log"
            priority_delay = 0
        else:
            then = max([change['UpdateTime'] for change in priority_log]) / (60.*60.*24.)
            priority_delay = now - then ## in days
            print "priority was set to",priority,priority_delay,"[days] ago"

        running_log = filter(lambda change : change["Status"] in ["running-open","running-closed"],wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            delay = 0
        else:
            then = max([change['UpdateTime'] for change in running_log]) / (60.*60.*24.)
            delay = now - then ## in days

        #further check on delays
        cpuh = wfi.getComputingTime(unit='d')
        wfi.sendLog('completor',"Requires %.1f [CPUd], running since %.1f [day], last priority change to %d done %.1f [day] ago"%( cpuh, delay, priority, priority_delay))
        if priority_delay!=0 and priority_delay < delay:
            ## regardless when it started running, set the delay to when priority was changed last
            delay = priority_delay

        ## this is supposed to be the very initial request date, inherited from clones
        injection_delay = None
        original = wfi
        if 'OriginalRequestName' in original.request:
            ## go up the clone chain
            original = workflowInfo(url, original.request['OriginalRequestName'])
        injected_log = filter(lambda change : change["Status"] in ["assignment-approved"],original.request['RequestTransition'])
        if injected_log:
            injected_on = injected_log[-1]['UpdateTime'] / (60.*60.*24.)
            injection_delay = now - injected_on
        

        delay_for_priority_increase = injection_delay
        #delay_for_priority_increase = delay

        (w,d) = divmod(delay, 7 )
        print "\t"*int(w)+"Running since",delay,"[days] priority=",priority
        
        pop_a_jira = False
        ping_on_jira = 7 *(24*60*60) # 7 days
        for jp,jd in jira_priority_and_delays.items():
            if priority >= jp and delay >= jd: pop_a_jira = True

        if pop_a_jira and JC:
            j,reopened,just_created = JC.create_or_last( prepid = wfi.request['PrepID'],
                                                    priority = wfi.request['RequestPriority'],
                                                    label = 'Late',
                                                    reopen = True)
            last_time = JC.last_time( j )
            since_last_ping = time.mktime(time.gmtime()) - last_time
            if since_last_ping > ping_on_jira or just_created:
                j_comment = "Running since %.1f [days] at priority %d"%( delay, priority)
                JC.comment(j.key, j_comment)
            

        if delay_for_priority_increase!=None and delay_for_priority_increase > injection_delay_threshold and priority >= injection_delay_priority:
            quantized = 5000 ## quantize priority
            tail_cutting_priority = wfi.request['InitialPriority']+ int((delay_priority_increase * (delay_for_priority_increase - injection_delay_threshold) / 7) / quantized) * quantized
            tail_cutting_priority += 101 ## to signal it is from this mechanism
            tail_cutting_priority = min(400000, tail_cutting_priority) ## never go above 400k priority
            tail_cutting_priority = max(tail_cutting_priority, priority) ## never go below the current value
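            ## worked example (added, hypothetical config): InitialPriority=85000,
            ## delay_priority_increase=10000 per week, injected 21 days ago with a
            ## 14-day threshold -> 85000 + int((10000*7/7)/5000)*5000 + 101 = 95101,
            ## then clamped between the current priority and 400000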
            
            if priority < tail_cutting_priority:
                if max_priority:
                    sendLog('completor',"%s Injected since %s [days] priority=%s, increasing to %s"%(wfo.name,delay_for_priority_increase,priority, tail_cutting_priority), level='critical')
                    wfi.sendLog('completor','bumping priority to %d for being injected since %s'%( tail_cutting_priority, delay_for_priority_increase))

                    reqMgrClient.changePriorityWorkflow(url, wfo.name, tail_cutting_priority)
                    max_priority-=1
                else:
                    sendLog('completor',"%s Injected since %s [days] priority=%s, would like to increase to %s"%(wfo.name,delay_for_priority_increase,priority, tail_cutting_priority), level='critical')
                    wfi.sendLog('completor','would like to bump priority to %d for being injected since %s'%( tail_cutting_priority, delay_for_priority_increase))

                    print "Could be changing the priority to higher value, but too many already were done"

        _,prim,_,_ = wfi.getIO()
        is_stuck = all_stuck & prim
        if is_stuck: wfi.sendLog('completor','%s is stuck'%','.join(is_stuck))

        monitor_delay = 7
        allowed_delay = max([timeout.get(c,14) for c in campaigns])
        monitor_delay = min(monitor_delay, allowed_delay)

        ### just skip if too early, just for the sake of not computing the completion fraction just now.
        # maybe this is fast enough that we can do it for all
        if delay <= monitor_delay: 
            print "not enough time has passed yet"
            continue

        long_lasting[wfo.name] = { "delay" : delay,
                                   "injection_delay" : injection_delay }

        percent_completions = wfi.getCompletionFraction(caller='completor')
        
        if not percent_completions:
            sendLog('completor','%s has no output at all'% wfo.name, level='critical')
            continue

        is_over_allowed_delay = (all([percent_completions[out] >= good_fraction_per_out.get(out,1000.) for out in percent_completions]) and delay >= allowed_delay)
        is_over_truncation_delay = (is_stuck and (all([percent_completions[out] >= truncate_fraction_per_out.get(out,1000.) for out in percent_completions])) and delay >= allowed_delay)
        is_over_completion = (all([percent_completions[out] >= good_fraction_nodelay_per_out.get(out,1000.) for out in percent_completions]))

        if is_over_completion:
            wfi.sendLog('completor', "all is over completed %s\n %s"%( json.dumps( good_fraction_nodelay_per_out, indent=2 ),
                                                                       json.dumps( percent_completions, indent=2 )
                                                                       ))
        elif is_over_allowed_delay:
            wfi.sendLog('completor', "all is above %s \n%s"%( json.dumps(good_fraction_per_out, indent=2 ), 
                                                              json.dumps( percent_completions, indent=2 )
                                                              ))
        elif is_over_truncation_delay:
            wfi.sendLog('completor', "all is above %s truncation level, and the input is stuck\n%s"%( json.dumps(truncate_fraction_per_out, indent=2 ),
                                                                                                      json.dumps( percent_completions, indent=2 ) ) )

        else:
            long_lasting[wfo.name].update({
                    'completion': sum(percent_completions.values()) / len(percent_completions),
                    'completions' : percent_completions
                    })
            
            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            wfi.sendLog('completor', "%s not over bound \ncomplete at %s \n truncate at %s \nRunning %s"%(json.dumps( percent_completions, indent=2), 
                                                                                                 json.dumps(good_fraction_per_out, indent=2),
                                                                                                 json.dumps( truncate_fraction_per_out, indent=2),
                                                                                                 json.dumps( long_lasting[wfo.name]['agents'], indent=2) ))
            continue

        #for output in  percent_completions:
        #    completions[output]['injected'] = then
            

        ran_at = wfi.request['SiteWhitelist']
                        
        wfi.sendLog('completor',"Required %s, time spend %s"%( cpuh, delay))
                    
        ##### WILL FORCE COMPLETE BELOW
        # only really force complete after n days

        ## find ACDCs that might be running
        if max_force>0:
            print "going for force-complete of",wfo.name
            if not safe_mode:
                forceComplete(url, wfi )
                set_force_complete.add( wfo.name )
                wfi.sendLog('completor','going for force completing')
                wfi.notifyRequestor("The workflow %s was force completed for running too long"% wfo.name)
                max_force -=1
            else:
                sendEmail('completor', 'The workflow %s is ready for force complete, but completor is in safe mode'%wfo.name)
        else:
            wfi.sendLog('completor',"too many completion this round, cannot force complete")

    if set_force_complete:
        sendLog('completor','The following were set force-complete \n%s'%('\n'.join(set_force_complete)))
    
    #open('%s/completions.json'%monitor_dir,'w').write( json.dumps( completions , indent=2))
    text="These have been running for long"
    
    #open('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 ))
    eosFile('%s/longlasting.json'%monitor_dir,'w').write( json.dumps( long_lasting, indent=2 )).close()

    for wf,info in sorted(long_lasting.items(), key=lambda tp:tp[1]['delay'], reverse=True):
        delay = info['delay']
        text += "\n %s : %s days"% (wf, delay)
        if 'completion' in info:
            text += " %d%%"%( info['completion']*100 )


    print text
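
## hedged usage sketch (added; not in the original source): drive completor()
## against the production reqmgr URL used elsewhere in this file, for one
## hypothetical workflow-name fragment
#if __name__ == '__main__':
#    completor(url='cmsweb.cern.ch', specific='pdmvserv_task')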
Ejemplo n.º 51
0
def main():
    url = 'cmsweb.cern.ch'
    url_tb = 'cmsweb-testbed.cern.ch'
    
    # Example: python assign.py -w amaltaro_RVZTT_120404_163607_6269
    # -t testbed-relval -s T1_US_FNAL -e CMSSW_6_0_0_pre1_FS_TEST_WMA -p v1 -a
    # relval -l /store/backfill/1
    usage = "usage: %prog [options] [WORKFLOW]"
    
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-t', '--team', help='Type of Requests', dest='team')
    parser.add_option('-s', '--sites', help=' "t1" for Tier-1\'s and "t2" for Tier-2\'s', dest='sites')
    parser.add_option('--special',  help='Use it for special workflows. You also have to change the code according to the type of WF', dest='special')
    parser.add_option('-r', '--replica', action='store_true', dest='replica', default=False, help='Adds a _Disk Non-Custodial Replica parameter')
    parser.add_option('-p', '--procversion', help='Processing Version, if empty it will leave the processing version that comes by default in the request', dest='procversion')
    parser.add_option('-a', '--activity', help='Dashboard Activity (reprocessing, production or test), if empty will set reprocessing as default', dest='activity')
    parser.add_option('-x', '--xrootd', help='Assign with trustSiteLocation=True (allows xrootd capabilities)',
                                        action='store_true', default=False, dest='xrootd')
    parser.add_option('-l', '--lfn', help='Merged LFN base', dest='lfn')
    parser.add_option('-v', '--verbose', help='Verbose', action='store_true', default=False, dest='verbose')
    parser.add_option('--testbed', help='Assign in testbed', action='store_true', default=False, dest='testbed')
    parser.add_option('--test', action="store_true",help='Nothing is injected, only print information about workflow and Era', dest='test')
    parser.add_option('-f', '--file', help='Text file with a list of workflows. If this option is used, the same settings will be applied to all workflows', dest='file')
    parser.add_option('-w', '--workflow', help='Workflow Name', dest='workflow')
    parser.add_option('-e', '--era', help='Acquisition era', dest='era')
    parser.add_option("--procstr", dest="procstring", help="Overrides Processing String with a single string")

    (options, args) = parser.parse_args()
    
    if options.testbed:
        url = url_tb

    # parse input workflows and files. If both -w and -f options are used, then only the -w inputs are considered.
    if not options.workflow:
        if args:
            wfs = args
        elif options.file:
            wfs = [l.strip() for l in open(options.file) if l.strip()]
        else:
            parser.error("Input a workflow name or a file to read them")
            sys.exit(0)
    else:
        wfs = [options.workflow]

    #Default values
    era = {}
    procversion = 1
    procstring = {}
    replica = False
    sites = ALL_SITES
    specialStr = ''
    taskchain = False
    team = 'production'
    trust_site = False

    SI = siteInfo()
    # Handling the parameters given in the command line
    # parse site list
    if options.sites:
        if options.sites == "t1":
            sites = SI.sites_T1s
        elif options.sites == "t2":
            sites = SI.sites_T2s
        else: 
            sites = [site for site in options.sites.split(',')]
    else: 
        sites = SI.sites_T1s + SI.sites_T2s
    if options.team:
        team = options.team

    if options.xrootd:
        trust_site = True

    if options.replica:
        replica = True

    for wf in wfs:
        # Getting the original dictionary
        schema = getRequestDict(url, wf)
        wf = reqMgr.Workflow(wf, url=url)

        # WF must be in assignment-approved in order to be assigned
        if (schema["RequestStatus"] != "assignment-approved"):
            print("The workflow '" + wf.name + "' you are trying to assign is not in assignment-approved")
            sys.exit(1)

        #Check to see if the workflow is a task chain or an ACDC of a taskchain
        taskchain = (schema["RequestType"] == "TaskChain") or ((schema["RequestType"] == "Resubmission") and "task" in schema["InitialTaskPath"].split("/")[1])

        #Dealing with era and proc string
        if taskchain:
            # Setting the Era and ProcStr values per Task
            for key, value in schema.items():
                if type(value) is dict and key.startswith("Task"):
                    try:
                        if 'ProcessingString' in value:
                            procstring[value['TaskName']] = value['ProcessingString']
                        else:
                            procstring[value['TaskName']] = schema['ProcessingString']
                        if 'AcquisitionEra' in value:
                            era[value['TaskName']] = value['AcquisitionEra']
                        else:
                            era[value['TaskName']] = schema['AcquisitionEra']
                    except KeyError:
                        print("This taskchain request has no AcquisitionEra or ProcessingString defined into the Tasks, aborting...")
                        sys.exit(1)
        # Adding the special string - in case it was provided in the command line
        if options.special:
            specialStr = '_' + str(options.special)
            for key, value in procstring.items():
                procstring[key] = value + specialStr
        # Override if a value is given using the procstring command
        if options.procstring:
            procstring = options.procstring
        elif not taskchain:
            procstring = wf.info['ProcessingString']
        if options.era:
            era = options.era
        elif not taskchain:
            era = wf.info['AcquisitionEra']
        #Set era and procstring to none for merge ACDCs inside a task chain
        if schema["RequestType"] == "Resubmission" and wf.info["PrepID"].startswith("task") and "Merge" in schema["InitialTaskPath"].split("/")[-2]:
            era = None
            procstring = None

        # Must use --lfn option, otherwise workflow won't be assigned
        if options.lfn:
            lfn = options.lfn
        elif "MergedLFNBase" in wf.info:
            lfn = wf.info['MergedLFNBase']
        else:
            print "Can't assign the workflow! Please include workflow lfn using --lfn option."
            sys.exit(0)
        # activity is 'production' by default for taskchains, 'reprocessing' by default for other workflows
        if options.activity:
            activity = options.activity
        elif taskchain:
            activity = 'production'
        else:
            activity = 'reprocessing'

        # given or default processing version
        if options.procversion:
            procversion = int(options.procversion)
        else:
            procversion = wf.info["ProcessingVersion"]

        # Check for output dataset existence, and abort if output datasets already exist!
        # Don't perform this check for ACDC's
        datasets = schema["OutputDatasets"]
        i = 0
        if not (schema["RequestType"] == "Resubmission" ):
            exist = False
            maxv = 1
            for key, value in schema.items():
                if type(value) is dict and key.startswith("Task"):
                    dbsapi = DbsApi(url=dbs3_url)
                    
                    # list all datasets with same name but different version
                    # numbers
                    datasets = dbsapi.listDatasets(acquisition_era_name=value['AcquisitionEra'], primary_ds_name=value['PrimaryDataset'], detail=True, dataset_access_type='*')
                    processedName = value['AcquisitionEra'] + '-' + value['ProcessingString'] + "-v\\d+"
                    # see if any of the dataset names is a match
                    for ds in datasets:
                        if re.match(processedName, ds['processed_ds_name']):
                            print "Existing dset:", ds['dataset'], "(%s)" % ds['dataset_access_type']
                            maxv = max(maxv, ds['processing_version'])
                            exist = True
                    i += 1
            # suggest max version
            if exist and procversion <= maxv:
                print "Some output datasets exist, its advised to assign with v ==", maxv + 1
                sys.exit(0)

        # If the --test argument was provided, then just print the information
        # gathered so far and abort the assignment
        if options.test:
            print "%s \tEra: %s \tProcStr: %s \tProcVer: %s" % (wf.name, era, procstring, procversion)
            print "LFN: %s \tTeam: %s \tSite: %s" % (lfn, team, sites)
            print "Taskchain? " + str(taskchain)
            print "Activity:" + activity
            sys.exit(0)
        
        # Really assigning the workflow now
        print wf.name, '\tEra:', era, '\tProcStr:', procstring, '\tProcVer:', procversion, '\tTeam:', team, '\tSite:', sites
        assignRequest(url, wf.name, team, sites, era, procversion, activity, lfn, procstring, trust_site, options.replica, options.verbose, taskchain)
    
    sys.exit(0)
Ejemplo n.º 52
0
def new_recoveror(url, specific, options=None):
    if userLock('recoveror'): return

    up = componentInfo(soft=['mcm','wtc','jira'])
    if not up.check(): return

    CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()

    wfs = session.query(Workflow).filter(Workflow.status.contains('recovery')).all()
    if specific:
        wfs.extend( session.query(Workflow).filter(Workflow.status == 'assistance-manual').all() )    

    try:
        from_operator = json.loads(os.popen('curl -s http://vocms0113.cern.ch/actions/test.json').read())
        ## now we have a list of things that we can take action on
    except:
        pass




    for wfo in wfs:
        if specific and not specific in wfo.name:continue

        if not specific and 'manual' in wfo.status: continue
        
        wfi = workflowInfo(url, wfo.name)
    
        send_recovery = False ## will make all acdc
        send_clone = False ## will make a clone
        send_back = False ## should just reject. manual ?
        send_manual = False ## will set in manual

        where_to_run, missing_to_run = wfi.getRecoveryInfo()

        task_to_recover = where_to_run.keys()

        ## is the site at which the recovery could run in drain or out?
        for task in task_to_recover:
            not_ready = set(where_to_run[task]) - set(SI.sites_ready)
            if not_ready:
                print "the following sites are not ready for the ACDC",",".join( sorted(not_ready) )
                ## do we have a way of telling if a site is going to be out for a long time ?
                # check on priority: high prio, restart
                if wfi.request['RequestPriority'] >= 85000:
                    send_clone = True
                # check on age of the request
                injection_time = time.mktime(time.strptime('.'.join(map(str,wfi.request['RequestDate'])),"%Y.%m.%d.%H.%M.%S")) / (60.*60.*24.)
                now = time.mktime(time.gmtime()) / (60.*60.*24.)
                if float(now - injection_time) < 14.:
                    ## less than 14 days, start over
                    send_clone = True
                else:
                    send_manual = True

        
        if not send_recovery:
            ## check on whether the stats is very low
            pass

        if send_recovery:
            ## make acdc for all tasks
            for task in task_to_recover:
                actions = list(set([case['solution'] for code,case in task_to_recover[task]  ]))
                acdc = singleRecovery(url, task, wfi.request , actions, do = True)
        elif send_clone:
            ## this will get it cloned
            wfo.status = 'assistance-clone'
            session.commit()
        elif send_manual:
            wfo.status = 'assistance-manual'
Ejemplo n.º 53
0
def collector(url, specific, options):
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return 

    SI = siteInfo()
    dss = DSS()
    #NL = newLockInfo()
    mcm = McMClient(dev=False)
    fetch_in_campaigns = ['RunIISummer15GS']
    mcm_statuses=['new']#,'validation','defined','approved']

    will_be_used = defaultdict(list)
    secondary_used = defaultdict(list)
    for campaign,status in itertools.product( fetch_in_campaigns, mcm_statuses):
        queries=[]
        if campaign:
            print "getting for",campaign
            queries.append('member_of_campaign=%s'%campaign)
        if status:
            print "getting for",status
            queries.append('status=%s'%status)
        rs = mcm.getA('requests', query='&'.join(queries))
        for r in rs:
            #if r['type'] != 'Prod': continue
            dataset = r['input_dataset']
            if dataset:
                #print r['prepid'],dataset
                will_be_used[dataset].append( r )
            pileup = r['pileup_dataset_name']
            if pileup:
                secondary_used[pileup].append( r )

    all_transfers = defaultdict(list)
    print len(will_be_used),"datasets that can be pre-fetched"
    ## for secondary we really need to have the campaign right
    print len(secondary_used),"pileup will be used"

    datasets = will_be_used.keys()
    if options.limit:
        print "Restricting to randomly picked",options.limit
        random.shuffle( datasets )
        datasets = datasets[:options.limit]
    
    for dataset in datasets:
        print "\tlooking at",dataset
        #presence = getDatasetPresence(url, dataset)#, within_sites=['T2_CH_CERN'])
        ## lock all those, and pre-fetch them
        #NL.lock( dataset )
        ## we could get the reqmgr dictionary from McM if it were implemented, and use the standard workflowInfo !!!
        for request in will_be_used[dataset]:
            print "will be used by",request['prepid']
            campaign = request['member_of_campaign']
            ## based on the campaign, pre-fetch a site list
            sites_allowed = SI.sites_T1s + SI.sites_with_goodIO
            if options.spread:
                ## pick up the number of copies from campaign
                copies_needed = 1 ## hard coded for now
            else:
                copies_needed = 1 ## hard coded for now        

            print "Will look for",copies_needed,"of",dataset
            ## figure out where it is and going
            destinations, all_block_names = getDatasetDestinations(url, dataset, within_sites = [SI.CE_to_SE(site) for site in sites_allowed])
            print json.dumps( destinations, indent=2)
            prim_location = [site for (site,info) in destinations.items() if info['completion']==100 and info['data_fraction']==1]
            prim_destination = [site for site in destinations.keys() if not site in prim_location]
            prim_destination = [site for site in prim_destination if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
            copies_needed = max(0,copies_needed - len(prim_location))
            copies_being_made = [ sum([info['blocks'].keys().count(block) for site,info in destinations.items() if site in prim_destination]) for block in all_block_names]
            
            prim_to_distribute = [site for site in sites_allowed if not SI.CE_to_SE(site) in prim_location]
            prim_to_distribute = [site for site in prim_to_distribute if not SI.CE_to_SE(site) in prim_destination]
            ## take out the ones that cannot receive transfers
            prim_to_distribute = [site for site in prim_to_distribute if not any([osite.startswith(site) for osite in SI.sites_veto_transfer])]
            copies_needed = max(0,copies_needed - min(copies_being_made))
            spreading = {}
            if copies_needed:
                print "needing",copies_needed 
                chops,sizes = getDatasetChops(dataset, chop_threshold = options.chopsize)
                spreading = distributeToSites( chops, prim_to_distribute, n_copies = copies_needed, weights=SI.cpu_pledges, sizes=sizes)
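                ## expected shape (an assumption): spreading maps site -> items,
                ## e.g. {'T1_US_FNAL': ['/Prim/Proc-v1/AODSIM#block1', ...]} (hypothetical)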
            else:
                print "no copy needed for",dataset
            for (site,items) in spreading.items():
                all_transfers[site].extend( items )
    
    print "accumulated transfers"
    print json.dumps(all_transfers, indent=2)
    if not options.test:
        sendEmail('dataset to be fetched',
                  'the following datasets and location were figured from mcm up-coming requests\n%s'%( json.dumps(all_transfers, indent=2) ),
                  destination=['*****@*****.**'])
    
    ## now collect and make transfer request
    for (site,items_to_transfer) in all_transfers.iteritems():
        print "Directing at",site
        items_to_transfer = list(set(items_to_transfer))

        site_se = SI.CE_to_SE(site)
        blocks = [it for it in items_to_transfer if '#' in it]
        datasets = [it for it in items_to_transfer if not '#' in it]

        print "\t",len(blocks),"blocks"
        ## remove blocks if the full dataset is sent out
        blocks = [block for block in blocks if not block.split('#')[0] in datasets]
        blocks_dataset = list(set([block.split('#')[0] for block in blocks]))
        print "\t",len(blocks),"needed blocks for",blocks_dataset
        print "\t",len(datasets),"datasets"
        print "\t",datasets
        items_to_transfer = blocks + datasets
        total_size = 0
        for dataset in datasets:
            ds_size,_ = dss.get_block_size( dataset )
            total_size += ds_size
        for dataset in blocks_dataset:
            _,bs_size = dss.get_block_size( dataset )
            total_size += sum([ s for b,s in bs_size if b in blocks ])
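        ## illustrative arithmetic (added, hypothetical sizes): a 1200 GB full
        ## dataset plus two needed 150 GB blocks -> total_size = 1500 [GB]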

        print "For a total of",total_size,"[GB]"

        if options.test:
            result= {'phedex':{'request_created' : []}}
        else:
            ##result = makeReplicaRequest(url, site_se, items_to_transfer, 'fetching pre-production', priority='normal', approve=True)
            ## should make sure there is something in it
            pass
Ejemplo n.º 54
0
def assignor(url, specific=None, talk=True, options=None):
    if userLock():
        return
    if duplicateLock():
        return
    if not componentInfo().check():
        return

    UC = unifiedConfiguration()
    CI = campaignInfo()
    SI = siteInfo()
    NLI = newLockInfo()

    n_assigned = 0
    n_stalled = 0

    wfos = []
    if specific or options.early:
        wfos.extend(session.query(Workflow).filter(Workflow.status == "considered").all())
        wfos.extend(session.query(Workflow).filter(Workflow.status == "staging").all())
    if specific:
        wfos.extend(session.query(Workflow).filter(Workflow.status == "considered-tried").all())
    wfos.extend(session.query(Workflow).filter(Workflow.status == "staged").all())

    dataset_endpoints = json.loads(open("%s/dataset_endpoints.json" % monitor_dir).read())

    max_per_round = UC.get("max_per_round").get("assignor", None)
    max_cpuh_block = UC.get("max_cpuh_block")
    random.shuffle(wfos)
    for wfo in wfos:
        if options.limit and (n_stalled + n_assigned) > options.limit:
            break

        if max_per_round and (n_stalled + n_assigned) > max_per_round:
            break

        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(","))):
                continue
            # if not specific in wfo.name: continue
        print "\n\n"
        wfh = workflowInfo(url, wfo.name)
        wfh.sendLog("assignor", "%s to be assigned" % wfo.name)

        ## the site whitelist takes into account siteInfo, campaignInfo, memory and cores
        (lheinput, primary, parent, secondary, sites_allowed) = wfh.getSiteWhiteList()

        ## check if by configuration we gave it a GO
        no_go = False
        if not wfh.go(log=True) and not options.go:
            n_stalled += 1
            no_go = True

        allowed_secondary = set()
        for campaign in wfh.getCampaigns():
            if campaign in CI.campaigns and "secondaries" in CI.campaigns[campaign]:
                allowed_secondary.update(CI.campaigns[campaign]["secondaries"])
        if (secondary and allowed_secondary) and (set(secondary) & allowed_secondary != set(secondary)):
            wfh.sendLog("assignor", "%s is not an allowed secondary" % (", ".join(set(secondary) - allowed_secondary)))
            # sendEmail('secondary not allowed','%s is not an allowed secondary'%( ', '.join(set(secondary)-allowed_secondary)))
            sendLog(
                "assignor",
                "%s is not an allowed secondary" % (", ".join(set(secondary) - allowed_secondary)),
                level="critical",
            )
            if not options.go:
                n_stalled += 1
                no_go = True

        if no_go:
            continue

        ## check on current status for by-passed assignment
        if wfh.request["RequestStatus"] != "assignment-approved":
            if not options.test:
                wfh.sendLog("assignor", "setting %s away and skipping" % wfo.name)
                ## the module picking up from away will do what is necessary of it
                wfo.wm_status = wfh.request["RequestStatus"]
                wfo.status = "away"
                session.commit()
                continue
            else:
                print wfo.name, wfh.request["RequestStatus"]

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                wfh.sendLog("assignor", "cannot decide on version number")
                n_stalled += 1
                wfo.status = "trouble"
                session.commit()
                continue

        original_sites_allowed = copy.deepcopy(sites_allowed)
        wfh.sendLog("assignor", "Site white list %s" % sorted(sites_allowed))
        override_sec_location = CI.get(wfh.request["Campaign"], "SecondaryLocation", [])

        blocks = []
        if "BlockWhitelist" in wfh.request:
            blocks = wfh.request["BlockWhitelist"]
        if "RunWhitelist" in wfh.request and wfh.request["RunWhitelist"]:
            ## augment with run white list
            for dataset in primary:
                blocks = list(set(blocks + getDatasetBlocks(dataset, runs=wfh.request["RunWhitelist"])))

        wfh.sendLog("assignor", "Allowed %s" % sorted(sites_allowed))
        secondary_locations = None

        primary_aaa = options.primary_aaa
        if (
            "Campaign" in wfh.request
            and wfh.request["Campaign"] in CI.campaigns
            and "primary_AAA" in CI.campaigns[wfh.request["Campaign"]]
        ):
            primary_aaa = primary_aaa or CI.campaigns[wfh.request["Campaign"]]["primary_AAA"]
        secondary_aaa = options.secondary_aaa
        if (
            "Campaign" in wfh.request
            and wfh.request["Campaign"] in CI.campaigns
            and "secondary_AAA" in CI.campaigns[wfh.request["Campaign"]]
        ):
            secondary_aaa = secondary_aaa or CI.campaigns[wfh.request["Campaign"]]["secondary_AAA"]

        for sec in list(secondary):
            if override_sec_location:
                print "We don't care where the secondary is"
                print "Cannot pass for now"
                sendEmail("tempting to pass sec location check", "but we cannot yet IMO")
                # pass
            if secondary_aaa:
                # just continue without checking
                continue

            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            one_secondary_locations = [site for (site, (there, frac)) in presence.items() if frac > 98.0]
            # one_secondary_locations = [site for (site,(there,frac)) in presence.items() if there]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            # sites_allowed = [site for site in sites_allowed if any([osite.startswith(site) for osite in one_secondary_locations])]
            sites_allowed = [site for site in sites_allowed if SI.CE_to_SE(site) in one_secondary_locations]
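            ## e.g. (hypothetical) presence = {'T1_US_FNAL_Disk': (True, 100.0),
            ## 'T2_DE_DESY': (True, 42.0)} -> only T1_US_FNAL_Disk passes the 98% cut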

        wfh.sendLog("assignor", "From secondary requirement, now Allowed%s" % sorted(sites_allowed))

        initial_sites_allowed = copy.deepcopy(
            sites_allowed
        )  ## keep track of this, after secondary input location restriction : that's how you want to operate it

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        set_lfn = "/store/mc"  ## by default
        endpoints = set()
        for prim in list(primary):
            if prim in dataset_endpoints:
                print "endpoints from stagor", dataset_endpoints[prim]
                endpoints.update(dataset_endpoints[prim])
            set_lfn = getLFNbase(prim)
            presence = getDatasetPresence(url, prim, only_blocks=blocks)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(
                url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed], only_blocks=blocks
            )
            # sites_all_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,(there,frac)) in presence.items() if there]])]
            # sites_with_data = [site for site in sites_with_data if any([osite.startswith(site) for osite in [psite for (psite,frac) in presence.items() if frac[1]>90.]])]
            sites_all_data = [
                site
                for site in sites_with_data
                if SI.CE_to_SE(site) in [psite for (psite, (there, frac)) in presence.items() if there]
            ]
            sites_with_data = [
                site
                for site in sites_with_data
                if SI.CE_to_SE(site) in [psite for (psite, frac) in presence.items() if frac[1] > 90.0]
            ]
            sites_with_any_data = [site for site in sites_with_any_data if SI.CE_to_SE(site) in presence.keys()]
            wfh.sendLog(
                "assignor",
                "Holding the data but not allowed %s"
                % sorted(
                    list(set([se_site for se_site in presence.keys() if not SI.SE_to_CE(se_site) in sites_allowed]))
                ),
            )
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        down_time = False
        ## opportunistic running where any piece of data is available
        if secondary_locations or primary_locations:
            ## intersection of both any pieces of the primary and good IO
            # opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            if secondary_locations and primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site)
                    for site in list(
                        (set(secondary_locations) & set(primary_locations))
                        - set([SI.CE_to_SE(site) for site in sites_allowed])
                    )
                ]
            elif primary_locations:
                opportunistic_sites = [
                    SI.SE_to_CE(site)
                    for site in list(set(primary_locations) - set([SI.CE_to_SE(site) for site in sites_allowed]))
                ]
            else:
                opportunistic_sites = []
            wfh.sendLog("assignor", "We could be running in addition at %s" % sorted(opportunistic_sites))
            if any([osite in SI.sites_not_ready for osite in opportunistic_sites]):
                wfh.sendLog(
                    "assignor",
                    "One of the usable site is in downtime %s"
                    % ([osite in SI.sites_not_ready for osite in opportunistic_sites]),
                )
                down_time = True
                ## should this be send back to considered ?

        ## should be 2 but for the time-being let's lower it to get things going
        copies_wanted, cpuh = wfh.getNCopies()
        wfh.sendLog("assignor", "we need %s CPUh" % cpuh)
        if cpuh > max_cpuh_block and not options.go:
            # sendEmail('large workflow','that wf %s has a large number of CPUh %s, not assigning, please check the logs'%(wfo.name, cpuh))#,destination=['*****@*****.**'])
            sendLog(
                "assignor",
                "%s requires a large numbr of CPUh %s , not assigning, please check with requester" % (wfo.name, cpuh),
                level="critical",
            )
            wfh.sendLog("assignor", "Requiring a large number of CPUh %s, not assigning" % cpuh)
            continue

        if (
            "Campaign" in wfh.request
            and wfh.request["Campaign"] in CI.campaigns
            and "maxcopies" in CI.campaigns[wfh.request["Campaign"]]
        ):
            copies_needed_from_campaign = CI.campaigns[wfh.request["Campaign"]]["maxcopies"]
            copies_wanted = min(copies_needed_from_campaign, copies_wanted)

        if not options.early:
            less_copies_than_requested = UC.get("less_copies_than_requested")
            copies_wanted = max(1, copies_wanted - less_copies_than_requested)  # take one out for the efficiency
        else:
            ## find out whether there is a site in the whitelist, that is lacking jobs and reduce to 1 copy needed to get things going
            pass

        wfh.sendLog("assignor", "needed availability fraction %s" % copies_wanted)

        ## should also check on number of sources, if large enough, we should be able to overflow most, efficiently

        if available_fractions and not all([available >= copies_wanted for available in available_fractions.values()]):
            not_even_once = not all([available >= 1.0 for available in available_fractions.values()])
            wfh.sendLog(
                "assignor",
                "The input dataset is not available %s times, only %s" % (copies_wanted, available_fractions.values()),
            )
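            ## e.g. (hypothetical) available_fractions = {'/A/B/C': 0.8} with
            ## copies_wanted = 1 -> this branch is taken and not_even_once is True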
            if down_time and not options.go and not options.early:
                wfo.status = "considered"
                session.commit()
                wfh.sendLog("assignor", "sending back to considered because of site downtime, instead of waiting")
                # sendEmail( "cannot be assigned due to downtime","%s is not sufficiently available, due to down time of a site in the whitelist. check the assignor logs. sending back to considered."% wfo.name)
                sendLog(
                    "assignor",
                    "%s is not sufficiently available, due to down time of a site in the whitelist. sending back to considered."
                    % (wfo.name),
                    level="delay",
                )
                continue
                # pass

            print json.dumps(available_fractions)
            if (options.go and not_even_once) or not options.go:
                known = []
                try:
                    known = json.loads(open("cannot_assign.json").read())
                except:
                    pass
                if (
                    not wfo.name in known
                    and not options.limit
                    and not options.go
                    and not options.early
                    and not options.partial
                ):
                    wfh.sendLog(
                        "assignor",
                        "cannot be assigned, %s is not sufficiently available.\n %s"
                        % (wfo.name, json.dumps(available_fractions)),
                    )
                    sendEmail(
                        "cannot be assigned",
                        "%s is not sufficiently available.\n %s" % (wfo.name, json.dumps(available_fractions)),
                    )
                    known.append(wfo.name)
                    open("cannot_assign.json", "w").write(json.dumps(known, indent=2))
                n_stalled += 1
                if options.early:
                    if wfo.status == "considered":
                        wfh.sendLog("assignor", "setting considered-tried")
                        wfo.status = "considered-tried"
                        session.commit()
                    else:
                        print "tried but status is", wfo.status
                if options.partial:
                    print "Will move on with partial locations"
                else:
                    continue

        ## default back to white list to original white list with any data
        print "Allowed", sorted(sites_allowed)

        if primary_aaa:
            sites_allowed = initial_sites_allowed
            options.TrustSitelists = True
            wfh.sendLog("assignor", "Selected to read primary through xrootd %s" % sorted(sites_allowed))
        else:
            sites_allowed = sites_with_any_data
            wfh.sendLog("assignor", "Selected for any data %s" % sorted(sites_allowed))

        if secondary_aaa:
            options.TrustPUSitelists = True
            wfh.sendLog("assignor", "Reading secondary through xrootd from %s" % sorted(sites_allowed))

        ### check on endpoints for on-going transfers
        if endpoints and options.partial:
            sites_allowed = list(set(sites_allowed + [SI.SE_to_CE(s) for s in endpoints]))
            print "with added endpoints", sorted(sites_allowed)

        if not len(sites_allowed):
            wfh.sendLog("assignor", "cannot be assign with no matched sites")
            sendLog("assignor", "%s has no whitelist" % wfo.name, level="critical")
            n_stalled += 1
            continue

        t1_only = [ce for ce in sites_allowed if ce.startswith("T1")]
        if t1_only:
            # try to pick from T1 only first
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in t1_only])]
        else:
            # then pick any otherwise
            sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]

        wfh.sendLog("assignor", "Placing the output on %s" % sites_out)
        parameters = {
            "SiteWhitelist": sites_allowed,
            "NonCustodialSites": sites_out,
            "AutoApproveSubscriptionSites": list(set(sites_out)),
            "AcquisitionEra": wfh.acquisitionEra(),
            "ProcessingString": wfh.processingString(),
            "MergedLFNBase": set_lfn,
            "ProcessingVersion": version,
        }

        ## plain assignment here
        team = "production"
        if os.getenv("UNIFIED_TEAM"):
            team = os.getenv("UNIFIED_TEAM")
        if options and options.team:
            team = options.team

        if False and "T2_CH_CERN" in parameters["SiteWhitelist"]:
            ## add some check on
            ### the amount pending to HLT
            ### the size of the request
            ### the priority of the request (maybe not if we decide to overflow during runs)
            parameters["SiteWhitelist"] = ["T2_CH_CERN_HLT"]
            team = "hlt"
            ## reduce the splitting by factor of 4, regardless of type of splitting
            sendEmail("sending work to HLT", "%s was assigned to HLT" % wfo.name)

        ## parse options entered on the command line, if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v = getattr(options, key)
                if v != None:
                    if type(v) == str and "," in v:
                        parameters[key] = filter(None, v.split(","))
                    else:
                        parameters[key] = v
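        ## e.g. (hypothetical invocation): passing --SiteWhitelist=T1_US_FNAL,T2_CH_CERN
        ## on the command line ends up as parameters['SiteWhitelist'] =
        ## ['T1_US_FNAL', 'T2_CH_CERN'], while scalar options pass through unchanged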

        if lheinput:
            ## throttle reading LHE article
            wfh.sendLog("assignor", "Setting the number of events per job to 500k max")
            parameters["EventsPerJob"] = 500000

        ## pick up campaign specific assignment parameters
        parameters.update(CI.parameters(wfh.request["Campaign"]))

        if not options.test:
            parameters["execute"] = True

        split_check = wfh.checkWorkflowSplitting()
        if split_check != True:
            parameters.update(split_check)
            if "EventBased" in split_check.values():
                wfh.sendLog("assignor", "Falling back to event splitting.")
                # sendEmail("Fallback to EventBased","the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"%wfo.name)
                sendLog(
                    "assignor",
                    "the workflow %s is too heavy to be processed as it is. Fallback to EventBased splitting"
                    % wfo.name,
                    level="critical",
                )
            elif "EventsPerJob" in split_check.values():
                wfh.sendLog("assignor", "Modifying the number of job per event")
                # sendEmail("Modifying the job per events","the workflow %s is too heavy in number of jobs explosion"%wfo.name)
                sendLog(
                    "assignor", "the workflow %s is too heavy in number of jobs explosion" % wfo.name, level="critical"
                )
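        ## contract assumed above: checkWorkflowSplitting() returns True when
        ## the splitting is acceptable, otherwise a dict of corrective
        ## assignment parameters whose values may flag 'EventBased' or carry an
        ## 'EventsPerJob' setting; that dict is merged straight into parameters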

        # Handle run-dependent MC
        pstring = wfh.processingString()
        if "PU_RD" in pstring:
            numEvents = wfh.getRequestNumEvents()
            eventsPerLumi = [getDatasetEventsPerLumi(prim) for prim in primary]
            eventsPerLumi = sum(eventsPerLumi) / float(len(eventsPerLumi))
            reqJobs = 500
            if "PU_RD2" in pstring:
                reqJobs = 2000
                eventsPerJob = int(numEvents / (reqJobs * 1.4))
                lumisPerJob = int(eventsPerJob / eventsPerLumi)
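                ## worked example with hypothetical numbers: 100M requested
                ## events and ~2000 target jobs give eventsPerJob =
                ## 1e8 / (2000 * 1.4) ~ 35714; at ~300 events per lumi that is
                ## lumisPerJob ~ 119, so the non-zero branch below is taken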
                if lumisPerJob == 0:
                    # sendEmail("issue with event splitting for run-dependent MC","%s needs to be split by event with %s per job"%(wfo.name, eventsPerJob))
                    sendLog(
                        "assignor",
                        "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob),
                        level="critical",
                    )
                    wfh.sendLog("assignor", "%s needs to be split by event with %s per job" % (wfo.name, eventsPerJob))
                    parameters["EventsPerJob"] = eventsPerJob
                else:
                    spl = wfh.getSplittings()[0]
                    eventsPerJobEstimated = spl["events_per_job"] if "events_per_job" in spl else None
                    eventsPerJobEstimated = spl["avg_events_per_job"] if "avg_events_per_job" in spl else None
                    if eventsPerJobEstimated and eventsPerJobEstimated > eventsPerJob:
                        # sendEmail("setting lumi splitting for run-dependent MC","%s was assigned with %s lumis/job"%( wfo.name, lumisPerJob))
                        sendLog(
                            "assignor", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob), level="critical"
                        )
                        wfh.sendLog("assignor", "%s was assigned with %s lumis/job" % (wfo.name, lumisPerJob))
                        parameters["LumisPerJob"] = lumisPerJob
                    else:
                        # sendEmail("leaving splitting untouched for PU_RD*","please check on "+wfo.name)
                        sendLog(
                            "assignor",
                            "leaving splitting untouched for %s, please check on %s" % (pstring, wfo.name),
                            level="critical",
                        )
                        wfh.sendLog("assignor", "leaving splitting untouched for PU_RD*, please check.")

        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)

        # set status
        if not options.test:
            if result:
                wfo.status = "away"
                session.commit()
                n_assigned += 1
                wfh.sendLog("assignor", "Properly assigned\n%s" % (json.dumps(parameters, indent=2)))
                try:
                    ## refetch information and lock output
                    new_wfi = workflowInfo(url, wfo.name)
                    (_, prim, _, sec) = new_wfi.getIO()
                    for secure in list(prim) + list(sec) + new_wfi.request["OutputDatasets"]:
                        ## lock all outputs flat
                        NLI.lock(secure)
                    # for site in [SI.CE_to_SE(site) for site in sites_allowed]:
                    #    for output in new_wfi.request['OutputDatasets']:
                    #        LI.lock( output, site, 'dataset in production')
                    #    for primary in prim:
                    #        LI.lock( primary, site, 'dataset used in input')
                    #    for secondary in sec:
                    #        LI.lock( secondary, site, 'required for mixing' )

                except Exception as e:
                    print "fail in locking output"

                    print str(e)
                    sendEmail("failed locking of output", str(e))

            else:
                print "ERROR could not assign", wfo.name
        else:
            pass
    print "Assignment summary:"
    sendLog("assignor", "Assigned %d Stalled %s" % (n_assigned, n_stalled))
Ejemplo n.º 55
0
def assignor(url, specific=None, talk=True, options=None):
    if userLock('assignor'): return

    CI = campaignInfo()
    SI = siteInfo()

    wfos = []
    if specific:
        wfos = session.query(Workflow).filter(Workflow.name == specific).all()
    if not wfos:
        if specific:
            wfos = session.query(Workflow).filter(
                Workflow.status == 'considered').all()
            wfos.extend(
                session.query(Workflow).filter(
                    Workflow.status == 'staging').all())
        wfos.extend(
            session.query(Workflow).filter(Workflow.status == 'staged').all())

    for wfo in wfos:
        if specific:
            if not any(map(lambda sp: sp in wfo.name, specific.split(','))):
                continue
            #if not specific in wfo.name: continue
        print wfo.name, "to be assigned"
        wfh = workflowInfo(url, wfo.name)

        ## check if by configuration we gave it a GO
        if not CI.go(wfh.request['Campaign']) and not options.go:
            print "No go for", wfh.request['Campaign']
            continue

        ## check on current status for by-passed assignment
        if wfh.request['RequestStatus'] != 'assignment-approved':
            print wfo.name, wfh.request['RequestStatus'], "skipping"
            if not options.test:
                continue

        ## retrieve from the schema, dbs and reqMgr what should be the next version
        version = wfh.getNextVersion()
        if not version:
            if options and options.ProcessingVersion:
                version = options.ProcessingVersion
            else:
                print "cannot decide on version number"
                continue

        (lheinput, primary, parent, secondary) = wfh.getIO()
        sites_allowed = getSiteWhiteList(
            (lheinput, primary, parent, secondary))
        print "Allowed", sites_allowed
        sites_out = [SI.pick_dSE([SI.CE_to_SE(ce) for ce in sites_allowed])]
        sites_custodial = []
        if len(sites_custodial) == 0:
            print "No custodial, it's fine, it's covered in close-out"

        if len(sites_custodial) > 1:
            print "more than one custodial for", wfo.name
            sys.exit(36)

        secondary_locations = None
        for sec in list(secondary):
            presence = getDatasetPresence(url, sec)
            print sec
            print json.dumps(presence, indent=2)
            #one_secondary_locations = [site for (site,(there,frac)) in presence.items() if frac>90.]
            one_secondary_locations = [
                site for (site, (there, frac)) in presence.items() if there
            ]
            if secondary_locations == None:
                secondary_locations = one_secondary_locations
            else:
                secondary_locations = list(
                    set(secondary_locations) & set(one_secondary_locations))
            ## reduce the site white list to site with secondary only
            sites_allowed = [
                site for site in sites_allowed if any([
                    osite.startswith(site) for osite in one_secondary_locations
                ])
            ]

        sites_all_data = copy.deepcopy(sites_allowed)
        sites_with_data = copy.deepcopy(sites_allowed)
        sites_with_any_data = copy.deepcopy(sites_allowed)
        primary_locations = None
        available_fractions = {}
        for prim in list(primary):
            presence = getDatasetPresence(url, prim)
            if talk:
                print prim
                print json.dumps(presence, indent=2)
            available_fractions[prim] = getDatasetBlocksFraction(
                url, prim, sites=[SI.CE_to_SE(site) for site in sites_allowed])
            sites_all_data = [
                site for site in sites_with_data if any([
                    osite.startswith(site) for osite in [
                        psite for (psite, (there, frac)) in presence.items()
                        if there
                    ]
                ])
            ]
            sites_with_data = [
                site for site in sites_with_data if any([
                    osite.startswith(site) for osite in [
                        psite for (psite, frac) in presence.items()
                        if frac[1] > 90.
                    ]
                ])
            ]
            sites_with_any_data = [
                site for site in sites_with_any_data
                if any([osite.startswith(site) for osite in presence.keys()])
            ]
            if primary_locations == None:
                primary_locations = presence.keys()
            else:
                primary_locations = list(
                    set(primary_locations) & set(presence.keys()))

        sites_with_data = list(set(sites_with_data))
        sites_with_any_data = list(set(sites_with_any_data))

        opportunistic_sites = []
        ## opportunistic running where any piece of data is available
        if secondary_locations and primary_locations:
            ## intersection of both any pieces of the primary and good IO
            #opportunistic_sites = [SI.SE_to_CE(site) for site in list((set(secondary_locations) & set(primary_locations) & set(SI.sites_with_goodIO)) - set(sites_allowed))]
            opportunistic_sites = [
                SI.SE_to_CE(site) for site in list((set(secondary_locations)
                                                    & set(primary_locations)) -
                                                   set(sites_allowed))
            ]
            print "We could be running at", opportunistic_sites, "in addition"

        if available_fractions and not all(
            [available >= 1. for available in available_fractions.values()]):
            print "The input dataset is not located in full at any site"
            print json.dumps(available_fractions)
            if not options.test and not options.go: continue  ## skip skip skip
        copies_wanted = 2.
        if available_fractions and not all([
                available >= copies_wanted
                for available in available_fractions.values()
        ]):
            print "The input dataset is not available", copies_wanted, "times, only", available_fractions.values(
            )
            if not options.go:
                continue
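        ## available_fractions[prim] is assumed to be the block-level
        ## availability summed over the white-listed SEs, so a value of 2.0
        ## means two complete copies; copies_wanted = 2. therefore demands
        ## full redundancy before assignment proceeds without --go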

        ## default the white list back to the original white list with any data
        print "Allowed", sites_allowed
        sites_allowed = sites_with_any_data
        print "Selected for any data", sites_allowed

        if options.restrict:
            print "Allowed", sites_allowed
            sites_allowed = sites_with_any_data
            print "Selected", sites_allowed
        else:
            if set(sites_with_data) != set(sites_allowed):
                ## the data is not everywhere we wanted to run at : enable aaa
                print "Sites with 90% data not matching site white list (block choping!)"
                print "Resorting to AAA reading for", list(
                    set(sites_allowed) - set(sites_with_data)), "?"
                print "Whitelist site with any data", list(
                    set(sites_allowed) - set(sites_with_any_data))
                #options.useSiteListAsLocation = True
                #print "Not commissioned yet"
                #continue
            #print "We could be running at",opportunistic_sites,"in addition"
            ##sites_allowed = list(set(sites_allowed+ opportunistic_sites))

        if not len(sites_allowed):
            print wfo.name, "cannot be assign with no matched sites"
            continue

        parameters = {
            'SiteWhitelist': sites_allowed,
            'CustodialSites': sites_custodial,
            'NonCustodialSites': sites_out,
            'AutoApproveSubscriptionSites': list(set(sites_out)),
            'AcquisitionEra': wfh.acquisitionEra(),
            'ProcessingString': wfh.processingString(),
            'MergedLFNBase': '/store/mc',  ## hard-coded placeholder, to be figured out
            'ProcessingVersion': version,
        }

        ## parse options entered on the command line, if any
        if options:
            for key in reqMgrClient.assignWorkflow.keys:
                v = getattr(options, key)
                if v != None:
                    ## guard against non-string option values before testing for commas
                    if type(v) == str and ',' in v:
                        parameters[key] = filter(None, v.split(','))
                    else:
                        parameters[key] = v

        ## pick up campaign specific assignment parameters
        parameters.update(CI.parameters(wfh.request['Campaign']))

        if not options.test:
            parameters['execute'] = True

        if not wfh.checkWorkflowSplitting():
            ## needs to go to event based ? fail for now
            print "Falling back to event splitting ?"
            #parameters['SplittingAlgorithm'] = 'EventBased'
            continue

        ## plain assignment here
        team = 'production'
        if options and options.team:
            team = options.team
        result = reqMgrClient.assignWorkflow(url, wfo.name, team, parameters)

        # set status
        if not options.test:
            if result:
                wfo.status = 'away'
                session.commit()
            else:
                print "ERROR could not assign", wfo.name
        else:
            pass
Ejemplo n.º 56
0
def main():
    url = 'cmsweb.cern.ch'
    url_tb = 'cmsweb-testbed.cern.ch'
    
    # Example: python assign.py -w amaltaro_RVZTT_120404_163607_6269
    # -t testbed-relval -s T1_US_FNAL -e CMSSW_6_0_0_pre1_FS_TEST_WMA -p v1 -a
    # relval -l /store/backfill/1
    usage = "usage: %prog [options] [WORKFLOW]"
    
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-t', '--team', help='Type of Requests', dest='team', default='production')
    parser.add_option('-s', '--sites', help=' "t1" for Tier-1\'s and "t2" for Tier-2\'s', dest='sites')
    parser.add_option('--special',  help='Use it for special workflows. You also have to change the code according to the type of WF', dest='special')
    parser.add_option('-r', '--replica', action='store_true', dest='replica', default=False, help='Adds a _Disk Non-Custodial Replica parameter')
    parser.add_option('-p', '--procversion', help='Processing Version, if empty it will leave the processing version that comes by default in the request', dest='procversion')
    parser.add_option('-a', '--activity', help='Dashboard Activity (reprocessing, production or test), if empty will set reprocessing as default', dest='activity')
    parser.add_option( '--xrootd', help='Assign with TrustSitelists=True (allows xrootd capabilities)',
                      action='store_true', dest='xrootd')
    parser.add_option('--no_xrootd', help='Assign with TrustSitelists=False',
                      action='store_false', dest='xrootd')
    parser.add_option('--secondary_xrootd', help='Assign with TrustPUSitelists=True (allows xrootd capabilities)',
                      action='store_true', dest='secondary_xrootd')
    parser.add_option('--no_secondary_xrootd', help='Assign with TrustPUSitelists=False',
                      action='store_false', dest='secondary_xrootd')
    parser.add_option('-l', '--lfn', help='Merged LFN base', dest='lfn')
    parser.add_option('-v', '--verbose', help='Verbose', action='store_true', default=False, dest='verbose')
    parser.add_option('--testbed', help='Assign in testbed', action='store_true', default=False, dest='testbed')
    parser.add_option('--test', action="store_true",help='Nothing is injected, only print information about workflow and Era', dest='test')
    parser.add_option('-f', '--file', help='Text file with a list of workflows. If this option is used, the same settings will be applied to all workflows', dest='file')
    parser.add_option('-w', '--workflow', help='Workflow Name, or comma separated list', dest='workflow')
    parser.add_option('-m', '--memory', help='Set the Memory parameter to the workflow', dest='memory', default=None)
    parser.add_option('--lumisperjob',help='Set the number of lumis per job', default=None, type=int)
    parser.add_option('--maxmergeevents',help='Set the number of event to merge at max', default=None, type=int)
    parser.add_option('-c', '--multicore', help='Set the multicore parameter to the workflow', dest='multicore', default=None)
    parser.add_option('-e', '--era', help='Acquisition era', dest='era')
    parser.add_option("--procstr", dest="procstring", help="Overrides Processing String with a single string")
    parser.add_option('--checksite', default=False,action='store_true')
    (options, args) = parser.parse_args()
    
    if options.testbed:
        url = url_tb

    # parse input workflows and files. If both -w and -f options are used, then only the -w inputs are considered.
    if not options.workflow:
        if args:
            wfs = args
        elif options.file:
            wfs = [l.strip() for l in open(options.file) if l.strip()]
        else:
            parser.error("Input a workflow name or a file to read them")
            sys.exit(0)
    else:
        wfs = options.workflow.split(',')

    #Default values
    era = {}
    procversion = 1
    procstring = {}
    memory = None
    multicore = None
    replica = False
    sites = []
    specialStr = ''
    taskchain = False
    xrootd= False
    secondary_xrootd= False

    SI = siteInfo()
    getRandomDiskSite.T1 = SI.sites_T1s
    # Handling the parameters given in the command line
    # parse site list
    if options.sites:
        if options.sites.lower() == "t1":
            sites = SI.sites_T1s
        elif options.sites.lower() == "t2":
            sites = SI.sites_T2s
        elif options.sites.lower() in ["all","t1+t2","t2+t1"] :
            sites = SI.sites_T2s+SI.sites_T1s
        elif options.sites.lower() == "mcore":
            sites = SI.sites_mcore_ready
        elif hasattr(SI,options.sites):
            sites = getattr(SI,options.sites)
        #elif options.sites.lower() == 'acdc':
        #    sites = []
        else: 
            sites = [site for site in options.sites.split(',')]
    else: 
        sites = SI.sites_T1s + SI.sites_T2s

    if options.replica:
        replica = True

    for wfn in wfs:
        # Getting the original dictionary
        wfi = workflowInfo( url, wfn )
        schema = wfi.request
        if 'OriginalRequestName' in schema:
            print "Original workflow is:",schema['OriginalRequestName']
            original_wf = workflowInfo(url, schema['OriginalRequestName'])            
            ancestor_wf = workflowInfo(url, schema['OriginalRequestName'])
            ## go back as up as possible
            while ancestor_wf.request['RequestType'] == 'Resubmission':
                if 'OriginalRequestName' not in ancestor_wf.request:
                    ancestor_wf = None
                    break
                ancestor_wf = workflowInfo(url, ancestor_wf.request['OriginalRequestName'])
        else:
            original_wf = None
            ancestor_wf = None

        is_resubmission = (schema['RequestType'] == 'Resubmission')

        if options.sites and options.sites.lower() == 'original' and original_wf:
            sites = original_wf.request['SiteWhitelist']
            print "Using",sorted(sites),"from the original request",original_wf.request['RequestName']

        #print json.dumps( schema, indent=2 )
        wf_name = wfn
        wf_info = schema

        # WF must be in assignment-approved in order to be assigned
        if (schema["RequestStatus"] != "assignment-approved") and not options.test:
            print("The workflow '" + wf_name + "' you are trying to assign is not in assignment-approved")
            sys.exit(1)

        #Check to see if the workflow is a task chain or an ACDC of a taskchain
        taskchain = (schema["RequestType"] == "TaskChain") or (ancestor_wf and ancestor_wf.request["RequestType"] == "TaskChain")

        # Override if a value is given using the procstring command
        if options.procstring:
            procstring = options.procstring
        elif is_resubmission:
            procstring = ancestor_wf.processingString()
        else:
            procstring = wfi.processingString()

        # Adding the special string - in case it was provided in the command line.
        # This must happen after the base processing string is known; looping
        # before would run over the empty default and drop the special string.
        if options.special:
            specialStr = '_' + str(options.special)
            if type(procstring) == dict:
                for key, value in procstring.items():
                    procstring[key] = value + specialStr
            else:
                procstring = str(procstring) + specialStr

        if options.era:
            era = options.era
        elif is_resubmission:
            era = ancestor_wf.acquisitionEra()
        else:
            era = wfi.acquisitionEra()
        #Dealing with era and proc string
        if (not era or not procstring) or (taskchain and (type(era)!=dict or type(procstring)!=dict)):
            print "We do not have a valid AcquisitionEra and ProcessingString"
            sys.exit(1)

        # Must use --lfn option, otherwise workflow won't be assigned
        if options.lfn:
            lfn = options.lfn
        elif "MergedLFNBase" in wf_info:
            lfn = wf_info['MergedLFNBase']
        elif ancestor_wf and "MergedLFNBase" in ancestor_wf.request:
            lfn = ancestor_wf.request['MergedLFNBase']
        else:
            print "Can't assign the workflow! Please include workflow lfn using --lfn option."
            sys.exit(0)
        # activity production by default for taskchains, reprocessing for default by workflows
        if options.activity:
            activity = options.activity
        elif taskchain:
            activity = 'production'
        else:
            activity = 'reprocessing'

        if options.memory:
            memory = options.memory

        if options.multicore:
            multicore = options.multicore

        # given or default processing version
        if options.procversion:
            procversion = int(options.procversion)
        else:
            if is_resubmission:
                procversion = ancestor_wf.request['ProcessingVersion']
            else:
                procversion = wf_info["ProcessingVersion"]

        # reading xrootd and secondary_xrootd values
        if options.xrootd is not None:
            xrootd = options.xrootd
        elif original_wf:
            xrootd= original_wf.request["TrustSitelists"]

        if options.secondary_xrootd is not None:
            secondary_xrootd = options.secondary_xrootd
        elif original_wf:
            secondary_xrootd= original_wf.request["TrustPUSitelists"]

        # Check for output dataset existence, and abort if output datasets already exist!
        # Don't perform this check for ACDC's
        datasets = schema["OutputDatasets"]
        i = 0
        if not is_resubmission:
            exist = False
            maxv = 1
            for key, value in schema.items():
                if type(value) is dict and key.startswith("Task"):
                    dbsapi = DbsApi(url=dbs3_url)
                    
                    # list all datasets with same name but different version
                    # numbers
                    datasets = dbsapi.listDatasets(acquisition_era_name=value['AcquisitionEra'], primary_ds_name=value['PrimaryDataset'], detail=True, dataset_access_type='*')
                    processedName = value['AcquisitionEra'] + '-' + value['ProcessingString'] + "-v\\d+"
                    # see if any of the dataset names is a match
                    for ds in datasets:
                        if re.match(processedName, ds['processed_ds_name']):
                            print "Existing dset:", ds['dataset'], "(%s)" % ds['dataset_access_type']
                            maxv = max(maxv, ds['processing_version'])
                            exist = True
                    i += 1
            # suggest max version
            if exist and procversion <= maxv:
                print "Some output datasets exist, its advised to assign with v ==", maxv + 1
                sys.exit(0)
        else:
            ## this is a resubmission !
            print "The taks in resubmission is:",schema['InitialTaskPath']
            ## pick up the sites from acdc
            if options.sites and options.sites.lower() == 'acdc':
                where_to_run, _,_ =  original_wf.getRecoveryInfo()
                task = schema['InitialTaskPath']
                sites = list(set([SI.SE_to_CE(site) for site in where_to_run[task]]) & set(SI.all_sites))
                print "Found",sorted(sites),"as sites where to run the ACDC at, from the acdc doc of ",original_wf.request['RequestName']

        if options.checksite:
            ## check that the sites are all compatible and up
            check_mem = schema['Memory'] if not memory else memory
            ncores = wfi.getMulticore() if not multicore else multicore
            memory_allowed = SI.sitesByMemory( float(check_mem), maxCore=ncores)
            not_ready = sorted(set(sites) & set(SI.sites_not_ready))
            not_existing = sorted(set(sites) - set(SI.all_sites))
            not_matching = sorted((set(sites) - set(memory_allowed) - set(not_ready) - set(not_existing)))
            previously_used = []
            if schema['SiteWhitelist']: previously_used = schema['SiteWhitelist']
            if original_wf: previously_used = original_wf.request['SiteWhitelist']
            if previously_used: not_matching = sorted(set(not_matching) & set(previously_used))
            
            sites = sorted( set(sites) - set(not_matching) - set(not_existing))
            
            print sorted(memory_allowed),"to allow",check_mem,ncores
            if not_ready:
                print not_ready,"is/are not ready"
                sys.exit(0)
            if not_matching:
                print "The memory requirement",check_mem,"is too much for",not_matching
                sys.exit(0)


        ## need to play with memory setting
        if taskchain:
            if memory:
                ## transform into a dictionary
                increase = set_to = None
                tasks,set_to = memory.split(':') if ':' in memory else ("",memory)
                tasks = tasks.split(',') if tasks else []
                if set_to.startswith('+'):
                    increase = int(set_to[1:])
                    ## clear set_to so the per-task loop below takes the
                    ## 'increase' branch instead of assigning the raw string
                    set_to = None
                else:
                    set_to = int(set_to)
                it = 1
                memory_dict = {}
                while True:
                    t = 'Task%d'%it
                    it += 1
                    if t in schema:
                        tname = schema[t]['TaskName']
                        if tasks and not tname in tasks:
                            print tname,"not concerned"
                            memory_dict[tname] = schema[t]['Memory']
                            continue
                        if set_to:
                            memory_dict[tname] = set_to
                        else:
                            memory_dict[tname] = schema[t]['Memory'] + increase
                    else:
                        break
                memory = memory_dict
                print memory_dict
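                ## usage sketch with hypothetical values: "-m 4000" sets every
                ## task to 4000 MB, "-m TaskA,TaskB:+500" raises only TaskA and
                ## TaskB by 500 MB, and tasks not listed keep their schema Memory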
            ## need to play with multicore setting
            if multicore:
                tasks,set_to = multicore.split(':') if ':' in multicore else ("",multicore)
                tasks = tasks.split(',') if tasks else []
                set_to = int(set_to)
                multicore_dict = {}
                timeperevent_dict = {}
                it=1
                while True:
                    t = 'Task%d'%it
                    it += 1
                    if t in schema:
                        tname = schema[t]['TaskName']
                        mcore = schema[t]['Multicore']
                        if tasks and not tname in tasks:
                            print tname,"not concerned"
                            multicore_dict[tname] = schema[t]['Multicore']
                            timeperevent_dict[tname] = schema[t]['TimePerEvent']
                            continue
                        if memory:
                            mem = memory[tname]
                            print mem, memory
                            factor = (set_to / float(mcore))
                            fraction_constant = 0.4
                            mem_per_core_c = int((1-fraction_constant) * mem / float(mcore))
                            print "mem per core", mem_per_core_c
                            print "base mem", mem
                            
                            memory[tname] = mem + (set_to-mcore)*mem_per_core_c
                            print "final mem",memory[tname]
                            timeperevent_dict[tname] = schema[t]['TimePerEvent']/factor
                        print "setting mcore",set_to
                        multicore_dict[tname] = set_to
                    else:
                        break
                multicore = multicore_dict
                print multicore
                print timeperevent_dict,"cannot be used yet."
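                ## worked example with hypothetical numbers: going from 1 to 4
                ## cores with mem = 2000 MB keeps 40% as constant overhead:
                ## mem_per_core_c = int(0.6 * 2000 / 1) = 1200, so memory becomes
                ## 2000 + 3 * 1200 = 5600 MB and TimePerEvent is divided by 4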
        # If the --test argument was provided, then just print the information
        # gathered so far and abort the assignment
        print wf_name
        print "Era:",era
        print "ProcStr:",procstring
        print "ProcVer:",procversion
        print "LFN:",lfn
        print "Team:",options.team
        print "Site:",sites
        print "Taskchain? ", str(taskchain)
        print "Activity:", activity
        print "ACDC:", str(is_resubmission)
        print "Xrootd:", str(xrootd)
        print "Secondary_xrootd:", str(secondary_xrootd)
        #if options.test:            continue
        
        # Really assigning the workflow now
        #print wf_name, '\tEra:', era, '\tProcStr:', procstring, '\tProcVer:', procversion, '\tTeam:', team, '\tSite:', sites
        assignRequest(url, 
                      workflow = wf_name,
                      team = options.team,
                      sites = sites,
                      era = era, 
                      procversion = procversion,
                      activity = activity,
                      lfn = lfn,
                      procstring = procstring, 
                      trust_site = xrootd, 
                      replica = options.replica, 
                      verbose = options.test, 
                      taskchain = taskchain, 
                      trust_secondary_site = secondary_xrootd,
                      memory=memory,
                      multicore=multicore,
                      lumisperjob = options.lumisperjob,
                      maxmergeevents = options.maxmergeevents
                      )
    
    sys.exit(0)
Ejemplo n.º 57
0
def actor(url, options=None):

    mlock = moduleLock(wait=False, silent=True)
    if mlock(): return
    if userLock('actor'): return

    up = componentInfo(soft=['mcm'])
    if not up.check(): return

    # CI = campaignInfo()
    SI = siteInfo()
    UC = unifiedConfiguration()
    WC = wtcClient()
    WI = wtcInfo()
    JC = JIRAClient()

    action_list = WC.get_actions()
    if action_list is None:
        print "Not able to load action list"
        sendLog('actor', 'Not able to load action list', level='critical')
        return

    if options.actions:
        action_list = json.loads(open(options.actions).read())

    print json.dumps(action_list, indent=2)
    if not action_list:
        print "EMPTY!"
        return
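    ## action_list shape assumed from the accesses below (not a verified schema):
    ## { "<workflow>": { "Action": "acdc" | "clone" | "special",
    ##                   "Parameters": { "<task>": {...}, "AllSteps": {...} },
    ##                   "user": "...", "Reason": "..." } }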

    wf_list = action_list.keys()
    print json.dumps(sorted(wf_list), indent=2)
    if options.spec:
        wf_list = [wf for wf in wf_list if options.spec in wf]

    max_per_round = UC.get('max_per_round').get('actor', None)
    if max_per_round:
        random.shuffle(wf_list)
        wf_list = wf_list[:max_per_round]

    for wfname in wf_list:
        print '-' * 100
        print "Looking at", wfname, "for recovery options"

        to_clone = False
        to_acdc = False
        to_force = False
        to_hold = False
        something_to_do = False
        tasks = action_list[wfname].get('Parameters', None)
        to_acdc = action_list[wfname].get('Action', None) == 'acdc'
        to_clone = action_list[wfname].get('Action', None) == 'clone'
        to_force = action_list[wfname].get(
            'Action', None) == 'special' and action_list[wfname].get(
                'Parameters', {}).get('action', None) in ['by-pass', 'bypass']
        to_hold = action_list[wfname].get(
            'Action', None) == 'special' and action_list[wfname].get(
                'Parameters', {}).get('action', None) in ['onhold', 'on-hold']

        if not to_acdc and not to_clone and not to_force and not to_hold:
            sendLog(
                'actor',
                'Action submitted for something other than acdc, clone, bypass or hold for workflow %s'
                % wfname,
                level='critical')
            print json.dumps(action_list[wfname], indent=2)
            continue

        if not tasks and to_acdc:
            sendLog('actor',
                    'Empty action submitted for workflow %s' % wfname,
                    level='critical')
            print "Moving on. Parameters is blank for " + wfname
            continue

        wfi = workflowInfo(url, wfname)

        recover = True
        message_to_ops = ""
        message_to_user = ""

        #===========================================================
        if to_clone and options.do:
            print "Let's try kill and clone: "
            wfi.sendLog('actor', 'Going to clone %s' % wfname)
            comment = ""
            if 'comment' in tasks: comment = ", reason: " + tasks['comment']
            wfi.sendLog(
                'actor',
                "invalidating the workflow by traffic controller %s" % comment)

            #Reject all workflows in the family
            inv_results = invalidate(url,
                                     wfi,
                                     only_resub=False,
                                     with_output=True)
            all_good = all(inv_results)
            if all_good:
                wfi.sendLog('actor', "%s and children are rejected" % wfname)
            else:
                wfi.sendLog('actor',
                            "Failed to reject the request and dependents")
                sendLog('actor',
                        'Failed to reject the family of %s' % wfname,
                        level='critical')
                continue

            cloned = None
            try:
                cloned = singleClone(url, wfname, tasks, comment, options.do)
            except Exception as e:
                sendLog(
                    'actor',
                    'Failed to create clone for %s! Check logs for more information. Action will need to be resubmitted.'
                    % wfname,
                    level='critical')
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                print str(e)
                ##let's not remove the action, otherwise the workflow goes to "trouble" and the WTC cannot set the action again
            if not cloned:
                recover = False
                wfi.sendLog('actor', 'Failed to create clone for %s!' % wfname)
                sendLog('actor',
                        'Failed to create clone for %s!' % wfname,
                        level='critical')

            else:
                wfi.sendLog('actor',
                            "Workflow %s cloned into %s" % (wfname, cloned))
                ## set to trouble for swift replacement
                for wfo in session.query(Workflow).filter(
                        Workflow.name == wfname).all():
                    wfo.status = 'trouble'
                session.commit()
#===========================================================
        elif to_force:
            wfi.sendLog(
                'actor',
                'Force-completing from workflow traffic controller request')
            WI.add(action='force',
                   keyword=wfname,
                   user=action_list[wfname].get('user', 'unified'))
        elif to_hold:
            wfi.sendLog('actor',
                        'Holding on workflow traffic controller request')
            WI.add(action='hold',
                   keyword=wfname,
                   user=action_list[wfname].get('user', 'unified'))
#===========================================================
        elif to_acdc:
            if 'AllSteps' in tasks:
                allTasksDefaults = tasks['AllSteps']
                tasks.pop('AllSteps')
                for setting in allTasksDefaults:
                    for task in tasks:
                        ## tasks[task] is a dict, so there is no append();
                        ## apply the AllSteps default either way
                        tasks[task][setting] = allTasksDefaults[setting]
            print "Tasks is "
            print json.dumps(tasks, indent=2)

            all_tasks = wfi.getAllTasks()

            ## need a way to verify that this is the first round of ACDC, since the second round will have to be on the ACDC themselves

            try:
                WMErr = wfi.getWMErrors()
#               print WMErr
            except:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because WMErr cannot be reached.'
                    % wfname,
                    level='critical')
                continue
            if not WMErr:
                wfi.sendLog('actor', 'WMErrors is blank for %s.' % wfname)
                print "FYI getWMErrors is blank. Presumably there are only unreported errors"
#                continue

            try:
                where_to_run, missing_to_run, missing_to_run_at = wfi.getRecoveryInfo()
                print "Where to run = "
                print where_to_run
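                ## where_to_run is assumed to map a full task path to the list
                ## of storage elements recorded in the ACDC document, e.g.
                ## (hypothetically) {'/<wf>/Task1': ['T2_DE_DESY_SE', ...]}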
                if not where_to_run:
                    sendLog(
                        'actor',
                        'Cannot create ACDCS for %s because recovery info cannot be found.'
                        % wfname,
                        level='critical')
                    continue
            except:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because recovery info cannot be found.'
                    % wfname,
                    level='critical')
                print "Moving on. Cannot access recovery info for " + wfname
                continue
            if not where_to_run:
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because site list cannot be found.'
                    % wfname,
                    level='critical')
                print "Moving on. where to run is blank"
                continue

            message_to_ops = ""
            message_to_user = ""

            num_tasks_to_recover = 0

            if WMErr:
                for task in WMErr:
                    if 'LogCollect' in task: continue
                    if 'Cleanup' in task: continue
                    if not 'jobfailed' in WMErr[task]:
                        continue
                    else:
                        num_tasks_to_recover += 1
#                print "Task to recover: " + task

            if not num_tasks_to_recover:
                print "\tno error for", wfname
#            recover = False

            if 'LheInputFiles' in wfi.request and wfi.request['LheInputFiles']:
                ## we do not try to recover pLHE
                sendLog(
                    'actor',
                    'Cannot create ACDCS for %s because it is a pLHE workflow.'
                    % wfname,
                    level='critical')
                print "We don't try to recover pLHE. Moving on."
                recover = False
        #            sendEmail('cannot submit action', '%s is a pLHE workflow. We do not try to recover pLHE'%wfname)

#        if wfi.request['RequestType'] in ['ReReco']:
#            recover= False
#            print 'cannot submit action. ReReco'
#   sendEmail('cannot submit action', '%s is request type ReReco'%wfname)

            recovering = set()
            for task in tasks:
                assign_to_sites = set()
                print "Task names is " + task
                fulltaskname = '/' + wfname + '/' + task
                print "Full task name is " + fulltaskname
                print where_to_run.keys()
                wrong_task = False
                for task_info in all_tasks:
                    if fulltaskname == task_info.pathName:
                        if task_info.taskType not in [
                                'Processing', 'Production', 'Merge'
                        ]:
                            wrong_task = True
                            wfi.sendLog(
                                'actor',
                                "Skipping task %s because the taskType is %s. Can only ACDC Processing, Production, or Merge tasks"
                                % (fulltaskname, task_info.taskType))
                if not fulltaskname in where_to_run.keys():
                    wrong_task = True
                    wfi.sendLog(
                        'actor',
                        "Skipping task %s because there is no acdc doc for it anyways."
                        % (fulltaskname))
                if wrong_task:
                    continue
                print tasks[task]
                actions = tasks[task]
                for action in actions:
                    if action.startswith('sites'):
                        if type(actions[action]) != list:
                            assign_to_sites = [SI.SE_to_CE(actions[action])]
                        else:
                            assign_to_sites = list(
                                set([
                                    SI.SE_to_CE(site)
                                    for site in actions[action]
                                ]))
#                    if action.startswith('mem') and actions[action] != "" and actions[action] != 'Same' and wfi.request['RequestType'] in ['TaskChain']:
#                        recover = False;
#                        print  "Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname
#                        wfi.sendLog('actor',"Skipping %s for now until Allie fixes memory parameter for TaskChain ACDCs."%wfname)
                if not 'sites' in actions:
                    assign_to_sites = list(
                        set([
                            SI.SE_to_CE(site)
                            for site in where_to_run[fulltaskname]
                        ]))
                    print "Found", sorted(
                        assign_to_sites
                    ), "as sites where to run the ACDC at, from the acdc doc of ", wfname
                print "Going to run at", sorted(assign_to_sites)
                if recover:
                    print "Initiating recovery"
                    acdc = singleRecovery(url,
                                          fulltaskname,
                                          wfi.request,
                                          actions,
                                          do=options.do)
                    if not acdc:
                        if options.do:
                            if recovering:
                                print wfname + " has been partially ACDC'ed. Needs manual attention."
                                sendLog(
                                    'actor',
                                    "%s has had %s/%s recoveries %s only" %
                                    (wfname, len(recovering),
                                     num_tasks_to_recover, list(recovering)),
                                    level='critical')
                                wfi.sendLog(
                                    'actor',
                                    "%s has had %s/%s recoveries %s only" %
                                    (wfname, len(recovering),
                                     num_tasks_to_recover, list(recovering)))
                                break
                            else:
                                print wfname + " failed recovery once"
                                recover = False
                                break
                        else:
                            print "no action to take further"
                            #                        sendLog('recoveror', "ACDC for %s can be done automatically"% wfname, level='critical')
                            continue

                    else:  #ACDC was made correctly. Now we have to assign it.
                        wfi.sendLog(
                            'actor',
                            'ACDC created for task %s. Actions taken \n%s' %
                            (fulltaskname, json.dumps(actions)))
                        jira_comment = "%s created ACDC for task %s with action %s" % (
                            action_list[wfname].get('user', 'unified'),
                            task.split('/')[-1],
                            json.dumps(actions),
                        )
                        reason = action_list[wfname].get('Reason', None)
                        if reason:
                            jira_comment += '\ndue to: %s' % (reason)

                        #team = wfi.request['Teams'][0]
                        team = 'production'
                        parameters = {
                            'SiteWhitelist': sorted(assign_to_sites),
                            'AcquisitionEra': wfi.acquisitionEra(),
                            'ProcessingString': wfi.processingString(),
                            'MergedLFNBase': wfi.request['MergedLFNBase'],
                            'ProcessingVersion':
                            wfi.request['ProcessingVersion'],
                        }
                        ## hackery for ACDC merge assignment
                        if wfi.request['RequestType'] == 'TaskChain' and 'Merge' in task.split('/')[-1]:
                            parameters['AcquisitionEra'] = None
                            parameters['ProcessingString'] = None

                        ## xrootd settings on primary and secondary
                        if 'xrootd' in actions:
                            if actions['xrootd'] == 'enabled':
                                print "Going to assign via xrootd"
                                parameters['TrustSitelists'] = True
                            elif actions['xrootd'] == 'disabled':
                                parameters['TrustSitelists'] = False
                        elif ('TrustSitelists' in wfi.request
                              and wfi.request['TrustSitelists'] == 'true'):
                            parameters['TrustSitelists'] = True
                        else:
                            parameters['TrustSitelists'] = False

                        if 'secondary' in actions:
                            if actions['secondary'] == 'enabled':
                                print 'Enabling reading the secondary input via xrootd'
                                parameters['TrustPUSitelists'] = True
                            elif actions['secondary'] == 'disabled':
                                parameters['TrustPUSitelists'] = False
                            #in case secondary is blank or not set to enabled or disabled
                            elif 'TrustPUSitelists' in wfi.request and wfi.request[
                                    'TrustPUSitelists']:
                                parameters['TrustPUSitelists'] = True
                        elif 'TrustPUSitelists' in wfi.request and wfi.request[
                                'TrustPUSitelists']:
                            parameters['TrustPUSitelists'] = True

                        if options.ass:
                            print "really doing the assignment of the ACDC", acdc
                            parameters['execute'] = True
                            #wfi.sendLog('actor',"%s  was assigned for recovery"% acdc)
                        else:
                            print "no assignment done with this ACDC", acdc
                            sendLog('actor',
                                    "%s needs to be assigned" % (acdc),
                                    level='critical')
                            wfi.sendLog(
                                'actor',
                                "%s needs to be assigned by hand" % (acdc))
                            continue

#                       print parameters
                        result = reqMgrClient.assignWorkflow(
                            url, acdc, team, parameters)
                        if not result:
                            print acdc, "was not assigned"
                            sendLog('actor',
                                    "%s failed to be assigned" % (acdc),
                                    level='critical')
                            wfi.sendLog(
                                'actor',
                                "%s failed to get assigned for recovery" %
                                acdc)
                        else:
                            wfi.sendLog('actor',
                                        "%s was assigned for recovery" % acdc)
                            recovering.add(acdc)

                        #wfi.sendLog('actor',"ACDCs created for %s"%wfname)
                        try:
                            if jira_comment:
                                jiras = JC.find(
                                    {'prepid': wfi.request['PrepID']})
                                if len(jiras) == 1:
                                    ## put a comment on the single corresponding ticket
                                    JC.comment(jiras[0].key, jira_comment)
                                    JC.progress(jiras[0].key)
                        except Exception as e:
                            print "failed with JIRA"
                            print str(e)

        #===========================================================

        if recover and options.do:
            r = WC.remove_action(wfname)
            if not r:
                sendLog(
                    'actor',
                    'not able to remove the action, interlocking the module',
                    level='critical')
                os.system('touch %s/actor.failed-%s.lock' %
                          (base_eos_dir, os.getpid()))
                sys.exit(-1)

        ## update the status with recovering removing manual
        for wfo in session.query(Workflow).filter(
                Workflow.name == wfname).all():
            wfo.status = wfo.status.replace('manual', 'recovering')
        session.commit()

        if message_to_user:
            print wfname, "to be notified to user(DUMMY)", message_to_user

        if message_to_ops:
            print 'message'
            #sendEmail( "notification in recoveror" , message_to_ops, destination=['*****@*****.**'])
        #            sendLog('recoveror',message_to_ops,level='warning')

    return
Ejemplo n.º 58
0
def completor(url, specific):
    up = componentInfo(mcm=False, soft=['mcm'])
    if not up.check(): return

    CI = campaignInfo()
    SI = siteInfo()

    wfs = []
    wfs.extend(session.query(Workflow).filter(Workflow.status == 'away').all())
    wfs.extend(
        session.query(Workflow).filter(
            Workflow.status.startswith('assistance')).all())

    ## just take it in random order so that not always the same is seen
    random.shuffle(wfs)

    ## by workflow a list of fraction / timestamps
    completions = json.loads(open('%s/completions.json' % monitor_dir).read())

    good_fractions = {}
    for c in CI.campaigns:
        if 'force-complete' in CI.campaigns[c]:
            good_fractions[c] = CI.campaigns[c]['force-complete']
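    ## e.g. (hypothetical config): {'force-complete': 0.95} on a campaign lets
    ## its workflows be force-completed once every output reaches 95% completion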

    long_lasting = {}

    overrides = {}
    for rider, email in [('vlimant', '*****@*****.**'),
                         ('jen_a', '*****@*****.**'),
                         ('srimanob', '*****@*****.**')]:
        rider_file = '/afs/cern.ch/user/%s/%s/public/ops/forcecomplete.json' % (
            rider[0], rider)
        if not os.path.isfile(rider_file):
            print "no file", rider_file
            continue
        try:
            overrides[rider] = json.loads(open(rider_file).read())
        except:
            print "cannot get force complete list from", rider
            sendEmail("malformated force complet file",
                      "%s is not json readable" % rider_file,
                      destination=[email])
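    ## each rider file is assumed to hold a plain JSON list of workflow names;
    ## membership is tested against wfo.name further down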

    print "can force complete on"
    print json.dumps(good_fractions, indent=2)
    print json.dumps(overrides, indent=2)
    max_force = 5

    wfs_no_location_in_GQ = set()

    for wfo in wfs:
        if specific and not specific in wfo.name: continue

        print "looking at", wfo.name
        ## get all of the same
        wfi = workflowInfo(url, wfo.name)

        skip = False
        if not any([c in wfo.name for c in good_fractions]): skip = True
        for user, spec in overrides.items():
            #print spec
            if wfo.name in spec and wfi.request['RequestStatus'] != 'force-complete':
                #skip=False ## do not do it automatically yet
                sendEmail(
                    'force-complete requested',
                    '%s is asking for %s to be force complete' %
                    (user, wfo.name))
                wfi = workflowInfo(url, wfo.name)
                forceComplete(url, wfi)
                skip = True
                wfi.notifyRequestor(
                    "The workflow %s was force completed by request of %s" %
                    (wfo.name, user),
                    do_batch=False)
                wfi.sendLog(
                    'completor', '%s is asking for %s to be force complete' %
                    (user, wfo.name))
                break

        if wfo.status.startswith('assistance'): skip = True

        if skip:
            continue

        priority = wfi.request['RequestPriority']

        if not 'Campaign' in wfi.request: continue

        if wfi.request['RequestStatus'] not in ['acquired', 'running-open', 'running-closed']:
            continue

        c = wfi.request['Campaign']
        if c not in good_fractions: continue
        good_fraction = good_fractions[c]
        ignore_fraction = 2.
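        ## outputs at or above this fraction are considered done enough to simply wait for (2. = 200%)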

        lumi_expected = None
        event_expected = None
        if 'TotalInputEvents' not in wfi.request:
            if 'RequestNumEvents' in wfi.request:
                event_expected = wfi.request['RequestNumEvents']
            else:
                print "expected statistics unavailable, cannot do anything"
                continue
        else:
            lumi_expected = wfi.request['TotalInputLumis']
            event_expected = wfi.request['TotalInputEvents']

        now = time.mktime(time.gmtime()) / (60 * 60 * 24.)  ## current time in days since epoch

        ## find when the workflow last entered a running state
        running_log = filter(lambda change: change["Status"] in ["running-open", "running-closed"],
                             wfi.request['RequestTransition'])
        if not running_log:
            print "\tHas no running log"
            ## cannot figure out when the workflow started running
            continue
        then = running_log[-1]['UpdateTime'] / (60. * 60. * 24.)
        delay = now - then  ## in days

        (w, d) = divmod(delay, 7)
        ## indent the printout by one tab per week of running
        print "\t" * int(w) + "Running since", delay, "[days] priority=", priority
        if delay <= 7: continue
        ## beyond one week, track the workflow as long lasting
        long_lasting[wfo.name] = {"delay": delay}

        percent_completions = {}
        for output in wfi.request['OutputDatasets']:
            if "/DQM" in output: continue  ## that does not count
            if output not in completions:
                completions[output] = {
                    'injected': None,
                    'checkpoints': [],
                    'workflow': wfo.name
                }
            ## get completion fraction
            event_count, lumi_count = getDatasetEventsAndLumis(dataset=output)
            lumi_completion = 0.
            event_completion = 0.
            if lumi_expected:
                lumi_completion = lumi_count / float(lumi_expected)
            if event_expected:
                event_completion = event_count / float(event_expected)

            ## take the less optimistic of the fractions whose denominator is known,
            ## so that a missing lumi count does not pin the completion at 0
            available = [f for (f, expected) in [(lumi_completion, lumi_expected),
                                                 (event_completion, event_expected)] if expected]
            percent_completions[output] = min(available) if available else 0.
            completions[output]['checkpoints'].append((now, event_completion))

        if all([percent_completions[out] >= good_fraction for out in percent_completions]):
            print "all is above", good_fraction, "for", wfo.name
            print json.dumps(percent_completions, indent=2)
        else:
            print "\t", percent_completions.values(), "not over bound", good_fraction
            long_lasting[wfo.name].update({
                'completion': sum(percent_completions.values()) / len(percent_completions),
                'completions': percent_completions
            })

            #print json.dumps( percent_completions, indent=2 )

            ## do something about the agents this workflow is in
            long_lasting[wfo.name]['agents'] = wfi.getAgents()
            print json.dumps(long_lasting[wfo.name]['agents'], indent=2)

            ## pick up on possible issue with global queue data location
            locs = wfi.getGQLocations()
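            ## locs: input block -> list of storage elements known to global queue (per the usage below)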
            for b, loc in locs.items():
                if not loc:
                    print b, "has no location for GQ in", wfi.request['RequestName']
                    ## this is severe !
                    wfs_no_location_in_GQ.add(wfo.name)
                ## check the location against the site whitelist
                can_run = set([SI.SE_to_CE(se) for se in loc]) & set(wfi.request['SiteWhitelist'])
                if loc and not can_run:
                    print b, "has no site to run at within the whitelist"
                    wfs_no_location_in_GQ.add(wfo.name)
                can_run = can_run & set(SI.sites_ready)
                if loc and not can_run:
                    print b, "has no available site to run at"
                    wfs_no_location_in_GQ.add(wfo.name)
            continue

        if all([percent_completions[out] >= ignore_fraction for out in percent_completions]):
            print "all is done, just wait a bit"
            continue

        ## record when the workflow started running against each output
        for output in percent_completions:
            completions[output]['injected'] = then

        ## further check on delays
        cpuh = wfi.getComputingTime(unit='d')  ## required computing time, in days

        ran_at = wfi.request['SiteWhitelist']  ## kept for reference; not used below
        print "Required:", cpuh,
        print "Time spent:", delay

        ## only really force-complete after two weeks
        if delay <= 14: continue
        print "going for force-complete of", wfo.name
        ## find ACDCs that might be running
        if max_force > 0:
            forceComplete(url, wfi)
            max_force -= 1
        else:
            print "too many force-completes this round"
        ## do it once only for testing
        #break

    open('%s/completions.json' % monitor_dir, 'w').write(json.dumps(completions, indent=2))
    text = "These have been running for a long time"

    open('%s/longlasting.json' % monitor_dir, 'w').write(json.dumps(long_lasting, indent=2))

    for wf, info in sorted(long_lasting.items(), key=lambda tp: tp[1]['delay'], reverse=True):
        delay = info['delay']
        text += "\n %s : %s days" % (wf, delay)
        if 'completion' in info:
            text += " %d%%" % (info['completion'] * 100)

    if wfs_no_location_in_GQ:
        sendEmail('workflows with no location in GQ',
                  "these won't be able to run anytime soon\n%s" %
                  ('\n'.join(wfs_no_location_in_GQ)))

    #sendEmail("long lasting workflow",text)
    ## you can check the log
    print text
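
For reference, the decision at the heart of completor reduces to a couple of small, dependency-free functions. Below is a minimal sketch under the same conventions (least-optimistic completion fraction, 14-day delay, per-campaign threshold); the function names, the dataset name, and the demo values are illustrative only and are not part of the codebase above.

## Minimal, dependency-free sketch of the decision logic above.
## The names completion_fraction and should_force_complete are illustrative
## and do not exist in the surrounding codebase.

def completion_fraction(lumi_count, event_count, lumi_expected, event_expected):
    ## least optimistic fraction, skipping denominators that are unknown
    fractions = []
    if lumi_expected:
        fractions.append(lumi_count / float(lumi_expected))
    if event_expected:
        fractions.append(event_count / float(event_expected))
    return min(fractions) if fractions else 0.

def should_force_complete(fractions_per_output, good_fraction, delay_days):
    ## force-complete only after two weeks of running, and only once every
    ## output (DQM excluded upstream) has reached the campaign threshold
    if delay_days <= 14 or not fractions_per_output:
        return False
    return all(f >= good_fraction for f in fractions_per_output.values())

if __name__ == '__main__':
    ## hypothetical output dataset at 90% of expected lumis and events
    fracs = {'/Sample/Output/AODSIM': completion_fraction(900, 90000, 1000, 100000)}
    print fracs
    print should_force_complete(fracs, good_fraction=0.85, delay_days=20)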