def compare(fn1, fn2):
    pair_name = '%s->%s' % (fn1.replace('.gz',''), fn2.replace('.gz',''))
    process1, names1, ver1 = parse(fn1)
    process2, names2, ver2 = parse(fn2)

    if ver1 != ver2:
        print colors.yellow('%s changed versions %s to %s' % (pair_name, ver1, ver2))

    commn = [n for n in names1 if n in names2]
    commn_diff = []

    for x in commn:
        o1, o2 = [getattr(p, x).dumpPython() for p in (process1, process2)]
        if o1 != o2:
            if not commn_diff:
                print colors.yellow(pair_name + ' changed these:\n')
            print colors.yellow(x)
            print 'process1.%s =' % x, o1
            print 'process2.%s =' % x, o2
            commn_diff.append(x)

    added = [n for n in names2 if n not in names1]
    deled = [n for n in names1 if n not in names2]
  
    if added:
        print colors.yellow('%s added these: %s\n' % (pair_name, ' '.join(added)))
        for x in added:
            print 'process2.%s =' % x, getattr(process2, x).dumpPython()
    if deled:
        print colors.yellow('%s deleted these: %s\n' % (pair_name, ' '.join(deled)))
        for x in deled:
            print 'process1.%s =' % x, getattr(process1, x).dumpPython()

    return commn_diff or added or deled
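
The diff logic above reduces to three list comprehensions over the attribute names of the two parsed processes. A minimal sketch of that partitioning with plain lists, no cmsRun process objects involved:

names1 = ['a', 'b', 'c']
names2 = ['b', 'c', 'd']
common = [n for n in names1 if n in names2]       # ['b', 'c']
added = [n for n in names2 if n not in names1]    # ['d']
deled = [n for n in names1 if n not in names2]    # ['a']
print common, added, deled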
Example #2
def crab_get_njobs_from_log(working_dir,
                            jobs_re=re.compile(r'\([\d ]+/([\d ]+)\)')):
    # find njobs using a line printed as result of crab status that looks like ( 76/788)
    njobs = []
    for line in crab_log_open(working_dir):
        mo = jobs_re.search(line)
        if mo:
            njobs.append(int(mo.group(1)))
    if not njobs:
        raise ValueError('problem parsing crab.log in wd=%s for njobs' %
                         working_dir)
    if crab_global_options.support_automatic_splitting:
        # njobs may only increase at later parts of the log
        # this should handle how crab automatic splitting resubmission jobs work
        for a, b in zip(njobs, njobs[1:]):
            if a > b:
                print colors.red('crab.log wd=%s has decreasing njobs: %r' %
                                 (working_dir, njobs))
        if len(set(njobs)) != 1:
            print colors.yellow(
                'crab_get_njobs_from_log for %s found more than one value: %r\n\tThis may have happened because of Automatic splitting. Support is still experimental, scrutinize the output well.'
                % (working_dir, sorted(set(njobs))))
    elif len(set(njobs)) != 1:
        raise ValueError('problem parsing crab.log in wd=%s for njobs: %r' %
                         (working_dir, njobs))
    return njobs[-1]
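
The default jobs_re keys on the "( done/total)" counter that crab status prints. A quick self-contained check of the pattern against a made-up status line (the line text is illustrative, not verbatim crab output):

import re

jobs_re = re.compile(r'\([\d ]+/([\d ]+)\)')
line = 'Jobs status:    failed       ( 76/788)'
mo = jobs_re.search(line)
if mo:
    print int(mo.group(1))  # 788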
def cs_report(wd, partial=False):
    njobs = cs_njobs(wd)
    lls = []

    for i in xrange(njobs):
        fjr_fn = os.path.join(wd, 'fjr_%i.xml' % i)
        if os.path.isfile(fjr_fn):
            lls.append((i, fjr2ll(fjr_fn)))
        elif partial:
            print colors.yellow('missing fjr %s but partial allowed' % fjr_fn)
        else:
            raise IOError('missing fjr %s' % fjr_fn)

    for (ia, lla), (ib, llb) in combinations(lls, 2):
        if lla & llb:
            problem = 'problem with fjrs for %s: overlap found in pair %i + %i\n' % (
                wd, ia, ib)
            problem += repr((ia, lla)) + '\n'
            problem += repr((ib, llb)) + '\n'
            problem += 'and ' + repr(lla & llb) + '\n'
            raise ValueError(problem)

    if not lls:  # with partial=True it is possible that every fjr was missing
        raise ValueError('no fjrs found in %s' % wd)
    ll_all = lls.pop()[1]
    for _, ll in lls:
        ll_all |= ll
    ll_all.writeJSON(os.path.join(wd, 'processedLumis.json'))
    return ll_all
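
cs_report relies on LumiList supporting & for intersection and |= for union. A sketch of the same pairwise overlap check with plain sets of (run, lumi) pairs standing in for LumiList objects:

from itertools import combinations

lls = [(0, {(1, 1), (1, 2)}), (1, {(1, 3)}), (2, {(1, 2), (1, 4)})]
for (ia, lla), (ib, llb) in combinations(lls, 2):
    if lla & llb:
        print 'jobs %i and %i overlap in %r' % (ia, ib, lla & llb)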
Example #4
def crab_fjr_json_to_ll(fn):
    print colors.yellow('this is not fully tested')
    j = crab_fjr_json(fn)
    ll = LumiList()
    for x in j['steps']['cmsRun']['input']['source']:
        x2 = defaultdict(list)
        for k,v in x['runs'].iteritems():
            for l in v.keys():
                x2[int(k)].append(int(l))
        ll += LumiList(runsAndLumis=x2)
    return ll
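
The inner loop converts the fjr json's runs mapping, which uses string keys throughout, into the {int run: [int lumis]} form that LumiList(runsAndLumis=...) expects. The conversion in isolation, with a made-up runs dict:

from collections import defaultdict

runs = {'273158': {'1': 12, '2': 34}}  # {run: {lumi: nevents}}, all strings
x2 = defaultdict(list)
for k, v in runs.iteritems():
    for l in v.keys():
        x2[int(k)].append(int(l))
print dict(x2)  # {273158: [1, 2]} (lumi order not guaranteed)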
Example #9
def crab_hadd(working_dir,
              new_name=None,
              new_dir=None,
              raise_on_empty=False,
              chunk_size=900,
              pattern=None,
              lpc_shortcut=False,
              range_filter=None):
    working_dir, new_name, new_dir = crab_hadd_args(working_dir, new_name,
                                                    new_dir)
    expected, files = crab_hadd_files(working_dir,
                                      lpc_shortcut,
                                      range_filter=range_filter)
    result = HaddBatchResult('crab', working_dir, new_name, new_dir, expected,
                             files)
    print '%s: expecting %i files if all jobs succeeded' % (working_dir,
                                                            expected)

    if pattern:
        if '/' not in pattern:
            pattern = '*/' + pattern
        files = fnmatch.filter(files, pattern)

    automatic_splitting = False
    pprinted = False
    jobs = []
    for f in files:
        jobnum = f.split('_')[-1].split('.root')[0]
        if crab_global_options.support_automatic_splitting and '-' in jobnum:
            automatic_splitting = True
            if not pprinted:
                pprint(files)
                pprinted = True
            it, jobnum = jobnum.split('-')
            it, jobnum = int(it), int(jobnum)
            assert it >= 1  # probe jobs "0-*" should not show up
            jobnum = it * 10000 + jobnum
        else:
            jobnum = int(jobnum)
        jobs.append(jobnum)
    jobs.sort()
    expected = range(1, expected + 1)

    if jobs != expected:
        print '\033[36;7m %i files found %s not what expected \033[m' % (
            len(jobs), crabify_list(jobs))
        missing = sorted(set(expected) - set(jobs))
        print '\033[36;7m    %i missing: %r \033[m' % (len(missing), ' '.join(
            str(j) for j in missing))

    l = len(files)
    if l == 0:
        result.success = False
        msg = 'crab_hadd: no files found in %s' % working_dir
        if raise_on_empty:
            raise CRABToolsException(msg)
        else:
            print '\033[36;7m', msg, '\033[m'
    elif l == 1:
        print working_dir, ': just one file found, copying'
        cmd = 'xrdcp -s %s %s' % (files[0], new_name)
        result.success = os.system(cmd) == 0
        if result.success and not new_name.startswith('root://'):
            os.chmod(new_name, 0644)
    else:
        result.success = hadd(new_name, files)

    if automatic_splitting:
        n = norm_from_file(new_name)
        sn, s = fn_to_sample(Samples, new_name)
        if not s:
            print colors.yellow(
                "\tnorm_from_file returns %r, couldn't get sample %s" %
                (n, sn))
        else:
            no1, no2 = s.datasets['main'].nevents_orig, s.datasets[
                'miniaod'].nevents_orig
            if n == no1 or n == no2:
                print '\tnorm_from_file returns nevents_orig = %i' % n
            else:
                print colors.yellow(
                    '\tnorm_from_file returns %r while %s.nevents_orig is %i (main) %i (miniaod)'
                    % (n, sn, no1, no2))

    return result
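
With automatic splitting enabled, job numbers in output filenames look like "1-17" (iteration-jobnum), and the loop above folds them into a single sortable integer. A standalone illustration with made-up filenames:

for f in ['out_3.root', 'out_1-17.root', 'out_2-4.root']:
    jobnum = f.split('_')[-1].split('.root')[0]
    if '-' in jobnum:
        it, num = jobnum.split('-')
        jobnum = int(it) * 10000 + int(num)
    else:
        jobnum = int(jobnum)
    print f, '->', jobnum
# out_3.root -> 3, out_1-17.root -> 10017, out_2-4.root -> 20004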
Example #10
dses = ['miniaod']

for ds in dses:
    print colors.bold(ds)
    for sample in Samples.registry.all():
        if not sample.has_dataset(ds):
            continue

        sample.set_curr_dataset(ds)
        if '/None/' in sample.dataset or getattr(sample, 'is_private', False):
            continue

        try:
            sites = DBS.sites_for_dataset(sample.dataset, instance=sample.dbs_inst, json=True)
        except (RuntimeError, ValueError):
            print colors.yellow('%s %s DBS problem' % (sample.name, sample.dataset))
            continue

        if not sites:
            continue

        print sample.name,
        sites.sort(key=lambda site: DBS.site_completions(site, True))
        max_site_completion = DBS.site_completions(sites[-1], True)
        found = False
        for site in sites:
            if DBS.site_is_tape(site):
                continue

            is_complete = DBS.complete_at_site(site)
            is_good_as_possible = DBS.site_completions(site) >= max_site_completion
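
A stand-in sketch of the site-ranking logic above: sort the sites by a completion figure, then compare each against the best one. The site dicts and the 'completion' key are made up for illustration; the real code uses DBS.site_completions:

sites = [{'name': 'T2_US_Wisconsin', 'completion': 95.0},
         {'name': 'T2_DE_DESY', 'completion': 100.0}]
sites.sort(key=lambda site: site['completion'])
max_site_completion = sites[-1]['completion']
for site in sites:
    print site['name'], site['completion'] >= max_site_completion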
Example #11
def main(samples_registry):
    from glob import glob
    from sys import argv
    from pprint import pprint
    from JMTucker.Tools import colors

    if 'merge' in argv:
        samples = samples_registry.from_argv(from_root_fns=True, raise_if_none=True)
        out_fn = [x for x in argv if x.endswith('.root') and not os.path.isfile(x)]
        out_fn = out_fn[0] if out_fn else 'merge.root'
        norm_to = typed_from_argv(float, default_value=1.)
        norm_path = typed_from_argv(str, default_value='', name='norm_path')
        merge(samples, output=out_fn, norm_to=norm_to, norm_path=norm_path)

    elif 'printmissing' in argv:
        samples = [s.name for s in samples_registry.from_argv(raise_if_none=True)]
        samples.sort()
        look_for_root_files = 'no_root' not in argv
        no_batch_dir, no_root_file = [], []
        for s in samples:
            if not os.path.isdir('condor_' + s) and not glob('crab_*_' + s):
                no_batch_dir.append(s)
            if not os.path.isfile('%s.root' % s):
                no_root_file.append(s)
        if no_batch_dir:
            print colors.yellow('no batch dir for these:')
            for s in no_batch_dir:
                print s
        if look_for_root_files and no_root_file:
            print colors.yellow('no root file for these:')
            for s in no_root_file:
                print s

    elif 'ds' in argv:
        samples = samples_registry.from_argv(raise_if_none=True)
        if len(samples) != 1:
            raise ValueError('must have exactly one sample in argv')
        sample = samples[0]
        dataset = argv[argv.index(sample.name)+1]
        if not sample.has_dataset(dataset):
            raise KeyError('no dataset %s in %s' % (dataset, sample))
        print sample.datasets[dataset].dataset

    elif 'file' in argv:
        samples = samples_registry.from_argv(raise_if_none=True)
        if len(samples) != 1:
            raise ValueError('must have exactly one sample in argv')
        sample = samples[0]
        dataset = argv[argv.index(sample.name)+1]
        if not sample.has_dataset(dataset):
            raise KeyError('no dataset %s in %s' % (dataset, sample))
        sample.set_curr_dataset(dataset)
        for x in sample.filenames[:typed_from_argv(int, 5)]:
            print x

    elif 'nevents' in argv:
        samples = samples_registry.from_argv(raise_if_none=True)
        if len(samples) != 1:
            raise ValueError('must have exactly one sample in argv')
        sample = samples[0]
        dataset = argv[argv.index(sample.name)+1]
        if not sample.has_dataset(dataset):
            raise KeyError('no dataset %s in %s' % (dataset, sample))
        sample.set_curr_dataset(dataset)
        print DBS.numevents_in_dataset(sample.dataset)

    elif 'site' in argv:
        samples = samples_registry.from_argv(raise_if_none=True)
        dataset = samples_registry.datasets_from_argv()
        if len(dataset) > 1:
            raise ValueError('only zero/one dataset allowed')
        dataset = dataset[0] if len(dataset) == 1 else 'main'
        mlen = max(len(s.name) for s in samples)
        for sample in samples:
            sample.set_curr_dataset(dataset)
            try:
                sites = DBS.sites_for_dataset(sample.dataset, json=True)
            except RuntimeError:
                print sample.name, 'PROBLEM'
                continue
            print sample.name.ljust(mlen+5),
            sites.sort(key=lambda x: x['name'])
            for site in sites:
                if DBS.site_is_tape(site):
                    continue
                is_complete = DBS.complete_at_site(site)
                print (colors.green if is_complete else colors.yellow)(DBS.site_completions_string(site)),

    elif 'samplefiles' in argv:
        # rm a; touch a; for ds in '' miniaod; do for x in qcd ttbar leptonic; do ( samples samplefiles ${x}_samples_2017 $ds >> a ) ; done; done
        # rm a; touch a; for ds in '' miniaod; do for year in 2017 2018; do for x in data auxiliary_data ; do ( samples samplefiles ${x}_samples_${year} $ds >> a ) ; done; done; done
        samples = samples_registry.from_argv(raise_if_none=True)
        dataset = 'main'
        for arg in argv[1:]:
            if arg == 'miniaod' or arg.startswith('ntuple'):
                dataset = arg
                break
        print 'getting files for dataset %s:' % dataset, ', '.join(s.name for s in samples)
        import SampleFiles as sf
        for s in samples:
            d = {}
            if not s.has_dataset(dataset):
                print colors.yellow('no dataset %s for %s' % (dataset, s.name))
                continue
            s.set_curr_dataset(dataset)
            if sf.has(s.name, dataset):
                raise KeyError('SampleFiles already has an entry for %s' % s.name)
            else:
                fns = s.filenames
                print 'DBS has %i files for %s' % (len(fns), s.name)
                d[(s.name, dataset)] = (len(fns), fns)
            print "('%s:%s', '%s')," % (s.name, dataset, sf._enc(d))
Example #12
def main(samples_registry):
    from glob import glob
    from sys import argv
    from pprint import pprint
    from JMTucker.Tools import colors
    from JMTucker.Tools.general import chunks, typed_from_argv

    samples = samples_registry.from_argv()
    datasets = samples_registry.datasets_from_argv()
    def prnt(*x):
        print ' '.join(str(y) for y in x)
    def runem(cb):
        for dataset in datasets:
            for sample in samples:
                if not sample.has_dataset(dataset):
                    print colors.yellow('no dataset %s for %s' % (dataset, sample.name))
                    continue
                sample.set_curr_dataset(dataset)
                cb(dataset, sample)

    if 'merge' in argv:
        samples = samples_registry.from_argv(from_root_fns=True, raise_if_none=True)
        out_fn = [x for x in argv if x.endswith('.root') and not os.path.isfile(x)]
        out_fn = out_fn[0] if out_fn else 'merge.root'
        norm_to = typed_from_argv(float, default_value=1.)
        norm_path = typed_from_argv(str, default_value='', name='norm_path')
        merge(samples, output=out_fn, norm_to=norm_to, norm_path=norm_path)

    elif 'printmissing' in argv:
        samples = [s.name for s in samples_registry.from_argv(raise_if_none=True)]
        samples.sort()
        look_for_root_files = 'no_root' not in argv
        no_batch_dir, no_root_file = [], []
        for s in samples:
            if not os.path.isdir('condor_' + s) and not glob('crab_*_' + s):
                no_batch_dir.append(s)
            if not os.path.isfile('%s.root' % s):
                no_root_file.append(s)
        if no_batch_dir:
            print colors.yellow('no batch dir for these:')
            for s in no_batch_dir:
                print s
        if look_for_root_files and no_root_file:
            print colors.yellow('no root file for these:')
            for s in no_root_file:
                print s

    elif 'name' in argv:
        runem(lambda dataset, sample: prnt(sample.name, dataset))

    elif 'ds' in argv:
        runem(lambda dataset, sample: prnt(sample.name, dataset, sample.dataset))

    elif 'file' in argv:
        runem(lambda dataset, sample: [prnt(sample.name, dataset, x) for x in sample.filenames[:typed_from_argv(int, 5)]])

    elif 'nevents' in argv:
        runem(lambda dataset, sample: prnt(sample.name, dataset, DBS.numevents_in_dataset(sample.dataset)))

    elif 'files_for_events' in argv:
        rles = typed_from_argv(int, return_multiple=True)
        if len(rles) % 3 != 0:
            raise ValueError('expect list of ints in argv with length divisible by 3 [run1 lumi1 event1 ...]')
        rles = list(chunks(rles,3))
        runem(lambda dataset, sample: prnt(sample.name, dataset, ' '.join(DBS.files_for_events(rles, sample.dataset))))

    elif 'site' in argv:
        mlen = max(len(s.name) for s in samples)
        def cb(dataset, sample):
            ljname = sample.name.ljust(mlen+3)
            try:
                sites = DBS.sites_for_dataset(sample.dataset, json=True)
            except RuntimeError:
                print colors.boldred(ljname + ' DBS problem')
            else:
                print ljname,
                sites.sort(key=lambda x: x['name'])
                for site in sites:
                    if DBS.site_is_tape(site):
                        continue
                    is_complete = DBS.complete_at_site(site)
                    print (colors.green if is_complete else colors.yellow)(DBS.site_completions_string(site)), ' ',
                print
        runem(cb)

    elif 'samplefiles' in argv:
        import SampleFiles as sf
        def cb(dataset, sample):
            if sf.has(sample.name, dataset):
                raise KeyError('SampleFiles already has an entry for %s' % sample.name)
            fns = sample.filenames
            print 'DBS has %i files for %s' % (len(fns), sample.name)
            d = {(sample.name, dataset): (len(fns), fns)}
            print "('%s:%s', '%s')," % (sample.name, dataset, sf._enc(d))
        runem(cb)

    elif 'sfhas' in argv:
        neg = 'neg' in argv
        import SampleFiles as sf
        for dataset in datasets:
            for sample in samples:
                if sf.has(sample.name, dataset) != neg:
                    print sample.name
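
The runem helper above factors the (dataset, sample) double loop out of every subcommand. A self-contained sketch of the same dispatch, with a hypothetical FakeSample class standing in for the Samples registry entries:

class FakeSample(object):
    def __init__(self, name, dses):
        self.name, self.dses = name, dses
    def has_dataset(self, ds):
        return ds in self.dses
    def set_curr_dataset(self, ds):
        self.curr_dataset = ds

samples = [FakeSample('qcd_ht0700', ['main']), FakeSample('ttbar', ['main', 'miniaod'])]
datasets = ['main', 'miniaod']

def runem(cb):
    for dataset in datasets:
        for sample in samples:
            if not sample.has_dataset(dataset):
                continue
            sample.set_curr_dataset(dataset)
            cb(dataset, sample)

def show(dataset, sample):
    print sample.name, dataset

runem(show)  # qcd_ht0700 main, ttbar main, ttbar miniaod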
def crab_hadd(working_dir, new_name=None, new_dir=None, raise_on_empty=False, chunk_size=900, pattern=None, lpc_shortcut=False, range_filter=None):
    working_dir, new_name, new_dir = crab_hadd_args(working_dir, new_name, new_dir)
    expected, files = crab_hadd_files(working_dir, lpc_shortcut, range_filter=range_filter)
    print '%s: expecting %i files if all jobs succeeded' % (working_dir, expected)

    if pattern:
        if '/' not in pattern:
            pattern = '*/' + pattern
        files = fnmatch.filter(files, pattern)

    automatic_splitting = False
    pprinted = False
    jobs = []
    for f in files:
        jobnum = f.split('_')[-1].split('.root')[0]
        if crab_global_options.support_automatic_splitting and '-' in jobnum:
            automatic_splitting = True
            if not pprinted:
                pprint(files)
                pprinted = True
            it, jobnum = jobnum.split('-')
            it, jobnum = int(it), int(jobnum)
            assert it >= 1 # probe jobs "0-*" should not show up
            jobnum = it*10000 + jobnum
        else:
            jobnum = int(jobnum)
        jobs.append(jobnum)
    jobs.sort()
    expected = range(1, expected+1)

    if jobs != expected:
        print '\033[36;7m %i files found %s not what expected \033[m' % (len(jobs), crabify_list(jobs))
        missing = sorted(set(expected) - set(jobs))
        print '\033[36;7m    %i missing: %r \033[m' % (len(missing), ' '.join(str(j) for j in missing))

    l = len(files)
    if l == 0:
        msg = 'crab_hadd: no files found in %s' % working_dir
        if raise_on_empty:
            raise CRABToolsException(msg)
        else:
            print '\033[36;7m', msg, '\033[m'
    elif l == 1:
        print working_dir, ': just one file found, copying'
        cmd = 'xrdcp -s %s %s' % (files[0], new_name)
        if os.system(cmd) == 0 and not new_name.startswith('root://'):
            os.chmod(new_name, 0644)  # only chmod local copies; xrootd URLs are not paths
    else:
        hadd(new_name, files)

    if automatic_splitting:
        n = norm_from_file(new_name)
        sn, s = fn_to_sample(Samples, new_name)
        if not s:
            print colors.yellow("\tnorm_from_file returns %r, couldn't get sample %s" % (n, sn))
        else:
            no1, no2 = s.datasets['main'].nevents_orig, s.datasets['miniaod'].nevents_orig
            if n == no1 or n == no2:
                print '\tnorm_from_file returns nevents_orig = %i' % n
            else:
                print colors.yellow('\tnorm_from_file returns %r while %s.nevents_orig is %i (main) %i (miniaod)' % (n, sn, no1, no2))

    return new_name
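
The pattern argument is matched with fnmatch, and a bare pattern is prefixed with '*/' so it applies to the last path component of each file. The filtering in isolation, with made-up file paths:

import fnmatch

files = ['/store/user/x/ntuple_1.root', '/store/user/x/skim_1.root']
pattern = 'ntuple*'
if '/' not in pattern:
    pattern = '*/' + pattern
print fnmatch.filter(files, pattern)  # ['/store/user/x/ntuple_1.root']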