Example 1
0
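# Note: the excerpts below are function-level extracts, so their module-level
# imports are not shown. A sketch of what they assume, reconstructed from the
# call sites (module and helper names are inferred, not copied from the
# original headers):
import os
import optparse
import time
import ROOT as r          # for r.TCut and friends
import dataset            # Dataset / DatasetGroup helpers
import systUtils          # Group / Sample systematic bookkeeping
import utils              # assumed home of print_running_conditions etc.
import rootUtils
from rootUtils import writeObjectsToFile, dictSum
# (plus analysis-specific helpers such as kin, IndexedChain, count_and_fill,
#  fillHistos, and the histogram booking/plotting functions)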
def runFill(opts) :
    batchMode    = opts.batch
    inputFakeDir = opts.input_fake
    inputGenDir  = opts.input_other
    outputDir    = opts.output_dir
    verbose      = opts.verbose
    debug        = opts.debug
    blinded      = not opts.unblind
    tightight    = opts.require_tight_tight
    skip_charge_flip = opts.skip_charge_flip # assumption: taken from a command-line flag

    if debug : dataset.Dataset.verbose_parsing = True
    groups = dataset.DatasetGroup.build_groups_from_files_in_dir(opts.samples_dir)
    if not skip_charge_flip : groups.append(dataset.DatasetGroup.build_qflip_from_simulated_samples(groups))
    groups.append(first([g for g in groups if g.is_data]).clone_data_as_fake())
    groups = parse_group_option(opts, groups)
    if verbose : print '\n'.join("group {0} : {1} samples".format(g.name, len(g.datasets)) for g in groups)
    if debug :
        print '\n'.join("group {0} : {1} samples: {2}".format(g.name,
                                                              len(g.datasets),
                                                              '\n\t'+'\n\t'.join(d.name for d in g.datasets))
                        for g in groups)
    if verbose : print "filling histos"
    # eval will take care of aborting on typos
    onthefly_tight_def = eval(opts.tight_def) if opts.tight_def else None
    mkdirIfNeeded(outputDir)
    systematics = get_list_of_syst_to_fill(opts)
    regions = regions_to_plot(opts.include_regions, opts.exclude_regions, opts.regions)
    if verbose : print "about to loop over these systematics:\n %s"%str(systematics)
    if verbose : print "about to loop over these regions:\n %s"%str(regions)
    if batchMode:
        for group in groups:
            for systematic in systematics:
                if systUtils.Group(group.name).isNeededForSys(systematic):
                    opts.syst = systematic
                    for selection in regions:
                        submit_batch_fill_job_per_group_per_selection(group=group, selection=selection, opts=opts)
    else:
        for group in groups:
            # filter into a new list so later groups still see the full systematics list
            group_systematics = [s for s in systematics if systUtils.Group(group.name).isNeededForSys(s)]
            if not group_systematics : print "warning, empty syst list. You should have at least the nominal"
            for systematic in group_systematics:
                # note to self: here you will want to use a modified Sample.setHftInputDir
                # for now we just have the fake syst that are in the nominal tree
                tree_name = 'hlfv_tuple'
                chain = IndexedChain(tree_name)
                input_dir = opts.input_fake if group.name=='fake' else opts.input_other
                for ds in group.datasets:
                    chain.Add(os.path.join(input_dir, systUtils.Sample(ds.name, group.name).setSyst(systematic).filename))
                if opts.verbose:
                    print "{0} : {1} entries from {2} samples".format(group.name,
                                                                      chain.GetEntries(),
                                                                      len(group.datasets))
                chain.cache_directory = os.path.abspath('./selection_cache/'+group.name+'/')
                tcuts = [r.TCut(reg, selection_formulas()[reg]) for reg in regions]
                chain.retrieve_entrylists(tcuts)
                counters_pre, histos_pre = dict(), dict()
                counters_npre, histos_npre = dict(), dict()
                cached_tcuts = [] if opts.disable_cache else chain.tcuts_with_existing_list()
                uncached_tcuts = tcuts if opts.disable_cache else chain.tcuts_without_existing_list()
                if verbose : print 'filling cached cuts: ',' '.join([c.GetName() for c in cached_tcuts])
                for cut in cached_tcuts:
                    chain.preselect(cut)
                    c_pre, h_pre = count_and_fill(chain=chain, sample=group.name,
                                                  syst=systematic, verbose=verbose,
                                                  debug=debug, blinded=blinded,
                                                  onthefly_tight_def=onthefly_tight_def,
                                                  tightight=tightight, quicktest=opts.quick_test,
                                                  cached_cut=cut)
                    out_filename = (systUtils.Group(group.name)
                                    .setSyst(systematic)
                                    .setHistosDir(outputDir)
                                    .setCurrentSelection(cut.GetName())).filenameHisto
                    writeObjectsToFile(out_filename, h_pre, verbose)
                    counters_pre = dictSum(counters_pre, c_pre)
                    histos_pre = dictSum(histos_pre, h_pre)
                if uncached_tcuts:
                    if verbose : print 'filling uncached cuts: ',' '.join([c.GetName() for c in uncached_tcuts])
                    counters_npre, histos_npre = count_and_fill(chain=chain, sample=group.name,
                                                                syst=systematic, verbose=verbose,
                                                                debug=debug, blinded=blinded,
                                                                onthefly_tight_def=onthefly_tight_def,
                                                                tightight=tightight,
                                                                quicktest=opts.quick_test,
                                                                noncached_cuts=uncached_tcuts)
                    for sel, histos in histos_npre.iteritems():
                        out_filename = (systUtils.Group(group.name)
                                        .setSyst(systematic)
                                        .setHistosDir(outputDir)
                                        .setCurrentSelection(sel)).filenameHisto
                        writeObjectsToFile(out_filename, histos, verbose)
                chain.save_lists()
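The dictSum helper used above merges the per-cut counter and histogram
dictionaries into a single dict. A minimal sketch of the behavior implied by
its call sites (the real implementation lives in the shared ROOT utilities):

def dictSum(d1, d2):
    # merge two dicts into a new one; on duplicate keys d2 wins
    merged = dict(d1)
    merged.update(d2)
    return merged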
Example 2
0
def runFill(opts):
    lepton = opts.lepton
    batchMode = opts.batch
    inputDir = opts.input_dir
    outputDir = opts.output_dir
    verbose = opts.verbose
    debug = opts.debug

    dataset.Dataset.verbose_parsing = True if debug else False
    groups = dataset.DatasetGroup.build_groups_from_files_in_dir(
        opts.samples_dir)
    if opts.group: groups = [g for g in groups if g.name == opts.group]
    if verbose:
        print '\n'.join(
            "group {0} : {1} samples".format(g.name, len(g.datasets))
            for g in groups)
    if debug:
        print '\n'.join("group {0} : {1} samples: {2}".format(
            g.name, len(g.datasets), '\n\t' + '\n\t'.join(d.name
                                                          for d in g.datasets))
                        for g in groups)
    if verbose: print "filling histos"
    outputDir = outputDir + '/' + lepton + '/histos'
    mkdirIfNeeded(outputDir)
    if batchMode:
        for group in groups:
            submit_batch_fill_job_per_group(group, opts)
    else:
        for group in groups:
            tree_name = 'ss3l_tuple'
            chain = IndexedChain(tree_name)
            for ds in group.datasets:
                chain.Add(os.path.join(inputDir, ds.name + '.root'))
            if opts.verbose:
                print "{0} : {1} entries from {2} samples".format(
                    group.name, chain.GetEntries(), len(group.datasets))
            chain.cache_directory = os.path.abspath('./selection_cache/' +
                                                    group.name + '/')
            tcuts = [
                r.TCut(reg,
                       selection_formulas()[reg])
                for reg in regions_to_plot(opts.include_regions,
                                           opts.exclude_regions, opts.regions)
            ]
            chain.retrieve_entrylists(tcuts)
            counters_pre, histos_pre = dict(), dict()
            counters_npre, histos_npre = dict(), dict()
            cached_tcuts = [] if opts.disable_cache else chain.tcuts_with_existing_list()
            uncached_tcuts = tcuts if opts.disable_cache else chain.tcuts_without_existing_list()
            print 'todo: skip cuts for which the histo files are there'
            if verbose:
                print 'filling cached cuts: ', ' '.join(
                    [c.GetName() for c in cached_tcuts])
            for cut in cached_tcuts:
                chain.preselect(cut)
                c_pre, h_pre = count_and_fill(chain=chain,
                                              opts=opts,
                                              group=group,
                                              cached_cut=cut)
                counters_pre = dictSum(counters_pre, c_pre)
                histos_pre = dictSum(histos_pre, h_pre)
            if verbose:
                print 'filling uncached cuts: ', ' '.join(
                    [c.GetName() for c in uncached_tcuts])
            if uncached_tcuts:
                counters_npre, histos_npre = count_and_fill(
                    chain=chain,
                    opts=opts,
                    group=group,
                    noncached_cuts=uncached_tcuts)
                chain.save_lists()
            all_histos = dictSum(histos_pre, histos_npre)
            for sel, histos in all_histos.iteritems():
                # write histos for each sel to a separate file (finer granularity, better caching)
                out_filename = os.path.join(outputDir,
                                            group.name + '_' + sel + '.root')
                if verbose: print 'saving to ', out_filename
                writeObjectsToFile(out_filename, histos, verbose)
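IndexedChain caches one ROOT entry list per selection, which is why cached
cuts can be replayed with preselect() without re-scanning the chain. A
standalone sketch of the underlying plain-PyROOT mechanism (hypothetical file
and cut formula; this is not the IndexedChain API itself):

import ROOT as r

chain = r.TChain('ss3l_tuple')
chain.Add('input/sample.root')                 # hypothetical input file
# build the entry list once for a given selection formula
chain.Draw('>>elist_sr', 'pt0>45.0', 'entrylist')
elist = r.gDirectory.Get('elist_sr')
chain.SetEntryList(elist)                      # later loops visit only selected entries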
Example 3
0
def runFill(opts):
    batchMode = opts.batch
    inputFakeDir = opts.input_fake
    inputGenDir = opts.input_other
    outputDir = opts.output_dir
    verbose = opts.verbose
    debug = opts.debug
    blinded = not opts.unblind
    tightight = opts.require_tight_tight
    skip_charge_flip = opts.skip_charge_flip  # assumption: taken from a command-line flag

    if debug: dataset.Dataset.verbose_parsing = True
    groups = dataset.DatasetGroup.build_groups_from_files_in_dir(
        opts.samples_dir)
    if not skip_charge_flip:
        groups.append(
            dataset.DatasetGroup.build_qflip_from_simulated_samples(groups))
    groups.append(first([g for g in groups if g.is_data]).clone_data_as_fake())
    groups = parse_group_option(opts, groups)
    if verbose:
        print '\n'.join(
            "group {0} : {1} samples".format(g.name, len(g.datasets))
            for g in groups)
    if debug:
        print '\n'.join("group {0} : {1} samples: {2}".format(
            g.name, len(g.datasets), '\n\t' + '\n\t'.join(d.name
                                                          for d in g.datasets))
                        for g in groups)
    if verbose: print "filling histos"
    # eval will take care of aborting on typos
    onthefly_tight_def = eval(opts.tight_def) if opts.tight_def else None
    mkdirIfNeeded(outputDir)
    systematics = get_list_of_syst_to_fill(opts)
    regions = regions_to_plot(opts.include_regions, opts.exclude_regions,
                              opts.regions)
    if verbose:
        print "about to loop over these systematics:\n %s" % str(systematics)
    if verbose: print "about to loop over these regions:\n %s" % str(regions)
    if batchMode:
        for group in groups:
            for systematic in systematics:
                if systUtils.Group(group.name).isNeededForSys(systematic):
                    opts.syst = systematic
                    for selection in regions:
                        submit_batch_fill_job_per_group_per_selection(
                            group=group, selection=selection, opts=opts)
    else:
        for group in groups:
            # filter into a new list so later groups still see the full systematics list
            group_systematics = [
                s for s in systematics
                if systUtils.Group(group.name).isNeededForSys(s)
            ]
            if not group_systematics:
                print "warning, empty syst list. You should have at least the nominal"
            for systematic in group_systematics:
                # note to self: here you will want to use a modified Sample.setHftInputDir
                # for now we just have the fake syst that are in the nominal tree
                tree_name = 'hlfv_tuple'
                chain = IndexedChain(tree_name)
                input_dir = opts.input_fake if group.name == 'fake' else opts.input_other
                for ds in group.datasets:
                    sample = systUtils.Sample(ds.name, group.name).setSyst(systematic)
                    chain.Add(os.path.join(input_dir, sample.filename))
                if opts.verbose:
                    print "{0} : {1} entries from {2} samples".format(
                        group.name, chain.GetEntries(), len(group.datasets))
                chain.cache_directory = os.path.abspath('./selection_cache/' +
                                                        group.name + '/')
                tcuts = [
                    r.TCut(reg,
                           selection_formulas()[reg]) for reg in regions
                ]
                chain.retrieve_entrylists(tcuts)
                counters_pre, histos_pre = dict(), dict()
                counters_npre, histos_npre = dict(), dict()
                cached_tcuts = [] if opts.disable_cache else chain.tcuts_with_existing_list()
                uncached_tcuts = tcuts if opts.disable_cache else chain.tcuts_without_existing_list()
                if verbose:
                    print 'filling cached cuts: ', ' '.join(
                        [c.GetName() for c in cached_tcuts])
                for cut in cached_tcuts:
                    chain.preselect(cut)
                    c_pre, h_pre = count_and_fill(
                        chain=chain,
                        sample=group.name,
                        syst=systematic,
                        verbose=verbose,
                        debug=debug,
                        blinded=blinded,
                        onthefly_tight_def=onthefly_tight_def,
                        tightight=tightight,
                        quicktest=opts.quick_test,
                        cached_cut=cut)
                    out_filename = (systUtils.Group(group.name)
                                    .setSyst(systematic)
                                    .setHistosDir(outputDir)
                                    .setCurrentSelection(cut.GetName())).filenameHisto
                    writeObjectsToFile(out_filename, h_pre, verbose)
                    counters_pre = dictSum(counters_pre, c_pre)
                    histos_pre = dictSum(histos_pre, h_pre)
                if uncached_tcuts:
                    if verbose:
                        print 'filling uncached cuts: ', ' '.join(
                            [c.GetName() for c in uncached_tcuts])
                    counters_npre, histos_npre = count_and_fill(
                        chain=chain,
                        sample=group.name,
                        syst=systematic,
                        verbose=verbose,
                        debug=debug,
                        blinded=blinded,
                        onthefly_tight_def=onthefly_tight_def,
                        tightight=tightight,
                        quicktest=opts.quick_test,
                        noncached_cuts=uncached_tcuts)
                    for sel, histos in histos_npre.iteritems():
                        out_filename = (systUtils.Group(group.name)
                                        .setSyst(systematic)
                                        .setHistosDir(outputDir)
                                        .setCurrentSelection(sel)).filenameHisto
                        writeObjectsToFile(out_filename, histos, verbose)
                chain.save_lists()
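# The fluent chain systUtils.Group(...).setSyst(...).setHistosDir(...)
# .setCurrentSelection(...).filenameHisto above builds one output file name
# per group, systematic, and selection. A hypothetical minimal equivalent,
# assuming a '<dir>/<group>_<selection>_<syst>.root' convention (the real
# convention may differ):
def histo_filename(histos_dir, group_name, selection, syst):
    return os.path.join(histos_dir,
                        "{0}_{1}_{2}.root".format(group_name, selection, syst))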
def main():
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-g', '--group', help='group to be processed (used only in fill mode)')
    parser.add_option('-i', '--input-dir', default='./out/fakerate')
    parser.add_option('-o', '--output-dir', default='./out/fake_scale_factor')
    parser.add_option('-l', '--lepton', default='el', help='either el or mu')
    parser.add_option('-r', '--region', help='one of the regions for which we saved the fake ntuples')
    parser.add_option('--samples-dir', default='samples/', help='directory with the list of samples; default ./samples/')
    parser.add_option('-T', '--tight-def', help='on-the-fly tight def, one of defs in fakeUtils.py: fakeu.lepIsTight_std, etc.')
    parser.add_option('-f', '--fill-histos', action='store_true', default=False, help='force fill (default only if needed)')
    parser.add_option('--keep-real', action='store_true', default=False, help='do not subtract real (to get real lep efficiency)')
    parser.add_option('--debug', action='store_true')
    parser.add_option('--verbose', action='store_true')
    parser.add_option('--disable-cache', action='store_true', help='disable the entry cache')
    (options, args) = parser.parse_args()
    inputDir  = options.input_dir
    outputDir = options.output_dir
    lepton    = options.lepton
    region    = options.region
    keepreal  = options.keep_real
    debug     = options.debug
    verbose   = options.verbose
    if lepton not in ['el', 'mu'] : parser.error("invalid lepton '%s'"%lepton)
    regions = kin.selection_formulas().keys()
    assert region in regions,"invalid region '%s', must be one of %s"%(region, str(sorted(regions)))
    regions = [region]

    dataset.Dataset.verbose_parsing = True if debug else False
    groups = dataset.DatasetGroup.build_groups_from_files_in_dir(options.samples_dir)
    if options.group : groups = [g for g in groups if g.name==options.group]
    group_names = [g.name for g in groups]

    outputDir = outputDir+'/'+region+'/'+lepton # split the output in subdirectories, so we don't overwrite things
    mkdirIfNeeded(outputDir)
    templateOutputFilename = "scale_factor_{0}.root".format(lepton)
    outputFileName = os.path.join(outputDir, templateOutputFilename)
    cacheFileName = outputFileName.replace('.root', '_cache.root')
    doFillHistograms = options.fill_histos or not os.path.exists(cacheFileName)
    onthefly_tight_def = eval(options.tight_def) if options.tight_def else None # eval will take care of aborting on typos
    if verbose : utils.print_running_conditions(parser, options)
    vars = ['mt0', 'mt1', 'pt0', 'pt1', 'eta1', 'pt1_eta1']
    #fill histos
    if doFillHistograms :
        start_time = time.clock()
        num_processed_entries = 0
        histosPerGroup = bookHistos(vars, group_names, region=region)
        histosPerSource = bookHistosPerSource(vars, leptonSources, region=region)
        histosPerGroupPerSource = bookHistosPerSamplePerSource(vars, group_names, leptonSources, region=region)
        for group in groups:
            tree_name = 'hlfv_tuple'
            chain = IndexedChain(tree_name)
            for ds in group.datasets:
                fname = os.path.join(inputDir, ds.name+'.root')
                if os.path.exists(fname):
                    chain.Add(fname)
            if verbose:
                print "{0} : {1} entries from {2} samples".format(group.name, chain.GetEntries(), len(group.datasets))
            chain.cache_directory = os.path.abspath('./selection_cache/'+group.name+'/')
            tcuts = [r.TCut(reg, selection_formulas()[reg]) for reg in regions]
            print 'tcuts ',[c.GetName() for c in tcuts]
            chain.retrieve_entrylists(tcuts)
            counters_pre, histos_pre = dict(), dict()
            counters_npre, histos_npre = dict(), dict()
            print 'tcuts_with_existing_list ',str([c.GetName() for c in chain.tcuts_with_existing_list()])
            print 'tcuts_without_existing_list ',str([c.GetName() for c in chain.tcuts_without_existing_list()])
            cached_tcuts = [] if options.disable_cache else chain.tcuts_with_existing_list()
            print 'cached_tcuts ',[c.GetName() for c in cached_tcuts]
            uncached_tcuts = tcuts if options.disable_cache else chain.tcuts_without_existing_list()
            print 'todo: skip cuts for which the histo files are there'
            if verbose:
                print " --- group : {0} ---".format(group.name)
                print '\n\t'.join(chain.filenames)
            if verbose : print 'filling cached cuts: ',' '.join([c.GetName() for c in cached_tcuts])
            if verbose: print "%s : %d entries"%(group.name, chain.GetEntries())
            histosThisGroup = histosPerGroup[group.name]
            histosThisGroupPerSource = dict((v, histosPerGroupPerSource[v][group.name]) for v in histosPerGroupPerSource.keys())
            for cut in cached_tcuts:
                print 'cached_tcut ',cut
                chain.preselect(cut)
                num_processed_entries += fillHistos(chain, histosThisGroup, histosPerSource,
                                                    histosThisGroupPerSource,
                                                    lepton, group,
                                                    cut, cut_is_cached=True,
                                                    onthefly_tight_def=onthefly_tight_def,
                                                    verbose=verbose)
            if verbose : print 'filling uncached cuts: ',' '.join([c.GetName() for c in uncached_tcuts])
            if uncached_tcuts:
                assert len(uncached_tcuts)==1, "expecting only one cut, got {}".format(len(uncached_tcuts))
                cut = uncached_tcuts[0]
                chain.preselect(None)
                num_processed_entries += fillHistos(chain, histosThisGroup, histosPerSource,
                                                    histosThisGroupPerSource,
                                                    lepton, group,
                                                    cut, cut_is_cached=False,
                                                    onthefly_tight_def=onthefly_tight_def,
                                                    verbose=verbose)
                chain.save_lists()

        writeHistos(cacheFileName, histosPerGroup, histosPerSource, histosPerGroupPerSource, verbose)
        end_time = time.clock()
        delta_time = end_time - start_time
        if verbose:
            print ("processed {0:d} entries ".format(num_processed_entries)
                   +"in "+("{0:d} min ".format(int(delta_time/60)) if delta_time>60 else
                           "{0:.1f} s ".format(delta_time))
                   +"({0:.1f} kHz)".format(num_processed_entries/delta_time/1000.0))
    # compute scale factors
    histosPerGroup = fetchHistos(cacheFileName, histoNames(vars, group_names, region), verbose)
    histosPerSource = fetchHistos(cacheFileName, histoNamesPerSource(vars, leptonSources, region), verbose)
    histosPerSamplePerSource = fetchHistos(cacheFileName, histoNamesPerSamplePerSource(vars, group_names, leptonSources, region), verbose)
    plotStackedHistos(histosPerGroup, outputDir+'/by_group', region, verbose)
    plotStackedHistosSources(histosPerSource, outputDir+'/by_source', region, verbose)
    plotPerSourceEff(histosPerVar=histosPerSource, outputDir=outputDir+'/by_source', lepton=lepton, region=region, verbose=verbose)
    for g in group_names:
        hps = dict((v, histosPerSamplePerSource[v][g]) for v in vars)
        plotPerSourceEff(histosPerVar=hps, outputDir=outputDir, lepton=lepton, region=region, sample=g, verbose=verbose)


    hn_sf_eta = histoname_sf_vs_eta           (lepton)
    hn_sf_pt  = histoname_sf_vs_pt            (lepton)
    hn_da_eta = histoname_data_fake_eff_vs_eta(lepton)
    hn_da_pt  = histoname_data_fake_eff_vs_pt (lepton)
    subtractReal = not keepreal
    objs_eta = subtractRealAndComputeScaleFactor(histosPerGroup, 'eta1', hn_sf_eta, hn_da_eta, outputDir, region, subtractReal, verbose)
    objs_pt  = subtractRealAndComputeScaleFactor(histosPerGroup, 'pt1',  hn_sf_pt,  hn_da_pt,  outputDir, region, subtractReal, verbose)
    objs_pt_eta  = subtractRealAndComputeScaleFactor(histosPerGroup, 'pt1_eta1',
                                                     histoname_sf_vs_pt_eta(lepton),
                                                     histoname_data_fake_eff_vs_pt_eta(lepton),
                                                     outputDir, region, subtractReal, verbose)
    rootUtils.writeObjectsToFile(outputFileName, dictSum(dictSum(objs_eta, objs_pt), objs_pt_eta), verbose)
    if verbose : print "saved scale factors to %s" % outputFileName
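Helpers such as first and mkdirIfNeeded are assumed to come from the shared
utils module; minimal sketches of the behavior implied by their call sites:

def first(iterable):
    # return the first element, or None if the iterable is empty
    return next(iter(iterable), None)

def mkdirIfNeeded(path):
    # create the directory (and parents) only if it does not exist yet
    if path and not os.path.isdir(path):
        os.makedirs(path)
    return path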
Example 5
0
def runFill(opts):
    lepton    = opts.lepton
    batchMode = opts.batch
    inputDir  = opts.input_dir
    outputDir = opts.output_dir
    verbose   = opts.verbose
    debug     = opts.debug

    dataset.Dataset.verbose_parsing = True if debug else False
    groups = dataset.DatasetGroup.build_groups_from_files_in_dir(opts.samples_dir)
    if opts.group : groups = [g for g in groups if g.name==opts.group]
    if verbose : print '\n'.join("group {0} : {1} samples".format(g.name, len(g.datasets)) for g in groups)
    if debug :
        print '\n'.join("group {0} : {1} samples: {2}".format(g.name,
                                                              len(g.datasets),
                                                              '\n\t'+'\n\t'.join(d.name for d in g.datasets))
                        for g in groups)
    if verbose : print "filling histos"
    outputDir = outputDir+'/'+lepton+'/histos'
    mkdirIfNeeded(outputDir)
    if batchMode:
        for group in groups:
            submit_batch_fill_job_per_group(group, opts)
    else:
        for group in groups:
            tree_name = 'hlfv_tuple'
            chain = IndexedChain(tree_name)
            for ds in group.datasets:
                chain.Add(os.path.join(inputDir, ds.name+'.root'))
            if opts.verbose:
                print "{0} : {1} entries from {2} samples".format(group.name,
                                                                  chain.GetEntries(),
                                                                  len(group.datasets))
            chain.cache_directory = os.path.abspath('./selection_cache/'+group.name+'/')
            tcuts = [r.TCut(reg, selection_formulas()[reg])
                     for reg in regions_to_plot(opts.include_regions, opts.exclude_regions, opts.regions)]
            chain.retrieve_entrylists(tcuts)
            counters_pre, histos_pre = dict(), dict()
            counters_npre, histos_npre = dict(), dict()
            cached_tcuts = [] if opts.disable_cache else chain.tcuts_with_existing_list()
            uncached_tcuts = tcuts if opts.disable_cache else chain.tcuts_without_existing_list()
            print 'todo: skip cuts for which the histo files are there'
            if verbose : print 'filling cached cuts: ',' '.join([c.GetName() for c in cached_tcuts])
            for cut in cached_tcuts:
                chain.preselect(cut)
                c_pre, h_pre = count_and_fill(chain=chain, opts=opts,
                                              group=group,
                                              cached_cut=cut)
                counters_pre = dictSum(counters_pre, c_pre)
                histos_pre = dictSum(histos_pre, h_pre)
            if verbose : print 'filling uncached cuts: ',' '.join([c.GetName() for c in uncached_tcuts])
            if uncached_tcuts:
                counters_npre, histos_npre = count_and_fill(chain=chain, opts=opts,
                                                            group=group,
                                                            noncached_cuts=uncached_tcuts)
                chain.save_lists()
            all_histos = dictSum(histos_pre, histos_npre)
            for sel, histos in all_histos.iteritems():
                # write histos for each sel to a separate file (finer granularity, better caching)
                out_filename = os.path.join(outputDir, group.name+'_'+sel+'.root')
                if verbose : print 'saving to ',out_filename
                writeObjectsToFile(out_filename, histos, verbose)
def main():
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-g',
                      '--group',
                      help='group to be processed (used only in fill mode)')
    parser.add_option('-i', '--input-dir', default='./out/fakerate')
    parser.add_option('-o', '--output-dir', default='./out/fake_scale_factor')
    parser.add_option('-l', '--lepton', default='el', help='either el or mu')
    parser.add_option(
        '-r',
        '--region',
        help='one of the regions for which we saved the fake ntuples')
    parser.add_option(
        '--samples-dir',
        default='samples/',
        help='directory with the list of samples; default ./samples/')
    parser.add_option(
        '-T',
        '--tight-def',
        help='on-the-fly tight def, one of defs in fakeUtils.py: fakeu.lepIsTight_std, etc.')
    parser.add_option('-f',
                      '--fill-histos',
                      action='store_true',
                      default=False,
                      help='force fill (default only if needed)')
    parser.add_option('--keep-real',
                      action='store_true',
                      default=False,
                      help='do not subtract real (to get real lep efficiency)')
    parser.add_option('--debug', action='store_true')
    parser.add_option('--verbose', action='store_true')
    parser.add_option('--disable-cache',
                      action='store_true',
                      help='disable the entry cache')
    (options, args) = parser.parse_args()
    inputDir = options.input_dir
    outputDir = options.output_dir
    lepton = options.lepton
    region = options.region
    keepreal = options.keep_real
    debug = options.debug
    verbose = options.verbose
    if lepton not in ['el', 'mu']: parser.error("invalid lepton '%s'" % lepton)
    regions = kin.selection_formulas().keys()
    assert region in regions, "invalid region '%s', must be one of %s" % (
        region, str(sorted(regions)))
    regions = [region]

    dataset.Dataset.verbose_parsing = True if debug else False
    groups = dataset.DatasetGroup.build_groups_from_files_in_dir(
        options.samples_dir)
    if options.group: groups = [g for g in groups if g.name == options.group]
    group_names = [g.name for g in groups]

    outputDir = outputDir + '/' + region + '/' + lepton  # split the output in subdirectories, so we don't overwrite things
    mkdirIfNeeded(outputDir)
    templateOutputFilename = "scale_factor_{0}.root".format(lepton)
    outputFileName = os.path.join(outputDir, templateOutputFilename)
    cacheFileName = outputFileName.replace('.root', '_cache.root')
    doFillHistograms = options.fill_histos or not os.path.exists(cacheFileName)
    # eval will take care of aborting on typos
    onthefly_tight_def = eval(options.tight_def) if options.tight_def else None
    if verbose: utils.print_running_conditions(parser, options)
    vars = ['mt0', 'mt1', 'pt0', 'pt1', 'eta1', 'pt1_eta1']
    #fill histos
    if doFillHistograms:
        start_time = time.clock()
        num_processed_entries = 0
        histosPerGroup = bookHistos(vars, group_names, region=region)
        histosPerSource = bookHistosPerSource(vars,
                                              leptonSources,
                                              region=region)
        histosPerGroupPerSource = bookHistosPerSamplePerSource(vars,
                                                               group_names,
                                                               leptonSources,
                                                               region=region)
        for group in groups:
            tree_name = 'hlfv_tuple'
            chain = IndexedChain(tree_name)
            for ds in group.datasets:
                fname = os.path.join(inputDir, ds.name + '.root')
                if os.path.exists(fname):
                    chain.Add(fname)
            if verbose:
                print "{0} : {1} entries from {2} samples".format(
                    group.name, chain.GetEntries(), len(group.datasets))
            chain.cache_directory = os.path.abspath('./selection_cache/' +
                                                    group.name + '/')
            tcuts = [r.TCut(reg, selection_formulas()[reg]) for reg in regions]
            print 'tcuts ', [c.GetName() for c in tcuts]
            chain.retrieve_entrylists(tcuts)
            counters_pre, histos_pre = dict(), dict()
            counters_npre, histos_npre = dict(), dict()
            print 'tcuts_with_existing_list ', str(
                [c.GetName() for c in chain.tcuts_with_existing_list()])
            print 'tcuts_without_existing_list ', str(
                [c.GetName() for c in chain.tcuts_without_existing_list()])
            cached_tcuts = [] if options.disable_cache else chain.tcuts_with_existing_list()
            print 'cached_tcuts ', [c.GetName() for c in cached_tcuts]
            uncached_tcuts = tcuts if options.disable_cache else chain.tcuts_without_existing_list()
            print 'todo: skip cuts for which the histo files are there'
            if verbose:
                print " --- group : {0} ---".format(group.name)
                print '\n\t'.join(chain.filenames)
            if verbose:
                print 'filling cached cuts: ', ' '.join(
                    [c.GetName() for c in cached_tcuts])
            if verbose:
                print "%s : %d entries" % (group.name, chain.GetEntries())
            histosThisGroup = histosPerGroup[group.name]
            histosThisGroupPerSource = dict(
                (v, histosPerGroupPerSource[v][group.name])
                for v in histosPerGroupPerSource.keys())
            for cut in cached_tcuts:
                print 'cached_tcut ', cut
                chain.preselect(cut)
                num_processed_entries += fillHistos(
                    chain,
                    histosThisGroup,
                    histosPerSource,
                    histosThisGroupPerSource,
                    lepton,
                    group,
                    cut,
                    cut_is_cached=True,
                    onthefly_tight_def=onthefly_tight_def,
                    verbose=verbose)
            if verbose:
                print 'filling uncached cuts: ', ' '.join(
                    [c.GetName() for c in uncached_tcuts])
            if uncached_tcuts:
                assert len(uncached_tcuts) == 1, \
                    "expecting only one cut, got {}".format(len(uncached_tcuts))
                cut = uncached_tcuts[0]
                chain.preselect(None)
                num_processed_entries += fillHistos(
                    chain,
                    histosThisGroup,
                    histosPerSource,
                    histosThisGroupPerSource,
                    lepton,
                    group,
                    cut,
                    cut_is_cached=False,
                    onthefly_tight_def=onthefly_tight_def,
                    verbose=verbose)
                chain.save_lists()

        writeHistos(cacheFileName, histosPerGroup, histosPerSource,
                    histosPerGroupPerSource, verbose)
        end_time = time.clock()
        delta_time = end_time - start_time
        if verbose:
            print("processed {0:d} entries ".format(num_processed_entries) +
                  "in " +
                  ("{0:d} min ".format(int(delta_time / 60))
                   if delta_time > 60 else "{0:.1f} s ".format(delta_time)) +
                  "({0:.1f} kHz)".format(num_processed_entries / delta_time / 1000.0))
    # compute scale factors
    histosPerGroup = fetchHistos(cacheFileName,
                                 histoNames(vars, group_names, region),
                                 verbose)
    histosPerSource = fetchHistos(
        cacheFileName, histoNamesPerSource(vars, leptonSources, region),
        verbose)
    histosPerSamplePerSource = fetchHistos(
        cacheFileName,
        histoNamesPerSamplePerSource(vars, group_names, leptonSources, region),
        verbose)
    plotStackedHistos(histosPerGroup, outputDir + '/by_group', region, verbose)
    plotStackedHistosSources(histosPerSource, outputDir + '/by_source', region,
                             verbose)
    plotPerSourceEff(histosPerVar=histosPerSource,
                     outputDir=outputDir + '/by_source',
                     lepton=lepton,
                     region=region,
                     verbose=verbose)
    for g in group_names:
        hps = dict((v, histosPerSamplePerSource[v][g]) for v in vars)
        plotPerSourceEff(histosPerVar=hps,
                         outputDir=outputDir,
                         lepton=lepton,
                         region=region,
                         sample=g,
                         verbose=verbose)

    hn_sf_eta = histoname_sf_vs_eta(lepton)
    hn_sf_pt = histoname_sf_vs_pt(lepton)
    hn_da_eta = histoname_data_fake_eff_vs_eta(lepton)
    hn_da_pt = histoname_data_fake_eff_vs_pt(lepton)
    subtractReal = not keepreal
    objs_eta = subtractRealAndComputeScaleFactor(histosPerGroup, 'eta1',
                                                 hn_sf_eta, hn_da_eta,
                                                 outputDir, region,
                                                 subtractReal, verbose)
    objs_pt = subtractRealAndComputeScaleFactor(histosPerGroup, 'pt1',
                                                hn_sf_pt, hn_da_pt, outputDir,
                                                region, subtractReal, verbose)
    objs_pt_eta = subtractRealAndComputeScaleFactor(
        histosPerGroup, 'pt1_eta1', histoname_sf_vs_pt_eta(lepton),
        histoname_data_fake_eff_vs_pt_eta(lepton), outputDir, region,
        subtractReal, verbose)
    rootUtils.writeObjectsToFile(
        outputFileName, dictSum(dictSum(objs_eta, objs_pt), objs_pt_eta),
        verbose)
    if verbose: print "saved scale factors to %s" % outputFileName
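A note on the eval(options.tight_def) idiom: as the inline comment says, a
typo simply raises at startup, which is the desired fail-fast behavior. A more
restrictive variant, assuming the definitions live in fakeUtils.py imported as
fakeu (as the --tight-def help text suggests), would resolve attribute names
explicitly instead of evaluating arbitrary code:

import fakeUtils as fakeu

def resolve_tight_def(spec):
    # accept only 'fakeu.<name>' specs, e.g. 'fakeu.lepIsTight_std'
    prefix = 'fakeu.'
    if not spec.startswith(prefix):
        raise ValueError("unsupported tight def '%s'" % spec)
    return getattr(fakeu, spec[len(prefix):])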