Esempio n. 1
0
    def get_channel(self, hist_template, expr_or_clf, category, region,
                    cuts=None,
                    include_signal=True,
                    mass=125,
                    mode=None,
                    mixings=0.0,
                    clf=None,
                    min_score=None,
                    max_score=None,
                    systematics=True,
                    no_signal_fixes=False,
                    weighted=True):
        """
        Build one HistFactory channel for ``category`` in ``region``.

        The data sample and all background samples are always included.
        When ``include_signal`` is true, the samples returned by
        ``self.get_signals`` for ``mass``, ``mixings`` and ``mode`` are
        appended and the mass is encoded in the channel name
        (``mass`` may be a single value or a list of values).

        ``hist_template``, ``expr_or_clf``, ``cuts``, ``clf``, ``min_score``,
        ``max_score``, ``systematics``, ``no_signal_fixes`` and ``weighted``
        are forwarded unchanged to each sample's ``get_histfactory_sample``.

        Returns the object produced by ``histfactory.make_channel``, with the
        data sample passed as ``data`` and every other sample as the model.
        """
        # TODO: implement blinding
        log.info("constructing channels")
        all_samples = [self.data] + self.backgrounds
        name = 'hh_{0}_{1}'.format(self.year % 1000, category.name)
        suffix = None
        if include_signal:
            # several masses are joined with underscores; a single mass is
            # formatted as an integer
            suffix = (
                ('_' + '_'.join(map(str, mass)))
                if isinstance(mass, list)
                else ('_%d' % mass))
            name += suffix
            log.info("in get_channel signaling your higgle with mode {0} and mixing {1}".format(mode, mixings))
            all_samples += self.get_signals(
                mass=mass, mixing=mixings, mode=mode)

        # one HistFactory sample per analysis sample; the data sample stays
        # first. Higgs samples are never given the mass suffix.
        hf_samples = [
            sample.get_histfactory_sample(
                hist_template, expr_or_clf,
                category, region,
                cuts=cuts,
                clf=clf,
                min_score=min_score,
                max_score=max_score,
                suffix=None if isinstance(sample, Higgs) else suffix,
                no_signal_fixes=no_signal_fixes,
                systematics=systematics,
                weighted=weighted)
            for sample in all_samples]

        # create channel for this mass point
        data_sample, model_samples = hf_samples[0], hf_samples[1:]
        return histfactory.make_channel(
            name, model_samples, data=data_sample)
Esempio n. 2
0
    def get_channel(self, hist_template, expr_or_clf, category, region,
                    cuts=None,
                    include_signal=True,
                    mass=125,
                    mode=None,
                    clf=None,
                    min_score=None,
                    max_score=None,
                    systematics=True,
                    no_signal_fixes=False,
                    weighted=True):
        """
        Build one HistFactory channel for ``category`` in ``region``.

        Data and backgrounds are always part of the channel. When
        ``include_signal`` is true, ``self.get_signals(mass, mode)`` is
        appended and the mass (a single value or a list of values) becomes
        a suffix of the channel name.

        The remaining keyword arguments are handed as-is to each sample's
        ``get_histfactory_sample``.

        Returns the object produced by ``histfactory.make_channel``.
        """
        # TODO: implement blinding
        log.info("constructing channels")
        all_samples = [self.data] + self.backgrounds
        name = 'hh_{0}_{1}'.format(self.year % 1000, category.name)
        suffix = None
        if include_signal:
            if not isinstance(mass, list):
                suffix = '_%d' % mass
            else:
                suffix = '_' + '_'.join(str(m) for m in mass)
            name += suffix
            all_samples += self.get_signals(mass, mode)

        # shared keyword arguments for every sample; only ``suffix`` differs
        # between Higgs and non-Higgs samples
        common = dict(
            cuts=cuts,
            clf=clf,
            min_score=min_score,
            max_score=max_score,
            no_signal_fixes=no_signal_fixes,
            systematics=systematics,
            weighted=weighted)

        # create HistFactory samples (data first, then backgrounds/signals)
        hf_samples = [
            sample.get_histfactory_sample(
                hist_template, expr_or_clf,
                category, region,
                suffix=None if isinstance(sample, Higgs) else suffix,
                **common)
            for sample in all_samples]

        # create channel for this mass point
        return histfactory.make_channel(
            name, hf_samples[1:], data=hf_samples[0])
Esempio n. 3
0
    def get_channel(self, hist_template, expr_or_clf, category, region,
                    cuts=None,
                    include_signal=True,
                    mass=125,
                    mode=None,
                    clf=None,
                    min_score=None,
                    max_score=None,
                    systematics=True,
                    no_signal_fixes=False):
        """
        Build one HistFactory channel named after ``category``.

        Data and backgrounds are always included. With ``include_signal``
        the samples from ``self.get_signals(mass, mode)`` are added and the
        mass (single value or list) is appended to the channel name.

        Note that ``systematics`` is accepted but not used by this method.

        Returns the object produced by ``histfactory.make_channel``.
        """
        # TODO: implement blinding
        log.info("constructing channels")
        all_samples = [self.data] + self.backgrounds
        name = category.name
        suffix = None
        if include_signal:
            mass_tag = (
                '_'.join(map(str, mass))
                if isinstance(mass, list)
                else '%d' % mass)
            suffix = '_' + mass_tag
            name += suffix
            all_samples += self.get_signals(mass, mode)

        # create HistFactory samples; Higgs samples get no suffix
        hf_samples = []
        for sample in all_samples:
            sample_suffix = None if isinstance(sample, Higgs) else suffix
            hf_samples.append(
                sample.get_histfactory_sample(
                    hist_template, expr_or_clf,
                    category, region,
                    cuts=cuts,
                    clf=clf,
                    min_score=min_score,
                    max_score=max_score,
                    suffix=sample_suffix,
                    no_signal_fixes=no_signal_fixes))

        # create channel for this mass point: first sample is the data
        data_sample = hf_samples[0]
        return histfactory.make_channel(
            name, hf_samples[1:], data=data_sample)
Esempio n. 4
0
    def clf_channels(self, clf,
                     category, region,
                     cuts=None,
                     bins=10,
                     limits=None,
                     mass=None,
                     mode=None,
                     systematics=True,
                     unblind=False,
                     hybrid_data=False,
                     no_signal_fixes=False,
                     uniform=False,
                     mva=False,
                     min_score=None,
                     max_score=None,
                     include_signal=True):
        """
        Build the classifier-score HistFactory channel for one mass point.

        Parameters
        ----------
        bins : int or sequence
            Number of uniform bins, or explicit bin edges.
        limits : (low, high) or None
            Range used with an integer ``bins``; defaults to the score range.
        min_score, max_score :
            Score range; taken from the computed scores when None.
        unblind : bool, int or float
            ``False`` zeroes every data bin, an int N zeroes the data from
            index ``-(N + 1)`` onwards, a float zeroes the data from the bin
            found for the ``efficiency_cut`` of the summed signal histogram,
            ``True`` leaves the data untouched.
        hybrid_data : bool
            When true (and ``unblind`` is not ``True``), the blinded data bins
            are overwritten with the sum of background and signal histograms.
        include_signal : bool
            Whether signal samples are added to the channel.

        Returns
        -------
        (scores_obj, channel) : the object returned by ``self.get_scores``
        and the channel returned by ``histfactory.make_channel``.
        """
        log.info("constructing channels")

        # determine min and max scores
        scores_obj = self.get_scores(
            clf, category, region, cuts=cuts,
            masses=[mass], mode=mode,
            systematics=systematics,
            unblind=unblind)

        data_scores = scores_obj.data_scores
        bkg_scores = scores_obj.bkg_scores
        all_sig_scores = scores_obj.all_sig_scores
        if min_score is None:
            min_score = scores_obj.min_score
        if max_score is None:
            max_score = scores_obj.max_score

        # binning shared (via Clone) by every sample of the channel
        if not isinstance(bins, int):
            # explicit bin edges
            if bins[0] > min_score:
                log.warning("min score is less than first edge "
                            "(will be underflow)")
            if bins[-1] <= max_score:
                log.warning("max score is greater than or equal to last edge "
                            "(will be overflow)")
            binning = Hist(bins, type='D')
        elif limits is None:
            binning = Hist(bins, min_score, max_score, type='D')
        else:
            low, high = limits
            binning = Hist(bins, low, high, type='D')

        # background HistFactory samples
        bkg_samples = []
        for bkg, bkg_sample_scores in bkg_scores:
            bkg_template = binning.Clone(title=bkg.label, **bkg.hist_decor)
            bkg_samples.append(bkg.get_histfactory_sample(
                bkg_template, clf,
                category, region,
                min_score=min_score, max_score=max_score,
                cuts=cuts, scores=bkg_sample_scores,
                systematics=systematics,
                uniform=uniform,
                mva=mva))

        data_sample = None
        if data_scores is not None:
            data_template = binning.Clone(
                title=self.data.label,
                **self.data.hist_decor)
            data_sample = self.data.get_histfactory_sample(
                data_template, clf,
                category, region,
                cuts=cuts, scores=data_scores,
                uniform=uniform)
            if unblind is False:
                # blind full histogram
                data_sample.hist[:] = (0, 0)
            elif (unblind is not True) and isinstance(unblind, int):
                # blind highest N bins
                data_sample.hist[-(unblind + 1):] = (0, 0)
            elif isinstance(unblind, float):
                # blind above a signal efficiency
                signal_total = sum(
                    histogram_scores(data_template, sig_sample_scores)
                    for _, sig_sample_scores in all_sig_scores[mass])
                max_unblind_score = efficiency_cut(signal_total, unblind)
                blind_bin = data_template.FindBin(max_unblind_score)
                data_sample.hist[blind_bin:] = (0, 0)

        # create signal HistFactory samples
        sig_samples = []
        if include_signal:
            for sig, sig_sample_scores in all_sig_scores[mass]:
                sig_template = binning.Clone(title=sig.label, **sig.hist_decor)
                sig_samples.append(sig.get_histfactory_sample(
                    sig_template, clf,
                    category, region,
                    cuts=cuts, scores=sig_sample_scores,
                    no_signal_fixes=no_signal_fixes,
                    systematics=systematics,
                    uniform=uniform,
                    mva=mva))

        # replace data in blind bins with signal + background
        if hybrid_data and unblind is not True:
            prediction = sum(
                sample.hist for sample in (bkg_samples + sig_samples))
            if unblind is False:
                # replace full hist
                data_sample.hist[:] = prediction[:]
            elif isinstance(unblind, int):
                # replace highest N bins
                first_blind = -(unblind + 1)
                data_sample.hist[first_blind:] = prediction[first_blind:]
            elif isinstance(unblind, float):
                # blind_bin was determined while blinding the data above
                data_sample.hist[blind_bin:] = prediction[blind_bin:]

        # create channel for this mass point
        channel_name = 'hh_{0}_{1}_{2}'.format(
            self.year % 1000, category.name, mass)
        channel = histfactory.make_channel(
            channel_name,
            bkg_samples + sig_samples,
            data=data_sample)

        return scores_obj, channel
Esempio n. 5
0
    def get_channel_array(self, vars,
                          category, region,
                          cuts=None,
                          include_signal=True,
                          mass=125,
                          mixings=0.0,
                          mode=None,
                          scale_125=False,
                          clf=None,
                          min_score=None,
                          max_score=None,
                          weighted=True,
                          templates=None,
                          field_scale=None,
                          weight_hist=None,
                          systematics=True,
                          no_signal_fixes=False,
                          bootstrap_data=False,
                          ravel=True,
                          uniform=False,
                          hybrid_data=None):
        """
        Return a dictionary of histfactory channels for different variables
        (i.e. {'MMC_MASS':channel1, ...}).

        Parameters
        ----------
        vars: dict
            dictionary of histograms (i.e. {'MMC_MASS':hist_template, ...}
        category: Category
            analysis category (see mva/categories/*)
        region: str
            analysis region (i.e 'OS_ISOL', ...)
        cuts : str or Cut
            additional cuts that could be placed when requesting the channel
            array (See mva/categories/common.py for examples)
        include_signal : bool
            if True, the samples returned by ``self.get_signals`` for
            ``mass``, ``mixings``, ``mode`` and ``scale_125`` are added and
            the mass is appended to the channel names.
        mass : int or list
            mass hypothesis (or list of hypotheses) used for the signal.
        templates :
            forwarded to each sample's ``get_field_hist``.
        hybrid_data : dict
            if specified, it is a dictionary mapping the vars key to a tuple
            specifying the range to be replaced by s+b prediction.

        All remaining keyword arguments are forwarded unchanged to each
        sample's ``get_histfactory_sample_array``.

        Returns
        -------
        dict
            maps every key of ``vars`` to the channel built by
            ``histfactory.make_channel`` for that field.
        """
        log.info("in get channel array {0}".format(str(mixings)))
        log.info("with mode {0}".format(str(mode)))
        # TODO: implement blinding
        log.info("constructing channels")
        samples = [self.data] + self.backgrounds
        channel_name = 'hh_{0}_{1}'.format(self.year % 1000, category.name)
        suffix = None
        if include_signal:
            if isinstance(mass, list):
                suffix = '_' + ('_'.join(map(str, mass)))
            else:
                suffix = '_%d' % mass
            channel_name += suffix
            log.info("about to get signals with mixing of {0}".format(mixings))
            samples += self.get_signals(
                mass=mass, mixing=mixings, mode=mode, scale_125=scale_125)

        # create HistFactory samples (data first, then backgrounds/signals)
        histfactory_samples = []
        for s in samples:
            field_hist, _ = s.get_field_hist(
                vars, category, templates=templates)
            field_sample = s.get_histfactory_sample_array(
                field_hist,
                category, region,
                cuts=cuts,
                clf=clf,
                min_score=min_score,
                max_score=max_score,
                weighted=weighted,
                field_scale=field_scale,
                weight_hist=weight_hist,
                systematics=systematics,
                # Higgs samples never receive the mass suffix
                suffix=suffix if not isinstance(s, Higgs) else None,
                no_signal_fixes=no_signal_fixes,
                bootstrap_data=bootstrap_data,
                ravel=ravel,
                uniform=uniform)
            histfactory_samples.append(field_sample)

            if isinstance(s, Higgs):
                log.info("got a higgle signal")

        field_channels = {}
        for field in vars:
            # create channel for this mass point
            channel = histfactory.make_channel(
                channel_name + '_{0}'.format(field),
                [s[field] for s in histfactory_samples[1:]],
                data=histfactory_samples[0][field])

            # implement hybrid data if requested
            # TODO: clean up
            hybrid_range = None
            if isinstance(hybrid_data, dict):
                log.info('constructing hybrid data')
                hybrid_range = hybrid_data.get(field)
            if isinstance(hybrid_range, (list, tuple)):
                log.info('hybrid data: replacing data by s+b '
                         'prediction for {0} in range {1}'.format(
                             field, hybrid_range))
                # NOTE(review): a malformed range is only logged; processing
                # continues with the first two entries (or fails with an
                # IndexError if fewer than two are given).
                if len(hybrid_range) != 2:
                    log.error('hybrid data: Need to specify a '
                              'range with only two edged')
                replace_low, replace_high = hybrid_range[0], hybrid_range[1]
                data_hist = self.data.get_field_hist(vars, category)[0][field]
                log.info('hybrid data: template binning {0}'.format(
                    list(data_hist.xedges())))
                # Range of bins to be replaced, widened by one bin index on
                # each side for safety.
                # NOTE(review): confirm whether the slice end is inclusive
                # for this histogram type.
                first_bin = data_hist.FindBin(float(replace_low)) - 1
                stop_bin = data_hist.FindBin(float(replace_high)) + 1
                total_bkg_sig = sum([s.hist for s in channel.samples])
                log.info('hybrid data: before --> {0}'.format(
                    list(channel.data.hist.y())))
                channel.data.hist[first_bin:stop_bin] = \
                    total_bkg_sig[first_bin:stop_bin]
                log.info('hybrid data: after --> {0}'.format(
                    list(channel.data.hist.y())))
            field_channels[field] = channel
        return field_channels
Esempio n. 6
0
    def clf_channels(self, clf,
                     category, region,
                     cuts=None,
                     bins=10,
                     mass_points=None,
                     mode=None,
                     systematics=True,
                     unblind=False,
                     hybrid_data=False,
                     no_signal_fixes=False):
        """
        Return a HistFactory Channel for each mass hypothesis

        Parameters
        ----------
        bins : int
            number of bins between the min and max score reported by
            ``self.get_scores``.
        mass_points : iterable or None
            mass hypotheses to build channels for. If None, a single
            background-only channel is built.
        unblind : bool or float
            forwarded to ``self.get_scores``. If it is a float, the data
            sample is limited (``max_score``) to the ``efficiency_cut`` of the
            summed 125 GeV signal score histogram at that value.
        hybrid_data : bool
            currently unused (filling blinded bins with S+B is not
            implemented here).

        Returns
        -------
        (scores_obj, channel) when ``mass_points`` is None, otherwise
        (scores_obj, channels) where ``channels`` maps mass -> channel.
        """
        log.info("constructing channels")

        scores_obj = self.get_scores(
            clf, category, region, cuts=cuts,
            mass_points=mass_points, mode=mode,
            systematics=systematics,
            unblind=unblind)

        data_scores = scores_obj.data_scores
        bkg_scores = scores_obj.bkg_scores
        all_sig_scores = scores_obj.all_sig_scores
        min_score = scores_obj.min_score
        max_score = scores_obj.max_score

        def make_template(sample):
            # histogram template carrying the sample's label and decoration
            return Hist(
                bins, min_score, max_score,
                title=sample.label,
                type='D',
                **sample.hist_decor)

        bkg_samples = []
        for s, scores in bkg_scores:
            sample = s.get_histfactory_sample(
                make_template(s), clf,
                category, region,
                cuts=cuts, scores=scores)
            bkg_samples.append(sample)

        data_sample = None
        if data_scores is not None:
            max_unblind_score = None
            if isinstance(unblind, float):
                # Use a dedicated template with the channel binning instead
                # of whatever template the background loop happened to leave
                # behind (which does not exist when there are no backgrounds).
                # NOTE(review): the reference signal mass is hard-coded to
                # 125; this requires all_sig_scores to contain that key.
                score_template = Hist(bins, min_score, max_score, type='D')
                max_unblind_score = efficiency_cut(
                    sum([histogram_scores(score_template, scores)
                         for s, scores in all_sig_scores[125]]), unblind)
            data_sample = self.data.get_histfactory_sample(
                make_template(self.data), clf,
                category, region,
                cuts=cuts, scores=data_scores,
                max_score=max_unblind_score)
            # TODO: when blinded and hybrid_data is requested, fill the
            # blinded bins with S+B, for limit/p0 plots.
            # Swagato:
            # We have to make 2 kinds of expected sensitivity plots:
            # blinded sensitivity and unblinded sensitivity.
            # For the first one pure AsimovData is used, for second one I
            # suggest to use Hybrid, because the profiled NP's are not
            # always at 0 pull.

        if mass_points is None:
            # create channel without signal
            channel = histfactory.make_channel(
                category.name,
                bkg_samples,
                data=data_sample)
            return scores_obj, channel

        # signal scores
        channels = dict()
        for mass in samples.Higgs.MASS_POINTS:
            if mass not in mass_points:
                continue
            log.info('=' * 20)
            log.info("%d GeV mass hypothesis" % mass)

            # create HistFactory samples
            sig_samples = []
            for s, scores in all_sig_scores[mass]:
                sample = s.get_histfactory_sample(
                    make_template(s), clf,
                    category, region,
                    cuts=cuts, scores=scores,
                    no_signal_fixes=no_signal_fixes)
                sig_samples.append(sample)

            # create channel for this mass point
            channel = histfactory.make_channel(
                "%s_%d" % (category.name, mass),
                bkg_samples + sig_samples,
                data=data_sample)
            channels[mass] = channel

        return scores_obj, channels
Esempio n. 7
0
    def get_channel_array(self, vars,
                          category, region,
                          cuts=None,
                          include_signal=True,
                          mass=125,
                          mode=None,
                          scale_125=False,
                          clf=None,
                          min_score=None,
                          max_score=None,
                          weighted=True,
                          templates=None,
                          field_scale=None,
                          weight_hist=None,
                          systematics=True,
                          no_signal_fixes=False,
                          bootstrap_data=False,
                          ravel=True,
                          uniform=False):
        """
        Return a dict mapping each key of ``vars`` to a HistFactory channel.

        Data and backgrounds are always included. With ``include_signal`` the
        samples from ``self.get_signals(mass, mode, scale_125=scale_125)`` are
        added and the mass (single value or list) is appended to the channel
        names. ``templates`` is forwarded to each sample's ``get_field_hist``;
        the other keyword arguments are forwarded to
        ``get_histfactory_sample_array``.
        """
        # TODO: implement blinding
        log.info("constructing channels")
        all_samples = [self.data] + self.backgrounds
        base_name = category.name
        suffix = None
        if include_signal:
            suffix = (
                ('_' + '_'.join(map(str, mass)))
                if isinstance(mass, list)
                else ('_%d' % mass))
            base_name += suffix
            all_samples += self.get_signals(mass, mode, scale_125=scale_125)

        # create HistFactory samples; the data sample stays first and Higgs
        # samples never receive the mass suffix
        per_sample_fields = []
        for sample in all_samples:
            hists = sample.get_field_hist(vars, category, templates=templates)
            per_sample_fields.append(
                sample.get_histfactory_sample_array(
                    hists,
                    category, region,
                    cuts=cuts,
                    clf=clf,
                    min_score=min_score,
                    max_score=max_score,
                    weighted=weighted,
                    field_scale=field_scale,
                    weight_hist=weight_hist,
                    systematics=systematics,
                    suffix=None if isinstance(sample, Higgs) else suffix,
                    no_signal_fixes=no_signal_fixes,
                    bootstrap_data=bootstrap_data,
                    ravel=ravel,
                    uniform=uniform))

        data_fields = per_sample_fields[0]
        model_fields = per_sample_fields[1:]

        # one channel per field for this mass point
        field_channels = {}
        for field in vars.keys():
            field_channels[field] = histfactory.make_channel(
                base_name + '_{0}'.format(field),
                [fields[field] for fields in model_fields],
                data=data_fields[field])
        return field_channels
Esempio n. 8
0
def optimized_channels(clf,
                       category,
                       region,
                       backgrounds,
                       data=None,
                       cuts=None,
                       mass_points=None,
                       mu=1.,
                       systematics=True,
                       lumi_rel_error=0.,
                       algo='EvenBinningByLimit'):
    """
    Return optimally binned HistFactory Channels for each mass hypothesis

    Determine the binning from the 125 GeV signal scores
    (``all_sig_scores[125]``). Then construct and return the channels for all
    requested mass hypotheses using that binning.

    Parameters
    ----------
    clf, category, region, backgrounds, data, cuts, mass_points, mu,
    systematics :
        forwarded to ``get_scores``; ``clf``, ``category``, ``region`` and
        ``cuts`` are also forwarded to every ``get_histfactory_sample`` call.
    mass_points : iterable or None
        masses to build channels for; None means every mass in
        ``Higgs.MASSES``.
    lumi_rel_error : float
        forwarded to ``get_limit`` (only used by 'EvenBinningByLimit').
    algo : str
        'EvenBinningByLimit': scan uniform binnings with 2 to 49 bins, keep
        the one with the smallest expected limit and save a plot of limit
        versus number of bins to 'category_<name>_limit_vs_nbins.png'.
        'UnevenBinningBySignificance': fill 200-bin signal and background
        histograms over [-1, 1] and pass them to ``optimize_binning``.
        Any other value prints an error and exits with status 1.

    Returns
    -------
    dict
        maps each mass to the channel built by ``histfactory.make_channel``.
    """
    log.info("constructing optimized channels")

    scores_obj = get_scores(clf,
                            category,
                            region,
                            backgrounds,
                            data=data,
                            cuts=cuts,
                            mass_points=mass_points,
                            mu=mu,
                            systematics=systematics)

    data_scores = scores_obj.data_scores
    bkg_scores = scores_obj.bkg_scores
    all_sig_scores = scores_obj.all_sig_scores
    min_score = scores_obj.min_score
    max_score = scores_obj.max_score

    # the binning is always optimized against the 125 GeV signal
    sig_scores = all_sig_scores[125]

    best_hist_template = None
    if algo == 'EvenBinningByLimit':
        limit_hists = []
        best_limit = float('inf')
        best_nbins = 0
        nbins_range = xrange(2, 50)

        # brute-force scan: build a full channel and compute the limit for
        # every candidate number of bins
        for nbins in nbins_range:

            hist_template = Hist(nbins, min_score, max_score, type='D')

            # create HistFactory samples
            samples = []
            for s, scores in bkg_scores + sig_scores:
                sample = s.get_histfactory_sample(hist_template,
                                                  clf,
                                                  category,
                                                  region,
                                                  cuts=cuts,
                                                  scores=scores)
                samples.append(sample)

            data_sample = None
            if data is not None:
                data_sample = data.get_histfactory_sample(hist_template,
                                                          clf,
                                                          category,
                                                          region,
                                                          cuts=cuts,
                                                          scores=data_scores)

            # create channel for this mass point
            channel = histfactory.make_channel("%s_%d" % (category.name, 125),
                                               samples,
                                               data=data_sample)

            # get limit
            limit_hist = get_limit(channel, lumi_rel_error=lumi_rel_error)
            # NOTE(review): the name is formatted from ``category`` itself
            # here, while the channel name above uses ``category.name``
            limit_hist.SetName("%s_%d_%d" % (category, 125, nbins))

            # is this better than the best limit so far?
            hist_dict = hist_to_dict(limit_hist)
            limit_hists.append(hist_dict)
            if hist_dict['Expected'] < best_limit:
                best_limit = hist_dict['Expected']
                best_nbins = nbins
                best_hist_template = hist_template

        # plot limit vs nbins
        fig = plt.figure()
        ax = fig.add_subplot(111)
        central_values = np.array([h['Expected'] for h in limit_hists])
        high_values_1sig = np.array([h['+1sigma'] for h in limit_hists])
        low_values_1sig = np.array([h['-1sigma'] for h in limit_hists])
        high_values_2sig = np.array([h['+2sigma'] for h in limit_hists])
        low_values_2sig = np.array([h['-2sigma'] for h in limit_hists])
        plt.plot(nbins_range, central_values, 'k-')
        # 2-sigma band is drawn first so the 1-sigma band is painted on top
        plt.fill_between(nbins_range,
                         low_values_2sig,
                         high_values_2sig,
                         linewidth=0,
                         facecolor='yellow')
        plt.fill_between(nbins_range,
                         low_values_1sig,
                         high_values_1sig,
                         linewidth=0,
                         facecolor='green')
        plt.xlim(nbins_range[0], nbins_range[-1])
        plt.xlabel("Number of Bins")
        plt.ylabel("Limit")
        plt.grid(True)
        plt.text(.5,
                 .8,
                 "Best limit of %.2f at %d bins" % (best_limit, best_nbins),
                 horizontalalignment='center',
                 verticalalignment='center',
                 transform=ax.transAxes,
                 fontsize=20)
        plt.savefig('category_%s_limit_vs_nbins.png' % category.name)

    elif algo == 'UnevenBinningBySignificance':
        #hist_template = Hist(200, min_score, max_score)
        # fixed fine binning over [-1, 1] rather than the observed score range
        hist_template = Hist(200, -1.0, 1.0, type='D')

        # total signal histogram: nominal scores fill the histogram itself,
        # every other term fills a per-term histogram in ``systematics``
        sig_hist = hist_template.Clone(title='Signal')
        sig_hist.systematics = {}
        for sig, scores_dict in sig_scores:
            scores, weight = scores_dict['NOMINAL']
            sig_hist.fill_array(scores, weight)
            for sys_term in scores_dict.keys():
                if sys_term == 'NOMINAL':
                    continue
                if not sys_term in sig_hist.systematics:
                    sys_hist = hist_template.Clone()
                    sig_hist.systematics[sys_term] = sys_hist
                else:
                    sys_hist = sig_hist.systematics[sys_term]
                scores, weight = scores_dict[sys_term]
                sys_hist.fill_array(scores, weight)

        # total background histogram, built the same way as the signal one
        bkg_hist = hist_template.Clone(title='Background')
        bkg_hist.systematics = {}
        for bkg, scores_dict in bkg_scores:
            scores, weight = scores_dict['NOMINAL']
            bkg_hist.fill_array(scores, weight)
            for sys_term in scores_dict.keys():
                if sys_term == 'NOMINAL':
                    continue
                if not sys_term in bkg_hist.systematics:
                    sys_hist = hist_template.Clone()
                    bkg_hist.systematics[sys_term] = sys_hist
                else:
                    sys_hist = bkg_hist.systematics[sys_term]
                scores, weight = scores_dict[sys_term]
                sys_hist.fill_array(scores, weight)

        print "SIG entries:", sig_hist.GetEntries()
        print "BKG entries:", bkg_hist.GetEntries()
        sig_hist, bkg_hist, best_hist_template = optimize_binning(
            sig_hist,
            bkg_hist,
            #starting_point='fine'
            starting_point='merged')
        # fall back to the fine template when no optimized template is returned
        if best_hist_template is None:
            best_hist_template = hist_template
        #raw_input("Hit enter to continue...")
    else:
        # unknown algorithm: report on stdout and terminate the process
        print "ERROR: binning optimisation algo %s not in list!" % algo
        exit(1)

    hist_template = best_hist_template
    channels = dict()

    # create HistFactory samples
    bkg_samples = []
    for s, scores in bkg_scores:
        sample = s.get_histfactory_sample(hist_template,
                                          clf,
                                          category,
                                          region,
                                          cuts=cuts,
                                          scores=scores)
        bkg_samples.append(sample)

    # NOTE(review): this checks ``data_scores`` whereas the scan above checks
    # ``data``; ``data`` must not be None whenever ``data_scores`` is set
    data_sample = None
    if data_scores is not None:
        data_sample = data.get_histfactory_sample(hist_template,
                                                  clf,
                                                  category,
                                                  region,
                                                  cuts=cuts,
                                                  scores=data_scores)

    # now use the optimal binning and construct channels for all requested mass
    # hypotheses
    for mass in Higgs.MASSES:
        if mass_points is not None and mass not in mass_points:
            continue
        log.info('=' * 20)
        log.info("%d GeV mass hypothesis" % mass)

        # create HistFactory samples
        sig_samples = []
        for s, scores in all_sig_scores[mass]:
            sample = s.get_histfactory_sample(hist_template,
                                              clf,
                                              category,
                                              region,
                                              cuts=cuts,
                                              scores=scores)
            sig_samples.append(sample)

        # create channel for this mass point; backgrounds and data are shared
        # between all mass points
        channel = histfactory.make_channel("%s_%d" % (category.name, mass),
                                           bkg_samples + sig_samples,
                                           data=data_sample)

        channels[mass] = channel
    return channels
Esempio n. 9
0
    def clf_channels(self, clf,
                     category, region,
                     cuts=None,
                     bins=10,
                     limits=None,
                     mass=None,
                     mode=None,
                     systematics=True,
                     unblind=False,
                     hybrid_data=False,
                     no_signal_fixes=False,
                     uniform=False,
                     mva=False):
        """
        Return the classifier scores and the HistFactory Channel built from
        them for a single mass hypothesis.

        Parameters
        ----------
        clf :
            classifier handed to ``get_scores`` and to every
            ``get_histfactory_sample`` call
        category, region :
            analysis category and region, forwarded to the samples
        cuts :
            additional cuts, forwarded to ``get_scores`` and to the samples
        bins : int or sequence
            number of bins, or the bin edges themselves
        limits : tuple
            ``(low, high)`` histogram range; only used when ``bins`` is an
            int. When omitted the (min_score, max_score) range reported by
            ``get_scores`` is used.
        mass :
            mass hypothesis whose signal scores are histogrammed
        mode :
            forwarded to ``get_scores``
        systematics : bool
            forwarded to ``get_scores`` and to the background and signal
            samples
        unblind : bool, int or float
            ``False`` zeroes the whole data histogram, ``True`` leaves it
            untouched, an int N zeroes the slice ``[-(N + 1):]`` and a float
            zeroes every bin from the one returned by ``efficiency_cut`` on
            the summed signal histogram upwards.
        hybrid_data : bool
            if true, the blinded part of the data histogram is replaced by
            the sum of the background and signal histograms
        no_signal_fixes, uniform, mva :
            forwarded to ``get_histfactory_sample``

        Returns
        -------
        (scores_obj, channel) : the object returned by ``get_scores`` and the
        channel made from the background and signal samples (with the data
        sample attached when data scores are available).
        """
        log.info("constructing channels")

        # determine min and max scores
        scores_obj = self.get_scores(
            clf, category, region, cuts=cuts,
            masses=[mass], mode=mode,
            systematics=systematics,
            unblind=unblind)

        data_scores = scores_obj.data_scores
        bkg_scores = scores_obj.bkg_scores
        all_sig_scores = scores_obj.all_sig_scores
        min_score = scores_obj.min_score
        max_score = scores_obj.max_score

        if isinstance(bins, int):
            if limits is not None:
                low, high = limits
                binning = Hist(bins, low, high, type='D')
            else:
                binning = Hist(bins, min_score, max_score, type='D')
        else: # iterable of bin edges
            if bins[0] > min_score:
                log.warning("min score is less than first edge "
                            "(will be underflow)")
            if bins[-1] <= max_score:
                log.warning("max score is greater than or equal to last edge "
                            "(will be overflow)")
            binning = Hist(bins, type='D')

        bkg_samples = []
        for s, scores in bkg_scores:
            hist_template = binning.Clone(
                title=s.label,
                **s.hist_decor)
            sample = s.get_histfactory_sample(
                hist_template, clf,
                category, region,
                cuts=cuts, scores=scores,
                systematics=systematics,
                uniform=uniform,
                mva=mva)
            bkg_samples.append(sample)

        data_sample = None
        blind_bin = None
        if data_scores is not None:
            hist_template = binning.Clone(
                title=self.data.label,
                **self.data.hist_decor)
            data_sample = self.data.get_histfactory_sample(
                hist_template, clf,
                category, region,
                cuts=cuts, scores=data_scores,
                uniform=uniform)
            if unblind is False:
                # blind full histogram
                data_sample.hist[:] = (0, 0)
            elif (unblind is not True) and isinstance(unblind, int):
                # blind highest N bins
                # NOTE(review): the slice also covers the last index,
                # presumably the overflow bin -- confirm
                data_sample.hist[-(unblind + 1):] = (0, 0)
            elif isinstance(unblind, float):
                # blind above a signal efficiency
                max_unblind_score = efficiency_cut(
                    sum([histogram_scores(hist_template, scores)
                        for s, scores in all_sig_scores[mass]]), unblind)
                blind_bin = hist_template.FindBin(max_unblind_score)
                data_sample.hist[blind_bin:] = (0, 0)

        # create signal HistFactory samples
        sig_samples = []
        for s, scores in all_sig_scores[mass]:
            hist_template = binning.Clone(
                title=s.label,
                **s.hist_decor)
            sample = s.get_histfactory_sample(
                hist_template, clf,
                category, region,
                cuts=cuts, scores=scores,
                no_signal_fixes=no_signal_fixes,
                systematics=systematics,
                uniform=uniform,
                mva=mva)
            sig_samples.append(sample)

        # replace data in blind bins with signal + background
        if hybrid_data and (unblind is not True):
            if data_sample is None:
                # without data scores there is no data histogram to patch;
                # dereferencing it would fail on None
                log.warning("hybrid data requested but no data scores are "
                            "available: channel is built without data")
            else:
                sum_sig_bkg = sum(
                    [s.hist for s in (bkg_samples + sig_samples)])
                if unblind is False:
                    # replace full hist
                    data_sample.hist[:] = sum_sig_bkg[:]
                elif isinstance(unblind, int):
                    # replace highest N bins (same slice that was blinded)
                    first_blind = -(unblind + 1)
                    data_sample.hist[first_blind:] = sum_sig_bkg[first_blind:]
                elif isinstance(unblind, float):
                    # same bin that was computed when blinding above
                    data_sample.hist[blind_bin:] = sum_sig_bkg[blind_bin:]

        # create channel for this mass point
        channel = histfactory.make_channel(
            'hh_{0}_{1}_{2}'.format(self.year % 1000, category.name, mass),
            bkg_samples + sig_samples,
            data=data_sample)

        return scores_obj, channel
Esempio n. 10
0
    def get_channel_array(self, vars,
                          category, region,
                          cuts=None,
                          include_signal=True,
                          mass=125,
                          mode=None,
                          scale_125=False,
                          clf=None,
                          min_score=None,
                          max_score=None,
                          weighted=True,
                          templates=None,
                          field_scale=None,
                          weight_hist=None,
                          systematics=True,
                          no_signal_fixes=False,
                          bootstrap_data=False,
                          ravel=True,
                          uniform=False,
                          hybrid_data=None):
        """
        Return a dictionary of histfactory channels for different variables
        (i.e. {'MMC_MASS':channel1, ...}).

        Parameters
        ----------
        vars: dict
            dictionary of histograms (i.e. {'MMC_MASS':hist_template, ...}
        category: Category
            analysis category (see mva/categories/*)
        region: str
            analysis region (i.e 'OS_ISOL', ...)
        cuts : str or Cut
            additional cuts that could be place when requesting the channel
            array (See mva/categories/common.py for examples)
        include_signal : bool
            if true, the samples returned by ``self.get_signals`` are added
            and the mass (or '_'-joined list of masses) is appended to the
            channel name
        mass : int or list
            mass hypothesis (or hypotheses) passed to ``self.get_signals``
        mode, scale_125 :
            forwarded to ``self.get_signals``
        templates :
            forwarded to ``get_field_hist`` for every sample, and for the
            data template used to locate the hybrid-data bins
        hybrid_data : dict
            if specified, it is a dictionary mapping the vars key to a tuple
            specifying the range to be replaced by s+b prediction.

        All remaining keyword arguments are forwarded unchanged to
        ``get_histfactory_sample_array``.

        Returns
        -------
        dict mapping each key of ``vars`` to its channel.

        Raises
        ------
        ValueError
            if a ``hybrid_data`` range does not have exactly two edges.
        """
        # TODO: implement blinding
        log.info("constructing channels")
        samples = [self.data] + self.backgrounds
        channel_name = 'hh_{0}_{1}'.format(self.year % 1000, category.name)
        suffix = None
        if include_signal:
            if isinstance(mass, list):
                suffix = '_' + ('_'.join(map(str, mass)))
            else:
                suffix = '_%d' % mass
            channel_name += suffix
            samples += self.get_signals(mass, mode, scale_125=scale_125)

        # create HistFactory samples; index 0 is always the data sample
        histfactory_samples = []
        for s in samples:
            field_hist, _ = s.get_field_hist(
                vars, category, templates=templates)
            field_sample = s.get_histfactory_sample_array(
                field_hist,
                category, region,
                cuts=cuts,
                clf=clf,
                min_score=min_score,
                max_score=max_score,
                weighted=weighted,
                field_scale=field_scale,
                weight_hist=weight_hist,
                systematics=systematics,
                suffix=suffix if not isinstance(s, Higgs) else None,
                no_signal_fixes=no_signal_fixes,
                bootstrap_data=bootstrap_data,
                ravel=ravel,
                uniform=uniform)
            histfactory_samples.append(field_sample)

        field_channels = {}
        for field in vars.keys():
            # create channel for this mass point
            channel = histfactory.make_channel(
                channel_name + '_{0}'.format(field),
                [s[field] for s in histfactory_samples[1:]],
                data=histfactory_samples[0][field])
            field_channels[field] = channel

            # implement hybrid data if requested
            if not isinstance(hybrid_data, dict):
                continue
            log.info('constructing hybrid data')
            replace_range = hybrid_data.get(field)
            if not isinstance(replace_range, (list, tuple)):
                continue

            log.info('hybrid data: replacing data by s+b '
                     'prediction for {0} in range {1}'.format(
                        field, replace_range))
            if len(replace_range) != 2:
                # fail loudly: a short range cannot be unpacked and a longer
                # one would be silently truncated to its first two edges
                raise ValueError(
                    'hybrid data: range for {0} must have exactly two '
                    'edges, got {1!r}'.format(field, replace_range))
            replace_low, replace_high = replace_range

            # Use the same templates as the data histogram of the channel so
            # the bin indices found here refer to the same binning.
            data_template = self.data.get_field_hist(
                vars, category, templates=templates)[0][field]
            log.info('hybrid data: template binning {0}'.format(
                list(data_template.xedges())))

            # Get the range of bins to be replaced (add 1 additional bin on
            # both side for safety). The lower index is clamped at 0 so that
            # a lower edge in underflow does not become a negative index
            # counted from the end of the histogram.
            first_bin = max(
                0, data_template.FindBin(float(replace_low)) - 1)
            last_bin = data_template.FindBin(float(replace_high)) + 1

            total_bkg_sig = sum([s.hist for s in channel.samples])
            log.info('hybrid data: before --> {0}'.format(
                list(channel.data.hist.y())))
            channel.data.hist[first_bin:last_bin] = \
                total_bkg_sig[first_bin:last_bin]
            log.info('hybrid data: after --> {0}'.format(
                list(channel.data.hist.y())))
        return field_channels
Esempio n. 11
0
def _sum_score_hists(hist_template, title, sample_scores):
    """
    Fill one histogram with the nominal scores of every sample.

    ``sample_scores`` is an iterable of ``(sample, scores_dict)`` pairs where
    ``scores_dict`` maps a systematic term (including ``'NOMINAL'``) to a
    ``(scores, weight)`` pair. The returned histogram is a clone of
    ``hist_template``; its ``systematics`` attribute maps every non-nominal
    term to a histogram filled with the scores of that term.
    """
    total_hist = hist_template.Clone(title=title)
    total_hist.systematics = {}
    for _, scores_dict in sample_scores:
        scores, weight = scores_dict['NOMINAL']
        total_hist.fill_array(scores, weight)
        for sys_term in scores_dict.keys():
            if sys_term == 'NOMINAL':
                continue
            if sys_term not in total_hist.systematics:
                total_hist.systematics[sys_term] = hist_template.Clone()
            sys_scores, sys_weight = scores_dict[sys_term]
            total_hist.systematics[sys_term].fill_array(
                sys_scores, sys_weight)
    return total_hist


def _plot_limit_vs_nbins(category, nbins_range, limit_hists,
                         best_limit, best_nbins):
    """
    Save the expected limit and its 1/2 sigma bands versus the number of
    bins to ``category_<name>_limit_vs_nbins.png``.
    """
    fig = plt.figure()
    try:
        ax = fig.add_subplot(111)
        central_values = np.array([h['Expected'] for h in limit_hists])
        high_values_1sig = np.array([h['+1sigma'] for h in limit_hists])
        low_values_1sig = np.array([h['-1sigma'] for h in limit_hists])
        high_values_2sig = np.array([h['+2sigma'] for h in limit_hists])
        low_values_2sig = np.array([h['-2sigma'] for h in limit_hists])
        plt.plot(nbins_range, central_values, 'k-')
        plt.fill_between(nbins_range, low_values_2sig, high_values_2sig,
            linewidth=0, facecolor='yellow')
        plt.fill_between(nbins_range, low_values_1sig, high_values_1sig,
            linewidth=0, facecolor='green')
        plt.xlim(nbins_range[0], nbins_range[-1])
        plt.xlabel("Number of Bins")
        plt.ylabel("Limit")
        plt.grid(True)
        plt.text(.5, .8,
                 "Best limit of %.2f at %d bins" % (best_limit, best_nbins),
                 horizontalalignment='center',
                 verticalalignment='center',
                 transform = ax.transAxes,
                 fontsize=20)
        plt.savefig('category_%s_limit_vs_nbins.png' % category.name)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly
        plt.close(fig)


def optimized_channels(clf, category, region, backgrounds,
                       data=None, cuts=None, mass_points=None, mu=1.,
                       systematics=True, lumi_rel_error=0.,
                       algo='EvenBinningByLimit'):
    """
    Return optimally binned HistFactory Channels for each mass hypothesis

    Determine the binning at the 125 GeV mass hypothesis, then construct and
    return the channels for all requested mass hypotheses with that binning.

    algos:
      EvenBinningByLimit -- scan 2 to 49 even bins between the min and max
          score and keep the binning with the smallest expected limit; a
          plot of limit versus number of bins is saved as a side effect.
      UnevenBinningBySignificance -- start from 200 bins on [-1, 1] and let
          ``optimize_binning`` merge them; the 200-bin template is kept if
          it returns no template.

    Parameters
    ----------
    clf, category, region, backgrounds, data, cuts, mass_points, mu,
    systematics :
        forwarded to ``get_scores``; ``clf``, ``category``, ``region`` and
        ``cuts`` are also forwarded to every ``get_histfactory_sample`` call
    mass_points :
        if not None, only masses of ``Higgs.MASSES`` contained in it get a
        channel. The scores for mass 125 are always read.
    lumi_rel_error :
        forwarded to ``get_limit``
    algo : str
        one of the two algorithms listed above

    Returns
    -------
    dict mapping each requested mass to its channel.

    Raises
    ------
    ValueError
        if ``algo`` is not one of the known algorithms (checked before any
        score is computed).
    """
    known_algos = ('EvenBinningByLimit', 'UnevenBinningBySignificance')
    if algo not in known_algos:
        raise ValueError(
            "binning optimisation algo %s not in list!" % algo)

    log.info("constructing optimized channels")

    scores_obj = get_scores(clf, category, region, backgrounds,
                            data=data, cuts=cuts, mass_points=mass_points,
                            mu=mu, systematics=systematics)

    data_scores = scores_obj.data_scores
    bkg_scores = scores_obj.bkg_scores
    all_sig_scores = scores_obj.all_sig_scores
    min_score = scores_obj.min_score
    max_score = scores_obj.max_score

    # the binning is always optimized on this mass hypothesis
    reference_mass = 125
    sig_scores = all_sig_scores[reference_mass]

    best_hist_template = None
    if algo == 'EvenBinningByLimit':
        limit_hists = []
        best_limit = float('inf')
        best_nbins = 0
        nbins_range = xrange(2, 50)
        reference_scores = bkg_scores + sig_scores

        for nbins in nbins_range:

            hist_template = Hist(nbins, min_score, max_score, type='D')

            # create HistFactory samples
            samples = []
            for s, scores in reference_scores:
                sample = s.get_histfactory_sample(
                    hist_template, clf,
                    category, region,
                    cuts=cuts, scores=scores)
                samples.append(sample)

            data_sample = None
            if data is not None:
                data_sample = data.get_histfactory_sample(
                    hist_template, clf,
                    category, region,
                    cuts=cuts, scores=data_scores)

            # create channel for this mass point
            channel = histfactory.make_channel(
                "%s_%d" % (category.name, reference_mass),
                samples, data=data_sample)

            # get limit
            limit_hist = get_limit(channel,
                lumi_rel_error=lumi_rel_error)
            limit_hist.SetName(
                "%s_%d_%d" % (category.name, reference_mass, nbins))

            # is this better than the best limit so far?
            hist_dict = hist_to_dict(limit_hist)
            limit_hists.append(hist_dict)
            if hist_dict['Expected'] < best_limit:
                best_limit = hist_dict['Expected']
                best_nbins = nbins
                best_hist_template = hist_template

        # plot limit vs nbins
        _plot_limit_vs_nbins(category, nbins_range, limit_hists,
                             best_limit, best_nbins)

    else:  # 'UnevenBinningBySignificance'
        # fixed [-1, 1] range rather than the observed score range
        hist_template = Hist(200, -1.0, 1.0, type='D')

        sig_hist = _sum_score_hists(hist_template, 'Signal', sig_scores)
        bkg_hist = _sum_score_hists(hist_template, 'Background', bkg_scores)

        log.info("SIG entries: %s" % sig_hist.GetEntries())
        log.info("BKG entries: %s" % bkg_hist.GetEntries())
        sig_hist, bkg_hist, best_hist_template = optimize_binning(
            sig_hist, bkg_hist,
            starting_point='merged')
        if best_hist_template is None:
            best_hist_template = hist_template

    hist_template = best_hist_template
    channels = dict()

    # create HistFactory samples
    bkg_samples = []
    for s, scores in bkg_scores:
        sample = s.get_histfactory_sample(
            hist_template, clf,
            category, region,
            cuts=cuts, scores=scores)
        bkg_samples.append(sample)

    data_sample = None
    if data_scores is not None:
        data_sample = data.get_histfactory_sample(
            hist_template, clf,
            category, region,
            cuts=cuts, scores=data_scores)

    # now use the optimal binning and construct channels for all requested mass
    # hypotheses
    for mass in Higgs.MASSES:
        if mass_points is not None and mass not in mass_points:
            continue
        log.info('=' * 20)
        log.info("%d GeV mass hypothesis" % mass)

        # create HistFactory samples
        sig_samples = []
        for s, scores in all_sig_scores[mass]:
            sample = s.get_histfactory_sample(
                hist_template, clf,
                category, region,
                cuts=cuts, scores=scores)
            sig_samples.append(sample)

        # create channel for this mass point
        channel = histfactory.make_channel(
            "%s_%d" % (category.name, mass),
            bkg_samples + sig_samples,
            data=data_sample)

        channels[mass] = channel
    return channels