def get_channel(self, hist_template, expr_or_clf, category, region,
                cuts=None, include_signal=True, mass=125, mode=None,
                mixings=0.0, clf=None, min_score=None, max_score=None,
                systematics=True, no_signal_fixes=False, weighted=True):
    """
    Construct a single HistFactory channel for this category and region.

    The channel contains the data sample first, then all background
    samples, and (if ``include_signal``) the signal samples returned by
    ``self.get_signals``. Keyword arguments are forwarded to each
    sample's ``get_histfactory_sample``.

    Returns the channel built by ``histfactory.make_channel``.
    """
    # TODO: implement blinding
    log.info("constructing channels")
    samples = [self.data] + self.backgrounds
    channel_name = 'hh_{0}_{1}'.format(self.year % 1000, category.name)
    suffix = None
    if include_signal:
        # encode the mass hypothesis (or list of masses) in the name
        if isinstance(mass, list):
            suffix = '_' + ('_'.join(map(str, mass)))
        else:
            suffix = '_%d' % mass
        channel_name += suffix
        # fixed: replaced leftover garbled debug message
        # ("signaling your higgle ...") with a clear log line
        log.info("adding signal samples with mode {0} and mixing {1}".format(
            mode, mixings))
        samples += self.get_signals(mass=mass, mixing=mixings, mode=mode)
    # create HistFactory samples
    histfactory_samples = []
    for s in samples:
        sample = s.get_histfactory_sample(
            hist_template, expr_or_clf,
            category, region,
            cuts=cuts, clf=clf,
            min_score=min_score,
            max_score=max_score,
            # signal (Higgs) samples do not carry the mass suffix
            suffix=suffix if not isinstance(s, Higgs) else None,
            no_signal_fixes=no_signal_fixes,
            systematics=systematics,
            weighted=weighted)
        histfactory_samples.append(sample)
    # create channel for this mass point: data first, then MC samples
    return histfactory.make_channel(
        channel_name,
        histfactory_samples[1:],
        data=histfactory_samples[0])
def get_channel(self, hist_template, expr_or_clf, category, region,
                cuts=None, include_signal=True, mass=125, mode=None,
                clf=None, min_score=None, max_score=None,
                systematics=True, no_signal_fixes=False, weighted=True):
    """
    Build one HistFactory channel from data, backgrounds and (optionally)
    signal samples for the given category and region.

    Returns the channel built by ``histfactory.make_channel``, with the
    data sample separated out and all MC samples listed after it.
    """
    # TODO: implement blinding
    log.info("constructing channels")
    all_samples = [self.data] + self.backgrounds
    channel_name = 'hh_{0}_{1}'.format(self.year % 1000, category.name)
    suffix = None
    if include_signal:
        # the channel name carries the mass hypothesis (or list of masses)
        if isinstance(mass, list):
            suffix = '_' + ('_'.join(map(str, mass)))
        else:
            suffix = '_%d' % mass
        channel_name += suffix
        all_samples += self.get_signals(mass, mode)
    # one HistFactory sample per input sample; signal (Higgs) samples
    # are not given the mass suffix
    histfactory_samples = [
        s.get_histfactory_sample(
            hist_template, expr_or_clf,
            category, region,
            cuts=cuts, clf=clf,
            min_score=min_score,
            max_score=max_score,
            suffix=None if isinstance(s, Higgs) else suffix,
            no_signal_fixes=no_signal_fixes,
            systematics=systematics,
            weighted=weighted)
        for s in all_samples]
    # entry 0 is data; everything after it is MC
    return histfactory.make_channel(
        channel_name,
        histfactory_samples[1:],
        data=histfactory_samples[0])
def get_channel(self, hist_template, expr_or_clf, category, region,
                cuts=None, include_signal=True, mass=125, mode=None,
                clf=None, min_score=None, max_score=None,
                systematics=True, no_signal_fixes=False):
    """
    Build one HistFactory channel named after the category, containing
    data, backgrounds and (optionally) signal samples.
    """
    # TODO: implement blinding
    log.info("constructing channels")
    input_samples = [self.data] + self.backgrounds
    channel_name = category.name
    suffix = None
    if include_signal:
        # suffix encodes the mass hypothesis (or the list of masses)
        suffix = ('_' + '_'.join(map(str, mass))
                  if isinstance(mass, list) else '_%d' % mass)
        channel_name += suffix
        input_samples += self.get_signals(mass, mode)
    # build one HistFactory sample per input sample
    hf_samples = []
    for s in input_samples:
        hf_samples.append(s.get_histfactory_sample(
            hist_template, expr_or_clf,
            category, region,
            cuts=cuts, clf=clf,
            min_score=min_score,
            max_score=max_score,
            # signal (Higgs) samples do not carry the suffix
            suffix=None if isinstance(s, Higgs) else suffix,
            no_signal_fixes=no_signal_fixes))
    # data leads the list; everything after it is MC
    return histfactory.make_channel(
        channel_name,
        hf_samples[1:],
        data=hf_samples[0])
def clf_channels(self, clf, category, region, cuts=None, bins=10,
                 limits=None, mass=None, mode=None, systematics=True,
                 unblind=False, hybrid_data=False, no_signal_fixes=False,
                 uniform=False, mva=False, min_score=None, max_score=None,
                 include_signal=True):
    """
    Return a HistFactory Channel for each mass hypothesis

    ``unblind`` controls data blinding: False blinds the full histogram,
    an int N blinds the highest N bins, a float blinds everything above
    the score corresponding to that signal efficiency, and True leaves
    data untouched. If ``hybrid_data`` is set, blinded bins are refilled
    with the summed signal+background prediction.

    Returns (scores_obj, channel).
    """
    log.info("constructing channels")
    # determine min and max scores
    scores_obj = self.get_scores(
        clf, category, region, cuts=cuts,
        masses=[mass], mode=mode,
        systematics=systematics,
        unblind=unblind)
    data_scores = scores_obj.data_scores
    bkg_scores = scores_obj.bkg_scores
    all_sig_scores = scores_obj.all_sig_scores
    # caller-supplied score range takes precedence over the scores object
    if min_score is None:
        min_score = scores_obj.min_score
    if max_score is None:
        max_score = scores_obj.max_score
    # binning: either an even binning over (limits or score range),
    # or explicit bin edges passed as an iterable
    if isinstance(bins, int):
        if limits is not None:
            low, high = limits
            binning = Hist(bins, low, high, type='D')
        else:
            binning = Hist(bins, min_score, max_score, type='D')
    else:  # iterable of bin edges
        if bins[0] > min_score:
            log.warning("min score is less than first edge "
                        "(will be underflow)")
        if bins[-1] <= max_score:
            log.warning("max score is greater than or equal to last edge "
                        "(will be overflow)")
        binning = Hist(bins, type='D')
    bkg_samples = []
    for s, scores in bkg_scores:
        hist_template = binning.Clone(
            title=s.label, **s.hist_decor)
        sample = s.get_histfactory_sample(
            hist_template, clf,
            category, region,
            min_score=min_score,
            max_score=max_score,
            cuts=cuts, scores=scores,
            systematics=systematics,
            uniform=uniform,
            mva=mva)
        bkg_samples.append(sample)
    data_sample = None
    if data_scores is not None:
        hist_template = binning.Clone(
            title=self.data.label, **self.data.hist_decor)
        data_sample = self.data.get_histfactory_sample(
            hist_template, clf,
            category, region,
            cuts=cuts, scores=data_scores,
            uniform=uniform)
        if unblind is False:
            # blind full histogram
            data_sample.hist[:] = (0, 0)
        elif (unblind is not True) and isinstance(unblind, int):
            # blind highest N bins
            data_sample.hist[-(unblind + 1):] = (0, 0)
        elif isinstance(unblind, float):
            # blind above a signal efficiency
            max_unblind_score = efficiency_cut(
                sum([histogram_scores(hist_template, scores)
                     for s, scores in all_sig_scores[mass]]), unblind)
            blind_bin = hist_template.FindBin(max_unblind_score)
            data_sample.hist[blind_bin:] = (0, 0)
    # create signal HistFactory samples
    sig_samples = []
    if include_signal:
        for s, scores in all_sig_scores[mass]:
            hist_template = binning.Clone(
                title=s.label, **s.hist_decor)
            sample = s.get_histfactory_sample(
                hist_template, clf,
                category, region,
                cuts=cuts, scores=scores,
                no_signal_fixes=no_signal_fixes,
                systematics=systematics,
                uniform=uniform,
                mva=mva)
            sig_samples.append(sample)
    # replace data in blind bins with signal + background
    if hybrid_data and (unblind is not True):
        sum_sig_bkg = sum([s.hist for s in (bkg_samples + sig_samples)])
        if unblind is False:
            # replace full hist
            data_sample.hist[:] = sum_sig_bkg[:]
        elif isinstance(unblind, int):
            # replace highest N bins
            bin = -(unblind + 1)
            data_sample.hist[bin:] = sum_sig_bkg[bin:]
        elif isinstance(unblind, float):
            # blind_bin was set in the float-unblind branch above
            data_sample.hist[blind_bin:] = sum_sig_bkg[blind_bin:]
    # create channel for this mass point
    channel = histfactory.make_channel(
        'hh_{0}_{1}_{2}'.format(self.year % 1000, category.name, mass),
        bkg_samples + sig_samples,
        data=data_sample)
    return scores_obj, channel
def get_channel_array(self, vars, category, region, cuts=None,
                      include_signal=True, mass=125, mixings=0.0,
                      mode=None, scale_125=False, clf=None,
                      min_score=None, max_score=None, weighted=True,
                      templates=None, field_scale=None, weight_hist=None,
                      systematics=True, no_signal_fixes=False,
                      bootstrap_data=False, ravel=True, uniform=False,
                      hybrid_data=None):
    """
    Return a dictionnary of histfactory channels for different variables
    (i.e. {'MMC_MASS':channel1, ...}).

    Parameters
    ----------
    vars: dict
        dictionary of histograms (i.e. {'MMC_MASS':hist_template, ...}
    category: Category
        analysis category (see mva/categories/*)
    region: str
        analysis region (i.e 'OS_ISOL', ...)
    cuts : str or Cut
        additional cuts that could be place when requesting the channel
        array (See mva/categories/common.py for examples)
    hybrid_data : dict
        if specified, it is a dictionary mapping the vars key to a tuple
        specifying the range to be replaced by s+b prediction.
    """
    # Fixed: the docstring above previously came after two debug
    # log.info() calls, so it was a plain expression statement rather
    # than the function docstring; the leftover debug logging
    # ("in get channel array", "got a higgle signal") has been removed.
    # TODO: implement blinding
    log.info("constructing channels")
    samples = [self.data] + self.backgrounds
    channel_name = 'hh_{0}_{1}'.format(self.year % 1000, category.name)
    suffix = None
    if include_signal:
        # encode the mass hypothesis (or list of masses) in the name
        if isinstance(mass, list):
            suffix = '_' + ('_'.join(map(str, mass)))
        else:
            suffix = '_%d' % mass
        channel_name += suffix
        samples += self.get_signals(
            mass=mass, mixing=mixings, mode=mode, scale_125=scale_125)
    # create HistFactory samples
    histfactory_samples = []
    for s in samples:
        field_hist, _ = s.get_field_hist(
            vars, category, templates=templates)
        field_sample = s.get_histfactory_sample_array(
            field_hist,
            category, region,
            cuts=cuts, clf=clf,
            min_score=min_score,
            max_score=max_score,
            weighted=weighted,
            field_scale=field_scale,
            weight_hist=weight_hist,
            systematics=systematics,
            # signal (Higgs) samples do not carry the mass suffix
            suffix=suffix if not isinstance(s, Higgs) else None,
            no_signal_fixes=no_signal_fixes,
            bootstrap_data=bootstrap_data,
            ravel=ravel,
            uniform=uniform)
        histfactory_samples.append(field_sample)
    field_channels = {}
    for field in vars.keys():
        # create channel for this mass point
        channel = histfactory.make_channel(
            channel_name + '_{0}'.format(field),
            [s[field] for s in histfactory_samples[1:]],
            data=histfactory_samples[0][field])
        # implement hybrid data if requested
        # TODO: clean up
        if isinstance(hybrid_data, dict):
            log.info('constructing hybrid data')
            if field in hybrid_data.keys():
                if isinstance(hybrid_data[field], (list, tuple)):
                    log.info('hybrid data: replacing data by s+b '
                             'prediction for {0} in range {1}'.format(
                                 field, hybrid_data[field]))
                    if len(hybrid_data[field]) != 2:
                        log.error('hybrid data: Need to specify a '
                                  'range with only two edged')
                    # Get the range of bins to be replaced (add 1
                    # additional bin on both side for safety)
                    (replace_low, replace_high) = (
                        hybrid_data[field][0], hybrid_data[field][1])
                    hist_data_template = self.data.get_field_hist(
                        vars, category)
                    log.info('hybrid data: template binning {0}'.format(
                        list(hist_data_template[0][field].xedges())))
                    replace_bin = (
                        hist_data_template[0][field].FindBin(
                            float(replace_low)) - 1,
                        hist_data_template[0][field].FindBin(
                            float(replace_high)) + 1)
                    total_bkg_sig = sum(
                        [s.hist for s in channel.samples])
                    log.info('hybrid data: before --> {0}'.format(
                        list(channel.data.hist.y())))
                    channel.data.hist[replace_bin[0]:replace_bin[1]] = \
                        total_bkg_sig[replace_bin[0]:replace_bin[1]]
                    log.info('hybrid data: after --> {0}'.format(
                        list(channel.data.hist.y())))
        field_channels[field] = channel
    return field_channels
def clf_channels(self, clf, category, region, cuts=None, bins=10,
                 mass_points=None, mode=None, systematics=True,
                 unblind=False, hybrid_data=False, no_signal_fixes=False):
    """
    Return a HistFactory Channel for each mass hypothesis

    If ``mass_points`` is None a single background-only channel is
    returned as (scores_obj, channel); otherwise a dict mapping each
    requested mass to its channel is returned as (scores_obj, channels).
    """
    log.info("constructing channels")
    channels = dict()
    scores_obj = self.get_scores(
        clf, category, region, cuts=cuts,
        mass_points=mass_points, mode=mode,
        systematics=systematics,
        unblind=unblind)
    data_scores = scores_obj.data_scores
    bkg_scores = scores_obj.bkg_scores
    all_sig_scores = scores_obj.all_sig_scores
    min_score = scores_obj.min_score
    max_score = scores_obj.max_score
    bkg_samples = []
    for s, scores in bkg_scores:
        hist_template = Hist(
            bins, min_score, max_score,
            title=s.label, type='D',
            **s.hist_decor)
        sample = s.get_histfactory_sample(
            hist_template, clf,
            category, region,
            cuts=cuts, scores=scores)
        bkg_samples.append(sample)
    data_sample = None
    if data_scores is not None:
        max_unblind_score = None
        if isinstance(unblind, float):
            # NOTE(review): hist_template here is the template leaked
            # from the last iteration of the background loop above —
            # confirm this reuse is intended (fails with NameError if
            # bkg_scores is empty).
            """
            max_unblind_score = min([
                efficiency_cut(
                    sum([histogram_scores(hist_template, scores)
                        for s, scores in all_sig_scores[mass]]), 0.3)
                    for mass in mass_points])
            """
            max_unblind_score = efficiency_cut(
                sum([histogram_scores(hist_template, scores)
                     for s, scores in all_sig_scores[125]]), unblind)
        hist_template = Hist(
            bins, min_score, max_score,
            title=self.data.label, type='D',
            **self.data.hist_decor)
        data_sample = self.data.get_histfactory_sample(
            hist_template, clf,
            category, region,
            cuts=cuts, scores=data_scores,
            max_score=max_unblind_score)
        if not unblind and hybrid_data:
            # blinded bins filled with S+B, for limit/p0 plots
            # Swagato:
            # We have to make 2 kinds of expected sensitivity plots:
            # blinded sensitivity and unblinded sensitivity.
            # For the first one pure AsimovData is used, for second one I
            # suggest to use Hybrid, because the profiled NP's are not
            # always at 0 pull.
            pass
    if mass_points is None:
        # create channel without signal
        channel = histfactory.make_channel(
            category.name,
            bkg_samples,
            data=data_sample)
        return scores_obj, channel
    # signal scores
    for mass in samples.Higgs.MASS_POINTS:
        if mass not in mass_points:
            continue
        log.info('=' * 20)
        log.info("%d GeV mass hypothesis" % mass)
        # create HistFactory samples
        sig_samples = []
        for s, scores in all_sig_scores[mass]:
            hist_template = Hist(
                bins, min_score, max_score,
                title=s.label, type='D',
                **s.hist_decor)
            sample = s.get_histfactory_sample(
                hist_template, clf,
                category, region,
                cuts=cuts, scores=scores,
                no_signal_fixes=no_signal_fixes)
            sig_samples.append(sample)
        # create channel for this mass point
        channel = histfactory.make_channel(
            "%s_%d" % (category.name, mass),
            bkg_samples + sig_samples,
            data=data_sample)
        channels[mass] = channel
    return scores_obj, channels
def get_channel_array(self, vars, category, region, cuts=None,
                      include_signal=True, mass=125, mode=None,
                      scale_125=False, clf=None, min_score=None,
                      max_score=None, weighted=True, templates=None,
                      field_scale=None, weight_hist=None, systematics=True,
                      no_signal_fixes=False, bootstrap_data=False,
                      ravel=True, uniform=False):
    """
    Build one HistFactory channel per requested variable and return a
    dict mapping each field name to its channel.
    """
    # TODO: implement blinding
    log.info("constructing channels")
    sample_list = [self.data] + self.backgrounds
    channel_name = category.name
    suffix = None
    if include_signal:
        # the suffix encodes the mass hypothesis (or list of masses)
        suffix = ('_' + '_'.join(map(str, mass))
                  if isinstance(mass, list) else '_%d' % mass)
        channel_name += suffix
        sample_list += self.get_signals(mass, mode, scale_125=scale_125)
    # one HistFactory sample array per input sample; signal (Higgs)
    # samples do not carry the mass suffix
    histfactory_samples = []
    for s in sample_list:
        field_hist = s.get_field_hist(vars, category, templates=templates)
        histfactory_samples.append(
            s.get_histfactory_sample_array(
                field_hist,
                category, region,
                cuts=cuts, clf=clf,
                min_score=min_score,
                max_score=max_score,
                weighted=weighted,
                field_scale=field_scale,
                weight_hist=weight_hist,
                systematics=systematics,
                suffix=None if isinstance(s, Higgs) else suffix,
                no_signal_fixes=no_signal_fixes,
                bootstrap_data=bootstrap_data,
                ravel=ravel,
                uniform=uniform))
    # assemble one channel per field; entry 0 is data, the rest are MC
    return {
        field: histfactory.make_channel(
            channel_name + '_{0}'.format(field),
            [s[field] for s in histfactory_samples[1:]],
            data=histfactory_samples[0][field])
        for field in vars.keys()}
def optimized_channels(clf, category, region, backgrounds, data=None,
                       cuts=None, mass_points=None, mu=1.,
                       systematics=True, lumi_rel_error=0.,
                       algo='EvenBinningByLimit'):
    """
    Return optimally binned HistFactory Channels for each mass hypothesis

    Determine the number of bins that yields the best limit at the 125 GeV
    mass hypothesis. Then construct and return the channels for all
    requested mass hypotheses.

    algos: EvenBinningByLimit, UnevenBinningBySignificance
    """
    log.info("constructing optimized channels")
    scores_obj = get_scores(
        clf, category, region, backgrounds,
        data=data, cuts=cuts,
        mass_points=mass_points, mu=mu,
        systematics=systematics)
    data_scores = scores_obj.data_scores
    bkg_scores = scores_obj.bkg_scores
    all_sig_scores = scores_obj.all_sig_scores
    min_score = scores_obj.min_score
    max_score = scores_obj.max_score
    # binning optimization is always performed at the 125 GeV hypothesis
    sig_scores = all_sig_scores[125]
    best_hist_template = None
    if algo == 'EvenBinningByLimit':
        # scan the number of even bins and keep the binning that yields
        # the best (smallest) expected limit
        limit_hists = []
        best_limit = float('inf')
        best_nbins = 0
        nbins_range = xrange(2, 50)
        for nbins in nbins_range:
            hist_template = Hist(nbins, min_score, max_score, type='D')
            # create HistFactory samples
            samples = []
            for s, scores in bkg_scores + sig_scores:
                sample = s.get_histfactory_sample(
                    hist_template, clf,
                    category, region,
                    cuts=cuts, scores=scores)
                samples.append(sample)
            data_sample = None
            if data is not None:
                data_sample = data.get_histfactory_sample(
                    hist_template, clf,
                    category, region,
                    cuts=cuts, scores=data_scores)
            # create channel for this mass point
            channel = histfactory.make_channel(
                "%s_%d" % (category.name, 125),
                samples, data=data_sample)
            # get limit
            limit_hist = get_limit(channel, lumi_rel_error=lumi_rel_error)
            # NOTE(review): uses `category` (not category.name) in the
            # histogram name — confirm str(category) is the intended label
            limit_hist.SetName("%s_%d_%d" % (category, 125, nbins))
            # is this better than the best limit so far?
            hist_dict = hist_to_dict(limit_hist)
            limit_hists.append(hist_dict)
            if hist_dict['Expected'] < best_limit:
                best_limit = hist_dict['Expected']
                best_nbins = nbins
                best_hist_template = hist_template
        # plot limit vs nbins with 1 and 2 sigma bands
        fig = plt.figure()
        ax = fig.add_subplot(111)
        central_values = np.array([h['Expected'] for h in limit_hists])
        high_values_1sig = np.array([h['+1sigma'] for h in limit_hists])
        low_values_1sig = np.array([h['-1sigma'] for h in limit_hists])
        high_values_2sig = np.array([h['+2sigma'] for h in limit_hists])
        low_values_2sig = np.array([h['-2sigma'] for h in limit_hists])
        plt.plot(nbins_range, central_values, 'k-')
        plt.fill_between(nbins_range, low_values_2sig, high_values_2sig,
                         linewidth=0, facecolor='yellow')
        plt.fill_between(nbins_range, low_values_1sig, high_values_1sig,
                         linewidth=0, facecolor='green')
        plt.xlim(nbins_range[0], nbins_range[-1])
        plt.xlabel("Number of Bins")
        plt.ylabel("Limit")
        plt.grid(True)
        plt.text(.5, .8,
                 "Best limit of %.2f at %d bins" % (best_limit, best_nbins),
                 horizontalalignment='center',
                 verticalalignment='center',
                 transform=ax.transAxes,
                 fontsize=20)
        plt.savefig('category_%s_limit_vs_nbins.png' % category.name)
    elif algo == 'UnevenBinningBySignificance':
        # fill fine-grained signal and background histograms (including
        # per-systematic variations) and let optimize_binning merge bins
        #hist_template = Hist(200, min_score, max_score)
        hist_template = Hist(200, -1.0, 1.0, type='D')
        sig_hist = hist_template.Clone(title='Signal')
        sig_hist.systematics = {}
        for sig, scores_dict in sig_scores:
            scores, weight = scores_dict['NOMINAL']
            sig_hist.fill_array(scores, weight)
            for sys_term in scores_dict.keys():
                if sys_term == 'NOMINAL':
                    continue
                if not sys_term in sig_hist.systematics:
                    sys_hist = hist_template.Clone()
                    sig_hist.systematics[sys_term] = sys_hist
                else:
                    sys_hist = sig_hist.systematics[sys_term]
                scores, weight = scores_dict[sys_term]
                sys_hist.fill_array(scores, weight)
        bkg_hist = hist_template.Clone(title='Background')
        bkg_hist.systematics = {}
        for bkg, scores_dict in bkg_scores:
            scores, weight = scores_dict['NOMINAL']
            bkg_hist.fill_array(scores, weight)
            for sys_term in scores_dict.keys():
                if sys_term == 'NOMINAL':
                    continue
                if not sys_term in bkg_hist.systematics:
                    sys_hist = hist_template.Clone()
                    bkg_hist.systematics[sys_term] = sys_hist
                else:
                    sys_hist = bkg_hist.systematics[sys_term]
                scores, weight = scores_dict[sys_term]
                sys_hist.fill_array(scores, weight)
        print "SIG entries:", sig_hist.GetEntries()
        print "BKG entries:", bkg_hist.GetEntries()
        sig_hist, bkg_hist, best_hist_template = optimize_binning(
            sig_hist, bkg_hist,
            #starting_point='fine'
            starting_point='merged')
        if best_hist_template is None:
            best_hist_template = hist_template
        #raw_input("Hit enter to continue...")
    else:
        print "ERROR: binning optimisation algo %s not in list!" % algo
        exit(1)
    hist_template = best_hist_template
    channels = dict()
    # create HistFactory samples
    bkg_samples = []
    for s, scores in bkg_scores:
        sample = s.get_histfactory_sample(
            hist_template, clf,
            category, region,
            cuts=cuts, scores=scores)
        bkg_samples.append(sample)
    data_sample = None
    if data_scores is not None:
        data_sample = data.get_histfactory_sample(
            hist_template, clf,
            category, region,
            cuts=cuts, scores=data_scores)
    # now use the optimal binning and construct channels for all requested mass
    # hypotheses
    for mass in Higgs.MASSES:
        if mass_points is not None and mass not in mass_points:
            continue
        log.info('=' * 20)
        log.info("%d GeV mass hypothesis" % mass)
        # create HistFactory samples
        sig_samples = []
        for s, scores in all_sig_scores[mass]:
            sample = s.get_histfactory_sample(
                hist_template, clf,
                category, region,
                cuts=cuts, scores=scores)
            sig_samples.append(sample)
        # create channel for this mass point
        channel = histfactory.make_channel(
            "%s_%d" % (category.name, mass),
            bkg_samples + sig_samples,
            data=data_sample)
        channels[mass] = channel
    return channels
def clf_channels(self, clf, category, region, cuts=None, bins=10,
                 limits=None, mass=None, mode=None, systematics=True,
                 unblind=False, hybrid_data=False, no_signal_fixes=False,
                 uniform=False, mva=False):
    """
    Return a HistFactory Channel for each mass hypothesis

    ``unblind`` controls data blinding: False blinds the full histogram,
    an int N blinds the highest N bins, a float blinds everything above
    the score corresponding to that signal efficiency, and True leaves
    data untouched. If ``hybrid_data`` is set, blinded bins are refilled
    with the summed signal+background prediction.

    Returns (scores_obj, channel).
    """
    log.info("constructing channels")
    # determine min and max scores
    scores_obj = self.get_scores(
        clf, category, region, cuts=cuts,
        masses=[mass], mode=mode,
        systematics=systematics,
        unblind=unblind)
    data_scores = scores_obj.data_scores
    bkg_scores = scores_obj.bkg_scores
    all_sig_scores = scores_obj.all_sig_scores
    min_score = scores_obj.min_score
    max_score = scores_obj.max_score
    # binning: either an even binning over (limits or score range),
    # or explicit bin edges passed as an iterable
    if isinstance(bins, int):
        if limits is not None:
            low, high = limits
            binning = Hist(bins, low, high, type='D')
        else:
            binning = Hist(bins, min_score, max_score, type='D')
    else:  # iterable of bin edges
        if bins[0] > min_score:
            log.warning("min score is less than first edge "
                        "(will be underflow)")
        if bins[-1] <= max_score:
            log.warning("max score is greater than or equal to last edge "
                        "(will be overflow)")
        binning = Hist(bins, type='D')
    bkg_samples = []
    for s, scores in bkg_scores:
        hist_template = binning.Clone(
            title=s.label, **s.hist_decor)
        sample = s.get_histfactory_sample(
            hist_template, clf,
            category, region,
            cuts=cuts, scores=scores,
            systematics=systematics,
            uniform=uniform,
            mva=mva)
        bkg_samples.append(sample)
    data_sample = None
    if data_scores is not None:
        hist_template = binning.Clone(
            title=self.data.label, **self.data.hist_decor)
        data_sample = self.data.get_histfactory_sample(
            hist_template, clf,
            category, region,
            cuts=cuts, scores=data_scores,
            uniform=uniform)
        if unblind is False:
            # blind full histogram
            data_sample.hist[:] = (0, 0)
        elif (unblind is not True) and isinstance(unblind, int):
            # blind highest N bins
            data_sample.hist[-(unblind + 1):] = (0, 0)
        elif isinstance(unblind, float):
            # blind above a signal efficiency
            max_unblind_score = efficiency_cut(
                sum([histogram_scores(hist_template, scores)
                     for s, scores in all_sig_scores[mass]]), unblind)
            blind_bin = hist_template.FindBin(max_unblind_score)
            data_sample.hist[blind_bin:] = (0, 0)
    # create signal HistFactory samples
    sig_samples = []
    for s, scores in all_sig_scores[mass]:
        hist_template = binning.Clone(
            title=s.label, **s.hist_decor)
        sample = s.get_histfactory_sample(
            hist_template, clf,
            category, region,
            cuts=cuts, scores=scores,
            no_signal_fixes=no_signal_fixes,
            systematics=systematics,
            uniform=uniform,
            mva=mva)
        sig_samples.append(sample)
    # replace data in blind bins with signal + background
    if hybrid_data and (unblind is not True):
        sum_sig_bkg = sum([s.hist for s in (bkg_samples + sig_samples)])
        if unblind is False:
            # replace full hist
            data_sample.hist[:] = sum_sig_bkg[:]
        elif isinstance(unblind, int):
            # replace highest N bins
            bin = -(unblind + 1)
            data_sample.hist[bin:] = sum_sig_bkg[bin:]
        elif isinstance(unblind, float):
            # blind_bin was set in the float-unblind branch above
            data_sample.hist[blind_bin:] = sum_sig_bkg[blind_bin:]
    # create channel for this mass point
    channel = histfactory.make_channel(
        'hh_{0}_{1}_{2}'.format(self.year % 1000, category.name, mass),
        bkg_samples + sig_samples,
        data=data_sample)
    return scores_obj, channel
def get_channel_array(self, vars, category, region, cuts=None,
                      include_signal=True, mass=125, mode=None,
                      scale_125=False, clf=None, min_score=None,
                      max_score=None, weighted=True, templates=None,
                      field_scale=None, weight_hist=None, systematics=True,
                      no_signal_fixes=False, bootstrap_data=False,
                      ravel=True, uniform=False, hybrid_data=None):
    """
    Return a dictionnary of histfactory channels for different variables
    (i.e. {'MMC_MASS':channel1, ...}).

    Parameters
    ----------
    vars: dict
        dictionary of histograms (i.e. {'MMC_MASS':hist_template, ...}
    category: Category
        analysis category (see mva/categories/*)
    region: str
        analysis region (i.e 'OS_ISOL', ...)
    cuts : str or Cut
        additional cuts that could be place when requesting the channel
        array (See mva/categories/common.py for examples)
    hybrid_data : dict
        if specified, it is a dictionary mapping the vars key to a tuple
        specifying the range to be replaced by s+b prediction.
    """
    # TODO: implement blinding
    log.info("constructing channels")
    samples = [self.data] + self.backgrounds
    channel_name = 'hh_{0}_{1}'.format(self.year % 1000, category.name)
    suffix = None
    if include_signal:
        # encode the mass hypothesis (or list of masses) in the name
        if isinstance(mass, list):
            suffix = '_' + ('_'.join(map(str, mass)))
        else:
            suffix = '_%d' % mass
        channel_name += suffix
        samples += self.get_signals(mass, mode, scale_125=scale_125)
    # create HistFactory samples
    histfactory_samples = []
    for s in samples:
        field_hist, _ = s.get_field_hist(
            vars, category, templates=templates)
        field_sample = s.get_histfactory_sample_array(
            field_hist,
            category, region,
            cuts=cuts, clf=clf,
            min_score=min_score,
            max_score=max_score,
            weighted=weighted,
            field_scale=field_scale,
            weight_hist=weight_hist,
            systematics=systematics,
            # signal (Higgs) samples do not carry the mass suffix
            suffix=suffix if not isinstance(s, Higgs) else None,
            no_signal_fixes=no_signal_fixes,
            bootstrap_data=bootstrap_data,
            ravel=ravel,
            uniform=uniform)
        histfactory_samples.append(field_sample)
    field_channels = {}
    for field in vars.keys():
        # create channel for this mass point
        channel = histfactory.make_channel(
            channel_name + '_{0}'.format(field),
            [s[field] for s in histfactory_samples[1:]],
            data=histfactory_samples[0][field])
        # implement hybrid data if requested
        # TODO: clean up
        if isinstance(hybrid_data, dict):
            log.info('constructing hybrid data')
            if field in hybrid_data.keys():
                if isinstance(hybrid_data[field], (list, tuple)):
                    log.info('hybrid data: replacing data by s+b '
                             'prediction for {0} in range {1}'.format(
                                 field, hybrid_data[field]))
                    if len(hybrid_data[field]) != 2:
                        log.error('hybrid data: Need to specify a '
                                  'range with only two edged')
                    # Get the range of bins to be replaced (add 1
                    # additional bin on both side for safety)
                    (replace_low, replace_high) = (
                        hybrid_data[field][0], hybrid_data[field][1])
                    hist_data_template = self.data.get_field_hist(
                        vars, category)
                    log.info('hybrid data: template binning {0}'.format(
                        list(hist_data_template[0][field].xedges())))
                    replace_bin = (
                        hist_data_template[0][field].FindBin(
                            float(replace_low)) - 1,
                        hist_data_template[0][field].FindBin(
                            float(replace_high)) + 1)
                    total_bkg_sig = sum(
                        [s.hist for s in channel.samples])
                    log.info('hybrid data: before --> {0}'.format(
                        list(channel.data.hist.y())))
                    channel.data.hist[replace_bin[0]:replace_bin[1]] = \
                        total_bkg_sig[replace_bin[0]:replace_bin[1]]
                    log.info('hybrid data: after --> {0}'.format(
                        list(channel.data.hist.y())))
        field_channels[field] = channel
    return field_channels
def optimized_channels(clf, category, region, backgrounds, data=None,
                       cuts=None, mass_points=None, mu=1.,
                       systematics=True, lumi_rel_error=0.,
                       algo='EvenBinningByLimit'):
    """
    Return optimally binned HistFactory Channels for each mass hypothesis

    Determine the number of bins that yields the best limit at the 125 GeV
    mass hypothesis. Then construct and return the channels for all
    requested mass hypotheses.

    algos: EvenBinningByLimit, UnevenBinningBySignificance
    """
    log.info("constructing optimized channels")
    scores_obj = get_scores(
        clf, category, region, backgrounds,
        data=data, cuts=cuts,
        mass_points=mass_points, mu=mu,
        systematics=systematics)
    data_scores = scores_obj.data_scores
    bkg_scores = scores_obj.bkg_scores
    all_sig_scores = scores_obj.all_sig_scores
    min_score = scores_obj.min_score
    max_score = scores_obj.max_score
    # binning optimization is always performed at the 125 GeV hypothesis
    sig_scores = all_sig_scores[125]
    best_hist_template = None
    if algo == 'EvenBinningByLimit':
        # scan the number of even bins and keep the binning that yields
        # the best (smallest) expected limit
        limit_hists = []
        best_limit = float('inf')
        best_nbins = 0
        nbins_range = xrange(2, 50)
        for nbins in nbins_range:
            hist_template = Hist(nbins, min_score, max_score, type='D')
            # create HistFactory samples
            samples = []
            for s, scores in bkg_scores + sig_scores:
                sample = s.get_histfactory_sample(
                    hist_template, clf,
                    category, region,
                    cuts=cuts, scores=scores)
                samples.append(sample)
            data_sample = None
            if data is not None:
                data_sample = data.get_histfactory_sample(
                    hist_template, clf,
                    category, region,
                    cuts=cuts, scores=data_scores)
            # create channel for this mass point
            channel = histfactory.make_channel(
                "%s_%d" % (category.name, 125),
                samples, data=data_sample)
            # get limit
            limit_hist = get_limit(channel, lumi_rel_error=lumi_rel_error)
            # NOTE(review): uses `category` (not category.name) in the
            # histogram name — confirm str(category) is the intended label
            limit_hist.SetName("%s_%d_%d" % (category, 125, nbins))
            # is this better than the best limit so far?
            hist_dict = hist_to_dict(limit_hist)
            limit_hists.append(hist_dict)
            if hist_dict['Expected'] < best_limit:
                best_limit = hist_dict['Expected']
                best_nbins = nbins
                best_hist_template = hist_template
        # plot limit vs nbins with 1 and 2 sigma bands
        fig = plt.figure()
        ax = fig.add_subplot(111)
        central_values = np.array([h['Expected'] for h in limit_hists])
        high_values_1sig = np.array([h['+1sigma'] for h in limit_hists])
        low_values_1sig = np.array([h['-1sigma'] for h in limit_hists])
        high_values_2sig = np.array([h['+2sigma'] for h in limit_hists])
        low_values_2sig = np.array([h['-2sigma'] for h in limit_hists])
        plt.plot(nbins_range, central_values, 'k-')
        plt.fill_between(nbins_range, low_values_2sig, high_values_2sig,
                         linewidth=0, facecolor='yellow')
        plt.fill_between(nbins_range, low_values_1sig, high_values_1sig,
                         linewidth=0, facecolor='green')
        plt.xlim(nbins_range[0], nbins_range[-1])
        plt.xlabel("Number of Bins")
        plt.ylabel("Limit")
        plt.grid(True)
        plt.text(.5, .8,
                 "Best limit of %.2f at %d bins" % (best_limit, best_nbins),
                 horizontalalignment='center',
                 verticalalignment='center',
                 transform=ax.transAxes,
                 fontsize=20)
        plt.savefig('category_%s_limit_vs_nbins.png' % category.name)
    elif algo == 'UnevenBinningBySignificance':
        # fill fine-grained signal and background histograms (including
        # per-systematic variations) and let optimize_binning merge bins
        #hist_template = Hist(200, min_score, max_score)
        hist_template = Hist(200, -1.0, 1.0, type='D')
        sig_hist = hist_template.Clone(title='Signal')
        sig_hist.systematics = {}
        for sig, scores_dict in sig_scores:
            scores, weight = scores_dict['NOMINAL']
            sig_hist.fill_array(scores, weight)
            for sys_term in scores_dict.keys():
                if sys_term == 'NOMINAL':
                    continue
                if not sys_term in sig_hist.systematics:
                    sys_hist = hist_template.Clone()
                    sig_hist.systematics[sys_term] = sys_hist
                else:
                    sys_hist = sig_hist.systematics[sys_term]
                scores, weight = scores_dict[sys_term]
                sys_hist.fill_array(scores, weight)
        bkg_hist = hist_template.Clone(title='Background')
        bkg_hist.systematics = {}
        for bkg, scores_dict in bkg_scores:
            scores, weight = scores_dict['NOMINAL']
            bkg_hist.fill_array(scores, weight)
            for sys_term in scores_dict.keys():
                if sys_term == 'NOMINAL':
                    continue
                if not sys_term in bkg_hist.systematics:
                    sys_hist = hist_template.Clone()
                    bkg_hist.systematics[sys_term] = sys_hist
                else:
                    sys_hist = bkg_hist.systematics[sys_term]
                scores, weight = scores_dict[sys_term]
                sys_hist.fill_array(scores, weight)
        print "SIG entries:", sig_hist.GetEntries()
        print "BKG entries:", bkg_hist.GetEntries()
        sig_hist, bkg_hist, best_hist_template = optimize_binning(
            sig_hist, bkg_hist,
            #starting_point='fine'
            starting_point='merged'
            )
        if best_hist_template is None:
            best_hist_template = hist_template
        #raw_input("Hit enter to continue...")
    else:
        print "ERROR: binning optimisation algo %s not in list!" % algo
        exit(1)
    hist_template = best_hist_template
    channels = dict()
    # create HistFactory samples
    bkg_samples = []
    for s, scores in bkg_scores:
        sample = s.get_histfactory_sample(
            hist_template, clf,
            category, region,
            cuts=cuts, scores=scores)
        bkg_samples.append(sample)
    data_sample = None
    if data_scores is not None:
        data_sample = data.get_histfactory_sample(
            hist_template, clf,
            category, region,
            cuts=cuts, scores=data_scores)
    # now use the optimal binning and construct channels for all requested mass
    # hypotheses
    for mass in Higgs.MASSES:
        if mass_points is not None and mass not in mass_points:
            continue
        log.info('=' * 20)
        log.info("%d GeV mass hypothesis" % mass)
        # create HistFactory samples
        sig_samples = []
        for s, scores in all_sig_scores[mass]:
            sample = s.get_histfactory_sample(
                hist_template, clf,
                category, region,
                cuts=cuts, scores=scores)
            sig_samples.append(sample)
        # create channel for this mass point
        channel = histfactory.make_channel(
            "%s_%d" % (category.name, mass),
            bkg_samples + sig_samples,
            data=data_sample)
        channels[mass] = channel
    return channels