Example 1
    def get_efficiencies(prediction,
                         spectator,
                         sample_weight=None,
                         bins_number=20,
                         thresholds=None,
                         errors=False,
                         ignored_sideband=0.0):
        prediction, spectator, sample_weight = \
            check_arrays(prediction, spectator, sample_weight)

        spectator_min, spectator_max = weighted_quantile(
            spectator, [ignored_sideband, (1. - ignored_sideband)])
        mask = (spectator >= spectator_min) & (spectator <= spectator_max)
        spectator = spectator[mask]
        prediction = prediction[mask]
        bins_number = min(bins_number, len(prediction))
        sample_weight = sample_weight if sample_weight is None else numpy.array(
            sample_weight)[mask]

        if thresholds is None:
            thresholds = [
                weighted_quantile(prediction,
                                  quantiles=1 - eff,
                                  sample_weight=sample_weight)
                for eff in [0.2, 0.4, 0.5, 0.6, 0.8]
            ]

        binner = Binner(spectator, bins_number=bins_number)
        if sample_weight is None:
            sample_weight = numpy.ones(len(prediction))
        bins_data = binner.split_into_bins(spectator, prediction,
                                           sample_weight)

        bin_edges = numpy.array([spectator_min] + list(binner.limits) +
                                [spectator_max])
        xerr = numpy.diff(bin_edges) / 2.
        result = OrderedDict()
        for threshold in thresholds:
            x_values = []
            y_values = []
            N_in_bin = []
            for num, (masses, probabilities, weights) in enumerate(bins_data):
                if len(weights) == 0 or sum(weights) == 0.0: continue
                y_values.append(
                    numpy.average(probabilities > threshold, weights=weights))
                N_in_bin.append(numpy.sum(weights))
                if errors:
                    x_values.append((bin_edges[num + 1] + bin_edges[num]) / 2.)
                else:
                    x_values.append(numpy.mean(masses))

            x_values, y_values, N_in_bin = check_arrays(
                x_values, y_values, N_in_bin)
            if errors:
                result[threshold] = (x_values, y_values,
                                     numpy.sqrt(y_values * (1 - y_values) /
                                                N_in_bin), xerr)
            else:
                result[threshold] = (x_values, y_values)
        return result
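A minimal usage sketch for the snippet above, on synthetic data. It assumes get_efficiencies is accessible as a plain function (in the snippet it is an indented, method-style definition) and that its helpers check_arrays, weighted_quantile and Binner are importable from the surrounding project:

import numpy

rng = numpy.random.RandomState(0)
prediction = rng.uniform(size=5000)                    # classifier outputs in [0, 1]
spectator = rng.normal(loc=5.3, scale=0.2, size=5000)  # e.g. a candidate mass

result = get_efficiencies(prediction, spectator,
                          bins_number=25, errors=True,
                          ignored_sideband=0.02)
for threshold, (x, y, y_err, x_err) in result.items():
    # y holds the per-bin fraction of events with prediction > threshold
    print('threshold %.3f: mean efficiency %.3f' % (threshold, numpy.mean(y)))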
Example 2
def plot_flatness_particle(labels, predictions_dict, spectator, spectator_name, particle_name, 
                           weights=None, bins_number=30, ignored_sideband=0.1, 
                           thresholds=None, cuts_values=False):
    plt.figure(figsize=(18, 22))
    for n, (name, label) in enumerate(names_labels_correspondence.items()):
        plt.subplot(3, 2, n + 1)
        mask = labels == names_labels_correspondence[particle_name]
        probs = predictions_dict[label][mask]
        mask_signal = labels == label
        probs_signal = predictions_dict[label][mask_signal]
        if cuts_values:
            # `thresholds` already holds raw cut values in this case
            thresholds_values = thresholds
        else:
            thresholds_values = [weighted_quantile(probs_signal, quantiles=1 - eff / 100., 
                                                   sample_weight=None if weights is None else weights[mask_signal])
                                 for eff in thresholds]
        eff = get_efficiencies(probs, spectator[mask], 
                               sample_weight=None if weights is None else weights[mask], 
                               bins_number=bins_number, errors=True, ignored_sideband=ignored_sideband,
                               thresholds=thresholds_values)
        for thr in thresholds_values:
            eff[thr] = (eff[thr][0], 100*numpy.array(eff[thr][1]), 100*numpy.array(eff[thr][2]), eff[thr][3])
        plot_fig = ErrorPlot(eff)
        plot_fig.xlabel = '{} {}'.format(particle_name, spectator_name)
        plot_fig.ylabel = 'Efficiency'
        plot_fig.title = 'MVA {}'.format(name)
        plot_fig.ylim = (0, 100)
        plot_fig.plot(fontsize=22)
        plt.xticks(fontsize=12), plt.yticks(fontsize=12)
        if not cuts_values:
            plt.legend(['Signal Eff {}%'.format(thr) for thr in thresholds], loc='best', fontsize=18, framealpha=0.5)
Example 3
def plot_flatness_by_particle(labels, predictions_dict, spectator, spectator_name, predictions_dict_comparison=None,
                              names_algorithms=['MVA', 'Baseline'],
                              weights=None, bins_number=30, ignored_sideband=0.1, 
                              thresholds=None, cuts_values=False, ncol=1):
    plt.figure(figsize=(22, 20))
    for n, (name, label) in enumerate(names_labels_correspondence.items()):
        plt.subplot(3, 2, n + 1)
        mask = labels == label
        legends = []
        for preds, name_algo in zip([predictions_dict, predictions_dict_comparison], names_algorithms):
            if preds is None:
                continue
            probs = preds[label][mask]
            if cuts_values:
                # `thresholds` already holds raw cut values in this case
                thresholds_values = thresholds
            else:
                thresholds_values = [weighted_quantile(probs, quantiles=1 - eff / 100., 
                                                       sample_weight=None if weights is None else weights[mask])
                                     for eff in thresholds]
            eff = get_efficiencies(probs, spectator[mask], 
                                   sample_weight=None if weights is None else weights[mask], 
                                   bins_number=bins_number, errors=True, ignored_sideband=ignored_sideband,
                                   thresholds=thresholds_values)
            for thr in thresholds_values:
                eff[thr] = (eff[thr][0], 100*numpy.array(eff[thr][1]), 100*numpy.array(eff[thr][2]), eff[thr][3])
            plot_fig = ErrorPlot(eff)
            plot_fig.xlabel = '{} {}'.format(name, spectator_name)
            plot_fig.ylabel = 'Efficiency'
            plot_fig.title = name
            plot_fig.ylim = (0, 100)
            plot_fig.plot(fontsize=22)
            plt.xticks(fontsize=12), plt.yticks(fontsize=12)
            legends.append(['{} Eff {}%'.format(thr, name_algo) for thr in thresholds])
        plt.legend(numpy.concatenate(legends), loc='best', fontsize=12, framealpha=0.5, ncol=ncol)
Example 4
def plot_flatness_by_particle(labels,
                              predictions_dict,
                              spectator,
                              spectator_name,
                              predictions_dict_comparison=None,
                              names_algorithms=['MVA', 'Baseline'],
                              weights=None,
                              bins_number=30,
                              ignored_sideband=0.1,
                              thresholds=None,
                              cuts_values=False,
                              ncol=1):
    plt.figure(figsize=(22, 20))
    for n, (name, label) in enumerate(names_labels_correspondence.items()):
        plt.subplot(3, 2, n + 1)
        mask = labels == label
        legends = []
        for preds, name_algo in zip(
            [predictions_dict, predictions_dict_comparison], names_algorithms):
            if preds is None:
                continue
            probs = preds[label][mask]
            if cuts_values:
                # `thresholds` already holds raw cut values in this case
                thresholds_values = thresholds
            else:
                thresholds_values = [
                    weighted_quantile(probs,
                                      quantiles=1 - eff / 100.,
                                      sample_weight=None
                                      if weights is None else weights[mask])
                    for eff in thresholds
                ]
            eff = get_efficiencies(
                probs,
                spectator[mask],
                sample_weight=None if weights is None else weights[mask],
                bins_number=bins_number,
                errors=True,
                ignored_sideband=ignored_sideband,
                thresholds=thresholds_values)
            for thr in thresholds_values:
                eff[thr] = (eff[thr][0], 100 * numpy.array(eff[thr][1]),
                            100 * numpy.array(eff[thr][2]), eff[thr][3])
            plot_fig = ErrorPlot(eff)
            plot_fig.xlabel = '{} {}'.format(name, spectator_name)
            plot_fig.ylabel = 'Efficiency'
            plot_fig.title = name
            plot_fig.ylim = (0, 100)
            plot_fig.plot(fontsize=22)
            plt.xticks(fontsize=12), plt.yticks(fontsize=12)
            legends.append(
                ['{} Eff {}%'.format(thr, name_algo) for thr in thresholds])
        plt.legend(numpy.concatenate(legends),
                   loc='best',
                   fontsize=12,
                   framealpha=0.5,
                   ncol=ncol)
Example 5
def test_weighted_quantile(size=10000):
    x = numpy.random.normal(size=size)
    weights = numpy.random.random(size=size)
    quantile_level = numpy.random.random()

    quantile_value = utils.weighted_quantile(x, quantile_level, sample_weight=weights)

    passed_weight = numpy.sum((x < quantile_value) * weights)
    expected_weight = quantile_level * numpy.sum(weights)
    assert numpy.abs(passed_weight - expected_weight) < 1.1, 'wrong cut'
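The test only requires that the total weight below the returned cut is close to quantile_level times the total weight. As a concrete reference for that semantics, here is a small self-contained sketch of a weighted quantile (a hypothetical helper, not the library's utils.weighted_quantile):

import numpy

def weighted_quantile_sketch(values, quantile, sample_weight=None):
    # Sort the values, accumulate normalized weights and interpolate:
    # the result is where the weighted CDF reaches the requested level.
    values = numpy.asarray(values, dtype=float)
    if sample_weight is None:
        sample_weight = numpy.ones_like(values)
    order = numpy.argsort(values)
    values = values[order]
    weights = numpy.asarray(sample_weight, dtype=float)[order]
    cdf = numpy.cumsum(weights) / numpy.sum(weights)
    return numpy.interp(quantile, cdf, values)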
Example 6
def plot_flatness_particle(labels,
                           predictions_dict,
                           spectator,
                           spectator_name,
                           particle_name,
                           weights=None,
                           bins_number=30,
                           ignored_sideband=0.1,
                           thresholds=None,
                           cuts_values=False):
    plt.figure(figsize=(18, 22))
    for n, (name, label) in enumerate(names_labels_correspondence.items()):
        plt.subplot(3, 2, n + 1)
        mask = labels == names_labels_correspondence[particle_name]
        probs = predictions_dict[label][mask]
        mask_signal = labels == label
        probs_signal = predictions_dict[label][mask_signal]
        if cuts_values:
            # `thresholds` already holds raw cut values in this case
            thresholds_values = thresholds
        else:
            thresholds_values = [
                weighted_quantile(probs_signal,
                                  quantiles=1 - eff / 100.,
                                  sample_weight=None
                                  if weights is None else weights[mask_signal])
                for eff in thresholds
            ]
        eff = get_efficiencies(
            probs,
            spectator[mask],
            sample_weight=None if weights is None else weights[mask],
            bins_number=bins_number,
            errors=True,
            ignored_sideband=ignored_sideband,
            thresholds=thresholds_values)
        for thr in thresholds_values:
            eff[thr] = (eff[thr][0], 100 * numpy.array(eff[thr][1]),
                        100 * numpy.array(eff[thr][2]), eff[thr][3])
        plot_fig = ErrorPlot(eff)
        plot_fig.xlabel = '{} {}'.format(particle_name, spectator_name)
        plot_fig.ylabel = 'Efficiency'
        plot_fig.title = 'MVA {}'.format(name)
        plot_fig.ylim = (0, 100)
        plot_fig.plot(fontsize=22)
        plt.xticks(fontsize=12), plt.yticks(fontsize=12)
        if not cuts_values:
            plt.legend(['Signal Eff {}%'.format(thr) for thr in thresholds],
                       loc='best',
                       fontsize=18,
                       framealpha=0.5)
Example 7
    def get_profiles(prediction,
                     spectator,
                     sample_weight=None,
                     bins_number=20,
                     errors=False,
                     ignored_sideband=0.0):
        """
        Construct profile of prediction vs. spectator
        :param binner: Binner object with bins computed from combined sig+bkg spectator value list
        :param prediction: list of probabilities
        :param spectator: list of spectator's values
        :param bins_number: int, count of bins for plot
        :return:
            if errors=False
            tuple (x_values, y_values)
            if errors=True
            tuple (x_values, y_values, y_err, x_err)
            All the parts: x_values, y_values, y_err, x_err are numpy.arrays of the same length.
        """
        prediction, spectator, sample_weight = check_arrays(
            prediction, spectator, sample_weight)

        spectator_min, spectator_max = weighted_quantile(
            spectator, [ignored_sideband, (1. - ignored_sideband)])
        mask = (spectator >= spectator_min) & (spectator <= spectator_max)
        spectator = spectator[mask]
        prediction = prediction[mask]
        bins_number = min(bins_number, len(prediction))
        sample_weight = sample_weight if sample_weight is None else numpy.array(
            sample_weight)[mask]

        binner = Binner(spectator, bins_number=bins_number)
        if sample_weight is None:
            sample_weight = numpy.ones(len(prediction))
        bins_data = binner.split_into_bins(spectator, prediction,
                                           sample_weight)

        bin_edges = numpy.array([spectator_min] + list(binner.limits) +
                                [spectator_max])
        x_err = numpy.diff(bin_edges) / 2.
        result = OrderedDict()
        x_values = []
        y_values = []
        N_in_bin = []
        y_err = []
        for num, (masses, probabilities, weights) in enumerate(bins_data):
            y_values.append(
                numpy.average(probabilities, weights=weights)
                if len(weights) > 0 and sum(weights) > 0.0 else 0)
            y_err.append(
                numpy.sqrt(
                    numpy.cov(probabilities, aweights=numpy.abs(weights), ddof=0) /
                    numpy.sum(weights)
                ) if len(weights) > 0 and sum(weights) > 0.0 else 0)
            N_in_bin.append(numpy.sum(weights))
            x_values.append((bin_edges[num + 1] + bin_edges[num]) / 2.)

        x_values, y_values, N_in_bin = check_arrays(x_values, y_values,
                                                    N_in_bin)
        if errors:
            return (x_values, y_values, y_err, x_err)
        else:
            return (x_values, y_values)
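A minimal plotting sketch for the profile returned above, again on synthetic data; it assumes get_profiles is accessible as a plain function together with its helpers:

import numpy
import matplotlib.pyplot as plt

rng = numpy.random.RandomState(1)
spectator = rng.exponential(scale=10., size=4000)  # e.g. transverse momentum
prediction = rng.uniform(size=4000)                # classifier outputs

x, y, y_err, x_err = get_profiles(prediction, spectator,
                                  bins_number=20, errors=True,
                                  ignored_sideband=0.05)
plt.errorbar(x, y, yerr=y_err, xerr=x_err, fmt='o')
plt.xlabel('spectator')
plt.ylabel('mean prediction')
plt.show()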
Example 8
def plot_flatness_by_particle(labels,
                              predictions,
                              spectator,
                              spectator_name,
                              predictions_comparison=None,
                              names_algorithms=['MVA', 'Baseline'],
                              for_particle=None,
                              weights=None,
                              bins_number=30,
                              ignored_sideband=0.1,
                              thresholds=None,
                              n_col=1):
    """
    Build a flatness plot, which shows how the selection efficiency depends on a spectator observable.

    :param labels: [n_samples], contains targets
    :param predictions: [n_samples, n_particle_types] with predictions of an algorithm
    :param spectator: [n_samples], values of the spectator variable
    :param spectator_name: str, name shown on the plot
    :param predictions_comparison: [n_samples, n_particle_types], optional second set of predictions for comparison
    :param names_algorithms: names of the compared algorithms
    :param for_particle: str, optional name of the true particle type on which the efficiencies are evaluated
    :param weights: [n_samples], optional sample weights
    :param bins_number: int, number of bins along the spectator
    :param ignored_sideband: float, fraction of the spectator distribution ignored on each side
    :param thresholds: signal efficiencies (in %), for which flatness is drawn
    :param n_col: int, number of columns in the legend.
    """
    plt.figure(figsize=(22, 24))
    if predictions_comparison is not None:
        colors = ['blue', 'green']
        markers = ['o', 's', 'v', 'o', 's', 'v']
    else:
        colors = [None, None]
        markers = ['o'] * len(thresholds)

    for n, (particle_name,
            label) in enumerate(names_labels_correspondence.items()):
        plt.subplot(3, 2, n + 1)
        title = '{} algorithm'.format(particle_name)
        xlim_all = (1e10, -1e10)
        ylim_all = (20, -1e8)
        legends = []
        for preds, algo_name, color in zip(
            [predictions, predictions_comparison], names_algorithms, colors):
            if preds is None:
                continue
            particle_mask = labels == label
            particle_probs = preds[particle_mask, label]
            particle_weights = None if weights is None else weights[
                particle_mask]

            thresholds_values = [
                weighted_quantile(particle_probs,
                                  quantiles=1 - eff / 100.,
                                  sample_weight=particle_weights)
                for eff in thresholds
            ]

            if for_particle is not None:
                particle_mask = labels == names_labels_correspondence[
                    for_particle]
                particle_probs = preds[particle_mask, label]
                particle_weights = None if weights is None else weights[
                    particle_mask]
                title = '{} algorithm for {}'.format(particle_name,
                                                     for_particle)

            eff = get_efficiencies(particle_probs,
                                   spectator[particle_mask],
                                   sample_weight=particle_weights,
                                   bins_number=bins_number,
                                   errors=True,
                                   ignored_sideband=ignored_sideband,
                                   thresholds=thresholds_values)
            for thr in thresholds_values:
                eff[thr] = (eff[thr][0], 100 * numpy.array(eff[thr][1]),
                            100 * numpy.array(eff[thr][2]), eff[thr][3])

            xlim, ylim = compute_limits_and_plot_errorbar(eff,
                                                          markers,
                                                          color=color)
            plt.xlabel('{} {}\n\n'.format(particle_name, spectator_name),
                       fontsize=22)
            plt.ylabel('Efficiency', fontsize=22)
            plt.title('{}\n\n'.format(title), fontsize=22)
            plt.xticks(fontsize=12), plt.yticks(fontsize=12)
            legends.append(
                ['{} Eff {}%'.format(algo_name, thr) for thr in thresholds])
            plt.grid(True)

            xlim_all = (min(xlim_all[0], xlim[0]), max(xlim_all[1], xlim[1]))
            ylim_all = (min(ylim_all[0], ylim[0]), max(ylim_all[1], ylim[1]))
        plt.legend(numpy.concatenate(legends),
                   loc='best',
                   fontsize=16,
                   framealpha=0.5,
                   ncol=n_col)
        plt.xlim(xlim_all[0], xlim_all[1])
        plt.ylim(ylim_all[0], ylim_all[1])
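A hypothetical call illustrating the expected shapes, using toy data. It assumes names_labels_correspondence is the module-level mapping the function iterates over, and that the snippet's helpers (get_efficiencies, weighted_quantile, compute_limits_and_plot_errorbar) are in scope:

import numpy
import matplotlib.pyplot as plt

# Toy mapping with two particle types; the real module defines its own.
names_labels_correspondence = {'Electron': 0, 'Muon': 1}

rng = numpy.random.RandomState(2)
n_samples = 20000
labels = rng.randint(0, 2, size=n_samples)
predictions = rng.dirichlet([1., 1.], size=n_samples)      # one column per particle type
spectator = rng.exponential(scale=10000., size=n_samples)  # e.g. track momentum in MeV/c

plot_flatness_by_particle(labels, predictions, spectator,
                          spectator_name='momentum, MeV/c',
                          thresholds=[60, 70, 80, 90],
                          bins_number=30,
                          ignored_sideband=0.02)
plt.show()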