Beispiel #1
0
def cvm_flatness(y,
                 proba,
                 X,
                 uniform_features,
                 sample_weight=None,
                 label=1,
                 knn=30):
    """ Ready-to-use Cramer-von Mises flatness of the predictions for one class.

    The neighbour groups are rebuilt on every call, so this is very slow
    if the metric has to be computed many times.

    :param y: real classes of events, shape = [n_samples]
    :param proba: predicted probabilities, shape = [n_samples, n_classes]
    :param X: pandas.DataFrame with uniform features (i.e. test dataset);
        it is passed through pandas.DataFrame(...) before columns are selected
    :param uniform_features: features, along which uniformity is desired, list of strings
    :param sample_weight: weights of events, shape = [n_samples]
    :param label: class, for which uniformity is measured (usually, 0 is bck, 1 is signal)
    :param knn: number of nearest neighbours used in knn
    :return: whatever group_based_cvm returns for the knn groups of class `label`

    Example of usage:
    proba = classifier.predict_proba(testX)
    cvm_flatness(testY, proba=proba, X=testX, uniform_features=['mass'])
    """
    assert len(y) == len(proba) == len(X), 'Different lengths'
    frame = pandas.DataFrame(X)
    is_label = y == label

    # The helper result is row-indexed with the class mask below,
    # so only the rows of events of the requested class are kept.
    all_neighbours = compute_knn_indices_of_signal(
        frame[uniform_features], is_signal=is_label, n_neighbours=knn)
    class_neighbours = all_neighbours[is_label, :]

    label_proba = proba[:, label]
    return group_based_cvm(label_proba, mask=is_label,
                           groups_indices=class_neighbours, sample_weight=sample_weight)
Beispiel #2
0
def sde(y, proba, X, uniform_features, sample_weight=None, label=1, knn=30):
    """ Ready-to-use SDE computation for one class.

    Neighbour groups are rebuilt on every call, which makes this very slow
    if SDE has to be recomputed many times.

    :param y: real classes of events, shape = [n_samples]
    :param proba: predicted probabilities, shape = [n_samples, n_classes]
    :param X: pandas.DataFrame with uniform features;
        it is passed through pandas.DataFrame(...) before columns are selected
    :param uniform_features: features, along which uniformity is desired, list of strings
    :param sample_weight: weights of events, shape = [n_samples]
    :param label: class, for which uniformity is measured (usually, 0 is bck, 1 is signal)
    :param knn: number of nearest neighbours used in knn
    :return: whatever compute_sde_on_groups returns for the target
        efficiencies 0.5, 0.6, 0.7, 0.8 and 0.9

    Example of usage:
    proba = classifier.predict_proba(testX)
    sde(testY, proba=proba, X=testX, uniform_features=['mass'])
    """
    assert len(y) == len(proba) == len(X), 'Different lengths'
    frame = pandas.DataFrame(X)
    is_label = y == label

    # Keep only the neighbour rows that belong to events of the requested class.
    class_groups = compute_knn_indices_of_signal(
        frame[uniform_features], is_signal=is_label, n_neighbours=knn)[is_label, :]

    efficiencies = [0.5, 0.6, 0.7, 0.8, 0.9]
    return compute_sde_on_groups(
        proba[:, label],
        mask=is_label,
        groups_indices=class_groups,
        target_efficiencies=efficiencies,
        sample_weight=sample_weight,
    )
def theil_flatness(y, proba, X, uniform_variables, sample_weight=None, label=1, knn=30):
    """This is ready-to-use function, and it is quite slow to use many times

    :param y: real classes of events, compared element-wise with `label`
    :param proba: predicted probabilities, indexed as proba[:, label]
    :param X: pandas.DataFrame with uniform variables (anything accepted by
        pandas.DataFrame is converted first)
    :param uniform_variables: features, along which uniformity is desired, list of strings
    :param sample_weight: weights of events, forwarded to compute_theil_on_groups
    :param label: class, for which uniformity is measured
    :param knn: number of nearest neighbours used in knn
    :return: whatever compute_theil_on_groups returns for the target
        efficiencies 0.5, 0.6, 0.7, 0.8 and 0.9

    Example of usage:
    proba = classifier.predict_proba(testX)
    theil_flatness(testY, proba=proba, X=testX, uniform_variables=['mass'])
    """
    # Same input validation and normalisation as the sibling metrics (sde, cvm_flatness):
    # a length mismatch is reported up front instead of failing somewhere inside the helpers.
    assert len(y) == len(proba) == len(X), 'Different lengths'
    X = pandas.DataFrame(X)

    mask = y == label
    groups_indices = compute_knn_indices_of_signal(X[uniform_variables], is_signal=mask, n_neighbours=knn)
    # Only the neighbour rows of events of the requested class are used as groups.
    groups_indices = groups_indices[mask, :]

    return compute_theil_on_groups(proba[:, label], mask=mask, groups_indices=groups_indices,
                                   target_efficiencies=[0.5, 0.6, 0.7, 0.8, 0.9], sample_weight=sample_weight)
def test_compute_knn_indices(n_events=100):
    """Check the neighbour indices returned by the two knn helpers on a generated sample.

    For compute_knn_indices_of_signal: every returned neighbour must be a signal
    event, and no signal event left out of the neighbour list may be closer
    (along the first column of X) than the farthest returned neighbour.
    For compute_knn_indices_of_same_class: every neighbour must have the same
    signal/background flag as the event itself.

    :param n_events: first argument passed to generate_sample
    """
    knn = 10
    X, y = generate_sample(n_events, 10, distance=.5)
    is_signal = y > 0.5
    signal_indices = numpy.where(is_signal)[0]
    uniform_columns = X.columns[:1]
    uniform_data = X[uniform_columns]

    signal_neighbours = compute_knn_indices_of_signal(uniform_data, is_signal, knn)
    distances = pairwise_distances(uniform_data)
    for event, neighbours in enumerate(signal_neighbours):
        assert numpy.all(is_signal[neighbours]), "returned indices are not signal"
        # signal events that were not selected as neighbours of this event
        left_out = [index for index in signal_indices if index not in neighbours]
        closest_left_out = numpy.min(distances[event, left_out])
        farthest_neighbour = numpy.max(distances[event, neighbours])
        assert closest_left_out >= farthest_neighbour, "distances are set wrongly!"

    same_class_neighbours = compute_knn_indices_of_same_class(uniform_data, is_signal, knn)
    for event, neighbours in enumerate(same_class_neighbours):
        same_flag = is_signal[neighbours] == is_signal[event]
        assert numpy.all(same_flag), "returned indices are not signal/bg"
Beispiel #5
0
def theil_flatness(y,
                   proba,
                   X,
                   uniform_features,
                   sample_weight=None,
                   label=1,
                   knn=30):
    """This is ready-to-use function, and it is quite slow to use many times

    :param y: real classes of events, compared element-wise with `label`
    :param proba: predicted probabilities, indexed as proba[:, label]
    :param X: pandas.DataFrame with uniform features (anything accepted by
        pandas.DataFrame is converted first)
    :param uniform_features: features, along which uniformity is desired, list of strings
    :param sample_weight: weights of events, forwarded to compute_theil_on_groups
    :param label: class, for which uniformity is measured
    :param knn: number of nearest neighbours used in knn
    :return: whatever compute_theil_on_groups returns for the target
        efficiencies 0.5, 0.6, 0.7, 0.8 and 0.9

    Example of usage:
    proba = classifier.predict_proba(testX)
    theil_flatness(testY, proba=proba, X=testX, uniform_features=['mass'])
    """
    # Same input validation and normalisation as the sibling metrics (sde, cvm_flatness):
    # a length mismatch is reported up front instead of failing somewhere inside the helpers.
    assert len(y) == len(proba) == len(X), 'Different lengths'
    X = pandas.DataFrame(X)

    mask = y == label
    groups_indices = compute_knn_indices_of_signal(X[uniform_features],
                                                   is_signal=mask,
                                                   n_neighbours=knn)
    # Only the neighbour rows of events of the requested class are used as groups.
    groups_indices = groups_indices[mask, :]

    return compute_theil_on_groups(
        proba[:, label],
        mask=mask,
        groups_indices=groups_indices,
        target_efficiencies=[0.5, 0.6, 0.7, 0.8, 0.9],
        sample_weight=sample_weight)
def test_compute_knn_indices(n_events=100):
    """Sanity-check compute_knn_indices_of_signal and compute_knn_indices_of_same_class.

    A sample is generated, the first column of X is used as the only uniform
    column, and for every event the test verifies that:
    - all neighbours returned by compute_knn_indices_of_signal are signal events;
    - the nearest signal event that was NOT returned is at least as far away as
      the farthest returned neighbour;
    - all neighbours returned by compute_knn_indices_of_same_class share the
      signal/background flag of the event.

    :param n_events: first argument passed to generate_sample
    """
    X, y = generate_sample(n_events, 10, distance=.5)
    is_signal = y > 0.5
    signal_indices = numpy.where(is_signal)[0]
    uniform_columns = X.columns[:1]
    n_neighbours = 10

    projected = X[uniform_columns]
    knn_of_signal = compute_knn_indices_of_signal(projected, is_signal, n_neighbours)
    distances = pairwise_distances(projected)

    for row, neighbours in enumerate(knn_of_signal):
        only_signal = numpy.all(is_signal[neighbours])
        assert only_signal, "returned indices are not signal"

        # signal events outside the neighbour list must not be closer than any neighbour
        others = [idx for idx in signal_indices if idx not in neighbours]
        nearest_other = numpy.min(distances[row, others])
        farthest_kept = numpy.max(distances[row, neighbours])
        assert nearest_other >= farthest_kept, "distances are set wrongly!"

    knn_of_same_class = compute_knn_indices_of_same_class(projected, is_signal, n_neighbours)
    for row, neighbours in enumerate(knn_of_same_class):
        matches_own_class = numpy.all(is_signal[neighbours] == is_signal[row])
        assert matches_own_class, "returned indices are not signal/bg"
def sde(y, proba, X, uniform_variables, sample_weight=None, label=1, knn=30):
    """ Convenience wrapper that computes SDE for one class in a single call.

    The knn groups are recomputed each time, so this is very slow
    if you need to recompute SDE many times.

    :param y: real classes of events, shape = [n_samples]
    :param proba: predicted probabilities, shape = [n_samples, n_classes]
    :param X: pandas.DataFrame with uniform features;
        it is passed through pandas.DataFrame(...) before columns are selected
    :param uniform_variables: features, along which uniformity is desired, list of strings
    :param sample_weight: weights of events, shape = [n_samples]
    :param label: class, for which uniformity is measured (usually, 0 is bck, 1 is signal)
    :param knn: number of nearest neighbours used in knn
    :return: whatever compute_sde_on_groups returns for the target
        efficiencies 0.5, 0.6, 0.7, 0.8 and 0.9

    Example of usage:
    proba = classifier.predict_proba(testX)
    sde(testY, proba=proba, X=testX, uniform_variables=['mass'])
    """
    assert len(y) == len(proba) == len(X), 'Different lengths'
    data = pandas.DataFrame(X)
    class_mask = y == label

    knn_indices = compute_knn_indices_of_signal(
        data[uniform_variables],
        is_signal=class_mask,
        n_neighbours=knn,
    )
    # restrict the groups to the rows of events of the requested class
    class_groups = knn_indices[class_mask, :]

    class_proba = proba[:, label]
    return compute_sde_on_groups(class_proba, mask=class_mask, groups_indices=class_groups,
                                 target_efficiencies=[0.5, 0.6, 0.7, 0.8, 0.9],
                                 sample_weight=sample_weight)
def cvm_flatness(y, proba, X, uniform_variables, sample_weight=None, label=1, knn=30):
    """ Convenience wrapper that computes Cramer-von Mises flatness in a single call.

    The knn groups are recomputed each time, so this is very slow
    if you need to compute it many times.

    :param y: real classes of events, shape = [n_samples]
    :param proba: predicted probabilities, shape = [n_samples, n_classes]
    :param X: pandas.DataFrame with uniform features (i.e. test dataset);
        it is passed through pandas.DataFrame(...) before columns are selected
    :param uniform_variables: features, along which uniformity is desired, list of strings
    :param sample_weight: weights of events, shape = [n_samples]
    :param label: class, for which uniformity is measured (usually, 0 is bck, 1 is signal)
    :param knn: number of nearest neighbours used in knn
    :return: whatever group_based_cvm returns for the knn groups of class `label`

    Example of usage:
    proba = classifier.predict_proba(testX)
    cvm_flatness(testY, proba=proba, X=testX, uniform_variables=['mass'])
    """
    assert len(y) == len(proba) == len(X), 'Different lengths'
    data = pandas.DataFrame(X)
    class_mask = y == label

    # restrict the neighbour table to the rows of events of the requested class
    class_groups = compute_knn_indices_of_signal(
        data[uniform_variables],
        is_signal=class_mask,
        n_neighbours=knn,
    )[class_mask, :]

    return group_based_cvm(
        proba[:, label],
        mask=class_mask,
        groups_indices=class_groups,
        sample_weight=sample_weight,
    )