Example #1
def cvm_flatness(y, proba, X, uniform_variables, sample_weight=None, label=1, knn=30):
    """ The most simple way to compute Cramer-von Mises flatness, this is however very slow
    if you need to compute it many times
    :param y: real classes of events, shape = [n_samples]
    :param proba: predicted probabilities, shape = [n_samples, n_classes]
    :param X: pandas.DataFrame with uniform features (i.e. test dataset)
    :param uniform_variables: features, along which uniformity is desired, list of strings
    :param sample_weight: weights of events, shape = [n_samples]
    :param label: class for which uniformity is measured (usually 0 is background, 1 is signal)
    :param knn: number of nearest neighbours used to build the knn groups

    Example of usage:
    proba = classifier.predict_proba(testX)
    cvm_flatness(testY, proba=proba, X=testX, uniform_variables=['mass'])
    """
    y, proba = check_arrays(y, proba)
    assert len(y) == len(proba) == len(X), 'Different lengths'
    y = column_or_1d(y)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)

    X = pandas.DataFrame(X)

    signal_mask = y == label
    groups_indices = computeSignalKnnIndices(uniform_variables=uniform_variables, dataframe=X,
                                             is_signal=signal_mask, n_neighbors=knn)
    groups_indices = groups_indices[signal_mask, :]

    return ut.group_based_cvm(proba[:, label], mask=signal_mask, groups_indices=groups_indices,
                              sample_weight=sample_weight)
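
A minimal end-to-end usage sketch (not part of the original snippet): it assumes the cvm_flatness function above is in scope together with its helpers (check_arrays, check_sample_weight, computeSignalKnnIndices, ut.group_based_cvm); the toy DataFrame, its column names and the choice of GradientBoostingClassifier are purely illustrative.

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# toy dataset: 'mass' is the feature along which flatness is checked,
# 'x1' and 'x2' are ordinary training features
rng = np.random.RandomState(42)
n = 5000
data = pd.DataFrame({'mass': rng.uniform(0., 10., size=n),
                     'x1': rng.normal(size=n),
                     'x2': rng.normal(size=n)})
labels = (data['x1'] + 0.3 * rng.normal(size=n) > 0).astype(int)

trainX, testX, trainY, testY = train_test_split(data, labels, random_state=0)
proba = GradientBoostingClassifier().fit(trainX, trainY).predict_proba(testX)

# smaller values mean the classifier output is flatter along 'mass' for the signal class
print(cvm_flatness(testY, proba=proba, X=testX, uniform_variables=['mass']))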
Example #2
def sde(y, proba, X, uniform_variables, sample_weight=None, label=1, knn=30):
    """ The most simple way to compute SDE, this is however very slow
    if you need to recompute SDE many times
    :param y: real classes of events, shape = [n_samples]
    :param proba: predicted probabilities, shape = [n_samples, n_classes]
    :param X: pandas.DataFrame with uniform features
    :param uniform_variables: features, along which uniformity is desired, list of strings
    :param sample_weight: weights of events, shape = [n_samples]
    :param label: class for which uniformity is measured (usually 0 is background, 1 is signal)
    :param knn: number of nearest neighbours used to build the knn groups

    Example of usage:
    proba = classifier.predict_proba(testX)
    sde(testY, proba=proba, X=testX, uniform_variables=['mass'])
    """
    y, proba = check_arrays(y, proba)
    assert len(y) == len(proba) == len(X), 'Different lengths'

    y = column_or_1d(y)
    sample_weight = check_sample_weight(y, sample_weight=sample_weight)

    X = pandas.DataFrame(X)
    mask = y == label
    groups = computeSignalKnnIndices(uniform_variables=uniform_variables,
                                     dataframe=X,
                                     is_signal=mask,
                                     n_neighbors=knn)
    groups = groups[mask, :]

    return ut.compute_sde_on_groups(
        proba[:, label],
        mask=mask,
        groups_indices=groups,
        target_efficiencies=[0.5, 0.6, 0.7, 0.8, 0.9],
        sample_weight=sample_weight)
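
For intuition about the quantity ut.compute_sde_on_groups evaluates, here is a small self-contained sketch. It is an illustrative, unweighted re-derivation, not the library implementation (the exact weighting and normalisation in ut.compute_sde_on_groups may differ, and the helper name sde_sketch is my own): for each target global efficiency a cut on the signal predictions is chosen, the per-group signal efficiencies over the knn groups are computed, and their spread around the target is averaged.

import numpy as np

def sde_sketch(signal_pred, groups_indices, target_efficiencies=(0.5, 0.6, 0.7, 0.8, 0.9)):
    # signal_pred: predicted probabilities for signal events only
    # groups_indices: knn groups, each row listing indices into signal_pred
    signal_pred = np.asarray(signal_pred, dtype=float)
    deviations = []
    for eff in target_efficiencies:
        # cut chosen so that a fraction `eff` of all signal events passes it
        cut = np.quantile(signal_pred, 1. - eff)
        passed = signal_pred > cut
        # signal efficiency inside each knn group
        group_eff = np.array([passed[group].mean() for group in groups_indices])
        deviations.append(np.mean((group_eff - eff) ** 2))
    return np.sqrt(np.mean(deviations))

# toy check: predictions unrelated to the grouping give only the small value expected from binomial noise
rng = np.random.RandomState(0)
pred = rng.uniform(size=200)
groups = rng.randint(0, 200, size=(200, 30))   # 200 groups of 30 neighbour indices each
print(sde_sketch(pred, groups))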
Example #3
    def fit(self, X, y, sample_weight=None, iterations=100, loss=None):
        X, y = check_arrays(X, y)
        self.n_features = X.shape[1]
        sample_weight = check_sample_weight(y, sample_weight=sample_weight)
        if loss is None:
            loss = BinomialDevianceLossFunction()
        loss.fit(X, y, sample_weight=sample_weight)

        self.coeffs = numpy.zeros([self.compute_n_features(), 2 ** self.power_categories], dtype='float')
        y_pred = numpy.zeros(len(X), dtype='float')

        for iteration in range(iterations):
            print(iteration, loss(y_pred))

            for feature, feature_values in enumerate(self.enumerate_features(X)):
                # TODO compute once per iteration!
                ngradient = loss.negative_gradient(y_pred)

                nominator = numpy.bincount(feature_values, weights=ngradient, minlength=2 ** self.power_categories)
                nominator -= 2 * self.l2_reg * self.coeffs[feature, :] + self.l1_reg * numpy.sign(self.coeffs[feature, :])

                denominator = numpy.abs(ngradient) * (1. - numpy.abs(ngradient))
                denominator = numpy.bincount(feature_values, weights=denominator, minlength=2 ** self.power_categories)
                denominator += 2 * self.l2_reg

                gradients = nominator / denominator
                right_gradients = gradients
                # coefficients that are currently zero stay zero unless the step exceeds the L1 threshold
                mask = (self.coeffs[feature, :] == 0) & (numpy.abs(gradients) < self.l1_reg)
                right_gradients[mask] = 0
                # coefficients that are currently nonzero are truncated to zero if the update flips their sign
                old_coeffs = self.coeffs[feature, :]
                new_coeffs = old_coeffs + self.learning_rate * right_gradients
                new_coeffs[new_coeffs * old_coeffs < 0] = 0
                self.coeffs[feature, :] = new_coeffs
                y_diff = numpy.take(new_coeffs - old_coeffs, feature_values)
                y_pred += y_diff

        return self
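
The inner loop above performs, for every categorical feature, a Newton-like coordinate update of its per-bin coefficients with L1/L2 regularisation. Below is a self-contained sketch of that single-feature update step; the function name regularized_update and the small epsilon guarding against empty bins are my additions, and the Hessian proxy assumes binomial deviance (for labels in {0, 1} with g = y - p, |g| * (1 - |g|) equals p * (1 - p)), mirroring the loss used by default in the snippet.

import numpy as np

def regularized_update(feature_values, ngradient, coeffs, n_bins,
                       learning_rate=0.1, l1_reg=0., l2_reg=0.):
    # feature_values: integer bin id of this feature for every event
    # ngradient: negative gradient of the loss at the current predictions
    # coeffs: current per-bin coefficients for this feature (length n_bins)

    # numerator: summed negative gradient per bin, minus the gradient of the L2 and L1 penalties
    numerator = np.bincount(feature_values, weights=ngradient, minlength=n_bins)
    numerator -= 2 * l2_reg * coeffs + l1_reg * np.sign(coeffs)

    # denominator: per-bin sum of |g| * (1 - |g|), a diagonal Hessian proxy for binomial deviance,
    # plus the L2 curvature; the epsilon (not in the original) avoids division by zero for empty bins
    hess = np.abs(ngradient) * (1. - np.abs(ngradient))
    denominator = np.bincount(feature_values, weights=hess, minlength=n_bins) + 2 * l2_reg + 1e-10

    step = numerator / denominator
    # bins whose coefficient is zero stay at zero unless the step beats the L1 threshold
    step[(coeffs == 0) & (np.abs(step) < l1_reg)] = 0
    new_coeffs = coeffs + learning_rate * step
    # nonzero coefficients that would flip sign are truncated to zero
    new_coeffs[new_coeffs * coeffs < 0] = 0
    return new_coeffs

In the fit method this update is applied to every feature within each boosting iteration, and the per-event predictions are then shifted by numpy.take(new_coeffs - old_coeffs, feature_values).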