def test_gb_with_ada_and_log(n_samples=1000, n_features=10, distance=0.6):
    """
    Testing with two main classification losses.
    Also testing copying
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    for loss in [LogLossFunction(), AdaLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss,
                                          min_samples_split=20,
                                          max_depth=5,
                                          learning_rate=.2,
                                          subsample=0.7,
                                          n_estimators=10,
                                          train_features=None)
        clf.fit(trainX, trainY)
        assert clf.n_features == n_features
        assert len(clf.feature_importances_) == n_features
        # check that staged_predict_proba works; the last stage must match predict_proba
        for p in clf.staged_predict_proba(testX):
            assert p.shape == (n_samples, 2)
        assert numpy.all(p == clf.predict_proba(testX))
        assert roc_auc_score(testY, p[:, 1]) > 0.8, 'quality is too low'
        # checking clonability
        _ = clone(clf)
        clf_copy = copy.deepcopy(clf)
        assert numpy.all(
            clf.predict_proba(trainX) == clf_copy.predict_proba(
                trainX)), 'copied classifier is different'
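
A minimal sketch (not part of the test suite) of how staged_predict_proba can also be used to monitor quality per boosting stage: track the test AUC after each stage and confirm the last stage matches predict_proba. The import paths below assume the hep_ml package layout these tests are written against.

import numpy
from sklearn.metrics import roc_auc_score
from hep_ml.commonutils import generate_sample
from hep_ml.losses import LogLossFunction
from hep_ml.gradientboosting import UGradientBoostingClassifier

trainX, trainY = generate_sample(1000, 10, distance=0.6)
testX, testY = generate_sample(1000, 10, distance=0.6)

clf = UGradientBoostingClassifier(loss=LogLossFunction(), n_estimators=20,
                                  max_depth=5, learning_rate=0.2, subsample=0.7)
clf.fit(trainX, trainY)

# AUC after each boosting stage; the last entry corresponds to predict_proba
stage_aucs = [roc_auc_score(testY, p[:, 1]) for p in clf.staged_predict_proba(testX)]
print(stage_aucs[-1])
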
def test_gradient_boosting(n_samples=1000):
    """
    Testing workability of GradientBoosting with different loss function
    """
    # Generating samples correlated with the first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get a uniform distribution along this variable
    uniform_features = ['column0']

    loss1 = LogLossFunction()
    loss2 = AdaLossFunction()
    loss3 = losses.CompositeLossFunction()
    loss4 = losses.KnnAdaLossFunction(uniform_features=uniform_features,
                                      uniform_label=1)
    loss5 = losses.KnnAdaLossFunction(uniform_features=uniform_features,
                                      uniform_label=[0, 1])
    loss6bin = losses.BinFlatnessLossFunction(uniform_features,
                                              fl_coefficient=2.,
                                              uniform_label=0)
    loss7bin = losses.BinFlatnessLossFunction(uniform_features,
                                              fl_coefficient=2.,
                                              uniform_label=[0, 1])
    loss6knn = losses.KnnFlatnessLossFunction(uniform_features,
                                              fl_coefficient=2.,
                                              uniform_label=1)
    loss7knn = losses.KnnFlatnessLossFunction(uniform_features,
                                              fl_coefficient=2.,
                                              uniform_label=[0, 1])

    for loss in [
            loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn,
            loss7knn
    ]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_features=None) \
            .fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(
            result, loss)

    trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX))
    for loss in [
            losses.MSELossFunction(),
            losses.MAELossFunction(),
            losses.RankBoostLossFunction(request_column='fake_request')
    ]:
        print(loss)
        clf = UGradientBoostingRegressor(loss=loss,
                                         max_depth=3,
                                         n_estimators=50,
                                         learning_rate=0.01,
                                         subsample=0.5,
                                         train_features=list(
                                             trainX.columns[1:]))
        clf.fit(trainX, trainY)
        roc_auc = roc_auc_score(testY, clf.predict(testX))
        assert roc_auc >= 0.7, "The quality is too poor: {} with loss: {}".format(
            roc_auc, loss)
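
As a rough illustration (not part of the test) of what the flatness losses above optimize for, the sketch below trains with BinFlatnessLossFunction and checks that the mean predicted probability varies little across quartile bins of the uniform feature 'column0'. The hep_ml import paths are assumed, as above.

import numpy
from hep_ml import losses
from hep_ml.commonutils import generate_sample
from hep_ml.gradientboosting import UGradientBoostingClassifier

trainX, trainY = generate_sample(1000, 10, 0.6)
loss = losses.BinFlatnessLossFunction(['column0'], fl_coefficient=2., uniform_label=1)
clf = UGradientBoostingClassifier(loss=loss, n_estimators=25, max_depth=5,
                                  learning_rate=0.2, subsample=0.7).fit(trainX, trainY)

proba = clf.predict_proba(trainX)[:, 1]
# quartile bins of the uniform feature; the per-bin means should be roughly equal
bins = numpy.digitize(trainX['column0'], numpy.percentile(trainX['column0'], [25, 50, 75]))
for b in range(4):
    print(b, proba[bins == b].mean())
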
Example #3
    def __init__(self,
                 learning_rate=1.0,
                 regularization=100.,
                 n_units=10,
                 iterations=30,
                 n_thresholds=10,
                 max_overlap=20,
                 sign=+1):
        self.learning_rate = learning_rate
        self.regularization = regularization
        self.bias_regularization = regularization * 0.1
        self.n_units = n_units
        self.iterations = iterations
        self.n_thresholds = n_thresholds
        self.loss = LogLossFunction()
        self.max_overlap = max_overlap
        self.sign = sign
        self.unit_signs = numpy.ones(n_units) * sign
Example #4
def test_weight_misbalance(n_samples=1000, n_features=10, distance=0.6):
    """
    Testing how classifiers work with highly misbalanced (in the terms of weights) datasets.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    trainW = trainY * 10000 + 1
    testW = testY * 10000 + 1
    for loss in [LogLossFunction(), AdaLossFunction(), losses.CompositeLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10, train_features=None)
        clf.fit(trainX, trainY, sample_weight=trainW)
        p = clf.predict_proba(testX)
        assert roc_auc_score(testY, p[:, 1], sample_weight=testW) > 0.8, 'quality is too low'
Example #5
    def fit(self, X, y):
        X = numpy.array(X)
        self.loss = LogLossFunction()
        self.loss.fit(X, y, sample_weight=y * 0 + 1)
        max_cats = numpy.max(X) + 1
        self.cat_biases = numpy.zeros([max_cats, X.shape[1]], dtype='float')
        predictions = numpy.zeros(len(X))
        for stage in range(self.n_iterations):
            for column in range(X.shape[1]):
                grads = self.loss.negative_gradient(predictions)
                hesss = self.loss.hessian(predictions)
                inds = X[:, column]
                # regularized Newton step for the biases of this categorical column
                nominator = numpy.bincount(
                    inds, weights=grads, minlength=max_cats
                ) - self.regularization * self.cat_biases[:, column]
                denominator = numpy.bincount(
                    inds, weights=hesss,
                    minlength=max_cats) + self.regularization
                predictions -= self.cat_biases[inds, column]
                self.cat_biases[:, column] += nominator / denominator
                predictions += self.cat_biases[inds, column]
            print(stage, self.loss(predictions))
        return self
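
The inner loop above is a regularized Newton step per categorical column: for each category, the numerator sums the negative gradients of the samples in that category minus the L2 penalty on the current bias, and the denominator sums the hessians plus the regularization. A minimal self-contained sketch of the same update, with an explicit logistic gradient/hessian standing in for hep_ml's LogLossFunction, could look like this:

import numpy
from scipy.special import expit

rng = numpy.random.RandomState(0)
n, max_cats, reg = 1000, 5, 100.
inds = rng.randint(0, max_cats, size=n)     # one categorical column
y = rng.randint(0, 2, size=n)               # binary labels
biases = numpy.zeros(max_cats)
predictions = numpy.zeros(n)

for stage in range(10):
    p = expit(predictions)
    grads = y - p                           # negative gradient of the log loss
    hesss = p * (1 - p)                     # hessian of the log loss
    nominator = numpy.bincount(inds, weights=grads, minlength=max_cats) - reg * biases
    denominator = numpy.bincount(inds, weights=hesss, minlength=max_cats) + reg
    predictions -= biases[inds]
    biases += nominator / denominator
    predictions += biases[inds]
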
Example #6
class Logistic_My:
    def __init__(self, regularization, n_iterations=10):
        self.regularization = regularization
        self.n_iterations = n_iterations

    def fit(self, X, y):
        X = numpy.array(X)
        self.loss = LogLossFunction()
        self.loss.fit(X, y, sample_weight=y * 0 + 1)
        max_cats = numpy.max(X) + 1
        self.cat_biases = numpy.zeros([max_cats, X.shape[1]], dtype='float')
        predictions = numpy.zeros(len(X))
        for stage in range(self.n_iterations):
            for column in range(X.shape[1]):
                grads = self.loss.negative_gradient(predictions)
                hesss = self.loss.hessian(predictions)
                inds = X[:, column]
                nominator = numpy.bincount(
                    inds, weights=grads, minlength=max_cats
                ) - self.regularization * self.cat_biases[:, column]
                denominator = numpy.bincount(
                    inds, weights=hesss,
                    minlength=max_cats) + self.regularization
                predictions -= self.cat_biases[inds, column]
                self.cat_biases[:, column] += nominator / denominator
                predictions += self.cat_biases[inds, column]
            print(stage, self.loss(predictions))
        return self

    def predict_proba(self, X):
        X = numpy.array(X)
        predictions = numpy.zeros(len(X))
        for column in range(X.shape[1]):
            predictions += self.cat_biases[X[:, column], column]
        return predictions

    def predict_train(self, X):
        X = numpy.array(X)
        predictions = self.predict_proba(X)

        grads = self.loss.negative_gradient(predictions)
        hesss = self.loss.hessian(predictions)
        prediction_shift = numpy.zeros(len(X))
        for column in range(X.shape[1]):
            inds = X[:, column]
            cum_grads = numpy.bincount(inds, weights=grads)[inds]
            cum_hess = numpy.bincount(
                inds, weights=hesss)[inds] + self.regularization
            prediction_shift += -grads / cum_hess

        return predictions + prediction_shift
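
A hypothetical usage of the Logistic_My model above on purely categorical integer features (category indices starting at 0) might look like the following; it assumes hep_ml's LogLossFunction is importable in this namespace, as in the snippets above. Note that predict_proba here returns the raw additive score (log-odds), not calibrated probabilities.

import numpy

rng = numpy.random.RandomState(42)
X = rng.randint(0, 4, size=(500, 3))        # 3 categorical columns, 4 categories each
y = (X[:, 0] + rng.randint(0, 2, size=500) > 2).astype(int)

model = Logistic_My(regularization=10., n_iterations=5).fit(X, y)
scores = model.predict_proba(X)             # raw additive scores, not probabilities
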
Example #7
def test_gradient_boosting(n_samples=1000):
    """
    Testing workability of GradientBoosting with different loss function
    """
    # Generating samples correlated with the first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get a uniform distribution along this variable
    uniform_features = ['column0']

    loss1 = LogLossFunction()
    loss2 = AdaLossFunction()
    loss3 = CompositeLossFunction()
    loss4 = KnnAdaLossFunction(uniform_features=uniform_features,
                               uniform_label=1)
    loss5 = KnnAdaLossFunction(uniform_features=uniform_features,
                               uniform_label=[0, 1])
    loss6bin = BinFlatnessLossFunction(uniform_features,
                                       fl_coefficient=2.,
                                       uniform_label=0)
    loss7bin = BinFlatnessLossFunction(uniform_features,
                                       fl_coefficient=2.,
                                       uniform_label=[0, 1])
    loss6knn = KnnFlatnessLossFunction(uniform_features,
                                       fl_coefficient=2.,
                                       uniform_label=1)
    loss7knn = KnnFlatnessLossFunction(uniform_features,
                                       fl_coefficient=2.,
                                       uniform_label=[0, 1])

    for loss in [
            loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn,
            loss7knn
    ]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_features=None) \
            .fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(
            result, loss)
Example #8
    def fit(self, X_cat, y):
        X_cat = numpy.array(X_cat)
        self.unit_weights = numpy.random.normal(size=self.n_units)
        self.cat_weights = []
        for column in X_cat.T:
            self.cat_weights.append(
                numpy.random.normal(size=[numpy.max(column) + 1, self.n_units])
                * 0.1)
        loss = LogLossFunction()
        loss.fit(X_cat, y, y * 0 + 1.)

        unit_predictions, predictions = self.compute_all(X_cat)

        # Training process
        for iteration in range(self.n_iterations):
            new_unit_predictions, new_predictions = self.compute_all(X_cat)
            assert numpy.allclose(predictions, new_predictions)
            predictions = new_predictions
            assert numpy.allclose(unit_predictions, new_unit_predictions)
            unit_predictions = new_unit_predictions

            for unit in range(self.n_units):
                # updating coefficient for unit

                for updated_unit in [unit]:
                    grads = loss.negative_gradient(predictions)
                    hesss = loss.hessian(predictions)
                    unit_outputs = self.activation(
                        unit_predictions[:, updated_unit])
                    nom = numpy.dot(grads, unit_outputs)
                    denom = (numpy.dot(hesss, unit_outputs**2) +
                             self.regularization)
                    step = 0.5 * nom / denom
                    self.unit_weights[updated_unit] += step
                    predictions += step * unit_outputs

                for column in range(X_cat.shape[1]):
                    inds = X_cat[:, column]
                    # updating with respect to column and unit
                    unit_outputs, unit_derivs, unit_hesss = self.act_grad_hess(
                        unit_predictions[:, unit])

                    unit_weight = self.unit_weights[unit]
                    grads = loss.negative_gradient(predictions) * unit_weight
                    hesss = loss.hessian(predictions) * unit_weight**2

                    cat_grads = grads * unit_derivs
                    cat_hesss = hesss * (unit_derivs**2) + grads * unit_hesss

                    max_cats = self.cat_weights[column].shape[0]

                    nominator = numpy.bincount(inds,
                                               weights=cat_grads,
                                               minlength=max_cats)
                    nominator -= self.regularization * self.cat_weights[
                        column][:, unit]

                    cat_steps = nominator / (
                        numpy.bincount(inds, weights=cat_hesss.clip(0), minlength=max_cats)
                        + self.regularization)
                    cat_steps *= 1.5

                    self.cat_weights[column][:, unit] += cat_steps
                    predictions -= self.unit_weights[unit] * unit_outputs
                    unit_predictions[:, unit] += cat_steps[inds]
                    unit_outputs = self.activation(unit_predictions[:, unit])
                    predictions += self.unit_weights[unit] * unit_outputs

                    print(iteration, unit, column, loss(predictions))

        return self
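
The unit-weight update above is a damped, regularized Newton step on a single coefficient w multiplying the unit activations a: step = 0.5 * sum(g * a) / (sum(h * a**2) + regularization), with g the negative gradients and h the hessians of the loss. A tiny numpy illustration of that formula with made-up g, h and a:

import numpy

rng = numpy.random.RandomState(1)
a = rng.uniform(size=200)                   # activations of one unit
g = rng.normal(size=200)                    # negative gradients of the loss
h = rng.uniform(0.1, 0.3, size=200)         # hessians of the loss (positive)
regularization = 100.

# damped, regularized Newton step on the unit weight
step = 0.5 * numpy.dot(g, a) / (numpy.dot(h, a ** 2) + regularization)
print(step)
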
Example #9
class StallsFM(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 learning_rate=1.0,
                 regularization=100.,
                 n_units=10,
                 iterations=30,
                 n_thresholds=10,
                 max_overlap=20,
                 sign=+1):
        self.learning_rate = learning_rate
        self.regularization = regularization
        self.bias_regularization = regularization * 0.1
        self.n_units = n_units
        self.iterations = iterations
        self.n_thresholds = n_thresholds
        self.loss = LogLossFunction()
        self.max_overlap = max_overlap
        self.sign = sign
        self.unit_signs = numpy.ones(n_units) * sign

    def decompose_data(self, X, fit=False):
        # hack to support both pandas and numpy.arrays
        X = pandas.DataFrame(X)

        if fit:
            self.is_sequential = numpy.array(
                [column.dtype == 'float' for name, column in X.items()])
            self.codings = []
            self.codings.append([0])
            for name, column in X.items():
                if column.dtype == 'float':
                    self.codings.append(
                        numpy.percentile(
                            column,
                            numpy.linspace(0, 100,
                                           self.n_thresholds + 1)[1:-1]))
                else:
                    self.codings.append(numpy.unique(column))

        X_categoricals = []
        X_categoricals.append(numpy.zeros(len(X), dtype=int))
        for is_seq, coding, (name, column) in zip(self.is_sequential,
                                                  self.codings[1:],
                                                  X.items()):
            if is_seq:
                X_categoricals.append(numpy.searchsorted(coding, column))
            else:
                X_categoricals.append(
                    (numpy.searchsorted(coding, column) + 1) *
                    numpy.in1d(column, coding))
        return numpy.array(X_categoricals).T

    def compute_grad_hess(self, predictions):
        return self.loss.negative_gradient(predictions), self.loss.hessian(
            predictions)

    def fit(self, X, y):
        self.classes_, y = numpy.unique(y, return_inverse=True)
        assert len(self.classes_) == 2, 'only two classes supported'
        X_cat = self.decompose_data(X, fit=True)

        self.cat_biases = [
            numpy.zeros(len(coding) + 1) for coding in self.codings
        ]
        self.cat_representations = [
            numpy.random.normal(size=[len(coding) + 1, self.n_units]) * 0.1
            for coding in self.codings
        ]
        self.connections = numpy.zeros([X_cat.shape[1], self.n_units])
        max_overlap = min(self.max_overlap, X_cat.shape[1])
        self.connections[:] = generate_connections(X_cat.shape[1],
                                                   self.n_units,
                                                   n_overlap=max_overlap)

        return self.partial_fit(X, y, restart=True)

    def partial_fit(self, X, y, restart=False):
        assert isinstance(
            X, pandas.DataFrame), 'only pandas.DataFrames are accepted'
        assert numpy.in1d(y, self.classes_).all()
        y = numpy.searchsorted(self.classes_, y)

        assert len(X) == len(y)

        self.loss.fit(X, y, sample_weight=numpy.ones_like(y))
        X_cat = self.decompose_data(X, fit=False)

        unit_signs = self.unit_signs
        self.losses = []

        for iteration in range(self.iterations):
            if iteration % 1 == 0:
                biases, representations, representations_sq = self.compute_representations(
                    X_cat)
                new_predictions = self.compute_prediction(
                    biases, representations, representations_sq, unit_signs)
                if iteration > 0:
                    assert numpy.allclose(predictions, new_predictions)
                predictions = new_predictions

            for category_biases, category_representations, column, connection in \
                    zip(self.cat_biases, self.cat_representations, X_cat.T, self.connections):

                # fitting biases with exact step
                minlen = len(category_biases)
                grads, hesss = self.compute_grad_hess(predictions)
                total_grads = numpy.bincount(column,
                                             weights=grads,
                                             minlength=minlen)
                total_hesss = numpy.bincount(column,
                                             weights=hesss,
                                             minlength=minlen)
                updates = (total_grads -
                           self.bias_regularization * category_biases) / (
                               total_hesss + self.bias_regularization)
                category_biases[:] += updates
                biases += updates[column]
                predictions += updates[column]

                for unit in numpy.arange(self.n_units):
                    unit_sign = unit_signs[unit]
                    if unit_sign == 0 or connection[unit] == 0:
                        continue
                    grads, hesss = self.compute_grad_hess(predictions)
                    predictions -= unit_sign * representations[:, unit]**2
                    predictions += unit_sign * category_representations[
                        column, unit]**2
                    representations[:, unit] -= category_representations[column, unit]

                    total_grads = numpy.bincount(column,
                                                 weights=(2 * unit_sign) *
                                                 representations[:, unit] *
                                                 grads,
                                                 minlength=minlen)
                    total_hesss = numpy.bincount(
                        column,
                        weights=4 * representations[:, unit]**2 * hesss,
                        minlength=minlen)
                    nominator = (total_grads -
                                 self.regularization * category_representations[:, unit])
                    denominator = total_hesss + self.regularization

                    # TODO iterative update here with penalty for is_seq
                    unit_update = self.learning_rate * nominator / denominator
                    category_representations[:, unit] += unit_update
                    category_representations[:, unit] = numpy.clip(
                        category_representations[:, unit], -1, 1)

                    representations[:, unit] += category_representations[column, unit]
                    predictions += unit_sign * representations[:, unit]**2
                    predictions -= unit_sign * category_representations[
                        column, unit]**2

                self.losses.append(self.loss(predictions))
            print(iteration, self.losses[-1])
        return self

    def compute_prediction(self, biases, representations, representations_sq,
                           unit_signs):
        return (biases + (representations ** 2).dot(unit_signs)
                - representations_sq.dot(unit_signs))

    def compute_representations(self, X_cat):
        biases = numpy.zeros(len(X_cat), dtype='float')
        representations = numpy.zeros([len(X_cat), self.n_units],
                                      dtype='float')
        representations_sq = numpy.zeros([len(X_cat), self.n_units],
                                         dtype='float')
        for cat_biases, cat_representations, column, connection in \
                zip(self.cat_biases, self.cat_representations, X_cat.T, self.connections):
            biases += cat_biases[column]
            representations += cat_representations[column] * connection[None, :]
            representations_sq += (cat_representations ** 2)[column] * connection[None, :]
        return biases, representations, representations_sq

    def decision_function(self, X):
        X_cat = self.decompose_data(X, fit=False)
        biases, representations, representations_sq = self.compute_representations(
            X_cat)
        return self.compute_prediction(biases, representations,
                                       representations_sq, self.unit_signs)

    def predict_proba(self, X):
        result = numpy.zeros([len(X), 2])
        result[:, 1] = scipy.special.expit(self.decision_function(X))
        result[:, 0] = 1 - result[:, 1]
        return result

    def predict(self, X):
        return numpy.argmax(self.predict_proba(X), axis=1)
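
A hypothetical end-to-end usage of StallsFM might look like the sketch below; it assumes the generate_connections helper and the imports used above (numpy, pandas, scipy.special, sklearn's BaseEstimator/ClassifierMixin and hep_ml's LogLossFunction) are available in the surrounding module.

import numpy
import pandas

rng = numpy.random.RandomState(0)
X = pandas.DataFrame({'f1': rng.normal(size=300),
                      'f2': rng.normal(size=300)})
y = (X['f1'] + 0.5 * rng.normal(size=300) > 0).astype(int)

fm = StallsFM(n_units=4, iterations=3, regularization=10.)
fm.fit(X, y)
proba = fm.predict_proba(X)                 # shape (300, 2): P(class 0), P(class 1)
labels = fm.predict(X)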