Example No. 1
    def _check_params(self):
        if self.loss is None:
            self.loss = AdaLossFunction()
        # Losses from sklearn are not allowed
        assert isinstance(self.loss, AbstractLossFunction), \
            'LossFunction should be derived from AbstractLossFunction'
        assert self.n_estimators > 0, 'n_estimators should be positive'
        self.random_state = check_random_state(self.random_state)
        assert 0 < self.subsample <= 1.0, 'subsample should be in the interval (0, 1]'
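
A minimal sketch of how this validation plays out when constructing the classifier used in the later examples; the import paths below are assumptions and may differ between releases of the library:

# assumed import paths; adjust to the actual package layout
from hep_ml.losses import AdaLossFunction
from hep_ml.gradientboosting import UGradientBoostingClassifier

# loss=None is allowed: _check_params substitutes AdaLossFunction() at fit time
clf = UGradientBoostingClassifier(loss=None, n_estimators=10, subsample=0.5)

# invalid hyperparameters are caught by the assertions in _check_params
bad = UGradientBoostingClassifier(n_estimators=0)
# bad.fit(X, y)  # would raise AssertionError: 'n_estimators should be positive'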
Example No. 2
def test_gradient_boosting(n_samples=1000):
    """
    Test that gradient boosting works with different loss functions.
    """
    # Generate samples correlated with the first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get a uniform distribution along this variable
    uniform_features = ['column0']

    loss1 = LogLossFunction()
    loss2 = AdaLossFunction()
    loss3 = losses.CompositeLossFunction()
    loss4 = losses.KnnAdaLossFunction(uniform_features=uniform_features,
                                      uniform_label=1)
    loss5 = losses.KnnAdaLossFunction(uniform_features=uniform_features,
                                      uniform_label=[0, 1])
    loss6bin = losses.BinFlatnessLossFunction(uniform_features,
                                              fl_coefficient=2.,
                                              uniform_label=0)
    loss7bin = losses.BinFlatnessLossFunction(uniform_features,
                                              fl_coefficient=2.,
                                              uniform_label=[0, 1])
    loss6knn = losses.KnnFlatnessLossFunction(uniform_features,
                                              fl_coefficient=2.,
                                              uniform_label=1)
    loss7knn = losses.KnnFlatnessLossFunction(uniform_features,
                                              fl_coefficient=2.,
                                              uniform_label=[0, 1])

    for loss in [
            loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn,
            loss7knn
    ]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_features=None) \
            .fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(
            result, loss)

    trainX['fake_request'] = numpy.random.randint(0, 4, size=len(trainX))
    for loss in [
            losses.MSELossFunction(),
            losses.MAELossFunction(),
            losses.RankBoostLossFunction(request_column='fake_request')
    ]:
        print(loss)
        clf = UGradientBoostingRegressor(loss=loss,
                                         max_depth=3,
                                         n_estimators=50,
                                         learning_rate=0.01,
                                         subsample=0.5,
                                         train_features=list(
                                             trainX.columns[1:]))
        clf.fit(trainX, trainY)
        roc_auc = roc_auc_score(testY, clf.predict(testX))
        assert roc_auc >= 0.7, "The quality is too poor: {} with loss: {}".format(
            roc_auc, loss)
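
The helper generate_sample is not shown in these snippets; a hypothetical stand-in with the same call signature (n_samples, n_features, distance) might look like the sketch below, assuming two Gaussian classes whose means differ by `distance` and feature names 'column0', 'column1', ...:

import numpy
import pandas

def generate_sample(n_samples, n_features, distance):
    """Hypothetical stand-in: two Gaussian classes separated by `distance` along every feature."""
    y = numpy.random.randint(0, 2, size=n_samples)
    X = numpy.random.normal(size=(n_samples, n_features)) + distance * y[:, numpy.newaxis]
    columns = ['column{}'.format(i) for i in range(n_features)]
    return pandas.DataFrame(X, columns=columns), y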
Example No. 3
def test_gb_with_ada_and_log(n_samples=1000, n_features=10, distance=0.6):
    """
    Test the two main classification losses.
    Also test copying and cloning.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    for loss in [LogLossFunction(), AdaLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss,
                                          min_samples_split=20,
                                          max_depth=5,
                                          learning_rate=.2,
                                          subsample=0.7,
                                          n_estimators=10,
                                          train_features=None)
        clf.fit(trainX, trainY)
        assert clf.n_features == n_features
        assert len(clf.feature_importances_) == n_features
        # check that predict_proba works
        for p in clf.staged_predict_proba(testX):
            assert p.shape == (n_samples, 2)
        assert numpy.all(p == clf.predict_proba(testX))
        assert roc_auc_score(testY, p[:, 1]) > 0.8, 'quality is too low'
        # check that the classifier can be cloned and deep-copied
        _ = clone(clf)
        clf_copy = copy.deepcopy(clf)
        assert numpy.all(
            clf.predict_proba(trainX) == clf_copy.predict_proba(
                trainX)), 'copied classifier is different'
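
Because staged_predict_proba yields one probability array per boosting stage, the same loop can also be used to pick the number of trees that maximizes a held-out metric; a minimal sketch reusing the variables from this test (numpy and roc_auc_score are assumed to be imported as in the test module):

stage_aucs = [roc_auc_score(testY, p[:, 1]) for p in clf.staged_predict_proba(testX)]
best_stage = int(numpy.argmax(stage_aucs)) + 1  # +1 because the first yielded stage uses one tree
print('best number of trees:', best_stage, 'held-out AUC:', max(stage_aucs))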
Example No. 4
    def _check_params(self):
        if self.loss is None:
            self.loss = AdaLossFunction()
        # Losses from sklearn are not allowed
        assert isinstance(self.loss, AbstractLossFunction), \
            'LossFunction should be derived from AbstractLossFunction'
        assert self.n_estimators > 0, 'n_estimators should be positive'
        self.random_state = check_random_state(self.random_state)
        assert 0 < self.subsample <= 1.0, 'subsample should be in the interval (0, 1]'
Example No. 5
def test_weight_misbalance(n_samples=1000, n_features=10, distance=0.6):
    """
    Test how classifiers behave on datasets that are highly imbalanced in terms of sample weights.
    """
    testX, testY = generate_sample(n_samples, n_features, distance=distance)
    trainX, trainY = generate_sample(n_samples, n_features, distance=distance)
    trainW = trainY * 10000 + 1
    testW = testY * 10000 + 1
    for loss in [LogLossFunction(), AdaLossFunction(), losses.CompositeLossFunction()]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=.2,
                                          subsample=0.7, n_estimators=10, train_features=None)
        clf.fit(trainX, trainY, sample_weight=trainW)
        p = clf.predict_proba(testX)
        assert roc_auc_score(testY, p[:, 1], sample_weight=testW) > 0.8, 'quality is too low'
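
The weighting scheme trainW = trainY * 10000 + 1 gives every class-1 event a weight of 10001 and every class-0 event a weight of 1, so class 1 carries almost all of the effective weight; a quick numeric check:

import numpy

trainY = numpy.array([0, 1, 1, 0])
trainW = trainY * 10000 + 1                       # -> array([    1, 10001, 10001,     1])
fraction = trainW[trainY == 1].sum() / float(trainW.sum())
print(fraction)                                   # ~0.9999, class 1 dominates the weighted sample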
Example No. 6
def test_gradient_boosting(n_samples=1000):
    """
    Test that gradient boosting works with different loss functions.
    """
    # Generate samples correlated with the first variable
    distance = 0.6
    testX, testY = generate_sample(n_samples, 10, distance)
    trainX, trainY = generate_sample(n_samples, 10, distance)
    # We will try to get a uniform distribution along this variable
    uniform_features = ['column0']

    loss1 = LogLossFunction()
    loss2 = AdaLossFunction()
    loss3 = CompositeLossFunction()
    loss4 = KnnAdaLossFunction(uniform_features=uniform_features,
                               uniform_label=1)
    loss5 = KnnAdaLossFunction(uniform_features=uniform_features,
                               uniform_label=[0, 1])
    loss6bin = BinFlatnessLossFunction(uniform_features,
                                       fl_coefficient=2.,
                                       uniform_label=0)
    loss7bin = BinFlatnessLossFunction(uniform_features,
                                       fl_coefficient=2.,
                                       uniform_label=[0, 1])
    loss6knn = KnnFlatnessLossFunction(uniform_features,
                                       fl_coefficient=2.,
                                       uniform_label=1)
    loss7knn = KnnFlatnessLossFunction(uniform_features,
                                       fl_coefficient=2.,
                                       uniform_label=[0, 1])

    for loss in [
            loss1, loss2, loss3, loss4, loss5, loss6bin, loss7bin, loss6knn,
            loss7knn
    ]:
        clf = UGradientBoostingClassifier(loss=loss, min_samples_split=20, max_depth=5, learning_rate=0.2,
                                          subsample=0.7, n_estimators=25, train_features=None) \
            .fit(trainX[:n_samples], trainY[:n_samples])
        result = clf.score(testX, testY)
        assert result >= 0.7, "The quality is too poor: {} with loss: {}".format(
            result, loss)
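
A rough, library-agnostic way to see what the flatness losses are aiming for is to check how strongly the predicted probability depends on the uniform feature; a minimal sketch with numpy only, reusing clf, testX and testY from the last iteration of the loop (a correlation close to zero suggests a flatter response along 'column0'):

proba = clf.predict_proba(testX)[:, 1]
signal = testY == 1
corr = numpy.corrcoef(proba[signal], testX['column0'].values[signal])[0, 1]
print('correlation of prediction with column0 on signal events:', corr)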
Example No. 7
class AbstractGradientBoostingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, loss=None,
                 n_estimators=100,
                 learning_rate=0.1,
                 subsample=1.0,
                 train_variables=None,
                 random_state=None,
                 n_threads=1,
                 dtype=DTYPE):
        """This version of gradient boosting supports only two-class classification and only special losses
        derived from AbstractLossFunction.
        There are some methods that should be overridden in descendants.
        :type loss: AbstractLossFunction, by default AdaLossFunction is used
        """
        self.loss = loss
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.subsample = subsample
        self.train_variables = train_variables
        self.random_state = random_state
        self.initial_prediction = 0.
        self.dtype = dtype
        self.n_threads = n_threads

    def _check_params(self):
        if self.loss is None:
            self.loss = AdaLossFunction()
        # Losses from sklearn are not allowed
        assert isinstance(self.loss, AbstractLossFunction), \
            'LossFunction should be derived from AbstractLossFunction'
        assert self.n_estimators > 0, 'n_estimators should be positive'
        self.random_state = check_random_state(self.random_state)
        assert 0 < self.subsample <= 1.0, 'subsample should be in the interval (0, 1]'

    def _create_estimator(self, stage):
        raise NotImplementedError('Should be overridden in descendants')

    def _fit_estimator(self, estimator, X, y, sample_weight, residual, mask):
        """ mask - which events to use in training """
        # TODO do we need check_input=false for trees?
        estimator.fit(X[mask, :], residual[mask], sample_weight=sample_weight[mask])

    def _update_estimator(self, estimator, X, y, sample_weight, residual, y_pred, mask):
        pass

    def _prepare_data_for_fitting(self, X, y, sample_weight):
        """By default the same format used as for trees """
        X = self.get_train_vars(X)
        X, y = check_arrays(X, y, dtype=self.dtype, sparse_format="dense", check_ccontiguous=True)
        return X, y, sample_weight

    @staticmethod
    def _initial_data_check(X, y, sample_weight):
        sample_weight = check_sample_weight(y, sample_weight=sample_weight)
        assert len(X) == len(y), 'Different lengths of X and y'
        X = pandas.DataFrame(X)
        y = numpy.array(column_or_1d(y), dtype=int)
        assert numpy.all(numpy.in1d(y, [0, 1])), 'Only two-class classification supported'
        return X, y, sample_weight

    def _prepare_initial_predictions(self, X, y, sample_weight):
        self.initial_prediction = logit(numpy.average(y, weights=sample_weight))

    def _compute_initial_predictions(self, X):
        return numpy.zeros(len(X), dtype='float') + self.initial_prediction

    def _generate_mask(self, length, subsample):
        if subsample == 1.0:
            return slice(None, None, None)
        else:
            n_sampled_events = int(subsample * length)
            return self.random_state.choice(length, n_sampled_events, replace=True)

    def fit(self, X, y, sample_weight=None):
        X, y, sample_weight = self._initial_data_check(X, y, sample_weight)
        self._check_params()

        loss_weight = numpy.ones(len(sample_weight))
        tree_weight = sample_weight

        # intentionally disabled: would swap which weights go to the loss and which to the trees
        if False:
            loss_weight, tree_weight = tree_weight, loss_weight


        self.loss = copy.copy(self.loss)
        self.loss.fit(X, y, sample_weight=loss_weight)

        X, y, sample_weight = self._prepare_data_for_fitting(X, y, sample_weight)

        self._prepare_initial_predictions(X, y, sample_weight)
        y_pred = self._compute_initial_predictions(X)
        self.estimators = []
        self.scores = []

        # pool = ThreadPool(processes=self.n_threads)

        lock = Lock()
        train_params = [self, X, y, tree_weight, y_pred, lock]
        # TODO use threading
        # pool.map(_train_one_classifier, [train_params] * self.n_estimators, chunksize=1)
        # NOTE: wrap in list() so the map is actually evaluated (map is lazy in Python 3)
        list(map(_train_one_classifier, [train_params] * self.n_estimators))

        return self

    def get_train_vars(self, X):
        if self.train_variables is None:
            return numpy.array(X)
        else:
            return numpy.array(X.loc[:, self.train_variables])

    @staticmethod
    def score_to_proba(score):
        result = numpy.zeros([len(score), 2], dtype=float)
        result[:, 1] = sigmoid_function(score, width=1.)
        result[:, 0] = 1. - result[:, 1]
        return result

    def staged_predict_score(self, X):
        X = self.get_train_vars(X)
        y_pred = self._compute_initial_predictions(X)
        for estimator in self.estimators:
            y_pred += self.learning_rate * estimator.predict(X)
            yield y_pred

    def predict_score(self, X):
        result = None
        for score in self.staged_predict_score(X):
            result = score
        return result

    def staged_predict_proba(self, X):
        for score in self.staged_predict_score(X):
            yield self.score_to_proba(score)

    def predict_proba(self, X):
        return self.score_to_proba(self.predict_score(X))

    def predict(self, X):
        return numpy.argmax(self.predict_proba(X), axis=1)
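
A minimal sketch of a concrete descendant that fills in _create_estimator, the only abstract piece above, with a plain sklearn regression tree; the class name and the max_depth handling are illustrative and not part of the original code:

from sklearn.tree import DecisionTreeRegressor

class SimpleGradientBoostingClassifier(AbstractGradientBoostingClassifier):
    def __init__(self, max_depth=3, **kwargs):
        self.max_depth = max_depth
        super(SimpleGradientBoostingClassifier, self).__init__(**kwargs)

    def _create_estimator(self, stage):
        # one regression tree per boosting stage, fitted to the residuals of the loss
        return DecisionTreeRegressor(max_depth=self.max_depth,
                                     random_state=self.random_state)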
Example No. 8
class AbstractGradientBoostingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 loss=None,
                 n_estimators=100,
                 learning_rate=0.1,
                 subsample=1.0,
                 train_variables=None,
                 random_state=None,
                 n_threads=1,
                 dtype=DTYPE):
        """This version of gradient boosting supports only two-class classification and only special losses
        derived from AbstractLossFunction.
        There are some methods that should be overridden in descendants.
        :type loss: AbstractLossFunction, by default AdaLossFunction is used
        """
        self.loss = loss
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.subsample = subsample
        self.train_variables = train_variables
        self.random_state = random_state
        self.initial_prediction = 0.
        self.dtype = dtype
        self.n_threads = n_threads

    def _check_params(self):
        if self.loss is None:
            self.loss = AdaLossFunction()
        # Losses from sklearn are not allowed
        assert isinstance(self.loss, AbstractLossFunction), \
            'LossFunction should be derived from AbstractLossFunction'
        assert self.n_estimators > 0, 'n_estimators should be positive'
        self.random_state = check_random_state(self.random_state)
        assert 0 < self.subsample <= 1.0, 'subsample should be in the interval (0, 1]'

    def _create_estimator(self, stage):
        raise NotImplementedError('Should be overridden in descendants')

    def _fit_estimator(self, estimator, X, y, sample_weight, residual, mask):
        """ mask - which events to use in training """
        # TODO do we need check_input=false for trees?
        estimator.fit(X[mask, :],
                      residual[mask],
                      sample_weight=sample_weight[mask])

    def _update_estimator(self, estimator, X, y, sample_weight, residual,
                          y_pred, mask):
        pass

    def _prepare_data_for_fitting(self, X, y, sample_weight):
        """By default the same format used as for trees """
        X = self.get_train_vars(X)
        X, y = check_arrays(X, y)
        X = X.astype(self.dtype)
        return X, y, sample_weight

    @staticmethod
    def _initial_data_check(X, y, sample_weight):
        sample_weight = check_sample_weight(y, sample_weight=sample_weight)
        assert len(X) == len(y), 'Different lengths of X and y'
        X = pandas.DataFrame(X)
        y = numpy.array(column_or_1d(y), dtype=int)
        assert numpy.all(numpy.in1d(
            y, [0, 1])), 'Only two-class classification supported'
        return X, y, sample_weight

    def _prepare_initial_predictions(self, X, y, sample_weight):
        self.initial_prediction = logit(numpy.average(y,
                                                      weights=sample_weight))

    def _compute_initial_predictions(self, X):
        return numpy.zeros(len(X), dtype='float') + self.initial_prediction

    def _generate_mask(self, length, subsample):
        if subsample == 1.0:
            return slice(None, None, None)
        else:
            n_sampled_events = int(subsample * length)
            return self.random_state.choice(length,
                                            n_sampled_events,
                                            replace=True)

    def fit(self, X, y, sample_weight=None):
        X, y, sample_weight = self._initial_data_check(X, y, sample_weight)
        self._check_params()

        loss_weight = numpy.ones(len(sample_weight))
        tree_weight = sample_weight

        # intentionally disabled: would swap which weights go to the loss and which to the trees
        if False:
            loss_weight, tree_weight = tree_weight, loss_weight

        self.loss = copy.copy(self.loss)
        self.loss.fit(X, y, sample_weight=loss_weight)

        X, y, sample_weight = self._prepare_data_for_fitting(
            X, y, sample_weight)

        self._prepare_initial_predictions(X, y, sample_weight)
        y_pred = self._compute_initial_predictions(X)
        self.estimators = []
        self.scores = []

        # pool = ThreadPool(processes=self.n_threads)

        lock = Lock()
        train_params = [self, X, y, tree_weight, y_pred, lock]
        # TODO use threading
        # pool.map(_train_one_classifier, [train_params] * self.n_estimators, chunksize=1)
        # NOTE: wrap in list() so the map is actually evaluated (map is lazy in Python 3)
        list(map(_train_one_classifier, [train_params] * self.n_estimators))

        return self

    def get_train_vars(self, X):
        if self.train_variables is None:
            return numpy.array(X)
        else:
            return numpy.array(X.loc[:, self.train_variables])

    @staticmethod
    def score_to_proba(score):
        result = numpy.zeros([len(score), 2], dtype=float)
        result[:, 1] = sigmoid_function(score, width=1.)
        result[:, 0] = 1. - result[:, 1]
        return result

    def staged_predict_score(self, X):
        X = self.get_train_vars(X)
        y_pred = self._compute_initial_predictions(X)
        for estimator in self.estimators:
            y_pred += self.learning_rate * estimator.predict(X)
            yield y_pred

    def predict_score(self, X):
        result = None
        for score in self.staged_predict_score(X):
            result = score
        return result

    def staged_predict_proba(self, X):
        for score in self.staged_predict_score(X):
            yield self.score_to_proba(score)

    def predict_proba(self, X):
        return self.score_to_proba(self.predict_score(X))

    def predict(self, X):
        return numpy.argmax(self.predict_proba(X), axis=1)
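
The conversion from raw boosting score to class probabilities in score_to_proba is just a logistic sigmoid; a small numeric check, assuming sigmoid_function(score, width=1.) behaves like the standard 1 / (1 + exp(-score)):

import numpy

score = numpy.array([-2., 0., 2.])
p1 = 1. / (1. + numpy.exp(-score))            # assumed behaviour of sigmoid_function with width=1.
proba = numpy.column_stack([1. - p1, p1])     # same layout as score_to_proba: column 0 = class 0, column 1 = class 1
print(proba)
print(numpy.argmax(proba, axis=1))            # this is what predict() returns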