Example #1
def main():
    # An example use of 'build_tree' and 'predict'
    df_train = clean('horseTrain.txt')
    attributes = [
        'K', 'Na', 'CL', 'HCO', 'Endotoxin', 'Anioingap', 'PLA2', 'SDH',
        'GLDH', 'TPP', 'Breath rate', 'PCV', 'Pulse rate', 'Fibrinogen',
        'Dimer', 'FibPerDim'
    ]
    dec_tree = Tree(df_train, attributes, 'Outcome')
    print("Building the tree...")
    time1 = time.time()
    dec_tree.build_tree()
    time2 = time.time()
    print("Time spent to build the tree %.2f seconds" % (time2 - time1))
    print("Finish building the tree...")
    time.sleep(1)

    print("Shape of Tree ...")
    dec_tree.print_tree()
    time.sleep(1)

    print("Accuracy of test data ... ")
    df_test = clean('horseTest.txt')
    print(dec_tree.predict(df_test))
    print(df_test['Outcome'])
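A note on scoring: the example prints the predictions and the true labels side by side but never compares them. A short, hypothetical addition at the end of main() would turn that output into an accuracy number (an assumption on my part: predict returns one label per row of df_test, in row order):

    # Hypothetical addition at the end of main(); assumes dec_tree.predict
    # returns one label per row of df_test, in row order.
    import numpy as np
    preds = np.asarray(dec_tree.predict(df_test))
    print("Test accuracy: %.3f" % (preds == df_test['Outcome'].values).mean())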
Example #2
    def _init_estimator(self):
        index = list(np.random.choice(self.N, self.n_samples, p=self.weights))

        # weighted bootstrap over the rows (feature dimensions are unchanged)
        x_sub = np.array([self.x[inx, :] for inx in index])
        y_sub = np.array([self.y[inx] for inx in index])

        while True:
            tree = Tree(x_sub, y_sub)
            y_pred = tree.predict(self.x)
            accuracy = accuracy_score(self.y, y_pred)
            if accuracy != 0.5:
                self.estimators.append(tree)
                break
        return tree, y_pred
Example #3
    def init_estimator(self):
        indices = list(np.random.choice(self.X.shape[0], self.n_samples, p=self.weights))
        X_tree = np.array([self.X[i, :] for i in indices])
        y_tree = np.array([self.y[i] for i in indices])

        print "%s / %s" % (self.count, self.n_estimators)

        while True:
            t1 = time.time()
            tree = Tree(X_tree, y_tree)
            t2 = time.time()

            print "tree generation time: %s" % (t2 - t1)

            predictions = tree.predict(self.X)
            accuracy = accuracy_score(self.y, predictions)
            print "accuracy: %s" % accuracy
            if accuracy != 0.50:
                self.estimators.append(tree)
                break

        return tree, predictions
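Both initializers above resample and retrain whenever the tree's training accuracy lands exactly at 0.5. That guard is consistent with AdaBoost-style boosting, where a learner at 50% error receives zero weight (and the weight formula is undefined at 0% or 100% error). A self-contained sketch of that textbook weight computation, not taken from either repo:

import numpy as np

def estimator_weight(error, eps=1e-10):
    # alpha = 0.5 * ln((1 - err) / err); clip to avoid log(0) at the extremes
    error = np.clip(error, eps, 1 - eps)
    return 0.5 * np.log((1 - error) / error)

print(estimator_weight(0.3))  # positive: better than random, gets weight
print(estimator_weight(0.5))  # 0.0: a coin-flip tree contributes nothing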
Example #4
    def fit(self,
            train_data,
            validation_data,
            early_stopping_rounds=np.inf,
            eval_metric=None,
            loss="logisticloss",
            eta=0.3,
            num_round=1000,
            max_depth=6,
            pool_size=1,
            min_instances_byleaf=1,
            scale_pos_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            min_child_weight=1,
            reg_lambda=1.0,
            gamma=0):
        """
        :param train_data: Data object, train data
        :param validation_data: Data object, validation data
        :param eta: learning rate
        :param num_round: number of boosting round
        :param max_depth: max depth of each tree
        :param pool_size: the num of processes
        :param subsample: row sample rate when building a tree
        :param colsample_bytree: column sample rate when building a tree
        :param min_instances_byleaf: min number of samples in a leaf node
        :param loss: loss object
                     logisticloss,squareloss
        :param reg_lambda: lambda
        :param gamma: gamma
        :param eval_metric: evaluation metric, provided: "accuracy"
        """
        self.eta = eta
        self.num_round = num_round
        self.max_depth = max_depth
        self.pool_size = pool_size
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.reg_lambda = reg_lambda
        self.gamma = gamma
        self.min_instances_byleaf = min_instances_byleaf
        self.eval_metric = eval_metric
        self.min_child_weight = min_child_weight
        self.scale_pos_weight = scale_pos_weight
        self.first_round_pred = 0.0

        # initialize the loss function
        if loss == "logisticloss":
            self.loss = LogisticLoss(self.reg_lambda)
        elif loss == "squareloss":
            self.loss = SquareLoss(self.reg_lambda)
            self.first_round_pred = train_data.getLabelMean()
        else:
            raise NotImplementedError(
                "loss should be 'logisticloss' or 'squareloss'")

        # to evaluate on validation set and conduct early stopping
        do_validation = True
        valData = validation_data.getData()
        if not valData:
            raise ValueError("validation_data is empty !")

        valIdxList = []  # save a fixed iteration order
        valLabels = []
        for idx in valData:
            # initialize with the first-round prediction (derived from train data)
            valData[idx]['yPred'] = self.first_round_pred
            valIdxList.append(idx)
            valLabels.append(valData[idx]['label'])

        best_val_metric = np.inf
        best_round = 0
        become_worse_round = 0

        data = train_data.getData()
        if not data:
            raise ValueError("train_data is empty !")
        idxList = []  # save a fixed iteration order
        labels = []
        for idx in data:
            data[idx]['yPred'] = self.first_round_pred
            data[idx]['grad'] = self.loss.grad(data[idx]['yPred'],
                                               data[idx]['label'])
            data[idx]['hess'] = self.loss.hess(data[idx]['yPred'],
                                               data[idx]['label'])
            # every sample needs a weight; positives get scale_pos_weight
            data[idx]['weight'] = (self.scale_pos_weight
                                   if data[idx]['label'] == 1.0 else 1.0)
            idxList.append(idx)
            labels.append(data[idx]['label'])
        labels = np.array(labels)
        for i in range(self.num_round):
            # weighted grad and hess
            for idx in data:
                data[idx]['grad'] = data[idx]['grad'] * data[idx]['weight']
                data[idx]['hess'] = data[idx]['hess'] * data[idx]['weight']

            # row and column sample before training the current tree
            factors = train_data.getFactors()
            factorTypes = train_data.getFeatureTypes()
            sampledFactors = random.sample(
                factors, int(len(factors) * self.colsample_bytree))
            sampledData = {}
            for idx in random.sample(idxList,
                                     int(len(idxList) * self.subsample)):
                sampledData.update({idx: data[idx]})

            # train current tree
            tree = Tree()
            tree.fit(sampledData,
                     sampledFactors,
                     factorTypes,
                     max_depth=self.max_depth,
                     pool_size=self.pool_size,
                     min_child_weight=self.min_child_weight,
                     min_instances_byleaf=self.min_instances_byleaf,
                     reg_lambda=self.reg_lambda,
                     gamma=self.gamma)

            # predict the sampled rows and update their yPred, grad, hess
            preds = tree.predict(sampledData)
            for idx in sampledData:
                data[idx]['yPred'] += self.eta * preds[idx]
                data[idx]['grad'] = self.loss.grad(data[idx]["yPred"],
                                                   data[idx]["label"])
                data[idx]['hess'] = self.loss.hess(data[idx]["yPred"],
                                                   data[idx]["label"])

            # update feature importance
            for k in tree.feature_importance.iterkeys():
                self.feature_importance[k] += tree.feature_importance[k]

            self.trees.append(tree)

            # print training information
            if self.eval_metric is None:
                print "Apollo round {iteration}".format(iteration=i)
            else:
                try:
                    metric_func = get_metric(self.eval_metric)
                except Exception:
                    raise NotImplementedError(
                        "The given eval_metric is not provided")

                curPreds = np.array([data[idx]["yPred"] for idx in idxList])
                train_metric = metric_func(self.loss.transform(curPreds),
                                           labels)

                if not do_validation:
                    print "Apollo round {iteration}, train-{eval_metric} is {train_metric}".format(
                        iteration=i,
                        eval_metric=self.eval_metric,
                        train_metric=train_metric)
                else:
                    valPreds = tree.predict(valData)
                    for idx in valData:
                        valData[idx]['yPred'] += self.eta * valPreds[idx]
                    curValPreds = [valData[idx]['yPred'] for idx in valIdxList]
                    assert len(curValPreds) == len(valLabels)
                    val_metric = metric_func(
                        self.loss.transform(np.array(curValPreds)),
                        np.array(valLabels))
                    print "Apollo round {iteration}, train-{eval_metric} is {train_metric}, val-{eval_metric} is {val_metric}".format(
                        iteration=i,
                        eval_metric=self.eval_metric,
                        train_metric=train_metric,
                        val_metric=val_metric)

                    # check if to early stop
                    if val_metric < best_val_metric:
                        best_val_metric = val_metric
                        best_round = i
                        become_worse_round = 0
                    else:
                        become_worse_round += 1
                    if become_worse_round > early_stopping_rounds:
                        print "Apollo training Stop, best round is {best_round}, best val-{eval_metric} is {best_val_metric}".format(
                            best_round=best_round,
                            eval_metric=eval_metric,
                            best_val_metric=best_val_metric)
                        break
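Example #4 only consumes three methods of its loss object: grad, hess, and transform. A minimal self-contained LogisticLoss with that interface, sketched to be consistent with the calls above (an assumption, not the repo's code; the real class may use reg_lambda elsewhere, e.g. in leaf-weight computation):

import numpy as np

class LogisticLoss(object):
    """Sketch of the loss interface used above: grad/hess of the logistic
    loss w.r.t. the raw score, plus a transform from scores to probabilities."""

    def __init__(self, reg_lambda=1.0):
        self.reg_lambda = reg_lambda  # kept for interface parity; unused here

    def transform(self, pred):
        return 1.0 / (1.0 + np.exp(-np.asarray(pred, dtype=float)))

    def grad(self, pred, label):
        return self.transform(pred) - np.asarray(label, dtype=float)

    def hess(self, pred, label):
        p = self.transform(pred)
        return p * (1.0 - p)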
Example #5
    def fit(self,
            X,
            y,
            eta=0.01,
            num_boost_round=1000,
            max_depth=5,
            rowsample=0.8,
            colsample_bytree=0.8,
            colsample_bylevel=0.8,
            min_sample_split=10,
            loss="logisticloss",
            l2_regularization=1.0,
            gamma=0.1,
            num_thread=-1,
            eval_metric=None):
        """
        :param X: pandas.core.frame.DataFrame
        :param y: pandas.core.series.Series
        :param eta: learning rate
        :param num_boost_round: number of boosting round
        :param max_depth: max depth of each tree
        :param rowsample: row sample rate when building a tree
        :param colsample_bytree: column sample rate when building a tree
        :param colsample_bylevel: column sample rate when splitting each tree node;
                                  the number of features used = total_features * colsample_bytree * colsample_bylevel
        :param min_sample_split: min number of samples in a leaf node
        :param loss: loss to optimize: "logisticloss", "squareloss", or a custom loss object
        :param l2_regularization: L2 regularization term (lambda)
        :param gamma: minimum loss reduction required to make a further split (gamma)
        :param num_thread: number of threads for parallel training
        :param eval_metric: evaluation metric, provided: "accuracy"
        """
        self.eta = eta
        self.num_boost_round = num_boost_round
        self.max_depth = max_depth
        self.rowsample = rowsample
        self.colsample_bytree = colsample_bytree
        self.colsample_bylevel = colsample_bylevel
        self.l2_regularization = l2_regularization
        self.gamma = gamma
        self.min_sample_split = min_sample_split
        self.num_thread = num_thread
        self.eval_metric = eval_metric

        if loss == "logisticloss":
            self.loss = LogisticLoss(l2_regularization)
        elif loss == "squareloss":
            self.loss = SquareLoss(l2_regularization)
        else:
            try:
                self.loss = CustomizeLoss(loss, l2_regularization)
            except Exception:
                raise NotImplementedError(
                    "loss should be 'logisticloss', 'squareloss', or a custom loss object"
                )

        self.first_round_pred = y.mean()

        # Y stores label, y_pred, grad, hess
        Y = pd.DataFrame(y.values,
                         columns=['label'])  # only one column "label"
        Y['y_pred'] = self.first_round_pred
        Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
        Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

        for i in range(self.num_boost_round):
            # sample samples and features to train current tree
            data = X.sample(frac=self.colsample_bytree, axis=1)
            data = pd.concat([data, Y], axis=1)
            data = data.sample(frac=self.rowsample, axis=0)
            Y_selected = data[['label', 'y_pred', 'grad', 'hess']]
            X_selected = data.drop(['label', 'y_pred', 'grad', 'hess'], axis=1)

            # train current tree
            tree = Tree()
            tree.fit(X_selected,
                     Y_selected,
                     max_depth=self.max_depth,
                     colsample_bylevel=self.colsample_bylevel,
                     min_sample_split=self.min_sample_split,
                     l2_regularization=self.l2_regularization,
                     gamma=self.gamma,
                     num_thread=self.num_thread)

            # predict the whole dataset and update y_pred,grad,hess
            preds = tree.predict(X)
            Y['y_pred'] += self.eta * preds
            Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
            Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

            if self.eval_metric is not None:
                try:
                    metric_func = get_metric(self.eval_metric)
                except Exception:
                    raise NotImplementedError(
                        "The given eval_metric is not provided")
                metric_value = metric_func(
                    self.loss.transform(Y.y_pred.values), Y.label.values)
                print "TGBoost round {iteration}, {eval_metric} is {metric_value}".format(
                    iteration=i,
                    eval_metric=self.eval_metric,
                    metric_value=metric_value)
            else:
                print "TGBoost round {iteration}"

            # update feature importance
            for k in tree.feature_importance.iterkeys():
                self.feature_importance[k] += tree.feature_importance[k]

            self.trees.append(tree)
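The per-round sampling above is plain pandas: sample(frac=..., axis=1) keeps a fraction of columns, sample(frac=..., axis=0) a fraction of rows. A toy, self-contained demo of the same pattern (my own variable names):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(10, 5)), columns=list("abcde"))

cols = X.sample(frac=0.8, axis=1)     # colsample_bytree: keep ~80% of columns
rows = cols.sample(frac=0.8, axis=0)  # rowsample: keep ~80% of those rows
print(rows.shape)                     # (8, 4) for these fractions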
Example #6
    def fit(self,
            features,
            label,
            validation_data=(None, None),
            early_stopping_rounds=np.inf,
            maximize=True,
            eval_metric=None,
            loss="logisticloss",
            eta=0.3,
            num_boost_round=1000,
            max_depth=6,
            scale_pos_weight=1,
            subsample=0.8,
            colsample=0.8,
            min_child_weight=1,
            min_sample_split=10,
            reg_lambda=1.0,
            gamma=0,
            num_thread=-1):
        """
        :param features: np.array
        :param label: np.array
        :param eta: learning rate
        :param num_boost_round: number of boosting round
        :param max_depth: max depth of each tree
        :param subsample: row sample rate when building a tree
        :param colsample: column sample rate when building a tree
        :param min_sample_split: min number of samples in a leaf node
        :param loss: loss to optimize: "logisticloss", "squareloss", or a custom loss object
        :param reg_lambda: L2 regularization term (lambda)
        :param gamma: minimum loss reduction required to make a further split (gamma)
        :param num_thread: number of threads for parallel training
        :param eval_metric: evaluation metric, provided: "accuracy"
        """
        self.eta = eta
        self.num_boost_round = num_boost_round
        self.max_depth = max_depth
        self.subsample = subsample
        self.colsample = colsample
        self.reg_lambda = reg_lambda
        self.gamma = gamma
        self.min_sample_split = min_sample_split
        self.num_thread = num_thread
        self.eval_metric = eval_metric
        self.min_child_weight = min_child_weight
        self.scale_pos_weight = scale_pos_weight
        self.first_round_pred = 0

        # initialize the loss function
        if loss == "logisticloss":
            self.loss = LogisticLoss()
        elif loss == "squareloss":
            self.loss = SquareLoss()
            self.first_round_pred = label.mean()
        else:
            try:
                self.loss = CustomizeLoss(loss)
            except Exception:
                raise NotImplementedError(
                    "loss should be 'logisticloss', 'squareloss', or a custom loss object"
                )

        # initialize row_sampler, col_sampler, bin_structure, attribute_list, class_list
        row_sampler = RowSampler(features.shape[0], self.subsample)
        col_sampler = ColumnSampler(features.shape[1], self.colsample)
        bin_structure = BinStructure(features)
        attribute_list = AttributeList(features, bin_structure)
        class_list = ClassList(label)
        class_list.initialize_pred(self.first_round_pred)
        class_list.update_grad_hess(self.loss)

        # to evaluate on validation set and conduct early stopping
        # we should get (val_features,val_label)
        # and set some variable to check when to stop
        do_validation = True
        if not isinstance(validation_data, tuple):
            raise TypeError(
                "validation_data should be (val_features, val_label)")

        val_features, val_label = validation_data
        val_pred = None
        if val_features is None or val_label is None:
            do_validation = False
        else:
            val_pred = np.ones(val_label.shape) * self.first_round_pred

        best_val_metric = -np.inf if maximize else np.inf
        best_round = 0
        become_worse_round = 0

        # start learning
        logging.info("TGBoost start training")
        for i in range(self.num_boost_round):
            t0 = time()
            # train current tree
            tree = Tree(self.min_sample_split, self.min_child_weight,
                        self.max_depth, self.colsample, self.subsample,
                        self.reg_lambda, self.gamma, self.num_thread)
            tree.fit(attribute_list, class_list, row_sampler, col_sampler,
                     bin_structure)

            # when finish building this tree, update the class_list.pred, grad, hess
            class_list.update_pred(self.eta)
            class_list.update_grad_hess(self.loss)
            # save this tree
            self.trees.append(tree)

            t1 = time()

            # print training information
            if self.eval_metric is None:
                logging.info("TGBoost round {iteration}".format(iteration=i))
            else:
                try:
                    metric_func = get_metric(self.eval_metric)
                except Exception:
                    raise NotImplementedError(
                        "The given eval_metric is not provided")

                train_metric = metric_func(
                    self.loss.transform(class_list.pred), label)

                if not do_validation:
                    logging.info(
                        "TGBoost round {iteration}, train-{eval_metric}: {train_metric:.4f}, exec time {tc:.3f}s"
                        .format(iteration=i,
                                eval_metric=self.eval_metric,
                                train_metric=train_metric,
                                tc=t1 - t0))
                else:
                    val_pred += self.eta * tree.predict(val_features)
                    val_metric = metric_func(self.loss.transform(val_pred),
                                             val_label)
                    logging.info(
                        "TGBoost round {iteration}, train-{eval_metric}: {train_metric:.4f}, val-{eval_metric}: {val_metric:.4f}, exec time {tc:.3f}s"
                        .format(iteration=i,
                                eval_metric=self.eval_metric,
                                train_metric=train_metric,
                                val_metric=val_metric,
                                tc=t1 - t0))

                    # check whether to early stop
                    if maximize:
                        if val_metric > best_val_metric:
                            best_val_metric = val_metric
                            best_round = i
                            become_worse_round = 0
                        else:
                            become_worse_round += 1
                        if become_worse_round > early_stopping_rounds:
                            logging.info(
                                "TGBoost training stopped, best round is {best_round}, best val-{eval_metric} is {best_val_metric:.4f}"
                                .format(best_round=best_round,
                                        eval_metric=eval_metric,
                                        best_val_metric=best_val_metric))
                            break
                    else:
                        if val_metric < best_val_metric:
                            best_val_metric = val_metric
                            best_round = i
                            become_worse_round = 0
                        else:
                            become_worse_round += 1
                        if become_worse_round > early_stopping_rounds:
                            logging.info(
                                "TGBoost training stopped, best round is {best_round}, best val-{eval_metric} is {best_val_metric:.4f}"
                                .format(best_round=best_round,
                                        eval_metric=eval_metric,
                                        best_val_metric=best_val_metric))
                            break
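The maximize and minimize early-stopping branches above are mirror images of each other. A small self-contained tracker that folds them into one code path (my own sketch, not this repo's API); the loop body then reduces to "if stopper.should_stop(val_metric, i): break":

import numpy as np

class EarlyStopper(object):
    def __init__(self, rounds, maximize=True):
        self.rounds = rounds
        self.sign = 1.0 if maximize else -1.0  # flip sign so bigger is always better
        self.best_signed = -np.inf             # multiply by self.sign to report the raw value
        self.best_round = 0
        self.worse_rounds = 0

    def should_stop(self, metric, round_idx):
        # return True once the metric has not improved for `rounds` rounds
        if self.sign * metric > self.best_signed:
            self.best_signed = self.sign * metric
            self.best_round = round_idx
            self.worse_rounds = 0
        else:
            self.worse_rounds += 1
        return self.worse_rounds > self.rounds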
Example #7
    def fit(self, X, y, eval_metric=None, early_stopping_rounds=None):
        self.trees = []
        self.feature_importances_ = {}
        self.eval_metric = _EVAL_METRIC[eval_metric] if eval_metric else None

        X.reset_index(drop=True, inplace=True)
        y.reset_index(drop=True, inplace=True)

        # Y stores: label, y_pred, grad, hess, sample_weight
        Y = pd.DataFrame(y.values, columns=[LABEL_COLUMN])
        Y['y_pred'] = self.first_round_pred
        Y[GRAD_COLUMN], Y[HESS_COLUMN] = self.loss.compute_grad_hess(
            Y.y_pred.values, Y.label.values)

        if self._is_classifier:
            Y['sample_weight'] = 1.0
            Y.loc[Y.label == 1, 'sample_weight'] = self.scale_pos_weight

        if self.eval_metric is not None and early_stopping_rounds is not None:
            assert early_stopping_rounds > 0
            best_val_score = -np.inf
            score_worse_round = 0
            best_round = 0

        for idx in xrange(self.n_estimators):
            if self._is_classifier:
                Y[GRAD_COLUMN] = Y[GRAD_COLUMN] * Y.sample_weight
                Y[HESS_COLUMN] = Y[HESS_COLUMN] * Y.sample_weight

            # subsample column and row before training the current tree
            X_sample_column = X.sample(frac=self.colsample_bytree, axis=1)
            data = pd.concat([X_sample_column, Y], axis=1)
            data = data.sample(frac=self.subsample, axis=0)

            X_feed = data[X_sample_column.columns]
            Y_feed = data[Y.columns]

            tree = Tree(max_depth=self.max_depth,
                        min_child_weight=self.min_child_weight,
                        colsample_bylevel=self.colsample_bylevel,
                        reg_lambda=self.reg_lambda,
                        gamma=self.gamma,
                        num_thread=self.num_thread)

            tree.fit(X_feed, Y_feed)

            # predict the whole train set to update the y_pred, grad and hess
            preds = tree.predict(X[X_sample_column.columns])

            Y['y_pred'] += self.learning_rate * preds
            Y[GRAD_COLUMN], Y[HESS_COLUMN] = self.loss.compute_grad_hess(
                Y.y_pred.values, Y.label.values)

            # feature importance is computed only as "weight"; xgboost also supports "gain" and "cover"
            for feature, weight in tree.feature_importances_.iteritems():
                if feature in self.feature_importances_:
                    self.feature_importances_[feature] += weight
                else:
                    self.feature_importances_[feature] = weight

            self.trees.append(tree)

            if self.eval_metric is None:
                print '[SGBoost] train round: {0}'.format(idx)
            else:
                cur_val_score = self._eval_score(Y.label.values,
                                                 Y.y_pred.values)
                print '[SGBoost] train round: {0}, eval score: {1}'.format(
                    idx, cur_val_score)

                if early_stopping_rounds is not None:
                    if cur_val_score > best_val_score:
                        best_val_score = cur_val_score
                        score_worse_round = 0
                        best_round = idx
                    else:
                        score_worse_round += 1

                    if score_worse_round > early_stopping_rounds:
                        print '[SGBoost] train best round: {0}, best eval score: {1}'.format(
                            best_round, best_val_score)
                        break

        return self
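Examples #4, #5, #6 and #9 fetch their metric through get_metric, while Example #7 reads an _EVAL_METRIC dict; in every docstring "accuracy" is the one metric promised. A plausible minimal implementation of that lookup (an assumption, not the repos' actual code); note the metric is applied to transformed predictions, i.e. probabilities, hence the 0.5 cutoff:

import numpy as np

def accuracy(preds, labels):
    # preds are probabilities (the output of loss.transform); cut at 0.5
    return np.mean((np.asarray(preds) > 0.5).astype(float) == np.asarray(labels))

_EVAL_METRIC = {"accuracy": accuracy}

def get_metric(name):
    return _EVAL_METRIC[name]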
Example #8
    def fit(self,
            X,
            y,
            epoches=10,
            eta=0.3,
            num_boost_round=1000,
            max_depth=5,
            scale_pos_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            colsample_bylevel=0.8,
            min_child_weight=1,
            min_sample_split=10,
            reg_lambda=1.0,
            gamma=0,
            num_thread=-1,
            pred_cutoff=0.5):
        '''
        X: pandas.core.frame.DataFrame
        y: pandas.core.series.Series
        epoches: number of passes over the boosting rounds; after the first
                 epoch the same trees are refit
        eta: learning rate
        num_boost_round: number of boosting rounds per epoch
        max_depth: max depth of each tree
        scale_pos_weight: weight multiplier for samples with label 1
        subsample: row sample rate when building a tree
        colsample_bytree: column sample rate when building a tree
        colsample_bylevel: column sample rate when splitting each tree node;
                           the number of features used = total_features * colsample_bytree * colsample_bylevel
        min_child_weight: min sum of hessian needed in a child
        min_sample_split: min number of samples in a leaf node
        reg_lambda: L2 regularization term (lambda)
        gamma: minimum loss reduction required to make a further split (gamma)
        num_thread: number of threads for parallel training
        pred_cutoff: probability threshold for predicting class 1
        '''
        self.eta = eta
        self.num_boost_round = num_boost_round
        self.first_round_pred = 0.0
        self.subsample = subsample
        self.max_depth = max_depth
        self.colsample_bytree = colsample_bytree
        self.colsample_bylevel = colsample_bylevel
        self.reg_lambda = reg_lambda
        self.min_sample_split = min_sample_split
        self.gamma = gamma
        self.num_thread = num_thread
        self.min_child_weight = min_child_weight
        self.scale_pos_weight = scale_pos_weight
        self.pred_cutoff = pred_cutoff
        self.epoches = epoches

        # Reset X and y so they can be indexed by integer position starting at 0
        X.reset_index(drop=True, inplace=True)
        y.reset_index(drop=True, inplace=True)

        self.loss = binary_classification_loss(self.eta)

        Y = pd.DataFrame(y.values, columns=['label'])
        Y['y_pred'] = self.first_round_pred
        Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
        Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

        Y['sample_weight'] = 1.0
        # up-weight positive samples
        Y.loc[Y.label == 1, 'sample_weight'] = self.scale_pos_weight
        for epoch in xrange(epoches):
            loss = []
            for i in range(self.num_boost_round):
                # row and column sample before training the current tree
                data = X.sample(frac=self.colsample_bytree,
                                axis=1)  # column sample
                data = pd.concat([data, Y], axis=1)
                data = data.sample(frac=self.subsample, axis=0)  # row sample

                Y_selected = data[['label', 'y_pred', 'grad', 'hess']]
                X_selected = data.drop(
                    ['label', 'y_pred', 'grad', 'hess', 'sample_weight'],
                    axis=1)

                # fit a tree
                if epoch == 0:
                    tree = Tree()
                else:
                    tree = self.trees[i]
                tree.fit(X_selected,
                         Y_selected,
                         max_depth=self.max_depth,
                         min_child_weight=self.min_child_weight,
                         colsample_bylevel=self.colsample_bylevel,
                         min_sample_split=self.min_sample_split,
                         eta=self.eta,
                         gamma=self.gamma,
                         num_thread=self.num_thread)

                # predict the whole trainset and update y_pred,grad,hess
                preds = tree.predict(X)
                Y['y_pred'] += self.eta * preds

                Y['grad'] = self.loss.grad(Y.y_pred.values,
                                           Y.label.values) * Y.sample_weight
                Y['hess'] = self.loss.hess(Y.y_pred.values,
                                           Y.label.values) * Y.sample_weight

                loss.append(self.loss.value(Y.y_pred.values, Y.label.values))
                # update feature importance
                for k in tree.feature_importance.iterkeys():
                    self.feature_importance[k] += tree.feature_importance[k]
                if epoch == 0:
                    self.trees.append(tree)
            print "epoch:{} loss:{}".format(epoch, np.mean(loss))
Example #9
    def fit(self,
            X,
            y,
            validation_data=(None, None),
            early_stopping_rounds=np.inf,
            maximize=True,
            eval_metric=None,
            loss="logisticloss",
            eta=0.3,
            num_boost_round=1000,
            max_depth=6,
            scale_pos_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            colsample_bylevel=0.8,
            min_child_weight=1,
            min_sample_split=10,
            reg_lambda=1.0,
            gamma=0,
            num_thread=-1):
        """
        :param X: pandas.core.frame.DataFrame
        :param y: pandas.core.series.Series
        :param eta: learning rate
        :param num_boost_round: number of boosting round
        :param max_depth: max depth of each tree
        :param subsample: row sample rate when building a tree
        :param colsample_bytree: column sample rate when building a tree
        :param colsample_bylevel: column sample rate when splitting each tree node;
                                  the number of features used = total_features * colsample_bytree * colsample_bylevel
        :param min_sample_split: min number of samples in a leaf node
        :param loss: loss to optimize: "logisticloss", "squareloss", or a custom loss object
        :param reg_lambda: L2 regularization term (lambda)
        :param gamma: minimum loss reduction required to make a further split (gamma)
        :param num_thread: number of threads for parallel training
        :param eval_metric: evaluation metric, provided: "accuracy"
        """
        self.eta = eta
        self.num_boost_round = num_boost_round
        self.max_depth = max_depth
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.colsample_bylevel = colsample_bylevel
        self.reg_lambda = reg_lambda
        self.gamma = gamma
        self.min_sample_split = min_sample_split
        self.num_thread = num_thread
        self.eval_metric = eval_metric
        self.min_child_weight = min_child_weight
        self.scale_pos_weight = scale_pos_weight
        self.first_round_pred = 0.0

        X.reset_index(drop=True, inplace=True)
        y.reset_index(drop=True, inplace=True)

        # initialize the loss function
        if loss == "logisticloss":
            self.loss = LogisticLoss(reg_lambda)
        elif loss == "squareloss":
            self.loss = SquareLoss(reg_lambda)
            self.first_round_pred = y.mean()
        else:
            try:
                self.loss = CustomizeLoss(loss, reg_lambda)
            except Exception:
                raise NotImplementedError(
                    "loss should be 'logisticloss', 'squareloss', or a custom loss object"
                )

        # to evaluate on validation set and conduct early stopping
        # we should get (val_X,val_y)
        # and set some variable to check when to stop
        do_validation = True
        if not isinstance(validation_data, tuple):
            raise TypeError("validation_data should be (val_X, val_y)")

        val_X, val_y = validation_data
        if val_X is None or val_y is None:
            do_validation = False
        else:
            # type check
            if not isinstance(val_X, pd.core.frame.DataFrame):
                raise TypeError("val_X should be 'pd.core.frame.DataFrame'")
            if not isinstance(val_y, pd.core.series.Series):
                raise TypeError("val_X should be 'pd.core.series.Series'")
            val_X.reset_index(drop=True, inplace=True)
            val_y.reset_index(drop=True, inplace=True)
            val_Y = pd.DataFrame(val_y.values, columns=['label'])
            val_Y['y_pred'] = self.first_round_pred

        best_val_metric = -np.inf if maximize else np.inf
        best_round = 0
        become_worse_round = 0

        # Y stores: label, y_pred, grad, hess, sample_weight
        Y = pd.DataFrame(y.values, columns=['label'])
        Y['y_pred'] = self.first_round_pred
        Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
        Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)
        Y['sample_weight'] = 1.0
        Y.loc[Y.label == 1, 'sample_weight'] = self.scale_pos_weight

        for i in range(self.num_boost_round):
            # weighted grad and hess
            Y.grad = Y.grad * Y.sample_weight
            Y.hess = Y.hess * Y.sample_weight
            # row and column sample before training the current tree
            data = X.sample(frac=self.colsample_bytree, axis=1)
            data = pd.concat([data, Y], axis=1)
            data = data.sample(frac=self.subsample, axis=0)
            Y_selected = data[['label', 'y_pred', 'grad', 'hess']]
            X_selected = data.drop(
                ['label', 'y_pred', 'grad', 'hess', 'sample_weight'], axis=1)

            # train current tree
            tree = Tree()
            tree.fit(X_selected,
                     Y_selected,
                     max_depth=self.max_depth,
                     min_child_weight=self.min_child_weight,
                     colsample_bylevel=self.colsample_bylevel,
                     min_sample_split=self.min_sample_split,
                     reg_lambda=self.reg_lambda,
                     gamma=self.gamma,
                     num_thread=self.num_thread)

            # predict the whole trainset and update y_pred,grad,hess
            preds = tree.predict(X)
            Y['y_pred'] += self.eta * preds
            Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
            Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

            # update feature importance
            for k in tree.feature_importance.iterkeys():
                self.feature_importance[k] += tree.feature_importance[k]

            self.trees.append(tree)

            # print training information
            if self.eval_metric is None:
                print "TGBoost round {iteration}".format(iteration=i)
            else:
                try:
                    metric_func = get_metric(self.eval_metric)
                except Exception:
                    raise NotImplementedError(
                        "The given eval_metric is not provided")

                train_metric = metric_func(
                    self.loss.transform(Y.y_pred.values), Y.label.values)

                if not do_validation:
                    print "TGBoost round {iteration}, train-{eval_metric} is {train_metric}".format(
                        iteration=i,
                        eval_metric=self.eval_metric,
                        train_metric=train_metric)
                else:
                    val_Y['y_pred'] += self.eta * tree.predict(val_X)
                    val_metric = metric_func(
                        self.loss.transform(val_Y.y_pred.values),
                        val_Y.label.values)
                    print "TGBoost round {iteration}, train-{eval_metric} is {train_metric}, val-{eval_metric} is {val_metric}".format(
                        iteration=i,
                        eval_metric=self.eval_metric,
                        train_metric=train_metric,
                        val_metric=val_metric)

                    # check if to early stop
                    if maximize:
                        if val_metric > best_val_metric:
                            best_val_metric = val_metric
                            best_round = i
                            become_worse_round = 0
                        else:
                            become_worse_round += 1
                        if become_worse_round > early_stopping_rounds:
                            print "TGBoost training Stop, best round is {best_round}, best {eval_metric} is {best_val_metric}".format(
                                best_round=best_round,
                                eval_metric=eval_metric,
                                best_val_metric=best_val_metric)
                            break
                    else:
                        if val_metric < best_val_metric:
                            best_val_metric = val_metric
                            best_round = i
                            become_worse_round = 0
                        else:
                            become_worse_round += 1
                        if become_worse_round > early_stopping_rounds:
                            print "TGBoost training Stop, best round is {best_round}, best val-{eval_metric} is {best_val_metric}".format(
                                best_round=best_round,
                                eval_metric=eval_metric,
                                best_val_metric=best_val_metric)
                            break
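Stripped of sampling, weighting, and early stopping, each round of this fit does three things: fit a tree to (grad, hess), add eta times its predictions to y_pred, and recompute grad and hess. A self-contained toy round showing just those update mechanics (the per-sample Newton step -grad/hess stands in for a real fitted tree; everything here is my own sketch):

import numpy as np
import pandas as pd

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

eta = 0.3
Y = pd.DataFrame({'label': [1.0, 0.0, 1.0, 0.0]})
Y['y_pred'] = 0.0
Y['grad'] = sigmoid(Y.y_pred.values) - Y.label.values
Y['hess'] = sigmoid(Y.y_pred.values) * (1 - sigmoid(Y.y_pred.values))

preds = -Y.grad.values / Y.hess.values       # stub for tree.predict(X)

Y['y_pred'] += eta * preds                   # the boosting update
Y['grad'] = sigmoid(Y.y_pred.values) - Y.label.values
Y['hess'] = sigmoid(Y.y_pred.values) * (1 - sigmoid(Y.y_pred.values))
print(Y)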
Example #10
assert node.before_split_entropy is not None

# Test choose best attr
t1 = time.time()
node.choose_best_attr()
t2 = time.time()
print "time: %s" % (t2 - t1)

assert node.best_attr_index is not None
assert node.best_threshold is not None

print node.best_attr_index, node.best_threshold

# Test tree generation
indices = list(np.random.choice(X.shape[0], 5000))
X_tree = np.array([X[i, :] for i in indices])
y_tree = np.array([y[i] for i in indices])

t1 = time.time()
tree = Tree(X_tree, y_tree)
t2 = time.time()
print "time: %s" % (t2 - t1)
predictions = tree.predict(X_v)
print "accuracy: %s" % accuracy_score(y_v, predictions)

# Test boost
boosting = Boosting(X, y, n_estimators=128, n_samples=2048)
boosting.train()
predictions = boosting.predict(X_v)
print "accuracy: %s" % accuracy_score(y_v, predictions)
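The choose_best_attr call being timed above searches every attribute and candidate threshold for the split with the highest information gain. A compact, self-contained version of that search for binary labels (it mirrors the idea, not the repo's exact code):

import numpy as np

def entropy(y):
    p = np.mean(y)
    if p == 0.0 or p == 1.0:
        return 0.0
    return -p * np.log2(p) - (1 - p) * np.log2(1 - p)

def best_split(X, y):
    # return (attr_index, threshold) maximizing information gain
    best_j, best_t, best_gain = None, None, -np.inf
    base, n = entropy(y), len(y)
    for j in range(X.shape[1]):
        for t in np.unique(X[:, j]):
            mask = X[:, j] <= t
            left, right = y[mask], y[~mask]
            if len(left) == 0 or len(right) == 0:
                continue
            gain = base - (len(left) * entropy(left) + len(right) * entropy(right)) / n
            if gain > best_gain:
                best_j, best_t, best_gain = j, t, gain
    return best_j, best_t

X = np.array([[1.0, 7.0], [2.0, 5.0], [3.0, 6.0], [4.0, 8.0]])
y = np.array([0, 0, 1, 1])
print(best_split(X, y))  # (0, 2.0): splitting the first column separates the classes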