def fit(self, features, label, validation_data=(None, None), early_stopping_rounds=np.inf, maximize=True, eval_metric=None, loss="logisticloss", eta=0.3, num_boost_round=1000, max_depth=6, scale_pos_weight=1, subsample=0.8, colsample=0.8, min_child_weight=1, min_sample_split=10, reg_lambda=1.0, gamma=0, num_thread=-1): """ :param features: np.array :param label: np.array :param eta: learning rate :param num_boost_round: number of boosting round :param max_depth: max depth of each tree :param subsample: row sample rate when building a tree :param colsample: column sample rate when building a tree :param min_sample_split: min number of samples in a leaf node :param loss: loss object logisticloss,squareloss, or customize loss :param reg_lambda: lambda :param gamma: gamma :param num_thread: number of threself.tree_predict_Xad to parallel :param eval_metric: evaluation metric, provided: "accuracy" """ self.eta = eta self.num_boost_round = num_boost_round self.max_depth = max_depth self.subsample = subsample self.colsample = colsample self.reg_lambda = reg_lambda self.gamma = gamma self.min_sample_split = min_sample_split self.num_thread = num_thread self.eval_metric = eval_metric self.min_child_weight = min_child_weight self.scale_pos_weight = scale_pos_weight self.first_round_pred = 0 # initial loss function if loss == "logisticloss": self.loss = LogisticLoss() elif loss == "squareloss": self.loss = SquareLoss() self.first_round_pred = label.mean() else: try: self.loss = CustomizeLoss(loss) except: raise NotImplementedError( "loss should be 'logisticloss','squareloss', or customize loss function" ) # initialize row_sampler, col_sampler, bin_structure, attribute_list, class_list row_sampler = RowSampler(features.shape[0], self.subsample) col_sampler = ColumnSampler(features.shape[1], self.colsample) bin_structure = BinStructure(features) attribute_list = AttributeList(features, bin_structure) class_list = ClassList(label) class_list.initialize_pred(self.first_round_pred) class_list.update_grad_hess(self.loss) # to evaluate on validation set and conduct early stopping # we should get (val_features,val_label) # and set some variable to check when to stop do_validation = True if not isinstance(validation_data, tuple): raise TypeError( "validation_data should be (val_features, val_label)") val_features, val_label = validation_data val_pred = None if val_features is None or val_label is None: do_validation = False else: val_pred = np.ones(val_label.shape) * self.first_round_pred if maximize: best_val_metric = -np.inf best_round = 0 become_worse_round = 0 else: best_val_metric = np.inf best_round = 0 become_worse_round = 0 # start learning logging.info("TGBoost start training") for i in range(self.num_boost_round): t0 = time() # train current tree tree = Tree(self.min_sample_split, self.min_child_weight, self.max_depth, self.colsample, self.subsample, self.reg_lambda, self.gamma, self.num_thread) tree.fit(attribute_list, class_list, row_sampler, col_sampler, bin_structure) # when finish building this tree, update the class_list.pred, grad, hess class_list.update_pred(self.eta) class_list.update_grad_hess(self.loss) # save this tree self.trees.append(tree) t1 = time() # print training information if self.eval_metric is None: logging.info("TGBoost round {iteration}".format(iteration=i)) else: try: mertric_func = get_metric(self.eval_metric) except: raise NotImplementedError( "The given eval_metric is not provided") train_metric = mertric_func( self.loss.transform(class_list.pred), label) if not do_validation: logging.info( "TGBoost round {iteration}, train-{eval_metric}: {train_metric:.4f}, exec time {tc:.3f}s" .format(iteration=i, eval_metric=self.eval_metric, train_metric=train_metric, tc=t1 - t0)) else: val_pred += self.eta * tree.predict(val_features) val_metric = mertric_func(self.loss.transform(val_pred), val_label) logging.info( "TGBoost round {iteration}, train-{eval_metric}: {train_metric:.4f}, val-{eval_metric}: {val_metric:.4f}, exec time {tc:.3f}s" .format(iteration=i, eval_metric=self.eval_metric, train_metric=train_metric, val_metric=val_metric, tc=t1 - t0)) # check whether to early stop if maximize: if val_metric > best_val_metric: best_val_metric = val_metric best_round = i become_worse_round = 0 else: become_worse_round += 1 if become_worse_round > early_stopping_rounds: logging.info( "TGBoost training Stop, best round is {best_round}, best {eval_metric} is {best_val_metric:.4f}" .format(best_round=best_round, eval_metric=eval_metric, best_val_metric=best_val_metric)) break else: if val_metric < best_val_metric: best_val_metric = val_metric best_round = i become_worse_round = 0 else: become_worse_round += 1 if become_worse_round > early_stopping_rounds: logging.info( "TGBoost training Stop, best round is {best_round}, best val-{eval_metric} is {best_val_metric:.4f}" .format(best_round=best_round, eval_metric=eval_metric, best_val_metric=best_val_metric)) break
class TGBoost(object): """ Tiny Gradient Boosting """ def __init__(self): self.trees = [] self.eta = None self.num_boost_round = None self.first_round_pred = None self.loss = None self.max_depth = None self.rowsample = None self.colsample_bytree = None self.colsample_bylevel = None self.l2_regularization = None self.min_sample_split = None self.gamma = None self.num_thread = None self.feature_importance = defaultdict(lambda: 0) def fit(self, X, y, eta=0.01, num_boost_round=1000, max_depth=5, rowsample=0.8, colsample_bytree=0.8, colsample_bylevel=0.8, min_sample_split=10, loss="logisticloss", l2_regularization=1.0, gamma=0.1, num_thread=-1, eval_metric=None): """ :param X: pandas.core.frame.DataFrame :param y: pandas.core.series.Series :param eta: learning rate :param num_boost_round: number of boosting round :param max_depth: max depth of each tree :param rowsample: row sample rate when building a tree :param colsample_bytree: column sample rate when building a tree :param colsample_bylevel: column sample rate when spliting each tree node, the number of features = total_features*colsample_bytree*colsample_bylevel :param min_sample_split: min number of samples in a leaf node :param loss: loss object logisticloss,squareloss, or customize loss :param l2_regularization: lambda :param gamma: gamma :param seed: random seed :param num_thread: number of thread to parallel :param eval_metric: evaluation metric, provided: "accuracy" """ self.eta = eta self.num_boost_round = num_boost_round self.max_depth = max_depth self.rowsample = rowsample self.colsample_bytree = colsample_bytree self.colsample_bylevel = colsample_bylevel self.l2_regularization = l2_regularization self.gamma = gamma self.min_sample_split = min_sample_split self.num_thread = num_thread self.eval_metric = eval_metric if loss == "logisticloss": self.loss = LogisticLoss(l2_regularization) elif loss == "squareloss": self.loss = SquareLoss(l2_regularization) else: try: self.loss = CustomizeLoss(loss, l2_regularization) except: raise NotImplementedError( "loss should be 'logisticloss','squareloss', or customize loss function" ) self.first_round_pred = y.mean() # Y stores label, y_pred, grad, hess Y = pd.DataFrame(y.values, columns=['label']) # only one column "label" Y['y_pred'] = self.first_round_pred Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values) Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values) for i in range(self.num_boost_round): # sample samples and features to train current tree data = X.sample(frac=self.colsample_bytree, axis=1) data = pd.concat([data, Y], axis=1) data = data.sample(frac=self.rowsample, axis=0) Y_selected = data[['label', 'y_pred', 'grad', 'hess']] X_selected = data.drop(['label', 'y_pred', 'grad', 'hess'], axis=1) # train current tree tree = Tree() tree.fit(X_selected, Y_selected, max_depth=self.max_depth, colsample_bylevel=self.colsample_bylevel, min_sample_split=self.min_sample_split, l2_regularization=self.l2_regularization, gamma=self.gamma, num_thread=self.num_thread) # predict the whole dataset and update y_pred,grad,hess preds = tree.predict(X) Y['y_pred'] += self.eta * preds Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values) Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values) if self.eval_metric is not None: try: mertric_func = get_metric(self.eval_metric) except: raise NotImplementedError( "The given eval_metric is not provided") metric_value = mertric_func( self.loss.transform(Y.y_pred.values), Y.label.values) print "TGBoost round {iteration}, {eval_metric} is {metric_value}".format( iteration=i, eval_metric=self.eval_metric, metric_value=metric_value) else: print "TGBoost round {iteration}" # update feature importance for k in tree.feature_importance.iterkeys(): self.feature_importance[k] += tree.feature_importance[k] self.trees.append(tree) def predict(self, X): preds = np.zeros((X.shape[0], )) for tree in self.trees: preds += self.eta * tree.predict(X) preds += self.first_round_pred return self.loss.transform(preds)
class TGBoost(object): """ Tiny Gradient Boosting """ def __init__(self): self.trees = [] self.eta = None self.num_boost_round = None self.first_round_pred = None self.loss = None self.max_depth = None self.subsample = None self.colsample_bytree = None self.colsample_bylevel = None self.reg_lambda = None self.min_sample_split = None self.gamma = None self.num_thread = None self.min_child_weight = None self.scale_pos_weight = None self.feature_importance = defaultdict(lambda: 0) def fit(self, X, y, validation_data=(None, None), early_stopping_rounds=np.inf, maximize=True, eval_metric=None, loss="logisticloss", eta=0.3, num_boost_round=1000, max_depth=6, scale_pos_weight=1, subsample=0.8, colsample_bytree=0.8, colsample_bylevel=0.8, min_child_weight=1, min_sample_split=10, reg_lambda=1.0, gamma=0, num_thread=-1): """ :param X: pandas.core.frame.DataFrame :param y: pandas.core.series.Series :param eta: learning rate :param num_boost_round: number of boosting round :param max_depth: max depth of each tree :param subsample: row sample rate when building a tree :param colsample_bytree: column sample rate when building a tree :param colsample_bylevel: column sample rate when spliting each tree node, the number of features = total_features*colsample_bytree*colsample_bylevel :param min_sample_split: min number of samples in a leaf node :param loss: loss object logisticloss,squareloss, or customize loss :param reg_lambda: lambda :param gamma: gamma :param seed: random seed :param num_thread: number of threself.tree_predict_Xad to parallel :param eval_metric: evaluation metric, provided: "accuracy" """ self.eta = eta self.num_boost_round = num_boost_round self.max_depth = max_depth self.subsample = subsample self.colsample_bytree = colsample_bytree self.colsample_bylevel = colsample_bylevel self.reg_lambda = reg_lambda self.gamma = gamma self.min_sample_split = min_sample_split self.num_thread = num_thread self.eval_metric = eval_metric self.min_child_weight = min_child_weight self.scale_pos_weight = scale_pos_weight self.first_round_pred = 0.0 X.reset_index(drop=True, inplace=True) y.reset_index(drop=True, inplace=True) # initial loss function if loss == "logisticloss": self.loss = LogisticLoss(reg_lambda) elif loss == "squareloss": self.loss = SquareLoss(reg_lambda) self.first_round_pred = y.mean() else: try: self.loss = CustomizeLoss(loss, reg_lambda) except: raise NotImplementedError( "loss should be 'logisticloss','squareloss', or customize loss function" ) # to evaluate on validation set and conduct early stopping # we should get (val_X,val_y) # and set some variable to check when to stop do_validation = True if not isinstance(validation_data, tuple): raise TypeError("validation_data should be (val_X, val_y)") val_X, val_y = validation_data if val_X is None or val_y is None: do_validation = False else: # type check if not isinstance(val_X, pd.core.frame.DataFrame): raise TypeError("val_X should be 'pd.core.frame.DataFrame'") if not isinstance(val_y, pd.core.series.Series): raise TypeError("val_X should be 'pd.core.series.Series'") val_X.reset_index(drop=True, inplace=True) val_y.reset_index(drop=True, inplace=True) val_Y = pd.DataFrame(val_y.values, columns=['label']) val_Y['y_pred'] = self.first_round_pred if maximize: best_val_metric = -np.inf best_round = 0 become_worse_round = 0 else: best_val_metric = np.inf best_round = 0 become_worse_round = 0 # Y stores: label, y_pred, grad, hess, sample_weight Y = pd.DataFrame(y.values, columns=['label']) Y['y_pred'] = self.first_round_pred Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values) Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values) Y['sample_weight'] = 1.0 Y.loc[Y.label == 1, 'sample_weight'] = self.scale_pos_weight for i in range(self.num_boost_round): # weighted grad and hess Y.grad = Y.grad * Y.sample_weight Y.hess = Y.hess * Y.sample_weight # row and column sample before training the current tree data = X.sample(frac=self.colsample_bytree, axis=1) data = pd.concat([data, Y], axis=1) data = data.sample(frac=self.subsample, axis=0) Y_selected = data[['label', 'y_pred', 'grad', 'hess']] X_selected = data.drop( ['label', 'y_pred', 'grad', 'hess', 'sample_weight'], axis=1) # train current tree tree = Tree() tree.fit(X_selected, Y_selected, max_depth=self.max_depth, min_child_weight=self.min_child_weight, colsample_bylevel=self.colsample_bylevel, min_sample_split=self.min_sample_split, reg_lambda=self.reg_lambda, gamma=self.gamma, num_thread=self.num_thread) # predict the whole trainset and update y_pred,grad,hess preds = tree.predict(X) Y['y_pred'] += self.eta * preds Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values) Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values) # update feature importance for k in tree.feature_importance.iterkeys(): self.feature_importance[k] += tree.feature_importance[k] self.trees.append(tree) # print training information if self.eval_metric is None: print "TGBoost round {iteration}".format(iteration=i) else: try: mertric_func = get_metric(self.eval_metric) except: raise NotImplementedError( "The given eval_metric is not provided") train_metric = mertric_func( self.loss.transform(Y.y_pred.values), Y.label.values) if not do_validation: print "TGBoost round {iteration}, train-{eval_metric} is {train_metric}".format( iteration=i, eval_metric=self.eval_metric, train_metric=train_metric) else: val_Y['y_pred'] += self.eta * tree.predict(val_X) val_metric = mertric_func( self.loss.transform(val_Y.y_pred.values), val_Y.label.values) print "TGBoost round {iteration}, train-{eval_metric} is {train_metric}, val-{eval_metric} is {val_metric}".format( iteration=i, eval_metric=self.eval_metric, train_metric=train_metric, val_metric=val_metric) # check if to early stop if maximize: if val_metric > best_val_metric: best_val_metric = val_metric best_round = i become_worse_round = 0 else: become_worse_round += 1 if become_worse_round > early_stopping_rounds: print "TGBoost training Stop, best round is {best_round}, best {eval_metric} is {best_val_metric}".format( best_round=best_round, eval_metric=eval_metric, best_val_metric=best_val_metric) break else: if val_metric < best_val_metric: best_val_metric = val_metric best_round = i become_worse_round = 0 else: become_worse_round += 1 if become_worse_round > early_stopping_rounds: print "TGBoost training Stop, best round is {best_round}, best val-{eval_metric} is {best_val_metric}".format( best_round=best_round, eval_metric=eval_metric, best_val_metric=best_val_metric) break def predict(self, X): assert len(self.trees) > 0 # TODO: actually the tree prediction can be parallel preds = np.zeros((X.shape[0], )) preds += self.first_round_pred for tree in self.trees: preds += self.eta * tree.predict(X) return self.loss.transform(preds)