def main():
    """Demonstrate `build_tree` and `predict` on the horse data files.

    Builds a tree from 'horseTrain.txt' (timing the build), prints the tree,
    then prints the predictions for 'horseTest.txt' followed by that file's
    'Outcome' column.
    """
    training_frame = clean('horseTrain.txt')
    feature_names = [
        'K', 'Na', 'CL', 'HCO', 'Endotoxin', 'Anioingap', 'PLA2', 'SDH',
        'GLDH', 'TPP', 'Breath rate', 'PCV', 'Pulse rate', 'Fibrinogen',
        'Dimer', 'FibPerDim'
    ]
    model = Tree(training_frame, feature_names, 'Outcome')

    # Build the tree and report how long it took.
    print("Building the tree...")
    started = time.time()
    model.build_tree()
    elapsed = time.time() - started
    print("Time spent to build the tree %.2f seconds" % elapsed)
    print("Finish building the tree...")
    time.sleep(1)

    # Show the structure that was learned.
    print("Shape of Tree ...")
    model.print_tree()
    time.sleep(1)

    # Print predictions next to the expected outcomes of the test set.
    print("Accuracy of test data ... ")
    testing_frame = clean('horseTest.txt')
    print(model.predict(testing_frame))
    print(testing_frame['Outcome'])
def _init_extimator(self):
    """Fit one tree on a weighted bootstrap sample and register it.

    Rows are drawn with replacement from ``self.x`` / ``self.y`` using
    ``self.weights`` as sampling probabilities (all feature columns are
    kept). A tree whose accuracy on the full training set is exactly 0.5 is
    rejected and the procedure is retried.

    :return: ``(tree, y_pred)`` - the accepted tree and its predictions for
        ``self.x``. The tree is also appended to ``self.estimators``.
    """
    while True:
        # Draw the sample inside the retry loop: re-fitting on the very same
        # sample cannot change the outcome if Tree is deterministic, which
        # would make a rejected tree loop forever.
        index = np.random.choice(self.N, self.n_samples, p=self.weights)
        x_sub = np.array([self.x[inx, :] for inx in index])
        y_sub = np.array([self.y[inx] for inx in index])

        tree = Tree(x_sub, y_sub)
        y_pred = tree.predict(self.x)
        if accuracy_score(self.y, y_pred) != 0.5:
            self.estimators.append(tree)
            return tree, y_pred
def init_estimator(self):
    """Fit one tree on a weighted bootstrap sample and register it.

    Rows are drawn with replacement from ``self.X`` / ``self.y`` using
    ``self.weights`` as sampling probabilities. Progress, tree build time and
    training accuracy are printed. A tree whose accuracy on the full training
    set is exactly 0.5 is rejected and the procedure is retried.

    :return: ``(tree, predictions)`` - the accepted tree and its predictions
        for ``self.X``. The tree is also appended to ``self.estimators``.
    """
    print("%s / %s" % (self.count, self.n_estimators))
    while True:
        # Sample from the instance's own data (not module-level X / y) so the
        # tree is trained on the same data it is evaluated against, and draw
        # inside the loop so a retry actually sees a different sample.
        indices = np.random.choice(self.X.shape[0],
                                   self.n_samples,
                                   p=self.weights)
        X_tree = np.array([self.X[i, :] for i in indices])
        y_tree = np.array([self.y[i] for i in indices])

        t1 = time.time()
        tree = Tree(X_tree, y_tree)
        t2 = time.time()
        print("tree generation time: %s" % (t2 - t1))

        predictions = tree.predict(self.X)
        accuracy = accuracy_score(self.y, predictions)
        print("accuracy: %s" % accuracy)
        if accuracy != 0.50:
            self.estimators.append(tree)
            return tree, predictions
def fit(self,
        train_data,
        validation_data,
        early_stopping_rounds=np.inf,
        eval_metric=None,
        loss="logisticloss",
        eta=0.3,
        num_round=1000,
        max_depth=6,
        pool_size=1,
        min_instances_byleaf=1,
        scale_pos_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=1,
        reg_lambda=1.0,
        gamma=0,
        maximize=False):
    """
    Train the boosted trees, scoring a mandatory validation set each round.

    :param train_data: Data object, train data
    :param validation_data: Data object, validation data (must be non-empty)
    :param early_stopping_rounds: stop once the validation metric has failed
        to improve for more than this many consecutive rounds
    :param eval_metric: evaluation metric name passed to ``get_metric``;
        ``None`` disables metric reporting and early stopping
    :param loss: "logisticloss" or "squareloss"
    :param eta: learning rate
    :param num_round: number of boosting round
    :param max_depth: max depth of each tree
    :param pool_size: the num of processes
    :param min_instances_byleaf: min number of samples in a leaf node
    :param scale_pos_weight: weight assigned to rows whose label is 1.0
    :param subsample: row sample rate when building a tree
    :param colsample_bytree: column sample rate when building a tree
    :param min_child_weight: forwarded to ``Tree.fit``
    :param reg_lambda: lambda
    :param gamma: gamma
    :param maximize: if True a larger validation metric is better; the
        default (False) keeps the historical "smaller is better" behaviour
    :raises NotImplementedError: unknown ``loss`` or ``eval_metric``
    :raises ValueError: empty train or validation data
    """
    self.eta = eta
    self.num_round = num_round
    self.max_depth = max_depth
    self.pool_size = pool_size
    self.subsample = subsample
    self.colsample_bytree = colsample_bytree
    self.reg_lambda = reg_lambda
    self.gamma = gamma
    self.min_instances_byleaf = min_instances_byleaf
    self.eval_metric = eval_metric
    self.min_child_weight = min_child_weight
    self.scale_pos_weight = scale_pos_weight
    self.first_round_pred = 0.0

    # initial loss function
    if loss == "logisticloss":
        self.loss = LogisticLoss(self.reg_lambda)
    elif loss == "squareloss":
        self.loss = SquareLoss(self.reg_lambda)
        self.first_round_pred = train_data.getLabelMean()
    else:
        raise NotImplementedError(
            "loss should be 'logisticloss' or 'squareloss'")

    # validation rows, kept in a fixed order for metric computation
    valData = validation_data.getData()
    if not valData:
        raise ValueError("validation_data is empty !")
    valIdxList = []
    valLabels = []
    for idx in valData:
        valData[idx]['yPred'] = self.first_round_pred
        valIdxList.append(idx)
        valLabels.append(valData[idx]['label'])
    valLabels = np.array(valLabels)

    best_val_metric = -np.inf if maximize else np.inf
    best_round = 0
    become_worse_round = 0

    data = train_data.getData()
    # Check the loaded rows (the Data wrapper object itself was tested before).
    if not data:
        raise ValueError("train_data is empty !")

    idxList = []  # fixed row order used when computing the train metric
    labels = []
    for idx in data:
        row = data[idx]
        row['yPred'] = self.first_round_pred
        # grad/hess are functions of the current prediction and the label
        row['grad'] = self.loss.grad(row['yPred'], row['label'])
        row['hess'] = self.loss.hess(row['yPred'], row['label'])
        # NOTE(review): 'weight' is only written for positive rows here but
        # read for every row below - presumably Data pre-populates it.
        if row['label'] == 1.0:
            row['weight'] = self.scale_pos_weight
        idxList.append(idx)
        labels.append(row['label'])
    labels = np.array(labels)

    # Resolve the metric once, before any tree is trained.
    metric_func = None
    if self.eval_metric is not None:
        try:
            metric_func = get_metric(self.eval_metric)
        except Exception:
            raise NotImplementedError(
                "The given eval_metric is not provided")

    for i in range(self.num_round):
        # weighted grad and hess
        for idx in data:
            data[idx]['grad'] = data[idx]['grad'] * data[idx]['weight']
            data[idx]['hess'] = data[idx]['hess'] * data[idx]['weight']

        # row and column sample before training the current tree
        factors = train_data.getFactors()
        factorTypes = train_data.getFeatureTypes()
        sampledFactors = random.sample(
            factors, int(len(factors) * self.colsample_bytree))
        sampledIdx = random.sample(idxList,
                                   int(len(idxList) * self.subsample))
        sampledData = {idx: data[idx] for idx in sampledIdx}

        # train current tree
        tree = Tree()
        tree.fit(sampledData,
                 sampledFactors,
                 factorTypes,
                 max_depth=self.max_depth,
                 pool_size=self.pool_size,
                 min_child_weight=self.min_child_weight,
                 min_instances_byleaf=self.min_instances_byleaf,
                 reg_lambda=self.reg_lambda,
                 gamma=self.gamma)

        # Predict the WHOLE train set (not only the sampled rows): every row
        # must receive this tree's contribution, and every row needs fresh
        # grad/hess before they are re-weighted at the top of the next round.
        preds = tree.predict(data)
        for idx in data:
            row = data[idx]
            row['yPred'] += self.eta * preds[idx]
            row['grad'] = self.loss.grad(row['yPred'], row['label'])
            row['hess'] = self.loss.hess(row['yPred'], row['label'])

        # update feature importance
        for k in tree.feature_importance:
            self.feature_importance[k] += tree.feature_importance[k]

        self.trees.append(tree)

        # print training information
        if metric_func is None:
            print("Apollo round {iteration}".format(iteration=i))
            continue

        curPreds = np.array([data[idx]["yPred"] for idx in idxList])
        train_metric = metric_func(self.loss.transform(curPreds), labels)

        valPreds = tree.predict(valData)
        for idx in valData:
            valData[idx]['yPred'] += self.eta * valPreds[idx]
        curValPreds = np.array(
            [valData[idx]['yPred'] for idx in valIdxList])
        val_metric = metric_func(self.loss.transform(curValPreds),
                                 valLabels)
        print(
            "Apollo round {iteration}, train-{eval_metric} is {train_metric}, val-{eval_metric} is {val_metric}"
            .format(iteration=i,
                    eval_metric=self.eval_metric,
                    train_metric=train_metric,
                    val_metric=val_metric))

        # check if to early stop
        if maximize:
            improved = val_metric > best_val_metric
        else:
            improved = val_metric < best_val_metric
        if improved:
            best_val_metric = val_metric
            best_round = i
            become_worse_round = 0
        else:
            become_worse_round += 1
        if become_worse_round > early_stopping_rounds:
            print(
                "Apollo training Stop, best round is {best_round}, best val-{eval_metric} is {best_val_metric}"
                .format(best_round=best_round,
                        eval_metric=eval_metric,
                        best_val_metric=best_val_metric))
            break
def fit(self,
        X,
        y,
        eta=0.01,
        num_boost_round=1000,
        max_depth=5,
        rowsample=0.8,
        colsample_bytree=0.8,
        colsample_bylevel=0.8,
        min_sample_split=10,
        loss="logisticloss",
        l2_regularization=1.0,
        gamma=0.1,
        num_thread=-1,
        eval_metric=None):
    """
    Train the boosted trees on ``X`` / ``y``.

    :param X: pandas.core.frame.DataFrame
    :param y: pandas.core.series.Series
    :param eta: learning rate
    :param num_boost_round: number of boosting round
    :param max_depth: max depth of each tree
    :param rowsample: row sample rate when building a tree
    :param colsample_bytree: column sample rate when building a tree
    :param colsample_bylevel: column sample rate when spliting each tree node,
                              the number of features = total_features*colsample_bytree*colsample_bylevel
    :param min_sample_split: forwarded to ``Tree.fit``
    :param loss: loss object logisticloss,squareloss, or customize loss
    :param l2_regularization: lambda
    :param gamma: gamma
    :param num_thread: number of thread to parallel
    :param eval_metric: evaluation metric, provided: "accuracy"
    :raises NotImplementedError: unusable ``loss`` or unknown ``eval_metric``
    """
    self.eta = eta
    self.num_boost_round = num_boost_round
    self.max_depth = max_depth
    self.rowsample = rowsample
    self.colsample_bytree = colsample_bytree
    self.colsample_bylevel = colsample_bylevel
    self.l2_regularization = l2_regularization
    self.gamma = gamma
    self.min_sample_split = min_sample_split
    self.num_thread = num_thread
    self.eval_metric = eval_metric

    if loss == "logisticloss":
        self.loss = LogisticLoss(l2_regularization)
    elif loss == "squareloss":
        self.loss = SquareLoss(l2_regularization)
    else:
        try:
            self.loss = CustomizeLoss(loss, l2_regularization)
        except Exception:
            raise NotImplementedError(
                "loss should be 'logisticloss','squareloss', or customize loss function"
            )

    # Resolve the metric once, before any tree is trained.
    metric_func = None
    if self.eval_metric is not None:
        try:
            metric_func = get_metric(self.eval_metric)
        except Exception:
            raise NotImplementedError(
                "The given eval_metric is not provided")

    self.first_round_pred = y.mean()

    # Y below gets a fresh 0..n-1 index; give X the same one (on a local
    # copy, the caller's frame is untouched) so that pd.concat(axis=1) pairs
    # rows by position instead of producing NaN rows on mismatched labels.
    X = X.reset_index(drop=True)

    # Y stores label, y_pred, grad, hess
    Y = pd.DataFrame(y.values, columns=['label'])
    Y['y_pred'] = self.first_round_pred
    Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
    Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

    target_columns = ['label', 'y_pred', 'grad', 'hess']
    for i in range(self.num_boost_round):
        # sample samples and features to train current tree
        data = X.sample(frac=self.colsample_bytree, axis=1)
        data = pd.concat([data, Y], axis=1)
        data = data.sample(frac=self.rowsample, axis=0)
        Y_selected = data[target_columns]
        X_selected = data.drop(target_columns, axis=1)

        # train current tree
        tree = Tree()
        tree.fit(X_selected,
                 Y_selected,
                 max_depth=self.max_depth,
                 colsample_bylevel=self.colsample_bylevel,
                 min_sample_split=self.min_sample_split,
                 l2_regularization=self.l2_regularization,
                 gamma=self.gamma,
                 num_thread=self.num_thread)

        # predict the whole dataset and update y_pred,grad,hess
        preds = tree.predict(X)
        Y['y_pred'] += self.eta * preds
        Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
        Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

        if metric_func is not None:
            metric_value = metric_func(self.loss.transform(Y.y_pred.values),
                                       Y.label.values)
            print("TGBoost round {iteration}, {eval_metric} is {metric_value}".
                  format(iteration=i,
                         eval_metric=self.eval_metric,
                         metric_value=metric_value))
        else:
            # The placeholder was previously printed verbatim (no .format).
            print("TGBoost round {iteration}".format(iteration=i))

        # update feature importance
        for k in tree.feature_importance:
            self.feature_importance[k] += tree.feature_importance[k]

        self.trees.append(tree)
def fit(self,
        features,
        label,
        validation_data=(None, None),
        early_stopping_rounds=np.inf,
        maximize=True,
        eval_metric=None,
        loss="logisticloss",
        eta=0.3,
        num_boost_round=1000,
        max_depth=6,
        scale_pos_weight=1,
        subsample=0.8,
        colsample=0.8,
        min_child_weight=1,
        min_sample_split=10,
        reg_lambda=1.0,
        gamma=0,
        num_thread=-1):
    """
    Train the boosted trees, optionally with validation and early stopping.

    :param features: np.array
    :param label: np.array
    :param validation_data: tuple (val_features, val_label); validation is
        skipped when either element is None
    :param early_stopping_rounds: stop once the validation metric has failed
        to improve for more than this many consecutive rounds
    :param maximize: whether a larger validation metric counts as better
    :param eval_metric: evaluation metric, provided: "accuracy"
    :param loss: loss object logisticloss,squareloss, or customize loss
    :param eta: learning rate
    :param num_boost_round: number of boosting round
    :param max_depth: max depth of each tree
    :param subsample: row sample rate when building a tree
    :param colsample: column sample rate when building a tree
    :param min_sample_split: passed to the Tree constructor
    :param reg_lambda: lambda
    :param gamma: gamma
    :param num_thread: number of threads to parallel
    """
    # Remember the hyper-parameters on the instance.
    self.eta = eta
    self.num_boost_round = num_boost_round
    self.max_depth = max_depth
    self.subsample = subsample
    self.colsample = colsample
    self.reg_lambda = reg_lambda
    self.gamma = gamma
    self.min_sample_split = min_sample_split
    self.num_thread = num_thread
    self.eval_metric = eval_metric
    self.min_child_weight = min_child_weight
    self.scale_pos_weight = scale_pos_weight
    self.first_round_pred = 0

    # Pick the loss; only the square loss starts from the label mean.
    if loss == "logisticloss":
        self.loss = LogisticLoss()
    elif loss == "squareloss":
        self.loss = SquareLoss()
        self.first_round_pred = label.mean()
    else:
        try:
            self.loss = CustomizeLoss(loss)
        except:
            raise NotImplementedError(
                "loss should be 'logisticloss','squareloss', or customize loss function"
            )

    # Shared training structures reused by every tree.
    n_rows, n_cols = features.shape[0], features.shape[1]
    row_sampler = RowSampler(n_rows, self.subsample)
    col_sampler = ColumnSampler(n_cols, self.colsample)
    bin_structure = BinStructure(features)
    attribute_list = AttributeList(features, bin_structure)
    class_list = ClassList(label)
    class_list.initialize_pred(self.first_round_pred)
    class_list.update_grad_hess(self.loss)

    # Validation is active only when both features and labels are supplied.
    if not isinstance(validation_data, tuple):
        raise TypeError(
            "validation_data should be (val_features, val_label)")
    val_features, val_label = validation_data
    do_validation = val_features is not None and val_label is not None
    val_pred = None
    if do_validation:
        val_pred = np.ones(val_label.shape) * self.first_round_pred

    # Early-stopping bookkeeping.
    best_val_metric = -np.inf if maximize else np.inf
    best_round = 0
    become_worse_round = 0

    logging.info("TGBoost start training")
    for i in range(self.num_boost_round):
        t0 = time()

        # Grow one tree, then refresh pred / grad / hess in class_list.
        tree = Tree(self.min_sample_split, self.min_child_weight,
                    self.max_depth, self.colsample, self.subsample,
                    self.reg_lambda, self.gamma, self.num_thread)
        tree.fit(attribute_list, class_list, row_sampler, col_sampler,
                 bin_structure)
        class_list.update_pred(self.eta)
        class_list.update_grad_hess(self.loss)
        self.trees.append(tree)

        t1 = time()

        # Without a metric there is nothing more to report this round.
        if self.eval_metric is None:
            logging.info("TGBoost round {iteration}".format(iteration=i))
            continue

        try:
            mertric_func = get_metric(self.eval_metric)
        except:
            raise NotImplementedError(
                "The given eval_metric is not provided")

        train_metric = mertric_func(self.loss.transform(class_list.pred),
                                    label)

        if not do_validation:
            logging.info(
                "TGBoost round {iteration}, train-{eval_metric}: {train_metric:.4f}, exec time {tc:.3f}s"
                .format(iteration=i,
                        eval_metric=self.eval_metric,
                        train_metric=train_metric,
                        tc=t1 - t0))
            continue

        val_pred += self.eta * tree.predict(val_features)
        val_metric = mertric_func(self.loss.transform(val_pred), val_label)
        logging.info(
            "TGBoost round {iteration}, train-{eval_metric}: {train_metric:.4f}, val-{eval_metric}: {val_metric:.4f}, exec time {tc:.3f}s"
            .format(iteration=i,
                    eval_metric=self.eval_metric,
                    train_metric=train_metric,
                    val_metric=val_metric,
                    tc=t1 - t0))

        # Early stopping: the direction and the final message depend on
        # `maximize`.
        if maximize:
            improved = val_metric > best_val_metric
            stop_message = "TGBoost training Stop, best round is {best_round}, best {eval_metric} is {best_val_metric:.4f}"
        else:
            improved = val_metric < best_val_metric
            stop_message = "TGBoost training Stop, best round is {best_round}, best val-{eval_metric} is {best_val_metric:.4f}"

        if improved:
            best_val_metric = val_metric
            best_round = i
            become_worse_round = 0
        else:
            become_worse_round += 1

        if become_worse_round > early_stopping_rounds:
            logging.info(
                stop_message.format(best_round=best_round,
                                    eval_metric=eval_metric,
                                    best_val_metric=best_val_metric))
            break
def fit(self, X, y, eval_metric=None, early_stopping_rounds=None):
    """Train the boosted trees on ``X`` / ``y`` and return ``self``.

    Resets ``self.trees`` and ``self.feature_importances_``. The indexes of
    ``X`` and ``y`` are reset in place so rows can be aligned by position.

    :param X: feature frame (``reset_index``, ``sample`` and column
        selection are used on it)
    :param y: labels (``reset_index`` and ``.values`` are used on it)
    :param eval_metric: key into ``_EVAL_METRIC``; falsy disables scoring
    :param early_stopping_rounds: when set together with ``eval_metric``,
        stop once the score has failed to improve for more than this many
        consecutive rounds. The score is computed on the training
        predictions and larger is treated as better.
    :return: self
    """
    self.trees = []
    self.feature_importances_ = {}
    self.eval_metric = _EVAL_METRIC[eval_metric] if eval_metric else None

    X.reset_index(drop=True, inplace=True)
    y.reset_index(drop=True, inplace=True)

    # Y stores: label, y_pred, grad, hess, sample_weight.
    # The label column is always addressed through LABEL_COLUMN (it was
    # created under that name but read back via a hard-coded `.label`).
    Y = pd.DataFrame(y.values, columns=[LABEL_COLUMN])
    Y['y_pred'] = self.first_round_pred
    Y[GRAD_COLUMN], Y[HESS_COLUMN] = self.loss.compute_grad_hess(
        Y['y_pred'].values, Y[LABEL_COLUMN].values)

    if self._is_classifier:
        Y['sample_weight'] = 1.0
        Y.loc[Y[LABEL_COLUMN] == 1, 'sample_weight'] = self.scale_pos_weight

    if self.eval_metric is not None and early_stopping_rounds is not None:
        assert early_stopping_rounds > 0
        best_val_score = -np.inf
        score_worse_round = 0
        best_round = 0

    for idx in range(self.n_estimators):
        if self._is_classifier:
            Y[GRAD_COLUMN] = Y[GRAD_COLUMN] * Y['sample_weight']
            Y[HESS_COLUMN] = Y[HESS_COLUMN] * Y['sample_weight']

        # subsample column and row before training the current tree
        X_sample_column = X.sample(frac=self.colsample_bytree, axis=1)
        data = pd.concat([X_sample_column, Y], axis=1)
        data = data.sample(frac=self.subsample, axis=0)

        X_feed = data[X_sample_column.columns]
        Y_feed = data[Y.columns]

        tree = Tree(max_depth=self.max_depth,
                    min_child_weight=self.min_child_weight,
                    colsample_bylevel=self.colsample_bylevel,
                    reg_lambda=self.reg_lambda,
                    gamma=self.gamma,
                    num_thread=self.num_thread)
        tree.fit(X_feed, Y_feed)

        # predict the whole train set to update the y_pred, grad and hess
        preds = tree.predict(X[X_sample_column.columns])
        Y['y_pred'] += self.learning_rate * preds
        Y[GRAD_COLUMN], Y[HESS_COLUMN] = self.loss.compute_grad_hess(
            Y['y_pred'].values, Y[LABEL_COLUMN].values)

        # only compute feature importance in "weight" type, xgboost support
        # two more type "gain" and "cover"
        for feature, weight in tree.feature_importances_.items():
            self.feature_importances_[feature] = (
                self.feature_importances_.get(feature, 0) + weight)

        self.trees.append(tree)

        if self.eval_metric is None:
            print('[SGBoost] train round: {0}'.format(idx))
            continue

        cur_val_score = self._eval_score(Y[LABEL_COLUMN].values,
                                         Y['y_pred'].values)
        print('[SGBoost] train round: {0}, eval score: {1}'.format(
            idx, cur_val_score))

        if early_stopping_rounds is not None:
            if cur_val_score > best_val_score:
                best_val_score = cur_val_score
                score_worse_round = 0
                best_round = idx
            else:
                score_worse_round += 1

            if score_worse_round > early_stopping_rounds:
                print('[SGBoost] train best round: {0}, best eval score: {1}'.
                      format(best_round, best_val_score))
                break

    return self
def fit(self,
        X,
        y,
        epoches=10,
        eta=0.3,
        num_boost_round=1000,
        max_depth=5,
        scale_pos_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        colsample_bylevel=0.8,
        min_child_weight=1,
        min_sample_split=10,
        reg_lambda=1.0,
        gamma=0,
        num_thread=-1,
        pred_cutoff=0.5):
    '''
    Train the boosted trees for several epochs and print the mean loss.

    In the first epoch a new tree is created and stored for every boosting
    round; in later epochs the stored tree of the same round is fitted again.
    The indexes of X and y are reset in place.

    X: pandas.core.frame.DataFrame
    y: pandas.core.series.Series
    epoches: number of passes over all boosting rounds
    eta: learning rate (also passed to the loss and to Tree.fit)
    num_boost_round: number of boosting rounds per epoch
    max_depth: max depth of a tree
    scale_pos_weight: weight for samples whose label is 1
    subsample: row sample rate when building a tree
    colsample_bytree: column sample rate when building a tree
    colsample_bylevel: column sample rate when splitting each tree node;
        the number of features = total_features*colsample_bytree*colsample_bylevel
    min_child_weight: forwarded to Tree.fit
    min_sample_split: forwarded to Tree.fit
    reg_lambda: stored on the instance (not otherwise used in this method)
    gamma: forwarded to Tree.fit
    num_thread: forwarded to Tree.fit
    pred_cutoff: stored on the instance (not otherwise used in this method)
    '''
    self.eta = eta
    self.num_boost_round = num_boost_round
    self.first_round_pred = 0.0
    self.subsample = subsample
    self.max_depth = max_depth
    self.colsample_bytree = colsample_bytree
    self.colsample_bylevel = colsample_bylevel
    self.reg_lambda = reg_lambda
    self.min_sample_split = min_sample_split
    self.gamma = gamma
    self.num_thread = num_thread
    self.min_child_weight = min_child_weight
    self.scale_pos_weight = scale_pos_weight
    self.pred_cutoff = pred_cutoff
    self.epoches = epoches

    # Re-index X and y so both can be addressed by 0-based integer position.
    X.reset_index(drop=True, inplace=True)
    y.reset_index(drop=True, inplace=True)

    self.loss = binary_classification_loss(self.eta)

    Y = pd.DataFrame(y.values, columns=['label'])
    Y['y_pred'] = self.first_round_pred
    Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
    Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)
    Y['sample_weight'] = 1.0
    # Adjust the weight of positive samples.
    Y.loc[Y.label == 1, 'sample_weight'] = self.scale_pos_weight
    # Every later update multiplies grad/hess by sample_weight; apply the
    # same weighting to the initial values so the first tree sees it too.
    Y['grad'] = Y['grad'] * Y['sample_weight']
    Y['hess'] = Y['hess'] * Y['sample_weight']

    for epoch in range(epoches):
        round_losses = []
        for i in range(self.num_boost_round):
            # row and column sample before training the current tree
            data = X.sample(frac=self.colsample_bytree, axis=1)  # column sample
            data = pd.concat([data, Y], axis=1)
            data = data.sample(frac=self.subsample, axis=0)  # row sample

            Y_selected = data[['label', 'y_pred', 'grad', 'hess']]
            X_selected = data.drop(
                ['label', 'y_pred', 'grad', 'hess', 'sample_weight'], axis=1)

            # fit a tree: new in the first epoch, re-used afterwards
            tree = Tree() if epoch == 0 else self.trees[i]
            tree.fit(X_selected,
                     Y_selected,
                     max_depth=self.max_depth,
                     min_child_weight=self.min_child_weight,
                     colsample_bylevel=self.colsample_bylevel,
                     min_sample_split=self.min_sample_split,
                     eta=self.eta,
                     gamma=self.gamma,
                     num_thread=self.num_thread)

            # predict the whole trainset and update y_pred,grad,hess
            preds = tree.predict(X)
            Y['y_pred'] += self.eta * preds
            Y['grad'] = self.loss.grad(Y.y_pred.values,
                                       Y.label.values) * Y.sample_weight
            Y['hess'] = self.loss.hess(Y.y_pred.values,
                                       Y.label.values) * Y.sample_weight
            round_losses.append(
                self.loss.value(Y.y_pred.values, Y.label.values))

            # update feature importance
            # NOTE(review): this accumulates again on every epoch for the
            # same tree slot - confirm that is intended.
            for k in tree.feature_importance:
                self.feature_importance[k] += tree.feature_importance[k]

            if epoch == 0:
                self.trees.append(tree)

        print("epoch:{} loss:{}".format(epoch, np.mean(round_losses)))
def fit(self,
        X,
        y,
        validation_data=(None, None),
        early_stopping_rounds=np.inf,
        maximize=True,
        eval_metric=None,
        loss="logisticloss",
        eta=0.3,
        num_boost_round=1000,
        max_depth=6,
        scale_pos_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        colsample_bylevel=0.8,
        min_child_weight=1,
        min_sample_split=10,
        reg_lambda=1.0,
        gamma=0,
        num_thread=-1):
    """
    Train the boosted trees, optionally with validation and early stopping.

    The indexes of X, y (and of the validation frame/series, when given) are
    reset in place.

    :param X: pandas.core.frame.DataFrame
    :param y: pandas.core.series.Series
    :param validation_data: tuple (val_X, val_y); validation is skipped when
        either element is None
    :param early_stopping_rounds: stop once the validation metric has failed
        to improve for more than this many consecutive rounds
    :param maximize: whether a larger validation metric counts as better
    :param eval_metric: evaluation metric, provided: "accuracy"
    :param loss: loss object logisticloss,squareloss, or customize loss
    :param eta: learning rate
    :param num_boost_round: number of boosting round
    :param max_depth: max depth of each tree
    :param scale_pos_weight: sample weight given to rows whose label is 1
    :param subsample: row sample rate when building a tree
    :param colsample_bytree: column sample rate when building a tree
    :param colsample_bylevel: column sample rate when spliting each tree node,
                              the number of features = total_features*colsample_bytree*colsample_bylevel
    :param min_child_weight: forwarded to ``Tree.fit``
    :param min_sample_split: forwarded to ``Tree.fit``
    :param reg_lambda: lambda
    :param gamma: gamma
    :param num_thread: number of threads to parallel
    :raises NotImplementedError: unusable ``loss`` or unknown ``eval_metric``
    :raises TypeError: malformed ``validation_data``
    """
    self.eta = eta
    self.num_boost_round = num_boost_round
    self.max_depth = max_depth
    self.subsample = subsample
    self.colsample_bytree = colsample_bytree
    self.colsample_bylevel = colsample_bylevel
    self.reg_lambda = reg_lambda
    self.gamma = gamma
    self.min_sample_split = min_sample_split
    self.num_thread = num_thread
    self.eval_metric = eval_metric
    self.min_child_weight = min_child_weight
    self.scale_pos_weight = scale_pos_weight
    self.first_round_pred = 0.0

    X.reset_index(drop=True, inplace=True)
    y.reset_index(drop=True, inplace=True)

    # initial loss function
    if loss == "logisticloss":
        self.loss = LogisticLoss(reg_lambda)
    elif loss == "squareloss":
        self.loss = SquareLoss(reg_lambda)
        self.first_round_pred = y.mean()
    else:
        try:
            self.loss = CustomizeLoss(loss, reg_lambda)
        except Exception:
            raise NotImplementedError(
                "loss should be 'logisticloss','squareloss', or customize loss function"
            )

    # to evaluate on validation set and conduct early stopping
    # we should get (val_X,val_y)
    if not isinstance(validation_data, tuple):
        raise TypeError("validation_data should be (val_X, val_y)")
    val_X, val_y = validation_data
    do_validation = val_X is not None and val_y is not None
    if do_validation:
        # type check
        if not isinstance(val_X, pd.core.frame.DataFrame):
            raise TypeError("val_X should be 'pd.core.frame.DataFrame'")
        if not isinstance(val_y, pd.core.series.Series):
            # The message previously named val_X for this val_y check.
            raise TypeError("val_y should be 'pd.core.series.Series'")
        val_X.reset_index(drop=True, inplace=True)
        val_y.reset_index(drop=True, inplace=True)
        val_Y = pd.DataFrame(val_y.values, columns=['label'])
        val_Y['y_pred'] = self.first_round_pred

    # variables used to decide when to stop early
    best_val_metric = -np.inf if maximize else np.inf
    best_round = 0
    become_worse_round = 0

    # Resolve the metric once, before any tree is trained.
    metric_func = None
    if self.eval_metric is not None:
        try:
            metric_func = get_metric(self.eval_metric)
        except Exception:
            raise NotImplementedError(
                "The given eval_metric is not provided")

    # Y stores: label, y_pred, grad, hess, sample_weight
    Y = pd.DataFrame(y.values, columns=['label'])
    Y['y_pred'] = self.first_round_pred
    Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
    Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)
    Y['sample_weight'] = 1.0
    Y.loc[Y.label == 1, 'sample_weight'] = self.scale_pos_weight

    for i in range(self.num_boost_round):
        # weighted grad and hess
        Y['grad'] = Y['grad'] * Y['sample_weight']
        Y['hess'] = Y['hess'] * Y['sample_weight']

        # row and column sample before training the current tree
        data = X.sample(frac=self.colsample_bytree, axis=1)
        data = pd.concat([data, Y], axis=1)
        data = data.sample(frac=self.subsample, axis=0)
        Y_selected = data[['label', 'y_pred', 'grad', 'hess']]
        X_selected = data.drop(
            ['label', 'y_pred', 'grad', 'hess', 'sample_weight'], axis=1)

        # train current tree
        tree = Tree()
        tree.fit(X_selected,
                 Y_selected,
                 max_depth=self.max_depth,
                 min_child_weight=self.min_child_weight,
                 colsample_bylevel=self.colsample_bylevel,
                 min_sample_split=self.min_sample_split,
                 reg_lambda=self.reg_lambda,
                 gamma=self.gamma,
                 num_thread=self.num_thread)

        # predict the whole trainset and update y_pred,grad,hess
        preds = tree.predict(X)
        Y['y_pred'] += self.eta * preds
        Y['grad'] = self.loss.grad(Y.y_pred.values, Y.label.values)
        Y['hess'] = self.loss.hess(Y.y_pred.values, Y.label.values)

        # update feature importance
        for k in tree.feature_importance:
            self.feature_importance[k] += tree.feature_importance[k]

        self.trees.append(tree)

        # print training information
        if metric_func is None:
            print("TGBoost round {iteration}".format(iteration=i))
            continue

        train_metric = metric_func(self.loss.transform(Y.y_pred.values),
                                   Y.label.values)

        if not do_validation:
            print(
                "TGBoost round {iteration}, train-{eval_metric} is {train_metric}"
                .format(iteration=i,
                        eval_metric=self.eval_metric,
                        train_metric=train_metric))
            continue

        val_Y['y_pred'] += self.eta * tree.predict(val_X)
        val_metric = metric_func(self.loss.transform(val_Y.y_pred.values),
                                 val_Y.label.values)
        print(
            "TGBoost round {iteration}, train-{eval_metric} is {train_metric}, val-{eval_metric} is {val_metric}"
            .format(iteration=i,
                    eval_metric=self.eval_metric,
                    train_metric=train_metric,
                    val_metric=val_metric))

        # check if to early stop; direction and message depend on `maximize`
        if maximize:
            improved = val_metric > best_val_metric
            stop_message = "TGBoost training Stop, best round is {best_round}, best {eval_metric} is {best_val_metric}"
        else:
            improved = val_metric < best_val_metric
            stop_message = "TGBoost training Stop, best round is {best_round}, best val-{eval_metric} is {best_val_metric}"

        if improved:
            best_val_metric = val_metric
            best_round = i
            become_worse_round = 0
        else:
            become_worse_round += 1

        if become_worse_round > early_stopping_rounds:
            print(
                stop_message.format(best_round=best_round,
                                    eval_metric=eval_metric,
                                    best_val_metric=best_val_metric))
            break
# Entropy of the node must have been computed before any split is chosen.
assert node.before_split_entropy is not None

# Test choose best attr: time the search, then check that it produced both
# an attribute index and a threshold.
t1 = time.time()
node.choose_best_attr()
t2 = time.time()
print("time: %s" % (t2 - t1))
assert node.best_attr_index is not None
assert node.best_threshold is not None
print("%s %s" % (node.best_attr_index, node.best_threshold))

# Test tree generation: fit a tree on 5000 rows drawn with replacement and
# score it on the validation split (the score itself is not stored).
indices = list(np.random.choice(X.shape[0], 5000))
X_tree = np.array([X[i, :] for i in indices])
y_tree = np.array([y[i] for i in indices])
t1 = time.time()
tree = Tree(X_tree, y_tree)
t2 = time.time()
print("time: %s" % (t2 - t1))
predictions = tree.predict(X_v)
accuracy_score(y_v, predictions)

# Test boost: train the ensemble and score it on the validation split.
boosting = Boosting(X, y, n_estimators=128, n_samples=2048)
boosting.train()
predictions = boosting.predict(X_v)
accuracy_score(y_v, predictions)