def gbdt_lr(X_train, X_test, y_train, y_test):
    """GBDT + LR hybrid model.

    Fits a GBDT, one-hot encodes the leaf index each tree assigns to a
    sample, and trains a logistic regression on those encoded features.

    :param X_train: training features
    :param X_test: test features
    :param y_train: training labels
    :param y_test: test labels
    :return: (fpr, tpr) arrays of the ROC curve on the test set
    """
    # Fix: the original referenced undefined names X_train_lr / y_train_lr
    # when fitting the LR.  Carve out a dedicated half of the training data
    # so the LR is trained on samples the GBDT has not seen.
    X_train_gbdt, X_train_lr, y_train_gbdt, y_train_lr = train_test_split(
        X_train, y_train, test_size=0.5)

    # Supervised transformation based on GBDT
    # (n_estimator is a module-level hyperparameter)
    gbdt = GradientBoostingClassifier(n_estimators=n_estimator)
    gbdt.fit(X_train_gbdt, y_train_gbdt)

    # One-hot encode the per-tree leaf indices.
    # apply() is (n_samples, n_estimators, n_classes); take class axis 0.
    # Fix: dropped the stray global np.set_printoptions(...) side effect.
    gbdt_enc = OneHotEncoder(categories='auto')
    gbdt_enc.fit(gbdt.apply(X_train_gbdt)[:, :, 0])

    # Train the LR on the encoded leaves of the held-out half.
    gbdt_lr = LogisticRegression(solver='lbfgs', max_iter=1000)
    gbdt_lr.fit(gbdt_enc.transform(gbdt.apply(X_train_lr)[:, :, 0]),
                y_train_lr)

    y_pred_gbdt_lr = gbdt_lr.predict_proba(
        gbdt_enc.transform(gbdt.apply(X_test)[:, :, 0]))[:, 1]
    fpr_gbdt_lr, tpr_gbdt_lr, _ = roc_curve(y_test, y_pred_gbdt_lr)
    return fpr_gbdt_lr, tpr_gbdt_lr
def transform_with_gbm_to_categorical(header,tr_x,tr_y,ts_x,n_est=100,learning_rate=0.1,max_depth=5):
    """Turn GBDT leaf indices into one-hot categorical features.

    Fits a GBDT on the training data, then encodes the leaf each tree
    assigns to every train/test sample.  The encoder is fitted on the
    union of train and test leaves so both share one vocabulary.

    Returns (header, train_features, test_features); `header` is rebuilt
    as generated 'cat_i' column names.
    """
    booster = GradientBoostingClassifier(n_estimators=n_est,
                                         learning_rate=learning_rate,
                                         max_depth=max_depth)
    booster = booster.fit(tr_x, tr_y)

    # Flatten apply()'s 3-D output to (n_samples, n_leaf_columns).
    train_leaves = booster.apply(tr_x)
    train_leaves = train_leaves.reshape(train_leaves.shape[0], -1)
    test_leaves = booster.apply(ts_x)
    test_leaves = test_leaves.reshape(test_leaves.shape[0], -1)

    encoder = OneHotEncoder()
    encoder.fit(np.append(train_leaves, test_leaves, axis=0))

    tr_cat_features = encoder.transform(train_leaves).toarray()
    ts_cat_features = encoder.transform(test_leaves).toarray()

    header = ['cat_' + str(i) for i in range(ts_cat_features.shape[1])]
    print('[gbm_cat] Features size: ', len(header))
    return header, tr_cat_features, ts_cat_features
def gbdt_lr_train_test(libsvmFileName):
    """Grid-search a GBDT on a libsvm-format file, then compare three models
    by AUC (all results go to the log):

    1. the best GBDT alone,
    2. LR on one-hot encoded GBDT leaves,
    3. LR on encoded leaves concatenated with the raw features.

    :param libsvmFileName: path to the libsvm data file to split and load
    """
    split_dataset(libsvmFileName, './model_train/label_feature_data_train',
                  './model_train/label_feature_data_test', split_ratio, total)
    X_train, y_train = load_svmlight_file('./model_train/label_feature_data_train')
    X_test, y_test = load_svmlight_file('./model_train/label_feature_data_test')

    gbclf = GradientBoostingClassifier(n_estimators=30, max_depth=4, verbose=0)
    tuned_parameter = [{'n_estimators': [30, 40, 50, 60],
                        'max_depth': [3, 4, 5, 6, 7, 8, 9],
                        'max_features': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]
    gs_clf = GridSearchCV(gbclf, tuned_parameter, cv=5, scoring='roc_auc')
    gs_clf.fit(X_train.toarray(), y_train)
    logging.info('best parameters set found: ')
    logging.info(gs_clf.best_params_)

    y_pred_gbdt = gs_clf.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    logging.info('gbdt auc: %.5f' % gbdt_auc)

    # Fix: the original called apply() on the *unfitted* base gbclf, which
    # raises NotFittedError.  Use the grid search's refitted best estimator.
    gbclf = gs_clf.best_estimator_
    X_train_leaves = gbclf.apply(X_train)[:, :, 0]
    (train_rows, cols) = X_train_leaves.shape
    X_test_leaves = gbclf.apply(X_test)[:, :, 0]

    # Encode train+test leaves together so both share one vocabulary.
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # LR on the encoded leaves only.
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdtlr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    logging.info('gbdt+lr auc 1: %.5f' % gbdtlr_auc1)

    # LR on encoded leaves + raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    lr.fit(X_train_ext, y_train)
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdtlr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    logging.info('gbdt+lr auc 2: %.5f' % gbdtlr_auc2)
class GBDTLR:
    """GBDT leaf indices, one-hot encoded, fed into a logistic regression."""

    def __init__(self, n_estimators=50):
        # Generalized: the tree count was a magic 50 duplicated in three
        # places (constructor + two reshape calls); now a single parameter
        # with the same default, so existing callers are unaffected.
        self.n_estimators = n_estimators
        self.clf_gbdt = GradientBoostingClassifier(n_estimators=n_estimators)
        self.clf_lr = LogisticRegression()
        self.enc = OneHotEncoder()

    def _leaf_matrix(self, X):
        # apply() returns (n_samples, n_estimators, 1) for binary targets;
        # flatten to a 2-D leaf-index matrix for the encoder.
        return self.clf_gbdt.apply(X).reshape(-1, self.n_estimators)

    def fit(self, train_x, train_y):
        """Fit the GBDT, then the encoder, then the LR on encoded leaves."""
        self.clf_gbdt.fit(train_x, train_y)
        train_new_feature = self._leaf_matrix(train_x)
        self.enc.fit(train_new_feature)
        train_new_feature2 = np.array(
            self.enc.transform(train_new_feature).toarray())
        self.clf_lr.fit(train_new_feature2, train_y)
        return self

    def predict(self, X_test):
        """Return P(y=1) for each row of X_test."""
        test_new_feature = self._leaf_matrix(X_test)
        test_new_feature2 = np.array(
            self.enc.transform(test_new_feature).toarray())
        predict = self.clf_lr.predict_proba(test_new_feature2)[:, 1]
        return predict

    def save_model(self):
        # TODO: persistence not implemented
        pass

    def load_model(self):
        # TODO: persistence not implemented
        pass
def transform_with_gbm_to_categorical(header, tr_x, tr_y, ts_x,
                                      n_est=100, learning_rate=0.1,
                                      max_depth=5):
    """One-hot encode GBDT leaf indices as categorical features.

    A GBDT is trained on (tr_x, tr_y); the leaf reached in every tree is
    then one-hot encoded, with the encoder fitted on the stacked
    train+test leaves so both splits share a single vocabulary.

    Returns (header, train_features, test_features) where header holds
    generated 'cat_i' names.
    """
    model = GradientBoostingClassifier(n_estimators=n_est,
                                       learning_rate=learning_rate,
                                       max_depth=max_depth)
    model = model.fit(tr_x, tr_y)

    def flatten_leaves(data):
        # apply() is 3-D; collapse trailing axes into one leaf column set.
        raw = model.apply(data)
        return raw.reshape(raw.shape[0], -1)

    leaf_indices = flatten_leaves(tr_x)
    ts_leaf_indices = flatten_leaves(ts_x)

    enc = OneHotEncoder()
    enc.fit(np.append(leaf_indices, ts_leaf_indices, axis=0))

    tr_cat_features = enc.transform(leaf_indices).toarray()
    ts_cat_features = enc.transform(ts_leaf_indices).toarray()

    header = ["cat_" + str(i) for i in range(ts_cat_features.shape[1])]
    print("[gbm_cat] Features size: ", len(header))
    return header, tr_cat_features, ts_cat_features
def train_model(self):
    """Train three models on the churn data: a plain GBDT, a plain LR,
    and a fused GBDT+LR (LR over one-hot encoded GBDT leaf indices).

    Returns (enc, gbdt, lr, gbdt_lr).
    """
    lable = "Churn"
    ID = "customerID"
    # NOTE(review): columns come from self.train but rows come from
    # self.data below — confirm these refer to the same frame.
    x_columns = [x for x in self.train.columns if x not in [lable, ID]]
    x_train = self.data[x_columns]
    y_train = self.data[lable]
    # Build and train the GBDT.
    gbdt = GradientBoostingClassifier()
    gbdt.fit(x_train, y_train)
    # Plain LR on the raw features.
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    # Model fusion: LR over one-hot encoded GBDT leaves.
    gbdt_lr = LogisticRegression()
    enc = OneHotEncoder()
    print(gbdt.apply(x_train).shape)
    print(gbdt.apply(x_train).reshape(
        -1, 100).shape)  # apply() is 3-D; reshape to the 2-D form enc accepts
    # 100 = n_estimators (the GradientBoostingClassifier default)
    enc.fit(gbdt.apply(x_train).reshape(-1, 100))  # apply() gives each tree's leaf index
    gbdt_lr.fit(enc.transform(gbdt.apply(x_train).reshape(-1, 100)), y_train)
    return enc, gbdt, lr, gbdt_lr
def model_fit(train_X, train_y, test_X, sample_fraction):
    """Fit LR, GBDT, and GBDT+LR; pickle their rescaled test predictions.

    :param train_X: training features (split 50/50 between trees and LR)
    :param train_y: training labels
    :param test_X: test features to score
    :param sample_fraction: negative-downsampling fraction used to rescale
        predicted probabilities back to the original prior
    """
    def rescale_prediction(x):
        # Undo the downsampling of the majority class.
        return x / (x + (1 - x)/sample_fraction)

    train_X, train_X_lr, train_y, train_y_lr = train_test_split(
        train_X, train_y, test_size=0.5)

    # logistic regression
    l1_ratio = 1
    model = SGDClassifier(loss='log', l1_ratio=l1_ratio, penalty='l1')
    model.fit(train_X, train_y)
    y_pred_lr = rescale_prediction(model.predict_proba(test_X)[:, 1])

    # gradient boosted tree
    grd = GradientBoostingClassifier(n_estimators=100, verbose=2)
    grd.fit(train_X, train_y)
    y_pred_grd = rescale_prediction(grd.predict_proba(test_X)[:, 1])

    # GBDT + LR: one-hot encode leaf indices, fit a linear model on them
    grd_enc = OneHotEncoder(categories='auto', sparse=False)
    grd_enc.fit(grd.apply(train_X)[:, :, 0])
    grd_lm = SGDClassifier(loss='log', l1_ratio=1, penalty='l1',
                           max_iter=1000, verbose=True)
    grd_lm.fit(grd_enc.transform(grd.apply(train_X_lr)[:, :, 0]), train_y_lr)
    # Fix: this used to overwrite y_pred_grd, so 'gdt' and 'gdt_lr' were
    # the same array; keep the two predictions separate.
    y_pred_grd_lm = rescale_prediction(
        grd_lm.predict_proba(grd_enc.transform(grd.apply(test_X)[:, :, 0]))[:, 1])

    res = {'lr': y_pred_lr, 'gdt': y_pred_grd, 'gdt_lr': y_pred_grd_lm}
    # Fix: close the file handle deterministically.
    with open(".\\interim\\pred.pkl", 'wb') as fh:
        pickle.dump(res, fh)
def fit(self, **kwargs) -> Model:
    """Train the GBDT -> one-hot -> LR pipeline on the training split and
    record its evaluation on that same split.

    Keyword Args:
        feature_list: optional subset of features to train on; when absent
            the model name is suffixed with '(-irt)'.
    Returns:
        self
    """
    feature_list = kwargs.get('feature_list', None)
    if not feature_list:
        self.name = self.name + '(-irt)'
    self.train_x = self.select_features(self.feature.features_train, feature_list)
    self.train_y = self.feature.label_train.values
    self.feature_names = self.train_x.columns
    self.train_x, self.train_y = self.tf_sample(self.train_x, self.train_y)

    booster = GradientBoostingClassifier(**self.param)
    encoder = OneHotEncoder()
    linear = LogisticRegression(penalty='l2', C=1, solver='lbfgs')

    booster.fit(self.train_x, self.train_y)
    leaves = booster.apply(self.train_x)[:, :, 0]
    encoder.fit(leaves)
    linear.fit(encoder.transform(leaves), self.train_y)

    self.grd = booster
    self.grd_enc = encoder
    self.model = linear

    # Evaluate on the training split.
    self.train_y_pred = self.predict(self.train_x)
    self.train_y = np.array(self.train_y)
    self.train_y_pred = np.array(self.train_y_pred)
    self.train_ev = self.evaluation.evaluate(y_true=self.train_y,
                                             y_pred=self.train_y_pred,
                                             threshold=0.5)
    return self
class GBDTLR(BaseEstimator, ClassifierMixin):
    """sklearn-compatible GBDT + LR classifier.

    The training data is split in half: one half trains the GBDT, whose
    per-tree leaf indices are one-hot encoded; the other half trains a
    logistic regression on those encoded features.
    """

    def __init__(self, n_estimators=100, max_depth=3, min_samples_leaf=1,
                 max_leaf_nodes=None, subsample=1.0, learning_rate=0.1,
                 max_iter=100, C=1.0, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.subsample = subsample
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.C = C
        self.random_state = random_state
        self.gbdt_params = {
            'n_estimators': self.n_estimators,
            'max_depth': self.max_depth,
            'min_samples_leaf': self.min_samples_leaf,
            'max_leaf_nodes': self.max_leaf_nodes,
            'subsample': self.subsample,
            'learning_rate': self.learning_rate
        }
        self.lr_params = {'C': self.C, 'max_iter': self.max_iter}
        self.GBDT = GradientBoostingClassifier(**self.gbdt_params,
                                               random_state=random_state)
        self.LR = LogisticRegression(**self.lr_params,
                                     random_state=random_state)
        self.ENC = OneHotEncoder(categories='auto')

    def fit(self, X, y):
        """Fit GBDT on one half of (X, y) and LR on the encoded leaves of
        the other half.  Returns self."""
        # Fix: propagate random_state to the split so the estimator is
        # reproducible, honoring its own random_state parameter.
        X_gbdt, X_lr, Y_gbdt, Y_lr = train_test_split(
            X, y, test_size=0.5, random_state=self.random_state)
        self.GBDT.fit(X_gbdt, Y_gbdt)
        tree_feature = self.GBDT.apply(X_gbdt)[:, :, 0]
        self.ENC.fit(tree_feature)
        self.LR.fit(self.ENC.transform(self.GBDT.apply(X_lr)[:, :, 0]), Y_lr)
        # Fix: return self (sklearn convention) rather than the inner LR,
        # so pipelines and clone-based tools work.
        return self

    def predict(self, X):
        """Predict class labels for X."""
        return self.LR.predict(self.ENC.transform(self.GBDT.apply(X)[:, :, 0]))

    def predict_proba(self, X):
        """Predict class probabilities for X."""
        return self.LR.predict_proba(
            self.ENC.transform(self.GBDT.apply(X)[:, :, 0]))

    def predict_log_proba(self, X):
        """Predict log class probabilities for X."""
        return self.LR.predict_log_proba(
            self.ENC.transform(self.GBDT.apply(X)[:, :, 0]))
def compare_models(): X, y = make_classification(n_samples=10000) # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 对lr部分也给出训练集 X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split( X_train, y_train, test_size=0.5) # 建立模型 n_estimaters = 100 gbc = GradientBoostingClassifier(n_estimators=n_estimaters) encoder = OneHotEncoder() lr = LogisticRegression() # 训练决策树 gbc.fit(X_train, y_train) # encode 编码规则训练 # apply返回每个树最终落到那个叶子上 apply返回的是 所在样本落在第几个树的第几个叶上 # 注意apply返回的维度 encoder.fit(gbc.apply(X_train)[:, :, 0]) # 训练Logistic 单独采用一些样本 lr.fit(encoder.transform(gbc.apply(X_train_lr)[:, :, 0]), y_train_lr) # predict # 预测概率 .predict_proba 返回的是每个类里面的概率,只需要选择正类的概率即可 y_test_pred = lr.predict_proba( encoder.transform(gbc.apply(X_test)[:, :, 0]))[:, 1] # plot roc # roc的分类只能鉴别两类 并且必须预测结果必须能以秩排序 fpr_gbc_lr, tpr_gbc_lr, _ = roc_curve(y_test, y_test_pred) auc = roc_auc_score(y_test, y_test_pred) print(auc) # make roc graph fig = plt.figure() ax = fig.add_subplot(1, 1, 1) ax.plot([0, 1], [0, 1], 'k-') ax.plot(fpr_gbc_lr, tpr_gbc_lr, label='gbc-lr') # 若只采用lr回归预测看看效果呢 lr.fit(X_train_lr, y_train_lr) y_test_pred_lr = lr.predict_proba(X_test)[:, 1] fpr_lr, tpr_lr, _ = roc_curve(y_test, y_test_pred_lr) ax.plot(fpr_lr, tpr_lr, label='lr') # 若只采用gbc来预测呢 y_test_pred_gbc = gbc.predict_proba(X_test)[:, 1] fpr_gbc, tpr_gbc, _ = roc_curve(y_test, y_test_pred_gbc) ax.plot(fpr_gbc, tpr_gbc, label='gbc') # 呈现 ax.legend(loc='best') plt.show()
def GBDTLR_Fit(X_train, y_train, pars):
    """Fit a GBDT, one-hot encode its leaf indices, then fit an LR on them.

    :param X_train: training features
    :param y_train: training labels
    :param pars: keyword parameters forwarded to GradientBoostingClassifier
    :return: (gbdt_lr, model_onehot, gbdt)
    """
    gbdt = GradientBoostingClassifier(**pars)
    gbdt.fit(X_train, y_train)
    # apply() is (n_samples, n_estimators, n_classes); slice the class axis.
    leaf_idx = gbdt.apply(X_train)[:, :, 0]
    model_onehot = OneHotEncoder()
    model_onehot.fit(leaf_idx)
    gbdt_lr = LogisticRegression()
    gbdt_lr.fit(model_onehot.transform(leaf_idx), y_train)
    return gbdt_lr, model_onehot, gbdt
class GRDTransformer:
    """Feature transformer: GBDT leaf indices -> dense one-hot features."""

    def __init__(self, n_estimators):
        # n_estimators controls the number of trees (= leaf columns).
        self.model = GradientBoostingClassifier(n_estimators=n_estimators)
        self.enc = OneHotEncoder(sparse=False)

    def fit(self, X, y):
        """Fit the GBDT on (X, y), then the encoder on its leaf indices."""
        self.model.fit(X, y)
        self.enc.fit(self.model.apply(X)[:, :, 0])
        # Fix: return self so the transformer chains like sklearn's fit();
        # callers that ignored the old None return are unaffected.
        return self

    def transform(self, X):
        """One-hot encode the leaf each tree assigns to every row of X."""
        return self.enc.transform(self.model.apply(X)[:, :, 0])
def GBDTLR():
    """GBDT (10 trees) leaves -> one-hot -> LR; prints the test AUC.

    Uses module-level X_train/Y_train, X_train_lr/Y_train_lr,
    X_test/Y_test.  Returns (fpr, tpr) for the ROC curve.
    """
    booster = GradientBoostingClassifier(n_estimators=10)
    booster.fit(X_train, Y_train)

    encoder = OneHotEncoder()
    encoder.fit(booster.apply(X_train)[:, :, 0])

    logit = LogisticRegression()
    logit.fit(encoder.transform(booster.apply(X_train_lr)[:, :, 0]), Y_train_lr)

    Y_pred = logit.predict_proba(
        encoder.transform(booster.apply(X_test)[:, :, 0]))[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, Y_pred)
    auc = roc_auc_score(Y_test, Y_pred)
    print('GradientBoosting+LogisticRegression:', auc)
    return fpr, tpr
def GdbtLR(X_train, y_train, X_test, y_test, X_train_lr, y_train_lr):
    """GBDT (50 trees) + LR on one-hot leaves; prints AUC, returns ROC.

    :param X_train, y_train: data for the GBDT stage
    :param X_test, y_test: evaluation data
    :param X_train_lr, y_train_lr: held-out data for the LR stage
    :return: (fpr, tpr) on the test set
    """
    booster = GradientBoostingClassifier(n_estimators=50)
    leaf_encoder = OneHotEncoder()
    logit = LogisticRegression()

    booster.fit(X_train, y_train)
    leaf_encoder.fit(booster.apply(X_train)[:, :, 0])
    logit.fit(leaf_encoder.transform(booster.apply(X_train_lr)[:, :, 0]),
              y_train_lr)

    y_pred_grd_lr = logit.predict_proba(
        leaf_encoder.transform(booster.apply(X_test)[:, :, 0]))[:, 1]
    fpr_grd_lr, tpr_grd_lr, _ = roc_curve(y_test, y_pred_grd_lr)
    auc = roc_auc_score(y_test, y_pred_grd_lr)
    print("GDBT+LR:", auc)
    return fpr_grd_lr, tpr_grd_lr
def gbdt_lr_model():
    """GBDT + LR: one-hot encode GBDT leaf indices, fit LR on them, and
    print the test-set AUC.

    Relies on module-level globals: n_estimator, max_depth, X_train,
    y_train, X_train_lr, y_train_lr, X_test, y_test.
    """
    gbdt = GradientBoostingClassifier(n_estimators=n_estimator,
                                      max_depth=max_depth)
    gbdt_enc = OneHotEncoder()
    gbdt_lm = LogisticRegression()

    gbdt.fit(X_train, y_train)
    gbdt_enc.fit(gbdt.apply(X_train)[:, :, 0])
    # LR trained on a held-out split to avoid overfitting the leaf encoding.
    gbdt_lm.fit(gbdt_enc.transform(gbdt.apply(X_train_lr)[:, :, 0]),
                y_train_lr)

    y_pred = gbdt_lm.predict_proba(
        gbdt_enc.transform(gbdt.apply(X_test)[:, :, 0]))[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    # Fix: Python 2 print statement -> print() function (the statement form
    # is a SyntaxError under Python 3, which the rest of this file targets).
    print('GBDT+LR AUC: {0}'.format(auc(fpr, tpr)))
def GBC_Logistic(X_train, y_train, X_test):
    """GBDT + LR scorer: split the training data in half, fit the trees on
    one half, fit an LR on the one-hot encoded leaves of the other half,
    and return P(y=1) for X_test.
    """
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(
        X_train, y_train, test_size=0.5)

    booster = GradientBoostingClassifier(n_estimators=200, learning_rate=0.5)
    encoder = OneHotEncoder()
    logit = LogisticRegression()

    booster.fit(X_train, y_train)
    encoder.fit(booster.apply(X_train)[:, :, 0])
    logit.fit(encoder.transform(booster.apply(X_train_lr)[:, :, 0]),
              y_train_lr)

    return logit.predict_proba(
        encoder.transform(booster.apply(X_test)[:, :, 0]))[:, 1]
def GBDT_LReval(self, feature, target):
    """Evaluate GBDT, LR-on-GBDT-leaves, and LR-on-combined-features.

    Splits `feature`/`target` at self.lenth_eval into train/eval parts,
    then prints the AUC of each variant.
    """
    # Split rows: first lenth_eval rows train, the rest evaluate.
    feature_train = feature.iloc[0:self.lenth_eval, :]
    feature_eval = feature.iloc[self.lenth_eval:, :]
    label_train = target.iloc[0:self.lenth_eval]
    label_eval = target.iloc[self.lenth_eval:]
    # Baseline: plain GBDT.
    GBDT = GradientBoostingClassifier(n_estimators=10)
    GBDT.fit(feature_train.values, label_train.values)
    y_pred_gbdt = GBDT.predict_proba(feature_eval.values)[:, 1]
    gbdt_auc = roc_auc_score(label_eval.values, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)
    # Leaf indices per tree; apply() is 3-D, slice the class axis.
    X_train_leaves = GBDT.apply(feature_train)[:, :, 0]
    X_test_leaves = GBDT.apply(feature_eval)[:, :, 0]
    (train_rows, cols) = X_train_leaves.shape
    # Encode train+eval leaves together so they share one vocabulary.
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    # Define the LR model.
    lr = LogisticRegression()
    # Train the LR on the GBDT-encoded features.
    lr.fit(X_trans[:train_rows, :], label_train)
    # Predict and score by AUC.
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(label_eval, y_pred_gbdtlr1)
    # Define a second LR model.
    lr = LogisticRegression(n_jobs=-1)
    # Combine encoded leaves with the raw features.
    X_train_ext = hstack([X_trans[:train_rows, :], feature_train])
    X_test_ext = hstack([X_trans[train_rows:, :], feature_eval])
    print(X_train_ext.shape)
    # Train the LR on the combined features.
    lr.fit(X_train_ext, label_train)
    # Predict and score by AUC.
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(label_eval, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)
    # Debug output of the raw leaf matrices.
    print(X_train_leaves.shape)
    print(X_test_leaves.shape)
    print(X_train_leaves)
    print(X_test_leaves)
def gbdtLR():
    """Heavily-tuned GBDT + L1 LR on one-hot leaves; prints the test AUC.

    Uses module-level train_x/train_y, train_x_lr/train_y_lr,
    test_x/test_y.  Returns (fpr, tpr).
    """
    booster = GradientBoostingClassifier(learning_rate=0.005,
                                         n_estimators=2400,
                                         max_depth=3,
                                         min_samples_split=800,
                                         min_samples_leaf=600,
                                         max_features=9,
                                         subsample=0.7,
                                         random_state=20)
    booster.fit(train_x, train_y)

    encoder = OneHotEncoder()
    encoder.fit(booster.apply(train_x)[:, :, 0])

    logit = LogisticRegression(n_jobs=4, C=0.1, penalty='l1')
    logit.fit(encoder.transform(booster.apply(train_x_lr)[:, :, 0]),
              train_y_lr)

    Y_pred = logit.predict_proba(
        encoder.transform(booster.apply(test_x)[:, :, 0]))[:, 1]
    fpr, tpr, _ = roc_curve(test_y, Y_pred)
    auc = roc_auc_score(test_y, Y_pred)
    print('GBDT + LogisticRegression: ', auc)
    return fpr, tpr
def gbdt_lr_mix():
    """GBDT (10 trees) + L2 LR on one-hot leaves; prints shapes/estimators
    for debugging and the final AUC.  Returns (fpr, tpr).

    Uses module-level X_train/Y_train and X_test/Y_test.
    """
    booster = GradientBoostingClassifier(n_estimators=10)
    booster.fit(X_train, Y_train)

    leaf_matrix = booster.apply(X_train)[:, :, 0]
    print(np.shape(leaf_matrix))
    print(booster.estimators_)

    encoder = OneHotEncoder()
    encoder.fit(leaf_matrix)

    logit = LogisticRegression(n_jobs=4, C=0.1, penalty='l2')
    logit.fit(encoder.transform(leaf_matrix), Y_train)

    Y_pred = logit.predict_proba(
        encoder.transform(booster.apply(X_test)[:, :, 0]))[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, Y_pred)
    auc = roc_auc_score(Y_test, Y_pred)
    print('gbdt + lr: ', auc)
    return fpr, tpr
def fit_(self, X, y, **kwargs):
    """fit GBDT transformer

    Args:
        X (DataFrame|array-like)
        y (str|array-like): label array, or the name of the label column
            inside X
        select_dtypes (str|numpy.dtypes): `'object'`, `'number'` etc.
            only selected dtypes will be transform,

    Returns:
        dict with the fitted 'gbdt' model and 'onehot' encoder
    """
    # When y names a column, pop it out of a copy of X.
    if isinstance(y, str):
        X = X.copy()
        y = X.pop(y)

    booster = GradientBoostingClassifier(**kwargs)
    booster.fit(X, y)

    # Flatten apply()'s 3-D leaf output to 2-D for the encoder.
    leaves = booster.apply(X)
    leaves = leaves.reshape(-1, leaves.shape[1])
    encoder = OneHotEncoder().fit(leaves)

    return {'gbdt': booster, 'onehot': encoder}
class GBDTTransformer(TransformerMixin):
    """GBDT transformer

    One-hot encodes the leaf index a fitted GBDT assigns each sample.
    """
    def __init__(self):
        # Both populated by fit().
        self.gbdt = None
        self.onehot = None

    @support_exclude
    @support_select_dtypes
    def fit(self, X, y, **kwargs):
        """fit GBDT transformer

        Args:
            X (DataFrame|array-like)
            y (str|array-like): label array, or name of the label column in X
            select_dtypes (str|numpy.dtypes): `'object'`, `'number'` etc.
            only selected dtypes will be transform,
        """
        # When y names a column, pop it out of a copy of X.
        if isinstance(y, str):
            X = X.copy()
            y = X.pop(y)
        self.gbdt = GradientBoostingClassifier(**kwargs)
        self.gbdt.fit(X, y)
        # apply() is 3-D (n_samples, n_estimators, n_classes); this flattens
        # to (-1, n_estimators).  NOTE(review): for multiclass targets the
        # row count changes — presumably binary targets only; confirm.
        X = self.gbdt.apply(X)
        X = X.reshape(-1, X.shape[1])
        self.onehot = OneHotEncoder().fit(X)
        return self

    def transform(self, X):
        """transform woe

        Args:
            X (DataFrame|array-like)
            default (str): 'min'(default), 'max' - the strategy to be used
                for unknown group

        Returns:
            array-like: dense one-hot matrix of leaf indices
        """
        X = self.gbdt.apply(X)
        X = X.reshape(-1, X.shape[1])
        res = self.onehot.transform(X).toarray()
        return res
def _fit(self, dataset, **options):
    """Fit the GBDT+LR model on `dataset` (expects .x and .y attributes).

    Stores the booster (self.tree), the leaf encoder (self.enc) and the
    logistic regression (self.m).  Returns self.
    """
    # GBDT stage.
    booster = GradientBoostingClassifier(**self.model_params)
    booster.fit(dataset.x, dataset.y)

    # One-hot encode the per-tree leaf indices.
    leaves = booster.apply(dataset.x)[:, :, 0]
    encoder = OneHotEncoder()
    encoder.fit(leaves)

    # LR stage on the encoded leaves.
    logit = LogisticRegression(penalty='l2', C=1, solver='lbfgs')
    logit.fit(encoder.transform(leaves), dataset.y)

    self.tree = booster
    self.enc = encoder
    self.m = logit
    return self
def model(self):
    """Train an LR over one-hot encoded GBDT leaves on the churn data.

    Returns (lr, gbdt, enc).
    """
    # x: every column except customerID and the Churn label; y: Churn.
    feature_cols = [c for c in self.train.columns
                    if c not in ['customerID', 'Churn']]
    x_train = self.train[feature_cols]
    y_train = self.train['Churn']

    lr = LogisticRegression(penalty='l2', tol=0.0001, fit_intercept=True,
                            max_iter=20)
    # learning rate 0.1, 100 boosting rounds, tree depth 7
    gbdt = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,
                                      max_depth=7)
    gbdt.fit(x_train, y_train)

    # gbdt.apply() yields 3-D output; flatten to (n_samples, 100) where
    # 100 = n_estimators, the form OneHotEncoder accepts.
    enc = OneHotEncoder()
    enc.fit(gbdt.apply(x_train).reshape(-1, 100))
    lr.fit(enc.transform(gbdt.apply(x_train).reshape(-1, 100)), y_train)
    return lr, gbdt, enc
def train_model(self):
    """Train GBDT, then an LR on one-hot encoded GBDT leaves.

    Returns (enc, gbdt, gbdt_lr).
    """
    print("training")
    label = "Churn"
    ID = "customerID"
    feature_cols = [c for c in self.train.columns if c not in [ID, label]]
    x_train = self.train[feature_cols]
    y_train = self.train[label]

    gbdt = GradientBoostingClassifier()
    gbdt.fit(x_train, y_train)

    gbdt_lr = LogisticRegression(max_iter=3000)
    enc = OneHotEncoder()
    # apply() is 3-D; flatten to (n_samples, 100) — 100 is the default
    # n_estimators of GradientBoostingClassifier.
    leaves = gbdt.apply(x_train).reshape(-1, 100)
    enc.fit(leaves)
    gbdt_lr.fit(enc.transform(leaves), y_train)
    return enc, gbdt, gbdt_lr
def train_model(self):
    """Fit a GBDT on the churn training frame, then a logistic regression
    on its one-hot encoded leaf indices.

    Returns (enc, gbdt, gbdt_lr).
    """
    label = 'Churn'
    ID = 'customerID'
    x_train = self.train[[c for c in self.train.columns
                          if c not in [label, ID]]]
    y_train = self.train[label]

    gbdt = GradientBoostingClassifier()
    gbdt.fit(x_train, y_train)

    enc = OneHotEncoder()
    gbdt_lr = LogisticRegression()
    # Flatten apply()'s 3-D leaf output; 100 matches the default
    # n_estimators of GradientBoostingClassifier.
    enc.fit(gbdt.apply(x_train).reshape(-1, 100))
    gbdt_lr.fit(enc.transform(gbdt.apply(x_train).reshape(-1, 100)), y_train)
    return enc, gbdt, gbdt_lr
def train(X_train, y_train, params):
    """Train a GBDT+LR pipeline and return (lr, gbdt, encoder).

    The ensemble of trees is trained on a different half of the data than
    the linear model to avoid overfitting, in particular when the total
    number of leaves approaches the number of training samples.
    """
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(
        X_train, y_train, test_size=0.5)

    booster = GradientBoostingClassifier(**params)
    booster.fit(X_train, y_train)

    encoder = OneHotEncoder()
    encoder.fit(booster.apply(X_train)[:, :, 0])

    logit = LogisticRegression(max_iter=300)
    logit.fit(encoder.transform(booster.apply(X_train_lr)[:, :, 0]),
              y_train_lr)
    return logit, booster, encoder
def select_feature(x_train, y_train):
    """Build one-hot features from the leaves of a 50-tree GBDT.

    :param x_train: training features
    :param y_train: training labels
    :return: dense one-hot numpy array of leaf-index features
    """
    booster = GradientBoostingClassifier(n_estimators=50, random_state=10,
                                         max_depth=8)
    booster.fit(x_train, y_train)

    leaves = booster.apply(x_train)
    print(leaves.shape)
    # Flatten to (n_samples, 50); 50 matches n_estimators above.
    leaves = leaves.reshape(-1, 50)

    encoder = OneHotEncoder()
    encoder.fit(leaves)
    return np.array(encoder.transform(leaves).toarray())
def buildModel(self, X_train_d, X_train_c, X_test_d, X_test_c, y_train, y_test):
    '''
    Build the combined GBDT + LR model.

    Args:
        X_train_d: discrete training features
        X_train_c: continuous training features
        X_test_d: discrete test features
        X_test_c: continuous test features
        y_train: training labels {-1, 1}
        y_test: test labels {-1, 1}

    Returns:
        gbc_enc: GBDT OneHotEncoder
        gbc: the fitted GBDT model
        comb_model: the trained combined model
        threshold: positive/negative cutoff; Pred_Prob >= threshold is
            positive, Pred_Prob < threshold is negative
        comb_model_auc: model AUC
        precision: model precision at the threshold
        recall: model recall at the threshold
    '''
    # Train the GBDT on the continuous features only.
    if self._random_state is not None:
        gbc = GradientBoostingClassifier(n_estimators=self._n_estimators,
                                         learning_rate=self._gbdt_learning_rate,
                                         max_depth=self._max_depth,
                                         random_state=self._random_state).fit(X_train_c, y_train)
    else:
        gbc = GradientBoostingClassifier(n_estimators=self._n_estimators,
                                         learning_rate=self._gbdt_learning_rate,
                                         max_depth=self._max_depth).fit(X_train_c, y_train)
    # Leaf indices per tree; apply() is 3-D, take the class axis slice.
    X_train_leaves = gbc.apply(X_train_c)[:,:,0]
    X_test_leaves = gbc.apply(X_test_c)[:,:,0]
    (X_train_rows, cols) = X_train_leaves.shape
    # Fit the encoder on train+test leaves so both share one vocabulary.
    gbc_enc = OneHotEncoder().fit(np.concatenate([X_train_leaves,X_test_leaves], axis = 0))
    X_trans = gbc_enc.transform(np.concatenate([X_train_leaves,X_test_leaves], axis = 0))
    # Concatenate encoded leaves with the discrete features.
    X_train_ext = hstack([X_trans[:X_train_rows,:], X_train_d])
    X_test_ext = hstack([X_trans[X_train_rows:,:], X_test_d])
    log.debug("Combine features done.")
    comb_model = LogisticRegression().fit(X_train_ext, y_train)
    log.debug("Training done.")
    comb_model_pred = comb_model.predict_proba(X_test_ext)[:,1]
    # Choose the threshold as the last point whose recall still meets the
    # configured recall floor (recall is non-increasing along the curve).
    precision, recall, thresholds = precision_recall_curve(y_test, comb_model_pred)
    ap = average_precision_score(y_test, comb_model_pred)
    recall_meet = recall >= self._recall_rate
    recall_meet_min = len([item for item in recall_meet if item == True])
    threshold = thresholds[recall_meet_min-1]
    log.debug("threshold: %f - precision: %f - recall: %f",
              threshold, precision[recall_meet_min-1], recall[recall_meet_min-1])
    comb_model_auc = roc_auc_score(y_test, comb_model_pred)
    log.debug("AUC score is: %f", comb_model_auc)
    return gbc_enc, gbc, comb_model, threshold, comb_model_auc, \
        precision[recall_meet_min-1], recall[recall_meet_min-1]
def train(x_train, y_train, x_train_lr, y_train_lr, solver, class_weigeht, n_estimator):
    """Train GBDT + one-hot encoder + LR; return the LR weights and models.

    :param x_train, y_train: data for the GBDT stage
    :param x_train_lr, y_train_lr: held-out data for the LR stage
    :param solver: LogisticRegression solver name
    :param class_weigeht: LogisticRegression class_weight — None treats all
        classes equally; 'balanced' weights inversely to class frequency
        (more samples -> lower weight); or an explicit dict such as
        {0: 0.9, 1: 0.1}
    :param n_estimator: number of boosting trees
    :return: (weights, grd, grd_enc, grd_lm)
    """
    # Fix: removed an unused duplicate OneHotEncoder local ('enc').
    grd = GradientBoostingClassifier(n_estimators=n_estimator)
    grd_enc = OneHotEncoder()
    grd_lm = LogisticRegression(solver=solver, class_weight=class_weigeht)

    # Train the GBDT, then fit the encoder on its leaf indices.
    grd.fit(x_train, y_train)
    grd_enc.fit(grd.apply(x_train)[:, :, 0])

    # Train the logistic regression on the encoded held-out split.
    grd_lm.fit(grd_enc.transform(grd.apply(x_train_lr)[:, :, 0]), y_train_lr)

    # Per-feature weights of the linear model.
    weights = grd_lm.coef_
    return weights, grd, grd_enc, grd_lm
def test_gbm_classifier_backupsklearn(backend='auto'):
    """Check that h2o4gpu's GradientBoostingClassifier matches sklearn's
    when running with the sklearn backend."""
    df = pd.read_csv("./open_data/creditcard.csv")
    # Last column is the label; everything before it is a feature.
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingClassifier

    # Run h2o4gpu version of RandomForest Regression
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Run Sklearn version of RandomForest Regression
    from sklearn.ensemble import GradientBoostingClassifier
    gbm_sk = GradientBoostingClassifier(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend == "sklearn":
        # Prediction surfaces must agree exactly.
        assert (gbm.predict(X) == gbm_sk.predict(X)).all() == True
        assert (gbm.predict_log_proba(X) == gbm_sk.predict_log_proba(X)
                ).all() == True
        assert (gbm.predict_proba(X) == gbm_sk.predict_proba(X)).all() == True
        assert (gbm.score(X, y) == gbm_sk.score(X, y)).all() == True
        assert (gbm.decision_function(X)[1] == gbm_sk.decision_function(X)[1]
                ).all() == True
        # Staged predictions compared with tolerance (float accumulation).
        assert np.allclose(list(gbm.staged_predict(X)),
                           list(gbm_sk.staged_predict(X)))
        assert np.allclose(list(gbm.staged_predict_proba(X)),
                           list(gbm_sk.staged_predict_proba(X)))
        assert (gbm.apply(X) == gbm_sk.apply(X)).all() == True

        print("Estimators")
        print(gbm.estimators_)
        print(gbm_sk.estimators_)

        print("loss")
        print(gbm.loss_)
        print(gbm_sk.loss_)
        assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

        print("init_")
        print(gbm.init)
        print(gbm_sk.init)

        print("Feature importance")
        print(gbm.feature_importances_)
        print(gbm_sk.feature_importances_)
        assert (gbm.feature_importances_ == gbm_sk.feature_importances_
                ).all() == True

        print("train_score_")
        print(gbm.train_score_)
        print(gbm_sk.train_score_)
        assert (gbm.train_score_ == gbm_sk.train_score_).all() == True
def GBDT_clf(X_train, y_train, X_valid, X_test):
    """One-hot encode GBDT leaf indices for train/valid/test splits.

    :return: (train_features, valid_features, test_features) as dense arrays
    """
    from sklearn.preprocessing import OneHotEncoder
    # With depth 7 and 10 trees there would be ~80 leaf nodes total.
    booster = GradientBoostingClassifier(learning_rate=0.1, n_estimators=300,
                                         max_depth=6)  # 10,3
    encoder = OneHotEncoder()

    booster.fit(X_train, y_train)
    encoder.fit(booster.apply(X_train)[:, :, 0])

    def encode(data):
        # Encode the per-tree leaf index of each sample densely.
        return encoder.transform(booster.apply(data)[:, :, 0]).toarray()

    return encode(X_train), encode(X_valid), encode(X_test)
def check_iris(presort, subsample, sample_weight):
    # Consistency check on the iris dataset: score must exceed 0.9 and
    # apply() must yield one leaf per (sample, tree, class).
    model = GradientBoostingClassifier(n_estimators=100, loss='deviance',
                                       random_state=1, subsample=subsample,
                                       presort=presort)
    model.fit(iris.data, iris.target, sample_weight=sample_weight)
    score = model.score(iris.data, iris.target)
    assert_greater(score, 0.9)
    assert_equal(model.apply(iris.data).shape, (150, 100, 3))
def test_iris():
    # Consistency check on the iris dataset across subsample settings and
    # with/without explicit sample weights.
    for subsample in (1.0, 0.5):
        for sample_weight in (None, np.ones(len(iris.target))):
            model = GradientBoostingClassifier(n_estimators=100,
                                               loss='deviance',
                                               random_state=1,
                                               subsample=subsample)
            model.fit(iris.data, iris.target, sample_weight=sample_weight)
            score = model.score(iris.data, iris.target)
            assert score > 0.9, "Failed with subsample %.1f " \
                "and score = %f" % (subsample, score)
            # One leaf index per (sample, tree, class).
            assert_equal(model.apply(iris.data).shape, (150, 100, 3))
def test_gbm_classifier_backupsklearn(backend='auto'):
    """Compare h2o4gpu's GradientBoostingClassifier against sklearn's when
    the sklearn backend is selected."""
    df = pd.read_csv("./open_data/creditcard.csv")
    # Last column is the label; everything before it is a feature.
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.GradientBoostingClassifier

    # Run h2o4gpu version of RandomForest Regression
    gbm = Solver(backend=backend, random_state=1234)
    print("h2o4gpu fit()")
    gbm.fit(X, y)

    # Run Sklearn version of RandomForest Regression
    from sklearn.ensemble import GradientBoostingClassifier
    gbm_sk = GradientBoostingClassifier(random_state=1234, max_depth=3)
    print("Scikit fit()")
    gbm_sk.fit(X, y)

    if backend == "sklearn":
        # All prediction outputs must match exactly.
        assert (gbm.predict(X) == gbm_sk.predict(X)).all() == True
        assert (gbm.predict_log_proba(X) == gbm_sk.predict_log_proba(X)).all() == True
        assert (gbm.predict_proba(X) == gbm_sk.predict_proba(X)).all() == True
        assert (gbm.score(X, y) == gbm_sk.score(X, y)).all() == True
        assert (gbm.decision_function(X)[1] == gbm_sk.decision_function(X)[1]).all() == True
        # Staged predictions compared with float tolerance.
        assert np.allclose(list(gbm.staged_predict(X)), list(gbm_sk.staged_predict(X)))
        assert np.allclose(list(gbm.staged_predict_proba(X)), list(gbm_sk.staged_predict_proba(X)))
        assert (gbm.apply(X) == gbm_sk.apply(X)).all() == True

        print("Estimators")
        print(gbm.estimators_)
        print(gbm_sk.estimators_)

        print("loss")
        print(gbm.loss_)
        print(gbm_sk.loss_)
        assert gbm.loss_.__dict__ == gbm_sk.loss_.__dict__

        print("init_")
        print(gbm.init)
        print(gbm_sk.init)

        print("Feature importance")
        print(gbm.feature_importances_)
        print(gbm_sk.feature_importances_)
        assert (gbm.feature_importances_ == gbm_sk.feature_importances_).all() == True

        print("train_score_")
        print(gbm.train_score_)
        print(gbm_sk.train_score_)
        assert (gbm.train_score_ == gbm_sk.train_score_).all() == True
def check_classification_toy(presort, loss):
    # Classification sanity check on a toy dataset: predict before fit
    # must raise, predictions must match, training deviance must not
    # increase somewhere, and apply() must be (samples, trees, 1).
    model = GradientBoostingClassifier(loss=loss, n_estimators=10,
                                       random_state=1, presort=presort)
    assert_raises(ValueError, model.predict, T)

    model.fit(X, y)
    assert_array_equal(model.predict(T), true_result)
    assert_equal(10, len(model.estimators_))

    drop = model.train_score_[:-1] - model.train_score_[1:]
    assert np.any(drop >= 0.0)

    assert_equal(model.apply(X).shape, (6, 10, 1))
def test_classification_toy():
    """Check classification on the toy dataset for both supported losses."""
    for loss_name in ('deviance', 'exponential'):
        model = GradientBoostingClassifier(loss=loss_name, n_estimators=10,
                                           random_state=1)

        # Predicting before fit must raise.
        assert_raises(ValueError, model.predict, T)

        model.fit(X, y)
        assert_array_equal(model.predict(T), true_result)
        assert_equal(10, len(model.estimators_))

        # Successive train scores should not increase between stages.
        drops = model.train_score_[:-1] - model.train_score_[1:]
        assert np.any(drops >= 0.0), \
            "Train deviance does not monotonically decrease."

        # apply() gives one leaf index per (sample, stage, class).
        leaves = model.apply(X)
        assert_equal(leaves.shape, (6, 10, 1))
# Supervised transformation based on random forests rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator) rf_enc = OneHotEncoder() rf_lm = LogisticRegression() rf.fit(X_train, y_train) rf_enc.fit(rf.apply(X_train)) rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr) y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1] fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm) grd = GradientBoostingClassifier(n_estimators=n_estimator) grd_enc = OneHotEncoder() grd_lm = LogisticRegression() grd.fit(X_train, y_train) grd_enc.fit(grd.apply(X_train)[:, :, 0]) grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr) y_pred_grd_lm = grd_lm.predict_proba( grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1] fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm) # The gradient boosted model by itself y_pred_grd = grd.predict_proba(X_test)[:, 1] fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd) # The random forest model by itself y_pred_rf = rf.predict_proba(X_test)[:, 1] fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
# Release the GBDT training split before building the LR features.
del X_train_gbdt
del y_train_gbdt
gc.collect()

gbdt_model = pickle.load(open(fp_gbdt_model, 'rb'))

#----- data for LR (one-hot encoding with GDBT output) -----#
# One column name per fitted tree: tree1, tree2, ...
id_cols = []
for i in range(1, gbdt_model.get_params()['n_estimators']+1):
    id_cols.append('tree'+str(i))

# NOTE(review): this OneHotEncoder takes a column list, so it is presumably a
# project-local encoder (sklearn's accepts no positional column argument) and
# is assumed to accumulate categories across repeated fit() calls — confirm.
oh_enc = OneHotEncoder(id_cols)

def chunker(seq, size):
    # Yield successive `size`-row slices of `seq`.
    return (seq[pos: pos + size] for pos in range(0, len(seq), size))

## oh_enc fit the train_set
# One column per tree; each cell is the leaf index the sample lands in.
# NOTE(review): int8 overflows if any tree has leaf ids > 127 — verify.
df_train_id = pd.DataFrame(gbdt_model.apply(X_train_org)[:, :, 0], columns=id_cols, dtype=np.int8)
for chunk in chunker(df_train_id, 50000):
    oh_enc.fit(chunk)

del df_train_id
del X_train_org
del y_train_org
gc.collect()

## oh_enc fit the test_set
# Stream the test file in 50k-row chunks to bound memory use.
df_test_f = pd.read_csv(fp_test_f, index_col=None, dtype={'id':str}, chunksize=50000, iterator=True)
# Spit the data set into roughly 2 halfs # This allows us to not overfit with stacking train_x1 = train_X[:30000] train_x2 = train_X[30000:] train_y1 = train_y[:30000] train_y2 = train_y[30000:] # We are first going to use Graident Boosting to transform # the data and then using One Hot Encoding. # After this, we will then try and fit a Logist Regression grd = GradientBoostingClassifier() grd_enc = OneHotEncoder() grd.fit(train_x1, train_y1) grd_enc.fit(grd.apply(train_x1)[:,:, 0]) grd_lm = LogisticRegression(penalty = 'l2', C = .0115) grd_lm.fit(grd_enc.transform(grd.apply(train_x2)[:,:, 0]), train_y2) #Import test data test_x = [] with open('test_2012.csv', 'r') as f: first_row = f.readline() headers = first_row.split(',') for row in f: ints = [int(elem) for elem in row.split(',')]
# Supervised transformation based on random forests rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator) rf_enc = OneHotEncoder() rf_lm = LogisticRegression() rf.fit(X_train, y_train) rf_enc.fit(rf.apply(X_train)) rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr) y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1] fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm) grd = GradientBoostingClassifier(n_estimators=n_estimator, verbose=1) grd_enc = OneHotEncoder() grd_lm = LogisticRegression() grd.fit(X_train, y_train) grd_enc.fit(grd.apply(X_train)[:, :, 0]) grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr) y_pred_grd_lm = grd_lm.predict_proba( grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1] fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm) # The gradient boosted model by itself y_pred_grd = grd.predict_proba(X_test)[:, 1] fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd) # The random forest model by itself y_pred_rf = rf.predict_proba(X_test)[:, 1] fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)
# xt = xt[selected_feature[0:23]] # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5) # x_train, x_train_lr, y_train, y_train_lr = train_test_split(x_train, y_train, test_size=0.5) x_train, x_train_lr, y_train, y_train_lr = train_test_split(x, y, test_size=0.5) params = {'n_estimators': 1800, 'max_leaf_nodes': 4, 'max_depth': 6, 'random_state': 2, # None 'min_samples_split': 5, 'learning_rate': 0.1, 'subsample': 0.83} gb = GradientBoostingClassifier(**params) gb_encoder = preprocessing.OneHotEncoder() lr = LogisticRegression() gb.fit(x_train, y_train) gb_encoder.fit(gb.apply(x_train)[:, :, 0]) lr.fit(gb_encoder.transform(gb.apply(x_train_lr)[:, :, 0]), y_train_lr) # yhat = lr.predict_proba(gb_encoder.transform(gb.(x_test)[:, :, 0]))[:, 1] yhat = lr.predict_proba(gb_encoder.transform(gb.apply(xt)[:, :, 0]))[:, 1] yhat2 = gb.predict_proba(xt)[:, 1] yhat3 = (np.array(yhat)+np.array(yhat2))/2 # fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, yhat) # plt.figure() # plt.xlim(0, 1) # plt.ylim(0, 1) # plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBT + LR') # plt.show() result_data = {'QuoteNumber': xt['QuoteNumber'], 'QuoteConversion_Flag': yhat3}
# Hold out 30% of the training data as a validation split.
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size = 0.3)
#X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train, test_size = 0.5)
del train
""" GBDT+LR """
# Use parenthesized print calls so this chunk runs on both Python 2 and
# Python 3 (single-argument print() behaves identically on Python 2;
# the original `print "..."` statements were Python-2-only).
print("Performing GBDT+LR")
random_state = np.random.RandomState(520)
lr = LogisticRegression(random_state = random_state)
grd = GradientBoostingClassifier(n_estimators = 10, random_state = random_state)
grd_enc = OneHotEncoder()
grd.fit(X_train, y_train)
# One-hot encode the GBDT leaf indices ([:, :, 0] drops the class axis).
grd_enc.fit(grd.apply(X_train)[:, :, 0])
# NOTE(review): the split producing X_train_lr/y_train_lr is commented out
# above — confirm these names are defined earlier in the file.
lr.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
#probas = lr.predict_proba(grd_enc.transform(grd.apply(X_test)[:, :, 0]))
print("Predicting...")
probas_train = lr.predict_proba(grd_enc.transform(grd.apply(X_train)[:, :, 0]))
#probas_validation = lr.predict_proba(grd_enc.transform(grd.apply(X_validation)[:, :, 0]))
#fpr, tpr, thredsholds = roc_curve(y_test, probas[:, 1])
fpr_train, tpr_train, thredsholds_train = roc_curve(y_train, probas_train[:, 1])
#fpr_validation, tpr_validation, thredsholds_validation= roc_curve(y_validation, probas_validation[:, 1])
#roc_auc = auc(fpr, tpr)
# Training-set ROC AUC (optimistic — measured on the data the GBM saw).
roc_auc_train = auc(fpr_train, tpr_train)
#roc_auc_vldt = auc(fpr_validation, tpr_validation)
#plt.plot(fpr, tpr, lw = 1, label = 'AUC: = %0.4f)' % (roc_auc))
print("Plotting!...")
plt.plot(fpr_train, tpr_train, lw = 1, label = 'AUC: = %0.4f)' % (roc_auc_train))
#plt.plot(fpr_validation, tpr_validation, lw = 1, label = 'AUC: = %0.4f)' % (roc_auc_vldt))
X_test = test_data
# Hold out half of the training data to fit the LR on leaf encodings.
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train, test_size=0.5)
# rf_lm.fit(rf_enc.transform(rf.apply(X_train_lr)), y_train_lr)
# y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(X_test)))[:, 1]

#sixthforestt
#Encoder and Logistic Regression combined with Gradient Boosting Classifier
n_estimator = 10
grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
# apply() has a trailing class axis for binary targets; [:, :, 0] drops it.
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
# output = grd_lm.predict(test_data).astype(int)
#output = rf_lm.predict(rf_enc.transform(rf.apply(X_test))).astype(int)
# Hard class predictions for the test set, cast to int for submission.
output = grd_lm.predict(grd_enc.transform(grd.apply(X_test)[:, :, 0])).astype(int)
'''
#secondforest (in git)
#Cross Validation
train_size = int(0.7*(train_data.shape[0]))
validation_size = train_data.shape[0] - train_size
X_train = train_data[0:train_size, 1::]