def get_leaf(self):
    self.get_data("oneHot")
    n_estimators = 300
    clf_xgb = XGBClassifier(max_depth=4,
                            learning_rate=0.0125,
                            n_estimators=300,
                            subsample=0.6,
                            colsample_bytree=0.7,
                            seed=4)
    # clf_xgb = XGBClassifier(max_depth=4, n_estimators=300)
    clf_xgb.fit(self.x_train, self.y_train)
    leafes_train = list(clf_xgb.apply(self.x_train))
    leafes_test = list(clf_xgb.apply(self.x_test))
    # Pad with the full range of leaf-index values so the one-hot encoding
    # is consistent between the train and test sets
    max_train = np.array(leafes_train).max()
    min_train = np.array(leafes_train).min()
    max_test = np.array(leafes_test).max()
    min_test = np.array(leafes_test).min()
    max_value = max(max_train, max_test)
    min_value = min(min_train, min_test)
    for i in range(min_value, max_value + 1):
        leafes_train.append([i] * n_estimators)
    enc = OneHotEncoder()
    enc.fit(leafes_train)
    # Drop the padding rows added above
    leafes_train_feature = enc.transform(
        leafes_train).toarray()[:-(max_value - min_value + 1), :]
    print(leafes_train_feature.shape, len(leafes_train))
    return leafes_train_feature, self.y_train, enc.transform(
        leafes_test).toarray(), self.y_test
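# Minimal usage sketch for get_leaf above, in the usual GBDT + LR fashion: the one-hot
# leaf features are fed to a logistic regression. `model` stands for an instance of the
# (not shown) class that defines get_leaf; it is a hypothetical name, not from the original.
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

X_leaf_train, y_tr, X_leaf_test, y_te = model.get_leaf()
lr = LogisticRegression(max_iter=1000)
lr.fit(X_leaf_train, y_tr)
print("leaf-feature LR AUC: %f" %
      metrics.roc_auc_score(y_te, lr.predict_proba(X_leaf_test)[:, 1]))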
def fit_model_split(self, X_train, y_train, X_test, y_test):
    # X_train_1 is used to fit the tree model; X_train_2 is combined with the
    # new leaf features to form the new training set
    X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
        X_train, y_train, test_size=0.6, random_state=0)
    clf = XGBClassifier(learning_rate=self.learning_rate,
                        n_estimators=self.n_estimators,
                        max_depth=self.max_depth,
                        min_child_weight=self.min_child_weight,
                        gamma=self.gamma,
                        subsample=self.subsample,
                        colsample_bytree=self.colsample_bytree,
                        objective=self.objective,
                        nthread=self.nthread,
                        scale_pos_weight=self.scale_pos_weight,
                        reg_alpha=self.reg_alpha,
                        reg_lambda=self.reg_lambda,
                        seed=self.seed)
    clf.fit(X_train_1, y_train_1)
    y_pre = clf.predict(X_train_2)
    y_pro = clf.predict_proba(X_train_2)[:, 1]
    print("pred_leaf=T AUC Score : %f" % metrics.roc_auc_score(y_train_2, y_pro))
    print("pred_leaf=T Accuracy : %.4g" % metrics.accuracy_score(y_train_2, y_pre))
    new_feature = clf.apply(X_train_2)
    X_train_new2 = self.mergeToOne(X_train_2, new_feature)
    new_feature_test = clf.apply(X_test)
    X_test_new = self.mergeToOne(X_test, new_feature_test)
    print("Training set is 0.4 smaller than before the split")
    return X_train_new2, y_train_2, X_test_new, y_test
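# mergeToOne is referenced above (and as a free function in later snippets) but not defined
# here. A minimal sketch under the assumption that it simply column-concatenates the
# original feature matrix with the per-tree leaf-index features:
import numpy as np

def mergeToOne(X, X_leaf):
    # Stack the original features and the leaf-index features side by side
    return np.hstack((np.asarray(X), np.asarray(X_leaf)))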
def fit_model(self, X_train, y_train, X_test):
    clf = XGBClassifier(learning_rate=self.learning_rate,
                        n_estimators=self.n_estimators,
                        max_depth=self.max_depth,
                        min_child_weight=self.min_child_weight,
                        gamma=self.gamma,
                        subsample=self.subsample,
                        colsample_bytree=self.colsample_bytree,
                        objective=self.objective,
                        nthread=self.nthread,
                        scale_pos_weight=self.scale_pos_weight,
                        reg_alpha=self.reg_alpha,
                        reg_lambda=self.reg_lambda,
                        seed=self.seed)
    clf.fit(X_train, y_train)
    # y_pre = clf.predict(X_test)
    # y_pro = clf.predict_proba(X_test)[:, 1]
    # print("pred_leaf=T AUC Score : %f" % metrics.roc_auc_score(y_test, y_pro))
    # print("pred_leaf=T Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pre))
    new_feature = clf.apply(X_train)
    X_train_new = self.mergeToOne(X_train, new_feature)
    new_feature_test = clf.apply(X_test)
    X_test_new = self.mergeToOne(X_test, new_feature_test)
    print("Training set sample number remains the same")
    return X_train_new, y_train, X_test_new
def feature_transformer(x_train, y_train, x_test, y_test, params):
    xgbt = XGBClassifier(**params).fit(x_train, y_train)
    gbt_enc = OneHotEncoder()
    gbt_enc.fit(xgbt.apply(x_train))
    train_x_transform = gbt_enc.transform(xgbt.apply(x_train))
    test_x_transform = gbt_enc.transform(xgbt.apply(x_test))
    # pipelineModel = PMMLPipeline([("xgboost", XGBClassifier(**params))])
    # pipelineModel.fit(x_train, y_train)
    # sklearn2pmml(pipelineModel, "/data/kongyy/ctr_online/model/xgboost_feature.pmml", with_repr=True)
    return train_x_transform, test_x_transform
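# Minimal usage sketch for feature_transformer above, assuming x_train/y_train/x_test/y_test
# arrays already exist; the parameter values below are placeholders, not from the original.
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

params = {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1}  # placeholder values
train_leaf, test_leaf = feature_transformer(x_train, y_train, x_test, y_test, params)

# LogisticRegression accepts the sparse matrices returned by OneHotEncoder directly
lr = LogisticRegression(max_iter=1000).fit(train_leaf, y_train)
print("AUC on one-hot leaf features: %f" %
      metrics.roc_auc_score(y_test, lr.predict_proba(test_leaf)[:, 1]))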
def gbdt_new_features(result):
    clf = XGBClassifier(
        learning_rate=0.2,            # default 0.3
        n_estimators=30,              # number of trees
        max_depth=7,
        min_child_weight=10,
        gamma=0.5,
        subsample=0.75,
        colsample_bytree=0.75,
        objective='binary:logistic',  # logistic loss
        nthread=8,                    # number of CPU threads
        scale_pos_weight=1,
        reg_alpha=1e-05,
        reg_lambda=10,
        seed=1024)                    # random seed
    predictors = [
        i for i in result.columns if i not in [
            'orderid', 'geohashed_start_loc', 'geohashed_end_loc', 'userid',
            'bikeid', 'starttime', 'label', 'biketype', 'start_lon',
            'start_lat', 'end_lon', 'end_lat'
        ]
    ]
    old_features = result[predictors].values
    result1, result2 = train_test_split(result, test_size=0.6, random_state=0)
    del result1
    clf.fit(result2[predictors].values, result2['label'].values)
    new_features = clf.apply(old_features)
    features = mergeToOne(old_features, new_features)
    return features  # numpy array
def fit_model(self, X_train, y_train, X_test):
    clf = XGBClassifier(learning_rate=self.learning_rate,
                        n_estimators=self.n_estimators,
                        max_depth=self.max_depth,
                        min_child_weight=self.min_child_weight,
                        gamma=self.gamma,
                        subsample=self.subsample,
                        colsample_bytree=self.colsample_bytree,
                        objective=self.objective,
                        nthread=self.nthread,
                        scale_pos_weight=self.scale_pos_weight,
                        reg_alpha=self.reg_alpha,
                        reg_lambda=self.reg_lambda,
                        seed=self.seed)
    clf.fit(X_train, y_train)
    new_feature = clf.apply(X_train)
    X_train_new = self.mergeToOne(X_train, new_feature)
    new_feature_test = clf.apply(X_test)
    X_test_new = self.mergeToOne(X_test, new_feature_test)
    print("Training set sample number remains the same")
    return X_train_new, X_test_new
def runXgbStack(inputfile, outputfile):
    '''Input and output file paths: inputfile and outputfile.'''
    df_all = pd.read_csv(inputfile)
    df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:]))
    # Missing values are filled with 0 by default; a negative value could be tried instead
    df_all.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_all = df_all.fillna(0)
    features = df_all.columns[0:]
    features = list(features)
    features.remove('EID')
    label = 'TARGET'
    df_train, df_test = xtrain_and_test(df_all)
    clf = XGBClassifier(
        n_estimators=50,  # 50 trees
        learning_rate=0.05,
        max_depth=7,
        min_child_weight=1,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        seed=91)
    X_train = df_train[features]
    Y_label = df_train[label]
    X_test = df_test[features]
    clf.fit(X_train, Y_label, eval_metric='auc', verbose=5)
    column = ['STACKFEATURE' + str(i) for i in range(50)]
    df_new_feature = pd.DataFrame(clf.apply(df_all[features]), columns=column)
    df_all[column] = df_new_feature
    # One-hot encode the stacked leaf-index features
    for feature in column:
        df_all[feature] = df_all[feature].astype(np.int32)
        df_tmp = pd.get_dummies(df_all[feature], prefix=feature)
        df_all[df_tmp.columns] = df_tmp
        df_all.drop(feature, axis=1, inplace=True)
    df_all.to_csv(outputfile, index=False, index_label=False)
    del df_train, df_test, df_all
    return outputfile
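# Usage is a single call; the file paths below are placeholders, and the input CSV must
# contain the EID and TARGET columns expected by runXgbStack.
out = runXgbStack('data/all_features.csv', 'data/all_features_leaf_onehot.csv')
print('stacked leaf features written to', out)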
def xgb_fit(self, data_x, data_y, all_data):
    new_xgb = XGBClassifier(
        n_estimators=self.n_estimators,  # the lgb package could also be used (param: boosting='gbdt')
        random_state=10,
        gamma=self.gamma,
        max_depth=self.max_depth,
        min_child_weight=self.min_child_weight,
        base_score=self.base_score,
        colsample_bytree=0.5)
    new_xgb.fit(data_x, data_y)
    # alldata_x = pd.concat([data_x, test_x])
    train_new_fea = new_xgb.apply(all_data)  # leaf-node index of every sample in every tree
    train_new_fea = train_new_fea.reshape(-1, self.n_estimators)
    enc = OneHotEncoder()
    enc.fit(train_new_fea)  # each tree gets its own one-hot block
    train_new_embeddings = np.array(enc.transform(train_new_fea).toarray())
    with open('xgblr_xgb.pkl', 'wb') as f:
        pickle.dump(new_xgb, f)
    return train_new_embeddings  # new input features for the downstream model
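# Persistence sketch to complement xgb_fit above: the original only pickles the booster,
# but the fitted OneHotEncoder must be saved too so that new samples map to the same columns.
# This assumes it runs where `enc` is in scope (e.g. inside xgb_fit); 'xgblr_enc.pkl' and
# `new_samples` are hypothetical names, not from the original code.
with open('xgblr_enc.pkl', 'wb') as f:
    pickle.dump(enc, f)

# Later: reload both objects and encode fresh samples consistently
with open('xgblr_xgb.pkl', 'rb') as f:
    booster = pickle.load(f)
with open('xgblr_enc.pkl', 'rb') as f:
    encoder = pickle.load(f)
leaves = booster.apply(new_samples)
new_embeddings = encoder.transform(leaves).toarray()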
    learning_rate=0.2,            # default 0.3
    n_estimators=200,             # number of trees
    max_depth=8,
    min_child_weight=10,
    gamma=0.5,
    subsample=0.75,
    colsample_bytree=0.75,
    objective='binary:logistic',  # logistic loss
    nthread=8,                    # number of CPU threads
    scale_pos_weight=1,
    reg_alpha=1e-05,
    reg_lambda=10,
    seed=1024)                    # random seed
clf.fit(X_train_1, y_train_1)
new_feature = clf.apply(X_train_2)
X_train_new2 = mergeToOne(X_train_2, new_feature)
new_feature_test = clf.apply(X_test)
X_test_new = mergeToOne(X_test, new_feature_test)
model = XGBClassifier(
    learning_rate=0.05,           # default 0.3
    n_estimators=300,             # number of trees
    max_depth=7,
    min_child_weight=1,
    gamma=0.5,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',  # logistic loss
    nthread=8,                    # number of CPU threads
      metrics.roc_auc_score(y_test, y_sklearn))
print("Original train shape:", X_train.shape)
print("Original test shape:", X_test.shape)

# New features produced through XGBoost's native interface
train_new_feature = model_bst.predict(d_train, pred_leaf=True)
test_new_feature = model_bst.predict(d_test, pred_leaf=True)
train_new_feature1 = DataFrame(train_new_feature)
test_new_feature1 = DataFrame(test_new_feature)
print("New training features (native interface):", train_new_feature1.shape)
print("New test features (native interface):", test_new_feature1.shape)

# New features produced through the sklearn interface
train_sklearn_new_feature = clf.apply(X_train)  # leaf-node index of each sample in every tree
test_sklearn_new_feature = clf.apply(X_test)
train_new_feature2 = DataFrame(train_sklearn_new_feature)
test_new_feature2 = DataFrame(test_sklearn_new_feature)
print("New training features (sklearn interface):", train_new_feature2.shape)
print("New test features (sklearn interface):", test_new_feature2.shape)

# Train on the new features from the native interface
new_feature1 = clf.fit(train_new_feature1, y_train)
y_new_feature1 = clf.predict_proba(test_new_feature1)[:, 1]
# Train on the new features from the sklearn interface
new_feature2 = clf.fit(train_new_feature2, y_train)
y_new_feature2 = clf.predict_proba(test_new_feature2)[:, 1]
    colsample_bytree=0.7,
    objective='binary:logistic',
    scale_pos_weight=1.002252816020025,
    reg_alpha=0.3,
    reg_lambda=0.1,
    seed=27)

# train the model
model_sklearn = clf.fit(X_train, Y_train)
y_bst = model_sklearn.predict_proba(X_test)[:, 1]
metrics_spec(Y_train, model_sklearn.predict_proba(X_train)[:, 1])
metrics_spec(Y_test, y_bst)

# make new features:
# the sparse one-hot encoding of the leaf nodes becomes the input of the stacked model
train_new_feature = clf.apply(X_train)
test_new_feature = clf.apply(X_test)
enc = OneHotEncoder()
enc.fit(train_new_feature)
train_new_feature2 = np.array(enc.transform(train_new_feature).toarray())
test_new_feature2 = np.array(enc.transform(test_new_feature).toarray())
res_data = pd.DataFrame(np.c_[Y_train, train_new_feature2])
res_data.columns = ['f' + str(x) for x in range(res_data.shape[1])]
res_test = pd.DataFrame(np.c_[Y_test, test_new_feature2])
res_test.columns = ['f' + str(x) for x in range(res_test.shape[1])]

# stack a model on top: logistic regression, FM, or a neural network all work well here;
# see the article referenced at the top of the code for notes on the stacking step
lr = LogisticRegression(C=1,
                        penalty='l2',
                        max_iter=100,
    learning_rate=0.2,            # default 0.3
    n_estimators=10,              # number of trees
    max_depth=8,
    min_child_weight=10,
    gamma=0.5,
    subsample=0.75,
    colsample_bytree=0.75,
    objective='binary:logistic',  # logistic loss
    nthread=8,                    # number of CPU threads
    scale_pos_weight=1,
    reg_alpha=1e-05,
    reg_lambda=10,
    seed=1024)                    # random seed
clf.fit(X, y)
new_feature_X = clf.apply(X)
new_feature_Y = clf.apply(Y)
print(new_feature_X)
print(new_feature_Y)
shape_x = np.shape(new_feature_X)
shape_y = np.shape(new_feature_Y)
print('shape of train: ', shape_x)
print('shape of test: ', shape_y)
new_feature_X = pd.DataFrame(new_feature_X)
new_feature_Y = pd.DataFrame(new_feature_Y)
new_feature_X.to_csv(data_path + 'train_Drop_Delete_Log_Ratio_Label_GBDT1.csv')
new_feature_Y.to_csv(data_path + 'test_Drop_Delete_Log_Ratio_Label_GBDT1.csv')
# submission = pd.DataFrame({'pred': new_feature_X.mean(axis=1)})
# submission.to_csv(r'sub{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')), header=None, index=False, float_format='%.3f')
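# The CSVs written above hold raw leaf indices. A minimal sketch of reloading them and
# one-hot encoding train and test consistently (data_path is reused from above; the
# encoder step is illustrative, not part of the original snippet).
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

leaf_train = pd.read_csv(data_path + 'train_Drop_Delete_Log_Ratio_Label_GBDT1.csv', index_col=0)
leaf_test = pd.read_csv(data_path + 'test_Drop_Delete_Log_Ratio_Label_GBDT1.csv', index_col=0)

# Fit the encoder on the training leaves only; handle_unknown='ignore' guards against
# leaf indices that appear only in the test set.
enc = OneHotEncoder(handle_unknown='ignore')
train_onehot = enc.fit_transform(leaf_train)
test_onehot = enc.transform(leaf_test)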
def runXgbStack(inputfile, outputfile):
    '''Input and output file paths: inputfile and outputfile.'''
    df_all = pd.read_csv(inputfile)
    df_all['XEID'] = df_all['EID'].map(lambda x: int(x[1:]))
    # Missing values are filled with 0 by default; a negative value could be tried instead
    df_all.replace([np.inf, -np.inf], np.nan, inplace=True)
    df_all = df_all.fillna(0)
    features = df_all.columns[0:]
    features = list(features)
    features.remove('EID')
    label = 'TARGET'
    clf = XGBClassifier(
        n_estimators=50,  # 50 trees
        learning_rate=0.05,
        max_depth=7,
        min_child_weight=1,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        seed=91)
    df_all_prov11, df_all_prov12 = split_data_with_prov(df_all)

    # prov == 11
    df_train11, df_test11 = xtrain_and_test(df_all_prov11)
    X_train11 = df_train11[features]
    Y_label11 = df_train11[label]
    X_test11 = df_test11[features]
    clf.fit(X_train11, Y_label11, eval_metric='auc', verbose=5)
    column = ['STACKFEATURE' + str(i) for i in range(50)]
    df_new_feature11 = pd.DataFrame(clf.apply(df_all_prov11[features]),
                                    columns=column)
    df_all_prov11[column] = df_new_feature11

    # prov == 12
    df_train12, df_test12 = xtrain_and_test(df_all_prov12)
    X_train12 = df_train12[features]
    Y_label12 = df_train12[label]
    X_test12 = df_test12[features]
    clf.fit(X_train12, Y_label12, eval_metric='auc', verbose=5)
    column = ['STACKFEATURE' + str(i) for i in range(50)]
    df_new_feature12 = pd.DataFrame(clf.apply(df_all_prov12[features]),
                                    columns=column)
    df_all_prov12[column] = df_new_feature12

    df_all = df_all_prov11.append(df_all_prov12)
    df_all.to_csv(outputfile, index=False, index_label=False)
    del df_all_prov11, df_all_prov12, df_all
    return outputfile
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import OneHotEncoder

tree_model = XGBClassifier(
    silent=0,            # 1 suppresses runtime messages; 0 (recommended) prints them while boosting
    # nthread=4,         # number of CPU threads; defaults to the maximum available
    learning_rate=0.3,   # acts like a learning rate / shrinkage
    min_child_weight=1,  # minimum sum of instance hessians in a leaf (default 1).
                         # For unbalanced 0-1 classification, if h is around 0.01,
                         # min_child_weight=1 means a leaf needs roughly 100 samples.
                         # Smaller values overfit more easily; this strongly affects the result.
    max_depth=5,         # tree depth; larger values overfit more easily
    gamma=0,             # minimum loss reduction required for a further split on a leaf;
                         # larger is more conservative, typical values are around 0.1-0.2
    subsample=1,         # row subsampling ratio of the training instances
    max_delta_step=0,    # maximum delta step allowed for each tree's weight estimates
    colsample_bytree=1,  # column subsampling ratio when building each tree
    reg_lambda=1,        # L2 regularisation on the weights; larger values resist overfitting
    # reg_alpha=0,       # L1 regularisation term
    # scale_pos_weight=1,  # values > 0 help convergence with unbalanced classes (pos/neg weighting)
    # objective='multi:softmax',  # learning task/objective for multi-class problems
    # num_class=10,      # number of classes, used together with multi:softmax
    n_estimators=20,     # number of trees
    seed=1000            # random seed
    # eval_metric='auc'
)
tree_model.fit(train, train_label)
code_tree = tree_model.apply(test)
tm_enc = OneHotEncoder(categories='auto')
ss = tm_enc.fit_transform(code_tree)
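# For completeness: the more common pattern is to fit the encoder on the training-set leaves
# and reuse it for the test set, then stack a linear model on top. A minimal sketch reusing
# train/train_label/test from above; the LogisticRegression step is illustrative only.
from sklearn.linear_model import LogisticRegression

enc = OneHotEncoder(categories='auto')
train_leaves_oh = enc.fit_transform(tree_model.apply(train))
test_leaves_oh = enc.transform(tree_model.apply(test))

lr = LogisticRegression(max_iter=1000).fit(train_leaves_oh, train_label)
test_scores = lr.predict_proba(test_leaves_oh)[:, 1]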