def gbdt_lr_train(libsvmFileName):
    """Train and compare four models on a libsvm-format file: GBDT, LR on
    raw features, LR on one-hot GBDT leaf indices, and LR on leaf indices
    concatenated with the raw features.  Prints the AUC of each."""
    # Load the samples and hold out 30% for testing.
    X_all, y_all = load_svmlight_file(libsvmFileName)
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.3, random_state=42)

    # GBDT baseline.
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3,
                                      verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)
    gbdt_scores = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, gbdt_scores)
    print('gbdt auc: %.5f' % gbdt_auc)

    # LR baseline on the raw features.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    raw_lr_scores = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, raw_lr_scores)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)

    # Encode every sample by the leaf it lands in for each tree.
    train_leaves = gbdt.apply(X_train)[:, :, 0]
    test_leaves = gbdt.apply(X_test)[:, :, 0]
    (train_rows, cols) = train_leaves.shape

    # One-hot encode train+test leaves together so the encoder sees every
    # leaf index occurring in either split.
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(
        np.concatenate((train_leaves, test_leaves), axis=0))

    # LR on the encoded leaves alone.
    lr = LogisticRegression()
    lr.fit(encoded[:train_rows, :], y_train)
    leaf_lr_scores = lr.predict_proba(encoded[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, leaf_lr_scores)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)

    # LR on encoded leaves concatenated with the raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([encoded[:train_rows, :], X_train])
    X_test_ext = hstack([encoded[train_rows:, :], X_test])
    print(X_train_ext.shape)
    lr.fit(X_train_ext, y_train)
    combo_scores = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, combo_scores)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
def test_hstack(self): A = coo_matrix([[1, 2], [3, 4]]) B = coo_matrix([[5], [6]]) expected = matrix([[1, 2, 5], [3, 4, 6]]) assert_equal(construct.hstack([A, B]).todense(), expected) assert_equal(construct.hstack([A, B], dtype=np.float32).dtype, np.float32) assert_equal(construct.hstack([A.tocsc(), B.tocsc()]).todense(), expected) assert_equal(construct.hstack([A.tocsc(), B.tocsc()], dtype=np.float32).dtype, np.float32)
def test_hstack(self): A = coo_matrix([[1,2],[3,4]]) B = coo_matrix([[5],[6]]) expected = matrix([[1, 2, 5], [3, 4, 6]]) assert_equal(construct.hstack([A,B]).todense(), expected) assert_equal(construct.hstack([A,B], dtype=np.float32).dtype, np.float32) assert_equal(construct.hstack([A.tocsc(),B.tocsc()]).todense(), expected) assert_equal(construct.hstack([A.tocsc(),B.tocsc()], dtype=np.float32).dtype, np.float32)
def GBDT_LReval(self, feature, target):
    """Evaluate a GBDT, an LR on one-hot GBDT leaves, and an LR on
    leaves + raw features, using a head/tail split of *feature*/*target*
    at row self.lenth_eval.  Prints AUCs and debug shapes/arrays."""
    # Head of the frame trains, tail evaluates.
    train_X = feature.iloc[0:self.lenth_eval, :]
    eval_X = feature.iloc[self.lenth_eval:, :]
    train_y = target.iloc[0:self.lenth_eval]
    eval_y = target.iloc[self.lenth_eval:]

    # GBDT baseline.
    GBDT = GradientBoostingClassifier(n_estimators=10)
    GBDT.fit(train_X.values, train_y.values)
    gbdt_prob = GBDT.predict_proba(eval_X.values)[:, 1]
    gbdt_auc = roc_auc_score(eval_y.values, gbdt_prob)
    print('gbdt auc: %.5f' % gbdt_auc)

    # Leaf-index encoding of both splits, one-hot encoded together so
    # train and eval share one category space.
    train_leaves = GBDT.apply(train_X)[:, :, 0]
    eval_leaves = GBDT.apply(eval_X)[:, :, 0]
    (train_rows, cols) = train_leaves.shape
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(
        np.concatenate((train_leaves, eval_leaves), axis=0))

    # LR on the one-hot leaves alone.
    lr = LogisticRegression()
    lr.fit(encoded[:train_rows, :], train_y)
    prob_lr1 = lr.predict_proba(encoded[train_rows:, :])[:, 1]
    auc_lr1 = roc_auc_score(eval_y, prob_lr1)

    # LR on one-hot leaves + raw features.
    lr = LogisticRegression(n_jobs=-1)
    train_ext = hstack([encoded[:train_rows, :], train_X])
    eval_ext = hstack([encoded[train_rows:, :], eval_X])
    print(train_ext.shape)
    lr.fit(train_ext, train_y)
    prob_lr2 = lr.predict_proba(eval_ext)[:, 1]
    auc_lr2 = roc_auc_score(eval_y, prob_lr2)

    # Report (combined-feature AUC first, as in the original output order).
    print('基于组合特征的LR AUC: %.5f' % auc_lr2)
    print('基于GBDT特征编码后的LR AUC: %.5f' % auc_lr1)
    print(train_leaves.shape)
    print(eval_leaves.shape)
    # print(train_leaves.shape)
    print(train_leaves)
    print(eval_leaves)
def test_hstack(self): A = coo_matrix([[1, 2], [3, 4]]) B = coo_matrix([[5], [6]]) expected = matrix([[1, 2, 5], [3, 4, 6]]) assert_equal(construct.hstack([A, B]).todense(), expected)
def test_hstack(self): A = coo_matrix([[1,2],[3,4]]) B = coo_matrix([[5],[6]]) expected = matrix([[1, 2, 5], [3, 4, 6]]) assert_equal(construct.hstack([A,B]).todense(), expected)
def buildModel(self, X_train_d, X_train_c, X_test_d, X_test_c, y_train, y_test):
    '''Build the combined GBDT + LR model.

    Args:
        X_train_d: discrete-feature training data
        X_train_c: continuous-feature training data
        X_test_d: discrete-feature test data
        X_test_c: continuous-feature test data
        y_train: training labels {-1, 1}
        y_test: test labels {-1, 1}

    Returns:
        gbc_enc: OneHotEncoder fitted on the GBDT leaf indices
        gbc: the GBDT model
        comb_model: the trained combined model
        threshold: decision threshold; Pred_Prob >= threshold is positive,
            Pred_Prob < threshold is negative
        comb_model_auc: model AUC
        precision: precision at the chosen threshold
        recall: recall at the chosen threshold
    '''
    # Fit the GBDT on the continuous features only; pass random_state
    # through only when one was configured.
    gbdt_params = dict(n_estimators=self._n_estimators,
                       learning_rate=self._gbdt_learning_rate,
                       max_depth=self._max_depth)
    if self._random_state is not None:
        gbdt_params['random_state'] = self._random_state
    gbc = GradientBoostingClassifier(**gbdt_params).fit(X_train_c, y_train)

    # Leaf-index encoding, one-hot fitted over train+test together.
    X_train_leaves = gbc.apply(X_train_c)[:, :, 0]
    X_test_leaves = gbc.apply(X_test_c)[:, :, 0]
    (X_train_rows, cols) = X_train_leaves.shape
    all_leaves = np.concatenate([X_train_leaves, X_test_leaves], axis=0)
    gbc_enc = OneHotEncoder().fit(all_leaves)
    X_trans = gbc_enc.transform(all_leaves)

    # Append the discrete features to the encoded leaves.
    X_train_ext = hstack([X_trans[:X_train_rows, :], X_train_d])
    X_test_ext = hstack([X_trans[X_train_rows:, :], X_test_d])
    log.debug("Combine features done.")

    comb_model = LogisticRegression().fit(X_train_ext, y_train)
    log.debug("Training done.")

    comb_model_pred = comb_model.predict_proba(X_test_ext)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_test, comb_model_pred)
    ap = average_precision_score(y_test, comb_model_pred)  # NOTE(review): unused

    # Choose the last threshold whose recall still meets the target rate
    # (recall from precision_recall_curve is non-increasing).
    recall_meet = recall >= self._recall_rate
    recall_meet_min = int(recall_meet.sum())
    threshold = thresholds[recall_meet_min - 1]
    log.debug("threshold: %f - precision: %f - recall: %f",
              threshold, precision[recall_meet_min - 1],
              recall[recall_meet_min - 1])
    comb_model_auc = roc_auc_score(y_test, comb_model_pred)
    log.debug("AUC score is: %f", comb_model_auc)
    return (gbc_enc, gbc, comb_model, threshold, comb_model_auc,
            precision[recall_meet_min - 1], recall[recall_meet_min - 1])
def Predict(self, X_test):
    """Score *X_test* with the trained GBDT+LR pipeline and return the
    positive-class probabilities as a plain list."""
    # Leaf indices for the new samples.
    test_leaves = self.gbdt.apply(X_test)[:, :, 0]
    # Re-fit the one-hot encoder over the stored training leaves plus the
    # new leaves so both share one category space.
    encoder = OneHotEncoder()
    self.X_trans = encoder.fit_transform(
        np.concatenate((self.X_train_leaves, test_leaves), axis=0))
    # Encoded leaves + raw features, matching the training layout.
    test_ext = hstack([self.X_trans[self.train_rows:, :], X_test])
    scores = self.lr.predict_proba(test_ext)[:, 1]
    return [score for score in scores]
def fit_transform(self, raw_documents, y=None):
    """Apply every configured vectorizer to its field of *raw_documents*
    and hstack the per-field matrices into one CSR float32 matrix."""
    matrices = []
    for rule in self.vect_rules:
        field = rule.get('name')
        vectorizer = rule.get('vectorizer')
        column = [get_nested_value(doc, field, '') for doc in raw_documents]
        if hasattr(vectorizer, '__call__'):
            # A bare callable is applied directly (no fitting step).
            matrices.append(vectorizer(column))
        else:
            matrices.append(vectorizer.fit_transform(column))
    return hstack(matrices, format='csr', dtype=np.float32)
def fit_transform(self, raw_documents, y=None):
    """Vectorize each configured field of *raw_documents* and join the
    results horizontally into a single sparse CSR float32 matrix."""
    pieces = []
    for vect_rule in self.vect_rules:
        key = vect_rule.get('name')
        vect = vect_rule.get('vectorizer')
        values = [get_nested_str_value(doc, key, '') for doc in raw_documents]
        # Plain callables are applied directly; estimators are fitted.
        piece = vect(values) if hasattr(vect, '__call__') else vect.fit_transform(values)
        pieces.append(piece)
    return hstack(pieces, format='csr', dtype=np.float32)
def gbdt_lr_train(self, Train_tab, Train_libsvm):
    """Train a GBDT+LR pipeline on sample_libsvm_data.txt, persist the LR
    with pickle, reload it, and print the test-set probabilities.

    Fix: the pickle dump/load previously used bare open() calls that
    leaked both file handles; they now use context managers.

    NOTE(review): Train_tab / Train_libsvm are unused — the data always
    comes from "sample_libsvm_data.txt"; confirm this is intended.
    """
    # Load the libsvm-format samples.
    X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")
    # 90/10 train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.1, random_state=42)

    # Fit the GBDT.
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3,
                                      verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)

    # Leaf-index encoding of both splits, one-hot encoded together so
    # train and test share one category space.
    leaves_train = gbdt.apply(X_train)[:, :, 0]
    leaves_test = gbdt.apply(X_test)[:, :, 0]
    (train_rows, cols) = leaves_train.shape
    encoder = OneHotEncoder()
    X_trans = encoder.fit_transform(
        np.concatenate((leaves_train, leaves_test), axis=0))

    # LR over encoded leaves + raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    lr.fit(X_train_ext, y_train)

    # Persist the model and load it back (handles closed deterministically).
    filename = 'finalized_model.sav'
    with open(filename, 'wb') as fout:
        pickle.dump(lr, fout)
    with open(filename, 'rb') as fin:
        loaded_model = pickle.load(fin)
    y_pred_gbdtlr2 = loaded_model.predict_proba(X_test_ext)[:, 1]
    print(y_pred_gbdtlr2)
def gbdt_lr_train(self, Train_tab, Train_libsvm):
    """Fit self.gbdt on Train_libsvm/Train_tab, one-hot encode its leaf
    indices, and fit self.lr on [encoded leaves | raw features].

    Side effects: stores X_train_leaves and train_rows on self (used by
    prediction later) and fits self.gbdt / self.lr in place.

    Fix: removed a dead load_svmlight_file("sample_libsvm_data.txt") call
    whose result was discarded — it only forced an unrelated file to exist.
    """
    # 90/10 train/test split of the supplied features/labels.
    X_train, X_test, y_train, y_test = train_test_split(
        Train_libsvm, Train_tab, test_size=0.1, random_state=42)
    # Fit the GBDT on the training split.
    self.gbdt.fit(X_train, y_train)
    # Encode each sample by the leaf it reaches in every tree; keep the
    # training leaves and row count for later prediction.
    self.X_train_leaves = self.gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = self.gbdt.apply(X_test)[:, :, 0]
    (self.train_rows, cols) = self.X_train_leaves.shape
    # One-hot over train+test leaves so both share one category space.
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((self.X_train_leaves, X_test_leaves), axis=0))
    # Combined features: encoded leaves + raw features.
    X_train_ext = hstack([X_trans[:self.train_rows, :], X_train])
    # Fit the LR on the combined features.
    self.lr.fit(X_train_ext, y_train)
def save_dataset_file(dataset, directory):
    """
    Cache a dataset into a file.

    Args:
        dataset (Dataset): The dataset to save.
        directory (str): The directory to save it to.
    """
    filepath = os.path.join(directory, generate_filename_for_dataset(dataset))
    header = get_file_header(dataset)
    logging.info("Saving dataset %s to path %s." % (dataset.label, filepath))
    # Append the target as the last column, then persist in the matching format.
    target_col = dataset.target[np.newaxis].T
    if dataset.sparse:
        save_sparse_matrix(hstack([dataset.data, target_col]), filepath)
    else:
        dense = np.concatenate((dataset.data, target_col), axis=1)
        save_dense_matrix(dense, filepath, header)
    logging.debug("Saving successful")
def combineFeatures(self, gbdt_model, gbdt_enc, X_data_c=None, X_data_d=None): ''' 进行特征的组合 Args: gbdt_model: GBDT模型 gbdt_enc: GBDT叶子节点OneHotEncoder X_data_c: 待组合连续特征 X_data_d: 待组合离散特征 Returns: X_ext: 组合后的特征 ''' if X_data_c is None and X_data_d is None: log.error("Feature can not be None.") return X_ext = None if X_data_c is not None: X_leaves = gbdt_model.apply(X_data_c)[:,:,0] X_ext = gbdt_enc.transform(X_leaves) if X_data_d is not None: if X_ext is not None: X_ext = hstack([X_ext, X_data_d]) return X_ext
del a,gp,gp_day_mean,gp_day_var,gp1,gp2,gp3,gp4,index1,l,m1,m2,m3,merge_log,ss,ss2,t1,t2,t3,train_flg #gbdt 构造新特征 gbdt = GradientBoostingClassifier(loss='exponential',learning_rate=0.12,n_estimators=60, max_depth=3,random_state=42,max_features=None) X_train=train_agg.drop(['USRID','FLAG'],axis=1) y_train=train_agg['FLAG'] # 训练学习 gbdt.fit(X_train, y_train) # GBDT编码原有特征 X_train_leaves = gbdt.apply(X_train)[:,:,0] X_test_leaves=gbdt.apply(test_agg.drop('USRID',axis=1))[:,:,0] (train_rows, cols) = X_train_leaves.shape onehot = OneHotEncoder() X_trans = onehot.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0)) # 组合特征 X_train_agg = DataFrame(hstack([X_trans[:train_rows, :], train_agg]).toarray()) X_test_agg = DataFrame(hstack([X_trans[train_rows:, :], test_agg]).toarray()) X_train_agg.rename(columns={494: "USRID",495:"FLAG"},inplace=True) X_test_agg.rename(columns={494: "USRID"},inplace=True) #训练集和测试集 train_data=pd.merge(X_train_agg,train_log,on='USRID',how='left') test_data=pd.merge(X_test_agg,test_log,on='USRID',how='left') del X_train_agg,X_test_agg,train_log,test_log #建模 import lightgbm as lgb train_xy,offline_test = train_test_split(train_data,test_size = 0.3,random_state=42) train,val = train_test_split(train_xy,test_size = 0.3,random_state=42) # 训练集
def gbdt_lr_train(train, test, gbdt_features, lr_features, target, name, isOnline):
    """Train a GBDT, an LR on its leaf encodings, and an LR on leaf
    encodings + raw features.  Offline (isOnline == False): report
    log-loss against *test*.  Online: score *test* and write a
    submission CSV.  Side effect: adds a 'predicted_score' column to
    *test* in online mode."""
    # GBDT model (earlier setting: n_estimators=20, max_depth=3, max_features=0.5).
    gbdt = GradientBoostingClassifier(n_estimators=20, max_depth=3, verbose=0,
                                      max_features=0.3)
    gbdt.fit(train[gbdt_features], train[target])

    # GBDT baseline log-loss.
    if isOnline == False:
        proba = gbdt.predict_proba(test[gbdt_features])[:, 1]
        gbdt_test_log_loss = log_loss(test[target], proba)
        print('gbdt log_loss: %.5f' % gbdt_test_log_loss)
    else:
        # Online: evaluate on the tail of the training data instead.
        proba = gbdt.predict_proba(train[gbdt_features].tail(57562))[:, 1]
        gbdt_test_log_loss = log_loss(train[target].tail(57562), proba)
        print('gbdt log_loss: %.5f' % gbdt_test_log_loss)

    # Leaf-index encoding of train and test, one-hot encoded together.
    X_train_leaves = gbdt.apply(train[gbdt_features])[:, :, 0]
    X_test_leaves = gbdt.apply(test[gbdt_features])[:, :, 0]
    (train_rows, cols) = X_train_leaves.shape
    encoder = OneHotEncoder()
    X_trans = encoder.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # LR over the encoded leaves alone.
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], train[target])
    if isOnline == False:
        proba_lr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
        gbdt_lr_test_log_loss1 = log_loss(test[target], proba_lr1)
        print('基于GBDT特征编码后的LR log_loss: %.5f' % gbdt_lr_test_log_loss1)
    else:
        print('Online')

    # LR over encoded leaves + raw LR features.
    lr = LogisticRegression()
    X_train_ext = hstack([X_trans[:train_rows, :], train[lr_features]])
    X_test_ext = hstack([X_trans[train_rows:, :], test[lr_features]])
    print("gbdt output", X_trans[:train_rows, :].shape)
    print("input", train[lr_features].shape)
    print(X_train_ext.shape)
    lr.fit(X_train_ext, train[target])
    if isOnline == False:
        proba_lr2 = lr.predict_proba(X_test_ext)[:, 1]
        gbdt_lr_test_log_loss2 = log_loss(test[target], proba_lr2)
        print('基于组合特征的LR log_loss: %.5f' % gbdt_lr_test_log_loss2)
    else:
        print('Online')
        # Score the test set and persist the submission file.
        test['predicted_score'] = lr.predict_proba(X_test_ext)[:, 1]
        print(test['predicted_score'].head(5))
        print(len(test))
        test[['instance_id', 'predicted_score']].to_csv(
            '../baseline_' + name + '.csv', index=False, sep=' ')
        print('Saved result success!')
def gbdt_lr_train(libsvmFileName):
    """Train GBDT, LR, leaf-encoded LR and combined-feature LR models on a
    libsvm file, printing shapes, per-tree leaf cardinalities and AUCs.

    Fix: the original used Python 2 `print` statements (a SyntaxError on
    Python 3); all prints are now function calls with identical output
    (multi-argument print() matches the py2 comma form's spacing).
    """
    # Load samples and split 70/30.
    X_all, y_all = load_svmlight_file(libsvmFileName)
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.3, random_state=42)
    print("train data shape: ", X_train.shape)

    # GBDT baseline.
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0,
                                      max_features=0.5)
    gbdt.fit(X_train, y_train)
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # LR baseline on the raw features.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)

    # Leaf-index encoding of both splits.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    print("gbdt leaves shape: ", X_train_leaves.shape)

    # Report the number of distinct leaves per tree.
    for i in range(0, len(X_train_leaves[0])):
        cateMap = {}
        for j in range(0, len(X_train_leaves)):
            cateMap[X_train_leaves[j][i]] = 0
        print("F%d: %d" % (i, len(cateMap)))

    # One-hot encode train+test leaves together (dense output).
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder(sparse=False, categories='auto')
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print("gbdt oneHot shape: ", X_trans.shape)
    print("oneHot leaves: ", X_trans[0])

    # LR on the encoded leaves.
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)

    # LR on encoded leaves + raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    print("gbdt leaves cross", X_train_ext.shape)
    lr.fit(X_train_ext, y_train)
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
lr = LogisticRegression(n_jobs=1) # lr对gbdt特征编码后的样本模型训练 lr.fit(X_trans[:train_rows, :], Y_train) # 预测及AUC评测 Y_predict_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1] print('test:', log_loss(Y_test, Y_predict_gbdtlr1)) print "before Accuracy : %.4f" % metrics.roc_auc_score(Y_test, Y_predict_gbdtlr1) # ('test:', 0.082230433929968566) # before Accuracy: 0.6818 """ # 定义LR模型 lr = LogisticRegression(n_jobs=1) # 组合特征 X_train_ext = hstack([X_trans[:train_rows, :], X_train]) X_test_ext = hstack([X_trans[train_rows:, :], X_test]) # lr对组合特征的样本模型训练 lr.fit(X_train_ext, Y_train) Y_predict_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1] print('test:', log_loss(Y_test, Y_predict_gbdtlr2)) print "before Accuracy : %.4f" % metrics.roc_auc_score( Y_test, Y_predict_gbdtlr2) # ('test:', 0.095052240862119497) # before Accuracy : 0.5413 """ # 合并编码后的训练数据和测试数据 All_leaves = np.concatenate((X_train_leaves, X_test_leaves), axis=0) All_leaves = All_leaves.astype(np.int32)
def xgb_lr_train():
    """5-fold CV comparing XGBoost alone, LR on XGBoost leaf encodings,
    and LR on leaf encodings + raw features; prints per-fold and mean
    AUCs.  Reads module-level X and y."""
    cv_lr_scores = []  # NOTE(review): never appended to — kept for parity
    cv_lr_trans_scores = []
    cv_lr_trans_raw_scores = []
    cv_xgb_scores = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        # XGBoost baseline.
        xgboost = xgb.XGBClassifier(nthread=4, learning_rate=0.08,
                                    n_estimators=100, max_depth=4, gamma=0,
                                    subsample=0.7, colsample_bytree=0.7,
                                    verbosity=1)
        xgboost.fit(X_train, y_train)
        y_pred_valid = xgboost.predict_proba(X_valid)[:, 1]
        xgb_valid_auc = roc_auc_score(y_valid, y_pred_valid)
        print('基于原有特征的xgb auc: %.5f' % xgb_valid_auc)
        cv_xgb_scores.append(xgb_valid_auc)

        # Leaf-index encoding of both folds, one-hot encoded together.
        X_train_leaves = xgboost.apply(X_train)
        X_valid_leaves = xgboost.apply(X_valid)
        All_leaves = np.concatenate((X_train_leaves, X_valid_leaves), axis=0)
        All_leaves = All_leaves.astype(np.int32)
        xgbenc = OneHotEncoder()
        X_trans = xgbenc.fit_transform(All_leaves)
        (train_rows, cols) = X_train_leaves.shape

        # LR on the encoded leaves.
        lr = LogisticRegression()
        lr.fit(X_trans[:train_rows, :], y_train)
        y_pred_xgblr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
        xgb_lr_auc1 = roc_auc_score(y_valid, y_pred_xgblr1)
        print('基于Xgb特征编码后的LR AUC: %.5f' % xgb_lr_auc1)
        cv_lr_trans_scores.append(xgb_lr_auc1)

        # LR on encoded leaves + raw features.
        lr = LogisticRegression(n_jobs=-1)
        X_train_ext = hstack([X_trans[:train_rows, :], X_train])
        X_test_ext = hstack([X_trans[train_rows:, :], X_valid])
        lr.fit(X_train_ext, y_train)
        y_pred_xgblr2 = lr.predict_proba(X_test_ext)[:, 1]
        xgb_lr_auc2 = roc_auc_score(y_valid, y_pred_xgblr2)
        print('基于组合特征的LR AUC: %.5f' % xgb_lr_auc2)
        cv_lr_trans_raw_scores.append(xgb_lr_auc2)

    # Mean AUCs across folds.
    cv_lr_trans = np.mean(cv_lr_trans_scores)
    cv_lr_trans_raw = np.mean(cv_lr_trans_raw_scores)
    cv_xgb = np.mean(cv_xgb_scores)
    print("==" * 20)
    print("xgb原始特征cv_gbdt:", cv_xgb)
    print("lr基于xgb的特征cv_lr_trans:", cv_lr_trans)
    print("lr基于xgb特征个原始特征cv_lr_trans_raw:", cv_lr_trans_raw)
def gbdt_lr_train():
    """5-fold CV comparing GBDT, LR on raw features, LR on GBDT leaf
    encodings, and LR on encodings + raw features; prints per-fold and
    mean AUCs.  Reads module-level X and y."""
    cv_lr_scores = []
    cv_lr_trans_scores = []
    cv_lr_trans_raw_scores = []
    cv_gbdt_scores = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        # GBDT baseline.
        gbdt = GradientBoostingClassifier(n_estimators=60, max_depth=3,
                                          verbose=0, max_features=0.5)
        gbdt.fit(X_train, y_train)
        y_pred_gbdt = gbdt.predict_proba(X_valid)[:, 1]
        gbdt_auc = roc_auc_score(y_valid, y_pred_gbdt)
        print('基于原有特征的gbdt auc: %.5f' % gbdt_auc)
        cv_gbdt_scores.append(gbdt_auc)

        # LR baseline on the raw features.
        lr = LogisticRegression()
        lr.fit(X_train, y_train)
        y_pred_test = lr.predict_proba(X_valid)[:, 1]
        lr_valid_auc = roc_auc_score(y_valid, y_pred_test)
        print('基于原有特征的LR AUC: %.5f' % lr_valid_auc)
        cv_lr_scores.append(lr_valid_auc)

        # Leaf encodings, one-hot encoded over both folds together.
        X_train_leaves = gbdt.apply(X_train)[:, :, 0]
        X_valid_leaves = gbdt.apply(X_valid)[:, :, 0]
        (train_rows, cols) = X_train_leaves.shape
        gbdtenc = OneHotEncoder()
        X_trans = gbdtenc.fit_transform(
            np.concatenate((X_train_leaves, X_valid_leaves), axis=0))

        # LR on the encodings alone.
        lr = LogisticRegression()
        lr.fit(X_trans[:train_rows, :], y_train)
        y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
        gbdt_lr_auc1 = roc_auc_score(y_valid, y_pred_gbdtlr1)
        print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)
        cv_lr_trans_scores.append(gbdt_lr_auc1)

        # LR on encodings + raw features.
        lr = LogisticRegression(n_jobs=-1)
        X_train_ext = hstack([X_trans[:train_rows, :], X_train])
        X_valid_ext = hstack([X_trans[train_rows:, :], X_valid])
        print(X_train_ext.shape)
        lr.fit(X_train_ext, y_train)
        y_pred_gbdtlr2 = lr.predict_proba(X_valid_ext)[:, 1]
        gbdt_lr_auc2 = roc_auc_score(y_valid, y_pred_gbdtlr2)
        print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
        cv_lr_trans_raw_scores.append(gbdt_lr_auc2)

    # Mean AUCs across folds.
    cv_lr = np.mean(cv_lr_scores)
    cv_lr_trans = np.mean(cv_lr_trans_scores)
    cv_lr_trans_raw = np.mean(cv_lr_trans_raw_scores)
    cv_gbdt = np.mean(cv_gbdt_scores)
    print("==" * 20)
    print("gbdt原始特征cv_gbdt:", cv_gbdt)
    print("lr原始特征cv_lr:", cv_lr)
    print("lr基于gbdt的特征cv_lr_trans:", cv_lr_trans)
    print("lr基于gbdt特征个原始特征cv_lr_trans_raw:", cv_lr_trans_raw)
def gbdt_lr_train(libsvmFileName):
    """Debug-heavy variant: trains GBDT, LR, leaf-encoded LR and
    combined-feature LR models, printing intermediate types and shapes."""
    X_all, y_all = load_svmlight_file(libsvmFileName)
    print(type(X_all))
    # 70/30 train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.3, random_state=42)

    # GBDT baseline.
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0,
                                      max_features=0.5)
    gbdt.fit(X_train, y_train)
    toarray = X_test.toarray()
    print(type(toarray))
    y_pred_gbdt = gbdt.predict_proba(toarray)  # full probability matrix (debug)
    y_pred_gbdt = gbdt.predict_proba(toarray)[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)  # observed: gbdt auc: 0.96455

    # LR baseline on the raw features.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)  # observed: 0.93455

    # Leaf-index encoding of both splits.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    np.set_printoptions(linewidth=400)
    np.set_printoptions(threshold=np.inf)
    # print(X_train_leaves[0:22, :])
    print(type(X_train_leaves))
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # One-hot encode train+test leaves together.
    (train_rows, cols) = X_train_leaves.shape
    print(train_rows, cols)
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print(X_trans.shape)
    # print(X_trans.todense()[0:22, :])

    # LR on the encoded leaves.
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], y_train)
    # print(X_trans[train_rows:, :])
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)

    # LR on encoded leaves + raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    print("组合特征的个数:", X_train_ext.shape)
    lr.fit(X_train_ext, y_train)
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)