Ejemplo n.º 1
0
def gbdt_lr_train(libsvmFileName):
    """Benchmark GBDT, plain LR and two GBDT+LR variants on one dataset.

    The libsvm file is loaded and split 70/30 with a fixed seed (42).  Four
    test-set AUC figures are printed, in this order:

    1. the gradient-boosted trees on their own,
    2. logistic regression on the raw features,
    3. logistic regression on the one-hot encoded GBDT leaf indices,
    4. logistic regression on the leaf encoding stacked with the raw features.

    Args:
        libsvmFileName: Path handed to ``load_svmlight_file``.

    Returns:
        None. Results are only printed.
    """
    features, labels = load_svmlight_file(libsvmFileName)
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, labels, test_size=0.3, random_state=42)

    # --- 1. GBDT on the raw features -------------------------------------
    booster = GradientBoostingClassifier(
        n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    booster.fit(X_tr, y_tr)
    booster_scores = booster.predict_proba(X_te.toarray())[:, 1]
    print('gbdt auc: %.5f' % roc_auc_score(y_te, booster_scores))

    # --- 2. LR on the raw features ----------------------------------------
    raw_lr = LogisticRegression()
    raw_lr.fit(X_tr, y_tr)
    raw_scores = raw_lr.predict_proba(X_te)[:, 1]
    print('基于原有特征的LR AUC: %.5f' % roc_auc_score(y_te, raw_scores))

    # --- 3. LR on one-hot encoded leaf indices ----------------------------
    # apply() yields (n_samples, n_estimators, 1) here; drop the last axis.
    leaves_tr = booster.apply(X_tr)[:, :, 0]
    leaves_te = booster.apply(X_te)[:, :, 0]
    n_train = leaves_tr.shape[0]

    # The encoder is fitted on train and test leaves together, then the
    # result is split back by row position.
    stacked_leaves = np.concatenate((leaves_tr, leaves_te), axis=0)
    encoded = OneHotEncoder().fit_transform(stacked_leaves)
    enc_tr = encoded[:n_train, :]
    enc_te = encoded[n_train:, :]

    leaf_lr = LogisticRegression()
    leaf_lr.fit(enc_tr, y_tr)
    leaf_scores = leaf_lr.predict_proba(enc_te)[:, 1]
    print('基于GBDT特征编码后的LR AUC: %.5f' % roc_auc_score(y_te, leaf_scores))

    # --- 4. LR on leaf encoding + raw features ----------------------------
    combo_lr = LogisticRegression(n_jobs=-1)
    combo_tr = hstack([enc_tr, X_tr])
    combo_te = hstack([enc_te, X_te])
    print(combo_tr.shape)
    combo_lr.fit(combo_tr, y_tr)

    combo_scores = combo_lr.predict_proba(combo_te)[:, 1]
    print('基于组合特征的LR AUC: %.5f' % roc_auc_score(y_te, combo_scores))
Ejemplo n.º 2
0
    def test_hstack(self):
        """hstack joins sparse blocks column-wise and honours ``dtype``."""
        left = coo_matrix([[1, 2], [3, 4]])
        right = coo_matrix([[5], [6]])
        expected = matrix([[1, 2, 5], [3, 4, 6]])

        # COO inputs: values first, then the requested output dtype.
        coo_result = construct.hstack([left, right])
        assert_equal(coo_result.todense(), expected)
        coo_typed = construct.hstack([left, right], dtype=np.float32)
        assert_equal(coo_typed.dtype, np.float32)

        # Same two checks with CSC inputs.
        csc_result = construct.hstack([left.tocsc(), right.tocsc()])
        assert_equal(csc_result.todense(), expected)
        csc_typed = construct.hstack([left.tocsc(), right.tocsc()],
                                     dtype=np.float32)
        assert_equal(csc_typed.dtype, np.float32)
Ejemplo n.º 3
0
    def test_hstack(self):
        """Check hstack values and dtype for both COO and CSC inputs."""
        first = coo_matrix([[1, 2], [3, 4]])
        second = coo_matrix([[5], [6]])
        expected = matrix([[1, 2, 5],
                           [3, 4, 6]])

        # Run the same value/dtype pair of checks for each input format,
        # COO first and CSC second.
        block_sets = (
            [first, second],
            [first.tocsc(), second.tocsc()],
        )
        for blocks in block_sets:
            assert_equal(construct.hstack(blocks).todense(), expected)
            assert_equal(construct.hstack(blocks, dtype=np.float32).dtype,
                         np.float32)
Ejemplo n.º 4
0
    def GBDT_LReval(self, feature, target):
        """Evaluate GBDT and GBDT+LR models on a positional train/eval split.

        The first ``self.lenth_eval`` rows of ``feature``/``target`` are used
        for training and the remaining rows for evaluation.  AUC is printed
        for the GBDT alone, for LR on the stacked (leaf encoding + raw)
        features, and for LR on the one-hot encoded leaf indices, followed by
        the leaf index matrices themselves.

        Every estimator is fed plain arrays (``.values``).  The GBDT is
        fitted on arrays, so passing the DataFrames to ``apply`` afterwards
        makes recent scikit-learn releases warn about feature names the
        model was never fitted with.

        Args:
            feature: pandas object with the feature columns (sliced by
                ``.iloc`` row position).
            target: pandas object with the labels, aligned row-for-row with
                ``feature``.

        Returns:
            None. Results are only printed.
        """
        split = self.lenth_eval
        X_train = feature.iloc[0:split, :].values
        X_eval = feature.iloc[split:, :].values
        y_train = target.iloc[0:split].values
        y_eval = target.iloc[split:].values

        GBDT = GradientBoostingClassifier(n_estimators=10)
        GBDT.fit(X_train, y_train)

        y_pred_gbdt = GBDT.predict_proba(X_eval)[:, 1]
        gbdt_auc = roc_auc_score(y_eval, y_pred_gbdt)
        print('gbdt auc: %.5f' % gbdt_auc)

        # Leaf index reached in each tree; drop the trailing size-1 axis.
        X_train_leaves = GBDT.apply(X_train)[:, :, 0]
        X_test_leaves = GBDT.apply(X_eval)[:, :, 0]
        train_rows = X_train_leaves.shape[0]

        # One encoder over train + eval leaves, split back by row position.
        gbdtenc = OneHotEncoder()
        X_trans = gbdtenc.fit_transform(
            np.concatenate((X_train_leaves, X_test_leaves), axis=0))
        enc_train = X_trans[:train_rows, :]
        enc_eval = X_trans[train_rows:, :]

        # LR on the encoded leaves only.
        lr = LogisticRegression()
        lr.fit(enc_train, y_train)
        y_pred_gbdtlr1 = lr.predict_proba(enc_eval)[:, 1]
        gbdt_lr_auc1 = roc_auc_score(y_eval, y_pred_gbdtlr1)

        # LR on the encoded leaves stacked with the raw features.
        lr = LogisticRegression(n_jobs=-1)
        X_train_ext = hstack([enc_train, X_train])
        X_test_ext = hstack([enc_eval, X_eval])

        print(X_train_ext.shape)
        lr.fit(X_train_ext, y_train)

        y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
        gbdt_lr_auc2 = roc_auc_score(y_eval, y_pred_gbdtlr2)
        print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)

        print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)

        print(X_train_leaves.shape)
        print(X_test_leaves.shape)
        print(X_train_leaves)
        print(X_test_leaves)
Ejemplo n.º 5
0
    def test_hstack(self):
        """hstack of a 2x2 and a 2x1 COO matrix gives the expected 2x3."""
        wide = coo_matrix([[1, 2], [3, 4]])
        narrow = coo_matrix([[5], [6]])
        expected = matrix([[1, 2, 5], [3, 4, 6]])

        stacked = construct.hstack([wide, narrow])
        assert_equal(stacked.todense(), expected)
Ejemplo n.º 6
0
    def test_hstack(self):
        """Column-wise stacking of two COO matrices matches the dense result."""
        blocks = [coo_matrix([[1, 2], [3, 4]]),
                  coo_matrix([[5], [6]])]
        expected = matrix([[1, 2, 5],
                           [3, 4, 6]])

        dense = construct.hstack(blocks).todense()
        assert_equal(dense, expected)
Ejemplo n.º 7
0
	def buildModel(self, X_train_d, X_train_c, X_test_d, X_test_c, y_train, y_test):
		'''
		Build the combined GBDT + LR model and pick a decision threshold.
		The GBDT is fitted on the continuous features; its one-hot encoded
		leaf indices are stacked with the discrete features and fed to a
		logistic regression.
		Args:
			X_train_d: discrete-feature training data
			X_train_c: continuous-feature training data
			X_test_d: discrete-feature test data
			X_test_c: continuous-feature test data
			y_train: training labels {-1, 1}
			y_test: test labels {-1, 1}
		Returns:
			gbc_enc: OneHotEncoder fitted on the GBDT leaf indices
			gbc: the GBDT model
			comb_model: the trained combined model
			threshold: positive/negative cut-off; Pred_Prob >= threshold is positive, Pred_Prob < threshold is negative
			comb_model_auc: AUC of the combined model
			precision: precision at the chosen threshold
			recall: recall at the chosen threshold
		'''
		# random_state=None is the estimator default, so one constructor call
		# covers both the seeded and the unseeded configuration.
		gbc = GradientBoostingClassifier(
			n_estimators=self._n_estimators,
			learning_rate=self._gbdt_learning_rate,
			max_depth=self._max_depth,
			random_state=self._random_state,
		)
		gbc.fit(X_train_c, y_train)

		# Leaf index per tree for every sample (trailing size-1 axis dropped).
		train_leaves = gbc.apply(X_train_c)[:,:,0]
		test_leaves = gbc.apply(X_test_c)[:,:,0]
		n_train = train_leaves.shape[0]

		# Encoder is fitted on train + test leaves together, then split by row.
		all_leaves = np.concatenate([train_leaves, test_leaves], axis = 0)
		gbc_enc = OneHotEncoder().fit(all_leaves)
		encoded = gbc_enc.transform(all_leaves)
		X_train_ext = hstack([encoded[:n_train,:], X_train_d])
		X_test_ext = hstack([encoded[n_train:,:], X_test_d])
		log.debug("Combine features done.")

		comb_model = LogisticRegression().fit(X_train_ext, y_train)
		log.debug("Training done.")

		comb_model_pred = comb_model.predict_proba(X_test_ext)[:,1]
		precision, recall, thresholds = precision_recall_curve(y_test, comb_model_pred)
		ap = average_precision_score(y_test, comb_model_pred)

		# Count the curve points whose recall meets the configured rate and
		# use the last of them as the operating point.
		recall_meet = recall >= self._recall_rate
		meet_count = sum(1 for flag in recall_meet if flag == True)
		pick = meet_count - 1
		threshold = thresholds[pick]
		log.debug("threshold: %f - precision: %f - recall: %f", threshold, precision[pick], recall[pick])

		comb_model_auc = roc_auc_score(y_test, comb_model_pred)
		log.debug("AUC score is: %f", comb_model_auc)
		return gbc_enc, gbc, comb_model, threshold, comb_model_auc, precision[pick], recall[pick]
Ejemplo n.º 8
0
 def Predict(self, X_test):
     """Score ``X_test`` with the stored GBDT + LR pipeline.

     The leaf indices of ``X_test`` are appended to the stored training
     leaves, a fresh OneHotEncoder is fitted on the combined matrix (the
     result is kept in ``self.X_trans``), and the rows after
     ``self.train_rows`` are stacked with ``X_test`` and passed to
     ``self.lr``.

     NOTE(review): the encoder is re-fitted on every call rather than
     reused from training -- confirm this is intended.

     Returns:
         list of positive-class probabilities, one per row of ``X_test``.
     """
     test_leaves = self.gbdt.apply(X_test)[:, :, 0]
     stacked_leaves = np.concatenate((self.X_train_leaves, test_leaves), axis=0)
     self.X_trans = OneHotEncoder().fit_transform(stacked_leaves)

     encoded_test = self.X_trans[self.train_rows:, :]
     combined = hstack([encoded_test, X_test])
     positive_proba = self.lr.predict_proba(combined)[:, 1]
     return list(positive_proba)
Ejemplo n.º 9
0
 def fit_transform(self, raw_documents, y=None):
     """Vectorize each configured field and stack the results column-wise.

     For every rule in ``self.vect_rules`` the field named by ``'name'`` is
     pulled out of each document (``''`` when missing).  The rule's
     ``'vectorizer'`` is called directly when it is callable, otherwise its
     ``fit_transform`` method is used.

     Returns:
         CSR sparse matrix of dtype float32 with all blocks side by side.
     """
     blocks = []
     for rule in self.vect_rules:
         field = rule.get('name')
         vectorizer = rule.get('vectorizer')
         values = [get_nested_value(doc, field, '') for doc in raw_documents]
         if hasattr(vectorizer, '__call__'):
             block = vectorizer(values)
         else:
             block = vectorizer.fit_transform(values)
         blocks.append(block)
     return hstack(blocks, format='csr', dtype=np.float32)
Ejemplo n.º 10
0
 def fit_transform(self, raw_documents, y=None):
     """Vectorize each configured string field and stack the blocks.

     Each entry of ``self.vect_rules`` supplies a field ``'name'`` and a
     ``'vectorizer'``.  The field is read from every document with
     ``get_nested_str_value`` (default ``''``); a callable vectorizer is
     invoked directly, anything else through ``fit_transform``.

     Returns:
         CSR sparse matrix of dtype float32, one column block per rule.
     """
     def _vectorize(rule):
         # Produce the feature block for a single rule.
         field = rule.get('name')
         vectorizer = rule.get('vectorizer')
         texts = [get_nested_str_value(doc, field, '') for doc in raw_documents]
         if hasattr(vectorizer, '__call__'):
             return vectorizer(texts)
         return vectorizer.fit_transform(texts)

     blocks = []
     for rule in self.vect_rules:
         blocks.append(_vectorize(rule))
     return hstack(blocks, format='csr', dtype=np.float32)
Ejemplo n.º 11
0
 def gbdt_lr_train(self, Train_tab, Train_libsvm):
     """Train GBDT + LR on the sample file and round-trip the LR via pickle.

     The data is read from ``sample_libsvm_data.txt`` in the current working
     directory and split 90/10 (seed 42).  A GBDT is fitted, its leaf indices
     are one-hot encoded and stacked with the raw features, and a logistic
     regression is trained on the result.  The LR is written to
     ``finalized_model.sav``, loaded back, and the reloaded model's
     positive-class probabilities for the held-out rows are printed.

     NOTE(review): ``Train_tab`` and ``Train_libsvm`` are accepted but never
     read -- confirm whether they were meant to replace the hard-coded file.

     Returns:
         None.
     """
     X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")
     X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                         y_all,
                                                         test_size=0.1,
                                                         random_state=42)
     gbdt = GradientBoostingClassifier(n_estimators=40,
                                       max_depth=3,
                                       verbose=0,
                                       max_features=0.5)
     gbdt.fit(X_train, y_train)

     # Leaf index per tree for every sample (trailing size-1 axis dropped).
     X_train_leaves = gbdt.apply(X_train)[:, :, 0]
     X_test_leaves = gbdt.apply(X_test)[:, :, 0]
     train_rows = X_train_leaves.shape[0]

     # One encoder over train + test leaves, split back by row position.
     gbdtenc = OneHotEncoder()
     X_trans = gbdtenc.fit_transform(
         np.concatenate((X_train_leaves, X_test_leaves), axis=0))

     lr = LogisticRegression(n_jobs=-1)
     X_train_ext = hstack([X_trans[:train_rows, :], X_train])
     X_test_ext = hstack([X_trans[train_rows:, :], X_test])
     lr.fit(X_train_ext, y_train)

     # Persist and reload through context managers so both file handles are
     # closed (and the dump flushed) before the file is read back.
     filename = 'finalized_model.sav'
     with open(filename, 'wb') as model_file:
         pickle.dump(lr, model_file)
     with open(filename, 'rb') as model_file:
         loaded_model = pickle.load(model_file)

     y_pred_gbdtlr2 = loaded_model.predict_proba(X_test_ext)[:, 1]
     print(y_pred_gbdtlr2)
Ejemplo n.º 12
0
 def gbdt_lr_train(self, Train_tab, Train_libsvm):
     """Fit ``self.gbdt`` and ``self.lr`` on a 90% split of the given data.

     ``Train_libsvm`` is used as the feature matrix and ``Train_tab`` as the
     labels.  After fitting the GBDT, the training leaf indices and row
     count are stored on the instance (``self.X_train_leaves``,
     ``self.train_rows``), and the LR is trained on the one-hot encoded
     leaves stacked with the raw training features.

     The former unconditional read of ``sample_libsvm_data.txt`` was dropped:
     its result was never used, and it made training fail whenever that file
     was not present in the working directory.

     Returns:
         None. The fitted state lives on ``self``.
     """
     X_train, X_test, y_train, y_test = train_test_split(
         Train_libsvm, Train_tab, test_size=0.1, random_state=42)

     self.gbdt.fit(X_train, y_train)

     # Leaf index per tree for every sample (trailing size-1 axis dropped).
     self.X_train_leaves = self.gbdt.apply(X_train)[:, :, 0]
     X_test_leaves = self.gbdt.apply(X_test)[:, :, 0]
     self.train_rows = self.X_train_leaves.shape[0]

     # The encoder is fitted on train + held-out leaves together; only the
     # training rows are used to fit the LR.
     gbdtenc = OneHotEncoder()
     X_trans = gbdtenc.fit_transform(
         np.concatenate((self.X_train_leaves, X_test_leaves), axis=0))
     X_train_ext = hstack([X_trans[:self.train_rows, :], X_train])

     self.lr.fit(X_train_ext, y_train)
Ejemplo n.º 13
0
def save_dataset_file(dataset, directory):
    """Write a dataset to its cache file inside ``directory``.

    The target vector is appended to the data as a final column.  Sparse
    datasets go through ``save_sparse_matrix``; dense ones through
    ``save_dense_matrix`` together with the file header.

    Args:
        dataset (Dataset): The dataset to save.
        directory (str): The directory to save it to.
    """
    filename = generate_filename_for_dataset(dataset)
    filepath = os.path.join(directory, filename)
    header = get_file_header(dataset)

    logging.info("Saving dataset %s to path %s." % (dataset.label, filepath))

    is_sparse = dataset.sparse
    # Turn the 1-D target into a single column so it can be stacked.
    target_column = dataset.target[np.newaxis].T
    if not is_sparse:
        dense = np.concatenate((dataset.data, target_column), axis=1)
        save_dense_matrix(dense, filepath, header)
    else:
        save_sparse_matrix(hstack([dataset.data, target_column]), filepath)

    logging.debug("Saving successful")
Ejemplo n.º 14
0
	def combineFeatures(self, gbdt_model, gbdt_enc, X_data_c=None, X_data_d=None):
		'''
		Combine GBDT leaf features with the discrete features.
		Args:
			gbdt_model: the GBDT model
			gbdt_enc: OneHotEncoder for the GBDT leaf indices
			X_data_c: continuous features to combine (optional)
			X_data_d: discrete features to combine (optional)
		Returns:
			X_ext: the combined features. With only X_data_c this is the
				encoded leaf matrix; with only X_data_d it is X_data_d itself
				(previously None was returned in that case); with both it is
				the encoded leaves stacked column-wise with X_data_d.
				None when both inputs are None (an error is logged).
		'''
		if X_data_c is None and X_data_d is None:
			log.error("Feature can not be None.")
			return None
		if X_data_c is None:
			# Nothing to encode: the discrete features are the whole result.
			return X_data_d
		# Leaf index per tree for every sample (trailing size-1 axis dropped).
		X_leaves = gbdt_model.apply(X_data_c)[:,:,0]
		X_ext = gbdt_enc.transform(X_leaves)
		if X_data_d is not None:
			X_ext = hstack([X_ext, X_data_d])
		return X_ext
Ejemplo n.º 15
0
# Script fragment: derive GBDT leaf features for train_agg / test_agg, join
# them with the log features on USRID, and split the result for modelling.
# Relies on train_agg, test_agg, train_log and test_log defined earlier.

# Drop intermediates from earlier steps that are no longer needed.
del a,gp,gp_day_mean,gp_day_var,gp1,gp2,gp3,gp4,index1,l,m1,m2,m3,merge_log,ss,ss2,t1,t2,t3,train_flg
# Build new features with GBDT
gbdt = GradientBoostingClassifier(loss='exponential',learning_rate=0.12,n_estimators=60, max_depth=3,random_state=42,max_features=None)
X_train=train_agg.drop(['USRID','FLAG'],axis=1)
y_train=train_agg['FLAG']
# Fit the model
gbdt.fit(X_train, y_train)
# Encode the original features as GBDT leaf indices (one column per tree)
X_train_leaves = gbdt.apply(X_train)[:,:,0]
X_test_leaves=gbdt.apply(test_agg.drop('USRID',axis=1))[:,:,0]
(train_rows, cols) = X_train_leaves.shape
# One-hot encode train and test leaves together; rows are split back below
# using train_rows.
onehot = OneHotEncoder()
X_trans = onehot.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))

# Combine features: encoded leaves followed by every column of *_agg
X_train_agg = DataFrame(hstack([X_trans[:train_rows, :], train_agg]).toarray())
X_test_agg = DataFrame(hstack([X_trans[train_rows:, :], test_agg]).toarray())
# The stacked frames have integer column labels.
# NOTE(review): 494/495 are hard-coded positions, presumably where USRID and
# FLAG land after the one-hot block -- confirm they still match whenever the
# GBDT settings or the *_agg columns change.
X_train_agg.rename(columns={494: "USRID",495:"FLAG"},inplace=True)
X_test_agg.rename(columns={494: "USRID"},inplace=True)

# Training and test sets

# Left-join the log features onto the combined features by user id.
train_data=pd.merge(X_train_agg,train_log,on='USRID',how='left')
test_data=pd.merge(X_test_agg,test_log,on='USRID',how='left')
del X_train_agg,X_test_agg,train_log,test_log
# Modelling
import lightgbm as lgb
# 70/30 split into train_xy / offline_test, then 70/30 again into train / val.
train_xy,offline_test = train_test_split(train_data,test_size = 0.3,random_state=42)
train,val = train_test_split(train_xy,test_size = 0.3,random_state=42)

# 训练集
Ejemplo n.º 16
0
def gbdt_lr_train(train, test, gbdt_features, lr_features, target, name, isOnline):
    """Train GBDT and GBDT+LR models and report log-loss or write predictions.

    Offline (``isOnline == False``) the models are scored on ``test`` with
    log-loss.  Otherwise the GBDT is scored on the last 57562 rows of
    ``train``, and the combined model's predictions for ``test`` are stored
    in ``test['predicted_score']`` and written to
    ``'../baseline_<name>.csv'``.

    Args:
        train: training DataFrame.
        test: test DataFrame; gains a ``predicted_score`` column when online.
        gbdt_features: columns fed to the GBDT.
        lr_features: raw columns stacked with the leaf encoding for the LR.
        target: name of the label column.
        name: suffix used in the output file name.
        isOnline: selects online submission mode vs. offline evaluation.
    """
    # Keep the original equality test so non-bool flags behave as before.
    offline = (isOnline == False)

    gbdt = GradientBoostingClassifier(n_estimators=20, max_depth=3, verbose=0, max_features=0.3)
    gbdt.fit(train[gbdt_features], train[target])

    # GBDT log-loss: on test when offline, else on the tail of train.
    if offline:
        gbdt_scores = gbdt.predict_proba(test[gbdt_features])[:, 1]
        gbdt_loss = log_loss(test[target], gbdt_scores)
    else:
        gbdt_scores = gbdt.predict_proba(train[gbdt_features].tail(57562))[:, 1]
        gbdt_loss = log_loss(train[target].tail(57562), gbdt_scores)
    print('gbdt log_loss: %.5f' % gbdt_loss)

    # Leaf index per tree for every sample (trailing size-1 axis dropped).
    leaves_train = gbdt.apply(train[gbdt_features])[:, :, 0]
    leaves_test = gbdt.apply(test[gbdt_features])[:, :, 0]
    n_train = leaves_train.shape[0]

    # One encoder over train + test leaves, split back by row position.
    encoded = OneHotEncoder().fit_transform(
        np.concatenate((leaves_train, leaves_test), axis=0))
    enc_train = encoded[:n_train, :]
    enc_test = encoded[n_train:, :]

    # LR on the encoded leaves only.
    leaf_lr = LogisticRegression()
    leaf_lr.fit(enc_train, train[target])
    if offline:
        leaf_scores = leaf_lr.predict_proba(enc_test)[:, 1]
        leaf_loss = log_loss(test[target], leaf_scores)
        print('基于GBDT特征编码后的LR log_loss: %.5f' % leaf_loss)
    else:
        print('Online')

    # LR on the encoded leaves stacked with the raw LR features.
    combo_lr = LogisticRegression()
    combo_train = hstack([enc_train, train[lr_features]])
    combo_test = hstack([enc_test, test[lr_features]])

    print("gbdt output", enc_train.shape)
    print("input", train[lr_features].shape)
    print(combo_train.shape)

    combo_lr.fit(combo_train, train[target])

    if offline:
        combo_scores = combo_lr.predict_proba(combo_test)[:, 1]
        combo_loss = log_loss(test[target], combo_scores)
        print('基于组合特征的LR log_loss: %.5f' % combo_loss)
        return

    print('Online')
    test['predicted_score'] = combo_lr.predict_proba(combo_test)[:, 1]
    print(test['predicted_score'].head(5))
    print(len(test))
    # Save the online submission file.
    out_path = '../baseline_' + name + '.csv'
    test[['instance_id', 'predicted_score']].to_csv(out_path, index=False, sep=' ')
    print('Saved result success!')
Ejemplo n.º 17
0
def gbdt_lr_train(libsvmFileName):
    """Benchmark GBDT, LR and GBDT+LR on a libsvm file, with diagnostics.

    The data is split 70/30 (seed 42).  Test AUC is printed for the GBDT,
    for LR on the raw features, for LR on the one-hot encoded leaf indices,
    and for LR on the leaf encoding stacked with the raw features.  Along
    the way the shapes of the intermediate matrices and the number of
    distinct leaves per tree are printed.

    All output uses the ``print()`` function: the former Python 2 ``print``
    statements were a SyntaxError on Python 3 and inconsistent with the
    ``print()`` calls already present in this function.

    Args:
        libsvmFileName: Path handed to ``load_svmlight_file``.

    Returns:
        None. Results are only printed.
    """
    X_all, y_all = load_svmlight_file(libsvmFileName)

    X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                        y_all,
                                                        test_size=0.3,
                                                        random_state=42)
    print("train data shape: ", X_train.shape)

    # GBDT on the raw features.
    gbdt = GradientBoostingClassifier(n_estimators=40,
                                      max_depth=3,
                                      verbose=0,
                                      max_features=0.5)
    gbdt.fit(X_train, y_train)

    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # LR on the raw features.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)

    # Leaf index per tree for every sample (trailing size-1 axis dropped).
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    print("gbdt leaves shape: ", X_train_leaves.shape)
    # Number of distinct leaves the training rows reach in each tree.
    for i in range(X_train_leaves.shape[1]):
        print("F%d: %d" % (i, len(np.unique(X_train_leaves[:, i]))))

    # One encoder over train + test leaves, split back by row position.
    train_rows = X_train_leaves.shape[0]
    # NOTE(review): the ``sparse`` keyword was renamed ``sparse_output`` in
    # newer scikit-learn releases -- adjust if the installed version
    # rejects it.
    gbdtenc = OneHotEncoder(sparse=False, categories='auto')
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print("gbdt oneHot shape: ", X_trans.shape)
    print("oneHot leaves: ", X_trans[0])

    # LR on the encoded leaves only.
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)

    # LR on the encoded leaves stacked with the raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print("gbdt leaves cross", X_train_ext.shape)
    lr.fit(X_train_ext, y_train)

    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
Ejemplo n.º 18
0
        lr = LogisticRegression(n_jobs=1)
        # lr对gbdt特征编码后的样本模型训练
        lr.fit(X_trans[:train_rows, :], Y_train)
        # 预测及AUC评测
        Y_predict_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
        print('test:', log_loss(Y_test, Y_predict_gbdtlr1))
        print "before Accuracy : %.4f" % metrics.roc_auc_score(Y_test, Y_predict_gbdtlr1)
    
        # ('test:', 0.082230433929968566)
        # before Accuracy: 0.6818
        """
    # 定义LR模型
    lr = LogisticRegression(n_jobs=1)

    # 组合特征
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    # lr对组合特征的样本模型训练
    lr.fit(X_train_ext, Y_train)
    Y_predict_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    print('test:', log_loss(Y_test, Y_predict_gbdtlr2))
    print "before Accuracy : %.4f" % metrics.roc_auc_score(
        Y_test, Y_predict_gbdtlr2)
    # ('test:', 0.095052240862119497)
    # before Accuracy : 0.5413
    """
    
    # 合并编码后的训练数据和测试数据
    All_leaves = np.concatenate((X_train_leaves, X_test_leaves), axis=0)
    All_leaves = All_leaves.astype(np.int32)
Ejemplo n.º 19
0
def xgb_lr_train():
    """Cross-validate XGBoost and two XGBoost+LR variants on globals X, y.

    Runs a shuffled, stratified 5-fold split (seed 2019).  In each fold the
    validation AUC is printed and recorded for the XGBoost classifier, for
    LR on the one-hot encoded leaf indices, and for LR on the leaf encoding
    stacked with the raw features.  The fold means are printed at the end.

    Returns:
        None. Results are only printed.
    """
    xgb_aucs = []
    leaf_lr_aucs = []
    combo_lr_aucs = []

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    for fit_idx, val_idx in folds.split(X, y):
        X_fit, X_val = X[fit_idx], X[val_idx]
        y_fit, y_val = y[fit_idx], y[val_idx]

        # XGBoost on the raw features.
        booster = xgb.XGBClassifier(nthread=4, learning_rate=0.08,
                                    n_estimators=100, max_depth=4,
                                    gamma=0, subsample=0.7, colsample_bytree=0.7,
                                    verbosity=1)
        booster.fit(X_fit, y_fit)
        booster_scores = booster.predict_proba(X_val)[:, 1]
        booster_auc = roc_auc_score(y_val, booster_scores)
        print('基于原有特征的xgb auc: %.5f' % booster_auc)
        xgb_aucs.append(booster_auc)

        # Leaf index per tree, train and validation rows stacked and cast
        # to int32 before one-hot encoding.
        leaves_fit = booster.apply(X_fit)
        leaves_val = booster.apply(X_val)
        stacked_leaves = np.concatenate((leaves_fit, leaves_val), axis=0)
        stacked_leaves = stacked_leaves.astype(np.int32)
        encoded = OneHotEncoder().fit_transform(stacked_leaves)
        (n_fit, _) = leaves_fit.shape
        enc_fit = encoded[:n_fit, :]
        enc_val = encoded[n_fit:, :]

        # LR on the encoded leaves only.
        leaf_lr = LogisticRegression()
        leaf_lr.fit(enc_fit, y_fit)
        leaf_scores = leaf_lr.predict_proba(enc_val)[:, 1]
        leaf_auc = roc_auc_score(y_val, leaf_scores)
        print('基于Xgb特征编码后的LR AUC: %.5f' % leaf_auc)
        leaf_lr_aucs.append(leaf_auc)

        # LR on the encoded leaves stacked with the raw features.
        combo_lr = LogisticRegression(n_jobs=-1)
        combo_fit = hstack([enc_fit, X_fit])
        combo_val = hstack([enc_val, X_val])
        combo_lr.fit(combo_fit, y_fit)

        combo_scores = combo_lr.predict_proba(combo_val)[:, 1]
        combo_auc = roc_auc_score(y_val, combo_scores)
        print('基于组合特征的LR AUC: %.5f' % combo_auc)
        combo_lr_aucs.append(combo_auc)

    mean_leaf_lr = np.mean(leaf_lr_aucs)
    mean_combo_lr = np.mean(combo_lr_aucs)
    mean_xgb = np.mean(xgb_aucs)

    print("==" * 20)
    print("xgb原始特征cv_gbdt:", mean_xgb)
    print("lr基于xgb的特征cv_lr_trans:", mean_leaf_lr)
    print("lr基于xgb特征个原始特征cv_lr_trans_raw:", mean_combo_lr)
Ejemplo n.º 20
0
def gbdt_lr_train():
    """Cross-validate GBDT, LR and two GBDT+LR variants on globals X, y.

    Runs a shuffled, stratified 5-fold split (seed 2019).  In each fold the
    validation AUC is printed and recorded for the GBDT, for LR on the raw
    features, for LR on the one-hot encoded leaf indices, and for LR on the
    leaf encoding stacked with the raw features.  The fold means are
    printed at the end.

    Returns:
        None. Results are only printed.
    """
    raw_lr_aucs = []
    leaf_lr_aucs = []
    combo_lr_aucs = []
    gbdt_aucs = []

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    for fit_idx, val_idx in folds.split(X, y):
        X_fit, X_val = X[fit_idx], X[val_idx]
        y_fit, y_val = y[fit_idx], y[val_idx]

        # GBDT on the raw features.
        booster = GradientBoostingClassifier(n_estimators=60, max_depth=3, verbose=0, max_features=0.5)
        booster.fit(X_fit, y_fit)
        booster_scores = booster.predict_proba(X_val)[:, 1]
        booster_auc = roc_auc_score(y_val, booster_scores)
        print('基于原有特征的gbdt auc: %.5f' % booster_auc)
        gbdt_aucs.append(booster_auc)

        # LR on the raw features.
        raw_lr = LogisticRegression()
        raw_lr.fit(X_fit, y_fit)
        raw_scores = raw_lr.predict_proba(X_val)[:, 1]
        raw_auc = roc_auc_score(y_val, raw_scores)
        print('基于原有特征的LR AUC: %.5f' % raw_auc)
        raw_lr_aucs.append(raw_auc)

        # Leaf index per tree for every sample (trailing size-1 axis dropped).
        leaves_fit = booster.apply(X_fit)[:, :, 0]
        leaves_val = booster.apply(X_val)[:, :, 0]
        n_fit = leaves_fit.shape[0]

        # One encoder over train + validation leaves, split back by row.
        encoded = OneHotEncoder().fit_transform(
            np.concatenate((leaves_fit, leaves_val), axis=0))
        enc_fit = encoded[:n_fit, :]
        enc_val = encoded[n_fit:, :]

        # LR on the encoded leaves only.
        leaf_lr = LogisticRegression()
        leaf_lr.fit(enc_fit, y_fit)
        leaf_scores = leaf_lr.predict_proba(enc_val)[:, 1]
        leaf_auc = roc_auc_score(y_val, leaf_scores)
        print('基于GBDT特征编码后的LR AUC: %.5f' % leaf_auc)
        leaf_lr_aucs.append(leaf_auc)

        # LR on the encoded leaves stacked with the raw features.
        combo_lr = LogisticRegression(n_jobs=-1)
        combo_fit = hstack([enc_fit, X_fit])
        combo_val = hstack([enc_val, X_val])

        print(combo_fit.shape)
        combo_lr.fit(combo_fit, y_fit)

        combo_scores = combo_lr.predict_proba(combo_val)[:, 1]
        combo_auc = roc_auc_score(y_val, combo_scores)
        print('基于组合特征的LR AUC: %.5f' % combo_auc)
        combo_lr_aucs.append(combo_auc)

    mean_raw_lr = np.mean(raw_lr_aucs)
    mean_leaf_lr = np.mean(leaf_lr_aucs)
    mean_combo_lr = np.mean(combo_lr_aucs)
    mean_gbdt = np.mean(gbdt_aucs)
    print("==" * 20)
    print("gbdt原始特征cv_gbdt:", mean_gbdt)
    print("lr原始特征cv_lr:", mean_raw_lr)
    print("lr基于gbdt的特征cv_lr_trans:", mean_leaf_lr)
    print("lr基于gbdt特征个原始特征cv_lr_trans_raw:", mean_combo_lr)
Ejemplo n.º 21
0
def gbdt_lr_train(libsvmFileName):
    """Benchmark GBDT, LR and GBDT+LR on a libsvm file, printing type info.

    The data is split 70/30 (seed 42).  Test AUC is printed for the GBDT,
    for LR on the raw features, for LR on the one-hot encoded leaf indices,
    and for LR on the leaf encoding stacked with the raw features.  The
    types and shapes of several intermediates are printed along the way.

    The GBDT test probabilities are computed once; an earlier duplicate
    ``predict_proba`` call whose result was immediately overwritten has
    been removed.

    Side effect: numpy print options are changed globally (line width 400,
    no summarisation threshold).

    Args:
        libsvmFileName: Path handed to ``load_svmlight_file``.

    Returns:
        None. Results are only printed.
    """
    X_all, y_all = load_svmlight_file(libsvmFileName)
    print(type(X_all))

    X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                        y_all,
                                                        test_size=0.3,
                                                        random_state=42)

    # GBDT on the raw features.
    gbdt = GradientBoostingClassifier(n_estimators=40,
                                      max_depth=3,
                                      verbose=0,
                                      max_features=0.5)
    gbdt.fit(X_train, y_train)

    toarray = X_test.toarray()
    print(type(toarray))
    y_pred_gbdt = gbdt.predict_proba(toarray)[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # LR on the raw features.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)

    # Leaf index per tree for every sample (trailing size-1 axis dropped).
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    np.set_printoptions(linewidth=400)
    np.set_printoptions(threshold=np.inf)
    print(type(X_train_leaves))
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    (train_rows, cols) = X_train_leaves.shape
    print(train_rows, cols)

    # One encoder over train + test leaves, split back by row position.
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print(X_trans.shape)

    # LR on the encoded leaves only.
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)

    # LR on the encoded leaves stacked with the raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print("组合特征的个数:", X_train_ext.shape)
    lr.fit(X_train_ext, y_train)

    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)