Code Example #1
File: testRLS.py Project: jbjorne/CAMDA2014
def testRLS(input):
    X, Y = svmlight_format.load_svmlight_file(input)

    hoindices = range(int(0.1 * len(Y)))
    hocompl = list(set(range(len(Y))) - set(hoindices))
    trainX = X[hocompl]
    testX = X[hoindices]
    trainY = Y[hocompl]
    testY = Y[hoindices]
    print len(trainY), len(testY)

    kwargs = {}
    kwargs["train_features"] = trainX
    kwargs["train_labels"] = trainY

    rls = RLS.createLearner(**kwargs)
    rls.train()
    bestperf = -1.0
    for logrp in range(-5, 5):
        rp = 2.0 ** logrp
        rls.solve(rp)
        Ploo = rls.computeLOO()
        perf = cindex(trainY, Ploo)
        print logrp, perf
        if perf > bestperf:
            bestperf = perf
            bestlogrp = logrp
    rp = 2.0 ** bestlogrp
    rls.solve(rp)
    P = rls.getModel().predict(testX)
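The excerpt ends right after predicting on the held-out rows. A plausible final step, mirroring how the LOO predictions are scored above (an assumption, not part of the original file):

    # hypothetical: score the held-out predictions with the same
    # concordance index used for the LOO estimates above
    print cindex(testY, P)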
Code Example #2
def load_data(data_path):
    X_all, y_all = load_svmlight_file(data_path)
    X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                        y_all,
                                                        test_size=0.3,
                                                        random_state=42)
    return X_train, X_test, y_train, y_test
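A minimal usage sketch for load_data; the file path is hypothetical, and the two imports are the ones the function body needs (train_test_split lives in sklearn.model_selection in current releases, sklearn.cross_validation in older ones):

from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split

# hypothetical path to a libsvm/svmlight-format feature file
X_train, X_test, y_train, y_test = load_data('features.libsvm')
print(X_train.shape, X_test.shape)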
Code Example #3
File: gbdt_lr.py Project: rogeroyer/dataCastle
def gbdt_lr_train(libsvmFileName):

    # load the sample data
    X_all, y_all = load_svmlight_file(libsvmFileName)

    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3, random_state=42)

    # define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0, max_features=0.5)

    # train the model
    gbdt.fit(X_train, y_train)

    # predict and evaluate AUC
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # train LR on the raw features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    # predict and evaluate AUC
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC on raw features: %.5f' % lr_test_auc)

    # encode the raw features as GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # one-hot encode all leaf features
    (train_rows, cols) = X_train_leaves.shape

    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))

    # define the LR model
    lr = LogisticRegression()
    # train LR on the GBDT-encoded features
    lr.fit(X_trans[:train_rows, :], y_train)
    # predict and evaluate AUC
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC on GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # combine the encoded leaves with the raw features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print(X_train_ext.shape)
    # train LR on the combined features
    lr.fit(X_train_ext, y_train)

    # predict and evaluate AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC on combined features: %.5f' % gbdt_lr_auc2)
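The key step above is the leaf encoding: gbdt.apply(X) returns, for each sample, the index of the leaf it reaches in every one of the n_estimators trees, and one-hot encoding those indices turns each tree into a learned categorical feature for the downstream LR. A stripped-down, self-contained sketch of just that step (synthetic data; all names here are illustrative):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
gbdt = GradientBoostingClassifier(n_estimators=5, max_depth=3, random_state=0)
gbdt.fit(X, y)

# (n_samples, n_estimators) matrix of leaf indices, one column per tree
leaves = gbdt.apply(X)[:, :, 0]
# after one-hot encoding: one column per (tree, leaf) pair
encoded = OneHotEncoder().fit_transform(leaves)
print(leaves.shape, encoded.shape)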
Code Example #4
File: evaluation.py Project: laurensvdwiel/KeCo
def perform_experiment(T, V, experiment_parameters, evaluation_strategy, dataset_location, n_folds=10, labelled_percentage=1.0, random_seed=None, use_unlabelled=True, use_parallel=True):
    """Each experiment we split the data into three parts, where two parts are used
for training and remaining one is used for testing, we repeat this three times,
until all parts have been considered as testing. The result of an experiment is
the average performance over the three test parts."""
    X, Y = load_svmlight_file(dataset_location)
    
    # ensure the dataset gets split into multiple views
    X_views = split_dataset_into_random_views(X, V, random_seed)
    
    # retrieve the train-test folds
    folds = StratifiedShuffleSplit(Y, test_size=0.3, random_state=random_seed)
    for train_index, test_index in folds:
        X_train = {n:X_views[n][train_index] for n in X_views.keys()}
        X_test = {n:X_views[n][test_index] for n in X_views.keys()}
        y_train, y_test = Y[train_index], Y[test_index]
    
    # randomly unlabel a fraction of the training set
    np.random.seed(random_seed)
    unlabel = np.random.random(len(y_train))
    for i in range(len(unlabel)):
        if unlabel[i] > labelled_percentage:
            y_train[i] = 0.0
    
    # grid search for the best grid
    best_grid = gridsearch(X_train, y_train, T, V, experiment_parameters, n_folds, evaluation_strategy, use_unlabelled, use_parallel, random_seed)
    
    # predetermine the order of samples
    order_of_samples = evaluation_strategy(y_train, T, use_unlabelled, random_seed)

    # generate the model
    alphas, predictions = training(X_train, y_train, V, order_of_samples, best_grid['grid']['kernel_method'], best_grid['grid']['kernel_parameters'], best_grid['grid']['lambdas'])
    
    # test the model    
    y_preds_est = []
    y_preds = []
    for i in range(len(y_test)):
        y_pred = {}
        y_pred_est = 0.0
        for n in range(V):
            y_pred[n] = coagreement_prediction_for_view_n(X_test[n][i], X_train, y_train, V, n, T+1, predictions, alphas, best_grid['grid']['kernel_method'], best_grid['grid']['kernel_parameters'], best_grid['grid']['lambdas'])
            y_pred_est += y_pred[n]
        y_preds.append(y_pred)
        y_preds_est.append(y_pred_est/V)
    
    # retrieve the metrics
    AUC, fpr, tpr = area_under_the_roc_curve(y_test, y_preds_est)
    
    print('Achieved in validation ' + str(AUC) + ' AUC, and in training ' + str(best_grid['AUC']) + ' over ' + str(n_folds) + ' folds')
        
    return {"auc":AUC, "fpr":fpr, "tpr":tpr, "model":alphas, "best_grid":best_grid}
Code Example #5
def gbdt_lr_train(self, Train_tab, Train_libsvm):
    # load the sample data (note: X_all / y_all are loaded but never used;
    # the split below works on the Train_libsvm / Train_tab arguments)
    X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")
    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(Train_libsvm, Train_tab, test_size=0.1, random_state=42)
    # fit the GBDT model (defined elsewhere on self)
    self.gbdt.fit(X_train, y_train)
    # encode the raw features as GBDT leaf indices
    self.X_train_leaves = self.gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = self.gbdt.apply(X_test)[:, :, 0]
    # one-hot encode all leaf features
    (self.train_rows, cols) = self.X_train_leaves.shape
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((self.X_train_leaves, X_test_leaves), axis=0))
    X_train_ext = hstack([X_trans[:self.train_rows, :], X_train])
    # train LR on the combined features
    self.lr.fit(X_train_ext, y_train)
Code Example #6
def gbdt_lr_train(self, Train_tab, Train_libsvm):
    # load the sample data
    X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")
    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                        y_all,
                                                        test_size=0.1,
                                                        random_state=42)
    # define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40,
                                      max_depth=3,
                                      verbose=0,
                                      max_features=0.5)
    # train the model
    gbdt.fit(X_train, y_train)
    # encode the raw features as GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    # one-hot encode all leaf features
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    # define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # combine the encoded leaves with the raw features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    # train LR on the combined features
    lr.fit(X_train_ext, y_train)
    # persist the trained model
    filename = 'finalized_model.sav'
    pickle.dump(lr, open(filename, 'wb'))
    # load the model from disk and predict
    loaded_model = pickle.load(open(filename, 'rb'))
    y_pred_gbdtlr2 = loaded_model.predict_proba(X_test_ext)[:, 1]
    print(y_pred_gbdtlr2)
Code Example #7
def gbdt_lr_train(libsvmFileName):

    # load the sample data
    X_all, y_all = load_svmlight_file(libsvmFileName)

    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                        y_all,
                                                        test_size=0.3,
                                                        random_state=42)
    print("train data shape: ", X_train.shape)

    # train the model
    gbdt = GradientBoostingClassifier(n_estimators=40,
                                      max_depth=3,
                                      verbose=0,
                                      max_features=0.5)
    gbdt.fit(X_train, y_train)

    # predict and evaluate AUC
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)

    # train LR on the raw features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    # predict and evaluate AUC
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC on raw features: %.5f' % lr_test_auc)

    # encode the raw features as GBDT leaf indices
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    print("gbdt leaves shape: ", X_train_leaves.shape)
    # count the number of distinct leaf values per tree
    for i in range(0, len(X_train_leaves[0])):
        cateMap = {}
        for j in range(0, len(X_train_leaves)):
            cateMap[X_train_leaves[j][i]] = 0
        print("F%d: %d" % (i, len(cateMap)))

    # one-hot encode all leaf features
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder(sparse=False, categories='auto')
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print("gbdt oneHot shape: ", X_trans.shape)
    print("oneHot leaves: ", X_trans[0])
    # define the LR model
    lr = LogisticRegression()
    # train LR on the GBDT-encoded features
    lr.fit(X_trans[:train_rows, :], y_train)
    # predict and evaluate AUC
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC on GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # combine the encoded leaves with the raw features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print("gbdt leaves cross", X_train_ext.shape)
    # train LR on the combined features
    lr.fit(X_train_ext, y_train)

    # predict and evaluate AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC on combined features: %.5f' % gbdt_lr_auc2)
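As an aside, the nested counting loop above (distinct leaf values per tree) has a more idiomatic equivalent with np.unique, producing the same F%d output:

    # same per-tree cardinality count, vectorized per column
    for i in range(X_train_leaves.shape[1]):
        print("F%d: %d" % (i, len(np.unique(X_train_leaves[:, i]))))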
Code Example #8
from numpy import array
from nltk.metrics import ConfusionMatrix
import sys
import sklearn.datasets.svmlight_format as svmlight
from sklearn.linear_model import SGDClassifier
from cv import XValMT

def warning(*objs):
    print("evaluate.py: WARNING: ", *objs, file=sys.stderr)

if len(sys.argv) > 1: 
    svm_light_in  = sys.argv[1]
else:
    warning("No feature file loaded")
    sys.exit(1)

print ("Loading dataset %s..."%svm_light_in)
feat_vecs,labels = svmlight.load_svmlight_file(svm_light_in)
print ('done\n')


# initialize NB classifier
#clf = MultinomialNB()

#clf = BernoulliNB()
#clf.class_prior =[0.041175856307435255,0.9588241436925647]

#clf = svm.SVC()
#clf.cache_size = 4000
#clf.n_jobs = -1
#clf.C = .1

clf = SGDClassifier()
Code Example #9
File: rbm.py Project: zhangchao1194/ID_01
    # return a tuple of the data matrix and targets
    return (np.array(data), np.array(target))
if __name__ == '__main__':
    # construct the argument parser and parse the arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("-d", "--dataset", required = True,
	    help = "path of data set")
    ap.add_argument("-t", "--test", required = True, type = float,
	    help = "size of test split")
    ap.add_argument("-s", "--search", type = int, default = 0,
	    help = "whether or not a grid search should be performed")
    args = vars(ap.parse_args())
    
    X = svmlight_format.load_svmlight_file(args['dataset'])
    # Test = svmlight_format.load_svmlight_file(args['test_dataset'])
    XX = X[0].toarray()
    XX = XX.astype("float32")
    yy = X[1]
    XX = scale(XX)
    (trainX, testX, trainY, testY) = train_test_split(XX, yy, test_size=args['test'], random_state=42)
    
    if args["search"] == 1:
        # perform a grid search on the 'C' parameter of Logistic
        # Regression
        print "SEARCHING LOGISTIC REGRESSION"
        params = {"C": [1.0, 10.0, 100.0]}
        start = time.time()
        gs = GridSearchCV(LogisticRegression(), params, n_jobs = -1, verbose = 1)
        gs.fit(trainX, trainY)
Code Example #10
from sklearn import linear_model
import sklearn.datasets.svmlight_format as svmlight
from sklearn import grid_search
import cPickle

parameters = {
    'alpha': [.000001, .0000001, .00000001],
    'n_iter': [5000, 10000, 15000]
}
feat_vecs, labels = svmlight.load_svmlight_file('featureFile.dat')
svr = linear_model.SGDClassifier()
svr.n_jobs = -1
clf = grid_search.GridSearchCV(svr, parameters)
clf.n_jobs = -1
clf.fit(feat_vecs, labels)

print clf.grid_scores_
with open('gridSearch.out', 'w') as fo:
    cPickle.dump(clf.grid_scores_, fo)
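This snippet targets long-removed APIs: the sklearn.grid_search module, the grid_scores_ attribute, and cPickle are all Python 2 era. A sketch of the same search against the current scikit-learn API (a best-effort mapping, not a drop-in from any project):

import pickle
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

parameters = {
    'alpha': [1e-6, 1e-7, 1e-8],
    'max_iter': [5000, 10000, 15000],  # n_iter was renamed max_iter
}
clf = GridSearchCV(SGDClassifier(), parameters, n_jobs=-1)
clf.fit(feat_vecs, labels)

print(clf.cv_results_)  # replaces grid_scores_
with open('gridSearch.out', 'wb') as fo:  # pickle needs a binary-mode file
    pickle.dump(clf.cv_results_, fo)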
Code Example #11
#!/usr/bin/python

from pdb import set_trace
import sklearn.datasets.svmlight_format as svmlight
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.cross_validation import KFold
from operator import itemgetter
from random import shuffle
from numpy import array

print "Loading dataset ..."
svm_light_in = 'featureFile.dat'

feat_vecs, labels = svmlight.load_svmlight_file(svm_light_in)
print 'done\n'

# initialize chi2 filtering object
yes_chi2 = False
k = 20000
ch2 = SelectKBest(chi2, k=k)

# initialize NB classifier
clf = MultinomialNB()
#clf = svm.SVC()

# how to divide sample sizes
bins = 6
samp_size = len(labels) / bins
Code Example #12
ds_paths = [pjoin(ds_dir, 'libsvm', name) for name in ds_names]


def sigma_from_gamma(gamma=0.1):
    return _ensure_min_eps(np.sqrt(1.0 / (2 * gamma)))


def gamma_from_sigma(sigma=0.1):
    return _ensure_min_eps(1.0 / (2 * sigma**2))


for name, ds_path in zip(ds_names, ds_paths):
    time_stamp = strftime("%H:%M:%S", gmtime())

    X, y = load_svmlight_file(ds_path)
    X = X.toarray()

    print('\n{:10}  {:20} {}'.format(time_stamp, name, X.shape))

    gamma = 0.1
    skl_svm = SVC(C=1.0, kernel='rbf', gamma=gamma)
    ss_cv1 = ShuffleSplit(n_splits=20, train_size=0.8, test_size=0.2)
    scores_skl = cross_val_score(skl_svm, X, y, cv=ss_cv1)

    ker_func = GaussianKernel(sigma=sigma_from_gamma(gamma))
    km_svm = KernelMachine(k_func=ker_func, learner_id='SVM', normalized=False)
    ss_cv2 = ShuffleSplit(n_splits=20, train_size=0.8, test_size=0.2)
    scores_km = cross_val_score(km_svm, X, y, cv=ss_cv2)

    print('\tSKLearn    Accuracy: {:.4f} +/- {:.4f}'
          .format(scores_skl.mean(), scores_skl.std()))
Code Example #13
import argparse
import sys

from sklearn.cross_validation import StratifiedKFold  # sklearn.model_selection in current releases
from sklearn.datasets.svmlight_format import load_svmlight_file
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn import svm

parser = argparse.ArgumentParser(description='Program to do some simple gridsearch')
parser.add_argument("libsvm_file", type=str)

args = parser.parse_args()

if args.libsvm_file is None:
    parser.print_help()
    sys.exit(-1)

X, y = load_svmlight_file(args.libsvm_file)


# split the dataset in two equal part respecting label proportions
train, test = iter(StratifiedKFold(y, 2, indices=True)).next()


tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = [
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1_score', f1_score)
]
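The excerpt ends after setting up tuned_parameters and scores. In the classic scikit-learn grid-search recipe this setup feeds a loop like the one below; this is a hedged reconstruction in the same old-API style as the snippet (score_func was the pre-0.14 scoring hook), not the original file's code:

from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in current releases

for score_name, score_func in scores:
    clf = GridSearchCV(svm.SVC(), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train])
    print("Best estimator for %s:" % score_name)
    print(clf.best_estimator_)
    # evaluate the refit best estimator on the held-out half
    y_pred = clf.predict(X[test])
    print(score_func(y[test], y_pred))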
Code Example #14
File: train.py Project: zhangchao1194/ID_01
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import numpy as np
from sklearn.datasets import svmlight_format
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression

if __name__ == '__main__':
    num_epoches = 10
    Train = svmlight_format.load_svmlight_file('./feature_train.txt')
    Test = svmlight_format.load_svmlight_file('./feature_test.txt')
    #method-0
#    model = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=-1, random_state=None, verbose=0)
    
    #method-1
    model = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None)

    #method-2
    #model = svm.libsvm.fit( np.array( training_data,dtype=np.float64), np.array( training_label,dtype=np.float64), kernel='linear' )
    for epoch in xrange(num_epoches):
        print "learning epoch: ", epoch, "/", num_epoches
        #method-0
#        model.fit( Train[0].toarray(), Train[1] )
        
        #method-1
        model.fit( Train[0], Train[1] )
    print "testing..."
    #output = model.predict(predict_data)
Code Example #15
from sklearn import linear_model
import sklearn.datasets.svmlight_format as svmlight
from sklearn import grid_search
from sklearn.svm import LinearSVC,SVC
import cPickle

#-----------------SGD-------------------------------
parameters = {'alpha':[.000001,.0000001,.00000001,.000000001],'n_iter':[5000,10000,15000,30000]}
svr = linear_model.SGDClassifier()
svr.n_jobs = -1

# -------- SVM--------------------
#svr = SVC(class_weight='auto')
#parameters = {'C':range(500,5000,1000)}

print "Loading feature file..."
feat_vecs,labels = svmlight.load_svmlight_file('feature_files/+3-3/feats-Dev.dat')
print "Done"
clf = grid_search.GridSearchCV(svr, parameters)
clf.n_jobs = -1
clf.fit(feat_vecs,labels)

print clf.grid_scores_
with open('gridSearch.cPickle', 'wb') as fo:
    cPickle.dump(clf.grid_scores_,fo)
Code Example #16
from scipy.stats.stats import pearsonr
import numpy as np
from sklearn.svm.classes import NuSVR
from sklearn.datasets.svmlight_format import load_svmlight_file
# svm_read_problem / svm_train / svm_predict come from LIBSVM's Python
# bindings; this import is assumed, as it was not shown in the excerpt
from svmutil import svm_read_problem, svm_train, svm_predict

if __name__ == '__main__':

    trainfile = './data/svm_train.txt'

    problem = svm_read_problem(trainfile)
    rank_model = svm_train(problem[0][:-100], problem[1][:-100],
                           '-s 4 -h 0 -m 1000')

    predicted_f, _, _ = svm_predict(
        np.ones(100).tolist(), problem[1][-100:], rank_model)

    scores_rank_test = problem[0][-100:]

    print(("Pearson correlation for fold = %f" %
           pearsonr(scores_rank_test, predicted_f)[0]))

    svr = NuSVR()

    lingfeat, y = load_svmlight_file(trainfile)

    svr.fit(lingfeat[:-100], y[:-100])
    y_pred = svr.predict(lingfeat[-100:])

    print(("Pearson correlation for fold = %f" %
           pearsonr(scores_rank_test, y_pred)[0]))
Code Example #17
def gbdt_lr_train(libsvmFileName):

    # load the sample data
    X_all, y_all = load_svmlight_file(libsvmFileName)
    # X_all_dense = X_all.todense()
    print(type(X_all))
    # print(type(X_all_dense[0]))
    # print(y_all)
    # print("===")

    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                        y_all,
                                                        test_size=0.3,
                                                        random_state=42)
    # print(X_train)
    # print(y_train)

    # define the GBDT model
    gbdt = GradientBoostingClassifier(n_estimators=40,
                                      max_depth=3,
                                      verbose=0,
                                      max_features=0.5)

    # train the model
    gbdt.fit(X_train, y_train)

    # predict and evaluate AUC
    toarray = X_test.toarray()
    print(type(toarray))
    y_pred_gbdt = gbdt.predict_proba(toarray)[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)  # gbdt auc: 0.96455

    # train LR on the raw features
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    # predict and evaluate AUC
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('LR AUC on raw features: %.5f' % lr_test_auc)  # LR AUC on raw features: 0.93455

    # encode the raw features as GBDT leaf indices
    # X_train_leaves = gbdt.apply(X_train)
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    np.set_printoptions(linewidth=400)
    np.set_printoptions(threshold=np.inf)
    # print(X_train_leaves[0:22, :])  # print 22 rows, all columns
    print(type(X_train_leaves))
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]

    # one-hot encode all leaf features
    (train_rows, cols) = X_train_leaves.shape
    print(train_rows, cols)

    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print(X_trans.shape)
    # print(X_trans.todense()[0:22,:])

    # define the LR model
    lr = LogisticRegression()
    # train LR on the GBDT-encoded features
    lr.fit(X_trans[:train_rows, :], y_train)
    # predict and evaluate AUC
    # print(X_trans[train_rows:, :])
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('LR AUC on GBDT-encoded features: %.5f' % gbdt_lr_auc1)

    # define the LR model
    lr = LogisticRegression(n_jobs=-1)
    # combine the encoded leaves with the raw features
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])

    print("number of combined features:", X_train_ext.shape)
    # train LR on the combined features
    lr.fit(X_train_ext, y_train)

    # predict and evaluate AUC
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('LR AUC on combined features: %.5f' % gbdt_lr_auc2)