def testRLS(input):
    """Train an RLS learner on 90% of a svmlight dataset, tune the
    regularization parameter by leave-one-out c-index on the training part,
    and return predictions for the held-out 10%.

    Parameters
    ----------
    input : str
        Path to a dataset in svmlight/libsvm format.

    Returns
    -------
    Predictions of the tuned model on the hold-out examples.
        (Fix: the original computed these and silently discarded them.)
    """
    X, Y = svmlight_format.load_svmlight_file(input)
    # The first 10% of rows form the hold-out set; the remainder is training.
    hoindices = range(int(0.1 * len(Y)))
    hocompl = list(set(range(len(Y))) - set(hoindices))
    trainX = X[hocompl]
    testX = X[hoindices]
    trainY = Y[hocompl]
    testY = Y[hoindices]
    print(len(trainY), len(testY))
    kwargs = {}
    kwargs["train_features"] = trainX
    kwargs["train_labels"] = trainY
    rls = RLS.createLearner(**kwargs)
    rls.train()
    # Search the regularization parameter on a log2 grid, scoring each
    # candidate with the leave-one-out predictions' c-index.
    bestperf = -1.0
    bestlogrp = -5  # fix: ensure the name is bound even if no candidate wins
    for logrp in range(-5, 5):
        rp = 2.0 ** logrp
        rls.solve(rp)
        Ploo = rls.computeLOO()
        perf = cindex(trainY, Ploo)
        print(logrp, perf)
        if perf > bestperf:
            bestperf = perf
            bestlogrp = logrp
    # Re-solve with the best parameter and predict the hold-out set.
    rp = 2.0 ** bestlogrp
    rls.solve(rp)
    P = rls.getModel().predict(testX)
    return P
def load_data(data_path):
    """Load a svmlight-format file and split it 70/30 into train and test.

    Parameters
    ----------
    data_path : str
        Path to the svmlight/libsvm file.

    Returns
    -------
    (X_train, X_test, y_train, y_test), split reproducibly with
    random_state=42.
    """
    features, labels = load_svmlight_file(data_path)
    split = train_test_split(features, labels, test_size=0.3, random_state=42)
    X_train, X_test, y_train, y_test = split
    return X_train, X_test, y_train, y_test
def gbdt_lr_train(libsvmFileName):
    """Benchmark four models on one svmlight dataset and print each AUC:
    GBDT alone, LR on raw features, LR on one-hot-encoded GBDT leaf
    indices, and LR on the leaf encoding concatenated with the raw
    features.

    Parameters
    ----------
    libsvmFileName : str
        Path to a dataset in svmlight/libsvm format.
    """
    # Load the samples and make a reproducible 70/30 train/test split.
    data, target = load_svmlight_file(libsvmFileName)
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=0.3, random_state=42)

    # --- GBDT alone -------------------------------------------------------
    gbdt = GradientBoostingClassifier(
        n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)
    gbdt_probs = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, gbdt_probs)
    print('gbdt auc: %.5f' % gbdt_auc)

    # --- LR on the raw features ------------------------------------------
    raw_lr = LogisticRegression()
    raw_lr.fit(X_train, y_train)
    raw_probs = raw_lr.predict_proba(X_test)[:, 1]
    raw_auc = roc_auc_score(y_test, raw_probs)
    print('基于原有特征的LR AUC: %.5f' % raw_auc)

    # --- LR on one-hot-encoded GBDT leaf indices -------------------------
    # apply() yields the leaf index each sample lands in, per tree.
    train_leaves = gbdt.apply(X_train)[:, :, 0]
    test_leaves = gbdt.apply(X_test)[:, :, 0]
    (n_train, _) = train_leaves.shape
    encoder = OneHotEncoder()
    encoded = encoder.fit_transform(
        np.concatenate((train_leaves, test_leaves), axis=0))
    leaf_lr = LogisticRegression()
    leaf_lr.fit(encoded[:n_train, :], y_train)
    leaf_probs = leaf_lr.predict_proba(encoded[n_train:, :])[:, 1]
    leaf_auc = roc_auc_score(y_test, leaf_probs)
    print('基于GBDT特征编码后的LR AUC: %.5f' % leaf_auc)

    # --- LR on encoded leaves concatenated with raw features -------------
    combo_lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([encoded[:n_train, :], X_train])
    X_test_ext = hstack([encoded[n_train:, :], X_test])
    print(X_train_ext.shape)
    combo_lr.fit(X_train_ext, y_train)
    combo_probs = combo_lr.predict_proba(X_test_ext)[:, 1]
    combo_auc = roc_auc_score(y_test, combo_probs)
    print('基于组合特征的LR AUC: %.5f' % combo_auc)
def perform_experiment(T, V, experiment_parameters, evaluation_strategy, dataset_location, n_folds=10, labelled_percentage=1.0, random_seed=None, use_unlabelled=True, use_parallel=True):
    """Run one semi-supervised multi-view co-agreement experiment.

    Loads a svmlight dataset, splits it into V random feature views,
    partially unlabels the training set, grid-searches hyper-parameters,
    trains per-view models, and evaluates the averaged per-view
    predictions on the test split.

    The original docstring described a 3-fold rotation; as written the
    function returns after the first train/test split produced by
    StratifiedShuffleSplit (note the `return` inside the fold loop).

    Returns a dict with keys "auc", "fpr", "tpr", "model" (the alphas)
    and "best_grid".

    NOTE(review): split_dataset_into_random_views, gridsearch, training,
    coagreement_prediction_for_view_n and area_under_the_roc_curve are
    project helpers defined outside this excerpt; the old
    StratifiedShuffleSplit(Y, ...) call signature and the `print`
    statement indicate Python-2-era scikit-learn.
    """
    X, Y = load_svmlight_file(dataset_location)
    # ensure the dataset gets split into multiple views
    X_views = split_dataset_into_random_views(X, V, random_seed)
    # retrieve the train-test folds (old sklearn API: labels passed directly)
    folds = StratifiedShuffleSplit(Y, test_size=0.3, random_state=random_seed)
    for train_index, test_index in folds:
        # Per-view train/test feature matrices, keyed by view index.
        X_train = {n: X_views[n][train_index] for n in X_views.keys()}
        X_test = {n: X_views[n][test_index] for n in X_views.keys()}
        y_train, y_test = Y[train_index], Y[test_index]
        # unlabel the trainingset: each sample kept labelled with
        # probability `labelled_percentage`; 0.0 marks "unlabelled".
        # NOTE(review): this mutates y_train in place (a view/copy of Y).
        np.random.seed(random_seed)
        unlabel = np.random.random(len(y_train))
        for i in range(len(unlabel)):
            if unlabel[i] > labelled_percentage:
                y_train[i] = 0.0
        # grid search for the best grid (hyper-parameter combination)
        best_grid = gridsearch(X_train, y_train, T, V, experiment_parameters, n_folds, evaluation_strategy, use_unlabelled, use_parallel, random_seed)
        # predetermine the order of samples used during training
        order_of_samples = evaluation_strategy(y_train, T, use_unlabelled, random_seed)
        # generate the model: per-view alphas and intermediate predictions
        alphas, predictions = training(X_train, y_train, V, order_of_samples, best_grid['grid']['kernel_method'], best_grid['grid']['kernel_parameters'], best_grid['grid']['lambdas'])
        # test the model: average the V per-view predictions per sample
        y_preds_est = []
        y_preds = []
        for i in range(len(y_test)):
            y_pred = {}
            y_pred_est = 0.0
            for n in range(V):
                y_pred[n] = coagreement_prediction_for_view_n(X_test[n][i], X_train, y_train, V, n, T + 1, predictions, alphas, best_grid['grid']['kernel_method'], best_grid['grid']['kernel_parameters'], best_grid['grid']['lambdas'])
                y_pred_est += y_pred[n]
            y_preds.append(y_pred)
            y_preds_est.append(y_pred_est / V)
        # retrieve the metrics on the averaged predictions
        AUC, fpr, tpr = area_under_the_roc_curve(y_test, y_preds_est)
        print 'Achieved in validation '+str(AUC)+' AUC, and in training '+str(best_grid['AUC'])+' over '+str(n_folds)+' folds'
        # Returns after the first fold (see docstring note above).
        return {"auc": AUC, "fpr": fpr, "tpr": tpr, "model": alphas, "best_grid": best_grid}
def gbdt_lr_train(self, Train_tab, Train_libsvm):
    """Fit the GBDT -> one-hot leaf encoding -> LR pipeline on the given data.

    Parameters
    ----------
    Train_tab : array-like
        Labels (passed as y to train_test_split).
    Train_libsvm : sparse matrix or array-like
        Feature matrix (passed as X to train_test_split).

    Side effects: fits ``self.gbdt`` and ``self.lr`` (assumed constructed
    elsewhere) and stores ``self.X_train_leaves`` / ``self.train_rows``.

    Fix: removed the original's dead
    ``X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")`` —
    its result was never used, and the call failed whenever that
    hard-coded file was missing.
    """
    # Reproducible 90/10 train/test split of the supplied data.
    X_train, X_test, y_train, y_test = train_test_split(
        Train_libsvm, Train_tab, test_size=0.1, random_state=42)
    # Train the boosted trees.
    self.gbdt.fit(X_train, y_train)
    # Encode each sample as the leaf index it reaches in every tree.
    self.X_train_leaves = self.gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = self.gbdt.apply(X_test)[:, :, 0]
    # One-hot encode all leaf indices (train and test fitted together so
    # both share one column space).
    (self.train_rows, cols) = self.X_train_leaves.shape
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((self.X_train_leaves, X_test_leaves), axis=0))
    # NOTE(review): the fitted encoder is local and not stored on self,
    # so the identical encoding cannot be reproduced at predict time —
    # confirm how inference re-derives it.
    # Combine encoded leaves with the raw features and fit the LR on top.
    X_train_ext = hstack([X_trans[:self.train_rows, :], X_train])
    self.lr.fit(X_train_ext, y_train)
def gbdt_lr_train(self, Train_tab, Train_libsvm):
    """Train GBDT + LR-on-combined-features, persist the LR with pickle,
    reload it, and print hold-out class-1 probabilities.

    NOTE(review): ``Train_tab`` and ``Train_libsvm`` are accepted but
    never used — the data is always read from the hard-coded
    "sample_libsvm_data.txt". Confirm whether the parameters were meant
    to replace the file load (behavior intentionally left unchanged here).
    """
    X_all, y_all = load_svmlight_file("sample_libsvm_data.txt")
    # Reproducible 90/10 train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.1, random_state=42)
    gbdt = GradientBoostingClassifier(
        n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)
    # Leaf index per tree for every sample.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    # One-hot encode all leaf indices (train + test share one column space).
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    # LR over encoded leaves concatenated with the raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    lr.fit(X_train_ext, y_train)
    # Persist and reload the fitted model. Fix: the original passed bare
    # open(...) calls to pickle and never closed either handle; use
    # context managers so the files are always closed.
    filename = 'finalized_model.sav'
    with open(filename, 'wb') as model_out:
        pickle.dump(lr, model_out)
    # load the model from disk
    with open(filename, 'rb') as model_in:
        loaded_model = pickle.load(model_in)
    y_pred_gbdtlr2 = loaded_model.predict_proba(X_test_ext)[:, 1]
    print(y_pred_gbdtlr2)
def gbdt_lr_train(libsvmFileName):
    """Benchmark GBDT, LR, LR-on-leaf-encodings and LR-on-combined
    features, printing shapes, per-tree leaf cardinalities and AUCs.

    Parameters
    ----------
    libsvmFileName : str
        Path to a dataset in svmlight/libsvm format.

    Fix: the original mixed Python-2 ``print`` statements with
    ``OneHotEncoder(sparse=False, categories='auto')`` (a Python-3-era
    scikit-learn signature), so it could not run under either
    interpreter; all prints are now ``print()`` calls with identical
    output.
    """
    X_all, y_all = load_svmlight_file(libsvmFileName)
    # Reproducible 70/30 train/test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.3, random_state=42)
    print("train data shape: ", X_train.shape)
    # GBDT baseline.
    gbdt = GradientBoostingClassifier(
        n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)
    y_pred_gbdt = gbdt.predict_proba(X_test.toarray())[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)
    # LR on the raw features.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)
    # Encode each sample as its leaf index per tree.
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    print("gbdt leaves shape: ", X_train_leaves.shape)
    # Print the number of distinct leaves each tree produced.
    # Fix: replaced the per-column dict built one row at a time with a
    # set over the column — same counts, one pass, no throwaway dict.
    for i in range(X_train_leaves.shape[1]):
        print("F%d: %d" % (i, len(set(X_train_leaves[:, i]))))
    # One-hot encode all leaf indices (train + test share one column space).
    (train_rows, cols) = X_train_leaves.shape
    gbdtenc = OneHotEncoder(sparse=False, categories='auto')
    X_trans = gbdtenc.fit_transform(
        np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print("gbdt oneHot shape: ", X_trans.shape)
    print("oneHot leaves: ", X_trans[0])
    # LR over the encoded leaves only.
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], y_train)
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)
    # LR over encoded leaves concatenated with the raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    print("gbdt leaves cross", X_train_ext.shape)
    lr.fit(X_train_ext, y_train)
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)
# Evaluation script: loads a svmlight feature file named on the command
# line and sets up a classifier for cross-validation.
# NOTE(review): `svmlight` and `SGDClassifier` are used below but not
# imported in this excerpt — presumably imported elsewhere in the file.
from numpy import array
from nltk.metrics import ConfusionMatrix
import sys
from cv import XValMT


def warning(*objs):
    # Emit a prefixed warning to stderr (print-function syntax: requires
    # Python 3 or `from __future__ import print_function`).
    print("evaluate.py: WARNING: ", *objs, file=sys.stderr)


# The feature file path is the first command-line argument; abort if absent.
if len(sys.argv) > 1:
    svm_light_in = sys.argv[1]
else:
    warning("No feature file loaded")
    sys.exit(1)

print ("Loading dataset %s..." % svm_light_in)
feat_vecs, labels = svmlight.load_svmlight_file(svm_light_in)
print ('done\n')

# initialize NB classifier — alternative classifiers kept for reference:
#clf = MultinomialNB()
#clf = BernoulliNB()
#clf.class_prior =[0.041175856307435255,0.9588241436925647]
#clf = svm.SVC()
#clf.cache_size = 4000
#clf.n_jobs = -1
#clf.C = .1
clf = SGDClassifier()
# return a tuple of the data matrix and targets return (np.array(data), np.array(target)) if __name__ == '__main__': # construct the argument parser and parse the arguments ap = argparse.ArgumentParser() ap.add_argument("-d", "--dataset", required = True, help = "path of data set") ap.add_argument("-t", "--test", required = True, type = float, help = "size of test split") ap.add_argument("-s", "--search", type = int, default = 0, help = "whether or not a grid search should be performed") args = vars(ap.parse_args()) X = svmlight_format.load_svmlight_file(args['dataset']) # Test = svmlight_format.load_svmlight_file(args['test_dataset']) XX = X[0].toarray() XX = XX.astype("float32") yy = X[1] XX = scale(XX) (trainX, testX, trainY, testY) = train_test_split(XX, yy, test_size = args['test'], random_state = 42) if args["search"] == 1: # perform a grid search on the 'C' parameter of Logistic # Regression print "SEARCHING LOGISTIC REGRESSION" params = {"C": [1.0, 10.0, 100.0]} start = time.time() gs = GridSearchCV(LogisticRegression(), params, n_jobs = -1, verbose = 1) gs.fit(trainX, trainY)
# Grid-search an SGDClassifier over alpha / n_iter on a svmlight feature
# file and pickle the resulting scores.
# NOTE(review): Python-2 era script — `print` statements, `cPickle`, and
# the deprecated sklearn.grid_search / sklearn.datasets.svmlight_format
# module paths (modern sklearn uses sklearn.model_selection).
from sklearn import linear_model
import sklearn.datasets.svmlight_format as svmlight
from sklearn import grid_search
import cPickle

# Hyper-parameter grid: regularization strength and iteration counts.
parameters = {
    'alpha': [.000001, .0000001, .00000001],
    'n_iter': [5000, 10000, 15000]
}

feat_vecs, labels = svmlight.load_svmlight_file('featureFile.dat')
svr = linear_model.SGDClassifier()
svr.n_jobs = -1  # use all cores in the estimator
clf = grid_search.GridSearchCV(svr, parameters)
clf.n_jobs = -1  # and in the search itself
clf.fit(feat_vecs, labels)
print clf.grid_scores_
# Persist the grid scores. NOTE(review): text mode 'w' only works with
# protocol-0 pickles under Python 2.
with open('gridSearch.out', 'w') as fo:
    cPickle.dump(clf.grid_scores_, fo)
#!/usr/bin/python from pdb import set_trace import sklearn.datasets.svmlight_format as svmlight from sklearn.feature_selection import chi2, SelectKBest from sklearn.naive_bayes import MultinomialNB from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score from sklearn.cross_validation import KFold from operator import itemgetter from random import shuffle from numpy import array print "Loading dataset ..." svm_light_in = 'featureFile.dat' feat_vecs, labels = svmlight.load_svmlight_file(svm_light_in) print 'done\n' # initialize chi2 filtering object yes_chi2 = False k = 20000 ch2 = SelectKBest(chi2, k=k) # initialize NB classifier clf = MultinomialNB() #clf = svm.SVC() # how to divide sample sizes bins = 6 samp_size = len(labels) / bins
# Benchmark scikit-learn's RBF SVC against a custom KernelMachine on each
# libsvm-format dataset. NOTE(review): ds_dir, ds_names, pjoin,
# _ensure_min_eps, GaussianKernel and KernelMachine are defined outside
# this excerpt.
ds_paths = [pjoin(ds_dir, 'libsvm', name) for name in ds_names]


def sigma_from_gamma(gamma=0.1):
    # RBF parameterizations: gamma = 1 / (2 * sigma^2), so
    # sigma = sqrt(1 / (2 * gamma)); clamped via _ensure_min_eps.
    return _ensure_min_eps(np.sqrt(1.0 / (2 * gamma)))


def gamma_from_sigma(sigma=0.1):
    # Inverse mapping of sigma_from_gamma.
    return _ensure_min_eps(1.0 / (2 * sigma**2))


for name, ds_path in zip(ds_names, ds_paths):
    time_stamp = strftime("%H:%M:%S", gmtime())
    X, y = load_svmlight_file(ds_path)
    X = X.toarray()  # densify the sparse svmlight matrix
    print('\n{:10} {:20} {}'.format(time_stamp, name, X.shape))
    gamma = 0.1
    # Reference: sklearn RBF SVM scored over 20 random 80/20 splits.
    skl_svm = SVC(C=1.0, kernel='rbf', gamma=gamma)
    ss_cv1 = ShuffleSplit(n_splits=20, train_size=0.8, test_size=0.2)
    scores_skl = cross_val_score(skl_svm, X, y, cv=ss_cv1)
    # Comparison: custom kernel machine with an equivalent Gaussian kernel.
    ker_func = GaussianKernel(sigma=sigma_from_gamma(gamma))
    km_svm = KernelMachine(k_func=ker_func, learner_id='SVM', normalized=False)
    ss_cv2 = ShuffleSplit(n_splits=20, train_size=0.8, test_size=0.2)
    scores_km = cross_val_score(km_svm, X, y, cv=ss_cv2)
    # NOTE(review): this print call is truncated here — its remaining
    # arguments and closing parenthesis continue beyond this excerpt.
    print('\tSKLearn Accuracy: {:.4f} +/- {:.4f}'
# Simple grid-search setup over a libsvm file given on the command line.
# NOTE(review): Python-2 era — `.next()`, old StratifiedKFold signature;
# `argparse` and `StratifiedKFold` are used but not imported in this
# excerpt.
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.datasets.svmlight_format import load_svmlight_file
from sklearn import svm
import sys

parser = argparse.ArgumentParser(description='Program to do some simple gridsearch')
parser.add_argument("libsvm_file", type=str)
args = parser.parse_args()
# NOTE(review): a required positional can never be None here (argparse
# exits first), so this branch is dead; and print_help expects a file
# object — passing the string "error" would raise. Left as-is.
if args.libsvm_file is None:
    parser.print_help("error")
    sys.exit(-1)

X, y = load_svmlight_file(args.libsvm_file)

# split the dataset in two equal part respecting label proportions
train, test = iter(StratifiedKFold(y, 2, indices=True)).next()

# Candidate kernels/hyper-parameters for the search.
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

# Metrics to evaluate, as (name, scorer) pairs.
scores = [
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1_score', f1_score)
]
# Grid-search an SGDClassifier and pickle the scores. NOTE(review): this
# snippet duplicates an earlier block in this file except for whitespace;
# Python-2 era (print statements, cPickle, deprecated sklearn.grid_search).
from sklearn import linear_model
import sklearn.datasets.svmlight_format as svmlight
from sklearn import grid_search
import cPickle

# Hyper-parameter grid for the SGD classifier.
parameters = {'alpha':[.000001,.0000001,.00000001],'n_iter':[5000,10000,15000]}

feat_vecs,labels = svmlight.load_svmlight_file('featureFile.dat')
svr = linear_model.SGDClassifier()
svr.n_jobs = -1  # parallelize the estimator
clf = grid_search.GridSearchCV(svr, parameters)
clf.n_jobs = -1  # parallelize the search
clf.fit(feat_vecs,labels)
print clf.grid_scores_
# Persist the grid scores (text mode works only for protocol-0 / Py2).
with open('gridSearch.out', 'w') as fo:
    cPickle.dump(clf.grid_scores_,fo)
#!/usr/bin/env python # -*- coding: utf-8 -*- import re import numpy as np from sklearn.datasets import svmlight_format from sklearn.ensemble import RandomForestClassifier from sklearn import svm from sklearn.linear_model import LogisticRegression if __name__ == '__main__': num_epoches = 10 Train= svmlight_format.load_svmlight_file('./feature_train.txt') Test = svmlight_format.load_svmlight_file('./feature_test.txt') #method-0 # model = RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features='auto', max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=-1, random_state=None, verbose=0) #method-1 model = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None) #method-2 #model = svm.libsvm.fit( np.array( training_data,dtype=np.float64), np.array( training_label,dtype=np.float64), kernel='linear' ) for epoch in xrange(num_epoches): print "learning epoch: ", epoch, "/", num_epoches #method-0 # model.fit( Train[0].toarray(), Train[1] ) #method-1 model.fit( Train[0], Train[1] ) print "testing..." #output = model.predict(predict_data)
# Grid-search an SGDClassifier (SVM alternative kept commented) on a dev
# feature file and pickle the scores. Python-2 era: print statements,
# cPickle, deprecated sklearn.grid_search.
from sklearn import linear_model
import sklearn.datasets.svmlight_format as svmlight
from sklearn import grid_search
from sklearn.svm import LinearSVC,SVC
import cPickle

#-----------------SGD-------------------------------
# Hyper-parameter grid for the SGD classifier.
parameters = {'alpha':[.000001,.0000001,.00000001,.000000001],'n_iter':[5000,10000,15000,30000]}
svr = linear_model.SGDClassifier()
svr.n_jobs = -1  # parallelize the estimator
# -------- SVM-------------------- (alternative estimator, kept for reference)
#svr = SVC(class_weight='auto')
#parameters = {'C':range(500,5000,1000)}

print "Loading feature file..."
feat_vecs,labels = svmlight.load_svmlight_file('feature_files/+3-3/feats-Dev.dat')
print "Done"

clf = grid_search.GridSearchCV(svr, parameters)
clf.n_jobs = -1  # parallelize the search
clf.fit(feat_vecs,labels)
print clf.grid_scores_
# Persist the grid scores in binary mode.
with open('gridSearch.cPickle', 'wb') as fo:
    cPickle.dump(clf.grid_scores_,fo)
# Compare a libsvm rank/regression model with sklearn's NuSVR on the same
# data, scoring both by Pearson correlation on the last 100 samples.
# NOTE(review): svm_read_problem / svm_train / svm_predict are libsvm's
# Python bindings — presumably imported elsewhere in the file.
from scipy.stats.stats import pearsonr
import numpy as np
from sklearn.svm.classes import NuSVR
from sklearn.datasets.svmlight_format import load_svmlight_file

if __name__ == '__main__':
    trainfile = './data/svm_train.txt'
    # libsvm convention: problem[0] are the labels, problem[1] the instances.
    problem = svm_read_problem(trainfile)
    # Train on all but the last 100 samples ('-s 4' selects nu-SVR in libsvm).
    rank_model = svm_train(problem[0][:-100], problem[1][:-100], '-s 4 -h 0 -m 1000')
    # Predict the held-out tail; the ones vector is a dummy label argument.
    predicted_f, _, _ = svm_predict(
        np.ones(100).tolist(), problem[1][-100:], rank_model)
    # True scores for the held-out tail.
    scores_rank_test = problem[0][-100:]
    print(("Pearson correlation for fold = %f" % pearsonr(scores_rank_test, predicted_f)[0]))

    # Same experiment with sklearn's NuSVR on the svmlight-loaded matrix.
    svr = NuSVR()
    lingfeat, y = load_svmlight_file(trainfile)
    svr.fit(lingfeat[:-100], y[:-100])
    y_pred = svr.predict(lingfeat[-100:])
    print(("Pearson correlation for fold = %f" % pearsonr(scores_rank_test, y_pred)[0]))
def gbdt_lr_train(libsvmFileName):
    """Exploratory comparison of GBDT, LR, LR-on-leaf-encodings and
    LR-on-combined-features on one svmlight dataset, with verbose debug
    printing of intermediate types and shapes; prints each model's AUC.

    Parameters
    ----------
    libsvmFileName : str
        Path to a dataset in svmlight/libsvm format.
    """
    # Load the sample data (sparse matrix + label vector).
    X_all, y_all = load_svmlight_file(libsvmFileName)
    # X_all_dense = X_all.todense()
    print(type(X_all))
    # print(type(X_all_dense[0]))
    # print(y_all)
    # print("===")
    # Reproducible 70/30 train/test split.
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.3, random_state=42)
    # print(X_train)
    # print(y_train)
    # Define and train the GBDT model.
    gbdt = GradientBoostingClassifier(n_estimators=40, max_depth=3, verbose=0, max_features=0.5)
    gbdt.fit(X_train, y_train)
    # Predict and evaluate AUC on the densified test matrix.
    toarray = X_test.toarray()
    print(type(toarray))
    y_pred_gbdt = gbdt.predict_proba(toarray)
    # print(y_pred_gbdt)
    # Keep only the positive-class probability column.
    y_pred_gbdt = gbdt.predict_proba(toarray)[:, 1]
    gbdt_auc = roc_auc_score(y_test, y_pred_gbdt)
    print('gbdt auc: %.5f' % gbdt_auc)
    # observed output: gbdt auc: 0.96455
    # LR trained on the raw features.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred_test = lr.predict_proba(X_test)[:, 1]
    lr_test_auc = roc_auc_score(y_test, y_pred_test)
    print('基于原有特征的LR AUC: %.5f' % lr_test_auc)
    # observed output: LR AUC on raw features: 0.93455
    # GBDT-encode the raw features: leaf index per tree for each sample.
    # X_train_leaves = gbdt.apply(X_train)
    X_train_leaves = gbdt.apply(X_train)[:, :, 0]
    # Widen numpy printing for the debug dumps below.
    np.set_printoptions(linewidth=400)
    np.set_printoptions(threshold=np.inf)
    # print(X_train_leaves[0:22,:])  # print 22 rows, all columns
    print(type(X_train_leaves))
    X_test_leaves = gbdt.apply(X_test)[:, :, 0]
    # One-hot encode all leaf indices (train + test fitted together so
    # both share one column space).
    (train_rows, cols) = X_train_leaves.shape
    print(train_rows, cols)
    gbdtenc = OneHotEncoder()
    X_trans = gbdtenc.fit_transform(np.concatenate((X_train_leaves, X_test_leaves), axis=0))
    print(X_trans.shape)
    # print(X_trans.todense()[0:22,:])
    # LR trained on the encoded leaves only.
    lr = LogisticRegression()
    lr.fit(X_trans[:train_rows, :], y_train)
    # print(X_trans[train_rows:, :])
    y_pred_gbdtlr1 = lr.predict_proba(X_trans[train_rows:, :])[:, 1]
    gbdt_lr_auc1 = roc_auc_score(y_test, y_pred_gbdtlr1)
    print('基于GBDT特征编码后的LR AUC: %.5f' % gbdt_lr_auc1)
    # LR trained on encoded leaves concatenated with the raw features.
    lr = LogisticRegression(n_jobs=-1)
    X_train_ext = hstack([X_trans[:train_rows, :], X_train])
    X_test_ext = hstack([X_trans[train_rows:, :], X_test])
    print("组合特征的个数:", X_train_ext.shape)
    lr.fit(X_train_ext, y_train)
    y_pred_gbdtlr2 = lr.predict_proba(X_test_ext)[:, 1]
    gbdt_lr_auc2 = roc_auc_score(y_test, y_pred_gbdtlr2)
    print('基于组合特征的LR AUC: %.5f' % gbdt_lr_auc2)