def lda_on(train_x, train_y, test_x, test_y, feats_name='all_features'):
    """Fit an LDA classifier on one feature set and report its accuracy.

    Prints train/test accuracy, pickles the model fitted on the full training
    set, then refits on an 80/20 split of the training data to also print a
    validation accuracy.  Confusion matrices for the test and validation
    predictions are written to per-feature-set text files.

    NOTE(review): `store_covariance` as a fit() kwarg and
    `np.set_printoptions(threshold='nan')` only work on older sklearn/numpy
    releases -- confirm the pinned versions.  Relies on the module-level
    `dataset_name`.
    """
    lda = LDA()
    lda.fit(train_x, train_y, store_covariance=True)
    print feats_name, "(train):", lda.score(train_x, train_y)
    print feats_name, "(test):", lda.score(test_x, test_y)
    # Persist the model fitted on the *full* training set (before the refit below).
    with open(dataset_name + '_lda_classif_' + feats_name + '.pickle', 'w') as w_f:
        cPickle.dump(lda, w_f)
    y_pred = lda.predict(test_x)
    # Hold out 20% of the training data for a validation estimate.
    X_train, X_validate, y_train, y_validate = cross_validation\
        .train_test_split(train_x, train_y, test_size=0.2, random_state=0)
    lda.fit(X_train, y_train)  # refit: `lda` now reflects the reduced split
    print feats_name, "(validation):", lda.score(
        X_validate, y_validate)
    y_pred_valid = lda.predict(X_validate)
    cm_test = confusion_matrix(test_y, y_pred)
    cm_valid = confusion_matrix(y_validate, y_pred_valid)
    np.set_printoptions(threshold='nan')  # print matrices without truncation
    with open("cm_test" + feats_name + ".txt", 'w') as w_f:
        print >> w_f, cm_test
    with open("cm_valid" + feats_name + ".txt", 'w') as w_f:
        print >> w_f, cm_valid
def get_performance(test_df, X_std, y): Xtest = test_df.ix[:, 'x.1':'x.10'].values ytest = test_df.ix[:, 'y'].values X_std_test = StandardScaler().fit_transform(Xtest) lda_model = LDA() lda_model.fit(X_std, y) qda_model = QDA() qda_model.fit(X_std, y) knn_model = KNeighborsClassifier(n_neighbors=10) knn_model.fit(X_std, y) print "KNN SCORE" print knn_model.score(X_std_test, ytest) print "LDA SCORE" print lda_model.score(X_std_test, ytest) print "QDA SCORE" print qda_model.score(X_std_test, ytest) knn_scores_training = [] knn_scores_test = [] for i in range(1, 12): knn_model = KNeighborsClassifier(n_neighbors=i) knn_model.fit(X_std, y) knn_scores_training.append(knn_model.score(X_std_test, ytest)) knn_scores_test.append(knn_model.score(X_std, y)) plt.plot(range(11), knn_scores_training, 'r--') plt.plot(range(11), knn_scores_test, 'b--') plt.axis([0, 10, 0.3, 1.1]) plt.show()
def get_LDA(Xtrain, Xtest, Ytrain, Ytest):
    """Fit LDA on the training split, print train/test accuracy (percent),
    and return the fitted classifier.
    """
    lda = LDA()
    lda.fit(Xtrain, Ytrain)
    # Only two scores are produced; the original over-allocated 4 slots.
    scores = np.empty(2)
    scores[0] = lda.score(Xtrain, Ytrain)
    scores[1] = lda.score(Xtest, Ytest)
    print('LDA, train: {0:.02f}% '.format(scores[0] * 100))
    print('LDA, test: {0:.02f}% '.format(scores[1] * 100))
    return lda
def get_LDA(Xtrain, Xtest, Ytrain, Ytest):
    """Train a linear discriminant classifier, report its accuracy on both
    splits as percentages, and return it.
    """
    classifier = LDA()
    classifier.fit(Xtrain, Ytrain)
    train_acc = classifier.score(Xtrain, Ytrain)
    test_acc = classifier.score(Xtest, Ytest)
    print('LDA, train: {0:.02f}% '.format(train_acc * 100))
    print('LDA, test: {0:.02f}% '.format(test_acc * 100))
    return classifier
def get_LDA_performance(test_df, X_std, y):
    """Plot train/test error of LDA, QDA and 10-NN classifiers fitted on
    LDA-reduced features, for reduction dimension d = 1..10.

    :param test_df: DataFrame with feature columns 'x.1'..'x.10' and label 'y'
    :param X_std: standardized training feature matrix
    :param y: training labels
    """
    X_test = test_df.ix[:, 'x.1':'x.10'].values
    X_std_test = StandardScaler().fit_transform(X_test)
    y_test = test_df.ix[:, 'y'].values
    lda_scores_training = []
    lda_scores_test = []
    qda_scores_training = []
    qda_scores_test = []
    knn_scores_training = []
    knn_scores_test = []
    for d in range(1, 11):
        # Reduce to d discriminant components, fitted on the training data;
        # the same transform is applied to the test data.
        lda = LDA(n_components=d)
        Xred_lda_training = lda.fit_transform(X_std, y)
        Xred_lda_test = lda.transform(X_std_test)
        lda_model = LDA()
        lda_model.fit(Xred_lda_training, y)
        qda_model = QDA()
        qda_model.fit(Xred_lda_training, y)
        knn_model = KNeighborsClassifier(n_neighbors=10)
        knn_model.fit(Xred_lda_training, y)
        # Values collected are error rates (1 - accuracy).
        lda_scores_training.append(1 - lda_model.score(Xred_lda_training, y))
        lda_scores_test.append(1 - lda_model.score(Xred_lda_test, y_test))
        qda_scores_training.append(1 - qda_model.score(Xred_lda_training, y))
        qda_scores_test.append(1 - qda_model.score(Xred_lda_test, y_test))
        knn_scores_training.append(1 - knn_model.score(Xred_lda_training, y))
        knn_scores_test.append(1 - knn_model.score(Xred_lda_test, y_test))
    # NOTE(review): labels are set but plt.legend() is never called, and the
    # ylabel says 'Score' although the plotted values are error rates.
    plt.plot(range(10), lda_scores_training, 'r--', label="Train data")
    plt.plot(range(10), lda_scores_test, 'b--', label="Test data")
    plt.title("LDA vs LDA")
    plt.xlabel('k')
    plt.ylabel('Score')
    plt.show()
    plt.plot(range(10), qda_scores_training, 'r--', label="Train data")
    plt.plot(range(10), qda_scores_test, 'b--', label="Test data")
    plt.title("QDA vs LDA")
    plt.show()
    plt.plot(range(10), knn_scores_training, 'r--', label="Train data")
    plt.plot(range(10), knn_scores_test, 'b--', label="Test data")
    plt.title("KNN vs LDA")
    plt.show()
def plot_lda_projection(marker, flname): lda = LDA() lda.fit(marker["individuals"], marker["population_labels"]) print lda.score(marker["individuals"], marker["population_labels"]) proj = lda.transform(marker["individuals"]) n_samples, n_components = proj.shape plt.scatter(proj, marker["population_labels"]) plt.xlabel("Component 0", fontsize=18) plt.ylabel("Population Labels", fontsize=18) plt.savefig(flname, DPI=200)
def classify(Xtrain,Xtest,Ytrain,Ytest):
    '''Test-set accuracy of five classifiers fitted on the training split:
    logistic regression, LDA, Gaussian naive Bayes, linear SVM (C=1) and
    RBF SVM (C=1000), returned in that order as a length-5 array.
    '''
    scores = np.zeros((5,))
    lr = LogisticRegression()
    lr.fit(Xtrain,Ytrain)
    scores[0] = lr.score(Xtest,Ytest)
    lda = LDA()
    lda.fit(Xtrain,Ytrain)
    scores[1] = lda.score(Xtest,Ytest)
    nb = GaussianNB()
    nb.fit(Xtrain,Ytrain)
    scores[2] = nb.score(Xtest,Ytest)
    lsvm = LinearSVC( C = 1)
    lsvm.fit(Xtrain,Ytrain)
    scores[3] = lsvm.score(Xtest,Ytest)
    # Large C gives a hard-margin-like RBF SVM.
    gsvm = SVC(kernel='rbf', C = 1000)
    gsvm.fit(Xtrain,Ytrain)
    scores[4] = gsvm.score(Xtest,Ytest)
    return scores
def LDAmeanScore(X, Y, n_folds, dim_reduction=0):
    """Mean cross-validated accuracy (in percent) of an LDA classifier.

    :param X: classifier input matrix, n_samples * n_parameters,
        n_parameters >= 2, n_samples > 0; data assumed suitable for LDA
    :param Y: label vector of length n_samples
    :param n_folds: number of KFold splits, > 1
    :param dim_reduction: 0 -> no reduction; exactly -1 -> pick the best
        dimension automatically via best_dimension(); > 0 -> PCA-reduce to
        dim_reduction dimensions (only when X has more columns than that)
    :return: 100 * mean cross-validation score (also printed), or -1 when
        n_folds >= n_samples
    """
    if dim_reduction > 0 and X.shape[1] > dim_reduction:
        X = dim_reduction_PCA(X, dim_reduction)
    if dim_reduction == -1:
        dim_reduction = best_dimension(X)
        print "Best dimension : " + str(dim_reduction)
        X = dim_reduction_PCA(X, dim_reduction)
    if X.shape[0] > n_folds:
        # Cross-validation to estimate the performance of an LDA classifier.
        kf = KFold(n=len(Y), n_folds=n_folds, shuffle=True, random_state=None)
        scores = []
        for train_index, test_index in kf:
            X_train, X_test = X[train_index, :], X[test_index, :]
            Y_train, Y_test = Y[train_index], Y[test_index]
            cl = LDA()
            cl.fit(X_train, Y_train)
            scores.append(cl.score(X_test, Y_test))
        print "Score moyen : ", np.mean(np.array(scores))
        return 100.0 * np.mean(np.array(scores))
    else:
        return -1
def pca_lda(X_train, X_test, y_train, y_test):
    """Project onto the top PCA components, fit LDA, and return test accuracy.

    BUG FIX: the original ignored X_test/y_test entirely and returned the
    *training* score, which says nothing about generalization.  The test set
    is now projected with the same components and scored.
    """
    pca = PCA(n_components=500)
    lda = LDA()
    pca.fit(X_train)
    # Project by plain dot product with the components (no mean-centering),
    # mirroring how the original projected the training data.
    train_proj = np.dot(X_train, np.transpose(pca.components_))
    lda.fit(train_proj, y_train)
    test_proj = np.dot(X_test, np.transpose(pca.components_))
    return lda.score(test_proj, y_test, sample_weight=None)
def pca_lda(X_train, X_test, y_train, y_test):
    """Fit PCA (500 components) on the training data, project the training
    data onto those components, fit LDA on the projection, and return the
    LDA accuracy on that same projected training data.

    NOTE: X_test and y_test are accepted but unused -- the returned value is
    a training score.
    """
    pca = PCA(n_components=500)
    pca.fit(X_train)
    projected = np.dot(X_train, pca.components_.T)
    lda = LDA()
    lda.fit(projected, y_train)
    return lda.score(projected, y_train, sample_weight=None)
def LDAmeanScore(X, Y, n_folds, dim_reduction=0):
    """Mean cross-validated accuracy (in percent) of an LDA classifier.

    :param X: classifier input matrix, n_samples * n_parameters,
        n_parameters >= 2, n_samples > 0; data assumed suitable for LDA
    :param Y: label vector of length n_samples
    :param n_folds: number of KFold splits, > 1
    :param dim_reduction: <= 0 -> no reduction; otherwise, when the number of
        parameters exceeds dim_reduction, PCA-reduce to that many dimensions
    :return: 100 * mean cross-validation score (also printed), or -1 when
        n_folds >= n_samples
    """
    if dim_reduction > 0 and X.shape[1] > dim_reduction:
        X = dim_reduction_PCA(X, dim_reduction)
    if (X.shape[0] > n_folds):
        # Cross-validation to estimate the performance of an LDA classifier.
        # Note: unlike the shuffled variant elsewhere, folds are contiguous
        # (shuffle=False).
        kf = KFold(n=len(Y), n_folds=n_folds, shuffle=False, random_state=None)
        scores = []
        for train_index, test_index in kf:
            X_train, X_test = X[train_index, :], X[test_index, :]
            Y_train, Y_test = Y[train_index], Y[test_index]
            cl = LDA()
            cl.fit(X_train, Y_train)
            scores.append(cl.score(X_test, Y_test))
        print 'Score moyen : ', np.mean(np.array(scores))
        return 100. * np.mean(np.array(scores))
    else:
        return -1
def LDA(data, label, pred_data, pred_last): '''not good,不需要规范化 ''' data = np.array(data) pred_data = np.array(pred_data) label = np.array(label) pred_last = np.array(pred_last) from sklearn.lda import LDA gnb = LDA() gnb.fit(data, label) print gnb.score(data, label) pred_result = gnb.predict(pred_data) print("Number of mislabeled points out of a total %d points : %d" % (pred_data.shape[0], (pred_last != pred_result).sum())) print gnb.score(pred_data, pred_last) return pred_result
def acc_image(training_data, tarining_label, test_data, test_label):
    """Plot test accuracy of shrinkage-LDA vs. plain LDA as the number of
    features used grows, averaged over repeated fits.

    (The parameter name `tarining_label` is a pre-existing typo kept for
    caller compatibility.)
    """
    n_train = training_data.shape[0]  # samples for training
    n_test = test_data.shape[0]  # samples for testing
    n_averages = 50  # how often to repeat classification
    n_features_max = 5  # maximum number of features
    step = 1  # step size for the calculation
    acc_clf1, acc_clf2 = [], []
    n_features_range = range(1, n_features_max + 1, step)
    for n_features in n_features_range:
        score_clf1, score_clf2 = 0, 0
        for _ in range(n_averages):
            X, y = training_data[:, 0:n_features], tarining_label
            clf1 = LDA(solver='lsqr', shrinkage='auto').fit(X, y)
            clf2 = LDA(solver='lsqr', shrinkage=None).fit(X, y)
            X, y = test_data[:, 0:n_features], test_label
            score_clf1 += clf1.score(X, y)
            score_clf2 += clf2.score(X, y)
        acc_clf1.append(score_clf1 / n_averages)
        acc_clf2.append(score_clf2 / n_averages)
    # BUG FIX: force float division -- under Python 2, int-array / int is
    # floor division and every ratio collapsed to 0.
    features_samples_ratio = np.array(n_features_range) / float(n_train)
    plt.plot(features_samples_ratio, acc_clf1, linewidth=2,
             label="LDA with shrinkage", color='r')
    plt.plot(features_samples_ratio, acc_clf2, linewidth=2,
             label="LDA", color='g')
    plt.xlabel('n_features / n_samples')
    plt.ylabel('Classification accuracy')
    plt.legend(loc=1, prop={'size': 12})
    plt.suptitle('LDA vs. shrinkage LDA (1 discriminative feature)')
    plt.show()
def lda_on(train_x, train_y, test_x, test_y, feats_name='all_features'):
    """Fit LDA on the given features, print train/test/validation accuracy,
    pickle the fitted model and dump confusion matrices to text files.

    NOTE(review): passing `store_covariance` to fit() and
    `np.set_printoptions(threshold='nan')` require old sklearn/numpy
    releases -- confirm pinned versions.  Uses the module-level
    `dataset_name`.
    """
    lda = LDA()
    lda.fit(train_x, train_y, store_covariance=True)
    print feats_name, "(train):", lda.score(train_x, train_y)
    print feats_name, "(test):", lda.score(test_x, test_y)
    # Save the model trained on the full training set (before the refit).
    with open(dataset_name + '_lda_classif_' + feats_name + '.pickle', 'w') as f:
        cPickle.dump(lda, f)
    y_pred = lda.predict(test_x)
    # 80/20 split of the training data for a validation estimate.
    X_train, X_validate, y_train, y_validate = cross_validation.train_test_split(train_x, train_y, test_size=0.2, random_state=0)
    lda.fit(X_train, y_train)  # refit on the reduced training split
    print feats_name, "(validation):", lda.score(X_validate, y_validate)
    y_pred_valid = lda.predict(X_validate)
    cm_test = confusion_matrix(test_y, y_pred)
    cm_valid = confusion_matrix(y_validate, y_pred_valid)
    np.set_printoptions(threshold='nan')  # print matrices without truncation
    with open("cm_test" + feats_name + ".txt", 'w') as wf:
        print >> wf, cm_test
    with open("cm_valid" + feats_name + ".txt", 'w') as wf:
        print >> wf, cm_valid
def LDA_select_cv(X, Y, num_features):
    """Mean 10-fold stratified CV accuracy of LDA trained on the top
    `num_features` features ranked by a random-forest selector (fitRF).
    """
    scores = []
    skf = cross_validation.StratifiedKFold(Y, n_folds=10)
    for train, test in skf:
        X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
        # fitRF returns the training features reordered by RF importance plus
        # the ordering `ind`; the same ordering is applied to the test fold.
        XRF_train, imp, ind, std = fitRF(X_train, y_train, est=2000)  # RFsel
        XRF_test = X_test[:, ind]  # reorder test set after RFsel
        clf = LDA()
        clf.fit(XRF_train[:, 0:num_features], y_train)
        scores.append(clf.score(XRF_test[:, 0:num_features], y_test))
    score = np.mean(scores)
    return(score)
def table_4_1():
    """Reproduces table 4.1 in ESLii showing the training and test error rates
    for classifying vowels using different classification techniques.

    The sklearn implementation of logistic regression uses OvA instead of a
    true multinomial which likely accounts for the worse results.
    """
    vowels_train = eslii.read_vowel_data()
    train_X = vowels_train[vowels_train.columns[1:]]  # first column is 'y'
    train_y = vowels_train['y']
    vowels_test = eslii.read_vowel_data(train=False)
    test_X = vowels_test[vowels_test.columns[1:]]
    test_y = vowels_test['y']
    # Each printed pair is (train error, test error) = 1 - accuracy.
    lda = LDA().fit(train_X, train_y)
    print "Linear discriminant analysis: {:.2f} {:.2f}".format(
        1 - lda.score(train_X, train_y), 1 - lda.score(test_X, test_y))
    qda = QDA().fit(train_X, train_y)
    print "Quadratic discriminant analysis: {:.2f} {:.2f}".format(
        1 - qda.score(train_X, train_y), 1 - qda.score(test_X, test_y))
    # Very large C approximates unregularized logistic regression.
    lr = LogisticRegression(C=1e30).fit(train_X, train_y)
    print "Logistic regression: {:.2f} {:.2f}".format(
        1 - lr.score(train_X, train_y), 1 - lr.score(test_X, test_y))
def yj():
    """Draw random class parameters, fetch a normalized train/test split, and
    return the LDA error rate on the test split.

    NOTE(review): mutates the module-level `params` dict and relies on the
    globals `di`, `data_yj` and `get_data` -- presumably `di` is
    scipy.stats; confirm against the enclosing module.
    """
    # Random class means and inverse-gamma-distributed variances.
    params['mu0'] = np.random.randn()*0.2
    params['mu1'] = np.random.randn()*0.2
    params['sigma0'] = di.invgamma.rvs(3)
    params['sigma1'] = di.invgamma.rvs(3)
    sel, rawdata, normdata = get_data(data_yj, params)
    norm_trn_data = normdata.loc[sel['trn'], sel['feats']]
    norm_tst_data = normdata.loc[sel['tst'], sel['feats']]
    sklda = LDA()
    sklda.fit(norm_trn_data, sel['trnl'])
    # Error rate = 1 - accuracy.
    error = (1-sklda.score(norm_tst_data, sel['tstl']))
    print("skLDA error: %f" % error)
    return error
def acc_image(training_data, tarining_label, test_data, test_label): n_train = training_data.shape[0] # samples for training n_test = test_data.shape[0] # samples for testing n_averages = 50 # how often to repeat classification n_features_max = 5 # maximum number of features step = 1 # step size for the calculation acc_clf1, acc_clf2 = [], [] n_features_range = range(1, n_features_max + 1, step) for n_features in n_features_range: score_clf1, score_clf2 = 0, 0 for _ in range(n_averages): X, y = training_data[:,0:n_features], tarining_label clf1 = LDA(solver='lsqr', shrinkage='auto').fit(X, y) clf2 = LDA(solver='lsqr', shrinkage=None).fit(X, y) X, y = test_data[:,0:n_features], test_label score_clf1 += clf1.score(X, y) score_clf2 += clf2.score(X, y) acc_clf1.append(score_clf1 / n_averages) acc_clf2.append(score_clf2 / n_averages) features_samples_ratio = np.array(n_features_range) / n_train plt.plot(features_samples_ratio, acc_clf1, linewidth=2, label="LDA with shrinkage", color='r') plt.plot(features_samples_ratio, acc_clf2, linewidth=2, label="LDA", color='g') plt.xlabel('n_features / n_samples') plt.ylabel('Classification accuracy') plt.legend(loc=1, prop={'size': 12}) plt.suptitle('LDA vs. shrinkage LDA (1 discriminative feature)') plt.show()
def predict_scores(markers, threshold=0.05):
    """Score each marker by LDA training accuracy and return the best ones.

    :param markers: sequence of dicts with "individuals" and
        "population_labels" entries
    :param threshold: fraction of markers to keep
    :return: list of (score, index) tuples, highest score first
    """
    scores = []
    for i, marker in enumerate(markers):
        try:
            lda = LDA()
            lda.fit(marker["individuals"], marker["population_labels"])
            scores.append((lda.score(marker["individuals"], marker["population_labels"]), i))
        except Exception:
            # BUG FIX: a bare `except:` also swallowed SystemExit and
            # KeyboardInterrupt; only genuine errors get a zero score.
            scores.append((0.0, i))
    # Highest scores first; (score, index) tuples give a total order.
    scores.sort(reverse=True)
    cutoff_idx = int(threshold * len(scores))
    return scores[:cutoff_idx]
def optimize(self, X, y):
    """10-fold stratified cross-validation of an LDA classifier.

    Stores the mean and variance of the fold accuracies and of the per-fold
    training times on the instance (note: `_score_std`/`_train_time_std`
    hold variances, matching the original attribute contract).
    """
    clf = LDA()
    scores = []
    train_times = []
    for train, test in StratifiedKFold(y, 10):
        X_train, X_test, y_train, y_test = (X[train], X[test],
                                            y[train], y[test])
        # BUG FIX: the timer originally started *after* fit(), so
        # `train_times` actually measured scoring time, not training time.
        t0 = self._timer()
        clf.fit(X_train.toarray(), y_train)
        train_times.append(self._timer() - t0)
        scores.append(clf.score(X_test.toarray(), y_test))
    self._mean_score = np.mean(scores)
    self._score_std = np.var(scores)
    self._mean_train_time = np.mean(train_times)
    self._train_time_std = np.var(train_times)
def plot_scores(markers, flname):
    """Histogram the LDA training accuracy of every marker and save the
    figure to `flname`.  Markers whose fit fails are counted as 0.0.
    """
    plt.clf()
    scores = []
    for i, marker in enumerate(markers):
        try:
            lda = LDA()
            lda.fit(marker["individuals"], marker["population_labels"])
            scores.append(lda.score(marker["individuals"], marker["population_labels"]))
        except Exception:
            # BUG FIX: bare `except:` also swallowed SystemExit and
            # KeyboardInterrupt; catch only genuine errors.
            scores.append(0.0)
    plt.hist(scores, bins=np.arange(0.0, 1.0, 0.01))
    plt.xlabel("Score", fontsize=18)
    plt.ylabel("Occurrences", fontsize=18)
    # BUG FIX: matplotlib's keyword is lowercase `dpi`; `DPI=200` was ignored.
    plt.savefig(flname, dpi=200)
def main():
    """Train an LDA classifier on BBC frames and evaluate it on CNN frames,
    timing the training and testing phases separately.

    NOTE(review): time.clock() measures CPU time on Unix and was removed in
    Python 3.8 -- time.time()/perf_counter() may be what is intended.
    """
    X_BBC, y_BBC = get_data('BBC')
    X_CNN, y_CNN = get_data('CNN')
    print('# of BBC frames = ' + str(X_BBC.shape[0]))
    print('# of CNN frames = ' + str(X_CNN.shape[0]))
    clf = LDA()
    print('Training...')
    t0 = time.clock()
    # .toarray(): LDA needs dense input; the feature matrices are sparse.
    clf.fit(X_BBC.toarray(), y_BBC)
    trainTime = time.clock() - t0
    print('Training time: ' + str(trainTime) + 's\n')
    print('Testing...')
    t0 = time.clock()
    score = clf.score(X_CNN.toarray(), y_CNN)
    testTime = time.clock() - t0
    print('Testing time: ' + str(testTime) + 's\n')
    print('Total time: ' + str(trainTime + testTime) + 's\n')
    print('score = ' + str(score))
# Start from the 20 best-ranked features (overwritten inside the loop below).
X2=X[best[0:20]]
X_test2=X_test[best[0:20]]
#Building a loop to find best model and feature selection (results are lda with the 23 best features)
model=[]
score=[]
for i in range(10,len(best)):
    # Use the i best-ranked features for every model in this iteration.
    X2=X[best[0:i]]
    X_test2=X_test[best[0:i]]
    #running the train and test data in LDA (this typically gives the best model)
    model.append(['lda',i])
    lda= LDA(n_components=2)
    lda_x_axis = lda.fit(X2, y).transform(X2)
    score.append(lda.score(X_test2, y_test, sample_weight=None))
    #Look at Decision Tree Accuracy
    model.append(['dt',i])
    dt = DecisionTreeClassifier(class_weight='balanced')
    dt.fit(X2,y)
    score.append(dt.score(X_test2,y_test))
    #Look at Random Forest Accuracy
    model.append(['rf',i])
    rf = RandomForestClassifier(class_weight='balanced')
    rf.fit(X2,y)
    score.append(rf.score(X_test2,y_test))
    #Extra Trees Accuracy
    # NOTE(review): fragment ends after this append -- the extra-trees
    # fit/score that should pair with it is outside the visible chunk.
    model.append(['et',i])
import pickle
from sklearn.lda import LDA
from sklearn.model_selection import train_test_split
import random


def loadXY():
    """Load a pickled list of (features, label) pairs, shuffle it in place,
    and return the unzipped X and Y tuples.
    """
    #zippedXY = pickle.load(open("../Feature_reduction/zippedXY_wff_fs_2k.p","rb"))
    #zippedXY = pickle.load(open("../CNN_features/zippedXY_cnn_wff_2k_gap4.p","rb"))
    #zippedXY = pickle.load(open("../Vectorizer/zippedXY_wff_2k.p","rb"))
    #zippedXY = pickle.load(open("../CNN_features/zippedXY_cnn_te.p","rb"))
    zippedXY = pickle.load(open("../Feature_reduction/zippedXY_te_fs.p","rb"))
    random.shuffle(zippedXY)
    X,Y = zip(*zippedXY)
    return X,Y


if __name__ == "__main__":
    X,Y = loadXY()
    print "X and Y loaded"
    # Note the unusual split: only 20% of the data is used for training.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.80, random_state=0)
    print Y
    lda_model = LDA()
    lda_model.fit(X_train,Y_train)
    predictedY = lda_model.predict(X_test)
    # Per-sample comparison of true vs. predicted labels.
    for tt in range(len(Y_test)):
        print "Actual:",Y_test[tt]," Predicted:",predictedY[tt]
    accuracy = lda_model.score(X_test,Y_test)
    print accuracy
# Fit LDA on the spam training set and report train/test accuracy.
import pandas as pd
import numpy as np
from sklearn.lda import LDA

## read files
train = pd.read_csv('data/spam_train.csv')
test = pd.read_csv('data/spam_test.csv')
# Columns 0..56 are the predictors; the last column is the spam label.
x = np.array(train.iloc[:, 0:57])
y = np.ravel(train.iloc[:, -1])

## separate the predictors and response in the test data set
x2 = np.array(test.iloc[:, 0:57])
y2 = np.ravel(test.iloc[:, -1])

## fit the model using lda
lda_cls = LDA()
lda_cls.fit(x, y)
print("(1): lda accuracy")
print(lda_cls.score(x, y))  # training accuracy

## predict output on test data set with lda
predict = lda_cls.predict(x2)
print("(2): lda test accuracy")
print(lda_cls.score(x2, y2))
# NOTE(review): fragment -- the first line below is the tail of a data-loading
# call (presumably np.loadtxt on `rawtest`) whose opening is outside this chunk.
                delimiter=',', skiprows=1)
test = rawtest[:,feat_inds]
# Standardize the test features using the sample (ddof=1) standard deviation.
norm_test = (test - test.mean(axis=0)) / np.sqrt(test.var(axis=0,ddof=1))
N = test.shape[0]
D = data.shape[1]
#sys.exit()
# First half of each set is class 0, second half class 1.
trn_labels = np.hstack(( np.zeros(Ntrn/2), np.ones(Ntrn/2) ))
tst_labels = np.hstack(( np.zeros(N/2), np.ones(N/2) ))
sklda = LDA()
skknn = KNN(3, warn_on_equidistant=False)
sksvm = SVC()
sklda.fit(norm_data, trn_labels)
skknn.fit(norm_data, trn_labels)
sksvm.fit(norm_data, trn_labels)
# Report error rate = 1 - accuracy for each sklearn baseline.
print("skLDA error: %f" % (1-sklda.score(norm_test, tst_labels)))
print("skKNN error: %f" % (1-skknn.score(norm_test, tst_labels)))
print("skSVM error: %f" % (1-sksvm.score(norm_test, tst_labels)))
labels = np.hstack((np.zeros(N/2), np.ones(N/2)))
n,gext,grid = get_grid_data(np.vstack(( norm_data0, norm_data1 )))
# Per-class Gaussian Bayes models (zero prior mean, scaled identity scatter).
bayes0 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D)*3, norm_data0)
bayes1 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D)*3, norm_data1)

# Gaussian Analytic
gc = GaussianCls(bayes0, bayes1)
print("Gaussian Analytic error: %f" % gc.approx_error_data(norm_test, labels))
gavg = gc.calc_gavg(grid).reshape(-1,n)
myplot(p.subplot(2,3,1),gavg,norm_data0, norm_data1)
# (fragment) Feature selection and normalization happen above this chunk;
# `sel` indexes the train/test rows and the selected feature columns.
norm_trn_data = normdata.loc[sel['trn'], sel['feats']]
norm_tst_data = normdata.loc[sel['tst'], sel['feats']]
tst_data = rawdata.loc[sel['tst'], sel['feats']]

t1 = time()

#################### CLASSIFICATION ################
########################################
########################################
########################################
# sklearn baselines: LDA, 3-NN and RBF SVM, all fit on the normalized data.
sklda = LDA()
skknn = KNN(3, warn_on_equidistant=False)
sksvm = SVC()
sklda.fit(norm_trn_data, sel['trnl'])
skknn.fit(norm_trn_data, sel['trnl'])
sksvm.fit(norm_trn_data, sel['trnl'])
# Store error rates (1 - accuracy) on the test split.
errors['lda'] = (1-sklda.score(norm_tst_data, sel['tstl']))
errors['knn'] = (1-skknn.score(norm_tst_data, sel['tstl']))
errors['svm'] = (1-sksvm.score(norm_tst_data, sel['tstl']))
print("skLDA error: %f" % errors['lda'])
print("skKNN error: %f" % errors['knn'])
print("skSVM error: %f" % errors['svm'])

# Per-class conjugate Gaussian models; kappa scales the prior scatter matrix.
bayes0 = GaussianBayes(np.zeros(num_feat), 1, kappa,
                       np.eye(num_feat)*(kappa-1-num_feat),
                       normdata.loc[sel['trn0'], sel['feats']])
bayes1 = GaussianBayes(np.zeros(num_feat), 1, kappa,
                       np.eye(num_feat)*(kappa-1-num_feat),
                       normdata.loc[sel['trn1'], sel['feats']])

# Gaussian Analytic
gc = GaussianCls(bayes0, bayes1)
# NOTE(review): fragment -- `skiprows=1)` closes a data-loading call
# (presumably np.loadtxt on `rawtest`) whose opening is outside this chunk.
                skiprows=1)
test = rawtest[:, feat_inds]
# Standardize the test features using the sample (ddof=1) standard deviation.
norm_test = (test - test.mean(axis=0)) / np.sqrt(test.var(axis=0, ddof=1))
N = test.shape[0]
D = data.shape[1]
#sys.exit()
# First half of each set is labeled 0, second half 1.
trn_labels = np.hstack((np.zeros(Ntrn / 2), np.ones(Ntrn / 2)))
tst_labels = np.hstack((np.zeros(N / 2), np.ones(N / 2)))
sklda = LDA()
skknn = KNN(3, warn_on_equidistant=False)
sksvm = SVC()
sklda.fit(norm_data, trn_labels)
skknn.fit(norm_data, trn_labels)
sksvm.fit(norm_data, trn_labels)
# Error rate = 1 - accuracy for each sklearn baseline.
print("skLDA error: %f" % (1 - sklda.score(norm_test, tst_labels)))
print("skKNN error: %f" % (1 - skknn.score(norm_test, tst_labels)))
print("skSVM error: %f" % (1 - sksvm.score(norm_test, tst_labels)))
labels = np.hstack((np.zeros(N / 2), np.ones(N / 2)))
n, gext, grid = get_grid_data(np.vstack((norm_data0, norm_data1)))
# Per-class Gaussian Bayes models (zero prior mean, scaled identity scatter).
bayes0 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D) * 3, norm_data0)
bayes1 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D) * 3, norm_data1)

# Gaussian Analytic
gc = GaussianCls(bayes0, bayes1)
print("Gaussian Analytic error: %f" % gc.approx_error_data(norm_test, labels))
gavg = gc.calc_gavg(grid).reshape(-1, n)
myplot(p.subplot(2, 3, 1), gavg, norm_data0, norm_data1)
for c in classes: lda = LDA(ldasolver) lda.fit(features['train'],np.array(labels['train'])==c) #test classifier p = np.array(lda.predict_proba(features['test'])) proba.append(p[:,1]) proba=np.transpose(np.array(proba)) prediction=np.argmax(proba,axis=1)+1 else: #train classifier lda = LDA(ldasolver) lda.fit(features['train'],labels['train']) #test classifier prediction = lda.predict(features['test']) proba = lda.predict_proba(features['test']) print('Accuracy %.2f%%' % lda.score(features['test'],labels['test'])) #output data file = open(outputFile,'w') file.write('labels ') for c in classes: file.write(str(c)+' ') file.write('\n') for i in range(len(prediction)): l = prediction[i] file.write(str(l)+' ') for p in proba[i]: file.write(str(p)+' ') file.write('\n')
from sklearn import svm
from sklearn import cross_validation
from sklearn.lda import LDA

# Import training data
trainingData = loadtxt('Data/featureData.txt')
trainingLabels = loadtxt('Data/labels.txt')

# Find the sizes of the data
trainingSize = size(trainingLabels)

# Run PCA
pca = PCA(n_components=200)
trainingDataNew = pca.fit_transform(trainingData)

# 70/30 train/test split of the PCA-reduced data.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(trainingDataNew, trainingLabels, test_size=0.3)

# Train SVM model for the training data
# NOTE(review): despite the name, clf_svm is an LDA classifier, not an SVM.
clf_svm = LDA()
clf_svm.fit(X_train,y_train)

# Test the trained model in the test data (accuracy as a percentage).
print clf_svm.score(X_test, y_test) * 100

# Find the percentage error
# error = 0
# for i in range(0,trainingSize):
#     if predictedLabels[i] != trainingLabels[i]:
#         error = error+1
# print float(error)/float(trainingSize) * 100
# (fragment) Decision-tree grid search; the branches preceding this chunk
# (and the flags DT/LDA/GNB *_cl) are defined outside the visible code.
param_grid.update({'min_samples_split':np.arange(2,11)})
gtree = GridSearchCV(DecisionTreeClassifier(),param_grid,scoring='precision',cv=StratifiedKFold(Ytrain, n_folds = 5),refit=True,n_jobs=-1)
gtree.fit(Xtrain,Ytrain)
scores = np.empty((6))
scores[0] = gtree.score(Xtrain,Ytrain)
scores[1] = gtree.score(Xtest,Ytest)
print "---------------------Decission Tree Clasifier--------------"
print('Decision Tree, train: {0:.02f}% '.format(scores[0]*100))
print('Decision Tree, test: {0:.02f}% '.format(scores[1]*100))

if (LDA_cl == 1):
    # Linear Discriminant Analysis branch (enabled by the LDA_cl flag).
    from sklearn.lda import LDA
    lda = LDA()
    lda.fit(Xtrain,Ytrain)
    scores = np.empty((4))
    scores[0] = lda.score(Xtrain,Ytrain)
    scores[1] = lda.score(Xtest,Ytest)
    print "---------------------Linear Discriminant Analysis---------------------------"
    print('LDA, train: {0:.02f}% '.format(scores[0]*100))
    print('LDA, test: {0:.02f}% '.format(scores[1]*100))

if (GNB_cl == 1):
    # Gaussian naive Bayes branch (enabled by the GNB_cl flag).
    nb = GaussianNB()
    nb.fit(Xtrain,Ytrain)
    scores = np.empty((4))
    scores[0] = nb.score(Xtrain,Ytrain)
    scores[1] = nb.score(Xtest,Ytest)
    print "---------------------Naive Bayes Classifier------------------"

# print "Prediction time:", t1-t0, "s"
# (fragment) Train/test splits and normalization are produced above this chunk.
N = tst_data.shape[0]
D = trn_data.shape[1]
# First half of the normalized test rows is class 0, second half class 1.
norm_tst_data0 = norm_tst_data[:N / 2, :]
norm_tst_data1 = norm_tst_data[N / 2:, :]
trn_labels = np.hstack((np.zeros(Ntrn / 2), np.ones(Ntrn / 2)))
tst_labels = np.hstack((np.zeros(N / 2), np.ones(N / 2)))
sklda = LDA()
skknn = KNN(3, warn_on_equidistant=False)
sksvm = SVC()
sklda.fit(norm_trn_data, trn_labels)
skknn.fit(norm_trn_data, trn_labels)
sksvm.fit(norm_trn_data, trn_labels)
output = {}
# Error rates (1 - accuracy) for the sklearn baselines.
output['ldaerr'] = (1 - sklda.score(norm_tst_data, tst_labels))
output['knnerr'] = (1 - skknn.score(norm_tst_data, tst_labels))
output['svmerr'] = (1 - sksvm.score(norm_tst_data, tst_labels))
print("skLDA error: %f" % output['ldaerr'])
print("skKNN error: %f" % output['knnerr'])
print("skSVM error: %f" % output['svmerr'])

# Gaussian Analytic
# Per-class Gaussian Bayes models (zero prior mean, scaled identity scatter).
bayes0 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D) * 3, norm_trn_data0)
bayes1 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D) * 3, norm_trn_data1)
gc = GaussianCls(bayes0, bayes1)
output['gausserr'] = gc.approx_error_data(norm_tst_data, tst_labels)
print("Gaussian Analytic error: %f" % output['gausserr'])
# Fit LDA on the spam training set and report train/test accuracy.
import pandas as pd
import numpy as np
from sklearn.lda import LDA

## read files
train = pd.read_csv('data/spam_train.csv')
test = pd.read_csv('data/spam_test.csv')
# Columns 0..56 are the predictors; the last column is the spam label.
x = np.array(train.iloc[:, 0:57])
y = np.ravel(train.iloc[:, -1])

## separate the predictors and response in the test data set
x2 = np.array(test.iloc[:, 0:57])
y2 = np.ravel(test.iloc[:, -1])

## fit the model using lda
lda_cls = LDA()
lda_cls.fit(x,y)
print("(1): lda accuracy")
print(lda_cls.score(x, y))  # training accuracy

## predict output on test data set with lda
predict = lda_cls.predict(x2)
print("(2): lda test accuracy")
print(lda_cls.score(x2, y2))
# Notebook-style fragment comparing error rates (1 - accuracy) of several
# classifiers on a train/test split defined in earlier cells.
from sklearn import svm
# Linear SVM with the hyperparameters spelled out explicitly.
lin_clf = svm.LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
lin_clf.fit(X_train, y_train)
# Bare expressions like this only display their value in a notebook cell.
1-lin_clf.score(X_test, y_test)


# In[29]:

from sklearn.lda import LDA
clf3 =LDA()
clf3.fit(X_train, y_train)
1-clf3.score(X_test, y_test)


# In[27]:

clfrbf = svm.SVC(kernel='rbf')
clfrbf.fit(X_train, y_train)
1-clfrbf.score(X_test, y_test)


# In[18]:

from sklearn.naive_bayes import GaussianNB
# NOTE(review): fragment ends here; clf5 is fitted/scored outside this chunk.
clf5 = GaussianNB()
    # (fragment) Tail of an unseen helper: collect the rows of X whose label
    # equals `value`.
    XX = []
    for i in xrange(len(Y)):
        if Y[i] == value:
            XX.append(X[i])
    return XX

out = open(sys.argv[1], "r")
# Least-squares LDA; zero-variance features are dropped before fitting.
model = LDA(solver='lsqr')
X, Y = read_fea(sys.argv[1])
sel = VarianceThreshold(threshold=0)
model.fit(sel.fit_transform(X), Y)
warning("useful features dim: " + str(len(sel.get_support(True))))
if hasattr(model, 'score'):
    warning("accuracy on training set: " + str(model.score(sel.transform(X), Y)))
# Optional extra feature files: cv set, dev set, and a scoring set.
if len(sys.argv) > 2:
    X, Y = read_fea(sys.argv[2])
    warning("accuracy on cv set: " + str(model.score(sel.transform(X), Y)))
if len(sys.argv) > 3:
    X, Y = read_fea(sys.argv[3])
    warning("accuracy on dev set: " + str(model.score(sel.transform(X), Y)))
if len(sys.argv) > 4:
    # Standardize decision scores against the previous set's statistics.
    ref = model.decision_function(sel.transform(X))
    X, Y = read_fea(sys.argv[4], True)
    Z = model.decision_function(sel.transform(X))
    Z = (Z - ref.mean(axis=0)[np.newaxis, :]) / ref.std(axis=0)[np.newaxis, :]
    # NOTE(review): the body of this loop continues outside the visible chunk.
    for i in xrange(len(Y)):
# -*- coding: utf-8 -*-
__author__ = 'PC-LiNing'

import gensim
from lda import load_data
import numpy as np
from sklearn.lda import LDA

# Build a tf-idf weighted document matrix from the corpus.
corpus, dic, labels = load_data.load_corpus()
tfidf_model = gensim.models.TfidfModel(corpus=corpus, dictionary=dic)
weighted_docs = [tfidf_model[doc] for doc in corpus]
matrix = load_data.convert_to_matrix(weighted_docs)

# Split the matrix, fit an LDA classifier, and report test accuracy.
train_data, train_label, test_data, test_label = load_data.get_train_test(matrix, labels)
classifier = LDA(solver='svd', store_covariance=True)
classifier.fit(train_data, train_label)
print(classifier.score(test_data, test_label))
# NOTE(review): def main() for a crawler-page classifier, collapsed across the
# four source lines below. The long SQL strings and the `kneighbors =` assignment
# span the line breaks, so the original text is kept byte-identical rather than
# reflowed. Flow: connect to Postgres, fetch labelled 'Cdiscount-maison' rows,
# train several sklearn classifiers (tree, random forest, KNN, AdaBoost, naive
# Bayes, LDA, QDA) on the numeric page features with a 60/40 split, print each
# score, then classify the other sites' pages with the random forest and write
# the (url, predicted page_type) pairs back to the database.
# TODO(review): the NaN/Inf filtering of Xval is commented out — confirm
# random_forest.predict(Xval) cannot receive NaNs; also the "best scoring"
# classifier is hard-coded to the random forest rather than selected by score.
def main(): #Define our connection string conn_string = "host='localhost' dbname='CRAWL4J' user='******' password='******'" # print the connection string we will use to connect print "Connecting to database\n ->%s" % (conn_string) # get a connection, if a connect cannot be made an exception will be raised here conn = psycopg2.connect(conn_string) # fetching training data from Cdiscount-maison cdiscount_maison_request = "select url, whole_text, title, h1, short_description, status_code, depth, outlinks_size, inlinks_size, nb_breadcrumbs, nb_aggregated_ratings, nb_ratings_values, nb_prices, nb_availabilities, nb_reviews, nb_reviews_count, nb_images, nb_search_in_url, nb_add_in_text, nb_filter_in_text, nb_search_in_text, nb_guide_achat_in_text, nb_product_info_in_text, nb_livraison_in_text, nb_garanties_in_text, nb_produits_similaires_in_text, nb_images_text, width_average, height_average, page_rank, page_type, concurrent_name, last_update, semantic_hits, semantic_title, inlinks_semantic, inlinks_semantic_count from arbocrawl_results where page_type !='Unknown' and concurrent_name = 'Cdiscount-maison' "; catPred=["PAGE DEPTH AT SITE LEVEL","NUMBER OF OUTGOING LINKS","NUMBER OF INCOMING LINKS","NUMBER OF ITEMTYPE http://data-vocabulary.org/Breadcrumb","NUMBER OF ITEMPROP aggregateRating","NUMBER OF ITEMPROP ratingValue","NUMBER OF ITEMPROP price","NUMBER OF ITEMPROP availability","NUMBER OF ITEMPROP review","NUMBER OF ITEMPROP reviewCount","NUMBER OF ITEMPROP image","NUMBER OF OCCURENCES FOUND IN URL of search + recherche + Recherche + Search","NUMBER OF OCCURENCES FOUND IN PAGE TEXT ajout + ajouter + Ajout + Ajouter","NUMBER OF OCCURENCES FOUND IN PAGE TEXT filtre + facette + Filtre + Facette + filtré + filtrés","NUMBER OF OCCURENCES FOUND IN PAGE TEXT Ma recherche + Votre recherche + résultats pour + résultats associés","NUMBER OF OCCURENCES FOUND IN PAGE TEXT guide d""achat + Guide d""achat","NUMBER OF OCCURENCES FOUND IN PAGE TEXT caractéristique + Caractéristique 
+ descriptif + Descriptif +information + Information","NUMBER OF OCCURENCES FOUND IN PAGE TEXT livraison + Livraison + frais de port + Frais de port","NUMBER OF OCCURENCES FOUND IN PAGE TEXT garantie + Garantie +assurance + Assurance","NUMBER OF OCCURENCES FOUND IN PAGE TEXT Produits Similaires + produits similaires + Meilleures Ventes + meilleures ventes +Meilleures ventes + Nouveautés + nouveautés + Nouveauté + nouveauté","NUMBER OF HTML TAG img IN THE PAGE","AVERAGE WIDTH OF HTML TAG img IN THE PAGE","AVERAGE HEIGHT OF HTML TAG img IN THE PAGE"]; semPred =["PAGE TEXT", "PAGE TITLE", "PAGE H1", "PAGE SHORT DESCRIPTION","TEN BEST TF/IDF HITS FOR THE PAGE","TITLE TF/IDF","PAGE INCOMING LINKS ANCHOR SEMANTIC"]; print "Executing the following request to fetch data for Cdiscount-maison from the ARBOCRAWL_RESULTS table : " + cdiscount_maison_request print"Page-type predictors : "+ ', '.join(catPred) print"Semantic predictors : " + ', '.join(semPred) df = pd.read_sql(cdiscount_maison_request, conn) url_list = df.url.values semantic_columns = ["url","title","h1","short_description","semantic_hits", "semantic_title", "inlinks_semantic"]; semantic_predictors = df[list(semantic_columns)].values; classifying_columns = ["depth", "outlinks_size", "inlinks_size", "nb_breadcrumbs", "nb_aggregated_ratings", "nb_ratings_values", "nb_prices", "nb_availabilities", "nb_reviews", "nb_reviews_count", "nb_images", "nb_search_in_url", "nb_add_in_text", "nb_filter_in_text", "nb_search_in_text", "nb_guide_achat_in_text", "nb_product_info_in_text", "nb_livraison_in_text", "nb_garanties_in_text", "nb_produits_similaires_in_text", "nb_images_text", "width_average","height_average"] classifying_predictors = df[list(classifying_columns)].values; X= np.asanyarray(classifying_predictors); y = df.page_type.values; print type(X) print X.shape print type(y) print y.shape # fetching the data to predict to_predict_request = "select url, whole_text, title, h1, short_description, status_code, depth, 
outlinks_size, inlinks_size, nb_breadcrumbs, nb_aggregated_ratings, nb_ratings_values, nb_prices, nb_availabilities, nb_reviews, nb_reviews_count, nb_images, nb_search_in_url, nb_add_in_text, nb_filter_in_text, nb_search_in_text, nb_guide_achat_in_text, nb_product_info_in_text, nb_livraison_in_text, nb_garanties_in_text, nb_produits_similaires_in_text, nb_images_text, width_average, height_average, page_rank, page_type, concurrent_name, last_update, semantic_hits, semantic_title, inlinks_semantic, inlinks_semantic_count from arbocrawl_results where concurrent_name != 'Cdiscount-maison' "; df_to_predict = pd.read_sql(to_predict_request, conn) # df_to_predict.dropna() # df_to_predict.replace([np.inf, -np.inf], np.nan).dropna(subset=list(classifying_columns), how="all") # df_to_predict.dropna(subset=list(classifying_columns), how="all", with_inf=True) # indexnan = sum(np.isnan(Xval)) # indexinfinite = np.isfinite(Xval) classifying_predictors_to_predict = df_to_predict[list(classifying_columns)].values; Xval= np.asanyarray(classifying_predictors_to_predict); print type(Xval) print Xval.shape url_val_list = df_to_predict.url.values print type(url_val_list) print url_val_list.shape # we must here filter the NaN / Infinity in Xval values #print np.isnan(Xval) #Xval = Xval[~np.isnan(Xval)] #print Xval.shape # transforming the predictors / rescaling the predictors # we don't need to do that #X = StandardScaler().fit_transform(X) #Xval = StandardScaler().fit_transform(Xval) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4) single_tree = DecisionTreeClassifier(max_depth=5) single_tree.fit(X_train, y_train) single_tree_score = single_tree.score(X_test, y_test) print "Single tree score " + str(single_tree_score) random_forest = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1) random_forest.fit(X_train, y_train) random_forest_score = random_forest.score(X_test, y_test) print "Random forest score " + str(random_forest_score) kneighbors = 
KNeighborsClassifier(3) kneighbors.fit(X_train, y_train) kneighbors_score = kneighbors.score(X_test, y_test) print "K-Neighbors score " + str(kneighbors_score) adaboost = AdaBoostClassifier() adaboost.fit(X_train, y_train) adaboost_score = adaboost.score(X_test, y_test) print "Ada boost score " + str(adaboost_score) gaussian_nb = GaussianNB() gaussian_nb.fit(X_train, y_train) gaussian_nb_score = gaussian_nb.score(X_test, y_test) print "gaussian mixtures score " + str(gaussian_nb_score) lda = LDA() lda.fit(X_train, y_train) lda_nb_score = lda.score(X_test, y_test) print "linear discriminant score " + str(lda_nb_score) qda = QDA() qda.fit(X_train, y_train) qda_nb_score = qda.score(X_test, y_test) print "quadratic discriminant score " + str(qda_nb_score) #SVC(kernel="linear", C=0.025), #SVC(gamma=2, C=1), # we now predict the dataset from the other web sites with the best scoring trained classifier y_val_predicted = random_forest.predict(Xval); print type(y_val_predicted) print y_val_predicted.shape print type(url_val_list) print url_val_list.shape url_validation_list = url_val_list.tolist() y_val_predicted_list = y_val_predicted.tolist() # displaying the classified data # pprint.pprint(y_val_predicted_list) # pprint.pprint(url_validation_list) classified_values = zip(url_validation_list, y_val_predicted_list) print "Updating the database with the classification results" update_database_with_page_type(conn, classified_values) conn.close()
len(X_train) # fitting logistic regression logit_1 = LogisticRegression() logit_1 = logit_1.fit(X_train, y_train) logit_1.score(X_train, y_train) logitpred = logit_1.predict(X_test) print logitpred confusion_matrix(y_test, logitpred) prob = logit_1.predict_proba(X_test) print prob print metrics.accuracy_score(y_test, logitpred) lda1 = LDA() lda1 = lda1.fit(X_train, y_train) lda1.score(X_train, y_train) ldapredict = lda1.predict(X_test) print ldapredict confusion_matrix(y_test, ldapredict) print metrics.accuracy_score(y_test, ldapredict) # KNN knn1 = KNeighborsClassifier(n_neighbors=2) knn1 = knn1.fit(X_train, y_train) knn1.score(X_train, y_train) knnpredict = knn1.predict(X_test) print knnpredict confusion_matrix(y_test, knnpredict) print metrics.accuracy_score(y_test, knnpredict) knn2 = KNeighborsClassifier(n_neighbors=10) knn2 = knn2.fit(X_train, y_train)
# NOTE(review): fragment starts with a dangling `else:` (its matching `if` is
# outside this chunk), so it cannot be reflowed into valid Python; kept verbatim.
# Flow (Portuguese banners: TREINANDO = training, CRIANDO O CONJUNTO DE TESTE =
# building the test set, TESTANDO = testing): fit LDA on 50-feature rows of
# `base`, score/predict on the rows indexed by `ret`, accumulate a running
# confusion matrix (`matriz`) and per-group accuracy (`grupo`), save the matrix
# to matriz102lda.txt, plot it, and print the group's mean accuracy and
# standard deviation.
else: x.append(base[j][0:50]) # %%%%%%%%%%%%%%%%%%%%%%%%%%TREINANDO%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% clf = LDA() clf.fit(x, y) #%%%%%%%%%%%%%%%%%%%%%%CRIANDO O CONJUNTO DE TESTE%%%%%%%%%%%%%%%% xteste = [] for i in ret: xteste.append(base[i][0:50]) #%%%%%%%%%%%%%%%%%%%%%%%%%%TESTANDO%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% a = clf.score(xteste, labels) b = clf.predict(xteste) cm = confusion_matrix(labels, b) cm = np.asarray(cm) matriz = matriz + cm grupo.append(a) np.set_printoptions(precision=0) np.savetxt("matriz102lda.txt", matriz) plt.figure() plot_confusion_matrix(matriz) plt.show() print('Acurácia media do grupo: ', np.mean(grupo)) print('Desvio padrão do grupo: ', np.std(grupo))
# NOTE(review): "Weekly" dataset lab fragment (Python 2 prints), reflowed from
# one collapsed line. log_reg and the train/test weekly splits come from
# earlier in the file; sklearn.lda / sklearn.qda are the pre-0.19 module paths.
log_reg.fit(train_weekly_x, train_weekly_y)
log_reg_weekly_y_preds = log_reg.predict(test_weekly_x)
score = log_reg.score(test_weekly_x, test_weekly_y)
conf_matrix = confusion_matrix(test_weekly_y, log_reg_weekly_y_preds)
print "\nLogistic Regression Coefficients [Lag2]: " + str(log_reg.coef_)
print "Confusion Matrix:"
print conf_matrix
print "Fraction of Correct Predictions: " + str(score)

#%% LDA using sklearn
from sklearn.lda import LDA
lda = LDA()
lda.fit(train_weekly_x, train_weekly_y)
lda_preds = lda.predict(test_weekly_x)
lda_score = lda.score(test_weekly_x, test_weekly_y)
conf_matrix = confusion_matrix(test_weekly_y, lda_preds)
print "\nLDA Results"
print "Confusion Matrix:"
print conf_matrix
print "Fraction of Correct Predictions: " + str(lda_score)

#%% QDA using sklearn
from sklearn.qda import QDA
qda = QDA()
qda.fit(train_weekly_x, train_weekly_y)
qda_preds = qda.predict(test_weekly_x)
qda_score = qda.score(test_weekly_x, test_weekly_y)
conf_matrix = confusion_matrix(test_weekly_y, qda_preds)
# NOTE(review): tail of the scikit-learn "LDA with shrinkage" example; the
# enclosing generate_data() def is cut off above this chunk (the leading
# `if`/`return` belong to it), so the line is kept verbatim rather than
# reflowed. Flow: for each feature count, average the accuracy of LDA with
# Ledoit-Wolf shrinkage (clf1) vs plain LDA (clf2) over n_averages random
# datasets, then plot accuracy against the n_features/n_samples ratio.
if n_features > 1: X = np.hstack([X, np.random.randn(n_samples, n_features - 1)]) return X, y acc_clf1, acc_clf2 = [], [] n_features_range = range(1, n_features_max + 1, step) for n_features in n_features_range: score_clf1, score_clf2 = 0, 0 for _ in range(n_averages): X, y = generate_data(n_train, n_features) clf1 = LDA(solver='lsqr', shrinkage='auto').fit(X, y) clf2 = LDA(solver='lsqr', shrinkage=None).fit(X, y) X, y = generate_data(n_test, n_features) score_clf1 += clf1.score(X, y) score_clf2 += clf2.score(X, y) acc_clf1.append(score_clf1 / n_averages) acc_clf2.append(score_clf2 / n_averages) features_samples_ratio = np.array(n_features_range) / n_train plt.plot(features_samples_ratio, acc_clf1, linewidth=2, label="LDA with shrinkage", color='r') plt.plot(features_samples_ratio, acc_clf2, linewidth=2, label="LDA", color='g') plt.xlabel('n_features / n_samples') plt.ylabel('Classification accuracy')
# NOTE(review): variant of the shrinkage example above with the shrinkage
# branch commented out; begins with a bare `return X, y` from a cut-off
# generate_data() def, so it is kept verbatim rather than reflowed. Only the
# plain LDA accuracy (acc_clf2) is computed and plotted here.
return X, y # acc_clf1 = [] acc_clf2 = [] n_features_range = list(range(1, n_features_max + 1, step)) for n_features in n_features_range: score_clf1, score_clf2 = 0, 0 for _ in range(n_averages): X, y = generate_data(n_train, n_features) # clf1 = LDA(solver='lsqr', shrinkage='auto').fit(X, y) clf2 = LDA().fit(X, y) X, y = generate_data(n_test, n_features) # score_clf1 += clf1.score(X, y) score_clf2 += clf2.score(X, y) # acc_clf1.append(score_clf1 / n_averages) acc_clf2.append(score_clf2 / n_averages) features_samples_ratio = np.array(n_features_range) / n_train # plt.plot(features_samples_ratio, acc_clf1, linewidth=2, # label="LDA with shrinkage", color='r') plt.plot(features_samples_ratio, acc_clf2, linewidth=2, label="LDA", color='g') plt.xlabel('n_features / n_samples') plt.ylabel('Classification accuracy') plt.legend(loc=1, prop={'size': 12})
# NOTE(review): fragment reflowed from one collapsed line. norm, split,
# trn_data/tst_data, trn_labels/tst_labels, num_feat, errors, and the
# GaussianBayes/GaussianCls/MPMDist classes and LDA/KNN/SVC aliases are all
# defined earlier in the file.
norm_trn_data = norm(trn_data)
norm_tst_data = norm(tst_data)
norm_trn_data0, norm_trn_data1 = split(norm_trn_data)
norm_tst_data0, norm_tst_data1 = split(norm_tst_data)
trn_data0, trn_data1 = split(trn_data)
tst_data0, tst_data1 = split(tst_data)

#################### CLASSIFICATION ################
sklda = LDA()
skknn = KNN(3)
sksvm = SVC()
sklda.fit(norm_trn_data, trn_labels)
skknn.fit(norm_trn_data, trn_labels)
sksvm.fit(norm_trn_data, trn_labels)
# Record test *error* rates (1 - accuracy) for each sklearn baseline.
errors['lda'] = (1 - sklda.score(norm_tst_data, tst_labels))
errors['knn'] = (1 - skknn.score(norm_tst_data, tst_labels))
errors['svm'] = (1 - sksvm.score(norm_tst_data, tst_labels))

# Per-class Gaussian Bayes models; the scalar/matrix args are presumably prior
# hyperparameters — confirm against the GaussianBayes signature.
bayes0 = GaussianBayes(np.zeros(num_feat), 1, 8, np.eye(num_feat) * 3, norm_trn_data0)
bayes1 = GaussianBayes(np.zeros(num_feat), 1, 8, np.eye(num_feat) * 3, norm_trn_data1)

# Gaussian Analytic
gc = GaussianCls(bayes0, bayes1)
errors['gauss'] = gc.approx_error_data(norm_tst_data, tst_labels)

# MPM Model
#d0 = np.asarray(mquantiles(trn_data0, 0.75, axis=1)).reshape(-1)
#d1 = np.asarray(mquantiles(trn_data1, 0.75, axis=1)).reshape(-1)
#dist0 = MPMDist(trn_data0,kmax=1,priorkappa=150,lammove=0.01,mumove=0.08,d=d0)
#dist1 = MPMDist(trn_data1,kmax=1,priorkappa=150,lammove=0.01,mumove=0.08,d=d1)
# NOTE(review): fragment starts inside a cut-off `for row ...` loop that counts
# class frequencies into `probas`, so it cannot be reflowed into valid Python;
# kept verbatim. Flow: normalise the class counts by 528 (presumably the number
# of rows — confirm), print the a-priori MAP class, then compare train/test
# accuracy of LDA, QDA and 10-NN on the standardised data (Python 2 prints);
# the trailing loop sweeps k=1..11 for KNN and continues past this chunk.
probas[7]=probas[7]+1 if row['y']==9: probas[8]=probas[8]+1 for i in range(0,9): probas[i]=probas[i]/528 yhat_apriori = np.argmax(probas) + 1 print "Clase: %d"%yhat_apriori ######## Pregunta (g) ############################################################ lda_model = LDA() lda_model.fit(X_std,y) print "Score LDA train: %f"%lda_model.score(X_std,y) print "Score LDA test: %f"%lda_model.score(X_std_test,ytest) qda_model = QDA() qda_model.fit(X_std,y) print "Score QDA train: %f"%qda_model.score(X_std,y) print "Score QDA test: %f"%qda_model.score(X_std_test,ytest) knn_model = KNeighborsClassifier(n_neighbors=10) knn_model.fit(X_std,y) print "Score KNN train: %f"%knn_model.score(X_std,y) print "Score KNN test: %f"%knn_model.score(X_std_test,ytest) values_train = [] values_test = [] for i in range(1, 12): knn_model = KNeighborsClassifier(n_neighbors=i) knn_model.fit(X_std,y)
# Possible solution
# Project the iris data onto two LDA components, plot the classes, and report
# the classifier's training accuracy.
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.lda import LDA

iris = datasets.load_iris()
X = iris.data
y = iris.target
target_names = iris.target_names

# Now, invoke the LDA method to compute and fit the model:
lda_classifier = LDA(n_components=2)
lda_x_axis = lda_classifier.fit(X, y).transform(X)

# Now output a simple visualization of the model result:
color_scheme = ['r', 'g', 'b']
for c, i, target_name in zip(color_scheme, [0, 1, 2], target_names):
    # FIX: label each class with its own name (`target_name`); the original
    # passed the whole `target_names` array, so every class got the same
    # (array-valued) label.
    plt.scatter(lda_x_axis[y == i, 0], lda_x_axis[y == i, 1],
                c=c, label=target_name)
plt.xlabel('First LDA')
plt.ylabel('Second LDA')
plt.show()

# We have a score associated with the classifier's performance
# (mean accuracy on the training data).
lda_classifier.score(X, y, sample_weight=None)
# Fragment of a kernel-PCA accuracy sweep (XtrainT/XtestT, labelsTrain/labelsTest
# and the earlier plot data come from elsewhere in the file). The `if (0):`
# blocks are deliberately disabled experiments.
plt.xlabel('Number of components')
plt.ylabel('accuracy')
plt.legend(['LR', 'LDA', 'GNB', 'Linear SVM', 'rbf SVM'], loc='lower right')
plt.grid(True)

if (0):
    # Calculate classification scores for each component count.
    # FIX: np.linspace returns floats, which are invalid as slice bounds
    # (`XtrainT[:, :nComponents[i]]` would raise TypeError); cast to int.
    nComponents = np.linspace(500, 1500, 100, endpoint=True).astype(int)
    # FIX: np.alen is deprecated/removed in modern NumPy; len() is equivalent
    # for a 1-D array.
    kpcaldaScores = np.zeros((len(nComponents), 1))
    lda = LDA()
    for i in range(len(nComponents)):
        # Score LDA on the first nComponents[i] kPCA components.
        lda.fit(XtrainT[:, :nComponents[i]], labelsTrain)
        kpcaldaScores[i] = lda.score(XtestT[:, :nComponents[i]], labelsTest)

    # %% Plot accuracies for kPCA
    plt.figure()
    plt.plot(nComponents, kpcaldaScores, lw=3)
    plt.xlim(1, np.amax(nComponents))
    plt.title('kPCA accuracy')
    plt.xlabel('Number of components')
    plt.ylabel('accuracy')
    plt.xlim([500, 1500])
    plt.legend(['LDA'], loc='lower right')
    plt.grid(True)

if (0):
    # K-PCA second round
    # NOTE(review): the suite continues beyond this chunk; `pass` added so the
    # reflowed fragment parses on its own.
    pass
#X_std : X entrenamiento #Y_std : Y entrenamiento Xtest = test_df.ix[:,'x.1':'x.10'].values ytest = test_df.ix[:,'y'].values X_std_test = StandardScaler().fit_transform(Xtest) ############ LDA ##################### #Construcción y Fit del modelo LDA lda_model = LDA() lda_model.fit(X_std,y) #Score conjunto de entrenamiento y conjunto de testing. print lda_model.score(X_std,y) print lda_model.score(X_std_test,ytest) ############ QDA ##################### #Construcción y Fit del modelo QDA qda_model = QDA() qda_model.fit(X_std,y) #Score conjunto de entrenamiento y conjunto de testing. print qda_model.score(X_std,y) print qda_model.score(X_std_test,ytest) # ############ KNN ##################### # #Construcción y Fit del modelo KNN # knn_model = KNeighborsClassifier(n_neighbors=10) # knn_model.fit(X_std,y)
# for particle_features in particles_features: # y = 1 if particle_features['truth']==1 else -1 # norm_particle_features = [float(particle_features[features[k]])/norm[features[k]] for k in range(len(features))] # # pt = particle_features['pt'] # test_data.append(norm_particle_features) # test_truth.append(y) # # test_data = np.array(test_data) # test_truth = np.array(test_truth) # # np.save("../Data/particle_features_tjets/particle_features_"+str(numParticles)+"_upto_"+str(2*numParticles)+".npy", test_data) # np.save("../Data/particle_features_tjets/particle_features_"+str(numParticles)+"_upto_"+str(2*numParticles)+"_truth.npy", test_truth) #print train_data.shape, train_truth.shape #print test_data.shape, test_truth.shape print(clf.score(train_data,train_truth)) print(clf.score(test_data,test_truth)) p = np.where(train_truth == -1)[0] p_truth = np.where(test_truth == -1)[0] hs = np.where(train_truth == 1)[0] hs_truth = np.where(test_truth == 1)[0] print("Pileup") print(clf.score(train_data[p],train_truth[p])) print(clf.score(test_data[p_truth],test_truth[p_truth])) print("Hard Scatter") print(clf.score(train_data[hs],train_truth[hs])) print(clf.score(test_data[hs_truth],test_truth[hs_truth])) print '\n'
# NOTE(review): CSP + SVC sliding-window evaluation over an MNE-style CV split,
# reflowed from one collapsed line. cv, labels, epochs_data_train, epochs_data,
# csp, svc, w_start, w_length, sfreq and epochs come from earlier in the file.
scores_windows = []
for train_idx, test_idx in cv:
    y_train, y_test = labels[train_idx], labels[test_idx]
    # CSP is refit on each fold's training epochs only (avoids leakage).
    X_train = csp.fit_transform(epochs_data_train[train_idx], y_train)
    X_test = csp.transform(epochs_data_train[test_idx])
    # fit classifier
    svc.fit(X_train, y_train)
    # running classifier: test classifier on sliding window
    score_this_window = []
    for n in w_start:
        # X_test is deliberately rebuilt per window from the full epochs.
        X_test = csp.transform(epochs_data[test_idx][:, :, n:(n + w_length)])
        score_this_window.append(svc.score(X_test, y_test))
    scores_windows.append(score_this_window)

# Plot scores over time (window centres in seconds relative to epoch onset).
w_times = (w_start + w_length / 2.) / sfreq + epochs.tmin
plt.figure()
plt.plot(w_times, np.mean(scores_windows, 0), label='Score')
plt.axvline(0, linestyle='--', color='k', label='Onset')
plt.axhline(0.5, linestyle='-', color='k', label='Chance')
plt.xlabel('time (s)')
plt.ylabel('classification accuracy')
plt.title('Classification score over time')
plt.legend(loc='lower right')
plt.show()
def sample_data(X, Y, value=0):
    """Return the rows of X whose corresponding label in Y equals `value`."""
    # Comprehension replaces the manual append loop; behavior unchanged.
    return [X[i] for i in xrange(len(Y)) if Y[i] == value]


# Train an LDA model on the features in argv[1] and report accuracies.
# FIX: the original opened argv[1] here (`out=open(sys.argv[1],"r")`) but never
# used or closed the handle; the dangling descriptor has been removed.
model = LDA()
X, Y = read_fea(sys.argv[1])
# Drop constant (zero-variance) features before fitting.
sel = VarianceThreshold(threshold=0)
model.fit(sel.fit_transform(X), Y)
warning("useful features dim: " + str(len(sel.get_support(True))))
if hasattr(model, 'score'):
    warning("accuracy on training set: " + str(model.score(sel.transform(X), Y)))
if len(sys.argv) > 2:
    X, Y = read_fea(sys.argv[2])
    warning("accuracy on cv set: " + str(model.score(sel.transform(X), Y)))
if len(sys.argv) > 3:
    X, Y = read_fea(sys.argv[3])
    warning("accuracy on dev set: " + str(model.score(sel.transform(X), Y)))
if len(sys.argv) > 4:
    # Standardise decision scores of set 4 against the previous set's
    # per-column mean/std before emitting them.
    ref = model.decision_function(sel.transform(X))
    X, Y = read_fea(sys.argv[4], True)
    Z = model.decision_function(sel.transform(X))
    Z = (Z - ref.mean(axis=0)[np.newaxis, :]) / ref.std(axis=0)[np.newaxis, :]
    for i in xrange(len(Y)):
        # NOTE(review): loop body continues beyond this chunk in the original.
        ZZ = np.array(Z[i][1:])
#ws.var_.xvschema = scot.xvschema.singletrial #ws.optimize_var() ws.var_.delta = 1 # Single-Trial Fitting and feature extraction features = np.zeros((len(triggers), 32)) for t in range(len(triggers)): print('Fold %d/%d, Trial: %d ' %(fold, nfolds, t), end='\r') ws.set_data(data[:, :, t]) ws.fit_var() con = ws.get_connectivity('ffPDC') alpha = np.mean(con[:, :, np.logical_and(7 < freq, freq < 13)], axis=2) beta = np.mean(con[:, :, np.logical_and(15 < freq, freq < 25)], axis=2) features[t, :] = np.array([alpha, beta]).flatten() lda.fit(features[train, :], classids[train]) acc_train = lda.score(features[train, :], classids[train]) acc_test = lda.score(features[test, :], classids[test]) print('Fold %d/%d, Acc Train: %.4f, Acc Test: %.4f' %(fold, nfolds, acc_train, acc_test)) pred = lda.predict(features[test, :]) cm += confusion_matrix(classids[test], pred) print('Confusion Matrix:\n', cm) print('Total Accuracy: %.4f'%(np.sum(np.diag(cm))/np.sum(cm)))
# NOTE(review): duplicate of the "LDA with shrinkage" example fragment earlier
# in this corpus (single-quoted labels variant); the enclosing generate_data()
# def is cut off above this chunk, so the line is kept verbatim. Averages LDA
# accuracy with and without Ledoit-Wolf shrinkage over n_averages random
# datasets per feature count, then plots accuracy vs n_features/n_samples.
if n_features > 1: X = np.hstack([X, np.random.randn(n_samples, n_features - 1)]) return X, y acc_clf1, acc_clf2 = [], [] n_features_range = range(1, n_features_max + 1, step) for n_features in n_features_range: score_clf1, score_clf2 = 0, 0 for _ in range(n_averages): X, y = generate_data(n_train, n_features) clf1 = LDA(solver='lsqr', shrinkage='auto').fit(X, y) clf2 = LDA(solver='lsqr', shrinkage=None).fit(X, y) X, y = generate_data(n_test, n_features) score_clf1 += clf1.score(X, y) score_clf2 += clf2.score(X, y) acc_clf1.append(score_clf1 / n_averages) acc_clf2.append(score_clf2 / n_averages) features_samples_ratio = np.array(n_features_range) / n_train plt.plot(features_samples_ratio, acc_clf1, linewidth=2, label='LDA with shrinkage', color='r') plt.plot(features_samples_ratio, acc_clf2, linewidth=2, label='LDA', color='g') plt.xlabel('n_features / n_samples') plt.ylabel('Classification accuracy')