def classify_using_lda(feat1, feat2, num_comp=2):
    n_plus = len(feat1)
    n_minus = len(feat2)

    X = np.concatenate((feat1, feat2), axis=0)
    y = np.concatenate((np.zeros(n_plus), np.ones(n_minus)), axis=0)
    y += 1

    print(X.shape, y.shape, n_plus, n_minus, feat1.shape, feat2.shape)

    lda = LDA(n_components=num_comp)
    lda.fit(X, y)

    # TODO FIXME Why is this returning n_samples x 1, and not n_samples x 2?
    # Is it able to differentiate using just 1 component? Crazy!!
    X_tr = lda.transform(X)

    print(X_tr.shape, lda.score(X, y))

    # CRAZY, we don't actually have the 2nd component from LDA
    X1 = np.concatenate((X_tr[0:n_plus], np.zeros((n_plus, 1))), axis=1)
    X2 = np.concatenate((X_tr[-n_minus:], np.ones((n_minus, 1))), axis=1)

    plt.plot(X1[:, 0], X1[:, 1], 'ro')
    plt.plot(X2[:, 0], X2[:, 1], 'g+')
    plt.ylim(-1, 3)
    plt.show()
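# Note (not from the original snippet): scikit-learn's LDA yields at most
# min(n_features, n_classes - 1) discriminant directions, so with two classes the
# transform above is always a single column, whatever num_comp is requested.
# A minimal standalone sketch of that behaviour:
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.randn(20, 5), rng.randn(20, 5) + 2.0])
y_demo = np.array([0] * 20 + [1] * 20)

lda_demo = LinearDiscriminantAnalysis(n_components=1)  # two classes -> one component at most
print(lda_demo.fit_transform(X_demo, y_demo).shape)    # (40, 1)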
def computing_performance_LDA(in_path=None, seeds=list([0])):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].tolist()
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65, mean_u80 = 0, 0
    n_times = len(seeds)
    for k in range(0, n_times):
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=0.4, random_state=seeds[k])
        sum_u65, sum_u80 = 0, 0
        lda.fit(X_train, y_train)
        n, _ = X_test.shape
        for i, test in enumerate(X_test):
            evaluate = lda.predict([test])
            print("-----TESTING-----", i)
            if y_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        print("--k-->", k, sum_u65 / n, sum_u80 / n)
        mean_u65 += sum_u65 / n
        mean_u80 += sum_u80 / n
    print("--->", mean_u65 / n_times, mean_u80 / n_times)
def test_lda_predict():
    # Test LDA classification.
    # This checks that LDA implements fit and predict and returns correct
    # values for simple toy data.
    for test_case in solver_shrinkage:
        solver, shrinkage = test_case
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        y_pred = clf.fit(X, y).predict(X)
        assert_array_equal(y_pred, y, "solver %s" % solver)

        # Assert that it works with 1D data
        y_pred1 = clf.fit(X1, y).predict(X1)
        assert_array_equal(y_pred1, y, "solver %s" % solver)

        # Test probability estimates
        y_proba_pred1 = clf.predict_proba(X1)
        assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y,
                           "solver %s" % solver)
        y_log_proba_pred1 = clf.predict_log_proba(X1)
        assert_array_almost_equal(np.exp(y_log_proba_pred1), y_proba_pred1,
                                  8, "solver %s" % solver)

        # Primarily test for commit 2f34950 -- "reuse" of priors
        y_pred3 = clf.fit(X, y3).predict(X)
        # LDA shouldn't be able to separate those
        assert_true(np.any(y_pred3 != y3), "solver %s" % solver)

    # Test invalid shrinkages
    clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=-0.2231)
    assert_raises(ValueError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="eigen", shrinkage="dummy")
    assert_raises(ValueError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto")
    assert_raises(NotImplementedError, clf.fit, X, y)
    # Test unknown solver
    clf = LinearDiscriminantAnalysis(solver="dummy")
    assert_raises(ValueError, clf.fit, X, y)
def tuneSpatialFilters(self):
    print colors.MAGENTA
    num_total_spatial_filters = self.all_spatial_filters.shape[0]
    best_mean = 0
    best_num = 0
    best_score = None

    for i in xrange(num_total_spatial_filters):
        num_filters_to_try = i + 1
        print "trying with first", num_filters_to_try, "spatial filters"
        trial_X = self.extractFeatures(self.epochs, self.all_spatial_filters[:num_filters_to_try])
        lda = LinearDiscriminantAnalysis()
        lda = lda.fit(trial_X, self.y)
        cross_validation_folds = 10
        xval = cross_val_score(lda, trial_X, self.y, cv=cross_validation_folds)
        #print xval
        this_mean = xval.mean()
        print "mean", this_mean
        if this_mean > best_mean:
            best_mean = this_mean
            best_num = num_filters_to_try
            best_score = xval

    print "-----------------------------"
    print "best mean was", best_mean, "with", best_num, "filters used"
    print best_score
    print colors.ENDC
def performLDA(data_to_fit, y, numComponent=None):
    data_to_fit_np_t = np.array(data_to_fit).T
    if numComponent is None:
        numComponent = len(data_to_fit_np_t)
    lda_model = LinearDiscriminantAnalysis(n_components=numComponent)
    lda_results = lda_model.fit_transform(data_to_fit_np_t, y)
    return lda_model, lda_results
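# Illustrative usage sketch (hypothetical data, not from the original project):
# performLDA transposes its input, so data_to_fit is expected with samples as columns.
import numpy as np

measurements = np.hstack([np.random.randn(3, 6), np.random.randn(3, 6) + 1.0])  # 3 features x 12 samples
labels = np.array([0] * 6 + [1] * 6)

model, projected = performLDA(measurements, labels, numComponent=1)  # 2 classes -> 1 component
print(projected.shape)  # (12, 1)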
def test(self):
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    target_names = iris.target_names

    pca = PCA(n_components=3)
    X_r = pca.fit(X).transform(X)

    lda = LinearDiscriminantAnalysis(n_components=3)
    X_r2 = lda.fit(X, y).transform(X)

    # Percentage of variance explained for each components
    print('explained variance ratio (first two components): %s'
          % str(pca.explained_variance_ratio_))

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
        ax.scatter(X_r[y == i, 0], X_r[y == i, 1], zs=X[y == i, 2], c=c,
                   label=target_name)
    plt.legend()
    plt.title('PCA of IRIS dataset')

    fig2 = plt.figure()
    ax = fig2.add_subplot(111, projection='3d')
    for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
        ax.scatter(X_r2[y == i, 0], X_r2[y == i, 1], zs=X[y == i, 2], c=c,
                   label=target_name)
    plt.legend()
    plt.title('LDA of IRIS dataset')
    plt.show()
def visualize_lda2D(X, y):
    """
    Visualize the separation between classes using the two most discriminant features

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    """
    labels = ['Paid', 'Default']
    lda = LDA(n_components=2, solver='eigen')
    # lda = LDA(n_components=2)
    discriminative_attributes = lda.fit(X, y).transform(X)

    palette = sea.color_palette()
    # plt.plot(discriminative_attributes[:,0][y==0], 'sg', label="Paid", alpha=0.5)
    # plt.plot(discriminative_attributes[:,0][y==1], '^r', label="Default", alpha=0.5)
    plt.scatter(discriminative_attributes[:, 0][y == 0], discriminative_attributes[:, 1][y == 0],
                marker='s', color='green', label="Paid", alpha=0.5)
    plt.scatter(discriminative_attributes[:, 0][y == 1], discriminative_attributes[:, 1][y == 1],
                marker='^', color='red', label="Default", alpha=0.5)
    plt.xlabel('First Linear Discriminant')
    plt.ylabel('Second Linear Discriminant')

    leg = plt.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.title("Linear Discriminant Analysis")
    plt.tight_layout()

    # save fig
    output_dir = 'img'
    save_fig(output_dir, '{}/lda.png'.format(output_dir))
def plot_lda(features, labels):
    """
    Input
        features: features to get LDA and plot
        labels: labels of features
    Description
        plots the LDA of features
    """
    lda = LinearDiscriminantAnalysis(n_components=2)
    # note: this fits on the module-level `chroma` data rather than the
    # `features`/`labels` arguments declared in the signature
    new_features = lda.fit(chroma[0], chroma[1]).transform(chroma[0])

    colors = list("rgbykrgbyk")
    markers = list("xxxxxooooo")
    plt.figure(len(genres))  # for all together
    for i, genre in enumerate(genres):
        plt.figure(i)  # for one particular genre
        plt.scatter(new_features[i*num_songs:(i+1)*num_songs, 0],
                    new_features[i*num_songs:(i+1)*num_songs, 1],
                    c=colors[i], marker=markers[i], label=genre)
        plt.title(genre)
        plt.figure(len(genres))  # for all together
        plt.scatter(new_features[i*num_songs:(i+1)*num_songs, 0],
                    new_features[i*num_songs:(i+1)*num_songs, 1],
                    c=colors[i], marker=markers[i], label=genre)
    plt.legend()
    plt.title('LDA')
    plt.show()
def main():
    """Read Train/test log."""
    df = pd.read_csv("train.csv")

    # train/test split using stratified sampling
    labels = df['label']
    df = df.drop(['label'], 1)
    sss = StratifiedShuffleSplit(labels, 10, test_size=0.2, random_state=23)
    for train_index, test_index in sss:
        x_train, x_test = df.values[train_index], df.values[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

    # classification algorithm
    classification(x_train, y_train, x_test, y_test)

    # Predict Test Set
    favorite_clf = LinearDiscriminantAnalysis()
    favorite_clf.fit(x_train, y_train)
    test = pd.read_csv('test.csv')
    test_predictions = favorite_clf.predict(test)
    print test_predictions

    # Format DataFrame
    submission = pd.DataFrame(test_predictions, columns=['Label'])
    submission.tail()
    submission.insert(0, 'ImageId', np.arange(len(test_predictions)) + 1)
    submission.reset_index()
    submission.tail()

    # Export Submission
    submission.to_csv('submission.csv', index=False)
    submission.tail()
class LinearDiscriminantAnalysisPredictor(PredictorBase):
    '''
    Linear Discriminant Analysis
    '''

    def __init__(self, animal_type):
        self.animal_type = animal_type
        self.clf = LinearDiscriminantAnalysis()

    def fit(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        predictions = self.clf.predict_proba(X_test)
        predictions_df = self.bundle_predictions(predictions)
        return predictions_df

    def find_best_params(self):
        parameters = {'solver': ['svd', 'lsqr', 'eigen']}
        knn = LinearDiscriminantAnalysis()
        clf = grid_search.GridSearchCV(knn, parameters)
        train_data = get_data('../data/train.csv')
        train_data = select_features(train_data, self.animal_type)
        X = train_data.drop(['OutcomeType'], axis=1)
        y = train_data['OutcomeType']
        clf.fit(X, y)
        print clf.best_params_
def test_lda_orthogonality():
    # arrange four classes with their means in a kite-shaped pattern
    # the longer distance should be transformed to the first component, and
    # the shorter distance to the second component.
    means = np.array([[0, 0, -1], [0, 2, 0], [0, -2, 0], [0, 0, 5]])

    # We construct perfectly symmetric distributions, so the LDA can estimate
    # precise means.
    scatter = np.array([[0.1, 0, 0], [-0.1, 0, 0], [0, 0.1, 0], [0, -0.1, 0],
                        [0, 0, 0.1], [0, 0, -0.1]])

    X = (means[:, np.newaxis, :] + scatter[np.newaxis, :, :]).reshape((-1, 3))
    y = np.repeat(np.arange(means.shape[0]), scatter.shape[0])

    # Fit LDA and transform the means
    clf = LinearDiscriminantAnalysis(solver="svd").fit(X, y)
    means_transformed = clf.transform(means)

    d1 = means_transformed[3] - means_transformed[0]
    d2 = means_transformed[2] - means_transformed[1]
    d1 /= np.sqrt(np.sum(d1 ** 2))
    d2 /= np.sqrt(np.sum(d2 ** 2))

    # the transformed within-class covariance should be the identity matrix
    assert_almost_equal(np.cov(clf.transform(scatter).T), np.eye(2))

    # the means of classes 0 and 3 should lie on the first component
    assert_almost_equal(np.abs(np.dot(d1[:2], [1, 0])), 1.0)

    # the means of classes 1 and 2 should lie on the second component
    assert_almost_equal(np.abs(np.dot(d2[:2], [0, 1])), 1.0)
def computing_cv_accuracy_LDA(in_path=None, cv_n_fold=10):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    kf = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65, mean_u80 = 0, 0
    for idx_train, idx_test in kf.split(y):
        print("---k-FOLD-new-executing--")
        X_cv_train, y_cv_train = X[idx_train], y[idx_train]
        X_cv_test, y_cv_test = X[idx_test], y[idx_test]
        lda.fit(X_cv_train, y_cv_train)
        n_test = len(idx_test)
        sum_u65, sum_u80 = 0, 0
        for i, test in enumerate(X_cv_test):
            evaluate = lda.predict([test])
            print("-----TESTING-----", i)
            if y_cv_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    print("--->", mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
class LinearDiscriminantAnalysiscls(object):
    """docstring for ClassName"""

    def __init__(self):
        self.lda_cls = LinearDiscriminantAnalysis()
        self.prediction = None
        self.train_x = None
        self.train_y = None

    def train_model(self, train_x, train_y):
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.lda_cls.fit(train_x, train_y)
        except:
            print(traceback.format_exc())

    def predict(self, test_x):
        try:
            self.test_x = test_x
            self.prediction = self.lda_cls.predict(test_x)
            return self.prediction
        except:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        try:
            # return r2_score(test_y, self.prediction)
            return self.lda_cls.score(self.test_x, test_y)
        except:
            print(traceback.format_exc())
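# Minimal usage sketch for the wrapper above (synthetic data, illustrative only):
import numpy as np

X_demo = np.vstack([np.random.randn(30, 4), np.random.randn(30, 4) + 1.5])
y_demo = np.array([0] * 30 + [1] * 30)

model = LinearDiscriminantAnalysiscls()
model.train_model(X_demo, y_demo)
print(model.predict(X_demo)[:5])
print(model.accuracy_score(y_demo))  # scores against the most recent predict() input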
def plot_lda_only(filename, title, filename_fig):
    df = pd.read_csv(path + filename, names=['x1', 'x2', 'y'], header=None)
    fig = plt.figure()
    fig.suptitle(title, fontsize=20)
    columns_ls = []
    for column in df.columns:
        columns_ls.append(column)

    X = df[columns_ls[0:len(columns_ls) - 1]].values
    Y = df[columns_ls[len(columns_ls) - 1]].values

    clf_lda = LinearDiscriminantAnalysis()
    clf_lda.fit(X, Y)
    w = clf_lda.coef_[0]
    a = -w[0] / w[1]
    xx = np.linspace(-12, 34)
    yy = a * xx - clf_lda.intercept_[0] / w[1]
    plt.plot(xx, yy, color="blue", label="LDA decision boundary")
    print "Weights W0 %.2f and W1 %.2f" % (w[0], w[1])

    plt.text(0, 0, "Y=+1", fontsize=12)
    plt.text(10, -20, "Y=-1", fontsize=12)

    # plt.plot(xx, yy_down, 'k--')
    # plt.plot(xx, yy_up, 'k--')
    # plt.plot(xx, yy, color="black", label="svm decision boundary")

    plt.xlabel('X1', fontsize=18)
    plt.ylabel('X2', fontsize=16)
    # fig.savefig(filename_fig)

    # model = LogisticRegression()
    # model.fit(X, Y)
    # w = model.coef_[0]
    # a = -w[0]/w[1]
    # xx = np.linspace(-12, 34)
    # yy = a*xx - model.intercept_[0]/w[1]
    # plt.plot(xx, yy, label="logistic decision boundary")

    # clf_lda = LinearDiscriminantAnalysis()
    # clf_lda.fit(X, Y)
    # w = clf_lda.coef_[0]
    # a = -w[0]/w[1]
    # xx = np.linspace(-12, 34)
    # yy = a*xx - clf_lda.intercept_[0]/w[1]
    # plt.plot(xx, yy, color="blue", label="LDA decision boundary")

    # plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
    #             s=80, color='b')

    plt.scatter(X[:, 0], X[:, 1], c=Y)
    plt.axis('tight')
    plt.legend()
    plt.show()
def feature_distribute_4_projection(channel_length=4, projection='pca'):
    '''
    2D projected distribution of the four-feature combinations for the six actions,
    34 channels in total
    '''
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
    markers = ['o', '+', 'v', '^', '*', 'x']
    sample_len = 100
    subjects = ['subject_' + str(i + 1) for i in range(2)]  # subjects
    # subjects = ['subject_1']
    for subject in subjects:
        title_pre = subject + '_feature_class_'
        channel_num = 34  # channels
        # channel_num = 18
        # channel_num = 1
        for channel in range(channel_num):
            feature_list = ['MAV', 'ZC', 'SSC', 'WL']
            # feature_list = ['MAV']
            actions = [i + 1 for i in range(6)]  # actions
            # actions = [1, 2]
            fig = plt.figure(figsize=(8, 6))
            ax = fig.add_subplot()
            trains = np.array([])
            targets = np.array([], np.int)
            for action in actions:
                filename = title_pre + str(action)
                feature = np.load(
                    root_path + '/train1_250_100/' + filename + '.npy')
                train = feature[:sample_len,
                                channel * channel_length: channel * channel_length + 4]
                target = np.ones(train.shape[0], np.int) * action
                # print train.shape, target.shape, target[0, 0:5]
                trains = np.concatenate((trains, train), axis=None)
                targets = np.concatenate((targets, target), axis=None)
                # sys.exit(0)
            trains = trains.reshape((-1, 4))
            # print trains.shape, targets.shape
            if projection == 'pca':
                pca = PCA(n_components=2)
                X_r = pca.fit(trains).transform(trains)
            elif projection == 'lda':
                lda = LinearDiscriminantAnalysis(n_components=2)
                X_r = lda.fit(trains, targets).transform(trains)
            for action in actions:
                plt.scatter(X_r[targets == action, 0], X_r[targets == action, 1],
                            c=colors[action], marker=markers[action % 1],
                            alpha=0.5, label=action)
            plt.legend()
            plt.title(subject + '-channel_' + str(channel) + '-' + projection + '-TD4')
            # plt.show()
            plt.savefig(
                'result/figure/distribute4_proj/' + subject + '-channel_' +
                str(channel) + '-' + projection + '-TD4', dpi=120)
            plt.close()
def doLDA(x, digits, s):
    myLDA = LDA()
    myLDA.fit(x.PCA[:, :s], digits.train_Labels)
    newtest = digits.test_Images - x.centers
    # the next statement was garbled in the source; the projection of the centered
    # test images onto the first s principal directions is assumed to be intended
    newtest = newtest @ np.transpose(x.V[:s, :])
    labels = myLDA.predict(newtest)
    errors = class_error_rate(labels.reshape(1, labels.shape[0]), digits.test_Labels)
    return errors
def Train(enhancedGeneSet, classLabels):
    enhancedGeneSet = np.array(enhancedGeneSet)
    classLabels = np.array(classLabels)
    classifier = LinearDiscriminantAnalysis()
    classifier.fit(enhancedGeneSet, classLabels)
    # del enhancedGeneSet
    # del classLabels
    return classifier
def lda(X, y, n):
    '''
    Returns optimal projection of the data
    LDA with n components
    '''
    selector = LinearDiscriminantAnalysis(n_components=n)
    selector.fit(X, y)
    return selector.transform(X), y
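# Quick usage sketch on the iris data (assumes LinearDiscriminantAnalysis is imported
# as in the snippet above); with 3 classes, at most 2 components are available.
from sklearn.datasets import load_iris

iris = load_iris()
X_proj, y_iris = lda(iris.data, iris.target, 2)
print(X_proj.shape)  # (150, 2)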
def train_model(self):
    ### Train spectrum data
    # form training data and labels
    X = np.empty((0, self.freq_cutoff), int)
    y = np.empty((0, 1), int)
    data_dir = 'clap_data/claps/spectrum/'
    for fname in os.listdir(data_dir):
        data = np.load("%s%s" % (data_dir, fname))
        X = np.append(X, data, axis=0)
        y = np.append(y, [1] * data.shape[0])

    data_dir = 'clap_data/noclaps/spectrum/'
    for fname in os.listdir(data_dir):
        data = np.load("%s%s" % (data_dir, fname))
        X = np.append(X, data, axis=0)
        y = np.append(y, [0] * data.shape[0])

    # pca = PCA(n_components=200)
    # X_pca = pca.fit_transform(X)

    # fit the model
    # clf = LogisticRegression(penalty='l1')
    clf = LinearDiscriminantAnalysis()
    clf.fit(X, y)
    preds = clf.predict(X)

    # X_new = clf.transform(X)
    # clf2 = LinearDiscriminantAnalysis()
    # clf2.fit(X_new, y)
    # preds2 = clf2.predict(X_new)

    # print X.shape, X_pca.shape
    print preds
    print np.sum(preds), preds.size
    # print preds2, np.sum(preds2)

    # save model
    pickle.dump(clf, open(clap_model_dir + clap_classifier_fname, 'w'))
    self.clap_clf = clf

    ### Train decay data
    X = np.empty((0, self.decay_samples/10), int)
    data_dir = 'clap_data/claps/decay/'
    for fname in os.listdir(data_dir):
        if fname.endswith('npy'):
            data = np.load("%s%s" % (data_dir, fname))
            print data.shape, X.shape
            X = np.append(X, data, axis=0)
    print X.shape
    X_avg = np.mean(X, axis=0)
    plt.plot(X_avg)
    plt.show()

    # Average decay data
    np.save('%s%s' % (clap_model_dir, clap_decay_model_fname), X_avg)
def plot_lda(X, y):
    colors = ['b', 'r']
    lda = LinearDiscriminantAnalysis(n_components=2)
    X_r = lda.fit(X, y).transform(X)

    plt.figure()
    for i, c in enumerate(colors):
        plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=str(i))
    plt.legend()
    plt.title('LDA')
def _get_lda(self, data, variables):
    domain = Domain(attributes=variables, class_vars=data.domain.class_vars)
    data = data.transform(domain)
    lda = LinearDiscriminantAnalysis(solver='eigen', n_components=2)
    lda.fit(data.X, data.Y)
    scalings = lda.scalings_[:, :2].T
    if scalings.shape == (1, 1):
        scalings = np.array([[1.], [0.]])
    return scalings
def test_raises_value_error_on_same_number_of_classes_and_samples(solver):
    """
    Tests that if the number of samples equals the number of classes, a
    ValueError is raised.
    """
    X = np.array([[0.5, 0.6], [0.6, 0.5]])
    y = np.array(["a", "b"])
    clf = LinearDiscriminantAnalysis(solver=solver)
    with pytest.raises(ValueError, match="The number of samples must be more"):
        clf.fit(X, y)
def feature_scaling(feature_matrix, target, reductor=None, scaler=None):
    lda = LDA(n_components=2)
    minmax = MinMaxScaler(feature_range=(-1, 1))
    if not reductor:
        reductor = lda.fit(feature_matrix, target)
    feature_matrix_lda = reductor.transform(feature_matrix)
    if not scaler:
        scaler = minmax.fit(feature_matrix_lda)
    feature_matrix_scaled = scaler.transform(feature_matrix_lda)
    return feature_matrix_scaled, reductor, scaler
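# Sketch of the intended call pattern (train/test variables are illustrative):
# fit the LDA reductor and the min-max scaler on the training split only, then
# reuse the fitted objects on the test split so no test information leaks in.
X_train_scaled, reductor, scaler = feature_scaling(X_train, y_train)
X_test_scaled, _, _ = feature_scaling(X_test, y_test, reductor=reductor, scaler=scaler)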
def self_tune(self, X, y, verbose=False):
    # fix random seed for reproducibility
    seed = 5
    np.random.seed(seed)

    # define k-fold cross validation test harness
    kfold = StratifiedKFold(y=y, n_folds=self.tuning_csp_num_folds, shuffle=True, random_state=seed)

    # init scores
    cvscores = {}
    for i in xrange(1, self.num_spatial_filters):
        cvscores[i + 1] = 0

    for i, (train, test) in enumerate(kfold):
        # calculate CSP spatial filters
        csp = CSP(n_components=self.num_spatial_filters)
        csp.fit(X[train], y[train])

        # try all filters, from the given num down to 2
        # (1 is too often found to be overfitting)
        for j in xrange(2, self.num_spatial_filters):
            num_filters_to_try = j

            # calculate spatial filters
            csp.n_components = num_filters_to_try
            # apply CSP filters to train data
            tuning_train_LDA_features = csp.transform(X[train])
            np.nan_to_num(tuning_train_LDA_features)
            check_X_y(tuning_train_LDA_features, y[train])

            # apply CSP filters to test data
            tuning_test_LDA_features = csp.transform(X[test])
            np.nan_to_num(tuning_test_LDA_features)
            check_X_y(tuning_test_LDA_features, y[test])

            # train LDA
            lda = LinearDiscriminantAnalysis()
            prediction_score = lda.fit(tuning_train_LDA_features, y[train]).score(tuning_test_LDA_features, y[test])

            cvscores[num_filters_to_try] += prediction_score

            if verbose:
                print "prediction score", prediction_score, "with", num_filters_to_try, "spatial filters"

    best_num = max(cvscores, key=cvscores.get)
    best_score = cvscores[best_num] / (i + 1)
    if verbose:
        print "best num filters:", best_num, "(average accuracy ", best_score, ")"
        print "average scores per filter num:"
        for k in cvscores:
            print k, ":", cvscores[k] / (i + 1)

    return [best_num, best_score]
def assess_embedding(to_vec):
    """
    Returns LDA classification score and projected data
    """
    (x_data, y_data) = get_x_y_matrices(to_vec)
    lda = LDA(n_components=2)
    x_prime = lda.fit_transform(x_data, y_data)
    score = lda.score(x_data, y_data)
    return (x_prime.reshape(26, ), y_data, score)
def test_lda_explained_variance_ratio():
    # Test if the sum of the normalized eigenvalues equals 1
    n_features = 2
    n_classes = 2
    n_samples = 1000
    X, y = make_blobs(n_samples=n_samples, n_features=n_features,
                      centers=n_classes, random_state=11)

    clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")
    clf_lda_eigen.fit(X, y)

    assert_almost_equal(clf_lda_eigen.explained_variance_ratio_.sum(), 1.0, 3)
def LD(pth):
    train_desc = np.load(pth + '/training_features.npy')
    nbr_occurences = np.sum((train_desc > 0) * 1, axis=0)
    idf = np.array(np.log((1.0 * len(image_paths) + 1) / (1.0 * nbr_occurences + 1)), 'float32')

    # Scaling the words
    stdSlr = StandardScaler().fit(train_desc)
    train_desc = stdSlr.transform(train_desc)

    modelLD = LinearDiscriminantAnalysis()
    modelLD.fit(train_desc, np.array(train_labels))
    joblib.dump((modelLD, img_classes, stdSlr), pth + "/ld-bof.pkl", compress=3)
    test(pth, "ld-")
def testEvaluateLDA(self, trCList, teCList):
    # LDA object
    clf = LinearDiscriminantAnalysis()

    # fit lda model using training chromosomes
    clf.fit(numpy.asarray(trCList), numpy.asarray(trainGroupings))

    predicted = clf.predict(teCList)

    self.confusionMatrix(testGroupings, predicted, 'lda_test')

    # return precision ([0]), recall ([1]) or f1 score ([2]); replace with
    # clf.score(numpy.asarray(teCList), testGroupings) for accuracy
    return precision_recall_fscore_support(testGroupings, predicted, average='weighted')[2]  # fitness for test set
def PCA_plot(D, TFS, EXPS, A, toEXPS, toTFS):
    A = A.T
    pca = sd.PCA(n_components=2)
    X_r = pca.fit(A).transform(A)

    F = plt.figure(figsize=(15, 10))
    ax = F.add_subplot(111)

    y = [get_color(toEXPS[i], EXPS, BINARY=True) for i in range(X_r.shape[0])]
    lda = LinearDiscriminantAnalysis(n_components=2)
    X_r2 = lda.fit(D, y).transform(D)

    ax.scatter(X_r[:, 0], X_r[:, 1],
               c=[get_color(toEXPS[i], EXPS) for i in range(X_r.shape[0])], s=150)
    plt.show()
def transformLDA(X, y, xTest):
    originalSize = np.size(X, 1)
    print("Learning LDA \nProjecting {} features to 1 component".format(originalSize))
    priors = [0.5, 0.5]
    clf = LinearDiscriminantAnalysis('svd', n_components=1, priors=priors)
    print(X.shape)
    X = clf.fit_transform(X, y)
    print("True size of X : ", X.shape)

    if xTest != []:
        xTest = clf.transform(xTest)

    return X, xTest
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Loading data from Iris
data = load_iris()
# Setting data
x = data.data
# Setting target
y = data.target
# Giving test and train samples
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=12)
# Calling Linear Discriminant method
lindiscamodel = LinearDiscriminantAnalysis()
# Calling Logistic Regression method
logregmodel = LogisticRegression()
# Fitting the train data
lindiscamodel.fit(train_x, train_y)
# Predicting the Test data
liprediction = lindiscamodel.predict(test_x)
# Accuracy for linear discriminant analysis
print("Accuracy for linear discriminant analysis is ", accuracy_score(liprediction, test_y))
# Fitting logistic regression model
logregmodel.fit(train_x, train_y)
# Prediction with logistic regression model
loprediction = logregmodel.predict(test_x)
# Logistic regression accuracy
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

# print(X_train)
# print Y_train
# print("n_components size: %d" % X_train.size)
# print("n_components len: %d" % len(X_train))
# print("n_components index 0: %d" % len(X_train[0]))

# pca = PCA(n_components = (len(X_train)-2))
# pca.fit(X_train)
# X_train = pca.transform(X_train)

# Run LDA
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, Y_train)
# lda.fit(zip(*(shuffle(X_train, Y_train))))

# project training data onto the found axes so we can look later at plots of how it is discriminating
X_trans = lda.transform(X_train)
# Transform validation set onto these new axes
X_trans2 = lda.transform(X_validation)

# Make predictions as to what each data row is in the validation set
predictions = lda.predict(X_validation)

# look at accuracy of predictions
for i in range(Y_validation.size):
    if predictions[i] == Y_validation[i]:
time_test_svm = (time.clock() - start)

# test knn classifier
parameters_knn = {'n_neighbors': [2, 10, 5, 20, 50, 100]}
start = time.clock()
best_knn = get_best_model(KNeighborsClassifier(), parameters_knn, X_train_top, y_train)
time_train_knn = (time.clock() - start)
start = time.clock()
score_test_knn = best_knn.score(X_test_top, y_test)
time_test_knn = (time.clock() - start)

# test LDA classifier
parameters_lda = {'solver': ('svd', 'lsqr', 'eigen')}
start = time.clock()
best_lda = get_best_model(LinearDiscriminantAnalysis(), parameters_lda, X_train_top, y_train)
time_train_lda = (time.clock() - start)
start = time.clock()
score_test_lda = best_lda.score(X_test_top, y_test)
time_test_lda = (time.clock() - start)

# test decision tree classifier
parameters_dtree = {'criterion': ('gini', 'entropy')}
start = time.clock()
best_dtree = get_best_model(DecisionTreeClassifier(), parameters_dtree, X_train_top, y_train)
time_train_dtree = (time.clock() - start)
start = time.clock()
score_test_dtree = best_dtree.score(X_test_top, y_test)
time_test_dtree = (time.clock() - start)
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size, random_state=seed)
print("X_train :", len(X_train), "\nX_validation :", len(X_validation))

# test harness
# we split our data into 10 sub-parts (10 folds)
# the model trains on 9 of them and tests on the remaining fold
# it iterates this over every existing combination of folds

# build the models
# we do this with several models (6) in order to pick the most performant/relevant one,
# with the same data handling (folds) for every model, so that they stay comparable
models = []
models.append(('LogisticRegression', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
models.append(('+ProcheVoisins', KNeighborsClassifier()))
models.append(('ArbreDecision', DecisionTreeClassifier()))
models.append(('NaiveBayes', GaussianNB()))
models.append(('VecteurDeSupport', SVC(gamma='auto')))

# evaluate each model
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=10, random_state=seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        SBi = Ni * np.mat(ui - u).T * np.mat(ui - u)
        SB += SBi

    S = np.linalg.inv(Sw) * SB
    eigVals, eigVects = np.linalg.eig(S)  # compute eigenvalues and eigenvectors
    eigValInd = np.argsort(eigVals)
    eigValInd = eigValInd[:(-n_dim - 1):-1]
    w = eigVects[:, eigValInd]
    data_ndim = np.dot(data, w)

    return data_ndim


if __name__ == '__main__':
    iris = load_iris()
    X = iris.data
    Y = iris.target
    data_1 = lda(X, Y, 2)

    data_2 = LinearDiscriminantAnalysis(n_components=2).fit_transform(X, Y)

    plt.figure(figsize=(8, 4))
    plt.subplot(121)
    plt.title("my_LDA")
    plt.scatter(data_1[:, 0], data_1[:, 1], c=Y)

    plt.subplot(122)
    plt.title("sklearn_LDA")
    plt.scatter(data_2[:, 0], data_2[:, 1], c=Y)
    plt.savefig("LDA.png")
    plt.show()
# get the directory that contains the images in MNIST format
path = os.path.join("images-mnist")

# pass the directory to the MNIST helper to work with the images
data = MNIST(path)

print("Loading dataset")
trainImages, trainLabels = data.load_training()  # load the training images
testImages, testLabels = data.load_testing()  # load the test images
print("Dataset is loaded")

tempoInicial = time.time()

lda = LinearDiscriminantAnalysis()  # define the LDA model
print("Training LDA")
lda.fit(trainImages, trainLabels)
ldaResult = lda.predict(testImages)
#print_report(predictions, testLabels)
printResul(ldaResult)

tempoAux = time.time()
tempo(int(tempoAux - tempoInicial))
print(int(tempoAux - tempoInicial))

k = 1
resultKnn = list()
knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
number_principal_components = pca.n_components_
MSE_PCA_train = 1 - pca.explained_variance_ratio_.sum()
pca_output = "Principal Component Analysis (PCA) \n\n" \
             "Explained Variance: " + str(round(pca.explained_variance_ratio_.sum(), 3)) + \
             " \nNumber of Principal Components: " + str(number_principal_components) + "\n" + \
             "MSE (training data): " + str(round(MSE_PCA_train, 3)) + "\n" + line_str
print(pca_output)

# calculate the PCs for the test data as well
principal_components_test = pca.transform(df_test_input_)

# _______________________________________________________________________________________________________________________
# TASK 6: Fisher Discriminant

fisher = LinearDiscriminantAnalysis()
fisher.fit(df_train_input_, df_train_output['class'].values)
fisher_components_train = fisher.transform(df_train_input_)
fisher_components_test = fisher.transform(df_test_input_)

# Adding Fisher Discriminant Analysis to PCA
fisher_pca = LinearDiscriminantAnalysis()
fisher_pca.fit(principal_components_train, df_train_output['class'].values)
fisher_pca_components_train = fisher_pca.transform(principal_components_train)
fisher_pca_components_test = fisher_pca.transform(principal_components_test)

print("\n" + line_str)

# _______________________________________________________________________________________________________________________
# TASK 4: MLP
print(stats.f_oneway(compra_sim['durabilid'], compra_nao['durabilid']))
print(stats.f_oneway(compra_sim['estilo'], compra_nao['estilo']))

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X = compra_xls[['durabilid', 'desempenh', 'estilo']]
y = compra_xls['compra'] == 'sim'
print(y)

clf = LinearDiscriminantAnalysis()
clf.fit(X, y)

print(clf.decision_function(X))
print(clf.score(X, y))

y_ = clf.predict(X)

print(clf)
print(clf.score(X, y))
print(clf.coef_, clf.intercept_)

comprapredic = pd.read_csv("comprapredic.csv", header=0, sep=";")
X2 = comprapredic[['durabilid', 'desempenh', 'estilo']]
clf.predict(X2)
                    preload=True)
epochs_train = epochs.copy().crop(tmin=1., tmax=2.)
labels = epochs.events[:, -1] - 2

# %%
# Classification with linear discriminant analysis

# Define a monte-carlo cross-validation generator (reduce variance):
scores = []
epochs_data = epochs.get_data()
epochs_data_train = epochs_train.get_data()
cv = ShuffleSplit(10, test_size=0.2, random_state=42)
cv_split = cv.split(epochs_data_train)

# Assemble a classifier
lda = LinearDiscriminantAnalysis()
csp = CSP(n_components=4, reg=None, log=True, norm_trace=False)

# Use scikit-learn Pipeline with cross_val_score function
clf = Pipeline([('CSP', csp), ('LDA', lda)])
scores = cross_val_score(clf, epochs_data_train, labels, cv=cv, n_jobs=1)
a = csp.fit(epochs_data_train)
a = csp.fit_transform(epochs_data_train)

# Printing the results
class_balance = np.mean(labels == labels[0])
class_balance = max(class_balance, 1. - class_balance)
print("Classification accuracy: %f / Chance level: %f" % (np.mean(scores), class_balance))
nnscores = Neural(x1_n, y1)
MAXITER = 20
nn_trnscores = np.tile(nnscores[0], MAXITER)
nn_tstscores = np.tile(nnscores[1], MAXITER)
nn_time = np.tile(nnscores[2], MAXITER)

data1 = np.array(x1_n)

# ----------------------> APPLY CLUSTERING
# independent------------>
newkmdata, newemdata, newldadata = [], [], []

km = KMeans(n_clusters=2, random_state=0).fit(data1)
kmdata = km.labels_
em = GM(n_components=2, random_state=0).fit(data1)
emdata = em.predict(data1)
lda = LDA(n_components=2).fit(data1, y1)
data1_lda = lda.transform(data1)

x1_nn = x1_n.tolist()
for i in range(len(x1_nn)):
    newkm = (x1_nn[i])
    kmdatai = int(kmdata[i])
    newkm.extend([kmdatai])
    newkmdata.append(newkm)

x1_nn = x1_n.tolist()
for i in range(len(x1_nn)):
    newem = (x1_nn[i])
    emdatai = int(emdata[i])
    newem.extend([emdatai])
    newemdata.append(newem)
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.mixture import GaussianMixture
#from sklearn.cluster import AgglomerativeClustering

classifiers = [
    KNeighborsClassifier(3),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GaussianMixture(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression()
]

log_cols = ["Classifier", "Accuracy"]
log = pd.DataFrame(columns=log_cols)

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

X = train[0::, 1::]
y = train[0::, 0]

acc_dict = {}

for train_index, test_index in sss.split(X, y):
# In[4]:

# Standardization
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


# # LDA

# In[5]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=2)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)


# In[6]:

# Run the classification
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)


# In[7]:

y_pred = classifier.predict(X_test)
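# A possible follow-up cell (not part of the original notebook): evaluate the
# predictions from the cell above with a confusion matrix and an accuracy score.
from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))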
# Modeling step: test different algorithms
random_state = 2
classifiers = []
classifiers.append(SVC(random_state=random_state))
classifiers.append(DecisionTreeClassifier(random_state=random_state))
classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),
                                      random_state=random_state, learning_rate=0.1))
classifiers.append(RandomForestClassifier(random_state=random_state))
classifiers.append(ExtraTreesClassifier(random_state=random_state))
classifiers.append(GradientBoostingClassifier(random_state=random_state))
classifiers.append(MLPClassifier(random_state=random_state))
classifiers.append(KNeighborsClassifier())
classifiers.append(LogisticRegression(random_state=random_state))
classifiers.append(LinearDiscriminantAnalysis())

cv_results = []
for classifier in classifiers:
    cv_results.append(cross_val_score(classifier, X_train, y=Y_train,
                                      scoring="accuracy", cv=kfold, n_jobs=4))

cv_means = []
cv_std = []
for cv_result in cv_results:
    cv_means.append(cv_result.mean())
def train_and_test(filename, class_field):
    attrs, classes = prepare_ds(filename, class_field)

    corr_scatter = []
    for cls, color in zip(range(1, 4), ('red', 'green', 'blue')):
        attr_one = attrs[:, 0][classes == cls]
        attr_two = attrs[:, 1][classes == cls]
        p = pearsonr(attr_one, attr_two)
        corr_scatter.append({
            "x": attr_one.tolist(),
            "y": attr_two.tolist(),
            "p": p[0]
        })
        # plt.scatter(x=attr_one, y=attr_two, marker='o', color=color,
        #             label='cls: {:}, pearsonr={:.2f}'.format(cls, p[0]))

    # plt.title('Pearson correlation')
    # plt.xlabel('Elevation, m')
    # plt.ylabel('Slope, num')
    # plt.legend(loc='upper right')
    # plt.show()

    data_train, data_test, class_train, class_test = train_test_split(
        attrs, classes, test_size=.3, random_state=123,
    )

    lda = LDA(n_components=2)
    lda_transform = lda.fit_transform(data_train, class_train)

    lda_scatter = []
    # plt.figure(figsize=(10, 8))
    for cls, color in zip(range(1, 4), ('red', 'green', 'blue')):
        attr_one = lda_transform[:, 0][class_train == cls]
        attr_two = lda_transform[:, 1][class_train == cls]
        lda_scatter.append({"x": attr_one.tolist(), "y": attr_two.tolist()})
        # plt.scatter(x=attr_one, y=attr_two, marker='o', color=color,
        #             label='cls: {:}'.format(cls))

    # plt.xlabel('vec 1')
    # plt.ylabel('vec 2')
    # plt.legend()
    # plt.show()

    lda_clf = LDA()
    lda_clf.fit(data_train, class_train)
    pred_train_lda = lda_clf.predict(data_train)
    print('Classification accuracy on the training data set (LDA): {:.2%}'.
          format(metrics.accuracy_score(class_train, pred_train_lda)))
    pred_test_lda = lda_clf.predict(data_test)
    print('Classification accuracy on the test data set (LDA): {:.2%}'.
          format(metrics.accuracy_score(class_test, pred_test_lda)))

    qda_clf = QuadraticDiscriminantAnalysis()
    qda_clf.fit(data_train, class_train)
    pred_train_qda = qda_clf.predict(data_train)
    print('Classification accuracy on the training data set (QDA): {:.2%}'.
          format(metrics.accuracy_score(class_train, pred_train_qda)))
    pred_test_qda = qda_clf.predict(data_test)
    print('Classification accuracy on the test data set (QDA): {:.2%}'.
          format(metrics.accuracy_score(class_test, pred_test_qda)))

    return corr_scatter, lda_scatter
def compare_svm(paths, orientation1, orientation2, save_path, aliases,
                lda_comp, pca_comp, title, c, samplingRate=32000):
    if aliases is None:
        text = paths
    else:
        text = aliases
    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    for index, path in enumerate(paths):
        for dirpath, _, file_paths in os.walk(path):
            label_names = []
            s = []
            file_paths.sort(
                lambda x, y: cmp(int(x.split('_')[-1]), int(y.split('_')[-1])))
            for fp in file_paths:
                file_path = os.path.join(dirpath, fp)
                data = pd.read_csv(file_path, header=None)
                raw_data = data.iloc[:, :-1].values
                Y = data.iloc[:, -1].values
                if raw_data.shape[1] != 1:
                    if lda_comp is not None:
                        lda = LDA(n_components=lda_comp)
                        X = lda.fit_transform(raw_data, Y)
                    elif pca_comp is not None:
                        pca = PCA(n_components=pca_comp)
                        X = pca.fit(raw_data)
                    else:
                        X = raw_data
                interval_splits = [int(y) for y in file_path.split('_')[-2:]]
                for i in range(0, len(interval_splits), 2):
                    first = interval_splits[i] / float(samplingRate)
                    second = interval_splits[i + 1] / float(samplingRate)
                    if interval_splits[i + 1] == 117574:
                        label_names.append(''.join([
                            str('{0:.3f}'.format(first)), 's', '\n-\n',
                            str('{0:.3f}'.format(second)), 's', '\nStim OFF']))
                    elif interval_splits[i] == 0:
                        label_names.append(''.join([
                            str('{0:.3f}'.format(first)), 's', '\n-\n',
                            str('{0:.3f}'.format(second)), 's', '\nTrial Start']))
                    elif interval_splits[i] == 32066:
                        label_names.append(''.join([
                            str('{0:.3f}'.format(first)), 's', '\n-\n',
                            str('{0:.3f}'.format(second)), 's', '\nStim ON']))
                    elif interval_splits[i + 1] == 133606:
                        label_names.append(''.join([
                            str('{0:.3f}'.format(first)), 's', '\n-\n',
                            str('{0:.3f}'.format(second)), 's', '\nTrial END']))
                    else:
                        label_names.append(''.join([
                            str('{0:.3f}'.format(first)), 's', '\n-\n',
                            str('{0:.3f}'.format(second)), 's']))
                scaler = StandardScaler()
                X_norm = scaler.fit_transform(X)
                new_x = X_norm[np.logical_or(Y == orientation1, Y == orientation2)]
                new_y = Y[np.logical_or(Y == orientation1, Y == orientation2)]
                score = 0
                time = 0
                skf = StratifiedKFold(n_splits=2, shuffle=True)
                for x in range(0, 5000):
                    for i, (train, test) in enumerate(skf.split(new_x, new_y)):
                        xtrain, xval = new_x[train], new_x[test]
                        ytrain, yval = new_y[train], new_y[test]
                        clf = Pipeline([('scaler', StandardScaler()),
                                        ('SVM', svm.SVC(kernel='linear', C=1))])
                        clf.fit(xtrain, ytrain)
                        time += 1
                        score += clf.score(xval, yval)
                s.append(score / time)
            if c is None:
                color = np.random.rand(3)
                ax1.plot(label_names, s, marker='o', label=text[index], color=color)
            else:
                ax1.plot(label_names, s, marker='o', label=text[index], color=c[index])
    plt.xlabel("Time")
    plt.ylabel("Accuracy")
    ax1.set_ylim(0.3, 1.0)
    plt.legend()
    fig.set_size_inches(28, 12, forward=True)
    if title is not None:
        ax1.set_title(title)
    if save_path is not None:
        plt.savefig(save_path)
    else:
        plt.show()
for train_index, test_index in kfold.split(data_opto_SOM, target):
    ## Opto SOM ##
    x_train, x_test = data_opto_SOM[train_index, :], data_opto_SOM[test_index, :]
    y_train, y_test = target[train_index], target[test_index]

    mul_lr = LogisticRegression(multi_class='multinomial', solver='newton-cg', max_iter=max_i)
    mul_lr.fit(x_train, y_train)
    score_opto_SOM_LR[n, f] = mul_lr.score(x_test, y_test) * 100
    print(mul_lr.score(x_test, y_test))

    clf = NearestCentroid(metric='euclidean', shrink_threshold=None)
    clf.fit(x_train, y_train)
    score_opto_SOM_NN[n, f] = clf.score(x_test, y_test) * 100

    lda = LinearDiscriminantAnalysis(solver='svd')
    lda.fit(x_train, y_train)
    score_opto_SOM_LDA[n, f] = lda.score(x_test, y_test) * 100
    print(lda.score(x_test, y_test))

    svm_algo = svm.SVC(decision_function_shape='ovo', kernel='linear')
    svm_algo.fit(x_train, y_train)
    score_opto_SOM_SVM[n, f] = svm_algo.score(x_test, y_test) * 100

    ## Opto PV ##
    x_train, x_test = data_opto_PV[train_index, :], data_opto_PV[test_index, :]
    y_train, y_test = target[train_index], target[test_index]

    mul_lr = LogisticRegression(multi_class='multinomial', solver='newton-cg', max_iter=max_i)
    mul_lr.fit(x_train, y_train)
    score_opto_PV_LR[n, f] = mul_lr.score(x_test, y_test) * 100
    print(mul_lr.score(x_test, y_test))
                             data_size=len(train_targets),
                             input_data=train_data,
                             width=sample_width)
filter_feature = GetNonTargetsAverage(train_inputs, train_targets)
train_inputs = ApplySpecialFilter(train_inputs, filter_feature, reshaped=True)
print(train_inputs.shape)

# Modeling
train_targets = np.array(train_targets)
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(train_inputs, train_targets)
sel_model = SelectFromModel(lsvc, prefit=True)
train_inputs = sel_model.transform(train_inputs)
print(train_inputs.shape)
model = LinearDiscriminantAnalysis()
model.fit(train_inputs, train_targets)

# Prediction
test_events = BasicDataProcess.LoadDataFromFile(data_dir + "/Test/testEvents.txt")
test_data = BasicDataProcess.LoadEEGFromFile(data_dir, False)
test_inputs = PreprocessData(data_dir,
                             filter_applied=True,
                             pca_applied=False,
                             pca_threshold=20,
                             reshaped=False,
                             data_size=len(test_events),
                             input_data=test_data,
                             width=sample_width)
test_inputs = ApplySpecialFilter(test_inputs,
def run_LDA(DataPath, LabelsPath, CV_RDataPath, OutputDir):
    '''
    run baseline classifier: LDA
    Wrapper script to run an LDA classifier on a benchmark dataset with 5-fold
    cross validation, outputs lists of true and predicted cell labels as csv files,
    as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from
    Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    '''

    # read the Rdata file
    ro.r['load'](CV_RDataPath)

    nfolds = np.array(ro.r['n_folds'], dtype='int')
    tokeep = np.array(ro.r['Cells_to_Keep'], dtype='bool')
    col = np.array(ro.r['col_Index'], dtype='int')
    col = col - 1
    test_ind = ro.r['Test_Idx']
    train_ind = ro.r['Train_Idx']

    # read the data
    data = ro.r['readRDS'](DataPath)
    data = pd.DataFrame(dgc_to_csr(data).toarray()).T
    labels = pd.read_csv(LabelsPath, header=0, index_col=None, sep='\t', usecols=col)
    # print(len(data))
    # print(labels)
    # print(len(tokeep))

    labels = labels.iloc[tokeep]
    data = data.iloc[tokeep]

    # normalize data
    data = np.log1p(data)

    Classifier = LinearDiscriminantAnalysis()

    tr_time = []
    ts_time = []
    truelab = []
    pred = []

    for i in range(np.squeeze(nfolds)):
        test_ind_i = np.array(test_ind[i], dtype='int') - 1
        train_ind_i = np.array(train_ind[i], dtype='int') - 1

        train = data.iloc[train_ind_i]
        test = data.iloc[test_ind_i]
        y_train = labels.iloc[train_ind_i]
        y_test = labels.iloc[test_ind_i]

        start = tm.time()
        Classifier.fit(train, y_train)
        tr_time.append(tm.time() - start)

        start = tm.time()
        predicted = Classifier.predict(test)
        ts_time.append(tm.time() - start)

        truelab.extend(y_test.values)
        pred.extend(predicted)
    print(len(pred))

    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)
    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)
    # print(len(tr_time))

    OutputDir = Path(OutputDir)
    os.makedirs(Path(OutputDir), exist_ok=True)
    truelab.to_csv(str(OutputDir / Path("LDA_true.csv")), index=False)
    pred.to_csv(str(OutputDir / Path("LDA_pred.csv")), index=False)
    tr_time.to_csv(str(OutputDir / Path("LDA_training_time.csv")), index=False)
    ts_time.to_csv(str(OutputDir / Path("LDA_test_time.csv")), index=False)
scatter_matrix(dataset)
# pyplot.show()

# split the data set
array = dataset.values
X = array[:, 0:4]
Y = array[:, 4]
validation_size = 0.2
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size, random_state=seed)

# algorithm spot-check:
# logistic regression, linear discriminant analysis, k-nearest neighbors,
# classification and regression tree, naive Bayes classifier, support vector machine
models = {}
models['LR'] = LogisticRegression()
models['LDA'] = LinearDiscriminantAnalysis()
models['KNN'] = KNeighborsClassifier()
models['CART'] = DecisionTreeClassifier()
models['NB'] = GaussianNB()
models['SVM'] = SVC()

# evaluate the algorithms
result = []
for key in models:
    kfold = KFold(n_splits=10, random_state=seed)
    cv_result = cross_val_score(models[key], X_train, Y_train, cv=kfold, scoring='accuracy')
    result.append(cv_result)
    print('%s:%f(%f)' % (key, cv_result.mean(), cv_result.std()))

# box plots to compare the algorithms
fiq = pyplot.figure()
mrnaseq = get_expression(cancer_types[catype])

#%% Calculate mitotic index
genes = ["MKI67"]  # ki67
mitindex = pd.DataFrame(columns=["mitindex"] + genes, index=data.columns)
samples = [s[-35:-19] if (s[-1] in ["A", "B"]) else s[-34:-18] for s in mitindex.index]
mitindex.loc[:, genes] = [list(mrnaseq.loc[genes, s].values) if (s in mrnaseq.columns)
                          else [np.nan] * len(genes) for s in samples]
mitindex.loc[:, "mitindex"] = mitindex.loc[:, genes].mean(1)

#%% Dimension reduction
## Discriminant analysis
tumcode = [re.findall("(?<=TCGA-[A-Z0-9]{2}-[A-Z0-9]{4}-)[0-9]+", s)[0] for s in data.columns]
targets = ["CA" if s == "01" else "HE" for s in tumcode]

## Discriminant analysis
disc = LinearDiscriminantAnalysis(n_components=2, store_covariance=True)
principalComponents = disc.fit_transform(data.transpose(), targets)
expl_var = disc.explained_variance_ratio_
normcoef = disc.coef_ * disc.covariance_.diagonal()
coef = pd.DataFrame(normcoef.transpose(), index=data.index)

# Record normalized coefficients
weights.loc[coef.index, catype] = coef.values.reshape((coef.shape[0],))

# Add an uninformative second component if not generated
if principalComponents.shape[1] == 1:
    principalComponents = np.append(principalComponents,
                                    np.random.uniform(size=(principalComponents.shape[0], 1)),
                                    axis=1)
    expl_var = np.append(expl_var, 0)

## Plot
principalDf = pd.DataFrame(principalComponents, columns=["PCA1", "PCA2"], index=data.columns)
principalDf["source"] = targets
def LDA_process(X):
    l, n = X.shape
    lda = LinearDiscriminantAnalysis(n_components=2)
    lda.fit(X[:, :-1], X[:, -1])
    Y = lda.transform(X[:, :-1])
    return Y
                                          width, height).astype(int).transpose(1, 0)
GaussNB_predict_prob = GaussNB.predict_proba(data_all)
# Post-processing using Graph-Cut
Seg_Label, seg_accuracy = Post_Processing(GaussNB_predict_prob, height, width,
                                          num_classes, y_test, test_indexes)
print('(GaussNB) Train_Acc=%.3f, Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'
      % (GaussNB.score(X_train, y_train), GaussNB.score(X_test, y_test),
         seg_accuracy, (time.time() - start_time)))
# draw classification map
draw(GT_Label, GaussNB_Label, Seg_Label, train_map, test_map)
print('--------------------------------------------------------------------')

# discriminant_analysis - linear discriminant analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
start_time = time.time()
LDA = LinearDiscriminantAnalysis().fit(X_train, y_train)
LDA_Label = LDA.predict(data_all).reshape(width, height).astype(int).transpose(1, 0)
LDA_predict_prob = LDA.predict_proba(data_all)
# Post-processing using Graph-Cut
Seg_Label, seg_accuracy = Post_Processing(LDA_predict_prob, height, width,
                                          num_classes, y_test, test_indexes)
print('(LDA) Train_Acc=%.3f, Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'
      % (LDA.score(X_train, y_train), LDA.score(X_test, y_test),
         seg_accuracy, (time.time() - start_time)))
# draw classification map
draw(GT_Label, LDA_Label, Seg_Label, train_map, test_map)
print('--------------------------------------------------------------------')

# Logistic Regression
from sklearn.linear_model import LogisticRegression
                                                    Y,
                                                    test_size=0.33,
                                                    random_state=0)
model = svm.SVC(kernel='linear', C=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
acc25 = accuracy_score(y_test, y_pred)

pca = PCA(n_components=15)
X1 = pca.fit_transform(data1)
Y = np.repeat(range(1, 16), 11)
x_train, x_test, y_train, y_test = train_test_split(X1, Y, test_size=0.33, random_state=0)
model = svm.SVC(kernel='linear', C=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
acc15 = accuracy_score(y_test, y_pred)

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
model = LDA()
Y = np.repeat(range(1, 16), 11)
X2 = model.fit_transform(data1, Y)
x_train, x_test, y_train, y_test = train_test_split(X2, Y, test_size=0.33)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
acc_LDa = accuracy_score(y_test, y_pred)
for value in [0, 1]:
    # forecast
    yhat = naive_prediction(testX, value)
    # evaluate
    score = accuracy_score(testy, yhat)
    # summarize
    print('Naive=%d score=%.3f' % (value, score))

# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, trainX, trainy, cv=kfold, scoring=scoring)
    results.append(cv_results)
Y = Y.astype(str)
#my_imputer = SimpleImputer()
#X = my_imputer.fit_transform(X)
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='lbfgs', max_iter=10000, multi_class='auto')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
clever_print('svm with RBF kernel with cv')
svm_rbf_cv = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=CV_SET)
svm_rbf_cv.fit(svm_scaler.transform(data_all_x), data_all_y)
print('accuracy on training and dev')
print(svm_rbf_cv.best_score_)
print('best param')
print(svm_rbf_cv.best_params_)

# LDA QDA------------------------------------
clever_print('LDA analysis')
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis().fit(data_train_x, data_train_y)
print('accuracy on training')
print(accuracy_score(data_train_y, lda.predict(data_train_x)))
print('accuracy on dev')
print(accuracy_score(data_dev_y, lda.predict(data_dev_x)))

clever_print('QDA analysis')
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis().fit(data_train_x, data_train_y)
print('accuracy on training')
print(accuracy_score(data_train_y, qda.predict(data_train_x)))
print('accuracy on dev')
print(accuracy_score(data_dev_y, qda.predict(data_dev_x)))

# NN(MLP)------------------------------------
def __init__(self, n_features):
    self.n_features = n_features
    self.lda = LinearDiscriminantAnalysis(n_components=n_features)
i_iter = -1
acc_test_nested_iter = np.zeros((cv_nested.get_n_splits(), ))
for tr_idx_nested, te_idx_nested in cv_split_nested:
    i_iter += 1
    epochs_train_nested = epochs_train[tr_idx_nested, :, :]
    epochs_test_nested = epochs_train[te_idx_nested, :, :]
    labels_train_nested = labels_train[tr_idx_nested]
    labels_test_nested = labels_train[te_idx_nested]

    csp = CSP(n_components=n_components, reg='ledoit_wolf', log=True, cov_est='concat')
    csp.fit(epochs_train_nested, labels_train_nested)
    epochs_train_nested_new = csp.transform(epochs_train_nested)
    epochs_test_nested_new = csp.transform(epochs_test_nested)

    lda = LinearDiscriminantAnalysis()
    lda.fit(epochs_train_nested_new, labels_train_nested)
    # lbl_train_pred_nested = lda.predict(epochs_train_nested_new)
    lbl_test_pred_nested = lda.predict(epochs_test_nested_new)
    acc_test_nested_iter[i_iter] = np.mean(lbl_test_pred_nested == labels_test_nested)

acc_n_components[i_comp] = np.mean(acc_test_nested_iter)

idx1 = np.argmax(acc_n_components)
n_components = list(range_n_components)[idx1]
print('*****************************')
print('n_components=', n_components)
print('acc_nested_max=', acc_n_components[idx1])
print('*****************************')

csp = CSP(n_components=n_components, reg='ledoit_wolf', log=True,
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Applying LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=2)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
e_co = []
l_lda = []
l_clustering = []
l_co = []

for index, i in enumerate(num_unlabeled):
    X_labeled = X_train[:num_labeled]
    y_labeled = y_train[:num_labeled]
    X_unlabeled = X_train[num_labeled: num_labeled + num_unlabeled[index]]
    y_unlabeled = y_train[num_labeled: num_labeled + num_unlabeled[index]]

    ###### supervised-LDA ######
    X_trnall = X_train[: num_labeled]
    y_trnall = y_train[: num_labeled]
    clf_lda = LinearDiscriminantAnalysis()
    clf_lda.fit(X_trnall, y_trnall)
    train_predictions = clf_lda.predict(X_test)
    e_lda.append(1 - accuracy_score(y_test, train_predictions))
    l_lda.append(log_loss(y_test, train_predictions))

    ###### SS-Clustering ######
    if num_unlabeled[index] == 0:
        X_trnall = X_train[: num_labeled]
        y_trnall = y_train[: num_labeled]
    else:
        X_trnall, y_trnall = ssclustering(X_labeled, y_labeled, X_unlabeled)
    clf_lda = LinearDiscriminantAnalysis()
    clf_lda.fit(X_trnall, y_trnall)
    train_predictions = clf_lda.predict(X_test)
    e_clustering.append(1 - accuracy_score(y_test, train_predictions))