def classify_using_lda(feat1, feat2, num_comp=2):
    n_plus = len(feat1)
    n_minus = len(feat2)
    X = np.concatenate((feat1, feat2), axis=0)
    y = np.concatenate((np.zeros(n_plus), np.ones(n_minus)), axis=0)
    y += 1
    print(X.shape, y.shape, n_plus, n_minus, feat1.shape, feat2.shape)

    lda = LDA(n_components=num_comp)
    lda.fit(X, y)

    # LDA can produce at most min(n_features, n_classes - 1) components, so
    # with two classes transform() returns n_samples x 1 regardless of
    # num_comp: a single discriminant axis suffices to separate two classes.
    X_tr = lda.transform(X)
    print(X_tr.shape, lda.score(X, y))

    # There is no second LDA component for a two-class problem, so pad an
    # artificial second coordinate (0 for one class, 1 for the other) purely
    # for plotting.
    X1 = np.concatenate((X_tr[0:n_plus], np.zeros((n_plus, 1))), axis=1)
    X2 = np.concatenate((X_tr[-n_minus:], np.ones((n_minus, 1))), axis=1)
    plt.plot(X1[:, 0], X1[:, 1], 'ro')
    plt.plot(X2[:, 0], X2[:, 1], 'g+')
    plt.ylim(-1, 3)
    plt.show()
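# Editor's note on the FIXME above: scikit-learn's LDA yields at most
# min(n_features, n_classes - 1) discriminant components, so any two-class
# problem gives exactly one transform column no matter what n_components
# requests (recent scikit-learn versions raise a ValueError instead of
# silently clipping). A minimal self-contained sketch with synthetic data:
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

rng = np.random.RandomState(0)
X_demo = np.vstack((rng.normal(0, 1, (20, 5)), rng.normal(3, 1, (20, 5))))
y_demo = np.array([0] * 20 + [1] * 20)
lda_demo = LinearDiscriminantAnalysis(n_components=1)
print(lda_demo.fit(X_demo, y_demo).transform(X_demo).shape)  # (40, 1)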
def computing_cv_accuracy_LDA(in_path=None, cv_n_fold=10):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = np.array(data.iloc[:, -1].tolist())
    kf = KFold(n_splits=cv_n_fold, random_state=None, shuffle=True)
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65, mean_u80 = 0, 0
    for idx_train, idx_test in kf.split(y):
        print("---k-FOLD-new-executing--")
        X_cv_train, y_cv_train = X[idx_train], y[idx_train]
        X_cv_test, y_cv_test = X[idx_test], y[idx_test]
        lda.fit(X_cv_train, y_cv_train)
        n_test = len(idx_test)
        sum_u65, sum_u80 = 0, 0
        for i, test in enumerate(X_cv_test):
            evaluate = lda.predict([test])
            print("-----TESTING-----", i)
            if y_cv_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        mean_u65 += sum_u65 / n_test
        mean_u80 += sum_u80 / n_test
    print("--->", mean_u65 / cv_n_fold, mean_u80 / cv_n_fold)
def computing_performance_LDA(in_path=None, seeds=list([0])):
    def u65(mod_Y):
        return 1.6 / mod_Y - 0.6 / mod_Y ** 2

    def u80(mod_Y):
        return 2.2 / mod_Y - 1.2 / mod_Y ** 2

    data = export_data_set('iris.data') if in_path is None else pd.read_csv(in_path)
    print("-----DATA SET TRAINING---", in_path)
    X = data.iloc[:, :-1].values
    y = data.iloc[:, -1].tolist()
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    mean_u65, mean_u80 = 0, 0
    n_times = len(seeds)
    for k in range(0, n_times):
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=0.4, random_state=seeds[k])
        sum_u65, sum_u80 = 0, 0
        lda.fit(X_train, y_train)
        n, _ = X_test.shape
        for i, test in enumerate(X_test):
            evaluate = lda.predict([test])
            print("-----TESTING-----", i)
            if y_test[i] in evaluate:
                sum_u65 += u65(len(evaluate))
                sum_u80 += u80(len(evaluate))
        print("--k-->", k, sum_u65 / n, sum_u80 / n)
        mean_u65 += sum_u65 / n
        mean_u80 += sum_u80 / n
    print("--->", mean_u65 / n_times, mean_u80 / n_times)
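# Editor's note on the u65/u80 utilities used by the two functions above:
# they are utility-discounted accuracies for set-valued predictions (a
# prediction set of size 2 scores 0.65 and 0.80 respectively, hence the
# names). Because lda.predict returns exactly one label, len(evaluate) is
# always 1 here and both measures reduce to ordinary accuracy. A quick
# check of the discounting:
def u65(mod_Y):
    return 1.6 / mod_Y - 0.6 / mod_Y ** 2

def u80(mod_Y):
    return 2.2 / mod_Y - 1.2 / mod_Y ** 2

for m in (1, 2, 3):
    print(m, round(u65(m), 4), round(u80(m), 4))
# 1 -> 1.0, 1.0;  2 -> 0.65, 0.8;  3 -> 0.4667, 0.6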
def test_lda_predict():
    # Test LDA classification.
    # This checks that LDA implements fit and predict and returns correct
    # values for simple toy data.
    for test_case in solver_shrinkage:
        solver, shrinkage = test_case
        clf = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        y_pred = clf.fit(X, y).predict(X)
        assert_array_equal(y_pred, y, "solver %s" % solver)

        # Assert that it works with 1D data
        y_pred1 = clf.fit(X1, y).predict(X1)
        assert_array_equal(y_pred1, y, "solver %s" % solver)

        # Test probability estimates
        y_proba_pred1 = clf.predict_proba(X1)
        assert_array_equal((y_proba_pred1[:, 1] > 0.5) + 1, y,
                           "solver %s" % solver)
        y_log_proba_pred1 = clf.predict_log_proba(X1)
        assert_array_almost_equal(np.exp(y_log_proba_pred1), y_proba_pred1,
                                  8, "solver %s" % solver)

        # Primarily test for commit 2f34950 -- "reuse" of priors
        y_pred3 = clf.fit(X, y3).predict(X)
        # LDA shouldn't be able to separate those
        assert_true(np.any(y_pred3 != y3), "solver %s" % solver)

    # Test invalid shrinkages
    clf = LinearDiscriminantAnalysis(solver="lsqr", shrinkage=-0.2231)
    assert_raises(ValueError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="eigen", shrinkage="dummy")
    assert_raises(ValueError, clf.fit, X, y)
    clf = LinearDiscriminantAnalysis(solver="svd", shrinkage="auto")
    assert_raises(NotImplementedError, clf.fit, X, y)
    # Test unknown solver
    clf = LinearDiscriminantAnalysis(solver="dummy")
    assert_raises(ValueError, clf.fit, X, y)
def test_lda_explained_variance_ratio():
    # Test that the normalized eigenvalues sum to 1, and that the
    # explained_variance_ratio_ produced by the eigen solver matches the
    # explained_variance_ratio_ produced by the svd solver.
    state = np.random.RandomState(0)
    X = state.normal(loc=0, scale=100, size=(40, 20))
    y = state.randint(0, 3, size=(40,))

    clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")
    clf_lda_eigen.fit(X, y)
    assert_almost_equal(clf_lda_eigen.explained_variance_ratio_.sum(), 1.0, 3)

    clf_lda_svd = LinearDiscriminantAnalysis(solver="svd")
    clf_lda_svd.fit(X, y)
    assert_almost_equal(clf_lda_svd.explained_variance_ratio_.sum(), 1.0, 3)

    # NOTE: clf_lda_eigen.explained_variance_ratio_ is not of length
    # n_components; truncate it to the length of
    # clf_lda_svd.explained_variance_ratio_ before comparison.
    tested_length = min(clf_lda_svd.explained_variance_ratio_.shape[0],
                        clf_lda_eigen.explained_variance_ratio_.shape[0])
    assert_array_almost_equal(clf_lda_svd.explained_variance_ratio_,
                              clf_lda_eigen.explained_variance_ratio_[:tested_length])
class LinearDiscriminantAnalysisPredictor(PredictorBase):
    '''
    Linear Discriminant Analysis
    '''

    def __init__(self, animal_type):
        self.animal_type = animal_type
        self.clf = LinearDiscriminantAnalysis()

    def fit(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        predictions = self.clf.predict_proba(X_test)
        predictions_df = self.bundle_predictions(predictions)
        return predictions_df

    def find_best_params(self):
        parameters = {'solver': ['svd', 'lsqr', 'eigen']}
        lda = LinearDiscriminantAnalysis()
        clf = grid_search.GridSearchCV(lda, parameters)
        train_data = get_data('../data/train.csv')
        train_data = select_features(train_data, self.animal_type)
        X = train_data.drop(['OutcomeType'], axis=1)
        y = train_data['OutcomeType']
        clf.fit(X, y)
        print(clf.best_params_)
def main():
    """Read the training data, fit classifiers, and write a submission."""
    df = pd.read_csv("train.csv")

    # train/test split using stratified sampling
    labels = df['label']
    df = df.drop(['label'], axis=1)
    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=23)
    for train_index, test_index in sss.split(df.values, labels):
        x_train, x_test = df.values[train_index], df.values[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

    # classification algorithm
    classification(x_train, y_train, x_test, y_test)

    # Predict test set
    favorite_clf = LinearDiscriminantAnalysis()
    favorite_clf.fit(x_train, y_train)
    test = pd.read_csv('test.csv')
    test_predictions = favorite_clf.predict(test)
    print(test_predictions)

    # Format DataFrame
    submission = pd.DataFrame(test_predictions, columns=['Label'])
    submission.insert(0, 'ImageId', np.arange(len(test_predictions)) + 1)

    # Export submission
    submission.to_csv('submission.csv', index=False)
class LinearDiscriminantAnalysiscls(object):
    """Thin wrapper around sklearn's LinearDiscriminantAnalysis."""

    def __init__(self):
        self.lda_cls = LinearDiscriminantAnalysis()
        self.prediction = None
        self.train_x = None
        self.train_y = None

    def train_model(self, train_x, train_y):
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.lda_cls.fit(train_x, train_y)
        except Exception:
            print(traceback.format_exc())

    def predict(self, test_x):
        try:
            self.test_x = test_x
            self.prediction = self.lda_cls.predict(test_x)
            return self.prediction
        except Exception:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        try:
            # return r2_score(test_y, self.prediction)
            return self.lda_cls.score(self.test_x, test_y)
        except Exception:
            print(traceback.format_exc())
def plot_lda_only(filename, title, filename_fig):
    df = pd.read_csv(path + filename, names=['x1', 'x2', 'y'], header=None)
    fig = plt.figure()
    fig.suptitle(title, fontsize=20)

    columns_ls = []
    for column in df.columns:
        columns_ls.append(column)

    X = df[columns_ls[0:len(columns_ls) - 1]].values
    Y = df[columns_ls[len(columns_ls) - 1]].values

    clf_lda = LinearDiscriminantAnalysis()
    clf_lda.fit(X, Y)
    w = clf_lda.coef_[0]
    a = -w[0] / w[1]
    xx = np.linspace(-12, 34)
    yy = a * xx - clf_lda.intercept_[0] / w[1]
    plt.plot(xx, yy, color="blue", label="LDA decision boundary")
    print("Weights W0=%.2f and W1=%.2f" % (w[0], w[1]))

    plt.text(0, 0, "Y=+1", fontsize=12)
    plt.text(10, -20, "Y=-1", fontsize=12)
    plt.xlabel('X1', fontsize=18)
    plt.ylabel('X2', fontsize=16)
    # fig.savefig(filename_fig)

    plt.scatter(X[:, 0], X[:, 1], c=Y)
    plt.axis('tight')
    plt.legend()
    plt.show()
def doLDA(x, digits, s):
    myLDA = LDA()
    myLDA.fit(x.PCA[:, :s], digits.train_Labels)
    # Center the test images and project them onto the first s PCA directions.
    newtest = digits.test_Images - x.centers
    newtest = newtest @ np.transpose(x.V[:s, :])
    labels = myLDA.predict(newtest)
    errors = class_error_rate(labels.reshape(1, labels.shape[0]),
                              digits.test_Labels)
    return errors
def lda(X, y, n):
    '''
    Returns the optimal projection of the data using LDA with n components.
    '''
    selector = LinearDiscriminantAnalysis(n_components=n)
    selector.fit(X, y)
    return selector.transform(X), y
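# A minimal usage sketch for the lda() helper above, using iris (three
# classes, so up to n_classes - 1 = 2 components are available):
from sklearn.datasets import load_iris

X_iris, y_iris = load_iris(return_X_y=True)
X_proj, y_out = lda(X_iris, y_iris, 2)
print(X_proj.shape)  # (150, 2)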
def Train(enhancedGeneSet, classLabels):
    enhancedGeneSet = np.array(enhancedGeneSet)
    classLabels = np.array(classLabels)
    classifier = LinearDiscriminantAnalysis()
    classifier.fit(enhancedGeneSet, classLabels)
    return classifier
def train_model(self):
    ### Train spectrum data
    # form training data and labels
    X = np.empty((0, self.freq_cutoff), int)
    y = np.empty((0, 1), int)

    data_dir = 'clap_data/claps/spectrum/'
    for fname in os.listdir(data_dir):
        data = np.load("%s%s" % (data_dir, fname))
        X = np.append(X, data, axis=0)
        y = np.append(y, [1] * data.shape[0])

    data_dir = 'clap_data/noclaps/spectrum/'
    for fname in os.listdir(data_dir):
        data = np.load("%s%s" % (data_dir, fname))
        X = np.append(X, data, axis=0)
        y = np.append(y, [0] * data.shape[0])

    # fit the model
    clf = LinearDiscriminantAnalysis()
    clf.fit(X, y)
    preds = clf.predict(X)
    print(preds)
    print(np.sum(preds), preds.size)

    # save model (binary mode so the pickle round-trips on Python 3)
    pickle.dump(clf, open(clap_model_dir + clap_classifier_fname, 'wb'))
    self.clap_clf = clf

    ### Train decay data
    # integer division so the shape stays an int
    X = np.empty((0, self.decay_samples // 10), int)
    data_dir = 'clap_data/claps/decay/'
    for fname in os.listdir(data_dir):
        if fname.endswith('npy'):
            data = np.load("%s%s" % (data_dir, fname))
            print(data.shape, X.shape)
            X = np.append(X, data, axis=0)
    print(X.shape)

    X_avg = np.mean(X, axis=0)
    plt.plot(X_avg)
    plt.show()

    # Average decay data
    np.save('%s%s' % (clap_model_dir, clap_decay_model_fname), X_avg)
def _get_lda(self, data, variables):
    domain = Domain(attributes=variables, class_vars=data.domain.class_vars)
    data = data.transform(domain)
    lda = LinearDiscriminantAnalysis(solver='eigen', n_components=2)
    lda.fit(data.X, data.Y)
    scalings = lda.scalings_[:, :2].T
    if scalings.shape == (1, 1):
        scalings = np.array([[1.], [0.]])
    return scalings
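# Background for the scalings_ handling above: with solver='eigen',
# lda.scalings_ contains the generalized eigenvectors as columns, but only
# the leading min(n_features, n_classes - 1) columns carry between-class
# information -- hence the [:, :2] slice, and the padding when a
# one-feature/two-class problem leaves a single usable direction. A small
# sketch on iris (4 features, 3 classes):
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X_iris, y_iris = load_iris(return_X_y=True)
lda_eig = LinearDiscriminantAnalysis(solver='eigen').fit(X_iris, y_iris)
print(lda_eig.scalings_.shape)  # only the first n_classes - 1 = 2 columns are meaningful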
def test_raises_value_error_on_same_number_of_classes_and_samples(solver):
    """
    Tests that if the number of samples equals the number of classes, a
    ValueError is raised.
    """
    X = np.array([[0.5, 0.6], [0.6, 0.5]])
    y = np.array(["a", "b"])
    clf = LinearDiscriminantAnalysis(solver=solver)
    with pytest.raises(ValueError, match="The number of samples must be more"):
        clf.fit(X, y)
def test_lda_numeric_consistency_float32_float64():
    for (solver, shrinkage) in solver_shrinkage:
        clf_32 = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        clf_32.fit(X.astype(np.float32), y.astype(np.float32))
        clf_64 = LinearDiscriminantAnalysis(solver=solver, shrinkage=shrinkage)
        clf_64.fit(X.astype(np.float64), y.astype(np.float64))

        # Check value consistency between types
        rtol = 1e-6
        assert_allclose(clf_32.coef_, clf_64.coef_, rtol=rtol)
def test_lda_explained_variance_ratio():
    # Test that the normalized eigenvalues sum to 1
    n_features = 2
    n_classes = 2
    n_samples = 1000
    X, y = make_blobs(n_samples=n_samples, n_features=n_features,
                      centers=n_classes, random_state=11)

    clf_lda_eigen = LinearDiscriminantAnalysis(solver="eigen")
    clf_lda_eigen.fit(X, y)
    assert_almost_equal(clf_lda_eigen.explained_variance_ratio_.sum(), 1.0, 3)
def testEvaluateLDA(self, trCList, teCList):
    # LDA object
    clf = LinearDiscriminantAnalysis()

    # fit the LDA model using the training chromosomes
    clf.fit(numpy.asarray(trCList), numpy.asarray(trainGroupings))
    predicted = clf.predict(teCList)
    self.confusionMatrix(testGroupings, predicted, 'lda_test')

    # Return the weighted f1 score ([2]); use [0] for precision or [1] for
    # recall, or clf.score(numpy.asarray(teCList), testGroupings) for
    # accuracy. This is the fitness for the test set.
    return precision_recall_fscore_support(testGroupings, predicted,
                                           average='weighted')[2]
def LD(pth):
    train_desc = np.load(pth + '/training_features.npy')
    nbr_occurences = np.sum((train_desc > 0) * 1, axis=0)
    idf = np.array(np.log((1.0 * len(image_paths) + 1) / (1.0 * nbr_occurences + 1)),
                   'float32')

    # Scaling the words
    stdSlr = StandardScaler().fit(train_desc)
    train_desc = stdSlr.transform(train_desc)

    modelLD = LinearDiscriminantAnalysis()
    modelLD.fit(train_desc, np.array(train_labels))
    joblib.dump((modelLD, img_classes, stdSlr), pth + "/ld-bof.pkl", compress=3)
    test(pth, "ld-")
def lda_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)

    ##
    ## Plots
    ##
    ph = plot_helper()

    scores = []
    train_scores = []
    # Note: LDA supports at most min(n_features, n_classes - 1) components;
    # newer scikit-learn versions raise an error for larger values.
    rng = range(1, X_train_scl.shape[1] + 1)
    for i in rng:
        lda = LinearDiscriminantAnalysis(n_components=i)
        cv = KFold(n_splits=3, shuffle=True)

        # cross validation
        cv_scores = []
        for train, test in cv.split(X_train_scl):
            lda.fit(X_train_scl[train], y_train[train])
            score = lda.score(X_train_scl[test], y_train[test])
            cv_scores.append(score)
        mean_score = np.mean(cv_scores)
        scores.append(mean_score)

        # train score
        lda = LinearDiscriminantAnalysis(n_components=i)
        lda.fit(X_train_scl, y_train)
        train_score = lda.score(X_train_scl, y_train)
        train_scores.append(train_score)

        print(i, mean_score)

    ##
    ## Score Plot
    ##
    title = 'Score Summary Plot (LDA) for ' + data_set_name
    name = data_set_name.lower() + '_lda_score'
    filename = './' + self.out_dir + '/' + name + '.png'

    ph.plot_series(rng, [scores, train_scores], [None, None],
                   ['cross validation score', 'training score'],
                   cm.viridis(np.linspace(0, 1, 2)), ['o', '*'],
                   title, 'n_components', 'Score', filename)
def test_lda_transform():
    # Test LDA transform.
    clf = LinearDiscriminantAnalysis(solver="svd", n_components=1)
    X_transformed = clf.fit(X, y).transform(X)
    assert_equal(X_transformed.shape[1], 1)
    clf = LinearDiscriminantAnalysis(solver="eigen", n_components=1)
    X_transformed = clf.fit(X, y).transform(X)
    assert_equal(X_transformed.shape[1], 1)

    clf = LinearDiscriminantAnalysis(solver="lsqr", n_components=1)
    clf.fit(X, y)
    msg = "transform not implemented for 'lsqr'"
    assert_raise_message(NotImplementedError, msg, clf.transform, X)
def learn(self, decoy_peaks, target_peaks, use_main_score=True):
    assert isinstance(decoy_peaks, Experiment)
    assert isinstance(target_peaks, Experiment)
    X0 = decoy_peaks.get_feature_matrix(use_main_score)
    X1 = target_peaks.get_feature_matrix(use_main_score)
    X = np.vstack((X0, X1))
    y = np.zeros((X.shape[0],))
    y[X0.shape[0]:] = 1.0
    classifier = LinearDiscriminantAnalysis()
    classifier.fit(X, y)
    self.classifier = classifier
    self.scalings = classifier.scalings_.flatten()
    return self
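# What the stored scalings are for: with two classes the default 'svd'
# solver produces scalings_ with a single column, so the flattened vector
# defines one discriminant axis. Projecting feature rows onto it gives a
# scalar score separating targets from decoys -- a minimal sketch, assuming
# X has the same columns as the training feature matrix (score_peaks is an
# illustrative helper, not part of the original class):
import numpy as np

def score_peaks(X, scalings):
    # Larger projections are more target-like, up to the sign of the axis.
    return np.dot(X, scalings)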
def _dimReduce(df, method='pca', n_components=2, labels=None,
               standardize=False, smatFunc=None, ldaShrinkage='auto'):
    if method == 'kpca':
        # By using KernelPCA for dimensionality reduction we don't need to
        # impute missing values.
        if smatFunc is None:
            smatFunc = corrTSmatFunc
        pca = KernelPCA(kernel='precomputed', n_components=n_components)
        smat = smatFunc(df).values
        xy = pca.fit_transform(smat)
        pca.components_ = pca.alphas_
        pca.explained_variance_ratio_ = pca.lambdas_ / pca.lambdas_.sum()
        return xy, pca
    elif method == 'pca':
        if standardize:
            normed = df.apply(lambda vec: (vec - vec.mean()) / vec.std(), axis=0)
        else:
            normed = df.apply(lambda vec: vec - vec.mean(), axis=0)
        pca = PCA(n_components=n_components)
        xy = pca.fit_transform(normed)
        return xy, pca
    elif method == 'lda':
        if labels is None:
            raise ValueError('labels needed to perform LDA')
        if standardize:
            normed = df.apply(lambda vec: (vec - vec.mean()) / vec.std(), axis=0)
        else:
            normed = df.apply(lambda vec: vec - vec.mean(), axis=0)
        if df.shape[1] > df.shape[0]:
            # Pre-PCA step when there are more features than samples.
            ppca = PCA(n_components=int(df.shape[0] / 1.5))
            normed = ppca.fit_transform(df)
        lda = LinearDiscriminantAnalysis(solver='eigen',
                                         shrinkage=ldaShrinkage,
                                         n_components=n_components)
        lda.fit(normed, labels.values)
        lda.explained_variance_ratio_ = (np.abs(lda.explained_variance_ratio_)
                                         / np.abs(lda.explained_variance_ratio_).sum())
        xy = lda.transform(normed)
        # Return the fitted LDA here; falling through to the final return
        # would reference an undefined `pls`.
        return xy, lda
    elif method == 'pls':
        if labels is None:
            raise ValueError('labels needed to perform PLS')
        if standardize:
            normed = df.apply(lambda vec: (vec - vec.mean()) / vec.std(), axis=0)
        else:
            normed = df.apply(lambda vec: vec - vec.mean(), axis=0)
        pls = PLSRegression(n_components=n_components)
        pls.fit(normed, labels)
        pls.explained_variance_ratio_ = np.zeros(n_components)
        xy = pls.x_scores_
        return xy, pls
def importance_lda(data, kpi, max_features=10, **kwargs):
    """
    :param data: dataframe containing training data
    :param kpi: Name of the current kpi
    :param max_features: maximum number of features to return
    :return: list of the best metrics
    """
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

    columns = data[[col for col in set(data.columns) - {kpi}]].columns
    train, test, target_train, target_test = prepare_data_for_kpi(data, kpi)
    model = LDA(**kwargs)
    # LDA is supervised: fit needs the class labels as well.
    model.fit(train, target_train)
    print(model.coef_)
def plot_linear_svm_lda(filename, title):
    df = pd.read_csv(path + filename, names=['x1', 'x2', 'y'], header=None)
    columns_ls = []
    for column in df.columns:
        columns_ls.append(column)

    X = df[columns_ls[0:len(columns_ls) - 1]].values
    Y = df[columns_ls[len(columns_ls) - 1]].values

    fig = plt.figure()
    fig.suptitle(title, fontsize=20)

    clf = svm.SVC(kernel='linear')
    clf.fit(X, Y)
    w = clf.coef_[0]
    print("Weights SVM W0=%.2f and W1=%.2f" % (w[0], w[1]))
    a = -w[0] / w[1]
    xx = np.linspace(-12, 34)
    yy = a * xx - clf.intercept_[0] / w[1]

    b = clf.support_vectors_[0]
    yy_down = a * xx + (b[1] - a * b[0])
    b = clf.support_vectors_[-1]
    yy_up = a * xx + (b[1] - a * b[0])

    # plot the line, the points, and the nearest vectors to the plane
    plt.text(0, 10, "Y=+1", fontsize=12)
    plt.text(10, 0, "Y=-1", fontsize=12)
    plt.plot(xx, yy_down, 'k--')
    plt.plot(xx, yy_up, 'k--')
    plt.plot(xx, yy, color="black", label="svm decision boundary")

    clf_lda = LinearDiscriminantAnalysis()
    clf_lda.fit(X, Y)
    w = clf_lda.coef_[0]
    a = -w[0] / w[1]
    print("Weights LDA W0=%.2f and W1=%.2f" % (w[0], w[1]))
    xx = np.linspace(-12, 34)
    yy = a * xx - clf_lda.intercept_[0] / w[1]
    plt.plot(xx, yy, color="blue", label="LDA decision boundary")

    plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
                s=80, color='b')
    plt.scatter(X[:, 0], X[:, 1], c=Y)
    plt.xlabel('X1', fontsize=18)
    plt.ylabel('X2', fontsize=16)
    plt.axis('tight')
    plt.legend()
    plt.show()
def train_DA(self, X, y, lda_comp, qda_reg):
    '''
    Input:
        lda_comp - n_components for LDA
        qda_reg - reg_param for QDA
        X - data matrix (train_num, feat_num)
        y - target labels matrix (train_num, label_num)
    Output:
        best_clf - best classifier trained (QDA/LDA)
        best_score - CV score of best classifier

    Find the best DA classifier.
    '''
    n_samples, n_feat = X.shape
    cv_folds = 10
    kf = KFold(n_splits=cv_folds, shuffle=False)
    lda = LinearDiscriminantAnalysis(n_components=lda_comp)
    qda = QuadraticDiscriminantAnalysis(reg_param=qda_reg)

    score_total_lda = 0  # running total of metric score over all cv runs
    score_total_qda = 0  # running total of metric score over all cv runs
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        lda.fit(X_train, y_train)
        cv_pred_lda = lda.predict(X_test)
        score_lda = eval(self.metric + '(y_test[:,None], cv_pred_lda[:,None], "' + self.task + '")')
        score_total_lda += score_lda

        qda.fit(X_train, y_train)
        cv_pred_qda = qda.predict(X_test)
        # Score the QDA predictions (the original scored cv_pred_lda here).
        score_qda = eval(self.metric + '(y_test[:,None], cv_pred_qda[:,None], "' + self.task + '")')
        score_total_qda += score_qda

    score_lda = score_total_lda / cv_folds
    score_qda = score_total_qda / cv_folds

    # Keep the better of the two, refit on the full data.
    if score_qda > score_lda:
        qda.fit(X, y)
        return qda, score_qda
    else:
        lda.fit(X, y)
        return lda, score_lda
def plot_lda(features, labels):
    """
    Input
        features: features to project with LDA and plot
        labels: labels of features
    Description
        plots the LDA projection of features
    """
    lda = LinearDiscriminantAnalysis(n_components=2)
    # Fit on the function's arguments (the original fit on module-level
    # globals, ignoring its parameters).
    new_features = lda.fit(features, labels).transform(features)

    colors = list("rgbykrgbyk")
    markers = list("xxxxxooooo")
    plt.figure(len(genres))  # for all together
    for i, genre in enumerate(genres):
        plt.figure(i)  # for one particular genre
        plt.scatter(new_features[i * num_songs:(i + 1) * num_songs, 0],
                    new_features[i * num_songs:(i + 1) * num_songs, 1],
                    c=colors[i], marker=markers[i], label=genre)
        plt.title(genre)
        plt.figure(len(genres))  # for all together
        plt.scatter(new_features[i * num_songs:(i + 1) * num_songs, 0],
                    new_features[i * num_songs:(i + 1) * num_songs, 1],
                    c=colors[i], marker=markers[i], label=genre)
    plt.legend()
    plt.title('LDA')
    plt.show()
def test(self):
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    target_names = iris.target_names

    pca = PCA(n_components=3)
    X_r = pca.fit(X).transform(X)

    # LDA yields at most n_classes - 1 = 2 components on iris.
    lda = LinearDiscriminantAnalysis(n_components=2)
    X_r2 = lda.fit(X, y).transform(X)

    # Percentage of variance explained by each component
    print('explained variance ratio: %s' % str(pca.explained_variance_ratio_))

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
        ax.scatter(X_r[y == i, 0], X_r[y == i, 1], zs=X[y == i, 2], c=c,
                   label=target_name)
    plt.legend()
    plt.title('PCA of IRIS dataset')

    fig2 = plt.figure()
    ax = fig2.add_subplot(111, projection='3d')
    for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
        ax.scatter(X_r2[y == i, 0], X_r2[y == i, 1], zs=X[y == i, 2], c=c,
                   label=target_name)
    plt.legend()
    plt.title('LDA of IRIS dataset')
    plt.show()
def visualize_lda2D(X, y):
    """
    Visualize the separation between classes using the two most
    discriminant features

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    """
    labels = ['Paid', 'Default']
    lda = LDA(n_components=2, solver='eigen')
    discriminative_attributes = lda.fit(X, y).transform(X)

    palette = sea.color_palette()
    plt.scatter(discriminative_attributes[:, 0][y == 0],
                discriminative_attributes[:, 1][y == 0],
                marker='s', color='green', label="Paid", alpha=0.5)
    plt.scatter(discriminative_attributes[:, 0][y == 1],
                discriminative_attributes[:, 1][y == 1],
                marker='^', color='red', label="Default", alpha=0.5)
    plt.xlabel('First Linear Discriminant')
    plt.ylabel('Second Linear Discriminant')

    leg = plt.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.title("Linear Discriminant Analysis")
    plt.tight_layout()  # must be called; the original accessed the bare attribute

    # save fig
    output_dir = 'img'
    save_fig(output_dir, '{}/lda.png'.format(output_dir))
def tuneSpatialFilters(self):
    print(colors.MAGENTA)
    num_total_spatial_filters = self.all_spatial_filters.shape[0]
    best_mean = 0
    best_num = 0
    best_score = None

    for i in range(num_total_spatial_filters):
        num_filters_to_try = i + 1
        print("trying with first", num_filters_to_try, "spatial filters")
        trial_X = self.extractFeatures(
            self.epochs, self.all_spatial_filters[:num_filters_to_try])
        lda = LinearDiscriminantAnalysis()
        lda = lda.fit(trial_X, self.y)
        cross_validation_folds = 10
        xval = cross_val_score(lda, trial_X, self.y, cv=cross_validation_folds)
        this_mean = xval.mean()
        print("mean", this_mean)
        if this_mean > best_mean:
            best_mean = this_mean
            best_num = num_filters_to_try
            best_score = xval

    print("-----------------------------")
    print("best mean was", best_mean, "with", best_num, "filters used")
    print(best_score)
    print(colors.ENDC)
# evaluate created models
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold,
                                 scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s : %f (%f)' % (name, cv_results.mean(), cv_results.std()))

# compare models
pyplot.boxplot(results, labels=names)
pyplot.title('Algorithm Comparison')
pyplot.show()

# make predictions with the LDA model
model = LinearDiscriminantAnalysis()
model.fit(X_train, Y_train)
pred = model.predict(X_validation)

# evaluate predictions
print(accuracy_score(Y_validation, pred))
print(confusion_matrix(Y_validation, pred))
print(classification_report(Y_validation, pred))
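# The evaluation loop above assumes a `models` list of (name, estimator)
# pairs defined earlier; a plausible sketch (the concrete choices here are
# illustrative, not recovered from the original script):
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

models = [
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]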
def main(elec): """feature selection and permutations. For each separation of subjects with leave 2 subjects out, we train on the big set (feature selection) and test on the two remaining subjects. for each permutation, we just permute the labels at the trial level (we could use permutations at the subject level, but we wouldn't get as many permutations) """ final_data = None print(STATE, elec) results_file_path = ( SAVE_PATH / "results" / "EFS_NoGamma_{}_{}_{}_{:.2f}.mat".format(STATE, elec, WINDOW, OVERLAP)) if not results_file_path.isfile(): for freq in FREQS: data_file_path = SAVE_PATH / "PSD_{}_{}_{}_{}_{:.2f}.mat".format( STATE, freq, elec, WINDOW, OVERLAP) data = loadmat(data_file_path)["data"].ravel() if final_data is None: final_data = data else: for i, submat in enumerate(final_data): final_data[i] = np.concatenate((submat, data[i]), axis=0) final_data = np.array(list(map(np.transpose, final_data))) lil_labels = [0] * 18 + [1] * 18 lil_labels = np.asarray(lil_labels) lil_groups = list(range(36)) sl2go = StratifiedLeave2GroupsOut() best_freqs = [] pvalues, pscores = [], [] test_scores, best_scores = [], [] for train_subjects, test_subjects in sl2go.split( final_data, lil_labels, lil_groups): x_train, x_test = final_data[train_subjects], final_data[ test_subjects] y_train, y_test = lil_labels[train_subjects], lil_labels[ test_subjects] y_train = [[label] * len(x_train[i]) for i, label in enumerate(y_train)] y_train, groups = create_groups(y_train) x_train = np.concatenate(x_train[:], axis=0) nested_sl2go = StratifiedLeave2GroupsOut() clf = LDA() f_select = EFS(estimator=clf, max_features=5, cv=nested_sl2go, n_jobs=-1) f_select = f_select.fit(x_train, y_train, groups) best_idx = f_select.best_idx_ best_freqs.append(list(FREQS[list(best_idx)])) best_scores.append(f_select.best_score_) test_clf = LDA() test_clf.fit(x_train[:, best_idx], y_train) y_test = [[label] * len(x_test[i]) for i, label in enumerate(y_test)] y_test, groups = create_groups(y_test) x_test = np.concatenate(x_test[:], axis=0) test_score = test_clf.score(x_test[:, best_idx], y_test) test_scores.append(test_score) if PERM: pscores_cv = [] for _ in range(N_PERM): y_train = np.random.permutation(y_train) y_test = np.random.permutation(y_test) clf = LDA() clf.fit(x_train[:, best_idx], y_train) pscore = clf.score(x_test[:, best_idx], y_test) pscores_cv.append(pscore) pvalue = compute_pval(test_score, pscores_cv) pvalues.append(pvalue) pscores.append(pscores_cv) score = np.mean(test_scores) data = { "score": score, "train_scores": best_scores, "test_scores": test_scores, "freqs": best_freqs, "pvalue": pvalues, "pscores": pscores, } savemat(results_file_path, data)
### MAIN

class BertArgs:
    def __init__(self):
        self.bert_model_dir = "/content/drive/My Drive/DMT_HW3/bert/cased_L-12_H-768_A-12"
        self.bert_model_name = "bert-base-cased"
        self.bert_vocab_name = "vocab.txt"

args = BertArgs()
model = Bert(args)

# split train and dev data
train_file = '/content/drive/My Drive/DMT_HW3/train_fever.json'
dev_file = '/content/drive/My Drive/DMT_HW3/dev_fever.json'
data_train, label_train, map_idx_to_id_train = data_label_split(train_file, model)
data_dev, label_dev, map_idx_to_id_dev = data_label_split(dev_file, model)

# train model
clf = LinearDiscriminantAnalysis(solver='svd')
# fit model
clf.fit(data_train, label_train)

# get accuracy on the dev set
LDA_acc = clf.score(data_dev, label_dev)  # result: 0.6763485477178424
print(LDA_acc)

# get predictions and write them to a file
test_file = '/content/drive/My Drive/DMT_HW3/singletoken_test_fever_homework_NLP.jsonl'
data_test, no_label_test, map_idx_to_id = data_label_split(test_file, model,
                                                           train_set=False)
label_test = clf.predict(data_test)
write_test_prediction('final_test_pred.jsonl', map_idx_to_id, label_test)
plt.show()

# 1. PCA
X = data
y = data[:, 0]  # labels, defined up front since the plots below color by y
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(X)
X_new = pca.transform(X)
plt.scatter(X_new[:, 0], X_new[:, 1], marker='o', c=y)
plt.show()

# 2. LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=2)
lda.fit(X, y)
X_new = lda.transform(X)
plt.scatter(X_new[:, 0], X_new[:, 1], marker='o', c=y)
plt.show()

# 3. KPCA
from sklearn.decomposition import KernelPCA
transformer = KernelPCA(n_components=7, kernel='rbf')
X_transformed = transformer.fit_transform(X)
X_transformed.shape
plt.scatter(X_transformed[:, 0], X_transformed[:, 1], marker='o', c=y)
plt.show()

# 4. Isomap
from sklearn.manifold import Isomap
embedding = Isomap(n_components=2)
df_2018 = df_2018.dropna()
df_2018['Week_Number'] = range(1, len(df_2018['Week_Number']) + 1)
X_2018 = df_2018[['mu', 'sd']].values
Y_2018 = df_2018['Label'].values

scaler = StandardScaler()
scaler.fit(X_2017)
X_2017 = scaler.transform(X_2017)
scaler.fit(X_2018)
X_2018 = scaler.transform(X_2018)

# Question 1
lda_classifier = LDA(n_components=2)
lda_classifier.fit(X_2017, Y_2017)
print()
print('Question 1\n')
print(
    f'Equation for linear classifier: ({lda_classifier.coef_[0][0]})x1 + ({lda_classifier.coef_[0][1]})x2 + ({lda_classifier.intercept_[0]}) = 0'
)

qda_classifier = QDA()
qda_classifier.fit(X_2017, Y_2017)

# Question 2: score the 2018 features against the 2018 labels
# (the original paired X_2017 with Y_2018, which mismatches rows).
lda_accuracy = lda_classifier.score(X_2018, Y_2018)
qda_accuracy = qda_classifier.score(X_2018, Y_2018)
print()
print('Question 2\n')
data = master_data[:, 0:300].T
classes = master_labels[0:300]

# Splits the data into a training set and randomized test set with
# accompanying labels
X_train, X_test, y_train, y_test = train_test_split(data, classes, test_size=0.2)

# Scales the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Perform the LDA with one component. Note that fit() returns the fitted
# estimator, not transformed data, so don't assign its result to X_train.
lda = LDA(n_components=1)
lda.fit(X_train, y_train)
result = lda.score(X_test, y_test)
print('Score: ' + str(result))
# X_test = lda.transform(X_test)

#%% Test with SVM
from sklearn import svm

# Splits the data into a training set and randomized test set with
# accompanying labels
X_train, X_test, y_train, y_test = train_test_split(data, classes,
                                                    test_size=0.2,
                                                    random_state=0)

# Scales the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
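# The distinction behind the fix above, in one place: fit() returns the
# estimator itself, while fit_transform()/transform() return projected
# data. A minimal sketch of the reduction workflow, reusing the names from
# the snippet above:
lda = LDA(n_components=1)
X_train_lda = lda.fit_transform(X_train, y_train)  # shape (n_train, 1)
X_test_lda = lda.transform(X_test)  # project the test set with the same axis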
def main(): random.seed(64) pop = toolbox.population(n=100) CXPB, MUTPB, NGEN = 0.5, 0.2, 300 print("Start of evolution") fitnesses = list(map(toolbox.evaluate, pop)) for ind, fit in zip(pop, fitnesses): ind.fitness.values = fit print(" Evaluated %i individuals" % len(pop)) for g in range(NGEN): print("-- Generation %i --" % g) offspring = toolbox.select(pop, len(pop)) offspring = list(map(toolbox.clone, offspring)) for child1, child2 in zip(offspring[::2], offspring[1::2]): if random.random() < CXPB: toolbox.mate(child1, child2) del child1.fitness.values del child2.fitness.values for mutant in offspring: if random.random() < MUTPB: toolbox.mutate(mutant) del mutant.fitness.values invalid_ind = [ind for ind in offspring if not ind.fitness.valid] fitnesses = map(toolbox.evaluate, invalid_ind) for ind, fit in zip(invalid_ind, fitnesses): ind.fitness.values = fit print(" Evaluated %i individuals" % len(invalid_ind)) pop[:] = offspring fits = [ind.fitness.values[0] for ind in pop] length = len(pop) mean = sum(fits) / length sum2 = sum(x*x for x in fits) std = abs(sum2 / length - mean**2)**0.5 print(" Min %s" % min(fits)) print(" Max %s" % max(fits)) print(" Avg %s" % mean) print(" Std %s" % std) print ('Total Time is ' + str(time.time()-start_time) + ' seconds.') print("-- End of (successful) evolution --") best_ind = tools.selBest(pop, 1)[0] print("Best individual is %s, %s" % (best_ind, best_ind.fitness.values)) new_features = [] for i in range(0,len(best_ind)-1): if best_ind[i] == 1: new_features.append(i) ############################################################## ############################################################## ############################################################## ####Ensemble Learning print('#####################################################') new_X_train = Xtrain[new_features] new_Y_train = Ytrain new_X_test = Xtest[new_features] new_Y_test = Ytest clf = LinearDiscriminantAnalysis() sam = RandomUnderSampler(random_state=42) new_X_train, new_Y_train = sam.fit_sample(new_X_train, new_Y_train) y_pred = clf.fit(new_X_train, new_Y_train).predict(new_X_test) f = open('sampler_GA.txt','w') print(classification_report(Ytest,y_pred)) print(roc_auc_score(Ytest,y_pred)) f.write(classification_report(Ytest,y_pred)) f.write('\n') f.write('ROC = ' + str(roc_auc_score(Ytest,y_pred))) f.write('\n') f.close()
skf = StratifiedKFold(n_splits=4)
acc_total = []
for train_index, test_index in skf.split(bag_instance_features, bag_labels):
    bag_train_labels, bag_train_features = bag_labels[train_index], bag_instance_features[train_index]
    bag_test_labels, bag_test_features = bag_labels[test_index], bag_instance_features[test_index]

    # Propagate each bag's label to all of its instances for training.
    train_labels = []
    for i in range(0, len(bag_train_features)):
        if bag_train_labels[i] == 1:
            train_labels.append(np.ones(len(bag_train_features[i]), dtype=int))
        else:
            train_labels.append(np.zeros(len(bag_train_features[i]), dtype=int))
    train_labels = np.concatenate(train_labels)
    train_features = np.vstack(bag_train_features)

    lda = LinearDiscriminantAnalysis()
    lda.fit(train_features, train_labels)

    # Predict instance labels per bag, then combine them into a bag label.
    predict_label = []
    for i in range(0, len(bag_test_labels)):
        pred = lda.predict(bag_test_features[i])
        predict_label.append(combineinstlabels(pred))

    acc = accuracy_score(bag_test_labels, predict_label)
    acc_total.append(acc)

print(np.mean(acc_total))
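# combineinstlabels() is not shown in this snippet; a plausible
# majority-vote implementation for turning instance-level predictions into
# a bag label (an assumption, not the original helper):
import numpy as np

def combineinstlabels(pred):
    # A bag is labeled positive when at least half its instances are.
    return int(np.mean(pred) >= 0.5)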
for i in range(len(column_index)):
    c = []
    a = X[column_index[i]]
    for j in range(len(a)):
        if a[j] != 0:
            c.append(1 / a[j])
        else:
            c.append(100000)
    name = '1/' + column_index[i]
    X.insert(dim, '%s' % name, c)
    dim = dim + 1
print("Second feature expansion:", X.shape)

lda = LinearDiscriminantAnalysis(n_components=4)
lda.fit(X, Y)
X = lda.transform(X)

# clf = ExtraTreesClassifier()
# X_new = clf.fit(X, Y)
# print(clf.feature_importances_)

# from sklearn.feature_selection import SelectKBest, chi2  # chi2 requires non-negative features
# X_new = SelectKBest(chi2, k=2).fit_transform(X, Y)

# pca = PCA(n_components=10)
# pca.fit(X)
# X = pca.transform(X)  # dimensionality reduction with PCA

print("After LDA dimensionality reduction:", X.shape)
def test_lda(): # Load data X, y = load_iris(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=4242) # Create and fit model model = LinearDiscriminantAnalysis(store_covariance=True) model.fit(X_train, y_train) # Select data point for explaining its prediction x_orig = X_test[1:4][0, :] assert model.predict([x_orig]) == 2 # Compute counterfactual features_whitelist = None x_cf, y_cf, delta = generate_counterfactual( model, x_orig, 0, features_whitelist=features_whitelist, regularization="l1", optimizer="mp", return_as_dict=False) assert y_cf == 0 assert model.predict(np.array([x_cf])) == 0 x_cf, y_cf, delta = generate_counterfactual( model, x_orig, 1, features_whitelist=features_whitelist, regularization="l1", optimizer="mp", return_as_dict=False) assert y_cf == 1 assert model.predict(np.array([x_cf])) == 1 cf = generate_counterfactual(model, x_orig, 0, features_whitelist=features_whitelist, regularization="l1", optimizer="mp", return_as_dict=True) assert cf["y_cf"] == 0 assert model.predict(np.array([cf["x_cf"]])) == 0 x_cf, y_cf, delta = generate_counterfactual( model, x_orig, 0, features_whitelist=features_whitelist, regularization="l2", optimizer="mp", return_as_dict=False) assert y_cf == 0 assert model.predict(np.array([x_cf])) == 0 x_cf, y_cf, delta = generate_counterfactual( model, x_orig, 0, features_whitelist=features_whitelist, regularization="l1", C=1.0, optimizer="bfgs", return_as_dict=False) assert y_cf == 0 assert model.predict(np.array([x_cf])) == 0 x_cf, y_cf, delta = generate_counterfactual( model, x_orig, 0, features_whitelist=features_whitelist, regularization="l1", C=1.0, optimizer="nelder-mead", return_as_dict=False) assert y_cf == 0 assert model.predict(np.array([x_cf])) == 0 x_cf, y_cf, delta = generate_counterfactual( model, x_orig, 0, features_whitelist=features_whitelist, regularization=None, optimizer="bfgs", return_as_dict=False) assert y_cf == 0 assert model.predict(np.array([x_cf])) == 0 x_cf, y_cf, delta = generate_counterfactual( model, x_orig, 0, features_whitelist=features_whitelist, regularization=None, optimizer="nelder-mead", return_as_dict=False) assert y_cf == 0 assert model.predict(np.array([x_cf])) == 0 features_whitelist = [0, 1, 2] x_cf, y_cf, delta = generate_counterfactual( model, x_orig, 0, features_whitelist=features_whitelist, regularization="l1", optimizer="mp", return_as_dict=False) assert y_cf == 0 assert model.predict(np.array([x_cf])) == 0 assert all([ True if i in features_whitelist else delta[i] <= 10e-5 for i in range(x_orig.shape[0]) ]) x_cf, y_cf, delta = generate_counterfactual( model, x_orig, 0, features_whitelist=features_whitelist, regularization="l2", optimizer="mp", return_as_dict=False) assert y_cf == 0 assert model.predict(np.array([x_cf])) == 0 assert all([ True if i in features_whitelist else delta[i] <= 10e-5 for i in range(x_orig.shape[0]) ]) x_cf, y_cf, delta = generate_counterfactual( model, x_orig, 0, features_whitelist=features_whitelist, regularization="l1", C=1.0, optimizer="bfgs", return_as_dict=False) assert y_cf == 0 assert model.predict(np.array([x_cf])) == 0 assert all([ True if i in features_whitelist else delta[i] == 0. for i in range(x_orig.shape[0]) ]) x_cf, y_cf, delta = generate_counterfactual( model, x_orig, 0, features_whitelist=features_whitelist, regularization="l1", C=1.0, optimizer="nelder-mead", return_as_dict=False) assert y_cf == 0 assert model.predict(np.array([x_cf])) == 0 assert all([ True if i in features_whitelist else delta[i] == 0. 
for i in range(x_orig.shape[0]) ]) x_cf, y_cf, delta = generate_counterfactual( model, x_orig, 0, features_whitelist=features_whitelist, regularization=None, optimizer="bfgs", return_as_dict=False) assert y_cf == 0 assert model.predict(np.array([x_cf])) == 0 assert all([ True if i in features_whitelist else delta[i] == 0. for i in range(x_orig.shape[0]) ]) x_cf, y_cf, delta = generate_counterfactual( model, x_orig, 0, features_whitelist=features_whitelist, regularization=None, optimizer="nelder-mead", return_as_dict=False) assert y_cf == 0 assert model.predict(np.array([x_cf])) == 0 assert all([ True if i in features_whitelist else delta[i] == 0. for i in range(x_orig.shape[0]) ]) # Other stuff from ceml.sklearn import LdaCounterfactual with pytest.raises(TypeError): LdaCounterfactual(sklearn.linear_model.LogisticRegression()) model = LinearDiscriminantAnalysis() model.fit(X_train, y_train) with pytest.raises(AttributeError): LdaCounterfactual(model) with pytest.raises(AttributeError): generate_counterfactual(model, x_orig, 0)
y = np.array(y)
np.save('X_data', X)
np.save('y_data', y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# make classifiers
LDA = LinearDiscriminantAnalysis()
SVM = SVC()
SVM_RBF = SVC(kernel='rbf', gamma='scale')
LogReg = LogisticRegression()
RFC = RandomForestClassifier(n_estimators=100, max_depth=100)

# fit classifiers
LDA.fit(X_train, y_train)
SVM.fit(X_train, y_train)
SVM_RBF.fit(X_train, y_train)
LogReg.fit(X_train, y_train)
RFC.fit(X_train, y_train)

# predict results
prediction_LDA = LDA.predict(X_test)
prediction_SVM = SVM.predict(X_test)
prediction_SVM_RBF = SVM_RBF.predict(X_test)
prediction_LogReg = LogReg.predict(X_test)
prediction_RFC = RFC.predict(X_test)

# accuracy score of classifiers
acc_LDA = accuracy_score(y_test, prediction_LDA)
acc_SVM = accuracy_score(y_test, prediction_SVM)
    plot_ellipse(splot, lda.means_[1], lda.covariance_, 'blue')


def plot_qda_cov(qda, splot):
    plot_ellipse(splot, qda.means_[0], qda.covariance_[0], 'red')
    plot_ellipse(splot, qda.means_[1], qda.covariance_[1], 'blue')


plt.figure(figsize=(10, 8), facecolor='white')
plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis',
             y=0.98, fontsize=15)
for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
    # Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # Quadratic Discriminant Analysis
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)
    y_pred = qda.fit(X, y).predict(X)
    splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
    plot_qda_cov(qda, splot)
    plt.axis('tight')

plt.tight_layout()
plt.subplots_adjust(top=0.92)
plt.show()
plt.semilogx(C, testScore, label='test accuracy')
plt.semilogx(C, trainScore, label='train accuracy')
plt.legend()
plt.ylabel("Accuracy")
plt.xlabel("C")
plt.title('Logistic: training/test data accuracy over different C')
plt.show()

# 4.2 Linear Discriminant classifier
print('-------------------LinearDiscriminant classifier-------------------')
lda = LinearDiscriminantAnalysis()
trainScore = []
testScore = []
for i in range(nmc):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    trainFit = lda.fit(X_train, y_train)
    trainScore.append(trainFit.score(X_train, y_train))
    testScore.append(trainFit.score(X_test, y_test))
print('test accuracy {0:4.4f}, train accuracy {1:4.4f}'.format(
    np.mean(testScore), np.mean(trainScore)))

# 4.3 Quadratic Discriminant classifier
print('-------------------Quadratic Discriminant classifier-------------------')
qda = QuadraticDiscriminantAnalysis()
trainScore = []
testScore = []
for i in range(nmc):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    trainFit = qda.fit(X_train, y_train)
    trainScore.append(trainFit.score(X_train, y_train))
    testScore.append(trainFit.score(X_test, y_test))
def old_feature_merging(mode=['CSP', 'TDP'], key_name=['csp', 'tdp'], cls='lsvm'): import os if os.path.isfile('result_files/merging_original_/result_' + mode[0] + '&' + mode[1] + '_' + cls + '_none.csv'): print(mode[0] + '&' + mode[1] + '_' + cls + ' already done.') return path1 = 'E:/Richard/MultiData/' + mode[0] + '/' path2 = 'E:/Richard/MultiData/' + mode[1] + '/' files = os.listdir(path1) for file in files: try: data1 = scipy.io.loadmat(path1 + file)[key_name[0]][0][0] data2 = scipy.io.loadmat(path2 + file)[key_name[1]][0][0] except: continue train_x1 = data1[0] train_y1 = data1[1] test_x1 = data1[2] test_y1 = data1[3] train_x2 = data2[0] train_y2 = data2[1] test_x2 = data2[2] test_y2 = data2[3] for i in range(5): tx1 = np.transpose(train_x1[i]) tx2 = np.transpose(train_x2[i]) ty = np.transpose(train_y1[i]).argmax(axis=1) vx1 = np.transpose(test_x1[i]) vx2 = np.transpose(test_x2[i]) vy = np.transpose(test_y1[i]).argmax(axis=1) if mode[0] == 'PSD': max_value = tx1.max() tx1 = tx1 / max_value vx1 = vx1 / max_value if mode[1] == 'PSD': max_value = tx2.max() tx2 = tx2 / max_value vx2 = vx2 / max_value tx = np.concatenate((tx1, tx2), axis=1) vx = np.concatenate((vx1, vx2), axis=1) from sklearn import svm, linear_model from sklearn import ensemble if cls == 'lsvm': lda = svm.LinearSVC() elif cls == 'ksvm': lda = svm.SVC(kernel='linear') elif cls == 'gb': lda = ensemble.GradientBoostingClassifier() elif cls == 'srlda': lda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto') lda.fit(tx, ty) y_predict = lda.predict(vx) coh = cohen_kappa_score(vy, y_predict) acc = accuracy_score(vy, y_predict) pen = open( 'result_files/merging_original_/result_' + mode[0] + '&' + mode[1] + '_' + cls + '_none.csv', 'a') pen.write(file + ',' + str(i) + ',' + str(coh) + ',' + str(acc) + '\n') pen.close()
def _initialize_components(n_components, input, y=None, init='auto', verbose=False, random_state=None, has_classes=True): """Returns the initial transformation to be used depending on the arguments. Parameters ---------- n_components : int The number of components to take. (Note: it should have been checked before, meaning it should not be None and it should be a value in [1, X.shape[1]]) input : array-like The input samples (can be tuples or regular samples). y : array-like or None The input labels (or not if there are no labels). init : string or numpy array, optional (default='auto') Initialization of the linear transformation. Possible options are 'auto', 'pca', 'lda', 'identity', 'random', and a numpy array of shape (n_features_a, n_features_b). 'auto' Depending on ``n_components``, the most reasonable initialization will be chosen. If ``n_components <= n_classes`` we use 'lda' (see the description of 'lda' init), as it uses labels information. If not, but ``n_components < min(n_features, n_samples)``, we use 'pca', as it projects data onto meaningful directions (those of higher variance). Otherwise, we just use 'identity'. 'pca' ``n_components`` principal components of the inputs passed to :meth:`fit` will be used to initialize the transformation. (See `sklearn.decomposition.PCA`) 'lda' ``min(n_components, n_classes)`` most discriminative components of the inputs passed to :meth:`fit` will be used to initialize the transformation. (If ``n_components > n_classes``, the rest of the components will be zero.) (See `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`). This initialization is possible only if `has_classes == True`. 'identity' The identity matrix. If ``n_components`` is strictly smaller than the dimensionality of the inputs passed to :meth:`fit`, the identity matrix will be truncated to the first ``n_components`` rows. 'random' The initial transformation will be a random array of shape `(n_components, n_features)`. Each value is sampled from the standard normal distribution. numpy array n_features_b must match the dimensionality of the inputs passed to :meth:`fit` and n_features_a must be less than or equal to that. If ``n_components`` is not None, n_features_a must match it. verbose : bool Whether to print the details of the initialization or not. random_state : int or `numpy.RandomState` or None, optional (default=None) A pseudo random number generator object or a seed for it if int. If ``init='random'``, ``random_state`` is used to initialize the random transformation. If ``init='pca'``, ``random_state`` is passed as an argument to PCA when initializing the transformation. has_classes : bool (default=True) Whether the labels are in fact classes. If true, this will allow to use the 'lda' initialization. Returns ------- init_components : `numpy.ndarray` The initial transformation to use. 
""" # if we are doing a regression we cannot use lda: n_features = input.shape[-1] authorized_inits = ['auto', 'pca', 'identity', 'random'] if has_classes: authorized_inits.append('lda') if isinstance(init, np.ndarray): # we copy the array, so that if we update the metric, we don't want to # update the init init = check_array(init, copy=True) # Assert that init.shape[1] = X.shape[1] if init.shape[1] != n_features: raise ValueError( 'The input dimensionality ({}) of the given ' 'linear transformation `init` must match the ' 'dimensionality of the given inputs `X` ({}).'.format( init.shape[1], n_features)) # Assert that init.shape[0] <= init.shape[1] if init.shape[0] > init.shape[1]: raise ValueError( 'The output dimensionality ({}) of the given ' 'linear transformation `init` cannot be ' 'greater than its input dimensionality ({}).'.format( init.shape[0], init.shape[1])) # Assert that self.n_components = init.shape[0] if n_components != init.shape[0]: raise ValueError('The preferred dimensionality of the ' 'projected space `n_components` ({}) does' ' not match the output dimensionality of ' 'the given linear transformation ' '`init` ({})!'.format(n_components, init.shape[0])) elif init not in authorized_inits: raise ValueError( "`init` must be '{}' " "or a numpy array of shape (n_components, n_features).".format( "', '".join(authorized_inits))) random_state = check_random_state(random_state) if isinstance(init, np.ndarray): return init n_samples = input.shape[0] if init == 'auto': if has_classes: n_classes = len(np.unique(y)) else: n_classes = -1 init = _auto_select_init(has_classes, n_features, n_samples, n_components, n_classes) if init == 'identity': return np.eye(n_components, input.shape[-1]) elif init == 'random': return random_state.randn(n_components, input.shape[-1]) elif init in {'pca', 'lda'}: init_time = time.time() if init == 'pca': pca = PCA(n_components=n_components, random_state=random_state) if verbose: print('Finding principal components... ') sys.stdout.flush() pca.fit(input) transformation = pca.components_ elif init == 'lda': lda = LinearDiscriminantAnalysis(n_components=n_components) if verbose: print('Finding most discriminative components... ') sys.stdout.flush() lda.fit(input, y) transformation = lda.scalings_.T[:n_components] if verbose: print('done in {:5.2f}s'.format(time.time() - init_time)) return transformation
clf1.fit(X_train, y_train)
pred = clf1.predict(X_test)
print(accuracy_score(pred, y_test))
showCON('DTC', clf1, X_test, y_test)

clf2 = LogisticRegression(random_state=0, solver='lbfgs',
                          multi_class='multinomial').fit(X_train, y_train)
pred = clf2.predict(X_test)
print(accuracy_score(pred, y_test))
showCON('LogisticRegression', clf2, X_test, y_test)

clf3 = LinearDiscriminantAnalysis()
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
clf3.fit(X_train, y_train)
clf3_score = clf3.score(X_test, y_test)
print(clf3_score)
showCON('LDA', clf3, X_test, y_test)

clf4 = KNeighborsClassifier(n_neighbors=4)
clf4.fit(X_train, y_train)
clf4_score = clf4.score(X_test, y_test)
print(clf4_score)
showCON('KNN(K=4)', clf4, X_test, y_test)

clf5 = tree.DecisionTreeClassifier()
clf5 = clf5.fit(X_train, y_train)
pred = clf5.predict(X_test)
print(accuracy_score(pred, y_test))
def main(): start = time.time() Data_path = "C:\\Users\\user\\Desktop\\Drone\\LDA\\Data\\" eegData_txt = Data_path + 'eegData.out' stims_txt = Data_path + 'stims.out' moveData_eeg = 'C:\\Users\\user\\Desktop\\Drone\\LDA\\Training\\eegData\\' moveData_stims = 'C:\\Users\\user\\Desktop\\Drone\\LDA\\Training\\stims\\' ##Generate Preprocessing Training data ctime = datetime.today().strftime("%m%d_%H%M") Classifier_path = 'C:/Users/user/Desktop/Drone/LDA/Model/' + ctime + 'Classifier.pickle' channelNum = 7 samplingFreq = 300 while True: if os.path.isfile(eegData_txt): break while True: if os.path.isfile(eegData_txt) & os.path.isfile(stims_txt): eegData = np.loadtxt(eegData_txt, delimiter=",") stims = np.loadtxt(stims_txt, delimiter=",") ctime = datetime.today().strftime("%m%d_%H%M%S") moveData_e = moveData_eeg + ctime + 'eegData.out' moveData_s = moveData_stims + ctime + 'stims.out' shutil.move(eegData_txt, moveData_e) shutil.move(stims_txt, moveData_s) break print("got process") ##Preprocessing process #Bandpass Filter eegData = butter_bandpass_filter(eegData, 0.1, 30, samplingFreq, order=4) #Epoching epochSampleNum = int(np.floor(1.0 * samplingFreq)) offset = int(np.floor(0.0 * samplingFreq)) baseline = int(np.floor(1.0 * samplingFreq)) [EpochsT, NumT] = Epoching(eegData, stims, 1, samplingFreq, channelNum, epochSampleNum, offset, baseline) [EpochsN, NumN] = Epoching(eegData, stims, 0, samplingFreq, channelNum, epochSampleNum, offset, baseline) EpochsN_New = np.zeros((NumT, channelNum, epochSampleNum)) NumN = NumT for i in range(NumN): EpochsN_New[i, :, :] = np.mean(EpochsN[i * 5:i * 5 + 5, :, :], axis=0) resampleRate = 100 #Convert to feature vector [EpochsT_Aver, NumT_Aver, EpochsN_Aver, NumN_Aver] = Make_Average_Component(EpochsT, NumT, EpochsN_New, NumT, channelNum, epochSampleNum, 20) EpochsT_Aver = resampling(EpochsT_Aver, NumT_Aver, resampleRate, channelNum) EpochsN_Aver = resampling(EpochsN_Aver, NumN_Aver, resampleRate, channelNum) featureNum = channelNum * resampleRate [FeaturesT, FeaturesN] = Convert_to_featureVector(EpochsT_Aver, NumT_Aver, EpochsN_Aver, NumN_Aver, featureNum) TrainData = np.concatenate((FeaturesT, FeaturesN)) TrainLabel = np.concatenate((np.ones( (NumT_Aver, 1)).astype(int), np.zeros( (NumN_Aver, 1)).astype(int))).ravel() #Saving LDA classifier lda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto') lda.fit(TrainData, TrainLabel) joblib.dump(lda, Classifier_path, protocol=2) print("time :", time.time() - start)
        # Initially we set each class score to zero.
        scores = np.zeros((x_tests.shape[0], class_label))
        # Calculate the (log) probability for each class.
        for label in range(class_label):
            # logpdf gives the log-density of the multivariate normal
            # fitted for this class.
            normal_distribution_prob = multivariate_normal(mean=self.mu[label],
                                                           cov=self.sigma[label])
            # x_tests can contain multiple test points; score each of them.
            for i, x in enumerate(x_tests):
                scores[i, label] = (np.log(self.phi[label])
                                    + normal_distribution_prob.logpdf(x))
        predictions = np.argmax(scores, axis=1)
        return predictions


if __name__ == '__main__':
    data = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target)

    # Use a distinct instance name to avoid shadowing the class.
    gda = GaussianDiscriminantAnalysis()
    gda.fit(x_train, y_train)
    y_predict = gda.predict(x_test)
    score = f1_score(y_test, y_predict, average="weighted")
    print("f1 score of our model: ", score)

    # Test scikit-learn model
    lda = LinearDiscriminantAnalysis()
    lda.fit(x_train, y_train)
    y_predict_sk = lda.predict(x_test)
    print("f1 score of scikit-learn model is: ",
          f1_score(y_test, y_predict_sk, average="weighted"))
tslag["Lag_" + str(i + 1)] = tslag["Lag_" + str(i + 1)].pct_change() tslag.fillna(0, inplace=True) tslag["Direction"] = np.sign(tslag["returns"]) #Use the prior two days of returns as predictor values, #with direction as the response #if the number of Lag is positive or negative, than has a impact of sign in #Direction section X = tslag[["Lag_1", "Lag_2"]] y = tslag["Direction"] #Creating test sets X_train = X[X.index < start_test] X_test = X[X.index >= start_test] y_train = y[y.index < start_test] y_test = y[y.index >= start_test] #Create dataframe predictions pred = pd.DataFrame(index=y_test.index) lda = LDA() lda.fit(X_train, y_train) y_pred = lda.predict(X_test) pred = (1.0 + (y_pred == y_test)) / 2 hit_rate = np.mean(pred) print("Linear Discriminant Analysis {:.4f}", format(hit_rate)) """ Linear Discriminant Analysis {:.4f} 0.742 """
def predict(self, X):
    project = np.dot(X, self.w.T)
    w_u0 = np.dot(self.w, self.u0.T)
    w_u1 = np.dot(self.w, self.u1.T)
    # Points whose projection is closer to w_u1 than to w_u0 are assigned
    # to class 1.
    return (np.abs(project - w_u1) < np.abs(project - w_u0)).astype(int)


data = pd.read_csv('../3.3/watermelon3_0_Ch.csv').values
X = data[:, 7:9].astype(float)
y = data[:, 9]
y[y == '是'] = 1  # '是' (yes) -> 1
y[y == '否'] = 0  # '否' (no) -> 0
y = y.astype(int)

lda = LDA()
lda.fit(X, y)
y_predict = lda.predict(X).flatten()

clf = LinearDiscriminantAnalysis()
clf.fit(X, y)
y_clf_predict = clf.predict(X)

plt.figure(figsize=(15, 10))
plt.plot(np.arange(X.shape[0]), y, label='True')
plt.plot(np.arange(X.shape[0]), y_predict, label='Predict')
plt.plot(np.arange(X.shape[0]), y_clf_predict, label='Sklearn_Predict')
plt.legend()
plt.savefig('predict.png')
plt.show()
plt.show()

# Task 2:
# Note: fetch_mldata was removed from newer scikit-learn releases;
# fetch_openml('mnist_784') is the modern equivalent.
mnist = fetch_mldata('MNIST original')
train, test, train_targets, test_targets = train_test_split(
    mnist.data, mnist.target, test_size=0.50, random_state=42)

# Task 3:
max_value = 0
max_number = 0
for i in range(1, 6):
    lda = LDA(n_components=i)
    # Fit the projection on the training set only, then apply it to the test
    # set (refitting on the test set would leak its labels).
    lda_train = lda.fit(train, train_targets).transform(train)
    lda_test = lda.transform(test)
    knn = KNeighborsClassifier(round(math.sqrt(mnist.data.shape[0])),
                               metric='euclidean', weights='uniform')
    knn.fit(lda_train, train_targets)
    score = knn.score(lda_test, test_targets)
    print("Score for ", i, " components: ", score)
    if max_value < score:
        max_value = score
        max_number = i
print("Max for: ", max_number, " is: ", max_value)

# Task 4:
max_value = 0
max_number = 0
for i in range(1, 6):
                                 scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

# Compare algorithms
# pyplot.boxplot(results, labels=names)
# pyplot.title('Algorithm Comparison')
# pyplot.show()

# print([data['o'][len(data['c'])-2], data['l'][len(data['c'])-2],
#        data['h'][len(data['c'])-2], data['c'][len(data['c'])-2]])

LDA = LinearDiscriminantAnalysis()
LDA.fit(X, y)
summ = 0
print(co)
i = 0
# Predict from the OHLC values of a sliding window of recent days
print(
    LDA.predict([[
        data['l'][len(data['c']) - 9 + i], data['o'][len(data['c']) - 9 + i],
        data['c'][len(data['c']) - 9 + i], data['h'][len(data['c']) - 9 + i],
        data['l'][len(data['c']) - 8 + i], data['o'][len(data['c']) - 8 + i],
        data['c'][len(data['c']) - 8 + i], data['h'][len(data['c']) - 8 + i],
        data['l'][len(data['c']) - 7 + i], data['o'][len(data['c']) - 7 + i],
def main():
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    parser.add_option("-s", "--subset", help="One of 'COVID', 'NONCOVID'")
    parser.add_option("-l", "--log", action="store_true",
                      help="Take log(x+1) of the expression values")
    parser.add_option("-o", "--output_prefix", help="Output file prefix")
    (options, args) = parser.parse_args()

    expr_f = args[0]
    meta_f = args[1]
    prefix = options.output_prefix

    expr_df = pd.read_csv(expr_f, sep='\t', index_col=0)
    meta_df = pd.read_csv(meta_f, sep='\t')
    meta_df = meta_df.set_index('Albany_sampleID')

    # Remove patients for whom the 28 days have not yet elapsed
    meta_df = meta_df.loc[meta_df['Hospital_free_days'].notnull()]
    no_expression_data = set(meta_df.index) - set(expr_df.columns)
    meta_df = meta_df.drop(no_expression_data)

    # Remove non-COVID patients if requested
    if options.subset == 'COVID':
        meta_df = meta_df.loc[meta_df['COVID'] == 1]

    # Get metadata
    hospital_free = np.array(meta_df['Hospital_free_days'])
    icu_status = ['True' if is_icu == 1 else 'False'
                  for is_icu in meta_df['ICU_1']]
    covid_status = ['True' if is_covid == 1 else 'False'
                    for is_covid in meta_df['COVID']]

    # Filter the expression matrix according to the metadata
    expr_df = expr_df[meta_df.index]
    X = np.array(expr_df)
    X = X.T
    if options.log:
        X = np.log(X + 1)

    mod = PCA(n_components=2)
    X_pca = mod.fit_transform(X)
    _plot_scatter(X_pca, 'PCA', hospital_free, 'Hospital Free Days',
                  '{}.PCA_hospital_free.pdf'.format(prefix))
    _plot_scatter_discrete(X_pca, 'PCA', icu_status, 'ICU Status',
                           '{}.PCA_icu_status.pdf'.format(prefix))
    if options.subset is None:
        _plot_scatter_discrete(X_pca, 'PCA', covid_status, 'COVID-19\nStatus',
                               '{}.PCA_covid_status.pdf'.format(prefix))

    for perp in [5, 6, 7, 8, 9, 10]:
        mod_tsne = TSNE(n_components=2, perplexity=perp)
        # Reduce to at most 100 principal components before t-SNE
        mod_pca_100 = PCA(n_components=min([100, len(X)]))
        X_pca_100 = mod_pca_100.fit_transform(X)
        print(X_pca_100.shape)
        print('Fitting t-SNE...')
        X_tsne = mod_tsne.fit_transform(X_pca_100)
        print('done.')
        _plot_scatter(X_tsne, 't-SNE', hospital_free, 'Hospital Free Days',
                      '{}.tSNE_perp_{}_hospital_free.pdf'.format(prefix, perp))
        _plot_scatter_discrete(
            X_tsne, 't-SNE', icu_status, 'ICU Status',
            '{}.tSNE_perp_{}_icu_status.pdf'.format(prefix, perp))
        if options.subset is None:
            _plot_scatter_discrete(
                X_tsne, 't-SNE', covid_status, 'COVID-19\nStatus',
                '{}.tSNE_perp_{}_covid_status.pdf'.format(prefix, perp))

    # Linear discriminant analysis on hospital-free days, discretized into
    # three roughly equal-sized bins
    s_hospital_free = sorted(set(hospital_free))
    one_third = int(len(s_hospital_free) / 3)
    first_thresh = s_hospital_free[one_third]
    print(first_thresh)
    second_thresh = s_hospital_free[2 * one_third]
    discrete_y = []
    for hf in hospital_free:
        if hf < first_thresh:
            discrete_y.append(1)
        elif first_thresh <= hf < second_thresh:
            discrete_y.append(2)
        elif hf >= second_thresh:
            discrete_y.append(3)
    print(discrete_y)
    lda = LinearDiscriminantAnalysis(n_components=2)
    X_lda = lda.fit(X, discrete_y).transform(X)
    _plot_scatter(X_lda, 'LDA', hospital_free, 'Hospital Free Days',
                  '{}.LDA_hospital_free.pdf'.format(prefix))
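# The manual tercile binning above can also be written with pandas. A sketch,
# assuming hospital_free has enough distinct values for three quantile bins
# (pd.qcut raises on duplicate bin edges otherwise):
import pandas as pd

discrete_y = pd.qcut(hospital_free, q=3, labels=False) + 1  # labels 1..3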
          % (kernel, correct_prediction * 100. / len(predictions), len(predictions)))

#####################################################################################
#####################################################################################
## kNNs + LDA

print("\n### LDA")
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

NB_COMPONENTS = 2
lda = LinearDiscriminantAnalysis(n_components=NB_COMPONENTS)
# lda = LinearDiscriminantAnalysis(n_components=NB_COMPONENTS, solver="eigen", shrinkage="auto")
lda.fit(training, categories)
training_lda = lda.transform(training)
testing_lda = lda.transform(testing)

plot_embedding(training_lda, categories,
               "Linear Discriminant projection (training)",
               xlabel="1st dimension", ylabel="2nd dimension")
plot_embedding(testing_lda, testing_categories,
               "Linear Discriminant projection (testing)",
               xlabel="1st dimension", ylabel="2nd dimension")
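# The "kNNs + LDA" header promises a kNN on top of the projection; the excerpt
# stops at the plots, but a minimal sketch of that step, reusing
# training_lda/testing_lda and the label vectors (k=5 is an assumption here):
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(training_lda, categories)
accuracy = knn.score(testing_lda, testing_categories)
print("kNN accuracy in LDA space: %.2f%%" % (accuracy * 100.))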
elif mode == 0:
    inputDataClass = InputReader(['Medical_data.csv', 'test_medical.csv'], 0)
elif mode == 2:
    inputDataClass = InputReader('railwayBookingList.csv', 2)
elif mode == 3:
    inputDataClass = InputReader('river_data.csv', 3)

X = inputDataClass.Train
x_test = inputDataClass.Test

if mode == 1:
    # PCA for F-MNIST: project onto the top 80 components, keeping the label
    # column (last column) intact
    pca = PCA(n_components=80)
    X_new = pca.fit_transform(X[:, :-1])
    X = np.column_stack([X_new, X[:, -1]])
    x_test_new = pca.transform(x_test[:, :-1])
    x_test = np.column_stack([x_test_new, x_test[:, -1]])

print(f'LDA ---> projection on {max_dim} dimensions. . .')
Y_pred, acc, precision, recall, f1score, confMat = LDA(X, x_test, mode=1, max_dim=max_dim)

print("SKLEARN. . .")
model = LinearDiscriminantAnalysis(solver='eigen', shrinkage=None, priors=None,
                                   n_components=None, store_covariance=False,
                                   tol=0.0001)
model.fit(X[:, :-1], X[:, -1])
Y_pred = model.predict(x_test[:, :-1])

# Analyse performance: the ground truth is the last column of the test set
Y_true = x_test[:, -1]
acc = performanceAnalyser.calcAccuracyTotal(Y_pred, Y_true)
precision, recall, f1score = performanceAnalyser.goodness(Y_true, Y_pred)
confMat = performanceAnalyser.getConfusionMatrix(Y_true, Y_pred)
print(f'Accuracy:{acc}\n Precision:{precision}\n Recall:{recall}\n F1score:{f1score}\n Confusion Matrix:{confMat}\n')
def lda_project(spike_times, spike_clusters, event_times, event_groups,
                pre_time=0, post_time=0.5, cross_validation='kfold',
                num_splits=5, prob_left=None, custom_validation=None):
    """
    Use linear discriminant analysis to project population vectors onto the
    line that best separates the two groups. When cross-validation is used,
    the LDA projection is fitted on the training data, after which the test
    data are projected onto it.

    Parameters
    ----------
    spike_times : 1D array
        spike times (in seconds)
    spike_clusters : 1D array
        cluster ids corresponding to each spike in `spike_times`
    event_times : 1D array
        times (in seconds) of the events from the two groups
    event_groups : 1D array
        group identities of the events, can be any number of groups,
        accepts integers and strings
    pre_time : float
        time (in seconds) to include before each event
    post_time : float
        time (in seconds) to include after each event
    cross_validation : string
        which cross-validation method to use, options are:
            'none'           No cross-validation
            'kfold'          K-fold cross-validation
            'leave-one-out'  Leave out the trial that is being decoded
            'block'          Leave out the block the to-be-decoded trial is in
            'custom'         Any custom cross-validation provided by the user
    num_splits : integer
        ** only for 'kfold' cross-validation **
        Number of splits to use for k-fold cross-validation: a value of 5
        means the decoder is trained on 4/5 of the data and used to predict
        the remaining 1/5. This process is repeated five times so that all
        data have been used as both training and test set.
    prob_left : 1D array
        ** only for 'block' cross-validation **
        the probability of the stimulus appearing on the left for each trial
        in event_times
    custom_validation : generator
        ** only for 'custom' cross-validation **
        a generator object with the splits to be used for cross-validation,
        in this format:
            ( (split1_train_idxs, split1_test_idxs),
              (split2_train_idxs, split2_test_idxs),
              (split3_train_idxs, split3_test_idxs), ...)
    n_neurons : int
        Group size of the number of neurons to be sub-selected

    Returns
    -------
    lda_projection : 1D array
        the position along the LDA projection axis for the population vector
        of each trial
    """
    # Check input
    assert cross_validation in ['none', 'kfold', 'leave-one-out', 'block', 'custom']
    assert event_times.shape[0] == event_groups.shape[0]
    if cross_validation == 'block':
        assert event_times.shape[0] == prob_left.shape[0]
    if cross_validation == 'custom':
        assert isinstance(custom_validation, types.GeneratorType)

    # Get matrix of all neuronal responses
    times = np.column_stack(((event_times - pre_time), (event_times + post_time)))
    pop_vector, cluster_ids = get_spike_counts_in_bins(spike_times, spike_clusters, times)
    pop_vector = pop_vector.T

    # Initialize
    lda = LinearDiscriminantAnalysis()
    lda_projection = np.zeros(event_groups.shape)

    if cross_validation == 'none':
        # Find the best LDA projection on all data and transform those data
        lda_projection = lda.fit_transform(pop_vector, event_groups)
    else:
        # Perform cross-validation
        if cross_validation == 'leave-one-out':
            cv = LeaveOneOut().split(pop_vector)
        elif cross_validation == 'kfold':
            cv = KFold(n_splits=num_splits).split(pop_vector)
        elif cross_validation == 'block':
            block_lengths = [sum(1 for i in g) for k, g in groupby(prob_left)]
            blocks = np.repeat(np.arange(len(block_lengths)), block_lengths)
            cv = LeaveOneGroupOut().split(pop_vector, groups=blocks)
        elif cross_validation == 'custom':
            cv = custom_validation

        # Loop over the train/test splits
        for train_index, test_index in cv:
            # Find the LDA projection on the training data
            lda.fit(pop_vector[train_index], [event_groups[j] for j in train_index])
            # Project the held-out test data onto it
            lda_projection[test_index] = lda.transform(pop_vector[test_index]).T[0]

    return lda_projection
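# A usage sketch for lda_project with hypothetical toy inputs; it assumes
# get_spike_counts_in_bins and the model-selection imports used by the
# function are available in scope:
import numpy as np

rng = np.random.default_rng(0)
spike_times = np.sort(rng.uniform(0, 60, 200))   # 200 spikes in 60 s
spike_clusters = rng.integers(0, 3, 200)         # from 3 clusters
event_times = np.linspace(2, 58, 20)             # 20 events
event_groups = np.tile([0, 1], 10)               # two alternating groups

projection = lda_project(spike_times, spike_clusters, event_times,
                         event_groups, cross_validation='kfold', num_splits=5)
print(projection.shape)  # one projected value per event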
from performance import Portfolio, MarketIntradayPortfolio
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from pylab import *
from datetime import datetime

HS300 = getStock_C('000300')
SP500 = getStock_A('^GSPC')
HS300 = addFeatures(HS300)
SP500 = addFeatures(SP500)
# drop() returns a new frame, so assign the result back
HS300 = HS300.drop('ADOSC', axis=1)

X_train, y_train, X_test, y_test = Prep(HS300)
Classify(X_train, y_train, X_test, y_test, 'RF')
CV(X_train, y_train, 9, 'RF')

clf = LDA()
y_pred = clf.fit(X_train, y_train).predict(X_test)

symbol = 'CSI300'
start_test = datetime(2014, 1, 1)
end_period = datetime(2015, 9, 29)

bars = HS300[['Open', 'AdjClose']]
bars = bars[start_test:end_period]

signals = pd.DataFrame(index=bars.index)
signals['signal'] = 0.0
signals['signal'] = y_pred
# Short the stock on a 0 prediction
signals.loc[signals['signal'] == 0, 'signal'] = -1
# Position changes
signals['positions'] = signals['signal'].diff()
# Load libraries
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Load the Iris flower dataset
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Create and run an LDA, then use it to transform the features
lda = LinearDiscriminantAnalysis(n_components=1)
features_lda = lda.fit(features, target).transform(features)

# Print the number of features
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_lda.shape[1])

lda.explained_variance_ratio_

# Create and run LDA, keeping all components
lda = LinearDiscriminantAnalysis(n_components=None)
features_lda = lda.fit(features, target)

# Create array of explained variance ratios
lda_var_ratios = lda.explained_variance_ratio_

# Create function
def select_n_components(var_ratio, goal_var: float) -> int:
    # Set initial variance explained so far
    total_variance = 0.0
    # Set initial number of components
    n_components = 0
    # For the explained variance of each component:
    for explained_variance in var_ratio:
        # Add the explained variance to the running total
        total_variance += explained_variance
        n_components += 1
        # Stop once the goal level of explained variance is reached
        if total_variance >= goal_var:
            break
    return n_components
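# Run the function: the smallest number of components explaining at least 95%
# of the variance (the 0.95 goal is an arbitrary choice here)
print(select_n_components(lda_var_ratios, 0.95))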
apical_words = ["SZ", "SZW"]

training_mask = md['phone'].isin(training_list)
training_mask = training_mask.values  # .as_matrix()
training_md = md[training_mask].copy()
training_data = pca_out[training_mask].copy()

test_mask = md['phone'].isin(test_list)
test_mask = test_mask.values  # .as_matrix()
test_md = md[test_mask].copy()
test_data = pca_out[test_mask].copy()

# Train LDA on the training data
labs = np.array(training_md.phone)  # expand dims?
train_lda = LDA(n_components=int(n_lds))
train_lda.fit(training_data, labs)  # train the model on the data
train_lda_out = train_lda.transform(training_data)

# Score and/or categorize test data according to the trained LDA model
test_lda_out = train_lda.transform(test_data)

# LDA data for csv: training stacked on top of test
ld = pd.DataFrame(np.vstack([train_lda_out, test_lda_out]))
ld = ld.rename(columns={0: 'LD1', 1: 'LD2'})

# A subject column for the csv
subject_lab = [subject] * ld.shape[0]
subject_column = pd.DataFrame(subject_lab)
subject_column = subject_column.rename(columns={0: 'subj'})
# TODO get pandas to shut up about these two lines
def lda_1():
    x_data, y_label = load_datasets()
    model = LinearDiscriminantAnalysis(solver='eigen')
    model.fit(x_data, y_label)