def startlda(self):
    """Fit an LDA classifier on self.traindata / self.trainclass and
    print the predicted classes for self.testdata.

    Fixes over the original: the label vector is flattened to 1-D
    (the original built a 2-D nested list, which ``LDA.fit`` rejects),
    and the loop variable no longer shadows the test-data variable.
    """
    # NOTE: sklearn.lda was removed in scikit-learn 0.19; kept here for
    # consistency with the rest of this file (new location is
    # sklearn.discriminant_analysis.LinearDiscriminantAnalysis).
    from sklearn.lda import LDA
    clf = LDA()
    # Features: coerce every entry to float.
    X = [[float(v) for v in row] for row in self.traindata]
    # Labels: LDA.fit expects a flat 1-D sequence of class ids.
    Y = [int(v) for row in self.trainclass for v in row]
    test = [[float(v) for v in row] for row in self.testdata]
    clf.fit(X, Y)
    print(clf.predict(test))
class Ensemble:
    """Majority-vote ensemble of four sklearn classifiers (random forest,
    LDA, decision tree, AdaBoost), fitted and queried at construction.

    After construction, ``self.pred`` holds one voted label per test sample.
    """

    def __init__(self, data):
        # `data` is expected to expose features_train, labels_train and
        # features_test -- TODO confirm against the caller.
        self.rf = RandomForestClassifier(n_estimators=80, n_jobs=-1, min_samples_split=45, criterion='entropy')
        self.lda = LDA()
        self.dec = DecisionTreeClassifier(criterion='entropy')
        self.ada = AdaBoostClassifier(n_estimators=500, learning_rate=0.25)
        self.make_prediction(data)

    def make_prediction(self, data):
        ''' Make an ensemble prediction '''
        # Fit every base model on the same training split.
        self.rf.fit(data.features_train, data.labels_train)
        self.lda.fit(data.features_train, data.labels_train)
        self.dec.fit(data.features_train, data.labels_train)
        self.ada.fit(data.features_train, data.labels_train)
        pre_pred = []
        self.pred = []
        ada_pred = self.ada.predict(data.features_test)
        rf_pred = self.rf.predict(data.features_test)
        lda_pred = self.lda.predict(data.features_test)
        dec_pred = self.dec.predict(data.features_test)
        # One row of base-model votes per test sample.
        for i in range(len(rf_pred)):
            pre_pred.append([
                rf_pred[i],
                lda_pred[i],
                dec_pred[i],
                ada_pred[i]
            ])
        # Majority vote per sample: sort votes by their frequency in the
        # row (sort is stable, so ties keep rf/lda/dec/ada order) and take
        # the most frequent label.
        for entry in pre_pred:
            pred_list = sorted(entry, key=entry.count, reverse=True)
            self.pred.append(pred_list[0])
def tryLinearDiscriminantAnalysis(goFast):
    """Grid-search RandomizedPCA dimensionality reduction followed by an
    LDA classifier, printing the validation accuracy per parameter set.

    goFast -- when truthy, load the 1500-sample subset svmlight files
    instead of the full dt1 files.
    """
    from sklearn.datasets import dump_svmlight_file, load_svmlight_file
    if goFast:
        training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
        validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
        testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
    else:
        training_data, training_labels = load_svmlight_file("dt1.trn.svm", n_features=253659, zero_based=True)
        validation_data, validation_labels = load_svmlight_file("dt1.vld.svm", n_features=253659, zero_based=True)
        testing_data, testing_labels = load_svmlight_file("dt1.tst.svm", n_features=253659, zero_based=True)

    # NOTE: these module paths (sklearn.lda, sklearn.grid_search,
    # RandomizedPCA) belong to the pre-0.18 scikit-learn layout.
    from sklearn.lda import LDA
    from sklearn.metrics import accuracy_score
    from sklearn.grid_search import ParameterGrid
    from sklearn.decomposition import RandomizedPCA

    rpcaDataGrid = [{"n_components": [10,45,70,100],
                     "iterated_power": [2, 3, 4],
                     "whiten": [True]}]

    for rpca_parameter_set in ParameterGrid(rpcaDataGrid):
        rpcaOperator = RandomizedPCA(**rpca_parameter_set)
        rpcaOperator.fit(training_data,training_labels)
        # transform() takes only X; the label argument is presumably
        # ignored by this old API -- TODO confirm it can be dropped.
        new_training_data = rpcaOperator.transform(training_data,training_labels)
        new_validation_data = rpcaOperator.transform(validation_data,validation_labels)
        ldaOperator = LDA()
        ldaOperator.fit(new_training_data,training_labels)
        print "Score = " + str(accuracy_score(validation_labels,ldaOperator.predict(new_validation_data)))
def lda_on(train_x, train_y, test_x, test_y, feats_name='all_features'):
    """ Linear Discriminant Analysis """
    # Fit on the full training split and report train/test accuracy.
    lda = LDA()
    lda.fit(train_x, train_y, store_covariance=True)
    print feats_name, "(train):", lda.score(train_x, train_y)
    print feats_name, "(test):", lda.score(test_x, test_y)
    # Persist the fitted model (dataset_name is a module-level global).
    with open(dataset_name + '_lda_classif_' + feats_name + '.pickle', 'w') as w_f:
        cPickle.dump(lda, w_f)
    y_pred = lda.predict(test_x)
    # Hold out 20% of the training data and refit for a validation score.
    # NOTE: this refit overwrites the model pickled above.
    X_train, X_validate, y_train, y_validate = cross_validation\
        .train_test_split(train_x, train_y, test_size=0.2, random_state=0)
    lda.fit(X_train, y_train)
    print feats_name, "(validation):", lda.score(
        X_validate, y_validate)
    y_pred_valid = lda.predict(X_validate)
    # Dump confusion matrices for test and validation predictions.
    cm_test = confusion_matrix(test_y, y_pred)
    cm_valid = confusion_matrix(y_validate, y_pred_valid)
    # threshold='nan' disables numpy's array summarisation (Py2-era API).
    np.set_printoptions(threshold='nan')
    with open("cm_test" + feats_name + ".txt", 'w') as w_f:
        print >> w_f, cm_test
    with open("cm_valid" + feats_name + ".txt", 'w') as w_f:
        print >> w_f, cm_valid
def ldapredict(trainData, testData, trainOuts, testOuts):
    """Fit an LDA on the training split, print the fitted estimator, its
    test-set predictions, and the resulting accuracy (1 - error rate)."""
    model = LDA()
    fitted = model.fit(trainData, trainOuts)
    print(fitted)
    preds = model.predict(testData)
    print(preds)
    misses, error = sup.crunchTestResults(preds, testOuts, .5)
    print(1 - error)
def read_subpop_data(one_hot=True, fake_data=False, test_size=0.2, undersample=False):
    """Load the labeled/unlabeled subpopulation data and return a DataSets
    container with a test DataSet and a semi-supervised training set.

    one_hot     -- convert labels to one-hot vectors before wrapping.
    fake_data   -- accepted for interface compatibility; unused here.
    test_size   -- fraction of the labeled data held out for testing.
    undersample -- rebalance training classes with UnderSampler.
    """
    labeled_dic = convert_txt_to_npy(LABELED_RL_PATH)
    unlabeled_dic = convert_txt_to_npy(UNLABELED_RL_PATH, labeled=False)
    X_train, X_test, y_train, y_test = split_train_test(labeled_dic, test_size=test_size)

    class DataSets(object):
        # Bare attribute container for the splits.
        pass
    data_sets = DataSets()

    if undersample:
        # Third-party dependency imported lazily: only needed on this path.
        from unbalanced_dataset import UnderSampler
        US = UnderSampler(verbose=True)
        X_train, y_train = US.fit_transform(X_train, y_train)

    # Report a baseline LDA accuracy before any one-hot conversion.
    lda = LDA()
    lda.fit(X_train, y_train)
    score = metrics.accuracy_score(lda.predict(X_test), y_test)
    print("Baseline LDA: %f " % score)

    if one_hot:
        y_train = convert_to_one_hot(y_train)
        y_test = convert_to_one_hot(y_test)

    data_sets = DataSets()
    data_sets.test = DataSet(X_test, y_test)
    data_sets.train = SemiDataSet(unlabeled_dic['data'], X_train, y_train)
    return data_sets
def test_all_methods(self):
    """Run logistic regression, LDA, QDA and KNN on the weekly-returns
    data (ISLR ch.4-style exercise); print a confusion table for each
    classifier via tp.output_table."""
    x_cols = ["Lag2"]
    formula = "Direction~Lag2"
    # print self.df.shape[0]
    # Train on 1990-2008; everything after 2008 is the test period.
    train_data = self.df.ix[(self.df["Year"] >= 1990) & (self.df["Year"] <= 2008), :]
    # print train_data.shape[0]
    """ (d) logistic"""
    model = smf.glm(formula, data=train_data, family=sm.families.Binomial())
    result = model.fit()
    test_data = self.df.ix[self.df["Year"] > 2008, :]
    probs = Series(result.predict(sm.add_constant(test_data[["Lag2"]])))
    # NOTE(review): the Down/Up mapping assumes predict() returns the
    # probability of the "Down" level -- confirm against statsmodels.
    pred_values = probs.map(lambda x: "Down" if x > 0.5 else "Up")
    tp.output_table(pred_values.values, test_data[self.y_col].values)

    train_X = train_data[x_cols].values
    train_y = train_data[self.y_col].values
    test_X = test_data[x_cols].values
    test_y = test_data[self.y_col].values
    """ (e) LDA """
    lda_res = LDA().fit(train_X, train_y)
    pred_y = lda_res.predict(test_X)
    tp.output_table(pred_y, test_y)
    """ (f) QDA """
    qda_res = QDA().fit(train_X, train_y)
    pred_y = qda_res.predict(test_X)
    tp.output_table(pred_y, test_y)
    """ (g) KNN """
    clf = neighbors.KNeighborsClassifier(1, weights="uniform")
    clf.fit(train_X, train_y)
    pred_y = clf.predict(test_X)
    tp.output_table(pred_y, test_y)
    """ (h) logistic and LDA """
    """ (i) Is the purpose of the last question going through all methods with no direction?"""
def DLDA(self, trainLabel, featureData, testData):
    """Train an LDA on (featureData, trainLabel) and return the labels it
    predicts for testData."""
    classifier = LDA()
    classifier.fit(featureData, trainLabel)
    return classifier.predict(testData)
def test_twomethods(self):
    """Verify that an EPAC Methods(LDA, SVC) workflow reproduces the
    predictions of fitting the two sklearn estimators directly — both for
    run() results and for reduce() results."""
    key_y_pred = 'y' + conf.SEP + conf.PREDICTION
    X, y = datasets.make_classification(n_samples=20, n_features=5, n_informative=2)
    # = With EPAC
    wf = Methods(LDA(), SVC(kernel="linear"))
    r_epac = wf.run(X=X, y=y)
    # = With SKLEARN
    lda = LDA()
    svm = SVC(kernel="linear")
    lda.fit(X, y)
    svm.fit(X, y)
    r_sklearn = [lda.predict(X), svm.predict(X)]
    # Comparison: per-classifier predictions must match element-wise.
    for i_cls in range(2):
        comp = np.all(np.asarray(r_epac[i_cls][key_y_pred]) == np.asarray(r_sklearn[i_cls]))
        self.assertTrue(comp, u'Diff Methods')
    # test reduce
    # NOTE: `.values()[0]` indexing only works on Python 2 dicts
    # (Python 3 returns a non-indexable view).
    r_epac_reduce = [wf.reduce().values()[0][key_y_pred], wf.reduce().values()[1][key_y_pred]]
    comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn))
    self.assertTrue(comp, u'Diff Perm / CV: EPAC reduce')
def LDA模型(self, 問題, 答案):
    """Train an LDA model on (questions, answers) and return a callable
    that predicts answers for new questions."""
    lda = LDA()
    print('訓練LDA')
    lda.fit(問題, 答案)
    print('訓練了')

    def 預測(問):
        return lda.predict(問)

    return 預測
def eval_func(chromosome):
    """Fitness function: accuracy of an LDA trained on the features
    selected by `chromosome` (80/20 split via GetData)."""
    features = LoadFeatures(data_N_x, data_F_x, chromosome)
    train_x, train_y, test_x, test_y = GetData(0.8, features)
    model = LDA()
    model.fit(train_x, train_y)
    predicted = model.predict(test_x)
    return accuracy_score(test_y, predicted)
def do_lda(x, y, folds):
    """k-fold cross-validation of an LDA classifier.

    Shuffles the samples, partitions them into `folds` contiguous folds,
    and for each fold trains on the remainder and counts correct test
    predictions.  Returns the list of per-fold correct counts.

    Fix: the original used true division (`len(x) / folds`,
    `i / fold_size`), which yields floats on Python 3 so the
    `fold == fold_index` comparison almost never matched and fold
    assignment silently broke.  Floor division restores the intended
    (Python 2) behaviour on both interpreters.
    """
    indexes = list(range(len(x)))
    shuffle(indexes)
    x = [x[i] for i in indexes]
    y = [y[i] for i in indexes]
    fold_size = len(x) // folds  # integer fold size (see docstring)
    corrects = []
    for fold in range(folds):
        test_x, train_x, test_y, train_y = [], [], [], []
        for i in range(len(x)):
            # Samples past folds * fold_size always land in training,
            # exactly as in the original partitioning.
            if fold == i // fold_size:
                test_x.append(x[i])
                test_y.append(y[i])
            else:
                train_x.append(x[i])
                train_y.append(y[i])
        print('Partitioned data into fold')
        test_x, train_x = remove_redundant_dimensions(test_x, train_x)
        print('Removed redundant dimensions')
        lda = LDA()
        lda.fit(train_x, train_y)
        print('Fit lda')
        predictions = lda.predict(test_x)
        correct = sum(1 for i in range(len(predictions)) if predictions[i] == test_y[i])
        print('Did fold, correct:', correct)
        corrects.append(correct)
    return corrects
def main():
    """For each question 3..17, train four classifiers (kNN, SVM, random
    forest, LDA) on that question's train/test split and print percent
    accuracy for each."""
    for question in range(3,18):
        print("Question ", question, " Percent Accuracy")
        trainingSet_features, trainingSet_labels, testSet_features, testSet_labels = loadTrainingAndTestData(question)
        #print(len(trainingSet_features))
        #print(trainingSet_labels)
        #print(len(testSet_features))
        #print(len(testSet_labels))
        #print(trainingSet_labels)
        # k-nearest neighbours, k = 5.
        nnC = KNeighborsClassifier(n_neighbors=5)
        nnC.fit(trainingSet_features, trainingSet_labels)
        nnC_predictions = nnC.predict(testSet_features)
        print("Nearest Neighbor: %.2f" % (100*accuracy_score(testSet_labels,nnC_predictions)),"%")
        # Support vector machine with sklearn defaults (RBF kernel).
        svmC = svm.SVC()
        svmC.fit(trainingSet_features, trainingSet_labels)
        svmCpredictions = svmC.predict(testSet_features)
        print("Support Vector Machines: %.2f" % (100*accuracy_score(testSet_labels,svmCpredictions)),"%")
        # Random forest with 100 trees.
        rfC = RandomForestClassifier(n_estimators=100)
        rfC.fit(trainingSet_features, trainingSet_labels)
        rfC_predictions = rfC.predict(testSet_features)
        print("Random Forrest: %.2f" % (100*accuracy_score(testSet_labels,rfC_predictions)),"%")
        # LDA with the least-squares solver.
        ldaC = LDA(solver='lsqr')
        ldaC.fit(trainingSet_features, trainingSet_labels)
        ldaC_predictions = ldaC.predict(testSet_features)
        print("Linear Discriminant Analysis Classifier: %.2f" % (100*accuracy_score(testSet_labels,ldaC_predictions)),"%")
def DLDA(self, trainLabel, featureData, testData):
    """Fit an LDA classifier on the feature data and return its
    predictions for the test data."""
    model = LDA()
    model.fit(featureData, trainLabel)
    testLabel = model.predict(testData)
    return testLabel
def eval_lda(X_train, y_train, X_test, y_test):
    """Fit an LDA on the training set and return
    (train_error_rate, test_error_rate) as floats.

    Fix: the original returned ``wrongtrain/len(y_train)`` with two ints,
    which under Python 2 (this code uses ``xrange``) is floor division and
    silently truncates both error rates to 0.  Float division restores
    the intended fractional rates.
    """
    # Kept from the original: class priors are computed but not passed to
    # the classifier (the priors=pri variant is deliberately disabled).
    pri = prior(y_train)
    #clf = LDA(priors=pri)
    clf = LDA()
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred = clf.predict(X_test)
    # Count misclassifications on each split.
    wrongtrain = sum(1 for p, t in zip(y_pred_train, y_train) if p != t)
    wrongtest = sum(1 for p, t in zip(y_pred, y_test) if p != t)
    return float(wrongtrain) / len(y_train), float(wrongtest) / len(y_test)
def lda_on(train_x, train_y, test_x, test_y, feats_name='all_features'):
    """Fit LDA on (train_x, train_y): print train/test/validation scores,
    pickle the model, and write test/validation confusion matrices."""
    lda = LDA()
    lda.fit(train_x, train_y, store_covariance=True)
    print feats_name, "(train):", lda.score(train_x, train_y)
    print feats_name, "(test):", lda.score(test_x, test_y)
    # Persist the model fitted on the full training split
    # (dataset_name is a module-level global).
    with open(dataset_name + '_lda_classif_' + feats_name + '.pickle', 'w') as f:
        cPickle.dump(lda, f)
    y_pred = lda.predict(test_x)
    # 80/20 split of the training data for a validation score; note this
    # refits (and therefore replaces) the model pickled above.
    X_train, X_validate, y_train, y_validate = cross_validation.train_test_split(train_x, train_y, test_size=0.2, random_state=0)
    lda.fit(X_train, y_train)
    print feats_name, "(validation):", lda.score(X_validate, y_validate)
    y_pred_valid = lda.predict(X_validate)
    cm_test = confusion_matrix(test_y, y_pred)
    cm_valid = confusion_matrix(y_validate, y_pred_valid)
    # threshold='nan' disables numpy's array summarisation (Py2-era API).
    np.set_printoptions(threshold='nan')
    with open("cm_test" + feats_name + ".txt", 'w') as wf:
        print >> wf, cm_test
    with open("cm_valid" + feats_name + ".txt", 'w') as wf:
        print >> wf, cm_valid
def lda(data, labels, n, v_type):
    """Split the data, fit an LDA classifier, and return
    (accuracy, classification report, predictions, test labels,
    test data, fitted classifier, confusion matrix, "LDA").

    `n` is accepted for signature compatibility with sibling classifiers.
    """
    train_data, train_labels, test_data, test_labels = split_data(data, labels, v_type)
    clf = LDA()
    features = np.array(train_data, dtype=np.float64)
    targets = np.array(train_labels, dtype=np.float64)
    clf.fit(features, targets)
    y_pred = clf.predict(test_data)
    hits = sum(1 for i in range(len(y_pred)) if y_pred[i] == test_labels[i])
    pure_accuracy_rate = hits / float(len(test_labels))
    report = classification_report(y_pred, test_labels, target_names=rock_names)
    cm = confusion_matrix(test_labels, y_pred)
    return pure_accuracy_rate, report, y_pred, test_labels, test_data, clf, cm, "LDA"
def test():
    """Compare the hand-rolled myLDA with sklearn's LDA on a 2-class toy set.

    Fix: class1 has 4 samples and class2 has 3, but the original built the
    label vector as 3 zeros followed by 4 ones, mislabelling the sample at
    the class boundary.  The counts are now derived from the arrays.
    """
    class1 = np.mat([
        (2.9500, 6.6300),
        (2.5300, 7.7900),
        (3.5700, 5.6500),
        (3.1600, 5.4700),
    ])
    class2 = np.mat([
        (2.5800, 4.4600),
        (2.1600, 6.2200),
        (3.2700, 3.5200),
    ])
    test = (2.81, 5.46)
    lda = myLDA(class1, class2)
    print(lda.predict(test))
    lda = LDA()
    # Label counts derived from the data: 4 zeros for class1, 3 ones for
    # class2 (the original had them swapped).
    # NOTE: newer scikit-learn moved store_covariance from fit() to the
    # constructor; kept here to match the old sklearn.lda API.
    lda.fit(np.concatenate((class1, class2)),
            np.concatenate((np.zeros((class1.shape[0], 1)),
                            np.ones((class2.shape[0], 1))), axis=0),
            store_covariance=True)
    print(lda.predict(test))
def train_lda(filename, delim=','):
    """Train an LDA classifier on the dataset in `filename`.

    Prints the wall-clock fitting time and the test-set accuracy, then
    returns the predicted labels for the test split.

    Fix: the accuracy was computed with integer division
    (``np.sum(...)/len(...)``), which floors to 0 under Python 2; it is
    now computed as a float.
    """
    start = time.time()
    X_train, X_test, y_train, y_test = load_and_split_dataset(filename, delim)
    clf = LDA()
    clf.fit(X_train, y_train)
    end = time.time()
    print('Training Time: ' + str(end - start) + 's')
    y_pred = clf.predict(X_test)
    # True (float) division so the printed accuracy is a fraction, not 0.
    print(float(np.sum(y_pred == y_test)) / len(y_pred))
    return y_pred
def test():
    """Sanity-check myLDA against sklearn's LDA on a tiny 2-class problem.

    Fix: the label vector was built as zeros((3,1)) + ones((4,1)) although
    class1 contributes 4 samples and class2 only 3 — the fourth sample was
    therefore mislabelled.  The label counts now come from the row counts.
    """
    class1 = np.mat([
        (2.9500, 6.6300),
        (2.5300, 7.7900),
        (3.5700, 5.6500),
        (3.1600, 5.4700),
    ])
    class2 = np.mat([
        (2.5800, 4.4600),
        (2.1600, 6.2200),
        (3.2700, 3.5200),
    ])
    test = (2.81, 5.46)
    lda = myLDA(class1, class2)
    print(lda.predict(test))
    lda = LDA()
    # 4 zeros (class1) then 3 ones (class2), derived from the data shapes.
    # NOTE: store_covariance is a fit() kwarg only in the old sklearn.lda
    # API; newer releases take it in the constructor.
    lda.fit(np.concatenate((class1, class2)),
            np.concatenate((np.zeros((class1.shape[0], 1)),
                            np.ones((class2.shape[0], 1))), axis=0),
            store_covariance=True)
    print(lda.predict(test))
def lda_f(train, train_labels, test):
    """Fit an LDA on the training data and return predictions for `test`.

    Prints a small banner so runs of several classifiers stay readable.
    """
    print('')
    print('----------------')
    print('LDA:')
    # http://scikit-learn.org/0.16/modules/generated/sklearn.lda.LDA.html
    model = LDA()
    model.fit(train, train_labels)
    return model.predict(test)
def FisherLD(images):
    """Fit an LDA on a fixed coordinate grid against per-image norm values
    and return the prediction for the point [-0.8, -1].

    NOTE(review): `coordinates` is a 28x28 grid (28 samples of 28
    features) while `values` has one entry per image; clf.fit will fail
    unless len(images) == 28 -- confirm intent.
    """
    a = 0
    coordinates = [[0 for x in range(28)] for x in range(28)]
    #This is a list of coordinate values, each x y pair coorresponding to
    #a place in values (coordinates[0][0] -> values[0], coordinates[0][1] -> values[1])
    values = []
    #This is the value of each spot within the image, either a 1 or 0
    #Populate the list of coordinates
    # `size` is a module-level global -- presumably 28 to match the grid;
    # each cell stores its row index.
    for x in range(size):
        for y in range(size):
            coordinates[x][y] = x
    #Populate the list of values
    for image in images:
        values.append(image.norm)
    #Perform LDA
    clf = LDA()
    clf.fit(coordinates, values)
    print(clf.predict([[-0.8, -1]]))
    return clf.predict([[-0.8, -1]])
class LinearDiscriminantAnalysis(object):
    """Thin wrapper holding an sklearn LDA together with its training data."""

    def __init__(self, input_matrix, labels):
        self.clf = LDA()
        self.x = input_matrix
        self.y = labels

    def train(self):
        """Fit the underlying classifier on the stored matrix and labels."""
        self.clf.fit(self.x, self.y)

    def predict(self, x):
        """Return class predictions for the samples in `x`."""
        return self.clf.predict(x)

    def save_model(self, file):
        """Persist the (fitted) classifier to `file` via joblib."""
        joblib.dump(self.clf, file)
class ProteinFamilyClassifier(object):
    """Classify protein sequences into families with an LDA over Chaos
    Game Representation (CGR) signatures of length `word_length`."""

    def __init__(self, word_length):
        # CGR word length used when building sequence signatures.
        self.word_length = word_length
        # Fitted classifier; stays None until fit() is called.
        self.clf = None

    def fit(self, data):
        """Fit the LDA; `data` is an iterable of (sequence, family) pairs."""
        sequences, families = zip(*data)
        # Create signatures from the CGR representations, as our X
        signatures = [create_cgr_signature(seq, self.word_length)
                      for seq in sequences]
        self.clf = LDA()
        self.clf.fit(np.array(signatures), families)
        # TODO: maybe return some information about the new feature space ?

    def predict(self, data):
        """Predict families for (sequence, family) pairs and return an
        sklearn classification report against the true families.

        Raises RuntimeError when called before fit().
        """
        if not self.clf:
            raise RuntimeError("Cannot call predict before running fit.")
        sequences, true_families = zip(*data)
        # Create signatures from the CGR representations, as our X
        signatures = [create_cgr_signature(seq, self.word_length)
                      for seq in sequences]
        predicted_families = self.clf.predict(signatures)
        # precision, recall, fscore, support = precision_recall_fscore_support(
        #     true_families, predicted_families)
        #
        # confusion_matrix = \
        #     confusion_matrix(true_families, predicted_families)
        #
        # metrics = {
        #     'accuracy': accuracy_score(true_families, predicted_families),
        #     'precision': precision,
        #     'recall': recall,
        #     'fscore': fscore,
        #     'support': support,
        #     'confusion_matrix': confusion_matrix,
        #     # 'roc_auc': roc_auc_score(true_families, predicted_families),
        # }
        return classification_report(true_families, predicted_families)
def runTestPairs( e ):
    """Train an LDA to separate the label pair e = (x, y) using the
    module-level training map, then return (e, error_count, error_rate)
    on the corresponding test samples.

    labelsmaptra / labelsmaptes are module-level dicts: label -> samples.
    """
    x = e[0]; y = e[1]
    trainX = labelsmaptra[x] + labelsmaptra[y]
    labelsX = [x]*len(labelsmaptra[x]) + [y]*len(labelsmaptra[y])
    clf = LDA()
    clf.fit( trainX, labelsX )
    testX = labelsmaptes[x] + labelsmaptes[y]
    labelsX = [x]*len(labelsmaptes[x]) + [y]*len(labelsmaptes[y])
    error = 0
    # Count per-sample misclassifications on the test pairs.
    for lab, test in zip( labelsX, testX ):
        pred = clf.predict(test)
        if lab != pred:
            error += 1
    print e, error, error/float(len(testX))
    return ( e, error, error/float(len(testX)) )
def LDA(data, label, pred_data, pred_last):
    '''Fit a scikit-learn LDA classifier, print training and prediction
    scores, and return the predictions for pred_data.

    (Original note, translated: "not good; the inputs do not need
    normalisation.")
    '''
    data = np.array(data)
    pred_data = np.array(pred_data)
    label = np.array(label)
    pred_last = np.array(pred_last)
    # Local import because this function deliberately shadows the LDA
    # class name at module level.
    from sklearn.lda import LDA
    gnb = LDA()
    gnb.fit(data, label)
    print gnb.score(data, label)
    pred_result = gnb.predict(pred_data)
    print("Number of mislabeled points out of a total %d points : %d" % (pred_data.shape[0], (pred_last != pred_result).sum()))
    print gnb.score(pred_data, pred_last)
    return pred_result
def runTestPairs(e): x = e[0] y = e[1] trainX = labelsmaptra[x] + labelsmaptra[y] labelsX = [x] * len(labelsmaptra[x]) + [y] * len(labelsmaptra[y]) clf = LDA() clf.fit(trainX, labelsX) testX = labelsmaptes[x] + labelsmaptes[y] labelsX = [x] * len(labelsmaptes[x]) + [y] * len(labelsmaptes[y]) error = 0 for lab, test in zip(labelsX, testX): pred = clf.predict(test) if lab != pred: error += 1 print e, error, error / float(len(testX)) return (e, error, error / float(len(testX)))
def lda(input_file,Output,test_size):
    """Train/test-split LDA evaluation.

    Loads a comma-separated dataset (label in column 0), fits a 2-component
    LDA on the training split, prints and writes accuracy / precision /
    recall / F1 metrics, and saves a confusion-matrix plot plus a 2-D LDA
    projection scatter plot under the `Output` prefix.
    """
    lvltrace.lvltrace("LVLEntree dans lda split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    # Keep two discriminant components for the 2-D scatter plot below.
    lda=LDA(n_components=2)
    lda.fit(X_train,y_train)
    X_LDA = lda.transform(X_train)
    print "shape of result:", X_LDA.shape
    y_pred = lda.predict(X_test)
    print "Linear Discriminant Analysis Accuracy "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    #LVLprint "\n"
    # Write the same metrics plus per-sample (true, predicted) pairs.
    results = Output+"LDA_metrics_test.txt"
    file = open(results, "w")
    file.write("Linear Discriminant Analaysis estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "LDA %f"%test_size
    save = Output + "LDA_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], c=y_train)
    fig.colorbar(im);
    save_lda = Output + "LDA_plot_test"+"_%s.png"%test_size
    plt.savefig(save_lda)
    plt.close()
    lvltrace.lvltrace("LVLSortie dans lda split_test")
def lda(input_file,Output):
    """Whole-dataset (resubstitution) LDA evaluation.

    Fits an LDA on the full dataset, prints and writes resubstitution
    accuracy / precision / recall / F1 metrics, and saves a
    confusion-matrix plot and a 2-D projection scatter plot under the
    `Output` prefix.
    """
    lvltrace.lvltrace("LVLEntree dans lda")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    #lda=LDA(n_components=2)
    lda=LDA()
    lda.fit(X,y)
    X_LDA = lda.transform(X)
    # Resubstitution predictions: same data used for fit and predict.
    y_pred = lda.predict(X)
    print "#########################################################################################################\n"
    print "Linear Discriminant Analysis Accuracy "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    # Persist the same metrics plus per-sample (true, predicted) pairs.
    results = Output+"LDA_metrics.txt"
    file = open(results, "w")
    file.write("Linear Discriminant Analaysis estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "LDA"
    save = Output + "LDA_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    # plot the results along with the labels
    fig, ax = plt.subplots()
    im = ax.scatter(X_LDA[:, 0], X_LDA[:, 1], c=y)
    fig.colorbar(im);
    save_lda = Output + "LDA_plot.png"
    plt.savefig(save_lda)
    plt.close()
    lvltrace.lvltrace("LVLSortie dans lda")
class LDAClassifier(Classifier):
    '''Linear Discriminant analysis classifier'''

    def __init__(self):
        super(LDAClassifier, self).__init__()
        # Figure number used by the base class's plotting machinery --
        # presumably; TODO confirm.
        self.fig = 20
        self.is_trainable = True
        self.is_trained = False

    def train(self, classification_data, indices=None, settings_name=None, **kwargs):
        """Fit the LDA on the selected feature columns against the
        are_hurr_actual targets; returns self for chaining."""
        super(LDAClassifier, self).train(classification_data, indices, settings_name, **kwargs)
        # The base class resolves and stores the column indices to use.
        indices = self.settings['indices']
        self.lda = LDA(**self.classifier_kwargs)
        self.lda.fit(classification_data.data[:, indices], classification_data.are_hurr_actual)
        return self

    def classify(self, classification_data):
        """Predict per-row labels using the fitted LDA; stores and
        returns the predictions."""
        super(LDAClassifier, self).classify(classification_data)
        indices = self.settings['indices']
        self.are_hurr_pred = self.lda.predict(classification_data.data[:, indices])
        return self.are_hurr_pred
def parseOtu():
    """Parse the CFS OTU count table, aggregate counts to family level,
    build a patient/control design matrix, fit LDA and SVM classifiers,
    and apply the family merge to a sample OTU file.

    Script-like: reads fixed relative paths and shows plots inline.
    """
    fn = r'..\cfs_data\otu_table_mc2_w_tax_even32233.txt'
    fid = open(fn)
    line = fid.readline()                  # skip the first header line
    subjects = (fid.readline()).split('\t')
    subjects = subjects[1:-1]              # drop row-label and taxonomy columns
    numSubjects = len(subjects)
    mat = []
    otus = []
    # Read count rows until the first row with an empty label.
    while True:
        line = (fid.readline()).split('\t')
        otus.append(line[-1])
        # NOTE(review): `is ''` relies on CPython string interning;
        # `== ''` would be the safe comparison.
        if line[0] is '':
            break
        mat.append(np.array([float(i) for i in line[1:-1]]))
    mat = np.array(mat)
    # Compare at the family level
    # Gather all families
    families = []
    for k in range(len(otus)):
        tmp = otus[k].split(';')
        if len(tmp) >= 5:
            if len(tmp[4]) > 4:
                # Taxonomy field 4 holds the family, prefixed by 4 chars
                # (e.g. "f__"); strip the prefix and bracket characters.
                family = str.strip(tmp[4][4:])
                family = family.replace('[', '')
                family = family.replace(']', '')
                families.append(family)
    families = set(families)
    numFamilies = len(families)
    rr = np.zeros((numFamilies, numSubjects))
    # Gather up rows for a specific family
    idx = 0
    for k in otus:
        famId = 0
        for family in families:
            if k.find(family) > -1:
                rr[famId, :] += mat[idx, :]
                break
            famId += 1
        idx += 1
    # Normalize each column so families sum to 1 per subject.
    rr = rr / np.sum(rr, axis=0)[np.newaxis]
    # read in control vs patients
    fid = open(r'..\cfs_data\mapping_metadata_CFS.txt')
    reader = csv.DictReader(fid, delimiter='\t')
    controls = []
    patients = []
    idx = 0
    for row in reader:
        if row['Subject'] == 'Control':
            controls.append(row['#SampleID'])
        if row['Subject'] == 'Patient':
            patients.append(row['#SampleID'])
    controlsIdx = []
    patientsIdx = []
    # Map sample ids back to column positions in the count matrix.
    for k in range(len(subjects)):
        for kk in controls:
            if subjects[k] == kk:
                controlsIdx.append(k)
        for kk in patients:
            if subjects[k] == kk:
                patientsIdx.append(k)
    patientsIdx = np.array(patientsIdx)
    controlsIdx = np.array(controlsIdx)
    controlMat = rr[:, controlsIdx]
    patientMat = rr[:, patientsIdx]
    inputMat = np.hstack((patientMat, controlMat))
    # Targets: +1 for patients, -1 for controls.
    outputVec = np.hstack(
        (np.ones(patientMat.shape[1]), -np.ones(controlMat.shape[1])))
    # Affinity Matrix (inverse Euclidean distance between subjects).
    # NOTE(review): the diagonal divides by zero (distance to self).
    numSubjects = inputMat.shape[1]
    aff_mat = np.zeros((numSubjects, numSubjects))
    for k in range(numSubjects):
        for kk in range(numSubjects):
            aff_mat[k, kk] = 1 / np.sqrt(
                np.sum((inputMat[:, k] - inputMat[:, kk])**2))
    plt.figure()
    plt.imshow(aff_mat)
    plt.show()
    from sklearn.lda import LDA
    clf = LDA()
    # Fit on a subset of subjects, then predict everyone and count errors.
    clf.fit(inputMat[:, 1:80].T, outputVec[1:80])
    fit = clf.predict(inputMat.T)
    err = np.sum(np.abs(fit - outputVec) > 0)
    print(err)
    # SVM
    clf = sklearn.svm.SVC(kernel='linear', C=1e-1)
    n_samples = inputMat.shape[1]
    cv = sklearn.cross_validation.KFold(n_samples, n_folds=8, shuffle=True)
    scores = sklearn.cross_validation.cross_val_score(clf, inputMat.T, outputVec, cv=cv)
    # Predict my data
    # Read in my data file
    myOtu = otu.OTU(r'..\sample_data\01112016.json')
    # Get family distribution
    gen = myOtu.getTaxonomy('genus')
    myOtu.mergeTaxonomy('family', families)
    myOtu.getDistribution('family')
    # Do LDA fit
    # run predictor
    return
# Fit a logistic regression on the first 200 (training) rows of the
# transformed dataset; `model` (original data) is fitted earlier in the file.
model_transf = LogisticRegression()
model_transf = model_transf.fit(X_transf[:200,:],Y[:200])
# Classify the test rows of the original dataset.
predicted = model.predict(X[200:])
# Classify the test rows of the transformed dataset.
predicted_transf = model_transf.predict(X_transf[200:])
# Accuracy of both classifications ----- answer to question 2.
print "Acuracia da regressao logistica no conjunto de dados original: "+str(metrics.accuracy_score(Y[200:], predicted))
print "Acuracia da regressao logistica no conjunto de dados transformado: "+str(metrics.accuracy_score(Y[200:], predicted_transf))
# Fit LDA on the training rows of the original dataset.
model_LDA = LDA()
model_LDA = model_LDA.fit(X[:200],Y[:200])
# Fit LDA on the training rows of the transformed dataset.
model_LDA_transf = LDA()
model_LDA_transf = model_LDA_transf.fit(X_transf[:200],Y[:200])
# Classify the test rows of the original dataset.
predicted_LDA = model_LDA.predict(X[200:])
# Classify the test rows of the transformed dataset.
predicted_LDA_transf = model_LDA_transf.predict(X_transf[200:])
# Accuracy of both classifications ----- answer to question 3.
print "Acuracia do LDA no conjunto de dados original: "+str(metrics.accuracy_score(Y[200:], predicted_LDA))
print "Acuracia do LDA no conjunto de dados transformado: "+str(metrics.accuracy_score(Y[200:], predicted_LDA_transf))
Xpart = Xproj[np.where(y_species == species_id)[0], :] plt.scatter(Xpart[:, 0], Xpart[:, 1], color=colors[i]) i = i + 1 plt.title("Citrus Species (first 2 Principal Components)") plt.xlabel("X0") plt.ylabel("X1") plt.show() # Perform multiclass LDA Xtrain, Xtest, ytrain, ytest = train_test_split(Xscaled, y_species, test_size=0.25, random_state=42) clf = LDA(len(species_ids)) clf.fit(Xtrain, ytrain) ypred = clf.predict(Xtest) print "LDA Accuracy Score: %.3f" % (accuracy_score(ypred, ytest)) # What varieties are most spectrally similar? corr = np.corrcoef(clf.means_) plt.imshow(corr, interpolation="nearest", cmap=plt.cm.cool) plt.xticks(np.arange(len(species_ids)), species_ids, rotation=45) plt.yticks(np.arange(len(species_ids)), species_ids) plt.colorbar() plt.show() # Find LDA classifier accuracy using cross validation kfold = KFold(Xscaled.shape[0], 10) scores = [] for train, test in kfold: Xtrain, Xtest, ytrain, ytest = Xscaled[train], Xscaled[test], \
def analyze_by_t2t(R, trl_ix, t2t):
    """Group trials into t2t quartiles, plot per-assembly mean responses,
    then train per-assembly LDA classifiers on half of each group's
    trials and evaluate on the other half.

    Returns (train, test, chance): dicts keyed by assembly index holding
    the training accuracy, the held-out accuracy, and a 100-permutation
    shuffled-label chance distribution.

    Fix: the half-split used true division for slice bounds
    (``ix[:n/2]``), which raises TypeError on Python 3 (non-integer
    slice index); floor division gives the same split on both versions.
    """
    # Quartile boundaries of t2t, with 0 prepended as the lower edge.
    perc = [0]+[np.percentile(t2t, i) for i in [25, 50, 75, 100]]
    n_assemblies = R.shape[1]
    f, ax = plt.subplots(nrows=n_assemblies)
    labels = {}
    for i, p in enumerate(perc[:-1]):
        p2 = perc[i+1]
        # Trials whose t2t lies in the (p, p2] quartile bin.
        ix = np.nonzero(np.logical_and(t2t>p, t2t<=p2))[0]
        labels[i] = ix
        for a in range(n_assemblies):
            tmp = []
            for ii, x in enumerate(ix):
                xx = np.nonzero(trl_ix == x)[0]
                tmp.append(R[xx, a])
            # Per-(bin, assembly) trial matrix, reused for classification.
            labels[i, a, 'trl'] = np.vstack((tmp))
            m = np.mean(np.vstack((tmp)), axis=0)
            sem = np.std(np.vstack((tmp)), axis=0)/len(ix)
            ax[a].plot(m, color=cmap_list[i])
            ax[a].fill_between(np.arange(len(m)), m-sem, m+sem,
                               color=cmap_list[i], alpha=.5)
            # Mark the bin's upper percentile position on the trace.
            ax[a].plot(int(np.floor(p2*10)), m[int(np.floor(p2*10))-1],
                       '.', markersize=20, color=cmap_list[i])
    # Classify new trials:
    test = {}
    train = {}
    from sklearn.lda import LDA
    chance = {}
    for a in range(n_assemblies):
        lda = LDA()
        lda.n_components = len(perc) - 1
        X = []
        Y = []
        X_test = []
        Y_test = []
        ix_train = {}
        ix_test = {}
        for k in range(len(perc)-1):
            n = len(labels[k])
            ix = np.random.permutation(n)
            # Floor division: integer slice bounds on Python 2 and 3.
            ix_train[k] = ix[:n // 2]
            ix_test[k] = ix[n // 2:]
            X.append(labels[k, a, 'trl'][ix_train[k],:])
            Y.append([k]*len(ix_train[k]))
            X_test.append(labels[k, a, 'trl'][ix_test[k], :])
            Y_test.append([k]*len(ix_test[k]))
        Y_train = np.hstack((Y))
        lda.fit(np.vstack((X)), Y_train)
        y_true = lda.predict(np.vstack((X)))
        train[a] = np.sum(y_true==np.hstack((Y)))/float(len(y_true))
        y_pred = lda.predict(np.vstack(X_test))
        test[a] = np.sum(y_pred == np.hstack((Y_test)))/float(len(y_pred))
        # Chance level: refit 100 times with shuffled labels.
        chance[a] = []
        for i in range(100):
            ix = np.random.permutation(len(Y_train))
            lda.fit(np.vstack((X)), Y_train[ix])
            y_pred = lda.predict(np.vstack(X))
            chance[a].append(np.sum(y_pred == Y_train[ix])/float(len(y_pred)))
    plt.show()
    return train, test, chance
def classify(sx, sy, tx, ty):
    """Train an LDA on (sx, sy) and return its accuracy on (tx, ty)."""
    model = LDA()
    model.fit(sx, sy)
    predictions = model.predict(tx)
    return accuracy_score(ty, predictions)
# Class counts: y == 0 rows are the "st" class, everything else "rr"
# (presumably standard stars vs RR-Lyrae -- TODO confirm upstream).
N_st = np.sum(y == 0)
N_rr = N_tot - N_st
N_train = len(y_train)
N_test = len(y_test)
N_plot = 5000 + N_rr

#----------------------------------------------------------------------
# perform LDA
# Fit one classifier per number of leading color features used,
# collecting test-set predictions for each.
classifiers = []
predictions = []
Ncolors = np.arange(1, X.shape[1] + 1)

for nc in Ncolors:
    clf = LDA()
    clf.fit(X_train[:, :nc], y_train)
    y_pred = clf.predict(X_test[:, :nc])
    classifiers.append(clf)
    predictions.append(y_pred)

completeness, contamination = completeness_contamination(predictions, y_test)

print("completeness", completeness)
print("contamination", contamination)

#------------------------------------------------------------
# Compute the decision boundary
# Use the two-feature classifier (index 1) for the 2-D boundary plot.
clf = classifiers[1]
xlim = (0.7, 1.35)
ylim = (-0.15, 0.4)
for point in DataSet: for f in range(P): if point[f] == "NaN": point[f] = SampleMean[f] return DataSet TestSetNum = 7 #ratio of DataSet : TestSet impute_mean(DataSet) for i in range(N): if i%TestSetNum == 0: TestSet.append(DataSet[i]) Y_test.append(Y_data[i]) else : TrainSet.append(DataSet[i]) Y_train.append(Y_data[i]) print len(TrainSet),len(TestSet) # data has been split into TrainSet and TestSet import numpy as np from sklearn.lda import LDA clf = LDA() clf.fit(np.array(TrainSet),np.array(Y_train)) output = clf.predict(TestSet) collect_stat(Y_test, output)
def _lda_report(XFull, YFull, XFullTest, YFullTest, activities,
                powers=None, show_sizes=False):
    """Fit and evaluate one LDA classifier on an activity subset.

    XFull/YFull:          full training features and labels.
    XFullTest/YFullTest:  full testing features and labels.
    activities:           activity ids to keep (passed to getDataSubset and
                          to checkAccuracy / createConfusionMatrix).
    powers:               optional exponent list for common.getPowerK feature
                          expansion; None skips the expansion.
    show_sizes:           when True, print the test-subset sizes (preserves
                          the one diagnostic print the original code had).

    Prints the confusion matrix and the F-score, exactly as each of the nine
    duplicated blocks in the original implementation did.
    """
    # Select only the rows whose label is in `activities`.
    X_train, Y_train = common.getDataSubset(XFull, YFull.flatten(), activities)
    X_test, Y_test = common.getDataSubset(XFullTest, YFullTest.flatten(), activities)
    if show_sizes:
        print(len(X_test), len(Y_test))
    if powers is not None:
        # Polynomial feature expansion (x, x^2[, x^3]) on both splits.
        X_train = common.getPowerK(X_train, powers)
        X_test = common.getPowerK(X_test, powers)
    clf = LDA()
    clf.fit(X_train, Y_train.flatten())
    precision, recall, fscore = common.checkAccuracy(
        clf.predict(X_test), Y_test, activities)
    print(
        common.createConfusionMatrix(
            clf.predict(X_test).flatten(), Y_test.flatten(), activities))
    print(fscore)


def LDA_onData():
    """Evaluate LDA on the UCI HAR dataset over several activity subsets and
    polynomial feature expansions, printing a confusion matrix and F-score
    for each configuration.

    Refactored: the original body repeated the same fetch/expand/fit/report
    sequence nine times; the sequence now lives in _lda_report and the nine
    configurations are driven by a table, in the original order.
    """
    #Parsing Full training dataset
    XFull = common.parseFile('../UCI HAR Dataset/train/X_train.txt')
    YFull = common.parseFile('../UCI HAR Dataset/train/y_train.txt')
    #Parsing Full testing dataset
    XFullTest = common.parseFile('../UCI HAR Dataset/test/X_test.txt')
    YFullTest = common.parseFile('../UCI HAR Dataset/test/y_test.txt')
    # (activities, powerK exponents or None, print test-subset sizes?)
    runs = [
        ([1, 2, 3, 4, 5, 6], None,      False),
        ([1, 2, 3, 4, 5, 6], [1, 2],    False),
        ([1, 2, 3, 4, 5, 6], [1, 2, 3], False),
        ([1, 2, 3],          None,      True),   # original printed sizes here
        ([1, 2, 3],          [1, 2],    False),
        ([1, 2, 3],          [1, 2, 3], False),
        ([4, 5, 6],          None,      False),
        ([4, 5, 6],          [1, 2],    False),
        ([4, 5, 6],          [1, 2, 3], False),
    ]
    for activities, powers, show_sizes in runs:
        _lda_report(XFull, YFull, XFullTest, YFullTest, activities,
                    powers=powers, show_sizes=show_sizes)
import pandas as pd
import numpy as np
from sklearn.lda import LDA

## read files
train_df = pd.read_csv('data/spam_train.csv')
test_df = pd.read_csv('data/spam_test.csv')

## separate the predictors (first 57 columns) and the response (last column)
x = np.array(train_df.iloc[:, 0:57])
y = np.ravel(train_df.iloc[:, -1])

## separate the predictors and response in the test data set
x2 = np.array(test_df.iloc[:, 0:57])
y2 = np.ravel(test_df.iloc[:, -1])

## fit the model using lda
lda_cls = LDA()
lda_cls.fit(x, y)
print("(1): lda accuracy")
print(lda_cls.score(x, y))

## predict output on test data set with lda
predict = lda_cls.predict(x2)
print("(2): lda test accuracy")
print(lda_cls.score(x2, y2))
# NOTE(review): fragment — the opening line completes a call (apparently a
# KNN parameter lookup) whose start lies outside this chunk; knnFit,
# X_train_01/X_test_01, Y_train/Y_test, test_scores and nns are external.
                                     'n_neighbors', 5)
pd.DataFrame(np.array([test_scores, nns]).T,
             columns=['Test Accuracy', 'Number of Nearest Neighbours'])
##Very low accuray in the results. best is 1 neighbour
###Training Accuracy
accuracy_score(Y_test, knnFit.predict(X_test_01)) #worse then guessing, 18.16%
#LDA
#Since we dont have many parameters to vary for LDA, we run it as is to see the results:
folds = KFold(n=X_train_01.shape[0], n_folds=10)
ldaAccuracyScores = []
for train_fold, test_fold in folds:
    # fit on the training fold, score on the held-out fold
    ldaFit = LDA().fit(X_train_01[train_fold], Y_train[train_fold])
    accuracy = accuracy_score(Y_train[test_fold], ldaFit.predict(X_train_01[test_fold]))
    ldaAccuracyScores.append(accuracy)
ldaAccuracyScores = np.array(ldaAccuracyScores)
print('the mean accuracy through LDA on training data is %0.2f' % ldaAccuracyScores.mean())
# refit on the full training split and score once on the test split
ldaFit = LDA().fit(X_train_01, Y_train)
accuracy_score(
    Y_test,
    ldaFit.predict(X_test_01)) #highest accuracy of 63.67%; best accuracy
#The HofF variable
###Since the HofF variable is very unbalanced, we stick to ensemble based approaches AdaBoost, Random Forest
###Our main metric for performance is the senstivity NOT accuracy
#Stratified Test-train split
from sklearn.cross_validation import StratifiedShuffleSplit
for i in range(len(labels)): clf = LDA() #trainMat = repubAndDemMatrix #trainLabels = labels trainMat = np.concatenate((repubAndDemMatrix[0:i],repubAndDemMatrix[i+1:sz]), axis = 0) trainLabels = np.concatenate((labels[0:i], labels[i+1:sz]), axis = 0) #trainMat = repubAndDemMatrix[0:163] #trainLabels = labels[0:163] #print type(trainMat) #print type(trainLabels) #trainLabels = labels[0:i] + labels[i+1:sz] clf.fit(trainMat, trainLabels) #clf.fit(repubAndDemMatrix, labels) #clf = getLDAMat(trainMat, trainLabels, 5); if clf.predict([repubAndDemMatrix[i].tolist()]) == labels[i]: totalCorrect = totalCorrect + 1 # print clf.coef_ print(i) predicted = clf.predict([repubAndDemMatrix[i].tolist()]) print 'predicted =', predicted, '; actual =', labels[i] if labels[i] == 0: trueDem += 1 else: trueRep += 1 if predicted == 0: predDem += 1 else: predRep +=1
# -*- coding: utf-8 -*-
"""
Created on Wed May 18 16:57:23 2016

@author: siham.belgadi
"""
import numpy as np
from sklearn.lda import LDA

# Toy two-class data set: three points per class on opposite sides of the origin.
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])

clf = LDA()
clf.fit(X, y)
# FIX: removed the bare `LDA(n_components=None, priors=None, shrinkage=None,
# solver='svd', store_covariance=False, tol=0.0001)` expression that used to
# follow here — it constructed a second estimator and immediately discarded
# it (dead code with no effect on the fitted `clf`).
print(clf.predict([[-0.8, -1]]))
# NOTE(review): the two lines below are the tail of a covariance-estimating
# fit function whose `def` lies outside this chunk; it accumulates into `Cov`
# but returns `Covar` — confirm the naming against the full source.
        Cov[:, :, b] += np.cov(np.transpose(new)) + np.eye(inputs.shape[1]) * 1e-9
    return Mean, Covar, Prob

def QDA_predict(inputs, Prob, covariance, mean):
    """Quadratic-discriminant prediction: for each class b score every input
    row as log(prior_b) - 1/2*log|Sigma_b| - 1/2*(x-mu_b)' Sigma_b^-1 (x-mu_b)
    and return the arg-max class index per row."""
    B = np.zeros([len(Prob), len(inputs)])
    for b in range(len(Prob)):
        Mat = np.linalg.inv(covariance[:, :, b])
        # 1e-10 guards the log against a zero determinant
        A = -1 / 2 * np.log(np.linalg.det(covariance[:, :, b]) + 1e-10) + np.log(Prob[b])
        B[b, :] = np.array([
            A - (1 / 2 * np.dot(np.dot(a - mean[b], Mat), a - mean[b]))
            for a in inputs
        ])
    return np.argmax(B, axis=0)

semg = scipy.io.loadmat('./data/subject-0/motion-fist/trial-0.csv')
a, b, c, d = split_set(training_data, 10000, training_label)
Mean, Covar, P = LDA_fit(a, c)
prediction = LDA_predict(a, Covar, Mean, P)
print("accuracy:", 1 - (np.sum(c != prediction) / len(c)))
#sklearn implementation
classify = LDA()
classify.fit(a, c)
# NOTE(review): the result of classify.predict(a) is discarded, so the
# "accuracy" printed below still reflects the earlier LDA_predict result;
# likely intended: prediction = classify.predict(a).
classify.predict(a)
print("accuracy:", 1 - (np.sum(c != prediction) / len(c)))
# NOTE(review): the first two lines are the tail of a per-class percentage
# helper (pre_rec?) whose `def` lies outside this chunk.
        perc.append(float("{0:.2f}".format(cm[i][i] / count[i] * 100)))
    return perc

def overall_accuracy(cm, y_test):
    # Overall accuracy in percent: trace of the 6x6 confusion matrix over the
    # number of test samples, rounded to two decimals.
    # NOTE(review): `sum` shadows the builtin inside this function.
    sum = 0
    for i in range(6):
        sum += cm[i][i]
    return float("{0:.2f}".format(sum * 100.0 / y_test.size))

#######LDA#####################################################################
# Fit LDA, then report confusion matrix, per-class precision/recall and
# overall accuracy (Python 2 print statements).
lda = LDA()
lda.fit(X_train, y_train)
y_predict_lda = lda.predict(X_test)
y_pred_count_lda = total_count(y_predict_lda)
cmatrix_lda = confusion_matrix(y_test, y_predict_lda)
print "\nLDA:"
print cmatrix_lda
print ""
recall_lda = pre_rec(cmatrix_lda, y_test_count)
precision_lda = pre_rec(cmatrix_lda, y_pred_count_lda)
accuracy_lda = overall_accuracy(cmatrix_lda, y_test)
print "Precision for LDA: "
print precision_lda
print "Recall for LDA: "
#ws.var_.xvschema = scot.xvschema.singletrial #ws.optimize_var() ws.var_.delta = 1 # Single-Trial Fitting and feature extraction features = np.zeros((len(triggers), 32)) for t in range(len(triggers)): print('Fold %d/%d, Trial: %d ' %(fold, nfolds, t), end='\r') ws.set_data(data[:, :, t]) ws.fit_var() con = ws.get_connectivity('ffPDC') alpha = np.mean(con[:, :, np.logical_and(7 < freq, freq < 13)], axis=2) beta = np.mean(con[:, :, np.logical_and(15 < freq, freq < 25)], axis=2) features[t, :] = np.array([alpha, beta]).flatten() lda.fit(features[train, :], classids[train]) acc_train = lda.score(features[train, :], classids[train]) acc_test = lda.score(features[test, :], classids[test]) print('Fold %d/%d, Acc Train: %.4f, Acc Test: %.4f' %(fold, nfolds, acc_train, acc_test)) pred = lda.predict(features[test, :]) cm += confusion_matrix(classids[test], pred) print('Confusion Matrix:\n', cm) print('Total Accuracy: %.4f'%(np.sum(np.diag(cm))/np.sum(cm)))
class minDistance():
    """K-fold cross-validated LDA classifier over a parsed data file.

    NOTE(review): Python 2 code; depends on external `parse`, `distance`,
    `cross_validation` (old sklearn), `decimal` and `numpy`.
    """
    def __init__(self, dataName, p, k):
        # Last column of the parsed file is the integer class label; the
        # rest are float features.
        data = parse(dataName)
        npData = numpy.array(data, dtype=numpy.dtype(decimal.Decimal))
        self.X = npData[:,:-1].astype(numpy.float)
        self.Y = npData[:,-1].astype(numpy.integer)
        self._ret = []   # per-fold accuracy fractions
        self._p = p      # distance order (used only via `distance` below)
        self._k = k      # NOTE(review): stored but never read in this chunk

    @property
    def ret(self):
        # Accuracies accumulated by testFunc, one entry per tested fold.
        return self._ret

    def process(self, fold=2):
        # NOTE(review): `fold` is accepted but not forwarded — crossValidation
        # always runs with its own default of 2 folds.
        self.crossValidation(self.trainFunc, self.testFunc)

    def crossValidation(self, cbTrain, cbTest, fold=2):
        """Shuffle-split the data into `fold` folds and invoke the train/test
        callbacks on each split."""
        X = self.X
        Y = self.Y
        kFold = cross_validation.KFold(n=Y.size, n_folds=fold, shuffle=True, random_state=numpy.random.randint(1,16384))
        for train_index, test_index in kFold:
            #print("TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = X[train_index,:], X[test_index,:]
            Y_train, Y_test = Y[train_index], Y[test_index]
            cbTrain(X_train, Y_train)
            cbTest(X_test, Y_test)

    def trainFunc(self,X,Y):
        # Fresh 2-component LDA per fold, kept on self for testFunc.
        self.lda = LDA(n_components=2)
        self.lda.fit(X, Y)

    def testFunc(self,X,Y):
        # NOTE(review): `d` and `classNum` are computed but never used here.
        d = distance(self._p)
        classNum = max(Y)+1
        ok = 0
        for line in range(0,Y.size):
            shouldBe = Y[line]
            given = X[line,:]
            chosen = {}
            # NOTE(review): predict() on a 1-D row returns an array, so
            # `ok += calced == shouldBe` makes `ok` a numpy array after the
            # first iteration; also 1-D inputs are rejected by modern sklearn.
            calced = self.lda.predict( given )
            ok += calced == shouldBe
        #print 'Test: %s %%' % (100.0*ok/Y.size)
        self._ret.append(1.0*ok/Y.size)

    def plot(self):
        # Project all data onto the first two LDA components and scatter-plot
        # up to three classes (r/g/b).
        import pylab as pl
        self.lda = LDA(n_components=2)
        X = self.lda.fit(self.X, self.Y).transform(self.X)
        Y = self.Y
        for k in range(0,max(Y)+1):
            color = 'r' if k ==0 else 'g' if k == 1 else 'b'
            pl.plot( X[Y==k,0], X[Y==k,1], 'o'+color )
        pl.show()
        pass
Fteste = np.nan_to_num((Fteste-Mteste) / Dteste) # LDA Xtreino = Ftreino Xteste = Fteste y = np.array([i for i in py.flatten([[i]*10 for i in range(12)])]) target_names = np.array(conf.artistas) # aplicamos LDA no conjunto de treino e teste (após fitar... treinar com o # conjunto de treino) lda = LDA(n_components=2) # lda.fit(Xtreino, y, store_covariance=True) Xtreino_r2 = lda.fit(Xtreino, y, store_covariance=True).transform(Xtreino) y_pred = lda.predict(Xteste) print y_pred cm = confusion_matrix(y, y_pred) cms.append(cm) print 'cm', cm cm_media = sum([np.array(cm, dtype=float) for cm in cms]) / N print cm_media fig = plt.figure() ax = plt.subplot(111) cax = ax.matshow(cm_media, interpolation='nearest', cmap=py.cm.jet) #py.title('Confusion matrix') plt.colorbar(cax) plt.ylabel('True paintings', fontsize=11) plt.xlabel('Predicted paintings', fontsize=11) dialabels = [r'Caravaggio',
def classify(images, classes_list, train_set, test_set, pos_fold, descriptor, parameters):
    """
    Performs the classification of the test_set according to the train_set.

    images:       dict keyed by image id; each entry exposes its class at
                  [POS_CLASSES][INDEX_ZERO] and feature vector at
                  [POS_FV][INDEX_ZERO] (module-level index constants).
    classes_list: all class names for this experiment (fed to LabelEncoder).
    train_set:    image ids used to fit the LDA.
    test_set:     image ids to predict.
    pos_fold:     fold number, used only to name the serialized model file.
    descriptor:   unused here — presumably kept for a common classifier
                  interface (TODO confirm).
    parameters:   dict; "Components" requests the LDA n_components.

    Returns (list_img, list_test_class, list_result, classes, model_paths),
    where list_result holds one one-hot vector per test image.
    """
    print "Classification: LDA"
    #Paths
    #dirname = os.path.abspath(os.path.join(os.path.dirname(__file__)))
    # NOTE(review): `dirname` and `iteration` are not defined in this chunk —
    # presumably module-level globals; confirm against the full file.
    temp_path = os.path.abspath(os.path.join(dirname, "..", "..", "temp"))
    model_path = os.path.join(temp_path, "iteration:" + str(iteration) + \
            "-LDA_" + str(pos_fold) + ".model")
    #Preprocess each class to a unique value to the classification
    label_encoder = preprocessing.LabelEncoder()
    label_encoder.fit(classes_list)
    print "List of classes of this experiment:", label_encoder.classes_
    #Read the train list and save the list of class and the list
    #of feature vectors
    list_class = []
    list_fv = []
    for img in train_set:
        list_class.append(images[img][POS_CLASSES][INDEX_ZERO])
        list_fv.append(numpy.array(images[img][POS_FV][INDEX_ZERO]))
    list_train = numpy.array(list_fv)
    list_train_class = numpy.array(list_class)
    #Given a list of classes, transform each value in this list to a integer
    list_train_class = label_encoder.transform(list_train_class)
    #Read the test list and save the list of class and the list
    #of feature vectors
    list_img = test_set
    list_class = []
    list_fv = []
    for img in test_set:
        list_class.append(images[img][POS_CLASSES][INDEX_ZERO])
        list_fv.append(numpy.array(images[img][POS_FV][INDEX_ZERO]))
    list_test = numpy.array(list_fv)
    list_test_class = numpy.array(list_class)
    #Classification
    #--------------------------------------------------------------------------
    # LDA supports at most n_classes - 1 components; clamp the request.
    n_comp = parameters["Components"]
    if n_comp > len(label_encoder.classes_) - 1:
        n_comp = len(label_encoder.classes_) - 1
    clf = LDA(n_components=n_comp)
    #Fit
    print "\tFit: Beginning"
    clf.fit(list_train, list_train_class)
    print "\tFit: Done!"
    #Save configuration of the LDA
    model_paths = joblib.dump(clf, model_path)
    #Predict
    print "\tPredict: Beginning"
    list_predict = clf.predict(list_test)
    print "\tPredict: Done"
    #Mapping the results into integers
    list_predict = map(int, list_predict)
    #Returning the result to strings
    list_predict = label_encoder.inverse_transform(list_predict)
    # One-hot encode each predicted class over the full class list.
    list_result = []
    for predict in list_predict:
        img_result = [0] * len(label_encoder.classes_)
        #Find all predict in the list label_encoder.classes_ and grab the
        #first index
        pos = label_encoder.classes_.tolist().index(predict)
        img_result[pos] = 1
        list_result.append(img_result)
    #--------------------------------------------------------------------------
    return list_img, list_test_class, list_result, label_encoder.classes_, \
            model_paths
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
import sklearn.linear_model as LM
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# NOTE(review): LDA and plt are used below but never imported in this chunk —
# presumably `from sklearn.lda import LDA` and matplotlib.pyplot elsewhere.
fname = "./3_que_data/train.csv"
train_X = np.genfromtxt(fname, delimiter=",")
train_Y = np.genfromtxt("./3_que_data/train_labels.csv", delimiter=",")
test_X = np.genfromtxt("./3_que_data/test.csv", delimiter=",")
test_Y = np.genfromtxt("./3_que_data/test_labels.csv", delimiter=",")
clf = LDA()
clf.fit(train_X, train_Y)
# project the training data onto the (single) LDA discriminant axis
train_X_transformed = clf.transform(train_X)
train_X_transformed = train_X_transformed.flatten()
print train_X_transformed.shape
print clf.coef_
# assumes the first 1000 rows are class 1 and the next 1000 class 2 — TODO confirm
plt.plot(train_X_transformed[:1000], [10] * 1000, "ro", label="Class 1")
plt.plot(train_X_transformed[1000:], [10] * 1000, "bo", label="Class 2")
# vertical line at 0 marking the projected decision boundary
plt.plot([0] * 21, range(21), "g", label="Decision Boundary")
plt.axis([-6, 6, 0, 20])
plt.xlabel("X-axis")
plt.ylabel("Y-axis")
plt.legend()
plt.show()
print precision_recall_fscore_support(test_Y, clf.predict(test_X), labels=[1, 2])
from sklearn.metrics import confusion_matrix #Somente o nome do arquivo if __name__=='__main__': for file in glob.glob(sys.argv[1]+'*.mat'): data = scipy.io.loadmat(file) X_train = data['Xtrain'] y_train = data['Ytrain'].T print("Treinando LDA...") lda = LDA() ytrain = data['Ytrain'].T.reshape(data['Ytrain'].shape[1]) lda.fit(data['Xtrain'].toarray(), ytrain) predict = lda.predict(data['Xval'].toarray()) yVal = data['Yval'].T.reshape(data['Yval'].shape[1]) print "Acuracia: ", sklearn.metrics.accuracy_score(yVal, predict) X_train = data["Xtrain"] X_val = data["Xval"] X_test, y_test = data["Xtest"], data["Ytest"] cm = confusion_matrix(yVal, predict) total = numpy.sum(cm, axis=1) if(cm.shape[0] < 2): acc = 1.0 else: acc = [] for i in range(total.shape[0]):
import numpy as np
from sklearn.lda import LDA
from sklearn.metrics import accuracy_score
from util import DataReader, partition_data

# Load the combined expression dataset; labels arrive as one-hot pairs.
fp = '../data/E-GEOD-48350/E-GEOD-48350-combined.csv'
x, y = DataReader(fp).get_data()

def _collapse_label(pair):
    # Reduce a one-hot label pair to a single scalar label.
    return pair[0] if pair[0] == 1 else pair[1]

def _as_floats(seq):
    # Coerce nested sequences into a float ndarray.
    return np.array(seq).astype(float)

y = [_collapse_label(pair) for pair in y]

# 80/20 train-test partition.
partition = partition_data(x, y, [0.8, 0.2])
train_x = _as_floats(partition[0][0])
train_y = _as_floats(partition[0][1])
test_x = _as_floats(partition[1][0])
test_y = _as_floats(partition[1][1])

# Shrinkage LDA (lsqr solver) fitted on the training split.
lda = LDA(n_components=2, shrinkage='auto', solver='lsqr')
lda.fit(train_x, train_y)
test_y_pred = lda.predict(test_x)
print(accuracy_score(test_y_pred, test_y))
def mlda():
    """Minimal LDA demo: fit on six labelled 2-D points, predict one query."""
    samples = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    targets = np.array([1, 1, 1, 2, 2, 2])
    model = LDA()
    # Constructor defaults for reference: LDA(n_components=None, priors=None,
    # shrinkage=None, solver='svd', store_covariance=False, tol=0.0001)
    model.fit(samples, targets)           # train
    print(model.predict([[-0.8, -1]]))    # predict
# NOTE(review): fragment (Python 2) — lda_result1 and iris come from code
# outside this chunk (iris presumably sklearn's load_iris; lda_result1 an
# earlier 2-component LDA projection).
print 'LDA result 1:', lda_result1.shape
# second projection, down to a single component
lda = LDA(n_components=1)
lda_result2 = lda.fit_transform(iris.data, iris.target)
print 'LDA result 2:', lda_result2.shape

# Visualization
import matplotlib.pyplot as plt
plt.subplot(1,2,1)
# scatter the 2-D projection, one colour per iris class
plt.scatter(lda_result1[iris.target==0, 0], lda_result1[iris.target==0, 1], color='r')
plt.scatter(lda_result1[iris.target==1, 0], lda_result1[iris.target==1, 1], color='g')
plt.scatter(lda_result1[iris.target==2, 0], lda_result1[iris.target==2, 1], color='b')
plt.title('LDA on iris (1)')
plt.subplot(1,2,2)
plt.stem(lda_result2)
plt.title('LDA on iris (2)')
plt.show()

# Classification
# hold out the last 5 samples as a (tiny) test set
x_train_set = iris.data[:-5]
y_train_set = iris.target[:-5]
x_test_set = iris.data[-5:]
y_test_set = iris.target[-5:]
clf = LDA()
clf.fit(x_train_set, y_train_set)
y_pre = clf.predict(x_test_set)
print 'y_pre = \n', y_pre
print 'y_corret = \n', y_test_set
# NOTE(review): script fragment — PLSRegression, KFold, LDA, accuracy_score,
# csv, features, MA_label, n_components and numFeatures are defined outside
# this chunk.
#PLS Dimension Reduction
pls2 = PLSRegression(n_components=n_components)
pls2.fit(features, MA_label)
XScore = pls2.transform(features)
# XScore = features

#LDA Classification
# 5-fold cross-validated LDA accuracy on the PLS score matrix.
kf = KFold(n_splits=5)
kf.get_n_splits(XScore)
mean_acc = 0
for train_index, test_index in kf.split(XScore):
    X_train, X_test = XScore[train_index], XScore[test_index]
    y_train, y_test = MA_label[train_index], MA_label[test_index]
    clf = LDA()
    clf.fit(X_train, y_train)
    Y_predict = clf.predict(X_test)
    for i in range(len(Y_predict)):
        print("Y_Predict {} - Y_Test {}".format(Y_predict[i], y_test[i]))
    acc = accuracy_score(Y_predict, y_test)
    print("Accuracy = {}".format(acc))
    mean_acc = mean_acc + acc
# average accuracy across the 5 folds, as a percentage
mean_acc = (mean_acc / 5) * 100
print("Accuracy is {}".format(mean_acc))
# append (feature count, accuracy) to the results CSV
with open("Results/MLL.csv", 'a') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow([numFeatures, mean_acc])
# FIX: removed the trailing `csvfile.close()` — `csvfile` (lower-case f) was
# never defined, so it raised NameError, and the `with` block above already
# closes `csvFile`.
# *****************************************************************************
# Linear Discriminant Analysis
from sklearn import datasets
from sklearn import metrics
from sklearn.lda import LDA

# load the iris datasets
dataset = datasets.load_iris()

# fit a LDA model to the data
model = LDA()
model.fit(dataset.data, dataset.target)
print(model)

# make predictions on the training data itself
expected, predicted = dataset.target, model.predict(dataset.data)

# summarize the fit of the model: per-class report, then the confusion matrix
report = metrics.classification_report(expected, predicted)
print(report)
print(metrics.confusion_matrix(expected, predicted))