def testSelectedFeatures1():
    """Compare logistic regression and LDA on three variants of the RW data.

    The variants are the full normalized data set, the data restricted to
    the selected feature columns (plus the label column), and the output of
    ``addSquareFeature`` for the same columns.  Every variant is shuffled and
    scored with 5-fold validation three times; the mean accuracies are
    printed.  Nothing is returned.
    """
    print("start testSelectedFeatures1()")
    selected_columns = [10, 1, 9, 6]
    repeats = 3
    folds = 5

    LRModel = lr.LogisticRegression(0.001, 500)
    LDAModel = LDA.LDA()

    data1 = genRWNormalized()
    # Selected feature columns followed by the label (last) column.
    data2 = np.append(data1[:, selected_columns],
                      np.array([data1[:, -1]]).T,
                      axis=1)
    data3 = addSquareFeature(data1, selected_columns)

    variants = [
        ("all features", data1),
        ("selected features", data2),
        ("with squared features", data3),
    ]
    lr_totals = [0] * len(variants)
    lda_totals = [0] * len(variants)

    for _ in range(repeats):
        # Shuffle every variant first, then evaluate, as the original did.
        for _, data in variants:
            np.random.shuffle(data)
        for idx, (_, data) in enumerate(variants):
            lr_totals[idx] += LRKFoldValidation(LRModel, data, folds)
            # The original scored LDA for the first variant on data2, which
            # merely duplicated the second variant's result.
            lda_totals[idx] += LDAKFoldValidation(LDAModel, data, folds)

    for idx, (label, _) in enumerate(variants):
        print("Accuracy for lr in rw ({}) is {}".format(
            label, lr_totals[idx] / repeats))
        print("Accuracy for LDA in rw ({}) is {}".format(
            label, lda_totals[idx] / repeats))
def main():
    """Train a weighted model on LDA-compressed features and print accuracy.

    Reads ``../resource/train.csv``, holds out the last 10% of the rows as a
    test set, expands the categorical attributes with ``MultiDimension``,
    compresses train and test attributes with the LDA coefficients learned
    on the training part, and reports the hold-out accuracy of
    ``WeightedModel``.

    Raises:
        ValueError: if the file has too few rows to hold out a test set.
    """
    filename = '../resource/train.csv'
    itemid, numattr, cateattr, label = readfile(filename)

    totalnum = len(numattr)
    testnum = int(totalnum * 0.1)
    if testnum == 0:
        # Without test rows the accuracy below would divide by zero.
        raise ValueError('not enough rows in %s to hold out a test set'
                         % filename)
    trainnum = totalnum - testnum

    trainnumattr = numattr[0: trainnum]
    traincateattr = cateattr[0: trainnum]
    trainlabel = label[0: trainnum]
    testnumattr = numattr[trainnum:]
    testcateattr = cateattr[trainnum:]
    testlabel = label[trainnum:]

    # The categorical expansion is built from the training rows only and
    # then applied to the test rows.
    multidim = MultiDimension(traincateattr)
    trainextattr = multidim.gettrainextattr()
    testextattr = multidim.gettestextattr(testcateattr)
    trainattr = append(trainnumattr, trainextattr, axis=1)
    testattr = append(testnumattr, testextattr, axis=1)

    LDAcoe = LDA(trainattr, trainlabel)
    LDAtrainattr = conpress(trainattr, LDAcoe)
    LDAtestattr = conpress(testattr, LDAcoe)

    # A leftover debugging dump followed by sys.exit(1) used to sit here; it
    # made everything below unreachable and ended the run with a failure
    # status, so it has been removed.
    model = WeightedModel(LDAtrainattr, trainlabel)
    right = 0
    for i in range(testnum):
        if model.predict(LDAtestattr[i]) == testlabel[i]:
            right += 1
    accuracy = float(right) / testnum
    # Single-argument print call: same output under Python 2 and Python 3.
    print('accuracy: %s' % accuracy)
def three():
    """Return the result of ``LDA.LDA(10)`` serialized as a JSON string.

    A Flask application object is created and configured for pretty-printed
    JSON, but the returned string is produced with ``json.dumps`` directly.
    """
    app = Flask(__name__)
    app.config.update(JSONIFY_PRETTYPRINT_REGULAR=True)
    # Per the original note: runs over 10 documents.
    return json.dumps(LDA.LDA(10), ensure_ascii=False)
def __get_topic(self):
    """
    Build a dictionary mapping each hashtag text to a topic using
    Latent Dirichlet Allocation.

    An LDA model is built from the (text, id) pairs of ``self.tweets``; each
    tweet is then assigned the model's bag-of-words prediction, and every
    hashtag receives ``self.most_common`` of the topics of its tweets.
    """
    topic_by_tweet = {}
    corpus = []
    for tw in self.tweets:
        tweet_id = tw["id_str"]
        topic_by_tweet[tweet_id] = ""
        corpus.append((self.get_tweet_text(tw), tweet_id))

    model = LDA.LDA(corpus)

    # Bag-of-words prediction; the model also offers predict_with_tf_idf.
    for tw in self.tweets:
        topic_by_tweet[tw["id_str"]] = model.predict_with_bag(
            self.get_tweet_text(tw))

    topics_per_hashtag = {tag["text"]: [] for tag in self.hashtags}
    for tweet_id, tags in self.tweet_hashtag_map.items():
        for tag in tags:
            topics_per_hashtag[tag["text"]].append(topic_by_tweet[tweet_id])

    # A hashtag's topic is the most common topic among its tweets.
    return {
        name: self.most_common(topics)
        for name, topics in topics_per_hashtag.items()
    }
def testLDAWithWine():
    """Return the 5-fold validation score of a fresh LDA model on wine data.

    The data is loaded from ``file_path1``, passed to ``qualityToCategory``
    (its return value is not used, so it presumably edits the data in place;
    verify against its definition), shuffled, and split; only the training
    part of the split is validated.
    """
    wine = genDataWOHeader(file_path1)
    qualityToCategory(wine)
    np.random.shuffle(wine)
    # seperateTestSet yields (test, train); the test part is unused here.
    _, training = seperateTestSet(wine)
    return LDAKFoldValidation(LDA.LDA(), training, 5)
def testLDAWithCancer():
    """Return the 5-fold validation score of a fresh LDA model on cancer data.

    The data is loaded from ``file_path2``, passed through
    ``classToCategory`` and ``preprocessData`` (their return values are not
    used, so they presumably edit the data in place; verify against their
    definitions), shuffled, and split; only the training part is validated.
    """
    cancer = genData(file_path2)
    classToCategory(cancer)
    preprocessData(cancer)
    np.random.shuffle(cancer)
    # seperateTestSet yields (test, train); the test part is unused here.
    _, training = seperateTestSet(cancer)
    return LDAKFoldValidation(LDA.LDA(), training, 5)
def RunTrainLDA(infile, pcaFile, ldaFile):
    """Compute PCA and LDA projections for a pickled data set and save them.

    Args:
        infile: path of a pickle file holding two consecutive objects, the
            data set followed by the subject ids.
        pcaFile: destination passed to ``np.save`` for the PCA projection.
        ldaFile: destination passed to ``np.save`` for the LDA projection.
    """
    import cPickle

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the file even when unpickling fails.
    with open(infile, "r") as fp:
        dataset = cPickle.load(fp)
        subjID = cPickle.load(fp)

    pca = PCA(dataset)
    pca_proj = pca.compute()
    np.save(pcaFile, pca_proj)

    lda = LDA(dataset, subjID, pca_proj)
    projData = lda.projectData()
    lda_proj = lda.train(projData)
    np.save(ldaFile, lda_proj)
def _init_trans_mat(self):
    """Build the initial transformation matrix ``L``.

    The strategy is chosen by ``self._init_method``:

    * ``"PCA"``      - transposed PCA components of ``self.X`` plus 1e-6.
    * ``"LDA"``      - transform from ``LDA.LDA``, each row scaled by the
                       reciprocal of its L1 norm.
    * ``"randbeng"`` - normal samples with standard deviation
                       ``sqrt(2) / sqrt(D + d)``, drawn from the global
                       NumPy generator (not from the seeded one).
    * ``"randbest"`` - ten random candidates; the one with the fewest
                       active constraints is kept.
    * ``"rand"``     - one random candidate, ``bound * rand(D, d) - bound``.

    Returns:
        The initial matrix; ``(D, self.d)`` for the random strategies, where
        ``D`` is the number of columns of ``self.X``.

    Raises:
        ValueError: if X, labels or d is unset, if the requested sub-space
            dimension is incompatible with the data, or if the init method
            is not one of the names above.
    """
    # Check input
    if any(x is None for x in (self.X, self.labels, self.d)):
        raise ValueError('X, labels and subdim not set!')

    num_pts, D = self.X.shape[0], self.X.shape[1]
    subdim = self.d
    method = self._init_method

    # Setup random state; RandomState(None) seeds itself like RandomState().
    prng = RandomState(self._SEED)
    if self._SEED is not None and self._verbose:
        print("Setting random seed to", self._SEED)

    if method == "PCA":
        if num_pts < self.d:
            raise ValueError('num_pts < subdim')
        if self.d > D:
            raise ValueError('subdim > inputdim')
        pca = PCA(n_components=subdim, whiten=False)
        pca.fit(self.X)
        L = pca.components_.T + 1E-6

    elif method == "LDA":
        if self.d > D:
            raise ValueError('subdim > inputdim')
        lda_obj = LDA.LDA(self.X, self.labels)
        lda_obj.compute(dim=self.d)
        L = lda_obj.getTransform()
        L = L * (1. / LA.norm(L, ord=1, axis=1)).reshape(-1, 1)

    elif method == "randbeng":
        # Uses the local input dimension D like every other branch; the
        # original read self.D here.
        L = np.random.normal(0, np.sqrt(2) / np.sqrt(D + self.d),
                             (D, self.d))

    elif method == "randbest":
        # Generate random matrices and keep the one with the lowest number
        # of active constraints.
        if self._verbose:
            print('Doing random pre-gen L')
        t0 = timeit.default_timer()
        best_L = prng.rand(D, self.d)
        L = best_L
        self.loss_fun(best_L)
        bound = np.sqrt(6. / (D + self.d))
        best_N_consts = 1E10
        for _ in range(10):
            L = 1. * bound * prng.rand(D, self.d) - bound
            # loss_fun is evaluated before counting, presumably to refresh
            # the constraint state for this candidate (confirm in loss_fun).
            self.loss_fun(L)
            consts = self._count_active_constraints()
            if consts < best_N_consts:
                best_N_consts = consts
                best_L = copy.copy(L)
        L = copy.copy(best_L)
        if self._verbose:
            print("Pre-gen of L done. Took:",
                  "%3.3f" % (timeit.default_timer() - t0), end=", ")
            print("# active const", best_N_consts, end=", ")

    elif method == "rand":
        print('Doing random pre-gen Lapa')
        bound = np.sqrt(6. / (D + self.d))
        # NOTE(review): this yields values in [-bound, 0); confirm that a
        # one-sided range is intended.
        L = 1. * bound * prng.rand(D, self.d) - bound

    else:
        # Previously an unknown name fell through to ``return L`` and failed
        # with an UnboundLocalError.
        raise ValueError('unknown init method: %r' % (method,))

    return L
x_n = x_n.reshape(-1, 1) p_ks = np.empty(len(self.unique_y)) for j, k in enumerate(self.unique_y): p_x_given_y = self._mvn_density(x_n, self.mu_ks[j], self.Sigma) p_y_given_x = self.pi_ks[j]*p_x_given_y p_ks[j] = p_y_given_x y_n[i] = self.unique_y[np.argmax(p_ks)] return y_n We fit the LDA model below and classify the training observations. As the output shows, we have 100% training accuracy. lda = LDA() lda.fit(X, y) yhat = lda.classify(X) np.mean(yhat == y) The function below visualizes class predictions based on the input values for a model with $\bx_n \in \mathbb{R}^2$. To apply this function, we build a model with only two columns from the `wine` dataset. We see that the decision boundaries are linear, as we expect from LDA. def graph_boundaries(X, model, model_title, n0 = 100, n1 = 100, figsize = (7, 5), label_every = 4): # Generate X for plotting d0_range = np.linspace(X[:,0].min(), X[:,0].max(), n0) d1_range = np.linspace(X[:,1].min(), X[:,1].max(), n1) X_plot = np.array(np.meshgrid(d0_range, d1_range)).T.reshape(-1, 2) # Get class predictions y_plot = model.classify(X_plot).astype(int)
import os
import pickle
from LDA import *

# Read every file below the corpus directory as one document (newlines
# replaced by spaces), then train a 5-topic LDA model and pickle it.
path = 'bbcsport'
docs = []
for folder, _subfolders, names in os.walk(path):
    for name in names:
        full_name = folder + '/' + name
        with open(full_name, 'r', encoding='latin-1') as handle:
            print(full_name)
            docs.append(handle.read().replace('\n', ' '))

lda_model = LDA(docs, K=5)
lda_model.train(n_iterations=100)
lda_model.pickle_LDA('pickledlda')
# カテゴリの最小のデータ数 minNum = np.min([np.sum(Ytr==-1),np.sum(Ytr==1)]) # 各カテゴリのデータ Xneg = Xtr[Ytr[:,0]==-1] Xpos = Xtr[Ytr[:,0]==1] # 最小データ数分だけ各カテゴリから抽出し結合 Xtr = np.concatenate([Xneg[:minNum],Xpos[:minNum]],axis=0) Ytr = np.concatenate([-1*np.ones(shape=[minNum,1]),1*np.ones(shape=[minNum,1])]) #------------------- ''' #------------------- # 3. 線形判別モデルの学習 myModel = lda.LDA(Xtr, Ytr) myModel.train() #------------------- #------------------- # 4. 線形判別モデルの評価 print(f"モデルパラメータ:\nw={myModel.w},\n平均m={myModel.m}") print(f"正解率={myModel.accuracy(Xte,Yte):.2f}") #------------------- #------------------- # 5. 真値と予測値のプロット if Xtr.shape[1] == 2: myModel.plotModel2D( X=Xtr, Y=Ytr,
df = bc_df.copy() del df['ID'] #Dropping an irrelevant feature that has nothing to do with the prediction of whether a tumor is benign or not df['class_modified'] = pd.to_numeric((df['Class'] == 4)).astype(int) df['Bare_Nuclei'] = pd.to_numeric(df['Bare_Nuclei']).astype(int) #Standardize data for column in df.columns[0:9]: df[column] = (df[column] - df[column].mean()) / df[column].std() #LDA LDA_BC = LDA() df.insert(0, "Constant", 1) df_copy = df.copy() df_copy = df_copy.drop(columns=['Class']) X = df_copy[df_copy.columns[0:10]] Y = df_copy["class_modified"] def k_fold_CV(data, model, k): all_data = data.iloc[np.random.permutation(len(data))] data_split = np.array_split(data, k) accuracies = np.ones(k)
# clf.fit(X_wines[:int(0.7*len(X_wines))], y_wines[:int(0.7*len(X_wines))]) # predicted_y = clf.predict(X_wines[int(0.7*len(X_wines)):]) # print(evaluate_acc(predicted_y,y_wines[int(0.7*len(X_wines)):])) # X_wines, y_wines = process_wines() # clf = LDA() # print("LDA on wines- Zachary",cross_validation(clf,X_wines,y_wines,5)) # # X_tumors, y_tumors = process_tumors() # clf = LDA() # print("LDA on tumors - Zachary",cross_validation(clf,X_tumors,y_tumors,5)) # X_wines, y_wines = process_wines() start = time.time() clf = LDA(X_wines[:int(0.8 * len(X_wines))]) print("LDA on wines", cross_validation(clf, X_wines, y_wines, 5)) end = time.time() print("LDA on wines time", (end - start) / 5) X_tumors, y_tumors = process_tumors() start = time.time() clf = LDA(X_tumors[:int(0.8 * len(X_tumors))]) print("LDA on tumors", cross_validation(clf, X_tumors, y_tumors, 5)) end = time.time() print("LDA on tumors time", (end - start) / 5) # # # # X_wines, y_wines = process_wines() start = time.time() clf = Logistic(0.01, 1000)
# y_pred = LDA.knn(breast_lower_dimension_train, breast_train_y.values.ravel(), breast_lower_dimension_test) # acc = LDA.compute_accuracy(y_pred, breast_test_y.values.ravel()) # print("=================== IONOSPHERE ==============") # ionosphere_train_x_selection, ionosphere_test_x_selection = featureSelection(ionosphere_train_x.values, ionosphere_train_y.values.ravel(), ionosphere_test_x.values) # ionosphere_lower_dimension_train, ionosphere_lower_dimension_test = LDA.LDA(ionosphere_train_x_selection, ionosphere_train_y.values.ravel(), ionosphere_test_x_selection, ionosphere_test_y.values.ravel(), 'ionosphere') # prior, train_mean, train_cov = NBC.train(ionosphere_train_x.values, ionosphere_train_y.values.ravel(), CLASS_NUM) # acc = NBC.test(ionosphere_test_x.values, ionosphere_test_y.values.ravel(), prior, train_mean, train_cov, CLASS_NUM, 'ionosphere', 'NBC', True) # # # Project to lower dimension # # crossValidation(iris_lower_dimension_train, iris_train_y, CLASS_NUM, 'iris', 'NBC', K) # BUG # prior, train_mean, train_cov = NBC.train(ionosphere_lower_dimension_train, ionosphere_train_y.values.ravel(), CLASS_NUM) # acc = NBC.test(ionosphere_lower_dimension_test, ionosphere_test_y.values.ravel(), prior, train_mean, train_cov, CLASS_NUM, 'ionosphere_lower', 'NBC', True) print("=================== WINE ==============") wine_train_x_selection, wine_test_x_selection = featureSelection(wine_train_x.values, wine_train_y.values.ravel(), wine_test_x.values) wine_lower_dimension_train, wine_lower_dimension_test = LDA.LDA(wine_train_x_selection, wine_train_y.values.ravel(), wine_test_x_selection, wine_test_y.values.ravel(), 'wine') # prior, train_mean, train_cov = NBC.train(wine_train_x.values, wine_train_y.values.ravel(), CLASS_NUM) # acc = NBC.test(wine_test_x.values, wine_test_y.values.ravel(), prior, train_mean, train_cov, CLASS_NUM, 'wine', 'NBC', True) # Pocket classifier # crossValidation(ionosphere_train_x, ionosphere_train_y, class_num, 'ionosphere', 'PC', K) 
train_weight = PC.train(wine_train_x.values, wine_train_y.values.ravel(), CLASS_NUM) acc = PC.test(wine_test_x.values, wine_test_y.values.ravel(), train_weight, CLASS_NUM, 'wine', 'PC', True) # # Project to lower dimension # # crossValidation(iris_lower_dimension_train, iris_train_y, CLASS_NUM, 'iris', 'NBC', K) # BUG # prior, train_mean, train_cov = NBC.train(wine_lower_dimension_train, wine_train_y.values.ravel(), CLASS_NUM) # acc = NBC.test(wine_lower_dimension_test, wine_test_y.values.ravel(), prior, train_mean, train_cov, CLASS_NUM, 'wine_lower', 'NBC', True) # Pocket classifier # crossValidation(ionosphere_train_x, ionosphere_train_y, class_num, 'ionosphere', 'PC', K) train_weight = PC.train(wine_lower_dimension_train, wine_train_y.values.ravel(), CLASS_NUM)
#### preprocess before LDA dict_title_preprocessed = lda.texts_preprocess(dict_title) dict_description_preprocessed = lda.texts_preprocess(dict_description) list_title_preprocessed = list(dict_title_preprocessed.values()) list_description_preprocessed = list( dict_description_preprocessed.values()) print("text preprocessed done!") #### generate item title and description similarity for selected items item_tt_id_lst = list(train_item_id.keys()) + list(test_item_id.keys()) item_total_id_lst = list(dict_title.keys()) index_lst = [] for id in item_tt_id_lst: index_lst.append(item_total_id_lst.index(id)) title_similarity = lda.LDA(texts=list_title_preprocessed, index_lst=index_lst, num_topics=finput_topic_num) description_similarity = lda.LDA(texts=list_description_preprocessed, index_lst=index_lst, num_topics=finput_topic_num) print("lda similarity calculated done!") #### generate train/test item similarity matrix df_title_similarity_matrix = pd.DataFrame(np.array(title_similarity), index=item_tt_id_lst, columns=item_tt_id_lst) df_description_similarity_matrix = pd.DataFrame( np.array(description_similarity), index=item_tt_id_lst, columns=item_tt_id_lst) # train_item_id = rw.readffile(finput_train_item_id)
import Potential
import PhysTools
import LDA
import PlotResults

# xrdb -load /dev/null
# xrdb -query


def main():
    """Build the trap potentials, run the LDA and plot its results."""
    temp = 40 * 1e-9  # temperature in unit of Kelvins
    lattice_er = 6  # lattices depth in recoil energy
    green_er = 0.1  # 532 green boxtrap depth
    m = 350  # magnetic flux density

    atom = PhysTools.Lithium(a=PhysTools.scatlength(m))
    boxtrap = Potential.BoxTrap(
        SheetEr=green_er,
        HoleEr=green_er,
        SheetD=30,
        HoleD=30,
        atom=atom,
    )
    # Same lattice depth along all three axes.
    lattices = Potential.TopHatLattices(Er=[lattice_er] * 3, atom=atom)
    mfield = Potential.BiasMagneticField(B=350, curv_I=0.134363)

    lda0 = LDA.LDA(
        lattices=lattices,
        boxtrap=boxtrap,
        mfield=mfield,
        Global_mu=-2,
    )
    PlotResults.plotresults(lda0, 0.6, m)


if __name__ == '__main__':
    main()
np.random.seed(1234) # GENERATE DATA # word to be 0 - 4 print('generate sample...') N = [1000, 1000, 1000] theta = np.array([[0.9, 0.05, 0.05], [0.1, 0.7, 0.2], [0.1, 0.2, 0.7]]) beta = np.array([[0, 0.3, 0, 0.6, 0.1], [0.8, 0.05, 0.05, 0.05, 0.05], [0.05, 0.05, 0.5, 0, 0.4]]) print('theta') print(theta) print('beta') print(beta) print() Y = [] for i in range(3): yi = np.zeros(N[i], dtype=int) for j in range(N[i]): topic = np.random.choice(3, p=theta[i, :]) yi[j] = np.random.choice(5, p=beta[topic, :]) Y.append(yi) LDA.LDA(Y, 3, 3, 5)
docs_bus = reviews_merged_bus.values() with open('../output/reviews_merged_bus.pickle', 'wb') as f: pickle.dump(reviews_merged_bus, f) with open('../output/docs_bars_bus.pickle', 'wb') as f: pickle.dump(docs_bus, f) with open('../output/bus_ids_bars_LDA.pickle', 'wb') as f: pickle.dump(reviews_merged_bus.keys(), f) lda_bus = LDA.LDA( alpha=alpha, eta=eta, n_topics=n_topics, n_features=n_features, max_df=max_df, min_df=min_df, max_iter=max_iter, ) lda_bus.vectorizecounts(docs_bus) lda_bus.fitLDA() LDA.SaveLDAModel('../output/LDA_model_bus.pickle', lda_bus) # The topic vector for a given business is given by this dataframe. bus_lda_ids = pickle.load(open('../output/bus_ids_bars_LDA.pickle', 'rb')) bus_vectors = pd.DataFrame() bus_vectors['business_id'] = bus_lda_ids transformed = lda_bus.lda.transform(lda_bus.tf) bus_vectors['topic_vector'] = [bus_topic_vec for bus_topic_vec in transformed]
import SVM
import lr
import Bayes
import LDA

# Run each classifier entry point once, in the original order.
for run_demo in (LDA.LDA, Bayes.Bayes, SVM.svmwch, lr.lr):
    run_demo()
def perform_lda(train_dataset, train_labelset, test_dataset):
    """Project training and test data with an LDA fitted on the training set.

    Args:
        train_dataset: training samples handed to ``LDA.LDA``.
        train_labelset: labels for ``train_dataset``.
        test_dataset: samples projected with the fitted projection matrix.

    Returns:
        Tuple ``(projected_train_data, projected_test_data)``.
    """
    lda = LDA.LDA(train_dataset, train_labelset)
    projection_matrix, projected_train_data = lda.fit()
    # Diagnostic output: shapes of the projection matrix and the test data.
    # (The original printed np.shape(np.shape(...)), i.e. only the rank.)
    print(np.shape(projection_matrix), np.shape(test_dataset))
    projected_test_data = lda.test_fit(projection_matrix, test_dataset)
    return projected_train_data, projected_test_data
import time
import BCWDataset
import WQDataset
import LDA
import LogisticRegression
import KFoldCrossValidator

bcwd = BCWDataset.BCWDataset()
bcwd.load()
wqd = WQDataset.WQDataset()
wqd.load()

# For each data set print the 5-fold validation result of LDA first and of
# logistic regression second, each preceded by its heading.
for lda_heading, logreg_heading, dataset in (
        ("LDA, BCW", "LogReg, BCW", bcwd),
        ("LDA, WQ", "LogReg, WQ", wqd),
):
    print(lda_heading)
    print(KFoldCrossValidator.validate(LDA.LDA(), 5, dataset.X, dataset.y))
    print(logreg_heading)
    print(
        KFoldCrossValidator.validate(
            LogisticRegression.LogisticRegression(flr=0.6, slr=0.1,
                                                  num_it=100),
            5, dataset.X, dataset.y))
print('K', lda.K) print('_uniqTermSet', lda._uniqTermSet) print('docsSize', lda._docNum) print('termSize', lda._termNum) print('Z ini:', lda.Z) print('docTopic ini', lda._docTopic) ##4 doc,2topic print('lda.termTopic', lda._termTopic) print('lda.Phi', lda.Phi) print('lda.Theta', lda.Theta) if __name__ == "__main__": corpus = [ "With all of the critical success Downey had experienced throughout his career, he had not appeared in a blockbuster film. That changed in 2008 when Downey starred in two critically and commercially successful films, Iron Man and Tropic Thunder. In the article Ben Stiller wrote for Downey's entry in the 2008 edition of The Time 100, he offered an observation on Downey's commercially successful summer at the box office.", "On June 14, 2010, Downey and his wife Susan opened their own production company called Team Downey. Their first project was The Judge.", "Robert John Downey Jr. is an American actor, producer, and singer. His career has been characterized by critical and popular success in his youth, followed by a period of substance abuse and legal troubles, before a resurgence of commercial success in middle age.", "In 2008, Downey was named by Time magazine among the 100 most influential people in the world, and from 2013 to 2015, he was listed by Forbes as Hollywood's highest-paid actor. His films have grossed over $14.4 billion worldwide, making him the second highest-grossing box-office star of all time." ] X = [i.split(' ') for i in corpus] lda = LDA.LDA() lda.fit(X) printAttr(lda) #fig,ax= lda.plotDocTopicDist(2) #fig,ax = lda.plotTermTopicDist(2) #fig,ax = lda.plotTopicTermDist(1) plt.show()
sample_image = test_images[random.sample(range(len(test_label)), 10)] if input_.mode == 0: ## Doing PCA and get the eigenface and W(dimension reduction) PCA_mean, PCA_EigenFace, PCA_W = PCA(images=images, Size=Size, FacePath="./PCA/EigenFace/") Reconstruct(EigenFace=PCA_EigenFace, sample_image=sample_image, Size=Size, Path="./PCA/") ## Doing LDA and get the fisherface and W(dimension reduction) LDA_mean, LDA_EigenFace, LDA_W = LDA(images=images, Size=Size, label=label, FacePath="./LDA/EigenFace/") Reconstruct(EigenFace=LDA_EigenFace, sample_image=sample_image, Size=Size, Path="./LDA/") elif input_.mode == 1: ## Doing PCA and get the eigenface and W(dimension reduction) print("PCA:") PCA_mean, PCA_EigenFace, PCA_W = PCA(images=images, Size=Size, FacePath=None) ## Using PCA Knn on test image sets, I try to label the test images. KNN("PCA", k = 3, images = images, EigenFace = PCA_EigenFace.T, proj_train_image = PCA_W, label = label, \ test_images = test_images, test_label = test_label)
import LoadImage
import LBP
import LDA

ClassNum = 40
countInSameClass = 10
image_total = ClassNum * countInSameClass
sizeOfImage = 112 * 92


def main():
    """Load the ORL face images and run LDA on the transposed face matrix."""
    FaceMat, label = LoadImage.loadImage(
        './ORL/s', ClassNum, countInSameClass, image_total, sizeOfImage)
    # Optional LBP preprocessing (disabled):
    # FaceMat_fromLBP = LBP.LBP(92,112,FaceMat)
    LDA.LDA(FaceMat.T, label)


if __name__ == '__main__':
    main()
ax.plot(X_pca_projected[30:], np.zeros(30), linestyle='None', marker='o', markersize=7, color='blue') ax.set_xlabel('PC1') ax.set_ylabel('') ax.set_title('Projected of X onto PC1') fig.show() fig.savefig('Projected of X onto PC1') X.shape[0] y = np.array(y) W = lda.LDA(X, y) set(y) X_Wproj = X.dot(W) fig = plt.figure(figsize=(10, 8)) ax = fig.add_subplot(2, 1, 1) ax.plot(X_Wproj[0:30], np.zeros(30), linestyle='None', marker='o', markersize=5, color='orange') ax.plot(X_Wproj[30:], np.zeros(30), linestyle='None',
for i in learn: for j in ite: LRModel = lr.LogisticRegression(i, j) ave = 0.0 for k in range(3): ac = LRKFoldValidation(LRModel, rwClear, 5) print("per k fold:", ac) ave += ac ave = ave / 3.0 print("ave:", ave) if ave > max_acc: max_acc = ave bestLearn = i bestIte = j print(ave, " ", i, " ", j) print(bestLearn) print(bestIte) print(max_acc) LRModel = lr.LogisticRegression(0.001, 500) LDAModel = LDA.LDA() rwNormalized = genRWNormalized() cancerNormalized = genCancerNormalized() rwNormalized = genRWNormalized() print(LRKFoldValidation(LRModel, cancerNormalized, 5)) print(LDAKFoldValidation(LDAModel, cancerNormalized, 5)) print(LRKFoldValidation(LRModel, rwNormalized, 5)) print(LDAKFoldValidation(LDAModel, rwNormalized, 5))
def _crossValidatedAccuracy(samples, isLR, folds):
    """Score ``samples`` (features + label column) with a fresh LR or LDA model."""
    if isLR:
        return LRKFoldValidation(lr.LogisticRegression(0.001, 500),
                                 samples, folds)
    return LDAKFoldValidation(LDA.LDA(), samples, folds)


def featureSelection(data, isLR, folds=5):
    """Greedy forward feature selection driven by k-fold validation accuracy.

    In every round each not-yet-selected column is tried together with the
    columns chosen so far; the best candidate is kept (ties go to the
    higher column index).  The first round always keeps its best column;
    later rounds stop as soon as the best candidate does not strictly beat
    the accuracy reached so far.

    Args:
        data: 2-D array whose last column holds the labels.
        isLR: truthy to score with logistic regression, falsy for LDA.
        folds: number of validation folds (previously hard-coded to 5).

    Returns:
        Tuple ``(selectedFeatureNum, selectedFeatureArray)``: the chosen
        column indices in selection order and the matching columns of
        ``data``.  With no feature columns the result is ``([], -1)``.
    """
    selectedFeatureNum = []
    selectedFeatureArray = -1
    bestAccuracyAll = 0
    featureCount = data.shape[1] - 1
    y_2d = np.array([data[:, -1]]).T

    for round_idx in range(featureCount):
        firstRound = round_idx == 0
        featureToAdd = -1
        bestAccuracy = 0
        print("select feature{}".format(round_idx))

        for j in range(featureCount):
            if j in selectedFeatureNum:
                continue
            column_2d = np.array([data[:, j]]).T
            nums = selectedFeatureNum + [j]
            if firstRound:
                parts = (column_2d, y_2d)
            else:
                parts = (selectedFeatureArray, column_2d, y_2d)
            # The later rounds used to call lr.lr.LogisticRegression and to
            # pass the LDA.LDA class itself instead of an instance; both
            # rounds now share one scoring helper.
            accuracy = _crossValidatedAccuracy(
                np.concatenate(parts, axis=1), isLR, folds)
            print("Using feature(s){} accuracy is{}".format(nums, accuracy))
            if accuracy >= bestAccuracy:
                bestAccuracy = accuracy
                featureToAdd = j

        if firstRound:
            # The first round is accepted unconditionally.
            bestAccuracyAll = bestAccuracy
            selectedFeatureNum.append(featureToAdd)
            if featureToAdd != -1:
                selectedFeatureArray = np.array([data[:, featureToAdd]]).T
            continue

        # Stop when an additional feature does not improve the accuracy.
        if bestAccuracyAll >= bestAccuracy:
            print("maxima reached")
            break

        bestAccuracyAll = bestAccuracy
        selectedFeatureNum.append(featureToAdd)
        selectedFeatureArray = np.concatenate(
            (selectedFeatureArray, np.array([data[:, featureToAdd]]).T),
            axis=1)

    print(
        "feature selection ended, best performing features are {}, the accuracy is {}"
        .format(selectedFeatureNum, bestAccuracyAll))
    return selectedFeatureNum, selectedFeatureArray
def testDataPreprocess():
    """Print mean 5-fold accuracies for four preprocessing variants.

    For the raw, normalized, "clear" and outlier-removed versions of the RW
    and cancer data (in that order), each pair is shuffled and scored three
    times.  Four averages are printed per variant: LR on RW, LDA on RW, LR
    on cancer, LDA on cancer.
    """
    # All data sets are generated up front, in the original order.
    rwData = genRW()
    cancerData = genCancer()
    rwNormalized = genRWNormalized()
    cancerNormalized = genCancerNormalized()
    rwRemovedOL = genRWRemovedOL()
    cancerRemovedOL = genCancerRemovedOL()
    rwClear = genRWClear()
    cancerClear = genCancerClear()

    LRModel = lr.LogisticRegression(0.001, 500)
    LDAModel = LDA.LDA()

    variants = (
        (rwData, cancerData),
        (rwNormalized, cancerNormalized),
        (rwClear, cancerClear),
        (rwRemovedOL, cancerRemovedOL),
    )
    for rw, cancer in variants:
        totals = [0, 0, 0, 0]
        for _ in range(3):
            np.random.shuffle(rw)
            np.random.shuffle(cancer)
            totals[0] += LRKFoldValidation(LRModel, rw, 5)
            totals[1] += LDAKFoldValidation(LDAModel, rw, 5)
            totals[2] += LRKFoldValidation(LRModel, cancer, 5)
            totals[3] += LDAKFoldValidation(LDAModel, cancer, 5)
        for total in totals:
            print(total / 3)
#for userId, user in dic_user.iteritems(): # print(str(userId) + " " + str(len(user.tweet_set))) k_topics = num_topics LDA_iterations = num_iterations sentimentPoints = getSentimentPoints() #print(sentimentPoints) dictionary, corpus, out_set = preprocessing(doc_set) for i in range(0,len(out_set)): tweet_set[i].wordSet = out_set[i] sentimentsOfTweets = getSentimentScoreOfTweets(out_set) model = LDA(dictionary, corpus, k_topics, LDA_iterations) for i in range(0,len(sentimentsOfTweets)): tweet_set[i].russell_tuple = sentimentsOfTweets[i] sentDic = loadDict() dictByTopic = [] tempDic = {} topics = model.get_topics() for topic in topics: tempDic = {} for i in range(0,len(topic)): tempDic[dictionary[i]] = topic[i] dictByTopic.append(tempDic)
plt.colorbar() plt.show() # part c # Reference: https://github.com/scikit-learn/scikit-learn/blob/7389dba/sklearn/discriminant_analysis.py # First, split the data into training and validation sets data_size = len(training_data_norm) indices = np.random.permutation(data_size) x_val, y_val = training_data_norm[indices][:10000], training_labels[indices][:10000] x_train, y_train = training_data_norm[indices][10000:], training_labels[indices][10000:] y_val.flatten() nums = [100, 200, 500, 1000, 2000, 5000, 10000, 30000, 50000] model1, model2 = LDA.LDA(), QDA.QDA() lda_score, qda_score = [], [] for i in nums: model1.fit(x_train[:i], y_train[:i]) model2.fit(x_train[:i], y_train[:i]) lda_pred = model1.predict(x_val) qda_pred = model2.predict(x_val) lda_err = 1 - np.sum(lda_pred == y_val)/y_val.shape[0] lda_score.append(lda_err) qda_err = 1 - np.sum(qda_pred == y_val)/y_val.shape[0] qda_score.append(qda_err) print(lda_score, qda_score) plt.plot(nums, lda_score, 'ro', label="LDA") plt.plot(nums, qda_score, 'yo', label="QDA") plt.xlabel('numbers of training examples')