def test_LDA():
    from LDA import LDA
    import copy

    x = [[2.95, 6.63], [2.53, 7.79], [3.57, 5.65], [3.16, 5.47],
         [2.58, 4.46], [2.16, 6.22], [3.27, 3.52]]
    e = copy.deepcopy(x)
    y = [1, 1, 1, 1, 2, 2, 2]
    t = LDA(x, y)
    for a in e:
        r = t.predict(a)
        print(max(r, key=r.get))
def lda(text):
    lda = LDA(text)
    lda.setTopics(10)
    lda.buildModel()
    lda.writeTopics()
    vector = lda.getTopics()
    del lda
    return vector
def get_model(self, model_type, fo_lang_code, docs):
    model = None
    if model_type == "vsm":
        model = VSM(fo_lang_code=fo_lang_code)
        model.build_model(docs)
    elif model_type == "lda":
        model = LDA(fo_lang_code=fo_lang_code)
        model.build_model(docs, num_topics=60, passes=100)
    return model
def fit(self, X, y):
    self.pca = PCA(n_components=self.pca_components).fit(X)
    pca_projected = self.pca.project(X)
    self.lda = LDA(n_components=self.n_components).fit(pca_projected, y)
    self.subspace = np.dot(self.pca.pro_subspace, self.lda.pro_subspace)
    return self
def fit(self, data):
    '''Fit the model parameters; data = np.array([[value1, value2, ..., label], ...])'''
    model = LDA(k=self.k)
    new_data, self.W = model.fit(data)  # new_data: one sample per row, the last dimension is the class label
    new_data = np.array(new_data)  # matrix -> np.ndarray
    print('First 3 samples after dimensionality reduction:', new_data[:3])
    # Group the reduced data by class
    m, n = new_data.shape  # m samples, each with n dimensions, the last of which is the label
    # Split the data by class
    dataDict = defaultdict(list)
    # Iterate over the samples
    for i in range(m):
        dataDict[new_data[i, -1]].append(new_data[i, :-1])  # strip the class label
    self.classes = list(dataDict.keys())  # store the class labels
    # Compute the mean vector and variance for each class
    for label in dataDict.keys():
        temp = np.array(dataDict[label])  # one sample per row
        mean = temp.mean(axis=0, keepdims=True)  # mean vector, kept 2-D
        var = temp.var(axis=0, keepdims=True)
        self.parameters['class' + str(label)] = {'mean': mean, 'var': var}
class TR_LDA:
    def __init__(self, fo_lang_code, trans_agent):
        self.lda = LDA(fo_lang_code)
        self.trans_agent = trans_agent

    def train(self, docs, num_topics=5, passes=100):
        # docs = [doc.split() for doc in docs]
        trans_docs = []
        for doc in docs:
            trans_doc = translate_sentences(doc, "en")
            trans_docs.append(trans_doc)
        self.lda.train(trans_docs, num_topics=num_topics, passes=passes)

    def get_doc_similarity(self, doc1, doc2):
        en_doc1 = self.trans_agent.get_translated_doc(doc1)
        en_doc2 = self.trans_agent.get_translated_doc(doc2)
        return self.lda.get_doc_similarity(en_doc1, en_doc2)

    def get_model_name(self):
        return "TR-LDA"
def fit(self, data):
    '''Fit the model parameters; data = np.array([[value1, value2, ..., label], ...])'''
    model = LDA(k=self.k)
    new_data, self.W = model.fit(data)  # new_data: one sample per row, the last dimension is the class label
    new_data = np.array(new_data)  # matrix -> np.ndarray
    # Group the reduced data by class
    m, n = new_data.shape  # m samples, each with n dimensions, the last of which is the label
    # Split the data by class
    dataDict = defaultdict(list)
    # Iterate over the samples
    for i in range(m):
        dataDict[new_data[i, -1]].append(new_data[i, :-1])  # strip the class label
    self.classes = list(dataDict.keys())  # store the class labels
    # Compute the mean vector, covariance matrix, and determinant of the covariance for each class
    for label in dataDict.keys():
        temp = np.array(dataDict[label]).T  # transpose so each column is a sample
        mean = temp.mean(axis=1, keepdims=True)  # mean vector, kept 2-D
        covariance = (temp - mean).dot((temp - mean).T)  # covariance matrix, shape=(n, n) with n features
        determinant = np.linalg.det(covariance)  # determinant of the covariance matrix
        inverse_cov = np.mat(covariance).I  # inverse of the covariance matrix
        self.parameters['class' + str(label)] = {'mean': mean, 'inverse_cov': inverse_cov, 'determinant': determinant}
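# Hypothetical sketch (not part of the original snippet) of how the per-class parameters stored
# by fit() above would typically be used at prediction time: evaluate the multivariate Gaussian
# log-density for each class and pick the argmax. The function name and signature are illustrative.
import numpy as np

def gaussian_log_score(x, mean, inverse_cov, determinant):
    # x and mean are column vectors of shape (n, 1); returns the log-density up to a constant
    diff = x - mean
    return -0.5 * float(diff.T.dot(inverse_cov).dot(diff)) - 0.5 * np.log(determinant)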
def create_features(self):
    cate_col = ['col9', 'col609', 'col1317', 'col1430', 'col1975', 'col2135',
                'col3213', 'col3289', 'col3290', 'col3519', 'col3591']
    co_feature1 = [(cate_col[i], cate_col[j]) for i in range(len(cate_col)) for j in range(i)]
    co_feature2 = [(cate_col[j], cate_col[i]) for i in range(len(cate_col)) for j in range(i)]
    co_feature = co_feature1 + co_feature2
    n_topics, n_lda_fe = 3, 10
    lda = LDA(co_feature, n_topics, n_lda_fe)
    lda.fit(train)
    train_lda = lda.transform(train, cate_col)
    test_lda = lda.transform(test, cate_col)
    new_col = [f"{c}_topic{i}" for c in co_feature for i in range(n_topics)]
    for i, c in enumerate(new_col):
        self.train[c] = train_lda[:, i]
        self.test[c] = test_lda[:, i]
def analysisEM_GMM(cryo_data, use_PCA=True, normalize=True, title="EM_GMM Results"):
    ### Define a seed that doesn't push two Gaussians right next to each other
    np.random.seed(1)
    ### Reduce dimensionality to 2D
    new_data = []
    if use_PCA:
        new_data = PCA(cryo_data, normalize=normalize)
    else:  ### use_LDA
        new_data = LDA(cryo_data, user_dims=2, normalize=normalize)
    ### Run the EM_GMM algorithm to attempt to classify our data points
    EM_GMM(new_data, cryo_data.iloc[:, -1], 2, max_iters=10, title=title)
def analysisLDA(cryo_data, normalize=True):
    ### Get results of my own LDA on this dataset
    new_data = LDA(cryo_data, user_dims=2, normalize=normalize)
    plotResults_2D(new_data, cryo_data.iloc[:, -1],
                   'Custom LDA Results on cryo Dataset - Normalized = ' + str(normalize))
    ### Get results to compare against using the sklearn version of LDA on this dataset
    lda = sklearn_LDA(n_components=2)
    if normalize:
        sklearn_data = sklearn_SS().fit_transform(cryo_data.iloc[:, :-1])
        sklearn_new_data = lda.fit_transform(sklearn_data, cryo_data.iloc[:, -1])
    else:
        sklearn_new_data = lda.fit_transform(cryo_data.iloc[:, :-1], cryo_data.iloc[:, -1])
    plotResults_2D(pd.DataFrame(sklearn_new_data), cryo_data.iloc[:, -1],
                   'Sklearn LDA Results on cryo Dataset - Normalized = ' + str(normalize))
def Topic2Vec_v2():
    """
    Analyze the sentences and convert each sentence into a topic vector.
    :return:
    """
    lda = LDA()
    sentences = ReadFile.readTXTFile(config.BTMData + "topic_data_processed.txt")
    docs = []
    lab = []
    for index, line in enumerate(sentences):
        term = line.strip().split("\t")
        if len(term) != 3:
            continue
        docs.append(term[1])
        lab.append(term[2])
    documents = line_Cut_Word(docs)
    documents = [" ".join(doc) for doc in documents]
    lda.load_word_dic()
    lda.load_LdaModel()
    # lda.build_word_dic(lines(documents))
    # print len(lda.word_dic.keys())
    # lda.buildModel(lines(documents))
    result_lab = []
    topic2vec = []
    x_index, y_index = [], []
    count = 0
    print len(lab)
    for index, doc_lab in enumerate(list(zip(docs, lab))):
        if index % 1000 == 0 and index != 0:
            print doc_lab[0], doc_lab[1]
            # break
        doc = doc_lab[0]
        la = doc_lab[1]
        topics = lda.getQuerySimilarly(doc)
        if topics:
            # print doc, "\t", la
            for topic in topics:
                x_index.append(count)
                y_index.append(topic[0])
                topic2vec.append(topic[1])
            count += 1
            result_lab.append(la)
    print len(x_index), len(y_index), len(topic2vec), len(result_lab), count
    result = [x_index, y_index, topic2vec, result_lab]
    with open(config.BTMData + "topic2vec_2.txt", 'wb') as fp:
        cPickle.dump(result, fp)
def get_model(self, model_type, fo_lang_code, docs):
    model = None
    if model_type == "vsm":
        model = VSM(fo_lang_code=fo_lang_code)
        model.build_model(docs)
    elif model_type == "lda":
        model = LDA(fo_lang_code=fo_lang_code)
        model.build_model(docs, num_topics=60, passes=100)
    elif model_type == "gvsm":
        model = GVSM(fo_lang_code=fo_lang_code, term_similarity_type=self.term_similarity_type)
        model.build_model(docs)
    elif model_type == "lsi":
        model = LSI(fo_lang_code=fo_lang_code)
        model.build_model(docs, num_topics=60)
    return model
def cross_validation(X, Y, folds=5, split_value=0.3, name="lda"):
    # Y = Y.reshape((len(Y), 1))
    # X = np.hstack((X, Y))
    # part = -1
    #
    # if split:
    #     part = split_value
    # else:
    #     part = int(np.math.ceil(len(X) / folds))
    # scores = []
    #
    # for i in range(folds):
    #     test = np.array(X[i * part: (i + 1) * part])
    #     test = [list(d) for d in test]
    #     train = [np.array(j) for j in X if list(j) not in test]
    #     test = np.array(test)
    #     train = np.array(train)
    #
    #     train_x, train_y = train[:, :-1], train[:, -1]
    #     test_x, test_y = test[:, :-1], test[:, -1]
    #
    #     print(train_x.shape)
    #     print(test_x.shape)
    scores = []
    for fold in range(folds):
        train_x, test_x, train_y, test_y = train_test_split(
            X, Y, shuffle=True, test_size=split_value)
        if name == "lda":
            lda = LDA()
            lda.fit(train_x, train_y)
            # use the LDA-reduced features for the classifier
            train_x = lda.transform(train_x)
            test_x = lda.transform(test_x)
        else:
            pca = PCA()
            pca.fit(train_x)
            # use the PCA-reduced features for the classifier
            train_x = pca.transform(train_x)
            test_x = pca.transform(test_x)

        '''classifier'''
        lr = LogisticRegression(solver='saga', n_jobs=4)
        lr.fit(train_x, train_y)
        score = lr.score(test_x, test_y)
        scores.append(score)
        print("accuracy on fold ", fold, " : ", score)

    mean = np.mean(scores)
    std = np.std(scores)
    print("mean accuracy : ", mean)
    print("standard deviation : ", std)
    return mean, std, scores
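# Hypothetical usage sketch for cross_validation above (not from the original source). It assumes
# the LDA / PCA / LogisticRegression / train_test_split names used in that function come from
# scikit-learn (e.g. LinearDiscriminantAnalysis imported as LDA) and that numpy is available.
from sklearn.datasets import load_digits

digits = load_digits()
mean_acc, std_acc, fold_scores = cross_validation(digits.data, digits.target,
                                                  folds=5, split_value=0.3, name="lda")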
def train_LDA():
    x_train, y_train, x_test, y_test = get_test_train()
    lda = LDA()
    lda.fit(x_train, y_train)
    y_prediction = np.zeros(len(y_test))
    for i, x in enumerate(x_test):
        y_prediction[i] = lda.predict(x)
    print(y_prediction)
    print(error_rate(y_prediction, y_test))
    y_train_pred = np.zeros(len(y_train))
    for i, x in enumerate(x_train):
        y_train_pred[i] = lda.predict(x)
    print(error_rate(y_train_pred, y_train))
def run_cm1_exp2_LDA():
    """
    Based on Jane's suggestion: evaluate the result on the impacted artifacts only
    :return:
    """
    cm1_exp2 = CM1_Experiment2(word_threshold=5)
    # cm1_exp2.select_word_all_doc_threshold()
    cm1_exp2.replace_words_in_target_artifacts()
    cm1_exp2.get_impacted_links()
    vsm = LDA(fo_lang_code="en")
    vsm.train(cm1_exp2.get_docs(), num_topics=20)
    vsm_replace = LDA(fo_lang_code="en")
    vsm_replace.train(cm1_exp2.get_docs(), num_topics=20)
    cm1_exp2.run_origin_model(vsm)
    cm1_exp2.run_replaced_model(vsm_replace)
    cm1_exp2.eval_and_compare()
def kFoldCrossValidate(dataset, classificationModel, numFolds=5, shuffle='off', alpha=0.0002, steps=10000):
    """
    numFolds = number of folds (default: 5)
    dataset = numpy array of the preprocessed dataset with last column = labels
    classificationModel = "LDA" or "LR"
    shuffle = 'on' or 'off' - if shuffle is 'on', shuffle rows using np.random.shuffle
    alpha = learning rate for logistic regression

    Split dataset into numFolds (default: 5) equal sections, train the model on the other
    numFolds - 1 (default: 4) folds, and return the average accuracy (as a float decimal)
    over numFolds folds.
    """
    if classificationModel != 'LDA' and classificationModel != 'LR':
        return -1  # error
    if shuffle == 'on':
        np.random.shuffle(dataset)
    foldsList = np.array_split(dataset, numFolds)  # np.array_split splits dataset into a list of numFolds folds
    totalAccuracy = 0
    for currentFoldIndex in range(numFolds):
        validationData = foldsList[currentFoldIndex]  # assign the current fold to validationData
        del foldsList[currentFoldIndex]  # remove the current fold from the list...
        trainingData = np.vstack(foldsList)  # vertically stack the remaining folds to build the training set
        foldsList.insert(currentFoldIndex, validationData)  # add it back at the same index to leave the list unchanged
        if classificationModel == 'LDA':
            model = LDA(data=trainingData)
            model.fit()
        elif classificationModel == 'LR':
            model = LogisticRegression(data=trainingData)
            model.fit(steps=steps, alpha=alpha)
        X_test = validationData[:, :-1]  # all but the last column of validationData
        y_predict = model.predict(X_test=X_test)
        y_test = validationData[:, -1][:, np.newaxis]  # last column of validationData
        totalAccuracy += utils.evaluate_acc(y_predict=y_predict, y_test=y_test)
    return totalAccuracy / numFolds
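# Hypothetical usage sketch for kFoldCrossValidate above (not from the original source). It
# assumes the LDA / LogisticRegression classes and the utils module referenced in that function
# are importable, and builds a small synthetic dataset whose last column holds binary labels.
import numpy as np

rng = np.random.RandomState(0)
features = rng.randn(100, 4)
labels = (features[:, 0] + features[:, 1] > 0).astype(float)[:, np.newaxis]
toy_dataset = np.hstack([features, labels])  # last column = labels, as the docstring requires
mean_acc = kFoldCrossValidate(toy_dataset, classificationModel='LDA', numFolds=5, shuffle='on')
print('mean 5-fold accuracy:', mean_acc)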
def LoadTextData(Country, gallery_id):
    S, Data = Load_GalLery_Textual_Data(Country, gallery_id)
    S1, Data1 = Load_GoogleVision_Labels(Country, gallery_id)
    labels = [Preprocessing(x['label']) for x in S1[0]]
    labels.append(Preprocessing(S1[1]))
    DocList = S[1]
    DocList.append(S[0])
    for s in S[2]:
        DocList.extend(s)
    data_lemmatized = PrepareData(DocList)
    lda_model, id2word, corpus = LDA(data_lemmatized, num_topics=20)  # len(labels))
    Topic_Words = Topics_Words(lda_model, num_words=len(labels))
    return Topic_Words, labels
def kfold(data, k, m):
    # shuffle the data and split it into k groups
    numpy.random.shuffle(data)
    groups = numpy.array_split(data, k, axis=0)
    # average the accuracy of k predictions
    acc = 0
    for i in range(k):
        val_set = groups[i][:, 0:-1]
        true_val = groups[i][:, -1]
        train_set = numpy.concatenate(groups[:i] + groups[i + 1:], axis=0)
        # build a model using the training set
        if m == Model.logreg:
            model = LogReg(train_set)
            w = model.fit(0.1, 100)
        elif m == Model.lda:
            model = LDA(train_set)
            w = model.fit(model.cvinv, model.u)
        r = model.predict(val_set, w)
        acc += evaluate_acc(r, true_val)
    acc /= k
    return acc
def __init__(self, data):
    self.data = data
    self.stopwords = nltk.corpus.stopwords.words('english')
    self.ws = WordShape()
    self.tokens = []  # collect every token across all documents
    # train and predict word2vec clustering
    print('training word2vec ...')
    for doc in data:
        for word in doc:
            self.tokens.append(word[0])
    w2v = word2vec(self.tokens, 5)
    w2v.train()
    self.w2v_dict = dict(zip(self.tokens, w2v.predict()))
    print('train brown clustering ...')
    brown_wrapper = BrownWrapper(data)
    self.brown_dict = brown_wrapper.get_brown_clustering()
    print('train LDA topic clustering ...')
    self.lda = LDA(self.tokens)
    print('\nextracting features ...\n')
def get_trained_model(self):
    if self.model_name == 'LDA':
        self.model = LDA(self.x_train, self.y_train, self.x_test, self.y_test)
        self.model.train_model()
    elif self.model_name == 'LR':
        self.model = LogisticReg(self.x_train, self.y_train, self.x_test, self.y_test)
        self.model.train_model()
    elif self.model_name == 'MLP':
        self.model = MultiLayerPerceptron(self.x_train, self.y_train, self.x_test, self.y_test)
        self.model.train_model()
    elif self.model_name == 'SVM':
        self.model = SupportVectorMachine(self.x_train, self.y_train, self.x_test, self.y_test)
        self.model.train_model()
    return self.model.get_model()
import numpy as np
from generate_data import gen_data
from LDA import LDA
import matplotlib.pyplot as plt

d1, d2, c1, c2 = gen_data(10)
w, mu1, mu2 = LDA(d1, d2)
w = w / np.linalg.norm(w)
# print(np.linalg.norm(w))
a = w[1] / w[0]

plt.figure(1)
plt.axis('equal')
x = np.linspace(-8, 8, 100)
y = a * x
plt.plot(x, y)

for x1, wx1 in zip(d1, d1.dot(w)):
    # print(wx1)
    y1 = wx1 * w
    plt.scatter(x1[0], x1[1], color='r')
    plt.scatter(y1[0], y1[1], color='g')
    plt.plot([x1[0], y1[0]], [x1[1], y1[1]], color='y', linestyle='--')

for x1, wx1 in zip(d2, d2.dot(w)):
    # print(wx1)
    y1 = wx1 * w
    plt.scatter(x1[0], x1[1], color='b')
    plt.scatter(y1[0], y1[1], color='skyblue')
    plt.plot([x1[0], y1[0]], [x1[1], y1[1]], color='y', linestyle='--')
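# Hypothetical sketch (not from the original source) of what the imported LDA(d1, d2) above
# presumably computes for the two-class case: Fisher's direction w proportional to
# Sw^{-1} (mu1 - mu2), where Sw is the within-class scatter matrix.
import numpy as np

def fisher_direction(d1, d2):
    mu1, mu2 = d1.mean(axis=0), d2.mean(axis=0)
    # within-class scatter: sum of the per-class scatter matrices
    sw = (d1 - mu1).T.dot(d1 - mu1) + (d2 - mu2).T.dot(d2 - mu2)
    w = np.linalg.solve(sw, mu1 - mu2)
    return w, mu1, mu2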
def lda():
    path_hasil = 'static/assets/LDA/hasil/'
    if request.method == 'POST':
        id_time = datetime.datetime.now().time().microsecond
        input_data_test_ke = (request.form['datatest_input'])
        pilih_metode = str(request.form['pilih_metode'])
        # print(pilih_metode, type(pilih_metode))
        data = ORL_face.data
        data_train = ORL_face.data_train
        data_test = ORL_face.data_test
        if input_data_test_ke != "SEMUA":
            data_test_ke = data_test[int(input_data_test_ke)]
            orang, pose = LDA(input_data_test_ke).calc_lda(data_test_ke, pilih_metode)
            temp_datatest = np.reshape(data_test_ke, (data.shape[2], data.shape[3]))
            result = Image.new("RGB", (92 * 2, 112))
            img_train = (Image.fromarray(data[orang[0] - 1][pose[0] - 1]))
            x = 0
            y = 0
            w, h = img_train.size
            # print('pos {0},{1} size {2},{3}'.format(x, y, w, h))
            result.paste(img_train, (x, y, x + w, y + h))
            img_test = (Image.fromarray(temp_datatest))
            x = (1) * 92
            y = (1) * 0
            w, h = img_test.size
            # print('pos {0},{1} size {2},{3}'.format(x, y, w, h))
            result.paste(img_test, (x, y, x + w, y + h))
            filename_img = str(id_time) + "_" + pilih_metode + "_" + str(1)
            result.save('static/assets/LDA/hasil/' + filename_img + '.png')
            hasil_keterangan = PCA.eval(None, int(input_data_test_ke), orang - 1)
            return render_template('LDA.html',
                                   query_path=input_data_test_ke,
                                   img_hasil=filename_img,
                                   keterangan=hasil_keterangan)
        elif input_data_test_ke == "SEMUA":
            temp_benar = 0
            temp_salah = 0
            temp_keterangan = []
            temp_img = []
            for input_data_test_ke in range((ORL_face.data_test.shape[0])):
                data_test_ke = data_test[int(input_data_test_ke)]
                orang, pose = LDA(input_data_test_ke).calc_lda(data_test_ke, pilih_metode)
                temp_datatest = np.reshape(data_test_ke, (data.shape[2], data.shape[3]))
                result = Image.new("RGB", (92 * 2, 112))
                img_train = (Image.fromarray(data[orang[0] - 1][pose[0] - 1]))
                x = 0
                y = 0
                w, h = img_train.size
                # print('pos {0},{1} size {2},{3}'.format(x, y, w, h))
                result.paste(img_train, (x, y, x + w, y + h))
                img_test = (Image.fromarray(temp_datatest))
                x = (1) * 92
                y = (1) * 0
                w, h = img_test.size
                # print('pos {0},{1} size {2},{3}'.format(x, y, w, h))
                result.paste(img_test, (x, y, x + w, y + h))
                filename_img = str(id_time) + "__" + pilih_metode + "_" + str(input_data_test_ke + 1)
                result.save('static/assets/LDA/hasil/' + filename_img + '.png')
                hasil_keterangan = PCA.eval(None, int(input_data_test_ke), orang - 1)
                temp_keterangan.append(hasil_keterangan)
                temp_img.append(filename_img)
                if hasil_keterangan == 'benar':
                    temp_benar += 1
                else:
                    temp_salah += 1
            hasil_akurasi = (temp_benar / (temp_benar + temp_salah)) * 100
            return render_template('LDA.html',
                                   query_path="SEMUA",
                                   len_data_test=ORL_face.data_test.shape[0],
                                   img=temp_img,
                                   akurasi=hasil_akurasi,
                                   keterangan=temp_keterangan)
    else:
        return render_template('LDA.html')
from LDA import LDA
from QDA import QDA
import scipy.io as sio
import csv

# problem 6d
data = sio.loadmat('spam/spam_data_2.mat')
train_X = data['training_data']
train_y = data['training_labels'][0]
test_X = data['test_data']

# validate_X = train_X[:5000, :]
# validate_y = train_y[:5000]
# train_X = train_X[5000:, :]
# train_y = train_y[5000:]

cls_lda = LDA(2, 104, 0.0001)
cls_lda.fit(train_X, train_y)
y_predict = cls_lda.predict(test_X)

# error = 0
# for i in range(validate_X.shape[0]):
#     error += (int(validate_y[i]) != int(y_predict[i]))
# error /= validate_X.shape[0]
# print(error)

with open('spam_predict.csv', 'wt') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerow(['Id', 'Category'])
    for i in range(y_predict.shape[0]):
        writer.writerow([i, int(y_predict[i][0])])
dump_file = (
    "/home1/roy/QGen/DGen/Dataset/enwiki-latest-pages-articles.xml.bz2"
)  # for generate_input_files
bow_path = (
    "/home1/roy/QGen/DGen/CDC/data/full_wiki_bow.mm"
)  # doc to [(word_id, count)..] mapping
dict_path = (
    "/home1/roy/QGen/DGen/CDC/data/full_wiki.dictionary"
)  # word_id to word mapping
model_file = (
    "/home1/roy/QGen/DGen/CDC/models/ldamodel_topics100_trainiter20_full_en.gensim"
)
num_topics = 100

id2word_dict = gensim.corpora.Dictionary.load(dict_path)
# print(id2word_dict.token2id.items()[:100])
lda = LDA()
debug = False
lda.load(model_file)
print("Load LDA model")
conceptualizer = Conceptualizer.Conceptualizer(lda)


def search_e_from_c(c, concept, k):
    """
    Find all entities under concept
    :param c: the database cursor
    :param concept: concept to be searched
    :param k: maximum number of entities to be generated
    :return: a sorted list containing (entity_name, frequency) pairs
    """
    cursor = c.execute(
label_test = np.zeros((ntest, 1))
for i in range(0, c):
    for j in range(0, n):
        x_train[:, n * i + j] = np.reshape(t[:, j, i], (d, 1))
        label_train[n * i + j] = i
    for j in range(0, (21 - n)):
        x_test[:, (21 - n) * i + j] = np.reshape(t[:, n + j, i], (d, 1))
        label_test[(21 - n) * i + j] = i

''' dimensionality reduction or original data '''
if opt == 1:
    x_train, x_test = x_train, x_test
elif opt == 2:
    x_train, x_test = PCA(x_train, x_test)
    d, _ = x_train.shape
elif opt == 3:
    x_train, x_test = LDA(x_train, x_test, c, n)
    d, _ = x_train.shape

''' estimating mean and variance from train data '''
mean_train = np.matrix(np.zeros((d, c)), dtype=complex)
for k in range(0, c):
    for l in range(0, n):
        mean_train[:, k] = mean_train[:, k] + x_train[:, n * (k) + l]
    mean_train[:, k] = (1 / n) * mean_train[:, k]

cov_train = np.zeros((d, d, c), dtype=complex)
cov_inv = np.zeros((d, d, c), dtype=complex)
for a in range(0, c):
    for b in range(0, n):
        cov_train[:, :, a] = cov_train[:, :, a] + (
            (x_train[:, n * (a) + b] - mean_train[:, a]) *
def train_lad():
    lda = LDA()
    sentences = ReadFile.readTXTFile(config.BTMData + "btm_text_corpus.txt")
    # line = LineSetence(sentences=sentences)
    lda.buildModel(lines(sentences), num_topics=21)
def start(args):
    if args.start == "Cold":
        tdm = TwintDataMiner()
        attributes = ["id", "tweet", "place", "date"]
        before_df = get_before_tweets(attributes=attributes, tdm=TwintDataMiner())
        after_df = get_after_tweets(attributes=attributes, tdm=TwintDataMiner())
        print(before_df.shape)
        print(before_df.head)
        print(after_df.shape)
        print(after_df.head)
        before_df.drop_duplicates(subset='id')
        print(before_df.shape)
        print("----")
        after_df.drop_duplicates(subset='id')
        print(after_df.shape)

        # LDA declarations
        lda_before_orig = LDA()
        lda_after_orig = LDA()
        lda_before_bow = LDA()
        lda_after_bow = LDA()

        # Personalised BOW
        print("Personalised BOW")
        lemmatised_vocab = pd.DataFrame()
        lemmatised_vocab['column'] = [final_bow]
        lda_after_bow.dict_from_vocab(doc=lemmatised_vocab.iloc[0])
        lda_before_bow.dict_from_vocab(doc=lemmatised_vocab.iloc[0])
        print("BOW created")

        print("Processing docs...")
        before_docs = process_docs(before_df, lda_before_bow)
        print("Processed before docs.")
        after_docs = process_docs(after_df, lda_after_bow)
        print("Processed after docs.")

        # Original
        lda_before_orig.dict_from_vocab(doc=before_docs)
        lda_after_orig.dict_from_vocab(doc=after_docs)

        # Store before and after docs
        store_as_pickle("before_docs", before_docs)
        store_as_pickle("after_docs", after_docs)

        # Store lda objects
        store_as_pickle("lda_before_orig", lda_before_orig)
        store_as_pickle("lda_after_orig", lda_after_orig)
        store_as_pickle("lda_before_bow", lda_before_bow)
        store_as_pickle("lda_after_bow", lda_after_bow)

        # calculate corpus'
        bow_corpus_before_bow = lda_before_bow.generateBagOfWords(docs=before_docs)
        bow_corpus_after_bow = lda_after_bow.generateBagOfWords(docs=after_docs)
        bow_corpus_before_orig = lda_before_orig.generateBagOfWords(docs=before_docs)
        bow_corpus_after_orig = lda_after_orig.generateBagOfWords(docs=after_docs)

        # store corpus'
        store_as_pickle(name="bow_corpus_before_bow", obj=bow_corpus_before_bow)
        store_as_pickle(name="bow_corpus_after_bow", obj=bow_corpus_after_bow)
        store_as_pickle(name="bow_corpus_before_orig", obj=bow_corpus_before_orig)
        store_as_pickle(name="bow_corpus_after_orig", obj=bow_corpus_after_orig)

        # calculate tfidfs
        corpus_tfidf_before_orig = lda_before_orig.tf_idf(bow_corpus=bow_corpus_before_orig)
        corpus_tfidf_after_orig = lda_after_orig.tf_idf(bow_corpus=bow_corpus_after_orig)
        corpus_tfidf_before_bow = lda_before_bow.tf_idf(bow_corpus=bow_corpus_before_bow)
        corpus_tfidf_after_bow = lda_after_bow.tf_idf(bow_corpus=bow_corpus_after_bow)

        # store tfidfs
        store_as_pickle(name="corpus_tfidf_before_orig", obj=corpus_tfidf_before_orig)
        store_as_pickle(name="corpus_tfidf_after_orig", obj=corpus_tfidf_after_orig)
        store_as_pickle(name="corpus_tfidf_before_bow", obj=corpus_tfidf_before_bow)
        store_as_pickle(name="corpus_tfidf_after_bow", obj=corpus_tfidf_after_bow)

    elif args.start == "Warm":
        # get before and after docs
        before_docs = get_pickle_object(name="before_docs")
        after_docs = get_pickle_object(name="after_docs")
        print("Docs loaded")

        # get lda objects
        lda_before_orig = get_pickle_object(name="lda_before_orig")
        lda_after_orig = get_pickle_object(name="lda_after_orig")
        lda_before_bow = get_pickle_object(name="lda_before_bow")
        lda_after_bow = get_pickle_object(name="lda_after_bow")
        print("LDAs loaded")

        # get corpus'
        bow_corpus_before_bow = get_pickle_object(name="bow_corpus_before_bow")
        bow_corpus_after_bow = get_pickle_object(name="bow_corpus_after_bow")
        bow_corpus_before_orig = get_pickle_object(name="bow_corpus_before_orig")
        bow_corpus_after_orig = get_pickle_object(name="bow_corpus_after_orig")
        print("Corpus' loaded")

        # get tfidfs
        corpus_tfidf_before_orig = get_pickle_object(name="corpus_tfidf_before_orig")
        corpus_tfidf_after_orig = get_pickle_object(name="corpus_tfidf_after_orig")
        corpus_tfidf_before_bow = get_pickle_object(name="corpus_tfidf_before_bow")
        corpus_tfidf_after_bow = get_pickle_object(name="corpus_tfidf_after_bow")
        print("TFIDF' loaded\n\n")

        alpha_value = 0.025
        for i in range(1, 4):
            passes = 10
            alpha_value *= 2
            print(f"passes={passes}, alpha: {alpha_value}\n")

            print("--------------Original--------------")
            for no_topics in range(3, 5):
                alpha = [alpha_value] * no_topics
                print("Before")
                before_orig = train_lda(no_topics, corpus_tfidf_before_orig, lda_before_orig,
                                        passes=passes, alpha=alpha)
                topics = before_orig.show_topics(formatted=False)
                plot(model=before_orig, docs=before_docs, type_model="orig", date="before",
                     passes=passes, height=0.02, alpha=alpha)
                # t_SNE(lda_model=before_orig, corpus=corpus_tfidf_before_orig, type_model="orig", topics=topics, date="before", passes=passes)
                print("------------ ------------")
                print("After")
                after_orig = train_lda(no_topics, corpus_tfidf_after_orig, lda_after_orig,
                                       passes=passes, alpha=alpha)
                topics = after_orig.show_topics(formatted=False)
                plot(model=after_orig, docs=after_docs, type_model="orig", date="after",
                     passes=passes, height=0.02, alpha=alpha)
                # t_SNE(lda_model=after_orig, corpus=corpus_tfidf_after_orig, type_model="orig", topics=topics, date="after", passes=passes)
                print("\n\n")
                store_as_pickle(name=f"after-orig_passes={passes}", obj=after_orig)
                store_as_pickle(name=f"before-orig_passes={passes}", obj=before_orig)

            print("--------------BOW--------------")
            for no_topics in range(2, 5):
                alpha = [alpha_value] * no_topics
                print("Before")
                before_bow = train_lda(no_topics, corpus_tfidf_before_bow, lda_before_bow,
                                       passes=passes, alpha=alpha)
                topics = before_bow.show_topics(formatted=False)
                plot(model=before_bow, docs=before_docs, type_model="bow", date="before",
                     passes=passes, alpha=alpha)
                # t_SNE(lda_model=before_bow, corpus=corpus_tfidf_before_bow, type_model="bow", topics=topics, date="before", passes=passes)
                print("------------ ------------")
                print("After")
                after_bow = train_lda(no_topics, corpus_tfidf_after_bow, lda_after_bow,
                                      passes=passes, alpha=alpha)
                topics = after_bow.show_topics(formatted=False)
                plot(model=after_bow, docs=after_docs, type_model="bow", date="after",
                     passes=passes, alpha=alpha)
                # t_SNE(lda_model=after_bow, corpus=corpus_tfidf_after_bow, type_model="bow", topics=topics, date="after", passes=passes)
                print("\n\n")
                store_as_pickle(name=f"after-bow_passes={passes}", obj=after_bow)
                store_as_pickle(name=f"before-bow_passes={passes}", obj=before_bow)
from LDA import LDA
from classifier import Classifier
from tkinter import *
from tkinter import ttk
from tkinter.filedialog import askopenfilename
from tkinter import messagebox

labels = [
    'Chinh tri Xa hoi', 'Doi song', 'Khoa hoc', 'Kinh doanh', 'Phap luat',
    'Suc khoe', 'The gioi', 'The thao', 'Van hoa', 'Vi tinh'
]
lda = LDA()
root = Tk()
classifier = Classifier(type_model='SVM')
Title = root.title("File Opener")
label = ttk.Label(root, text="Text Classification", font=("Helvetica", 16))
label.pack()


# Menu Bar
def OpenFile():
    name = askopenfilename(initialdir="./data/Test_Full/",
                           filetypes=(("Text File", "*.txt"), ("All Files", "*.*")),
                           title="Choose a file.")
    topic_vec = lda.cluster(name)
    result = classifier.predict(topic_vec)
# shuffle the data & labels
idx = NP.arange(data.shape[0])
NP.random.shuffle(idx)
data = data[idx, ]
labels = labels[idx]

# set number of dimensions in rescaled data
dim_rescale = 3

# {'London': 0, 'Austen': 1, 'Milton': 2, 'Shakespeare': 3}
ndx0 = labels == 0
ndx1 = labels == 1
ndx2 = labels == 2
ndx3 = labels == 3

rescaled_data, w = LDA(data, labels, dim_rescale)

assert NP.sum(ndx0) + NP.sum(ndx1) + NP.sum(ndx2) + NP.sum(ndx3) == data.shape[0]

class0 = rescaled_data[ndx0, ]
class1 = rescaled_data[ndx1, ]
class2 = rescaled_data[ndx2, ]
class3 = rescaled_data[ndx3, ]

# ----------------------- plotting ----------------------- #
x0, y0, z0 = data[:, 0], data[:, 1], data[:, 2]
x1, y1, z1 = class0[:, 0], class0[:, 1], class0[:, 2]
x2, y2, z2 = class1[:, 0], class1[:, 1], class1[:, 2]
x3, y3, z3 = class2[:, 0], class2[:, 1], class2[:, 2]
start_time = time.time()

MODEL_BASE_DIR = 'F:/Not_Uploaded/conceptualization_eval/models/'
EVALUATION_DATASET = '../ratings.txt'
MODEL_FILE_EXTENSION = '.gensim'

model_names = [
    'ldamodel_simple_mallet_20_10_keep_300000_gensimstop_topics100',
    'ldamodel_topics100_trainiter20_en_noStopWords',
    'ldamodel_topics100_trainiter20_full_en',
    'ldamodel_topics100_trainiter20_train_en',
]

model_stat = []
eval_set = get_eval_set(EVALUATION_DATASET)

for model_name in model_names:
    print("\nTest", model_name, ':')
    lda = LDA()
    lda.load(MODEL_BASE_DIR + model_name + MODEL_FILE_EXTENSION)
    conceptualizer = Conceptualizer(lda)
    none_counter = 0
    estimated_similarities = []
    real_similarities = []
    for element in eval_set:
        estimated_similarity = evaluate(element[1], element[2], element[3], element[4])
        if estimated_similarity is None:
            none_counter += 1
            continue
        estimated_similarities.append(float(estimated_similarity[0][0]))