def create_topic():
    # Read the corpus: one document per line
    corpus = []
    for line in open(documentfile, 'r').readlines():
        corpus.append(line.strip())

    # Convert the texts into a term-frequency matrix; element a[i][j] is the
    # frequency of word j in document i
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    X = X.toarray()
    print(X.shape)

    # LDA
    model = lda.LDA(n_topics=5, n_iter=500, random_state=1)
    model.fit(np.asanyarray(X))
    topic_word = model.topic_word_
    # print(topic_word)

    # n_top_words = 8
    # for i, topic_dist in enumerate(topic_word):
    #     topic_words = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
    #     print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    # Document-topic distribution
    doc_topic = model.doc_topic_
    print("type(doc_topic): {}".format(type(doc_topic)))
    print("shape: {}".format(doc_topic.shape))

    # Print the most likely topic for the first 10 documents
    label = []
    for n in range(10):
        topic_most_pr = doc_topic[n].argmax()
        label.append(topic_most_pr)
        print("doc: {} topic: {}".format(n, topic_most_pr))

    # Plot the topic-word distributions for the first two topics
    f, ax = plt.subplots(2, 1, figsize=(6, 6), sharex=True)
    for i, k in enumerate([0, 1]):  # two topics
        ax[i].stem(topic_word[k, :], linefmt='b-', markerfmt='bo', basefmt='w-')
        ax[i].set_xlim(-2, 20)
        ax[i].set_ylim(0, 1)
        ax[i].set_ylabel("Prob")
        ax[i].set_title("topic {}".format(k))
    ax[1].set_xlabel("word")
    plt.tight_layout()
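# Illustrative sketch (not part of the original create_topic function): the same
# CountVectorizer -> lda.LDA pipeline on a toy corpus, with the top-words-per-topic
# loop that is left commented out above filled in. Assumes the `lda` package and
# scikit-learn are installed; the toy corpus and topic/word counts are made up.
import numpy as np
import lda
from sklearn.feature_extraction.text import CountVectorizer

toy_corpus = ["apple banana apple", "banana cherry banana", "cherry apple cherry"]
toy_vectorizer = CountVectorizer()
toy_X = toy_vectorizer.fit_transform(toy_corpus).toarray()

toy_model = lda.LDA(n_topics=2, n_iter=100, random_state=1)
toy_model.fit(toy_X)

toy_vocab = np.array(toy_vectorizer.get_feature_names())  # get_feature_names_out() on newer sklearn
n_top_words = 2
for i, topic_dist in enumerate(toy_model.topic_word_):
    top_words = toy_vocab[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(i, ' '.join(top_words)))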
def test_lda_zero_iter(self):
    dtm = self.dtm
    model = self.model
    doc_topic = self.doc_topic
    n_topics = self.n_topics
    random_seed = self.random_seed

    # fit a new model with 0 iterations
    n_iter = 0
    model_new = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=random_seed)
    doc_topic_new = model_new.fit_transform(dtm)
    self.assertIsNotNone(model_new)
    self.assertIsNotNone(doc_topic_new)
    self.assertLess(model_new.loglikelihood(), model.loglikelihood())
    self.assertFalse((doc_topic_new == doc_topic).all())
def train_lda_model(self, n_topics):
    """
    Train an LDA model.
    :param n_topics: number of topics
    :return: words_in_topic (the word distribution within each topic) and the
             model perplexity
    """
    model = lda.LDA(n_topics=n_topics, n_iter=self.n_iter, random_state=1)
    model.fit(self.vsm_model)  # fit on the vector-space (document-term) model
    topic_word = model.topic_word_
    loglikelihood = model.loglikelihoods_
    perplexity = loglikelihood.pop() * (-1.0) / len(self.vocabulary) * self.n_topics

    n_top_words = self.n_top_words  # number of top words to keep per topic
    words_in_topic = dict()
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(self.vocabulary)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        words_in_topic[i] = topic_words
    return words_in_topic, perplexity
def test_get_topic_word_relevance(dtm, n_topics, lambda_):
    dtm = np.array(dtm)
    if dtm.sum() == 0:  # assure that we have at least one word in the DTM
        dtm[0, 0] = 1

    model = lda.LDA(n_topics, 1)
    model.fit(dtm)

    doc_lengths = tmtoolkit.bow.bow_stats.get_doc_lengths(dtm)
    rel_mat = model_stats.get_topic_word_relevance(model.topic_word_, model.doc_topic_,
                                                   doc_lengths, lambda_)

    assert rel_mat.shape == (n_topics, dtm.shape[1])
    assert all(isinstance(x, float) and not np.isnan(x) for x in rel_mat.flatten())
def ldaModel(texts, topics, iters, nWords, documents):
    #vocab, dtm = getVocab(texts, documents)
    bi_dtm, bi_reducedTextDic, bi_vocab = bagOfWords(texts, documents, True, 0, False, False)
    uni_dtm, uni_reducedTextDic, uni_vocab = bagOfWords(texts, documents, False, 0, False, False)

    dtm = []
    for sub in range(len(uni_dtm)):
        dtm.append(np.concatenate((uni_dtm[sub], bi_dtm[sub])))
    dtm = np.asarray(dtm)
    vocab = uni_vocab + bi_vocab

    dtm = bi_dtm
    vocab = bi_vocab

    # limit to those that appear in TDs at least once?
    mean_occ = np.mean(dtm, axis=0)
    cleaned_dtm = []
    for sub in range(len(dtm)):
        temp_cleaned = []
        for occ in range(len(mean_occ)):
            if mean_occ[occ] >= 0:
                temp_cleaned.append(dtm[sub][occ])
        cleaned_dtm.append(np.asarray(temp_cleaned))
    cleaned_dtm = np.asarray(cleaned_dtm)
    dtm = cleaned_dtm

    model = lda.LDA(n_topics=topics, n_iter=iters, random_state=1)
    model.fit(dtm)
    topic_word = model.topic_word_
    n_top_words = nWords

    topic_words = {}
    for i, topic_dist in enumerate(topic_word):
        topic_words[i] = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words[i])))

    doc_topic = model.doc_topic_
    for i in range(len(texts)):
        print("{} (top topic: {})".format(texts[i], doc_topic[i].argmax()))

    probs = np.array(doc_topic)
    meanProbs = np.mean(probs, axis=0)

    return topic_words, meanProbs, probs, vocab, dtm
def main():
    print 'start getting vocabulary'
    if os.path.exists('../data/index_to_word.json') and os.path.exists('../data/word_to_index.json'):
        index_to_word, word_to_index = load_word_index()
    else:
        init_word_index()
        index_to_word, word_to_index = load_word_index()
    print 'finish getting vocabulary'

    print 'start getting doc size'
    doc_num = doc_count()
    print 'finish getting doc size'

    word_num = len(index_to_word)

    print 'start generating train data'
    X = sparse.lil_matrix((doc_num, word_num), dtype=np.int32)
    # with open('../data/arxiv_word_category_nltk.csv', 'rb') as fin:
    #     fin.readline()
    #     reader = csv.reader(fin)
    #     for i, (paper_id, words, category) in enumerate(reader):
    #         words = json.loads(words)
    #         for w in words:
    #             w = w.lower()
    #             if w in word_to_index:
    #                 X[i, word_to_index[w]] += 1
    with open('../data/arxiv_categories_words_fasttext.txt', 'rb') as fin:
        for i, line in enumerate(fin):
            words = line.split()
            for w in words:
                if not w.startswith('__label__'):
                    w = w.lower()
                    if w in word_to_index:
                        X[i, word_to_index[w]] += 1
    print 'finish generating train data'

    print 'start training'
    model = lda.LDA(n_topics=50)
    model.fit(X)
    print 'finish training'

    print 'start saving result'
    np.save('topic_word.np', model.topic_word_)
    np.save('doc_topic.np', model.doc_topic_)
    print 'finish saving result'
def lda_generate_model(headlines, bodies):
    X = []
    # get train data
    templist = []
    # clean_headlines = []
    # clean_bodies = []
    # for headline in headlines:
    #     clean_headlines.append(clean(headline))
    # for body in bodies:
    #     clean_bodies.append(clean(body))
    clean_headlines = headlines
    clean_bodies = bodies

    # get test data
    test_stances = test.test
    test_dataset = data
    test_headlines, test_bodies = [], []
    for stance in test_stances:
        # test has not been cleaned
        test_headlines.append(stance['Headline'])
        test_bodies.append(test_dataset.body[stance['Body ID']])

    # add train & test
    clean_headlines = list(set(clean_headlines))
    clean_bodies = list(set(clean_bodies))
    test_headlines = list(set(test_headlines))
    templist = clean_headlines + clean_bodies + test_headlines

    cv = CountVectorizer()
    cv_fit = cv.fit_transform(templist)
    cv_fit = cv_fit.toarray()

    model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
    model.fit(cv_fit)

    # get lda dict
    vec_dict = OrderedDict()
    doc_topic = model.doc_topic_
    for i in range(len(templist)):
        vec_dict[templist[i]] = doc_topic[i]

    print("lda_generate_model complete!")
    return vec_dict
def infer_topics(self, num_topics=10, algorithm='variational', **kwargs):
    self.nb_topics = num_topics
    lda_model = None
    topic_document = None
    if algorithm == 'variational':
        # scikit-learn's LatentDirichletAllocation (newer sklearn versions name
        # this parameter n_components rather than n_topics)
        lda_model = LDA(n_topics=num_topics, learning_method='batch')
        topic_document = lda_model.fit_transform(self.corpus.sklearn_vector_space)
    elif algorithm == 'gibbs':
        lda_model = lda.LDA(n_topics=num_topics, n_iter=500)
        topic_document = lda_model.fit_transform(self.corpus.sklearn_vector_space)
    else:
        raise ValueError(
            "algorithm must be either 'variational' or 'gibbs', got '%s'" % algorithm)

    self.topic_word_matrix = []
    self.document_topic_matrix = []
    vocabulary_size = len(self.corpus.vocabulary)

    row = []
    col = []
    data = []
    for topic_idx, topic in enumerate(lda_model.components_):
        for i in range(vocabulary_size):
            row.append(topic_idx)
            col.append(i)
            data.append(topic[i])
    self.topic_word_matrix = coo_matrix(
        (data, (row, col)),
        shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr()

    row = []
    col = []
    data = []
    doc_count = 0
    for doc in topic_document:
        topic_count = 0
        for topic_weight in doc:
            row.append(doc_count)
            col.append(topic_count)
            data.append(topic_weight)
            topic_count += 1
        doc_count += 1
    self.document_topic_matrix = coo_matrix(
        (data, (row, col)),
        shape=(self.corpus.size, self.nb_topics)).tocsr()
def lda_reduction(dataArray, k, get="feature-latent"):
    #print ("dataArray", dataArray.shape)
    sparseDataArray = lil_matrix(dataArray)
    model = lda.LDA(n_topics=k, n_iter=200)
    model.fit(sparseDataArray)  # model.fit_transform(X) is also available
    topic_word = model.topic_word_  # model.components_ also works
    doc_topic = model.doc_topic_
    # print ("topic_word", topic_word.shape)
    # print ("doc_topic", doc_topic.shape)
    if get == "feature-latent":
        #print ("topic_word:", topic_word.shape)
        #print ("doc_topic:", doc_topic.shape)
        #return topic_word
        return np.matmul(dataArray.transpose(), doc_topic)
    else:
        return np.matmul(dataArray, topic_word.transpose())
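# Hedged usage sketch for lda_reduction() above (illustrative only): project a
# small synthetic count matrix into k latent topics. Assumes the numpy, scipy
# (lil_matrix) and `lda` imports used inside lda_reduction() are available at
# module level; the matrix sizes below are made up.
import numpy as np

toy_counts = np.random.randint(1, 5, size=(20, 30))           # 20 docs x 30 terms, no empty rows
feature_latent = lda_reduction(toy_counts, k=3)                # expected shape: (30, 3)
doc_latent = lda_reduction(toy_counts, k=3, get="doc-latent")  # expected shape: (20, 3)
print(feature_latent.shape, doc_latent.shape)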
def test_save_load_ldamodel_pickle():
    pfile = 'tests/data/test_pickle_unpickle_ldamodel.pickle'

    dtm = np.array([[0, 1], [2, 3], [4, 5], [6, 0]])
    doc_labels = ['doc_' + str(i) for i in range(dtm.shape[0])]
    vocab = ['word_' + str(i) for i in range(dtm.shape[1])]

    model = lda.LDA(2, n_iter=1)
    model.fit(dtm)

    lda_utils.common.save_ldamodel_to_pickle(pfile, model, vocab, doc_labels)

    unpickled = lda_utils.common.load_ldamodel_from_pickle(pfile)

    assert np.array_equal(model.doc_topic_, unpickled['model'].doc_topic_)
    assert np.array_equal(model.topic_word_, unpickled['model'].topic_word_)
    assert vocab == unpickled['vocab']
    assert doc_labels == unpickled['doc_labels']
def lda_location():
    corpus = position_list
    # Convert the texts into a term-frequency matrix: element a[i][j] is the
    # frequency of word j in document i. sklearn's CountVectorizer tokenizes
    # the documents for us.
    vectorizer = CountVectorizer()
    # Count the frequency of each word in each row
    x = vectorizer.fit_transform(corpus)
    # toarray() and todense() give the same result here: the word counts laid
    # out according to the vocabulary
    weight = x.toarray()

    model = lda.LDA(n_topics=10, n_iter=500, random_state=1)
    model.fit(numpy.asarray(weight))  # model.fit_transform(X) is also available

    # Document-topic distribution
    doc_topic = model.doc_topic_
    a = doc_topic
    # Save the resulting document-topic distribution
    numpy.savetxt('C:/Users/MyPC/Desktop/doc_location.csv', a, delimiter=',')
def lda_topic_models(self, num_topics, num_iter, min_occ, docs):
    """ Extract LDA topic models """
    cvectorizer = CountVectorizer(min_df=min_occ, stop_words="english")
    cvz = cvectorizer.fit_transform(docs)

    lda_model = lda.LDA(n_topics=num_topics, n_iter=num_iter)
    X_topics = lda_model.fit_transform(cvz)

    _lda_keys = []
    for i in xrange(X_topics.shape[0]):
        _lda_keys.append(X_topics[i].argmax())

    topic_summaries = []
    topic_word = lda_model.topic_word_  # all topic words
    n_top_words = 5
    vocab = cvectorizer.get_feature_names()
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]  # get!
        topic_summaries.append(' '.join(topic_words))

    return topic_summaries
def getItemTopic(trainset):
    item_review_df = getItemReview(trainset)
    doc_clean_set = [doc_clean(doc) for doc in item_review_df['reviews']]
    corpus, diction = getDict(doc_clean_set, len(diction))
    X = sparse2dense(corpus, diction)

    model = lda.LDA(n_topics=5, n_iter=20, random_state=1)
    model.fit(X)
    doc_topic = model.doc_topic_
    doc_topic_df = pd.DataFrame(doc_topic)

    # Build the item -> topic-vector dictionary
    item_id_l = list(item_review_df['item'])
    doc_topic_l = doc_topic_df.values.tolist()
    item_vector_dict = dict(zip(item_id_l, doc_topic_l))
    return item_vector_dict
def fitLDA(self, nTopics, nTopWords):
    # Fit LDA model
    topicsList = []
    tdm = textmining.TermDocumentMatrix(
        tokenizer=textmining.simple_tokenize_remove_stopwords)
    for index, row in self.typeData.iterrows():
        if isinstance(row["Title/Description"], basestring):
            tdm.add_doc(row["Title/Description"])

    temp = list(tdm.rows(cutoff=1))
    vocab = tuple(temp[0])
    X = np.array(temp[1:])

    self.model = lda.LDA(n_topics=nTopics, n_iter=500, random_state=1)
    self.model.fit_transform(X)

    topicWord = self.model.topic_word_  # model.components_ also works
    topWords = nTopWords
    for i, topic_dist in enumerate(topicWord):
        topicWords = np.array(vocab)[np.argsort(topic_dist)][:-topWords:-1]
        topicsList.append(topicWords)
    return topicsList
def test_get_marginal_topic_distrib(dtm, n_topics):
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    if dtm.sum() == 0:  # assure that we have at least one word in the DTM
        dtm[0, 0] = 1

    model = lda.LDA(n_topics, 1)
    model.fit(dtm)

    doc_lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    marginal_topic_distr = model_stats.marginal_topic_distrib(model.doc_topic_, doc_lengths)

    assert marginal_topic_distr.shape == (n_topics,)
    assert np.isclose(marginal_topic_distr.sum(), 1.0)
    assert all(0 <= v <= 1 for v in marginal_topic_distr)
def test_generate_topic_labels_from_top_words(dtm, n_topics, lambda_):
    dtm = np.array(dtm)
    if dtm.sum() == 0:  # assure that we have at least one word in the DTM
        dtm[0, 0] = 1

    model = lda.LDA(n_topics, 1)
    model.fit(dtm)

    vocab = np.array([chr(65 + i) for i in range(dtm.shape[1])])  # this only works for few words
    doc_lengths = tmtoolkit.bow.bow_stats.get_doc_lengths(dtm)

    topic_labels = model_stats.generate_topic_labels_from_top_words(
        model.topic_word_, model.doc_topic_, doc_lengths, vocab, lambda_=lambda_)
    assert isinstance(topic_labels, list)
    assert len(topic_labels) == n_topics

    for i, l in enumerate(topic_labels):
        assert isinstance(l, six.string_types)
        parts = l.split('_')
        assert len(parts) >= 2
        assert int(parts[0]) == i + 1
        assert all(w in vocab for w in parts[1:])

    topic_labels_2 = model_stats.generate_topic_labels_from_top_words(
        model.topic_word_, model.doc_topic_, doc_lengths, vocab, lambda_=lambda_, n_words=2)
    assert isinstance(topic_labels_2, list)
    assert len(topic_labels_2) == n_topics

    for i, l in enumerate(topic_labels_2):
        assert isinstance(l, six.string_types)
        parts = l.split('_')
        assert len(parts) == 3
        assert int(parts[0]) == i + 1
        assert all(w in vocab for w in parts[1:])
def run_tm_and_dump(cand_year, files):
    db = get_cands_data('thesis_db.xls', DATA_LEN)
    engLines = get_translated_text("Translated_text.txt")
    engLines = engLines[:DATA_LEN]

    index = 0
    reviewed_cands = []
    cand_ids = []
    index2cand = {}
    run_text = []
    errors = [0, 0, 0, 0, 0, 0]
    for line in engLines:
        this_cand_id = db.ID_coded[index]
        if this_cand_id not in reviewed_cands:
            reviewed_cands.append(this_cand_id)
            cand = get_cand(db, engLines, index, [cand_year], errors)
            if cand is not None:
                run_text.append(line)
                cand_ids.append(cand.id)
                index2cand[index] = cand
        index = index + 1

    lem_text = get_data_lemmatized(run_text)
    id2word, corpus = text2corpus(lem_text)
    X2 = corpus2nparray(corpus, id2word)

    users, words, cands = get_users_and_words(db, engLines, cand_year, files[0])
    X = get_np_array(db, engLines, users, words, cand_year)

    model = lda.LDA(n_topics=N, n_iter=ITERATIONS, random_state=1)
    model.fit(X2)  # model.fit_transform(X) is also available
    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 8
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(words)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        topic_str = 'Topic {}: {}'.format(i, ' '.join(topic_words))
        print(topic_str)

    #dump_tm(db, model.doc_topic_, users, cands, files)
    rr = 6
def lda_model(doc, topic, iterator=500):
    # Returns the vocabulary list and the trained LDA model
    word_set = set()
    print("Building the document-word matrix")
    # First build the vocabulary
    for d in doc:
        document = doc[d]
        document_word_list = document.split(" ")
        for w in document_word_list:
            word_set.add(w)

    # Build the document matrix
    N = len(doc)
    V = len(word_set)
    data = []
    word_list = list(word_set)
    for d in doc:
        """
        Store each word that occurs in the document as a (word index, count)
        tuple. N is the number of documents, V the vocabulary size; the tuples
        are later expanded into an N*V numpy matrix.
        """
        document = doc[d]
        document_word_list = document.split(" ")
        simple_list = []
        # Count of each vocabulary word in this document
        for i in range(len(word_list)):
            c = document_word_list.count(word_list[i])
            if c > 0:
                simple_list.append((i, c))
        data.append(tuple(simple_list))

    # Build the matrix
    dtm = np.zeros((N, V), dtype=np.intc)
    for i, doc in enumerate(data):
        for v, cnt in doc:
            np.testing.assert_equal(dtm[i, v], 0)  # sanity check: entry not set yet
            dtm[i, v] = cnt

    print("Training the LDA model")
    model = lda.LDA(n_topics=topic, n_iter=iterator, random_state=1)
    model.fit(dtm)
    return word_list, model
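# Hedged usage sketch for lda_model() above (illustrative only): a tiny document
# dict with space-separated tokens, which is the input format the function
# expects. Assumes numpy (np) and the `lda` package are importable, as in the
# function itself; the documents and topic count are made up.
toy_docs = {
    "d1": "apple banana apple",
    "d2": "banana cherry banana",
    "d3": "cherry apple cherry",
}
toy_word_list, toy_lda = lda_model(toy_docs, topic=2, iterator=100)
print(toy_word_list)
print(toy_lda.doc_topic_.shape)  # expected: (3, 2)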
def score(self, source_corpus, target_corpus, weighting=None, pool=None):
    start = time.time()
    self.vector_extractor.estimate_idf(source_corpus, target_corpus)
    print "IDF estimation took %s seconds" % (time.time() - start)

    start = time.time()
    if self.lda_dim > 0:
        import lda
        doc_matrix = self.vector_extractor.extract(source_corpus + target_corpus)
        lda_model = lda.LDA(n_topics=self.lda_dim, n_iter=1500, random_state=1, refresh=200)
        lda_model.fit(doc_matrix.astype(int))
        del doc_matrix

    source_matrix = self.vector_extractor.extract(source_corpus)
    target_matrix = self.vector_extractor.extract(target_corpus)

    if self.lda_dim > 0:
        source_matrix = source_matrix.astype(int)
        target_matrix = target_matrix.astype(int)
        source_matrix = lda_model.transform(source_matrix)
        target_matrix = lda_model.transform(target_matrix)

    print "Extraction took %s seconds" % (time.time() - start)
    print "Nonzero source: ", len(source_matrix.nonzero()[0])
    print "Nonzero target: ", len(target_matrix.nonzero()[0])
    print "< 0 source: ", type(source_matrix).sum(source_matrix < 0)
    print "< 0 target: ", type(target_matrix).sum(target_matrix < 0)

    start = time.time()
    del self.vector_extractor

    n_jobs = 1
    if pool is not None:
        n_jobs = len(pool._pool)
    sys.stderr.write("Scoring using %s and %d jobs\n" % (self.metric, n_jobs))
    d = 1 - pairwise_distances(
        source_matrix, target_matrix, metric=self.metric, n_jobs=n_jobs)
    # should not happen: tf-idf entries are not negative
    print "< 0 d: ", np.sum(d < 0)
    print "Scoring took %s seconds" % (time.time() - start)
    return d
def _test_LDA(l, path1, file='', data_samples=[], term=0):
    n_topics = 10
    n_top_words = 10
    if term == 7:
        n_top_words = 10
    elif term == 50:
        n_top_words = 100
    elif term == 100:
        n_top_words = 1000
    elif term == 200:
        n_top_words = 10000
    elif term == 400:
        n_top_words = 10000

    fileB = []
    fileB.append(file)
    #filepath = '/home/amrit/GITHUB/Pits_lda/dataset/'
    topics = []
    for j, file1 in enumerate(fileB):
        for i in range(10):
            #data_samples = readfile1(filepath + str(file1))
            # shuffling the list
            shuffle(data_samples)
            tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
            tf = tf_vectorizer.fit_transform(data_samples)
            lda1 = lda.LDA(n_topics=int(l[0]), alpha=l[1], eta=l[2], n_iter=10)
            lda1.fit_transform(tf)
            # print("done in %0.3fs." % (time() - t0))
            tf_feature_names = tf_vectorizer.get_feature_names()
            topics.extend(
                get_top_words(lda1, path1, tf_feature_names, n_top_words, i=i, file1=file1))
    return topics
def get_train_test_lda(topic):
    model = VGG16(include_top=False, pooling='avg')
    x_train, y_train, x_test, y_test = load()
    x_train = x_train.astype('float32')
    x_train /= 255
    y_train = y_train.astype('int64')
    x_test = x_test.astype('float32')
    x_test /= 255
    y_test = y_test.astype('float32')

    X_train = model.predict(x_train)
    print(X_train.shape)
    X_test = model.predict(x_test)
    # X_train = model.predict(x_train)
    # X_test = model.predict(x_test)

    for k in topic:
        X_iter = X_train
        model_label = lda.LDA(n_topics=k, n_iter=1000)
        model_label.fit(y_train)
        doc_topic = model_label.doc_topic_
        x2 = doc_topic
        x = x2
        x = discretization_doc_topic(x)
        X_train = np.hstack((X_train, x))

        # multi-label learning to get x2
        classifier = LabelPowerset(RandomForestClassifier())
        classifier.fit(X_iter, x)
        x = np.array(sp.csr_matrix(classifier.predict(X_test)).toarray())
        # print(x)
        # x = alpha * x1 + (1-alpha) * x2
        # x = self.discretization_doc_topic(x)
        X_test = np.hstack((X_test, x))

    return np.array(X_train)[:, -28:], np.array(y_train), np.array(X_test)[:, -28:], np.array(y_test)
def get_topic_labels(n_topics, n_top_words, n_cand_labels, label_min_df,
                     n_labels, lda_random_state, lda_n_iter):
    """
    Refer the arguments to `create_parser`
    """
    print("Loading docs and preprocessing (cvalue etc) for lda input...")
    # docs = get_lda_input_from_corpus_folder(CORPUS_PATH)
    # docs = load_line_corpus(corpus_path)
    docs = pickle.load(open('./data/lda_input_docs_finalized.pickle', 'rb'))

    print("Generate candidate bigram labels (with POS filtering)...")
    finder = BigramLabelFinder(min_freq=label_min_df)
    cand_labels = finder.find(docs, top_n=n_cand_labels)
    print("Collected {} candidate labels".format(len(cand_labels)))

    print("Calculate the PMI scores...")
    pmi_cal = PMICalculator(
        doc2word_vectorizer=WordCountVectorizer(
            min_df=5, stop_words=load_stopwords(STOP_WORDS_FILES)),
        doc2label_vectorizer=LabelCountVectorizer())
    pmi_w2l = pmi_cal.from_texts(docs, cand_labels)

    print("Topic modeling using LDA...")
    model = lda.LDA(n_topics=n_topics, n_iter=lda_n_iter, random_state=lda_random_state)
    model.fit(pmi_cal.d2w_)

    print("\nTopical words:")
    print("-" * 20)
    for i, topic_dist in enumerate(model.topic_word_):
        top_word_ids = np.argsort(topic_dist)[:-n_top_words:-1]
        topic_words = [pmi_cal.index2word_[id_] for id_ in top_word_ids]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    ranker = LabelRanker(apply_intra_topic_coverage=False)

    return ranker.top_k_labels(topic_models=model.topic_word_,
                               pmi_w2l=pmi_w2l,
                               index2label=pmi_cal.index2label_,
                               label_models=None,
                               k=n_labels)
def create_topic_vectors(path_json, save_path):
    files, vectorizer, count_matrix, datas = convert_jsoncorpus_to_count_matrix(path_json)
    vocab = vectorizer.get_feature_names()
    print "Number of features: ", len(vectorizer.get_feature_names())
    print "Features:", len(vocab), vocab[2500:2510], "..."
    print "Size of count matrix: ", count_matrix.shape
    print "Number of files: ", len(files)

    model = lda.LDA(n_topics=10, n_iter=2000, random_state=1)
    model.fit(count_matrix)  # model.fit_transform(X) is also available
    topic_word = model.topic_word_
    topic_vectors = [topic_dist for i, topic_dist in enumerate(topic_word)]

    with open(save_path, 'wb') as handle:
        pickle.dump([model, topic_vectors, vocab], handle)
    print "Topic vectors and vocab saved !!!"
def exec_lda(mtx_lda, vocab_set, topics, words, iterations, path='./topic'):
    import numpy
    ft = io.open(path, 'w', encoding='utf8')
    model = lda.LDA(n_topics=topics, n_iter=iterations, random_state=1)
    model.fit(mtx_lda)
    topic_word = model.topic_word_
    n_top_words = words
    print topic_word
    for i, topic_dist in enumerate(topic_word):
        try:
            topic_words = numpy.array(vocab_set)[numpy.argsort(topic_dist)][:-(n_top_words + 1):-1]
        except IndexError as e:
            print (str(e))
        else:
            words = u''
            for word in topic_words:
                words += word
                words += ' '
            ft.write(str(i).encode('utf8') + u' ' + words + u'\n')
def _test_LDA(l, path1, file='', data_samples=[], target=[]):
    n_topics = 10
    n_top_words = 10
    fileB = []
    fileB.append(file)
    #filepath = '/home/amrit/GITHUB/Pits_lda/dataset/'
    topics = []
    data = data_samples
    tar = target
    x = list(xrange(len(data_samples)))
    for j, file1 in enumerate(fileB):
        for i in range(10):
            #data_samples = readfile1(filepath + str(file1))
            # shuffling the list
            shuffle(x)
            data = [data[k] for k in x]
            tar = [tar[k] for k in x]
            tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
            tf = tf_vectorizer.fit_transform(data)
            lda1 = lda.LDA(n_topics=int(l[0]), alpha=l[1], eta=l[2], n_iter=200)
            lda1.fit_transform(tf)
            tops = lda1.doc_topic_
            topic_word = lda1.topic_word_
            tf_feature_names = tf_vectorizer.get_feature_names()
            topics.extend(
                get_top_words(lda1, path1, tf_feature_names, n_top_words, i=i, file1=file1))
    return topics, tops, topic_word, tf_feature_names, tar
def LDAModel(train_count, vocab):
    # Topic modeling using LDA
    lda_model = lda.LDA(n_topics=10, n_iter=400)
    train_topics = lda_model.fit_transform(train_count)

    # Get a map between each user and the topic they most likely belong to
    _lda_keys = []
    for i in range(train_topics.shape[0]):
        _lda_keys.append(train_topics[i].argmax())

    n_top_words = 5
    topic_summaries = []
    topic_word = lda_model.topic_word_  # all topic words
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]  # get!
        topic_summaries.append(' '.join(topic_words))  # append!

    return (lda_model, train_topics, _lda_keys, topic_summaries)
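# Hedged usage sketch for LDAModel() above (illustrative only): a random integer
# user-term count matrix and a made-up vocabulary. Assumes the `lda` package is
# installed and numpy is imported as np, as in the function itself.
import numpy as np

toy_counts = np.random.randint(0, 3, size=(40, 15))
toy_counts[:, 0] += 1  # make sure no row (user) is empty
toy_vocab = ['term{}'.format(i) for i in range(15)]

toy_lda, toy_topics, toy_keys, toy_summaries = LDAModel(toy_counts, toy_vocab)
print(toy_topics.shape)  # expected: (40, 10)
print(toy_summaries[0])  # five top words of topic 0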
def lda_out(doc_term_mat, vocab, directory, outfile_prefix, num_topics, n_top_words, date_range):
    model = lda.LDA(n_topics=num_topics, n_iter=1500, random_state=1)
    model.fit(doc_term_mat)
    topic_word = model.topic_word_

    tpc_wds_file = directory + outfile_prefix + 'tpc_wds' + date_range + '.mat'
    doc_tpc_file = directory + outfile_prefix + 'doc_tpc' + date_range + '.mat'
    matrix_dump(topic_word, tpc_wds_file)
    matrix_dump(model.doc_topic_, doc_tpc_file)

    with open(directory + outfile_prefix + str(num_topics) + date_range + '.txt', 'w+') as f:
        for i, topic_dist in enumerate(topic_word):
            topic_words = np.array(sorted(vocab))[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
            f.write('Topic {0} : {1}\n'.format(i, ', '.join(topic_words).encode("utf-8")))
def test_get_word_distinctiveness(dtm, n_topics):
    try:
        import lda
    except ImportError:
        pytest.skip('lda not installed')

    if dtm.sum() == 0:  # assure that we have at least one word in the DTM
        dtm[0, 0] = 1

    model = lda.LDA(n_topics, 1)
    model.fit(dtm)

    doc_lengths = tmtoolkit.bow.bow_stats.doc_lengths(dtm)
    p_t = model_stats.marginal_topic_distrib(model.doc_topic_, doc_lengths)

    w_distinct = model_stats.word_distinctiveness(model.topic_word_, p_t)

    assert w_distinct.shape == (dtm.shape[1],)
    assert all(v > -1e10 for v in w_distinct)
def fit_models(self, k_list, n_iter=500):
    """
    Fits multiple LDA models to X. Implements the <lda> module.
    k_list = [10, 20, 25, ..., 90]
    """
    self.k_list = k_list
    self.topics_n = sum(k_list)
    models_k = reduce(lambda x, y: x + y, [[k] * k for k in self.k_list])

    for i in k_list:
        for j in range(0, i):
            self.topic_labels.append(str(i) + "-" + str(j + 1))

    self.models_matrix = np.matrix([0] * len(self.features))
    for k in k_list:
        model = lda.LDA(n_topics=k, n_iter=n_iter, random_state=1)
        model.fit(self.X)
        self.models_list.append(model)
        self.models_matrix = np.vstack((self.models_matrix, model.nzw_))
    self.models_matrix = self.models_matrix[1:]
def test_lda_random_seed(self):
    dtm = self.dtm
    doc_topic = self.doc_topic
    n_iter = self.n_iter
    n_topics = self.n_topics
    random_seed = self.random_seed
    random_state = self.model.random_state

    # refit model with same random seed and verify results identical
    model_new = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=random_seed)
    rands_init = model_new._rands.copy()
    doc_topic_new = model_new.fit_transform(dtm)
    rands_fit = model_new._rands.copy()
    random_state_new = model_new.random_state
    np.testing.assert_array_equal(doc_topic_new, doc_topic)
    np.testing.assert_array_equal(random_state_new, random_state)
    # verify random variates are not changed
    np.testing.assert_array_equal(rands_init, rands_fit)