def transform(self, X, y=None):
    corpus = Sparse2Corpus(X, documents_columns=False)
    # eps=0 makes the model return every topic, so each row has the same length
    topics = np.array([[prob for _, prob in self.lda.__getitem__(c, eps=0)]
                       for c in corpus])
    print(topics.shape)
    return topics
def lda_model():
    # Load the list of documents
    with open('newsgroups', 'rb') as f:
        newsgroup_data = pickle.load(f)

    # Use CountVectorizer to find three-letter-or-longer tokens, remove stop
    # words, remove tokens that don't appear in at least 20 documents, and
    # remove tokens that appear in more than 20% of the documents
    vect = CountVectorizer(min_df=20, max_df=0.2, stop_words='english',
                           token_pattern=r'(?u)\b\w\w\w+\b')

    # Fit and transform
    X = vect.fit_transform(newsgroup_data)

    # Convert sparse matrix to gensim corpus
    corpus = Sparse2Corpus(X, documents_columns=False)

    # Mapping from word IDs to words (to be used in LdaModel's id2word parameter)
    id_map = dict((v, k) for k, v in vect.vocabulary_.items())

    # Use the gensim.models.ldamodel.LdaModel constructor to estimate
    # LDA model parameters on the corpus
    return vect, LdaModel(corpus, num_topics=10, id2word=id_map,
                          passes=25, random_state=34)
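# A hedged usage sketch for lda_model() above (not from the original source):
# it assumes the pickled 'newsgroups' document list exists in the working
# directory, fits the model, and prints each topic's top words.
def show_lda_model_topics():
    vect, ldamodel = lda_model()
    for topic_id, words in ldamodel.show_topics(num_topics=10,
                                                formatted=False):
        print(topic_id, [word for word, _ in words])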
def finalize(self):
    if self.model_exist:
        return
    if self.num_of_scans == 1:
        print("Loaded the model from file.")
    else:
        print("Performing SVD...")
        # svd = SVD(n_components=self.num_of_features, random_state=42)
        # x = svd.fit_transform(self.vectors)
        # self.vectors = x
        x = Sparse2Corpus(self.vectors)
        lsi = lsimodel.LsiModel(corpus=x, id2word=None,
                                num_topics=self.num_of_features)
        lsi.save(self.model_file_name)
        self.vectors = lsi.projection.u
        print("done.")
    if self.n <= 1:
        self.n = 2.0
    self.mean = self.sum / self.n
    self.var = (self.sum_sq - (self.sum * self.sum) / self.n) / (self.n - 1)
    self.var = math.sqrt(self.var)  # note: self.var now holds the standard deviation
    f = open(self.stat_filename, 'a')
    lang_pair = self.src_language + self.trg_language
    f.write("\n" + lang_pair + "\n")
    f.write("stats\t" + str(self.mean) + "\t" + str(self.var) + "\n")
    f.close()
def getLDAvis(topics, min_df, max_features):
    ldavis_key = f'{int(min_df*1000):d}_{max_features}_{topics}'
    ldavis_path = Path('./pyldavis') / f'{ldavis_key}_tsne.html'
    if not ldavis_path.exists():
        key = f'{max_features}'
        dtm_path = corpus_path / f'dtm_{key}.npz'
        dtm = sparse.load_npz(dtm_path)
        token_path = corpus_path / f'tokens_{key}.csv'
        tokens = pd.read_csv(token_path, header=None, squeeze=True,
                             na_values=[], keep_default_na=False)
        model_file = datapath(
            (experiment_path / 'models' / f'{key}_{topics}').resolve())
        lda_model = LdaModel.load(model_file)
        id2word = tokens.to_dict()
        corpus = Sparse2Corpus(dtm, documents_columns=False)
        dictionary = Dictionary.from_corpus(corpus, id2word)
        vis = prepare(lda_model, corpus, dictionary, mds='tsne')
        kwargs = {"ldavis_url": "/static/ldavis.js"}
        pyLDAvis.save_html(vis, str(ldavis_path), **kwargs)
    with open(str(ldavis_path), 'r') as myfile:
        data = myfile.read()
    return data

# getLDAvis(5, 0.001, 10000)
# getLDAvis(5, 0.001, 25000)
# getLDAvis(10, 0.001, 10000)
# getLDAvis(10, 0.001, 25000)
# getLDAvis(20, 0.001, 10000)
# getLDAvis(20, 0.001, 25000)
def graphLDA(name):
    embedFile = 'backendOutput/embeddings-' + name + '.pkl'
    bow, tfidf, _, id2word = loadData(embedFile)
    for (docRep, docRepName) in [(bow, 'bow'), (tfidf, 'tfidf')]:
        ldamodel = loadData('backendOutput/ldamodel-' + name + "-" +
                            docRepName + '.pkl')
        corpus = Sparse2Corpus(docRep, documents_columns=False)
        dictionary = Dictionary.from_corpus(corpus, id2word)
        # This could be more descriptive if we wanted
        document_labels = ["Document " + str(i) for i in range(len(corpus))]
        grapher = LDAGrapher(docRepName, corpus, dictionary, ldamodel,
                             document_labels, name)
        print("Graphing t-SNE for " + docRepName + "...")
        grapher.graphTSNE(perplexity=30)
        print("Graphing pyLDAvis for " + docRepName + "...")
        grapher.graphPyLDAvis()
        print("Creating word cloud for " + docRepName + "...")
        grapher.graphWordCloud()
        print("Graphing word weights for " + docRepName + "...")
        grapher.graphWordWeight()
    print("Done graphing!")
def lda(docs, k):
    """Latent Dirichlet allocation topic model.

    Uses Gensim's LdaModel after tokenizing using scikit-learn's
    TfidfVectorizer.

    Parameters
    ----------
    k : integer
        Number of topics.
    """
    from gensim.matutils import Sparse2Corpus
    from gensim.models import LdaModel

    # Use a scikit-learn vectorizer rather than Gensim's equivalent
    # for speed and consistency with LSA and k-means.
    vect = _vectorizer()
    corpus = vect.fit_transform(fetch(d) for d in docs)
    # scikit-learn puts documents in rows, so documents_columns must be False
    corpus = Sparse2Corpus(corpus, documents_columns=False)

    model = LdaModel(corpus=corpus, num_topics=k)
    topics = model.show_topics(formatted=False)
    vocab = vect.get_feature_names()
    # return [(vocab[int(idx)], w) for topic in topics for w, idx in topic]
    return [[(vocab[int(idx)], w) for w, idx in topic] for topic in topics]
def load_topic_model(self):
    if not hasattr(self, "word2id"):
        self.load_globel_vocab()
    self.vectorizer = CountVectorizer(vocabulary=self.word2id,
                                      tokenizer=lambda x: x,
                                      preprocessor=lambda x: x)
    file_path = "./preproc_data/topic_model.pkl"
    if os.path.exists(file_path):
        self.topic_model = LdaModel.load(file_path)
    else:
        texts = []
        if not hasattr(self, "domain2data"):
            self.load_domain2data()
        for domain in self.domain2data:
            texts.extend(self.domain2data[domain]["labeled"])
            texts.extend(self.domain2data[domain]["unlabeled"])
        corpus = self.vectorizer.fit_transform(texts)
        corpus = Sparse2Corpus(corpus, documents_columns=False)
        self.topic_model = LdaMulticore(corpus=corpus,
                                        num_topics=self.num_topics,
                                        id2word=self.id2word,
                                        iterations=self.num_topic_iterations,
                                        passes=self.num_topic_passes)
        self.topic_model.save(file_path)
def transform(self, X):
    corpus = Sparse2Corpus(X, documents_columns=False)
    doc_topic = self.model[corpus]
    mat = np.zeros((X.shape[0], self.n_components), dtype=np.float64)
    for did, doc in enumerate(doc_topic):
        for topic_id, prob in doc:
            mat[did, topic_id] = prob
    return mat
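# A hedged, self-contained sketch of the dense doc-topic pattern the
# transform() above relies on; every name here (doc_topic_matrix, the toy
# documents) is illustrative, not from the original source.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from gensim.matutils import Sparse2Corpus
from gensim.models import LdaModel

def doc_topic_matrix(docs, n_topics=5):
    vect = CountVectorizer()
    X = vect.fit_transform(docs)
    id2word = {i: w for w, i in vect.vocabulary_.items()}
    corpus = Sparse2Corpus(X, documents_columns=False)
    lda = LdaModel(corpus, id2word=id2word, num_topics=n_topics)
    # Fill a dense (n_docs, n_topics) matrix from the sparse topic lists
    mat = np.zeros((X.shape[0], n_topics))
    for did, doc in enumerate(lda[corpus]):
        for topic_id, prob in doc:
            mat[did, topic_id] = prob
    return mat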
def vect2gensim(self, vectorizer, dtmatrix):
    # transform sparse matrix into gensim corpus and dictionary
    corpus_vect_gensim = Sparse2Corpus(dtmatrix, documents_columns=False)
    dictionary = Dictionary.from_corpus(
        corpus_vect_gensim,
        id2word=dict((id, word) for word, id in vectorizer.vocabulary_.items()))
    return (corpus_vect_gensim, dictionary)
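# Hedged usage sketch for vect2gensim() above: since 'self' is unused, the
# conversion is shown standalone here with hypothetical toy documents.
from sklearn.feature_extraction.text import CountVectorizer
from gensim.matutils import Sparse2Corpus
from gensim.corpora import Dictionary

def demo_vect2gensim():
    vect = CountVectorizer()
    dtm = vect.fit_transform(["the cat sat", "the dog ran", "the cat ran"])
    corpus = Sparse2Corpus(dtm, documents_columns=False)
    dictionary = Dictionary.from_corpus(
        corpus, id2word={i: w for w, i in vect.vocabulary_.items()})
    # one bag-of-words document per input text
    print(len(dictionary), [list(doc) for doc in corpus])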
def trainLDA(docRep, dictionary, save=False, name=""):
    '''
    Function to train and return an ldamodel. Expects a sparse matrix as input.
    '''
    corpus = Sparse2Corpus(docRep, documents_columns=False)
    ldamodel = ldamulticore.LdaMulticore(corpus, num_topics=20,
                                         id2word=dictionary, workers=4,
                                         passes=4)
    if save:
        saveData(ldamodel, 'ldamodel-' + name)
    return ldamodel
def train(self, corpus, project):
    self.info('creating similarity index')
    veccorpus = project.vectorizer.transform(
        (subj.text for subj in corpus.subjects))
    gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
    self._index = gensim.similarities.SparseMatrixSimilarity(
        gscorpus, num_features=len(project.vectorizer.vocabulary_))
    annif.util.atomic_save(self._index, self.datadir, self.INDEX_FILE)
def _create_index(self, veccorpus):
    self.info('creating similarity index')
    gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
    self._index = gensim.similarities.SparseMatrixSimilarity(
        gscorpus, num_features=len(self.vectorizer.vocabulary_))
    annif.util.atomic_save(self._index, self.datadir, self.INDEX_FILE)
def topic_distribution(vect: CountVectorizer, model: LdaModel):
    new_doc = [
        "\n\nIt's my understanding that the freezing will start to occur "
        "because of the\ngrowing distance of Pluto and Charon from the Sun, "
        "due to it's\nelliptical orbit. It is not due to shadowing effects. "
        "\n\n\nPluto can shadow Charon, and vice-versa.\n\nGeorge Krumins\n-- "
    ]
    bow = Sparse2Corpus(vect.transform(new_doc), documents_columns=False)
    return next(iter(model[bow]), None)
def getLDARep(ldaModel, docRep, save=False, name=""):
    '''
    Convert doc representation to lda output
    '''
    corpus = Sparse2Corpus(docRep, documents_columns=False)
    converted = ldaModel.get_document_topics(corpus, minimum_probability=0.0)
    rep = [[prob for _, prob in doc] for doc in converted]
    if save:
        saveData(rep, name)
    return rep
def getLsiModel(tfidfModel) -> LsiModel:
    modelPath = os.path.join('.cache', 'lsi.gensim_model')
    try:
        lsiModel = LsiModel.load(modelPath)
    except FileNotFoundError:
        corpus = Sparse2Corpus(tfidfModel.vectors, documents_columns=False)
        lsiModel = LsiModel(corpus, num_topics=200)
        lsiModel.save(modelPath)
    return lsiModel
def get_similar_topics_distribution(abst_to_match):
    topics_array = np.zeros(n_topics)
    trans = cvec.transform([abst_to_match])
    corpus = Sparse2Corpus(trans, documents_columns=False)
    results = list(ldamodel.get_document_topics(bow=corpus))[0]
    for topic_id, prob in results:
        topics_array[topic_id] = prob
    return topics_array
def __init__(self, dataset, n_topics, vocabulary_size=None):
    super(DescriptionLDA, self).__init__(dataset, vocabulary_size,
                                         tfidf=False)
    self.n_topics = n_topics
    id2word = {i: w for w, i in self.vectorizer.vocabulary_.items()}
    corpus = Sparse2Corpus(self.features, documents_columns=False)
    self.transformer = LdaModel(corpus=corpus, id2word=id2word,
                                num_topics=n_topics)
def _run(self, info):
    nbprint('Running LDA')
    vocab = data.load_vocab(info)
    id2word = {e['id']: e['token'] for e in vocab}
    # input_mat stores documents in columns, matching the Sparse2Corpus default
    corpus = Sparse2Corpus(self.input_mat)
    lda = LdaModel(corpus, id2word=id2word, num_topics=info["num_topics"])
    self.W = lda.get_topics().T
    self.H = np.zeros((info["num_topics"], self.input_mat.shape[1]))
    for idx, doc in enumerate(corpus):
        weights = lda[doc]
        for topic, value in weights:
            self.H[topic, idx] = value
def sum_weighted_term_lists(wtlist, dictionary):
    if len(wtlist) == 0:
        return []
    term_vecs = []
    for weight, terms in wtlist:
        term_vec_raw = dictionary.doc2bow(terms)
        term_vec = [(term_id, weight * val) for term_id, val in term_vec_raw]
        term_vecs.append(term_vec)
    # make into numpy matrix for convenience
    term_matrix = corpus2csc(term_vecs)
    # calculate sum
    sum_vec = Sparse2Corpus(csc_matrix(term_matrix.sum(1)))[0]
    return sum_vec
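# Toy check for sum_weighted_term_lists() (a hedged example with made-up
# data): the two term lists share the word 'b', so its entry in the returned
# sparse vector is the weighted sum 0.5 * 1 + 2.0 * 1.
from gensim.corpora import Dictionary

def demo_sum_weighted_term_lists():
    dictionary = Dictionary([['a', 'b'], ['b', 'c']])
    wtlist = [(0.5, ['a', 'b']), (2.0, ['b', 'c'])]
    print(sum_weighted_term_lists(wtlist, dictionary))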
def getMatrixSimilarity(tfidfModel, lsiModel=None) -> SparseMatrixSimilarity:
    similarityPath = os.path.join('.cache', 'sim_mat.gensim_sim')
    try:
        sim = MatrixSimilarity.load(similarityPath)
    except FileNotFoundError:
        corpus = Sparse2Corpus(tfidfModel.vectors, documents_columns=False)
        if lsiModel is None:
            lsiModel = getLsiModel(tfidfModel)
        sim = SparseMatrixSimilarity(lsiModel[corpus], num_best=21,
                                     num_features=tfidfModel.vectors.shape[0])
        sim.save(similarityPath)
    return sim
def get_texts_topic_distribution(self, texts):
    if not hasattr(self, "vectorizer"):
        self.load_topic_model()
    vectorized_corpus = self.vectorizer.transform(texts)
    gensim_corpus = Sparse2Corpus(vectorized_corpus, documents_columns=False)
    topic_representations = []
    for doc in gensim_corpus:
        topic_representations.append([
            topic_prob for (_, topic_prob) in
            self.topic_model.get_document_topics(doc, minimum_probability=0.)
        ])
    return np.array(topic_representations)
def score(self, X, y=None, sample_weight=None) -> float:
    # TODO this needs further testing for correctness, WIP
    if self.autoencoder is None:
        raise NotFittedError
    self.autoencoder.eval()
    corpus = Sparse2Corpus(X, documents_columns=False)
    decoder_weight = self.autoencoder.decoder.linear.weight.detach().cpu()
    id2word = {index: str(index) for index in range(X.shape[1])}
    topics = [
        [str(item.item()) for item in topic]
        for topic in decoder_weight.topk(min(self.score_num, X.shape[1]),
                                         dim=0)[1].t()
    ]
    cm = CoherenceModel(topics=topics, corpus=corpus,
                        dictionary=Dictionary.from_corpus(corpus, id2word),
                        coherence='u_mass')
    return cm.get_coherence()
def from_stream_of_tokens_to_sparse2corpus(
        source: Any, vocabulary: Dictionary | dict) -> Sparse2Corpus:
    if not hasattr(vocabulary, 'doc2bow'):
        vocabulary: Dictionary = _from_token2id_to_dictionary(vocabulary)
    bow_corpus: GensimBowCorpus = [
        vocabulary.doc2bow(tokens) for _, tokens in source
    ]
    csc_matrix: sp.csc_matrix = corpus2csc(
        bow_corpus,
        num_terms=len(vocabulary),
        num_docs=len(bow_corpus),
        num_nnz=sum(map(len, bow_corpus)),
    )
    corpus: Sparse2Corpus = Sparse2Corpus(csc_matrix, documents_columns=True)
    return corpus
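# Hedged usage sketch for from_stream_of_tokens_to_sparse2corpus(): the
# source is assumed to yield (doc_id, tokens) pairs, matching the
# `for _, tokens in source` unpacking above; the data here is illustrative.
from gensim.corpora import Dictionary

def demo_stream_to_corpus():
    source = [(0, ['a', 'b', 'b']), (1, ['b', 'c'])]
    vocabulary = Dictionary([tokens for _, tokens in source])
    corpus = from_stream_of_tokens_to_sparse2corpus(source, vocabulary)
    print([list(doc) for doc in corpus])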
def fit(self, X, y=None):
    """
    Parameters
    ----------
    X : [sp.csr_matrix]

    Returns
    -------
    self
    """
    if self.lda is None:
        self.lda = LdaMulticore(id2word=self.id2word,
                                num_topics=self.num_topics,
                                passes=self.passes)
    X_flat = sp.vstack(X)
    self.lda.update(Sparse2Corpus(X_flat, documents_columns=False))
    return self
def merge_corpus(corpus, dictionary, docs_ids):
    '''
    Merge documents with the same doc_id in the corpus according to docs_ids.
    The function returns the merged corpus and the merged docs_ids.
    Note that the merged corpus is a sparse matrix in SciPy.
    '''
    # Deprecated version
    #
    # References:
    # - How to sort a list and reorder a list according to indices?
    #   https://stackoverflow.com/questions/6422700/how-to-get-indices-of-a-sorted-array-in-python/6423325
    #   https://stackoverflow.com/questions/2177590/how-can-i-reorder-a-list
    # - How to group data by their keys?
    #   https://docs.python.org/2/library/itertools.html#itertools.groupby
    #
    # dense_corpus = corpus2dense(corpus, num_terms=len(dictionary)).transpose()
    # ordered_indice = np.argsort(docs_ids)
    # reordered_docs_ids = [docs_ids[index] for index in ordered_indice]
    # reordered_dense_corpus = [dense_corpus[index] for index in ordered_indice]
    # reordered_key_values = zip(reordered_docs_ids, reordered_dense_corpus)
    # merged_key_values = [
    #     [key_value[0], np.array(list(zip(*list(key_value[1])))[1]).sum(axis=0)]
    #     for key_value in itertools.groupby(reordered_key_values, lambda x: x[0])]
    # merged_key_values = list(zip(*merged_key_values))
    # merged_docs_ids = merged_key_values[0]
    # merged_dense_corpus = np.array(merged_key_values[1])
    # merged_corpus = Dense2Corpus(merged_dense_corpus, documents_columns=False)
    # return merged_docs_ids, merged_corpus

    # TODO: Make this function more efficient in the future.

    # Convert corpus to a sparse matrix in SciPy
    sparse_corpus = corpus2csc(corpus, num_terms=len(dictionary)).transpose()

    # Group row indices by doc_id
    groups = defaultdict(list)
    for index, doc_id in enumerate(docs_ids):
        groups[doc_id].append(index)

    # Merge rows of the corpus that share a doc_id
    id_corpus_obj = {
        doc_id: sparse_corpus[indices, :].sum(axis=0)
        for doc_id, indices in groups.items()
    }
    merged_doc_ids = list(id_corpus_obj.keys())
    merged_corpus = vstack(
        [csc_matrix(doc) for doc in id_corpus_obj.values()])
    merged_corpus = Sparse2Corpus(merged_corpus, documents_columns=False)
    return merged_doc_ids, merged_corpus
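# Toy check for merge_corpus() (hedged, made-up data; assumes the
# module-level imports merge_corpus relies on, such as corpus2csc, vstack,
# csc_matrix, and defaultdict, are in scope): documents 0 and 2 share the
# doc_id 'a', so their bag-of-words rows are summed into one document.
from gensim.corpora import Dictionary

def demo_merge_corpus():
    texts = [['x', 'y'], ['y', 'z'], ['x']]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]
    merged_ids, merged = merge_corpus(corpus, dictionary, ['a', 'b', 'a'])
    print(merged_ids, [list(doc) for doc in merged])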
def transform(self, X):
    """
    Parameters
    ----------
    X : [sp.csr_matrix]

    Returns
    -------
    topic_vectors : [np.ndarray]
        each matrix is of shape (sent_count, topic_count)
    """
    topic_vectors = []
    for doc in X:
        sents_bow = Sparse2Corpus(doc, documents_columns=False)
        gamma, _ = self.lda.inference(sents_bow)
        # divide each row by its row sum
        topic_dist = (gamma.T / np.sum(gamma, axis=1)).T
        topic_vectors.append(topic_dist)
    return topic_vectors
def compete_number_of_words(detoken_data, token_data, min_num, max_num, step,
                            random_state=None):
    '''
    Function for finding number_of_words.

    Parameters
    ----------
    detoken_data : detokenized data in list form
    token_data : token data used to compute the coherence value
    min_num : minimum of the number-of-words range; the sweep starts at min_num
    max_num : maximum of the number-of-words range; the sweep searches up to max_num
    step : step size used to move from min_num to max_num
    random_state : set for reproducibility, default = None

    Output
    ------
    coherence_value : returns a DataFrame with each min_df value and the
        corresponding perplexity and coherence values
    '''
    coherence_value = pd.DataFrame(
        columns=['min_df', 'Perplexity Value', 'Coherence Value'])
    i = 0
    min_df = list(np.arange(min_num, max_num, step))
    for m in min_df:
        print("run {}, min_df = {}".format(i + 1, m))
        vectorizer = CountVectorizer(min_df=m)       # create CountVectorizer
        cv = vectorizer.fit_transform(detoken_data)  # fit and transform
        dictionary = corpora.Dictionary([vectorizer.get_feature_names()])
        corpus = Sparse2Corpus(cv.T)
        lda_model = LdaModel(corpus=corpus, id2word=dictionary,
                             random_state=random_state)
        coherence_lda = CoherenceModel(model=lda_model, texts=token_data,
                                       dictionary=dictionary, coherence='c_v')
        coherence_value.loc[i] = [m, lda_model.log_perplexity(corpus),
                                  coherence_lda.get_coherence()]
        i += 1
    return coherence_value
def infer_ngrams_corpus(corpus, return_dict=False):
    bow_features = [(i, attribute.name)
                    for i, attribute in enumerate(corpus.domain.attributes)
                    if 'bow-feature' in attribute.attributes]
    if len(bow_features) == 0:
        corpus = BowVectorizer().transform(corpus)
        bow_features = [(i, attribute.name)
                        for i, attribute in enumerate(corpus.domain.attributes)
                        if 'bow-feature' in attribute.attributes]

    feature_presence = corpus.X.sum(axis=0)
    keep = [(i, a) for i, a in bow_features if feature_presence[0, i] > 0]

    # sort features by their order in the dictionary
    dictionary = Dictionary(corpus.ngrams_iterator(include_postags=True),
                            prune_at=None)
    idx_of_keep = np.argsort([dictionary.token2id[a] for _, a in keep])
    keep = [keep[i][0] for i in idx_of_keep]

    result = Sparse2Corpus(corpus.X[:, keep].T)
    return (result, dictionary) if return_dict else result
def do_after_a_full_scan(self, num_of_finished_scans):
    # First iteration of a normal run (collecting the vocabulary)
    if num_of_finished_scans == 1 and self.num_of_scans == 3:
        self.vocab = Counter(self.all_words)
        self.all_words = {}
        for word in self.vocab:
            if self.vocab[word] >= self.min_count:
                self.all_words[word] = len(self.all_words)
        self.vectors = lil_matrix((len(self.all_words), self.number_of_tus),
                                  dtype=np.int8)
        print("-#-#-#-#-#-#-#-#-#-#-#-")
        print("size of vocab:", len(self.vocab))
        print("size of common words:", len(self.all_words))
        print("number of TUs:", self.number_of_tus)
        self.number_of_tus = 0
        f = open(self.dict_file_name, "a+")
        for w in self.all_words:
            f.write(w)
            f.write("\t" + str(self.all_words[w]) + "\n")
        f.close()
    # Second iteration of a normal run (making the tu-word matrix)
    elif num_of_finished_scans == 2:
        print("Performing SVD...")
        x = Sparse2Corpus(self.vectors)
        lsi = lsimodel.LsiModel(corpus=x, id2word=None,
                                num_topics=self.num_of_features)
        lsi.save(self.model_file_name)
        self.vectors = lsi.projection.u
        print("done.")
    else:
        print("-#-#-#-#-#-#-#-#-#-#-#-")
def get_topics(cv, train_data):
    """
    Uses gensim to perform topic modeling.

    Parameters
    ----------
    cv: A TfidfVectorizer instance.
    train_data: A scipy csr_matrix.

    Returns
    -------
    A list of the top topics, each a (term-weight list, coherence score)
    pair as produced by LdaModel.top_topics.
    """
    td_gensim = Sparse2Corpus(train_data, documents_columns=False)
    tmp_dct = dict((idv, word) for word, idv in cv.vocabulary_.items())
    dct = Dictionary.from_corpus(td_gensim, id2word=tmp_dct)
    lda = LdaModel(corpus=td_gensim, id2word=dct, num_topics=20)
    topics = lda.top_topics(corpus=td_gensim, num_words=5)
    return topics
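# Hedged usage sketch for get_topics() above: the toy documents are
# illustrative, and a CountVectorizer stands in for the TfidfVectorizer the
# docstring mentions. Note that num_words= matches the older gensim API this
# code targets; newer releases call the parameter topn=.
from sklearn.feature_extraction.text import CountVectorizer

def demo_get_topics():
    docs = ["the cat sat on the mat", "the dog ate my homework",
            "cats and dogs are pets", "the mat was on the floor"]
    cv = CountVectorizer()
    train_data = cv.fit_transform(docs)
    for terms, coherence in get_topics(cv, train_data):
        print(coherence, terms[:3])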