def train(self, df):
    """Fit an LDA model on ``df['context']`` and build per-sender topic profiles.

    Sets ``self.user_dict`` (one copy of ``self.topic_dict`` per unique
    sender), ``self.worddict`` (feature index -> term), ``self.mydict``
    (gensim dictionary built from the corpus) and ``self.model`` (the LDA
    model). Each non-empty ``context`` row is then projected onto the topics
    and accumulated into its sender's profile, weighted by ``amt`` when the
    topic is already present. Finally every profile is normalised to sum
    to one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``sender``, ``context`` and ``amt``.
    """
    self.user_dict = {
        sender: self.topic_dict.copy() for sender in df.sender.unique()
    }

    cv = CV(stop_words='english')
    X = cv.fit_transform(df['context'])
    # ``vocabulary_`` maps term -> column index. Enumerating its keys does
    # not follow the column order, so invert the mapping explicitly.
    self.worddict = dict(
        (index, term) for term, index in cv.vocabulary_.items())

    self.mydict = Dictionary.from_corpus(
        matutils.Sparse2Corpus(X, documents_columns=False),
        id2word=self.worddict)
    self.model = LatentDA.LdaModel(
        matutils.Sparse2Corpus(X, documents_columns=False),
        num_topics=self.numtopics, passes=20, id2word=self.worddict)

    for _, row in df.iterrows():
        if row['context'] == '':
            continue
        profile = self.user_dict[row.sender]
        bow = self.mydict.doc2bow(row['context'].split())
        for topic_id, weight in self.model[bow]:
            if topic_id in profile:
                if row.amt == '':
                    continue
                profile[topic_id] += weight * float(row.amt)
            else:
                profile[topic_id] = weight

    for profile in self.user_dict.values():
        norm_const = sum(profile.values())
        if not norm_const:
            # Nothing was accumulated for this sender; avoid dividing by 0.
            continue
        for topic_id in profile:
            profile[topic_id] = profile[topic_id] / norm_const
def prepare_corpus(self, doc, gram=(1, 2), option='c'):
    """Vectorise `doc` and return a gensim-ready vocabulary and corpus.

    Parameters
    ----------
    doc : iterable of str
        Raw documents handed to the vectoriser.
    gram : tuple of (int, int)
        ``ngram_range`` forwarded to the vectoriser.
    option : {'c', 'tf'}
        ``'c'`` uses raw counts (``CountVectorizer``), ``'tf'`` uses tf-idf
        weights (``TfidfVectorizer``).

    Returns
    -------
    dictionary : dict
        Mapping feature index -> term.
    doc_term_matrix : gensim.matutils.Sparse2Corpus
        Corpus built from the transposed document-term matrix.

    Raises
    ------
    ValueError
        If `option` is neither ``'c'`` nor ``'tf'``.
    """
    if option == 'c':
        vectorizer_cls = CountVectorizer
    elif option == 'tf':
        vectorizer_cls = TfidfVectorizer
    else:
        # Previously an unknown option fell through to an UnboundLocalError.
        raise ValueError("option must be 'c' or 'tf', got %r" % (option,))

    # Tokens come from self.parse_text; common terms are dropped via max_df.
    vectorizer = vectorizer_cls(tokenizer=self.parse_text,
                                ngram_range=gram,
                                stop_words=stop_fr,
                                strip_accents='ascii',
                                max_df=.8)
    doc_term = vectorizer.fit_transform(doc)

    # Transpose the sparse matrix directly (terms x documents) instead of
    # densifying it through a DataFrame first.
    doc_term_matrix = matutils.Sparse2Corpus(
        scipy.sparse.csr_matrix(doc_term.transpose()))

    # Create the vocabulary dictionary (index -> term).
    dictionary = dict(
        (index, term) for term, index in vectorizer.vocabulary_.items())
    return dictionary, doc_term_matrix
def cluster(sentences):
    """Group sentences by topic using tf-idf, LSI and k-means.

    Parameters
    ----------
    sentences : sequence of dict
        Each item must provide a ``'text'`` entry.

    Returns
    -------
    list of list of int
        One list per retained cluster, holding indices into `sentences` of
        the members that lie close to the cluster centre.
    """
    my_stop_words = {'okay', 'don', 've', 'didn', 'know', 'think', 'really'}
    corpus = np.array(
        [c['text'].replace("%hesitation", "").lower() for c in sentences])

    # A list is accepted as stop_words by every scikit-learn release,
    # a set is rejected by the newer parameter validation.
    tf_vectorizer = TfidfVectorizer(
        decode_error='ignore', max_df=0.7,
        stop_words=list(my_stop_words.union(stop_words)),
        ngram_range=(1, 1))
    tf_mat = tf_vectorizer.fit_transform(corpus)
    # Invert term -> index; equivalent to enumerating the feature names.
    id2word = {index: term
               for term, index in tf_vectorizer.vocabulary_.items()}

    n_topics = 5
    lsi = LsiModel(matutils.Sparse2Corpus(tf_mat.T), num_topics=n_topics,
                   id2word=id2word, onepass=False)
    gs_lsi_mat = lsi[matutils.Sparse2Corpus(tf_mat.T)]
    lsi_mat = matutils.corpus2dense(gs_lsi_mat, n_topics).T
    lsi_mat = Normalizer(copy=False).fit_transform(lsi_mat)

    # Sentences whose LSI vector is all zeros cannot be clustered.
    valid_indices = np.where(lsi_mat.any(axis=1))[0]
    valid_sent = lsi_mat[valid_indices]

    n_clusters = 7
    kmeans = KMeans(n_clusters, n_init=100)
    kmeans.fit(valid_sent)

    clusters = {}
    for i in range(n_clusters):
        clusters[i] = np.where(kmeans.labels_ == i)[0]

    # Drop clusters whose total squared spread exceeds the average share of
    # the inertia. Iterate over a snapshot of the keys: deleting from a dict
    # while iterating over it raises RuntimeError on Python 3.
    for i in list(clusters):
        spread = np.sum(
            np.square(valid_sent[clusters[i]] - kmeans.cluster_centers_[i]))
        if spread > kmeans.inertia_ / n_clusters:
            del clusters[i]

    # Within each surviving cluster keep only members close to the centre.
    threshold = kmeans.inertia_ / len(corpus)
    last_cluster = []
    for i in clusters:
        members = clusters[i]
        sq_dist = np.sum(
            np.square(valid_sent[members] - kmeans.cluster_centers_[i]),
            axis=1)
        last_cluster.append(
            valid_indices[members[np.where(sq_dist < threshold)]].tolist())
    return last_cluster
def fit(self, X, y=None):
    """Train a gensim HDP model on `X` and store it in ``self.gensim_model``.

    Parameters
    ----------
    X : {iterable of list of (int, number), scipy.sparse matrix}
        Training documents in BOW format. A scipy sparse matrix is wrapped
        with ``Sparse2Corpus(..., documents_columns=False)``.
    y : ignored
        Accepted for signature compatibility only.

    Returns
    -------
    :class:`~gensim.sklearn_api.hdp.HdpTransformer`
        The trained model (``self``).

    """
    corpus = (
        matutils.Sparse2Corpus(sparse=X, documents_columns=False)
        if sparse.issparse(X) else X
    )
    hdp_params = dict(
        id2word=self.id2word,
        max_chunks=self.max_chunks,
        max_time=self.max_time,
        chunksize=self.chunksize,
        kappa=self.kappa,
        tau=self.tau,
        K=self.K,
        T=self.T,
        alpha=self.alpha,
        gamma=self.gamma,
        eta=self.eta,
        scale=self.scale,
        var_converge=self.var_converge,
        outputdir=self.outputdir,
        random_state=self.random_state,
    )
    self.gensim_model = models.HdpModel(corpus=corpus, **hdp_params)
    return self
def transform(self, docs):
    """Return the topic distribution of each document as a dense matrix.

    Parameters
    ----------
    docs : {list of list of (int, number), list of (int, number), scipy.sparse matrix}
        Documents in BOW format, e.g.
        ``[[(4, 1), (7, 1)], [(9, 1), (13, 1)], [(2, 1), (6, 1)]]``,
        or a single document such as ``[(4, 1), (7, 1)]``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number of documents, self.num_topics)``; entry
        ``[i, j]`` is the weight of topic ``j`` in document ``i``. Topics the
        model did not report for a document are filled with ``1e-12``.

    Raises
    ------
    NotFittedError
        If the model has not been fitted yet.
    """
    if self.gensim_model is None:
        raise NotFittedError(
            "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
        )

    if sparse.issparse(docs):
        docs = matutils.Sparse2Corpus(docs, documents_columns=False)
    elif isinstance(docs, (list, tuple)) and docs and isinstance(docs[0], tuple):
        # A single BOW document was passed; treat it as a one-element batch.
        docs = [docs]

    filler = 1e-12
    rows = []
    for doc in docs:
        # The model reports (topic_id, probability) pairs and may omit
        # topics; place each probability in its own topic column so the
        # columns stay aligned across documents.
        row = [filler] * self.num_topics
        for topic_id, probability in self.gensim_model[doc]:
            row[topic_id] = probability
        rows.append(row)
    return np.reshape(np.array(rows), (len(docs), self.num_topics))
def predict_On_Unseen_Corpus(self, new_dataFrameWithText):
    """Assign topics of the previously trained LDA model to new documents.

    The new text is cleaned into a document-term table, aligned to
    ``self.mainCorpusFeatures``, transposed to a term-document matrix and
    wrapped as a gensim corpus before being passed to
    ``assign_Topic_To_document`` together with ``self.lda_nounAdj`` and
    ``self.id2word_nounAdj``.

    Side effects: stores ``new_dataFrameWithText``,
    ``new_termDocumentMatric``, ``new_Corpus`` and ``topicDict_new`` on
    ``self``.

    Returns
    -------
    tuple
        ``(topicDict_new, new_Corpus)``.
    """
    # Clean the raw text and convert it to a bag-of-words table.
    new_data_dtmna = clean_df_text_from_nounsAndAdj(
        new_dataFrameWithText, self.textColumnName, self.applyStemming,
        minCountThreshold=self.minCountThreshold,
        maxCountThreshold=self.maxCountThreshold)
    print("Number of feature of new dataset: %s" % (str(new_data_dtmna.shape)))

    # Align columns to the training features. ``reindex`` keeps every
    # training feature (zero-filled when absent from the new data), so row i
    # of the transposed matrix stays feature i; ``filter(items=...)`` would
    # silently drop absent features and shift the term ids.
    # NOTE(review): assumes self.id2word_nounAdj follows the order of
    # self.mainCorpusFeatures -- confirm against the training code.
    new_termDocumentMatric = new_data_dtmna.reindex(
        columns=self.mainCorpusFeatures, fill_value=0).T
    print("Number of feature of new dataset: %s" % (str(new_termDocumentMatric.shape)))

    self.new_dataFrameWithText = new_dataFrameWithText
    self.new_termDocumentMatric = new_termDocumentMatric

    new_Corpus = matutils.Sparse2Corpus(
        scipy.sparse.csr_matrix(new_termDocumentMatric))
    self.new_Corpus = new_Corpus

    topicDict_new = assign_Topic_To_document(
        self.lda_nounAdj,        # previously trained LDA model
        new_Corpus,              # the new corpus
        self.id2word_nounAdj,    # existing id2word dictionary
        new_dataFrameWithText,
        self.textColumnName)
    self.topicDict_new = topicDict_new
    return topicDict_new, new_Corpus
def add_features(corpus, X, dictionary, compute_values=None, var_attrs=None):
    """Append the columns of `X` to `corpus` as features sorted by name.

    The names ``dictionary[0] .. dictionary[len(dictionary) - 1]`` are
    argsorted; columns of `X`, the feature names and (when given)
    `compute_values` are all reordered with that permutation before calling
    ``corpus.extend_attributes``. The unsorted, transposed `X` is stored as a
    gensim corpus in ``ngrams_corpus`` of the returned object.

    `var_attrs`, when a dict, is merged over the default variable attributes
    ``{'hidden': True, 'skip-normalization': True}``.

    Returns the object returned by ``corpus.extend_attributes``.
    """
    names_by_index = [dictionary[idx] for idx in range(len(dictionary))]
    order = np.argsort(names_by_index)

    if compute_values is not None:
        compute_values = np.array(compute_values)[order]

    variable_attrs = {'hidden': True, 'skip-normalization': True}
    if isinstance(var_attrs, dict):
        variable_attrs.update(var_attrs)

    sorted_names = [dictionary[idx] for idx in order]
    corpus = corpus.extend_attributes(
        X[:, order],
        feature_names=sorted_names,
        var_attrs=variable_attrs,
        compute_values=compute_values,
        sparse=True,
        rename_existing=True,
    )
    corpus.ngrams_corpus = matutils.Sparse2Corpus(X.T)
    return corpus
def fit(self, X):
    """
    Fit the corpus `X` by (re)initialising this object through
    ``gensim.models.LdaModel.__init__`` with the parameters stored on it.

    A scipy sparse `X` is wrapped with ``matutils.Sparse2Corpus`` first; any
    other input is used as the corpus unchanged. The corpus is kept in
    ``self.corpus``. Returns ``self``.
    """
    self.corpus = matutils.Sparse2Corpus(X) if sparse.issparse(X) else X
    init_kwargs = dict(
        corpus=self.corpus,
        num_topics=self.num_topics,
        id2word=self.id2word,
        chunksize=self.chunksize,
        passes=self.passes,
        update_every=self.update_every,
        alpha=self.alpha,
        eta=self.eta,
        decay=self.decay,
        offset=self.offset,
        eval_every=self.eval_every,
        iterations=self.iterations,
        gamma_threshold=self.gamma_threshold,
        minimum_probability=self.minimum_probability,
        random_state=self.random_state,
    )
    models.LdaModel.__init__(self, **init_kwargs)
    return self
def reduce_nlp_data(vectorizer, data, n_components, reducer):
    """Vectorise `data` and reduce it to `n_components` dimensions.

    Parameters
    ----------
    vectorizer
        Object providing ``fit_transform`` and ``vocabulary_``.
    data
        Raw documents passed to ``vectorizer.fit_transform``.
    n_components : int
        Number of topics / components.
    reducer : {'lda', 'svd', 'nmf'}
        Reduction method.

    Returns
    -------
    tuple
        ``(model, reduced)`` where ``reduced`` has one row per document, or
        ``(None, None)`` for an unknown `reducer`.
    """
    transformed_data = vectorizer.fit_transform(data)
    id2word = {
        identifier: word
        for word, identifier in vectorizer.vocabulary_.items()
    }

    if reducer == 'lda':
        corpus = matutils.Sparse2Corpus(transformed_data.transpose())
        lda = models.LdaModel(corpus=corpus,
                              num_topics=n_components,
                              minimum_probability=0.03,
                              id2word=id2word,
                              passes=10,
                              random_state=42)
        print(lda.print_topics())
        lda_corpus = lda[corpus]
        # Pin the topic dimension: with minimum_probability > 0 the highest
        # topic ids may never be reported, and the inferred matrix would then
        # have fewer than n_components columns.
        doc_topics = matutils.corpus2csc(lda_corpus, num_terms=n_components)
        return lda, doc_topics.toarray().transpose()

    if reducer == 'svd':
        SVD = TruncatedSVD(n_components, n_iter=10, random_state=42)
        svd_data = SVD.fit_transform(transformed_data)
        get_eigenvectors(SVD, id2word)
        return SVD, svd_data

    if reducer == 'nmf':
        nmf = NMF(n_components, random_state=42)
        nmf_data = nmf.fit_transform(transformed_data)
        get_eigenvectors(nmf, id2word)
        return nmf, nmf_data

    return None, None
def partial_fit(self, X):
    """Update the HDP model with `X`, creating the model on first use.

    If ``self.gensim_model`` is still ``None`` an untrained
    ``models.HdpModel`` is built from the constructor parameters; in both
    cases the model's ``update`` is then called with `X`.

    Parameters
    ----------
    X : {iterable of list of (int, number), scipy.sparse matrix}
        Documents in BOW format. A scipy sparse matrix is wrapped with
        ``Sparse2Corpus(..., documents_columns=False)``.

    Returns
    -------
    :class:`~gensim.sklearn_api.hdp.HdpTransformer`
        The trained model (``self``).

    """
    corpus = X
    if sparse.issparse(corpus):
        corpus = matutils.Sparse2Corpus(sparse=corpus, documents_columns=False)

    if self.gensim_model is None:
        hdp_params = dict(
            id2word=self.id2word,
            max_chunks=self.max_chunks,
            max_time=self.max_time,
            chunksize=self.chunksize,
            kappa=self.kappa,
            tau=self.tau,
            K=self.K,
            T=self.T,
            alpha=self.alpha,
            gamma=self.gamma,
            eta=self.eta,
            scale=self.scale,
            var_converge=self.var_converge,
            outputdir=self.outputdir,
            random_state=self.random_state,
        )
        self.gensim_model = models.HdpModel(**hdp_params)

    self.gensim_model.update(corpus=corpus)
    return self
def fit(self, X, y=None):
    """
    Train a ``gensim.models.LdaModel`` on `X` with the parameters stored on
    this object and keep it in ``self.gensim_model``.

    A scipy sparse `X` is wrapped with ``matutils.Sparse2Corpus`` first.
    `y` is ignored. Returns ``self``.
    """
    corpus = matutils.Sparse2Corpus(X) if sparse.issparse(X) else X
    lda_params = dict(
        num_topics=self.num_topics,
        id2word=self.id2word,
        chunksize=self.chunksize,
        passes=self.passes,
        update_every=self.update_every,
        alpha=self.alpha,
        eta=self.eta,
        decay=self.decay,
        offset=self.offset,
        eval_every=self.eval_every,
        iterations=self.iterations,
        gamma_threshold=self.gamma_threshold,
        minimum_probability=self.minimum_probability,
        random_state=self.random_state,
    )
    self.gensim_model = models.LdaModel(corpus=corpus, **lda_params)
    return self
def evaluate_improved_cllsi(x_train1_in, x_test1_in, x_train2_in, x_test2_in, dimensions, evaluation_function):
    """Score a cross-lingual LSI projection for each number of topics.

    The tf-idf weighted training matrices of both sides are stacked
    vertically and an ``LsiModel`` is trained on the stack for every ``k`` in
    `dimensions`. The left singular vectors are split back into the two
    sides, pseudo-inverted, and used to project the respective test
    documents; `evaluation_function` is called with the two projections.

    Parameters
    ----------
    x_train1_in, x_test1_in, x_train2_in, x_test2_in
        Train/test corpora of the two sides, as accepted by ``tfidf``.
    dimensions : iterable of int
        Numbers of LSI topics to evaluate.
    evaluation_function : callable
        Called as ``evaluation_function(a1, a2)`` with the projected test
        sets.

    Returns
    -------
    list
        One score per entry of `dimensions`, in order.
    """
    # Everything up to the LSI training is independent of k, so it is
    # computed once instead of once per dimension.
    x_train1, x_test1 = tfidf(data=(x_train1_in, x_test1_in))
    x_train2, x_test2 = tfidf(data=(x_train2_in, x_test2_in))
    n_train = len(x_train1)

    # Build one matrix per side (terms x documents) so that train and test
    # share the same term dimension, then split the columns again.
    X1 = matutils.corpus2csc(list(x_train1) + list(x_test1))
    X2 = matutils.corpus2csc(list(x_train2) + list(x_test2))
    x_train1, x_train2 = X1[:, :n_train], X2[:, :n_train]
    x_test1, x_test2 = X1[:, n_train:], X2[:, n_train:]

    stacked = matutils.Sparse2Corpus(sp.sparse.vstack([x_train1, x_train2]))
    # Number of rows contributed by side 1 in the stacked matrix.
    n = x_train1.shape[0]

    scores = []
    for k in dimensions:
        lsa = models.LsiModel(stacked, num_topics=k)
        U = lsa.projection.u
        U1, U2 = U[:n, :], U[n:, :]
        p1 = sp.sparse.csr_matrix(np.linalg.pinv(U1))
        p2 = sp.sparse.csr_matrix(np.linalg.pinv(U2))
        a1 = np.dot(x_test1.T, p1.T).todense()
        a2 = np.dot(x_test2.T, p2.T).todense()
        scores.append(evaluation_function(a1, a2))
    return scores
def fit(self, X, y=None):
    """
    Train a ``gensim.models.HdpModel`` on `X` with the parameters stored on
    this object and keep it in ``self.gensim_model``.

    A scipy sparse `X` is wrapped with ``matutils.Sparse2Corpus`` first.
    `y` is ignored. Returns ``self``.
    """
    corpus = matutils.Sparse2Corpus(X) if sparse.issparse(X) else X
    hdp_params = dict(
        id2word=self.id2word,
        max_chunks=self.max_chunks,
        max_time=self.max_time,
        chunksize=self.chunksize,
        kappa=self.kappa,
        tau=self.tau,
        K=self.K,
        T=self.T,
        alpha=self.alpha,
        gamma=self.gamma,
        eta=self.eta,
        scale=self.scale,
        var_converge=self.var_converge,
        outputdir=self.outputdir,
        random_state=self.random_state,
    )
    self.gensim_model = models.HdpModel(corpus=corpus, **hdp_params)
    return self
def partial_fit(self, X):
    """
    Train model over X by passing it to ``self.add_documents``.

    A scipy sparse `X` is wrapped with ``matutils.Sparse2Corpus`` first.
    Returns ``self`` so that calls can be chained, as the other
    ``fit``/``partial_fit`` methods do.
    """
    if sparse.issparse(X):
        X = matutils.Sparse2Corpus(X)
    self.add_documents(corpus=X)
    return self
def partial_fit(self, X):
    """
    Train model over X.

    Creates an untrained ``models.HdpModel`` from the stored parameters when
    ``self.gensim_model`` is ``None``, then calls its ``update`` with `X`
    (wrapped with ``matutils.Sparse2Corpus`` when it is a scipy sparse
    matrix). Returns ``self``.
    """
    corpus = matutils.Sparse2Corpus(X) if sparse.issparse(X) else X

    if self.gensim_model is None:
        hdp_params = dict(
            id2word=self.id2word,
            max_chunks=self.max_chunks,
            max_time=self.max_time,
            chunksize=self.chunksize,
            kappa=self.kappa,
            tau=self.tau,
            K=self.K,
            T=self.T,
            alpha=self.alpha,
            gamma=self.gamma,
            eta=self.eta,
            scale=self.scale,
            var_converge=self.var_converge,
            outputdir=self.outputdir,
            random_state=self.random_state,
        )
        self.gensim_model = models.HdpModel(**hdp_params)

    self.gensim_model.update(corpus=corpus)
    return self
def fit_lda(X, vocab, num_topics=5, passes=20):
    """
    Fit LDA from a scipy CSR matrix (X).

    `X` is transposed before being wrapped as a gensim corpus, and `vocab`
    is enumerated to build the ``id2word`` mapping. Returns the trained
    ``LdaModel``.
    """
    print('fitting lda...')
    corpus = matutils.Sparse2Corpus(X.T)
    id2word = dict(enumerate(vocab))
    return LdaModel(corpus,
                    num_topics=num_topics,
                    passes=passes,
                    id2word=id2word)
def create_corpus(dtm):
    """Build a gensim corpus from a document-term matrix.

    `dtm` is transposed, converted to a scipy CSR matrix and wrapped with
    ``matutils.Sparse2Corpus``.
    """
    term_document = scipy.sparse.csr_matrix(dtm.transpose())
    return matutils.Sparse2Corpus(term_document)
def partial_fit(self, X):
    """Add the documents in `X` to the LSI model, creating it on first use.

    If ``self.gensim_model`` is still ``None`` an untrained
    ``models.LsiModel`` is built from the constructor parameters; in both
    cases the model's ``add_documents`` is then called with `X`.

    Parameters
    ----------
    X : {iterable of list of (int, number), scipy.sparse matrix}
        Stream of document vectors, or a scipy sparse matrix, which is
        wrapped with ``Sparse2Corpus(..., documents_columns=False)``.

    Returns
    -------
    :class:`~gensim.sklearn_api.lsimodel.LsiTransformer`
        The trained model (``self``).

    """
    corpus = X
    if sparse.issparse(corpus):
        corpus = matutils.Sparse2Corpus(sparse=corpus, documents_columns=False)

    if self.gensim_model is None:
        lsi_params = dict(
            num_topics=self.num_topics,
            id2word=self.id2word,
            chunksize=self.chunksize,
            decay=self.decay,
            onepass=self.onepass,
            power_iters=self.power_iters,
            extra_samples=self.extra_samples,
        )
        self.gensim_model = models.LsiModel(**lsi_params)

    self.gensim_model.add_documents(corpus=corpus)
    return self
def fit(self, X, y=None):
    """Train a gensim LDA model on `X` and store it in ``self.gensim_model``.

    Parameters
    ----------
    X : {iterable of iterable of (int, int), scipy.sparse matrix}
        Training documents in BOW format. A scipy sparse matrix is wrapped
        with ``Sparse2Corpus(..., documents_columns=False)``.
    y : ignored
        Accepted for signature compatibility only.

    Returns
    -------
    :class:`~gensim.sklearn_api.ldamodel.LdaTransformer`
        The trained model (``self``).

    """
    corpus = (
        matutils.Sparse2Corpus(sparse=X, documents_columns=False)
        if sparse.issparse(X) else X
    )
    lda_params = dict(
        num_topics=self.num_topics,
        id2word=self.id2word,
        chunksize=self.chunksize,
        passes=self.passes,
        update_every=self.update_every,
        alpha=self.alpha,
        eta=self.eta,
        decay=self.decay,
        offset=self.offset,
        eval_every=self.eval_every,
        iterations=self.iterations,
        gamma_threshold=self.gamma_threshold,
        minimum_probability=self.minimum_probability,
        random_state=self.random_state,
        dtype=self.dtype,
    )
    self.gensim_model = models.LdaModel(corpus=corpus, **lda_params)
    return self
def partial_fit(self, X):
    """Update the LDA model with `X`, creating the model on first use.

    If ``self.gensim_model`` is still ``None`` an untrained
    ``models.LdaModel`` is built from the constructor parameters; in both
    cases the model's ``update`` is then called with `X`.

    Parameters
    ----------
    X : {iterable of iterable of (int, int), scipy.sparse matrix}
        Documents in BOW format. A scipy sparse matrix is wrapped with
        ``Sparse2Corpus(..., documents_columns=False)``.

    Returns
    -------
    :class:`~gensim.sklearn_api.ldamodel.LdaTransformer`
        The trained model (``self``).

    """
    corpus = X
    if sparse.issparse(corpus):
        corpus = matutils.Sparse2Corpus(sparse=corpus, documents_columns=False)

    if self.gensim_model is None:
        lda_params = dict(
            num_topics=self.num_topics,
            id2word=self.id2word,
            chunksize=self.chunksize,
            passes=self.passes,
            update_every=self.update_every,
            alpha=self.alpha,
            eta=self.eta,
            decay=self.decay,
            offset=self.offset,
            eval_every=self.eval_every,
            iterations=self.iterations,
            gamma_threshold=self.gamma_threshold,
            minimum_probability=self.minimum_probability,
            random_state=self.random_state,
            dtype=self.dtype,
        )
        self.gensim_model = models.LdaModel(**lda_params)

    self.gensim_model.update(corpus=corpus)
    return self
def infer_topics(self, num_topics=10):
    """Train an LSI model on the corpus and store its two result matrices.

    Sets ``self.nb_topics``, ``self.topic_word_matrix`` (CSR, shape
    ``(nb_topics, len(vocabulary))``) and ``self.document_topic_matrix``
    (dense, transposed output of ``corpus2dense``). The gensim view of the
    corpus is created on demand and reset to ``None`` at the end.
    """
    corpus = self.corpus
    if corpus.gensim_vector_space is None:
        corpus.gensim_vector_space = matutils.Sparse2Corpus(
            corpus.sklearn_vector_space, documents_columns=False)
    self.nb_topics = num_topics

    lsa = models.LsiModel(corpus=corpus.gensim_vector_space,
                          id2word=corpus.vocabulary,
                          num_topics=num_topics)
    topic_descriptions = list(lsa.show_topics(
        num_topics=num_topics,
        num_words=len(corpus.vocabulary),
        formatted=False))

    # NOTE(review): each description is unpacked as (weight, word_id) pairs;
    # the shape returned by show_topics depends on the gensim version --
    # confirm against the pinned release.
    row, col, data = [], [], []
    for topic_id in range(self.nb_topics):
        for weight, word_id in topic_descriptions[topic_id]:
            row.append(topic_id)
            col.append(word_id)
            data.append(weight)

    matrix_shape = (self.nb_topics, len(corpus.vocabulary))
    self.topic_word_matrix = coo_matrix(
        (data, (row, col)), shape=matrix_shape).tocsr()

    dense_topics = matutils.corpus2dense(
        lsa[corpus.gensim_vector_space], num_topics, corpus.size)
    self.document_topic_matrix = np.transpose(dense_topics)
    corpus.gensim_vector_space = None
def infer_topics(self, num_topics=10):
    """Train an LDA model on the corpus and store its two result matrices.

    Sets ``self.nb_topics``, ``self.topic_word_matrix`` (CSR, shape
    ``(nb_topics, len(vocabulary))``) and ``self.document_topic_matrix``
    (CSR, transposed output of ``corpus2dense``). The gensim view of the
    corpus is created on demand and reset to ``None`` at the end.
    """
    corpus = self.corpus
    if corpus.gensim_vector_space is None:
        corpus.gensim_vector_space = matutils.Sparse2Corpus(
            corpus.sklearn_vector_space, documents_columns=False)
    self.nb_topics = num_topics

    lda = models.LdaModel(corpus=corpus.gensim_vector_space,
                          iterations=10000,
                          num_topics=num_topics)
    topic_descriptions = list(lda.show_topics(
        num_topics=num_topics,
        num_words=len(corpus.vocabulary),
        formatted=False))

    # Each description is indexed at [1] for its (word_id, probability)
    # pairs; word ids are converted with int() before use as column indices.
    row, col, data = [], [], []
    for topic_id in range(self.nb_topics):
        for word_id, probability in topic_descriptions[topic_id][1]:
            row.append(topic_id)
            col.append(int(word_id))
            data.append(probability)

    matrix_shape = (self.nb_topics, len(corpus.vocabulary))
    self.topic_word_matrix = coo_matrix(
        (data, (row, col)), shape=matrix_shape).tocsr()

    dense_topics = matutils.corpus2dense(
        lda[corpus.gensim_vector_space], num_topics, corpus.size)
    self.document_topic_matrix = sparse.csr_matrix(np.transpose(dense_topics))
    corpus.gensim_vector_space = None
def main(K, numfeatures, sample_file, num_display_words, outputfile):
    """Fit an LDA model with `K` topics and write its top words to a file.

    Parameters
    ----------
    K : int
        Number of topics.
    numfeatures
        Passed to ``idfVectorizer`` together with `sample_file`.
    sample_file
        Input handed to ``idfVectorizer``.
    num_display_words : int
        Number of words listed per topic.
    outputfile : str
        Path of the text file that receives the topic listing.
    """
    K_clusters = K
    vectorizer = idfVectorizer(sample_file, numfeatures)

    t0 = time()
    print("Applying topic modeling, using LDA")
    print(str(K_clusters) + "topics")
    corpus = matutils.Sparse2Corpus(vectorizer.X, documents_columns=False)
    lda = models.ldamodel.LdaModel(corpus, num_topics=K_clusters,
                                   id2word=vectorizer.id2words)
    print("done in %fs" % (time() - t0))

    # With formatted=False every item carries its (term, weight) pairs at
    # index 1. Query the model once and reuse the result below.
    topics = lda.show_topics(num_topics=K_clusters,
                             num_words=num_display_words,
                             formatted=False)

    output_text = []
    for i, item in enumerate(topics):
        output_text.append("Topic: " + str(i))
        for term, weight in item[1]:
            output_text.append(term + ":" + str(weight))

    # Single-argument print() behaves the same on Python 2 and 3; the double
    # space reproduces the output of the former print statement.
    print("writing topics to file:  %s" % (outputfile,))
    with open(outputfile, 'w') as f:
        f.write('\n'.join(output_text))

    # NOTE(review): output_json is built but not written or returned in the
    # visible code -- confirm whether a JSON dump is missing here.
    output_json = []
    for item in topics:
        topic_terms = {term: str(weight) for term, weight in item[1]}
        output_json.append(topic_terms)
def createModel(self, corpus, dictionary, info):
    """Train and save a topic model, or load it when already on disk.

    The model file is ``'TopicModel/' + info.data + '_' + info.identifier``.
    If it exists the model is loaded from it; otherwise a model of type
    ``self.name`` (``'LDA'`` or ``'LSI'``) is trained on `corpus`, stored in
    ``self.model`` and saved to that path.

    Parameters
    ----------
    corpus : list or sparse matrix
        Training corpus. Anything that is not a list is wrapped with
        ``Sparse2Corpus(..., documents_columns=False)``.
    dictionary
        ``id2word`` mapping passed to the model.
    info
        Settings object; reads ``data``, ``identifier``, ``numberTopics``
        and, for LDA, ``multicore``, ``passes``, ``iterations``, ``online``
        and ``chunksize``.
    """
    logging.basicConfig(format='%(asctime)s: %(levelname)s : %(message)s',
                        level=logging.INFO)
    path = 'TopicModel/' + info.data + '_' + info.identifier

    if not isinstance(corpus, list):
        corpus = matutils.Sparse2Corpus(corpus, documents_columns=False)

    if os.path.exists(path):
        print('Load Model')
        # Load with the class that saved the file: an LSI model keeps part
        # of its state in a side file that only LsiModel.load restores.
        loader = models.LsiModel if self.name == 'LSI' else models.LdaModel
        self.model = loader.load(path)
        return

    if self.name == 'LDA':
        if info.multicore:
            self.model = models.LdaMulticore(
                corpus, num_topics=info.numberTopics, id2word=dictionary,
                passes=info.passes, iterations=info.iterations, batch=0)
        else:
            self.model = models.LdaModel(
                corpus, num_topics=info.numberTopics, id2word=dictionary,
                passes=info.passes, iterations=info.iterations,
                update_every=info.online, chunksize=info.chunksize)
    elif self.name == 'LSI':
        self.model = models.LsiModel(corpus, info.numberTopics, dictionary)
        self.info = str(self.model)
    else:
        print('Unkown Model type')
        # Nothing was trained, so there is nothing to save.
        return

    print('save Model')
    self.model.save(path)
def partial_fit(self, X):
    """
    Train model over X.

    By default, 'online (single-pass)' mode is used for training the LDA
    model. Configure `passes` and `update_every` params at init to choose
    the mode among :
        - online (single-pass): update_every != None and passes == 1
        - online (multi-pass): update_every != None and passes > 1
        - batch: update_every == None

    An untrained ``models.LdaModel`` is created from the stored parameters
    when ``self.gensim_model`` is ``None``. Returns ``self``.
    """
    corpus = matutils.Sparse2Corpus(X) if sparse.issparse(X) else X

    if self.gensim_model is None:
        lda_params = dict(
            num_topics=self.num_topics,
            id2word=self.id2word,
            chunksize=self.chunksize,
            passes=self.passes,
            update_every=self.update_every,
            alpha=self.alpha,
            eta=self.eta,
            decay=self.decay,
            offset=self.offset,
            eval_every=self.eval_every,
            iterations=self.iterations,
            gamma_threshold=self.gamma_threshold,
            minimum_probability=self.minimum_probability,
            random_state=self.random_state,
        )
        self.gensim_model = models.LdaModel(**lda_params)

    self.gensim_model.update(corpus=corpus)
    return self
def fit(self, X, y=None):
    """Train a gensim LSI model on `X` and store it in ``self.gensim_model``.

    Parameters
    ----------
    X : {iterable of list of (int, number), scipy.sparse matrix}
        Training documents in BOW format. A scipy sparse matrix is wrapped
        with ``Sparse2Corpus(..., documents_columns=False)``.
    y : ignored
        Accepted for signature compatibility only.

    Returns
    -------
    :class:`~gensim.sklearn_api.lsimodel.LsiTransformer`
        The trained model (``self``).

    """
    corpus = (
        matutils.Sparse2Corpus(sparse=X, documents_columns=False)
        if sparse.issparse(X) else X
    )
    lsi_params = dict(
        num_topics=self.num_topics,
        id2word=self.id2word,
        chunksize=self.chunksize,
        decay=self.decay,
        onepass=self.onepass,
        power_iters=self.power_iters,
        extra_samples=self.extra_samples,
    )
    self.gensim_model = models.LsiModel(corpus=corpus, **lsi_params)
    return self
def _extract_data(topic_model, corpus, dictionary, doc_topic_dists=None):
    """Compute and validate the document-topic distribution of a model.

    Parameters
    ----------
    topic_model
        Trained gensim topic model. Models exposing ``lda_alpha`` /
        ``lda_beta`` are handled through those attributes; otherwise
        ``num_topics`` and ``state.get_lambda()`` are used.
    corpus
        Either a gensim corpus or a sparse terms-by-documents matrix.
    dictionary
        Dictionary exposing ``token2id``; its length is the number of terms.
    doc_topic_dists : list, sparse matrix or array, optional
        Pre-computed document-topic weights. When omitted they are inferred
        with ``topic_model.inference``.

    Returns
    -------
    Row-normalised document-topic matrix, one row per document.

    Raises
    ------
    AssertionError
        If term counts, document counts or topic counts are inconsistent.
    """
    if not matutils.ismatrix(corpus):
        corpus_csc = matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
    # Need corpus to be a streaming gensim list corpus for len and inference
    # functions below; wrapping the matrix also covers one-shot iterables
    # that corpus2csc has already consumed.
    corpus = matutils.Sparse2Corpus(corpus_csc)

    # Floor for term frequencies so they are never exactly zero.
    beta = 0.01
    fnames_argsort = np.asarray(list(dictionary.token2id.values()),
                                dtype=np.int_)
    term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
    term_freqs[term_freqs == 0] = beta
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    assert term_freqs.shape[0] == len(
        dictionary
    ), 'Term frequencies and dictionary have different shape {} != {}'.format(
        term_freqs.shape[0], len(dictionary))
    assert doc_lengths.shape[0] == len(
        corpus
    ), 'Document lengths and corpus have different sizes {} != {}'.format(
        doc_lengths.shape[0], len(corpus))

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # If its an HDP model.
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
        else:
            gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    else:
        if isinstance(doc_topic_dists, list):
            doc_topic_dists = matutils.corpus2dense(doc_topic_dists,
                                                    num_topics).T
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        # Normalise each row. The row sums are reshaped to a column so the
        # division also broadcasts correctly for plain ndarrays, where
        # sum(axis=1) is one-dimensional.
        row_sums = doc_topic_dists.sum(axis=1).reshape(-1, 1)
        doc_topic_dists = doc_topic_dists / row_sums

    assert doc_topic_dists.shape[
        1] == num_topics, 'Document topics and number of topics do not match {} != {}'.format(
            doc_topic_dists.shape[1], num_topics)

    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
    else:
        topic = topic_model.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    topic_term_dists = topic[:, fnames_argsort]

    assert topic_term_dists.shape[0] == doc_topic_dists.shape[1]
    return doc_topic_dists
def partial_fit(self, X):
    """
    Train model over X by passing it to ``self.update``.

    A scipy sparse `X` is wrapped with ``matutils.Sparse2Corpus`` first.
    Returns ``self`` so that calls can be chained, as the other
    ``fit``/``partial_fit`` methods do.
    """
    if sparse.issparse(X):
        X = matutils.Sparse2Corpus(X)
    self.update(corpus=X)
    return self
def ns(numpy_matrix, number_of_corpus_features, scipy_sparse_matrix):
    """Round-trip dense and sparse matrices through gensim corpus wrappers.

    Converts `numpy_matrix` to a corpus and back to a dense matrix with
    `number_of_corpus_features` terms, then `scipy_sparse_matrix` to a
    corpus and back to CSC. The results are only bound to local names; the
    function returns ``None``.
    """
    dense_corpus = matutils.Dense2Corpus(numpy_matrix)
    numpy_matrix = matutils.corpus2dense(
        dense_corpus, num_terms=number_of_corpus_features)

    sparse_corpus = matutils.Sparse2Corpus(scipy_sparse_matrix)
    scipy_csc_matrix = matutils.corpus2csc(sparse_corpus)
def fit_lda(X, vocab, num_topics=10, passes=20, alpha=0.001):
    '''
    fit LDA from a scipy CSR matrix (X).

    `X` is wrapped as a gensim corpus without transposing, and `vocab` is
    enumerated to build the ``id2word`` mapping. Returns the trained
    ``LdaModel``.
    '''
    print("fitting lda...")
    corpus = matutils.Sparse2Corpus(X)
    id2word = dict(enumerate(vocab))
    return LdaModel(corpus,
                    num_topics=num_topics,
                    passes=passes,
                    alpha=alpha,
                    id2word=id2word)