def fit_universal_models(self):
    """Fit the shared unsupervised models (LDA topics, word2vec k-means,
    tf-idf) and cache their outputs on ``self.d``.

    Side effects:
      - sets self.d['all'|'labeled']['_probas'] (LDA topic inferences)
      - sets self.d['all'|'labeled']['_t'] (tf-idf matrices) and self.tfidf
      - sets self.d['all'|'labeled']['_kmeans'] (word2vec cluster features)
      - persists the trained LDA model to 'lda.modl' on first fit
    """
    vec = CountVectorizer(stop_words='english', max_features=10000)
    vec_t = vec.fit_transform(' '.join(x) for x in self.all_sentences)
    # gensim wants id -> token; sklearn's vocabulary_ maps token -> id.
    # (.items() replaces the Python-2-only .iteritems())
    id2word = {v: k for k, v in vec.vocabulary_.items()}
    vec_corpus = gensim.matutils.Sparse2Corpus(vec_t.T)

    # Reuse a previously trained LDA model when available; training is slow.
    if os.path.isfile('lda.modl'):
        lda = LdaMulticore.load('lda.modl')
    else:
        lda = LdaMulticore(corpus=vec_corpus, id2word=id2word,
                           iterations=200, num_topics=2, passes=10, workers=4)
        lda.save('lda.modl')

    all_counts = vec.transform(' '.join(x) for x in self.all_sentences)
    self.d['all']['_probas'] = np.array(
        lda.inference(gensim.matutils.Sparse2Corpus(all_counts.T))[0])
    labeled_counts = vec.transform(' '.join(x) for x in self.X)
    self.d['labeled']['_probas'] = np.array(
        lda.inference(gensim.matutils.Sparse2Corpus(labeled_counts.T))[0])

    # NOTE(review): Word2Vec(size=...) and .syn0 are gensim<4 APIs
    # (vector_size / .wv.vectors in gensim>=4) -- confirm the pinned version.
    w2vmodel = Word2Vec(self.all_sentences, size=100, window=5,
                        min_count=3, workers=4)

    # Best-of-10 random restarts to mitigate bad k-means initialisation.
    # (xrange replaced by range for Python 3.)
    best_centroids = None
    best_score = None
    for _ in range(10):  # todo -- implement kmeans++ instead of best of 10
        km = Kmeans(50)
        km.fit(w2vmodel.syn0)
        score = km.compute_sse(w2vmodel.syn0)
        if best_score is None or score < best_score:
            best_score = score
            best_centroids = km.centroids
    # Keep the last fitted model object but swap in the winning centroids.
    km.centroids = best_centroids

    self.tfidf = TfidfVectorizer(stop_words=set(stopwords.words()))
    self.d['all']['_t'] = self.tfidf.fit_transform(
        ' '.join(x) for x in self.all_sentences)
    self.d['labeled']['_t'] = self.tfidf.transform(' '.join(x) for x in self.X)
    self.d['all']['_kmeans'] = np.array(
        kmeans_word2vecify(self.all_sentences, w2vmodel, km,
                           self.d['all']['_t'], self.tfidf))
    self.d['labeled']['_kmeans'] = np.array(
        kmeans_word2vecify(self.X, w2vmodel, km,
                           self.d['labeled']['_t'], self.tfidf))
class LdaTransformer(BaseEstimator, TransformerMixin):
    """sklearn-style transformer embedding a text column into LDA topic
    space (tf-idf weighted bag-of-words -> LdaMulticore inference).

    Parameters
    ----------
    dim : int
        Number of LDA topics.
    column : str
        Name of the text column of the input frame to embed.
    """

    def __init__(self, dim=2, column='whole'):
        self.dim = dim
        self.column = column

    def fit(self, X, y=None):
        """Build the Dictionary, TfidfModel and LdaMulticore from X[column].

        Assumes X[self.column] holds whitespace-separable text.
        """
        lda_tokens = X[self.column].apply(lambda x: x.split())
        # create Dictionary and train it on text corpus
        self.lda_dic = Dictionary(lda_tokens)
        self.lda_dic.filter_extremes(no_below=10, no_above=0.6, keep_n=8000)
        lda_corpus = [self.lda_dic.doc2bow(doc) for doc in lda_tokens]
        # create TfidfModel and train it on text corpus
        self.lda_tfidf = TfidfModel(lda_corpus)
        lda_corpus = self.lda_tfidf[lda_corpus]
        # create LDA Model and train it on text corpus
        self.lda_model = LdaMulticore(
            lda_corpus, num_topics=self.dim, id2word=self.lda_dic,
            workers=4, passes=20, chunksize=1000, random_state=0
        )
        return self

    def transform(self, X, y=None):
        """Return a (len(X), n_topics) float array of LDA topic weights."""
        # number of topics the model reports for an empty document
        lda_emb_len = len(self.lda_model[[]])
        lda_corpus = [self.lda_dic.doc2bow(doc)
                      for doc in X[self.column].apply(lambda x: x.split())]
        lda_corpus = self.lda_tfidf[lda_corpus]
        lda_que_embs = self.lda_model.inference(lda_corpus)[0]
        # Previously copied column-by-column into a zeros array in a Python
        # loop; a single vectorized slice-and-cast is equivalent and clearer.
        return np.asarray(lda_que_embs[:, :lda_emb_len], dtype=float)
def make_factors(self):
    """Derive a topic-factor vector for each artist from its bag of words."""
    corpus = to_corpus(self.artist_BoWs.values())
    lda = LdaMulticore(corpus, num_topics=num_topics)
    for artist_id, bow in self.artist_BoWs.items():
        # gensim expects a sparse list of (token_id, count) pairs
        gensim_bow = list(enumerate(bow))
        self.artist_factors[artist_id] = lda.inference([gensim_bow])[0][0]
    # raw counts are no longer needed once the factors exist
    del self.artist_BoWs
def fit_transform(self, dataset: Dataset, name: str) -> TopicModel:
    """Fit an LdaMulticore model on *dataset* and wrap the result as a
    TopicModel called *name*.

    See https://radimrehurek.com/gensim/models/ldamulticore.html
    """
    lda = LdaMulticore(
        corpus=dataset.get_gensim_corpus(),
        id2word=dataset.get_gensim_vocab(),
        num_topics=self.n,
        random_state=get_seed(),
        **self.kwargs,
    )
    self.model = lda
    # Topic-word weights come straight from the model; the doc-topic matrix
    # is recovered by variational inference over the corpus, as pyLDAvis does
    # (github.com/bmabey/pyLDAvis, pyLDAvis/gensim_models.py).
    word_weights = lda.get_topics()
    doc_weights, _ = lda.inference(dataset.get_gensim_corpus())
    return TopicModel.from_array(name, word_weights, doc_weights)
class Lda(BaseEstimator, TransformerMixin):
    """sklearn-style wrapper around gensim's LdaMulticore for lists of
    per-document sparse matrices (one row per sentence)."""

    def __init__(self, id2word=None, num_topics=25, passes=1):
        # the gensim model is created lazily on the first fit() call
        self.lda = None
        self.id2word = id2word
        self.num_topics = num_topics
        self.passes = passes

    def fit(self, X, y=None):
        """Online-update the topic model with all sentences in X.

        Parameter
        ---------
        X : [sp.csr_matrix]

        Returns
        -------
        self
        """
        if self.lda is None:
            self.lda = LdaMulticore(id2word=self.id2word,
                                    num_topics=self.num_topics,
                                    passes=self.passes)
        # stack every document's sentence rows into one matrix for training
        stacked = sp.vstack(X)
        self.lda.update(Sparse2Corpus(stacked, documents_columns=False))
        return self

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

    def transform(self, X):
        """Infer per-sentence topic distributions for each document.

        Parameter
        ---------
        X : [sp.csr_matrix]

        Returns
        -------
        topic_vectors : [np.ndarray]
            one matrix per document, shape (sent_count, topic_count),
            rows normalized to sum to 1
        """
        topic_vectors = []
        for doc_matrix in X:
            bow = Sparse2Corpus(doc_matrix, documents_columns=False)
            gamma = self.lda.inference(bow)[0]
            # normalize each row of gamma into a probability distribution
            row_sums = np.sum(gamma, axis=1)
            topic_vectors.append((gamma.T / row_sums).T)
        return topic_vectors
# NOTE(review): this class is an exact duplicate of the `Lda` class defined
# earlier in this file; at import time this second definition silently
# shadows the first. One of the two copies should probably be removed.
class Lda(BaseEstimator, TransformerMixin):
    # Wraps gensim's LdaMulticore as an sklearn-style transformer over lists
    # of per-document sparse matrices (one row per sentence).

    def __init__(self, id2word=None, num_topics=25, passes=1):
        # the model is created lazily on the first fit() call so the
        # estimator can be configured/cloned before any training happens
        self.lda = None
        self.id2word = id2word
        self.num_topics = num_topics
        self.passes = passes

    def fit(self, X, y=None):
        """
        Parameter
        ---------
        X : [sp.csr_matrix]

        Returns
        -------
        self
        """
        if self.lda is None:
            self.lda = LdaMulticore(
                id2word=self.id2word,
                num_topics=self.num_topics,
                passes=self.passes)
        # stack all documents' sentence rows into one matrix and perform an
        # online update of the topic model
        X_flat = sp.vstack(X)
        self.lda.update(Sparse2Corpus(X_flat, documents_columns=False))
        return self

    def fit_transform(self, X, y=None):
        # fit first, then transform the same input
        self.fit(X)
        return self.transform(X)

    def transform(self, X):
        """
        Parameter
        ---------
        X : [sp.csr_matrix]

        Returns
        -------
        topic_vectors : [np.ndarray]
            each matrix is of shape (sent_count, topic_count)
        """
        topic_vectors = []
        for doc in X:
            sents_bow = Sparse2Corpus(doc, documents_columns=False)
            # gamma holds unnormalized per-sentence topic weights
            gamma, _ = self.lda.inference(sents_bow)
            # divide row by row sum
            topic_dist = (gamma.T / np.sum(gamma, axis=1)).T
            topic_vectors.append(topic_dist)
        return topic_vectors