Example #1
    def fit_universal_models(self):

        vec = CountVectorizer(stop_words='english', max_features=10000)
        vec_t = vec.fit_transform(' '.join(x) for x in self.all_sentences)

        # map column indices back to tokens for gensim (Python 3: items(), not iteritems())
        id2word = {v: k for k, v in vec.vocabulary_.items()}
        vec_corpus = gensim.matutils.Sparse2Corpus(vec_t.T)

        if os.path.isfile('lda.modl'):
            lda = LdaMulticore.load('lda.modl')
        else:
            lda = LdaMulticore(corpus=vec_corpus,
                               id2word=id2word,
                               iterations=200,
                               num_topics=2,
                               passes=10,
                               workers=4)
            lda.save('lda.modl')

        all_counts = vec.transform(' '.join(x) for x in self.all_sentences)
        self.d['all']['_probas'] = np.array(
            lda.inference(gensim.matutils.Sparse2Corpus(all_counts.T))[0])
        labeled_counts = vec.transform(' '.join(x) for x in self.X)
        self.d['labeled']['_probas'] = np.array(
            lda.inference(gensim.matutils.Sparse2Corpus(labeled_counts.T))[0])

        # gensim < 4.0 API: newer versions use vector_size= and wv.vectors instead of size= and syn0
        w2vmodel = Word2Vec(self.all_sentences,
                            size=100,
                            window=5,
                            min_count=3,
                            workers=4)

        # run k-means several times and keep the centroids with the lowest SSE
        best_centroids = None
        best_score = None
        for _ in range(10):  # todo -- implement kmeans++ instead of best of 10
            km = Kmeans(50)
            km.fit(w2vmodel.syn0)
            score = km.compute_sse(w2vmodel.syn0)
            if best_score is None or score < best_score:
                best_score = score
                best_centroids = km.centroids
        km.centroids = best_centroids

        self.tfidf = TfidfVectorizer(stop_words=set(stopwords.words()))
        self.d['all']['_t'] = self.tfidf.fit_transform(
            ' '.join(x) for x in self.all_sentences)
        self.d['labeled']['_t'] = self.tfidf.transform(' '.join(x)
                                                       for x in self.X)

        self.d['all']['_kmeans'] = np.array(
            kmeans_word2vecify(self.all_sentences, w2vmodel, km,
                               self.d['all']['_t'], self.tfidf))
        self.d['labeled']['_kmeans'] = np.array(
            kmeans_word2vecify(self.X, w2vmodel, km, self.d['labeled']['_t'],
                               self.tfidf))
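A note on the Sparse2Corpus calls above: gensim's Sparse2Corpus treats each column of the sparse matrix as a document by default, while CountVectorizer produces one document per row, hence the .T. A minimal sketch of the two equivalent orientations (the sample texts are invented):

from sklearn.feature_extraction.text import CountVectorizer
from gensim.matutils import Sparse2Corpus

texts = ["the cat sat", "the dog barked"]  # hypothetical sample documents
vec = CountVectorizer()
counts = vec.fit_transform(texts)          # shape: (n_docs, n_terms)

# either transpose so documents become columns (the default orientation)...
corpus_a = Sparse2Corpus(counts.T)
# ...or keep documents as rows and say so explicitly (as in Example #5)
corpus_b = Sparse2Corpus(counts, documents_columns=False)

# both yield the same stream of (token_id, count) documents
assert list(corpus_a)[0] == list(corpus_b)[0]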
Example #2
class LdaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, dim=2, column='whole'):
        self.dim = dim
        self.column = column

    def fit(self, X, y=None):
        lda_tokens = X[self.column].apply(lambda x: x.split())
        # create a Dictionary and train it on the text corpus
        self.lda_dic = Dictionary(lda_tokens)
        self.lda_dic.filter_extremes(no_below=10, no_above=0.6, keep_n=8000)
        lda_corpus = [self.lda_dic.doc2bow(doc) for doc in lda_tokens]
        # create a TfidfModel and train it on the text corpus
        self.lda_tfidf = TfidfModel(lda_corpus)
        lda_corpus = self.lda_tfidf[lda_corpus]
        # create an LDA model and train it on the text corpus
        self.lda_model = LdaMulticore(
            lda_corpus, num_topics=self.dim, id2word=self.lda_dic, workers=4,
            passes=20, chunksize=1000, random_state=0
        )
        return self

    def transform(self, X, y=None):
        # querying the model with an empty document yields one entry per topic,
        # i.e. the embedding length
        lda_emb_len = len(self.lda_model[[]])
        lda_corpus = [self.lda_dic.doc2bow(doc)
                      for doc in X[self.column].apply(lambda x: x.split())]
        lda_corpus = self.lda_tfidf[lda_corpus]
        lda_que_embs = self.lda_model.inference(lda_corpus)[0]
        # copy the LDA embeddings into the output matrix, one row per document
        out = np.zeros((len(X), lda_emb_len))
        for i in range(lda_emb_len):
            out[:, i] = lda_que_embs[:, i]
        return out
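LdaTransformer expects a pandas DataFrame with a text column (default 'whole') and returns one row of topic weights per document. A minimal usage sketch; the imports and toy data below are assumptions, not part of the original snippet:

import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore, TfidfModel
from sklearn.base import BaseEstimator, TransformerMixin

# toy corpus; real data needs enough documents for filter_extremes(no_below=10)
df = pd.DataFrame({'whole': ["some text about cats"] * 20
                            + ["other text about dogs"] * 20})

lda_features = LdaTransformer(dim=2, column='whole')
embs = lda_features.fit(df).transform(df)  # shape: (len(df), num_topics)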
Example #3
    def make_factors(self):

        cor = to_corpus(self.artist_BoWs.values())
        # num_topics is assumed to be a module-level constant
        lda = LdaMulticore(cor, num_topics=num_topics)

        for artist_id, BoW in self.artist_BoWs.items():
            # convert a dense count vector into the (token_id, count) format gensim expects
            formatted_BoW = [(i, x) for i, x in enumerate(BoW)]
            self.artist_factors[artist_id] = lda.inference([formatted_BoW])[0][0]

        del self.artist_BoWs
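The list comprehension above just converts a dense count vector into gensim's bag-of-words format, a list of (token_id, count) pairs; dropping zero entries gives an equivalent but smaller document. A tiny illustration with invented counts:

dense_counts = [0, 2, 0, 1]  # counts indexed by token id
bow = [(i, x) for i, x in enumerate(dense_counts)]              # as in the loop above
bow_sparse = [(i, x) for i, x in enumerate(dense_counts) if x]  # [(1, 2), (3, 1)]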
Example #4
    def fit_transform(self, dataset: Dataset, name: str) -> TopicModel:
        # https://radimrehurek.com/gensim/models/ldamulticore.html#module-gensim.models.ldamulticore
        model = LdaMulticore(
            corpus=dataset.get_gensim_corpus(),
            id2word=dataset.get_gensim_vocab(),
            num_topics=self.n,
            random_state=get_seed(),
            **self.kwargs,
        )
        # To get back DT matrix https://github.com/bmabey/pyLDAvis/blob/master/pyLDAvis/gensim_models.py
        topic_word_matrix = model.get_topics()
        doc_topic_matrix = model.inference(dataset.get_gensim_corpus())[0]

        self.model = model
        return TopicModel.from_array(name, topic_word_matrix, doc_topic_matrix)
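One caveat: inference() returns the unnormalized variational gamma parameters rather than probabilities. If TopicModel.from_array expects a proper document-topic distribution, each row should be normalized first, as Example #5 below does. A sketch, reusing the names from this snippet:

gamma = model.inference(dataset.get_gensim_corpus())[0]
# divide each row by its sum to get per-document topic probabilities
doc_topic_matrix = gamma / gamma.sum(axis=1, keepdims=True)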
Example #5
class Lda(BaseEstimator, TransformerMixin):
    def __init__(self, id2word=None, num_topics=25, passes=1):
        self.lda = None
        self.id2word = id2word
        self.num_topics = num_topics
        self.passes = passes

    def fit(self, X, y=None):
        """
        Parameter
        ---------
        X : [sp.csr_matrix]

        Returns
        -------
        self
        """
        if self.lda is None:
            self.lda = LdaMulticore(id2word=self.id2word,
                                    num_topics=self.num_topics,
                                    passes=self.passes)
        X_flat = sp.vstack(X)
        self.lda.update(Sparse2Corpus(X_flat, documents_columns=False))
        return self

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    def transform(self, X):
        """
        Parameter
        ---------
        X : [sp.csr_matrix]

        Returns
        -------
        topic_vectors : [np.ndarray]
            each matrix is of shape (sent_count, topic_count)
        """
        topic_vectors = []
        for doc in X:
            sents_bow = Sparse2Corpus(doc, documents_columns=False)
            gamma, _ = self.lda.inference(sents_bow)
            # divide row by row sum
            topic_dist = (gamma.T / np.sum(gamma, axis=1)).T
            topic_vectors.append(topic_dist)
        return topic_vectors
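The Lda transformer takes a list of csr matrices, one per document, each with one sentence per row (hence documents_columns=False). A minimal usage sketch with made-up data; the random counts and vocabulary are assumptions:

import numpy as np
import scipy.sparse as sp

# two hypothetical documents, each a (sent_count, vocab_size) count matrix
rng = np.random.default_rng(0)
docs = [sp.csr_matrix(rng.integers(0, 3, size=(4, 50))),
        sp.csr_matrix(rng.integers(0, 3, size=(6, 50)))]

id2word = {i: 'word%d' % i for i in range(50)}  # hypothetical vocabulary
lda = Lda(id2word=id2word, num_topics=5)
topic_vectors = lda.fit_transform(docs)         # list of (sent_count, 5) arrays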