# Example #1 (score: 0)
 def LDA(self, num_topics, num_words):
     """Train an LDA model via Vowpal Wabbit on ``self.para_list``.

     Builds a dictionary and bag-of-words corpus from the paragraph token
     lists, fits the model, saves it to ``model/lda_model``, and prints
     the discovered topics.

     :param num_topics: number of topics to fit (and to print).
     :param num_words: number of words to show per printed topic.
     """
     dictionary = corpora.Dictionary(self.para_list)
     doc_term_matrix = [dictionary.doc2bow(para) for para in self.para_list]
     # Hard-coded path to the Vowpal Wabbit binary -- TODO: make configurable.
     path = '/mnt/APPDATA/Project_Mafia/omkhalil/vowpal_binaries/vw-7.20150623'
     self.ldamodel = LdaVowpalWabbit(path,
                                     doc_term_matrix,
                                     num_topics=num_topics,
                                     id2word=dictionary)
     self.ldamodel.save('model/lda_model')
     # Bug fix: previously printed a fixed 10 topics regardless of the
     # num_topics argument this method was given.
     print(self.ldamodel.print_topics(num_topics=num_topics,
                                      num_words=num_words))
 def setUp(self):
     """Build the fixture models used by the coherence sanity checks.

     `topics1` cleanly separates system-human interaction from graphs,
     while `topics2` mixes them, so every coherence measure should rank
     `topics1` above `topics2`. Mallet and Vowpal Wabbit models are only
     constructed when their respective environment variables are set.
     """
     self.topics1 = [['human', 'computer', 'system', 'interface'],
                     ['graph', 'minors', 'trees', 'eps']]
     self.topics2 = [['user', 'graph', 'minors', 'system'],
                     ['time', 'graph', 'survey', 'minors']]
     # Zero passes/iterations: we only need a model object, not a good fit.
     self.ldamodel = LdaModel(corpus=corpus, id2word=dictionary,
                              num_topics=2, passes=0, iterations=0)

     mallet_home = os.environ.get('MALLET_HOME', None)
     if mallet_home:
         self.mallet_path = os.path.join(mallet_home, 'bin', 'mallet')
         self.malletmodel = LdaMallet(mallet_path=self.mallet_path,
                                      corpus=corpus, id2word=dictionary,
                                      num_topics=2, iterations=0)
     else:
         self.mallet_path = None

     vw_path = os.environ.get('VOWPAL_WABBIT_PATH', None)
     if vw_path:
         self.vw_path = vw_path
         self.vwmodel = LdaVowpalWabbit(self.vw_path, corpus=corpus,
                                        id2word=dictionary, num_topics=2,
                                        passes=0)
     else:
         logging.info("Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model")
         self.vw_path = None
# Example #3 (score: 0)
class Contract_Reader():
    """Load a plain-text contract corpus, clean it, and fit an LDA topic model.

    The constructor runs the full pipeline: read the corpus, optionally
    clean paragraphs and/or sentences, print corpus summary statistics,
    train a Vowpal Wabbit LDA model, and save one word-cloud plot per topic.
    """

    def __init__(self, config):
        print('Filepath for texts = ', config.textpath)
        # Raw string: '\.' is an invalid (deprecated) escape in a plain
        # string literal; the pattern itself is unchanged.
        self.corpus = PCR(config.textpath,
                          r'.*\.txt',
                          encoding='utf-16',
                          para_block_reader=read_line_block)
        if config.clean_paragraphs == 'yes':
            self.clean(config, mode='para')
        if config.clean_sentences == 'yes':
            self.clean(config, mode='sent')
        # Corpus summaries
        self.corpus_info()
        self.LDA(config.num_topics, config.num_words)
        self.plot(config.num_words)

    def clean(self, config, mode='sent'):
        """Lower-case, strip stopwords/punctuation, and lemmatize the corpus.

        mode='para' populates ``self.para_list`` (one token list per
        paragraph); mode='sent' populates ``self.sents_list`` (one token
        list per sentence).
        """
        stop = set(stopwords.words('english'))
        exclude = set(string.punctuation)
        lemma = WNL()
        if mode == 'para':
            # Paragraphs are lists of sentences, each of which is a list of
            # tokens; flatten to one token list per paragraph first.
            self.para_list = [
                list(itertools.chain.from_iterable(para))
                for para in self.corpus.paras()
            ]
            for index, paragraph in enumerate(self.para_list):
                self.para_list[index] = self._normalize(
                    " ".join(paragraph), stop, exclude, lemma)
            print(self.para_list[0])
            self.para_list = [para.split() for para in self.para_list]
            print(self.para_list[0])
        if mode == 'sent':
            # Obtain list of strings, each one a sentence, rather than
            # a list of token lists.
            self.sents_list = [" ".join(sent) for sent in self.corpus.sents()]
            for index, sentence in enumerate(self.sents_list):
                self.sents_list[index] = self._normalize(
                    sentence, stop, exclude, lemma)
            print(self.sents_list[0])
            self.sents_list = [
                sentence.split() for sentence in self.sents_list
            ]
            print(self.sents_list[0])

    @staticmethod
    def _normalize(text, stop, exclude, lemma):
        """Return *text* lower-cased, stopword- and punctuation-free,
        and lemmatized word-by-word (shared by both cleaning modes)."""
        stop_free = " ".join(
            [i for i in text.lower().split() if i not in stop])
        punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
        return " ".join(lemma.lemmatize(word) for word in punc_free.split())

    def LDA(self, num_topics, num_words):
        """Fit a Vowpal Wabbit LDA model on ``self.para_list``, save it to
        ``model/lda_model``, and print the topics.

        :param num_topics: number of topics to fit (and to print).
        :param num_words: number of words to show per printed topic.
        """
        dictionary = corpora.Dictionary(self.para_list)
        doc_term_matrix = [dictionary.doc2bow(para) for para in self.para_list]
        # Hard-coded VW binary location -- TODO: move into config.
        path = '/mnt/APPDATA/Project_Mafia/omkhalil/vowpal_binaries/vw-7.20150623'
        self.ldamodel = LdaVowpalWabbit(path,
                                        doc_term_matrix,
                                        num_topics=num_topics,
                                        id2word=dictionary)
        self.ldamodel.save('model/lda_model')
        # Bug fix: previously printed a fixed 10 topics regardless of the
        # num_topics argument this method was given.
        print(self.ldamodel.print_topics(num_topics=num_topics,
                                         num_words=num_words))

    def plot(self, num_words):
        """Save one word-cloud image per topic under ``plots/``."""
        for t in range(self.ldamodel.num_topics):
            plt.figure()
            # NOTE(review): assumes show_topic yields (weight, word) pairs,
            # so reversing gives the word->weight mapping WordCloud expects;
            # newer gensim versions return (word, weight) -- verify against
            # the installed version.
            tuples = [
                reversed(x) for x in self.ldamodel.show_topic(t, num_words)
            ]
            plt.imshow(WordCloud().fit_words(dict(tuples)))
            plt.axis("off")
            plt.title("Topic #" + str(t))
            plt.savefig('plots/topic' + str(t))
            # Close the figure: matplotlib keeps figures alive otherwise,
            # leaking memory across topics.
            plt.close()

    def corpus_info(self):
        """
        Summary information about the status of a corpus.
        """
        fids = len(self.corpus.fileids())
        paras = len(self.corpus.paras())
        sents = len(self.corpus.sents())
        # Average number of sentences per paragraph.
        sperp = sum(len(para) for para in self.corpus.paras()) / float(paras)
        tokens = FreqDist(self.corpus.words())
        count = sum(tokens.values())
        vocab = len(tokens)
        # Lexical diversity: total word count over vocabulary size.
        lexdiv = float(count) / float(vocab)

        print(
            ("Text corpus contains {} files\n"
             "Composed of {} paragraphs and {} sentences.\n"
             "{:0.3f} sentences per paragraph\n"
             "Word count of {} with a vocabulary of {}\n"
             "lexical diversity is {:0.3f}").format(fids, paras, sents, sperp,
                                                    count, vocab, lexdiv))