Python BleiCorpus Examples

Programming Language: Python

Namespace/Package Name: gensim.corpora

Class/Type: BleiCorpus

Examples at hotexamples.com: 7

Python BleiCorpus - 7 examples found. These are the top-rated real-world Python examples of gensim.corpora.BleiCorpus, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

serialize(4)

BleiCorpus(1)

Example #1

Show file

File: train.py Project: acatinfd/nlpproject

    def serialize(self):
        """Write this corpus to ``self.corpus_path`` with ``BleiCorpus.serialize``.

        The instance itself is handed over as the document stream and
        ``self.reviews_dictionary`` is supplied as the ``id2word`` mapping.

        Returns:
            The instance itself, so that calls can be chained.
        """
        target_path = self.corpus_path
        vocabulary = self.reviews_dictionary
        BleiCorpus.serialize(target_path, self, id2word=vocabulary)
        return self

Example #2

Show file

    def serialize(self):
        """Persist this corpus at ``self.corpus_path`` via ``BleiCorpus.serialize``.

        ``self`` is passed as the corpus to iterate and
        ``self.reviews_dictionary`` as the ``id2word`` mapping.

        Returns:
            ``self``, allowing the call to be chained.
        """
        serializer_kwargs = {'id2word': self.reviews_dictionary}
        BleiCorpus.serialize(self.corpus_path, self, **serializer_kwargs)
        return self

Example #3

Show file

File: train_model.py Project: OminiaVincit/AmazonReviewAnalysis

 def serialize(self):
     u'''Serialize the corpus to ``self.corpus_path`` and return ``self``.

     ``BleiCorpus.serialize`` receives this instance as the corpus and
     ``self.corpus_dictionary`` as the ``id2word`` mapping.
     '''
     destination = self.corpus_path
     word_mapping = self.corpus_dictionary
     BleiCorpus.serialize(destination, self, id2word=word_mapping)
     return self

Example #4

Show file

        print "pos-tag", nelem

# Output locations: everything is written below a single model folder.
folder = 'models_unigram_topics_all_pos'
lda_num_topics = 250
dictionary_path = folder + "/dictionary.dict"
corpus_path = folder + "/corpus.lda-c"
lda_model_path = "".join(
    [folder, "/lda_model_", str(lda_num_topics), "_topics.lda"])
print("created paths")

# Build the vocabulary from each review's "words" entry, prune it to at most
# 10000 tokens, close the id gaps left by pruning, and store it on disk.
dictionary = corpora.Dictionary(review["words"] for review in corpus_list)
dictionary.filter_extremes(keep_n=10000)
dictionary.compactify()
dictionary.save(dictionary_path)

# Convert every review to its bag-of-words form and serialize the result.
corpus = []
for review in corpus_list:
    corpus.append(dictionary.doc2bow(review["words"]))
BleiCorpus.serialize(corpus_path, corpus, id2word=dictionary)

# Train from the serialized corpus file rather than the in-memory list.
corpus = corpora.BleiCorpus(corpus_path)
print("running lda")
lda = gensim.models.LdaMulticore(
    corpus,
    num_topics=lda_num_topics,
    id2word=dictionary,
    minimum_probability=0.0,
    workers=8,
)
lda.save(lda_model_path)
print("done lda")

# Reload all three artifacts from disk before continuing.
dictionary = corpora.Dictionary.load(dictionary_path)
corpus = corpora.BleiCorpus(corpus_path)
lda = LdaMulticore.load(lda_model_path)
i = 0

Example #5

Show file

File: vlad_topics.py Project: TheResearchProject/CommentParser

    def handle(self, *args, **options):
        """Train an LDA model per selected database and store its topics.

        For every database in ``self.selected_dbs`` this builds a dictionary
        from the comments' words, serializes the corpus to a temporary
        LDA-C file, trains an ``LdaModel`` with 50 topics, and replaces the
        results attached to the ``'LDA'`` ``Algorithm`` row with one
        ``Result`` per topic.

        The temporary directory is shared by all databases and is removed
        once, after the last database has been processed (or when an
        exception aborts processing). Removing it inside the loop would
        leave later databases without a directory to serialize into.

        Args:
            *args: Positional arguments forwarded to the parent ``handle``.
            **options: Keyword options forwarded to the parent ``handle``.
        """
        # NOTE(review): ``self`` is passed explicitly to an already-bound
        # method, so the parent receives it again as the first positional
        # argument. Kept as-is because the parent class is not visible here;
        # confirm whether this is intended.
        super(Command, self).handle(self, *args, **options)

        #Create temporary directory to write the corpus LDA-C files
        temp_dir_path = tempfile.mkdtemp()
        corpus_path = temp_dir_path + "/corpus.lda-c"

        lda_num_topics = 50

        try:
            for database in self.selected_dbs:

                print("Processing database " + database)

                #Building dictionary
                print("Building dictionary")
                dictionary = Dictionary()

                queryset = Comment.objects.using(database).exclude(
                    text__isnull=True)
                self.pbar_setup(maxval=queryset.count())

                for comment in queryset_iterator(queryset, chunksize=50):
                    dictionary.add_documents(
                        [[word.word for word in comment.text.word_set.all()]])
                    self.pbar_increment()
                self.pbar_destroy()

                dictionary.filter_extremes(keep_n=10000)
                dictionary.compactify()

                #Serialize corpus
                # NOTE(review): the dictionary is built from comments with a
                # non-null text, while the corpus iterates over all comments;
                # presumably ``Corpus`` copes with null texts - verify.
                print("Serializing corpus")
                corpus = Corpus(
                    queryset_iterator(Comment.objects.using(database).all(),
                                      chunksize=50), dictionary)
                BleiCorpus.serialize(corpus_path, corpus, id2word=dictionary)

                #Train
                print("Training...")
                bleicorpus = BleiCorpus(corpus_path)
                lda = gensim.models.LdaModel(bleicorpus,
                                             num_topics=lda_num_topics,
                                             id2word=dictionary)

                #Saving
                print("Saving results to DB")
                lda_db_obj, created = Algorithm.objects.using(
                    database).get_or_create(name='LDA')
                #Removing previous results
                lda_db_obj.result_set.all().delete()
                #Looping through results and saving to DB
                topics = lda.show_topics(num_topics=lda_num_topics)
                for sequence, topic in enumerate(topics):
                    Result.objects.using(database).create(sequence=sequence,
                                                          value=str(topic),
                                                          algorithm=lda_db_obj)
        finally:
            #Remove temporary directory
            #Check first if it's not the current working directory, as removing it
            #  would be a disaster! ;)
            if os.getcwd() != temp_dir_path:
                #Just remove it if it's a temp dir
                shutil.rmtree(temp_dir_path)
            else:
                #If it's the current working directory, just remove the unneeded files
                # Explicit loop: ``map`` is lazy on Python 3 and would
                # silently delete nothing.
                for leftover in glob.glob('corpus.lda-c*'):
                    os.remove(leftover)

        self.stdout.write(self.style.SUCCESS('Command executed succesfully'))

Example #6

Show file

File: train.py Project: nkman/Raiden

    def serialize(self):
        """Serialize this corpus to ``self.corpus_path`` and return ``self``.

        ``self.tag_dictionary`` is supplied as the ``id2word`` mapping.
        """
        # Signature of the call used below:
        #   serialize(serializer, fname, corpus, id2word=None, index_fname=None,
        #             progress_cnt=None, labels=None, metadata=False)
        # It iterates through the document stream ``corpus``, saving the
        # documents to ``fname`` and recording the byte offset of each one.
        # The resulting index structure is saved to ``index_fname`` (or to
        # ``fname``.index if ``index_fname`` is not set).
        output_path = self.corpus_path
        tag_mapping = self.tag_dictionary
        BleiCorpus.serialize(output_path, self, id2word=tag_mapping)
        return self

Example #7

Show file

File: train_model.py Project: OminiaVincit/AmazonReviewAnalysis

 def serialize(self):
     u'''Write the corpus to ``self.corpus_path`` and hand back ``self``.

     This instance is passed to ``BleiCorpus.serialize`` as the corpus,
     with ``self.corpus_dictionary`` as the ``id2word`` mapping.
     '''
     options = {'id2word': self.corpus_dictionary}
     BleiCorpus.serialize(self.corpus_path, self, **options)
     return self