Exemple #1
0
    def serialize(self):
        BleiCorpus.serialize(self.corpus_path, self, id2word=self.reviews_dictionary)

        return self
Exemple #2
0
    def serialize(self):
        BleiCorpus.serialize(self.corpus_path,
                             self,
                             id2word=self.reviews_dictionary)

        return self
 def serialize(self):
     u'''Serialize corpus'''
     BleiCorpus.serialize(self.corpus_path, self, \
         id2word=self.corpus_dictionary)
     return self
Exemple #4
0
        print "pos-tag", nelem

folder = 'models_unigram_topics_all_pos'
dictionary_path = folder + "/dictionary.dict"
corpus_path = folder + "/corpus.lda-c"
lda_num_topics = 250
lda_model_path = folder + "/lda_model_" + str(lda_num_topics) + "_topics.lda"
print "created paths"

dictionary = corpora.Dictionary(review["words"] for review in corpus_list)
dictionary.filter_extremes(keep_n=10000)
dictionary.compactify()
corpora.Dictionary.save(dictionary, dictionary_path)

corpus = [dictionary.doc2bow(review["words"]) for review in corpus_list]
BleiCorpus.serialize(corpus_path, corpus, id2word=dictionary)

corpus = corpora.BleiCorpus(corpus_path)
print "running lda"
lda = gensim.models.LdaMulticore(corpus,
                                 num_topics=lda_num_topics,
                                 id2word=dictionary,
                                 minimum_probability=0.,
                                 workers=8)
lda.save(lda_model_path)
print "done lda"

dictionary = corpora.Dictionary.load(dictionary_path)
corpus = corpora.BleiCorpus(corpus_path)
lda = LdaMulticore.load(lda_model_path)
i = 0
    def handle(self, *args, **options):
        super(Command, self).handle(self, *args, **options)

        #Create temporary directory to write the corpus LDA-C files
        temp_dir_path = tempfile.mkdtemp()
        corpus_path = temp_dir_path + "/corpus.lda-c"

        lda_num_topics = 50

        for database in self.selected_dbs:

            print "Processing database " + database

            #Building dictionary
            print "Building dictionary"
            dictionary = Dictionary()

            queryset = Comment.objects.using(database).exclude(
                text__isnull=True)
            self.pbar_setup(maxval=queryset.count())

            for comment in queryset_iterator(queryset, chunksize=50):
                dictionary.add_documents(
                    [[word.word for word in comment.text.word_set.all()]])
                self.pbar_increment()
            self.pbar_destroy()

            dictionary.filter_extremes(keep_n=10000)
            dictionary.compactify()

            #Serialize corpus
            print "Serializing corpus"
            corpus = Corpus(
                queryset_iterator(Comment.objects.using(database).all(),
                                  chunksize=50), dictionary)
            BleiCorpus.serialize(corpus_path, corpus, id2word=dictionary)

            #Train
            print "Training..."
            bleicorpus = BleiCorpus(corpus_path)
            lda = gensim.models.LdaModel(bleicorpus,
                                         num_topics=lda_num_topics,
                                         id2word=dictionary)

            #Saving
            print "Saving results to DB"
            lda_db_obj, created = Algorithm.objects.using(
                database).get_or_create(name='LDA')
            #Removing previous results
            lda_db_obj.result_set.all().delete()
            #Looping through results and saving to DB
            i = 0
            for topic in lda.show_topics(num_topics=lda_num_topics):
                Result.objects.using(database).create(sequence=i,
                                                      value=str(topic),
                                                      algorithm=lda_db_obj)
                i += 1

            #Remove temporary directory
            #Check first if it's not the current working directory, as removing it
            #  would be a disaster! ;)
            if os.getcwd() != temp_dir_path:
                #Just remove it if it's a temp dir
                shutil.rmtree(temp_dir_path)
            else:
                #If it's the current working directory, just remove the uneeded files
                map(os.remove, glob.glob('corpus.lda-c*'))

        self.stdout.write(self.style.SUCCESS('Command executed succesfully'))
Exemple #6
0
    def serialize(self):
        # serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False)
        # Iterate through the document stream corpus, saving the documents to fname and recording byte offset of each document. Save the resulting index structure to file index_fname (or fname.index is not set).

        BleiCorpus.serialize(self.corpus_path, self, id2word=self.tag_dictionary)
        return self
 def serialize(self):
     u'''Serialize corpus'''
     BleiCorpus.serialize(self.corpus_path, self, \
         id2word=self.corpus_dictionary)
     return self