def serialize(self):
    BleiCorpus.serialize(self.corpus_path, self, id2word=self.reviews_dictionary)
    return self
def serialize(self):
    u'''Serialize corpus'''
    BleiCorpus.serialize(self.corpus_path, self,
                         id2word=self.corpus_dictionary)
    return self
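For context, `serialize` methods like the two above typically live on a streaming corpus class that gensim can iterate over. The sketch below is a hypothetical `ReviewCorpus` (the names `reviews`, `corpus_path`, and `reviews_dictionary` are assumptions mirroring the attributes used above), not any of the original classes:

from gensim.corpora import Dictionary, BleiCorpus


class ReviewCorpus(object):
    """Hypothetical streaming corpus: one bag-of-words vector per review."""

    def __init__(self, reviews, corpus_path):
        self.reviews = reviews                        # list of token lists
        self.corpus_path = corpus_path
        self.reviews_dictionary = Dictionary(reviews)

    def __iter__(self):
        # gensim pulls bag-of-words vectors from here during serialization
        for review in self.reviews:
            yield self.reviews_dictionary.doc2bow(review)

    def serialize(self):
        BleiCorpus.serialize(self.corpus_path, self,
                             id2word=self.reviews_dictionary)
        return self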
print "pos-tag", nelem folder = 'models_unigram_topics_all_pos' dictionary_path = folder + "/dictionary.dict" corpus_path = folder + "/corpus.lda-c" lda_num_topics = 250 lda_model_path = folder + "/lda_model_" + str(lda_num_topics) + "_topics.lda" print "created paths" dictionary = corpora.Dictionary(review["words"] for review in corpus_list) dictionary.filter_extremes(keep_n=10000) dictionary.compactify() corpora.Dictionary.save(dictionary, dictionary_path) corpus = [dictionary.doc2bow(review["words"]) for review in corpus_list] BleiCorpus.serialize(corpus_path, corpus, id2word=dictionary) corpus = corpora.BleiCorpus(corpus_path) print "running lda" lda = gensim.models.LdaMulticore(corpus, num_topics=lda_num_topics, id2word=dictionary, minimum_probability=0., workers=8) lda.save(lda_model_path) print "done lda" dictionary = corpora.Dictionary.load(dictionary_path) corpus = corpora.BleiCorpus(corpus_path) lda = LdaMulticore.load(lda_model_path) i = 0
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)
    # Create temporary directory to write the corpus LDA-C files
    temp_dir_path = tempfile.mkdtemp()
    corpus_path = temp_dir_path + "/corpus.lda-c"
    lda_num_topics = 50
    for database in self.selected_dbs:
        print "Processing database " + database
        # Build the dictionary
        print "Building dictionary"
        dictionary = Dictionary()
        queryset = Comment.objects.using(database).exclude(text__isnull=True)
        self.pbar_setup(maxval=queryset.count())
        for comment in queryset_iterator(queryset, chunksize=50):
            dictionary.add_documents(
                [[word.word for word in comment.text.word_set.all()]])
            self.pbar_increment()
        self.pbar_destroy()
        dictionary.filter_extremes(keep_n=10000)
        dictionary.compactify()
        # Serialize corpus
        print "Serializing corpus"
        corpus = Corpus(
            queryset_iterator(Comment.objects.using(database).all(),
                              chunksize=50),
            dictionary)
        BleiCorpus.serialize(corpus_path, corpus, id2word=dictionary)
        # Train
        print "Training..."
        bleicorpus = BleiCorpus(corpus_path)
        lda = gensim.models.LdaModel(bleicorpus, num_topics=lda_num_topics,
                                     id2word=dictionary)
        # Save results
        print "Saving results to DB"
        lda_db_obj, created = Algorithm.objects.using(
            database).get_or_create(name='LDA')
        # Remove previous results
        lda_db_obj.result_set.all().delete()
        # Loop through topics and save each one to the DB
        i = 0
        for topic in lda.show_topics(num_topics=lda_num_topics):
            Result.objects.using(database).create(sequence=i,
                                                  value=str(topic),
                                                  algorithm=lda_db_obj)
            i += 1
    # Remove the temporary directory.
    # Check first that it is not the current working directory, as removing
    # it would be a disaster! ;)
    if os.getcwd() != temp_dir_path:
        # Just remove it if it's a temp dir
        shutil.rmtree(temp_dir_path)
    else:
        # If it's the current working directory, remove only the unneeded files
        map(os.remove, glob.glob('corpus.lda-c*'))
    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
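The `Corpus` class used by this command is not shown in the snippet. A plausible minimal version, assuming it streams the comment queryset and converts each comment's words to bag-of-words on the fly (the `comment.text.word_set` access mirrors the dictionary-building loop above), might look like this:

class Corpus(object):
    """Hypothetical wrapper that streams comments as bag-of-words vectors."""

    def __init__(self, comments, dictionary):
        self.comments = comments       # an iterable of Comment objects
        self.dictionary = dictionary

    def __iter__(self):
        for comment in self.comments:
            words = [word.word for word in comment.text.word_set.all()]
            yield self.dictionary.doc2bow(words)

Because `queryset_iterator` returns a generator, this wrapper can only be iterated once; that is enough here, since `BleiCorpus.serialize` makes a single pass over the corpus.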
def serialize(self):
    # serialize(serializer, fname, corpus, id2word=None, index_fname=None,
    #           progress_cnt=None, labels=None, metadata=False)
    # Iterate through the document stream `corpus`, saving the documents to
    # `fname` and recording the byte offset of each document. Save the
    # resulting index structure to `index_fname` (or `fname.index` if not set).
    BleiCorpus.serialize(self.corpus_path, self, id2word=self.tag_dictionary)
    return self
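Returning `self` from these `serialize` methods lets construction and serialization chain in one expression. Note that `BleiCorpus.serialize` writes three files: the LDA-C data file itself, a `.vocab` vocabulary file beside it, and an `.index` file of byte offsets, which is why the cleanup in the management command above globs `corpus.lda-c*`. A usage sketch reusing the hypothetical `ReviewCorpus` from the first example (`tokenized_reviews` is assumed to be a list of token lists):

corpus = ReviewCorpus(tokenized_reviews, "models/corpus.lda-c").serialize()
# Produces models/corpus.lda-c, models/corpus.lda-c.vocab
# and models/corpus.lda-c.index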