def test_lda(self):
    """Build a 3-topic LDA model from the test fixtures and check its outputs.

    The test saves the model, verifies that ``get_top_words(15)`` yields
    3 entries with 15 items in entry 1, exports the termite data, and
    finally asserts that both output files exist under ``module_path``.
    """
    model_path = os.path.join(module_path, 'test.model')
    termite_path = os.path.join(module_path, 'test_termite.csv')

    my_lda = LDA("test.mm", "test.dict", ntopics=3)
    my_lda.save(model_path)

    top_words = my_lda.get_top_words(15)
    self.assertEqual(len(top_words), 3)
    self.assertEqual(len(top_words[1]), 15)

    # Pass the destination path itself. The previous code passed the
    # boolean result of os.path.isfile(...), so no CSV was ever written
    # to the intended location.
    my_lda.termite_data(termite_path)

    self.assertTrue(os.path.isfile(model_path))
    # Check the file termite_data was asked to write (the CSV), not a
    # 'test_termite.model' file that nothing in this test produces.
    self.assertTrue(os.path.isfile(termite_path))
def run_topic_model(output_dir, n_topics, content_fields, field_filters=None,
                    field_filter_vals=None, seed=42):
    """Run the LDA topic-model pipeline and write its artifacts to disk.

    Steps, in order:

    1. Seed NumPy's global random generator with ``seed``.
    2. Read documents with ``read_bulk_index`` and tokenize them with
       ``EntitiesTokenizer``.
    3. Build a ``CorpusBOW``; save ``corpus.dict`` and ``corpus.mm``.
    4. Fit an ``LDA`` model and save it as ``model.gensim``.
    5. Write ``termite.csv`` and render ``termite.html``.
    6. Call ``generate_csv_output_file`` and ``to_r_ldavis`` (output in
       the ``ldavis`` sub-directory).
    7. Set the ``LDAVIS_DIR`` environment variable and launch
       ``runLDAvis.R`` through ``Rscript``.

    Parameters
    ----------
    output_dir
        Directory that receives every generated file. This function does
        not create it.
    n_topics
        Number of topics passed to ``LDA``.
    content_fields, field_filters, field_filter_vals
        Forwarded unchanged to ``read_bulk_index``.
    seed
        Value passed to ``np.random.seed``.

    Returns
    -------
    None

    Notes
    -----
    Launching the R script is best-effort: a failure to start ``Rscript``
    or a non-zero exit status is logged as a warning, not raised.
    """
    np.random.seed(seed)

    documents = read_bulk_index(elastic + "original/", content_fields,
                                field_filters, field_filter_vals)
    # EntitiesTokenizer receives a generator of strings (content per doc).
    corpus = EntitiesTokenizer(documents)

    dict_path = os.path.join(output_dir, 'corpus.dict')
    corpus_path = os.path.join(output_dir, 'corpus.mm')
    termite_csv = os.path.join(output_dir, 'termite.csv')
    ldavis_dir = os.path.join(output_dir, 'ldavis')

    # Persist the dictionary and the serialized corpus; the LDA wrapper
    # below is given these file paths, not the in-memory objects.
    corpus_bow = CorpusBOW(corpus)
    corpus_bow.save_dict(dict_path)
    corpus_bow.serialize(corpus_path)

    # Create LDA model from corpus and dictionary.
    topik_lda = LDA(corpus_path, dict_path, n_topics,
                    update_every=1, passes=5)
    topik_lda.save(os.path.join(output_dir, 'model.gensim'))

    # Generate the input for the termite plot, then render the plot.
    topik_lda.termite_data(termite_csv)
    termite = Termite(termite_csv, "Termite Plot")
    termite.plot(os.path.join(output_dir, 'termite.html'))

    # NOTE(review): `documents` was already handed to EntitiesTokenizer
    # above; if read_bulk_index returns a one-shot generator it may be
    # exhausted by the time it is reused here -- confirm.
    generate_csv_output_file(documents, corpus, corpus_bow, topik_lda.model)

    to_r_ldavis(corpus_bow, dir_name=ldavis_dir, lda=topik_lda)
    # Child processes inherit os.environ, so the R script can see this.
    os.environ["LDAVIS_DIR"] = ldavis_dir

    try:
        return_code = subprocess.call(
            ['Rscript', os.path.join(BASEDIR, 'topic-space/R/runLDAvis.R')])
    except (OSError, ValueError):
        # OSError is what a missing or non-executable `Rscript` raises;
        # catching only ValueError let that common failure propagate.
        logging.warning("Unable to run runLDAvis.R")
    else:
        if return_code != 0:
            logging.warning("runLDAvis.R exited with status %s", return_code)