import logging
import os
import shutil
import subprocess
import time
import webbrowser

import numpy as np

# The topik-internal imports below are a best guess at the module layout for
# the readers, tokenizers, models and plotting helpers used throughout this
# file; the exact paths may differ in your checkout:
# from topik.readers import (iter_documents_folder, iter_large_json,
#                            iter_document_json_stream, iter_solr_query,
#                            iter_elastic_query)
# from topik.tokenizers import (SimpleTokenizer, CollocationsTokenizer,
#                               EntitiesTokenizer, MixedTokenizer)
# from topik.vectorizers import CorpusBOW
# from topik.models import LDA
# from topik.viz import Termite


def run_topic_model(output_dir, n_topics, content_fields,
                    field_filters=None, field_filter_vals=None, seed=42):
    np.random.seed(seed)
    # documents = iter_elastic_query(ES_INSTANCE + ES_INDEX, "abstract", "", query=None)
    documents = read_bulk_index(elastic + "original/", content_fields,
                                field_filters, field_filter_vals)
    # Receives a generator of strings (the content for each doc)
    corpus = EntitiesTokenizer(documents)
    # if os.path.isdir(output_dir):
    #     shutil.rmtree(output_dir)
    # os.makedirs(output_dir)
    corpus_bow = CorpusBOW(corpus)
    corpus_dict = corpus_bow.save_dict(os.path.join(output_dir, 'corpus.dict'))
    # Serialize and store the corpus
    corpus_file = corpus_bow.serialize(os.path.join(output_dir, 'corpus.mm'))
    # Create LDA model from corpus and dictionary
    topik_lda = LDA(os.path.join(output_dir, 'corpus.mm'),
                    os.path.join(output_dir, 'corpus.dict'),
                    n_topics, update_every=1, passes=5)
    topik_lda.save(os.path.join(output_dir, 'model.gensim'))
    # Generate the input for the termite plot
    topik_lda.termite_data(os.path.join(output_dir, 'termite.csv'))
    # Get termite plot for this model
    termite = Termite(os.path.join(output_dir, 'termite.csv'), "Termite Plot")
    termite.plot(os.path.join(output_dir, 'termite.html'))
    df_results = generate_csv_output_file(documents, corpus, corpus_bow,
                                          topik_lda.model)
    to_r_ldavis(corpus_bow, dir_name=os.path.join(output_dir, 'ldavis'),
                lda=topik_lda)
    os.environ["LDAVIS_DIR"] = os.path.join(output_dir, 'ldavis')
    try:
        subprocess.call(['Rscript',
                         os.path.join(BASEDIR, 'topic-space/R/runLDAvis.R')])
    except ValueError:
        logging.warning("Unable to run runLDAvis.R")
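
# Example invocation (a hypothetical sketch: the output directory, content
# fields and filter values below are illustrative, not taken from this
# module):
#
#     run_topic_model(output_dir='./topic_model_out', n_topics=10,
#                     content_fields=['abstract', 'title'],
#                     field_filters=['year'], field_filter_vals=[2014])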

def run_topic_model(field, subfield, output_dir, n_topics, seed=42):
    np.random.seed(seed)
    documents = iter_elastic_query(ES_INSTANCE, ES_INDEX, field, subfield)
    corpus = EntitiesTokenizer(documents)
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    # Create dictionary
    corpus_bow = CorpusBOW(corpus)
    corpus_dict = corpus_bow.save_dict(os.path.join(output_dir, 'corpus.dict'))
    # Serialize and store the corpus
    corpus_file = corpus_bow.serialize(os.path.join(output_dir, 'corpus.mm'))
    # Create LDA model from corpus and dictionary
    lda = LDA(os.path.join(output_dir, 'corpus.mm'),
              os.path.join(output_dir, 'corpus.dict'),
              n_topics, update_every=1, chunksize=10000, passes=1)
    # Generate the input for the termite plot
    lda.termite_data(os.path.join(output_dir, 'termite.csv'))
    # Get termite plot for this model
    termite = Termite(os.path.join(output_dir, 'termite.csv'), "Termite Plot")
    termite.plot(os.path.join(output_dir, 'termite.html'))
    df_results = generate_csv_output_file(documents, corpus, corpus_bow,
                                          lda.model)
    to_r_ldavis(corpus_bow, dir_name=os.path.join(output_dir, 'ldavis'),
                lda=lda)
    os.environ["LDAVIS_DIR"] = os.path.join(output_dir, 'ldavis')
    try:
        subprocess.call(['Rscript', os.path.join(BASEDIR, 'R/runLDAvis.R')])
    except ValueError:
        logging.warning("Unable to run runLDAvis.R")
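
# Example invocation (hypothetical; the field name 'abstract' is illustrative,
# and ES_INSTANCE/ES_INDEX are assumed to be configured at module level):
#
#     run_topic_model(field='abstract', subfield=None,
#                     output_dir='./topic_model_out', n_topics=10)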

def run_model(data, format='json_stream', tokenizer='simple', n_topics=10,
              dir_path='./topic_model', model='lda_batch', termite_plot=True,
              output_file=False, r_ldavis=False, prefix_value=None,
              event_value=None, field=None, query='*:*', index=None,
              subfield=None, seed=42):
    """Run your data through all topik functionality and save all results
    to a specified directory.

    Parameters
    ----------
    data: string
        Input data (e.g. a file or folder, or the url of a solr/elasticsearch
        instance).
    format: {'json_stream', 'folder_files', 'large_json', 'solr', 'elastic'}
        The format of your data input. Currently available: a JSON stream or
        a folder containing text files. Default is 'json_stream'.
    tokenizer: {'simple', 'collocations', 'entities', 'mixed'}
        The type of tokenizer to use. Default is 'simple'.
    n_topics: int
        Number of topics to find in your data.
    dir_path: string
        Directory path to store all topic modeling result files. Default is
        './topic_model'.
    model: {'lda_batch', 'lda_online'}
        Statistical modeling algorithm to use. Default is 'lda_batch'.
    termite_plot: bool
        Generate a termite plot of your model if True. Default is True.
    output_file: bool
        Generate a final summary csv file of your results: for each document,
        its text, tokens, lda_probabilities and topic.
    r_ldavis: bool
        Generate an interactive data visualization of your topics. Default is
        False.
    prefix_value: string
        For the 'large_json' format reader, the prefix value to parse.
    event_value: string
        For the 'large_json' format reader, the event value to parse.
    field: string
        For the 'json_stream', 'solr' or 'elastic' format readers, the field
        to parse.
    query: string
        For the 'solr' format reader, an optional query. Default is '*:*' to
        retrieve all documents.
    index: string
        For the 'elastic' format reader, the index to query.
    subfield: string
        For the 'elastic' format reader, an optional subfield to parse.
    seed: int
        Seed for the random number generator, to make results reproducible.
        Default is 42.
    """
    np.random.seed(seed)

    if format == 'folder_files':
        documents = iter_documents_folder(data)
    elif format == 'large_json' and prefix_value is not None and event_value is not None:
        documents = iter_large_json(data, prefix_value, event_value)
    elif format == 'json_stream' and field is not None:
        documents = iter_document_json_stream(data, field)
    elif format == 'solr' and field is not None:
        documents = iter_solr_query(data, field, query=query)
    elif format == 'elastic' and field is not None:
        documents = iter_elastic_query(data, index, field, subfield)
    else:
        raise Exception("Invalid input; make sure you're passing the "
                        "appropriate arguments for the chosen format.")

    if tokenizer == 'simple':
        corpus = SimpleTokenizer(documents)
    elif tokenizer == 'collocations':
        corpus = CollocationsTokenizer(documents)
    elif tokenizer == 'entities':
        corpus = EntitiesTokenizer(documents)
    elif tokenizer == 'mixed':
        corpus = MixedTokenizer(documents)
    else:
        print("Invalid tokenizer value, using 'simple'")
        corpus = SimpleTokenizer(documents)

    if os.path.isdir(dir_path):
        shutil.rmtree(dir_path)
    os.makedirs(dir_path)

    # Create dictionary
    corpus_bow = CorpusBOW(corpus)
    corpus_dict = corpus_bow.save_dict(os.path.join(dir_path, 'corpus.dict'))
    # Serialize and store the corpus
    corpus_file = corpus_bow.serialize(os.path.join(dir_path, 'corpus.mm'))
    # Create LDA model from corpus and dictionary
    if model == 'lda_batch':
        # To perform LDA in batch mode, set update_every=0 and passes=20.
        # https://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation
        lda = LDA(os.path.join(dir_path, 'corpus.mm'),
                  os.path.join(dir_path, 'corpus.dict'),
                  n_topics, update_every=0, passes=20)
    elif model == 'lda_online':
        # To perform LDA in online mode, set update_every, chunksize and passes.
        lda = LDA(os.path.join(dir_path, 'corpus.mm'),
                  os.path.join(dir_path, 'corpus.dict'),
                  n_topics, update_every=1, chunksize=10000, passes=1)
    else:
        logging.warning('Invalid model provided. Using lda_batch.')
        lda = LDA(os.path.join(dir_path, 'corpus.mm'),
                  os.path.join(dir_path, 'corpus.dict'),
                  n_topics, update_every=0, passes=20)
    # Generate the input for the termite plot
    lda.termite_data(os.path.join(dir_path, 'termite.csv'))
    # Get termite plot for this model
    if termite_plot:
        termite = Termite(os.path.join(dir_path, 'termite.csv'), "Termite Plot")
        termite.plot(os.path.join(dir_path, 'termite.html'))

    if output_file:
        # The tokenizer consumed the original generator, so re-open the input
        if format == 'folder_files':
            documents = iter_documents_folder(data)
        elif format == 'large_json':
            documents = iter_large_json(data, prefix_value, event_value)
        else:
            documents = iter_document_json_stream(data, field)
        df_results = generate_csv_output_file(documents, corpus, corpus_bow,
                                              lda.model)

    if r_ldavis:
        to_r_ldavis(corpus_bow, dir_name=os.path.join(dir_path, 'ldavis'),
                    lda=lda)
        os.environ["LDAVIS_DIR"] = os.path.join(dir_path, 'ldavis')
        try:
            subprocess.call(['Rscript', os.path.join(BASEDIR, 'R/runLDAvis.R')])
        except ValueError:
            logging.warning("Unable to run runLDAvis.R")
        os.chdir(os.path.join(dir_path, 'ldavis', 'output'))
        sp = subprocess.Popen(['python', '-m', 'SimpleHTTPServer', '8000'])
        webbrowser.open_new_tab('127.0.0.1:8000')
        time.sleep(30)
        sp.kill()

def run_model(data, format='json_stream', tokenizer='simple', n_topics=10,
              dir_path='./topic_model', model='lda_batch', termite_plot=True,
              output_file=False, r_ldavis=False, prefix_value=None,
              event_value=None, field=None, query='*:*', index=None,
              subfield=None, seed=42):
    """Run your data through all topik functionality and save all results
    to a specified directory.

    Parameters
    ----------
    data: string
        Input data (e.g. a file or folder, or the url of a solr/elasticsearch
        instance).
    format: {'json_stream', 'folder_files', 'large_json', 'solr', 'elastic'}
        The format of your data input. Currently available: a JSON stream or
        a folder containing text files. Default is 'json_stream'.
    tokenizer: {'simple', 'collocations', 'entities', 'mixed'}
        The type of tokenizer to use. Default is 'simple'.
    n_topics: int
        Number of topics to find in your data.
    dir_path: string
        Directory path to store all topic modeling result files. Default is
        './topic_model'.
    model: {'lda_batch', 'lda_online'}
        Statistical modeling algorithm to use. Default is 'lda_batch'.
    termite_plot: bool
        Generate a termite plot of your model if True. Default is True.
    output_file: bool
        Generate a final summary csv file of your results: for each document,
        its text, tokens, lda_probabilities and topic.
    r_ldavis: bool
        Generate an interactive data visualization of your topics. Default is
        False.
    prefix_value: string
        For the 'large_json' format reader, the prefix value to parse.
    event_value: string
        For the 'large_json' format reader, the event value to parse.
    field: string
        For the 'json_stream', 'solr' or 'elastic' format readers, the field
        to parse.
    query: string
        For the 'solr' format reader, an optional query. Default is '*:*' to
        retrieve all documents.
    index: string
        For the 'elastic' format reader, the index to query.
    subfield: string
        For the 'elastic' format reader, an optional subfield to parse.
    seed: int
        Seed for the random number generator, to make results reproducible.
        Default is 42.
    """
    np.random.seed(seed)

    if format == 'folder_files':
        id_documents = iter_documents_folder(data)
    elif format == 'large_json' and prefix_value is not None and event_value is not None:
        id_documents = iter_large_json(data, prefix_value, event_value)
    elif format == 'json_stream' and field is not None:
        id_documents = iter_document_json_stream(data, field)
    elif format == 'solr' and field is not None:
        id_documents = iter_solr_query(data, field, query=query)
    elif format == 'elastic' and field is not None:
        id_documents = iter_elastic_query(data, index, field, subfield)
    else:
        raise Exception("Invalid input; make sure you're passing the "
                        "appropriate arguments for the chosen format.")
    ids, documents = unzip(id_documents)

    if tokenizer == 'simple':
        corpus = SimpleTokenizer(documents)
    elif tokenizer == 'collocations':
        corpus = CollocationsTokenizer(documents)
    elif tokenizer == 'entities':
        corpus = EntitiesTokenizer(documents)
    elif tokenizer == 'mixed':
        corpus = MixedTokenizer(documents)
    else:
        print("Invalid tokenizer value, using 'simple'")
        corpus = SimpleTokenizer(documents)

    if os.path.isdir(dir_path):
        shutil.rmtree(dir_path)
    os.makedirs(dir_path)

    # Create dictionary
    corpus_bow = CorpusBOW(corpus)
    corpus_dict = corpus_bow.save_dict(os.path.join(dir_path, 'corpus.dict'))
    # Serialize and store the corpus
    corpus_file = corpus_bow.serialize(os.path.join(dir_path, 'corpus.mm'))
    # Create LDA model from corpus and dictionary
    if model == 'lda_batch':
        # To perform LDA in batch mode, set update_every=0 and passes=20.
        # https://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation
        lda = LDA(os.path.join(dir_path, 'corpus.mm'),
                  os.path.join(dir_path, 'corpus.dict'),
                  n_topics, update_every=0, passes=20)
    elif model == 'lda_online':
        # To perform LDA in online mode, set update_every, chunksize and passes.
        lda = LDA(os.path.join(dir_path, 'corpus.mm'),
                  os.path.join(dir_path, 'corpus.dict'),
                  n_topics, update_every=1, chunksize=10000, passes=1)
    else:
        logging.warning('Invalid model provided. Using lda_batch.')
        lda = LDA(os.path.join(dir_path, 'corpus.mm'),
                  os.path.join(dir_path, 'corpus.dict'),
                  n_topics, update_every=0, passes=20)
    # Generate the input for the termite plot
    lda.termite_data(os.path.join(dir_path, 'termite.csv'))
    # Get termite plot for this model
    if termite_plot:
        termite = Termite(os.path.join(dir_path, 'termite.csv'), "Termite Plot")
        termite.plot(os.path.join(dir_path, 'termite.html'))

    if output_file:
        # The tokenizer consumed the original generator, so re-open the input
        if format == 'folder_files':
            id_documents = iter_documents_folder(data)
        elif format == 'large_json':
            id_documents = iter_large_json(data, prefix_value, event_value)
        else:
            id_documents = iter_document_json_stream(data, field)
        ids, documents = unzip(id_documents)
        df_results = generate_csv_output_file(documents, corpus, corpus_bow,
                                              lda.model)

    if r_ldavis:
        to_r_ldavis(corpus_bow, dir_name=os.path.join(dir_path, 'ldavis'),
                    lda=lda)
        os.environ["LDAVIS_DIR"] = os.path.join(dir_path, 'ldavis')
        try:
            subprocess.call(['Rscript', os.path.join(BASEDIR, 'R/runLDAvis.R')])
        except ValueError:
            logging.warning("Unable to run runLDAvis.R")
        os.chdir(os.path.join(dir_path, 'ldavis', 'output'))
        sp = subprocess.Popen(['python', '-m', 'SimpleHTTPServer', '8000'])
        webbrowser.open_new_tab('127.0.0.1:8000')
        time.sleep(30)
        sp.kill()

def run_model(data_source, source_type="auto", year_field=None,
              start_year=None, stop_year=None, content_field=None,
              clear_es_index=False, tokenizer='simple', n_topics=10,
              dir_path='./topic_model', model='lda_batch', termite_plot=True,
              output_file=False, r_ldavis=False, json_prefix=None, seed=42,
              **kwargs):
    """Run your data through all topik functionality and save all results
    to a specified directory.

    Parameters
    ----------
    data_source: string
        Input data (e.g. a file or folder, or the url of a solr/elasticsearch
        instance).
    source_type: {'auto', 'json_stream', 'folder_files', 'large_json', 'solr', 'elastic'}
        The format of your data input. Currently available: a JSON stream or
        a folder containing text files. Default is 'auto'.
    year_field: string
        The field name (if any) that contains the year associated with each
        document, for filtering.
    start_year: int
        Beginning of the range filter on year_field values.
    stop_year: int
        End of the range filter on year_field values.
    content_field: string
        The primary text field to parse.
    clear_es_index: bool
        If True, delete and re-create the destination elasticsearch index
        before loading in new documents. Otherwise, keep any previously
        existing documents and just add/update with the new documents.
    tokenizer: {'simple', 'collocations', 'entities', 'mixed'}
        The type of tokenizer to use. Default is 'simple'.
    n_topics: int
        Number of topics to find in your data.
    dir_path: string
        Directory path to store all topic modeling result files. Default is
        './topic_model'.
    model: {'lda_batch', 'lda_online'}
        Statistical modeling algorithm to use. Default is 'lda_batch'.
    termite_plot: bool
        Generate a termite plot of your model if True. Default is True.
    output_file: bool
        Generate a final summary csv file of your results: for each document,
        its text, tokens, lda_probabilities and topic.
    r_ldavis: bool
        Generate an interactive data visualization of your topics. Default is
        False.
    json_prefix: string
        For the 'large_json' format reader, the prefix value to parse.
    seed: int
        Seed for the random number generator, to make results reproducible.
        Default is 42.
    kwargs:
        Additional keyword arguments, passed through to the reader and the
        tokenizer.
    """
    np.random.seed(seed)

    raw_data = read_input(data_source, content_field=content_field,
                          source_type=source_type, **kwargs)
    processed_data = preprocess(raw_data, tokenizer_method=tokenizer, **kwargs)

    # Ensure the output directory exists
    if not os.path.isdir(dir_path):
        os.makedirs(dir_path)

    # Create LDA model from the preprocessed corpus
    if model == 'lda_batch':
        # To perform LDA in batch mode, set update_every=0 and passes=20.
        # https://radimrehurek.com/gensim/wiki.html#latent-dirichlet-allocation
        lda = LDA(processed_data, n_topics, update_every=0, passes=20)
    elif model == 'lda_online':
        # To perform LDA in online mode, set update_every, chunksize and passes.
        lda = LDA(processed_data, n_topics, update_every=1, chunksize=10000,
                  passes=1)
    else:
        logging.warning('Invalid model provided. Using lda_batch.')
        lda = LDA(processed_data, n_topics, update_every=0, passes=20)

    # Get termite plot for this model
    if termite_plot:
        # Generate the input for the termite plot
        csv_path = os.path.join(dir_path, 'termite.csv')
        lda.termite_data(csv_path)
        termite = Termite(csv_path, "Termite Plot")
        termite.plot(os.path.join(dir_path, 'termite.html'))

    if output_file:
        filtered_documents = raw_data.get_data_by_year(start_year, stop_year,
                                                       year_field)
        df_results = generate_csv_output_file(filtered_documents, raw_data,
                                              processed_data, lda.model)

    if r_ldavis:
        to_r_ldavis(processed_data, dir_name=os.path.join(dir_path, 'ldavis'),
                    lda=lda)
        os.environ["LDAVIS_DIR"] = os.path.join(dir_path, 'ldavis')
        try:
            subprocess.call(['Rscript', os.path.join(BASEDIR, 'R/runLDAvis.R')])
        except ValueError:
            logging.warning("Unable to run runLDAvis.R")
        os.chdir(os.path.join(dir_path, 'ldavis', 'output'))
        sp = subprocess.Popen(['python', '-m', 'SimpleHTTPServer', '8000'])
        webbrowser.open_new_tab('127.0.0.1:8000')
        time.sleep(3)
        sp.kill()
    os.chdir(os.path.dirname(BASEDIR))
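
# Example invocation of the consolidated API (hypothetical; the file name,
# content field and year-filter values are illustrative):
#
#     run_model('./reviews.json', source_type='json_stream',
#               content_field='text', tokenizer='entities', n_topics=10,
#               year_field='year', start_year=2010, stop_year=2015,
#               output_file=True)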