def index_data(data_folder, output_folder, project_name, cfg_folder, num_topics=DEFAULT_NUM_TOPICS, num_passes=DEFAULT_NUM_PASSES, min_token_freq=MIN_TOKEN_FREQ, min_token_len=MIN_TOKEN_LEN, max_token_len=MAX_TOKEN_LEN, log_to_file=False, lemmatize=False, stem=False, nonascii=True, procs=4, limitmb=512, multisegment=True): def save_config(cfg_file_name, config): # Writing our configuration file with open(cfg_file_name, 'w') as configfile: config.write(configfile) logging.info('The project configuration file is written to %s', cfg_file_name) # print 'The project configuration file is written to', cfg_file_name if not os.path.exists(data_folder): logging.error("Please provide a valid data folder!") sys.exit(1) # Checks whether the output folder exists if not os.path.exists(output_folder): os.makedirs(output_folder) # Checks whether the project folder exists project_folder = os.path.join(output_folder, project_name) if not os.path.exists(project_folder): os.makedirs(project_folder) tm_folder = os.path.join(project_folder, TM_FOLDER_NAME) whoosh_folder = os.path.join(project_folder, WHOOSH_FOLDER_NAME) dict_file = os.path.join(tm_folder, project_name + '.dict') ldac_file = os.path.join(tm_folder, project_name + '.ldac') cfg_file_name = os.path.join(cfg_folder, project_name + '.cfg') # Create file handler which logs debug messages if log_to_file: log_file_name = os.path.join(cfg_folder, project_name + '.log') logging.basicConfig(filename=log_file_name, format=LOG_FORMAT, level=logging.DEBUG) print "Log file:", log_file_name else: logging.basicConfig(format=LOG_FORMAT, level=logging.DEBUG) # Handling the project configuration file logging.info('Indexing Configurations:') logging.info('Project: %s', project_name) logging.info('Number of LDA topics: %d', num_topics) logging.info('Number of LDA passes: %d', num_passes) logging.info('Number of LSA topics: %d', LSI_DEFAULT_NUM_TOPICS) config = ConfigParser.RawConfigParser() if os.path.exists(cfg_file_name): 
config.read(cfg_file_name) if not config.has_section('DATA'): config.add_section('DATA') config.set('DATA', 'name', project_name) config.set('DATA', 'root_dir', os.path.normpath(data_folder)) config.set('DATA', 'project_dir', os.path.normpath(project_folder)) config.set('DATA', 'output_folder', os.path.normpath(output_folder)) save_config(cfg_file_name, config) print "Indexing documents...." start_time = time.time() if not config.has_section('WHOOSH'): logging.info('================================= BEGIN WHOOSH INDEXING') if not os.path.exists(whoosh_folder): os.makedirs(whoosh_folder) path_index_file_name = os.path.join(project_folder, project_name + '.path.index') # index_plain_text_emails(data_folder, path_index_file_name, # whoosh_folder, lemmatize=lemmatize, # stem=stem, nonascii=nonascii) index_plain_text_emails2(data_folder, path_index_file_name, whoosh_folder, stem, min_token_len, max_token_len, procs, limitmb, multisegment) config.add_section('WHOOSH') config.set('WHOOSH', 'whoosh_index_dir', os.path.normpath(whoosh_folder)) config.set('WHOOSH', 'path_index_file', os.path.normpath(path_index_file_name)) config.set('WHOOSH', 'lemmatize', lemmatize) config.set('WHOOSH', 'stem', stem) config.set('WHOOSH', 'nonascii', nonascii) save_config(cfg_file_name, config) logging.info('================================= END WHOOSH INDEXING') print '\nIndexing time:', (time.time() - start_time), 'seconds' print "Corpus building...." 
start_time = time.time() if not config.has_section('CORPUS'): logging.info('=============================== BEGIN CORPUS BUILDING') if not os.path.exists(tm_folder): os.makedirs(tm_folder) path_index_file_name = os.path.join(tm_folder, project_name + '.path.index') # build_lda_corpus(whoosh_folder, path_index_file_name, dict_file, # ldac_file, min_token_freq, min_token_len, # max_token_len) build_lda_corpus2(whoosh_folder, path_index_file_name, dict_file, ldac_file, min_token_freq, min_token_len, max_token_len, stem) config.add_section('CORPUS') config.set('CORPUS', 'tm_folder', os.path.normpath(tm_folder)) config.set('CORPUS', 'path_index_file', os.path.normpath(path_index_file_name)) config.set('CORPUS', 'blei_corpus_file', os.path.normpath(ldac_file)) config.set('CORPUS', 'dict_file', os.path.normpath(dict_file)) config.set('CORPUS', 'vocab_file', os.path.normpath(ldac_file + '.vocab')) config.set('CORPUS', 'min_token_freq', min_token_freq) config.set('CORPUS', 'min_token_len', min_token_len) config.set('CORPUS', 'max_token_len', 20) save_config(cfg_file_name, config) logging.info('=============================== END CORPUS BUILDING') print '\nCorpus building time:', (time.time() - start_time), 'seconds' # project_name = os.path.normpath(project_name) print "Topic modeling...." 
start_time = time.time() if not config.has_section('LDA'): logging.info('=============================== BEGIN LDA ESTIMATION') lda_model_file = os.path.join(tm_folder, project_name + '.lda') lda_beta_file = os.path.join(tm_folder, project_name + '.lda.beta') lda_theta_file = os.path.join(tm_folder, project_name + '.lda.theta') lda_cos_index_file = os.path.join(tm_folder, project_name + '.lda.cos.index') run_lda_estimation(dict_file, ldac_file, lda_model_file, lda_beta_file, lda_theta_file, lda_cos_index_file, num_topics, num_passes) config.add_section('LDA') config.set('LDA', 'lda_model_file', lda_model_file) config.set('LDA', 'lda_beta_file', lda_beta_file) config.set('LDA', 'lda_theta_file', lda_theta_file) config.set('LDA', 'lda_cos_index_file', lda_cos_index_file) config.set('LDA', 'num_topics', str(num_topics)) config.set('LDA', 'num_passes', str(num_passes)) save_config(cfg_file_name, config) logging.info('=============================== END LDA ESTIMATION') # logging.info('=============================== BEGIN LSI ESTIMATION') # # # Commented LSI due to an error from python interpreter on Feb 04, 2014 # # lsi_model_file = os.path.join(tm_folder, project_name + '.lsi') # lsi_beta_file = os.path.join(tm_folder, project_name + '.lsi.beta') # lsi_theta_file = os.path.join(tm_folder, project_name + '.lsi.theta') # lsi_cos_index_file = os.path.join(tm_folder, project_name + '.lsi.cos.index') # # # # run_lsi_estimation(dict_file, ldac_file, lsi_model_file, lsi_beta_file, lsi_theta_file, lsi_cos_index_file, LSI_DEFAULT_NUM_TOPICS) # # config.add_section('TFIDF') # config.set('TFIDF', 'tfidf_file', lsi_theta_file.replace('lsi', 'tfidf')) # # config.add_section('LSI') # config.set('LSI', 'lsi_model_file', lsi_model_file) # config.set('LSI', 'lsi_beta_file', lsi_beta_file) # config.set('LSI', 'lsi_theta_file', lsi_theta_file) # config.set('LSI', 'lsi_cos_index_file', lsi_cos_index_file) # config.set('LSI', 'lsi_num_topics', str(LSI_DEFAULT_NUM_TOPICS)) # # # 
logging.info('=============================== END LSI ESTIMATION') if not config.has_section('TFIDF'): logging.info('=============================== BEGIN TFIDF') tfidf_theta_file = os.path.join(tm_folder, project_name + '.tfidf.theta') run_tfidf(dict_file, ldac_file, tfidf_theta_file) config.add_section('TFIDF') config.set('TFIDF', 'tfidf_file', tfidf_theta_file) save_config(cfg_file_name, config) logging.info('=============================== END TFIDF') print '\nTopic modeling time:', (time.time() - start_time), 'seconds'
def index_data(data_folder, output_folder, project_name, cfg_folder,
               num_topics=DEFAULT_NUM_TOPICS, num_passes=DEFAULT_NUM_PASSES,
               min_token_freq=MIN_TOKEN_FREQ, min_token_len=MIN_TOKEN_LEN,
               log_to_file=True, lemmatize=False, stem=False, nonascii=True):
    """Index e-mails with Lucene and build LDA and TF-IDF models.

    Non-resumable variant: every phase (Lucene indexing, corpus building,
    LDA estimation, TF-IDF) runs unconditionally and the project .cfg file
    is written once, at the very end.

    Args:
        data_folder: directory of input documents; exits (status 1) if absent.
        output_folder: directory under which the project folder is created.
        project_name: project identifier, used for folder and file names.
        cfg_folder: directory receiving <project_name>.cfg.
        num_topics, num_passes: LDA estimation parameters.
        min_token_freq, min_token_len: vocabulary filters for corpus building.
        log_to_file: write debug log to <project_folder>/<project_name>.log
            when True, else to the console.
        lemmatize, stem, nonascii: token-normalization flags forwarded to
            the indexer.
    """
    # NOTE(review): this redefines index_data() declared earlier in this
    # file (the Whoosh-based, resumable variant) and therefore shadows it
    # at import time -- confirm which definition callers are meant to get.
    if not os.path.exists(data_folder):
        print "Please provide a valid data folder!"
        sys.exit(1)

    # Checks whether the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Checks whether the project folder exists
    project_folder = os.path.join(output_folder, project_name)
    if not os.path.exists(project_folder):
        os.makedirs(project_folder)

    # Create file handler which logs debug messages
    log_file_name = '%s.log' % os.path.join(project_folder, project_name)
    if log_to_file:
        logging.basicConfig(filename=log_file_name,
                            format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.DEBUG)
    else:
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.DEBUG)

    # Handling the project configuration file
    print 'Indexing Configurations:'
    print 'Project:', project_name
    print 'Number of LDA topics:', num_topics
    print 'Number of LDA passes:', num_passes
    print 'Number of LSA topics:', LSI_DEFAULT_NUM_TOPICS

    cfg_file_name = os.path.join(cfg_folder, project_name + '.cfg')
    config = ConfigParser.RawConfigParser()
    config.add_section('DATA')
    config.set('DATA', 'name', project_name)
    config.set('DATA', 'root_dir', os.path.normpath(data_folder))  # may need to change the name
    config.set('DATA', 'project_dir', os.path.normpath(project_folder))
    config.set('DATA', 'log_file', os.path.normpath(log_file_name))
    config.set('DATA', 'output_folder', os.path.normpath(output_folder))

    logging.info('================================================== BEGIN LUCENE INDEXING ==================================================')
    lucene_folder = os.path.join(project_folder, LUCENE_FOLDER_NAME)
    if not os.path.exists(lucene_folder):
        os.makedirs(lucene_folder)
    path_index_file_name = os.path.join(project_folder,
                                        project_name + '.path.index')
    index_plain_text_emails(data_folder, path_index_file_name, lucene_folder,
                            lemmatize=lemmatize, stem=stem, nonascii=nonascii)
    config.add_section('LUCENE')
    config.set('LUCENE', 'lucene_index_dir', os.path.normpath(lucene_folder))
    config.set('LUCENE', 'path_index_file',
               os.path.normpath(path_index_file_name))
    logging.info('================================================== END LUCENE INDEXING ==================================================')

    logging.info('================================================== BEGIN CORPUS BUILDING ==================================================')
    ''' Consider all elements in emails for creating the LDA corpus, i.e., it uses the MetaDataType.ALL field in the Lucene index '''
    tm_folder = os.path.join(project_folder, TM_FOLDER_NAME)
    if not os.path.exists(tm_folder):
        os.makedirs(tm_folder)
    dict_file = os.path.join(tm_folder, project_name + '.dict')
    ldac_file = os.path.join(tm_folder, project_name + '.ldac')
    path_index_file_name = os.path.join(tm_folder, project_name + '.path.index')  # it's for topic modeling alone
    build_lda_corpus(lucene_folder, path_index_file_name, dict_file,
                     ldac_file, min_token_freq, min_token_len)
    config.add_section('CORPUS')
    config.set('CORPUS', 'tm_folder', os.path.normpath(tm_folder))
    config.set('CORPUS', 'path_index_file',
               os.path.normpath(path_index_file_name))
    config.set('CORPUS', 'blei_corpus_file', os.path.normpath(ldac_file))
    config.set('CORPUS', 'dict_file', os.path.normpath(dict_file))
    config.set('CORPUS', 'vocab_file', os.path.normpath(ldac_file + '.vocab'))
    logging.info('================================================== END CORPUS BUILDING ==================================================')

    project_name = os.path.normpath(project_name)

    logging.info('================================================== BEGIN LDA ESTIMATION ==================================================')
    lda_model_file = os.path.join(tm_folder, project_name + '.lda')
    lda_beta_file = os.path.join(tm_folder, project_name + '.lda.beta')
    lda_theta_file = os.path.join(tm_folder, project_name + '.lda.theta')
    lda_cos_index_file = os.path.join(tm_folder,
                                      project_name + '.lda.cos.index')
    run_lda_estimation(dict_file, ldac_file, lda_model_file, lda_beta_file,
                       lda_theta_file, lda_cos_index_file, num_topics,
                       num_passes)
    # run_hdp_estimation(dict_file, ldac_file, lda_model_file, lda_beta_file, lda_theta_file, lda_cos_index_file)
    config.add_section('LDA')
    config.set('LDA', 'lda_model_file', lda_model_file)
    config.set('LDA', 'lda_beta_file', lda_beta_file)
    config.set('LDA', 'lda_theta_file', lda_theta_file)
    config.set('LDA', 'lda_cos_index_file', lda_cos_index_file)
    config.set('LDA', 'num_topics', str(num_topics))
    config.set('LDA', 'num_passes', str(num_passes))
    logging.info('================================================== END LDA ESTIMATION ==================================================')

    # logging.info('================================================== BEGIN LSI ESTIMATION ==================================================')
    #
    # # Commented LSI due to an error from python interpreter on Feb 04, 2014
    #
    # lsi_model_file = os.path.join(tm_folder, project_name + '.lsi')
    # lsi_beta_file = os.path.join(tm_folder, project_name + '.lsi.beta')
    # lsi_theta_file = os.path.join(tm_folder, project_name + '.lsi.theta')
    # lsi_cos_index_file = os.path.join(tm_folder, project_name + '.lsi.cos.index')
    #
    # run_lsi_estimation(dict_file, ldac_file, lsi_model_file, lsi_beta_file, lsi_theta_file, lsi_cos_index_file, LSI_DEFAULT_NUM_TOPICS)
    #
    # config.add_section('TFIDF')
    # config.set('TFIDF', 'tfidf_file', lsi_theta_file.replace('lsi', 'tfidf'))
    #
    # config.add_section('LSI')
    # config.set('LSI', 'lsi_model_file', lsi_model_file)
    # config.set('LSI', 'lsi_beta_file', lsi_beta_file)
    # config.set('LSI', 'lsi_theta_file', lsi_theta_file)
    # config.set('LSI', 'lsi_cos_index_file', lsi_cos_index_file)
    # config.set('LSI', 'lsi_num_topics', str(LSI_DEFAULT_NUM_TOPICS))
    #
    # logging.info('================================================== END LSI ESTIMATION ==================================================')

    logging.info('================================================== BEGIN TFIDF ==================================================')
    tfidf_theta_file = os.path.join(tm_folder, project_name + '.tfidf.theta')
    run_tfidf(dict_file, ldac_file, tfidf_theta_file)
    config.add_section('TFIDF')
    config.set('TFIDF', 'tfidf_file', tfidf_theta_file)
    logging.info('================================================== END TFIDF ==================================================')

    # Writing our configuration file to 'project.cfg'
    with open(cfg_file_name, 'w') as configfile:
        config.write(configfile)
    logging.info('The project configuration file is written to %s',
                 cfg_file_name)
    print 'Indexing is completed. The project configuration file is written to', cfg_file_name
def index_data2(data_folder, output_folder, project_name, num_topics=DEFAULT_NUM_TOPICS, num_passes=DEFAULT_NUM_PASSES, min_token_freq=MIN_TOKEN_FREQ, min_token_len=MIN_TOKEN_LEN, max_token_len=MAX_TOKEN_LEN, stem=False, procs=4, limitmb=512, multisegment=True): def save_config(cfg_file_name, config): # Writing our configuration file with open(cfg_file_name, 'w') as configfile: config.write(configfile) logging.info('The project configuration file is written to %s', cfg_file_name) if not os.path.exists(data_folder): logging.error("Please provide a valid data folder!") sys.exit(1) # Checks whether the output folder exists if not os.path.exists(output_folder): os.makedirs(output_folder) # Checks whether the project folder exists project_folder = os.path.join(output_folder, project_name) if not os.path.exists(project_folder): os.makedirs(project_folder) tm_folder = os.path.join(project_folder, TM_FOLDER_NAME) whoosh_folder = os.path.join(project_folder, WHOOSH_FOLDER_NAME) dict_file = os.path.join(tm_folder, project_name + '.dict') ldac_file = os.path.join(tm_folder, project_name + '.ldac') cfg_file_name = os.path.join(output_folder, project_name + '.cfg') # Handling the project configuration file logging.info('Indexing Configurations:') logging.info('Project: %s', project_name) logging.info('LDA Number of topics: %d', num_topics) logging.info('LDA Number of passes: %d', num_passes) config = ConfigParser.RawConfigParser() if os.path.exists(cfg_file_name): config.read(cfg_file_name) if not config.has_section('DATA'): config.add_section('DATA') config.set('DATA', 'name', project_name) config.set('DATA', 'root_dir', os.path.normpath(data_folder)) config.set('DATA', 'project_dir', os.path.normpath(project_folder)) config.set('DATA', 'output_folder', os.path.normpath(output_folder)) save_config(cfg_file_name, config) #print "Indexing documents...." 
start_time = time.time() if not config.has_section('WHOOSH'): logging.info('================================= BEGIN WHOOSH INDEXING') if not os.path.exists(whoosh_folder): os.makedirs(whoosh_folder) path_index_file_name = os.path.join(project_folder, project_name + '.path.index') index_plain_text_emails2(data_folder, path_index_file_name, whoosh_folder, stem, min_token_len, max_token_len, procs, limitmb, multisegment) config.add_section('WHOOSH') config.set('WHOOSH', 'whoosh_index_dir', os.path.normpath(whoosh_folder)) config.set('WHOOSH', 'path_index_file', os.path.normpath(path_index_file_name)) config.set('WHOOSH', 'stem', stem) save_config(cfg_file_name, config) logging.info('================================= END WHOOSH INDEXING') print '\nIndexing time:', (time.time() - start_time), 'seconds' #print "Corpus building...." start_time = time.time() if not config.has_section('CORPUS'): logging.info('=============================== BEGIN CORPUS BUILDING') if not os.path.exists(tm_folder): os.makedirs(tm_folder) path_index_file_name = os.path.join(tm_folder, project_name + '.path.index') build_lda_corpus2(whoosh_folder, path_index_file_name, dict_file, ldac_file, min_token_freq, min_token_len, max_token_len, stem) config.add_section('CORPUS') config.set('CORPUS', 'tm_folder', os.path.normpath(tm_folder)) config.set('CORPUS', 'path_index_file', os.path.normpath(path_index_file_name)) config.set('CORPUS', 'blei_corpus_file', os.path.normpath(ldac_file)) config.set('CORPUS', 'dict_file', os.path.normpath(dict_file)) config.set('CORPUS', 'vocab_file', os.path.normpath(ldac_file + '.vocab')) config.set('CORPUS', 'min_token_freq', min_token_freq) config.set('CORPUS', 'min_token_len', min_token_len) config.set('CORPUS', 'max_token_len', 20) save_config(cfg_file_name, config) logging.info('=============================== END CORPUS BUILDING') print '\nCorpus building time:', (time.time() - start_time), 'seconds' #print "Topic modeling...." 
start_time = time.time() if not config.has_section('LDA'): logging.info('=============================== BEGIN LDA ESTIMATION') lda_model_file = os.path.join(tm_folder, project_name + '.lda') lda_beta_file = os.path.join(tm_folder, project_name + '.lda.beta') lda_theta_file = os.path.join(tm_folder, project_name + '.lda.theta') lda_cos_index_file = os.path.join(tm_folder, project_name + '.lda.cos.index') run_lda_estimation(dict_file, ldac_file, lda_model_file, lda_beta_file, lda_theta_file, lda_cos_index_file, num_topics, num_passes) config.add_section('LDA') config.set('LDA', 'lda_model_file', lda_model_file) config.set('LDA', 'lda_beta_file', lda_beta_file) config.set('LDA', 'lda_theta_file', lda_theta_file) config.set('LDA', 'lda_cos_index_file', lda_cos_index_file) config.set('LDA', 'num_topics', str(num_topics)) config.set('LDA', 'num_passes', str(num_passes)) save_config(cfg_file_name, config) logging.info('=============================== END LDA ESTIMATION') if not config.has_section('TFIDF'): logging.info('=============================== BEGIN TFIDF') tfidf_theta_file = os.path.join(tm_folder, project_name + '.tfidf.theta') run_tfidf(dict_file, ldac_file, tfidf_theta_file) config.add_section('TFIDF') config.set('TFIDF', 'tfidf_file', tfidf_theta_file) save_config(cfg_file_name, config) logging.info('=============================== END TFIDF') print '\nTopic modeling time:', (time.time() - start_time), 'seconds'
def index_and_tm(data_folder, output_folder, project_name, num_topics, num_passes, min_token_freq, min_token_len, max_token_len, stem=False, procs=4, limitmb=512, multisegment=True): if not os.path.exists(data_folder): logging.error("Please provide a valid data folder!") sys.exit(1) # Checks whether the output folder exists if not os.path.exists(output_folder): os.makedirs(output_folder) # Checks whether the project folder exists project_folder = os.path.join(output_folder, project_name) if not os.path.exists(project_folder): os.makedirs(project_folder) tm_folder = os.path.join(project_folder, TM_FOLDER_NAME) whoosh_folder = os.path.join(project_folder, WHOOSH_FOLDER_NAME) # Handling the project configuration file logging.info('Indexing Configurations:') logging.info('Project: %s', project_name) #print "Indexing documents...." start_time = time.time() logging.info('================================= BEGIN WHOOSH INDEXING') if not os.path.exists(whoosh_folder): os.makedirs(whoosh_folder) path_index_file_name = os.path.join(project_folder, project_name + '.path.index') index_plain_text_emails2(data_folder, path_index_file_name, whoosh_folder, stem, min_token_len, max_token_len, procs, limitmb, multisegment) logging.info('================================= END WHOOSH INDEXING') print '\nIndexing time:', (time.time() - start_time), 'seconds' #print "Corpus building...." 
start_time = time.time() logging.info('=============================== BEGIN CORPUS BUILDING') if not os.path.exists(tm_folder): os.makedirs(tm_folder) dict_file = os.path.join(tm_folder, project_name + '.dict') ldac_file = os.path.join(tm_folder, project_name + '.ldac') path_index_file_name = os.path.join(tm_folder, project_name + '.path.index') build_lda_corpus2(whoosh_folder, path_index_file_name, dict_file, ldac_file, min_token_freq, min_token_len, max_token_len, stem) logging.info('=============================== END CORPUS BUILDING') print '\nCorpus building time:', (time.time() - start_time), 'seconds' #print "Topic modeling...." start_time = time.time() for k in num_topics: logging.info('=============================== BEGIN LDA ESTIMATION') logging.info('LDA Number of topics: %d', k) logging.info('LDA Number of passes: %d', num_passes) lda_model_file = os.path.join(tm_folder, project_name + '-K%d-VB.lda' % k) lda_beta_file = os.path.join(tm_folder, project_name + '-K%d-VB.lda.beta' % k) lda_theta_file = os.path.join(tm_folder, project_name + '-K%d-VB.lda.theta' % k) lda_cos_index_file = os.path.join(tm_folder, project_name + '-K%d-VB.lda.cos.index' % k) run_lda_estimation(dict_file, ldac_file, lda_model_file, lda_beta_file, lda_theta_file, lda_cos_index_file, k, num_passes) logging.info('=============================== END LDA ESTIMATION') logging.info('=============================== BEGIN TFIDF') tfidf_theta_file = os.path.join(tm_folder, project_name + '.tfidf.theta') run_tfidf(dict_file, ldac_file, tfidf_theta_file) logging.info('=============================== END TFIDF') print '\nTopic modeling time:', (time.time() - start_time), 'seconds'