def _ensure_folder(path):
    # Creates the folder if it does not already exist (idempotent).
    if not os.path.exists(path):
        os.makedirs(path)


def index_data(data_folder, output_folder, project_name, cfg_folder,
               num_topics=DEFAULT_NUM_TOPICS, num_passes=DEFAULT_NUM_PASSES,
               min_token_freq=MIN_TOKEN_FREQ, min_token_len=MIN_TOKEN_LEN,
               log_to_file=True, lemmatize=False, stem=False, nonascii=True):
    '''Runs the full e-mail indexing pipeline and records artifacts in a .cfg file.

    Pipeline stages: (1) Lucene indexing of the plain-text e-mails,
    (2) LDA corpus building from the Lucene index, (3) LDA estimation,
    and (4) TF-IDF computation.  The path of every produced artifact is
    written to <cfg_folder>/<project_name>.cfg for downstream tools.

    Arguments:
    data_folder -- folder holding the raw e-mail files; the process
                   exits with status 1 when it does not exist
    output_folder -- root folder for all generated output (created if missing)
    project_name -- name used for the project sub-folder and file stems
    cfg_folder -- folder that receives the project configuration file
    num_topics -- number of LDA topics to estimate
    num_passes -- number of LDA passes over the corpus
    min_token_freq -- minimum token frequency kept when building the corpus
    min_token_len -- minimum token length kept when building the corpus
    log_to_file -- when True, log to <project>/<project>.log; otherwise
                   log to the console
    lemmatize, stem, nonascii -- token pre-processing switches forwarded
                                 to the Lucene indexer
    '''
    if not os.path.exists(data_folder):
        print('Please provide a valid data folder!')
        sys.exit(1)

    # Checks whether the output and project folders exist
    _ensure_folder(output_folder)
    project_folder = os.path.join(output_folder, project_name)
    _ensure_folder(project_folder)

    # Create file handler which logs debug messages
    log_file_name = '%s.log' % os.path.join(project_folder, project_name)
    if log_to_file:
        logging.basicConfig(filename=log_file_name,
                            format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.DEBUG)
    else:
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.DEBUG)

    # Handling the project configuration file
    print('Indexing Configurations:')
    print('Project: %s' % project_name)
    print('Number of LDA topics: %s' % num_topics)
    print('Number of LDA passes: %s' % num_passes)
    print('Number of LSA topics: %s' % LSI_DEFAULT_NUM_TOPICS)

    cfg_file_name = os.path.join(cfg_folder, project_name + '.cfg')
    config = ConfigParser.RawConfigParser()
    config.add_section('DATA')
    config.set('DATA', 'name', project_name)
    config.set('DATA', 'root_dir', os.path.normpath(data_folder))  # may need to change the name
    config.set('DATA', 'project_dir', os.path.normpath(project_folder))
    config.set('DATA', 'log_file', os.path.normpath(log_file_name))
    config.set('DATA', 'output_folder', os.path.normpath(output_folder))

    # ---- Stage 1: Lucene indexing --------------------------------------
    logging.info('================================================== BEGIN LUCENE INDEXING ==================================================')
    lucene_folder = os.path.join(project_folder, LUCENE_FOLDER_NAME)
    _ensure_folder(lucene_folder)
    path_index_file_name = os.path.join(project_folder, project_name + '.path.index')
    index_plain_text_emails(data_folder, path_index_file_name, lucene_folder,
                            lemmatize=lemmatize, stem=stem, nonascii=nonascii)
    config.add_section('LUCENE')
    config.set('LUCENE', 'lucene_index_dir', os.path.normpath(lucene_folder))
    config.set('LUCENE', 'path_index_file', os.path.normpath(path_index_file_name))
    logging.info('================================================== END LUCENE INDEXING ==================================================')

    # ---- Stage 2: LDA corpus building ----------------------------------
    logging.info('================================================== BEGIN CORPUS BUILDING ==================================================')
    # Consider all elements in emails for creating the LDA corpus, i.e.,
    # it uses the MetaDataType.ALL field in the Lucene index
    tm_folder = os.path.join(project_folder, TM_FOLDER_NAME)
    _ensure_folder(tm_folder)
    dict_file = os.path.join(tm_folder, project_name + '.dict')
    ldac_file = os.path.join(tm_folder, project_name + '.ldac')
    # A second, separate path index kept for topic modeling alone.
    path_index_file_name = os.path.join(tm_folder, project_name + '.path.index')
    build_lda_corpus(lucene_folder, path_index_file_name, dict_file, ldac_file,
                     min_token_freq, min_token_len)
    config.add_section('CORPUS')
    config.set('CORPUS', 'tm_folder', os.path.normpath(tm_folder))
    config.set('CORPUS', 'path_index_file', os.path.normpath(path_index_file_name))
    config.set('CORPUS', 'blei_corpus_file', os.path.normpath(ldac_file))
    config.set('CORPUS', 'dict_file', os.path.normpath(dict_file))
    config.set('CORPUS', 'vocab_file', os.path.normpath(ldac_file + '.vocab'))
    logging.info('================================================== END CORPUS BUILDING ==================================================')

    # NOTE(review): normpath on a plain project name is a no-op; kept for
    # compatibility with the original behavior — verify the intent.
    project_name = os.path.normpath(project_name)

    # ---- Stage 3: LDA estimation ---------------------------------------
    logging.info('================================================== BEGIN LDA ESTIMATION ==================================================')
    lda_model_file = os.path.join(tm_folder, project_name + '.lda')
    lda_beta_file = os.path.join(tm_folder, project_name + '.lda.beta')
    lda_theta_file = os.path.join(tm_folder, project_name + '.lda.theta')
    lda_cos_index_file = os.path.join(tm_folder, project_name + '.lda.cos.index')
    run_lda_estimation(dict_file, ldac_file, lda_model_file, lda_beta_file,
                       lda_theta_file, lda_cos_index_file, num_topics, num_passes)
    config.add_section('LDA')
    config.set('LDA', 'lda_model_file', lda_model_file)
    config.set('LDA', 'lda_beta_file', lda_beta_file)
    config.set('LDA', 'lda_theta_file', lda_theta_file)
    config.set('LDA', 'lda_cos_index_file', lda_cos_index_file)
    config.set('LDA', 'num_topics', str(num_topics))
    config.set('LDA', 'num_passes', str(num_passes))
    logging.info('================================================== END LDA ESTIMATION ==================================================')

    # NOTE: the LSI estimation stage that used to run here was disabled on
    # Feb 04, 2014 due to a python interpreter error; the dead commented-out
    # code was removed during review.

    # ---- Stage 4: TF-IDF -----------------------------------------------
    logging.info('================================================== BEGIN TFIDF ==================================================')
    tfidf_theta_file = os.path.join(tm_folder, project_name + '.tfidf.theta')
    run_tfidf(dict_file, ldac_file, tfidf_theta_file)
    config.add_section('TFIDF')
    config.set('TFIDF', 'tfidf_file', tfidf_theta_file)
    logging.info('================================================== END TFIDF ==================================================')

    # Writing our configuration file to 'project.cfg'
    with open(cfg_file_name, 'w') as configfile:
        config.write(configfile)
    logging.info('The project configuration file is written to %s', cfg_file_name)
    print('Indexing is completed. The project configuration file is written to %s' % cfg_file_name)
''') arg_parser.add_argument("-d", dest="data_folder", type=str, help="data folder", required=True) arg_parser.add_argument("-o", dest="index_folder", type=str, help="output folder", required=True) arg_parser.add_argument("-p", dest="path_index_file", type=str, help="File paths index file", required=True) arg_parser.add_argument("-l", "--log", dest="log", default=False, action="store_true", help="log details into a file") arg_parser.add_argument("-f", dest="log_file", type=str, help="Logs file (default: index_dir.log)", default='index_dir.log') args = arg_parser.parse_args() if not os.path.exists(args.data_folder): print "Please provide a valid data folder!" sys.exit(1) if not os.path.exists(args.index_folder): os.makedirs(args.index_folder) # create file handler which # logs debug messages if args.log: logging.basicConfig(filename=args.log_file, format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) else: logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) logging.info('=============================================================================================================') index_plain_text_emails(args.data_folder, args.path_index_file, args.index_folder) logging.info('=============================================================================================================')