def index_data(data_folder, output_folder, project_name, cfg_folder, 
               num_topics=DEFAULT_NUM_TOPICS, num_passes=DEFAULT_NUM_PASSES, 
               min_token_freq=MIN_TOKEN_FREQ, min_token_len=MIN_TOKEN_LEN, 
               max_token_len=MAX_TOKEN_LEN, log_to_file=False, lemmatize=False, 
               stem=False, nonascii=True, procs=4, limitmb=512, multisegment=True):
    
    def save_config(cfg_file_name, config):
        # Writing our configuration file
        with open(cfg_file_name, 'w') as configfile:
            config.write(configfile)
        logging.info('The project configuration file is written to %s', 
                     cfg_file_name)
        # print 'The project configuration file is written to', cfg_file_name


    if not os.path.exists(data_folder):
        logging.error("Please provide a valid data folder!")
        sys.exit(1)
           
    # Checks whether the output folder exists 
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Checks whether the project folder exists 
    project_folder = os.path.join(output_folder, project_name)
    if not os.path.exists(project_folder):
        os.makedirs(project_folder)
        
    tm_folder = os.path.join(project_folder, TM_FOLDER_NAME)
    whoosh_folder = os.path.join(project_folder, WHOOSH_FOLDER_NAME)
    dict_file = os.path.join(tm_folder, project_name + '.dict')
    ldac_file = os.path.join(tm_folder, project_name + '.ldac')
    cfg_file_name = os.path.join(cfg_folder, project_name + '.cfg')
    
    # Create file handler which logs debug messages
    if log_to_file: 
        log_file_name = os.path.join(cfg_folder, project_name + '.log')
        logging.basicConfig(filename=log_file_name, format=LOG_FORMAT, 
                            level=logging.DEBUG)
        print "Log file:", log_file_name
    else: 
        logging.basicConfig(format=LOG_FORMAT, level=logging.DEBUG)

    
    # Handling the project configuration file 
    
    logging.info('Indexing Configurations:')
    logging.info('Project: %s', project_name)
    logging.info('Number of LDA topics: %d', num_topics)
    logging.info('Number of LDA passes: %d', num_passes)
    logging.info('Number of LSA topics: %d', LSI_DEFAULT_NUM_TOPICS)

    config = ConfigParser.RawConfigParser()
    if os.path.exists(cfg_file_name): config.read(cfg_file_name)

    if not config.has_section('DATA'):
        config.add_section('DATA')
        config.set('DATA', 'name', project_name)
        config.set('DATA', 'root_dir', os.path.normpath(data_folder)) 
        config.set('DATA', 'project_dir', os.path.normpath(project_folder))
        config.set('DATA', 'output_folder', os.path.normpath(output_folder))   
        save_config(cfg_file_name, config)
    
    print "Indexing documents...."
    start_time = time.time()
    
    if not config.has_section('WHOOSH'):
        logging.info('================================= BEGIN WHOOSH INDEXING')
        
        if not os.path.exists(whoosh_folder): os.makedirs(whoosh_folder)
        path_index_file_name = os.path.join(project_folder, 
                                            project_name + '.path.index')
        
#         index_plain_text_emails(data_folder, path_index_file_name, 
#                                 whoosh_folder, lemmatize=lemmatize, 
#                                 stem=stem, nonascii=nonascii) 
        
        index_plain_text_emails2(data_folder, path_index_file_name, 
                                 whoosh_folder, stem, 
                                 min_token_len, max_token_len, 
                                 procs, limitmb, multisegment) 
            
        config.add_section('WHOOSH')
        config.set('WHOOSH', 'whoosh_index_dir', 
                   os.path.normpath(whoosh_folder))
        config.set('WHOOSH', 'path_index_file', 
                   os.path.normpath(path_index_file_name))
        config.set('WHOOSH', 'lemmatize', lemmatize)
        config.set('WHOOSH', 'stem', stem)
        config.set('WHOOSH', 'nonascii', nonascii)
        save_config(cfg_file_name, config)
        logging.info('================================= END WHOOSH INDEXING')

    print '\nIndexing time:', (time.time() - start_time), 'seconds'
    
    print "Corpus building...."
    start_time = time.time()

    if not config.has_section('CORPUS'):
        logging.info('=============================== BEGIN CORPUS BUILDING')
         
        if not os.path.exists(tm_folder): os.makedirs(tm_folder)
        path_index_file_name = os.path.join(tm_folder, 
                                            project_name + '.path.index') 
#         build_lda_corpus(whoosh_folder, path_index_file_name, dict_file, 
#                          ldac_file, min_token_freq, min_token_len, 
#                          max_token_len)
        
        build_lda_corpus2(whoosh_folder, path_index_file_name, dict_file, 
                         ldac_file, min_token_freq, min_token_len, 
                         max_token_len, stem)
         
        config.add_section('CORPUS')
        config.set('CORPUS', 'tm_folder', os.path.normpath(tm_folder))
        config.set('CORPUS', 'path_index_file', 
                   os.path.normpath(path_index_file_name))
        config.set('CORPUS', 'blei_corpus_file', os.path.normpath(ldac_file))
        config.set('CORPUS', 'dict_file', os.path.normpath(dict_file))
        config.set('CORPUS', 'vocab_file', 
                   os.path.normpath(ldac_file + '.vocab')) 
        config.set('CORPUS', 'min_token_freq', min_token_freq)
        config.set('CORPUS', 'min_token_len', min_token_len)
        config.set('CORPUS', 'max_token_len', 20)
        save_config(cfg_file_name, config) 
        logging.info('=============================== END CORPUS BUILDING')
     
    
    print '\nCorpus building time:', (time.time() - start_time), 'seconds'
    
    # project_name = os.path.normpath(project_name)
     
    print "Topic modeling...."
    start_time = time.time()
    
    if not config.has_section('LDA'):
        logging.info('=============================== BEGIN LDA ESTIMATION')
        lda_model_file = os.path.join(tm_folder, project_name + '.lda')
        lda_beta_file = os.path.join(tm_folder, project_name + '.lda.beta')
        lda_theta_file = os.path.join(tm_folder, project_name + '.lda.theta')
        lda_cos_index_file = os.path.join(tm_folder, 
                                          project_name + '.lda.cos.index')
        run_lda_estimation(dict_file, ldac_file, lda_model_file, lda_beta_file, 
                           lda_theta_file, lda_cos_index_file, num_topics, 
                           num_passes)
        config.add_section('LDA')
        config.set('LDA', 'lda_model_file', lda_model_file)
        config.set('LDA', 'lda_beta_file', lda_beta_file)
        config.set('LDA', 'lda_theta_file', lda_theta_file)
        config.set('LDA', 'lda_cos_index_file', lda_cos_index_file)
        config.set('LDA', 'num_topics', str(num_topics))
        config.set('LDA', 'num_passes', str(num_passes))    
        save_config(cfg_file_name, config)
        logging.info('=============================== END LDA ESTIMATION')    
     
     
     
#     logging.info('=============================== BEGIN LSI ESTIMATION')
#     
#     # Commented LSI due to an error from python interpreter on Feb 04, 2014
#  
#     lsi_model_file = os.path.join(tm_folder, project_name + '.lsi')
#     lsi_beta_file = os.path.join(tm_folder, project_name + '.lsi.beta')
#     lsi_theta_file = os.path.join(tm_folder, project_name + '.lsi.theta')
#     lsi_cos_index_file = os.path.join(tm_folder, project_name + '.lsi.cos.index')
#     # 
#       
#     run_lsi_estimation(dict_file, ldac_file, lsi_model_file, lsi_beta_file, lsi_theta_file, lsi_cos_index_file, LSI_DEFAULT_NUM_TOPICS)
#   
#     config.add_section('TFIDF')
#     config.set('TFIDF', 'tfidf_file', lsi_theta_file.replace('lsi', 'tfidf'))
#           
#     config.add_section('LSI')
#     config.set('LSI', 'lsi_model_file', lsi_model_file)
#     config.set('LSI', 'lsi_beta_file', lsi_beta_file)
#     config.set('LSI', 'lsi_theta_file', lsi_theta_file)
#     config.set('LSI', 'lsi_cos_index_file', lsi_cos_index_file)
#     config.set('LSI', 'lsi_num_topics', str(LSI_DEFAULT_NUM_TOPICS))
# 
#     
#     logging.info('=============================== END LSI ESTIMATION')    
 
    if not config.has_section('TFIDF'):
        logging.info('=============================== BEGIN TFIDF')
        tfidf_theta_file = os.path.join(tm_folder, project_name + '.tfidf.theta')
        run_tfidf(dict_file, ldac_file, tfidf_theta_file)
        config.add_section('TFIDF')
        config.set('TFIDF', 'tfidf_file', tfidf_theta_file)
        save_config(cfg_file_name, config)
        logging.info('=============================== END TFIDF')    


    print '\nTopic modeling time:', (time.time() - start_time), 'seconds'
# --- Example #2: alternative (Lucene-based) implementation scraped from
# another source; the stray non-code text here was commented out because it
# broke the module at import/run time. ---
def index_data(data_folder, output_folder, project_name, cfg_folder, 
               num_topics=DEFAULT_NUM_TOPICS, num_passes=DEFAULT_NUM_PASSES, 
               min_token_freq=MIN_TOKEN_FREQ, min_token_len=MIN_TOKEN_LEN, 
               log_to_file=True, lemmatize=False, stem=False, nonascii=True):
    '''Runs the Lucene-based indexing pipeline: Lucene text indexing, LDA 
    corpus building, LDA estimation, and TF-IDF, writing a fresh project 
    configuration file at the end.
    
    Unlike the Whoosh-based variant, this version is NOT resumable: it 
    always rebuilds every stage and only writes the .cfg once, at the end.
    
    NOTE(review): this redefines index_data and therefore shadows the 
    earlier Whoosh-based function of the same name in this module — 
    confirm which definition is meant to be active.
    
    Arguments: 
        data_folder - folder holding the raw documents (must exist; the 
            process exits with status 1 otherwise)
        output_folder - root folder for all generated artifacts 
        project_name - name used for the project folder and file prefixes 
        cfg_folder - folder where the project .cfg file is written 
        num_topics - number of LDA topics 
        num_passes - number of LDA passes over the corpus 
        min_token_freq - minimum token frequency for the corpus dictionary 
        min_token_len - minimum token length 
        log_to_file - if True, log to <project_folder>/<project_name>.log 
        lemmatize, stem, nonascii - token normalization options passed to 
            the Lucene indexer 
    '''
    
    if not os.path.exists(data_folder):
        print "Please provide a valid data folder!"
        sys.exit(1)
        
    # Checks whether the output folder exists 
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Checks whether the project folder exists 
    project_folder = os.path.join(output_folder, project_name)
    if not os.path.exists(project_folder):
        os.makedirs(project_folder)

    # Create file handler which logs debug messages
    log_file_name = '%s.log' % os.path.join(project_folder, project_name)

    if log_to_file: 
        logging.basicConfig(filename=log_file_name, 
                            format='%(asctime)s : %(levelname)s : %(message)s', 
                            level=logging.DEBUG)
    else: 
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                            level=logging.DEBUG)

    
    # Handling the project configuration file 
    
    print 'Indexing Configurations:'
    print 'Project:', project_name
    print 'Number of LDA topics:', num_topics
    print 'Number of LDA passes:', num_passes
    print 'Number of LSA topics:', LSI_DEFAULT_NUM_TOPICS
    
    # A fresh config is built every run and written only at the very end 
    cfg_file_name = os.path.join( cfg_folder , project_name + '.cfg' )
    config = ConfigParser.RawConfigParser()

    config.add_section('DATA')
    config.set('DATA', 'name', project_name)
    config.set('DATA', 'root_dir', os.path.normpath(data_folder)) # may need to change the name 
    config.set('DATA', 'project_dir', os.path.normpath(project_folder))
    config.set('DATA', 'log_file', os.path.normpath(log_file_name))
    config.set('DATA', 'output_folder', os.path.normpath(output_folder))   
        
    logging.info('================================================== BEGIN LUCENE INDEXING ==================================================')
    
    lucene_folder = os.path.join(project_folder, LUCENE_FOLDER_NAME)
    if not os.path.exists(lucene_folder): os.makedirs(lucene_folder)
    path_index_file_name = os.path.join(project_folder, project_name + '.path.index')
    
    index_plain_text_emails(data_folder, path_index_file_name, lucene_folder, 
                            lemmatize=lemmatize, stem=stem, nonascii=nonascii)
    
    config.add_section('LUCENE')
    config.set('LUCENE', 'lucene_index_dir', os.path.normpath(lucene_folder))
    config.set('LUCENE', 'path_index_file', os.path.normpath(path_index_file_name))
    
    logging.info('================================================== END LUCENE INDEXING ==================================================')

    logging.info('================================================== BEGIN CORPUS BUILDING ==================================================')

    '''
    Consider all elements in emails for creating the LDA corpus, 
    i.e., it uses the MetaDataType.ALL field in the Lucene index 
    '''
    
    tm_folder = os.path.join(project_folder, TM_FOLDER_NAME)
    if not os.path.exists(tm_folder): os.makedirs(tm_folder)
    dict_file = os.path.join(tm_folder, project_name + '.dict')
    ldac_file = os.path.join(tm_folder, project_name + '.ldac')
    path_index_file_name = os.path.join(tm_folder, project_name + '.path.index') # it's for topic modeling alone 
    
    build_lda_corpus(lucene_folder, path_index_file_name, dict_file, ldac_file, 
                     min_token_freq, min_token_len)
    
    config.add_section('CORPUS')
    config.set('CORPUS', 'tm_folder', os.path.normpath(tm_folder))
    config.set('CORPUS', 'path_index_file', os.path.normpath(path_index_file_name))
    config.set('CORPUS', 'blei_corpus_file', os.path.normpath(ldac_file))
    config.set('CORPUS', 'dict_file', os.path.normpath(dict_file))
    config.set('CORPUS', 'vocab_file', os.path.normpath(ldac_file + '.vocab'))  

    
    logging.info('================================================== END CORPUS BUILDING ==================================================')
    
    # NOTE(review): normpath on a bare project name is a no-op in the common 
    # case — presumably defensive; verify it is still needed 
    project_name = os.path.normpath(project_name)
    
    logging.info('================================================== BEGIN LDA ESTIMATION ==================================================')

    lda_model_file = os.path.join(tm_folder, project_name + '.lda')
    lda_beta_file = os.path.join(tm_folder, project_name + '.lda.beta')
    lda_theta_file = os.path.join(tm_folder, project_name + '.lda.theta')
    lda_cos_index_file = os.path.join(tm_folder, project_name + '.lda.cos.index')
    
    run_lda_estimation(dict_file, ldac_file, lda_model_file, lda_beta_file, 
                       lda_theta_file, lda_cos_index_file, num_topics, num_passes)
    
    # run_hdp_estimation(dict_file, ldac_file, lda_model_file, lda_beta_file, lda_theta_file, lda_cos_index_file)
    
    config.add_section('LDA')
    config.set('LDA', 'lda_model_file', lda_model_file)
    config.set('LDA', 'lda_beta_file', lda_beta_file)
    config.set('LDA', 'lda_theta_file', lda_theta_file)
    config.set('LDA', 'lda_cos_index_file', lda_cos_index_file)
    config.set('LDA', 'num_topics', str(num_topics))
    config.set('LDA', 'num_passes', str(num_passes))    
    
    logging.info('================================================== END LDA ESTIMATION ==================================================')    
    
    
    
#     logging.info('================================================== BEGIN LSI ESTIMATION ==================================================')
#     
#     # Commented LSI due to an error from python interpreter on Feb 04, 2014
#  
#     lsi_model_file = os.path.join(tm_folder, project_name + '.lsi')
#     lsi_beta_file = os.path.join(tm_folder, project_name + '.lsi.beta')
#     lsi_theta_file = os.path.join(tm_folder, project_name + '.lsi.theta')
#     lsi_cos_index_file = os.path.join(tm_folder, project_name + '.lsi.cos.index')
#     # 
#       
#     run_lsi_estimation(dict_file, ldac_file, lsi_model_file, lsi_beta_file, lsi_theta_file, lsi_cos_index_file, LSI_DEFAULT_NUM_TOPICS)
#   
#     config.add_section('TFIDF')
#     config.set('TFIDF', 'tfidf_file', lsi_theta_file.replace('lsi', 'tfidf'))
#           
#     config.add_section('LSI')
#     config.set('LSI', 'lsi_model_file', lsi_model_file)
#     config.set('LSI', 'lsi_beta_file', lsi_beta_file)
#     config.set('LSI', 'lsi_theta_file', lsi_theta_file)
#     config.set('LSI', 'lsi_cos_index_file', lsi_cos_index_file)
#     config.set('LSI', 'lsi_num_topics', str(LSI_DEFAULT_NUM_TOPICS))
# 
#     
#     logging.info('================================================== END LSI ESTIMATION ==================================================')    

    logging.info('================================================== BEGIN TFIDF ==================================================')

    tfidf_theta_file = os.path.join(tm_folder, project_name + '.tfidf.theta')
    run_tfidf(dict_file, ldac_file, tfidf_theta_file)
 
    config.add_section('TFIDF')
    config.set('TFIDF', 'tfidf_file', tfidf_theta_file)


    
    logging.info('================================================== END TFIDF ==================================================')    


    
    
    # Writing our configuration file to 'project.cfg'
    with open(cfg_file_name, 'w') as configfile:
        config.write(configfile)
        
    logging.info('The project configuration file is written to %s', cfg_file_name)
    print 'Indexing is completed. The project configuration file is written to', cfg_file_name
def index_data2(data_folder, output_folder, project_name, 
                num_topics=DEFAULT_NUM_TOPICS, num_passes=DEFAULT_NUM_PASSES, 
                min_token_freq=MIN_TOKEN_FREQ, min_token_len=MIN_TOKEN_LEN, 
                max_token_len=MAX_TOKEN_LEN, stem=False, procs=4, limitmb=512, 
                multisegment=True):
    
    def save_config(cfg_file_name, config):
        # Writing our configuration file
        with open(cfg_file_name, 'w') as configfile:
            config.write(configfile)
        logging.info('The project configuration file is written to %s', 
                     cfg_file_name)


    if not os.path.exists(data_folder):
        logging.error("Please provide a valid data folder!")
        sys.exit(1)
           
    # Checks whether the output folder exists 
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Checks whether the project folder exists 
    project_folder = os.path.join(output_folder, project_name)
    if not os.path.exists(project_folder):
        os.makedirs(project_folder)
        
    tm_folder = os.path.join(project_folder, TM_FOLDER_NAME)
    whoosh_folder = os.path.join(project_folder, WHOOSH_FOLDER_NAME)
    dict_file = os.path.join(tm_folder, project_name + '.dict')
    ldac_file = os.path.join(tm_folder, project_name + '.ldac')
    cfg_file_name = os.path.join(output_folder, project_name + '.cfg')
    
    
    # Handling the project configuration file 
    
    logging.info('Indexing Configurations:')
    logging.info('Project: %s', project_name)
    logging.info('LDA Number of topics: %d', num_topics)
    logging.info('LDA Number of passes: %d', num_passes)

    config = ConfigParser.RawConfigParser()
    if os.path.exists(cfg_file_name): config.read(cfg_file_name)

    if not config.has_section('DATA'):
        config.add_section('DATA')
        config.set('DATA', 'name', project_name)
        config.set('DATA', 'root_dir', os.path.normpath(data_folder)) 
        config.set('DATA', 'project_dir', os.path.normpath(project_folder))
        config.set('DATA', 'output_folder', os.path.normpath(output_folder))   
        save_config(cfg_file_name, config)
    
    #print "Indexing documents...."
    start_time = time.time()
    
    if not config.has_section('WHOOSH'):
        logging.info('================================= BEGIN WHOOSH INDEXING')
        
        if not os.path.exists(whoosh_folder): os.makedirs(whoosh_folder)
        path_index_file_name = os.path.join(project_folder, 
                                            project_name + '.path.index')
        
        index_plain_text_emails2(data_folder, path_index_file_name, 
                                 whoosh_folder, stem, 
                                 min_token_len, max_token_len, 
                                 procs, limitmb, multisegment) 
            
        config.add_section('WHOOSH')
        config.set('WHOOSH', 'whoosh_index_dir', 
                   os.path.normpath(whoosh_folder))
        config.set('WHOOSH', 'path_index_file', 
                   os.path.normpath(path_index_file_name))
        config.set('WHOOSH', 'stem', stem)
        save_config(cfg_file_name, config)
        logging.info('================================= END WHOOSH INDEXING')

    print '\nIndexing time:', (time.time() - start_time), 'seconds'
    
    #print "Corpus building...."
    start_time = time.time()

    if not config.has_section('CORPUS'):
        logging.info('=============================== BEGIN CORPUS BUILDING')
         
        if not os.path.exists(tm_folder): os.makedirs(tm_folder)
        path_index_file_name = os.path.join(tm_folder, 
                                            project_name + '.path.index') 
        
        build_lda_corpus2(whoosh_folder, path_index_file_name, dict_file, 
                         ldac_file, min_token_freq, min_token_len, 
                         max_token_len, stem)
         
        config.add_section('CORPUS')
        config.set('CORPUS', 'tm_folder', os.path.normpath(tm_folder))
        config.set('CORPUS', 'path_index_file', 
                   os.path.normpath(path_index_file_name))
        config.set('CORPUS', 'blei_corpus_file', os.path.normpath(ldac_file))
        config.set('CORPUS', 'dict_file', os.path.normpath(dict_file))
        config.set('CORPUS', 'vocab_file', 
                   os.path.normpath(ldac_file + '.vocab')) 
        config.set('CORPUS', 'min_token_freq', min_token_freq)
        config.set('CORPUS', 'min_token_len', min_token_len)
        config.set('CORPUS', 'max_token_len', 20)
        save_config(cfg_file_name, config) 
        logging.info('=============================== END CORPUS BUILDING')
     
    
    print '\nCorpus building time:', (time.time() - start_time), 'seconds'
     
    #print "Topic modeling...."
    start_time = time.time()
    
    if not config.has_section('LDA'):
        logging.info('=============================== BEGIN LDA ESTIMATION')
        lda_model_file = os.path.join(tm_folder, project_name + '.lda')
        lda_beta_file = os.path.join(tm_folder, project_name + '.lda.beta')
        lda_theta_file = os.path.join(tm_folder, project_name + '.lda.theta')
        lda_cos_index_file = os.path.join(tm_folder, 
                                          project_name + '.lda.cos.index')
        run_lda_estimation(dict_file, ldac_file, lda_model_file, lda_beta_file, 
                           lda_theta_file, lda_cos_index_file, num_topics, 
                           num_passes)
        config.add_section('LDA')
        config.set('LDA', 'lda_model_file', lda_model_file)
        config.set('LDA', 'lda_beta_file', lda_beta_file)
        config.set('LDA', 'lda_theta_file', lda_theta_file)
        config.set('LDA', 'lda_cos_index_file', lda_cos_index_file)
        config.set('LDA', 'num_topics', str(num_topics))
        config.set('LDA', 'num_passes', str(num_passes))    
        save_config(cfg_file_name, config)
        logging.info('=============================== END LDA ESTIMATION')    
 
    if not config.has_section('TFIDF'):
        logging.info('=============================== BEGIN TFIDF')
        tfidf_theta_file = os.path.join(tm_folder, project_name + '.tfidf.theta')
        run_tfidf(dict_file, ldac_file, tfidf_theta_file)
        config.add_section('TFIDF')
        config.set('TFIDF', 'tfidf_file', tfidf_theta_file)
        save_config(cfg_file_name, config)
        logging.info('=============================== END TFIDF')    


    print '\nTopic modeling time:', (time.time() - start_time), 'seconds'
def index_and_tm(data_folder, output_folder, project_name, num_topics, 
                num_passes, min_token_freq, min_token_len, max_token_len, 
                stem=False, procs=4, limitmb=512, multisegment=True):

    if not os.path.exists(data_folder):
        logging.error("Please provide a valid data folder!")
        sys.exit(1)
           
    # Checks whether the output folder exists 
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Checks whether the project folder exists 
    project_folder = os.path.join(output_folder, project_name)
    if not os.path.exists(project_folder):
        os.makedirs(project_folder)
        
    tm_folder = os.path.join(project_folder, TM_FOLDER_NAME)
    whoosh_folder = os.path.join(project_folder, WHOOSH_FOLDER_NAME)

    
    # Handling the project configuration file 
    
    logging.info('Indexing Configurations:')
    logging.info('Project: %s', project_name)
    
    #print "Indexing documents...."
    start_time = time.time()
    

    logging.info('================================= BEGIN WHOOSH INDEXING')
    
    if not os.path.exists(whoosh_folder): os.makedirs(whoosh_folder)
    path_index_file_name = os.path.join(project_folder, 
                                        project_name + '.path.index')
    
    index_plain_text_emails2(data_folder, path_index_file_name, 
                             whoosh_folder, stem, 
                             min_token_len, max_token_len, 
                             procs, limitmb, multisegment) 

    logging.info('================================= END WHOOSH INDEXING')

    print '\nIndexing time:', (time.time() - start_time), 'seconds'
    
    #print "Corpus building...."
    start_time = time.time()


    logging.info('=============================== BEGIN CORPUS BUILDING')
     
    if not os.path.exists(tm_folder): os.makedirs(tm_folder)
    dict_file = os.path.join(tm_folder, project_name + '.dict')
    ldac_file = os.path.join(tm_folder, project_name + '.ldac')
    path_index_file_name = os.path.join(tm_folder, project_name + '.path.index')     
    build_lda_corpus2(whoosh_folder, path_index_file_name, dict_file, 
                     ldac_file, min_token_freq, min_token_len, 
                     max_token_len, stem)
     
    logging.info('=============================== END CORPUS BUILDING')
     
    
    print '\nCorpus building time:', (time.time() - start_time), 'seconds'
     
    #print "Topic modeling...."
    start_time = time.time()
    
    for k in num_topics:
        logging.info('=============================== BEGIN LDA ESTIMATION')
        logging.info('LDA Number of topics: %d', k)
        logging.info('LDA Number of passes: %d', num_passes)
        lda_model_file = os.path.join(tm_folder, project_name + '-K%d-VB.lda' % k)
        lda_beta_file = os.path.join(tm_folder, project_name + '-K%d-VB.lda.beta' % k)
        lda_theta_file = os.path.join(tm_folder, project_name + '-K%d-VB.lda.theta' % k)
        lda_cos_index_file = os.path.join(tm_folder, project_name + '-K%d-VB.lda.cos.index' % k)
        run_lda_estimation(dict_file, ldac_file, lda_model_file, lda_beta_file, 
                           lda_theta_file, lda_cos_index_file, k, 
                           num_passes)
        logging.info('=============================== END LDA ESTIMATION')    
 
    logging.info('=============================== BEGIN TFIDF')
    tfidf_theta_file = os.path.join(tm_folder, project_name + '.tfidf.theta')
    run_tfidf(dict_file, ldac_file, tfidf_theta_file)
    logging.info('=============================== END TFIDF')    


    print '\nTopic modeling time:', (time.time() - start_time), 'seconds'