def load_lda_parameters(mdl_cfg):
    """Load the LDA artifacts referenced by a model configuration.

    Args:
        mdl_cfg: nested mapping with a 'CORPUS' section ('dict_file',
            'path_index_file') and an 'LDA' section ('lda_model_file',
            'lda_cos_index_file', 'lda_theta_file', 'lda_beta_file').

    Returns:
        A tuple ``(lda_dictionary, lda_mdl, lda_index, lda_file_path_index,
        lda_theta, lda_beta)``.  The dictionary and file-path index are
        ``None`` unless both corpus files pass the ``nexists`` check; the
        model and its index are ``None`` unless both LDA files pass it.
        ``lda_theta`` and ``lda_beta`` are always loaded with ``np.loadtxt``.

    Raises:
        KeyError: if a required configuration key is missing.
        Whatever ``np.loadtxt`` raises when the theta or beta file cannot be
        read, and ValueError if either matrix is not two-dimensional.
    """
    dictionary_file = mdl_cfg['CORPUS']['dict_file']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']
    lda_mdl_file = mdl_cfg['LDA']['lda_model_file']
    lda_cos_index_file = mdl_cfg['LDA']['lda_cos_index_file']

    # Bind the optional results up front: the guards below may skip loading,
    # and the return statement must not hit an unbound local in that case.
    lda_dictionary = None
    lda_file_path_index = None
    lda_mdl = None
    lda_index = None

    if nexists(dictionary_file) and nexists(path_index_file):
        lda_file_path_index = load_file_paths_index(path_index_file)
        lda_dictionary = load_dictionary(dictionary_file)

    if nexists(lda_mdl_file) and nexists(lda_cos_index_file):
        lda_mdl, lda_index = load_lda_variables(lda_mdl_file, lda_cos_index_file)

    # Loads the LDA theta matrix; its shape is read as (documents, topics).
    lda_theta_file = mdl_cfg['LDA']['lda_theta_file']
    lda_theta = np.loadtxt(lda_theta_file)
    num_docs, num_topics = lda_theta.shape
    min_lda_theta = lda_theta.min()
    # Single pre-formatted string: same text as the Python 2 print statement,
    # and valid syntax on Python 3 as well.
    print('LDA-theta is loaded: # of documents: %s # of topics: %s min(Theta): %s'
          % (num_docs, num_topics, min_lda_theta))

    # Loads the LDA beta matrix; its shape is read as (topics, vocabulary).
    lda_beta_file = mdl_cfg['LDA']['lda_beta_file']
    lda_beta = np.loadtxt(lda_beta_file)
    num_topics, vocab_size = lda_beta.shape
    min_lda_beta = lda_beta.min()
    print('LDA-beta is loaded: # of topics: %s # of terms in the vocabulary: %s min(Bheta): %s'
          % (num_topics, vocab_size, min_lda_beta))
    print('')

    return lda_dictionary, lda_mdl, lda_index, lda_file_path_index, lda_theta, lda_beta
def load_lsi_parameters(mdl_cfg):
    """Load the LSI artifacts referenced by a model configuration.

    Args:
        mdl_cfg: nested mapping with a 'CORPUS' section ('dict_file',
            'path_index_file') and an 'LSI' section ('lsi_model_file',
            'lsi_cos_index_file').

    Returns:
        A tuple ``(lsi_dictionary, lsi_mdl, lsi_index, lsi_file_path_index)``.
        The dictionary and file-path index are ``None`` unless both corpus
        files pass the ``nexists`` check; the model and its index are
        ``None`` unless both LSI files pass it.

    Raises:
        KeyError: if a required configuration key is missing.
    """
    dictionary_file = mdl_cfg['CORPUS']['dict_file']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']
    lsi_mdl_file = mdl_cfg['LSI']['lsi_model_file']
    lsi_cos_index_file = mdl_cfg['LSI']['lsi_cos_index_file']

    # Bind every result up front: the guards below may skip loading, and the
    # return statement must not hit an unbound local in that case.
    lsi_dictionary = None
    lsi_file_path_index = None
    lsi_mdl = None
    lsi_index = None

    if nexists(dictionary_file) and nexists(path_index_file):
        lsi_file_path_index = load_file_paths_index(path_index_file)
        lsi_dictionary = load_dictionary(dictionary_file)

    if nexists(lsi_mdl_file) and nexists(lsi_cos_index_file):
        lsi_mdl, lsi_index = load_lsi_variables(lsi_mdl_file, lsi_cos_index_file)

    return lsi_dictionary, lsi_mdl, lsi_index, lsi_file_path_index
# Example no. 3
# 0
def load_tm(mdl_cfg):
    """Load the LDA topic model and its corpus lookups from a configuration.

    Args:
        mdl_cfg: nested mapping with a 'CORPUS' section ('dict_file',
            'path_index_file') and an 'LDA' section ('lda_model_file',
            'lda_cos_index_file').

    Returns:
        A tuple ``(lda_dictionary, lda_mdl, lda_index, lda_file_path_index)``.
        The dictionary and file-path index are ``None`` unless both corpus
        files pass the ``nexists`` check; the model and its index are
        ``None`` unless both LDA files pass it.

    Raises:
        KeyError: if a required configuration key is missing.
    """
    dictionary_file = mdl_cfg['CORPUS']['dict_file']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']
    lda_mdl_file = mdl_cfg['LDA']['lda_model_file']
    lda_cos_index_file = mdl_cfg['LDA']['lda_cos_index_file']

    # Bind every result up front: the guards below may skip loading, and the
    # return statement must not hit an unbound local in that case.
    lda_dictionary = None
    lda_file_path_index = None
    lda_mdl = None
    lda_index = None

    if nexists(dictionary_file) and nexists(path_index_file):
        lda_file_path_index = load_file_paths_index(path_index_file)
        lda_dictionary = load_dictionary(dictionary_file)

    if nexists(lda_mdl_file) and nexists(lda_cos_index_file):
        lda_mdl, lda_index = load_lda_variables(lda_mdl_file, lda_cos_index_file)

    return lda_dictionary, lda_mdl, lda_index, lda_file_path_index
# Script: scores the topics of a trained LDA model with the Mimno coherence
# measure and orders the topics from highest to lowest score.
config_file = "E:\\E-Discovery\\edrmv2txt-a-b-index-t50-s\\edrmv2txt-a-b-index-t50-s.cfg"
M = 30 # number of terms used in coherence score 
topic_words_file = "top%d-topics-words.txt" % M
# topic_similarites_file = "topics-sim-M%d.txt" % M 


mdl_cfg = read_config(config_file)

# Loads the vocabulary: one token per line, mapped to its 0-based line number
vocab_file = mdl_cfg['CORPUS']['vocab_file']
vocab = dict()
with open(vocab_file) as fp:
    for vocab_id, token in enumerate(fp):
        vocab[token.strip()] = vocab_id 

# Loads the LDA model. The model is required by the coherence computation
# below, so stop here with a clear message instead of failing later with a
# NameError on an unassigned `lda_mdl`.
lda_mdl_file = mdl_cfg['LDA']['lda_model_file']        
if not nexists(lda_mdl_file):
    raise IOError('LDA model file is not available: %s' % lda_mdl_file)
lda_mdl = gensim.models.ldamodel.LdaModel.load(lda_mdl_file)


# Loads the corpus 
ldac_file = mdl_cfg['CORPUS']['blei_corpus_file']
lda_corpus = gensim.corpora.BleiCorpus(ldac_file)

# Parenthesised single string: identical output on Python 2, valid on Python 3
print('Computing Mimno score...')

coherence_scores = calc_Mimno_topic_coherence(lda_corpus, lda_mdl, vocab, M)
sort_index = np.argsort(coherence_scores)[::-1] # desc order of coherence scores 

# print 'Computing topic entropy scores'
# topic_entropies = calc_topic_entropy(lda_mdl)