def load_lda_parameters(mdl_cfg):
    """Load the LDA artifacts referenced by the model configuration.

    Reads the file paths stored under ``mdl_cfg['CORPUS']`` (``dict_file``,
    ``path_index_file``) and ``mdl_cfg['LDA']`` (``lda_model_file``,
    ``lda_cos_index_file``, ``lda_theta_file``, ``lda_beta_file``), loads
    each artifact and prints a one-line summary of the theta and beta
    matrices.

    Args:
        mdl_cfg: nested mapping indexed as ``mdl_cfg[section][key]``.

    Returns:
        The tuple ``(lda_dictionary, lda_mdl, lda_index,
        lda_file_path_index, lda_theta, lda_beta)``.

    Raises:
        IOError: if ``nexists`` reports the dictionary, path index, model
            or cosine index file as missing. Previously this surfaced as
            an UnboundLocalError at the ``return`` statement.
    """
    corpus_cfg = mdl_cfg['CORPUS']
    lda_cfg = mdl_cfg['LDA']

    dictionary_file = corpus_cfg['dict_file']
    path_index_file = corpus_cfg['path_index_file']
    lda_mdl_file = lda_cfg['lda_model_file']
    lda_cos_index_file = lda_cfg['lda_cos_index_file']

    # NOTE(review): nexists is assumed to be a path-existence check -- confirm.
    if not (nexists(dictionary_file) and nexists(path_index_file)):
        raise IOError('Cannot load the LDA parameters; missing dictionary '
                      'file (%s) or path index file (%s).'
                      % (dictionary_file, path_index_file))
    if not (nexists(lda_mdl_file) and nexists(lda_cos_index_file)):
        raise IOError('Cannot load the LDA parameters; missing LDA model '
                      'file (%s) or LDA cosine index file (%s).'
                      % (lda_mdl_file, lda_cos_index_file))

    lda_file_path_index = load_file_paths_index(path_index_file)
    lda_dictionary = load_dictionary(dictionary_file)
    lda_mdl, lda_index = load_lda_variables(lda_mdl_file, lda_cos_index_file)

    # The single-argument print(...) form produces the same output under the
    # Python 2 print statement and the Python 3 print function.

    # Loads the LDA theta matrix from the model theta file; its shape is
    # unpacked as (documents, topics).
    lda_theta = np.loadtxt(lda_cfg['lda_theta_file'])
    num_docs, num_topics = lda_theta.shape
    # np.min without an axis already reduces over the whole array.
    min_lda_theta = np.min(lda_theta)
    print('LDA-theta is loaded: # of documents: %s # of topics: %s '
          'min(Theta): %s' % (num_docs, num_topics, min_lda_theta))

    # Loads the LDA beta matrix from the model beta file; its shape is
    # unpacked as (topics, vocabulary terms).
    lda_beta = np.loadtxt(lda_cfg['lda_beta_file'])
    num_topics, vocab_size = lda_beta.shape
    min_lda_beta = np.min(lda_beta)
    print('LDA-beta is loaded: # of topics: %s # of terms in the '
          'vocabulary: %s min(Bheta): %s'
          % (num_topics, vocab_size, min_lda_beta))
    print('')

    return (lda_dictionary, lda_mdl, lda_index, lda_file_path_index,
            lda_theta, lda_beta)
def load_lsi_parameters(mdl_cfg):
    """Load the LSI artifacts referenced by the model configuration.

    Reads the file paths stored under ``mdl_cfg['CORPUS']`` (``dict_file``,
    ``path_index_file``) and ``mdl_cfg['LSI']`` (``lsi_model_file``,
    ``lsi_cos_index_file``) and loads the corresponding artifacts.

    Args:
        mdl_cfg: nested mapping indexed as ``mdl_cfg[section][key]``.

    Returns:
        The tuple ``(lsi_dictionary, lsi_mdl, lsi_index,
        lsi_file_path_index)``.

    Raises:
        IOError: if ``nexists`` reports any of the four files as missing.
            Previously this surfaced as an UnboundLocalError at the
            ``return`` statement.
    """
    corpus_cfg = mdl_cfg['CORPUS']
    lsi_cfg = mdl_cfg['LSI']

    dictionary_file = corpus_cfg['dict_file']
    path_index_file = corpus_cfg['path_index_file']
    lsi_mdl_file = lsi_cfg['lsi_model_file']
    lsi_cos_index_file = lsi_cfg['lsi_cos_index_file']

    # NOTE(review): nexists is assumed to be a path-existence check -- confirm.
    if not (nexists(dictionary_file) and nexists(path_index_file)):
        raise IOError('Cannot load the LSI parameters; missing dictionary '
                      'file (%s) or path index file (%s).'
                      % (dictionary_file, path_index_file))
    if not (nexists(lsi_mdl_file) and nexists(lsi_cos_index_file)):
        raise IOError('Cannot load the LSI parameters; missing LSI model '
                      'file (%s) or LSI cosine index file (%s).'
                      % (lsi_mdl_file, lsi_cos_index_file))

    # Same load order as before: path index, dictionary, then the model.
    lsi_file_path_index = load_file_paths_index(path_index_file)
    lsi_dictionary = load_dictionary(dictionary_file)
    lsi_mdl, lsi_index = load_lsi_variables(lsi_mdl_file, lsi_cos_index_file)

    return lsi_dictionary, lsi_mdl, lsi_index, lsi_file_path_index
def load_tm(mdl_cfg):
    """Load the LDA topic model, its index and the corpus lookups.

    Reads the file paths stored under ``mdl_cfg['CORPUS']`` (``dict_file``,
    ``path_index_file``) and ``mdl_cfg['LDA']`` (``lda_model_file``,
    ``lda_cos_index_file``) and loads the corresponding artifacts. Unlike
    ``load_lda_parameters`` it does not read the theta/beta matrices.

    Args:
        mdl_cfg: nested mapping indexed as ``mdl_cfg[section][key]``.

    Returns:
        The tuple ``(lda_dictionary, lda_mdl, lda_index,
        lda_file_path_index)``.

    Raises:
        IOError: if ``nexists`` reports any of the four files as missing.
            Previously this surfaced as an UnboundLocalError at the
            ``return`` statement.
    """
    corpus_cfg = mdl_cfg['CORPUS']
    lda_cfg = mdl_cfg['LDA']

    dictionary_file = corpus_cfg['dict_file']
    path_index_file = corpus_cfg['path_index_file']
    lda_mdl_file = lda_cfg['lda_model_file']
    lda_cos_index_file = lda_cfg['lda_cos_index_file']

    # NOTE(review): nexists is assumed to be a path-existence check -- confirm.
    if not (nexists(dictionary_file) and nexists(path_index_file)):
        raise IOError('Cannot load the topic model; missing dictionary '
                      'file (%s) or path index file (%s).'
                      % (dictionary_file, path_index_file))
    if not (nexists(lda_mdl_file) and nexists(lda_cos_index_file)):
        raise IOError('Cannot load the topic model; missing LDA model '
                      'file (%s) or LDA cosine index file (%s).'
                      % (lda_mdl_file, lda_cos_index_file))

    # Same load order as before: path index, dictionary, then the model.
    lda_file_path_index = load_file_paths_index(path_index_file)
    lda_dictionary = load_dictionary(dictionary_file)
    lda_mdl, lda_index = load_lda_variables(lda_mdl_file, lda_cos_index_file)

    return lda_dictionary, lda_mdl, lda_index, lda_file_path_index
def eval_ranking_methods(file_prefix, config_file, test_directory, tm_query,
                         limit=1000, img_extension='.eps'):
    """Run six ranking methods on one query and plot their ROC curves.

    The methods are Lucene, keyword-LDA, keyword-LDA fused with Lucene,
    topic-LDA, topic-LDA fused with Lucene, and keyword-LSI. Each result is
    converted with ``convert_to_roc_format`` against the ``"1"``
    sub-directory of ``test_directory`` and all curves are drawn into a
    single image by ``plot_multiple_roc``.

    Args:
        file_prefix: prefix of the output image file name
            (``'<file_prefix>-ROCs' + img_extension``).
        config_file: path handed to ``read_config``.
        test_directory: directory whose ``"1"`` sub-directory holds the
            true positive documents.
        tm_query: whitespace-separated query keywords; also wrapped as
            ``all:(<tm_query>)`` for the Lucene search.
        limit: value forwarded as the limit argument of every search call.
        img_extension: extension appended to the ROC image file name.

    Raises:
        SystemExit: if none of the query tokens occurs in the corpus
            dictionary.
    """
    # The single-argument print(...) form produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    lucene_query = 'all:(%s)' % tm_query  # search in all fields
    print('Lucene query: %s' % lucene_query)
    print('TM query: %s' % tm_query)

    positive_dir = os.path.join(test_directory, "1")  # TRUE positive documents
    TOP_K_TOPICS = 5  # the number of topics used for Topic-LDA

    rocs_file_name = '%s-ROCs' % file_prefix + img_extension
    rocs_img_title = ''
    roc_labels = ['Lucene ranking',
                  'Keyword-LDA ranking',
                  'Keyword-LDA * Lucene ranking',
                  'Topic-LDA ranking',
                  'Topic-LDA * Lucene Ranking',
                  'Keyword-LSI ranking']
    line_styles = ['ro-', 'kx-', 'b+-', 'c^-', 'yv-.', 'gd-']

    # ------------------------------------------ Reads the configuration file
    mdl_cfg = read_config(config_file)

    # -------- Checks whether the keywords are there in the corpus dictionary
    dictionary = load_dictionary(mdl_cfg['CORPUS']['dict_file'])
    # Build the vocabulary once; evaluating dictionary.values() per token
    # repeated a linear scan for every query word.
    vocabulary = set(dictionary.values())
    valid_tokens = 0
    # split() without arguments already yields whitespace-free tokens.
    for token in tm_query.split():
        if token in vocabulary:
            valid_tokens += 1
        else:
            print('%s is not in the corpus vocabulary. Hence, this word '
                  'will be ignored from the topic search.' % token)

    if valid_tokens == 0:
        print('None of the tokens exist in the dictionary. '
              'Exiting topic search!')
        # exit() is a helper injected by the site module and may be absent;
        # raising SystemExit has the same effect and is always available.
        raise SystemExit()

    # ----------------------------------------------------------- Lucene search
    print('Lucene ranking')
    lu_docs = search_li(lucene_query, limit, mdl_cfg)
    lu_docs_dict, lu_docs_list = lu_append_nonresp(lu_docs, test_directory)
    lu_res = convert_to_roc_format(lu_docs_list, positive_dir)
    print('')

    # -------------------------------------------------------------- LDA search
    # Loads the LDA model (lda_beta is returned but not needed here).
    (lda_dictionary, lda_mdl, lda_index, lda_file_path_index,
     lda_theta, lda_beta) = load_lda_parameters(mdl_cfg)

    # Gets the dominant topics from the LDA model and keeps their indices.
    dominant_topics = get_dominant_query_topics(tm_query, lda_dictionary,
                                                lda_mdl, TOP_K_TOPICS)
    dominant_topics_idx = [idx for (idx, _) in dominant_topics]

    print('LDA (w/ keywords) ranking')
    lda_docs = search_tm(tm_query, limit, lda_dictionary, lda_mdl, lda_index,
                         lda_file_path_index)
    lda_res = convert_to_roc_format(lda_docs, positive_dir)

    print('LDA (w/ keywords) * Lucene ranking')
    lu_tm_docs = fuse_lucene_tm_scores(lu_docs_dict, lda_docs)
    lda_lu_res = convert_to_roc_format(lu_tm_docs, positive_dir)

    print('LDA (w/ query topics) ranking')
    lda_tts_docs = search_tm_topics(dominant_topics_idx, limit,
                                    lda_file_path_index, lda_theta)
    lda_tts_res = convert_to_roc_format(lda_tts_docs, positive_dir)

    print('LDA (w/ query topics) * Lucene Ranking')
    final_docs_tts = fuse_lucene_tm_scores(lu_docs_dict, lda_tts_docs)
    lda_tts_lu_res = convert_to_roc_format(final_docs_tts, positive_dir)

    # -------------------------------------------------------------- LSI search
    print('LSI (w/ keywords) ranking')
    lsi_docs = search_lsi(tm_query, limit, mdl_cfg)
    lsi_res = convert_to_roc_format(lsi_docs, positive_dir)

    # --------------------------------------------------------- Plot ROC curves
    # results_list, roc_labels and line_styles are parallel, in this order.
    results_list = [lu_res, lda_res, lda_lu_res, lda_tts_res, lda_tts_lu_res,
                    lsi_res]
    roc_data_list = [ROCData(result, linestyle=style)
                     for result, style in zip(results_list, line_styles)]
    plot_multiple_roc(roc_data_list, title=rocs_img_title, labels=roc_labels,
                      include_baseline=True, file_name=rocs_file_name)