def search_tm_topics(topics_list, limit, mdl_cfg):
    '''
    Performs search on the topic model using relevant topic indices

    Every document gets a log score: the sum of log(theta) over the
    selected topics plus the sum of log(1 - theta) over all remaining
    topics. Documents are ranked by that score, highest first, and the
    top `limit` entries are passed to get_indexed_file_details.

    Arguments:
        topics_list - indices of the selected topics (columns of the
                      LDA theta matrix)
        limit - maximum number of documents to rank
        mdl_cfg - configuration mapping; the keys read here are
                  ['LDA']['lda_theta_file'],
                  ['LUCENE']['lucene_index_dir'] and
                  ['CORPUS']['path_index_file']

    Returns:
        A list of [row[0], score] pairs, one per row returned by
        get_indexed_file_details, where score is float(row[10]).
        NOTE(review): row[0] is presumably the document id -- confirm
        against get_indexed_file_details.
    '''

    EPS = 1e-24  # keeps log() finite when a probability is exactly 0 or 1

    lda_theta_file = mdl_cfg['LDA']['lda_theta_file']
    index_dir = mdl_cfg['LUCENE']['lucene_index_dir']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']

    # loads the file paths
    lda_file_path_index = load_file_paths_index(path_index_file)

    # loads the LDA theta from the model theta file; ndmin=2 keeps the
    # matrix two-dimensional even when the file holds a single document
    lda_theta = np.loadtxt(lda_theta_file, dtype=np.longdouble, ndmin=2)
    num_docs, num_topics = lda_theta.shape

    # the doubled spaces reproduce the output of the original
    # comma-separated print statement
    print('LDA-theta is loaded: number of documents:  %s  number of topics:  %s'
          % (num_docs, num_topics))

    # set membership avoids rescanning topics_list for every topic index
    selected = set(topics_list)
    unsel_topic_idx = [idx for idx in range(num_topics) if idx not in selected]

    sel = np.log(lda_theta[:, topics_list] + EPS)
    unsel = np.log(1.0 - lda_theta[:, unsel_topic_idx] + EPS)
    ln_score = sel.sum(axis=1) + unsel.sum(axis=1)
    sorted_idx = ln_score.argsort(axis=0)[::-1]

    # Normalize the topic index search score
    # TODO: this is an adhoc method right now. May come back later...
    min_ln_score = ln_score.min()
    if min_ln_score == 0:
        # Dividing by a zero minimum would turn every score into NaN;
        # use the value this formula gives a document whose log score is 0.
        n_ln_score = np.ones_like(ln_score)
    else:
        n_ln_score = 1.0 - ln_score / min_ln_score

    ts_results = []
    for rank in range(min(limit, num_docs)):
        doc_idx = sorted_idx[rank]
        entry = lda_file_path_index[doc_idx]
        ts_results.append([entry[0],               # document id
                           entry[1],               # document directory path
                           entry[2],               # document name
                           n_ln_score[doc_idx]])   # similarity score

    # grabs the files details from the index
    ts_results = get_indexed_file_details(ts_results, index_dir)

    # Note: we need a float conversion because it's retrieving as string
    return [[row[0], float(row[10])] for row in ts_results]
def search_tm(query_text, limit, mdl_cfg):
    '''
    Runs a text query against the LDA topic model

    Loads the model artefacts with load_tm, searches them with
    search_lda_model, and passes the hits to get_indexed_file_details.

    Arguments:
        query_text - the query text handed to search_lda_model
        limit - result limit handed to search_lda_model
        mdl_cfg - configuration mapping; passed to load_tm, and
                  ['LUCENE']['lucene_index_dir'] is read here

    Returns:
        A list of [row[0], score] pairs where score is
        (float(row[10]) + 1.0) / 2.0, or None (after printing a
        message) when no rows come back.
    '''

    dictionary, model, sim_index, path_index = load_tm(mdl_cfg)

    # hits are in this format: [doc_id, doc_dir_path, doc_name, score]
    hits = search_lda_model(query_text, dictionary, model, sim_index,
                            path_index, limit)

    # grabs the files details from the index
    hits = get_indexed_file_details(hits, mdl_cfg['LUCENE']['lucene_index_dir'])

    if len(hits) > 0:
        # Normalize the similarity scores
        # (presumably maps a [-1, 1] similarity onto [0, 1] -- TODO confirm)
        scored = []
        for row in hits:
            scored.append([row[0], (float(row[10]) + 1.0) / 2.0])
        return scored

    print('No documents found.')
    return None
def search_lsi(query_text, limit, mdl_cfg):
    '''
    Runs a text query against the LSI model

    Loads the model artefacts with load_lsi, searches them with
    search_lsi_model, and passes the hits to get_indexed_file_details.

    Arguments:
        query_text - the query text handed to search_lsi_model
        limit - result limit handed to search_lsi_model
        mdl_cfg - configuration mapping; passed to load_lsi, and
                  ['LUCENE']['lucene_index_dir'] is read here

    Returns:
        A list of [row[0], score] pairs where score is
        (float(row[10]) + 1.0) / 2.0, or None (after printing a
        message) when no rows come back.
    '''

    dictionary, model, sim_index, path_index = load_lsi(mdl_cfg)

    # hits are in this format: [doc_id, doc_dir_path, doc_name, score]
    hits = search_lsi_model(query_text, dictionary, model, sim_index,
                            path_index, limit)

    # grabs the files details from the index
    hits = get_indexed_file_details(hits, mdl_cfg['LUCENE']['lucene_index_dir'])

    if len(hits) > 0:
        # NOTE (Sahil): "Considering documents that satisfy a certain
        # condition" -- author's note kept from the original; no filtering
        # is applied here, every returned row is scored.
        scored = []
        for row in hits:
            scored.append([row[0], (float(row[10]) + 1.0) / 2.0])
        return scored

    print('No documents found.')
    return None
def search_tm_sel_topics_cos(topics_list, topics_prob, limit, mdl_cfg):
    '''
    Performs search on the topic model by cosine similarity over the
    selected topics

    Each document's theta values for the topics in `topics_list` are
    compared with `topics_prob`. Documents are ranked by cosine
    similarity, most similar first, and the top `limit` entries are
    passed to get_indexed_file_details.

    The score is the cosine similarity itself (higher is better), not
    scipy's cosine distance (1 - similarity); sorting that distance in
    descending order would rank the least similar documents first.
    Vectors with zero norm get a similarity of 0 rather than NaN.

    Arguments:
        topics_list - indices of the selected topics (columns of the
                      LDA theta matrix)
        topics_prob - target probabilities, one per entry of topics_list
        limit - maximum number of documents to rank
        mdl_cfg - configuration mapping; the keys read here are
                  ['LDA']['lda_theta_file'],
                  ['LUCENE']['lucene_index_dir'] and
                  ['CORPUS']['path_index_file']

    Returns:
        A list of [row[0], score] pairs, one per row returned by
        get_indexed_file_details, where score is float(row[10]).
        NOTE(review): row[0] is presumably the document id -- confirm
        against get_indexed_file_details.
    '''

    lda_theta_file = mdl_cfg['LDA']['lda_theta_file']
    index_dir = mdl_cfg['LUCENE']['lucene_index_dir']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']

    lda_file_path_index = load_file_paths_index(path_index_file)

    # ndmin=2 keeps the matrix two-dimensional even when the file holds
    # a single document
    lda_theta = np.loadtxt(lda_theta_file, dtype=np.longdouble, ndmin=2)
    num_docs, num_topics = lda_theta.shape

    # the doubled spaces reproduce the output of the original
    # comma-separated print statement
    print('Number of documents:  %s  number of topics:  %s'
          % (num_docs, num_topics))

    query = np.asarray(topics_prob, dtype=np.longdouble)
    sel = lda_theta[:, topics_list]

    # product of the two vector norms for every document row
    norms = np.sqrt((sel * sel).sum(axis=1)) * np.sqrt((query * query).sum())

    # rows with a zero norm keep a similarity of 0 instead of dividing by 0
    cos_scores = np.zeros(num_docs)
    nonzero = norms > 0
    cos_scores[nonzero] = sel[nonzero].dot(query) / norms[nonzero]

    # most similar documents first
    sorted_idx = cos_scores.argsort(axis=0)[::-1]

    ts_results = []
    for rank in range(min(limit, num_docs)):
        doc_idx = sorted_idx[rank]
        entry = lda_file_path_index[doc_idx]
        ts_results.append([entry[0], entry[1], entry[2], cos_scores[doc_idx]])
        print('%s %s' % (entry, cos_scores[doc_idx]))

    # grabs the files details from the index
    ts_results = get_indexed_file_details(ts_results, index_dir)

    # float conversion because the score is retrieved as a string
    # (matches the other search_* functions)
    return [[row[0], float(row[10])] for row in ts_results]