def save_local_histograms(retrieval_alg, outv, num_bins, max_query_len, conf, histograms_mode):
    preranked_total_filename = "preranked/preranked_total_" + retrieval_alg
    preranked_total = load_from_pickle_file(preranked_total_filename)
    histograms = {}
    matching_histograms = MatchingHistograms(num_bins, max_query_len)
    for (query_id, document_id) in tqdm(preranked_total.ground_truth.keys()):
        query = queries.get(query_id)
        document = corpus.get(document_id)
        if query is not None and document is not None:
            oov_document = oov_corpus.get(document_id)
            oov_query = oov_queries.get(query_id)
            hist = matching_histograms.get_histograms(query, document, model, outv,
                                                      oov_query, oov_document, histograms_mode)
            histograms[(query_id, document_id)] = hist
    save_to_pickle_file(
        "preprocessing/encoded_data/histograms/histograms_total" + conf + "_" + histograms_mode,
        histograms)
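# For context: get_histograms follows the DRMM matching-histogram scheme
# (Guo et al., CIKM 2016), which buckets the cosine similarities between each
# query term and all document terms into fixed bins over [-1, 1] and comes in
# three variants: raw counts (CH), normalized counts (NH), and log counts (LCH).
# Below is a minimal sketch of that scheme; the helper name and signature are
# hypothetical, and the repo's MatchingHistograms additionally handles OOV
# terms and query padding, which this omits.
import numpy as np

def matching_histogram_sketch(query_vecs, doc_vecs, num_bins=30, mode="LCH"):
    hist = np.zeros((len(query_vecs), num_bins))
    for qi, qv in enumerate(query_vecs):
        for dv in doc_vecs:
            cos = np.dot(qv, dv) / (np.linalg.norm(qv) * np.linalg.norm(dv))
            # map cos in [-1, 1] to a bin index in [0, num_bins - 1];
            # cos == 1 (exact embedding match) lands in the last bin
            b = min(int((cos + 1) / 2 * num_bins), num_bins - 1)
            hist[qi, b] += 1
        if mode == "NH":                     # normalized histogram
            total = hist[qi].sum()
            if total > 0:
                hist[qi] /= total
        elif mode == "LCH":                  # log-count histogram
            hist[qi] = np.log(hist[qi] + 1)  # +1 keeps empty bins finite
    return hist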
from drmm import DRMM
from utilities.utilities import load_from_pickle_file
import tensorflow as tf
import json

with open('config.json') as config_file:
    data = json.load(config_file)

SEED = data["seed"]
stopwords = data["stopwords"]
stemmed = data["stemmed"]
histograms_mode = data["hist_mode"]
num_layers = 3
units = [30, 5, 1]
activation_functions = ["tanh"] * num_layers
num_bins = 30
batch_size = data["batch_size"]
emb_size = 300
learning_rate = 1e-2
gating_function = data["gating_function"]
conf = data["conf"]

padded_query_idfs_filename = "preprocessing/encoded_data/idfs/padded_query_idfs" + conf
padded_query_idfs = load_from_pickle_file(padded_query_idfs_filename)
max_query_len = len(list(padded_query_idfs.values())[0])

model = DRMM(num_layers, units, activation_functions, max_query_len, num_bins,
             emb_size, gating_function, SEED, learning_rate)

with tf.Session() as sess:
    writer = tf.summary.FileWriter('./graphs', sess.graph)
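# The scripts in this repo read their settings from config.json. For reference,
# the snippet below collects every key they consume; the values are illustrative
# assumptions, not the repo's actual configuration.
import json

config = {
    "seed": 42,               # RNG seed
    "stopwords": True,        # remove INQUERY stopwords
    "stemmed": True,          # apply Krovetz stemming
    "hist_mode": "LCH",       # CH / NH / LCH
    "batch_size": 20,
    "gating_function": "idf",
    "conf": "_sw_st",         # suffix appended to preprocessed file names
    "retrieval_alg": "QL",    # or "Bm25"
    "use_glove": False,
}
with open("config.json", "w") as f:
    json.dump(config, f, indent=2)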
def text_embeddings(data, file_name, q):
    # algo, size, window, mode and the default min_count are set at module level
    negative = 10  # number of negative samples
    sample = 1e-4  # sub-sampling threshold for frequent words
    if q:
        min_count = 1  # in a query all terms should be considered
    # memory required (approx.): #vocabulary * #size * 4 bytes (float32)
    embeddings(data, algo, size, window, min_count, mode, negative, sample, file_name + ".bin")


# corpus_filename = "preprocessing/pre_data/Corpus/Corpus" + conf
# queries_filename = "preprocessing/pre_data/Queries/Queries" + conf
corpus_model_filename = "preprocessing/pre_data/models/corpus_model" + conf
queries_model_filename = "preprocessing/pre_data/models/queries_model" + conf
corpus_sent_filename = "preprocessing/pre_data/Corpus/sents_corpus" + conf
queries_sent_filename = "preprocessing/pre_data/Queries/sents_queries" + conf

corpus_sent = load_from_pickle_file(corpus_sent_filename)
queries_sent = load_from_pickle_file(queries_sent_filename)

# corpus_obj = load_from_pickle_file(corpus_filename)
text_embeddings(corpus_sent, corpus_model_filename, False)
# queries_obj = load_from_pickle_file(queries_filename)
# lines_queries = [query.get_text().split() for query in queries_obj.values()]
text_embeddings(queries_sent, queries_model_filename, True)
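# The embeddings() helper is defined elsewhere in the repo. A minimal sketch of
# what it plausibly wraps, using the gensim 3.x Word2Vec API; the parameter
# mapping (in particular algo -> sg, and the unused mode argument) is an
# assumption, not the repo's actual implementation.
from gensim.models import Word2Vec

def embeddings(sents, algo, size, window, min_count, mode, negative, sample, out_file):
    # algo: 1 = skip-gram, 0 = CBOW (gensim's sg flag)
    model = Word2Vec(sentences=sents, sg=algo, size=size, window=window,
                     min_count=min_count, negative=negative, sample=sample)
    # save in binary word2vec format, matching the ".bin" suffix above
    model.wv.save_word2vec_format(out_file, binary=True)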
from krovetzstemmer import Stemmer
from utilities.utilities import load_from_pickle_file
import json

with open('config.json') as config_file:
    data = json.load(config_file)

stopwords = data["stopwords"]
stemmed = data["stemmed"]
conf = data["conf"]

corpus_filename = "preprocessing/pre_data/Corpus/Corpus"
queries_filename = "preprocessing/pre_data/Queries/Queries"
corpus_sent_filename = "preprocessing/pre_data/Corpus/sents_corpus"
queries_sent_filename = "preprocessing/pre_data/Queries/sents_queries"

corpus_obj = load_from_pickle_file(corpus_filename)
queries_obj = load_from_pickle_file(queries_filename)
corpus_sent = load_from_pickle_file(corpus_sent_filename)
queries_sent = load_from_pickle_file(queries_sent_filename)

if stopwords:
    print("Removing stopwords...")
    stopwords_list = []
    with open("inquery", 'r') as f:  # INQUERY stopword list, one word per line
        for line in f:
            stopwords_list.append(line.strip('\n'))
    stopwords_list = set(stopwords_list)
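# The actual filtering happens further down the file; a minimal sketch of the
# per-token step it presumably performs. The preprocess() helper is hypothetical.
stemmer = Stemmer()  # Krovetz stemmer

def preprocess(tokens):
    # drop INQUERY stopwords, then Krovetz-stem the surviving tokens
    kept = [t for t in tokens if t not in stopwords_list]
    return [stemmer.stem(t) for t in kept] if stemmed else kept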
import json
import random

from utilities.utilities import load_from_pickle_file
# MatchingHistograms is imported from its module elsewhere in the repo

with open('config.json') as config_file:
    data = json.load(config_file)

random.seed(data["seed"])
stopwords = data["stopwords"]
stemmed = data["stemmed"]
histograms_mode = data["hist_mode"]
conf = data["conf"]
retrieval_alg = data["retrieval_alg"]

queries_filename = "preprocessing/encoded_data/Queries/Queries_encoded" + conf
corpus_filename = "preprocessing/encoded_data/Corpus/Corpus_encoded" + conf
corpus_model_filename = "preprocessing/encoded_data/embeddings/word_embeddings" + conf
oov_queries_filename = "preprocessing/encoded_data/Queries/Queries_encoded_oov" + conf
oov_corpus_filename = "preprocessing/encoded_data/Corpus/Corpus_encoded_oov" + conf
qrels_filename = "preranked/preranked_total_" + retrieval_alg

queries = load_from_pickle_file(queries_filename)
corpus = load_from_pickle_file(corpus_filename)
corpus_model = load_from_pickle_file(corpus_model_filename)
oov_corpus = load_from_pickle_file(oov_corpus_filename)
oov_queries = load_from_pickle_file(oov_queries_filename)
qrels = load_from_pickle_file(qrels_filename)

topic = "301"
max_query_len = len(queries.get(topic))
num_bins = 30
matching_histograms = MatchingHistograms(num_bins, max_query_len)

# qrels keys are (topic, docno) pairs, hence the [1] below
positive_doc = random.choice(list(qrels.get_relevant_docs(topic).keys()))
negative_doc = random.choice(list(qrels.get_non_relevant_docs(topic).keys()))
query = queries.get(topic)
pos_document = corpus.get(positive_doc[1])
pos_oov_document = oov_corpus.get(positive_doc[1])
with open('config.json') as config_file:
    data = json.load(config_file)

stopwords = data["stopwords"]
stemmed = data["stemmed"]
histograms_mode = data["hist_mode"]
glv = data["use_glove"]
conf = data["conf"]
retrieval_alg = data["retrieval_alg"]
num_bins = 30

queries_filename = "preprocessing/encoded_data/Queries/Queries_encoded" + conf
corpus_filename = "preprocessing/encoded_data/Corpus/Corpus_encoded" + conf
corpus_model_filename = "preprocessing/encoded_data/embeddings/word_embeddings" + conf
corpus_model_out_filename = "preprocessing/encoded_data/embeddings/word_embeddings_out" + conf
oov_queries_filename = "preprocessing/encoded_data/Queries/Queries_encoded_oov" + conf
oov_corpus_filename = "preprocessing/encoded_data/Corpus/Corpus_encoded_oov" + conf

queries = load_from_pickle_file(queries_filename)
corpus = load_from_pickle_file(corpus_filename)
model = load_from_pickle_file(corpus_model_filename)
model_out = load_from_pickle_file(corpus_model_out_filename)
oov_corpus = load_from_pickle_file(oov_corpus_filename)
oov_queries = load_from_pickle_file(oov_queries_filename)

max_query_len = max(len(q) for q in queries.values())

save_local_histograms(retrieval_alg, model_out, num_bins, max_query_len,
                      conf + "_glove_" + str(glv), histograms_mode)
def prepare_test_ids(qrels, topics_test):
    ids_test = {}
    for topic in topics_test:
        pairs = qrels.get_pairs_topic(str(topic))
        ids_test.update(pairs)
    print("len test labels", len(ids_test.keys()))
    return list(ids_test.keys())


if __name__ == "__main__":
    qrels_filename = "preranked/preranked_total_" + retrieval_alg
    qrels = load_from_pickle_file(qrels_filename)

    # TREC Robust04 topics: 301-450 and 601-700
    topics = list(range(301, 451)) + list(range(601, 701))
    random.shuffle(topics)

    cleared_ids_train = []
    cleared_ids_test = []
    for i in range(k):
        topic_test = topics[(i * 50):((i + 1) * 50)]
        topic_train = [topic for topic in topics if topic not in topic_test]
        ids_train = prepare_train_ids(qrels, topic_train, n_pos, n_neg)
        ids_test = prepare_test_ids(qrels, topic_test)
        cleared_ids_train.append(ids_train)
        cleared_ids_test.append(ids_test)
from utilities.utilities import Qrels, load_from_pickle_file
from tqdm import tqdm
import json

# Qrels must be importable so the pickled object can be restored
qrels_obj = load_from_pickle_file('preprocessing/pre_data/Qrels/Qrels_cleaned')
corpus_obj = load_from_pickle_file('preprocessing/pre_data/Corpus/Corpus')
queries_obj = load_from_pickle_file('preprocessing/pre_data/Queries/Queries')
num_topics = len(queries_obj)

with open('config.json') as config_file:
    data = json.load(config_file)
retrieval_alg = data["retrieval_alg"]

if retrieval_alg == "QL":
    preranked_filename = "comparison/terrier_preranked/DirichletLM_6.res"
elif retrieval_alg == "Bm25":
    preranked_filename = "comparison/terrier_preranked/BM25.res"

# create run objects from the Terrier batch-search output
with open(preranked_filename, 'r') as results:
    runsList = (line.split() for line in results)
    runs = {}
    sum = 0
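# Each line of a Terrier .res file follows the standard six-field TREC run
# format, so line.split() yields (topic, "Q0", docno, rank, score, tag).
# An illustrative line (values made up, not taken from the actual runs):
sample_line = "301 Q0 FBIS3-10082 0 -5.8832 BM25"
topic, _, docno, rank, score, tag = sample_line.split()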
from utilities.utilities import load_from_pickle_file, save_to_pickle_file
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import chain
from tqdm import tqdm
import json

with open('config.json') as config_file:
    data = json.load(config_file)

stopwords = data["stopwords"]
stemmed = data["stemmed"]
conf = data["conf"]

corpus_filename = "preprocessing/pre_data/Corpus/Corpus" + conf
queries_filename = "preprocessing/pre_data/Queries/Queries" + conf
corpus_obj = load_from_pickle_file(corpus_filename)
queries_obj = load_from_pickle_file(queries_filename)

tfidf = TfidfVectorizer()
tfidf.fit(
    chain((doc.get_text() for doc in tqdm(corpus_obj.docs.values())),
          (query.title for query in queries_obj.values())))
idfs = dict(zip(tfidf.get_feature_names(), tfidf.idf_))
# assert len(words) == len(idfs.keys())

idf_filename = "preprocessing/pre_data/idfs/idfs" + conf
save_to_pickle_file(idf_filename, idfs)
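# With the default smooth_idf=True, scikit-learn stores
# idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of fitted
# documents and df(t) the term's document frequency. A quick check with
# illustrative counts (not taken from this corpus):
import numpy as np

n, df = 528_155, 1_200
print(round(float(np.log((1 + n) / (1 + df)) + 1), 4))  # ~7.0862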
def cross_validation(k_folds, num_epochs, batch_size, ids_train, ids_test,
                     str_config, histograms_mode, opt):
    histograms_total_filename = ("preprocessing/encoded_data/histograms/histograms_total"
                                 + config + "_glove_" + str(glv) + "_" + histograms_mode)
    histograms_total = load_from_pickle_file(histograms_total_filename)  # ~1.2 GB!

    all_map_test = []
    all_p20_test = []
    all_ndcg20_test = []
    all_prec_rec_test = []

    for k in range(k_folds):
        ids_train_fold = ids_train[k]  # do NOT shuffle (see loss function)
        ids_test_fold = ids_test[k]
        print("len train fold:", len(ids_train_fold))
        print("len test fold:", len(ids_test_fold))

        best_val_map = -math.inf
        count_patience = 0

        tf.reset_default_graph()
        with tf.Session() as session:
            tf.summary.FileWriter("./graphs/fold" + str(k), session.graph)
            tf.random.set_random_seed(SEED)
            model = DRMM(num_layers, units, activation_functions, max_query_len,
                         num_bins, emb_size, gating_function, SEED, learning_rate, opt)
            saver = tf.train.Saver()
            session.run(tf.global_variables_initializer())

            train_steps = len(ids_train_fold) - batch_size
            all_losses_train = []
            all_map_train = []
            all_p20_train = []
            all_ndcg20_train = []
            all_map_val = []
            all_p20_val = []
            all_ndcg20_val = []

            for epoch in range(num_epochs):
                start_time = time.time()
                epoch_train_loss = 0
                i = 0
                sims_train_epoch = []
                while i < train_steps:
                    start = i
                    end = i + batch_size
                    batch_hist = []
                    batch_idf = []
                    batch_emb = []
                    for (query_id, document_id) in ids_train_fold[start:end]:
                        hist = histograms_total[(query_id, document_id)]
                        batch_hist.append(hist)
                        batch_idf.append(padded_query_idfs.get(query_id))
                        batch_emb.append(padded_query_embs.get(query_id))
                    assert np.array(batch_hist).shape[1:] == model.matching_histograms.shape[1:]
                    assert np.array(batch_idf).shape[1:] == model.queries_idf.shape[1:]
                    assert np.array(batch_emb).shape[1:] == model.queries_embeddings.shape[1:]
                    sims_batch_train, _, c_train = session.run(
                        [model.sims, model.optimizer, model.loss],
                        feed_dict={
                            model.matching_histograms: batch_hist,
                            model.queries_idf: batch_idf,
                            model.queries_embeddings: batch_emb
                        })
                    assert len(sims_batch_train) == batch_size
                    sims_train_epoch += list(sims_batch_train)
                    epoch_train_loss += c_train
                    i += batch_size

                # the held-out fold doubles as validation set for early stopping
                hist_val = []
                idf_val = []
                emb_val = []
                for (query_id, document_id) in ids_test_fold:
                    hist = histograms_total[(query_id, document_id)]
                    hist_val.append(hist)
                    idf_val.append(padded_query_idfs.get(query_id))
                    emb_val.append(padded_query_embs.get(query_id))
                sims_val = session.run(
                    [model.sims],
                    feed_dict={
                        model.matching_histograms: hist_val,
                        model.queries_idf: idf_val,
                        model.queries_embeddings: emb_val
                    })

                print('Epoch %s' % epoch)
                print('train_loss=%2.4f, time=%4.4fs' %
                      (epoch_train_loss, time.time() - start_time))
                all_losses_train.append(epoch_train_loss)

                start_time = time.time()
                train_epoch_run_text = score_to_text_run(sims_train_epoch, ids_train_fold,
                                                         "sw_st_idf_lch")
                val_epoch_run_text = score_to_text_run(sims_val[0], ids_test_fold,
                                                       "sw_st_idf_lch")
                with open(retrieval_alg + "/training/train_epoch_run_" + str(k) + ".txt",
                          'w') as file:
                    file.write(train_epoch_run_text)
                with open(retrieval_alg + "/validation/val_epoch_run_" + str(k) + ".txt",
                          'w') as file:
                    file.write(val_epoch_run_text)

                map_train, p20_train, ndcg20_train = get_metrics_run(
                    retrieval_alg + "/training/train_epoch_run_" + str(k) + ".txt",
                    qrels_path, False)
                print('train map=%2.4f, p@20=%2.4f, ndcg@20=%2.4f, time=%4.4fs' %
                      (map_train, p20_train, ndcg20_train, time.time() - start_time))
                map_val, p20_val, ndcg20_val = get_metrics_run(
                    retrieval_alg + "/validation/val_epoch_run_" + str(k) + ".txt",
                    qrels_path, False)
                print('val map=%2.4f, p@20=%2.4f, ndcg@20=%2.4f, time=%4.4fs' %
                      (map_val, p20_val, ndcg20_val, time.time() - start_time))

                if map_val - best_val_map < min_delta:  # early stopping
                    if count_patience < patience:
                        count_patience += 1
                    else:
                        print("stopping training: no improvement!")
                        break
                if map_val > best_val_map:  # checkpoint the best validation MAP
                    best_val_map = map_val
                    count_patience = 0
                    saver.save(session, "models/model.ckpt")

                all_map_train.append(map_train)
                all_p20_train.append(p20_train)
                all_ndcg20_train.append(ndcg20_train)
                all_map_val.append(map_val)
                all_p20_val.append(p20_val)
                all_ndcg20_val.append(ndcg20_val)

            tf.summary.scalar('loss', all_losses_train)
            tf.summary.merge_all()
            make_metric_plot(str_config, all_losses_train, all_map_train, all_p20_train,
                             all_ndcg20_train, all_map_val, all_p20_val, all_ndcg20_val, k)

            hist_test = []
            idf_test = []
            emb_test = []
            for (query_id, document_id) in ids_test_fold:
                hist = histograms_total[(query_id, document_id)]
                hist_test.append(hist)
                idf_test.append(padded_query_idfs.get(query_id))
                emb_test.append(padded_query_embs.get(query_id))

            start_time = time.time()
            print("=== TESTING ===")
            saver.restore(session, "models/model.ckpt")  # best checkpoint of this fold
            predictions = session.run(
                [model.sims],
                feed_dict={
                    model.matching_histograms: hist_test,
                    model.queries_idf: idf_test,
                    model.queries_embeddings: emb_test
                })
            assert len(predictions[0]) == len(ids_test_fold)
            test_run_text = score_to_text_run(predictions[0], ids_test_fold, "sw_st_idf_ch")
            with open(retrieval_alg + "/test/test_run_" + str(k) + ".txt", 'w') as file:
                file.write(test_run_text)
            print("Testing required: %4.4fs" % (time.time() - start_time))

            map_t, p20_t, ndcg20_t, prec_rec_test = get_metrics_run(
                retrieval_alg + "/test/test_run_" + str(k) + ".txt", qrels_path, True)
            all_prec_rec_test.append(prec_rec_test)
            print(map_t, p20_t, ndcg20_t)
            all_map_test.append(map_t)
            all_p20_test.append(p20_t)
            all_ndcg20_test.append(ndcg20_t)
            make_prec_recall_11pt_curve(str_config, prec_rec_test, k)

    average_map = sum(all_map_test) / len(all_map_test)
    average_prec = sum(all_p20_test) / len(all_p20_test)
    average_ndcg = sum(all_ndcg20_test) / len(all_ndcg20_test)
    # print("Average MAP in folds:", average_map)
    # print("Average prec@20 in folds:", average_prec)
    # print("Average nDCG@20 in folds:", average_ndcg)
    make_all_prec_recall_fold_curves(str_config, all_prec_rec_test)
    return all_map_test, all_p20_test, all_ndcg20_test, average_map, average_prec, average_ndcg
import json
import numpy as np

from utilities.utilities import load_from_pickle_file


def cosine_score(v1, v2):
    # standard cosine similarity: dot product over the product of the norms
    xy = np.dot(v1, v2)
    xx = np.dot(v1, v1)
    yy = np.dot(v2, v2)
    return float(
        "%.2f" % (xy / np.sqrt(xx * yy)))  # rounded just to show readable values on the plot


with open('config.json') as config_file:
    data = json.load(config_file)
conf = data["conf"]

vocabulary_filename = "preprocessing/encoded_data/vocabulary/word_index" + conf
queries_filename = "preprocessing/encoded_data/Queries/Queries_encoded" + conf
corpus_filename = "preprocessing/encoded_data/Corpus/Corpus_encoded" + conf
corpus_model_filename = "preprocessing/encoded_data/embeddings/word_embeddings" + conf

vocabulary = load_from_pickle_file(vocabulary_filename)
ivd = {v: k for k, v in vocabulary.items()}  # inverse vocabulary: index -> word
queries = load_from_pickle_file(queries_filename)
corpus = load_from_pickle_file(corpus_filename)
corpus_model = load_from_pickle_file(corpus_model_filename)

sample_query = queries['301'][:3]
sample_document = corpus['FBIS3-10082'][20:40]

trace = []
for query_term in sample_query:
    qtv = corpus_model[query_term]
    trace_qt = []
    for doc_term in sample_document:
        trace_qt.append(cosine_score(qtv, corpus_model[doc_term]))
    trace.append(trace_qt)
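# trace is now a len(sample_query) x len(sample_document) matrix of term-term
# similarities. One way to inspect it as a heatmap; matplotlib here is an
# assumption (the variable name "trace" hints the original may use plotly).
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 3))
im = ax.imshow(trace, cmap="viridis", vmin=-1, vmax=1)
ax.set_xticks(range(len(sample_document)))
ax.set_xticklabels([ivd.get(t, "?") for t in sample_document], rotation=90)
ax.set_yticks(range(len(sample_query)))
ax.set_yticklabels([ivd.get(t, "?") for t in sample_query])
fig.colorbar(im, ax=ax, label="cosine similarity")
plt.tight_layout()
plt.show()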