Example #1
from tqdm import tqdm
from utilities.utilities import load_from_pickle_file, save_to_pickle_file


# MatchingHistograms and the queries, corpus, oov_queries, oov_corpus and
# model objects are loaded at module level by the calling script (see
# Example #6)
def save_local_histograms(retrieval_alg, outv, num_bins, max_query_len, conf,
                          histograms_mode):
    preranked_total_filename = "preranked/preranked_total_" + retrieval_alg
    preranked_total = load_from_pickle_file(preranked_total_filename)
    histograms = {}
    matching_histograms = MatchingHistograms(num_bins, max_query_len)
    for (query_id, document_id) in tqdm(preranked_total.ground_truth.keys()):
        query = queries.get(query_id)
        document = corpus.get(document_id)
        if query is not None and document is not None:
            oov_document = oov_corpus.get(document_id)
            oov_query = oov_queries.get(query_id)
            hist = matching_histograms.get_histograms(query, document, model,
                                                      outv, oov_query,
                                                      oov_document,
                                                      histograms_mode)
            histograms[(query_id, document_id)] = hist
    save_to_pickle_file(
        "preprocessing/encoded_data/histograms/histograms_total" + conf + "_" +
        histograms_mode, histograms)
Example #2
from drmm import DRMM
from utilities.utilities import load_from_pickle_file
import tensorflow as tf
import json

with open('config.json') as config_file:
    data = json.load(config_file)

SEED = data["seed"]
stopwords = data["stopwords"]
stemmed = data["stemmed"]
histograms_mode = data["hist_mode"]
SEED = 42  # overrides the seed read from config.json above
num_layers = 3
units = [30, 5, 1]
activation_functions = ["tanh"] * num_layers
num_bins = 30
batch_size = data["batch_size"]
emb_size = 300
learning_rate = 1e-2
gating_function = data["gating_function"]
conf = data["conf"]

padded_query_idfs_filename = "preprocessing/encoded_data/idfs/padded_query_idfs" + conf
padded_query_idfs = load_from_pickle_file(padded_query_idfs_filename)
max_query_len = len(list(padded_query_idfs.values())[0])

model = DRMM(num_layers, units, activation_functions, max_query_len, num_bins, emb_size, gating_function, SEED,
             learning_rate)

with tf.Session() as sess:
    writer = tf.summary.FileWriter('./graphs', sess.graph)
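
For reference, a minimal sketch of the config.json that these scripts read. The keys are the ones actually accessed across the examples; the concrete values are assumptions for illustration only, not taken from the repository.

import json

# hypothetical configuration; adjust the values to your own setup
config = {
    "seed": 42,
    "stopwords": True,
    "stemmed": True,
    "hist_mode": "lch",        # histogram mode (assumed value)
    "batch_size": 20,          # assumed value
    "gating_function": "idf",  # assumed value
    "conf": "_sw_st",          # filename suffix used by the pickle paths (assumed)
    "retrieval_alg": "QL",     # "QL" or "Bm25" (see Example #8)
    "use_glove": False,
}

with open("config.json", "w") as f:
    json.dump(config, f, indent=2)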
Example #3
    # fragment: tail of a helper that trains word2vec embeddings; data, algo,
    # size, window, min_count, mode, q and file_name come from the enclosing
    # function, which is not shown in this excerpt
    negative = 10  # negative samples
    sample = 1e-4  # negative sub-sample for infrequent words

    if q:
        min_count = 1  # in a query all terms should be considered

    # memory required (approx.) #vocabulary * #size * 4 (float)
    embeddings(data, algo, size, window, min_count, mode, negative, sample, file_name + ".bin")


'''corpus_filename = "preprocessing/pre_data/Corpus/Corpus" + conf
queries_filename = "preprocessing/pre_data/Queries/Queries" + conf'''
corpus_model_filename = "preprocessing/pre_data/models/corpus_model" + conf
queries_model_filename = "preprocessing/pre_data/models/queries_model" + conf

corpus_sent_filename = "preprocessing/pre_data/Corpus/sents_corpus" + conf
queries_sent_filename = "preprocessing/pre_data/Queries/sents_queries" + conf

corpus_sent = load_from_pickle_file(corpus_sent_filename)
queries_sent = load_from_pickle_file(queries_sent_filename)

# corpus_obj = load_from_pickle_file(corpus_filename)

text_embeddings(corpus_sent, corpus_model_filename, False)

# queries_obj = load_from_pickle_file(queries_filename)

# lines_queries = [query.get_text().split() for query in queries_obj.values()]

text_embeddings(queries_sent, queries_model_filename, True)
Example #4
from krovetzstemmer import Stemmer
from utilities.utilities import load_from_pickle_file
import json

with open('config.json') as config_file:
    data = json.load(config_file)

stopwords = data["stopwords"]
stemmed = data["stemmed"]
conf = data["conf"]

corpus_filename = "preprocessing/pre_data/Corpus/Corpus"
queries_filename = "preprocessing/pre_data/Queries/Queries"
corpus_sent_filename = "preprocessing/pre_data/Corpus/sents_corpus"
queries_sent_filename = "preprocessing/pre_data/Queries/sents_queries"

corpus_obj = load_from_pickle_file(corpus_filename)
queries_obj = load_from_pickle_file(queries_filename)
corpus_sent = load_from_pickle_file(corpus_sent_filename)
queries_sent = load_from_pickle_file(queries_sent_filename)

if stopwords:

    print("Removing stopwords...")

    stopwords_list = []

    with open("inquery", 'r') as f:
        for line in f.readlines():
            stopwords_list.append(line.strip('\n'))

    stopwords_list = set(stopwords_list)
Example #5
import random
import json
from utilities.utilities import load_from_pickle_file
# MatchingHistograms is the histogram helper class also used in Example #1

with open('config.json') as config_file:
    data = json.load(config_file)

random.seed(data["seed"])
stopwords = data["stopwords"]
stemmed = data["stemmed"]
histograms_mode = data["hist_mode"]
conf = data["conf"]
retrieval_alg = data["retrieval_alg"]

queries_filename = "preprocessing/encoded_data/Queries/Queries_encoded" + conf
corpus_filename = "preprocessing/encoded_data/Corpus/Corpus_encoded" + conf
corpus_model_filename = "preprocessing/encoded_data/embeddings/word_embeddings" + conf
oov_queries_filename = "preprocessing/encoded_data/Queries/Queries_encoded_oov" + conf
oov_corpus_filename = "preprocessing/encoded_data/Corpus/Corpus_encoded_oov" + conf
qrels_filename = "preranked/preranked_total_" + retrieval_alg
queries = load_from_pickle_file(queries_filename)
corpus = load_from_pickle_file(corpus_filename)
corpus_model = load_from_pickle_file(corpus_model_filename)
oov_corpus = load_from_pickle_file(oov_corpus_filename)
oov_queries = load_from_pickle_file(oov_queries_filename)
qrels = load_from_pickle_file(qrels_filename)

topic = "301"
max_query_len = len(queries.get(topic))
num_bins = 30
matching_histograms = MatchingHistograms(num_bins, max_query_len)
positive_doc = random.choice(list(qrels.get_relevant_docs(topic).keys()))
negative_doc = random.choice(list(qrels.get_non_relevant_docs(topic).keys()))
query = queries.get(topic)
pos_document = corpus.get(positive_doc[1])
pos_oov_document = oov_corpus.get(positive_doc[1])
Example #6
import json
from utilities.utilities import load_from_pickle_file
# save_local_histograms is the helper defined in Example #1

with open('config.json') as config_file:
    data = json.load(config_file)

stopwords = data["stopwords"]
stemmed = data["stemmed"]
histograms_mode = data["hist_mode"]
glv = data["use_glove"]
conf = data["conf"]
retrieval_alg = data["retrieval_alg"]

num_bins = 30

queries_filename = "preprocessing/encoded_data/Queries/Queries_encoded" + conf
corpus_filename = "preprocessing/encoded_data/Corpus/Corpus_encoded" + conf
corpus_model_filename = "preprocessing/encoded_data/embeddings/word_embeddings" + conf
corpus_model_out_filename = "preprocessing/encoded_data/embeddings/word_embeddings_out" + conf
oov_queries_filename = "preprocessing/encoded_data/Queries/Queries_encoded_oov" + conf
oov_corpus_filename = "preprocessing/encoded_data/Corpus/Corpus_encoded_oov" + conf
queries = load_from_pickle_file(queries_filename)
corpus = load_from_pickle_file(corpus_filename)
model = load_from_pickle_file(corpus_model_filename)
model_out = load_from_pickle_file(corpus_model_out_filename)
oov_corpus = load_from_pickle_file(oov_corpus_filename)
oov_queries = load_from_pickle_file(oov_queries_filename)

max_query_len = max([len(q) for q in queries.values()])

save_local_histograms(retrieval_alg, model_out, num_bins, max_query_len,
                      conf + "_glove_" + str(glv), histograms_mode)
Example #7

import random
from utilities.utilities import load_from_pickle_file
# retrieval_alg, k, n_pos, n_neg and prepare_train_ids are expected to be
# defined elsewhere in the full script; this excerpt only shows
# prepare_test_ids and the fold-splitting driver


def prepare_test_ids(qrels, topics_test):
    ids_test = {}
    for topic in topics_test:
        pairs = qrels.get_pairs_topic(str(topic))
        ids_test.update(pairs)
    print("len test labels", len(ids_test.keys()))
    return list(ids_test.keys())
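
# A hypothetical counterpart to prepare_test_ids: the real prepare_train_ids
# called below is not part of this excerpt, but presumably it samples n_pos
# relevant and n_neg non-relevant documents per training topic via the Qrels
# helpers seen in the other examples (get_relevant_docs / get_non_relevant_docs)
def prepare_train_ids_sketch(qrels, topics_train, n_pos, n_neg):
    ids_train = []
    for topic in topics_train:
        relevant = list(qrels.get_relevant_docs(str(topic)).keys())
        non_relevant = list(qrels.get_non_relevant_docs(str(topic)).keys())
        ids_train += random.sample(relevant, min(n_pos, len(relevant)))
        ids_train += random.sample(non_relevant, min(n_neg, len(non_relevant)))
    print("len train labels", len(ids_train))
    return ids_train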


if __name__ == "__main__":

    qrels_filename = "preranked/preranked_total_" + retrieval_alg

    qrels = load_from_pickle_file(qrels_filename)

    topics = list(range(301, 451)) + list(range(601, 701))
    random.shuffle(topics)

    cleared_ids_train = []
    cleared_ids_test = []

    for i in range(k):
        topic_test = topics[(i * 50):(i + 1) * 50]
        topic_train = [topic for topic in topics if topic not in topic_test]
        ids_train = prepare_train_ids(qrels, topic_train, n_pos, n_neg)
        ids_test = prepare_test_ids(qrels, topic_test)
        cleared_ids_train.append(ids_train)
        cleared_ids_test.append(ids_test)
Example #8
from utilities.utilities import Qrels, load_from_pickle_file
from tqdm import tqdm
import pickle
import json

qrels_file = open('preprocessing/pre_data/Qrels/Qrels_cleaned', 'rb')
qrels_obj = pickle.load(qrels_file)
qrels_file.close()

qrels_file = load_from_pickle_file('preprocessing/pre_data/Qrels/Qrels_cleaned')
corpus_obj = load_from_pickle_file('preprocessing/pre_data/Corpus/Corpus')
queries_obj = load_from_pickle_file('preprocessing/pre_data/Queries/Queries')

num_topics = len(queries_obj)

with open('config.json') as config_file:
    data = json.load(config_file)

retrieval_alg = data["retrieval_alg"]

if retrieval_alg == "QL":
    preranked_filename = "comparison/terrier_preranked/DirichletLM_6.res"
elif retrieval_alg == "Bm25":
    preranked_filename = "comparison/terrier_preranked/BM25.res"

""" create runs objects from galago batch-search output """
with open(preranked_filename, 'r') as results:
    runsList = (line.split() for line in results)

    runs = {}
    sum = 0
Example #9
from utilities.utilities import load_from_pickle_file, save_to_pickle_file
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import chain
from tqdm import tqdm
import json

with open('config.json') as config_file:
    data = json.load(config_file)

stopwords = data["stopwords"]
stemmed = data["stemmed"]
conf = data["conf"]

corpus_filename = "preprocessing/pre_data/Corpus/Corpus" + conf
queries_filename = "preprocessing/pre_data/Queries/Queries" + conf

corpus_obj = load_from_pickle_file(corpus_filename)
queries_obj = load_from_pickle_file(queries_filename)

tfidf = TfidfVectorizer()
tfidf.fit(
    chain((doc.get_text() for doc in tqdm(corpus_obj.docs.values())),
          (query.title for query in queries_obj.values())))
idfs = dict(zip(tfidf.get_feature_names(), tfidf.idf_))

# assert len(words) == len(idfs.keys())

idf_filename = "preprocessing/pre_data/idfs/idfs" + conf

save_to_pickle_file(idf_filename, idfs)
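
Downstream scripts (Examples #2 and #10) consume a padded_query_idfs mapping in which every query becomes a fixed-length vector of per-term idf weights. The padding step itself is not part of this excerpt; the following is only a rough sketch that tokenises the query titles and zero-pads to a common length.

# hypothetical padding step (not from the excerpt)
tokenised = {qid: query.title.split() for qid, query in queries_obj.items()}
max_query_len = max(len(terms) for terms in tokenised.values())
padded_query_idfs = {}
for qid, terms in tokenised.items():
    vec = [idfs.get(term, 0.0) for term in terms]
    padded_query_idfs[qid] = vec + [0.0] * (max_query_len - len(vec))
save_to_pickle_file("preprocessing/encoded_data/idfs/padded_query_idfs" + conf,
                    padded_query_idfs)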
Example #10
# relies on module-level objects loaded earlier in the script (num_layers,
# units, activation_functions, SEED, learning_rate, emb_size, gating_function,
# max_query_len, num_bins, padded_query_idfs, padded_query_embs, qrels_path,
# retrieval_alg, patience, min_delta, config, glv)
def cross_validation(k_folds, num_epochs, batch_size, ids_train, ids_test,
                     str_config, histograms_mode, opt):
    histograms_total_filename = "preprocessing/encoded_data/histograms/histograms_total" + config + "_glove_" + str(glv) \
                                + "_" + histograms_mode
    histograms_total = load_from_pickle_file(
        histograms_total_filename)  # 1.2 gb!
    all_map_test = []
    all_p20_test = []
    all_ndcg20_test = []
    all_prec_rec_test = []
    for k in range(k_folds):
        ids_train_fold = ids_train[k]  # do NOT shuffle (see loss function)
        ids_test_fold = ids_test[k]
        print("len train fold:", len(ids_train_fold))
        print("len test fold:", len(ids_test_fold))
        best_val_map = -math.inf
        count_patience = 0
        tf.reset_default_graph()
        with tf.Session() as session:
            tf.summary.FileWriter("./graphs/fold" + str(k), session.graph)
            tf.random.set_random_seed(SEED)

            model = DRMM(num_layers, units, activation_functions,
                         max_query_len, num_bins, emb_size, gating_function,
                         SEED, learning_rate, opt)
            saver = tf.train.Saver()
            session.run(tf.global_variables_initializer())
            train_steps = len(ids_train_fold) - batch_size
            all_losses_train = []
            all_map_train = []
            all_p20_train = []
            all_ndcg20_train = []
            all_map_val = []
            all_p20_val = []
            all_ndcg20_val = []
            for epoch in range(num_epochs):
                start_time = time.time()
                epoch_train_loss = 0

                i = 0
                sims_train_epoch = []
                while i < train_steps:
                    start = i
                    end = i + batch_size

                    batch_hist = []
                    batch_idf = []
                    batch_emb = []

                    for (query_id, document_id) in ids_train_fold[start:end]:
                        hist = histograms_total[(query_id, document_id)]
                        batch_hist.append(hist)
                        batch_idf.append(padded_query_idfs.get(query_id))
                        batch_emb.append(padded_query_embs.get(query_id))

                    assert np.array(batch_hist).shape[
                        1:] == model.matching_histograms.shape[1:]
                    assert np.array(
                        batch_idf).shape[1:] == model.queries_idf.shape[1:]
                    assert np.array(batch_emb).shape[
                        1:] == model.queries_embeddings.shape[1:]
                    sims_batch_train, _, c_train = session.run(
                        [model.sims, model.optimizer, model.loss],
                        feed_dict={
                            model.matching_histograms: batch_hist,
                            model.queries_idf: batch_idf,
                            model.queries_embeddings: batch_emb
                        })
                    assert len(sims_batch_train) == batch_size
                    sims_train_epoch += list(sims_batch_train)
                    epoch_train_loss += c_train
                    i += batch_size

                hist_val = []
                idf_val = []
                emb_val = []
                for (query_id, document_id) in ids_test_fold:
                    hist = histograms_total[(query_id, document_id)]
                    hist_val.append(hist)
                    idf_val.append(padded_query_idfs.get(query_id))
                    emb_val.append(padded_query_embs.get(query_id))
                sims_val = session.run(
                    [model.sims],
                    feed_dict={
                        model.matching_histograms: hist_val,
                        model.queries_idf: idf_val,
                        model.queries_embeddings: emb_val
                    })
                print('Epoch %s' % epoch)
                print('train_loss=%2.4f, time=%4.4fs' %
                      (epoch_train_loss, time.time() - start_time))
                all_losses_train.append(epoch_train_loss)
                start_time = time.time()
                train_epoch_run_text = score_to_text_run(
                    sims_train_epoch, ids_train_fold, "sw_st_idf_lch")
                val_epoch_run_text = score_to_text_run(sims_val[0],
                                                       ids_test_fold,
                                                       "sw_st_idf_lch")
                with open(
                        retrieval_alg + "/training/train_epoch_run_" + str(k) +
                        ".txt", 'w') as file:
                    file.write(train_epoch_run_text)
                with open(
                        retrieval_alg + "/validation/val_epoch_run_" + str(k) +
                        ".txt", 'w') as file:
                    file.write(val_epoch_run_text)
                map_train, p20_train, ndcg20_train = get_metrics_run(
                    retrieval_alg + "/training/train_epoch_run_" + str(k) +
                    ".txt", qrels_path, False)
                print(
                    'train map=%2.4f, p@20=%2.4f, ndcg@20=%2.4f, time=%4.4fs' %
                    (map_train, p20_train, ndcg20_train,
                     time.time() - start_time))
                map_val, p20_val, ndcg20_val = get_metrics_run(
                    retrieval_alg + "/validation/val_epoch_run_" + str(k) +
                    ".txt", qrels_path, False)
                print('val map=%2.4f, p@20=%2.4f, ndcg@20=%2.4f, time=%4.4fs' %
                      (map_val, p20_val, ndcg20_val, time.time() - start_time))
                if map_val - best_val_map < min_delta:  # early stopping
                    if count_patience < patience:
                        count_patience += 1
                    else:
                        print("stopping training: no improvements!")
                        break
                if map_val > best_val_map:  # save model with best validation map
                    best_val_map = map_val
                    count_patience = 0
                    saver.save(session, "models/model.ckpt")
                all_map_train.append(map_train)
                all_p20_train.append(p20_train)
                all_ndcg20_train.append(ndcg20_train)
                all_map_val.append(map_val)
                all_p20_val.append(p20_val)
                all_ndcg20_val.append(ndcg20_val)

                tf.summary.scalar('loss', all_losses_train)
                tf.summary.merge_all()

            make_metric_plot(str_config, all_losses_train, all_map_train,
                             all_p20_train, all_ndcg20_train, all_map_val,
                             all_p20_val, all_ndcg20_val, k)

            hist_test = []
            idf_test = []
            emb_test = []
            for (query_id, document_id) in ids_test_fold:
                hist = histograms_total[(query_id, document_id)]
                hist_test.append(hist)
                idf_test.append(padded_query_idfs.get(query_id))
                emb_test.append(padded_query_embs.get(query_id))
            start_time = time.time()
            print("=== TESTING ===")
            saver.restore(session, "models/model.ckpt")
            predictions = session.run(
                [model.sims],
                feed_dict={
                    model.matching_histograms: hist_test,
                    model.queries_idf: idf_test,
                    model.queries_embeddings: emb_test
                })
            assert len(predictions[0]) == len(ids_test_fold)
            test_run_text = score_to_text_run(predictions[0], ids_test_fold,
                                              "sw_st_idf_ch")
            with open(retrieval_alg + "/test/test_run_" + str(k) + ".txt",
                      'w') as file:
                file.write(test_run_text)
            print("Testing required: %4.4fs" % (time.time() - start_time))
            map_t, p20_t, ndcg20_t, prec_rec_test = get_metrics_run(
                retrieval_alg + "/test/test_run_" + str(k) + ".txt",
                qrels_path, True)
            all_prec_rec_test.append(prec_rec_test)
            print(map_t, p20_t, ndcg20_t)
            all_map_test.append(map_t)
            all_p20_test.append(p20_t)
            all_ndcg20_test.append(ndcg20_t)

            make_prec_recall_11pt_curve(str_config, prec_rec_test, k)

    average_map = sum(all_map_test) / len(all_map_test)
    average_prec = sum(all_p20_test) / len(all_p20_test)
    average_ndcg = sum(all_ndcg20_test) / len(all_ndcg20_test)

    # print("Average MAP in folds:", average_map)
    # print("Average prec@20 in folds:", average_prec)
    # print("Average nDCG@20 in folds:", average_ndcg)
    make_all_prec_recall_fold_curves(str_config, all_prec_rec_test)
    return all_map_test, all_p20_test, all_ndcg20_test, average_map, average_prec, average_ndcg
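
Tying Examples #7 and #10 together, a hypothetical driver for the cross-validation loop; the fold count matches the 50-topic splits of Example #7, while the epoch, batch-size and optimizer values are assumptions rather than repository defaults.

# hypothetical driver: cleared_ids_train / cleared_ids_test are the per-fold
# id lists built as in Example #7; the parameter values below are assumed
k_folds = 5          # 250 topics split into folds of 50
num_epochs = 30      # assumed
batch_size = 20      # assumed
maps, p20s, ndcg20s, avg_map, avg_p20, avg_ndcg = cross_validation(
    k_folds, num_epochs, batch_size, cleared_ids_train, cleared_ids_test,
    "sw_st_idf_lch", histograms_mode, "adam")
print("MAP per fold:", maps, "average:", avg_map)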
Example #11
import json
import numpy as np
from utilities.utilities import load_from_pickle_file


def cosine_score(x, y):
    # plain cosine similarity; only the return statement of this function
    # appears in the original excerpt, the head is an assumed reconstruction
    xy, xx, yy = np.dot(x, y), np.dot(x, x), np.dot(y, y)
    return float(
        "%.2f" %
        (xy / np.sqrt(xx * yy)))  # just to show numerical data on plot


with open('config.json') as config_file:
    data = json.load(config_file)

conf = data["conf"]

vocabulary_filename = "preprocessing/encoded_data/vocabulary/word_index" + conf
queries_filename = "preprocessing/encoded_data/Queries/Queries_encoded" + conf
corpus_filename = "preprocessing/encoded_data/Corpus/Corpus_encoded" + conf
corpus_model_filename = "preprocessing/encoded_data/embeddings/word_embeddings" + conf

vocabulary = load_from_pickle_file(vocabulary_filename)
ivd = {v: k for k, v in vocabulary.items()}
queries = load_from_pickle_file(queries_filename)
corpus = load_from_pickle_file(corpus_filename)
corpus_model = load_from_pickle_file(corpus_model_filename)

sample_query = queries['301'][:3]
sample_document = corpus['FBIS3-10082'][20:40]

trace = []
for query_term in sample_query:
    qtv = corpus_model[query_term]
    trace_qt = []
    for doc_term in sample_document:
        trace_qt.append(cosine_score(qtv, corpus_model[doc_term]))
    trace.append(trace_qt)
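
The nested loops above build a small matrix of query-term vs. document-term cosine scores. One way to look at it is a heatmap; matplotlib is not imported in this excerpt, so the snippet below is an illustrative assumption.

import matplotlib.pyplot as plt

# rows: the 3 sampled query terms, columns: the 20 sampled document terms
plt.imshow(trace, cmap="viridis", aspect="auto")
plt.colorbar(label="cosine similarity")
plt.yticks(range(len(sample_query)), [ivd[t] for t in sample_query])
plt.xticks(range(len(sample_document)), [ivd[t] for t in sample_document],
           rotation=90)
plt.tight_layout()
plt.show()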