Example #3
def grid_nrbp(p_forwards=[0.5, 0.9, 1], p_reverses=[0.5, 0.9, 1], ranks=[100]):
    x = []
    y = []
    data = []
    baseline_data = []
    opts = get_opts()
    # The following opts fields must be specified:
    #   reverse_ranker_path
    #   test_data_path
    #   device
    #   active_learning_stage
    true_dict, baseline_dict, result_dict, args_dict = testing(opts)
    active_learning = args_dict["active_learning"]
    network_type = args_dict["network_type"]
    num_query = args_dict["num_query"]
    num_passage = args_dict["num_passage"]
    for r in ranks:
        for p_forward in p_forwards:
            rating_dict = transform_ground_truth(true_dict, p_forward)
            for p_reverse in p_reverses:
                x.append(p_forward)
                y.append(p_reverse)
                baseline_nrbp = calculate_metrics(rating_dict, baseline_dict,
                                                  r, p_reverse)
                model_nrbp = calculate_metrics(rating_dict, result_dict, r,
                                               p_reverse)
                # data.append((model_nrbp-baseline_nrbp)/baseline_nrbp)
                data.append(model_nrbp)
                baseline_data.append(baseline_nrbp)
                print_message("Processed p_forward={}, p_reverse={}".format(
                    p_forward, p_reverse))

    # Write results to csv
    output_results = ([active_learning, network_type, num_query, num_passage]
                      + data + baseline_data)
    with open(OUTPUT_PATH, mode='a+') as output:
        output_writer = csv.writer(output)
        output_writer.writerow(output_results)
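
A quick usage sketch, assuming OUTPUT_PATH and the opts-driven paths above are configured; the grid values here are illustrative, not from the original script:

# Illustrative sweep over a 2x2 persistence grid at a single rank cutoff.
grid_nrbp(p_forwards=[0.5, 0.9], p_reverses=[0.5, 0.9], ranks=[100])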
Example #4
active_learning_option = opts.active_learning_option
active_learning_stage = opts.active_learning_stage
device = opts.device
data_option = opts.data_option
N_QUERIES = opts.n_query
TRAIN_SIZE = opts.n_passage

if active_learning_option == "No":
    TRAINING_DATA_PATH = "/datadrive/ruohan/final_train_test_data/ance_training_rank{}_nqueries{}_npassages{}_{}.csv".format(
        RANK, N_QUERIES, TRAIN_SIZE, data_option)
    TEST_DATA_PATH = "/datadrive/ruohan/final_train_test_data/ance_testing_rank{}_nqueries{}_npassages{}_{}.csv".format(
        RANK, N_QUERIES, TEST_SIZE, data_option)
    print("Training data is stored at {}".format(TRAINING_DATA_PATH))
    print("Test data is stored at {}".format(TEST_DATA_PATH))

    print_message("Loading embeddings.")
    passage_embeddings = obj_reader(
        "/home/jianx/results/passage_0__emb_p__data_obj_0.pb")
    query_train_embeddings = obj_reader(
        "/home/jianx/results/query_0__emb_p__data_obj_0.pb")

else:
    reverse_ranker_path = opts.reverse_ranker_path
    reverse_ranker, network_type = load_model(reverse_ranker_path, device)
    passage_embeddings = obj_reader(
        "/home/jianx/results/passage_0__emb_p__data_obj_0.pb")
    query_train_embeddings = obj_reader(
        "/home/jianx/results/query_0__emb_p__data_obj_0.pb")
    passage_embeddings = transform_np_transformation(passage_embeddings,
                                                     reverse_ranker, device)
    query_train_embeddings = transform_np_transformation(
        query_train_embeddings, reverse_ranker, device)
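
The helper transform_np_transformation is not shown in this excerpt. A minimal sketch of what it plausibly does, assuming the reverse ranker is a PyTorch module that maps embeddings into a new space (everything below beyond the names in the excerpt is an assumption):

import numpy as np
import torch

def transform_np_transformation(embeddings, model, device, batch_size=4096):
    # Sketch: push numpy embeddings through the torch reverse ranker in
    # batches and return the transformed vectors as numpy again.
    model.eval()
    outputs = []
    with torch.no_grad():
        for start in range(0, len(embeddings), batch_size):
            batch = torch.from_numpy(
                embeddings[start:start + batch_size]).float().to(device)
            outputs.append(model(batch).cpu().numpy())
    return np.concatenate(outputs, axis=0)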
Example #5
import sys
import random
import numpy as np
import matplotlib.pyplot as plt
from annoy import AnnoyIndex

sys.path.insert(0, '/home/jianx/search-exposure/')
import forward_ranker.load_data as load_data
from forward_ranker.utils import print_message

PASSAGE_DICT_PATH = "/datadrive/jianx/data/passages.dict"
QUERY_TRAIN_DICT_PATH = "/datadrive/jianx/data/queries_train.dict"

# Queries plain text: queries.train.tsv
# Passages plain text: collection.tsv
QUERIES_TEXT_PATH = "/datadrive/jianx/data/queries.train.tsv"
PASSAGES_TEXT_PATH = "/datadrive/jianx/data/collection.tsv"

obj_reader = load_data.obj_reader
obj_writer = load_data.obj_writer

print_message("Loading embeddings.")
passage_embeddings = obj_reader(
    "/home/jianx/results/passage_0__emb_p__data_obj_0.pb")
query_train_embeddings = obj_reader(
    "/home/jianx/results/query_0__emb_p__data_obj_0.pb")

from sklearn.decomposition import PCA
all_embeddings = np.concatenate((passage_embeddings, query_train_embeddings),
                                axis=0)
print(all_embeddings.shape)
pca = PCA(n_components=50)
pca.fit(all_embeddings)

print("PCA Explained Variance: {}%".format(
    np.round(sum(pca.explained_variance_ratio_) * 100, 4)))
all_embeddings_pca = pca.transform(all_embeddings)
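# Because passages and queries were concatenated along axis 0 above, the
# PCA-reduced matrix can be split back at the passage count (illustrative
# lines, using only the arrays from this excerpt):
n_passages = passage_embeddings.shape[0]
passage_embeddings_pca = all_embeddings_pca[:n_passages]
query_train_embeddings_pca = all_embeddings_pca[n_passages:]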
# TREE_SIZE and TYPE come from the command line; EMBEDDING_PATH, RAW_PATH
# and OUT_DIR are defined elsewhere in the script.
TREE_SIZE = int(sys.argv[4])
TYPE = sys.argv[5]


def generate_annoy_index(embed_size, embeddings, raw_data_path):
    """Build a dot-product Annoy index over `embeddings` and return it
    together with an item-position -> id mapping."""
    offset_mapping = obj_reader(raw_data_path)
    # Invert the stored mapping so values become keys.
    mapping = {v: k for k, v in offset_mapping.items()}
    index = AnnoyIndex(embed_size, 'dot')
    for i, value in enumerate(embeddings):
        index.add_item(i, value)
        if i % 1_000_000 == 0:
            print_message("Progress: " + str(i) + "/" + str(len(embeddings)) +
                          " " + str(i / len(embeddings)))
    return index, mapping


passage_embeddings = obj_reader(EMBEDDING_PATH)
print(passage_embeddings.shape)

passage_index, passage_mapping = generate_annoy_index(768, passage_embeddings,
                                                      RAW_PATH)
del passage_embeddings
obj_writer(passage_mapping,
           OUT_DIR + str(TREE_SIZE) + "_ance_" + TYPE + "_map.dict")
del passage_mapping

print_message("Start Building.")
passage_index.build(TREE_SIZE)
print_message("Finished Building.")

passage_index.save(OUT_DIR + str(TREE_SIZE) + "_" + TYPE + "_ance_index.ann")
print_message("Successfully Saved.")
def generate_annoy_index(embed_size, embeddings):
    """Build a Euclidean Annoy index from a {key: vector} dict; return the
    index and an item-position -> key mapping."""
    mapping = {}
    index = AnnoyIndex(embed_size, 'euclidean')
    for i, (key, value) in enumerate(embeddings.items()):
        index.add_item(i, value)
        mapping[i] = key
        if (i + 1) % 50000 == 0:
            print_message("Progress: " + str(i + 1) + "/" + str(len(embeddings)) +
                          " " + str((i + 1) / len(embeddings)))
    return index, mapping


print_message("Start Loading Embeddings")
PASSAGE_EMBEDDINGS = obj_reader(
    "/home/jianx/data/results/passage_embeddings.dict")
print_message("Embeddings Successfully Loaded")

PID_INDEX, PID_MAP = generate_annoy_index(EMBED_SIZE, PASSAGE_EMBEDDINGS)
del PASSAGE_EMBEDDINGS
print_message("Start Building.")
PID_INDEX.build(TREE_SIZE)
print_message("Finished Building.")

PID_INDEX.save("/home/jianx/data/annoy/" + str(TREE_SIZE) +
               "_passage_index.ann")
obj_writer(PID_MAP,
           "/home/jianx/data/annoy/" + str(TREE_SIZE) + "_pid_map.dict")
print_message("Successfully Saved.")
Example #8
import sys
sys.path.insert(0, '/home/jianx/search-exposure/')
import faiss
import forward_ranker.load_data as load_data
from forward_ranker.utils import print_message

obj_reader = load_data.obj_reader
obj_writer = load_data.obj_writer

IS_FLAT = False
BATCH_SIZE = 100
NLIST = 100
RANK = 100
OUT_PATH = "/datadrive/jianx/data/results/all_search_rankings_{}_{}_{}.csv".format(
    NLIST, RANK, "flat" if IS_FLAT else "approximate")

print_message("Loading embeddings.")
passage_embeddings = obj_reader(
    "/home/jianx/results/passage_0__emb_p__data_obj_0.pb")
query_train_embeddings = obj_reader(
    "/home/jianx/results/query_0__emb_p__data_obj_0.pb")
query_train_mapping = obj_reader(
    "/datadrive/jianx/data/annoy/100_ance_query_train_map.dict")
pid_mapping = obj_reader(
    "/datadrive/jianx/data/annoy/100_ance_passage_map.dict")

print_message("Building index")
faiss.omp_set_num_threads(16)
dim = passage_embeddings.shape[1]
if IS_FLAT:
    cpu_index = faiss.IndexFlatIP(dim)
else:
    # Assumed completion of the truncated excerpt: an IVF index with NLIST
    # inverted lists over an inner-product quantizer.
    quantizer = faiss.IndexFlatIP(dim)
    cpu_index = faiss.IndexIVFFlat(quantizer, dim, NLIST,
                                   faiss.METRIC_INNER_PRODUCT)
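
Either variant is then populated and searched the same way. A minimal sketch of the remaining steps, using only the names defined above (an IVF index must be trained on sample vectors before add()):

if not IS_FLAT:
    cpu_index.train(passage_embeddings)  # learn the IVF coarse quantizer
cpu_index.add(passage_embeddings)

# Search the training queries in batches, keeping the top RANK passages.
for start in range(0, query_train_embeddings.shape[0], BATCH_SIZE):
    batch = query_train_embeddings[start:start + BATCH_SIZE]
    scores, neighbors = cpu_index.search(batch, RANK)
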
import sys
sys.path.insert(0, '/home/jianx/search-exposure/')
import faiss
import numpy as np
import forward_ranker.load_data as load_data
from forward_ranker.utils import print_message

obj_reader = load_data.obj_reader
obj_writer = load_data.obj_writer

SAMPLE_SIZE = 1000
RANK = 100
TRAINING_DATA_PATH = "/datadrive/jianx/data/train_data/ance_training_rank{}_{}.csv".format(
    RANK, SAMPLE_SIZE)

print_message("Loading embeddings.")
passage_embeddings = obj_reader(
    "/home/jianx/results/passage_0__emb_p__data_obj_0.pb")
query_train_embeddings = obj_reader(
    "/home/jianx/results/query_0__emb_p__data_obj_0.pb")
query_train_mapping = obj_reader(
    "/datadrive/jianx/data/annoy/100_ance_query_train_map.dict")
pid_mapping = obj_reader(
    "/datadrive/jianx/data/annoy/100_ance_passage_map.dict")
pid_offset = obj_reader(
    "/datadrive/data/preprocessed_data_with_test/pid2offset.pickle")

print_message("Building index")
faiss.omp_set_num_threads(16)
dim = passage_embeddings.shape[1]
passage_index = faiss.IndexFlatIP(dim)
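
A sketch of the likely next steps, using the names loaded above: add the passage vectors, search a sample of SAMPLE_SIZE training queries, and translate faiss row ids back to external ids through the two mappings. The exact sampling scheme and CSV layout are assumptions:

import csv
import random

passage_index.add(passage_embeddings)

# Search a random sample of training queries for their top-RANK passages.
rows = random.sample(range(query_train_embeddings.shape[0]), SAMPLE_SIZE)
scores, neighbors = passage_index.search(query_train_embeddings[rows], RANK)

with open(TRAINING_DATA_PATH, mode="w", newline="") as output:
    writer = csv.writer(output)
    for row, hits, hit_scores in zip(rows, neighbors, scores):
        qid = query_train_mapping[row]
        for rank, (item, score) in enumerate(zip(hits, hit_scores), start=1):
            writer.writerow([qid, pid_mapping[item], rank, score])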