def grid_nrbp(p_forwards=[0.5, 0.9, 1], p_reverses=[0.5, 0.9, 1], ranks=[100]):
    x = []
    y = []
    data = []
    baseline_data = []
    opts = get_opts()
    # Need to specify the following arguments:
    #   reverse_ranker_path
    #   test_data_path
    #   device
    #   active_learning_stage
    true_dict, baseline_dict, result_dict, args_dict = testing(opts)
    active_learning = args_dict["active_learning"]
    network_type = args_dict["network_type"]
    num_query = args_dict["num_query"]
    num_passage = args_dict["num_passage"]
    for r in ranks:
        for p_forward in p_forwards:
            rating_dict = transform_ground_truth(true_dict, p_forward)
            for p_reverse in p_reverses:
                x.append(p_forward)
                y.append(p_reverse)
                baseline_nrbp = calculate_metrics(rating_dict, baseline_dict,
                                                  r, p_reverse)
                model_nrbp = calculate_metrics(rating_dict, result_dict,
                                               r, p_reverse)
                # data.append((model_nrbp - baseline_nrbp) / baseline_nrbp)
                data.append(model_nrbp)
                baseline_data.append(baseline_nrbp)
                print_message("Processed p_forward={}, p_reverse={}".format(
                    p_forward, p_reverse))
    # Write results to csv
    output_results = [active_learning, network_type, num_query, num_passage
                      ] + data + baseline_data
    with open(OUTPUT_PATH, mode='a+') as output:
        output_writer = csv.writer(output)
        output_writer.writerow(output_results)
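# A minimal usage sketch (the entry point below is hypothetical, not from the
# source): the default grids sweep 3 x 3 (p_forward, p_reverse) pairs at rank
# 100 and append one row of model NRBP scores followed by the baseline NRBP
# scores to OUTPUT_PATH.
if __name__ == "__main__":
    grid_nrbp(p_forwards=[0.5, 0.9, 1], p_reverses=[0.5, 0.9, 1], ranks=[100])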
active_learning_option = opts.active_learning_option
active_learning_stage = opts.active_learning_stage
device = opts.device
data_option = opts.data_option
N_QUERIES = opts.n_query
TRAIN_SIZE = opts.n_passage

if active_learning_option == "No":
    TRAINING_DATA_PATH = "/datadrive/ruohan/final_train_test_data/ance_training_rank{}_nqueries{}_npassages{}_{}.csv".format(
        RANK, N_QUERIES, TRAIN_SIZE, data_option)
    TEST_DATA_PATH = "/datadrive/ruohan/final_train_test_data/ance_testing_rank{}_nqueries{}_npassages{}_{}.csv".format(
        RANK, N_QUERIES, TEST_SIZE, data_option)
    print("Training data is stored at {}".format(TRAINING_DATA_PATH))
    print("Test data is stored at {}".format(TEST_DATA_PATH))
    print_message("Loading embeddings.")
    passage_embeddings = obj_reader(
        "/home/jianx/results/passage_0__emb_p__data_obj_0.pb")
    query_train_embeddings = obj_reader(
        "/home/jianx/results/query_0__emb_p__data_obj_0.pb")
else:
    reverse_ranker_path = opts.reverse_ranker_path
    reverse_ranker, network_type = load_model(reverse_ranker_path, device)
    passage_embeddings = obj_reader(
        "/home/jianx/results/passage_0__emb_p__data_obj_0.pb")
    query_train_embeddings = obj_reader(
        "/home/jianx/results/query_0__emb_p__data_obj_0.pb")
    passage_embeddings = transform_np_transformation(passage_embeddings,
                                                     reverse_ranker, device)
    query_train_embeddings = transform_np_transformation(
        query_train_embeddings, reverse_ranker, device)
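# transform_np_transformation is not defined in this snippet. A minimal sketch
# of what it plausibly does, given how it is called above (everything here is
# an assumption, including the batch size and that the reverse ranker maps an
# embedding batch to a transformed embedding batch): push the numpy embeddings
# through the model on `device` in batches and return a numpy array.
import numpy as np
import torch

def transform_np_transformation(embeddings, model, device, batch_size=4096):
    model.eval()
    chunks = []
    with torch.no_grad():
        for start in range(0, len(embeddings), batch_size):
            batch = torch.from_numpy(
                embeddings[start:start + batch_size]).to(device)
            chunks.append(model(batch).cpu().numpy())
    return np.concatenate(chunks, axis=0)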
import random

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import forward_ranker.load_data as load_data
from forward_ranker.utils import print_message

PASSAGE_DICT_PATH = "/datadrive/jianx/data/passages.dict"
QUERY_TRAIN_DICT_PATH = "/datadrive/jianx/data/queries_train.dict"
# Queries plain text: queries.train.tsv
# Passages plain text: collection.tsv
QUERIES_TEXT_PATH = "/datadrive/jianx/data/queries.train.tsv"
PASSAGES_TEXT_PATH = "/datadrive/jianx/data/collection.tsv"

obj_reader = load_data.obj_reader
obj_writer = load_data.obj_writer

print_message("Loading embeddings.")
passage_embeddings = obj_reader(
    "/home/jianx/results/passage_0__emb_p__data_obj_0.pb")
query_train_embeddings = obj_reader(
    "/home/jianx/results/query_0__emb_p__data_obj_0.pb")

all_embeddings = np.concatenate((passage_embeddings, query_train_embeddings),
                                axis=0)
print(all_embeddings.shape)

pca = PCA(n_components=50)
pca.fit(all_embeddings)
print("PCA Explained Variance: {}%".format(
    np.round(sum(pca.explained_variance_ratio_) * 100, 4)))
all_embeddings_pca = pca.transform(all_embeddings)
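# The concatenation above puts passages first, then queries, so the PCA output
# splits back on the passage count (a follow-up sketch, not from the source).
n_passages = passage_embeddings.shape[0]
passage_embeddings_pca = all_embeddings_pca[:n_passages]
query_train_embeddings_pca = all_embeddings_pca[n_passages:]
print(passage_embeddings_pca.shape, query_train_embeddings_pca.shape)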
TREE_SIZE = int(sys.argv[4])
TYPE = sys.argv[5]


def generate_annoy_index(embed_size, embeddings, raw_data_path):
    offset_mapping = obj_reader(raw_data_path)
    mapping = {v: k for k, v in offset_mapping.items()}
    index = AnnoyIndex(embed_size, 'dot')
    for i, value in enumerate(embeddings):
        index.add_item(i, value)
        if i % 1_000_000 == 0:
            print_message("Progress: " + str(i) + "/" + str(len(embeddings)) +
                          " " + str(i / len(embeddings)))
    return index, mapping


passage_embeddings = obj_reader(EMBEDDING_PATH)
print(passage_embeddings.shape)
passage_index, passage_mapping = generate_annoy_index(768, passage_embeddings,
                                                      RAW_PATH)
del passage_embeddings
obj_writer(passage_mapping,
           OUT_DIR + str(TREE_SIZE) + "_ance_" + TYPE + "_map.dict")
del passage_mapping
print_message("Start Building.")
passage_index.build(TREE_SIZE)
print_message("Finished Building.")
passage_index.save(OUT_DIR + str(TREE_SIZE) + "_" + TYPE + "_ance_index.ann")
print_message("Successfully Saved.")
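# A sketch of querying the saved artifacts later (query_vec is a hypothetical
# 768-dim vector; the metric must match the 'dot' used at build time).
from annoy import AnnoyIndex

index = AnnoyIndex(768, 'dot')
index.load(OUT_DIR + str(TREE_SIZE) + "_" + TYPE + "_ance_index.ann")
mapping = obj_reader(OUT_DIR + str(TREE_SIZE) + "_ance_" + TYPE + "_map.dict")
# Annoy returns its internal item ids; the saved mapping translates them back
# to corpus ids.
ids, scores = index.get_nns_by_vector(query_vec, 10, include_distances=True)
print([mapping[i] for i in ids])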
def generate_annoy_index(embed_size, embeddings):
    mapping = {}
    i = 0
    index = AnnoyIndex(embed_size, 'euclidean')
    for key, value in embeddings.items():
        index.add_item(i, value)
        mapping[i] = key
        i += 1
        if i % 50000 == 0:
            print_message("Progress: " + str(i) + "/" + str(len(embeddings)) +
                          " " + str(i / len(embeddings)))
    return index, mapping


print_message("Start Loading Embeddings")
PASSAGE_EMBEDDINGS = obj_reader(
    "/home/jianx/data/results/passage_embeddings.dict")
print_message("Embeddings Successfully Loaded")
PID_INDEX, PID_MAP = generate_annoy_index(EMBED_SIZE, PASSAGE_EMBEDDINGS)
del PASSAGE_EMBEDDINGS
print_message("Start Building.")
PID_INDEX.build(TREE_SIZE)
print_message("Finished Building.")
PID_INDEX.save("/home/jianx/data/annoy/" + str(TREE_SIZE) +
               "_passage_index.ann")
obj_writer(PID_MAP,
           "/home/jianx/data/annoy/" + str(TREE_SIZE) + "_pid_map.dict")
print_message("Successfully Saved.")
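# A quick sanity check while PID_INDEX is still in memory (a sketch, not from
# the source): under 'euclidean', item 0 should come back as its own nearest
# neighbour.
neighbour_ids = PID_INDEX.get_nns_by_item(0, 5)
print([PID_MAP[i] for i in neighbour_ids])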
sys.path.insert(0, '/home/jianx/search-exposure/')
import faiss
import forward_ranker.load_data as load_data
from forward_ranker.utils import print_message

obj_reader = load_data.obj_reader
obj_writer = load_data.obj_writer

IS_FLAT = False
BATCH_SIZE = 100
NLIST = 100
RANK = 100
OUT_PATH = "/datadrive/jianx/data/results/all_search_rankings_{}_{}_{}.csv".format(
    NLIST, RANK, "flat" if IS_FLAT else "approximate")

print_message("Loading embeddings.")
passage_embeddings = obj_reader(
    "/home/jianx/results/passage_0__emb_p__data_obj_0.pb")
query_train_embeddings = obj_reader(
    "/home/jianx/results/query_0__emb_p__data_obj_0.pb")
query_train_mapping = obj_reader(
    "/datadrive/jianx/data/annoy/100_ance_query_train_map.dict")
pid_mapping = obj_reader(
    "/datadrive/jianx/data/annoy/100_ance_passage_map.dict")

print_message("Building index")
faiss.omp_set_num_threads(16)
dim = passage_embeddings.shape[1]
if IS_FLAT:
    cpu_index = faiss.IndexFlatIP(dim)
else:
    # The snippet is truncated here; the standard FAISS construction for an
    # approximate inner-product index with NLIST coarse cells would be:
    quantizer = faiss.IndexFlatIP(dim)
    cpu_index = faiss.IndexIVFFlat(quantizer, dim, NLIST,
                                   faiss.METRIC_INNER_PRODUCT)
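# A sketch of the steps that typically follow (the batching loop, the nprobe
# value, and the CSV layout are illustrative assumptions, not from the source;
# it also assumes FAISS ids follow insertion order and line up with the
# positions in pid_mapping / query_train_mapping).
import csv

if not IS_FLAT:
    cpu_index.train(passage_embeddings)  # learn the NLIST coarse cells
    cpu_index.nprobe = 10                # cells probed per query (assumed)
cpu_index.add(passage_embeddings)

with open(OUT_PATH, "w", newline="") as out:
    writer = csv.writer(out)
    for start in range(0, len(query_train_embeddings), BATCH_SIZE):
        batch = query_train_embeddings[start:start + BATCH_SIZE]
        scores, ids = cpu_index.search(batch, RANK)
        for row in range(len(batch)):
            qid = query_train_mapping[start + row]
            for rank in range(RANK):
                writer.writerow([qid, pid_mapping[ids[row][rank]],
                                 rank + 1, scores[row][rank]])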
sys.path.insert(0, '/home/jianx/search-exposure/')
import faiss
import numpy as np
import forward_ranker.load_data as load_data
from forward_ranker.utils import print_message

obj_reader = load_data.obj_reader
obj_writer = load_data.obj_writer

SAMPLE_SIZE = 1000
RANK = 100
TRAINING_DATA_PATH = "/datadrive/jianx/data/train_data/ance_training_rank{}_{}.csv".format(
    RANK, SAMPLE_SIZE)

print_message("Loading embeddings.")
passage_embeddings = obj_reader(
    "/home/jianx/results/passage_0__emb_p__data_obj_0.pb")
query_train_embeddings = obj_reader(
    "/home/jianx/results/query_0__emb_p__data_obj_0.pb")
query_train_mapping = obj_reader(
    "/datadrive/jianx/data/annoy/100_ance_query_train_map.dict")
pid_mapping = obj_reader(
    "/datadrive/jianx/data/annoy/100_ance_passage_map.dict")
pid_offset = obj_reader(
    "/datadrive/data/preprocessed_data_with_test/pid2offset.pickle")

print_message("Building index")
faiss.omp_set_num_threads(16)
dim = passage_embeddings.shape[1]
passage_index = faiss.IndexFlatIP(dim)
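# A sketch of a plausible next step (assumptions throughout: FAISS ids follow
# insertion order, so inverting pid2offset recovers pids, and SAMPLE_SIZE
# queries are drawn at random for the training CSV).
import random

offset_to_pid = {v: k for k, v in pid_offset.items()}
passage_index.add(passage_embeddings)
sampled = random.sample(range(len(query_train_embeddings)), SAMPLE_SIZE)
scores, ids = passage_index.search(query_train_embeddings[sampled], RANK)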