def select_device(gpu_root):
    """Pick the least-loaded CUDA device, tracked via a shared usage file.

    The file at ``gpu_root`` holds a list of per-GPU usage counters
    (created as four zeros if missing).  The GPU with the smallest
    counter is chosen, its counter is incremented, and the list is
    written back.

    Args:
        gpu_root: Path of the pickled usage-counter list.

    Returns:
        (gpu_index, device): ``gpu_index`` is the chosen CUDA ordinal,
        or -1 when CUDA is unavailable and the CPU device is returned.

    NOTE(review): read-modify-write of the usage file is not locked, so
    two concurrent processes can race and pick the same GPU — confirm
    whether callers serialize access.
    """
    if not torch.cuda.is_available():
        # Sentinel index -1 signals "no GPU" to callers.
        return -1, torch.device("cpu")

    # Bootstrap the shared counter file on first use (assumes 4 GPUs).
    if not os.path.exists(gpu_root):
        obj_writer([0, 0, 0, 0], gpu_root)
    gpu_usage_list = obj_reader(gpu_root)

    # Builtin min over indices replaces the manual scan that shadowed
    # the `min` builtin and relied on a magic 100000 sentinel.
    argmin = min(range(len(gpu_usage_list)), key=gpu_usage_list.__getitem__)

    gpu_usage_list[argmin] += 1
    obj_writer(gpu_usage_list, gpu_root)
    return argmin, torch.device("cuda:" + str(argmin))
# Round-trip the pickled passage-embedding dictionary: load it with the
# raw pickle module, then write it back to the same path through the
# project's obj_writer so it is stored in the standard serialized form.
from load_data import obj_writer
import pickle

EMBEDDINGS_PATH = "/home/jianx/data/results/passage_embeddings.dict"

with open(EMBEDDINGS_PATH, 'rb') as handle:
    embeddings = pickle.load(handle)

obj_writer(embeddings, EMBEDDINGS_PATH)
"/" + str(len(pids_2d))) cosine_sim_calc = torch.nn.CosineSimilarity(dim=-1) passage_batch_embeddings = [] for pid in batch: passage_batch_embeddings.append( torch.FloatTensor(passage_embeddings[pid])) passage_batch_embedding_tensor = torch.stack( passage_batch_embeddings, dim=0).unsqueeze(dim=0).to(DEVICE) sim = cosine_sim_calc(query_embedding_tensor, passage_batch_embedding_tensor) cosine_similarities_tensor = torch.cat([cosine_similarities_tensor, sim], dim=1) print_message("Finished calculating cosine similarities.") result_dict = {} pids_tensor = torch.tensor(pids) for i, qid in enumerate(qids): print_message("Processing query: " + str(qid) + " No." + str(i + 1) + "/" + str(len(qids))) scores = cosine_similarities_tensor[i].cpu() sorted_score = scores.sort(descending=True)[:1000] score_ids = scores.argsort(descending=True)[:1000] q_results = dict( zip(pids_tensor[score_ids].tolist(), sorted_score.tolist())) print_message("Query ") result_dict[qid] = q_results print_message("Saving result dictionary.") obj_writer(result_dict, "/datadrive/brute_search_all_result.dict") print_message("Successfully saved.")
# For every passage, check whether the passages retrieved for its nearest
# queries include the passage itself, and record at which rank.
# Relies on earlier-in-file state: `counter`, `rankings`, the annoy
# indexes, `NET`, `DEVICE`, and the id mappings — TODO confirm against
# the preceding chunk of this script.
for pid, embedding in passage_embeddings.items():
    print_message("Processing passage No. " + str(counter) + "/" + str(NUM_OF_DOCUMENTS))
    # Queries whose embedding is closest to this passage's embedding.
    nearest_queries = query_index.get_nns_by_vector(embedding, NUM_OF_NEAREST_QUERIES)
    matching_list = []
    for i, annoy_qid in enumerate(nearest_queries):
        qid = qid_mapping[annoy_qid]
        # Re-embed the query with the model and fetch its top-K passages.
        top_list = passage_index.get_nns_by_vector(
            NET(generate_sparse(query_train_dict[qid]).to(DEVICE)).detach(),
            TOP_K_RANKING)
        is_matched = False
        for j, annoy_pid in enumerate(top_list):
            if pid_mapping[annoy_pid] == pid:
                # 1-based rank of this passage in the query's result list.
                matching_list.append(j + 1)
                is_matched = True
                break
        if not is_matched:
            # 0 marks "passage not found in the query's top-K".
            matching_list.append(0)
    rankings.append(matching_list)
    counter += 1

rankings_array = np.array(rankings)
obj_writer(rankings_array, "/home/jianx/data/train_data/test_rankings_10000.np")
# Nonzero entries are matches, so this is the average number of matched
# queries per passage.
print_message("Avg No. of matching: " + str(np.count_nonzero(rankings_array) / len(rankings_array)))
# Mean rank per passage, averaged only over its matched queries; passages
# with no matches contribute 0 to the sum but still count in the final
# division by len(rankings_array).
mean_rank = 0
for matching_array in rankings_array:
    if np.count_nonzero(matching_array) != 0:
        mean_rank += np.sum(matching_array) / np.count_nonzero(matching_array)
mean_rank /= len(rankings_array)
print_message("Avg rank of each document: " + str(mean_rank))
def cleanup_gpu_list(current_gpu, gpu_root):
    """Release one usage slot for ``current_gpu`` in the shared counter file.

    Reads the per-GPU usage list from ``gpu_root``, decrements the entry
    for the given GPU index, and persists the updated list.
    """
    usage = obj_reader(gpu_root)
    usage[current_gpu] = usage[current_gpu] - 1
    obj_writer(usage, gpu_root)
def generate_collection_embedding(net, passage_dict, device=torch.device("cuda")):
    """Embed every non-empty passage in ``passage_dict`` with ``net``.

    Args:
        net: Model mapping a sparse passage representation to an embedding.
        passage_dict: Mapping of passage id -> raw passage representation
            (whatever ``generate_sparse`` accepts — TODO confirm schema).
        device: Device the sparse input is moved to before the forward pass.

    Returns:
        dict mapping each key with a non-empty value to its embedding as a
        plain Python list (detached from the graph).
    """
    embedding_dict = {}
    counter = 0
    for key, value in passage_dict.items():
        # Empty passages are skipped entirely — they get no entry.
        if len(value) != 0:
            embedding_dict[key] = net(
                generate_sparse(value).to(device)).detach().tolist()
        # Progress heartbeat every 10000 items.
        # NOTE(review): nesting reconstructed from a newline-stripped
        # source; the progress/increment may originally have sat inside
        # the non-empty branch — confirm against the original file.
        if counter % 10000 == 0:
            print("Generating embeddings: " + str(counter) + "/" + str(len(passage_dict)))
        counter += 1
    return embedding_dict


if __name__ == '__main__':
    # CLI: model checkpoint path, pickled passage dict path, output path.
    model_path = sys.argv[1]
    target_dict = sys.argv[2]
    save_path = sys.argv[3]
    model = network.DSSM(embed_size=EMBED_SIZE)
    model.load_state_dict(torch.load(model_path))
    model.to(DEVICE)
    model.eval()  # inference only — disable dropout/batch-norm updates
    print("Reading target dictionary.")
    passages = obj_reader(target_dict)
    embedding = generate_collection_embedding(model, passages)
    print("Saving embeddings dictionary")
    obj_writer(embedding, save_path)