Example no. 1
0
def select_device(gpu_root):
    """Pick the least-used GPU, or fall back to the CPU when CUDA is absent.

    Reads a per-GPU usage-count list persisted at *gpu_root*, chooses the GPU
    with the lowest count, increments that count, and writes the list back.

    Args:
        gpu_root: path to the pickled usage-count list; created with four
            zeroed slots if it does not exist yet (assumes a 4-GPU machine —
            TODO confirm against deployment hosts).

    Returns:
        (argmin, device): index of the chosen GPU and the torch.device.
        ``argmin`` is -1 when falling back to the CPU.
    """
    argmin = -1
    if not torch.cuda.is_available():
        device = torch.device("cpu")
    else:
        if not os.path.exists(gpu_root):
            obj_writer([0, 0, 0, 0], gpu_root)
        gpu_usage_list = obj_reader(gpu_root)
        # min() over indices replaces the original hand-rolled search that
        # shadowed the built-in `min`; ties resolve to the lowest index,
        # exactly as before.
        argmin = min(range(len(gpu_usage_list)), key=gpu_usage_list.__getitem__)
        gpu_usage_list[argmin] += 1
        device = torch.device("cuda:" + str(argmin))
        obj_writer(gpu_usage_list, gpu_root)
    return argmin, device
from load_data import obj_writer
import pickle

with open("/home/jianx/data/results/passage_embeddings.dict", 'rb') as handle:
    dictionary = pickle.load(handle)
obj_writer(dictionary, "/home/jianx/data/results/passage_embeddings.dict")
Example no. 3
0
                  "/" + str(len(pids_2d)))
    cosine_sim_calc = torch.nn.CosineSimilarity(dim=-1)
    passage_batch_embeddings = []
    for pid in batch:
        passage_batch_embeddings.append(
            torch.FloatTensor(passage_embeddings[pid]))
    passage_batch_embedding_tensor = torch.stack(
        passage_batch_embeddings, dim=0).unsqueeze(dim=0).to(DEVICE)
    sim = cosine_sim_calc(query_embedding_tensor,
                          passage_batch_embedding_tensor)
    cosine_similarities_tensor = torch.cat([cosine_similarities_tensor, sim],
                                           dim=1)

print_message("Finished calculating cosine similarities.")
result_dict = {}
pids_tensor = torch.tensor(pids)
for i, qid in enumerate(qids):
    print_message("Processing query: " + str(qid) + " No." + str(i + 1) + "/" +
                  str(len(qids)))
    scores = cosine_similarities_tensor[i].cpu()
    sorted_score = scores.sort(descending=True)[:1000]
    score_ids = scores.argsort(descending=True)[:1000]
    q_results = dict(
        zip(pids_tensor[score_ids].tolist(), sorted_score.tolist()))
    print_message("Query ")
    result_dict[qid] = q_results

print_message("Saving result dictionary.")
obj_writer(result_dict, "/datadrive/brute_search_all_result.dict")
print_message("Successfully saved.")
Example no. 4
0
# For each passage, look up its nearest queries and record, per query, the
# 1-based rank at which the passage reappears in that query's top results
# (0 when it does not appear at all).
for pid, embedding in passage_embeddings.items():
    print_message("Processing passage No. " + str(counter) + "/" + str(NUM_OF_DOCUMENTS))
    nearest_queries = query_index.get_nns_by_vector(embedding, NUM_OF_NEAREST_QUERIES)

    matching_list = []
    for annoy_qid in nearest_queries:
        qid = qid_mapping[annoy_qid]
        top_list = passage_index.get_nns_by_vector(NET(generate_sparse(query_train_dict[qid]).to(DEVICE)).detach(),
                                                   TOP_K_RANKING)
        # First position (1-based) where this pid occurs, defaulting to 0.
        rank = next((j + 1 for j, annoy_pid in enumerate(top_list)
                     if pid_mapping[annoy_pid] == pid), 0)
        matching_list.append(rank)
    rankings.append(matching_list)
    counter += 1

rankings_array = np.array(rankings)
obj_writer(rankings_array, "/home/jianx/data/train_data/test_rankings_10000.np")

print_message("Avg No. of matching: " + str(np.count_nonzero(rankings_array) / len(rankings_array)))
# Mean rank of matched entries per document, averaged over ALL documents:
# documents with no matches contribute nothing to the sum but still count
# in the denominator (same semantics as the original accumulation loop).
per_doc_means = [np.sum(row) / np.count_nonzero(row)
                 for row in rankings_array if np.count_nonzero(row) != 0]
mean_rank = sum(per_doc_means) / len(rankings_array)
print_message("Avg rank of each document: " + str(mean_rank))
Example no. 5
0
def cleanup_gpu_list(current_gpu, gpu_root):
    """Release one usage slot for *current_gpu* in the list stored at *gpu_root*."""
    usage_counts = obj_reader(gpu_root)
    usage_counts[current_gpu] = usage_counts[current_gpu] - 1
    obj_writer(usage_counts, gpu_root)
def generate_collection_embedding(net,
                                  passage_dict,
                                  device=torch.device("cuda")):
    """Embed every non-empty passage in *passage_dict* with *net*.

    Args:
        net: callable model mapping a sparse input tensor to an embedding
            tensor (only called for non-empty passage values).
        passage_dict: mapping of passage id -> raw passage value; entries
            whose value has length 0 are skipped entirely.
        device: torch device the sparse inputs are moved to before the
            forward pass (default: CUDA).

    Returns:
        dict mapping passage id -> embedding as a plain Python list.
    """
    embedding_dict = {}
    # enumerate() replaces the hand-maintained counter of the original
    # version; progress is still printed every 10000 items (including item 0).
    for counter, (key, value) in enumerate(passage_dict.items()):
        if len(value) != 0:
            embedding_dict[key] = net(
                generate_sparse(value).to(device)).detach().tolist()
        if counter % 10000 == 0:
            print("Generating embeddings: " + str(counter) + "/" +
                  str(len(passage_dict)))
    return embedding_dict


if __name__ == '__main__':
    # CLI: <model checkpoint path> <pickled passage dict path> <output path>
    checkpoint_path = sys.argv[1]
    passages_path = sys.argv[2]
    output_path = sys.argv[3]
    dssm = network.DSSM(embed_size=EMBED_SIZE)
    dssm.load_state_dict(torch.load(checkpoint_path))
    dssm.to(DEVICE)
    dssm.eval()
    print("Reading target dictionary.")
    passage_dict = obj_reader(passages_path)
    embeddings = generate_collection_embedding(dssm, passage_dict)
    print("Saving embeddings dictionary")
    obj_writer(embeddings, output_path)