def test_merge_subsentences(self):
    test_full_emb_file = tempfile.NamedTemporaryFile()
    test_merged_emb_file = tempfile.NamedTemporaryFile()
    gold_merged_emb_file = tempfile.NamedTemporaryFile()
    cache_folder = tempfile.TemporaryDirectory()

    num_examples = 3
    total_num_mentions = 7
    M = 3
    K = 2
    hidden_size = 2

    # create full embedding file
    storage_type_full = np.dtype(
        [
            ("M", int),
            ("K", int),
            ("hidden_size", int),
            ("sent_idx", int),
            ("subsent_idx", int),
            ("alias_list_pos", int, M),
            ("entity_emb", float, M * hidden_size),
            ("final_loss_true", int, M),
            ("final_loss_pred", int, M),
            ("final_loss_prob", float, M),
            ("final_loss_cand_probs", float, M * K),
        ]
    )
    full_emb = np.memmap(
        test_full_emb_file.name,
        dtype=storage_type_full,
        mode="w+",
        shape=(num_examples,),
    )

    # 2 sentences, 1st sent has 1 subsentence, 2nd sentence has 2 subsentences
    # first sentence
    full_emb["hidden_size"] = hidden_size
    full_emb["M"] = M
    full_emb["K"] = K
    full_emb[0]["sent_idx"] = 0
    full_emb[0]["subsent_idx"] = 0
    # last alias is padded
    full_emb[0]["alias_list_pos"] = np.array([0, 1, -1])
    full_emb[0]["final_loss_true"] = np.array([0, 1, -1])
    # entity embs are flattened
    full_emb[0]["entity_emb"] = np.array([0, 1, 2, 3, 0, 0])

    full_emb[1]["sent_idx"] = 1
    full_emb[1]["subsent_idx"] = 0
    full_emb[1]["alias_list_pos"] = np.array([0, 1, 2])
    # last alias goes with next subsentence
    full_emb[1]["final_loss_true"] = np.array([1, 1, -1])
    full_emb[1]["entity_emb"] = np.array([4, 5, 6, 7, 8, 9])

    full_emb[2]["sent_idx"] = 1
    full_emb[2]["subsent_idx"] = 1
    full_emb[2]["alias_list_pos"] = np.array([2, 3, 4])
    full_emb[2]["final_loss_true"] = np.array([1, 1, 1])
    full_emb[2]["entity_emb"] = np.array([10, 11, 12, 13, 14, 15])

    # create merged embedding file
    storage_type_merged = np.dtype(
        [
            ("hidden_size", int),
            ("sent_idx", int),
            ("alias_list_pos", int),
            ("entity_emb", float, hidden_size),
            ("final_loss_pred", int),
            ("final_loss_prob", float),
            ("final_loss_cand_probs", float, K),
        ]
    )
    merged_emb_gold = np.memmap(
        gold_merged_emb_file.name,
        dtype=storage_type_merged,
        mode="w+",
        shape=(total_num_mentions,),
    )
    merged_emb_gold["entity_emb"] = np.array(
        [[0, 1], [2, 3], [4, 5], [6, 7], [10, 11], [12, 13], [14, 15]]
    )

    # create data file -- just needs aliases and sentence indices
    data = [
        {"aliases": ["a", "b"], "sent_idx_unq": 0},
        {"aliases": ["c", "d", "e", "f", "g"], "sent_idx_unq": 1},
    ]
    # Keys are strings for the trie
    sent_idx2num_mentions = {"0": 2, "1": 5}

    temp_file = tempfile.NamedTemporaryFile(delete=False).name
    with jsonlines.open(temp_file, "w") as f:
        for row in data:
            f.write(row)

    # assert that output of merge_subsentences is correct
    num_processes = 1
    eval_utils.merge_subsentences(
        num_processes,
        sent_idx2num_mentions,
        cache_folder.name,
        test_merged_emb_file.name,
        storage_type_merged,
        test_full_emb_file.name,
        storage_type_full,
        dump_embs=True,
    )
    bootleg_merged_emb = np.memmap(
        test_merged_emb_file.name, dtype=storage_type_merged, mode="r+"
    )
    merged_emb_gold = np.memmap(
        gold_merged_emb_file.name, dtype=storage_type_merged, mode="r+"
    )
    assert len(bootleg_merged_emb) == total_num_mentions
    for i in range(len(bootleg_merged_emb)):
        assert np.array_equal(
            bootleg_merged_emb[i]["entity_emb"], merged_emb_gold[i]["entity_emb"]
        )

    # Try with multiprocessing
    num_processes = 5
    eval_utils.merge_subsentences(
        num_processes,
        sent_idx2num_mentions,
        cache_folder.name,
        test_merged_emb_file.name,
        storage_type_merged,
        test_full_emb_file.name,
        storage_type_full,
        dump_embs=True,
    )
    bootleg_merged_emb = np.memmap(
        test_merged_emb_file.name, dtype=storage_type_merged, mode="r+"
    )
    merged_emb_gold = np.memmap(
        gold_merged_emb_file.name, dtype=storage_type_merged, mode="r+"
    )
    assert len(bootleg_merged_emb) == total_num_mentions
    for i in range(len(bootleg_merged_emb)):
        assert np.array_equal(
            bootleg_merged_emb[i]["entity_emb"], merged_emb_gold[i]["entity_emb"]
        )

    # clean up
    if os.path.exists(temp_file):
        os.remove(temp_file)
    test_full_emb_file.close()
    test_merged_emb_file.close()
    gold_merged_emb_file.close()
    cache_folder.cleanup()