import os

import ujson

from bootleg.symbols.entity_symbols import EntitySymbols


# parse_args is defined elsewhere in this script.
def main():
    args = parse_args()
    # Load the alias -> candidate map and the QID -> title map
    with open(args.alias2qids) as f:
        alias2qids = ujson.load(f)
    with open(args.qid2title) as f:
        qid2title = ujson.load(f)
    for alias in alias2qids:
        assert (
            alias.lower() == alias
        ), f"bootleg assumes lowercase aliases in alias candidate maps: {alias}"
        # Keep only the top max_candidates candidates per alias,
        # sorted by score (then QID) in descending order
        qids = sorted(alias2qids[alias], key=lambda x: (x[1], x[0]), reverse=True)
        alias2qids[alias] = qids[: args.max_candidates]
    entity_mappings = EntitySymbols(
        max_candidates=args.max_candidates,
        alias2qids=alias2qids,
        qid2title=qid2title,
        alias_cand_map_file=args.alias_cand_map_file,
    )
    entity_mappings.save(os.path.join(args.entity_dir, args.entity_map_dir))
    print("Entity mappings exported.")
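# Illustrative sketch (not part of the original script): how the candidate
# truncation in main() behaves, assuming max_candidates=2. Candidates are
# sorted by score (then QID) in descending order before being cut to the
# top max_candidates. The alias and QIDs below are hypothetical.
def _truncate_candidates_example():
    alias2qids = {"some alias": [["Q1", 4.0], ["Q2", 10.0], ["Q3", 7.0]]}
    max_candidates = 2
    for alias in alias2qids:
        qids = sorted(alias2qids[alias], key=lambda x: (x[1], x[0]), reverse=True)
        alias2qids[alias] = qids[:max_candidates]
    # alias2qids is now {"some alias": [["Q2", 10.0], ["Q3", 7.0]]}
    return alias2qids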
def test_create_entities(self):
    truealias2qids = {
        "alias1": [["Q1", 10.0], ["Q4", 6]],
        "multi word alias2": [["Q2", 5.0], ["Q1", 3], ["Q4", 2]],
        "alias3": [["Q1", 30.0]],
        "alias4": [["Q4", 20], ["Q3", 15.0], ["Q2", 1]],
    }
    trueqid2title = {
        "Q1": "alias1",
        "Q2": "multi alias2",
        "Q3": "word alias3",
        "Q4": "nonalias4",
    }
    # The non-candidate class takes eid 0, so entity ids in the dump start at 1
    trueqid2eid = {"Q1": 1, "Q2": 2, "Q3": 3, "Q4": 4}
    truealias2id = {"alias1": 0, "alias3": 1, "alias4": 2, "multi word alias2": 3}
    truealiastrie = {"multi word alias2": 0, "alias1": 1, "alias3": 2, "alias4": 3}
    entity_symbols = EntitySymbols(
        max_candidates=3,
        alias2qids=truealias2qids,
        qid2title=trueqid2title,
    )
    tri_as_dict = {}
    for k in entity_symbols._alias_trie:
        tri_as_dict[k] = entity_symbols._alias_trie[k]
    self.assertEqual(entity_symbols.max_candidates, 3)
    self.assertEqual(entity_symbols.max_eid, 4)
    self.assertEqual(entity_symbols.max_alid, 3)
    self.assertDictEqual(entity_symbols._alias2qids, truealias2qids)
    self.assertDictEqual(entity_symbols._qid2title, trueqid2title)
    self.assertDictEqual(entity_symbols._qid2eid, trueqid2eid)
    self.assertDictEqual(tri_as_dict, truealiastrie)
    self.assertDictEqual(entity_symbols._alias2id, truealias2id)
    self.assertIsNone(entity_symbols._qid2aliases)

    # Test save/load round trip from a dump
    temp_save_dir = "test/data/entity_loader_test"
    entity_symbols.save(temp_save_dir)
    entity_symbols = EntitySymbols.load_from_cache(temp_save_dir)
    tri_as_dict = {}
    for k in entity_symbols._alias_trie:
        tri_as_dict[k] = entity_symbols._alias_trie[k]
    self.assertEqual(entity_symbols.max_candidates, 3)
    self.assertEqual(entity_symbols.max_eid, 4)
    self.assertEqual(entity_symbols.max_alid, 3)
    self.assertDictEqual(entity_symbols._alias2qids, truealias2qids)
    self.assertDictEqual(entity_symbols._qid2title, trueqid2title)
    self.assertDictEqual(entity_symbols._qid2eid, trueqid2eid)
    self.assertDictEqual(tri_as_dict, truealiastrie)
    self.assertDictEqual(entity_symbols._alias2id, truealias2id)
    self.assertIsNone(entity_symbols._qid2aliases)
    shutil.rmtree(temp_save_dir)

    # Test edit mode, which builds the reverse QID -> aliases map
    entity_symbols = EntitySymbols(
        max_candidates=3,
        alias2qids=truealias2qids,
        qid2title=trueqid2title,
        edit_mode=True,
    )
    trueqid2aliases = {
        "Q1": {"alias1", "multi word alias2", "alias3"},
        "Q2": {"multi word alias2", "alias4"},
        "Q3": {"alias4"},
        "Q4": {"alias1", "multi word alias2", "alias4"},
    }
    self.assertDictEqual(entity_symbols._qid2aliases, trueqid2aliases)
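# Illustrative sketch (not part of the original test): the trueqid2aliases
# fixture used for edit mode above is simply the inverse of truealias2qids,
# mapping each candidate QID back to the set of aliases that list it. The
# helper name below is hypothetical.
def _invert_alias2qids_example(alias2qids):
    qid2aliases = {}
    for alias, cands in alias2qids.items():
        for qid, _score in cands:
            qid2aliases.setdefault(qid, set()).add(alias)
    return qid2aliases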
import glob
import json
import multiprocessing
import os
import shutil
import time

from bootleg.symbols.entity_symbols import EntitySymbols
from bootleg.utils import utils


# get_arg_parser and merge_data are defined elsewhere in this script.
def main():
    gl_start = time.time()
    multiprocessing.set_start_method("spawn")
    args = get_arg_parser().parse_args()
    print(json.dumps(vars(args), indent=4))
    utils.ensure_dir(args.data_dir)
    out_dir = os.path.join(args.data_dir, args.out_subdir)
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir, exist_ok=True)

    # Read in the data files and their matching contextual candidate files
    in_files_train = glob.glob(os.path.join(args.data_dir, "*.jsonl"))
    in_files_cand = glob.glob(os.path.join(args.contextual_cand_data, "*.jsonl"))
    assert (
        len(in_files_train) > 0
    ), f"We didn't find any train files at {args.data_dir}"
    assert (
        len(in_files_cand) > 0
    ), f"We didn't find any contextual files at {args.contextual_cand_data}"
    in_files = []
    for file in in_files_train:
        file_name = os.path.basename(file)
        tag = os.path.splitext(file_name)[0]
        is_train = "train" in tag
        if is_train:
            print(f"{file_name} is a training dataset...will be processed as such")
        # Pair each data file with the contextual candidate file sharing its tag
        pair = None
        for f in in_files_cand:
            if tag in f:
                pair = f
                break
        assert pair is not None, f"No contextual candidate file found for {file_name} (tag {tag})"
        out_file = os.path.join(out_dir, file_name)
        in_files.append([file, pair, out_file, is_train])

    final_cand_map = {}
    max_cands = 0
    for pair in in_files:
        print(f"Reading in {pair[0]} with cand maps {pair[1]} and dumping to {pair[2]}")
        new_alias2qids = merge_data(
            args.processes,
            args.train_in_candidates,
            args.keep_orig,
            args.max_candidates,
            pair,
            args.entity_dump,
        )
        for al in new_alias2qids:
            assert al not in final_cand_map, f"{al} is already in final_cand_map"
            final_cand_map[al] = new_alias2qids[al]
            max_cands = max(max_cands, len(final_cand_map[al]))

    print("Building new entity symbols")
    entity_dump = EntitySymbols.load_from_cache(load_dir=args.entity_dump)
    entity_dump_new = EntitySymbols(
        max_candidates=max_cands,
        alias2qids=final_cand_map,
        qid2title=entity_dump.get_qid2title(),
    )
    out_dir = os.path.join(out_dir, "entity_db/entity_mappings")
    entity_dump_new.save(out_dir)
    print(f"Finished in {time.time() - gl_start}s")
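# Illustrative sketch (not part of the original script): assuming the merged
# candidate map was saved under <data_dir>/<out_subdir>/entity_db/entity_mappings,
# the new entity symbols can be reloaded for inspection the same way the
# script loads args.entity_dump above. The directory argument is hypothetical.
def _reload_merged_symbols_example(out_dir):
    entity_symbols = EntitySymbols.load_from_cache(load_dir=out_dir)
    print(entity_symbols.max_candidates)
    return entity_symbols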