import joblib
from marisa_trie import RecordTrie, Trie


def load(in_file, mmap_mode='r'):
    # Deserialize the saved state (memory-mapped by default) and rebuild the
    # title and redirect tries from their raw byte representations.
    obj = joblib.load(in_file, mmap_mode=mmap_mode)
    title_dict = Trie()
    redirect_dict = RecordTrie('<I')
    title_dict.frombytes(obj['title_dict'])
    redirect_dict.frombytes(obj['redirect_dict'])
    return EntityDB(title_dict, redirect_dict, obj['inlink_arr'])
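# Minimal usage sketch; the path below is hypothetical. mmap_mode='r' memory-maps
# any NumPy arrays in the file (such as the inlink array) instead of copying
# them into RAM.
entity_db = load('entity_db.joblib', mmap_mode='r')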
def load(input):
    # Accept either an already-deserialized dict or a path to a joblib file.
    if isinstance(input, dict):
        obj = input
    else:
        obj = joblib.load(input)
    dic = Trie()
    dic.frombytes(obj['dic'])
    return WordVocab(dic, obj['lowercase'], obj.get('start_index', 0))
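# Usage sketch showing both accepted inputs; the path is a hypothetical example.
obj = joblib.load('word_vocab.joblib')
vocab_from_path = load('word_vocab.joblib')  # restore from a file on disk
vocab_from_dict = load(obj)                  # restore from an in-memory dict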
def load(in_file: str, mmap_mode="r"):
    # frombytes() returns the trie it populated, so the reassignment below
    # swaps the raw bytes in `data` for a usable Trie before unpacking.
    data = joblib.load(in_file, mmap_mode=mmap_mode)
    title_trie = Trie()
    title_trie = title_trie.frombytes(data["title_trie"])
    data["title_trie"] = title_trie
    return InterwikiDB(**data)
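# Usage sketch with a hypothetical path; the remaining entries of `data` are
# forwarded to the InterwikiDB constructor unchanged.
interwiki_db = load("interwiki.joblib")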
import numpy as np
import torch


def load(target, device, mmap=True):
    word_dict = Trie()
    entity_dict = Trie()
    redirect_dict = RecordTrie("<I")

    # `target` may be a path to a joblib file or an already-loaded dict;
    # memory-mapping keeps large arrays on disk until they are touched.
    if not isinstance(target, dict):
        if mmap:
            target = joblib.load(target, mmap_mode="r")
        else:
            target = joblib.load(target)

    word_dict.frombytes(target["word_dict"])
    entity_dict.frombytes(target["entity_dict"])
    redirect_dict.frombytes(target["redirect_dict"])

    word_stats = target["word_stats"]
    entity_stats = target["entity_stats"]
    # If the stats are still raw bytes, decode them into (N, 2) int32 arrays
    # and copy them onto the requested device as tensors.
    if not isinstance(word_stats, np.ndarray):
        word_stats = np.frombuffer(word_stats, dtype=np.int32).reshape(-1, 2)
        word_stats = torch.tensor(word_stats, device=device, requires_grad=False)
        entity_stats = np.frombuffer(entity_stats, dtype=np.int32).reshape(-1, 2)
        entity_stats = torch.tensor(entity_stats, device=device, requires_grad=False)

    return Wikipedia2VecDict(
        word_dict,
        entity_dict,
        redirect_dict,
        word_stats,
        entity_stats,
        **target["meta"],
    )
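# Usage sketch with a hypothetical path. With mmap=True, NumPy arrays stored in
# the file are memory-mapped rather than copied into RAM, which matters for
# multi-gigabyte Wikipedia dumps.
w2v_dict = load("wikipedia2vec_dict.joblib", device=torch.device("cpu"), mmap=True)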
from wikipedia2vec.dictionary import Dictionary
from wikipedia2vec.dump_db import DumpDB

dictionary = Dictionary.load(args.src)
dumpdb = DumpDB(args.dumpdb)

# Strip trailing newlines so the titles match the redirect and dictionary
# entries below (readlines() would keep the '\n' and break every lookup).
with open(args.entity_file) as handle:
    all_needed_entities_raw = set(line.rstrip('\n') for line in handle)

# Resolve redirects so every requested title points at its canonical page.
title2dest_title = dict(dumpdb.redirects())
all_needed_entities = set(
    title2dest_title.get(title, title) for title in all_needed_entities_raw
)

src_file = joblib.load(args.src)
old_word_dict = Trie()
old_word_dict.frombytes(src_file['word_dict'])
old_word_stats = src_file['word_stats']
old_entity_stats = src_file['entity_stats']

all_old_entities = [ent.title for ent in dictionary.entities()]
all_old_entities_set = set(all_old_entities)
all_new_entities = sorted(
    ent for ent in all_needed_entities if ent not in all_old_entities_set
)

# Extend the entity stats with a [5, 5] placeholder row per new entity,
# keeping the dtype of the existing stats array.
joint_entity_stats = np.concatenate([
    old_entity_stats,
    np.array([[5, 5] for _ in all_new_entities]).astype(old_entity_stats.dtype),
])
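# Worked sketch of the stats concatenation with made-up numbers: two existing
# entities keep their counts, and each new entity gets the [5, 5] placeholder
# row. In wikipedia2vec's format the two columns are presumably
# (total count, document count).
example_old_stats = np.array([[120, 30], [48, 9]], dtype=np.int32)
example_new = ['Hypothetical_New_Title']
example_joint = np.concatenate([
    example_old_stats,
    np.array([[5, 5] for _ in example_new]).astype(example_old_stats.dtype),
])
assert example_joint.shape == (3, 2)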