Code example #1
File: entity_db.py  Project: zwytop/GraphEmbedding
    def load(in_file, mmap_mode='r'):
        # Memory-map any numpy arrays in the archive instead of reading them into memory.
        obj = joblib.load(in_file, mmap_mode=mmap_mode)

        # Rebuild the marisa-trie structures from their serialized byte strings.
        title_dict = Trie()
        redirect_dict = RecordTrie('<I')
        title_dict.frombytes(obj['title_dict'])
        redirect_dict.frombytes(obj['redirect_dict'])

        return EntityDB(title_dict, redirect_dict, obj['inlink_arr'])
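The loader above expects a joblib archive whose trie fields were stored with tobytes(). Below is a minimal sketch of the producing side, assuming the marisa-trie and joblib packages; the keys mirror the example, but the sample data and the 'entity_db.pkl' file name are illustrative and not taken from the project.

    import joblib
    import numpy as np
    from marisa_trie import Trie, RecordTrie

    # Build the tries, serialize them to bytes, and bundle everything into a
    # single joblib archive that the load() method above can read back.
    title_dict = Trie(['Tokyo', 'Kyoto'])
    redirect_dict = RecordTrie('<I', [('Tokio', (0,))])
    joblib.dump({
        'title_dict': title_dict.tobytes(),
        'redirect_dict': redirect_dict.tobytes(),
        'inlink_arr': np.zeros(len(title_dict), dtype=np.int32),
    }, 'entity_db.pkl')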
Code example #2
    def load(input):
        # Accept either an already-deserialized dict or a path to a joblib file.
        if isinstance(input, dict):
            obj = input
        else:
            obj = joblib.load(input)

        dic = Trie()
        dic.frombytes(obj['dic'])
        return WordVocab(dic, obj['lowercase'], obj.get('start_index', 0))
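Because the isinstance check accepts either form, the loader can be called with a file path or with a dict that was deserialized elsewhere. A hedged usage sketch; the 'word_vocab.pkl' file name is an assumption, not from the project.

    vocab = WordVocab.load('word_vocab.pkl')   # let load() call joblib itself
    obj = joblib.load('word_vocab.pkl')
    vocab = WordVocab.load(obj)                # or pass an already-loaded dict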
Code example #3
File: interwiki_db.py  Project: pnarsina/w266_final
    def load(in_file: str, mmap_mode="r"):
        data = joblib.load(in_file, mmap_mode=mmap_mode)
        # Restore the title trie from its serialized bytes and put it back into
        # the dict before it is unpacked as keyword arguments.
        title_trie = Trie()
        title_trie = title_trie.frombytes(data["title_trie"])
        data["title_trie"] = title_trie

        return InterwikiDB(**data)
Code example #4
File: wikipedia2vec.py  Project: peternara/pororo-nlp
    def load(target, device, mmap=True):
        word_dict = Trie()
        entity_dict = Trie()
        redirect_dict = RecordTrie("<I")

        # `target` may be a path to a joblib archive or an already-loaded dict.
        if not isinstance(target, dict):
            if mmap:
                target = joblib.load(target, mmap_mode="r")
            else:
                target = joblib.load(target)

        word_dict.frombytes(target["word_dict"])
        entity_dict.frombytes(target["entity_dict"])
        redirect_dict.frombytes(target["redirect_dict"])

        word_stats = target["word_stats"]
        entity_stats = target["entity_stats"]
        # If the stats were stored as raw bytes, decode them into two-column
        # int32 arrays and wrap them as tensors on the requested device.
        if not isinstance(word_stats, np.ndarray):
            word_stats = np.frombuffer(
                word_stats,
                dtype=np.int32,
            ).reshape(-1, 2)
            word_stats = torch.tensor(
                word_stats,
                device=device,
                requires_grad=False,
            )
            entity_stats = np.frombuffer(
                entity_stats,
                dtype=np.int32,
            ).reshape(-1, 2)
            entity_stats = torch.tensor(
                entity_stats,
                device=device,
                requires_grad=False,
            )

        return Wikipedia2VecDict(
            word_dict,
            entity_dict,
            redirect_dict,
            word_stats,
            entity_stats,
            **target["meta"],
        )
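A hedged usage sketch for the loader above; the dictionary file name is an assumption, and the device is chosen only to illustrate the device argument.

    # Illustrative call; the .pkl path is assumed, not from the project.
    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dic = Wikipedia2VecDict.load("wiki2vec_dict.pkl", device=device, mmap=True)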
Code example #5
    dictionary = Dictionary.load(args.src)
    dumpdb = DumpDB(args.dumpdb)

    # One entity title per line; strip the trailing newlines.
    with open(args.entity_file) as handle:
        all_needed_entities_raw = set(line.rstrip('\n') for line in handle)

    # Resolve redirects so every needed title points at its canonical page.
    title2dest_title = dict(dumpdb.redirects())
    all_needed_entities = {
        title2dest_title.get(title, title) for title in all_needed_entities_raw
    }

    src_file = joblib.load(args.src)

    old_word_dict = Trie()
    old_word_dict.frombytes(src_file['word_dict'])

    old_word_stats = src_file['word_stats']
    old_entity_stats = src_file['entity_stats']

    all_old_entities = [ent.title for ent in dictionary.entities()]
    all_old_entities_set = set(all_old_entities)

    # Entities that are requested but not yet present in the existing dictionary.
    all_new_entities = sorted(
        ent for ent in all_needed_entities if ent not in all_old_entities_set
    )
    # Append a placeholder stats row of [5, 5] for each newly added entity,
    # matching the dtype of the existing stats array.
    joint_entity_stats = np.concatenate([
        old_entity_stats,
        np.array([[5, 5]
                  for _ in all_new_entities]).astype(old_entity_stats.dtype)
    ])
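The list comprehension above builds the placeholder rows one at a time; an equivalent construction with np.full is sketched below as an assumed alternative, not something the source script uses.

    # Illustrative alternative: allocate all placeholder rows in one call.
    new_rows = np.full((len(all_new_entities), 2), 5, dtype=old_entity_stats.dtype)
    joint_entity_stats = np.concatenate([old_entity_stats, new_rows])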