Beispiel #1
0
    def test_save_load(self):
        """Check that the dictionary survives serialize/load and save/load round trips.

        The dictionary is reloaded twice -- once from the object returned by
        ``serialize()`` and once from a temporary file written by ``save()`` --
        and each reloaded instance must serialize to the same values as the
        original.
        """
        def validate(obj):
            """Assert that ``obj`` serializes to the same values as ``self.dictionary``."""
            expected = self.dictionary.serialize()
            actual = obj.serialize()
            for key, value in expected.items():
                if isinstance(value, np.ndarray):
                    # np.array_equal only *returns* a bool; without an explicit
                    # assertion a mismatching array would go unnoticed.
                    assert np.array_equal(value, actual[key]), (
                        "array mismatch for key %r" % (key,)
                    )
                else:
                    eq_(value, actual[key])

        # Round trip through the in-memory serialized form.
        validate(Dictionary.load(self.dictionary.serialize()))

        # Round trip through a file on disk.
        with NamedTemporaryFile() as f:
            self.dictionary.save(f.name)
            validate(Dictionary.load(f.name))
Beispiel #2
0
 def setUp(self):
     """Build the dictionary fixture with the 'regexp' tokenizer.

     The dictionary is built from the object returned by ``get_dump_db()``
     and stored on ``self.dictionary``.
     """
     build_options = dict(
         tokenizer=get_tokenizer('regexp'),
         lowercase=True,
         min_word_count=2,
         min_entity_count=1,
         min_paragraph_len=5,
         category=True,
         disambi=False,
         pool_size=1,
         chunk_size=1,
         progressbar=False,
     )
     self.dictionary = Dictionary.build(get_dump_db(), **build_options)
Beispiel #3
0
 def setUp(self):
     """Build the dictionary and link graph fixtures.

     The dictionary is built first (with ``None`` as the second positional
     argument) because the link graph build takes it as an input.
     """
     dictionary_options = dict(
         lowercase=True,
         min_word_count=2,
         min_entity_count=1,
         pool_size=1,
         chunk_size=1,
         min_paragraph_len=5,
         category=True,
         progressbar=False,
     )
     self.dictionary = Dictionary.build(get_dump_db(), None, **dictionary_options)

     link_graph_options = dict(pool_size=1, chunk_size=1, progressbar=False)
     self.link_graph = LinkGraph.build(
         get_dump_db(), self.dictionary, **link_graph_options
     )
 def setUp(self):
     """Build the phrase dictionary fixture, then a dictionary that uses it.

     The phrase dictionary is stored on ``self.phrase_dic`` and passed to
     the dictionary build as ``phrase_dict``; the resulting dictionary is
     stored on ``self.dictionary``.
     """
     # Trailing keyword arguments common to both build calls.
     common_options = dict(pool_size=1, chunk_size=1, progressbar=False)

     self.phrase_dic = PhraseDictionary.build(
         get_dump_db(),
         min_link_count=0,
         min_link_prob=0.1,
         lowercase=True,
         max_phrase_len=3,
         **common_options
     )
     self.dictionary = Dictionary.build(
         get_dump_db(),
         phrase_dict=self.phrase_dic,
         lowercase=True,
         min_word_count=2,
         min_entity_count=1,
         min_paragraph_len=5,
         category=True,
         **common_options
     )
Beispiel #5
0
 def __init__(self):
     """Load the dictionary and the mention DB from their configured paths.

     The dictionary is loaded from ``WIKI_DICTIONARY_PATH`` and kept on
     ``self.dic``; the mention DB is loaded from ``WIKI_MENTION_DB_PATH``
     with that dictionary and kept on ``self.db``.
     """
     dictionary = Dictionary.load(WIKI_DICTIONARY_PATH)
     self.dic = dictionary
     # The mention DB loader takes the already-loaded dictionary.
     self.db = MentionDB.load(WIKI_MENTION_DB_PATH, dictionary)
Beispiel #6
0
# -*- coding: utf-8 -*-

import sys
from wikipedia2vec.dictionary import Dictionary
from wikipedia2vec.mention_db import MentionDB
from wikipedia2vec.utils.tokenizer.mecab_tokenizer import MeCabTokenizer

# Command line: <text file> <dictionary file> <mention DB file>.
# The dictionary is loaded first because the mention DB loader takes it
# as its second argument.
dic = Dictionary.load(sys.argv[2])
db = MentionDB.load(sys.argv[3], dic)

# Read the complete input text in one go.
with open(sys.argv[1]) as text_file:
    text = text_file.read()

# Tokenize the text, then print every detected mention on its own line.
tokens = MeCabTokenizer().tokenize(text)
for detected in db.detect_mentions(text, tokens):
    print(detected)
 def __init__(self, lang, dic, mention_db):
     """Store the language and load the dictionary and mention DB.

     Args:
         lang: Value stored unchanged on ``self.lang``.
         dic: Argument handed to ``Dictionary.load``.
         mention_db: Argument handed to ``MentionDB.load`` together with
             the loaded dictionary.
     """
     self.lang = lang
     dictionary = Dictionary.load(dic)
     self.dic = dictionary
     self.mention_db = MentionDB.load(mention_db, dictionary)