def detect_mentions(self, text):
    logging.info("Detecting mentions...")
    tokenizer = RegexpTokenizer()
    tokens = tokenizer.tokenize(text)
    response = []
    for mention in self.mention_db.detect_mentions(text, tokens):
        # Reduce each detected mention to its surface text, entity title,
        # and the Wikipedia URL for the configured language.
        m = {
            "text": mention.text,
            "entity": mention.entity.title,
            "url": "https://{}.wikipedia.org/wiki/{}".format(
                self.lang, mention.entity.title.replace(' ', '_')),
        }
        # Keep only one entry per (text, entity) pair.
        if m not in response:
            response.append(m)
    return response
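The URL in each mention entry is assembled from the instance language and the entity title; a quick stand-alone check of that construction in plain Python (the 'en' language code and 'New York City' title below are just illustrative values):

# Stand-alone check of the URL construction used above; no project imports needed.
lang = 'en'
title = 'New York City'
url = "https://{}.wikipedia.org/wiki/{}".format(lang, title.replace(' ', '_'))
assert url == 'https://en.wikipedia.org/wiki/New_York_City'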
Example 2
class TestRegexpTokenizer(unittest.TestCase):
    def setUp(self):
        self._tokenizer = RegexpTokenizer()

    def test_tokenize(self):
        text = 'Tokyo is the capital of Japan'
        tokens = self._tokenizer.tokenize(text)

        ok_(all([isinstance(t, Token) for t in tokens]))
        eq_(['Tokyo', 'is', 'the', 'capital', 'of', 'Japan'], [t.text for t in tokens])
        eq_([(0, 5), (6, 8), (9, 12), (13, 20), (21, 23), (24, 29)], [t.span for t in tokens])
Example 3
class TestRegexpTokenizer(unittest.TestCase):
    def setUp(self):
        self._tokenizer = RegexpTokenizer()
        phrase_dict = PhraseDictionary(Trie(['New York City', 'New York', 'United States']), False, {})
        self._phrase_tokenizer = RegexpTokenizer(phrase_dict)

    def test_tokenize(self):
        text = 'Tokyo is the capital of Japan'
        tokens = self._tokenizer.tokenize(text)

        ok_(all([isinstance(t, Token) for t in tokens]))
        eq_(['Tokyo', 'is', 'the', 'capital', 'of', 'Japan'], [t.text for t in tokens])
        eq_([(0, 5), (6, 8), (9, 12), (13, 20), (21, 23), (24, 29)], [t.span for t in tokens])

    def test_tokenize_with_phrases(self):
        text = 'New York City is the capital city of the United States'
        tokens = self._phrase_tokenizer.tokenize(text)

        ok_(all([isinstance(t, Token) for t in tokens]))
        eq_(['New York City', 'is', 'the', 'capital', 'city', 'of', 'the', 'United States'],
            [t.text for t in tokens])
        eq_([(0, 13), (14, 16), (17, 20), (21, 28), (29, 33), (34, 36), (37, 40), (41, 54)],
            [t.span for t in tokens])
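The spans asserted above are plain character offsets into the original string, and the longest dictionary entry wins the merge ('New York City' rather than 'New York'). A quick stand-alone cross-check of those expectations in plain Python:

# Cross-check of the phrase-tokenizer expectations above: each span slices the
# original text back to its token, including the merged 'New York City' phrase.
text = 'New York City is the capital city of the United States'
expected = ['New York City', 'is', 'the', 'capital', 'city', 'of', 'the', 'United States']
spans = [(0, 13), (14, 16), (17, 20), (21, 28), (29, 33), (34, 36), (37, 40), (41, 54)]

assert [text[start:end] for start, end in spans] == expected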
Example 4
def train_classifier(wikipedia2vec_file, entity_linker_file, dataset,
                     dataset_path, dev_size, **kwargs):
    if dataset == '20ng':
        data = load_20ng_dataset(dev_size)
    else:
        data = load_r8_dataset(dataset_path, dev_size)

    # Fill in any hyper-parameter the caller left unset with the
    # per-dataset defaults.
    for key, value in DEFAULT_HYPER_PARAMS[dataset].items():
        if kwargs[key] is None:
            kwargs[key] = value

    # Components shared by every run: the tokenizer, the entity linker,
    # and the pretrained Wikipedia2Vec embedding.
    tokenizer = RegexpTokenizer()
    entity_linker = EntityLinker(entity_linker_file)
    embedding = Wikipedia2Vec.load(wikipedia2vec_file)

    return train(data, embedding, tokenizer, entity_linker, **kwargs)
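A hedged sketch of driving train_classifier from a small script. The file names are placeholders and the import location is an assumption; note that the defaults loop above indexes kwargs[key] for every entry in DEFAULT_HYPER_PARAMS[dataset], so all hyper-parameter keys must be present (passing None lets the defaults take over):

# Placeholder paths and an assumed import location; not part of the snippet above.
from train import train_classifier, DEFAULT_HYPER_PARAMS

# Pre-populate every hyper-parameter with None so the per-dataset defaults apply.
hyper_params = {key: None for key in DEFAULT_HYPER_PARAMS['20ng']}

results = train_classifier(
    'wikipedia2vec_enwiki.pkl',   # placeholder Wikipedia2Vec embedding file
    'entity_linker.db',           # placeholder EntityLinker database
    '20ng',                       # selects load_20ng_dataset above
    None,                         # dataset_path: only read on the non-20ng branch
    0.05,                         # dev_size handed to the dataset loader
    **hyper_params,
)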
Example 5
def build_entity_linker(dump_db_file, **kwargs):
    # Build the entity linker database from a Wikipedia DumpDB; extra keyword
    # arguments are passed straight through to EntityLinker.build.
    dump_db = DumpDB(dump_db_file)
    tokenizer = RegexpTokenizer()
    EntityLinker.build(dump_db, tokenizer, **kwargs)
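And a sketch of invoking that build step, again with a placeholder dump file name and an assumed import location; no EntityLinker.build options are shown in the snippet, so none are invented here:

# Placeholder file name and an assumed import location; illustrative only.
from main import build_entity_linker

# Builds the entity linker database that EntityLinker(entity_linker_file)
# later loads in train_classifier above.
build_entity_linker('enwiki_dump.db')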