def test_prettyprint_class(self):
    """ Test the HTML pretty printer with a custom css class """
    text = 'Hello everyone, this is me speaking. And me.'
    source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                               'me': 'http://example.com/me'})
    ner = NerProcess((source,))
    named_entities = ner.process_text(text)
    html = HTMLPrettyPrint().pprint_text(text, named_entities, html_class='ner')
    self.assertEqual(html, (u'Hello <a href="http://example.com/everyone" class="ner">everyone</a>, '
                            u'this is <a href="http://example.com/me" class="ner">me</a> speaking. '
                            u'And <a href="http://example.com/me" class="ner">me</a>.'))
def test_ner_process_preprocess(self):
    """ Test ner process with a stopwords preprocessor """
    text = 'Hello Toto, this is me speaking. And me.'
    source = NerSourceLexicon({'Toto': 'http://example.com/toto',
                               'me': 'http://example.com/me'})
    preprocessor = NerStopwordsFilterPreprocessor()
    ner = NerProcess((source,), preprocessors=(preprocessor,))
    named_entities = ner.process_text(text)
    self.assertEqual(named_entities,
                     [('http://example.com/toto', None,
                       Token(word='Toto', start=6, end=10,
                             sentence=Sentence(indice=0, start=0, end=34)))])
def test_occurence_filter_max_occ(self):
    """ Test occurrence filter with a maximum number of occurrences """
    text = 'Hello everyone, this is me speaking. And me.'
    source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                'me': 'http://example.com/me'})
    source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
    _filter = NerOccurenceFilter(max_occ=1)
    ner = NerProcess((source1, source2), filters=(_filter,))
    named_entities = ner.process_text(text)
    self.assertEqual(named_entities,
                     [('http://example.com/everyone', None,
                       Token(word='everyone', start=6, end=14,
                             sentence=Sentence(indice=0, start=0, end=38)))])
def test_disambiguation_word_case(self):
    """ Test disambiguation of word parts """
    text = 'Hello Toto Tutu. And Toto.'
    source = NerSourceLexicon({'Toto Tutu': 'http://example.com/toto_tutu',
                               'Toto': 'http://example.com/toto'})
    _filter = NerDisambiguationWordParts()
    ner = NerProcess((source,), filters=(_filter,))
    named_entities = ner.process_text(text)
    self.assertEqual(named_entities,
                     [('http://example.com/toto_tutu', None,
                       Token(word='Toto Tutu', start=6, end=15,
                             sentence=Sentence(indice=0, start=0, end=16))),
                      ('http://example.com/toto_tutu', None,
                       Token(word='Toto', start=21, end=25,
                             sentence=Sentence(indice=1, start=17, end=26)))])
def test_rules_filter(self):
    """ Test rules filter """
    text = 'Hello toto tutu. And toto.'
    source = NerSourceLexicon({'toto tutu': 'http://example.com/toto_tutu',
                               'toto': 'http://example.com/toto'})
    rules = {'http://example.com/toto': 'http://example.com/tata'}
    _filter = NerReplacementRulesFilter(rules)
    ner = NerProcess((source,), filters=(_filter,))
    named_entities = ner.process_text(text)
    self.assertEqual(named_entities,
                     [('http://example.com/toto_tutu', None,
                       Token(word='toto tutu', start=6, end=15,
                             sentence=Sentence(indice=0, start=0, end=16))),
                      ('http://example.com/tata', None,
                       Token(word='toto', start=21, end=25,
                             sentence=Sentence(indice=1, start=17, end=26)))])
def test_ner_process(self):
    """ Test ner process """
    text = 'Hello everyone, this is me speaking. And me.'
    source = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                               'me': 'http://example.com/me'})
    ner = NerProcess((source,))
    named_entities = ner.process_text(text)
    self.assertEqual(named_entities,
                     [('http://example.com/everyone', None,
                       Token(word='everyone', start=6, end=14,
                             sentence=Sentence(indice=0, start=0, end=38))),
                      ('http://example.com/me', None,
                       Token(word='me', start=26, end=28,
                             sentence=Sentence(indice=0, start=0, end=38))),
                      ('http://example.com/me', None,
                       Token(word='me', start=43, end=45,
                             sentence=Sentence(indice=1, start=39, end=46)))])
def test_ner_process_multisources(self):
    """ Test ner process with multiple sources """
    text = 'Hello everyone, this is me speaking. And me.'
    source1 = NerSourceLexicon({'everyone': 'http://example.com/everyone',
                                'me': 'http://example.com/me'})
    source2 = NerSourceLexicon({'me': 'http://example2.com/me'})
    # Two sources, not unique
    ner = NerProcess((source1, source2))
    named_entities = ner.process_text(text)
    self.assertEqual(named_entities,
                     [('http://example.com/everyone', None,
                       Token(word='everyone', start=6, end=14,
                             sentence=Sentence(indice=0, start=0, end=38))),
                      ('http://example.com/me', None,
                       Token(word='me', start=26, end=28,
                             sentence=Sentence(indice=0, start=0, end=38))),
                      ('http://example2.com/me', None,
                       Token(word='me', start=26, end=28,
                             sentence=Sentence(indice=0, start=0, end=38))),
                      ('http://example.com/me', None,
                       Token(word='me', start=43, end=45,
                             sentence=Sentence(indice=1, start=39, end=46))),
                      ('http://example2.com/me', None,
                       Token(word='me', start=43, end=45,
                             sentence=Sentence(indice=1, start=39, end=46)))])
    # Two sources, unique
    ner = NerProcess((source1, source2), unique=True)
    named_entities = ner.process_text(text)
    self.assertEqual(named_entities,
                     [('http://example.com/everyone', None,
                       Token(word='everyone', start=6, end=14,
                             sentence=Sentence(indice=0, start=0, end=38))),
                      ('http://example.com/me', None,
                       Token(word='me', start=26, end=28,
                             sentence=Sentence(indice=0, start=0, end=38))),
                      ('http://example.com/me', None,
                       Token(word='me', start=43, end=45,
                             sentence=Sentence(indice=1, start=39, end=46)))])
    # Two sources in reversed order, unique
    ner = NerProcess((source2, source1), unique=True)
    named_entities = ner.process_text(text)
    self.assertEqual(named_entities,
                     [('http://example.com/everyone', None,
                       Token(word='everyone', start=6, end=14,
                             sentence=Sentence(indice=0, start=0, end=38))),
                      ('http://example2.com/me', None,
                       Token(word='me', start=26, end=28,
                             sentence=Sentence(indice=0, start=0, end=38))),
                      ('http://example2.com/me', None,
                       Token(word='me', start=43, end=45,
                             sentence=Sentence(indice=1, start=39, end=46)))])