def test_prettyprint_class(self):
     """ Test that pprint_text puts the given html_class on every generated link """
     lexicon = NerSourceLexicon({
         'everyone': 'http://example.com/everyone',
         'me': 'http://example.com/me',
     })
     text = 'Hello everyone, this is   me speaking. And me.'
     entities = NerProcess((lexicon,)).process_text(text)
     rendered = HTMLPrettyPrint().pprint_text(text, entities, html_class='ner')
     # Each recognized word is expected to be wrapped in an anchor carrying
     # class="ner"; the rest of the text (including the triple space) is
     # expected unchanged.
     expected = (
         u'Hello <a href="http://example.com/everyone" class="ner">everyone</a>, '
         u'this is   <a href="http://example.com/me" class="ner">me</a> speaking. '
         u'And <a href="http://example.com/me" class="ner">me</a>.'
     )
     self.assertEqual(rendered, expected)
 def test_ner_process_preprocess(self):
     """ Test ner process combined with a stopwords preprocessor.

     The lexicon maps both 'Toto' and 'me', but only the 'Toto' match is
     expected in the result.
     """
     lexicon = NerSourceLexicon({
         'Toto': 'http://example.com/toto',
         'me': 'http://example.com/me',
     })
     stopwords_preprocessor = NerStopwordsFilterPreprocessor()
     process = NerProcess((lexicon,), preprocessors=(stopwords_preprocessor,))
     found = process.process_text('Hello Toto, this is   me speaking. And me.')
     # Build the single expected match step by step.
     first_sentence = Sentence(indice=0, start=0, end=34)
     toto = Token(word='Toto', start=6, end=10, sentence=first_sentence)
     self.assertEqual(found, [('http://example.com/toto', None, toto)])
 def test_occurence_filter_max_occ(self):
     """ Test occurence filter built with max_occ=1.

     'me' appears twice in the text and is known to both sources, while
     'everyone' appears once; only the 'everyone' match is expected to
     remain after filtering.
     """
     text = 'Hello everyone, this is   me speaking. And me.'
     first_source = NerSourceLexicon({
         'everyone': 'http://example.com/everyone',
         'me': 'http://example.com/me',
     })
     second_source = NerSourceLexicon({'me': 'http://example2.com/me'})
     occurence_filter = NerOccurenceFilter(max_occ=1)
     process = NerProcess((first_source, second_source),
                          filters=(occurence_filter,))
     found = process.process_text(text)
     everyone = Token(word='everyone', start=6, end=14,
                      sentence=Sentence(indice=0, start=0, end=38))
     expected = [('http://example.com/everyone', None, everyone)]
     self.assertEqual(found, expected)
 def test_disambiguation_word_case(self):
     """ Test the word parts disambiguation filter.

     The lexicon knows both 'Toto Tutu' and 'Toto'. With the filter in
     place, the lone 'Toto' of the second sentence is expected to be
     reported with the uri of 'Toto Tutu' rather than its own.
     """
     lexicon = NerSourceLexicon({
         'Toto Tutu': 'http://example.com/toto_tutu',
         'Toto': 'http://example.com/toto',
     })
     word_parts_filter = NerDisambiguationWordParts()
     process = NerProcess((lexicon,), filters=(word_parts_filter,))
     found = process.process_text('Hello Toto Tutu. And Toto.')
     full_name = Token(word='Toto Tutu', start=6, end=15,
                       sentence=Sentence(indice=0, start=0, end=16))
     short_name = Token(word='Toto', start=21, end=25,
                        sentence=Sentence(indice=1, start=17, end=26))
     # Both matches are expected to share the same uri.
     toto_tutu_uri = 'http://example.com/toto_tutu'
     expected = [
         (toto_tutu_uri, None, full_name),
         (toto_tutu_uri, None, short_name),
     ]
     self.assertEqual(found, expected)
 def test_rules_filter(self):
     """ Test the replacement rules filter.

     A single rule maps the 'toto' uri to the 'tata' uri; the 'toto tutu'
     match, whose uri has no rule, is expected to be left as is.
     """
     toto_uri = 'http://example.com/toto'
     tata_uri = 'http://example.com/tata'
     lexicon = NerSourceLexicon({
         'toto tutu': 'http://example.com/toto_tutu',
         'toto': toto_uri,
     })
     rules_filter = NerReplacementRulesFilter({toto_uri: tata_uri})
     process = NerProcess((lexicon,), filters=(rules_filter,))
     found = process.process_text('Hello toto tutu. And toto.')
     toto_tutu = Token(word='toto tutu', start=6, end=15,
                       sentence=Sentence(indice=0, start=0, end=16))
     toto = Token(word='toto', start=21, end=25,
                  sentence=Sentence(indice=1, start=17, end=26))
     expected = [
         ('http://example.com/toto_tutu', None, toto_tutu),
         (tata_uri, None, toto),
     ]
     self.assertEqual(found, expected)
 def test_ner_process(self):
     """ Test ner process with a single lexicon source and no filter.

     Three matches are expected, in text order: 'everyone' and 'me' in the
     first sentence, then 'me' again in the second sentence.
     """
     everyone_uri = 'http://example.com/everyone'
     me_uri = 'http://example.com/me'
     lexicon = NerSourceLexicon({'everyone': everyone_uri, 'me': me_uri})
     process = NerProcess((lexicon,))
     found = process.process_text('Hello everyone, this is   me speaking. And me.')
     # The two sentences the expected tokens belong to.
     first_sentence = Sentence(indice=0, start=0, end=38)
     second_sentence = Sentence(indice=1, start=39, end=46)
     expected = [
         (everyone_uri, None,
          Token(word='everyone', start=6, end=14, sentence=first_sentence)),
         (me_uri, None,
          Token(word='me', start=26, end=28, sentence=first_sentence)),
         (me_uri, None,
          Token(word='me', start=43, end=45, sentence=second_sentence)),
     ]
     self.assertEqual(found, expected)
 def test_ner_process_multisources(self):
     """ Test ner process with two lexicon sources that both know 'me'.

     Three configurations are checked:
     - without ``unique``: each 'me' is expected once per source, in
       source order;
     - with ``unique=True``: each 'me' is expected once, with the uri of
       the source listed first;
     - with ``unique=True`` and the sources swapped: the uri of the other
       source is expected for 'me'.
     """
     text = 'Hello everyone, this is   me speaking. And me.'
     everyone_uri = 'http://example.com/everyone'
     me_uri = 'http://example.com/me'
     other_me_uri = 'http://example2.com/me'
     first_source = NerSourceLexicon({'everyone': everyone_uri, 'me': me_uri})
     second_source = NerSourceLexicon({'me': other_me_uri})

     # Tokens shared by the expected results of all three scenarios.
     first_sentence = Sentence(indice=0, start=0, end=38)
     second_sentence = Sentence(indice=1, start=39, end=46)
     everyone = Token(word='everyone', start=6, end=14, sentence=first_sentence)
     first_me = Token(word='me', start=26, end=28, sentence=first_sentence)
     second_me = Token(word='me', start=43, end=45, sentence=second_sentence)

     # Two sources, not unique
     found = NerProcess((first_source, second_source)).process_text(text)
     self.assertEqual(found, [
         (everyone_uri, None, everyone),
         (me_uri, None, first_me),
         (other_me_uri, None, first_me),
         (me_uri, None, second_me),
         (other_me_uri, None, second_me),
     ])

     # Two sources, unique
     found = NerProcess((first_source, second_source),
                        unique=True).process_text(text)
     self.assertEqual(found, [
         (everyone_uri, None, everyone),
         (me_uri, None, first_me),
         (me_uri, None, second_me),
     ])

     # Two sources inversed, unique
     found = NerProcess((second_source, first_source),
                        unique=True).process_text(text)
     self.assertEqual(found, [
         (everyone_uri, None, everyone),
         (other_me_uri, None, first_me),
         (other_me_uri, None, second_me),
     ])