    def test_phonenumber(self):
        agg = EntityAggregator()
        text = "Mr. Flubby Flubber called the number tel:+919988111222 twice"
        agg.extract(text, ['en'])
        entities = [l for l, c, w in agg.entities]
        assert '+919988111222' in entities
        # The country inferred from the number ('in' for +91, India) is emitted as well.
        assert 'in' in entities
    def test_merkel(self):
        agg = EntityAggregator()
        text = 'Das ist der Pudel von Angela Merkel. '
        text = text + text + text + text + text
        agg.extract(text, ['de', 'en'])
        entities = [l for l, c, w in agg.entities]
        assert 'Angela Merkel' in entities, entities
    def test_multi(self):
        agg = EntityAggregator()
        text = "This is a text about Foo Blubb, a leader in " \
               "this industry. They should not be confused with Foo Blubb, " \
               "a smaller firm."
        agg.extract(text, ['en'])
        entities = [l for l, c, w in agg.entities]
        assert 'Foo Blubb' in entities, entities
    def test_entities(self):
        agg = EntityAggregator()
        agg.add(PersonResult.create(agg, 'Mr. Max Banana', 0, 12))
        agg.add(PersonResult.create(agg, 'Mr. Max Banana', 0, 12))
        agg.add(PersonResult.create(agg, 'max Banana', 0, 12))
        for label, category, weight in agg.entities:
            # All three mentions are merged into a single, normalised label.
            assert label == 'Max Banana', label
            # assert category == 'baa', label
            assert weight == 3, weight
    def test_aggregator(self):
        agg = EntityAggregator()
        agg.add(PersonResult.create(agg, 'Banana', 0, 12))
        # A single-token name does not register as an entity.
        assert len(agg) == 0, agg
        agg.add(PersonResult.create(agg, 'Mr. Max Banana', 0, 12))
        assert len(agg) == 1, agg
        agg.add(PersonResult.create(agg, 'Max Banana', 0, 12))
        # The prefixed and unprefixed forms collapse into one entity.
        assert len(agg) == 1, agg
    def test_ner_service(self):
        ctx = EntityAggregator()
        text = """This is a document about the United States.
        But also about Syria and Germany. """
        text = text + text + text + text
        entities = extract_entities(ctx, text, 'en')
        entities = [str(r) for r in entities]
        assert 'United States' in entities, entities
        assert 'Germany' in entities, entities
        assert 'Syria' in entities, entities
def extract_document_tags(document):
    if document.status != Document.STATUS_SUCCESS:
        return
    load_places()
    log.info("Tagging [%s]: %s", document.id, document.name)
    languages = list(document.languages)
    if not len(languages):
        languages = [settings.DEFAULT_LANGUAGE]
    aggregator = EntityAggregator()
    for text in document.texts:
        aggregator.extract(text, languages)
    # Save empty collectors for the previous 'polyglot' and 'spacy' origins
    # without emitting any tags for them.
    DocumentTagCollector(document, 'polyglot').save()
    DocumentTagCollector(document, 'spacy').save()
    collector = DocumentTagCollector(document, 'ner')
    for (label, category, weight) in aggregator.entities:
        collector.emit(label, category, weight=weight)
    log.info("Extracted tags: %s", len(collector))
    collector.save()
    db.session.add(document)
    db.session.commit()