def test_ner_extract(self):
    text = 'Das ist der Pudel von Angela Merkel. '
    text = text * 5
    entity = model.make_entity('PlainText')
    entity.add('bodyText', text)
    tag_entity(entity)
    names = entity.get_type_values(registry.name)
    assert 'Angela Merkel' in names, names
def test_pattern_extract(self):
    text = "Mr. Flubby Flubber called the number tel:+919988111222 twice"
    entity = model.make_entity('PlainText')
    entity.add('bodyText', text)
    tag_entity(entity)
    phones = entity.get_type_values(registry.phone)
    assert '+919988111222' in phones
    countries = entity.get_type_values(registry.country)
    assert 'in' in countries
def test_language_tagging(self):
    text = "C'est le caniche d'Emmanuel Macron. " * 2
    entity = model.make_entity('PlainText')
    entity.add('bodyText', text)
    tag_entity(entity)
    names = entity.get_type_values(registry.name)
    assert "d'Emmanuel Macron" in names, names
    assert entity.get('detectedLanguage') == ['fra'], entity.get('detectedLanguage')  # noqa
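# A minimal sketch (not from the source) of the pipeline the three tests
# above exercise: build a 'PlainText' entity, run it through tag_entity(),
# and read back the typed values the taggers attached. It assumes the same
# imports the tests use (followthemoney's `model` and `registry`, plus
# `tag_entity`); the helper name `extract_names` is hypothetical.
def extract_names(body_text):
    entity = model.make_entity('PlainText')
    entity.add('bodyText', body_text)
    tag_entity(entity)  # runs the NER, pattern and language taggers in place
    return entity.get_type_values(registry.name)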
def _process_entity(entity, sync=False):
    """Perform pre-index processing on an entity, including running
    the NLP pipeline."""
    if entity.id is None:
        raise InvalidData("No ID for entity", errors=entity.to_dict())
    tag_entity(entity)
    if sync:
        refresh_entity_id(entity.id)
    # log.debug("Index: %r", entity)
    return entity
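# Illustrative call (an assumption, not from the source): pre-process one
# entity before indexing. The entity must carry an ID, since the function
# raises InvalidData otherwise; sync=True triggers refresh_entity_id() so
# the entity becomes searchable immediately.
proxy = model.make_entity('PlainText')
proxy.make_id('example-plaintext')  # _process_entity rejects entities without an ID
proxy.add('bodyText', 'Sample body text.')
processed = _process_entity(proxy, sync=True)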
def index_entities(stage, collection, iterable, sync=False):
    entities = []
    for entity in iterable:
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        tag_entity(entity)
        entities.append(entity)
        # Flush to the index whenever a full page has accumulated.
        if len(entities) >= BULK_PAGE:
            stage.report_finished(len(entities))
            index_bulk(collection, entities, job_id=stage.job.id, sync=sync)
            entities = []
    # Flush any remaining partial page.
    if len(entities):
        stage.report_finished(len(entities))
        index_bulk(collection, entities, job_id=stage.job.id, sync=sync)
    refresh_collection(collection)
def index_entities(collection, iterable, sync=False):
    queue = get_queue(collection, OP_INDEX)
    queue.progress.mark_pending(len(iterable))
    entities = []
    for entity in iterable:
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        tag_entity(entity)
        entities.append(entity)
        # Flush to the index whenever a full page has accumulated.
        if len(entities) >= BULK_PAGE:
            queue.progress.mark_finished(len(entities))
            index_bulk(collection, entities, sync=sync)
            entities = []
    # Flush any remaining partial page.
    if len(entities):
        queue.progress.mark_finished(len(entities))
        index_bulk(collection, entities, sync=sync)
    refresh_collection(collection)
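# Hedged usage sketch (not from the source) for the queue-based variant
# above: `collection` is assumed to be a loaded collection object, and every
# proxy must already carry an ID, since index_entities() raises InvalidData
# otherwise. The helper name `reindex_sample` is hypothetical.
def reindex_sample(collection, proxies):
    for proxy in proxies:
        assert proxy.id is not None, proxy
    index_entities(collection, proxies, sync=True)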