Ejemplo n.º 1
0
 def test_ner_extract(self):
     """Tagging a German text must yield the person name it mentions."""
     sentence = 'Das ist der Pudel von Angela Merkel. '
     doc = model.make_entity('PlainText')
     # The sentence is repeated five times; presumably so the tagger has
     # enough text to work with -- TODO confirm.
     doc.add('bodyText', sentence * 5)
     tag_entity(doc)
     found = doc.get_type_values(registry.name)
     # Show the extracted names on failure to ease debugging.
     assert 'Angela Merkel' in found, found
Ejemplo n.º 2
0
 def test_pattern_extract(self):
     """Tagging must extract the phone number and a matching country."""
     body = "Mr. Flubby Flubber called the number tel:+919988111222 twice"
     doc = model.make_entity('PlainText')
     doc.add('bodyText', body)
     tag_entity(doc)

     # The number embedded in the text must show up as a phone value.
     assert '+919988111222' in doc.get_type_values(registry.phone)
     # 'in' is expected as a country value; presumably it is derived from
     # the +91 prefix of the phone number -- TODO confirm.
     assert 'in' in doc.get_type_values(registry.country)
Ejemplo n.º 3
0
 def test_language_tagging(self):
     """Tagging a French text must yield the name and language 'fra'."""
     body = "C'est le caniche d'Emmanuel Macron. " * 2
     doc = model.make_entity('PlainText')
     doc.add('bodyText', body)
     tag_entity(doc)

     # The name is expected including the elided "d'" prefix.
     found = doc.get_type_values(registry.name)
     assert "d'Emmanuel Macron" in found, found

     # Exactly one detected language is expected on the entity.
     languages = doc.get('detectedLanguage')
     assert languages == ['fra'], languages
Ejemplo n.º 4
0
def _process_entity(entity, sync=False):
    """Run pre-index processing (tagging) on an entity and return it.

    Raises ``InvalidData`` if the entity has no ID. When ``sync`` is true,
    ``refresh_entity_id`` is called with the entity's ID after tagging.
    """
    missing_id = entity.id is None
    if missing_id:
        raise InvalidData("No ID for entity", errors=entity.to_dict())

    tag_entity(entity)
    if not sync:
        return entity

    refresh_entity_id(entity.id)
    return entity
Ejemplo n.º 5
0
def index_entities(stage, collection, iterable, sync=False):
    """Tag entities and bulk-index them in pages of ``BULK_PAGE``.

    Each page is reported as finished on ``stage`` right before it is handed
    to ``index_bulk``. Raises ``InvalidData`` as soon as an entity without an
    ID is encountered; pages flushed before that point stay indexed. The
    collection is refreshed once all entities have been processed.
    """

    def flush(page):
        # Report first, then index -- same order for full and partial pages.
        stage.report_finished(len(page))
        index_bulk(collection, page, job_id=stage.job.id, sync=sync)

    batch = []
    for entity in iterable:
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())

        tag_entity(entity)
        batch.append(entity)
        if len(batch) < BULK_PAGE:
            continue
        flush(batch)
        # Start a new list rather than clearing the one just handed off.
        batch = []

    # Flush whatever is left over from the last, partial page.
    if batch:
        flush(batch)
    refresh_collection(collection)
Ejemplo n.º 6
0
def index_entities(collection, iterable, sync=False):
    """Tag entities and bulk-index them, tracking progress on a queue.

    The total number of entities is marked as pending on the collection's
    ``OP_INDEX`` queue before processing starts. Entities are then tagged one
    by one and written with ``index_bulk`` in pages of ``BULK_PAGE``; each
    page is marked finished on the queue right before it is handed to
    ``index_bulk``. The collection is refreshed once everything is processed.

    ``iterable`` may be any iterable of entities. Inputs without a length
    (e.g. generators) are materialized into a list so that the pending count
    can be computed up front.

    Raises ``InvalidData`` as soon as an entity without an ID is encountered;
    pages flushed before that point stay indexed.
    """
    queue = get_queue(collection, OP_INDEX)
    try:
        total = len(iterable)
    except TypeError:
        # Unsized iterables (generators, iterators) do not support len();
        # materialize them instead of failing before any work is done.
        iterable = list(iterable)
        total = len(iterable)
    queue.progress.mark_pending(total)

    entities = []
    for entity in iterable:
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())

        tag_entity(entity)
        entities.append(entity)
        if len(entities) >= BULK_PAGE:
            queue.progress.mark_finished(len(entities))
            index_bulk(collection, entities, sync=sync)
            # Start a new list rather than clearing the one just handed off.
            entities = []

    # Flush whatever is left over from the last, partial page.
    if entities:
        queue.progress.mark_finished(len(entities))
        index_bulk(collection, entities, sync=sync)
    refresh_collection(collection)