def load_fixtures(self):
    """Create the standard test fixtures: an admin user, a private and a
    public collection, a few entities in each, permissions for the system
    user/guest roles, and a fully built search index for both collections."""
    self.admin = self.create_user(foreign_id='admin', is_admin=True)
    self.private_coll = self.create_collection(
        foreign_id='test_private',
        label="Private Collection",
        category='grey',
        creator=self.admin
    )
    # Three "Banana" people differing only in birthDate — presumably used
    # by search/date-filter tests; confirm against the test suite.
    self._banana = self.create_entity({
        'schema': 'Person',
        'properties': {
            'name': ['Banana'],
            'birthDate': '1970-08-21'
        }
    }, self.private_coll)
    self._banana2 = self.create_entity({
        'schema': 'Person',
        'properties': {
            'name': ['Banana'],
            'birthDate': '1970-03-21'
        }
    }, self.private_coll)
    self._banana3 = self.create_entity({
        'schema': 'Person',
        'properties': {
            'name': ['Banana'],
            'birthDate': '1970-05-21'
        }
    }, self.private_coll)
    # Logged-in users (SYSTEM_USER role) may read the private collection.
    user = Role.by_foreign_id(Role.SYSTEM_USER)
    Permission.grant(self.private_coll, user, True, False)
    self.public_coll = self.create_collection(
        foreign_id='test_public',
        label="Public Collection",
        category='news',
        creator=self.admin
    )
    self._kwazulu = self.create_entity({
        'schema': 'Company',
        'properties': {
            'name': ['KwaZulu'],
            'alias': ['kwazulu']
        }
    }, self.public_coll)
    # Anonymous visitors (SYSTEM_GUEST role) may read the public collection.
    visitor = Role.by_foreign_id(Role.SYSTEM_GUEST)
    Permission.grant(self.public_coll, visitor, True, False)
    db.session.commit()
    # Reset the public collection's aggregator and index it from the model.
    aggregator = get_aggregator(self.public_coll)
    aggregator.delete()
    aggregator.close()
    reindex_collection(self.public_coll, sync=True)
    # Reset the private aggregator, load sample entities from the ijson
    # fixture, then index everything synchronously.
    aggregator = get_aggregator(self.private_coll)
    aggregator.delete()
    for sample in read_entities(self.get_fixture_path('samples.ijson')):
        aggregator.put(sample, fragment='sample')
    aggregator.close()
    reindex_collection(self.private_coll, sync=True)
def bulk_load_query(queue, collection, query_id, query):
    """Run a mapping query and load the generated entities into the
    collection's aggregator.

    Progress is reported on the queue: the total record count is marked
    pending up front, and completions are reported in batches of 1000,
    with the final partial batch flushed after the loop.
    """
    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source)
    if records_total:
        queue.progress.mark_pending(records_total)
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entities_count = 0
    records_count = 0
    for idx, record in enumerate(mapping.source.records, 1):
        records_count = idx
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            entities_count += 1
            # Fragment key ties each emitted entity back to the query and
            # the source record it came from.
            fragment = '%s-%s' % (query_id, idx)
            writer.put(entity, fragment=fragment)
        if idx > 0 and idx % 1000 == 0:
            queue.progress.mark_finished(1000)
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, idx,
                     records_total or 'streaming', entities_count)
    # BUG FIX: the trailing partial batch (records_count % 1000) was never
    # marked finished, so the queue's pending count could never drain to
    # zero for record counts that aren't multiples of 1000.
    remainder = records_count % 1000
    if remainder:
        queue.progress.mark_finished(remainder)
    writer.flush()
    aggregator.close()
    log.info("[%s] Query done (%s entities)",
             collection.foreign_id, entities_count)
def reindex_collection(collection, skip_errors=True, sync=False, flush=False):
    """Re-index all entities from the model, mappings and aggregator cache.

    :param skip_errors: passed through to the indexer; tolerate bad entities.
    :param sync: force a synchronous index refresh.
    :param flush: delete all indexed entities for the collection first.
    """
    from aleph.logic.mapping import map_to_aggregator
    from aleph.logic.profiles import profile_fragments
    aggregator = get_aggregator(collection)
    for mapping in collection.mappings:
        if mapping.disabled:
            log.debug("[%s] Skip mapping: %r", collection, mapping)
            continue
        try:
            map_to_aggregator(collection, mapping, aggregator)
        except Exception:
            # More or less ignore broken models.
            log.exception("Failed mapping: %r", mapping)
    # Pull database-backed entities and profile fragments into the aggregator
    # before projecting everything into the search index.
    aggregate_model(collection, aggregator)
    profile_fragments(collection, aggregator)
    if flush:
        log.debug("[%s] Flushing...", collection)
        index.delete_entities(collection.id, sync=True)
    index_aggregator(collection, aggregator, skip_errors=skip_errors, sync=sync)
    # Recompute collection statistics after the re-index.
    compute_collection(collection, force=True)
def process_collection(stage, collection, ingest=True, reset=False, sync=False):
    """Trigger a full re-parse of all documents and re-build the search index
    from the aggregator.

    :param reset: wipe the collection first; implies ingest.
    """
    # A reset wipes previously ingested fragments, so they must be re-ingested.
    ingest = ingest or reset
    if reset:
        reset_collection(collection, sync=True)
    aggregator = get_aggregator(collection)
    try:
        # Load all model entities into the aggregator under the 'db' fragment.
        writer = aggregator.bulk()
        for proxy in _collection_proxies(collection):
            writer.put(proxy, fragment='db')
            stage.report_finished(1)
        writer.flush()
        if ingest:
            # Re-run the ingest pipeline for every aggregated entity.
            for proxy in aggregator:
                ingest_entity(collection, proxy, job_id=stage.job.id)
        else:
            # No ingest needed: queue a plain re-index of the aggregator.
            queue_task(collection, OP_INDEX,
                       job_id=stage.job.id,
                       context={'sync': sync})
    finally:
        aggregator.close()
def prune_entity(collection, entity_id=None, job_id=None): """Prune handles the full deletion of an entity outside of the HTTP request cycle. This involves cleaning up adjacent entities like xref results, notifications and so on.""" # This is recursive and will also delete any entities which # reference the given entity. Usually this is going to be child # documents, or directoships referencing a person. It's a pretty # dangerous operation, though. log.info("[%s] Prune entity: %s", collection, entity_id) for adjacent in index.iter_adjacent(collection.id, entity_id): log.warning("Recursive delete: %s", adjacent.get("id")) delete_entity(collection, adjacent, job_id=job_id) flush_notifications(entity_id, clazz=Entity) obj = Entity.by_id(entity_id, collection=collection) if obj is not None: obj.delete() doc = Document.by_id(entity_id, collection=collection) if doc is not None: doc.delete() EntitySetItem.delete_by_entity(entity_id) Mapping.delete_by_table(entity_id) xref_index.delete_xref(collection, entity_id=entity_id) aggregator = get_aggregator(collection) aggregator.delete(entity_id=entity_id) refresh_entity(collection, entity_id) collection.touch() db.session.commit()
def delete_collection(collection, keep_metadata=False, sync=False):
    """Delete a collection and all of its contents: queued jobs, aggregator
    data, indexed entities, xref results and database records.

    :param keep_metadata: retain the collection row and its permissions.
    :param sync: force synchronous index deletes.
    """
    cancel_queue(collection)
    aggregator = get_aggregator(collection)
    try:
        aggregator.drop()
    finally:
        aggregator.close()
    flush_notifications(collection, sync=sync)
    index.delete_entities(collection.id, sync=sync)
    xref_index.delete_xref(collection, sync=sync)
    # Re-use an existing deletion timestamp so repeated calls are idempotent.
    deleted_at = collection.deleted_at or datetime.utcnow()
    Entity.delete_by_collection(collection.id, deleted_at=deleted_at)
    Mapping.delete_by_collection(collection.id, deleted_at=deleted_at)
    Diagram.delete_by_collection(collection.id, deleted_at=deleted_at)
    Document.delete_by_collection(collection.id)
    if not keep_metadata:
        # Considering linkages metadata for now, might be wrong:
        Linkage.delete_by_collection(collection.id)
        Permission.delete_by_collection(collection.id, deleted_at=deleted_at)
        collection.delete(deleted_at=deleted_at)
    db.session.commit()
    if not keep_metadata:
        index.delete_collection(collection.id, sync=True)
        Authz.flush()
    refresh_collection(collection.id, sync=True)
def bulk_write(collection, entities, safe=False, role_id=None, mutable=True, index=True): """Write a set of entities - given as dicts - to the index.""" # This is called mainly by the /api/2/collections/X/_bulk API. aggregator = get_aggregator(collection) writer = aggregator.bulk() entity_ids = set() for data in entities: entity = model.get_proxy(data, cleaned=False) if entity.id is None: raise InvalidData("No ID for entity", errors=entity.to_dict()) entity = collection.ns.apply(entity) if safe: entity = remove_checksums(entity) entity.context = {"role_id": role_id, "mutable": mutable} for field in ("created_at", "updated_at"): timestamp = data.get(field) if timestamp is not None: dt = registry.date.to_datetime(timestamp) if dt is not None: entity.context[field] = dt.isoformat() writer.put(entity, origin="bulk") if index and len(entity_ids) < MAX_PAGE: entity_ids.add(entity.id) writer.flush() if index: if len(entity_ids) >= MAX_PAGE: entity_ids = None index_aggregator(collection, aggregator, entity_ids=entity_ids) refresh_collection(collection.id)
def update_entity(collection, entity_id=None):
    """Update xref and aggregator after an entity has been edited."""
    from aleph.logic.xref import xref_entity
    from aleph.logic.profiles import profile_fragments
    log.info("[%s] Update entity: %s", collection, entity_id)
    entity = index.get_entity(entity_id)
    proxy = model.get_proxy(entity)
    if collection.casefile:
        # Casefile collections get cross-referenced on every edit.
        xref_entity(collection, proxy)
    aggregator = get_aggregator(collection, origin=MODEL_ORIGIN)
    profile_fragments(collection, aggregator, entity_id=entity_id)
    # Inline name properties from adjacent entities. See the
    # docstring on `inline_names` for a more detailed discussion.
    prop = proxy.schema.get("namesMentioned")
    if prop is not None:
        entity_ids = proxy.get_type_values(registry.entity)
        names = set()
        for related in index.entities_by_ids(entity_ids):
            related = model.get_proxy(related)
            names.update(related.get_type_values(registry.name))
        if len(names) > 0:
            # Store the collected names as a separate fragment so they
            # merge into the entity at index time.
            name_proxy = model.make_entity(proxy.schema)
            name_proxy.id = proxy.id
            name_proxy.add(prop, names)
            aggregator.put(name_proxy, fragment="names")
    index_aggregator(collection, aggregator, entity_ids=[entity_id])
    refresh_entity(collection, proxy.id)
def upsert_entity(data, collection, authz=None, sync=False, sign=False,
                  job_id=None):
    """Create or update an entity in the database. This has a side effect
    of migrating entities created via the _bulk API or a mapper to a
    database entity in the event that it gets edited by the user.

    :returns: the signed id of the created or updated entity.
    """
    from aleph.logic.profiles import profile_fragments
    entity = None
    entity_id = collection.ns.sign(data.get("id"))
    if entity_id is not None:
        entity = Entity.by_id(entity_id, collection=collection)
    if entity is None:
        role_id = authz.id if authz is not None else None
        entity = Entity.create(data, collection, sign=sign, role_id=role_id)
    else:
        entity.update(data, collection, sign=sign)
    collection.touch()
    proxy = entity.to_proxy()
    # Replace any previous aggregator fragments for this entity with the
    # fresh model state.
    aggregator = get_aggregator(collection)
    aggregator.delete(entity_id=proxy.id)
    aggregator.put(proxy, origin=MODEL_ORIGIN)
    profile_fragments(collection, aggregator, entity_id=proxy.id)
    index.index_proxy(collection, proxy, sync=sync)
    refresh_entity(collection, proxy.id)
    # Defer the slower post-edit work (xref, NER) to a background task.
    queue_task(collection, OP_UPDATE_ENTITY, job_id=job_id, entity_id=proxy.id)
    return entity.id
def load_mapping(collection, mapping_id, sync=False):
    """Flush and reload all entities generated by a mapping.

    On failure, the mapping status is set to FAILED and any partially
    generated fragments are deleted again.
    """
    mapping = Mapping.by_id(mapping_id)
    if mapping is None:
        return log.error("Could not find mapping: %s", mapping_id)
    origin = mapping_origin(mapping.id)
    aggregator = get_aggregator(collection)
    # BUG FIX: the aggregator was previously leaked on the early return for
    # disabled mappings; the whole body is now guarded by try/finally so
    # close() runs on every path.
    try:
        aggregator.delete(origin=origin)
        delete_entities(collection.id, origin=origin, sync=True)
        if mapping.disabled:
            return log.info("Mapping is disabled: %s", mapping_id)
        publish(
            Events.LOAD_MAPPING,
            params={
                "collection": collection,
                "table": mapping.table_id
            },
            channels=[collection, mapping.role],
            actor_id=mapping.role_id,
        )
        try:
            map_to_aggregator(collection, mapping, aggregator)
            aggregate_model(collection, aggregator)
            index_aggregator(collection, aggregator, sync=sync)
            mapping.set_status(status=Status.SUCCESS)
            db.session.commit()
        except Exception as exc:
            mapping.set_status(status=Status.FAILED, error=str(exc))
            db.session.commit()
            # Roll back the partial load so no half-mapped entities linger.
            aggregator.delete(origin=origin)
    finally:
        aggregator.close()
def bulk_write(collection, entities, unsafe=False, role_id=None, index=True): """Write a set of entities - given as dicts - to the index.""" # This is called mainly by the /api/2/collections/X/_bulk API. now = datetime.utcnow().isoformat() aggregator = get_aggregator(collection) writer = aggregator.bulk() entity_ids = set() for data in entities: if not is_mapping(data): raise InvalidData("Failed to read input data", errors=data) entity = model.get_proxy(data) if entity.id is None: raise InvalidData("No ID for entity", errors=entity.to_dict()) entity = collection.ns.apply(entity) if not unsafe: entity = remove_checksums(entity) entity.context = { 'role_id': role_id, 'created_at': now, 'updated_at': now, } writer.put(entity, origin='bulk') if index and len(entity_ids) < MAX_PAGE: entity_ids.add(entity.id) writer.flush() if index: if len(entity_ids) >= MAX_PAGE: entity_ids = None index_aggregator(collection, aggregator, entity_ids=entity_ids) refresh_collection(collection.id)
def _query_mentions(collection):
    """Generate xref results for name mentions in the collection, and reify
    mentions with at least one match into proper entities stored in the
    aggregator.

    Yields (score, proxy, collection_id, match) tuples for each match.
    """
    aggregator = get_aggregator(collection, origin=ORIGIN)
    # Clear out entities from a previous mention-reification run.
    aggregator.delete(origin=ORIGIN)
    writer = aggregator.bulk()
    for proxy in _iter_mentions(collection):
        schemata = set()
        countries = set()
        for score, _, collection_id, match in _query_item(proxy):
            schemata.add(match.schema)
            countries.update(match.get_type_values(registry.country))
            yield score, proxy, collection_id, match
        if len(schemata):
            # Assign only those countries that are backed by one of
            # the matches:
            countries = countries.intersection(proxy.get("country"))
            proxy.set("country", countries)
            # Try to be more specific about schema:
            _merge_schemata(proxy, schemata)
            # Pick a principal name:
            proxy = name_entity(proxy)
            proxy.context["mutable"] = True
            log.debug("Reifying [%s]: %s", proxy.schema.name, proxy)
            writer.put(proxy, fragment="mention")
            # pprint(proxy.to_dict())
    writer.flush()
    aggregator.close()
def flush_mapping(collection, mapping_id, sync=True):
    """Delete entities loaded by a mapping.

    Removes the mapping's fragments from the aggregator, deletes the
    indexed entities for that origin, and refreshes collection stats.
    """
    log.debug("Flushing entities for mapping: %s", mapping_id)
    origin = mapping_origin(mapping_id)
    aggregator = get_aggregator(collection)
    try:
        aggregator.delete(origin=origin)
    finally:
        # BUG FIX: the aggregator was never closed here, unlike the
        # sibling flush_mapping variant, leaking the store handle.
        aggregator.close()
    delete_entities(collection.id, origin=origin, sync=sync)
    update_collection(collection, sync=sync)
def reingest_collection(collection, job_id=None, index=False):
    """Trigger a re-ingest for all documents in the collection.

    :param index: re-index each document after ingest completes.
    """
    job_id = job_id or Job.random_id()
    # Drop previously ingested and analyzed fragments so the pipeline
    # output is rebuilt from scratch.
    aggregator = get_aggregator(collection)
    aggregator.delete(origin=OP_ANALYZE)
    aggregator.delete(origin=OP_INGEST)
    aggregator.close()
    for document in Document.by_collection(collection.id):
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=index)
def load_mapping(stage, collection, mapping_id):
    """Flush and reload all entities generated by a mapping."""
    mapping = Mapping.by_id(mapping_id)
    if mapping is None:
        return log.error("Could not find mapping: %s", mapping_id)
    # Remove any entities from a previous run of this mapping first.
    flush_mapping(stage, collection, mapping_id)
    publish(Events.LOAD_MAPPING,
            params={'collection': collection, 'table': mapping.table_id},
            channels=[collection, mapping.role],
            actor_id=mapping.role_id)
    mapper = make_mapper(collection, mapping)
    aggregator = get_aggregator(collection)
    try:
        writer = aggregator.bulk()
        entities_count = 0
        entity_ids = set()
        for idx, record in enumerate(mapper.source.records, 1):
            for entity in mapper.map(record).values():
                if entity.schema.is_a('Thing'):
                    # Link generated things back to their source table.
                    entity.add('proof', mapping.table_id)
                entity = collection.ns.apply(entity)
                entity_ids.add(entity.id)
                entities_count += 1
                fragment = '%s-%s' % (mapping.id, idx)
                writer.put(entity, fragment=fragment)
            if idx > 0 and idx % 500 == 0:
                # Hand off the accumulated ids for indexing in batches of
                # 500 source records.
                payload = {
                    'entity_ids': entity_ids,
                    'mapping_id': mapping.id
                }
                queue_task(collection, OP_INDEX,
                           job_id=stage.job.id,
                           payload=payload)
                entity_ids = set()
                stage.report_finished(500)
                log.info("[%s] Loaded %s records, %s entities...",
                         collection.foreign_id, idx, entities_count)
        writer.flush()
        # Queue indexing for the trailing batch of ids.
        payload = {
            'entity_ids': entity_ids,
            'mapping_id': mapping.id
        }
        queue_task(collection, OP_INDEX,
                   job_id=stage.job.id,
                   payload=payload)
        mapping.set_status(status=Mapping.SUCCESS)
        log.info("[%s] Mapping done (%s entities)",
                 mapping.id, entities_count)
    except Exception as exc:
        mapping.set_status(status=Mapping.FAILED, error=str(exc))
    finally:
        aggregator.close()
def index_aggregate(queue, collection, sync=False):
    """Project the contents of the collections aggregator into the index."""
    aggregator = get_aggregator(collection)
    try:
        index_entities(collection, aggregator, sync=sync)
        refresh_collection(collection.id, sync=sync)
        index_collection(collection, sync=sync)
        log.info("Aggregate indexed: %r", collection)
    finally:
        aggregator.close()
        # Drop the queue entry whether or not indexing succeeded.
        queue.remove()
def flush_mapping(stage, collection, mapping_id, sync=True):
    """Delete entities loaded by a mapping"""
    log.debug("Flushing entities for mapping: %s", mapping_id)
    origin = mapping_origin(mapping_id)
    # Remove the mapping's fragments from the aggregator...
    aggregator = get_aggregator(collection)
    aggregator.delete(origin=origin)
    aggregator.close()
    # ...and its entities from the search index.
    delete_entities(collection.id, origin=origin, sync=sync)
    collection.touch()
    db.session.commit()
    update_collection(collection, sync=sync)
def _fetch_entities(stage, collection, entity_id=None, batch=100):
    """Yield entities from the collection aggregator.

    When an entity_id is given, additional ids are drained from the task
    queue so a whole batch is indexed at once; when it is None, all
    aggregator entities are yielded.
    """
    aggregator = get_aggregator(collection)
    # BUG FIX: close() previously ran only after the final yield, so the
    # aggregator leaked whenever the consumer stopped iterating early;
    # try/finally runs it on generator close as well.
    try:
        if entity_id is not None:
            entity_id = ensure_list(entity_id)
            # WEIRD: Instead of indexing a single entity, this will try
            # pull a whole batch of them off the queue and do it at once.
            for task in stage.get_tasks(limit=batch):
                entity_id.append(task.payload.get('entity_id'))
            stage.mark_done(len(entity_id) - 1)
        yield from aggregator.iterate(entity_id=entity_id)
    finally:
        aggregator.close()
def index_many(stage, collection, sync=False, entity_ids=None, batch=BATCH_SIZE):
    """Project the contents of the collections aggregator into the index.

    Given explicit entity_ids, further ids are drained from the task
    queue so one worker invocation indexes a whole batch at once.
    """
    if entity_ids is not None:
        entity_ids = ensure_list(entity_ids)
        # WEIRD: Instead of indexing a single entity, this will try
        # pull a whole batch of them off the queue and do it at once.
        pending = stage.get_tasks(limit=max(1, batch - len(entity_ids)))
        entity_ids += [
            eid
            for task in pending
            for eid in ensure_list(task.payload.get("entity_ids"))
        ]
        stage.mark_done(len(pending))
    aggregator = get_aggregator(collection)
    index_aggregator(collection, aggregator, entity_ids=entity_ids, sync=sync)
    refresh_collection(collection.id)
def _fetch_entities(stage, collection, entity_ids=None, batch=100):
    """Yield entities from the collection aggregator.

    Given explicit entity_ids, additional id batches are drained from
    the task queue; when entity_ids is None, everything is yielded.
    """
    aggregator = get_aggregator(collection)
    # BUG FIX: close() previously ran only after the final yield, so the
    # aggregator leaked whenever the consumer stopped iterating early;
    # try/finally runs it on generator close as well.
    try:
        if entity_ids is not None:
            entity_ids = ensure_list(entity_ids)
            # WEIRD: Instead of indexing a single entity, this will try
            # pull a whole batch of them off the queue and do it at once.
            tasks = stage.get_tasks(limit=max(1, batch - len(entity_ids)))
            for task in tasks:
                entity_ids.extend(ensure_list(task.payload.get('entity_ids')))
            # FIXME: this doesn't retain mapping_id properly.
            stage.mark_done(len(tasks))
        yield from aggregator.iterate(entity_id=entity_ids)
    finally:
        aggregator.close()
def process_collection(stage, collection, ingest=True, sync=False):
    """Trigger a full re-parse of all documents and re-build the search
    index from the aggregator."""
    aggregator = get_aggregator(collection)
    for proxy in _collection_proxies(collection):
        if ingest and proxy.schema.is_a(Document.SCHEMA):
            # Documents go through the ingest pipeline, which indexes them
            # once parsing completes.
            ingest_entity(collection, proxy, job_id=stage.job.id, sync=sync)
        else:
            # Plain entities are written to the aggregator and queued for
            # indexing directly.
            aggregator.put(proxy, fragment='db')
            queue_task(collection, OP_INDEX,
                       job_id=stage.job.id,
                       payload={'entity_id': proxy.id},
                       context={'sync': sync})
    aggregator.close()
def load_fixtures(self):
    """Create the standard test fixtures: an admin user, a private and a
    public collection with one entity each, role permissions, and a fully
    processed search index including sample entities from a fixture file."""
    self.admin = self.create_user(foreign_id='admin', is_admin=True)
    self.private_coll = self.create_collection(foreign_id='test_private',
                                               label="Private Collection",
                                               category='grey',
                                               casefile=False,
                                               creator=self.admin)
    self._banana = Entity.create(
        {
            'schema': 'Person',
            'properties': {
                'name': ['Banana'],
            }
        }, self.private_coll)
    # Logged-in users (SYSTEM_USER role) may read the private collection.
    user = Role.by_foreign_id(Role.SYSTEM_USER)
    Permission.grant(self.private_coll, user, True, False)
    self.public_coll = self.create_collection(foreign_id='test_public',
                                              label="Public Collection",
                                              category='news',
                                              casefile=False,
                                              creator=self.admin)
    self._kwazulu = Entity.create(
        {
            'schema': 'Company',
            'properties': {
                'name': ['KwaZulu'],
                'alias': ['kwazulu']
            }
        }, self.public_coll)
    # Anonymous visitors (SYSTEM_GUEST role) may read the public collection.
    visitor = Role.by_foreign_id(Role.SYSTEM_GUEST)
    Permission.grant(self.public_coll, visitor, True, False)
    db.session.commit()
    # Rebuild the public collection's index from the database model.
    drop_aggregator(self.public_coll)
    stage = get_stage(self.public_coll, OP_PROCESS)
    process_collection(stage, self.public_coll, ingest=False, sync=True)
    # Reset the private aggregator, load and index sample entities from
    # the ijson fixture one by one, then process the collection.
    aggregator = get_aggregator(self.private_coll)
    aggregator.delete()
    stage = get_stage(self.private_coll, OP_PROCESS)
    for sample in read_entities(self.get_fixture_path('samples.ijson')):
        aggregator.put(sample, fragment='sample')
        index_aggregate(stage, self.private_coll,
                        entity_id=sample.id, sync=True)
    aggregator.close()
    process_collection(stage, self.private_coll, ingest=False, sync=True)
def setUp(self):
    """Set up the mapping API test fixtures: a collection with a clean
    aggregator, an admin and a plain user session, and three indexed
    entities — two Table entities backed by the experts.csv fixture and
    one LegalEntity."""
    super(MappingAPITest, self).setUp()
    self.col = self.create_collection(foreign_id="map1")
    aggregator = get_aggregator(self.col)
    aggregator.delete()
    _, self.headers = self.login(is_admin=True)
    self.rolex = self.create_user(foreign_id="user_3")
    _, self.headers_x = self.login(foreign_id="user_3")
    self.fixture = self.get_fixture_path("experts.csv")
    self.content_hash = archive.archive_file(self.fixture)
    # The two Table entities were previously built from three copy-pasted
    # dict literals; the shared stanza now lives in _index_table().
    self.ent = self._index_table("foo")
    self.ent2 = self._index_table("foo2")
    data = {
        "id": "bar",
        "schema": "LegalEntity",
        "properties": {
            "name": "John Doe"
        },
    }
    ent = EntityProxy.from_dict(model, data, cleaned=False)
    ent.id = self.col.ns.sign(ent.id)
    index_proxy(self.col, ent)

def _index_table(self, entity_id):
    """Create, sign and index a Table entity backed by the CSV fixture;
    returns the indexed proxy."""
    data = {
        "id": entity_id,
        "schema": "Table",
        "properties": {
            "csvHash": self.content_hash,
            "contentHash": self.content_hash,
            "mimeType": "text/csv",
            "fileName": "experts.csv",
            "name": "experts.csv",
        },
    }
    ent = EntityProxy.from_dict(model, data, cleaned=False)
    ent.id = self.col.ns.sign(ent.id)
    index_proxy(self.col, ent)
    return ent
def reindex_collection(collection, sync=False, flush=False):
    """Re-index all entities from the model, mappings and aggregator cache.

    :param sync: force a synchronous index refresh.
    :param flush: delete all indexed entities for the collection first.
    """
    from aleph.logic.mapping import map_to_aggregator
    if flush:
        log.debug("[%s] Flushing...", collection)
        index.delete_entities(collection.id, sync=True)
    aggregator = get_aggregator(collection)
    for mapping in collection.mappings:
        try:
            map_to_aggregator(collection, mapping, aggregator)
        except Exception as ex:
            # More or less ignore broken models.
            # FIX: log.warn is a deprecated alias of log.warning.
            log.warning("Failed mapping [%s]: %s", mapping.id, ex)
    aggregate_model(collection, aggregator)
    index_aggregator(collection, aggregator, sync=sync)
    # Recompute collection statistics after the re-index.
    compute_collection(collection, sync=True)
def save_entityset_item(entityset, collection, entity_id, **data):
    """Change the association between an entity and an entityset. In the case
    of a profile, this may require re-indexing of the entity to update the
    associated profile_id.
    """
    item = EntitySetItem.save(entityset, entity_id,
                              collection_id=collection.id, **data)
    # Only a profile owned by this same collection affects how the entity
    # itself is indexed (via its profile_id).
    is_local_profile = (entityset.type == EntitySet.PROFILE
                        and entityset.collection_id == collection.id)
    if is_local_profile:
        from aleph.logic.profiles import profile_fragments
        aggregator = get_aggregator(collection)
        profile_fragments(collection, aggregator, entity_id=entity_id)
        index_aggregator(collection, aggregator, entity_ids=[entity_id])
        refresh_entity(collection, entity_id)
    refresh_entityset(entityset.id)
    return item
def update_entity(collection, entity_id=None, job_id=None):
    """Worker post-processing for entity changes. This action collects
    operations that should be done after each change to an entity but are
    too slow to run inside the request cycle. Update xref and aggregator,
    trigger NER and re-index."""
    from aleph.logic.xref import xref_entity
    from aleph.logic.profiles import profile_fragments
    log.info("[%s] Update entity: %s", collection, entity_id)
    entity = index.get_entity(entity_id)
    proxy = model.get_proxy(entity)
    if collection.casefile:
        # Casefile collections get cross-referenced on every edit.
        xref_entity(collection, proxy)
    aggregator = get_aggregator(collection, origin=MODEL_ORIGIN)
    profile_fragments(collection, aggregator, entity_id=entity_id)
    inline_names(aggregator, proxy)
    # Hand the entity to the processing pipeline (NER, indexing).
    pipeline_entity(collection, proxy, job_id=job_id)
def _fetch_entities(stage, collection, entity_id=None, batch=50):
    """Yield entities from the collection aggregator: everything when no
    entity_id is given, otherwise that entity plus further ids drained
    from the task queue."""
    aggregator = get_aggregator(collection)
    try:
        if entity_id is None:
            yield from aggregator
            return
        yield from aggregator.iterate(entity_id=entity_id)
        # WEIRD: Instead of indexing a single entity, this will try
        # pull a whole batch of them off the queue and do it at once.
        done = 0
        for task in stage.get_tasks(limit=batch):
            entity_id = task.payload.get('entity_id')
            for entity in aggregator.iterate(entity_id=entity_id):
                yield entity
            done += 1
        stage.mark_done(done)
    finally:
        aggregator.close()
def process_collection(collection, ingest=True, reset=False):
    """Trigger a full re-parse of all documents and re-build the search
    index from the aggregator.

    :param reset: wipe the collection before processing.
    """
    if reset:
        reset_collection(collection)
    aggregator = get_aggregator(collection)
    try:
        # Load all model entities into the aggregator under the 'db'
        # fragment, re-ingesting each one along the way if requested.
        writer = aggregator.bulk()
        for proxy in _collection_proxies(collection):
            writer.put(proxy, fragment='db')
            if ingest:
                ingest_entity(collection, proxy)
        writer.flush()
        if ingest:
            # Ingest will index entities itself; just wait for completion.
            ingest_wait(collection)
        else:
            index_entities(collection, aggregator)
    finally:
        aggregator.close()
def delete_collection(collection, keep_metadata=False, sync=False):
    """Delete a collection and all of its contents: queued jobs, aggregator
    data, indexed entities, xref results and database records.

    :param keep_metadata: retain the collection row and its permissions.
    :param sync: force synchronous index deletes.
    """
    cancel_queue(collection)
    # NOTE(review): aggregator.drop() is not wrapped in try/finally here,
    # so close() is skipped if drop() raises — confirm intent.
    aggregator = get_aggregator(collection)
    aggregator.drop()
    flush_notifications(collection, sync=sync)
    index.delete_entities(collection.id, sync=sync)
    xref_index.delete_xref(collection, sync=sync)
    # Re-use an existing deletion timestamp so repeated calls are idempotent.
    deleted_at = collection.deleted_at or datetime.utcnow()
    Mapping.delete_by_collection(collection.id)
    EntitySet.delete_by_collection(collection.id, deleted_at)
    Entity.delete_by_collection(collection.id)
    Document.delete_by_collection(collection.id)
    if not keep_metadata:
        Permission.delete_by_collection(collection.id)
        collection.delete(deleted_at=deleted_at)
    db.session.commit()
    if not keep_metadata:
        index.delete_collection(collection.id, sync=True)
        Authz.flush()
    refresh_collection(collection.id)
def index_aggregate(stage, collection, entity_id=None, sync=False):
    """Project the contents of the collections aggregator into the index."""
    aggregator = get_aggregator(collection)
    try:
        # With no entity_id, index the whole aggregator; otherwise collect
        # the requested entity plus a batch of queued ones.
        entities = aggregator
        if entity_id is not None:
            entities = list(aggregator.iterate(entity_id=entity_id))
            # WEIRD: Instead of indexing a single entity, this will try
            # pull a whole batch of them off the queue and do it at once.
            for task in stage.get_tasks(limit=50):
                entity_id = task.payload.get('entity_id')
                entities.extend(aggregator.iterate(entity_id=entity_id))
            stage.mark_done(len(entities) - 1)
        for entity in entities:
            log.debug("Index: %r", entity)
            # Invalidate cached copies before re-indexing.
            refresh_entity_id(entity.id)
        index_entities(stage, collection, entities, sync=sync)
    finally:
        aggregator.close()