Example #1
def bulk_write(collection, items):
    """Write a set of entities - given as raw dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data")

        entity = model.get_proxy(item)
        if entity.id is None:
            raise InvalidData("No ID for entity")

        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}

    if len(entities):
        index.index_bulk(collection.id, entities)

    refresh_collection(collection)
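
The variants above all share one idiom: accumulate entity proxies in a dict keyed by id, merge duplicate emissions, and flush a page to the index whenever the dict reaches BULK_PAGE. A minimal, self-contained sketch of that idiom (the Entity class and flush_page function are hypothetical stand-ins for model.get_proxy and index.index_bulk, and the BULK_PAGE value is assumed):

BULK_PAGE = 500  # assumed page size

class Entity:
    def __init__(self, id, names):
        self.id = id
        self.names = set(names)

    def merge(self, other):
        # Combine duplicate emissions of the same logical entity.
        self.names.update(other.names)

def flush_page(page):
    print("indexing %d entities" % len(page))

def write_batched(items):
    entities = {}
    for item in items:
        entity = Entity(item["id"], item.get("names", []))
        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity
        if len(entities) >= BULK_PAGE:
            flush_page(entities)
            entities = {}
    if entities:
        flush_page(entities)  # don't forget the final partial page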
Example #2
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return

    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1

        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id,
                     records_index,
                     records_total,
                     entities_count)

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}

    index.index_bulk(collection.id, entities)
    refresh_collection(collection)
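
One detail worth noting in this version: progress is reported with enumerate(..., 1) and a modulo check, so a line is logged every 1000 records. Because the counter starts at 1, the records_index > 0 guard is redundant but harmless. The idiom in isolation:

# Log every N records while streaming a source of unknown length.
for records_index, record in enumerate(range(2500), 1):
    if records_index % 1000 == 0:
        print("loaded %d records..." % records_index)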
Example #3
def bulk_write(collection, entities, unsafe=False, role_id=None, index=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    now = datetime.utcnow().isoformat()
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entity_ids = set()
    for data in entities:
        if not is_mapping(data):
            raise InvalidData("Failed to read input data", errors=data)
        entity = model.get_proxy(data)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if not unsafe:
            entity = remove_checksums(entity)
        entity.context = {
            'role_id': role_id,
            'created_at': now,
            'updated_at': now,
        }
        writer.put(entity, origin='bulk')
        if index and len(entity_ids) < MAX_PAGE:
            entity_ids.add(entity.id)
    writer.flush()
    if index:
        if len(entity_ids) >= MAX_PAGE:
            entity_ids = None
        index_aggregator(collection, aggregator, entity_ids=entity_ids)
        refresh_collection(collection.id)
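
The entity_ids bookkeeping above is a capping trick: ids are collected only while the set stays under MAX_PAGE, and if the cap is hit the set is discarded (entity_ids = None) so index_aggregator re-indexes the whole collection rather than receiving an unbounded id list. The decision in isolation, with an assumed MAX_PAGE value:

MAX_PAGE = 9999  # assumed cap

def plan_reindex(written_ids):
    # Return the ids to reindex, or None to mean "reindex everything".
    entity_ids = set()
    for entity_id in written_ids:
        if len(entity_ids) < MAX_PAGE:
            entity_ids.add(entity_id)
    if len(entity_ids) >= MAX_PAGE:
        # We stopped collecting, so the set is incomplete; fall back
        # to a full reindex instead of passing a partial list.
        return None
    return entity_ids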
Example #4
def bulk_write(collection,
               entities,
               safe=False,
               role_id=None,
               mutable=True,
               index=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entity_ids = set()
    for data in entities:
        entity = model.get_proxy(data, cleaned=False)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if safe:
            entity = remove_checksums(entity)
        entity.context = {"role_id": role_id, "mutable": mutable}
        for field in ("created_at", "updated_at"):
            timestamp = data.get(field)
            if timestamp is not None:
                dt = registry.date.to_datetime(timestamp)
                if dt is not None:
                    entity.context[field] = dt.isoformat()
        writer.put(entity, origin="bulk")
        if index and len(entity_ids) < MAX_PAGE:
            entity_ids.add(entity.id)
    writer.flush()
    if index:
        if len(entity_ids) >= MAX_PAGE:
            entity_ids = None
        index_aggregator(collection, aggregator, entity_ids=entity_ids)
        refresh_collection(collection.id)
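
This variant additionally honours caller-supplied created_at/updated_at values, parsed through followthemoney's registry.date. A rough stdlib-only equivalent (datetime.fromisoformat stands in for registry.date.to_datetime, which accepts far more formats):

from datetime import datetime

def apply_timestamps(context, data):
    # Copy caller-supplied timestamps into the entity context,
    # normalised to ISO strings; unparseable values are ignored.
    for field in ("created_at", "updated_at"):
        timestamp = data.get(field)
        if timestamp is None:
            continue
        try:
            dt = datetime.fromisoformat(timestamp)
        except (TypeError, ValueError):
            continue
        context[field] = dt.isoformat()
    return context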
Example #5
def cancel(collection_id):
    """
    ---
    delete:
      summary: Cancel processing of a collection
      description: >
        Cancel all queued tasks for the collection with id `collection_id`
      parameters:
      - description: The collection ID.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
      responses:
        '204':
          description: No Content
      tags:
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    cancel_queue(collection)
    refresh_collection(collection_id)
    return ("", 204)
Example #6
def bulk_write(collection, items, merge=True):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)

        entity = model.get_proxy(item)
        entity = namespace.apply(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)

        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}

    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)

    refresh_collection(collection)
Example #7
def bulk(id):
    collection = get_db_collection(id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    merge = get_flag('merge', default=False)
    entities = ensure_list(request.get_json(force=True))
    bulk_write(collection, entities, merge=merge)
    refresh_collection(id)
    return ('', 204)
Example #8
File: entities.py Project: sudhu26/aleph
def create_entity(data, collection, role=None, sync=False):
    entity = Entity.create(data, collection)
    collection.touch()
    db.session.commit()
    index.index_entity(entity, sync=sync)
    refresh_entity(entity.signed_id, sync=sync)
    refresh_collection(collection.id, sync=sync)
    return entity.signed_id
Example #9
def refresh_entity(entity, sync=False):
    if is_mapping(entity):
        entity_id = entity.get('id')
        collection_id = entity.get('collection_id')
    else:
        entity_id = entity.id
        collection_id = entity.collection_id
    cache.kv.delete(cache.object_key(Entity, entity_id))
    refresh_collection(collection_id, sync=sync)
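
refresh_entity accepts either a plain dict or an ORM object, using is_mapping (from the banal helper library) to decide how to read the fields. The dispatch idiom on its own:

from banal import is_mapping

def entity_fields(entity):
    # Accept either a raw mapping or an object with attributes.
    if is_mapping(entity):
        return entity.get("id"), entity.get("collection_id")
    return entity.id, entity.collection_id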
Example #10
def process(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    # re-process the documents
    data = {'reset': get_flag('reset', True)}
    queue_task(collection, OP_PROCESS, job_id=get_session_id(), payload=data)
    collection.touch()
    db.session.commit()
    refresh_collection(collection_id)
    return ('', 202)
Example #11
def index_aggregate(stage,
                    collection,
                    sync=False,
                    entity_ids=None,
                    mapping_id=None):
    """Project the contents of the collections aggregator into the index."""
    entities = _fetch_entities(stage, collection, entity_ids=entity_ids)
    entities = (_process_entity(e, sync=sync) for e in entities)
    extra = {'job_id': stage.job.id, 'mapping_id': mapping_id}
    index_bulk(collection, entities, extra, sync=sync)
    refresh_collection(collection.id, sync=sync)
Example #12
def index_aggregate(queue, collection, sync=False):
    """Project the contents of the collections aggregator into the index."""
    aggregator = get_aggregator(collection)
    try:
        index_entities(collection, aggregator, sync=sync)
        refresh_collection(collection.id, sync=sync)
        index_collection(collection, sync=sync)
        log.info("Aggregate indexed: %r", collection)
    finally:
        aggregator.close()
        queue.remove()
Example #13
File: manage.py Project: SinSiXX/aleph
def crawldir(path, language=None, foreign_id=None):
    """Crawl the given directory."""
    path = Path(path)
    if foreign_id is None:
        foreign_id = 'directory:%s' % slugify(path)
    authz = Authz.from_role(Role.load_cli_user())
    config = {'foreign_id': foreign_id, 'label': path.name, 'casefile': False}
    create_collection(config, authz)
    collection = Collection.by_foreign_id(foreign_id)
    log.info('Crawling %s to %s (%s)...', path, foreign_id, collection.id)
    crawl_directory(collection, path)
    log.info('Complete. Make sure a worker is running :)')
    refresh_collection(collection.id)
Example #14
def index_many(stage, collection, sync=False, entity_ids=None, batch=BATCH_SIZE):
    """Project the contents of the collections aggregator into the index."""
    if entity_ids is not None:
        entity_ids = ensure_list(entity_ids)
        # WEIRD: Instead of indexing a single entity, this will try
        # pull a whole batch of them off the queue and do it at once.
        tasks = stage.get_tasks(limit=max(1, batch - len(entity_ids)))
        for task in tasks:
            entity_ids.extend(ensure_list(task.payload.get("entity_ids")))
        stage.mark_done(len(tasks))
    aggregator = get_aggregator(collection)
    index_aggregator(collection, aggregator, entity_ids=entity_ids, sync=sync)
    refresh_collection(collection.id)
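
The batching marked WEIRD above drains extra queued tasks so that many single-entity index requests collapse into one bulk call. The same shape against a hypothetical in-memory queue (stage.get_tasks and stage.mark_done are approximated by popleft and a counter):

from collections import deque

BATCH_SIZE = 100  # assumed batch size

def drain_batch(queue, first_ids):
    # Collect up to BATCH_SIZE entity ids by pulling further tasks.
    entity_ids = list(first_ids)
    done = 0
    while queue and len(entity_ids) < BATCH_SIZE:
        task = queue.popleft()
        entity_ids.extend(task.get("entity_ids", []))
        done += 1
    return entity_ids, done  # "done" tasks must then be acknowledged

queue = deque([{"entity_ids": ["a"]}, {"entity_ids": ["b", "c"]}])
print(drain_batch(queue, ["seed"]))  # (['seed', 'a', 'b', 'c'], 2)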
Example #15
def bulk(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())

    # This will disable checksum security measures in order to allow bulk
    # loading of document data.
    unsafe = get_flag('unsafe', default=False)
    unsafe = unsafe and request.authz.is_admin

    entities = ensure_list(request.get_json(force=True))
    bulk_write(collection, entities, unsafe=unsafe)
    refresh_collection(collection_id)
    return ('', 204)
Example #16
def bulk_write(collection, entities, job_id=None, unsafe=False):
    """Write a set of entities - given as dicts - to the index."""
    def _generate():
        for data in entities:
            if not is_mapping(data):
                raise InvalidData("Failed to read input data", errors=data)
            entity = model.get_proxy(data)
            if not unsafe:
                entity = remove_checksums(entity)
            yield _process_entity(entity)

    index_bulk(collection, _generate(), job_id=job_id)
    refresh_collection(collection.id)
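
Unlike the dict-based variants, this version streams entities through a generator, so only one proxy is materialised at a time and paging is left entirely to index_bulk. A sketch of why that matters (hypothetical transform and page size):

def generate(records, transform):
    # Lazily yield processed records; nothing accumulates in memory,
    # so the consumer controls paging and backpressure.
    for data in records:
        yield transform(data)

def consume_in_pages(stream, page_size=500):
    page = []
    for item in stream:
        page.append(item)
        if len(page) >= page_size:
            print("flush %d" % len(page))
            page.clear()
    if page:
        print("flush %d" % len(page))

consume_in_pages(generate(range(1200), lambda x: x * 2))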
Example #17
def bulk(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    merge = get_flag('merge', default=False)

    # This will disable certain security measures in order to allow bulk
    # loading of document data.
    unsafe = get_flag('unsafe', default=False)
    unsafe = unsafe and request.authz.is_admin

    entities = ensure_list(request.get_json(force=True))
    bulk_write(collection, entities, merge=merge, unsafe=unsafe)
    refresh_collection(collection_id)
    return ('', 204)
Example #18
File: manage.py Project: wdsn/aleph
def update(foreign_id=None, index=False, process=False, reset=False):
    """Re-index all the collections and entities."""
    update_roles()
    q = Collection.all(deleted=True)
    if foreign_id is not None:
        q = [get_collection(foreign_id)]
    for collection in q:
        if reset:
            reset_collection(collection, sync=True)
        refresh_collection(collection.id)
        index_collection(collection)
        if collection.deleted_at is not None:
            continue
        if index or process:
            payload = {'ingest': process}
            queue_task(collection, OP_PROCESS, payload=payload)
Example #19
def mapping(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    for query in keys_values(data, 'queries', 'query'):
        try:
            model.make_mapping(query)
        except InvalidMapping as invalid:
            raise BadRequest(invalid)
    queue_task(collection, OP_BULKLOAD, job_id=get_session_id(), payload=data)
    collection.touch()
    db.session.commit()
    refresh_collection(collection_id)
    return ('', 202)
Example #20
def bulk_write(collection, entities, job_id=None, unsafe=False):
    """Write a set of entities - given as dicts - to the index."""

    # This is called mainly by the /api/2/collections/X/_bulk API.
    def _generate():
        for data in entities:
            if not is_mapping(data):
                raise InvalidData("Failed to read input data", errors=data)
            entity = model.get_proxy(data)
            if entity.id is None:
                raise InvalidData("No ID for entity", errors=entity.to_dict())
            if not unsafe:
                entity = remove_checksums(entity)
            yield _process_entity(entity)

    index_bulk(collection, _generate(), {'job_id': job_id})
    refresh_collection(collection.id)
Example #21
def index_entities(stage, collection, iterable, sync=False):
    entities = []
    for entity in iterable:
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())

        tag_entity(entity)
        entities.append(entity)
        if len(entities) >= BULK_PAGE:
            stage.report_finished(len(entities))
            index_bulk(collection, entities, job_id=stage.job.id, sync=sync)
            entities = []

    if len(entities):
        stage.report_finished(len(entities))
        index_bulk(collection, entities, job_id=stage.job.id, sync=sync)
    refresh_collection(collection)
Example #22
File: entities.py Project: sudhu26/aleph
def delete_entity(entity, deleted_at=None, sync=False):
    # This is recursive and will also delete any entities which
    # reference the given entity. Usually this is going to be child
    # documents, or directorships referencing a person. It's a pretty
    # dangerous operation, though.
    for adjacent in index.iter_adjacent(entity):
        log.warning("Recursive delete: %r", adjacent)
        delete_entity(adjacent, deleted_at=deleted_at, sync=sync)
    flush_notifications(entity.get('id'), clazz=Entity)
    obj = Entity.by_id(entity.get('id'))
    if obj is not None:
        obj.delete(deleted_at=deleted_at)
    doc = Document.by_id(entity.get('id'))
    if doc is not None:
        doc.delete(deleted_at=deleted_at)
    index.delete_entity(entity.get('id'), sync=sync)
    refresh_entity(entity.get('id'), sync=sync)
    refresh_collection(entity.get('collection_id'), sync=sync)
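
The cascade above deletes every entity returned by index.iter_adjacent before the target itself, but nothing here guards against reference cycles. A cycle-safe variant would carry a set of visited ids; a sketch with a hypothetical adjacency map:

def delete_recursive(entity_id, adjacent_ids, seen=None):
    # Track visited ids so mutually-referencing entities
    # cannot recurse forever.
    seen = set() if seen is None else seen
    if entity_id in seen:
        return
    seen.add(entity_id)
    for other in adjacent_ids.get(entity_id, []):
        delete_recursive(other, adjacent_ids, seen)
    print("delete %s" % entity_id)

delete_recursive("a", {"a": ["b"], "b": ["a"]})  # terminates despite the cycle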
Example #23
def upsert_entity(data, collection, sync=False):
    entity = None
    entity_id = collection.ns.sign(data.get('id'))
    if entity_id is not None:
        entity = Entity.by_id(entity_id,
                              collection=collection,
                              deleted=True)
    # TODO: migrate softly from index.
    if entity is None:
        entity = Entity.create(data, collection)
    else:
        entity.update(data, collection)
    collection.touch()
    db.session.commit()
    delete_aggregator_entity(collection, entity.id)
    index.index_entity(entity, sync=sync)
    refresh_entity(entity.id, sync=sync)
    refresh_collection(collection.id, sync=sync)
    return entity.id
Example #24
def index_entities(collection, iterable, sync=False):
    queue = get_queue(collection, OP_INDEX)
    queue.progress.mark_pending(len(iterable))
    entities = []
    for entity in iterable:
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())

        tag_entity(entity)
        entities.append(entity)
        if len(entities) >= BULK_PAGE:
            queue.progress.mark_finished(len(entities))
            index_bulk(collection, entities, sync=sync)
            entities = []

    if len(entities):
        queue.progress.mark_finished(len(entities))
        index_bulk(collection, entities, sync=sync)
    refresh_collection(collection)
Example #25
File: entities.py Project: gunda01/aleph
def upsert_entity(data, collection, validate=True, sync=False):
    """Create or update an entity in the database. This has a side hustle
    of migrating entities created via the _bulk API or a mapper to a
    database entity in the event that it gets edited by the user.
    """
    entity = None
    entity_id = collection.ns.sign(data.get('id'))
    if entity_id is not None:
        entity = Entity.by_id(entity_id, collection=collection, deleted=True)
    # TODO: migrate softly from index.
    if entity is None:
        entity = Entity.create(data, collection, validate=validate)
    else:
        entity.update(data, collection, validate=validate)
    collection.touch()
    db.session.commit()
    delete_aggregator_entity(collection, entity.id)
    index.index_entity(entity, sync=sync)
    refresh_entity(entity.id, sync=sync)
    refresh_collection(collection.id, sync=sync)
    return entity.id
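
The upsert pattern in these variants is: sign the incoming id against the collection namespace, look the entity up (including soft-deleted rows), then create or update. The bare decision, with an in-memory dict standing in for the Entity table and a generated id standing in for namespace signing:

import uuid

STORE = {}  # entity_id -> data; stands in for the Entity table

def upsert(entity_id, data):
    # Create the record if unknown, otherwise merge the update; the
    # real code also signs the id and commits a database session.
    if entity_id is None:
        entity_id = uuid.uuid4().hex
    if entity_id not in STORE:
        STORE[entity_id] = dict(data)
    else:
        STORE[entity_id].update(data)
    return entity_id

eid = upsert(None, {"name": "ACME"})   # create
upsert(eid, {"country": "us"})         # update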
Example #26
File: entities.py Project: djoffrey/aleph
def delete_entity(collection, entity, deleted_at=None, sync=False):
    # This is recursive and will also delete any entities which
    # reference the given entity. Usually this is going to be child
    # documents, or directorships referencing a person. It's a pretty
    # dangerous operation, though.
    entity_id = collection.ns.sign(entity.get('id'))
    for adjacent in index.iter_adjacent(entity):
        log.warning("Recursive delete: %r", adjacent)
        delete_entity(collection, adjacent, deleted_at=deleted_at, sync=sync)
    flush_notifications(entity_id, clazz=Entity)
    obj = Entity.by_id(entity_id, collection=collection)
    if obj is not None:
        obj.delete(deleted_at=deleted_at)
    doc = Document.by_id(entity_id, collection=collection)
    if doc is not None:
        doc.delete(deleted_at=deleted_at)
    index.delete_entity(entity_id, sync=sync)
    Linkage.delete_by_entity(entity_id)
    Mapping.delete_by_table(entity_id)
    xref_index.delete_xref(collection, entity_id=entity_id, sync=sync)
    delete_aggregator_entity(collection, entity_id)
    refresh_entity(entity_id, sync=sync)
    refresh_collection(collection.id, sync=sync)
Example #27
File: bulk.py Project: pudo/aleph
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return

    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1

        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id,
                     records_index,
                     records_total,
                     entities_count)

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}

    index.index_bulk(collection.id, entities)
    refresh_collection(collection)
Example #28
File: bulk.py Project: pudo/aleph
def bulk_write(collection, items, merge=True, unsafe=False):
    """Write a set of entities - given as dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)

        entity = model.get_proxy(item)
        if not unsafe:
            entity = namespace.apply(entity)
            entity = remove_checksums(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)

        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}

    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)

    refresh_collection(collection)
Example #29
def process(collection_id):
    """
    ---
    post:
      summary: Process a collection
      description: Start processing the collection with id `collection_id`
      parameters:
      - description: The collection ID.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
      - in: query
        name: ingest
        schema:
          type: boolean
      - in: query
        name: reset
        schema:
          type: boolean
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    # re-process the documents
    data = {'reset': get_flag('reset', True)}
    queue_task(collection, OP_PROCESS, job_id=get_session_id(), payload=data)
    collection.touch()
    db.session.commit()
    refresh_collection(collection_id)
    return ('', 202)
Example #30
File: manage.py Project: SinSiXX/aleph
def cancel(foreign_id):
    """Cancel all queued tasks for the dataset."""
    collection = get_collection(foreign_id)
    cancel_queue(collection)
    refresh_collection(collection.id)
Example #31
File: entities.py Project: wayne9qiu/aleph
def refresh_entity(collection, entity_id):
    cache.kv.delete(cache.object_key(Entity, entity_id))
    refresh_collection(collection.id)
Example #32
File: worker.py Project: SinSiXX/aleph
def after_task(self, task):
    if task.job.is_done():
        collection = Collection.by_foreign_id(task.job.dataset.name)
        if collection is not None:
            refresh_collection(collection.id)
        task.job.remove()
Example #33
File: worker.py Project: butilities/aleph
def cleanup_job(self, job):
    if job.is_done():
        collection = Collection.by_foreign_id(job.dataset.name)
        if collection is not None:
            refresh_collection(collection.id)
        job.remove()