Example #1
def process_collection(stage,
                       collection,
                       ingest=True,
                       reset=False,
                       sync=False):
    """Trigger a full re-parse of all documents and re-build the
    search index from the aggregator."""
    ingest = ingest or reset
    if reset:
        reset_collection(collection, sync=True)
    aggregator = get_aggregator(collection)
    try:
        writer = aggregator.bulk()
        for proxy in _collection_proxies(collection):
            writer.put(proxy, fragment='db')
            stage.report_finished(1)
        writer.flush()
        if ingest:
            for proxy in aggregator:
                ingest_entity(collection, proxy, job_id=stage.job.id)
        else:
            queue_task(collection,
                       OP_INDEX,
                       job_id=stage.job.id,
                       context={'sync': sync})
    finally:
        aggregator.close()
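Every example on this page funnels work through queue_task, whose exact signature differs between aleph versions. As a rough, self-contained sketch (not the real implementation), a signature compatible with all of the call shapes shown here might look like this:

import json
import uuid

def queue_task(collection, operation, job_id=None, payload=None,
               context=None, **kwargs):
    """Sketch only: describe a task and hand it to a work queue.
    The real aleph implementation dispatches via servicelayer jobs;
    here, extra keyword arguments are folded into the payload."""
    task = {
        'collection_id': getattr(collection, 'id', None),
        'operation': operation,
        'job_id': job_id or uuid.uuid4().hex,
        'payload': {**(payload or {}), **kwargs},
        'context': context or {},
    }
    return json.dumps(task)  # stand-in for pushing onto a queue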
Example #2
def generate(collection_id):
    data = parse_request(XrefSchema)
    collection = get_db_collection(collection_id, request.authz.WRITE)
    against = ensure_list(data.get("against_collection_ids"))
    payload = {'against_collection_ids': against}
    queue_task(collection, OP_XREF, payload=payload)
    return jsonify({'status': 'accepted'}, status=202)
Example #3
def export(collection_id):
    """
    ---
    post:
      summary: Download cross-reference results
      description: Download results of cross-referencing as an Excel file
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      responses:
        '202':
          description: Accepted
      tags:
      - Xref
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.READ)
    label = "%s - Cross-reference results" % collection.label
    export = create_export(
        operation=OP_EXPORT_XREF_RESULTS,
        role_id=request.authz.id,
        label=label,
        collection=collection,
        mime_type=XLSX,
    )
    job_id = get_session_id()
    payload = {
        "collection_id": collection_id,
        "export_id": export.id,
    }
    queue_task(None, OP_EXPORT_XREF_RESULTS, job_id=job_id, payload=payload)
    return ("", 202)
Example #4
def flush(collection_id, mapping_id):
    """Flush all entities loaded by mapping with id `mapping_id`.
    ---
    post:
      summary: Flush entities loaded by a mapping
      parameters:
      - description: The collection id.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      - description: The mapping id.
        in: path
        name: mapping_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    queue_task(collection,
               OP_FLUSH_MAPPING,
               job_id=get_session_id(),
               payload={'mapping_id': mapping.id})
    return ('', 202)
Example #5
def generate(collection_id):
    """
    ---
    post:
      summary: Generate cross-reference matches
      description: >
        Generate cross-reference matches for entities in a collection.
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      responses:
        '202':
          content:
            application/json:
              schema:
                properties:
                  status:
                    description: accepted
                    type: string
                type: object
          description: Accepted
      tags:
      - Xref
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    queue_task(collection, OP_XREF)
    return jsonify({"status": "accepted"}, status=202)
Example #6
def reingest(collection_id):
    """
    ---
    post:
      summary: Re-ingest a collection
      description: >
        Trigger a process to re-parse the content of all documents stored
        in the collection with id `collection_id`.
      parameters:
      - description: The collection ID.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
      - in: query
        name: index
        description: Index documents while they're being processed.
        schema:
          type: boolean
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    data = {"index": get_flag("index", False)}
    queue_task(collection, OP_REINGEST, job_id=job_id, payload=data)
    return ("", 202)
Example #7
def reindex(collection_id):
    """
    ---
    post:
      summary: Re-index a collection
      description: >
        Re-index the entities in the collection with id `collection_id`
      parameters:
      - description: The collection ID.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
      - in: query
        description: Delete the index before re-generating it.
        name: flush
        schema:
          type: boolean
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    data = {"flush": get_flag("flush", False)}
    queue_task(collection, OP_REINDEX, job_id=job_id, payload=data)
    return ("", 202)
Example #8
def export():
    """
    ---
    post:
      summary: Download the results of a search
      description: >-
        Downloads all the results of a search as a zip archive, up to a max of
        10,000 results. The returned file will contain an Excel document with
        structured data as well as the binary files from all matching
        documents.

        Supports the same query parameters as the search API.
      responses:
        '202':
          description: Accepted
      tags:
      - Entity
    """
    require(request.authz.logged_in)
    parser = SearchQueryParser(request.args, request.authz)
    tag_request(query=parser.text, prefix=parser.prefix)
    query = EntitiesQuery(parser)
    label = gettext("Search: %s") % query.to_text()
    export = create_export(
        operation=OP_EXPORT_SEARCH,
        role_id=request.authz.id,
        label=label,
        mime_type=ZIP,
        meta={"query": query.get_full_query()},
    )
    job_id = get_session_id()
    queue_task(None, OP_EXPORT_SEARCH, job_id=job_id, export_id=export.id)
    return ("", 202)
Example #9
def upsert_entity(data,
                  collection,
                  authz=None,
                  sync=False,
                  sign=False,
                  job_id=None):
    """Create or update an entity in the database. This has a side effect  of migrating
    entities created via the _bulk API or a mapper to a database entity in the event
    that it gets edited by the user.
    """
    from aleph.logic.profiles import profile_fragments

    entity = None
    entity_id = collection.ns.sign(data.get("id"))
    if entity_id is not None:
        entity = Entity.by_id(entity_id, collection=collection)
    if entity is None:
        role_id = authz.id if authz is not None else None
        entity = Entity.create(data, collection, sign=sign, role_id=role_id)
    else:
        entity.update(data, collection, sign=sign)
    collection.touch()

    proxy = entity.to_proxy()
    aggregator = get_aggregator(collection)
    aggregator.delete(entity_id=proxy.id)
    aggregator.put(proxy, origin=MODEL_ORIGIN)
    profile_fragments(collection, aggregator, entity_id=proxy.id)

    index.index_proxy(collection, proxy, sync=sync)
    refresh_entity(collection, proxy.id)
    queue_task(collection, OP_UPDATE_ENTITY, job_id=job_id, entity_id=proxy.id)
    return entity.id
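A hedged usage sketch for upsert_entity inside a request handler; the schema and property values are illustrative, and `collection` and `request` come from the surrounding aleph context:

data = {
    'schema': 'Person',
    'properties': {'name': ['Jane Doe']},  # illustrative entity data
}
entity_id = upsert_entity(data, collection, authz=request.authz, sync=True)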
Example #10
def xref_collection(stage, collection):
    """Cross-reference all the entities and documents in a collection."""
    index.delete_xref(collection)
    matchable = [s.name for s in model if s.matchable]
    entities = iter_entities(collection_id=collection.id, schemata=matchable)
    for entity in entities:
        queue_task(collection, OP_XREF_ITEM, job_id=stage.job.id,
                   payload={'entity_id': entity.get('id')})
Example #11
def item_update(entityset_id):
    """Add an item to the entity set with id `entityset_id`, or change
    the item's judgement.

    To delete an item from the entity set, apply the judgement: `no_judgement`.
    ---
    post:
      summary: Add item to an entityset
      parameters:
      - description: The entityset id.
        in: path
        name: entityset_id
        required: true
        schema:
          type: string
        example: 3a0d91ece2dce88ad3259594c7b642485235a048
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/EntitySetItemUpdate'
      responses:
        '200':
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/EntitySetItem'
          description: OK
        '204':
          description: Item removed
      tags:
      - EntitySetItem
    """
    entityset = get_entityset(entityset_id, request.authz.WRITE)
    data = parse_request("EntitySetItemUpdate")
    entity = data.pop("entity", {})
    entity_id = data.pop("entity_id", entity.get("id"))
    entity = get_index_entity(entity_id, request.authz.READ)
    collection = get_db_collection(entity["collection_id"])
    data["added_by_id"] = request.authz.id
    data.pop("collection", None)
    item = save_entityset_item(entityset, collection, entity_id, **data)
    db.session.commit()
    job_id = get_session_id()
    queue_task(collection, OP_UPDATE_ENTITY, job_id=job_id, entity_id=entity_id)
    if item is not None:
        # The entityset is needed to check if the item is writeable in the serializer:
        item = item.to_dict(entityset=entityset)
    else:
        item = {
            "id": "$".join((entityset_id, entity_id)),
            "entityset_id": entityset_id,
            "entityset_collection_id": entityset.collection_id,
            "entity_id": entity_id,
            "collection_id": entity["collection_id"],
            "judgement": Judgement.NO_JUDGEMENT,
        }
    return EntitySetItemSerializer.jsonify(item)
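An illustrative request body for this endpoint, inferred from the fields the handler reads; the entity id is made up:

body = {
    'entity_id': 'deadbeefcafe',  # hypothetical entity id
    'judgement': 'positive',      # or 'no_judgement' to remove the item
}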
Example #12
def process(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    # re-process the documents
    payload = {
        'ingest': get_flag('ingest', True),
        'reset': get_flag('reset', True)
    }
    queue_task(collection, OP_PROCESS, payload=payload)
    return ('', 202)
Example #13
def process(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    # re-process the documents
    data = {'reset': get_flag('reset', True)}
    queue_task(collection, OP_PROCESS, job_id=get_session_id(), payload=data)
    collection.touch()
    db.session.commit()
    refresh_collection(collection_id)
    return ('', 202)
Example #14
def bulkload(file_name):
    """Load entities from the specified mapping file."""
    log.info("Loading bulk data from: %s", file_name)
    config = load_mapping_file(file_name)
    for foreign_id, data in config.items():
        data['foreign_id'] = foreign_id
        data['label'] = data.get('label', foreign_id)
        create_collection(data)
        collection = Collection.by_foreign_id(foreign_id)
        queue_task(collection, OP_BULKLOAD, payload=data)
Example #15
def load_mapping(stage, collection, mapping_id):
    """Flush and reload all entities generated by a mapping."""
    mapping = Mapping.by_id(mapping_id)
    if mapping is None:
        return log.error("Could not find mapping: %s", mapping_id)
    flush_mapping(stage, collection, mapping_id)
    publish(Events.LOAD_MAPPING,
            params={'collection': collection, 'table': mapping.table_id},
            channels=[collection, mapping.role],
            actor_id=mapping.role_id)
    mapper = make_mapper(collection, mapping)
    aggregator = get_aggregator(collection)
    try:
        writer = aggregator.bulk()
        entities_count = 0
        entity_ids = set()
        for idx, record in enumerate(mapper.source.records, 1):
            for entity in mapper.map(record).values():
                if entity.schema.is_a('Thing'):
                    entity.add('proof', mapping.table_id)
                entity = collection.ns.apply(entity)
                entity_ids.add(entity.id)
                entities_count += 1
                fragment = '%s-%s' % (mapping.id, idx)
                writer.put(entity, fragment=fragment)

            if idx > 0 and idx % 500 == 0:
                payload = {
                    # a set is not JSON-serializable; send a list of ids
                    'entity_ids': list(entity_ids),
                    'mapping_id': mapping.id
                }
                queue_task(collection, OP_INDEX,
                           job_id=stage.job.id,
                           payload=payload)
                entity_ids = set()
                stage.report_finished(500)
                log.info("[%s] Loaded %s records, %s entities...",
                         collection.foreign_id,
                         idx, entities_count)

        writer.flush()
        payload = {
            # a set is not JSON-serializable; send a list of ids
            'entity_ids': list(entity_ids),
            'mapping_id': mapping.id
        }
        queue_task(collection, OP_INDEX,
                   job_id=stage.job.id,
                   payload=payload)
        mapping.set_status(status=Mapping.SUCCESS)
        log.info("[%s] Mapping done (%s entities)",
                 mapping.id, entities_count)
    except Exception as exc:
        mapping.set_status(status=Mapping.FAILED, error=str(exc))
    finally:
        aggregator.close()
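The batch-dispatch pattern used above, shown in isolation: accumulate ids, flush a task for every batch_size items, then flush whatever remains once the loop ends.

def dispatch_in_batches(items, dispatch, batch_size=500):
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) >= batch_size:
            dispatch(batch)  # e.g. queue an OP_INDEX task for this chunk
            batch = []
    if batch:
        dispatch(batch)  # final partial batch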
Example #16
def bulk_load(queue, collection, config):
    """Bulk load entities from a CSV file or SQL database.

    This is done by mapping the rows in the source data to entities and links
    which can be understood by the entity index.
    """
    queries = keys_values(config, 'queries', 'query')
    for query in queries:
        bulk_load_query(queue, collection, hash_data(query), query)
    queue_task(collection, OP_INDEX)
    queue.remove()
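The config passed to bulk_load carries one or more mapping queries under a `queries` (or `query`) key. A minimal illustrative shape in the followthemoney mapping format; the URL and column names are assumptions:

config = {
    'queries': [{
        'csv_url': 'https://example.org/people.csv',  # hypothetical source
        'entities': {
            'person': {
                'schema': 'Person',
                'keys': ['id'],
                'properties': {'name': {'column': 'name'}},
            },
        },
    }],
}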
Example #17
def pairwise():
    """
    ---
    post:
      summary: Make a pairwise judgement between an entity and a match.
      description: >
        This lets a user decide if they think a given xref match is a true or
        false match. Implicitly, this might create or alter a profile in the
        collection that contains the entity.
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Pairwise'
      responses:
        '200':
          content:
            application/json:
              schema:
                properties:
                  status:
                    description: accepted
                    type: string
                  profile_id:
                    description: profile_id for `entity`.
                    type: string
                type: object
          description: Accepted
      tags:
      - Profile
    """
    data = parse_request("Pairwise")
    entity = get_index_entity(data.get("entity_id"))
    collection = get_db_collection(entity["collection_id"],
                                   request.authz.WRITE)
    match = get_index_entity(data.get("match_id"))
    match_collection = get_db_collection(match["collection_id"])
    profile = decide_pairwise(
        collection,
        entity,
        match_collection,
        match,
        judgement=data.get("judgement"),
        authz=request.authz,
    )
    job_id = get_session_id()
    queue_task(collection,
               OP_UPDATE_ENTITY,
               job_id=job_id,
               entity_id=entity.get("id"))
    profile_id = profile.id if profile is not None else None
    return jsonify({"status": "ok", "profile_id": profile_id}, status=200)
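An illustrative Pairwise request body, inferred from the fields the handler reads; both ids are made up:

body = {
    'entity_id': 'a1b2c3',    # entity in a collection you can write to
    'match_id': 'd4e5f6',     # the candidate match from xref
    'judgement': 'negative',  # the user's decision on the pair
}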
Example #18
def mapping(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    for query in keys_values(data, 'queries', 'query'):
        try:
            model.make_mapping(query)
        except InvalidMapping as invalid:
            raise BadRequest(invalid)
    queue_task(collection, OP_BULKLOAD, payload=data)
    return ('', 202)
Example #19
def xref_collection(stage, collection, against_collection_ids=None):
    """Cross-reference all the entities and documents in a collection."""
    matchable = [s.name for s in model if s.matchable]
    entities = iter_entities(collection_id=collection.id, schemata=matchable)
    for entity in entities:
        payload = {
            'entity_id': entity.get('id'),
            'against_collection_ids': against_collection_ids
        }
        queue_task(collection,
                   OP_XREF_ITEM,
                   job_id=stage.job.id,
                   payload=payload)
Example #20
def process_collection(stage, collection, ingest=True, sync=False):
    """Trigger a full re-parse of all documents and re-build the
    search index from the aggregator."""
    aggregator = get_aggregator(collection)
    for proxy in _collection_proxies(collection):
        if ingest and proxy.schema.is_a(Document.SCHEMA):
            ingest_entity(collection, proxy, job_id=stage.job.id, sync=sync)
        else:
            aggregator.put(proxy, fragment='db')
            queue_task(collection,
                       OP_INDEX,
                       job_id=stage.job.id,
                       payload={'entity_id': proxy.id},
                       context={'sync': sync})
    aggregator.close()
Example #21
def update(foreign_id=None, index=False, process=False, reset=False):
    """Re-index all the collections and entities."""
    update_roles()
    q = Collection.all(deleted=True)
    if foreign_id is not None:
        q = [get_collection(foreign_id)]
    for collection in q:
        if reset:
            reset_collection(collection, sync=True)
        refresh_collection(collection.id)
        index_collection(collection)
        if collection.deleted_at is not None:
            continue
        if index or process:
            payload = {'ingest': process}
            queue_task(collection, OP_PROCESS, payload=payload)
Example #22
def flush(collection_id, mapping_id):
    """Flush all entities loaded by mapping with id `mapping_id`.
    ---
    post:
      summary: Flush entities loaded by a mapping
      parameters:
      - description: The collection id.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      - description: The mapping id.
        in: path
        name: mapping_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
      - Mapping
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    mapping.disabled = True
    mapping.last_run_status = None
    mapping.last_run_err_msg = None
    db.session.add(mapping)
    db.session.commit()
    queue_task(
        collection,
        OP_FLUSH_MAPPING,
        job_id=get_session_id(),
        mapping_id=mapping_id,
    )
    return ("", 202)
Example #23
def trigger(collection_id, mapping_id):
    """Load entities by running the mapping with id `mapping_id`. Flushes
    previously loaded entities before loading new entities.
    ---
    post:
      summary: Load entities from a mapping
      parameters:
      - description: The collection id.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      - description: The mapping id.
        in: path
        name: mapping_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
      - Mapping
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    mapping.disabled = False
    mapping.set_status(Status.PENDING)
    db.session.commit()
    job_id = get_session_id()
    queue_task(collection,
               OP_LOAD_MAPPING,
               job_id=job_id,
               mapping_id=mapping.id)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    return MappingSerializer.jsonify(mapping, status=202)
Example #24
def export():
    """
    ---
    post:
      summary: Download the results of a search
      description: >-
        Downloads all the results of a search as a zip archive, up to a max of
        10,000 results. The returned file will contain an Excel document with
        structured data as well as the binary files from all matching
        documents.

        Supports the same query parameters as the search API.
      responses:
        '202':
          description: Accepted
      tags:
      - Entity
    """
    require(request.authz.logged_in)
    parser = SearchQueryParser(request.args, request.authz)
    parser.limit = MAX_PAGE
    tag_request(query=parser.text, prefix=parser.prefix)
    result = EntitiesQuery.handle(request, parser=parser)
    label = "Search results for query: %s" % parser.text
    export = create_export(
        operation=OP_EXPORT_SEARCH_RESULTS,
        role_id=request.authz.id,
        label=label,
        file_path=None,
        expires_after=Export.DEFAULT_EXPIRATION,
        collection=None,
        mime_type=ZIP,
    )
    job_id = get_session_id()
    payload = {
        "export_id": export.id,
        "result": result.to_dict(),
    }
    queue_task(None, OP_EXPORT_SEARCH_RESULTS, job_id=job_id, payload=payload)
    return ("", 202)
Example #25
def delete(collection_id, mapping_id):
    """Delete a mapping.
    ---
    delete:
      summary: Delete a mapping
      parameters:
      - description: The collection id.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      - description: The mapping id.
        in: path
        name: mapping_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      responses:
        '204':
          description: No Content
      tags:
      - Collection
      - Mapping
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    mapping.delete()
    db.session.commit()
    queue_task(
        collection,
        OP_FLUSH_MAPPING,
        job_id=get_session_id(),
        mapping_id=mapping_id,
    )
    return ("", 204)
Example #26
def generate(collection_id):
    """
    ---
    post:
      summary: Generate cross-reference matches
      description: >
        Generate cross-reference matches for entities in a collection.
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/XrefGenerate'
      responses:
        '202':
          content:
            application/json:
              schema:
                properties:
                  status:
                    description: accepted
                    type: string
                type: object
          description: Accepted
      tags:
      - Xref
      - Collection
    """
    data = parse_request('XrefGenerate')
    collection = get_db_collection(collection_id, request.authz.WRITE)
    against = ensure_list(data.get("against_collection_ids"))
    payload = {'against_collection_ids': against}
    queue_task(collection, OP_XREF, payload=payload)
    return jsonify({'status': 'accepted'}, status=202)
Example #27
def trigger(collection_id, mapping_id):
    """Load entities by running the mapping with id `mapping_id`. Flushes
    previously loaded entities before loading new entities.
    ---
    post:
      summary: Load entities from a mapping
      parameters:
      - description: The collection id.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      - description: The mapping id.
        in: path
        name: mapping_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
      - Mapping
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    job_id = get_session_id()
    payload = {'mapping_id': mapping.id}
    queue_task(collection, OP_LOAD_MAPPING, job_id=job_id, payload=payload)
    collection.touch()
    db.session.commit()
    return ('', 202)
Example #28
def process(collection_id):
    """
    ---
    post:
      summary: Process a collection
      description: Start processing the collection with id `collection_id`
      parameters:
      - description: The collection ID.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
      - in: query
        name: ingest
        schema:
          type: boolean
      - in: query
        name: reset
        schema:
          type: boolean
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    # re-process the documents
    data = {'reset': get_flag('reset', True)}
    queue_task(collection, OP_PROCESS, job_id=get_session_id(), payload=data)
    collection.touch()
    db.session.commit()
    refresh_collection(collection_id)
    return ('', 202)
Example #29
def xref(foreign_id, against=None):
    """Cross-reference all entities and documents in a collection."""
    collection = get_collection(foreign_id)
    against = [get_collection(c).id for c in ensure_list(against)]
    against = {'against_collection_ids': against}
    queue_task(collection, OP_XREF, payload=against)
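This reads like a command-line helper; invoked directly, a call might look like the following, where both foreign ids are assumptions:

xref('my_dataset', against=['sanctions_data', 'company_registry'])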
Example #30
def retry_exports():
    for export in Export.get_pending():
        queue_task(None, export.operation, payload={"export_id": export.id})