Beispiel #1
0
def index(collection_id):
    collection = get_db_collection(collection_id)
    record_audit(Audit.ACT_COLLECTION, id=collection.id)
    parser = QueryParser(request.args, request.authz)
    q = Match.group_by_collection(collection.id, authz=request.authz)
    result = DatabaseQueryResult(request, q, parser=parser)
    return MatchCollectionsSerializer.jsonify_result(result)
Beispiel #2
0
def matches(collection_id, other_id):
    collection = get_db_collection(collection_id)
    record_audit(Audit.ACT_COLLECTION, id=collection.id)
    other = get_db_collection(other_id)
    record_audit(Audit.ACT_COLLECTION, id=other.id)
    parser = QueryParser(request.args, request.authz)
    q = Match.find_by_collection(collection.id, other.id)
    result = DatabaseQueryResult(request, q, parser=parser)
    return MatchSerializer.jsonify_result(result)
Beispiel #3
0
def delete_collection_content(collection_id):
    # Deleting a collection affects many associated objects and requires
    # checks, so this is done manually and in detail here.
    q = db.session.query(Collection)
    q = q.filter(Collection.id == collection_id)
    collection = q.first()
    if collection is None:
        log.error("No collection with ID: %r", collection_id)
        return

    log.info("Deleting collection [%r]: %r", collection.id, collection.label)
    deleted_at = collection.deleted_at or datetime.utcnow()
    Entity.delete_by_collection(collection_id, deleted_at=deleted_at)
    Match.delete_by_collection(collection_id, deleted_at=deleted_at)
    Permission.delete_by_collection(collection_id, deleted_at=deleted_at)
    index.delete_collection(collection_id)
    index.delete_entities(collection_id)
    collection.delete(deleted_at=deleted_at)
    db.session.commit()
Beispiel #4
0
def _xref_item(item, collection_id=None):
    """Cross-reference an entity or document, given as an indexed document."""
    name = item.get('name') or item.get('title')
    query = entity_query(item, collection_id=collection_id)
    if 'match_none' in query:
        return

    query = {
        'query': query,
        'size': 10,
        '_source': ['collection_id', 'name'],
    }
    result = search_safe(index=entities_index(), body=query)
    results = result.get('hits').get('hits')
    entity_id, document_id = None, None
    if Document.SCHEMA in item.get('schemata'):
        document_id = item.get('id')
    else:
        entity_id = item.get('id')

    dq = db.session.query(Match)
    dq = dq.filter(Match.entity_id == entity_id)
    dq = dq.filter(Match.document_id == document_id)
    if collection_id is not None:
        dq = dq.filter(Match.match_collection_id == collection_id)
    dq.delete()

    for result in results:
        source = result.get('_source', {})
        log.info("Xref [%.1f]: %s <=> %s", result.get('_score'), name,
                 source.get('name'))
        obj = Match()
        obj.entity_id = entity_id
        obj.document_id = document_id
        obj.collection_id = item.get('collection_id')
        obj.match_id = result.get('_id')
        obj.match_collection_id = source.get('collection_id')
        obj.score = result.get('_score')
        db.session.add(obj)
    db.session.commit()
Beispiel #5
0
def xref_item(item, collection_id=None):
    """Cross-reference an entity or document, given as an indexed document."""
    name = item.get('name') or item.get('title')
    result = es.search(index=es_index,
                       doc_type=TYPE_ENTITY,
                       body={
                           'query': entity_query(item, collection_id),
                           'size': 10,
                           '_source': ['collection_id', 'name'],
                       })
    results = result.get('hits').get('hits')
    entity_id, document_id = None, None
    if item.get('$type') == TYPE_DOCUMENT:
        document_id = item.get('id')
    else:
        entity_id = item.get('id')

    dq = db.session.query(Match)
    dq = dq.filter(Match.entity_id == entity_id)
    dq = dq.filter(Match.document_id == document_id)
    if collection_id is not None:
        dq = dq.filter(Match.match_collection_id == collection_id)
    dq.delete()

    for result in results:
        source = result.get('_source', {})
        log.info("Xref [%.1f]: %s <=> %s", result.get('_score'), name,
                 source.get('name'))
        obj = Match()
        obj.entity_id = entity_id
        obj.document_id = document_id
        obj.collection_id = item.get('collection_id')
        obj.match_id = result.get('_id')
        obj.match_collection_id = source.get('collection_id')
        obj.score = result.get('_score')
        db.session.add(obj)
    db.session.commit()
Beispiel #6
0
def xref_collection(collection_id, against_collection_ids=None):
    """Cross-reference all the entities and documents in a collection."""
    matchable = [s.name for s in model if s.matchable]
    entities = iter_proxies(collection_id=collection_id, schemata=matchable)
    for entity in entities:
        proxy = model.get_proxy(entity)
        dq = db.session.query(Match)
        dq = dq.filter(Match.entity_id == proxy.id)
        dq.delete()
        matches = xref_item(proxy, collection_ids=against_collection_ids)
        for (score, other_id, other) in matches:
            log.info("Xref [%.3f]: %s <=> %s", score, proxy, other)
            obj = Match()
            obj.entity_id = proxy.id
            obj.collection_id = collection_id
            obj.match_id = other.id
            obj.match_collection_id = other_id
            obj.score = score
            db.session.add(obj)
        db.session.commit()
Beispiel #7
0
def index(collection_id):
    """
    ---
    get:
      summary: Fetch cross-reference summary
      description: >-
        Fetch cross-reference matches grouped by collection, for entities in
        the collection with id `collection_id`
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                type: object
                allOf:
                - $ref: '#/components/schemas/QueryResponse'
                properties:
                  results:
                    type: array
                    items:
                      $ref: '#/components/schemas/XrefCollection'
      tags:
      - Xref
      - Collection
    """
    collection = get_db_collection(collection_id)
    parser = QueryParser(request.args, request.authz)
    q = Match.group_by_collection(collection.id, authz=request.authz)
    result = DatabaseQueryResult(request, q, parser=parser)
    return MatchCollectionsSerializer.jsonify_result(result)
Beispiel #8
0
def xref_collection(collection_id, against_collection_ids=None):
    """Cross-reference all the entities and documents in a collection."""
    matchable = [s.name for s in model if s.matchable]
    entities = iter_proxies(collection_id=collection_id, schemata=matchable)
    for entity in entities:
        proxy = model.get_proxy(entity)
        dq = db.session.query(Match)
        dq = dq.filter(Match.entity_id == proxy.id)
        dq.delete()
        matches = xref_item(proxy, collection_ids=against_collection_ids)
        for (score, other_id, other) in matches:
            log.info("Xref [%.3f]: %s <=> %s", score, proxy, other)
            obj = Match()
            obj.entity_id = proxy.id
            obj.collection_id = collection_id
            obj.match_id = other.id
            obj.match_collection_id = other_id
            obj.score = score
            db.session.add(obj)
        db.session.commit()
Beispiel #9
0
def xref_item(stage, collection, entity_id=None, against_collection_ids=None):
    "Cross-reference an entity against others to generate potential matches."
    entity_ids = [entity_id]
    # This is running as a background job. In order to avoid running each
    # entity one by one, we do it 101 at a time. This avoids sending redudant
    # queries to the database and elasticsearch, making cross-ref much faster.
    for task in stage.get_tasks(limit=50):
        entity_ids.append(task.payload.get('entity_id'))
    stage.mark_done(len(entity_ids) - 1)
    # log.debug("Have %d entity IDs for xref", len(entity_ids))
    for data in entities_by_ids(entity_ids, includes=['schema', 'properties']):
        proxy = model.get_proxy(data)
        # log.info("XRef: %r", proxy)
        matches = xref_query_item(proxy, collection_ids=against_collection_ids)
        for (score, other_id, other) in matches:
            log.info("Xref [%.3f]: %s <=> %s", score, proxy, other)
            obj = Match()
            obj.entity_id = proxy.id
            obj.collection_id = collection.id
            obj.match_id = other.id
            obj.match_collection_id = other_id
            obj.score = score
            db.session.add(obj)
        db.session.commit()
Beispiel #10
0
def index(collection_id):
    collection = get_db_collection(collection_id)
    parser = QueryParser(request.args, request.authz)
    q = Match.group_by_collection(collection.id, authz=request.authz)
    result = DatabaseQueryResult(request, q, parser=parser)
    return MatchCollectionsSerializer.jsonify_result(result)
Beispiel #11
0
def delete_bulk_entities(collection_id, deleted_at=None):
    deleted_at = deleted_at or datetime.utcnow()
    log.info("Deleting entities...")
    index.delete_entities(collection_id, bulk_only=True)
    Match.delete_by_collection(collection_id, deleted_at=deleted_at)
Beispiel #12
0
def generate_excel(collection, authz, links=True, one_sheet=False):

    limit = 1000

    output = StringIO.StringIO()
    workbook = xlsxwriter.Workbook(output)
    workbook.link_format = workbook.add_format({
        'font_color': 'blue',
        'underline': True
    })
    workbook.header_format = workbook.add_format({
        'font_color': 'white',
        'fg_color': '#982022',
        'bold': True
    })

    # Write the summary worksheet (Collection names and count)
    sheet = workbook.add_worksheet('Summary')
    sheet.set_zoom(125)
    title = 'Cross-referencing: %s' % collection.label
    sheet.merge_range(0, 0, 0, 2, title, workbook.header_format)
    sheet.write(1, 0, 'Collection', workbook.header_format)
    sheet.write(1, 1, 'Matches', workbook.header_format)
    if not one_sheet:
        sheet.write(1, 2, 'Details', workbook.header_format)
    sheet.set_column(2, 2, 20)
    sheet.freeze_panes(1, 0)

    # Query for all the collections with matches
    collections = Match.group_by_collection(collection.id, authz=authz)
    max_label = 70
    offset = 2  # Number of header rows
    for row, result in enumerate(collections, 2):
        if links:
            url = collection_url(result.collection.id)
            sheet.write_url(row, 0, url, workbook.link_format,
                            result.collection.label)
        else:
            sheet.write_string(row, 0, result.collection.label)
        max_label = max(max_label, len(result.collection.label))
        sheet.set_column(0, 0, float(max_label))
        sheet.write_number(row, 1, result.matches)

        if not one_sheet:
            matches_sheet_name = make_excel_safe_name(result.collection)
            matches_sheet = workbook.add_worksheet(matches_sheet_name)
            url = "internal:'%s'!B3" % matches_sheet_name
            sheet.write_url(row, 2, url, workbook.link_format, 'See matches')

        try:
            matches_sheet
        except NameError:
            matches_sheet = workbook.add_worksheet("All matches")

        matches_sheet = generate_matches_sheet(workbook,
                                               matches_sheet,
                                               collection,
                                               result.collection,
                                               authz,
                                               links=links,
                                               one_sheet=one_sheet,
                                               offset=offset,
                                               limit=limit)

        if one_sheet:
            if result.matches > limit:
                offset = offset + limit
            else:
                offset = offset + result.matches

    workbook.close()
    output.seek(0)
    return output
Beispiel #13
0
def generate_matches_sheet(workbook,
                           sheet,
                           collection,
                           match_collection,
                           authz,
                           links=True,
                           one_sheet=False,
                           offset=0,
                           limit=1000):
    from aleph.views.serializers import MatchSchema

    if one_sheet:
        sheet_label = "All matches (top %s per collection)" % limit
    else:
        sheet_label = "%s (top %s)" % (match_collection.label, limit)

    sheet.set_zoom(125)
    parser = QueryParser({}, authz, limit=limit)
    q_match = Match.find_by_collection(collection.id, match_collection.id)
    matches = MatchQueryResult({}, q_match, parser=parser, schema=MatchSchema)

    if offset < 3:

        sheet.write(0, 0, '', workbook.header_format)
        sheet.write(1, 0, 'Score', workbook.header_format)
        sheet.merge_range(0, 1, 0, 4, collection.label, workbook.header_format)
        sheet.write(1, 1, 'Name', workbook.header_format)
        sheet.write(1, 2, 'Type', workbook.header_format)
        sheet.write(1, 3, 'Country', workbook.header_format)
        sheet.write(1, 4, 'Source URL', workbook.header_format)
        sheet.merge_range(0, 5, 0, 8, sheet_label, workbook.header_format)
        sheet.write(1, 5, 'Name', workbook.header_format)
        sheet.write(1, 6, 'Type', workbook.header_format)
        sheet.write(1, 7, 'Country', workbook.header_format)
        if one_sheet:
            sheet.write(1, 8, 'Collection', workbook.header_format)

        sheet.freeze_panes(2, 0)
        sheet.autofilter(1, 1, 2 + len(matches.results), 8)

    widths = {}
    for row, result in enumerate(matches.results, offset):
        sheet.write_number(row, 0, int(result.score))
        name = result.entity.get('name')
        widths[1] = max(widths.get(1, 0), len(name))
        if links:
            url = entity_url(result.entity_id)
            sheet.write_url(row, 1, url, workbook.link_format, name)
        else:
            sheet.write_string(row, 1, name)
        schema = model.get(result.entity['schema'])
        sheet.write_string(row, 2, schema.label)
        countries = ', '.join(sorted(result.entity.get('countries', [])))
        sheet.write_string(row, 3, countries.upper())
        ent_props = result.entity.get('properties', {})
        if (ent_props.get('sourceUrl') is not None):
            source_url = ', '.join(ent_props.get('sourceUrl'))
        else:
            source_url = ''
        sheet.write_string(row, 4, source_url)

        name = result.match.get('name')
        widths[5] = max(widths.get(5, 0), len(name))
        if links:
            url = entity_url(result.match_id)
            sheet.write_url(row, 5, url, workbook.link_format, name)
        else:
            sheet.write_string(row, 5, name)
        schema = model.get(result.match['schema'])
        sheet.write_string(row, 6, schema.label)
        countries = ', '.join(sorted(result.match.get('countries', [])))
        sheet.write_string(row, 7, countries.upper())
        if one_sheet:
            sheet.write_string(row, 8, match_collection.label)

    for idx, max_len in widths.items():
        max_len = min(70, max(7, max_len + 1))
        sheet.set_column(idx, idx, float(max_len))

    return sheet
Beispiel #14
0
def delete_bulk_entities(collection_id, deleted_at=None):
    deleted_at = deleted_at or datetime.utcnow()
    log.info("Deleting entities...")
    index.delete_entities(collection_id, bulk_only=True)
    Match.delete_by_collection(collection_id, deleted_at=deleted_at)