Exemple #1
0
def _iter_match_batch(batch, authz):
    """Yield one result row per match in ``batch`` that ``authz`` may read.

    The entities on both sides of every match are fetched with a single
    ``entities_by_ids`` call restricted to matchable schemata. A match is
    skipped when its collection is not readable, or when its entity, its
    counterpart or its collection cannot be found.

    Each row is a tuple of: score (as an integer percentage), then caption,
    date and country of the entity, the collection label, caption, date and
    country of the matched entity, and finally the URLs of both entities.
    """
    schema_names = [schema.name for schema in model if schema.matchable]
    wanted_ids = set()
    for item in batch:
        wanted_ids.add(item.entity_id)
        wanted_ids.add(item.match_id)

    by_id = {}
    for found in entities_by_ids(list(wanted_ids), schemata=schema_names):
        by_id[found.get('id')] = found

    for obj in batch:
        collection_id = obj.match_collection_id
        if not authz.can(collection_id, authz.READ):
            continue
        entity = by_id.get(str(obj.entity_id))
        match = by_id.get(str(obj.match_id))
        collection = get_collection(collection_id)
        if any(part is None for part in (entity, match, collection)):
            continue
        left = model.get_proxy(entity)
        right = model.get_proxy(match)
        row = [int(obj.score * 100)]
        row += [left.caption, _format_date(left), _format_country(left)]
        row.append(collection.get('label'))
        row += [right.caption, _format_date(right), _format_country(right)]
        row += [entity_url(left.id), entity_url(right.id)]
        yield tuple(row)
Exemple #2
0
def _iter_match_batch(stub, sheet, batch):
    """Append one row to ``sheet`` for every resolvable match in ``batch``.

    Collections are queued on the resolver and resolved in one go, and the
    entities on both sides of every match are fetched with a single
    ``entities_by_ids`` call restricted to matchable schemata. A match is
    skipped when its entity, its counterpart or its collection is missing.
    """
    schema_names = [schema.name for schema in model if schema.matchable]
    wanted_ids = set()
    for item in batch:
        wanted_ids.add(item.get("entity_id"))
        wanted_ids.add(item.get("match_id"))
        resolver.queue(stub, Collection, item.get("match_collection_id"))

    resolver.resolve(stub)
    by_id = {}
    for found in entities_by_ids(list(wanted_ids), schemata=schema_names):
        by_id[found.get("id")] = found

    for obj in batch:
        entity = by_id.get(str(obj.get("entity_id")))
        match = by_id.get(str(obj.get("match_id")))
        collection = resolver.get(
            stub, Collection, obj.get("match_collection_id")
        )
        if any(part is None for part in (entity, match, collection)):
            continue
        left = model.get_proxy(entity)
        right = model.get_proxy(match)
        row = [obj.get("score")]
        row += [left.caption, _format_date(left), _format_country(left)]
        row.append(collection.get("label"))
        row += [right.caption, _format_date(right), _format_country(right)]
        row += [entity_url(left.id), entity_url(right.id)]
        sheet.append(row)
Exemple #3
0
def _iter_match_batch(batch, authz):
    """Yield one result row per match in ``batch`` visible to ``authz``.

    Collections and entities are both loaded in bulk with ``authz`` passed
    to the lookup. A match is skipped when its entity, its counterpart or
    its collection label is not among the loaded results.

    Each row is a tuple of: score (as an integer percentage), then caption,
    date and country of the entity, the collection label, caption, date and
    country of the matched entity, and finally the URLs of both entities.
    """
    entity_ids = set()
    collection_ids = set()
    for item in batch:
        entity_ids.add(item.entity_id)
        entity_ids.add(item.match_id)
        collection_ids.add(item.match_collection_id)

    labels = {}
    for coll in Collection.all_by_ids(collection_ids, authz=authz):
        labels[coll.id] = coll.label

    by_id = {}
    for found in iter_entities_by_ids(list(entity_ids), authz=authz):
        by_id[found.get('id')] = found

    for obj in batch:
        entity = by_id.get(str(obj.entity_id))
        match = by_id.get(str(obj.match_id))
        label = labels.get(obj.match_collection_id)
        if any(part is None for part in (entity, match, label)):
            continue
        left = model.get_proxy(entity)
        right = model.get_proxy(match)
        row = [int(obj.score * 100)]
        row += [left.caption, _format_date(left), _format_country(left)]
        row.append(label)
        row += [right.caption, _format_date(right), _format_country(right)]
        row += [entity_url(left.id), entity_url(right.id)]
        yield tuple(row)
Exemple #4
0
def _iter_match_batch(batch, authz):
    """Generate export rows for the matches in ``batch``.

    Both sides of every match are looked up through one ``entities_by_ids``
    call limited to matchable schemata. Rows are only produced for matches
    whose collection ``authz`` can read and for which the entity, the
    matched entity and the collection were all found.

    A row is the tuple (score as integer percentage, entity caption, date,
    country, collection label, match caption, date, country, entity URL,
    match URL).
    """
    matchable_names = [schema.name for schema in model if schema.matchable]
    lookup_ids = set()
    for candidate in batch:
        lookup_ids.add(candidate.entity_id)
        lookup_ids.add(candidate.match_id)

    fetched = entities_by_ids(list(lookup_ids), schemata=matchable_names)
    index = {item.get('id'): item for item in fetched}

    for obj in batch:
        readable = authz.can(obj.match_collection_id, authz.READ)
        if not readable:
            continue
        source = index.get(str(obj.entity_id))
        target = index.get(str(obj.match_id))
        collection = get_collection(obj.match_collection_id)
        if source is None:
            continue
        if target is None or collection is None:
            continue
        source_proxy = model.get_proxy(source)
        target_proxy = model.get_proxy(target)
        percent = int(obj.score * 100)
        source_caption = source_proxy.caption
        source_date = _format_date(source_proxy)
        source_country = _format_country(source_proxy)
        label = collection.get('label')
        target_caption = target_proxy.caption
        target_date = _format_date(target_proxy)
        target_country = _format_country(target_proxy)
        source_url = entity_url(source_proxy.id)
        target_url = entity_url(target_proxy.id)
        yield (
            percent,
            source_caption,
            source_date,
            source_country,
            label,
            target_caption,
            target_date,
            target_country,
            source_url,
            target_url,
        )
Exemple #5
0
def reconcile_index(collection=None):
    """Return the reconciliation service metadata as a JSON response.

    When ``collection`` is given, the service name includes the collection
    label, entity suggestions are filtered to that collection, and the
    default types are derived from the schemata reported by
    ``get_collection_things`` instead of the whole model.
    """
    namespace = "http://rdf.freebase.com/ns/type.object.id"
    domain = settings.APP_UI_URL.strip("/")
    title = settings.APP_TITLE
    query = []
    schemata = list(model)
    if collection is not None:
        title = "%s (%s)" % (collection.get("label"), title)
        query.append(("filter:collection_id", collection.get("id")))
        things = get_collection_things(collection.get("id"))
        schemata = [model.get(name) for name in things.keys()]

    view = {"url": entity_url("{{id}}")}
    preview = {"url": entity_url("{{id}}"), "width": 800, "height": 400}
    entity_path = url_for(
        "reconcile_api.suggest_entity",
        _query=query,
        _authz=request.authz,
        _relative=True,
    )
    type_path = url_for("reconcile_api.suggest_type", _relative=True)
    property_path = url_for("reconcile_api.suggest_property", _relative=True)
    suggest = {
        "entity": {"service_url": domain, "service_path": entity_path},
        "type": {"service_url": domain, "service_path": type_path},
        "property": {"service_url": domain, "service_path": property_path},
    }
    default_types = [
        get_freebase_type(schema) for schema in schemata if schema.matchable
    ]
    return jsonify({
        "name": title,
        "identifierSpace": namespace,
        "schemaSpace": namespace,
        "view": view,
        "preview": preview,
        "suggest": suggest,
        "defaultTypes": default_types,
    })
Exemple #6
0
def reconcile_index(collection=None):
    """Return the reconciliation service metadata as a JSON response.

    When ``collection`` is given, the service name includes the collection
    label, entity suggestions are filtered to that collection, and the
    default types are derived from the schemata reported by
    ``get_collection_things`` instead of the whole model.
    """
    namespace = 'http://rdf.freebase.com/ns/type.object.id'
    domain = settings.APP_UI_URL.strip('/')
    title = settings.APP_TITLE
    query = []
    schemata = list(model)
    if collection is not None:
        title = '%s (%s)' % (collection.get('label'), title)
        query.append(('filter:collection_id', collection.get('id')))
        things = get_collection_things(collection.get('id'))
        schemata = [model.get(name) for name in things.keys()]

    view = {'url': entity_url('{{id}}')}
    preview = {'url': entity_url('{{id}}'), 'width': 800, 'height': 400}
    entity_path = url_for('reconcile_api.suggest_entity',
                          _query=query,
                          _authorize=True,
                          _relative=True)
    type_path = url_for('reconcile_api.suggest_type', _relative=True)
    property_path = url_for('reconcile_api.suggest_property', _relative=True)
    suggest = {
        'entity': {'service_url': domain, 'service_path': entity_path},
        'type': {'service_url': domain, 'service_path': type_path},
        'property': {'service_url': domain, 'service_path': property_path},
    }
    default_types = [
        get_freebase_type(schema) for schema in schemata if schema.matchable
    ]
    return jsonify({
        'name': title,
        'identifierSpace': namespace,
        'schemaSpace': namespace,
        'view': view,
        'preview': preview,
        'suggest': suggest,
        'defaultTypes': default_types,
    })
Exemple #7
0
def render_notification(stub, notification):
    """Render a notification as plain text and HTML.

    Every parameter of the notification is queued on the resolver and
    resolved in one pass; the ``{{name}}`` placeholders of the event
    template are then replaced by the title of the resolved object (quoted
    in the plain variant, passed through ``html_link`` in the HTML one).

    Returns a dict with the keys ``plain`` and ``html``, or ``None`` as
    soon as one parameter cannot be resolved.
    """
    from aleph.logic import resolver
    for _, clazz, value in notification.iterparams():
        resolver.queue(stub, clazz, value)
    resolver.resolve(stub)

    plain = str(notification.event.template)
    html = str(notification.event.template)
    for name, clazz, value in notification.iterparams():
        resolved = resolver.get(stub, clazz, value)
        if resolved is None:
            return
        title = None
        link = None
        if clazz == Role:
            title = resolved.get('label')
        elif clazz == Alert:
            title = resolved.get('query')
        elif clazz == Collection:
            title = resolved.get('label')
            link = collection_url(value)
        elif clazz == Entity:
            title = resolved.get('name')
            link = entity_url(value)

        placeholder = '{{%s}}' % name
        html = html.replace(placeholder, html_link(title, link))
        plain = plain.replace(placeholder, "'%s'" % title)
        # The parameter the event links to gets its URL appended in plain text.
        if name == notification.event.link_to:
            plain = '%s (%s)' % (plain, link)
    rendered = {'plain': plain, 'html': html}
    return rendered
Exemple #8
0
def generate_sitemap(collection_id):
    """Yield ``(url, updated_date)`` pairs for a collection's sitemap.xml.

    Scans the entities index for things in the given collection, using the
    authorization filter built from ``Authz.from_role(None)``. Entries whose
    schemata include ``Document.SCHEMA`` get a document URL, all others an
    entity URL. The date is the part of ``updated_at`` before the ``T``.
    """
    # cf. https://www.sitemaps.org/protocol.html
    filters = [
        {'term': {'collection_id': collection_id}},
        {'term': {'schemata': Entity.THING}},
        authz_query(Authz.from_role(None)),
    ]
    query = {
        'query': {'bool': {'filter': filters}},
        '_source': {'includes': ['schemata', 'updated_at']},
    }
    hits = scan(es, index=entities_index(), query=query)
    # strictly, the limit for sitemap.xml is 50,000
    for hit in islice(hits, 49500):
        doc = hit.get('_source', {})
        stamp = doc.get('updated_at', '')
        day = stamp.split('T', 1)[0]
        is_document = Document.SCHEMA in doc.get('schemata', [])
        make_url = document_url if is_document else entity_url
        yield (make_url(hit.get('_id')), day)
Exemple #9
0
 def entity_links(self, data, pk, schemata):
     """Return the API and UI links for the entity identified by ``pk``.

     ``data`` and ``schemata`` are accepted but not used here.
     """
     links = {}
     links['self'] = url_for('entities_api.view', id=pk)
     links['references'] = url_for('entities_api.references', id=pk)
     links['tags'] = url_for('entities_api.tags', id=pk)
     links['ui'] = entity_url(pk)
     return links
Exemple #10
0
def export_entities(export_id):
    """Build the zip archive for an export and hand it to ``complete_export``.

    The export's stored query (``export.meta["query"]``, defaulting to a
    match-none query) is used to iterate entity proxies. Each entity is
    written to an Excel sheet and passed to ``write_document``; the Excel
    file is added to the zip at the end. Entities whose collection cannot
    be loaded are skipped, and iteration stops once the zip file on disk
    reaches ``Export.MAX_FILE_SIZE``.

    Any exception is logged and the export is marked as failed. The
    temporary working directory is always removed.
    """
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    # Cache of collection_id -> collection so each one is loaded only once.
    collections = {}
    try:
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("query-export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_path = export_dir.joinpath(EXCEL_FILE)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for entity in iter_proxies(filters=filters):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                if file_path.stat().st_size >= Export.MAX_FILE_SIZE:
                    # ``warn`` is a deprecated alias; use ``warning``.
                    log.warning("Export too large: %r", export)
                    break

            exporter.finalize()
            zf.write(excel_path, arcname=EXCEL_FILE)
        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        # Reload the export before updating its status.
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
Exemple #11
0
def export_entities(export_id, result):
    """Write the entities in ``result`` to a zip archive and complete the export.

    ``result["results"]`` is read as a list of entity dicts. Their
    collections are queued on the resolver and resolved in one pass; each
    entity is then written to an Excel sheet and passed to
    ``write_document``. The Excel content is stored in the archive as
    ``Export.xlsx`` before ``complete_export`` is called.

    Any exception is logged and the export is marked as failed. The
    temporary working directory is always removed.
    """
    from aleph.logic import resolver

    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    try:
        entities = []
        stub = types.SimpleNamespace(result=result)
        for entity in result["results"]:
            resolver.queue(stub, Collection, entity.get("collection_id"))
            entities.append(model.get_proxy(entity))
        resolver.resolve(stub)

        file_path = export_dir.joinpath("query-export.zip")
        # The context manager closes the archive even when a write fails,
        # so the file handle is released before the directory is removed.
        with zipfile.ZipFile(file_path, "w") as zf:
            exporter = ExcelExporter(None, extra=EXTRA_HEADERS)
            for entity in entities:
                collection_id = entity.context.get("collection_id")
                collection = resolver.get(stub, Collection, collection_id)
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
            content = exporter.get_bytesio().getvalue()
            zf.writestr("Export.xlsx", content)
        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Export.STATUS_FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
Exemple #12
0
def render_notification(stub, notification):
    """Produce plain-text and HTML renderings of a notification.

    All notification parameters are first queued on the resolver and
    resolved together. The event template is then filled in by replacing
    each ``{{name}}`` placeholder with the resolved object's title.

    Returns ``{'plain': ..., 'html': ...}``, or ``None`` if any parameter
    fails to resolve.
    """
    from aleph.logic import resolver
    for _name, kind, ident in notification.iterparams():
        resolver.queue(stub, kind, ident)
    resolver.resolve(stub)

    plain = str(notification.event.template)
    html = str(notification.event.template)
    for name, clazz, value in notification.iterparams():
        data = resolver.get(stub, clazz, value)
        if data is None:
            return

        # Work out the display title and, where available, a link target.
        title, link = None, None
        if clazz == Role:
            title = data.get('label')
        elif clazz == Alert:
            title = data.get('query')
        elif clazz == Collection:
            title, link = data.get('label'), collection_url(value)
        elif clazz == Entity:
            title, link = data.get('name'), entity_url(value)

        marker = '{{%s}}' % name
        html = html.replace(marker, html_link(title, link))
        plain = plain.replace(marker, "'%s'" % title)
        if name == notification.event.link_to:
            plain = '%s (%s)' % (plain, link)
    return {'plain': plain, 'html': html}
Exemple #13
0
    def _serialize(self, obj):
        """Expand an entity dict in place for API output and return it.

        Replaces ``collection_id`` with the resolved ``collection``,
        substitutes entity-typed property values with their resolved
        entities (keeping the raw value when resolution yields nothing),
        and adds ``links``, ``latinized``, ``writeable`` and ``shallow``.
        Document entities additionally get archive links for each of the
        content, PDF and CSV hashes that is present.
        """
        pk = obj.get("id")
        collection_id = obj.pop("collection_id", None)
        obj["collection"] = self.resolve(
            Collection, collection_id, CollectionSerializer
        )
        proxy = model.get_proxy(obj)
        properties = obj.get("properties", {})
        for prop in proxy.iterprops():
            if prop.type == registry.entity:
                raw_values = ensure_list(properties.get(prop.name))
                resolved = properties[prop.name] = []
                for raw in raw_values:
                    nested = self.resolve(Entity, raw, EntitySerializer)
                    resolved.append(nested or raw)

        links = {}
        links["self"] = url_for("entities_api.view", entity_id=pk)
        links["references"] = url_for("entities_api.references", entity_id=pk)
        links["tags"] = url_for("entities_api.tags", entity_id=pk)
        links["ui"] = entity_url(pk)
        if proxy.schema.is_a(Document.SCHEMA):
            content_hash = first(properties.get("contentHash"))
            if content_hash:
                links["file"] = archive_url(
                    content_hash,
                    file_name=entity_filename(proxy),
                    mime_type=first(properties.get("mimeType")),
                    expire=request.authz.expire,
                )

            # Derived renditions: (link key and file extension, hash
            # property, MIME type).
            renditions = (("pdf", "pdfHash", PDF), ("csv", "csvHash", CSV))
            for kind, hash_prop, mime in renditions:
                digest = first(properties.get(hash_prop))
                if not digest:
                    continue
                links[kind] = archive_url(
                    digest,
                    file_name=entity_filename(proxy, extension=kind),
                    mime_type=mime,
                    expire=request.authz.expire,
                )

        obj["links"] = links
        obj["latinized"] = transliterate_values(proxy)
        obj["writeable"] = check_write_entity(obj, request.authz)
        obj["shallow"] = obj.get("shallow", True)
        return obj
Exemple #14
0
    def _serialize(self, obj):
        """Expand an entity dict in place and return the cleaned response.

        Returns ``None`` when the entity's schema is not in the model.
        Otherwise the ``id`` is stringified, ``collection_id`` is replaced
        by the resolved ``collection``, entity-typed property values are
        replaced by their resolved entities, and ``schemata``, ``links``
        and ``writeable`` are added. Document entities additionally get a
        content link plus archive links for each hash that is present.
        """
        pk = obj.get('id')
        obj['id'] = str(pk)
        authz = request.authz
        collection_id = obj.pop('collection_id', None)
        obj['collection'] = self.resolve(Collection, collection_id,
                                         CollectionSerializer)
        schema = model.get(obj.get('schema'))
        if schema is None:
            return None
        obj['schemata'] = schema.names
        properties = obj.get('properties', {})
        for prop in schema.properties.values():
            if prop.type == registry.entity:
                raw_values = ensure_list(properties.get(prop.name))
                resolved = properties[prop.name] = []
                for raw in raw_values:
                    resolved.append(
                        self.resolve(Entity, raw, EntitySerializer))

        links = {}
        links['self'] = url_for('entities_api.view', entity_id=pk)
        links['references'] = url_for('entities_api.references', entity_id=pk)
        links['tags'] = url_for('entities_api.tags', entity_id=pk)
        links['ui'] = entity_url(pk)
        if schema.is_a(Document.SCHEMA):
            links['content'] = url_for('entities_api.content', entity_id=pk)
            file_name = first(properties.get('fileName'))
            content_hash = first(properties.get('contentHash'))
            if content_hash:
                mime_type = first(properties.get('mimeType'))
                download_name = safe_filename(file_name, default=pk)
                links['file'] = archive_url(request.authz.id,
                                            content_hash,
                                            file_name=download_name,
                                            mime_type=mime_type)

            # Derived renditions: (link key, hash property, extension, MIME).
            renditions = (
                ('pdf', 'pdfHash', '.pdf', PDF),
                ('csv', 'csvHash', '.csv', CSV),
            )
            for kind, hash_prop, extension, mime in renditions:
                digest = first(properties.get(hash_prop))
                if not digest:
                    continue
                download_name = safe_filename(file_name, default=pk,
                                              extension=extension)
                links[kind] = archive_url(request.authz.id,
                                          digest,
                                          file_name=download_name,
                                          mime_type=mime)

        obj['links'] = links
        obj['writeable'] = authz.can(collection_id, authz.WRITE)
        obj.pop('_index', None)
        return self._clean_response(obj)
Exemple #15
0
def export_entity_excel(workbook, collection, entity):
    """Write ``entity`` to ``workbook`` with URL and collection columns.

    The extra fields are the entity URL, the collection label and the
    collection URL; ``EXTRA_HEADERS`` is passed as the extra headers.
    """
    url = entity_url(entity.id)
    label = collection.get('label')
    coll_url = collection_url(collection.get('id'))
    extra = {'url': url, 'collection': label, 'collection_url': coll_url}
    write_entity_excel(workbook, entity,
                       extra_fields=extra,
                       extra_headers=EXTRA_HEADERS)
Exemple #16
0
def export_entity_excel(workbook, collection, entity):
    """Add ``entity`` to the Excel ``workbook`` together with extra fields.

    The entity URL, the collection label and the collection URL are passed
    as extra fields, with ``EXTRA_HEADERS`` as the extra headers.
    """
    extra = {}
    extra['url'] = entity_url(entity.id)
    extra['collection'] = collection.get('label')
    extra['collection_url'] = collection_url(collection.get('id'))
    write_entity_excel(
        workbook,
        entity,
        extra_fields=extra,
        extra_headers=EXTRA_HEADERS,
    )
Exemple #17
0
def export_entity_csv(handlers, collection, entity):
    """Write ``entity`` as a CSV row into the buffer for its schema.

    ``handlers`` maps a schema's plural name to an in-memory text buffer.
    A buffer is created, and its header row written, the first time a
    schema is seen. The entity URL, collection label and collection URL
    are passed along as extra fields.
    """
    plural = entity.schema.plural
    buffer = handlers.get(plural)
    if buffer is None:
        buffer = io.StringIO()
        handlers[plural] = buffer
        write_headers(buffer, entity.schema,
                      extra_headers=EXTRA_HEADERS)
    extra = {
        'url': entity_url(entity.id),
        'collection': collection.get('label'),
        'collection_url': collection_url(collection.get('id')),
    }
    write_entity_csv(buffer, entity, extra_fields=extra)
Exemple #18
0
    def _serialize(self, obj):
        """Expand an entity dict in place for API output and return it.

        Replaces ``collection_id`` with the resolved ``collection``,
        substitutes entity-typed property values with their resolved
        entities (keeping the raw value when resolution yields nothing),
        and adds ``schemata``, ``links`` and ``writeable``. Document
        entities additionally get a content link plus archive links for
        each of the content, PDF and CSV hashes that is present.
        """
        pk = obj.get('id')
        collection_id = obj.pop('collection_id', None)
        obj['collection'] = self.resolve(Collection, collection_id,
                                         CollectionSerializer)
        proxy = model.get_proxy(obj)
        obj['schemata'] = proxy.schema.names
        properties = obj.get('properties', {})
        for prop in proxy.iterprops():
            if prop.type == registry.entity:
                raw_values = ensure_list(properties.get(prop.name))
                resolved = properties[prop.name] = []
                for raw in raw_values:
                    nested = self.resolve(Entity, raw, EntitySerializer)
                    resolved.append(nested or raw)

        links = {}
        links['self'] = url_for('entities_api.view', entity_id=pk)
        links['references'] = url_for('entities_api.references', entity_id=pk)
        links['tags'] = url_for('entities_api.tags', entity_id=pk)
        links['ui'] = entity_url(pk)
        if proxy.schema.is_a(Document.SCHEMA):
            links['content'] = url_for('entities_api.content', entity_id=pk)
            content_hash = first(properties.get('contentHash'))
            if content_hash:
                download_name = entity_filename(proxy)
                mime_type = first(properties.get('mimeType'))
                links['file'] = archive_url(request.authz.id,
                                            content_hash,
                                            file_name=download_name,
                                            mime_type=mime_type)

            # Derived renditions: (link key and file extension, hash
            # property, MIME type).
            renditions = (('pdf', 'pdfHash', PDF), ('csv', 'csvHash', CSV))
            for kind, hash_prop, mime in renditions:
                digest = first(properties.get(hash_prop))
                if not digest:
                    continue
                download_name = entity_filename(proxy, extension=kind)
                links[kind] = archive_url(request.authz.id,
                                          digest,
                                          file_name=download_name,
                                          mime_type=mime)

        obj['links'] = links
        permission = request.authz.WRITE
        obj['writeable'] = request.authz.can(collection_id, permission)
        return obj
Exemple #19
0
def export_entity_csv(handlers, collection, entity):
    """Append ``entity`` to the CSV buffer kept for its schema.

    Buffers live in ``handlers`` under the schema's plural name. A missing
    buffer is created as an in-memory text stream and gets its headers
    written before the entity row. The row carries the entity URL, the
    collection label and the collection URL as extra fields.
    """
    key = entity.schema.plural
    stream = handlers.get(key)
    if stream is None:
        stream = io.StringIO()
        handlers[key] = stream
        write_headers(stream, entity.schema, extra_headers=EXTRA_HEADERS)

    extra = {}
    extra['url'] = entity_url(entity.id)
    extra['collection'] = collection.get('label')
    extra['collection_url'] = collection_url(collection.get('id'))
    write_entity_csv(stream, entity, extra_fields=extra)
Exemple #20
0
def reconcile_index(collection=None):
    """Return the reconciliation service metadata as a JSON response.

    When ``collection`` is given, the service name includes the collection
    label, entity suggestions are filtered to that collection, and the
    default types are limited to the schemata listed on the collection.
    """
    namespace = 'http://rdf.freebase.com/ns/type.object.id'
    domain = settings.APP_UI_URL.strip('/')
    title = settings.APP_TITLE
    query = []
    schemata = list(model)
    if collection is not None:
        title = '%s (%s)' % (collection.get('label'), title)
        query.append(('filter:collection_id', collection.get('id')))
        names = collection.get('schemata').keys()
        schemata = [model.get(name) for name in names]

    view = {'url': entity_url('{{id}}')}
    preview = {'url': entity_url('{{id}}'), 'width': 800, 'height': 400}
    entity_path = url_for('reconcile_api.suggest_entity',
                          _query=query,
                          _authorize=True,
                          _relative=True)
    type_path = url_for('reconcile_api.suggest_type', _relative=True)
    property_path = url_for('reconcile_api.suggest_property', _relative=True)
    suggest = {
        'entity': {'service_url': domain, 'service_path': entity_path},
        'type': {'service_url': domain, 'service_path': type_path},
        'property': {'service_url': domain, 'service_path': property_path},
    }
    default_types = [
        get_freebase_type(schema) for schema in schemata if schema.matchable
    ]
    return jsonify({
        'name': title,
        'identifierSpace': namespace,
        'schemaSpace': namespace,
        'view': view,
        'preview': preview,
        'suggest': suggest,
        'defaultTypes': default_types,
    })
Exemple #21
0
def reconcile_index():
    """Return the reconciliation service metadata as a JSON response.

    For a logged-in user the role's API key is looked up and attached to
    the preview URL and passed to the entity suggestion URL. For anonymous
    requests no key exists, so the preview URL is emitted without an
    ``api_key`` query string.
    """
    domain = settings.APP_UI_URL.strip('/')
    api_key = None
    if request.authz.logged_in:
        role = Role.by_id(request.authz.id)
        api_key = role.api_key

    preview_url = entity_url('{{id}}')
    if api_key is not None:
        # Only append the key when there is one; formatting ``None`` into
        # the URL would produce a literal "?api_key=None".
        preview_url += '?api_key=%s' % api_key

    meta = {
        'name': settings.APP_TITLE,
        'identifierSpace': 'http://rdf.freebase.com/ns/type.object.id',
        'schemaSpace': 'http://rdf.freebase.com/ns/type.object.id',
        'view': {
            'url': entity_url('{{id}}')
        },
        'preview': {
            'url': preview_url,
            'width': 800,
            'height': 400
        },
        'suggest': {
            'entity': {
                'service_url':
                domain,
                'service_path':
                url_for('reconcile_api.suggest_entity', api_key=api_key)
            },
            'type': {
                'service_url': domain,
                'service_path': url_for('reconcile_api.suggest_type')
            },
            'property': {
                'service_url': domain,
                'service_path': url_for('reconcile_api.suggest_property')
            }
        },
        'defaultTypes': [{
            'id': 'Entity',
            'name': 'Persons and Companies'
        }]
    }
    return jsonify(meta)
Exemple #22
0
def reconcile_index():
    """Return the reconciliation service metadata as a JSON response.

    The metadata names the service after ``settings.APP_TITLE``, points
    view and preview at the entity URL template, lists the three suggest
    endpoints, and declares ``Entity.THING`` as the only default type.
    """
    namespace = 'http://rdf.freebase.com/ns/type.object.id'
    domain = settings.APP_UI_URL.strip('/')
    title = settings.APP_TITLE
    view = {'url': entity_url('{{id}}')}
    preview = {'url': entity_url('{{id}}'), 'width': 800, 'height': 400}
    entity_path = url_for('reconcile_api.suggest_entity', _authorize=True)
    type_path = url_for('reconcile_api.suggest_type')
    property_path = url_for('reconcile_api.suggest_property')
    suggest = {
        'entity': {'service_url': domain, 'service_path': entity_path},
        'type': {'service_url': domain, 'service_path': type_path},
        'property': {'service_url': domain, 'service_path': property_path},
    }
    thing = {'id': Entity.THING, 'name': model.get(Entity.THING).label}
    meta = {
        'name': title,
        'identifierSpace': namespace,
        'schemaSpace': namespace,
        'view': view,
        'preview': preview,
        'suggest': suggest,
        'defaultTypes': [thing],
    }
    return jsonify(meta)
Exemple #23
0
def entity_matches(result):
    """Yield one reconciliation candidate dict per hit in ``result``.

    ``result`` is read as a search response with hits under
    ``hits.hits``. Every candidate is reported with ``match`` set to
    ``False`` and the hit's ``_score`` as ``r:score``.
    """
    hits = result.get('hits').get('hits')
    for hit in hits:
        proxy = model.get_proxy(unpack_result(hit))
        candidate = {'id': proxy.id, 'name': proxy.caption}
        candidate['n:type'] = get_freebase_type(proxy.schema)
        candidate['type'] = [get_freebase_type(proxy.schema)]
        candidate['r:score'] = hit.get('_score')
        candidate['uri'] = entity_url(proxy.id, _relative=True)
        candidate['match'] = False
        yield candidate
Exemple #24
0
def entity_matches(result):
    """Turn the hits of a search ``result`` into reconciliation candidates.

    Generator: for each entry under ``hits.hits`` a dict is yielded with
    the entity id, caption, type information, score and relative URL.
    ``match`` is always ``False``.
    """
    for raw_hit in result.get('hits').get('hits'):
        unpacked = unpack_result(raw_hit)
        proxy = model.get_proxy(unpacked)
        entry = {}
        entry['id'] = proxy.id
        entry['name'] = proxy.caption
        entry['n:type'] = get_freebase_type(proxy.schema)
        entry['type'] = [get_freebase_type(proxy.schema)]
        entry['r:score'] = raw_hit.get('_score')
        entry['uri'] = entity_url(proxy.id, _relative=True)
        entry['match'] = False
        yield entry
Exemple #25
0
def entity_matches(result):
    """Yield one reconciliation candidate dict per hit in ``result``.

    ``result`` is read as a search response with hits under
    ``hits.hits``. Every candidate is reported with ``match`` set to
    ``False`` and the hit's ``_score`` as ``r:score``.
    """
    hits = result.get("hits").get("hits")
    for hit in hits:
        proxy = model.get_proxy(unpack_result(hit))
        candidate = {"id": proxy.id, "name": proxy.caption}
        candidate["n:type"] = get_freebase_type(proxy.schema)
        candidate["type"] = [get_freebase_type(proxy.schema)]
        candidate["r:score"] = hit.get("_score")
        candidate["uri"] = entity_url(proxy.id, _relative=True)
        candidate["match"] = False
        yield candidate
Exemple #26
0
def render_notification(stub, notification):
    """Generate a text version of the notification, suitable for use
    in an email or text message.

    The notification is unpacked and its event looked up in ``Events``.
    All event parameters are queued on the resolver and resolved in one
    pass; each ``{{name}}`` placeholder of the event template is then
    replaced with the resolved object's title (quoted in the plain
    variant, passed through ``html_link`` in the HTML one).

    Returns a dict with the keys ``plain`` and ``html``, or ``None`` when
    the event is unknown or any parameter cannot be resolved.
    """
    from aleph.logic import resolver

    notification = unpack_result(notification)
    event = Events.get(notification.get("event"))
    if event is None:
        return

    for name, clazz, value in _iter_params(notification, event):
        resolver.queue(stub, clazz, value)
    resolver.resolve(stub)
    plain = str(event.template)
    html = str(event.template)
    for name, clazz, value in _iter_params(notification, event):
        data = resolver.get(stub, clazz, value)
        if data is None:
            return
        link, title = None, None
        if clazz == Role:
            title = data.get("label")
        elif clazz == Alert:
            title = data.get("query")
        elif clazz == Collection:
            title = data.get("label")
            link = collection_url(value)
        elif clazz == Entity:
            proxy = model.get_proxy(data)
            title = proxy.caption
            link = entity_url(value)
        elif clazz == EntitySet:
            # Entity sets are read through attributes, not dict keys.
            title = data.label
            link = entityset_url(data.id)
        elif clazz == Export:
            title = data.get("label")
            # Exports link to the download endpoint.
            link = url_for("exports_api.download", export_id=data.get("id"))

        template = "{{%s}}" % name
        html = html.replace(template, html_link(title, link))
        plain = plain.replace(template, "'%s'" % title)
        # The parameter the event links to gets its URL appended in plain text.
        if name == event.link_to:
            plain = "%s (%s)" % (plain, link)
    return {"plain": plain, "html": html}
Exemple #27
0
def generate_sitemap(collection_id):
    """Yield ``(url, updated_at)`` pairs for a collection-based sitemap.xml.

    ``updated_at`` is the part of the entity's timestamp before the first
    ``T``. Entities whose ``schemata`` include ``Document.SCHEMA`` get a
    document URL, all others an entity URL.
    """
    # cf. https://www.sitemaps.org/protocol.html
    listing = iter_entities(
        authz=Authz.from_role(None),
        collection_id=collection_id,
        schemata=[Entity.THING],
        includes=['schemata', 'updated_at'],
    )
    # strictly, the limit for sitemap.xml is 50,000
    for item in islice(listing, 49500):
        day, _, _ = item.get('updated_at', '').partition('T')
        is_document = Document.SCHEMA in item.get('schemata', [])
        make_url = document_url if is_document else entity_url
        yield (make_url(item.get('id')), day)
Exemple #28
0
    def _serialize(self, obj):
        """Expand an entity dict in place into its API representation.

        Replaces ``collection_id`` with a resolved ``collection``, adds
        ``schemata``, resolves entity-typed property values, and attaches
        ``links`` and ``writeable``. Returns ``None`` if the schema name is
        not known to the model, otherwise the cleaned response.
        """
        entity_id = obj.get('id')
        authz = request.authz
        collection_id = obj.pop('collection_id', None)
        obj['collection'] = self.resolve(
            Collection, collection_id, CollectionSerializer
        )
        schema = model.get(obj.get('schema'))
        if schema is None:
            return None
        obj['schemata'] = schema.names

        # Swap referenced entity IDs for their resolved representations.
        properties = obj.get('properties', {})
        for prop in schema.properties.values():
            if prop.type != registry.entity:
                continue
            references = ensure_list(properties.get(prop.name))
            properties[prop.name] = [
                self.resolve(Entity, reference, EntitySerializer)
                for reference in references
            ]

        links = {}
        links['self'] = url_for('entities_api.view', entity_id=entity_id)
        links['references'] = url_for('entities_api.references',
                                      entity_id=entity_id)
        links['tags'] = url_for('entities_api.tags', entity_id=entity_id)
        links['ui'] = entity_url(entity_id)
        if schema.is_a(Document.SCHEMA):
            links['content'] = url_for('entities_api.content',
                                       entity_id=entity_id)

        # The hash values themselves are unused: the link is only added
        # when at least one hash is present.
        for _ in ensure_list(properties.get('contentHash')):
            links['file'] = url_for('documents_api.file',
                                    document_id=entity_id,
                                    _authorize=True)

        for _ in ensure_list(properties.get('pdfHash')):
            links['pdf'] = url_for('documents_api.pdf',
                                   document_id=entity_id,
                                   _authorize=True)

        obj['links'] = links
        may_write = authz.can(collection_id, authz.WRITE)
        # Entities flagged as bulk are never reported as writeable.
        obj['writeable'] = False if obj.get('bulk') else may_write
        obj.pop('_index', None)
        return self._clean_response(obj)
Exemple #29
0
def export_entities(request, result):
    """Stream a ZIP archive containing an Excel sheet of the result entities.

    Every entity in ``result.results`` is turned into a proxy and its
    collection is queued on the resolver. Each entity is then written to
    the Excel exporter and passed to ``write_document``. Entities whose
    collection cannot be resolved are skipped.

    ``request`` is accepted for interface compatibility and is not used.

    Yields the chunks of the ZIP archive.
    """
    proxies = []
    for entity in result.results:
        resolver.queue(result, Collection, entity.get('collection_id'))
        proxies.append(model.get_proxy(entity))
    resolver.resolve(result)
    zip_archive = zipstream.ZipFile()
    exporter = ExcelExporter(None, extra=EXTRA_HEADERS)
    for entity in proxies:
        collection_id = entity.context.get('collection_id')
        collection = resolver.get(result, Collection, collection_id)
        if collection is None:
            # Without this guard an unresolvable collection would raise
            # AttributeError below and abort the whole stream.
            continue
        extra = [entity_url(entity.id), collection.get('label')]
        exporter.write(entity, extra=extra)
        write_document(zip_archive, collection, entity)
    content = exporter.get_bytesio()
    zip_archive.write_iter('Export.xlsx', content)
    yield from zip_archive
Exemple #30
0
def export_entities(export_id):
    """Build the ZIP archive for the export identified by ``export_id``.

    Entities matching the query stored in ``export.meta`` are written to an
    Excel sheet and handed to ``write_document`` inside a temporary
    directory. The finished ``export.zip`` is passed to ``complete_export``.
    The loop stops early, adding an ``EXPORT_TOO_LARGE.txt`` entry, once the
    archive size or the entity index reaches the configured limits.

    Any exception is logged and the export is marked as failed; the
    temporary directory is always removed.
    """
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    # Cache of collection_id -> collection (may hold None for failed lookups).
    collections = {}
    try:
        # Fall back to a "match_none" query when the export stores no query.
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_name = safe_filename(export.label, extension="xlsx")
            excel_path = export_dir.joinpath(excel_name)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for idx, entity in enumerate(iter_proxies(filters=filters)):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    # Skip entities whose collection could not be loaded.
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                # Stop once the archive on disk reaches the size limit.
                if file_path.stat().st_size >= settings.EXPORT_MAX_SIZE:
                    concern = "total size of the"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break
                # Stop once the zero-based entity index reaches the limit.
                if idx >= settings.EXPORT_MAX_RESULTS:
                    concern = "number of"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break

            # The sheet is finalized on disk before being added to the ZIP.
            exporter.finalize()
            zf.write(excel_path, arcname=excel_name)
        file_name = "Export: %s" % export.label
        file_name = safe_filename(file_name, extension="zip")
        complete_export(export_id, file_path, file_name)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        # Re-load the export before recording the failure status.
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        # Remove the temporary working directory on success and on failure.
        shutil.rmtree(export_dir)
Exemple #31
0
def generate_sitemap(collection_id):
    """Yield ``(url, updated_at)`` pairs for a collection-based sitemap.xml.

    The date is the part of ``updated_at`` before the first ``T``, raised to
    at least ``settings.SITEMAP_FLOOR``. Entities with a schema unknown to
    the model are skipped. Schemata derived from the document schema get a
    document URL, all others an entity URL.
    """
    # cf. https://www.sitemaps.org/protocol.html
    document = model.get(Document.SCHEMA)
    listing = iter_entities(
        authz=Authz.from_role(None),
        collection_id=collection_id,
        schemata=[Entity.THING],
        includes=['schema', 'updated_at'],
    )
    # strictly, the limit for sitemap.xml is 50,000
    for item in islice(listing, 49500):
        day = item.get('updated_at', '').split('T', 1)[0]
        day = max(settings.SITEMAP_FLOOR, day)
        schema = model.get(item.get('schema'))
        if schema is not None:
            make_url = document_url if schema.is_a(document) else entity_url
            yield (make_url(item.get('id')), day)
Exemple #32
0
def reconcile_op(query):
    """Reconcile operation for a single query.

    Builds a stand-in entity from the query's name, type and properties,
    searches for similar entities and returns a dict with the list of
    matches under ``result`` and their count under ``num``.
    """
    parser_args = {
        'limit': query.get('limit', '5'),
        'strict': 'false',
    }
    parser = SearchQueryParser(parser_args, request.authz)

    name = query.get('query', '')
    schema = query.get('type') or Entity.THING
    entity = {
        'id': 'fake',
        'names': [name],
        'fingerprints': [fingerprints.generate(name)],
        'schemata': ensure_list(schema),
        'schema': schema,
    }
    # Copy the supplied property values onto the stand-in entity.
    entity.update(
        (prop.get('pid'), ensure_list(prop.get('v')))
        for prop in query.get('properties', [])
    )

    similar = SimilarEntitiesQuery(parser, entity=entity)
    matches = []
    for hit in similar.search().get('hits').get('hits'):
        source = hit.get('_source')
        match = {
            'id': hit.get('_id'),
            'name': source.get('name'),
            # Scale the search score, capped at 100.
            'score': min(100, hit.get('_score') * 10),
            'uri': entity_url(hit.get('_id')),
            'match': source.get('name') == name,
        }
        for type_ in get_freebase_types():
            if source['schema'] == type_['id']:
                match['type'] = [type_]
        matches.append(match)

    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {'result': matches, 'num': len(matches)}
Exemple #33
0
def render_notification(stub, notification):
    """Render a notification as plain text and HTML, suitable for use
    in an email or text message.

    Returns a dict with the keys ``plain`` and ``html``, or ``None`` if the
    event is unknown or a referenced object cannot be resolved.
    """
    from aleph.logic import resolver
    notification = unpack_result(notification)
    event = Events.get(notification.get('event'))
    if event is None:
        return

    # First pass: queue all referenced objects, then resolve them.
    for _, kind, ref in _iter_params(notification, event):
        resolver.queue(stub, kind, ref)
    resolver.resolve(stub)

    plain = str(event.template)
    html = str(event.template)
    # Second pass: substitute each parameter's placeholder.
    for param, kind, ref in _iter_params(notification, event):
        data = resolver.get(stub, kind, ref)
        if data is None:
            return
        title = None
        link = None
        if kind == Role:
            title = data.get('label')
        elif kind == Alert:
            title = data.get('query')
        elif kind == Collection:
            title, link = data.get('label'), collection_url(ref)
        elif kind == Entity:
            title, link = model.get_proxy(data).caption, entity_url(ref)
        elif kind == Diagram:
            title, link = data.label, diagram_url(data.id)

        placeholder = '{{%s}}' % param
        html = html.replace(placeholder, html_link(title, link))
        plain = plain.replace(placeholder, "'%s'" % title)
        if param == event.link_to:
            plain = '%s (%s)' % (plain, link)
    return {'plain': plain, 'html': html}
Exemple #34
0
def resolve_id(object_id, clazz):
    """From an object ID and class type, generate a human-readable
    label and a link that can be rendered into the notification.

    Returns a ``(label, link)`` tuple. ``link`` is ``None`` for roles and
    alerts. ``(None, None)`` is returned when the class is not handled or
    the object cannot be found.
    """
    if clazz == Role:
        role = Role.by_id(object_id)
        # Guard against a missing role, like the branches below do.
        if role is not None:
            return role.name, None
    elif clazz == Alert:
        alert = Alert.by_id(object_id)
        if alert is not None:
            return alert.query, None
    elif clazz == Collection:
        collection = Collection.by_id(object_id)
        if collection is not None:
            return collection.label, collection_url(object_id)
    elif clazz in [Document, Entity]:
        entity = get_entity(object_id)
        if entity is not None:
            # A missing or null 'schemata' is treated as "not a document".
            if Document.SCHEMA in (entity.get('schemata') or []):
                title = entity.get('title', entity.get('file_name'))
                return title, document_url(object_id)
            return entity.get('name'), entity_url(object_id)
    return None, None
Exemple #35
0
def sitemap(id):
    """Render the sitemap.xml for the collection with the given ID.

    Collects one ``(url, updated_at)`` entry per sitemap entity, skipping
    entities whose schema is unknown to the model, and renders them together
    with the collection's own URL and update date.
    """
    # cf. https://www.sitemaps.org/protocol.html
    collection = get_db_collection(id, request.authz.READ)
    document = model.get(Document.SCHEMA)
    entries = []
    for item in get_sitemap_entities(id):
        day, _, _ = item.get('updated_at', '').partition('T')
        # Never report a date earlier than the configured floor.
        day = max(settings.SITEMAP_FLOOR, day)
        schema = model.get(item.get('schema'))
        if schema is not None:
            make_url = document_url if schema.is_a(document) else entity_url
            entries.append((make_url(item.get('id')), day))
    return render_xml(
        'sitemap.xml',
        url=collection_url(collection_id=collection.id),
        updated_at=collection.updated_at.date().isoformat(),
        entries=entries,
    )
Exemple #36
0
def reconcile_op(query):
    """Reconcile operation for a single query.

    Builds an entity proxy from the query's name, type and properties, runs
    a match query against it and scores every unpackable hit with
    ``compare``. Returns a dict with the matches under ``result`` and their
    count under ``num``.
    """
    parser_args = {'limit': query.get('limit', '5'), 'strict': 'false'}
    parser = SearchQueryParser(parser_args, request.authz)

    name = query.get('query', '')
    schema = query.get('type') or Entity.THING
    proxy = model.make_entity(schema)
    proxy.add('name', name)
    for prop in query.get('properties', []):
        proxy.add(prop.get('pid'), prop.get('v'), quiet=True)

    candidates = MatchQuery(parser, entity=proxy)
    matches = []
    for hit in candidates.search().get('hits').get('hits'):
        unpacked = unpack_result(hit)
        if unpacked is None:
            continue
        candidate = model.get_proxy(unpacked)
        # Express the comparison result as a rounded-up percentage.
        score = math.ceil(compare(model, proxy, candidate) * 100)
        match = {
            'id': candidate.id,
            'name': candidate.caption,
            'score': score,
            'uri': entity_url(candidate.id),
            'match': False,
        }
        for type_ in get_freebase_types():
            if candidate.schema.name == type_['id']:
                match['type'] = [type_]
        matches.append(match)

    log.info("Reconciled: %r -> %d matches", name, len(matches))
    return {'result': matches, 'num': len(matches)}