Esempio n. 1
0
def export_entities(export_id):
    """Export the entities matched by a stored query to a ZIP archive.

    Streams entities from the search index into an Excel sheet plus any
    source documents, bundles them into ``query-export.zip`` and hands
    the archive to ``complete_export``. Any failure marks the export as
    FAILED; the scratch directory is always removed.
    """
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    # Cache collection lookups — many entities share a collection.
    collections = {}
    try:
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("query-export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_path = export_dir.joinpath(EXCEL_FILE)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for entity in iter_proxies(filters=filters):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    # Skip entities whose collection cannot be resolved.
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                if file_path.stat().st_size >= Export.MAX_FILE_SIZE:
                    # ``Logger.warn`` is deprecated; use ``warning``.
                    log.warning("Export too large: %r", export)
                    break

            exporter.finalize()
            zf.write(excel_path, arcname=EXCEL_FILE)
        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
Esempio n. 2
0
def export_matches(export_id):
    """Export the top N matches of cross-referencing for the given collection
    to an Excel formatted export.

    Fills a single "Cross-reference" sheet in batches of ``BULK_PAGE``
    matches, writes the workbook to a scratch directory and hands the file
    to ``complete_export``. Any failure marks the export as FAILED; the
    scratch directory is always removed.
    """
    export = Export.by_id(export_id)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    try:
        role = Role.by_id(export.creator_id)
        authz = Authz.from_role(role)
        collection = Collection.by_id(export.collection_id)
        file_name = "%s - Crossreference.xlsx" % collection.label
        file_path = export_dir.joinpath(file_name)
        excel = ExcelWriter()
        headers = [
            "Score",
            "Entity Name",
            "Entity Date",
            "Entity Countries",
            "Candidate Collection",
            "Candidate Name",
            "Candidate Date",
            "Candidate Countries",
            "Entity Link",
            "Candidate Link",
        ]
        sheet = excel.make_sheet("Cross-reference", headers)
        batch = []

        for match in iter_matches(collection, authz):
            batch.append(match)
            if len(batch) >= BULK_PAGE:
                _iter_match_batch(excel, sheet, batch)
                batch = []
        # Flush the trailing partial batch (truthiness, not len()).
        if batch:
            _iter_match_batch(excel, sheet, batch)

        with open(file_path, "wb") as fp:
            # Write the workbook bytes in one call instead of iterating
            # the BytesIO line by line — same content, clearer intent.
            fp.write(excel.get_bytesio().getvalue())

        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
Esempio n. 3
0
def cleanup_archive(prefix=None):
    """Clean up the blob archive behind aleph. Files inside of the archive
    are keyed on their SHA1 checksum, but the archive itself doesn't know
    what entities or exports a blob is linked to. So this is basically a
    garbage collector that needs to determine if any part of the database
    or index references the given hash. It's a messy process and it should
    be applied carefully."""
    for batch in _chunked_hashes(prefix):
        for content_hash, count in checksums_count(batch):
            if count > 0:
                # Hash is still referenced in the index; keep the blob.
                continue
            # In theory, this is a redundant check. In practice, it guards
            # against accidentally deleting seed data referenced by the
            # documents table:
            docs = Document.by_content_hash(content_hash)
            if docs.count() > 0:
                continue
            # Exports also hold blobs by checksum; never delete theirs.
            exports = Export.by_content_hash(content_hash)
            if exports.count() > 0:
                continue
            log.info("Dangling hash: %s", content_hash)
            archive.delete_file(content_hash)
Esempio n. 4
0
def export_entities(export_id, result):
    """Export a set of query results into a ZIP file containing an Excel
    sheet plus any source documents.

    ``result`` carries the raw search response; each entity's collection
    is resolved via the resolver so its label can be written next to the
    entity. Any failure marks the export as failed; the scratch directory
    is always removed.
    """
    from aleph.logic import resolver

    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    try:
        entities = []
        stub = types.SimpleNamespace(result=result)
        for entity in result["results"]:
            resolver.queue(stub, Collection, entity.get("collection_id"))
            entities.append(model.get_proxy(entity))
        resolver.resolve(stub)

        file_path = export_dir.joinpath("query-export.zip")
        # Use a context manager so the ZIP handle is closed even when an
        # exporter call raises (the previous code leaked it on error).
        with zipfile.ZipFile(file_path, "w") as zf:
            exporter = ExcelExporter(None, extra=EXTRA_HEADERS)
            for entity in entities:
                collection_id = entity.context.get("collection_id")
                collection = resolver.get(stub, Collection, collection_id)
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
            content = exporter.get_bytesio().getvalue()
            zf.writestr("Export.xlsx", content)
        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Export.STATUS_FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)
Esempio n. 5
0
def complete_export(export_id, file_path):
    """Finalize an export: record file metadata, upload the file to the
    archive, then notify the creator of the outcome."""
    export = Export.by_id(export_id)
    path = ensure_path(file_path)
    export.file_name = safe_filename(path)
    export.file_size = path.stat().st_size
    export.content_hash = checksum(path)
    try:
        archive.archive_file(
            path,
            content_hash=export.content_hash,
            mime_type=export.mime_type,
        )
        export.set_status(status=Status.SUCCESS)
    except Exception:
        # Upload failed: keep the row but flag the export as failed.
        log.exception("Failed to upload export: %s", export)
        export.set_status(status=Status.FAILED)

    db.session.commit()
    log.info("Export [%r] complete: %s", export, export.status)
    creator = Role.by_id(export.creator_id)
    publish(
        Events.COMPLETE_EXPORT,
        params={"export": export},
        channels=[creator],
    )
    send_export_notification(export)
Esempio n. 6
0
def index():
    """Returns a list of exports for the user.
    ---
    get:
      summary: List exports
      responses:
        '200':
          content:
            application/json:
              schema:
                type: object
                allOf:
                - $ref: '#/components/schemas/QueryResponse'
                properties:
                  results:
                    type: array
                    items:
                      $ref: '#/components/schemas/Export'
          description: OK
      tags:
        - Export
    """
    # Only authenticated users may list their own exports.
    require(request.authz.logged_in)
    exports = Export.by_role_id(request.authz.id)
    paginated = DatabaseQueryResult(request, exports)
    return ExportSerializer.jsonify_result(paginated)
Esempio n. 7
0
File: export.py Progetto: sunu/aleph
def export_entities(export_id):
    """Export the entities matched by a stored query into a ZIP archive.

    Writes an Excel sheet (named after the export label) plus any source
    documents into the ZIP, enforcing both a total-size and a result-count
    limit from settings. On failure the export is marked FAILED; the
    scratch directory is always removed.
    """
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    # Cache of collection_id -> collection; many entities share one.
    collections = {}
    try:
        # Fall back to a match-none query so a missing query exports nothing.
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_name = safe_filename(export.label, extension="xlsx")
            excel_path = export_dir.joinpath(excel_name)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for idx, entity in enumerate(iter_proxies(filters=filters)):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    # Skip entities whose collection cannot be resolved.
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                # Stop early if either limit is hit, leaving a marker file
                # in the archive so the user knows it was truncated.
                if file_path.stat().st_size >= settings.EXPORT_MAX_SIZE:
                    concern = "total size of the"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break
                if idx >= settings.EXPORT_MAX_RESULTS:
                    concern = "number of"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break

            # Finalize the workbook before adding it to the ZIP.
            exporter.finalize()
            zf.write(excel_path, arcname=excel_name)
        file_name = "Export: %s" % export.label
        file_name = safe_filename(file_name, extension="zip")
        complete_export(export_id, file_path, file_name)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        # Always remove the scratch directory, success or failure.
        shutil.rmtree(export_dir)
Esempio n. 8
0
    def test_delete_expired(self):
        """Exercise delete_expired_exports(): expired exports are soft-
        deleted, and archive blobs are only removed when unreferenced."""
        # NOTE(review): assumes the fixture creates three exports for this
        # role, two of which are already expired — confirm against setUp.
        q = Export.by_role_id(self.role_email.id)
        assert q.count() == 3, q.count()

        delete_expired_exports()
        q = Export.by_role_id(self.role_email.id)
        assert q.count() == 1, q.count()
        # export1 survives the first pass; export2 was soft-deleted.
        exp1 = Export.by_id(self.export1.id, deleted=False)
        assert exp1 is not None
        assert exp1.deleted is False
        exp2 = Export.by_id(self.export2.id, deleted=True)
        assert exp2 is not None
        assert exp2.deleted is True

        # The surviving export's file is still in the archive.
        path = archive.load_file(self.export1.content_hash)
        assert path is not None
        assert path.exists()

        # Force export1 past its expiry and run the cleanup again.
        exp1.expires_at = datetime.utcnow() + timedelta(days=-1)
        db.session.add(exp1)
        db.session.commit()

        delete_expired_exports()
        q = Export.by_role_id(self.role_email.id)
        assert q.count() == 0, q.count()
        exp1 = Export.by_id(self.export1.id, deleted=True)
        assert exp1 is not None
        assert exp1.deleted is True
        # export1's blob remains — presumably still referenced elsewhere
        # (e.g. shared checksum); TODO confirm against the fixture.
        path = archive.load_file(self.export1.content_hash)
        assert path is not None, path

        # export3's blob was unreferenced, so it was purged.
        path = archive.load_file(self.export3.content_hash)
        assert path is None, path
Esempio n. 9
0
def download(export_id):
    """Downloads the exported file from the archive.
    ---
    get:
      summary: Download an export from the archive
      parameters:
      - description: export id
        in: path
        name: export_id
        required: true
        schema:
          type: string
      - description: Authorization token for an export
        in: query
        name: claim
        required: false
        schema:
          type: string
          description: A signed JWT with the object hash.
      responses:
        '200':
          description: OK
          content:
            '*/*': {}
        '404':
          description: Object does not exist.
      tags:
      - Export
    """
    require(request.authz.logged_in)
    export = obj_or_404(Export.by_id(export_id, role_id=request.authz.id))
    # The signed URL should stop working when the export itself expires.
    remaining = export.expires_at - datetime.utcnow()
    url = archive.generate_publication_url(
        export.namespace,
        export.content_hash,
        mime_type=export.mime_type,
        expire=remaining.total_seconds(),
        attachment_name=export.file_name,
    )
    if url is not None:
        # The archive backend can serve the file directly.
        return redirect(url)
    # Fall back to streaming a locally-stored publication.
    path = archive.load_publication(export.namespace, export.content_hash)
    if path is None:
        raise NotFound()
    return send_file(
        str(path),
        as_attachment=True,
        conditional=True,
        attachment_filename=export.file_name,
        mimetype=export.mime_type,
    )
Esempio n. 10
0
def get_export(export_id):
    """Return the export as a dict, serving from the object cache when
    possible and refreshing it from the database otherwise."""
    if export_id is None:
        return None
    key = cache.object_key(Export, export_id)
    cached = cache.get_complex(key)
    if cached is not None:
        return cached
    export = Export.by_id(export_id)
    if export is None:
        return None
    log.debug("Export cache refresh: %r", export)
    payload = export.to_dict()
    cache.set_complex(key, payload, expires=cache.EXPIRE)
    return payload
Esempio n. 11
0
def create_export(
    operation,
    role_id,
    label,
    file_path=None,
    expires_after=Export.DEFAULT_EXPIRATION,
    collection=None,
    mime_type=None,
):
    """Create a new Export row, commit it immediately, and return it."""
    new_export = Export.create(
        operation,
        role_id,
        label,
        file_path,
        expires_after,
        collection,
        mime_type,
    )
    db.session.commit()
    return new_export
Esempio n. 12
0
def delete_expired_exports():
    """Delete export files from the archive after their time
    limit has expired."""
    for export in Export.get_expired(deleted=False):
        log.info("Deleting expired export: %r", export)
        if export.should_delete_publication():
            usage = list(checksums_count([export.content_hash]))
            # Only drop the blob if nothing else references this hash.
            _, refs = usage[0]
            if refs == 0:
                archive.delete_file(export.content_hash)
        # Soft-delete the row; the commit below persists all of them.
        export.deleted = True
        db.session.add(export)
    db.session.commit()
Esempio n. 13
0
def complete_export(export_id, file_path=None):
    """Mark an export as published and notify its creator."""
    export = Export.by_id(export_id)
    if file_path:
        export.set_filepath(file_path)
    export.publish()
    db.session.commit()
    creator = Role.by_id(export.creator_id)
    publish(
        Events.COMPLETE_EXPORT,
        params={"export": export},
        channels=[creator],
    )
    send_export_notification(export)
Esempio n. 14
0
File: roles.py Progetto: sunu/aleph
def get_deep_role(role):
    """Assemble per-role usage counts for a full (non-shallow) role view."""
    authz = Authz.from_role(role)
    counts = {
        "alerts": Alert.by_role_id(role.id).count(),
        "entitysets": EntitySet.type_counts(authz=authz),
        "casefiles": Collection.all_casefiles(authz=authz).count(),
        "exports": Export.by_role_id(role.id).count(),
    }
    return {"counts": counts, "shallow": False}
Esempio n. 15
0
def create_export(
    operation,
    role_id,
    label,
    collection=None,
    mime_type=None,
    meta=None,
):
    """Persist a new Export row for the given operation and return it."""
    new_export = Export.create(
        operation,
        role_id,
        label,
        collection=collection,
        mime_type=mime_type,
        meta=meta,
    )
    db.session.commit()
    return new_export
Esempio n. 16
0
File: export.py Progetto: sunu/aleph
def retry_exports():
    """Re-queue a processing task for every export still pending."""
    for pending in Export.get_pending():
        queue_task(None, pending.operation, export_id=pending.id)
Esempio n. 17
0
def get_export(export_id):
    """Load an export (including soft-deleted ones) as a dict, or None."""
    if export_id is None:
        return None
    found = Export.by_id(export_id, deleted=True)
    if found is None:
        return None
    return found.to_dict()
Esempio n. 18
0
def retry_exports():
    """Re-queue a task for each export that has not finished processing."""
    for pending in Export.get_pending():
        queue_task(None, pending.operation, payload={"export_id": pending.id})
Esempio n. 19
0
def delete_expired_exports():
    """Remove the published file of every export past its expiry date."""
    for expired in Export.get_expired(deleted=False):
        expired.delete_publication()
    db.session.commit()