def export_entities(export_id):
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    collections = {}
    try:
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("query-export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_path = export_dir.joinpath(EXCEL_FILE)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for entity in iter_proxies(filters=filters):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                if file_path.stat().st_size >= Export.MAX_FILE_SIZE:
                    log.warning("Export too large: %r", export)
                    break
            exporter.finalize()
            zf.write(excel_path, arcname=EXCEL_FILE)
        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)

def export_matches(export_id):
    """Export the top N matches of cross-referencing for the given
    collection to an Excel formatted export."""
    export = Export.by_id(export_id)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    try:
        role = Role.by_id(export.creator_id)
        authz = Authz.from_role(role)
        collection = Collection.by_id(export.collection_id)
        file_name = "%s - Crossreference.xlsx" % collection.label
        file_path = export_dir.joinpath(file_name)
        excel = ExcelWriter()
        headers = [
            "Score",
            "Entity Name",
            "Entity Date",
            "Entity Countries",
            "Candidate Collection",
            "Candidate Name",
            "Candidate Date",
            "Candidate Countries",
            "Entity Link",
            "Candidate Link",
        ]
        sheet = excel.make_sheet("Cross-reference", headers)
        batch = []
        for match in iter_matches(collection, authz):
            batch.append(match)
            if len(batch) >= BULK_PAGE:
                _iter_match_batch(excel, sheet, batch)
                batch = []
        if len(batch):
            _iter_match_batch(excel, sheet, batch)
        with open(file_path, "wb") as fp:
            buffer = excel.get_bytesio()
            for data in buffer:
                fp.write(data)
        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)

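# _iter_match_batch() is referenced above but not included in this excerpt. The
# sketch below shows roughly what such a helper could do, under stated
# assumptions: that each match dict carries "entity_id", "match_id" and "score"
# keys, that a bulk lookup such as entities_by_ids() exists (hypothetical name),
# and that ExcelWriter exposes an add_row(sheet, mapping) method alongside
# make_sheet(). It is an illustration, not the project's implementation.
def _iter_match_batch(excel, sheet, batch):
    # Collect the entity ids referenced by this batch of matches.
    ids = set()
    for match in batch:
        ids.add(match.get("entity_id"))
        ids.add(match.get("match_id"))
    # Hypothetical bulk fetch from the search index, keyed by entity id.
    entities = {e.get("id"): e for e in entities_by_ids(list(ids))}
    for match in batch:
        entity = entities.get(match.get("entity_id"))
        candidate = entities.get(match.get("match_id"))
        if entity is None or candidate is None:
            continue
        eproxy = model.get_proxy(entity)
        cproxy = model.get_proxy(candidate)
        excel.add_row(
            sheet,
            {
                "Score": match.get("score"),
                "Entity Name": eproxy.caption,
                "Candidate Name": cproxy.caption,
                "Entity Link": entity_url(eproxy.id),
                "Candidate Link": entity_url(cproxy.id),
            },
        )
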
def cleanup_archive(prefix=None):
    """Clean up the blob archive behind aleph. Files inside of the archive
    are keyed on their SHA1 checksum, but the archive itself doesn't know
    what entities or exports a blob is linked to. So this is basically a
    garbage collector that needs to determine if any part of the database
    or index references the given hash. It's a messy process and it should
    be applied carefully."""
    for batch in _chunked_hashes(prefix):
        for content_hash, count in checksums_count(batch):
            if count > 0:
                # log.info("Used hash: %s", content_hash)
                continue
            # In theory, this is a redundant check. In practice, it would be
            # painful to delete seed data from the docs table by accident:
            docs = Document.by_content_hash(content_hash)
            if docs.count() > 0:
                # log.info("Doc hash: %s", content_hash)
                continue
            exports = Export.by_content_hash(content_hash)
            if exports.count() > 0:
                continue
            # path = archive.load_file(content_hash)
            # log.info("Dangling hash [%s]: %s", content_hash, path)
            log.info("Dangling hash: %s", content_hash)
            archive.delete_file(content_hash)

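# _chunked_hashes() is not part of this excerpt. A minimal sketch of the
# batching helper it implies, assuming the archive can enumerate its stored
# content hashes through a hypothetical archive.list_files(prefix=...)
# iterator; the batch size is likewise illustrative.
def _chunked_hashes(prefix, size=5000):
    batch = []
    for content_hash in archive.list_files(prefix=prefix):
        batch.append(content_hash)
        if len(batch) >= size:
            yield batch
            batch = []
    if len(batch):
        yield batch
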
def export_entities(export_id, result):
    from aleph.logic import resolver

    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    try:
        entities = []
        stub = types.SimpleNamespace(result=result)
        for entity in result["results"]:
            resolver.queue(stub, Collection, entity.get("collection_id"))
            entities.append(model.get_proxy(entity))
        resolver.resolve(stub)
        file_path = export_dir.joinpath("query-export.zip")
        zf = zipfile.ZipFile(file_path, "w")
        exporter = ExcelExporter(None, extra=EXTRA_HEADERS)
        for entity in entities:
            collection_id = entity.context.get("collection_id")
            collection = resolver.get(stub, Collection, collection_id)
            extra = [entity_url(entity.id), collection.get("label")]
            exporter.write(entity, extra=extra)
            write_document(export_dir, zf, collection, entity)
        content = exporter.get_bytesio().getvalue()
        zf.writestr("Export.xlsx", content)
        zf.close()
        complete_export(export_id, file_path)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Export.STATUS_FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)

def complete_export(export_id, file_path):
    export = Export.by_id(export_id)
    file_path = ensure_path(file_path)
    export.file_name = safe_filename(file_path)
    export.file_size = file_path.stat().st_size
    export.content_hash = checksum(file_path)
    try:
        archive.archive_file(
            file_path,
            content_hash=export.content_hash,
            mime_type=export.mime_type,
        )
        export.set_status(status=Status.SUCCESS)
    except Exception:
        log.exception("Failed to upload export: %s", export)
        export.set_status(status=Status.FAILED)
    db.session.commit()
    params = {"export": export}
    role = Role.by_id(export.creator_id)
    log.info("Export [%r] complete: %s", export, export.status)
    publish(
        Events.COMPLETE_EXPORT,
        params=params,
        channels=[role],
    )
    send_export_notification(export)

def index():
    """Returns a list of exports for the user.
    ---
    get:
      summary: List exports
      responses:
        '200':
          content:
            application/json:
              schema:
                type: object
                allOf:
                  - $ref: '#/components/schemas/QueryResponse'
                properties:
                  results:
                    type: array
                    items:
                      $ref: '#/components/schemas/Export'
          description: OK
      tags:
        - Export
    """
    require(request.authz.logged_in)
    query = Export.by_role_id(request.authz.id)
    result = DatabaseQueryResult(request, query)
    return ExportSerializer.jsonify_result(result)

def export_entities(export_id):
    export = Export.by_id(export_id)
    log.info("Export entities [%r]...", export)
    export_dir = ensure_path(mkdtemp(prefix="aleph.export."))
    collections = {}
    try:
        filters = [export.meta.get("query", {"match_none": {}})]
        file_path = export_dir.joinpath("export.zip")
        with ZipFile(file_path, mode="w") as zf:
            excel_name = safe_filename(export.label, extension="xlsx")
            excel_path = export_dir.joinpath(excel_name)
            exporter = ExcelExporter(excel_path, extra=EXTRA_HEADERS)
            for idx, entity in enumerate(iter_proxies(filters=filters)):
                collection_id = entity.context.get("collection_id")
                if collection_id not in collections:
                    collections[collection_id] = get_collection(collection_id)
                collection = collections[collection_id]
                if collection is None:
                    continue
                extra = [entity_url(entity.id), collection.get("label")]
                exporter.write(entity, extra=extra)
                write_document(export_dir, zf, collection, entity)
                if file_path.stat().st_size >= settings.EXPORT_MAX_SIZE:
                    concern = "total size of the"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break
                if idx >= settings.EXPORT_MAX_RESULTS:
                    concern = "number of"
                    zf.writestr("EXPORT_TOO_LARGE.txt", WARNING % concern)
                    break
            exporter.finalize()
            zf.write(excel_path, arcname=excel_name)
        file_name = "Export: %s" % export.label
        file_name = safe_filename(file_name, extension="zip")
        complete_export(export_id, file_path, file_name)
    except Exception:
        log.exception("Failed to process export [%s]", export_id)
        export = Export.by_id(export_id)
        export.set_status(status=Status.FAILED)
        db.session.commit()
    finally:
        shutil.rmtree(export_dir)

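# The exporters above refer to a few module-level constants (EXCEL_FILE,
# EXTRA_HEADERS, WARNING) that this excerpt does not define. Plausible
# definitions for illustration only; the exact file name, header labels and
# warning wording are assumptions, not the project's values.
EXCEL_FILE = "Export.xlsx"
EXTRA_HEADERS = ["url", "collection"]
WARNING = (
    "The %s results in this export exceeded the limits configured for this "
    "Aleph instance, so the export was truncated. Re-run the export with a "
    "narrower query to receive all results."
)
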
def test_delete_expired(self):
    q = Export.by_role_id(self.role_email.id)
    assert q.count() == 3, q.count()
    delete_expired_exports()
    q = Export.by_role_id(self.role_email.id)
    assert q.count() == 1, q.count()
    exp1 = Export.by_id(self.export1.id, deleted=False)
    assert exp1 is not None
    assert exp1.deleted is False
    exp2 = Export.by_id(self.export2.id, deleted=True)
    assert exp2 is not None
    assert exp2.deleted is True
    path = archive.load_file(self.export1.content_hash)
    assert path is not None
    assert path.exists()
    exp1.expires_at = datetime.utcnow() + timedelta(days=-1)
    db.session.add(exp1)
    db.session.commit()
    delete_expired_exports()
    q = Export.by_role_id(self.role_email.id)
    assert q.count() == 0, q.count()
    exp1 = Export.by_id(self.export1.id, deleted=True)
    assert exp1 is not None
    assert exp1.deleted is True
    path = archive.load_file(self.export1.content_hash)
    assert path is not None, path
    path = archive.load_file(self.export3.content_hash)
    assert path is None, path

def download(export_id):
    """Downloads the exported file from the archive.
    ---
    get:
      summary: Download an export from the archive
      parameters:
        - description: export id
          in: path
          name: export_id
          required: true
          schema:
            type: string
        - description: Authorization token for an export
          in: query
          name: claim
          required: false
          schema:
            type: string
            description: A signed JWT with the object hash.
      responses:
        '200':
          description: OK
          content:
            '*/*': {}
        '404':
          description: Object does not exist.
      tags:
        - Export
    """
    require(request.authz.logged_in)
    export = obj_or_404(Export.by_id(export_id, role_id=request.authz.id))
    expires_after = export.expires_at - datetime.utcnow()
    url = archive.generate_publication_url(
        export.namespace,
        export.content_hash,
        mime_type=export.mime_type,
        expire=expires_after.total_seconds(),
        attachment_name=export.file_name,
    )
    if url is not None:
        return redirect(url)
    local_path = archive.load_publication(export.namespace, export.content_hash)
    if local_path is None:
        raise NotFound()
    return send_file(
        str(local_path),
        as_attachment=True,
        conditional=True,
        attachment_filename=export.file_name,
        mimetype=export.mime_type,
    )

def get_export(export_id):
    if export_id is None:
        return
    key = cache.object_key(Export, export_id)
    data = cache.get_complex(key)
    if data is None:
        export = Export.by_id(export_id)
        if export is None:
            return
        log.debug("Export cache refresh: %r", export)
        data = export.to_dict()
        cache.set_complex(key, data, expires=cache.EXPIRE)
    return data

def create_export(
    operation,
    role_id,
    label,
    file_path=None,
    expires_after=Export.DEFAULT_EXPIRATION,
    collection=None,
    mime_type=None,
):
    export = Export.create(
        operation,
        role_id,
        label,
        file_path,
        expires_after,
        collection,
        mime_type,
    )
    db.session.commit()
    return export

def delete_expired_exports():
    """Delete export files from the archive after their time limit has expired."""
    expired_exports = Export.get_expired(deleted=False)
    for export in expired_exports:
        log.info("Deleting expired export: %r", export)
        if export.should_delete_publication():
            counts = list(checksums_count([export.content_hash]))
            if counts[0][1] == 0:
                archive.delete_file(export.content_hash)
        export.deleted = True
        db.session.add(export)
    db.session.commit()

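# checksums_count() (used here and in cleanup_archive above) lives in the index
# layer and is not shown. The sketch below illustrates the idea with a terms
# aggregation over a "checksums" field; the es client, entities_read_index()
# helper and field name are assumptions rather than confirmed API.
def checksums_count(checksums):
    query = {
        "size": 0,
        "query": {"terms": {"checksums": checksums}},
        "aggs": {"counts": {"terms": {"field": "checksums", "include": checksums}}},
    }
    result = es.search(index=entities_read_index(), body=query)
    buckets = result["aggregations"]["counts"]["buckets"]
    counts = {b["key"]: b["doc_count"] for b in buckets}
    for content_hash in checksums:
        # Yield (hash, number of index documents that reference it).
        yield content_hash, counts.get(content_hash, 0)
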
def complete_export(export_id, file_path=None):
    export = Export.by_id(export_id)
    if file_path:
        export.set_filepath(file_path)
    export.publish()
    db.session.commit()
    params = {"export": export}
    role = Role.by_id(export.creator_id)
    publish(
        Events.COMPLETE_EXPORT,
        params=params,
        channels=[role],
    )
    send_export_notification(export)

def get_deep_role(role):
    authz = Authz.from_role(role)
    alerts = Alert.by_role_id(role.id).count()
    exports = Export.by_role_id(role.id).count()
    casefiles = Collection.all_casefiles(authz=authz).count()
    entitysets = EntitySet.type_counts(authz=authz)
    return {
        "counts": {
            "alerts": alerts,
            "entitysets": entitysets,
            "casefiles": casefiles,
            "exports": exports,
        },
        "shallow": False,
    }

def create_export(
    operation,
    role_id,
    label,
    collection=None,
    mime_type=None,
    meta=None,
):
    export = Export.create(
        operation,
        role_id,
        label,
        collection=collection,
        mime_type=mime_type,
        meta=meta,
    )
    db.session.commit()
    return export

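# Illustrative usage of create_export() together with the task queue: an API
# handler would create the Export record, then enqueue the background job that
# later runs export_entities() and complete_export(). The operation name,
# handler name and payload shape here are placeholders, not taken from this
# excerpt.
def queue_entity_export(request, query):
    export = create_export(
        operation="export-search",  # assumed operation/task name
        role_id=request.authz.id,
        label="Search results: %s" % query.get("q", ""),
        mime_type="application/zip",
        meta={"query": query},  # consumed by export_entities() above
    )
    queue_task(None, export.operation, payload={"export_id": export.id})
    return export
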
def retry_exports():
    for export in Export.get_pending():
        queue_task(None, export.operation, export_id=export.id)

def get_export(export_id):
    if export_id is None:
        return
    export = Export.by_id(export_id, deleted=True)
    if export is not None:
        return export.to_dict()

def retry_exports():
    for export in Export.get_pending():
        queue_task(None, export.operation, payload={"export_id": export.id})

def delete_expired_exports():
    expired_exports = Export.get_expired(deleted=False)
    for export in expired_exports:
        export.delete_publication()
    db.session.commit()