def test_delete_expired(self):
    # All three exports exist for the role before the cleanup runs.
    q = Export.by_role_id(self.role_email.id)
    assert q.count() == 3, q.count()
    delete_expired_exports()

    # The first run soft-deletes only the already expired exports.
    q = Export.by_role_id(self.role_email.id)
    assert q.count() == 1, q.count()
    exp1 = Export.by_id(self.export1.id, deleted=False)
    assert exp1 is not None
    assert exp1.deleted is False
    exp2 = Export.by_id(self.export2.id, deleted=True)
    assert exp2 is not None
    assert exp2.deleted is True

    # export1 is still live, so its file remains in the archive.
    path = archive.load_file(self.export1.content_hash)
    assert path is not None
    assert path.exists()

    # Back-date export1's expiry and run the cleanup again.
    exp1.expires_at = datetime.utcnow() + timedelta(days=-1)
    db.session.add(exp1)
    db.session.commit()
    delete_expired_exports()

    # Now all exports are soft-deleted; export1's blob is still in
    # the archive, while export3's has been removed.
    q = Export.by_role_id(self.role_email.id)
    assert q.count() == 0, q.count()
    exp1 = Export.by_id(self.export1.id, deleted=True)
    assert exp1 is not None
    assert exp1.deleted is True
    path = archive.load_file(self.export1.content_hash)
    assert path is not None, path
    path = archive.load_file(self.export3.content_hash)
    assert path is None, path

def test_create(self):
    assert self.export1.content_hash is not None
    assert self.export1.content_hash == self.export2.content_hash
    assert archive.load_file(self.export1.content_hash) is not None
    assert (archive.load_file(self.export1.content_hash) ==
            archive.load_file(self.export2.content_hash))
    assert self.export1.file_name == self.export2.file_name == "experts.csv"
    res = get_notifications(self.role_email)
    notification_count = res.get("hits").get("total").get("value")
    assert notification_count == 3, notification_count

def ingest(document_id, file_path=None):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    if file_path is None:
        file_path = archive.load_file(document.content_hash,
                                      file_name=document.safe_file_name,
                                      temp_path=work_path)
    try:
        manager = get_manager()
        result = DocumentResult(manager, document, file_path=file_path)
        manager.ingest(file_path, result=result, work_path=work_path)
        log.debug('Ingested [%s:%s]: %s',
                  document.id, document.schema, document.name)
        db.session.commit()
        process_document(document)
    except Exception:
        db.session.rollback()
        document = Document.by_id(document_id)
        log.exception("Ingest failed [%s]: %s", document.id, document.name)
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)

def file(document_id):
    document = get_document(document_id)
    url = archive.generate_url(document.meta)
    if url is not None:
        return redirect(url)
    local_path = archive.load_file(document.meta)
    fh = open(local_path, 'rb')
    return send_file(fh, as_attachment=True,
                     attachment_filename=document.meta.file_name,
                     mimetype=document.meta.mime_type)

def dispatch(cls, collection_id, meta):
    local_path = archive.load_file(meta)
    try:
        best_cls = cls.auction_file(meta, local_path)
        log.debug("Dispatching %r to %r", meta.file_name, best_cls)
        best_cls(collection_id).ingest(meta, local_path)
        db.session.commit()
    except Exception as exc:
        cls.handle_exception(meta, collection_id, exc)
    finally:
        archive.cleanup_file(meta)

def pdf(document_id):
    document = get_document(document_id)
    if document.type != Document.TYPE_TEXT:
        raise BadRequest("PDF is only available for text documents")
    pdf = document.meta.pdf
    url = archive.generate_url(pdf)
    if url is not None:
        return redirect(url)
    local_path = archive.load_file(pdf)
    fh = open(local_path, 'rb')
    return send_file(fh, mimetype=pdf.mime_type)

def _get_table_csv_link(table):
    proxy = model.get_proxy(table)
    csv_hash = proxy.first("csvHash")
    if csv_hash is None:
        raise RuntimeError("Source table doesn't have a CSV version")
    url = archive.generate_url(csv_hash)
    if url is None:
        local_path = archive.load_file(csv_hash)
        if local_path is not None:
            url = local_path.as_posix()
    if url is None:
        raise RuntimeError("Could not generate CSV URL for the table")
    return url

def write_document(export_dir, zf, collection, entity):
    content_hash = entity.first("contentHash", quiet=True)
    if content_hash is None:
        return
    file_name = entity_filename(entity)
    arcname = "{0}-{1}".format(entity.id, file_name)
    arcname = os.path.join(collection.get("label"), arcname)
    try:
        local_path = archive.load_file(content_hash, temp_path=export_dir)
        if local_path is not None and os.path.exists(local_path):
            zf.write(local_path, arcname=arcname)
    finally:
        archive.cleanup_file(content_hash, temp_path=export_dir)

def retrieve():
    """Downloads a binary blob from the blob storage archive.
    ---
    get:
      summary: Download a blob from the archive
      parameters:
        - description: Authorization token for an archive blob
          in: query
          name: claim
          schema:
            type: string
            description: A signed JWT with the object hash.
      responses:
        '200':
          description: OK
          content:
            '*/*': {}
        '404':
          description: Object does not exist.
      tags:
        - Archive
    """
    token = request.args.get("token")
    token = jwt.decode(token, key=settings.SECRET_KEY, verify=True)
    content_hash = token.get("c")
    file_name = token.get("f")
    mime_type = token.get("m")
    expire = datetime.utcfromtimestamp(token["exp"])
    tag_request(content_hash=content_hash, file_name=file_name)
    url = archive.generate_url(
        content_hash,
        file_name=file_name,
        mime_type=mime_type,
        expire=expire,
    )
    if url is not None:
        return redirect(url)
    try:
        local_path = archive.load_file(content_hash)
        if local_path is None:
            return Response(status=404)
        return send_file(
            str(local_path),
            as_attachment=True,
            conditional=True,
            attachment_filename=file_name,
            mimetype=mime_type,
        )
    finally:
        archive.cleanup_file(content_hash)

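# For context, a minimal sketch of how the claim token consumed by
# retrieve() above might be minted. The helper name and the one-hour
# lifetime are assumptions, not part of the original code; only the
# "c"/"f"/"m"/"exp" claim names and the SECRET_KEY signing key mirror
# what retrieve() decodes with PyJWT.
def make_retrieval_token(content_hash, file_name, mime_type):
    payload = {
        "c": content_hash,
        "f": file_name,
        "m": mime_type,
        # PyJWT serializes datetime values for the "exp" claim.
        "exp": datetime.utcnow() + timedelta(hours=1),
    }
    return jwt.encode(payload, key=settings.SECRET_KEY, algorithm="HS256")
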
def get_table_csv_link(table_id):
    table = get_entity(table_id)
    properties = table.get('properties', {})
    csv_hash = first(properties.get('csvHash'))
    if csv_hash is None:
        raise RuntimeError("Source table doesn't have a CSV version")
    url = archive.generate_url(csv_hash)
    if not url:
        local_path = archive.load_file(csv_hash)
        if local_path is not None:
            url = local_path.as_posix()
    if url is None:
        raise RuntimeError("Could not generate CSV URL for the table")
    return url

def write_document(zip_archive, collection, entity):
    if not entity.has('contentHash', quiet=True):
        return
    name = entity.first('fileName') or entity.caption
    name = "{0}-{1}".format(entity.id, name)
    path = os.path.join(collection.get('label'), name)
    content_hash = entity.first('contentHash')
    url = archive.generate_url(content_hash)
    if url is not None:
        stream = requests.get(url, stream=True)
        zip_archive.write_iter(path, stream.iter_content())
    else:
        local_path = archive.load_file(content_hash)
        if local_path is not None:
            zip_archive.write(local_path, arcname=path)

def make_mapper(collection, mapping):
    table = get_entity(mapping.table_id)
    properties = table.get('properties', {})
    csv_hash = first(properties.get('csvHash'))
    if csv_hash is None:
        raise RuntimeError("Source table doesn't have a CSV version")
    url = archive.generate_url(csv_hash)
    if not url:
        local_path = archive.load_file(csv_hash)
        if local_path is not None:
            url = local_path.as_posix()
    if url is None:
        raise RuntimeError("Could not generate CSV URL for the table")
    data = {'csv_url': url, 'entities': mapping.query}
    return model.make_mapping(data, key_prefix=collection.foreign_id)

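# A minimal usage sketch for make_mapper(), assuming followthemoney's
# QueryMapping API in which map() turns one source row (a dict of
# column name to value) into entity proxies. The sample row and its
# column names are illustrative only, not from the original code.
mapper = make_mapper(collection, mapping)
row = {"name": "Jane Doe", "country": "us"}
for entity in mapper.map(row).values():
    log.debug("Mapped entity: %r", entity)
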
def pdf(document_id):
    document = get_document(document_id)
    enable_cache(server_side=True)
    log_event(request, document_id=document.id)
    if document.type != Document.TYPE_TEXT:
        raise BadRequest("PDF is only available for text documents")
    url = archive.generate_url(document.pdf_version, mime_type=PDF_MIME)
    if url is not None:
        return redirect(url)
    path = archive.load_file(document.pdf_version,
                             file_name=document.file_name)
    if path is None:
        raise NotFound("Missing PDF file.")
    return send_file(open(path, 'rb'), mimetype=PDF_MIME)

def ingest(document_id, file_path=None, refresh=False):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    if file_path is None:
        file_path = archive.load_file(document.content_hash,
                                      file_name=document.safe_file_name,
                                      temp_path=work_path)
    try:
        manager = get_manager()
        result = DocumentResult(manager, document, file_path=file_path)
        manager.ingest(file_path, result=result, work_path=work_path)
        document.status = Document.STATUS_SUCCESS
        log.debug('Ingested [%s:%s]: %s',
                  document.id, document.schema, document.name)
        if document.collection.casefile and not refresh:
            params = {'collection': document.collection,
                      'document': document}
            publish(Events.INGEST_DOCUMENT,
                    actor_id=document.uploader_id,
                    params=params)
        db.session.commit()
    except Exception:
        db.session.rollback()
        document = Document.by_id(document_id)
        log.exception("Ingest failed [%s]: %s", document.id, document.name)
        document.status = Document.STATUS_FAIL
        db.session.commit()
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)

    extract_document_tags(document)
    # delete_entity(document.id, exclude=document.schema)
    index_document(document)
    refresh_entity(document)

def file(document_id):
    document = get_document(document_id)
    enable_cache(server_side=True)
    log_event(request, document_id=document.id)
    url = archive.generate_url(document.meta)
    if url is not None:
        return redirect(url)
    local_path = archive.load_file(document.meta)
    if not os.path.isfile(local_path):
        raise NotFound("File does not exist.")
    fh = open(local_path, 'rb')
    return send_file(fh, as_attachment=True,
                     attachment_filename=document.meta.file_name,
                     mimetype=document.meta.mime_type)

def pdf(document_id):
    document = get_document(document_id)
    enable_cache(server_side=True)
    log_event(request, document_id=document.id)
    if document.type != Document.TYPE_TEXT:
        raise BadRequest("PDF is only available for text documents")
    pdf = document.meta.pdf
    url = archive.generate_url(pdf)
    if url is not None:
        return redirect(url)
    try:
        local_path = archive.load_file(pdf)
        fh = open(local_path, 'rb')
    except Exception as ex:
        raise NotFound("Missing PDF file: %r" % ex)
    return send_file(fh, mimetype=pdf.mime_type)

def retrieve():
    """Downloads a binary blob from the blob storage archive.
    ---
    get:
      summary: Download a blob from the archive
      parameters:
        - description: Authorization token for an archive blob
          in: query
          name: claim
          schema:
            type: string
            description: A signed JWT with the object hash.
      responses:
        '200':
          description: OK
          content:
            '*/*': {}
        '404':
          description: Object does not exist.
      tags:
        - Archive
    """
    claim = request.args.get("claim")
    role_id, content_hash, file_name, mime_type = archive_claim(claim)
    require(request.authz.id == role_id)
    tag_request(content_hash=content_hash, file_name=file_name)
    url = archive.generate_url(content_hash,
                               file_name=file_name,
                               mime_type=mime_type)
    if url is not None:
        return redirect(url)
    try:
        local_path = archive.load_file(content_hash)
        if local_path is None:
            return Response(status=404)
        return send_file(
            str(local_path),
            as_attachment=True,
            conditional=True,
            attachment_filename=file_name,
            mimetype=mime_type,
        )
    finally:
        archive.cleanup_file(content_hash)

def _serve_archive(content_hash, file_name, mime_type):
    """Serve a file from the archive or by generating an external URL."""
    url = archive.generate_url(content_hash,
                               file_name=file_name,
                               mime_type=mime_type)
    if url is not None:
        return redirect(url)
    try:
        local_path = archive.load_file(content_hash, file_name=file_name)
        if local_path is None:
            return Response(status=404)
        return send_file(local_path,
                         as_attachment=True,
                         conditional=True,
                         attachment_filename=file_name,
                         mimetype=mime_type)
    finally:
        archive.cleanup_file(content_hash)

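# A minimal sketch of how a view might delegate to _serve_archive();
# the route, the blueprint object, and the document attributes are
# assumptions based on the file()/pdf() handlers elsewhere in this
# section, not part of the helper itself.
@blueprint.route("/api/2/documents/<int:document_id>/file")
def document_file(document_id):
    document = get_document(document_id)
    return _serve_archive(document.content_hash,
                          document.file_name,
                          document.mime_type)
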
def dispatch(cls, source_id, meta):
    best_score, best_cls = 0, None
    local_path = archive.load_file(meta)
    try:
        # Ask every registered ingestor to score the file and keep
        # the highest bidder.
        for ingestor_cls in get_ingestors().values():
            score = ingestor_cls.match(meta, local_path)
            if score > best_score:
                best_score = score
                best_cls = ingestor_cls
        if best_cls is None:
            log.debug("No ingestor found for: %r", meta.file_name)
            return
        log.debug("Dispatching %r to %r", meta.file_name, best_cls.__name__)
        best_cls(source_id).ingest(meta, local_path)
    except Exception as ex:
        log.exception(ex)
        db.session.rollback()
    finally:
        archive.cleanup_file(meta)

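# For context, a sketch of an ingestor participating in the auction in
# dispatch() above: match() returns a score, and the highest-scoring
# class gets to ingest the file. The class name, base class, MIME type,
# and score values are illustrative assumptions, not original code.
class PDFIngestor(Ingestor):
    MIME_TYPES = ['application/pdf']

    @classmethod
    def match(cls, meta, local_path):
        if meta.mime_type in cls.MIME_TYPES:
            return 5  # strong claim on PDF files
        return -1  # decline anything else
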
def _serve_archive(content_hash, file_name, mime_type):
    """Serve a file from the archive or by generating an external URL."""
    url = archive.generate_url(content_hash,
                               file_name=file_name,
                               mime_type=mime_type)
    if url is not None:
        return redirect(url)
    enable_cache()
    try:
        local_path = archive.load_file(content_hash, file_name=file_name)
        if local_path is None:
            raise NotFound("File does not exist.")
        return send_file(open(local_path, 'rb'),
                         as_attachment=True,
                         attachment_filename=file_name,
                         mimetype=mime_type)
    finally:
        archive.cleanup_file(content_hash)

def retrieve():
    claim = request.args.get('claim')
    role_id, content_hash, file_name, mime_type = archive_claim(claim)
    require(request.authz.id == role_id)
    tag_request(content_hash=content_hash, file_name=file_name)
    url = archive.generate_url(content_hash,
                               file_name=file_name,
                               mime_type=mime_type)
    if url is not None:
        return redirect(url)
    try:
        local_path = archive.load_file(content_hash)
        if local_path is None:
            return Response(status=404)
        return send_file(str(local_path),
                         as_attachment=True,
                         conditional=True,
                         attachment_filename=file_name,
                         mimetype=mime_type)
    finally:
        archive.cleanup_file(content_hash)

def retrieve():
    claim = request.args.get('claim')
    role_id, content_hash, file_name, mime_type = archive_claim(claim)
    require(request.authz.id == role_id)
    record_audit(Audit.ACT_ARCHIVE, content_hash=content_hash)
    tag_request(content_hash=content_hash, file_name=file_name)
    url = archive.generate_url(content_hash,
                               file_name=file_name,
                               mime_type=mime_type)
    if url is not None:
        return redirect(url)
    try:
        local_path = archive.load_file(content_hash)
        if local_path is None:
            return Response(status=404)
        return send_file(local_path,
                         as_attachment=True,
                         conditional=True,
                         attachment_filename=file_name,
                         mimetype=mime_type)
    finally:
        archive.cleanup_file(content_hash)