def complete_export(export_id, file_path):
    export = Export.by_id(export_id)
    file_path = ensure_path(file_path)
    export.file_name = safe_filename(file_path)
    export.file_size = file_path.stat().st_size
    export.content_hash = checksum(file_path)
    try:
        archive.archive_file(
            file_path, content_hash=export.content_hash, mime_type=export.mime_type
        )
        export.set_status(status=Status.SUCCESS)
    except Exception:
        log.exception("Failed to upload export: %s", export)
        export.set_status(status=Status.FAILED)
    db.session.commit()
    params = {"export": export}
    role = Role.by_id(export.creator_id)
    log.info("Export [%r] complete: %s", export, export.status)
    publish(
        Events.COMPLETE_EXPORT,
        params=params,
        channels=[role],
    )
    send_export_notification(export)
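# Illustrative sketch only (not part of the code above): a content-hash helper
# in the spirit of the checksum() call used by complete_export(). It assumes a
# hex SHA-1 digest computed over the file in chunks; the real helper backing
# the archive may differ, and the name file_checksum is hypothetical.
import hashlib

def file_checksum(file_path, chunk_size=8192):
    digest = hashlib.sha1()
    with open(str(file_path), "rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()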
def ingest_file(source_id, meta, file_name, move=False):
    if not os.path.isfile(file_name):
        raise ValueError("No such file: %r" % file_name)
    if not meta.has("source_path"):
        meta.source_path = file_name
    meta = archive.archive_file(file_name, meta, move=move)
    ingest.delay(source_id, meta.data)
def ingest_upload(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    sync = get_flag('sync', default=False)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        collection.touch()
        db.session.commit()
        proxy = document.to_proxy()
        if proxy.schema.is_a(Document.SCHEMA_FOLDER) and sync:
            index_proxy(collection, proxy, sync=sync)
        ingest_entity(collection, proxy, job_id=job_id, sync=sync)
        document_id = collection.ns.sign(document.id)
        _notify(collection, document_id)
    finally:
        shutil.rmtree(upload_dir)
    return jsonify({'status': 'ok', 'id': document_id}, status=201)
def ingest_url(self, collection_id, metadata, url):
    meta = Metadata.from_data(metadata)
    if meta.foreign_id is None:
        meta.foreign_id = url
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code == 404:
            log.info("HTTP not found: %s", url)
            return
        if res.status_code >= 399:
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
        # write the streamed response bytes to the temporary file
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = archive.archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
def ingest_upload(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        db.session.commit()
        proxy = document.to_proxy()
        ingest_entity(collection, proxy)
    finally:
        shutil.rmtree(upload_dir)
    return jsonify({
        'status': 'ok',
        'id': stringify(document.id)
    }, status=201)
def ingest_document(document, file_path, user_queue=False):
    """Given a stub document and file path, extract information.
    This does not attempt to infer metadata such as a file name."""
    document.status = Document.STATUS_PENDING
    if os.path.isdir(file_path):
        manager = get_manager()
        manager.ingest_document(document, file_path=file_path)
    else:
        ch = archive.archive_file(file_path, content_hash=document.content_hash)
        document.content_hash = ch or document.content_hash
        db.session.commit()
        queue = USER_QUEUE if user_queue else WORKER_QUEUE
        routing_key = USER_ROUTING_KEY if user_queue else WORKER_ROUTING_KEY
        ingest.apply_async([document.id], queue=queue, routing_key=routing_key)
def setUp(self):
    super(MappingAPITest, self).setUp()
    self.col = self.create_collection(foreign_id="map1")
    aggregator = get_aggregator(self.col)
    aggregator.delete()
    _, self.headers = self.login(is_admin=True)
    self.rolex = self.create_user(foreign_id="user_3")
    _, self.headers_x = self.login(foreign_id="user_3")
    self.fixture = self.get_fixture_path("experts.csv")
    self.content_hash = archive.archive_file(self.fixture)
    data = {
        "id": "foo",
        "schema": "Table",
        "properties": {
            "csvHash": self.content_hash,
            "contentHash": self.content_hash,
            "mimeType": "text/csv",
            "fileName": "experts.csv",
            "name": "experts.csv",
        },
    }
    self.ent = EntityProxy.from_dict(model, data, cleaned=False)
    self.ent.id = self.col.ns.sign(self.ent.id)
    index_proxy(self.col, self.ent)
    data = {
        "id": "foo2",
        "schema": "Table",
        "properties": {
            "csvHash": self.content_hash,
            "contentHash": self.content_hash,
            "mimeType": "text/csv",
            "fileName": "experts.csv",
            "name": "experts.csv",
        },
    }
    self.ent2 = EntityProxy.from_dict(model, data, cleaned=False)
    self.ent2.id = self.col.ns.sign(self.ent2.id)
    index_proxy(self.col, self.ent2)
    data = {
        "id": "bar",
        "schema": "LegalEntity",
        "properties": {"name": "John Doe"},
    }
    ent = EntityProxy.from_dict(model, data, cleaned=False)
    ent.id = self.col.ns.sign(ent.id)
    index_proxy(self.col, ent)
def setUp(self):
    super(MappingAPITest, self).setUp()
    self.col = self.create_collection(data={'foreign_id': 'map1'})
    _, self.headers = self.login(is_admin=True)
    self.rolex = self.create_user(foreign_id='user_3')
    _, self.headers_x = self.login(foreign_id='user_3')
    self.fixture = self.get_fixture_path('experts.csv')
    self.content_hash = archive.archive_file(self.fixture)
    data = {
        'id': 'foo',
        'schema': 'Table',
        'properties': {
            'csvHash': self.content_hash,
            'contentHash': self.content_hash,
            'mimeType': 'text/csv',
            'fileName': 'experts.csv',
            'name': 'experts.csv'
        }
    }
    self.ent = EntityProxy.from_dict(model, data)
    self.ent.id = self.col.ns.sign(self.ent.id)
    index_proxy(self.col, self.ent)
    data = {
        'id': 'foo2',
        'schema': 'Table',
        'properties': {
            'csvHash': self.content_hash,
            'contentHash': self.content_hash,
            'mimeType': 'text/csv',
            'fileName': 'experts.csv',
            'name': 'experts.csv'
        }
    }
    self.ent2 = EntityProxy.from_dict(model, data)
    self.ent2.id = self.col.ns.sign(self.ent2.id)
    index_proxy(self.col, self.ent2)
    data = {
        'id': 'bar',
        'schema': 'LegalEntity',
        'properties': {
            'name': 'John Doe'
        }
    }
    ent = EntityProxy.from_dict(model, data)
    ent.id = self.col.ns.sign(ent.id)
    index_proxy(self.col, ent)
def ingest_url(source_id, metadata, url):
    clear_session()
    meta = Metadata(data=metadata)
    with NamedTemporaryFile() as fh:
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True)
        if res.status_code >= 400:
            log.error("Error ingesting %r: %r", url, res.status_code)
        for chunk in res.iter_content(chunk_size=1024):
            if chunk:
                fh.write(chunk)
        fh.flush()
        if not meta.has("source_url"):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = archive.archive_file(fh.name, meta, move=True)
        ingest.delay(source_id, meta.data)
def ingest_document(document, file_path, role_id=None, shallow=False):
    """Given a stub document and file path, extract information.
    This does not attempt to infer metadata such as a file name."""
    document.status = Document.STATUS_PENDING
    if not is_file(file_path):
        manager = get_manager()
        manager.ingest_document(document,
                                file_path=file_path,
                                role_id=role_id,
                                shallow=shallow)
    else:
        document.content_hash = archive.archive_file(file_path)
        db.session.commit()
        priority = 5 if document.collection.casefile else 3
        ingest.apply_async(args=[document.id],
                           kwargs={'role_id': role_id},
                           priority=priority)
def ingest_url(self, document_id, url):
    """Load the given URL into the document specified by document_id."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return
    tmp_path = make_tempfile(document.file_name, suffix=document.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code >= 500:
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
            return
        if res.status_code >= 400:
            document.status = Document.STATUS_FAIL
            document.error_message = "HTTP %s: %s" % (res.status_code, url)
            db.session.commit()
            return
        # write the streamed response bytes to the temporary file
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not document.has_meta('source_url'):
            document.source_url = res.url
        if not document.foreign_id:
            document.foreign_id = res.url
        document.headers = res.headers
        document.content_hash = archive.archive_file(tmp_path)
        db.session.commit()
        get_manager().ingest_document(document)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        document.status = Document.STATUS_FAIL
        document.error_type = type(ex).__name__
        document.error_message = six.text_type(ex)
        db.session.commit()
        log.exception(ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
def ingest_document(document, file_path, role_id=None):
    """Given a stub document and file path, extract information.
    This does not attempt to infer metadata such as a file name."""
    document.status = Document.STATUS_PENDING
    if role_id is not None:
        document.uploader_id = role_id
    if file_path is not None:
        # Directories cannot be archived first and then processed
        # later. So they are effectively sent into a short-cut here
        if os.path.isdir(file_path):
            db.session.commit()
            return ingest(document.id, file_path=file_path)
        document.content_hash = archive.archive_file(file_path)
    db.session.commit()
    priority = 5 if document.collection.casefile else 3
    ingest.apply_async(args=[document.id], priority=priority)
def ingest_document(document, file_path, role_id=None):
    """Given a stub document and file path, extract information.
    This does not attempt to infer metadata such as a file name."""
    document.status = Document.STATUS_PENDING
    if not is_file(file_path):
        manager = get_manager()
        manager.ingest_document(document,
                                file_path=file_path,
                                role_id=role_id)
    else:
        document.content_hash = archive.archive_file(file_path)
        db.session.commit()
        queue = WORKER_QUEUE
        routing_key = WORKER_ROUTING_KEY
        if role_id is not None:
            queue = USER_QUEUE
            routing_key = USER_ROUTING_KEY
        ingest.apply_async(args=[document.id],
                           kwargs={'role_id': role_id},
                           queue=queue,
                           routing_key=routing_key)
def crawl_directory(collection, path, parent=None):
    """Crawl the contents of the given path."""
    content_hash = None
    if not path.is_dir():
        content_hash = archive.archive_file(path)
    foreign_id = path.name
    if parent is not None:
        foreign_id = os.path.join(parent.foreign_id, foreign_id)
    meta = {'file_name': path.name}
    document = Document.save(collection,
                             parent=parent,
                             foreign_id=foreign_id,
                             content_hash=content_hash,
                             meta=meta)
    db.session.commit()
    ingest_entity(collection, document.to_proxy())
    log.info("Crawl [%s]: %s -> %s", collection.id, path, document.id)
    if path.is_dir():
        for child in path.iterdir():
            crawl_directory(collection, child, document)
def ingest_file(collection_id, meta, file_path, move=False,
                queue=WORKER_QUEUE, routing_key=WORKER_ROUTING_KEY):
    # the queue and routing key arguments are a workaround to
    # expedite user uploads over long-running batch imports.
    file_path = string_value(file_path)
    try:
        if not os.path.isfile(file_path):
            raise IngestorException("No such file: %r", file_path)
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = archive.archive_file(file_path, meta, move=move)
        ingest.apply_async([collection_id, meta.to_attr_dict()],
                           queue=queue,
                           routing_key=routing_key)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
def crawl_directory(collection, path, parent=None, job_id=None):
    """Crawl the contents of the given path."""
    try:
        content_hash = None
        if not path.is_dir():
            content_hash = archive.archive_file(path)
        foreign_id = path.name
        if parent is not None:
            foreign_id = os.path.join(parent.foreign_id, foreign_id)

        # if the job_id is not set yet and path.is_dir(), we know it is the
        # first iteration and we don't create an initial root folder as parent
        # to be consistent with the behaviour of alephclient
        if path.is_dir() and job_id is None:
            document = None
            job_id = Job.random_id()
        else:
            meta = {"file_name": path.name}
            document = Document.save(
                collection,
                parent=parent,
                foreign_id=foreign_id,
                content_hash=content_hash,
                meta=meta,
            )
            db.session.commit()
            job_id = job_id or Job.random_id()
            proxy = document.to_proxy()
            ingest_flush(collection, entity_id=proxy.id)
            ingest_entity(collection, proxy, job_id=job_id)
            log.info("Crawl [%s]: %s -> %s", collection.id, path, document.id)

        if path.is_dir():
            for child in path.iterdir():
                crawl_directory(collection, child, document, job_id)
    except OSError:
        log.exception("Cannot crawl directory: %s", path)
def setUp(self):
    super(ArchiveApiTestCase, self).setUp()
    self.fixture = self.get_fixture_path('samples/website.html')
    self.content_hash = archive.archive_file(self.fixture)
def setUp(self):
    super(ArchiveApiTestCase, self).setUp()
    self.fixture = self.get_fixture_path("samples/website.html")
    self.content_hash = archive.archive_file(self.fixture)
    self.fixture2 = self.get_fixture_path("samples/taggable.txt")
    self.content_hash2 = archive.archive_file(self.fixture2)
def ingest_upload(collection_id):
    """
    ---
    post:
      summary: Upload a document to a collection
      description: Upload a document to a collection with id `collection_id`
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      requestBody:
        content:
          multipart/form-data:
            schema:
              type: object
              properties:
                file:
                  type: string
                  format: binary
                  description: The document to upload
                meta:
                  $ref: '#/components/schemas/DocumentIngest'
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                properties:
                  id:
                    description: id of the uploaded document
                    type: integer
                  status:
                    type: string
                type: object
      tags:
      - Ingest
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    sync = get_flag('sync', default=False)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 uploader_id=request.authz.id)
        collection.touch()
        db.session.commit()
        proxy = document.to_proxy()
        if proxy.schema.is_a(Document.SCHEMA_FOLDER) and sync:
            index_proxy(collection, proxy, sync=sync)
        ingest_entity(collection, proxy, job_id=job_id, sync=sync)
        document_id = collection.ns.sign(document.id)
        _notify(collection, document_id)
    finally:
        shutil.rmtree(upload_dir)
    return jsonify({'status': 'ok', 'id': document_id}, status=201)
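# Illustrative sketch only: how a client might call the upload endpoint that
# ingest_upload() documents above. The route, the ApiKey header and the helper
# name upload_document are assumptions; the multipart "file" field, the JSON
# "meta" metadata and the {"status", "id"} response keys follow the OpenAPI
# docstring of ingest_upload.
import json
import requests

def upload_document(base_url, api_key, collection_id, file_path, meta=None):
    url = "%s/api/2/collections/%s/ingest" % (base_url, collection_id)
    headers = {"Authorization": "ApiKey %s" % api_key}
    with open(file_path, "rb") as fh:
        res = requests.post(url,
                            headers=headers,
                            files={"file": fh},
                            data={"meta": json.dumps(meta or {})},
                            timeout=300)
    res.raise_for_status()
    return res.json().get("id")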
def emit_pdf_alternative(self, file_path):
    content_hash = archive.archive_file(file_path)
    self.document.pdf_version = content_hash
def store_pdf(self, meta, pdf_path):
    archive.archive_file(pdf_path, meta.pdf, move=False)
def store_pdf(self, meta, pdf_path, move=True):
    archive.archive_file(pdf_path, meta.pdf, move=move)