def ingest_document(self, document, file_path=None, role_id=None):
    """Ingest a database-backed document.

    First retrieve its data and then call the actual ingestor.

    Args:
        document: the database-backed document to ingest. Its
            ``content_hash``, ``file_name``, ``languages``, ``countries``
            and ``collection`` attributes are read here.
        file_path: local path to the document's data. When ``None``, the
            data is loaded from ``self.archive`` using the content hash.
        role_id: passed through to ``DocumentResult`` and to the
            recursive ingest of child documents.

    Returns:
        None. Results are reported through the ``DocumentResult`` that is
        handed to ``self.ingest``.
    """
    if file_path is None:
        file_path = self.archive.load_file(document.content_hash,
                                           file_name=document.file_name)

    if file_path is None:
        # When a directory is ingested, the data is not stored. Thus, try
        # to recurse transparently.
        for child in Document.by_parent(document):
            self.ingest_document(child, role_id=role_id)
        return

    try:
        # The existence check lives inside the ``try`` so that the archive
        # cleanup in ``finally`` also runs when we bail out early.
        if not os.path.exists(file_path):
            # Probably indicative of file system encoding issues.
            log.warning("Ingest non-existant path [%r]: %s",
                        document, file_path)
            return

        # Fall back to the collection's languages/countries when the
        # document itself does not declare any.
        if not len(document.languages) and document.collection is not None:
            document.languages = document.collection.languages or []

        if not len(document.countries) and document.collection is not None:
            document.countries = document.collection.countries or []

        result = DocumentResult(self, document,
                                file_path=file_path,
                                role_id=role_id)
        self.ingest(file_path, result=result)
    finally:
        self.archive.cleanup_file(document.content_hash)
def ingest_document(self, document, file_path=None, role_id=None, shallow=False):
    """Ingest a database-backed document.

    First retrieve its data and then call the actual ingestor.

    Args:
        document: the database-backed document to ingest. Its
            ``content_hash``, ``safe_file_name``, ``languages``,
            ``countries`` and ``collection`` attributes are read here.
        file_path: local path to the document's data. When ``None`` and
            the document has a content hash, the data is loaded from
            ``self.archive`` into a temporary work directory.
        role_id: passed through to ``DocumentResult`` and to the ingest
            of child documents.
        shallow: when true, the children of a document that has no file
            path are not ingested.

    Returns:
        None. Results are reported through the ``DocumentResult`` that is
        handed to ``self.ingest``.
    """
    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    try:
        content_hash = document.content_hash
        if file_path is None and content_hash is not None:
            file_name = document.safe_file_name
            file_path = self.archive.load_file(content_hash,
                                               file_name=file_name,
                                               temp_path=work_path)

        if file_path is not None and not os.path.exists(file_path):
            # Probably indicative of file system encoding issues.
            log.error("Invalid path [%r]: %s", document, file_path)
            return

        try:
            # Fall back to the collection's languages/countries when the
            # document itself does not declare any. A document without a
            # collection simply keeps its (empty) values.
            if not len(document.languages) and document.collection is not None:
                document.languages = document.collection.languages

            if not len(document.countries) and document.collection is not None:
                document.countries = document.collection.countries

            result = DocumentResult(self, document,
                                    file_path=file_path,
                                    role_id=role_id)
            self.ingest(file_path, result=result)

            if not shallow and file_path is None:
                # When a directory is ingested, the data is not stored. Thus,
                # try to recurse on the database-backed known children.
                # Imported at function scope (presumably to avoid a circular
                # import at module load time) and only once, not per child.
                from aleph.ingest import ingest_document
                for child in Document.by_parent(document):
                    ingest_document(child, None, role_id=role_id)
        finally:
            db.session.rollback()
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive. Doing it in an outer
        # ``finally`` guarantees the directory is removed on every exit
        # path, including the invalid-path early return and a failing
        # rollback.
        remove_directory(work_path)
def ingest_document(self, document, file_path=None, role_id=None):
    """Run the ingestor over a document that is backed by the database.

    Unless the caller already supplies ``file_path``, the document's data
    is fetched from the archive by content hash. The path is then handed
    to ``self.ingest`` together with a fresh ``DocumentResult``. If there
    is no file path at all, the document's children are ingested too.
    The database session is always rolled back afterwards, and the archive
    is asked to clean up whenever the document has a content hash.
    """
    digest = document.content_hash
    has_digest = digest is not None

    if has_digest and file_path is None:
        file_path = self.archive.load_file(
            digest,
            file_name=document.safe_file_name,
        )

    path_missing = file_path is None
    if not (path_missing or os.path.exists(file_path)):
        # A path that cannot be found on disk most likely points at a
        # file system encoding problem.
        log.error("Ingest invalid path [%r]: %s", document, file_path)
        return

    try:
        collection = document.collection
        if collection is not None:
            # Inherit metadata from the collection when the document
            # carries none of its own.
            if len(document.languages) == 0:
                document.languages = collection.languages
            if len(document.countries) == 0:
                document.countries = collection.countries

        outcome = DocumentResult(
            self,
            document,
            file_path=file_path,
            role_id=role_id,
        )
        self.ingest(file_path, result=outcome)

        if path_missing:
            # Directories have no stored data of their own, so descend
            # into their children transparently.
            for child in Document.by_parent(document):
                self.ingest_document(child, role_id=role_id)
    finally:
        db.session.rollback()
        if has_digest:
            self.archive.cleanup_file(digest)