def create_temp_dir(self, *args, **kwargs):
    """Yield a freshly created temporary directory, deleting it afterwards.

    All positional and keyword arguments are forwarded to
    ``tempfile.mkdtemp``. The decoded path is handed to the caller, and
    the directory tree is removed once the context ends — even when the
    body raises.
    """
    created = tempfile.mkdtemp(*args, **kwargs)
    try:
        yield decode_path(created)
    finally:
        remove_directory(created)
def ingest(document_id, file_path=None):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    if file_path is None:
        file_path = archive.load_file(document.content_hash,
                                      file_name=document.safe_file_name,
                                      temp_path=work_path)
    try:
        manager = get_manager()
        result = DocumentResult(manager, document, file_path=file_path)
        # Fix: re-use the manager fetched above rather than resolving
        # get_manager() a second time for the same call.
        manager.ingest(file_path, result=result, work_path=work_path)
        log.debug('Ingested [%s:%s]: %s',
                  document.id, document.schema, document.name)
        db.session.commit()
        process_document(document)
    except Exception:
        # Roll back partial writes, then re-fetch the document so the
        # log line reads from a clean session state.
        db.session.rollback()
        document = Document.by_id(document_id)
        log.exception("Ingest failed [%s]: %s", document.id, document.name)
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)
def ingest_document(self, document, file_path=None, role_id=None,
                    shallow=False):
    """Ingest a database-backed document.

    First retrieve its data and then call the actual ingestor.
    """
    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    content_hash = document.content_hash
    if file_path is None and content_hash is not None:
        file_name = document.safe_file_name
        file_path = self.archive.load_file(content_hash,
                                           file_name=file_name,
                                           temp_path=work_path)
    if file_path is not None and not os.path.exists(file_path):
        # Probably indicative of file system encoding issues.
        log.error("Invalid path [%r]: %s", document, file_path)
        # Fix: the early return previously leaked the work directory.
        remove_directory(work_path)
        return
    try:
        # Fall back to the collection-level defaults when the document
        # carries no language/country hints of its own.
        if not len(document.languages):
            document.languages = document.collection.languages
        if not len(document.countries):
            document.countries = document.collection.countries
        result = DocumentResult(self, document,
                                file_path=file_path,
                                role_id=role_id)
        self.ingest(file_path, result=result)
        if not shallow and file_path is None:
            # When a directory is ingested, the data is not stored. Thus,
            # try to recurse on the database-backed known children.
            # Fix: import hoisted out of the loop body.
            from aleph.ingest import ingest_document
            for child in Document.by_parent(document):
                ingest_document(child, None, role_id=role_id)
    finally:
        db.session.rollback()
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)
def ingest(document_id, file_path=None, refresh=False):
    """Process a given document by extracting its contents.
    This may include creating or updating child documents."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    # Work path will be used by storagelayer to cache a local
    # copy of data from an S3-based archive, and by ingestors
    # to perform processing and generate intermediary files.
    work_path = mkdtemp(prefix="aleph.ingest.")
    if file_path is None:
        file_path = archive.load_file(document.content_hash,
                                      file_name=document.safe_file_name,
                                      temp_path=work_path)
    try:
        manager = get_manager()
        result = DocumentResult(manager, document, file_path=file_path)
        # Fix: re-use the manager fetched above rather than resolving
        # get_manager() a second time for the same call.
        manager.ingest(file_path, result=result, work_path=work_path)
        document.status = Document.STATUS_SUCCESS
        log.debug('Ingested [%s:%s]: %s',
                  document.id, document.schema, document.name)
        if document.collection.casefile and not refresh:
            # Only notify on first-time ingests of casefile documents.
            params = {'collection': document.collection,
                      'document': document}
            publish(Events.INGEST_DOCUMENT,
                    actor_id=document.uploader_id,
                    params=params)
        db.session.commit()
    except Exception:
        # Roll back partial writes, re-fetch the document and record the
        # failure status so the UI can surface it.
        db.session.rollback()
        document = Document.by_id(document_id)
        log.exception("Ingest failed [%s]: %s", document.id, document.name)
        document.status = Document.STATUS_FAIL
        db.session.commit()
    finally:
        # Removing the temp_path given to storagelayer makes it redundant
        # to also call cleanup on the archive.
        remove_directory(work_path)

    extract_document_tags(document)
    # delete_entity(document.id, exclude=document.schema)
    index_document(document)
    refresh_entity(document)
def extract_message(self, zipf, name):
    """Extract one message XML entry from the archive and ingest it,
    along with any attachments referenced inside it.

    Entries are skipped unless their name contains ``message_`` and
    ends in ``.xml``.
    """
    if not name.endswith('.xml') or 'message_' not in name:
        return
    parent = self.extract_hierarchy(name)
    scratch = self.make_empty_directory()
    try:
        xml_path = self.extract_file(zipf, name, scratch)
        foreign_id = os.path.join(self.result.id, name)
        message = self.manager.handle_child(parent, xml_path,
                                            id=foreign_id,
                                            mime_type=MIME)
        try:
            doc = self.parse_xml(xml_path)
            attachments = doc.findall('.//messageAttachment')
            for el in attachments:
                self.extract_attachment(zipf, message, el, scratch)
        except TypeError:
            # this will be reported for the individual file.
            pass
    finally:
        remove_directory(scratch)
def close(self):
    """Finish writing and discard the temporary working directory."""
    # Ensure buffered output is persisted before the cleanup below.
    self.writer.flush()
    remove_directory(self.work_path)
def finalize(self, entity):
    """Emit the final entity, flush pending output, and remove the
    temporary working directory."""
    self.emit_entity(entity)
    self.writer.flush()
    remove_directory(self.work_path)
def cleanup(self):
    """Dispose of the temporary working directory."""
    remove_directory(self.work_path)