Example #1
0
    def ingest_document(self, document, file_path=None, role_id=None):
        """Ingest a database-backed document.

        First retrieve its data and then call the actual ingestor.

        :param document: the document record; its ``content_hash``,
            ``file_name``, ``languages``, ``countries`` and ``collection``
            attributes are read here.
        :param file_path: optional path to the document's data. When it is
            ``None`` the path is requested from ``self.archive``.
        :param role_id: passed through to ``DocumentResult`` and to the
            recursive calls made for child documents.
        """
        if file_path is None:
            file_path = self.archive.load_file(document.content_hash,
                                               file_name=document.file_name)

        if file_path is None:
            # When a directory is ingested, the data is not stored. Thus, try
            # to recurse transparently.
            for child in Document.by_parent(document):
                self.ingest_document(child, role_id=role_id)
            return

        if not os.path.exists(file_path):
            # Probably indicative of file system encoding issues.
            # ``warning`` is used because ``Logger.warn`` is a deprecated
            # alias that newer Python versions no longer provide.
            log.warning("Ingest non-existant path [%r]: %s",
                        document, file_path)
            return

        try:
            # Documents without their own languages/countries inherit the
            # collection's values (or an empty list if it has none).
            if not len(document.languages) and document.collection is not None:
                document.languages = document.collection.languages or []

            if not len(document.countries) and document.collection is not None:
                document.countries = document.collection.countries or []

            result = DocumentResult(self,
                                    document,
                                    file_path=file_path,
                                    role_id=role_id)
            self.ingest(file_path, result=result)
        finally:
            # Always give the archive a chance to clean up, even when the
            # ingestor raised.
            self.archive.cleanup_file(document.content_hash)
Example #2
0
    def ingest_document(self,
                        document,
                        file_path=None,
                        role_id=None,
                        shallow=False):
        """Ingest a database-backed document.

        First retrieve its data and then call the actual ingestor.

        :param document: the document record; its ``content_hash``,
            ``safe_file_name``, ``languages``, ``countries`` and
            ``collection`` attributes are read here.
        :param file_path: optional path to the document's data. When it is
            ``None`` and the document has a content hash, the path is
            requested from ``self.archive``.
        :param role_id: passed through to ``DocumentResult`` and to the
            ingest calls made for child documents.
        :param shallow: when true, child documents are not ingested even if
            no file path is available for this document.
        """
        # Work path will be used by storagelayer to cache a local
        # copy of data from an S3-based archive, and by ingestors
        # to perform processing and generate intermediary files.
        work_path = mkdtemp(prefix="aleph.ingest.")
        try:
            content_hash = document.content_hash
            if file_path is None and content_hash is not None:
                file_name = document.safe_file_name
                file_path = self.archive.load_file(content_hash,
                                                   file_name=file_name,
                                                   temp_path=work_path)

            if file_path is not None and not os.path.exists(file_path):
                # Probably indicative of file system encoding issues.
                log.error("Invalid path [%r]: %s", document, file_path)
                return

            try:
                # Inherit languages/countries from the collection, but only
                # when the document actually belongs to one.
                collection = document.collection
                if collection is not None:
                    if not len(document.languages):
                        document.languages = collection.languages

                    if not len(document.countries):
                        document.countries = collection.countries

                result = DocumentResult(self,
                                        document,
                                        file_path=file_path,
                                        role_id=role_id)
                self.ingest(file_path, result=result)

                if not shallow and file_path is None:
                    # When a directory is ingested, the data is not stored.
                    # Thus, try to recurse on the database-backed known
                    # children.
                    # NOTE(review): imported locally, presumably to avoid a
                    # circular import with aleph.ingest -- confirm.
                    from aleph.ingest import ingest_document
                    for child in Document.by_parent(document):
                        ingest_document(child, None, role_id=role_id)
            finally:
                db.session.rollback()
        finally:
            # Removing the temp_path given to storagelayer makes it redundant
            # to also call cleanup on the archive. Doing it in an outer
            # ``finally`` ensures the directory is also removed when loading
            # fails or the path turns out to be invalid.
            remove_directory(work_path)
Example #3
0
    def ingest_document(self, document, file_path=None, role_id=None):
        """Ingest a database-backed document.

        Fetch the document's data from the archive unless a path was
        supplied by the caller, then hand it over to the actual ingestor.
        Child documents are ingested recursively when no path is available.
        """
        content_hash = document.content_hash
        has_hash = content_hash is not None

        if file_path is None and has_hash:
            safe_name = document.safe_file_name
            file_path = self.archive.load_file(content_hash,
                                               file_name=safe_name)

        has_path = file_path is not None
        if has_path and not os.path.exists(file_path):
            # Probably indicative of file system encoding issues.
            log.error("Ingest invalid path [%r]: %s", document, file_path)
            return

        try:
            collection = document.collection
            if collection is not None:
                # Fall back to the collection's metadata where the document
                # has none of its own.
                if len(document.languages) == 0:
                    document.languages = collection.languages
                if len(document.countries) == 0:
                    document.countries = collection.countries

            outcome = DocumentResult(self,
                                     document,
                                     file_path=file_path,
                                     role_id=role_id)
            self.ingest(file_path, result=outcome)

            if not has_path:
                # When a directory is ingested, the data is not stored. Thus,
                # try to recurse transparently.
                for sub_document in Document.by_parent(document):
                    self.ingest_document(sub_document, role_id=role_id)
        finally:
            db.session.rollback()
            if has_hash:
                self.archive.cleanup_file(content_hash)