Beispiel #1
0
    def parse(self, document_path, mime_type):
        """Extract text, creation date and a PDF rendition of a document.

        The file is handed to Tika through ``parser.from_file``. On success
        the following attributes are set on the instance:

        * ``self.text`` -- stripped document content, or ``""`` when the
          response carries no usable content.
        * ``self.date`` -- the ``Creation-Date`` metadata entry parsed as an
          ISO-8601 datetime, or ``None`` when it is missing or unparsable.
        * ``self.archive_path`` -- ``convert.pdf`` inside ``self.tempdir``,
          passed to ``convert_to_pdf`` as the output location.

        Args:
            document_path: Path of the document to parse.
            mime_type: Accepted for interface compatibility; not used here.

        Raises:
            ParseError: If the Tika request fails with an HTTP error.
        """
        self.log("info",
                 f"[TIKA_PARSE] Sending {document_path} to Tika server")

        try:
            parsed = parser.from_file(document_path)
        except requests.exceptions.HTTPError as err:
            # Chain the cause so the original traceback stays available.
            raise ParseError(
                f"Could not parse {document_path} with tika server: {err}"
            ) from err

        # Content extraction is best-effort: a missing key or a None value
        # yields an empty text. ``Exception`` (rather than a bare except)
        # keeps KeyboardInterrupt/SystemExit propagating.
        try:
            content = parsed["content"].strip()
        except Exception:
            content = ""

        # Date extraction is best-effort as well; any failure means "no date".
        try:
            creation_date = dateutil.parser.isoparse(
                parsed["metadata"]["Creation-Date"])
        except Exception:
            creation_date = None

        archive_path = os.path.join(self.tempdir, "convert.pdf")
        convert_to_pdf(self, document_path, archive_path)

        # Results are published only after the conversion call returned.
        self.archive_path = archive_path
        self.date = creation_date
        self.text = content
Beispiel #2
0
    def extract_metadata(self, document_path, mime_type):
        """Return the document's Tika metadata as a list of entry dicts.

        Each entry has the keys ``namespace`` and ``prefix`` (both empty
        strings), ``key`` (the metadata name) and ``value`` (the metadata
        value as returned by Tika).

        If the call to Tika raises, a warning is logged and an empty list is
        returned. ``mime_type`` is accepted but not used here.
        """
        endpoint = settings.PAPERLESS_TIKA_ENDPOINT

        try:
            tika_result = parser.from_file(document_path, endpoint)
        except Exception as e:
            message = (f"Error while fetching document metadata for "
                       f"{document_path}: {e}")
            self.log("warning", message)
            return []

        metadata = tika_result['metadata']
        entries = []
        for name in metadata:
            entry = {
                "namespace": "",
                "prefix": "",
                "key": name,
                "value": metadata[name],
            }
            entries.append(entry)
        return entries
Beispiel #3
0
    def parse(self, document_path, mime_type, file_name=None):
        """Extract text, creation date and a PDF rendition of a document.

        The file is sent to the Tika server configured in
        ``settings.PAPERLESS_TIKA_ENDPOINT``. On success the method sets:

        * ``self.text`` -- stripped document content, or ``""`` when the
          response has no content.
        * ``self.date`` -- the ``Creation-Date`` metadata entry parsed as an
          ISO-8601 datetime. If it is missing or unparsable a warning is
          logged and ``self.date`` is left untouched.
        * ``self.archive_path`` -- the return value of
          ``self.convert_to_pdf(document_path, file_name)``.

        Args:
            document_path: Path of the document to parse.
            mime_type: Accepted for interface compatibility; not used here.
            file_name: Forwarded unchanged to ``self.convert_to_pdf``.

        Raises:
            ParseError: If the request to the Tika server fails for any
                reason.
        """
        self.log("info", f"Sending {document_path} to Tika server")
        tika_server = settings.PAPERLESS_TIKA_ENDPOINT

        try:
            parsed = parser.from_file(document_path, tika_server)
        except Exception as err:
            # Chain the cause so the original traceback stays available.
            raise ParseError(
                f"Could not parse {document_path} with tika server at "
                f"{tika_server}: {err}") from err

        # The content entry can be absent or None; treat both as empty text
        # instead of failing on ``None.strip()``.
        self.text = (parsed.get("content") or "").strip()

        # Date extraction is best-effort: failures are logged, not raised.
        try:
            self.date = dateutil.parser.isoparse(
                parsed["metadata"]["Creation-Date"])
        except Exception as e:
            self.log(
                "warning", f"Unable to extract date for document "
                f"{document_path}: {e}")

        self.archive_path = self.convert_to_pdf(document_path, file_name)