Code example #1
File: test_parsers.py  Project: bauerj/paperless-ng
    def test_file_extensions(self):

        for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]:
            self.assertIn(ext, get_supported_file_extensions())
        self.assertEqual(get_default_file_extension('application/pdf'), ".pdf")
        self.assertEqual(get_default_file_extension('image/png'), ".png")
        self.assertEqual(get_default_file_extension('image/jpeg'), ".jpg")
        self.assertEqual(get_default_file_extension('text/plain'), ".txt")
        self.assertEqual(get_default_file_extension('text/csv'), ".csv")
        self.assertEqual(get_default_file_extension('aasdasd/dgfgf'), None)

        self.assertEqual(get_parser_class_for_mime_type('application/pdf'),
                         RasterisedDocumentParser)
        self.assertEqual(get_parser_class_for_mime_type('text/plain'),
                         TextDocumentParser)
        self.assertEqual(get_parser_class_for_mime_type('text/sdgsdf'), None)
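
The test above exercises get_supported_file_extensions, get_default_file_extension and get_parser_class_for_mime_type from documents.parsers, which are not part of this excerpt. The sketch below is only an illustration of the behaviour the assertions require (for example, returning None for an unknown MIME type); it is not the paperless-ng implementation, and the mapping tables in it are assumptions.

# Hypothetical sketch -- not the paperless-ng implementation, just the
# behaviour the assertions above require. RasterisedDocumentParser and
# TextDocumentParser are assumed importable as in the test module.
DEFAULT_EXTENSIONS = {
    "application/pdf": ".pdf",
    "image/png": ".png",
    "image/jpeg": ".jpg",
    "text/plain": ".txt",
    "text/csv": ".csv",
}

# Extra extensions that map onto an already-covered MIME type.
EXTENSION_ALIASES = {".jpe", ".jpeg"}


def get_supported_file_extensions():
    return set(DEFAULT_EXTENSIONS.values()) | EXTENSION_ALIASES


def get_default_file_extension(mime_type):
    # This older test expects None for an unknown MIME type
    # (the newer test in code example #3 expects "" instead).
    return DEFAULT_EXTENSIONS.get(mime_type)


def get_parser_class_for_mime_type(mime_type):
    # Assumed registry of parser classes keyed by the MIME types they accept.
    registry = {
        "application/pdf": RasterisedDocumentParser,
        "text/plain": TextDocumentParser,
    }
    return registry.get(mime_type)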
Code example #2
def move_old_to_new_locations(apps, schema_editor):
    Document = apps.get_model("documents", "Document")

    affected_document_ids = set()

    old_archive_path_to_id = {}

    # check for documents that have incorrect archive versions
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)

        if old_path in old_archive_path_to_id:
            affected_document_ids.add(doc.id)
            affected_document_ids.add(old_archive_path_to_id[old_path])
        else:
            old_archive_path_to_id[old_path] = doc.id

    # check that archive files of all unaffected documents are in place
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)
        if doc.id not in affected_document_ids and not os.path.isfile(
                old_path):
            raise ValueError(
                f"Archived document ID:{doc.id} does not exist at: "
                f"{old_path}")

    # check that we can regenerate affected archive versions
    for doc_id in affected_document_ids:
        from documents.parsers import get_parser_class_for_mime_type

        doc = Document.objects.get(id=doc_id)
        parser_class = get_parser_class_for_mime_type(doc.mime_type)
        if not parser_class:
            raise ValueError(
                f"Document ID:{doc.id} has an invalid archived document, "
                f"but no parsers are available. Cannot migrate.")

    for doc in Document.objects.filter(archive_checksum__isnull=False):

        if doc.id in affected_document_ids:
            old_path = archive_path_old(doc)
            # remove affected archive versions
            if os.path.isfile(old_path):
                logger.debug(f"Removing {old_path}")
                os.unlink(old_path)
        else:
            # Set archive path for unaffected files
            doc.archive_filename = archive_name_from_filename(doc.filename)
            Document.objects.filter(id=doc.id).update(
                archive_filename=doc.archive_filename)

    # regenerate archive documents
    for doc_id in affected_document_ids:
        doc = Document.objects.get(id=doc_id)
        create_archive_version(doc)
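
move_old_to_new_locations is written to be run as a data migration step. The wiring into a Django Migration class is not shown in the excerpt; below is a minimal sketch of the usual migrations.RunPython hookup, assuming the function lives in the migration module itself. The dependency name is a placeholder, not the real migration name.

# Hypothetical wiring -- the actual paperless-ng migration file may differ.
from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [
        ("documents", "previous_migration_placeholder"),  # placeholder name
    ]

    operations = [
        migrations.RunPython(
            move_old_to_new_locations,
            # A real migration would provide a proper reverse step; noop is
            # used here only to keep the sketch self-contained.
            migrations.RunPython.noop,
        ),
    ]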
Code example #3
    def test_file_extensions(self):

        for ext in [".pdf", ".jpe", ".jpg", ".jpeg", ".txt", ".csv"]:
            self.assertIn(ext, get_supported_file_extensions())
        self.assertEqual(get_default_file_extension('application/pdf'), ".pdf")
        self.assertEqual(get_default_file_extension('image/png'), ".png")
        self.assertEqual(get_default_file_extension('image/jpeg'), ".jpg")
        self.assertEqual(get_default_file_extension('text/plain'), ".txt")
        self.assertEqual(get_default_file_extension('text/csv'), ".csv")
        self.assertEqual(get_default_file_extension('application/zip'), ".zip")
        self.assertEqual(get_default_file_extension('aasdasd/dgfgf'), "")

        self.assertIsInstance(
            get_parser_class_for_mime_type('application/pdf')(
                logging_group=None), RasterisedDocumentParser)
        self.assertIsInstance(
            get_parser_class_for_mime_type('text/plain')(logging_group=None),
            TextDocumentParser)
        self.assertEqual(get_parser_class_for_mime_type('text/sdgsdf'), None)

        self.assertTrue(is_file_ext_supported('.pdf'))
        self.assertFalse(is_file_ext_supported('.hsdfh'))
        self.assertFalse(is_file_ext_supported(''))
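
This newer version of the test also checks is_file_ext_supported and expects an empty string, rather than None, for an unknown MIME type. is_file_ext_supported itself is not shown here; the following is a minimal sketch of the behaviour the assertions imply, built on get_supported_file_extensions, and is an assumption rather than the actual implementation.

# Hypothetical sketch -- mirrors only what the assertions above require.
def is_file_ext_supported(ext):
    # An empty extension is never supported; otherwise compare
    # case-insensitively against the known extensions.
    if not ext:
        return False
    return ext.lower() in get_supported_file_extensions()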
Code example #4
def create_archive_version(doc, retry_count=3):
    from documents.parsers import get_parser_class_for_mime_type, \
        DocumentParser, \
        ParseError

    logger.info(f"Regenerating archive document for document ID:{doc.id}")
    parser_class = get_parser_class_for_mime_type(doc.mime_type)
    for try_num in range(retry_count):
        parser: DocumentParser = parser_class(None, None)
        try:
            parse_wrapper(parser, source_path(doc), doc.mime_type,
                          os.path.basename(doc.filename))
            doc.content = parser.get_text()

            if parser.get_archive_path() and os.path.isfile(
                    parser.get_archive_path()):
                doc.archive_filename = generate_unique_filename(
                    doc, archive_filename=True)
                with open(parser.get_archive_path(), "rb") as f:
                    doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
                os.makedirs(os.path.dirname(archive_path_new(doc)),
                            exist_ok=True)
                shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
            else:
                doc.archive_checksum = None
                logger.error(
                    f"Parser did not return an archive document for document "
                    f"ID:{doc.id}. Removing archive document.")
            doc.save()
            return
        except ParseError:
            if try_num + 1 == retry_count:
                logger.exception(
                    f"Unable to regenerate archive document for ID:{doc.id}. You "
                    f"need to invoke the document_archiver management command "
                    f"manually for that document.")
                doc.archive_checksum = None
                doc.save()
                return
            else:
                # This is mostly here for the tika parser in docker
                # environments. The servers for parsing need to come up first,
                # and the docker setup doesn't ensure that tika is running
                # before attempting migrations.
                logger.error("Parse error, will try again in 5 seconds...")
                sleep(5)
        finally:
            parser.cleanup()
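
parse_wrapper, source_path and archive_path_new are helpers defined elsewhere in the migration module and are not part of this excerpt. Purely as an illustration, parse_wrapper might simply forward to the parser's parse method; the exact signature it smooths over in paperless-ng is an assumption here.

# Hypothetical sketch -- the real helper may exist to handle signature
# differences between parser versions.
def parse_wrapper(parser, path, mime_type, file_name):
    # DocumentParser.parse is assumed to accept (path, mime_type, file_name).
    parser.parse(path, mime_type, file_name)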