Ejemplo n.º 1
0
def import_documents(directory):
    """Import every settled regular file found directly inside ``directory``.

    The directory is scanned once (non-recursively) and the modification
    time of each regular file is recorded. After sleeping for
    ``settings.PAPERMERGE_FILES_MIN_UNMODIFIED_DURATION`` seconds the files
    are visited oldest first: a file whose modification time is unchanged is
    moved into a fresh temporary directory and handed to
    ``DocumentImporter``; a file that changed during the wait is left
    untouched. Files that vanish or become unreadable between the scan and
    the check are skipped with a warning instead of aborting the whole run.

    Args:
        directory: Path of the directory to scan.

    Returns:
        None. Returns early, without sleeping, when the directory holds no
        regular files.
    """
    files = []
    # The context manager closes the scandir iterator deterministically.
    with os.scandir(directory) as entries:
        for entry in entries:
            if not entry.is_file():
                logger.warning("Skipping %s as it is not a file", entry.path)
                continue
            try:
                mtime = entry.stat().st_mtime
            except OSError:
                # The entry disappeared (or became unreadable) mid-scan.
                logger.warning(
                    "Skipping %s as it is no longer accessible", entry.path
                )
                continue
            files.append((entry.path, mtime))

    if not files:
        return

    files_old_to_new = sorted(files, key=itemgetter(1))

    # Give writers time to finish; anything modified meanwhile is skipped.
    time.sleep(int(settings.PAPERMERGE_FILES_MIN_UNMODIFIED_DURATION))

    for file, mtime in files_old_to_new:
        try:
            current_mtime = os.path.getmtime(file)
        except OSError:
            # Removed or replaced while we were sleeping: nothing to import.
            logger.warning("Skipping %s as it is no longer accessible", file)
            continue

        if mtime != current_mtime:
            # Still being written to; leave it where it is.
            continue

        # File has not been modified and can be consumed
        logger.info("Importing file %s...", file)
        basename = os.path.basename(file)
        # The temporary directory (and the moved file inside it) is removed
        # when the ``with`` block exits.
        with tempfile.TemporaryDirectory() as tempdirname:
            shutil.move(file, tempdirname)
            temp_file_name = os.path.join(tempdirname, basename)
            logger.info("Same as temp_file_name=%s...", temp_file_name)
            imp = DocumentImporter(temp_file_name)
            imp.import_file()
Ejemplo n.º 2
0
def read_email_message(message):
    """
    Walk every part of an email message and import the supported payloads.

    ``message`` is expected to behave like a message object from python's
    ``email.message`` module: it must provide ``walk()``, and each part
    must provide ``get_content_maintype()``, ``get_content_subtype()`` and
    ``get_payload(decode=True)``.

    Each part's content type is logged at debug level. Parts accepted by
    ``is_payload_supported`` are written to a named temporary file which is
    then passed to ``DocumentImporter``; all other parts are ignored.
    """
    for position, part in enumerate(message.walk()):
        # search for payload
        maintype = part.get_content_maintype()
        subtype = part.get_content_subtype()
        logger.debug(
            f"IMAP import: payload {position} maintype={maintype}"
            f" subtype={subtype}."
        )

        if not is_payload_supported(maintype=maintype, subtype=subtype):
            logger.debug("IMAP import: ignoring payload.")
            continue

        logger.debug("IMAP import: importing...")
        # The importer is given a path, so the decoded payload goes to disk.
        with tempfile.NamedTemporaryFile() as attachment:
            attachment.write(part.get_payload(decode=True))
            # Flush so the bytes are visible to whoever opens the file by name.
            attachment.flush()
            importer = DocumentImporter(attachment.name)
            # The temporary file is removed when the ``with`` block exits.
            importer.import_file(delete_after_import=False)
Ejemplo n.º 3
0
    def test_import_file_with_title_arg(self):
        """Importing with ``file_title`` creates one document by that title."""
        pdf_path = os.path.join(BASE_DIR, "data", "berlin.pdf")
        importer = DocumentImporter(pdf_path)

        imported = importer.import_file(
            file_title="X1.pdf",
            delete_after_import=False,
            skip_ocr=True,
        )
        if not imported:
            self.assertTrue(False, "Error while importing file")

        # Exactly one document must carry the title passed to the importer.
        matching_docs = Document.objects.filter(title="X1.pdf")
        self.assertEqual(
            matching_docs.count(),
            1,
            "Document X1.pdf was not created.",
        )
Ejemplo n.º 4
0
    def put(self, request, filename):
        """Import the uploaded file as a document titled ``filename``.

        The upload is read from ``request.data['file']`` and imported on
        behalf of the requesting user. When the importer returns a
        ``Document`` instance its serialized form is sent back; for any
        other return value an empty response with status 200 is returned.
        """
        uploaded = request.data['file']
        importer = DocumentImporter(
            file=uploaded.temporary_file_path(),
            username=request.user.username,
        )
        result = importer.import_file(
            file_title=filename,
            apply_async=True,
            delete_after_import=False,
        )

        if not isinstance(result, Document):
            # Nothing to serialize; just acknowledge the upload.
            return Response(status=200)

        return Response(DocumentSerializer(result).data)
Ejemplo n.º 5
0
def read_email_message(message):
    """
    Import every PDF attachment found in an email message.

    ``message`` is an instance from python's ``email.message`` module. All
    of its parts are visited; each part whose content type is
    ``application/pdf`` is written to a named temporary file which is then
    passed to ``DocumentImporter``. Every other part is skipped.
    """
    for part in message.walk():
        # search for payload
        content_type = (
            part.get_content_maintype(),
            part.get_content_subtype(),
        )
        if content_type != ('application', 'pdf'):
            continue

        # The importer is given a path, so the decoded payload goes to disk.
        with tempfile.NamedTemporaryFile() as attachment:
            attachment.write(part.get_payload(decode=True))
            # Flush so the bytes are visible to whoever opens the file by name.
            attachment.flush()
            importer = DocumentImporter(attachment.name)
            # The temporary file is removed when the ``with`` block exits.
            importer.import_file(delete_after_import=False)
Ejemplo n.º 6
0
    def test_delete_pages(self):
        """Deleting one page of a two-page document bumps its version."""
        # Create a document with two pages
        pdf_path = os.path.join(BASE_DIR, "data", "berlin.pdf")
        importer = DocumentImporter(pdf_path)

        imported = importer.import_file(
            file_title="berlin.pdf",
            delete_after_import=False,
            skip_ocr=True,
        )
        if not imported:
            self.assertTrue(False, "Error while importing file")

        document = Document.objects.get(title="berlin.pdf")
        self.assertEqual(document.page_count, 2)
        # initial version of any document is 0
        self.assertEqual(document.version, 0)

        document.delete_pages(page_numbers=[1], skip_migration=True)

        # Both the cached counter and the related pages must report one page.
        self.assertEqual(document.page_count, 1)
        self.assertEqual(document.pages.count(), 1)

        # version should have been incremented
        self.assertEqual(document.version, 1)