Example #1
0
    def import_file(
        self,
        file_title=None,
        inbox_title=Folder.INBOX_NAME,
        delete_after_import=True,
        skip_ocr=False,
        apply_async=False
    ):
        """
        Import the file at ``self.filepath`` into the user's inbox folder.

        Steps:
            1. Create a document instance.
            2. Copy the file to ``doc.path.url()`` in the default storage.
            3. OCR the document (unless ``skip_ocr`` is True).

        Used with
            ./manage.py local_importer
            ./manage.py imap_importer
        command

        Arguments:
        - file_title (str, optional): document title; defaults to the
          basename of ``self.filepath``.
        - inbox_title (str, optional): title of the destination folder.
        - delete_after_import (bool, optional): remove the source file
          after import. Defaults to True.
        - skip_ocr (bool, optional): skip OCR entirely. Defaults to False.
        - apply_async (bool, optional): schedule OCR per page via celery
          tasks instead of running it synchronously. Defaults to False.

        Returns:
            Document on success; False when the page count could not be
            determined (unsupported or corrupt file).
        """
        logger.debug(f"Importing file {self.filepath}.")

        if file_title is None:
            file_title = os.path.basename(self.filepath)

        try:
            page_count = get_pagecount(self.filepath)
        except Exception:
            # logger.exception records the traceback, so we can see *why*
            # the page count failed (plain logger.error hid the cause)
            logger.exception(
                f"Error while getting page count of {self.filepath}.")
            return False

        inbox, _ = Folder.objects.get_or_create(
            title=inbox_title,
            parent=None,
            user=self.user
        )
        doc = Document.create_document(
            user=self.user,
            title=file_title,
            size=os.path.getsize(self.filepath),
            lang=self.user_ocr_language,
            file_name=file_title,
            parent_id=inbox.id,
            page_count=page_count,
            rebuild_tree=False
        )
        logger.debug(
            f"Uploading file {self.filepath} to {doc.path.url()}"
        )
        default_storage.copy_doc(
            src=self.filepath,
            dst=doc.path.url(),
        )

        if not skip_ocr:
            if apply_async:
                # one celery task per page
                for page_num in range(1, page_count + 1):
                    ocr_page_task.apply_async(kwargs={
                        'user_id': self.user.id,
                        'document_id': doc.id,
                        'file_name': file_title,
                        'page_num': page_num,
                        'lang': self.user_ocr_language}
                    )
            else:
                DocumentImporter.ocr_document(
                    document=doc,
                    page_count=page_count,
                    lang=self.user_ocr_language,
                )

        if delete_after_import:
            # Usually we want to delete files when importing
            # them from local directory
            # When importing from Email attachment - deleting
            # files does not apply
            os.remove(self.filepath)

        logger.debug("Import complete.")

        return doc
Example #2
0
def restore_documents(restore_file: io.BytesIO, user: User, skip_ocr=False):
    """
    Restore documents (folders, tags, files) from a backup tar archive.

    Arguments:
    - restore_file: in-memory tar archive produced by the backup command;
      must contain a top-level ``backup.json`` with document metadata.
    - user: owner of the restored documents. If falsy, the archive is
      expected to carry per-user metadata and a leading username component
      in each member path; missing users are recreated first.
    - skip_ocr: when True, no OCR tasks are scheduled for restored pages.
    """
    restore_file.seek(0)

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:
        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        leading_user_in_path = False
        _user = user
        if not user:
            leading_user_in_path = True
            # user was not specified. It is assumed that
            # backup.json contains a list of users.
            # Thus recreate users first.
            for backup_user in backup_info['users']:
                user = User.objects.create(
                    username=backup_user['username'],
                    email=backup_user['email'],
                    is_active=backup_user['is_active'],
                    is_superuser=backup_user['is_superuser'])
                # in case --include-user-password switch was used
                # update user (raw digest of) password field
                password = backup_user.get('password')
                if password:
                    user.password = password
                    user.save()

        # FIX: the loop variable used to shadow the ``restore_file``
        # parameter; renamed to ``member_path``. The parameter is fully
        # consumed by tarfile.open() above, so behavior is unchanged.
        for member_path in restore_archive.getnames():

            if member_path == "backup.json":
                continue

            logger.debug(f"Restoring file {member_path}...")

            splitted_path = PurePath(member_path).parts
            base, ext = os.path.splitext(
                remove_backup_filename_id(splitted_path[-1]))

            # if there is leading username, remove it.
            if leading_user_in_path:
                username = splitted_path[0]
                _user = User.objects.get(username=username)
                splitted_path = splitted_path[1:]

            if backup_info.get('documents', False):
                backup_info_documents = backup_info['documents']
            else:
                backup_info_documents = _get_json_user_documents_list(
                    backup_info, _user)
                leading_user_in_path = True

            # Find the metadata entry matching this archive member.
            # NOTE(review): if no entry matches, document_info silently
            # ends up as the *last* entry — confirm every archive member
            # has a matching record in backup.json.
            for info in backup_info_documents:
                document_info = info
                if info['path'] == member_path:
                    break

            parent = None

            # variables used only to shorten debug message
            _sp = splitted_path
            _rf = member_path
            logger.debug(
                f"{_rf}: splitted_path={_sp} len(splitted_path)={len(_sp)}")
            # we first have to create a folder structure
            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:

                    folder_object = Folder.objects.filter(
                        title=folder,
                        user=_user).filter(parent=parent).first()

                    if folder_object is None:
                        new_folder = Folder.objects.create(title=folder,
                                                           parent=parent,
                                                           user=_user)
                        parent = new_folder
                    else:
                        parent = folder_object

            with NamedTemporaryFile("w+b", suffix=ext) as temp_output:
                logger.debug(f"Extracting {member_path}...")

                ff = restore_archive.extractfile(member_path)
                temp_output.write(ff.read())
                temp_output.seek(0)
                size = os.path.getsize(temp_output.name)

                page_count = get_pagecount(temp_output.name)

                if parent:
                    parent_id = parent.id
                else:
                    parent_id = None

                new_doc = Document.objects.create_document(
                    user=_user,
                    title=document_info['title'],
                    size=size,
                    lang=document_info['lang'],
                    file_name=remove_backup_filename_id(splitted_path[-1]),
                    parent_id=parent_id,
                    notes="",
                    page_count=page_count,
                    rebuild_tree=False  # speeds up 100x
                )

                tag_attributes = document_info.get('tags', [])

                for attrs in tag_attributes:
                    attrs['user'] = _user
                    tag, created = Tag.objects.get_or_create(**attrs)
                    new_doc.tags.add(tag)

                default_storage.copy_doc(src=temp_output.name,
                                         dst=new_doc.path.url())

            if not skip_ocr:
                for page_num in range(1, page_count + 1):
                    ocr_page.apply_async(
                        kwargs={
                            'user_id': _user.id,
                            'document_id': new_doc.id,
                            'file_name': new_doc.file_name,
                            'page_num': page_num,
                            'lang': document_info['lang']
                        })
Example #3
0
def upload(request):
    """
    Handle a single-file AJAX upload.

    The return value is interpreted by the
    papermerge.core.views.decorators.json_reponse decorator: either a
    ``(message, status)`` pair on error, or a result dict on success.
    """
    files = request.FILES.getlist('file')
    if not files:
        logger.warning("POST request.FILES is empty. Forgot adding file?")
        return "Missing input file", 400

    if len(files) > 1:
        msg = "More then one files per ajax? how come?"
        logger.warning(msg)

        return msg, 400

    upload_file = files[0]

    logger.debug("upload for f=%s user=%s", upload_file, request.user)

    user = request.user
    tmp_path = upload_file.temporary_file_path()
    size = os.path.getsize(tmp_path)

    parent_id = request.POST.get('parent', "-1")
    # NOTE(review): substring test — any parent value containing "-1" is
    # treated as "no parent"; confirm parent ids are plain integers.
    if parent_id and "-1" in parent_id:
        parent_id = None

    lang = request.POST.get('language')
    notes = request.POST.get('notes')

    try:
        page_count = get_pagecount(tmp_path)
    except exceptions.FileTypeNotSupported:
        msg = _("File type not supported."
                " Only pdf, tiff, png, jpeg files are supported")
        return msg, 400

    logger.debug("creating document {}".format(upload_file.name))
    doc = Document.create_document(
        user=user,
        title=upload_file.name,
        size=size,
        lang=lang,
        file_name=upload_file.name,
        parent_id=parent_id,
        notes=notes,
        page_count=page_count
    )

    logger.debug("uploading to {}".format(doc.path.url()))
    default_storage.copy_doc(src=tmp_path, dst=doc.path.url())

    # queue one OCR task per page
    for page_num in range(1, page_count + 1):
        ocr_page.apply_async(kwargs={
            'user_id': user.id,
            'document_id': doc.id,
            'file_name': upload_file.name,
            'page_num': page_num,
            'lang': lang
        })

    # upload only one file at time.
    # after each upload return a json object with
    # following fields:
    #
    # - title
    # - preview_url
    # - doc_id
    # - action_url  -> needed for renaming/deleting selected item
    #
    # with that info a new thumbnail will be created.
    preview_url = reverse('core:preview', args=(doc.id, 200, 1))

    return {
        'title': doc.title,
        'doc_id': doc.id,
        'action_url': "",
        'preview_url': preview_url
    }
Example #4
0
    def post(self, request):
        """
        Handle a single-file AJAX upload and return document info as JSON.

        Returns HTTP 400 when no file (or more than one file) is posted.
        On success the JSON payload carries title, doc_id, action_url and
        preview_url, which the frontend uses to build a thumbnail.
        """
        files = request.FILES.getlist('file')
        if not files:
            logger.warning("POST request.FILES is empty. Forgot adding file?")
            return HttpResponseBadRequest("Missing input file")

        if len(files) > 1:
            logger.warning("More then one files per ajax? how come?")
            # BUG FIX: HttpResponse takes ``status``, not ``status_code``;
            # the old keyword raised TypeError instead of returning 400.
            return HttpResponse(json.dumps({}),
                                content_type="application/json",
                                status=400)

        f = files[0]

        logger.debug("upload for f=%s user=%s", f, request.user)

        user = request.user
        size = os.path.getsize(f.temporary_file_path())
        parent_id = request.POST.get('parent', "-1")
        # NOTE(review): substring test — any parent value containing "-1"
        # is treated as "no parent"; confirm ids are plain integers.
        if parent_id and "-1" in parent_id:
            parent_id = None

        lang = request.POST.get('language')
        notes = request.POST.get('notes')
        page_count = get_pagecount(f.temporary_file_path())
        logger.info("creating document {}".format(f.name))

        doc = Document.create_document(user=user,
                                       title=f.name,
                                       size=size,
                                       lang=lang,
                                       file_name=f.name,
                                       parent_id=parent_id,
                                       notes=notes,
                                       page_count=page_count)
        logger.debug("uploading to {}".format(doc.path.url()))

        default_storage.copy_doc(src=f.temporary_file_path(),
                                 dst=doc.path.url())

        # queue one OCR task per page
        for page_num in range(1, page_count + 1):
            ocr_page.apply_async(
                kwargs={
                    'user_id': user.id,
                    'document_id': doc.id,
                    'file_name': f.name,
                    'page_num': page_num,
                    'lang': lang
                })

        # upload only one file at time.
        # after each upload return a json object with
        # following fields:
        #
        # - title
        # - preview_url
        # - doc_id
        # - action_url  -> needed for renaming/deleting selected item
        #
        # with that info a new thumbnail will be created.

        action_url = reverse('boss:core_basetreenode_change', args=(doc.id, ))

        preview_url = reverse('core:preview', args=(doc.id, 200, 1))

        result = {
            'title': doc.title,
            'doc_id': doc.id,
            'action_url': action_url,
            'preview_url': preview_url
        }
        logger.info("and response is!")
        return HttpResponse(json.dumps(result),
                            content_type="application/json")
def restore_documents(restore_file: io.BytesIO, user: User, skip_ocr=False):
    """
    Restore documents (folders and files) from a backup tar archive.

    Arguments:
    - restore_file: in-memory tar archive; must contain a top-level
      ``backup.json`` with document metadata.
    - user: owner of the restored documents. If falsy, the archive is
      expected to carry per-user metadata and a leading username component
      in each member path; users are recreated first.
    - skip_ocr: when True, no OCR tasks are scheduled for restored pages.
    """
    restore_file.seek(0)

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:
        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        leading_user_in_path = False
        _user = user
        if not user:
            leading_user_in_path = True
            # user was not specified. It is assumed that
            # backup.json contains a list of users.
            # Thus recreate users first.
            for backup_user in backup_info['users']:
                User.objects.create(username=backup_user['username'],
                                    email=backup_user['email'],
                                    is_active=backup_user['is_active'],
                                    is_superuser=backup_user['is_superuser'])

        for restore_file in restore_archive.getnames():

            if restore_file == "backup.json":
                continue

            splitted_path = PurePath(restore_file).parts
            base, ext = os.path.splitext(
                remove_backup_filename_id(splitted_path[-1]))

            # if there is leading username, remove it.
            if leading_user_in_path:
                username = splitted_path[0]
                _user = User.objects.get(username=username)
                splitted_path = splitted_path[1:]

            if backup_info.get('documents', False):
                backup_info_documents = backup_info['documents']
            else:
                backup_info_documents = _get_json_user_documents_list(
                    backup_info, _user)
                leading_user_in_path = True

            # NOTE(review): if no entry matches, document_info silently
            # ends up as the *last* entry — confirm every archive member
            # has a matching record in backup.json.
            for info in backup_info_documents:
                document_info = info
                if info['path'] == restore_file:
                    break

            parent = None
            # we first have to create a folder structure

            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:

                    folder_object = Folder.objects.filter(
                        title=folder,
                        user=_user).filter(parent=parent).first()

                    if folder_object is None:
                        new_folder = Folder.objects.create(title=folder,
                                                           parent=parent,
                                                           user=_user)
                        parent = new_folder
                    else:
                        parent = folder_object

            # BUG FIX: extraction and OCR below used to be nested inside
            # the ``len(splitted_path) > 1`` branch, so archive members at
            # the top level (no folder component) were silently skipped
            # and ``new_doc``/``page_count`` could be referenced unbound.
            with NamedTemporaryFile("w+b", suffix=ext) as temp_output:
                ff = restore_archive.extractfile(restore_file)
                temp_output.write(ff.read())
                temp_output.seek(0)
                size = os.path.getsize(temp_output.name)

                page_count = get_pagecount(temp_output.name)

                if parent:
                    parent_id = parent.id
                else:
                    parent_id = None

                new_doc = Document.create_document(
                    user=_user,
                    title=document_info['title'],
                    size=size,
                    lang=document_info['lang'],
                    file_name=remove_backup_filename_id(splitted_path[-1]),
                    parent_id=parent_id,
                    notes="",
                    page_count=page_count,
                    rebuild_tree=False  # speeds up 100x
                )

                default_storage.copy_doc(src=temp_output.name,
                                         dst=new_doc.path.url())

            if not skip_ocr:
                for page_num in range(1, page_count + 1):
                    ocr_page.apply_async(
                        kwargs={
                            'user_id': _user.id,
                            'document_id': new_doc.id,
                            'file_name': new_doc.file_name,
                            'page_num': page_num,
                            'lang': document_info['lang']
                        })
Example #6
0
    def apply(self,
              user=None,
              parent=None,
              lang=None,
              notes=None,
              name=None,
              skip_ocr=False,
              apply_async=False,
              create_document=True,
              **kwargs):
        """
        Apply the pipeline. The document is created or modified here.
        This method is not supposed to throw errors.

        Arguments:
        - user (User, optional): document owner.
        - parent (Folder, optional): folder containing the document.
        - lang (str, optional): OCR language.
        - notes (str, optional): document notes.
        - name (str, optional): document name.
        - skip_ocr (bool, optional):
            whether to skip OCR processing. Defaults to False.
        - apply_async (bool, optional):
            whether to apply OCR asynchronously.
            Defaults to False.
        - create_document (bool, optional): whether to
            create or update a document. Defaults to True.

        Returns:
            Document: the created or updated document, or None when there
            is nothing to do (create_document is False and no existing
            document is attached to the pipeline).
        """
        if parent is None:
            user, lang, inbox = self.get_user_properties(user)
            # in case of upload via WEB interface, documents
            # must land in root directory (as opposite to inbox)
            if self.processor != WEB:
                parent = inbox.id
        if name is None:
            name = basename(self.path)
        page_count = self.page_count()
        size = getsize(self.path)

        doc = None
        if create_document and self.doc is None:
            try:
                doc = Document.objects.create_document(user=user,
                                                       title=name,
                                                       size=size,
                                                       lang=lang,
                                                       file_name=name,
                                                       parent_id=parent,
                                                       page_count=page_count,
                                                       notes=notes)
                self.doc = doc
            except ValidationError as error:
                logger.error(f"{self.processor} importer: validation failed")
                raise error
        elif self.doc is not None:
            # update the existing document in place and bump its version
            doc = self.doc
            doc.version = doc.version + 1
            doc.page_count = page_count
            doc.file_name = name
            doc.size = size
            doc.save()
            try:
                doc.recreate_pages()
            except ValueError:
                doc.create_pages()
            doc.full_clean()

        # BUG FIX: previously, create_document=False with self.doc=None
        # fell through both branches and raised NameError on ``doc``
        # below; bail out explicitly instead.
        if doc is None:
            logger.error(
                f"{self.processor} importer: no document to import into")
            return None

        self.move_tempfile(doc)
        self.payload.close()
        if not skip_ocr:

            namespace = default_storage.upload(doc_path_url=doc.path().url())

            if apply_async:
                for page_num in range(1, page_count + 1):
                    ocr_page.apply_async(
                        kwargs={
                            'user_id': user.id,
                            'document_id': doc.id,
                            'file_name': name,
                            'page_num': page_num,
                            'lang': lang,
                            'namespace': namespace
                        })
            else:
                self.ocr_document(
                    document=doc,
                    page_count=page_count,
                    lang=lang,
                )

        logger.debug(f"{self.processor} importer: import complete.")
        return doc
Example #7
0
    def apply(self,
              user=None,
              parent=None,
              lang=None,
              notes=None,
              name=None,
              skip_ocr=False,
              apply_async=False,
              delete_after_import=False,
              create_document=True,
              *args,
              **kwargs):
        """
        Apply the pipeline: create or update the document for this import.

        Arguments:
        - user (User, optional): document owner.
        - parent (Folder, optional): id of the folder to import into.
        - lang (str, optional): OCR language.
        - notes (str, optional): document notes.
        - name (str, optional): document name; defaults to the temp
          file's basename.
        - skip_ocr (bool, optional): skip OCR. Defaults to False.
        - apply_async (bool, optional): schedule OCR via celery tasks.
        - delete_after_import (bool, optional): remove the temp file
          after import. Defaults to False.
        - create_document (bool, optional): create a new document versus
          updating ``self.doc``. Defaults to True.

        Returns:
            dict: ``{'doc': Document}`` on success; None when the mimetype
            is invalid or there is no document to import into.
        """
        if not self.check_mimetype():
            logger.debug(f"{self.processor} importer: invalid filetype")
            return None
        if self.processor != WEB:
            user, lang, inbox = self.get_user_properties(user)
            parent = inbox.id
        if name is None:
            name = basename(self.tempfile.name)
        page_count = self.page_count()
        size = getsize(self.temppath)

        doc = None
        if create_document:
            try:
                doc = Document.objects.create_document(user=user,
                                                       title=name,
                                                       size=size,
                                                       lang=lang,
                                                       file_name=name,
                                                       parent_id=parent,
                                                       page_count=page_count,
                                                       notes=notes)
                self.doc = doc
            except ValidationError as e:
                logger.error("{} importer: validation failed".format(
                    self.processor))
                raise e
        elif self.doc is not None:
            # update the existing document in place and bump its version
            doc = self.doc
            doc.version = doc.version + 1
            doc.page_count = page_count
            doc.file_name = name
            doc.save()
            try:
                doc.recreate_pages()
            except ValueError:
                doc.create_pages()
            except Exception:
                # best effort: keep going even if pages could not be built
                logger.error(
                    f"{self.processor} importer: could not create pages")

        # BUG FIX: previously, create_document=False with self.doc=None
        # fell through both branches and raised NameError on ``doc``
        # below; bail out explicitly instead.
        if doc is None:
            logger.error(
                f"{self.processor} importer: no document to import into")
            return None

        self.move_tempfile(doc)
        self.tempfile.close()
        if not skip_ocr:
            if apply_async:
                for page_num in range(1, page_count + 1):
                    ocr_page.apply_async(
                        kwargs={
                            'user_id': user.id,
                            'document_id': doc.id,
                            'file_name': name,
                            'page_num': page_num,
                            'lang': lang
                        })
            else:
                self.ocr_document(
                    document=doc,
                    page_count=page_count,
                    lang=lang,
                )

        if delete_after_import:
            os.remove(self.temppath)

        logger.debug("{} importer: import complete.".format(self.processor))
        return {'doc': doc}
def restore_documents(restore_file: io.BytesIO, username, skip_ocr=False):
    """
    Restore documents from a backup tar archive for the given user.

    Arguments:
    - restore_file: in-memory tar archive containing a top-level
      ``backup.json`` with document metadata.
    - username: owner of the restored documents (looked up by name).
    - skip_ocr: when True, no OCR tasks are scheduled for restored pages.

    Archive members whose document already exists (same title and parent)
    are skipped with an error log entry.
    """
    restore_file.seek(0)
    user = User.objects.filter(username=username).first()

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:

        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        for restore_file in restore_archive.getnames():
            if restore_file == "backup.json":
                continue
            # NOTE(review): if no entry matches, document_info silently
            # ends up as the *last* entry — confirm every archive member
            # has a matching record in backup.json.
            for info in backup_info['documents']:
                document_info = info
                if info['path'] == restore_file:
                    break

            splitted_path = PurePath(restore_file).parts
            parent = None
            # we first have to create a folder structure

            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:

                    # NOTE(review): lookup is not scoped to ``user`` —
                    # same-titled folders of another user may match;
                    # other restore variants filter by user. Confirm.
                    folder_object = Folder.objects.filter(title=folder).filter(
                        parent=parent).first()

                    if folder_object is None:
                        new_folder = Folder.objects.create(title=folder,
                                                           parent=parent,
                                                           user=user)
                        parent = new_folder
                    else:
                        parent = folder_object

            document_object = Document.objects.filter(
                title=splitted_path[-1]).filter(parent=parent).first()

            if document_object is not None:
                logger.error("Document %s already exists, skipping",
                             restore_file)
            else:

                with NamedTemporaryFile("w+b") as temp_output:

                    temp_output.write(
                        restore_archive.extractfile(restore_file).read())
                    temp_output.seek(0)
                    size = os.path.getsize(temp_output.name)
                    page_count = get_pagecount(temp_output.name)
                    if parent:
                        parent_id = parent.id
                    else:
                        parent_id = None
                    new_doc = Document.create_document(
                        user=user,
                        title=splitted_path[-1],
                        size=size,
                        lang=document_info['lang'],
                        file_name=splitted_path[-1],
                        parent_id=parent_id,
                        notes="",
                        page_count=page_count)
                    default_storage.copy_doc(src=temp_output.name,
                                             dst=new_doc.path.url())

                # BUG FIX: OCR scheduling used to sit outside this else
                # branch, so a skipped (already existing) document queued
                # OCR against the *previous* iteration's new_doc — or
                # raised NameError on the first member. Also hoisted the
                # skip_ocr check out of the per-page loop.
                if not skip_ocr:
                    for page_num in range(1, page_count + 1):
                        ocr_page.apply_async(
                            kwargs={
                                'user_id': user.id,
                                'document_id': new_doc.id,
                                'file_name': splitted_path[-1],
                                'page_num': page_num,
                                'lang': document_info['lang']
                            })