def page_eps(self):
    """
    Return a 1-indexed list of per-page endpoints for this document.

    Lets callers quickly grab a page endpoint::

        page_ep = doc.page_eps[2]
        page_ep.url()  # local url to second page of the doc.

    Shortcut for the most common case, Step(1).
    """
    # slot 0 is a placeholder so page numbers map directly to indices
    endpoints = [None]

    # self.page_count may be stale (per-page logic was added only
    # recently), so re-read the real count and correct the model.
    actual_count = get_pagecount(self.doc_ep.url())
    if actual_count != self.page_count:
        self.page_count = actual_count
        self.save()

    endpoints.extend(
        endpoint.PageEp(
            document_ep=self.doc_ep,
            page_num=num,
            step=step.Step(1),
            page_count=self.page_count,
        )
        for num in range(1, actual_count + 1)
    )

    return endpoints
Beispiel #2
0
def delete_pages(doc_ep, page_numbers):
    """
    Delete the pages in ``page_numbers`` from the document behind
    ``doc_ep`` by re-assembling the remaining pages with pdftk.

    Bumps the document version and returns it.
    """
    source_url = doc_ep.url()
    total_pages = get_pagecount(source_url)

    # ranges of pages to keep, expressed in pdftk "cat" syntax
    keep_ranges = cat_ranges_for_delete(
        total_pages,
        page_numbers
    )

    # bump version before computing the output path
    # NOTE(review): url() presumably reflects the new version — confirm
    doc_ep.inc_version()

    cmd = ["pdftk", source_url, "cat"]
    cmd.extend(str(rng) for rng in keep_ranges)
    cmd.append("output")
    make_sure_path_exists(doc_ep.url())
    cmd.append(doc_ep.url())

    run(cmd)

    return doc_ep.version
Beispiel #3
0
    def import_file(self,
                    file_title=None,
                    inbox_title="Inbox",
                    delete_after_import=True,
                    skip_ocr=False):
        """
        Import the local file at ``self.filepath`` as a new document.

        Creates a Document under the user's inbox folder, copies the
        file into document storage and (unless ``skip_ocr`` is set)
        runs OCR over it.

        Returns the created document, or ``False`` when the file's
        page count cannot be determined.

        Used with
            ./manage.py local_importer
            ./manage.py imap_importer
        command
        """
        logger.debug(f"Importing file {self.filepath}")

        # Reject files whose page count cannot be read (e.g. not a PDF).
        try:
            page_count = get_pagecount(self.filepath)
        except Exception:
            logger.error(f"Error while getting page count of {self.filepath}.")
            return False

        if file_title is None:
            file_title = os.path.basename(self.filepath)

        inbox, _ = Folder.objects.get_or_create(
            title=inbox_title,
            parent=None,
            user=self.user
        )
        doc = Document.create_document(
            user=self.user,
            title=file_title,
            size=os.path.getsize(self.filepath),
            lang=self.user_ocr_language,
            file_name=file_title,
            parent_id=inbox.id,
            page_count=page_count
        )
        logger.debug(f"Uploading file {self.filepath} to {doc.path.url()}")
        default_storage.copy_doc(
            src=self.filepath,
            dst=doc.path.url(),
        )

        if not skip_ocr:
            DocumentImporter.ocr_document(
                document=doc,
                page_count=page_count,
                lang=self.user_ocr_language,
            )

        if delete_after_import:
            # Deleting applies when importing from a local directory.
            # When importing from an email attachment, deleting the
            # source does not apply.
            os.remove(self.filepath)

        logger.debug("Import complete.")

        return doc
Beispiel #4
0
 def recreate_pages(self):
     """
     Recreate page models

     Drops every existing Page row for this document, re-reads the
     page count from the actual document file (the stored value may
     be stale), persists it, and rebuilds the Page models.
     """
     # remove stale page models before rebuilding
     self.page_set.all().delete()
     # refresh page_count from the file itself, not the DB value
     self.page_count = get_pagecount(self.doc_ep.url())
     self.save()
     self.create_pages()
Beispiel #5
0
def hocr(request, id, step=None, page="1"):
    """
    Serve HOCR word data for one page of a document as JSON.

    Requires READ permission on the document. Returns 404 when the
    document, the page, or its HOCR data is missing; 403 when the
    user lacks permission.
    """
    logger.debug(f"hocr for doc_id={id}, step={step}, page={page}")

    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    doc_ep = doc.doc_ep

    if request.user.has_perm(Access.PERM_READ, doc):
        if not doc_ep.exists():
            download(doc_ep)

        # BUGFIX: ``page`` arrives from the URL as a string (default
        # "1"); comparing it with the int page_count raised TypeError.
        try:
            page = int(page)
        except ValueError:
            raise Http404("Page does not exists")

        page_count = get_pagecount(doc_ep.url())
        # BUGFIX: pages are 1-based, so 0 is out of range as well
        # (the old check was ``page < 0``).
        if page > page_count or page < 1:
            raise Http404("Page does not exists")

        page_ep = doc.page_eps[page]

        logger.debug(f"Extract words from {page_ep.hocr_url()}")

        if not page_ep.hocr_exists():
            # check if HOCR data exists on S3
            if settings.S3 and page_ep.hocr_exists(ep=Endpoint.S3):
                # ok, it should be able to download it.
                download_hocr(page_ep)
            else:
                # normal scenario, HOCR is not yet ready
                raise Http404("HOCR data not yet ready.")

        # At this point local HOCR data should be available.
        hocr = Hocr(
            hocr_file_path=page_ep.hocr_url()
        )

        return HttpResponse(
            json.dumps({
                'hocr': hocr.good_json_words(),
                'hocr_meta': hocr.get_meta()
            }),
            content_type="application/json",
        )

    return HttpResponseForbidden()
Beispiel #6
0
def hocr(request, id, step=None, page="1"):
    """
    Serve HOCR word data for one page of a document as JSON.

    Local-storage variant: resolves absolute paths via
    default_storage. Requires READ permission on the document;
    404 when the document, page, or HOCR data is missing,
    403 when the user lacks permission.
    """
    logger.debug(f"hocr for doc_id={id}, step={step}, page={page}")

    try:
        doc = Document.objects.get(id=id)
    except Document.DoesNotExist:
        raise Http404("Document does not exists")

    doc_path = doc.path

    if request.user.has_perm(Access.PERM_READ, doc):
        # document absolute path
        doc_abs_path = default_storage.abspath(doc_path.url())
        if not os.path.exists(
            doc_abs_path
        ):
            raise Http404("HOCR data not yet ready.")

        # BUGFIX: ``page`` arrives from the URL as a string (default
        # "1"); comparing it with the int page_count raised TypeError.
        try:
            page = int(page)
        except ValueError:
            raise Http404("Page does not exists")

        page_count = get_pagecount(doc_abs_path)
        # BUGFIX: pages are 1-based, so 0 is out of range as well
        # (the old check was ``page < 0``).
        if page > page_count or page < 1:
            raise Http404("Page does not exists")

        page_path = doc.page_paths[page]
        hocr_abs_path = default_storage.abspath(page_path.hocr_url())

        logger.debug(f"Extract words from {hocr_abs_path}")

        if not os.path.exists(hocr_abs_path):
            raise Http404("HOCR data not yet ready.")

        # At this point local HOCR data should be available.
        hocr = Hocr(
            hocr_file_path=hocr_abs_path
        )

        return HttpResponse(
            json.dumps({
                'hocr': hocr.good_json_words(),
                'hocr_meta': hocr.get_meta()
            }),
            content_type="application/json",
        )

    return HttpResponseForbidden()
Beispiel #7
0
def reorder_pages(doc_ep, new_order):
    """
    Reorder the pages of the document behind ``doc_ep`` using pdftk.

    ``new_order`` is a list of dicts of the following format::

        [
            {'page_num': 2, 'page_order': 1},
            {'page_num': 1, 'page_order': 2},
            {'page_num': 3, 'page_order': 3},
            {'page_num': 4, 'page_order': 4},
        ]

    where ``page_num`` is a page's old position and ``page_order``
    its new one; each entry reads "<page_num> now should be
    <page_order>". The example above swaps the first two pages of a
    4-page document.

    Bumps the document version and returns it.
    """
    source_url = doc_ep.url()
    total_pages = get_pagecount(source_url)

    reordered_ranges = cat_ranges_for_reorder(
        page_count=total_pages,
        new_order=new_order
    )

    # bump version before computing the output path
    # NOTE(review): url() presumably reflects the new version — confirm
    doc_ep.inc_version()

    cmd = ["pdftk", source_url, "cat"]
    cmd.extend(str(rng) for rng in reordered_ranges)
    cmd.append("output")
    make_sure_path_exists(doc_ep.url())
    cmd.append(doc_ep.url())
    run(cmd)

    return doc_ep.version
Beispiel #8
0
def ocr_page_pdf(doc_ep, page_num, lang):
    """
    Extract image, text and HOCR data for page ``page_num`` of the
    document behind ``doc_ep``.

    Returns the page endpoint for the processed page, or ``None``
    when ``page_num`` is beyond the document's page count.
    """
    # BUGFIX: page_url was referenced at the return even when
    # page_num was out of range, raising UnboundLocalError.
    page_url = None
    page_count = get_pagecount(doc_ep.url())
    logger.debug(f"page_count={page_count}")
    if page_num <= page_count:
        page_url = PageEp(document_ep=doc_ep,
                          page_num=page_num,
                          step=Step(1),
                          page_count=page_count)
        extract_img(page_url)
        extract_txt(page_url, lang=lang)

        for step in Steps():
            page_url.step = step
            extract_img(page_url)
            # tesseract unterhalt-1.jpg page-1 -l deu hocr
            if not step.is_thumbnail:
                extract_hocr(page_url, lang=lang)

    return page_url
Beispiel #9
0
def ocr_page_pdf(doc_path, page_num, lang):
    """
    Extract image, text and HOCR data for page ``page_num`` of the
    document behind ``doc_path``.

    doc_path is an mglib.path.DocumentPath instance

    Returns the PagePath for the processed page, or ``None`` when
    ``page_num`` is beyond the document's page count.
    """
    # BUGFIX: page_url was referenced at the return even when
    # page_num was out of range, raising UnboundLocalError.
    page_url = None
    page_count = get_pagecount(default_storage.abspath(doc_path.url()))
    if page_num <= page_count:
        page_url = PagePath(document_path=doc_path,
                            page_num=page_num,
                            step=Step(1),
                            page_count=page_count)
        extract_img(page_url, media_root=settings.MEDIA_ROOT)
        extract_txt(page_url, lang=lang, media_root=settings.MEDIA_ROOT)

        for step in Steps():
            page_url.step = step
            extract_img(page_url, media_root=settings.MEDIA_ROOT)
            # tesseract unterhalt-1.jpg page-1 -l deu hocr
            if not step.is_thumbnail:
                extract_hocr(page_url,
                             lang=lang,
                             media_root=settings.MEDIA_ROOT)

    return page_url
Beispiel #10
0
    def post(self, request):
        """
        Handle a single-file AJAX upload.

        Creates a document from the uploaded file, copies it into
        storage, schedules per-page OCR tasks asynchronously, and
        returns a JSON payload describing the new document.
        """
        files = request.FILES.getlist('file')
        if not files:
            logger.warning("POST request.FILES is empty. Forgot adding file?")
            return HttpResponseBadRequest("Missing input file")

        if len(files) > 1:
            logger.warning("More then one files per ajax? how come?")
            # BUGFIX: HttpResponse takes ``status``; passing
            # ``status_code`` raised TypeError instead of returning 400.
            return HttpResponse(json.dumps({}),
                                content_type="application/json",
                                status=400)

        f = files[0]

        logger.debug("upload for f=%s user=%s", f, request.user)

        user = request.user
        size = os.path.getsize(f.temporary_file_path())
        # "-1" means "no parent folder" on the client side
        parent_id = request.POST.get('parent', "-1")
        if parent_id and "-1" in parent_id:
            parent_id = None

        lang = request.POST.get('language')
        notes = request.POST.get('notes')
        page_count = get_pagecount(f.temporary_file_path())
        logger.info("creating document {}".format(f.name))

        doc = Document.create_document(user=user,
                                       title=f.name,
                                       size=size,
                                       lang=lang,
                                       file_name=f.name,
                                       parent_id=parent_id,
                                       notes=notes,
                                       page_count=page_count)
        logger.debug("uploading to {}".format(doc.path.url()))

        default_storage.copy_doc(src=f.temporary_file_path(),
                                 dst=doc.path.url())

        # schedule OCR for every page as a background task
        for page_num in range(1, page_count + 1):
            ocr_page.apply_async(
                kwargs={
                    'user_id': user.id,
                    'document_id': doc.id,
                    'file_name': f.name,
                    'page_num': page_num,
                    'lang': lang
                })

        # upload only one file at time.
        # after each upload return a json object with
        # following fields:
        #
        # - title
        # - preview_url
        # - doc_id
        # - action_url  -> needed for renaming/deleting selected item
        #
        # with that info a new thumbnail will be created.

        action_url = reverse('boss:core_basetreenode_change', args=(doc.id, ))

        preview_url = reverse('core:preview', args=(doc.id, 200, 1))

        result = {
            'title': doc.title,
            'doc_id': doc.id,
            'action_url': action_url,
            'preview_url': preview_url
        }
        logger.info("and response is!")
        return HttpResponse(json.dumps(result),
                            content_type="application/json")
Beispiel #11
0
    def import_file(filepath,
                    username=None,
                    file_title=None,
                    inbox_title="Inbox",
                    delete_after_import=False,
                    start_ocr_async=True,
                    upload=True):
        """
        Gets as input a path to a file on a local file system and:
            1. creates a document instance (if there is a available space).
            2. Copies file to doc_instance.url()
            3. (optionally) uploads the document to S3 storage.
            4. (optionally) starts ocr_async task.

        Returns True on success, False when the user is over quota or
        the file's page count cannot be read yet.

        Is used on customers instance by:
            * import_file command - to import files from SFTP directory
            * import_attachment command - to import attachments from mailbox
        """
        logger.debug(f"Importing file {filepath}")

        if username is None:
            user = get_root_user()
        else:
            user = User.objects.get(username=username)

        if file_title is None:
            file_title = get_file_title(filepath)

        if not is_storage_left(filepath, user=user):
            # BUGFIX: the f-string had no placeholder braces, so the
            # literal text "user.username" was logged instead of the
            # actual username.
            logger.error(f"{user.username} reached his disk quota")
            return False

        lang = Document.get_default_language()
        # get_pagecount() might raise an exception in case
        # file is either wrong (not a PDF) or not yet
        # completed to upload
        try:
            page_count = get_pagecount(filepath)
        except Exception:
            # which means that document is not yet fully
            # uploaded by SFTP client.
            logger.error(f"File {filepath} not yet ready for importing.")
            return False

        inbox, _ = Folder.objects.get_or_create(title=inbox_title,
                                                parent=None,
                                                user=user)
        doc = Document.create_document(user=user,
                                       title=file_title,
                                       size=get_file_size(filepath),
                                       lang=lang,
                                       file_name=file_title,
                                       parent_id=inbox.id,
                                       page_count=page_count)
        logger.debug(f"Uploading file {filepath} to {doc.doc_ep.url()}")
        # Import file is executed as root (import-file.service)
        # (because import-file need to access/delete sftp files, folder
        # as of another system user)
        # Thus, after copying file into (newly created) folders,
        # it need to change permissions (of newly created files and folders)
        # to the app_user/app_group.
        copy2doc_url(src_file_path=filepath,
                     doc_url=doc.doc_ep.url(),
                     user=settings.APP_USER,
                     group=settings.APP_GROUP)

        if upload and settings.S3:
            upload_document_to_s3(doc.doc_ep)

        if start_ocr_async and settings.OCR:
            Document.ocr_async(document=doc,
                               page_count=page_count,
                               lang=lang,
                               s3_enabled=settings.S3)

        if delete_after_import:
            os.remove(filepath)

        return True
Beispiel #12
0
    def post(self, request):
        """
        Handle a single-file AJAX upload.

        Checks storage quota, creates a document from the uploaded
        file, copies it into storage (optionally S3), schedules OCR,
        and returns a JSON payload describing the new document.
        """
        files = request.FILES.getlist('file')
        if not files:
            logger.warning("POST request.FILES is empty. Forgot adding file?")
            # BUGFIX: previously only warned and fell through, crashing
            # with IndexError on files[0] below.
            return HttpResponse(
                json.dumps({'error': "Missing input file"}),
                status=400,
                content_type="application/json")

        if len(files) > 1:
            logger.warning("More then one files per ajax? how come?")
            # BUGFIX: HttpResponse takes ``status``; passing
            # ``status_code`` raised TypeError instead of returning 400.
            return HttpResponse(json.dumps({}),
                                content_type="application/json",
                                status=400)

        f = files[0]

        logger.debug("upload for f=%s user=%s", f, request.user)

        if not is_storage_left(f.temporary_file_path()):
            logger.warning("Storage is full for user=%s.", request.user)
            msg = "Cannot upload file {}. Storage is full.".format(f.name)

            return HttpResponse(json.dumps({'error': msg}),
                                status=400,
                                content_type="application/json")

        user = request.user
        size = os.path.getsize(f.temporary_file_path())
        # "-1" means "no parent folder" on the client side
        parent_id = request.POST.get('parent', "-1")
        if parent_id and "-1" in parent_id:
            parent_id = None

        lang = request.POST.get('language')
        notes = request.POST.get('notes')
        page_count = get_pagecount(f.temporary_file_path())
        logger.info("creating document {}".format(f.name))

        doc = Document.create_document(user=user,
                                       title=f.name,
                                       size=size,
                                       lang=lang,
                                       file_name=f.name,
                                       parent_id=parent_id,
                                       notes=notes,
                                       page_count=page_count)
        logger.debug("uploading to {}".format(doc.doc_ep.url()))

        copy2doc_url(src_file_path=f.temporary_file_path(),
                     doc_url=doc.doc_ep.url())

        if settings.S3:
            upload_document_to_s3(doc.doc_ep)

        if settings.OCR:
            Document.ocr_async(document=doc, page_count=page_count, lang=lang)

        # upload only one file at time.
        # after each upload return a json object with
        # following fields:
        #
        # - title
        # - preview_url
        # - doc_id
        # - action_url  -> needed for renaming/deleting selected item
        #
        # with that info a new thumbnail will be created.

        action_url = reverse('boss:core_basetreenode_change', args=(doc.id, ))

        preview_url = reverse('core:preview', args=(doc.id, 200, 1))

        result = {
            'title': doc.title,
            'doc_id': doc.id,
            'action_url': action_url,
            'preview_url': preview_url
        }
        logger.info("and response is!")
        return HttpResponse(json.dumps(result),
                            content_type="application/json")
Beispiel #13
0
def paste_pages_into_existing_doc(
    dest_doc_ep,
    src_doc_ep_list,
    after_page_number=False,
    before_page_number=False
):
    """
    Insert pages from the source documents into the destination
    document at the position given by ``after_page_number`` /
    ``before_page_number``, using pdftk handle syntax.

    Bumps the destination document version and returns it.
    """
    total = get_pagecount(dest_doc_ep.url())
    before_range, after_range = split_ranges(
        total=total,
        after=after_page_number,
        before=before_page_number
    )
    # Letter A is reserved for the destination document (its kept
    # pages come from before_range/after_range); sources get B, C, ...
    letters = "BCDEFGHIJKLMNOPQRSTUVWXYZ"

    # pdftk handle assignments, e.g. "A=/path/dest.pdf"
    handle_map = [f"A={dest_doc_ep.url()}"]
    # pages to insert, e.g. "B1", "B3", "C2"
    inserted_pages = []

    for idx, src in enumerate(src_doc_ep_list):
        letter = letters[idx]
        handle_map.append(f"{letter}={src['doc_ep'].url()}")
        inserted_pages.extend(f"{letter}{p}" for p in src['page_nums'])

    # bump version before computing the output path
    # NOTE(review): url() presumably reflects the new version — confirm
    dest_doc_ep.inc_version()

    pages_before = [f"A{p}" for p in before_range]
    pages_after = [f"A{p}" for p in after_range]

    cmd = ["pdftk"]
    # add A=doc1_path, B=doc2_path
    cmd.extend(handle_map)
    cmd.append("cat")
    # existing doc pages (may be empty)
    cmd.extend(pages_before)
    # newly inserted pages
    cmd.extend(inserted_pages)
    # existing doc pages (may be empty)
    cmd.extend(pages_after)
    cmd.append("output")

    make_sure_path_exists(dest_doc_ep.url())
    cmd.append(dest_doc_ep.url())

    run(cmd)

    return dest_doc_ep.version
Beispiel #14
0
def restore_documents(restore_file: io.BytesIO, username, skip_ocr=False):
    """
    Restore documents from a tar backup archive.

    Reads ``backup.json`` from the archive for per-document metadata,
    recreates the folder hierarchy encoded in each member's path,
    creates a Document for every member (skipping ones that already
    exist at the same location), copies the file into storage and,
    unless ``skip_ocr`` is set, schedules per-page OCR tasks.
    """

    restore_file.seek(0)
    user = User.objects.filter(username=username).first()

    with tarfile.open(fileobj=restore_file, mode="r") as restore_archive:

        backup_json = restore_archive.extractfile('backup.json')
        backup_info = json.load(backup_json)

        # NOTE(review): the loop variable shadows the ``restore_file``
        # parameter; from here on the name holds an archive member path.
        for restore_file in restore_archive.getnames():
            if restore_file == "backup.json":
                continue
            # find the metadata entry matching this archive member
            # NOTE(review): if no entry matches, document_info keeps the
            # last inspected entry — verify backups always contain one.
            for info in backup_info['documents']:
                document_info = info
                if info['path'] == restore_file:
                    break

            splitted_path = PurePath(restore_file).parts
            parent = None
            # we first have to create a folder structure

            if len(splitted_path) > 1:
                for folder in splitted_path[:-1]:

                    # reuse an existing folder at this level if present
                    folder_object = Folder.objects.filter(title=folder).filter(
                        parent=parent).first()

                    if folder_object is None:
                        new_folder = Folder.objects.create(title=folder,
                                                           parent=parent,
                                                           user=user)
                        parent = new_folder
                    else:
                        parent = folder_object

            # does a document with this title already exist here?
            document_object = Document.objects.filter(
                title=splitted_path[-1]).filter(parent=parent).first()

            if document_object is not None:
                logger.error("Document %s already exists, skipping",
                             restore_file)
            else:

                # extract the member to a temp file so its size and
                # page count can be read from disk
                with NamedTemporaryFile("w+b") as temp_output:

                    temp_output.write(
                        restore_archive.extractfile(restore_file).read())
                    temp_output.seek(0)
                    size = os.path.getsize(temp_output.name)
                    page_count = get_pagecount(temp_output.name)
                    if parent:
                        parent_id = parent.id
                    else:
                        parent_id = None
                    new_doc = Document.create_document(
                        user=user,
                        title=splitted_path[-1],
                        size=size,
                        lang=document_info['lang'],
                        file_name=splitted_path[-1],
                        parent_id=parent_id,
                        notes="",
                        page_count=page_count)
                    default_storage.copy_doc(src=temp_output.name,
                                             dst=new_doc.path.url())

                # schedule OCR for every page as a background task
                for page_num in range(1, page_count + 1):
                    if not skip_ocr:
                        ocr_page.apply_async(
                            kwargs={
                                'user_id': user.id,
                                'document_id': new_doc.id,
                                'file_name': splitted_path[-1],
                                'page_num': page_num,
                                'lang': document_info['lang']
                            })