def reorder_pages(src, dst, new_order):
    """
    Write to ``dst`` a copy of ``src`` whose pages are rearranged.

    ``new_order`` is a list with one dictionary per page, for example::

        [
            {'page_num': 2, 'page_order': 1},
            {'page_num': 1, 'page_order': 2},
            {'page_num': 3, 'page_order': 3},
            {'page_num': 4, 'page_order': 4},
        ]

    In the example above, a four page document had its first and
    second pages swapped:

        page_num   = the position the page had before
        page_order = the position the page should have now

    i.e. every entry reads "page <page_num> now becomes <page_order>".

    The list is not interpreted here; it is handed, together with the
    page count of ``src``, to ``cat_ranges_for_reorder`` and whatever
    that helper returns is passed (stringified, in order) between the
    ``cat`` and ``output`` arguments of ``settings.BINARY_PDFTK``.
    """
    page_count = get_pagecount(src)
    cat_ranges = cat_ranges_for_reorder(
        page_count=page_count,
        new_order=new_order
    )

    # <binary> <src> cat <range> <range> ... output <dst>
    cmd = [
        settings.BINARY_PDFTK,
        src,
        "cat",
        *(str(page) for page in cat_ranges),
        "output",
        dst,
    ]

    run(cmd)
def resize_img(page_path, media_root):
    """
    Resize the image behind ``page_path`` to the width of its step.

    The input file is ``page_path.document_path.url()`` resolved
    against ``media_root``; the output file is
    ``<media_root>/<page_path.ppmroot>-1.jpg``. The directory of the
    output file is created when it does not exist yet.

    Only the width (``page_path.step.width``) is given to the
    ``-resize`` option of ``settings.BINARY_CONVERT`` (as "<width>x").
    """
    local_abspath = os.path.join(media_root, page_path.document_path.url())

    logger.debug(f"Resizing image {page_path.img_url()}")

    ppmroot = os.path.join(media_root, page_path.ppmroot)
    ppmroot_dirname = os.path.dirname(ppmroot)
    width = page_path.step.width

    if os.path.exists(ppmroot_dirname):
        logger.debug(f"PPMROOT {ppmroot_dirname} already exists.")
    else:
        logger.debug(f"PPMROOT {ppmroot_dirname} does not exists. Creating.")
        os.makedirs(ppmroot_dirname, exist_ok=True)

    run((
        settings.BINARY_CONVERT,
        "-resize",
        f"{width}x",
        local_abspath,
        # output file path, derived from ppmroot
        f"{ppmroot}-1.jpg"
    ))
def delete_pages(src, dst, page_numbers):
    """
    Run ``settings.BINARY_STAPLER`` with the ``del`` sub-command.

    The command line is laid out as::

        <binary> del <src> <page> <page> ... <dst>

    where every item of ``page_numbers`` is converted with ``str()``
    and passed in the order given.
    """
    cmd = [
        settings.BINARY_STAPLER,
        "del",
        src,
        *(str(page) for page in page_numbers),
        dst,
    ]

    run(cmd)
def extract_hocr(page_url, lang, media_root):
    """
    Run the OCR binary (``settings.BINARY_OCR``) in "hocr" mode.

    Input is the page image (``page_url.img_url()``) and the output
    base is ``page_url.hocr_url()`` with its file extension stripped;
    both are resolved against ``media_root``. ``lang`` is passed
    as the value of the ``-l`` option.
    """
    page_abspath = os.path.join(media_root, page_url.img_url())
    hocr_abspath = os.path.join(media_root, page_url.hocr_url())
    # only the extension-less part is given to the OCR binary
    hocr_root, _ = os.path.splitext(hocr_abspath)

    run((
        settings.BINARY_OCR,
        "-l",
        lang,
        page_abspath,
        hocr_root,
        "hocr"
    ))

    logger.debug(f"OCR for {page_url.img_url()} - Complete.")
    logger.debug(f"OCR Result {page_url.hocr_url()}.")
def paste_pages_into_existing_doc(
    src,
    dst,
    data_list,
    after_page_number=False,
    before_page_number=False
):
    """
    Insert pages of other documents into the existing document ``src``
    and write the result to ``dst``.

    ``data_list`` is a list of dictionaries; from each one the keys
    ``'src'`` (path of a source document) and ``'page_nums'`` (page
    numbers to take from it) are read.

    The pages of the existing document are divided in two lists by
    ``split_ranges(total=..., after=..., before=...)``. Pages of the
    first list are placed before the pasted pages, pages of the second
    list after them.

    The existing document gets the handle "A"; the documents from
    ``data_list`` get "B", "C", ... "Z" in order, so more than 25
    entries in ``data_list`` raise an IndexError.
    """
    page_count = get_pagecount(src)
    list1, list2 = split_ranges(
        total=page_count,
        after=after_page_number,
        before=before_page_number
    )

    # Notice the missing A: letter A is assigned to the existing
    # document, i.e. to the pages from list1 and list2.
    letters = "BCDEFGHIJKLMNOPQRSTUVWXYZ"

    letters_2_doc_map = [f"A={src}"]
    letters_pages = []

    for idx, item in enumerate(data_list):
        letter = letters[idx]
        src = item['src']
        pages = item['page_nums']
        letters_2_doc_map.append(f"{letter}={src}")
        letters_pages.extend(f"{letter}{p}" for p in pages)

    letters_pages_before = [f"A{p}" for p in list1]
    letters_pages_after = [f"A{p}" for p in list2]

    cmd = [
        settings.BINARY_PDFTK,
        # A=doc1_path, B=doc2_path, ...
        *letters_2_doc_map,
        "cat",
        # existing doc pages (may be empty)
        *letters_pages_before,
        # newly inserted pages
        *letters_pages,
        # existing doc pages (may be empty)
        *letters_pages_after,
        "output",
        dst,
    ]

    run(cmd)
def delete_pages(src, dst, page_numbers):
    """
    Write to ``dst`` the output of a ``cat`` run of
    ``settings.BINARY_PDFTK`` over ``src``.

    Which page ranges are passed to ``cat`` is decided by
    ``cat_ranges_for_delete``, which receives the page count of ``src``
    and ``page_numbers``; presumably these are the ranges that remain
    after the listed pages are dropped (helper is not visible here).
    """
    page_count = get_pagecount(src)
    cat_ranges = cat_ranges_for_delete(page_count, page_numbers)

    # <binary> <src> cat <range> <range> ... output <dst>
    cmd = [
        settings.BINARY_PDFTK,
        src,
        "cat",
        *(str(page) for page in cat_ranges),
        "output",
        dst,
    ]

    run(cmd)
def convert_tiff2pdf(doc_url):
    """
    Run ``settings.BINARY_CONVERT`` with ``doc_url`` as input and the
    destination computed by ``pdfname_from_tiffname`` as output.

    Returns the second value produced by ``pdfname_from_tiffname``
    (the new filename).
    """
    logger.debug(f"convert_tiff2pdf for {doc_url}")

    new_doc_url, new_filename = pdfname_from_tiffname(doc_url)

    logger.debug(f"tiff2pdf source={doc_url} dest={new_doc_url}")

    # <binary> <source> <destination>
    run((settings.BINARY_CONVERT, doc_url, new_doc_url))

    return new_filename
def extract_txt(page_url, lang, media_root):
    """
    Run the OCR binary (``settings.BINARY_OCR``) on a page image.

    Input is ``page_url.img_url()``; the output base is
    ``page_url.txt_url()`` with its file extension stripped. Both are
    resolved against ``media_root``. ``lang`` is passed as the
    value of the ``-l`` option.
    """
    page_abspath = os.path.join(media_root, page_url.img_url())
    txt_abspath = os.path.join(media_root, page_url.txt_url())
    # only the extension-less part is given to the OCR binary
    txt_root, _ = os.path.splitext(txt_abspath)

    run((settings.BINARY_OCR, "-l", lang, page_abspath, txt_root))
def extract_txt(page_url, lang, media_root):
    """
    Run ``tesseract`` on a page image.

    Input is ``page_url.img_url()``; the output base is
    ``page_url.txt_url()`` with its file extension stripped. Both are
    resolved against ``media_root``. ``lang`` is passed as the
    value of the ``-l`` option.
    """
    page_abspath = os.path.join(media_root, page_url.img_url())
    txt_abspath = os.path.join(media_root, page_url.txt_url())
    # only the extension-less part is given to tesseract
    txt_root, _ = os.path.splitext(txt_abspath)

    run(("tesseract", "-l", lang, page_abspath, txt_root))
def extract_hocr(page_url, lang, media_root):
    """
    Run ``tesseract`` in "hocr" mode on a page image.

    Input is ``page_url.img_url()``; the output base is
    ``page_url.hocr_url()`` with its file extension stripped. Both are
    resolved against ``media_root``. ``lang`` is passed as the
    value of the ``-l`` option.
    """
    page_abspath = os.path.join(media_root, page_url.img_url())
    hocr_abspath = os.path.join(media_root, page_url.hocr_url())
    # only the extension-less part is given to tesseract
    hocr_root, _ = os.path.splitext(hocr_abspath)

    run(("tesseract", "-l", lang, page_abspath, hocr_root, "hocr"))
def convert_tiff2pdf(doc_url):
    """
    Run ``settings.BINARY_CONVERT`` to turn ``doc_url`` into a file with
    the same path but a ``.pdf`` extension.

    Returns the filename only (no directory part) of the new file, i.e.
    the basename of ``doc_url`` with its extension replaced by ``.pdf``.
    """
    logger.debug(f"convert_tiff2pdf for {doc_url}")

    # destination: same location, extension swapped for .pdf
    root, _ = os.path.splitext(doc_url)
    new_doc_url = f"{root}.pdf"

    # what is returned: filename without path, extension swapped as well
    base_root, _ = os.path.splitext(os.path.basename(doc_url))

    logger.debug(f"tiff2pdf source={doc_url} dest={new_doc_url}")

    # <binary> <source> <destination>
    run((settings.BINARY_CONVERT, doc_url, new_doc_url))

    return f"{base_root}.pdf"
def extract_img(page_path, media_root):
    """
    Render one page of a document as JPEG with
    ``settings.BINARY_PDFTOPPM``.

    The document is ``page_path.document_path.url()`` resolved against
    ``media_root``; the output root handed to the binary is
    ``<media_root>/<page_path.ppmroot>``, whose directory is created
    when it does not exist yet.

    First (``-f``) and last (``-l``) page are both set to
    ``page_path.page_num``; the horizontal scale is
    ``page_path.step.width`` and the vertical one is "-1".
    """
    local_abspath = os.path.join(media_root, page_path.document_path.url())

    logger.debug(f"Extracing image for {page_path.img_url()}")

    ppmroot = os.path.join(media_root, page_path.ppmroot)
    ppmroot_dirname = os.path.dirname(ppmroot)
    page_num = page_path.page_num
    width = page_path.step.width

    if os.path.exists(ppmroot_dirname):
        logger.debug(f"PPMROOT {ppmroot_dirname} already exists.")
    else:
        logger.debug(f"PPMROOT {ppmroot_dirname} does not exists. Creating.")
        os.makedirs(ppmroot_dirname, exist_ok=True)

    # -f and -l carry the same number: generate only one page
    page_range = ("-f", str(page_num), "-l", str(page_num))
    # -1 for y: height is adjusted according to the image ratio
    scale = ("-scale-to-x", str(width), "-scale-to-y", "-1")

    run((
        settings.BINARY_PDFTOPPM,
        "-jpeg",
        *page_range,
        *scale,
        local_abspath,
        # output root path
        ppmroot
    ))
def paste_pages(
    src,
    dst,
    data_list,
    dst_doc_is_new=True,
    after_page_number=False,
    before_page_number=False
):
    """
    Build the document ``dst`` out of pages taken from other documents.

    ``data_list`` is a list of following format:

        [
            {
                'src': <path of a source document>,
                'page_nums': [page_num_1, page_num_2, page_num_3]
            },
            ...
        ]

    i.e. the documents the pages (with numbers page_num_1...) are
    pasted from.

    dst_doc_is_new = True: the destination is a brand new document,
    made only of the pasted pages in the order given. ``src``,
    ``after_page_number`` and ``before_page_number`` are not used in
    this case. Documents get the handles "A", "B", ... "Z" in order, so
    more than 26 entries in ``data_list`` raise an IndexError.

    dst_doc_is_new = False: pages are pasted into the existing document
    ``src``. All arguments are forwarded to
    ``paste_pages_into_existing_doc`` and its result is returned.
    Where exactly the pages end up is decided there, from
    ``after_page_number`` / ``before_page_number`` (presumably before
    the given page, after the given page, or at the end of the
    document - confirm against that function).
    """
    if not dst_doc_is_new:
        return paste_pages_into_existing_doc(
            src=src,
            dst=dst,
            data_list=data_list,
            after_page_number=after_page_number,
            before_page_number=before_page_number
        )

    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    letters_2_doc_map = []
    letters_pages = []

    for idx, item in enumerate(data_list):
        letter = letters[idx]
        src = item['src']
        pages = item['page_nums']
        letters_2_doc_map.append(f"{letter}={src}")
        letters_pages.extend(f"{letter}{p}" for p in pages)

    cmd = [
        settings.BINARY_PDFTK,
        # A=doc1_path, B=doc2_path, ...
        *letters_2_doc_map,
        "cat",
        *letters_pages,
        "output",
        dst,
    ]

    run(cmd)