Beispiel #1
0
def split_into_small_pdfs(pdf_path,
                          output_directory=None,
                          start_page=1,
                          end_page=None,
                          small_pdf_pages=25):
    """Split a PDF into a series of smaller PDFs covering a page range.

    Every chunk is written to
    ``<output_directory>/<stem>_<first>-<last>.pdf`` where ``<stem>`` is the
    source file name without extension and ``<first>``/``<last>`` are the
    zero-padded (4 digit) page numbers of the chunk. A chunk whose
    destination file already exists is not rewritten; a warning is logged
    instead.

    Args:
        pdf_path: Path of the source PDF.
        output_directory: Directory receiving the chunk files. When None,
            the directory is obtained from ``_get_ocr_dir(pdf_path)``.
        start_page: 1-based number of the first page to export.
        end_page: 1-based number of the last page to export (inclusive).
            None means the last page of the document.
        small_pdf_pages: Chunk size handed to ``list_helper.divide_chunks``
            (presumably the maximum number of pages per output file --
            confirm against that helper).
    """
    pdf_name_stem = Path(pdf_path).stem
    if output_directory is None:
        output_directory = _get_ocr_dir(pdf_path)
    # noinspection PyArgumentList
    with Pdf.open(pdf_path) as pdf:
        if end_page is None:
            end_page = len(pdf.pages)
        page_numbers = range(start_page, end_page + 1)
        page_sets = list_helper.divide_chunks(
            list_in=page_numbers, n=small_pdf_pages)
        for page_set in page_sets:
            dest_pdf_path = os.path.join(
                output_directory, "%s_%04d-%04d.pdf" %
                (pdf_name_stem, page_set[0], page_set[-1]))
            if os.path.exists(dest_pdf_path):
                logging.warning("%s exists", dest_pdf_path)
                continue
            # Page numbers are 1-based while pdf.pages is indexed from 0.
            chunk_pages = [pdf.pages[i - 1] for i in page_set]
            # noinspection PyArgumentList
            dest_pdf = Pdf.new()
            dest_pdf.pages.extend(chunk_pages)
            os.makedirs(os.path.dirname(dest_pdf_path), exist_ok=True)
            # Pass the path positionally: the keyword was renamed from
            # ``filename`` to ``filename_or_stream`` across pikepdf releases.
            dest_pdf.save(dest_pdf_path)
Beispiel #2
0
def break_to_small_pdf_paths_original(pdf_path,
                                      output_directory=None,
                                      start_page=1,
                                      end_page=None,
                                      small_pdf_pages=25):
    """Split a PDF into smaller PDFs and return the chunk file paths.

    Every chunk is written to
    ``<output_directory>/<stem>_<first>-<last>.pdf`` where ``<stem>`` is the
    source file name without extension and ``<first>``/``<last>`` are the
    zero-padded (4 digit) page numbers of the chunk. A chunk whose
    destination file already exists is not rewritten (a warning is logged),
    but its path is still included in the result.

    Args:
        pdf_path: Path of the source PDF.
        output_directory: Directory receiving the chunk files. When None,
            ``<stem>_small_originals`` next to the source PDF is used.
        start_page: 1-based number of the first page to export.
        end_page: 1-based number of the last page to export (inclusive).
            None means the last page of the document.
        small_pdf_pages: Chunk size handed to ``list_helper.divide_chunks``
            (presumably the maximum number of pages per output file --
            confirm against that helper).

    Returns:
        List of destination paths, one per chunk, in chunk order.
    """
    pdf_name_stem = Path(pdf_path).stem
    if output_directory is None:
        output_directory = os.path.join(
            os.path.dirname(pdf_path),
            pdf_name_stem + "_small_originals")
    dest_pdfs = []
    # noinspection PyArgumentList
    with Pdf.open(pdf_path) as pdf:
        if end_page is None:
            end_page = len(pdf.pages)
        page_numbers = range(start_page, end_page + 1)
        page_sets = list_helper.divide_chunks(
            list_in=page_numbers, n=small_pdf_pages)
        for page_set in page_sets:
            dest_pdf_path = os.path.join(
                output_directory,
                "%s_%04d-%04d.pdf" %
                (pdf_name_stem, page_set[0], page_set[-1]),
            )
            # Existing chunks are reported too, so record the path first.
            dest_pdfs.append(dest_pdf_path)
            if os.path.exists(dest_pdf_path):
                logging.warning("%s exists", dest_pdf_path)
                continue
            # Page numbers are 1-based while pdf.pages is indexed from 0.
            chunk_pages = [pdf.pages[i - 1] for i in page_set]
            # noinspection PyArgumentList
            dest_pdf = Pdf.new()
            dest_pdf.pages.extend(chunk_pages)
            os.makedirs(os.path.dirname(dest_pdf_path), exist_ok=True)
            # Positional path works regardless of how the pikepdf release
            # in use names the first parameter of ``Pdf.save``.
            dest_pdf.save(dest_pdf_path)
    return dest_pdfs