コード例 #1
0
# The progress bar is sized by the number of entries in `pages`.
total_num_pages = len(pages)

# Widgets rendered left to right: the bar itself, then a "value/max_value"
# counter.
bar_widgets = [progressbar.Bar(), progressbar.Counter(format="%(value)i/%(max_value)i")]

# NOTE(review): redirect_stdout=True presumably routes print() output through
# the bar so it does not corrupt the display -- confirm against the installed
# progressbar version.
bar = progressbar.ProgressBar(
    max_value=total_num_pages,
    widgets=bar_widgets,
    redirect_stdout=True,
)
bar.start()
with open(input_filepath, "rb") as fp:
    rsrcmgr = PDFResourceManager(caching=True)

    device = TextBoxStripper(rsrcmgr, outfp)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for (page_num, page) in PDFPage.get_pages2(fp,
                                               pages,
                                               password="",
                                               caching=True,
                                               check_extractable=True,
                                               fallback=False):
        try:
            #print("===== Page {}".format(page_num))
            # Text box processing:
            device.text_boxes = []
            device.tables = []
            interpreter.process_page(page)
            device.drop_empty_textboxes()
            device.merge_textboxes()
            # Table processing
            device.build_tables()
            ## For now, we don't care about the title of the page.
            ## With table contents we have all the information
            #for text_box in device.text_boxes: