コード例 #1
0
# Extract a title and a content string from every page after the first of
# '1a_Foundations.pdf', then write the titles to 'title111.txt', one per line.
#
# parse_obj, parse_obj_title and parse_obj_content are helpers defined
# elsewhere; they receive the layout objects of one analysed page.

title = []
content = []

# Open the PDF in a context manager so the handle is released even when
# parsing fails part-way through.
with open('1a_Foundations.pdf', 'rb') as document:
    # Create resource manager
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Each iteration handles one single page object.
    for page in PDFPage.get_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        # The first page is skipped on purpose.
        if layout.pageid <= 1:
            continue
        if not parse_obj(layout._objs):
            continue
        # Drop non-ASCII characters but keep the result a ``str``: the
        # output file below is opened in text mode, and writing ``bytes``
        # to it raises TypeError.
        text_title = parse_obj_title(layout._objs).encode(
            'ascii', 'ignore').decode('ascii')
        # Remove the final character of the title before storing it.
        text_title = text_title[:-1]
        title.append(text_title)
        text_content = parse_obj_content(layout._objs)
        content.append(text_content)

with open('title111.txt', 'w', encoding='utf-8') as f:
    for i in title:
        f.write(i)
        f.write('\n')
コード例 #2
0
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
import io

# Extract the text of every page of 'test.pdf' into an in-memory buffer and
# print it.
resource_manager = PDFResourceManager()

# The buffer and the converter are released through ``with`` / ``finally`` so
# that neither is left open if a page fails to parse.
with io.StringIO() as fake_file_handle:
    converter = TextConverter(resource_manager,
                              fake_file_handle,
                              laparams=LAParams())
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        with open('test.pdf', 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)

        # Read the buffer before it is closed; afterwards getvalue() fails.
        text = fake_file_handle.getvalue()
    finally:
        converter.close()

print(text)
コード例 #3
0
ファイル: ocr.py プロジェクト: mslee93/PDF2XML
    def run(self):
        """Convert the PDF at ``self.output_file_path`` into XML text.

        The parent class's ``run()`` is called first. The result is stored
        in ``self.output_xml_content`` and, in both branches, the parent's
        ``deleteOutputFile()`` is called at the end.

        * ``self.xml_type == 'letter'``: every page is fed through an
          ``XMLConverter`` and the converter's raw output is used.
        * any other value: plain text is extracted page by page and wrapped
          in a ``Result/Pages/Page_<n>`` element tree, which is then
          sanitised and pretty-printed.
        """
        super(XML, self).run()
        if self.xml_type == 'letter':
            resource_manager = PDFResourceManager()
            fake_file_handle = io.BytesIO()
            converter = XMLConverter(resource_manager, fake_file_handle)
            page_interpreter = PDFPageInterpreter(resource_manager, converter)

            with open(self.output_file_path, 'rb') as fh:

                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)

                # The buffer is read before converter.close() is called, and
                # the closing tag is appended by hand.
                # NOTE(review): presumably the converter only emits
                # '</pages>' on close() - confirm against the library.
                text = fake_file_handle.getvalue().decode('utf-8')
                text += '\n</pages>'
                self.output_xml_content = text

            # close open handles
            converter.close()
            fake_file_handle.close()
            super(XML, self).deleteOutputFile()
        else:

            def __extract_text_by_page(pdf_path):
                """Yield the extracted text of each page of *pdf_path*.

                A fresh resource manager, buffer and converter are created
                for every page, so each yielded string holds one page only.
                """
                with open(pdf_path, 'rb') as fh:
                    for page in PDFPage.get_pages(fh,
                                                  caching=True,
                                                  check_extractable=True):
                        resource_manager = PDFResourceManager()
                        fake_file_handle = io.StringIO()
                        converter = TextConverter(resource_manager,
                                                  fake_file_handle,
                                                  codec='utf-8')
                        page_interpreter = PDFPageInterpreter(
                            resource_manager, converter)
                        page_interpreter.process_page(page)

                        text = fake_file_handle.getvalue()
                        # text = text.encode('utf-8')
                        yield text

                        # close open handles
                        # (this runs only when the consumer asks for the
                        # next page, i.e. when the generator is resumed)
                        converter.close()
                        fake_file_handle.close()

            def __replace_nontext(text, replacement=u'\uFFFD'):
                """Return *text* with disallowed characters replaced.

                Characters kept: tab, LF, CR, U+0020-U+D7FF, U+E000-U+FFFD
                and, when ``sys.maxunicode`` exceeds 0x10000, the range from
                U+10000 up to ``min(sys.maxunicode, 0x10FFFF)``. Every other
                character is replaced by *replacement*.
                NOTE(review): these ranges look like the XML 1.0 valid
                character set - confirm.
                """
                _char_tail = ''
                if sys.maxunicode > 0x10000:
                    _char_tail = u'%s-%s' % (
                        chr(0x10000), chr(min(sys.maxunicode, 0x10FFFF)))
                _nontext_sub = re.compile(
                    r'[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD%s]' % _char_tail,
                    re.U).sub

                return _nontext_sub(replacement, text)

            # Build <Result><Pages><Page_1>...</Page_1>...</Pages></Result>.
            root = xml.Element('{filename}'.format(filename='Result'))
            pages = xml.Element('Pages')
            root.append(pages)
            counter = 1

            for page in __extract_text_by_page(self.output_file_path):
                text = xml.SubElement(pages, 'Page_{}'.format(counter))
                text.text = page[:]
                counter += 1

            #root.append(pages)
            # NOTE(review): `tree` is not used again in this method.
            tree = xml.ElementTree(root)
            #xml_string = xml.tostring(tree, 'utf-8', method='xml')
            xml_string = xml.tostring(root, 'utf-8', method='xml')
            # Sanitise the serialised document before handing it to minidom.
            xml_string = __replace_nontext(
                xml_string.decode('utf-8'),
                replacement=u'\uFFFD').encode('utf-8')
            #xml_string += '\n</pages>'
            parsed_string = minidom.parseString(xml_string)
            pretty_string = parsed_string.toprettyxml(indent='  ')

            #pretty_string += '\n</pages>'
            # with open(output_file_path, 'w', encoding="utf-8") as f:
            #     f.write(pretty_string)

            self.output_xml_content = pretty_string
            super(XML, self).deleteOutputFile()
コード例 #4
0
# Interpreter bound to the shared resource manager and `device`; after a page
# is processed through it, the page's result is read with device.get_result().
interpreter = PDFPageInterpreter(resource_manager, device)

# Text file for output (currently disabled).
# output_txt = open('output.txt', 'w')


def print_and_write(txt):
    """Emit *txt* on standard output.

    Mirroring the text into an output file is switched off at the moment,
    so printing is the only effect of this function.
    """
    print(txt)
    # Disabled file output:
    # output_txt.write(txt)
    # output_txt.write('\n')


# Print every text box of the PDF named on the command line, page by page.
with open(sys.argv[1], 'rb') as f:
    # Pass the file object to PDFPage.get_pages() to obtain the PDFPage
    # objects one after another.
    # For slow files, the keyword argument `pagenos` can be given a list of
    # (0-based) page numbers to process.
    for page in PDFPage.get_pages(f):
        print_and_write('\n====== ページ区切り ======\n')
        interpreter.process_page(page)  # Process the page.
        layout = device.get_result()  # Fetch the LTPage object.

        # Collect the list of text boxes on the page.
        boxes = find_textboxes_recursively(layout)

        # Sort the text boxes by the coordinates of their top-left corner.
        # y1 (the Y coordinate) grows upwards, so its sign is flipped to get
        # a top-to-bottom order.
        boxes.sort(key=lambda b: (-b.y1, b.x0))

        for box in boxes:
            print_and_write('-' * 10)  # Separator line for readability.
            print_and_write(box.get_text().strip())  # Text inside the box.
            # Bounding box of the text box: lower-left, then upper-right.
            print_and_write(f"{box.x0}, {box.y0} - {box.x1}, {box.y1}")
コード例 #5
0
def scanDoublePages(parent, galleryprices, dailyprices):
    """Split the PDFs under *parent* into pages and classify each page.

    The files listed under *parent* are split into the temporary folder
    ``temp``; the text of every resulting page is extracted and searched for
    an order reference.

    * Pages without a reference (nothing found between ``#`` and ``Order``)
      are recorded in ``numberlist`` as ``"<page index>;<file number>"`` and
      their file number is added to ``originalfile``.
    * Pages with a reference have their order number written to
      ``duplicatestest.txt`` and one ``name^file_name^order`` line written to
      ``daily.txt``, ``gallery.txt`` or ``sweet.txt``: daily if any entry of
      *dailyprices* occurs in the product text, otherwise gallery if any
      entry of *galleryprices* occurs, otherwise sweet.

    Finally the temporary folder is cleaned and recreated, and
    ``splitterCustom`` is called with the collected page information.

    Args:
        parent: Location handed to ``list_files_walk`` to find the inputs.
        galleryprices: Iterable of strings looked up in the product text.
        dailyprices: Iterable of strings looked up in the product text;
            checked before *galleryprices*.

    Returns:
        None.
    """
    number = 0
    numberlist = []
    originalfile = []
    folder = "temp"

    # The report files are opened through ``with`` so they are flushed and
    # closed even if a PDF fails to parse half-way through the run.
    with open("daily.txt", "w+") as daily, \
            open("gallery.txt", "w+") as gal, \
            open("sweet.txt", "w+") as sweet, \
            open("duplicatestest.txt", "w+") as duplicatestest:
        cleanFolder(folder)
        # listing the files inside the folder
        parentnew = list_files_walk(parent)
        # creating a temporary folder
        os.mkdir(folder)
        # splitting the temporary files
        splitter(parentnew, folder)
        # getting the temporary files, sorted by name
        parentnew2 = list_files_walk(folder)
        parentnew2.sort()

        for file_name in parentnew2:
            resource_manager = PDFResourceManager()
            handle = io.StringIO()
            converter = TextConverter(resource_manager, handle)
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            # The page counter restarts with the first page of each original.
            if "page_001.pdf" in file_name:
                number = 0
            try:
                with open(file_name, 'rb') as fh:
                    for page in PDFPage.get_pages(fh,
                                                  caching=True,
                                                  check_extractable=True):
                        page_interpreter.process_page(page)
                        # Replace the last character with an end marker so
                        # the product list can be cut out with find_between.
                        text = handle.getvalue()[:-1] + "¬¬¬"
                        # searching the reference number
                        search = find_between(text, "#", "Order")
                        # searching the order number
                        search2 = find_between(text, "# ", "Order Date")
                        # searching the design name
                        name = find_between(text, "SKUPrice1", "$")
                        # products
                        products = find_between(text, "SKUPrice1", "¬¬¬")
                        originalfilenumber = find_between(
                            file_name, "_file_", "_page")
                        print(originalfilenumber)

                        if search == "":
                            numberlist.append(
                                str(number) + ";" + originalfilenumber)
                            originalfile.append(originalfilenumber)
                        else:
                            record = (name + "^" + file_name + "^"
                                      + search2 + "\n")
                            duplicatestest.write(search2 + "\n")
                            if any(p in products for p in dailyprices):
                                print(search2 + " Daily Shirt")
                                daily.write(record)
                            elif any(p in products for p in galleryprices):
                                print(search2 + " Gallery Shirt")
                                gal.write(record)
                            else:
                                sweet.write(record)
                                print(search2 + " Sweet Deal")
                        number += 1
            finally:
                # Release the per-file converter and buffer on every path.
                converter.close()
                handle.close()

    cleanFolder(folder)
    print(originalfile)
    print("Files with double pages: ")
    print(numberlist)
    os.mkdir(folder)
    splitterCustom(parentnew, folder, numberlist, originalfile)