Beispiel #1
0
    def init_test(self, filename):
        '''Initialize the benchmark layout and the freshly parsed layout.

        The benchmark layout is restored from ``<layout_dir>/<filename>.json``
        and stored in ``self.sample``. The test layout is produced by
        converting the first page of ``<sample_dir>/<filename>.pdf`` (the docx
        goes to ``<output_dir>/<filename>.docx``) and stored in ``self.test``.

        Args:
            filename: Base name, without extension, shared by the sample pdf,
                the layout json and the generated docx.

        Returns:
            ``self``, so that calls can be chained.
        '''
        # restore sample (benchmark) layout from its json dump
        layout_file = os.path.join(self.layout_dir, f'{filename}.json')
        with open(layout_file, 'r') as f:
            raw_dict = json.load(f)
        self.sample = Layout(raw_dict)

        # parsed layout: first page only
        pdf_file = os.path.join(self.sample_dir, f'{filename}.pdf')
        docx_file = os.path.join(self.output_dir, f'{filename}.docx')
        cv = Converter(pdf_file, docx_file)
        try:
            cv.make_page(cv[0], debug=False)
            # grab the layout before the converter is closed
            self.test = cv.layout # type: Layout
        finally:
            # always release the converter, even when parsing the page fails;
            # otherwise the opened pdf stays open for the rest of the test run
            cv.close()

        return self
Beispiel #2
0
def local_test(filename, make_test_case=False):
    '''Convert a pdf from the `output` folder to docx and check the result.

    Both ``<filename>.pdf`` and the generated ``<filename>.docx`` are located
    in the module-level `output` directory. Only the pages selected by the
    slice ``cv[0:1]`` are processed.

    Args:
        filename: Base name, without extension, of the pdf to convert.
        make_test_case: Forwarded unchanged to `check_result` as its last
            positional argument.
    '''
    pdf_file = os.path.join(output, f'{filename}.pdf')
    docx_file = os.path.join(output, f'{filename}.docx')

    cv = Converter(pdf_file, docx_file)
    try:
        # process page by page; insert page-level debugging output here
        # (before make_page) when a page needs to be inspected
        for page in cv[0:1]:
            # parse layout
            cv.make_page(page)
    finally:
        # close pdf even if a page fails, so the file handle is not leaked
        cv.close()

    # check results
    check_result(pdf_file, docx_file, 'comparison.pdf', make_test_case)
Beispiel #3
0
    # Process the pages selected by the slice cv[0:1]; `cv` is created
    # outside this excerpt (presumably a Converter -- confirm in the caller).
    for page in cv[0:1]:

        # Optional debugging aids, disabled by default: page geometry
        # print(page.rotation, page.rotationMatrix)
        # print(page.transformationMatrix)
        # print(page.rect, page.MediaBox, page.CropBox)

        # Optional debugging aids: dump raw page objects / content stream
        # print(page.xref)
        # print(page.getContents())
        # print(cv.doc_pdf.xrefObject(page.xref))
        # page.cleanContents()
        # c = page.readContents().decode(encoding="ISO-8859-1")
        # with open('c.txt', 'w') as f:
        #     f.write(c)

        # print(cv.doc_pdf.xrefObject(94))

        # Optional debugging aid: export the page as SVG
        # with open('x.svg', 'w') as f:
        #     f.write(page.getSVGimage(text_as_path=False))

        # parse layout
        cv.make_page(page)

        # # extract tables
        # tables = cv.extract_tables(page)
        # for table in tables:
        #     print(table)

    cv.close()  # close pdf once all selected pages have been processed

    # check results (comparison step is currently disabled)
    # check_result(pdf_file, docx_file, 'comparison.pdf')