def test_output_is_correct(self):
        """Check the texts read from figure.pdf with and without ``all_texts``.

        The file is loaded twice: once with default parameters and once with
        ``la_params={"all_texts": True}``. The second load is expected to
        yield one additional element, the text located in the image.
        """
        pdf_path = os.path.join(
            os.path.dirname(__file__),
            "../../docs/source/example_files/figure.pdf",
        )

        # Default parameters: only the text outside the image is expected.
        default_document = load_file(pdf_path)
        default_texts = [
            element.text() for element in default_document.elements
        ]
        self.assertListEqual(
            default_texts,
            ["Here is some text outside of an image"],
        )

        # With all_texts: the text in the image is expected first.
        all_texts_document = load_file(pdf_path, la_params={"all_texts": True})
        all_texts = [
            element.text() for element in all_texts_document.elements
        ]
        self.assertListEqual(
            all_texts,
            [
                "This is some text in an image",
                "Here is some text outside of an image"
            ],
        )
Esempio n. 2
0
def main():
    """Command-line entry point: parse one PDF and write the result as CSV.

    Reads a single positional ``file_path`` argument, loads it with
    ``load_file``, runs ``TomorrowParser`` over the loaded document and hands
    the resulting statement to ``write_csv``. The CSV path is the input path
    with its extension replaced by ``.csv``.

    If ``load_file`` returns ``None`` an error message is printed and the
    process exits with status ``-1``.
    """
    import os
    import sys

    parser = ArgumentParser()
    parser.add_argument("file_path")
    args = parser.parse_args()

    # Announce the file before loading it, so the message precedes any output
    # or error produced while loading.
    print("Parsing PDF: " + args.file_path)
    document = load_file(args.file_path)

    if document is None:
        # The argument is registered as "file_path"; use that exact attribute.
        print("Document not found at path [" + args.file_path + "]")
        sys.exit(-1)

    statement = TomorrowParser(document).run()

    # Swap only the trailing extension. str.replace would also rewrite ".pdf"
    # inside directory names, and would leave the path unchanged (so the CSV
    # would be written to the input path) for suffixes other than ".pdf".
    csv_path = os.path.splitext(args.file_path)[0] + ".csv"
    write_csv(statement, csv_path)
Esempio n. 3
0
    def test_visualise(self):
        """Visualise tables.pdf and run the image checks on two pages.

        The document is loaded with a font mapping, shown in a
        ``PDFVisualiser`` attached to ``self.root``, and checked with
        ``self.check_images`` before and after pressing "Next page".
        """
        pdf_path = os.path.join(
            os.path.dirname(__file__), "../docs/source/example_files/tables.pdf"
        )
        font_names = {
            "BAAAAA+LiberationSerif-Bold,12.0": "header",
            "CAAAAA+LiberationSerif,12.0": "table_element",
        }
        document = load_file(pdf_path, font_mapping=font_names)

        viewer = PDFVisualiser(
            self.root,
            document,
            show_info=True,
            width=1920,
            height=1080,
        )
        self.check_images(viewer, "tables1")

        # Move on by invoking the toolbar button, then check again.
        next_page_button = viewer.toolbar._buttons["Next page"]
        next_page_button.invoke()
        self.check_images(viewer, "tables2")
Esempio n. 4
0
 def test_load_file(self):
     """Loading data/pdfs/test.pdf is expected to give a ``PDFDocument``."""
     tests_dir = os.path.dirname(__file__)
     pdf_path = os.path.join(tests_dir, "data", "pdfs", "test.pdf")
     self.assertIsInstance(load_file(pdf_path), PDFDocument)
Esempio n. 5
0
 def test_load_file_with_text_in_image(self):
     """Load data/pdfs/image.pdf with ``all_texts`` and expect two elements."""
     tests_dir = os.path.dirname(__file__)
     pdf_path = os.path.join(tests_dir, "data", "pdfs", "image.pdf")
     layout_params = {"all_texts": True}

     document = load_file(pdf_path, la_params=layout_params)

     self.assertIsInstance(document, PDFDocument)
     element_count = len(document.elements)
     self.assertEqual(element_count, 2)
Esempio n. 6
0
    def test_output_is_correct(self):
        """Run the "simple_memo" extraction steps and check the resulting dict.

        The steps follow the documentation example "simple_memo": load the
        PDF, find the four label elements, read the element to the right of
        each label, and join the text of every element after the subject
        label into the content.
        """
        # Step 1 - Load the document
        memo_path = os.path.join(
            os.path.dirname(__file__),
            "../../docs/source/example_files/simple_memo.pdf",
        )
        document = load_file(memo_path)

        # We could visualise it here to check it looks correct:
        # from py_pdf_parser.visualise import visualise
        # visualise(document)

        def find_label(label):
            matches = document.elements.filter_by_text_equal(label)
            return matches.extract_single_element()

        def element_right_of(label_element):
            neighbours = document.elements.to_the_right_of(label_element)
            return neighbours.extract_single_element()

        # Step 2 - Extract reference elements:
        to_label = find_label("TO:")
        from_label = find_label("FROM:")
        date_label = find_label("DATE:")
        subject_label = find_label("SUBJECT:")

        # Step 3 - Extract the data
        extracted = {
            "to": element_right_of(to_label).text(),
            "from": element_right_of(from_label).text(),
            "date": element_right_of(date_label).text(),
            "subject": element_right_of(subject_label).text(),
        }
        body_lines = [
            element.text()
            for element in document.elements.after(subject_label)
        ]
        extracted["content"] = "\n".join(body_lines)

        expected = {
            "to": "All Developers",
            "from": "John Smith",
            "date": "1st January 2020",
            "subject": "A new PDF Parsing tool",
            "content": (
                "A new PDF Parsing tool\n"
                "There is a new PDF parsing tool available, called py-pdf-parser - "
                "you should all check it out!\n"
                "I think it could really help you extract that data we need from "
                "those PDFs."
            ),
        }
        self.assertDictEqual(extracted, expected)
Esempio n. 7
0
    def test_output_is_correct(self):
        """Extract every example table from tables.pdf and check its contents.

        Each table sits between two consecutive header elements (the last one
        runs to the end of the document). The tables are extracted with
        ``tables.extract_simple_table`` or ``tables.extract_table``. Where the
        test expects a ``TableExtractionError`` without extra options, that is
        asserted before the call that is expected to succeed.
        """
        pdf_path = os.path.join(
            os.path.dirname(__file__),
            "../../docs/source/example_files/tables.pdf",
        )

        # Step 1 - Load the file
        font_names = {
            "BAAAAA+LiberationSerif-Bold,12.0": "header",
            "CAAAAA+LiberationSerif,12.0": "table_element",
        }
        document = load_file(pdf_path, font_mapping=font_names)

        header_elements = document.elements.filter_by_font("header")

        def find_header(title):
            matches = header_elements.filter_by_text_equal(title)
            return matches.extract_single_element()

        def elements_between(start_header, end_header):
            return document.elements.between(start_header, end_header)

        # Extract reference elements
        simple_header = find_header("Simple Table")
        gaps_header = find_header("Simple Table with gaps")
        first_row_col_gaps_header = find_header(
            "Simple Table with gaps in first row/col"
        )
        non_simple_header = find_header("Non Simple Table")
        merged_cols_header = find_header("Non Simple Table with Merged Columns")
        merged_rows_cols_header = find_header(
            "Non Simple Table with Merged Rows and Columns"
        )
        over_the_page_header = find_header("Over the page")

        # Extract table elements
        simple_elements = elements_between(simple_header, gaps_header)
        gaps_elements = elements_between(gaps_header, first_row_col_gaps_header)
        first_row_col_gaps_elements = elements_between(
            first_row_col_gaps_header, non_simple_header
        )
        non_simple_elements = elements_between(
            non_simple_header, merged_cols_header
        )
        merged_cols_elements = elements_between(
            merged_cols_header, merged_rows_cols_header
        )
        merged_rows_cols_elements = elements_between(
            merged_rows_cols_header, over_the_page_header
        )
        over_the_page_elements = document.elements.after(over_the_page_header)

        heading_row = ["Heading 1", "Heading 2", "Heading 3", "Heading 4"]

        # Simple Table
        extracted = tables.extract_simple_table(simple_elements, as_text=True)
        self.assertListEqual(
            extracted,
            [
                heading_row,
                ["A", "1", "A", "1"],
                ["B", "2", "B", "2"],
                ["C", "3", "C", "3"],
            ],
        )

        # Simple Table with gaps
        with self.assertRaises(TableExtractionError):
            tables.extract_simple_table(gaps_elements, as_text=True)

        extracted = tables.extract_simple_table(
            gaps_elements, as_text=True, allow_gaps=True
        )
        self.assertListEqual(
            extracted,
            [
                heading_row,
                ["A", "1", "", "1"],
                ["B", "", "", ""],
                ["C", "", "C", "3"],
            ],
        )

        # Simple Table with gaps in first row/col
        with self.assertRaises(TableExtractionError):
            tables.extract_simple_table(
                first_row_col_gaps_elements, as_text=True, allow_gaps=True
            )

        # Passing an explicit reference element is what lets this one succeed.
        anchor = first_row_col_gaps_elements[9]
        extracted = tables.extract_simple_table(
            first_row_col_gaps_elements,
            as_text=True,
            allow_gaps=True,
            reference_element=anchor,
        )
        self.assertListEqual(
            extracted,
            [
                ["Heading 1", "Heading 2", "", "Heading 4"],
                ["", "1", "A", ""],
                ["B", "2", "", "2"],
                ["C", "3", "C", "3"],
            ],
        )

        # Non Simple Table
        extracted = tables.extract_table(non_simple_elements, as_text=True)
        self.assertListEqual(
            extracted,
            [
                ["", "Heading 2", "Heading 3", "Heading 4"],
                ["A", "1", "", "1"],
                ["B", "", "B", "2"],
                ["C", "3", "C", ""],
            ],
        )

        # Non Simple Table with Merged Columns
        with self.assertRaises(TableExtractionError):
            tables.extract_table(merged_cols_elements, as_text=True)

        extracted = tables.extract_table(
            merged_cols_elements,
            as_text=True,
            fix_element_in_multiple_cols=True,
        )
        self.assertListEqual(
            extracted,
            [
                heading_row,
                ["A", "1", "A", "1"],
                ["This text spans across multiple columns", "", "B", "2"],
                ["C", "3", "C", "3"],
            ],
        )

        # Non Simple Table with Merged Rows and Columns
        extracted = tables.extract_table(
            merged_rows_cols_elements,
            as_text=True,
            fix_element_in_multiple_rows=True,
            fix_element_in_multiple_cols=True,
        )
        merged_cell_text = (
            "This text spans across multiple rows and \nmultiple columns."
        )
        self.assertListEqual(
            extracted,
            [
                heading_row,
                [merged_cell_text, "", "A", "1"],
                ["", "", "B", "2"],
                ["C", "3", "C", "3"],
            ],
        )

        # Over the page
        extracted = tables.extract_simple_table(
            over_the_page_elements, as_text=True
        )
        self.assertListEqual(
            extracted,
            [
                heading_row,
                ["A", "1", "A", "1"],
                ["B", "2", "B", "2"],
                ["C", "3", "C", "3"],
            ],
        )
Esempio n. 8
0
    def test_output_is_correct(self):
        """Run the "order_summary" example steps and check the three results.

        The steps follow the documentation example "order_summary": load the
        PDF with a font mapping, create the "order_summary" and "totals"
        sections, extract a simple table from each, and convert the order
        summary table into a list of dicts with
        ``tables.add_header_to_table``.
        """
        # Step 1 - Load the document
        pdf_path = os.path.join(
            os.path.dirname(__file__),
            "../../docs/source/example_files/order_summary.pdf",
        )
        font_names = {
            "BAAAAA+LiberationSerif-Bold,16.0": "title",
            "BAAAAA+LiberationSerif-Bold,12.0": "sub_title",
            "CAAAAA+LiberationSerif,12.0": "text",
            "DAAAAA+FreeMonoBold,12.0": "table_header",
            "EAAAAA+FreeMono,12.0": "table_text",
        }
        document = load_file(pdf_path, font_mapping=font_names)

        # visualise(document)

        def find_sub_title(text):
            sub_titles = document.elements.filter_by_font("sub_title")
            return sub_titles.filter_by_text_equal(text).extract_single_element()

        def table_elements_of(section):
            return section.elements.filter_by_fonts("table_header", "table_text")

        # Step 3 - Add sections
        order_summary_start = find_sub_title("Order Summary:")
        totals_start = find_sub_title("Totals:")
        last_element = document.elements[-1]

        # The "Totals:" sub title is excluded here; it starts the next section.
        order_summary_section = document.sectioning.create_section(
            name="order_summary",
            start_element=order_summary_start,
            end_element=totals_start,
            include_last_element=False,
        )
        totals_section = document.sectioning.create_section(
            name="totals",
            start_element=totals_start,
            end_element=last_element,
        )

        # visualise(document)

        # Step 4 - Extract tables
        order_summary_table = tables.extract_simple_table(
            table_elements_of(order_summary_section),
            as_text=True,
        )
        totals_table = tables.extract_simple_table(
            table_elements_of(totals_section),
            as_text=True,
        )
        order_summary_with_header = tables.add_header_to_table(
            order_summary_table
        )

        expected_header = ["Item", "Unit Cost", "Quantity", "Cost"]
        expected_rows = [
            ["Challenger 100g\nWhole Hops", "£3.29", "1", "£3.29"],
            [
                "Maris Otter \nPale Ale Malt \n(Crushed)",
                "£1.50/1000g",
                "4000g",
                "£6.00",
            ],
            ["WLP037 \nYorkshire Ale \nYeast", "£7.08", "1", "£7.08"],
            ["Bottle Caps", "£1 per 100", "500", "£5"],
        ]

        self.assertListEqual(
            order_summary_table,
            [expected_header] + expected_rows,
        )

        self.assertListEqual(
            totals_table,
            [
                ["Subtotal:", "£26.28"],
                ["Shipping", "£6"],
                ["VAT 20%", "£6.45"],
                ["Total:", "£38.73"],
            ],
        )

        # One dict per data row, keyed by the header row's column names.
        self.assertListEqual(
            order_summary_with_header,
            [dict(zip(expected_header, row)) for row in expected_rows],
        )
Esempio n. 9
0
    def test_output_is_correct(self):
        """Check the element order produced by each ``element_ordering`` option.

        grid.pdf is loaded with the default ordering, the three
        ``ElementOrdering`` presets used below and one custom sort function.
        columns.pdf is loaded with the default ordering and with a custom
        function that splits the elements at x = 300. In every case the list
        of element texts is compared against the expected reading order.
        """

        def texts_of(document):
            # Element texts in the order the document yields its elements.
            return [element.text() for element in document.elements]

        examples_dir = os.path.dirname(__file__)
        grid_path = os.path.join(
            examples_dir, "../../docs/source/example_files/grid.pdf"
        )

        # Default - left to right, top to bottom
        document = load_file(grid_path)
        self.assertListEqual(
            texts_of(document),
            ["Top Left", "Top Right", "Bottom Left", "Bottom Right"],
        )

        # Preset - right to left, top to bottom
        document = load_file(
            grid_path,
            element_ordering=ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM,
        )
        self.assertListEqual(
            texts_of(document),
            ["Top Right", "Top Left", "Bottom Right", "Bottom Left"],
        )

        # Preset - top to bottom, left to right
        # Left column first, each column read from the top. This must differ
        # from the custom bottom-to-top ordering checked further down.
        document = load_file(
            grid_path,
            element_ordering=ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT,
        )
        self.assertListEqual(
            texts_of(document),
            ["Top Left", "Bottom Left", "Top Right", "Bottom Right"],
        )

        # Preset - top to bottom, right to left
        document = load_file(
            grid_path,
            element_ordering=ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT,
        )
        self.assertListEqual(
            texts_of(document),
            ["Top Right", "Bottom Right", "Top Left", "Bottom Left"],
        )

        # Custom - bottom to top, left to right
        def ordering_function(elements):
            # Ascending x0, then ascending y0 within the same x0.
            return sorted(elements, key=lambda elem: (elem.x0, elem.y0))

        document = load_file(grid_path, element_ordering=ordering_function)
        self.assertListEqual(
            texts_of(document),
            ["Bottom Left", "Top Left", "Bottom Right", "Top Right"],
        )

        # Custom - This PDF has columns!
        columns_path = os.path.join(
            examples_dir, "../../docs/source/example_files/columns.pdf"
        )

        # Default - left to right, top to bottom
        document = load_file(columns_path)
        self.assertListEqual(
            texts_of(document),
            [
                "Column 1 Title",
                "Column 2 Title",
                "Here is some column 1 text.",
                "Here is some column 2 text.",
                "Col 1 left",
                "Col 1 right",
                "Col 2 left",
                "Col 2 right",
            ],
        )

        # Visualise, and we can see that the middle is at around x = 300.
        # visualise(document)

        def column_ordering_function(elements):
            # Elements with x0 <= 300 sort first, then descending y0, then
            # ascending x0.
            return sorted(
                elements,
                key=lambda elem: (elem.x0 > 300, -elem.y0, elem.x0),
            )

        document = load_file(
            columns_path, element_ordering=column_ordering_function
        )
        self.assertListEqual(
            texts_of(document),
            [
                "Column 1 Title",
                "Here is some column 1 text.",
                "Col 1 left",
                "Col 1 right",
                "Column 2 Title",
                "Here is some column 2 text.",
                "Col 2 left",
                "Col 2 right",
            ],
        )