def enrich_metadata(pdf: StructuredPdfDocument, source: Source):
    """
    Add best-effort metadata to a parsed PDF document.

    Currently only the file name is captured: the last path component of
    ``source.uri`` is stored under the ``"filename"`` metadata key.

    Enrichment is optional, so any failure (for example a source whose
    ``uri`` cannot be turned into a path) is logged at debug level and
    otherwise ignored; it never propagates to the caller.

    Open points:
        - todo create document summary
        - todo extract document-title from best titles

    @param pdf: document to annotate; modified in place via ``update_metadata``
    @param source: origin of the document; only its ``uri`` attribute is read
    @return: None
    """
    # Function-scope import keeps this block self-contained.
    import logging

    # try to capture filename
    try:
        filename = Path(source.uri).name
        pdf.update_metadata("filename", filename)
    except Exception:
        # Deliberately broad: metadata is a nice-to-have and must not abort
        # parsing. The failure is recorded instead of being dropped silently.
        logging.getLogger(__name__).debug(
            "could not capture filename metadata", exc_info=True)
    def parse_pdf(self, source: Source) -> StructuredPdfDocument:
        """
        Analyse and parse a PDF document from a given @Source, producing a
        document whose elements are arranged in a nested hierarchy.

        The source is read twice: a first pass collects the style
        distribution, a second pass annotates the elements with a mapped
        style and feeds them into the hierarchy builder.

        @param source: PDF source, consumed through ``source.read``
        @return: the wrapped document, after metadata enrichment
        """
        # Pass 1: walk the PDF once to gather its style distribution.
        style_stats = count_sizes(source.read())
        annotator = StyleAnnotator(sizemapper=PivotLogMapper(style_stats),
                                   style_info=style_stats)

        # Pass 2: read again, this time with layout parameters whose line
        # margin comes from the distribution measured in pass 1, and tag
        # every element with its mapped style.
        layout_params = LAParams(boxes_flow=None,
                                 detect_vertical=False,
                                 line_margin=style_stats.line_margin)
        styled_elements = annotator.process(
            source.read(override_la_params=layout_params))

        # Build the nested document structure from the styled elements.
        hierarchy = self.create_hierarchy(styled_elements, style_stats)

        # Wrap the result and attach whatever metadata can be captured.
        result = StructuredPdfDocument(elements=hierarchy,
                                       style_info=style_stats)
        enrich_metadata(result, source)
        return result
Example #3
0
    def test_full_content(self):
        """
        Load the parsed cheatsheet fixture and verify that the document's
        ``text`` contains the expected excerpt, in which a heading and its
        sub-sections are joined by newlines.
        """
        fixture_path = Path("resources/parsed/interview_cheatsheet.json").absolute()
        with open(fixture_path, "r") as fp:
            # json.load yields the decoded JSON structure expected by from_json
            document = StructuredPdfDocument.from_json(json.load(fp))
        text = document.text

        expected_newline_merged_subsections_excerpt = "Greedy Algorithm\nDefinition:\nAn algorithm that, while"

        # assertIn reports both operands on failure, unlike assertTrue(a in b)
        self.assertIn(expected_newline_merged_subsections_excerpt, text)
Example #4
0
    def test_print_json_string(self):
        """
        Round-trip the test document through JsonStringPrinter and
        StructuredPdfDocument.from_json, then compare selected headings of
        the original and the decoded document.
        """
        serialized = JsonStringPrinter().print(self.testDocument)
        restored = StructuredPdfDocument.from_json(json.loads(serialized))

        # The second and the last top-level element keep their heading text.
        for position in (1, -1):
            self.assertEqual(self.testDocument.elements[position].heading.text,
                             restored.elements[position].heading.text)

        # Spot-check nested headings inside the decoded document.
        nested = restored.elements[5].children[0]
        self.assertEqual("Array", nested.heading.text)
        self.assertEqual("Time Complexity:", nested.children[2].heading.text)
Example #5
0
    def test_print_json_file(self):
        """
        Write the test document to a JSON file with JsonFilePrinter, load it
        back through StructuredPdfDocument.from_json and compare selected
        headings of the original and the decoded document.
        """
        target = Path("resources/parsed/interview_cheatsheet.json")
        JsonFilePrinter().print(self.testDocument,
                                file_path=str(target.absolute()))

        with open(target, "r") as handle:
            restored = StructuredPdfDocument.from_json(json.load(handle))

            # The second and the last top-level element keep their heading text.
            for position in (1, -1):
                self.assertEqual(
                    self.testDocument.elements[position].heading.text,
                    restored.elements[position].heading.text)

            # Spot-check nested headings inside the decoded document.
            nested = restored.elements[5].children[0]
            self.assertEqual("Array", nested.heading.text)
            self.assertEqual("Time Complexity:",
                             nested.children[2].heading.text)
Example #6
0
# from pdfstructure.hierarchy.parser import HierarchyParser
# from pdfstructure.source import FileSource
# import pathlib

# path = "./Nurse.pdf"
# parser = HierarchyParser()

# source = FileSource(path)

# document = parser.parse_pdf(source)

# from pdfstructure.printer import JsonFilePrinter
# printer = JsonFilePrinter()
# file_path = pathlib.Path("test-pdf.json")

# printer.print(document, file_path=str(file_path.absolute()))

import json

from pdfstructure.model.document import StructuredPdfDocument

# Path of the JSON export to load (the commented-out snippet above writes a
# file named "test-pdf.json").
file = './test-pdf.json'

# json.load expects an open file object, so open the path first and let the
# context manager close the handle again.
with open(file, "r") as fp:
    jsonString = json.load(fp)

# Rebuild the document from the decoded JSON data.
document = StructuredPdfDocument.from_json(jsonString)