Python StructuredPdfDocument Examples

Programming Language: Python

Namespace/Package Name: pdfstructure.model.document

Examples at hotexamples.com: 6

Python StructuredPdfDocument - 6 examples found. These are the top rated real world Python examples of pdfstructure.model.document.StructuredPdfDocument extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

from_json(4)

StructuredPdfDocument(1)

update_metadata(1)

Example #1

Show file

File: parser.py Project: trungthanhnguyen0502/pdfstructure

def enrich_metadata(pdf: StructuredPdfDocument, source: Source):
    """
    Add some metadata to the parsed PDF if possible.

    Currently only the filename is captured: the last path component of
    ``source.uri`` is stored under the "filename" metadata key.
    This is best-effort: any failure is logged at debug level and is
    never raised to the caller.
        # todo create document summary
        # todo extract document-title from best titles
    @param pdf: document that receives the metadata via ``update_metadata``
    @param source: source of the PDF; its ``uri`` is used to derive the filename
    @return: None
    """
    # Local import: the file's import header is not visible from this block.
    import logging

    # try to capture filename
    try:
        filename = Path(source.uri).name
        pdf.update_metadata("filename", filename)
    except Exception:
        # Deliberately broad (metadata is optional), but leave a trace
        # instead of swallowing the error silently.
        logging.getLogger(__name__).debug(
            "could not capture filename metadata from source", exc_info=True)

Example #2

Show file

File: parser.py Project: trungthanhnguyen0502/pdfstructure

    def parse_pdf(self, source: Source) -> StructuredPdfDocument:
        """
        Analyse and parse a PDF document from a given @Source, capturing its natural hierarchy.
        @param source: source the PDF is read from; it is read twice
        @return: the structured document wrapped together with the style distribution
        """
        # Pass 1: read the PDF once and analyse its style distribution.
        style_distribution = count_sizes(source.read())
        annotator = StyleAnnotator(
            sizemapper=PivotLogMapper(style_distribution),
            style_info=style_distribution)

        # Pass 2: read the PDF again, with layout parameters derived from
        # the distribution, and annotate each paragraph with its mapped style.
        layout_params = LAParams(
            boxes_flow=None,
            detect_vertical=False,
            line_margin=style_distribution.line_margin)
        styled_elements = annotator.process(
            source.read(override_la_params=layout_params))

        # Build the nested document structure from the styled elements.
        hierarchy = self.create_hierarchy(styled_elements, style_distribution)

        # Wrap the result in a document and capture some metadata.
        document = StructuredPdfDocument(
            elements=hierarchy, style_info=style_distribution)
        enrich_metadata(document, source)
        return document

Example #3

Show file

    def test_full_content(self):
        """
        The text of a document decoded from the parsed cheatsheet JSON must
        contain the expected excerpt with subsections merged by newlines.
        """
        json_path = Path("resources/parsed/interview_cheatsheet.json").absolute()
        with open(json_path, "r") as fp:
            # json.load returns the already-parsed data, not a JSON string.
            parsed_json = json.load(fp)

        document = StructuredPdfDocument.from_json(parsed_json)

        expected_newline_merged_subsections_excerpt = "Greedy Algorithm\nDefinition:\nAn algorithm that, while"

        # assertIn reports both operands on failure, unlike assertTrue(a in b).
        self.assertIn(expected_newline_merged_subsections_excerpt, document.text)

Example #4

Show file

    def test_print_json_string(self):
        """
        Printing the test document to a JSON string and decoding it again
        must preserve the heading texts that are checked below.
        """
        original = self.testDocument
        serialized = JsonStringPrinter().print(original)
        restored = StructuredPdfDocument.from_json(json.loads(serialized))

        # Second and last top-level headings survive the round trip.
        self.assertEqual(original.elements[1].heading.text,
                         restored.elements[1].heading.text)
        self.assertEqual(original.elements[-1].heading.text,
                         restored.elements[-1].heading.text)

        # Nested headings are checked against their known text.
        first_child = restored.elements[5].children[0]
        self.assertEqual("Array", first_child.heading.text)
        self.assertEqual("Time Complexity:",
                         first_child.children[2].heading.text)

Example #5

Show file

    def test_print_json_file(self):
        """
        Printing the test document to a JSON file and decoding that file
        must preserve the heading texts that are checked below.
        """
        target = Path("resources/parsed/interview_cheatsheet.json")
        JsonFilePrinter().print(self.testDocument,
                                file_path=str(target.absolute()))

        # Read back what the printer has just written.
        with open(target, "r") as handle:
            restored = StructuredPdfDocument.from_json(json.load(handle))

        original = self.testDocument

        # Second and last top-level headings survive the round trip.
        self.assertEqual(original.elements[1].heading.text,
                         restored.elements[1].heading.text)
        self.assertEqual(original.elements[-1].heading.text,
                         restored.elements[-1].heading.text)

        # Nested headings are checked against their known text.
        first_child = restored.elements[5].children[0]
        self.assertEqual("Array", first_child.heading.text)
        self.assertEqual("Time Complexity:",
                         first_child.children[2].heading.text)

Example #6

Show file

# from pdfstructure.hierarchy.parser import HierarchyParser
# from pdfstructure.source import FileSource
# import pathlib

# path = "./Nurse.pdf"
# parser = HierarchyParser()

# source = FileSource(path)

# document = parser.parse_pdf(source)

# from pdfstructure.printer import JsonFilePrinter
# printer = JsonFilePrinter()
# file_path = pathlib.Path("test-pdf.json")

# printer.print(document, file_path=str(file_path.absolute()))

import json

from pdfstructure.model.document import StructuredPdfDocument

# Path of the JSON file to decode.
file = './test-pdf.json'

# json.load needs an open file object. The original passed the bare
# expression `test-pdf.json`, which raises NameError before anything is read.
with open(file, "r") as fp:
    # Name kept from the original script; the value is the parsed JSON data,
    # not a string.
    jsonString = json.load(fp)

# Rebuild the structured document from the decoded JSON data.
document = StructuredPdfDocument.from_json(jsonString)