Ejemplo n.º 1
0
    def parse_pdf_complex(self, pdffile, intermediatedir):
        """Sketch of a heuristic PDF parser that sorts text boxes by type.

        Reads ``pdffile`` through a ``PDFReader`` and, for every page,
        walks the text boxes returned by ``boundingbox(right=510)``.
        Boxes set in a font size above 12 are written to a ``Heading``
        context on the target page; every other box selects a
        ``Paragraph`` context. Debug information about each box, and
        finally the median box width of the document, is printed to
        stdout.

        :param pdffile: passed as the first argument to ``PDFReader.read``
        :param intermediatedir: passed as the second argument to
                                ``PDFReader.read``
        :returns: None -- nothing is returned; the pages built here are
                  not collected anywhere
        """
        pdf = PDFReader()
        pdf.read(pdffile, intermediatedir)
        for cnt, srcpage in enumerate(pdf, start=1):
            # Page is a wonderful and magical class. Read the comments
            # to find out exactly how awesome it is.
            tgtpage = Page(ordinal=cnt)
            # TODO: use magic to find the bounding box of actual page
            # content. 510 is a rough cutoff that might not be
            # appropriate for all page layouts.
            boxes = srcpage.boundingbox(right=510)
            for box in boxes:
                # Look the font up once per box instead of once per use.
                font = box.getfont()
                print(font)
                print("    [%dx%d][%dx%d][%s@%s] %s" %
                      (box.top, box.left, box.bottom, box.right,
                       font['family'], font['size'], str(box)))
                # Heuristic: If something is in large type, it's a heading.
                if int(font['size']) > 12:
                    # NOTE(review): ``ctx`` is not defined anywhere in
                    # this method. It presumably should be the current
                    # context of ``tgtpage`` -- confirm how Page exposes
                    # that and replace the name accordingly.
                    if isinstance(ctx, Heading):
                        if vertical_space(box, boxes.previous()) > 10:
                            # Page.new closes the current context and
                            # creates a new context of the given class
                            tgtpage.new(Heading)

                    # Heading is a DimensionedElement with top,
                    # left, width, height props. Page.set creates a new
                    # context, but only if needed.
                    tgtpage.set(Heading)

                    # calls the current context's append() method. If
                    # it's a DimensionedElement (it should be), its
                    # implementation of append() expands the bounding
                    # box as new stuff is added (provided they have
                    # top/left+width/height attribs)
                    tgtpage.write(box)

                    continue

                # add more heuristics here...

                # Last resort: Everything that is not something else is a
                # Paragraph
                tgtpage.set(Paragraph)
                if horizontal_diff(box, boxes.previous()) > 0:  # maybe something like 4-5
                    tgtpage.new(Paragraph)
                if vertical_space(box, boxes.previous()) > 5:
                    tgtpage.new(Paragraph)
                # NOTE(review): unlike the Heading branch, the box is
                # never written to the Paragraph context here -- confirm
                # whether a ``tgtpage.write(box)`` call is missing.

        print((pdf.median_box_width(threshold=0)))
Ejemplo n.º 2
0
 def parse_document_from_soup(self, soup, doc):
     """Build a sample document part and return it serialized as XHTML.

     A ``Page`` with ordinal 42 is created with a graph from
     ``self.make_graph()``, given an rdf type and a
     ``dcterms:identifier`` through a ``Describer``, and then
     serialized with ``lxml.etree.tostring``. The ``soup`` and ``doc``
     arguments are accepted but not used.
     """
     from ferenda.elements import Page
     from ferenda import Describer
     part_uri = "http://example.org/doc#42"
     graph = self.make_graph()
     section = Page(["This is a part of a document"],
                    ordinal=42, uri=part_uri, meta=graph)
     describer = Describer(section.meta, section.uri)
     bibo = self.ns['bibo']
     describer.rdftype(bibo.DocumentPart)
     # The dcterms:identifier of a document part is usually the
     # preferred way of citing that part from another document.
     dcterms = self.ns['dcterms']
     describer.value(dcterms.identifier, "Doc:4711, p 42")
     # end part
     from lxml import etree
     xhtml = section.as_xhtml("http://example.org/doc")
     return etree.tostring(xhtml)
Ejemplo n.º 3
0
    def parse_document_from_soup(self, soup, doc):
        from ferenda.elements import Page
        from ferenda import Describer
        part = Page(["This is a part of a document"],
                    ordinal=42,
                    uri="http://example.org/doc#42",
                    meta=self.make_graph())
        d = Describer(part.meta, part.uri)
        d.rdftype(self.ns['bibo'].DocumentPart)
        # the dcterms:identifier for a document part is often whatever
        # would be the preferred way to cite that part in another
        # document
        d.value(self.ns['dcterms'].identifier, "Doc:4711, p 42")
# end part
        from lxml import etree
        return etree.tostring(part.as_xhtml("http://example.org/doc"))