Esempio n. 1
0
    def append_highlight_annotation(
        self,
        rectangle: Rectangle,
        color: Color = X11Color("Yellow"),
        contents: Optional[str] = None,
    ) -> "Page":
        """
        Append a highlight annotation covering ``rectangle`` to this page.

        The generic entries are produced by ``_create_annotation``; this method
        adds the highlight-specific ones (Subtype, QuadPoints, Border, CA) and
        stores the result in this page's "Annots" list, creating that list
        when it does not exist yet.

        :param rectangle: the area to highlight
        :param color:     passed on to ``_create_annotation`` (defaults to X11 "Yellow")
        :param contents:  optional text passed on to ``_create_annotation``
        :return:          self
        """
        highlight = self._create_annotation(
            rectangle=rectangle, color=color, contents=contents
        )

        # /Subtype (required): Highlight, Underline, Squiggly or StrikeOut for
        # the text markup annotations; this one is a highlight.
        highlight[Name("Subtype")] = Name("Highlight")

        # edges of the highlighted area
        left = rectangle.get_x()
        bottom = rectangle.get_y()
        right = left + rectangle.get_width()
        top = bottom + rectangle.get_height()

        # /QuadPoints (required): 8 x n numbers describing n quadrilaterals in
        # default user space. The specification lists the order as
        # x1 y1 x2 y2 x3 y3 x4 y4; the corners are written here in the order
        # (x, y), (x, y + h), (x + w, y), (x + w, y + h), i.e. the corners this
        # code has always labelled 1, 4, 2, 3.
        highlight[Name("QuadPoints")] = List().set_can_be_referenced(
            False)  # type: ignore [attr-defined]
        quad_points = highlight["QuadPoints"]
        for coordinate in (left, bottom, left, top, right, bottom, right, top):
            quad_points.append(pDecimal(coordinate))

        # /Border: [horizontal corner radius, vertical corner radius, width]
        highlight[Name("Border")] = List().set_can_be_referenced(
            False)  # type: ignore [attr-defined]
        border = highlight["Border"]
        for border_value in (0, 0, 1):
            border.append(pDecimal(border_value))

        # /CA (constant opacity of a markup annotation in the PDF specification)
        highlight[Name("CA")] = pDecimal(1)

        # register the annotation on this page
        if "Annots" not in self:
            self[Name("Annots")] = List()
        page_annotations = self["Annots"]
        assert isinstance(page_annotations, List)
        page_annotations.append(highlight)

        return self
    def test_extract_text_with_regex(self):
        """
        Find the "due date" text with a regular expression, mark its bounding
        box in a copy of the document, then re-read the input while keeping
        only the text inside the (slightly enlarged) box and print it.

        The bounding box is computed and checked before the output file is
        opened, so a missing match fails with a clear assertion message rather
        than an AttributeError on None further down, and no empty output file
        is left behind in that case.
        """
        regex_listener = RegularExpressionTextExtraction(
            "[dD]ue [dD]ate [0-9]+/[0-9]+/[0-9]+"
        )
        file: Path = Path("/home/joris/Code/pdf-corpus/0600.pdf")
        with open(file, "rb") as pdf_file_handle:
            doc = PDF.loads(pdf_file_handle, [regex_listener])

        # bounding boxes of everything the expression matched on page 0
        matched_events = (
            regex_listener.get_matched_chunk_of_text_render_events_per_page(0)
        )
        rects: typing.List[Rectangle] = [
            x.get_bounding_box() for x in matched_events
        ]
        bounding_box: typing.Optional[Rectangle] = self.bounding_box(rects)
        assert (
            bounding_box is not None
        ), "no bounding box could be determined for the matched text on page 0"

        # write a copy of the document with the matched area outlined in red
        output_file = self.output_dir / (file.stem + "_due_date_marked.pdf")
        with open(output_file, "wb") as pdf_file_handle:
            doc.get_page(0).append_polygon_annotation(
                LineArtFactory.rectangle(bounding_box),
                stroke_color=X11Color("Red"),
            )
            PDF.dumps(pdf_file_handle, doc)

        # expand box a bit
        p = Decimal(2)
        bounding_box = Rectangle(
            bounding_box.get_x() - p,
            bounding_box.get_y() - p,
            bounding_box.get_width() + 2 * p,
            bounding_box.get_height() + 2 * p,
        )

        # text extraction restricted to the expanded box
        text_listener = SimpleTextExtraction()
        location_filter = LocationFilter(
            bounding_box.get_x(),
            bounding_box.get_y(),
            bounding_box.get_x() + bounding_box.get_width(),
            bounding_box.get_y() + bounding_box.get_height(),
        ).add_listener(text_listener)

        # the loaded document itself is not used; the text is read back from
        # text_listener afterwards
        with open(file, "rb") as pdf_file_handle:
            PDF.loads(pdf_file_handle, [location_filter])

        print(text_listener.get_text(0))
    def test_extract_text_in_area(self):
        """
        Outline a fixed rectangle on page 0 in a copy of the document, then
        re-read the input while keeping only the text inside that rectangle
        and print it.
        """
        area = Rectangle(Decimal(50), Decimal(400), Decimal(200), Decimal(100))
        file: Path = Path("/home/joris/Code/pdf-corpus/0600.pdf")

        # first pass: plain load, used only to produce the marked copy
        with open(file, "rb") as input_handle:
            document = PDF.loads(input_handle)

        marked_file = self.output_dir / (file.stem + "_bill_to_marked.pdf")
        with open(marked_file, "wb") as output_handle:
            first_page = document.get_page(0)
            first_page.append_polygon_annotation(
                LineArtFactory.rectangle(area),
                stroke_color=X11Color("Red"),
            )
            PDF.dumps(output_handle, document)

        # second pass: text extraction restricted to the rectangle
        text_listener = SimpleTextExtraction()
        area_filter = LocationFilter(
            area.get_x(),
            area.get_y(),
            area.get_x() + area.get_width(),
            area.get_y() + area.get_height(),
        ).add_listener(text_listener)

        with open(file, "rb") as input_handle:
            document = PDF.loads(input_handle, [area_filter])

        print(text_listener.get_text(0))
Esempio n. 4
0
    def _create_annotation(
        self,
        rectangle: Rectangle,
        contents: Optional[str] = None,
        color: Optional[Color] = None,
        border_horizontal_corner_radius: Optional[Decimal] = None,
        border_vertical_corner_radius: Optional[Decimal] = None,
        border_width: Optional[Decimal] = None,
    ):
        """
        Build an annotation dictionary holding the generic annotation entries.

        Entries are written in this order: Type, Rect, Contents (optional), P,
        NM, M, F, Border (optional), C (optional). The AP, AS, StructParent and
        OC entries are not written. The dictionary is only returned; this
        method does not add it to the page's "Annots" list.

        :param rectangle: location of the annotation in default user space
        :param contents:  text for the Contents entry; omitted when None
        :param color:     color for the C entry; omitted when None
        :param border_horizontal_corner_radius: first Border element
        :param border_vertical_corner_radius:   second Border element
        :param border_width:                    third Border element
        :return: the new annotation dictionary

        The Border entry is written only when all three border arguments are
        given.
        """
        annotation = Dictionary()

        # /Type (optional): if present, shall be Annot.
        annotation[Name("Type")] = Name("Annot")

        # /Rect (required): the annotation rectangle in default user space,
        # written as [x, y, x + width, y + height].
        annotation[Name("Rect")] = List().set_can_be_referenced(
            False)  # type: ignore [attr-defined]
        rect_entry = annotation["Rect"]
        left = rectangle.get_x()
        bottom = rectangle.get_y()
        rect_corners = (
            left,
            bottom,
            left + rectangle.get_width(),
            bottom + rectangle.get_height(),
        )
        for coordinate in rect_corners:
            rect_entry.append(pDecimal(coordinate))

        # /Contents (optional): text displayed for the annotation or, for
        # annotation types that display no text, an alternate human-readable
        # description (see 14.9.3, "Alternate Descriptions", and 12.5.6,
        # "Annotation Types").
        if contents is not None:
            annotation[Name("Contents")] = String(contents)

        # /P (optional except for screen annotations; PDF 1.3): the page
        # object this annotation is associated with.
        annotation[Name("P")] = self

        # /NM (optional; PDF 1.4): a name identifying the annotation among the
        # annotations on its page; derived from how many the page holds now.
        if "Annots" in self:
            annotation_count = len(self["Annots"])
        else:
            annotation_count = 0
        annotation[Name("NM")] = String(
            "annotation-{0:03d}".format(annotation_count))

        # /M (optional; PDF 1.1): date and time of the most recent
        # modification (see 7.9.4, "Dates").
        annotation[Name("M")] = String(self._timestamp_to_str())

        # /F (optional; PDF 1.1): annotation flags (see 12.5.3, "Annotation
        # Flags"); 4 is bit position 3, the Print flag in that table.
        annotation[Name("F")] = pDecimal(4)

        # /Border (optional): [horizontal corner radius, vertical corner
        # radius, border width] in default user space units. Radii of 0 give
        # square corners; a width of 0 means no border is drawn. The optional
        # fourth element (dash array, PDF 1.1) is not written.
        border_values = (
            border_horizontal_corner_radius,
            border_vertical_corner_radius,
            border_width,
        )
        if all(value is not None for value in border_values):
            annotation[Name("Border")] = List().set_can_be_referenced(
                False)  # type: ignore [attr-defined]
            border_entry = annotation["Border"]
            for value in border_values:
                border_entry.append(pDecimal(value))

        # /C (optional; PDF 1.1): numbers in the range 0.0 to 1.0 giving the
        # annotation color; three elements select an RGB color.
        # NOTE(review): channels are divided by 256, so a channel value of 255
        # ends up just below 1.0 - confirm the range of to_rgb() components.
        if color is not None:
            color_max = pDecimal(256)
            annotation[Name("C")] = List().set_can_be_referenced(
                False)  # type: ignore [attr-defined]
            color_entry = annotation["C"]
            rgb = color.to_rgb()
            for channel in (rgb.red, rgb.green, rgb.blue):
                color_entry.append(pDecimal(channel / color_max))

        return annotation