Exemple #1
0
    def init(self, page: fitz.Page) -> Layout:
        '''Build the layout of `page`, collect its rectangle shapes and return the layout.

        Steps, in order:
        - create a `Layout` from the page's 'rawdict' text extraction and store it
          in `self._layout`;
        - feed every decoded content stream of the page, together with the page
          transformation matrix, to `rects.from_stream`;
        - feed the page annotations to `rects.from_annotations`;
        - in debug mode, open a new section in the debug document and plot the
          initial text blocks and rectangle shapes.
        '''
        # text layout straight from the raw dict of the page
        layout = Layout(page.getText('rawdict'))
        self._layout = layout

        # Shapes stored in the page source (per the original notes, typically
        # produced by a docx conversion, e.g. highlight, underline) are not PDF
        # comments, so they are read from the content streams. The contents are
        # wrapped first if that has not been done yet.
        if not page._isWrapped:
            page._wrapContents()

        # PDF -> PyMuPDF transformation matrix (PyMuPDF>=1.17.0)
        matrix = page.transformationMatrix

        for xref in page.getContents():
            raw_stream = self._doc_pdf._getXrefStream(xref)
            stream_text = raw_stream.decode(encoding="ISO-8859-1")
            layout.rects.from_stream(stream_text, matrix)

        # Comment shapes (annotations) of the page; per the original notes only
        # highlight, underline and strike-through lines are of interest.
        layout.rects.from_annotations(page.annots())

        # nothing to plot unless debugging
        if not self.debug_mode:
            return layout

        # start a new debug section for this page ...
        section_title = f'Page {page.number}'
        new_page_section(self._doc_debug, layout.width, layout.height,
                         section_title)

        # ... and draw the untouched layout: text blocks first, then shapes
        layout.plot(self._doc_debug,
                    'Original Text Blocks',
                    key=PlotControl.LAYOUT)
        layout.plot(self._doc_debug,
                    'Original Rectangle Shapes',
                    key=PlotControl.SHAPE)

        return layout
Exemple #2
0
def paths_from_annotations(page: fitz.Page):
    ''' Get shapes, e.g. Line, Square, Highlight, from annotations(comment shapes) in PDF page.
        ---
        Args:
        - page: fitz.Page, current page

        Returns:
        - list of the paths created by `_add_stroke_line`, `_add_stroke_rect` and
          `_add_fill_rect`, in the order the annotations are visited. Annotations of
          any other type contribute nothing.

        There are stroke and fill properties for each shape, representing border and filling area respectively.
        So, a square annotation with both stroke and fill will be converted to five rectangles here:
        four borders and one filling area.

        read more:
            - https://pymupdf.readthedocs.io/en/latest/annot.html
            - https://pymupdf.readthedocs.io/en/latest/vars.html#annotation-types
    '''
    res = []
    for annot in page.annots():

        # annot type, e.g. (8, 'Highlight')
        key = annot.type[0]

        # color, e.g. {'stroke': [1.0, 1.0, 0.0], 'fill': []}
        # an empty color list means "not set" and is mapped to None
        c = annot.colors
        sc = utils.RGB_value(c['stroke']) if c['stroke'] else None
        fc = utils.RGB_value(c['fill']) if c['fill'] else None

        # width
        w = annot.border.get('width', 1.0)  # width=-1 if not set
        w = 1.0 if w == -1 else w  # 1.0 by default

        # bbox
        rect = annot.rect

        # considering the contributions to text format and table borders,
        # only the following types are processed.
        # PDF_ANNOT_LINE 3
        # PDF_ANNOT_SQUARE 4
        # PDF_ANNOT_HIGHLIGHT 8
        # PDF_ANNOT_UNDERLINE 9
        # PDF_ANNOT_STRIKEOUT 11

        # Line: a space of 1.5*w around each border
        #
        # +----------------------------+
        # |         space              |
        # |     +--------------+       |
        # |     |   border     | 1.5w  |
        # |     +--------------+       |
        # |         1.5w               |
        # +----------------------------+
        #
        # NOTE: the line is placed on the vertical middle of the bbox, i.e. it is
        # treated as a horizontal line.
        if key == 3:
            x0 = rect.x0 + 1.5 * w
            x1 = rect.x1 - 1.5 * w
            y0 = y1 = (rect.y0 + rect.y1) / 2.0
            res.append(_add_stroke_line((x0, y0), (x1, y1), sc, w))

        # Square: a space of 0.5*w around each border
        # border rects and filling rects are to be extracted from original square
        #
        # +------------------------------------------+
        # |                space                     |
        # |      +----------------------------+      |
        # |      |         border             |      |
        # |      |     +--------------+       |      |
        # |      |     |     fill     |  w    | 0.5w |
        # |      |     +--------------+       |      |
        # |      |            w               |      |
        # |      +----------------------------+      |
        # |                  0.5w                    |
        # +------------------------------------------+
        #
        elif key == 4:
            # stroke rectangles: offset = 0.5w space + 0.5w (half of the border)
            if sc is not None:
                x0, y0 = rect.x0 + w, rect.y0 + w
                x1, y1 = rect.x1 - w, rect.y1 - w
                res.append(_add_stroke_rect((x0, y0), (x1, y1), sc, w))

            # fill rectangle: offset = 0.5w space + w (full border)
            if fc is not None:
                d = 1.5 * w
                x0, y0 = rect.x0 + d, rect.y0 + d
                x1, y1 = rect.x1 - d, rect.y1 - d
                res.append(_add_fill_rect((x0, y0), (x1, y1), fc))

        # highlight, underline, strikethrough: no space
        # For these shapes, `annot.rect` is a combination of all sub-highlights, especially
        # the highlight is continuous in multi-lines.
        # So, `annot.vertices` should be used here, i.e. vertices marked with `+` below.
        #          +------------------------+
        #          +------------------------+
        # +-----------+
        # +-----------+
        # NOTE: Though underline and strikethrough are just lines, the affected areas are same as
        # highlights, as illustrated above.
        #
        # https://github.com/pymupdf/PyMuPDF/issues/318
        #
        elif key in (8, 9, 11):
            # `vertices` may be None when the annotation carries no points;
            # treat that as "no quads" instead of failing on len(None).
            points = annot.vertices or []

            # four points in a group; an incomplete trailing group is ignored
            for i in range(len(points) // 4):
                # points 0/1 are used as the top edge and 2/3 as the bottom edge
                top_left, top_right, bottom_left, bottom_right = \
                    points[4 * i:4 * i + 4]

                # highlight: whole bbox
                if key == 8:
                    x0, y0 = top_left
                    x1, y1 = bottom_right

                    # NOTE: this is indeed a stroke for PyMuPDF -> no fill color but stroke color !!
                    res.append(_add_fill_rect((x0, y0), (x1, y1), sc))
                    continue

                # underline: bottom edge
                if key == 9:
                    start, end = bottom_left, bottom_right

                # strikethrough: average of top and bottom edge
                else:
                    y_ = (top_left[1] + bottom_left[1]) / 2.0
                    start = top_left[0], y_
                    end = top_right[0], y_

                res.append(_add_stroke_line(start, end, sc, w))

    return res