Beispiel #1
0
    def clip_page(cls,
                  page: fitz.Page,
                  bbox: fitz.Rect = None,
                  zoom: float = 3.0):
        """Render ``page`` with its text made invisible and clip it to ``bbox``.

        The page's content streams are rewritten so that every ``BT``, ``Tm``
        and ``Td`` operator is followed by ``3 Tr`` (text rendering mode 3:
        neither fill nor stroke, i.e. invisible text). The rewritten streams
        are written back to the parent document before rendering.

        Background:
        - https://github.com/pymupdf/PyMuPDF/issues/257
        - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
        - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
        - https://github.com/pymupdf/PyMuPDF/issues/181

        Args:
            page (fitz.Page): pdf page to render.
            bbox (fitz.Rect, optional): Area to clip; it is intersected with
                the page rectangle. Defaults to None, i.e. the entire page.
            zoom (float, optional): Scale factor applied on both axes to
                raise the resolution. Defaults to 3.0.

        Returns:
            The result of ``cls.to_raw_dict`` for the rendered pixmap and the
            clip area actually used.
        """
        # operator -> same operator followed by "invisible text" render mode
        substitutions = (
            (b'BT', b'BT 3 Tr'),
            (b'Tm', b'Tm 3 Tr'),
            (b'Td', b'Td 3 Tr'),
        )
        document = page.parent
        for content_xref in page.get_contents():
            source = document.xrefStream(content_xref)
            for operator, hidden in substitutions:
                source = source.replace(operator, hidden)
            document.updateStream(content_xref, source)

        # never clip beyond the page itself
        if bbox is None:
            clip_area = page.rect
        else:
            clip_area = bbox & page.rect

        scale = fitz.Matrix(zoom, zoom)
        pixmap = page.getPixmap(clip=clip_area, matrix=scale)  # type: fitz.Pixmap
        return cls.to_raw_dict(pixmap, clip_area)
Beispiel #2
0
    def clip_page(cls,
                  page: fitz.Page,
                  bbox: fitz.Rect = None,
                  zoom: float = 3.0):
        '''Render `page` with its text hidden and clip the result to `bbox`.

        Every `BT`, `Tm` and `Td` operator in the page's content streams gets
        `3 Tr` appended (text rendering mode 3: neither fill nor stroke, so
        the text is invisible); the modified streams are stored back in the
        parent document. The page is then rendered with a `zoom` x `zoom`
        matrix, clipped to `bbox` intersected with the page rectangle (the
        whole page when `bbox` is None), and handed to `cls.to_raw_dict`.

        Read more:
        - https://github.com/pymupdf/PyMuPDF/issues/257
        - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
        - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
        - https://github.com/pymupdf/PyMuPDF/issues/181
        '''
        # operator -> same operator followed by "invisible text" render mode
        substitutions = (
            (b'BT', b'BT 3 Tr'),
            (b'Tm', b'Tm 3 Tr'),
            (b'Td', b'Td 3 Tr'),
        )
        document = page.parent
        for content_xref in page._getContents():
            source = document._getXrefStream(content_xref)
            for operator, hidden in substitutions:
                source = source.replace(operator, hidden)
            document._updateStream(content_xref, source)

        # never clip beyond the page itself
        if bbox is None:
            clip_area = page.rect
        else:
            clip_area = bbox & page.rect

        scale = fitz.Matrix(zoom, zoom)
        pixmap = page.getPixmap(clip=clip_area, matrix=scale)  # type: fitz.Pixmap
        return cls.to_raw_dict(pixmap, clip_area)
Beispiel #3
0
def total_image_area(page: fitz.Page) -> int:
    """Add up the areas of every rectangle in which an image of `page`
    is displayed.

    Overlapping images are counted once per rectangle, so the result may
    be larger than the area of the page itself.
    """
    area = 0
    for image in page.get_images():
        for rect in page.get_image_rects(image):  # type: ignore
            area += rect.getArea()
    return area
    def extract_images(cls,
                       page: fitz.Page,
                       clip_image_res_ratio: float = 3.0):
        """Collect the images listed by ``Page.getImageList()`` as raw dicts.

        Args:
            page (fitz.Page): pdf page to extract images from.
            clip_image_res_ratio (float, optional): Zoom passed to
                ``cls._clip_page`` when an image has to be taken from the
                rendered page instead. Defaults to 3.0.

        Returns:
            list: One ``cls._to_raw_dict`` result per image kept.

        .. note::
            ``Page.getImageList()`` contains each image only once, which may
            be less than the real count of images in a page.
        """
        document = page.parent
        extracted = []

        # each entry: (xref, smask, width, height, bpc, colorspace, ...)
        for entry in page.getImageList(full=True):
            # getImageBbox should always be guarded, see
            # https://github.com/pymupdf/PyMuPDF/issues/487
            try:
                fields = list(entry)
                fields[-1] = 0  # last field is reset before the bbox lookup
                area = page.getImageBbox(fields)
            except ValueError:
                continue

            # images that do not touch the page are dropped
            if not area.intersects(page.rect):
                continue

            pixmap = cls._recover_pixmap(document, fields)

            # An image without colorspace consists of alpha values only; the
            # color seen in the PDF belongs to the page background, so the
            # rendered page is clipped to the image bbox instead.
            # https://github.com/pymupdf/PyMuPDF/issues/677
            if not pixmap.colorspace:
                pixmap = cls._clip_page(page, area, zoom=clip_image_res_ratio)

            extracted.append(cls._to_raw_dict(pixmap, area))

        return extracted
def filter_redactions_by_pixmap(
    redactions: List[RedactionType],
    page: Page,
) -> List[RedactionType]:
    """Keep only the candidate redactions whose rendered area is one color

    Each candidate's bounding box is rendered to an RGB pixmap. If the pixels
    vary in color, the area is not a uniform box and the candidate is
    dropped; otherwise it is kept as a bad redaction.

    :param redactions: A list of redactions that might be bad
    :param page: The PyMuPDF.Page object where the bad redactions might be
    :return: The candidates whose rendered bounding box is unicolor, in
    their original order
    """

    def renders_as_single_color(redaction) -> bool:
        # Render just the redaction's bounding box, in RGB.
        clipped = page.get_pixmap(
            colorspace=fitz.csRGB,
            clip=fitz.Rect(redaction["bbox"]),
        )
        return clipped.is_unicolor

    return [
        candidate
        for candidate in redactions
        if renders_as_single_color(candidate)
    ]
Beispiel #6
0
 def clip_page(cls, page:fitz.Page, bbox:fitz.Rect, zoom:float=3.0):
     '''Render the `bbox` area of `page`, scaled by `zoom` on both axes, and
     return `cls.to_raw_dict` of the resulting pixmap and `bbox`.

     Scaling the render matrix raises the resolution, see:
     - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
     - https://github.com/pymupdf/PyMuPDF/issues/181
     '''
     scale = fitz.Matrix(zoom, zoom)
     pixmap = page.getPixmap(clip=bbox, matrix=scale) # type: fitz.Pixmap
     return cls.to_raw_dict(pixmap, bbox)
Beispiel #7
0
    def init(self, page:fitz.Page) -> Layout:
        '''Build the layout object for `page`, store it in `self._layout` and return it.

        In debug mode the initial text blocks and rectangle shapes are also
        plotted to `self._doc_debug`.
        '''
        # NOTE: coordinates in the raw dict are relative to the un-rotated page
        # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
        page_dict = page.getText('rawdict')

        # 'width'/'height' in the raw dict are based on the un-rotated page,
        # while `page.rect` always reflects rotation; overwrite them with the
        # last two components of `page.rect`.
        *_, page_w, page_h = page.rect
        page_dict.update(width=page_w, height=page_h)
        self._layout = Layout(page_dict, page.rotationMatrix)

        # rectangle shapes from the page source
        self._layout.rects.from_stream(self.doc_pdf, page)

        # annotations (comment shapes), e.g. highlight, underline and
        # strike-through-line
        self._layout.rects.from_annotations(page)

        if not self.debug_mode:
            return self._layout

        # new section for the current pdf page, then the initial layout plots
        new_page_section(self._doc_debug, self._layout.width, self._layout.height, f'Page {page.number}')
        plot_specs = (
            ('Original Text Blocks', PlotControl.LAYOUT),
            ('Original Rectangle Shapes', PlotControl.SHAPE),
        )
        for title, plot_key in plot_specs:
            self._layout.plot(self._doc_debug, title, key=plot_key)

        return self._layout
Beispiel #8
0
def search_in_page(regex: re.Pattern, page: Page) -> List[dict]:
    """Collect the text spans of `page` whose text matches `regex`

    Arguments
      regex: compiled pattern, applied with `regex.search` to each span's text
      page: the page whose text dictionary is scanned
    Returns
      the list of matching span dicts, in document order

    The search is deliberately naive: each span is tested on its own, so
    matches across line breaks or more complex layouts are not found. Have a
    look at `page.searchFor` if that is needed; the current approach should
    be enough for TeX-generated pdf.
    """
    page_meta = page.getTextPage().extractDICT()

    # .get(key, default) is used throughout to tolerate missing keys
    return [
        span
        for block in page_meta.get('blocks', [])
        for line in block.get('lines', [])
        for span in line.get('spans', [])
        if regex.search(span.get('text', ""))
    ]
Beispiel #9
0
    def __init__(self, doc: fitz.Document, page: fitz.Page) -> None:
        """Record the document and page and list the page's non-mask images."""
        self.doc = doc
        self._page = page

        # Page's cropbox (to help identify what part of an image is actually
        # being displayed), already rotated
        self.bbox = page.bound()

        # Build one record per cross-referenced image, then leave out those
        # that serve as a soft mask of another image (masks can't be easily
        # downsampled).
        candidates = [
            self._build_xref_image(entry)
            for entry in page.get_images(full=True)
        ]
        mask_xrefs = {
            mask
            for mask in (candidate['smask'] for candidate in candidates)
            if mask
        }
        self.xref_images = [
            candidate
            for candidate in candidates
            if candidate['xref'] not in mask_xrefs
        ]

        # Lazy, memoized attribute: matches block numbers to image hashes
        self._block_hashes = None
Beispiel #10
0
def render_fitz_page(
    page: fitz.Page, zoom: float, pixel_ratio: float, clip: fitz.Rect = None
) -> QtGui.QPixmap:
    """Render `page` (optionally only `clip`) into a Qt image.

    The page is rendered at `zoom * pixel_ratio` on both axes, converted via
    PIL, and the resulting image's device pixel ratio is set to
    `pixel_ratio`.

    NOTE(review): the object returned is the one built by `ImageQt.ImageQt`,
    although the annotation says `QtGui.QPixmap` - confirm what callers expect.
    """
    factor = zoom * pixel_ratio
    pixmap = page.getPixmap(matrix=fitz.Matrix(factor, factor), clip=clip)

    # the pixmap carries an alpha channel only in some cases
    if pixmap.alpha:
        mode = "RGBA"
    else:
        mode = "RGB"

    pil_image = Image.frombytes(mode, [pixmap.width, pixmap.height], pixmap.samples)
    qt_image = ImageQt.ImageQt(pil_image)
    qt_image.setDevicePixelRatio(pixel_ratio)
    return qt_image
Beispiel #11
0
def image_from_page(page: fitz.Page, scale: float = 1) -> Image:
    """Render a page and return it as an image.
    :param page: the page to be represented as an image
    :param scale: the factor applied to both axes when rendering
    """
    pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale))  # type: ignore
    if pixmap.alpha:
        mode = "RGBA"
    else:
        mode = "RGB"
    size = (pixmap.width, pixmap.height)
    return Image.frombytes(mode, size, pixmap.samples)  # type: ignore
Beispiel #12
0
    def extract_images(
        cls,
        page: fitz.Page,
        clip_image_res_ratio: float = 3.0  # resolution ratio of clipped bitmap
    ):
        ''' Build raw image dicts from the entries of `Page.getImageList()`.

            NOTE: Page.getImageList() contains each image only once, which may be less than
            the real count of images in a page.
        '''
        document = page.parent
        extracted = []

        # each entry: (xref, smask, width, height, bpc, colorspace, ...)
        for entry in page.getImageList(full=True):
            # getImageBbox should always be guarded, see
            # https://github.com/pymupdf/PyMuPDF/issues/487
            try:
                fields = list(entry)
                fields[-1] = 0  # last field is reset before the bbox lookup
                area = page.getImageBbox(fields)
            except ValueError:
                continue

            # images that do not touch the page are dropped
            if not area.intersects(page.rect):
                continue

            pixmap = ImagesExtractor.recover_pixmap(document, fields)

            # An image without colorspace consists of alpha values only; the
            # color seen in the PDF belongs to the page background, so the
            # rendered page is clipped to the image bbox instead.
            # https://github.com/pymupdf/PyMuPDF/issues/677
            if pixmap.colorspace:
                extracted.append(cls.to_raw_dict(pixmap, area))
            else:
                extracted.append(
                    cls.clip_page(page, area, zoom=clip_image_res_ratio))

        return extracted
Beispiel #13
0
 def to_image(self, page: fitz.Page):
     '''Render this object's bbox area of `page` and wrap it as an image block dict.

     The dict has type 1 and carries the bbox as a tuple, its width and
     height, and the rendered area encoded as PNG data.
     '''
     area = self.bbox
     png_data = page.getPixmap(clip=area).getImageData(output="png")
     block = {
         'type': 1,
         'bbox': tuple(area),
         'ext': 'png',
         'width': area.width,
         'height': area.height,
         'image': png_data
     }
     return block
Beispiel #14
0
    def _hide_page_text(page: fitz.Page):
        '''Make the text of `page` invisible by rewriting its streams in place.

        Text may live in the page's own content streams as well as in form
        objects referenced by the page, so both kinds of stream are rewritten:
        every `BT`, `Tm` and `Td` operator gets `3 Tr` appended (text
        rendering mode 3: neither fill nor stroke, i.e. invisible).

        Read more:
        - https://github.com/pymupdf/PyMuPDF/issues/257
        - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
        '''
        # form objects first, then the direct page contents
        targets = [
            xref for xref, _name, _invoker, _bbox in page.get_xobjects()
        ]
        targets.extend(page.get_contents())

        # operator -> same operator followed by "invisible text" render mode
        substitutions = (
            (b'BT', b'BT 3 Tr'),
            (b'Tm', b'Tm 3 Tr'),
            (b'Td', b'Td 3 Tr'),
        )
        document = page.parent  # type: fitz.Document
        for target_xref in targets:
            source = document.xref_stream(target_xref)
            for operator, hidden in substitutions:
                source = source.replace(operator, hidden)
            document.update_stream(target_xref, source)
Beispiel #15
0
    def init(self, page: fitz.Page) -> Layout:
        '''Create the layout for `page`, store it in `self._layout` and return it.

        Rectangle shapes are read from the page's content streams and from its
        annotations; in debug mode the initial layout is plotted as well.
        '''
        # layout object based on the raw dict
        self._layout = Layout(page.getText('rawdict'))

        # Shapes in the page source are generally converted from docx, e.g.
        # highlight, underline; they differ from PDF comments like highlight,
        # rectangle. The contents are wrapped first if they are not yet.
        if not page._isWrapped:
            page._wrapContents()

        # transformation matrix from PDF to PyMuPDF (PyMuPDF>=1.17.0)
        to_pymupdf = page.transformationMatrix

        for content_xref in page.getContents():
            raw_stream = self._doc_pdf._getXrefStream(content_xref)
            text_stream = raw_stream.decode(encoding="ISO-8859-1")
            self._layout.rects.from_stream(text_stream, to_pymupdf)

        # annotations (comment shapes): highlight, underline and
        # strike-through-line only
        self._layout.rects.from_annotations(page.annots())

        if not self.debug_mode:
            return self._layout

        # new section for the current pdf page, then the initial layout plots
        new_page_section(self._doc_debug, self._layout.width,
                         self._layout.height, f'Page {page.number}')
        plot_specs = (
            ('Original Text Blocks', PlotControl.LAYOUT),
            ('Original Rectangle Shapes', PlotControl.SHAPE),
        )
        for title, plot_key in plot_specs:
            self._layout.plot(self._doc_debug, title, key=plot_key)

        return self._layout
Beispiel #16
0
    def extract_images(cls, page:fitz.Page):
        '''Get images from current page as a list of raw image dicts.

        Each entry of `page.getImageList(full=True)` is located on the page
        and converted with `cls.to_raw_dict`, or with `cls.clip_page` when the
        image carries no colorspace. An image whose bounding box cannot be
        determined (`getImageBbox` raises `ValueError`) is skipped instead of
        aborting the extraction of the whole page.
        '''
        # pdf document
        doc = page.parent

        # check each image item:
        # (xref, smask, width, height, bpc, colorspace, ...)
        images = []
        for item in page.getImageList(full=True):
            # getImageBbox should always be wrapped in a try-except clause, per
            # https://github.com/pymupdf/PyMuPDF/issues/487
            try:
                bbox = page.getImageBbox(item[7]) # item[7]: name entry of such an item
            except ValueError:
                continue

            # recover the pixmap only once the image could be located
            pix = recover_pixmap(doc, item)

            # regarding images consist of alpha values only, i.e. colorspace is None,
            # the turquoise color shown in the PDF is not part of the image, but part of PDF background.
            # So, just to clip page pixmap according to the right bbox
            # https://github.com/pymupdf/PyMuPDF/issues/677
            if not pix.colorspace:
                raw_dict = cls.clip_page(page, bbox, zoom=3.0)
            else:
                raw_dict = cls.to_raw_dict(pix, bbox)
            images.append(raw_dict)
        return images
Beispiel #17
0
 def _analyze_page(self, page: fitz.Page):
     """Analyze `page` and append one entry to each per-page record.

     The text embedded in the page is used as-is when images cover less
     than `self.image_area_thresh` of the page and the count of '�'
     characters in it does not exceed `self.max_unreadable`; otherwise
     the page goes through `self._run_ocr`.
     """
     original_text = page.get_text()  # type: ignore

     def too_garbled() -> bool:
         # number of replacement characters in the embedded text
         unreadable = sum(1 for ch in original_text if ch == '�')
         return unreadable > self.max_unreadable

     image_share = total_image_area(page) / page.bound().getArea()
     used_original_text = (
         image_share < self.image_area_thresh and not too_garbled()
     )

     if used_original_text:
         # embedded text is good enough: there is no OCR data to record
         metadata = orientation_used = scale = None
         language = detected_language(original_text)
         self.texts.append(original_text)
         self.mean_confidences.append(None)
     else:
         # choose the language passed to OCR: detect it when there is enough
         # embedded text, otherwise take the first of the known languages
         if len(original_text) >= self.text_len_thresh:
             ocr_language = detected_language(original_text)
         else:
             ocr_language = self.languages.items[0]
         metadata, orientation_used, language, scale = self._run_ocr(
             page, ocr_language
         )
         if mean_conf(metadata) < self.coarse_thresh:
             warnings.warn('Failed to analyze image.')
         # prefer the corrected column when the OCR data has one
         if 'corrected' in metadata.columns:
             recognized = metadata.corrected
         else:
             recognized = metadata.text
         self.texts.append(data_to_string(recognized))
         self.mean_confidences.append(mean_conf(metadata))

     # bookkeeping shared by both paths
     self.languages.add_weight(language)
     self.metadata.append(metadata)
     self.orientations.append(orientation_used)
     self.page_languages.append(language)
     self.used_original_texts.append(used_original_text)
     self.times.append(time.time())
     self.scales.append(scale)
    def _hide_page_text(cls, page: fitz.Page):
        """Make the text of ``page`` invisible by rewriting its content streams.

        Every ``BT``, ``Tm`` and ``Td`` operator gets ``3 Tr`` appended (text
        rendering mode 3: neither fill nor stroke, i.e. invisible) and the
        rewritten stream is stored back in the parent document.

        Read more:
        - https://github.com/pymupdf/PyMuPDF/issues/257
        - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf

        Args:
            page (fitz.Page): pdf page whose text is to be hidden.
        """
        # operator -> same operator followed by "invisible text" render mode
        substitutions = (
            (b'BT', b'BT 3 Tr'),
            (b'Tm', b'Tm 3 Tr'),
            (b'Td', b'Td 3 Tr'),
        )
        document = page.parent
        for content_xref in page.get_contents():
            source = document.xrefStream(content_xref)
            for operator, hidden in substitutions:
                source = source.replace(operator, hidden)
            document.updateStream(content_xref, source)
Beispiel #19
0
    def initialize(self, page: fitz.Page):
        '''Assemble the raw layout of `page`, build `self._layout` from it and return it.

        The raw dict from `page.getText('rawdict')` is completed with the
        rotation-aware page size, re-extracted images, and the images and
        paths produced from vector graphics.
        '''
        # NOTE: coordinates in the raw dict are relative to the un-rotated page
        # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
        page_dict = page.getText('rawdict')

        # --- page size ---
        # 'width'/'height' in the raw dict are based on the un-rotated page,
        # while `page.rect` always reflects rotation; overwrite them with the
        # last two components of `page.rect`.
        *_, page_w, page_h = page.rect
        page_dict.update(width=page_w, height=page_h)

        # --- page images ---
        # Image bytes in the raw dict can't reproduce transparent images, so
        # the original image blocks are disabled (type -1) and the images are
        # extracted again.
        for block in page_dict['blocks']:
            if block['type'] == 1:
                block['type'] = -1
        page_dict['blocks'].extend(ImagesExtractor.extract_images(page))

        # --- page paths ---
        # vector graphic paths are converted to pixmaps
        self._paths_extractor = PathsExtractor()
        path_images, paths = self._paths_extractor.extract_paths(page)
        page_dict['blocks'].extend(path_images)
        page_dict['paths'] = paths

        # --- layout ---
        self._layout = Layout(page_dict, page.rotationMatrix)
        return self._layout
Beispiel #20
0
    def _page_blocks(page:fitz.Page):
        '''Return the raw dict of `page` with image blocks replaced by recovered images.

        `page.getText('rawdict')` yields one image block per image location
        (duplicates included), whereas `Page.getImageList()` lists each image
        only once, see
        https://pymupdf.readthedocs.io/en/latest/textpage.html#dictionary-structure-of-extractdict-and-extractrawdict

        As a compromise, image contents come from `ImagesExtractor.extract_images`
        and image locations from the raw dict: original image blocks are grouped
        by the hash of their image bytes, and every recovered image replaces the
        contents of the first group containing a block at the same (rounded)
        bbox. A recovered image matching no group is appended as a new block.
        '''
        # NOTE: coordinates in the raw dict are relative to the un-rotated page
        # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
        raw_layout = page.getText('rawdict')

        # extract and recover images
        recovered_images = ImagesExtractor.extract_images(page)

        # group the original image blocks (type 1) by image contents
        groups = defaultdict(list)
        for block in raw_layout['blocks']:
            if block['type'] == 1:
                groups[hash(block['image'])].append(block)

        def rounded_bbox(img):
            return [round(value) for value in img['bbox']]

        for recovered in recovered_images:
            target = rounded_bbox(recovered)

            # first group holding a block located at the recovered image's bbox
            matched = next(
                (group for group in groups.values()
                 if any(rounded_bbox(member) == target for member in group)),
                None)

            if matched is None:
                # an image outside the page is not counted in page.getText(),
                # so it is added here
                raw_layout['blocks'].append(recovered)
            else:
                for member in matched:
                    member['image'] = recovered['image']

        return raw_layout
Beispiel #21
0
    def initialize(self, page:fitz.Page):
        '''Build the layout object for `page`, store it in `self._layout` and return it.'''
        # NOTE: coordinates in the raw dict are relative to the un-rotated page
        # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
        page_dict = page.getText('rawdict')

        # 'width'/'height' in the raw dict are based on the un-rotated page,
        # while `page.rect` always reflects rotation; overwrite them with the
        # last two components of `page.rect`.
        *_, page_w, page_h = page.rect
        page_dict.update(width=page_w, height=page_h)
        self._layout = Layout(page_dict, page.rotationMatrix)

        rects = self._layout.rects

        # rectangle shapes from the page source
        rects.from_stream(self.doc_pdf, page)

        # annotations (comment shapes), e.g. highlight, underline and
        # strike-through-line
        rects.from_annotations(page)

        return self._layout
Beispiel #22
0
    def initialize(self, page:fitz.Page):
        '''Build the layout object for `page`, including path-derived images, and return it.'''
        # NOTE: coordinates in the raw dict are relative to the un-rotated page
        # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
        page_dict = page.getText('rawdict')

        # 'width'/'height' in the raw dict are based on the un-rotated page,
        # while `page.rect` always reflects rotation; overwrite them with the
        # last two components of `page.rect`.
        *_, page_w, page_h = page.rect
        page_dict.update(width=page_w, height=page_h)

        # pdf paths and the images converted from them
        self._paths = PathsExtractor()
        parsed = self._paths.parse(page)
        path_images, paths = parsed.filter_pixmaps(page)
        page_dict['blocks'].extend(path_images)
        page_dict['paths'] = paths

        # init layout
        self._layout = Layout(page_dict, page.rotationMatrix)
        return self._layout
def get_intersecting_chars(page: Page,
                           rectangles: List[Rect]) -> List[CharDictType]:
    """Get the chars that are occluded by the rectangles

    This works in two stages. First, each text span is checked against the
    rectangles; then, only for spans that intersect, each of the span's chars
    is checked individually (with an occlusion threshold of 0.8), so chars of
    non-intersecting spans are never examined.

    :param page: The PyMuPDF.Page object to inspect
    :param rectangles: A list of PyMuPDF.Rect objects from the page (aka the
    redactions).
    :return A list of characters that are under the rectangles
    """
    if not rectangles:
        return []

    def tagged_rect(bbox, seqno, color):
        # Rect carrying the sequence number and color of its text span
        rect = fitz.Rect(bbox)
        rect.seqno = seqno
        rect.fill = color
        return rect

    occluded = []
    for span in page.get_texttrace():
        seqno = span["seqno"]
        color = span["color"]

        # Stage 1: skip the whole span unless it touches a rectangle.
        if not intersects(tagged_rect(span["bbox"], seqno, color), rectangles):
            continue

        # Stage 2: test the span's chars one by one.
        for char in span["chars"]:
            char_rect = tagged_rect(char[3], seqno, color)
            if not intersects(char_rect, rectangles, occlusion_threshold=0.8):
                continue
            occluded.append({
                "rect": char_rect,
                "c": chr(char[0]),
            })

    return occluded
    def extract_vector_graphics(cls,
                                page: fitz.Page,
                                exclude_areas: list,
                                clip_image_res_ratio: float = 3.0):
        """Extract vector graphics as bitmaps clipped from the rendered page.

        Args:
            page (fitz.Page): pdf page to extract images from.
            exclude_areas (list): A list of bbox-like ``(x0, y0, x1, y1)`` areas to
                exclude, e.g. raster image area, table area.
            clip_image_res_ratio (float, optional): Scale factor applied on both
                axes when rendering each clipped area. Defaults to 3.0.

        Returns:
            list: One ``cls._to_raw_dict`` result per contour group.

        .. note::
            Contours are obtained from ``cls._detect_svg_contours`` and grouped
            with the intersection of their bboxes as grouping criterion.
        """
        def overlap(a, b):
            # grouping criterion: intersection of the two bboxes
            return a.bbox & b.bbox

        def clipped(area):
            # render one area of the page at the requested resolution
            scale = fitz.Matrix(clip_image_res_ratio, clip_image_res_ratio)
            pixmap = page.getPixmap(clip=area, matrix=scale)
            return cls._to_raw_dict(pixmap, area)

        contours = cls._detect_svg_contours(page, exclude_areas)
        return [clipped(group.bbox) for group in contours.group(overlap)]
    def _clip_page(cls,
                   page: fitz.Page,
                   bbox: fitz.Rect = None,
                   zoom: float = 3.0):
        """Render ``page`` with hidden text and clip it to ``bbox``.

        Text is hidden first through ``cls._hide_page_text``; the page is then
        rendered with a ``zoom`` x ``zoom`` matrix to raise the resolution:

        - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
        - https://github.com/pymupdf/PyMuPDF/issues/181

        Args:
            page (fitz.Page): pdf page to render.
            bbox (fitz.Rect, optional): Area to clip; it is intersected with
                the page rectangle. Defaults to None, i.e. the entire page.
            zoom (float, optional): Scale factor applied on both axes.
                Defaults to 3.0.

        Returns:
            fitz.Pixmap: The rendered pixmap.
        """
        cls._hide_page_text(page)

        # never clip beyond the page itself
        if bbox is None:
            clip_area = page.rect
        else:
            clip_area = bbox & page.rect

        scale = fitz.Matrix(zoom, zoom)
        return page.getPixmap(clip=clip_area, matrix=scale)  # type: fitz.Pixmap
Beispiel #26
0
    def init(self, page:fitz.Page) -> Layout:
        '''Build the layout object for `page`, store it in `self._layout` and return it.

        The raw dict of the page is passed through `split_blocks` before the
        layout is created. In debug mode the initial text blocks and rectangle
        shapes are also plotted to `self._doc_debug`.
        '''
        # layout object based on the (split) raw dict
        page_dict = split_blocks(page.getText('rawdict'))
        self._layout = Layout(page_dict)

        # rectangle shapes from the page source
        self._layout.rects.from_stream(self.doc_pdf, page)

        # annotations (comment shapes), e.g. highlight, underline and
        # strike-through-line
        self._layout.rects.from_annotations(page)

        if not self.debug_mode:
            return self._layout

        # new section for the current pdf page, then the initial layout plots
        new_page_section(self._doc_debug, self._layout.width, self._layout.height, f'Page {page.number}')
        plot_specs = (
            ('Original Text Blocks', PlotControl.LAYOUT),
            ('Original Rectangle Shapes', PlotControl.SHAPE),
        )
        for title, plot_key in plot_specs:
            self._layout.plot(self._doc_debug, title, key=plot_key)

        return self._layout
Beispiel #27
0
def paths_from_stream(page: fitz.Page):
    ''' Get paths, e.g. highlight, underline and table borders, from page source contents.
        ---
        Args:
        - page: fitz.Page, current page. `page.cleanContents()` is called on it before parsing.

        Returns:
        - list: one entry per painted path, in painting order. Each entry is whatever
          `_stroke_path()` (stroked paths) or `_fill_rect_path()` (filled paths) returns.

        The page source is represented as contents of stream object. For example,
        ```
            /P<</MCID 0>> BDC
            ...
            1 0 0 1 90.0240021 590.380005 cm
            ...
            1 1 0 rg # or 0 g
            ...
            285.17 500.11 193.97 13.44 re f*
            ...
            214 320 m
            249 322 l
            426 630 425 630 422 630 c
            ...
            EMC
        ```
        where,
        - `cm` specifies a coordinate system transformation, here (0,0) translates to (90.0240021 590.380005)
        - `q`/`Q` save/restore the graphics state
        - `rg` / `g` specify color mode: rgb / grey
        - `re`, `f` or `f*`: fill rectangle path with pre-defined color
        - `m` (move to) and `l` (line to) define a path
        - `c` draws a cubic Bezier curve with given control points

        In this case,
        - a rectangle with:
            - fill color is yellow (1,1,0)
            - lower left corner: (285.17 500.11)
            - width: 193.97
            - height: 13.44
        - a line from (214, 320) to (249, 322)
        - a Bezier curve with control points (249,322), (426,630), (425,630), (422,630)

        Read more:
        - https://github.com/pymupdf/PyMuPDF/issues/263
        - https://github.com/pymupdf/PyMuPDF/issues/225
        - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdf_reference_archive/pdf_reference_1-7.pdf
    '''
    # Each object in PDF has a cross-reference number (xref):
    # - to get its source contents: `doc.xrefObject()` or low level API `doc._getXrefString()`; but for stream objects, only the non-stream part is returned
    # - to get the stream data: `doc.xrefStream(xref)` or low level API `doc._getXrefStream(xref)`
    # - the xref for a page object itself: `page.xref`
    # - all stream xref contained in one page: `page.getContents()`
    # - combine all stream object contents together: `page.readContents()` with PyMuPDF>=1.17.0
    #
    # Clean contents first:
    # syntactically correct, standardize and pretty print the contents stream
    page.cleanContents()
    xref_stream = page.readContents().decode(encoding="ISO-8859-1")

    # transformation matrix for coordinate system conversion from pdf to fitz
    # NOTE: transformation matrix converts PDF CS to UNROTATED PyMuPDF page CS,
    #       so need further rotation transformation to the real page CS (applied in Object BBox)
    # https://github.com/pymupdf/PyMuPDF/issues/619
    matrix = page.transformationMatrix

    # Graphic States: working CS is coincident with the absolute origin (0, 0)
    # Refer to PDF reference v1.7 4.2.3 Transformation Matrices
    #                        | a b 0 |
    # [a, b, c, d, e, f] =>  | c d 0 |
    #                        | e f 1 |
    # Naming convention used below: `A*` is the saved-state stack (pushed by `q`,
    # popped by `Q`), `W*` is the current working value.
    ACS = [fitz.Matrix(0.0)]  # identity matrix
    WCS = fitz.Matrix(0.0)

    # Graphics color:
    # - color space: PDF Reference Section 4.5 Color Spaces
    # NOTE: colors should really be computed under arbitrary color spaces, but that is hard work for now.
    # So, consider device color space only like DeviceGray, DeviceRGB, DeviceCMYK, and set black for all others.
    device_space = True
    color_spaces = _check_device_cs(page)

    # - stroking color
    Acs = [utils.RGB_value((0.0, 0.0, 0.0))]  # stored value -> stack
    Wcs = Acs[0]  # working value
    # - filling color
    Acf = [utils.RGB_value((0.0, 0.0, 0.0))]
    Wcf = Acf[0]

    # Stroke width
    # NOTE(review): the PDF initial line width is 1.0 as far as I recall - confirm 0.0 is intended
    Ad = [0.0]
    Wd = Ad[0]

    # collecting paths: each path is a list of points
    paths = []  # a list of path

    # clip path
    # NOTE: within this function the clipping path is only saved/restored with the
    # graphics state; it is never applied to the collected paths (see TODO below).
    Acp = []  # stored clipping path
    Wcp = []  # working clipping path

    # Check line by line
    # Cleaned by `page.cleanContents()`, operator and operand are aligned in a same line;
    # otherwise, have to check stream contents word by word (line always changes)
    lines = xref_stream.splitlines()

    res = []  # final results
    for line in lines:

        words = line.split()
        if not words: continue

        op = words[-1]  # operator always at the end after page.cleanContents()

        # -----------------------------------------------
        # Color Operators: PDF Reference Table 4.24
        # -----------------------------------------------
        # - set color space:
        #   color_space_name cs  # specify color space
        #   c1 c2 ... SC/SCN     # components under defined color space
        # Both `cs` and `CS` land here: either one resets BOTH the stroking and the
        # filling color to black and updates the single shared `device_space` flag.
        if op.upper() == 'CS':
            Wcs = utils.RGB_value((0.0, 0.0, 0.0))
            Wcf = utils.RGB_value((0.0, 0.0, 0.0))

            # Consider normal device cs only
            device_space = color_spaces.get(words[0], False)

        # - set color: color components under specified color space
        elif op.upper() == 'SC':  # c1 c2 ... cn SC
            c = _RGB_from_color_components(words[0:-1], device_space)
            #  non-stroking color
            if op == 'sc':
                Wcf = c
            # stroking color
            else:
                Wcs = c

        # - set color: color components under specified color space
        elif op.upper() == 'SCN':  # c1 c2 ... cn [name] SCN
            # the optional trailing name operand is dropped when present
            if utils.is_number(words[-2]):
                c = _RGB_from_color_components(words[0:-1], device_space)
            else:
                c = _RGB_from_color_components(words[0:-2], device_space)

            #  non-stroking color
            if op == 'scn':
                Wcf = c
            # stroking color
            else:
                Wcs = c

        # - DeviceGray space, equal to:
        # /DeviceGray cs
        # c sc
        elif op.upper() == 'G':  # 0 g
            g = float(words[0])
            # nonstroking color, i.e. filling color here
            if op == 'g':
                Wcf = utils.RGB_value((g, g, g))
            # stroking color
            else:
                Wcs = utils.RGB_value((g, g, g))

        # - DeviceRGB space
        elif op.upper() == 'RG':  # 1 1 0 rg
            r, g, b = map(float, words[0:-1])

            #  nonstroking color
            if op == 'rg':
                Wcf = utils.RGB_value((r, g, b))
            # stroking color
            else:
                Wcs = utils.RGB_value((r, g, b))

        # - DeviceCMYK space
        elif op.upper() == 'K':  # c m y k K
            c, m, y, k = map(float, words[0:-1])
            #  nonstroking color
            if op == 'k':
                Wcf = utils.CMYK_to_RGB(c, m, y, k, cmyk_scale=1.0)
            # stroking color
            else:
                Wcs = utils.CMYK_to_RGB(c, m, y, k, cmyk_scale=1.0)

        # -----------------------------------------------
        # Graphics State Operators: PDF References Table 4.7
        # -----------------------------------------------
        # CS transformation: a b c d e f cm, e.g.
        # 0.05 0 0 -0.05 0 792 cm
        # refer to PDF Reference 4.2.2 Common Transformations for detail
        elif op == 'cm':
            # update working CS
            components = list(map(float, words[0:-1]))
            Mt = fitz.Matrix(*components)
            WCS = Mt * WCS  # M' = Mt x M

        # stroke width
        elif op == 'w':  # 0.5 w
            Wd = float(words[0])

        # save or restore graphics state:
        # only transformation, colors, stroke width and clipping path are tracked here
        elif op == 'q':  # save
            ACS.append(fitz.Matrix(WCS))  # copy as new matrix
            Acf.append(Wcf)
            Acs.append(Wcs)
            Ad.append(Wd)
            Acp.append(Wcp)

        # NOTE(review): `Acp` starts empty while the other stacks start with one entry,
        # so a `Q` without a matching `q` raises IndexError on `Acp.pop()`.
        elif op == 'Q':  # restore
            WCS = fitz.Matrix(ACS.pop())  # copy as new matrix
            Wcf = Acf.pop()
            Wcs = Acs.pop()
            Wd = Ad.pop()
            Wcp = Acp.pop()

        # -----------------------------------------------
        # Path Construction Operators: PDF References Table 4.9
        # -----------------------------------------------
        # rectangle block:
        # x y w h re is equivalent to
        # x   y   m
        # x+w y   l
        # x+w y+h l
        # x   y+h l
        # h          # close the path
        elif op == 're':
            # ATTENTION:
            # top/bottom, left/right is relative to the positive direction of CS,
            # while a reverse direction may be performed, so be careful when calculating
            # the corner points.
            # Coordinates in the transformed PDF CS:
            #   y1 +----------+
            #      |          | h
            #   y0 +----w-----+
            #      x0        x1
            #

            # (x, y, w, h) before this line
            x0, y0, w, h = map(float, words[0:-1])
            path = []
            path.append((x0, y0))
            path.append((x0 + w, y0))
            path.append((x0 + w, y0 + h))
            path.append((x0, y0 + h))
            path.append((x0, y0))  # back to the start point: the rectangle is stored closed

            paths.append(path)

        # path: m -> move to point to start a path
        elif op == 'm':  # x y m
            x0, y0 = map(float, words[0:-1])
            paths.append([(x0, y0)])

        # path: l -> straight line to point
        # (appends to the most recent path, so a preceding `m` / `re` is required)
        elif op == 'l':  # x y l
            x0, y0 = map(float, words[0:-1])
            paths[-1].append((x0, y0))

        # path: c -> cubic Bezier curve with control points
        elif op in ('c', 'v', 'y'):
            coords = list(map(float, words[0:-1]))
            P = [(coords[i], coords[i + 1]) for i in range(0, len(coords), 2)]
            x0, y0 = paths[-1][-1]  # current point = last point of the most recent path

            # In every case P ends up as four points: start, two control points, end.
            # x1 y1 x2 y2 x3 y3 c -> (x1,y1), (x2,y2) as control points
            if op == 'c':
                P.insert(0, (x0, y0))

            # x2 y2 x3 y3 v -> (x0,y0), (x2,y2) as control points
            elif op == 'v':
                P.insert(0, (x0, y0))
                P.insert(0, (x0, y0))

            # x1 y1 x3 y3 y -> (x1,y1), (x3,y3) as control points
            else:
                P.insert(0, (x0, y0))
                P.append(P[-1])

            # approximate the Bezier curve by points from its parametric equation
            bezier = _bezier_paths(P, segments=5)
            paths[-1].extend(bezier)

        # close the path
        # NOTE(review): this closes every path collected so far, not only the most
        # recent subpath - confirm that is intended.
        elif op == 'h':
            for path in paths:
                _close_path(path)

        # -----------------------------------------------
        # Path-painting Operators: PDF Reference Table 4.10
        # -----------------------------------------------
        # stroke the path (`S`), or close and stroke it (`s`)
        elif op.upper() == 'S':
            # close
            if op == 's':
                for path in paths:
                    _close_path(path)

            # stroke path
            for path in paths:
                p = _stroke_path(path, WCS, Wcs, Wd, matrix)
                res.append(p)

            # reset path
            paths = []

        # fill the path
        # NOTE: compares the whole `line` (not `op`), so this only matches when the
        # operator is the complete line without surrounding whitespace.
        elif line in ('f', 'F', 'f*'):
            for path in paths:
                # close the path implicitly
                _close_path(path)

                # fill path
                p = _fill_rect_path(path, WCS, Wcf, matrix)
                res.append(p)

            # reset path
            paths = []

        # close, fill and stroke the path
        # (`op.upper()` makes `B`, `B*`, `b` and `b*` all take this branch)
        elif op.upper() in ('B', 'B*'):
            for path in paths:
                # close path
                _close_path(path)

                # fill path
                p = _fill_rect_path(path, WCS, Wcf, matrix)
                res.append(p)

                # stroke path
                p = _stroke_path(path, WCS, Wcs, Wd, matrix)
                res.append(p)

            # reset path
            paths = []

        # TODO: clip the path
        # https://stackoverflow.com/questions/17003171/how-to-identify-which-clip-paths-apply-to-a-path-or-fill-in-pdf-vector-graphics
        # For now the most recent path is only remembered as working clipping path and
        # all collected paths are discarded. As above, the whole `line` is compared.
        elif line in ('W', 'W*'):
            Wcp = paths[-1] if paths else []
            paths = []

        # end the path without stroking or filling
        elif op == 'n':
            paths = []

    return res
Beispiel #28
0
 def _text_from_page(self, page: fitz.Page) -> str:
     """Return the cleaned-up text of ``page``.

     Only blocks whose last field equals 0 are used. Line breaks inside a
     block become spaces, blocks are joined with CRLF, and the result is
     passed through ``ftfy.fix_text`` with NFKC normalization.
     """
     pieces = []
     for block in page.get_text_blocks():
         if block[-1] != 0:
             continue
         # index 4 holds the text of the block
         pieces.append(block[4].replace("\n", " "))
     joined = "\r\n".join(pieces)
     return ftfy.fix_text(joined, normalization="NFKC")
Beispiel #29
0
def paths_from_annotations(page: fitz.Page):
    ''' Convert annotations (comment shapes) of a PDF page, e.g. Line, Square, Highlight, to paths.
        ---
        Args:
        - page: fitz.Page, current page

        Returns:
        - list: paths created by `_add_stroke_line`, `_add_stroke_rect` and `_add_fill_rect`,
          in annotation order.

        Every shape may carry a stroke (border) and a fill (inner area) property. A square
        annotation having both therefore yields a stroked rectangle plus a filled rectangle.

        Only the annotation types that contribute to text format and table borders are
        processed; all others are skipped:
        - PDF_ANNOT_LINE 3
        - PDF_ANNOT_SQUARE 4
        - PDF_ANNOT_HIGHLIGHT 8
        - PDF_ANNOT_UNDERLINE 9
        - PDF_ANNOT_STRIKEOUT 11

        read more:
            - https://pymupdf.readthedocs.io/en/latest/annot.html
            - https://pymupdf.readthedocs.io/en/latest/vars.html#annotation-types
    '''
    # annotation type codes, see list in the docstring
    LINE, SQUARE, HIGHLIGHT, UNDERLINE, STRIKEOUT = 3, 4, 8, 9, 11

    collected = []
    for annot in page.annots():

        # first item of the type pair, e.g. 8 for (8, 'Highlight')
        kind = annot.type[0]

        # colors, e.g. {'stroke': [1.0, 1.0, 0.0], 'fill': []}; empty -> None
        colors = annot.colors
        stroke_color = utils.RGB_value(colors['stroke']) if colors['stroke'] else None
        fill_color = utils.RGB_value(colors['fill']) if colors['fill'] else None

        # border width: -1 means "not set", fall back to 1.0
        width = annot.border.get('width', 1.0)
        if width == -1:
            width = 1.0

        # bounding box of the annotation
        box = annot.rect

        if kind == LINE:
            # The box keeps a space of 1.5*w around the line itself:
            #
            # +----------------------------+
            # |         space              |
            # |     +--------------+       |
            # |     |   border     | 1.5w  |
            # |     +--------------+       |
            # |         1.5w               |
            # +----------------------------+
            #
            # so the line runs along the vertical middle of the box.
            margin = 1.5 * width
            y_mid = (box.y0 + box.y1) / 2.0
            begin = (box.x0 + margin, y_mid)
            finish = (box.x1 - margin, y_mid)
            collected.append(_add_stroke_line(begin, finish, stroke_color, width))

        elif kind == SQUARE:
            # The box keeps a space of 0.5*w around the border; border and fill area
            # are both derived from the original square:
            #
            # +------------------------------------------+
            # |                space                     |
            # |      +----------------------------+      |
            # |      |         border             |      |
            # |      |     +--------------+       |      |
            # |            |     fill     |  w    | 0.5w |
            # |      |     +--------------+       |      |
            # |      |            w               |      |
            # |      +----------------------------+      |
            # |                  0.5w                    |
            # +------------------------------------------+
            #
            # border: box shrunk by w on every side
            if stroke_color is not None:
                upper_left = (box.x0 + width, box.y0 + width)
                lower_right = (box.x1 - width, box.y1 - width)
                collected.append(_add_stroke_rect(upper_left, lower_right, stroke_color, width))

            # fill area: box shrunk by 1.5*w on every side
            if fill_color is not None:
                inset = 1.5 * width
                upper_left = (box.x0 + inset, box.y0 + inset)
                lower_right = (box.x1 - inset, box.y1 - inset)
                collected.append(_add_fill_rect(upper_left, lower_right, fill_color))

        elif kind in (HIGHLIGHT, UNDERLINE, STRIKEOUT):
            # For these shapes `annot.rect` covers all sub-areas together, which is wrong
            # for markup running over several lines. `annot.vertices` is used instead,
            # i.e. the points marked with `+` below, four per sub-area:
            #          +------------------------+
            #          +------------------------+
            # +-----------+
            # +-----------+
            # NOTE: underline and strikethrough are just lines, but their affected areas
            # are the same as for highlights.
            #
            # https://github.com/pymupdf/PyMuPDF/issues/318
            vertices = annot.vertices
            for group in range(len(vertices) // 4):
                base = 4 * group

                if kind == HIGHLIGHT:
                    # whole area, spanned by first and last point of the group
                    x0, y0 = vertices[base]
                    x1, y1 = vertices[base + 3]

                    # NOTE: PyMuPDF reports the highlight color as stroke color, not fill color
                    collected.append(_add_fill_rect((x0, y0), (x1, y1), stroke_color))
                    continue

                if kind == UNDERLINE:
                    # bottom edge of the area
                    begin, finish = vertices[base + 2], vertices[base + 3]
                else:
                    # strikethrough: horizontal line halfway between top and bottom edge
                    x0, x1 = vertices[base][0], vertices[base + 1][0]
                    y_mid = (vertices[base][1] + vertices[base + 2][1]) / 2.0
                    begin = x0, y_mid
                    finish = x1, y_mid

                collected.append(_add_stroke_line(begin, finish, stroke_color, width))

    return collected
def get_good_rectangles(page: Page) -> List[Rect]:
    """Collect the rectangles on a page that might be redactions.

    A drawing is considered only when its ``fill_opacity`` equals 1 and its
    ``fill`` is not ``None``. From such a drawing every ``"re"`` item is
    taken; it is kept when its bottom edge lies below the header zone
    (``y1 > 43``) and both its height and width exceed 4.

    Every ``"re"`` rectangle of a considered drawing is tagged with the
    ``seqno`` and ``fill`` of that drawing, even when it is filtered out
    afterwards. This allows later code to tell whether a letter is above or
    below a rectangle, or whether it has the same color.

    :param page: The PyMuPDF Page to look for rectangles within.
    :returns: A list of the kept rectangle objects, in drawing order. Empty
        if nothing qualifies.
    """
    header_bottom = 43  # anything ending at or above this y is treated as header
    min_side = 4  # thinner shapes are lines, not boxes

    candidates = []
    for drawing in page.get_drawings():
        # Not opaque (or opacity unknown): probably a highlight or similar.
        if drawing.get("fill_opacity") != 1:
            continue

        # Unfilled box: transparent to the eye, but distinct from shapes
        # that have an opacity of 0.
        if drawing["fill"] is None:
            continue

        # A drawing holds several "draw" commands (rectangles, lines, quads,
        # curves), each a tuple of (type, data...). Rectangles have the type
        # "re" followed by the rectangle object.
        #
        # N.B.: the drawing's own "rect" key is the hull of the whole shape
        # and must not be used. For a multi-line redaction such as
        #
        # +----------------------------------------------------+
        # | some visible letters █████████████████████████████ |
        # | ████████████████████████████████ more letters here |
        # +----------------------------------------------------+
        #
        # the hull would also cover the visible letters before and after the
        # redaction and wrongly flag them. The "items" yield the two actual
        # rectangles instead.
        boxes = [entry[1] for entry in drawing["items"] if entry[0] == "re"]

        for box in boxes:
            # Tag first, so filtered-out rectangles carry the parent's
            # sequence number and color as well.
            box.seqno = drawing["seqno"]
            box.fill = drawing["fill"]

            # Header area: ignore.
            if box.y1 <= header_bottom:
                continue

            # Horizontal lines are too flat, vertical lines (like those
            # along margins) too narrow.
            if not (box.height > min_side and box.width > min_side):
                continue

            if box.is_infinite:
                box.normalize()
            candidates.append(box)

    return candidates