def clip_page(cls, page: fitz.Page, bbox: fitz.Rect = None, zoom: float = 3.0):
    """Render a text-free bitmap of a page region and wrap it as a raw dict.

    Args:
        page (fitz.Page): pdf page to render.
        bbox (fitz.Rect, optional): Region to clip; it is intersected with
            ``page.rect``. Defaults to None, i.e. the entire page.
        zoom (float, optional): Scale factor applied on both axes to raise
            the resolution. Defaults to 3.0.

    Returns:
        dict: Result of ``cls.to_raw_dict`` for the rendered pixmap.

    .. note::
        The content streams of ``page`` are rewritten in the parent document
        and are not restored by this method.
    """
    # Make the text invisible before rendering: the text rendering mode
    # operator ``3 Tr`` (neither fill nor stroke) is injected after every
    # ``BT``, ``Tm`` and ``Td`` byte sequence of each content stream.
    # read more:
    # - https://github.com/pymupdf/PyMuPDF/issues/257
    # - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
    document = page.parent
    for xref in page.get_contents():
        data = document.xrefStream(xref)
        for operator in (b'BT', b'Tm', b'Td'):
            data = data.replace(operator, operator + b' 3 Tr')
        document.updateStream(xref, data)

    # restrict the clip area to the page
    if bbox is None:
        bbox = page.rect
    else:
        bbox = bbox & page.rect

    # a scaling matrix improves the resolution of the rendered pixmap
    # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
    # - https://github.com/pymupdf/PyMuPDF/issues/181
    scaling = fitz.Matrix(zoom, zoom)
    pixmap = page.getPixmap(clip=bbox, matrix=scaling)  # type: fitz.Pixmap
    return cls.to_raw_dict(pixmap, bbox)
def clip_page(cls, page: fitz.Page, bbox: fitz.Rect = None, zoom: float = 3.0):
    '''Render a text-free bitmap of `bbox` (entire page by default) as a raw dict.

    The given `bbox` is intersected with `page.rect`, the page is rendered
    with a `zoom` x `zoom` scaling matrix and the pixmap is passed to
    `cls.to_raw_dict`.

    NOTE: the page content streams are rewritten in the parent document and
    are not restored by this method.
    '''
    # Make the text invisible before rendering: the text rendering mode
    # operator `3 Tr` (neither fill nor stroke) is injected after every
    # `BT`, `Tm` and `Td` byte sequence of each content stream.
    # read more:
    # - https://github.com/pymupdf/PyMuPDF/issues/257
    # - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
    document = page.parent
    for xref in page._getContents():
        data = document._getXrefStream(xref)
        for operator in (b'BT', b'Tm', b'Td'):
            data = data.replace(operator, operator + b' 3 Tr')
        document._updateStream(xref, data)

    # restrict the clip area to the page
    if bbox is None:
        bbox = page.rect
    else:
        bbox = bbox & page.rect

    # a scaling matrix improves the resolution of the rendered pixmap
    # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
    # - https://github.com/pymupdf/PyMuPDF/issues/181
    scaling = fitz.Matrix(zoom, zoom)
    pixmap = page.getPixmap(clip=bbox, matrix=scaling)  # type: fitz.Pixmap
    return cls.to_raw_dict(pixmap, bbox)
def total_image_area(page: fitz.Page) -> int:
    """Return the summed area of every image rectangle found on `page`.

    For each entry of `page.get_images()`, the areas of all rectangles
    reported by `page.get_image_rects()` are added up. Overlap between
    images is not accounted for, so the total may exceed the actual area of
    the page.
    """
    total = 0
    for image in page.get_images():
        for rect in page.get_image_rects(image):  # type: ignore
            total += rect.getArea()
    return total
def extract_images(cls, page: fitz.Page, clip_image_res_ratio: float = 3.0):
    """Extract normal images listed by ``Page.getImageList()``.

    Args:
        page (fitz.Page): pdf page to extract images.
        clip_image_res_ratio (float, optional): Resolution ratio used when an
            image has to be clipped from the rendered page. Defaults to 3.0.

    Returns:
        list: A list of extracted and recovered image raw dicts.

    .. note::
        ``Page.getImageList()`` contains each image only once, which may be
        less than the real count of images in a page.
    """
    document = page.parent
    results = []

    # each entry: (xref, smask, width, height, bpc, colorspace, ...)
    for entry in page.getImageList(full=True):
        # the bbox lookup may raise, so it is always guarded, per
        # https://github.com/pymupdf/PyMuPDF/issues/487
        try:
            entry = list(entry)
            entry[-1] = 0  # the last field is reset before the lookup
            area = page.getImageBbox(entry)
        except ValueError:
            continue

        # ignore images outside page
        if not area.intersects(page.rect):
            continue

        pixmap = cls._recover_pixmap(document, entry)

        # An image consisting of alpha values only has no colorspace: the
        # color shown in the PDF is part of the PDF background rather than of
        # the image, so the page itself is clipped at the image bbox instead.
        # https://github.com/pymupdf/PyMuPDF/issues/677
        if not pixmap.colorspace:
            pixmap = cls._clip_page(page, area, zoom=clip_image_res_ratio)

        results.append(cls._to_raw_dict(pixmap, area))

    return results
def filter_redactions_by_pixmap(
    redactions: List[RedactionType],
    page: Page,
) -> List[RedactionType]:
    """Keep only the candidate redactions that render as a single color

    Each candidate's bounding box is rendered from the page; a candidate is
    kept when the resulting pixmap is unicolor.

    :param redactions: A list of redactions that might be bad
    :param page: The PyMuPDF.Page object where the bad redactions might be
    :return: The candidates whose rendered area is a uniform box, in their
    original order
    """

    def renders_as_uniform_box(candidate: RedactionType) -> bool:
        # The clip is rendered in the RGB colorspace.
        # NOTE(review): an earlier comment mentioned rendering in gray for
        # speed, but the code requests fitz.csRGB.
        rendered = page.get_pixmap(
            colorspace=fitz.csRGB,
            clip=fitz.Rect(candidate["bbox"]),
        )
        # Any variation in pixel color means it is not a uniform box, hence
        # not a bad redaction.
        return rendered.is_unicolor

    return [
        candidate
        for candidate in redactions
        if renders_as_uniform_box(candidate)
    ]
def clip_page(cls, page:fitz.Page, bbox:fitz.Rect, zoom:float=3.0):
    '''Render the `bbox` area of `page` and return it as a raw dict.

    The page is rendered with a `zoom` x `zoom` scaling matrix and the
    pixmap is passed to `cls.to_raw_dict` together with `bbox`.
    '''
    # a scaling matrix improves the resolution of the rendered pixmap
    # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
    # - https://github.com/pymupdf/PyMuPDF/issues/181
    scaling = fitz.Matrix(zoom, zoom)
    pixmap = page.getPixmap(clip=bbox, matrix=scaling)  # type: fitz.Pixmap
    return cls.to_raw_dict(pixmap, bbox)
def init(self, page:fitz.Page) -> Layout:
    '''Build the layout object of `page`, store it in `self._layout` and return it.

    In debug mode the initial text blocks and rectangle shapes are also
    plotted to `self._doc_debug`.
    '''
    # raw dict of the page
    # NOTE: all these coordinates are relative to un-rotated page
    # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
    raw_layout = page.getText('rawdict')

    # The 'width' / 'height' stored in the raw dict are based on the
    # un-rotated page, so they are overwritten with the last two components
    # of `page.rect`, which always reflects page rotation.
    width, height = tuple(page.rect)[-2:]
    raw_layout.update({'width': width, 'height': height})

    layout = Layout(raw_layout, page.rotationMatrix)
    self._layout = layout

    # rectangle shapes from page source
    layout.rects.from_stream(self.doc_pdf, page)

    # annotations (comment shapes) from PDF page, e.g.
    # highlight, underline and strike-through-line
    layout.rects.from_annotations(page)

    if not self.debug_mode:
        return layout

    # new section for current pdf page, followed by the initial layout plots
    new_page_section(self._doc_debug, layout.width, layout.height, f'Page {page.number}')
    layout.plot(self._doc_debug, 'Original Text Blocks', key=PlotControl.LAYOUT)
    layout.plot(self._doc_debug, 'Original Rectangle Shapes', key=PlotControl.SHAPE)
    return layout
def search_in_page(regex: re.Pattern, page: Page) -> List[dict]:
    """Collect every text span of `page` whose text matches `regex`

    Arguments
      regex: compiled pattern, applied with `regex.search` to each span text
      page: page object whose text page is extracted as a dict

    Returns
      a list of the matching span dicts, in document order
    """
    content = page.getTextPage().extractDICT()

    # `get(key, default)` is used throughout to tolerate missing keys.
    #
    # The search is deliberately naive: it works span by span, so it does not
    # handle line breaks or more complex layout. `page.searchFor` might be
    # worth a look, but this should be enough for TeX-generated pdf.
    return [
        span
        for block in content.get('blocks', [])
        for line in block.get('lines', [])
        for span in line.get('spans', [])
        if regex.search(span.get('text', ""))
    ]
def __init__(self, doc: fitz.Document, page: fitz.Page) -> None:
    """Record the page and index its cross-referenced images.

    Every entry of `page.get_images(full=True)` is converted with
    `self._build_xref_image`; entries whose xref is used as the soft mask of
    another entry are left out of `self.xref_images`.
    """
    self.doc = doc
    self._page = page

    # Result of page.bound(); per the original note this is the (already
    # rotated) cropbox, used to identify what part of an image is displayed
    self.bbox = page.bound()

    # Cross-referenced images (masks are ignored since they can't be easily
    # downsampled)
    all_xref_images = [
        self._build_xref_image(entry) for entry in page.get_images(full=True)
    ]
    smask_xrefs = set()
    for xref_image in all_xref_images:
        smask = xref_image['smask']
        if smask:
            smask_xrefs.add(smask)
    self.xref_images = [
        xref_image
        for xref_image in all_xref_images
        if xref_image['xref'] not in smask_xrefs
    ]

    # Lazy, memoized attributes
    # Match block numbers to image hashes
    self._block_hashes = None
def render_fitz_page(
    page: fitz.Page, zoom: float, pixel_ratio: float, clip: fitz.Rect = None
) -> QtGui.QPixmap:
    """Render `page` (optionally only `clip`) into a Qt image object.

    The page is rendered at `zoom * pixel_ratio` on both axes, converted to a
    PIL image and wrapped with `ImageQt.ImageQt`; the device pixel ratio of
    the wrapper is set to `pixel_ratio` before it is returned.

    NOTE(review): the value returned is the `ImageQt.ImageQt` wrapper, while
    the annotation says `QtGui.QPixmap` - confirm which one callers expect.
    """
    factor = zoom * pixel_ratio
    rendered = page.getPixmap(matrix=fitz.Matrix(factor, factor), clip=clip)

    # the sample layout depends on whether the pixmap carries an alpha channel
    if rendered.alpha:
        mode = "RGBA"
    else:
        mode = "RGB"

    pil_image = Image.frombytes(mode, [rendered.width, rendered.height], rendered.samples)
    qt_image = ImageQt.ImageQt(pil_image)
    qt_image.setDevicePixelRatio(pixel_ratio)
    return qt_image
def image_from_page(page: fitz.Page, scale: float = 1) -> Image:
    """Render a page into an image.

    :param page: the page to be represented as an image
    :param scale: the proportion by which to scale the image, applied on
    both axes
    """
    scaling = fitz.Matrix(scale, scale)
    rendered = page.get_pixmap(matrix=scaling)  # type: ignore

    # the sample layout depends on whether the pixmap carries an alpha channel
    if rendered.alpha:
        mode = "RGBA"
    else:
        mode = "RGB"

    size = (rendered.width, rendered.height)
    return Image.frombytes(mode, size, rendered.samples)  # type: ignore
def extract_images(
        cls,
        page: fitz.Page,
        clip_image_res_ratio: float = 3.0  # resolution ratio of clipped bitmap
):
    '''Get image raw dicts based on image contents from `Page.getImageList()`.

    NOTE: `Page.getImageList()` contains each image only once, which may be
    less than the real count of images in a page.
    '''
    document = page.parent
    results = []

    # each entry: (xref, smask, width, height, bpc, colorspace, ...)
    for entry in page.getImageList(full=True):
        # the bbox lookup may raise, so it is always guarded, per
        # https://github.com/pymupdf/PyMuPDF/issues/487
        try:
            entry = list(entry)
            entry[-1] = 0  # the last field is reset before the lookup
            area = page.getImageBbox(entry)
        except ValueError:
            continue

        # ignore images outside page
        if not area.intersects(page.rect):
            continue

        pixmap = ImagesExtractor.recover_pixmap(document, entry)

        # An image consisting of alpha values only has no colorspace: the
        # color shown in the PDF is part of the PDF background rather than of
        # the image, so the page itself is clipped at the image bbox instead.
        # https://github.com/pymupdf/PyMuPDF/issues/677
        results.append(
            cls.to_raw_dict(pixmap, area)
            if pixmap.colorspace
            else cls.clip_page(page, area, zoom=clip_image_res_ratio)
        )

    return results
def to_image(self, page: fitz.Page):
    '''Render the area covered by `self.bbox` and describe it as an image block dict.

    The returned dict has type 1, the bbox as a tuple, the bbox width and
    height, and the PNG data of the page clipped to the bbox.
    '''
    area = self.bbox
    rendered = page.getPixmap(clip=area)

    block = {'type': 1}
    block['bbox'] = tuple(area)
    block['ext'] = 'png'
    block['width'] = area.width
    block['height'] = area.height
    block['image'] = rendered.getImageData(output="png")
    return block
def _hide_page_text(page: fitz.Page):
    '''Make the text of `page` invisible by rewriting its streams in place.

    NOTE: text might exist in both content stream and form object stream
      - content stream, i.e. direct page content
      - form object, i.e. contents referenced by this page
    so the streams of both are updated in the parent document.
    '''
    # xrefs of the form objects first, then those of the page contents
    targets = [xref for (xref, _name, _invoker, _bbox) in page.get_xobjects()]
    targets += page.get_contents()

    # The text rendering mode operator `3 Tr` (neither fill nor stroke the
    # text -> invisible) is injected after every `BT`, `Tm` and `Td` byte
    # sequence.
    # read more:
    # - https://github.com/pymupdf/PyMuPDF/issues/257
    # - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
    document = page.parent  # type: fitz.Document
    for xref in targets:
        data = document.xref_stream(xref)
        for operator in (b'BT', b'Tm', b'Td'):
            data = data.replace(operator, operator + b' 3 Tr')
        document.update_stream(xref, data)
def init(self, page: fitz.Page) -> Layout:
    '''Build the layout object of `page`, store it in `self._layout` and return it.

    Rectangle shapes are read from every content stream of the page and from
    its annotations. In debug mode the initial text blocks and rectangle
    shapes are also plotted to `self._doc_debug`.
    '''
    # layout object based on raw dict
    layout = Layout(page.getText('rawdict'))
    self._layout = layout

    # Rectangle shapes from page source: per the original note these shapes
    # are generally converted from docx, e.g. highlight, underline, which are
    # different from PDF comments like highlight, rectangle.
    if not page._isWrapped:
        page._wrapContents()

    # transformation matrix from PDF to PyMuPDF
    pdf_to_fitz = page.transformationMatrix  # PyMuPDF>=1.17.0

    for xref in page.getContents():
        raw_stream = self._doc_pdf._getXrefStream(xref)
        layout.rects.from_stream(raw_stream.decode(encoding="ISO-8859-1"), pdf_to_fitz)

    # Annotations (comment shapes) from PDF page: per the original note only
    # highlight, underline and strike-through-line are considered.
    layout.rects.from_annotations(page.annots())

    if not self.debug_mode:
        return layout

    # new section for current pdf page, followed by the initial layout plots
    new_page_section(self._doc_debug, layout.width, layout.height, f'Page {page.number}')
    layout.plot(self._doc_debug, 'Original Text Blocks', key=PlotControl.LAYOUT)
    layout.plot(self._doc_debug, 'Original Rectangle Shapes', key=PlotControl.SHAPE)
    return layout
def extract_images(cls, page:fitz.Page, clip_image_res_ratio:float=3.0):
    '''Get images from current page.

    Args:
        page (fitz.Page): pdf page to extract images.
        clip_image_res_ratio (float, optional): Resolution ratio used when an
            image has to be clipped from the rendered page. Defaults to 3.0.

    Returns:
        list: A list of image raw dicts. Images whose bbox lookup raises
        ``ValueError`` are skipped.
    '''
    # pdf document
    doc = page.parent

    # check each image item:
    # (xref, smask, width, height, bpc, colorspace, ...)
    images = []
    for item in page.getImageList(full=True):
        # The bbox lookup may raise for some images; it should always be
        # wrapped in a try-except clause, per
        # https://github.com/pymupdf/PyMuPDF/issues/487
        # It is done before recovering the pixmap so that no work is spent on
        # an image that is skipped anyway.
        try:
            bbox = page.getImageBbox(item[7])  # item[7]: name entry of such an item
        except ValueError:
            continue

        pix = recover_pixmap(doc, item)

        # An image consisting of alpha values only has no colorspace: the
        # color shown in the PDF is part of the PDF background rather than of
        # the image, so the page itself is clipped at the image bbox instead.
        # https://github.com/pymupdf/PyMuPDF/issues/677
        if not pix.colorspace:
            raw_dict = cls.clip_page(page, bbox, zoom=clip_image_res_ratio)
        else:
            raw_dict = cls.to_raw_dict(pix, bbox)
        images.append(raw_dict)

    return images
def _analyze_page(self, page: fitz.Page):
    """Analyzes `page` and records the data extracted from it.

    The text embedded in the page is used as-is when the ratio of image area
    to page area is below `self.image_area_thresh` and the text does not
    contain more than `self.max_unreadable` replacement characters. Otherwise
    the page goes through `self._run_ocr`; a warning is issued when the mean
    confidence of the result is below `self.coarse_thresh`, and the result is
    recorded either way.
    """
    embedded_text = page.get_text()  # type: ignore
    image_fraction = total_image_area(page) / page.bound().getArea()

    if (
        image_fraction < self.image_area_thresh
        and not embedded_text.count('�') > self.max_unreadable
    ):
        # the embedded text is good enough: no OCR data for this page
        metadata = orientation_used = scale = None
        language = detected_language(embedded_text)
        self.texts.append(embedded_text)
        self.mean_confidences.append(None)
        used_original_text = True
    else:
        # the language is detected from the embedded text only when there is
        # enough of it; otherwise the first known language is used
        if len(embedded_text) >= self.text_len_thresh:
            language_hint = detected_language(embedded_text)
        else:
            language_hint = self.languages.items[0]
        metadata, orientation_used, language, scale = self._run_ocr(
            page, language_hint
        )
        if mean_conf(metadata) < self.coarse_thresh:
            warnings.warn('Failed to analyze image.')
        if 'corrected' in metadata.columns:
            recognized = metadata.corrected
        else:
            recognized = metadata.text
        self.texts.append(data_to_string(recognized))
        self.mean_confidences.append(mean_conf(metadata))
        used_original_text = False

    # per-page bookkeeping, identical for both branches
    self.languages.add_weight(language)
    self.metadata.append(metadata)
    self.orientations.append(orientation_used)
    self.page_languages.append(language)
    self.used_original_texts.append(used_original_text)
    self.times.append(time.time())
    self.scales.append(scale)
def _hide_page_text(cls, page: fitz.Page):
    """Make the text of a page invisible by rewriting its content streams.

    Args:
        page (fitz.Page): pdf page whose content streams are updated in the
            parent document.
    """
    # The text rendering mode operator ``3 Tr`` (neither fill nor stroke the
    # text -> invisible) is injected after every ``BT``, ``Tm`` and ``Td``
    # byte sequence.
    # read more:
    # - https://github.com/pymupdf/PyMuPDF/issues/257
    # - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
    document = page.parent
    for xref in page.get_contents():
        data = document.xrefStream(xref)
        for operator in (b'BT', b'Tm', b'Td'):
            data = data.replace(operator, operator + b' 3 Tr')
        document.updateStream(xref, data)
def initialize(self, page: fitz.Page):
    '''Build the layout object of `page`, store it in `self._layout` and return it.

    The raw dict of the page is completed with re-extracted images and with
    vector graphic paths before the layout is created.
    '''
    # -----------------------------------------
    # raw dict
    # -----------------------------------------
    # NOTE: all these coordinates are relative to un-rotated page
    # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
    raw_layout = page.getText('rawdict')
    blocks = raw_layout['blocks']

    # -----------------------------------------
    # page size
    # -----------------------------------------
    # The 'width' / 'height' stored in the raw dict are based on the
    # un-rotated page, so they are overwritten with the last two components
    # of `page.rect`, which always reflects page rotation.
    width, height = tuple(page.rect)[-2:]
    raw_layout.update({'width': width, 'height': height})

    # -----------------------------------------
    # page images
    # -----------------------------------------
    # Image bytes from page.getText('rawdict') can't reproduce transparent
    # images: the image blocks of the raw dict are disabled (type -1) and
    # the page images are re-extracted.
    for block in blocks:
        if block['type'] == 1:
            block['type'] = -1
    blocks.extend(ImagesExtractor.extract_images(page))

    # -----------------------------------------
    # page paths
    # -----------------------------------------
    # vector graphic paths, together with the images converted from them
    self._paths_extractor = PathsExtractor()
    path_images, paths = self._paths_extractor.extract_paths(page)
    blocks.extend(path_images)
    raw_layout['paths'] = paths

    # init layout
    self._layout = Layout(raw_layout, page.rotationMatrix)
    return self._layout
def _page_blocks(page:fitz.Page):
    '''Get the raw dict of `page` with its image blocks adjusted.

    Image blocks of the raw dict are generated for every image location,
    whether or not there are any duplicates, in contrast to
    `Page.getImageList()`, which contains each image only once.
    https://pymupdf.readthedocs.io/en/latest/textpage.html#dictionary-structure-of-extractdict-and-extractrawdict

    So, a compromise:
      - image contents come from `ImagesExtractor.extract_images`
      - image locations come from `page.getText('rawdict')`
    '''
    # NOTE: all these coordinates are relative to un-rotated page
    # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
    raw_layout = page.getText('rawdict')

    # extract and recover images
    recovered_images = ImagesExtractor.extract_images(page)

    # group the original image blocks by the hash of their image contents
    groups = defaultdict(list)
    for block in raw_layout['blocks']:
        if block['type'] == 1:
            groups[hash(block['image'])].append(block)

    def rounded_bbox(img):
        return [round(value) for value in img['bbox']]

    def located_in(img, candidates):
        # True if any candidate has the same rounded bbox as `img`
        target = rounded_bbox(img)
        return any(rounded_bbox(candidate) == target for candidate in candidates)

    for image in recovered_images:
        # first group holding a block at the location of the recovered image
        matched = next(
            (group for group in groups.values() if located_in(image, group)),
            None,
        )

        # an image outside the page is not counted in page.getText(), so it
        # is added here
        if matched is None:
            raw_layout['blocks'].append(image)
            continue

        # every block of the group receives the recovered contents
        for image_block in matched:
            image_block['image'] = image['image']

    return raw_layout
def initialize(self, page:fitz.Page):
    '''Build the layout object of `page`, store it in `self._layout` and return it.

    Rectangle shapes are read from the page source and from the page
    annotations.
    '''
    # raw dict of the page
    # NOTE: all these coordinates are relative to un-rotated page
    # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
    raw_layout = page.getText('rawdict')

    # The 'width' / 'height' stored in the raw dict are based on the
    # un-rotated page, so they are overwritten with the last two components
    # of `page.rect`, which always reflects page rotation.
    width, height = tuple(page.rect)[-2:]
    raw_layout.update({'width': width, 'height': height})

    layout = Layout(raw_layout, page.rotationMatrix)
    self._layout = layout

    # rectangle shapes from page source
    layout.rects.from_stream(self.doc_pdf, page)

    # annotations (comment shapes) from PDF page, e.g.
    # highlight, underline and strike-through-line
    layout.rects.from_annotations(page)

    return layout
def initialize(self, page:fitz.Page):
    '''Build the layout object of `page`, store it in `self._layout` and return it.

    The raw dict of the page is completed with the pdf paths and the images
    converted from them before the layout is created.
    '''
    # raw dict of the page
    # NOTE: all these coordinates are relative to un-rotated page
    # https://pymupdf.readthedocs.io/en/latest/page.html#modifying-pages
    raw_layout = page.getText('rawdict')

    # The 'width' / 'height' stored in the raw dict are based on the
    # un-rotated page, so they are overwritten with the last two components
    # of `page.rect`, which always reflects page rotation.
    width, height = tuple(page.rect)[-2:]
    raw_layout.update({'width': width, 'height': height})

    # pdf paths and converted images
    self._paths = PathsExtractor()
    parsed = self._paths.parse(page)
    path_images, paths = parsed.filter_pixmaps(page)
    raw_layout['blocks'].extend(path_images)
    raw_layout['paths'] = paths

    # init layout
    self._layout = Layout(raw_layout, page.rotationMatrix)
    return self._layout
def get_intersecting_chars(page: Page, rectangles: List[Rect]) -> List[CharDictType]:
    """Get the chars that are occluded by the rectangles

    We do this in two stages. First, we check for intersecting spans, then we
    check for intersecting chars within those spans, so the chars of a span
    that does not intersect any rectangle are never inspected.

    :param page: The PyMuPDF.Page object to inspect
    :param rectangles: A list of PyMuPDF.Rect objects from the page (aka the
    redactions).
    :return A list of dicts, one per character under the rectangles, holding
    the character rect ("rect") and the character itself ("c")
    """
    if not rectangles:
        return []

    def tagged_rect(bbox, seqno, color):
        # Rect carrying the sequence number and color of its span
        rect = fitz.Rect(bbox)
        rect.seqno = seqno
        rect.fill = color
        return rect

    found = []
    for span in page.get_texttrace():
        seqno = span["seqno"]
        color = span["color"]

        # stage 1: the span as a whole
        if not intersects(tagged_rect(span["bbox"], seqno, color), rectangles):
            continue

        # stage 2: the individual chars of the span
        for char in span["chars"]:
            char_rect = tagged_rect(char[3], seqno, color)
            if not intersects(char_rect, rectangles, occlusion_threshold=0.8):
                continue
            found.append({"rect": char_rect, "c": chr(char[0])})

    return found
def extract_vector_graphics(cls, page: fitz.Page, exclude_areas: list, clip_image_res_ratio: float = 3.0):
    """Detect vector graphics and extract them by clipping the associated page area.

    Args:
        page (fitz.Page): pdf page to extract images.
        exclude_areas (list): A list of bbox-like ``(x0, y0, x1, y1)`` area to
            exclude, e.g. raster image area, table area.
        clip_image_res_ratio (float, optional): Resolution ratio of clipped
            bitmap. Defaults to 3.0.

    Returns:
        list: A list of image raw dicts, one per group of contours.

    .. note::
        Per the original note, contours for vector graphics are detected
        first with ``opencv-python``.
    """
    # find contours
    contours = cls._detect_svg_contours(page, exclude_areas)

    # group the contours: two contours are related through the intersection
    # of their bboxes
    def bbox_intersection(a, b):
        return a.bbox & b.bbox

    def clip_group(group):
        # render the page area covered by the group
        area = group.bbox
        scaling = fitz.Matrix(clip_image_res_ratio, clip_image_res_ratio)
        pixmap = page.getPixmap(clip=area, matrix=scaling)
        return cls._to_raw_dict(pixmap, area)

    return [clip_group(group) for group in contours.group(bbox_intersection)]
def _clip_page(cls, page: fitz.Page, bbox: fitz.Rect = None, zoom: float = 3.0):
    """Render a text-free pixmap of a page region.

    Args:
        page (fitz.Page): pdf page to render; its text is hidden first with
            ``cls._hide_page_text``.
        bbox (fitz.Rect, optional): Region to clip; it is intersected with
            ``page.rect``. Defaults to None, i.e. the entire page.
        zoom (float, optional): Scale factor applied on both axes to raise
            the resolution. Defaults to 3.0.

    Returns:
        fitz.Pixmap: The extracted pixmap.
    """
    # hide text
    cls._hide_page_text(page)

    # restrict the clip area to the page
    if bbox is None:
        bbox = page.rect
    else:
        bbox = bbox & page.rect

    # a scaling matrix improves the resolution of the rendered pixmap
    # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
    # - https://github.com/pymupdf/PyMuPDF/issues/181
    scaling = fitz.Matrix(zoom, zoom)
    return page.getPixmap(clip=bbox, matrix=scaling)  # type: fitz.Pixmap
def init(self, page:fitz.Page) -> Layout:
    '''Build the layout object of `page`, store it in `self._layout` and return it.

    The raw dict of the page goes through `split_blocks` before the layout is
    created. In debug mode the initial text blocks and rectangle shapes are
    also plotted to `self._doc_debug`.
    '''
    # layout object based on the (split) raw dict
    raw_layout = page.getText('rawdict')
    layout = Layout(split_blocks(raw_layout))
    self._layout = layout

    # rectangle shapes from page source
    layout.rects.from_stream(self.doc_pdf, page)

    # annotations (comment shapes) from PDF page, e.g.
    # highlight, underline and strike-through-line
    layout.rects.from_annotations(page)

    if not self.debug_mode:
        return layout

    # new section for current pdf page, followed by the initial layout plots
    new_page_section(self._doc_debug, layout.width, layout.height, f'Page {page.number}')
    layout.plot(self._doc_debug, 'Original Text Blocks', key=PlotControl.LAYOUT)
    layout.plot(self._doc_debug, 'Original Rectangle Shapes', key=PlotControl.SHAPE)
    return layout
def paths_from_stream(page: fitz.Page):
    ''' Get paths, e.g. highlight, underline and table borders, from page source contents.
        ---
        Args:
          - page: fitz.Page, current page. NOTE: `page.cleanContents()` is called on it.

        Returns a list collecting the results of `_stroke_path` (stroked paths) and
        `_fill_rect_path` (filled paths), in the order the painting operators appear.

        The page source is represented as contents of stream object. For example,
        ```
            /P<</MCID 0>> BDC
            ...
            1 0 0 1 90.0240021 590.380005 cm
            ...
            1 1 0 rg # or 0 g
            ...
            285.17 500.11 193.97 13.44 re f*
            ...
            214 320 m
            249 322 l
            426 630 425 630 422 630 c
            ...
            EMC
        ```
        where,
        - `cm` specify a coordinate system transformation, here (0,0) translates to (90.0240021 590.380005)
        - `q`/`Q` save/restores graphic status
        - `rg` / `g` specify color mode: rgb / grey
        - `re`, `f` or `f*`: fill rectangle path with pre-defined color
        - `m` (move to) and `l` (line to) defines a path
        - `c` draw cubic Bezier curve with given control points

        In this case,
        - a rectangle with:
            - fill color is yellow (1,1,0)
            - lower left corner: (285.17 500.11)
            - width: 193.97
            - height: 13.44
        - a line from (214, 320) to (249, 322)
        - a Bezier curve with control points (249,322), (426,630), (425,630), (422,630)

        Read more:
        - https://github.com/pymupdf/PyMuPDF/issues/263
        - https://github.com/pymupdf/PyMuPDF/issues/225
        - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdf_reference_archive/pdf_reference_1-7.pdf
    '''
    # Each object in PDF has a cross-reference number (xref):
    # - to get its source contents: `doc.xrefObject()` or low level API `doc._getXrefString()`;
    #   but for stream objects, only the non-stream part is returned
    # - to get the stream data: `doc.xrefStream(xref)` or low level API `doc._getXrefStream(xref)`
    # - the xref for a page object itself: `page.xref`
    # - all stream xref contained in one page: `page.getContents()`
    # - combine all stream object contents together: `page.readContents()` with PyMuPDF>=1.17.0
    #
    # Clean contents first:
    # syntactically correct, standardize and pretty print the contents stream
    page.cleanContents()
    xref_stream = page.readContents().decode(encoding="ISO-8859-1")

    # transformation matrix for coordinate system conversion from pdf to fitz
    # NOTE: transformation matrix converts PDF CS to UNROTATED PyMuPDF page CS,
    #       so need further rotation transformation to the real page CS (applied in Object BBox)
    # https://github.com/pymupdf/PyMuPDF/issues/619
    matrix = page.transformationMatrix

    # Graphic States: working CS is coincident with the absolute origin (0, 0)
    # Refer to PDF reference v1.7 4.2.3 Transformation Matrices
    #                        | a b 0 |
    # [a, b, c, d, e, f] =>  | c d 0 |
    #                        | e f 1 |
    # ACS is the stack of saved matrices (pushed by `q`, popped by `Q`),
    # WCS is the working matrix.
    ACS = [fitz.Matrix(0.0)] # identity matrix
    WCS = fitz.Matrix(0.0)

    # Graphics color:
    # - color space: PDF Reference Section 4.5 Color Spaces
    # NOTE: it should have to calculate color value under arbitrary color space, but it's really hard work for now.
    # So, consider device color space only like DeviceGray, DeviceRGB, DeviceCMYK, and set black for all others.
    device_space = True
    color_spaces = _check_device_cs(page)

    # - stroking color
    Acs = [utils.RGB_value((0.0, 0.0, 0.0))] # stored value -> stack
    Wcs = Acs[0]                               # working value
    # - filling color
    Acf = [utils.RGB_value((0.0, 0.0, 0.0))]
    Wcf = Acf[0]

    # Stroke width: stack of stored values and working value
    Ad = [0.0]
    Wd = Ad[0]

    # collecting paths: each path is a list of points
    paths = []           # a list of path

    # clip path
    # NOTE: the clipping path is only tracked (saved / restored / set); it is
    # not applied to any result within this function.
    Acp = [] # stored clipping path
    Wcp = [] # working clipping path

    # Check line by line
    # Cleaned by `page.cleanContents()`, operator and operand are aligned in a same line;
    # otherwise, have to check stream contents word by word (line always changes)
    lines = xref_stream.splitlines()

    res = [] # final results
    for line in lines:

        words = line.split()
        if not words: continue

        op = words[-1] # operator always at the end after page.cleanContents()

        # -----------------------------------------------
        # Color Operators: PDF Reference Table 4.24
        # -----------------------------------------------
        # Upper-case operators set the stroking color, lower-case ones the
        # non-stroking (filling) color; both are matched via `op.upper()`.
        #
        # - set color space:
        #   color_space_name cs  # specify color space
        #   c1 c2 ... SC/SCN     # components under defined color space
        if op.upper() == 'CS':
            # both working colors are reset to black
            Wcs = utils.RGB_value((0.0, 0.0, 0.0))
            Wcf = utils.RGB_value((0.0, 0.0, 0.0))

            # Consider normal device cs only
            device_space = color_spaces.get(words[0], False)

        # - set color: color components under specified color space
        elif op.upper() == 'SC': # c1 c2 ... cn SC
            c = _RGB_from_color_components(words[0:-1], device_space)
            # non-stroking color
            if op == 'sc':
                Wcf = c
            # stroking color
            else:
                Wcs = c

        # - set color: color components under specified color space
        elif op.upper() == 'SCN': # c1 c2 ... cn [name] SCN
            # the optional name operand, if present, is left out of the components
            if utils.is_number(words[-2]):
                c = _RGB_from_color_components(words[0:-1], device_space)
            else:
                c = _RGB_from_color_components(words[0:-2], device_space)

            # non-stroking color
            if op == 'scn':
                Wcf = c
            # stroking color
            else:
                Wcs = c

        # - DeviceGray space, equal to:
        # /DeviceGray cs
        # c sc
        elif op.upper() == 'G':  # 0 g
            g = float(words[0])
            # nonstroking color, i.e. filling color here
            if op == 'g':
                Wcf = utils.RGB_value((g, g, g))
            # stroking color
            else:
                Wcs = utils.RGB_value((g, g, g))

        # - DeviceRGB space
        elif op.upper() == 'RG': # 1 1 0 rg
            r, g, b = map(float, words[0:-1])

            # nonstroking color
            if op == 'rg':
                Wcf = utils.RGB_value((r, g, b))
            # stroking color
            else:
                Wcs = utils.RGB_value((r, g, b))

        # - DeviceCMYK space
        elif op.upper() == 'K': # c m y k K
            c, m, y, k = map(float, words[0:-1])
            # nonstroking color
            if op == 'k':
                Wcf = utils.CMYK_to_RGB(c, m, y, k, cmyk_scale=1.0)
            # stroking color
            else:
                Wcs = utils.CMYK_to_RGB(c, m, y, k, cmyk_scale=1.0)

        # -----------------------------------------------
        # Graphics State Operators: PDF References Table 4.7
        # -----------------------------------------------
        # CS transformation: a b c d e f cm, e.g.
        # 0.05 0 0 -0.05 0 792 cm
        # refer to PDF Reference 4.2.2 Common Transformations for detail
        elif op == 'cm':
            # update working CS
            components = list(map(float, words[0:-1]))
            Mt = fitz.Matrix(*components)
            WCS = Mt * WCS # M' = Mt x M

        # stroke width
        elif op == 'w': # 0.5 w
            Wd = float(words[0])

        # save or restore graphics state:
        # only transformation, colors, stroke width and clipping path are considered here
        elif op == 'q': # save
            ACS.append(fitz.Matrix(WCS)) # copy as new matrix
            Acf.append(Wcf)
            Acs.append(Wcs)
            Ad.append(Wd)
            Acp.append(Wcp)

        elif op == 'Q': # restore
            WCS = fitz.Matrix(ACS.pop()) # copy as new matrix
            Wcf = Acf.pop()
            Wcs = Acs.pop()
            Wd = Ad.pop()
            Wcp = Acp.pop()

        # -----------------------------------------------
        # Path Construction Operators: PDF References Table 4.9
        # -----------------------------------------------
        # rectangle block:
        # x y w h re is equivalent to
        # x   y   m
        # x+w y   l
        # x+w y+h l
        # x   y+h l
        # h          # close the path
        elif op == 're':
            # ATTENTION:
            # top/bottom, left/right is relative to the positive direction of CS,
            # while a reverse direction may be performed, so be careful when calculating
            # the corner points.
            # Coordinates in the transformed PDF CS:
            #   y1 +----------+
            #      |          | h
            #   y0 +----w-----+
            #      x0        x1
            #

            # (x, y, w, h) before this line
            x0, y0, w, h = map(float, words[0:-1])
            path = []
            path.append((x0, y0))
            path.append((x0 + w, y0))
            path.append((x0 + w, y0 + h))
            path.append((x0, y0 + h))
            path.append((x0, y0))   # back to the start point: closed path

            paths.append(path)

        # path: m -> move to point to start a path
        elif op == 'm': # x y m
            x0, y0 = map(float, words[0:-1])
            paths.append([(x0, y0)])

        # path: l -> straight line to point
        # (appended to the most recently started path)
        elif op == 'l': # x y l
            x0, y0 = map(float, words[0:-1])
            paths[-1].append((x0, y0))

        # path: c -> cubic Bezier curve with control points
        elif op in ('c', 'v', 'y'):
            coords = list(map(float, words[0:-1]))
            P = [(coords[i], coords[i + 1]) for i in range(0, len(coords), 2)]
            # current point: last point of the most recently started path
            x0, y0 = paths[-1][-1]

            # x1 y1 x2 y2 x3 y3 c -> (x1,y1), (x2,y2) as control points
            if op == 'c':
                P.insert(0, (x0, y0))

            # x2 y2 x3 y3 v -> (x0,y0), (x2,y2) as control points
            elif op == 'v':
                P.insert(0, (x0, y0))
                P.insert(0, (x0, y0))

            # x1 y1 x3 y3 y -> (x1,y1), (x3,y3) as control points
            else:
                P.insert(0, (x0, y0))
                P.append(P[-1])

            # calculate points on Bezier points with parametric equation
            bezier = _bezier_paths(P, segments=5)
            paths[-1].extend(bezier)

        # close the path
        # NOTE(review): every collected path is passed to `_close_path`, not
        # only the most recent one - confirm this is intended.
        elif op == 'h':
            for path in paths: _close_path(path)

        # -----------------------------------------------
        # Path-painting Operators: PDF Reference Table 4.10
        # -----------------------------------------------
        # close and stroke the path
        elif op.upper() == 'S':
            # close
            if op == 's':
                for path in paths: _close_path(path)

            # stroke path
            for path in paths:
                p = _stroke_path(path, WCS, Wcs, Wd, matrix)
                res.append(p)

            # reset path
            paths = []

        # fill the path
        # NOTE(review): this branch compares the whole `line` rather than
        # `op`; presumably equivalent after `page.cleanContents()` since these
        # operators take no operands - confirm.
        elif line in ('f', 'F', 'f*'):
            for path in paths:
                # close the path implicitly
                _close_path(path)

                # fill path
                p = _fill_rect_path(path, WCS, Wcf, matrix)
                res.append(p)

            # reset path
            paths = []

        # close, fill and stroke the path
        elif op.upper() in ('B', 'B*'):
            for path in paths:
                # close path
                _close_path(path)

                # fill path
                p = _fill_rect_path(path, WCS, Wcf, matrix)
                res.append(p)

                # stroke path
                p = _stroke_path(path, WCS, Wcs, Wd, matrix)
                res.append(p)

            # reset path
            paths = []

        # TODO: clip the path
        # https://stackoverflow.com/questions/17003171/how-to-identify-which-clip-paths-apply-to-a-path-or-fill-in-pdf-vector-graphics
        # NOTE(review): as for the fill branch, the whole `line` is compared
        # rather than `op`.
        elif line in ('W', 'W*'):
            Wcp = paths[-1] if paths else []
            paths = []

        # end the path without stroking or filling
        elif op == 'n':
            paths = []

    return res
def _text_from_page(self, page: fitz.Page) -> str:
    """Return the text of ``page`` as CRLF-separated blocks, cleaned by ftfy.

    Args:
        page (fitz.Page): page whose text blocks are read.

    Returns:
        str: one entry per kept block, joined with ``"\\r\\n"``, then passed
        through ``ftfy.fix_text`` with NFKC normalization.
    """
    flattened = []
    for block in page.get_text_blocks():
        # Keep only blocks whose last field is 0.
        # NOTE(review): presumably PyMuPDF's block type, 0 = text - confirm.
        if block[-1] != 0:
            continue
        # Field 4 holds the block's text; collapse its line breaks to spaces
        # so each block ends up on a single output line.
        flattened.append(block[4].replace("\n", " "))

    joined = "\r\n".join(flattened)
    return ftfy.fix_text(joined, normalization="NFKC")
def paths_from_annotations(page: fitz.Page):
    '''Get shapes, e.g. Line, Square, Highlight, from annotations (comment shapes) in PDF page.
        ---
        Args:
          - page: fitz.Page, current page

        Returns:
          list of whatever `_add_stroke_line`, `_add_stroke_rect` and `_add_fill_rect`
          return, in annotation order.

        There are stroke and fill properties for each shape, representing border and filling
        area respectively. So, a square annotation with both stroke and fill will be converted
        to five rectangles here: four borders and one filling area.

        read more:
            - https://pymupdf.readthedocs.io/en/latest/annot.html
            - https://pymupdf.readthedocs.io/en/latest/vars.html#annotation-types
    '''
    res = []
    for annot in page.annots():

        # annot type, e.g. (8, 'Highlight')
        key = annot.type[0]

        # color, e.g. {'stroke': [1.0, 1.0, 0.0], 'fill': []}
        c = annot.colors
        sc = utils.RGB_value(c['stroke']) if c['stroke'] else None
        fc = utils.RGB_value(c['fill']) if c['fill'] else None

        # width
        w = annot.border.get('width', 1.0)  # width=-1 if not set
        w = 1.0 if w == -1 else w           # 1.0 by default

        # bbox
        rect = annot.rect

        # considering the contributions to text format and table borders,
        # only the following types are processed.
        # PDF_ANNOT_LINE 3
        # PDF_ANNOT_SQUARE 4
        # PDF_ANNOT_HIGHLIGHT 8
        # PDF_ANNOT_UNDERLINE 9
        # PDF_ANNOT_STRIKEOUT 11

        # Line: a space of 1.5*w around each border
        #
        # +----------------------------+
        # |         space              |
        # |     +--------------+       |
        # |     |   border     | 1.5w  |
        # |     +--------------+       |
        # |         1.5w               |
        # +----------------------------+
        #
        if key == 3:
            x0 = rect.x0 + 1.5 * w
            x1 = rect.x1 - 1.5 * w
            y0 = y1 = (rect.y0 + rect.y1) / 2.0
            path = _add_stroke_line((x0, y0), (x1, y1), sc, w)
            res.append(path)

        # Square: a space of 0.5*w around each border
        # border rects and filling rects are to be extracted from original square
        #
        # +------------------------------------------+
        # |                space                     |
        # |      +----------------------------+      |
        # |      |         border             |      |
        # |      |     +--------------+       |      |
        # |      |     |     fill     |  w    | 0.5w |
        # |      |     +--------------+       |      |
        # |      |            w               |      |
        # |      +----------------------------+      |
        # |                  0.5w                    |
        # +------------------------------------------+
        #
        elif key == 4:
            # stroke rectangles
            if sc is not None:
                x0, y0 = rect.x0 + w, rect.y0 + w
                x1, y1 = rect.x1 - w, rect.y1 - w
                path = _add_stroke_rect((x0, y0), (x1, y1), sc, w)
                res.append(path)

            # fill rectangle
            if fc is not None:
                d = 1.5 * w
                x0, y0 = rect.x0 + d, rect.y0 + d
                x1, y1 = rect.x1 - d, rect.y1 - d
                path = _add_fill_rect((x0, y0), (x1, y1), fc)
                res.append(path)

        # highlight, underline, strikethrough: no space
        # For these shapes, `annot.rect` is a combination of all sub-highlights, especially
        # when the highlight is continuous over multiple lines.
        # So, `annot.vertices` should be used here, i.e. vertices marked with `+` below.
        #          +------------------------+
        #          +------------------------+
        # +-----------+
        # +-----------+
        # NOTE: Though underline and strikethrough are just lines, the affected areas are same as
        # highlights, as illustrated above.
        #
        # https://github.com/pymupdf/PyMuPDF/issues/318
        #
        elif key in (8, 9, 11):
            # guard against a missing vertex list so such an annotation is
            # skipped instead of raising on `len(None)`
            points = annot.vertices or []

            # four points in a group; an incomplete trailing group is ignored
            for i in range(len(points) // 4):
                p0, p1, p2, p3 = points[4 * i: 4 * i + 4]

                # highlight: whole bbox
                if key == 8:
                    x0, y0 = p0
                    x1, y1 = p3

                    # NOTE: this is indeed a stroke for PyMuPDF -> no fill color but stroke color !!
                    path = _add_fill_rect((x0, y0), (x1, y1), sc)
                    res.append(path)

                else:
                    # underline: bottom edge
                    if key == 9:
                        start, end = p2, p3

                    # strikethrough: average of top and bottom edge
                    else:
                        x0, x1 = p0[0], p1[0]
                        y_ = (p0[1] + p2[1]) / 2.0
                        start = x0, y_
                        end = x1, y_

                    path = _add_stroke_line(start, end, sc, w)
                    res.append(path)

    return res
def get_good_rectangles(page: Page) -> List[Rect]:
    """Collect filled, fully opaque rectangles that could be redactions.

    Every rectangle taken from a qualifying drawing is tagged in place with
    two extra attributes copied from its parent drawing: ``seqno`` and
    ``fill``. Callers can use these later to compare a rectangle with
    nearby text.

    :param page: The PyMuPDF Page whose drawings are inspected.
    :returns: The rectangles that pass every filter, in drawing order; an
        empty list when none qualify.
    """
    # Rectangles whose bottom edge is at or above this y value are treated
    # as page-header material.
    header_bottom = 43
    # Both sides must exceed this, which drops thin horizontal and vertical
    # rules such as margin lines.
    min_side = 4

    candidates = []
    for shape in page.get_drawings():
        # Skip drawings that are not fully opaque (missing opacity counts as
        # not opaque; probably a highlight or similar) and drawings with no
        # fill at all.
        if shape.get("fill_opacity") != 1 or shape["fill"] is None:
            continue

        # A drawing's "items" are tuples of (type, payload, ...); for type
        # "re" the payload is the rectangle itself. The drawing-level "rect"
        # key is deliberately not used: it is the outer bounds of the whole
        # shape, so for a redaction that wraps across lines it would also
        # cover the visible text before and after it:
        #
        #   +----------------------------------------------------+
        #   | some visible letters █████████████████████████████ |
        #   | ████████████████████████████████ more letters here |
        #   +----------------------------------------------------+
        #
        # The per-item rectangles give the two bars separately.
        boxes = [entry[1] for entry in shape["items"] if entry[0] == "re"]

        for box in boxes:
            # Tag first, so even rectangles rejected below carry the
            # parent's sequence number and fill color.
            box.seqno = shape["seqno"]
            box.fill = shape["fill"]

            if box.y1 <= header_bottom:
                continue

            if box.height > min_side and box.width > min_side:
                if box.is_infinite:
                    box.normalize()
                candidates.append(box)

    return candidates