def clip_page(cls, page: fitz.Page, bbox: fitz.Rect = None, zoom: float = 3.0):
    """Clip page pixmap (without text) according to ``bbox``.

    Note:
        Text is hidden by rewriting the page's content streams in the parent
        document; the streams are not restored after rendering.

    Args:
        page (fitz.Page): pdf page to extract.
        bbox (fitz.Rect, optional): Target area to clip. Defaults to None, i.e. entire page.
        zoom (float, optional): Improve resolution by this rate. Defaults to 3.0.

    Returns:
        dict: Raw dict of the extracted pixmap, as built by ``cls.to_raw_dict``.
    """
    import re

    # Hide text before clipping, so that only the image part is rendered.
    # render Tr: set the text rendering mode
    # - 3: neither fill nor stroke the text -> invisible
    # read more:
    # - https://github.com/pymupdf/PyMuPDF/issues/257
    # - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
    #
    # Match ``BT`` / ``Tm`` / ``Td`` only as standalone operator tokens, i.e.
    # preceded by start-of-stream, PDF white-space or a closing delimiter, and
    # followed by end-of-stream, white-space or any delimiter. A plain substring
    # replacement would also rewrite names such as ``/BT1`` and corrupt the stream.
    # NOTE: operator-like words inside literal strings or inline image data are
    # still not distinguished from real operators.
    text_operator = re.compile(
        rb'(?<![^\x00\t\n\x0c\r )\]>])'
        rb'(BT|Tm|Td)'
        rb'(?![^\x00\t\n\x0c\r ()<>\[\]{}/%])'
    )

    doc = page.parent
    for xref in page.get_contents():
        stream = doc.xref_stream(xref)
        if stream is None:
            # not a stream object -> nothing to rewrite
            continue
        doc.update_stream(xref, text_operator.sub(rb'\1 3 Tr', stream))

    # improve resolution
    # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
    # - https://github.com/pymupdf/PyMuPDF/issues/181
    # The clip area never exceeds the page itself.
    bbox = page.rect if bbox is None else bbox & page.rect
    image = page.get_pixmap(clip=bbox, matrix=fitz.Matrix(zoom, zoom))  # type: fitz.Pixmap
    return cls.to_raw_dict(image, bbox)
def _hide_page_text(cls, page: fitz.Page):
    """Hide page text before clipping page.

    The content streams of ``page`` are rewritten in place in the parent
    document: the text rendering mode is set to invisible after each text
    object start (``BT``) and each text positioning operator (``Tm``, ``Td``).

    Args:
        page (fitz.Page): pdf page to extract.
    """
    import re

    # render Tr: set the text rendering mode
    # - 3: neither fill nor stroke the text -> invisible
    # read more:
    # - https://github.com/pymupdf/PyMuPDF/issues/257
    # - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
    #
    # Match the operators only as standalone tokens, i.e. preceded by
    # start-of-stream, PDF white-space or a closing delimiter, and followed by
    # end-of-stream, white-space or any delimiter. A plain substring replacement
    # would also rewrite names such as ``/BT1`` and corrupt the stream.
    # NOTE: operator-like words inside literal strings or inline image data are
    # still not distinguished from real operators.
    text_operator = re.compile(
        rb'(?<![^\x00\t\n\x0c\r )\]>])'
        rb'(BT|Tm|Td)'
        rb'(?![^\x00\t\n\x0c\r ()<>\[\]{}/%])'
    )

    doc = page.parent
    for xref in page.get_contents():
        stream = doc.xref_stream(xref)
        if stream is None:
            # not a stream object -> nothing to rewrite
            continue
        doc.update_stream(xref, text_operator.sub(rb'\1 3 Tr', stream))
def _hide_page_text(page: fitz.Page):
    '''Hide page text before clipping page.

    Every stream collected below is rewritten in place in the parent document:
    the text rendering mode is set to invisible after each text object start
    (``BT``) and each text positioning operator (``Tm``, ``Td``).

    Args:
        page (fitz.Page): pdf page whose text is to be hidden.
    '''
    import re

    # NOTE: text might exist in both content stream and form object stream
    # - content stream, i.e. direct page content
    # - form object, i.e. contents referenced by this page
    xref_list = [
        xref for (xref, _name, _invoker, _bbox) in page.get_xobjects()
    ]
    xref_list.extend(page.get_contents())

    # render Tr: set the text rendering mode
    # - 3: neither fill nor stroke the text -> invisible
    # read more:
    # - https://github.com/pymupdf/PyMuPDF/issues/257
    # - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
    #
    # Match the operators only as standalone tokens, i.e. preceded by
    # start-of-stream, PDF white-space or a closing delimiter, and followed by
    # end-of-stream, white-space or any delimiter. A plain substring replacement
    # would also rewrite names such as ``/BT1`` and corrupt the stream.
    # NOTE: operator-like words inside literal strings or inline image data are
    # still not distinguished from real operators.
    text_operator = re.compile(
        rb'(?<![^\x00\t\n\x0c\r )\]>])'
        rb'(BT|Tm|Td)'
        rb'(?![^\x00\t\n\x0c\r ()<>\[\]{}/%])'
    )

    doc = page.parent  # type: fitz.Document
    for xref in xref_list:
        stream = doc.xref_stream(xref)
        if stream is None:
            # not a stream object -> nothing to rewrite
            continue
        doc.update_stream(xref, text_operator.sub(rb'\1 3 Tr', stream))