def extract_images(cls, page: fitz.Page, clip_image_res_ratio: float = 3.0):
    """Collect the images listed by ``Page.getImageList()`` as raw dicts.

    Every listed image is located on the page first. It is dropped when its
    bbox cannot be resolved (``ValueError``) or when the bbox does not
    intersect the page rectangle; otherwise its pixmap is recovered and
    converted with ``cls._to_raw_dict``.

    Args:
        page (fitz.Page): Page whose images are extracted.
        clip_image_res_ratio (float, optional): Zoom handed to
            ``cls._clip_page`` when the page region is clipped instead of
            using the recovered pixmap. Defaults to 3.0.

    Returns:
        list: One ``cls._to_raw_dict(pixmap, bbox)`` result per kept image,
        in the order reported by ``Page.getImageList()``.

    .. note::
        ``Page.getImageList()`` reports each image only once, so the result
        may contain fewer entries than the number of images shown on the
        page.
    """
    document = page.parent
    collected = []

    # Each listed entry looks like:
    # (xref, smask, width, height, bpc, colorspace, ...)
    for listed in page.getImageList(full=True):
        # getImageBbox may raise, so the lookup is always guarded, see
        # https://github.com/pymupdf/PyMuPDF/issues/487
        try:
            entry = list(listed)
            # NOTE(review): the last field is reset to 0 before the lookup --
            # presumably the referencer entry of the image item; confirm
            # against the PyMuPDF version in use.
            entry[-1] = 0
            bbox = page.getImageBbox(entry)
        except ValueError:
            continue

        # Only images that intersect the page area are kept.
        if bbox.intersects(page.rect):
            recovered = cls._recover_pixmap(document, entry)

            # An image made of alpha values only has no colorspace: the
            # colour seen in the PDF belongs to the page background, not to
            # the image, so the page itself is clipped at the image bbox.
            # https://github.com/pymupdf/PyMuPDF/issues/677
            pixmap = (
                recovered
                if recovered.colorspace
                else cls._clip_page(page, bbox, zoom=clip_image_res_ratio)
            )
            collected.append(cls._to_raw_dict(pixmap, bbox))

    return collected
def extract_images(
    cls,
    page: fitz.Page,
    clip_image_res_ratio: float = 3.0,  # resolution ratio of clipped bitmap
):
    """Build raw image dicts from the entries of ``Page.getImageList()``.

    An entry is skipped when its bbox cannot be resolved (``ValueError``)
    or when the bbox does not intersect the page rectangle.

    Args:
        page (fitz.Page): Page whose images are extracted.
        clip_image_res_ratio (float, optional): Zoom handed to
            ``cls.clip_page`` for images without a colorspace.
            Defaults to 3.0.

    Returns:
        list: For each kept image, the result of ``cls.to_raw_dict`` or,
        for images without a colorspace, the result of ``cls.clip_page``.

    .. note::
        ``Page.getImageList()`` reports each image only once, so the result
        may contain fewer entries than the number of images shown on the
        page.
    """
    document = page.parent
    collected = []

    # Each listed entry looks like:
    # (xref, smask, width, height, bpc, colorspace, ...)
    for listed in page.getImageList(full=True):
        # getImageBbox may raise, so the lookup is always guarded, see
        # https://github.com/pymupdf/PyMuPDF/issues/487
        try:
            entry = list(listed)
            # NOTE(review): the last field is reset to 0 before the lookup --
            # presumably the referencer entry of the image item; confirm
            # against the PyMuPDF version in use.
            entry[-1] = 0
            bbox = page.getImageBbox(entry)
        except ValueError:
            continue

        # Only images that intersect the page area are kept.
        if bbox.intersects(page.rect):
            pixmap = ImagesExtractor.recover_pixmap(document, entry)

            # An image made of alpha values only has no colorspace: the
            # colour seen in the PDF belongs to the page background, not to
            # the image, so the page itself is clipped at the image bbox.
            # https://github.com/pymupdf/PyMuPDF/issues/677
            if pixmap.colorspace:
                collected.append(cls.to_raw_dict(pixmap, bbox))
            else:
                collected.append(
                    cls.clip_page(page, bbox, zoom=clip_image_res_ratio)
                )

    return collected
def extract_images(cls, page: fitz.Page, clip_image_res_ratio: float = 3.0):
    """Get the images of the current page as raw image dicts.

    Each entry of ``Page.getImageList(full=True)`` is located on the page
    through its name entry. Entries whose bbox cannot be resolved
    (``ValueError``) or whose bbox does not intersect the page rectangle
    are skipped, so a single problematic image no longer aborts the whole
    page.

    Args:
        page (fitz.Page): Page whose images are extracted.
        clip_image_res_ratio (float, optional): Zoom handed to
            ``cls.clip_page`` for images without a colorspace.
            Defaults to 3.0.

    Returns:
        list: For each kept image, the result of ``cls.to_raw_dict`` or,
        for images without a colorspace, the result of ``cls.clip_page``.
    """
    # pdf document
    doc = page.parent

    # check each image item:
    # (xref, smask, width, height, bpc, colorspace, ...)
    images = []
    for item in page.getImageList(full=True):
        # Resolve the bbox first so that skipped images never pay for pixmap
        # recovery. The lookup is guarded because getImageBbox may raise, see
        # https://github.com/pymupdf/PyMuPDF/issues/487
        try:
            bbox = page.getImageBbox(item[7])  # item[7]: name entry of the item
        except ValueError:
            continue

        # ignore images outside the page
        if not bbox.intersects(page.rect):
            continue

        pix = recover_pixmap(doc, item)

        # An image made of alpha values only has no colorspace: the colour
        # seen in the PDF belongs to the page background, not to the image,
        # so the page itself is clipped at the image bbox.
        # https://github.com/pymupdf/PyMuPDF/issues/677
        if not pix.colorspace:
            raw_dict = cls.clip_page(page, bbox, zoom=clip_image_res_ratio)
        else:
            raw_dict = cls.to_raw_dict(pix, bbox)
        images.append(raw_dict)

    return images