def dump_pdf(fname, dir):
    """Write every image XObject found in a PDF to ``dir`` as PNG files.

    The xref table is scanned for objects whose definition contains both
    ``/Type /XObject`` and ``/Subtype /Image``.  Each hit is saved as
    ``<index>.png`` where ``<index>`` is the zero-padded position of the
    image in scan order.

    Args:
        fname: Path of the PDF, handed to ``fitz.open``.
        dir: Directory that receives the PNG files (not created here).
    """
    RE_XOBJ = r"/Type\s*/XObject"
    RE_IMG = r"/Subtype\s*/Image"

    doc = fitz.open(fname)
    try:
        # Entry 0 of the xref table is skipped.
        img_idcs = []
        for i in range(1, doc.xref_length()):
            xref = doc.xref_object(i)
            if re.search(RE_XOBJ, xref) and re.search(RE_IMG, xref):
                img_idcs.append(i)

        # Number of digits needed for the largest index.
        width = len(str(len(img_idcs) - 1))
        for i, j in enumerate(img_idcs):
            print(f'no: {i}, xref: {j}')
            img = fitz.Pixmap(doc, j)
            if img.n >= 5:
                # The original assigned the converted pixmap to an undefined
                # name and then wrote the unconverted one.
                img = fitz.Pixmap(fitz.csRGB, img)
            imgname = path.join(dir, f'{i:0{width}d}.png')
            img.writePNG(imgname)
    finally:
        doc.close()
def extractImage(myfile):
    """Save every image XObject of a stored PDF as PNG under ``imageStore``.

    The PDF is read from ``MEDIA_ROOT + "/" + myfile.name``.  Every xref
    object whose definition matches both ``/Type/XObject`` and
    ``/Subtype/Image`` is written to the ``imageStore`` folder below
    ``MEDIA_ROOT`` as ``<extension(myfile)>-img-<xref>.png``.

    Args:
        myfile: Object exposing a ``name`` attribute; it is also passed to
            the module's ``extension`` helper to build the file names.
    """
    checkXO = r"/Type(?= */XObject)"   # finds "/Type/XObject"
    checkIM = r"/Subtype(?= */Image)"  # finds "/Subtype/Image"

    this_file_path = MEDIA_ROOT + "/" + myfile.name
    dest_file = MEDIA_ROOT + r'\imageStore\\'
    os.makedirs(dest_file, exist_ok=True)

    doc = fitz.open(this_file_path)
    try:
        # Entry 0 of the xref table must not be used.
        for i in range(1, doc._getXrefLength()):
            text = doc._getObjectString(i)  # string defining the object
            if not (re.search(checkXO, text) and re.search(checkIM, text)):
                continue  # not an image object

            pix = fitz.Pixmap(doc, i)
            if pix.n >= 5:
                # CMYK must be converted before it can be saved as PNG.
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.writePNG(dest_file + extension(myfile) + "-" + "img-%s.png" % (i, ))
    finally:
        doc.close()
def get_pdf_images(pdf_file):
    """Extract the images of a PDF to PNG files and return their paths.

    Each selected xref object is written next to the PDF as
    ``<pdf_file without ".pdf">-<xref>.png`` and then passed to the
    module's ``reverse_image`` helper.

    Args:
        pdf_file: Path of the PDF as a string.

    Returns:
        List of the PNG file names written, in xref order.
    """
    checkXO = r"/Type(?= */XObject)"   # finds "/Type/XObject"
    checkIM = r"/Subtype(?= */Image)"  # finds "/Subtype/Image"

    img_files = []
    doc = fitz.open(pdf_file)
    try:
        # Entry 0 of the xref table must not be used.
        for i in range(1, doc._getXrefLength()):
            text = doc._getObjectString(i)  # string defining the object
            isXObject = re.search(checkXO, text)
            isImage = re.search(checkIM, text) or "/Subtype/Image" in text
            # NOTE(review): an object is skipped only when BOTH tests fail,
            # although the original comment read "not an image object if not
            # both True" - confirm whether `or` was intended here.
            if not isXObject and not isImage:
                continue

            pix = fitz.Pixmap(doc, i)
            filename = pdf_file.replace(".pdf", "") + "-%s.png" % (i, )
            if pix.n >= 5:
                # CMYK: convert first.  The original built the RGB pixmap
                # but then wrote the unconverted one.
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.writePNG(filename)

            reverse_image(filename)
            img_files.append(filename)
    finally:
        doc.close()
    return img_files
def get_images(self):
    """Save the non-uniform images of a PDF document and return their names.

    For ``self.doctype == "pdf"`` every image referenced by a page of
    ``self.doc`` is decoded; images whose pixels are not all identical are
    written to ``self.file_path`` as ``p<page>-<xref>.png``.  For any other
    doctype (including ``"docx"``) no image is extracted.

    Returns:
        List of the file names written (without directory); empty when the
        document is not a PDF or contains no suitable image.
    """
    image_names = []
    if self.doctype != "pdf":
        return image_names

    for i in range(len(self.doc)):
        for img in self.doc.getPageImageList(i):
            xref = img[0]
            pix = fitz.Pixmap(self.doc, xref)
            if pix.n >= 5:
                # CMYK: convert to RGB before any PNG encoding happens.
                pix = fitz.Pixmap(fitz.csRGB, pix)

            image = np.array(Image.open(io.BytesIO(pix.getImageData())))
            if image.ndim == 2:
                # Single-channel image: one value per pixel.
                pixels = image.reshape(-1, 1)
            else:
                # Compare on (at most) the first three channels, as before.
                pixels = image.reshape(-1, image.shape[-1])[:, :3]

            # Skip images consisting of a single colour.
            if pixels.shape[0] == 0 or not (pixels != pixels[0]).any():
                continue

            name = "p%s-%s.png" % (i, xref)
            pix.writePNG(os.path.join(self.file_path, name))
            image_names.append(name)
    return image_names
def pdf2pic(path, pic_path):
    """Extract all image XObjects of a PDF into ``pic_path`` as PNG files.

    Files are named with a running image counter.  Basic information about
    the PDF, the elapsed time and the number of extracted images are
    printed.

    Args:
        path: Path of the PDF file.
        pic_path: Directory that receives the PNG files.
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    t0 = time.perf_counter()
    # Regular expressions used to spot image objects.
    checkXO = r"/Type(?= */XObject)"
    checkIM = r"/Subtype(?= */Image)"

    doc = fitz.open(path)
    imgcount = 0
    try:
        lenXREF = doc._getXrefLength()  # number of xref entries
        # Print information about the PDF.
        print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), lenXREF - 1))

        for i in range(1, lenXREF):
            text = doc._getXrefString(i)  # string defining the object
            if not re.search(checkXO, text) or not re.search(checkIM, text):
                continue  # skip unless it is both an XObject and an image

            imgcount += 1
            pix = fitz.Pixmap(doc, i)
            new_name = "图片{}.png".format(imgcount)
            if pix.n >= 5:
                # CMYK has to be converted before saving as PNG.
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.writePNG(os.path.join(pic_path, new_name))
    finally:
        doc.close()

    t1 = time.perf_counter()
    print("运行时间:{}s".format(t1 - t0))
    print("提取了{}张图片".format(imgcount))
def recoverpix(doc, item):
    """Return the image stored at ``item[0]`` as an image dictionary.

    ``item[1]`` is the xref of the image's /SMask (0 or less when absent).

    * With an /SMask, image and mask are combined with Pillow into an RGBA
      PNG.
    * Otherwise, if the object definition mentions ``/ColorSpace``, the
      image is converted to an RGB PNG.
    * In all remaining cases the result of ``doc.extractImage`` is returned
      unchanged.

    The two PNG cases return a dict with the keys ``ext``, ``colorspace``
    and ``image``.
    """
    def png_result(data):
        # Dictionary layout expected by the caller.
        return {
            "ext": "png",
            "colorspace": 3,
            "image": data,
        }

    image_xref, mask_xref = item[0], item[1]

    if mask_xref > 0:
        # Fetch both binaries first, then let Pillow decode them.
        image_stream = io.BytesIO(doc.extractImage(image_xref)["image"])
        mask_stream = io.BytesIO(doc.extractImage(mask_xref)["image"])
        base = Image.open(image_stream)
        alpha = Image.open(mask_stream)

        combined = Image.new("RGBA", base.size)
        combined.paste(base, None, alpha)

        out = io.BytesIO()
        combined.save(out, "png")
        return png_result(out.getvalue())

    if "/ColorSpace" not in doc.xrefObject(image_xref, compressed=True):
        return doc.extractImage(image_xref)

    # A /ColorSpace definition exists: to be sure, convert to RGB PNG.
    source_pix = fitz.Pixmap(doc, image_xref)
    rgb_pix = fitz.Pixmap(fitz.csRGB, source_pix)
    return png_result(rgb_pix.getImageData("png"))
def ImageExtraction(folderName, DiskPath, filepath):
    """Extract all page images of a PDF into a folder next to the PDF.

    The target folder is ``<directory of filepath>/<stem of filepath>`` and
    is created when missing.  Files are named
    ``<folderName>p<page><xref>.jpg``; note that the data written is PNG
    (``writePNG``) even though the name ends in ``.jpg``.

    Any exception is caught and reported with ``print``.

    Args:
        folderName: Prefix used in the generated file names.
        DiskPath: Not used by this function.
        filepath: Path of the PDF file.
    """
    try:
        # os.path.join instead of a hard-coded "\\" keeps this portable and
        # avoids a leading separator when filepath has no directory part.
        FileLocation = os.path.join(os.path.dirname(filepath), Path(filepath).stem)
        os.makedirs(FileLocation, exist_ok=True)

        doc = fitz.open(filepath)
        try:
            print('------------------------Extracting Images from ' + filepath + '------------------------')
            for i in range(len(doc)):
                for img in doc.getPageImageList(i):
                    xref = img[0]
                    pix = fitz.Pixmap(doc, xref)
                    imagepath = os.path.join(
                        FileLocation, "%sp%s%s.jpg" % (folderName, i, xref))
                    if pix.n >= 5:
                        # CMYK: convert to RGB first
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    pix.writePNG(imagepath)
        finally:
            doc.close()
    except Exception as e:
        print('Error occurred in ImageExtraction:::.', e)
def extractImages(file):
    """Extract all page images of a PDF into ``<pdf title>/images``.

    ``<pdf title>`` is ``file`` without its extension.  Every image is
    converted to RGB and saved as
    ``<file name>-Pg<page>-Img<n>.png`` with 1-based page and image numbers.
    (The original used a second, 0-based naming scheme for images with
    ``n >= 5``; both cases now share one scheme.)

    Known issue noted by the original author: the images seem to be
    inverted in some cases; no correction is applied here.

    Args:
        file: Path of the PDF, using ``/`` as separator.
    """
    # splitext only strips the extension; splitting on the first '.' broke
    # paths such as "./docs/a.pdf" or "report.v2.pdf".
    pdf_title = os.path.splitext(file)[0]
    images_dir = pdf_title + '/images'
    os.makedirs(images_dir, exist_ok=True)

    base_name = file.split('/')[-1]
    doc = fitz.open(file)
    try:
        for i in range(len(doc)):
            for count, img in enumerate(doc.getPageImageList(i)):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                # Every image is converted to RGB regardless of colourspace.
                pix1 = fitz.Pixmap(fitz.csRGB, pix)
                pix1.writePNG(
                    images_dir
                    + "/%s-Pg%s-Img%s.png" % (base_name, i + 1, count + 1))
    finally:
        doc.close()
def test_filepixmap():
    """A pixmap built from a file name equals one built from its bytes."""
    pix1 = fitz.Pixmap(imgfile)
    # Read through a context manager so the file handle is closed.
    with open(imgfile, "rb") as fh:
        stream = fh.read()
    pix2 = fitz.Pixmap(stream)
    assert repr(pix1) == repr(pix2)
    assert pix1.samples == pix2.samples
def process_text(filename, filepath):
    """Extract the page images of a PDF stored in the output folder.

    The PDF is expected at ``/home/flask/app/output_files/<filename>``.
    Each image is written beside it as ``<filename>_p<page>-<xref>.png``.

    Args:
        filename: Name of the PDF inside the output folder.
        filepath: Not used by this function.

    Returns:
        List of ``output_files/<image name>`` paths, one per image; an
        empty list when the PDF does not exist.
    """
    output_folder = "/home/flask/app/output_files"
    file_at = f"{output_folder}/{filename}"
    if not os.path.exists(file_at):
        return []

    all_paths = []
    doc = fitz.open(file_at)
    try:
        for i in range(len(doc)):
            for img in doc.getPageImageList(i):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                # Built with a single f-string: the original applied
                # %-formatting to the f-string result, which fails when
                # the file name itself contains a '%'.
                image_name = f"{filename}_p{i}-{xref}.png"
                all_paths.append(f"output_files/{image_name}")
                if pix.n >= 5:
                    # CMYK: convert to RGB first
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                pix.writePNG(f"{output_folder}/{image_name}")
    finally:
        doc.close()
    return all_paths
def extract_images_from_pdf(pdf_path):
    """Save all page images of a PDF as numbered PNG files.

    Files are written to the module-level ``extract_path`` as
    ``00.png``, ``01.png``, ... in the order the images are met.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        List of ``Path`` objects of the files written.
    """
    images = []
    image_count = 0
    doc = fitz.open(pdf_path)
    try:
        for i in range(len(doc)):
            page = i + 1
            print(f"extracting images from page {page}..")
            for img in doc.getPageImageList(i):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                target_out = Path(extract_path, f"{image_count:02}.png")
                if pix.n - pix.alpha >= 4:
                    # CMYK: convert to RGB first
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                pix.writePNG(target_out)
                images.append(target_out)
                image_count += 1
    finally:
        doc.close()
    print(f"finished extracting {image_count} images")
    return images
def ExtractImagesFromPDF(self, filename_pdf):
    """Return the page images of a PDF as Pillow images.

    Images that are at most 128 pixels wide AND at most 16 pixels high are
    skipped.  Consecutive images with identical PNG data are collapsed
    into one.

    Args:
        filename_pdf: Path of the PDF file.

    Returns:
        List of ``PIL.Image`` objects.
    """
    min_width = 128
    min_height = 16

    imageDatas = []
    doc = fitz.open(filename_pdf)
    try:
        for i in range(len(doc)):
            for img in doc.getPageImageList(i):
                xref = img[0]
                _pix = fitz.Pixmap(doc, xref)
                if _pix.width <= min_width and _pix.height <= min_height:
                    continue
                if _pix.n < 5:  # GRAY or RGB
                    pix = _pix
                else:  # CMYK: convert to RGB
                    pix = fitz.Pixmap(fitz.csRGB, _pix)
                imageDatas.append(pix.getImageData("png"))
    finally:
        doc.close()

    # dedup in case clipped copies are used on successive pdf pages
    # (e.g. to avoid downscaling); only neighbouring duplicates are removed
    dedupImageDatas = []
    for imageData in imageDatas:
        if not dedupImageDatas or imageData != dedupImageDatas[-1]:
            dedupImageDatas.append(imageData)

    return [Image.open(io.BytesIO(data)) for data in dedupImageDatas]
def extract_images_rgb(doc):
    """Find the pages that contain images and those flagged as RGB.

    Every image is written to the temporary file ``img.png`` in the
    current directory and checked with the module's ``RGBimageanalyze``
    helper; a page is added to the second list as soon as one of its
    images makes that helper return a value equal to ``True``.  A progress
    percentage is printed while running and ``img.png`` is removed at the
    end.

    Args:
        doc: Open document supporting ``len()`` and ``getPageImageList``.

    Returns:
        Tuple ``(pages_containing_images, rgb_pages)`` of page-index lists.
    """
    pages_containing_images = []
    rgb_pages = []
    for i in range(len(doc)):
        print(' ' * 10, end='\r')
        print(round(100 * i / len(doc)), '%', end='')
        imglist = doc.getPageImageList(i)
        if len(imglist) != 0:
            pages_containing_images.append(i)
        for img in imglist:
            xref = img[0]  # xref number
            pix = fitz.Pixmap(doc, xref)  # make pixmap from image
            if pix.n >= 5:
                # must convert CMYK first
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.writePNG("img.png")
            # The page only needs to be analysed until it is flagged once.
            if i not in rgb_pages and RGBimageanalyze("img.png") == True:
                rgb_pages.append(i)
    try:
        os.remove("img.png")
    except OSError:
        # e.g. no image was ever written
        print('unable to remove img.png')
    finally:
        print(' ' * 10, end='\r')
        print('100 %')
    return pages_containing_images, rgb_pages
def photoextraction(self, doc):
    """Save all page images as PNG and show photo / code images in the UI.

    Every image is converted to RGB and written to ``self.path`` as
    ``p<page>-<xref>.png``.  An image with a width:height ratio of 4:5 or
    3:4 is treated as the person's photo (stored in ``self.human_image``
    and shown in ``label_9``); a square image is treated as the scanner
    code and shown in ``label_25``.

    Args:
        doc: Open document supporting ``len()`` and ``getPageImageList``.
    """
    for i in range(len(doc)):
        for img in doc.getPageImageList(i):
            xref = img[0]
            pix = fitz.Pixmap(doc, xref)
            image_path = os.path.join(self.path, "p%s-%s.png" % (i, xref))

            # The original tested `pix.n < 1`, which can never be true, so
            # every image has always gone through the RGB conversion.
            pix1 = fitz.Pixmap(fitz.csRGB, pix)
            pix1.writePNG(image_path)

            try:
                # HUMAN IMAGE IN ADHAR
                # Exact integer form of width == 0.8 * height and
                # width == 0.75 * height (avoids float rounding misses).
                if (pix.width * 5 == pix.height * 4
                        or pix.width * 4 == pix.height * 3):
                    self.human_image = image_path
                    pixmap = QPixmap(self.human_image)
                    self.label_9.setPixmap(pixmap)
                    self.label_9.setScaledContents(True)
                # SCANNER CODE IN ADHAR
                elif pix.width == pix.height:
                    pixmap = QPixmap(image_path)
                    self.label_25.setPixmap(pixmap)
                    self.label_25.setScaledContents(True)
            except Exception as e:
                print(e)
                print("fault in human and scanner image")
def worker(i, img_count, pdf_path, pic_path):
    """Binarise one PDF image and save it as ``<img_count>.png``.

    The image at xref ``i`` is loaded; every pixel whose first component is
    above 140 is set to white, all others to the dark grey (50, 50, 50).

    Args:
        i: xref number of the image inside the PDF.
        img_count: Value used as the output file name (without extension).
        pdf_path: Path of the PDF file.
        pic_path: Directory that receives the PNG file.
    """
    threshold, dark = 140, 50

    doc = fitz.open(pdf_path)
    try:
        pix = fitz.Pixmap(doc, i)  # pixmap of the image at this xref

        # NOTE(review): setPixel is given three components, which
        # presumably only fits RGB pixmaps without alpha - confirm how
        # other colourspaces should be handled.
        for x in range(pix.w):
            for y in range(pix.h):
                if pix.pixel(x, y)[0] > threshold:
                    pix.setPixel(x, y, [255, 255, 255])
                else:
                    pix.setPixel(x, y, [dark, dark, dark])

        new_name = os.path.join(pic_path, f'{img_count}.png')
        if pix.n >= 5:
            # CMYK has to be converted before saving as PNG.
            pix = fitz.Pixmap(fitz.csRGB, pix)
        pix.writePNG(new_name)
    finally:
        # Close the document even when processing fails.
        doc.close()
def extract_from_pdf(file_path):
    """Export all page images of a PDF as PNG files.

    Files are named ``<prefix> - p<page>-<xref>.png`` where ``<prefix>``
    comes from the module's ``_prefix`` helper.  An image that cannot be
    exported is reported with ``print`` and skipped.

    Args:
        file_path: Path of the PDF file.

    Returns:
        List of the file names that were written successfully.
    """
    prefix = _prefix(file_path)
    outfiles = []
    doc = fitz.open(file_path)
    try:
        for i in range(len(doc)):
            for img in doc.getPageImageList(i):
                xref = img[0]
                # Built before the try so the error message below always
                # refers to the image that actually failed.
                outfile = '{} - p{}-{}.png'.format(prefix, i, xref)
                try:
                    pix = fitz.Pixmap(doc, xref)
                    if pix.n >= 5:
                        # CMYK: convert to RGB first
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    pix.writePNG(outfile)
                except Exception:
                    print(' failed to export image {} from pdf {}'.format(outfile, file_path))
                else:
                    outfiles.append(outfile)
    finally:
        doc.close()
    return outfiles
def get_pdf_content(self, filePath, languages):
    """Return the text of a PDF: embedded text first, then OCR of its images.

    The text of every page is collected together with the page's image
    list.  Afterwards each image is handed to ``self.ocrize``; non-empty
    results are appended to the text.  If ``self.ocrize`` returns ``-1``
    the whole call returns ``-1``.
    """
    collected = ""
    with fitz.open(filePath) as pdf:
        image_entries = []
        for pg in pdf:
            collected += pg.getText("text")
            image_entries.extend(pdf.getPageImageList(pg.number))

        for entry in image_entries:
            pixmap = fitz.Pixmap(pdf, entry[0])
            if pixmap.n > 4:
                # CMYK colorspace: convert to RGB
                pixmap = fitz.Pixmap(fitz.csRGB, pixmap)
            ocr_text = self.ocrize(BytesIO(pixmap.getImageData()), languages)
            if ocr_text == -1:
                return -1
            if ocr_text:
                collected += ocr_text
    return collected
def recoverpix(doc, item):
    """Return the image for ``item[0]``, applying its /SMask if possible.

    ``item[1]`` is the xref of the /SMask.  Without a mask (``0``) the
    result of ``doc.extractImage`` is returned directly.  Otherwise a
    pixmap is returned: the image with the mask installed as alpha channel
    when the mask is usable, or the plain image (after printing a warning)
    when it is not.  Pixmaps with a 4-component colourspace are converted
    to RGB before being returned.
    """
    image_xref, smask_xref = item[0], item[1]
    if smask_xref == 0:
        # no smask: use direct image output
        return doc.extractImage(image_xref)

    def to_rgb_if_cmyk(pm):
        return fitz.Pixmap(fitz.csRGB, pm) if pm.colorspace.n == 4 else pm

    # The alpha channel has to be rebuilt from the smask.
    base = fitz.Pixmap(doc, image_xref)
    mask = fitz.Pixmap(doc, smask_xref)

    # Sanity check:
    # - both pixmaps must have the same rectangle
    # - both pixmaps must have alpha=0
    # - the mask must consist of 1 byte per pixel
    mask_usable = (
        base.irect == mask.irect
        and base.alpha == mask.alpha == 0
        and mask.n == 1
    )
    if mask_usable:
        merged = fitz.Pixmap(base)      # copy of the image with alpha added
        merged.setAlpha(mask.samples)   # mask samples become alpha values
        return to_rgb_if_cmyk(merged)

    print("Warning: unsupported /SMask %i for %i:" % (smask_xref, image_xref))
    print(mask)
    return to_rgb_if_cmyk(base)         # return the pixmap as is
def pdftoimages(input_dir,output_dir):
    """Save an image from the first page of each file in ``input_dir`` as PNG.

    Args:
        input_dir: Directory whose entries are opened with ``fitz.open``.
        output_dir: Directory where the PNG files are stored.

    Returns:
        None. Files are written to ``output_dir`` as
        ``p<file index>-<xref>.png``.
    """
    dirListing = os.listdir(input_dir)
    files = []
    imagespath = output_dir
    # Copy the directory listing; the position in this list is used as the
    # file index in the output names.
    for item in dirListing:
        files.append(item)
    n = len(files)
    for num in range(n):
        doc = fitz.open(input_dir+"/"+files[num])
        # Only page 0 of each document is inspected.
        for img in doc.getPageImageList(0):
            xref = img[0]
            pix = fitz.Pixmap(doc, xref)
            if pix.n < 5: # this is GRAY or RGB
                pix.writePNG(os.path.join(imagespath,"p%s-%s.png" % (num, xref)))
            else: # CMYK: convert to RGB first
                pix1 = fitz.Pixmap(fitz.csRGB, pix)
                pix1.writePNG(os.path.join(imagespath,"p%s-%s.png" % (num, xref)))
                pix1 = None
            pix=None
            # NOTE(review): stops after the first image of the page -
            # confirm that this placement of `break` is what was intended.
            break
def pdf2pic(filepath, pic_path):
    """Extract the second-to-last image of a PDF and return its path.

    All image XObjects are first written to ``pic_path`` under
    timestamp-based names; afterwards every file except the
    second-to-last one is deleted again.

    Args:
        filepath: Path of the PDF file.
        pic_path: Directory used for the extracted PNG files.

    Returns:
        Path of the one image that is kept.

    Raises:
        IndexError: If the PDF contains fewer than two images.
    """
    # Regular expressions used to spot image objects.
    checkXO = r"/Type(?= */XObject)"
    checkIM = r"/Subtype(?= */Image)"

    imageList = []
    doc = fitz.open(filepath)
    try:
        for i in range(1, doc._getXrefLength()):
            text = doc._getXrefString(i)  # string defining the object
            if not re.search(checkXO, text) or not re.search(checkIM, text):
                continue  # skip unless it is both an XObject and an image

            pix = fitz.Pixmap(doc, i)
            new_name = "图片{}.png".format(time.time())
            imageList.append(new_name)
            if pix.n >= 5:
                # CMYK has to be converted before saving as PNG.
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.writePNG(os.path.join(pic_path, new_name))
            # Names are derived from the clock; the pause is kept between
            # images as in the original.
            time.sleep(0.1)
    finally:
        doc.close()

    time.sleep(1)
    imagePath = imageList.pop(-2)
    # Use the same os.path.join the files were written with; the original
    # concatenated with a literal backslash here.
    for image in imageList:
        os.remove(os.path.join(pic_path, image))
    return os.path.join(pic_path, imagePath)
def recoverpix(doc, xref, item):
    """Return a pixmap for the image at ``xref`` with its /SMask applied.

    Args:
        doc: Open document the xrefs belong to.
        xref: xref number of the image.
        item: Mapping whose ``"smask"`` entry is the xref of the /SMask.

    Returns:
        ``None`` if the image pixmap cannot be created.  Otherwise a
        pixmap: with the mask as alpha channel when the mask is usable,
        without transparency when it is not.  Pixmaps with a 4-component
        colourspace are converted to RGB.
    """
    def getimage(pix):
        if pix.colorspace.n != 4:
            return pix
        tpix = fitz.Pixmap(fitz.csRGB, pix)
        return tpix

    s = item["smask"]  # xref of its /SMask

    # `except Exception` instead of a bare except, so KeyboardInterrupt
    # and SystemExit are no longer swallowed.
    try:
        pix1 = fitz.Pixmap(doc, xref)  # make pixmap from image
    except Exception:
        return None  # skip if error

    try:
        pix2 = fitz.Pixmap(doc, s)  # create pixmap of /SMask entry
    except Exception:
        print("cannot create mask %i for image %i" % (s, xref))
        return getimage(pix1)  # return w/ failed transparency

    # check that we are safe
    if not (pix1.irect == pix2.irect and
            pix1.alpha == pix2.alpha == 0 and
            pix2.n == 1):
        return getimage(pix1)

    pix = fitz.Pixmap(pix1)     # copy of pix1, alpha channel added
    pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha values
    pix1 = pix2 = None          # free temp pixmaps
    return getimage(pix)
def pdf2pic(path, pic_path):
    """Extract the image XObjects of a PDF into ``pic_path`` as PNG files.

    Objects whose definition contains both ``Width 2550`` and
    ``Height 3300``, or the word ``thumbnail``, are skipped.  Files are
    named ``img<n>.png`` with a running counter.

    Args:
        path: Path of the PDF file.
        pic_path: Directory that receives the PNG files.
    """
    # Loop-invariant patterns, hoisted out of the loop.
    checkXO = r"/Type(?= */XObject)"
    checkIM = r"/Subtype(?= */Image)"

    doc = fitz.open(path)
    nums = doc._getXrefLength()
    imgcount = 0
    for i in range(1, nums):
        text = doc._getXrefString(i)
        if ('Width 2550' in text and 'Height 3300' in text) or 'thumbnail' in text:
            continue
        if not re.search(checkXO, text) or not re.search(checkIM, text):
            continue

        imgcount += 1
        pix = fitz.Pixmap(doc, i)
        target = os.path.join(pic_path, "img{}.png".format(imgcount))

        # Images with n >= 5 were counted but never written by the
        # original; they now go straight to the RGB conversion.
        needs_rgb = pix.n >= 5
        if not needs_rgb:
            try:
                pix.writePNG(target)
            except Exception:
                # Direct write failed: retry after converting to RGB.
                needs_rgb = True
        if needs_rgb:
            pix0 = fitz.Pixmap(fitz.csRGB, pix)
            pix0.writePNG(target)
def count_extract_pdf_images(pdf_file_path, save_images=False):
    """Count the non-blank images of a PDF and optionally save them.

    An image is ignored when the ``str()`` form of its sample bytes is at
    most 3 characters long after the characters ``\\``, ``x``, ``f`` and
    ``0`` have been removed (the original code describes these as
    "mysterious plain black images").

    Args:
        pdf_file_path: Path of the PDF file.
        save_images: When true, each counted image is written to
            ``<pdf name>_images`` next to the PDF as
            ``<pdf name>_image_<count>_page_<page>.png``.

    Returns:
        Tuple ``(count, saved_image_filepaths)``; the list is empty unless
        ``save_images`` is true.
    """
    pdf_path = Path(pdf_file_path)
    pdf_name = str(pdf_path.stem)
    images_dir = pdf_path.parent / (pdf_name + '_images')

    count = 0
    saved_image_filepaths = []
    doc = fitz.open(pdf_file_path)
    try:
        for i in range(len(doc)):
            page = i + 1
            for img in doc.getPageImageList(i):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                if pix.n >= 5:
                    # CMYK: convert to RGB first
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                # Blank-image heuristic, unchanged from the original.
                leftover = (str(pix.samples)
                            .replace('\\', '')
                            .replace('x', '')
                            .replace('f', '')
                            .replace('0', ''))
                if len(leftover) <= 3:
                    continue

                count += 1
                image_name = '{}_image_{}_page_{}.png'.format(pdf_name, count, page)
                if save_images:
                    images_dir.mkdir(exist_ok=True)
                    image_path = str(images_dir / image_name)
                    pix.writePNG(image_path)
                    saved_image_filepaths.append(image_path)
    finally:
        doc.close()
    return count, saved_image_filepaths
def extract_images_pymupdf(message):
    """Extract every page image of a PDF and emit one event per image.

    Each image is written as PNG data to ``./data/processing/<uuid>`` and
    announced through the module's ``sendEvent`` with a new message that
    references the parent message.

    Args:
        message: Mapping with at least the keys ``"path"`` (PDF location)
            and ``"identifier"``.
    """
    processing_dir = "./data/processing/"

    doc = fitz.open(message["path"])
    try:
        for i in range(len(doc)):
            for img in doc.getPageImageList(i):
                identifier = str(uuid.uuid4())
                workfile = processing_dir + identifier
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                if pix.n >= 5:
                    # The original referenced the non-existent
                    # `fitz.czRGB` here.
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                pix.writePNG(workfile)

                filename = str(i) + "-" + str(xref) + ".png"
                new_message = {
                    "identifier": identifier,
                    "parent": message["identifier"],
                    "path": workfile,
                    "filename": filename,
                    "filetype": "unknown",
                    "history": [],
                    "metadata": {},
                    "original_file": False
                }
                sendEvent(new_message)
    finally:
        doc.close()
def recoverpix(doc, item):
    """Return the image for ``item[0]``, applying the /SMask ``item[1]``.

    With no mask (``item[1] == 0``) the result of ``doc.extractImage`` is
    returned.  Otherwise a pixmap is returned, carrying the mask as alpha
    channel when the mask passes the sanity check and left untouched when
    it does not.  Pixmaps with a 4-component colourspace are converted to
    RGB.
    """
    def rgb_unless_other(pm):
        # Only pixmaps with a 4-component colourspace are converted.
        if pm.colorspace.n == 4:
            return fitz.Pixmap(fitz.csRGB, pm)
        return pm

    img_xref = item[0]   # xref of PDF image
    mask_xref = item[1]  # xref of its /SMask

    if mask_xref == 0:
        # no smask: use direct image output
        return doc.extractImage(img_xref)

    # the alpha channel has to be reconstructed from the smask
    image_pix = fitz.Pixmap(doc, img_xref)
    mask_pix = fitz.Pixmap(doc, mask_xref)

    # sanity check
    same_area = image_pix.irect == mask_pix.irect
    if same_area and image_pix.alpha == mask_pix.alpha == 0 and mask_pix.n == 1:
        with_alpha = fitz.Pixmap(image_pix)     # copy, alpha channel added
        with_alpha.setAlpha(mask_pix.samples)   # mask samples as alpha value
        return rgb_unless_other(with_alpha)

    return rgb_unless_other(image_pix)
def handlePDF(doc):
    """Save the mostly non-white images of pages that carry a taxon code.

    Starting at page index 20, a page is processed when its text contains
    four consecutive capital letters delimited as described by the pattern
    below and the match does not contain ``TOME``.  Each image of such a
    page that is larger than 10x10 and has less than 20% of its pixels
    above the white threshold is passed to the module's ``saveImg`` as a
    grayscale array.

    Args:
        doc: Open document supporting ``len()``, ``loadPage`` and
            ``getPageImageList``.
    """
    # Raw string: "\W" in a normal string literal is an invalid escape.
    taxon_regex = r"(?:^|\W)[A-Z]{4}(?:^|\W)"
    p = re.compile(taxon_regex)

    for page_index in range(20, len(doc)):
        page = doc.loadPage(page_index)
        text = page.getText("text")
        match = p.search(text)
        if not match or match.group().find("TOME") != -1:
            continue

        print("---------------------------------")
        print(page)
        print(match.group(), page.number)
        taxon = ''.join(e for e in match.group() if e.isalnum())

        # Running index of the images saved for this page (the original
        # reused the page loop variable `i` for this).
        saved = 0
        for img in doc.getPageImageList(page.number):
            xref = img[0]
            pix = fitz.Pixmap(doc, xref)
            if pix.n >= 5:
                # CMYK: convert to RGB first
                pix = fitz.Pixmap(fitz.csRGB, pix)

            ROI = pix2np(pix)
            dim0, dim1 = ROI.shape[0], ROI.shape[1]
            if dim0 > 10 and dim1 > 10:
                # CHECKING IF IMAGE CONTAINS TOO MUCH WHITE
                area = dim0 * dim1
                if ROI.shape[2] != 1:
                    ROI_gray = cv2.cvtColor(ROI, cv2.COLOR_BGR2GRAY)
                else:
                    ROI_gray = ROI
                ret, ROI_thr = cv2.threshold(ROI_gray, 254, 255, cv2.THRESH_BINARY)
                n = len(np.where(ROI_thr == 255)[0])
                # IF NOT, SAVE IMAGE
                if n < area * 0.2:
                    saveImg(ROI_gray, "tmp", taxon, saved)
                    saved += 1
def recoverpix(doc, item):
    """Return pixmap for item, which is a list of 2 xref numbers.

    The second xref is that of an /SMask if > 0.

    Returns:
        ``None`` if the image pixmap cannot be created; the plain image
        pixmap if there is no /SMask or the mask cannot be used; otherwise
        a copy of the image with the mask as alpha channel.
    """
    x = item[0]  # xref of PDF image
    s = item[1]  # xref of its /SMask

    # `except Exception` instead of a bare except, so KeyboardInterrupt
    # and SystemExit are no longer swallowed.
    try:
        pix1 = fitz.Pixmap(doc, x)  # make pixmap from image
    except Exception:
        print("xref %i " % x + doc._getGCTXerrmsg())
        return None  # skip if error

    if s == 0:  # has no /SMask
        return pix1  # no special handling

    try:
        pix2 = fitz.Pixmap(doc, s)  # create pixmap of /SMask entry
    except Exception:
        print("cannot create mask %i for image xref %i" % (s,x))
        return pix1  # return w/ failed transparency

    # check that we are safe
    if not (pix1.irect == pix2.irect and
            pix1.alpha == pix2.alpha == 0 and
            pix2.n == 1):
        print("unexpected /SMask situation: pix1", pix1, "pix2", pix2)
        return pix1

    pix = fitz.Pixmap(pix1)     # copy of pix1, alpha channel added
    pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha values
    pix1 = pix2 = None          # free temp pixmaps
    return pix
def extractImagesJPG(filename, outfolder, verbose, imagequality):
    """Save the page images of a PDF as JPEG files named after their page.

    Each image is handed to the module's ``saveJPEG`` under the name
    ``<outfolder>Page_<zero-padded page index>.jpg``.  Several images on
    one page therefore share a file name.

    Args:
        filename: Path of the PDF file.
        outfolder: Prefix of the output names (used verbatim, so it should
            end with a path separator).
        verbose: Passed through to ``writelog`` and ``saveJPEG``.
        imagequality: Passed through to ``saveJPEG``.

    Returns:
        List of the file names produced, one entry per image.
    """
    import fitz

    imgList = []
    writelog('Extracting images', verbose)
    doc = fitz.open(filename)
    try:
        pages = len(doc)
        suffix = len(str(pages)) + 1  # width of the zero-padded page index
        for i in range(pages):
            for img in doc.getPageImageList(i):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                # Was `pix.n > 5`, which is never true; `>= 5` is the CMYK
                # test used by the other extraction helpers.
                if pix.n >= 5:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                newname = "{0}Page_{1}.jpg".format(outfolder, str(i).zfill(suffix))
                writelog('Extracting Page {0}, Saving as {1}'.format(i, newname), verbose)
                saveJPEG(newname, pix, verbose, imagequality)
                imgList.append(newname)
    finally:
        doc.close()
    return imgList
def test_filepixmap():
    """Pixmaps from a file and from a stream should lead to the same result."""
    pix1 = fitz.Pixmap(imgfile)
    # Read through a context manager so the file handle is closed.
    with open(imgfile, "rb") as fh:
        stream = fh.read()
    pix2 = fitz.Pixmap(stream)
    assert repr(pix1) == repr(pix2)
    assert pix1.digest == pix2.digest
def pdf2pic(path, num, bar_value):
    """Extract the image XObjects of a PDF into a per-document folder.

    The target folder is ``<cwd>\\图片\\<res>``.  ``<res>`` is the
    concatenation of all runs of ``num`` alphanumeric characters found in
    the PDF's base name, or the base name itself when there is none.
    Files are named ``<pic_name>_<n>.jpg`` (the data written is PNG),
    where ``<pic_name>`` is ``<res>`` in the first case and ``_`` in the
    second.  Progress is reported through ``bar_value.emit``.

    Args:
        path: Path of the PDF file.
        num: Run length used in the alphanumeric pattern.
        bar_value: Object with an ``emit`` method receiving
            ``['bar', <progress value>]``.
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    t0 = time.perf_counter()
    # Regular expressions used to spot image objects.
    checkXO = r"/Type(?= */XObject)"
    checkIM = r"/Subtype(?= */Image)"

    doc = fitz.open(path)
    imgcount = 0
    lenXREF = doc._getXrefLength()  # number of xref entries

    parrent = re.compile(r'[a-zA-Z0-9]{%s}' % num)
    (filepath, tempfilename) = os.path.split(path)
    filename, tmp = os.path.splitext(tempfilename)
    result = "".join(re.findall(parrent, filename))
    print('==========', result, '==========')
    if result:
        res = result
        pic_name = res
    else:
        res = filename
        pic_name = "_"

    # Print information about the PDF.
    print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), lenXREF - 1))

    for i in range(1, lenXREF):
        text = doc._getXrefString(i)  # string defining the object
        if not re.search(checkXO, text) or not re.search(checkIM, text):
            continue  # skip unless it is both an XObject and an image

        imgcount += 1
        pix = fitz.Pixmap(doc, i)
        new_name = "{}_{}.jpg".format(pic_name, imgcount)
        new_path = os.getcwd() + r"\图片"
        bar_value.emit(['bar', 90 / lenXREF * (i + 1)])

        # Folders are only created once an image is actually found.
        if os.path.exists(new_path):
            print("文件夹已存在,不必重新创建!")
        else:
            os.makedirs(new_path)

        new_filepath = new_path + r"\%s" % res
        print(new_filepath)
        if os.path.exists(new_filepath):
            print("文件夹已存在,不必重新创建!")
        else:
            os.makedirs(new_filepath)

        if pix.n >= 5:
            # CMYK has to be converted before saving as PNG.
            pix = fitz.Pixmap(fitz.csRGB, pix)
        pix.writePNG(os.path.join(new_filepath, new_name))

    t1 = time.perf_counter()
    print("运行时间:{}s".format(t1 - t0))
    print("提取了{}张图片".format(imgcount))