import fitz, os thisdir = lambda f: os.path.join(os.path.dirname(__file__), f) thisfile = os.path.abspath(__file__) outfile = thisfile.replace(".py", ".pdf") font1 = fitz.Font("helv") font2 = fitz.Font("tiro") doc = fitz.open() page = doc.newPage() point = fitz.Point(50, 72) matrix = fitz.Matrix(-20) wrt1 = fitz.TextWriter(page.rect, color=(0, 0, 1)) wrt2 = fitz.TextWriter(page.rect, color=(1, 0, 0)) _, last = wrt1.append(point, "This text changes color,", font1, 11) _, last = wrt2.append(last, " font and fontsize", font2, 18) _, last = wrt1.append(last, " several", font1, 11) _, last = wrt2.append(last, " times!", font2, 24) # output both text writers on current page in arbitrary sequence wrt1.writeText(page, morph=(point, matrix)) # using the same morph parameter wrt2.writeText(page, morph=(point, matrix)) # also preserves the joint text. # make a new page page = doc.newPage() rect = wrt1.textRect | wrt2.textRect # join rect of blue and red text # make new rectangle from it, rotated by 90 degrees nrect = fitz.Rect( rect.tl, # same top-left, but width and height exchanged
def pyMuPDF_fitz(pdfPath, imagePath, imageName): startTime_pdf2img = datetime.datetime.now() # 开始时间 print("图片输出路径为:" + imagePath) print("正在转化,请稍后...") file_name = os.path.basename(pdfPath) # 获取文件名字 name = file_name.split('.')[0] # 去除后缀,获取名字 pdfDoc = fitz.open(pdfPath) for pg in range(pdfDoc.pageCount): page = pdfDoc[pg] rotate = int(0) # 每个尺寸的缩放系数为1.3,这将为我们生成分辨率提高2.6的图像。 # 此处若是不做设置,默认图片大小为:792X612, dpi=96 #zoom_x = 1.33333333 # (1.33333333-->1056x816) (2-->1584x1224) #zoom_y = 1.33333333 # 缩放系数都为2,分辨率提高4倍 #zoom_x = 2 # (1.33333333-->1056x816) (2-->1584x1224) #zoom_y = 2 #zoom_x = 1 # (1.33333333-->1056x816) (2-->1584x1224) #zoom_y = 1 zoom_x = 1.111111 # (1.33333333-->1056x816) (2-->1584x1224) zoom_y = 1.111111 mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate) pix = page.getPixmap(matrix=mat, alpha=False) if not os.path.exists(imagePath): # 判断存放图片的文件夹是否存在 os.makedirs(imagePath) # 若图片文件夹不存在就创建 # 页码从0开始,水印都加一 pix.writePNG(imagePath + '/' + imageName + '-%s.JPEG' % str(int(pg) + 1)) # 将图片写入指定的文件夹内 # 为图片添加水印 imageInfo = PIL.Image.open(imagePath + '/' + imageName + '-%s.JPEG' % str(int(pg) + 1)) # 文字水印 fontOne = ImageFont.truetype("C:\Windows\Fonts\simfang.ttf", 20) # 本地字体文件 draw = ImageDraw.Draw(imageInfo) #print(imageInfo.size) # 深灰色fill=(96, 96, 96) # 开始添加文字水印 draw.text((imageInfo.size[0] // 2 - 25, imageInfo.size[1] // 2 + 445), u"第%s页" % str(int(pg) + 1), fill=(0, 0, 0), font=fontOne) # imageInfo.show() #展示图片 # 添加图片水印 logo = PIL.Image.open("C:\\logo.png") w, h = logo.size # 获取图像宽高 logo.thumbnail((800, 800)) # 图像缩小1/2,图像缩放 layer = PIL.Image.new('RGBA', imageInfo.size, (255, 255, 255, 0)) # 添加图片水印 layer.paste(logo, (imageInfo.size[0] - logo.size[0] + 70, imageInfo.size[1] - logo.size[1] - 20)) imageInfo = PIL.Image.composite(layer, imageInfo, layer) #imageInfo.paste(logo, (0, 0)) # 将一张图片覆盖到另外一张图片上 imageInfo.save(imagePath + '/' + imageName + '-%s.JPEG' % str(int(pg) + 1), quality=1000, optimize=True, progressive=True) #imageInfo.save(imagePath + '/' + imageName + 
'-%s.JPEG' % str(int(pg) + 1), quality=200, optimize=True) endTime_pdf2img = datetime.datetime.now() # 结束时间 print('pdf2img转换时间:', (endTime_pdf2img - startTime_pdf2img).seconds) # 拼接为长图 '''imgs = [Image.open(imagePath + '\\' + fn) for fn in listdir(imagePath) if fn.endswith(".png")] # 打开路径下的所有图片
async def media_loop():
    """Post a random page of a random book every five minutes, forever.

    Each cycle:

    1. draws random entries from ``bookPaths`` until one opens with
       ``fitz.open`` and a randomly chosen page of it loads;
    2. renders that page at 2x zoom and asks ``book_in_history`` about the
       (book, page) pair, repeating the whole draw until it is accepted;
    3. writes the rendering to ``<book path>-<page>.png``, passes the file to
       ``poststatus`` and deletes it when the post succeeded;
    4. sleeps five minutes.

    The coroutine has no exit condition of its own.
    """
    running = True
    while running:
        originalBookAndPage = False
        while not originalBookAndPage:
            validBookAndPageOpened = False
            while not validBookAndPageOpened:
                validBookFound = False
                while not validBookFound:
                    # NOTE(review): random.randint includes both end points,
                    # so this assumes bookCount is the highest valid index of
                    # bookPaths rather than its length -- confirm where
                    # bookCount is defined.
                    randomBook = random.randint(0, bookCount)
                    bookToPrint = bookPaths[randomBook]
                    print(f"Book selected: {bookToPrint}")
                    try:
                        book = fitz.open(bookToPrint)
                        totalPages = book.pageCount
                        print(f"Book loaded. Page count: {totalPages}")
                        validBookFound = True
                    except RuntimeError as e:
                        print(f"Error with book somewhere. {e}")

                # books that don't have valid table of contents page selection process
                # Skip the first and last 4% of the pages.
                randomLow = int(totalPages * 0.04)
                randomPageNumber = random.randint(randomLow, totalPages - randomLow)
                if randomPageNumber == 0:
                    randomPageNumber = 1
                pageNumber = randomPageNumber
                try:
                    page = book.loadPage(pageNumber)
                    print(f"Page loaded: {pageNumber}")
                    validBookAndPageOpened = True
                except (RuntimeError, ValueError) as e:
                    # For short books the upper bound above can equal the page
                    # count, which is not a valid zero-based page number.
                    # Treat that like any other load failure and draw again.
                    print(f"Error loading page: {pageNumber}. {e}")
                    # This book is abandoned; release its file handle.
                    book.close()

            try:
                zoom = 2
                matrix = fitz.Matrix(zoom, zoom)
                picOfPage = page.getPixmap(matrix=matrix)
                output = f"{bookToPrint}-{pageNumber}.png"
                # NOTE(review): a falsy book_in_history() result is reported
                # as a collision, which reads inverted relative to the
                # function's name -- confirm its return convention.
                if not book_in_history(bookToPrint, pageNumber):
                    print(
                        f"{Fore.RED}\nCOLLISION:\nPage {pageNumber} of {bookToPrint}\n"
                    )
                else:
                    originalBookAndPage = True
            except Exception:
                # Narrowed from a bare ``except:`` so task cancellation and
                # interpreter shutdown are no longer swallowed.
                print(f"{Fore.RED}: Error in checking for collision")
            finally:
                # The pixmap is independent of the document, so the document
                # can be released whether or not the draw was accepted.
                book.close()

        picOfPage.writePNG(output)
        status = poststatus(output)
        if status:
            print(f"{Fore.GREEN}\nSUCCESS\nUploaded: {output}\n")
            try:
                os.remove(output)
                print("File removed succesfully")
            except OSError as e:
                print(f"Error removing file: {e}")
        else:
            print(f"{Fore.RED}\nFailure updating status\n")
        print(f"{Fore.BLUE} VALID EVERYTHING")

        minutes = 5
        # Named so that it cannot be mistaken for (or shadow) the time module.
        delay_seconds = minutes * 60
        print(f"{Fore.YELLOW}{Style.BRIGHT}Sleep for {minutes}m")
        await asyncio.sleep(delay_seconds)
def pdf_ocr(pdf_name, path, method_get_image, words_per_line, ocr_method,
            client_id, client_secret):
    """Run OCR over the PDF at ``path``.

    Two strategies are selected by ``method_get_image``:

    * ``'正则式'``: scan every xref object, and for each one whose source
      matches both ``/Type /XObject`` and ``/Subtype /Image`` build a pixmap
      and pass its image data to ``ocr_tesseract``.
    * anything else: render each page at 2x zoom and OCR the rendering, either
      locally (``ocr_method == "local"``) or through ``ocr_baidu``.

    The module-level flag ``if_image_to_pdf_or_hocr`` chooses between a PDF
    output (``output/<name>_output.pdf``) and a docx output
    (``output/<name>.docx``); ``<name>`` is ``pdf_name`` minus its last four
    characters.

    Args:
        pdf_name: File name used to derive the output name.
        path: Path of the PDF to open.
        method_get_image: ``'正则式'`` for xref scanning, otherwise page rendering.
        words_per_line: Forwarded to the OCR helpers in docx mode.
        ocr_method: ``"local"`` or ``"online"``.
        client_id: API key used to request the online access token.
        client_secret: Secret key used to request the online access token.

    Raises:
        ValueError: PDF output was requested together with the online method.
    """
    # Open the PDF.
    doc = fitz.open(path)
    access_token = None
    if ocr_method == "online":
        # client_id is the API key (AK) and client_secret the secret key (SK)
        # issued by the service's website.
        host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id={}&client_secret={}'.format(
            client_id, client_secret)
        response = requests.get(host)
        if response:
            access_token = response.json()["access_token"]
            # NOTE(review): this writes the access token to stdout.
            print(access_token)
    if if_image_to_pdf_or_hocr:
        doc_name = 'output/' + pdf_name[:-4] + "_output.pdf"
    else:
        doc_name = 'output/' + pdf_name[:-4] + ".docx"
    # Regex-based image extraction.
    if method_get_image == '正则式':
        # Regular expressions used to recognise image objects.
        checkXO = r"/Type(?= */XObject)"
        checkIM = r"/Subtype(?= */Image)"
        # Image counter.
        imgcount = 0
        lenXREF = doc._getXrefLength()
        # Print information about the PDF.
        print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), lenXREF - 1))
        # Walk over every object.
        for i in range(1, lenXREF):
            # Source string of the object.
            text = doc._getXrefString(i)
            isXObject = re.search(checkXO, text)
            # Check with the regex whether it is an image.
            isImage = re.search(checkIM, text)
            # Skip the object unless it is both an XObject and an image.
            if not isXObject or not isImage:
                continue
            imgcount += 1
            # Build the image from the xref number.
            pix = fitz.Pixmap(doc, i)
            # (Disabled) derive an image file name from the pdf path and
            # write the pixmap as PNG, converting CMYK to RGB first when
            # pix.n >= 5:
            # new_name = path.replace('\\', '_') + "_img{}.png".format(imgcount)
            # new_name = new_name.replace(':', '')
            # out_image_path = os.path.join(pic_path, new_name)
            # if pix.n < 5:
            #     pix.writePNG(out_image_path)
            # else:
            #     pix0 = fitz.Pixmap(fitz.csRGB, pix)
            #     pix0.writePNG(out_image_path)
            #     pix0 = None
            image = pix.getImageData()
            # NOTE(review): the OCR result is discarded in this branch.
            ocr_tesseract(image)
            # page = ocr_baidu(image)
            # time.sleep(1)
            # Release the pixmap.
            pix = None
    else:
        if if_image_to_pdf_or_hocr:
            if ocr_method == "online":
                raise ValueError('Unsupported filetype for online\
 API: {}'.format('pdf'))
                # NOTE(review): unreachable, the line above always raises.
                return
            output_doc = fitz.open()
        else:
            output_doc = docx.Document()
            output_doc.styles['Normal'].font.name = u'等线'
            output_doc.styles['Normal']._element.rPr.rFonts\
                .set(qn('w:eastAsia'), u'等线')
        time_start = time.time()
        page_count = doc.pageCount
        for pg in range(doc.pageCount):
            elapsed = time.time() - time_start
            # Linear extrapolation from the pages processed so far.
            eta = (page_count - pg) * elapsed / pg if pg > 0 else 0
            print('[%d/%d] Elapsed: %s, ETA: %s' %
                  (pg + 1, page_count, fmt_time(elapsed), fmt_time(eta)))
            page = doc[pg]
            rotate = int(0)
            # A zoom factor of 2 per dimension yields an image with four
            # times the resolution.
            zoom_x = 2
            zoom_y = 2
            trans = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
            pix = page.getPixmap(matrix=trans, alpha=False)
            image = pix.getImageData()
            # These two flags address paragraphs that continue across a page
            # break.
            # NOTE(review): both are re-initialised on every iteration, so the
            # values returned by paras2doc for the previous page are never
            # carried over -- confirm whether they belong above the loop.
            last_para = ""  # last paragraph of the previous page
            last_para_ended = True  # whether that paragraph had ended or was split by the page break
            if ocr_method == "local":
                if if_image_to_pdf_or_hocr:
                    temp_doc = ocr_tesseract(image)
                    output_doc.insertPDF(temp_doc, 0, 0)
                    # output_doc.save(doc_name, incremental=True)
                    output_doc.save(doc_name)
                else:
                    final_para_ended, paras = ocr_tesseract(
                        image, words_per_line)
                    args = [
                        paras, output_doc, doc_name, last_para,
                        final_para_ended, last_para_ended
                    ]
                    last_para, last_para_ended = paras2doc(*args)
            else:
                final_para_ended, paras = ocr_baidu(image, words_per_line,
                                                    access_token)
                args = [
                    paras, output_doc, doc_name, last_para, final_para_ended,
                    last_para_ended
                ]
                last_para, last_para_ended = paras2doc(*args)
                # NOTE(review): placement inferred -- the pause appears to
                # belong to the online branch only.
                time.sleep(1)
            # Release the pixmap.
            pix = None
# Fragment: processes the annotation with index N. N, start_time,
# annotations, the *_path prefixes, width, height, num_classes, num_anchors
# and segMapper are all defined outside this excerpt.
spend = time.time() - start_time
print("Parsing: ", N, " seconds: ", spend)
# Load Documents and meta information
currentDocument = annotations[N]
id = currentDocument['ID']
headerFields = currentDocument["HeaderElements"]
ListElements = currentDocument["ListElements"]
# load document
doc = fitz.open(invoices_path + str(id) + ".pdf")
page = doc[0]
# convert to png: render the first page at 4x zoom without alpha channel
zoom = 4
mat = fitz.Matrix(zoom, zoom)
image = page.getPixmap(matrix = mat, alpha = False)
# NOTE(review): w and h (pixmap size) are not used below; the arrays are sized
# from width/height, which come from outside this excerpt -- confirm that the
# two pairs are meant to differ.
w = image.width
h = image.height
image.writePNG(png_path + str(id) + ".png")
# Create training data
SegmentImage = np.zeros((height-3, width-1, num_classes), dtype=int)
BoxMasks = np.zeros((height-3, width-1, 2*num_anchors), dtype=int)
BoxCoords = np.zeros((height-3, width-1, 4*num_anchors), dtype=int)
# Fill Seg Mapper with background: every cell gets segMapper("Background")
for x in range(width-1):
    for y in range(height-3):
        SegmentImage[y,x] = segMapper("Background")
total_count = missed_count = 0 # iterate over identified words for word in chain(split_words, standard_words): wid = word['wid'] total_count += 1 chi_map, eng_map = [], [] # get all bboxs associated with text for group in word['groups']: bids, bbox, pg_num = group['bid'], group['bbox'], group['pg'] # obtain image of bbox bbox = resize(bbox, sf, [eps, 0, eps, 0]) if pg_num in pg_cache: img = pg_cache[pg_num] else: pix = doc[pg_num].getPixmap(matrix=fitz.Matrix(sf, sf)) img = Image.open(io.BytesIO(pix.getPNGData())) pg_cache[pg_num] = img block_img = img.crop(bbox) # perform OCR on bbox if len(bids) == 1: # single line chi_map.extend(classify(block_img, chi_detector)) eng_map.extend(classify(block_img, eng_detector)) else: # multi-line chi_map.extend(classify(block_img, chi_detector_multi)) eng_map.extend(classify(block_img, eng_detector_multi)) # use custom heuristics to obtain text (if high enough probability) chi_pinyin, chars, char_confs = getChinese(chi_map) eng_pinyin, nums = getEnglish(eng_map) guess = refine(chars, char_confs, chi_pinyin, eng_pinyin, bids[0], letters)
def create_exam_and_insert_QR( name, code, length, versions, test, page_versions, qr_file, test_mode=False, test_folder=None, ): """Creates the exam objects and insert the QR codes. Creates the exams objects from the pdfs stored at sourceVersions. Then adds the 3 QR codes for each page. (We create 4 QR codes but only add 3 of them because of the staple side, see below). Arguments: name {Str} -- Document Name. code {Str} -- 6 digit distinguished code for the document. length {int} -- Length of the document or number of pages. versions {int} -- Number of version of this Document. test {int} -- Test number based on the combination we have around (length ^ versions - initial pages) tests . page_versions {dict} -- (int,int) dictionary representing the version of each page for this test. qr_file {dict} -- dict(int: dict(int: Str)) Dictionary that has another embedded dictionary for each page. The embedded dictionary has a string for QR code paths saved for each corner. Keyword Arguments: test_mode {bool} -- Boolean elements used for testing, testing case with show the documents. (default: {False}) test_folder {Str} -- String for where to place the generated test files. (default: {None}) Returns: fitz.Document -- PDF document type returned as the exam, similar to a dictionary with the ge numbers as the keys. """ # A (int : fitz.fitz.Document) dictionary that has the page document/path from each source based on page version version_paths_for_pages = {} for version_index in range(1, versions + 1): version_paths_for_pages[version_index] = fitz.open( "sourceVersions/version{}.pdf".format(version_index)) # Create test pdf as "exam" exam = fitz.open() # Insert the relevant page-versions into this pdf. for page_index in range(1, length + 1): # Pymupdf starts pagecounts from 0 rather than 1. So offset things. 
exam.insertPDF( version_paths_for_pages[page_versions[page_index]], from_page=page_index - 1, to_page=page_index - 1, start_at=-1, ) # Get page width and height page_width = exam[0].bound().width page_height = exam[0].bound().height # create a box for the test number near top-centre rTC = fitz.Rect(page_width // 2 - 50, 20, page_width // 2 + 50, 40) # put marks at top left/right so students don't write near # staple or near where client will stamp marks # create two "do not write" (DNW) rectangles accordingly with TL (top left) and TR (top right) rDNW_TL = fitz.Rect(15, 15, 90, 90) rDNW_TR = fitz.Rect(page_width - 90, 15, page_width - 15, 90) # 70x70 page-corner boxes for the QR codes # TL: Top Left, TR: Top Right, BL: Bottom Left, BR: Bottom Right rTL = fitz.Rect(15, 20, 85, 90) rTR = fitz.Rect(page_width - 85, 20, page_width - 15, 90) rBL = fitz.Rect(15, page_height - 90, 85, page_height - 20) rBR = fitz.Rect(page_width - 85, page_height - 90, page_width - 15, page_height - 20) for page_index in range(length): # test/page stamp in top-centre of page # Rectangle size hacked by hand. TODO = do this more algorithmically # VALA SAYS: TODO still tands given that the pages are all the same # size. 
Will ask what it mean to do it algorithmically rect = fitz.Rect(page_width // 2 - 40, 20, page_width // 2 + 40, 44) text = "{}.{}".format(str(test).zfill(4), str(page_index + 1).zfill(2)) insertion_confirmed = exam[page_index].insertTextbox( rect, text, fontsize=18, color=[0, 0, 0], fontname="Helvetica", fontfile=None, align=1, ) exam[page_index].drawRect(rect, color=[0, 0, 0]) assert insertion_confirmed > 0 # stamp DNW near staple: even/odd pages different # Top Left for even pages, Top Right for odd pages # TODO: Perhaps this process could be improved by putting # into functions rDNW = rDNW_TL if page_index % 2 == 0 else rDNW_TR shape = exam[page_index].newShape() shape.drawLine(rDNW.top_left, rDNW.top_right) if page_index % 2 == 0: shape.drawLine(rDNW.top_right, rDNW.bottom_left) else: shape.drawLine(rDNW.top_right, rDNW.bottom_right) shape.finish(width=0.5, color=[0, 0, 0], fill=[0.75, 0.75, 0.75]) shape.commit() if page_index % 2 == 0: # offset by trial-and-error, could be improved rDNW = rDNW + (19, 19, 19, 19) else: rDNW = rDNW + (-19, 19, -19, 19) mat = fitz.Matrix(45 if page_index % 2 == 0 else -45) pivot = rDNW.tr / 2 + rDNW.bl / 2 morph = (pivot, mat) insertion_confirmed = exam[page_index].insertTextbox( rDNW, name, fontsize=8, fontname="Helvetica", fontfile=None, align=1, morph=morph, ) # exam[page_index].drawRect(rDNW, morph=morph) assert (insertion_confirmed > 0), "Text didn't fit: shortname too long? or font issue/bug?" 
# Grab the tpv QRcodes for current page and put them on the pdf # Remember that we only add 3 of the 4 QR codes for each page since # we always have a corner section for staples and such qr_code = {} for corner_index in range(1, 5): qr_code[corner_index] = fitz.Pixmap(qr_file[page_index + 1][corner_index]) if page_index % 2 == 0: exam[page_index].insertImage(rTR, pixmap=qr_code[1], overlay=True) exam[page_index].insertImage(rBR, pixmap=qr_code[4], overlay=True) exam[page_index].insertImage(rBL, pixmap=qr_code[3], overlay=True) else: exam[page_index].insertImage(rTL, pixmap=qr_code[2], overlay=True) exam[page_index].insertImage(rBL, pixmap=qr_code[3], overlay=True) exam[page_index].insertImage(rBR, pixmap=qr_code[4], overlay=True) return exam
# Tail of a stamping step whose beginning lies outside this excerpt: two
# pixmaps are placed on the page and the document is saved.
pix_2 = fitz.Pixmap(barcode_sign)
page.insertImage(rect_sign, pixmap=pix_2, overlay=True)
page.insertImage(rect_mark, pixmap=pix_1, overlay=True)
doc.save(output_file)

import fitz

# Pass 1: render the pages of every file under D:\py\new_CR.
for root, dirs, files in walk('D:\\py\\new_CR'):
    for f in files:
        doc = fitz.open(join(root, f))
        width, height = fitz.PaperSize('a4')  # not used below
        totaling = doc.pageCount
        for pg in range(totaling):
            page = doc[pg]
            zoom = int(100)
            rotate = int(0)
            # Scale factor 100/60 in both directions, no rotation.
            trans = fitz.Matrix(zoom / 60, zoom / 60).preRotate(rotate)
            pm = page.getPixmap(matrix=trans, alpha=False)
            # NOTE(review): the target name depends only on the file name, so
            # each page overwrites the previous one, and writePNG stores PNG
            # data under a .jpg name -- confirm both are intended.
            lurl = 'D:\\py\\new_CR\\{}.jpg'.format(str(f)[0:12])
            pm.writePNG(lurl)
        doc.close()

# Pass 2: wrap every *.jpg found under the same tree into a one-image PDF in
# the "final" sub-folder.
for root, dirs, files in walk('D:\\py\\new_CR'):
    for f in files:
        if '.jpg' in f:
            doc_pdf = fitz.open()
            imgdoc = fitz.open(join(root, f))
            pdfbytes = imgdoc.convertToPDF()
            imgpdf = fitz.open('pdf', pdfbytes)
            doc_pdf.insertPDF(imgpdf)
            doc_pdf.save('D:\\py\\new_CR\\final\\{}.pdf'.format(str(f)[0:12]))
            doc_pdf.close()
def convertPDFPagesToJPG(bookName, bookID):
    """Render every page of a book PDF to ``<page number>.jpg`` image files.

    The PDF is read from ``PDFFileDirName/<bookName>.pdf`` and the images are
    written to ``inputsDirName/<bookName>/``, one file per page, named after
    the zero-based page number and rendered at 8x zoom.

    Args:
        bookName: Base name of the PDF (without extension); also the name of
            the output sub-directory.
        bookID: Not used by this function.
    """
    book = fitz.open(PDFFileDirName + "/" + bookName + ".pdf")
    try:
        dirToSave = inputsDirName + "/" + bookName
        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(dirToSave, exist_ok=True)

        # The zoom matrix is the same for every page; build it once.
        zoom = fitz.Matrix(8, 8)
        for page in book:
            page.getPixmap(matrix=zoom).writeImage(
                dirToSave + "/" + str(page.number) + ".jpg")
    finally:
        # Release the file handle even if rendering or writing fails.
        book.close()

    if showPrints:
        print("Converting PDF pages to JPG pages done")
def make_lessonslists(self, classes, filePath, pdffile):
    """Cut a rendered timetable PDF into one image per class.

    Every page of ``pdffile`` is rendered at 2x scale into ``data/<i>.jpg``.
    Each rendered image is then scanned for rows where the "contains a black
    pixel" state flips, and for black pixel columns on a probe row; crops
    between those positions are pasted next to the left-most column and
    saved as ``data/data/<class>.jpg``.

    Args:
        classes: Sequence indexed by a running counter; each entry names one
            output image.
        filePath: Prefix prepended to the names in ``boxOfImages`` when the
            rendered pages are re-opened.
        pdffile: Path of the PDF to render.
    """
    global boxOfImages
    try:
        os.makedirs("data")
    except:
        pass
    doc = None
    file = pdffile
    doc = fitz.open(file)
    for i in range(len(doc)):
        first_page = doc[i]
        image_matrix = fitz.Matrix(fitz.Identity)
        image_matrix.preScale(2, 2)
        pix = first_page.getPixmap(alpha=False, matrix=image_matrix)
        boxOfImages.append(f'{i}.jpg')
        # NOTE(review): writePNG stores PNG data under a .jpg name, and the
        # file goes to data/ while it is read back from filePath below --
        # confirm filePath is expected to be "data/".
        pix.writePNG(f'data/{i}.jpg')
    NUMBEROFCLASS = 0  # running index into ``classes`` across all pages
    for _filename_ in boxOfImages:
        img = Image.open(f"{filePath}{_filename_}")
        pixMap = img.load()
        width, height = img.size
        listTemplates = []  # y positions where the row state flips
        boxTime2 = [False]  # per-pixel flags of the previous row
        FIRSTindent, SECONDindent = self.get_indent(pixMap, width, height)
        for i in range(FIRSTindent, SECONDindent):
            boxTime = []  # True for every pixel of row i that is not pure black
            tok = 0
            for j in range(width):
                if pixMap[j, i] != (0, 0, 0):
                    boxTime.append(True)
                else:
                    tok += 1
                    boxTime.append(False)
            # print(boxTime)
            # Record the row when "row has no black pixel" differs from the
            # previous row.
            if (not all(boxTime2) and all(boxTime)) or (all(boxTime2) and not all(boxTime)):
                listTemplates.append(i)
            boxTime2 = boxTime.copy()
        listTemplates.extend([FIRSTindent, SECONDindent])
        listTemplates.sort()
        try:
            os.makedirs("data/data")
        except:
            pass

        def getY(y):
            # Collect, in ascending order, the x position of the last black
            # pixel of each black run on row y (position 0 is never reported
            # because the truthiness test on variableBorder treats it as unset).
            chek = 0
            border = set()
            variableBorder = None
            for i in range(width):
                if pixMap[i, y] == (0, 0, 0):
                    variableBorder = i
                elif variableBorder:
                    border.add(variableBorder)
            return sorted(list(border))

        BORDERNUM = self.get_num_of_borders(self.get_text_pdf())
        # Consecutive pairs in listTemplates are treated as the top and bottom
        # of one band.
        for i in range(len(listTemplates) // 2):
            try:
                if 1:
                    y0 = listTemplates[i * 2]
                    y1 = listTemplates[i * 2 + 1]
                    # Probe three pixels below the band's top edge.
                    boxBorder = getY(y0 + 3)
                    # Left-most column; it is glued in front of every cut.
                    im0 = img.crop((boxBorder[0], y0, boxBorder[2], y1))
                    for k in range((len(boxBorder) - 3) // BORDERNUM + 1):
                        # print(boxBorder)
                        x0 = boxBorder[k * (BORDERNUM + 1) + 2]
                        x1 = boxBorder[k * (BORDERNUM + 1) + 3 + BORDERNUM]
                        # print(x0, x1)
                        im1 = img.crop((x0, y0, x1 + 1, y1))
                        new_im = Image.new(
                            'RGB', (im0.size[0] + im1.size[0], im0.size[1]))
                        new_im.paste(im0, (0, 0))
                        new_im.paste(im1, (im0.size[0], 0))
                        new_im.save('data/data/' + str(classes[NUMBEROFCLASS]) +
                                    '.jpg')
                        NUMBEROFCLASS += 1
            except Exception:
                # NOTE(review): every failure inside a band (including index
                # errors from the border arithmetic) is silently dropped.
                pass
def _get_annots(
        self,
        annot_image_dir: str = "",
        ocr_api: str = "",
        zoom: int = 4,  # image zoom factor
        run_test: bool = False,  # get 3 annot and 3 pic at most
):
    """Collect the supported annotations of ``self.doc``.

    Rectangle annotations (type 4) are rendered to
    ``<annot_image_dir>/<file name>-<annot id>.png`` and, when ``ocr_api`` is
    given, OCR'd through ``Picture.get_ocr_result``.  All other supported
    annotations contribute their ``content`` info field; types 8-11
    additionally get the text returned by ``self._parse_highlight``.

    Args:
        annot_image_dir: Directory that receives the rendered rectangles.
        ocr_api: OCR backend identifier; empty disables OCR.
        zoom: Zoom factor applied when rendering rectangle annotations.
        run_test: Limit the run to at most three pictures and three other
            annotations.

    Returns:
        A list with one dict per annotation (keys ``type``, ``page``,
        ``content``, ``id``, ``height``, ``color``), or ``None`` when the
        document has no annotations at all.
    """
    if not self.doc.has_annots():
        return

    annot_list = []
    annot_count = 0
    extracted_pic_count = 0
    for page in self.doc.pages():
        if run_test and annot_count > 2 and extracted_pic_count > 2:
            break
        annot_num = 0
        # snake_case spelling, consistent with get_pixmap()/has_annots() used
        # elsewhere in this method; the camelCase alias is deprecated.
        word_list = page.get_text("words")  # list of words on page
        word_list.sort(key=lambda w: (w[3], w[0]))  # ascending y, then x
        for annot in page.annots():
            annot_type = annot.type[0]
            if annot_type not in ANNOT_TYPES:
                continue
            page_num = page.number + 1
            annot_id = f"annot-{page_num}-{annot_num}"
            color = RGB(annot.colors.get("stroke")).to_hex()
            # Top edge of the annotation divided by page.rect[3].
            height = annot.rect[1] / page.rect[3]
            if annot_type == 4:  # rectangle
                if run_test and extracted_pic_count > 2:
                    continue
                pix = page.get_pixmap(
                    annots=False,  # TODO donnot display annots, maybe let user customize this?
                    clip=annot.rect,
                    matrix=fitz.Matrix(zoom, zoom),  # zoom image
                )
                base_name = self.file_name.replace(" ", "-")
                picture_path = os.path.join(annot_image_dir,
                                            f"{base_name}-{annot_id}.png")
                # Pixmap.save() replaces the deprecated writePNG(); the
                # format is taken from the ".png" extension.
                pix.save(picture_path)
                extracted_pic_count += 1
                content = [picture_path]
                if ocr_api:
                    ocr_result = Picture(picture_path).get_ocr_result(ocr_api)
                    content.append(ocr_result)
            else:
                if run_test and annot_count > 2:
                    continue
                content = [annot.info.get("content")]
                if annot_type in [8, 9, 10, 11]:
                    text = self._parse_highlight(annot, word_list)
                    content.append(text)
            annot_list.append({
                "type": annot.type[1],
                "page": page_num,
                "content": content,
                "id": annot_id,
                "height": height,
                "color": color,
            })
            annot_num += 1
            annot_count += 1
    return annot_list
class BBox(IText):
    '''Boundary box with attribute in fitz.Rect type.'''

    # all coordinates are related to un-rotated page in PyMuPDF
    # e.g. Matrix(0.0, 1.0, -1.0, 0.0, 842.0, 0.0)
    ROTATION_MATRIX = fitz.Matrix(0.0)  # rotation angle = 0 degree by default

    @classmethod
    def set_rotation_matrix(cls, rotation_matrix):
        '''Set the class-wide rotation matrix; anything that is not a
        non-empty fitz.Matrix is ignored.'''
        if rotation_matrix and isinstance(rotation_matrix, fitz.Matrix):
            cls.ROTATION_MATRIX = rotation_matrix

    @classmethod
    def pure_rotation_matrix(cls):
        '''Pure rotation matrix used for calculating text direction after rotation,
        i.e. ROTATION_MATRIX with its translation part (e, f) set to zero.'''
        a, b, c, d, e, f = cls.ROTATION_MATRIX
        return fitz.Matrix(a, b, c, d, 0, 0)

    def __init__(self, raw: dict = None):
        ''' Initialize BBox and convert to the real (rotation considered) page coordinate system.'''
        self.bbox = fitz.Rect()

        # NOTE: Any coordinates provided in raw is in original page CS (without considering page rotation).
        if raw is None:
            raw = {}
        if 'bbox' in raw:
            rect = fitz.Rect(raw['bbox']) * BBox.ROTATION_MATRIX
            self.update_bbox(rect)

    def __bool__(self):
        '''Real object when bbox is defined.'''
        return bool(self.bbox)

    def __repr__(self):
        return f'{self.__class__.__name__}({tuple(self.bbox)})'

    def get_expand_bbox(self, dt: float):
        '''Get expanded bbox with margin dt in both x- and y- direction.
        Note this method doesn't change its bbox.'''
        return self.bbox + (-dt, -dt, dt, dt)

    def contains(self, bbox, threshold: float = 1.0):
        '''Whether given bbox is contained in this instance, with margin considered.
        ---
        Args:
          - bbox: BBox to test
          - threshold: minimum ratio "intersection area / area of bbox"
            (rounded to two decimals) that still counts as contained.
        '''
        # it's not practical to set a general threshold to consider the margin, so two steps:
        # - set a coarse but acceptable area threshold,
        # - check the length in main direction strictly
        if not bbox:
            return False

        # A degenerate rect (e.g. a point or a line) is truthy but has zero
        # area; nothing can be measured against it, and dividing by its area
        # would raise ZeroDivisionError.
        area = bbox.bbox.getArea()
        if not area:
            return False

        # A contains B => A & B = B
        intersection = self.bbox & bbox.bbox
        factor = round(intersection.getArea() / area, 2)
        if factor < threshold:
            return False

        # check length along the longer side of this box
        if self.bbox.width >= self.bbox.height:
            return self.bbox.width + constants.MINOR_DIST >= bbox.bbox.width
        else:
            return self.bbox.height + constants.MINOR_DIST >= bbox.bbox.height

    def _has_enough_overlap(self, bbox, idx: int, factor: float):
        '''Whether the extents of the two boxes along axis idx (0: x, 1: y)
        overlap by at least factor times the longer of the two extents.'''
        L1 = self.bbox[idx + 2] - self.bbox[idx]
        L2 = bbox.bbox[idx + 2] - bbox.bbox[idx]
        # L is the length of the combined span, so L1+L2-L is the overlap
        # length (negative when there is a gap between the boxes).
        L = max(self.bbox[idx + 2], bbox.bbox[idx + 2]) - min(self.bbox[idx], bbox.bbox[idx])
        return L1 + L2 - L >= factor * max(L1, L2)

    def vertically_align_with(self, bbox, factor: float = 0.0, text_direction: bool = True):
        ''' Check whether two boxes have enough intersection in vertical direction, i.e. perpendicular to reading direction.
            ---
            Args:
              - bbox: BBox to check with
              - factor: threshold of overlap ratio, the larger it is, the higher probability the two bbox-es are aligned.
              - text_direction: consider text direction or not. True by default, from left to right if False.

            For horizontal text the x-extents L1 and L2 of the two boxes are
            compared, otherwise the y-extents. With L the length of the span
            covering both extents, an enough intersection is defined as:
            ```
            L1+L2-L>=factor*max(L1,L2)
            ```
        '''
        if not bbox or not bool(self):
            return False

        # text direction
        is_horizontal_text = self.is_horizontal_text if text_direction else True
        idx = 0 if is_horizontal_text else 1
        return self._has_enough_overlap(bbox, idx, factor)

    def horizontally_align_with(self, bbox, factor: float = 0.0, text_direction: bool = True):
        ''' Check whether two boxes have enough intersection in horizontal direction, i.e. along the reading direction.
            ---
            Args:
              - bbox: BBox to check with
              - factor: threshold of overlap ratio, the larger it is, the higher probability the two bbox-es are aligned.
              - text_direction: consider text direction or not. True by default, from left to right if False.

            For horizontal text the y-extents L1 and L2 of the two boxes are
            compared, otherwise the x-extents. With L the length of the span
            covering both extents, an enough intersection is defined as:
            ```
            L1+L2-L>=factor*max(L1,L2)
            ```
        '''
        if not bbox or not bool(self):
            return False

        # text direction
        is_horizontal_text = self.is_horizontal_text if text_direction else True
        idx = 1 if is_horizontal_text else 0
        return self._has_enough_overlap(bbox, idx, factor)

    def copy(self):
        '''make a deep copy.'''
        return copy.deepcopy(self)

    def update_bbox(self, rect):
        '''Update current bbox to specified `rect`; coordinates are rounded to one decimal.
            ---
            Args:
              - rect: fitz.rect or raw bbox like (x0, y0, x1, y1) in real page CS (with rotation considered).
        '''
        self.bbox = fitz.Rect([round(x, 1) for x in rect])
        return self

    def union_bbox(self, bbox):
        '''Update current bbox to the union with specified `bbox`.
            ---
            Args:
              - bbox: BBox, the target to get union
        '''
        return self.update_bbox(self.bbox | bbox.bbox)

    def compare(self, bbox, threshold=0.9):
        '''Whether has same type and bbox. Returns a (bool, message) pair;
        the message is empty on success.'''
        if not isinstance(bbox, self.__class__):
            return False, f'Inconsistent type: {self.__class__.__name__} v.s. {bbox.__class__.__name__} (expected)'

        if not get_main_bbox(self.bbox, bbox.bbox, threshold):
            return False, f'Inconsistent bbox: {self.bbox} v.s. {bbox.bbox}(expected)'

        return True, ''

    def store(self):
        '''Store in json format.'''
        return {'bbox': tuple([x for x in self.bbox])}

    def plot(self, page, stroke: tuple = (0, 0, 0), width: float = 0.5, fill: tuple = None):
        '''Plot bbox in PDF page.'''
        page.drawRect(self.bbox, color=stroke, fill=fill, width=width, overlay=False)
doc = fitz.open() # empty new PDF page = doc.newPage() # create page (A4) img = page.newShape() # create shape # ============================================================================= # pencil 1 # ============================================================================= penheight = 100 # thickness of pencil pentip = fitz.Point(100, 150) # first pencil tip here pencil(img, pentip, penheight, True) # pencil points left # ============================================================================= # pencil 2 # ============================================================================= penheight = 20 # now a smaller one pentip = fitz.Point(100, 250) # new pencil tip pencil(img, pentip, penheight, False) # this one points right pentip.x += 10 # insert a little distance text = """Like the ReportLab User Guide does,\nyou may want to use this image, to\nemphasize content, e.g. cautionary\nremarks, notes, examples, etc.""" page.insertText(pentip, text) # insert explanatory text # ============================================================================= # pencil 3 # ============================================================================= # yet another pencil, which we will morph around its tip mat = fitz.Matrix(-150) * fitz.Matrix(0.5, 0.5, 1) # morphing: rotate & shear pentip = fitz.Point(300, 400) # instead of another thickness (40) we could have used a scale matrix pencil(img, pentip, 40, True, morph=(pentip, mat)) img.commit() doc.save("pencil.pdf")
def pdf_filter():
    """Worker loop: apply the selected image filter to every page of a PDF.

    While ``WINDOW_STATUS`` is truthy the loop polls ``FILTER_STATUS`` every
    0.3 s.  Once triggered it

    1. asks for a PDF and renders each page to ``pdf2png/NNNN.png``
       (progress 0-40 %),
    2. applies the filter named by ``FILTER_MODE`` to every file found in
       ``pdf2png/`` except ``0000.png`` (progress 40-70 %),
    3. rebuilds a PDF from the images, restores the original table of
       contents and saves it where the user chooses (progress 70-100 %),
    4. calls ``close_window()``.
    """
    global FILTER_STATUS
    # Wait to be triggered.
    while WINDOW_STATUS:
        if not FILTER_STATUS:
            sleep(0.3)
            continue

        FILTER_STATUS = False
        # Disable the button and show the progress hint.
        bt['state'] = 'disabled'
        progress = 0
        var_01.set('正在处理,请稍候...')
        var_10.set(f'处理进度:{progress}%')

        # --- pdf -> png ---
        try:
            input_pdf = filedialog.askopenfile(title='选择pdf文档').name
        except Exception:
            # No file was chosen (askopenfile returned None) or the dialog
            # failed.  Close the window and go back to the loop condition
            # instead of falling through to code that needs input_pdf.
            close_window()
            continue

        input_doc = fitz.open(input_pdf)
        toc = input_doc.getToC()  # table of contents of the input pdf
        page_sum = input_doc.pageCount
        for i in range(0, page_sum):
            page = input_doc[i]
            zoom = 100 * RESOLUTION  # scaling
            rotate = 0  # no rotation
            trans = fitz.Matrix(zoom/100.0, zoom/100.0).preRotate(rotate)
            pm = page.getPixmap(matrix=trans, alpha=False)
            # Four-digit, zero-padded 1-based page number; longer numbers are
            # kept as they are.
            page_num = '%04d' % (i+1)
            pm.writePNG('pdf2png/%s.png' % page_num)
            progress = int(40 * (i/page_sum))
            var_10.set(f'处理进度:{progress}%')
        input_doc.close()

        # --- filter the png images in place ---
        m = page_sum
        path = ('pdf2png/')
        f = listdir(path)
        for i in f:
            if i == '0000.png':
                continue
            # The context manager releases the source file once the filtered
            # copy has been written.
            with Image.open(path+i) as img:
                if FILTER_MODE == '反色':
                    inv_img = PIL.ImageOps.invert(img)
                elif FILTER_MODE == '灰度':
                    inv_img = PIL.ImageOps.grayscale(img)
                elif FILTER_MODE == '去边':
                    inv_img = PIL.ImageOps.crop(img, border=10)
                elif FILTER_MODE == '增强':
                    inv_img = PIL.ImageOps.autocontrast(img, cutoff=10)
                else:  # FILTER_MODE == '跳阶'
                    inv_img = PIL.ImageOps.posterize(img, 2)
                inv_img.save(path+i)
            m -= 1
            progress = int(40 + 30 * (page_sum-m)/page_sum)
            var_10.set(f'处理进度:{progress}%')

        # --- png -> pdf ---
        n = page_sum
        output_doc = fitz.open()
        for img in sorted(glob.glob('pdf2png/*')):
            imgdoc = fitz.open(img)
            pdfbytes = imgdoc.convertToPDF()
            imgpdf = fitz.open('pdf', pdfbytes)
            output_doc.insertPDF(imgpdf)
            # The page now lives in output_doc; drop the intermediates.
            imgpdf.close()
            imgdoc.close()
            n -= 1
            progress = int(70 + 30 * (page_sum-n)/page_sum)
            var_10.set(f'处理进度:{progress}%')
        var_01.set('处理完成!')

        #output_path = filedialog.askdirectory(title='请选择保存位置')+'/'
        output_path = filedialog.asksaveasfilename(
            title='另存为输出pdf文档',
            defaultextension='.pdf',
            initialfile=f'{FILTER_MODE} - '+input_pdf.split('/')[-1],
            filetypes=[('PDF', '*.pdf')])
        output_doc.setToC(toc)  # copy the original table of contents over
        output_doc.save(output_path)
        output_doc.close()
        close_window()
def image_extract(path):
    """Crop the charts out of every ``*.pdf`` in ``path`` and save them as PNG.

    For each PDF a folder named after the file is created below a fixed output
    root.  Chart titles are collected from the first ten pages (on pages whose
    text contains ``keywords``).  From the third page on, each page is
    searched for the chart marker and the source marker; each matched pair
    defines a clip rectangle that is rendered at 3x zoom and saved under the
    next collected title.  When all pages are done the PDF is copied into the
    folder.  On any exception the file name is printed and the folder is moved
    to an "unfinished" directory below ``path``.

    Args:
        path: Directory that contains the PDFs to process.
    """
    keywords = "........"
    for file in os.listdir(path):
        try:
            if file[-4:] == ".pdf":
                mkpath = 'E:\\项目\\试运行\\201808\\提取图片\\'
                mkpath = mkpath + file[:-4] + '\\'
                mkdir(mkpath)
                doc = fitz.open(os.path.join(path, file))
                page_count = doc.pageCount
                picname_num = 0  # index of the next title to use
                picname_list = []  # chart titles in order of appearance
                # NOTE(review): always reads pages 0..9; a document with fewer
                # than ten pages ends up in the except branch -- confirm.
                for i in range(10):
                    page = doc.loadPage(i)
                    page_text = page.getText()
                    if keywords in page_text:
                        pattern = re.compile(r'图表 [0-9][^\.]*')
                        picname_list1 = pattern.findall(page_text)
                        picname_list.extend(picname_list1)
                # print(picname_list)
                for i in range(2, page_count):
                    # The inner loops below reuse the name i; the iteration of
                    # this loop is unaffected because range() supplies the
                    # next value.
                    # page = doc[i]
                    # links = page.getLinks()
                    page = doc.loadPage(i)
                    page_text = page.getText()
                    pic_1 = page.searchFor("图表")
                    data_1 = page.searchFor("资料来源:")
                    if len(data_1) == 0:
                        data_1 = page.searchFor("来源:")
                    # Remove duplicated hits.
                    pic_2 = deal_repeat(pic_1)
                    data = deal_repeat(data_1)
                    # Remove hits that come from running text rather than
                    # from chart captions.
                    pic = deal_reContext(pic_2, data)
                    if len(pic) > len(data):
                        # More chart markers than source markers: drop the
                        # surplus from the end.
                        pic = pic[:len(data) - len(pic)]
                    elif len(pic) < len(data):
                        # More source markers than chart markers: pad the
                        # front with empty rectangles.
                        for i in range(len(data) - len(pic)):
                            pic.insert(0, fitz.Rect(0, 0, 0, 0))
                    if len(pic) != 0 and len(data) != 0 and len(pic) == len(data):
                        mat = fitz.Matrix(3, 3)  # zoom
                        page_ = page.rect  # page size
                        page_length = page_.x1  # right edge of the page
                        page_width = page_.y1  # bottom edge of the page
                        # words = page.getTextWords()  # get the page text
                        # print(words)
                        pic1 = [[]]
                        data1 = [[]]
                        length = len(pic)  # number of charts
                        # Group the charts into rows: consecutive markers with
                        # the same y0 share a group.
                        for i in range(length):
                            if i < (length - 1):
                                if pic[i].y0 == pic[i + 1].y0:
                                    pic1[-1].append(pic[i])
                                    data1[-1].append(data[i])
                                else:
                                    pic1[-1].append(pic[i])
                                    pic1.append([])
                                    data1[-1].append(data[i])
                                    data1.append([])
                        # NOTE(review): placement inferred -- these two lines
                        # appear to run once after the loop and add the last
                        # chart (index length - 1) to the final group.
                        pic1[-1].append(pic[i])
                        data1[-1].append(data[i])
                        picgroup_num = len(pic1)  # number of chart groups
                        for i in range(picgroup_num):  # handle the charts group by group
                            for j in range(len(pic1[i])):
                                if j < len(pic1[i]) - 1:
                                    # Not the last chart of its row: the clip
                                    # ends just before the next source marker.
                                    clip = fitz.Rect(data1[i][j].x0 - 5, pic1[i][j].y0,
                                                     data1[i][j + 1].x0 - 18, data1[i][j].y1)
                                    pix = page.getPixmap(matrix=mat, clip=clip, alpha=False)
                                    # print(pix)
                                    # Replace characters that are illegal in
                                    # file names.
                                    deal_name = validateTitle(picname_list[picname_num])
                                    fn = deal_name + ".png"
                                    pix.writePNG(os.path.join(mkpath, fn))
                                    picname_num = picname_num + 1
                                else:
                                    # Last chart of its row: the clip extends
                                    # to 49 units before the right page edge.
                                    clip = fitz.Rect(data1[i][j].x0 - 5, pic1[i][j].y0,
                                                     page_length - 49, data1[i][j].y1)
                                    pix = page.getPixmap(matrix=mat, clip=clip, alpha=False)
                                    deal_name = validateTitle(picname_list[picname_num])
                                    fn = deal_name + ".png"
                                    pix.writePNG(os.path.join(mkpath, fn))
                                    picname_num = picname_num + 1
                doc.close()
                shutil.copy(os.path.join(path, file), os.path.join(mkpath, file))
        except:
            # NOTE(review): mkpath is unbound if the very first file fails
            # before it is assigned, and doc is left open on this path.
            print(file)
            mkdir(path+'\\未处理完成')
            shutil.move(mkpath, path+'\\未处理完成')
            continue
class Element(IText):
    '''Boundary box with attribute in fitz.Rect type.'''

    # all coordinates are related to un-rotated page in PyMuPDF
    # e.g. Matrix(0.0, 1.0, -1.0, 0.0, 842.0, 0.0)
    ROTATION_MATRIX = fitz.Matrix(0.0)  # rotation angle = 0 degree by default

    @classmethod
    def set_rotation_matrix(cls, rotation_matrix):
        """Set global rotation matrix.

        The value is ignored unless it is a truthy ``fitz.Matrix``.

        Args:
            rotation_matrix (fitz.Matrix): target matrix
        """
        if rotation_matrix and isinstance(rotation_matrix, fitz.Matrix):
            cls.ROTATION_MATRIX = rotation_matrix

    @classmethod
    def pure_rotation_matrix(cls):
        '''Pure rotation matrix used for calculating text direction after rotation.

        Returns:
            fitz.Matrix: ``ROTATION_MATRIX`` with its translation part set to zero.
        '''
        a, b, c, d, _e, _f = cls.ROTATION_MATRIX
        return fitz.Matrix(a, b, c, d, 0, 0)

    def __init__(self, raw: dict = None, parent=None):
        ''' Initialize Element and convert to the real (rotation considered) page coordinate system.

        Args:
            raw (dict, optional): Source data; only the key ``bbox`` is read here.
            parent (Element, optional): Owning element.
        '''
        self.bbox = fitz.Rect()
        self._parent = parent  # type: Element

        # NOTE: Any coordinates provided in raw is in original page CS
        # (without considering page rotation).
        if 'bbox' in (raw or {}):
            rect = fitz.Rect(raw['bbox']) * Element.ROTATION_MATRIX
            self.update_bbox(rect)

    def __bool__(self):
        '''Real object when bbox is defined.'''
        return bool(self.bbox)

    def __repr__(self):
        return f'{self.__class__.__name__}({tuple(self.bbox)})'

    # ------------------------------------------------
    # parent element
    # ------------------------------------------------
    @property
    def parent(self):
        return self._parent

    @parent.setter
    def parent(self, parent):
        self._parent = parent

    # ------------------------------------------------
    # bbox operations
    # ------------------------------------------------
    def copy(self):
        '''Make a deep copy; the copy has no parent.'''
        # NOTE: can't serialize data because parent is an Object,
        # so set it None in advance.
        parent, self.parent = self._parent, None
        try:
            obj = copy.deepcopy(self)
        finally:
            # always re-attach the parent, even if deepcopy fails
            self._parent = parent
        return obj

    def get_expand_bbox(self, dt: float):
        """Get expanded bbox with margin in both x- and y- direction.

        Args:
            dt (float): Expanding margin.

        Returns:
            fitz.Rect: Expanded bbox.

        .. note::
            This method creates a new bbox, rather than changing the bbox of itself.
        """
        return self.bbox + (-dt, -dt, dt, dt)

    def update_bbox(self, rect):
        '''Update current bbox to specified ``rect``.

        Coordinates are rounded to one decimal place.

        Args:
            rect (fitz.Rect or list): bbox-like ``(x0, y0, x1, y1)`` in real page CS
                (with rotation considered).

        Returns:
            Element: self
        '''
        self.bbox = fitz.Rect([round(x, 1) for x in rect])
        return self

    def union_bbox(self, e):
        """Update current bbox to the union with specified Element.

        Args:
            e (Element): The target to get union

        Returns:
            Element: self
        """
        return self.update_bbox(self.bbox | e.bbox)

    # --------------------------------------------
    # location relationship to other Element instance
    # --------------------------------------------
    def contains(self, e, threshold: float = 1.0):
        """Whether given element is contained in this instance, with margin considered.

        Args:
            e (Element): Target element
            threshold (float, optional): Intersection rate. Defaults to 1.0.
                The larger, the stricter.

        Returns:
            bool: True if ``e`` is (nearly) inside this element; always False
            when the bbox area of ``e`` is zero.
        """
        # NOTE the case bool(e)=True but e.bbox.get_area()=0
        S = e.bbox.get_area()
        if not S:
            return False

        # it's not practical to set a general threshold to consider the margin, so two steps:
        # - set a coarse but acceptable area threshold,
        # - check the length in main direction strictly

        # A contains B => A & B = B
        intersection = self.bbox & e.bbox
        factor = round(intersection.get_area() / S, 2)
        if factor < threshold:
            return False

        # check length along the longer side of this element
        if self.bbox.width >= self.bbox.height:
            return self.bbox.width + constants.MINOR_DIST >= e.bbox.width
        return self.bbox.height + constants.MINOR_DIST >= e.bbox.height

    def get_main_bbox(self, e, threshold: float = 0.95):
        """If the intersection with ``e`` exceeds the threshold, return the union of
        these two elements; else return None.

        Args:
            e (Element): Target element (or any bbox-like accepted by ``fitz.Rect``).
            threshold (float, optional): Intersection rate. Defaults to 0.95.

        Returns:
            fitz.Rect: Union bbox or None.
        """
        bbox_1 = self.bbox
        bbox_2 = e.bbox if hasattr(e, 'bbox') else fitz.Rect(e)

        # areas
        b = bbox_1 & bbox_2
        if not b:
            return None  # no intersection

        a1, a2, a = bbox_1.get_area(), bbox_2.get_area(), b.get_area()

        # Note: if bbox_1 and bbox_2 intersects with only an edge, b is not empty but b.get_area()=0
        # so give a small value when they're intersected but the area is zero
        factor = a / min(a1, a2) if a else 1e-6
        return bbox_1 | bbox_2 if factor >= threshold else None

    def _overlaps_along(self, e, idx: int, factor: float):
        '''Check the overlap of two bbox-es along one axis (``idx`` 0: x, 1: y).

        With ``L1``/``L2`` the extents of both boxes and ``L`` the extent of their
        union along that axis, the overlap ``L1+L2-L`` must reach
        ``factor*min(L1,L2)`` (within a small tolerance).
        '''
        L1 = self.bbox[idx + 2] - self.bbox[idx]
        L2 = e.bbox[idx + 2] - e.bbox[idx]
        L = max(self.bbox[idx + 2], e.bbox[idx + 2]) - min(self.bbox[idx], e.bbox[idx])

        eps = 1e-3  # tolerance
        return L1 + L2 - L + eps >= factor * min(L1, L2)

    def vertically_align_with(self, e, factor: float = 0.0, text_direction: bool = True):
        '''Check whether two Element instances have enough intersection in vertical direction,
        i.e. perpendicular to reading direction.

        Args:
            e (Element): Object to check with
            factor (float, optional): Threshold of overlap ratio, the larger it is, the higher
                probability the two bbox-es are aligned.
            text_direction (bool, optional): Consider text direction or not. True by default,
                from left to right if False.

        Returns:
            bool: True if aligned; False if either element is falsy.

        Examples::

            +--------------+
            |              |
            +--------------+
                   L1
                    +-------------------+
                    |                   |
                    +-------------------+
                             L2

        An enough intersection is defined based on the minimum width of two boxes::

            L1+L2-L>factor*min(L1,L2)
        '''
        if not e or not bool(self):
            return False

        # text direction
        idx = 1 if text_direction and self.is_vertical_text else 0
        return self._overlaps_along(e, idx, factor)

    def horizontally_align_with(self, e, factor: float = 0.0, text_direction: bool = True):
        '''Check whether two Element instances have enough intersection in horizontal direction,
        i.e. along the reading direction.

        Args:
            e (Element): Element to check with
            factor (float, optional): threshold of overlap ratio, the larger it is, the higher
                probability the two bbox-es are aligned.
            text_direction (bool, optional): consider text direction or not. True by default,
                from left to right if False.

        Returns:
            bool: True if aligned; False if either element is falsy.

        Examples::

            +--------------+
            |              | L1  +--------------------+
            +--------------+     |                    | L2
                                 +--------------------+

        An enough intersection is defined based on the minimum width of two boxes::

            L1+L2-L>factor*min(L1,L2)
        '''
        if not e or not bool(self):
            return False

        # text direction
        idx = 0 if text_direction and self.is_vertical_text else 1
        return self._overlaps_along(e, idx, factor)

    def in_same_row(self, e):
        """Check whether in same row/line with specified Element instance.
        With text direction considered.

        Taking horizontal text as an example:

        * yes: the bottom edge of each box is lower than the centerline of the other one;
        * otherwise, not in same row.

        Args:
            e (Element): Target object.

        Returns:
            bool: True if both elements are in the same row; False if ``e`` is
            falsy or the two text directions differ.

        .. note::
            The difference to method ``horizontally_align_with``: they may not in same line,
            though aligned horizontally.
        """
        if not e or self.is_horizontal_text != e.is_horizontal_text:
            return False

        # normal reading direction by default
        idx = 1 if self.is_horizontal_text else 0

        c1 = (self.bbox[idx] + self.bbox[idx + 2]) / 2.0
        c2 = (e.bbox[idx] + e.bbox[idx + 2]) / 2.0
        # Note y direction under PyMuPDF context
        return c1 <= e.bbox[idx + 2] and c2 <= self.bbox[idx + 2]

    # ------------------------------------------------
    # others
    # ------------------------------------------------
    def store(self):
        '''Store properties in raw dict.'''
        return {'bbox': tuple(self.bbox)}

    def plot(self, page, stroke: tuple = (0, 0, 0), width: float = 0.5,
             fill: tuple = None, dashes: str = None):
        '''Plot bbox in PDF page for debug purpose.'''
        page.draw_rect(self.bbox, color=stroke, fill=fill, width=width,
                       dashes=dashes, overlay=False, fill_opacity=0.5)
# Define start / end points of x axis that we want to use as 0 and 2*pi. # They may be oriented in any way. #-------------------------------------------------------------------------- pb = fitz.Point(200, 200) # begin, treated as (0, 0) pe = fitz.Point(400, 100) # end, treated as (2*pi, 0) alfa = img.horizontal_angle(pb, pe) # connection angle towards x-axis rad = abs(pe - pb) # distance of these points pe1 = pb + (rad, 0) # make corresp. horizontal end point # ============================================================================= # first draw a rectangle in which the functions graphs will later appear # ============================================================================= f = abs(pe - pb) * 0.5 / math.pi # represents 1 unit rect = fitz.Rect(pb.x - 5, pb.y - f - 5, pe1.x + 5, pb.y + f + 5) img.drawRect(rect) # draw it morph = (pb, fitz.Matrix(math.degrees(-alfa))) img.finish(fill=yellow, morph=morph) # rotate it around begin point # ============================================================================= # get all points for the sine function # ============================================================================= pntsin = bsinPoints(pb, pe1) # only horizontal axis supported # therefore need rotate result points by angle alfa afterwards points = rot_points(pntsin, pb, alfa) for i in (0, 3, 6, 9): # draw all 4 function segments img.drawBezier(points[i], points[i + 1], points[i + 2], points[i + 3]) img.finish(color=red, width=w, closePath=False) # =============================================================================
def pure_rotation_matrix(cls):
    """Return ``cls.ROTATION_MATRIX`` without its translation part.

    The four leading components are kept and the last two are replaced by
    zero; the result is used for calculating text direction after rotation.
    """
    m11, m12, m21, m22, _dx, _dy = cls.ROTATION_MATRIX
    rotation_only = fitz.Matrix(m11, m12, m21, m22, 0, 0)
    return rotation_only
def getImg(pg_num, bbox):
    """Render page ``pg_num`` of ``doc`` and return the part selected by ``bbox``.

    The box is first converted with ``resize(bbox, sf, eps)``; the page is
    rendered with the scale factor ``sf`` on both axes, loaded as an image
    from its PNG data and cropped to the converted box.
    """
    crop_box = resize(bbox, sf, eps)
    page = doc[pg_num]
    pixmap = page.getPixmap(matrix=fitz.Matrix(sf, sf))
    png_stream = io.BytesIO(pixmap.getPNGData())
    return Image.open(png_stream).crop(crop_box)
"""
if __name__ == "__main__":
    # Demo: create a new PDF with four pages, one shape per page, each drawn
    # into the same 100 x 100 rectangle; the crop box of every page is set
    # relative to that rectangle.
    green = getColor("limegreen")
    red = getColor("red2")
    doc = fitz.open()

    # page 1: heart()
    p = doc.newPage()
    img = p.newShape()
    r = fitz.Rect(100, 100, 200, 200)
    heart(img, r, red)
    img.commit()
    p.setCropBox(r + (10, 10, -10, -15))

    # page 2: clover(), morphed around the centre of r with Matrix(45)
    p = doc.newPage()
    img = p.newShape()
    pnt = r.tl + (r.br - r.tl) * 0.5  # centre point of r
    clover(img, r, green, morph=(pnt, fitz.Matrix(45)))
    img.commit()
    p.setCropBox(r + (5, 5, -5, -5))

    # page 3: diamond(), crop box equal to r
    p = doc.newPage()
    img = p.newShape()
    diamond(img, r, red)
    img.commit()
    p.setCropBox(r)

    # page 4: caro(), morphed around the centre of r with Matrix(45)
    p = doc.newPage()
    img = p.newShape()
    pnt = r.tl + (r.br - r.tl) * 0.5  # centre point of r
    caro(img, r, red, morph=(pnt, fitz.Matrix(45)))
    img.commit()
    p.setCropBox(r + (10, 10, -10, -10))
# https://www.jianshu.com/p/2abe38044446
# pip install PyMuPDF
# Convert the first page of every PDF file in the current directory to a PNG
# file (the files are selected by the glob pattern below).
import fitz
import sys
import glob
import os

pdffile = glob.glob("*.pdf")
# Output names without the extension but with the trailing dot ("a.pdf" -> "a.").
# splitext() is used instead of rstrip("pdf"): rstrip strips a *set of
# characters* and leaves e.g. "X.PDF" untouched, which glob may return on
# case-insensitive file systems.
pngfile = [os.path.splitext(f)[0] + "." for f in pdffile]

zoom = 100  # zoom factor in percent
rotate = 0  # rotation in degrees
trans = fitz.Matrix(zoom / 100.0, zoom / 100.0).preRotate(rotate)

for src, stem in zip(pdffile, pngfile):
    doc = fitz.open(src)
    try:
        page = doc[0]
        pm = page.getPixmap(matrix=trans, alpha=False)
        pm.writePNG(stem + "png")
    finally:
        # do not keep one open file handle per converted PDF
        doc.close()
def _parse_report_number(text, pattern):
    """Return ``(year, number)`` found in OCR ``text``, or None.

    Texts shorter than 10 characters are not searched.
    """
    if len(text) >= 10:
        found = re.search(pattern, text)
        if found:
            return found.group(1), found.group(2)
    return None


def PdftoImage_totext(file_path):
    """Read the report number from the third page of a PDF via OCR.

    Page index 2 is rendered to ``<file stem>.png``, a strip at the top of it
    is written to ``<file stem>_jie.png`` and sent to ``BaiduApi().picture``.
    If no number is recognised, the strip is processed by ``imagetoBig`` and
    the OCR is repeated on its result.

    Args:
        file_path (str): Path of the PDF file.

    Returns:
        ``None`` if the file does not exist or has fewer than three pages;
        otherwise a tuple ``(idn, ok)``: ``idn`` is the formatted report
        number and ``ok`` is True when a number was recognised, else
        ``(None, False)``.  Both PNG files are removed only on success.
    """
    if not os.path.exists(file_path):
        return None

    re1 = r'[\w\W]+(\d{4})[\w\W]+(G\d{5})号$'
    file_path_1 = file_path[:file_path.rfind(".")]
    png_name = file_path_1 + ".png"
    png_name2 = file_path_1 + "_jie.png"

    pdf = fitz.open(file_path)
    try:
        if pdf.pageCount <= 2:
            return None
        # remove leftovers of a previous run
        for stale in (png_name, png_name2):
            if os.path.exists(stale):
                os.remove(stale)
        rotate = 0
        # Zoom factor per axis.  Without zoom the image size is 792x612
        # (dpi=96); 1.33333333 --> 1056x816, 2 --> 1584x1224.
        zoom_x = 1.33333333
        zoom_y = 1.33333333
        mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
        png = pdf[2].getPixmap(matrix=mat, alpha=False)
        png.writePNG(png_name)
    finally:
        # the rendered PNG is on disk; the document is no longer needed
        pdf.close()

    pngg = cv2.imread(png_name)
    # NOTE(review): shape[0] is the number of rows and shape[1] the number of
    # columns, so these two names look swapped.  The slice below is kept
    # exactly as it was: rows [0, 0.35 * columns), columns [0, rows).
    kuan = pngg.shape[0]
    chang = int(pngg.shape[1] * 0.35)
    print(pngg.shape)
    cropped = pngg[0:chang, 0:kuan]
    cv2.imwrite(png_name2, cropped)

    # first attempt: OCR of the cropped strip
    text = BaiduApi().picture(png_name2)
    parsed = _parse_report_number(text, re1)

    if parsed is None:
        # second attempt: OCR of the image produced by imagetoBig()
        t, fp = imagetoBig(png_name2)
        if t:
            text = BaiduApi().picture(fp)
            print(text)
            shutil.rmtree(fp[:fp.rfind("\\")])
            parsed = _parse_report_number(text, re1)

    if parsed is None:
        # Nothing recognised.  Previously this ended in a TypeError while
        # concatenating None into the report number.
        return None, False

    year, num = parsed
    os.remove(png_name2)
    os.remove(png_name)
    idn = "沪信衡估报字第G" + year + "-" + num[1:] + "号"
    return idn, True
def shapes_from_stream(doc: fitz.Document, page: fitz.Page):
    ''' Get rectangle shapes, e.g. highlight, underline, table borders, from page source contents.
        ---
        Args:
        - doc: fitz.Document representing the pdf file
        - page: fitz.Page, current page

        Returns:
        - (strokes, fills): `strokes` collects the results of `_stroke_path()`,
          `fills` the results of `_fill_rect_path()`

        The page source is represented as contents of stream object. For example,
        ```
            /P<</MCID 0>> BDC
            ...
            1 0 0 1 90.0240021 590.380005 cm
            ...
            1 1 0 rg # or 0 g
            ...
            285.17 500.11 193.97 13.44 re f*
            ...
            214 320 m
            249 322 l
            ...
            EMC
        ```
        where,
        - `cm` specify a coordinate system transformation,
           here (0,0) translates to (90.0240021 590.380005)
        - `q`/`Q` save/restores graphic status
        - `rg` / `g` specify color mode: rgb / grey
        - `re`, `f` or `f*`: fill rectangle path with pre-defined color
        - `m` (move to) and `l` (line to) defines a path

        In this case,
        - a rectangle with:
            - fill color is yellow (1,1,0)
            - lower left corner: (285.17 500.11)
            - width: 193.97
            - height: 13.44
        - a line from (214, 320) to (249, 322)

        Read more:
        - https://github.com/pymupdf/PyMuPDF/issues/263
        - https://github.com/pymupdf/PyMuPDF/issues/225
        - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdf_reference_archive/pdf_reference_1-7.pdf
    '''
    # Each object in PDF has a cross-reference number (xref):
    # - to get its source contents: `doc.xrefObject()` or low level API `doc._getXrefString()`;
    #   but for stream objects, only the non-stream part is returned
    # - to get the stream data: `doc.xrefStream(xref)` or low level API `doc._getXrefStream(xref)`
    # - the xref for a page object itself: `page.xref`
    # - all stream xref contained in one page: `page.getContents()`
    # - combine all stream object contents together: `page.readContents()` with PyMuPDF>=1.17.0
    #
    # Clean contents first:
    # syntactically correct, standardize and pretty print the contents stream
    page.cleanContents()
    xref_stream = page.readContents().decode(encoding="ISO-8859-1")

    # transformation matrix for coordinate system conversion from pdf to fitz
    matrix = page.transformationMatrix

    # Graphic States: working CS is coincident with the absolute origin (0, 0)
    # Refer to PDF reference v1.7 4.2.3 Transformation Matrices
    #                        | a b 0 |
    # [a, b, c, d, e, f] =>  | c d 0 |
    #                        | e f 1 |
    WCS = fitz.Matrix(0.0)  # identity matrix

    # Graphics color:
    # - color space: PDF Reference Section 4.5 Color Spaces
    # NOTE: it should have to calculate color value under arbitrary color space, but it's really
    # hard work for now. So, consider device color space only like DeviceGray, DeviceRGB,
    # DeviceCMYK, and set black for all others.
    device_space = True
    color_spaces = _check_device_cs(doc, page)

    # - stroking color
    Wcs = utils.RGB_value((0.0, 0.0, 0.0))
    # - filling color
    Wcf = utils.RGB_value((0.0, 0.0, 0.0))

    # Stroke width
    Wd = 0.0

    # Graphics state stack: `q` pushes the working state, `Q` pops it.
    # q/Q pairs may be nested, so a single saved copy is not sufficient.
    gs_stack = []

    # In addition to lines, rectangles are also processed with border path
    paths = []  # a list of path, each path is a list of points

    # Check line by line
    # Cleaned by `page.cleanContents()`, operator and operand are aligned in a same line;
    # otherwise, have to check stream contents word by word (line always changes)
    lines = xref_stream.splitlines()
    strokes, fills = [], []
    for line in lines:

        words = line.split()
        if not words:
            continue

        op = words[-1]  # operator always at the end after page.cleanContents()

        # -----------------------------------------------
        # Color Operators: PDF Reference Table 4.24
        # -----------------------------------------------
        # - set color space:
        #   color_space_name cs  # specify color space
        #   c1 c2 ... SC/SCN     # components under defined color space
        if op.upper() == 'CS':
            Wcs = utils.RGB_value((0.0, 0.0, 0.0))
            Wcf = utils.RGB_value((0.0, 0.0, 0.0))

            # Consider normal device cs only
            device_space = color_spaces.get(words[0], False)

        # - set color: color components under specified color space
        elif op.upper() == 'SC':  # c1 c2 ... cn SC
            c = _RGB_from_color_components(words[0:-1], device_space)
            # nonstroking color
            if op == 'sc':
                Wcf = c
            # stroking color
            else:
                Wcs = c

        # - set color: color components under specified color space
        elif op.upper() == 'SCN':  # c1 c2 ... cn [name] SC
            if utils.is_number(words[-2]):
                c = _RGB_from_color_components(words[0:-1], device_space)
            else:
                c = _RGB_from_color_components(words[0:-2], device_space)

            # nonstroking color
            if op == 'scn':
                Wcf = c
            # stroking color
            else:
                Wcs = c

        # - DeviceGray space, equal to:
        #   /DeviceGray cs
        #   c sc
        elif op.upper() == 'G':  # 0 g
            g = float(words[0])
            # nonstroking color, i.e. filling color here
            if op == 'g':
                Wcf = utils.RGB_value((g, g, g))
            # stroking color
            else:
                Wcs = utils.RGB_value((g, g, g))

        # - DeviceRGB space
        elif op.upper() == 'RG':  # 1 1 0 rg
            r, g, b = map(float, words[0:-1])
            # nonstroking color
            if op == 'rg':
                Wcf = utils.RGB_value((r, g, b))
            # stroking color
            else:
                Wcs = utils.RGB_value((r, g, b))

        # - DeviceCMYK space
        elif op.upper() == 'K':  # c m y k K
            c, m, y, k = map(float, words[0:-1])
            # nonstroking color
            if op == 'k':
                Wcf = utils.CMYK_to_RGB(c, m, y, k, cmyk_scale=1.0)
            # stroking color
            else:
                Wcs = utils.CMYK_to_RGB(c, m, y, k, cmyk_scale=1.0)

        # -----------------------------------------------
        # Graphics State Operators: PDF References Table 4.7
        # -----------------------------------------------
        # CS transformation: a b c d e f cm, e.g.
        # 0.05 0 0 -0.05 0 792 cm
        # refer to PDF Reference 4.2.2 Common Transformations for detail
        elif op == 'cm':
            # update working CS
            components = list(map(float, words[0:-1]))
            Mt = fitz.Matrix(*components)
            WCS = Mt * WCS  # M' = Mt x M

        # stroke width
        elif op == 'w':  # 0.5 w
            Wd = float(words[0])

        # save or restore graphics state:
        # only consider transformation, color, color space and line width here
        elif op == 'q':  # save
            gs_stack.append((fitz.Matrix(WCS), Wcf, Wcs, Wd, device_space))

        elif op == 'Q':  # restore
            # an unbalanced `Q` (nothing saved) leaves the state untouched
            if gs_stack:
                WCS, Wcf, Wcs, Wd, device_space = gs_stack.pop()

        # -----------------------------------------------
        # Path Construction Operators: PDF References Table 4.9
        # -----------------------------------------------
        # rectangle block:
        # x y w h re is equivalent to
        # x   y   m
        # x+w y   l
        # x+w y+h l
        # x   y+h l
        # h          # close the path
        elif op == 're':
            # ATTENTION:
            # top/bottom, left/right is relative to the positive direction of CS,
            # while a reverse direction may be performed, so be careful when calculating
            # the corner points.
            # Coordinates in the transformed PDF CS:
            #   y1 +----------+
            #      |          | h
            #   y0 +----w-----+
            #      x0        x1
            #

            # (x, y, w, h) before this line
            x0, y0, w, h = map(float, words[0:-1])
            paths.append([
                (x0, y0),
                (x0 + w, y0),
                (x0 + w, y0 + h),
                (x0, y0 + h),
                (x0, y0),
            ])

        # path: m -> move to point to start a path
        elif op == 'm':  # x y m
            x0, y0 = map(float, words[0:-1])
            paths.append([(x0, y0)])

        # path: l -> straight line to point
        elif op == 'l':  # x y l
            x0, y0 = map(float, words[0:-1])
            if paths:
                paths[-1].append((x0, y0))
            else:
                # no path has been started: begin one here instead of failing
                paths.append([(x0, y0)])

        # close the path
        elif op == 'h':
            for path in paths:
                _close_path(path)

        # -----------------------------------------------
        # Path-painting Operators: PDF Reference Table 4.10
        # -----------------------------------------------
        # close and stroke the path
        elif op.upper() == 'S':
            # close
            if op == 's':
                for path in paths:
                    _close_path(path)

            # stroke path
            for path in paths:
                res = _stroke_path(path, WCS, Wcs, Wd, matrix)
                strokes.extend(res)

            # reset path
            paths = []

        # fill the path
        # (tested on the operator, so surrounding whitespace in the line does not matter)
        elif op in ('f', 'F', 'f*'):
            for path in paths:
                # close the path implicitly
                _close_path(path)

                # fill path
                res = _fill_rect_path(path, WCS, Wcf, matrix)
                fills.append(res)

            # reset path
            paths = []

        # close, fill and stroke the path
        elif op.upper() in ('B', 'B*'):
            for path in paths:
                # close path
                _close_path(path)

                # fill path
                res = _fill_rect_path(path, WCS, Wcf, matrix)
                fills.append(res)

                # stroke path
                res = _stroke_path(path, WCS, Wcs, Wd, matrix)
                strokes.extend(res)

            # reset path
            paths = []

        # TODO: clip the path
        elif op in ('W', 'W*'):
            pass

        # end the path without stroking or filling
        elif op == 'n':
            paths = []

    return strokes, fills
def export_data_pdf(sender, instance, created, **kwargs):
    """Import the data of an uploaded PDF for ``instance`` and notify by e-mail.

    The signature suggests a Django signal receiver (presumably ``post_save``
    - confirm at the place where it is connected).  Steps, in order:

    1. Read the address area of the PDF with camelot, dump it to a temporary
       CSV file, parse the address parts and store them via
       ``Adress.objects.update_or_create``.
    2. Read the main table with camelot, dump it to a temporary JSON file and
       re-create the ``ExplicationListItem`` rows of ``instance``.
    3. Sum the area columns and store them in ``ExplicationSquareTotal``.
    4. Render the second page of the PDF (index 1) to a PNG, trim its border
       and register it in ``OrderImage``.
    5. Send a notification e-mail unless ``instance.is_emailed`` is set.

    Args:
        sender: Not used in this function.
        instance: Object providing ``uploaded_pdf``, ``new_source``,
            ``order_number``, ``adress``, ``pk`` and ``is_emailed``.
        created: Not read; the name is rebound by ``update_or_create`` results.
        **kwargs: Not used in this function.

    Returns:
        None (the ``return print(...)`` statements return the result of ``print``).
    """
    uploaded_pdf_url = instance.uploaded_pdf.path

    # ---- 1. address block --------------------------------------------------
    # address_string = camelot.read_pdf(uploaded_pdf_url, flavor='stream', row_tol=9, table_areas=['50,720,780,680'])
    address_string = camelot.read_pdf(uploaded_pdf_url, flavor='stream', row_tol=9, table_areas=['50,720,400,680'])
    csv_address_f = os.path.join(settings.MEDIA_ROOT, 'temp', 'csv_address.csv')
    csv = address_string[0].to_csv(csv_address_f)
    # NOTE(review): csv_address_f is a non-empty path string, so this test is always true
    if csv_address_f:
        with open(csv_address_f, 'r', encoding='utf-8') as f:
            row_read = CSV.reader(f)
            for row in row_read:
                # first cell, split into "<label>:<value>"
                pp = (row[0].strip(" '")).split(":")
                if 'Квартира' in pp[0]:
                    # apartment row: take the first standalone number
                    hh = [int(s) for s in pp[0].split() if s.isdigit()]
                    global_appartment = hh[0]
                    print(global_appartment)
                else:
                    # any other row is treated as the address line
                    print('pp')
                    print(pp)
                    print(pp[1])
                    adress_item_list = pp[1].split(",")
                    print('adress_item_list')
                    print(adress_item_list)
                    i = 0
                    total_list = []
                    # build [kind, value] pairs: a word starting with an upper-case
                    # letter or a digit is the value, any other word is the kind
                    for item in adress_item_list:
                        k = ''
                        v = ''
                        total_val = adress_item_list[i].split()
                        for word in total_val:
                            if word[0].isupper() or word[0].isdigit():
                                v = word
                            else:
                                k = word
                        # the second address part is stored as a whole
                        if i == 1:
                            v = adress_item_list[i].strip()
                        total_list.append([k, v])
                        i+=1
                    print('total_list')
                    print(total_list)
                    total_dict = {
                        'city_type':'',
                        'city_name': '',
                        'street_type':'',
                        'street':'',
                        'micro_rayon':'',
                        'house_number':'',
                        'corpus_number':'',
                        'litera':''
                    }
                    # map every recognised kind onto the fields passed as `defaults` below
                    for item in total_list:
                        if item[0] == 'город' or item[0] == 'поселение' or item[0] == 'деревня' or item[0] == 'поселок':
                            k = 'city_type'
                            v = item[0]
                            total_dict.update({k: v})
                            k = 'city_name'
                            v = item[1]
                            total_dict.update({k: v})
                        elif item[0] == 'улица' or item[0] == 'ул.' or item[0] == 'переулок' or item[0] == 'пер.' or item[0] == 'проспект' or item[0] == 'просп.' or item[0] == 'проезд' or item[0] == 'шоссе' or item[0] == 'площадь' or item[0] == 'наб.' or item[0] == 'набережная' or item[0] == 'бульвар' or item[0] == 'бул.':
                            k = 'street_type'
                            v = item[0]
                            total_dict.update({k: v})
                            k = 'street'
                            v = item[1]
                            total_dict.update({k: v})
                        elif item[0] == 'микрорайон':
                            k = 'micro_rayon'
                            v = item[1]
                            total_dict.update({k: v})
                        elif item[0] == 'дом':
                            k = 'house_number'
                            v = item[1]
                            total_dict.update({k: v})
                        elif item[0] == 'корпус':
                            k = 'corpus_number'
                            v = item[1]
                            total_dict.update({k: v})
                        # NOTE(review): the only branch comparing against a Latin
                        # keyword - confirm that 'litera' is intended here
                        elif item[0] == 'litera':
                            k = 'litera'
                            v = item[1]
                            total_dict.update({k: v})
                        # NOTE(review): 'build_number' is not among the keys
                        # initialised in total_dict above
                        elif item[0] == 'строение':
                            k = 'build_number'
                            v = item[1]
                            total_dict.update({k: v})
                    print('total_dict')
                    print(total_dict)
                    # func(adress_item_list[i])
                    v, created = Adress.objects.update_or_create(
                        order=instance,
                        defaults=total_dict,
                    )

    # ---- 2. main table -----------------------------------------------------
    if instance.new_source:
        tables = camelot.read_pdf(uploaded_pdf_url)
    else:
        tables = camelot.read_pdf(uploaded_pdf_url, flavor='stream', row_tol=9, table_areas=['50,680,780,100'])
    print(tables[0])
    print(tables[0].parsing_report)
    print(tables[0].df)
    json_table = os.path.join(settings.MEDIA_ROOT, 'temp', 'json_table.json')
    json_table2 = os.path.join(settings.MEDIA_ROOT, 'temp', 'json_table2.csv')
    json = tables[0].to_json(path=json_table)
    # json1 = tables[0].to_csv(path=json_table2, orient = 'records', lines = 'True')
    # NOTE(review): json_table is a non-empty path string, so this test is always true
    if json_table:
        with open(json_table, 'r') as f:
            print("------------data-------------------")
            data = JSON.load(f)
            # replace all previously imported rows of this order
            ExplicationListItem.objects.filter(order_list=instance).delete()
            i = 0
            data.pop()  # the last table row is dropped
            for x in data:
                # rows 0..2 are skipped, as are rows with an empty column '9'
                if i > 2 and x['9'] != '':
                    # NOTE(review): global_appartment is bound only if the CSV above
                    # had a row containing 'Квартира'; otherwise this raises NameError
                    ExplicationListItem.objects.create(
                        order_list = instance,
                        floor_number = x['0'],
                        appart_number_item = x['1'],
                        appart_name_item = x['2'],
                        square_total_item = x['3'],
                        square_general_item = x['4'],
                        square_advanced_item = x['5'],
                        square_logdi_item = x['6'],
                        square_balkon_item = x['7'],
                        square_another_item = x['8'],
                        height_item = x['9'],
                        apart_number = global_appartment
                    )
                i += 1

    # ---- 3. totals ---------------------------------------------------------
    explication_list_items = ExplicationListItem.objects.filter(order_list=instance)

    def string_to_correct_decimal(string):
        # strips blanks and quotes, then accepts ',' as the decimal separator
        print('string')
        print(string)
        result = Decimal(string.strip(" '").replace(',', '.'))
        return result

    square_total_sum = Decimal("0.0")
    square_general_sum = Decimal("0.0")
    square_advanced_sum = Decimal("0.0")
    square_logdi_sum = Decimal("0.0")
    square_balkon_sum = Decimal("0.0")
    square_another_sum = Decimal("0.0")
    square_total_sum_global = Decimal("0.0")  # not read afterwards
    for items in explication_list_items:
        # empty values are skipped
        if items.square_total_item:
            square_total_sum += string_to_correct_decimal(items.square_total_item)
            # print(square_total_sum)
        if items.square_general_item:
            square_general_sum += string_to_correct_decimal(items.square_general_item)
            # print(square_general_sum)
        if items.square_advanced_item:
            square_advanced_sum += string_to_correct_decimal(items.square_advanced_item)
            # print(square_advanced_sum)
        if items.square_logdi_item:
            square_logdi_sum += string_to_correct_decimal(items.square_logdi_item)
            # print(square_logdi_sum)
        if items.square_balkon_item:
            square_balkon_sum += string_to_correct_decimal(items.square_balkon_item)
            # print(square_balkon_sum)
        if items.square_another_item:
            square_another_sum += string_to_correct_decimal(items.square_another_item)
            # print(square_another_sum)

    v, created = ExplicationSquareTotal.objects.update_or_create(
        order=instance,
        defaults={
            'square_total_summa':square_total_sum,
            'square_general_summa':square_general_sum,
            'square_advanced_summa':square_advanced_sum,
            'square_logdi_summa':square_logdi_sum,
            'square_balkon_summa':square_balkon_sum,
            'square_another_summa':square_another_sum,
            'square_total_summa_global': square_total_sum + square_logdi_sum + square_balkon_sum + square_another_sum
        },
    )

    # ---- 4. scheme image ---------------------------------------------------
    path_img_name = 'schema_' + str(instance.order_number) + '.png'
    path_img_scheme = os.path.join(settings.MEDIA_ROOT, 'uploaded_pdf/schemes/', path_img_name)
    path_img_scheme_bd = "uploaded_pdf/schemes/%s" % path_img_name  # value stored in the database
    current_site = Site.objects.get_current().domain
    path_full_pdf = "https://%s%s" % (current_site, reverse_lazy('pdftrans:order_full_pdf_view_n', kwargs={'pk': instance.pk}))
    doc = fitz.open(uploaded_pdf_url)
    i = 0
    for page in doc:  # iterate through the pages
        # only the second page (index 1) is processed
        if i == 1:
            zoom = 2  # zoom factor
            mat = fitz.Matrix(zoom, zoom)
            pix = page.getPixmap(matrix = mat, alpha = False)  # render page to an image
            pix.writePNG(path_img_scheme)  # store image as a PNG

            def trim(im):
                # crop away the border that has the colour of the top-left pixel;
                # NOTE(review): returns None when getbbox() finds nothing, which
                # would make im.save() below fail
                bg = Image.new(im.mode, im.size, im.getpixel((0,0)))
                diff = ImageChops.difference(im, bg)
                diff = ImageChops.add(diff, diff, 2.0, -100)
                bbox = diff.getbbox()
                if bbox:
                    return im.crop(bbox)

            im = Image.open(path_img_scheme)
            im = trim(im)
            im.save(path_img_scheme)
            # drop the old image rows, then register the new one
            order_img_clear = OrderImage.objects.filter(order_fk=instance).delete()
            v, created = OrderImage.objects.update_or_create(
                order_fk=instance,
                defaults={'order_image': path_img_scheme_bd,
                          'fullpdf_url_staff': path_full_pdf
                          }
            )
        i+=1

    # ---- 5. notification e-mail --------------------------------------------
    # sending email method -=send_mail=-
    path_full_pdf_for_email = str(path_full_pdf)
    path_full_link_site = 'https://' + str(current_site) + '/get-order-info/' + str(instance.pk)
    context = {
        'order_number': instance.order_number,
        'link_doc': path_full_pdf,
        'link_site': path_full_link_site,
    }
    # the subject starts with the transliterated address
    str_for_traslit = unidecode(str(instance.adress))
    subject = str_for_traslit + ' - Док №: ' + str(instance.order_number)
    from_email = '*****@*****.**'
    to = '*****@*****.**'
    html_content = render_to_string('mail_templates/mail_template_btiorder.html', context)
    text_content = strip_tags(html_content)  # plain-text alternative of the HTML body
    msg = EmailMultiAlternatives(subject, text_content, from_email, [to])
    msg.attach_alternative(html_content, "text/html")
    if instance.is_emailed == False:
        if subject and html_content and from_email:
            try:
                if msg.send():
                    # update() on a queryset is used instead of instance.save()
                    # (presumably to avoid re-triggering this handler - confirm)
                    Order.objects.filter(pk=instance.pk).update(is_emailed=True)
                    instance.is_emailed = True
            except BadHeaderError:
                return print('Invalid header found in email %s' % instance.pk)
            return print('email is sended %s' % instance.pk)
        else:
            return print('Make sure all fields are entered and valid %s' % instance.pk)
    pass
import fitz

# Render the first page of ./image/1.pdf at double size and store it as
# ./image/1.png.
pdf = fitz.open('./image/1.pdf')
for pg in range(1):
    page = pdf[pg]
    rotate = 0
    zoom_x = zoom_y = 2.0
    # zoom on both axes, followed by the (zero degree) rotation
    trans = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
    pm = page.getPixmap(matrix=trans, alpha=False)
    pm.writePNG('./image/%s.png' % 1)
# 'FreeText' annotation in rectangle r showing text t1, rotated by 90 degrees
annot = page.addFreetextAnnot(r, t1, rotate = 90)
annot.setBorder(border)
annot.update(fontsize = 10, border_color=red, fill_color=gold, text_color=blue)
print_descr(annot.rect, annot)
r = annot.rect + displ  # rectangle for the next annotation, offset by displ
print("added 'FreeText'")

# 'Text' annotation at the top-left corner of r
annot = page.addTextAnnot(r.tl, t1)
print_descr(annot.rect, annot)
print("added 'Sticky Note'")

pos = annot.rect.tl + displ.tl
# first insert 4 rotated text lines
page.insertText(pos, text, fontsize=11, morph = (pos, fitz.Matrix(-15)))
# now search text to get the quads
rl = page.searchFor("text in line", quads = True)
# the first four hits are used; fewer hits raise IndexError here
r0 = rl[0]
r1 = rl[1]
r2 = rl[2]
r3 = rl[3]
annot = page.addHighlightAnnot(r0)
# need to convert quad to rect for descriptive text ...
print_descr(r0.rect, annot)
print("added 'HighLight'")
annot = page.addStrikeoutAnnot(r1)
print_descr(r1.rect, annot)
print("added 'StrikeOut'")
def __init__(self, parent, filename):
    """Create the viewer dialog for ``filename`` and display its first page.

    The document is opened with ``fitz.open``.  If it needs a password,
    ``decrypt_doc`` is called; if the document is still encrypted
    afterwards the dialog is destroyed and construction stops.  Otherwise
    the navigation controls and the page image area are created, placed in
    sizers and bound to their event handlers.

    Args:
        parent: parent window passed on to ``wx.Dialog.__init__``.
        filename: document to open; also appended to the dialog title.
    """
    default_pos = wx.DefaultPosition
    default_size = wx.DefaultSize
    display_zoom = 1.2  # zoom factor of display
    wx.Dialog.__init__(
        self,
        parent,
        id=wx.ID_ANY,
        title=u"Display with PyMuPDF: ",
        pos=default_pos,
        size=default_size,
        style=wx.CAPTION | wx.CLOSE_BOX | wx.DEFAULT_DIALOG_STYLE,
    )

    # ------------------------------------------------------------------
    # Display an icon top left of the dialog, append filename to title.
    # ------------------------------------------------------------------
    if do_icon:
        self.SetIcon(ico_pdf.img.GetIcon())
    self.SetTitle(self.Title + filename)
    self.SetBackgroundColour(wx.Colour(240, 230, 140))

    # ------------------------------------------------------------------
    # Open the document with MuPDF when the dialog gets created.
    # ------------------------------------------------------------------
    self.doc = fitz.open(filename)
    if self.doc.needsPass:  # password protected
        self.decrypt_doc()
    if self.doc.isEncrypted:  # quit if we cannot decrypt
        self.Destroy()
        return

    # Per-document display state.
    self.dl_array = [0] * len(self.doc)
    self.last_page = -1      # last page displayed
    self.link_rects = []     # link rectangles
    self.link_texts = []     # link texts
    self.current_idx = -1    # entry of found rectangle
    self.current_lnks = []

    # Constant zoom matrix for page images: 1.2 in both directions,
    # i.e. images are enlarged by 20%.
    self.matrix = fitz.Matrix(display_zoom, display_zoom)

    # Overall dialog structure:
    #   szr10 (main sizer for the whole dialog - vertical orientation)
    #   +-> szr20 (sizer for buttons etc. - horizontal orientation)
    #       +-> button forward
    #       +-> button backward
    #       +-> field for page number to jump to
    #       +-> field displaying total pages
    #   +-> PDF image area

    # Forward / backward buttons.
    self.ButtonNext = wx.Button(
        self, wx.ID_ANY, u"forw", default_pos, default_size, wx.BU_EXACTFIT)
    self.ButtonPrevious = wx.Button(
        self, wx.ID_ANY, u"back", default_pos, default_size, wx.BU_EXACTFIT)

    # Text field for entering a target page.  wx.TE_PROCESS_ENTER is
    # required to get data entry fired as events.
    self.TextToPage = wx.TextCtrl(
        self, wx.ID_ANY, u"1", default_pos, wx.Size(40, -1),
        wx.TE_RIGHT | wx.TE_PROCESS_ENTER)

    # Total page count, "show links" switch and paper format label.
    self.statPageMax = wx.StaticText(
        self, wx.ID_ANY, "of " + str(len(self.doc)) + " pages.",
        default_pos, default_size, 0)
    self.links = wx.CheckBox(
        self, wx.ID_ANY, u"show links", default_pos, default_size,
        wx.ALIGN_LEFT)
    self.links.Value = True
    self.paperform = wx.StaticText(
        self, wx.ID_ANY, "", default_pos, default_size, 0)

    # Area for page images; page 1 is rendered for the primary display.
    self.PDFimage = wx.StaticBitmap(
        self, wx.ID_ANY, self.pdf_show(1), default_pos, default_size,
        style=0)

    # ------------------------------------------------------------------
    # The main sizer of the dialog.
    # ------------------------------------------------------------------
    self.szr10 = wx.BoxSizer(wx.VERTICAL)
    top_row = wx.BoxSizer(wx.HORIZONTAL)
    centred = wx.ALIGN_CENTER_VERTICAL | wx.ALL
    for control, flags in (
        (self.ButtonNext, wx.ALL),
        (self.ButtonPrevious, wx.ALL),
        (self.TextToPage, wx.ALL),
        (self.statPageMax, centred),
        (self.links, centred),
        (self.paperform, centred),
    ):
        top_row.Add(control, 0, flags, 5)

    # Top row ready; stack it above the page image.
    self.szr10.Add(top_row, 0, wx.EXPAND, 5)
    self.szr10.Add(self.PDFimage, 0, wx.ALL, 5)

    # Main sizer ready - request final size & layout, then centre on screen.
    self.szr10.Fit(self)
    self.SetSizer(self.szr10)
    self.Layout()
    self.Centre(wx.BOTH)

    # Bind buttons and fields to event handlers.
    for control, event, handler in (
        (self.ButtonNext, wx.EVT_BUTTON, self.NextPage),
        (self.ButtonPrevious, wx.EVT_BUTTON, self.PreviousPage),
        (self.TextToPage, wx.EVT_TEXT_ENTER, self.GotoPage),
        (self.PDFimage, wx.EVT_MOUSEWHEEL, self.OnMouseWheel),
        (self.PDFimage, wx.EVT_MOTION, self.move_mouse),
        (self.PDFimage, wx.EVT_LEFT_DOWN, self.OnLeftDown),
    ):
        control.Bind(event, handler)
# ------------------------------------------------------------------------------
# Main program
# ------------------------------------------------------------------------------
if __name__ == "__main__":

    def _rect_center(rect):
        """Return the midpoint between the rectangle's tl and br corners."""
        return rect.tl + (rect.br - rect.tl) * 0.5

    one_column_right = (100, 0, 100, 0)
    one_row_down = (0, 150, 0, 150)

    green = getColor("limegreen")
    red = getColor("red2")

    doc = fitz.open()
    p = doc.newPage()
    img = p.newShape()

    # First row: heart, clover, diamond, caro - each 100 units further right.
    r = fitz.Rect(100, 100, 200, 200)
    heart(img, r, red)

    r1 = r + one_column_right
    p = _rect_center(r1)
    clover(img, r1, green, morph=(p, fitz.Matrix(45)))

    r2 = r1 + one_column_right
    diamond(img, r2, red)

    r3 = r2 + one_column_right
    p = _rect_center(r3)
    caro(img, r3, red, morph=(p, fitz.Matrix(45)))

    # Second row: arrow below the heart, "don't enter" sign next to it.
    r4 = r + one_row_down
    p = _rect_center(r4)
    arrow(img, r4, red, morph=(p, fitz.Matrix(0)))

    r5 = r4 + (r4.width, 0, r4.width, 0)
    dontenter(img, r5, morph=None)
def _escape_table_text(table):
    """Return a copy of ``table`` with every ``str`` cell unicode-escaped."""
    return table.applymap(
        lambda x: x.encode('unicode_escape').decode('utf-8')
        if isinstance(x, str) else x)


def _export_table(writer, table, table_id, imgheight, imgwidth):
    """Write ``table`` to sheet ``table_id`` and return its summary dict."""
    table.to_excel(writer, sheet_name=table_id)
    # Column labels are stringified only after the Excel export, before the
    # HTML rendering.
    table.columns = table.columns.astype(str)
    return {
        'table_id': table_id,
        'table_html': table.to_html(index=False, index_names=False),
        'imgheight': imgheight,
        'imgwidth': imgwidth,
        'remove': False,
    }


def save_to_excel(savefile, parsed_results, show_boundries, filename, bbox_all):
    """Export parsed tables to an Excel workbook and describe each of them.

    Every table is written to its own sheet named ``p<page+1>_id_<idx+1>``.
    Page images live in ``<userfolder>/scanned/p<page>.jpg`` where
    ``userfolder`` is the parent of the directory containing ``savefile``.

    Args:
        savefile: path of the workbook to create.
        parsed_results: mapping of page number to a sequence of tables
            (presumably pandas DataFrames - they must offer ``applymap``,
            ``to_excel``, ``to_html`` and ``columns``).
        show_boundries: ``'yes'`` renders missing page images from the PDF
            and calls ``pdf_boundry_img`` per table; ``'ocr'`` reads the
            existing page image and copies the already prepared
            ``result_scanned/<table_id>.jpg``.  Any other value exports
            nothing.
        filename: PDF path, only opened in ``'yes'`` mode.
        bbox_all: per-page, per-table boxes handed to ``pdf_boundry_img``
            (``'yes'`` mode only).

    Returns:
        A list with one dict per exported table holding ``table_id``,
        ``table_html``, ``imgheight``, ``imgwidth`` and ``remove`` (False).
    """
    tables = []
    userfolder = os.path.split(os.path.dirname(savefile))[0]
    scanned_dir = os.path.join(userfolder, 'scanned')

    if show_boundries == 'yes':
        doc = fitz.open(filename)
        try:
            with pd.ExcelWriter(savefile) as writer:
                for page_key in parsed_results:
                    page_n = int(page_key)
                    imgname = os.path.join(scanned_dir, 'p%d.jpg' % page_n)
                    if not os.path.exists(imgname):
                        page = doc.loadPage(page_n)
                        pix = page.getPixmap(
                            matrix=fitz.Matrix(1, 1), alpha=False)
                        # NOTE(review): PNG data is stored under a .jpg name.
                        pix.writePNG(imgname)
                    # Look the tables up with the original key so that keys
                    # which are not already ints (e.g. '3') still resolve.
                    for idx, table in enumerate(parsed_results[page_key]):
                        table_id = 'p' + str(page_n + 1) + '_id_' + str(idx + 1)
                        t0 = datetime.now()
                        imgheight, imgwidth = pdf_boundry_img(
                            imgname, table_id, bbox_all[page_n][idx])
                        print('get pdf boundry img for %s in' % table_id,
                              datetime.now() - t0)
                        tables.append(_export_table(
                            writer, _escape_table_text(table), table_id,
                            imgheight, imgwidth))
        finally:
            # Release the PDF even when rendering or exporting fails.
            doc.close()
    elif show_boundries == 'ocr':
        with pd.ExcelWriter(savefile) as writer:
            for page_n in parsed_results:
                page_shape = None  # (height, width), read at most once per page
                for idx, table in enumerate(parsed_results[page_n]):
                    if table is None:
                        continue
                    table = _escape_table_text(table)
                    table_id = 'p' + str(page_n + 1) + '_id_' + str(idx + 1)
                    if page_shape is None:
                        img = plt.imread(
                            os.path.join(scanned_dir, 'p%d.jpg' % page_n))
                        page_shape = (img.shape[0], img.shape[1])
                    imgheight, imgwidth = page_shape
                    copyfile(
                        os.path.join(userfolder, 'result_scanned',
                                     '%s.jpg' % table_id),
                        os.path.join(userfolder, table_id + '.jpg'))
                    tables.append(_export_table(
                        writer, table, table_id, imgheight, imgwidth))
    return tables
def _remove_if_present(path):
    """Delete ``path``; a file that cannot be removed is left behind silently."""
    try:
        os.remove(path)
    except OSError:
        pass


def _scan_page_for_qr(img, pageNo, qr_file, reference_hash):
    """Search one rendered page for a decodable QR code.

    A fixed region of ``img`` is cropped with a shrinking top margin
    (``delta``), resized to a range of target sizes (``delta_max_x`` /
    ``delta_max_y``), sharpened and written to ``qr_file``.  A crop is only
    handed to the decoder when its average hash is closer than 20 to
    ``reference_hash``.

    Returns:
        ``(data, [pageNo, y, h, delta_max_x, delta_max_y, delta])`` for the
        first crop that decodes, or ``None`` when nothing decodes or when
        an untrimmed crop (``delta == 0``) already fails the hash check.
    """
    sharpen_filter = np.array([[-1, -1, -1], [-1, 10, -1], [-1, -1, -1]])
    x = 2225
    w = 90
    for delta_max_x in range(0, 200, 20):
        max_x = 900 - delta_max_x
        for delta_max_y in range(0, 200, 20):
            max_y = 700 + delta_max_y
            for delta in range(0, 50, 1):
                y = 205 + delta
                h = 115 - delta
                resized_cropped = img[y:y + h, x:x + w]
                resized_cropped = cv2.resize(resized_cropped, (max_x, max_y))
                resized_cropped = cv2.filter2D(
                    resized_cropped, -1, sharpen_filter)
                cv2.imwrite(qr_file, resized_cropped)
                # Close the image again so the temp file can be deleted later.
                with Image.open(qr_file) as candidate:
                    candidate_hash = imagehash.average_hash(candidate)
                if (candidate_hash - reference_hash) < 20:
                    barcodes = decode(resized_cropped,
                                      symbols=[ZBarSymbol.QRCODE])
                    if len(barcodes) > 0:
                        return barcodes[0].data, [
                            pageNo, y, h, delta_max_x, delta_max_y, delta
                        ]
                elif delta == 0:
                    # The untrimmed crop does not resemble the example at
                    # all: give up on this page.
                    return None
    return None


def get_qr_data(filename):
    """Extract the QR code payload of every page of a PDF.

    Each page is rendered with zoom factor 4, saved as ``<pageNo>.jpg`` in
    the current directory, read back with OpenCV and scanned by
    ``_scan_page_for_qr``.  The temporary page image and the
    ``QR_Code<pageNo>.png`` crop are removed after each page.  The search
    parameters of all hits are pretty-printed at the end.

    Args:
        filename: path of the PDF to open with ``fitz.open``.

    Returns:
        A list of ``[pageNo, data]`` pairs, one per page where a QR code
        was decoded (``data`` is the decoder's raw ``.data`` value).
    """
    return_data = []
    found_data = []
    print("Reading given %s pdf file" % filename)
    doc = fitz.open(filename)
    try:
        zoom = 4  # to increase the resolution
        mat = fitz.Matrix(zoom, zoom)
        noOfPages = doc.pageCount
        print("Given pdf has %d pages" % noOfPages)
        print("Starting page by page qr code detection and extraction")
        reference_hash = None  # hash of the example image, computed once
        for pageNo in range(noOfPages):
            print("--------------Page Number %d------------------" % pageNo)
            page = doc.loadPage(pageNo)
            pix = page.getPixmap(matrix=mat)
            # NOTE(review): PNG data is stored under a .jpg name.
            output = str(pageNo) + '.jpg'
            qr_file = "QR_Code%d.png" % pageNo
            print("Saving image for current page")
            pix.writePNG(output)
            try:
                print("Reading saved image for current page")
                img = cv2.imread(output)
                print("Detecting QR code...")
                if reference_hash is None:
                    with Image.open('QR_Code_example.png') as example:
                        reference_hash = imagehash.average_hash(example)
                hit = _scan_page_for_qr(img, pageNo, qr_file, reference_hash)
            finally:
                # Remove each temp file independently so one failure does
                # not keep the other file around.
                _remove_if_present(qr_file)
                _remove_if_present(output)
            if hit is not None:
                data, search_params = hit
                print("Qr code detected and extracted!")
                print(" %d qr code is" % pageNo, data)
                return_data.append([pageNo, data])
                found_data.append(search_params)
    finally:
        doc.close()
    pprint(found_data)
    return return_data