Exemple #1
0
import fitz, os

# Demo script: write one sentence with two TextWriter objects (blue and red)
# so that colour, font and font size alternate, then rotate the joint text.

# Helper that resolves a file name relative to this script's directory.
thisdir = lambda f: os.path.join(os.path.dirname(__file__), f)
thisfile = os.path.abspath(__file__)
# Output PDF path: this script's path with ".py" replaced by ".pdf".
outfile = thisfile.replace(".py", ".pdf")

font1 = fitz.Font("helv")
font2 = fitz.Font("tiro")
doc = fitz.open()  # no argument: a new, empty document
page = doc.newPage()
point = fitz.Point(50, 72)  # where the first text piece starts
# Single-argument Matrix: presumably a rotation by -20 degrees (PyMuPDF's
# Matrix(degree) form) -- used below as the morph matrix around `point`.
matrix = fitz.Matrix(-20)

# One writer per colour; both cover the full page rectangle.
wrt1 = fitz.TextWriter(page.rect, color=(0, 0, 1))
wrt2 = fitz.TextWriter(page.rect, color=(1, 0, 0))

# Each append() call starts at the position returned by the previous call
# (`last`), so the pieces join into one line although they alternate between
# the two writers. The first element of the returned pair is not needed here.
_, last = wrt1.append(point, "This text changes color,", font1, 11)
_, last = wrt2.append(last, " font and fontsize", font2, 18)
_, last = wrt1.append(last, " several", font1, 11)
_, last = wrt2.append(last, " times!", font2, 24)

# output both text writers on current page in arbitrary sequence
wrt1.writeText(page, morph=(point, matrix))  # using the same morph parameter
wrt2.writeText(page, morph=(point, matrix))  # also preserves the joint text.

# make a new page
page = doc.newPage()
rect = wrt1.textRect | wrt2.textRect  # join rect of blue and red text
# make new rectangle from it, rotated by 90 degrees
nrect = fitz.Rect(
    rect.tl,  # same top-left, but width and height exchanged
Exemple #2
0
def pyMuPDF_fitz(pdfPath, imagePath, imageName):
    startTime_pdf2img = datetime.datetime.now()  # 开始时间

    print("图片输出路径为:" + imagePath)
    print("正在转化,请稍后...")

    file_name = os.path.basename(pdfPath)  # 获取文件名字

    name = file_name.split('.')[0]  # 去除后缀,获取名字

    pdfDoc = fitz.open(pdfPath)
    for pg in range(pdfDoc.pageCount):
        page = pdfDoc[pg]
        rotate = int(0)
        # 每个尺寸的缩放系数为1.3,这将为我们生成分辨率提高2.6的图像。
        # 此处若是不做设置,默认图片大小为:792X612, dpi=96
        #zoom_x = 1.33333333  # (1.33333333-->1056x816)   (2-->1584x1224)
        #zoom_y = 1.33333333
        # 缩放系数都为2,分辨率提高4倍
        #zoom_x = 2  # (1.33333333-->1056x816)   (2-->1584x1224)
        #zoom_y = 2

        #zoom_x = 1  # (1.33333333-->1056x816)   (2-->1584x1224)
        #zoom_y = 1

        zoom_x = 1.111111  # (1.33333333-->1056x816)   (2-->1584x1224)
        zoom_y = 1.111111

        mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
        pix = page.getPixmap(matrix=mat, alpha=False)

        if not os.path.exists(imagePath):  # 判断存放图片的文件夹是否存在
            os.makedirs(imagePath)  # 若图片文件夹不存在就创建

        # 页码从0开始,水印都加一
        pix.writePNG(imagePath + '/' + imageName +
                     '-%s.JPEG' % str(int(pg) + 1))  # 将图片写入指定的文件夹内

        # 为图片添加水印
        imageInfo = PIL.Image.open(imagePath + '/' + imageName +
                                   '-%s.JPEG' % str(int(pg) + 1))

        # 文字水印

        fontOne = ImageFont.truetype("‪C:\Windows\Fonts\simfang.ttf",
                                     20)  # 本地字体文件

        draw = ImageDraw.Draw(imageInfo)
        #print(imageInfo.size)

        # 深灰色fill=(96, 96, 96)
        # 开始添加文字水印

        draw.text((imageInfo.size[0] // 2 - 25, imageInfo.size[1] // 2 + 445),
                  u"第%s页" % str(int(pg) + 1),
                  fill=(0, 0, 0),
                  font=fontOne)

        # imageInfo.show()  #展示图片

        # 添加图片水印
        logo = PIL.Image.open("C:\\logo.png")

        w, h = logo.size  # 获取图像宽高
        logo.thumbnail((800, 800))  # 图像缩小1/2,图像缩放

        layer = PIL.Image.new('RGBA', imageInfo.size, (255, 255, 255, 0))

        # 添加图片水印

        layer.paste(logo, (imageInfo.size[0] - logo.size[0] + 70,
                           imageInfo.size[1] - logo.size[1] - 20))

        imageInfo = PIL.Image.composite(layer, imageInfo, layer)

        #imageInfo.paste(logo, (0, 0))    # 将一张图片覆盖到另外一张图片上

        imageInfo.save(imagePath + '/' + imageName +
                       '-%s.JPEG' % str(int(pg) + 1),
                       quality=1000,
                       optimize=True,
                       progressive=True)
        #imageInfo.save(imagePath + '/' + imageName + '-%s.JPEG' % str(int(pg) + 1), quality=200, optimize=True)

    endTime_pdf2img = datetime.datetime.now()  # 结束时间
    print('pdf2img转换时间:', (endTime_pdf2img - startTime_pdf2img).seconds)

    # 拼接为长图
    '''imgs = [Image.open(imagePath + '\\' + fn) for fn in listdir(imagePath) if fn.endswith(".png")]  # 打开路径下的所有图片
Exemple #3
0
async def media_loop():
    """Endlessly pick a random book page, render it, post it, then sleep.

    Each cycle of the outer loop:

    1. picks a random entry of ``bookPaths`` until one opens with
       ``fitz.open``;
    2. picks a random page number that avoids roughly the first and last 4%
       of the book (and never page 0) and loads it;
    3. renders the page at 2x zoom and calls ``book_in_history``; the pick is
       accepted only when that call returns a truthy value, otherwise a new
       book/page is drawn;
    4. writes the PNG next to the book, passes it to ``poststatus`` and, on
       success, deletes the file again;
    5. sleeps five minutes with ``asyncio.sleep``.

    ``running`` is never cleared, so the coroutine only ends when it is
    cancelled or an unhandled exception propagates.
    """
    running = True

    while running:

        originalBookAndPage = False
        while not originalBookAndPage:

            validBookAndPageOpened = False
            while not validBookAndPageOpened:

                validBookFound = False
                while not validBookFound:
                    # NOTE(review): randint() includes its upper bound, so
                    # bookCount must be len(bookPaths) - 1 or this can raise
                    # IndexError -- confirm where bookCount is defined.
                    randomBook = random.randint(0, bookCount)

                    bookToPrint = bookPaths[randomBook]
                    print(f"Book selected: {bookToPrint}")

                    try:
                        book = fitz.open(bookToPrint)
                        totalPages = book.pageCount
                        print(f"Book loaded. Page count: {totalPages}")
                        validBookFound = True
                    except RuntimeError as e:
                        print(f"Error with book somewhere. {e}")

                if totalPages < 2:
                    # Page 0 is never posted, so a usable book needs at least
                    # one further page. Release the document and draw again.
                    print(f"Error loading page: book has {totalPages} page(s)")
                    book.close()
                    continue

                # books that don't have valid table of contents page selection process
                randomLow = int(totalPages * 0.04)

                # Valid page indices are 0 .. totalPages - 1. Clamp both ends
                # so short books (randomLow == 0) cannot yield page 0 or the
                # out-of-range index totalPages.
                firstPage = max(randomLow, 1)
                lastPage = min(totalPages - randomLow, totalPages - 1)
                pageNumber = random.randint(firstPage, lastPage)

                try:
                    page = book.loadPage(pageNumber)
                    print(f"Page loaded: {pageNumber}")
                    validBookAndPageOpened = True
                except (RuntimeError, ValueError) as e:
                    print(f"Error loading page: {pageNumber}. {e}")
                    # This book is abandoned; do not leak its file handle.
                    book.close()

            try:
                zoom = 2
                matrix = fitz.Matrix(zoom, zoom)
                picOfPage = page.getPixmap(matrix=matrix)
                output = f"{bookToPrint}-{pageNumber}.png"

                # NOTE(review): the pick is rejected when book_in_history()
                # is falsy. The helper's name suggests the opposite meaning;
                # behaviour is kept as-is -- confirm against its definition.
                if not book_in_history(bookToPrint, pageNumber):
                    print(
                        f"{Fore.RED}\nCOLLISION:\nPage {pageNumber} of {bookToPrint}\n"
                    )
                else:
                    originalBookAndPage = True
            except Exception as e:
                # Best effort: report and draw another page, but let
                # KeyboardInterrupt / SystemExit through (a bare except hid them).
                print(f"{Fore.RED}: Error in checking for collision")
                print(f"{Fore.RED}{e!r}")
            finally:
                # The pixmap does not depend on the document any more.
                book.close()

        picOfPage.writePNG(output)
        status = poststatus(output)
        if status:
            print(f"{Fore.GREEN}\nSUCCESS\nUploaded: {output}\n")

            try:
                os.remove(output)
                print("File removed succesfully")
            except OSError as e:
                print(f"Error removing file: {e}")
        else:
            print(f"{Fore.RED}\nFailure updating status\n")

        print(f"{Fore.BLUE} VALID EVERYTHING")

        minutes = 5

        # Named so it does not shadow the ``time`` module inside this function.
        sleep_seconds = minutes * 60
        print(f"{Fore.YELLOW}{Style.BRIGHT}Sleep for {minutes}m")

        await asyncio.sleep(sleep_seconds)
Exemple #4
0
def pdf_ocr(pdf_name, path, method_get_image, words_per_line, ocr_method,
            client_id, client_secret):
    """Run OCR over the PDF at ``path`` and write a PDF or .docx result.

    The output name is ``'output/' + pdf_name[:-4]`` plus ``"_output.pdf"``
    when the module-level flag ``if_image_to_pdf_or_hocr`` is truthy, and
    ``".docx"`` otherwise.

    Args:
        pdf_name: File name of the PDF; its last four characters are dropped
            to build the output name.
        path: Path handed to ``fitz.open``.
        method_get_image: ``'正则式'`` scans the xref table for image objects
            with regular expressions; any other value renders every page to
            an image.
        words_per_line: Forwarded to the OCR helpers in page mode.
        ocr_method: ``"online"`` requests a Baidu access token first. In page
            mode ``"local"`` uses ``ocr_tesseract``; every other value uses
            ``ocr_baidu``.
        client_id: API key for the token request.
        client_secret: Secret key for the token request.

    Raises:
        ValueError: In page mode when PDF output is combined with
            ``ocr_method == "online"``.
    """
    # Open the PDF.
    doc = fitz.open(path)
    access_token = None
    if ocr_method == "online":
        # client_id is the API key (AK) and client_secret the secret key (SK)
        # obtained from the provider's website.
        host = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id={}&client_secret={}'.format(
            client_id, client_secret)
        response = requests.get(host)
        if response:
            # The token is a credential, so it is no longer echoed to stdout.
            access_token = response.json()["access_token"]
    if if_image_to_pdf_or_hocr:
        doc_name = 'output/' + pdf_name[:-4] + "_output.pdf"
    else:
        doc_name = 'output/' + pdf_name[:-4] + ".docx"

    # Image extraction via regular expressions.
    if method_get_image == '正则式':
        # Patterns that identify image XObjects in an object definition.
        checkXO = r"/Type(?= */XObject)"
        checkIM = r"/Subtype(?= */Image)"
        lenXREF = doc._getXrefLength()

        # Print basic information about the PDF.
        print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), lenXREF - 1))
        pix = None
        # Walk over every object in the xref table.
        for i in range(1, lenXREF):
            # Source text of the object definition.
            text = doc._getXrefString(i)
            isXObject = re.search(checkXO, text)
            isImage = re.search(checkIM, text)
            # Skip anything that is not both an XObject and an image.
            if not isXObject or not isImage:
                continue
            # Build a pixmap from the object number.
            pix = fitz.Pixmap(doc, i)

        if pix is None:
            # No image object found: previously this fell through to a
            # NameError on ``pix``.
            return

        # NOTE(review): as before, only the LAST image found is passed to
        # OCR and the OCR result is discarded -- confirm this is intended.
        image = pix.getImageData()
        ocr_tesseract(image)
        # Release the pixmap.
        pix = None
    else:
        if if_image_to_pdf_or_hocr:
            if ocr_method == "online":
                raise ValueError('Unsupported filetype for online\
                                  API: {}'.format('pdf'))
            output_doc = fitz.open()
        else:
            output_doc = docx.Document()
            output_doc.styles['Normal'].font.name = u'等线'
            output_doc.styles['Normal']._element.rPr.rFonts\
                      .set(qn('w:eastAsia'), u'等线')
        time_start = time.time()
        page_count = doc.pageCount

        # These two values handle a paragraph that continues across a page
        # break. They must live OUTSIDE the page loop: resetting them per
        # page (as before) threw away what paras2doc returned for the
        # previous page.
        last_para = ""  # last paragraph of the previous page
        last_para_ended = True  # False when that paragraph was cut by the break

        for pg in range(page_count):
            elapsed = time.time() - time_start
            eta = (page_count - pg) * elapsed / pg if pg > 0 else 0
            print('[%d/%d] Elapsed: %s, ETA: %s' %
                  (pg + 1, page_count, fmt_time(elapsed), fmt_time(eta)))
            page = doc[pg]
            rotate = 0
            # A zoom factor of 2 per axis yields an image with 4x the pixels.
            zoom_x = 2
            zoom_y = 2
            trans = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
            pix = page.getPixmap(matrix=trans, alpha=False)
            image = pix.getImageData()

            if ocr_method == "local":
                if if_image_to_pdf_or_hocr:
                    temp_doc = ocr_tesseract(image)
                    output_doc.insertPDF(temp_doc, 0, 0)
                    output_doc.save(doc_name)
                else:
                    final_para_ended, paras = ocr_tesseract(
                        image, words_per_line)
                    args = [
                        paras, output_doc, doc_name, last_para,
                        final_para_ended, last_para_ended
                    ]
                    last_para, last_para_ended = paras2doc(*args)
            else:
                final_para_ended, paras = ocr_baidu(image, words_per_line,
                                                    access_token)
                args = [
                    paras, output_doc, doc_name, last_para, final_para_ended,
                    last_para_ended
                ]
                last_para, last_para_ended = paras2doc(*args)
                # Pause between requests to the online service.
                time.sleep(1)
            # Release the pixmap.
            pix = None
Exemple #5
0
        spend = time.time() - start_time
        print("Parsing: ", N, " seconds: ", spend)
    
    # Load Documents and meta information
    currentDocument = annotations[N]
    id = currentDocument['ID']
    headerFields = currentDocument["HeaderElements"]
    ListElements = currentDocument["ListElements"]

    # load document
    doc = fitz.open(invoices_path + str(id) + ".pdf")
    page = doc[0]

    # convert to png
    zoom = 4
    mat = fitz.Matrix(zoom, zoom)
    image = page.getPixmap(matrix = mat, alpha = False) 
    w = image.width
    h = image.height
    image.writePNG(png_path + str(id) + ".png")

    # Create training data
    SegmentImage = np.zeros((height-3, width-1, num_classes), dtype=int)
    BoxMasks = np.zeros((height-3, width-1, 2*num_anchors), dtype=int)
    BoxCoords = np.zeros((height-3, width-1, 4*num_anchors), dtype=int)

    # Fill Seg Mapper with background
    for x in range(width-1):
        for y in range(height-3):
            SegmentImage[y,x] = segMapper("Background")
    
Exemple #6
0
    total_count = missed_count = 0
    # iterate over identified words
    for word in chain(split_words, standard_words):
        wid = word['wid']
        total_count += 1
        chi_map, eng_map = [], []
        # get all bboxs associated with text
        for group in word['groups']:
            bids, bbox, pg_num = group['bid'], group['bbox'], group['pg']
            # obtain image of bbox
            bbox = resize(bbox, sf, [eps, 0, eps, 0])
            if pg_num in pg_cache:
                img = pg_cache[pg_num]
            else:
                pix = doc[pg_num].getPixmap(matrix=fitz.Matrix(sf, sf))
                img = Image.open(io.BytesIO(pix.getPNGData()))
                pg_cache[pg_num] = img
            block_img = img.crop(bbox)
            # perform OCR on bbox
            if len(bids) == 1:  # single line
                chi_map.extend(classify(block_img, chi_detector))
                eng_map.extend(classify(block_img, eng_detector))
            else:  # multi-line
                chi_map.extend(classify(block_img, chi_detector_multi))
                eng_map.extend(classify(block_img, eng_detector_multi))
        # use custom heuristics to obtain text (if high enough probability)
        chi_pinyin, chars, char_confs = getChinese(chi_map)
        eng_pinyin, nums = getEnglish(eng_map)
        guess = refine(chars, char_confs, chi_pinyin, eng_pinyin, bids[0],
                       letters)
Exemple #7
0
def create_exam_and_insert_QR(
    name,
    code,
    length,
    versions,
    test,
    page_versions,
    qr_file,
    test_mode=False,
    test_folder=None,
):
    """Create one exam PDF and stamp every page with its label and QR codes.

    The exam is assembled from the PDFs stored in ``sourceVersions``: page
    ``p`` is copied from the source whose version number is
    ``page_versions[p]``. Every page then receives a "test.page" stamp at the
    top centre, a "do not write" triangle carrying ``name`` in the staple
    corner, and three of its four QR codes (the staple corner gets none).

    Arguments:
        name {Str} -- Document name, printed inside the "do not write" area.
        code {Str} -- 6 digit distinguished code for the document
                      (not used in this function body).
        length {int} -- Length of the document, i.e. its number of pages.
        versions {int} -- Number of versions of this document.
        test {int} -- Test number, printed zero-padded to 4 digits.
        page_versions {dict} -- (int, int) dictionary giving the version of
                                each page (1-based) for this test.
        qr_file {dict} -- dict(int: dict(int: Str)); for each page (1-based)
                          a dictionary with the QR code image path for each
                          corner 1..4.

    Keyword Arguments:
        test_mode {bool} -- Testing flag (not used in this function body).
                            (default: {False})
        test_folder {Str} -- Where to place generated test files (not used in
                             this function body). (default: {None})

    Returns:
        fitz.Document -- The assembled and stamped exam.

    Raises:
        AssertionError -- When a text box reports that its text did not fit.
    """

    # Source documents keyed by version number. They are only needed while
    # pages are copied, so they are closed again right after that step
    # instead of staying open for every exam that gets built.
    version_paths_for_pages = {}
    # Create test pdf as "exam"
    exam = fitz.open()
    try:
        for version_index in range(1, versions + 1):
            version_paths_for_pages[version_index] = fitz.open(
                "sourceVersions/version{}.pdf".format(version_index))

        # Insert the relevant page-versions into this pdf.
        for page_index in range(1, length + 1):
            # Pymupdf starts pagecounts from 0 rather than 1. So offset things.
            exam.insertPDF(
                version_paths_for_pages[page_versions[page_index]],
                from_page=page_index - 1,
                to_page=page_index - 1,
                start_at=-1,
            )
    finally:
        for source in version_paths_for_pages.values():
            source.close()

    # Get page width and height (the first page is taken as representative)
    page_width = exam[0].bound().width
    page_height = exam[0].bound().height

    # put marks at top left/right so students don't write near
    # staple or near where client will stamp marks

    # create two "do not write" (DNW) rectangles accordingly with TL (top left) and TR (top right)
    rDNW_TL = fitz.Rect(15, 15, 90, 90)
    rDNW_TR = fitz.Rect(page_width - 90, 15, page_width - 15, 90)

    # 70x70 page-corner boxes for the QR codes
    # TL: Top Left, TR: Top Right, BL: Bottom Left, BR: Bottom Right
    rTL = fitz.Rect(15, 20, 85, 90)
    rTR = fitz.Rect(page_width - 85, 20, page_width - 15, 90)
    rBL = fitz.Rect(15, page_height - 90, 85, page_height - 20)
    rBR = fitz.Rect(page_width - 85, page_height - 90, page_width - 15,
                    page_height - 20)

    # test/page stamp box in top-centre of page; identical for every page.
    # Rectangle size hacked by hand. TODO: do this more algorithmically.
    stamp_rect = fitz.Rect(page_width // 2 - 40, 20, page_width // 2 + 40, 44)

    for page_index in range(length):
        page = exam[page_index]
        # page_index is 0-based, so "even" here means pages 1, 3, 5, ...
        is_even = page_index % 2 == 0

        text = "{}.{}".format(str(test).zfill(4), str(page_index + 1).zfill(2))
        insertion_confirmed = page.insertTextbox(
            stamp_rect,
            text,
            fontsize=18,
            color=[0, 0, 0],
            fontname="Helvetica",
            fontfile=None,
            align=1,
        )
        page.drawRect(stamp_rect, color=[0, 0, 0])
        # Kept as an assert so callers see the same exception type as before.
        assert insertion_confirmed > 0

        # stamp DNW near staple: even/odd pages different
        # Top Left for even pages, Top Right for odd pages
        rDNW = rDNW_TL if is_even else rDNW_TR
        shape = page.newShape()
        shape.drawLine(rDNW.top_left, rDNW.top_right)
        if is_even:
            shape.drawLine(rDNW.top_right, rDNW.bottom_left)
        else:
            shape.drawLine(rDNW.top_right, rDNW.bottom_right)
        shape.finish(width=0.5, color=[0, 0, 0], fill=[0.75, 0.75, 0.75])
        shape.commit()
        if is_even:
            # offset by trial-and-error, could be improved
            rDNW = rDNW + (19, 19, 19, 19)
        else:
            rDNW = rDNW + (-19, 19, -19, 19)
        # Rotate the name by +/-45 degrees around the centre of the box.
        mat = fitz.Matrix(45 if is_even else -45)
        pivot = rDNW.tr / 2 + rDNW.bl / 2
        morph = (pivot, mat)
        insertion_confirmed = page.insertTextbox(
            rDNW,
            name,
            fontsize=8,
            fontname="Helvetica",
            fontfile=None,
            align=1,
            morph=morph,
        )
        assert (insertion_confirmed >
                0), "Text didn't fit: shortname too long?  or font issue/bug?"

        # Grab the tpv QRcodes for current page and put them on the pdf
        # Remember that we only add 3 of the 4 QR codes for each page since
        # we always have a corner section for staples and such
        qr_code = {}
        for corner_index in range(1, 5):
            qr_code[corner_index] = fitz.Pixmap(qr_file[page_index +
                                                        1][corner_index])
        if is_even:
            page.insertImage(rTR, pixmap=qr_code[1], overlay=True)
            page.insertImage(rBR, pixmap=qr_code[4], overlay=True)
            page.insertImage(rBL, pixmap=qr_code[3], overlay=True)
        else:
            page.insertImage(rTL, pixmap=qr_code[2], overlay=True)
            page.insertImage(rBL, pixmap=qr_code[3], overlay=True)
            page.insertImage(rBR, pixmap=qr_code[4], overlay=True)

    return exam
Exemple #8
0
            pix_2 = fitz.Pixmap(barcode_sign)
            page.insertImage(rect_sign, pixmap=pix_2, overlay=True)
            page.insertImage(rect_mark, pixmap=pix_1, overlay=True)
            doc.save(output_file)

import fitz
# Pass 1: open every file below D:\py\new_CR and render its pages to an image
# named after the first 12 characters of the file name.
for root, dirs, files in walk('D:\\py\\new_CR'):
    for f in files:
        # NOTE(review): every file is opened, not only PDFs -- confirm the
        # folder holds nothing else (images from an earlier run would match).
        doc = fitz.open(join(root, f))
        width, height = fitz.PaperSize('a4')  # values are not used below
        totaling = doc.pageCount
        for pg in range(totaling):
            page = doc[pg]
            zoom = int(100)
            rotate = int(0)
            # Scale factor 100/60 on both axes, no rotation.
            trans = fitz.Matrix(zoom / 60, zoom / 60).preRotate(rotate)
            pm = page.getPixmap(matrix=trans, alpha=False)
            # NOTE(review): the target name depends only on the file name, so
            # each page overwrites the previous one and only the last page
            # remains. writePNG is also used with a ".jpg" name -- presumably
            # the file then holds PNG data; verify.
            lurl = 'D:\\py\\new_CR\\{}.jpg'.format(str(f)[0:12])
            pm.writePNG(lurl)
        doc.close()

# Pass 2: wrap every file whose name contains ".jpg" into a one-image PDF
# saved under D:\py\new_CR\final.
for root, dirs, files in walk('D:\\py\\new_CR'):
    for f in files:
        if '.jpg' in f:
            doc_pdf = fitz.open()  # new, empty output PDF
            imgdoc = fitz.open(join(root, f))
            pdfbytes = imgdoc.convertToPDF()
            imgpdf = fitz.open('pdf', pdfbytes)
            doc_pdf.insertPDF(imgpdf)
            doc_pdf.save('D:\\py\\new_CR\\final\\{}.pdf'.format(str(f)[0:12]))
            # NOTE(review): imgdoc and imgpdf are not closed here.
            doc_pdf.close()
Exemple #9
0
def convertPDFPagesToJPG(bookName, bookID):
    """Render every page of a book PDF to ``<inputsDirName>/<bookName>/<n>.jpg``.

    The PDF is read from ``PDFFileDirName + "/" + bookName + ".pdf"`` and each
    page is rendered with a scale factor of 8 on both axes. Files are named
    after the page's ``number`` attribute.

    Args:
        bookName: Base name of the PDF, also used as the output folder name.
        bookID: Not used in this function body; kept for existing callers.
    """
    book = fitz.open(PDFFileDirName + "/" + bookName + ".pdf")
    try:
        dirToSave = inputsDirName + "/" + bookName
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(dirToSave, exist_ok=True)
        scale = fitz.Matrix(8, 8)
        for page in book:
            target = dirToSave + "/" + str(page.number) + ".jpg"
            page.getPixmap(matrix=scale).writeImage(target)
    finally:
        # Release the file handle even when rendering or writing fails.
        book.close()
    if showPrints:
        print("Converting PDF pages to JPG pages done")
Exemple #10
0
    def make_lessonslists(self, classes, filePath, pdffile):
        """Render a PDF to page images and cut them into per-class pictures.

        Every page of ``pdffile`` is rendered at 2x scale into ``data/<i>.jpg``
        and its file name is appended to the global ``boxOfImages``. Each
        image listed there is then scanned for rows where the "row contains
        no black pixel" state flips; consecutive pairs of such rows delimit
        horizontal bands. For each band, column positions found by ``getY``
        are used to crop a left part and one part per class, which are pasted
        side by side and saved as ``data/data/<class>.jpg``.

        Args:
            classes: Indexable collection; ``classes[n]`` names the n-th
                output picture.
            filePath: Prefix prepended to each entry of ``boxOfImages`` when
                the rendered image is re-opened.
            pdffile: Path of the PDF to render.

        Returns:
            None. All results are written to disk.
        """
        global boxOfImages
        # Create the output folder; any error (including "already exists")
        # is ignored.
        try:
            os.makedirs("data")
        except:
            pass

        doc = None
        file = pdffile
        doc = fitz.open(file)
        for i in range(len(doc)):
            first_page = doc[i]

            # Identity matrix scaled by 2 on both axes.
            image_matrix = fitz.Matrix(fitz.Identity)
            image_matrix.preScale(2, 2)

            pix = first_page.getPixmap(alpha=False, matrix=image_matrix)
            # NOTE(review): boxOfImages is global and is not cleared here, so
            # names from earlier calls are processed again -- confirm intended.
            boxOfImages.append(f'{i}.jpg')
            # NOTE(review): writePNG with a ".jpg" name -- presumably the file
            # holds PNG data; verify.
            pix.writePNG(f'data/{i}.jpg')

        # Index into `classes` of the next picture to save; runs across all images.
        NUMBEROFCLASS = 0
        for _filename_ in boxOfImages:
            # NOTE(review): images were written to "data/" but are read from
            # `filePath` -- presumably callers pass "data/"; verify.
            img = Image.open(f"{filePath}{_filename_}")
            pixMap = img.load()
            width, height = img.size

            # Row indices at which the "row has no black pixel" state changes.
            listTemplates = []
            # Per-pixel flags of the previous row; starts as "not all clear".
            boxTime2 = [False]
            FIRSTindent, SECONDindent = self.get_indent(pixMap, width, height)
            for i in range(FIRSTindent, SECONDindent):
                # True for every pixel of row i that is not pure black.
                boxTime = []
                tok = 0  # counts black pixels; not read afterwards
                for j in range(width):
                    if pixMap[j, i] != (0, 0, 0):
                        boxTime.append(True)
                    else:
                        tok += 1
                        boxTime.append(False)
                # print(boxTime)
                # Record the row when exactly one of (previous row, this row)
                # is free of black pixels.
                if (not all(boxTime2)
                        and all(boxTime)) or (all(boxTime2)
                                              and not all(boxTime)):
                    listTemplates.append(i)
                boxTime2 = boxTime.copy()
            listTemplates.extend([FIRSTindent, SECONDindent])
            listTemplates.sort()

            # Same best-effort folder creation as above, repeated per image.
            try:
                os.makedirs("data/data")
            except:
                pass

            def getY(y):
                """Return the sorted x positions where a black run in row y ends.

                For every non-black pixel, the x of the most recent black pixel
                seen so far is recorded. A run that ends at x == 0 is never
                recorded because 0 is falsy in the ``elif`` test, and neither
                is a run that reaches the right edge.
                """
                chek = 0  # not read afterwards
                border = set()
                variableBorder = None
                for i in range(width):
                    if pixMap[i, y] == (0, 0, 0):
                        variableBorder = i
                    elif variableBorder:
                        border.add(variableBorder)
                return sorted(list(border))

            BORDERNUM = self.get_num_of_borders(self.get_text_pdf())
            # Consecutive pairs of recorded rows form one band (y0 .. y1).
            for i in range(len(listTemplates) // 2):
                try:
                    if 1:
                        y0 = listTemplates[i * 2]
                        y1 = listTemplates[i * 2 + 1]
                        # Column borders are sampled 3 rows below the band top.
                        boxBorder = getY(y0 + 3)
                        # Left part shared by every picture of this band.
                        im0 = img.crop((boxBorder[0], y0, boxBorder[2], y1))
                        for k in range((len(boxBorder) - 3) // BORDERNUM + 1):

                            # print(boxBorder)
                            x0 = boxBorder[k * (BORDERNUM + 1) + 2]
                            x1 = boxBorder[k * (BORDERNUM + 1) + 3 + BORDERNUM]
                            # print(x0, x1)
                            im1 = img.crop((x0, y0, x1 + 1, y1))
                            # Paste the shared left part and this column block
                            # side by side.
                            new_im = Image.new(
                                'RGB',
                                (im0.size[0] + im1.size[0], im0.size[1]))
                            new_im.paste(im0, (0, 0))
                            new_im.paste(im1, (im0.size[0], 0))
                            new_im.save('data/data/' +
                                        str(classes[NUMBEROFCLASS]) + '.jpg')
                            NUMBEROFCLASS += 1
                # Any error (e.g. an index past the end of boxBorder or
                # classes) silently ends the work on this band.
                except Exception:
                    pass
 def _get_annots(
         self,
         annot_image_dir: str = "",
         ocr_api: str = "",
         zoom: int = 4,  # image zoom factor
         run_test: bool = False,  # get 3 annot and 3 pic at most
 ):
     """Collect the supported annotations of ``self.doc`` as a list of dicts.

     Rectangle annotations (type code 4) are rendered to a PNG in
     ``annot_image_dir`` and, when ``ocr_api`` is set, passed through OCR.
     All other supported annotations contribute their "content" info entry,
     plus the text found by ``_parse_highlight`` for type codes 8 to 11.

     Returns ``None`` when the document has no annotations, otherwise a list
     of dicts with the keys type, page, content, id, height and color.
     """
     if not self.doc.has_annots():
         return

     collected = []
     seen_annots = 0
     saved_pictures = 0

     for page in self.doc.pages():
         # In test mode stop once both counters have passed their limit.
         if run_test and min(seen_annots, saved_pictures) > 2:
             break

         index_on_page = 0
         # Words on the page, ordered by ascending y, then x.
         words = sorted(page.getText("words"), key=lambda w: (w[3], w[0]))

         for annot in page.annots():
             kind_code = annot.type[0]
             if kind_code not in ANNOT_TYPES:
                 continue

             page_no = page.number + 1
             ident = f"annot-{page_no}-{index_on_page}"
             hex_color = RGB(annot.colors.get("stroke")).to_hex()
             # Top edge of the annotation relative to the page's bottom y.
             rel_height = annot.rect[1] / page.rect[3]

             is_rectangle = kind_code == 4
             used_up = saved_pictures if is_rectangle else seen_annots
             if run_test and used_up > 2:
                 continue

             if is_rectangle:
                 # TODO annotations are not drawn into the picture; maybe let
                 # the user customize this?
                 clip_image = page.get_pixmap(
                     annots=False,
                     clip=annot.rect,
                     matrix=fitz.Matrix(zoom, zoom),
                 )
                 stem = self.file_name.replace(" ", "-")
                 picture_path = os.path.join(annot_image_dir,
                                             f"{stem}-{ident}.png")
                 clip_image.writePNG(picture_path)
                 saved_pictures += 1
                 payload = [picture_path]
                 if ocr_api:
                     payload.append(
                         Picture(picture_path).get_ocr_result(ocr_api))
             else:
                 payload = [annot.info.get("content")]
                 if kind_code in (8, 9, 10, 11):
                     payload.append(self._parse_highlight(annot, words))

             entry = {
                 "type": annot.type[1],
                 "page": page_no,
                 "content": payload,
                 "id": ident,
                 "height": rel_height,
                 "color": hex_color,
             }
             collected.append(entry)
             index_on_page += 1
             seen_annots += 1

     return collected
Exemple #12
0
class BBox(IText):
    '''Boundary box with attribute in fitz.Rect type.

    The rectangle is stored in `self.bbox`. Raw coordinates passed to the constructor are
    transformed by the class-wide `ROTATION_MATRIX` and rounded to one decimal place.
    '''

    # all coordinates are related to un-rotated page in PyMuPDF
    # e.g. Matrix(0.0, 1.0, -1.0, 0.0, 842.0, 0.0)
    ROTATION_MATRIX = fitz.Matrix(0.0) # rotation angle = 0 degree by default


    @classmethod
    def set_rotation_matrix(cls, rotation_matrix):
        '''Set the class-wide rotation matrix. Falsy values and non `fitz.Matrix` objects are ignored.'''
        if rotation_matrix and isinstance(rotation_matrix, fitz.Matrix):
            cls.ROTATION_MATRIX = rotation_matrix


    @classmethod
    def pure_rotation_matrix(cls):
        '''Pure rotation matrix used for calculating text direction after rotation.

            The first four components of `ROTATION_MATRIX` are kept, the last two (e, f) are set to zero.
        '''
        a,b,c,d,e,f = cls.ROTATION_MATRIX
        return fitz.Matrix(a,b,c,d,0,0)


    def __init__(self, raw:dict=None):
        ''' Initialize BBox and convert to the real (rotation considered) page coordinate system.
            ---
            Args:
              - raw: optional dict; when it has a 'bbox' entry, that rect is multiplied by
                `ROTATION_MATRIX` and stored. Otherwise the bbox stays an empty `fitz.Rect()`.
        '''
        self.bbox = fitz.Rect()

        # NOTE: Any coordinates provided in raw is in original page CS (without considering page rotation).
        if raw is None: raw = {}
        if 'bbox' in raw:
            rect = fitz.Rect(raw['bbox']) * BBox.ROTATION_MATRIX
            self.update_bbox(rect)


    def __bool__(self):
        '''Real object when bbox is defined, i.e. truthiness of the underlying `fitz.Rect`.'''
        return bool(self.bbox)


    def __repr__(self): return f'{self.__class__.__name__}({tuple(self.bbox)})'


    def get_expand_bbox(self, dt:float):
        '''Get expanded bbox with margin dt in both x- and y- direction. Note this method doesn't change its bbox.'''
        return self.bbox + (-dt, -dt, dt, dt)


    def contains(self, bbox, threshold:float=1.0):
        '''Whether given bbox is contained in this instance, with margin considered.
            ---
            Args:
              - bbox: BBox to test
              - threshold: minimum ratio (rounded to 2 decimals) of intersection area to the area of `bbox`

            Returns False for a falsy `bbox` and for a `bbox` whose area is zero.
        '''
        # it's not practical to set a general threshold to consider the margin, so two steps:
        # - set a coarse but acceptable area threshold,
        # - check the length in main direction strictly

        if not bbox: return False

        # A rect may be truthy although its area is zero (e.g. a degenerated line);
        # without this guard the ratio below raises ZeroDivisionError.
        area = bbox.bbox.getArea()
        if not area: return False

        # A contains B => A & B = B
        intersection = self.bbox & bbox.bbox
        factor = round(intersection.getArea()/area, 2)
        if factor<threshold: return False

        # check length along the longer side of this instance
        if self.bbox.width >= self.bbox.height:
            return self.bbox.width+constants.MINOR_DIST >= bbox.bbox.width
        else:
            return self.bbox.height+constants.MINOR_DIST >= bbox.bbox.height


    def _overlaps_along(self, bbox, idx:int, factor:float):
        '''Whether the projections of both boxes on one axis (idx=0: x, idx=1: y) overlap enough.

            With L1, L2 the two projected lengths and L the length of their joint extent,
            the overlap is L1+L2-L (negative when the projections are apart).
        '''
        L1 = self.bbox[idx+2]-self.bbox[idx]
        L2 = bbox.bbox[idx+2]-bbox.bbox[idx]
        L = max(self.bbox[idx+2], bbox.bbox[idx+2]) - min(self.bbox[idx], bbox.bbox[idx])

        return L1+L2-L>=factor*max(L1,L2)


    def vertically_align_with(self, bbox, factor:float=0.0, text_direction:bool=True):
        ''' Check whether two boxes have enough intersection in vertical direction, i.e. perpendicular to reading direction.
            ---
            Args:
              - bbox: BBox to check with
              - factor: threshold of overlap ratio, the larger it is, the higher probability the two bbox-es are aligned.
              - text_direction: consider text direction or not. True by default, from left to right if False.

            ```
            +--------------+
            |              |
            +--------------+ 
                    L1
                    +-------------------+
                    |                   |
                    +-------------------+
                            L2
            ```
            
            An enough intersection is defined based on the maximum width of two boxes:
            ```
            L1+L2-L>=factor*max(L1,L2)
            ```
        '''
        if not bbox or not bool(self): return False

        # text direction: compare x-extents for horizontal text, y-extents otherwise
        is_horizontal_text = self.is_horizontal_text if text_direction else True
        idx = 0 if is_horizontal_text else 1

        return self._overlaps_along(bbox, idx, factor)


    def horizontally_align_with(self, bbox, factor:float=0.0, text_direction:bool=True):
        ''' Check whether two boxes have enough intersection in horizontal direction, i.e. along the reading direction.
            ---
            Args:
              - bbox: BBox to check with
              - factor: threshold of overlap ratio, the larger it is, the higher probability the two bbox-es are aligned.
              - text_direction: consider text direction or not. True by default, from left to right if False.

            ```
            +--------------+
            |              | L1  +--------------------+
            +--------------+     |                    | L2
                                 +--------------------+
            ```
            
            An enough intersection is defined based on the maximum height of two boxes:
            ```
            L1+L2-L>=factor*max(L1,L2)
            ```
        '''
        if not bbox or not bool(self): return False

        # text direction: compare y-extents for horizontal text, x-extents otherwise
        is_horizontal_text = self.is_horizontal_text if text_direction else True
        idx = 1 if is_horizontal_text else 0

        return self._overlaps_along(bbox, idx, factor)


    def copy(self):
        '''make a deep copy.'''
        return copy.deepcopy(self)    


    def update_bbox(self, rect):
        '''Update current bbox to specified `rect`.
            ---
            Args:
              - rect: fitz.rect or raw bbox like (x0, y0, x1, y1) in real page CS (with rotation considered).

            Coordinates are rounded to one decimal place. Returns self, so calls can be chained.
        '''
        self.bbox = fitz.Rect([round(x,1) for x in rect])
        return self


    def union_bbox(self, bbox):
        '''Update current bbox to the union with specified `rect`.
            ---
            Args:
              - bbox: BBox, the target to get union
        '''
        return self.update_bbox(self.bbox | bbox.bbox)


    def compare(self, bbox, threshold=0.9):
        '''Whether has same type and bbox.

            Returns a tuple (ok, message); message is empty when ok is True.
        '''
        if not isinstance(bbox, self.__class__):
            return False, f'Inconsistent type: {self.__class__.__name__} v.s. {bbox.__class__.__name__} (expected)'
        
        if not get_main_bbox(self.bbox, bbox.bbox, threshold):
            return False, f'Inconsistent bbox: {self.bbox} v.s. {bbox.bbox}(expected)'
        
        return True, ''


    def store(self):
        '''Store in json format.'''
        return { 'bbox': tuple(self.bbox) }

    
    def plot(self, page, stroke:tuple=(0,0,0), width:float=0.5, fill:tuple=None):
        '''Plot bbox in PDF page.'''
        page.drawRect(self.bbox, color=stroke, fill=fill, width=width, overlay=False)
Exemple #13
0
    # Demo: draw three pencils on one page and save the result as "pencil.pdf".
    # NOTE(review): the enclosing function header and the pencil() helper are not
    # part of this excerpt; pencil() is called as
    # pencil(shape, tip_point, height, flag[, morph=...]).
    doc = fitz.open()  # empty new PDF
    page = doc.newPage()  # create page (A4)
    img = page.newShape()  # create shape; all pencils are drawn on it
    # =============================================================================
    #   pencil 1
    # =============================================================================
    penheight = 100  # thickness of pencil
    pentip = fitz.Point(100, 150)  # first pencil tip here
    pencil(img, pentip, penheight, True)  # pencil points left
    # =============================================================================
    #   pencil 2
    # =============================================================================
    penheight = 20  # now a smaller one
    pentip = fitz.Point(100, 250)  # new pencil tip
    pencil(img, pentip, penheight, False)  # this one points right

    pentip.x += 10  # insert a little distance between pencil tip and text
    text = """Like the ReportLab User Guide does,\nyou may want to use this image, to\nemphasize content, e.g. cautionary\nremarks, notes, examples, etc."""
    page.insertText(pentip, text)  # insert explanatory text
    # =============================================================================
    #   pencil 3
    # =============================================================================
    # yet another pencil, which we will morph around its tip
    mat = fitz.Matrix(-150) * fitz.Matrix(0.5, 0.5,
                                          1)  # morphing: rotate & shear
    pentip = fitz.Point(300, 400)
    # instead of another thickness (40) we could have used a scale matrix
    pencil(img, pentip, 40, True, morph=(pentip, mat))  # the tip is the fixed point of the morph
    img.commit()  # finish the shape before saving
    doc.save("pencil.pdf")
Exemple #14
0
def pdf_filter():
    """Worker loop: once triggered, apply the selected image filter to every page of a PDF.

    While the module-level ``WINDOW_STATUS`` is truthy the loop polls
    ``FILTER_STATUS`` every 0.3 s. When the flag is set it is reset and the
    following steps run:

    1. Ask for an input PDF and render each page to ``pdf2png/NNNN.png``.
    2. Overwrite every PNG in ``pdf2png/`` (except ``0000.png``) with the
       result of the filter selected by ``FILTER_MODE``.
    3. Build a new PDF from the files in ``pdf2png/``, copy the table of
       contents of the input PDF, ask for a target file and save.
    4. Call ``close_window()``.

    Progress is published through ``var_01.set`` / ``var_10.set``.
    If the open dialog is cancelled (or fails), ``close_window()`` is called
    and the function returns. If the save dialog is cancelled, nothing is
    saved.
    """
    global FILTER_STATUS
    # wait until triggered
    while WINDOW_STATUS:
        if not FILTER_STATUS:
            sleep(0.3)
            continue

        FILTER_STATUS = False
        # disable the button and show the progress hint
        bt['state'] = 'disabled'
        progress = 0
        var_01.set('正在处理,请稍候...')
        var_10.set(f'处理进度:{progress}%')

        # ---- pdf -> png ----
        try:
            picked = filedialog.askopenfile(title='选择pdf文档')
        except Exception:
            picked = None
        if picked is None:
            # Nothing was selected: stop here instead of continuing with an
            # undefined input file name.
            close_window()
            return
        input_pdf = picked.name
        picked.close()  # only the name is needed; fitz opens the file itself

        input_doc = fitz.open(input_pdf)
        toc = input_doc.getToC()  # table of contents of the PDF to process
        page_sum = input_doc.pageCount
        zoom = 100 * RESOLUTION  # zoom factor in percent
        rotate = 0  # no rotation
        trans = fitz.Matrix(zoom/100.0, zoom/100.0).preRotate(rotate)
        for i in range(page_sum):
            page = input_doc[i]
            pm = page.getPixmap(matrix=trans, alpha=False)
            # 1-based page number, zero padded to (at least) four digits
            page_num = f'{i + 1:04d}'
            pm.writePNG('pdf2png/%s.png' % page_num)
            progress = int(40 * (i/page_sum))
            var_10.set(f'处理进度:{progress}%')
        input_doc.close()

        # ---- filter the png images ----
        processed = 0
        path = 'pdf2png/'
        for file_name in listdir(path):
            if file_name == '0000.png':
                continue
            img = Image.open(path+file_name)
            if FILTER_MODE == '反色':  # invert
                inv_img = PIL.ImageOps.invert(img)
            elif FILTER_MODE == '灰度':  # grayscale
                inv_img = PIL.ImageOps.grayscale(img)
            elif FILTER_MODE == '去边':  # crop the border
                inv_img = PIL.ImageOps.crop(img, border=10)
            elif FILTER_MODE == '增强':  # auto contrast
                inv_img = PIL.ImageOps.autocontrast(img, cutoff=10)
            else:  # FILTER_MODE == '跳阶' (posterize)
                inv_img = PIL.ImageOps.posterize(img, 2)
            inv_img.save(path+file_name)
            processed += 1
            progress = int(40 + 30 * processed/page_sum)
            var_10.set(f'处理进度:{progress}%')

        # ---- png -> pdf ----
        converted = 0
        output_doc = fitz.open()
        for img_path in sorted(glob.glob('pdf2png/*')):
            imgdoc = fitz.open(img_path)
            pdfbytes = imgdoc.convertToPDF()
            imgdoc.close()
            imgpdf = fitz.open('pdf', pdfbytes)
            output_doc.insertPDF(imgpdf)
            imgpdf.close()
            converted += 1
            progress = int(70 + 30 * converted/page_sum)
            var_10.set(f'处理进度:{progress}%')
        var_01.set('处理完成!')
        output_path = filedialog.asksaveasfilename(title='另存为输出pdf文档', defaultextension='.pdf', initialfile=f'{FILTER_MODE} - '+input_pdf.split('/')[-1], filetypes=[('PDF','*.pdf')])
        if output_path:  # empty when the save dialog was cancelled
            output_doc.setToC(toc)  # carry the original table of contents over
            output_doc.save(output_path)
        output_doc.close()
        close_window()
Exemple #15
0
def image_extract(path):
    """Cut chart regions out of every ``.pdf`` file in ``path`` and save them as PNG files.

    For each PDF an output folder is created under a fixed root directory.
    Caption strings matching ``图表 <digit>...`` are collected from the first
    (at most ten) pages that contain ``keywords``; they are used, in order,
    as file names. From the third page on, every page is searched for the
    caption marker "图表" and the source marker "资料来源:" (fallback
    "来源:"); each caption/source pair delimits one region that is rendered
    at 3x zoom. Finally the PDF is copied into its output folder.

    If anything fails for a file, its name is printed and its output folder
    is moved to ``<path>\\未处理完成``; processing continues with the next file.

    Args:
        path: directory containing the PDF files.
    """
    keywords = "........"
    caption_pattern = re.compile(r'图表 [0-9][^\.]*')
    for file in os.listdir(path):
        if file[-4:] != ".pdf":
            continue
        mkpath = 'E:\\项目\\试运行\\201808\\提取图片\\' + file[:-4] + '\\'
        try:
            mkdir(mkpath)
            doc = fitz.open(os.path.join(path, file))
            try:
                page_count = doc.pageCount
                picname_num = 0
                picname_list = []
                # Only look at pages that exist: a PDF with fewer than ten
                # pages must not fail here.
                for page_no in range(min(10, page_count)):
                    page_text = doc.loadPage(page_no).getText()
                    if keywords in page_text:
                        picname_list.extend(caption_pattern.findall(page_text))

                for page_no in range(2, page_count):
                    page = doc.loadPage(page_no)
                    pic_1 = page.searchFor("图表")
                    data_1 = page.searchFor("资料来源:")
                    if len(data_1) == 0:
                        data_1 = page.searchFor("来源:")
                    # drop duplicated hits
                    pic_2 = deal_repeat(pic_1)
                    data = deal_repeat(data_1)
                    # drop caption hits that come from the running text
                    pic = deal_reContext(pic_2, data)

                    if len(pic) > len(data):  # more captions than sources: cut the surplus at the end
                        pic = pic[:len(data) - len(pic)]
                    elif len(pic) < len(data):  # more sources than captions: pad in front
                        for _ in range(len(data) - len(pic)):
                            pic.insert(0, fitz.Rect(0, 0, 0, 0))

                    if len(pic) != 0 and len(data) != 0 and len(pic) == len(data):
                        picname_num = _save_page_charts(
                            page, pic, data, picname_list, picname_num, mkpath)
            finally:
                doc.close()
            shutil.copy(os.path.join(path, file), os.path.join(mkpath, file))
        except Exception:
            print(file)
            mkdir(path+'\\未处理完成')
            shutil.move(mkpath, path+'\\未处理完成')
            continue


def _save_page_charts(page, pic, data, picname_list, picname_num, mkpath):
    """Render each caption/source region of one page to a PNG and return the next name index."""
    mat = fitz.Matrix(3, 3)  # zoom
    page_length = page.rect.x1  # x coordinate of the right page edge

    # Group the regions by row: consecutive captions with equal y0 share a row.
    pic_rows = [[]]
    data_rows = [[]]
    last = len(pic) - 1
    for k in range(len(pic)):
        pic_rows[-1].append(pic[k])
        data_rows[-1].append(data[k])
        if k < last and pic[k].y0 != pic[k + 1].y0:
            pic_rows.append([])
            data_rows.append([])

    for pic_row, data_row in zip(pic_rows, data_rows):
        for j, (caption, source) in enumerate(zip(pic_row, data_row)):
            # The right border is the next source marker of the same row,
            # or the page edge for the last region of the row.
            if j < len(pic_row) - 1:
                right = data_row[j + 1].x0 - 18
            else:
                right = page_length - 49
            clip = fitz.Rect(source.x0 - 5, caption.y0, right, source.y1)
            pix = page.getPixmap(matrix=mat, clip=clip, alpha=False)
            # strip characters that are illegal in file names
            fn = validateTitle(picname_list[picname_num]) + ".png"
            pix.writePNG(os.path.join(mkpath, fn))
            picname_num += 1
    return picname_num
Exemple #16
0
class Element(IText):
    '''Boundary box with attribute in fitz.Rect type.

    The rectangle is stored in ``self.bbox``; an optional parent object is kept in ``self._parent``.
    '''

    # all coordinates are related to un-rotated page in PyMuPDF
    # e.g. Matrix(0.0, 1.0, -1.0, 0.0, 842.0, 0.0)
    ROTATION_MATRIX = fitz.Matrix(0.0)  # rotation angle = 0 degree by default

    @classmethod
    def set_rotation_matrix(cls, rotation_matrix):
        """Set global rotation matrix.

        Falsy values and objects that are not a ``fitz.Matrix`` are ignored.

        Args:
            rotation_matrix (fitz.Matrix): target matrix
        """
        if rotation_matrix and isinstance(rotation_matrix, fitz.Matrix):
            cls.ROTATION_MATRIX = rotation_matrix

    @classmethod
    def pure_rotation_matrix(cls):
        '''Pure rotation matrix used for calculating text direction after rotation.

        Returns:
            fitz.Matrix: ``ROTATION_MATRIX`` with its last two components (e, f) set to zero.
        '''
        a, b, c, d, e, f = cls.ROTATION_MATRIX
        return fitz.Matrix(a, b, c, d, 0, 0)

    def __init__(self, raw: dict = None, parent=None):
        '''Initialize Element and convert to the real (rotation considered) page coordinate system.

        Args:
            raw (dict, optional): When it has a ``'bbox'`` entry, that rect is multiplied by
                ``ROTATION_MATRIX`` and stored; otherwise the bbox stays an empty ``fitz.Rect()``.
            parent (optional): Parent object of this element.
        '''
        self.bbox = fitz.Rect()
        self._parent = parent  # type: Element

        # NOTE: Any coordinates provided in raw is in original page CS (without considering page rotation).
        if 'bbox' in (raw or {}):
            rect = fitz.Rect(raw['bbox']) * Element.ROTATION_MATRIX
            self.update_bbox(rect)

    def __bool__(self):
        '''Real object when bbox is defined.'''
        return bool(self.bbox)

    def __repr__(self):
        return f'{self.__class__.__name__}({tuple(self.bbox)})'

    # ------------------------------------------------
    # parent element
    # ------------------------------------------------
    @property
    def parent(self):
        '''Parent object of this element.'''
        return self._parent

    @parent.setter
    def parent(self, parent):
        self._parent = parent

    # ------------------------------------------------
    # bbox operations
    # ------------------------------------------------
    def copy(self):
        '''Make a deep copy. The copy has no parent; the parent of this instance is kept.'''
        # NOTE: can't serialize data because parent is an Object,
        # so set it None in advance.
        parent, self.parent = self._parent, None
        try:
            obj = copy.deepcopy(self)
        finally:
            # set back parent, also when deepcopy fails, so that this
            # instance never ends up detached from its parent
            self._parent = parent
        return obj

    def get_expand_bbox(self, dt: float):
        """Get expanded bbox with margin in both x- and y- direction.

        Args:
            dt (float): Expanding margin.

        Returns:
            fitz.Rect: Expanded bbox.
        
        .. note::
            This method creates a new bbox, rather than changing the bbox of itself.
        """
        return self.bbox + (-dt, -dt, dt, dt)

    def update_bbox(self, rect):
        '''Update current bbox to specified ``rect``; coordinates are rounded to one decimal place.
        
        Args:
            rect (fitz.Rect or list): bbox-like ``(x0, y0, x1, y1)`` in real page CS (with rotation considered).

        Returns:
            Element: self
        '''
        self.bbox = fitz.Rect([round(x, 1) for x in rect])
        return self

    def union_bbox(self, e):
        """Update current bbox to the union with specified Element.

        Args:
            e (Element): The target to get union

        Returns:
            Element: self
        """
        return self.update_bbox(self.bbox | e.bbox)

    # --------------------------------------------
    # location relationship to other Element instance
    # --------------------------------------------
    def contains(self, e, threshold: float = 1.0):
        """Whether given element is contained in this instance, with margin considered.

        Args:
            e (Element): Target element
            threshold (float, optional): Intersection rate. Defaults to 1.0. The larger, the stricter.

        Returns:
            bool: True if the intersection covers at least ``threshold`` of the area of ``e``
            and ``e`` is not longer than this instance (plus ``constants.MINOR_DIST``) along
            the longer side of this instance. Always False when the area of ``e`` is zero.
        """
        # NOTE the case bool(e)=True but e.bbox.get_area()=0
        S = e.bbox.get_area()
        if not S: return False

        # it's not practical to set a general threshold to consider the margin, so two steps:
        # - set a coarse but acceptable area threshold,
        # - check the length in main direction strictly

        # A contains B => A & B = B
        intersection = self.bbox & e.bbox
        factor = round(intersection.get_area() / S, 2)
        if factor < threshold: return False

        # check length
        if self.bbox.width >= self.bbox.height:
            return self.bbox.width + constants.MINOR_DIST >= e.bbox.width
        else:
            return self.bbox.height + constants.MINOR_DIST >= e.bbox.height

    def get_main_bbox(self, e, threshold: float = 0.95):
        """If the intersection with ``e`` exceeds the threshold, return the union of these two elements; else return None.

        The intersection rate is the intersection area divided by the smaller of the two areas.

        Args:
            e (Element): Target element. Anything without a ``bbox`` attribute is converted with ``fitz.Rect(e)``.
            threshold (float, optional): Intersection rate. Defaults to 0.95.

        Returns:
            fitz.Rect: Union bbox or None.
        """
        bbox_1 = self.bbox
        bbox_2 = e.bbox if hasattr(e, 'bbox') else fitz.Rect(e)

        # areas
        b = bbox_1 & bbox_2
        if not b: return None  # no intersection

        a1, a2, a = bbox_1.get_area(), bbox_2.get_area(), b.get_area()

        # Note: if bbox_1 and bbox_2 intersects with only an edge, b is not empty but b.get_area()=0
        # so give a small value when they're intersected but the area is zero
        factor = a / min(a1, a2) if a else 1e-6
        return bbox_1 | bbox_2 if factor >= threshold else None

    def _overlap_enough(self, e, idx: int, factor: float):
        '''Whether the projections of both bboxes on one axis (``idx=0``: x, ``idx=1``: y) overlap enough.

        With L1, L2 the projected lengths and L the length of their joint extent, the overlap
        is ``L1+L2-L``; it must reach ``factor*min(L1,L2)`` within a tolerance of 1e-3.
        '''
        L1 = self.bbox[idx + 2] - self.bbox[idx]
        L2 = e.bbox[idx + 2] - e.bbox[idx]
        L = max(self.bbox[idx + 2], e.bbox[idx + 2]) - min(
            self.bbox[idx], e.bbox[idx])

        eps = 1e-3  # tolerance
        return L1 + L2 - L + eps >= factor * min(L1, L2)

    def vertically_align_with(self,
                              e,
                              factor: float = 0.0,
                              text_direction: bool = True):
        '''Check whether two Element instances have enough intersection in vertical direction, i.e. perpendicular to reading direction.
        
        Args:
            e (Element): Object to check with
            factor (float, optional): Threshold of overlap ratio, the larger it is, the higher probability the two bbox-es are aligned.
            text_direction (bool, optional): Consider text direction or not. True by default, from left to right if False.

        Returns:
            bool: True if the overlap is large enough; False if ``e`` or this instance is falsy.
        
        Examples::

            +--------------+
            |              |
            +--------------+ 
                    L1
                    +-------------------+
                    |                   |
                    +-------------------+
                            L2
            
        An enough intersection is defined based on the minimum width of two boxes::
        
            L1+L2-L+eps>=factor*min(L1,L2)
        '''
        if not e or not bool(self): return False

        # text direction: y-extents for vertical text, x-extents otherwise
        idx = 1 if text_direction and self.is_vertical_text else 0

        return self._overlap_enough(e, idx, factor)

    def horizontally_align_with(self,
                                e,
                                factor: float = 0.0,
                                text_direction: bool = True):
        '''Check whether two Element instances have enough intersection in horizontal direction, i.e. along the reading direction.
           
        Args:
            e (Element): Element to check with
            factor (float, optional): threshold of overlap ratio, the larger it is, the higher probability the two bbox-es are aligned.
            text_direction (bool, optional): consider text direction or not. True by default, from left to right if False.

        Returns:
            bool: True if the overlap is large enough; False if ``e`` or this instance is falsy.

        Examples::

            +--------------+
            |              | L1  +--------------------+
            +--------------+     |                    | L2
                                 +--------------------+
            
        An enough intersection is defined based on the minimum height of two boxes::
        
            L1+L2-L+eps>=factor*min(L1,L2)
        '''
        if not e or not bool(self): return False

        # text direction: x-extents for vertical text, y-extents otherwise
        idx = 0 if text_direction and self.is_vertical_text else 1

        return self._overlap_enough(e, idx, factor)

    def in_same_row(self, e):
        """Check whether in same row/line with specified Element instance. With text direction considered.
           
           Taking horizontal text as an example:
           
           * yes: the bottom edge of each box is lower than the centerline of the other one;
           * otherwise, not in same row.

        Args:
            e (Element): Target object.

        Returns:
            bool: True if both are in the same row; False if ``e`` is falsy or the two text
            directions differ.
        
        .. note::
            The difference to method ``horizontally_align_with``: they may not in same line, though 
            aligned horizontally.
        """
        if not e or self.is_horizontal_text != e.is_horizontal_text:
            return False

        # normal reading direction by default
        idx = 1 if self.is_horizontal_text else 0

        c1 = (self.bbox[idx] + self.bbox[idx + 2]) / 2.0
        c2 = (e.bbox[idx] + e.bbox[idx + 2]) / 2.0
        res = c1 <= e.bbox[idx + 2] and c2 <= self.bbox[
            idx + 2]  # Note y direction under PyMuPDF context
        return res

    # ------------------------------------------------
    # others
    # ------------------------------------------------
    def store(self):
        '''Store properties in raw dict.'''
        return {'bbox': tuple(self.bbox)}

    def plot(self,
             page,
             stroke: tuple = (0, 0, 0),
             width: float = 0.5,
             fill: tuple = None,
             dashes: str = None):
        '''Plot bbox in PDF page for debug purpose.'''
        page.draw_rect(self.bbox,
                       color=stroke,
                       fill=fill,
                       width=width,
                       dashes=dashes,
                       overlay=False,
                       fill_opacity=0.5)
Exemple #17
0
    # Draw a filled rectangle and, inside it, a curve built from four Bezier segments.
    # NOTE(review): the enclosing function header is not part of this excerpt;
    # img, yellow, red, w, bsinPoints and rot_points are defined elsewhere.
    #
    # Define start / end points of x axis that we want to use as 0 and 2*pi.
    # They may be oriented in any way.
    #--------------------------------------------------------------------------
    pb = fitz.Point(200, 200)  # begin, treated as (0, 0)
    pe = fitz.Point(400, 100)  # end, treated as (2*pi, 0)

    alfa = img.horizontal_angle(pb, pe)  # connection angle towards x-axis
    rad = abs(pe - pb)  # distance of these points
    pe1 = pb + (rad, 0)  # make corresp. horizontal end point
    # =============================================================================
    #   first draw a rectangle in which the functions graphs will later appear
    # =============================================================================
    f = abs(pe - pb) * 0.5 / math.pi  # represents 1 unit
    # the rectangle spans one unit above and below the axis plus a margin of 5
    rect = fitz.Rect(pb.x - 5, pb.y - f - 5, pe1.x + 5, pb.y + f + 5)
    img.drawRect(rect)  # draw it
    morph = (pb, fitz.Matrix(math.degrees(-alfa)))
    img.finish(fill=yellow, morph=morph)  # rotate it around begin point

    # =============================================================================
    #   get all points for the sine function
    # =============================================================================
    pntsin = bsinPoints(pb, pe1)  # only horizontal axis supported
    # therefore need rotate result points by angle alfa afterwards
    points = rot_points(pntsin, pb, alfa)

    # each segment uses 4 points and starts at the end point of the previous one
    for i in (0, 3, 6, 9):  # draw all 4 function segments
        img.drawBezier(points[i], points[i + 1], points[i + 2], points[i + 3])

    img.finish(color=red, width=w, closePath=False)

    # =============================================================================
Exemple #18
0
 def pure_rotation_matrix(cls):
     '''Pure rotation matrix used for calculating text direction after rotation.

     The first four components of ``cls.ROTATION_MATRIX`` are kept, the
     last two are replaced by zero.
     '''
     m_a, m_b, m_c, m_d, _m_e, _m_f = cls.ROTATION_MATRIX
     rotation_only = fitz.Matrix(m_a, m_b, m_c, m_d, 0, 0)
     return rotation_only
Exemple #19
0
def getImg(pg_num, bbox):
    """Return the part of page ``pg_num`` that lies inside ``bbox`` as a PIL image.

    The box is converted with ``resize(bbox, sf, eps)``, the page of the
    module-level ``doc`` is rendered with the scale matrix
    ``fitz.Matrix(sf, sf)``, and the rendered image is cropped to the
    converted box.
    """
    crop_box = resize(bbox, sf, eps)
    scale = fitz.Matrix(sf, sf)
    png_bytes = doc[pg_num].getPixmap(matrix=scale).getPNGData()
    page_image = Image.open(io.BytesIO(png_bytes))
    return page_image.crop(crop_box)
"""
# Demo: one page per card symbol (heart, clover, diamond, caro), each drawn
# inside the same square and cropped to (a margin around) that square.
# NOTE(review): getColor, heart, clover, diamond and caro are defined
# elsewhere; the excerpt ends without saving the document.
if __name__ == "__main__":
    green = getColor("limegreen")
    red = getColor("red2")
    doc = fitz.open()  # new, empty PDF

    # page 1: heart
    p = doc.newPage()
    img = p.newShape()
    r = fitz.Rect(100, 100, 200, 200)  # square reused for every symbol
    heart(img, r, red)
    img.commit()
    p.setCropBox(r + (10, 10, -10, -15))  # crop box lies inside r

    # page 2: clover, morphed with Matrix(45) around the center of r
    p = doc.newPage()
    img = p.newShape()
    pnt = r.tl + (r.br - r.tl) * 0.5  # midpoint of the diagonal of r
    clover(img, r, green, morph=(pnt, fitz.Matrix(45)))
    img.commit()
    p.setCropBox(r + (5, 5, -5, -5))

    # page 3: diamond, cropped exactly to r
    p = doc.newPage()
    img = p.newShape()
    diamond(img, r, red)
    img.commit()
    p.setCropBox(r)

    # page 4: caro, morphed with Matrix(45) around the center of r
    p = doc.newPage()
    img = p.newShape()
    pnt = r.tl + (r.br - r.tl) * 0.5  # midpoint of the diagonal of r
    caro(img, r, red, morph=(pnt, fitz.Matrix(45)))
    img.commit()
    p.setCropBox(r + (10, 10, -10, -10))
# https://www.jianshu.com/p/2abe38044446
# pip install PyMuPDF
# Convert all PDF files in this directory to PNG files (line 8)

import fitz
import sys
import glob
# Render the first page of every PDF in the current directory to a PNG file
# with the same base name.
pdffile = glob.glob("*.pdf")
# Output name stems keep the trailing dot ("report.pdf" -> "report.").
# Slicing off the three extension characters also works for names such as
# "REPORT.PDF", which glob matches on case-insensitive platforms;
# str.rstrip("pdf") strips a character set and would leave those untouched.
pngfile = [pdf_name[:-3] for pdf_name in pdffile]
for pdf_name, png_stem in zip(pdffile, pngfile):
    doc = fitz.open(pdf_name)
    try:
        page = doc[0]  # only the first page is converted
        zoom = int(100)  # zoom in percent
        rotate = int(0)  # rotation in degrees
        trans = fitz.Matrix(zoom / 100.0, zoom / 100.0).preRotate(rotate)
        pm = page.getPixmap(matrix=trans, alpha=False)
        pm.writePNG(png_stem + "png")
    finally:
        doc.close()  # release the file even if rendering fails
Exemple #22
0
def PdftoImage_totext(file_path):
    """Read the report number from the third page of a PDF via text recognition.

    The third page is rendered to ``<name>.png``, a region of it is written to
    ``<name>_jie.png`` and passed to ``BaiduApi().picture``. The returned text
    is matched against a pattern capturing a four-digit year and a number of
    the form ``G`` plus five digits followed by "号" at the end of the text.
    If the text has at least 10 characters but does not match, a second
    attempt is made with the image returned by ``imagetoBig``.

    Args:
        file_path: path of the PDF file.

    Returns:
        ``None`` if the file does not exist or has at most two pages.
        Otherwise a tuple ``(idn, reponse)``: ``idn`` is the report number
        string, or ``None`` if no number could be recognized (``reponse`` is
        then ``False``).
    """
    if not os.path.exists(file_path):
        return None

    re1 = r'[\w\W]+(\d{4})[\w\W]+(G\d{5})号$'
    file_path_1 = file_path[:file_path.rfind(".")]
    png_name = file_path_1 + ".png"
    png_name2 = file_path_1 + "_jie.png"

    pdf = fitz.open(file_path)
    try:
        if pdf.pageCount <= 2:
            return None

        # remove left-overs of an earlier run
        if os.path.exists(png_name):
            os.remove(png_name)
        if os.path.exists(png_name2):
            os.remove(png_name2)
        rotate = int(0)
        # Zoom factor per dimension; without a matrix the rendered page would
        # be smaller (original note: 1.33333333 --> 1056x816, 2 --> 1584x1224).
        zoom_x = 1.33333333
        zoom_y = 1.33333333
        mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
        png = pdf[2].getPixmap(matrix=mat, alpha=False)
        png.writePNG(png_name)
    finally:
        pdf.close()  # the rendered image is on disk, the document is not needed any more

    pngg = cv2.imread(png_name)
    # NOTE(review): shape[0] is the row count and shape[1] the column count, so
    # the slice below uses 35% of the image *width* as row limit - confirm
    # that this is the intended region.
    kuan = pngg.shape[0]
    chang = int(pngg.shape[1] * 0.35)
    print(pngg.shape)
    cropped = pngg[0:chang, 0:kuan]
    cv2.imwrite(png_name2, cropped)

    baidu = BaiduApi()
    text = baidu.picture(png_name2)
    reponse = True
    year = None
    num = None
    if len(text) >= 10:
        re_reponse = re.search(re1, text)
        if re_reponse:
            year = re_reponse.group(1)
            num = re_reponse.group(2)
            os.remove(png_name2)
            os.remove(png_name)
        else:
            reponse = False

    if not reponse:
        # second attempt with the image produced by imagetoBig
        t, fp = imagetoBig(png_name2)
        if t:
            baidu = BaiduApi()
            text = baidu.picture(fp)
            print(text)
            shutil.rmtree(fp[:fp.rfind("\\")])
            if len(text) >= 10:
                re_reponse = re.search(re1, text)
                if re_reponse:
                    # NOTE(review): reponse stays False although the second
                    # attempt succeeded - kept as in the original, confirm intent.
                    year = re_reponse.group(1)
                    num = re_reponse.group(2)
                    os.remove(png_name2)
                    os.remove(png_name)

    if year is None or num is None:
        # Nothing recognized: report the failure instead of raising a
        # TypeError while concatenating None below.
        return None, False

    # num[1:] drops the leading "G", which is part of the fixed prefix
    idn = "沪信衡估报字第G" + year + "-" + num[1:] + "号"
    return idn, reponse
Exemple #23
0
def shapes_from_stream(doc: fitz.Document, page: fitz.Page):
    ''' Get rectangle shapes, e.g. highlight, underline, table borders, from page source contents.
        ---
        Args:
        - doc: fitz.Document representing the pdf file
        - page: fitz.Page, current page

        Returns:
        - a tuple `(strokes, fills)`: `strokes` collects everything returned by
          `_stroke_path()` for stroked paths, `fills` collects one result of
          `_fill_rect_path()` per filled path.

        The page source is represented as contents of stream object. For example,
        ```
            /P<</MCID 0>> BDC
            ...
            1 0 0 1 90.0240021 590.380005 cm
            ...
            1 1 0 rg # or 0 g
            ...
            285.17 500.11 193.97 13.44 re f*
            ...
            214 320 m
            249 322 l
            ...
            EMC
        ```
        where,
        - `cm` specify a coordinate system transformation, here (0,0) translates to (90.0240021 590.380005)
        - `q`/`Q` save/restores graphic status (they may be nested)
        - `rg` / `g` specify color mode: rgb / grey
        - `re`, `f` or `f*`: fill rectangle path with pre-defined color
        - `m` (move to) and `l` (line to) defines a path

        In this case,
        - a rectangle with:
            - fill color is yellow (1,1,0)
            - lower left corner: (285.17 500.11)
            - width: 193.97
            - height: 13.44
        - a line from (214, 320) to (249, 322)

        Read more:
        - https://github.com/pymupdf/PyMuPDF/issues/263
        - https://github.com/pymupdf/PyMuPDF/issues/225
        - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdf_reference_archive/pdf_reference_1-7.pdf
    '''
    # Each object in PDF has a cross-reference number (xref):
    # - to get its source contents: `doc.xrefObject()` or low level API `doc._getXrefString()`; but for stream objects, only the non-stream part is returned
    # - to get the stream data: `doc.xrefStream(xref)` or low level API `doc._getXrefStream(xref)`
    # - the xref for a page object itself: `page.xref`
    # - all stream xref contained in one page: `page.getContents()`
    # - combine all stream object contents together: `page.readContents()` with PyMuPDF>=1.17.0
    #
    # Clean contents first:
    # syntactically correct, standardize and pretty print the contents stream
    page.cleanContents()
    xref_stream = page.readContents().decode(encoding="ISO-8859-1")

    # transformation matrix for coordinate system conversion from pdf to fitz
    matrix = page.transformationMatrix

    # Working graphics state. The working CS starts coincident with the
    # absolute origin (0, 0).
    # Refer to PDF reference v1.7 4.2.3 Transformation Matrices
    #                        | a b 0 |
    # [a, b, c, d, e, f] =>  | c d 0 |
    #                        | e f 1 |
    WCS = fitz.Matrix(0.0)  # identity matrix

    # Graphics color:
    # - color space: PDF Reference Section 4.5 Color Spaces
    # NOTE: it should have to calculate color value under arbitrary color space, but it's really hard work for now.
    # So, consider device color space only like DeviceGray, DeviceRGB, DeviceCMYK, and set black for all others.
    device_space = True
    color_spaces = _check_device_cs(doc, page)

    # - working stroking color
    Wcs = utils.RGB_value((0.0, 0.0, 0.0))
    # - working filling color
    Wcf = utils.RGB_value((0.0, 0.0, 0.0))

    # Working stroke width
    Wd = 0.0

    # Stack of saved graphics states: `q` pushes, `Q` pops. A stack (rather
    # than a single saved slot) is required because q/Q pairs can be nested.
    # Each entry is (CS matrix, filling color, stroking color, stroke width).
    saved_states = []

    # In addition to lines, rectangles are also processed with border path
    paths = []  # a list of path, each path is a list of points

    strokes, fills = [], []

    # Check line by line
    # Cleaned by `page.cleanContents()`, operator and operand are aligned in a same line;
    # otherwise, have to check stream contents word by word (line always changes)
    for line in xref_stream.splitlines():

        words = line.split()
        if not words:
            continue

        op = words[-1]  # operator always at the end after page.cleanContents()

        # -----------------------------------------------
        # Color Operators: PDF Reference Table 4.24
        # -----------------------------------------------
        # - set color space:
        #   color_space_name cs  # specify color space
        #   c1 c2 ... SC/SCN     # components under defined color space
        if op.upper() == 'CS':
            Wcs = utils.RGB_value((0.0, 0.0, 0.0))
            Wcf = utils.RGB_value((0.0, 0.0, 0.0))

            # Consider normal device cs only
            device_space = color_spaces.get(words[0], False)

        # - set color: color components under specified color space
        elif op.upper() == 'SC':  # c1 c2 ... cn SC
            c = _RGB_from_color_components(words[0:-1], device_space)
            #  nonstroking color
            if op == 'sc':
                Wcf = c
            # stroking color
            else:
                Wcs = c

        # - set color: color components under specified color space
        elif op.upper() == 'SCN':  # c1 c2 ... cn [name] SCN
            # the optional pattern name, if present, is the last operand
            if utils.is_number(words[-2]):
                c = _RGB_from_color_components(words[0:-1], device_space)
            else:
                c = _RGB_from_color_components(words[0:-2], device_space)

            #  nonstroking color
            if op == 'scn':
                Wcf = c
            # stroking color
            else:
                Wcs = c

        # - DeviceGray space, equal to:
        # /DeviceGray cs
        # c sc
        elif op.upper() == 'G':  # 0 g
            g = float(words[0])
            # nonstroking color, i.e. filling color here
            if op == 'g':
                Wcf = utils.RGB_value((g, g, g))
            # stroking color
            else:
                Wcs = utils.RGB_value((g, g, g))

        # - DeviceRGB space
        elif op.upper() == 'RG':  # 1 1 0 rg
            r, g, b = map(float, words[0:-1])

            #  nonstroking color
            if op == 'rg':
                Wcf = utils.RGB_value((r, g, b))
            # stroking color
            else:
                Wcs = utils.RGB_value((r, g, b))

        # - DeviceCMYK space
        elif op.upper() == 'K':  # c m y k K
            c, m, y, k = map(float, words[0:-1])
            #  nonstroking color
            if op == 'k':
                Wcf = utils.CMYK_to_RGB(c, m, y, k, cmyk_scale=1.0)
            # stroking color
            else:
                Wcs = utils.CMYK_to_RGB(c, m, y, k, cmyk_scale=1.0)

        # -----------------------------------------------
        # Graphics State Operators: PDF References Table 4.7
        # -----------------------------------------------
        # CS transformation: a b c d e f cm, e.g.
        # 0.05 0 0 -0.05 0 792 cm
        # refer to PDF Reference 4.2.2 Common Transformations for detail
        elif op == 'cm':
            # update working CS
            components = list(map(float, words[0:-1]))
            Mt = fitz.Matrix(*components)
            WCS = Mt * WCS  # M' = Mt x M

        # stroke width
        elif op == 'w':  # 0.5 w
            Wd = float(words[0])

        # save or restore graphics state:
        # only consider transformation, colors and stroke width here
        elif op == 'q':  # save: push a snapshot of the working state
            saved_states.append((fitz.Matrix(WCS), Wcf, Wcs, Wd))

        elif op == 'Q':  # restore: pop the most recently saved state
            # An unmatched `Q` (nothing saved) is malformed content; the
            # working state is left untouched in that case.
            if saved_states:
                WCS, Wcf, Wcs, Wd = saved_states.pop()

        # -----------------------------------------------
        # Path Construction Operators: PDF References Table 4.9
        # -----------------------------------------------
        # rectangle block:
        # x y w h re is equivalent to
        # x   y   m
        # x+w y   l
        # x+w y+h l
        # x   y+h l
        # h          # close the path
        elif op == 're':
            # ATTENTION:
            # top/bottom, left/right is relative to the positive direction of CS,
            # while a reverse direction may be performed, so be careful when calculating
            # the corner points.
            # Coordinates in the transformed PDF CS:
            #   y1 +----------+
            #      |          | h
            #   y0 +----w-----+
            #      x0        x1
            #

            # (x, y, w, h) before this line
            x0, y0, w, h = map(float, words[0:-1])
            paths.append([
                (x0, y0),
                (x0 + w, y0),
                (x0 + w, y0 + h),
                (x0, y0 + h),
                (x0, y0),
            ])

        # path: m -> move to point to start a path
        elif op == 'm':  # x y m
            x0, y0 = map(float, words[0:-1])
            paths.append([(x0, y0)])

        # path: l -> straight line to point
        elif op == 'l':  # x y l
            x0, y0 = map(float, words[0:-1])
            # `l` needs a current path to extend; without one the operator
            # is malformed and is ignored instead of raising IndexError.
            if paths:
                paths[-1].append((x0, y0))

        # close the path
        elif op == 'h':
            for path in paths:
                _close_path(path)

        # -----------------------------------------------
        # Path-painting Operators: PDF Reference Table 4.10
        # -----------------------------------------------
        # close and stroke the path
        elif op.upper() == 'S':
            # close
            if op == 's':
                for path in paths:
                    _close_path(path)

            # stroke path
            for path in paths:
                res = _stroke_path(path, WCS, Wcs, Wd, matrix)
                strokes.extend(res)

            # reset path
            paths = []

        # fill the path
        # (matched on the operator token so that surrounding whitespace on the
        # line cannot make the operator go unnoticed)
        elif op in ('f', 'F', 'f*'):
            for path in paths:
                # close the path implicitly
                _close_path(path)

                # fill path
                res = _fill_rect_path(path, WCS, Wcf, matrix)
                fills.append(res)

            # reset path
            paths = []

        # close, fill and stroke the path
        elif op.upper() in ('B', 'B*'):
            for path in paths:
                # close path
                _close_path(path)

                # fill path
                res = _fill_rect_path(path, WCS, Wcf, matrix)
                fills.append(res)

                # stroke path
                res = _stroke_path(path, WCS, Wcs, Wd, matrix)
                strokes.extend(res)

            # reset path
            paths = []

        # TODO: clip the path
        elif op in ('W', 'W*'):
            pass

        # end the path without stroking or filling
        elif op == 'n':
            paths = []

    return strokes, fills
Exemple #24
0
def export_data_pdf(sender, instance, created, **kwargs):
    """Parse the PDF uploaded with an order and store everything derived from it.

    The signature matches a Django signal receiver (presumably ``post_save``
    of the order model -- confirm against the ``connect`` call).

    Steps, in order:

    1. Read the address header of the PDF with camelot, split it into parts
       and store them on the related ``Adress`` row.
    2. Read the explication table, recreate the ``ExplicationListItem`` rows
       and store the column totals in ``ExplicationSquareTotal``.
    3. Render the second page of the PDF to a PNG, trim its uniform border
       and register it as the ``OrderImage``.
    4. Send a notification e-mail unless ``instance.is_emailed`` is set.

    Args:
        sender: the sending model class (unused).
        instance: the order whose ``uploaded_pdf`` is processed.
        created: signal flag (unused).
        **kwargs: remaining signal arguments (unused).

    Returns:
        None.
    """
    uploaded_pdf_url = instance.uploaded_pdf.path

    # ------------------------------------------------------------------
    # 1. address header
    # ------------------------------------------------------------------
    address_string = camelot.read_pdf(uploaded_pdf_url, flavor='stream', row_tol=9, table_areas=['50,720,400,680'])
    csv_address_f = os.path.join(settings.MEDIA_ROOT, 'temp', 'csv_address.csv')
    address_string[0].to_csv(csv_address_f)

    # Both values stay at these defaults when the header does not contain the
    # corresponding row (previously that ended in a NameError further down).
    global_appartment = None
    adress_item_list = []
    with open(csv_address_f, 'r', encoding='utf-8') as f:
        for row in CSV.reader(f):
            if not row:  # blank CSV line
                continue
            pp = (row[0].strip(" '")).split(":")
            if 'Квартира' in pp[0]:
                hh = [int(s) for s in pp[0].split() if s.isdigit()]
                if hh:
                    global_appartment = hh[0]
                    print(global_appartment)
            elif len(pp) > 1:
                # "<label>: part, part, ..." -- the last such row wins
                print('pp')
                print(pp)
                print(pp[1])
                adress_item_list = pp[1].split(",")
                print('adress_item_list')
                print(adress_item_list)

    # Split every address part into a [keyword, value] pair: a word starting
    # with an upper-case letter or a digit is the value, any other word is
    # the keyword (the last one of each kind wins).
    total_list = []
    for i, item in enumerate(adress_item_list):
        k = ''
        v = ''
        for word in item.split():
            if word[0].isupper() or word[0].isdigit():
                v = word
            else:
                k = word
        if i == 1:
            # the second part is stored verbatim as the value
            v = item.strip()
        total_list.append([k, v])
    print('total_list')
    print(total_list)

    total_dict = {
        'city_type': '',
        'city_name': '',
        'street_type': '',
        'street': '',
        'micro_rayon': '',
        'house_number': '',
        'corpus_number': '',
        'litera': '',
    }
    city_types = {'город', 'поселение', 'деревня', 'поселок'}
    street_types = {
        'улица', 'ул.', 'переулок', 'пер.', 'проспект', 'просп.', 'проезд',
        'шоссе', 'площадь', 'наб.', 'набережная', 'бульвар', 'бул.',
    }
    # keyword -> Adress field for parts that only carry a value
    # NOTE(review): 'litera' is Latin while every other keyword is Cyrillic;
    # it looks like it can never match the parsed text -- confirm.
    value_fields = {
        'микрорайон': 'micro_rayon',
        'дом': 'house_number',
        'корпус': 'corpus_number',
        'litera': 'litera',
        'строение': 'build_number',
    }
    for keyword, value in total_list:
        if keyword in city_types:
            total_dict['city_type'] = keyword
            total_dict['city_name'] = value
        elif keyword in street_types:
            total_dict['street_type'] = keyword
            total_dict['street'] = value
        elif keyword in value_fields:
            total_dict[value_fields[keyword]] = value

    print('total_dict')
    print(total_dict)
    if adress_item_list:
        # Only touch the stored address when an address row was actually
        # found, so an unparsable header cannot blank out existing data.
        Adress.objects.update_or_create(
            order=instance,
            defaults=total_dict,
        )

    # ------------------------------------------------------------------
    # 2. explication table
    # ------------------------------------------------------------------
    if instance.new_source:
        tables = camelot.read_pdf(uploaded_pdf_url)
    else:
        tables = camelot.read_pdf(uploaded_pdf_url, flavor='stream', row_tol=9, table_areas=['50,680,780,100'])

    print(tables[0])
    print(tables[0].parsing_report)
    print(tables[0].df)

    json_table = os.path.join(settings.MEDIA_ROOT, 'temp', 'json_table.json')
    tables[0].to_json(path=json_table)
    with open(json_table, 'r') as f:
        print("------------data-------------------")
        data = JSON.load(f)

    ExplicationListItem.objects.filter(order_list=instance).delete()
    if data:
        data.pop()  # the last table row is not an explication item
    for i, x in enumerate(data):
        # the first three rows are skipped, as are rows without a height
        if i > 2 and x['9'] != '':
            ExplicationListItem.objects.create(
                order_list=instance,
                floor_number=x['0'],
                appart_number_item=x['1'],
                appart_name_item=x['2'],
                square_total_item=x['3'],
                square_general_item=x['4'],
                square_advanced_item=x['5'],
                square_logdi_item=x['6'],
                square_balkon_item=x['7'],
                square_another_item=x['8'],
                height_item=x['9'],
                apart_number=global_appartment,
            )

    def string_to_correct_decimal(string):
        # table cells use a decimal comma and may carry stray quotes/spaces
        print('string')
        print(string)
        result = Decimal(string.strip(" '").replace(',', '.'))
        return result

    # (item attribute, totals field) pairs, summed in this order
    square_fields = (
        ('square_total_item', 'square_total_summa'),
        ('square_general_item', 'square_general_summa'),
        ('square_advanced_item', 'square_advanced_summa'),
        ('square_logdi_item', 'square_logdi_summa'),
        ('square_balkon_item', 'square_balkon_summa'),
        ('square_another_item', 'square_another_summa'),
    )
    sums = {summa_key: Decimal("0.0") for _, summa_key in square_fields}
    has_items = False
    for items in ExplicationListItem.objects.filter(order_list=instance):
        has_items = True
        for item_attr, summa_key in square_fields:
            raw = getattr(items, item_attr)
            if raw:
                sums[summa_key] += string_to_correct_decimal(raw)

    if has_items:
        # Written once with the final sums instead of once per item; as
        # before, nothing is written when the order has no items.
        defaults = dict(sums)
        defaults['square_total_summa_global'] = (
            sums['square_total_summa']
            + sums['square_logdi_summa']
            + sums['square_balkon_summa']
            + sums['square_another_summa']
        )
        ExplicationSquareTotal.objects.update_or_create(
            order=instance,
            defaults=defaults,
        )

    # ------------------------------------------------------------------
    # 3. scheme image (second page of the PDF)
    # ------------------------------------------------------------------
    path_img_name = 'schema_' + str(instance.order_number) + '.png'
    path_img_scheme = os.path.join(settings.MEDIA_ROOT, 'uploaded_pdf/schemes/', path_img_name)
    path_img_scheme_bd = "uploaded_pdf/schemes/%s" % path_img_name
    current_site = Site.objects.get_current().domain
    path_full_pdf = "https://%s%s" % (current_site, reverse_lazy('pdftrans:order_full_pdf_view_n', kwargs={'pk': instance.pk}))

    def trim(im):
        # Crop away the border that has the colour of the top-left pixel;
        # returns None when the whole image has that colour.
        bg = Image.new(im.mode, im.size, im.getpixel((0, 0)))
        diff = ImageChops.difference(im, bg)
        diff = ImageChops.add(diff, diff, 2.0, -100)
        bbox = diff.getbbox()
        if bbox:
            return im.crop(bbox)

    doc = fitz.open(uploaded_pdf_url)
    try:
        for i, page in enumerate(doc):  # iterate through the pages
            if i != 1:
                continue
            zoom = 2  # zoom factor
            mat = fitz.Matrix(zoom, zoom)
            pix = page.getPixmap(matrix=mat, alpha=False)  # render page to an image
            pix.writePNG(path_img_scheme)  # store image as a PNG
            trimmed = trim(Image.open(path_img_scheme))
            if trimmed is not None:
                # nothing to crop on a uniform page: keep the rendered file
                trimmed.save(path_img_scheme)
            OrderImage.objects.filter(order_fk=instance).delete()
            OrderImage.objects.update_or_create(
                order_fk=instance,
                defaults={'order_image': path_img_scheme_bd, 'fullpdf_url_staff': path_full_pdf},
            )
    finally:
        doc.close()

    # ------------------------------------------------------------------
    # 4. notification e-mail
    # ------------------------------------------------------------------
    path_full_link_site = 'https://' + str(current_site) + '/get-order-info/' + str(instance.pk)
    context = {
        'order_number': instance.order_number,
        'link_doc': path_full_pdf,
        'link_site': path_full_link_site,
    }
    str_for_traslit = unidecode(str(instance.adress))
    subject = str_for_traslit + ' - Док №: ' + str(instance.order_number)
    from_email = '*****@*****.**'
    to = '*****@*****.**'
    html_content = render_to_string('mail_templates/mail_template_btiorder.html', context)
    text_content = strip_tags(html_content)
    msg = EmailMultiAlternatives(subject, text_content, from_email, [to])
    msg.attach_alternative(html_content, "text/html")
    # `== False` on purpose: a flag that is None must not trigger sending
    if instance.is_emailed == False:  # noqa: E712
        if subject and html_content and from_email:
            try:
                if msg.send():
                    Order.objects.filter(pk=instance.pk).update(is_emailed=True)
                    instance.is_emailed = True
            except BadHeaderError:
                print('Invalid header found in email %s' % instance.pk)
                return
            print('email is sended %s' % instance.pk)
        else:
            print('Make sure all fields are entered and valid %s' % instance.pk)
import fitz

# Render the first page of the sample document to a PNG, scaled 2x on
# both axes and without an alpha channel.
pdf = fitz.open('./image/1.pdf')
for pg in range(1):
    page = pdf[pg]
    rotate = 0
    zoom_x = zoom_y = 2.0
    trans = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
    pm = page.getPixmap(alpha=False, matrix=trans)
    pm.writePNG('./image/%s.png' % 1)
Exemple #26
0
# Demo fragment: adds several annotation types to `page`, printing a
# description of each one right after it is created. `page`, `r`, `t1`,
# `border`, `displ`, `text`, the color names and `print_descr` are defined
# earlier in the script, outside this fragment.

# FreeText annotation in rectangle `r`, created with rotate=90
annot = page.addFreetextAnnot(r, t1, rotate = 90)
annot.setBorder(border)
annot.update(fontsize = 10, border_color=red, fill_color=gold, text_color=blue)

print_descr(annot.rect, annot)
# shift the working rectangle by `displ` for the next annotation
r = annot.rect + displ
print("added 'FreeText'")

# text ("sticky note") annotation anchored at the top-left of the shifted rect
annot = page.addTextAnnot(r.tl, t1)
print_descr(annot.rect, annot)
print("added 'Sticky Note'")

# insertion point for the text that the markup annotations below refer to
pos = annot.rect.tl + displ.tl

# first insert 4 rotated text lines
page.insertText(pos, text, fontsize=11, morph = (pos, fitz.Matrix(-15)))
# now search text to get the quads
rl = page.searchFor("text in line", quads = True)
# the first four hits are used (one per inserted line, presumably)
r0 = rl[0]
r1 = rl[1]
r2 = rl[2]
r3 = rl[3]
annot = page.addHighlightAnnot(r0)
# need to convert quad to rect for descriptive text ...
print_descr(r0.rect, annot)
print("added 'HighLight'")

annot = page.addStrikeoutAnnot(r1)
print_descr(r1.rect, annot)
print("added 'StrikeOut'")
Exemple #27
0
    def __init__(self, parent, filename):
        """Create the viewer dialog for `filename`.

        Opens the document, builds the navigation widgets, lays them out
        above the page image and connects the event handlers. The dialog is
        destroyed right away if the document stays encrypted.
        """
        default_pos = wx.DefaultPosition
        default_size = wx.DefaultSize
        zoom = 1.2  # zoom factor of display
        dialog_style = wx.CAPTION | wx.CLOSE_BOX | wx.DEFAULT_DIALOG_STYLE
        wx.Dialog.__init__(
            self,
            parent,
            id=wx.ID_ANY,
            title=u"Display with PyMuPDF: ",
            pos=default_pos,
            size=default_size,
            style=dialog_style,
        )

        # ---- window decoration: optional icon, filename in the title ----
        if do_icon:
            self.SetIcon(ico_pdf.img.GetIcon())
        self.SetTitle(self.Title + filename)
        self.SetBackgroundColour(wx.Colour(240, 230, 140))

        # ---- open the document as soon as the dialog is created ----
        self.doc = fitz.open(filename)
        if self.doc.needsPass:  # password protected: try to decrypt
            self.decrypt_doc()
        if self.doc.isEncrypted:  # still encrypted: give up
            self.Destroy()
            return

        # ---- per-dialog state ----
        page_count = len(self.doc)
        self.dl_array = [0] * page_count  # one slot per page
        self.last_page = -1  # last page displayed
        self.link_rects = []  # link rectangles
        self.link_texts = []  # link texts
        self.current_idx = -1  # entry of the rectangle that was found
        self.current_lnks = []

        # ---- constant zoom matrix for page images (pages shown 20% larger) ----
        self.matrix = fitz.Matrix(zoom, zoom)

        # Overall dialog structure:
        #   szr10 (main sizer for the whole dialog - vertical orientation)
        #   +-> szr20 (sizer for buttons etc. - horizontal orientation)
        #     +-> button forward
        #     +-> button backward
        #     +-> field for page number to jump to
        #     +-> field displaying total pages
        #   +-> PDF image area

        # navigation buttons
        self.ButtonNext = wx.Button(
            self, wx.ID_ANY, u"forw", default_pos, default_size, wx.BU_EXACTFIT)
        self.ButtonPrevious = wx.Button(
            self, wx.ID_ANY, u"back", default_pos, default_size, wx.BU_EXACTFIT)

        # Target page entry; wx.TE_PROCESS_ENTER is needed so that pressing
        # Enter in the field is delivered as an event.
        entry_style = wx.TE_RIGHT | wx.TE_PROCESS_ENTER
        self.TextToPage = wx.TextCtrl(
            self, wx.ID_ANY, u"1", default_pos, wx.Size(40, -1), entry_style)

        # total page count, link toggle and paper format labels
        pages_label = "of " + str(page_count) + " pages."
        self.statPageMax = wx.StaticText(
            self, wx.ID_ANY, pages_label, default_pos, default_size, 0)
        self.links = wx.CheckBox(
            self, wx.ID_ANY, u"show links", default_pos, default_size,
            wx.ALIGN_LEFT)
        self.links.Value = True
        self.paperform = wx.StaticText(
            self, wx.ID_ANY, "", default_pos, default_size, 0)

        # page image area, initially showing page 1
        first_page_bitmap = self.pdf_show(1)
        self.PDFimage = wx.StaticBitmap(
            self, wx.ID_ANY, first_page_bitmap, default_pos, default_size,
            style=0)

        # ---- layout: top control row above the page image ----
        self.szr10 = wx.BoxSizer(wx.VERTICAL)
        szr20 = wx.BoxSizer(wx.HORIZONTAL)
        for control in (self.ButtonNext, self.ButtonPrevious, self.TextToPage):
            szr20.Add(control, 0, wx.ALL, 5)
        centered = wx.ALIGN_CENTER_VERTICAL | wx.ALL
        for control in (self.statPageMax, self.links, self.paperform):
            szr20.Add(control, 0, centered, 5)
        self.szr10.Add(szr20, 0, wx.EXPAND, 5)
        self.szr10.Add(self.PDFimage, 0, wx.ALL, 5)

        # final size and layout adjustments, then center on screen
        self.szr10.Fit(self)
        self.SetSizer(self.szr10)
        self.Layout()
        self.Centre(wx.BOTH)

        # ---- event bindings ----
        bindings = (
            (self.ButtonNext, wx.EVT_BUTTON, self.NextPage),
            (self.ButtonPrevious, wx.EVT_BUTTON, self.PreviousPage),
            (self.TextToPage, wx.EVT_TEXT_ENTER, self.GotoPage),
            (self.PDFimage, wx.EVT_MOUSEWHEEL, self.OnMouseWheel),
            (self.PDFimage, wx.EVT_MOTION, self.move_mouse),
            (self.PDFimage, wx.EVT_LEFT_DOWN, self.OnLeftDown),
        )
        for control, event, handler in bindings:
            control.Bind(event, handler)
Exemple #28
0
#------------------------------------------------------------------------------
# Main program
#------------------------------------------------------------------------------
if __name__ == "__main__":
    # Draw a set of symbols on the first page of a new PDF: four in a row,
    # two more in a second row 150 units further down.
    green, red = getColor("limegreen"), getColor("red2")
    doc = fitz.open()
    p = doc.newPage()
    img = p.newShape()

    step_right = (100, 0, 100, 0)  # added to a rect: moves it 100 to the right

    def center_of(rect):
        # midpoint of the rect's diagonal, used as the morph fix point
        return rect.tl + (rect.br - rect.tl) * 0.5

    # first row
    r = fitz.Rect(100, 100, 200, 200)
    heart(img, r, red)

    r1 = r + step_right
    p = center_of(r1)
    clover(img, r1, green, morph=(p, fitz.Matrix(45)))

    r2 = r1 + step_right
    diamond(img, r2, red)

    r3 = r2 + step_right
    p = center_of(r3)
    caro(img, r3, red, morph=(p, fitz.Matrix(45)))

    # second row
    r4 = r + (0, 150, 0, 150)
    p = center_of(r4)
    arrow(img, r4, red, morph=(p, fitz.Matrix(0)))

    r5 = r4 + (r4.width, 0, r4.width, 0)
    dontenter(img, r5, morph=None)
Exemple #29
0
def _escape_table_text(table):
    """Return `table` with every string cell replaced by its unicode-escape form."""
    return table.applymap(
        lambda x: x.encode('unicode_escape').decode('utf-8')
        if isinstance(x, str) else x)


def _export_table(writer, table, table_id, imgheight, imgwidth):
    """Write `table` to the sheet `table_id` and return its result record."""
    table.to_excel(writer, sheet_name=table_id)
    table.columns = table.columns.astype(str)
    return {
        'table_id': table_id,
        'table_html': table.to_html(index=False, index_names=False),
        'imgheight': imgheight,
        'imgwidth': imgwidth,
        'remove': False,
    }


def save_to_excel(savefile, parsed_results, show_boundries, filename,
                  bbox_all):
    """Write the parsed tables to an Excel workbook, one sheet per table.

    Args:
        savefile: path of the workbook to write. Page images are looked up
            (and created) relative to the parent of its directory.
        parsed_results: mapping of page number to the list of tables found
            on that page.
        show_boundries: ``'yes'`` renders missing page images from the PDF
            and calls ``pdf_boundry_img`` for every table; ``'ocr'`` reuses
            the images already present in ``scanned`` / ``result_scanned``.
            Any other value writes nothing.
        filename: path of the source PDF (only opened for ``'yes'``).
        bbox_all: per page, the bounding box of each table (only used for
            ``'yes'``).

    Returns:
        A list with one dict per exported table (keys ``table_id``,
        ``table_html``, ``imgheight``, ``imgwidth``, ``remove``); empty when
        nothing was exported.
    """
    tables = []
    userfolder = os.path.split(os.path.dirname(savefile))[0]
    scanned_dir = os.path.join(userfolder, 'scanned')

    if show_boundries == 'yes':
        doc = fitz.open(filename)
        # try/finally so the document is closed even when rendering or
        # writing fails half-way through
        try:
            with pd.ExcelWriter(savefile) as writer:
                for page_n in parsed_results:
                    page_n = int(page_n)
                    imgname = os.path.join(scanned_dir, 'p%d.jpg' % page_n)
                    if not os.path.exists(imgname):
                        page = doc.loadPage(page_n)  # number of page
                        mat = fitz.Matrix(1, 1)
                        pix = page.getPixmap(matrix=mat, alpha=False)
                        # NOTE(review): PNG data is written to a '.jpg'
                        # name; kept because other code reads that name.
                        pix.writePNG(imgname)

                    for idx, table in enumerate(parsed_results[page_n]):
                        table_id = ('p' + str(page_n + 1) + '_id_'
                                    + str(idx + 1))
                        t0 = datetime.now()
                        imgheight, imgwidth = pdf_boundry_img(
                            imgname, table_id, bbox_all[page_n][idx])
                        print('get pdf boundry img for %s in' % table_id,
                              datetime.now() - t0)
                        tables.append(_export_table(
                            writer, _escape_table_text(table), table_id,
                            imgheight, imgwidth))
        finally:
            doc.close()

    elif show_boundries == 'ocr':
        with pd.ExcelWriter(savefile) as writer:
            for page_n in parsed_results:
                for idx, table in enumerate(parsed_results[page_n]):
                    if table is None:
                        continue
                    table = _escape_table_text(table)
                    table_id = ('p' + str(page_n + 1) + '_id_'
                                + str(idx + 1))
                    img = plt.imread(
                        os.path.join(scanned_dir, 'p%d.jpg' % page_n))
                    imgheight, imgwidth = img.shape[0], img.shape[1]
                    # make the per-table image available next to the
                    # other user files
                    copyfile(
                        os.path.join(userfolder, 'result_scanned',
                                     '%s.jpg' % table_id),
                        os.path.join(userfolder, table_id + '.jpg'))
                    tables.append(_export_table(
                        writer, table, table_id, imgheight, imgwidth))

    return tables
def get_qr_data(filename):
    """Extract QR code payloads, page by page, from a PDF file.

    Every page is rendered at 4x zoom, written to a temporary image file in
    the current working directory and read back with OpenCV.  The rendered
    page is then handed to ``_scan_page_for_qr``, which tries many
    crop/rescale variations of a fixed page region until a QR code decodes.
    The temporary files of a page are removed once the page is done, and the
    document is closed before returning, even if an error occurs.

    The search parameters of every successful detection are pretty-printed
    before returning.

    Args:
        filename: Path of the PDF document, passed to ``fitz.open``.

    Returns:
        A list of ``[page_number, payload]`` entries, one for each page on
        which a QR code was decoded.  ``page_number`` is zero-based and
        ``payload`` is the ``data`` attribute of the first decoded symbol.
    """
    return_data = []
    found_data = []
    # Hash of the reference QR image.  It is the same for every candidate
    # crop, so it is computed once (lazily, when the first page is processed).
    reference_hash = None

    print("Reading given %s pdf file" % filename)
    doc = fitz.open(filename)
    try:
        zoom = 4  # to increase the resolution
        mat = fitz.Matrix(zoom, zoom)
        page_count = doc.pageCount
        print("Given pdf has %d pages" % page_count)

        print("Starting page by page qr code detection and extraction")
        for page_no in range(page_count):
            print("--------------Page Number %d------------------" % page_no)
            # NOTE: the file receives PNG data (writePNG) despite its '.jpg'
            # name; the name is kept unchanged from the original code.
            output = str(page_no) + '.jpg'
            qr_file = "QR_Code%d.png" % page_no
            try:
                page = doc.loadPage(page_no)
                pix = page.getPixmap(matrix=mat)
                print("Saving image for current page")
                pix.writePNG(output)
                print("Reading saved image for current page")
                img = cv2.imread(output)

                if reference_hash is None:
                    with Image.open('QR_Code_example.png') as reference:
                        reference_hash = imagehash.average_hash(reference)

                print("Detecting QR code...")
                hit = _scan_page_for_qr(img, qr_file, reference_hash)
                if hit is not None:
                    payload, search_params = hit
                    print("Qr code detected and extracted!")
                    print(" %d qr code is" % page_no, payload)
                    return_data.append([page_no, payload])
                    found_data.append([page_no] + search_params)
            finally:
                # Best-effort cleanup of this page's temporary images.  Each
                # file is removed on its own so that a failure on one (e.g.
                # it was never created) does not leave the other behind.
                for temp_path in (qr_file, output):
                    try:
                        os.remove(temp_path)
                    except OSError:
                        pass
    finally:
        doc.close()

    pprint(found_data)
    return return_data


def _scan_page_for_qr(img, qr_file, reference_hash):
    """Try crop/rescale variations of a fixed page region until a QR decodes.

    For each target size (width ``900 - delta_max_x``, height
    ``700 + delta_max_y``) and each vertical trim ``delta`` the region is
    cropped, resized, sharpened and written to ``qr_file``.  The written image
    is hashed and compared with ``reference_hash``; only when the hash
    difference is below 20 is a QR decode attempted.

    The search ends as soon as a code is decoded, or as soon as an untrimmed
    crop (``delta == 0``) fails the hash comparison.

    Args:
        img: Page image as read by ``cv2.imread``.
        qr_file: Path the candidate crops are written to (overwritten on
            every attempt; the caller is responsible for deleting it).
        reference_hash: ``imagehash.average_hash`` of the reference QR image.

    Returns:
        ``(payload, [y, h, delta_max_x, delta_max_y, delta])`` for the first
        decoded code, or ``None`` when the search ended without a decode.
    """
    sharpen_filter = np.array([[-1, -1, -1], [-1, 10, -1], [-1, -1, -1]])
    # Hard-coded pixel coordinates of the region to inspect.
    # NOTE(review): presumably tuned for one specific page layout rendered at
    # zoom 4 -- confirm before using with other documents.
    x = 2225
    w = 90

    for delta_max_x in range(0, 200, 20):
        max_x = 900 - delta_max_x
        for delta_max_y in range(0, 200, 20):
            max_y = 700 + delta_max_y
            for delta in range(0, 50, 1):
                # Trim the region from the top: start lower, shrink height.
                y = 205 + delta
                h = 115 - delta

                candidate = img[y:y + h, x:x + w]
                candidate = cv2.resize(candidate, (max_x, max_y))
                candidate = cv2.filter2D(candidate, -1, sharpen_filter)

                # The hash is taken from the file as written to disk, exactly
                # as before; the handle is closed before the next overwrite.
                cv2.imwrite(qr_file, candidate)
                with Image.open(qr_file) as written:
                    candidate_hash = imagehash.average_hash(written)

                if (candidate_hash - reference_hash) < 20:
                    barcodes = decode(candidate, symbols=[ZBarSymbol.QRCODE])
                    if len(barcodes) > 0:
                        return barcodes[0].data, [
                            y, h, delta_max_x, delta_max_y, delta
                        ]
                elif delta == 0:
                    # The untrimmed crop does not resemble the reference:
                    # give up on this page.
                    return None
    return None