Esempio n. 1
0
def dump_pdf(fname, dir):
    """Save every image XObject of a PDF as a numbered PNG file.

    Each xref entry (starting at 1) is read as text and kept when it matches
    both ``/Type /XObject`` and ``/Subtype /Image``.  The matching images are
    written into ``dir`` as ``<index>.png``, where ``index`` is the zero-padded
    position of the image in scan order.

    Args:
        fname: Path of the PDF file, handed to ``fitz.open``.
        dir: Directory that receives the PNG files.
    """
    RE_XOBJ = r"/Type\s*/XObject"
    RE_IMG = r"/Subtype\s*/Image"

    doc = fitz.open(fname)
    try:
        img_idcs = []
        for i in range(1, doc.xref_length()):
            xref = doc.xref_object(i)
            if re.search(RE_XOBJ, xref) and re.search(RE_IMG, xref):
                img_idcs.append(i)

        # Digits of the highest index, so all file names share one width.
        width = len(str(len(img_idcs) - 1))

        for i, j in enumerate(img_idcs):
            print(f'no: {i}, xref: {j}')
            img = fitz.Pixmap(doc, j)
            if img.n >= 5:
                # Convert to RGB and keep the result in ``img`` so that the
                # converted pixmap is the one that gets written below.
                img = fitz.Pixmap(fitz.csRGB, img)
            img.writePNG(path.join(dir, f'{i:0{width}d}.png'))
    finally:
        # Release the document even when an image fails to convert or save.
        doc.close()
    '''
Esempio n. 2
0
def extractImage(myfile):
    """Extract all image XObjects of an uploaded PDF into the image store.

    The PDF is read from ``MEDIA_ROOT + "/" + myfile.name``.  Every xref
    object whose definition matches both patterns below is written as
    ``<extension(myfile)>-img-<xref>.png`` under the image-store folder,
    which is created when missing.

    Args:
        myfile: Object exposing a ``name`` attribute; it is also passed to
            the ``extension`` helper to build the output file names.
    """
    checkXO = r"/Type(?= */XObject)"  # finds "/Type/XObject"
    checkIM = r"/Subtype(?= */Image)"  # finds "/Subtype/Image"

    this_file_path = MEDIA_ROOT + "/" + myfile.name

    dest_file = MEDIA_ROOT + r'\imageStore\\'
    os.makedirs(dest_file, exist_ok=True)

    doc = fitz.open(this_file_path)
    try:
        lenXREF = doc._getXrefLength()  # number of objects - entry 0 is not used

        for i in range(1, lenXREF):  # scan through all objects
            text = doc._getObjectString(i)  # string defining the object
            if not re.search(checkXO, text) or not re.search(checkIM, text):
                continue  # an image needs both markers
            pix = fitz.Pixmap(doc, i)  # make pixmap from image
            if pix.n >= 5:  # must be converted to RGB before saving as PNG
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.writePNG(dest_file + extension(myfile) + "-" + "img-%s.png" %
                         (i, ))
            pix = None  # free Pixmap resources
    finally:
        # The document was previously left open; close it on every path.
        doc.close()
    # The former trailing ``time.clock()`` call was dropped: its result was
    # never used and the function no longer exists in Python 3.8+.
Esempio n. 3
0
def get_pdf_images(pdf_file):
    """Extract images of a PDF to PNG files and post-process each one.

    Every selected xref object is written to
    ``<pdf_file without ".pdf">-<xref>.png`` and then handed to
    ``reverse_image``.

    Args:
        pdf_file: Path of the PDF file (a string; ``str.replace`` is used on it).

    Returns:
        List of the PNG file names that were written, in scan order.
    """
    checkXO = r"/Type(?= */XObject)"  # finds "/Type/XObject"
    checkIM = r"/Subtype(?= */Image)"  # finds "/Subtype/Image"

    doc = fitz.open(pdf_file)
    img_files = []
    try:
        lenXREF = doc._getXrefLength()  # number of objects - entry 0 is not used

        for i in range(1, lenXREF):  # scan through all objects
            text = doc._getObjectString(i)  # string defining the object
            isXObject = re.search(checkXO, text)
            isImage = re.search(checkIM, text) or "/Subtype/Image" in text
            # NOTE(review): an object is skipped only when it carries NEITHER
            # marker, so a non-image XObject still reaches fitz.Pixmap below.
            # Confirm whether "or" (both markers required) was intended.
            if not isXObject and not isImage:
                continue
            pix = fitz.Pixmap(doc, i)  # make pixmap from image

            filename = pdf_file.replace(".pdf", "") + "-%s.png" % (i, )
            if pix.n < 5:  # can be saved as PNG
                pix.writePNG(filename)
            else:  # must be converted to RGB first
                pix0 = fitz.Pixmap(fitz.csRGB, pix)
                # Write the converted pixmap, not the unconverted source.
                pix0.writePNG(filename)
                pix0 = None  # free Pixmap resources

            reverse_image(filename)
            img_files.append(filename)
            pix = None  # free Pixmap resources
    finally:
        doc.close()

    return img_files
Esempio n. 4
0
    def get_images(self):
        """Save the non-uniform images of a PDF document as PNG files.

        For ``self.doctype == "pdf"`` every image referenced by a page of
        ``self.doc`` is decoded; images whose pixels all share one value are
        skipped, the others are written to ``self.file_path`` as
        ``p<page>-<xref>.png``.  Any other document type yields no images.

        Returns:
            List of the written file names (without directory).
        """
        image_names = []
        if self.doctype != "pdf":
            # "docx" and every other type: nothing to extract here.
            return image_names

        for i in range(len(self.doc)):
            for img in self.doc.getPageImageList(i):
                xref = img[0]
                pix = fitz.Pixmap(self.doc, xref)
                image = np.array(Image.open(io.BytesIO(pix.getImageData())))

                # One row per pixel; only the first three channels are compared.
                # 2-D arrays (single-channel images) get a channel axis so they
                # no longer fail on per-pixel indexing.
                if image.ndim == 3:
                    pixels = image.reshape(-1, image.shape[2])[:, :3]
                else:
                    pixels = image.reshape(-1, 1)

                # More than one distinct pixel value <=> some row differs
                # from the first row.
                if pixels.shape[0] > 0 and bool((pixels != pixels[0]).any()):
                    if pix.n >= 5:  # CMYK: convert to RGB first
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    name = "p%s-%s.png" % (i, xref)
                    pix.writePNG(os.path.join(self.file_path, name))
                    image_names.append(name)
                pix = None  # free Pixmap resources

        return image_names
Esempio n. 5
0
def pdf2pic(path, pic_path):
    """Extract all image XObjects of a PDF into ``pic_path`` as PNG files.

    Images are numbered in scan order.  After each saved image the elapsed
    time and the running image count are printed.

    Args:
        path: Path of the PDF file.
        pic_path: Directory that receives the PNG files.
    """
    # time.clock() no longer exists in Python 3.8+; perf_counter replaces it.
    t0 = time.perf_counter()  # start time of the extraction
    checkXO = r"/Type(?= */XObject)"  # regex used to find XObjects
    checkIM = r"/Subtype(?= */Image)"  # regex used to find images
    doc = fitz.open(path)  # open the PDF file
    try:
        imgcount = 0  # number of images saved so far
        lenXREF = doc._getXrefLength()  # length of the xref table

        # print information about the PDF
        print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), lenXREF - 1))

        # walk through every object
        for i in range(1, lenXREF):
            text = doc._getXrefString(i)  # string defining the object
            # skip unless the object is both an XObject and an image
            if not re.search(checkXO, text) or not re.search(checkIM, text):
                continue
            imgcount += 1
            pix = fitz.Pixmap(doc, i)  # build the pixmap
            target = os.path.join(pic_path, "图片{}.png".format(imgcount))
            if pix.n >= 5:  # convert to RGB first, otherwise save directly
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.writePNG(target)
            pix = None  # release resources
            t1 = time.perf_counter()  # time at which this image was done
            print("运行时间:{}s".format(t1 - t0))
            print("提取了{}张图片".format(imgcount))
    finally:
        doc.close()
def recoverpix(doc, item):
    """Return an image dictionary for a PDF image, honouring its /SMask.

    Args:
        doc: Document providing ``extractImage`` and ``xrefObject``.
        item: Sequence whose first entry is the image xref and whose second
            entry is the xref of its /SMask (``0`` when there is none).

    Returns:
        A dictionary with at least the keys ``ext``, ``colorspace`` and
        ``image`` when the image had to be re-encoded, otherwise whatever
        ``doc.extractImage`` returns for the image xref.
    """

    def _png_result(data):
        # Shape of the dictionary the caller expects for re-encoded images.
        return {"ext": "png", "colorspace": 3, "image": data}

    xref, smask = item[0], item[1]

    if smask > 0:
        # An /SMask exists: combine base image and mask with Pillow.
        base_bytes = doc.extractImage(xref)["image"]
        mask_bytes = doc.extractImage(smask)["image"]
        base_img = Image.open(io.BytesIO(base_bytes))
        mask_img = Image.open(io.BytesIO(mask_bytes))
        combined = Image.new("RGBA", base_img.size)
        combined.paste(base_img, None, mask_img)
        buffer = io.BytesIO()
        combined.save(buffer, "png")
        return _png_result(buffer.getvalue())

    if "/ColorSpace" not in doc.xrefObject(xref, compressed=True):
        # Nothing special about this image: hand out the embedded data.
        return doc.extractImage(xref)

    # A /ColorSpace definition exists: convert to an RGB PNG to be safe.
    source = fitz.Pixmap(doc, xref)
    as_rgb = fitz.Pixmap(fitz.csRGB, source)
    return _png_result(as_rgb.getImageData("png"))
def ImageExtraction(folderName, DiskPath, filepath):
    """Extract all page images of a PDF into a folder next to the file.

    The target folder is ``<directory of filepath>/<file stem>``; it is
    created when missing.  Each image is stored as
    ``<folderName>p<page><xref>.jpg``.  Any error is printed, not raised.

    NOTE(review): the files carry a ``.jpg`` suffix although they are written
    with ``writePNG``; the name is kept so existing consumers still find them.

    Args:
        folderName: Prefix used in the image file names.
        DiskPath: Not used by this function.
        filepath: Path of the PDF file.
    """
    try:
        # os.path.join instead of a hard-coded "\\" keeps this portable.
        FileLocation = os.path.join(os.path.dirname(filepath),
                                    Path(filepath).stem)
        os.makedirs(FileLocation, exist_ok=True)
        doc = fitz.open(filepath)
        try:
            print('------------------------Extracting Images from ' +
                  filepath + '------------------------')
            for i in range(len(doc)):
                for img in doc.getPageImageList(i):
                    xref = img[0]
                    pix = fitz.Pixmap(doc, xref)
                    imagepath = os.path.join(
                        FileLocation, "%sp%s%s.jpg" % (folderName, i, xref))
                    if pix.n >= 5:  # CMYK: convert to RGB first
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    pix.writePNG(imagepath)
                    pix = None
        finally:
            doc.close()

    except Exception as e:
        print('Error occurred in ImageExtraction:::.', e)
def extractImages(file):
    """Extract all page images of a PDF as RGB PNG files.

    Images are written to ``<file without extension>/images`` and named
    ``<file name>-Pg<page>-Img<n>.png`` with 1-based page and image numbers.
    Every image is converted to RGB before it is saved.

    NOTE(review): the original author observed that extracted images can look
    inverted; nothing here corrects that.

    Args:
        file: Path of the PDF file, using ``/`` as separator.
    """
    # splitext only strips the extension, so paths like "./a.pdf" or names
    # containing further dots keep their full stem.
    pdf_title = os.path.splitext(file)[0]
    images_dir = pdf_title + '/images'
    os.makedirs(images_dir, exist_ok=True)

    base_name = file.split('/')[-1]
    doc = fitz.open(file)
    try:
        for i in range(len(doc)):
            for count, img in enumerate(doc.getPageImageList(i)):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                # Both former branches converted to RGB; they only differed in
                # the file-name pattern, which is now the same for all images.
                pix1 = fitz.Pixmap(fitz.csRGB, pix)
                pix1.writePNG(images_dir + "/%s-Pg%s-Img%s.png" %
                              (base_name, i + 1, count + 1))
                pix1 = None
                pix = None
    finally:
        doc.close()
Esempio n. 9
0
def test_filepixmap():
    """Pixmaps built from a file name and from the file's bytes must match."""
    pix1 = fitz.Pixmap(imgfile)
    # Read through a context manager so the file handle is closed again.
    with open(imgfile, "rb") as fh:
        stream = fh.read()
    pix2 = fitz.Pixmap(stream)
    assert repr(pix1) == repr(pix2)
    assert pix1.samples == pix2.samples
Esempio n. 10
0
def process_text(filename, filepath):
    """Extract all page images of a stored file as PNG files.

    The file is looked up as ``/home/flask/app/output_files/<filename>``.
    Each image is written next to it as ``<filename>_p<page>-<xref>.png``.

    Args:
        filename: Name of the file inside the output folder.
        filepath: Not used by this function.

    Returns:
        List of ``output_files/...`` paths of the written images; an empty
        list when the file does not exist.
    """
    output_folder = "/home/flask/app/output_files"

    file_at = f"{output_folder}/{filename}"
    if not os.path.exists(file_at):
        return []

    all_paths = []
    doc = fitz.open(file_at)
    try:
        for i in range(len(doc)):
            for img in doc.getPageImageList(i):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)

                # Build the name with one f-string: applying "%" to a string
                # that already contains the file name breaks on a literal "%".
                image_name = f"{filename}_p{i}-{xref}.png"
                file_out = f"{output_folder}/{image_name}"
                all_paths.append(f"output_files/{image_name}")

                if pix.n >= 5:  # CMYK: convert to RGB first
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                pix.writePNG(file_out)
                pix = None
    finally:
        doc.close()

    return all_paths
Esempio n. 11
0
def extract_images_from_pdf(pdf_path):
    """Write every page image of a PDF to ``extract_path`` as PNG.

    Files are named with a running two-digit counter (``00.png``, ``01.png``,
    ...) across all pages.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        List of the target paths, in extraction order.
    """
    doc = fitz.open(pdf_path)
    images = []

    for page_index in range(len(doc)):
        print(f"extracting images from page {page_index + 1}..")

        for entry in doc.getPageImageList(page_index):
            pixmap = fitz.Pixmap(doc, entry[0])
            # The running counter equals the number of images stored so far.
            target_out = Path(extract_path, f"{len(images):02}.png")

            if pixmap.n - pixmap.alpha >= 4:
                # CMYK: convert to RGB first
                pixmap = fitz.Pixmap(fitz.csRGB, pixmap)
            pixmap.writePNG(target_out)
            pixmap = None

            images.append(target_out)

    print(f"finished extracting {len(images)} images")
    return images
Esempio n. 12
0
 def ExtractImagesFromPDF(self, filename_pdf):
     """Return the images of a PDF as Pillow images.

     Images that are at most 128 pixels wide AND at most 16 pixels high are
     skipped.  Directly consecutive images with identical PNG data are
     reported once.

     Args:
         filename_pdf: Path of the PDF file.

     Returns:
         List of ``Image`` objects opened from the PNG data.
     """
     min_width, min_height = 128, 16
     doc = fitz.open(filename_pdf)

     png_blobs = []
     for page_no in range(len(doc)):
         for entry in doc.getPageImageList(page_no):
             source = fitz.Pixmap(doc, entry[0])
             too_small = (source.width <= min_width
                          and source.height <= min_height)
             if too_small:
                 continue
             # GRAY or RGB is used as is, CMYK is converted to RGB
             usable = source if source.n < 5 else fitz.Pixmap(fitz.csRGB, source)
             png_blobs.append(usable.getImageData("png"))

     # Drop repeats of the immediately preceding image (e.g. clipped copies
     # used on successive pages).
     distinct_blobs = []
     for blob in png_blobs:
         if not distinct_blobs or blob != distinct_blobs[-1]:
             distinct_blobs.append(blob)

     return [Image.open(io.BytesIO(blob)) for blob in distinct_blobs]
Esempio n. 13
0
def extract_images_rgb(doc):
    """Find the pages that contain images and those flagged by the RGB check.

    Every image is written to the scratch file ``img.png``; for pages not yet
    flagged, ``RGBimageanalyze`` decides whether the page is added to the
    second result list.  A percentage is printed as progress and the scratch
    file is removed at the end.

    Args:
        doc: Opened document supporting ``len()`` and ``getPageImageList``.

    Returns:
        Tuple ``(pages_containing_images, rgb_pages)`` of page-index lists.
    """
    pages_containing_images = []
    rgb_pages = []
    for i in range(len(doc)):
        print(' ' * 10, end='\r')
        print(round(100 * i / len(doc)), '%', end='')
        imglist = doc.getPageImageList(i)
        if imglist:
            pages_containing_images.append(i)
        for img in imglist:
            pix = fitz.Pixmap(doc, img[0])  # make pixmap from image
            if pix.n >= 5:  # must convert CMYK before saving as PNG
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.writePNG("img.png")
            pix = None  # free Pixmap resources
            # The equality test is kept as written because the helper's
            # return type is not visible from here.
            if i not in rgb_pages and RGBimageanalyze("img.png") == True:
                rgb_pages.append(i)
    try:
        os.remove("img.png")
    except OSError:
        # Only file-system errors are expected here (e.g. no image written).
        print('unable to remove img.png')
    finally:
        print(' ' * 10, end='\r')
        print('100 %')
    return pages_containing_images, rgb_pages
Esempio n. 14
0
    def photoextraction(self, doc):
        """Save all page images as PNG and show two of them in the UI.

        Each image is converted to RGB and written to ``self.path`` as
        ``p<page>-<xref>.png``.  An image with a width:height ratio of 4:5 or
        3:4 is remembered in ``self.human_image`` and shown in ``label_9``;
        a square image is shown in ``label_25``.

        Args:
            doc: Opened document supporting ``len()`` and ``getPageImageList``.
        """
        for i in range(len(doc)):
            for img in doc.getPageImageList(i):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                image_path = os.path.join(self.path, "p%s-%s.png" % (i, xref))

                # The former "pix.n < 1" test could never be true, so every
                # image already went through the RGB conversion; that effective
                # behaviour is kept without the dead branch.
                pix1 = fitz.Pixmap(fitz.csRGB, pix)
                pix1.writePNG(image_path)

                try:
                    width, height = pix.width, pix.height
                    # Integer cross-multiplication replaces the float
                    # comparisons width == 0.8 * height / 0.75 * height,
                    # which can miss exact ratios through rounding.
                    if 5 * width == 4 * height or 4 * width == 3 * height:
                        # portrait photo
                        self.human_image = image_path
                        pixmap = QPixmap(self.human_image)
                        self.label_9.setPixmap(pixmap)
                        self.label_9.setScaledContents(True)
                    elif width == height:
                        # square scanner code
                        pixmap = QPixmap(image_path)
                        self.label_25.setPixmap(pixmap)
                        self.label_25.setScaledContents(True)
                except Exception as e:
                    print(e)
                    print("fault in human and scanner image")
Esempio n. 15
0
def worker(i, img_count, pdf_path, pic_path):
    """Binarise one PDF image and save it as ``<img_count>.png``.

    Every pixel whose first component is above 140 becomes white
    (255, 255, 255); all other pixels become the dark grey (50, 50, 50).

    Args:
        i: xref of the image inside the PDF.
        img_count: Number used as the output file name.
        pdf_path: Path of the PDF file.
        pic_path: Directory that receives the PNG file.
    """
    doc = fitz.open(pdf_path)  # open the PDF file
    try:
        pix = fitz.Pixmap(doc, i)  # build the pixmap for this xref

        c1, c2 = 140, 50  # threshold and replacement grey level
        white = [255, 255, 255]
        dark = [c2, c2, c2]

        for x in range(pix.w):
            for y in range(pix.h):
                if pix.pixel(x, y)[0] > c1:
                    pix.setPixel(x, y, white)
                else:
                    pix.setPixel(x, y, dark)

        # output name derived from the running image number
        new_name = os.path.join(pic_path, f'{img_count}.png')

        if pix.n >= 5:  # convert to RGB first, otherwise save directly
            pix = fitz.Pixmap(fitz.csRGB, pix)
        pix.writePNG(new_name)
        pix = None  # release resources
    finally:
        # Close the document even when a pixel operation or the save fails.
        doc.close()
def extract_from_pdf(file_path):
    """Export all page images of a PDF as PNG files.

    Files are named ``<prefix> - p<page>-<xref>.png`` where ``prefix`` comes
    from the ``_prefix`` helper.  An image that fails to export is reported
    on stdout and skipped.

    Args:
        file_path: Path of the PDF file.

    Returns:
        List of the file names that were written successfully.
    """
    prefix = _prefix(file_path)
    outfiles = []
    doc = fitz.open(file_path)
    try:
        for i in range(len(doc)):
            for img in doc.getPageImageList(i):
                xref = img[0]
                # Named before the try block so the failure message always
                # refers to the image that actually failed.
                outfile = '{} - p{}-{}.png'.format(prefix, i, xref)
                try:
                    pix = fitz.Pixmap(doc, xref)
                    if pix.n >= 5:  # CMYK: convert to RGB first
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    pix.writePNG(outfile)
                    pix = None
                    outfiles.append(outfile)
                except Exception:
                    # Best effort: report and continue with the next image.
                    print('      failed to export image {} from pdf {}'.format(outfile, file_path))
    finally:
        doc.close()
    return outfiles
    def get_pdf_content(self, filePath, languages):
        """Collect the text of a PDF, including text recognised in its images.

        The plain text of every page is gathered first; afterwards each page
        image is passed to ``self.ocrize`` and any recognised text appended.

        Args:
            filePath: Path of the PDF file.
            languages: Passed through to ``self.ocrize``.

        Returns:
            The collected text, or ``-1`` as soon as ``self.ocrize``
            returns ``-1`` for an image.
        """
        collected = ""
        with fitz.open(filePath) as doc:
            image_entries = []
            for page in doc:
                collected = collected + page.getText("text")
                image_entries.extend(doc.getPageImageList(page.number))

            for entry in image_entries:
                pixmap = fitz.Pixmap(doc, entry[0])
                if pixmap.n >= 5:
                    # CMYK colorspace: convert to RGB
                    pixmap = fitz.Pixmap(fitz.csRGB, pixmap)

                ocr_result = self.ocrize(BytesIO(pixmap.getImageData()),
                                         languages)

                if ocr_result == -1:
                    return -1
                if ocr_result:
                    collected += ocr_result

        return collected
Esempio n. 18
0
def recoverpix(doc, item):
    """Return the image for a given xref, applying its /SMask when possible.

    Args:
        doc: Document the image belongs to.
        item: Sequence holding the image xref and the xref of its /SMask
            (``0`` when the image has none).

    Returns:
        The result of ``doc.extractImage`` when there is no /SMask,
        otherwise a pixmap (with the mask applied as alpha when supported).
    """
    image_xref, mask_xref = item[0], item[1]
    if mask_xref == 0:
        # no smask: use direct image output
        return doc.extractImage(image_xref)

    def rgb_if_cmyk(pixmap):
        # Pixmaps with a 4-component colorspace are converted to RGB.
        if pixmap.colorspace.n == 4:
            return fitz.Pixmap(fitz.csRGB, pixmap)
        return pixmap

    # the alpha channel has to be rebuilt from the smask
    base = fitz.Pixmap(doc, image_xref)
    mask = fitz.Pixmap(doc, mask_xref)

    # Sanity check: same rectangle, no alpha on either pixmap, and the mask
    # must consist of one byte per pixel.
    mask_usable = (base.irect == mask.irect
                   and base.alpha == mask.alpha == 0
                   and mask.n == 1)
    if mask_usable:
        with_alpha = fitz.Pixmap(base)  # copy of the base, alpha channel added
        with_alpha.setAlpha(mask.samples)  # mask samples become alpha values
        base = mask = None  # free temp pixmaps
        return rgb_if_cmyk(with_alpha)

    print("Warning: unsupported /SMask %i for %i:" % (mask_xref, image_xref))
    print(mask)
    mask = None
    return rgb_if_cmyk(base)  # return the pixmap as is
Esempio n. 19
0
def pdftoimages(input_dir,output_dir):
    """Save the first image of the first page of every PDF as PNG.

    Each entry of ``input_dir`` is opened with ``fitz.open``.  Only the first
    image listed for page 0 is written, as ``p<file index>-<xref>.png``.

    Args:
        input_dir: Directory containing the PDF files.
        output_dir: Directory that receives the PNG files.

    Returns:
        None. The PNG files in ``output_dir`` are the only result.
    """
    for num, name in enumerate(os.listdir(input_dir)):
        doc = fitz.open(input_dir + "/" + name)
        try:
            page_images = doc.getPageImageList(0)
            if not page_images:
                continue
            xref = page_images[0][0]  # only the first image is used
            pix = fitz.Pixmap(doc, xref)
            if pix.n >= 5:  # CMYK: convert to RGB first
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.writePNG(os.path.join(output_dir, "p%s-%s.png" % (num, xref)))
            pix = None
        finally:
            # One document per input file: close each before the next one.
            doc.close()
Esempio n. 20
0
def pdf2pic(filepath, pic_path):
    """Extract all PDF images, keep the second-to-last one, delete the rest.

    Images are saved under time-stamp based names; a short pause after each
    image is part of the original flow.

    NOTE(review): with fewer than two images the lookup of the second-to-last
    name raises ``IndexError``.

    Args:
        filepath: Path of the PDF file.
        pic_path: Directory used for the PNG files.

    Returns:
        ``pic_path`` + backslash + name of the image that was kept.
    """
    xobject_pattern = r"/Type(?= */XObject)"  # regex used to find XObjects
    image_pattern = r"/Subtype(?= */Image)"  # regex used to find images
    doc = fitz.open(filepath)  # open the PDF file
    xref_count = doc._getXrefLength()  # length of the xref table
    saved_names = []

    # walk through every object
    for xref in range(1, xref_count):
        definition = doc._getXrefString(xref)  # string defining the object
        is_image_xobject = (re.search(xobject_pattern, definition)
                            and re.search(image_pattern, definition))
        if not is_image_xobject:
            continue
        pixmap = fitz.Pixmap(doc, xref)  # build the pixmap
        name = "图片{}.png".format(time.time())  # time-stamp based name
        saved_names.append(name)
        target = os.path.join(pic_path, name)
        if pixmap.n >= 5:  # convert to RGB first
            converted = fitz.Pixmap(fitz.csRGB, pixmap)
            converted.writePNG(target)
            converted = None
        else:  # can be stored as PNG directly
            pixmap.writePNG(target)
        pixmap = None  # release resources
        time.sleep(0.1)

    time.sleep(1)
    kept_name = saved_names[-2]
    discarded = saved_names[:-2] + saved_names[-1:]
    for name in discarded:
        os.remove(pic_path + '\\' + name)
    return pic_path + '\\' + kept_name
Esempio n. 21
0
def recoverpix(doc, xref, item):
    """Return a pixmap for an image, applying its /SMask when possible.

    Args:
        doc: Document the image belongs to.
        xref: xref of the image.
        item: Mapping whose ``"smask"`` entry is the xref of the /SMask.

    Returns:
        ``None`` when the image pixmap cannot be created; otherwise a pixmap,
        with the mask applied as alpha when the mask is usable.
    """

    def getimage(pix):
        # Pixmaps with a 4-component colorspace are converted to RGB.
        if pix.colorspace.n != 4:
            return pix
        return fitz.Pixmap(fitz.csRGB, pix)

    s = item["smask"]  # xref of its /SMask

    # "except Exception" instead of a bare except: interpreter-exit signals
    # such as KeyboardInterrupt are no longer swallowed.
    try:
        pix1 = fitz.Pixmap(doc, xref)  # make pixmap from image
    except Exception:
        return None  # skip if error

    try:
        pix2 = fitz.Pixmap(doc, s)  # create pixmap of /SMask entry
    except Exception:
        print("cannot create mask %i for image %i" % (s, xref))
        return getimage(pix1)  # return w/ failed transparency

    # check that we are safe
    if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
        return getimage(pix1)
    pix = fitz.Pixmap(pix1)  # copy of pix1, alpha channel added
    pix.setAlpha(pix2.samples)  # treat pix2.samples as alpha values
    pix1 = pix2 = None  # free temp pixmaps
    return getimage(pix)
Esempio n. 22
0
 def pdf2pic(path, pic_path):
     """Extract the image XObjects of a PDF into ``pic_path`` as PNG files.

     Objects whose definition contains both 'Width 2550' and 'Height 3300',
     or the word 'thumbnail', are skipped.  Images are named ``img<n>.png``
     in scan order.

     Args:
         path: Path of the PDF file.
         pic_path: Directory that receives the PNG files.
     """
     checkXO = r"/Type(?= */XObject)"
     checkIM = r"/Subtype(?= */Image)"

     # open the PDF
     doc = fitz.open(path)
     try:
         nums = doc._getXrefLength()
         imgcount = 0
         for i in range(1, nums):
             text = doc._getXrefString(i)
             if ('Width 2550' in text and 'Height 3300' in text) or 'thumbnail' in text:
                 continue
             if not re.search(checkXO, text) or not re.search(checkIM, text):
                 continue
             imgcount += 1
             pix = fitz.Pixmap(doc, i)
             target = os.path.join(pic_path, "img{}.png".format(imgcount))
             if pix.n < 5:
                 try:
                     pix.writePNG(target)
                 except Exception:
                     # direct save failed: retry after converting to RGB
                     pix0 = fitz.Pixmap(fitz.csRGB, pix)
                     pix0.writePNG(target)
                     pix0 = None
             else:
                 # Pixmaps with 5+ components were counted but never written;
                 # convert them to RGB and save them as well.
                 pix0 = fitz.Pixmap(fitz.csRGB, pix)
                 pix0.writePNG(target)
                 pix0 = None
             pix = None
     finally:
         doc.close()
Esempio n. 23
0
def count_extract_pdf_images(pdf_file_path, save_images = False):
    """Count the non-blank images of a PDF and optionally save them.

    Images whose sample data passes the blank filter below are ignored.
    Saved images go to ``<pdf directory>/<pdf name>_images`` and are named
    ``<pdf name>_image_<count>_page_<page>.png``.

    Args:
        pdf_file_path: Path of the PDF file.
        save_images: When true, write the counted images to disk.

    Returns:
        Tuple ``(count, saved_image_filepaths)``.
    """
    pdf_directory, pdf_name = Path(pdf_file_path).parent, str(Path(pdf_file_path).stem)
    images_dir = pdf_directory / (pdf_name + '_images')
    count, saved_image_filepaths = 0, []

    def _has_content(pixmap):
        # Plain images leave just 3 characters once these replacements are
        # made on the textual form of the samples (expression kept verbatim).
        return len(str(pixmap.samples).replace('\\', '').replace('x', '').replace('f', '').replace('0', '')) > 3

    doc = fitz.open(pdf_file_path)
    try:
        for i in range(len(doc)):
            page = i + 1
            for img in doc.getPageImageList(i):
                pix = fitz.Pixmap(doc, img[0])
                if pix.n >= 5:  # CMYK: convert to RGB first
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                if _has_content(pix):
                    count += 1
                    image_name = '{}_image_{}_page_{}.png'.format(pdf_name, count, page)
                    if save_images:
                        images_dir.mkdir(exist_ok=True)
                        pix.writePNG(str(images_dir / image_name))
                        saved_image_filepaths.append(str(images_dir / image_name))
                pix = None
    finally:
        doc.close()
    return count, saved_image_filepaths
Esempio n. 24
0
def extract_images_pymupdf(message):
    """Extract every page image of a file and announce each one as an event.

    Each image is written to ``./data/processing/<uuid>`` and reported via
    ``sendEvent`` with a message that references the parent identifier.

    Args:
        message: Mapping with at least the keys ``"path"`` (file to open)
            and ``"identifier"`` (used as ``"parent"`` of the new messages).
    """
    processing_dir = "./data/processing/"
    doc = fitz.open(message["path"])
    try:
        for i in range(len(doc)):
            for img in doc.getPageImageList(i):
                identifier = str(uuid.uuid4())
                workfile = processing_dir + identifier

                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                if pix.n >= 5:
                    # fitz.csRGB is the RGB colorspace; the former "czRGB"
                    # does not exist and failed for these images.
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                pix.writePNG(workfile)
                pix = None
                filename = str(i) + "-" + str(xref) + ".png"
                new_message = {
                    "identifier": identifier,
                    "parent": message["identifier"],
                    "path": workfile,
                    "filename": filename,
                    "filetype": "unknown",
                    "history": [],
                    "metadata": {},
                    "original_file": False
                }
                sendEvent(new_message)
    finally:
        doc.close()
Esempio n. 25
0
def recoverpix(doc, item):
    """Return the image for an xref pair, rebuilding alpha from the /SMask.

    Args:
        doc: Document the image belongs to.
        item: Sequence of the image xref followed by its /SMask xref
            (``0`` when there is no mask).

    Returns:
        ``doc.extractImage(...)`` output when there is no mask, else a pixmap.
    """
    img_xref = item[0]  # xref of PDF image
    smask_xref = item[1]  # xref of its /SMask
    if smask_xref == 0:
        # no smask: use direct image output
        return doc.extractImage(img_xref)

    def finalize(candidate):
        # 4-component colorspaces are converted to RGB, others pass through.
        needs_rgb = candidate.colorspace.n == 4
        return fitz.Pixmap(fitz.csRGB, candidate) if needs_rgb else candidate

    # the alpha channel has to be reconstructed with the smask
    image_pix = fitz.Pixmap(doc, img_xref)
    mask_pix = fitz.Pixmap(doc, smask_xref)

    # sanity check: identical rectangles, no alpha, one-byte mask pixels
    if (image_pix.irect == mask_pix.irect
            and image_pix.alpha == mask_pix.alpha == 0
            and mask_pix.n == 1):
        result = fitz.Pixmap(image_pix)  # copy with an alpha channel added
        result.setAlpha(mask_pix.samples)  # mask samples become alpha values
        image_pix = mask_pix = None  # free temp pixmaps
        return finalize(result)

    mask_pix = None
    return finalize(image_pix)
Esempio n. 26
0
def handlePDF(doc):
    """Save the non-white images of pages that mention a four-letter code.

    Pages from index 20 onwards are searched for four capital letters.
    For a match that does not contain "TOME", every image of the page that is
    larger than 10x10 and less than 20 % pure white is passed to ``saveImg``
    together with the alphanumeric part of the match and a per-page counter.

    NOTE(review): the pattern ends with ``(?:^|\\W)``; a trailing ``$`` may
    have been intended instead of ``^`` - left unchanged.

    Args:
        doc: Opened document supporting ``len()``, ``loadPage`` and
            ``getPageImageList``.
    """
    # Raw string: "\W" in a normal string literal is an invalid escape.
    taxon_regex = r"(?:^|\W)[A-Z]{4}(?:^|\W)"
    p = re.compile(taxon_regex)
    for page_no in range(20, len(doc)):
        page = doc.loadPage(page_no)
        text = page.getText("text")
        match = p.search(text)
        if not match or "TOME" in match.group():
            continue
        hit = match.group()
        print("---------------------------------")
        print(page)
        print(hit, page.number)
        taxon = ''.join(e for e in hit if e.isalnum())
        # Separate counter for saved images (no longer reuses the page index).
        img_no = 0
        for img in doc.getPageImageList(page.number):
            xref = img[0]
            pix = fitz.Pixmap(doc, xref)
            if pix.n >= 5:  # CMYK: convert to RGB first
                pix = fitz.Pixmap(fitz.csRGB, pix)
            ROI = pix2np(pix)
            ROI_w, ROI_h = ROI.shape[0], ROI.shape[1]
            if ROI_w > 10 and ROI_h > 10:
                # check whether the image contains too much white
                area = ROI_w * ROI_h
                if ROI.shape[2] != 1:
                    ROI_gray = cv2.cvtColor(ROI, cv2.COLOR_BGR2GRAY)
                else:
                    ROI_gray = ROI
                ret, ROI_thr = cv2.threshold(ROI_gray, 254, 255,
                                             cv2.THRESH_BINARY)
                n = len(np.where(ROI_thr == 255)[0])
                # if not, save the image
                if n < area * 0.2:
                    saveImg(ROI_gray, "tmp", taxon, img_no)
                    img_no += 1
Esempio n. 27
0
def recoverpix(doc, item):
    """Return pixmap for item, which is a list of 2 xref numbers.

    The second xref is that of an smask if > 0.

    Args:
        doc: Document the image belongs to.
        item: Sequence of the image xref and its /SMask xref.

    Returns:
        ``None`` when the image pixmap cannot be created.  Otherwise the
        image pixmap, with the mask applied as alpha when one exists and is
        usable.
    """
    x = item[0]  # xref of PDF image
    s = item[1]  # xref of its /SMask

    # "except Exception" instead of a bare except: interpreter-exit signals
    # such as KeyboardInterrupt are no longer swallowed.
    try:
        pix1 = fitz.Pixmap(doc, x)     # make pixmap from image
    except Exception:
        print("xref %i " % x + doc._getGCTXerrmsg())
        return None                    # skip if error

    if s == 0:                    # has no /SMask
        return pix1               # no special handling

    try:
        pix2 = fitz.Pixmap(doc, s)    # create pixmap of /SMask entry
    except Exception:
        print("cannot create mask %i for image xref %i" % (s,x))
        return pix1               # return w/ failed transparency

    # check that we are safe
    if not (pix1.irect == pix2.irect and
            pix1.alpha == pix2.alpha == 0 and
            pix2.n == 1):
        print("unexpected /SMask situation: pix1", pix1, "pix2", pix2)
        return pix1
    pix = fitz.Pixmap(pix1)       # copy of pix1, alpha channel added
    pix.setAlpha(pix2.samples)    # treat pix2.samples as alpha values
    pix1 = pix2 = None            # free temp pixmaps
    return pix
Esempio n. 28
0
def extractImagesJPG(filename, outfolder, verbose, imagequality):
    """Save the page images of a document as JPEG files.

    Each image is stored through ``saveJPEG`` as
    ``<outfolder>Page_<zero-padded page index>.jpg``.

    NOTE(review): the name only depends on the page index, so several images
    on one page are written to the same file - confirm this is intended.

    Args:
        filename: Path of the document to open.
        outfolder: Prefix of the output paths (used as is, no separator added).
        verbose: Passed through to ``writelog`` and ``saveJPEG``.
        imagequality: Passed through to ``saveJPEG``.

    Returns:
        List of the output file names, one entry per processed image.
    """
    import fitz

    imgList = []

    writelog('Extracting images', verbose)

    doc = fitz.open(filename)
    try:
        pages = len(doc)
        suffix = len(str(pages)) + 1  # zero-padding width of the page index

        for i in range(pages):
            for img in doc.getPageImageList(i):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)

                # ">= 5" matches the "n < 5 else convert" split used by the
                # other extractors; the former "> 5" never triggered.
                if pix.n >= 5:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                newname = "{0}Page_{1}.jpg".format(outfolder, str(i).zfill(suffix))
                writelog('Extracting Page {0}, Saving as {1}'.format(i, newname), verbose)
                saveJPEG(newname, pix, verbose, imagequality)
                imgList.append(newname)
                pix = None
    finally:
        doc.close()

    return imgList
Esempio n. 29
0
def test_filepixmap():
    """Pixmaps from a file and from a stream should lead to the same result."""
    pix1 = fitz.Pixmap(imgfile)
    # Read through a context manager so the file handle is closed again.
    with open(imgfile, "rb") as fh:
        stream = fh.read()
    pix2 = fitz.Pixmap(stream)
    assert repr(pix1) == repr(pix2)
    assert pix1.digest == pix2.digest
Esempio n. 30
0
def pdf2pic(path, num, bar_value):
    """Extract the image XObjects of a PDF and report progress via a signal.

    The output folder is ``<cwd>\\图片\\<res>``, where ``res`` is the
    concatenation of all ``num``-character alphanumeric runs found in the
    file name, or the file name itself when there is none.  After each saved
    image the elapsed time and running count are printed.

    NOTE(review): files are named ``.jpg`` although they are written with
    ``writePNG``; the names are kept unchanged.

    Args:
        path: Path of the PDF file.
        num: Length of the alphanumeric runs searched in the file name.
        bar_value: Object with an ``emit`` method that receives
            ``['bar', <value>]`` for every image found.
    """
    # time.clock() no longer exists in Python 3.8+; perf_counter replaces it.
    t0 = time.perf_counter()  # start time of the extraction
    checkXO = r"/Type(?= */XObject)"  # regex used to find XObjects
    checkIM = r"/Subtype(?= */Image)"  # regex used to find images
    doc = fitz.open(path)  # open the PDF file
    try:
        imgcount = 0  # number of images saved so far
        lenXREF = doc._getXrefLength()  # length of the xref table
        parrent = re.compile(r'[a-zA-Z0-9]{%s}' % num)
        tempfilename = os.path.split(path)[1]
        filename = os.path.splitext(tempfilename)[0]
        result = "".join(re.findall(parrent, filename))
        print('==========', result, '==========')
        if result:
            res = result
            pic_name = res
        else:
            res = filename
            pic_name = "_"
        # print information about the PDF
        print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), lenXREF - 1))

        # walk through every object
        for i in range(1, lenXREF):
            text = doc._getXrefString(i)  # string defining the object
            # skip unless the object is both an XObject and an image
            if not re.search(checkXO, text) or not re.search(checkIM, text):
                continue
            imgcount += 1
            pix = fitz.Pixmap(doc, i)  # build the pixmap
            new_name = "{}_{}.jpg".format(pic_name, imgcount)  # image file name
            new_path = os.getcwd() + r"\图片"
            bar_value.emit(['bar', 90 / lenXREF * (i + 1)])
            if os.path.exists(new_path):
                print("文件夹已存在,不必重新创建!")
            else:
                os.makedirs(new_path)

            new_filepath = new_path + r"\%s" % res
            print(new_filepath)
            if os.path.exists(new_filepath):
                print("文件夹已存在,不必重新创建!")
            else:
                os.makedirs(new_filepath)

            if pix.n >= 5:  # convert to RGB first, otherwise save directly
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.writePNG(os.path.join(new_filepath, new_name))
            pix = None  # release resources
            t1 = time.perf_counter()  # time at which this image was done
            print("运行时间:{}s".format(t1 - t0))
            print("提取了{}张图片".format(imgcount))
    finally:
        doc.close()