def bmp_or_jpg_to_pdf(list_of_file_paths, destination_directory, delete_source=False):
    """Convert BMP/JPG image files to PDF files, one PDF per image.

    :param list_of_file_paths: iterable of paths to the source images. The
        caller's list is not modified.
    :param destination_directory: directory the ``<name>.pdf`` files are
        written to.
    :param delete_source: when exactly ``True``, the successfully converted
        source images are deleted afterwards.
    :return: ``{"code": 0, "invalid_file_paths": "None"}`` when every path
        exists, otherwise ``{"code": 1, "invalid_file_paths": "<a>,<b>,..."}``
        listing the paths that do not exist.
    """
    # Partition into two fresh lists. Removing entries from the list that is
    # being iterated skips the element following each removal, which let
    # missing paths slip through to the conversion loop.
    valid_file_paths = []
    invalid_file_paths = []
    for file_path in list_of_file_paths:
        if os.path.exists(file_path):
            valid_file_paths.append(file_path)
        else:
            invalid_file_paths.append(file_path)

    for file_path in valid_file_paths:
        image = PyImage()
        image.density("600")
        image.read(file_path)
        name, _ext = os.path.splitext(os.path.basename(file_path))
        image.write(os.path.join(destination_directory, name + ".pdf"))

    if delete_source is True:
        for file_path in valid_file_paths:
            os.remove(file_path)

    if not invalid_file_paths:
        return {"code": 0, "invalid_file_paths": "None"}
    # Report the paths that were rejected (not the ones that were converted).
    return {"code": 1, "invalid_file_paths": ",".join(invalid_file_paths)}
Beispiel #2
0
def main():
    """Render every page of every PDF in a directory as a JPEG file.

    The directory comes from the first command-line argument and defaults to
    the current directory. Each page is first written as a one-page PDF into
    a ``.temp`` sub-directory, rendered to ``<name>_<page>.jpeg`` next to the
    source PDF, and the intermediate one-page PDF is then deleted. The
    ``.temp`` directory is removed once all files have been processed.
    """
    # os.environ["MAGICK_HOME"] = r"path_to_ImageMagick"

    dirpath = sys.argv[1] if len(sys.argv) > 1 else '.'

    temp_dir = os.path.join(dirpath, '.temp')
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)

    for file_name in os.listdir(dirpath):
        in_path = os.path.join(dirpath, file_name)
        # Check the joined path: a bare name would be resolved against the
        # current working directory instead of ``dirpath``.
        if not os.path.isfile(in_path) or not file_name.endswith('.pdf'):
            continue
        print('Converting file {} ...'.format(file_name))
        base_name = file_name[:-len('.pdf')]
        with open(in_path, 'rb') as source:
            inputpdf = PdfFileReader(source)
            for i in range(inputpdf.numPages):
                outputpdf = PdfFileWriter()
                outputpdf.addPage(inputpdf.getPage(i))
                page_pdf_path = os.path.join(
                    temp_dir, '{}_{}.pdf'.format(base_name, i))
                with open(page_pdf_path, 'wb') as page_pdf:
                    outputpdf.write(page_pdf)

                output_path = os.path.join(
                    dirpath, '{}_{}.jpeg'.format(base_name, i))
                try:
                    p = Image()
                    p.density('1080')
                    p.read(os.path.abspath(page_pdf_path))
                    p.write(os.path.abspath(output_path))
                finally:
                    # Never leave the intermediate page behind, otherwise
                    # the final rmdir below fails on a non-empty directory.
                    os.remove(page_pdf_path)

    # Remove the scratch directory only after *all* PDFs are done; removing
    # it inside the loop broke every PDF after the first one.
    os.rmdir(temp_dir)
Beispiel #3
0
def pdf2images(name):
    """Write each page of the PDF ``name`` to its own image file.

    The page count comes from ``getPdfNumPages``; page ``k`` is written to
    ``name`` + ``k`` + ``defaultImageExtension``.
    """
    page_total = getPdfNumPages(name)
    for page_index in range(page_total):
        page_spec = "".join([name, '[', str(page_index), ']'])
        target = "".join([name, str(page_index), defaultImageExtension])

        page_image = Image()
        page_image.density('200')
        page_image.quality(100)
        page_image.depth(24)
        page_image.read(page_spec)
        page_image.write(target)
Beispiel #4
0
def main():
    """Render every page of the PDF named on the command line to an image.

    Expects the PDF path as the first command-line argument; prints a usage
    message and returns when it is missing. Page ``p`` is written to
    ``/dev/shm/<pdf file name>_<p><out_format>``.
    """
    if len(sys.argv) < 2:
        print('invalid usage!')
        return
    pdf_filename = sys.argv[1]
    # ``open`` inside ``with`` replaces the Python-2-only ``file()`` builtin
    # and guarantees the handle is closed once the page count is known.
    with open(pdf_filename, "rb") as pdf_file:
        npage = PdfFileReader(pdf_file).getNumPages()
    fname = pdf_filename.split('/')[-1]
    tmppath = '/dev/shm/'
    for p in range(npage):
        im = Image()
        im.density('300')
        im.read(pdf_filename + '[' + str(p) + ']')
        im.write(tmppath + fname + '_' + str(p) + out_format)
Beispiel #5
0
def pdf2img(input_pdf, postfix='.png'):
    """Rasterize ``input_pdf`` onto a ``bgcolor`` canvas and save it as an image.

    :param input_pdf: path of the PDF to render.
    :param postfix: extension of the output file; it replaces a trailing
        ``.pdf`` (any case) or is appended when the input has another
        extension.
    """
    # Density has to be configured before the file is read; setting it on an
    # already-loaded image does not change how the PDF was rasterized.
    img = Image()
    img.density('300')
    img.read(input_pdf)

    size = "%sx%s" % (img.columns(), img.rows())
    output_img = Image(size, bgcolor)
    output_img.type = img.type
    output_img.composite(img, 0, 0,
                         PythonMagick.CompositeOperator.SrcOverCompositeOp)
    output_img.resize(str(img.rows()))
    output_img.magick('JPG')
    output_img.quality(75)

    # Build the output name from the extension only. A plain
    # ``replace(".pdf", ...)`` rewrote every occurrence in the path and, when
    # nothing matched, produced the input path itself - which was then
    # deleted below.
    root, ext = os.path.splitext(input_pdf)
    if ext.lower() == ".pdf":
        output_jpg = root + postfix
    else:
        output_jpg = input_pdf + postfix
    if os.path.exists(output_jpg):
        os.remove(output_jpg)
    output_img.write(output_jpg)
Beispiel #6
0
class Answers:
    """One rendered page of a document, kept as a pygame image."""

    def __init__(self, filename, i):
        """Render page ``i`` of ``filename`` and load the result into pygame."""
        self.image = Image()
        page = self.image
        page.density('%d' % DENSITY)
        page.read('%s[%d]' % (filename, i))

        # The page is handed to pygame through a temporary PNG file that is
        # closed again right after loading.
        png_file = tempfile.NamedTemporaryFile(suffix='.png')
        page.write(png_file.name)
        self.pimage = pygame.image.load(png_file.name)
        png_file.close()

    def draw(self, screen, C):
        """Clear ``screen`` to white and blit the region ``C`` of the page.

        ``C`` is indexed as ``(x0, y0, x1, y1)``; the blitted area is
        ``x1 - x0`` wide and ``y1 - y0`` high, placed at the top-left corner.
        """
        white = (255, 255, 255)
        pygame.draw.rect(screen, white, (0, 0, W, H), 0)

        source_area = (C[0], C[1], C[2] - C[0], C[3] - C[1])
        screen.blit(self.pimage, (0, 0), source_area)
        pygame.display.flip()
Beispiel #7
0
class Answers:
    """A single document page rasterized for display with pygame."""

    def __init__(self, filename, i):
        """Read page ``i`` of ``filename`` and keep it as a pygame image."""
        self.image = Image()
        self.image.density('%d' % DENSITY)
        page_spec = '%s[%d]' % (filename, i)
        self.image.read(page_spec)

        # Write the page to a temporary PNG so pygame can load it from disk.
        scratch = tempfile.NamedTemporaryFile(suffix='.png')
        scratch_name = scratch.name
        self.image.write(scratch_name)
        self.pimage = pygame.image.load(scratch_name)
        scratch.close()

    def draw(self, screen, C):
        """Paint a white background, then show the ``C`` region of the page.

        ``C`` is read as ``(x0, y0, x1, y1)`` in page pixels.
        """
        pygame.draw.rect(screen, (255, 255, 255), (0, 0, W, H), 0)

        region_width = C[2] - C[0]
        region_height = C[3] - C[1]
        screen.blit(self.pimage, (0, 0), (C[0], C[1], region_width, region_height))
        pygame.display.flip()
Beispiel #8
0
def pdf2img(input_pdf, postfix='.png', **kwargs):
    """Rasterize ``input_pdf`` onto a ``bgcolor`` canvas and save it as an image.

    :param input_pdf: path of the PDF to render.
    :param postfix: suffix appended to ``input_pdf`` to form the default
        output path.
    :param kwargs: ``out_path`` overrides the default output path when given.
    """
    # Density has to be configured before the file is read; setting it on an
    # already-loaded image does not change how the PDF was rasterized.
    img = Image()
    img.density('300')
    img.read(input_pdf)

    size = "%sx%s" % (img.columns(), img.rows())
    output_img = Image(size, bgcolor)
    output_img.type = img.type
    output_img.composite(img, 0, 0,
                         PythonMagick.CompositeOperator.SrcOverCompositeOp)
    output_img.resize(str(img.rows()))
    output_img.magick('JPG')
    output_img.quality(75)

    if 'out_path' in kwargs:
        output_jpg = kwargs['out_path']
    else:
        output_jpg = input_pdf + postfix
    # Replace any stale output from a previous run.
    if os.path.exists(output_jpg):
        os.remove(output_jpg)
    output_img.write(output_jpg)
Beispiel #9
0
def pdf2jpg(pdf, temp):
    """Render ``pdf`` to a JPEG inside the directory ``temp``.

    A temp directory is used in case the PDF's own location is read-only.

    :param pdf: path of the PDF to render (converted with ``str``).
    :param temp: directory that receives ``<pdf base name>.jpg``.
    :return: path of the JPEG that was written.
    """
    pdf = str(pdf)
    base_name = os.path.splitext(os.path.basename(pdf))[0]
    # Join instead of concatenating so ``temp`` works with or without a
    # trailing path separator.
    jpg = os.path.join(str(temp), base_name + ".jpg")

    img = PMImage()
    img.density('300')
    img.depth(24)
    img.read(pdf)
    img.write(jpg)

    # Re-save through PIL as plain RGB. JPEG has no alpha channel, and
    # current Pillow versions refuse to write an RGBA image as JPEG.
    rendered = Image.open(jpg)
    rgbimg = Image.new("RGB", rendered.size)
    rgbimg.paste(rendered)
    rgbimg.save(jpg)
    return jpg
Beispiel #10
0
def pdf_to_image():
    """Render page index 1 of every ``.pdf`` in ``pdf_dir`` to a JPEG.

    Each page is composited onto a ``bg_colour`` canvas and written next to
    its source PDF as ``<name>.jpg``.
    """
    for pdf in os.listdir(pdf_dir):
        if not pdf.endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_dir, pdf)
        # "[1]" is the page selector understood by the image reader.
        input_pdf = pdf_path + "[1]"
        img = Image()
        img.density('300')
        print(input_pdf)
        img.read(input_pdf)

        size = "%sx%s" % (img.columns(), img.rows())

        output_img = Image(size, bg_colour)
        output_img.type = img.type
        output_img.composite(img, 0, 0,
                             PythonMagick.CompositeOperator.SrcOverCompositeOp)
        output_img.resize(str(img.rows()))
        output_img.magick('JPG')
        output_img.quality(75)

        # Derive the output name from the plain file path so the "[1]" page
        # selector does not leak into the name handed to the writer.
        output_jpg = os.path.splitext(pdf_path)[0] + ".jpg"
        output_img.write(output_jpg)
Beispiel #11
0
def pdftojpg(fname):
    """Render every page of the PDF ``fname`` to ``<dir>/some<N>.jpg``.

    The output directory is named after the base name of ``fname`` and is
    created relative to the current working directory; an existing directory
    of that name is deleted and recreated first. Pages are numbered from 1.

    :param fname: path of the PDF to convert.
    :return: the output directory name.
    """
    with open(fname, "rb") as pdf_file:
        reader = PdfFileReader(pdf_file)
        directory = os.path.basename(fname)
        if os.path.exists(directory):
            shutil.rmtree(directory)
            # NOTE(review): presumably gives the filesystem time to finish
            # the delete before the directory is recreated - confirm.
            time.sleep(2)
        os.makedirs(directory)

        for page_num in range(reader.getNumPages()):
            writer = PdfFileWriter()
            writer.addPage(reader.getPage(page_num))
            temp = NamedTemporaryFile(prefix=str(page_num), suffix=".pdf", delete=False)
            try:
                writer.write(temp)
                temp.close()

                im = Magick_Image()
                im.density("300")  # DPI, for better quality
                im.read(temp.name)
                im.write(directory + "/some%d.jpg" % (page_num + 1))
            finally:
                # delete=False means the temp file must be removed by hand,
                # also when rendering fails.
                temp.close()
                os.remove(temp.name)
    return directory
def convert(args):
    """Render every page of ``args.ifile`` to ``<ofile stem>_<page>.jpg``.

    Relative ``args.ifile`` / ``args.ofile`` values are resolved against the
    current working directory. Each page goes through an intermediate
    ``temp.pdf`` in the working directory, which is removed again after the
    page has been rendered.

    :param args: object with ``ifile`` and ``ofile`` attributes.
    """
    dirname = getcwd()
    ifilename = args.ifile if path.isabs(args.ifile) else path.join(dirname, args.ifile)
    ofilename = args.ofile if path.isabs(args.ofile) else path.join(dirname, args.ofile)
    ofilename_n_ext = path.splitext(ofilename)[0]
    temp_pdf = path.join(dirname, 'temp.pdf')

    # Keep the source open only while pages are being extracted.
    with open(ifilename, "rb") as source:
        reader = PdfFileReader(source)
        for page_num in range(reader.getNumPages()):
            writer = PdfFileWriter()
            writer.addPage(reader.getPage(page_num))
            with open(temp_pdf, 'wb') as temp:
                writer.write(temp)

            try:
                im = Image()
                im.density("300")  # DPI, for better quality
                im.backgroundColor('white')
                im.fillColor('white')
                im.read(temp_pdf)
                im.write("%s_%d.jpg" % (ofilename_n_ext, page_num))
            finally:
                # Do not leave the intermediate page behind on failure.
                remove(temp_pdf)
Beispiel #13
0
# Script prologue: render each page of a PDF to "<page>.jpg" in the working
# directory and create the OCR client.
# NOTE(review): `path` and `pdfname` are not defined in the visible part of
# this script - presumably set earlier; confirm.

# Density string handed to Image.density() for every rendered page.
DPI='85'
# Placeholder credentials passed to AipOcr below; replace with real values.
APP_ID='??'
API_KEY='??'
SECRET_KEY='??'
# Placeholder for the wkhtmltopdf location handed to pdfkit (the literal is
# a Chinese reminder to set the install path).
path_wk=r'pdfkit安装位置设置'
pdfkit_config=pdfkit.configuration(wkhtmltopdf=path_wk)
pdfkit_options={'encoding':'UTF-8',}

os.chdir(path)
# NOTE(review): this file handle is never closed explicitly.
pdf_input=PdfFileReader(open(pdfname, 'rb'))
page_count=pdf_input.getNumPages()
page_range=range(page_count)

# Render page N of the PDF (selected with the "[N]" suffix) to "N.jpg".
for page_num in page_range:
	im=Image()
	im.density(DPI)
	im.read(pdfname + '[' + str(page_num) +']')
	im.write(str(page_num)+ '.jpg')

client=AipOcr(APP_ID, API_KEY, SECRET_KEY)
def get_file_content(filePath):
	"""Return the complete contents of ``filePath`` as bytes."""
	fp = open(filePath, 'rb')
	try:
		data = fp.read()
	finally:
		fp.close()
	return data

# Request options and the result accumulator for the OCR pass.
# NOTE(review): presumably consumed by the page loop that follows - confirm.
options = {
	"language_type": "CHN_ENG",
	"detect_direction": "false",
	"detect_language": "false",
	"probability": "false",
}
allteststr = list()
for page_num in page_range:
Beispiel #14
0
        dir, fname = os.path.split(_pdf_path)
        base, ext = os.path.splitext(fname)
        out_base, _ = os.path.splitext(_pdf_path)

        # convert the pdf file to the jpg images
        command = 'pdftoppm %s %s -jpeg' % (_pdf_path.replace(
            ' ', '\ '), out_base.replace(' ', '\ '))
        os.system(command)

        paths = []
        # convert the jpg files to the list of cv image
        for f in os.listdir(dir):
            path = os.path.join(dir, f)
            if os.path.exists(path) and f.find(
                    base) != -1 and os.path.splitext(f)[1].find('jpg') != -1:
                paths.append(path)

        return paths


# Script entry: read one PDF into a PythonMagick image at density '300'.
# NOTE(review): `pdf_dir` and `pdf` are not defined in the visible part of
# this snippet - presumably set elsewhere; confirm.
if __name__ == '__main__':
    import os, PythonMagick
    from PythonMagick import Image
    from datetime import datetime

    # Background colour value; not referenced again in the lines shown here.
    bg_colour = "#ffffff"
    input_pdf = pdf_dir + "\\" + pdf
    img = Image()
    # Density is configured before read().
    img.density('300')
    img.read(input_pdf)
    def tiff_to_pdf(list_of_file_paths, destination_directory):
        """
        This method converts individual TIFF image files to PDF.

        Every page of a TIFF is split out, converted to PNG and then to a
        single-page PDF inside a scratch directory ``temp_dir`` under the
        current working directory. Multi-page results are combined with
        ``ImageOpr.merge_pdf``; a single page is moved to the destination
        directly. The scratch directory is removed afterwards, also when a
        conversion fails.

        :param list_of_file_paths: This argument is the list of absolute file paths for example
        ['C:/User/Documents/Images/Image1.tiff', 'C:/User/Documents/Images/Image2.tiff' ]
        The caller's list is not modified.
        :param destination_directory: Pass in the absolute path to the directory you want the
        converted files to be saved to.
        :return: This method returns a dictionary giving information about the success or failure
        of the operation: ``{"code": 0, "invalid_file_paths": "None"}`` when every path exists,
        otherwise ``{"code": 1, "invalid_file_paths": "<a>,<b>,..."}`` listing the missing paths.
        """
        # Partition into fresh lists. Removing from the list while iterating
        # it skips the element after each removal.
        valid_file_paths = []
        invalid_file_paths = []
        for file_path in list_of_file_paths:
            if os.path.exists(file_path):
                valid_file_paths.append(file_path)
            else:
                invalid_file_paths.append(file_path)

        path_to_temp = os.path.join(os.getcwd(), "temp_dir")
        if not os.path.exists(path_to_temp):
            os.mkdir(path_to_temp)

        try:
            for tif_path in valid_file_paths:
                img = Image.open(tif_path)
                tif_file_name, _tif_file_ext = os.path.splitext(os.path.basename(tif_path))

                # Split the TIFF into one file per page. seek() raises
                # EOFError past the last page, so no page limit is needed.
                temp_file_list = []
                page_index = 0
                while True:
                    try:
                        img.seek(page_index)
                        page_path = os.path.join(path_to_temp, 'page' + str(page_index + 1) + ".tif")
                        img.save(page_path)
                    except EOFError:
                        break
                    temp_file_list.append(page_path)
                    page_index += 1

                # Page TIFF -> PNG.
                png_temp_array = []
                for tif_img in temp_file_list:
                    img_png = Image.open(tif_img)
                    name, _ext = os.path.splitext(os.path.basename(tif_img))
                    png_path = os.path.join(path_to_temp, name + ".png")
                    png_temp_array.append(png_path)
                    img_png.save(png_path)
                    os.remove(tif_img)

                # PNG -> single-page PDF.
                pdf_temp_array = []
                for png_img in png_temp_array:
                    page_image = PyImage()
                    page_image.density("600")
                    page_image.read(png_img)
                    png_name, _png_ext = os.path.splitext(os.path.basename(png_img))
                    pdf_path = os.path.join(path_to_temp, png_name + ".pdf")
                    pdf_temp_array.append(pdf_path)
                    page_image.write(pdf_path)
                    os.remove(png_img)

                if len(pdf_temp_array) > 1:
                    ImageOpr.merge_pdf(pdf_temp_array, destination_directory, tif_file_name, delete_source=False)
                    for page_pdf in pdf_temp_array:
                        os.remove(page_pdf)
                else:
                    shutil.move(pdf_temp_array[0],
                                os.path.join(destination_directory, tif_file_name + ".pdf"))
        finally:
            shutil.rmtree(path_to_temp, ignore_errors=True)

        if not invalid_file_paths:
            return {"code": 0, "invalid_file_paths": "None"}
        # Report the rejected paths, not the converted ones.
        return {"code": 1, "invalid_file_paths": ",".join(invalid_file_paths)}
Beispiel #16
0
class Map:
    """A map document whose origin and extent are read from the page via OCR.

    The page at ``map_path`` is rasterized at ``72 * scale_factor`` density.
    Small boxes next to the map area are cropped and passed to
    ``OCR.GetDecimalDegrees`` to obtain the origin (``GetX``/``GetY``) and
    the extent (``GetWidth``/``GetHeight``). Values are cached on the
    instance; a cached value of zero means "not read yet".

    NOTE(review): ``WIDTH``, ``HEIGHT`` and the ``MARGIN_*`` attributes used
    by ``_GetMapGeometry`` are not defined in this class - presumably
    provided by subclasses; confirm.
    """

    def __init__(self, map_path):
        """Remember ``map_path`` and try to read the coordinates once."""
        self._map_path = map_path
        self._map_image = None

        self._ocr = OCR()

        self._x = 0.0
        self._y = 0.0
        self._width = 0.0
        self._height = 0.0

        # The OCR gets a bit buggy for scale factors
        # smaller than 3 and bigger than 5.
        self._scale_factor = 5

        self._RefreshCoordinates()

    def IsValid(self):
        """Return True when the cached coordinates look plausible.

        Width must lie in [0.001, 0.04], height in [-0.04, -0.001], no value
        may be zero, and the per-pixel ratios of width and height must agree
        within 0.0001.
        """
        # Compare by value: the cached numbers are floats, and an identity
        # test against the int literal 0 is never true for 0.0.
        if self._x == 0 or self._y == 0:
            return False

        if self._width == 0 or self._height == 0:
            return False

        if self._width < 0.001 or self._width > 0.04:
            return False

        if self._height > -0.001 or self._height < -0.04:
            return False

        map_geometry = self._GetMapGeometry()

        height_pixel_ratio = self.GetHeight() / map_geometry.height()
        width_pixel_ratio = self.GetWidth() / map_geometry.width()

        # The ratio should not be very different from each other, otherwise
        # we OCR'ed one of the coordinates wrong. The difference is taken as
        # an absolute value so a deviation in either direction is rejected.
        if abs(abs(height_pixel_ratio) - abs(width_pixel_ratio)) > 0.0001:
            return False

        return True

    def SetScaleFactor(self, factor):
        """Change the scale factor, re-render if needed and re-read coordinates."""
        # Value comparison; identity only happens to work for small ints.
        if factor == self._scale_factor:
            return

        self._scale_factor = factor

        if self._map_image:
            self._GenerateImage()

        self._RefreshCoordinates()

    def GetMapImage(self):
        """Return the page image cropped to the map area."""
        return self._CropGeometry(self._GetMapGeometry())

    def GetX(self):
        """Return the cached x origin, reading it via OCR when not cached.

        The label box sits above the map's top-left corner.
        """
        if self._x:
            return self._x

        map_geometry = self._GetMapGeometry()

        offset = _MAGIC_COORDINATE_OFFSET_ * self._scale_factor
        width = _MAGIC_COORDINATE_BBOX_WIDTH_ * self._scale_factor
        height = _MAGIC_COORDINATE_BBOX_HEIGHT_ * self._scale_factor

        coordinate_geometry = Geometry(width, height, map_geometry.xOff(), map_geometry.yOff() - offset - height)

        image = self._CropGeometry(coordinate_geometry)
        self._x = self._ocr.GetDecimalDegrees(image)

        return self._x

    def GetY(self):
        """Return the cached y origin, reading it via OCR when not cached.

        The label box sits left of the map's top-left corner; its width and
        height are swapped because the crop is rotated by 90 before OCR.
        """
        if self._y:
            return self._y

        map_geometry = self._GetMapGeometry()

        offset = _MAGIC_COORDINATE_OFFSET_ * self._scale_factor
        width = _MAGIC_COORDINATE_BBOX_HEIGHT_ * self._scale_factor
        height = _MAGIC_COORDINATE_BBOX_WIDTH_ * self._scale_factor

        coordinate_geometry = Geometry(width, height, map_geometry.xOff() - offset - width, map_geometry.yOff())

        image = self._CropGeometry(coordinate_geometry)
        image.rotate(90)
        self._y = self._ocr.GetDecimalDegrees(image)

        return self._y

    def GetWidth(self):
        """Return the cached width, reading it via OCR when not cached.

        The value is the OCR reading below the map's bottom-right corner
        minus ``GetX()``.
        """
        if self._width:
            return self._width

        map_geometry = self._GetMapGeometry()

        offset = _MAGIC_COORDINATE_OFFSET_ * self._scale_factor
        width = _MAGIC_COORDINATE_BBOX_WIDTH_ * self._scale_factor
        height = _MAGIC_COORDINATE_BBOX_HEIGHT_ * self._scale_factor

        x_offset = map_geometry.xOff() + map_geometry.width()
        y_offset = map_geometry.yOff() + map_geometry.height()

        coordinate_geometry = Geometry(width, height, x_offset - width, y_offset + offset)

        image = self._CropGeometry(coordinate_geometry)
        self._width = self._ocr.GetDecimalDegrees(image) - self.GetX()

        return self._width

    def GetHeight(self):
        """Return the cached height, reading it via OCR when not cached.

        The value is the OCR reading right of the map's bottom-right corner
        (crop rotated by 90 before OCR) minus ``GetY()``.
        """
        if self._height:
            return self._height

        map_geometry = self._GetMapGeometry()

        offset = _MAGIC_COORDINATE_OFFSET_ * self._scale_factor
        width = _MAGIC_COORDINATE_BBOX_HEIGHT_ * self._scale_factor
        height = _MAGIC_COORDINATE_BBOX_WIDTH_ * self._scale_factor

        x_offset = map_geometry.xOff() + map_geometry.width()
        y_offset = map_geometry.yOff() + map_geometry.height()

        coordinate_geometry = Geometry(width, height, x_offset + offset, y_offset - height)

        image = self._CropGeometry(coordinate_geometry)
        image.rotate(90)
        self._height = self._ocr.GetDecimalDegrees(image) - self.GetY()

        return self._height

    def _RefreshCoordinates(self):
        """Re-read all four coordinates unless the cached ones are valid."""
        if self.IsValid():
            return

        self._x = 0.0
        self._y = 0.0
        self._width = 0.0
        self._height = 0.0

        # GetWidth/GetHeight pull in GetX/GetY as part of their computation.
        self.GetWidth()
        self.GetHeight()

    def _CropGeometry(self, geometry):
        """Return a copy of the page image cropped to ``geometry``."""
        if not self._map_image:
            self._GenerateImage()

        image = Image(self._map_image)
        image.crop(geometry)

        return image

    def _GetMapGeometry(self):
        """Return the map area (page minus margins), scaled by the scale factor."""
        width = self.WIDTH - self.MARGIN_LEFT - self.MARGIN_RIGHT
        height = self.HEIGHT - self.MARGIN_TOP - self.MARGIN_BOTTOM

        width *= self._scale_factor
        height *= self._scale_factor
        margin_left = self.MARGIN_LEFT * self._scale_factor
        margin_top = self.MARGIN_TOP * self._scale_factor

        return Geometry(width, height, margin_left, margin_top)

    def _GenerateImage(self):
        """Rasterize ``map_path`` at 72 times the scale factor."""
        scaled_density = 72 * self._scale_factor

        self._map_image = Image()
        # Density is configured before read() so it applies to the render.
        self._map_image.density("%dx%d" % (scaled_density, scaled_density))
        self._map_image.read(self._map_path)
Beispiel #17
0
import os
import ghostscript
from PyPDF2 import PdfFileReader, PdfFileWriter
from tempfile import NamedTemporaryFile
from PythonMagick import Image

# Split the source PDF into single-page temporary PDFs and write each page
# out as "some_<page>.jpg" in the working directory.
# The `with` block closes the source file when the loop is done; the former
# trailing reader.close() call is gone because PdfFileReader does not
# document a close() method.
with open('C:\\Users\\user\\Desktop\\fp\\1.pdf', "rb") as pdf_file:
    reader = PdfFileReader(pdf_file)
    for page_num in range(reader.getNumPages()):
        writer = PdfFileWriter()
        writer.addPage(reader.getPage(page_num))
        temp = NamedTemporaryFile(prefix=str(page_num),
                                  suffix=".pdf",
                                  delete=False)
        tempname = temp.name
        try:
            writer.write(temp)

            print(tempname)

            temp.close()

            # NOTE(review): the image is read by the constructor, so the
            # density set afterwards presumably does not affect how the page
            # is rasterized. Kept as-is: applying "3000" before the read
            # would produce extremely large images - confirm the intent.
            im = Image(tempname)
            im.density("3000")
            im.write("some_%d.jpg" % (page_num))
        finally:
            # delete=False means the temp file must be removed by hand,
            # also when rendering fails.
            temp.close()
            os.remove(tempname)
Beispiel #18
0
def _detect_text(image_path):
    """POST the image at ``image_path`` to ``url`` and return the detected text.

    The image is sent base64-encoded in a ``TEXT_DETECTION`` request; the
    ``description`` of the first text annotation of the first response is
    returned.
    """
    with io.open(image_path, 'rb') as image_file:
        # Decode to text: raw bytes are not JSON serializable on Python 3.
        content = base64.b64encode(image_file.read()).decode('ascii')
    payload = {
        "requests": [
            {
                "image": {
                    "content": content
                },
                "features": [
                    {
                        "type": "TEXT_DETECTION"
                    }
                ]
            }
        ]
    }
    response = json.loads(requests.post(url, data=json.dumps(payload)).text)
    return response['responses'][0]['textAnnotations'][0]['description']


def upload():
    """Run text detection on an uploaded PDF or image and return JSON.

    * ``.pdf``: every page is rendered to a JPEG under ``./images/<name>``
      and sent to the OCR endpoint; the result maps ``"page_no-<file>"`` to
      the detected text, plus ``"file_name"``.
    * ``.png`` / ``.jpg`` / ``.jpeg``: the image itself is sent; the result
      holds ``"file_name"`` and ``"text"``.
    * any other extension: ``{'file format is wrong': <file name>}``.

    The uploaded file and the rendered pages are deleted again, also when
    processing fails. Requests that are not POST get an error object.
    """
    if request.method != 'POST':
        # There is no upload to report on for other methods; the previous
        # fall-through referenced a file name that was never assigned.
        return json.dumps({'error': 'only POST requests are supported'})

    uploaded = request.files['file']
    # The client controls the file name: keep only its last path component
    # so it cannot point outside the upload folder.
    f_name = os.path.basename(uploaded.filename.replace('\\', '/'))
    name, extension = os.path.splitext(f_name)

    if extension == ".pdf":
        # Read back from exactly the path the upload was saved to.
        upload_path = os.path.join(basedir, UPLOAD_FOLDER, f_name)
        uploaded.save(upload_path)
        image_dir = os.path.join('./images', name)
        result = {"file_name": f_name}
        try:
            os.makedirs(image_dir)
            with open(upload_path, "rb") as pdf_file:
                reader = PdfFileReader(pdf_file)
                for page_num in range(reader.getNumPages()):
                    writer = PdfFileWriter()
                    writer.addPage(reader.getPage(page_num))
                    temp = NamedTemporaryFile(prefix=str(page_num), suffix=".pdf", delete=False)
                    try:
                        writer.write(temp)
                        temp.close()
                        im = Image()
                        im.density("300")  # DPI, for better quality
                        im.read(temp.name)
                        im.write("images/%s/ %d.jpg" % (str(name), page_num))
                    finally:
                        temp.close()
                        os.remove(temp.name)

            for page_file in os.listdir(image_dir):
                page_path = os.path.join(image_dir, page_file)
                result["page_no-" + str(page_file)] = _detect_text(page_path)
        finally:
            os.remove(upload_path)
            shutil.rmtree(image_dir, ignore_errors=True)
        return json.dumps(result)

    if extension in ('.png', '.jpg', '.jpeg'):
        upload_path = os.path.join(basedir, UPLOAD_FOLDER, f_name)
        uploaded.save(upload_path)
        try:
            text = _detect_text(upload_path)
        finally:
            os.remove(upload_path)
        return json.dumps({"file_name": f_name, "text": text})

    return json.dumps({'file format is wrong': f_name})
Beispiel #19
0
import os
import shutil
import sys
from tempfile import NamedTemporaryFile

from pyPdf import PdfFileReader, PdfFileWriter
from PythonMagick import Image

# Copy /opt/<name> to /tmp/test2/ocr.pdf, then render every page of it to
# /tmp/test2/some_<page>.jpg.
pdfname = sys.argv[1]

# Copy through shutil rather than a shell command: the name comes straight
# from the command line, and a failed copy now raises instead of being
# silently ignored.
shutil.copy("/opt/%s" % (pdfname), "/tmp/test2/ocr.pdf")

with open("/tmp/test2/ocr.pdf", "rb") as pdf_file:
    reader = PdfFileReader(pdf_file)
    for page_num in range(reader.getNumPages()):
        writer = PdfFileWriter()
        writer.addPage(reader.getPage(page_num))
        temp = NamedTemporaryFile(prefix=str(page_num), suffix=".pdf", delete=False)
        try:
            writer.write(temp)
            temp.close()

            im = Image()
            im.density("300")  # DPI, for better quality
            im.read(temp.name)
            im.write("/tmp/test2/some_%d.jpg" % (page_num))
        finally:
            # delete=False means the temp file must be removed by hand,
            # also when rendering fails.
            temp.close()
            os.remove(temp.name)