コード例 #1
0
def pdfToTxt(filename):
    """OCR ``data/<name>.pdf`` and write the text to ``data/text/<name>.txt``.

    Args:
        filename: Name of the PDF inside ``data/``. The last four characters
            (expected to be the ``.pdf`` extension) are stripped to obtain
            the base name shared by the input and the output file.
    """
    print('Extracting {}'.format(filename))
    print(
        'WARNING: Corrupted text on file may retrive errors during extraction')

    # Drop the 4-character extension; the base name is reused for the output.
    filename = filename[0:-4]

    # Render every page to a JPEG blob. Wand images wrap native ImageMagick
    # resources, so they are released deterministically with ``with``.
    imageBlobs = []
    with wi(filename='data/{}.pdf'.format(filename),
            resolution=300) as pdfFile:
        with pdfFile.convert('jpeg') as image:
            for img in image.sequence:
                with wi(image=img) as imgPage:
                    imageBlobs.append(imgPage.make_blob('jpeg'))

    # Recognise each page and join once instead of concatenating in a loop.
    extract = ''.join(
        pytesseract.image_to_string(Image.open(io.BytesIO(imgBlob)),
                                    lang='eng')
        for imgBlob in imageBlobs)

    # The context manager closes the file even when the write fails.
    with open('data/text/{}.txt'.format(filename), 'w') as out_file:
        out_file.write(extract)
コード例 #2
0
def get_text(pdf_location, res=120, page=None):
    """Return the cleaned OCR text of a PDF as a list with one entry per page.

    Only the part of ``pdf_location`` after the last backslash is used, so
    the file is looked up relative to the current working directory.

    Args:
        pdf_location: Path of the PDF; anything up to the last backslash
            is discarded.
        res: Resolution handed to Wand when rendering.
        page: Optional page selector; when given, ``[<page>]`` is appended
            to the file name that Wand opens.

    Returns:
        List of the ``_clean``-ed OCR strings of the rendered pages.
    """
    import io
    from PIL import Image
    import pytesseract
    from wand.image import Image as wi
    from clean import _clean
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'

    # rpartition returns the whole string when there is no backslash.
    base_name = pdf_location.rpartition("\\")[2]
    target = base_name if page is None else base_name + "[" + str(page) + "]"

    document = wi(filename=target, resolution=res)
    rendered = document.convert('jpeg')

    def _recognise(frame):
        # One page: Wand frame -> JPEG bytes -> PIL image -> cleaned text.
        blob = wi(image=frame).make_blob('jpeg')
        picture = Image.open(io.BytesIO(blob))
        raw = pytesseract.image_to_string(picture, lang='eng', config='--psm 6')
        return _clean(raw)

    return [_recognise(frame) for frame in rendered.sequence]
コード例 #3
0
def pdfocr(path, pages=None, lang='eng'):
    """OCR a PDF and return the recognised text of the selected pages.

    Args:
        path: Path to the PDF file. An empty (or ``None``) path prints a
            message and returns ``None``.
        pages: Optional collection of 1-based page numbers to recognise.
            ``None`` or an empty collection selects every page.
        lang: Tesseract language code.

    Returns:
        A list with one string per recognised page, or ``None`` when
        ``path`` is empty.
    """
    if not path:
        print('Path is empty')
        return None

    pytesseract.pytesseract.tesseract_cmd = ConfigOcr.path_to_tesseract

    # Render the requested pages to JPEG blobs, releasing the native
    # ImageMagick resources as soon as each blob has been produced.
    imageBlobs = []
    with wi(filename=path, resolution=300) as pdf:
        with pdf.convert('jpeg') as pdfImage:
            for number, img in enumerate(pdfImage.sequence, start=1):
                if (not pages) or number in pages:
                    with wi(image=img) as imgPage:
                        imageBlobs.append(imgPage.make_blob('jpeg'))

    return [
        pytesseract.image_to_string(Image.open(io.BytesIO(imgBlob)), lang=lang)
        for imgBlob in imageBlobs
    ]
コード例 #4
0
def is_crypto(_filename):
    """Return True when OCR finds "crypto" or "second" on any page of the PDF.

    All pages are rendered (resolution 140) and recognised before the
    keyword search runs. The verdict is also printed.
    """
    document = wi(filename=_filename, resolution=140)
    rendered = document.convert('jpeg')

    page_blobs = [
        wi(image=frame).make_blob('jpeg') for frame in rendered.sequence
    ]
    page_texts = [
        pytesseract.image_to_string(Image.open(io.BytesIO(blob)), lang='eng')
        for blob in page_blobs
    ]

    needles = ("crypto", "second")
    found = any(needle in page_text
                for page_text in page_texts
                for needle in needles)
    if found:
        print("CRYPTO STATEMENT!!!")
        return True

    print("PRAESCIRE STATEMENT")
    return False
コード例 #5
0
def generate_text(pdf_location, res=300):
    """Yield the raw OCR text of a PDF, one string per rendered page.

    Only the part of ``pdf_location`` after the last backslash is used, so
    the file is looked up relative to the current working directory.

    Args:
        pdf_location: Path of the PDF; anything up to the last backslash
            is discarded.
        res: Resolution handed to Wand when rendering.
    """
    import io
    from PIL import Image
    import pytesseract
    from wand.image import Image as wi
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'

    # rpartition returns the whole string when there is no backslash.
    base_name = pdf_location.rpartition("\\")[2]

    document = wi(filename=base_name, resolution=res)
    rendered = document.convert('jpeg')
    for frame in rendered.sequence:
        blob = wi(image=frame).make_blob('jpeg')
        picture = Image.open(io.BytesIO(blob))
        yield pytesseract.image_to_string(picture,
                                          lang='eng',
                                          config='--psm 6')
コード例 #6
0
def is_crypto(_filename):
    """Tell whether the PDF is a crypto statement.

    All pages are rendered (resolution 100) and recognised with Tesseract;
    the file counts as a crypto statement when the word "crypto" appears
    in the text of any page. The verdict is also printed.

    :param _filename: path of the PDF to inspect
    :return: True if the keyword was found, otherwise False
    """
    document = wi(filename=_filename, resolution=100)
    rendered = document.convert('jpeg')

    page_texts = []
    for blob in [wi(image=frame).make_blob('jpeg')
                 for frame in rendered.sequence]:
        picture = Image.open(io.BytesIO(blob))
        page_texts.append(pytesseract.image_to_string(picture, lang='eng'))

    if any("crypto" in page_text for page_text in page_texts):
        print("WE FOUND IT EXITING")
        print("CRYPTO STATEMENT!!!")
        return True

    print("PRAESCIRE STATEMENT")
    return False
コード例 #7
0
def pdf_to_image(filepath):
	"""Upload an encrypted copy of the file to IPFS and split a PDF into PNGs.

	The file is encrypted, the encrypted copy is added to the local IPFS
	daemon (127.0.0.1:5001) and then deleted. If ``filepath`` ends in
	``.pdf`` every page is saved as ``"<name> page <n>.png"`` next to it.

	Args:
		filepath: Path of the file to upload and, if it is a PDF, to split.

	Returns:
		``(paths, filehash)``: ``paths`` is the list of PNG file paths for
		a PDF, or ``filepath`` itself for any other extension; ``filehash``
		is the IPFS hash, or ``" "`` when the daemon could not be reached.
	"""
	# NOTE(review): the passphrase is hard-coded here; confirm whether it
	# should come from configuration instead.
	encrypted = encrypt(getKey("abc123"), filepath)
	try:
		api = ipfsapi.connect('127.0.0.1', 5001)
		new_file = api.add(encrypted)
		filehash = new_file['Hash']
	except ipfsapi.exceptions.ConnectionError:
		print("Check if Dameon is switched on")
		filehash = " "
	finally:
		# Always delete the encrypted copy, even if the upload fails with
		# something other than a connection error.
		os.remove(encrypted)

	file_name, extension = os.path.splitext(filepath)
	if extension != ".pdf":
		print("not pdf")
		return filepath, filehash

	imagelist = []
	with wi(filename=filepath, resolution=400) as source:
		for page_number, frame in enumerate(source.sequence, start=1):
			page_path = f"{file_name} page {page_number}.png"
			# Release each page image as soon as it has been written.
			with wi(image=frame) as page:
				page.save(filename=page_path)
			imagelist.append(page_path)
	return imagelist, filehash
コード例 #8
0
ファイル: tess.py プロジェクト: swarnimshukla/Receipt-Parser
def ocr(f_path):
    """OCR a PDF and write the text of its first page to a fixed text file.

    The result is written to ``receipt/media/txt/output1.txt`` (relative to
    the current working directory), replacing any previous content.

    Args:
        f_path: Path of the PDF. Its first character is dropped before the
            rest is joined onto the current working directory
            (presumably a leading slash - TODO confirm against callers).

    Raises:
        IndexError: If the document yields no pages.
    """
    # Joining onto the cwd once is enough: the result is already absolute.
    new_path = os.path.join(os.getcwd(), f_path[1:])

    imageBlob = []
    with wi(filename=new_path, resolution=300) as pdf:
        with pdf.convert('jpg') as pdfImage:
            for img in pdfImage.sequence:
                with wi(image=img) as imgPage:
                    imageBlob.append(imgPage.make_blob('jpeg'))

    recognized_text = [
        pytesseract.image_to_string(Image.open(io.BytesIO(blob)), lang='eng')
        for blob in imageBlob
    ]

    # Write to the file directly instead of temporarily rebinding
    # sys.stdout, which stayed redirected (and the file open) on any error.
    with open('receipt/media/txt/output1.txt', 'w+') as f:
        f.write(recognized_text[0] + '\n')
コード例 #9
0
ファイル: opencv_wand.py プロジェクト: lizy331/ocr_extract
def pdfs2txts(file_path, folderName):
    """OCR every file of a folder into ``Convert output/<name>.txt``.

    Each file in ``<file_path>/<folderName>`` is rendered page by page; the
    pages are saved as ``<n>.jpg`` in the current working directory, read
    back with OpenCV and recognised with Tesseract.

    Args:
        file_path: Parent directory of the folder to process.
        folderName: Name of the folder that holds the PDFs.
    """
    source_dir = os.path.join(file_path, folderName)
    reports = os.listdir(source_dir)
    create_dir('Convert output')
    for elem in tqdm(reports):
        # Build the path from the directory that was actually listed; using
        # only ``folderName`` broke whenever ``file_path`` was not the cwd.
        complete_name = os.path.join(source_dir, elem)
        print(complete_name)
        page_texts = []
        with wi(filename=complete_name,
                resolution=750,
                depth=8,
                height=50,
                background='white') as pdf:
            with pdf.convert('jpg') as pdfimage:
                for page_number, img in enumerate(pdfimage.sequence, start=1):
                    page_file = str(page_number) + '.jpg'
                    with wi(image=img) as page:
                        page.save(filename=page_file)
                    img_cv = cv2.imread(page_file)
                    # OpenCV loads images as BGR while pytesseract expects
                    # RGB, so convert before recognising.
                    img_rgb = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
                    page_texts.append(pytesseract.image_to_string(img_rgb))

        # Strip only a trailing ".pdf"; str.replace would also remove
        # occurrences in the middle of the name.
        base_name = elem[:-len('.pdf')] if elem.endswith('.pdf') else elem
        with open(os.path.join('Convert output', base_name) + '.txt',
                  'w',
                  encoding='utf-8') as f:
            f.write(''.join(page_texts))
コード例 #10
0
def convert(way):
    """Render every page of every PDF under ``way`` to a numbered JPEG.

    Images are written to ``F:\\python\\open cv\\jpegs`` as ``conv_<n>.jpg``.
    Nothing is done when that directory already exists. Every non-PDF file
    met during the walk triggers an error message box.

    Args:
        way: Root directory that is walked recursively for ``.pdf`` files.
    """
    # Raw strings: the original literals relied on invalid escape sequences
    # (``\p``, ``\o``, ``\j``, ``\c``), which newer Pythons warn about.
    out_dir = r'F:\python\open cv\jpegs'
    if os.path.exists(out_dir):
        print("File already exists")
        return

    os.mkdir(out_dir)
    i = 1
    for root, _dirs, files in os.walk(way):
        for name in files:
            if not name.endswith(".pdf"):
                messagebox.showerror("error",
                                     "{} is not PDF file".format(name))
                continue
            path = os.path.join(root, name)
            with wi(filename=path, resolution=400) as pdf:
                with pdf.convert("jpeg") as pdfimage:
                    for imgs in pdfimage.sequence:
                        with wi(image=imgs) as page:
                            page.save(
                                filename=r'F:\python\open cv\jpegs\conv_{}.jpg'.
                                format(i))
                        # Advance per page: incrementing once per PDF made
                        # every page of a document overwrite the same file.
                        i += 1
            print("***proccessed****")

    print("=====converted=======")
コード例 #11
0
def conv_pdf(pdf):
    """Return the text of ``./pdfs/<pdf>``, using OCR when there is little text.

    The text is first extracted with ``convert_pdf_to_txt``. When the
    stripped result is 50 characters or shorter, the pages are rendered
    (resolution 200) and recognised with Tesseract instead.

    Args:
        pdf: File name of the PDF inside ``./pdfs/``.

    Returns:
        The extracted text, or the OCR text of all pages joined by three
        newlines when the fallback was used.
    """
    pdf_path = './pdfs/' + pdf

    with open(pdf_path, 'rb') as f:
        pdf_text = convert_pdf_to_txt(io.BufferedReader(f))

    if len(pdf_text.strip()) > 50:
        return pdf_text

    # OCR fallback. Open the same file that was just read: the original
    # passed the bare name, which points outside ./pdfs/.
    imageBlobs = []
    with wi(filename=pdf_path, resolution=200) as pdf_file:
        with pdf_file.convert('jpeg') as pdfImage:
            for img in pdfImage.sequence:
                with wi(image=img) as imgPage:
                    imageBlobs.append(imgPage.make_blob('jpeg'))

    return '\n\n\n'.join(
        pytesseract.image_to_string(Image.open(io.BytesIO(imgBlob)),
                                    lang='eng')
        for imgBlob in imageBlobs)
コード例 #12
0
ファイル: ocr.py プロジェクト: srivyshnavks/webocr
def pdfOcrTel(file_name):
	"""OCR a PDF with Tesseract's ``tel`` (Telugu) language data.

	The recognised text is also saved to ``extracted.txt`` in the current
	working directory.

	Args:
		file_name: Path of the PDF to recognise.

	Returns:
		The text of all pages, each page followed by a newline.
	"""
	imageBlobs = []
	with wi(filename=file_name, resolution=300) as pdf:
		with pdf.convert('jpeg') as pdfImage:
			for img in pdfImage.sequence:
				with wi(image=img) as imgPage:
					imageBlobs.append(imgPage.make_blob('jpeg'))

	text = ''.join(
		"%s\n" % pytesseract.image_to_string(
			Image.open(io.BytesIO(imgBlob)), lang='tel')
		for imgBlob in imageBlobs)

	# Write as UTF-8 explicitly: the platform default encoding may not be
	# able to represent Telugu characters. The text is returned directly
	# instead of being read back from the file that was just written.
	with open('extracted.txt', 'w', encoding='utf-8') as fin:
		fin.write(text)

	return text
コード例 #13
0
ファイル: Scan_OCR.py プロジェクト: priyansh8/PDF-extraction
def Get_text_from_image():
    """Extract the text of a hard-coded scanned PDF with OCR.

    Returns:
        The recognised text of all pages concatenated, with every newline
        (and every literal backslash-n sequence) replaced by a space.
    """
    import io
    import pytesseract
    from PIL import Image
    from wand.image import Image as wi

    pytesseract.pytesseract.tesseract_cmd = r'C:\Users\user\AppData\Local\Tesseract-OCR\tesseract.exe'

    # ``with`` releases the ImageMagick resources deterministically; this
    # replaces the repeated gc.collect() calls and the ``finally`` clause
    # that raised NameError when the document had no pages.
    imgBlobs = []
    with wi(filename='C:\\Users\\user\\Desktop\\Labs\\INE033L07EO0.pdf',
            resolution=300) as pdf:
        with pdf.convert('jpeg') as pdfImg:
            for img in pdfImg.sequence:
                with wi(image=img) as page:
                    imgBlobs.append(page.make_blob('jpeg'))

    extracted_text = []
    for imgBlob in imgBlobs:
        im = Image.open(io.BytesIO(imgBlob))
        text = pytesseract.image_to_string(im, lang='eng')
        # The raw-string replace targets a literal backslash followed by
        # "n"; it is kept so the returned text stays unchanged.
        text = text.replace(r"\n", " ").replace("\n", " ")
        extracted_text.append(text)

    return ''.join(extracted_text)
コード例 #14
0
ファイル: file_chooser.py プロジェクト: Faisalsouz/PAN_OCR
def get_pdf(file_path):
    '''Split the pages of a PDF into images.

    input  = PDF file path (str) or the PDF content as bytes
    output = list of Wand images in 'jpg' format, one per page
    '''
    # A str is treated as a path; anything else is handed to Wand as the
    # raw document bytes, which is all the original accepted.
    if isinstance(file_path, str):
        pdf = wi(filename=file_path, resolution=600)
    else:
        pdf = wi(blob=file_path, resolution=600)

    pdf_pages = []
    with pdf:
        with pdf.convert("jpg") as pdfimage:
            print(pdfimage)
            for img in pdfimage.sequence:  # one frame per page of the document
                # convert() returns a new image, so the intermediate page
                # can be released right away.
                with wi(image=img) as page:
                    pdf_pages.append(page.convert('jpg'))
    return pdf_pages
コード例 #15
0
ファイル: extract.py プロジェクト: Airypy/ocr-pdf-to-text
def answers_extr(page_name, ans_count):
    """Append the answers found in a scanned page to ``Answers.txt``.

    The PDF is rendered at resolution 300 and each page is recognised with
    Tesseract. A line containing "Correct Option" or "Correct Answer"
    starts a new numbered answer; every accepted line is appended with its
    leading numbering and commas removed.

    Args:
        page_name: Path of the PDF to read.
        ans_count: Number of answers written so far.

    Returns:
        The updated answer count.
    """
    imageBlobs = []
    with wi(filename=page_name, resolution=300) as pdf:
        with pdf.convert('jpeg') as pdfImage:
            for img in pdfImage.sequence:
                with wi(image=img) as imgPage:
                    imageBlobs.append(imgPage.make_blob('jpeg'))

    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'

    # Accepts a line that contains none of the listed words and whose last
    # character is not one of S, C, O, R, E. Compiled once, not per page.
    ans = re.compile('(?!.*(Time|TARGET|ANSWER|DAY)).*[^SCORE]$')
    # Matches "1." / "12." style numbering, or a comma. The original
    # character class ``[\d{1}|\d{2}]`` matched a single character only, so
    # "12." was reduced to "1" instead of being removed.
    numbering = re.compile(r'\d{1,2}\.|,')

    with open('Answers.txt', 'a') as a:
        for imgBlob in imageBlobs:
            im = Image.open(io.BytesIO(imgBlob))
            text = pytesseract.image_to_string(im, lang='eng')
            for line in text.split('\n'):
                # "Correct Option" also covers the "Correct Option:" spelling.
                if "Correct Option" in line or "Correct Answer" in line:
                    ans_count += 1
                    a.write('\n' +
                            "For Question {}.---------------".format(ans_count))
                if ans.match(line):
                    a.write('\n' + numbering.sub('', line))

    return ans_count
コード例 #16
0
def pdf_ocr(path, pages=None, lang='eng'):
    """Return the text of a PDF file obtained by optical character recognition.

    Args:
        path: Path to the PDF file. An empty (or ``None``) path prints a
            message and returns ``None``.
        pages: Optional collection of 1-based page numbers to recognise.
            ``None`` or an empty collection selects every page.
        lang: Tesseract language code.

    Returns:
        The text of the recognised pages joined by single spaces, or
        ``None`` when ``path`` is empty.
    """
    if not path:
        print('Path is empty')
        return None

    pytesseract.pytesseract.tesseract_cmd = tesseract_path

    # Render the requested pages to JPEG blobs, releasing the native
    # ImageMagick resources as soon as each blob has been produced.
    image_blobs = []
    with wi(filename=path, resolution=300) as pdf:
        with pdf.convert('jpeg') as pdf_image:
            for number, img in enumerate(pdf_image.sequence, start=1):
                if (not pages) or number in pages:
                    with wi(image=img) as img_page:
                        image_blobs.append(img_page.make_blob('jpeg'))

    return ' '.join(
        pytesseract.image_to_string(Image.open(io.BytesIO(blob)), lang=lang)
        for blob in image_blobs)
コード例 #17
0
ファイル: extract.py プロジェクト: Airypy/ocr-pdf-to-text
def questions_extr(page_name, count):
    """Append the questions and options found in a scanned page to text files.

    The PDF is rendered at resolution 500 and each page is recognised with
    Tesseract. Lines that start an option group, together with the lines
    that follow them, go to ``Options.txt``; other accepted lines go to
    ``Questions.txt``. Both files are opened in append mode.

    Args:
        page_name: Path of the PDF to read.
        count: Number of questions written so far; incremented once for
            every option group that is found.

    Returns:
        The updated question count.
    """
    imageBlobs = []
    with wi(filename=page_name, resolution=500) as pdf:
        with pdf.convert('jpeg') as pdfImage:
            for img in pdfImage.sequence:
                with wi(image=img) as imgPage:
                    imageBlobs.append(imgPage.make_blob('jpeg'))

    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'

    # The patterns do not depend on the page, so compile them once. They
    # are byte-identical to the originals, only written as raw strings.
    # Pattern for question lines.
    quest = re.compile(
        r'(?!.*(\(|Time|TARGET|TEST|DAY|Maximum))[0-9]?[a-zA-z]?.*[^SCORE]$')
    # Pattern for option lines worth keeping.
    non_waste = re.compile(
        r'(?!.*(Time|TARGET|TEST|DAY|Maximum|\d{2}\.)).*[^SCORE]$')
    # Pattern for the first line of an option group: "(a)", "(b)" or "(c)".
    options = re.compile('[(][a-c][)].*')
    # Two-digit numbering such as "12." and commas, removed from questions.
    numbering = re.compile(r'\d{2}\.|,')

    with open('Questions.txt', 'a') as q, open('Options.txt', 'a') as o:
        for imgBlob in imageBlobs:
            im = Image.open(io.BytesIO(imgBlob))
            text = pytesseract.image_to_string(im, lang='eng')
            lines = text.split('\n')
            length = len(lines)
            i = 0
            while i < length:
                # Skip blank lines and short lines that start with a digit.
                if lines[i] == '' or len(
                        lines[i]) <= 3 and lines[i][0].isdigit():
                    i += 1
                    continue
                if options.match(lines[i]):
                    # Copy lines until one whose second character is 'd'
                    # (the "(d)" option). Slicing instead of indexing
                    # avoids IndexError on a one-character line.
                    while i < length and lines[i][1:2] != 'd':
                        if non_waste.match(lines[i]):
                            o.write('\n' + lines[i])
                        i += 1
                        while i < length and lines[i] == '':
                            i += 1
                    # The bounds check avoids IndexError when the option
                    # group runs to the end of the page.
                    if i < length and len(lines[i]) == 3:
                        o.write('\n' + lines[i])
                        i += 1
                        while i < length and lines[i] == '':
                            i += 1
                    # Copy the rest of the group up to the next blank line.
                    while i < length and lines[i] != '':
                        o.write('\n' + lines[i])
                        i += 1
                    count += 1
                    o.write('\n' + 'Options {}.------------'.format(count))
                    q.write('\n' +
                            "Question {}.----------------".format(count) +
                            '\n')
                    continue
                if quest.match(lines[i]):
                    q.write(' ' + numbering.sub('', lines[i]))
                i += 1

    return count
コード例 #18
0
def pdfToImages(filepath):
    """Save each page of a PDF as ``<i>.jpg`` in the working directory.

    The file number comes from the module-level counter ``i``, which is
    advanced by one for every page written.

    Args:
        filepath: Path of the PDF to render (resolution 300).
    """
    global i
    document = wi(filename=filepath, resolution=300)
    rendered = document.convert('jpeg')
    for frame in rendered.sequence:
        wi(image=frame).save(filename=str(i) + ".jpg")
        i += 1
コード例 #19
0
def toText(path='path to Offres Directorie'):
    """Extract the text of the PDF and DOCX files of each offer into MongoDB.

    The working directory is changed to ``path``. ``names.txt`` and
    ``references.txt`` are read from it as space-separated lists. For every
    file in every named directory the text is extracted (OCR with the
    ``fra`` language data for PDFs, paragraphs and table cells for DOCX)
    and one document is inserted into the ``offres-text`` collection.

    Args:
        path: Directory that contains the offer directories and the two
            list files.
    """
    text = ''

    # set up database
    mydb = Mongo('TTProject', ['offres-info', 'offres-text'])

    # move to the directory that contains the offers
    os.chdir(path)
    with open('names.txt') as names_file:
        dirnames = names_file.read().split(" ")
    with open('references.txt') as references_file:
        references = references_file.read().split(" ")

    i = 0
    for directory in dirnames:
        if not len(directory):
            continue
        # Drop the first and last character of the entry.
        dirname = str(directory)[1:len(directory) - 1]

        # information to store
        reference = references[i]
        name = dirname
        text += '############################  ' + \
            dirname+'   ############################\n\n\n'

        dirPath = path + '\\' + dirname
        for filename in os.listdir(os.path.join(dirPath)):
            text += '\n\n####################   ' + filename + '   ####################\n\n'
            filePath = path + '\\' + dirname + '\\' + filename

            # PDFs: render every page and recognise it. The original branch
            # rebound the name ``Image`` locally (UnboundLocalError) and
            # iterated the image itself instead of its page sequence.
            if filename.endswith('.pdf'):
                with wi(filename=os.path.join(filePath),
                        resolution=300) as pdfFile:
                    with pdfFile.convert('jpeg') as images:
                        for page in images.sequence:
                            with wi(image=page) as imgPage:
                                blob = imgPage.make_blob('jpeg')
                            text += pytesseract.image_to_string(
                                Image.open(io.BytesIO(blob)), lang='fra')

            # DOCXs
            if filename.endswith('.docx'):
                document = Document(filePath)
                tables = document.tables
                for p in document.paragraphs:
                    text += p.text
                text += "\n  ###### Tables's Content ###### \n"
                for table in tables:
                    for row in table.rows:
                        for cell in row.cells:
                            for paragraph in cell.paragraphs:
                                text += (paragraph.text)

            # inserting to mongo
            # NOTE(review): the insert and ``i += 1`` run once per file
            # while ``reference`` is read once per directory, so a
            # directory with several files reuses the same ``_id`` and
            # shifts the index for later directories - confirm the intent.
            offre = {'_id': reference, "name": name, 'text': text}
            mydb.insert(offre, 'offres-text')
            text = ''
            i += 1
コード例 #20
0
ファイル: indpdf.py プロジェクト: pozernishku/legis_rem
    def parsesave(self, response):
        """Yield one ``IndpdfItem`` describing the downloaded bill PDF.

        The text is extracted from ``response.body`` with
        ``convert_pdf_to_txt``. When the stripped result is 50 characters
        or shorter, the document is rendered from ``response.url``
        (resolution 200) and recognised with Tesseract; the page texts are
        then joined by three newlines and used instead.
        """
        url = response.url
        meta = response.meta
        bill_name = meta.get('numbname')
        session = meta.get('year')
        chamber = meta.get('chamber').capitalize()
        topic_parts = meta.get('topic')
        topic = ' '.join(topic_parts) if topic_parts else ''
        md5 = hashlib.md5(response.body).hexdigest()

        # Text embedded in the PDF itself.
        reader = io.BufferedReader(io.BytesIO(response.body))
        pdf_text = convert_pdf_to_txt(reader)

        # Too little embedded text: fall back to OCR of the rendered pages.
        if len(pdf_text.strip()) <= 50:
            page_blobs = []
            with wi(filename=response.url, resolution=200) as pdf:
                rendered = pdf.convert('jpeg')
                for frame in rendered.sequence:
                    with wi(image=frame) as page:
                        page_blobs.append(page.make_blob('jpeg'))

            pdf_text = '\n\n\n'.join([
                pytesseract.image_to_string(Image.open(io.BytesIO(blob)),
                                            lang='eng')
                for blob in page_blobs
            ])

        yield IndpdfItem(url=url,
                         state='indiana',
                         html='',
                         text=pdf_text,
                         bill_name=bill_name,
                         session=session,
                         chamber=chamber,
                         topic=topic,
                         date='#TODO',
                         md5=md5)
コード例 #21
0
ファイル: convertir.py プロジェクト: Hyppox/proyecto_final
    def convertir(self):
        """Save each page of ``example.pdf`` as ``1.jpg``, ``2.jpg``, ...

        The document is rendered at resolution 300 and the files are
        written to the current working directory.
        """
        document = wi(filename="example.pdf", resolution=300)
        rendered = document.convert("jpeg")
        for number, frame in enumerate(rendered.sequence, start=1):
            wi(image=frame).save(filename=str(number) + ".jpg")
コード例 #22
0
def save_img_as_pdf(file_dir, filename, count, output_dir, output_filename):
    """Combine numbered JPEG files into one multi-page output file.

    The images ``<file_dir>/<filename><i>.jpg`` for ``i`` from 1 up to, but
    not including, ``count`` are appended in order and saved to
    ``<output_dir>/<output_filename>``.

    Returns:
        The path of the file that was written.
    """
    source_prefix = os.path.join(file_dir, filename)
    output_path = os.path.join(output_dir, output_filename)
    page_files = [
        source_prefix + str(number) + '.jpg' for number in range(1, count)
    ]
    with wi() as document:
        for page_file in page_files:
            with wi(filename=page_file) as page:
                document.sequence.append(page)
        document.save(filename=output_path)
    return output_path
コード例 #23
0
ファイル: pdf2image.py プロジェクト: SunuSani/project
def p2f(name):
	"""Render a PDF and run ``segment.image_processing`` on its first page.

	The first page is saved as ``1.png`` in the current working directory
	and the result of processing that file is returned. ``None`` is
	returned when the document has no pages.

	NOTE(review): the ``return`` sits inside the page loop, so later pages
	are never processed - confirm that this is intended.
	"""
	document = wi(filename=name, resolution=300)
	rendered = document.convert("png")
	page_number = 1
	for frame in rendered.sequence:
		target = str(page_number) + ".png"
		wi(image=frame).save(filename=target)
		return segment.image_processing(target)
コード例 #24
0
def pdfimgconvert():
    """Save each page of ``Offer.pdf`` as ``PDFImg/Image<n>.jpg``.

    The document is rendered at resolution 400 and the pages are numbered
    from 1.
    """
    pdffilepath = "Offer.pdf"
    with wi(filename=pdffilepath, resolution=400) as PDFfile:
        # Iterate the converted document: the original computed it and then
        # looped over the unconverted one, leaving the conversion unused.
        with PDFfile.convert('jpg') as Images:
            for ImageSequence, img in enumerate(Images.sequence, start=1):
                # ``page`` rather than ``Image`` so the PIL class name is
                # not shadowed.
                with wi(image=img) as page:
                    page.save(filename="PDFImg/Image" + str(ImageSequence) +
                              ".jpg")
コード例 #25
0
 def convertPDF(self, filename):
     """Render a PDF to JPEG blobs and initialise from the first page.

     Args:
         filename: Path of the PDF to render (resolution 300).

     Returns:
         List with one JPEG blob per page. ``self.initialize`` is called
         with the first blob as soon as it has been produced.
     """
     blobs = []
     with wi(filename=filename, resolution=300) as pdf:
         with pdf.convert('jpeg') as pdfImage:
             # enumerate gives the page position directly; looking it up
             # with sequence.index() was a linear, equality-based search
             # that could also match a later page identical to the first.
             for position, img in enumerate(pdfImage.sequence):
                 with wi(image=img) as imgPage:
                     blobs.append(imgPage.make_blob('jpeg'))
                 if position == 0:
                     self.initialize(blobs[0])
     return blobs
コード例 #26
0
def _create_to_page(j, lis, x):
    """Save each page of one ledger PDF as a numbered JPEG.

    The document ``PDF/LIBROS CONTABLES/<lis>/<x>`` is rendered at
    resolution 100 and its pages are written to
    ``Libro Contable/pages/<j>.jpg``, with ``j`` advancing per page.

    Args:
        j: Number to use for the first page written.
        lis: Sub-directory of ``PDF/LIBROS CONTABLES`` that holds the file.
        x: File name of the PDF.

    Returns:
        The number to use for the next page.
    """
    with wi(filename="PDF/LIBROS CONTABLES/" + lis + "/" + x,
            resolution=100) as pdf:
        with pdf.convert("jpg") as pdfImage:
            for img in pdfImage.sequence:
                with wi(image=img) as page:
                    page.save(filename="Libro Contable/pages/" + str(j) +
                              ".jpg")
                j += 1
                # Function-call form works on Python 2 and 3; the bare
                # print statement was a syntax error on Python 3.
                print("Imprimiendo pagina......" + str(j))

    return j
コード例 #27
0
 def openthefile(self, path, fname):
     """Save each page of a PDF as ``<path>/<name>-<i>.jpg``.

     ``<name>`` is ``fname`` without its last four characters and without
     any leading directories; ``i`` starts at 0. The page count is stored
     in ``self.pagecnt``.

     Args:
         path: Directory the JPEG files are written to.
         fname: Path of the PDF to render (resolution 300).
     """
     filename = fname[:-4]
     with wi(filename=fname, resolution=300) as pdf:
         self.pagecnt = len(pdf.sequence)
         # Iterate the converted document: the original computed it and
         # then looped over the unconverted one.
         with pdf.convert("jpeg") as pdfImage:
             for i, img in enumerate(pdfImage.sequence):
                 with wi(image=img) as page:
                     page.save(filename=path + '/' +
                               filename.split('/')[-1] + "-" + str(i) +
                               ".jpg")
コード例 #28
0
def from_pdf(file_path, start, work_dir):
    """Save the first page of a PDF as a PNG and return the file name.

    The page is written to ``<work_dir>/page-<start>.png`` with the number
    zero-padded to three digits. ``None`` is returned when the document
    has no pages.

    NOTE(review): the ``return`` sits inside the page loop, so only the
    first page is ever written - confirm that this is intended.
    """
    document = wi(filename=file_path, resolution=300)
    rendered = document.convert("png")
    print(rendered)
    for frame in rendered.sequence:
        first_page = wi(image=frame)
        target = os.path.join(work_dir, 'page-{:03}.png'.format(start))
        first_page.save(filename=target)
        return target
コード例 #29
0
ファイル: cropImg.py プロジェクト: kkamons/CropImageInPDF
def pdf2jpg(filename, source):
    """Save each page of ``<source>\\<filename>.pdf`` as a JPEG beside it.

    Pages are written as ``<source>\\<name>--<n>.jpg``, where ``<name>`` is
    the part of ``filename`` before its first dot and ``n`` starts at 1.

    Args:
        filename: PDF file name without the ``.pdf`` extension.
        source: Directory that holds the PDF and receives the images.
    """
    document = wi(filename=source + '\\' + filename + '.pdf', resolution=300)
    rendered = document.convert('jpeg')
    stem = str(filename).split('.')[0]

    for number, frame in enumerate(rendered.sequence, start=1):
        target = source + '\\' + stem + '--' + str(number) + '.jpg'
        wi(image=frame).save(filename=target)
コード例 #30
0
def pdf2image(pdfName, resolution=300, imgFormat='jpeg'):
    """Render a PDF and return one encoded image blob per page.

    Args:
        pdfName: Path of the PDF to render.
        resolution: Resolution handed to Wand when rendering.
        imgFormat: Image format used for the conversion and for the blobs.

    Returns:
        List of blobs in page order.
    """
    document = wi(filename=pdfName, resolution=resolution)
    rendered = document.convert(imgFormat)
    return [
        wi(image=frame).make_blob(imgFormat) for frame in rendered.sequence
    ]