Esempio n. 1
0
    def process(self, pdf_filename, pdf_resolution, imageformat,
                do_orientation):
        """Rasterize a PDF page by page and run OCR on every page.

        Each page is converted to 8-bit grayscale, level-adjusted, written
        to ``buffer.png`` in the current working directory (overwritten on
        every page) and handed to ``self.image2txt_pyocr`` as an encoded
        blob.

        Args:
            pdf_filename: Path of the PDF file to read.
            pdf_resolution: Resolution used when loading the PDF; also
                stored as the density of every page image.
            imageformat: Image format the PDF is converted to and the
                per-page blob is encoded in.
            do_orientation: Passed through unchanged to
                ``self.image2txt_pyocr``.

        Returns:
            The per-page OCR results, each followed by a newline,
            concatenated in page order.
        """
        page_texts = []

        # Wand images wrap native ImageMagick resources; the context
        # managers release them deterministically, even if OCR raises.
        with Image(filename=pdf_filename,
                   resolution=pdf_resolution) as image_pdf:
            with image_pdf.convert(imageformat) as image_page:
                process_start = time.time()
                for page, img in enumerate(image_page.sequence, start=1):
                    try:
                        with Image(image=img) as img_per_page:
                            img_per_page.type = 'grayscale'
                            img_per_page.depth = 8
                            img_per_page.density = pdf_resolution
                            try:
                                img_per_page.level(black=0.3,
                                                   white=1.0,
                                                   gamma=1.5,
                                                   channel=None)
                            except AttributeError as e:
                                # Best effort: Wand builds without
                                # ``level`` skip the adjustment.
                                print("Update Wand library: %s" % e)
                            img_per_page.save(filename="buffer.png")

                            page_start = time.time()
                            txt = self.image2txt_pyocr(
                                img_per_page.make_blob(imageformat),
                                do_orientation)
                            page_elaboration = time.time() - page_start
                            print("page %s - size %s - process %2d sec. "
                                  "- text %s" %
                                  (page, img_per_page.size,
                                   page_elaboration, len(txt)))
                            page_texts.append("%s\n" % txt)
                    finally:
                        # Free the source frame as soon as the page is done
                        # so large PDFs do not keep every page in memory.
                        img.destroy()

                process_end = time.time() - process_start

        print("Total elaboration time: %s" % process_end)

        return "".join(page_texts)
Esempio n. 2
0
    def pdf_run(self, image_file_name, filename, path):
        """Render every page of a PDF to a grayscale PNG file.

        The PDF is loaded at 300 dpi and converted to PNG. Each page is
        saved as ``<path>/saram_<filename><page>.png``, with pages numbered
        from 1.

        Args:
            image_file_name: Path of the PDF file to read.
            filename: Name fragment used in the output file names.
            path: Directory that receives the PNG files. Its mode is set
                to ``0o777`` before each page is written.
        """
        # Wand images wrap native ImageMagick resources; the context
        # managers release them deterministically, even on errors.
        with Image(filename=image_file_name, resolution=300) as image_pdf:
            with image_pdf.convert("png") as image_page:
                process_start = time.time()

                for page, img in enumerate(image_page.sequence, start=1):
                    # Start the timer before any work so the reported
                    # per-page time covers the conversion and the save.
                    page_start = time.time()
                    try:
                        with Image(image=img) as img_per_page:
                            img_per_page.type = 'grayscale'
                            img_per_page.depth = 8
                            img_per_page.density = 300

                            try:
                                img_per_page.level(black=0.3, white=1.0,
                                                   gamma=1.5, channel=None)
                            except AttributeError as e:
                                # Best effort: Wand builds without
                                # ``level`` skip the adjustment.
                                print("Update Wand library: %s" % e)

                            img_buf = os.path.join(
                                path, "saram_" + filename + str(page) + ".png")

                            # NOTE(review): this makes the output directory
                            # world-writable on every page; confirm such a
                            # permissive mode is really required.
                            os.chmod(path, 0o777)
                            img_per_page.save(filename=img_buf)

                            page_elaboration = time.time() - page_start
                            print("page %s - size %s - process %2d sec." %
                                  (page, img_per_page.size, page_elaboration))
                    finally:
                        # Free the source frame as soon as the page is done.
                        img.destroy()

                process_end = time.time() - process_start

        print("Total elaboration time: %s" % process_end)
0
    def process(self, pdf_filename, pdf_resolution, imageformat,
                do_orientation,
                png_filename):
        """Rasterize a PDF page by page, save each page and run OCR on it.

        Each page is converted to 8-bit grayscale, level-adjusted, written
        to ``png_filename`` and handed to ``self.image2txt_pyocr`` as an
        encoded blob.

        Args:
            pdf_filename: Path of the PDF file to read.
            pdf_resolution: Resolution used when loading the PDF; also
                stored as the density of every page image.
            imageformat: Image format the PDF is converted to and the
                per-page blob is encoded in.
            do_orientation: Passed through unchanged to
                ``self.image2txt_pyocr``.
            png_filename: File every processed page is saved to. The same
                name is reused for each page, so for a multi-page PDF only
                the last page remains on disk.

        Returns:
            The per-page OCR results, each followed by a newline,
            concatenated in page order.
        """
        page_texts = []

        # Wand images wrap native ImageMagick resources; the context
        # managers release them deterministically, even if OCR raises.
        with Image(filename=pdf_filename,
                   resolution=pdf_resolution) as image_pdf:
            with image_pdf.convert(imageformat) as image_page:
                process_start = time.time()
                for img in image_page.sequence:
                    with Image(image=img) as img_per_page:
                        img_per_page.type = 'grayscale'
                        img_per_page.depth = 8
                        img_per_page.density = pdf_resolution
                        # No AttributeError guard here: a Wand build
                        # without ``level`` fails loudly.
                        img_per_page.level(black=0.3, white=1.0, gamma=1.5,
                                           channel=None)
                        img_per_page.save(filename=png_filename)

                        # Keep the OCR output instead of discarding it, so
                        # the method no longer always returns "".
                        txt = self.image2txt_pyocr(
                            img_per_page.make_blob(imageformat),
                            do_orientation)
                        page_texts.append("%s\n" % txt)

                process_end = time.time() - process_start

        print("Total elaboration time: %s" % process_end)

        return "".join(page_texts)