def pdf_run(self, image_file_name, filename, path):
    """Render every page of a PDF to an 8-bit grayscale PNG file.

    Page ``N`` (counted from 1) is saved as ``<path>/saram_<filename>N.png``.
    Per-page and total timings are printed to stdout.

    Args:
        image_file_name: Path of the PDF handed to Wand's ``Image``.
        filename: Stem embedded in every output file name.
        path: Directory that receives the PNG files.
    """
    with Image(filename=image_file_name, resolution=300) as image_pdf:
        with image_pdf.convert("png") as image_page:
            process_start = time.time()
            # NOTE(review): this makes the output directory world-writable;
            # confirm that such broad permissions are really required.
            os.chmod(path, 0o777)
            for page, img in enumerate(image_page.sequence, 1):
                # Start the clock before the page is processed so that the
                # reported time covers the conversion and the save.
                page_start = time.time()
                with Image(image=img) as img_per_page:
                    img_per_page.type = 'grayscale'
                    img_per_page.depth = 8
                    img_per_page.density = 300
                    try:
                        img_per_page.level(black=0.3, white=1.0, gamma=1.5,
                                           channel=None)
                    except AttributeError as e:
                        print("Update Wand library: %s" % e)
                    img_buf = path + '/' + "saram_" + filename + str(page) + ".png"
                    img_per_page.save(filename=img_buf)
                    page_elaboration = time.time() - page_start
                    print("page %s - size %s - process %2d sec." %
                          (page, img_per_page.size, page_elaboration))
                img.destroy()
            process_end = time.time() - process_start
    print("Total elaboration time: %s" % process_end)
def process(self, pdf_filename, pdf_resolution, imageformat, do_orientation):
    """OCR every page of a PDF and return the concatenated text.

    Each page is converted to 8-bit grayscale, level-adjusted, written to
    ``buffer.png`` in the current directory (overwritten for every page) and
    passed to ``self.image2txt_pyocr``.

    Args:
        pdf_filename: Path of the PDF to read.
        pdf_resolution: Resolution used to rasterize the PDF.
        imageformat: Image format used for conversion and for the OCR blob.
        do_orientation: Forwarded unchanged to ``self.image2txt_pyocr``.

    Returns:
        The text of all pages, each page followed by a newline.
    """
    page_texts = []
    with Image(filename=pdf_filename, resolution=pdf_resolution) as image_pdf:
        with image_pdf.convert(imageformat) as image_page:
            process_start = time.time()
            for page, img in enumerate(image_page.sequence, 1):
                with Image(image=img) as img_per_page:
                    img_per_page.type = 'grayscale'
                    img_per_page.depth = 8
                    img_per_page.density = pdf_resolution
                    try:
                        img_per_page.level(black=0.3, white=1.0, gamma=1.5,
                                           channel=None)
                    except AttributeError as e:
                        print("Update Wand library: %s" % e)
                    img_per_page.save(filename="buffer.png")
                    page_start = time.time()
                    txt = self.image2txt_pyocr(
                        img_per_page.make_blob(imageformat), do_orientation)
                    page_elaboration = time.time() - page_start
                    print("page %s - size %s - process %2d sec. - text %s" %
                          (page, img_per_page.size, page_elaboration, len(txt)))
                page_texts.append("%s\n" % txt)
                img.destroy()
            process_end = time.time() - process_start
    print("Total elaboration time: %s" % process_end)
    return "".join(page_texts)
def rasterizeImage(self):
    """Rasterize this page to an image file unless the file already exists.

    The page stream in ``self.bytes`` is rendered with Wand, converted to a
    BGR array and written with ``cv2.imwrite`` to ``self.getImgFilepath()``.
    """
    img_path = self.getImgFilepath()
    if os.path.isfile(img_path):
        return
    print('rasterize page: %s' % (self.n,))

    # NOTE(review): the resolution is derived from mediaBox[3] although the
    # constant is named IMAGE_WIDTH -- confirm which page dimension is meant.
    resolution = int(IMAGE_WIDTH * DPI_TO_PX_RATIO / (self.pdf_page.mediaBox[3]))
    with Image(file=self.bytes, resolution=resolution) as wand_img:
        width, height = wand_img.width, wand_img.height
        wand_img.depth = 8
        blob = wand_img.make_blob(format='RGB')

    # Reinterpret the packed RGB bytes as (height, width, 3) and reverse the
    # channel axis to get the BGR order OpenCV expects; copy() yields a
    # contiguous, writable array.
    rgb = np.frombuffer(blob, dtype=np.uint8)[:height * width * 3]
    img = rgb.reshape(height, width, 3)[:, :, ::-1].copy()
    cv2.imwrite(img_path, img)
def runPage(pdf, n):
    """Rasterize page ``n`` of ``pdf``, undistort it and find its split points.

    Intermediate images are shown in OpenCV windows via ``cv2.imshow``.

    Args:
        pdf: PyPDF2 reader providing ``getPage``.
        n: Index of the page to process.

    Returns:
        A tuple ``(cv_img, structure_data)`` where ``cv_img`` is the
        undistorted BGR image and ``structure_data`` is a dict with the keys
        ``'w'``, ``'h'`` and ``'split_pts'`` as returned by ``split``.
    """
    # Copy the single page into an in-memory PDF that Wand can read.
    pdf_page = pdf.getPage(n)
    pdf_writer = PyPDF2.PdfFileWriter()
    pdf_writer.addPage(pdf_page)
    page_stream = io.BytesIO()
    pdf_writer.write(page_stream)
    page_stream.seek(0)

    # Rasterize.
    resolution = int(IMAGE_WIDTH * DPI_TO_PX_RATIO / (pdf_page.mediaBox[2]))
    with Image(file=page_stream, resolution=resolution) as wand_img:
        width, height = wand_img.width, wand_img.height
        wand_img.depth = 8
        blob = wand_img.make_blob(format='RGB')

    # Packed RGB bytes -> (height, width, 3) array in OpenCV's BGR order.
    rgb = np.frombuffer(blob, dtype=np.uint8)[:height * width * 3]
    cv_img = rgb.reshape(height, width, 3)[:, :, ::-1].copy()
    cv2.imshow('img', cv_img)

    # Undistort.
    cv_img = undistort(cv_img.copy(), verbose=False)
    cv2.imshow('undistorted', cv_img)

    page_w, page_h, split_pts = split(cv_img, verbose=False)
    structure_data = {'w': page_w, 'h': page_h, 'split_pts': split_pts}

    # Draw a horizontal red line at every split point for visual inspection.
    lined_img = cv_img.copy()
    for split_y in structure_data['split_pts']:
        split_y = int(split_y)
        cv2.line(lined_img, (0, split_y), (width, split_y), (0, 0, 255), 0)
    cv2.imshow('img', lined_img)
    return cv_img, structure_data
def rasterizeImage(self):
    """Write a rasterized copy of this page to disk if none exists yet.

    Renders ``self.bytes`` with Wand at a resolution derived from the page's
    media box, converts the result to a BGR array and saves it with
    ``cv2.imwrite`` under ``self.getImgFilepath()``.
    """
    target = self.getImgFilepath()
    if os.path.isfile(target):
        return
    print('rasterize page: %s' % (self.n,))

    # NOTE(review): mediaBox[3] is combined with IMAGE_WIDTH here -- confirm
    # that the intended page dimension is used.
    dpi = int(IMAGE_WIDTH * DPI_TO_PX_RATIO / (self.pdf_page.mediaBox[3]))
    with Image(file=self.bytes, resolution=dpi) as wand_img:
        width, height = wand_img.width, wand_img.height
        wand_img.depth = 8
        blob = wand_img.make_blob(format='RGB')

    # View the raw RGB bytes as an image and flip the channel axis to BGR;
    # copy() makes the array contiguous for OpenCV.
    pixels = np.frombuffer(blob, dtype=np.uint8)[:3 * width * height]
    img = pixels.reshape(height, width, 3)[:, :, ::-1].copy()
    cv2.imwrite(target, img)
def runPage(pdf, n):
    """Render one PDF page, undistort it and compute its structure data.

    Shows the raw, undistorted and annotated images with ``cv2.imshow``.

    Args:
        pdf: PyPDF2 reader providing ``getPage``.
        n: Index of the page to process.

    Returns:
        ``(cv_img, structure_data)``: the undistorted BGR image and a dict
        holding the ``'w'``, ``'h'`` and ``'split_pts'`` values from ``split``.
    """
    # Isolate the requested page in an in-memory single-page PDF.
    pdf_writer = PyPDF2.PdfFileWriter()
    pdf_page = pdf.getPage(n)
    pdf_writer.addPage(pdf_page)
    buf = io.BytesIO()
    pdf_writer.write(buf)
    buf.seek(0)

    # Rasterize.
    dpi = int(IMAGE_WIDTH * DPI_TO_PX_RATIO / (pdf_page.mediaBox[2]))
    with Image(file=buf, resolution=dpi) as wand_img:
        width, height = wand_img.width, wand_img.height
        wand_img.depth = 8
        blob = wand_img.make_blob(format='RGB')

    # Convert the Wand RGB blob to an OpenCV BGR array in one vectorized step.
    pixels = np.frombuffer(blob, dtype=np.uint8)[:3 * width * height]
    cv_img = pixels.reshape(height, width, 3)[:, :, ::-1].copy()
    cv2.imshow('img', cv_img)

    # Undistort.
    cv_img = undistort(cv_img.copy(), verbose=False)
    cv2.imshow('undistorted', cv_img)

    structure_data = {}
    (structure_data['w'],
     structure_data['h'],
     structure_data['split_pts']) = split(cv_img, verbose=False)

    # Mark every split point with a horizontal red line.
    lined_img = cv_img.copy()
    for pt in structure_data['split_pts']:
        row = int(pt)
        cv2.line(lined_img, (0, row), (width, row), (0, 0, 255), 0)
    cv2.imshow('img', lined_img)
    return cv_img, structure_data
def ExtractImg(pdf_reader, n, out_path):
    """Rasterize page ``n`` of a PDF and write it to ``out_path``.

    Args:
        pdf_reader: PyPDF2 reader providing ``getPage``.
        n: Index of the page to extract.
        out_path: Destination file passed to ``cv2.imwrite``.
    """
    # Copy the page into an in-memory single-page PDF for Wand.
    page = pdf_reader.getPage(n)
    writer = PyPDF2.PdfFileWriter()
    writer.addPage(page)
    page_stream = io.BytesIO()
    writer.write(page_stream)
    page_stream.seek(0)

    resolution = IMAGE_WIDTH * DPI_TO_PX_RATIO / (page.mediaBox[3])
    with Image(file=page_stream, resolution=resolution) as wand_img:
        width, height = wand_img.width, wand_img.height
        wand_img.depth = 8
        blob = wand_img.make_blob(format='RGB')

    # Packed RGB bytes -> (height, width, 3) array with channels reversed to
    # BGR, the order cv2.imwrite expects.
    rgb = np.frombuffer(blob, dtype=np.uint8)[:height * width * 3]
    img = rgb.reshape(height, width, 3)[:, :, ::-1].copy()
    cv2.imwrite(out_path, img)
def process(self, pdf_filename, pdf_resolution, imageformat, do_orientation,
            png_filename):
    """Convert each PDF page to grayscale, save it and run OCR on it.

    Every page is saved to ``png_filename`` (so each page overwrites the
    previous one) and handed to ``self.image2txt_pyocr``; the OCR result is
    not collected in this variant.

    Args:
        pdf_filename: Path of the PDF to read.
        pdf_resolution: Resolution used to rasterize the PDF.
        imageformat: Image format used for conversion and for the OCR blob.
        do_orientation: Forwarded unchanged to ``self.image2txt_pyocr``.
        png_filename: File each processed page is saved to.

    Returns:
        Always the empty string, because no page text is accumulated.
    """
    final_text = ""
    with Image(filename=pdf_filename, resolution=pdf_resolution) as image_pdf:
        with image_pdf.convert(imageformat) as image_page:
            process_start = time.time()
            for img in image_page.sequence:
                with Image(image=img) as img_per_page:
                    img_per_page.type = 'grayscale'
                    img_per_page.depth = 8
                    img_per_page.density = pdf_resolution
                    img_per_page.level(black=0.3, white=1.0, gamma=1.5,
                                       channel=None)
                    img_per_page.save(filename=png_filename)
                    self.image2txt_pyocr(img_per_page.make_blob(imageformat),
                                         do_orientation)
            process_end = time.time() - process_start
    print("Total elaboration time: %s" % process_end)
    return final_text
def run(path, pdf_filename):
    """Rasterize every page of a PDF into ``<path>/<name>/<n>.jpg``.

    ``<name>`` is ``pdf_filename`` without its last four characters. Nothing
    is done when that folder already exists. Progress is written to stdout.

    Args:
        path: Directory containing the PDF; the output folder is created here.
        pdf_filename: File name of the PDF inside ``path``.
    """
    sys.stdout.flush()

    # Drop the last four characters (the ".pdf" suffix) to name the folder.
    foldername = pdf_filename[:-4]
    out_dir = path + '/' + foldername
    if os.path.isdir(out_dir):
        # If a folder exists, assume that the images are ready.
        return
    sys.stdout.write('PdfFile %s' % (pdf_filename,))
    os.makedirs(out_dir)

    # Keep the file open while pages are read, and close it afterwards.
    with open(path + '/' + pdf_filename, "rb") as pdf_file:
        pdf = PyPDF2.PdfFileReader(pdf_file)
        num_pages = pdf.getNumPages()
        sys.stdout.write(' %s pages' % (num_pages,))
        for n in range(num_pages):
            # Copy the page into an in-memory single-page PDF for Wand.
            pdf_page = pdf.getPage(n)
            pdf_writer = PyPDF2.PdfFileWriter()
            pdf_writer.addPage(pdf_page)
            page_stream = io.BytesIO()
            pdf_writer.write(page_stream)
            page_stream.seek(0)

            # Rasterize.
            resolution = int(IMAGE_WIDTH * DPI_TO_PX_RATIO / (pdf_page.mediaBox[3]))
            with Image(file=page_stream, resolution=resolution) as wand_img:
                width, height = wand_img.width, wand_img.height
                wand_img.depth = 8
                blob = wand_img.make_blob(format='RGB')

            # Packed RGB bytes -> (height, width, 3) array in BGR order.
            rgb = np.frombuffer(blob, dtype=np.uint8)[:height * width * 3]
            cv_img = rgb.reshape(height, width, 3)[:, :, ::-1].copy()
            cv2.imwrite(out_dir + '/' + str(n) + '.jpg', cv_img)
            sys.stdout.write(' .')
    sys.stdout.write(' \n')
def main():
    """Render the glyphs 0x20-0x7d of a font into a C source and header file.

    Reads the font file, size, name, output directory and verbosity from
    ``get_args()``. Writes ``<out_dir>/<font_name>.c`` with one byte array per
    glyph, a ``letters`` table and a ``struct font`` definition, then writes
    ``<out_dir>/<font_name>.h`` declaring that struct.

    Raises:
        AssertionError: If the text height or the maximum horizontal advance
            differs between glyphs.
    """
    args = get_args()
    draw = Drawing()
    draw.font = args.font_file
    draw.font_size = args.font_size
    font_name = args.font_name
    out_dir = args.out_dir

    range_first = 0x20
    range_last = 0x7d
    font_height = 0
    max_width = 0
    font_width = []

    source_path = out_dir + "/" + font_name + ".c"
    # Context managers release the reference image and the output file even
    # when an assertion or a rendering call fails.
    with Image(width=1000, height=1000) as img_ref:
        if args.verbose:
            print("Writing " + source_path)
        with open(source_path, 'wb+') as f:
            write_comment(f)
            f.write("#include \"font.h\"\n\n")
            for x in range(range_first, range_last + 1):
                letter = chr(x)
                metrics = draw.get_font_metrics(img_ref, letter)
                # Glyph dimensions are padded by two pixels.
                text_height = int(round(metrics.text_height + 2))
                if font_height == 0:
                    font_height = text_height
                assert (font_height == text_height), "font height changed!"
                if max_width == 0:
                    max_width = metrics.maximum_horizontal_advance + 2
                assert (max_width == metrics.maximum_horizontal_advance + 2), \
                    "font advance width changed!"
                text_width = int(round(metrics.text_width + 2))
                font_width.append(text_width)
                with Image(width=text_width, height=text_height) as img:
                    with draw.clone() as d:
                        d.text(0, int(metrics.ascender), letter)
                        d(img)
                    img.depth = 1
                    f.write("static const unsigned char ")
                    f.write("letter_%x[] = {\n" % x)
                    c_hex_print(f, img.make_blob(format='A'))
                    f.write("};\n\n")

            f.write("static const struct font_letter letters[] = {\n")
            for x in range(range_first, range_last + 1):
                letter_var_name = "letter_%x" % x
                f.write("\t{ " + letter_var_name + ", ")
                f.write("sizeof(" + letter_var_name + "), ")
                f.write(str(font_width[x - range_first]) + "},\n")
            f.write("};\n\n")

            f.write("const struct font font_" + font_name + " = {\n")
            f.write("\t.first = " + hex(range_first) + ",\n")
            f.write("\t.last = " + hex(range_last) + ",\n")
            f.write("\t.letters = letters,\n")
            f.write("\t.height = " + str(font_height) + ",\n")
            f.write("\t.max_width = " + str(max_width) + ",\n")
            f.write("};\n")

    header_path = out_dir + "/" + font_name + ".h"
    guard = "__" + font_name.upper() + "_H"
    if args.verbose:
        print("Writing " + header_path)
    with open(header_path, 'wb+') as f:
        write_comment(f)
        f.write("#ifndef " + guard + "\n")
        f.write("#define " + guard + "\n")
        f.write("#include \"font.h\"\n")
        f.write("extern const struct font font_" + font_name + ";\n")
        f.write("#endif /*" + guard + "*/\n")
def main():
    """Generate ``<font_name>.c`` and ``<font_name>.h`` for glyphs 0x20-0x7d.

    The options come from ``get_args()``. The C file receives one byte array
    per glyph, a ``letters`` lookup table and the ``struct font`` definition.
    The header declares that struct behind an include guard.

    Raises:
        AssertionError: If the text height or the maximum horizontal advance
            is not identical for all glyphs.
    """
    args = get_args()
    draw = Drawing()
    draw.font = args.font_file
    draw.font_size = args.font_size
    font_name = args.font_name
    out_dir = args.out_dir

    range_first = 0x20
    range_last = 0x7d
    code_points = range(range_first, range_last + 1)
    font_height = 0
    max_width = 0
    font_width = []

    c_path = out_dir + "/" + font_name + ".c"
    # "with" guarantees the reference image and file are released on errors.
    with Image(width=1000, height=1000) as img_ref:
        if args.verbose:
            print("Writing " + c_path)
        with open(c_path, 'wb+') as f:
            write_comment(f)
            f.write("#include \"font.h\"\n\n")
            for x in code_points:
                letter = chr(x)
                metrics = draw.get_font_metrics(img_ref, letter)
                # Both glyph dimensions get two pixels of padding.
                text_height = int(round(metrics.text_height + 2))
                if font_height == 0:
                    font_height = text_height
                assert (font_height == text_height), "font height changed!"
                advance = metrics.maximum_horizontal_advance + 2
                if max_width == 0:
                    max_width = advance
                assert (max_width == advance), "font advance width changed!"
                text_width = int(round(metrics.text_width + 2))
                font_width.append(text_width)
                with Image(width=text_width, height=text_height) as img:
                    with draw.clone() as d:
                        d.text(0, int(metrics.ascender), letter)
                        d(img)
                    img.depth = 1
                    f.write("static const unsigned char ")
                    f.write("letter_%x[] = {\n" % x)
                    c_hex_print(f, img.make_blob(format='A'))
                    f.write("};\n\n")

            f.write("static const struct font_letter letters[] = {\n")
            for x, width in zip(code_points, font_width):
                letter_var_name = "letter_%x" % x
                f.write("\t{ " + letter_var_name + ", ")
                f.write("sizeof(" + letter_var_name + "), ")
                f.write(str(width) + "},\n")
            f.write("};\n\n")

            f.write("const struct font font_" + font_name + " = {\n")
            f.write("\t.first = " + hex(range_first) + ",\n")
            f.write("\t.last = " + hex(range_last) + ",\n")
            f.write("\t.letters = letters,\n")
            f.write("\t.height = " + str(font_height) + ",\n")
            f.write("\t.max_width = " + str(max_width) + ",\n")
            f.write("};\n")

    h_path = out_dir + "/" + font_name + ".h"
    include_guard = "__" + font_name.upper() + "_H"
    if args.verbose:
        print("Writing " + h_path)
    with open(h_path, 'wb+') as f:
        write_comment(f)
        f.write("#ifndef " + include_guard + "\n")
        f.write("#define " + include_guard + "\n")
        f.write("#include \"font.h\"\n")
        f.write("extern const struct font font_" + font_name + ";\n")
        f.write("#endif /*" + include_guard + "*/\n")
def main():
    """Run the upload worker: turn queued uploads into published JPEGs.

    Loops forever. Each hash popped from the Redis list ``_incoming`` names a
    pickled descriptor and a file ``upload/<hash>.<ext>``. The image must
    carry an EXIF ``DateTimeOriginal`` no older than 365 days. It is then
    re-encoded as ``static/<hash>.jpg`` with a reduced EXIF block, a 32x32
    thumbnail is written, and the hash is pushed onto ``_images``. On any
    failure the error text is stored as the descriptor status with an expiry.
    """
    mc = redis.StrictRedis()
    while True:
        hash_value = mc.rpop('_incoming')
        if not hash_value:
            time.sleep(0.25)
            continue
        desc = mc.get(hash_value)
        if not desc:
            continue
        # NOTE(review): unpickling is only safe if nothing untrusted can
        # write to this Redis instance -- confirm.
        desc = cPickle.loads(desc)
        try:
            desc = desc._replace(status='Processing')
            mc.set(hash_value, cPickle.dumps(desc))

            upload_path = os.path.join('upload', hash_value + '.' + desc.ext)
            # Close the Wand image once its metadata and pixels are extracted;
            # otherwise this endless loop leaks one native image per upload.
            with Image(filename=upload_path) as src:
                dt_string = src.metadata.get('exif:DateTimeOriginal')
                uniq = src.metadata.get('exif:UniqueCameraModel')
                mm = (src.metadata.get('exif:Make'),
                      src.metadata.get('exif:Model'))
                if dt_string is None:
                    raise Exception("Image date undefined")
                dt = datetime.strptime(dt_string, '%Y:%m:%d %H:%M:%S')
                if (datetime.now() - dt).days > 365:
                    raise Exception("Image too old")
                src.depth = 8
                src.format = 'RGB'
                blob = src.make_blob()
                size = src.size
            im = PIL.Image.frombytes('RGB', size, blob)

            # Build the reduced EXIF block for the published JPEG.
            imgif = {}
            model_string = ''
            if uniq:
                model_string = uniq
                imgif[piexif.ImageIFD.UniqueCameraModel] = uniq
            else:
                if mm[0]:
                    imgif[piexif.ImageIFD.Make] = mm[0]
                if mm[1]:
                    imgif[piexif.ImageIFD.Model] = mm[1]
                model_string = ' '.join([v for v in mm if v])
            imgif[piexif.ImageIFD.DateTime] = desc.upload_date
            exif = {piexif.ExifIFD.DateTimeOriginal: dt_string,
                    piexif.ExifIFD.UserComment: desc.name}
            exif_dict = {"0th": imgif, "Exif": exif}
            exif_bytes = piexif.dump(exif_dict)

            fp = io.BytesIO()
            im.save(fp, "JPEG", exif=exif_bytes)
            saved = fp.getvalue()
            size_string = '%i' % len(saved)
            desc = ImageDesc(hash_value=hash_value,
                             name=desc.name,
                             ext=desc.ext,
                             upload_date=desc.upload_date,
                             creation_date=dt_string,
                             camera=model_string,
                             size=size_string,
                             status='OK')

            os.remove(upload_path)
            with open(os.path.join('static', hash_value + '.jpg'), 'wb') as f:
                f.write(saved)
            im.thumbnail((32, 32), PIL.Image.ANTIALIAS)
            im.save(os.path.join('static', hash_value + '_thumb.jpg'), "JPEG")
            mc.set(hash_value, cPickle.dumps(desc))
            mc.rpush('_images', hash_value)
        except Exception as e:
            desc = desc._replace(status=str(e))
            # NOTE(review): setex takes seconds in redis-py, so 60 * 24 is
            # 24 minutes -- confirm that this is not meant to be 24 hours.
            mc.setex(hash_value, 60 * 24, cPickle.dumps(desc))
def main():
    """Process queued image uploads forever.

    For each hash popped from the Redis list ``_incoming``: load its pickled
    descriptor, validate ``upload/<hash>.<ext>`` (EXIF ``DateTimeOriginal``
    present and at most 365 days old), write ``static/<hash>.jpg`` with a
    reduced EXIF block plus a 32x32 ``static/<hash>_thumb.jpg``, store the
    final descriptor and push the hash onto ``_images``. If anything raises,
    the exception text becomes the descriptor status, stored with an expiry.
    """
    mc = redis.StrictRedis()
    while True:
        hash_value = mc.rpop('_incoming')
        if not hash_value:
            # Queue is empty: back off briefly before polling again.
            time.sleep(0.25)
            continue
        raw_desc = mc.get(hash_value)
        if not raw_desc:
            continue
        # NOTE(review): cPickle.loads must only see trusted data -- confirm
        # that this Redis instance is not writable by untrusted clients.
        desc = cPickle.loads(raw_desc)
        try:
            desc = desc._replace(status='Processing')
            mc.set(hash_value, cPickle.dumps(desc))

            source_file = os.path.join('upload', hash_value + '.' + desc.ext)
            # The Wand image is released as soon as the raw pixels are
            # extracted, so the endless loop does not accumulate native images.
            with Image(filename=source_file) as wand_im:
                metadata = wand_im.metadata
                dt_string = metadata.get('exif:DateTimeOriginal')
                uniq = metadata.get('exif:UniqueCameraModel')
                mm = metadata.get('exif:Make'), metadata.get('exif:Model')
                if dt_string is None:
                    raise Exception("Image date undefined")
                dt = datetime.strptime(dt_string, '%Y:%m:%d %H:%M:%S')
                if (datetime.now() - dt).days > 365:
                    raise Exception("Image too old")
                wand_im.depth = 8
                wand_im.format = 'RGB'
                blob = wand_im.make_blob()
                dimensions = wand_im.size
            im = PIL.Image.frombytes('RGB', dimensions, blob)

            # Keep only the camera identification and dates in the new EXIF.
            imgif = {}
            model_string = ''
            if uniq:
                model_string = uniq
                imgif[piexif.ImageIFD.UniqueCameraModel] = uniq
            else:
                if mm[0]:
                    imgif[piexif.ImageIFD.Make] = mm[0]
                if mm[1]:
                    imgif[piexif.ImageIFD.Model] = mm[1]
                model_string = ' '.join([v for v in mm if v])
            imgif[piexif.ImageIFD.DateTime] = desc.upload_date
            exif = {
                piexif.ExifIFD.DateTimeOriginal: dt_string,
                piexif.ExifIFD.UserComment: desc.name
            }
            exif_bytes = piexif.dump({"0th": imgif, "Exif": exif})

            fp = io.BytesIO()
            im.save(fp, "JPEG", exif=exif_bytes)
            saved = fp.getvalue()
            desc = ImageDesc(hash_value=hash_value,
                             name=desc.name,
                             ext=desc.ext,
                             upload_date=desc.upload_date,
                             creation_date=dt_string,
                             camera=model_string,
                             size='%i' % len(saved),
                             status='OK')

            os.remove(source_file)
            with open(os.path.join('static', hash_value + '.jpg'), 'wb') as f:
                f.write(saved)
            im.thumbnail((32, 32), PIL.Image.ANTIALIAS)
            im.save(os.path.join('static', hash_value + '_thumb.jpg'), "JPEG")
            mc.set(hash_value, cPickle.dumps(desc))
            mc.rpush('_images', hash_value)
        except Exception as e:
            desc = desc._replace(status=str(e))
            # NOTE(review): redis-py's setex expiry is in seconds; 60 * 24 is
            # 24 minutes -- confirm whether 24 hours was intended.
            mc.setex(hash_value, 60 * 24, cPickle.dumps(desc))