def extract_text(self, is_captcha=False):
    """Run OCR on ``self.image`` and return the recognised text.

    ``self.image`` may be a path string (opened with ``Image.open``) or an
    already loaded image object; either way it is converted to greyscale
    (mode ``'L'``) before recognition.

    :param is_captcha: when true, every non-word character is stripped from
        the OCR result before it is returned.
    :return: the OCR text; for captchas, the filtered text (possibly empty).
    """
    source = self.image
    if isinstance(source, str):
        source = Image.open(source)
    image = source.convert('L')

    # Run OCR exactly once. The previous ``a and b or c`` expression ran it a
    # second time and returned the *unfiltered* text whenever the filtered
    # captcha text happened to be empty.
    text = image_to_string(image)
    if not is_captcha:
        return text
    return sub(r'[\W]', '', (text or '').strip())
def get_text_from_file(self, filepath):
    """Return the text pyocr's tesseract wrapper recognises in *filepath*.

    :param filepath: path of the image file to read.
    :return: recognised text with leading/trailing CR and LF removed.
    """
    from pyocr import tesseract, builders
    from PIL import Image

    # The context manager closes the underlying file handle once OCR is done.
    with Image.open(filepath) as image:
        text = tesseract.image_to_string(image=image,
                                         builder=builders.TextBuilder())
    return text.strip('\r\n')
def __test_txt(self, image_file, expected_box_file, lang='eng'):
    """Compare the boxes OCR'd from an image with a stored reference file.

    Both file names are relative: the image lives under ``tests/data/`` and
    the reference under ``tests/tesseract/``. Besides equality, the content
    of every box must be a text string (``unicode`` on Python 2, ``str`` on
    Python 3).
    """
    image_path = "tests/data/" + image_file
    reference_path = "tests/tesseract/" + expected_box_file

    with codecs.open(reference_path, 'r', encoding='utf-8') as reference:
        expected_boxes = self.builder.read_file(reference)
    expected_boxes.sort()

    boxes = tesseract.image_to_string(Image.open(image_path), lang=lang,
                                      builder=self.builder)
    boxes.sort()

    self.assertTrue(len(boxes) > 0)
    self.assertEqual(len(boxes), len(expected_boxes))

    for actual, expected in zip(boxes, expected_boxes):
        try:
            # python 2.7
            self.assertEqual(type(expected.content), unicode)
            self.assertEqual(type(actual.content), unicode)
        except NameError:
            # python 3: the name ``unicode`` does not exist
            self.assertEqual(type(expected.content), str)
            self.assertEqual(type(actual.content), str)
        self.assertEqual(actual, expected)
def test_text(image_file, lang='eng'):
    """Print *image_file* and return the digits tesseract recognises in it.

    :param image_file: path of the image to read.
    :param lang: tesseract language code.
    :return: whatever ``tesseract.image_to_string`` yields for a
        ``DigitBuilder``.
    """
    # ``print(x)`` with a single argument behaves the same on Python 2 and 3;
    # the former ``print x`` statement was a syntax error on Python 3.
    print(image_file)
    # Close the image file as soon as OCR has finished.
    with Image.open(image_file) as image:
        return tesseract.image_to_string(
            image, lang=lang, builder=tesseract.DigitBuilder())
def _read_from_img(self, image_path, lang=None):
    """OCR the image at *image_path* with ``self._builder``.

    :param image_path: path of the image file.
    :param lang: language passed through to tesseract (``None`` = default).
    :return: the resulting boxes, sorted in place.
    """
    # Use a context manager so the image file handle is released after OCR.
    with Image.open(image_path) as image:
        boxes = tesseract.image_to_string(image, lang=lang,
                                          builder=self._builder)
    boxes.sort()
    return boxes
def get_digit(self, image, min_score):
    """Read a numeric score from a fixed region of *image* via OCR.

    Returns 0 when ``self.recognize_digit`` is falsy. Otherwise the region
    picked by ``selectROI`` is written to ``digit_sample.jpg`` and OCR'd;
    commas and dots are dropped from the text. If what remains is all digits
    and at least *min_score*, the value divided by 100 is returned, otherwise
    *min_score* itself.
    """
    if not self.recognize_digit:
        return 0

    import pyocr
    import pyocr.tesseract as tess
    import pyocr.builders
    from PIL import Image

    roi_ratio = (0.1016, 0.1836, 0.2228, 0.2268)
    digit_img = selectROI(image, ratio=roi_ratio, round8=False)
    # Keep a copy of the cropped region on disk.
    cv2.imwrite('digit_sample.jpg', digit_img)

    raw_text = tess.image_to_string(Image.fromarray(digit_img), lang='eng',
                                    builder=pyocr.builders.TextBuilder())
    digits_only = raw_text.replace(',', '').replace('.', '')

    # Reward should not decrease: fall back to min_score on a bad reading.
    # NOTE(review): the comparison uses the unscaled value while the result
    # is scaled by 1/100 - confirm this is intended.
    if digits_only.isdigit() and int(digits_only) >= min_score:
        return int(digits_only) / 100.0
    return min_score
def test_text_error_file(self, run_tesseract, copen, temp_dir):
    """A generic exception raised while opening the output file propagates.

    The three mock arguments are presumably injected by ``patch`` decorators
    that are outside this view.
    """
    run_tesseract.return_value = (0, "")
    copen.side_effect = Exception("Unknown error")

    with TemporaryDirectory(prefix="tess_") as workdir:
        # Make the patched temp-dir factory hand out our real directory.
        dir_context = MagicMock()
        dir_context.__enter__.return_value = workdir
        temp_dir.return_value = dir_context

        with self.assertRaises(Exception):
            tesseract.image_to_string(self.image, builder=self.builder)

        expected_kwargs = {
            "cwd": workdir,
            "lang": None,
            "flags": self.builder.tesseract_flags,
            "configs": self.builder.tesseract_configs,
        }
        run_tesseract.assert_called_once_with(
            "input.bmp", "output", **expected_kwargs)
def test_text_cannot_open_file(self, run_tesseract, copen, temp_dir):
    """A ``PermissionError`` on the output file reaches the caller unchanged.

    The three mock arguments are presumably injected by ``patch`` decorators
    that are outside this view.
    """
    run_tesseract.return_value = (0, "")
    copen.side_effect = PermissionError(errno.EPERM, "Error opening file")

    with TemporaryDirectory(prefix="tess_") as workdir:
        # Make the patched temp-dir factory hand out our real directory.
        dir_context = MagicMock()
        dir_context.__enter__.return_value = workdir
        temp_dir.return_value = dir_context

        with self.assertRaises(PermissionError):
            tesseract.image_to_string(self.image, builder=self.builder)

        expected_kwargs = {
            "cwd": workdir,
            "lang": None,
            "flags": self.builder.tesseract_flags,
            "configs": self.builder.tesseract_configs,
        }
        run_tesseract.assert_called_once_with(
            "input.bmp", "output", **expected_kwargs)
def main(self, text_img_name):
    """Run OCR on a single image file and return the recognised text.

    :param text_img_name: path of the image to read.
    :return: the text produced by ``tool.image_to_string`` with a
        ``TextBuilder``, using ``self.lang`` as the language.
    """
    # assumes ``Im`` is PIL's Image module, whose ``open`` result is a
    # context manager - closing it releases the file handle after OCR.
    with Im.open(text_img_name) as image:
        return tool.image_to_string(
            image,
            lang=self.lang,
            builder=pyocr.builders.TextBuilder(),
        )
def _read_from_img(self, image_path, lang=None):
    """Return the sorted OCR boxes for the image stored at *image_path*.

    :param image_path: path of the image file.
    :param lang: language handed to tesseract (``None`` = default).
    :return: list of boxes built by ``self._builder``, sorted in place.
    """
    # The ``with`` block closes the image file once tesseract has read it.
    with Image.open(image_path) as image:
        boxes = tesseract.image_to_string(
            image, lang=lang, builder=self._builder
        )
    boxes.sort()
    return boxes
def find_secret_rects(image, secret_res, lang, tesseract_configs=None):
    """
    Locate the rectangles covering text that matches any secret pattern.

    param: Image image
    param: list secret_res  (compiled regular expressions)
    param: str lang
    param: str tesseract_configs
    return: list of rects
    rtype: list
    """
    # pyocr converts its input to RGB (not RGBA), which turns transparent
    # pixels BLACK. Those black areas can hurt recognition: for example a
    # macOS window screenshot (Command+Shift+4+Space) has a transparent
    # margin that becomes a thick black border and degrades OCR of nearby
    # text. Flatten the image onto a WHITE background first.
    #
    # See: http://stackoverflow.com/questions/9166400/convert-rgba-png-to-rgb-with-pil
    if image.mode == 'RGBA':
        alpha = image.split()[3]
        white = Image.new('RGB', image.size, (255, 255, 255))
        white.paste(image, mask=alpha)  # alpha channel is the paste mask
        image = white

    # The whole image is scanned, so detected rects need no shifting.
    offset = (0, 0)
    target = image

    builder = ModifiedCharBoxBuilder(target.size[1])
    if tesseract_configs:
        builder.tesseract_configs = tesseract_configs
    boxes = image_to_string(target, lang=lang, builder=builder)

    if os.environ.get('DEBUG'):
        for char_box in boxes:
            print(char_box.content, char_box.position)

    # One character per box, so string indices map directly to box indices.
    content = ''.join(char_box.content for char_box in boxes)
    assert len(boxes) == len(content)

    secret_rects = []
    for pattern in secret_res:
        for match in pattern.finditer(content):
            positions = [
                char_box.position
                for char_box in boxes[match.start():match.end()]
            ]
            secret_rects.extend(
                offset_rect(offset, padding_box(line_rect, 2))
                for line_rect in bounding_boxes_by_line(positions)
            )
    return secret_rects
def test_char_error(self, run_tesseract, copen, temp_dir):
    """A non-zero tesseract status surfaces as ``TesseractError``.

    The three mock arguments are presumably injected by ``patch`` decorators
    that are outside this view.
    """
    run_tesseract.return_value = (1, "Error")
    copen.return_value = StringIO(self._get_file_content("boxes"))

    with TemporaryDirectory(prefix="tess_") as workdir:
        # Make the patched temp-dir factory hand out our real directory.
        dir_context = MagicMock()
        dir_context.__enter__.return_value = workdir
        temp_dir.return_value = dir_context

        with self.assertRaises(tesseract.TesseractError) as raised:
            tesseract.image_to_string(self.image, builder=self.builder)

        # The error carries the status and message returned by the mock.
        self.assertEqual(raised.exception.status, 1)
        self.assertEqual(raised.exception.message, "Error")

        expected_kwargs = {
            "cwd": workdir,
            "lang": None,
            "flags": self.builder.tesseract_flags,
            "configs": self.builder.tesseract_configs,
        }
        run_tesseract.assert_called_once_with(
            "input.bmp", "output", **expected_kwargs)
def test_char_no_output(self, run_tesseract, copen, temp_dir):
    """A missing output file is reported as ``TesseractError`` status -1.

    The three mock arguments are presumably injected by ``patch`` decorators
    that are outside this view.
    """
    run_tesseract.return_value = (0, "No file output")
    copen.side_effect = FileNotFoundError(
        errno.ENOENT, "[Errno 2] No such file or directory: 'output'")

    with TemporaryDirectory(prefix="tess_") as workdir:
        # Make the patched temp-dir factory hand out our real directory.
        dir_context = MagicMock()
        dir_context.__enter__.return_value = workdir
        temp_dir.return_value = dir_context

        with self.assertRaises(tesseract.TesseractError) as raised:
            tesseract.image_to_string(self.image, builder=self.builder)

        self.assertEqual(raised.exception.status, -1)
        self.assertIn("Unable to find output file (tested",
                      raised.exception.message)

        expected_kwargs = {
            "cwd": workdir,
            "lang": None,
            "flags": self.builder.tesseract_flags,
            "configs": self.builder.tesseract_configs,
        }
        run_tesseract.assert_called_once_with(
            "input.bmp", "output", **expected_kwargs)
def __test_txt(self, image_file, expected_output_file, lang="eng"):
    """Assert that OCR of an image equals the stored reference text.

    The image is read from ``tests/data/`` and the reference from
    ``tests/tesseract/``; the reference is stripped before comparison.
    """
    image_path = "tests/data/" + image_file
    reference_path = "tests/tesseract/" + expected_output_file

    with codecs.open(reference_path, "r", encoding="utf-8") as reference:
        # Joining the lines reproduces the file content in one step.
        expected_output = "".join(reference).strip()

    output = tesseract.image_to_string(Image.open(image_path), lang=lang)
    self.assertEqual(output, expected_output)
def main(self, path):
    """Rename each supported image in *path* after the text OCR finds in it.

    Files whose extension (lower-cased) is not in ``VALIDITY`` are counted
    and skipped. For every other file the recognised text, with control
    characters and path separators replaced and cut to 60 characters, becomes
    the new base name; the original extension is kept. A file is left
    untouched when no text was recognised or when the target name already
    exists, so no file is ever overwritten. Progress and a final summary are
    printed.

    :param path: directory whose files are processed.
    """
    count = 0
    other_files = 0

    for f in os.listdir(path):
        ext = os.path.splitext(f)[1]
        if ext.lower() not in VALIDITY:
            other_files += 1
            continue
        count += 1

        image_file_name = os.path.join(path, f)
        # Close the image before renaming: an open handle blocks the rename
        # on some platforms and leaks a descriptor on all of them.
        with Im.open(image_file_name) as image:
            txt = tool.image_to_string(
                image, lang=self.lang, builder=pyocr.builders.TextBuilder())

        initial = txt.replace('\r', '')
        # Control characters become spaces; so do path separators, which
        # would otherwise move the file into another directory or make the
        # rename fail.
        for unwanted in ('\a', '\b', '\f', '\n', '\t', '\v', '/', os.sep):
            initial = initial.replace(unwanted, ' ')
        initial = initial[:60]  # keep at most the first 60 characters
        print('Filename:' + initial + '\n')

        # NOTE(review): this makes the directory world-writable on every
        # iteration; kept because callers may rely on it.
        os.chmod(path, 0o777)

        target = os.path.join(path, initial + ext)
        if not initial.strip():
            # An empty name would turn every such file into "<ext>" and make
            # them overwrite each other.
            print('No text recognised, keeping ' + f)
        elif os.path.exists(target):
            print('Target already exists, keeping ' + f)
        else:
            os.rename(image_file_name, target)

        print(str(count) + (" file" if count == 1 else " files") +
              " processed")

    if count + other_files == 0:
        print("No files found")
    else:
        print(str(count) + " / " + str(count + other_files) +
              " files converted")
def __test_txt(self, image_file, expected_output_file, lang='eng'):
    """Check that the OCR text of an image matches its reference file.

    *image_file* is looked up under ``tests/data/`` and
    *expected_output_file* under ``tests/tesseract/``.
    """
    image_path = "tests/data/" + image_file
    reference_path = "tests/tesseract/" + expected_output_file

    with codecs.open(reference_path, 'r', encoding='utf-8') as reference:
        reference_lines = list(reference)
    # Surrounding whitespace in the reference file is not significant.
    expected_output = "".join(reference_lines).strip()

    output = tesseract.image_to_string(Image.open(image_path), lang=lang)
    self.assertEqual(output, expected_output)
def __test_text(self, image_file, expected_output_file, lang='eng'):
    """Check OCR output produced with ``self.builder`` against a reference.

    *image_file* is looked up under ``tests/data/`` and
    *expected_output_file* under ``tests/tesseract/``; the reference text is
    stripped before the comparison.
    """
    image_path = "tests/data/" + image_file
    reference_path = "tests/tesseract/" + expected_output_file

    with codecs.open(reference_path, 'r', encoding='utf-8') as reference:
        expected_output = "".join(reference).strip()

    output = tesseract.image_to_string(
        Image.open(image_path),
        lang=lang,
        builder=self.builder,
    )
    self.assertEqual(output, expected_output)
def __test_txt(self, image_file, expected_box_file, lang="eng"):
    """Assert that the boxes OCR'd from an image equal the reference boxes.

    The image is read from ``tests/data/`` and the reference box file from
    ``tests/tesseract/``. Both lists are sorted before being compared.
    """
    image_path = "tests/data/" + image_file
    reference_path = "tests/tesseract/" + expected_box_file

    with codecs.open(reference_path, "r", encoding="utf-8") as reference:
        expected_boxes = self.builder.read_file(reference)
    expected_boxes.sort()

    boxes = tesseract.image_to_string(Image.open(image_path), lang=lang,
                                      builder=self.builder)
    boxes.sort()

    self.assertEqual(len(boxes), len(expected_boxes))
    # Pairwise comparison pinpoints the first box that differs.
    for actual, expected in zip(boxes, expected_boxes):
        self.assertEqual(actual, expected)
def __test_txt(self, image_file, expected_output_file, lang='eng'):
    """Assert that OCR of a "specific" test image equals its reference text.

    The image comes from ``tests/input/specific`` and the reference from
    ``tests/output/specific/tesseract``.
    """
    image_path = os.path.join("tests", "input", "specific", image_file)
    reference_path = os.path.join("tests", "output", "specific",
                                  "tesseract", expected_output_file)

    with codecs.open(reference_path, 'r', encoding='utf-8') as reference:
        # Surrounding whitespace in the reference file is not significant.
        expected_output = "".join(reference).strip()

    output = tesseract.image_to_string(Image.open(image_path), lang=lang)
    self.assertEqual(output, expected_output)
def rec_img(img):
    """Crop the index figure out of a screenshot, enlarge it and OCR it.

    :param img: PIL image of the screenshot.
    :return: the digits found by tesseract, as a string with every non-digit
        character removed.
    """
    width, height = img.size
    # Position of the index figure as (left, top, right, bottom).
    # NOTE(review): ``length`` is not defined in this function; it must be
    # provided by the enclosing/module scope.
    rangle = (24.5 + 6.1 * length + 5, int(height / 2), int(width),
              int(height))
    # Cut the region out of the screenshot.
    img = img.crop(rangle)

    # Enlarge the crop by a factor of 2.4 before recognition.
    (x, y) = img.size
    # ``Image.ANTIALIAS`` was an alias of ``Image.LANCZOS`` and has been
    # removed in Pillow 10; LANCZOS is the same filter under its lasting name.
    imgzoom = img.resize((int(x * 2.4), int(y * 2.4)), Image.LANCZOS)

    code = tesseract.image_to_string(imgzoom)
    # Raw string: "\D" in a normal literal is an invalid escape sequence.
    return re.sub(r"\D", "", code)
def __test_txt(self, image_file, expected_output_file, lang='eng'):
    """Check the OCR text of a "specific" input image against its reference.

    Paths are built below ``tests/input/specific`` (image) and
    ``tests/output/specific/tesseract`` (reference text).
    """
    image_path = os.path.join("tests", "input", "specific", image_file)
    reference_path = os.path.join(
        "tests", "output", "specific", "tesseract", expected_output_file
    )

    with codecs.open(reference_path, 'r', encoding='utf-8') as reference:
        reference_lines = list(reference)
    expected_output = "".join(reference_lines).strip()

    output = tesseract.image_to_string(Image.open(image_path), lang=lang)
    self.assertEqual(output, expected_output)
def __test_txt(self, image_file, expected_box_file, lang='eng'):
    """Compare sorted OCR boxes of an image with the sorted reference boxes.

    *image_file* is looked up under ``tests/data/`` and *expected_box_file*
    under ``tests/tesseract/``.
    """
    image_path = "tests/data/" + image_file
    reference_path = "tests/tesseract/" + expected_box_file

    with codecs.open(reference_path, 'r', encoding='utf-8') as reference:
        expected_boxes = self.builder.read_file(reference)
    expected_boxes.sort()

    boxes = tesseract.image_to_string(
        Image.open(image_path), lang=lang, builder=self.builder)
    boxes.sort()

    self.assertEqual(len(boxes), len(expected_boxes))
    for position, expected in enumerate(expected_boxes[:len(boxes)]):
        self.assertEqual(boxes[position], expected)
def login(username, password):
    """Log in to ourui.com, solving the login captcha with tesseract.

    Each attempt downloads the captcha to ``code.jpg``, saves a greyscale
    copy (``gcode.jpg``) and a binarised copy (``bcode.jpg``), OCRs the
    binarised image and asks the site to check the code. When the check
    answers ``0`` or nothing, a new captcha is tried. Once the code is
    accepted the credentials are posted and the function returns.

    Any error during an attempt is reported and followed by a retry one
    second later, so the function only returns after a completed attempt.

    :param username: account name sent as ``username``.
    :param password: password sent as ``userpass``.
    """
    name = "code.jpg"
    while True:
        try:
            image = requests.get(
                'http://www.ourui.com/userself/RndCode.asp?rndtype=LOGIN_RndCode')
            with open(name, "wb+") as f:
                f.write(image.content)

            imagefile = Image.open(name)
            # Convert to luminance.
            imgry = imagefile.convert('L')
            imgry.save('g' + name)
            # Binarise.
            out = imgry.point(lambda x: 255 if x > 141 else 0)
            out.save('b' + name)
            # Recognise.
            val = tesseract.image_to_string(out)
            print(val)

            # ``params`` URL-encodes the OCR result, which may contain
            # spaces or other characters that are unsafe in a query string.
            imgreq = requests.get(
                "http://www.ourui.com/UserSelf/rndcode_check.asp",
                params={"name": "lg", "rndcode": val})
            print(imgreq.content)
            # ``content`` is bytes; comparing it with str literals never
            # matched on Python 3, so rejected codes were not retried.
            if imgreq.content in (b"0", b""):
                continue

            requests.post("http://www.ourui.com/userself/login.asp",
                          data={"username": username, "userpass": password})
            # The former message referenced ``self.domain``, which does not
            # exist in this plain function: the NameError was swallowed by a
            # bare ``except`` and the loop never ended after a good login.
            print("login ok register")
            break
        except Exception:
            # Best-effort retry, but no longer a bare ``except`` so that
            # KeyboardInterrupt/SystemExit can stop the loop.
            time.sleep(1)
            print("矮油网速太渣,登录失败,重新登录!")
def get_data(self, region, rerun=0, image_optimizer=None,
             data_optimizer=None, threshold=None):
    '''Gets data out of a single region

    :param region: image region handed to tesseract (after optional
        optimisation).
    :param rerun: incrementing integer for every rerun.
    :param image_optimizer: callback, takes PIL.Image and rerun as params.
    :param data_optimizer: callback, takes String and rerun as params.
    :param threshold: callback, takes String and should return False
        for a rerun.
    :return: the (optionally optimised) OCR text of the first run accepted
        by *threshold*, or of the first run when no threshold is given.
    '''
    # Without an optimizer the region is OCR'd as is; previously
    # ``optiregion`` was unbound in that case and the call crashed.
    if image_optimizer:
        optiregion = image_optimizer(region, rerun)
    else:
        optiregion = region

    data = tesseract.image_to_string(optiregion)
    if data_optimizer:
        data = data_optimizer(data, rerun)

    if threshold and not threshold(data):
        # Result rejected: try again with the next rerun counter.
        return self.get_data(
            region, rerun + 1, image_optimizer, data_optimizer, threshold)
    return data
def __test_txt(self, image_file, expected_box_file, lang='eng'):
    """Assert that OCR boxes of a "specific" image equal the reference boxes.

    The image comes from ``tests/input/specific`` and the reference box file
    from ``tests/output/specific/tesseract``; both lists are sorted first.
    """
    image_path = os.path.join("tests", "input", "specific", image_file)
    reference_path = os.path.join("tests", "output", "specific",
                                  "tesseract", expected_box_file)

    with codecs.open(reference_path, 'r', encoding='utf-8') as reference:
        expected_boxes = self.builder.read_file(reference)
    expected_boxes.sort()

    boxes = tesseract.image_to_string(Image.open(image_path), lang=lang,
                                      builder=self.builder)
    boxes.sort()

    self.assertEqual(len(boxes), len(expected_boxes))
    for actual, expected in zip(boxes, expected_boxes):
        self.assertEqual(actual, expected)
def __test_txt(self, image_file, expected_box_file, lang='eng'):
    """Compare the boxes read from a "specific" image with stored boxes.

    Paths are built below ``tests/input/specific`` (image) and
    ``tests/output/specific/tesseract`` (reference box file).
    """
    image_path = os.path.join("tests", "input", "specific", image_file)
    reference_path = os.path.join(
        "tests", "output", "specific", "tesseract", expected_box_file
    )

    with codecs.open(reference_path, 'r', encoding='utf-8') as reference:
        expected_boxes = self.builder.read_file(reference)
    expected_boxes.sort()

    boxes = tesseract.image_to_string(
        Image.open(image_path), lang=lang, builder=self.builder)
    boxes.sort()

    self.assertEqual(len(boxes), len(expected_boxes))
    for position, expected in enumerate(expected_boxes[:len(boxes)]):
        self.assertEqual(boxes[position], expected)
def test_text(self, run_tesseract, copen, temp_dir):
    """``image_to_string`` returns the stripped content of the output file.

    The three mock arguments are presumably injected by ``patch`` decorators
    that are outside this view.
    """
    run_tesseract.return_value = (0, "")
    copen.return_value = StringIO(self._get_file_content("text"))

    with TemporaryDirectory(prefix="tess_") as workdir:
        # Make the patched temp-dir factory hand out our real directory.
        dir_context = MagicMock()
        dir_context.__enter__.return_value = workdir
        temp_dir.return_value = dir_context

        result = tesseract.image_to_string(self.image, builder=self.builder)

        self.assertEqual(result, self._get_file_content("text").strip())
        expected_kwargs = {
            "cwd": workdir,
            "lang": None,
            "flags": self.builder.tesseract_flags,
            "configs": self.builder.tesseract_configs,
        }
        run_tesseract.assert_called_once_with(
            "input.bmp", "output", **expected_kwargs)
def test_write_read(self):
    """Boxes written with the builder and read back are unchanged."""
    original_boxes = tesseract.image_to_string(
        Image.open("tests/data/test.png"), builder=self.builder)
    self.assertTrue(len(original_boxes) > 0)

    handle, tmp_path = tempfile.mkstemp()
    try:
        # Only the path is needed: the file is reopened through
        # codecs.open() to get utf-8 support.
        os.close(handle)
        with codecs.open(tmp_path, "w", encoding="utf-8") as out_file:
            self.builder.write_file(out_file, original_boxes)
        with codecs.open(tmp_path, "r", encoding="utf-8") as in_file:
            new_boxes = self.builder.read_file(in_file)

        self.assertEqual(len(new_boxes), len(original_boxes))
        for index, original in enumerate(original_boxes):
            self.assertEqual(new_boxes[index], original)
    finally:
        os.remove(tmp_path)
def test_char(self, run_tesseract, copen, temp_dir):
    """Every item returned for the "boxes" fixture is a ``builders.Box``.

    The three mock arguments are presumably injected by ``patch`` decorators
    that are outside this view.
    """
    run_tesseract.return_value = (0, "")
    copen.return_value = StringIO(self._get_file_content("boxes"))

    with TemporaryDirectory(prefix="tess_") as workdir:
        # Make the patched temp-dir factory hand out our real directory.
        dir_context = MagicMock()
        dir_context.__enter__.return_value = workdir
        temp_dir.return_value = dir_context

        result = tesseract.image_to_string(self.image, builder=self.builder)

        for item in result:
            self.assertIsInstance(item, builders.Box)

        expected_kwargs = {
            "cwd": workdir,
            "lang": None,
            "flags": self.builder.tesseract_flags,
            "configs": self.builder.tesseract_configs,
        }
        run_tesseract.assert_called_once_with(
            "input.bmp", "output", **expected_kwargs)
def __test_txt(self, image_file, expected_box_file, lang='eng'):
    """Compare OCR line boxes, including their word boxes, with a reference.

    The image is read from ``tests/data/`` and the reference from
    ``tests/tesseract/``. For each pair of boxes the word boxes must have
    matching types before the boxes themselves are compared.
    """
    image_path = "tests/data/" + image_file
    reference_path = "tests/tesseract/" + expected_box_file

    boxes = tesseract.image_to_string(Image.open(image_path), lang=lang,
                                      builder=self.builder)
    boxes.sort()

    with codecs.open(reference_path, 'r', encoding='utf-8') as reference:
        expected_boxes = self.builder.read_file(reference)
    expected_boxes.sort()

    self.assertEqual(len(boxes), len(expected_boxes))

    for actual, expected in zip(boxes, expected_boxes):
        # Index into the expected words so that a shorter reference list
        # still raises instead of being silently truncated.
        for word_index, word in enumerate(actual.word_boxes):
            self.assertEqual(type(word),
                             type(expected.word_boxes[word_index]))
        self.assertEqual(actual, expected)
def test_write_read(self):
    """Round-trip OCR boxes through ``write_file``/``read_file``."""
    source_image = Image.open("tests/data/test.png")
    original_boxes = tesseract.image_to_string(source_image,
                                               builder=self.builder)
    self.assertTrue(len(original_boxes) > 0)

    (descriptor, tmp_path) = tempfile.mkstemp()
    try:
        # The raw descriptor is not used; codecs.open() is required for
        # utf-8 support, so the file is reopened by path.
        os.close(descriptor)

        with codecs.open(tmp_path, 'w', encoding='utf-8') as writer:
            self.builder.write_file(writer, original_boxes)
        with codecs.open(tmp_path, 'r', encoding='utf-8') as reader:
            new_boxes = self.builder.read_file(reader)

        self.assertEqual(len(new_boxes), len(original_boxes))
        for index, original in enumerate(original_boxes):
            self.assertEqual(new_boxes[index], original)
    finally:
        os.remove(tmp_path)
def test_text_non_rgb_image(self, run_tesseract, copen, temp_dir):
    """This tests that image_to_string works with non RGB mode images and
    that image is converted in function."""
    grey_image = self.image.convert("L")
    run_tesseract.return_value = (0, "")
    copen.return_value = StringIO(self._get_file_content("text"))

    with TemporaryDirectory(prefix="tess_") as workdir:
        # Make the patched temp-dir factory hand out our real directory.
        dir_context = MagicMock()
        dir_context.__enter__.return_value = workdir
        temp_dir.return_value = dir_context

        result = tesseract.image_to_string(grey_image, builder=self.builder)

        self.assertEqual(result, self._get_file_content("text").strip())
        expected_kwargs = {
            "cwd": workdir,
            "lang": None,
            "flags": self.builder.tesseract_flags,
            "configs": self.builder.tesseract_configs,
        }
        run_tesseract.assert_called_once_with(
            "input.bmp", "output", **expected_kwargs)
def get_data(self, region, rerun=0, image_optimizer=None,
             data_optimizer=None, threshold=None):
    '''Gets data out of a single region

    :param region: image region to OCR.
    :param rerun: incrementing integer for every rerun.
    :param image_optimizer: callback, takes PIL.Image and rerun as params.
    :param data_optimizer: callback, takes String and rerun as params.
    :param threshold: callback, takes String and should return False
        for a rerun.
    :return: the OCR text (after *data_optimizer*, if given) of the first
        run that *threshold* accepts.
    '''
    # Fall back to the untouched region when no optimizer is supplied;
    # the name used to be unbound in that case (UnboundLocalError).
    optiregion = image_optimizer(region, rerun) if image_optimizer else region

    data = tesseract.image_to_string(optiregion)
    if data_optimizer:
        data = data_optimizer(data, rerun)

    if threshold and not threshold(data):
        # The threshold rejected this result: rerun with the next counter.
        return self.get_data(region, rerun + 1, image_optimizer,
                             data_optimizer, threshold)
    return data
def test_digits(self, run_tesseract, copen, temp_dir):
    """Every element of the result for the "digits" fixture converts to int.

    The three mock arguments are presumably injected by ``patch`` decorators
    that are outside this view.
    """
    run_tesseract.return_value = (0, "")
    copen.return_value = StringIO(self._get_file_content("digits"))

    with TemporaryDirectory(prefix="tess_") as workdir:
        # Make the patched temp-dir factory hand out our real directory.
        dir_context = MagicMock()
        dir_context.__enter__.return_value = workdir
        temp_dir.return_value = dir_context

        # Create an empty output file inside the working directory.
        output_path = os.path.join(workdir, "output.txt")
        with open(output_path, "w") as fh:
            fh.write("")

        result = tesseract.image_to_string(self.image, builder=self.builder)

        for digit in result:
            self.assertIsInstance(int(digit), int)

        expected_kwargs = {
            "cwd": workdir,
            "lang": None,
            "flags": self.builder.tesseract_flags,
            "configs": self.builder.tesseract_configs,
        }
        run_tesseract.assert_called_once_with(
            "input.bmp", "output", **expected_kwargs)
def get_text(img_byte):
    """Recognise the code shown in an image given as raw bytes.

    Returns the recognised text with spaces removed when it is exactly four
    characters long, otherwise the string ``"0"``.
    """
    im = Image.open(BytesIO(img_byte))

    # Remove the background colour: every channel value becomes 0 or 255.
    im = im.point(lambda level: 255 if level > 180 else 0)

    # Remove interference lines: a fully black pixel is replaced by the pixel
    # directly above it (the top row is replaced by itself).
    width, height = im.size
    pixels = im.load()
    for x in range(width):
        for y in range(height):
            if pixels[x, y][:3] == (0, 0, 0):
                pixels[x, y] = pixels[x, max(y - 1, 0)]

    im = ImageOps.invert(im).convert("1")
    old_width, old_height = im.size
    im.thumbnail((old_width*0.7, old_height*0.7))

    # Column sums of the binarised image, printed for inspection.
    print(np.array(im).sum(axis=0))

    # Crop the area that contains the code.
    region = im.crop((36, 1, 105, 34))

    # OCR restricted to digits, page segmentation mode 7.
    builder = tesseract.builders.DigitBuilder()
    digits_address = os.path.join(os.getcwd(), 'config/digits')
    print("digits_address -> ", digits_address)
    builder.tesseract_configs = ['-psm', '7', digits_address]
    result = tesseract.image_to_string(region, 'eng', builder)

    code_text = result.replace(' ', '')
    print("out text -> ", code_text)
    return str(code_text) if len(code_text) == 4 else "0"
def get_text(img_byte, card_modul):
    """Recognise the card number in an image given as raw bytes.

    ``card_modul`` is read as a mapping with the keys ``typeId``, ``name``
    and ``cut``; ``cut`` is passed to ``Image.crop``. Returns the OCR result
    with spaces removed.

    NOTE(review): ``region`` is only assigned for ``typeId`` 1, 2 or 3; any
    other value fails with an unbound local. The source this was recovered
    from had lost its indentation, so confirm whether the invert/convert
    step is meant for ``typeId == 3`` only (as laid out here) or for every
    type.
    """
    im = Image.open(BytesIO(img_byte))
    print(card_modul['typeId'], card_modul['name'])
    if card_modul['typeId'] == 1:
        # Crop
        region = im.crop(card_modul["cut"])
    elif card_modul['typeId'] == 2:
        # Crop
        region = im.crop(card_modul["cut"])
    elif card_modul['typeId'] == 3:
        # Crop
        region = im.crop(card_modul["cut"])
        region = ImageOps.invert(region).convert("L")
    # Opens the cropped region in an external image viewer.
    region.show()
    # OCR restricted to digits, page segmentation mode 7.
    builder = tesseract.builders.DigitBuilder()
    digits_address = os.path.join(os.getcwd(), 'config/digits')
    builder.tesseract_configs = ['-psm', '7', digits_address]
    result = tesseract.image_to_string(region, 'eng', builder)
    return result.replace(' ', '')
def extract(image_file, spellchecker=None):
    """OCR a German-language image and return cleaned text plus e-mails.

    Each word found in the OCR text is kept unchanged when it is a number or
    a single character. Every other word is passed (lower-cased) to
    *spellchecker*; an accepted correction replaces the word, keeping the
    capitalisation of the first letter. Words without a correction (or when
    no spellchecker is given) are kept with a trailing ``?``.

    :param image_file: path of the image to read.
    :param spellchecker: optional callable taking a lower-case word and
        returning a correction string, a sequence of suggestions, or a falsy
        value.
    :return: tuple ``(clean_text, emails)`` - the cleaned words joined by
        spaces and the list of e-mail addresses found in the raw OCR text.
    """
    text = pytesseract.image_to_string(Image.open(image_file), lang="deu")

    clean_text = []
    for word in re.findall(r"\w+", text):
        word = word.strip()
        if not word:
            continue
        # Numbers and single characters are never spell-checked.
        if word.isdigit() or len(word) == 1:
            clean_text.append(word)
            continue

        correction = None
        if spellchecker:
            correction = spellchecker(word.lower())
            if correction and not isinstance(correction, str):
                # some spellcheckers return a list of suggestions -> use
                # first suggestion
                correction = correction[0]

        if correction:
            if word[0].isupper():
                # keep capitalization of first char.
                correction = correction[0].upper() + correction[1:]
            clean_text.append(correction)
        else:
            clean_text.append(word + "?")

    # The pattern must be a raw string: in a normal literal "\b" is a
    # backspace character, so the old pattern never matched a real address.
    email_pattern = r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b"
    emails = []
    for email in re.findall(email_pattern, text,
                            flags=re.UNICODE | re.IGNORECASE):
        _log.debug("email: %s", email)
        emails.append(email)

    return " ".join(clean_text), emails
# Minimal pyocr example: OCR one image file and print the result.
from pyocr import tesseract
from PIL import Image

# The image is expected in the current working directory.
imagefile = Image.open('code.jpg')
# No builder is given, so pyocr's default builder is used.
val = tesseract.image_to_string(imagefile)
print(val)
def scanImage(self, img):
    """Return the text tesseract recognises in the image file *img*.

    :param img: path (or file object) accepted by ``Image.open``.
    :return: the recognised text, or the string ``"FAIL"`` when
        ``tesseract.is_available()`` reports that tesseract cannot be used.
    """
    if not tesseract.is_available():
        return "FAIL"
    # Close the image file as soon as OCR is done instead of leaking it.
    with Image.open(img) as image:
        return tesseract.image_to_string(image)
def getText(filename):
    """Return the text tesseract recognises in the image file *filename*."""
    # The context manager releases the file handle once OCR has finished.
    with Image.open(filename) as img:
        text = tesseract.image_to_string(img)
    return text
# OCR a fixed list of image files and print the text found in each.
from PIL import Image
from pyocr import tesseract

pic_list = ['pic1.png', 'pic2.png']
for i in pic_list:
    im = Image.open(i)
    im = im.convert('L')  # convert the picture to a greyscale image
    # Save the converted picture; the same file name is reused for every
    # input, so only the last conversion remains on disk.
    im.save("temp.png")
    code = tesseract.image_to_string(im)
    print(code)
# Script: OCR one report image, first through pyocr, then through
# pytesseract with a custom configuration, and print the second result.
import pytesseract as tess
import pytesseract as Output  # second alias of the same module
from PIL import Image
import cv2 as cv
import re

# Optional: point pytesseract at the tesseract executable explicitly.
tess.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'
from pyocr.tesseract import image_to_string

# pyocr pass; the result is stored in ``text`` but not used further below.
text = image_to_string(
    Image.open(r'C:\Users\Admin\projects\webapp\media\report\images\med1.png'),
    lang='eng')
# NOTE(review): ``cv.cv2`` only exists in some opencv-python builds - confirm
# against the installed version.
img = cv.cv2.imread(
    r'C:\Users\Admin\projects\webapp\media\report\images\med1.png')
# NOTE(review): tessedit_char_whitelist takes a set of characters, so this
# presumably restricts output to the letters of "medical" - confirm intent.
custom_config = r'-c tessedit_char_whitelist=medical --psm 6'
print(tess.pytesseract.image_to_string(img, config=custom_config))
# Earlier experiments, kept for reference:
#TESSERACT_CMD = 'tesseract.exe' if os.name == 'nt' else 'tesseract'
#m = Image.open('c:/Users/Admin/projects/webapp/media/report/images/1.png')
#text = pytesseract.image_to_string('m')
import numpy as np
import cv2
from PIL import Image
import sys
import pyocr.tesseract as tess
import pyocr.builders
import time
import os, subprocess
import shlex

ts = time.time()
# OCR the sample image through pyocr; ``txt`` is a Python string.
txt = tess.image_to_string(Image.open('digit_sample.jpg'), lang='eng',
                           builder=pyocr.builders.TextBuilder())
#print(txt)
#print(time.time() - ts)


def image_to_string(img, cleanup=True, plus=''):
    """Run the ``tesseract`` command line tool on *img* and return its text.

    Tesseract writes its result to ``<img>.txt``; that file is read and its
    stripped content returned.

    :param img: path of the image file.
    :param cleanup: when true, the generated text file is deleted after it
        has been read.
    :param plus: extra command line arguments for tesseract, as one string.
    :return: the recognised text.
    """
    # An argument list (no shell) keeps file names with spaces or shell
    # metacharacters from being interpreted, and ``call`` waits for
    # tesseract to finish - ``os.popen`` returned before the output file was
    # necessarily written.
    command = ['tesseract', img, img] + shlex.split(plus)
    subprocess.call(command)

    output_path = img + '.txt'
    try:
        # A failed tesseract run shows up here as a missing output file.
        with open(output_path, 'r') as f:
            text = f.read().strip()
    finally:
        if cleanup and os.path.exists(output_path):
            os.remove(output_path)
    return text
from pyocr import builders
# ``tesseract`` is used throughout this script but was never imported.
from pyocr import tesseract
from PIL import Image, ImageEnhance, ImageFilter


def test_text(image_file, lang='eng'):
    """Print *image_file* and return the digits tesseract finds in it."""
    # Single-argument ``print(...)`` works on both Python 2 and Python 3.
    print(image_file)
    return tesseract.image_to_string(
        Image.open(image_file),
        lang=lang,
        builder=tesseract.DigitBuilder())


print(test_text('./123.png'))
print(Image.open('./123.png'))
print(tesseract.image_to_string(Image.open('./11.jpg'), lang='eng'))

# Pre-process the captcha: median filter, double the contrast, then reduce
# to a 1-bit image.
image_name = "./123.png"
im = Image.open(image_name)
im = im.filter(ImageFilter.MedianFilter())
enhancer = ImageEnhance.Contrast(im)
im = enhancer.enhance(2)
im = im.convert('1')
#im.show()

# Layout of the digits, all values in pixels.
s = 12  # start position of first number
w = 10  # width of each number
h = 15  # end position from top
t = 2   # start position of top
im_new = []
# split four numbers in the picture
except ValueError as e: continue # Use wand to convert each page in the PDF into an image blob # Loop over blobs for img in image_jpeg.sequence: img_page = Image(image=img) # append as a blob into the req_image list req_image.append(img_page.make_blob('jpeg')) # run OCR over the image blobs txt = "" for img in req_image: try: txt += tool.image_to_string( PI.open(io.BytesIO(img)), lang=lang, builder=pyocr.builders.TextBuilder()) error_flag = False except OSError as e: error_flag = True print("#####################") print("Error: txt") print("#####################") final_text.append(txt) if error_flag: continue print("#####################") print("TEXT FOUND:") print(final_text)