def generateProcessedFiles(splittedName):
    """Run Tesseract OCR (Romanian) on an image, dump its text, clean the
    generated .box file and return the parsed character bounding boxes.

    :param splittedName: (basename, extension) pair, e.g. ('img', 'jpg')
    :return: list of 6-element box rows [char, x1, y1, x2, y2, page]
    """
    # Rebuild the image file name from its split parts.
    fullName = splittedName[0] + '.' + splittedName[1]

    # Plain-text OCR output, written next to the other intermediates.
    imgText = pt.image_to_string(Image.open(fullName), lang='ron')
    with open('../TextIntermediar/' + splittedName[0] + "text.txt", 'w') as f:
        f.write(imgText)

    # Produce the per-character bounding boxes (<base>output.box).
    pt.run_tesseract(fullName, splittedName[0] + 'output',
                     lang='ron', boxes=True, config="hocr")

    # Keep only lines starting with an alphanumeric char or parentheses.
    with open(splittedName[0] + 'output.box', 'r+') as f:
        buf = ''
        for line in f:
            if line[0].isalnum() or line[0] == '(' or line[0] == ')':
                buf += line
        f.seek(0)
        f.write(buf)
        # BUGFIX: the filtered buffer is shorter than the original file;
        # without truncate() stale trailing lines survived the rewrite.
        f.truncate()

    # Parse the cleaned box file into coordinate rows.
    boxes = []
    with open(splittedName[0] + 'output.box', 'r') as f:
        reader = csv.reader(f, delimiter=' ')
        for row in reader:
            if (len(row) == 6):
                boxes.append(row)
    # Return the boxes instead of discarding them (backward compatible:
    # previous callers ignored the implicit None).
    return boxes
def image_coordinates(image_path):
    """Extract per-word bounding-box coordinates from an image.

    Runs Tesseract in hOCR mode on *image_path*, parses the resulting
    .hocr file with Beautiful Soup and writes one "<word> <bbox>" line
    per recognised word to a ``*_char_coord.txt`` file next to the image.

    :param image_path: path to a .jpg image
    """
    output_path = image_path[:-4]  # strip the '.jpg' extension
    text_path = image_path.replace('.jpg', '_char_coord.txt')
    # A .hocr file named after the image will be created by tesseract.
    pytesseract.run_tesseract(image_path, output_path,
                              lang=None, boxes=False, config="hocr")
    output_path = output_path + ".hocr"
    # BUGFIX: neither the coordinate file nor the .hocr handle was ever
    # closed; context managers release both deterministically.
    with open(text_path, 'w') as f, open(output_path) as hocr_file:
        image_soup = BeautifulSoup(hocr_file)
        # Raw markup string used for index arithmetic below.
        str_image_soup = str(image_soup)
        # Elements carry ids of the form word_1_<n>, so counting "word_"
        # occurrences gives the number of recognised words.
        word_count = str_image_soup.count("word_")
        for i in range(1, word_count + 1):
            str_temp = str(image_soup.find_all(id="word_1_" + str(i)))
            start_point = str_temp.find("<em>")   # index of '<' in <em>
            start_point += 4                      # first char of the word
            end_point = str_temp.find("</em>")    # index of '<' in </em>
            word = str_temp[start_point:end_point]
            start_point = str_temp.find("bbox")
            start_point += 5                      # skip past 'bbox '
            end_point = str_temp.find(";")
            bounding_box = str_temp[start_point:end_point]
            f.write(word + " " + bounding_box + "\n")
def ocr(img, mrz_mode=True, extra_cmdline_params=''):
    """Run Tesseract over *img* and return the recognised text.

    The image is dumped to a temporary BMP and the tesseract binary is
    invoked on it — a simplified adaptation of PyTesseract's
    image_to_string that works on SKImage arrays rather than PIL images.

    :param img: image array; None or zero last dimension yields ''
    :param mrz_mode: when True (default) tesseract is configured to
        recognize MRZs rather than arbitrary texts. When False, only
        `extra_cmdline_params` is passed.
    :param extra_cmdline_params: extra tesseract parameters; with
        mrz_mode=True they are appended to the MRZ configuration.
        "--oem 0" selects the legacy engine, which often beats the
        LSTM one here.
    """
    if img is None or img.shape[-1] == 0:  # Issue #34: degenerate input
        return ''

    src_path = '%s.bmp' % _tempnam()
    dst_base = '%s' % _tempnam()
    dst_path = "%s.txt" % dst_base
    try:
        # Scale [0, 1] floats to uint8 up front to avoid the lossy
        # conversion warning when writing the BMP.
        is_unit_float = (str(img.dtype).startswith('float')
                         and np.nanmin(img) >= 0 and np.nanmax(img) <= 1)
        if is_unit_float:
            img = img.astype(np.float64) * (np.power(2.0, 8) - 1) + 0.499999999
            img = img.astype(np.uint8)
        imwrite(src_path, img)

        if mrz_mode:
            # NB: Tesseract 4.0 does not seem to support tessedit_char_whitelist
            config = (
                "--psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789><"
                " -c load_system_dawg=F -c load_freq_dawg=F {}"
            ).format(extra_cmdline_params)
        else:
            config = "{}".format(extra_cmdline_params)
        pytesseract.run_tesseract(src_path, dst_base, 'txt',
                                  lang=None, config=config)

        # Python 3 needs an explicit UTF-8 decode of the result file.
        handle = (open(dst_path, encoding='utf-8')
                  if sys.version_info.major == 3 else open(dst_path))
        try:
            return handle.read().strip()
        finally:
            handle.close()
    finally:
        pytesseract.cleanup(src_path)
        pytesseract.cleanup(dst_path)
def perform_ocr():
    """OCR /app/temp/img.jpg (Lao language pack) and return the text.

    Tesseract writes its output to /app/file_ocr.txt, which is read back
    as UTF-8.

    :return: the recognised text as a unicode string
    """
    pt.run_tesseract('/app/temp/img.jpg', 'file_ocr',
                     extension=".jpg", lang='lao')
    # BUGFIX: the original appended an extra '\n' to every line even
    # though lines read from a file already end with one, doubling all
    # line breaks. Reading the file in one go preserves them as-is.
    with codecs.open("/app/file_ocr.txt", encoding="utf-8") as f:
        return f.read()
def ocr(img, config=''):
    """Run Tesseract on *img* and return the recognised text.

    Writes an intermediate tempfile and invokes the tesseract command on
    it — a simplified adaptation of PyTesseract's image_to_string for
    SKImage arrays instead of PIL images (this also sidesteps
    PyTesseract's NamedTemporaryFile auto-delete behaviour).

    :param img: image as an SKImage-compatible array
    :param config: extra command-line configuration for tesseract
    :raises pytesseract.TesseractError: when tesseract exits non-zero
    """
    in_path = '%s.bmp' % pytesseract.tempnam()
    out_base = '%s' % pytesseract.tempnam()
    out_path = "%s.txt" % out_base
    try:
        imsave(in_path, img)
        status, error_string = pytesseract.run_tesseract(
            in_path, out_base, lang=None, boxes=False, config=config)
        if status:
            raise pytesseract.TesseractError(
                status, pytesseract.get_errors(error_string))
        # Vietnamese output needs an explicit UTF-8 decode.
        if 'vie' in config:
            result_file = codecs.open(out_path, encoding='utf-8')
        else:
            result_file = open(out_path)
        try:
            return result_file.read().strip()
        finally:
            result_file.close()
    finally:
        pytesseract.cleanup(in_path)
        pytesseract.cleanup(out_path)
def convertimagetoalto(imagepaths, outputfilename, basename):
    """Run Tesseract in ALTO-XML mode over a list of images.

    Each image is OCR'd (English + Hindi, LSTM engine via --oem 1) and
    written to '<outputfilename>_<index>.xml'.

    :param imagepaths: iterable of input image paths
    :param basename: unused here; kept for interface compatibility
    """
    # enumerate() replaces the hand-maintained index counter; the unused
    # conf_data local is dropped (run_tesseract's return was discarded).
    for index, imagepath in enumerate(imagepaths):
        pytesseract.run_tesseract(
            imagepath,
            output_filename_base=outputfilename + '_' + str(index),
            lang='eng+hin',
            extension='xml',
            config='alto --oem 1')
def ocr(img, mrz_mode=True, extra_cmdline_params=''):
    """Runs Tesseract on a given image.

    Writes an intermediate tempfile and then runs the tesseract command
    on the image. This is a simplified modification of image_to_string
    from PyTesseract, adapted to SKImage rather than PIL.

    :param img: image as an SKImage-compatible array
    :param mrz_mode: when True (default) tesseract is configured to
        recognize MRZs rather than arbitrary texts
    :param extra_cmdline_params: extra parameters passed to tesseract
    :return: recognised text, stripped of surrounding whitespace
    """
    input_file_name = '%s.bmp' % _tempnam()
    output_file_name_base = '%s' % _tempnam()
    output_file_name = "%s.txt" % output_file_name_base
    try:
        imsave(input_file_name, img)
        if mrz_mode:
            # NB: Tesseract 4.0 does not seem to support tessedit_char_whitelist
            config = ("--psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789><"
                      " -c load_system_dawg=F -c load_freq_dawg=F {}").format(
                          extra_cmdline_params)
        else:
            # BUGFIX: extra_cmdline_params was silently dropped when
            # mrz_mode=False (config was hard-coded to ""); pass it
            # through as the parameter's documentation promises.
            config = "{}".format(extra_cmdline_params)
        pytesseract.run_tesseract(input_file_name, output_file_name_base,
                                  'txt', lang=None, config=config)
        # Python 3 needs an explicit UTF-8 decode of the result file.
        if sys.version_info.major == 3:
            f = open(output_file_name, encoding='utf-8')
        else:
            f = open(output_file_name)
        try:
            return f.read().strip()
        finally:
            f.close()
    finally:
        pytesseract.cleanup(input_file_name)
        pytesseract.cleanup(output_file_name)
def ocr(img, mrz_mode=True):
    """Run Tesseract over *img* and return the text it recognised.

    The image is dumped to a temporary BMP and the tesseract binary is
    invoked on it — a simplified adaptation of PyTesseract's
    image_to_string for SKImage arrays instead of PIL images (this also
    sidesteps PyTesseract's NamedTemporaryFile auto-delete behaviour).

    :param mrz_mode: when True (default) tesseract is configured for
        machine-readable zones rather than arbitrary text
    :raises pytesseract.TesseractError: when tesseract exits non-zero
    """
    src = '%s.bmp' % pytesseract.tempnam()
    out_base = '%s' % pytesseract.tempnam()
    out_txt = "%s.txt" % out_base
    try:
        imsave(src, img)
        config = ("-psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789>< -c load_system_dawg=F -c load_freq_dawg=F"
                  if mrz_mode else None)
        status, error_string = pytesseract.run_tesseract(
            src, out_base, lang=None, boxes=False, config=config)
        if status:
            raise pytesseract.TesseractError(
                status, pytesseract.get_errors(error_string))
        result_file = open(out_txt)
        try:
            return result_file.read().strip()
        finally:
            result_file.close()
    finally:
        pytesseract.cleanup(src)
        pytesseract.cleanup(out_txt)
def save_and_ocr(myScreenshot, part):
    """Save a screenshot region, OCR it and post-process the output.

    Applies successive contour filters depending on what the OCR pass
    finds, recursively re-captures when OCR comes back empty, and tracks
    whether the 'details' section has been passed via the module-global
    ``after_details``. ``i`` is the module-global page/region counter.

    NOTE(review): the nesting below was reconstructed from collapsed
    source — verify branch membership against the original file.

    :param myScreenshot: BGR image (numpy array) of the captured region
    :param part: section id as a string ("0", "2", ...) steering filtering
    """
    global after_details
    global i
    myScreenshot = change_colors(myScreenshot)
    if part == "0":
        myScreenshot = advance_contour_filtering(myScreenshot, "")
    cv2.imwrite('output/filename' + part + '_' + str(i) + '.png', myScreenshot)
    if part == "0":
        # First OCR pass decides which contour filter to apply next.
        datails_string = pytesseract.image_to_string(
            Image.open('output/filename' + part + '_' + str(i) + '.png'),
            config='-c load_system_dawg=false load_freq_dawg=false',
            lang="fra+eng").lower()
        # Misspellings of "detail" are matched on purpose (OCR noise).
        if any(x in datails_string for x in ["detail", "detal", "detai"]):
            after_details = True
            myScreenshot = advance_contour_filtering(myScreenshot, "detail")
            cv2.imwrite('output/filename' + part + '_' + str(i) + '.png',
                        myScreenshot)
        if any(x in datails_string for x in ["page"]):
            if after_details == False:
                myScreenshot = advance_contour_filtering(myScreenshot, "group")
            else:
                myScreenshot = advance_contour_filtering(myScreenshot, "page")
            cv2.imwrite('output/filename' + part + '_' + str(i) + '.png',
                        myScreenshot)
        if after_details == False and any(x in datails_string for x in ["group"]):
            # A first line ending in ':' marks a group header.
            if datails_string.split('\n', 1)[0].endswith(':'):
                myScreenshot = advance_contour_filtering(myScreenshot, "group")
                cv2.imwrite('output/filename' + part + '_' + str(i) + '.png',
                            myScreenshot)
        if datails_string == "":
            # Empty OCR result: retry with the "empty" filter, then re-OCR.
            myScreenshot = advance_contour_filtering(myScreenshot, "empty")
            cv2.imwrite('output/filename' + part + '_' + str(i) + '.png',
                        myScreenshot)
            datails_string = pytesseract.image_to_string(
                Image.open('output/filename' + part + '_' + str(i) + '.png'),
                config='-c load_system_dawg=false load_freq_dawg=false',
                lang="fra+eng").lower()
            if datails_string == "" and i > 1:
                # Still empty: scroll up until the break-line marker is
                # visible, re-capture the region and recurse, then scroll
                # back down and bail out of this invocation.
                while not pyautogui.locateOnScreen('break_line.png',
                                                   region=(230, 197, 21, 100)):
                    pyautogui.press('up')
                top3 = pyautogui.locateOnScreen('break_line.png',
                                                region=(230, 197, 21, 100))[1]
                myScreenshot = pyautogui.screenshot(region=(5, top3, 145, 50))
                myScreenshot = cv2.cvtColor(np.array(myScreenshot),
                                            cv2.COLOR_RGB2BGR)
                save_and_ocr(myScreenshot, part)
                while not pyautogui.locateOnScreen('break_line.png',
                                                   region=(230, 900, 21, 100)):
                    pyautogui.press('down')
                return
        if after_details and check_if_last_is_letter(datails_string) == False:
            # Progressive footer filters until the text ends in a letter.
            myScreenshot = advance_contour_filtering(myScreenshot, "footer1")
            cv2.imwrite('output/filename' + part + '_' + str(i) + '.png',
                        myScreenshot)
            datails_string = pytesseract.image_to_string(
                Image.open('output/filename' + part + '_' + str(i) + '.png'),
                config='-c load_system_dawg=false load_freq_dawg=false',
                lang="fra+eng").lower()
            if check_if_last_is_letter(datails_string) == False:
                myScreenshot = advance_contour_filtering(
                    myScreenshot, "footer2")
                cv2.imwrite('output/filename' + part + '_' + str(i) + '.png',
                            myScreenshot)
                datails_string = pytesseract.image_to_string(
                    Image.open('output/filename' + part + '_' + str(i) + '.png'),
                    config='-c load_system_dawg=false load_freq_dawg=false',
                    lang="fra+eng").lower()
                if check_if_last_is_letter(datails_string) == False:
                    myScreenshot = advance_contour_filtering(
                        myScreenshot, "footer3")
                    cv2.imwrite(
                        'output/filename' + part + '_' + str(i) + '.png',
                        myScreenshot)
    if part == "2":
        # hOCR output, converted to HTML with '_O' -> '_0' OCR fixups.
        pytesseract.run_tesseract(
            'output/filename' + part + '_' + str(i) + '.png',
            'output/filename' + part + '_' + str(i),
            lang="fra+eng",
            extension='hocr',
            config='-c load_system_dawg=false load_freq_dawg=false')
        files_were_deleted = False
        hocr2html.main('output/filename' + part + '_' + str(i))
        file_name = 'output/filename' + part + '_' + str(i) + ".html"
        txt_file = open(file_name, 'r', encoding="utf8")
        txt_datas = txt_file.readlines()
        with open(file_name, 'w', encoding="utf8") as f:
            for j, txt_data in enumerate(txt_datas):
                f.write("%s\n" % txt_data.replace('_O', '_0'))
    else:
        # Plain-text output, cleaned afterwards by clean_file().
        pytesseract.run_tesseract(
            'output/filename' + part + '_' + str(i) + '.png',
            'output/filename' + part + '_' + str(i),
            lang="fra+eng",
            extension='',
            config='-c load_system_dawg=false load_freq_dawg=false')
        files_were_deleted = clean_file('output/filename' + part + '_' +
                                        str(i) + '.txt')
    # Remove the (possibly shifted) intermediate PNG.
    if not files_were_deleted:
        png_index = i
    else:
        png_index = i + 1
    os.remove('output/filename' + part + '_' + str(png_index) + '.png')
# NOTE(review): fragment of a larger text-segmentation routine — idx, x,
# y, w, h, img, seg, output_path, img_out_name, tesseract_config etc.
# come from the enclosing (not visible) loop/scope; the first statements
# presumably run once per detected text region — confirm against the
# full file.

# Label the detected text region with its index.
cv.putText(text_img, str(idx), (x, y), color=1,
           fontFace=cv.FONT_HERSHEY_PLAIN, fontScale=1)
# Pad the crop by up to 5px, clamped at the image border.
padding_y, padding_x = y - 5 if y > 5 else y, x - 5 if x > 5 else x
save_img(
    normalize_img(img[padding_y:y + h, padding_x:x + w], 0, 255),
    "{}/{}/tess/{}__{}.png".format(output_path, seg, img_out_name, idx))
# OCR the padded crop into the .../tess/text/ directory.
run_tesseract(
    input_filename="{}/{}/tess/{}__{}.png".format(
        output_path, seg, img_out_name, idx),
    output_filename_base="{}/{}/tess/text/{}__{}".format(
        output_path, seg, img_out_name, idx),
    extension="png",
    lang=args.lang,
    config=tesseract_config,
    nice=0)
text_cont += 1
cv.rectangle(text_img, (x, y), (x + w, y + h), color=1, thickness=1)
save_img(img7c, "{}/{}/{}_step_7.pbm".format(output_path, seg, img_out_name))
print("Found {} text parts".format(text_cont))
# Collect black/white pixel positions and start inverting the mask
# (white_pixels is presumably consumed just past this chunk — confirm).
black_pixels = np.where(text_img == BLACK)
white_pixels = np.where(text_img == WHITE)
text_img[black_pixels] = WHITE
def YaxisData(self):
    """OCR the y-axis image and return ``[[value, y_center], ...]``.

    Runs Tesseract in box mode over the y-axis strip, groups adjacent
    character boxes into numbers using the tallest character as the gap
    threshold, drops groups containing non-numeric characters, and pairs
    each number with the vertical centre of its last character box.

    Returns an empty list when OCR fails or the box file is unusable.
    """
    try:
        # '-psm 3 nobatch digits': automatic page segmentation with the
        # digits wordlist; boxes land in /tmp/y_temp.box.
        pt.run_tesseract(yAxisFile, '/tmp/y_temp', None, True,
                         '-psm 3 nobatch digits')
        filename = '/tmp/y_temp.box'
        yData = open(filename, "r")
        d = yData.read()
        yData.close()
        data_lines = d.split('\n')
        del d
        # Each entry: [non_numeric_flag, char, x1, y1, x2, y2]
        data = []
        for line in data_lines:
            l = []
            x = line.split(" ")
            if(len(x) == 6):
                if(x[0].isdigit() or x[0] == '.' or x[0] == '-'):
                    l.append(0)
                else:
                    l.append(1)
                l.append(x[0])
                l.append(int(x[1]))
                l.append(int(x[2]))
                l.append(int(x[3]))
                l.append(int(x[4]))
                data.append(l)
        # we need to take max distance between 2 and 4 (char height)
        max = 0
        for line in data:
            if(abs(line[5] - line[3]) > max):
                max = abs(line[5] - line[3])
        # Split characters into number groups wherever the vertical gap
        # between consecutive boxes exceeds the tallest character.
        l = []
        new_data = []
        for i in range(len(data) - 1):
            l.append(data[i])
            if(abs(data[i][3] - data[i + 1][5]) > max):
                new_data.append(list(l))
                l[:] = []
        l.append(data[-1])
        new_data.append(list(l))
        # Drop groups that contain any non-numeric character (iterate a
        # copy so removal during the scan is safe).
        for data in list(new_data):
            for d in data:
                if(d[0] == 1):
                    new_data.remove(data)
                    break
        # Join each group's characters into a float and pair it with the
        # vertical centre of the group's last character box.
        new_new_data = []
        for data in new_data:
            l = []
            s = ""
            for d in data:
                s = s + d[1]
            l.append(float(s))
            avg = (data[-1][5] + data[-1][3])/2
            l.append(avg)
            new_new_data.append(l)
        return new_new_data
    # BUGFIX(narrowed): a bare 'except:' also swallowed SystemExit and
    # KeyboardInterrupt; Exception keeps the best-effort behaviour while
    # letting interpreter-exit signals propagate.
    except Exception:
        return []
def XaxisData(self):
    """OCR the x-axis image and return ``[[value, x_center], ...]``.

    Runs Tesseract in box mode over the x-axis strip, groups adjacent
    character boxes into numbers using the widest character as the gap
    threshold, drops groups containing non-numeric characters, and pairs
    each number with the horizontal centre of its group.

    Returns an empty list when OCR fails or the box file is unusable.
    """
    try:
        # '-psm 7': single text line; digits wordlist. Boxes land in
        # /tmp/x_temp.box.
        pt.run_tesseract(xAxisFile, '/tmp/x_temp', None, True,
                         '-psm 7 nobatch digits')
        filename = '/tmp/x_temp.box'
        xData = open(filename, "r")
        d = xData.read()
        xData.close()
        data_lines = d.split('\n')
        del d
        # Each entry: [non_numeric_flag, char, x1, y1, x2, y2]
        data = []
        for line in data_lines:
            l = []
            x = line.split(" ")
            if(len(x) == 6):
                if(x[0].isdigit() or x[0] == '.' or x[0] == '-'):
                    l.append(0)
                else:
                    l.append(1)
                l.append(x[0])
                l.append(int(x[1]))
                l.append(int(x[2]))
                l.append(int(x[3]))
                l.append(int(x[4]))
                data.append(l)
        # here first data being 0 means first is a number
        # Find the maximum character width (distance between cols 2 and 4).
        max = 0
        for line in data:
            if(abs(line[4] - line[2]) > max):
                max = abs(line[4] - line[2])
        # Split characters into number groups at large horizontal gaps.
        l = []
        new_data = []
        for i in range(len(data) - 1):
            l.append(data[i])
            if(abs(data[i][4] - data[i + 1][2]) > max):
                new_data.append(list(l))
                l[:] = []
        l.append(data[-1])
        new_data.append(list(l))
        # Drop groups that contain any non-numeric character.
        for data in list(new_data):
            for d in data:
                if(d[0] == 1):
                    new_data.remove(data)
                    break
        new_new_data = []
        for data in new_data:
            l = []
            s = ""
            for d in data:
                s = s + d[1]
            l.append(float(s))
            # BUGFIX: '/' produced float list indices under Python 3,
            # raising a TypeError that the except clause silently turned
            # into []. Floor division '//' preserves the Python 2
            # semantics and works on Python 3.
            if(len(data) % 2 == 0):
                avg = (data[(len(data) - 1) // 2][2] +
                       data[(len(data) - 1) // 2][4]) / 2
            else:
                avg = (data[(len(data)) // 2 - 1][2] +
                       data[(len(data) - 1) // 2][4]) / 2
            l.append(avg)
            new_new_data.append(l)
        return new_new_data
    # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
    # propagate instead of being swallowed.
    except Exception:
        return []
import glob
import os
import shutil
import subprocess  # retained: may be used elsewhere in the file

from pytesseract import pytesseract

# Number of JPEG pages to process. Counting with glob avoids spawning a
# shell pipeline — the old "ls /tmp/test/ | grep .jpg | wc -l" also
# treated '.' as a regex wildcard in grep, so it could over-count.
test = len(glob.glob("/tmp/test/*.jpg"))

for i in range(0, test):
    vjpg = "/tmp/test/some_%d.jpg" % i
    # OCR the page into output.hocr, then keep a per-page copy;
    # shutil.copy replaces the shelled-out "cp" (no shell, portable).
    pytesseract.run_tesseract(vjpg, 'output', lang=None, boxes=False,
                              config="hocr")
    shutil.copy("output.hocr", "/tmp/test/some_%d.hocr" % i)
import sys
from pytesseract import pytesseract

# Previous invocation kept for reference:
#pytesseract.run_tesseract(sys.argv[1], 'output', lang=None, boxes=False, config="hocr")

# Echo the input path, then OCR it (Japanese) with box/hOCR output under
# the base name 'output'.
image_path = sys.argv[1]
print(image_path)
pytesseract.run_tesseract(image_path, 'output', 'box', lang='jpn', config="hocr")
Get the coordinates of the bounding box

    Args:
        element_arr(list): List of bs4.element.Tag objects

    Returns:
        coordinates(list): 2D array containing coordinates in the form [x0, y0, x1, y1]
    """
    # Each hOCR element's title attribute looks like
    # "bbox x0 y0 x1 y1; ..."; keep the first clause only.
    title_atrs = [element["title"].split(";") for element in element_arr]
    # Drop the leading "bbox" token, keep the four coordinate strings.
    coordinates = [atr_value[0].split(" ")[1:] for atr_value in title_atrs]
    coordinates = [[int(x) for x in coordinate_arr] for coordinate_arr in coordinates]
    return coordinates


filename = 'test.png'
# Run Tesseract (Yiddish) in hOCR mode; output lands in output.hocr.
pt.run_tesseract(filename, 'output', lang="yid", extension="box", config="hocr")
hocr = open("output.hocr", "r", encoding="utf-8").read()

#extract coordinate information from hocr
soup = BeautifulSoup(hocr, "html.parser")
# Word, line and paragraph elements carry their own hOCR classes.
words = soup.find_all('span', class_='ocrx_word')
word_coordinates = get_coordinates(words)
lines = soup.find_all('span', class_='ocr_line')
line_coordinates = get_coordinates(lines)
paragraphs = soup.find_all('p', class_='ocr_par')
paragraph_coordinates = get_coordinates(paragraphs)

# Draw the bounding box
img = cv2.imread(filename)
def XaxisData(self):
    """OCR the x-axis image and return ``[[value, x_center], ...]``.

    Runs Tesseract in box mode over the x-axis strip, groups adjacent
    character boxes into numbers using the widest character as the gap
    threshold, drops groups containing non-numeric characters, and pairs
    each number with the horizontal centre of its group.

    Returns an empty list when OCR fails or the box file is unusable.
    """
    try:
        # '-psm 7': single text line; digits wordlist.
        pt.run_tesseract(xAxisFile, '/tmp/x_temp', None, True,
                         '-psm 7 nobatch digits')
        filename = '/tmp/x_temp.box'
        xData = open(filename, "r")
        d = xData.read()
        xData.close()
        data_lines = d.split('\n')
        del d
        # Each entry: [non_numeric_flag, char, x1, y1, x2, y2]
        data = []
        for line in data_lines:
            l = []
            x = line.split(" ")
            if (len(x) == 6):
                if (x[0].isdigit() or x[0] == '.' or x[0] == '-'):
                    l.append(0)
                else:
                    l.append(1)
                l.append(x[0])
                l.append(int(x[1]))
                l.append(int(x[2]))
                l.append(int(x[3]))
                l.append(int(x[4]))
                data.append(l)
        # here first data being 0 means first is a number
        # Find the maximum character width (distance between cols 2 and 4).
        max = 0
        for line in data:
            if (abs(line[4] - line[2]) > max):
                max = abs(line[4] - line[2])
        # Split characters into number groups at large horizontal gaps.
        l = []
        new_data = []
        for i in range(len(data) - 1):
            l.append(data[i])
            if (abs(data[i][4] - data[i + 1][2]) > max):
                new_data.append(list(l))
                l[:] = []
        l.append(data[-1])
        new_data.append(list(l))
        # Drop groups that contain any non-numeric character.
        for data in list(new_data):
            for d in data:
                if (d[0] == 1):
                    new_data.remove(data)
                    break
        new_new_data = []
        for data in new_data:
            l = []
            s = ""
            for d in data:
                s = s + d[1]
            l.append(float(s))
            # BUGFIX: '/' produced float list indices under Python 3,
            # raising a TypeError that the except clause silently turned
            # into []. Floor division '//' preserves the Python 2
            # semantics and works on Python 3.
            if (len(data) % 2 == 0):
                avg = (data[(len(data) - 1) // 2][2] +
                       data[(len(data) - 1) // 2][4]) / 2
            else:
                avg = (data[(len(data)) // 2 - 1][2] +
                       data[(len(data) - 1) // 2][4]) / 2
            l.append(avg)
            new_new_data.append(l)
        return new_new_data
    # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
    # propagate instead of being swallowed.
    except Exception:
        return []
def upload():
    """Flask endpoint: accept an uploaded image, OCR it, mark the
    recognised boxes on the analysis mask and return a JSON result.

    NOTE(review): Python 2 code (print statements, binary csv open);
    the nesting below was reconstructed from collapsed source.
    """
    #below checks should be done in js, but just in case
    if "upload" not in request.files:
        return json.dumps({"status": "Upload not found"})
    f = request.files["upload"]
    if not allowed_file(f.filename):
        return json.dumps({"status": "Illegal filename"})
    filename = secure_filename(f.filename)
    f.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
    # analyze() produces the working mask drawn on below.
    mask = analyze(filename)
    print "starting tesseract"
    hocr = 'pytesseractTemp'
    text = pytesseract.run_tesseract(UPLOAD_FOLDER + '/' + filename,
                                     UPLOAD_FOLDER + '/' + hocr,
                                     lang='eng', boxes=True, config="hocr")
    # Parse the .box file produced by run_tesseract.
    boxes = []
    with open(UPLOAD_FOLDER + '/' + hocr + '.box', 'rb') as f:
        reader = csv.reader(f, delimiter = ' ')
        for row in reader:
            if(len(row)==6):
                boxes.append(row)
    # NOTE(review): both 'boxes' and 'text' are immediately overwritten
    # here, discarding the run_tesseract/.box results above — confirm
    # whether the first pass is still needed.
    boxes = pytesseract.image_to_string(
        Image.open(UPLOAD_FOLDER + '/' + filename),
        lang="eng", boxes = True).split('\n')
    text = pytesseract.image_to_string(
        Image.open(UPLOAD_FOLDER + '/' + filename), lang="eng")
    # Blank out the centre of every recognised character box on the mask.
    for b in boxes:
        #marking box
        be = b.split()
        if len(be) == 6:
            print "marking box"
            print b
            cx = int(be[1]) + int(be[3]) / 2
            cy = int(be[2]) + int(be[4]) / 2
            cv2.circle(mask, (cx, cy), 7, (0, 0, 0), -1)
    cv2.imshow('found circles', mask)
    cv2.waitKey(0)
    ret = {"status": "success"}
    ret.update(processText(text, boxes))
    #mark it out
    for b in ret["boxes"]:
        #marking box
        print "marking box"
        print b
        cx = int(b[1]) + int(b[3]) / 2
        cy = int(b[2]) + int(b[4]) / 2
        cv2.circle(mask, (cx, cy), 7, (0, 0, 0), -1)
    cv2.imshow('found circles', mask)
    cv2.waitKey(0)
    #now detect the circle (img should be blurred out)
    #edges = cv2.Canny(mask.copy(), 0, 255)
    """
    blurred = cv2.blur(mask, (5, 5))
    plt.subplot(121),plt.imshow(mask),plt.title('Original')
    plt.xticks([]), plt.yticks([])
    plt.subplot(122),plt.imshow(blurred),plt.title('Blurred')
    plt.xticks([]), plt.yticks([])
    plt.show()
    edges = cv2.medianBlur(mask, 5)
    cimg = cv2.cvtColor(edges,cv2.COLOR_GRAY2BGR)
    circles = cv2.HoughCircles(edges,cv2.HOUGH_GRADIENT,1,30,
        param1=50,param2=30,minRadius=10,maxRadius=200)
    circles = np.uint16(np.around(circles))
    for i in circles[0,:]:
        # draw the outer circle
        cv2.circle(cimg,(i[0],i[1]),i[2],(0,255,0),2)
        # draw the center of the circle
        cv2.circle(cimg,(i[0],i[1]),2,(0,0,255),3)
    cv2.imshow('detected circles',cimg)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    """
    """
    contours = cv2.findContours(mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours=contours[1]
    for cnt in contours:
        M = cv2.moments(cnt)
        cX = int(M["m10"] / M["m00"])
        cY = int(M["m01"] / M["m00"])
        # draw the contour and center of the shape on the image
        cv2.drawContours(mask, [cnt], -1, (0, 255, 0), 2)
        cv2.circle(mask, (cX, cY), 7, (255, 255, 255), -1)
        cv2.putText(mask, "center", (cX - 20, cY - 20),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
    plt.imshow(mask, cmap = 'gray', interpolation = 'bicubic')
    plt.xticks([]), plt.yticks([]) # to hide tick values on X and Y axis
    plt.show()
    """
    #cv2.imshow("Image", mask)
    #cv2.waitKey(0)
    return json.dumps(ret)
import csv
import cv2
from pytesseract import pytesseract as pt

img_path = './images/PCE28-.jpg'

# Produce the per-character box file (output.box) for the image.
pt.run_tesseract(img_path, 'output', lang=None, boxes=True, config='hocr')

chars = []
with open('output.box', encoding="utf8") as f:
    # Keep only well-formed 6-column box rows.
    boxes = [row for row in csv.reader(f, delimiter=' ') if len(row) == 6]

img = cv2.imread(img_path)
h, w = img.shape[:2]

# Box y-coordinates use a bottom-left origin; flip against image height.
for ch, x1, y1, x2, y2, _page in boxes:
    top_left = (int(x1), h - int(y1))
    bottom_right = (int(x2), h - int(y2))
    cv2.rectangle(img, top_left, bottom_right, (255, 0, 0), 2)
    cv2.putText(img, ch, top_left, cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                (0, 0, 255), 1)

cv2.imshow("result", img)
cv2.imwrite("result.jpg", img)
cv2.waitKey(0)
from PIL import Image
from pytesseract import pytesseract
import argparse
import xmltodict
import json
import cv2
import os
import requests
from puttext import puttext
from nltk.tokenize import sent_tokenize
import math

filename = '../upload/table1.png'
o_filename = '../upload/table2.png'

# OCR the table image to ALTO XML (English + Hindi, LSTM engine);
# the result is written to test.xml.
conf_data = pytesseract.run_tesseract(
    filename, output_filename_base='test', lang='eng+hin', extension='xml',
    config='alto --oem 1')

f_hin = open("test.xml", "r")
# print(xmltodict.parse(f_hin.read()))
data = xmltodict.parse(f_hin.read())
# Drill down to the text blocks of the (single) page.
blocks = data['alto']['Layout']['Page']['PrintSpace']['TextBlock']
# NOTE(review): this loop body continues past the end of this chunk.
for block in blocks:
    textline = block['TextLine']
    text = ''
    height = 0
    x = block['@HPOS']  # block position within the page
    y = block['@VPOS']
    word_count = 0
    no_lines = 0
    line_height = 0
    previous_position = 0
    previous_position_x = 0
def post(self):
    """
    Retrieve corresponding value from the test report.
    ---
    tags:
      - OCR
    parameters:
      - in: body
        name: image
        type: string
        required: true
        example: R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7
      - in: body
        name: search_terms
        type: array
        items:
          type: string
        required: true
        example: ["fasting","blood","sugar"]
    responses:
      200:
        description: A corresponding value has been extracted successfully
        properties:
          extracted_value:
            type: string
            description: Retrieved value from the report
    """
    # NOTE: the docstring above is swagger/flasgger YAML — do not reflow.
    req = request.get_json(force=True)
    if not req.get("image") or not req.get("search_terms"):
        return "Invalid image or search terms"
    # Timestamp-based name keeps concurrent requests from clobbering files.
    session_filename = utils.get_current_time()
    reader_module_path = os.path.dirname(os.path.realpath(__file__))
    # Decode the base64 payload and persist the original image.
    origin_img_base64 = req["image"]
    origin_img_fp = os.path.join(reader_module_path,
                                 'img-origin/' + session_filename + '.jpg')
    origin_img = Image.open(BytesIO(base64.b64decode(origin_img_base64)))
    origin_img.save(origin_img_fp)
    hocr_filepath = os.path.join(reader_module_path,
                                 'hocr-files/' + session_filename)
    # Produce <session>.hocr for positional search of the terms.
    pytesseract.run_tesseract(
        origin_img_fp,
        hocr_filepath,
        extension="box",
        lang=None,
        config="hocr --psm 7 tessedit_char_whitelist=0123456789")
    search_terms = tuple(req["search_terms"])
    # print(search_terms)
    try:
        # Locate the search terms in the hOCR, crop the region where the
        # value should be and hand the crop to Google Vision for reading.
        hocr_result = hocr_search.parse_hocr(search_terms,
                                             hocr_filepath + '.hocr')
        img_width, img_height = origin_img.size
        cropped_image = origin_img.crop(
            utils.calc_result_box(hocr_result, img_width))
        cropped_img_fp = os.path.join(reader_module_path, 'cropped-imgs/')
        cropped_image.save(cropped_img_fp + session_filename + ".jpg",
                           "jpeg")
        response = google_vision.get_value(cropped_img_fp +
                                           session_filename + ".jpg")
        res_detail = {"extracted_value": response}
        return (res_detail)
    except Exception as e:
        # NOTE(review): error_detail is built but never returned; the
        # abort message hides the underlying exception — confirm intent.
        error_detail = {"error": e}
        abort(500, message="Search terms did not match the document")
def YaxisData(self):
    """OCR the y-axis image and return ``[[value, y_center], ...]``.

    Runs Tesseract in box mode over the y-axis strip, groups adjacent
    character boxes into numbers using the tallest character as the gap
    threshold, drops groups containing non-numeric characters, and pairs
    each number with the vertical centre of its last character box.

    Returns an empty list when OCR fails or the box file is unusable.
    """
    try:
        # '-psm 3 nobatch digits': automatic page segmentation with the
        # digits wordlist; boxes land in /tmp/y_temp.box.
        pt.run_tesseract(yAxisFile, '/tmp/y_temp', None, True,
                         '-psm 3 nobatch digits')
        filename = '/tmp/y_temp.box'
        yData = open(filename, "r")
        d = yData.read()
        yData.close()
        data_lines = d.split('\n')
        del d
        # Each entry: [non_numeric_flag, char, x1, y1, x2, y2]
        data = []
        for line in data_lines:
            l = []
            x = line.split(" ")
            if (len(x) == 6):
                if (x[0].isdigit() or x[0] == '.' or x[0] == '-'):
                    l.append(0)
                else:
                    l.append(1)
                l.append(x[0])
                l.append(int(x[1]))
                l.append(int(x[2]))
                l.append(int(x[3]))
                l.append(int(x[4]))
                data.append(l)
        # we need to take max distance between 2 and 4 (char height)
        max = 0
        for line in data:
            if (abs(line[5] - line[3]) > max):
                max = abs(line[5] - line[3])
        # Split characters into number groups wherever the vertical gap
        # between consecutive boxes exceeds the tallest character.
        l = []
        new_data = []
        for i in range(len(data) - 1):
            l.append(data[i])
            if (abs(data[i][3] - data[i + 1][5]) > max):
                new_data.append(list(l))
                l[:] = []
        l.append(data[-1])
        new_data.append(list(l))
        # Drop groups that contain any non-numeric character (iterate a
        # copy so removal during the scan is safe).
        for data in list(new_data):
            for d in data:
                if (d[0] == 1):
                    new_data.remove(data)
                    break
        # Join each group's characters into a float and pair it with the
        # vertical centre of the group's last character box.
        new_new_data = []
        for data in new_data:
            l = []
            s = ""
            for d in data:
                s = s + d[1]
            l.append(float(s))
            avg = (data[-1][5] + data[-1][3]) / 2
            l.append(avg)
            new_new_data.append(l)
        return new_new_data
    # BUGFIX(narrowed): a bare 'except:' also swallowed SystemExit and
    # KeyboardInterrupt; Exception keeps the best-effort behaviour while
    # letting interpreter-exit signals propagate.
    except Exception:
        return []
def job():
    """Background worker: claim the first 'Pending' task, locate the
    pages matching its keyword in the pickled per-page OCR text, export
    those pages as hOCR PDFs, merge them, extract tables to xlsx and
    mark the task 'Completed'.

    NOTE(review): nesting reconstructed from collapsed source — verify
    against the original file.
    """
    # Claim the first pending task and mark it as being worked on.
    sql = db.session.query(info_table).filter(
        info_table.status == 'Pending').first()
    sql.status = 'Working'
    fileid = sql.id
    savename = sql.savenamedb
    keyworddb = sql.keyworddb
    outfilename = sql.outputfiledb
    db.session.commit()
    # Per-page recognised text was pickled earlier by the OCR stage.
    with open(outfilename + '.txt', 'rb') as fp:
        recognized_text = pickle.load(fp)
    filepath = 'remaining/' + savename
    content_text = ''
    # Enter the keyword you want to search for
    keyword = keyworddb
    key = keyword.split(' ', 1)[0]
    # First page whose text starts with the keyword.
    result = [k for k in recognized_text if k.startswith(keyword)]
    for l in result:
        str1 = ''.join(l)
        page = recognized_text.index(str1)
        print("page is:", page + 1)
        pagefound = page + 1
        break
    #Page Number loop for keyword
    print('Page Number loop for keyword')
    ls = []
    for x in range(1, len(recognized_text) + 1):
        mat = "Page " + str(x) + " of " + key
        # NOTE(review): 'res' is not defined in this function — it is
        # presumably a module-level string holding the document text;
        # confirm before refactoring.
        if res.find(mat) != -1:
            ls.append(x)
        else:
            if ls == []:
                #print("page of "+keyword+" is: "+str(pagefound))
                ls.append(pagefound)
                break
            else:
                for i in range(0, len(ls)):
                    ls[i] = int(ls[i])
                print("Number of pages for " + keyword + " is " +
                      str(max(ls)))
                break
    ls2 = []
    if len(ls) > 1:
        # Expand to the run of pages starting at the keyword page.
        for z in range(max(ls)):
            fin = z + pagefound
            z1 = str(fin)
            ls2.append(z1)
        for p1 in ls2:
            print("page of " + keyword + " is: " + p1)
    else:
        ls2 = ls
        for p1 in ls2:
            print("page of " + keyword + " is: " + str(p1))
    # Save pdf to jpg page-wise
    print('Save pdf to jpg page-wise')
    fname = os.path.splitext(os.path.basename(filepath))[0]
    pages = convert_from_path(
        filepath, 843)  # Resolution can be changed according to your use
    i = 1
    for page in pages:
        savepath = 'Images/' + fname + '_' + str(i) + '.jpeg'
        page.save(savepath, 'JPEG')
        i = i + 1
    #Save in contentdb
    print('Save in contentdb')
    for i in ls2:
        content_text = content_text + "|||" + recognized_text[int(i) - 1]
    sql = info_table.query.filter_by(id=fileid).first()
    sql.contentdb = content_text
    #Convert selected page from jpg to hOCR pdf
    print('Convert selected page from jpg to hOCR pdf')
    for i in ls2:
        currentpagepath = 'Images/' + fname + '_' + str(i) + '.jpeg'
        pdf_name = 'PDFs/' + fname + '_' + str(i)
        pytesseract.run_tesseract(currentpagepath, pdf_name,
                                  lang=None, config="hocr",
                                  extension='pdf')
    # Merge all pdfs into one
    print('Merge all pdfs into one')
    pdf_list = []
    for i in ls2:
        pdf_name = 'PDFs/' + fname + '_' + str(i) + '.pdf'
        pdf_list.append(pdf_name)
    pdf_output_name = key + '.pdf'
    merger = PdfFileMerger()
    for pdf in pdf_list:
        merger.append(open(pdf, 'rb'))
    with open(pdf_output_name, 'wb') as fout:
        merger.write(fout)
    #Parse the hOCR for Tables and save it to xlsx
    print('Parse the hOCR for Tables and save it to xlsx')
    excel_output_name = key + '.xlsx'
    c.xlsx(pdf_output_name, excel_output_name)
    sql.status = 'Completed'
    print('Status : completed And updating Jsondb')
    # Snapshot the task into the row's JSON column.
    json_string = {}
    json_string['json_data'] = []
    json_string['json_data'].append({
        'taskid': sql.id,
        'filename': sql.filenamedb,
        'doctypeid': sql.doctype,
        'accuracy': sql.accuracy,
        'keywordsearched': sql.keyworddb,
        'content': sql.contentdb
    })
    sql.jsondb = json_string
    db.session.commit()
import sys
import csv
import cv2
from pytesseract import pytesseract as pt

# extension ?
#pt.run_tesseract(sys.argv[1], 'output', lang=None, boxes=True, config="hocr")
pt.run_tesseract(sys.argv[1], 'output', lang=None, config="hocr")

# To read the coordinates
boxes = []
# BUGFIX: csv.reader needs a text-mode file on Python 3 ('rb' yields
# bytes and raises); newline='' is the csv-documented way to open.
with open('output.box', 'r', newline='') as f:
    reader = csv.reader(f, delimiter=' ')
    for row in reader:
        if (len(row) == 6):
            boxes.append(row)

# Draw the bounding box (box y-coords use a bottom-left origin).
img = cv2.imread('bw.png')
h, w, _ = img.shape
for b in boxes:
    img = cv2.rectangle(img, (int(b[1]), h - int(b[2])),
                        (int(b[3]), h - int(b[4])), (255, 0, 0), 2)
cv2.imshow('output', img)
# BUGFIX: without a waitKey the HighGUI window is never actually
# rendered before the script exits.
cv2.waitKey(0)