def id_tabs(img, avg_line_height=20, line_blur=20, tab_wiggle_room=5, disp=False):
    """
    Attempts to identify the indent level of each line of text, with the
    assumption that the first line is at level 0.

    :param img: the input image (should contain text)
    :param avg_line_height: the expected vertical height of a line of text
    :param line_blur: how far the image is blurred to extract features
        (img.width / line_blur)
    :param tab_wiggle_room: how far in pixels tabs are allowed to be from one
        another before they are considered distinct
    :param disp: whether to display intermediate results
    :return: an integer list representing the tab level for each line
    """
    # expects a grayscale (binary) image of text
    # aggressively horizontally blur the image so each line of text merges
    # into a single connected blob
    r, c = len(img), len(img[0])
    horizontal_size = max(1, c // line_blur)
    horizontal_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1))
    img = cv2.filter2D(img, -1, horizontal_structure)

    if disp:
        vis_img, _ = auto_crop.reduce_image(img.copy())
        cv2.imshow('Horizontal Blur', vis_img)
        cv2.waitKey(0)

    # identify connected components & generate bounding boxes
    n, regions = cv2.connectedComponents(img, img)
    img = np.uint8(regions)
    bbs = _generate_bounding_boxes(img, n, avg_line_height)

    return _analyze_bounding_boxes(bbs, tab_wiggle_room)
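
# Illustrative sketch only (the repo's actual _analyze_bounding_boxes helper
# is not shown in this section): one plausible way the per-line bounding
# boxes could be mapped to integer indent levels. Left edges that fall within
# tab_wiggle_room pixels of each other share a level, and the leftmost
# cluster is treated as level 0. The (x, y, w, h) box format is an assumption.
def _analyze_bounding_boxes_sketch(bbs, tab_wiggle_room):
    # sort lines top-to-bottom and pull out their left edges
    boxes = sorted(bbs, key=lambda bb: bb[1])
    lefts = [bb[0] for bb in boxes]

    # cluster the left edges: a new cluster centre starts whenever the gap
    # to the previous centre exceeds tab_wiggle_room
    centres = []
    for x in sorted(set(lefts)):
        if not centres or x - centres[-1] > tab_wiggle_room:
            centres.append(x)

    # indent level = index of the nearest cluster centre
    return [min(range(len(centres)), key=lambda i: abs(x - centres[i]))
            for x in lefts]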
def main():
    disp = False  # set to True to view pre-processing of the images

    img = cv2.imread('../images/training/training3.jpg', 0)
    img = auto_crop.crop_to_bounding_box(img, disp=disp)

    # binarize image
    _, img = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV)
    if disp:
        vis_img, _ = auto_crop.reduce_image(img.copy())
        cv2.imshow('Binary Image', vis_img)
        cv2.waitKey(0)

    img = 255 - img
    if disp:
        vis_img, _ = auto_crop.reduce_image(img.copy())
        cv2.imshow('Inverted', vis_img)
        cv2.waitKey(0)

    # normalize image height
    img, avg_line_height = normalize_training_image(img, 30, disp=disp)
    if disp:
        cv2.imshow('Normalized', img)
        cv2.waitKey(0)

    # identify indent levels
    tabs = id_tabs(255 - img, avg_line_height=avg_line_height, line_blur=20,
                   tab_wiggle_room=2, disp=True)
    print tabs

    # run Tesseract on the normalized image
    cv2.imwrite("text.png", img)
    pil_img = Image.open("text.png")
    translation = pytesseract.image_to_string(pil_img, 'hww')

    print "TESSERACT RESULTS: "
    print translation

    print "\n\nPROOFREAD: "
    print proofread(translation.split('\n'), tabs)
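
# Hypothetical illustration (proofread's real behaviour lives elsewhere in
# the repo): once the OCR text and the indent levels are both available, the
# two lists can be zipped back together to rebuild the indented source, e.g.
# with four spaces per tab level. The helper name and the four-space indent
# are assumptions for this example only.
def _reindent_lines_example(lines, tabs):
    out = []
    for line, level in zip(lines, tabs):
        out.append('    ' * level + line.strip())
    return '\n'.join(out)

# usage: _reindent_lines_example(translation.split('\n'), tabs)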
def normalize_training_image(img, threshold_height, disp=False):
    """
    Takes in a binary image and normalizes the text within the image to the
    given height.

    :param img: the image to normalize
    :param threshold_height: the average line height we're trying to obtain
    :param disp: whether to display intermediate results
    """
    img = _remove_circles(img)
    if disp:
        vis_img, _ = reduce_image(img.copy())
        cv2.imshow('Removed Bounding Circles', vis_img)
        cv2.waitKey(0)

    # blur horizontally so each line becomes one blob, then threshold and
    # extract the line contours
    cc = _horizontally_blur_image(img)
    _, cc = cv2.threshold(cc, 1, 255, cv2.THRESH_BINARY)
    _, contours, hierarchy = cv2.findContours(cc, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

    # first pass: average height over all line contours
    avg_height = 0
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        avg_height += h
    avg_height /= len(contours)

    # second pass: drop contours shorter than half the average (noise,
    # punctuation) and recompute the average over what remains
    parsed_contours = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if h >= avg_height / 2:
            parsed_contours.append(cnt)

    new_avg_height = 0
    for cnt in parsed_contours:
        x, y, w, h = cv2.boundingRect(cnt)
        new_avg_height += h
    new_avg_height /= len(parsed_contours)

    # scale the whole image so the average line height matches threshold_height
    p = float(threshold_height) / new_avg_height
    height, width = img.shape
    dim = (int(width * p), int(height * p))
    return cv2.resize(img, dim), threshold_height
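
# Illustrative sketch only (the repo's _horizontally_blur_image is not shown
# in this section): a horizontal smear like the one used in id_tabs, built
# from a wide rectangular kernel so characters on the same line merge into a
# single connected blob. The kernel-width divisor (line_blur=20) is an
# assumption mirroring the default used in id_tabs.
def _horizontally_blur_image_sketch(img, line_blur=20):
    rows, cols = img.shape
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(1, cols // line_blur), 1))
    return cv2.filter2D(img, -1, kernel)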