def assign_sizes_for_page(content, stack_inx_pointer, stacks_gmms, stack_mean,
                          prev_size='', smooth_method=''):
    """Label every stack on a page with a size and report the dominant size.

    Args:
        content: sequence of lines; each line is a sequence of stack records.
            Only ``record[-1]`` (the stack's character string) and
            ``record[2]`` (a measurement handed to ``big_or_small``) are read.
        stack_inx_pointer: mapping from a stack string to the key used to
            look up its model in ``stacks_gmms``.
        stacks_gmms: container of per-stack models, indexed by the keys held
            in ``stack_inx_pointer``.
        stack_mean: passed through unchanged to ``big_or_small``.
        prev_size: label substituted for punctuation met before any sized
            stack has been seen (default smoothing only).
        smooth_method: ``'line_smooth'``, ``'segment_smooth'`` or
            ``'granular'`` to delegate the second pass to the helper of the
            same name; any other value carries the previous label forward
            over punctuation.

    Returns:
        ``(mode, final_lines)``. ``mode`` is ``statsmode(labels)[0][0]``, or
        ``'s'`` when no labels were produced. ``final_lines`` holds one slice
        of the smoothed labels per input line, cut by line length.
        For empty ``content`` the result is ``(None, [])``.
    """
    punctuation = u'་། ()〈〉༽༼༔༑'
    first_pass = []
    if not content:
        return None, first_pass

    per_line = smooth_method == 'line_smooth'

    # First pass: one entry per stack. Punctuation, multi-stack strings and
    # stacks without a model keep their raw string instead of a size label.
    for line in content:
        line_labels = []
        # 'line_smooth' wants labels grouped by line; everything else gets a
        # single flat list.
        sink = line_labels if per_line else first_pass
        for record in line:
            glyph = record[-1]
            if glyph in punctuation or num_stacks(glyph) > 1:
                sink.append(glyph)
                continue
            try:
                label = big_or_small(stacks_gmms[stack_inx_pointer[glyph]],
                                     record[2], stack_mean)
            except KeyError:
                # No pointer/model for this stack: fall back to the raw string.
                label = glyph
            sink.append(label)
        if per_line:
            first_pass.append(line_labels)

    # Second pass: smooth over abrupt size changes.
    if per_line:
        smoothed = line_smooth(first_pass)
    elif smooth_method == 'segment_smooth':
        smoothed = segment_smooth(first_pass)
    elif smooth_method == 'granular':
        smoothed = granular_smooth(first_pass)
    else:
        smoothed = []
        for label in first_pass:
            if label in punctuation:
                # Punctuation inherits whatever label came before it.
                label = prev_size
            smoothed.append(label)
            prev_size = label

    if smoothed:
        mode = statsmode(smoothed)[0][0]
    else:
        mode = 's'
        print('WARNING: using default value for mode')

    # Cut the flat label sequence back into per-line chunks.
    final_lines = []
    start = 0
    for line in content:
        stop = start + len(line)
        final_lines.append(smoothed[start:stop])
        start = stop

    return mode, final_lines
def __init__(self, img_arr, fast_cls, small_coef=1, low_ink=False, \ page_type=None, flpath=None, detect_o=True,\ clear_hr = False): #lower coef means more filtering USE 3 for nying gyud self.img_arr = img_arr self.page_type = page_type self.flpath = flpath self.low_ink = low_ink self.detect_o = detect_o # self.clear_hr = clear_hr # self.cached_features = {} # self.cached_pred_prob = {} self.cached_features = OrderedDict() self.cached_pred_prob = OrderedDict() # self.low_ink = True # if page_type == 'pecha': # self._contour_mode = cv.RETR_CCOMP # else: self._contour_mode = cv.RETR_TREE ### repeatedly called functions ones = np.ones uint8 = np.uint8 predict = fast_cls.predict predict_proba = fast_cls.predict_proba _, self.contours, self.hierarchy = self._contours() self.boxes = [] self.indices = [] self.small_coef = small_coef FILTERED_PUNC = (u'།', u'་', u']', u'[') self._set_shape_measurements() if page_type == 'pecha': if clear_hr: print 'Warning: clear_hr called on pecha format. For clearing text' self.force_clear_hr() self.set_pecha_layout() if self.indices: content_parent = int( statsmode([self.hierarchy[0][i][3] for i in self.indices])[0]) else: print 'no content found' else: content_parent = int( statsmode([hier[3] for hier in self.hierarchy[0]])[0]) self.indices = self.get_indices() # if self.page_type != 'pecha': ### Find the parent with the most children. Call it 'content_parent' # content_parent = int(statsmode([self.hierarchy[0][i][3] for i in self.indices])[0]) # width_measures = self.char_gaussians([b[2] for b in self.get_boxes() if (b[2] < .1*self.img_arr.shape[1]] and self.hierarchy[0][] )) outer_contours = [] outer_widths = [] # pg = np.ones_like(img_arr) ## Iterate through all contours for i in self.indices: cbox = self.get_boxes()[i] x, y, w, h = cbox ### THIS SECOND CONDITION IS CAUSING A LOT OF PROBLEMS. 
Recently # added the len(indices) < 40 as a way to prevent exaggerated # filtering of small lines where gaussian width measures # are meaningless due to small sample size (too few contours) # if self.hierarchy[0][i][3] == content_parent and (cbox[2] < .1*self.img_arr.shape[1] or len(self.indices) < 40 ): if self.hierarchy[0][i][3] == content_parent and ( cbox[2] < .1 * self.img_arr.shape[1] or len(self.indices) < 40): # if self.hierarchy[0][i][3] == content_parent and cbox[2] < 3*self.char_mean: ### THIS SECOND CONDITION IS CAUSING A LOT OF PROBLEMS # if self.hierarchy[0][i][3] == content_parent and cbox[2] < .075*self.img_arr.shape[1]: ### THIS SECOND CONDITION IS CAUSING A LOT OF PROBLEMS outer_contours.append(i) outer_widths.append(cbox[2]) # if cbox[2] > 50: print cbox[2], # x,y,w,h = cbox # cv.rectangle(self.img_arr, (x,y), (x+w, y+h), 0) else: # if cbox[2] > 100: # print cbox # raw_input('continue?') if cbox[2] > .66 * self.img_arr.shape[1]: print cbox[2] / float(self.img_arr.shape[1]) if clear_hr and .995*self.img_arr.shape[1] > cbox[2] > \ .66*self.img_arr.shape[1] and cbox[1] < .25*self.img_arr.shape[0]: self.img_arr[0:cbox[1] + cbox[3], :] = 1 # print 'rejected box. 
too wide?', cbox[2] >= .1*self.img_arr.shape[1] # print # print max(outer_widths) width_measures = self.char_gaussians(outer_widths) # import Image # Image.fromarray(self.img_arr*255).show() # newarr = np.ones_like(img_arr) # for o in self.indices: # x,y,w,h = self.get_boxes()[o] # cv.rectangle(newarr, (x,y), (x+w, y+h), 0) # if self.hierarchy[0][o][3] == content_parent: # self.draw_contour_and_children(o, newarr, (0,0)) # # import Image # Image.fromarray(newarr*255).show() # import sys; sys.exit() for i, j in zip(['char_mean', 'char_std', 'tsek_mean', 'tsek_std'], width_measures): setattr(self, i, j) # print self.gmm.converged_ # print self.char_mean, self.char_std # print self.tsek_mean, self.tsek_std self.small_contour_indices = [] # self.contours = [] self.indices = [] # Need to reset!19 self.emph_symbols = [] self.naros = [] # print self.char_mean, self.char_std, self.tsek_mean for i in outer_contours: cbox = self.get_boxes()[i] # if small and has no children, put in small list (this could backfire with false interiors e.g. from salt and pepper noise) ## NOTE: previously small was defined as less than tsek_mean + 3xtsek std ## however, this wasn't always working. changing to less than charmean ## minus 2xchar std however should watch to see if is ok for many different inputs... 
x, y, w, h = cbox tmparr = ones((h, w), dtype=uint8) tmparr = self.draw_contour_and_children(i, tmparr, (-x, -y)) features = normalize_and_extract_features(tmparr) self.cached_features[i] = features prprob = predict_proba(features) # all_feats = self.cached_features.values() # all_probs = predict_proba(all_feats) # all_probs = predict_proba(self.cached_features.values()) # for ix,i in enumerate(outer_contours): # prprob = all_probs[ix] # if recognizer == 'probout': mxinx = prprob.argmax() quick_prd = label_chars[mxinx] self.cached_pred_prob[i] = (mxinx, prprob[0]) # self.cached_pred_prob[i] = (mxinx, prprob) # else: # quick_prd = label_chars[predict_proba(features).argmax()] # quick_prd = label_chars[predict(features)[0]] # is_emph_symbol = quick_prd in set([u'༷', u'༵', u'༼', u'༽', u'—']) is_emph_symbol = quick_prd in set([u'༷', u'༵', u'༼', u'༽']) # is_emph_symbol = quick_prd in set([u'༼', u'༽']) # is_emph_symbol = quick_prd in set([u'༷', u'༵']) # is_emph_symbol = quick_prd in set([u'༼', u'༽', u'—']) # is_emph_symbol = quick_prd in set([u'༼', u'༽']) # is_emph_symbol = quick_prd == '~~' # use this line if don't want this to actually get anything # if is_emph_symbol: print 'found naro? ', is_emph_symbol # import Image; Image.fromarray(tmparr*255).show() if is_emph_symbol: self.emph_symbols.append(i) print 'EMPHSYMBOLFOUND', quick_prd # cv.rectangle(self.img_arr, (x,y), (x+w, y+h), 0) elif quick_prd == u'ོ' and self.detect_o: self.naros.append(i) elif cbox[2] < 7: # elif cbox[2] < 9: continue # elif (cbox[2] <= self.char_mean - 2*self.char_std and # elif (cbox[2] <= self.char_mean - 3*self.char_std and # elif (cbox[2] <= self.tsek_mean*1.5 and # elif (cbox[2] <= self.tsek_mean*.0 and elif (cbox[2] <= self.tsek_mean * 3 and # elif (cbox[2] <= self.char_mean - 4*self.char_std and # self.hierarchy[0][i][2] < 0 and quick_prd in FILTERED_PUNC ) and not self.low_ink: # default!!! 
# quick_prd in (u'་')) and not self.low_ink: # quick_prd not in word_parts_set) and not self.low_ink : self.small_contour_indices.append(i) # self.indices.append(i) #DEFAULT # elif (cbox[2] <= self.tsek_mean*.8 and # elif (cbox[2] <= self.tsek_mean*.3 and # elif (cbox[2] <= self.char_mean - 4*self.char_std and # self.hierarchy[0][i][2] < 0 and not self.low_ink): # cv.rectangle(self.img_arr, (x,y), (x+w, y+h), 0) # continue else: # cv.rectangle(self.img_arr, (x,y), (x+w, y+h), 0) self.indices.append(i) # if (cbox[2] <= self.tsek_mean*1.5 and ## elif (cbox[2] <= self.char_mean - 4*self.char_std and # self.hierarchy[0][i][2] < 0 and # quick_prd in (u'།', u'་')): # self.small_contour_indices.append(i) # import Image # Image.fromarray(tmparr*255).convert('L').save('/tmp/examples/%04d.tif' % i) # print len(self.small_contour_indices), 'len small contour ind' # import Image # Image.fromarray(self.img_arr*255).show() # print scount # raw_input() if self.detect_o: print 'pre-filtered na-ro vowel', len(self.naros), 'found' # for i in self.indices: # if cbox[2] > 50: print cbox[2], # bx = self.boxes[i] # x,y,w,h = bx # cv.rectangle(img_arr, (x,y), (x+w, y+h), 0) # import Image # Image.fromarray(img_arr*255).show() # raw_input() # for i in self.indices: # if self.hierarchy[0][i][2] >= 0: # char = self.draw_contour_and_children(i) # # Image.fromarray(char*255).show() # raw_input() # from matplotlib import pyplot as plt # from matplotlib.mlab import normpdf # plt.subplot(111) # plt.title('tsek-char distributions, pre-segmentation') # ## widths = [self.boxes[i][2] for i in self.get_indices()] # n,bins,p = plt.hist(outer_widths, 200, range=(0,75), normed=True, color='#3B60FA') # plt.vlines([self.char_mean, self.tsek_mean], 0, np.array([max(n), max(n)]), linestyles='--') # plt.plot(bins, normpdf(bins, self.tsek_mean, self.tsek_std), label='fit', linewidth=1) # plt.fill_between(bins, normpdf(bins, self.tsek_mean, self.tsek_std), color=(.58,.63,.8), alpha=0.09) # plt.plot(bins, 
normpdf(bins, self.char_mean, self.char_std), label='fit', linewidth=1) # plt.fill_between(bins, normpdf(bins, self.char_mean, self.char_std), color=(.58,.63,.8), alpha=0.01) # plt.show() # print self.tsek_mean, self.tsek_std # print len(self.boxes) # font_detector.save_info(self.char_mean, self.char_std, self.tsek_mean, self.tsek_std) # self.low_ink = False if self.low_ink: self._low_ink_setting()
def __init__(self, img_arr, fast_cls, small_coef=1, low_ink=False, \ page_type=None, flpath=None, detect_o=True,\ clear_hr = False): #lower coef means more filtering USE 3 for nying gyud self.img_arr = img_arr self.page_type = page_type self.flpath = flpath self.low_ink = low_ink self.detect_o = detect_o self.cached_features = OrderedDict() self.cached_pred_prob = OrderedDict() self._contour_mode = cv.RETR_TREE ### repeatedly called functions ones = np.ones uint8 = np.uint8 predict = fast_cls.predict predict_proba = fast_cls.predict_proba self.contours, self.hierarchy = self._contours() self.boxes = [] self.indices = [] self.small_coef = small_coef FILTERED_PUNC = (u'།', u'་', u']', u'[') self._set_shape_measurements() if page_type == 'pecha': if clear_hr: print 'Warning: clear_hr called on pecha format. For clearing text' self.force_clear_hr() self.set_pecha_layout() if self.indices: content_parent = int( statsmode([self.hierarchy[0][i][3] for i in self.indices])[0]) else: print 'no content found' else: content_parent = int( statsmode([hier[3] for hier in self.hierarchy[0]])[0]) self.indices = self.get_indices() outer_contours = [] outer_widths = [] ## Iterate through all contours for i in self.indices: cbox = self.get_boxes()[i] x, y, w, h = cbox ### THIS SECOND CONDITION IS CAUSING A LOT OF PROBLEMS. 
Recently # added the len(indices) < 40 as a way to prevent exaggerated # filtering of small lines where gaussian width measures # are meaningless due to small sample size (too few contours) if self.hierarchy[0][i][3] == content_parent and ( cbox[2] < .1 * self.img_arr.shape[1] or len(self.indices) < 40): # if self.hierarchy[0][i][3] == content_parent and cbox[2] < 3*self.char_mean: ### THIS SECOND CONDITION IS CAUSING A LOT OF PROBLEMS outer_contours.append(i) outer_widths.append(cbox[2]) else: if cbox[2] > .66 * self.img_arr.shape[1]: print cbox[2] / float(self.img_arr.shape[1]) if clear_hr and .995*self.img_arr.shape[1] > cbox[2] > \ .66*self.img_arr.shape[1] and cbox[1] < .25*self.img_arr.shape[0]: self.img_arr[0:cbox[1] + cbox[3], :] = 1 # print 'rejected box. too wide?', cbox[2] >= .1*self.img_arr.shape[1] width_measures = self.char_gaussians(outer_widths) for i, j in zip(['char_mean', 'char_std', 'tsek_mean', 'tsek_std'], width_measures): setattr(self, i, j) self.small_contour_indices = [] self.indices = [] # Need to reset!19 self.emph_symbols = [] self.naros = [] for i in outer_contours: cbox = self.get_boxes()[i] # if small and has no children, put in small list (this could backfire with false interiors e.g. from salt and pepper noise) ## NOTE: previously small was defined as less than tsek_mean + 3xtsek std ## however, this wasn't always working. changing to less than charmean ## minus 2xchar std however should watch to see if is ok for many different inputs... 
x, y, w, h = cbox tmparr = ones((h, w), dtype=uint8) tmparr = self.draw_contour_and_children(i, tmparr, (-x, -y)) features = normalize_and_extract_features(tmparr) self.cached_features[i] = features prprob = predict_proba(features) mxinx = prprob.argmax() quick_prd = label_chars[mxinx] self.cached_pred_prob[i] = (mxinx, prprob[0]) is_emph_symbol = quick_prd in set([u'༷', u'༵', u'༼', u'༽']) if is_emph_symbol: self.emph_symbols.append(i) print 'EMPHSYMBOLFOUND', quick_prd elif quick_prd == u'ོ' and self.detect_o: self.naros.append(i) elif cbox[2] < 7: continue elif (cbox[2] <= self.tsek_mean * 3 and quick_prd in FILTERED_PUNC) and not self.low_ink: # default!!! self.small_contour_indices.append(i) else: self.indices.append(i) if self.detect_o: print 'pre-filtered na-ro vowel', len(self.naros), 'found' if self.low_ink: self._low_ink_setting()