Ejemplo n.º 1
0
def assign_sizes_for_page(content,
                          stack_inx_pointer,
                          stacks_gmms,
                          stack_mean,
                          prev_size='',
                          smooth_method=''):
    """Assign a size label to every entry of a page, then smooth the labels.

    The work is done in two passes.  The first pass walks ``content`` line by
    line: punctuation, entries whose label spans more than one stack, and
    entries whose label is unknown keep their own label (``s[-1]``) as a
    placeholder; every other entry gets the value returned by
    ``big_or_small``.  The second pass hands those labels to the smoother
    selected by ``smooth_method``.

    Args:
        content: Sequence of lines, each a sequence of entries.  Only
            ``s[-1]`` (the label) and ``s[2]`` (the value handed to
            ``big_or_small``) are read from an entry.
        stack_inx_pointer: Lookup from a label to the index/key used to
            select an item of ``stacks_gmms``.
        stacks_gmms: Indexable collection whose items are passed as the
            first argument of ``big_or_small``.
        stack_mean: Passed through unchanged to ``big_or_small``.
        prev_size: Value substituted for punctuation that occurs before any
            sized entry has been seen.  Only used by the built-in smoother.
        smooth_method: ``'line_smooth'``, ``'segment_smooth'`` or
            ``'granular'`` select the matching helper; any other value uses
            the built-in smoother, which replaces each punctuation entry by
            the most recent non-punctuation entry.

    Returns:
        A ``(mode, final_lines)`` tuple.  ``final_lines`` holds one slice of
        the smoothed labels per line of ``content``.  ``mode`` is
        ``statsmode(second_pass)[0][0]`` (presumably the most common label
        -- confirm against ``statsmode``), ``'s'`` when there are no labels
        at all, or ``None`` (with an empty list) when ``content`` is empty.
    """
    punctuation = u'་། ()〈〉༽༼༔༑'
    per_line = smooth_method == 'line_smooth'
    all_lines = []

    if not content:
        return None, all_lines

    # First pass: one label per entry.  'line_smooth' wants the labels
    # grouped per line; every other smoother receives one flat list.
    for line in content:
        cur_line = []
        sink = cur_line if per_line else all_lines
        for s in line:
            if s[-1] in punctuation or num_stacks(s[-1]) > 1:
                # Not sizeable on its own; keep the label as a placeholder.
                sink.append(s[-1])
                continue
            try:
                size = big_or_small(stacks_gmms[stack_inx_pointer[s[-1]]],
                                    s[2], stack_mean)
            except KeyError:
                # Unknown label: fall back to the raw label.
                sink.append(s[-1])
                continue
            sink.append(size)
        if per_line:
            all_lines.append(cur_line)

    # Second pass: smooth over abrupt size changes.
    if smooth_method == 'line_smooth':
        second_pass = line_smooth(all_lines)
    elif smooth_method == 'segment_smooth':
        second_pass = segment_smooth(all_lines)
    elif smooth_method == 'granular':
        second_pass = granular_smooth(all_lines)
    else:
        second_pass = []
        for s in all_lines:
            if s in punctuation:
                # Punctuation inherits the size of whatever preceded it.
                second_pass.append(prev_size)
            else:
                second_pass.append(s)
                prev_size = s

    if not second_pass:
        mode = 's'
        # Single-argument call form prints identically on Python 2 and 3.
        print('WARNING: using default value for mode')
    else:
        mode = statsmode(second_pass)[0][0]

    # Cut the smoothed labels back into one chunk per input line.
    # NOTE(review): this assumes every smoother returns a flat sequence with
    # one item per entry -- confirm for line_smooth, which is fed nested lists.
    final_lines = []
    prev_inx = 0
    for line in content:
        cur_inx = len(line) + prev_inx
        final_lines.append(second_pass[prev_inx:cur_inx])
        prev_inx = cur_inx

    return mode, final_lines
Ejemplo n.º 2
0
    def __init__(self, img_arr, fast_cls, small_coef=1, low_ink=False,
                 page_type=None, flpath=None, detect_o=True,
                 clear_hr=False):
        """Collect the page's contours and pre-sort them by width and glyph.

        Contours that hang off the dominant parent in the contour hierarchy
        are measured, their widths are fitted with ``self.char_gaussians``
        and each one is then run through ``fast_cls`` so that it can be
        filed under ``self.emph_symbols``, ``self.naros``,
        ``self.small_contour_indices`` or ``self.indices``.

        Args:
            img_arr: Page image array.  ``shape[0]``/``shape[1]`` are used as
                the page height/width.  Rows may be overwritten with ``1``
                in place when ``clear_hr`` is set.
            fast_cls: Classifier exposing ``predict_proba(features)``.
            small_coef: Stored on the instance.  The original note reads
                "lower coef means more filtering; use 3 for nying gyud".
            low_ink: When true, small punctuation is not split off into
                ``small_contour_indices`` and ``self._low_ink_setting()`` is
                called at the end.
            page_type: ``'pecha'`` selects ``self.set_pecha_layout()``; any
                other value uses ``self.get_indices()``.
            flpath: Stored on the instance.
            detect_o: When true, contours predicted as the na-ro vowel are
                collected in ``self.naros``.
            clear_hr: When true, a very wide box near the top of the page
                causes every row from the top through the bottom of that
                box to be set to ``1``.  For pecha pages
                ``self.force_clear_hr()`` is called as well.
        """
        self.img_arr = img_arr
        self.page_type = page_type
        self.flpath = flpath
        self.low_ink = low_ink
        self.detect_o = detect_o
        self.cached_features = OrderedDict()
        self.cached_pred_prob = OrderedDict()
        self._contour_mode = cv.RETR_TREE

        # Local aliases for names used inside the per-contour loop.
        ones = np.ones
        uint8 = np.uint8
        predict_proba = fast_cls.predict_proba

        _, self.contours, self.hierarchy = self._contours()
        self.boxes = []
        self.indices = []
        self.small_coef = small_coef

        FILTERED_PUNC = (u'།', u'་', u']', u'[')
        # Built once instead of once per contour.
        EMPH_SYMBOLS = frozenset([u'༷', u'༵', u'༼', u'༽'])

        # Stays None when a pecha page yields no content; in that case
        # self.indices is empty and the comparison below never runs.
        content_parent = None

        self._set_shape_measurements()
        if page_type == 'pecha':
            if clear_hr:
                print('Warning: clear_hr called on pecha format. For clearing text')
                self.force_clear_hr()
            self.set_pecha_layout()
            if self.indices:
                content_parent = int(
                    statsmode([self.hierarchy[0][i][3]
                               for i in self.indices])[0])
            else:
                print('no content found')
        else:
            # The parent with the most children is taken as the content parent.
            content_parent = int(
                statsmode([hier[3] for hier in self.hierarchy[0]])[0])
            self.indices = self.get_indices()

        outer_contours = []
        outer_widths = []

        page_height = self.img_arr.shape[0]
        page_width = self.img_arr.shape[1]
        # With fewer than 40 contours the width statistics are meaningless
        # (too small a sample), so the width limit is not enforced.
        few_contours = len(self.indices) < 40

        # NOTE(review): get_boxes() is called once per contour; if it
        # recomputes the boxes each time, hoisting it would be cheaper --
        # confirm it has no side effects first.
        for i in self.indices:
            cbox = self.get_boxes()[i]
            if self.hierarchy[0][i][3] == content_parent and (
                    cbox[2] < .1 * page_width or few_contours):
                outer_contours.append(i)
                outer_widths.append(cbox[2])
            else:
                if cbox[2] > .66 * page_width:
                    # Report the relative width of very wide rejected boxes.
                    print(cbox[2] / float(page_width))
                if (clear_hr
                        and .995 * page_width > cbox[2] > .66 * page_width
                        and cbox[1] < .25 * page_height):
                    # Wide box in the top quarter of the page: overwrite
                    # everything from the top down to the bottom of the box.
                    self.img_arr[0:cbox[1] + cbox[3], :] = 1

        width_measures = self.char_gaussians(outer_widths)
        for i, j in zip(['char_mean', 'char_std', 'tsek_mean', 'tsek_std'],
                        width_measures):
            setattr(self, i, j)

        self.small_contour_indices = []
        self.indices = []  # rebuilt below from outer_contours
        self.emph_symbols = []
        self.naros = []
        for i in outer_contours:
            cbox = self.get_boxes()[i]
            x, y, w, h = cbox

            # Render the contour (and its children) alone on a blank patch
            # and classify it; features and prediction are cached for reuse.
            tmparr = ones((h, w), dtype=uint8)
            tmparr = self.draw_contour_and_children(i, tmparr, (-x, -y))

            features = normalize_and_extract_features(tmparr)
            self.cached_features[i] = features

            prprob = predict_proba(features)
            mxinx = prprob.argmax()
            quick_prd = label_chars[mxinx]
            self.cached_pred_prob[i] = (mxinx, prprob[0])

            if quick_prd in EMPH_SYMBOLS:
                self.emph_symbols.append(i)
                print('EMPHSYMBOLFOUND %s' % quick_prd)
            elif quick_prd == u'ོ' and self.detect_o:
                self.naros.append(i)
            elif cbox[2] < 7:
                # Narrower than 7 pixels: dropped entirely.
                continue
            elif (cbox[2] <= self.tsek_mean * 3
                  and quick_prd in FILTERED_PUNC) and not self.low_ink:
                # Small punctuation is kept apart from the main contours.
                # (This could backfire with false interiors, e.g. from
                # salt-and-pepper noise.)
                self.small_contour_indices.append(i)
            else:
                self.indices.append(i)

        if self.detect_o:
            print('pre-filtered na-ro vowel %d found' % len(self.naros))

        if self.low_ink:
            self._low_ink_setting()
Ejemplo n.º 3
0
    def __init__(self, img_arr, fast_cls, small_coef=1, low_ink=False,
                 page_type=None, flpath=None, detect_o=True,
                 clear_hr=False):
        """Collect the page's contours and pre-sort them by width and glyph.

        Contours that hang off the dominant parent in the contour hierarchy
        are measured, their widths are fitted with ``self.char_gaussians``
        and each one is then run through ``fast_cls`` so that it can be
        filed under ``self.emph_symbols``, ``self.naros``,
        ``self.small_contour_indices`` or ``self.indices``.

        Args:
            img_arr: Page image array.  ``shape[0]``/``shape[1]`` are used as
                the page height/width.  Rows may be overwritten with ``1``
                in place when ``clear_hr`` is set.
            fast_cls: Classifier exposing ``predict_proba(features)``.
            small_coef: Stored on the instance.  The original note reads
                "lower coef means more filtering; use 3 for nying gyud".
            low_ink: When true, small punctuation is not split off into
                ``small_contour_indices`` and ``self._low_ink_setting()`` is
                called at the end.
            page_type: ``'pecha'`` selects ``self.set_pecha_layout()``; any
                other value uses ``self.get_indices()``.
            flpath: Stored on the instance.
            detect_o: When true, contours predicted as the na-ro vowel are
                collected in ``self.naros``.
            clear_hr: When true, a very wide box near the top of the page
                causes every row from the top through the bottom of that
                box to be set to ``1``.  For pecha pages
                ``self.force_clear_hr()`` is called as well.
        """
        self.img_arr = img_arr
        self.page_type = page_type
        self.flpath = flpath
        self.low_ink = low_ink
        self.detect_o = detect_o

        self.cached_features = OrderedDict()
        self.cached_pred_prob = OrderedDict()
        self._contour_mode = cv.RETR_TREE

        # Local aliases for names used inside the per-contour loop.
        ones = np.ones
        uint8 = np.uint8
        predict_proba = fast_cls.predict_proba

        self.contours, self.hierarchy = self._contours()
        self.boxes = []
        self.indices = []
        self.small_coef = small_coef

        FILTERED_PUNC = (u'།', u'་', u']', u'[')
        # Built once instead of once per contour.
        EMPH_SYMBOLS = frozenset([u'༷', u'༵', u'༼', u'༽'])

        # Stays None when a pecha page yields no content; in that case
        # self.indices is empty and the comparison below never runs.
        content_parent = None

        self._set_shape_measurements()
        if page_type == 'pecha':
            if clear_hr:
                print('Warning: clear_hr called on pecha format. For clearing text')
                self.force_clear_hr()
            self.set_pecha_layout()
            if self.indices:
                content_parent = int(
                    statsmode([self.hierarchy[0][i][3]
                               for i in self.indices])[0])
            else:
                print('no content found')
        else:
            # The most frequent parent entry is taken as the content parent.
            content_parent = int(
                statsmode([hier[3] for hier in self.hierarchy[0]])[0])
            self.indices = self.get_indices()

        outer_contours = []
        outer_widths = []

        page_height = self.img_arr.shape[0]
        page_width = self.img_arr.shape[1]
        # With fewer than 40 contours the width statistics are meaningless
        # (too small a sample), so the width limit is not enforced.
        few_contours = len(self.indices) < 40

        # NOTE(review): get_boxes() is called once per contour; if it
        # recomputes the boxes each time, hoisting it would be cheaper --
        # confirm it has no side effects first.
        for i in self.indices:
            cbox = self.get_boxes()[i]
            if self.hierarchy[0][i][3] == content_parent and (
                    cbox[2] < .1 * page_width or few_contours):
                outer_contours.append(i)
                outer_widths.append(cbox[2])
            else:
                if cbox[2] > .66 * page_width:
                    # Report the relative width of very wide rejected boxes.
                    print(cbox[2] / float(page_width))
                if (clear_hr
                        and .995 * page_width > cbox[2] > .66 * page_width
                        and cbox[1] < .25 * page_height):
                    # Wide box in the top quarter of the page: overwrite
                    # everything from the top down to the bottom of the box.
                    self.img_arr[0:cbox[1] + cbox[3], :] = 1

        width_measures = self.char_gaussians(outer_widths)
        for i, j in zip(['char_mean', 'char_std', 'tsek_mean', 'tsek_std'],
                        width_measures):
            setattr(self, i, j)

        self.small_contour_indices = []
        self.indices = []  # rebuilt below from outer_contours
        self.emph_symbols = []
        self.naros = []
        for i in outer_contours:
            cbox = self.get_boxes()[i]
            x, y, w, h = cbox

            # Render the contour (and its children) alone on a blank patch
            # and classify it; features and prediction are cached for reuse.
            tmparr = ones((h, w), dtype=uint8)
            tmparr = self.draw_contour_and_children(i, tmparr, (-x, -y))

            features = normalize_and_extract_features(tmparr)
            self.cached_features[i] = features

            prprob = predict_proba(features)
            mxinx = prprob.argmax()
            quick_prd = label_chars[mxinx]
            self.cached_pred_prob[i] = (mxinx, prprob[0])

            if quick_prd in EMPH_SYMBOLS:
                self.emph_symbols.append(i)
                print('EMPHSYMBOLFOUND %s' % quick_prd)
            elif quick_prd == u'ོ' and self.detect_o:
                self.naros.append(i)
            elif cbox[2] < 7:
                # Narrower than 7 pixels: dropped entirely.
                continue
            elif (cbox[2] <= self.tsek_mean * 3
                  and quick_prd in FILTERED_PUNC) and not self.low_ink:
                # Small punctuation is kept apart from the main contours.
                # (This could backfire with false interiors, e.g. from
                # salt-and-pepper noise.)
                self.small_contour_indices.append(i)
            else:
                self.indices.append(i)

        if self.detect_o:
            print('pre-filtered na-ro vowel %d found' % len(self.naros))

        if self.low_ink:
            self._low_ink_setting()