def test_accuracy(t, clsf=None):
    '''Get accuracy score for a testset t'''
    if clsf is None:
        clsf = cls  # fall back to the module-level classifier
    
    y = tsets[t][:,0]
    x = tsets[t][:,1:]
    
    x3 = []
    for j in x:
        j = ftrim(j.reshape((32,16)).astype(np.uint8))
        x3.append(normalize_and_extract_features(j))
    
    pred = clsf.predict(x3)

    s  = 0
    for i, p in enumerate(pred):
        if float(p) == y[i]:
            s += 1.0            
        else:
            print 'mismatch:', label_chars[y[i]], '||', label_chars[p], t #, max(clsf.predict_proba(x3[i])[0])

    score = s / len(y)
    return score
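# A minimal, hedged sketch (not part of the original source) of how test_accuracy
# might be driven: `tsets` is assumed to be a dict of testset-name -> (N, 513)
# arrays (label in column 0, a flattened 32x16 glyph in the rest) and `cls` the
# module-level classifier. `report_accuracy` is a hypothetical helper name.
def report_accuracy(testset_names, classifier=None):
    '''Print the accuracy of `classifier` (or the global cls) on each named testset.'''
    for name in testset_names:
        print name, test_accuracy(name, clsf=classifier)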
Example #2
    def __init__(self, img_arr, fast_cls, small_coef=1, low_ink=False, \
                 page_type=None, flpath=None, detect_o=True,\
                 clear_hr=False):  # lower coef means more filtering; use 3 for nying gyud
        self.img_arr = img_arr
        self.page_type = page_type
        self.flpath = flpath
        self.low_ink = low_ink
        self.detect_o = detect_o
        #         self.clear_hr = clear_hr
        #         self.cached_features = {}
        #         self.cached_pred_prob = {}
        self.cached_features = OrderedDict()
        self.cached_pred_prob = OrderedDict()
        #         self.low_ink = True
        #        if page_type == 'pecha':
        #            self._contour_mode = cv.RETR_CCOMP
        #        else:
        self._contour_mode = cv.RETR_TREE
        ### repeatedly called functions
        ones = np.ones
        uint8 = np.uint8
        predict = fast_cls.predict
        predict_proba = fast_cls.predict_proba
        _, self.contours, self.hierarchy = self._contours()
        self.boxes = []
        self.indices = []
        self.small_coef = small_coef

        FILTERED_PUNC = (u'།', u'་', u']', u'[')

        self._set_shape_measurements()
        if page_type == 'pecha':
            if clear_hr:
                print 'Warning: clear_hr called on pecha format. For clearing text'
                self.force_clear_hr()
            self.set_pecha_layout()
            if self.indices:
                content_parent = int(
                    statsmode([self.hierarchy[0][i][3]
                               for i in self.indices])[0])
            else:
                print 'no content found'
        else:

            content_parent = int(
                statsmode([hier[3] for hier in self.hierarchy[0]])[0])
            self.indices = self.get_indices()
#        if self.page_type != 'pecha':

### Find the parent with the most children. Call it 'content_parent'
#        content_parent = int(statsmode([self.hierarchy[0][i][3] for i in self.indices])[0])

#        width_measures = self.char_gaussians([b[2] for b in self.get_boxes() if (b[2] < .1*self.img_arr.shape[1]] and self.hierarchy[0][] ))

        outer_contours = []
        outer_widths = []

        #        pg = np.ones_like(img_arr)

        ## Iterate through all contours
        for i in self.indices:
            cbox = self.get_boxes()[i]
            x, y, w, h = cbox
            ### THIS SECOND CONDITION IS CAUSING A LOT OF PROBLEMS. Recently
            # added the len(indices) < 40 as a way to prevent exaggerated
            # filtering of small lines where gaussian width measures
            # are meaningless due to small sample size (too few contours)
            #             if self.hierarchy[0][i][3] == content_parent and (cbox[2] < .1*self.img_arr.shape[1] or len(self.indices) < 40 ):
            if self.hierarchy[0][i][3] == content_parent and (
                    cbox[2] < .1 * self.img_arr.shape[1]
                    or len(self.indices) < 40):
                #            if self.hierarchy[0][i][3] == content_parent and cbox[2] < 3*self.char_mean:  ### THIS SECOND CONDITION IS CAUSING A LOT OF PROBLEMS
                #            if self.hierarchy[0][i][3] == content_parent and cbox[2] < .075*self.img_arr.shape[1]:  ### THIS SECOND CONDITION IS CAUSING A LOT OF PROBLEMS
                outer_contours.append(i)
                outer_widths.append(cbox[2])
#                if cbox[2] > 50: print cbox[2],
#                x,y,w,h = cbox
#                 cv.rectangle(self.img_arr, (x,y), (x+w, y+h), 0)
            else:
                #                 if cbox[2] > 100:
                #                     print cbox
                #                     raw_input('continue?')
                if cbox[2] > .66 * self.img_arr.shape[1]:
                    print cbox[2] / float(self.img_arr.shape[1])
                if clear_hr and .995*self.img_arr.shape[1] > cbox[2] > \
                .66*self.img_arr.shape[1] and cbox[1] < .25*self.img_arr.shape[0]:
                    self.img_arr[0:cbox[1] + cbox[3], :] = 1
#                 print 'rejected box. too wide?', cbox[2] >= .1*self.img_arr.shape[1]
#        print
#        print max(outer_widths)
        width_measures = self.char_gaussians(outer_widths)

        #         import Image
        #         Image.fromarray(self.img_arr*255).show()

        #         newarr = np.ones_like(img_arr)
        #         for o in self.indices:
        #             x,y,w,h = self.get_boxes()[o]
        #             cv.rectangle(newarr, (x,y), (x+w, y+h), 0)
        #             if self.hierarchy[0][o][3] == content_parent:
        #                 self.draw_contour_and_children(o, newarr, (0,0))
        #
        #         import Image
        #         Image.fromarray(newarr*255).show()
        #         import sys; sys.exit()
        for i, j in zip(['char_mean', 'char_std', 'tsek_mean', 'tsek_std'],
                        width_measures):
            setattr(self, i, j)

#        print self.gmm.converged_
#        print self.char_mean, self.char_std
#        print self.tsek_mean, self.tsek_std

        self.small_contour_indices = []
        #        self.contours = []
        self.indices = []  # Need to reset!
        self.emph_symbols = []
        self.naros = []
        #         print self.char_mean, self.char_std, self.tsek_mean
        for i in outer_contours:
            cbox = self.get_boxes()[i]
            # if the contour is small and has no children, put it in the small list
            # (this could backfire with false interiors, e.g. from salt-and-pepper noise)
            ## NOTE: "small" was previously defined as less than tsek_mean + 3*tsek_std;
            ## however, this wasn't always working, so it was changed to less than char_mean
            ## minus 2*char_std. Watch to see whether this holds up for many different inputs...

            x, y, w, h = cbox
            tmparr = ones((h, w), dtype=uint8)
            tmparr = self.draw_contour_and_children(i, tmparr, (-x, -y))

            features = normalize_and_extract_features(tmparr)
            self.cached_features[i] = features

            prprob = predict_proba(features)

            #         all_feats = self.cached_features.values()
            #         all_probs = predict_proba(all_feats)
            #         all_probs = predict_proba(self.cached_features.values())
            #         for ix,i in enumerate(outer_contours):
            #             prprob = all_probs[ix]
            #             if recognizer ==  'probout':
            mxinx = prprob.argmax()
            quick_prd = label_chars[mxinx]
            self.cached_pred_prob[i] = (mxinx, prprob[0])
            #             self.cached_pred_prob[i] = (mxinx, prprob)
            #             else:
            #             quick_prd = label_chars[predict_proba(features).argmax()]
            #                 quick_prd = label_chars[predict(features)[0]]

            #             is_emph_symbol = quick_prd in set([u'༷', u'༵', u'༼', u'༽', u'—'])
            is_emph_symbol = quick_prd in set([u'༷', u'༵', u'༼', u'༽'])
            #             is_emph_symbol = quick_prd in set([u'༼', u'༽'])
            #             is_emph_symbol = quick_prd in set([u'༷', u'༵'])
            #             is_emph_symbol = quick_prd in set([u'༼', u'༽', u'—'])
            #             is_emph_symbol = quick_prd in set([u'༼', u'༽'])
            #             is_emph_symbol = quick_prd == '~~' # use this line if don't want this to actually get anything
            #             if is_emph_symbol: print 'found naro? ', is_emph_symbol
            #                 import Image; Image.fromarray(tmparr*255).show()
            if is_emph_symbol:
                self.emph_symbols.append(i)

                print 'EMPHSYMBOLFOUND', quick_prd
#                 cv.rectangle(self.img_arr, (x,y), (x+w, y+h), 0)
            elif quick_prd == u'ོ' and self.detect_o:
                self.naros.append(i)

            elif cbox[2] < 7:

                #             elif cbox[2] < 9:
                continue
#             elif (cbox[2] <= self.char_mean - 2*self.char_std and
#             elif (cbox[2] <= self.char_mean - 3*self.char_std and
#             elif (cbox[2] <= self.tsek_mean*1.5 and
#             elif (cbox[2] <= self.tsek_mean*.0 and
            elif (cbox[2] <= self.tsek_mean * 3 and
                  #             elif (cbox[2] <= self.char_mean - 4*self.char_std and
                  #                   self.hierarchy[0][i][2] < 0 and
                  quick_prd in FILTERED_PUNC
                  ) and not self.low_ink:  # default!!!
                #                 quick_prd in (u'་')) and not self.low_ink:
                #                 quick_prd not in word_parts_set) and not self.low_ink :
                self.small_contour_indices.append(i)
#                self.indices.append(i) #DEFAULT
#             elif (cbox[2] <= self.tsek_mean*.8 and
#             elif (cbox[2] <= self.tsek_mean*.3 and
#            elif (cbox[2] <= self.char_mean - 4*self.char_std and
#                   self.hierarchy[0][i][2] < 0 and not self.low_ink):
#                 cv.rectangle(self.img_arr, (x,y), (x+w, y+h), 0)
#                 continue
            else:
                #                 cv.rectangle(self.img_arr, (x,y), (x+w, y+h), 0)
                self.indices.append(i)

#                if  (cbox[2] <= self.tsek_mean*1.5 and
##            elif (cbox[2] <= self.char_mean - 4*self.char_std and
#                  self.hierarchy[0][i][2] < 0 and
#                  quick_prd in (u'།', u'་')):
#                    self.small_contour_indices.append(i)

#            import Image
#            Image.fromarray(tmparr*255).convert('L').save('/tmp/examples/%04d.tif' % i)

#        print len(self.small_contour_indices), 'len small contour ind'
#         import Image
#         Image.fromarray(self.img_arr*255).show()
#        print scount
#         raw_input()
        if self.detect_o:
            print 'pre-filtered na-ro vowel', len(self.naros), 'found'

#        for i in self.indices:
#                if cbox[2] > 50: print cbox[2],
#            bx = self.boxes[i]
#            x,y,w,h = bx
#            cv.rectangle(img_arr, (x,y), (x+w, y+h), 0)

#         import Image
#         Image.fromarray(img_arr*255).show()
#        raw_input()
#        for i in self.indices:
#            if self.hierarchy[0][i][2] >= 0:
#                char = self.draw_contour_and_children(i)
#
#                Image.fromarray(char*255).show()
#                raw_input()
#        from matplotlib import pyplot as plt
#        from matplotlib.mlab import normpdf
#        plt.subplot(111)
#        plt.title('tsek-char distributions, pre-segmentation')
#
##        widths = [self.boxes[i][2] for i in self.get_indices()]
#        n,bins,p = plt.hist(outer_widths, 200, range=(0,75), normed=True, color='#3B60FA')
#        plt.vlines([self.char_mean, self.tsek_mean], 0, np.array([max(n), max(n)]), linestyles='--')
#        plt.plot(bins, normpdf(bins, self.tsek_mean, self.tsek_std),  label='fit', linewidth=1)
#        plt.fill_between(bins, normpdf(bins, self.tsek_mean, self.tsek_std), color=(.58,.63,.8), alpha=0.09)
#        plt.plot(bins, normpdf(bins, self.char_mean, self.char_std), label='fit', linewidth=1)
#        plt.fill_between(bins, normpdf(bins, self.char_mean, self.char_std), color=(.58,.63,.8), alpha=0.01)
#        plt.show()

#        print self.tsek_mean, self.tsek_std
#        print len(self.boxes)
#        font_detector.save_info(self.char_mean, self.char_std, self.tsek_mean, self.tsek_std)
#         self.low_ink = False
        if self.low_ink:
            self._low_ink_setting()
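# Hedged sketch of the kind of width modeling char_gaussians appears to perform
# above (its real implementation is not shown here): fit a two-component 1-D
# Gaussian mixture to outer contour widths so the narrow component approximates
# the tsek width distribution and the wide one the character width distribution.
# sklearn's GaussianMixture is used purely for illustration; `fit_width_gaussians`
# is a hypothetical name.
import numpy as np
from sklearn.mixture import GaussianMixture

def fit_width_gaussians(widths):
    '''Return (char_mean, char_std, tsek_mean, tsek_std) estimated from contour widths.'''
    w = np.asarray(widths, dtype=float).reshape(-1, 1)
    gmm = GaussianMixture(n_components=2).fit(w)
    means = gmm.means_.ravel()
    stds = np.sqrt(gmm.covariances_.ravel())
    narrow, wide = np.argsort(means)  # narrow component ~ tsek, wide ~ characters
    return means[wide], stds[wide], means[narrow], stds[narrow]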
Example #3
def recognize_chars_probout(segmentation, tsek_insert_method='baseline', ):
    '''Recognize characters using segmented char data
    
    Parameters:
    --------------------
    segmentation: an instance of PechaCharSegmenter or Segmenter
    
    Returns:
    --------------
    results: list of lists containing [x, y, width, height, prob, unicode], specifying the
    coordinates of each stack's bounding box, its probability, and its unicode
    characters -- for each line of the page'''
    
    results = []
    tsek_mean = segmentation.final_box_info.tsek_mean
    cached_features = segmentation.line_info.shapes.cached_features
    cached_pred_prob = segmentation.line_info.shapes.cached_pred_prob

    for l, vectors in enumerate(segmentation.vectors):
        
        if not vectors:
            print 'no vectors...'
            continue
        
        tmp_result = []

        new_boxes = segmentation.new_boxes[l]
        scale_w = segmentation.final_box_info.transitions[l]

        small_chars = segmentation.line_info.small_cc_lines_chars[l]
        
        #FIXME: define emph lines for line cut
        #### Line Cut has no emph_lines object so need to work around for now...
        emph_markers = getattr(segmentation.line_info, 'emph_lines', [])
        if emph_markers:
            emph_markers = emph_markers[l]
        
        img_arr = segmentation.line_info.shapes.img_arr

        left_edges = [b[0] for b in new_boxes]
        tsek_widths = []
        
        for s in small_chars[::-1]: # consider small chars from the end of the line going backward; backward order is often useful for misplaced tsek and maybe for TOC, though this should be checked
            
            bx = segmentation.line_info.shapes.get_boxes()[s]
            bx = list(bx)
            x,y,w,h = bx
            try:
                feature_vect = cached_features[s]
                inx, probs = cached_pred_prob[s]
                prob = probs[inx]
                prd = dig_to_char[inx]

            except:
                cnt = segmentation.line_info.shapes.contours[s]
                char_arr = np.ones((h,w), dtype=np.uint8)
                offset = (-x, -y)
                drawContours(char_arr, [cnt], -1,0, thickness = -1, offset=offset)
                feature_vect = normalize_and_extract_features(char_arr)
                prd, prob = prd_prob(feature_vect)
            
            insertion_pos = bisect(left_edges, x)

            left_items = 6
            right_items = 5
            if insertion_pos >= len(new_boxes):
                # insertion is at or near the end of the line and needs more left
                # neighbors to compensate for there being fewer chars to define the baseline
                left_items = 12
            elif insertion_pos == 0:
                # same as above, except at the front of the line
                right_items = 12
#            right_items = 5 # bias slightly toward the left. 
            if tsek_insert_method == 'baseline':
                top = 1000000 # arbitrary high number
                bottom = 0
                
                #### Get min or max index to avoid reaching beyond edges of the line
                lower = max(insertion_pos - left_items, 0)
                upper = min(len(new_boxes)-1, insertion_pos+right_items)
                
                left = new_boxes[lower][0]
                right = new_boxes[upper][0] + new_boxes[upper][2]
                if insertion_pos < len(new_boxes):
                    mid = new_boxes[insertion_pos][0] + new_boxes[insertion_pos][2]
                else:
                    mid = right
                for j in new_boxes[lower:upper]:
                    if j[1] < top:
                        top = j[1]
                    if j[1] + j[3] > bottom:
                        bottom = j[1] + j[3]
                local_span = bottom - top

                top, bottom, left, right, mid = [int(np.round(ff)) for ff in [top, bottom, left, right, mid]]
                if prd == u'་' and local_span > 0:
                    left_sum = img_arr[top:bottom,left:mid].sum(axis=1)
                    right_sum = img_arr[top:bottom,mid:right].sum(axis=1)
                    local_baseline_left = top + left_sum.argmin()
                    if mid != right:
                        local_baseline_right = top + right_sum.argmin()
                    else:
                        local_baseline_right = local_baseline_left
                    if ((local_baseline_left >= bx[1] and local_baseline_left <= bx[1] + bx[3]) or 
                    (local_baseline_right >= bx[1] and local_baseline_right <= bx[1] + bx[3])) or (insertion_pos == len(vectors)): #or 
#                    (entire_local_baseline >= bx[1] and entire_local_baseline <= bx[1] + bx[3])):
                        ### Account for fact that the placement of a tsek could be 
                        # before or after its indicated insertion pos
                        ### experimental.. only need with certain fonts e.g. "book 6"
                        ## in samples
                        if insertion_pos <= len(new_boxes):
                            prev_box = new_boxes[insertion_pos-1]
                            left_prev = prev_box[0]
                            if 0 <= x - left_prev < w and 2*w < prev_box[2]:
                                insertion_pos -= 1
                        
                        vectors.insert(insertion_pos, prd)
                        new_boxes.insert(insertion_pos, bx)
                        new_boxes[insertion_pos].append(prob)
                        new_boxes[insertion_pos].append(prd)
                        left_edges.insert(insertion_pos, bx[0])
                        tsek_widths.append(bx[2])
                elif (bx[1] >= top -.25*local_span and bx[1] + bx[3] <= bottom + local_span*.25) or (insertion_pos == len(vectors)):
                    vectors.insert(insertion_pos, prd)
                    new_boxes.insert(insertion_pos, bx)
                    new_boxes[insertion_pos].append(prob)
                    new_boxes[insertion_pos].append(prd)
                    left_edges.insert(insertion_pos, bx[0])
                    
            else:
                vectors.insert(insertion_pos, prd)
                new_boxes.insert(insertion_pos, bx)
                new_boxes[insertion_pos].append(prob)
                new_boxes[insertion_pos].append(prd)
                left_edges.insert(insertion_pos, bx[0])
        
        for em in emph_markers:
            bx = segmentation.line_info.shapes.get_boxes()[em]
            mkinx = segmentation.line_info.shapes.cached_pred_prob[em][0]
            marker = dig_to_char[mkinx]
            marker_prob = segmentation.line_info.shapes.cached_pred_prob[em][1][mkinx]
            
            bx = list(bx)
            x,y,w,h = bx
            bx.append(marker_prob)
            bx.append(marker)
            insertion_pos = bisect(left_edges, x)
            vectors.insert(insertion_pos, marker)
            new_boxes.insert(insertion_pos, bx)
            left_edges.insert(insertion_pos, bx[0])

        if len(vectors) == 1: i = -1
        
        skip_next_n = 0
        for i, v in enumerate(vectors[:-1]):

            if skip_next_n:
                skip_next_n -= 1
                continue

            if new_boxes[i+1][0] - (new_boxes[i][0] + new_boxes[i][2]) >= 2*tsek_mean:
                if not len(new_boxes[i]) == 6 and not isinstance(v, unicode):
                    prd, prob = prd_prob(v)
                else:
                    if len(new_boxes[i]) == 6:
                        prob, prd = new_boxes[i][4:]
                    else:
                        ## v is unicode stack, likely from segmentation step
                        prd = v
                        prob = .95 # NEED ACTUAL PROB

                new_boxes[i].append(prob)
                new_boxes[i].append(prd)
                tmp_result.append(new_boxes[i])
                tmp_result.append([-1,-1,-1,-1, 1.0, u' '])
            else:
                if hasattr(v, 'dtype'):
                    try:
                        prd, prob = prd_prob(v)
                    except:
                        print v
                    
                    new_boxes[i].append(prob)
                    new_boxes[i].append(prd)
                else:
                    if len(new_boxes[i]) == 6:
                        prob, prd = new_boxes[i][4:]
                    else:
                        prd = v
                
                if len(new_boxes[i]) < 6:
                    try:
                        new_boxes[i].append(prob)
                    except:
                        new_boxes[i].append(1)
                    new_boxes[i].append(prd)
                tmp_result.append(new_boxes[i])
            
            
        if hasattr(vectors[-1], 'dtype'):
            prd, prob = prd_prob(vectors[-1])
            new_boxes[-1].append(prob)
            new_boxes[-1].append(prd)
        tmp_result.append(new_boxes[-1])
        results.append(tmp_result)
    return results
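# Hedged illustration of the bisect-based insertion pattern used above (assumed
# helper name, not part of the original API): boxes already on a line are kept
# sorted by their left x edge, and a recognized small char is spliced into both
# the box list and the edge list at the position bisect() reports, preserving
# reading order.
from bisect import bisect

def insert_in_reading_order(new_boxes, left_edges, bx, prd, prob):
    '''Splice box `bx` ([x, y, w, h]) and its prediction into a line at the right x position.'''
    pos = bisect(left_edges, bx[0])
    bx = list(bx)
    bx.append(prob)
    bx.append(prd)
    new_boxes.insert(pos, bx)
    left_edges.insert(pos, bx[0])
    return pos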
Example #4
def recognize_chars_hmm(segmentation, tsek_insert_method='baseline', ):
    '''Recognize characters using segmented char data
    
    Parameters:
    --------------------
    segmentation: an instance of PechaCharSegmenter or Segmenter
    
    Returns:
    --------------
    results: list of lists containing [x, y, width, height, prob, unicode], specifying the
    coordinates of each stack's bounding box, its probability, and its unicode
    characters -- for each line of the page
    '''
    n_states = trans_p.shape[0]
    
    results = []
    tsek_mean = segmentation.final_box_info.tsek_mean
    cached_features = segmentation.line_info.shapes.cached_features
    cached_pred_prob = segmentation.line_info.shapes.cached_pred_prob
#     width_dists = {}
#     times = []
    for l, vectors in enumerate(segmentation.vectors):
        
        if not vectors:
            print 'no vectors...'
            continue
        
        tmp_result = []
        new_boxes = segmentation.new_boxes[l]
        small_chars = segmentation.line_info.small_cc_lines_chars[l]
        
        #FIXME: define emph lines for line cut
        #### Line Cut has no emph_lines object so need to work around for now...
        emph_markers = getattr(segmentation.line_info, 'emph_lines', [])
        if emph_markers:
            emph_markers = emph_markers[l]
        
        img_arr = segmentation.line_info.shapes.img_arr
        left_edges = [b[0] for b in new_boxes]
        tsek_widths = []
        for s in small_chars[::-1]: # consider small chars from the end of the line going backward; backward order is often useful for misplaced tsek and maybe for TOC, though this should be checked
            bx = segmentation.line_info.shapes.get_boxes()[s]
            bx = list(bx)
            x,y,w,h = bx
            try:
                feature_vect = cached_features[s]
                inx, probs = cached_pred_prob[s]
                prob = probs[inx]
                prd = dig_to_char[inx]
#             else:
#                 vect = normalize_and_extract_features(letter)
            except:
                cnt = segmentation.line_info.shapes.contours[s]
                char_arr = np.ones((h,w), dtype=np.uint8)
                offset = (-x, -y)
                drawContours(char_arr, [cnt], -1,0, thickness = -1, offset=offset)
                feature_vect = normalize_and_extract_features(char_arr)
#            prd = classify(feature_vect)
                prd, prob = prd_prob(feature_vect)

#            print prd, max(cls.predict_proba(feature_vect)[0])
            insertion_pos = bisect(left_edges, x)
            left_items = 6
            right_items = 5
            if insertion_pos >= len(new_boxes):
                left_items = 12
            elif insertion_pos == 0:
                # same as above, except at the front of the line
                right_items = 12
            if tsek_insert_method == 'baseline':
                top = 1000000 # arbitrary high number
                bottom = 0
                
                #### Get min or max index to avoid reaching beyond edges of the line
                lower = max(insertion_pos - left_items, 0)
                upper = min(len(new_boxes)-1, insertion_pos+right_items)
                ####
                
                
                left = new_boxes[lower][0]
                right = new_boxes[upper][0] + new_boxes[upper][2]
                if insertion_pos < len(new_boxes):
                    mid = new_boxes[insertion_pos][0] + new_boxes[insertion_pos][2]
                else:
                    mid = right
                for j in new_boxes[lower:upper]:
                    if j[1] < top:
                        top = j[1]
                    try:
                        if j[1] + j[3] > bottom:
                            bottom = j[1] + j[3]
                    except IndexError:
                        print new_boxes[lower:upper]
                        print j
                        raise
                local_span = bottom - top

                left_sum = img_arr[top:bottom,left:mid].sum(axis=1)
                right_sum = img_arr[top:bottom,mid:right].sum(axis=1)
                try:
                    local_baseline_left = top + left_sum.argmin()
                except:
                    local_baseline_left = top 
                    
                if mid != right:
                    local_baseline_right = top + right_sum.argmin()
                else:
                    local_baseline_right = local_baseline_left
                if prd == u'་' and local_span > 0:
                    if ((local_baseline_left >= bx[1] and local_baseline_left <= bx[1] + bx[3]) or 
                    (local_baseline_right >= bx[1] and local_baseline_right <= bx[1] + bx[3])) or (insertion_pos == len(vectors)): #or 
                        if insertion_pos <= len(new_boxes):
                            prev_box = new_boxes[insertion_pos-1]
                            left_prev = prev_box[0]
                            if 0 <= x - left_prev < w and 2*w < prev_box[2]:
                                insertion_pos -= 1

                        new_boxes.insert(insertion_pos, bx)
                        bx.append(prob)
                        bx.append(prd)
                        vectors.insert(insertion_pos, bx)

                        left_edges.insert(insertion_pos, bx[0])
                        tsek_widths.append(bx[2])

                elif ((bx[1] >= top -.25*local_span and bx[1] + bx[3] <= 
                       bottom + local_span*.25) or 
                      (insertion_pos == len(vectors))) and bx[1] - local_baseline_left < 2*tsek_mean:
                    vectors.insert(insertion_pos, prd)
                    new_boxes.insert(insertion_pos, bx)
                    new_boxes[insertion_pos].append(prob)
                    new_boxes[insertion_pos].append(prd)
                    left_edges.insert(insertion_pos, bx[0])
                    
                else:
                    print 'small contour reject at', l, s, 'local height span', local_span, 'box height', bx[3]
            
            else:
                vectors.insert(insertion_pos, prd)
                new_boxes.insert(insertion_pos, bx)
                new_boxes[insertion_pos].append(prob)
                new_boxes[insertion_pos].append(prd)
                left_edges.insert(insertion_pos, bx[0])
        
        for em in emph_markers:
            mkinx = segmentation.line_info.shapes.cached_pred_prob[em][0]
            marker = dig_to_char[mkinx]
            marker_prob = segmentation.line_info.shapes.cached_pred_prob[em][1][mkinx]
            bx = segmentation.line_info.shapes.get_boxes()[em]
            bx = list(bx)
            x,y,w,h = bx
            insertion_pos = bisect(left_edges, x)
            vectors.insert(insertion_pos, marker)
            bx.append(marker_prob)
            bx.append(marker)
            new_boxes.insert(insertion_pos, bx)
            left_edges.insert(insertion_pos, bx[0])
        if len(vectors) == 1: i = -1
        
        skip_next_n = 0
        
        ###HMM PHASE

        allstrs = []
        curstr = []
        allinx = []
        curinx = []
        
        for j, v in enumerate(vectors):
            
            islist = isinstance(v, list)
            if isinstance(v, unicode) or islist:
                allstrs.append(curstr)
                allinx.append(curinx)
                curstr = []
                curinx = []
            else:
                curstr.append(v)
                curinx.append(j)
        if curstr:
            allstrs.append(curstr)
            allinx.append(curinx)
        for f, group in enumerate(allstrs):
            if not group: continue
            try:
                probs = predict_log_proba(group)

            except:
                print v,
                continue  # skip this group rather than hit an undefined `probs` below
#                 raise
            LPROB = len(probs)
            if LPROB == 1:
                inx = probs[0].argmax()
                prb = probs[0][inx]
                prds = [inx]
            else:
                probs = probs.astype(np.float32)

                prb, prds = viterbi_cython(LPROB, n_states, start_p, trans_p, probs)
            prb = np.exp(prb)
            inx = allinx[f]
            for vv, c in enumerate(range(len(prds))):
                ind = inx[c]
                cprob = probs[c].max()
                
                #######replace low prob stacks using svm rbf classifier
                ####### warning: this may undo decisions made by hmm classifier
#                 if np.exp(cprob) <= .98:
#  #                     print prds, type(prds)
#                     print 'replacing', dig_to_char[prds[c]], 'with',
#                     prds[c] = rbfcls.predict(group[vv])[0]
# #                    print prds, type(prds)
# #                    print prds[c]
#                     print dig_to_char[prds[c]]
#                     print 
                #######################
                new_boxes[ind].append(np.exp(cprob))
                try:
                    new_boxes[ind].append(dig_to_char[prds[c]])
                except KeyError:
                    new_boxes[ind].append('PROB')
        for ind, b in enumerate(new_boxes):
            tmp_result.append(new_boxes[ind])
            if not len(new_boxes[ind]) == 6:
                print l, ind, new_boxes[ind], '<-----'
            if ind + 1 < len(new_boxes) and  new_boxes[ind+1][0] - (new_boxes[ind][0] + new_boxes[ind][2]) >= 1.5*tsek_mean:
                tmp_result.append([-1,-1,-1,-1, 1.0, u' '])
            
        results.append(tmp_result)
    return results
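# Hedged, pure-numpy sketch of the Viterbi decoding step that viterbi_cython
# performs above (the cython version is not reproduced here). `start_logp` and
# `trans_logp` are assumed to be log start/transition probabilities over the same
# n_states used by the classifier, and `emit_logprob` the (T, n_states) array of
# per-stack class log-probabilities from predict_log_proba. `viterbi_decode` is a
# hypothetical name.
import numpy as np

def viterbi_decode(emit_logprob, start_logp, trans_logp):
    '''Return (best path log-probability, list of state indices).'''
    T, n = emit_logprob.shape
    score = start_logp + emit_logprob[0]
    back = np.zeros((T, n), dtype=int)
    for t in range(1, T):
        cand = score[:, None] + trans_logp          # cand[i, j] = score of moving i -> j
        back[t] = cand.argmax(axis=0)
        score = cand.max(axis=0) + emit_logprob[t]
    path = [int(score.argmax())]
    for t in range(T - 1, 0, -1):
        path.append(int(back[t, path[-1]]))
    path.reverse()
    return float(score.max()), path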
Example #5
def recognize_chars(segmentation, tsek_insert_method='baseline', ):
    '''Recognize characters using segmented char data
    
    Parameters:
    --------------------
    segmentation: an instance of PechaCharSegmenter or Segmenter
    
    Returns:
    --------------
    results: list of per-line lists of boxes, each with its recognized unicode stack appended'''
    
    results = []

    tsek_mean = segmentation.final_box_info.tsek_mean
    width_dists = {}
    for l, vectors in enumerate(segmentation.vectors):
        
        if not vectors:
            print 'no vectors...'
            continue
        
        tmp_result = []
        new_boxes = segmentation.new_boxes[l]
        
        small_chars = segmentation.line_info.small_cc_lines_chars[l]
        
        #FIXME: define emph lines for line cut
        #### Line Cut has no emph_lines object so need to work around for now...
        emph_markers = getattr(segmentation.line_info, 'emph_lines', [])
        if emph_markers:
            emph_markers = emph_markers[l]
        
        img_arr = segmentation.line_info.shapes.img_arr

        left_edges = [b[0] for b in new_boxes]
        tsek_widths = []

        for s in small_chars[::-1]: # consider small chars from the end of the line going backward; backward order is often useful for misplaced tsek and maybe for TOC, though this should be checked
#        for s in small_chars: # consider small char from end of line going backward. backward useful for misplaced tsek often and maybe for TOC though should check
            cnt = segmentation.line_info.shapes.contours[s]
            bx = segmentation.line_info.shapes.get_boxes()[s]
            bx = list(bx)
            x,y,w,h = bx
            char_arr = np.ones((h,w), dtype=np.uint8)
            offset = (-x, -y)
            drawContours(char_arr, [cnt], -1,0, thickness = -1, offset=offset)
            feature_vect = normalize_and_extract_features(char_arr)
            prd = classify(feature_vect)

            insertion_pos = bisect(left_edges, x)

            left_items = 6
            right_items = 5
            if insertion_pos >= len(new_boxes):
                # insertion is at or near the end of the line and needs more left
                # neighbors to compensate for there being fewer chars to define the baseline
                left_items = 12
            elif insertion_pos == 0:
                # same as above, except at the front of the line
                right_items = 12

            if tsek_insert_method == 'baseline':
                top = 1000000 # arbitrary high number
                bottom = 0
                
                #### Get min or max index to avoid reaching beyond edges of the line
                lower = max(insertion_pos - left_items, 0)
                upper = min(len(new_boxes)-1, insertion_pos+right_items)
                ####
                
                left = new_boxes[lower][0]
                right = new_boxes[upper][0] + new_boxes[upper][2]
                if insertion_pos < len(new_boxes):
                    mid = new_boxes[insertion_pos][0] + new_boxes[insertion_pos][2]
                else:
                    mid = right
                for j in new_boxes[lower:upper]:
                    if j[1] < top:
                        top = j[1]
                    if j[1] + j[3] > bottom:
                        bottom = j[1] + j[3]
                local_span = bottom - top

                if prd == u'་' and local_span > 0:

                    left_sum = img_arr[top:bottom,left:mid].sum(axis=1)
                    right_sum = img_arr[top:bottom,mid:right].sum(axis=1)
                    local_baseline_left = top + left_sum.argmin()
                    if mid != right:
                        local_baseline_right = top + right_sum.argmin()
                    else:
                        local_baseline_right = local_baseline_left
                    
                    if ((local_baseline_left >= bx[1] and local_baseline_left <= bx[1] + bx[3]) or 
                    (local_baseline_right >= bx[1] and local_baseline_right <= bx[1] + bx[3])): #or 
#                    (entire_local_baseline >= bx[1] and entire_local_baseline <= bx[1] + bx[3])):
                        ### Account for fact that the placement of a tsek could be 
                        # before or after its indicated insertion pos
                        ### experimental.. only need with certain fonts e.g. "book 6"
                        ## in samples
                        if insertion_pos <= len(new_boxes):
    #                        cur_box_in_pos = new_boxes[insertion_pos]
                            prev_box = new_boxes[insertion_pos-1]
    #                        left_cur = cur_box_in_pos[0]
                            left_prev = prev_box[0]
                            if 0 <= x - left_prev < w and 2*w < prev_box[2]:
                                insertion_pos -= 1

                        vectors.insert(insertion_pos, prd)
                        new_boxes.insert(insertion_pos, bx)
                        left_edges.insert(insertion_pos, bx[0])
                        tsek_widths.append(bx[2])

                elif bx[1] >= top -.25*local_span and bx[1] + bx[3] <= bottom + local_span*.25:
                    vectors.insert(insertion_pos, prd)
                    new_boxes.insert(insertion_pos, bx)
                    left_edges.insert(insertion_pos, bx[0])
            
            else:
                vectors.insert(insertion_pos, prd)
                new_boxes.insert(insertion_pos, bx)
                left_edges.insert(insertion_pos, bx[0])
        
        tsek_mean = np.mean(tsek_widths)
        
        for em in emph_markers:
            marker = dig_to_char[segmentation.line_info.shapes.cached_pred_prob[em][0]]
            marker_prob = segmentation.line_info.shapes.cached_pred_prob[em][1]
            bx = segmentation.line_info.shapes.get_boxes()[em]
            bx = list(bx)
            x,y,w,h = bx
            insertion_pos = bisect(left_edges, x)
            bx.append(marker_prob)
            bx.append(marker)
            vectors.insert(insertion_pos, marker)
            new_boxes.insert(insertion_pos, bx)
            left_edges.insert(insertion_pos, bx[0])
#        tsek_std = np.std(tsek_widths)
        if len(vectors) == 1: i = -1
        
        for i, v in enumerate(vectors[:-1]):
            if new_boxes[i+1][0] - (new_boxes[i][0] + new_boxes[i][2]) >= 2*tsek_mean:
                if not isinstance(v, unicode):
                    prd = classify(v, pca_trans=PCA_TRANS, multi=False)
                else:
                    prd = v

                new_boxes[i].append(prd)
                tmp_result.append(new_boxes[i])
                tmp_result.append([-1,-1,-1,-1, u' '])
            else:
                if not isinstance(v, unicode):
                    prd = classify(v, pca_trans=PCA_TRANS, multi=False)

                    ### Assume that a tsek shouldn't show up at this point
                    ### a more reliable way to do this is to better
#                    if prd == u'་':
#                        prbs = cls.predict_proba(v)[0]
#                        ind_probs = zip(range(len(prbs)), prbs)
#                        ind_probs.sort(key=lambda x: x[1])
#                        prd = dig_to_char[ind_probs[-2][0]]
                else:
                    prd = v
                
                if not width_dists.get(prd):
                    width_dists[prd] = [new_boxes[i][2]]
                else:
                    width_dists[prd].append(new_boxes[i][2])
                
                new_boxes[i].append(prd)
                tmp_result.append(new_boxes[i])
            
        if not isinstance(vectors[-1], unicode):
            prd = classify(vectors[-1], pca_trans=PCA_TRANS, multi=False)
        else:
            prd = vectors[-1]
        new_boxes[-1].append(prd)
        tmp_result.append(new_boxes[-1])
        results.append(tmp_result)

    return results
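# Hedged helper (assumed result layout, hypothetical name, not part of the
# original API): each row produced above ends with the recognized unicode stack,
# with [-1, -1, -1, -1, u' '] rows standing in for inter-word spaces, so a page
# can be flattened to text by joining the last element of every row.
def results_to_text(results):
    '''Join per-line recognition results into a single unicode string.'''
    return u'\n'.join(u''.join(unicode(row[-1]) for row in line) for line in results)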
Example #6
    def __init__(self, shapes, k):
        from sklearn.cluster import KMeans
        self.shapes = shapes
        self.k = k
        self.page_array = shapes.img_arr

        if shapes.conf['line_cluster_pos'] == 'top':
            tops = array(shapes.get_tops(), dtype=float64)
        elif shapes.conf['line_cluster_pos'] == 'center':
            tops = array(
                 [t[1] + .5*shapes.char_mean for t in shapes.get_boxes() if t[3] > 2* shapes.tsek_mean],
                    dtype=float64
                         )
        else:
            raise ValueError("The line_cluster_pos argument must be either 'top' or 'center'")

        tops.shape = (len(tops), 1)
        
        kmeans = KMeans(n_clusters=k)
#         print tops
        kmeans.fit(tops)
        
        
        ################## 
        ######## mark cluster centroids on original image and show them
#        img_arr = shapes.img_arr.copy()
#        for centroid in kmeans.cluster_centers_:
##            print centroid[0]
#            img_arr[centroid[0],:] = 0
#            
#        import Image
#        Image.fromarray(img_arr*255).show()
        #######################
        
        lines = [[] for i in range(k)]
        
        ind = shapes.get_indices()
        
        ### Assign char pointers (ind) to the appropriate line ###
#        [lines[kmeans.labels_[i]].append(ind[i]) for i in range(len(ind))]
        [lines[kmeans.predict(shapes.get_boxes()[ind[i]][1])[0]].append(ind[i]) for i in range(len(ind))]
        lines = [l for l in lines if l]
        self.k = len(lines)
        boxes = shapes.get_boxes()
        
        
        ### Sort indices so they are in order from top to bottom using y from the first box in each line
        
        sort_inx = list(argsort([boxes[line[0]][1] for line in lines]))
        lines.sort(key=lambda line: boxes[line[0]][1])
        
        ### Get breaklines for splitting up lines
        ### Uses the topmost box in each line cluster to determine breakline
        
        try:
            topmosts = [min([boxes[i][1] for i in line]) for line in lines]
        except ValueError:
            print 'failed to get topmosts...'
            raise
        
        vsums = self.page_array.sum(axis=1)
        breaklines = []
        delta = 25
        for c in topmosts:
            if c - delta < 0:
                lower = 0
            else:
                lower = c-delta
            e = argmax(vsums[lower:c+delta])
            c = c - delta + e
            if c < 0:
                c = 0
            breaklines.append(c)
    
        breaklines.append(self.page_array.shape[0])
        self.baselines = []
        
        for i, br in enumerate(breaklines[:-1]):
            
            try:
                baseline_area = vsums[br:breaklines[i+1]]
                if baseline_area.any():
                    self.baselines.append(br + argmin(baseline_area))
                else:
                    print i
                    print 'No baseline info'
            except ValueError:
                print 'ValueError. exiting...HERE'
                import traceback;traceback.print_exc()
                
                raise

        final_ind = dict((i, []) for i in range(len(lines)))
        self.new_contours = {}
        for j, br in enumerate(breaklines[1:-1]):
            topcount = 0
            bottomcount = 0
            for i in lines[j]:
                # if a char extends into the next line, break it
                # 253 is roughly the global line height avg + 1 std
                # The following condition says that a box/char must extend over the
                # breakline by a non-trivial amount (e.g. 30 px) and must itself
                # be a tall-ish box (roughly the height of an average line) in order
                # for it to be broken.
    #            if (bounding[i][1] + bounding[i][3]) - br >= 30 and bounding[i][3] > 205:
                if (boxes[i][1] + boxes[i][3]) - br >= 30 and \
                    (boxes[i][1] + boxes[i][3]) - topmosts[j] > self.shapes.char_mean*2.85:
                    chars = ones((boxes[i][3]+2, boxes[i][2]+2), dtype=uint8)
                    contours = shapes.contours
                    cv.drawContours(chars, [contours[i]], -1,0, \
                        thickness = -1, offset=(-boxes[i][0]+1,-boxes[i][1]+1))
                    cv.dilate(chars, None, chars)
                    y_offset = boxes[i][1]
                    new_br = br - y_offset
                    prd_cut = []

                    ### Iterate through potential cut-points and cut where the top
                    ### half has the highest probability of being something other than a tsek
#                     print 'bottom bound cut point', int(.75*shapes.tsek_mean)
                    for delta in range(-3, int(.75*shapes.tsek_mean), 1):
#                     for delta in range(-3, 100, 1):
                        cut_point = new_br + delta
#                        chars[cut_point, :] = 0
#                        import Image
#                        Image.fromarray(chars*255).show()
                        tchr = chars[:cut_point,:]
                        tchr = ftrim(tchr)
                        if not tchr.any():
                            continue
                        tchr = normalize_and_extract_features(tchr)
                        probs = cls.predict_proba(tchr)
                        max_prob_ind = argmax(probs)
                        chr = label_chars[max_prob_ind]
                        prd_cut.append((probs[0,max_prob_ind], chr, cut_point))
                    
                    prd_cut = [q for q in prd_cut if q[1] != u'་']
                    try:
                        cut_point = max(prd_cut)[-1]
                    except:
                        print 'No max prob for vertical char break, using default breakline. Usually this means the top half of the attempted segmentation looks like a tsek blob'
                        cut_point = br-boxes[i][1]

                    ####### FOLLOWING (NOT WORKING) ATTEMPT TO GET A BETTER BREAKLINE
    #                br2 = br-bounding[i][1]
    #                
    #                csum = chars.sum(axis=1)
    #                bzone = csum[br2-25:br2+40]
    #                if bzone.any():
    #                    br2 = np.argmax(bzone) + (br-25)
    ##                    print br, 'br'
    #                chars = chars*255
    #                nbr = br
    #                cv.line(chars, (0, br2), (chars.shape[1], br2), 0)
    #                Image.fromarray(chars).save('/tmp/outt.tiff')
    #                sys.exit()
                    #############
                    
                    tarr = chars[:cut_point,:]
                    tarr, top_offset = ftrim(tarr, new_offset=True)
                    tarr = fadd_padding(tarr, 3)
                    barr = chars[cut_point:,:]
                    barr = ftrim(barr, sides='brt') 
                    barr = fadd_padding(barr, 3)
                    
                    c1, h = cv.findContours(image=tarr, mode=cv.RETR_LIST, method=cv.CHAIN_APPROX_SIMPLE, offset=(boxes[i][0]+top_offset['left'],boxes[i][1]))

                    c1 = c1[argmax([len(t) for t in c1])] # use the most complex contour

                    bnc1 = cv.boundingRect(c1)

                    c2, h = cv.findContours(barr, mode=cv.RETR_LIST, 
                                            method=cv.CHAIN_APPROX_SIMPLE,
                                            offset=(boxes[i][0]-3,boxes[i][1]+cut_point-3))

                    c2 = c2[argmax([len(t) for t in c2])]
                    bnc2 = cv.boundingRect(c2)

                    topbox_name = 't%d_%d' % (j, topcount)
                    final_ind[j].append(topbox_name)
                    self.new_contours[topbox_name] = (bnc1, c1)
                    topcount += 1
                    
                    if bnc2[-1] > 8: #only add bottom contour if not trivially small
                        bottombox_name = 'b%d_%d' % (j, bottomcount)
                        final_ind[j+1].append(bottombox_name)
                        self.new_contours[bottombox_name] = (bnc2, c2)
                        bottomcount += 1
                    
                else:
                    final_ind[j].append(i)
            # Don't forget to include the last line
        map(final_ind[len(lines)-1].append, lines[len(lines)-1])
        
        self.lines_chars = final_ind
        
        cctops = [self.shapes.get_boxes()[i][1] for i in self.shapes.small_contour_indices]
        char_tops = zip(cctops, self.shapes.small_contour_indices)
        char_tops.sort(key=lambda x: x[0])
        sorted_indices = [i[1] for i in char_tops]
        _line_insert_indxs = []
        _line_insert_indxs.extend([bisect_right(char_tops, (i - 1,))
                                   for i in breaklines])
        self.small_cc_lines_chars = []
        if not _line_insert_indxs: sys.exit()

        for i, l in enumerate(_line_insert_indxs[:-1]):
            self.small_cc_lines_chars.append(sorted_indices[l:_line_insert_indxs[i+1]])
        
        self.small_cc_lines_chars.append(sorted_indices[_line_insert_indxs[-1]:])
        
        self.small_cc_lines_chars = [self.small_cc_lines_chars[i] for i in range(len(self.lines_chars)) if self.lines_chars[i]]
       
        cctops = [self.shapes.get_boxes()[i][1] for i in self.shapes.emph_symbols]
        char_tops = zip(cctops, self.shapes.emph_symbols)
        char_tops.sort(key=lambda x: x[0])

        empred = [kmeans.predict(shapes.get_boxes()[i][1])[0] for i in self.shapes.emph_symbols]
        
        self.emph_lines = [[] for i in range(k)]
        for nn, e in enumerate(empred):
            self.emph_lines[sort_inx.index(e)].append(self.shapes.emph_symbols[nn])
        
    
        if self.shapes.detect_o:
            cctops = [self.shapes.get_boxes()[i][1] for i in self.shapes.naros]
            char_tops = zip(cctops, self.shapes.naros)
            char_tops.sort(key=lambda x: x[0])
            sorted_indices = [i[1] for i in char_tops]
            _line_insert_indxs = []
            _line_insert_indxs.extend([bisect_right(char_tops, (i - 1,))
                                       for i in breaklines])
            
            if not _line_insert_indxs: sys.exit()
            
            self.line_naros = []
            for i, l in enumerate(_line_insert_indxs[:-1]):
    #   
                self.line_naros.append(sorted_indices[l:_line_insert_indxs[i+1]])
            
            self.line_naros.append(sorted_indices[_line_insert_indxs[-1]:])
            
            self.line_naros  = [self.line_naros[i] for i in range(len(self.lines_chars)) if self.lines_chars[i]]
            self.line_naro_spans = []
            for ll, mm in enumerate(self.line_naros):
                thisline = []
                for nn, naro in enumerate(mm):
                    box = self.get_box(naro)
                    thisline.append(box)
                thisline.sort(key=lambda x: x[0])
                self.line_naros[ll].sort(key=lambda x: self.get_box(x)[0])
                self.line_naro_spans.append(thisline)
    
        if self.shapes.low_ink:
            
            cctops = [lib[1] for lib in self.shapes.low_ink_boxes]
            char_tops = zip(cctops, self.shapes.low_ink_boxes)
            char_tops.sort(key=lambda x: x[0])
            sorted_indices = [i[1] for i in char_tops]
            _line_insert_indxs = []
            _line_insert_indxs.extend([bisect_right(char_tops, (i - 1,))
                                       for i in breaklines])
            
            self.low_ink_boxes = []
            if not _line_insert_indxs: sys.exit()
    
            for i, l in enumerate(_line_insert_indxs[:-1]):
                self.low_ink_boxes.append(sorted_indices[l:_line_insert_indxs[i+1]])
            
            self.low_ink_boxes.append(sorted_indices[_line_insert_indxs[-1]:])
            
            self.low_ink_boxes = [self.low_ink_boxes[i] for i in range(len(self.lines_chars)) if self.lines_chars[i]]
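# Hedged, self-contained sketch of the line-clustering idea used above: cluster
# the top y coordinate of each contour bounding box with KMeans (k = expected
# number of lines) and read the cluster label off as the line assignment. The
# real class additionally computes breaklines, baselines, and splits tall boxes
# that straddle two lines. `cluster_lines` is a hypothetical name.
import numpy as np
from sklearn.cluster import KMeans

def cluster_lines(boxes, k):
    '''Group box indices into up to k lines ordered top to bottom; boxes are (x, y, w, h).'''
    tops = np.array([[b[1]] for b in boxes], dtype=np.float64)
    km = KMeans(n_clusters=k).fit(tops)
    lines = [[] for _ in range(k)]
    for i, lab in enumerate(km.labels_):
        lines[lab].append(i)
    lines = [sorted(l, key=lambda i: boxes[i][0]) for l in lines if l]  # left-to-right within a line
    lines.sort(key=lambda l: boxes[l[0]][1])                            # top-to-bottom across lines
    return lines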
Example #7
    def construct_vector_set_experimental(self):

        NINF = -np.inf

        final_box_info = CombineBoxesForPage(self.line_info)

        self.final_box_info = final_box_info
        final_boxes = final_box_info.final_boxes

        final_indices = final_box_info.final_indices
        scales = final_box_info.transitions

        self.vectors = [[] for i in range(self.line_info.k)]
        self.new_boxes = [[] for i in range(self.line_info.k)]  #
        cur_mean = self.final_box_info.char_mean
        cur_std = self.final_box_info.char_std
        BREAKWIDTH = self.breakwidth
        rbfcls = self.line_info.rbfcls
        for l in range(len(final_indices)):  # for each line
            try:
                scale_l = scales[l]
            except:
                print 'ERROR AT ', l, len(scales)
                raise
            char_mean_int = floor(final_box_info.char_mean)
            char_std_int = ceil(final_box_info.char_std)

            try:
                lb = range(len(final_indices[l]))
            except IndexError:
                print 'index error'
                continue

            segmented = 0
            for i in lb:  # for each line box

                ## New draw, takes into account tree hierarchy of contours
                x, y, w, h = final_boxes[l][i]
                letter = ones((h, w), dtype=uint8)
                for k in final_indices[l][i]:
                    if not isinstance(k, str):
                        letter = self.line_info.shapes.draw_contour_and_children(
                            k, char_arr=letter, offset=(-x, -y))
                    else:
                        cv.drawContours(letter,
                                        [self.line_info.get_contour(k)],
                                        -1,
                                        0,
                                        thickness=-1,
                                        offset=(-x, -y))

                letter = cv.resize(letter,
                                   dsize=(0, 0),
                                   fx=scale_l,
                                   fy=scale_l)
                if letter.shape[1] >= (final_box_info.char_mean +
                                       BREAKWIDTH * final_box_info.char_std
                                       ):  # if a box is too large, break it
                    #
                    segmented += 1
                    sw = w * scale_l
                    sh = h * scale_l
                    vsum = letter.sum(axis=0)
                    chars = sw // (final_box_info.char_mean -
                                   1.5 * final_box_info.char_std
                                   )  # important, floor division

                    if 10.0 > chars > 1.0:  # Assume chars-to-be-broken don't span > 10
                        #                     if chars:
                        w = sw
                        h = sh

                        best_box_dim = []
                        best_prob = 0.0
                        best_seq = None
                        ## Iterate through a range of variable chars if
                        ## chars is greater than 2. This allows potential
                        ## breaks for chars-1, chars-2
                        #                         all_choices = []

                        for chars in range(int(chars), 1, -1):

                            for z in range(0, 21, 2):
                                segs = []
                                prev_breakline = 0
                                for pos in range(int(chars - 1)):
                                    if char_mean_int - z >= 0:

                                        upper_range = [
                                            int(
                                                np.round((pos + 1) *
                                                         (char_mean_int - z))),
                                            int(
                                                np.round((pos + 1) *
                                                         (char_mean_int + z)))
                                        ]
                                        vsum_range = vsum[
                                            upper_range[0]:upper_range[1]]

                                        if vsum_range.any():
                                            breakline = int(
                                                np.round((pos + 1) *
                                                         (char_mean_int - z) +
                                                         argmax(vsum_range)))
                                        else:
                                            breakline = None

                                        if breakline:
                                            sg = letter[:, prev_breakline:
                                                        breakline]

                                            prev_breakline = breakline
                                        else:
                                            sg = letter[:,
                                                        int(
                                                            np.round(pos * (
                                                                char_mean_int -
                                                                z))
                                                        ):int(
                                                            np.
                                                            round((pos + 1) * (
                                                                char_mean_int -
                                                                z)))]
                                            prev_breakline = int(
                                                np.round((pos + 1) *
                                                         (char_mean_int - z)))

                                        segs.append(sg)

                                segs.append(
                                    letter[:,
                                           int(
                                               np.round((chars - 1) *
                                                        (char_mean_int -
                                                         z))):])

                                segs = [fadd_padding(sg, 2) for sg in segs]
                                seg_ctrs = [
                                    cv.findContours(
                                        sg.copy(),
                                        mode=cv.RETR_CCOMP,
                                        method=cv.CHAIN_APPROX_SIMPLE)
                                    for sg in segs
                                ]
                                try:
                                    seg_bxs = [[
                                        cv.boundingRect(k) for k in sgc[0]
                                    ] for sgc in seg_ctrs]
                                except:
                                    print seg_ctrs  # dump the contour results that broke bounding-box extraction
                                    raise

                                bxs = []
                                nsegs = []

                                prev_w = 0
                                for zi, ltb in enumerate(seg_bxs):
                                    seg = segs[zi]
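                                    # Erase components that are narrow or short (roughly tsek-sized,
                                    # i.e. below tsek_mean + 4 * tsek_std) by painting them background
                                    # so stray marks do not distort the trimmed segment box.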
                                    for b in ltb:
                                        if b[2] < (
                                                final_box_info.tsek_mean +
                                                4 * final_box_info.tsek_std
                                        ) or b[3] < final_box_info.tsek_mean + 4 * final_box_info.tsek_std:
                                            seg[b[1] - 1:b[1] + b[3] + 1,
                                                b[0] - 1:b[0] + b[2] +
                                                1] = True
                                    seg, ofst = ftrim(seg, new_offset=True)
                                    bx = [
                                        x + prev_w + (ofst['left'] / scale_l),
                                        y + (ofst['top'] / scale_l),
                                        seg.shape[1] / scale_l,
                                        seg.shape[0] / scale_l
                                    ]
                                    prev_w += seg.shape[1] / scale_l
                                    bxs.append(bx)
                                    nsegs.append(seg)

                                xt = [
                                    normalize_and_extract_features(sg)
                                    for sg in nsegs if 0 not in sg.shape
                                ]
                                prd_probs = cls.predict_log_proba(xt)
                                prd_probs = prd_probs.astype(np.float32)
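                                # Score this candidate segmentation: Viterbi decoding over the
                                # per-segment log class probabilities (start_p / trans_p model);
                                # the best-scoring candidate across all (chars, z) choices is kept.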

                                prob, prds = viterbi_cython(
                                    prd_probs.shape[0], n_states, start_p,
                                    trans_p, prd_probs)
                                prob = np.exp(prob)

                                if prob > best_prob:
                                    best_prob = prob
                                    best_seq = prds
                                    best_box_dim = bxs
                                    best_xt = xt

                        if not best_box_dim:  # fall back to the last candidate tried (e.g. all probabilities underflowed to 0.0)
                            best_prob = prob
                            best_seq = prds
                            best_box_dim = bxs
                            best_xt = xt

                        for u in range(len(best_seq)):
                            self.vectors[l].append(label_chars[best_seq[u]])
                            best_box = best_box_dim[u]
                            best_box = [int(np.round(ii)) for ii in best_box]
                            best_box.append(best_prob)
                            best_box.append(label_chars[best_seq[u]])
                            self.new_boxes[l].append(best_box)

                            try:
                                self.line_info.shapes.img_arr[
                                    best_box[1]:best_box[1] + best_box[3],
                                    best_box[0] + best_box[2]] = 1
                            except:
                                pass  # box marker column fell outside the image

                    else:
                        self.new_boxes[l].append([x, y, w, h])
                        vect = normalize_and_extract_features(letter)
                        self.vectors[l].append(vect)

                else:
                    self.new_boxes[l].append([x, y, w, h])
                    vect = normalize_and_extract_features(letter)
                    self.vectors[l].append(vect)

        if not any(self.vectors):
            print 'no vectors'
            return
        else:
            if self.line_info.shapes.detect_o:

                for i, l in enumerate(self.new_boxes):
                    for n in self.line_info.line_naros[i]:
                        box = self.line_info.get_box(n)
                        x, y, w, h = box
                        r0 = x + w
                        for k, b in enumerate(l):
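                            # The test below is the horizontal intersection of the naro box and
                            # candidate box b divided by the narrower width (using
                            # min(a, b) = (a + b - |a - b|) / 2); > .8 means the naro sits almost
                            # entirely over this box, so the two are merged.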
                            if ((b[2] + w) - abs(b[0] - x) - abs(
                                (b[0] + b[2]) - r0)) / (
                                    2 * float(min(w, b[2]))) > .8:
                                try:
                                    nbox = list(combine_many_boxes([box, b]))
                                except:
                                    print box, b  # the pair of boxes that could not be combined
                                    raise
                                if isinstance(self.vectors[i][k], unicode):
                                    self.vectors[i][k] += u'ོ'
                                    nbox = b
                                    nbox[-1] = self.vectors[i][k]
                                else:
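                                    # Still a raw feature vector: classify it now (rbfcls is presumably
                                    # the slower, stronger classifier), keep the fast classifier's top
                                    # log-probability, and append the naro vowel to the prediction.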
                                    probs = cls.predict_log_proba(
                                        self.vectors[i][k])
                                    mx = np.argmax(probs)
                                    prob = probs[0][mx]
                                    mx = rbfcls.predict(self.vectors[i][k])[0]
                                    ch = label_chars[mx] + u'ོ'
                                    self.vectors[i][k] = ch
                                    nbox.append(prob)
                                    nbox.append(ch)
                                self.new_boxes[i][k] = nbox
Exemple #8
0
    def construct_vector_set_stochastic(self):
        # separate attached tsek
        # note this may not go here exactly, but somewhere in this function
        if self.line_info.shapes.conf.get('detach_tsek'):
            self._detach_tsek()

        final_box_info = CombineBoxesForPage(self.line_info)

        self.final_box_info = final_box_info
        final_boxes = final_box_info.final_boxes

        final_indices = final_box_info.final_indices
        scales = final_box_info.transitions
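        # `transitions` holds one scale factor per line: glyphs are resized by
        # scale_l before feature extraction and boxes are mapped back to page
        # coordinates with 1 / scale_l.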

        self.vectors = [[] for i in range(self.line_info.k)]
        self.new_boxes = [[] for i in range(self.line_info.k)]  #

        BREAKWIDTH = self.breakwidth

        for l in range(len(final_indices)):  # for each line
            try:
                scale_l = scales[l]
                oo_scale_l = 1.0 / scale_l
            except:
                print 'ERROR AT ', l, len(scales)
                raise
            try:
                lb = range(len(final_indices[l]))
            except IndexError:
                continue

            segmented = 0
            for i in lb:  # for each line box

                ## New draw, takes into account tree hierarchy of contours
                x, y, w, h = final_boxes[l][i]
                letter = ones((h, w), dtype=uint8)
                lindices = final_indices[l][i]
                len_lindices = len(lindices)
                for k in lindices:
                    if not isinstance(k, str):
                        letter = self.line_info.shapes.draw_contour_and_children(
                            k, char_arr=letter, offset=(-x, -y))
                    else:
                        cv.drawContours(letter,
                                        [self.line_info.get_contour(k)],
                                        -1,
                                        0,
                                        thickness=-1,
                                        offset=(-x, -y))

                if w * scale_l >= 1 and h * scale_l >= 1:
                    letter = cv.resize(letter,
                                       dsize=(0, 0),
                                       fx=scale_l,
                                       fy=scale_l)

                if letter.shape[1] >= (final_box_info.char_mean +
                                       BREAKWIDTH * final_box_info.char_std
                                       ):  # if a box is too large, break it
                    sw = w * scale_l
                    sh = h * scale_l
                    chars = sw // (final_box_info.char_mean -
                                   1.5 * final_box_info.char_std
                                   )  # important, floor division
                    chars = min(chars, 4)
                    if chars > 1.0:

                        w = sw
                        h = sh

                        all_choices = []

                        for chars in range(int(chars), 0, -1):
                            #                             if l == 1:
                            if self.line_info.shapes.detect_o:
                                line_num = l
                            else:
                                line_num = None
                            all_choices.append(
                                self._sample_widths_method(chars,
                                                           letter,
                                                           final_boxes[l][i],
                                                           oo_scale_l,
                                                           line_num=line_num))
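                            # _sample_widths_method returns (sequence log-probability, per-segment
                            # results); max() over all candidate character counts therefore keeps
                            # the most probable segmentation.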

                        ## Append complete recognition results to vector list

                        mx = max(all_choices)
                        for v in mx[-1]:
                            self.new_boxes[l].append(v)
                            self.vectors[l].append(v)
                            self.line_info.shapes.img_arr[v[1]:v[1] + v[3],
                                                          v[0] + v[2]] = 1

                    else:
                        self.new_boxes[l].append([x, y, w, h])
                        if len_lindices == 1:
                            try:
                                vect = self.cached_features[lindices[0]]
                            except KeyError:  # feature not cached for this index; recompute below
                                vect = normalize_and_extract_features(letter)
                        else:
                            vect = normalize_and_extract_features(letter)
                        self.vectors[l].append(vect)

                else:
                    self.new_boxes[l].append([x, y, w, h])
                    if len_lindices == 1:
                        try:
                            vect = self.cached_features[lindices[0]]
                        except KeyError:
                            vect = normalize_and_extract_features(letter)
                    else:
                        vect = normalize_and_extract_features(letter)
                    self.vectors[l].append(vect)

        if not any(self.vectors):
            print 'no vectors'
            return
        else:
            if self.line_info.shapes.detect_o:

                for i, line in enumerate(self.new_boxes):

                    used_boxes = set()
                    for n in self.line_info.line_naros[i]:
                        if n in used_boxes:
                            continue
                        box = self.line_info.get_box(n)
                        x, y, w, h = box
                        for k, box1 in enumerate(line):
                            assert isinstance(
                                box1,
                                (list, tuple)), 'error - {}-{}-{}'.format(
                                    str(box1), i, k)
                            assert isinstance(box, (list, tuple)), box
                            try:
                                overlap = check_for_overlap(box1, box)
                            except:
                                print i, k, box1, 'BOX problem'
                                overlap = False  # skip this pair rather than reusing a stale value
                            if overlap:
                                used_boxes.add(n)

                                try:
                                    nbox = list(combine_many_boxes([box,
                                                                    box1]))
                                except:
                                    print box, box1  # the pair of boxes that could not be combined
                                    raise
                                if isinstance(self.vectors[i][k], unicode):
                                    self.vectors[i][k] += u'ོ'
                                    nbox = box1
                                    nbox[-1] = self.vectors[i][k]
                                elif isinstance(self.vectors[i][k], list):
                                    if not self.vectors[i][k][-1][-1] == u'ོ':
                                        pchar = self.vectors[i][k][-1] + u'ོ'
                                        self.vectors[i][k][-1] = pchar
                                    nbox = self.vectors[i][k]
                                else:
                                    probs = cls.predict_log_proba(
                                        self.vectors[i][k])
                                    mx = np.argmax(probs)
                                    prob = probs[0][mx]
                                    ch = label_chars[mx] + u'ོ'
                                    self.vectors[i][k] = ch
                                    nbox.append(prob)
                                    nbox.append(ch)
                                self.new_boxes[i][k] = nbox
Exemple #9
0
    def _sample_widths_method(self,
                              chars,
                              letter,
                              letter_box,
                              oo_scale_l,
                              line_num=None):
        x, y, w, h = letter_box

        ################default
        cur_mean = self.final_box_info.char_mean * .97
        cur_std = .295 * self.final_box_info.char_std
        #################
        best_prob = -np.inf

        if chars > 1:
            letter = cv.dilate(letter.copy(), None, iterations=1)

            padding_amount = 3

            for n in range(15):
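                # Random-restart search: sample candidate segment widths from a Gaussian
                # around the (slightly shrunken) character mean, score each hypothesis by
                # width likelihood plus Viterbi sequence likelihood, and keep the best.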

                widths = [gauss(cur_mean, cur_std) for i in range(chars)]
                prev = 0
                vecs = []
                wdthprobs = 0
                boxes = []
                for i, val in enumerate(widths):
                    if i == chars - 1:
                        end = letter.shape[1]
                    else:
                        end = prev + val
                    wdthprobs += gausslogprob(cur_mean, cur_std, end - prev)

                    s = fadd_padding(letter[:, int(prev):int(end)],
                                     padding_amount)
                    _, ctrs, hier = cv.findContours(
                        s.copy(),
                        mode=cv.RETR_TREE,
                        method=cv.CHAIN_APPROX_NONE)
                    bounding = map(boundingRect, ctrs)
                    for k, b in enumerate(bounding):
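                        # Erase small ink components (under 23 px wide or tall) that hang
                        # directly off the outer region, treating them as noise or spill-over
                        # fragments, before trimming.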
                        if (b[2] < 23 or b[3] < 23) and hier[0][k][3] == 0:
                            s[b[1] - 1:b[1] + b[3] + 1,
                              b[0] - 1:b[0] + b[2] + 1] = 1
                    s = s[padding_amount:-padding_amount,
                          padding_amount:-padding_amount]
                    s, ofst = ftrim(s, new_offset=True)

                    if 0 not in s.shape:
                        nnbox = [
                            x + (prev + ofst['left']) * oo_scale_l,
                            y + (ofst['top'] * oo_scale_l),
                            s.shape[1] * oo_scale_l, s.shape[0] * oo_scale_l
                        ]
                        if line_num is not None:
                            naro = self.line_info.check_naro_overlap(
                                line_num, nnbox)
                            if naro is not False:  # index 0 is a valid result, so compare identity with False

                                naro_box = self.line_info.get_box(naro)
                                nnbox = combine_many_boxes([nnbox, naro_box])
                                ss = cv.resize(s,
                                               dsize=(0, 0),
                                               fx=oo_scale_l,
                                               fy=oo_scale_l)
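                                # Pad the resized segment (rows above, columns to the right) out to
                                # the combined box, then draw the naro contour into it so the
                                # extracted features include the vowel.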
                                ss = np.vstack((ones(
                                    (nnbox[3] - ss.shape[0], ss.shape[1]),
                                    dtype=ss.dtype), ss))
                                ss = hstack(
                                    (ss,
                                     ones(
                                         (ss.shape[0], nnbox[2] - ss.shape[1]),
                                         dtype=ss.dtype)))

                                cv.drawContours(
                                    ss, [self.line_info.get_contour(naro)],
                                    -1,
                                    0,
                                    thickness=-1,
                                    offset=(-naro_box[0], -naro_box[1]))
                                s = ss
                        vecs.append(normalize_and_extract_features(s))
                        boxes.append(nnbox)
                    else:
                        break
                    prev += val
                if not vecs: continue
                xn = len(vecs)

                vecs = np.array(vecs).reshape(xn, 346)  # 346 is len(vecs[0])

                probs = predict_log_proba(vecs)
                probs = probs.astype(np.float32)

                if n % 10 == 0 and n != 0:  # after 10 restarts, shrink the assumed character width slightly

                    cur_mean = self.final_box_info.char_mean * (
                        .97 - (3 * n / 1000.0))

                prob, prds = viterbi_cython(xn, n_states, start_p, trans_p,
                                            probs)
                prob = prob + wdthprobs
                if prob > best_prob:
                    best_prob = prob
                    best_prd = prds
                    best_boxes = boxes
        else:
            best_boxes = [letter_box]
            probs = predict_log_proba(normalize_and_extract_features(letter))
            amx = probs[0].argmax()
            try:
                startprob = start_p[amx]
            except IndexError:
                startprob = 1e-10
            best_prob = probs[0][amx] + gausslogprob(
                cur_mean, cur_std, letter_box[2] / oo_scale_l) + startprob
            best_prd = [amx]

        final_prob = best_prob
        res = []
        for i, val in enumerate(best_prd):
            best_boxes[i] = [int(np.round(k)) for k in best_boxes[i]]
            best_boxes[i].extend([float(np.exp(final_prob)), label_chars[val]])
            res.append(best_boxes[i])

        return (final_prob, res)
Exemple #10
0
    def construct_vector_set_simple(self):
        self.too_big = [[] for i in range(self.line_info.k)]
        self.too_big_box = [[] for i in range(self.line_info.k)]
        self.too_big_loc = []
        char_mean = self.line_info.shapes.char_mean
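        # Pull out boxes much wider or taller than the average character; they are
        # recorded in the too_big* lists and masked with the 'xx' placeholder below
        # so this simple per-box pass only featurizes normally sized glyphs.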
        for i in range(self.line_info.k):
            line = self.line_info.lines_chars[i]
            for j, c in enumerate(line):
                x, y, w, h = self.line_info.get_box(c)
                if w > 1.75 * char_mean or h > 2.5 * char_mean:
                    letter = ones((h, w), dtype=uint8)
                    if not isinstance(c, str):
                        letter = self.line_info.shapes.draw_contour_and_children(
                            c, char_arr=letter, offset=(-x, -y))
                    else:
                        cv.drawContours(letter,
                                        [self.line_info.get_contour(c)],
                                        -1,
                                        0,
                                        thickness=-1,
                                        offset=(-x, -y))

                    self.too_big[i].append(letter)
                    self.too_big_loc.append((i, j))
                    self.too_big_box[i].append([x, y, w, h])

        for loc in self.too_big_loc:
            self.line_info.lines_chars[loc[0]][loc[1]] = 'xx'

        for k in self.line_info.lines_chars:
            self.line_info.lines_chars[k] = [
                xx for xx in self.line_info.lines_chars[k] if xx != 'xx'
            ]

        final_box_info = CombineBoxesForPage(self.line_info)
        scales = final_box_info.transitions
        self.final_box_info = final_box_info
        final_boxes = final_box_info.final_boxes
        char_mean = self.final_box_info.char_mean
        final_indices = final_box_info.final_indices
        self.vectors = [[] for i in range(self.line_info.k)]
        self.new_boxes = [[] for i in range(self.line_info.k)]  #
        for l in range(self.line_info.k):  # for each line
            try:
                lb = range(len(final_indices[l]))
            except IndexError:
                continue

            try:
                scale_l = scales[l]
                oo_scale_l = 1.0 / scale_l
            except:
                print 'ERROR AT ', l, len(scales)
                raise

            for ii, i in enumerate(lb):  # for each line box

                ## New draw, takes into account tree hierarchy of contours
                x, y, w, h = final_boxes[l][i]
                letter = ones((h, w), dtype=uint8)
                for k in final_indices[l][i]:
                    if not isinstance(k, str):
                        letter = self.line_info.shapes.draw_contour_and_children(
                            k, char_arr=letter, offset=(-x, -y))
                    else:
                        cv.drawContours(letter,
                                        [self.line_info.get_contour(k)],
                                        -1,
                                        0,
                                        thickness=-1,
                                        offset=(-x, -y))

                letter = cv.resize(letter,
                                   dsize=(0, 0),
                                   fx=scale_l,
                                   fy=scale_l)
                self.new_boxes[l].append([x, y, w, h])
                vect = normalize_and_extract_features(letter)
                self.vectors[l].append(vect)

        if not any(self.vectors):
            print 'no vectors'
            return
Exemple #11
0
    def __init__(self, img_arr, fast_cls, small_coef=1, low_ink=False, \
                 page_type=None, flpath=None, detect_o=True,\
                 clear_hr = False): #lower coef means more filtering USE 3 for nying gyud
        self.img_arr = img_arr
        self.page_type = page_type
        self.flpath = flpath
        self.low_ink = low_ink
        self.detect_o = detect_o

        self.cached_features = OrderedDict()
        self.cached_pred_prob = OrderedDict()
        self._contour_mode = cv.RETR_TREE
        ### repeatedly called functions
        ones = np.ones
        uint8 = np.uint8
        predict = fast_cls.predict
        predict_proba = fast_cls.predict_proba
        self.contours, self.hierarchy = self._contours()
        self.boxes = []
        self.indices = []
        self.small_coef = small_coef

        FILTERED_PUNC = (u'།', u'་', u']', u'[')

        self._set_shape_measurements()
        if page_type == 'pecha':
            if clear_hr:
                print 'Warning: clear_hr called on pecha format. For clearing text'
                self.force_clear_hr()
            self.set_pecha_layout()
            if self.indices:
                content_parent = int(
                    statsmode([self.hierarchy[0][i][3]
                               for i in self.indices])[0])
            else:
                print 'no content found'
        else:

            content_parent = int(
                statsmode([hier[3] for hier in self.hierarchy[0]])[0])
            self.indices = self.get_indices()

        outer_contours = []
        outer_widths = []

        ## Iterate through all contours
        for i in self.indices:
            cbox = self.get_boxes()[i]
            x, y, w, h = cbox
            ### THIS SECOND CONDITION IS CAUSING A LOT OF PROBLEMS. Recently
            # added the len(indices) < 40 as a way to prevent exaggerated
            # filtering of small lines where gaussian width measures
            # are meaningless due to small sample size (too few contours)
            if self.hierarchy[0][i][3] == content_parent and (
                    cbox[2] < .1 * self.img_arr.shape[1]
                    or len(self.indices) < 40):
                #            if self.hierarchy[0][i][3] == content_parent and cbox[2] < 3*self.char_mean:  ### THIS SECOND CONDITION IS CAUSING A LOT OF PROBLEMS
                outer_contours.append(i)
                outer_widths.append(cbox[2])
            else:
                if cbox[2] > .66 * self.img_arr.shape[1]:
                    print cbox[2] / float(self.img_arr.shape[1])
                if clear_hr and .995*self.img_arr.shape[1] > cbox[2] > \
                .66*self.img_arr.shape[1] and cbox[1] < .25*self.img_arr.shape[0]:
                    self.img_arr[0:cbox[1] + cbox[3], :] = 1
#                 print 'rejected box. too wide?', cbox[2] >= .1*self.img_arr.shape[1]
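        # char_gaussians presumably fits width distributions over the retained outer
        # contours, yielding mean/std for full characters and for tsek marks; the
        # loop below attaches them as attributes.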
        width_measures = self.char_gaussians(outer_widths)

        for i, j in zip(['char_mean', 'char_std', 'tsek_mean', 'tsek_std'],
                        width_measures):
            setattr(self, i, j)

        self.small_contour_indices = []
        self.indices = []  # Need to reset!
        self.emph_symbols = []
        self.naros = []
        for i in outer_contours:
            cbox = self.get_boxes()[i]
            # if small and has no children, put in small list (this could backfire
            # with false interiors, e.g. from salt-and-pepper noise)
            ## NOTE: previously "small" was defined as less than tsek_mean + 3 x tsek_std;
            ## however, this wasn't always working. Changed to less than char_mean
            ## minus 2 x char_std, but watch whether this holds up for many different inputs...

            x, y, w, h = cbox
            tmparr = ones((h, w), dtype=uint8)
            tmparr = self.draw_contour_and_children(i, tmparr, (-x, -y))

            features = normalize_and_extract_features(tmparr)
            self.cached_features[i] = features

            prprob = predict_proba(features)

            mxinx = prprob.argmax()
            quick_prd = label_chars[mxinx]
            self.cached_pred_prob[i] = (mxinx, prprob[0])
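            # A quick prediction with the fast classifier routes each contour:
            # emphasis symbols and naro vowels are collected separately, very narrow
            # boxes are skipped, tsek/punctuation go to small_contour_indices, and
            # everything else is kept as regular content.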

            is_emph_symbol = quick_prd in set([u'༷', u'༵', u'༼', u'༽'])
            if is_emph_symbol:
                self.emph_symbols.append(i)

                print 'EMPHSYMBOLFOUND', quick_prd
            elif quick_prd == u'ོ' and self.detect_o:
                self.naros.append(i)

            elif cbox[2] < 7:

                continue
            elif (cbox[2] <= self.tsek_mean * 3 and quick_prd
                  in FILTERED_PUNC) and not self.low_ink:  # default!!!
                self.small_contour_indices.append(i)
            else:
                self.indices.append(i)
        if self.detect_o:
            print 'pre-filtered na-ro vowels found:', len(self.naros)

        if self.low_ink:
            self._low_ink_setting()