Example #1
0
def normalized_scale(arr):
    """Return a scale factor based on the peak transition count of ``arr``.

    The factor is ``35.0 / (arr.shape[1] / peak)``, where ``peak`` is the
    largest value produced by ``horizontal_transitions(arr)``. When that
    peak is not positive the neutral factor ``1.0`` is returned instead.

    NOTE(review): ``horizontal_transitions`` is defined elsewhere; it
    presumably yields one transition count per row -- confirm.
    """
    peak_transitions = float(max(horizontal_transitions(arr)))

    # No transitions found: nothing to normalise against.
    if not peak_transitions > 0:
        return 1.0

    # Array width (second axis) per transition on the busiest entry.
    width_per_transition = arr.shape[1] / peak_transitions
    return 35.0 / width_per_transition
Example #2
0
    def combine_for_line_ind(self, line_info, lineind=None):
        """Merge the character boxes of one line into combined boxes.

        The boxes of line ``lineind`` are sorted by their x coordinate and
        walked in that order. Each box either joins the group currently being
        built or closes that group and starts a new one. When
        ``line_info.shapes.low_ink`` is truthy, the resulting groups are
        regrouped a second time against ``self.line_info.low_ink_boxes``.

        Side effects (none of them happen when the line is empty, apart from
        the in-place sort):

        * ``line_info.lines_chars[lineind]`` is sorted in place.
        * ``self.transitions`` receives the scale computed for the line.
        * ``self.widths`` receives ``width * scale`` for every group.
        * ``self.final_indices``, ``self.final_boxes`` and
          ``self.line_width_means`` each receive one entry for the line.

        Args:
            line_info: object exposing ``lines_chars``, ``get_box``,
                ``shapes`` and, optionally, ``small_cc_lines_chars``.
            lineind: key of the line inside ``line_info.lines_chars``.

        Returns:
            ``[]`` when the line has no boxes; otherwise ``None``.
        """
        fli = []  # final line indices: one list of member indices per group
        flb = []  # final line boxes: one (x, y, w, h) per group
        line_widths = []
        line = line_info.lines_chars[lineind]

        # Sorts the list owned by line_info (not a copy) by box x coordinate.
        line.sort(key=lambda x: line_info.get_box(x)[0])

        shapes = line_info.shapes

        if not line:
            return []

        # Vertical extent covered by all boxes of the line.
        top = 1000000  # arbitrary high number
        bottom = 0
        for k in line:
            j = line_info.get_box(k)
            if j[1] < top:
                top = j[1]
            if j[1] + j[3] > bottom:
                bottom = j[1] + j[3]

        firstbox = line_info.get_box(line[0])
        lastbox = line_info.get_box(line[-1])

        # Horizontal gap between each box and its right-hand neighbour; the
        # last slot stays 0 because the last box has no neighbour.
        whitespace = np.zeros(len(line), dtype=int)
        for p, c in enumerate(line[:-1]):
            ab = line_info.get_box(c)
            nab = line_info.get_box(line[p + 1])
            whitespace[p] = nab[0] - (ab[0] + ab[2])
        sum_whitespace = whitespace.sum()

        ln_arr = shapes.img_arr[top:bottom,
                                firstbox[0]:lastbox[0] + lastbox[2]].copy()

        # Best effort: overwrite the regions of the small connected
        # components with 1 before counting transitions. The data may be
        # missing for this line, in which case the crop is used as is.
        # Only lookup-style failures are ignored so that KeyboardInterrupt,
        # SystemExit and unrelated errors are no longer swallowed.
        try:
            for inx in line_info.small_cc_lines_chars[lineind]:
                box = line_info.get_box(inx)
                ln_arr[box[1] - top:box[1] + box[3] - top,
                       box[0] - firstbox[0]:box[0] + box[2] - firstbox[0]] = 1
        except (AttributeError, IndexError, KeyError, TypeError):
            pass

        # Scale = 35 / (ink width per transition); 1.0 when there are no
        # transitions at all.
        crss_denom = float(max(horizontal_transitions(ln_arr)))
        if crss_denom > 0:
            crossings_val = (ln_arr.shape[1] - sum_whitespace) / crss_denom
            scale = 35.0 / crossings_val
        else:
            scale = 1.0

        self.transitions.append(scale)

        def close_group(indices):
            """Record a finished group and return its combined width."""
            fli.append(indices)
            gx, gy, gw, gh = combine_many_boxes(
                [line_info.get_box(j) for j in indices])
            flb.append((gx, gy, gw, gh))
            self.widths.append(gw * scale)
            return gw

        # cb is the box of the group being built, b is the next box.
        cur_ind = [line[0]]
        cb = line_info.get_box(cur_ind[0])
        led, red, top, bottom, w, h = box_attrs(cb)

        # Walk the remaining boxes, merging or closing groups along the way.
        for i in line[1:]:
            b = line_info.get_box(i)
            ledn, ren, topn, bottomn, wn, hn = box_attrs(b)

            if not isinstance(i, str):
                # Interior means: no peers at its place in the contour tree
                # and no children (first three hierarchy fields negative).
                node = line_info.shapes.hierarchy[0][i]
                is_interior = node[0] < 0 and node[1] < 0 and node[2] < 0
            else:
                # A string index is the result of a horizontal cut and is
                # likely not an interior contour.
                is_interior = False

            if ledn > red:
                # Next box starts right of the current group: no overlap.
                merge = False
            elif is_interior:
                merge = True
            else:
                # Share of the narrower box that lies outside the overlap;
                # merge only when it stays below the self.hangoff threshold.
                narrower = float(min(wn, w))
                merge = (narrower - abs(red - ledn)) / narrower < self.hangoff

            if merge:
                cur_ind.append(i)
                bxs = [line_info.get_box(j) for j in cur_ind]
                # NOTE(review): b is already the last member of cur_ind, so
                # it is passed twice. Kept to preserve existing behaviour;
                # harmless if combine_many_boxes is a pure bounding union.
                bxs.append(b)
                cb = combine_many_boxes(bxs)
            else:
                # Separate character (or incidental overlap): close the
                # current group and start a new one with this box.
                close_group(cur_ind)
                cur_ind = [i]
                cb = b
            led, red, top, bottom, w, h = box_attrs(cb)

        # NOTE(review): only the width of the last group is collected, so
        # the "mean" stored below is that single width -- confirm intent.
        line_widths.append(close_group(cur_ind))

        if shapes.low_ink:
            # Reads the boxes from self.line_info, not from the argument.
            lib = self.line_info.low_ink_boxes[lineind]
            low_ink_segmentation = {}
            not_intr = []
            for d, box in enumerate(flb):
                led, red, top, bottom, w, h = box_attrs(box)
                for p, bx in enumerate(lib):
                    ledn, ren, topn, bottomn, wn, hn = box_attrs(bx)
                    # Group lies horizontally inside the low-ink box, with a
                    # tolerance of 15 on both sides.
                    if led >= ledn - 15 and red <= ren + 15:
                        low_ink_segmentation.setdefault(p, []).extend(fli[d])
                        break
                else:
                    # Matched no low-ink box: keep the group unchanged.
                    not_intr.append(fli[d])

            # list(...) keeps this working where dict.values() is a view.
            all_li_seg = list(low_ink_segmentation.values())
            all_li_seg.extend(not_intr)
            newfli = []
            newflb = []
            for j in all_li_seg:
                newfli.append(j)
                x, y, w, h = combine_many_boxes(
                    [line_info.get_box(i) for i in j])
                newflb.append([x, y, w, h])
                # NOTE(review): widths of the first pass are already in
                # self.widths; these are appended in addition.
                self.widths.append(w * scale)

            # Restore left-to-right order by the x coordinate of each box.
            fliflb = sorted(zip(newfli, newflb), key=lambda x: x[1][0])

            fli = [pair[0] for pair in fliflb]
            flb = [pair[1] for pair in fliflb]

        self.final_indices.append(fli)
        self.final_boxes.append(flb)
        self.line_width_means.append(mean(line_widths))
Example #3
0
    def li_combine_for_line_ind(self, line_info, lineind=None):
        """Group the boxes of one line by the low-ink box that contains them.

        Every box of line ``lineind`` is assigned to the first entry of
        ``self.line_info.low_ink_boxes[lineind]`` that contains it
        horizontally (tolerance of 5 on both sides). Boxes that match no
        low-ink box become single-member groups. Groups are then ordered by
        the x coordinate of their combined box.

        Side effects (none when the line is empty):

        * ``self.transitions`` receives the scale computed for the line.
        * ``self.widths`` receives ``width * scale`` for every group.
        * ``self.final_indices`` and ``self.final_boxes`` each receive one
          entry for the line.

        NOTE(review): unlike ``combine_for_line_ind`` this method neither
        sorts the line nor appends to ``self.line_width_means``; the first
        and last entries of the line are taken as its horizontal limits, so
        the line is presumably already ordered left to right -- confirm.

        Args:
            line_info: object exposing ``lines_chars``, ``get_box`` and
                ``shapes``.
            lineind: key of the line inside ``line_info.lines_chars``.

        Returns:
            ``[]`` when the line has no boxes; otherwise ``None``.
        """
        fli = []  # final line indices: one list of member indices per group
        flb = []  # final line boxes: one [x, y, w, h] per group

        line = line_info.lines_chars[lineind]

        # Reads the boxes from self.line_info, not from the argument.
        lib = self.line_info.low_ink_boxes[lineind]

        shapes = line_info.shapes

        if not line:
            return []

        # Vertical extent covered by all boxes of the line.
        top = 1000000  # arbitrary high number
        bottom = 0
        for k in line:
            j = line_info.get_box(k)
            if j[1] < top:
                top = j[1]
            if j[1] + j[3] > bottom:
                bottom = j[1] + j[3]

        firstbox = line_info.get_box(line[0])
        lastbox = line_info.get_box(line[-1])

        # Horizontal gap between each box and the next one in the line.
        whitespace = []
        for p, c in enumerate(line[:-1]):
            ab = line_info.get_box(c)
            nab = line_info.get_box(line[p + 1])
            whitespace.append(nab[0] - (ab[0] + ab[2]))
        sum_whitespace = sum(whitespace)

        ln_arr = shapes.img_arr[top:bottom,
                                firstbox[0]:lastbox[0] + lastbox[2]].copy()

        # Scale = 35 / (ink width per transition); 1.0 when there are no
        # transitions at all.
        crss_denom = float(max(horizontal_transitions(ln_arr)))
        if crss_denom > 0:
            crossings_val = (ln_arr.shape[1] - sum_whitespace) / crss_denom
            scale = 35.0 / crossings_val
        else:
            scale = 1.0
        self.transitions.append(scale)

        low_ink_segmentation = {}  # low-ink box position -> member indices
        not_intr = []  # single-member groups for boxes that matched nothing
        for i in line:
            b = line_info.get_box(i)
            led, red, top, bottom, w, h = box_attrs(b)
            for p, bx in enumerate(lib):
                ledn, ren, topn, bottomn, wn, hn = box_attrs(bx)
                # Box lies horizontally inside the low-ink box (+/- 5).
                if led >= ledn - 5 and red <= ren + 5:
                    low_ink_segmentation.setdefault(p, []).append(i)
                    break
            else:
                not_intr.append([i])

        # list(...) keeps this working where dict.values() is a view.
        all_li_seg = list(low_ink_segmentation.values())
        all_li_seg.extend(not_intr)
        for j in all_li_seg:
            fli.append(j)
            x, y, w, h = combine_many_boxes([line_info.get_box(i) for i in j])
            flb.append([x, y, w, h])
            self.widths.append(w * scale)

        # Order the groups by the x coordinate of their combined box.
        fliflb = sorted(zip(fli, flb), key=lambda x: x[1][0])

        fli = [pair[0] for pair in fliflb]
        flb = [pair[1] for pair in fliflb]

        self.final_indices.append(fli)
        self.final_boxes.append(flb)