コード例 #1
0
 def title_splits_3_col(self, boxes_in):
     """Cut multi-column boxes at lines that read as upper-case titles.

     Boxes whose ``num_col`` is ``'one'`` are skipped and do not appear in
     the result. For every other box, each line that has more than one word
     and whose joined text passes ``str.isupper`` contributes its top and
     bottom to ``self.create_new_boxes``. When that call returns nothing the
     box is kept whole. Boxes holding no words of the page are dropped
     before returning.
     """
     candidates = []
     for zone in boxes_in:
         if zone['num_col'] != 'one':
             title_tops = []
             title_bots = []
             for text_line in redefined_line_bounds(
                     self.get_split_lines(zone)):
                 line_words = get_words_in_box(text_line, self.words_in_page)
                 joined = ' '.join(entry['word'] for entry in line_words)
                 is_title = len(line_words) > 1 and joined.isupper()
                 # Each distinct top coordinate is recorded only once.
                 if is_title and text_line['t'] not in title_tops:
                     title_tops.append(text_line['t'])
                     title_bots.append(text_line['b'])
             pieces = self.create_new_boxes(title_tops, title_bots, zone)
             candidates.extend(pieces or [zone])
     populated = []
     for zone in candidates:
         if get_words_in_box(zone, self.words_in_page):
             populated.append(zone)
     return populated
コード例 #2
0
 def merge_consecutive_tables(self, boxes_in):
     """Fuse runs of consecutive boxes that ``check_table`` flags as tables.

     Walks ``boxes_in`` in the given order. While both the previous box and
     the current one are tables, the pending box's bottom (``'b'``) is
     extended to the current box's bottom; otherwise the pending box is
     emitted and a copy of the current box becomes pending. The input
     dicts are not modified. Returns ``[]`` for empty input.
     """
     if not boxes_in:
         return []

     def looks_like_table(candidate):
         # Table test: the box's words, its stored width and its line count.
         line_count = len(
             redefined_line_bounds(self.get_split_lines(candidate)))
         return check_table(
             get_words_in_box(candidate, self.words_in_page),
             candidate['w'], line_count)

     merged = []
     pending = copy.deepcopy(boxes_in[0])
     pending_is_table = looks_like_table(pending)
     for candidate in boxes_in[1:]:
         candidate_is_table = looks_like_table(candidate)
         if pending_is_table and candidate_is_table:
             # NOTE(review): only 'b' is extended; 'h' keeps the value of
             # the first box in the run - confirm whether callers read it.
             pending['b'] = candidate['b']
         else:
             merged.append(pending)
             pending = copy.deepcopy(candidate)
         pending_is_table = candidate_is_table
     merged.append(pending)
     return merged
コード例 #3
0
    def title_splits_2_col(self, boxes_in):
        """Split multi-column boxes at lines that look like titles.

        Boxes with ``num_col == 'one'`` are skipped and omitted from the
        result. Inside every other box a line marks a split when any of the
        following holds:

        1. its joined text passes ``str.isupper``, it has more than one
           word, and its left and right gaps differ by less than 5% of the
           box width;
        2. it is centred: both gaps exceed 5% of the box width, the line is
           not flagged by ``check_all_num``, the centres differ by less
           than 1.5% and the gaps by less than 3% of the box width, and it
           has more than one word;
        3. ("Moody's" title heuristic) the box is taller than 200, the line
           starts above 1500, and the line spans between 65% and 80% of the
           box width.

        The collected tops/bottoms are handed to ``self.create_new_boxes``;
        when that returns nothing the box is kept whole. Boxes that contain
        no words of the page are dropped before returning.
        """
        zones_out = []
        for box in boxes_in:
            if box['num_col'] == 'one':
                continue
            tops = []
            bots = []
            box_width = box['r'] - box['l']
            # Height is bottom minus top. The previous ``b - h`` subtracted
            # a height from a coordinate and so did not measure the box.
            box_height = box['b'] - box['t']
            box_center = box['l'] + (box_width / 2)
            for line_l in get_lines_in_box(box, self.lines_in_page):
                words_in_line = get_words_in_box(line_l, self.words_in_page)
                line_str = ' '.join(w['word'] for w in words_in_line)
                left_gap = line_l['l'] - box['l']
                right_gap = box['r'] - line_l['r']
                gap_diff = right_gap - left_gap
                line_width = line_l['r'] - line_l['l']
                line_center = line_l['l'] + (line_width / 2)
                center_diff = box_center - line_center
                all_num = check_all_num(words_in_line)
                several_words = len(words_in_line) > 1

                # Case 1: entirely capital, roughly balanced line.
                caps_title = (line_str.isupper() and several_words
                              and abs(gap_diff) < 0.05 * box_width)
                # Case 2: centred, non-numeric line.
                centered_title = (left_gap > 0.05 * box_width
                                  and right_gap > 0.05 * box_width
                                  and not all_num
                                  and abs(center_diff) < 0.015 * box_width
                                  and abs(gap_diff) < 0.03 * box_width
                                  and several_words)
                # Case 3: "Moody's" title.
                moodys_title = (box_height > 200 and line_l['t'] < 1500
                                and 0.65 * box_width < line_width <
                                0.8 * box_width)

                if caps_title or centered_title or moodys_title:
                    # Each distinct top coordinate is recorded only once.
                    if line_l['t'] not in tops:
                        tops.append(line_l['t'])
                        bots.append(line_l['b'])

            new_boxes_m = self.create_new_boxes(tops, bots, box)
            if new_boxes_m:
                zones_out.extend(new_boxes_m)
            else:
                zones_out.append(box)
        return [
            box for box in zones_out
            if get_words_in_box(box, self.words_in_page)
        ]
コード例 #4
0
 def title_splits_3_1_col(self, boxes):
     """Partition step 3: horizontal splits at upper-case title lines.

     Stores the height of every page line in ``self.all_line_heights``.
     For each box, every line except the first that has more than one word
     and whose joined text passes ``str.isupper`` contributes its top and
     bottom to ``self.create_new_boxes``. When that call returns nothing
     the box is kept whole.
     """
     self.all_line_heights = [
         page_line['b'] - page_line['t'] for page_line in self.lines_in_page
     ]
     result = []
     for zone in boxes:
         title_tops, title_bots = [], []
         zone_lines = redefined_line_bounds(self.get_split_lines(zone))
         # The first line of the box is never treated as a split point.
         for text_line in zone_lines[1:]:
             line_words = get_words_in_box(text_line, self.words_in_page)
             joined = ' '.join(entry['word'] for entry in line_words)
             if not (joined.isupper() and len(line_words) > 1):
                 continue
             if text_line['t'] in title_tops:
                 continue
             title_tops.append(text_line['t'])
             title_bots.append(text_line['b'])
         pieces = self.create_new_boxes(title_tops, title_bots, zone)
         result.extend(pieces or [zone])
     return result
コード例 #5
0
 def right_gap_splits(self, box):
     """Split a one-column box after lines that end with a wide right gap.

     A line (other than the first and last of the box) whose distance to
     the box's right edge exceeds the 60th percentile of word widths on the
     page is taken as the end of a paragraph: a new box is emitted from the
     current top down to that line's bottom, and the left edge of the
     following line is appended to ``self.left_indents``.

     Args:
         box: box dict; only boxes with ``num_col == 'one'`` are split.

     Returns:
         List of boxes: the emitted pieces followed by the remainder, or
         ``[box]`` unchanged when the box is not one-column.
     """
     if box['num_col'] != 'one':
         # Nothing to do; skip the percentile and line computations.
         return [box]

     max_word_width = np.percentile(
         [w['r'] - w['l'] for w in self.words_in_page], 60)
     lines_in_block = redefined_line_bounds(self.get_split_lines(box))
     sorted_ud = sorted(lines_in_block,
                        key=lambda k: ("t" not in k, k.get('t', None)))
     new_boxes = []
     to_split = copy.deepcopy(box)
     # lnum is the index of ``line`` inside sorted_ud (slice starts at 1).
     for lnum, line in enumerate(sorted_ud[1:-1], 1):
         right_space = box['r'] - line['r']
         if right_space > max_word_width:
             new_boxes.append({
                 't': to_split['t'],
                 'b': line['b'],
                 'l': to_split['l'],
                 'r': to_split['r'],
                 'w': to_split['r'] - to_split['l'],
                 'h': line['b'] - to_split['t'],
                 'color': 'seagreen',
                 'num_col': 'one'
             })
             to_split.update({'t': line['b']})
             # Look the next line up in the same top-to-bottom ordering we
             # iterate over; indexing the unsorted list could pick a
             # different line.
             self.left_indents.append(sorted_ud[lnum + 1]['l'])
     new_boxes.append(to_split)
     return new_boxes
コード例 #6
0
 def right_gap_splits(self, box):
     """Split a two-column box after lines that end with a wide right gap.

     Only boxes with ``num_col == 'two'`` that ``check_table`` does not flag
     as tables are split. A line (other than the first and last of the
     box) whose distance to the box's right edge exceeds the 98th
     percentile of word widths on the page proposes a new box from the
     current top down to that line's bottom. The proposal is accepted only
     when the words on its lines after the first are non-empty and are not
     flagged by ``check_all_num(..., num_percent=0.5)``. On acceptance the
     following line's indent relative to the box is appended to
     ``self.left_indents_2_col``.

     Returns:
         List of boxes: the accepted pieces followed by the remainder, or
         ``[box]`` unchanged when the box is not split.
     """
     if box['num_col'] != 'two':
         return [box]
     words_in_block = get_words_in_box(box, self.words_in_page)
     lines_in_block = get_lines_in_box(box, self.lines_in_page)
     if check_table(words_in_block, box['r'] - box['l'],
                    len(lines_in_block)):
         return [box]

     max_word_width = np.percentile(
         [w['r'] - w['l'] for w in self.words_in_page], 98)
     new_boxes = []
     to_split = copy.deepcopy(box)
     sorted_ud = sorted(lines_in_block,
                        key=lambda k: ("t" not in k, k.get('t', None)))
     # lnum is the index of ``line`` inside sorted_ud (slice starts at 1).
     for lnum, line in enumerate(sorted_ud[1:-1], 1):
         right_space = box['r'] - line['r']
         if right_space > max_word_width:
             new_b = {
                 't': to_split['t'],
                 'b': line['b'],
                 'l': to_split['l'],
                 'r': to_split['r'],
                 'w': to_split['r'] - to_split['l'],
                 'h': line['b'] - to_split['t'],
                 'color': 'orange',
                 'num_col': 'two'
             }
             # Words of every line in the proposed box except its first.
             newb_words = []
             for s_line in get_lines_in_box(new_b, self.lines_in_page)[1:]:
                 newb_words.extend(
                     get_words_in_box(s_line, self.words_in_page))
             # Test for emptiness first so check_all_num never receives an
             # empty word list.
             if newb_words and not check_all_num(newb_words,
                                                 num_percent=0.5):
                 new_boxes.append(new_b)
                 to_split.update({'t': line['b']})
                 # Use the sorted ordering for the next line; lnum indexes
                 # sorted_ud, not the unsorted lines_in_block.
                 self.left_indents_2_col.append(
                     sorted_ud[lnum + 1]['l'] - box['l'])
     new_boxes.append(to_split)
     return new_boxes
コード例 #7
0
    def title_splits_1_col(self, boxes):
        """Partition step 3: horizontal splits at title-like lines.

        Stores the height of every page line in ``self.all_line_heights``
        and uses their 90th percentile as the "tall line" threshold. Inside
        each box a line marks a split when any of the following holds:

        1. its joined text passes ``str.isupper`` and it has more than one
           word;
        2. it is taller than the 90th-percentile line height and both side
           gaps exceed 10% of the box width;
        3. it is centred: both gaps exceed 10% of the box width, the gaps
           differ by less than 1% and the centres by less than 0.5% of the
           box width;
        4. ("Moody's" title heuristic) the box is taller than 200, the line
           starts above 1500, and the line spans between 65% and 80% of the
           box width.

        The collected tops/bottoms are handed to ``self.create_new_boxes``;
        when that returns nothing the box is kept whole.
        """
        zones_out = []
        self.all_line_heights = [w['b'] - w['t'] for w in self.lines_in_page]
        tall_line_height = np.percentile(self.all_line_heights, 90)
        for box in boxes:
            tops = []
            bots = []
            box_width = box['r'] - box['l']
            # Height is bottom minus top. The previous ``b - h`` subtracted
            # a height from a coordinate and so did not measure the box.
            box_height = box['b'] - box['t']
            box_center = box['l'] + (box_width / 2)
            for line_l in get_lines_in_box(box, self.lines_in_page):
                words_in_line = get_words_in_box(line_l, self.words_in_page)
                line_str = ' '.join(w['word'] for w in words_in_line)
                left_gap = line_l['l'] - box['l']
                right_gap = box['r'] - line_l['r']
                gap_diff = right_gap - left_gap
                line_width = line_l['r'] - line_l['l']
                line_center = line_l['l'] + (line_width / 2)
                center_diff = box_center - line_center
                wide_margins = (left_gap > 0.1 * box_width
                                and right_gap > 0.1 * box_width)

                # Case 1: entirely capital string.
                caps_title = line_str.isupper() and len(words_in_line) > 1
                # Case 2: taller than the 90th-percentile line height.
                tall_title = ((line_l['b'] - line_l['t']) > tall_line_height
                              and wide_margins)
                # Case 3: centred line.
                centered_title = (wide_margins
                                  and abs(gap_diff) < 0.01 * box_width
                                  and abs(center_diff) < 0.005 * box_width)
                # Case 4: "Moody's" title.
                moodys_title = (box_height > 200 and line_l['t'] < 1500
                                and 0.65 * box_width < line_width <
                                0.8 * box_width)

                if caps_title or tall_title or centered_title or moodys_title:
                    # Each distinct top coordinate is recorded only once.
                    if line_l['t'] not in tops:
                        tops.append(line_l['t'])
                        bots.append(line_l['b'])

            new_boxes_m = self.create_new_boxes(tops, bots, box)
            if new_boxes_m:
                zones_out.extend(new_boxes_m)
            else:
                zones_out.append(box)
        return zones_out
コード例 #8
0
 def find_moodys_box(self, box, num_lines):
     """Return ``box`` when its text resembles the Moody's manual title.

     The words inside ``box`` are joined with spaces (with ``&quot;``
     replaced by an apostrophe) and compared against the reference title
     using ``cosine_similarity``: first in its default mode with a
     threshold of more than 0.8, then in ``'char'`` mode with a threshold
     of at least 0.7. ``num_lines`` is accepted but not used.

     Returns:
         ``box`` on a match, otherwise an empty list.
     """
     reference_title = 'MOODY\'S MANUAL OF INVESTMENTS'
     box_text = ' '.join(
         entry['word']
         for entry in get_words_in_box(box, self.words_in_page))
     box_text = box_text.replace("&quot;", "'")
     if cosine_similarity(reference_title, box_text) > 0.8:
         return box
     if cosine_similarity(reference_title, box_text, 'char') >= 0.7:
         return box
     return []
コード例 #9
0
 def left_indent_split(self, boxes_in):
     """Split one-column, non-table boxes at left-indented lines.

     A line (other than the first and last of the box) starts a new box
     when its left edge lies between 85% and 125% of the reference indent
     and the line below it starts further left. The reference indent is
     the mean of ``self.left_indents`` when that list is non-empty,
     otherwise the smallest left edge among ``boxes_in`` plus 350. Each
     accepted line's left edge is appended to ``self.left_indents``.

     Boxes that are not one-column, or that ``check_table`` flags as
     tables, are passed through unchanged.

     Returns:
         Flat list of the resulting boxes, in input order.
     """
     one_col_tab = 350
     boxes_out = []
     for in_box in boxes_in:
         if in_box['num_col'] != 'one':
             boxes_out.append(in_box)
             continue
         lines_in_block = redefined_line_bounds(
             self.get_split_lines(in_box))
         is_table = check_table(
             get_words_in_box(in_box, self.words_in_page),
             in_box['r'] - in_box['l'], len(lines_in_block))
         if is_table:
             boxes_out.append(in_box)
             continue

         # Reference indent is fixed per box, before its lines are scanned.
         if self.left_indents:
             mean_left_indent = np.mean(self.left_indents)
         else:
             mean_left_indent = min(boxes_in,
                                    key=lambda x: x['l'])['l'] + one_col_tab
         to_split = copy.deepcopy(in_box)
         sorted_ud = sorted(lines_in_block,
                            key=lambda k: ("t" not in k, k.get('t', None)))
         # lnum is the index of ``line`` inside sorted_ud (slice starts at 1).
         for lnum, line in enumerate(sorted_ud[1:-1], 1):
             is_indent = (0.85 * mean_left_indent < line['l'] <
                          1.25 * mean_left_indent)
             # Compare against the next line in the same sorted ordering;
             # indexing the unsorted list could pick a different line.
             next_line_indent = sorted_ud[lnum + 1]['l'] < line['l']
             if is_indent and next_line_indent:
                 self.left_indents.append(line['l'])
                 boxes_out.append({
                     't': to_split['t'],
                     'b': line['t'],
                     'l': to_split['l'],
                     'r': to_split['r'],
                     'w': to_split['w'],
                     # NOTE(review): 'h' is copied from the box being
                     # split rather than recomputed - confirm callers.
                     'h': to_split['h'],
                     'color': 'seagreen',
                     'num_col': to_split['num_col']
                 })
                 to_split.update({'t': line['t']})
         boxes_out.append(to_split)
     return boxes_out
コード例 #10
0
 def left_indent_split(self, boxes_in):
     """Split three-column, non-table boxes at left-indented lines.

     A line (other than the first and last of the box) starts a new box
     when its offset from the box's left edge lies between 0.65 and 3
     times ``self.get_tab_width()`` and the line below it starts further
     left. Each accepted line's offset is appended to
     ``self.left_indents``.

     Boxes whose ``num_col`` is not ``'three'``, or that ``check_table``
     flags as tables, are passed through unchanged.

     Returns:
         Flat list of the resulting boxes, in input order.
     """
     final_boxes = []
     for in_box in boxes_in:
         three_col_tab = self.get_tab_width()
         split_lines = redefined_line_bounds(self.get_split_lines(in_box))
         is_table = check_table(
             get_words_in_box(in_box, self.words_in_page),
             in_box['r'] - in_box['l'],
             len(get_lines_in_box(in_box, split_lines)))
         if is_table or in_box['num_col'] != 'three':
             final_boxes.append(in_box)
             continue

         to_split = copy.deepcopy(in_box)
         # Splitting works on the page-level lines that fall in the box.
         lines_in_block = get_lines_in_box(in_box, self.lines_in_page)
         sorted_ud = sorted(lines_in_block,
                            key=lambda k: ("t" not in k, k.get('t', None)))
         # lnum is the index of ``line`` inside sorted_ud (slice starts at 1).
         for lnum, line in enumerate(sorted_ud[1:-1], 1):
             is_indent = (0.65 * three_col_tab <
                          abs(line['l'] - in_box['l']) < 3 * three_col_tab)
             # Compare against the next line in the same sorted ordering;
             # indexing the unsorted list could pick a different line.
             next_line_indent = sorted_ud[lnum + 1]['l'] < line['l']
             if is_indent and next_line_indent:
                 self.left_indents.append(line['l'] - in_box['l'])
                 final_boxes.append({
                     't': to_split['t'],
                     'b': line['t'],
                     'l': to_split['l'],
                     'r': to_split['r'],
                     'w': to_split['w'],
                     # NOTE(review): 'h' is copied from the box being
                     # split rather than recomputed - confirm callers.
                     'h': to_split['h'],
                     'color': 'orange',
                     'num_col': to_split['num_col']
                 })
                 to_split.update({'t': line['t']})
         final_boxes.append(to_split)
     return final_boxes
コード例 #11
0
 def get_split_lines(self, box):
     """Group the page words that fall inside ``box`` into text lines.

     Words are visited in order of their top coordinate. A word whose top
     lies more than 12 units below the bottom recorded for the current
     line opens a new line (and sets that bottom); otherwise it joins the
     current line.

     Returns:
         List of non-empty lines, each a list of word dicts ordered by
         their left coordinate.
     """
     by_top = sorted(get_words_in_box(box, self.words_in_page),
                     key=lambda k: ("t" not in k, k.get('t', None)))
     groups = []
     current = []
     last_bottom = 0
     for entry in by_top:
         if entry['t'] - last_bottom > 12:
             # Close the running line and start a new one at this word.
             groups.append(current)
             current = [entry]
             last_bottom = entry['b']
         else:
             current.append(entry)
     groups.append(current)

     lines = []
     for group in groups:
         if group:
             lines.append(
                 sorted(group,
                        key=lambda k: ("l" not in k, k.get('l', None))))
     return lines
コード例 #12
0
 def minor_horizontal_splits(self, boxes_in):
     """Apply the minor horizontal splits matching each box's column count.

     One-column boxes go through ``self.right_gap_splits`` followed by
     ``self.left_indent_split``. Two- and three-column boxes are delegated
     to ``two_col_page`` / ``three_col_page`` splitters built from
     ``self.page``. Boxes with any other ``num_col`` value are dropped, as
     are resulting boxes that contain no words of the page.
     """
     collected = []
     for zone in boxes_in:
         layout = zone['num_col']
         if layout == 'one':
             collected.extend(
                 self.left_indent_split(self.right_gap_splits(zone)))
         elif layout == 'two':
             collected.extend(
                 two_col_page(self.page).minor_horizontal_splits([zone]))
         elif layout == 'three':
             collected.extend(
                 three_col_page(self.page).minor_horizontal_splits([zone]))
     return [
         zone for zone in collected
         if get_words_in_box(zone, self.words_in_page)
     ]
コード例 #13
0
    def partition_3_col(self, boxes_in):
        """Partition a page that is expected to have three columns.

        All input boxes are first rejoined into a single box running from
        the top of the top-most box to the bottom of the box with the
        largest top. ``find_vertical_line`` is then asked for a separator
        near one third and two thirds of that box's width. When both are
        found, three ``'three'``-column boxes are returned; otherwise a
        single ``'one'``-column box covering the whole span is returned.
        Returns ``[]`` for empty input.
        """
        if not boxes_in:
            return []

        ordered = copy.deepcopy(
            sorted(boxes_in, key=lambda k: ('t' not in k, k.get('t', None))))
        top_most, bottom_most = ordered[0], ordered[-1]
        merged = copy.deepcopy(top_most)
        merged['b'] = bottom_most['b']
        merged['h'] = bottom_most['b'] - top_most['t']
        merged['color'] = 'seagreen'

        left_edge = merged['l']
        right_edge = merged['r']
        third = (right_edge - left_edge) / 3
        inner_words = get_words_in_box(merged, self.words_in_page)
        split_a = find_vertical_line(left_edge + third, inner_words)
        split_b = find_vertical_line(right_edge - third, inner_words)

        if not (split_a and split_b):
            # No usable separators: fall back to one full-width box.
            return [{
                'l': left_edge,
                't': merged['t'],
                'r': right_edge,
                'b': merged['b'],
                'w': right_edge - left_edge,
                'h': merged['b'] - merged['t'],
                'color': 'red',
                'num_col': 'one'
            }]

        columns = []
        edges = ((left_edge, split_a), (split_a, split_b),
                 (split_b, right_edge))
        for col_left, col_right in edges:
            columns.append({
                't': merged['t'],
                'b': merged['b'],
                'l': col_left,
                'r': col_right,
                'w': col_right - col_left,
                'h': merged['b'] - merged['t'],
                'color': 'mediumvioletred',
                'num_col': 'three'
            })
        return columns
コード例 #14
0
    def vertical_splits_three_col(self, boxes_in):
        """Try to cut each non-table box into three side-by-side columns.

        Boxes that ``check_table`` flags as tables, boxes without words,
        and boxes for which ``find_vertical_line`` does not return both
        separators (near one third and two thirds of the width) are passed
        through unchanged. Otherwise the three column boxes, together with
        a gutter around each separator (one unit to either side), are
        handed to ``self.find_3_col_intersects`` and its result is
        appended to the output.
        """
        result = []
        for zone in boxes_in:
            zone_words = get_words_in_box(zone, self.words_in_page)
            zone_lines = redefined_line_bounds(self.get_split_lines(zone))
            if check_table(zone_words, zone['w'],
                           len(zone_lines)) or not zone_words:
                result.append(zone)
                continue

            left_edge = zone['l']
            right_edge = zone['r']
            third = (right_edge - left_edge) / 3
            split_words = get_words_in_box(zone, self.words_in_page)
            line_count = len(zone_lines)
            split_a = find_vertical_line(left_edge + third, split_words,
                                         line_count)
            split_b = find_vertical_line(right_edge - third, split_words,
                                         line_count)
            if not (split_a and split_b):
                result.append(zone)
                continue

            def make_column(col_left, col_right):
                # Column box spanning the full height of the current zone.
                return {
                    't': zone['t'],
                    'b': zone['b'],
                    'l': col_left,
                    'r': col_right,
                    'w': col_right - col_left,
                    'h': zone['b'] - zone['t'],
                    'color': 'mediumvioletred',
                    'num_col': 'three'
                }

            def make_gutter(left_col, right_col):
                # Gutter widened by one unit on each side of the separator.
                return {
                    'l': left_col['r'] - 1,
                    'r': right_col['l'] + 1,
                    'w': right_col['l'] - left_col['r'] + 2
                }

            col_1 = make_column(left_edge, split_a)
            col_2 = make_column(split_a, split_b)
            col_3 = make_column(split_b, right_edge)
            result.extend(
                self.find_3_col_intersects(zone, make_gutter(col_1, col_2),
                                           make_gutter(col_2, col_3), col_1,
                                           col_2, col_3, zone_words,
                                           zone_lines))
        return result
コード例 #15
0
    def find_3_col_intersects(self, main_box, gutter_l, gutter_r, box_1, box_2,
                              box_3, words_in_block, lines_in_block):
        """Carve full-width strips out of three columns where text crosses a gutter.

        A line counts as crossing when one of its non-punctuation words,
        lying vertically inside ``box_1``, sits in, starts in, ends in or
        spans either gutter. The tops/bottoms of those lines are passed to
        ``self.unsplit_boxes`` to obtain the full-width ("non gap") boxes.
        The three column boxes are then cut, in lockstep, around each
        non-gap box.

        Args:
            main_box: the box that was divided into the three columns.
            gutter_l, gutter_r: dicts with ``'l'``/``'r'`` bounds of the
                left and right gutters.
            box_1, box_2, box_3: the left, middle and right column boxes.
            words_in_block: words searched for each line.
            lines_in_block: line dicts with ``'t'``/``'b'`` bounds.

        Returns:
            When there are no non-gap boxes, ``[box_1, box_2, box_3]``.
            Otherwise all column pieces and non-gap boxes ordered by top,
            with consecutive ``'one'``-column boxes merged into one.
        """

        def touches(word, gutter):
            # Word is inside, starts in, ends in, or spans the gutter.
            inside = gutter['l'] < word['l'] and gutter['r'] > word['r']
            starts_in = (gutter['l'] <= word['l'] < gutter['r']
                         and word['r'] > gutter['r'])
            ends_in = (word['l'] < gutter['l']
                       and gutter['l'] < word['r'] < gutter['r'])
            spans = word['l'] < gutter['l'] and word['r'] > gutter['r']
            return inside or starts_in or ends_in or spans

        def cut_points(ngb, outer):
            # (bottom of upper piece, top of lower piece), or None when
            # the non-gap box does not cut this column piece.
            if ngb['t'] == outer['t'] and ngb['b'] < outer['b']:
                return ngb['b'], ngb['b']
            starts_inside = outer['t'] < ngb['t'] < outer['b']
            if starts_inside and outer['t'] < ngb['b'] < outer['b']:
                return ngb['t'], ngb['b']
            if starts_inside and ngb['b'] == outer['b']:
                return ngb['t'], ngb['t']
            return None

        def split_box(box, upper_bottom, lower_top):
            # Two copies of ``box``: one ending at upper_bottom, one
            # starting at lower_top.
            upper = copy.deepcopy(box)
            lower = copy.deepcopy(box)
            upper.update({'b': upper_bottom})
            lower.update({'t': lower_top})
            return [upper, lower]

        tops = []
        bots = []
        for line in lines_in_block:
            for word in get_words_in_box(line, words_in_block):
                if word['word'] in string.punctuation:
                    continue
                if not (word['t'] >= box_1['t'] and word['b'] <= box_1['b']):
                    continue
                if touches(word, gutter_l) or touches(word, gutter_r):
                    # One crossing word is enough to mark the whole line.
                    tops.append(line['t'])
                    bots.append(line['b'])
                    break

        non_gap_boxes = self.unsplit_boxes(tops, bots, main_box, main_box['l'],
                                           main_box['r'])
        if not non_gap_boxes:
            return [box_1, box_2, box_3]

        # Left, middle and right piece lists stay index-aligned; the left
        # column decides where all three are cut.
        columns = [
            copy.deepcopy([box_1]),
            copy.deepcopy([box_2]),
            copy.deepcopy([box_3])
        ]
        for ngb in non_gap_boxes:
            next_columns = [[], [], []]
            for bnum, outer_box in enumerate(columns[0]):
                cuts = cut_points(ngb, outer_box)
                for pieces, next_pieces in zip(columns, next_columns):
                    if cuts is None:
                        next_pieces.append(pieces[bnum])
                    else:
                        next_pieces.extend(split_box(pieces[bnum], *cuts))
            columns = next_columns

        all_of_them = columns[0] + columns[1] + columns[2] + non_gap_boxes
        # Merge the single-column boxes that were left unsplit.
        sorted_td = sorted(all_of_them,
                           key=lambda k: ("t" not in k, k.get('t', None)))
        boxes_out = [sorted_td[0]]
        # Start from the second element: the first already seeds boxes_out.
        # enumerate(..., 1) only renumbers and does not skip it, which
        # duplicated the top box in the output.
        for box in sorted_td[1:]:
            if box['num_col'] == 'one' and boxes_out[-1]['num_col'] == 'one':
                boxes_out[-1]['b'] = box['b']
            else:
                boxes_out.append(box)
        return boxes_out
コード例 #16
0
    def find_gap_intersections(self, boxes_in):
        """Cut paired two-column boxes around lines that touch their gutter.

        Boxes tagged ``num_col == 'one'`` are passed through unchanged. A box
        tagged ``'two'`` whose left edge equals the leftmost edge found in
        ``boxes_in`` is paired with a box that ends on the rightmost edge and
        has the same top and bottom. For every pair, each line of
        ``self.lines_in_page`` that lies vertically inside the pair and
        touches the gutter (the horizontal span between the two boxes)
        contributes its top/bottom to ``self.unsplit_boxes``. The boxes
        returned by that call are added to the result and both halves of the
        pair are cut at their top/bottom edges.

        Boxes without a ``'num_col'`` key, and ``'two'`` boxes that are
        neither a left box nor the partner of one, are not emitted.

        Args:
            boxes_in: list of box dicts with ``'l'``, ``'r'``, ``'t'``, ``'b'``
                keys and optionally ``'num_col'``.

        Returns:
            A list of box dicts with zero-height boxes and duplicates removed,
            keeping only boxes for which
            ``get_words_in_box(box, self.words_in_page)`` is truthy. An empty
            ``boxes_in`` yields an empty list.
        """
        if not boxes_in:
            # min()/max() below would raise on an empty sequence.
            return []

        def cut_edits(outer, gap):
            """Return the key updates that cut ``outer`` around ``gap``, or None."""
            top_inside = outer['t'] < gap['t'] < outer['b']
            if gap['t'] == outer['t'] and gap['b'] < outer['b']:
                # Gap is flush with the top: one cut at the gap's bottom.
                return {'b': gap['b']}, {'t': gap['b']}
            if top_inside and outer['t'] < gap['b'] < outer['b']:
                # Gap strictly inside: keep the part above and the part below.
                return {'b': gap['t']}, {'t': gap['b']}
            if top_inside and gap['b'] == outer['b']:
                # Gap is flush with the bottom: one cut at the gap's top.
                return {'b': gap['t']}, {'t': gap['t']}
            return None

        def apply_edits(piece, edits):
            """Return one independent copy of ``piece`` per edit, with it applied."""
            halves = []
            for edit in edits:
                half = copy.deepcopy(piece)
                half.update(edit)
                halves.append(half)
            return halves

        actual_left = min(b['l'] for b in boxes_in)
        actual_right = max(b['r'] for b in boxes_in)

        final_splits = []
        paired_boxes = []
        for box in boxes_in:
            if 'num_col' not in box:
                continue
            if box['num_col'] == 'two' and box['l'] == actual_left:
                partner = None
                for candidate in boxes_in:
                    # NOTE(review): the partner's own 'num_col' is not checked;
                    # confirm whether it should also be required to be 'two'.
                    if (candidate['r'] > box['r']
                            and candidate['r'] == actual_right
                            and candidate['t'] == box['t']
                            and candidate['b'] == box['b']):
                        partner = candidate  # the last matching box wins
                if partner is None:
                    # No right-hand partner: keep the box as-is instead of
                    # pairing it with a partner left over from another box.
                    final_splits.append(box)
                else:
                    paired_boxes.append((box, partner))
            elif box['num_col'] == 'one':
                final_splits.append(box)

        for left_box, right_box in paired_boxes:
            gutter_l = left_box['r']
            gutter_r = right_box['l']
            tops = []
            bots = []
            for line in self.lines_in_page:
                inside_rows = (line['t'] >= left_box['t']
                               and line['b'] <= left_box['b'])
                if not inside_rows:
                    continue
                line_l = line['l']
                line_r = line['r']
                touches_gutter = (
                    # Whole line sits inside the gutter.
                    (gutter_l < line_l and gutter_r > line_r)
                    # Line starts in the gutter and ends in the right box.
                    or (gutter_l <= line_l < gutter_r and line_r > gutter_r)
                    # Line starts in the left box and ends in the gutter.
                    or (line_l < gutter_l and gutter_l < line_r < gutter_r)
                    # Line starts before the gutter and ends after it.
                    or (line_l < gutter_l and line_r > gutter_r)
                )
                if touches_gutter:
                    tops.append(line['t'])
                    bots.append(line['b'])

            # Use this pair's own box as the reference rather than whichever
            # box the pairing loop happened to finish on.
            non_gap_boxes = self.unsplit_boxes(tops, bots, left_box,
                                               actual_left, actual_right)
            if not non_gap_boxes:
                final_splits.extend([left_box, right_box])
                continue

            pieces_l = [copy.deepcopy(left_box)]
            pieces_r = [copy.deepcopy(right_box)]
            for gap in non_gap_boxes:
                next_l = []
                next_r = []
                # Both sides are cut in lockstep; the decision is made on the
                # left piece only and mirrored onto the matching right piece.
                for piece_l, piece_r in zip(pieces_l, pieces_r):
                    edits = cut_edits(piece_l, gap)
                    if edits is None:
                        next_l.append(piece_l)
                        next_r.append(piece_r)
                    else:
                        next_l.extend(apply_edits(piece_l, edits))
                        next_r.extend(apply_edits(piece_r, edits))
                pieces_l = next_l
                pieces_r = next_r
            final_splits.extend(pieces_l)
            final_splits.extend(pieces_r)
            final_splits.extend(non_gap_boxes)

        # Drop zero-height boxes and duplicates, preserving first-seen order.
        boxes_out = []
        for candidate in final_splits:
            if candidate['t'] != candidate['b'] and candidate not in boxes_out:
                boxes_out.append(candidate)
        return [
            box for box in boxes_out
            if get_words_in_box(box, self.words_in_page)
        ]
コード例 #17
0
    def vertical_splits_one_col(self, boxes_in):
        """Carve two-column regions out of single-column blocks.

        For each block, the lines inside it whose left edge lies more than 10
        units right of the block's horizontal midpoint are collected, and
        their line numbers are grouped by ``formatlinelist``. A group is
        accepted as a two-column region when it spans more than two line
        numbers, the fullest 60-unit-wide histogram bin of its left edges
        holds more than two lines, and its mean line width exceeds 40% of
        half the block width.

        Each accepted region produces either a left and a right half box
        (``'orange'``, ``num_col == 'two'``) or, when ``check_table`` is
        truthy for both halves, one full-width box (``'seagreen'``,
        ``num_col == 'one'``). The block itself is cut around the region's
        top and bottom and the pieces are emitted as well. A block with no
        accepted region is passed through unchanged.

        Args:
            boxes_in: list of box dicts with ``'l'``, ``'r'``, ``'t'``, ``'b'``
                keys.

        Returns:
            A list of box dicts with duplicates removed, in first-seen order.
        """
        bin_width = 60  # width of the histogram bins over line left edges

        def make_box(top, bot, left, right, color, num_col):
            """Build a box dict with the key order used throughout this method."""
            return {
                't': top,
                'b': bot,
                'l': left,
                'r': right,
                'w': right - left,
                'h': bot - top,
                'color': color,
                'num_col': num_col
            }

        final_splits = []
        for working_block in boxes_in:
            lines_in_block = get_lines_in_box(working_block,
                                              self.lines_in_page)
            words_in_block = get_words_in_box(working_block,
                                              self.words_in_page)
            block_l = working_block['l']
            block_r = working_block['r']
            half_width = (block_r - block_l) / 2
            box_half = block_l + half_width

            # Line numbers (ascending) of lines starting right of the midpoint.
            right_line_nums = [
                line_num for line_num, line in enumerate(lines_in_block)
                if line['l'] > box_half + 10
            ]

            tops = []
            bots = []
            for line_range in formatlinelist(right_line_nums):
                # assumes formatlinelist yields inclusive (first, last) pairs
                # of line numbers -- TODO confirm against its definition.
                first, last = line_range[0], line_range[1]
                # Include the run's last line so the mean width, the left-edge
                # histogram and the selection below all cover the same lines.
                run_lines = lines_in_block[first:last + 1]
                if run_lines:
                    average_line_width = np.mean(
                        [ll['r'] - ll['l'] for ll in run_lines])
                else:
                    average_line_width = 0
                block_lefts = [ll['l'] for ll in run_lines]
                y_freq, _ = np.histogram(
                    block_lefts,
                    bins=np.arange(min(block_lefts),
                                   max(block_lefts) + bin_width, bin_width))
                if not y_freq.any():
                    continue
                if (last - first > 2 and max(y_freq) > 2
                        and average_line_width > 0.4 * half_width):
                    sorted_td = sorted(run_lines,
                                       key=lambda k:
                                       ("t" not in k, k.get('t', None)))
                    tops.append(sorted_td[0]['t'])
                    bots.append(sorted_td[-1]['b'])

            if not tops:
                final_splits.append(working_block)
                continue

            to_split = [copy.deepcopy(working_block)]
            for c_top, c_bot in zip(tops, bots):
                left_box = make_box(c_top, c_bot, block_l, box_half,
                                    'orange', 'two')
                right_box = make_box(c_top, c_bot, box_half, block_r,
                                     'orange', 'two')
                is_left_table = check_table(
                    get_words_in_box(left_box, words_in_block),
                    left_box['r'] - left_box['l'],
                    len(get_lines_in_box(left_box, lines_in_block)))
                is_right_table = check_table(
                    get_words_in_box(right_box, words_in_block),
                    right_box['r'] - right_box['l'],
                    len(get_lines_in_box(right_box, lines_in_block)))
                if is_right_table and is_left_table:
                    # Both halves look like tables: keep the region whole.
                    final_splits.append(
                        make_box(c_top, c_bot, block_l, block_r, 'seagreen',
                                 'one'))
                else:
                    final_splits.extend([left_box, right_box])

                # Cut every remaining piece of the block around this region.
                remaining = []
                for outer_box in to_split:
                    top_inside = outer_box['t'] < c_top < outer_box['b']
                    if c_top == outer_box['t'] and c_bot < outer_box['b']:
                        # NOTE(review): the first piece covers the same rows as
                        # the region boxes emitted above; the other branches
                        # drop that part -- confirm this is intended.
                        edits = ({'b': c_bot}, {'t': c_bot})
                    elif top_inside and outer_box['t'] < c_bot < outer_box['b']:
                        edits = ({'b': c_top}, {'t': c_bot})
                    elif top_inside and c_bot == outer_box['b']:
                        edits = ({'b': c_top}, )
                    else:
                        remaining.append(outer_box)
                        continue
                    for edit in edits:
                        piece = copy.deepcopy(outer_box)
                        piece.update(edit)
                        remaining.append(piece)
                to_split = remaining
            final_splits.extend(to_split)

        # Remove duplicates while keeping first-seen order (dicts are unhashable).
        zones_out = []
        for fs in final_splits:
            if fs not in zones_out:
                zones_out.append(fs)
        return zones_out