def title_splits_3_col(self, boxes_in):
    """Cut multi-column boxes horizontally at upper-case title lines.

    Every box whose ``num_col`` is not ``'one'`` is broken into text lines
    (``get_split_lines`` followed by ``redefined_line_bounds``). A line made
    of more than one word whose joined text is entirely upper case marks a
    cut; the collected top/bottom coordinates go to ``create_new_boxes``.

    Args:
        boxes_in: iterable of box dicts carrying at least ``num_col``.

    Returns:
        The boxes produced by ``create_new_boxes`` (or the untouched box when
        it yields nothing), keeping only boxes that contain at least one word
        of the page. Boxes tagged ``'one'`` are skipped and never returned.
    """
    split_zones = []
    for zone in boxes_in:
        if zone['num_col'] == 'one':
            continue
        title_tops, title_bots = [], []
        for text_line in redefined_line_bounds(self.get_split_lines(zone)):
            line_words = get_words_in_box(text_line, self.words_in_page)
            joined = ' '.join(entry['word'] for entry in line_words)
            # A title: two or more words and no lower-case letters at all.
            is_title = joined.isupper() and len(line_words) > 1
            if is_title and text_line['t'] not in title_tops:
                title_tops.append(text_line['t'])
                title_bots.append(text_line['b'])
        pieces = self.create_new_boxes(title_tops, title_bots, zone)
        # Keep the original box when no cut could be made.
        split_zones.extend(pieces if pieces else [zone])
    return [
        zone for zone in split_zones
        if get_words_in_box(zone, self.words_in_page)
    ]
def merge_consecutive_tables(self, boxes_in):
    """Fuse runs of neighbouring boxes that are all classified as tables.

    Boxes are visited in the given order. While both the box being
    accumulated and the next box are tables according to ``check_table``,
    the accumulated box's bottom edge ``'b'`` is stretched to the next box's
    bottom. Otherwise the accumulated box is emitted and a copy of the next
    box becomes the new accumulator.

    Args:
        boxes_in: list of box dicts (needs ``'w'`` and ``'b'``).

    Returns:
        A new list of (deep-copied) boxes; ``[]`` for empty input.
    """
    if not boxes_in:
        return []

    def looks_like_table(candidate):
        # Table test uses the words in the box, its width and its line count.
        line_count = len(redefined_line_bounds(self.get_split_lines(candidate)))
        words = get_words_in_box(candidate, self.words_in_page)
        return check_table(words, candidate['w'], line_count)

    merged = []
    current = copy.deepcopy(boxes_in[0])
    current_is_table = looks_like_table(current)
    for candidate in boxes_in[1:]:
        candidate_is_table = looks_like_table(candidate)
        if current_is_table and candidate_is_table:
            # Two tables in a row: grow the accumulated box downwards.
            current['b'] = candidate['b']
            continue
        merged.append(current)
        current = copy.deepcopy(candidate)
        current_is_table = candidate_is_table
    merged.append(current)
    return merged
def title_splits_2_col(self, boxes_in):
    """Split multi-column boxes horizontally at lines that look like titles.

    Boxes tagged ``num_col == 'one'`` are skipped and not returned. For every
    other box each page line inside it is tested; a line becomes a cut when
    any of the following holds:

    1. It has more than one word, its text is entirely upper case and its
       left/right margins differ by less than 5% of the box width.
    2. It is centred: both margins exceed 5% of the box width, it is not
       all-numeric (``check_all_num``), has more than one word, and its
       centre/margin offsets are below 1.5% / 3% of the box width.
    3. Header heuristic: the box is taller than 200 units, the line starts
       above y=1500 and is 65%-80% as wide as the box.

    Args:
        boxes_in: iterable of box dicts with ``l``, ``r``, ``t``, ``b`` and
            ``num_col``.

    Returns:
        Boxes from ``create_new_boxes`` (or the unsplit box when it returns
        nothing), filtered to boxes that contain at least one page word.
    """
    zones_out = []
    for box in boxes_in:
        if box['num_col'] == 'one':
            continue
        tops = []
        bots = []
        box_width = box['r'] - box['l']
        # Fix: height is bottom minus top. The former ``b - h`` subtracted
        # the stored height from the bottom edge, which yields the top
        # coordinate (or garbage when ``h`` is stale), not a height.
        box_height = box['b'] - box['t']
        box_center = box['l'] + box_width / 2
        for line_l in get_lines_in_box(box, self.lines_in_page):
            words_in_line = get_words_in_box(line_l, self.words_in_page)
            line_str = ' '.join(w['word'] for w in words_in_line)
            left_gap = line_l['l'] - box['l']
            right_gap = box['r'] - line_l['r']
            gap_diff = right_gap - left_gap
            line_width = line_l['r'] - line_l['l']
            line_center = line_l['l'] + line_width / 2
            center_diff = box_center - line_center
            all_num = check_all_num(words_in_line)
            multi_word = len(words_in_line) > 1

            # Case 1: entirely capital, roughly symmetric line.
            is_caps_title = (line_str.isupper() and multi_word
                             and abs(gap_diff) < 0.05 * box_width)
            # Case 2: centred, non-numeric line.
            is_centered = (left_gap > 0.05 * box_width
                           and right_gap > 0.05 * box_width
                           and not all_num
                           and abs(center_diff) < 0.015 * box_width
                           and abs(gap_diff) < 0.03 * box_width
                           and multi_word)
            # Case 3: "Moody's" page-header title.
            is_header = (box_height > 200 and line_l['t'] < 1500
                         and 0.65 * box_width < line_width < 0.8 * box_width)

            if (is_caps_title or is_centered or is_header) \
                    and line_l['t'] not in tops:
                # tops/bots are appended together so they stay aligned.
                tops.append(line_l['t'])
                bots.append(line_l['b'])
        new_boxes_m = self.create_new_boxes(tops, bots, box)
        if new_boxes_m:
            zones_out.extend(new_boxes_m)
        else:
            zones_out.append(box)
    return [
        box for box in zones_out
        if get_words_in_box(box, self.words_in_page)
    ]
def title_splits_3_1_col(self, boxes):
    """Partition step 3: horizontal cuts at upper-case lines inside boxes.

    Also refreshes ``self.all_line_heights`` with the height of every line of
    the page. Within each box the first reconstructed line is never
    considered; any later line with more than one word and fully upper-case
    text marks a cut handed to ``create_new_boxes``.

    Args:
        boxes: iterable of box dicts.

    Returns:
        The boxes produced by ``create_new_boxes``, or the original box when
        it produces nothing. No emptiness filtering is applied here.
    """
    self.all_line_heights = [ln['b'] - ln['t'] for ln in self.lines_in_page]
    result = []
    for region in boxes:
        cut_tops, cut_bots = [], []
        region_lines = redefined_line_bounds(self.get_split_lines(region))
        # Skip the first line: a title at the very top needs no cut.
        for candidate in region_lines[1:]:
            candidate_words = get_words_in_box(candidate, self.words_in_page)
            text = ' '.join(entry['word'] for entry in candidate_words)
            if not (text.isupper() and len(candidate_words) > 1):
                continue
            if candidate['t'] not in cut_tops:
                cut_tops.append(candidate['t'])
                cut_bots.append(candidate['b'])
        pieces = self.create_new_boxes(cut_tops, cut_bots, region)
        result.extend(pieces if pieces else [region])
    return result
def right_gap_splits(self, box):
    """Split a one-column box after lines that end well short of its right edge.

    The threshold is the 60th percentile of all word widths on the page. For
    each interior line (first and last excluded), a trailing gap wider than
    the threshold closes a new box at that line's bottom, and the left edge
    of the following line is recorded in ``self.left_indents``.

    Args:
        box: box dict with ``l``, ``r``, ``t`` and ``num_col``.

    Returns:
        List of boxes: the split pieces followed by the remainder for a
        ``'one'`` column box, otherwise ``[box]``.
    """
    word_widths = [w['r'] - w['l'] for w in self.words_in_page]
    gap_threshold = np.percentile(word_widths, 60)
    block_lines = redefined_line_bounds(self.get_split_lines(box))
    # The table check is evaluated as in the original flow although its
    # result does not influence the split below.
    check_table(get_words_in_box(box, self.words_in_page),
                box['r'] - box['l'], len(block_lines))
    if box['num_col'] != 'one':
        return [box]

    def by_top(item):
        return ('t' not in item, item.get('t'))

    pieces = []
    remainder = copy.deepcopy(box)
    top_down = sorted(block_lines, key=by_top)
    for idx, line in enumerate(top_down[1:-1], 1):
        if box['r'] - line['r'] <= gap_threshold:
            continue
        pieces.append({
            't': remainder['t'],
            'b': line['b'],
            'l': remainder['l'],
            'r': remainder['r'],
            'w': remainder['r'] - remainder['l'],
            'h': line['b'] - remainder['t'],
            'color': 'seagreen',
            'num_col': 'one',
        })
        # The remainder now starts where the closed piece ends.
        remainder['t'] = line['b']
        self.left_indents.append(block_lines[idx + 1]['l'])
    pieces.append(remainder)
    return pieces
def right_gap_splits(self, box):
    """Split a two-column, non-table box after lines with a wide right gap.

    The threshold is the 98th percentile of all word widths on the page. A
    candidate piece (from the current top to the bottom of the short line) is
    accepted only when the lines after its first one contain words and those
    words are not mostly numeric (``check_all_num`` with
    ``num_percent=0.5``). On acceptance the offset of the following line's
    left edge from the box is stored in ``self.left_indents_2_col``.

    Args:
        box: box dict with ``l``, ``r``, ``t`` and ``num_col``.

    Returns:
        The accepted pieces followed by the remainder, or ``[box]`` when the
        box is not two-column or is a table.
    """
    word_widths = [w['r'] - w['l'] for w in self.words_in_page]
    gap_threshold = np.percentile(word_widths, 98)
    block_words = get_words_in_box(box, self.words_in_page)
    block_lines = get_lines_in_box(box, self.lines_in_page)
    table_like = check_table(block_words, box['r'] - box['l'],
                             len(block_lines))
    if box['num_col'] != 'two' or table_like:
        return [box]

    def by_top(item):
        return ('t' not in item, item.get('t'))

    pieces = []
    remainder = copy.deepcopy(box)
    top_down = sorted(block_lines, key=by_top)
    for idx, line in enumerate(top_down[1:-1], 1):
        if box['r'] - line['r'] <= gap_threshold:
            continue
        piece = {
            't': remainder['t'],
            'b': line['b'],
            'l': remainder['l'],
            'r': remainder['r'],
            'w': remainder['r'] - remainder['l'],
            'h': line['b'] - remainder['t'],
            'color': 'orange',
            'num_col': 'two',
        }
        # Judge the piece by everything but its first line.
        body_words = []
        for body_line in get_lines_in_box(piece, self.lines_in_page)[1:]:
            body_words.extend(get_words_in_box(body_line, self.words_in_page))
        mostly_numeric = check_all_num(body_words, num_percent=0.5)
        if body_words and not mostly_numeric:
            pieces.append(piece)
            remainder['t'] = line['b']
            self.left_indents_2_col.append(
                block_lines[idx + 1]['l'] - box['l'])
    pieces.append(remainder)
    return pieces
def title_splits_1_col(self, boxes):
    """Partition step 3: split boxes horizontally at title-like lines.

    Refreshes ``self.all_line_heights`` from ``self.lines_in_page`` and uses
    its 90th percentile as the "tall line" threshold. A page line inside a
    box becomes a cut when any of these holds:

    1. More than one word and entirely upper-case text.
    2. Taller than the 90th-percentile line height with both side margins
       wider than 10% of the box width.
    3. Centred: both margins wider than 10% of the box width, margin
       difference below 1% and centre offset below 0.5% of the box width.
    4. Header heuristic: box taller than 200 units, line starting above
       y=1500 and 65%-80% as wide as the box.

    Args:
        boxes: iterable of box dicts with ``l``, ``r``, ``t``, ``b``.

    Returns:
        Boxes from ``create_new_boxes``, or the original box when it returns
        nothing.

    Raises:
        Whatever ``np.percentile`` raises for an empty ``lines_in_page``.
    """
    self.all_line_heights = [w['b'] - w['t'] for w in self.lines_in_page]
    tall_line_height = np.percentile(self.all_line_heights, 90)
    zones_out = []
    for box in boxes:
        tops = []
        bots = []
        box_width = box['r'] - box['l']
        # Fix: height is bottom minus top. The former ``b - h`` subtracted
        # the stored height from the bottom edge, which is the top
        # coordinate (or garbage when ``h`` is stale), not a height.
        box_height = box['b'] - box['t']
        box_center = box['l'] + box_width / 2
        for line_l in get_lines_in_box(box, self.lines_in_page):
            words_in_line = get_words_in_box(line_l, self.words_in_page)
            line_str = ' '.join(w['word'] for w in words_in_line)
            left_gap = line_l['l'] - box['l']
            right_gap = box['r'] - line_l['r']
            gap_diff = right_gap - left_gap
            line_width = line_l['r'] - line_l['l']
            line_center = line_l['l'] + line_width / 2
            center_diff = box_center - line_center
            wide_margins = (left_gap > 0.1 * box_width
                            and right_gap > 0.1 * box_width)

            # Case 1: entirely capital string.
            is_caps_title = line_str.isupper() and len(words_in_line) > 1
            # Case 2: unusually tall line set in from both edges.
            is_tall = (line_l['b'] - line_l['t']) > tall_line_height \
                and wide_margins
            # Case 3: centred line.
            is_centered = (wide_margins
                           and abs(gap_diff) < 0.01 * box_width
                           and abs(center_diff) < 0.005 * box_width)
            # Case 4: "Moody's" page-header title.
            is_header = (box_height > 200 and line_l['t'] < 1500
                         and 0.65 * box_width < line_width < 0.8 * box_width)

            if (is_caps_title or is_tall or is_centered or is_header) \
                    and line_l['t'] not in tops:
                # tops/bots are appended together so they stay aligned.
                tops.append(line_l['t'])
                bots.append(line_l['b'])
        new_boxes_m = self.create_new_boxes(tops, bots, box)
        if new_boxes_m:
            zones_out.extend(new_boxes_m)
        else:
            zones_out.append(box)
    return zones_out
def find_moodys_box(self, box, num_lines):
    """Return ``box`` when its text resembles the "MOODY'S MANUAL" header.

    The words found in the box are joined with spaces and compared against
    ``"MOODY'S MANUAL OF INVESTMENTS"`` using ``cosine_similarity``: first
    with its default mode (match when the score is above 0.8), then in
    ``'char'`` mode (match when the score is at least 0.7).

    Before comparing, double-quote characters (ASCII and typographic) are
    rewritten to an apostrophe so that a mis-read apostrophe still matches.
    The previous code spelled this replacement with an unterminated
    triple-quote literal, which is a syntax error.

    Args:
        box: box dict to inspect.
        num_lines: unused; kept so existing callers keep working.

    Returns:
        ``box`` on a match, otherwise an empty list.
    """
    moodys_string = "MOODY'S MANUAL OF INVESTMENTS"
    words_in_box = ' '.join(
        w['word'] for w in get_words_in_box(box, self.words_in_page))
    # Escapes are used for the curly quotes so the source stays pure ASCII.
    for quote in ('"', '\u201c', '\u201d'):
        words_in_box = words_in_box.replace(quote, "'")

    if cosine_similarity(moodys_string, words_in_box) > 0.8:
        return box
    # Fall back to the character-level comparison for noisier text.
    if cosine_similarity(moodys_string, words_in_box, 'char') >= 0.7:
        return box
    return []
def left_indent_split(self, boxes_in):
    """Split one-column, non-table boxes at lines that start a paragraph.

    The expected indent is the mean of ``self.left_indents`` when any were
    recorded, otherwise the smallest left edge among ``boxes_in`` plus 350.
    It is recomputed once per box. An interior line (first and last
    excluded) starts a new box when its left edge lies strictly between 85%
    and 125% of the expected indent and the following line starts further
    left. Each hit is also appended to ``self.left_indents``.

    Args:
        boxes_in: list of box dicts.

    Returns:
        Flat list with, per input box, either the split pieces plus the
        remainder or the untouched box.
    """
    one_col_tab = 350
    collected = []
    for block in boxes_in:
        if self.left_indents:
            expected_indent = np.mean(self.left_indents)
        else:
            leftmost = min(boxes_in, key=lambda candidate: candidate['l'])
            expected_indent = leftmost['l'] + one_col_tab
        remainder = copy.deepcopy(block)
        block_lines = redefined_line_bounds(self.get_split_lines(block))
        table_like = check_table(
            get_words_in_box(block, self.words_in_page),
            block['r'] - block['l'], len(block_lines))
        if table_like or block['num_col'] != 'one':
            collected.append(block)
            continue

        top_down = sorted(block_lines,
                          key=lambda item: ('t' not in item, item.get('t')))
        for idx, line in enumerate(top_down[1:-1], 1):
            indented = (0.85 * expected_indent < line['l']
                        < 1.25 * expected_indent)
            next_outdented = block_lines[idx + 1]['l'] < line['l']
            if not (indented and next_outdented):
                continue
            self.left_indents.append(line['l'])
            # Close the piece just above the indented line; width/height are
            # carried over from the remainder as they are.
            collected.append({
                't': remainder['t'],
                'b': line['t'],
                'l': remainder['l'],
                'r': remainder['r'],
                'w': remainder['w'],
                'h': remainder['h'],
                'color': 'seagreen',
                'num_col': remainder['num_col'],
            })
            remainder['t'] = line['t']
        collected.append(remainder)
    return collected
def left_indent_split(self, boxes_in):
    """Split three-column, non-table boxes at lines that start a paragraph.

    The tab width comes from ``self.get_tab_width()`` (queried once per
    box). An interior page line (first and last excluded) starts a new box
    when its offset from the box's left edge lies strictly between 0.65 and
    3 tab widths and the following line starts further left. Each hit's
    offset is appended to ``self.left_indents``.

    Args:
        boxes_in: iterable of box dicts.

    Returns:
        Flat list with, per input box, either the split pieces plus the
        remainder or the untouched box.
    """
    final_boxes = []
    for block in boxes_in:
        tab_width = self.get_tab_width()
        remainder = copy.deepcopy(block)
        rebuilt_lines = redefined_line_bounds(self.get_split_lines(block))
        table_like = check_table(
            get_words_in_box(block, self.words_in_page),
            block['r'] - block['l'],
            len(get_lines_in_box(block, rebuilt_lines)))
        if table_like or block['num_col'] != 'three':
            final_boxes.append(block)
            continue

        # The split itself works on the page's own lines, not the rebuilt ones.
        page_lines = get_lines_in_box(block, self.lines_in_page)
        top_down = sorted(page_lines,
                          key=lambda item: ('t' not in item, item.get('t')))
        for idx, line in enumerate(top_down[1:-1], 1):
            offset = abs(line['l'] - block['l'])
            indented = 0.65 * tab_width < offset < 3 * tab_width
            next_outdented = page_lines[idx + 1]['l'] < line['l']
            if not (indented and next_outdented):
                continue
            self.left_indents.append(line['l'] - block['l'])
            final_boxes.append({
                't': remainder['t'],
                'b': line['t'],
                'l': remainder['l'],
                'r': remainder['r'],
                'w': remainder['w'],
                'h': remainder['h'],
                'color': 'orange',
                'num_col': remainder['num_col'],
            })
            remainder['t'] = line['t']
        final_boxes.append(remainder)
    return final_boxes
def get_split_lines(self, box):
    """Group the words of a box into text lines.

    Words are walked top to bottom. A word opens a new line when its top
    lies more than 12 units below the bottom of the word that opened the
    current line; otherwise it joins the current line.

    Args:
        box: box dict used to select words from ``self.words_in_page``.

    Returns:
        List of non-empty lines, each a list of word dicts ordered left to
        right.
    """
    def by_top(word):
        return ('t' not in word, word.get('t'))

    def by_left(word):
        return ('l' not in word, word.get('l'))

    rows = []
    current_row = []
    anchor_bottom = 0
    for word in sorted(get_words_in_box(box, self.words_in_page), key=by_top):
        if word['t'] - anchor_bottom > 12:
            # Far enough below the current line's anchor: start a new line.
            rows.append(current_row)
            current_row = [word]
            anchor_bottom = word['b']
        else:
            current_row.append(word)
    rows.append(current_row)
    # Empty rows (e.g. the initial placeholder) are discarded.
    return [sorted(row, key=by_left) for row in rows if row]
def minor_horizontal_splits(self, boxes_in):
    """Apply the fine-grained horizontal splits matching each box's layout.

    * ``'one'``: ``right_gap_splits`` followed by ``left_indent_split``.
    * ``'two'``: delegated to ``two_col_page(self.page)``.
    * ``'three'``: delegated to ``three_col_page(self.page)``.

    Boxes with any other ``num_col`` value contribute nothing.

    Args:
        boxes_in: iterable of box dicts carrying ``num_col``.

    Returns:
        The resulting boxes that contain at least one word of the page.
    """
    pieces = []
    for box in boxes_in:
        layout = box['num_col']
        if layout == 'one':
            after_right_gap = self.right_gap_splits(box)
            pieces.extend(self.left_indent_split(after_right_gap))
        elif layout == 'two':
            pieces.extend(
                two_col_page(self.page).minor_horizontal_splits([box]))
        elif layout == 'three':
            pieces.extend(
                three_col_page(self.page).minor_horizontal_splits([box]))
    return [
        piece for piece in pieces
        if get_words_in_box(piece, self.words_in_page)
    ]
def partition_3_col(self, boxes_in):
    """Partition a page that is expected to have three columns.

    All incoming boxes are first rejoined into a single box that runs from
    the top of the topmost box to the bottom of the bottommost one (other
    fields come from the topmost box). ``find_vertical_line`` is then asked
    for a separator near one third and two thirds of its width.

    Args:
        boxes_in: list of box dicts.

    Returns:
        Three ``'three'``-column boxes when both separators are found, a
        single ``'one'``-column box when either is missing, and ``[]`` for
        empty input.
    """
    if not boxes_in:
        return []

    ordered = copy.deepcopy(
        sorted(boxes_in, key=lambda item: ('t' not in item, item.get('t'))))
    merged = copy.deepcopy(ordered[0])
    merged.update({
        'b': ordered[-1]['b'],
        'h': ordered[-1]['b'] - ordered[0]['t'],
        'color': 'seagreen',
    })
    left_edge, right_edge = merged['l'], merged['r']
    third = (right_edge - left_edge) / 3
    merged_words = get_words_in_box(merged, self.words_in_page)
    first_cut = find_vertical_line(left_edge + third, merged_words)
    second_cut = find_vertical_line(right_edge - third, merged_words)
    top, bottom = merged['t'], merged['b']

    if not (first_cut and second_cut):
        # No usable gutters: fall back to one full-width column.
        return [{
            'l': left_edge,
            't': top,
            'r': right_edge,
            'b': bottom,
            'w': right_edge - left_edge,
            'h': bottom - top,
            'color': 'red',
            'num_col': 'one',
        }]

    def column(x_left, x_right):
        return {
            't': top,
            'b': bottom,
            'l': x_left,
            'r': x_right,
            'w': x_right - x_left,
            'h': bottom - top,
            'color': 'mediumvioletred',
            'num_col': 'three',
        }

    return [
        column(left_edge, first_cut),
        column(first_cut, second_cut),
        column(second_cut, right_edge),
    ]
def vertical_splits_three_col(self, boxes_in):
    """Try to cut every non-table box into three side-by-side columns.

    For a box that holds words and is not a table, ``find_vertical_line`` is
    queried near one third and two thirds of the width. When both separators
    exist, three column boxes and the two gutters between them (widened by
    one unit on each side) are passed to ``find_3_col_intersects``, whose
    result is used. In every other case the box is passed through unchanged.

    Args:
        boxes_in: iterable of box dicts (needs ``l``, ``r``, ``t``, ``b``,
            ``w``).

    Returns:
        Flat list of resulting boxes.
    """
    result = []
    for region in boxes_in:
        region_words = get_words_in_box(region, self.words_in_page)
        region_lines = redefined_line_bounds(self.get_split_lines(region))
        table_like = check_table(region_words, region['w'], len(region_lines))
        if table_like or not region_words:
            result.append(region)
            continue

        left_edge, right_edge = region['l'], region['r']
        third = (right_edge - left_edge) / 3
        search_words = get_words_in_box(region, self.words_in_page)
        line_count = len(region_lines)
        first_cut = find_vertical_line(left_edge + third, search_words,
                                       line_count)
        second_cut = find_vertical_line(right_edge - third, search_words,
                                        line_count)
        if not (first_cut and second_cut):
            result.append(region)
            continue

        spans = ((left_edge, first_cut), (first_cut, second_cut),
                 (second_cut, right_edge))
        col_a, col_b, col_c = [{
            't': region['t'],
            'b': region['b'],
            'l': x_left,
            'r': x_right,
            'w': x_right - x_left,
            'h': region['b'] - region['t'],
            'color': 'mediumvioletred',
            'num_col': 'three',
        } for x_left, x_right in spans]

        # Gutters straddle each separator by one unit on either side.
        left_gutter = {
            'l': col_a['r'] - 1,
            'r': col_b['l'] + 1,
            'w': col_b['l'] - col_a['r'] + 2,
        }
        right_gutter = {
            'l': col_b['r'] - 1,
            'r': col_c['l'] + 1,
            'w': col_c['l'] - col_b['r'] + 2,
        }
        result.extend(self.find_3_col_intersects(
            region, left_gutter, right_gutter, col_a, col_b, col_c,
            region_words, region_lines))
    return result
def find_3_col_intersects(self, main_box, gutter_l, gutter_r, box_1, box_2,
                          box_3, words_in_block, lines_in_block):
    """Undo the three-column split wherever text crosses a gutter.

    A line is a "gutter line" when one of its non-punctuation words, lying
    vertically inside ``box_1``, is inside, starts in, ends in or spans
    either gutter. The tops/bottoms of those lines are handed to
    ``self.unsplit_boxes`` together with ``main_box``; the boxes it returns
    are then used to cut the three columns at the same heights, and
    consecutive ``'one'`` column boxes in the top-sorted result are merged.

    Fix: the merge loop used to start again from the first sorted box, so a
    first box that was not ``'one'`` column appeared twice in the output.

    Args:
        main_box: the box covering all three columns.
        gutter_l, gutter_r: dicts with ``l``/``r`` bounds of the gutters.
        box_1, box_2, box_3: left, middle and right column boxes.
        words_in_block: words of ``main_box``.
        lines_in_block: line dicts (``t``/``b``) of ``main_box``.

    Returns:
        ``[box_1, box_2, box_3]`` when ``unsplit_boxes`` returns nothing,
        otherwise the cut columns and unsplit boxes ordered by top edge.
    """

    def _hits_gutter(word, gutter):
        # Word is inside, starts in, ends in, or spans the gutter.
        inside = gutter['l'] < word['l'] and gutter['r'] > word['r']
        starts_in = (gutter['l'] <= word['l'] < gutter['r']
                     and word['r'] > gutter['r'])
        ends_in = (word['l'] < gutter['l']
                   and gutter['l'] < word['r'] < gutter['r'])
        spans = word['l'] < gutter['l'] and word['r'] > gutter['r']
        return inside or starts_in or ends_in or spans

    def _pieces(box, edits):
        # One deep copy of ``box`` per (key, value) edit.
        result = []
        for key, value in edits:
            piece = copy.deepcopy(box)
            piece[key] = value
            result.append(piece)
        return result

    tops = []
    bots = []
    for line in lines_in_block:
        only_words = [
            w for w in get_words_in_box(line, words_in_block)
            if w['word'] not in string.punctuation
        ]
        for word in only_words:
            if not (word['t'] >= box_1['t'] and word['b'] <= box_1['b']):
                continue
            if _hits_gutter(word, gutter_l) or _hits_gutter(word, gutter_r):
                # One crossing word is enough to flag the whole line.
                tops.append(line['t'])
                bots.append(line['b'])
                break

    non_gap_boxes = self.unsplit_boxes(tops, bots, main_box, main_box['l'],
                                       main_box['r'])
    if not non_gap_boxes:
        return [box_1, box_2, box_3]

    # The three columns are cut in lockstep; the decision is taken on the
    # left column and applied to the same index of the other two.
    columns = [
        copy.deepcopy([box_1]),
        copy.deepcopy([box_2]),
        copy.deepcopy([box_3]),
    ]
    for ngb in non_gap_boxes:
        split_columns = [[], [], []]
        for bnum, outer_box in enumerate(columns[0]):
            if ngb['t'] == outer_box['t'] and ngb['b'] < outer_box['b']:
                # Unsplit region starts at the top of the column piece.
                edits = (('b', ngb['b']), ('t', ngb['b']))
            elif (outer_box['t'] < ngb['t'] < outer_box['b']
                  and outer_box['t'] < ngb['b'] < outer_box['b']):
                # Unsplit region lies strictly inside the column piece.
                edits = (('b', ngb['t']), ('t', ngb['b']))
            elif (outer_box['t'] < ngb['t'] < outer_box['b']
                  and ngb['b'] == outer_box['b']):
                # Unsplit region ends at the bottom of the column piece.
                edits = (('b', ngb['t']), ('t', ngb['t']))
            else:
                edits = None
            for column, target in zip(columns, split_columns):
                if edits is None:
                    target.append(column[bnum])
                else:
                    target.extend(_pieces(column[bnum], edits))
        columns = split_columns

    all_of_them = columns[0] + columns[1] + columns[2] + non_gap_boxes

    # Merge the single-column boxes that were unsplit.
    sorted_td = sorted(all_of_them,
                       key=lambda k: ('t' not in k, k.get('t', None)))
    boxes_out = [sorted_td[0]]
    # Start at the second box: the first one already seeds ``boxes_out``.
    for box in sorted_td[1:]:
        if box['num_col'] == 'one' and boxes_out[-1]['num_col'] == 'one':
            boxes_out[-1]['b'] = box['b']
        else:
            boxes_out.append(box)
    return boxes_out
def find_gap_intersections(self, boxes_in):
    """Undo two-column splits wherever a page line crosses the gutter.

    Two-column boxes whose left edge is the leftmost edge of ``boxes_in``
    are paired with two-column boxes that end at the rightmost edge and
    share the same top and bottom. For each pair, every line of
    ``self.lines_in_page`` that lies vertically inside the left box and is
    inside, starts in, ends in or spans the gutter is collected and handed
    to ``self.unsplit_boxes``. The boxes it returns are used to cut both
    columns at the same heights. One-column input boxes pass through;
    boxes without a ``num_col`` key are dropped.

    Changes from the earlier version:
      * empty input returns ``[]`` instead of raising from ``min``/``max``;
      * the partner test checks ``num_col`` of the candidate right box (it
        used to re-check the left box, which is always ``'two'`` there);
      * the gutter width is measured from the left box's right edge.

    Args:
        boxes_in: list of box dicts with ``l``, ``r``, ``t``, ``b``.

    Returns:
        De-duplicated boxes with non-zero height (``t != b``) that contain
        at least one word of the page.
    """
    if not boxes_in:
        return []

    def _pieces(box, edits):
        # One deep copy of ``box`` per (key, value) edit.
        result = []
        for key, value in edits:
            piece = copy.deepcopy(box)
            piece[key] = value
            result.append(piece)
        return result

    actual_left = min(boxes_in, key=lambda x: x['l'])['l']
    actual_right = max(boxes_in, key=lambda x: x['r'])['r']
    final_splits = []
    paired_boxes = []
    for box in boxes_in:
        if 'num_col' not in box:
            continue
        if box['num_col'] == 'two' and box['l'] == actual_left:
            for box_in in boxes_in:
                if (box_in['r'] > box['r']
                        and box_in['r'] == actual_right
                        and box_in['t'] == box['t']
                        and box_in['b'] == box['b']
                        and box_in.get('num_col') == 'two'):
                    paired_boxes.append({
                        'left_box': box,
                        'right_box': box_in,
                    })
        elif box['num_col'] == 'one':
            final_splits.append(box)

    # NOTE(review): ``unsplit_boxes`` has always been given the last box of
    # ``boxes_in`` (the leftover loop variable), not a box tied to the pair.
    # Kept as is because ``unsplit_boxes`` is not visible here - confirm
    # whether ``left_box`` was intended.
    last_box = boxes_in[-1]

    for pair in paired_boxes:
        left_box = pair['left_box']
        right_box = pair['right_box']
        gutter = {
            'l': left_box['r'],
            'r': right_box['l'],
            'w': right_box['l'] - left_box['r'],
        }
        tops = []
        bots = []
        for line in self.lines_in_page:
            if not (line['t'] >= left_box['t']
                    and line['b'] <= left_box['b']):
                continue
            # Whole line is in the gutter.
            inside = gutter['l'] < line['l'] and gutter['r'] > line['r']
            # Line starts in the gutter and ends in the right box.
            starts_in = (gutter['l'] <= line['l'] < gutter['r']
                         and line['r'] > gutter['r'])
            # Line starts in the left box and ends in the gutter.
            ends_in = (line['l'] < gutter['l']
                       and gutter['l'] < line['r'] < gutter['r'])
            # Line starts before the gutter and ends after it.
            spans = line['l'] < gutter['l'] and line['r'] > gutter['r']
            if inside or starts_in or ends_in or spans:
                tops.append(line['t'])
                bots.append(line['b'])

        non_gap_boxes = self.unsplit_boxes(tops, bots, last_box,
                                           actual_left, actual_right)
        if not non_gap_boxes:
            final_splits.extend([left_box, right_box])
            continue

        # Both columns are cut in lockstep; the decision is taken on the
        # left column and applied to the same index of the right one.
        columns = [copy.deepcopy([left_box]), copy.deepcopy([right_box])]
        for nbm in non_gap_boxes:
            split_columns = [[], []]
            for bnum, outer_box in enumerate(columns[0]):
                if nbm['t'] == outer_box['t'] and nbm['b'] < outer_box['b']:
                    edits = (('b', nbm['b']), ('t', nbm['b']))
                elif (outer_box['t'] < nbm['t'] < outer_box['b']
                      and outer_box['t'] < nbm['b'] < outer_box['b']):
                    edits = (('b', nbm['t']), ('t', nbm['b']))
                elif (outer_box['t'] < nbm['t'] < outer_box['b']
                      and nbm['b'] == outer_box['b']):
                    edits = (('b', nbm['t']), ('t', nbm['t']))
                else:
                    edits = None
                for column, target in zip(columns, split_columns):
                    if edits is None:
                        target.append(column[bnum])
                    else:
                        target.extend(_pieces(column[bnum], edits))
            columns = split_columns
        final_splits += columns[0] + columns[1] + non_gap_boxes

    # Drop zero-height boxes, then exact duplicates (order preserved).
    boxes_out = []
    for candidate in final_splits:
        if candidate['t'] != candidate['b'] and candidate not in boxes_out:
            boxes_out.append(candidate)
    return [
        box for box in boxes_out
        if get_words_in_box(box, self.words_in_page)
    ]
def vertical_splits_one_col(self, boxes_in):
    """Look for two-column regions hidden inside one-column boxes.

    For each input box, runs of lines that start to the right of the box's
    vertical midline are examined; a run that is long enough, has several
    lines sharing a left edge and is wide enough yields a horizontal band.
    Each band becomes a left/right pair of ``'two'`` column boxes (or one
    ``'one'`` column box when both halves are tables), and the surrounding
    parts of the original box are kept as separate pieces.

    Args:
        boxes_in: iterable of box dicts with ``l``, ``r``, ``t``, ``b``.

    Returns:
        De-duplicated list of resulting boxes; a box without any band is
        returned unchanged.
    """
    zones_out = []
    final_splits = []
    for working_block in boxes_in:
        # Pieces of the original box that remain after bands are cut out.
        to_split = [copy.deepcopy(working_block)]
        new_boxes = []
        lines_in_block = get_lines_in_box(working_block, self.lines_in_page)
        words_in_block = get_words_in_box(working_block, self.words_in_page)
        half_width = (working_block['r'] - working_block['l']) / 2
        box_half = working_block['l'] + half_width
        lines_on_left = {}
        lines_on_right = {}
        for line_num, line in enumerate(lines_in_block):
            # Lines starting more than 10 units right of the midline.
            if line['l'] > box_half + 10:
                lines_on_right[line_num] = line
            # Lines ending left of the midline (collected but not used below).
            if box_half > line['r']:
                lines_on_left[line_num] = line
        split_line_nums = sorted([*lines_on_right])
        # presumably groups consecutive line numbers into (start, end)
        # ranges - TODO confirm against formatlinelist
        consecutive_lines = formatlinelist(split_line_nums)
        binwidth_2 = 60
        tops = []
        bots = []
        for line_range in consecutive_lines:
            # NOTE(review): this slice excludes line_range[1] while the
            # ranges below include it - confirm which is intended.
            if lines_in_block[line_range[0]:line_range[1]]:
                average_line_width = np.mean([
                    ll['r'] - ll['l']
                    for ll in lines_in_block[line_range[0]:line_range[1]]
                ])
            else:
                average_line_width = 0
            block_lefts = [
                lines_in_block[l_num]['l']
                for l_num in range(line_range[0], line_range[1] + 1)
            ]
            # Histogram of left edges in bins of width binwidth_2.
            y_freq, y_ranges = np.histogram(block_lefts, \
                bins=np.arange(min(block_lefts), \
                max(block_lefts) + binwidth_2, binwidth_2))
            if y_freq.any():
                # Accept the run when it spans more than two line numbers,
                # more than two lines fall in one bin, and the average line
                # is wider than 40% of half the box.
                if line_range[-1] - line_range[0] > 2 and max(
                        y_freq
                ) > 2 and average_line_width > 0.4 * half_width:
                    lines_selected = lines_in_block[
                        line_range[0]:line_range[-1] + 1]
                    sorted_td = sorted(lines_selected,
                                       key=lambda k:
                                       ("t" not in k, k.get('t', None)))
                    tops.append(sorted_td[0]['t'])
                    bots.append(sorted_td[-1]['b'])
        if tops:
            for c_top, c_bot in zip(tops, bots):
                # Halves of the band on either side of the midline.
                left_box = {
                    't': c_top,
                    'b': c_bot,
                    'l': working_block['l'],
                    'r': box_half,
                    'w': box_half - working_block['l'],
                    'h': c_bot - c_top,
                    'color': 'orange',
                    'num_col': 'two'
                }
                right_box = {
                    't': c_top,
                    'b': c_bot,
                    'l': box_half,
                    'r': working_block['r'],
                    'w': working_block['r'] - box_half,
                    'h': c_bot - c_top,
                    'color': 'orange',
                    'num_col': 'two'
                }
                is_left_table = check_table(
                    get_words_in_box(left_box, words_in_block),
                    left_box['r'] - left_box['l'],
                    len(get_lines_in_box(left_box, lines_in_block)))
                is_right_table = check_table(
                    get_words_in_box(right_box, words_in_block),
                    right_box['r'] - right_box['l'],
                    len(get_lines_in_box(right_box, lines_in_block)))
                if is_right_table and is_left_table:
                    # Both halves are tables: keep the band as one column.
                    new_boxes = [{
                        't': c_top,
                        'b': c_bot,
                        'l': working_block['l'],
                        'r': working_block['r'],
                        'w': working_block['r'] - working_block['l'],
                        'h': c_bot - c_top,
                        'color': 'seagreen',
                        'num_col': 'one'
                    }]
                else:
                    new_boxes = [left_box, right_box]
                only_split_boxes = []
                final_splits.extend(new_boxes)
                # Cut the band out of every remaining piece of the box.
                for outer_box in to_split:
                    if left_box['t'] == outer_box[
                            't'] and left_box['b'] < outer_box['b']:
                        # Band starts at the top of the piece.
                        box1 = copy.deepcopy(outer_box)
                        box2 = copy.deepcopy(outer_box)
                        box1.update({'b': left_box['b']})
                        box2.update({'t': left_box['b']})
                        only_split_boxes.extend([box1, box2])
                    elif outer_box['t'] < left_box['t'] < outer_box[
                            'b'] and outer_box['t'] < left_box[
                                'b'] < outer_box['b']:
                        # Band lies strictly inside the piece.
                        box1 = copy.deepcopy(outer_box)
                        box3 = copy.deepcopy(outer_box)
                        box1.update({'b': left_box['t']})
                        box3.update({'t': left_box['b']})
                        only_split_boxes.extend([box1, box3])
                    elif outer_box['t'] < left_box['t'] < outer_box[
                            'b'] and left_box['b'] == outer_box['b']:
                        # Band ends at the bottom: only the part above stays.
                        box1 = copy.deepcopy(outer_box)
                        box1.update({'b': left_box['t']})
                        only_split_boxes.extend([box1])
                    else:
                        only_split_boxes.append(outer_box)
                to_split = only_split_boxes
            final_splits.extend(only_split_boxes)
        else:
            final_splits.append(working_block)
    # Drop exact duplicates while keeping first-seen order.
    for fs in final_splits:
        if fs not in zones_out:
            zones_out.append(fs)
    return zones_out