def join_parag_text(paragraph, block2text):
    """Return the text of a paragraph as one string, undoing line breaks.

    Each line block in ``paragraph`` is looked up in ``block2text`` and
    split with ``tokenize``.  A word broken across two consecutive lines is
    glued back together when the next line starts with a lowercase Cyrillic
    letter and either the current line ends with a dash token, or the
    concatenation of the two fragments is a key of ``freq_dict``.

    The tokens are then joined with single spaces, except that no space is
    put before closing punctuation or after opening punctuation.

    Args:
        paragraph: iterable of line blocks, used as keys of ``block2text``.
        block2text: mapping from a line block to its recognised text.

    Returns:
        The joined text.  Every token that is not glued to its predecessor
        is prefixed with a space, so the result normally starts with a
        space; this is kept for compatibility with existing callers.
    """
    # Substring test on purpose: "-", "--" and "---" all count as a dash.
    dashes = u"‒–—―---"
    # Letters that may start the second half of a split word.
    # NOTE(review): the first character should be the Cyrillic letter "a";
    # verify it is not the Latin look-alike.
    continuation_letters = u"aбвгдежзиклмнопрстуфхцчшщэюя"
    no_space_before_chars = u".,:;?!)}]\"»"
    no_space_after_chars = u"«({[\""

    token_lines = [tokenize(block2text[line]) for line in paragraph]

    for index in range(len(token_lines) - 1):
        current = token_lines[index]
        following = token_lines[index + 1]

        # A missing or empty token is treated as a single space so the
        # checks below simply fail for it.
        last_word = (current[-1] if current else "") or " "
        dash_ended = last_word in dashes and len(current) > 1
        if dash_ended:
            last_word = current[-2]
        first_word = (following[0] if following else "") or " "

        if first_word[0] not in continuation_letters:
            continue

        compound = last_word + first_word
        if dash_ended:
            # Drop the dash token and replace the fragment before it.
            token_lines[index] = current[:-2] + [compound]
        elif current and compound.lower() in freq_dict:
            # ``current`` must be non-empty: there has to be a real
            # fragment to overwrite (the original raised IndexError here).
            current[-1] = compound
        else:
            continue
        token_lines[index + 1] = following[1:]

    pieces = []
    glue_next = False
    for words in token_lines:
        for word in words:
            if not word:
                # An empty token carries no text and has no first/last
                # character to inspect; skip it instead of crashing.
                continue
            if not (glue_next or word[0] in no_space_before_chars):
                pieces.append(" ")
            pieces.append(word)
            glue_next = word[-1] in no_space_after_chars
    return "".join(pieces)
def _median_line_height(letters):
    """Return the median height of the letter boxes.

    Falls back to 1 when there are no letters or when the median is 0, so
    the thresholds derived from it are never zero.
    """
    heights = sorted(block[0][1] - block[0][0] for block in letters)
    # Floor division keeps the index an integer on Python 3 as well.
    return (heights[len(heights) // 2] if heights else 0) or 1


def _blank_out_blocks(matrix, blocks):
    """Set the matrix area covered by every block to 0 (in place)."""
    for block in blocks:
        x_min, x_max = block[0]
        y_min, y_max = block[1]
        matrix[x_min:x_max, y_min:y_max] = 0


def _find_non_empty_blocks(matrix):
    """Run ``detect_non_empty_blocks`` over the whole matrix.

    Presumably ``detect_non_empty_blocks`` appends the blocks it finds to
    the list passed as its third argument; that list is returned.
    """
    found = []
    whole_page = ((0, matrix.shape[0]), (0, matrix.shape[1]))
    detect_non_empty_blocks(matrix, whole_page, found)
    return found


def _block_size(block):
    """Return the extents of the block along its first and second axis."""
    return block[0][1] - block[0][0], block[1][1] - block[1][0]


def _join_nearby_blocks(blocks, max_gap):
    """Repeatedly merge blocks whose ``max_distance`` is below ``max_gap``."""
    blocks = list(blocks)
    while True:
        absorbed = set()
        for first in range(len(blocks)):
            if first in absorbed:
                continue
            for second in range(first + 1, len(blocks)):
                if max_distance(blocks[first], blocks[second]) < max_gap:
                    blocks[first] = merge_blocks(blocks[first], blocks[second])
                    absorbed.add(second)
        if not absorbed:
            return blocks
        blocks = [block for index, block in enumerate(blocks)
                  if index not in absorbed]


def _paragraph_bounds(paragraph):
    """Return the bounding box ((min_x, max_x), (min_y, max_y)) of the lines."""
    min_x, max_x = 100000, -10000
    min_y, max_y = 100000, -10000
    for (block_x_min, block_x_max), (block_y_min, block_y_max) in paragraph:
        min_x = min(min_x, block_x_min)
        max_x = max(max_x, block_x_max)
        min_y = min(min_y, block_y_min)
        max_y = max(max_y, block_y_max)
    return (min_x, max_x), (min_y, max_y)


def _merge_split_paragraphs(paragraphs, texts, line_height):
    """Merge paragraphs that were split side by side into two halves.

    Two paragraphs are merged line by line when they have the same number
    of lines, start at about the same first-axis position and touch along
    the second axis (all within ``line_height``).  ``texts`` receives an
    entry for every merged line.  Returns the new list of paragraphs.
    """
    paragraphs = list(paragraphs)
    while True:
        # Recomputed on every pass: the paragraph list is re-indexed after
        # each pass, so boxes from an earlier pass would be misaligned.
        bounds = [_paragraph_bounds(paragraph) for paragraph in paragraphs]
        absorbed = set()
        for first in range(len(paragraphs)):
            if first in absorbed:
                # Already merged into an earlier paragraph and about to be
                # dropped; merging into it would lose the other paragraph.
                continue
            for second in range(first + 1, len(paragraphs)):
                if second in absorbed:
                    continue
                (first_top, _), (first_left, first_right) = bounds[first]
                (second_top, _), (second_left, second_right) = bounds[second]
                if len(paragraphs[first]) != len(paragraphs[second]):
                    continue
                if abs(first_top - second_top) >= line_height:
                    continue
                if (abs(second_left - first_right) >= line_height and
                        abs(second_right - first_left) >= line_height):
                    continue

                absorbed.add(second)
                merged_paragraph = []
                for first_line, second_line in zip(paragraphs[first],
                                                   paragraphs[second]):
                    merged_line = merge_blocks(first_line, second_line)
                    # Keep reading order: the half that starts further
                    # along the second axis contributes its text last.
                    if second_line[1][0] < first_line[1][0]:
                        left_line, right_line = second_line, first_line
                    else:
                        left_line, right_line = first_line, second_line
                    texts[merged_line] = (texts[left_line] + " " +
                                          texts[right_line])
                    merged_paragraph.append(merged_line)
                paragraphs[first] = merged_paragraph
                bounds[first] = _paragraph_bounds(merged_paragraph)
        if not absorbed:
            return paragraphs
        paragraphs = [paragraph for index, paragraph in enumerate(paragraphs)
                      if index not in absorbed]


def _rejoin_hyphenated_words(token_lines):
    """Glue words broken across consecutive token lines (in place)."""
    dashes = u"‒–—―-"
    # NOTE(review): the first character should be the Cyrillic letter "a";
    # verify it is not the Latin look-alike.
    continuation_letters = u"aбвгдежзиклмнопрстуфхцчшщэюя"
    for index in range(len(token_lines) - 1):
        current = token_lines[index]
        following = token_lines[index + 1]

        last_word = (current[-1] if current else "") or " "
        dash_ended = last_word in dashes and len(current) > 1
        if dash_ended:
            last_word = current[-2]
        first_word = (following[0] if following else "") or " "

        if first_word[0] not in continuation_letters:
            continue

        compound = last_word + first_word
        if dash_ended:
            token_lines[index] = current[:-2] + [compound]
        elif current and compound.lower() in freq_dict:
            # Guard against an empty line: nothing to overwrite there.
            current[-1] = compound
        else:
            continue
        token_lines[index + 1] = following[1:]


def _is_reliable_word(word):
    """Tell whether the corrected form of ``word`` is a frequent real word."""
    corrected = grammar_corrector.correct_word(word.lower())
    return (is_alpha(corrected) and len(corrected) > 2 and
            corrected in grammar_corrector.dict and
            grammar_corrector.dict[corrected] > 10)


def _looks_like_text(paragraph, texts):
    """Tell whether a paragraph contains at least two reliable words.

    NOTE(review): the original condition also had a clause for one-word
    paragraphs (``len(full_text) == 1 and good_words != 1``), but it could
    never change the outcome; the effective rule ``good_words >= 2`` is
    kept.  Confirm whether one-word paragraphs were meant to be accepted.
    """
    token_lines = [tokenize(texts[line]) for line in paragraph]
    _rejoin_hyphenated_words(token_lines)
    reliable = sum(1 for tokens in token_lines for word in tokens
                   if _is_reliable_word(word))
    return reliable >= 2


def do_main_analysis(original_image):
    """Analyse a page image and return a textual dump of its layout.

    Steps:
      1. OCR the page (``do_ocr``).
      2. Blank out the recognised letters and treat the large remaining
         non-empty regions as pictures.
      3. If pictures were found, paint them over, save the page to
         ``tmp.tif``, run the external ``cuneiform`` program and load its
         output from ``tmp.cune_out`` in place of the first OCR result.
      4. Merge paragraphs that were split side by side.
      5. Keep only paragraphs with at least two reliable dictionary words.
      6. Blank out the kept paragraphs and the pictures; the remaining
         non-empty regions are reported as formulas.

    Blocks are pairs of ranges ``((a_min, a_max), (b_min, b_max))``; the
    first range indexes the first matrix axis and is passed to the drawing
    call as the vertical extent, the second one as the horizontal extent.

    Args:
        original_image: page image; it must provide ``copy()`` and be
            accepted by ``do_ocr``, ``get_matrix`` and ``ImageDraw.Draw``.

    Returns:
        A string of newline-terminated, tab-separated records: one
        ``paragraph`` record per paragraph, one ``letters`` record, one
        ``images`` and one ``formulas`` record (each only when non-empty)
        and one ``text`` record per entry of the text mapping.
    """
    cune_paragraphs, cune_letters, cune_texts = do_ocr(original_image)

    # --- Pictures: whatever is left on the page once letters are erased.
    image_mat_no_text = get_matrix(original_image)
    _blank_out_blocks(image_mat_no_text, cune_letters)
    image_blocks = _find_non_empty_blocks(image_mat_no_text)

    line_height_avg = _median_line_height(cune_letters)
    # Drop specks smaller than a text line in both directions.
    image_blocks = [block for block in image_blocks
                    if max(_block_size(block)) >= line_height_avg]
    image_blocks = _join_nearby_blocks(image_blocks, 2 * line_height_avg)
    # A picture must exceed four line heights in both directions.
    image_blocks = [block for block in image_blocks
                    if min(_block_size(block)) > 4 * line_height_avg]

    # --- Second OCR pass with the pictures painted over.
    if image_blocks:
        page_no_imgs = original_image.copy()
        draw = ImageDraw.Draw(page_no_imgs)
        for block in image_blocks:
            draw.rectangle(
                (block[1][0], block[0][0], block[1][1], block[0][1]),
                fill=255)
        del draw
        page_no_imgs.save("tmp.tif")
        # NOTE(review): the exit status is ignored, so a failed run may
        # leave a stale tmp.cune_out to be loaded below.
        call("cuneiform -f hocr -l rus tmp.tif -o tmp.cune_out".split())
        cune_paragraphs, cune_letters, cune_texts = \
            upload_cuneiform("tmp.cune_out")

    # --- Paragraph clean-up.
    line_height_avg = _median_line_height(cune_letters)
    cune_paragraphs = _merge_split_paragraphs(
        cune_paragraphs, cune_texts, line_height_avg)
    cune_paragraphs = [paragraph for paragraph in cune_paragraphs
                       if _looks_like_text(paragraph, cune_texts)]

    # --- Formulas: what remains once accepted text and pictures are erased.
    image_mat_no_text_no_imgs = get_matrix(original_image)
    for paragraph in cune_paragraphs:
        _blank_out_blocks(image_mat_no_text_no_imgs, paragraph)
    _blank_out_blocks(image_mat_no_text_no_imgs, image_blocks)
    formulas = _find_non_empty_blocks(image_mat_no_text_no_imgs)

    half_line = 0.5 * line_height_avg
    formulas = [block for block in formulas
                if max(_block_size(block)) >= half_line]
    formulas = _join_nearby_blocks(formulas, 2 * line_height_avg)
    formulas = [block for block in formulas
                if min(_block_size(block)) > half_line]

    # --- Dump.
    def dump_block(block):
        return ",".join(str(item) for item in
                        [block[0][0], block[0][1], block[1][0], block[1][1]])

    def dump_blocks(blocks):
        return " ".join([dump_block(block) for block in blocks])

    records = []
    for paragraph in cune_paragraphs:
        records.append("paragraph\t" + dump_blocks(paragraph) + "\n")
    records.append("letters\t" + dump_blocks(cune_letters) + "\n")
    # The original repeated the complete list once per element; a single
    # record carries the same information without the duplicates.
    if image_blocks:
        records.append("images\t" + dump_blocks(image_blocks) + "\n")
    if formulas:
        records.append("formulas\t" + dump_blocks(formulas) + "\n")
    for block, text in cune_texts.items():
        records.append(
            "text\t" + dump_block(block) + "\t" + text.encode("utf8") + "\n")
    return "".join(records)