Example #1
0
def join_parag_text(paragraph, block2text):
    """Build the text of a paragraph from the texts of its lines.

    Each item of `paragraph` is looked up in `block2text` and the result is
    passed through `tokenize`.  The last word of a line is glued to the first
    word of the following line when that first word starts with one of the
    listed lowercase letters and either the line ended with a dash token
    (the dash is dropped) or the glued word is found in `freq_dict`.

    The resulting words are then concatenated: a word is preceded by one
    space unless it starts with closing punctuation or the previous word
    ended with an opening bracket/quote.  Note that this means the result
    normally starts with a space.
    """
    token_lines = [tokenize(block2text[block]) for block in paragraph]

    for idx in range(len(token_lines) - 1):
        upper = token_lines[idx]
        lower = token_lines[idx + 1]

        # An empty line or an empty token both fall back to a single space.
        tail = (upper[-1] if upper else None) or " "
        dashed = tail in u"‒–—―---" and len(upper) > 1
        if dashed:
            # The word that is actually split sits right before the dash.
            tail = upper[-2]
        head = (lower[0] if lower else None) or " "
        glued = tail + head

        if head[0] not in u"aбвгдежзиклмнопрстуфхцчшщэюя":
            continue
        if dashed:
            upper[-2] = glued
            token_lines[idx] = upper[:-1]
        elif glued.lower() in freq_dict:
            upper[-1] = glued
        else:
            continue
        # The first word of the next line has been consumed by the glue.
        token_lines[idx + 1] = lower[1:]

    pieces = []
    glue_next = False
    for line_tokens in token_lines:
        for token in line_tokens:
            if token[0] in u".,:;?!)}]\"»" or glue_next:
                pieces.append(token)
            else:
                pieces.append(" " + token)
            glue_next = token[-1] in u"«({[\""
    return "".join(pieces)
Example #2
0
def _median_letter_height(letters):
    """Return the median extent of the letter boxes along axis 0.

    Falls back to 1 when there are no letters or when the median is 0.
    """
    heights = sorted(box[0][1] - box[0][0] for box in letters)
    if not heights:
        return 1
    return heights[len(heights) // 2] or 1


def _block_extents(block):
    """Return the extents of a ((min0, max0), (min1, max1)) block as a pair."""
    return block[0][1] - block[0][0], block[1][1] - block[1][0]


def _erase_blocks(matrix, blocks):
    """Set the area covered by every block to 0 in `matrix` (in place)."""
    for block in blocks:
        x_min, x_max = block[0]
        y_min, y_max = block[1]
        matrix[x_min:x_max, y_min:y_max] = 0


def _detect_blocks(matrix):
    """Run `detect_non_empty_blocks` over the whole matrix and return its output list."""
    found = []
    whole_page = ((0, matrix.shape[0]), (0, matrix.shape[1]))
    detect_non_empty_blocks(matrix, whole_page, found)
    return found


def _join_close_blocks(blocks, max_gap):
    """Repeatedly merge blocks whose `max_distance` is below `max_gap`.

    Returns a new list; passes are repeated until one of them merges nothing.
    """
    blocks = list(blocks)
    while True:
        absorbed = set()
        for first in range(len(blocks)):
            if first in absorbed:
                continue
            for second in range(first + 1, len(blocks)):
                if max_distance(blocks[first], blocks[second]) < max_gap:
                    blocks[first] = merge_blocks(blocks[first], blocks[second])
                    absorbed.add(second)
        if not absorbed:
            return blocks
        blocks = [block for index, block in enumerate(blocks)
                  if index not in absorbed]


def _ocr_without_images(original_image, image_blocks):
    """Paint the image blocks over with 255 and run cuneiform on the result.

    Writes "tmp.tif" and "tmp.cune_out" into the current directory and
    returns whatever `upload_cuneiform` parses from the latter.
    """
    page_no_imgs = original_image.copy()
    draw = ImageDraw.Draw(page_no_imgs)
    for block in image_blocks:
        # The drawing call takes axis 1 first, then axis 0.
        draw.rectangle((block[1][0], block[0][0], block[1][1], block[0][1]),
                       fill=255)
    del draw
    page_no_imgs.save("tmp.tif")
    call(["cuneiform", "-f", "hocr", "-l", "rus", "tmp.tif",
          "-o", "tmp.cune_out"])
    return upload_cuneiform("tmp.cune_out")


def _paragraph_bounds(paragraph):
    """Return the bounding ((min0, max0), (min1, max1)) of a paragraph's lines."""
    min_x, max_x = 100000, -10000
    min_y, max_y = 100000, -10000
    for (block_x_min, block_x_max), (block_y_min, block_y_max) in paragraph:
        min_x = min(min_x, block_x_min)
        max_x = max(max_x, block_x_max)
        min_y = min(min_y, block_y_min)
        max_y = max(max_y, block_y_max)
    return (min_x, max_x), (min_y, max_y)


def _merge_split_paragraphs(paragraphs, texts, line_height):
    """Merge paragraphs that look like the two halves of a vertically split one.

    Two paragraphs are merged when they have the same number of lines, start
    at (almost) the same axis-0 position and touch each other along axis 1,
    all within `line_height`.  Lines are merged pairwise with `merge_blocks`
    and the text of every merged line is stored in `texts` (modified in
    place).  Returns the new list of paragraphs.
    """
    paragraphs = list(paragraphs)
    while True:
        # Recomputed on every pass: the list is compacted and paragraphs are
        # replaced by merged ones, so bounds from a previous pass would be
        # attached to the wrong indices.
        bounds = [_paragraph_bounds(paragraph) for paragraph in paragraphs]
        absorbed = set()
        for first in range(len(paragraphs)):
            if first in absorbed:
                # Already merged into an earlier paragraph and about to be
                # dropped; merging anything into it would lose that content.
                continue
            first_left, first_right = bounds[first][1]
            for second in range(first + 1, len(paragraphs)):
                if second in absorbed:
                    continue
                second_left, second_right = bounds[second][1]
                same_length = len(paragraphs[first]) == len(paragraphs[second])
                same_start = abs(bounds[first][0][0]
                                 - bounds[second][0][0]) < line_height
                adjacent = (abs(second_left - first_right) < line_height or
                            abs(second_right - first_left) < line_height)
                if not (same_length and same_start and adjacent):
                    continue
                absorbed.add(second)
                merged_paragraph = []
                for first_line, second_line in zip(paragraphs[first],
                                                   paragraphs[second]):
                    merged_line = merge_blocks(first_line, second_line)
                    # Keep reading order: the half starting further left
                    # along axis 1 contributes its text first.
                    if second_line[1][0] < first_line[1][0]:
                        left_line, right_line = second_line, first_line
                    else:
                        left_line, right_line = first_line, second_line
                    texts[merged_line] = (texts[left_line] + " "
                                          + texts[right_line])
                    merged_paragraph.append(merged_line)
                paragraphs[first] = merged_paragraph
        if not absorbed:
            return paragraphs
        paragraphs = [paragraph for index, paragraph in enumerate(paragraphs)
                      if index not in absorbed]


def _join_wrapped_words(token_lines):
    """Glue words split across consecutive lines (modifies `token_lines`).

    The last word of a line is joined with the first word of the next one
    when that first word starts with one of the listed lowercase letters and
    either the line ended with a dash token or the joined word is in
    `freq_dict`.
    """
    for index in range(len(token_lines) - 1):
        current = token_lines[index]
        following = token_lines[index + 1]
        # An empty line or an empty token both fall back to a single space.
        last_word = (current[-1] if current else None) or " "
        ends_with_dash = last_word in u"‒–—―-" and len(current) > 1
        if ends_with_dash:
            last_word = current[-2]
        next_word = (following[0] if following else None) or " "
        compound = last_word + next_word
        if next_word[0] not in u"aбвгдежзиклмнопрстуфхцчшщэюя":
            continue
        if ends_with_dash:
            current[-2] = compound
            token_lines[index] = current[:-1]
        elif compound.lower() in freq_dict:
            current[-1] = compound
        else:
            continue
        token_lines[index + 1] = following[1:]


def _looks_like_text(paragraph, texts):
    """Return True when the paragraph contains at least two "good" words.

    A word is good when its corrected form is alphabetic, longer than two
    characters and present in `grammar_corrector.dict` with a value above 10.
    """
    token_lines = [tokenize(texts[line]) for line in paragraph]
    _join_wrapped_words(token_lines)
    good_words = 0
    for words in token_lines:
        for word in words:
            corrected_word = grammar_corrector.correct_word(word.lower())
            if (is_alpha(corrected_word) and len(corrected_word) > 2 and
                    corrected_word in grammar_corrector.dict and
                    grammar_corrector.dict[corrected_word] > 10):
                good_words += 1
    # The original test was
    #   len(words) == 1 and good_words != 1 or good_words < 2
    # whose first clause is implied by the second (good_words <= len(words)),
    # so it reduces to a plain threshold.
    # NOTE(review): single-word paragraphs are therefore always rejected;
    # confirm this is intended.
    return good_words >= 2


def _dump_block(block):
    """Serialize a block as "min0,max0,min1,max1"."""
    return ",".join(str(item) for item in
                    [block[0][0], block[0][1], block[1][0], block[1][1]])


def _dump_blocks(blocks):
    """Serialize several blocks separated by single spaces."""
    return " ".join([_dump_block(block) for block in blocks])


def do_main_analysis(original_image):
    """Analyse a page image and return a textual dump of its layout.

    Steps:
      1. OCR the page with `do_ocr`.
      2. Erase all recognized letters and detect the remaining non-empty
         blocks; keep the big ones as image blocks.
      3. If image blocks were found, paint them over and OCR the page again
         with cuneiform (writes "tmp.tif" / "tmp.cune_out").
      4. Merge paragraphs that were split into side-by-side halves.
      5. Keep only the paragraphs that look like text.
      6. Erase the kept paragraphs and the image blocks and treat what is
         still non-empty as formula blocks.

    Returns a string of tab-separated records, one per line:
    "paragraph" (one per paragraph), "letters", "images" and "formulas"
    (one record each, the latter two only when non-empty) and "text"
    (one per entry of the OCR text mapping, UTF-8 encoded).
    """
    cune_paragraphs, cune_letters, cune_texts = do_ocr(original_image)

    # --- image blocks: what is left on the page once letters are erased ---
    image_mat_no_text = get_matrix(original_image)
    _erase_blocks(image_mat_no_text, cune_letters)
    image_blocks = _detect_blocks(image_mat_no_text)

    line_height_avg = _median_letter_height(cune_letters)

    # Drop specks smaller than a line in both directions before joining.
    image_blocks = [block for block in image_blocks
                    if (_block_extents(block)[0] >= line_height_avg or
                        _block_extents(block)[1] >= line_height_avg)]
    image_blocks = _join_close_blocks(image_blocks, 2 * line_height_avg)
    # Only blocks clearly bigger than text in both directions are images.
    image_blocks = [block for block in image_blocks
                    if (_block_extents(block)[0] > 4 * line_height_avg and
                        _block_extents(block)[1] > 4 * line_height_avg)]

    # --- OCR again without the images ---
    if image_blocks:
        cune_paragraphs, cune_letters, cune_texts = _ocr_without_images(
            original_image, image_blocks)

    # The letters may have changed after the second OCR run.
    line_height_avg = _median_letter_height(cune_letters)

    cune_paragraphs = _merge_split_paragraphs(
        cune_paragraphs, cune_texts, line_height_avg)

    # Paragraphs that do not look like text are dropped here; since they are
    # not erased below, their area can be picked up as formula blocks.
    cune_paragraphs = [paragraph for paragraph in cune_paragraphs
                       if _looks_like_text(paragraph, cune_texts)]

    # --- formulas: what is left once text and images are erased ---
    image_mat_no_text_no_imgs = get_matrix(original_image)
    for paragraph in cune_paragraphs:
        _erase_blocks(image_mat_no_text_no_imgs, paragraph)
    _erase_blocks(image_mat_no_text_no_imgs, image_blocks)
    formulas = _detect_blocks(image_mat_no_text_no_imgs)

    half_line = 0.5 * line_height_avg
    formulas = [block for block in formulas
                if (_block_extents(block)[0] >= half_line or
                    _block_extents(block)[1] >= half_line)]
    formulas = _join_close_blocks(formulas, 2 * line_height_avg)
    formulas = [block for block in formulas
                if (_block_extents(block)[0] > half_line and
                    _block_extents(block)[1] > half_line)]

    # --- dump ---
    records = []
    for paragraph in cune_paragraphs:
        records.append("paragraph\t" + _dump_blocks(paragraph))
    records.append("letters\t" + _dump_blocks(cune_letters))
    # One record listing all blocks; the original repeated this identical
    # record once per block.
    if image_blocks:
        records.append("images\t" + _dump_blocks(image_blocks))
    if formulas:
        records.append("formulas\t" + _dump_blocks(formulas))
    for block, text in cune_texts.items():
        records.append("text\t" + _dump_block(block) + "\t"
                       + text.encode("utf8"))
    return "".join(record + "\n" for record in records)