Esempio n. 1
0
def _split_word_at_space(word):
    """Split a word at the first space of its text into two new words.

    The text is taken from the first TextEquiv of ``word``. The word's
    xywh box is divided horizontally in proportion to the character
    position of the space: the left part receives the share before the
    space, the right part everything from that position onwards. The space
    itself is dropped from the text of both parts.

    Language and TextStyle are copied to both parts when present, and each
    part gets a single TextEquiv carrying the confidence of the original.
    Glyphs are not transferred.

    Returns:
        A ``(left, right)`` tuple of new words with ids ``<id>_l`` and
        ``<id>_r``.

    Raises:
        ValueError: if the text contains no space (from ``str.index``).
    """
    left = WordType(id=word.id + '_l')
    right = WordType(id=word.id + '_r')
    halves = (left, right)

    box = xywh_from_points(word.get_Coords().points)
    source_equiv = word.get_TextEquiv()[0]
    text = source_equiv.Unicode
    pos = text.index(" ")
    # share of the box width attributed to the text before the space
    fract = pos / len(text)

    left_box = box.copy()
    left_box['w'] = box['w'] * fract
    left.set_Coords(CoordsType(points=points_from_xywh(left_box)))

    right_box = box.copy()
    right_box['x'] = box['x'] + box['w'] * fract
    right_box['w'] = box['w'] * (1 - fract)
    right.set_Coords(CoordsType(points=points_from_xywh(right_box)))

    language = word.get_language()
    if language:
        for half in halves:
            half.set_language(language)

    style = word.get_TextStyle()
    if style:
        for half in halves:
            half.set_TextStyle(style)

    # Glyphs: irrelevant at this processing level
    for half, chunk in zip(halves, (text[0:pos], text[pos + 1:])):
        half.set_TextEquiv(
            [TextEquivType(Unicode=chunk, conf=source_equiv.conf)])
    return left, right
Esempio n. 2
0
def _split_word_at_glyph(word, glyph):
    """Split a word into two new words around one of its glyphs.

    ``glyph`` acts as the separator: it is assigned to neither part. The
    left part keeps the glyphs before it and spans from the word's left
    edge to the glyph's left edge; the right part keeps the glyphs after it
    and spans from the glyph's right edge to the word's right edge.

    Language and TextStyle are copied to both parts when present. No
    TextEquiv is set here.

    Returns:
        A ``(left, right)`` tuple of new words with ids ``<id>_l`` and
        ``<id>_r``.

    Raises:
        ValueError: if ``glyph`` is not among the word's glyphs
            (presumably a list, as ``.index`` and slicing are used).
    """
    prev_ = WordType(id=word.id + '_l')
    next_ = WordType(id=word.id + '_r')
    xywh_glyph = xywh_from_points(glyph.get_Coords().points)
    xywh_word = xywh_from_points(word.get_Coords().points)
    # NOTE(review): assumes 'x' is the left edge and 'w' the width of the
    # box, as the key names suggest -- confirm against xywh_from_points.
    glyph_right = xywh_glyph['x'] + xywh_glyph['w']
    word_right = xywh_word['x'] + xywh_word['w']

    xywh_prev = xywh_word.copy()
    xywh_prev.update({'w': xywh_glyph['x'] - xywh_word['x']})
    prev_.set_Coords(CoordsType(points=points_from_xywh(xywh_prev)))

    # The right part starts where the separator glyph ends. Previously its
    # x was glyph.x - glyph.w and its width still included the glyph, which
    # shifted the box to the left and made it overlap the left part.
    xywh_next = xywh_word.copy()
    xywh_next.update({
        'x': glyph_right,
        'w': word_right - glyph_right
    })
    next_.set_Coords(CoordsType(points=points_from_xywh(xywh_next)))

    if word.get_language():
        prev_.set_language(word.get_language())
        next_.set_language(word.get_language())
    if word.get_TextStyle():
        prev_.set_TextStyle(word.get_TextStyle())
        next_.set_TextStyle(word.get_TextStyle())

    glyphs = word.get_Glyph()
    pos = glyphs.index(glyph)
    prev_.set_Glyph(glyphs[0:pos])
    next_.set_Glyph(glyphs[pos + 1:])
    # TextEquiv: will be overwritten by page_update_higher_textequiv_levels
    return prev_, next_
Esempio n. 3
0
 def _process_words_in_line(self, result_it, line, line_xywh):
     """Annotate the words of one text line from a recognition iterator.

     Walks ``result_it`` word by word until the last word of the text line.
     For every word a ``WordType`` is built with:

     - an id derived from the line id and a running number,
     - coordinates converted from the word's bounding box via ``line_xywh``
       and reduced by the ``padding`` parameter,
     - a TextStyle built from the iterator's word font attributes, if any,
     - a TextEquiv with the word text and its confidence divided by 100.

     The word is attached to ``line`` only when ``polygon_for_parent``
     yields a polygon; otherwise it is logged and left unattached, but the
     rest of the processing still runs. Unless the ``textequiv_level``
     parameter is ``'word'``, glyphs are processed as well.

     Returns nothing; a warning is logged when there is no text at all.
     """
     LOG = getLogger('processor.TesserocrRecognize')
     if not result_it or result_it.Empty(RIL.WORD):
         LOG.warning("No text in line '%s'", line.id)
         return
     # TextStyleType keyword -> key in the word font attributes
     style_fields = (
         ('fontSize', 'pointsize'),
         ('fontFamily', 'font_name'),
         ('bold', 'bold'),
         ('italic', 'italic'),
         ('underlined', 'underlined'),
         ('monospace', 'monospace'),
         ('serif', 'serif'),
     )
     # iterate until IsAtFinalElement(RIL.LINE, RIL.WORD):
     word_no = 0
     while result_it and not result_it.Empty(RIL.WORD):
         word_id = '%s_word%04d' % (line.id, word_no)
         LOG.debug("Decoding text in word '%s'", word_id)
         bbox = result_it.BoundingBox(RIL.WORD)
         # convert to absolute coordinates:
         polygon = coordinates_for_segment(
             polygon_from_x0y0x1y1(bbox), None, line_xywh)
         polygon = polygon - self.parameter['padding']
         clipped = polygon_for_parent(polygon, line)
         fits_parent = clipped is not None
         if fits_parent:
             polygon = clipped
         points = points_from_polygon(polygon)
         word = WordType(id=word_id, Coords=CoordsType(points))
         if fits_parent:
             line.add_Word(word)
         else:
             # could happen due to rotation
             LOG.info('Ignoring extant word: %s', points)
         # todo: determine if font attributes available for word level will work with LSTM models
         font = result_it.WordFontAttributes()
         if font:
             style_kwargs = {
                 keyword: font[key] if key in font else None
                 for keyword, key in style_fields
             }
             # (or somewhere in custom attribute?)
             word.set_TextStyle(TextStyleType(**style_kwargs))
         # add word annotation unconditionally (i.e. even for glyph level):
         text = result_it.GetUTF8Text(RIL.WORD)
         confidence = result_it.Confidence(RIL.WORD)/100
         word.add_TextEquiv(TextEquivType(Unicode=text, conf=confidence))
         if self.parameter['textequiv_level'] != 'word':
             self._process_glyphs_in_word(result_it, word, line_xywh)
         if result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
             break
         word_no += 1
         result_it.Next(RIL.WORD)
Esempio n. 4
0
 def _merge_word_nodes(self, nodes):
     """Build a single word that stands in for several word nodes.

     The merged word gets:

     - an id made of all node ids joined by commas,
     - the Coords of the first node (the boxes are not merged yet),
     - the most frequent language among the nodes; on a tie the language
       that occurs first in ``nodes`` wins,
     - the first TextStyle found among the nodes, if any.

     Glyph (too fine-grained) and TextEquiv (overwritten from the best
     path anyway) are deliberately not carried over.

     Args:
         nodes: non-empty sequence of word elements to merge.

     Returns:
         The merged word, or ``None`` (after logging an error) when
         ``nodes`` is empty.
     """
     if not nodes:
         # NOTE(review): LOG is not defined in this method; presumably a
         # module-level logger -- confirm.
         LOG.error('nothing to merge')
         return None
     merged = WordType()
     merged.set_id(','.join([n.id for n in nodes]))
     points = ' '.join([n.get_Coords().points for n in nodes])
     if points:
         merged.set_Coords(nodes[0].get_Coords())  # TODO merge
     # Majority vote on the language. Voting over the list rather than a
     # set makes ties deterministic: max() returns the first maximal
     # element, i.e. the language that occurs first in ``nodes``.
     languages = [node.get_language() for node in nodes]
     merged.set_language(max(languages, key=languages.count))
     # TODO other attributes...
     # Take the first style that is actually present. Always using
     # nodes[0] would drop the style whenever only later nodes have one.
     style = next(
         (style for style in (node.get_TextStyle() for node in nodes)
          if style),
         None)
     if style:
         # TODO make a majority vote on each attribute here
         merged.set_TextStyle(style)
     return merged
Esempio n. 5
0
def _merge_words(prev_, next_):
    """Merge two words into a new word without modifying either input.

    The merged word gets:

    - the id ``<prev id>.<next id>``,
    - coordinates computed from the concatenated points of both words,
    - language and TextStyle of ``prev_`` when present,
    - the glyphs of both words, in order,
    - the TextEquiv list of ``prev_`` (or a single empty TextEquiv with
      confidence 1.0 when ``prev_`` has none). The text of the first
      TextEquiv of ``next_`` is appended to its first entry, with no
      separator, and the confidences are multiplied when both are set.

    Returns:
        The newly created merged word.
    """
    from copy import copy  # local import: only needed in this function

    merged = WordType(id=prev_.id + '.' + next_.id)
    merged.set_Coords(
        CoordsType(points=points_from_xywh(
            xywh_from_points(prev_.get_Coords().points + ' ' +
                             next_.get_Coords().points))))
    if prev_.get_language():
        merged.set_language(prev_.get_language())
    if prev_.get_TextStyle():
        merged.set_TextStyle(prev_.get_TextStyle())
    if prev_.get_Glyph() or next_.get_Glyph():
        merged.set_Glyph(prev_.get_Glyph() + next_.get_Glyph())

    prev_equivs = prev_.get_TextEquiv()
    if prev_equivs:
        # Work on a new list and a copy of the first entry: the in-place
        # updates below must not leak back into prev_'s own annotation.
        merged_equivs = list(prev_equivs)
        merged_equivs[0] = copy(prev_equivs[0])
    else:
        merged_equivs = [TextEquivType(Unicode='', conf=1.0)]
    merged.set_TextEquiv(merged_equivs)

    next_equivs = next_.get_TextEquiv()
    if next_equivs:
        textequiv = merged_equivs[0]
        textequiv2 = next_equivs[0]
        textequiv.Unicode += textequiv2.Unicode
        # Compare against None rather than truthiness: a confidence of 0.0
        # is a valid value and must pull the product down to 0.0.
        if textequiv.conf is not None and textequiv2.conf is not None:
            textequiv.conf *= textequiv2.conf
    return merged