Esempio n. 1
0
def _split_word_at_space(word):
    prev_ = WordType(id=word.id + '_l')
    next_ = WordType(id=word.id + '_r')
    xywh = xywh_from_points(word.get_Coords().points)
    textequiv = word.get_TextEquiv()[0]
    pos = textequiv.Unicode.index(" ")
    fract = pos / len(textequiv.Unicode)
    xywh_prev = xywh.copy()
    xywh_prev.update({'w': xywh['w'] * fract})
    prev_.set_Coords(CoordsType(points=points_from_xywh(xywh_prev)))
    xywh_next = xywh.copy()
    xywh_next.update({
        'x': xywh['x'] + xywh['w'] * fract,
        'w': xywh['w'] * (1 - fract)
    })
    next_.set_Coords(CoordsType(points=points_from_xywh(xywh_next)))
    if word.get_language():
        prev_.set_language(word.get_language())
        next_.set_language(word.get_language())
    if word.get_TextStyle():
        prev_.set_TextStyle(word.get_TextStyle())
        next_.set_TextStyle(word.get_TextStyle())
    # Glyphs: irrelevant at this processing level
    textequiv_prev = TextEquivType(Unicode=textequiv.Unicode[0:pos],
                                   conf=textequiv.conf)
    textequiv_next = TextEquivType(Unicode=textequiv.Unicode[pos + 1:],
                                   conf=textequiv.conf)
    prev_.set_TextEquiv([textequiv_prev])
    next_.set_TextEquiv([textequiv_next])
    return prev_, next_
Esempio n. 2
0
def _split_word_at_glyph(word, glyph):
    prev_ = WordType(id=word.id + '_l')
    next_ = WordType(id=word.id + '_r')
    xywh_glyph = xywh_from_points(glyph.get_Coords().points)
    xywh_word = xywh_from_points(word.get_Coords().points)
    xywh_prev = xywh_word.copy()
    xywh_prev.update({'w': xywh_glyph['x'] - xywh_word['x']})
    prev_.set_Coords(CoordsType(points=points_from_xywh(xywh_prev)))
    xywh_next = xywh_word.copy()
    xywh_next.update({
        'x': xywh_glyph['x'] - xywh_glyph['w'],
        'w': xywh_word['w'] - xywh_prev['w']
    })
    next_.set_Coords(CoordsType(points=points_from_xywh(xywh_next)))
    if word.get_language():
        prev_.set_language(word.get_language())
        next_.set_language(word.get_language())
    if word.get_TextStyle():
        prev_.set_TextStyle(word.get_TextStyle())
        next_.set_TextStyle(word.get_TextStyle())
    glyphs = word.get_Glyph()
    pos = glyphs.index(glyph)
    prev_.set_Glyph(glyphs[0:pos])
    next_.set_Glyph(glyphs[pos + 1:])
    # TextEquiv: will be overwritten by page_update_higher_textequiv_levels
    return prev_, next_
Esempio n. 3
0
 def process(self):
     """
     Performs the line segmentation.
     """
     with PyTessBaseAPI(
         psm=PSM.SINGLE_LINE,
         path=TESSDATA_PREFIX,
     ) as tessapi:
         for (n, input_file) in enumerate(self.input_files):
             pcgts = page_from_file(self.workspace.download_file(input_file))
             image_url = pcgts.get_Page().imageFilename
             for region in pcgts.get_Page().get_TextRegion():
                 for line in region.get_TextLine():
                     log.debug("Detecting words in line '%s'", line.id)
                     image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points))
                     tessapi.SetImage(image)
                     offset = xywh_from_points(line.get_Coords().points)
                     for (word_no, component) in enumerate(tessapi.GetComponentImages(RIL.WORD, True)):
                         word_id = '%s_word%04d' % (line.id, word_no)
                         word_xywh = component[1]
                         word_xywh['x'] += offset['x']
                         word_xywh['y'] += offset['y']
                         line.add_Word(WordType(id=word_id, Coords=CoordsType(points_from_xywh(word_xywh))))
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 local_filename='%s/%s' % (self.output_file_grp, ID),
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts).encode('utf-8'),
             )
Esempio n. 4
0
 def process(self):
     """
     Performs the line segmentation.
     """
     with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
         for (n, input_file) in enumerate(self.input_files):
             pcgts = page_from_file(
                 self.workspace.download_file(input_file))
             image_url = pcgts.get_Page().imageFilename
             for region in pcgts.get_Page().get_TextRegion():
                 log.debug("Detecting lines in %s with tesseract",
                           region.id)
                 image = self.workspace.resolve_image_as_pil(
                     image_url,
                     polygon_from_points(region.get_Coords().points))
                 tessapi.SetImage(image)
                 offset = xywh_from_points(region.get_Coords().points)
                 for (line_no, component) in enumerate(
                         tessapi.GetComponentImages(RIL.TEXTLINE, True)):
                     line_id = '%s_line%04d' % (region.id, line_no)
                     line_xywh = component[1]
                     line_xywh['x'] += offset['x']
                     line_xywh['y'] += offset['y']
                     line_points = points_from_xywh(line_xywh)
                     region.add_TextLine(
                         TextLineType(id=line_id,
                                      Coords=CoordsType(line_points)))
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 mimetype=MIMETYPE_PAGE,
                 local_filename='%s/%s' % (self.output_file_grp, ID),
                 content=to_xml(pcgts).encode('utf-8'),
             )
Esempio n. 5
0
 def test_points_from_xywh(self):
     self.assertEqual(
         points_from_xywh({
             'x': 100,
             'y': 100,
             'w': 100,
             'h': 100
         }), '100,100 200,100 200,200 100,200')
Esempio n. 6
0
    def process(self):
        """
        Performs the cropping.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            #  print(self.input_file_grp)
            for (n, input_file) in enumerate(self.input_files):
                #  print(input_file)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
                log.debug("Cropping with tesseract")
                tessapi.SetImage(image)
                
                #
                # helper variables for saving the box coordinates
                #
                min_x = image.width
                min_y = image.height
                max_x = 0
                max_y = 0

                # iterate over all boxes and compare their extent
                # to the min and max values
                for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                    points, index = points_from_xywh(component[1]), component[2]

                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    log.debug("Detected region '%s': %s", ID, points)

                    for pair in points.split(' '):
                        x, y = (int(pair.split(',')[0]), int(pair.split(',')[1]))
                        if x < min_x:
                            min_x = x
                        if y < min_y:
                            min_y = y
                        elif x > max_x:
                            max_x = x
                        elif y > max_y:
                            max_y = y
                    log.debug("Updated page border: %i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y))

                #
                # set the identified page border
                #
                brd = BorderType(Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)))
                pcgts.get_Page().set_Border(brd)

                ID = concat_padded(self.output_file_grp, n)
                self.workspace.add_file(
                    ID=ID,
                    file_grp=self.output_file_grp,
                    mimetype=MIMETYPE_PAGE,
                    local_filename='%s/%s' % (self.output_file_grp, ID),
                    content=to_xml(pcgts).encode('utf-8'),
                )
Esempio n. 7
0
    def process(self):
        """
        Performs the region segmentation.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            #  print(self.input_file_grp)
            for (n, input_file) in enumerate(self.input_files):
                #  print(input_file)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
                log.debug("Detecting regions with tesseract")
                tessapi.SetImage(image)
                for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                    points, index = points_from_xywh(component[1]), component[2]

                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    log.debug("Detected region '%s': %s", ID, points)
                    # <pg:ReadingOrder>
                    ro = pcgts.get_Page().get_ReadingOrder()
                    if ro is None:
                        ro = ReadingOrderType()
                        pcgts.get_Page().set_ReadingOrder(ro)
                    # <pg:OrderedGroup>
                    og = ro.get_OrderedGroup()
                    if og is None:
                        og = OrderedGroupType(id="reading-order")
                        ro.set_OrderedGroup(og)
                    # <pg:RegionRefIndexed>
                    og.add_RegionRefIndexed(RegionRefIndexedType(regionRef=ID, index=index))

                    #
                    #  text region
                    #
                    pcgts.get_Page().add_TextRegion(TextRegionType(id=ID, Coords=CoordsType(points=points)))

                ID = concat_padded(self.output_file_grp, n)
                self.workspace.add_file(
                    ID=ID,
                    file_grp=self.output_file_grp,
                    mimetype=MIMETYPE_PAGE,
                    local_filename='%s/%s' % (self.output_file_grp, ID),
                    content=to_xml(pcgts).encode('utf-8'),
                )
Esempio n. 8
0
def _merge_words(prev_, next_):
    merged = WordType(id=prev_.id + '.' + next_.id)
    merged.set_Coords(
        CoordsType(points=points_from_xywh(
            xywh_from_points(prev_.get_Coords().points + ' ' +
                             next_.get_Coords().points))))
    if prev_.get_language():
        merged.set_language(prev_.get_language())
    if prev_.get_TextStyle():
        merged.set_TextStyle(prev_.get_TextStyle())
    if prev_.get_Glyph() or next_.get_Glyph():
        merged.set_Glyph(prev_.get_Glyph() + next_.get_Glyph())
    if prev_.get_TextEquiv():
        merged.set_TextEquiv(prev_.get_TextEquiv())
    else:
        merged.set_TextEquiv([TextEquivType(Unicode='', conf=1.0)])
    if next_.get_TextEquiv():
        textequiv = merged.get_TextEquiv()[0]
        textequiv2 = next_.get_TextEquiv()[0]
        textequiv.Unicode += textequiv2.Unicode
        if textequiv.conf and textequiv2.conf:
            textequiv.conf *= textequiv2.conf
    return merged