def test_alternative_image_additions():
    """Attach an AlternativeImage at every level of a PAGE hierarchy.

    Builds PcGts > Page > TextRegion > TextLine > Word > Glyph one level at a
    time and adds an ``AlternativeImageType`` to each level right after it has
    been attached to its parent. The only explicit assertion is on the
    ``pcGtsId``; beyond that the test passes when none of the calls raise.
    """
    document = PcGtsType(pcGtsId="foo")
    assert document.pcGtsId == 'foo'

    # Page level: attach the page to the document, then give it an image.
    root_page = PageType()
    document.set_Page(root_page)
    root_page.add_AlternativeImage(AlternativeImageType())

    # Remaining levels, outermost first. Each entry names the segment type to
    # create and the parent's method that attaches it.
    hierarchy = (
        (TextRegionType, 'add_TextRegion'),
        (TextLineType, 'add_TextLine'),
        (WordType, 'add_Word'),
        (GlyphType, 'add_Glyph'),
    )
    parent = root_page
    for segment_type, attach_method in hierarchy:
        segment = segment_type()
        getattr(parent, attach_method)(segment)
        segment.add_AlternativeImage(AlternativeImageType())
        parent = segment
Beispiel #2
0
 def test_alternativeImage(self):
     """Add an AlternativeImage to each level from Page down to Glyph.

     Every segment is created, attached to its parent and only then given
     an ``AlternativeImageType``. Apart from the ``pcGtsId`` check, the test
     succeeds when none of the ``add_*`` calls raise.
     """
     root = PcGtsType(pcGtsId="foo")
     self.assertEqual(root.pcGtsId, 'foo')

     # Page level
     page_node = PageType()
     root.set_Page(page_node)
     page_node.add_AlternativeImage(AlternativeImageType())

     # TextRegion level
     region_node = TextRegionType()
     page_node.add_TextRegion(region_node)
     region_node.add_AlternativeImage(AlternativeImageType())

     # TextLine level
     line_node = TextLineType()
     region_node.add_TextLine(line_node)
     line_node.add_AlternativeImage(AlternativeImageType())

     # Word level
     word_node = WordType()
     line_node.add_Word(word_node)
     word_node.add_AlternativeImage(AlternativeImageType())

     # Glyph level
     glyph_node = GlyphType()
     word_node.add_Glyph(glyph_node)
     glyph_node.add_AlternativeImage(AlternativeImageType())
Beispiel #3
0
    def process(self):
        """
        Recognize the text of every text line with Calamari, one line at a time.

        For each input file the PAGE document is loaded and every ``TextLine``
        of every region returned by ``Page.get_TextRegion()`` is cropped,
        predicted and voted on. The line's previous ``TextEquiv`` and ``Word``
        children are discarded (with a warning if there were any) and replaced
        by the new result. When the ``textequiv_level`` parameter is ``word``
        or ``glyph``, ``Word`` elements (and for ``glyph`` also ``Glyph``
        elements with all character alternatives at or above
        ``glyph_conf_cutoff``) are derived from the character positions.
        Afterwards a ``processingStep`` metadata item listing all parameters is
        added and the document is stored in the output file group.
        """

        # Exactly one input and one output file group are expected.
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        # presumably sets up self.predictor and self.voter used below -- TODO confirm
        self._init_calamari()

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id)

            for region in pcgts.get_Page().get_TextRegion():
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)

                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'",
                         len(textlines), region.id)
                for (line_no, line) in enumerate(textlines):
                    log.debug("Recognizing line '%s' in region '%s'", line.id,
                              region.id)

                    line_image, line_coords = self.workspace.image_from_segment(
                        line, region_image, region_xywh)
                    line_image_np = np.array(line_image, dtype=np.uint8)

                    # The predictor is given a one-element batch; [0] selects
                    # the result belonging to that single line image.
                    raw_results = list(
                        self.predictor.predict_raw([line_image_np],
                                                   progress_bar=False))[0]
                    # Tag every entry of the raw result (presumably one per
                    # model fold) before voting combines them.
                    for i, p in enumerate(raw_results):
                        p.prediction.id = "fold_{}".format(i)

                    prediction = self.voter.vote_prediction_result(raw_results)
                    prediction.id = "voted"

                    # Build line text on our own
                    #
                    # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                    # on prediction.positions. Do it on our own to have consistency.
                    #
                    # XXX Check Calamari's built-in post-processing on prediction.sentence

                    def _sort_chars(p):
                        """Return the usable character alternatives of position p, best first.

                        Alternatives with an empty ``char`` or a probability
                        below ``glyph_conf_cutoff`` are removed; the rest is
                        sorted by descending probability.
                        """
                        chars = p.chars
                        chars = [
                            c for c in chars if c.char
                        ]  # XXX Note that omission probabilities are not normalized?!
                        chars = [
                            c for c in chars if c.probability >=
                            self.parameter['glyph_conf_cutoff']
                        ]
                        chars = sorted(chars,
                                       key=lambda k: k.probability,
                                       reverse=True)
                        return chars

                    # NOTE(review): the helpers below index _sort_chars(p)[0];
                    # that raises IndexError if no alternative of a position
                    # survives the filtering above.
                    def _drop_leading_spaces(positions):
                        """Drop positions from the front while their best character is a space."""
                        return list(
                            itertools.dropwhile(
                                lambda p: _sort_chars(p)[0].char == " ",
                                positions))

                    def _drop_trailing_spaces(positions):
                        """Drop trailing space positions by trimming the reversed sequence."""
                        return list(
                            reversed(_drop_leading_spaces(
                                reversed(positions))))

                    def _drop_double_spaces(positions):
                        """Keep only the first position of every run of consecutive spaces."""
                        def _drop_double_spaces_generator(positions):
                            last_was_space = False
                            for p in positions:
                                # NOTE(review): tests the unfiltered first
                                # alternative (p.chars[0]), unlike the other
                                # helpers which use _sort_chars(p)[0].
                                if p.chars[0].char == " ":
                                    if not last_was_space:
                                        yield p
                                    last_was_space = True
                                else:
                                    yield p
                                    last_was_space = False

                        return list(_drop_double_spaces_generator(positions))

                    positions = prediction.positions
                    positions = _drop_leading_spaces(positions)
                    positions = _drop_trailing_spaces(positions)
                    positions = _drop_double_spaces(positions)
                    positions = list(positions)

                    # One character per remaining position: its best alternative.
                    line_text = ''.join(
                        _sort_chars(p)[0].char for p in positions)
                    if line_text != prediction.sentence:
                        log.warning(
                            "Our own line text is not the same as Calamari's: '%s' != '%s'",
                            line_text, prediction.sentence)

                    # Delete existing results
                    if line.get_TextEquiv():
                        log.warning("Line '%s' already contained text results",
                                    line.id)
                    line.set_TextEquiv([])
                    if line.get_Word():
                        log.warning(
                            "Line '%s' already contained word segmentation",
                            line.id)
                    line.set_Word([])

                    # Save line results
                    line_conf = prediction.avg_char_probability
                    line.set_TextEquiv(
                        [TextEquivType(Unicode=line_text, conf=line_conf)])

                    # Save word results
                    #
                    # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
                    # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
                    # hierarchy of lines > words > glyphs.

                    def _words(s):
                        """Split s into alternating runs of spaces and non-spaces.

                        Runs of spaces are yielded as 'words' too, so the
                        lengths of the yielded strings add up to ``len(s)``.
                        The final run is yielded unconditionally, i.e. an empty
                        ``s`` yields a single empty string.
                        """
                        spaces = None
                        word = ''
                        for c in s:
                            if c == ' ' and spaces is True:
                                word += c
                            elif c != ' ' and spaces is False:
                                word += c
                            else:
                                if word:
                                    yield word
                                word = c
                                spaces = (c == ' ')
                        yield word

                    if self.parameter['textequiv_level'] in ['word', 'glyph']:
                        word_no = 0
                        # Index into positions of the first character of the
                        # current run; advanced for space runs as well.
                        i = 0

                        for word_text in _words(line_text):
                            word_length = len(word_text)
                            if not all(c == ' ' for c in word_text):
                                word_positions = positions[i:i + word_length]
                                word_start = word_positions[0].global_start
                                word_end = word_positions[-1].global_end

                                # Box from the first to the last character of
                                # the word over the full line image height;
                                # presumably converted to page coordinates by
                                # coordinates_for_segment -- TODO confirm
                                polygon = polygon_from_x0y0x1y1([
                                    word_start, 0, word_end, line_image.height
                                ])
                                points = points_from_polygon(
                                    coordinates_for_segment(
                                        polygon, None, line_coords))
                                # XXX Crop to line polygon?

                                word = WordType(id='%s_word%04d' %
                                                (line.id, word_no),
                                                Coords=CoordsType(points))
                                word.add_TextEquiv(
                                    TextEquivType(Unicode=word_text))

                                if self.parameter[
                                        'textequiv_level'] == 'glyph':
                                    for glyph_no, p in enumerate(
                                            word_positions):
                                        glyph_start = p.global_start
                                        glyph_end = p.global_end

                                        polygon = polygon_from_x0y0x1y1([
                                            glyph_start, 0, glyph_end,
                                            line_image.height
                                        ])
                                        points = points_from_polygon(
                                            coordinates_for_segment(
                                                polygon, None, line_coords))

                                        glyph = GlyphType(
                                            id='%s_glyph%04d' %
                                            (word.id, glyph_no),
                                            Coords=CoordsType(points))

                                        # Add predictions (= TextEquivs)
                                        char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                        for char_index, char in enumerate(
                                                _sort_chars(p),
                                                start=char_index_start):
                                            glyph.add_TextEquiv(
                                                TextEquivType(
                                                    Unicode=char.char,
                                                    index=char_index,
                                                    conf=char.probability))

                                        word.add_Glyph(glyph)

                                line.add_Word(word)
                                # Only real (non-space) words are numbered.
                                word_no += 1

                            i += word_length

            # presumably propagates the new line texts up to region/page level -- TODO confirm
            _page_update_higher_textequiv_levels('line', pcgts)

            # Add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))

            # Store the annotated document as <output_file_grp>/<file_id>.xml
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
Beispiel #4
0
    def process(self):
        """
        Perform text recognition with Calamari on the workspace.

        If ``textequiv_level`` is ``word`` or ``glyph``, then additionally create word / glyph level segments by
        splitting at white space characters / glyph boundaries. In the case of ``glyph``, add all alternative character
        hypotheses down to ``glyph_conf_cutoff`` confidence threshold.

        All lines of a region are cropped first and predicted as one batch; the results are then written back line by
        line. Existing ``TextEquiv`` and ``Word`` children of a line are discarded (with a warning if there were any).
        After each page, the document gets metadata about this step and is stored in the output file group.
        """
        log = getLogger('processor.CalamariRecognize')

        # Exactly one input and one output file group are expected.
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector=self.features)

            for region in page.get_AllRegions(classes=['Text']):
                region_image, region_coords = self.workspace.image_from_segment(
                    region,
                    page_image,
                    page_coords,
                    feature_selector=self.features)

                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'",
                         len(textlines), region.id)

                # Per-line data collected for the batch prediction. The three
                # lists stay index-aligned with ``textlines``.
                line_images_np = []
                line_coordss = []
                line_heights = []
                for line in textlines:
                    log.debug("Recognizing line '%s' in region '%s'", line.id,
                              region.id)

                    line_image, line_coords = self.workspace.image_from_segment(
                        line,
                        region_image,
                        region_coords,
                        feature_selector=self.features)
                    if ('binarized' not in line_coords['features']
                            and 'grayscale_normalized'
                            not in line_coords['features']
                            and self.network_input_channels == 1):
                        # We cannot use a feature selector for this since we don't
                        # know whether the model expects (has been trained on)
                        # binarized or grayscale images; but raw images are likely
                        # always inadequate:
                        log.warning(
                            "Using raw image for line '%s' in region '%s'",
                            line.id, region.id)

                    # Remember this line's own image height now: the word and
                    # glyph boxes are built in the second loop below, where
                    # ``line_image`` would otherwise still refer to the last
                    # line of the region (or to the list placeholder).
                    line_heights.append(line_image.height)

                    # An image with a zero dimension is replaced by a 1x1
                    # placeholder so the batch still has one entry per line.
                    line_image = line_image if all(line_image.size) else [[0]]
                    line_image_np = np.array(line_image, dtype=np.uint8)
                    line_images_np.append(line_image_np)
                    line_coordss.append(line_coords)
                raw_results_all = self.predictor.predict_raw(
                    line_images_np, progress_bar=False)

                for line, line_coords, line_height, raw_results in zip(
                        textlines, line_coordss, line_heights,
                        raw_results_all):

                    # Tag every entry of the raw result (presumably one per
                    # model fold) before voting combines them.
                    for i, p in enumerate(raw_results):
                        p.prediction.id = "fold_{}".format(i)

                    prediction = self.voter.vote_prediction_result(raw_results)
                    prediction.id = "voted"

                    # Build line text on our own
                    #
                    # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                    # on prediction.positions. Do it on our own to have consistency.
                    #
                    # XXX Check Calamari's built-in post-processing on prediction.sentence

                    def _sort_chars(p):
                        """Filter and sort chars of prediction p"""
                        chars = p.chars
                        chars = [
                            c for c in chars if c.char
                        ]  # XXX Note that omission probabilities are not normalized?!
                        chars = [
                            c for c in chars if c.probability >=
                            self.parameter['glyph_conf_cutoff']
                        ]
                        chars = sorted(chars,
                                       key=lambda k: k.probability,
                                       reverse=True)
                        return chars

                    def _drop_leading_spaces(positions):
                        """Drop positions from the front while their best char is a space"""
                        return list(
                            itertools.dropwhile(
                                lambda p: _sort_chars(p)[0].char == " ",
                                positions))

                    def _drop_trailing_spaces(positions):
                        """Drop trailing space positions by trimming the reversed sequence"""
                        return list(
                            reversed(_drop_leading_spaces(
                                reversed(positions))))

                    def _drop_double_spaces(positions):
                        """Keep only the first position of every run of consecutive spaces"""
                        def _drop_double_spaces_generator(positions):
                            last_was_space = False
                            for p in positions:
                                if p.chars[0].char == " ":
                                    if not last_was_space:
                                        yield p
                                    last_was_space = True
                                else:
                                    yield p
                                    last_was_space = False

                        return list(_drop_double_spaces_generator(positions))

                    positions = prediction.positions
                    positions = _drop_leading_spaces(positions)
                    positions = _drop_trailing_spaces(positions)
                    positions = _drop_double_spaces(positions)
                    positions = list(positions)

                    # One character per remaining position: its best alternative.
                    line_text = ''.join(
                        _sort_chars(p)[0].char for p in positions)
                    if line_text != prediction.sentence:
                        log.warning(
                            "Our own line text is not the same as Calamari's: '%s' != '%s'",
                            line_text, prediction.sentence)

                    # Delete existing results
                    if line.get_TextEquiv():
                        log.warning("Line '%s' already contained text results",
                                    line.id)
                    line.set_TextEquiv([])
                    if line.get_Word():
                        log.warning(
                            "Line '%s' already contained word segmentation",
                            line.id)
                    line.set_Word([])

                    # Save line results
                    line_conf = prediction.avg_char_probability
                    line.set_TextEquiv(
                        [TextEquivType(Unicode=line_text, conf=line_conf)])

                    # Save word results
                    #
                    # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
                    # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
                    # hierarchy of lines > words > glyphs.

                    def _words(s):
                        """Split words based on spaces and include spaces as 'words'"""
                        spaces = None
                        word = ''
                        for c in s:
                            if c == ' ' and spaces is True:
                                word += c
                            elif c != ' ' and spaces is False:
                                word += c
                            else:
                                if word:
                                    yield word
                                word = c
                                spaces = (c == ' ')
                        yield word

                    if self.parameter['textequiv_level'] in ['word', 'glyph']:
                        word_no = 0
                        # Index into positions of the first character of the
                        # current run; advanced for space runs as well.
                        i = 0

                        for word_text in _words(line_text):
                            word_length = len(word_text)
                            if not all(c == ' ' for c in word_text):
                                word_positions = positions[i:i + word_length]
                                word_start = word_positions[0].global_start
                                word_end = word_positions[-1].global_end

                                # Box spanning the word horizontally and the
                                # full height of *this* line's image.
                                polygon = polygon_from_x0y0x1y1([
                                    word_start, 0, word_end, line_height
                                ])
                                points = points_from_polygon(
                                    coordinates_for_segment(
                                        polygon, None, line_coords))
                                # XXX Crop to line polygon?

                                word = WordType(id='%s_word%04d' %
                                                (line.id, word_no),
                                                Coords=CoordsType(points))
                                word.add_TextEquiv(
                                    TextEquivType(Unicode=word_text))

                                if self.parameter[
                                        'textequiv_level'] == 'glyph':
                                    for glyph_no, p in enumerate(
                                            word_positions):
                                        glyph_start = p.global_start
                                        glyph_end = p.global_end

                                        polygon = polygon_from_x0y0x1y1([
                                            glyph_start, 0, glyph_end,
                                            line_height
                                        ])
                                        points = points_from_polygon(
                                            coordinates_for_segment(
                                                polygon, None, line_coords))

                                        glyph = GlyphType(
                                            id='%s_glyph%04d' %
                                            (word.id, glyph_no),
                                            Coords=CoordsType(points))

                                        # Add predictions (= TextEquivs)
                                        char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                        for char_index, char in enumerate(
                                                _sort_chars(p),
                                                start=char_index_start):
                                            glyph.add_TextEquiv(
                                                TextEquivType(
                                                    Unicode=char.char,
                                                    index=char_index,
                                                    conf=char.probability))

                                        word.add_Glyph(glyph)

                                line.add_Word(word)
                                # Only real (non-space) words are numbered.
                                word_no += 1

                            i += word_length

            _page_update_higher_textequiv_levels('line', pcgts)

            # Add metadata about this operation and its runtime parameters:
            self.add_metadata(pcgts)
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
Beispiel #5
0
    def process_lines(self, textlines, predfiles, fgrp, regionid):
        """Copy stored JSON predictions into the given text lines.

        For every line, each entry of ``predfiles`` equal to
        ``"<fgrp>-<regionid>-<line.id>"`` is loaded from
        ``<self.root>/<entry>.json``. The first prediction's ``sentence``
        replaces the line's first ``TextEquiv``. If ``self.maxlevel`` is
        ``'word'`` or ``'glyph'``, the line's words are rebuilt from the
        character positions (split at spaces), and for ``'glyph'`` each word
        also gets one glyph per character.

        Confidences are the midpoint between the smallest and the largest
        value one level below (characters for a word, words for the line) and
        are stored as strings.

        Args:
            textlines: line objects to update in place.
            predfiles: names (without ``.json``) of available prediction files.
            fgrp: file group part of the expected prediction file name.
            regionid: region id part of the expected prediction file name.

        Raises:
            ValueError: if a matching prediction contains no non-space
                characters (there is nothing to derive a confidence from).
        """
        for line in textlines:

            for file in predfiles:
                if file != '-'.join([fgrp, regionid, line.id]):
                    continue
                self.log.info("Processing text in line '%s'", line.id)

                filepath = self.root + '/' + file + '.json'
                # Only the parsing needs the open file handle.
                with open(filepath) as f:
                    data = json.load(f)

                prediction = data['predictions'][0]
                linepred = prediction['sentence']

                # Parallel per-word lists: text, character confidences and
                # character (start, end) positions.
                words = []
                line_conf = []
                line_pos = []

                w = ''
                word_conf = []
                word_pos = []
                for d in prediction['positions']:
                    best = d['chars'][0]
                    char = best['char']
                    if char == ' ':
                        # A space closes the current word. Leading or repeated
                        # spaces close nothing and must not create an empty
                        # word (it would have no confidence or position).
                        if word_pos:
                            words.append(w)
                            line_conf.append(word_conf)
                            line_pos.append(word_pos)
                        w = ''
                        word_conf = []
                        word_pos = []
                    else:
                        w += char
                        word_conf.append(best['probability'])
                        word_pos.append((d['globalStart'], d['globalEnd']))
                # Close the last word when the line does not end in a space.
                if word_pos:
                    words.append(w)
                    line_conf.append(word_conf)
                    line_pos.append(word_pos)

                wconfs = [(min(conf) + max(conf)) / 2 for conf in line_conf]
                lineconf = (min(wconfs) + max(wconfs)) / 2

                line.replace_TextEquiv_at(
                    0, TextEquivType(Unicode=linepred, conf=str(lineconf)))

                # Was ``self.maxlevel == 'word' or 'glyph'``, which is always
                # true because the non-empty string 'glyph' is truthy.
                if self.maxlevel not in ('word', 'glyph'):
                    continue

                box = bounding_box(line.get_Coords().points)
                line.Word = []
                for w_no, w in enumerate(words):
                    char_spans = line_pos[w_no]

                    # Coords of word: start of its first character to the end
                    # of its last one, offset by the line's bounding box.
                    # NOTE(review): the right edge adds the end offset to
                    # box[2]; if bounding_box returns (x0, y0, x1, y1), box[0]
                    # looks like the intended base -- confirm before changing.
                    word_bbox = [
                        box[0] + char_spans[0][0], box[1],
                        box[2] + char_spans[-1][-1], box[3]
                    ]

                    word_id = '%s_word%04d' % (line.id, w_no)
                    word = WordType(
                        id=word_id,
                        Coords=CoordsType(points_from_x0y0x1y1(word_bbox)))

                    line.add_Word(word)
                    word.add_TextEquiv(
                        TextEquivType(Unicode=w, conf=str(wconfs[w_no])))

                    if self.maxlevel != 'glyph':
                        continue

                    for glyph_no, g in enumerate(w):
                        glyph_span = char_spans[glyph_no]
                        # NOTE(review): same box[2] question as for words.
                        glyph_bbox = [
                            box[0] + glyph_span[0], box[1],
                            box[2] + glyph_span[-1], box[3]
                        ]

                        glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
                        glyph = GlyphType(
                            id=glyph_id,
                            Coords=CoordsType(
                                points_from_x0y0x1y1(glyph_bbox)))

                        word.add_Glyph(glyph)
                        glyph.add_TextEquiv(
                            TextEquivType(
                                Unicode=g,
                                conf=str(line_conf[w_no][glyph_no])))
Beispiel #6
0
    def process_lines(self, textlines, maxlevel, region_image, region_coords):
        """Recognize each text line and write the result back into the line.

        For every line the image is cropped from ``region_image``, resized and
        passed to ``recognize``. The line's previous text (first ``TextEquiv``,
        or ``''``) is kept only for comparison; all existing ``TextEquiv`` and
        ``Word`` children are removed before recognition. The predicted text
        becomes the new line ``TextEquiv``; when ``maxlevel`` is ``'word'`` or
        ``'glyph'`` words (and for ``'glyph'`` also glyphs) are added with
        coordinates derived from the per-character positions.

        Args:
            textlines: line objects to process (modified in place).
            maxlevel: deepest level to annotate; ``'word'`` and ``'glyph'``
                trigger word / glyph generation, any other value stops at the
                line.
            region_image: image of the enclosing region.
            region_coords: coordinate information belonging to
                ``region_image``, passed on to ``image_from_segment``.

        Returns:
            Tuple ``(edits, lengs)``: the summed Levenshtein distance between
            predicted and previous line text, and the summed length of the
            previous line texts, over all lines that were recognized.
        """
        edits = 0
        lengs = 0
        for line in textlines:
            line_image, line_coords = self.workspace.image_from_segment(
                line, region_image, region_coords)

            self.logger.info("Recognizing text in line '%s'", line.id)
            if line.get_TextEquiv():
                linegt = line.TextEquiv[0].Unicode
            else:
                linegt = ''
            self.logger.debug("GT  '%s': '%s'", line.id, linegt)
            # remove existing annotation below line level:
            # NOTE(review): this happens before the two `continue` checks
            # below, so lines that end up skipped lose their previous text
            # and words as well.
            line.set_TextEquiv([])
            line.set_Word([])

            # Skip lines whose second size component (the height, assuming a
            # PIL image) is below 16 pixels.
            if line_image.size[1] < 16:
                self.logger.debug(
                    "ERROR: bounding box is too narrow at line %s", line.id)
                continue
            # resize image to 48 pixel height
            # (target height per the original comment; it is not visible
            # here, resize_keep_ratio is called with its defaults)
            final_img, scale = resize_keep_ratio(line_image)

            # process ocropy:
            # presumably returns the predicted text plus parallel per-character
            # lists of characters, positions and confidences -- TODO confirm
            try:
                linepred, clist, rlist, confidlist = recognize(final_img,
                                                               self.pad,
                                                               self.network,
                                                               check=True)
            except Exception as err:
                # Any failure is only logged at debug level and the line is
                # skipped.
                self.logger.debug('error processing line "%s": %s', line.id,
                                  err)
                continue
            self.logger.debug("OCR '%s': '%s'", line.id, linepred)
            edits += Levenshtein.distance(linepred, linegt)
            lengs += len(linegt)

            words = [x.strip() for x in linepred.split(' ') if x.strip()]

            word_r_list = [[0]]  # r-positions of every glyph in every word
            word_conf_list = [[]]  # confidences of every glyph in every word
            if words != []:
                w_no = 0
                # Becomes True at the first non-space character, so leading
                # spaces never start a new word.
                found_char = False
                for i, c in enumerate(clist):
                    if c != ' ':
                        found_char = True
                        word_conf_list[w_no].append(confidlist[i])
                        word_r_list[w_no].append(rlist[i])

                    if c == ' ' and found_char:
                        # NOTE(review): this branch looks unreachable --
                        # found_char cannot be True at i == 0 when clist[0]
                        # is a space.
                        if i == 0:
                            word_r_list[0][0] = rlist[i]

                        # A space directly followed by a non-space character
                        # starts the next word at the space's position.
                        elif i + 1 <= len(clist) - 1 and clist[i + 1] != ' ':
                            word_conf_list.append([])
                            word_r_list.append([rlist[i]])
                            w_no += 1
            else:
                # No words predicted: one pseudo entry with confidence 0
                # spanning the whole line image width.
                word_conf_list = [[0]]
                word_r_list = [[0, line_image.width]]

            # conf for each word
            # (midpoint between its lowest and highest glyph confidence)
            wordsconf = [(min(x) + max(x)) / 2 for x in word_conf_list]
            # conf for the line
            # (midpoint between its lowest and highest word confidence)
            line_conf = (min(wordsconf) + max(wordsconf)) / 2
            # line text
            line.add_TextEquiv(TextEquivType(Unicode=linepred, conf=line_conf))

            if maxlevel in ['word', 'glyph']:
                # NOTE(review): `words` comes from splitting linepred while
                # word_r_list / wordsconf come from clist; the indexing below
                # assumes both yield the same number of words.
                for word_no, word_str in enumerate(words):
                    # Positions are divided by `scale`, presumably to map them
                    # from the resized image back to the line image; the box
                    # covers the full line image height.
                    word_points = points_from_polygon(
                        coordinates_for_segment(
                            np.array(
                                polygon_from_bbox(
                                    word_r_list[word_no][0] / scale, 0,
                                    word_r_list[word_no][-1] / scale,
                                    0 + line_image.height)), line_image,
                            line_coords))
                    word_id = '%s_word%04d' % (line.id, word_no)
                    word = WordType(id=word_id, Coords=CoordsType(word_points))
                    line.add_Word(word)
                    word.add_TextEquiv(
                        TextEquivType(Unicode=word_str,
                                      conf=wordsconf[word_no]))

                    if maxlevel == 'glyph':
                        for glyph_no, glyph_str in enumerate(word_str):
                            # Each glyph spans from its entry in the word's
                            # position list to the following entry.
                            glyph_points = points_from_polygon(
                                coordinates_for_segment(
                                    np.array(
                                        polygon_from_bbox(
                                            word_r_list[word_no][glyph_no] /
                                            scale, 0,
                                            word_r_list[word_no][glyph_no + 1]
                                            / scale, 0 + line_image.height)),
                                    line_image, line_coords))
                            glyph_id = '%s_glyph%04d' % (word.id, glyph_no)
                            glyph = GlyphType(id=glyph_id,
                                              Coords=CoordsType(glyph_points))
                            word.add_Glyph(glyph)
                            glyph.add_TextEquiv(
                                TextEquivType(
                                    Unicode=glyph_str,
                                    conf=word_conf_list[word_no][glyph_no]))
        return edits, lengs