def test_skipping_first_regression(self):
        """Two frames and two labels must align as exactly those two labels.

        Each label is expensive in the frame it has to occupy, yet the
        expected alignment still emits label 1 and then label 2.
        """
        blank = 0
        transcript = [1, 2]
        costs = np.asarray([
            [10.0, 10.0, 0.0],
            [0.0, 10.0, 10.0],
        ])

        alignment = force_align(costs, transcript, blank)

        self.assertEqual(alignment, [1, 2])
    def test_trivial(self):
        """One label over two frames aligns as blank followed by the label."""
        blank = 0
        transcript = [1]
        costs = np.asarray([
            [0.0, 10.0],
            [10.0, 0.0],
        ])

        alignment = force_align(costs, transcript, blank)

        self.assertEqual(alignment, [0, 1])
Example #3
0
    def test_multi_symbol_regression(self):
        """A repeated label aligns as blank, label, blank, label."""
        blank = 0
        transcript = [2, 2]
        costs = np.asarray([
            [0.0, 10.0, 10.0],
            [10.0, 10.0, 0.0],
            [5.0, 10.0, 5.0],
            [10.0, 10.0, 0.0],
        ])

        alignment = force_align(costs, transcript, blank)

        self.assertEqual(alignment, [0, 2, 0, 2])
Example #4
0
    def test_single_symbol_multi_blank(self):
        """A single label surrounded by several blank frames on both sides."""
        blank = 0
        transcript = [1]
        # Frames where the blank (column 0) is free and label 1 is costly.
        blank_frame = [0.0, 10.0, 0.0]
        # The one frame where label 1 is free and everything else is costly.
        symbol_frame = [10.0, 0.0, 10.0]
        costs = np.asarray([
            blank_frame,
            blank_frame,
            blank_frame,
            symbol_frame,
            blank_frame,
            blank_frame,
        ])

        alignment = force_align(costs, transcript, blank)

        self.assertEqual(alignment, [0, 0, 0, 1, 0, 0])
Example #5
0
    def to_altoxml_string(self):
        """Serialize this page as an ALTO XML document and return it as text.

        The document holds a ``Description`` header (measurement unit and
        OCR-processing metadata) and a ``Layout`` with a single ``Page``.
        Every region in ``self.regions`` becomes a ``TextBlock``; every line
        with a non-empty transcription becomes a ``TextLine`` whose
        whitespace-separated words are written as ``String`` elements with
        ``SP`` elements between them.  Word and space boxes are obtained by
        force-aligning the transcription against the line's logits and
        looking the aligned span up in the line's crop coordinates.

        Reads ``self.id``, ``self.page_size`` (index 0 is written as the page
        HEIGHT, index 1 as the WIDTH) and ``self.regions``.

        Returns:
            str: the pretty-printed XML, decoded from UTF-8.

        Raises:
            KeyError: if a transcription contains a character that is not
                present in the line's ``characters``.
        """
        NSMAP = {
            "xlink": 'http://www.w3.org/1999/xlink',
            "xsi": 'http://www.w3.org/2001/XMLSchema-instance'
        }
        root = ET.Element("alto", nsmap=NSMAP)
        root.set("xmlns", "http://www.loc.gov/standards/alto/ns-v2#")

        # --- Description header -------------------------------------------
        description = ET.SubElement(root, "Description")
        measurement_unit = ET.SubElement(description, "MeasurementUnit")
        measurement_unit.text = "pixel"
        ocr_processing = ET.SubElement(description, "OCRProcessing")
        ocr_processing.set("ID", "IdOcr")
        ocr_processing_step = ET.SubElement(ocr_processing,
                                            "ocrProcessingStep")
        processing_date_time = ET.SubElement(ocr_processing_step,
                                             "processingDateTime")
        processing_date_time.text = datetime.today().strftime('%Y-%m-%d')
        processing_software = ET.SubElement(ocr_processing_step,
                                            "processingSoftware")
        processing_creator = ET.SubElement(processing_software,
                                           "softwareCreator")
        processing_creator.text = "Project PERO"
        software_name = ET.SubElement(processing_software, "softwareName")
        software_name.text = "PERO OCR"
        software_version = ET.SubElement(processing_software,
                                         "softwareVersion")
        software_version.text = "v0.1.0"

        # --- Page skeleton ------------------------------------------------
        layout = ET.SubElement(root, "Layout")
        page = ET.SubElement(layout, "Page")
        page.set("ID", "id_" + self.id)
        page.set("PHYSICAL_IMG_NR", "1")
        page.set("HEIGHT", str(self.page_size[0]))
        page.set("WIDTH", str(self.page_size[1]))

        # Margins and print space are created now (to fix their order in the
        # document) and filled in once the extent of all blocks is known.
        top_margin = ET.SubElement(page, "TopMargin")
        left_margin = ET.SubElement(page, "LeftMargin")
        right_margin = ET.SubElement(page, "RightMargin")
        bottom_margin = ET.SubElement(page, "BottomMargin")
        print_space = ET.SubElement(page, "PrintSpace")

        # Bounding box of all text blocks.  Top/left start at the page extent
        # so that min() shrinks them; bottom/right are tracked separately and
        # start unset.  (Deriving the bottom/right edge from "top + height"
        # while top is still seeded with the page size would pin the box to
        # the page's bottom-right corner and zero out those two margins.)
        print_space_vpos = self.page_size[0]
        print_space_hpos = self.page_size[1]
        print_space_bottom = None
        print_space_right = None

        # Every alignment frame is treated as this many horizontal units.
        # NOTE(review): presumably the recognizer's horizontal subsampling
        # factor -- confirm against the model.
        frame_step = 4

        for block in self.regions:
            text_block = ET.SubElement(print_space, "TextBlock")
            text_block.set("ID", block.id)

            block_xs = block.polygon[:, 0]
            block_ys = block.polygon[:, 1]
            text_block_vpos = min(block_ys)
            text_block_hpos = min(block_xs)
            text_block_height = max(block_ys) - text_block_vpos
            text_block_width = max(block_xs) - text_block_hpos

            text_block.set("HEIGHT", str(text_block_height))
            text_block.set("WIDTH", str(text_block_width))
            text_block.set("VPOS", str(text_block_vpos))
            text_block.set("HPOS", str(text_block_hpos))

            block_bottom = text_block_vpos + text_block_height
            block_right = text_block_hpos + text_block_width
            print_space_vpos = min([print_space_vpos, text_block_vpos])
            print_space_hpos = min([print_space_hpos, text_block_hpos])
            if print_space_bottom is None:
                print_space_bottom = block_bottom
                print_space_right = block_right
            else:
                print_space_bottom = max([print_space_bottom, block_bottom])
                print_space_right = max([print_space_right, block_right])

            for line in block.lines:
                if not line.transcription:
                    continue
                text_line = ET.SubElement(text_block, "TextLine")
                text_line_baseline = int(
                    np.average(np.array(line.baseline)[:, 1]))
                text_line.set("BASELINE", str(text_line_baseline))

                line_polygon = np.array(line.polygon)
                line_xs = line_polygon[:, 0]
                line_ys = line_polygon[:, 1]
                text_line_vpos = min(line_ys)
                text_line_hpos = min(line_xs)
                text_line.set("VPOS", str(text_line_vpos))
                text_line.set("HPOS", str(text_line_hpos))
                text_line.set("HEIGHT", str(max(line_ys) - text_line_vpos))
                text_line.set("WIDTH", str(max(line_xs) - text_line_hpos))

                # Map every character to its index in the line's alphabet;
                # the index one past the alphabet is passed to force_align
                # and marks frames that carry no character.
                char_to_num = {
                    char: index
                    for index, char in enumerate(line.characters)
                }
                blank = len(line.characters)
                label = [char_to_num[char] for char in line.transcription]

                logits = line.get_dense_logits()
                output = softmax(logits, axis=1)
                aligned = force_align(-np.log(output), label, blank)
                # Return value is not used.
                # NOTE(review): presumably refines `aligned` in place --
                # confirm against narrow_label.
                narrow_label(aligned, logits, blank)

                crop_engine = EngineLineCropper(poly=2)
                line_coords = crop_engine.get_crop_inputs(
                    line.baseline, line.heights, 16)

                words = line.transcription.split()
                # Number of aligned characters that precede the current word.
                chars_before_word = 0
                for w, word in enumerate(words):
                    chars_seen = 0
                    word_length = len(word)
                    string_width = 0
                    string_hpos = 0
                    end_of_space = 0
                    word_end_found = False
                    is_last_word = True

                    for frame, symbol in enumerate(aligned):
                        if symbol == blank:
                            continue
                        if chars_seen > chars_before_word:
                            if word_end_found:
                                # First character of the next word: the
                                # separating space ends here.
                                end_of_space = frame_step * frame
                                chars_before_word = chars_seen
                                is_last_word = False
                                break
                            if chars_seen - chars_before_word == word_length:
                                # Character right after the word: the word
                                # ends where this one starts.
                                string_width = (frame_step * frame -
                                                string_hpos)
                                word_end_found = True
                        elif chars_seen - chars_before_word == 0:
                            # First character of the current word.
                            string_hpos = frame_step * frame
                        chars_seen += 1

                    if is_last_word:
                        # No following word: extend to the end of the line.
                        string_width = (frame_step * len(aligned) -
                                        string_hpos)

                    # Scale from alignment units to columns of line_coords.
                    lm_const = np.shape(line_coords)[1] / (len(aligned) *
                                                           frame_step)

                    string = ET.SubElement(text_line, "String")
                    string.set("CONTENT", word)

                    string_hpos -= 1
                    # Clamp the start: for a word beginning at frame 0 the
                    # offset above is negative, and a negative slice start
                    # would wrap around to the end of the line (typically
                    # yielding an empty slice and a ValueError in np.max).
                    start = max(int(string_hpos * lm_const), 0)
                    stop = start + int(string_width * lm_const)
                    all_x = line_coords[:, start:stop, 0]
                    all_y = line_coords[:, start:stop, 1]

                    string.set("HEIGHT",
                               str(int(np.max(all_y) - np.min(all_y))))
                    string.set("WIDTH",
                               str(int(np.max(all_x) - np.min(all_x))))
                    string.set("VPOS", str(int(np.min(all_y))))
                    string.set("HPOS", str(int(np.min(all_x))))

                    if w != (len(words) - 1):
                        space = ET.SubElement(text_line, "SP")
                        # The space spans from the end of this word to the
                        # start of the next one.
                        word_end = string_hpos + string_width
                        start = max(int(word_end * lm_const), 0)
                        stop = start + int(
                            (end_of_space - word_end) * lm_const)
                        all_x = line_coords[:, start:stop, 0]
                        all_y = line_coords[:, start:stop, 1]

                        space.set("WIDTH",
                                  str(int(np.max(all_x) - np.min(all_x))))
                        space.set("VPOS", str(int(np.min(all_y))))
                        space.set("HPOS", str(int(np.min(all_x))))

        if print_space_bottom is None:
            # No regions at all: zero-sized print space anchored at the
            # page's bottom-right corner.
            print_space_height = 0
            print_space_width = 0
        else:
            print_space_height = print_space_bottom - print_space_vpos
            print_space_width = print_space_right - print_space_hpos

        # --- Margins around the print space -------------------------------
        top_margin.set("HEIGHT", "{}".format(print_space_vpos))
        top_margin.set("WIDTH", "{}".format(self.page_size[1]))
        top_margin.set("VPOS", "0")
        top_margin.set("HPOS", "0")

        left_margin.set("HEIGHT", "{}".format(self.page_size[0]))
        left_margin.set("WIDTH", "{}".format(print_space_hpos))
        left_margin.set("VPOS", "0")
        left_margin.set("HPOS", "0")

        right_margin.set("HEIGHT", "{}".format(self.page_size[0]))
        right_margin.set(
            "WIDTH", "{}".format(self.page_size[1] -
                                 (print_space_hpos + print_space_width)))
        right_margin.set("VPOS", "0")
        right_margin.set("HPOS",
                         "{}".format(print_space_hpos + print_space_width))

        bottom_margin.set(
            "HEIGHT", "{}".format(self.page_size[0] -
                                  (print_space_vpos + print_space_height)))
        bottom_margin.set("WIDTH", "{}".format(self.page_size[1]))
        bottom_margin.set("VPOS",
                          "{}".format(print_space_vpos + print_space_height))
        bottom_margin.set("HPOS", "0")

        print_space.set("HEIGHT", str(print_space_height))
        print_space.set("WIDTH", str(print_space_width))
        print_space.set("VPOS", str(print_space_vpos))
        print_space.set("HPOS", str(print_space_hpos))

        return ET.tostring(root, pretty_print=True,
                           encoding="utf-8").decode("utf-8")