Example #1
0
    def parse_pages(self):
        """Generate metadata for full page images.

        Includes every character on page. Characters not in character
        set or vocabulary will be labeled as unknown when converted to
        integer IDs.

        Consecutive characters that share an ``image_id`` are grouped
        into one page; a change of ``image_id`` starts a new page.

        Returns:
            :obj:`list` of :obj:`carpedm.data.util.ImageMeta`:
                Page image meta data. Empty if the reader has no rows
                left to consume.
        """
        # Use a sentinel default so an exhausted reader yields an empty
        # result instead of leaking ``StopIteration`` to the caller.
        exhausted = object()
        first_row = next(self._reader, exhausted)
        if first_row is exhausted:
            return []

        pages = []
        char = self.character(first_row)
        image_id = char.image_id
        image = ImageMeta(filepath=image_id, full_image=True, first_char=char)
        for char in self.characters():
            if char.image_id == image_id:
                image.add_char(char)
            else:
                # A different image ID means the previous page is
                # complete; store it and start the next page.
                pages.append(image)
                image_id = char.image_id
                image = ImageMeta(filepath=image_id,
                                  full_image=True,
                                  first_char=char)
        # Add last image.
        pages.append(image)
        return pages
Example #2
0
 def test_character_bounding_boxes(self):
     # Build an image from the two fixture characters, then compare
     # every bounding-box edge against its expected value.
     first = self.char1
     image = ImageMeta(filepath=first.image_id,
                       full_image=False,
                       first_char=first)
     image.add_char(self.char2)
     # (box index, edge name, expected value); index 0 is checked
     # before index 1, in xmin/xmax/ymin/ymax order.
     expected_edges = (
         (0, 'xmin', 0),
         (0, 'xmax', 179),
         (0, 'ymin', 0),
         (0, 'ymax', 132),
         (1, 'xmin', 8),
         (1, 'xmax', 157),
         (1, 'ymin', 246),
         (1, 'ymax', 414),
     )
     for index, edge, value in expected_edges:
         self.assertEqual(getattr(image.char_bboxes[index], edge), value)
Example #3
0
    def parse_sequences(self, charset, len_min, len_max):
        """Generate metadata for images of character sequences.

        Only sequences made of characters in ``charset`` are kept. A
        target length is drawn uniformly from ``[len_min, len_max]``
        for each sequence, so lengths are deterministic only when
        ``len_min == len_max``. Sequences that end with fewer than
        ``len_min`` characters are dropped.

        Args:
            charset (CharacterSet): The character set.
            len_min (int): Minimum sequence length.
            len_max (int): Maximum sequence length.

        Returns:
            :obj:`list` of :obj:`carpedm.data.util.ImageMeta`:
                Sequence image meta data.

        """
        collected = []
        target_len = random.randint(len_min, len_max)
        current = None
        for char in self.characters():
            if current is not None:
                can_extend = (current.valid_char(char, same_line=True)
                              and charset.in_charset(char.label)
                              and current.num_chars < target_len)
                if can_extend:
                    current.add_char(char)
                    continue
                # The running sequence ends here. Keep it only if its
                # length is acceptable, and draw a new target length
                # only when a sequence was actually stored.
                if len_min <= current.num_chars <= len_max:
                    collected.append(current)
                    target_len = random.randint(len_min, len_max)
                current = None
            # No open sequence: this character starts one only if it is
            # in the character set.
            if charset.in_charset(char.label):
                current = ImageMeta(filepath=char.image_id, first_char=char)
        # Flush the trailing sequence, if any.
        if current is not None and len_min <= current.num_chars <= len_max:
            collected.append(current)
        return collected
Example #4
0
    def parse_lines(self):
        """Generate metadata for vertical lines of characters.

        Characters not in character set or vocabulary will be labeled as
        unknown when converted to integer IDs.

        A character is appended to the current line while
        ``ImageMeta.valid_char(..., same_line=True)`` accepts it;
        otherwise the current line is closed and the character starts a
        new one.

        Returns:
            :obj:`list` of :obj:`carpedm.data.util.ImageMeta`:
                Line image meta data. Empty if the reader has no rows
                left to consume.
        """
        # Use a sentinel default so an exhausted reader yields an empty
        # result instead of leaking ``StopIteration`` to the caller.
        exhausted = object()
        first_row = next(self._reader, exhausted)
        if first_row is exhausted:
            return []

        lines = []
        c = self.character(first_row)
        image = ImageMeta(filepath=c.image_id, first_char=c)

        for c in self.characters():
            if image.valid_char(c, same_line=True):
                image.add_char(c)
            else:
                # Character rejected for the current line: close that
                # line and begin a new one with this character.
                lines.append(image)
                image = ImageMeta(filepath=c.image_id, first_char=c)
        # Add the last line.
        lines.append(image)
        return lines
Example #5
0
 def test_line_bounding_boxes_for_full_page(self):
     # Load every CSV row after the header into one full-page image.
     page = ImageMeta(filepath=self.image_path, full_image=True)
     with open(self.csv_path, 'r') as handle:
         rows = csv.reader(handle)
         next(rows)  # header
         for fields in rows:
             page.add_char(Character(*fields))
     # Expected (xmin, xmax, ymin, ymax) for line boxes 0 through 5.
     expected_lines = (
         (2290, 2493, 717, 3780),   # line1
         (1931, 2136, 703, 3796),   # line2
         (1606, 1811, 715, 3783),   # line3
         (1266, 1485, 701, 3320),   # line4
         (940, 1154, 859, 2368),    # line5
         (612, 817, 1627, 3583),    # line6
     )
     edge_names = ('xmin', 'xmax', 'ymin', 'ymax')
     for index, edges in enumerate(expected_lines):
         for edge, value in zip(edge_names, edges):
             self.assertEqual(getattr(page.line_bboxes[index], edge),
                              value)
Example #6
0
 def test_character_labels(self):
     # After adding the second fixture character, the image should
     # report exactly these two labels, in this order.
     first = self.char1
     image = ImageMeta(filepath=first.image_id,
                       full_image=False,
                       first_char=first)
     image.add_char(self.char2)
     expected_labels = ['U+4E4B', 'U+6CD5']
     self.assertEqual(image.char_labels, expected_labels)
Example #7
0
 def test_num_characters(self):
     # One character at construction plus one added afterwards.
     first = self.char1
     image = ImageMeta(filepath=first.image_id,
                       full_image=False,
                       first_char=first)
     image.add_char(self.char2)
     expected_count = 2
     self.assertEqual(image.num_chars, expected_count)
Example #8
0
 def test_get_multi_char_height(self):
     # Height of the image built from the two fixture characters.
     first = self.char1
     image = ImageMeta(filepath=first.image_id,
                       full_image=False,
                       first_char=first)
     image.add_char(self.char2)
     expected_height = 414
     self.assertEqual(image.height, expected_height)
Example #9
0
 def test_get_multi_char_width(self):
     # Width of the image built from the two fixture characters.
     first = self.char1
     image = ImageMeta(filepath=first.image_id,
                       full_image=False,
                       first_char=first)
     image.add_char(self.char2)
     expected_width = 179
     self.assertEqual(image.width, expected_width)
Example #10
0
 def test_get_multi_char_ymax(self):
     # Bottom edge of the image built from the two fixture characters.
     first = self.char1
     image = ImageMeta(filepath=first.image_id,
                       full_image=False,
                       first_char=first)
     image.add_char(self.char2)
     # Kept as a sum to mirror how the expected value was derived;
     # presumably an offset plus a height -- confirm against fixtures.
     expected_ymax = 963 + 168
     self.assertEqual(image.ymax, expected_ymax)