Ejemplo n.º 1
0
 def test_line_error(self, popen):
     """A cuneiform run that exits with status 1 raises CuneiformError.

     The process returned by ``popen`` is set up to emit an ImageMagick
     "Improper image header" message and to return 1 from ``wait()``; the
     raised exception must expose both the status and the message.
     """
     error_output = (
         "Cuneiform for Linux 1.1.0\n"
         "Magick: Improper image header (example.png) reported by "
         "coders/png.c:2932 (ReadPNGImage)\n"
     )
     process = self.stdout
     process.stdout.read.return_value = error_output.encode()
     process.wait.return_value = 1
     popen.return_value = process

     with self.assertRaises(cuneiform.CuneiformError) as raised:
         cuneiform.image_to_string(self.image, builder=self.builder)

     failure = raised.exception
     self.assertEqual(failure.status, 1)
     self.assertEqual(failure.message, error_output)
Ejemplo n.º 2
0
    def test_write_read(self):
        """Boxes written with the builder can be read back unchanged.

        Runs cuneiform on the sample image, writes the resulting boxes to a
        temporary file through ``self.builder`` and checks that reading the
        file back yields equal boxes in the same order.
        """
        image_path = os.path.join("tests", "input", "specific", "test.png")
        original_boxes = cuneiform.image_to_string(
            Image.open(image_path),
            builder=self.builder
        )
        # assertGreater reports the actual length when it fails.
        self.assertGreater(len(original_boxes), 0)

        (raw_fd, tmp_path) = tempfile.mkstemp()
        try:
            # we must open the file with codecs.open() for utf-8 support,
            # so the raw descriptor from mkstemp() is closed right away
            os.close(raw_fd)

            with codecs.open(tmp_path, 'w', encoding='utf-8') as out_file:
                self.builder.write_file(out_file, original_boxes)

            with codecs.open(tmp_path, 'r', encoding='utf-8') as in_file:
                new_boxes = self.builder.read_file(in_file)

            self.assertEqual(len(new_boxes), len(original_boxes))
            # Lengths are equal at this point, so zip() pairs every box.
            for new_box, original_box in zip(new_boxes, original_boxes):
                self.assertEqual(new_box, original_box)
        finally:
            os.remove(tmp_path)
Ejemplo n.º 3
0
    def __test_txt(self, image_file, expected_box_file, lang='eng'):
        """Compare the boxes cuneiform finds in an image with a reference file.

        :param image_file: image file name under tests/input/specific
        :param expected_box_file: reference file name under
            tests/output/specific/cuneiform, parsed with ``self.builder``
        :param lang: language passed to ``cuneiform.image_to_string``
        """
        image_file = os.path.join("tests", "input", "specific", image_file)
        expected_box_file = os.path.join(
            "tests", "output", "specific", "cuneiform", expected_box_file
        )

        with codecs.open(expected_box_file, 'r', encoding='utf-8') \
                as file_descriptor:
            expected_boxes = self.builder.read_file(file_descriptor)
        expected_boxes.sort()

        boxes = cuneiform.image_to_string(Image.open(image_file), lang=lang,
                                          builder=self.builder)
        boxes.sort()

        self.assertEqual(len(boxes), len(expected_boxes))

        # Resolve the text type once instead of raising and catching
        # NameError on every iteration under Python 3.
        try:
            text_type = unicode  # noqa: F821 - Python 2.7
        except NameError:
            text_type = str  # Python 3.x

        # zip() stops at the shorter list, like the former min() bound.
        for box, expected_box in zip(boxes, expected_boxes):
            self.assertEqual(type(expected_box.content), text_type)
            self.assertEqual(type(box.content), text_type)
            self.assertEqual(box, expected_box)
Ejemplo n.º 4
0
    def __test_txt(self, image_file, expected_box_file, lang='eng'):
        """Compare the boxes cuneiform finds in an image with a reference file.

        :param image_file: image file name under tests/data/
        :param expected_box_file: reference file name under tests/cuneiform/,
            parsed with ``self.builder``
        :param lang: language passed to ``cuneiform.image_to_string``
        """
        image_file = "tests/data/" + image_file
        expected_box_file = "tests/cuneiform/" + expected_box_file

        with codecs.open(expected_box_file, 'r', encoding='utf-8') \
                as file_descriptor:
            expected_boxes = self.builder.read_file(file_descriptor)
        expected_boxes.sort()

        boxes = cuneiform.image_to_string(Image.open(image_file),
                                          lang=lang,
                                          builder=self.builder)
        boxes.sort()

        self.assertEqual(len(boxes), len(expected_boxes))

        # Resolve the text type once instead of raising and catching
        # NameError on every iteration under Python 3.
        try:
            text_type = unicode  # noqa: F821 - Python 2.7
        except NameError:
            text_type = str  # Python 3.x

        # zip() stops at the shorter list, like the former min() bound.
        for box, expected_box in zip(boxes, expected_boxes):
            self.assertEqual(type(expected_box.content), text_type)
            self.assertEqual(type(box.content), text_type)
            self.assertEqual(box, expected_box)
Ejemplo n.º 5
0
 def test_text(self, popen, copen, temp_file):
     """image_to_string with ``self.builder`` returns the stripped "text"
     fixture and runs cuneiform once with ``-f text``."""
     temp_file.return_value = self.enter
     copen.return_value = self.text_file
     popen.return_value = self.stdout

     result = cuneiform.image_to_string(self.image, builder=self.builder)

     expected_text = self._get_file_content("text").strip()
     self.assertEqual(result, expected_text)

     expected_command = [
         "cuneiform", "-f", "text", "-o", self.tmp_filename, "-",
     ]
     popen.assert_called_once_with(
         expected_command,
         stdin=subprocess.PIPE,
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
     )
Ejemplo n.º 6
0
 def test_line(self, popen, copen, temp_file):
     """image_to_string with ``self.builder`` runs cuneiform once with
     ``-f hocr`` and yields only ``builders.LineBox`` instances."""
     temp_file.return_value = self.enter
     copen.return_value = self.text_file
     popen.return_value = self.stdout

     boxes = cuneiform.image_to_string(self.image, builder=self.builder)

     expected_command = [
         "cuneiform", "-f", "hocr", "-o", self.tmp_filename, "-",
     ]
     popen.assert_called_once_with(
         expected_command,
         stdin=subprocess.PIPE,
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
     )
     for line_box in boxes:
         self.assertIsInstance(line_box, builders.LineBox)
Ejemplo n.º 7
0
 def test_image_to_string_defaults_to_text_buidler(self, popen, copen,
                                                   temp_file, get_version):
     """Without a ``builder`` argument, image_to_string returns the stripped
     "text" fixture and runs cuneiform once with ``-f text``."""
     get_version.return_value = (4, 0, 0)
     temp_file.return_value = self.enter
     copen.return_value = self.text_file
     popen.return_value = self.stdout

     result = cuneiform.image_to_string(self.image)

     expected_text = self._get_file_content("text").strip()
     self.assertEqual(result, expected_text)

     expected_command = [
         "cuneiform", "-f", "text", "-o", self.tmp_filename, "-",
     ]
     popen.assert_called_once_with(
         expected_command,
         stdin=subprocess.PIPE,
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
     )
Ejemplo n.º 8
0
    def __test_txt(self, image_file, expected_output_file, lang="eng"):
        """Compare the text cuneiform reads from an image with a reference file.

        :param image_file: image file name under tests/data/
        :param expected_output_file: UTF-8 reference file name under
            tests/cuneiform/; its content is stripped before comparison
        :param lang: language passed to ``cuneiform.image_to_string``
        """
        image_file = "tests/data/" + image_file
        expected_output_file = "tests/cuneiform/" + expected_output_file

        with codecs.open(expected_output_file, "r", encoding="utf-8") as file_descriptor:
            # Join the decoded lines in one pass instead of growing a
            # string with += inside a loop.
            expected_output = "".join(file_descriptor).strip()

        output = cuneiform.image_to_string(Image.open(image_file), lang=lang)

        self.assertEqual(output, expected_output)
Ejemplo n.º 9
0
    def __test_txt(self, image_file, expected_output_file, lang='eng'):
        """Compare the text cuneiform reads from an image with a reference file.

        :param image_file: image file name under tests/data/
        :param expected_output_file: UTF-8 reference file name under
            tests/cuneiform/; its content is stripped before comparison
        :param lang: language passed to ``cuneiform.image_to_string``
        """
        image_file = "tests/data/" + image_file
        expected_output_file = "tests/cuneiform/" + expected_output_file

        with codecs.open(expected_output_file, 'r', encoding='utf-8') \
                as file_descriptor:
            # Join the decoded lines in one pass instead of growing a
            # string with += inside a loop.
            expected_output = "".join(file_descriptor).strip()

        output = cuneiform.image_to_string(Image.open(image_file), lang=lang)

        self.assertEqual(output, expected_output)
Ejemplo n.º 10
0
 def test_text_non_rgb_image(self, popen, copen, temp_file):
     """Check that image_to_string works with a non-RGB (mode "L") image;
     the conversion is expected to happen inside the function."""
     grayscale_image = self.image.convert("L")
     temp_file.return_value = self.enter
     copen.return_value = self.text_file
     popen.return_value = self.stdout

     result = cuneiform.image_to_string(grayscale_image,
                                        builder=self.builder)

     expected_text = self._get_file_content("text").strip()
     self.assertEqual(result, expected_text)

     expected_command = [
         "cuneiform", "-f", "text", "-o", self.tmp_filename, "-",
     ]
     popen.assert_called_once_with(
         expected_command,
         stdin=subprocess.PIPE,
         stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT,
     )
Ejemplo n.º 11
0
    def __test_txt(self, image_file, expected_output_file, lang='eng'):
        """Compare the text cuneiform reads from an image with a reference file.

        :param image_file: image file name under tests/input/specific
        :param expected_output_file: UTF-8 reference file name under
            tests/output/specific/cuneiform; its content is stripped before
            comparison
        :param lang: language passed to ``cuneiform.image_to_string``
        """
        image_file = os.path.join("tests", "input", "specific", image_file)
        expected_output_file = os.path.join("tests", "output", "specific",
                                            "cuneiform", expected_output_file)

        with codecs.open(expected_output_file, 'r', encoding='utf-8') \
                as file_descriptor:
            # Join the decoded lines in one pass instead of growing a
            # string with += inside a loop.
            expected_output = "".join(file_descriptor).strip()

        output = cuneiform.image_to_string(Image.open(image_file), lang=lang)

        self.assertEqual(output, expected_output)
Ejemplo n.º 12
0
    def __test_txt(self, image_file, expected_output_file, lang='eng'):
        """Compare the text cuneiform reads from an image with a reference file.

        :param image_file: image file name under tests/input/specific
        :param expected_output_file: UTF-8 reference file name under
            tests/output/specific/cuneiform; its content is stripped before
            comparison
        :param lang: language passed to ``cuneiform.image_to_string``
        """
        image_file = os.path.join("tests", "input", "specific", image_file)
        expected_output_file = os.path.join(
            "tests", "output", "specific", "cuneiform", expected_output_file
        )

        with codecs.open(expected_output_file, 'r', encoding='utf-8') \
                as file_descriptor:
            # Join the decoded lines in one pass instead of growing a
            # string with += inside a loop.
            expected_output = "".join(file_descriptor).strip()

        output = cuneiform.image_to_string(Image.open(image_file), lang=lang)

        self.assertEqual(output, expected_output)
Ejemplo n.º 13
0
    def extract_name_from_image(self, cropped_chunk, counter):
        """
        Recognize the name in a chunk with both cuneiform and tesseract.

        Names need extra preparation before OCR:
         - cut off the icon and class on the left
         - roughly guess where the second line with the level and rank is,
           and remove it as well

        :param cropped_chunk: chunk passed to ``self.get_part_with_name``
        :param counter: not used by this method; kept so existing callers
            keep working
        :return: dict with the tesseract text under 'tesseract' and the
            cuneiform result under 'cunei' (None when cuneiform raises
            CuneiformError)
        """
        image_with_name = self.get_part_with_name(cropped_chunk)

        # cut icon and class: keep 35%..80% of the width and the top half
        # of the height
        imgwidth, imgheight = image_with_name.size
        percentage_value_x = 0.35
        percentage_value_y = 0.5
        left = floor(imgwidth * percentage_value_x)
        lower = floor(imgheight * percentage_value_y)
        box = (left, 0, floor(imgwidth * 0.8), lower)
        converted_names = image_with_name.crop(box)

        try:
            # cuneiform
            text_data_cunei = image_to_string(converted_names, lang='eng')
        except pyocr.error.CuneiformError:
            # Best effort: the caller still gets the tesseract result.
            text_data_cunei = None

        text_data_tesseract = pytesseract.image_to_string(converted_names)
        return {'tesseract': text_data_tesseract, 'cunei': text_data_cunei}
Ejemplo n.º 14
0
    def extract_numbers_from_image(self, cropped_chunk, image_name=None):
        """
        Recognize the numbers in a chunk; works through cuneiform.

        :param cropped_chunk: chunk passed to ``self.get_part_with_numbers``
        :param image_name: optional file name suffix; when it is given and
            ``self.debug`` is truthy, the preprocessed image is saved under
            debug_images/
        :return: result of ``self.merge_nearest_chars`` applied to the boxes
            returned by cuneiform
        """
        threshold_numbers = 70
        numbers_part = self.get_part_with_numbers(cropped_chunk)
        grayscaled = self.grayscale(numbers_part, threshold_numbers)
        bordered = self.add_border(grayscaled)

        save_debug_copy = image_name is not None and self.debug
        if save_debug_copy:
            debug_path = 'debug_images/' + 'extracted_numbers-' + image_name
            bordered.save(debug_path)

        line_boxes = image_to_string(
            bordered,
            lang='ruseng',
            builder=pyocr.builders.LineBoxBuilder()
        )
        return self.merge_nearest_chars(line_boxes)
Ejemplo n.º 15
0
    def test_write_read(self):
        """Boxes written with the builder can be read back unchanged.

        Runs cuneiform on tests/data/test.png, writes the resulting boxes to
        a temporary file through ``self.builder`` and checks that reading the
        file back yields equal boxes in the same order.
        """
        original_boxes = cuneiform.image_to_string(
            Image.open("tests/data/test.png"), builder=self.builder)
        # assertGreater reports the actual length when it fails.
        self.assertGreater(len(original_boxes), 0)

        (raw_fd, tmp_path) = tempfile.mkstemp()
        try:
            # we must open the file with codecs.open() for utf-8 support,
            # so the raw descriptor from mkstemp() is closed right away
            os.close(raw_fd)

            with codecs.open(tmp_path, 'w',
                             encoding='utf-8') as out_file:
                self.builder.write_file(out_file, original_boxes)

            with codecs.open(tmp_path, 'r',
                             encoding='utf-8') as in_file:
                new_boxes = self.builder.read_file(in_file)

            self.assertEqual(len(new_boxes), len(original_boxes))
            # Lengths are equal at this point, so zip() pairs every box.
            for new_box, original_box in zip(new_boxes, original_boxes):
                self.assertEqual(new_box, original_box)
        finally:
            os.remove(tmp_path)
Ejemplo n.º 16
0
    def __test_txt(self, image_file, expected_box_file, lang="eng"):
        """Compare the boxes cuneiform finds in an image with a reference file.

        :param image_file: image file name under tests/data/
        :param expected_box_file: reference file name under tests/cuneiform/,
            parsed with ``self.builder``
        :param lang: language passed to ``cuneiform.image_to_string``
        """
        image_file = "tests/data/" + image_file
        expected_box_file = "tests/cuneiform/" + expected_box_file

        with codecs.open(expected_box_file, "r", encoding="utf-8") as file_descriptor:
            expected_boxes = self.builder.read_file(file_descriptor)
        expected_boxes.sort()

        boxes = cuneiform.image_to_string(Image.open(image_file), lang=lang, builder=self.builder)
        boxes.sort()

        self.assertEqual(len(boxes), len(expected_boxes))

        # Resolve the text type once instead of raising and catching
        # NameError on every iteration under Python 3.
        try:
            text_type = unicode  # noqa: F821 - Python 2.7
        except NameError:
            text_type = str  # Python 3.x

        # zip() stops at the shorter list, like the former min() bound.
        for box, expected_box in zip(boxes, expected_boxes):
            self.assertEqual(type(expected_box.content), text_type)
            self.assertEqual(type(box.content), text_type)
            self.assertEqual(box, expected_box)
Ejemplo n.º 17
0
 def test_digits_box_not_implemented(self):
     """image_to_string with ``self.builder`` raises NotImplementedError."""
     self.assertRaises(
         NotImplementedError,
         cuneiform.image_to_string,
         self.image,
         builder=self.builder,
     )