Ejemplo n.º 1
0
 def test_init_tesseract_version_4(self, get_version):
     """Check the attributes of a new LineBoxBuilder when the version is 4.0.0.

     ``get_version`` is presumably a mock injected by a patch decorator that
     is outside this view; it is made to report version (4, 0, 0) before the
     builder is constructed.
     """
     get_version.return_value = (4, 0, 0)
     line_builder = builders.LineBoxBuilder()

     # (attribute name, expected list), checked in this order.
     expected_lists = (
         ("tesseract_flags", ["--psm", "1"]),
         ("file_extensions", ["html", "hocr"]),
         ("tesseract_configs", ["hocr"]),
         ("cuneiform_args", ["-f", "hocr"]),
         ("lines", []),
     )
     for attr_name, expected in expected_lists:
         self.assertListEqual(getattr(line_builder, attr_name), expected)

     self.assertEqual(line_builder.tesseract_layout, 1)
Ejemplo n.º 2
0
def ocr(tool, img, cont='txt'):
    """Run ``tool`` on a grayscale image and return the requested kind of result.

    Args:
        tool: OCR backend; only its
            ``image_to_string(image, lang=..., builder=...)`` method is used.
        img: single-channel image with values in [0, 1] (float64 according to
            the original note). It is multiplied by 255, cast to uint8 and
            wrapped in a mode-'L' PIL image before being passed to ``tool``.
        cont: which builder to use:

            * ``'txt'`` -> ``ocrtools.TextBuilder``; the result is a Python
              string.
            * ``'word_boxes'`` -> ``ocrtools.WordBoxBuilder``; the result is a
              list of box objects, where ``box.content`` is the word and
              ``box.position`` is its position on the page in pixels.
            * ``'line_word_boxes'`` -> ``ocrtools.LineBoxBuilder``; the result
              is a list of line objects, where ``line.word_boxes`` holds the
              individual word boxes, ``line.content`` the whole text of the
              line and ``line.position`` the line position in pixels.

            Some OCR tools (Tesseract for instance) may return empty boxes.

    Returns:
        Whatever ``tool.image_to_string`` returns for the selected builder.

    Raises:
        ValueError: if ``cont`` is not one of the three values above.
    """
    # Pick the builder class first, so an unsupported `cont` fails before any
    # image conversion happens.
    if cont == 'txt':
        builder_cls = ocrtools.TextBuilder
    elif cont == 'word_boxes':
        builder_cls = ocrtools.WordBoxBuilder
    elif cont == 'line_word_boxes':
        builder_cls = ocrtools.LineBoxBuilder
    else:
        # NOTE: a 'digits' mode (pyocr.tesseract.DigitBuilder, Tesseract only)
        # is not wired up here, so it is rejected like any other value.
        raise ValueError(" Not supported OCR type ")

    # Language is fixed to English; tool.get_available_languages() is not
    # consulted.
    gray_u8 = (img * 255.0).astype('uint8')
    page = Image.fromarray(gray_u8, mode='L')
    return tool.image_to_string(page, lang="eng", builder=builder_cls())
Ejemplo n.º 3
0
 def setUp(self, get_version):
     """Build the fixtures stored on ``self``.

     ``get_version`` is presumably a mock injected by a patch decorator that
     is outside this view; it is set to report version (4, 0, 0) before the
     builder is created.
     """
     get_version.return_value = (4, 0, 0)
     self.builder = builders.LineBoxBuilder()
     self.image = Image.new(mode="RGB", size=(1, 1))

     lines_content = self._get_file_content("cuneiform.lines")
     self.text_file = StringIO(lines_content)

     # Process-like mock: stdout.read() yields the Cuneiform banner and
     # wait() returns 0.
     process = MagicMock()
     process.configure_mock(**{
         "stdout.read.return_value": b"Cuneiform for Linux 1.1.0\n",
         "wait.return_value": 0,
     })
     self.stdout = process

     self.tmp_filename = "/tmp/cuneiform_n0qfk87otxt"

     # Mock whose __enter__ returns an object with `.name` set to the
     # temporary file name.
     entered = MagicMock()
     self.enter = MagicMock()
     self.enter.__enter__.return_value = entered
     # configure_mock is required here: `name` cannot be set through the
     # MagicMock constructor, where it names the mock itself.
     entered.configure_mock(name=self.tmp_filename)
Ejemplo n.º 4
0
 def setUp(self):
     """Store a newly constructed ``builders.LineBoxBuilder`` on ``self.builder``."""
     line_builder = builders.LineBoxBuilder()
     self.builder = line_builder
Ejemplo n.º 5
0
 def set_builder(self):
     """Store a newly constructed ``builders.LineBoxBuilder`` on ``self._builder``."""
     line_builder = builders.LineBoxBuilder()
     self._builder = line_builder
Ejemplo n.º 6
0
 def setUp(self, get_version):
     """Create the image and builder fixtures stored on ``self``.

     ``get_version`` is presumably a mock injected by a patch decorator that
     is outside this view; it is set to report version (4, 0, 0) before the
     builder is created.
     """
     get_version.return_value = (4, 0, 0)

     # 1x1 RGB placeholder image.
     blank_image = Image.new(mode="RGB", size=(1, 1))
     self.image = blank_image

     line_builder = builders.LineBoxBuilder()
     self.builder = line_builder
Ejemplo n.º 7
0
 def setUp(self, get_version):
     """Store a ``builders.LineBoxBuilder`` on ``self.builder``.

     ``get_version`` is presumably a mock injected by a patch decorator that
     is outside this view; it is set to report version (4, 0, 0) before the
     builder is created.
     """
     get_version.return_value = (4, 0, 0)
     line_builder = builders.LineBoxBuilder()
     self.builder = line_builder