def get_content(api, img_path: str,
                image_crop_pram: Tuple[int, int, int, int], tracker) -> \
        List[Tuple[str, float]]:
    """Run OCR on an image file and return its text lines with confidences.

    The image is converted to grayscale, binarized with a fixed threshold of
    180 and optionally cropped before being handed to the Tesseract ``api``.

    Args:
        api: An already-initialised Tesseract API object.
        img_path: Path of the image to open.
        image_crop_pram: Crop box passed to ``Image.crop``; ``None`` skips
            cropping.
        tracker: Object whose ``log`` method receives lines that failed.

    Returns:
        One ``(text, confidence)`` tuple per recognised text line, with all
        spaces and newlines removed from the text.
    """

    def _binarize(pixel):
        # Pixels darker than the threshold become black, the rest white.
        return 0 if pixel < 180 else 255

    grayscale = Image.open(img_path).convert("L")
    prepared = grayscale.point(_binarize, "1")
    if image_crop_pram is not None:
        prepared = prepared.crop(image_crop_pram)

    api.SetImage(prepared)
    api.Recognize()

    results = []
    iterator = api.GetIterator()
    for row in iterate_level(iterator, RIL.TEXTLINE):
        try:
            raw_text = row.GetUTF8Text(RIL.TEXTLINE)
            confidence = row.Confidence(RIL.TEXTLINE)
        except RuntimeError as err:
            tracker.log("No text Returned on this line.",
                        tp=TRACKER_LOG_ERROR,
                        exc_info=err)
            continue
        compact = raw_text.replace(" ", "").replace("\n", "")
        results.append((compact, confidence))
    return results
def iterate_lines(img):
    """Recognise an image file and return its text lines with bounding boxes.

    Args:
        img: Path of the image file handed to ``SetImageFile``.

    Returns:
        A dict of the form
        ``{"status": "Succeeded", "recognitionResult": {"fullTetx": <text>,
        "lines": [{"text": ..., "boundingBox": ...}, ...]}}``.
        ``lines`` is empty when no text was recognised.
    """
    with PyTessBaseAPI(psm=PSM.AUTO, oem=OEM.LSTM_ONLY) as api:
        api.SetImageFile(img)
        text = api.GetUTF8Text()

        # NOTE: the key "fullTetx" is misspelled, but it is part of the
        # returned structure, so it is kept to avoid breaking consumers.
        data = {"status": "Succeeded", "recognitionResult": {"fullTetx": text, "lines": []}}
        if text == '':
            return data

        api.Recognize()
        level = RIL.TEXTLINE
        lines = data['recognitionResult']['lines']

        for r in iterate_level(api.GetIterator(), level):
            line = r.GetUTF8Text(level)
            if line.isspace():
                continue

            # Drop newlines and collapse runs of spaces into a single space.
            line_post = line.replace('\n', '')
            line_post = ' '.join(i for i in line_post.split(' ') if i != '')

            bbox = r.BoundingBox(level)
            line_boundingBox = convert_boundingBox(bbox)

            lines.append({'text': line_post, 'boundingBox': line_boundingBox})

    return data
Beispiel #3
0
 def tsOcrText(self,tpl,text_features,x1,y1,x2,y2,lang='chi_sim',psm=7, oem=1):
     """Find text lines inside a screen region that contain given features.

     Args:
         tpl: RGB image array; the region ``[y1:y2, x1:x2]`` is analysed.
         text_features: Iterable of substrings to look for in each line.
         x1, y1, x2, y2: Corners of the region inside ``tpl``.
         lang: Tesseract language passed to ``PyTessBaseAPI``.
         psm: Page segmentation mode passed to ``PyTessBaseAPI``.
         oem: OCR engine mode passed to ``PyTessBaseAPI``.

     Returns:
         A list of ``(text, x, y)`` tuples, one per (line, feature) match,
         where ``x``/``y`` are the line's left/top offset by ``x1``/``y1``.
     """
     region = tpl[y1:y2, x1:x2]
     gray = cv2.cvtColor(region, cv2.COLOR_RGB2GRAY)
     # Gaussian adaptive thresholding gave the best recognition in testing.
     binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                    cv2.THRESH_BINARY, 11, 2)
     # Convert the numpy array to a PIL image for tesserocr.
     img = Image.fromarray(binary)

     level = RIL.TEXTLINE
     detected = []
     # Honour the caller-supplied engine settings instead of fixed values.
     with PyTessBaseAPI(lang=lang, psm=psm, oem=oem) as api:
         api.SetImage(img)
         api.Recognize()
         for r in iterate_level(api.GetIterator(), level):
             try:
                 line_text = r.GetUTF8Text(level)
                 box = r.BoundingBox(level)
                 detected.append((line_text, box[0], box[1]))
             except Exception:
                 # Best effort: a line without text is reported and skipped.
                 print("没有字符")

     xz = []
     for line_text, left, top in detected:
         for feature in text_features:
             if feature in line_text:
                 xz.append((line_text, left + x1, top + y1))
     return xz
def extract_data_from_image(
        filename,
        tessdata_path="C:\\Program Files (x86)\\Tesseract-OCR\\tessdata"):
    """OCR an image and tabulate its text lines with layout hints.

    Args:
        filename: Path of the image file to recognise.
        tessdata_path: Directory holding the Tesseract language data.

    Returns:
        A DataFrame with one row per text line and the columns
        ``trueline`` (False when the line's top is within 15 px of the
        previous line's top), ``lineitem`` (the text), ``bbox`` and
        ``coldist`` (distance to the previous line's top-left corner, as
        last computed for a non-"true" line; ``None`` until then).
    """
    print(filename)
    columns = ['trueline', 'lineitem', 'bbox', 'coldist']
    rows = []
    bboxPrev = None
    coldist = None
    with PyTessBaseAPI(path=tessdata_path,
                       psm=PSM.SPARSE_TEXT_OSD) as api:
        api.SetImageFile(filename)
        api.Recognize()
        level = RIL.TEXTLINE
        for r in iterate_level(api.GetIterator(), level):
            pdfLine = r.GetUTF8Text(level)
            bbox = r.BoundingBoxInternal(level)
            trueline = True

            if bboxPrev is not None and abs(bbox[1] - bboxPrev[1]) <= 15:
                # Same visual row as the previous item: treat it as another
                # column of that row rather than a new line.
                trueline = False
                coldist = dist(np.array([bbox[0], bbox[1]]),
                               np.array([bboxPrev[0], bboxPrev[1]]))
            bboxPrev = bbox
            rows.append({
                "trueline": trueline,
                "lineitem": pdfLine,
                "bbox": bbox,
                "coldist": coldist,
            })
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(rows, columns=columns)
Beispiel #5
0
    def extract_text(self, image):
        """
        Detect words in an image given as a numpy array.

        Each returned entry is a tuple
        (word, box[0], box[1], box[2], box[3], confidence)
        where ``box`` is the word's bounding box as reported by Tesseract.
        Only non-empty words whose confidence exceeds MIN_CONFIDENCE are kept.
        """
        # Tesseract is fed a Pillow image built from the array.
        source = Image.fromarray(image)
        detections = []

        # SPARSE_TEXT is used to pick out single words rather than lines.
        with PyTessBaseAPI(psm=PSM.SPARSE_TEXT) as api:
            api.SetVariable("save_blob_choices", "T")
            api.SetImage(source)
            api.Recognize()

            for item in iterate_level(api.GetIterator(), RIL.WORD):
                text = item.GetUTF8Text(RIL.WORD)
                confidence = item.Confidence(RIL.WORD)
                corners = item.BoundingBox(RIL.WORD)
                if not (text and confidence > MIN_CONFIDENCE):
                    continue
                detections.append((text, corners[0], corners[1], corners[2],
                                   corners[3], confidence))

        return detections
Beispiel #6
0
    def perform_ocr(self, x_offset=0, y_offset=0, pad_offset=None):
        """Recognise words on ``self.api`` and return them as positioned rows.

        Bounding boxes are scaled by ``72 / 300`` and then shifted by
        ``x_offset``/``y_offset`` and, when given, by ``pad_offset`` (indexed
        as ``[0]`` for x and ``[1]`` for y).

        Returns:
            A list of ``[x0, y0, x1, y1, font_info, word]`` lists sorted by
            ``(y0, x0)``. Words that are empty after stripping, or whose
            processing raises, are left out.
        """
        dpi = 300
        engine = self.api
        engine.Recognize()
        collected = []
        for item in iterate_level(engine.GetIterator(), RIL.WORD):
            try:
                text = item.GetUTF8Text(RIL.WORD)
                font = item.WordFontAttributes()
                coords = [float(v) * 72 / dpi
                          for v in item.BoundingBox(RIL.WORD)]
                # Even indices are x coordinates, odd ones are y.
                for axis, shift in ((0, x_offset), (2, x_offset),
                                    (1, y_offset), (3, y_offset)):
                    coords[axis] += shift
                if pad_offset is not None:
                    for axis in range(4):
                        coords[axis] += pad_offset[axis % 2]
                text = text.strip()
                if not text:
                    continue
                coords.append(font)
                coords.append(text)
                collected.append(coords)
            except Exception as e:
                # Any failure on a word simply skips that word.
                pass

        collected.sort(key=lambda row: (row[1], row[0]))
        return collected
Beispiel #7
0
def classifier_choices():
    """Print each recognised symbol of a test image with its choices.

    Uses the module-level ``api``. A fixed rectangle of the photo test image
    is recognised; for every symbol the text and confidence are printed,
    followed by the entries produced by its choice iterator.
    """
    api.SetImageFile('/home/johannes/Repos/tesseract/testing/phototest.tif')
    api.SetVariable("save_blob_choices", "T")
    api.SetRectangle(37, 228, 548, 31)
    api.Recognize()

    symbol_level = RIL.SYMBOL
    results = api.GetIterator()
    for current in iterate_level(results, symbol_level):
        glyph = current.GetUTF8Text(symbol_level)
        confidence = current.Confidence(symbol_level)
        # Return values are not used; the calls are kept as in the original.
        current.SetLineSeparator('\a')
        current.WordRecognitionLanguage()
        if glyph:
            print(u'symbol {}, conf: {}'.format(glyph, confidence))

        choices = current.GetChoiceIterator()
        for position, candidate in enumerate(choices):
            if position:
                print('\t\t ')
            print('\t- ')
            alternative = candidate.GetUTF8Text()
            print(u'{} conf: {}'.format(alternative, candidate.Confidence()))
            # NOTE(review): advancing here in addition to the for-loop looks
            # like it skips entries; kept because the original does it.
            choices.Next()

        print('---------------------------------------------')
Beispiel #8
0
def upload(request: ImageModel):
    """Decode a base64 image from the request and OCR it line by line.

    Returns:
        ``{"texts": [...]}`` where every entry holds the line's ``text`` and
        its ``left``, ``top``, ``width`` and ``height`` derived from the
        line's internal bounding box.
    """
    decoded = base64.b64decode(request.base64)
    image = Image.open(io.BytesIO(decoded))

    with PyTessBaseAPI(oem=OEM.LSTM_ONLY) as api:
        api.SetImage(image)
        api.Recognize()
        api.SetVariable("save_blob_choices", "T")
        lines = api.GetIterator()
        # Result unused; the call is kept as in the original.
        api.GetComponentImages(RIL.TEXTLINE, True)

        text_list = []
        for entry in iterate_level(lines, RIL.TEXTLINE):
            content = entry.GetUTF8Text(RIL.TEXTLINE)
            entry.Confidence(RIL.TEXTLINE)
            box = entry.BoundingBoxInternal(RIL.TEXTLINE)
            text_list.append({
                "text": content,
                "left": box[0],
                "top": box[1],
                "width": box[2] - box[0],
                "height": box[3] - box[1],
            })

    return {"texts": text_list}
Beispiel #9
0
 def __decode_words(self, iterator):
     """Decode the words of the current text line into ``Word`` objects.

     Iteration stops after the word that is the last one of its text line.
     Every symbol of a word shares one ``Font`` built from the word's font
     attributes.
     """
     # (Font attribute name, key in the Tesseract font-attribute dict)
     font_fields = (
         ("bold", "bold"),
         ("italic", "italic"),
         ("underline", "underlined"),
         ("monospace", "monospace"),
         ("serif", "serif"),
         ("pointsize", "pointsize"),
         ("id", "font_id"),
     )
     decoded = []
     for current in iterate_level(iterator, RIL.WORD):
         # Read the font attributes before the symbols are decoded, since
         # decoding walks the shared iterator forward.
         attributes = current.WordFontAttributes()
         entry = Word()
         corners = current.BoundingBox(RIL.WORD)
         entry.bounding_box = BoundingBox.from_coordinates(*corners)
         entry.confidence = float(current.Confidence(RIL.WORD)) / 100.0
         entry.text = current.GetUTF8Text(RIL.WORD)
         entry.symbols = self.__decode_symbols(iterator)

         shared_font = Font()
         for attr_name, key in font_fields:
             setattr(shared_font, attr_name, attributes[key])
         for glyph in entry.symbols:
             glyph.font = shared_font

         decoded.append(entry)
         if iterator.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
             break
     return decoded
Beispiel #10
0
 def Ocrtext(self, scx_rgb, x1, y1, x2, y2):
     """OCR a screen region as a single Chinese text line.

     The region is prepared by ``self.__Ocr`` and recognised with
     ``lang='chi_sim'``, ``psm=7``, ``oem=1``.

     Returns:
         A list of dicts, one per text line, with the keys ``text``,
         ``left``, ``top``, ``right``, ``bottom`` and ``weight``.
     """
     image_array1 = self.__Ocr(scx_rgb, x1, y1, x2, y2)
     _data_list = list()
     with PyTessBaseAPI(lang='chi_sim', psm=7, oem=1) as api:
         level = RIL.TEXTLINE  # work at text-line granularity
         img = Image.fromarray(image_array1)
         api.SetImage(img)
         api.Recognize()
         ri = api.GetIterator()
         for r in iterate_level(ri, level):
             try:
                 symbol = r.GetUTF8Text(level)
                 r.Confidence(level)
                 boxes = r.BoundingBox(level)
                 # The original literal repeated the "weight" key, so
                 # boxes[2] was silently lost. "weight" keeps the value it
                 # always ended up with (boxes[3]) for existing consumers;
                 # "right"/"bottom" expose both coordinates.
                 dict_ = {
                     "text": symbol,
                     "left": boxes[0],
                     "top": boxes[1],
                     "right": boxes[2],
                     "bottom": boxes[3],
                     "weight": boxes[3],
                 }
                 _data_list.append(dict_)
             except Exception:
                 # Best effort: a line without text is reported and skipped.
                 print("没有字符")
     return _data_list
 def get_region(self, xml_box, padding):
     """Collect the words currently held by ``self.api`` with adjusted boxes.

     Each box is shrunk by ``padding // 2`` on every side, scaled by
     ``72 / 300`` and shifted by ``xml_box[0]`` (x) and ``xml_box[1]`` (y).

     Returns:
         A list of ``[x0, y0, x1, y1, word]`` lists; words that are empty
         after stripping, or whose processing raises, are left out.
     """
     inset = padding // 2
     collected = []
     for item in iterate_level(self.api.GetIterator(), RIL.WORD):
         try:
             raw = item.GetUTF8Text(RIL.WORD)
             coords = [float(v) for v in item.BoundingBox(RIL.WORD)]
             # Pull the box inwards before converting units.
             coords[0] += inset
             coords[1] += inset
             coords[2] -= inset
             coords[3] -= inset
             coords = [float(v) * 72 / 300 for v in coords]
             for axis in range(4):
                 coords[axis] += xml_box[axis % 2]
             cleaned = raw.strip()
             if cleaned:
                 coords.append(cleaned)
                 collected.append(coords)
         except Exception as e:
             # Any failure on a word simply skips that word.
             pass
     return collected
def iterate_words(img):
    """Recognise an image file and return its words with boxes and confidence.

    Args:
        img: Path of the image file handed to ``SetImageFile``.

    Returns:
        ``{}`` when no text was recognised, otherwise
        ``{'text': <full text>, 'words': [{'text', 'boundingBox',
        'confidence'}, ...]}``. Whitespace-only words are skipped.
    """
    with PyTessBaseAPI(psm=PSM.AUTO, oem=OEM.LSTM_ONLY) as api:
        api.SetImageFile(img)
        text = api.GetUTF8Text()

        if text == '':
            return {}

        data = {'text': text, 'words': []}
        api.Recognize()
        level = RIL.WORD

        for r in iterate_level(api.GetIterator(), level):
            word = r.GetUTF8Text(level)
            if word.isspace():
                continue

            conf = r.Confidence(level)
            word_boundingBox = convert_boundingBox(r.BoundingBox(level))
            data['words'].append({'text': word, 'boundingBox': word_boundingBox, 'confidence': conf})

        return data
Beispiel #13
0
def image_to_string(img, lang):
    """Recognise ``img`` and return its non-empty words.

    Recognition is restricted to space, newline and the characters in
    ``CHARSET``. A word following one that ends in letters plus a dash is
    glued onto it (the dash is removed).

    Returns:
        A ``filter`` object yielding the non-empty words.
    """
    # Letters followed by a trailing dash: the word continues in the next one.
    hyphenated = regex.compile(r"[\p{Lu}\p{Ll}]+\-$")
    word_level = tesserocr.RIL.WORD
    collected = []

    with tesserocr.PyTessBaseAPI(lang=lang, psm=3) as api:
        api.SetVariable("tessedit_char_whitelist", " \n" + CHARSET)
        api.SetImage(img)
        api.Recognize()

        for item in tesserocr.iterate_level(api.GetIterator(), word_level):
            try:
                token = item.GetUTF8Text(word_level)
            except RuntimeError:
                continue
            item.Confidence(word_level)

            if collected and hyphenated.match(collected[-1]):
                collected[-1] = collected[-1][:-1] + token
            else:
                collected.append(token)

    return filter(bool, collected)
Beispiel #14
0
    def _line_height(self, polygon):
        """Return ``(n_lines, line_height, text)`` for the given polygon.

        Results are cached in ``self._ocr`` under the polygon's centroid
        coordinates, so Tesseract only runs once per centroid.

        ``n_lines`` is the number of text lines Tesseract finds inside the
        polygon and ``line_height`` the smallest of their heights; when no
        line is found they fall back to 1 and the height of the polygon's
        bounding box. ``text`` is only filled in when ``self._debug`` is set.
        """
        key = tuple(polygon.centroid.coords[0])
        if key not in self._ocr:
            from .utils import polygons_to_mask
            mask = polygons_to_mask(self._unbinarized.shape, [polygon])

            # Integer bounding box that fully contains the polygon.
            minx, miny, maxx, maxy = polygon.bounds
            minx, miny = numpy.floor(numpy.array([minx,
                                                  miny])).astype(numpy.int32)
            maxx, maxy = numpy.ceil(numpy.array([maxx,
                                                 maxy])).astype(numpy.int32)

            # NOTE(review): if self._unbinarized is a numpy array this slice
            # is a view, so the assignment below also changes
            # self._unbinarized itself - confirm that is intended.
            pixels = self._unbinarized[miny:maxy, minx:maxx]
            mask = mask[miny:maxy, minx:maxx]
            # Everything outside the polygon is set to 255.
            pixels[numpy.logical_not(mask)] = 255

            with tesserocr.PyTessBaseAPI(
                    psm=tesserocr.PSM.SINGLE_BLOCK) as api:
                api.SetImage(PIL.Image.fromarray(pixels, "L"))

                # Collect the height ("h") of every detected text line.
                heights = []
                for i, data in enumerate(api.GetTextlines()):
                    bbox = data[1]
                    heights.append(bbox["h"])

                if heights:
                    n_lines = len(heights)
                    lh = numpy.min(heights)
                else:
                    # No line detected: treat the whole box as one line.
                    lh = maxy - miny
                    n_lines = 1

                if self._debug:
                    # Full recognition is only run in debug mode.
                    api.Recognize()

                    ri = api.GetIterator()
                    level = tesserocr.RIL.TEXTLINE

                    text = ""
                    #lines = []
                    for r in tesserocr.iterate_level(ri, level):
                        #baseline = r.Baseline(level)
                        #if baseline:
                        #	p1, p2 = baseline
                        #	lines.append(shapely.geometry.LineString([p1, p2]))

                        try:
                            text += r.GetUTF8Text(level) + " "
                        except RuntimeError:
                            # Lines for which no text can be read are skipped.
                            pass

                    #print("txt", text.strip(), "lh", lh, "#", n_lines)
                else:
                    text = ""

            self._ocr[key] = (n_lines, lh, text)

        return self._ocr[key]
def processa(path='imagem.jpg'):
    """OCR an image in Portuguese, save the text and run the translator.

    The image is first passed to ``cropa``; recognition then reads
    ``tmp.png``/``tmp.jpeg``/``tmp.jpg`` (chosen from the extension found in
    ``path``) and falls back to ``path`` itself when that file cannot be
    loaded. The recognised, non-blank lines are written to
    ``textscanner.txt`` and ``translator.py`` is run on that file.

    Args:
        path: Path of the input image.
    """
    # The 'C' locale is set before the Tesseract API is created.
    locale.setlocale(locale.LC_ALL, 'C')
    with PyTessBaseAPI(lang='por') as api:
        start_time = time.time()
        print('pre')
        # presumably writes the cropped image to tmp.<ext> - verify in cropa
        cropa(path)
        print('pro')
        try:
            if '.png' in path:
                api.SetImageFile('tmp.png')
            elif '.jpeg' in path:
                api.SetImageFile('tmp.jpeg')
            else:
                api.SetImageFile('tmp.jpg')
        except RuntimeError:
            api.SetImageFile(path)

        api.SetVariable("save_blob_choices", "T")
        api.Recognize()

        level = RIL.TEXTLINE
        lines = []
        for r in iterate_level(api.GetIterator(), level):
            stripped = r.GetUTF8Text(level).strip()
            if stripped:
                lines.append(stripped)

        text = '\n'.join(lines)
        print(text)

        locale.setlocale(locale.LC_ALL, 'pt_BR.UTF-8')
        # ``text`` is always a str here, so the output is always written.
        # The context manager closes the file even if the write fails.
        with open('textscanner.txt', 'w') as file:
            file.write(text)
        os.system('python2 translator.py textscanner.txt')
        print("Elapsed time: {}".format(time.time() - start_time))
def orcTitle(path):
    """Guess a PDF's title from the largest text on its first page.

    The first page is rendered to the module-level ``temp_file`` as JPEG
    and recognised line by line. Lines with more than one non-whitespace
    character whose point size is within 15 of the largest such line are
    joined, in reading order, into the title.

    Args:
        path: Path of the PDF file.

    Returns:
        The title as a single space-separated string (may be empty).
    """
    # pdf2image page numbers are 1-based.
    page = convert_from_path(path, first_page=1, last_page=1)[0]

    try:
        page.save(temp_file, 'JPEG')

        with PyTessBaseAPI() as api:
            api.SetImageFile(temp_file)
            api.Recognize()  # required before the iterator yields results

            # One pass: remember (text, point size) of every usable line.
            level = RIL.TEXTLINE
            candidates = []
            for r in iterate_level(api.GetIterator(), level):
                text = r.GetUTF8Text(level)
                attributes = r.WordFontAttributes()
                if not attributes:
                    # No font information for this line; cannot rank it.
                    continue
                # Ignore whitespace when applying the minimum length.
                if len(''.join(text.split())) > 1:
                    candidates.append((text, attributes['pointsize']))

        maxSize = max([size for _, size in candidates] + [0])

        title_list = []
        for text, fontSize in candidates:
            if fontSize > maxSize - 15:
                title_list.extend(text.split())

        return ' '.join(title_list)
    finally:
        # Always clean up the rendered page, even when OCR fails.
        if os.path.exists(temp_file):
            os.remove(temp_file)
Beispiel #17
0
def get_text_from_box(fn, x, y, w, h):
    """
        Functionality: recognise the word(s) inside a rectangle of an image.
        For debugging, every non-empty word is printed with its confidence
        and its crop is saved as '<counter> debug.png'.

        Args:
            fn: image file path handed to Image.open
            x: x coordinate of the upper left corner of the rectangle
            y: y coordinate of the upper left corner of the rectangle
            w: width of the rectangle
            h: height of the rectangle

        Returns:
            a list of word objects with confidence, text, x, y, width and
            height set (the remaining fields are left as None)
        """
    image = Image.open(fn)
    pixels = np.array(image)
    boxes = []
    found = []
    with PyTessBaseAPI() as api:
        api.SetImage(image)
        api.SetVariable("save_blob_choices", "T")
        api.SetRectangle(x, y, w, h)
        api.Recognize()

        word_level = RIL.WORD
        saved = 0
        for item in iterate_level(api.GetIterator(), word_level):
            try:
                text = item.GetUTF8Text(word_level)
                confidence = item.Confidence(word_level)
                corners = item.BoundingBox(word_level)

                entry = word.Word(None, None, None, None, None, None, None)
                entry.confidence = confidence
                entry.text = text
                entry.x = corners[0]
                entry.y = corners[1]
                entry.width = corners[2] - corners[0]
                entry.height = corners[3] - corners[1]
                found.append(entry)

                crop = Image.fromarray(
                    pixels[corners[1]:corners[3], corners[0]:corners[2]])
                # Debugging output only.
                if text:
                    print(text + " " + str(confidence))
                    crop.save(str(saved) + ' debug.png')
                    saved += 1
            except RuntimeError:
                print('No text returned')
                continue

    return found
Beispiel #18
0
 def __decode_lines(self, iterator):
     """Decode the text lines of the current paragraph into ``TextLine``s.

     Iteration stops after the line that is the last one of its paragraph.
     """
     decoded = []
     for current in iterate_level(iterator, RIL.TEXTLINE):
         corners = current.BoundingBox(RIL.TEXTLINE)
         entry = TextLine()
         entry.bounding_box = BoundingBox.from_coordinates(*corners)
         # Decoding the words walks the shared iterator forward.
         entry.words = self.__decode_words(iterator)
         decoded.append(entry)
         reached_end = iterator.IsAtFinalElement(RIL.PARA, RIL.TEXTLINE)
         if reached_end:
             break
     return decoded
Beispiel #19
0
 def __decode_paragraphs(self, iterator):
     """Decode the paragraphs of the current block into ``Paragraph``s.

     Iteration stops after the paragraph that is the last one of its block.
     """
     decoded = []
     for current in iterate_level(iterator, RIL.PARA):
         corners = current.BoundingBox(RIL.PARA)
         entry = Paragraph()
         entry.bounding_box = BoundingBox.from_coordinates(*corners)
         # Decoding the lines walks the shared iterator forward.
         entry.lines = self.__decode_lines(iterator)
         decoded.append(entry)
         reached_end = iterator.IsAtFinalElement(RIL.BLOCK, RIL.PARA)
         if reached_end:
             break
     return decoded
Beispiel #20
0
    def Ocrtext(self,
                scx_rgb,
                x1,
                y1,
                x2,
                y2,
                ril=RIL.TEXTLINE,
                lang='chi_sim',
                psm=7,
                oem=1,
                attribute=None,
                THRESH_GAUSSIAN=False):
        """OCR a screen region and return the recognised elements.

        Args:
            scx_rgb: Image passed to ``self.__Ocr`` when ``THRESH_GAUSSIAN``
                is false.
            x1, y1, x2, y2: Corners of the region to recognise.
            ril: Iterator level at which results are reported.
            lang, psm, oem: Settings passed to ``PyTessBaseAPI``.
            attribute: Optional ``(name, value)`` pair applied with
                ``SetVariable`` before recognition.
            THRESH_GAUSSIAN: When true, a fresh screenshot from
                ``self.Print_screen()`` is cropped and binarized with
                Gaussian adaptive thresholding instead of using
                ``self.__Ocr``.

        Returns:
            A list of dicts, one per element, with the keys ``text``,
            ``left``, ``top``, ``right``, ``bottom`` and ``weight``.
        """
        if THRESH_GAUSSIAN:
            tpl = self.Print_screen()
            tpl = tpl[y1:y2, x1:x2]
            tpl = cv2.cvtColor(tpl, cv2.COLOR_RGB2GRAY)
            image_array1 = cv2.adaptiveThreshold(
                tpl, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
                11, 2)
        else:
            image_array1 = self.__Ocr(scx_rgb, x1, y1, x2, y2)

        _data_list = list()
        with PyTessBaseAPI(lang=lang, psm=psm, oem=oem) as api:
            level = ril
            img = Image.fromarray(image_array1)
            if attribute:
                api.SetVariable(attribute[0], attribute[1])
            api.SetImage(img)
            api.Recognize()
            ri = api.GetIterator()
            for r in iterate_level(ri, level):
                try:
                    symbol = r.GetUTF8Text(level)
                    r.Confidence(level)
                    boxes = r.BoundingBox(level)
                    # The original literal repeated the "weight" key, so
                    # boxes[2] was silently lost. "weight" keeps the value
                    # it always ended up with (boxes[3]) for existing
                    # consumers; "right"/"bottom" expose both coordinates.
                    dict_ = {
                        "text": symbol,
                        "left": boxes[0],
                        "top": boxes[1],
                        "right": boxes[2],
                        "bottom": boxes[3],
                        "weight": boxes[3],
                    }
                    _data_list.append(dict_)
                except Exception:
                    # Best effort: an element without text is reported and
                    # skipped.
                    print("没有字符")
        return _data_list
Beispiel #21
0
 def __decode_blocks(self, iterator, image):
     """Decode every block of the page into ``Block`` objects.

     A block whose text is blank gets its ``image`` set from the page
     image; any other block gets its paragraphs decoded.
     """
     decoded = []
     for current in iterate_level(iterator, RIL.BLOCK):
         corners = current.BoundingBox(RIL.BLOCK)
         entry = Block()
         entry.bounding_box = BoundingBox.from_coordinates(*corners)
         has_text = bool(current.GetUTF8Text(RIL.BLOCK).strip())
         if has_text:
             entry.paragraphs = self.__decode_paragraphs(iterator)
         else:
             entry.image = current.GetImage(RIL.BLOCK, 0, image)
         decoded.append(entry)
     return decoded
Beispiel #22
0
def get_font(image_path):
    """Print the font attributes reported for every symbol of an image.

    For each symbol the attribute dict of its word is printed, followed by
    the symbol and the ``font_name`` entry when the symbol is non-empty.
    """
    symbol_level = RIL.SYMBOL
    with PyTessBaseAPI() as api:
        api.SetImageFile(image_path)
        api.Recognize()
        results = api.GetIterator()
        print(results)
        for current in iterate_level(results, symbol_level):
            glyph = current.GetUTF8Text(symbol_level)
            attributes = current.WordFontAttributes()
            print(attributes)
            if not glyph:
                continue
            print(u'symbol {}, font: {}'.format(
                glyph, attributes['font_name']))
Beispiel #23
0
def reconhece(nomeArquivo):
    """Recognise the digits and upper-case letters in an image file.

    Recognition is limited to the characters ``0-9A-Z`` and to a rectangle
    starting at the top-left corner whose width is the image width rounded
    down to a multiple of four and whose height is the image height.

    Args:
        nomeArquivo: Path of the image file.

    Returns:
        The recognised symbols concatenated into one string.
    """
    # Only the dimensions are needed; close the file handle right away.
    with Image.open(nomeArquivo) as imageDados:
        w, h = imageDados.size

    # Floor division keeps the width an int on Python 3; true division
    # would hand a float to SetRectangle.
    slice_w = (w // 4) * 4

    simbolos = []
    with PyTessBaseAPI() as api:
        api.SetImageFile(nomeArquivo)
        api.SetVariable("tessedit_char_whitelist",
                        "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")
        api.SetVariable("save_blob_choices", "T")
        api.SetRectangle(0, 0, slice_w, h)
        api.Recognize()

        level = RIL.SYMBOL
        for r in iterate_level(api.GetIterator(), level):
            symbol = r.GetUTF8Text(level)
            if symbol:
                simbolos.append(symbol)

    return ''.join(simbolos)
Beispiel #24
0
def get_font(image_path):
    """Print every non-empty symbol of an image with its word's font data."""
    symbol_level = RIL.SYMBOL
    with PyTessBaseAPI() as api:
        api.SetImageFile(image_path)
        api.Recognize()
        results = api.GetIterator()

        for current in iterate_level(results, symbol_level):
            glyph = current.GetUTF8Text(symbol_level)
            attributes = current.WordFontAttributes()
            if not glyph:
                continue
            print(f'symbol {glyph}, font: {attributes}')
Beispiel #25
0
 def __decode_symbols(self, iterator):
     """Decode the symbols of the current word into ``Symbol`` objects.

     Iteration stops after the symbol that is the last one of its word.
     """
     decoded = []
     for current in iterate_level(iterator, RIL.SYMBOL):
         entry = Symbol()
         corners = current.BoundingBox(RIL.SYMBOL)
         entry.bounding_box = BoundingBox.from_coordinates(*corners)
         raw_confidence = current.Confidence(RIL.SYMBOL)
         entry.confidence = float(raw_confidence) / 100.0
         entry.text = current.GetUTF8Text(RIL.SYMBOL)
         binary = current.GetBinaryImage(RIL.SYMBOL)
         entry.image = binary.convert('1', dither=Image.NONE)
         decoded.append(entry)
         reached_end = iterator.IsAtFinalElement(RIL.WORD, RIL.SYMBOL)
         if reached_end:
             break
     return decoded
Beispiel #26
0
def classiferChoices(ri):
    """Print every symbol of a result iterator with its classifier choices.

    Args:
        ri: A Tesseract result iterator positioned at the start.

    The original used Python 2 ``print`` statements, which do not compile
    on Python 3. ``end=' '`` stands in for the trailing comma of those
    statements (stay on the same line, separated by a space).
    """
    level = RIL.SYMBOL
    for r in iterate_level(ri, level):
        symbol = r.GetUTF8Text(level)
        conf = r.Confidence(level)
        if symbol:
            print(u'symbol {}, conf: {}'.format(symbol, conf), end=' ')
        indent = False
        ci = r.GetChoiceIterator()
        for c in ci:
            if indent:
                print('\t\t ', end=' ')
            print('\t- ', end=' ')
            choice = c.GetUTF8Text()
            print(u'{} conf: {}'.format(choice, c.Confidence()))
            indent = True
        print('---------------------------------------------')
Beispiel #27
0
 def test_result_iterator(self):
     """Check text and leading blanks of the first two recognised words."""
     # (expected text, expected blanks before the word), in reading order.
     expected = (("The", 0), ("(quick)", 1))

     self._api.SetImageFile(self._image_file)
     self._api.Recognize()
     word_level = tesserocr.RIL.WORD
     words = tesserocr.iterate_level(self._api.GetIterator(), word_level)
     for index, current in enumerate(words):
         text = current.GetUTF8Text(word_level)
         blanks = current.BlanksBeforeWord()
         if index >= len(expected):
             break
         want_text, want_blanks = expected[index]
         self.assertEqual(text, want_text)
         self.assertEqual(blanks, want_blanks)
def tesseract(path, filename, conf_dir, text_dir):
    """OCR one image and store its text and per-symbol confidences.

    Two files named after ``filename`` without its last four characters are
    written when at least one symbol was recognised:
    ``<conf_dir>/<name>.prob`` with one ``symbol<TAB>confidence`` line per
    symbol (confidence scaled to 0..1), and ``<text_dir>/<name>.txt`` with
    the recognised text minus its final character.

    Args:
        path: Directory containing the image.
        filename: Image file name inside ``path``.
        conf_dir: Directory for the ``.prob`` file.
        text_dir: Directory for the ``.txt`` file.

    Returns:
        None. Nothing is written when recognition fails.
    """
    with PyTessBaseAPI() as api:
        pathFilename = path + "/" + filename

        try:
            api.SetImageFile(pathFilename)
            # Drop the final character of the recognised text.
            label_text = api.GetUTF8Text()[:-1]
            api.SetVariable("save_blob_choices", "T")
            api.Recognize()
            ri = api.GetIterator()
        except Exception:
            # Best effort: an image that cannot be processed is skipped.
            return

        # Collect "<symbol>\t<confidence>" for every non-break symbol.
        level = RIL.SYMBOL
        entries = []
        for r in iterate_level(ri, level):
            try:
                symbol = r.GetUTF8Text(level)
                conf = 0.01 * r.Confidence(level)
                if symbol not in ['\n', '\r', '\t', '\f']:
                    entries.append(symbol + "\t" + str(conf) + "\n")
            except Exception:
                continue
        conf_text = "".join(entries)

        if conf_text:
            basename = filename[:-4]
            # Write str to UTF-8 text files; writing encoded bytes to a
            # text-mode file raises TypeError on Python 3.
            conf_pathFilename = conf_dir + "/" + basename + ".prob"
            with open(conf_pathFilename, "w", encoding="utf-8") as f:
                f.write(conf_text)

            text_pathFilename = text_dir + "/" + basename + ".txt"
            with open(text_pathFilename, "w", encoding="utf-8") as f:
                f.write(label_text)
Beispiel #29
0
def symbolConfidenc(img):
    """Return the confidently recognized words of an image as one string.

    The image file ``img`` is recognized with Tesseract and iterated word
    by word. Every word whose confidence is above 50 is kept, each one
    prefixed with a single space, so a non-empty result starts with a
    space. An empty string is returned when no word qualifies.
    """
    pieces = []
    with PyTessBaseAPI() as api:
        api.SetImageFile(img)
        api.Recognize()

        word_level = RIL.WORD
        for item in iterate_level(api.GetIterator(), word_level):
            # Read the text first, then the confidence, for every word.
            text = item.GetUTF8Text(word_level)
            confidence = item.Confidence(word_level)
            if confidence > 50:
                pieces.append(' ' + text)
    return ''.join(pieces)
def find_word_attribute(image, tessdata_3_path):
    """Collect each recognized word together with its font attributes.

    The image is opened, passed through ``scale_image`` and recognized
    word by word. Words that have a bounding box, a text and font
    attributes are kept.

    Args:
        image: Image source accepted by ``Image.open``.
        tessdata_3_path: tessdata directory handed to
            ``PyTessBaseAPI(path=...)``.
            NOTE(review): the name suggests Tesseract 3 (legacy) data is
            expected -- confirm.

    Returns:
        A DataFrame whose first column is ``"Word"``, followed by the
        columns pandas derives from the ``WordFontAttributes()`` values.
        When no word is found the frame is empty but still has the
        ``"Word"`` column.
    """
    # Read and scale the image before recognition.
    img = scale_image(Image.open(image))

    words = []
    font_attrs = []

    with PyTessBaseAPI(path=tessdata_3_path) as api:
        api.SetImage(img)
        api.Recognize(0)

        level = RIL.WORD
        for r in iterate_level(api.GetIterator(), level):
            if r.BoundingBox(level) is None:
                continue

            word = r.GetUTF8Text(level)
            attrs = r.WordFontAttributes()
            # Keep the two lists aligned: a word is stored only together
            # with its font attributes.
            if word is not None and attrs is not None:
                words.append(word)
                font_attrs.append(attrs)

    # Name the word column at construction time. Renaming ``df.columns[0]``
    # afterwards raised IndexError whenever no word had been recognized,
    # because the concatenated frame then has no columns at all.
    word_frame = pd.DataFrame({"Word": words})
    attr_frame = pd.DataFrame(font_attrs)
    return pd.concat([word_frame, attr_frame], axis=1)
Beispiel #31
0
def get_boxes(image_filename: str) -> list:
    """Recognize the words of an image and return their bounding boxes.

    The image is recognized with the ``jpn_vert`` language in
    ``PSM.SPARSE_TEXT_OSD`` page segmentation mode. Every word is printed
    with its confidence and coordinates. Words for which
    ``should_ignore_box`` returns a true value are left out of the result.

    Args:
        image_filename: Path of the image to open.

    Returns:
        A list of dicts with the keys ``'text'``, ``'left'``, ``'top'``,
        ``'width'`` and ``'height'``, one per kept word.
    """
    boxes = []
    # Context managers release the image and the Tesseract API even when
    # recognition raises; the API is ended first, then the image is closed.
    with Image.open(image_filename) as image, \
            PyTessBaseAPI(lang="jpn_vert") as api:
        # Half the image size is passed to should_ignore_box as the limits.
        max_width = image.width // 2
        max_height = image.height // 2

        api.SetPageSegMode(PSM.SPARSE_TEXT_OSD)
        api.SetImage(image)
        api.Recognize(0)

        level = RIL.WORD
        for r in iterate_level(api.GetIterator(), level):
            conf = r.Confidence(level)
            text = r.GetUTF8Text(level)
            bounds = r.BoundingBox(level)
            if bounds is None:
                # No box at this position: nothing to report for this word.
                continue
            left, top, right, bottom = bounds

            print("'%s' \tConf: %.2f \tCoords: %d,%d,%d,%d" %
                  (text, conf, left, top, right, bottom))
            box = {
                'text': text,
                'left': left,
                'top': top,
                'width': right - left,
                'height': bottom - top
            }
            if should_ignore_box(conf, box, max_width, max_height):
                continue
            boxes.append(box)
    return boxes
        w = line[1]['w']
        h = line[1]['h']
        img = cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 3)
    result = Image.fromarray(img)

    print "Shape of the original image: "
    print result.size
    # result.show()
    # img = cv2.rectangle(img, (x, y), (x + w, y + h), (255, 0, 0), 3

    # print api.GetBoxText()
    # api.GetThresholdedImage().show()
    iterator = api.GetIterator()
    iterator.Begin()
    level = RIL.SYMBOL
    for r in iterate_level(iterator, level):
        # print r.BoundingBox(level)
        x = r.BoundingBox(level)[0]
        y = r.BoundingBox(level)[1]
        x_2 = r.BoundingBox(level)[2]
        y_2 = r.BoundingBox(level)[3]

        img = cv2.rectangle(img, (x, y), (x_2, y_2), (0, 255, 0), 3) # Draw a green rectangle around each character found by OCR

    out = Image.fromarray(img)
    #out.show()
    out.save("out.png")
    f.close()
    # Need to kill iterator to clear memory====

    # Want to show the bounding box of L1 of the SKU: