Example 1
from typing import Tuple

from tesserocr import PyTessBaseAPI, PSM


class TextExtractor:
    def __init__(self, image_path, seg_mode=PSM.SPARSE_TEXT):
        self.api = PyTessBaseAPI()
        self.api.SetPageSegMode(seg_mode)
        self.api.SetImageFile(image_path)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def _extract(self) -> Tuple:
        text = self.api.GetUTF8Text()
        conf = self.api.MeanTextConf()
        return text, conf

    def _extract_from_rect(self, x, y, w, h) -> Tuple:
        self.api.SetRectangle(x, y, w, h)
        return self._extract()

    def extract(self, x=None, y=None, w=None, h=None) -> Tuple:
        # Only None means "not given", so zero coordinates are valid too.
        if all(v is not None for v in (x, y, w, h)):
            return self._extract_from_rect(x, y, w, h)
        else:
            return self._extract()

    def close(self):
        self.api.End()
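
A minimal usage sketch for the class above; `page.png` is a placeholder path:

with TextExtractor('page.png') as extractor:
    text, conf = extractor.extract()               # full page
    snippet, _ = extractor.extract(0, 0, 200, 50)  # top-left rectangle
    print(conf, text)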
Example 2
    def get_api(self, languages):
        if not hasattr(self.thread, 'api'):
            from tesserocr import PyTessBaseAPI, PSM
            api = PyTessBaseAPI(lang=languages)
            api.SetPageSegMode(PSM.AUTO_OSD)
            self.thread.api = api
        return self.thread.api
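
The method assumes `self.thread` is per-thread storage, so each worker thread gets its own tesseract instance (a single `PyTessBaseAPI` should not be shared across threads). A minimal sketch of that assumed context, with `OcrWorker` as a hypothetical host class:

import threading

class OcrWorker:
    def __init__(self):
        # Assumed context: per-thread attribute storage, so get_api()
        # above caches one PyTessBaseAPI per thread.
        self.thread = threading.local()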
Example 3
def ocr(img, level):
    """Use tesseract layout analysis to detect blocks in an image.

    Args:
        img: Image as a numpy array.
        level: Page iterator level (a tesserocr RIL value).

    Returns:
        A list of bounding boxes, each with its block type appended.

    """
    result = []
    # c_locale is a project helper that forces the "C" locale,
    # which tesseract requires during layout analysis.
    with c_locale():
        from tesserocr import PyTessBaseAPI, PSM
        from PIL import Image
        api = PyTessBaseAPI()
        api.SetPageSegMode(PSM.AUTO_OSD)
        api.SetImage(Image.fromarray(img))
        blockIter = api.AnalyseLayout()
        while blockIter.Next(level):
            pt = blockIter.BlockType()
            if pt in [1, 6]:  # PolyBlockType: 1 = flowing text, 6 = table
                result.append(blockIter.BoundingBox(level) + (pt, ))
        api.End()
    return result
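
A hedged usage sketch for `ocr()`, assuming `c_locale` is in scope and using a placeholder file name:

import cv2
from tesserocr import RIL

img = cv2.imread('page.png')  # placeholder input file, numpy array
for left, top, right, bottom, block_type in ocr(img, RIL.BLOCK):
    print(block_type, left, top, right, bottom)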
Example 4
from tesserocr import PyTessBaseAPI, PSM


def _create_tesseract():
    # Disable all dictionary (dawg) word lists, useful when recognising
    # arbitrary uppercase strings rather than dictionary words.
    tesseract = PyTessBaseAPI()
    tesseract.SetVariable("load_system_dawg", "F")
    tesseract.SetVariable("load_freq_dawg", "F")
    tesseract.SetVariable("load_punc_dawg", "F")
    tesseract.SetVariable("load_number_dawg", "F")
    tesseract.SetVariable("load_unambig_dawg", "F")
    tesseract.SetVariable("load_bigram_dawg", "F")
    tesseract.SetVariable("load_fixed_length_dawgs", "F")

    tesseract.SetVariable("classify_enable_learning", "F")
    tesseract.SetVariable("classify_enable_adaptive_matcher", "F")

    tesseract.SetVariable("segment_penalty_garbage", "F")
    tesseract.SetVariable("segment_penalty_dict_nonword", "F")
    tesseract.SetVariable("segment_penalty_dict_frequent_word", "F")
    tesseract.SetVariable("segment_penalty_dict_case_ok", "F")
    tesseract.SetVariable("segment_penalty_dict_case_bad", "F")

    tesseract.SetVariable("edges_use_new_outline_complexity", "T")
    tesseract.SetVariable("tessedit_char_whitelist",
                          "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    tesseract.SetPageSegMode(PSM.SINGLE_LINE)

    return tesseract
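
A usage sketch for the factory above; the file name is a placeholder:

from PIL import Image

api = _create_tesseract()
api.SetImage(Image.open('label.png').convert('L'))  # placeholder input file
print(api.GetUTF8Text().strip(), api.MeanTextConf())
api.End()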
Example 5
from tesserocr import PyTessBaseAPI, PSM


def read_char(image, whitelist=None):
    """OCR a single character from an image. Useful for captchas."""
    api = PyTessBaseAPI()
    api.SetPageSegMode(PSM.SINGLE_CHAR)
    if whitelist is not None:
        api.SetVariable("tessedit_char_whitelist", whitelist)
    api.SetImage(image)
    api.Recognize()
    return api.GetUTF8Text().strip()
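
A usage sketch; the file name and whitelist are placeholders:

from PIL import Image

char = read_char(Image.open('captcha_char.png'),  # placeholder input file
                 whitelist='0123456789')
print(char)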
Example 6
from io import BytesIO

from PIL import Image
from tesserocr import OEM, PSM, PyTessBaseAPI

# get_languages, alpha3 and log are helpers from the surrounding project.


class OCR(object):
    MAX_MODELS = 5
    MIN_WIDTH = 10
    MIN_HEIGHT = 10

    def __init__(self):
        # Tesseract language types:
        _, self.supported = get_languages()
        self.reset_engine('eng')

    def language_list(self, languages):
        models = [c for c in alpha3(languages) if c in self.supported]
        if len(models) > self.MAX_MODELS:
            log.warning("Too many models, limit: %s", self.MAX_MODELS)
            models = models[:self.MAX_MODELS]
        models.append('eng')
        return '+'.join(sorted(set(models)))

    def reset_engine(self, languages):
        if hasattr(self, 'api'):
            self.api.Clear()
            self.api.End()
        self.api = PyTessBaseAPI(lang=languages, oem=OEM.LSTM_ONLY)

    def extract_text(self, data, languages=None, mode=PSM.AUTO_OSD):
        """Extract text from a binary string of data."""
        languages = self.language_list(languages)
        if languages != self.api.GetInitLanguagesAsString():
            self.reset_engine(languages)

        try:
            image = Image.open(BytesIO(data))
            # TODO: play with contrast and sharpening the images.
            if image.width <= self.MIN_WIDTH:
                return
            if image.height <= self.MIN_HEIGHT:
                return

            if mode != self.api.GetPageSegMode():
                self.api.SetPageSegMode(mode)

            self.api.SetImage(image)
            text = self.api.GetUTF8Text()
            confidence = self.api.MeanTextConf()
            log.info("%s chars (w: %s, h: %s, langs: %s, confidence: %s)",
                     len(text), image.width, image.height, languages,
                     confidence)
            return text
        except Exception:
            log.exception("Failed to OCR: %s", languages)
        finally:
            self.api.Clear()
Example 7
import time
# (other imports and project helpers as in Example 6)


class OCR(object):
    MAX_MODELS = 5
    DEFAULT_MODE = PSM.AUTO_OSD

    # DEFAULT_MODE = PSM.AUTO

    def __init__(self):
        # Tesseract language types:
        _, self.supported = get_languages()

    def language_list(self, languages):
        models = [c for c in alpha3(languages) if c in self.supported]
        if len(models) > self.MAX_MODELS:
            log.warning("Too many models, limit: %s", self.MAX_MODELS)
            models = models[:self.MAX_MODELS]
        models.append('eng')
        return '+'.join(sorted(set(models)))

    def configure_engine(self, languages, mode):
        # log.info("Configuring OCR engine (%s)", languages)
        if not hasattr(self, 'api'):
            self.api = PyTessBaseAPI(lang=languages, oem=OEM.LSTM_ONLY)
        if languages != self.api.GetInitLanguagesAsString():
            self.api.Init(lang=languages, oem=OEM.LSTM_ONLY)
        if mode != self.api.GetPageSegMode():
            self.api.SetPageSegMode(mode)
        return self.api

    def extract_text(self, data, languages=None, mode=DEFAULT_MODE):
        """Extract text from a binary string of data."""
        languages = self.language_list(languages)
        api = self.configure_engine(languages, mode)

        try:
            image = Image.open(BytesIO(data))
            # TODO: play with contrast and sharpening the images.
            start_time = time.time()
            api.SetImage(image)
            text = api.GetUTF8Text()
            confidence = api.MeanTextConf()
            end_time = time.time()
            duration = end_time - start_time
            log.info("%s chars (w: %s, h: %s, langs: %s, c: %s), took: %.5f",
                     len(text), image.width, image.height, languages,
                     confidence, duration)
            return text
        except Exception:
            log.exception("Failed to OCR: %s", languages)
        finally:
            api.Clear()
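
A usage sketch for this version; the file name is a placeholder and it assumes the project helper `alpha3` accepts an iterable of ISO language codes:

ocr = OCR()
with open('scan.png', 'rb') as fh:  # placeholder input file
    text = ocr.extract_text(fh.read(), languages=['deu', 'eng'])
print(text)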
Example 8
from tesserocr import PyTessBaseAPI, PSM


def read_word(image, whitelist=None, chars=None, spaces=False):
    """OCR a single word from an image. Useful for captchas.
       Image should be pre-processed to remove noise etc."""
    api = PyTessBaseAPI()
    api.SetPageSegMode(PSM.SINGLE_WORD)
    if whitelist is not None:
        api.SetVariable("tessedit_char_whitelist", whitelist)
    api.SetImage(image)
    api.Recognize()
    guess = api.GetUTF8Text()

    if not spaces:
        guess = guess.replace(" ", "").strip()

    if chars is not None and len(guess) != chars:
        return guess, None

    return guess, api.MeanTextConf()
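
A usage sketch; the file name, whitelist and length are placeholders:

from PIL import Image

guess, conf = read_word(Image.open('captcha.png'),  # placeholder input file
                        whitelist='ABCDEFGHJKMNPQRSTUVWXYZ', chars=6)
if conf is None:
    print("unexpected length:", guess)
else:
    print(guess, conf)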
Example 9
def get_boxes(image_filename: str) -> list:
    image = Image.open(image_filename)
    width = image.width
    height = image.height
    max_width = width // 2
    max_height = height // 2

    api = PyTessBaseAPI(lang="jpn_vert")
    # api.ReadConfigFile("tess.conf")
    api.SetPageSegMode(PSM.SPARSE_TEXT_OSD)
    api.SetImage(image)
    api.Recognize(0)
    ri = api.GetIterator()
    level = RIL.WORD
    boxes = []
    for r in iterate_level(ri, level):
        conf = r.Confidence(level)
        text = r.GetUTF8Text(level)
        left, top, right, bottom = r.BoundingBox(level)
        # boxes = api.GetComponentImages(RIL.SYMBOL, True)
        # for im, rect, _, _ in boxes:
        #     # im is a PIL image object
        #     # rect is a dict with x, y, w and h keys
        #     left, top, right, bottom = rect['x'], rect['y'], rect['w'], rect['h']
        #     api.SetRectangle(left, top, right, bottom)
        #     text = api.GetUTF8Text()
        #     conf = api.MeanTextConf()
        print("'%s' \tConf: %.2f \tCoords: %d,%d,%d,%d" %
              (text, conf, left, top, right, bottom))
        box = {
            'text': text,
            'left': left,
            'top': top,
            'width': right - left,
            'height': bottom - top
        }
        if should_ignore_box(conf, box, max_width, max_height):
            continue
        boxes.append(box)
    api.End()
    image.close()
    return boxes
Example 10
from tesserocr import PyTessBaseAPI, PSM


def _get_tesseract():
    tesseract = PyTessBaseAPI()
    tesseract.SetVariable("tessedit_char_whitelist", "ABCDEFGHIJKLMNOPQRSTUVWXYZ' ")
    tesseract.SetPageSegMode(PSM.SINGLE_LINE)
    return tesseract
Example 11
import logging
from string import ascii_letters, digits

from PIL import ImageOps
from tesserocr import OEM, PyTessBaseAPI


class OCR():
    ocr_api = None

    def __init__(self):
        self.reset()

    def __del__(self):
        del self.ocr_api

    def reset(self):
        self.ocr_api = PyTessBaseAPI(oem=OEM.TESSERACT_ONLY)

    def ocr_filter_img(self, im):
        if im is None:
            return im
        dat = im.getdata()
        f = []
        for d in dat:
            if d[0] >= 254 and d[1] >= 254 and d[2] >= 254:  #chp catk
                f.append((0, 0, 0))
            elif d[0] <= 28 and d[1] == 255 and d[2] <= 80:  #chp catk boost
                f.append((0, 0, 0))
            elif d[0] == 255 and d[1] <= 2 and d[2] <= 2:  #chp catk malus
                f.append((0, 0, 0))
            elif d[0] <= 179 and d[0] >= 164 and d[1] <= 230 and d[
                    1] >= 211 and d[2] >= 233:  #smana
                f.append((0, 0, 0))
            elif d[0] <= 205 and d[0] >= 175 and d[1] <= 220 and d[
                    1] >= 190 and d[2] <= 235 and d[2] >= 215:  #mana
                f.append((0, 0, 0))
            elif d[0] == 245 and d[1] == 245 and d[2] == 250:  #hp
                f.append((0, 0, 0))
            elif d[0] == 246 and d[1] == 227 and d[2] == 227:  #card cost
                f.append((0, 0, 0))
            else:
                f.append((255, 255, 255))
        im.putdata(f)
        im = ImageOps.grayscale(im)
        # im = im.filter(ImageFilter.GaussianBlur(4))
        # im = ImageOps.invert(im)
        return im

    def filter_img(self, im):
        if im is None:
            return im
        # im = ImageOps.grayscale(im)
        # im = im.filter(ImageFilter.GaussianBlur(4))
        # im = ImageOps.invert(im)
        return im

    def ocr_txt(self, img):
        im = self.ocr_filter_img(img)
        self.ocr_api.SetPageSegMode(PSM.SINGLE_BLOCK)
        self.ocr_api.SetVariable('tessedit_char_whitelist', ascii_letters)
        self.ocr_api.SetVariable('tessedit_char_blacklist', digits)
        self.ocr_api.SetImage(im)
        text = self.ocr_api.GetUTF8Text().strip('\n')
        # logging.info("Btn text detected as %s", text)
        return text.lower()

    def ocr_number(self, img):
        im = self.ocr_filter_img(img)
        self.ocr_api.SetVariable('tessedit_char_whitelist', digits)
        self.ocr_api.SetVariable('tessedit_char_blacklist', ascii_letters)
        self.ocr_api.SetPageSegMode(PSM.SINGLE_WORD)
        self.ocr_api.SetImage(im)
        number = self.ocr_api.GetUTF8Text().strip('\n')
        try:
            number = int(number)
        except ValueError:
            number = -1

        logging.info("OCR number %i", number)
        return number
Example 12
from PIL import Image
from tesserocr import PSM, PyTessBaseAPI

# segment_into_lines and Blurb are helpers from the surrounding project.


def ocr_on_bounding_boxes(img, components):
    blurbs = []
    for component in components:
        (aspect, vertical, horizontal) = segment_into_lines(img, component)
        #if len(vertical)<2 and len(horizontal)<2:continue

        #attempt to separately process furigana
        #(furigana, non_furigana) = estimate_furigana(vertical)
        # From http://code.google.com/p/tesseract-ocr/wiki/ControlParams --
        # parameters some Japanese users found helpful for increasing
        # tesseract-ocr (3.02) accuracy for Japanese:
        #
        #   Name                            Suggested value  Description
        #   chop_enable                     T                Chop enable.
        #   use_new_state_cost              F                Use new state cost heuristics
        #                                                    for segmentation state evaluation.
        #   segment_segcost_rating          F                Incorporate segmentation cost
        #                                                    in word rating?
        #   enable_new_segsearch            0                Enable new segmentation search path.
        #   language_model_ngram_on         0                Turn on/off the character
        #                                                    ngram model.
        #   textord_force_make_prop_words   F                Force proportional word
        #                                                    segmentation on all rows.
        # Now run OCR on this bounding box: treat single-column lines as
        # vertical text, use automatic segmentation otherwise.
        api = PyTessBaseAPI(path='C:/Program Files/Tesseract-OCR/tessdata',
                            lang='jpn')
        if len(vertical) < 2:
            api.SetPageSegMode(PSM.SINGLE_BLOCK_VERT_TEXT)
        else:
            api.SetPageSegMode(PSM.AUTO)
        api.SetVariable('chop_enable', 'T')
        api.SetVariable('use_new_state_cost', 'F')
        api.SetVariable('segment_segcost_rating', 'F')
        api.SetVariable('enable_new_segsearch', '0')
        api.SetVariable('language_model_ngram_on', '0')
        api.SetVariable('textord_force_make_prop_words', 'F')
        api.SetVariable('tessedit_char_blacklist', '}><L')
        api.SetVariable('textord_debug_tabfind', '0')

        x = component[1].start
        y = component[0].start
        w = component[1].stop - x
        h = component[0].stop - y
        # Crop the region of interest from the numpy image and pass it to
        # tesseract as a PIL image.
        roi = img[y:y + h, x:x + w]
        api.SetImage(Image.fromarray(roi))
        txt = api.GetUTF8Text()
        conf = api.MeanTextConf()
        if conf > 0 and len(txt) > 0:
            blurb = Blurb(x, y, w, h, txt, confidence=conf)
            blurbs.append(blurb)
        '''
    for line in non_furigana:
      x = line[1].start
      y = line[0].start
      w = line[1].stop - x
      h = line[0].stop - y
      roi = img[y:y + h, x:x + w]
      api.SetImage(Image.fromarray(roi))
      txt = api.GetUTF8Text()
      conf = api.MeanTextConf()
      if conf > 0:
        blurb = Blurb(x, y, w, h, txt, confidence=conf)
        blurbs.append(blurb)
    '''
    return blurbs