Example #1
0
def lambda_handler(event, context):
    """Handle an S3 notification by running OCR on the referenced object.

    The bundled ``tesseract`` binary and ``test.png`` are copied into
    ``tmp_dir`` and made executable/readable, the object named by the first
    record of ``event`` is fetched through ``download_file``, and the result
    is processed twice: once through the ``tesseract`` helper and once
    through tesserocr, whose text and per-word confidences are printed.

    Args:
        event: S3 notification payload; only ``event['Records'][0]`` is read.
        context: Lambda context object (unused).

    Returns:
        None. All output goes to stdout.
    """
    s3_record = event['Records'][0]['s3']
    bucket = s3_record['bucket']['name']
    key = urllib.unquote_plus(s3_record['object']['key']).decode('utf8')

    shutil.copyfile("tesseract", tmp_dir + '/tesseract')
    shutil.copyfile("test.png", tmp_dir + '/test.png')
    # ``0o755`` is accepted by Python 2.6+ and Python 3; the bare ``0755``
    # spelling is a syntax error on Python 3.
    os.chmod(tmp_dir + "/tesseract", 0o755)
    os.chmod(tmp_dir, 0o755)
    os.chmod('/tmp', 0o755)

    print("before image file from s3")
    image_file = download_file(bucket, key)
    print("before image file to PIL")
    print("before OCR")
    # The helper's return value was never used; it is called for its effects.
    tesseract(image_file)

    print("Print files in firectory")
    for entry in os.listdir('/tmp'):
        print(entry)

    api = None
    try:
        print("before PyTessBaseAPI set 2")
        api = PyTessBaseAPI(path=os.path.join(SCRIPT_DIR, 'tessdata'),
                            lang='eng', psm=PSM.AUTO_OSD)
        print("After API set")
        api.SetImageFile(image_file)
        print("After API set image")
        print("TEXT from tesserocr: %s" % api.GetUTF8Text())
        print("CONFIDENCE from tesserocr: %s" % api.AllWordConfidences())
    except Exception as exc:
        # The tesserocr pass stays best-effort, but the failure is now
        # reported instead of being silently discarded.
        print("tesserocr OCR failed: %r" % (exc,))
    finally:
        if api is not None:
            api.End()
Example #2
0
class TextExtractor:
    """Context-managed wrapper around a tesserocr API bound to one image.

    Use as ``with TextExtractor(path) as ex: text, conf = ex.extract()`` so
    that the underlying API is released by ``close()`` on exit.
    """

    def __init__(self, image_path, seg_mode=PSM.SPARSE_TEXT):
        """Create the API, apply ``seg_mode`` and load ``image_path``."""
        self.api = PyTessBaseAPI()
        self.api.SetPageSegMode(seg_mode)
        self.api.SetImageFile(image_path)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def _extract(self) -> Tuple:
        """Return ``(text, mean_confidence)`` for the current region."""
        text = self.api.GetUTF8Text()
        conf = self.api.MeanTextConf()
        return text, conf

    def _extract_from_rect(self, x, y, w, h) -> Tuple:
        """Restrict recognition to the given rectangle, then extract."""
        self.api.SetRectangle(x, y, w, h)
        return self._extract()

    def extract(self, x=None, y=None, w=None, h=None) -> Tuple:
        """Extract text from the image or from a rectangle inside it.

        The rectangle is used only when all four of ``x``, ``y``, ``w`` and
        ``h`` are supplied; otherwise they are ignored. A value of ``0`` is
        a valid coordinate and counts as supplied.

        Returns:
            A ``(text, mean_confidence)`` tuple.
        """
        # Compare against None explicitly: a truthiness test would reject a
        # rectangle that starts at x == 0 or y == 0.
        if any(value is None for value in (x, y, w, h)):
            return self._extract()
        return self._extract_from_rect(x, y, w, h)

    def close(self):
        """Release the underlying tesserocr API."""
        self.api.End()
 def __init__(self, image_file, tessdata):
     """Create an AUTO_OSD tesserocr API for ``image_file`` and recognize it.

     ``tessdata`` is passed to tesserocr as the data path. Both
     table-related ``textord_*`` variables are set to ``"T"`` before
     ``Recognize()`` runs. The ready API is stored on ``self.api``.
     """
     engine = PyTessBaseAPI(path=tessdata, psm=PSM.AUTO_OSD)
     engine.SetImageFile(image_file)
     for option in ("textord_tablefind_recognize_tables",
                    "textord_tabfind_find_tables"):
         engine.SetVariable(option, "T")
     engine.Recognize()
     # Published only after every step above has succeeded.
     self.api = engine
Example #4
0
def get_lines(filename):
    '''
    Run OCR on an image and return its text lines.

    Args
    ::filename (str): Image file relative or absolute path.

    Return:
    ::list: List of lines as text from the image. Every line contain the stop
      times for a certain trip. Each returned line keeps its trailing
      newline; lines shorter than 5 characters (newline included) are
      skipped, and any text after the last newline is ignored.
    '''
    # The context manager releases the tesseract handle even if OCR fails.
    with PyTessBaseAPI() as api:
        api.SetImageFile(filename)
        text = api.GetUTF8Text()

    # Everything before the final "\n" is a complete line; the piece after it
    # (possibly empty) was never emitted by the original loop either.
    complete_lines = text.split("\n")[:-1]

    # ignore lines with less than 5 chars (H:MM), counting the newline
    return [body + "\n" for body in complete_lines if len(body) + 1 >= 5]
Example #5
0
def handler(event, context):
  """Run OCR on ``sample.jpg``, log the text and word confidences.

  Args:
    event: Invocation payload (unused).
    context: Invocation context (unused).

  Returns:
    The recognized text.
  """
  # Release the tesseract handle on every invocation, including failures.
  with PyTessBaseAPI() as api:
    api.SetImageFile("sample.jpg")
    txt = api.GetUTF8Text()
    logging.info(txt)
    logging.info(api.AllWordConfidences())
  return txt
Example #6
0
 def __init__(self, image_file, tessdata, oem_mode):
     """Create an AUTO_OSD tesserocr API for ``image_file``.

     ``oem_mode == "v3"`` selects ``OEM.TESSERACT_ONLY``; any other value
     selects ``OEM.LSTM_ONLY``. ``tessdata`` is passed as the data path and
     the resulting API is stored on ``self.api``.
     """
     engine_mode = OEM.TESSERACT_ONLY if oem_mode == "v3" else OEM.LSTM_ONLY
     engine = PyTessBaseAPI(path=tessdata, psm=PSM.AUTO_OSD, oem=engine_mode)
     engine.SetImageFile(image_file)
     self.api = engine
Example #7
0
def orientation_stuff():
    """Print the orientation and script detected for the eurotext sample.

    Uses an OSD-only tesserocr API with ``MY_TESSDATA_PATH`` as data path.
    Prints a short notice instead when nothing could be detected.
    """
    with PyTessBaseAPI(psm=PSM.OSD_ONLY, path=MY_TESSDATA_PATH) as api2:
        api2.SetImageFile('/home/johannes/Repos/tesseract/testing/eurotext.tif')

        # Original author's note: both DetectOS() and
        # DetectOrientationScript() crashed with a segmentation fault
        # (core dumped) in their environment.
        detected = api2.DetectOrientationScript()

    if detected is None:
        print("Orientation and script could not be detected")
        return

    # DetectOrientationScript() reports orient_deg / orient_conf /
    # script_name / script_conf; the orientation / oconfidence / script /
    # sconfidence names belong to DetectOS() and raise KeyError here.
    print(
        "Orientation: {orient_deg}\nOrientation confidence: {orient_conf}\n Script: {script_name}\nScript confidence: {script_conf}"
        .format(**detected))
Example #8
0
 def getText(self):
     """Return classifier output followed by OCR text for the document.

     ``self.document_text`` is written to ``test.png``. That file is then
     passed to the external ``classify_image.py`` script (its stdout is
     decoded as UTF-8) and to tesserocr. The temporary file is removed
     afterwards, even when one of the steps raises.

     Returns:
         The classifier stdout concatenated with the OCR text.
     """
     with open("test.png", "wb") as image_out:
         image_out.write(self.document_text)

     try:
         # The command line is a constant, so shell=True carries no
         # injection risk from caller data here.
         out, _ = Popen(
             'python ../../models/tutorials/image/imagenet/classify_image.py --image_file test.png',
             shell=True,
             stdout=PIPE).communicate()
         text = out.decode("utf-8")

         with PyTessBaseAPI() as api:
             api.SetImageFile("test.png")
             text += api.GetUTF8Text()
     finally:
         os.remove("test.png")
     return text
Example #9
0
def preprocess_title(filename):
    """Return the first text line of an image that looks like a title.

    Text-line components are OCR-ed one at a time. For each, the matches of
    ``alpha_re`` in the recognized text are joined with single spaces, and
    the first result with at least 5 characters is taken as the title.

    Args:
        filename: Path of the image to read.

    Returns:
        The title string, or ``''`` when no line qualifies. A non-empty
        title is also logged together with ``filename``.
    """
    title = ''
    # The context manager releases the tesseract handle on every exit path.
    with PyTessBaseAPI() as api:
        api.SetImageFile(filename)
        for _, box, _, _ in api.GetComponentImages(RIL.TEXTLINE, True):
            api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
            recognized = api.GetUTF8Text()
            text = ' '.join(alpha_re.findall(recognized.strip()))
            if len(text) >= 5:
                title = text
                break

    if title:
        logger.info("%s: %s", filename, title)
    return title
Example #10
0
    def getText(self):
        """Return the document's text, falling back to OCR when parsing fails.

        ``self.document_text`` is written to ``test.pdf``. The text layer is
        extracted with the PDF interpreter and returned with all newlines
        removed. If that raises, the PDF is rendered to ``kek*.png`` images
        in the current directory, each image is OCR-ed with tesserocr and
        then deleted, and the concatenated OCR text is returned.
        ``test.pdf`` is removed on every path.

        Returns:
            The extracted text (possibly empty).
        """
        with open("test.pdf", "wb") as pdf_out:
            pdf_out.write(self.document_text)

        try:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=LAParams())
            try:
                with open("test.pdf", 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp,
                                                  set(),
                                                  maxpages=0,
                                                  password="",
                                                  caching=True,
                                                  check_extractable=True):
                        interpreter.process_page(page)

                text = retstr.getvalue()
                device.close()
                retstr.close()
                return text.replace("\n", "")
            except Exception:
                # Best-effort fallback: any parsing failure switches to OCR.
                # ``Exception`` (not a bare except) lets KeyboardInterrupt
                # and SystemExit propagate.
                # NOTE(review): Image is presumably wand.image.Image — confirm.
                with Image(filename="test.pdf") as img:
                    img.save(filename="kek.png")

                pages = [name for name in os.listdir(os.curdir)
                         if name.endswith(".png") and name.startswith("kek")]
                chunks = []
                if pages:
                    # One engine serves all rendered pages and is released
                    # afterwards.
                    with PyTessBaseAPI() as api:
                        for name in pages:
                            api.SetImageFile(name)
                            chunks.append(api.GetUTF8Text())
                            os.remove(name)
                return "".join(chunks)
        finally:
            # The temporary PDF used to survive the OCR fallback path.
            if os.path.exists("test.pdf"):
                os.remove("test.pdf")
Example #11
0
def tess_ocr(img):
    """Get text from an image.

    Recognition runs inside ``c_locale()`` with the ``chi_sim`` language and
    ``PSM.AUTO_OSD`` page segmentation.

    Args:
        img: The file path of image.

    Returns:
        A string.
    Raises:
        IOError: An error occurred accessing the img object.

    """
    with c_locale():
        from tesserocr import PyTessBaseAPI, PSM
        # The context manager ends the API even when loading or recognizing
        # the image raises; the explicit End() was skipped in that case.
        with PyTessBaseAPI(lang='chi_sim', psm=PSM.AUTO_OSD) as api:
            api.SetImageFile(img)
            text = api.GetUTF8Text()
    return text
Example #12
0
def runTessTest(folderpath):
    """Measure single-character OCR accuracy over the PNGs in a folder.

    Every file in ``folderpath`` whose name ends in ``png`` is recognized
    with the ``frk`` language in page-segmentation mode 10. The expected
    label is the part of the file name between the last ``_`` and the
    4-character extension.

    Args:
        folderpath: Directory holding the labelled images.

    Returns:
        ``(accuracy, accuracyDict)``: the overall fraction of correct
        predictions and a dict mapping each label to its own fraction
        rounded to 3 decimals. ``(0.0, {})`` when the folder has no PNGs.
    """
    images = sorted(x for x in os.listdir(folderpath) if x.endswith('png'))
    if not images:
        # Nothing to score; avoids dividing by zero below.
        return (0.0, {})

    actualLabels = [x[x.rfind('_') + 1:-4] for x in images]
    countDict = dict(Counter(actualLabels))
    correctDict = dict.fromkeys(countDict, 0)

    correctCount = 0
    with PyTessBaseAPI(lang='frk', psm=10) as api:
        for img, actual in zip(images, actualLabels):
            # listdir() yields bare names; resolve them against the folder so
            # the function works from any working directory.
            api.SetImageFile(os.path.join(folderpath, img))
            predictLabel = api.GetUTF8Text().rstrip()
            if actual == predictLabel:
                correctCount += 1
                correctDict[actual] += 1

    # float() keeps true division on Python 2 as well.
    accuracy = float(correctCount) / len(images)
    accuracyDict = {
        x: round(float(correctDict[x]) / countDict[x], 3) for x in countDict
    }
    return (accuracy, accuracyDict)
Example #13
0
class OCREngine:
    """Turn an image into a ``DocumentGraph`` of OCR-ed words via hOCR.

    The engine owns one tesserocr API. Use it as a context manager, or call
    ``close()`` explicitly, to release that API.
    """

    def __init__(self, psm: int = 3, config: "dict | None" = None):
        """Create the tesserocr API.

        Args:
            psm: Page segmentation mode passed to ``PyTessBaseAPI``.
            config: Optional mapping of tesseract variable names to values,
                each applied with ``SetVariable``. ``None`` means no
                variables.
        """
        # A fresh dict per call avoids sharing one mutable default object.
        config = {} if config is None else config
        logging.info('Initializing OCR engine with PSM=%d and configs=%s',
                     psm, config)
        self.api = PyTessBaseAPI(psm=psm)
        for key, value in config.items():
            self.api.SetVariable(key, value)
        logging.debug('OCR engine initialized')

    def build_graph(self,
                    image_path: str,
                    scheme: str = None) -> "DocumentGraph":
        """OCR ``image_path`` and wrap the recognized words in a graph.

        Args:
            image_path: Path of the image to read.
            scheme: Coordinate scheme forwarded to ``_get_words``.

        Returns:
            A ``DocumentGraph`` built from the extracted word nodes.
        """
        hocr = self._get_hocr(image_path)
        words = self._get_words(hocr, scheme)
        return DocumentGraph(words)

    def _get_hocr(self, image_path: str) -> str:
        """Load ``image_path`` and return tesseract's hOCR for page 0."""
        logging.info('Reading to hOCR from image: %s', image_path)
        self.api.SetImageFile(image_path)
        hocr_text = self.api.GetHOCRText(0)
        logging.debug('Image read')

        return hocr_text

    def _get_words(self, hocr: str, scheme: str = None):
        """Parse hOCR markup into a list of word nodes.

        Elements with class ``ocrx_word`` are converted with ``_make_node``;
        tags it rejects are dropped. ``scheme`` defaults to ``'xyxy'`` (with
        a warning) when not given.
        """
        logging.info('Extracting words from hOCR.')
        if scheme is None:
            logging.warning('No scheme specified. Assuming xyxy')
            scheme = 'xyxy'

        soup = BeautifulSoup(hocr, 'html.parser')
        candidates = (self._make_node(tag, scheme=scheme)
                      for tag in soup.select('.ocrx_word'))
        return [node for node in candidates if node is not None]

    def _make_node(self, tag: dict, scheme: str) -> "WordNode":
        """Build a ``WordNode`` from one hOCR word tag.

        The tag's ``title`` must consist of exactly two ``;``-separated
        fields: the first holds the box coordinates after its leading
        keyword, the second holds the confidence after its leading keyword.

        Returns:
            The node, or ``None`` (after logging a warning) when the title
            does not have exactly two fields.
        """
        fields = tag['title'].split(';')
        if len(fields) != 2:
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning('Malformed tag: %s. Skipping.', tag)
            return None

        word = tag.text
        coordinates = tuple(map(int, fields[0].split()[1:]))
        conf = int(fields[1].split()[1])

        wn = WordNode(word, WordNode.convert_coords(coordinates, scheme), conf)
        logging.debug('Made word: %s', wn.__repr__())

        return wn

    def close(self):
        """Release the underlying tesserocr API."""
        self.api.End()
        logging.debug('OCR engine closed')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Report an in-flight exception, then always release the API. The
        # exception is not suppressed (the method returns None).
        if exc_type:
            print("type: %s\nvalue: %s\ntrace: %s" %
                  (exc_type, exc_value, traceback))

        self.close()
Example #14
0
 def __init__(self, image_file, tessdata):
     """Remember the inputs and open an OSD-only tesserocr API on the image.

     ``image_file`` and ``tessdata`` are stored on the instance. The API is
     created with ``tessdata`` as data path and ``PSM.OSD_ONLY``, loaded
     with ``image_file`` and stored on ``self.api``.
     """
     self.image_file, self.tessdata = image_file, tessdata
     osd_api = PyTessBaseAPI(path=tessdata, psm=PSM.OSD_ONLY)
     osd_api.SetImageFile(image_file)
     self.api = osd_api
Example #15
0
    def run_tesseract(image_file):
        """Recognize ``image_file`` and return its layout as nested dicts.

        ``tessdata`` is not a parameter; it is read from an enclosing scope
        that is not visible here. When truthy it is used as the tesserocr
        data path.

        Returns:
            ``{}`` when layout analysis or the result iterator is
            unavailable. Otherwise a dict with ``orientation``,
            ``writing_direction``, ``text_direction``, ``deskew_angle`` and
            ``blocks``. Each block holds ``paragraphs``, each paragraph
            holds ``lines`` and each line holds ``words``. Every level
            carries its ``box``, ``ocr_text`` and ``confidence``.
        """
        if tessdata:
            api = PyTessBaseAPI(path=tessdata, psm=PSM.AUTO_OSD)
        else:
            api = PyTessBaseAPI(psm=PSM.AUTO_OSD)

        api.SetImageFile(image_file)
        # Both table-related textord variables are switched on before
        # recognition.
        api.SetVariable("textord_tablefind_recognize_tables", "T")
        api.SetVariable("textord_tabfind_find_tables", "T")
        api.Recognize()

        document = {}
        it = api.AnalyseLayout()
        if it is not None:
            orientation, direction, order, deskew_angle = it.Orientation()
            # Recognize() is called a second time here, after AnalyseLayout().
            api.Recognize()
            ri = api.GetIterator()
            if ri is not None:
                document = {
                    "orientation": orientation,
                    "writing_direction": direction,
                    "text_direction": order,
                    "deskew_angle": deskew_angle,
                    "blocks": []
                }
                # One pass of this loop per block; it continues while the
                # iterator sits at the beginning of a block.
                while ri.IsAtBeginningOf(RIL.BLOCK):
                    block = {
                        "block_type": ri.BlockType(),
                        # BlockType is a lookup defined outside this function.
                        "block_type_str": BlockType[ri.BlockType()],
                        "box": ri.BoundingBox(RIL.BLOCK),
                        "ocr_text": ri.GetUTF8Text(RIL.BLOCK),
                        "confidence": ri.Confidence(RIL.BLOCK),
                        "paragraphs": []
                    }
                    break_para = False
                    # Paragraph loop. The "final element" flag is read
                    # BEFORE the inner loops advance the iterator, then acted
                    # on after the paragraph has been collected.
                    while True:
                        if ri.IsAtFinalElement(RIL.BLOCK, RIL.PARA):
                            break_para = True
                        break_line = False
                        paragraph = {
                            "box": ri.BoundingBox(RIL.PARA),
                            "ocr_text": ri.GetUTF8Text(RIL.PARA),
                            "paragraph_info": list(ri.ParagraphInfo()),
                            "confidence": ri.Confidence(RIL.PARA),
                            "lines": []
                        }
                        # Line loop: same read-flag-first pattern as above.
                        while True:
                            if ri.IsAtFinalElement(RIL.PARA, RIL.TEXTLINE):
                                break_line = True
                            break_word = False
                            line = {
                                "box": ri.BoundingBox(RIL.TEXTLINE),
                                "ocr_text": ri.GetUTF8Text(RIL.TEXTLINE),
                                "confidence": ri.Confidence(RIL.TEXTLINE),
                                "words": []
                            }
                            # Word loop: collect each word, stop after the
                            # last word of the line.
                            while True:
                                word = {
                                    "box": ri.BoundingBox(RIL.WORD),
                                    "ocr_text": ri.GetUTF8Text(RIL.WORD),
                                    "confidence": ri.Confidence(RIL.WORD),
                                    "attributes": ri.WordFontAttributes()
                                }
                                if ri.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
                                    break_word = True
                                line["words"].append(word)
                                if break_word:
                                    break
                                ri.Next(RIL.WORD)
                            paragraph["lines"].append(line)
                            if break_line:
                                break
                            ri.Next(RIL.TEXTLINE)
                        block["paragraphs"].append(paragraph)
                        if break_para:
                            break
                        ri.Next(RIL.PARA)
                    document["blocks"].append(block)
                    # Advance to the next block; the while condition above
                    # decides whether another block is processed.
                    ri.Next(RIL.BLOCK)
        return document
Example #16
0
from tesserocr import file_to_text
from tesserocr import PyTessBaseAPI

# Build the engine with the Russian model directly instead of constructing a
# default engine and then calling __init__ on it a second time. The context
# manager releases the engine once the thresholded image has been saved.
with PyTessBaseAPI(lang="rus") as api:
    api.SetImageFile("images/okey-micro.jpg")
    api.GetThresholdedImage().save("dfd.png")

# OCR the thresholded image; print(...) with one argument behaves the same on
# Python 2 and Python 3.
print(file_to_text("dfd.png", lang="rus"))
Example #17
0
#    for img in images:
#        api.SetImageFile(img)
#        print( api.GetUTF8Text())
#        print( api.AllWordConfidences())

# Open the cropped capture with Image.open. In the visible part of this
# script ``img`` is not used again: the OCR below loads the same file by path.
img = Image.open(
    glo.DATA_FOLDER + '/number_range_predictorcropped3.png'
)  #glo.UNCLASSIFIED_GLOBAL_CAPTURES_FOLDER + '/fullcapture961 .png')
#img = img.convert('L')

from tesserocr import PyTessBaseAPI, RIL, iterate_level, PSM
#print(help(tesserocr))

# Create the engine with default arguments, then call Init() with no arguments.
api = PyTessBaseAPI()
api.Init()
api.SetImageFile(glo.DATA_FOLDER + '/number_range_predictorcropped3.png')
# Tesseract variables are set by name with string values.
# NOTE(review): page-segmentation mode "7" is presumably PSM.SINGLE_LINE — confirm.
api.SetVariable("tessedit_pageseg_mode", "7")
api.SetVariable("language_model_penalty_non_dict_word", "0")
api.SetVariable("doc_dict_enable", "0")
# The recognized text is UTF-8 encoded and stripped before printing.
print("recognized txt:", api.GetUTF8Text().encode('utf-8').strip())
#api.Recognize()
"""
ri = api.GetIterator()
level = RIL.SYMBOL
for r in iterate_level(ri, level):
    symbol = r.GetUTF8Text(level)  # r == ri
    conf = r.Confidence(level)
    print(u'symbol {}, conf: {}'.format(symbol, conf).encode('utf-8').strip())
    indent = False
    ci = r.GetChoiceIterator()
    for c in ci: