Python ProcessPagesBufferの例、tesseract.ProcessPagesBuffer Pythonの例

コード例 #1

0

ファイルを表示

ファイル: getCaptcha.py プロジェクト: yxm4109/drugcheck-captcha

def test():
    """Re-encode ``a.jpg`` as an in-memory JPEG, OCR it and print the text.

    Relies on the names ``Image``, ``tesseract`` and ``api`` being available
    in the surrounding module.
    """
    jpeg_stream = io.BytesIO()
    Image.open('a.jpg').save(jpeg_stream, format='JPEG')
    jpeg_bytes = jpeg_stream.getvalue()

    recognized = tesseract.ProcessPagesBuffer(jpeg_bytes, len(jpeg_bytes), api)
    # A single parenthesised argument prints identically whether ``print``
    # is the statement or the function.
    print(recognized)

コード例 #2

0

ファイルを表示

ファイル: votes.py プロジェクト: schlos/openstates

    def house_add_votes_from_image(self, vote_file, vote):
        """OCR the roll call image embedded in a PDF and add it to ``vote``.

        ``pdfimages`` is run on ``vote_file`` from within ``/tmp`` with
        ``vote_file`` itself as the output prefix. The image expected at
        ``vote_file + '-000.pbm'`` is OCRed with tesseract, the text is
        parsed with ``with_image.Rollcall`` and the resulting tree is
        visited by ``with_image.VoteVisitor(vote)``.

        The extracted image is deleted afterwards, including when OCR or
        parsing raises.
        """
        # Extract the image(s) from the PDF.
        with cd('/tmp'):
            sh.pdfimages(vote_file, vote_file)

        # The extracted bitmap this method reads (a .pbm, not a .png).
        image_file = vote_file + '-000.pbm'

        try:
            with open(image_file, 'rb') as f:
                data = f.read()

            api = tesseract.TessBaseAPI()
            api.Init(".", "eng", tesseract.OEM_DEFAULT)
            api.SetPageSegMode(tesseract.PSM_SINGLE_BLOCK)
            whitelist = ("abcdefghijklmnopqrstuvwxyz',-.*"
                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ ")
            api.SetVariable("tessedit_char_whitelist", whitelist)
            text = tesseract.ProcessPagesBuffer(data, len(data), api)

            # Parse the text into a tree.
            tree = with_image.Rollcall.parse(with_image.Lexer(text))

            # Visit the tree and add rollcall votes to the vote object.
            with_image.VoteVisitor(vote).visit(tree)
        finally:
            # Clean up even on failure; the existence check keeps a missing
            # image from masking the original error raised by open().
            if os.path.exists(image_file):
                os.remove(image_file)

コード例 #3

0

ファイルを表示

def ocr_receipt(window,image):
	"""OCR ``image`` with tesseract and print the recognised text.

	``image`` is written to ``test.png`` with ``cv2.imwrite`` and the
	encoded file contents are handed to tesseract. ``window`` is accepted
	but not used by this function.
	"""
	api = tesseract.TessBaseAPI()
	api.Init(".", "eng", tesseract.OEM_DEFAULT)
	# "\\/" is the same two characters (backslash, slash) the original
	# literal produced, spelled without an invalid escape sequence.
	api.SetVariable("tessedit_char_whitelist", "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ=-,.\\/:")
	api.SetPageSegMode(tesseract.PSM_AUTO)

	img_file = "test.png"
	cv2.imwrite(img_file,image)
	# Close the file deterministically instead of relying on garbage
	# collection.
	with open(img_file,"rb") as png_file:
		mbuffer = png_file.read()
	result = tesseract.ProcessPagesBuffer(mbuffer,len(mbuffer),api)

	print('Result: ')
	print(result)

コード例 #4

0

ファイルを表示

ファイル: recognize.py プロジェクト: kuangchanglang/videoAbatraction

def recognize(mImgFile):
    """OCR the image file ``mImgFile`` using a digits-only whitelist.

    The recognised text is printed; nothing is returned.
    """
    # Read the image first so the file is closed before OCR starts and a
    # missing file fails before any tesseract state is created.
    with open(mImgFile, "rb") as image_file:
        mBuffer = image_file.read()

    api = tesseract.TessBaseAPI()
    try:
        api.Init(".", "eng", tesseract.OEM_DEFAULT)
        api.SetVariable("tessedit_char_whitelist", "0123456789")
        api.SetPageSegMode(tesseract.PSM_AUTO)

        result = tesseract.ProcessPagesBuffer(mBuffer, len(mBuffer), api)
        # Same output as printing the label and the result separated by a
        # space, in a form valid for both print statement and function.
        print("result(ProcessPagesBuffer)= %s" % (result,))
    finally:
        # Shut the engine down even if OCR raised.
        api.End()

コード例 #5

0

ファイルを表示

ファイル: imageReader.py プロジェクト: quamis/myHouse

    def detect(self):
        """OCR ``self.tempFile`` and store the text in ``self.result``.

        Recognition is restricted to digits and the space character.

        Returns:
            ``self``, so the call can be chained.
        """
        api = tesseract.TessBaseAPI()
        api.Init(".", "eng", tesseract.OEM_DEFAULT)
        api.SetVariable("tessedit_char_whitelist", "0123456789 ")
        api.SetPageSegMode(tesseract.PSM_AUTO)

        # The context manager closes the file even if read() raises.
        with open(self.tempFile, "rb") as f:
            mBuffer = f.read()
        self.result = tesseract.ProcessPagesBuffer(mBuffer, len(mBuffer), api)

        return self

コード例 #6

0

ファイルを表示

    def GetDecimalDegrees(self, image):
        """OCR a coordinate from ``image`` and convert it to decimal degrees.

        The image blob is run through ``self._api``; the recognised text has
        its spaces removed and is split on the degree sign, apostrophe,
        double quote and space. The first three pieces are passed to
        ``self._ConvertToDecimalDegrees``.

        Returns ``self.INVALID_COORDINATE`` when the mean OCR confidence is
        below 50.
        """
        blob = image.make_blob()
        tesseract.ProcessPagesBuffer(blob, len(blob), self._api)

        recognized = self._api.GetUTF8Text()
        compact = recognized.replace(' ', '')
        pieces = re.split("°|'|\"| ", compact)
        coordinates = pieces[:3]

        # A low-confidence reading could misplace the image, which is worse
        # than discarding it, so reject it outright.
        confidence = self._api.MeanTextConf()
        if confidence < 50:
            return self.INVALID_COORDINATE

        return self._ConvertToDecimalDegrees(coordinates)

コード例 #7

0

ファイルを表示

	def POST(self):
		"""OCR an uploaded image and return the extracted items as JSON.

		The upload is taken from the ``image`` form field, or otherwise
		base64-decoded from ``imagestr`` (in which case the decoded bytes
		are also written to ``im.jpg``). When neither field yields a
		non-empty buffer, the client is redirected to ``/``.
		"""
		web.header("Content-Type","text/html; charset=utf-8")
		infile = web.input()

		have_upload = True
		if 'image' in infile:
			buf = infile.image  #.file.read()
		elif 'imagestr' in infile:
			buf = base64.decodestring(infile.imagestr)
			# Keep the on-disk copy of the decoded upload; the context
			# manager closes the file even if the write fails.
			with open('im.jpg', 'wb') as f:
				f.write(buf)
		else:
			have_upload = False

		# Both upload styles share one OCR/response path.
		if have_upload:
			print(len(buf))
			if buf:
				result = tesseract.ProcessPagesBuffer(buf,len(buf),api)
				print(result)
				return json.dumps(get_items(result))

		print('none')
		raise web.seeother('/')

コード例 #8

0

ファイルを表示

ファイル: ocr.py プロジェクト: HeLiangHIT/OCR

def tesseract_ocr(imgname, type='PagesWrapper'):
    """OCR the image file ``imgname`` through one of tesseract's entry points.

    Args:
        imgname: Path of the image file to recognise.
        type: Which ``tesseract.ProcessPages*`` helper to use. One of
            ``'PagesWrapper'`` (default), ``'PagesFileStream'``,
            ``'PagesRaw'`` or ``'PagesBuffer'``.

    Returns:
        Whatever the selected ``ProcessPages*`` helper returns.

    Raises:
        ValueError: If ``type`` is not one of the supported names.
    """
    supported = ('PagesWrapper', 'PagesFileStream', 'PagesRaw', 'PagesBuffer')
    # Reject unknown modes up front instead of failing later on an
    # unassigned result variable.
    if type not in supported:
        raise ValueError("unsupported OCR type %r; expected one of %s"
                         % (type, ", ".join(supported)))

    api = tesseract.TessBaseAPI()
    api.SetOutputName("outputName")
    api.Init(".", "eng", tesseract.OEM_DEFAULT)
    api.SetPageSegMode(tesseract.PSM_AUTO)

    if type == 'PagesWrapper':
        return tesseract.ProcessPagesWrapper(imgname, api)
    if type == 'PagesFileStream':
        # The original referenced an undefined name here; the file to
        # process is the ``imgname`` argument.
        return tesseract.ProcessPagesFileStream(imgname, api)
    if type == 'PagesRaw':
        return tesseract.ProcessPagesRaw(imgname, api)

    # 'PagesBuffer': image data is binary, so read it in binary mode and
    # close the file deterministically.
    with open(imgname, 'rb') as image_file:
        mBuffer = image_file.read()
    return tesseract.ProcessPagesBuffer(mBuffer, len(mBuffer), api)

コード例 #9

0

ファイルを表示

def read_text(img):
    """
    **SUMMARY**

    Run OCR over the image and return whatever text is recognised.

    Rotation is not handled well: if your application needs it, rotate
    and/or crop the area first so the text is oriented the way a document
    is normally read.

    **RETURNS**

    A String

    **EXAMPLE**

    >>> img = Image("somethingwithtext.png")
    >>> text = img.read_text()
    >>> print(text)

    **NOTE**

    If you run into problems at run time, check that the OCR dependencies
    are installed:

    http://code.google.com/p/tesseract-ocr/
    http://code.google.com/p/python-tesseract/

    """

    if not OCR_ENABLED:
        return ("Please install the correct OCR library required - "
                "http://code.google.com/p/tesseract-ocr/ "
                "http://code.google.com/p/python-tesseract/")

    ocr_api = tesseract.TessBaseAPI()
    ocr_api.SetOutputName("outputName")
    ocr_api.Init(".", "eng", tesseract.OEM_DEFAULT)
    ocr_api.SetPageSegMode(tesseract.PSM_AUTO)

    # Encode the image as JPEG in memory and pass the encoded bytes to OCR.
    jpeg_stream = StringIO()
    pil_image = convert.to_pil_image(img)
    pil_image.save(jpeg_stream, "jpeg")
    jpeg_bytes = jpeg_stream.getvalue()
    return tesseract.ProcessPagesBuffer(jpeg_bytes, len(jpeg_bytes), ocr_api)

コード例 #10

0

ファイルを表示

ファイル: pil.py プロジェクト: zhaoqiyuan123/12306-data

def guess(img_data):
    """Best-effort OCR of an encoded captcha image.

    The image is decoded, any alpha channel is dropped, the module's
    ``color_sep`` / ``color_fil`` helpers are applied, and the result is
    OCRed with a whitelist of digits and upper-case letters.

    Args:
        img_data: The encoded image bytes.

    Returns:
        The recognised text with surrounding whitespace and all spaces
        removed (``''`` when tesseract returns nothing), or ``None`` when
        any step fails.
    """
    import logging

    try:
        api = tesseract.TessBaseAPI()
        api.Init(".", "eng", tesseract.OEM_DEFAULT)
        api.SetVariable("tessedit_char_whitelist",
                        "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")
        im = Image.open(StringIO.StringIO(img_data))
        im.load()
        if im.mode == 'RGBA':
            # Discard the alpha channel before the colour helpers run.
            r, g, b, a = im.split()
            im = Image.merge("RGB", (r, g, b))
        color_sep(im, 3)
        color_fil(im)
        contents = get_pil_string(im)
        result = tesseract.ProcessPagesBuffer(contents, len(contents), api)
        if result is None:
            return ''
        return result.strip().replace(' ', '')
    except Exception:
        # Still best-effort (callers get None), but the failure is now
        # recorded instead of being dropped silently.
        logging.getLogger(__name__).exception("captcha OCR failed")
        return None

コード例 #11

0

ファイルを表示

 def pull_text_from_file(self, root, filename):
     """Use OCR to extract the text from the file. Writes the text to file

     Nothing is done when the derived text file already exists. When the
     mean OCR confidence is below ``self.OCR_CONFIDENCE_THRESHOLD`` no text
     file is written and ``(root, filename)`` is recorded in
     ``self.potential_problem_files`` instead.

     @param root The str root of the file path
     @param filename The name of the file to pull from
     """
     full_path = os.path.join(root, filename)
     text_file_name = "{0}{1}{2}{3}".format(self.TEXT_FILE_PREFIX, filename,
                                            self.TEXT_FILE_SUFFIX,
                                            self.TEXT_FILE_EXTENSION)
     text_file_path = os.path.join(root, text_file_name)
     if os.path.isfile(text_file_path):
         # Text was already extracted for this file.
         return

     with open(full_path, "rb") as image_file:
         image_bytes = image_file.read()

     text_pulled = tesseract.ProcessPagesBuffer(
         image_bytes, len(image_bytes), self.tesseract_api)
     confidence = self.tesseract_api.MeanTextConf()
     if confidence >= self.OCR_CONFIDENCE_THRESHOLD:
         with open(text_file_path, "w") as text_file:
             text_file.write(text_pulled)
     else:
         # append() takes exactly one argument, so the two-argument call
         # raised TypeError; record the pair as a single tuple.
         # NOTE(review): assumes potential_problem_files is a list - confirm.
         self.potential_problem_files.append((root, filename))

コード例 #12

0

ファイルを表示

ファイル: test_.py プロジェクト: pyaswanthreddy/python-tesseract

import tesseract
# Set up an English OCR engine limited to digits and lower-case letters.
api = tesseract.TessBaseAPI()
api.Init(".", "eng", tesseract.OEM_DEFAULT)
api.SetVariable("tessedit_char_whitelist",
                "0123456789abcdefghijklmnopqrstuvwxyz")
api.SetPageSegMode(tesseract.PSM_AUTO)

mImgFile = "eurotext.jpg"
# Read the encoded image and close the file deterministically.
with open(mImgFile, "rb") as image_file:
    mBuffer = image_file.read()
result = tesseract.ProcessPagesBuffer(mBuffer, len(mBuffer), api)
print("result(ProcessPagesBuffer)= %s" % (result,))

# Dump the first bytes behind the word-confidence pointer, then free it.
# NOTE(review): cdata(intPtr, 100) presumably copies 100 raw bytes from the
# pointer - confirm against the python-tesseract SWIG bindings.
intPtr = api.AllWordConfidences()
print(str(intPtr))
pyPtr = tesseract.cdata(intPtr, 100)
for i in range(10):
    print(ord(pyPtr[i]))
tesseract.delete_intp(intPtr)

コード例 #13

0

ファイルを表示

ファイル: getCaptcha.py プロジェクト: yxm4109/drugcheck-captcha

def OCRByImage(img):
    """Encode ``img`` as an in-memory JPEG and return tesseract's OCR result.

    Uses the ``api`` object from the surrounding module.
    """
    jpeg_stream = StringIO()
    img.save(jpeg_stream, format='JPEG')
    jpeg_bytes = jpeg_stream.getvalue()
    recognized = tesseract.ProcessPagesBuffer(jpeg_bytes, len(jpeg_bytes), api)
    return recognized

コード例 #14

0

ファイルを表示

ファイル: getCaptcha.py プロジェクト: yxm4109/drugcheck-captcha

def OCRByFilePath(mImgFile):
    """Read the image file at ``mImgFile`` and return tesseract's OCR result.

    Uses the ``api`` object from the surrounding module.
    """
    # The context manager closes the file instead of leaving the handle to
    # the garbage collector.
    with open(mImgFile, 'rb') as image_file:
        mBuffer = image_file.read()
    return tesseract.ProcessPagesBuffer(mBuffer, len(mBuffer), api)