def check(img):
    """Return 1 if OCR finds a marker letter in ``img`` or one of its regions, else 0.

    The whole image is OCR'd first and accepted when the text contains 'P'.
    Otherwise every pixel of the module-level ``data`` array that still equals
    255 is handed to ``floodfill`` with a fresh, decreasing label value.  Each
    label whose bounding box spans more than 5 pixels in both directions is
    cropped out of ``data`` and OCR'd separately, looking for 'P', 'F' or 'E'.

    Relies on the module-level names ``data``, ``len_x``, ``len_y`` and
    ``floodfill``.
    NOTE(review): presumably ``floodfill`` relabels the connected 255-region
    around (i, j) in ``data`` with ``color`` -- confirm against its definition.

    :param img: image passed to the first, whole-image OCR call
    :return: 1 when a marker letter was recognised, 0 otherwise
    """
    out = pytesseract.image_to_string(img, lang='eng', config="-psm 5")
    if 'P' in out:
        return 1

    # Give every still-unlabelled (255) seed pixel its own label below 255.
    color = 255
    for i in range(len_x):
        for j in range(len_y):
            if data[i, j] == 255:
                color -= 1
                floodfill(i, j, color)
    print(color)

    for c in range(color, 255):
        (a, b) = np.where(data == c)
        if a.shape[0] == 0:
            continue
        x_min, x_max = np.min(a), np.max(a)
        y_min, y_max = np.min(b), np.max(b)

        # Skip regions too small to hold a legible character.
        if not ((x_max - x_min > 5) and (y_max - y_min > 5)):
            continue

        # The max indices are inclusive, so the slice end needs +1; without it
        # the last row and column of the region were cut off.
        tmp = data[x_min:x_max + 1, y_min:y_max + 1]
        out = pytesseract.image_to_string(tmp, lang='eng', config="-psm 5")
        if any(letter in out for letter in ('P', 'F', 'E')):
            return 1
    return 0
Example #2
0
def get_ocr_number(img_orig):
    """Return the number read from the image as a float, or -1.0 when OCR fails.

    The prepared image is read first in single-line mode (psm 7).  If that
    text is not a valid float, both the original and the prepared image are
    re-read with psm 7 and then psm 8, and the first candidate that parses
    as a float is returned.
    """

    def _read(image, config):
        # OCR the image and drop the currency / unit symbols of the whitelist.
        text = pytesseract.image_to_string(image, 'eng', config=config).strip()
        for symbol in ('$', '£', 'B'):
            text = text.replace(symbol, '')
        return text

    single_line = '--psm 7 --oem 1 -c tessedit_char_whitelist=0123456789.$£B'
    single_word = '--psm 8 --oem 1 -c tessedit_char_whitelist=0123456789.$£B'

    img_resized = prepareImage(img_orig)
    candidates = [_read(img_resized, single_line)]

    try:
        return float(candidates[-1])
    except ValueError:
        for config in (single_line, single_word):
            for image in (img_orig, img_resized):
                candidates.append(_read(image, config))

    log.debug(candidates)
    for candidate in candidates:
        try:
            return float(candidate)
        except ValueError:
            continue
    return -1.0
Example #3
0
def check_exam(request, course_id, task_id):
    """Grade the scanned exam PDF on POST, then render the assignments of a task.

    On POST, ``media/diploma.pdf`` is split into single-page PDFs and each
    page is rendered to PNG.  A fixed region of the page is OCR'd to get the
    student's name.  When a user in the 'Student' group has that last name,
    the whole page is OCR'd, compared with ``media/RightAnswers.txt``, and an
    ``Assignments`` row is created with the similarity-based mark.

    :param request: the incoming HTTP request
    :param course_id: passed through to the template context
    :param task_id: id of the task the assignments belong to
    :return: the rendered ``qmain/check_exam.html`` response
    """
    if request.method == 'POST':
        path = 'media/diploma.pdf'
        fname = os.path.splitext(os.path.basename(path))[0]
        pdf = PdfFileReader(path)
        page_count = pdf.getNumPages()

        # Split the PDF into one single-page PDF per page.
        for page in range(page_count):
            pdf_writer = PdfFileWriter()
            pdf_writer.addPage(pdf.getPage(page))
            output_filename = 'media/diploma/{}_page_{}.pdf'.format(
                fname, page + 1)
            with open(output_filename, 'wb') as out:
                pdf_writer.write(out)

        students = User.objects.filter(groups__name='Student')

        # Render each page to PNG so it can be cropped and OCR'd.
        for page in range(page_count):
            number = page + 1
            page_png = 'media/diploma/{}_page_{}.png'.format(fname, number)
            student_png = 'media/diploma/student_{}.png'.format(number)
            images = convert_from_path('media/diploma/{}_page_{}.pdf'.format(
                fname, number))
            for image in images:
                image.save(page_png, 'PNG')

                # Crop the region that holds the student's name or id.
                with Image.open(page_png) as img:
                    img.crop((430, 0, 920, 276)).save(student_png)

                img_cv = cv2.imread(student_png)
                student_id_rgb = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
                # OCR output carries trailing whitespace/newlines that would
                # never equal a stored last name, so strip before matching.
                student_id_text = pytesseract.image_to_string(
                    student_id_rgb).strip()
                if not student_id_text:
                    continue

                # One query instead of exists() followed by first().
                user = students.filter(last_name=student_id_text).first()
                if user is None:
                    continue

                # OCR the full page and score it against the reference answers.
                img_cv = cv2.imread(page_png)
                img_rgb = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
                new = pytesseract.image_to_string(img_rgb)
                with open('media/RightAnswers.txt') as answers_file:
                    true_answers = answers_file.read()
                mark = SequenceMatcher(None, new, true_answers).ratio() * 1388
                Assignments.objects.create(
                    grade=mark,
                    task=Task.objects.get(id=task_id),
                    student=Student.objects.get(user=user))

    assignments = Assignments.objects.filter(task_id=task_id)
    return render(
        request, 'qmain/check_exam.html', {
            'title': 'Exam check',
            'course_id': course_id,
            'task_id': task_id,
            'assignments': assignments
        })
Example #4
0
def parseDataFromImgs(nameImg, itemImg):
    """OCR the name and item images and return the parsed fields.

    :param nameImg: image holding the name text
    :param itemImg: image holding the item text
    :return: tuple ``(name, itemName, itemQuantity, price, seenAs)``
    """
    # Run both OCR passes first, then hand the raw text to the parsers.
    raw_name, raw_item = [
        pytesseract.image_to_string(picture) for picture in (nameImg, itemImg)
    ]

    parsed_name = parseName(raw_name)
    item_name, quantity, item_price, seen_as = parseItemData(raw_item)

    return parsed_name, item_name, quantity, item_price, seen_as
def process_image():
    """Cut a screenshot into horizontal colour bands and print the OCR of each.

    The image is scanned row by row with a small state machine whose state
    (``search_mode``) is ``None`` (outside any band), ``'swb'`` or
    ``'recipient'``.  A band starts on the first row for which the matching
    row detector is true and ends on the first row for which it is false.
    Each band is cropped at full width, saved under a random name in
    ``screenshots/processed/`` and OCR'd.  Finally the whole image is OCR'd.

    NOTE(review): presumably ``is_swb_color_in_row`` and
    ``is_recipient_color_in_row`` report whether the row contains the
    respective bubble colour -- confirm against their definitions.
    """
    image = Image.open("screenshots/1ttp39.jpg")
    width, height = image.size
    pixel_access = image.load()

    search_mode = None
    # y increases from top to bottom of the image
    lower_y = None
    # list of (lower_y, upper_y) tuples
    image_band_dimensions = []
    for y in range(height):
        if search_mode is None:
            if is_swb_color_in_row(pixel_access, width, y):
                lower_y, search_mode = y, 'swb'
            elif is_recipient_color_in_row(pixel_access, width, y):
                lower_y, search_mode = y, 'recipient'
            continue

        # Inside a band; this assumes some space between the text bubbles.
        if search_mode == 'swb':
            still_in_band = is_swb_color_in_row(pixel_access, width, y)
        else:
            still_in_band = is_recipient_color_in_row(pixel_access, width, y)
        if not still_in_band:
            image_band_dimensions.append((lower_y, y))
            search_mode = None

    # A band that runs to the bottom edge never sees a closing row; close it
    # here so it is not silently dropped.
    if search_mode is not None:
        image_band_dimensions.append((lower_y, height))

    image_bands = [
        image.crop((0, top, width, bottom))
        for top, bottom in image_band_dimensions
    ]

    # Single-argument print(...) behaves the same on Python 2 and 3.
    for band in image_bands:
        image_name = str(int(round(random.random() * 10000))) + '.jpg'
        print(image_name)
        band.save('screenshots/processed/' + image_name)
        print(pytesseract.image_to_string(band))

    print(pytesseract.image_to_string(image))
Example #6
0
def process_image():
    """Cut a screenshot into horizontal colour bands and print the OCR of each.

    The image is scanned row by row with a small state machine whose state
    (``search_mode``) is ``None`` (outside any band), ``'swb'`` or
    ``'recipient'``.  A band starts on the first row for which the matching
    row detector is true and ends on the first row for which it is false.
    Each band is cropped at full width, saved under a random name in
    ``screenshots/processed/`` and OCR'd.  Finally the whole image is OCR'd.

    NOTE(review): presumably ``is_swb_color_in_row`` and
    ``is_recipient_color_in_row`` report whether the row contains the
    respective bubble colour -- confirm against their definitions.
    """
    image = Image.open("screenshots/1ttp39.jpg")
    width, height = image.size
    pixel_access = image.load()

    search_mode = None
    # y increases from top to bottom of the image
    lower_y = None
    # list of (lower_y, upper_y) tuples
    image_band_dimensions = []
    for y in range(height):
        if search_mode is None:
            if is_swb_color_in_row(pixel_access, width, y):
                lower_y, search_mode = y, 'swb'
            elif is_recipient_color_in_row(pixel_access, width, y):
                lower_y, search_mode = y, 'recipient'
            continue

        # Inside a band; this assumes some space between the text bubbles.
        if search_mode == 'swb':
            still_in_band = is_swb_color_in_row(pixel_access, width, y)
        else:
            still_in_band = is_recipient_color_in_row(pixel_access, width, y)
        if not still_in_band:
            image_band_dimensions.append((lower_y, y))
            search_mode = None

    # A band that runs to the bottom edge never sees a closing row; close it
    # here so it is not silently dropped.
    if search_mode is not None:
        image_band_dimensions.append((lower_y, height))

    image_bands = [
        image.crop((0, top, width, bottom))
        for top, bottom in image_band_dimensions
    ]

    # Single-argument print(...) behaves the same on Python 2 and 3.
    for band in image_bands:
        image_name = str(int(round(random.random() * 10000))) + '.jpg'
        print(image_name)
        band.save('screenshots/processed/' + image_name)
        print(pytesseract.image_to_string(band))

    print(pytesseract.image_to_string(image))
Example #7
0
 def similar_image(ffc, frames, image):
     """Decide whether ``image`` carries text similar to the reference text ``ffc``.

     Used to accumulate lists of frames with similar text.

     :param ffc: reference text to compare against
     :param frames: frames collected so far; when empty, ``image`` starts a
         new group and its own OCR text becomes the reference
     :param image: frame to OCR
     :return: ``(reference_text, is_similar)``.  ``is_similar`` is False when
         fewer than 3 characters were read, True when the similarity ratio
         exceeds 0.5, and True when Tesseract raises an error.
     """
     # OCR once and reuse the text; the frame used to be OCR'd up to twice.
     try:
         text = pytesseract.image_to_string(image)
     except TesseractError as e:
         print(e)
         return ffc, True

     if not frames:  # first frame in the potential bottom third
         return text, True
     if len(text) < 3:
         return ffc, False
     return ffc, SequenceMatcher(None, ffc, text).ratio() > .5
Example #8
0
def validateNumber(cell, found, could_be):
    """Double-check an OCR'd digit against an alternative it is often confused with.

    The cell is OCR'd again after erosion and after dilation.  The first
    character of each reading, together with ``found``, forms three votes.
    ``could_be`` wins only when it collects more than one vote.

    :param cell: image of the cell, passed to ``cv2.erode`` / ``cv2.dilate``
    :param found: the digit originally recognised
    :param could_be: the alternative digit to test for
    :return: ``could_be`` if it appears more than once among the votes,
        otherwise ``found``
    """
    k = getKernel(cell)
    ocr_config = '--psm 6 --oem 3'

    votes = [found]
    for variant in (cv2.erode(cell, k), cv2.dilate(cell, k)):
        text = pytesseract.image_to_string(variant,
                                           lang='eng',
                                           config=ocr_config)
        # text[:1] is '' on an empty OCR result, where text[0] raised IndexError.
        votes.append(text[:1])

    return could_be if votes.count(could_be) > 1 else found
Example #9
0
def extract_data(image):
    """Return ``(text, pdf)``: the OCR text of ``image`` and the result of
    ``pytesseract.image_to_pdf_or_hocr`` for the same image."""
    return (
        pytesseract.image_to_string(image),
        pytesseract.image_to_pdf_or_hocr(image),
    )
Example #10
0
def run_tests():
    """Run OCR over every image in ``img/tests`` and print a pass/fail report.

    The expected answer for an image is the first character of its file name.
    """
    cases = [(Image.open("img/tests/" + name), name)
             for name in os.listdir("img/tests")]
    total = len(cases)
    failed = 0

    for case in cases:
        dprint(case)
    print()

    for picture, name in cases:
        expected = name[0]  # the answer is encoded in the file name
        enhanced = enhance(picture, "enhance-" + expected)
        recognised = image_to_string(enhanced, lang='eng')
        guess = solve_check(recognised)  # character for the afk check

        if guess == expected:
            print("Test passed")
            continue

        if guess:
            print("[FAILED] Test failed. Got '%s' instead of '%s'." % (guess, expected))
        else:
            print("[FAILED] Test failed. Regex did not return any result.")
        failed += 1

    print("Failed %d out of %d times" % (failed, total))
Example #11
0
def captcha_to_text():
    """Screenshot the captcha element, binarise it and return the OCR'd text."""
    captcha_dir = r'/home/sangharshmanuski/Documents/e_courts/captcha'
    binarised_path = '/home/sangharshmanuski/Documents/e_courts/editC/oneDisNoLoop.png'

    # Bounding box of the captcha element inside the page screenshot.
    element = driver.find_element_by_id("captcha_image")
    position, dimensions = element.location, element.size
    x0, y0 = position['x'], position['y']
    crop_box = (int(x0), int(y0),
                int(x0 + dimensions['width']), int(y0 + dimensions['height']))

    # Cut the captcha out of a full-page screenshot and store it on disk.
    page_png = base64.b64decode(driver.get_screenshot_as_base64())
    captcha = Image.open(BytesIO(page_png)).crop(crop_box)
    captcha.save(os.path.join(captcha_dir, 'file_trial.png'), 'PNG')

    # Reload as greyscale and apply a fixed binary threshold.
    grey = cv2.imread(os.path.join(captcha_dir, 'file_trial.png'), 0)
    _, binary = cv2.threshold(grey, 111, 255, cv2.THRESH_BINARY)
    cv2.imwrite(binarised_path, binary)

    # Read the text with pytesseract.
    return pytesseract.image_to_string(Image.open(binarised_path))
Example #12
0
    def getDigit(self, image, iter=0):
        """Return the first digit OCR reads from ``image`` as a one-character string.

        On a retry (``iter`` > 0) the image is passed through
        ``noise_removal`` first.  When no digit is found the method retries
        with ``iter + 1``, and gives up with "0" once ``iter`` exceeds 3.
        Digits that are commonly confused are re-checked through
        ``validateNumber``.
        """
        if iter > 0:
            image = noise_removal(image)

        raw = pytesseract.image_to_string(image,
                                          lang='eng',
                                          config='--psm 6 --oem 3')
        cleaned = fixCommonErrors(remove_specials(raw))
        first_digit = next((ch for ch in cleaned if ch.isdigit()), None)

        if first_digit is None:
            return "0" if iter > 3 else self.getDigit(image, iter + 1)

        # recognised digit -> the digit it is commonly mistaken for
        confusable = {"1": "7", "2": "7", "4": "1"}
        if first_digit in confusable:
            return validateNumber(image, first_digit, confusable[first_digit])
        return first_digit
Example #13
0
 def imgtotxt():
     """Read the captcha on the current page and submit it.

     The captcha element is cropped out of a page screenshot, saved,
     binarised with a fixed threshold and OCR'd.  The text is then typed into
     the ``captcha`` field and the form button is clicked.
     """
     raw_path = '/home/sangharshmanuski/Documents/e_courts/captcha/file_trial.png'
     binarised_path = '/home/sangharshmanuski/Documents/e_courts/editC/oneDisNoLoop.png'

     # Bounding box of the captcha element inside the page screenshot.
     elem = driver.find_element_by_id("captcha_image")
     loc = elem.location
     size = elem.size
     left = loc['x']
     top = loc['y']
     box = (int(left), int(top), int(left + size['width']),
            int(top + size['height']))

     screenshot = driver.get_screenshot_as_base64()
     img = Image.open(BytesIO(base64.b64decode(screenshot)))
     img.crop(box).save(raw_path, 'PNG')

     # Reload as greyscale and apply a fixed binary threshold.
     grey = cv2.imread(raw_path, 0)
     ret, thresh1 = cv2.threshold(grey, 111, 255, cv2.THRESH_BINARY)
     cv2.imwrite(binarised_path, thresh1)

     # Read the text with pytesseract; close the image file afterwards.
     with Image.open(binarised_path) as binarised:
         captchaText = pytesseract.image_to_string(binarised)

     captcha = driver.find_element_by_id('captcha')
     captcha.send_keys(captchaText)
     driver.find_element_by_css_selector(
         'input.button:nth-child(1)').click()
     time.sleep(1)
Example #14
0
def convert_to_text():
    """Once a second, OCR ``img.png``, copy the text to the clipboard and print it.

    Runs forever; the caller is expected to stop the process.
    """
    while True:
        time.sleep(1)
        # Close the file on every pass; the loop never ends, so handles left
        # open would pile up.
        with Image.open('img.png') as image:
            txt = pytesseract.image_to_string(image)
        pyperclip.copy(txt)
        print(txt)
Example #15
0
def get_mark(path, path_answers):
    """Score a scanned answer sheet against a reference answer file.

    :param path: path of the image to OCR
    :param path_answers: path of the text file with the reference answers
    :return: the ``SequenceMatcher`` similarity ratio between the OCR text
        and the reference text, multiplied by 1388
    """
    img_cv = cv2.imread(path)
    img_rgb = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
    new = pytesseract.image_to_string(img_rgb)
    # Context manager so the answers file is closed even if reading fails.
    with open(path_answers) as answers_file:
        true_answers = answers_file.read()
    return SequenceMatcher(None, new, true_answers).ratio() * 1388
Example #16
0
def generateProcessedFiles(splittedName):
    """OCR an image, store its text and produce a filtered Tesseract box file.

    :param splittedName: sequence ``(base_name, extension)`` of the image file

    Steps:
      1. OCR the image (language 'ron') and write the text to
         ``../TextIntermediar/<base_name>text.txt``.
      2. Run Tesseract again to produce ``<base_name>output.box``.
      3. Rewrite the box file, keeping only lines whose first character is
         alphanumeric or a parenthesis.
      4. Read the 6-column rows of the box file into ``boxes``.

    NOTE(review): ``boxes`` is built but not returned in the visible code --
    confirm whether callers expect it.
    """
    base_name = splittedName[0]
    box_path = base_name + 'output.box'

    # Get string from Image
    fullName = base_name + '.' + splittedName[1]
    imgText = pt.image_to_string(Image.open(fullName), lang='ron')

    with open('../TextIntermediar/' + base_name + "text.txt", 'w') as f:
        f.write(imgText)

    # Get bounding boxes
    pt.run_tesseract(fullName,
                     base_name + 'output',
                     lang='ron',
                     boxes=True,
                     config="hocr")

    # Remove non-alphanumeric characters
    with open(box_path, 'r+') as f:
        kept = [
            line for line in f
            if line[0].isalnum() or line[0] == '(' or line[0] == ')'
        ]
        f.seek(0)
        f.write(''.join(kept))
        # The filtered text is shorter than the original; without truncating,
        # the tail of the old content stayed in the file.
        f.truncate()

    # To read the coordinates
    boxes = []
    with open(box_path, 'r') as f:
        reader = csv.reader(f, delimiter=' ')
        for row in reader:
            if len(row) == 6:
                boxes.append(row)
Example #17
0
def validate_code_recognition():
    """OCR an uploaded or downloaded image and return the text as JSON.

    Form fields read from ``request``:
      * ``image`` (file) or ``url`` -- the picture to recognise (one required)
      * ``flag`` -- when true (default) the image goes through
        ``get_blackwhite_image`` before OCR; "", "0", "false", "no" and "off"
        disable it
      * ``lang``, ``oem``, ``psm`` -- Tesseract options (defaults eng / 1 / 3)

    :return: ``(json_response, status)``; 400 for a missing image, an invalid
        option value or a disallowed file type, 200 with the text on success
    """
    # Form values arrive as strings, so any non-empty value (even "false")
    # used to count as true.  Parse the flag explicitly.
    raw_flag = request.form.get("flag", True)
    flag = str(raw_flag).strip().lower() not in ("", "0", "false", "no", "off")

    image = request.files.get("image", None)
    lang = request.form.get("lang", "eng")
    oem = request.form.get("oem", "1")
    psm = request.form.get("psm", "3")
    url = request.form.get("url", None)

    if not image and not url:
        return jsonify({"msg": "image file don`t exist!", "code": 0}), 400

    # These values are spliced into the Tesseract option string.  Reject
    # anything that could smuggle in extra options: whitespace, quotes or a
    # backslash.
    for option in (lang, oem, psm):
        if not option or any(ch.isspace() or ch in "\"'\\" for ch in option):
            return jsonify({
                "msg": "invalid lang, oem or psm value",
                "code": 0
            }), 400
    config = f"-l {lang} --oem {oem} --psm {psm}"

    if image:
        filename = image.filename
    else:
        filename = download_image(url)
        image = filename
    if not allowed_file(filename):
        return jsonify({
            "msg": f"Allowed file types are: {ALLOWED_EXTENSIONS}",
            "code": 0
        }), 400

    if flag:
        image = get_blackwhite_image(image)
    else:
        image = Image.open(image)
    text = pytesseract.image_to_string(image, config=config)
    return jsonify({"code": 1, "msg": "success", "data": {"text": text}}), 200
Example #18
0
def recogit(imgobject=None):
    '''
    Entry point of the recognition module: binarise, denoise and OCR a captcha.

    :param imgobject: image object; when falsy, a sample file is loaded from disk
    :return: the string produced by the OCR call
    '''
    if not imgobject:
        im = Image.open('D:/workspace/captcha/40.png')
    else:
        im = imgobject
    im = im.convert("L")
    size = im.size
    img_array = im.load()

    # Border indices come from the real image size.  The old hard-coded 44/59
    # only matched 45x60 images.
    last_col, last_row = size[0] - 1, size[1] - 1
    for i in range(size[0]):
        for j in range(size[1]):
            on_border = i in (0, last_col) or j in (0, last_row)
            # The border becomes pure white; inside, only near-white pixels
            # stay white and everything else turns black.
            if on_border or img_array[i, j] > 250:
                img_array[i, j] = 255
            else:
                img_array[i, j] = 0

    # Remove noise
    clearnoise(img_array, size, 3)
    clearnoise(img_array, size, 3)
    clearnoise(img_array, size, 3)
    clearnoise2(im, 140, 3, 1)
    clearnoise(img_array, size, 3)

    s = pt.image_to_string(im, lang="dt", config="-psm 7")
    return s
Example #19
0
def get_ocr_string(img_orig):
    """Return the stripped text that OCR reads from the prepared image."""
    prepared = prepareImage(img_orig)
    raw_text = pytesseract.image_to_string(prepared, 'eng',
                                           config='--psm 7 --oem 1')
    return raw_text.strip()
Example #20
0
def exchange(user, index):
    """Log in as ``user`` and keep submitting OCR'd captcha codes to the exchange URL.

    Makes up to 99 attempts.  It stops as soon as the exchange response
    carries code 0 or -3.  Prints "login failed" when the login response
    code is not 0.

    :param user: login name sent as ``uname``
    :param index: index into the module-level ``phones`` sequence
    """
    http = requests.session()
    http.get(login_bypass_url)
    request_headers = {
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "http://h5.nty.tv189.com",
        "Referer": "http://h5.nty.tv189.com/csite/tysx/uc/login-by-pass?goBackUrl=",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0.1; MuMu Build/V417IR; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.100 Mobile Safari/537.36/newtysx-android-ua-5.5.9.39",
        "X-Requested-With": "XMLHttpRequest"}
    credentials = {'uname': user, 'upass': users['upass']}
    login_reply = http.post(login_url, data=credentials, headers=request_headers)

    if json.loads(login_reply.text).get('code') != 0:
        print("login failed")
        return

    for _ in range(1, 100):
        captcha_bytes = http.get(validate_url).content
        # Some responses start with a stray CR LF before the image data.
        if captcha_bytes[:2] == b'\r\n':
            captcha_bytes = captcha_bytes[2:]

        grey = Image.open(io.BytesIO(captcha_bytes)).convert('L')
        cleaned = depoint(binarizing(grey, 126))
        recognised = pytesseract.image_to_string(cleaned, lang='tv189', config=tessdate_dir)
        code = recognised[:-2].replace(" ", "")
        print(code)

        reply = http.get(exchange_url % (str(phones[index]), code))
        reply_json = json.loads(reply.text)
        print(reply.json())
        if reply_json['code'] in (0, -3):
            return
Example #21
0
def get_verification_code_by_pytesseract(driver, selector):
    """
    Read a verification code with pytesseract, given a webdriver and the
    captcha's selector.

    :param driver: webdriver object
    :param selector: CSS selector that locates the captcha image
    :return: the OCR text of the captcha
    """
    # Path of the screenshot folder
    path = os.path.dirname(os.path.dirname(__file__)) + '\\screenshot'
    # Take a screenshot and save it as <current timestamp>.png
    raw_picture_name = path + '\\' + str(time.time()) + ".png"
    driver.save_screenshot(raw_picture_name)
    # Locate the captcha image and get its top-left / bottom-right coordinates
    element = driver.find_element_by_css_selector(selector)
    left_top_x = element.location['x']
    left__top_y = element.location['y']
    right_down_x = element.size['width'] + left_top_x
    right_down_y = element.size['height'] + left__top_y
    # Get the screen scaling ratio.  The property is ``devicePixelRatio``.
    # The misspelled ``devicePixeRatio`` evaluated to undefined (None in
    # Python), which made the multiplications below raise TypeError.
    dpr = driver.execute_script('return window.devicePixelRatio')
    # Cut the captcha out of the screenshot and save it as <current timestamp>.png
    raw_image = Image.open(raw_picture_name)
    target_image = raw_image.crop((left_top_x * dpr, left__top_y * dpr, right_down_x * dpr, right_down_y * dpr))
    code_picture_name = path + '\\' + str(time.time()) + ".png"
    target_image.save(code_picture_name)
    # Read the verification code from the image with pytesseract
    code_image = Image.open(code_picture_name)
    verification_code = pytesseract.image_to_string(code_image)
    return verification_code
Example #22
0
def captcha_processor():
    """Crop the captcha out of a page screenshot, boost its contrast and OCR it.

    Two temporary files (``home_page.png`` and ``captcha.png``) are written
    and removed again before returning.

    :return: the recognised text with all spaces removed
    """
    page_file, captcha_file = 'home_page.png', 'captcha.png'

    # Screenshot the page and cut out the fixed captcha region.
    driver.get_screenshot_as_file(page_file)
    Image.open(page_file).crop((285, 380, 345, 415)).save(captcha_file)

    # Greyscale, then double the contrast before recognition.
    greyscale = Image.open(captcha_file).convert('L')
    boosted = ImageEnhance.Contrast(greyscale).enhance(2.0)
    boosted.save(captcha_file)

    recognised = pytesseract.image_to_string(boosted)

    for leftover in (page_file, captcha_file):
        if os.path.exists(leftover):
            os.remove(leftover)

    return recognised.replace(' ', '')  # strip every space
Example #23
0
def get_code_text():
    """Download the captcha image, denoise it and return its alphanumeric text.

    The image is converted to greyscale and binarised with a neighbourhood
    rule.  A pixel becomes black only when it and both its right and lower
    neighbours are darker than the threshold; everything else, including the
    last column and row, becomes white.  When ``is_save`` is set, the
    intermediate images are written under ``D:\\data``.

    :return: the OCR text with every non-word character removed
    """
    r = session.get(url=img_url)
    img = Image.open(BytesIO(r.content))
    if is_save:
        img.save("D:\\data\\code{}.PNG".format(mobile_no))
    image = img.convert('L')
    if is_save:
        image.save("D:\\data\\code_black{}.PNG".format(mobile_no))
    pixels = image.load()

    # Binarisation threshold: pixels below it count as dark.
    threshold = 170
    last_x, last_y = image.width - 1, image.height - 1
    # Outline / neighbourhood denoising
    for x in range(image.width):
        for y in range(image.height):
            if x >= last_x or y >= last_y:
                # Edge filter: the last column/row has no right/lower neighbour.
                pixels[x, y] = 255
            elif (pixels[x, y] < threshold and pixels[x + 1, y] < threshold
                  and pixels[x, y + 1] < threshold):
                # Dark and part of a thick stroke: keep it and darken it fully.
                pixels[x, y] = 0
            else:
                # Thin lines are filtered out.
                pixels[x, y] = 255
    if is_save:
        image.save("D:\\data\\code_scan{}.PNG".format(mobile_no))
    testdata_dir_config = '--tessdata-dir "D:/Program Files (x86)/Tesseract-OCR/tessdata"'
    text_code = pytesseract.image_to_string(image, lang='eng', config=testdata_dir_config)
    # Drop illegal characters.  Raw string: "\W" in a normal literal is an
    # invalid escape sequence.
    return re.sub(r"\W", "", text_code)
Example #24
0
def sampleA(filepath):
    """OCR every cell of a 3x3 grid in the image at ``filepath`` and print each result."""
    # Width and height of one cell, in pixels
    cell_w, cell_h = 80, 65
    # Margin pixels used to skip the frame lines
    margin_w, margin_h = 20, 15

    grid = Image.open(filepath, "r")

    # Path of the installed tesseract command
    pytesseract.tesseract_cmd = "/usr/bin/tesseract"

    for row in range(3):
        # Top edge of the cell being read
        top = margin_h + row * (cell_h + margin_h)
        for col in range(3):
            # Left edge of the cell being read
            left = margin_w + col * (cell_w + margin_w)

            # Trim out the cell
            cell = grid.crop((left, top, left + cell_w, top + cell_h))

            # -psm 10 is the single-character recognition flag
            result = pytesseract.image_to_string(cell, config="-psm 10 -c tessedit_char_whitelist='0123456789-.'", lang="eng+jpn")

            print(result)
Example #25
0
def to_text(img: Image.Image) -> str:
    """Run OCR on an image and return the recognised text.

    Applies :abbr:`OCR (optical character recognition)` to the
    :class:`~PIL.Image.Image` :any:`img`.

    Notes
    -----
    Recognition is done with :mod:`pytesseract`.  When the ``tessdata_dir``
    entry of ``g.config`` is set, it is forwarded as ``--tessdata-dir``.

    Parameters
    ----------
    img: Image.Image
        The image to read.

    Returns
    -------
    str
        The text produced by OCR.
    """
    log.debug("Performing OCR.")
    tessdata_dir = g.config["tessdata_dir"]
    if tessdata_dir:
        config = f'--tessdata-dir "{tessdata_dir}"'
    else:
        config = None

    return pytesseract.image_to_string(img, config=config)
Example #26
0
 def extract_text(frames,
                  tess_psm="",
                  tess_char_whitelist=default_whitelist,
                  oem=""):
     """Return the OCR text of the first frame.

     :param frames: sequence of images; only ``frames[0]`` is read
     :param tess_psm: Tesseract page segmentation mode; omitted when empty
     :param tess_char_whitelist: characters Tesseract may output
     :param oem: Tesseract OCR engine mode; omitted when empty
     :return: the recognised text
     """
     options = ["-c tessedit_char_whitelist=" + tess_char_whitelist]
     # Only pass --oem / --psm when a value was given.  The empty defaults
     # used to yield a dangling "--oem  --psm " with no arguments.  str()
     # also lets callers pass the modes as ints.
     if oem not in ("", None):
         options.append("--oem " + str(oem))
     if tess_psm not in ("", None):
         options.append("--psm " + str(tess_psm))
     return pytesseract.image_to_string(frames[0], config=" ".join(options))
Example #27
0
    def read_small_and_save(self, img_path, single_pic_doc, only_save=False):
        """
        Cut a picture into small images, optionally OCR them, and save them.

        :param img_path: path of the source picture
        :param single_pic_doc: directory the small images are saved into
        :param only_save: when True, skip OCR and only save the pieces
        :return: list of saved paths when ``only_save`` is True, else None
        """
        # Image.LANCZOS is the same filter as the old Image.ANTIALIAS alias,
        # which was removed in Pillow 10.
        resample = Image.LANCZOS

        small_img_objs = self.cutting_pic_update(img_path)
        if only_save:
            name_list = []
            for small_img_obj in small_img_objs:
                # The file name is a 0.1 s timestamp; wait so names do not clash.
                time.sleep(0.1)
                single_pic_path = '{0}/{1}.png'.format(single_pic_doc,
                                                       int(time.time() * 10))
                resized = small_img_obj.resize((20, 30), resample)
                resized.save(single_pic_path)
                name_list.append(single_pic_path)
            return name_list

        for small_img_obj in small_img_objs:
            text = pytesseract.image_to_string(small_img_obj,
                                               config='-psm 10')
            # Fall back to a placeholder when the text is not usable as a name.
            if not is_value_name(text):
                text = 'temp'
            small_img_path = '{0}/{1}_{2}.png'.format(
                single_pic_doc, int(time.time() * 10), ''.join(text))
            resized = small_img_obj.resize((20, 30), resample)
            resized.save(small_img_path)
            print("单个图识别结果:", text)
def parse_code():
    """Screenshot the page, crop the fixed code region and return its OCR text."""
    screenshot_path, crop_path = 'code.png', 'real_code.png'

    driver.save_screenshot(screenshot_path)
    page = cv2.imread(screenshot_path)
    # Rows 1042-1105 and columns 2282-2396 hold the code.
    cv2.imwrite(crop_path, page[1042:1106, 2282:2397])
    return pytesseract.image_to_string(crop_path)
Example #29
0
def input():
    """Grab the screen area between two mouse positions, OCR it and send a translation.

    The first corner is the mouse position at call time.  The second corner
    is the position when the middle button is pressed.  The OCR text and its
    Thai translation are passed to ``lineNotify``.
    """
    start = pyautogui.position()
    a1, a2 = start.x, start.y
    sleep(0.2)
    while True:
        current = pyautogui.position()
        b1, b2 = current.x, current.y
        if mouse.is_pressed(button='middle'):
            break
        # Short pause so the polling loop does not spin at full CPU.
        sleep(0.01)

    # Order the corners as (left, top, right, bottom) whichever way the user
    # dragged.  min/max also covers the case where one coordinate is equal,
    # which the old three-way swap left unordered.
    left, right = min(a1, b1), max(a1, b1)
    top, bottom = min(a2, b2), max(a2, b2)

    grab(bbox=(left, top, right, bottom)).save('screen.png')
    screen = Image.open('screen.png')
    vocab = pytesseract.image_to_string(screen)
    sent = '\n' + vocab + "\n" + translator.translate(vocab, dest='th').text
    lineNotify(sent)
Example #30
0
def extracteddetails(filename):
    """Extract vacancy details from an uploaded image, or save the posted form.

    The uploaded image is OCR'd and its text is run through the ``extract_*``
    helpers.  On POST the submitted form values are inserted into the
    ``vacancies`` table, matching is triggered for the new row, and the
    vacancy list is rendered.  Otherwise the extracted values are rendered
    (JSON-encoded) into ``vacancyextract.html``.

    Relies on the module-level names ``ar``, ``mycursor``, ``mydb`` and ``app``.

    :param filename: name of the uploaded file inside ``UPLOAD_FOLDER``
    :return: the rendered template
    """
    c_id = ar[0]
    # Parameterised query instead of %-formatting the id into the SQL string.
    mycursor.execute("SELECT * from vacancies where company_id=%s", (c_id,))
    values1 = mycursor.fetchall()
    pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    text = pytesseract.image_to_string(Image.open((os.path.join(app.config['UPLOAD_FOLDER'], filename))), lang='eng')

    # The old split argument ``'\n' or '   ' or ...`` evaluates to '\n' alone,
    # so splitting on '\n' is what the code always did.
    txt = [line.strip() for line in text.split('\n') if line.strip() != '']
    print(txt)

    otherskills = []
    experience = []
    for line in txt:
        if extract_otherskills(line) is not None:
            otherskills.append(line)
        # ``experience`` used to be reset to [] on every pass, so only a match
        # on the very last line survived.  Collect across all lines.
        if extract_experience(line) is not None:
            experience.append(line)

    job = extract_job(text)
    skills = extract_skills2(text)
    education = extract_education(text)

    vjob = json.dumps(job)
    ved = json.dumps(education)
    vsk = json.dumps(skills)
    vos = json.dumps(otherskills)
    vex = json.dumps(experience)

    if request.method == 'POST':
        jobv = request.form['vjob']
        eduv = request.form['vedu']
        expv = request.form['vexp']
        eyv = request.form['vey']
        vskv = request.form['vski']
        osv = request.form['voski']
        sql_vacancies = "INSERT INTO vacancies (company_id, job, education, experience, ex_year,skills,other_skills) VALUES(%s,%s,%s,%s,%s,%s,%s) "
        val_vacancies = (c_id, jobv, eduv, expv, eyv, vskv, osv)
        mycursor.execute(sql_vacancies, val_vacancies)
        mydb.commit()
        vid = mycursor.lastrowid
        print("VID: " + str(vid))
        vMatcher.VacancyMatching().matchingByVacencyId(vid)
        flash("successfully saved")
        return render_template("vacancy.html", data=values1, vvid=c_id)

    return render_template('vacancyextract.html', vvjob=vjob, vved=ved, vvsk=vsk, vvos=vos, vvex=vex)
def run_tesseract(input_file, output_file, language="deu"):
    """
    Enhance an image, run Tesseract on it and save the recognised text.

    :param input_file: str
        Path to image to OCR
    :param output_file: str
        Path to output file
    :param language: str
        Tesseract language code passed as ``lang`` (default "deu")
    :return: void
        Runs tesseract on image and saves result
    """

    print(ORANGE + '\t~: ' + RESET + 'Parse image using pytesseract' + RESET)
    print(ORANGE + '\t~: ' + RESET + 'Parse image at: ' + input_file + RESET)
    print(ORANGE + '\t~: ' + RESET + 'Write result to: ' + output_file + RESET)

    with io.BytesIO() as transfer:
        # Pre-process with Wand and hand the result over through memory.
        with WandImage(filename=input_file) as img:
            img.auto_level()
            img.sharpen(radius=0, sigma=4.0)
            img.contrast()
            img.save(transfer)

        with Image.open(transfer) as img:
            image_data = pytesseract.image_to_string(img, lang=language, timeout=60)

    # Context manager so the output file is closed even if the write fails.
    with open(output_file, "w") as out:
        out.write(image_data)
Example #32
0
 def code_veryfy(self, link):
     """Download the verification image at ``link`` and return its OCR text.

     The image is stored as ``verifycode.jpg`` in the working directory.

     :param link: URL of the verification-code image
     :return: the recognised text, or an empty string when the download or
         the recognition fails
     """
     local = 'verifycode.jpg'
     # Set a default first: when the try block failed, ``text`` was never
     # assigned and the return raised UnboundLocalError.
     text = ''
     try:
         urllib.request.urlretrieve(link, local)
         with Image.open(local) as image1:
             text = image_to_string(image1)
     except Exception:
         print("Error occured in code verifing... ")
     return text
Example #33
0
def get_random_code():
    """Fetch the captcha image from the hard-coded URL and OCR it.

    The response body is saved to ``tmp.png`` (relative path) and that file
    is then passed to pytesseract.

    :return: the value returned by ``pytesseract.image_to_string``
    """
    resp = requests.get("http://10.3.254.23:8080/dangwebx/randCodeImage?a=1444784364985", stream=True)
    with open("tmp.png", "wb") as f:
        # A single streamed copy is enough: the raw body can only be consumed
        # once, so a second read/copy would add nothing to the file.
        shutil.copyfileobj(resp.raw, f)

    # Close the image file as soon as the text has been extracted.
    with Image.open("tmp.png") as image:
        return pytesseract.image_to_string(image)
Example #34
0
def perform_ocr_scoreboard(image: Image.Image, column: str, start=START, end=END, diff=DIFF)->(str, int, None):
    """Perform OCR on a part of an image

    Thresholds are tried from ``start`` towards ``end`` (exclusive) in steps
    of ``diff``; the first threshold that yields an acceptable result wins.
    For columns listed in ``DIGITS`` the result must be a decimal number: two
    extra attempts with ``-psm 10`` (the second on a blurred template) are
    made per threshold, and ``match_digit`` is used as a last resort.

    :param image: image region to recognise
    :param column: column name; decides whether a number is expected
    :param start: first threshold passed to ``high_pass_invert``
    :param end: threshold at which to stop (not tried itself)
    :param diff: positive step by which the threshold is lowered
    :return: ``int`` for number columns (``0`` when nothing valid was read);
        otherwise the text with newlines and double spaces removed, or
        ``None`` when nothing was read
    """
    is_number = column in DIGITS

    def usable(text) -> bool:
        """Tell whether ``text`` is an acceptable final result for this column."""
        if not text:
            # Empty OCR output, or None because no threshold was tried at all.
            return False
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscript digits, which int() below would reject.
        return text.isdecimal() if is_number else True

    result = None
    for treshold in range(start, end, -diff):  # Continue until result is valid
        template = high_pass_invert(image, treshold)
        result = tesseract.image_to_string(template, config="" if is_number else WHITELIST)
        # NOTE(review): "-psm" is the legacy single-dash spelling; newer
        # Tesseract releases expect "--psm" - confirm against the installed version.
        if is_number and not result.isdecimal():  # Try again for numbers
            result = tesseract.image_to_string(template, config="-psm 10")
        if is_number and not result.isdecimal():
            template = template.filter(ImageFilter.GaussianBlur())
            result = tesseract.image_to_string(template, config="-psm 10")
        if usable(result):
            break
    if is_number and not usable(result):
        result = match_digit(image)
    if not usable(result):
        return 0 if is_number else None
    if is_number:
        return int(result)
    return result.replace("\n", "").replace("  ", "")
Example #35
0
    def recognize_captcha(self,r=None, data=None):
        """Store the captcha bytes on disk and return what Tesseract reads from them.

        :param r: value interpolated into the name of the saved PNG file
        :param data: raw image bytes; when ``None`` nothing is processed
        :return: ``'0000'`` when ``data`` is ``None``, otherwise the OCR
            output obtained with the ``digits`` config
        """
        if data is None:
            return '0000'

        # Keep a copy of the captcha on disk.
        path = 'd:/code_{}.png'.format(r)
        with open(path, 'wb') as dump:
            dump.write(data)

        # In-memory stream over the same bytes, positioned at the start.
        buffer = BytesIO(data)
        print(type(Image.open(buffer)))
        picture = Image.open(buffer)
        captcha = pytesseract.image_to_string(picture, config='digits')
        print(captcha)
        return captcha
Example #36
0
def perform_ocr(image: Image.Image) -> (None, str):
    """Run OCR over successively thresholded versions of an Image.

    Thresholds run from ``START`` towards ``END`` in steps of ``-DIFF``.
    The first non-empty recognition result is returned; ``None`` is
    returned when Tesseract is not installed or no threshold produced text.
    """
    if not is_installed():
        print("[Tesseract] Critical error: Tesseract is not installed!")
        return None
    for level in range(START, END, -DIFF):
        text = tesseract.image_to_string(high_pass_invert(image, level))
        if text != "":
            # First threshold that produced any text wins.
            return text
    return None
Example #37
0
            # <h2>6/500</h2>
            m = re.search("<h2>(\d+)/\d+</h2>", data)
            if m is not None:
		s = m.group(1)
                print(s)
                if int(s) < lastScore:
                    lastScore = int(s)
                    im.show()  # debug, display failed captcha to improve blacklist
                else:
                    lastScore = int(s)
            f = opener.open("http://ctfquest.trendmicro.co.jp:8080/acf2e28903f698eda9bdefc043b6edc3/image")
            image_file = io.BytesIO(f.read())
            im = Image.open(image_file)
            im = im.convert('RGB')
            filling = fix_colors(im)
            black_and_white(im, filling)
            text = pytesseract.image_to_string(im,
                                               config="-psm 8 --user-patterns /cygdrive/d/tess/pattern.txt /cygdrive/d/tess/conf.txt")
            text = text.replace(" ", "")
            if not on_blacklist(text):
                print(text)
                params = {"captcha": text}
                params.update({c.name: c.value for c in cookies.cookiejar})
                encoded_params = urllib.urlencode(params)
                opener.open("http://ctfquest.trendmicro.co.jp:8080/acf2e28903f698eda9bdefc043b6edc3/challenge",
                            encoded_params)
            else:
                print "Possibly wrong decode " + text + " skipping"
	except:
        pass
Example #38
0
import shutil
from PIL import Image
from pytesseract import pytesseract
import requests

__author__ = 'bida'

# Download the captcha image and keep the raw body available as a stream.
resp = requests.get("http://10.3.254.23:8080/dangwebx/randCodeImage?a=1444784364985", stream=True)
with open("tmp.png", "wb") as f:
    # A single streamed copy is enough: the raw body can only be consumed
    # once, so a second read/copy would add nothing to the file.
    shutil.copyfileobj(resp.raw, f)

image = Image.open("tmp.png")
# A parenthesised single argument prints identically under Python 2 and 3.
print(pytesseract.image_to_string(image))
Example #39
0
 def print_text(cls, img):
     """OCR ``img`` with the English Tesseract language data and print the text, if any."""
     text = pytesseract.image_to_string(img, lang="eng")
     # Nothing is printed when the OCR result is empty.
     if text:
         print(text)