Ejemplo n.º 1
0
 def __langoptions__(self):
     """Return the sorted ``lang_options`` keys that Tesseract can serve.

     A key is kept when its mapped value appears in the list returned by
     ``pytesseract.get_languages``.
     """
     installed = pytesseract.get_languages(config='')
     return sorted(
         name for name, code in lang_options.items() if code in installed
     )
Ejemplo n.º 2
0
def main(inputfile, args, lang='eng', width=2000):
    """Run OCR on one image file and print and/or save the recognized text.

    Args:
        inputfile: Path of the image, loaded with ``cv2.imread``.
        args: Parsed command-line options. The attributes read here are
            ``lang``, ``scale``, ``verbose``, ``get_lang``,
            ``remove_special``, ``result`` and ``out``.
        lang: Tesseract language code used when ``args.lang`` is not set;
            ``None`` falls back to ``'eng'``.
        width: Target width in pixels of the resized image; replaced by
            ``int(args.scale)`` when ``args.scale`` is set.

    Raises:
        OSError: If the image could not be loaded.
    """
    # Resolve the language that is really passed to Tesseract. The OCR call
    # used to read ``args.lang`` and ignore ``lang``; ``args.lang`` stays the
    # first choice so existing callers get the same language, and ``lang``
    # is the fallback so the reported language matches the one used.
    requested_lang = getattr(args, 'lang', None)
    if requested_lang:
        lang = requested_lang
    elif lang is None:
        lang = 'eng'  # Default language
    if args.scale:
        width = int(args.scale)  # --scale overrides the target width
    if args.verbose:
        print('File:', inputfile)
        print('Starting ocr with lang:', lang)
    if args.get_lang:
        print('Languages:')
        print(', '.join(pytesseract.get_languages()))

    # Load image; cv2.imread returns None instead of raising on failure
    image = cv2.imread(inputfile)
    if image is None:
        raise OSError('Could not read image: ' + str(inputfile))

    # Resize to the target width, keeping the aspect ratio
    (h, w) = image.shape[:2]
    r = width / float(w)
    dim = (int(width), int(h * r))

    image = cv2.resize(image, dim, interpolation=cv2.INTER_AREA)
    # OpenCV loads pixels as BGR; convert to RGB before handing to pytesseract
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    if args.verbose:
        print("Downscaled to ", dim)

    # Perform OCR
    result = pytesseract.image_to_string(image, lang).strip()

    # Remove all special chars (--remove-special)
    if args.remove_special:
        if args.verbose:
            print("Removing special chars")
        for c in SPECIAL_CHARS:
            result = result.replace(c, '')

    # Output result (--verbose) or (--result)
    if args.verbose and args.result:
        print('OCR Result:', result)
    elif args.result:
        print(result)

    # Write result to file (--out)
    if args.out:
        outFileName = inputfile + '.txt'

        if args.verbose:
            print("Outputting to", outFileName)

        # Explicit UTF-8 so non-ASCII OCR text does not depend on the
        # platform's default encoding.
        with open(outFileName, 'w', encoding='utf-8') as fo:
            fo.write(result)
Ejemplo n.º 3
0
    def fetch_text_in_image(self, img):
        """Print the installed Tesseract languages, then OCR the image at ``img``.

        The image is read with OpenCV and converted from BGR to RGB before
        being passed to ``pytesseract.image_to_string`` with ``lang='eng'``
        and the ``--psm 11`` option. The recognized text is returned.

        Notes:
            * An image in a format pytesseract supports could be passed to
              ``image_to_string`` directly instead of going through OpenCV.
            * Additional language data is available from
              https://github.com/tesseract-ocr/tessdata and could be selected
              by adding a ``--tessdata-dir`` option to the config string.
        """
        # Show which languages are available
        print(pytesseract.get_languages(config=''))

        # OpenCV yields BGR pixels; pytesseract expects RGB
        bgr_pixels = cv2.imread(img)
        rgb_pixels = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)

        # "--psm 11" tells the engine the "page" is not a document of text
        return pytesseract.image_to_string(rgb_pixels,
                                           lang='eng',
                                           config=r'--psm 11')
Ejemplo n.º 4
0
def get_tesseract_languages(tessdata_path) -> list[str]:
    """Return the sorted languages Tesseract reports for ``tessdata_path``.

    Raises:
        RuntimeError: If querying Tesseract raised a ``RuntimeError``; the
            original traceback is printed and the error is chained.
        ValueError: If Tesseract reported no languages at all.
    """
    configure_tesseract_binary()

    try:
        tesseract_config = get_tesseract_config(tessdata_path)
        found = sorted(pytesseract.get_languages(config=tesseract_config))
    except RuntimeError as err:
        traceback.print_tb(err.__traceback__)
        message = (
            "Couldn't determine Tesseract information. If you pip installed NormCap "
            "make sure Tesseract is installed and configured correctly."
        )
        raise RuntimeError(message) from err

    if found:
        return found

    raise ValueError(
        "Could not load any languages for tesseract. "
        "On Windows, make sure that TESSDATA_PREFIX environment variable is set. "
        "On Linux/MacOS see if 'tesseract --list-langs' work is the command line."
    )
Ejemplo n.º 5
0
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract

# Point pytesseract at the tesseract binary when it is not on your PATH.
# Example: tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'

# OCR a PIL image with the default language
default_image = Image.open('test.png')
print(pytesseract.image_to_string(default_image))

# Show the languages that are available
available_languages = pytesseract.get_languages(config='')
print(available_languages)

# OCR an image containing French text
french_image = Image.open('test-european.jpg')
print(pytesseract.image_to_string(french_image, lang='fra'))

# Passing a relative or absolute path skips pytesseract's image conversion.
# NOTE: the file must then be in a format tesseract itself supports,
# otherwise tesseract returns an error.
direct_text = pytesseract.image_to_string('test.png')
print(direct_text)

# Batch mode: a single text file listing multiple image file paths
batch_text = pytesseract.image_to_string('images.txt')
print(batch_text)

# Timeout/terminate the tesseract job after a period of time
try:
    print(pytesseract.image_to_string('test.jpg',
                                      timeout=2))  # Timeout after 2 seconds
    print(pytesseract.image_to_string(
Ejemplo n.º 6
0
################
#    SETUP
################

# Create the config file if it is not already present, then read it.
if not configPath.exists():
    print("No config found")
    writeConfig()
readConfig()

# Report the version of the tesseract binary that pytesseract is using.
print("[INFO] currently using tesseract: " + str(pytesseract.get_tesseract_version()))

# List the installed OCR languages. Other languages can be installed with
#   sudo apt install tesseract-ocr-[language code]
langs = pytesseract.get_languages(config="")
print(f"[INFO] following languages are availible:\n   {langs}")

# Build the list of source files, ordered by numericalSort, and count them.
# NOTE(review): numericalSort is defined elsewhere; presumably it uses the
# `numbers` pattern below to order names by their numeric parts - confirm.
numbers = re.compile(r"(\d+)")  # captures each run of one or more digits
files = sorted(listdir(path=src_image_path), key=numericalSort)
totalNumOfImages = len(files)
if enable_debug:
    # Debug output: how many files were found and what they are called.
    print(f'[INFO] {totalNumOfImages} image files are present in "{src_image_path}"')
    print(f"[INFO] Following files are present:\n    {files}")

################
#    LOOP
################

# CHECK IF OCR CODE IS INCLUDED IN INSTALLED LANG's. NEEDS TO BE PRESENT FOR OCR!
Ejemplo n.º 7
0
def print_lang_codes():
    """Print one ``'<code>': ,`` line for every installed Tesseract language."""
    for code in pytesseract.get_languages(config=''):
        print(f"'{code}': ,")
Ejemplo n.º 8
0
# Text recognition from images using Pytesseract lib

import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'D:\installations\pytesseract\inst\Tesseract-OCR\tesseract.exe'
import cv2
from PIL import Image
import tensorflow as tf

# Read the image and convert it from OpenCV's BGR order to RGB
image = cv2.imread('sample3.jpg')
if image is None:
    # cv2.imread reports failure by returning None instead of raising
    raise FileNotFoundError("Could not read image 'sample3.jpg'")
img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Some functions of tesseract
print(pytesseract.image_to_string(img_rgb))  # Prints out the text
print(pytesseract.get_languages(config=''))  # List of available languages

#--------- Prints out the letters and corresponding coordinates and create BB around them ------

# Run character-box detection once and reuse the result for both the
# printout and the drawing loop (it used to be computed twice).
bounding_box_coordinates = pytesseract.image_to_boxes(img_rgb)
print(bounding_box_coordinates)  # Letters and corresponding coordinates
image_h, image_w, _ = img_rgb.shape
for bounding_box in bounding_box_coordinates.splitlines():
    print(type(bounding_box))  # string class
    print(bounding_box)
    # Each line is space separated: field 0 is skipped, fields 1-4 are the
    # box coordinates.
    bounding_box = bounding_box.split(' ')
    x1, y1, x2, y2 = (int(value) for value in bounding_box[1:5])

    # The y values are subtracted from the image height before drawing
    # (presumably because the boxes are measured from the bottom edge).
    cv2.rectangle(img_rgb, (x1, image_h - y1), (x2, image_h - y2), (0, 0, 255),
                  thickness=3)
Ejemplo n.º 9
0
def get_available_languages() -> set[str]:
    """Return the languages Tesseract reports, or ``AVAILABLE_LANGUAGES``.

    The fallback is used when querying Tesseract raises an ``OSError``
    (``EnvironmentError`` is the same class), which happens when
    ``tesseract_cmd`` is invalid.
    """
    try:
        installed = set(tess.get_languages())
    except OSError:
        return AVAILABLE_LANGUAGES
    return installed
Ejemplo n.º 10
0
    }
    </style>
    """,
            unsafe_allow_html=True)

# Read the logo through a context manager so the file handle is closed
# right away instead of being left open until garbage collection.
with open(LOGO_IMAGE, "rb") as logo_file:
    logo_base64 = base64.b64encode(logo_file.read()).decode()

# Page header: the logo embedded as a base64 data URI, next to the app name.
st.markdown(f"""
    <div class="container">
        <img class="logo-img" 
        src="data:image/png;base64,{logo_base64}">
        <p class="logo-text">GrumpyOCR</p>
    </div>
    """,
            unsafe_allow_html=True)

# Show the Tesseract version and the languages it reports.
tesseract_version = pytesseract.get_tesseract_version()
tesseract_languages = pytesseract.get_languages(config='')
st.markdown(f"Version: {tesseract_version}, Langs: {tesseract_languages}")

# Sidebar: choose the OCR language combination.
st.sidebar.markdown('### Настройки')
languages = ['eng+rus', 'rus', 'eng']
selected_languages = st.sidebar.selectbox('Язык', languages)

# A file must be selected before the rest of the script can run.
st.markdown("### 1 Выберите или перенесите файл с текстом")
upload_file_object = st.file_uploader('', ['pdf', 'png', 'jpg', 'jpeg'])
if not upload_file_object:
    st.stop()

# A small trick: cache the upload in a file.
filename = upload_file_object.name
if not upload_file_object.closed: