This Python package can be used for extracting text from PDF, TIF, JPG and PNG files.
from textgetter.gettxt import img_txt_extract
from textgetter.gettxt import tif_txt_extract
from textgetter.gettxt import pdf_txt_extract
if __name__ == "__main__":
# use img_txt_extract for extracting text from images like jpg,png etc
img_txt_extract('/home/user/test', '/home/user/output', ['jpeg','png'],ocr_path='C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe',
verbose=True)
# use tif_txt_extract for extracting text from tif files
tif_txt_extract('/home/user/test', '/home/user/output', verbose=True)
# use pdf_txt_extract for extracting text from pdf files
pdf_txt_extract('/home/user/test', '/home/user/output', verbose=True)
from textgetter.getdocx import img_txt_extract
from textgetter.getdocx import tif_txt_extract
from textgetter.getdocx import pdf_txt_extract
if __name__ == "__main__":
# use img_txt_extract for extracting text from images like jpg,png etc
img_txt_extract('/home/user/test', '/home/user/output', ['jpeg','png'], verbose=True)
# use tif_txt_extract for extracting text from tif files
tif_txt_extract('/home/user/test', '/home/user/output', ocr_path='C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe',
verbose=True)
# use pdf_txt_extract for extracting text from pdf files
pdf_txt_extract('/home/user/test', '/home/user/output', verbose=True)
from textgetter.getcsv import img_txt_extract
from textgetter.getcsv import tif_txt_extract
from textgetter.getcsv import pdf_txt_extract
if __name__ == "__main__":
# use img_txt_extract for extracting text from images like jpg,png etc
img_txt_extract('/home/user/test', '/home/user/output', ['jpeg','png'], verbose=True)
# use tif_txt_extract for extracting text from tif files
tif_txt_extract('/home/user/test', '/home/user/output', verbose=True)
# use pdf_txt_extract for extracting text from pdf files
pdf_txt_extract('/home/user/test', '/home/user/output', ocr_path='C:\\Program Files (x86)\\Tesseract-OCR\\tesseract.exe',
verbose=True)
from textgetter.getexcel import img_txt_extract
from textgetter.getexcel import tif_txt_extract
from textgetter.getexcel import pdf_txt_extract
if __name__ == "__main__":
# use img_txt_extract for extracting text from images like jpg,png etc
img_txt_extract('/home/user/test', '/home/user/output', ['jpeg','png'], verbose=True)
# use tif_txt_extract for extracting text from tif files
tif_txt_extract('/home/user/test', '/home/user/output', verbose=True)
# use pdf_txt_extract for extracting text from pdf files
pdf_txt_extract('/home/user/test', '/home/user/output', verbose=True)
- input_files_path - folder path for input files e.g., '/home/user/test'
- output_files_path - folder path for output files e.g., '/home/user/output'
- file_extensions - list of file extensions from input folder e.g., ['jpeg','png']
- ocr_path - path of tesseract ocr (Windows only), default e.g., 'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'; on Linux, ignore this argument
- verbose - for printing logs e.g., True/False
- input_files_path - folder path for input files e.g., '/home/user/test'
- output_files_path - folder path for output files e.g., '/home/user/output'
- ocr_path - path of tesseract ocr (Windows only) e.g., 'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'; on Linux, ignore this argument
- verbose - for printing logs e.g., True/False
This package uses Poppler for reading PDF files. On Windows, Poppler is included in the package, but on Linux it has to be installed manually.
We can download Poppler from the Poppler website (https://poppler.freedesktop.org/)
OR
We can install poppler using below command
sudo apt-get install python-poppler
This package uses Tesseract for extracting text from files. It has to be installed manually on both Windows and Linux.
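Use the Tesseract installer for Windows (for example, the UB Mannheim builds at https://github.com/UB-Mannheim/tesseract/wiki) to install Tesseract OCR on Windows OS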
Use below command for Linux OS
sudo apt install tesseract-ocr
sudo apt install libtesseract-dev
$ pip install textgetter