def pdfToTxt(filename):
    """OCR ``data/<name>.pdf`` and write the text to ``data/text/<name>.txt``.

    ``<name>`` is ``filename`` with its last four characters removed, so the
    argument is expected to end in a four-character extension such as
    ``.pdf``.

    :param filename: name of the PDF inside the ``data`` directory.
    """
    print('Extracting {}'.format(filename))
    print(
        'WARNING: Corrupted text on file may retrive errors during extraction')

    # Drop the trailing four characters (the extension).
    filename = filename[0:-4]

    page_texts = []
    # Context managers release the ImageMagick resources deterministically.
    with wi(filename='data/{}.pdf'.format(filename), resolution=300) as pdfFile:
        with pdfFile.convert('jpeg') as image:
            for img in image.sequence:
                with wi(image=img) as imgPage:
                    imgBlob = imgPage.make_blob('jpeg')
                page_image = Image.open(io.BytesIO(imgBlob))
                page_texts.append(
                    pytesseract.image_to_string(page_image, lang='eng'))

    # Pages are concatenated without a separator.
    with open('data/text/{}.txt'.format(filename), 'w') as out_file:
        out_file.write(''.join(page_texts))
def get_text(pdf_location, res=120, page=None):
    """OCR a PDF and return the cleaned text of each rendered page.

    Only the part of ``pdf_location`` after its last backslash is used as
    the file name, so the file is opened relative to the current working
    directory.

    :param pdf_location: location of the PDF (backslash-separated).
    :param res: rendering resolution passed to wand.
    :param page: optional page index; when given it is appended to the file
        name as ``[page]`` so that only that page is read.
    :return: list with one ``_clean``-ed string per rendered page.
    """
    import io
    from PIL import Image
    import pytesseract
    from wand.image import Image as wi
    from clean import _clean

    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'

    # rpartition yields the whole string when no backslash is present.
    FILE = pdf_location.rpartition("\\")[2]
    source = FILE if page is None else FILE + "[" + str(page) + "]"

    extracted_text = []
    with wi(filename=source, resolution=res) as pdf:
        with pdf.convert('jpeg') as pdfImg:
            for img in pdfImg.sequence:
                # A separate name keeps the ``page`` argument intact.
                with wi(image=img) as page_image:
                    imgBlob = page_image.make_blob('jpeg')
                im = Image.open(io.BytesIO(imgBlob))
                text = pytesseract.image_to_string(im, lang='eng',
                                                   config='--psm 6')
                extracted_text.append(_clean(text))
    return extracted_text
def pdfocr(path, pages=None, lang='eng'):
    """OCR a PDF and return the recognized text of each selected page.

    :param path: path of the PDF file to read.
    :param pages: optional collection of 1-based page numbers to recognize;
        when empty or ``None`` every page is processed.
    :param lang: language code handed to Tesseract.
    :return: list with one string per processed page in document order, or
        ``None`` when ``path`` is empty.
    """
    if not path:
        print('Path is empty')
        return None

    pytesseract.pytesseract.tesseract_cmd = ConfigOcr.path_to_tesseract

    result_text = []
    # Context managers release the ImageMagick resources deterministically.
    with wi(filename=path, resolution=300) as pdf:
        with pdf.convert('jpeg') as pdfImage:
            for count, img in enumerate(pdfImage.sequence, start=1):
                if pages and count not in pages:
                    continue
                with wi(image=img) as imgPage:
                    imgBlob = imgPage.make_blob('jpeg')
                im = Image.open(io.BytesIO(imgBlob))
                result_text.append(
                    pytesseract.image_to_string(im, lang=lang))
    return result_text
def is_crypto(_filename):
    """Report whether the OCR text of a PDF contains a crypto keyword.

    Pages are recognized one at a time and the search stops at the first
    page containing ``"crypto"`` or ``"second"`` (case-sensitive), so later
    pages are not OCR'd needlessly.

    :param _filename: path of the PDF to inspect.
    :return: ``True`` when a keyword is found on any page, else ``False``.
    """
    keywords = ("crypto", "second")

    with wi(filename=_filename, resolution=140) as pdf:
        with pdf.convert('jpeg') as pdf_image:
            for img in pdf_image.sequence:
                with wi(image=img) as img_page:
                    img_blob = img_page.make_blob('jpeg')
                im = Image.open(io.BytesIO(img_blob))
                text = pytesseract.image_to_string(im, lang='eng')
                if any(keyword in text for keyword in keywords):
                    print("CRYPTO STATEMENT!!!")
                    return True

    print("PRAESCIRE STATEMENT")
    return False
def generate_text(pdf_location, res=300):
    """Lazily OCR a PDF, yielding the raw text of one page at a time.

    Only the part of ``pdf_location`` after its last backslash is used as
    the file name, so the file is opened relative to the current working
    directory.

    :param pdf_location: location of the PDF (backslash-separated).
    :param res: rendering resolution passed to wand.
    :return: generator of page texts in document order.
    """
    import io
    from PIL import Image
    import pytesseract
    from wand.image import Image as wi

    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'

    # rpartition yields the whole string when no backslash is present.
    FILE = pdf_location.rpartition("\\")[2]

    # The document stays open while the generator is being consumed and is
    # released once it is exhausted or closed.
    with wi(filename=FILE, resolution=res) as pdf:
        with pdf.convert('jpeg') as pdfImg:
            for img in pdfImg.sequence:
                with wi(image=img) as page:
                    imgBlob = page.make_blob('jpeg')
                im = Image.open(io.BytesIO(imgBlob))
                yield pytesseract.image_to_string(im, lang='eng',
                                                  config='--psm 6')
def is_crypto(_filename):
    """Tell whether the given PDF is a crypto statement.

    Every page is rendered at 100 dpi and OCR'd; the file counts as a crypto
    statement when the (case-sensitive) word ``"crypto"`` shows up on any
    page.

    :param _filename: path of the PDF to inspect.
    :return: ``True`` for a crypto statement, ``False`` otherwise.
    """
    source = wi(filename=_filename, resolution=100)
    rendered = source.convert('jpeg')

    # Render every page to a JPEG blob first, then recognize them all.
    page_blobs = [
        wi(image=frame).make_blob('jpeg') for frame in rendered.sequence
    ]
    page_texts = [
        pytesseract.image_to_string(Image.open(io.BytesIO(blob)), lang='eng')
        for blob in page_blobs
    ]

    needle = "crypto"
    if any(needle in page_text for page_text in page_texts):
        print("WE FOUND IT EXITING")
        print("CRYPTO STATEMENT!!!")
        return True

    print("PRAESCIRE STATEMENT")
    return False
def pdf_to_image(filepath):
    """Upload an encrypted copy of a file to IPFS and split a PDF into PNGs.

    The file is encrypted, the encrypted copy is added to the local IPFS
    daemon and then deleted. If ``filepath`` ends in ``.pdf`` each page is
    saved as ``"<stem> page <n>.png"`` next to the source.

    :param filepath: path of the file to process.
    :return: ``(image_paths, filehash)`` for a PDF, where ``image_paths`` is
        the list of written PNG paths; ``(filepath, filehash)`` otherwise.
        ``filehash`` is ``" "`` when the IPFS daemon cannot be reached.
    """
    # NOTE(review): the passphrase is hard-coded here; confirm this is
    # intentional before shipping.
    encrypted = encrypt(getKey("abc123"), filepath)

    try:
        api = ipfsapi.connect('127.0.0.1', 5001)
        new_file = api.add(encrypted)
        filehash = new_file['Hash']
    except ipfsapi.exceptions.ConnectionError:
        print("Check if Dameon is switched on")
        filehash = " "
    finally:
        # The encrypted copy is temporary; remove it even if the upload
        # fails with an unexpected error.
        os.remove(encrypted)

    file_name, extension = os.path.splitext(filepath)

    if extension != ".pdf":
        print("not pdf")
        return filepath, filehash

    imagelist = []
    with wi(filename=filepath, resolution=400) as source:
        for number, frame in enumerate(source.sequence, start=1):
            target = f"{file_name} page {number}.png"
            with wi(image=frame) as page:
                page.save(filename=target)
            imagelist.append(target)
    return imagelist, filehash
def ocr(f_path):
    """OCR a PDF and store the first page's text in ``output1.txt``.

    The first character of ``f_path`` is dropped and the remainder is
    resolved against the current working directory. The text of the first
    page, followed by a newline, is written to
    ``receipt/media/txt/output1.txt``; the file is left empty when the PDF
    yields no pages.

    :param f_path: path of the PDF with one leading character to strip.
    """
    new_path = os.path.join(os.getcwd(), f_path[1:])

    recognized_text = []
    with wi(filename=new_path, resolution=300) as pdf:
        with pdf.convert('jpg') as pdfImage:
            for img in pdfImage.sequence:
                with wi(image=img) as imgPage:
                    blob = imgPage.make_blob('jpeg')
                im = Image.open(io.BytesIO(blob))
                recognized_text.append(
                    pytesseract.image_to_string(im, lang='eng'))

    # Write straight to the file rather than swapping sys.stdout, which
    # would stay redirected if anything above raised.
    with open('receipt/media/txt/output1.txt', 'w+') as f:
        if recognized_text:
            f.write(recognized_text[0] + '\n')
def pdfs2txts(file_path, folderName):
    """OCR every file of a folder into text files under ``Convert output``.

    Each entry of ``<file_path>/<folderName>`` is rendered page by page, the
    pages are OCR'd and the concatenated text is written to
    ``Convert output/<entry without '.pdf'>.txt`` (UTF-8).

    :param file_path: directory that contains ``folderName``.
    :param folderName: name of the folder holding the PDFs.
    """
    source_dir = os.path.join(file_path, folderName)
    reports_dir1 = os.listdir(source_dir)
    create_dir('Convert output')

    for elem in tqdm(reports_dir1):
        # Open the file from the directory that was actually listed.
        complete_name = os.path.join(source_dir, elem)
        print(complete_name)

        page_texts = []
        with wi(filename=complete_name, resolution=750, depth=8, height=50,
                background='white') as pdf:
            with pdf.convert('jpg') as pdfimage:
                for i, img in enumerate(pdfimage.sequence, start=1):
                    page_file = str(i) + '.jpg'
                    with wi(image=img) as page:
                        page.save(filename=page_file)
                    img_cv = cv2.imread(page_file)
                    # By default OpenCV stores images in BGR format and
                    # since pytesseract assumes RGB format, we need to
                    # convert from BGR to RGB format/mode:
                    img_rgb = cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB)
                    page_texts.append(pytesseract.image_to_string(img_rgb))

        out_name = os.path.join('Convert output',
                                elem.replace('.pdf', '')) + '.txt'
        with open(out_name, 'w', encoding='utf-8') as f:
            f.write(''.join(page_texts))
def convert(way):
    """Render every PDF found under ``way`` into numbered JPEG pages.

    The output directory is created on the first run and all pages of all
    PDFs are saved there as ``conv_<n>.jpg`` with one running counter. When
    the output directory already exists nothing is converted. Non-PDF files
    trigger an error dialog.

    NOTE(review): the pairing of the final ``else`` with the directory check
    is assumed from the message text - confirm against the original layout.

    :param way: root directory that is walked recursively.
    """
    # Raw strings: the backslashes are path separators, not escapes.
    out_dir = r'F:\python\open cv\jpegs'

    if os.path.exists(out_dir):
        print("File already exists")
        return

    os.mkdir(out_dir)
    i = 1
    for root, dirs, files in os.walk(way):
        for file in files:
            if not file.endswith(".pdf"):
                messagebox.showerror("error",
                                     "{} is not PDF file".format(file))
                continue
            path = os.path.join(root, file)
            with wi(filename=path, resolution=400) as pdf:
                with pdf.convert("jpeg") as pdfimage:
                    for imgs in pdfimage.sequence:
                        with wi(image=imgs) as page:
                            page.save(
                                filename=out_dir + r'\conv_{}.jpg'.format(i))
                        i += 1
            print("***proccessed****")
    print("=====converted=======")
def conv_pdf(pdf):
    """Return the text of ``./pdfs/<pdf>``, falling back to OCR.

    The embedded text is extracted first. If it holds 50 or fewer
    non-blank-trimmed characters the document is rendered and OCR'd instead,
    and the page texts are joined with three newlines.

    :param pdf: file name of the PDF inside the ``./pdfs/`` directory.
    :return: extracted or recognized text.
    """
    # Both the text extraction and the OCR fallback read the same file.
    pdf_path = './pdfs/' + pdf

    with open(pdf_path, 'rb') as f:
        bfr = io.BufferedReader(f)
        pdf_text = convert_pdf_to_txt(bfr)

    if len(pdf_text.strip()) > 50:
        return pdf_text

    recognized_text = []
    with wi(filename=pdf_path, resolution=200) as pdf_file:
        with pdf_file.convert('jpeg') as pdfImage:
            for img in pdfImage.sequence:
                with wi(image=img) as imgPage:
                    imgBlob = imgPage.make_blob('jpeg')
                im = Image.open(io.BytesIO(imgBlob))
                recognized_text.append(
                    pytesseract.image_to_string(im, lang='eng'))
    return '\n\n\n'.join(recognized_text)
def pdfOcrTel(file_name):
    """OCR a PDF with the Telugu language model and return the text.

    The text of every page, each followed by a newline, is also saved to
    ``extracted.txt`` in the current directory.

    :param file_name: path of the PDF to recognize.
    :return: the concatenated page texts.
    """
    extracted_text = []
    with wi(filename=file_name, resolution=300) as pdf:
        with pdf.convert('jpeg') as pdfImage:
            for img in pdfImage.sequence:
                with wi(image=img) as imgPage:
                    imgBlob = imgPage.make_blob('jpeg')
                im = Image.open(io.BytesIO(imgBlob))
                extracted_text.append(
                    pytesseract.image_to_string(im, lang='tel'))

    text = ''.join("%s\n" % item for item in extracted_text)

    # Telugu text is not representable in many platform default encodings,
    # so the file is written as UTF-8 explicitly.
    with open('extracted.txt', 'w', encoding='utf-8') as fin:
        fin.write(text)

    # Return the in-memory text instead of reading the file back.
    return text
def Get_text_from_image():
    """Extract the text content of a fixed PDF via OCR.

    The PDF path and the Tesseract executable path are hard-coded. Newlines
    in the recognized text are replaced by spaces and the pages are
    concatenated without a separator.

    :return: the recognized text of all pages as one string.
    """
    import io
    import pytesseract
    from PIL import Image
    from wand.image import Image as wi

    pytesseract.pytesseract.tesseract_cmd = r'C:\Users\user\AppData\Local\Tesseract-OCR\tesseract.exe'

    extracted_text = []
    # Context managers free the ImageMagick resources deterministically,
    # which replaces the repeated manual garbage-collection passes.
    with wi(filename='C:\\Users\\user\\Desktop\\Labs\\INE033L07EO0.pdf',
            resolution=300) as pdf:
        with pdf.convert('jpeg') as pdfImg:
            for img in pdfImg.sequence:
                with wi(image=img) as page:
                    imgBlob = page.make_blob('jpeg')
                im = Image.open(io.BytesIO(imgBlob))
                text = pytesseract.image_to_string(im, lang='eng')
                # The first replace targets a literal backslash followed by
                # "n"; the second handles real newline characters.
                text = text.replace(r"\n", " ").replace("\n", " ")
                extracted_text.append(text)

    return ''.join(extracted_text)
def get_pdf(file_path):
    """Split a PDF into one image per page.

    :param file_path: the PDF content as a bytes blob (despite the name, the
        value is handed to wand as ``blob=``, not as a file name).
    :return: list of wand images, one per page, converted to ``jpg``. The
        caller owns these images and is responsible for closing them.
    """
    pdf_pages = []
    with wi(blob=file_path, resolution=600) as pdf:
        with pdf.convert("jpg") as pdfimage:
            print(pdfimage)
            # Iterate over all pages extracted from the PDF document.
            for img in pdfimage.sequence:
                with wi(image=img) as page:
                    # convert() returns an independent copy, so closing the
                    # intermediate page does not affect the returned image.
                    pdf_pages.append(page.convert('jpg'))
    return pdf_pages
def answers_extr(page_name, ans_count):
    """OCR a PDF and append its answer lines to ``Answers.txt``.

    For every recognized line that mentions ``Correct Option`` or
    ``Correct Answer`` the counter is advanced and a
    ``For Question <n>`` header is written. Every line accepted by the
    filter pattern is then written with leading numbering and commas
    removed.

    :param page_name: path of the PDF to read.
    :param ans_count: number of answers written so far.
    :return: the updated answer count.
    """
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'

    # Rejects lines that contain any of the listed words, or that end in one
    # of the characters S, C, O, R, E. Compiled once for all pages.
    ans = re.compile('(?!.*(Time|TARGET|ANSWER|DAY)).*[^SCORE]$')
    # Strips "1."-style numbering and commas.
    numbering = re.compile(r'[\d{1}|\d{2}]\.|,')

    with open('Answers.txt', 'a') as a:
        with wi(filename=page_name, resolution=300) as pdf:
            with pdf.convert('jpeg') as pdfImage:
                for img in pdfImage.sequence:
                    with wi(image=img) as imgPage:
                        imgBlob = imgPage.make_blob('jpeg')
                    im = Image.open(io.BytesIO(imgBlob))
                    text = pytesseract.image_to_string(im, lang='eng')
                    for line in text.split('\n'):
                        if "Correct Option" in line or "Correct Answer" in line:
                            ans_count += 1
                            a.write('\n' +
                                    "For Question {}.---------------".format(
                                        ans_count))
                        if ans.match(line):
                            a.write('\n' + numbering.sub('', line))
    return ans_count
def pdf_ocr(path, pages=None, lang='eng'):
    """Return the text of a PDF obtained by optical character recognition.

    :param path: path to the PDF file.
    :param pages: optional collection of 1-based page numbers to recognize;
        when empty or ``None`` every page is processed.
    :param lang: language code handed to Tesseract.
    :return: the recognized page texts joined by single spaces, or ``None``
        when ``path`` is empty.
    """
    if not path:
        print('Path is empty')
        return None

    pytesseract.pytesseract.tesseract_cmd = tesseract_path

    result_text = []
    # Context managers release the ImageMagick resources deterministically.
    with wi(filename=path, resolution=300) as pdf:
        with pdf.convert('jpeg') as pdf_image:
            for count, img in enumerate(pdf_image.sequence, start=1):
                if pages and count not in pages:
                    continue
                with wi(image=img) as img_page:
                    img_blob = img_page.make_blob('jpeg')
                im = Image.open(io.BytesIO(img_blob))
                result_text.append(
                    pytesseract.image_to_string(im, lang=lang))
    return ' '.join(result_text)
def questions_extr(page_name, count):
    """OCR a PDF and append questions and options to text files.

    Recognized lines are appended to ``Questions.txt`` and ``Options.txt``.
    Each detected option group advances ``count`` and writes a numbered
    header to both files.

    NOTE(review): the nesting of the option-handling loops below was
    recovered from a flattened listing - confirm against the original file.

    :param page_name: path of the PDF to read.
    :param count: number of questions written so far.
    :return: the updated question count.
    """
    pdf = wi(filename=page_name, resolution=500)
    pdfImage = pdf.convert('jpeg')
    imageBlobs = []
    # Render every page to a JPEG blob before any OCR takes place.
    for img in pdfImage.sequence:
        imgPage = wi(image=img)
        imageBlobs.append(imgPage.make_blob('jpeg'))
    recognized_text = []  # never read in this function
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'
    # Both outputs are opened in append mode and closed at the end; they are
    # not closed if an exception is raised in between.
    q = open('Questions.txt', 'a')
    o = open('Options.txt', 'a')
    for imgBlob in imageBlobs:
        im = Image.open(io.BytesIO(imgBlob))
        text = pytesseract.image_to_string(im, lang='eng')
        # NOTE(review): the character range ``A-z`` also covers the
        # punctuation between 'Z' and 'a' - presumably ``A-Z`` was meant.
        quest = re.compile(
            '(?!.*(\(|Time|TARGET|TEST|DAY|Maximum))[0-9]?[a-zA-z]?.*[^SCORE]$'
        )  #Pattern to select answers
        # Same filter without the "(" exclusion, plus two-digit numbering.
        non_waste = re.compile(
            '(?!.*(Time|TARGET|TEST|DAY|Maximum|\d{2}\.)).*[^SCORE]$')
        options = re.compile('[(][a-c][)].*')  #Pattern to select options
        lines = text.split('\n')
        length = len(lines)
        i = 0
        # Manual index walk: the option branch consumes several lines at once.
        while (i < length):
            # Skip blank lines and short lines that start with a digit.
            if lines[i] == '' or len(lines[i]) <= 3 and lines[i][0].isdigit():
                i += 1
                continue
            if options.match(lines[i]):
                # Copy lines until one whose second character is 'd'.
                # NOTE(review): ``lines[i][1]`` raises IndexError on a line
                # shorter than two characters - confirm this cannot occur.
                while (i < length and lines[i][1] != 'd'):
                    if non_waste.match(lines[i]):
                        o.write('\n' + lines[i])
                    i += 1
                while (i < length and lines[i] == ''):
                    i += 1
                # NOTE(review): ``i`` may equal ``length`` here, in which
                # case ``lines[i]`` raises IndexError - verify.
                if len(lines[i]) == 3:
                    o.write('\n' + lines[i])
                    i += 1
                    while (i < length and lines[i] == ''):
                        i += 1
                # Copy the remaining non-blank lines of the option group.
                while (i < length and lines[i] != ''):
                    o.write('\n' + lines[i])
                    i += 1
                count += 1
                o.write('\n' + 'Options {}.------------'.format(count))
                q.write('\n' + "Question {}.----------------".format(count) +
                        '\n')
                continue
            if quest.match(lines[i]):
                # Remove two-digit numbering and commas from the line.
                text = re.sub('\d{2}\.|,', '', lines[i])
                q.write(' ' + text)
            i += 1
    q.close()
    o.close()
    return count
def pdfToImages(filepath):
    """Save each page of a PDF as ``<i>.jpg`` using the global counter ``i``.

    The module-level ``i`` supplies the file number and is incremented once
    per saved page, so numbering continues across calls.

    :param filepath: path of the PDF to render at 300 dpi.
    """
    global i

    rendered = wi(filename=filepath, resolution=300).convert('jpeg')
    for frame in rendered.sequence:
        wi(image=frame).save(filename="{}.jpg".format(i))
        i += 1
def toText(path='path to Offres Directorie'):
    """Collect the text of every offer directory and store it in MongoDB.

    The working directory is changed to ``path``. ``names.txt`` and
    ``references.txt`` are read as space-separated lists; for each non-empty
    name (with its first and last character stripped) the PDF files of that
    directory are OCR'd in French and the DOCX files are read, and the
    combined text is inserted into the ``offres-text`` collection.

    NOTE(review): references are assumed to be consumed one per non-empty
    directory name, and one document inserted per directory - confirm.

    :param path: directory that contains the offer directories.
    """
    # Set up the database.
    mydb = Mongo('TTProject', ['offres-info', 'offres-text'])

    # Move to the directory that contains the offers.
    os.chdir(path)
    with open('names.txt') as names_file:
        dirnames = names_file.read().split(" ")
    with open('references.txt') as references_file:
        references = references_file.read().split(" ")

    i = 0
    for directory in dirnames:
        if not len(directory):
            continue

        x = len(directory)
        dirname = str(directory)[1:x - 1]
        reference = references[i]
        name = dirname

        parts = [
            '############################ ' +
            dirname + ' ############################\n\n\n'
        ]

        dirPath = path + '\\' + dirname
        for filename in os.listdir(dirPath):
            parts.append('\n\n#################### ' + filename +
                         ' ####################\n\n')
            filePath = path + '\\' + dirname + '\\' + filename

            # PDFs: render each page and OCR it from memory.
            if filename.endswith('.pdf'):
                with wi(filename=filePath, resolution=300) as pdfFile:
                    with pdfFile.convert('jpeg') as images:
                        for frame in images.sequence:
                            with wi(image=frame) as imgPage:
                                page_blob = imgPage.make_blob('jpeg')
                            parts.append(
                                pytesseract.image_to_string(
                                    Image.open(io.BytesIO(page_blob)),
                                    lang='fra'))

            # DOCXs: paragraphs first, then every table cell.
            if filename.endswith('.docx'):
                document = Document(filePath)
                for p in document.paragraphs:
                    parts.append(p.text)
                parts.append("\n ###### Tables's Content ###### \n")
                for table in document.tables:
                    for row in table.rows:
                        for cell in row.cells:
                            for paragraph in cell.paragraphs:
                                parts.append(paragraph.text)

        # Insert into Mongo.
        offre = {'_id': reference, "name": name, 'text': ''.join(parts)}
        mydb.insert(offre, 'offres-text')
        i += 1
def parsesave(self, response):
    """Build an ``IndpdfItem`` from a downloaded bill PDF.

    The embedded text is extracted from the response body. If it holds 50
    or fewer characters after stripping, the body is rendered and OCR'd
    instead and the page texts are joined with three newlines.

    :param response: response whose ``body`` is the PDF and whose ``meta``
        carries ``numbname``, ``year``, ``chamber`` and ``topic``.
    :return: generator yielding a single ``IndpdfItem``.
    """
    url = response.url
    state = 'indiana'
    html = ''
    bill_name = response.meta.get('numbname')
    session = response.meta.get('year')
    chamber = response.meta.get('chamber').capitalize()
    topic = response.meta.get('topic')
    topic = ' '.join(topic) if topic else ''
    date = '#TODO'
    md5 = hashlib.md5(response.body).hexdigest()

    # Text embedded in the PDF.
    bfr = io.BufferedReader(io.BytesIO(response.body))
    pdf_text = convert_pdf_to_txt(bfr)

    # Recognized text (OCR) when the PDF has little or no embedded text.
    if len(pdf_text.strip()) <= 50:
        recognized_text = []
        # Render from the bytes already downloaded instead of handing the
        # URL to ImageMagick as a file name.
        with wi(blob=response.body, resolution=200) as pdf:
            with pdf.convert('jpeg') as pdfImage:
                for img in pdfImage.sequence:
                    with wi(image=img) as imgPage:
                        imgBlob = imgPage.make_blob('jpeg')
                    im = Image.open(io.BytesIO(imgBlob))
                    recognized_text.append(
                        pytesseract.image_to_string(im, lang='eng'))
        pdf_text = '\n\n\n'.join(recognized_text)

    yield IndpdfItem(url=url,
                     state=state,
                     html=html,
                     text=pdf_text,
                     bill_name=bill_name,
                     session=session,
                     chamber=chamber,
                     topic=topic,
                     date=date,
                     md5=md5)
def convertir(self):
    """Render ``example.pdf`` at 300 dpi into ``1.jpg``, ``2.jpg``, ...

    One JPEG is written to the current directory per page, numbered from 1.
    """
    rendered = wi(filename="example.pdf", resolution=300).convert("jpeg")
    for number, frame in enumerate(rendered.sequence, start=1):
        wi(image=frame).save(filename="{}.jpg".format(number))
def save_img_as_pdf(file_dir, filename, count, output_dir, output_filename):
    """Combine numbered JPEG pages into a single output document.

    Pages ``<file_dir>/<filename><n>.jpg`` for ``n`` from 1 up to, but not
    including, ``count`` are appended in order and the result is saved.

    :param file_dir: directory holding the page images.
    :param filename: common prefix of the page image names.
    :param count: exclusive upper bound of the page numbers.
    :param output_dir: directory of the combined document.
    :param output_filename: file name of the combined document.
    :return: path of the saved document.
    """
    page_prefix = os.path.join(file_dir, filename)
    output_path = os.path.join(output_dir, output_filename)

    with wi() as document:
        for page_number in range(1, count):
            page_file = '{}{}.jpg'.format(page_prefix, page_number)
            with wi(filename=page_file) as page:
                document.sequence.append(page)
        document.save(filename=output_path)

    return output_path
def p2f(name):
    """Render a PDF to PNG pages and run image processing on each page.

    Every page is saved as ``<n>.png`` (numbered from 1) in the current
    directory and passed to ``segment.image_processing``.

    NOTE(review): the result of the last page is assumed to be the intended
    return value - confirm against callers.

    :param name: path of the PDF to render at 300 dpi.
    :return: the processing result of the last page, or ``None`` when the
        PDF has no pages.
    """
    s = None
    with wi(filename=name, resolution=300) as pdf:
        with pdf.convert("png") as pdfimage:
            # Number the pages so that they do not overwrite each other.
            for i, img in enumerate(pdfimage.sequence, start=1):
                page_file = str(i) + ".png"
                with wi(image=img) as page:
                    page.save(filename=page_file)
                s = segment.image_processing(page_file)
    return s
def pdfimgconvert():
    """Render ``Offer.pdf`` at 400 dpi into ``PDFImg/Image<n>.jpg`` files.

    Pages are numbered from 1. The ``PDFImg`` directory must already exist.
    """
    pdffilepath = "Offer.pdf"

    with wi(filename=pdffilepath, resolution=400) as pdf_file:
        # Iterate the converted copy so the conversion is actually used.
        with pdf_file.convert('jpg') as images:
            for number, img in enumerate(images.sequence, start=1):
                with wi(image=img) as page:
                    page.save(filename="PDFImg/Image" + str(number) + ".jpg")
def convertPDF(self, filename):
    """Render a PDF into JPEG blobs and initialise from the first page.

    ``self.initialize`` is called exactly once, with the blob of the first
    page, as soon as that page has been rendered.

    :param filename: path of the PDF to render at 300 dpi.
    :return: list of JPEG blobs, one per page.
    """
    blobs = []
    with wi(filename=filename, resolution=300) as pdf:
        with pdf.convert('jpeg') as pdfImage:
            # enumerate gives the page position directly, instead of
            # searching the sequence for the current frame on every page.
            for index, img in enumerate(pdfImage.sequence):
                with wi(image=img) as imgPage:
                    blobs.append(imgPage.make_blob('jpeg'))
                if index == 0:
                    self.initialize(blobs[0])
    return blobs
def _create_to_page(j, lis, x):
    """Render one ledger PDF into numbered JPEG pages.

    The PDF ``PDF/LIBROS CONTABLES/<lis>/<x>`` is rendered at 100 dpi and
    each page is saved as ``Libro Contable/pages/<j>.jpg``; ``j`` is
    advanced after every page and a progress line is printed.

    :param j: number to use for the first page that is written.
    :param lis: sub-directory of the ledger.
    :param x: file name of the PDF.
    :return: the page number following the last one written.
    """
    with wi(filename="PDF/LIBROS CONTABLES/" + lis + "/" + x,
            resolution=100) as pdf:
        with pdf.convert("jpg") as pdfImage:
            for img in pdfImage.sequence:
                with wi(image=img) as page:
                    page.save(
                        filename="Libro Contable/pages/" + str(j) + ".jpg")
                j += 1
                # Function-call form prints the same text on Python 2 and 3.
                print("Imprimiendo pagina......" + str(j))
    return j
def openthefile(self, path, fname):
    """Save every page of a PDF as a JPEG and record the page count.

    Pages are written to ``<path>/<base>-<n>.jpg`` with ``n`` starting at 0,
    where ``<base>`` is the last ``/``-separated component of ``fname``
    without its final four characters. ``self.pagecnt`` is set to the number
    of pages.

    :param path: output directory.
    :param fname: path of the PDF to render at 300 dpi.
    """
    # Computed once: it does not depend on the page.
    base_name = fname[:-4].split('/')[-1]

    with wi(filename=fname, resolution=300) as pdf:
        self.pagecnt = len(pdf.sequence)
        for i, img in enumerate(pdf.sequence):
            # The ".jpg" target name selects the output format on save.
            with wi(image=img) as page:
                page.save(filename=path + '/' + base_name + "-" + str(i) +
                          ".jpg")
def from_pdf(file_path, start, work_dir):
    """Render a PDF into ``page-NNN.png`` files inside ``work_dir``.

    :param file_path: path of the PDF to render at 300 dpi.
    :param start: number given to the first page; following pages count up.
    :param work_dir: directory that receives the PNG files.
    :return: path of the last page written, or ``None`` when the PDF has no
        pages.
    """
    page_fn = None
    with wi(filename=file_path, resolution=300) as pdf:
        with pdf.convert("png") as pdfImage:
            print(pdfImage)
            for pg_n, img in enumerate(pdfImage.sequence, start=start):
                page_fn = os.path.join(work_dir,
                                       'page-{:03}.png'.format(pg_n))
                with wi(image=img) as page:
                    page.save(filename=page_fn)
    return page_fn
def pdf2jpg(filename, source):
    """Render ``<source>/<filename>.pdf`` into numbered JPEG pages.

    Pages are saved beside the PDF as ``<stem>--<n>.jpg`` with ``n``
    starting at 1, where ``<stem>`` is ``filename`` up to its first dot.
    Paths are assembled with backslash separators.

    :param filename: name of the PDF without the ``.pdf`` extension.
    :param source: directory that holds the PDF and receives the JPEGs.
    """
    pdf_path = source + '\\' + filename + '.pdf'
    rendered = wi(filename=pdf_path, resolution=300).convert('jpeg')

    stem = str(filename).partition('.')[0]
    for number, frame in enumerate(rendered.sequence, start=1):
        target = source + '\\' + stem + '--' + str(number) + '.jpg'
        wi(image=frame).save(filename=target)
def pdf2image(pdfName, resolution=300, imgFormat='jpeg'):
    """Render every page of a PDF into an in-memory image blob.

    :param pdfName: path of the PDF to render.
    :param resolution: rendering resolution passed to wand.
    :param imgFormat: image format used for the conversion and the blobs.
    :return: list of image blobs, one per page, in document order.
    """
    rendered = wi(filename=pdfName, resolution=resolution).convert(imgFormat)
    return [
        wi(image=frame).make_blob(imgFormat) for frame in rendered.sequence
    ]