# Demo script: basic string helpers followed by reading the first page of a PDF.

# Title-case
string = 'HELLO ALL'
print(string.title())

# Split on whitespace
string = 'hello Everyone'
print(string.split())

# Split on an explicit separator
string = 'hello-Everyone'
print(string.split('-'))

# With maxsplit=1 the string is split at the first occurrence only
string = 'www.all.com'
print(string.split('.', maxsplit=1))

# PDF Reading
import PyPDF2

# Raw string: the backslashes in the Windows path must not be read as escape
# sequences (the non-raw form only worked because \A, \M and \D are invalid
# escapes, which newer Python versions warn about).
# rb - read the file in binary format; the with-block closes it even on error.
with open(r'F:\Algorithmica\MyCodes\DAAI Newbie document.pdf', 'rb') as f:
    pdf_reader = PyPDF2.PdfFileReader(f)

    # Number of pages
    print(pdf_reader.numPages)

    Page_one = pdf_reader.getPage(0)
    print(Page_one)

    # Extracting text from page 1 (kept in page_one_text for later use)
    page_one_text = Page_one.extractText()
page['/Annots'].append(indir) else: page[NameObject('/Annots')] = ArrayObject([indir]) if annot.popup: popup = _popup_annotation(indir) indir_popup = outpdf._addObject(popup) annot[NameObject('/Popup')] = indir_popup page['/Annots'].append(indir_popup) if __name__ == '__main__': import sys import PyPDF2 as pyPdf try: inpdf = pyPdf.PdfFileReader(open(sys.argv[1], 'rb')) except (IndexError, IOError): print("Needs PDF file as an argument.") raise SystemExit annot1 = highlight_annotation([ [100, 100, 400, 125] ], 'An argument is a connected series of statements intended to establish a proposition.', 'Graham Chapman', 'I came here for a good argument.') annot2 = text_annotation([100, 50, 125, 75], "No it isn't.", 'John Cleese', "No you didn't.") page = inpdf.getPage(0) outpdf = pyPdf.PdfFileWriter() add_annotation(outpdf, page, annot1) add_annotation(outpdf, page, annot2) outpdf.addPage(page)
# This is a script to delete the final page in a pdf
import PyPDF2 as pdf

with open('ANNEXE3.pdf', 'rb') as pdf1File:
    pdf1Reader = pdf.PdfFileReader(pdf1File)
    pdfWriter = pdf.PdfFileWriter()

    # Copy every page except the last one. The previous range(1, numPages)
    # dropped the FIRST page and kept the final one, the opposite of the
    # script's stated purpose.
    for pageNum in range(pdf1Reader.numPages - 1):
        pageObj = pdf1Reader.getPage(pageNum)
        pdfWriter.addPage(pageObj)

    # Write while the input is still open: the writer reads page content from it.
    with open('outputpdf.pdf', 'wb') as pdfOutputFile:
        pdfWriter.write(pdfOutputFile)
import PyPDF2
import sys

# Classify each PDF as an Etsy or Amazon order sheet from the first word of
# the text extracted from one of its pages.
#
# Files named on the command line take precedence; the built-in sample list is
# only a fallback (previously it unconditionally overwrote the arguments).
filenamelist = sys.argv[1:] or ["etsy.pdf", "amazon.pdf"]

for filename in filenamelist:
    with open(filename, 'rb') as pdf_file:
        currentfile = PyPDF2.PdfFileReader(pdf_file)
        # NOTE(review): index 2 is the third page although the variable is
        # called "firstpage" - confirm which page is intended.
        firstpage = currentfile.getPage(2).extractText()

    if len(firstpage) > 0:
        first_word = firstpage.split(" ")[0]
        if first_word == "Order":
            pdftype = "etsy"
        elif first_word == "Ship":
            pdftype = "amazon"
        else:
            pdftype = "error"
            print(firstpage)
    else:
        print(firstpage)
import pyttsx3
import PyPDF2

# Read a PDF book aloud, page by page, with the pyttsx3 text-to-speech engine.

# One engine instance is enough for the whole book.
bot = pyttsx3.init()

with open("An_Introduction_to_Software_Engineering_and_Fault_.pdf", "rb") as book:
    pdfReader = PyPDF2.PdfFileReader(book)
    pages = pdfReader.numPages

    # range(pages) covers every page; the former range(0, pages - 1)
    # silently skipped the last one.
    for num in range(pages):
        page = pdfReader.getPage(num)
        text = page.extractText()
        bot.say(text)
        bot.runAndWait()
#! python3
import sys
import os
import PyPDF2

# Encrypt every PDF found below a folder with the password given as the first
# command-line argument. Each result is written to the current working
# directory as "<name>_encrypted.pdf".
password = sys.argv[1]

# Add the path to os.walk
for folderName, subfolders, filenames in os.walk('XXXX'):
    for filename in filenames:
        if not filename.endswith('.pdf'):
            continue

        with open(os.path.join(folderName, filename), 'rb') as pdfFile:
            pdfReader = PyPDF2.PdfFileReader(pdfFile)
            print(pdfReader)

            pdfWriter = PyPDF2.PdfFileWriter()
            for pageNum in range(pdfReader.numPages):
                pdfWriter.addPage(pdfReader.getPage(pageNum))
            pdfWriter.encrypt(password)

            # splitext removes exactly the extension. str.strip('.pdf') removes
            # any leading/trailing '.', 'p', 'd', 'f' characters, so e.g.
            # "pdf_report.pdf" used to become "_report".
            base_name = os.path.splitext(filename)[0]
            print(base_name)

            # Write while the source is still open; the writer pulls page data from it.
            with open(base_name + '_encrypted.pdf', 'wb') as resultPdf:
                pdfWriter.write(resultPdf)
def make_pdf(dlg):
    """Join the page ranges listed in the dialog into a single output PDF.

    Each row of ``dlg.szr02.Table.data`` is read by index as: input file name,
    page count, first page, last page, rotation angle. Pages are copied with
    PyPDF2; unless ``dlg.noToC.Value`` is set, one level-1 bookmark per input
    file is added and that file's own table of contents is re-attached below
    it, shifted by one level.

    Returns:
        The output path taken from ``dlg.btn_aus.GetPath()``, or ``None`` when
        the table holds no rows (treated like "Cancel").
    """
    # no file selected: treat like "Cancel"
    if not len(dlg.szr02.Table.data):      # no files there
        return None

    cdate = wx.DateTime.Now().Format("D:%Y%m%d%H%M%S-04'30'")
    ausgabe = dlg.btn_aus.GetPath()
    pdf_out = PyPDF2.PdfFileWriter()
    aus_nr = 0                             # current page number in output
    pdf_dict = {
        "/Creator": "PDF-Joiner",
        "/Producer": "PyMuPDF, PyPDF2",
        "/CreationDate": cdate,
        "/ModDate": cdate,
        "/Title": dlg.austit.Value,
        "/Author": dlg.ausaut.Value,
        "/Subject": dlg.aussub.Value,
    }
    pdf_out.addMetadata(pdf_dict)
    parents = {}                           # bookmark level -> latest bookmark on that level

    # ==========================================================================
    # process one input file
    # ==========================================================================
    for zeile in dlg.szr02.Table.data:
        dateiname = zeile[0]
        doc = dlg.FileList[dateiname]
        max_seiten = int(zeile[1])

        # user input minus 1, PDF pages count from zero
        # also correct any inconsistent input
        von = int(zeile[2]) - 1
        bis = int(zeile[3]) - 1
        von = max(0, von)                  # "from" must not be < 0
        bis = min(max_seiten - 1, bis)     # "to" must not be > max pages - 1
        bis = max(von, bis)                # "to" cannot be < "from"
        rot = int(zeile[4])                # get rotation angle

        pdfin = PyPDF2.PdfFileReader(dateiname)
        for p in range(von, bis + 1):      # read pages from input file
            pdf_page = pdfin.getPage(p)
            if rot > 0:
                pdf_page.rotateClockwise(rot)   # rotate the page
            pdf_out.addPage(pdf_page)      # output the page

        if dlg.noToC.Value:                # no ToC wanted
            continue

        # title = "infile [from-to (max.pages)]"
        # NOTE(review): .encode("latin-1") yields bytes; under Python 3 the
        # "%s" formatting renders them as "b'...'" - confirm target version.
        bm_main_title = "%s [%s-%s (%s)]" % (
            os.path.basename(dateiname[:-4]).encode("latin-1"),
            von + 1, bis + 1, max_seiten)
        bm_main = pdf_out.addBookmark(bm_main_title, aus_nr,
                                      None, None, False, False, "/Fit")
        print(1, bm_main_title, aus_nr)
        parents[1] = bm_main               # lvl 1 bookmark is infile's title

        toc = fitz.GetToC(doc)             # get infile's table of contents
        bm_lst = []                        # prepare the relevant sub-ToC
        for t in toc:
            if t[2] > von and t[2] <= bis + 1:       # relevant page range only
                bm_lst.append([t[0] + 1,             # indent increased 1 level
                               t[1],                 # the title
                               t[2] + aus_nr - von - 1])  # new page number

        aus_nr += (bis - von + 1)          # increase output counter

        if bm_lst == []:                   # do we have a sub-ToC?
            continue                       # no, next infile

        # while indent gap is too large, prepend "filler" bookmarks to bm_lst
        while bm_lst[0][0] > 2:
            filler = [bm_lst[0][0] - 1, "<>", bm_lst[0][2]]
            bm_lst.insert(0, filler)

        # now add infile's bookmarks
        for b in bm_lst:
            bm = pdf_out.addBookmark(b[1].encode("latin-1"), b[2],
                                     parents[b[0] - 1], None, False, False,
                                     "/Fit")
            parents[b[0]] = bm

    # ==========================================================================
    # all input files processed
    # ==========================================================================
    # Open the output only now and inside a with-block: a failure while
    # processing inputs no longer leaves a truncated file and an open handle.
    with open(ausgabe, "wb") as pdf_fle_out:
        pdf_out.write(pdf_fle_out)
    return ausgabe
def get_pdf_title(pdf_file_path):
    """Return the title recorded in a PDF's document information.

    Args:
        pdf_file_path: Path of the PDF file to inspect.

    Returns:
        The ``title`` attribute of the document information, or ``None`` when
        the reader reports no document information at all.
    """
    # The with-block closes the handle; the title is read before it closes
    # because the reader resolves objects from the open stream.
    with open(pdf_file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        info = pdf_reader.getDocumentInfo()
        if info is None:
            return None
        return info.title
Adapted from work by Sylvain Pelissier http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python ''' import sys import PyPDF2 from PIL import Image if (len(sys.argv) != 2): print("\nUsage: python {} input_file\n".format(sys.argv[0])) sys.exit(1) pdf = sys.argv[1] if __name__ == '__main__': input1 = PyPDF2.PdfFileReader(open(pdf, "rb")) page0 = input1.getPage(30) if '/XObject' in page0['/Resources']: xObject = page0['/Resources']['/XObject'].getObject() for obj in xObject: if xObject[obj]['/Subtype'] == '/Image': size = (xObject[obj]['/Width'], xObject[obj]['/Height']) data = xObject[obj].getData() if xObject[obj]['/ColorSpace'] == '/DeviceRGB': mode = "RGB" else: mode = "P" if '/Filter' in xObject[obj]:
def OCRPDF(
        source="",
        targetPath=None,
        page=None,
        nice=5,
        verbose=False,
        tesseract_config='--oem 1 -l best/eng -c preserve_interword_spaces=1 textonly_pdf=1',
        logger=None):
    """OCR a PDF and return (or write) a PDF assembled from the OCRed pages.

    Args:
        source: Either a path string (only processed when it ends in ".pdf")
            or the raw bytes of a PDF.
        targetPath: If truthy, the result is written to this path.
        page: Page number handed to the converter as both first and last page;
            ``None`` means every page.
        nice: Passed through to ``ocrPilImage``.
        verbose: Emit progress messages.
        tesseract_config: Passed through to ``ocrPilImage`` as ``config``.
        logger: Optional object with ``info``/``error`` methods; ``print`` is
            used when omitted.

    Returns:
        ``True`` when the result was written to ``targetPath``, the PDF as
        bytes when no target was given, or ``None`` if any exception occurred
        (the exception is reported through the logger or ``print``).
    """

    def _info(message):
        # Progress output is emitted in verbose mode only.
        if verbose:
            (logger.info if logger else print)(message)

    def _ocr_pages(output, convert, dpi, page_count):
        # Rasterise and OCR either every page or just the requested one.
        # The converter is called with 1-based first_page/last_page values.
        if page is None:
            _info("\tOCRing entire document with total page count of: {0}"
                  .format(page_count))
            for i in range(0, page_count):
                _info("\tOCRing page {0} of {1}".format(i + 1, page_count))
                page_image_array = convert(source, dpi=dpi,
                                           first_page=i + 1, last_page=i + 1)
                output.addPage(ocrPilImage(image=page_image_array[0],
                                           nice=nice,
                                           config=tesseract_config))
        else:
            _info("\tOCRing only page {0} of {1}".format(page, page_count))
            page_image_array = convert(source, dpi=dpi,
                                       first_page=page, last_page=page)
            output.addPage(ocrPilImage(image=page_image_array[0],
                                       nice=nice,
                                       config=tesseract_config))

    try:
        output = PyPDF2.PdfFileWriter()

        if isinstance(source, str):
            _info("You passed a string in as source. Trying this as source pdf file path.")
            page_count = PyPDF2.PdfFileReader(source).getNumPages()
            filename, file_extension = os.path.splitext(source)
            # A path without the ".pdf" extension is not OCRed; the output
            # then stays empty.
            if file_extension == ".pdf":
                _info("OCRUSREX - Try extracting Images from path: {0}".format(source))
                _ocr_pages(output, convert_from_path, 300, page_count)
                # Routed through the logger like every other progress message.
                _info("Done")
        else:
            # If source isn't a string, assume it holds the PDF bytes. If
            # incorrect, the error handling below will catch this.
            _info("OCRUSREX - Try extracting Images from bytes object")
            page_count = PyPDF2.PdfFileReader(io.BytesIO(source)).getNumPages()
            _ocr_pages(output, convert_from_bytes, 100, page_count)

        _info("OCRUSREX - Successfully processed!")

        # If targetPath was provided, assume that it's a string and valid path.
        if targetPath:
            with open(targetPath, "wb") as outputStream:
                output.write(outputStream)
            # upon success, return a truthy value
            return True

        # otherwise, return results as bytes obj
        output_file_obj = io.BytesIO()
        output.write(output_file_obj)
        return output_file_obj.getvalue()

    except Exception as e:
        # Deliberate best-effort boundary: report and signal failure with None.
        (logger.error if logger else print)("ERROR - Exception: {0}".format(e))
        return None
# Automate the Boring Stuff with Python 3
# Ch 13 - Working with PDF and Word Documents
# Rotating Pages
#
# The pages of a PDF can also be rotated in 90-degree increments with the
# rotateClockwise() and rotateCounterClockwise() methods.

import PyPDF2

# with-blocks guarantee both files are closed even if rotating or writing fails.
with open('meetingminutes.pdf', 'rb') as minutesFile:
    pdfReader = PyPDF2.PdfFileReader(minutesFile)
    page = pdfReader.getPage(0)
    page.rotateClockwise(90)

    pdfWriter = PyPDF2.PdfFileWriter()
    pdfWriter.addPage(page)

    # The source must still be open here: the writer reads page data from it.
    with open('rotatedPage.pdf', 'wb') as resultPdfFile:
        pdfWriter.write(resultPdfFile)
def ocrPilImage(image=None, nice=5, config=""):
    """OCR one image with pytesseract and return the result as a PDF page.

    The image is converted with ``pytesseract.image_to_pdf_or_hocr`` using
    ``extension='pdf'``; the returned data is then parsed with PyPDF2 and its
    first page is handed back.
    """
    pdf_data = pytesseract.image_to_pdf_or_hocr(
        image,
        extension='pdf',
        nice=nice,
        config=config,
    )
    single_page_reader = PyPDF2.PdfFileReader(io.BytesIO(pdf_data))
    return single_page_reader.getPage(0)
partInfo = requests.get( 'https://web-booktab.zanichelli.it/api/v1/resources_web/' + isbn + '/' + part.getAttribute("btbid") + '/config.xml', headers={'Cookie': cookie}) #print('http://web.booktab.it/boooks_web/'+isbn+'/'+part.getAttribute("btbid")+'/config.xml') if partInfo.status_code != 200: continue partXML = parseString(partInfo.text) key = partXML.getElementsByTagName("content")[0].firstChild.nodeValue pdfUrl = '' for entry in partXML.getElementsByTagName("entry"): if entry.getAttribute("key") == key + ".pdf": pdfUrl = entry.firstChild.nodeValue + ".pdf" break pdf = requests.get( 'https://web-booktab.zanichelli.it/api/v1/resources_web/' + isbn + '/' + part.getAttribute("btbid") + '/' + pdfUrl, headers={'Cookie': cookie}) merger.append(PyPDF2.PdfFileReader(ResponseStream(pdf.iter_content(64)))) merger.write(input("Input a title for the file: ") + ".pdf")
def _normalize_whitespace(text):
    """Collapse every run of whitespace (newlines included) into one space."""
    return re.sub(r'\s+', ' ', text.replace('\n', ' ')).strip()


def _extract_text(path, extension):
    """Return the raw text of one file, or None for an unsupported extension."""
    if extension == '.docx':
        return docx2txt.process(path)
    if extension == '.pdf':
        # Only the first page of a PDF is read.
        with open(path, 'rb') as pdf_file:
            return PyPDF2.PdfFileReader(pdf_file).getPage(0).extractText()
    if extension == '.txt':
        with open(path, 'r') as txt_file:
            return txt_file.read()
    if extension == '.xlsx':
        xlsdf = pd.read_excel(path)
        # Concatenate all cells of a row into one string, then join the rows.
        xlsdf['new'] = xlsdf.astype(str).values.sum(axis=1)
        return ' '.join(xlsdf['new'].tolist())
    if extension in ('.ppt', '.pptx'):
        # NOTE(review): python-pptx reads .pptx; legacy .ppt files are
        # expected to fail here and be reported by the caller.
        prs = Presentation(path)
        text_runs = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if not shape.has_text_frame:
                    continue
                for paragraph in shape.text_frame.paragraphs:
                    for run in paragraph.runs:
                        text_runs.append(run.text)
        return ' '.join(text_runs)
    return None


def read_files(pathorfile, indir='C:/Users/Faiz Ali/Rezide/imp'):
    """Read the text of .docx/.pdf/.txt/.xlsx/.ppt(x) files into a DataFrame.

    Args:
        pathorfile: A directory path (the process changes into it and every
            entry is considered) or an iterable of file paths.
        indir: Directory the process changes into after reading. Defaults to
            the previously hard-coded location of the files and ML model.

    Returns:
        ``(allfilesDF, filename)``: a DataFrame with one ``data`` column
        holding the whitespace-normalised text of each readable file
        (duplicates and empty texts removed, index reset), and the list of
        files that were read successfully.
    """
    if isinstance(pathorfile, str):
        os.chdir(pathorfile)
        filelist = glob.glob("*")
    else:
        filelist = pathorfile

    # reading files(.docx, .txt, .xlsx, .ppt, .pdf)
    dfList = []
    filename = []
    for file in filelist:
        try:
            extension = os.path.splitext(file)[1]
            text = _extract_text(file, extension)
        except Exception as exc:
            # Best effort: one unreadable file must not abort the batch, but
            # the failure is reported instead of being swallowed silently.
            print("Could not read %s: %s" % (file, exc))
            continue
        if text is None:
            print("WRONG EXTENSION:")
            print(file)
            continue
        print(file)
        # Record name and text together so a failed file never leaves a name
        # without matching text.
        dfList.append(_normalize_whitespace(text))
        filename.append(file)

    # change with the path where all files and ML model is stored
    os.chdir(indir)

    # Passing columns= keeps this valid when no file could be read at all.
    allfilesDF = pd.DataFrame(dfList, columns=['data'])
    allfilesDF.drop_duplicates(keep="first", inplace=True)
    allfilesDF.replace('', np.nan, inplace=True)
    allfilesDF.dropna(inplace=True)
    allfilesDF.reset_index(drop=True, inplace=True)
    return allfilesDF, filename
def main():
    """Show the menu and run the selected task once.

    Options: 1 renames OCRed diploma images and converts them back to PDF,
    2 splits a letters PDF into a new folder, 3 mails each contact the
    matching attachment through Gmail, 4 splits a PDF, 0 exits. Input that is
    not an integer, or an unknown number, prints an error and shows the menu
    again.
    """
    clear()
    while True:
        imprimir()
        try:
            option = int(input("Seleccione una opcion: "))

            if option == 1:  # DIPLOMAS
                pdfName = openFile()
                # New folder named after the file being processed
                pathFile = newCarpeta(pdfName, path2)
                fileList = convert_image(pdfName, pathFile, ext)
                os.chdir(pathFile)  # Go to the directory
                for image_path in fileList:
                    allTextfromImage = ocr_core(image_path)    # All text
                    subText = read_image(allTextfromImage)      # Only the text we need
                    # Rename image file with text
                    imageFile = rename_file(image_path, subText)
                    convert_pdf(imageFile, subText)             # convert to PDF again
                    os.unlink(imageFile)                        # Erase the image file
                print('\nTotal de archivo renombrados y comprimidos: ' +
                      str(len(fileList)))
                break

            elif option == 2:  # CARTAS
                pdfName = openFile()
                # The with-block closes the PDF once splitting is finished.
                with open(pdfName, 'rb') as pdfFileObj:
                    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)  # Read info
                    # New folder named after the file being processed
                    pathFile = newCarpeta(pdfName, path2)
                    pdf_splitter(pdfName, pdfReader, pathFile)
                    print('\nFueron Procesadas: ' + str(pdfReader.numPages) +
                          ' páginas')
                break

            elif option == 3:  # SEND LETTERS THROUGH GMAIL
                clear()
                # The app password is always typed in; never keep it in the source.
                PASSWORD = input('Digite la clave de aplicaciones para Gmail: ')
                os.chdir(pathCartas)
                names, emails = get_contacts(contacts)  # read contacts
                message_template = read_template(msgTemplate)
                pathdirectory = getDirectory()  # directory where the letters are
                # NOTE(review): assumed to hold one entry (path or None) per
                # name, in the same order - confirm against search_files.
                attach = search_files(names, pathdirectory)

                # The context manager ends the SMTP session even if sending fails.
                with smtplib.SMTP(host='smtp.gmail.com', port=587) as s:
                    s.starttls()
                    s.login(MY_ADDRESS, PASSWORD)

                    # enumerate keeps the attachment index in step with the
                    # contact; the old manual counter only advanced when an
                    # attachment existed, so the first contact without one
                    # blocked every contact after it.
                    for i, (name, email) in enumerate(zip(names, emails)):
                        filenameAtach = attach[i]
                        if filenameAtach is None:
                            # No attachment: no message is sent to this contact.
                            continue

                        msg = MIMEMultipart()  # create a message
                        # add in the actual person name to the message template
                        message = message_template.substitute(
                            PERSON_NAME=name.title())

                        # setup the parameters of the message
                        msg['From'] = MY_ADDRESS
                        msg['To'] = email
                        msg['Subject'] = "Exámenes psicofísicos (SIMETRIC)"
                        msg.attach(MIMEText(message, 'plain'))  # message body

                        # Attach the PDF as application/octet-stream so mail
                        # clients offer it as a download.
                        with open(filenameAtach, "rb") as attachment:
                            part = MIMEBase("application", "octet-stream")
                            part.set_payload(attachment.read())
                        encoders.encode_base64(part)  # ASCII-safe encoding
                        part.add_header('Content-Disposition', 'attachment',
                                        filename=filenameAtach)
                        msg.attach(part)

                        sendmailStatus = s.send_message(msg)

                        # send_message returns a dict of refused recipients;
                        # an empty dict means every recipient was accepted.
                        if sendmailStatus != {}:
                            print('\nThere was a problem sending the email to %s: %s'
                                  % (email, sendmailStatus))
                        else:
                            print('\nThe email to %s was sent correctly with the attached:\n%s'
                                  % (email, filenameAtach))
                break

            elif option == 4:  # SPLIT PDF
                pdfName = openFile()
                with open(pdfName, 'rb') as pdfFileObj:  # Open the pdf file
                    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)  # Read info
                    pdf_splitter2(pdfName, pdfReader)
                    print('\nEl documento tiene: ' + str(pdfReader.numPages) +
                          ' páginas')
                break

            elif option == 0:  # EXIT
                break

            else:
                print()
                clear()
                print('Error, solo de aceptan numeros del 0 al 4')

        except ValueError:
            print("Error, ingrese solamente numeros")
# tries to access all the pdfs in folder to make sure they # were encrypted import PyPDF2, os, sys try: path, key = sys.argv[1:] os.chdir(path) # Searching for PDFs: for filename in os.listdir('.'): if filename.endswith('.pdf'): try: # Creating PDF reader and writer object: print('Encrypting %s...' % (filename)) pdfFile = open(filename, 'rb') pdfReader = PyPDF2.PdfFileReader(pdfFile) encrypted = PyPDF2.PdfFileWriter() # Copying content: for pageNum in range(pdfReader.numPages): page = pdfReader.getPage(pageNum) encrypted.addPage(page) # Encrypting: encrypted.encrypt(key) # Creating new PDF: encryptedFile = open(filename[:-4] + '_encrypted.pdf', 'wb') encrypted.write(encryptedFile) encryptedFile.close() pdfFile.close()
import PyPDF2

# Print the page count and the text of the first page of one exam PDF.
# The with-block closes the file even if parsing fails, and the handle no
# longer shadows a builtin name.
with open(
        '/home/jario/Documentos/ENEM - 1998 à 2017/(1998) ENEM/1998 - ENEM - Prova amarela.pdf',
        'rb') as pdf_file:
    # creating a pdf reader object
    fileReader = PyPDF2.PdfFileReader(pdf_file)

    # print the number of pages in pdf file
    print(fileReader.numPages)
    print(fileReader.getPage(0).extractText())
import PyPDF2
import json
import re

# Extract the text of the first page of data.pdf and store it as a JSON string.
with open('data.pdf', 'rb') as pdf_file:
    read_pdf = PyPDF2.PdfFileReader(pdf_file)
    number_of_pages = read_pdf.getNumPages()
    page = read_pdf.getPage(0)
    page_content = page.extractText()

# The former "mcq = re.compile" bound the function itself (it was never
# called) to an unused name, so it has been removed.

data = json.dumps(page_content)
with open("sample.json", "w") as file:
    file.write(data)
filePath = r"C:\Users\alene\Desktop\Random PDFs" passwort = 'chamberlain' for folder, subfolders, files in os.walk(filePath): logging.debug("Current folder: " + folder) if subfolders != []: logging.debug("This folder's subfolders are: ") for folderName in subfolders: logging.debug(folderName) if files != []: logging.debug("This folder's files: ") for fileName in files: logging.debug(fileName) if fileName.endswith('_encrypted.pdf'): encryptedPdf = open(folder + "\\" + fileName, 'rb') if PyPDF2.PdfFileReader(encryptedPdf).isEncrypted: if PyPDF2.PdfFileReader(encryptedPdf).decrypt( passwort) == 1: print("Deleting original unencrypted file: " + fileName[:-14] + ".pdf") # send2trash.send2trash(folder + "\\" + fileName[:-14] + ".pdf") continue if fileName.endswith('.pdf'): print("ENCRYPTING: " + fileName) pdfFile = open(folder + '\\' + fileName, 'rb') pdfRead = PyPDF2.PdfFileReader(pdfFile) pdfWrite = PyPDF2.PdfFileWriter() for pageNum in range(pdfRead.numPages): pdfWrite.addPage(pdfRead.getPage(pageNum)) pdfWrite.encrypt(passwort) resultPDF = open(
import PyPDF2 f = open( 'C:\\Users\\plape\\OneDrive\\Escritorio\\Python\\Python-Bootcamp-notas-ejercicios\\15working with PDF & Spreadsheet CSV\\Working_Business_Proposal.pdf', 'rb') #rb es read binary pdf_reader = PyPDF2.PdfFileReader(f) #para que lea el pdf pdf_reader.numPages #cuando lo llamo deberia dar la cantidad de paginas q tiene si es q funciona page_one = pdf_reader.getPage(0) #primera pagina page_one_text = page_one.extractText() #esto me lo da como un python string page_one_text #a veces pasa q cuando pedis las paginas te lo da pero cuando pedis el texto te da empty strings, si me da empty strings el pdf file no es compatible con pdf2 f.close() #adding to pdf files f = open( 'C:\\Users\\plape\\OneDrive\\Escritorio\\Python\\Python-Bootcamp-notas-ejercicios\\15working with PDF & Spreadsheet CSV\\Working_Business_Proposal.pdf', 'rb') pdf_reader = PyPDF2.PdfFileReader(f) first_page = pdf_reader.getPage(0) pdf_writer = PyPDF2.PdfFileWriter() #para addPage, lo q quiero agregar deberia ser un pdfPageObject no un python string, por ejemplo: type(first_page) #esto es un ejemplo pdf_writer.addPage(first_page) pdf_output = open( 'C:\\Users\\plape\\OneDrive\\Escritorio\\Python\\Python-Bootcamp-notas-ejercicios\\15working with PDF & Spreadsheet CSV\\Some_BrandNew_Doc.pdf', 'wb') #overwrites another pdf file with this name pdf_writer.write(pdf_output) f.close() pdf_output.close() #quiero all text dentro de un pdf file f = open(
import PyPDF2, os

# Decrypt every encrypted PDF below the current directory and write an
# unencrypted "<name>_decrypted.pdf" copy next to it.
passwd = input('please input decrypted password:')

ENCRYPTED_SUFFIX = '_encrypted.pdf'

# Visit all PDF files under the folder
for folderName, subfolders, filenames in os.walk('.'):
    for filename in filenames:
        if not filename.endswith('.pdf'):
            continue
        filedirname = folderName + '/' + filename

        with open(filedirname, 'rb') as pdfFile:
            pdfReader = PyPDF2.PdfFileReader(pdfFile)
            if not pdfReader.isEncrypted:   # only encrypted PDFs are handled
                continue

            try:
                pdfReader.decrypt(passwd)        # decrypt the PDF
                numPages = pdfReader.numPages    # read the page count
            except PyPDF2.utils.PdfReadError:
                # A read failure means decryption failed; report it
                print(filename + ", decrypted password is not correct.")
                continue

            # Decryption succeeded: copy the pages into a new file
            pdfWriter = PyPDF2.PdfFileWriter()
            for pageNum in range(numPages):
                pdfWriter.addPage(pdfReader.getPage(pageNum))

            # Drop "_encrypted.pdf" only when it is really there; the old
            # unconditional [:-14] chopped 14 characters off any other name.
            if filedirname.endswith(ENCRYPTED_SUFFIX):
                base_name = filedirname[:-len(ENCRYPTED_SUFFIX)]
            else:
                base_name = filedirname[:-len('.pdf')]
            filename_decrypt = base_name + '_decrypted' + '.pdf'

            with open(filename_decrypt, 'wb') as resultPdf:
                pdfWriter.write(resultPdf)
pip install pdf2image conda install -c conda-forge poppler (or apt-get install -y poppler-utils) """ from pdf2image import convert_from_path import pickle import PyPDF2 as pdf from nltk.tokenize import RegexpTokenizer from nltk import Text #input pdf pdf_name = "C:/Users/jacqu/Documents/DDEFI/Projet WeFinn/Bilan d_entreprises/Airbus Annual Report 2019.pdf" file = open(pdf_name, 'rb') pdf_reader = pdf.PdfFileReader(file) tokenizer = RegexpTokenizer(r'\w+') keyword = 'co' page = [] sol = [] for i in range(pdf_reader.getNumPages()): raw = pdf_reader.getPage(i).extractText() token = tokenizer.tokenize(raw) text = Text(token) word = [w.lower() for w in text] while keyword in word: index = word.index(keyword) if word[index + 1] == '2': page.append(i)
# imports
import PyPDF2

# Adds a watermark to every page of test.pdf and writes the result to 'outFile'.
try:
    with open('test.pdf', 'rb') as source_file, \
            open('watermark.pdf', 'rb') as watermark_file:
        fileToWatermark = PyPDF2.PdfFileReader(source_file)
        watermark = PyPDF2.PdfFileReader(watermark_file)
        outFile = PyPDF2.PdfFileWriter()

        for x in range(fileToWatermark.getNumPages()):
            page = fileToWatermark.getPage(x)
            # mergePage combines the content of two pages into one;
            # the watermark file only has 1 page
            page.mergePage(watermark.getPage(0))
            outFile.addPage(page)

        # 'wb', not 'rb': the output must be opened for writing. Read mode
        # failed on a missing file and could never be written to anyway.
        with open('outFile', 'wb') as output:
            outFile.write(output)
except FileNotFoundError:
    print("The file was not found")
import PyPDF2

# Stamp the first page of wtr.pdf onto every page of the cover document and
# save the merged pages as newfile.pdf.
try:
    output = PyPDF2.PdfFileWriter()
    wtr_reader = PyPDF2.PdfFileReader(open("./pdf/wtr.pdf", "rb"))
    i_reader = PyPDF2.PdfFileReader(open("./pdf/00. Cover v0.2.pdf", "rb"))

    i = 0
    page_total = i_reader.getNumPages()
    while i < page_total:
        stamped_page = i_reader.getPage(i)
        stamped_page.mergePage(wtr_reader.getPage(0))
        output.addPage(stamped_page)
        i += 1

    with open("./pdf/newfile.pdf", "wb") as f:
        output.write(f)
except FileNotFoundError:
    print("File not existed")
def getPdfReaderObj(file_marker):
    """Return a PyPDF2 reader built from whatever ``opnFile(file_marker)`` returns."""
    opened = opnFile(file_marker)
    return PyPDF2.PdfFileReader(opened)
import PyPDF2

# Report whether the source document is encrypted.
pdf = PyPDF2.PdfFileReader('data/src/pdf/sample1.pdf')
print(pdf.isEncrypted)
# False

# Clone the document into a writer so a password-protected copy can be saved.
src_pdf = PyPDF2.PdfFileReader('data/src/pdf/sample1.pdf')
dst_pdf = PyPDF2.PdfFileWriter()
dst_pdf.cloneReaderDocumentRoot(src_pdf)

print(src_pdf.documentInfo)
# {'/Title': IndirectObject(33, 0), '/Producer': IndirectObject(34, 0), '/Creator': IndirectObject(35, 0), '/CreationDate': IndirectObject(36, 0), '/ModDate': IndirectObject(36, 0)}

# Passing documentInfo straight to addMetadata fails:
# dst_pdf.addMetadata(src_pdf.documentInfo)
# TypeError: createStringObject should have str or unicode arg

# Looking each key up individually yields the plain values shown below,
# which addMetadata accepts.
d = {}
for key in src_pdf.documentInfo.keys():
    d[key] = src_pdf.documentInfo[key]
print(d)
# {'/Title': 'sample1', '/Producer': 'macOS バージョン10.14.2(ビルド18C54) Quartz PDFContext', '/Creator': 'Keynote', '/CreationDate': "D:20190114072947Z00'00'", '/ModDate': "D:20190114072947Z00'00'"}

dst_pdf.addMetadata(d)
dst_pdf.encrypt('password')

with open('data/temp/sample1_pass.pdf', 'wb') as f:
    dst_pdf.write(f)
def process_pdf(pdf_file_path, g, show_page_lists=False):
    """Split a PDF of two-page records into one PDF plus one index file each.

    Pages are accumulated into a batch; whenever an odd-indexed page (the last
    page of a record) has been added, the batch is written as
    ``<seq>001.pdf`` together with ``<seq>IDX.dat`` and a new batch starts.
    The index line carries the most recently matched WID and the ``appid``,
    ``scan_date`` and ``year`` attributes of ``g``.

    Args:
        pdf_file_path: Path of the PDF to split.
        g: Object providing ``appid``, ``scan_date`` and ``year``.
        show_page_lists: Debug switch; print the page lists and exit the
            process (SystemExit) without writing anything.
    """
    print("Processing: {0}".format(os.path.basename(pdf_file_path)))

    # compile regular expressions for searches (raw string keeps \d intact)
    wid_re = re.compile(r"\d{3}(AD)\d{4}|(W)\d{8}")

    date_string = datetime.datetime.strftime(datetime.datetime.today(),
                                             "%m%d%Y%H%M%S")
    save_dir_name = 'jttocust100001_{timestamp}'.format(timestamp=date_string)
    # Add primary folder
    save_dir_name = os.path.join(save_dir_name, '0')

    # make a new directory to save results in
    if not os.path.exists(save_dir_name) and not show_page_lists:
        os.makedirs(save_dir_name)

    # open the pdf; the with-block closes it on every exit path
    with open(pdf_file_path, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        page_count = pdfReader.numPages

        # Explicit lists of all pages, pages to search the WID on, and the
        # last page of each record - kept explicit as a framework for
        # projects with more than two pages per record.
        all_pages = set(range(page_count))
        wid_search_pages = set(i for i in range(page_count) if (i % 2) == 0)
        doc_last_pages = set(i for i in range(page_count) if (i % 2) == 1)

        # a little condition for debugging
        if show_page_lists:
            print("** Debug page lists, full processing not done **")
            print("all pages: ", all_pages)
            print("name pages: ", wid_search_pages)
            print("last pages: ", doc_last_pages)
            raise SystemExit

        batch = PyPDF2.PdfFileWriter()
        extracted_wid = None
        seq = 0

        # sorted(): sets do not guarantee iteration order, but pages must be
        # processed in document order.
        for i in sorted(all_pages):
            pageObj = pdfReader.getPage(i)
            batch.addPage(pageObj)

            # Create secondary folder (one per 100000 records)
            secondary_dir = os.path.join(save_dir_name,
                                         str(seq // 100000).zfill(2))
            if not os.path.exists(secondary_dir):
                os.mkdir(secondary_dir)

            # search every page for a WID; the last match found is kept
            text = pageObj.extractText()
            srch = wid_re.search(text)
            srch_cnt = wid_re.findall(text)
            if len(srch_cnt) > 2:
                print(("WARNING!!! Too Many Matches!!!: "
                       "{0} Record: {1}\n{2}\n\n".format(
                           os.path.basename(pdf_file_path), i, text)))
            if srch is not None:
                extracted_wid = srch[0]
            else:
                print("Skipping: {0} Record: {1}\n{2}\n\n".format(
                    os.path.basename(pdf_file_path), i, text))

            # The former extra test "i != numPages" was always true because
            # page indices stop at numPages - 1.
            if i in doc_last_pages:
                # write out the pdf, then the dat file
                with open(os.path.join(secondary_dir,
                                       "{0:0>5}001.pdf".format(seq)),
                          'wb') as output:
                    batch.write(output)
                with open(os.path.join(secondary_dir,
                                       "{0:0>5}IDX.dat".format(seq)),
                          'w') as datfile:
                    datfile.write(
                        "{appid};1;;;;;;;;;;;{wid};0001;N;{year};{scan}\n".format(
                            wid=extracted_wid,
                            appid=g.appid,
                            scan=g.scan_date,
                            year=g.year))
                seq += 1
                batch = PyPDF2.PdfFileWriter()
external TEXT )''') cur.execute('''REPLACE INTO ''' + '"' + str(roll_no) + '"' +''' (sub, internal, external) VALUES (?, ?, ?)''',(int(key),subs[key][0],subs[key][1])) conn.commit() slist = list() for sl in statusr: slist.append(sl) for files in os.listdir('./res_pdf'): global pdfpath pdfpath= os.path.join('res_pdf', files) if (pdfpath + '\n') in slist: continue pdfres = PyPDF2.PdfFileReader(open(pdfpath, 'rb')) pg = pdfres.getNumPages() global j j = 0 count = 0 while j < pg: print('PDF NAME', pdfpath, j) try: res = pdfres.getPage(j) except: print("Done!") j = j + 1 txt = res.extractText() txt = txt.strip() #print(txt) line = txt.split('\n')
# search_string = input('Enter author name for search: ')
# search_results = dblp.search(search_string)
# if(search_results.empty):
#     print("No results...\nExtting...")
#     exit(1)
# for index, value in search_results.iterrows():
#     print(f"{index}): {value.Title}\n")
# option = int(input('Enter index of the record to get data: '))
# record = search_results.loc[option]
# request = requests.get(record.Link)
# html_soup = BeautifulSoup(request.text, 'html.parser')
# download_link = html_soup.find('a', class_='c-pdf-download__link')
# get_pdf = download_link.get('href')
# request = requests.get("https://" + get_pdf[2:])
# with open("".join("PDF.pdf"), "wb" ) as pdf_file:
#     pdf_file.write(request.content)

# Dump the text of every page of PDF.pdf into sample.txt, one page per line.
# Both files are managed by with-blocks so they are closed even on error.
with open('PDF.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    numOfPages = pdfReader.numPages
    with open("sample.txt", "w") as txt_file:
        for i in range(numOfPages):
            txt_file.write(pdfReader.getPage(i).extractText())
            txt_file.write(' \n')
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen  # Python 2
import io

import PyPDF2
from bs4 import BeautifulSoup

# Read the URL and save text in html1 and then in text.
url1 = "https://www.theguardian.com/politics/2018/sep/20/the-death-of-consensus-how-conflict-came-back-to-politics"
html1 = urlopen(url1).read().decode('utf8')
soup = BeautifulSoup(html1, 'lxml')

# Read the PDF and save text in pdfString.
url2 = "http://eprints.lse.ac.uk/86880/7/Cox_Rise%20of%20populism%20published_2018.pdf"
# open() only works on local paths; the PDF has to be downloaded first and
# wrapped in a seekable in-memory buffer for the reader.
pdf2 = io.BytesIO(urlopen(url2).read())
fileReader = PyPDF2.PdfFileReader(pdf2)

# Read at most the first 11 pages, without running past the end of the file.
page_limit = min(11, fileReader.numPages)
pdfString = "".join(
    fileReader.getPage(x).extractText() for x in range(page_limit))
# Print text from url2. Disabled by default; uncomment to see it.
# print(pdfString)

text = "".join(element.text for element in soup.find_all(['title', 'p']))

# At this point, there are text and pdfString.
# Print text from url1. Disabled by default; uncomment to see it.
# print(text)