def extracting_text_from_pdfs(pdf_file):
    """Print the text extracted from the first page of a PDF file.

    :param pdf_file: path of the PDF file to open (opened in binary mode)
    """
    import PyPDF2

    # Extract while the handle is still open: the reader reads from it.
    with open(pdf_file, mode="rb") as pdf_file_object:
        # ``PyPDF2.reader`` is not a reader class; ``PdfFileReader`` is the
        # constructor used everywhere else in this file.
        pdf_reader = PyPDF2.PdfFileReader(pdf_file_object)
        page_object = pdf_reader.getPage(0)
        print(page_object.extractText())
import PyPDF2

# Report whether ejemplo.pdf is encrypted, then copy its first page to out.pdf.
with open('ejemplo.pdf', 'rb') as file:
    pdfObj = PyPDF2.PdfFileReader(file)
    page1 = pdfObj.getPage(0)

    r = pdfObj.isEncrypted
    print(r)

    # Text of the first page (kept for inspection; not written anywhere).
    d = page1.extractText()

    pdfWriter = PyPDF2.PdfFileWriter()
    pdfWriter.addPage(page1)

    # Write while the source file is still open: the page added above still
    # refers to data held by the reader.
    with open('out.pdf', 'wb') as out:
        pdfWriter.write(out)
# import packages
import pyttsx3
import PyPDF2

# Read the book aloud, starting at page index 8 and going to the last page.
with open('Introduction_to_Machine_Learning.pdf', 'rb') as book:
    pdfReader = PyPDF2.PdfFileReader(book)  # read PDF
    pages = pdfReader.numPages  # number of pages in the PDF

    speaker = pyttsx3.init()  # create speaker

    for num in range(8, pages):  # read a certain range of the PDF
        page = pdfReader.getPage(num)
        text = page.extractText()
        speaker.say(text)
        speaker.runAndWait()
def pdf_processor(pdf):
    """Print the text extracted from the first page of *pdf*.

    :param pdf: value handed unchanged to ``PyPDF2.PdfFileReader``
    """
    first_page = PyPDF2.PdfFileReader(pdf).getPage(0)
    print(first_page.extractText())
fileList = [] for filename in os.listdir(path): if filename.endswith(".pdf"): numOfPDFsFound += 1 filename = os.path.join(path, filename) fileList.append(filename) filecount = 1 foundFilesList = [] for filename in fileList: sys.stdout.flush() sys.stdout.write("\rChecking file:%s out of Total %s PDFs" % (filecount, numOfPDFsFound)) sys.stdout.flush() pdfFileObj = open(filename, 'rb') pdfReader = PyPDF2.PdfFileReader(pdfFileObj, strict=False) if pdfReader.isEncrypted: # print "The file is encrypted. Skipping" continue text = "" count = 0 try: num_pages = pdfReader.numPages #The while loop will read each page while count < num_pages: pageObj = pdfReader.getPage(count) count += 1 text += pageObj.extractText() except:
def getPageCount(file):
    """Return the number of pages of a PDF file.

    :param file: path of the PDF file to open (opened in binary mode)
    :return: the reader's ``numPages`` value
    """
    # Read the count inside the ``with`` so the handle is always closed.
    with open(file, 'rb') as pdfFileObj:
        return pdf.PdfFileReader(pdfFileObj).numPages
import PyPDF2

# reading pdf data
with open('sample.pdf', 'rb') as file:
    reader = PyPDF2.PdfFileReader(file)
    pdfData = reader.getPage(0)
    # Extract once and reuse the result for both printing and searching.
    data = pdfData.extractText()
print(data)

# looking for text
assert "Mechanics1" in data, 'Not present'
print('it is present')
def stub_to_print(src_stub_file_path, dst_print_file_path, track_selector,
                  orchestra):
    """
    Builds a print file from a stub file by repeating the pages of each track
    as many times as copies are requested. A log file (same path as the print
    file, with a ``.log`` suffix) lists what was and was not printed.

    :param Path src_stub_file_path: the stub pdf the pages are read from
    :param Path dst_print_file_path: the pdf to create; its parent directory is created if missing
    :param ITrackSelector track_selector: the mechanism that computes the number of copies to do for each track
    :param Orchestra orchestra:
    """
    stub_toc = get_stub_tracks(src_stub_file_path, orchestra)
    print(stub_toc)

    track_to_print_count = track_selector.get_track_to_copy(
        stub_toc.get_track_ids())
    print(track_to_print_count)

    dst_print_file_path.parent.mkdir(parents=True, exist_ok=True)

    with open(dst_print_file_path, 'wb') as print_file, open(
            dst_print_file_path.with_suffix('.log'), 'wt') as log_file:
        print_pdf = PyPDF2.PdfFileWriter()
        log_file.write("contents of print file %s :\n\n" % dst_print_file_path)

        # The stub stays open until the print file is written: the pages added
        # to print_pdf come from this reader.
        with open(src_stub_file_path, 'rb') as stub_file:
            stub_pdf = PyPDF2.PdfFileReader(stub_file)
            stub_page_count = stub_pdf.getNumPages()

            sorted_tracks = sorted(
                Track(track_id, orchestra)
                for track_id in track_to_print_count)

            ranges = []  # page ranges, in order of first appearance
            range_to_num_copies = {}
            range_to_tracks = {}
            for track in sorted_tracks:
                num_copies = track_to_print_count[track.id]
                if num_copies <= 0:
                    continue

                first_page_index = stub_toc.get_tracks_first_page_index(
                    [track])
                last_page_index = stub_toc.get_tracks_last_page_index(
                    [track], stub_page_count)
                print('adding %d copies of %s (pages %d-%d)' %
                      (num_copies, track.id, first_page_index,
                       last_page_index))
                assert first_page_index <= last_page_index
                assert last_page_index <= stub_page_count

                page_range = (first_page_index, last_page_index)
                if page_range in range_to_num_copies:
                    # this page range has already been encountered. This can
                    # happen when multiple tracks share the same pages (eg
                    # crash cymbals are on the same pages as suspended cymbal)
                    if track.instrument.get_player() == 'percussionist':
                        # we don't want to duplicate these shared pages for
                        # each track so we make as many copies as the track
                        # that asks for the most
                        range_to_num_copies[page_range] = max(
                            range_to_num_copies[page_range], num_copies)
                    else:
                        # here we're in the case of a page that contains 2 non
                        # percussion tracks (eg bassoon 1,2). These must not
                        # be merged, but be treated as 2 separate copies: if
                        # we request 2 copies of bassoon 1 and 2 copies of
                        # bassoon 2, we want 4 copies of bassoon 1,2, not 2
                        range_to_num_copies[page_range] += num_copies
                    range_to_tracks[page_range].append(track.id)
                else:
                    ranges.append(page_range)
                    range_to_num_copies[page_range] = num_copies
                    range_to_tracks[page_range] = [track.id]

            for page_range in ranges:
                first_page_index, last_page_index = page_range
                num_copies = range_to_num_copies[page_range]
                log_file.write(
                    "%d copies of %s\n" %
                    (num_copies, '/'.join(range_to_tracks[page_range])))
                for _copy_index in range(num_copies):
                    for page_index in range(first_page_index,
                                            last_page_index + 1):
                        # -1 to convert 1-based index into 0-based index
                        track_page = stub_pdf.getPage(page_index - 1)
                        print_pdf.addPage(track_page)

            log_file.write("\nunprinted tracks :\n\n")
            for label in stub_toc.get_track_ids():
                label_is_printed = any(
                    label in track_ids
                    for track_ids in range_to_tracks.values())
                if not label_is_printed:
                    log_file.write("no copies of %s\n" % label)

            print_pdf.write(print_file)
def monster_manual_lookup(manual_file_path, monster):
    """
    function that opens and reads monster manual, allowing lookup of
    creature entries. The text of the matching page is printed.

    :param manual_file_path: file path for the monster manual
    :param monster: monster that we are looking up in the manual
    :return page_number: page number of the monster's entry (the value passed
        to ``getPage``), or None when the monster is not found in the
        outlines
    """
    # Prepare the monster string to match format in the monster manual:
    # upper-case the first character of every space-separated word. Slicing
    # (word[:1]) instead of indexing keeps the empty words produced by
    # repeated spaces from raising IndexError.
    monster = " ".join(
        word[:1].upper() + word[1:] for word in monster.split(' '))

    # Open monster manual pdf:
    with open(manual_file_path, 'rb') as pdf:
        # Create a pdf reader object
        pdf_reader = PyPDF2.PdfFileReader(pdf)

        # Retrieve pdf outlines
        outlines = pdf_reader.outlines

        # Find indices that allow us to extract text about
        # chosen monster from monster manual
        index_list, _key = find_index_by_value(outlines, monster)
        if not index_list:
            print("Error: {} not found in monster manual. Try different entry".
                  format(monster))
            return None

        # Dive into lists to find the outline entry for the monster
        current_level = outlines[index_list[0]]
        for idx in index_list[1:]:
            current_level = current_level[idx]

        # Get the page id of the monster that we are interested in
        # (Note: this id is not the same as the actual page number
        # itself, so it is converted through the id -> number mapping)
        page_id = current_level.page.idnum
        page_id_to_number_dict, _ = assign_page_id_to_number(pdf_reader)
        page_number = page_id_to_number_dict[page_id]

        # Print out creature text for parsing purposes
        page = pdf_reader.getPage(page_number)
        text = page.extractText().replace('\n', '')
        print("creature text: ", text)

    return page_number
def main():
    """Extract the images embedded in a PDF and save them to the output folder.

    The source file, output folder and optional target page come from
    ``parseParam()``. Every image XObject on each processed page has the
    stream filters handled here decoded, and is then saved as .png (no filter
    left), .jpg (/DCTDecode), .jp2 (/JPXDecode) or .tif (/CCITTFaxDecode).
    A failure on one image is printed and does not stop the run.
    """
    sourceName, outputFolder, targetPage = parseParam()
    fileBase = os.path.splitext(os.path.basename(sourceName))[0]

    # Keep the source open for the whole run and close it afterwards.
    with open(sourceName, "rb") as sourceFile:
        pdfObj = PyPDF2.PdfFileReader(sourceFile)
        for iPage in range(0, pdfObj.numPages):
            pageObj = pdfObj.getPage(iPage)
            if targetPage and (iPage + 1 != targetPage):
                print("Skip page {}.".format(iPage + 1))
                continue
            print("Processing page {} of {}...".format(iPage + 1,
                                                       pdfObj.numPages))
            try:
                xObject = pageObj['/Resources']['/XObject'].getObject()
            except KeyError:
                # page without XObject resources: nothing to extract
                continue

            iImage = 0
            for obj in xObject:
                if xObject[obj]['/Subtype'] != '/Image':
                    continue
                iImage += 1
                title = obj[1:]  # XObject name without its leading character
                fileName = "{2}_p{0:0>3}_{3}".format(iPage + 1, iImage,
                                                     fileBase, title)
                outFileName = os.path.join(outputFolder, fileName)
                size = (xObject[obj]['/Width'], xObject[obj]['/Height'])

                # Map the PDF colour space to an image mode (and a palette
                # for mode "P").
                colorSpace = xObject[obj]['/ColorSpace']
                if colorSpace == '/DeviceRGB':
                    mode = "RGB"
                elif colorSpace == '/DeviceCMYK':
                    mode = "CMYK"
                elif colorSpace == '/DeviceGray':
                    mode = "L"
                elif colorSpace[0] == "/Indexed":
                    mode = "P"
                    colorSpace, base, hival, lookup = [
                        v.getObject() for v in colorSpace
                    ]
                    palette = lookup.getData()
                elif colorSpace[0] == "/ICCBased":
                    mode = "P"
                    lookup = colorSpace[1].getObject()
                    palette = lookup.getData()
                elif colorSpace[0] == "/DeviceN":
                    # UNKNOWN TYPE
                    mode = "P"
                    palette = DEFAULT_PALETTE
                else:
                    print("[ERROR] Unknown mode: {}".format(colorSpace))
                    continue

                try:
                    stream = xObject[obj]
                    data = stream._data
                    filters = stream.get("/Filter", ())
                    if not isinstance(filters, PyPDF2.generic.ArrayObject):
                        filters = [filters]
                    # filters still to be applied after the decoding below
                    leftFilters = copy.deepcopy(filters)
                    if data:
                        for filterType in filters:
                            if filterType in ("/FlateDecode", "/Fl"):
                                data = FlateDecode.decode(
                                    data, stream.get("/DecodeParms"))
                                leftFilters.remove(filterType)
                            elif filterType in ("/ASCIIHexDecode", "/AHx"):
                                data = ASCIIHexDecode.decode(data)
                                leftFilters.remove(filterType)
                            elif filterType in ("/LZWDecode", "/LZW"):
                                data = LZWDecode.decode(
                                    data, stream.get("/DecodeParms"))
                                leftFilters.remove(filterType)
                            elif filterType in ("/ASCII85Decode", "/A85"):
                                data = ASCII85Decode.decode(data)
                                leftFilters.remove(filterType)
                            elif filterType == "/Crypt":
                                # NOTE(review): other branches read
                                # "/DecodeParms"; "/DecodeParams" here looks
                                # like a typo -- confirm before changing.
                                decodeParams = stream.get("/DecodeParams", {})
                                if ("/Name" in decodeParams
                                        or "/Type" in decodeParams):
                                    raise NotImplementedError(
                                        "/Crypt filter with /Name or /Type not supported yet"
                                    )
                                leftFilters.remove(filterType)
                            elif filterType == ():
                                # placeholder used when the stream has no
                                # /Filter entry
                                leftFilters.remove(filterType)

                    # case of Flat image
                    if len(leftFilters) == 0:
                        img = Image.frombytes(mode, size, data)
                        if mode == "P":
                            img.putpalette(palette)
                        if mode == "CMYK":
                            img = img.convert('RGB')
                        img.save(outFileName + ".png")
                    # case of JPEG
                    elif len(leftFilters) == 1 and leftFilters[0] == '/DCTDecode':
                        jpgData = BytesIO(data)
                        img = Image.open(jpgData)
                        if mode == "CMYK":
                            # case of CMYK: invert all channels
                            imgData = np.frombuffer(img.tobytes(), dtype='B')
                            invData = np.full(imgData.shape, 255, dtype='B')
                            invData -= imgData
                            img = Image.frombytes(img.mode, img.size,
                                                  invData.tobytes())
                        img.save(outFileName + ".jpg")
                    # case of JPEG2000
                    elif len(leftFilters) == 1 and leftFilters[0] == '/JPXDecode':
                        with open(outFileName + ".jp2", "wb") as img_file:
                            img_file.write(data)
                    # case of TIFF
                    elif len(leftFilters) == 1 and leftFilters[0] == '/CCITTFaxDecode':
                        if xObject[obj]['/DecodeParms']['/K'] == -1:
                            CCITT_group = 4
                        else:
                            CCITT_group = 3
                        width = xObject[obj]['/Width']
                        height = xObject[obj]['/Height']
                        img_size = len(data)
                        tiff_header = tiff_header_for_CCITT(
                            width, height, img_size, CCITT_group)
                        with open(outFileName + ".tif", 'wb') as img_file:
                            img_file.write(tiff_header + data)
                    elif len(leftFilters) >= 1:
                        # leftFilters is a list: format it instead of
                        # concatenating it to a str (which raises TypeError).
                        print("[WARNING] Unknown filter: {}".format(
                            leftFilters))
                except Exception as ex:
                    # best effort: report the image that failed and carry on
                    print("[ERROR] " + fileName)
                    print("\t" + str(ex))
    print("Completed.")
import PyPDF2

# Open both source PDFs and wrap each one in a reader
pdf1File = open('meetingminutes.pdf', 'rb')
pdf1Reader = PyPDF2.PdfFileReader(pdf1File)
pdf2File = open('meetingminutes2.pdf', 'rb')
pdf2Reader = PyPDF2.PdfFileReader(pdf2File)

# Writer that collects the pages of the combined document
pdfWriter = PyPDF2.PdfFileWriter()

# Append every page of the first PDF, then every page of the second
for reader in (pdf1Reader, pdf2Reader):
    for pageNum in range(reader.numPages):
        pdfWriter.addPage(reader.getPage(pageNum))

# Write the combined PDF
pdfOutputFile = open('combinedminutes.pdf', 'wb')
pdfWriter.write(pdfOutputFile)

# Close the output first, then both inputs
pdfOutputFile.close()
pdf1File.close()
pdf2File.close()
def pdf_list(names):
    """Merge the given PDFs, in order, into ``merged.pdf``.

    :param names: iterable of items accepted by ``PdfFileMerger.append``
    """
    merger = PyPDF2.PdfFileMerger()
    try:
        for pdf in names:
            merger.append(pdf)
        merger.write('merged.pdf')
    finally:
        # Release the merger's file handles even when an append/write fails.
        merger.close()
import pyttsx3
import PyPDF2

# Read the book aloud page by page, stopping at page index 243 at the latest.
with open('python4everybody.pdf', 'rb') as path:
    pdfReader = PyPDF2.PdfFileReader(path)
    # One engine for the whole run instead of a new one per page.
    speak = pyttsx3.init()
    # Never ask for more pages than the document has.
    for i in range(0, min(244, pdfReader.numPages)):
        from_page = pdfReader.getPage(i)
        text = from_page.extractText()
        speak.say(text)
        speak.runAndWait()
def _ensure_pdf_img_dir(self, full_file_name: str) -> str:
    """Return the image directory of the given pdf, creating it when missing."""
    pdf_name = os.path.basename(os.path.splitext(full_file_name)[0])
    pdf_dir = os.path.join(self.__dict__['pdf_img_path'], pdf_name)
    os.makedirs(pdf_dir, exist_ok=True)
    self.current_pdf_dir = pdf_dir
    return pdf_dir


def extract_pdf_image(self, full_file_name: str):
    """Extract image files from the current pdf.

    Every image XObject of every page is saved under
    ``<pdf_img_path>/<pdf name>/ImgFilePage<page>_<index>.<ext>``, the
    extension depending on the stream filter (/CCITTFaxDecode -> tiff,
    /FlateDecode -> png, /DCTDecode -> jpg, /JPXDecode -> jp2). Nothing
    happens when ``full_file_name`` is not an existing file. An error on one
    page is printed and the next page is processed.

    :param full_file_name: path of the pdf to extract the images from
    """
    try:
        if not os.path.isfile(full_file_name):
            return

        # open the current pdf; it stays open while its pages are read
        with open(full_file_name, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfFileReader(pdf_file)
            print(f'Current Pdf: {full_file_name}')

            # iterate through each page and extract its images
            for n in range(pdf_reader.getNumPages()):
                try:
                    page = pdf_reader.getPage(n)
                    xObject = page['/Resources']['/XObject'].getObject()

                    # sub page counter
                    m = 0
                    for obj in xObject:
                        image = xObject[obj]
                        if image['/Subtype'] != '/Image':
                            continue

                        size = (image['/Width'], image['/Height'])
                        data = image._data  # stream bytes as stored in the pdf
                        if image['/ColorSpace'] == '/DeviceRGB':
                            mode = "RGB"
                        else:
                            mode = "P"

                        img_filter = image['/Filter']
                        if img_filter == '/CCITTFaxDecode':  # .tiff
                            self.img_format = 'tiff'
                            # Resolve the directory on every image: the
                            # original only bound it when it had to create the
                            # directory, so an existing one raised NameError.
                            img_dir = self._ensure_pdf_img_dir(full_file_name)
                            if image['/DecodeParms']['/K'] == -1:
                                self.CCITT_group = 4
                            else:
                                self.CCITT_group = 3
                            tiff_header = self.tiff_header_CCITT(
                                width=image['/Width'],
                                height=image['/Height'],
                                img_size=len(data),
                                CCITT_group=self.CCITT_group)
                            img_name = f'ImgFilePage{n}_{m}.tiff'
                            with open(os.path.join(img_dir, img_name),
                                      'wb') as img_file:
                                img_file.write(tiff_header + data)
                        elif img_filter == '/FlateDecode':  # .png
                            self.img_format = 'png'
                            img_dir = self._ensure_pdf_img_dir(full_file_name)
                            # Image.frombytes needs the decoded pixels;
                            # ``_data`` is still Flate-compressed.
                            img = Image.frombytes(mode, size, image.getData())
                            img.save(
                                os.path.join(img_dir,
                                             f'ImgFilePage{n}_{m}.png'))
                        elif img_filter in ('/DCTDecode', '/JPXDecode'):
                            # the stored stream already is a complete
                            # .jpg / .jp2 file
                            if img_filter == '/DCTDecode':
                                self.img_format = 'jpg'
                            else:
                                self.img_format = 'jp2'
                            img_dir = self._ensure_pdf_img_dir(full_file_name)
                            img_name = f'ImgFilePage{n}_{m}.{self.img_format}'
                            with open(os.path.join(img_dir, img_name),
                                      'wb') as img_file:
                                img_file.write(data)
                        else:
                            # filter not handled here
                            print(
                                f'Pdf: {full_file_name} has no images on page: {n}'
                            )
                        m += 1
                except Exception as e:
                    print(
                        f'An error occurred extracting images from page: {n}'
                    )
                    print(e)
    except OSError as e:
        print(
            f'OSError: An error occurred while trying to extract images from pdf: {full_file_name}'
        )
import PyPDF2
import time
from tqdm import tqdm

# Stamp the first page of png2pdf.pdf on every page of dummypdf.pdf.
# Both inputs stay open until the output is written, then are closed.
with open('dummypdf.pdf', 'rb') as template_file, \
        open('png2pdf.pdf', 'rb') as watermark_file:
    template = PyPDF2.PdfFileReader(template_file)
    watermark = PyPDF2.PdfFileReader(watermark_file)
    output = PyPDF2.PdfFileWriter()

    for i in range(template.getNumPages()):
        page = template.getPage(i)
        page.mergePage(watermark.getPage(0))
        output.addPage(page)

    with open('watermarked_output.pdf', 'wb') as file:
        output.write(file)

# Cosmetic progress bar: it runs after the work is done and does not track it.
for i in tqdm(range(100)):
    time.sleep(0.005)
print("All PDF's are watermarked")
def scan_to_stub(src_scanned_pdf_file_path,
                 dst_stub_pdf_file_path,
                 toc,
                 title,
                 orchestra,
                 stamp_descs=None,
                 page_info_line_y_pos=1.0):
    """
    creates musical score stub from a musical score raw scan :
    - adds a table of contents
    - adds a stamp
    - numbers the pages

    :param Path src_scanned_pdf_file_path: the source file that is expected to contain the scanned musical scores
    :param Path dst_stub_pdf_file_path: the destination file that is expected to contain the stub of musical scores
    :param TableOfContents toc:
    :param str title: musical piece title
    :param Orchestra orchestra: the inventory of musical instruments
    :param list(StampDesc) or None stamp_descs: description of the stamps to overlay on each page (None means no stamp)
    :param float page_info_line_y_pos: y position of the status line relative to the bottom of the page
    :raises Exception: when a track id of the toc cannot be turned into a Track
    """
    # A fresh list per call: a ``[]`` default would be shared between calls.
    if stamp_descs is None:
        stamp_descs = []

    assert len(toc.tracks) > 0
    assert isinstance(src_scanned_pdf_file_path, Path)
    assert isinstance(dst_stub_pdf_file_path, Path)

    # check that the track_ids in the toc are known
    for track_id in toc.get_track_ids():
        try:
            Track(track_id, orchestra)
        except KeyError as e:
            raise Exception(
                "Failed to identify track id '%s'. Either its syntax is incorrect or the related instrument in not yet registered in the orchestra."
                % (track_id)) from e

    # NOTE(review): fixed scratch directory shared by every run
    tmp_dir = Path('/tmp/pymusco')
    tmp_dir.mkdir(parents=True, exist_ok=True)

    # extract each page of the scan, collecting the returned image paths
    scanned_image_file_paths = []
    with open(src_scanned_pdf_file_path, 'rb') as src_pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(src_pdf_file)
        for page_index in range(pdf_reader.numPages):
            print('page_index = %d' % page_index)
            page = pdf_reader.getPage(page_index)
            image_file_path = extract_pdf_page(page,
                                               image_dir=tmp_dir,
                                               image_name=('page%03d' %
                                                           page_index))
            scanned_image_file_paths.append(image_file_path)

    images_to_pdf(
        StubContents(image_file_paths=scanned_image_file_paths,
                     toc=toc,
                     title=title,
                     stamp_descs=stamp_descs,
                     page_info_line_y_pos=page_info_line_y_pos),
        dst_stub_pdf_file_path)
def fetch_wrapper(monster_manual_path, creature_page_number):
    """
    Wrapper function to iterate over page numbers until the desired creature
    trait we want to fetch has been found. For every trait the search starts
    on the creature page and gives up once it is more than five pages past it.

    :param monster_manual_path: file path of the Monster Manual pdf (opened
        here in binary mode)
    :param creature_page_number: page number for creature info in Monster
        Manual, determined by the monster_manual_lookup method
    :return creature_dict: dictionary with all the traits that we are
        interested in (i.e., attributes, hit_points, etc.) as keys and stats
        for those traits as values.
        NOTE(review): no ``return`` statement is visible in this block --
        confirm whether the function continues elsewhere.
    """
    # dict to hold all information we need about a creature
    creature_dict = {}

    # list of traits that we would like to fetch for creature
    trait_list = [
        'attributes',
        'armor_class',
        'hit_points',
        'alignment',
        'creature_type',
        'languages',
        'passive_perception',
    ]

    # Open up Monster Manual pdf and initialize a new reader
    with open(monster_manual_path, 'rb') as pdf:
        # Create a pdf reader object
        pdf_reader = PyPDF2.PdfFileReader(pdf)

        for trait in trait_list:
            # Initialize loop to grab all desired traits of a given creature
            pass_code = 0
            trait_page_number = creature_page_number

            # Initialize empty dict to keep track of traits we are unable to
            # fetch
            # NOTE(review): re-created for every trait and not read in this
            # block -- confirm whether it was meant to accumulate.
            error_dict = {}

            # Initialize empty dict for attributes
            if trait == 'attributes':
                attribute_dict = {}

            # Scan forward one page at a time until a fetcher reports a
            # truthy pass code (or the five page limit below is hit)
            while not pass_code:
                trait_page = pdf_reader.getPage(trait_page_number)
                print("trait_page_number: ", trait_page_number)
                trait_text = trait_page.extractText().replace('\n', '')

                # Fetch attributes
                if trait == 'attributes':
                    attribute_dict, pass_code = fetch_attributes(
                        trait_text, attribute_dict=attribute_dict)
                # Fetch armor class
                elif trait == 'armor_class':
                    armor_class, pass_code = fetch_armor_class(trait_text)
                # Fetch hit points
                elif trait == 'hit_points':
                    hit_points, pass_code = fetch_hit_points(trait_text)
                # Fetch alignment
                elif trait == 'alignment':
                    alignment, pass_code = fetch_alignment(trait_text)
                # Fetch creature type (uses the alignment fetched just before)
                elif trait == 'creature_type':
                    creature_type, pass_code = fetch_creature_type(
                        trait_text, alignment)
                    print("pass_score after creature_type: ", pass_code)
                # Fetch languages
                # NOTE(review): 'languages' is not copied into creature_dict
                # in the visible code.
                elif trait == 'languages':
                    bag_of_words = trait_text.split()
                    languages, pass_code = fetch_known_languages(bag_of_words)
                # NOTE(review): 'challenge' is not in trait_list, so this
                # branch is never taken as written.
                elif trait == 'challenge':
                    bag_of_words = trait_text.split()
                    challenge, pass_code = fetch_challenge_rating(bag_of_words)
                # Nothing is fetched for passive perception: pass_code stays
                # 0, so this trait always runs into the five page limit
                elif trait == 'passive_perception':
                    pass

                # see if pass code has been issued. If not, increment page
                # number by one and fetch next page of text to sift through
                if not pass_code:
                    trait_page_number += 1

                # If we have gone five pages without seeing anything, issue
                # error code for attributes and continue
                if (trait_page_number - creature_page_number) > 5:
                    # Issue error codes for specific attributes we were unable
                    # to fetch
                    if trait == 'attributes':
                        error_dict['attributes'] = {}
                        for key in attribute_dict.keys():
                            if not attribute_dict[key]:
                                error_dict['attributes'][key] = 1
                            else:
                                error_dict['attributes'][key] = 0
                    else:
                        error_dict[trait] = 0

                    # Since we have busted the five page limit, break the
                    # loop for this trait and move onto the next
                    break

    # Populate the creature_dict with traits that we have fetched
    creature_dict['attributes'] = attribute_dict
    creature_dict['armor_class'] = armor_class
    creature_dict['hit_points'] = hit_points
    creature_dict['alignment'] = alignment
    creature_dict['creature_type'] = creature_type
import PyPDF2

# Stamp the first page of wtr.pdf on every page of combined.pdf.
# Both inputs stay open until the output is written, then are closed.
with open('combined.pdf', 'rb') as template_file, \
        open('wtr.pdf', 'rb') as watermark_file:
    template = PyPDF2.PdfFileReader(template_file)
    watermark = PyPDF2.PdfFileReader(watermark_file)
    output = PyPDF2.PdfFileWriter()

    for i in range(template.getNumPages()):
        page = template.getPage(i)
        page.mergePage(watermark.getPage(0))
        output.addPage(page)

    with open('combined_watermarked.pdf', 'wb') as file:
        output.write(file)
import pyttsx3
import PyPDF2 as p2

# Open the file you want to read aloud; it is closed once the text of the
# page has been extracted.
with open('./pdf/lipsum.pdf', 'rb') as file:
    # Read contents of file into book object
    book = p2.PdfFileReader(file)

    # Load first page (page 0 i.e. page 1) contents into page object
    page = book.getPage(0)

    # Store contents of page in a text object
    text = page.extractText()

# Print the text for debugging purposes. Comment the line below out if the
# output is not wanted.
print(text)

# Initialize the read-aloud python package
speaker = pyttsx3.init()

# Read aloud the pdf page selected
speaker.say(text)

# Block program execution until the read-aloud command queue is cleared
speaker.runAndWait()

print('Program completed.')
def extractData(file, page):
    """Return the text extracted from one page of a PDF file.

    :param file: path of the PDF file to open (opened in binary mode)
    :param page: page index passed to the reader's ``getPage``
    :return: the result of ``extractText()`` for that page
    """
    # Extract inside the ``with`` so the handle is always closed.
    with open(file, 'rb') as pdfFileObj:
        pdfReader = pdf.PdfFileReader(pdfFileObj)
        return pdfReader.getPage(page).extractText()
import PyPDF2

# Extract the text of the last page of example2.pdf
with open("example2.pdf", "rb") as pdf:
    pdfRead = PyPDF2.PdfFileReader(pdf)
    pdfPages = pdfRead.numPages
    selectedPages = pdfRead.getPage(pdfPages - 1)
    # PyPDF2 extraction is used with text data
    text = selectedPages.extractText()

# Append the text to the output file; ``text`` is one string, so a single
# write is enough.
with open(
        r"C:\Users\Raksh\Documents\GitHub\Python-Converting-Pdf-To-Text\text2.txt",
        "a") as file:
    file.write(text)

print("Done Converting !!")
import PyPDF2 as pd

# filename = input('Path to the file: ')
filename = 'output.pdf'
tried = 0

# Try every (lower-cased) line of dictionary.txt as password of the PDF.
with open(filename, 'rb') as file:
    pdfReader = pd.PdfFileReader(file)
    if not pdfReader.isEncrypted:
        print('The file is not encryted! You can successfully open it!')
    else:
        with open('dictionary.txt', 'r') as wordListFile:
            words = wordListFile.read().lower().split('\n')
        for word in words:
            print('Trying dencryption by: {}'.format(word))
            result = pdfReader.decrypt(word)
            # decrypt() gives 0 on failure and a non-zero code on a match
            # (1 = user password, 2 = owner password); both open the file.
            if result:
                print('Success! The password is: ' + word)
                break
            tried += 1
            print('Passwords tried: ' + str(tried))
import PyPDF2
import pyttsx3

# Read every page of the sample PDF aloud; the file is closed afterwards.
with open('Related/sample.pdf', 'rb') as infile:
    pdfReader = PyPDF2.PdfFileReader(infile)
    num_Pages = pdfReader.numPages
    print(num_Pages)

    start = pyttsx3.init()
    print("Playing audio..")
    for i in range(0, num_Pages):
        page = pdfReader.getPage(i)
        text = page.extractText()
        start.say(text)
        start.runAndWait()
import PyPDF2

# pdf file object and the reader built on top of it
pdfFileObj = open('filename.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

# number of pages in the pdf file
number_pages = pdfReader.numPages
print(pdfReader.numPages)

count = count_1 = 0

# print the text of every page from page index 2 onwards
for i in range(2, number_pages):
    pageObj = pdfReader.getPage(i)
    requiremt_extract = pageObj.extractText()
    print(requiremt_extract)
    # a=requiremt_extract.split('\n' or '.')
    # for section_number in a:
    #     if len(section_number)<=3:
    #         for dot in section_number:
    #             if dot == '.':
    #                 try:
    #                     if(int(section_number[:-1])):
    #                         count=int(section_number[:-1])
    #                         print(section_number[:-1])
    #
import PyPDF2

# Print the page count and the text of the first page of sample.pdf.
with open('sample.pdf', 'rb') as pdfFile:
    # The class is ``PdfFileReader``; ``pdfFileReader`` does not exist.
    pdfReader = PyPDF2.PdfFileReader(pdfFile)
    print(pdfReader.numPages)
    pageObj = pdfReader.getPage(0)
    print(pageObj.extractText())
# paths of all the .pdf files of the target directory, sorted
# case-insensitively
pdfFiles = sorted(
    (os.path.join(directory_target, name)
     for name in os.listdir(directory_target)
     if name.endswith('.pdf')),
    key=str.lower)

logging.debug('pdf files list after sorting')
logging.debug(pdfFiles)

# new in-memory pdf the pages are added to
pdfWriter = PyPDF2.PdfFileWriter()

# loop through all the PDF files
for filename in pdfFiles:
    pdfFileObj = open(filename, 'rb')  # read in binary mode
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    # add every page except the first one (index 0), hence start at 1
    for pageNum in range(1, pdfReader.numPages):
        pdfWriter.addPage(pdfReader.getPage(pageNum))

# save the resulting PDF to a file
import PyPDF2

# Stamp the first page of wtr.pdf on every page of super_pdf.pdf.
# Both inputs stay open until the output is written, then are closed.
with open('super_pdf.pdf', 'rb') as template_file, \
        open('wtr.pdf', 'rb') as watermark_file:
    template = PyPDF2.PdfFileReader(template_file)
    watermark = PyPDF2.PdfFileReader(watermark_file)
    output = PyPDF2.PdfFileWriter()

    for i in range(template.getNumPages()):
        page = template.getPage(i)
        page.mergePage(watermark.getPage(0))
        output.addPage(page)

    with open('watermarked_output.pdf', 'wb') as file:
        output.write(file)
def pdfMerge(pdf_list):
    """Merge the given PDFs, in order, into 'Adrien Clay Resume Cover Letter.pdf'.

    :param pdf_list: iterable of items accepted by ``PdfFileMerger.append``
    """
    merger = PyPDF2.PdfFileMerger()
    try:
        for pdf in pdf_list:
            merger.append(pdf)
        merger.write('Adrien Clay Resume Cover Letter.pdf')
    finally:
        # Release the merger's file handles even when an append/write fails.
        merger.close()
import PyPDF2

# This works fine: print the text of every page of demo.pdf and copy the
# pages, with links removed, into new.pdf
with open('demo.pdf', 'rb') as pdf_obj:
    pdf = PyPDF2.PdfFileReader(pdf_obj)
    out = PyPDF2.PdfFileWriter()

    for page in pdf.pages:
        # page.scale(2, 2)
        print(page.extractText(), "\n\n")
        out.addPage(page)

    with open('new.pdf', 'wb') as f:
        out.removeLinks()
        dir(out)
        out.write(f)

# # This attempts to remove annotations
# with open('old.pdf', 'rb') as pdf_obj:
#     pdf = PyPDF2.PdfFileReader(pdf_obj)
#     page = pdf.pages[2]
#     print(page['/Annots'], '\n\n\n\n')
#     page.Annots = []
#     print(page['/Annots'])
import PyPDF2

# with open("resume_AaryamanSaini.pdf", "rb") as file:
#     reader = PyPDF2.PdfFileReader(file)
#     print(reader.numPages)
#     page = reader.getPage(0)
#     page.rotateClockwise(90)
#     writer = PyPDF2.PdfFileWriter()
#     writer.addPage(page)
#     with open("rotated.pdf", "wb") as output:
#         writer.write(output)

# Concatenate the two files, in order, into combined.pdf
merger = PyPDF2.PdfFileMerger()
file_names = ["rotated.pdf", "resume_AaryamanSaini.pdf"]
try:
    for file_name in file_names:
        merger.append(file_name)
    merger.write("combined.pdf")
finally:
    # Release the merger's file handles even when an append/write fails.
    merger.close()
## Code to rotate a complete PDF file
import PyPDF2

file_name = '20200210225306926'
page_to_be_rotated = 1

pdf_in = open(f'{file_name}.pdf', 'rb')
pdf_reader = PyPDF2.PdfFileReader(pdf_in)
pdf_writer = PyPDF2.PdfFileWriter()

# turn every page upside down and queue it for writing
for page in map(pdf_reader.getPage, range(pdf_reader.numPages)):
    page.rotateClockwise(180)
    pdf_writer.addPage(page)

pdf_out = open(f'{file_name}_rotated.pdf', 'wb')
pdf_writer.write(pdf_out)

# output first, then the source the pages were read from
pdf_out.close()
pdf_in.close()