def combina_foi_din_diferite_pdf():
    """Merge the PDFs of the current directory into ``rezultat_merging.pdf``.

    The PDFs are processed in case-insensitive alphabetical order.  As in the
    original implementation, the first page of every document is skipped
    (page indices start at 1).  The output file itself is never used as an
    input, and nothing is written when no page was collected.

    Returns:
        None
    """
    output_name = 'rezultat_merging.pdf'

    # 1. Find all PDFs in the folder (a previous result must not be re-merged).
    pdfFiles = [
        filename for filename in os.listdir('.')
        if filename.endswith('.pdf') and filename != output_name
    ]
    pdfFiles.sort(key=str.lower)
    print(pdfFiles)

    pdfWriter = PyPDF3.PdfFileWriter()
    open_files = []
    pages_added = 0
    try:
        # 2. Open each PDF.  The handles stay open until the result has been
        #    written because the writer pulls page data from them on demand.
        for filename in pdfFiles:
            pdfFileObj = open(filename, 'rb')
            open_files.append(pdfFileObj)
            pdfReader = PyPDF3.PdfFileReader(pdfFileObj)

            # 3. Add each page of *this* document, skipping its first page.
            for pageNum in range(1, pdfReader.numPages):
                pdfWriter.addPage(pdfReader.getPage(pageNum))
                pages_added += 1

        # 4. Save the result exactly once.
        if pages_added:
            with open(output_name, 'wb') as pdfOutput:
                pdfWriter.write(pdfOutput)
    finally:
        for pdfFileObj in open_files:
            pdfFileObj.close()
def decrypt():
    """Interactively decrypt a PDF and save it as ``decrypted<name>``.

    Prompts for the PDF file name and, when the document is encrypted, for
    its password.  The decrypted copy is written to the current working
    directory under the name ``'decrypted' + <original base name>``.

    Exits the process (via ``sys.exit``) when the given file does not exist,
    does not end in ``.pdf``, or when the password is rejected.

    Returns:
        None
    """
    pdfName = input('Enter the name of PDF file to decrypt:\n')
    pdfName = path.abspath(pdfName)

    # Check if entered filename is valid
    if not path.exists(pdfName) or pdfName[-4:].lower() != '.pdf':
        print(f'The filename {pdfName} is not a PDF.')
        sys.exit()

    outputName = 'decrypted' + path.basename(pdfName)

    # The input must stay open until the writer has finished, so the whole
    # copy happens inside the ``with`` block.
    with open(pdfName, 'rb') as pdfFile:
        pdfReader = PyPDF3.PdfFileReader(pdfFile)

        if pdfReader.isEncrypted:
            # Password for decryption
            password = input('Enter the password to decrypt the PDF: \n')
            # decrypt() returns 0 when the password matches neither the
            # user nor the owner password.
            if not pdfReader.decrypt(password):
                print('Wrong password, the PDF could not be decrypted.')
                sys.exit(1)

        pdfWriter = PyPDF3.PdfFileWriter()

        # Loop through the pages and add them to pdfWriter
        for pageNum in range(pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(pageNum))

        # Save the resulting PDF to a file
        with open(outputName, 'wb') as decryptedPdf:
            pdfWriter.write(decryptedPdf)

    print(f'File decrypted and saved as decrypted{path.basename(pdfName)}')
def add_barcode(self, barcode, original_path, x_pos=None, y_pos=None):
    """
    Adds a barcode to the first page of a document and saves a copy.

    The barcode is centred horizontally on the first page; all other pages
    are copied unchanged.  The result is written to
    ``<self.save_path>/<original file stem>_<barcode>_coded.pdf``.

    :param barcode: Barcode to add to the file specified in original_path
        (passed to ``self.barcode_doc.generate_barcode``).
    :param original_path: Path to the original file.
    :param x_pos: Accepted for interface compatibility but currently
        ignored: the barcode is always centred horizontally.
    :param y_pos: Position of the bottom of the barcode measured in mm from
        the bottom of the page (converted by ``self.get_y_pos``).  ``None``
        places the barcode at y = 0.
    :return: None
    """
    import os  # local import: the file-level import block is not visible here

    merger = PyPDF3.PdfFileReader(original_path)
    barcode_doc = PyPDF3.PdfFileReader(
        self.barcode_doc.generate_barcode(barcode)).getPage(0)
    first_page = merger.getPage(0)

    if y_pos is not None:
        barcode_target_y = self.get_y_pos(first_page.mediaBox[3], y_pos)
    else:
        barcode_target_y = 0

    # Half the page width minus half the barcode width centres the barcode.
    centre_pos = (float(first_page.mediaBox[2]) / 2
                  - float(barcode_doc.mediaBox[2]) / 2)
    first_page.mergeTranslatedPage(barcode_doc, centre_pos, barcode_target_y)

    out = PyPDF3.PdfFileWriter()
    out.addPage(first_page)
    for page in merger.pages[1:]:
        out.addPage(page)

    # File name without directory and extension.  The previous rfind-based
    # slice produced an empty name for paths without a '/' and kept the
    # leading '/' otherwise.
    filename = os.path.splitext(os.path.basename(original_path))[0]
    output_filename = f"{self.save_path}/{filename}_{barcode}_coded.pdf"
    with open(output_filename, 'wb') as output:
        out.write(output)
def encripteaza(pdf_file, password='swordfish'):
    """Write an encrypted copy of ``pdf_file`` to ``rezultat_encriptare.pdf``.

    Args:
        pdf_file: Path of the PDF to encrypt.
        password: Password used to encrypt the copy.  Defaults to the value
            that was previously hard-coded.

    Returns:
        None
    """
    # The source must remain open until the writer has produced the output.
    with open(pdf_file, 'rb') as pdfFile:
        pdfReader = PyPDF3.PdfFileReader(pdfFile)
        pdfWriter = PyPDF3.PdfFileWriter()
        for pageNum in range(pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(pageNum))

        pdfWriter.encrypt(password)  # password the copy is encrypted with
        with open('rezultat_encriptare.pdf', 'wb') as resultPdf:
            pdfWriter.write(resultPdf)
def roteste_pagina(pdf_file):
    """Rotate the first page of ``pdf_file`` and save it as a new PDF.

    The first page is rotated 90 degrees clockwise and written, as the only
    page, to ``rezultat_rotire.pdf``.

    Args:
        pdf_file: Path of the source PDF.

    Returns:
        None
    """
    # ``with`` guarantees both files are closed even if reading or writing
    # raises; the source stays open until the output has been written.
    with open(pdf_file, 'rb') as minutesFile:
        pdfReader = PyPDF3.PdfFileReader(minutesFile)
        page = pdfReader.getPage(0)
        page.rotateClockwise(90)

        pdfWriter = PyPDF3.PdfFileWriter()
        pdfWriter.addPage(page)
        with open('rezultat_rotire.pdf', 'wb') as resultPdfFile:
            pdfWriter.write(resultPdfFile)
def overlay(pdf_file):
    """Stamp ``watermark.pdf`` onto the first page of ``pdf_file``.

    The first page of ``watermark.pdf`` is merged into the first page of the
    source document; the remaining pages are copied unchanged.  The result is
    written to ``rezultat_overlay.pdf``.

    Args:
        pdf_file: Path of the PDF that receives the watermark.

    Returns:
        None
    """
    # Both inputs stay open until the output is written; previously the
    # watermark handle was never closed at all.
    with open(pdf_file, 'rb') as minutesFile, \
            open('watermark.pdf', 'rb') as watermarkFile:
        pdfReader = PyPDF3.PdfFileReader(minutesFile)
        minutesFirstPage = pdfReader.getPage(0)

        # Use PyPDF3 like the rest of this function (was PyPDF2).
        pdfWatermarkReader = PyPDF3.PdfFileReader(watermarkFile)
        minutesFirstPage.mergePage(pdfWatermarkReader.getPage(0))

        pdfWriter = PyPDF3.PdfFileWriter()
        pdfWriter.addPage(minutesFirstPage)
        for pageNum in range(1, pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(pageNum))

        with open('rezultat_overlay.pdf', 'wb') as resultPdfFile:
            pdfWriter.write(resultPdfFile)
def decripteaza():
    """Demonstrate decrypting ``encrypted.pdf`` with the password 'rosebud'.

    Prints, in order: whether the document is encrypted, the return value of
    ``decrypt`` and the first page object.

    Returns:
        None
    """
    with open('encrypted.pdf', 'rb') as pdfFile:
        pdfReader = PyPDF3.PdfFileReader(pdfFile)
        print(pdfReader.isEncrypted)
        print(pdfReader.decrypt('rosebud'))
        # ``print(pageObj=...)`` passed an unknown keyword argument to
        # print() and raised TypeError; bind the page first, then print it.
        pageObj = pdfReader.getPage(0)
        print(pageObj)
def search(path, term, flags):
    """Recursively look for ``term`` in the text of every PDF below ``path``.

    Sub-directories are searched depth-first.  Only files whose extension is
    exactly ``.pdf`` are opened.  Progress is reported through
    ``print_searching`` and every matching page through ``print_found``
    (page numbers are reported zero-based).

    Args:
        path: Directory to scan.
        term: Regular expression handed to ``re.search``.
        flags: ``re`` flags applied to the search.

    Returns:
        None
    """
    for entry in listdir(path):
        entry_path = join(path, entry)

        # Directories are descended into before anything else.
        if isdir(entry_path):
            search(entry_path, term, flags)

        if not isfile(entry_path):
            continue

        stem, suffix = splitext(entry)
        # Only PDFs are of interest.
        if suffix != ".pdf":
            continue

        print_searching("Searching in: " + stem + " ... ")
        handle = open(entry_path, 'rb')
        reader = PyPDF3.PdfFileReader(handle)

        # Check the extracted text of each page in turn.
        for index in range(reader.getNumPages()):
            page_text = reader.getPage(index).extractText()
            if re.search(term, page_text, flags=flags):
                print_found("".join(
                    ["\tFOUND: ", entry_path, " Page: ", str(index)]))

        # Done with this document.
        handle.close()
def parse_pdf(doc_name):
    """
    Split the text of a PDF document into individual clauses.

    Text fragments are accumulated until a fragment ends with a full stop
    and the accumulated clause is longer than 30 characters; the clause is
    then emitted and accumulation restarts.

    Parameters
    ----------
    doc_name: string
        filepath of the PDF document (with pdf extension)

    Returns
    -------
    list of string
        the clauses found, in document order
    """
    reader = PyPDF3.PdfFileReader(doc_name)
    page_total = reader.getNumPages()

    clauses = []
    pending = ""
    for page_index in range(page_total):
        page_text = reader.getPage(page_index).extractText()
        for raw_fragment in page_text.split(" \n"):
            # Normalise the fragment: trim it and drop embedded newlines.
            fragment = raw_fragment.strip().replace('\n', '')
            if not fragment:
                continue

            pending = (pending + " " + fragment).strip()
            if fragment.endswith(".") and len(pending) > 30:
                clauses.append(pending)
                pending = ""
    return clauses
def _import_pdf_fields(self):
    """
    Read the form fields of the original PDF and of its copy.

    ``self.pdf_fields`` is reset to an empty dict and then filled with the
    result of ``getFields()`` under the key ``'o'`` for
    ``self.pdf_file_path`` and ``'c'`` for ``self.pdf_copy_file_path``.

    Parameters:
        None

    Returns:
        None
    """
    self.pdf_fields = {}

    # The original is read first, then the copy.
    sources = (('o', 'pdf_file_path'), ('c', 'pdf_copy_file_path'))
    for key, path_attribute in sources:
        pdf = getattr(self, path_attribute)
        reader = PyPDF3.PdfFileReader(pdf)
        self.pdf_fields[key] = reader.getFields()
def _get_fields(self):
    """
    Store the form fields of ``self.pdf_file_path`` in ``self._fields``.

    Parameters:
        None

    Returns:
        None
    """
    # getFields() supplies the value that is kept on the instance.
    reader = PyPDF3.PdfFileReader(self.pdf_file_path)
    self._fields = reader.getFields()
def extrage_text(pdf_file):
    """Print the page count and the text of the first page of ``pdf_file``.

    Args:
        pdf_file: Path of the PDF to inspect.

    Returns:
        None
    """
    # Read everything that is needed while the file is open, then close it.
    with open(pdf_file, 'rb') as pdfFileObj:
        pdfReader = PyPDF3.PdfFileReader(pdfFileObj)
        numPages = pdfReader.numPages
        # Extract once and reuse the result instead of extracting twice.
        text = pdfReader.getPage(0).extractText()

    print(numPages)
    print(text)
def append_pdf():
    """Append one PDF to another (merge two PDFs).

    All pages of ``meetingminutes.pdf`` followed by all pages of
    ``meetingminutes2.pdf`` are written to ``combinedminutes1.pdf``.

    Returns:
        None
    """
    pdfWriter = PyPDF3.PdfFileWriter()

    # Both inputs remain open until the combined document is written;
    # ``with`` closes every file even when an error occurs.
    with open('meetingminutes.pdf', 'rb') as pdf1File, \
            open('meetingminutes2.pdf', 'rb') as pdf2File:
        for pdfFile in (pdf1File, pdf2File):
            pdfReader = PyPDF3.PdfFileReader(pdfFile)
            for pageNum in range(pdfReader.numPages):
                pdfWriter.addPage(pdfReader.getPage(pageNum))

        with open('combinedminutes1.pdf', 'wb') as pdfOutputFile:
            pdfWriter.write(pdfOutputFile)
def extract_data_from_pdf(self, pdf_file):
    """Return the concatenated text of every page of ``pdf_file``.

    Args:
        pdf_file: Path of the PDF to read.

    Returns:
        str: The extracted text of all pages, in page order, joined without
        a separator.
    """
    # The file is closed as soon as the text has been extracted.
    with open(pdf_file, 'rb') as pdfFileObj:
        pdfReader = PyPDF3.PdfFileReader(pdfFileObj)
        return "".join(
            pdfReader.getPage(page_number).extractText()
            for page_number in range(pdfReader.getNumPages())
        )
def simple():
    """Print basic information about ``meetingminutes.pdf``.

    Prints whether the document is encrypted, its page count and the text
    of its second page (index 1).  A document with fewer than two pages
    therefore makes ``getPage(1)`` fail.

    Returns:
        None
    """
    with open('meetingminutes.pdf', 'rb') as pdf_file:
        pdf_reader = PyPDF3.PdfFileReader(pdf_file)
        print(pdf_reader.isEncrypted)
        print(pdf_reader.numPages)
        page = pdf_reader.getPage(1)
        print(page.extractText())
def extractContent(content="", page_index=100):
    """Print the text of one page from every PDF in ``Resource/``.

    Encrypted documents are opened with pikepdf, saved unencrypted into an
    in-memory buffer and read from there, so the files on disk are never
    modified.  Documents that do not have the requested page are reported
    and skipped instead of aborting the whole run.

    Args:
        content: Unused; kept for backward compatibility.
        page_index: Zero-based index of the page to extract.  Defaults to
            100, the value that was previously hard-coded.

    Returns:
        None.  The list of extracted page texts is printed.
    """
    import io  # local import: the file-level import block is not visible here

    pdfFileText = []
    for fileName in glob.glob("Resource/*.pdf"):
        pdfFileReader = pyPDF.PdfFileReader(fileName)

        if pdfFileReader.isEncrypted:
            # Re-reading the same path would still yield the encrypted
            # document, so read the decrypted copy that pikepdf produces.
            decrypted = io.BytesIO()
            with pikepdf.open(fileName) as pdfFile:
                pdfFile.save(decrypted)
            decrypted.seek(0)
            print("%s decrypted!" % fileName)
            pdfFileReader = pyPDF.PdfFileReader(decrypted)

        if page_index >= pdfFileReader.numPages:
            print("%s has no page %d, skipped" % (fileName, page_index))
            continue

        pdfFileText.append(pdfFileReader.getPage(page_index).extractText())

    print(pdfFileText)
def pdf2page_count(self, path):
    """Return the number of pages of the PDF at ``path``.

    When the file cannot be opened or parsed, a warning is shown,
    ``self._go_back`` is scheduled to run after 3 seconds and ``1`` is
    returned as a fallback page count.

    Args:
        path: Path of the PDF file.

    Returns:
        int: The page count, or ``1`` when the file could not be processed.
    """
    try:
        # Reading ``numPages`` is what actually parses the page tree, so it
        # belongs inside the ``try``; ``with`` closes the file in every case.
        with open(path, 'rb') as pdfFileObj:
            pdfReader = PyPDF3.PdfFileReader(pdfFileObj, strict=False)
            return pdfReader.numPages
    except Exception:
        # UI boundary: any failure is reported to the user, not raised.
        self._show_warning(
            'Unable to process the file,\nplease contact our customer support.'
        )
        Clock.schedule_once(self._go_back, 3)
        return 1
def main():
    """Merge all PDFs of a directory into ``./merger.pdf``.

    The directory is read from standard input.  Its entries are printed and
    every entry with a ``.pdf`` extension (case-insensitive) is appended in
    alphabetical order.

    Returns:
        None
    """
    directory = input()
    # Sorted so the page order of the result does not depend on the
    # arbitrary order in which os.listdir returns entries.
    filelist = sorted(os.listdir(directory))
    print(filelist)

    merger = PyPDF3.PdfFileMerger()
    try:
        for name in filelist:
            _, ext = os.path.splitext(name)
            if ext.lower() == '.pdf':
                # listdir yields bare names; resolve them against the
                # directory that was listed, not the working directory.
                merger.append(os.path.join(directory, name))
        merger.write('./merger.pdf')
    finally:
        merger.close()
def find_table_page(pdf_path) -> Tuple[PageObject, int]:
    """
    Find the page whose text matches ``TABLE_CAPTION_PATTERN``.

    Returns the matching page object together with its 1-based page number.
    Raises ``TableExtractionError`` when no page matches.
    """
    reader = PyPDF3.PdfFileReader(str(pdf_path))
    page_total = reader.getNumPages()

    # The first page is skipped: the table is certainly not there.
    for zero_based in range(1, page_total):
        candidate = reader.getPage(zero_based)
        flattened = candidate.extractText().replace('\n', '')
        if not TABLE_CAPTION_PATTERN.search(flattened):
            continue
        return candidate, zero_based + 1  # report a 1-based page number

    raise TableExtractionError('could not find the table in the pdf')
def __init__(self, datasheet_path):
    """Open the datasheet PDF and build its outline and table structures.

    :param datasheet_path: Location of the datasheet PDF; wrapped in ``Path``.
    """
    self.path = Path(datasheet_path)
    # The file is opened twice, once for each reader.
    # NOTE(review): neither handle is closed in this method; confirm that
    # they are released elsewhere.
    self.pdf_file = PyPDF3.PdfFileReader(self.path.open('rb'))
    self.plumber = pdfplumber.load(self.path.open('rb'))
    self.raw_outline = []
    self.tables, self.figures = {}, {}  # type: Dict
    # Root of the content tree; the 'TABLES' node is attached to it below.
    self.table_of_content = DataSheetNode('ROOT', [0])
    self.table_root = DataSheetNode('TABLES', [-1])
    self.table_of_content.append(self.table_root)
    self.fallback_table: DataSheetTableNode = None
    # These run last, after all attributes above have been initialised.
    # NOTE(review): presumably they populate raw_outline and tables;
    # verify against their definitions.
    self.flatten_outline()
    self.sort_raw_outline()
    self.collect_tables()
def split_pdf(pdffile: str):
    """Split a PDF file in one file per page.

    Page ``n`` (zero-based) is written next to the input as
    ``<name without extension>_p_<n>.pdf``.  The case of the original path
    is preserved.

    Args:
        pdffile (str): The name of the PDF file to split.

    Returns:
        list: The names of the files written, in page order.
    """
    import os  # local import: the file-level import block is not visible here

    # splitext handles names without a ".pdf" suffix and keeps the case of
    # the path.  The previous lower()/index('.pdf') slicing failed on names
    # without ".pdf" and lower-cased directory and file names.
    file_first_name, _ = os.path.splitext(pdffile)

    pages = []
    with open(pdffile, 'rb') as pdffile_handle:
        pdf_in = pypdf.PdfFileReader(pdffile_handle)
        for page in range(pdf_in.numPages):
            page_name = f'{file_first_name}_p_{page}.pdf'
            pdf_out = pypdf.PdfFileWriter()
            pdf_out.addPage(pdf_in.getPage(page))
            with open(page_name, 'wb') as stream:
                pdf_out.write(stream)
            pages.append(page_name)
    return pages
def combine_pdfs(pdffiles: t.List[str], outfile: str):
    """Concatenate several PDF files into a single document.

    Args:
        pdffiles (list): Names of the PDF files, in the order in which they
            should appear in the result.
        outfile (str): Name of the PDF file to write.
    """
    merged = pypdf.PdfFileMerger()
    add_document = merged.append
    for source_path in pdffiles:
        add_document(source_path)

    # Write the combined document, then release the merger.
    merged.write(outfile)
    merged.close()
def simple2():
    """Combine ``meetingminutes.pdf`` and ``meetingminutes2.pdf``.

    All pages of the first document followed by all pages of the second are
    written to ``combinedminutes.pdf``.

    Returns:
        None
    """
    pdf_writer = PyPDF3.PdfFileWriter()

    # Both inputs stay open until the output has been written.
    with open('meetingminutes.pdf', 'rb') as pdf_file, \
            open('meetingminutes2.pdf', 'rb') as pdf_file2:
        pdf_reader = PyPDF3.PdfFileReader(pdf_file)
        # The second reader must wrap the *second* file; it used to wrap
        # ``pdf_file`` again, which duplicated the first document.
        pdf_reader2 = PyPDF3.PdfFileReader(pdf_file2)

        for reader in (pdf_reader, pdf_reader2):
            for page_num in range(reader.numPages):
                pdf_writer.addPage(reader.getPage(page_num))

        with open('combinedminutes.pdf', 'wb') as pdf_file3:
            pdf_writer.write(pdf_file3)
def find(self):
    """Copy every PDF below ``self.rootdir`` that contains ``self.word``.

    Steps:
      1. When ``self.overwrite`` is set, spaces in the names of the files
         directly inside ``self.rootdir`` are replaced by underscores.
      2. The directory ``<root>/<self.word>`` is recreated from scratch
         (``root`` is a name defined outside this method).
      3. Every file below ``self.rootdir`` is opened as a PDF; encrypted
         documents are tried with an empty password.  A file is copied into
         the new directory as soon as one page contains ``self.word``.

    Files that cannot be read as PDFs are reported and skipped.

    Returns:
        list: Paths of all files that could be opened as PDFs.
    """
    if self.overwrite == True:  # noqa: E712 - equality kept for non-bool flags
        for name in listdir(self.rootdir):
            source = join(self.rootdir, name)
            if isfile(source) and " " in name:
                # listdir yields bare names; rename relative to rootdir,
                # not relative to the current working directory.
                os.rename(source, join(self.rootdir, name.replace(" ", "_")))

    newdir = os.path.join(root, self.word)
    if os.path.exists(newdir):
        shutil.rmtree(newdir)
    os.makedirs(newdir)

    fil = []
    for subdir, dirs, files in os.walk(self.rootdir):
        for file in files:
            pdf = os.path.join(subdir, file)
            try:
                with open(pdf, 'rb') as pdfFileObj:
                    pdfReader = PyPDF3.PdfFileReader(pdfFileObj)
                    if pdfReader.isEncrypted:
                        pdfReader.decrypt('')
                    page_count = pdfReader.getNumPages()
                    fil.append(pdf)
                    # any() stops at the first page containing the word.
                    found = any(
                        self.word in pdfReader.getPage(i).extractText()
                        for i in range(page_count)
                    )
            except Exception:
                print(pdf + "not decryptable")
                continue

            if not found:
                continue

            target = os.path.join(newdir, file)
            # A file already located in the target directory (which may lie
            # below rootdir) must not be copied onto itself.
            if os.path.abspath(pdf) == os.path.abspath(target):
                continue
            try:
                # shutil.copy replaces the shell "copy" command that was
                # assembled from unquoted file names.
                shutil.copy(pdf, target)
            except OSError as err:
                print(pdf + " could not be copied: " + str(err))
    return fil
def croptime(self, Path):
    """Attach the PDF at ``Path`` to this object for later cropping.

    When the PDF cannot be opened the user is asked for a new path until
    one succeeds.  On success the reader is stored in ``self.tPdf``, the
    path in ``self.path``, ``self.is_cropped`` is set to ``True`` and the
    event is logged to ``LOG.txt``.

    Args:
        Path: Path of the PDF to attach.

    Returns:
        None
    """
    while True:
        try:
            # The former second argument "rb" was taken as the ``strict``
            # flag; it was truthy, which equals the default.
            TestPdf = PyPDF3.PdfFileReader(Path)
            break
        except Exception:
            # ``Exception`` instead of a bare except, so Ctrl-C can still
            # interrupt the retry loop.
            print("File not found")
            Path = input("Enter new path\n")

    self.tPdf = TestPdf
    self.path = Path
    self.is_cropped = True

    # Writing to log file
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        level=logging.INFO,
                        filename='LOG.txt')
    logging.info('Test pdf attached')
def __crop__(self, name):
    """Crop one page of the attached PDF and save it as its own file.

    The page number and the crop rectangle (upper-right and lower-left
    corner) are read interactively.  The cropped page is written to
    ``Class\\pdfFileHere\\<name>.pdf``.

    Args:
        name: Base name (without extension) of the output file.

    Returns:
        None
    """
    pagNum = int(input("Enter page number\n"))
    upperX = int(input("please enter upper x cordinate\n"))
    upperY = int(input("please enter upper y cordinate\n"))
    lowerX = int(input("please enter lower x cordinate\n"))
    lowerY = int(input("please enter lower y cordinate\n"))

    # The result was never used; the call is kept in case getPath() has
    # side effects.  NOTE(review): drop it if getPath() is a plain getter.
    self.getPath()

    writer = PyPDF3.PdfFileWriter()
    page = self.tPdf.getPage(pagNum)
    page.cropBox.setLowerLeft((lowerX, lowerY))
    page.cropBox.setUpperRight((upperX, upperY))
    writer.addPage(page)

    # Raw string: same value as before, without the invalid "\p" and "\{"
    # escape sequences.
    path = r"Class\pdfFileHere\{0}.pdf".format(name)
    with open(path, 'wb') as outstream:
        writer.write(outstream)
def get_grade_from_pdf_file(pdf_file):
    """
    Return a grade extracted from the PDF file, such as "10.0" or "9.5".

    The annotations of the first page are scanned in order; the first
    ``/Contents`` value accepted by ``is_grade`` is returned.

    :param pdf_file: file_path : string
    :return: grade : string, or "" when no annotation holds a grade
    """
    # ``with`` closes the file on every return path.
    with open(pdf_file, "rb") as pdf_handle:
        input1 = PyPDF3.PdfFileReader(pdf_handle, strict=False)
        page0 = input1.getPage(0)
        if '/Annots' not in page0:
            return ""

        for annot in page0['/Annots']:
            annot_obj = annot.getObject()
            if '/Contents' not in annot_obj:
                continue
            content = str(annot_obj['/Contents'])
            if is_grade(content):
                return content
    return ""
def download_workbook_pdf(self, workbook: WorkbookItem, dest_dir, data_filters: dict = None, page_type=None, orientation=None):
    """
    Export every view of a workbook and merge the results into one PDF.

    :param workbook: Workbook whose views are exported.
    :param dest_dir: Directory receiving ``<workbook.name>.pdf``; the
        individual view PDFs are requested into its ``views`` sub-directory.
    :param data_filters: Optional mapping of view-filter names to values;
        both are URL-quoted before being applied.
    :param page_type: Passed through to ``PDFRequestOptions``.
    :param orientation: Passed through to ``PDFRequestOptions``.
    :return: Path of the merged PDF.
    :raises Exception: When the workbook has no views, so no content was
        generated.
    """
    self.server.workbooks.populate_views(workbook)

    merger = PyPDF3.PdfFileMerger()
    target_pdf = os.path.join(dest_dir, workbook.name) + ".pdf"
    request_options = PDFRequestOptions(page_type=page_type,
                                        orientation=orientation)

    if data_filters is None:
        data_filters = {}
    for filter_name, filter_value in data_filters.items():
        request_options.vf(name=quote_plus(filter_name),
                           value=quote_plus(filter_value))

    log.info(
        "Exporting\nWorbook='%s' \nProject='%s' \nPage Type='%s' \nOrientation='%s' \nFilters='%s'\nFile='%s' "
        % (workbook.name, workbook.project_name, page_type, orientation,
           request_options.view_filters, target_pdf))

    # Download each view and append it to the merger, counting the views.
    appended = 0
    for view in workbook.views:
        view_pdf = self._download_view_pdf(
            view,
            dest_dir=os.path.join(dest_dir, 'views'),
            view_filters=request_options)
        merger.append(view_pdf)
        appended += 1

    if not appended:
        raise Exception("No Pdf Content Generated")

    merger.write(target_pdf)
    merger.close()
    log.info("Exported Workbook to pdf %s" % target_pdf)
    return target_pdf
def search(query):
    """Search the PDFs listed in ``pdfFiles`` for ``query`` and report hits.

    ``query`` is used as a case-insensitive regular expression against the
    extracted text of every page.  Each hit is recorded as a
    ``("PDF_NAME: <file name>", "Page: <1-based page>")`` tuple.  PDFs that
    raise ``PdfReadError`` are skipped.  The collected hits, which are empty
    when ``query`` is ``None``, are passed to ``write`` together with the
    query.
    """
    results = []
    documents = pdfFiles if query is not None else ()

    for pdf_path in documents:
        label = "PDF_NAME: " + pdf_path.rsplit('/', 1)[-1]
        try:
            reader = PyPDF3.PdfFileReader(pdf_path)
            for index in range(reader.getNumPages()):
                page_text = reader.getPage(index).extractText()
                if re.search(query, page_text, re.IGNORECASE):
                    results.append((label, "Page: {}".format(index + 1)))
        except PyPDF3.utils.PdfReadError:
            # Unreadable documents are skipped without a message.
            pass

    write(query, results)
def pdf_search():
    """Count how often ``pdfWord`` occurs in the PDF at ``pdfPath``.

    Both names are defined outside this function.  Non-overlapping
    occurrences are counted page by page and the total is printed.  An
    empty search word yields a count of 0 (the previous hand-written search
    loop never terminated in that case).

    Returns:
        None
    """
    count = 0
    with open(pdfPath, 'rb') as pdf_handle:
        reader = PyPDF3.PdfFileReader(pdf_handle)
        for page_index in range(reader.numPages):
            text = reader.getPage(page_index).extractText()
            if pdfWord:
                # str.count counts non-overlapping matches, exactly like the
                # former find()-and-advance loop.
                count += text.count(pdfWord)

    print(pdfWord, "finns", count, "gånger i dokumentet")