def extract_pdf_pypdf2(pdf_path, page_number=2):
    """Return the extracted text of a single page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to open (read in binary mode).
        page_number: Index passed to ``getPage``. Defaults to 2, the value
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        The result of ``extractText()`` for the selected page.

    Raises:
        IOError / OSError: if ``pdf_path`` cannot be opened.
    """
    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        if pdf.isEncrypted:
            # Try the empty password; the outcome is not checked, exactly
            # as before.
            pdf.decrypt('')
        # Extract while the file is still open: the reader works on the
        # open stream.
        return pdf.getPage(page_number).extractText()
def RemovePdfOwnerPassword(inputname, outputname):
    """Copy every page of ``inputname`` into a new PDF at ``outputname``.

    The input is decrypted with the empty password first, and its document
    information is printed before the pages are copied.

    Args:
        inputname: Path of the PDF to read.
        outputname: Path of the PDF to create.

    Returns:
        0 when the copy was written, -1 when ``decrypt`` raised
        ``KeyError('/Encrypt')``, which this function treats as "the input
        is not encrypted" (nothing is written in that case).

    Raises:
        KeyError: any other ``KeyError`` raised by ``decrypt`` is re-raised.
        IOError / OSError: if either file cannot be opened.
    """
    # The context managers close both files on every exit path, including
    # the early ``return -1`` and any exception.
    with open(inputname, 'rb') as inputfile:
        ipt = PdfFileReader(inputfile)
        try:
            ipt.decrypt("")
        except KeyError as e:
            # ``e.message`` only exists on Python 2; ``e.args`` carries the
            # missing key on both Python 2 and Python 3.
            if e.args and e.args[0] == '/Encrypt':
                print("%s is not an encrypted pdf" % inputname)
                return -1
            raise

        print(ipt.getDocumentInfo())

        wrt = PdfFileWriter()
        for i in range(ipt.getNumPages()):
            wrt.addPage(ipt.getPage(i))

        # Write while the input is still open: the page objects come from
        # the reader that wraps ``inputfile``.
        with open(outputname, "wb") as fl:
            wrt.write(fl)
    return 0
def createNewBooks(self, pdf_file, stPage, endPage, filename='my.pdf'):
    """Write pages ``stPage`` .. ``endPage - 1`` of ``pdf_file`` to ``filename``.

    Args:
        pdf_file: Path of the source PDF.
        stPage: Index of the first page to copy (the first page is 0).
        endPage: Index one past the last page to copy.
        filename: Path of the PDF to create. Defaults to ``'my.pdf'``.

    Returns:
        The string ``'Complete knifing'``.

    Raises:
        IOError / OSError: if either file cannot be opened.
    """
    with open(pdf_file, "rb") as source_stream:
        pdf_input = PdfFileReader(source_stream)
        if pdf_input.isEncrypted:
            # Original author's note: PyPDF2 may report PDFs as encrypted,
            # so decrypt before reading.
            # decrypt() returns a status code, not a reader, so its result
            # must not replace ``pdf_input`` (doing so broke getPage below).
            pdf_input.decrypt('')

        pdf_output = PdfFileWriter()
        for i in range(stPage, endPage):
            # Select the wanted page and add it to the new PDF.
            pdf_output.addPage(pdf_input.getPage(i))

        # Write while the source is still open: the pages come from the
        # reader that wraps ``source_stream``.
        with open(filename, 'wb') as output_stream:
            pdf_output.write(output_stream)
    return 'Complete knifing'
def processFile(self, curr_file):
    """Interactively read a PDF's metadata and append one row to ``self.container``.

    Files whose name does not contain ``".pdf"`` are ignored. For a PDF the
    function prompts the user, prints the raw document information, and
    appends ``[" | " + name, created, author, producer, modded, last_saved]``
    to ``self.container``. Fields that cannot be determined stay ``'-'``.
    Dates are rendered as ``MM/DD/YYYY hh:mm AM/PM``, assuming the stored
    value starts with ``YYYYMMDDHHMM`` once the ``D:`` prefix is stripped.

    Args:
        curr_file: Path (or bare name) of the file to inspect.

    Returns:
        None. Nothing is appended when the PDF has no document information
        or when any step fails; failures are reported on stdout.
    """
    if ".pdf" not in curr_file:
        return

    # ``raw_input`` only exists on Python 2; fall back to ``input`` elsewhere.
    try:
        prompt = raw_input
    except NameError:
        prompt = input

    author = '-'
    created = '-'
    producer = '-'
    modded = '-'
    last_saved = '-'

    try:
        prompt("Processing " + str(curr_file) + ".\nPress Enter to continue...")
        # Everything that touches the document info stays inside the
        # ``with`` block, because values may be read from the open stream.
        with open(curr_file, 'rb') as pdf_stream:
            pdfFile = PdfFileReader(pdf_stream)
            if pdfFile.getIsEncrypted():
                print("File is encrypted (maybe, this sometimes has false positives). Trying to decrypt.")
                pdfFile.decrypt('')
                print("Success! File decrypted.")

            docInfo = pdfFile.getDocumentInfo()
            if not docInfo:
                return

            print(" The RAW document information")
            for section in docInfo:
                print(docInfo[section])
            prompt("Press Enter to continue...")

            # Look through the whole dictionary for the fields of interest.
            if "/CreationDate" in docInfo:
                print(" Processing CREATION DATE information")
                print(" Creation Date RAW: " + docInfo["/CreationDate"])
                data = docInfo["/CreationDate"].strip("D:|'")
                year = data[0:4]
                date = data[4:6] + "/" + data[6:8]
                created_time = data[8:10] + ":" + data[10:12]
                print(" The value of 'created_time' is: " + str(created_time))
                print(" The data type is: " + str(type(created_time)))
                print(" Expecting H:M format")
                prompt(" Does it match?")
                created_time = time.strftime(
                    "%I:%M %p", time.strptime(created_time, "%H:%M"))
                created = date + "/" + year + " " + created_time

            if "/Author" in docInfo:
                print(" Processing AUTHOR information")
                author = docInfo["/Author"] + " "
                if len(author) <= 1:
                    author = "-"

            if "/Producer" in docInfo:
                print(" Processing PRODUCER information")
                # Remove the literal "(Windows)" marker. str.strip() would
                # instead strip any of those characters from both ends.
                producer = docInfo["/Producer"].replace("(Windows)", "")
                producer = re.sub(r'[^\w]', ' ', producer)
                if len(producer) == 0:
                    producer = "-"
                # Collapse runs of spaces in one pass. The former
                # replace-in-a-loop never terminated as written.
                producer = re.sub(r' {2,}', ' ', producer)

            if "/ModDate" in docInfo:
                print(" Processing MODIFIED DATE information")
                data = docInfo["/ModDate"].strip("D:|'")
                year = data[0:4]
                date = data[4:6] + "/" + data[6:8]
                modded_time = data[8:10] + ":" + data[10:12]
                modded_time = time.strftime(
                    "%I:%M %p", time.strptime(modded_time, "%H:%M"))
                modded = date + "/" + year + " " + modded_time

        # Drop the directory part of the file name, if any.
        if "/" in curr_file:
            curr_file = curr_file[curr_file.rfind("/") + 1:]
        if "\\" in curr_file:
            curr_file = curr_file.replace("\\", "")

        # Trim values that are too long for the output table.
        if len(curr_file) > 15:
            curr_file = curr_file[:15] + "..." + curr_file[-13:]
        if len(producer) > 30:
            producer = producer[:20] + " [snipped] "
        if len(author) > 20:
            author = author[:20] + " [snipped] "

        # A column is shown in the output only if at least one file has
        # data in it.
        self.container.append(
            [" | " + curr_file, created, author, producer, modded, last_saved])
        print("Parsing Completed.")
    except NotImplementedError:
        print("Tried to decrypt a secured/encrypted PDF, and it failed. Try to read details manually.")
    except Exception:
        # Deliberately best-effort: one bad file must not abort a batch.
        print("Parsing failed somewhere in the TRY statement.")
    return
def getPdfMetadata(self, path=None):
    """Fill ``self.book`` with metadata read from the PDF at ``path``.

    Every step is best-effort: a failure is logged and the remaining fields
    are still attempted. The attributes written are ``numberOfPages``,
    ``tag``, ``bookName``, ``publisher``, ``createdOn``, ``publishedOn``,
    ``fileSize`` and ``authors``.

    Args:
        path: Path of the PDF file. Nothing is done when it is empty/None.

    Returns:
        None. Results are stored on ``self.book``.

    Raises:
        OSError: if ``os.path.getsize(path)`` fails (this step is not guarded).
    """
    logger.debug('getPdfMetadata path: %s', path)
    if not path:
        return

    pdf_stream = None
    pdf_toread = None
    pdf_info = None
    try:
        # Open the file once and keep it open until every field has been
        # read; it is closed in the ``finally`` clause below.
        try:
            pdf_stream = open(path, "rb")
            pdf_toread = PdfFileReader(pdf_stream)
            logger.debug('getIsEncrypted : %s ', pdf_toread.getIsEncrypted())
            if pdf_toread.isEncrypted:
                try:
                    pdf_toread.decrypt('')
                except Exception as e:
                    logger.error(e, exc_info=True)
        except Exception as e:
            logger.error(e, exc_info=True)

        try:
            pdf_info = pdf_toread.getDocumentInfo()
            page_count = pdf_toread.getNumPages()
            logger.debug('NumPages:%s', page_count)
            self.book.numberOfPages = page_count

            subject = pdf_info.subject
            # isinstance() also accepts str subclasses; an exact type
            # comparison rejected them, so the tag was never populated.
            if subject and isinstance(subject, str):
                if not self.book.tag:
                    self.book.tag = subject
                else:
                    self.book.tag = self.book.tag + '' + subject
        except Exception as e:
            logger.error(e, exc_info=True)

        try:
            if pdf_info.title is not None and pdf_info.title.strip() != '':
                self.book.bookName = str(pdf_info.title)
        except Exception as e:
            logger.error(e, exc_info=True)

        try:
            if pdf_info.creator:
                # str() of encoded bytes yields the literal "b'...'" on
                # Python 3, so convert the text value directly.
                self.book.publisher = str(pdf_info.creator)
        except Exception as e:
            logger.error(e, exc_info=True)

        self.book.createdOn = datetime.now()

        try:
            # Characters 2..9 are parsed as YYYYMMDD (skipping a two-character
            # prefix, presumably "D:").
            date = datetime.strptime(
                str(pdf_info['/CreationDate'])[2:10], '%Y%m%d')
            self.book.publishedOn = date
        except Exception as e:
            logger.error(e, exc_info=True)
            logger.error('CreationDate not found')

        size_text = Util().convert_bytes(os.path.getsize(path))
        logger.debug(size_text)
        self.book.fileSize = size_text

        author = Author()
        val = 'Unknown'
        try:
            if pdf_info.author is not None and pdf_info.author.strip() != '':
                val = pdf_info.author
        except Exception as e:
            logger.error(e, exc_info=True)
        author.authorName = val
        self.book.authors = [author]
    finally:
        if pdf_stream is not None:
            pdf_stream.close()
def getPdfMetadata(self, path=None):
    """Fill ``self.book`` with metadata read from the PDF at ``path``.

    Print-based variant that relies on the Python 2 ``unicode`` builtin for
    the subject/tag step. Every step is best-effort: a failure is reported
    and the remaining fields are still attempted. The attributes written are
    ``numberOfPages``, ``tag``, ``bookName``, ``publisher``, ``createdOn``,
    ``publishedOn``, ``fileSize`` and ``authors``.

    Args:
        path: Path of the PDF file. Nothing is done when it is empty/None.

    Returns:
        None. Results are stored on ``self.book``.

    Raises:
        OSError: if ``os.path.getsize(path)`` fails (this step is not guarded).
    """
    print(path)
    if not path:
        return

    pdf_stream = None
    pdf_toread = None
    pdf_info = None
    try:
        # Open the file once and keep it open until every field has been
        # read; it is closed in the ``finally`` clause below.
        try:
            pdf_stream = open(path, "rb")
            pdf_toread = PdfFileReader(pdf_stream)
            print('getPdfMetadata %s' % (pdf_toread.getIsEncrypted(),))
            if pdf_toread.isEncrypted:
                try:
                    pdf_toread.decrypt('')
                except Exception:
                    traceback.print_exc()
        except Exception:
            # Report the failure instead of hiding it; later steps still run.
            traceback.print_exc()

        try:
            pdf_info = pdf_toread.getDocumentInfo()
            page_count = pdf_toread.getNumPages()
            print('Pages: %s' % (page_count,))
            self.book.numberOfPages = page_count

            subject = pdf_info.subject
            # Skip a missing subject: converting None produced the tag 'None'.
            if subject:
                if isinstance(subject, str):
                    # Ignore errors even if the byte string is not proper
                    # UTF-8 or has broken marker bytes.
                    value = unicode(subject, "utf-8", errors="ignore")
                else:
                    # Assume the value object converts to unicode by itself.
                    value = unicode(subject)
                    print('else')
                if not self.book.tag:
                    self.book.tag = value
                else:
                    self.book.tag = self.book.tag + '' + value
        except Exception:
            traceback.print_exc()

        try:
            if pdf_info.title is not None and pdf_info.title.strip() != '':
                self.book.bookName = str(pdf_info.title)
        except Exception:
            # Kept as two statements: print_exc() returns None, which the
            # former one-line print emitted after the message.
            print('unable to set bookName')
            traceback.print_exc()

        try:
            if pdf_info.creator:
                self.book.publisher = str(pdf_info.creator.encode('utf-8'))
        except Exception:
            traceback.print_exc()

        self.book.createdOn = datetime.now()

        try:
            # Characters 2..9 are parsed as YYYYMMDD (skipping a two-character
            # prefix, presumably "D:").
            date_text = str(pdf_info['/CreationDate'])[2:10]
            print(date_text)
            self.book.publishedOn = datetime.strptime(date_text, '%Y%m%d')
        except Exception:
            print('CreationDate not found')

        print(path)
        size_text = Util().convert_bytes(os.path.getsize(path))
        print(size_text)
        self.book.fileSize = size_text

        author = Author()
        val = 'Unknown'
        try:
            if pdf_info.author is not None and pdf_info.author.strip() != '':
                val = pdf_info.author
                val = val.encode("utf8", "ignore")
        except Exception:
            traceback.print_exc()
        author.authorName = val
        self.book.authors = [author]
    finally:
        if pdf_stream is not None:
            pdf_stream.close()
def main(argv=None):
    """Command-line entry point: swap one colour for another in a PDF.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads ``sys.argv[1:]``, as before.

    Returns:
        0 after the output file has been written.

    Raises:
        SystemExit: raised by argparse for invalid arguments, and by
            ``parser.error`` when a requested page index is too large.
    """
    parser = argparse.ArgumentParser(description="Swaps colors of pdf file.")
    parser.add_argument("to_color",
                        help="hex string of color that will replace")
    parser.add_argument("input", help="path to input pdf file", type=str)
    # Optional arguments. Single-value options deliberately omit ``nargs=1``:
    # it wraps the value in a one-item list, which was then handed to
    # decrypt(), hexStringToRGB() and os.path.join() as if it were a string.
    parser.add_argument("-p", help="page numbers", nargs="*", type=int,
                        metavar="pageno", dest="pages")
    parser.add_argument("-P", help="password to pdf input file",
                        metavar="password", dest="password", type=str)
    parser.add_argument("-c", help="color to be swapped(default black)",
                        default="#000000", metavar="from_color",
                        dest="from_color")
    parser.add_argument("-o", help="filename of output pdf",
                        default="output.pdf", metavar="filename",
                        dest="output")
    parser.add_argument(
        "-O",
        help="save directory for output file(default current directory)",
        metavar="directory", dest="outputDir")
    parser.add_argument("-d", help="debugging mode", action="store_true",
                        dest="debug_mode")
    args = parser.parse_args(argv)

    # Input path management: use the path as given when it exists, otherwise
    # look for a file of the same name in the current directory.
    currentPath = os.getcwd()
    if os.path.exists(args.input):
        filepath = args.input
    else:
        input_filename = os.path.split(args.input)[1]
        filepath = os.path.join(currentPath, input_filename)

    # Open the file with the reader.
    reader = PdfFileReader(filepath)
    if args.password is not None:
        reader.decrypt(args.password)
    colorWriter = PdfColorConverter(debug=args.debug_mode)
    colorWriter.appendPagesFromReader(reader)

    # Extract the two colours.
    from_color = RGBColor(*hexStringToRGB(args.from_color))
    to_color = RGBColor(*hexStringToRGB(args.to_color))

    # Perform the colour swaps.
    if args.pages is not None:
        for page in args.pages:
            if page > (colorWriter.getNumPages() - 1):
                # parser.error() exits the program, so nothing follows it.
                parser.error("page index to high: %d" % page)
            colorWriter.swapColor(page, from_color, to_color)
    else:
        for page in range(colorWriter.getNumPages()):
            colorWriter.swapColor(page, from_color, to_color)

    # Save the output PDF; the context manager closes (and flushes) it.
    output_dir = currentPath if args.outputDir is None else args.outputDir
    path = os.path.join(output_dir, args.output)
    with open(path, "wb") as outputStream:
        colorWriter.write(outputStream)
    return 0