def pdf(self):
    """Return a ``PdfFileReader`` for ``self.data``, or ``None`` if unreadable.

    The raw data is parsed first. If that raises, ``self._fixPdf`` is asked
    to repair the data and parsing is retried once. An encrypted document is
    decrypted with the empty password on a best-effort basis; failures are
    only logged. When no reader could be created,
    ``remove_image_previews(self.context)`` is called before returning.
    """
    pdf = None
    try:
        pdf = PdfFileReader(StringIO(self.data))
    except Exception:
        # ``Logger.warn`` is a deprecated alias; ``warning`` is the
        # supported spelling.
        logger.warning('Error opening pdf file, trying to fix it...')
        fixed_data = self._fixPdf(self.data)

        # try to reopen the pdf file again
        try:
            pdf = PdfFileReader(StringIO(fixed_data))
        except Exception:
            logger.warning('This pdf file cannot be fixed.')

    if pdf and pdf.isEncrypted:
        try:
            decrypt = pdf.decrypt('')
            # 0 is the value the original code treats as "password did not match".
            if decrypt == 0:
                logger.warning('This pdf is password protected.')
        except Exception:
            logger.warning('Errors while decrypting the pdf file.')

    if pdf is None:
        remove_image_previews(self.context)

    return pdf
def read_or_save():
    """Save a decrypted copy of the selected PDF, or print its text.

    Reads the mode, input path, output path and password from the
    module-level widgets. In "Save" mode every page is copied into a new
    file; in any other mode the extracted text of each page is printed.
    """
    switch = read_save_dropdown.get()
    pdf_f = read_pdf_input.get()
    save_f = save_to_file_input.get()
    cracked_p = read_pass_input.get()
    canvas.update()

    if switch == "Save":
        start_read_button.config(text=f"{switch}ing file!")
        with open(pdf_f, 'rb') as input_file, open(save_f, 'wb') as output_file:
            reader = PdfFileReader(input_file)
            # decrypt() raises on a document that is not encrypted, so only
            # call it when there is something to decrypt.
            if reader.isEncrypted:
                reader.decrypt(cracked_p)
            writer = PdfFileWriter()
            for i in range(reader.getNumPages()):
                writer.addPage(reader.getPage(i))
            writer.write(output_file)
    else:
        start_read_button.config(text=f"{switch}ing pdf. Check Terminal!")
        with open(pdf_f, 'rb') as input_file:
            reader = PdfFileReader(input_file)
            if reader.isEncrypted:
                reader.decrypt(cracked_p)
            for i in range(reader.getNumPages()):
                page = reader.getPage(i)
                page_content = page.extractText()
                print(
                    f"-----------------------------------\nOUTPUT:\n\n{page_content}"
                    f"\n-----------------------------------")
def merge_pdf(filepath, outfilename='.merge.pdf'):
    """Merge all PDF files found for ``filepath`` into a single PDF.

    The files to merge come from ``get_filename(filepath, filetypes=['.pdf'])``.
    The output is written beside ``filepath``: its name is the base name of
    ``filepath`` followed by ``outfilename``.

    :param filepath: location handed to ``get_filename`` to list the PDFs
    :param outfilename: suffix appended to the base name of ``filepath``
    :return: full path of the merged output file
    """
    filepath_2nd = os.path.dirname(filepath)
    filename = os.path.basename(filepath) + outfilename
    outfile = os.path.join(filepath_2nd, filename)

    pdfwriter = PdfFileWriter()
    outputPages = 0
    pdf_files = get_filename(filepath, filetypes=['.pdf'])

    # The writer pulls page data from the sources when it writes, so the
    # inputs stay open until the output exists and are then always closed.
    opened = []
    try:
        for _file in pdf_files:
            stream = open(_file, 'rb')
            opened.append(stream)
            pdreader = PdfFileReader(stream)
            if pdreader.isEncrypted:
                # An encrypted PDF must be decrypted before its pages can be read.
                pdreader.decrypt("map")
            pageCount = pdreader.getNumPages()  # total pages in this source PDF
            outputPages += pageCount
            print(_file, pageCount)
            # Add each page to the output.
            for iPage in range(0, pageCount):
                pdfwriter.addPage(pdreader.getPage(iPage))

        print("All Pages Number:" + str(outputPages))
        with open(outfile, "wb") as f:
            pdfwriter.write(f)
    finally:
        for stream in opened:
            stream.close()

    print('outfile=%s' % outfile)
    return outfile
def _save_pages(self, filepath, pages, temp):
    """Save each requested page of a PDF as its own file in ``temp``.

    Every page is written to ``<temp>/page-<n>.pdf``. An encrypted source is
    decrypted with ``self.password`` first.

    Parameters
    ----------
    filepath : str
        Path of the PDF file; it is opened with ``open(filepath, "rb")``.
    pages : iterable of int
        1-based page numbers (``page - 1`` is passed to ``getPage``).
    temp : str
        Directory the single-page files are written to.

    """
    with open(filepath, "rb") as fileobj:
        infile_original = PdfFileReader(fileobj, strict=False)
        if infile_original.isEncrypted:
            infile_original.decrypt(self.password)
        for page in pages:
            # Ensure PdfFileReader object is unmodified
            infile = copy.copy(infile_original)
            fpath = os.path.join(temp, 'page-{0}.pdf'.format(page))
            froot, fext = os.path.splitext(fpath)
            p = infile.getPage(page - 1)
            outfile = PdfFileWriter()
            outfile.addPage(p)
            with open(fpath, 'wb') as f:
                outfile.write(f)
            # Orient rotated pages correctly
            # NOTE(review): the rotation handling announced above is not
            # visible in this excerpt; ``froot``/``fext`` are presumably
            # consumed by it -- confirm against the full source.
def _parsepdf(self, pdf, password='', **kwargs):
    """Parse the XMP metadata of ``pdf`` and return it as a plain mapping.

    :param pdf: whatever ``PdfFileReader`` accepts; it is also passed to
        ``self._guessLanguage``
    :param password: used to decrypt the document when non-empty
    :return: dict of metadata; ``language`` falls back to the guessed
        language and ``subject_keywords`` mirrors ``keywords`` as a list
    """
    # (XMP attribute, result key) pairs, applied in order. Order matters:
    # several attributes feed the same key and the last non-empty one wins
    # (e.g. ``uuid``: instanceId over documentId over dc_identifier).
    xmp_fields = (
        ('pdf_keywords', 'keywords'),
        ('dc_language', 'language'),
        ('dc_identifier', 'uuid'),
        ('xmpmm_documentId', 'uuid'),
        ('xmpmm_instanceId', 'uuid'),
        ('xmp_createDate', 'creationdate'),
        ('xmp_modifyDate', 'modificationdate'),
        ('xmp_metadataDate', 'metadatadate'),
        ('dc_rights', 'rights webstatement'),
        ('pdf_producer', 'producer'),
        ('xmp_creatorTool', 'creatortool'),
        ('dc_title', 'title'),
        ('dc_description', 'description'),
        ('dc_rights', 'rights'),
        ('dc_format', 'format'),
        ('dc_creator', 'creator'),
    )

    # This will store the parsed metadata
    META_MAP = {}

    opdf = PdfFileReader(pdf)
    if password != "":
        opdf.decrypt(password)

    metadata = opdf.getXmpMetadata()
    for attribute, key in xmp_fields:
        # getattr with a default also covers ``metadata`` being None.
        value = getattr(metadata, attribute, None)
        if value:
            META_MAP[key] = value

    custom_properties = getattr(metadata, 'custom_properties', None)
    if custom_properties:
        META_MAP.update(custom_properties)

    guessed_language = self._guessLanguage(pdf)
    # ``in`` replaces dict.has_key, which no longer exists on Python 3.
    if guessed_language and 'language' not in META_MAP:
        META_MAP['language'] = guessed_language

    # Finally we'll do some plone specific rewritings
    # It would be smart to hook some kind of adapter
    # here so that one can define his own rewritings
    if 'keywords' in META_MAP:
        META_MAP['subject_keywords'] = list(META_MAP['keywords'])

    return META_MAP
def convert_coords(pdf_name: str, detected_obj: list):
    """Translate image-space box coordinates into a camelot area string.

    The y values are flipped against ``IMG_HEIGHT`` and all four values are
    scaled by the ratio of the first page's height to ``IMG_HEIGHT``.

    :param pdf_name: path of the PDF whose first page supplies the height
    :param detected_obj: sequence whose first four items are x1, y1, x2, y2
    :return: the four scaled floats joined as ``"x1, y1, x2, y2"``
    """
    with open(pdf_name, 'rb') as pdf_stream:
        reader = PdfFileReader(pdf_stream)
        if reader.isEncrypted:
            reader.decrypt('')
        # Fourth mediaBox entry, which the original code uses as page height.
        pdf_height = reader.getPage(0).mediaBox[3]
        pdf_img_ratio = pdf_height / IMG_HEIGHT

    # Taking out coords, and translating them for camelot
    coords = [int(coord) for coord in detected_obj]
    coords_img = [
        coords[0],
        int(IMG_HEIGHT - coords[1]),
        coords[2],
        int(IMG_HEIGHT - coords[3]),
    ]
    coords_pdf = [float(pdf_img_ratio * x) for x in coords_img]
    # str() of the list minus its brackets gives the comma-separated form.
    return str(coords_pdf)[1:-1]
def deletePDF(input_dirPath, output_dirPath, delete_page):
    """Write a copy of a PDF with the specified pages removed.

    :param input_dirPath: path of the source PDF
    :param output_dirPath: path the reduced PDF is written to
    :param delete_page: page specification passed to ``getDeleteInterval``;
        the 1-based page numbers it returns are left out of the copy
    """
    output = PdfFileWriter()
    deleteInterval = getDeleteInterval(delete_page)
    print(deleteInterval)

    # Read the source PDF; it stays open until the output has been written
    # because the writer still needs the page data.
    with open(input_dirPath, "rb") as source_stream:
        reader = PdfFileReader(source_stream)

        # An encrypted PDF must be decrypted before its pages can be read.
        if reader.isEncrypted:
            reader.decrypt("map")

        # Total number of pages in the source PDF.
        pageCount = reader.getNumPages()
        outputPages = pageCount - len(deleteInterval)
        print(pageCount)

        # Add every page that is not marked for deletion to the output.
        for iPage in range(1, pageCount + 1):
            if iPage not in deleteInterval:
                output.addPage(reader.getPage(iPage - 1))

        print("All Pages Number:" + str(outputPages))

        # Finally write the PDF file.
        with open(output_dirPath, "wb") as outputStream:
            output.write(outputStream)

    print("finished")
def MergePDF(filepath, outfile):
    """Merge the PDFs listed by ``getFileName(filepath)`` into one file.

    A bookmark pointing at the first page of each source is added. The
    result is written to ``filepath + outfile`` (plain string concatenation).

    :param filepath: location passed to ``getFileName``; also the output prefix
    :param outfile: output file name appended to ``filepath``
    """
    output = PdfFileWriter()
    outputPages = 0
    pdf_fileName = getFileName(filepath)

    # Sources stay open until the merged file is written, then are closed.
    opened = []
    try:
        for each_file in pdf_fileName:
            print("adding %s" % each_file)
            # Read the source PDF.
            stream = open(each_file, "rb")
            opened.append(stream)
            reader = PdfFileReader(stream)

            # An encrypted PDF must be decrypted before its pages can be read.
            if reader.isEncrypted:
                reader.decrypt("map")

            # Total number of pages in this source PDF.
            pageCount = reader.getNumPages()
            outputPages += pageCount
            print("%s has %d pages" % (each_file, pageCount))

            # Add each page to the output.
            for iPage in range(pageCount):
                output.addPage(reader.getPage(iPage))

            # Add a bookmark at the first page of this source.
            # NOTE(review): [:-3] keeps the dot of a ".pdf" suffix in the
            # title; [:-4] may have been intended -- confirm.
            output.addBookmark(title=each_file[:-3],
                               pagenum=outputPages - pageCount)

        print("All Pages Number: " + str(outputPages))

        # Finally write the PDF file.
        with open(filepath + outfile, "wb") as outputStream:
            output.write(outputStream)
    finally:
        for stream in opened:
            stream.close()

    print("finished")
def merge_pdf(path: str, output_filename: str, bookmark_separator: str = "",
              bookmark_start_index: int = 1, password: str = "") -> None:
    """Merge all PDFs returned by ``get_pdf_names(path)`` into one file.

    :param str path: folder path
    :param str output_filename: output file name (including its path)
    :param str bookmark_separator: prefix for each per-file bookmark; when
        empty, the bookmark is named after the source file instead
    :param int bookmark_start_index: first number used as bookmark suffix
    :param str password: password for encrypted source PDFs
    """
    if os.path.exists(output_filename):
        os.remove(output_filename)
    os.chmod(path, stat.S_IRWXU)  # ensure we have permission

    output_pdf = PdfFileMerger()
    output_page_num = 0
    for index, pdf_path_with_name in enumerate(get_pdf_names(path),
                                               bookmark_start_index):
        print(pdf_path_with_name)
        with open(pdf_path_with_name, "rb") as pdf:
            content = PdfFileReader(pdf)
            if content.isEncrypted:
                content.decrypt(password)

            # Add a bookmark at the first page of each merged pdf.
            if bookmark_separator:
                title = bookmark_separator + str(index)
            else:
                # basename() drops native directories; the extra split keeps
                # the original handling of backslash-separated paths.
                base = os.path.basename(pdf_path_with_name).split("\\")[-1]
                title = base.split(".")[0]
            output_pdf.addBookmark(title, output_page_num)

            output_pdf.append(content)
            output_page_num += content.numPages

    # Binary output needs no codec layer, so the builtin open is enough.
    with open(output_filename, "wb") as f:
        output_pdf.write(f)
    print("mission complete")
def MergePDF(filepath, outfile):
    """
    Merge the PDF files of a folder into a single file.

    :param filepath: location passed to ``getFileName(filepath, '.pdf')``
    :param outfile: path of the merged output file
    :return: None
    """
    output = PdfFileWriter()
    outputPages = 0
    pdf_fileName = getFileName(filepath, '.pdf')

    # Sources stay open until the merged file is written, then are closed.
    opened = []
    try:
        for each in pdf_fileName:
            stream = open(each, 'rb')
            opened.append(stream)
            reader = PdfFileReader(stream)
            if reader.isEncrypted:
                reader.decrypt('map')
            pageCount = reader.getNumPages()
            outputPages += pageCount
            for iPage in range(0, pageCount):
                output.addPage(reader.getPage(iPage))

        with open(outfile, 'wb') as outputStream:
            output.write(outputStream)
    finally:
        for stream in opened:
            stream.close()

    print('save:' + outfile + ' finished!')
def add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out):
    """Stamp the first page of a watermark PDF onto every page of a PDF.

    :param pdf_file_in: path of the PDF to watermark
    :param pdf_file_mark: path of the PDF whose first page is the watermark
    :param pdf_file_out: path the watermarked PDF is written to
    :return: ``False`` when decrypting with the empty password raises;
        otherwise ``None``
    """
    pdf_output = PdfFileWriter()

    # Context managers close every stream on all exits, including the early
    # ``return False`` below.
    with open(pdf_file_in, 'rb') as input_stream:
        pdf_input = PdfFileReader(input_stream)

        # The PDF file is encrypted.
        if pdf_input.getIsEncrypted():
            print('该PDF文件被加密了.')
            # Try to decrypt with the empty password.
            try:
                pdf_input.decrypt('')
            except Exception:
                print('尝试用空密码解密失败.')
                return False
            else:
                print('用空密码解密成功.')

        # Number of pages in the input PDF.
        page_num = pdf_input.getNumPages()

        # Read the watermark PDF.
        with open(pdf_file_mark, 'rb') as pdf_watermark_input_stream:
            pdf_watermark = PdfFileReader(pdf_watermark_input_stream)

            # Stamp every page.
            for i in range(page_num):
                page = pdf_input.getPage(i)
                page.mergePage(pdf_watermark.getPage(0))
                page.compressContentStreams()  # compress the content
                pdf_output.addPage(page)

            with open(pdf_file_out, "wb") as output_stream:
                pdf_output.write(output_stream)
def add_watermark(pdf_file_mark, pdf_file_in, pdf_file_out):
    """Overlay the first page of ``pdf_file_mark`` on each page of ``pdf_file_in``.

    The result is written to ``pdf_file_out``. Returns ``False`` if an
    encrypted input cannot be opened with the empty password; otherwise
    returns ``None``.
    """
    with open(pdf_file_in, 'rb') as source_stream:
        source = PdfFileReader(source_stream)

        # The input PDF is encrypted: try the empty password.
        if source.getIsEncrypted():
            print('该PDF文件被加密了.')
            try:
                source.decrypt('')
            except Exception:
                print('尝试用空密码解密失败.')
                return False
            print('用空密码解密成功.')

        # Number of pages to stamp.
        total_pages = source.getNumPages()

        with open(pdf_file_mark, 'rb') as mark_stream:
            stamped = PdfFileWriter()
            # Reader for the watermark PDF.
            mark = PdfFileReader(mark_stream)

            for page_index in range(total_pages):
                current = source.getPage(page_index)
                current.mergePage(mark.getPage(0))
                current.compressContentStreams()  # compress the content
                stamped.addPage(current)

            with open(pdf_file_out, 'wb') as target_stream:
                stamped.write(target_stream)
def MergePDF(filepath, outfile):
    """Merge the PDFs listed by ``getFileName(filepath)`` into one file.

    The result is written to ``filepath + outfile`` (plain string
    concatenation).

    :param filepath: location passed to ``getFileName``; also the output prefix
    :param outfile: output file name appended to ``filepath``
    """
    output = PdfFileWriter()
    outputPages = 0
    pdf_fileName = getFileName(filepath)

    for each in pdf_fileName:
        print(each)

        # Read the source PDF (the reader is given the path itself).
        reader = PdfFileReader(each)

        # An encrypted PDF must be decrypted before its pages can be read.
        if reader.isEncrypted:
            reader.decrypt("map")

        # Total number of pages in this source PDF.
        pageCount = reader.getNumPages()
        outputPages += pageCount
        print(pageCount)

        # Add each page to the output.
        for iPage in range(0, pageCount):
            output.addPage(reader.getPage(iPage))

    print("All Pages Number:" + str(outputPages))

    # Finally write the PDF file. ``open`` replaces the Python-2-only
    # ``file`` builtin and the context manager closes it even on error.
    with open(filepath + outfile, "wb") as outputStream:
        output.write(outputStream)

    print("finished")
def convert2img(self, path_to_save_img):
    """Convert ``self.filepath`` to images under ``path_to_save_img``.

    An unencrypted PDF is passed to ``convrt_img`` directly. An encrypted one
    is first decrypted with ``self.password`` into a temporary copy inside
    ``<path_to_save_img>/<file name>/``; that copy is converted and removed.

    :param path_to_save_img: output directory, created when missing
    """
    file_name = get_file_name_without_extension(self.filepath)
    # Creates the parent directory as well; exist_ok avoids the
    # check-then-create race.
    os.makedirs(f'{path_to_save_img}/{file_name}', exist_ok=True)

    with open(self.filepath, 'rb') as file:
        # initialize the PDF reader object
        reader = PdfFileReader(file)
        if not reader.isEncrypted:
            convrt_img(self.filepath, path_to_save_img)
            return

        reader.decrypt(self.password)
        base_file_name = get_file_name(self.filepath)
        temp_file_loc = os.path.join(path_to_save_img, file_name,
                                     base_file_name)
        try:
            with open(temp_file_loc, 'wb') as pdf_file:
                writer = PdfFileWriter()
                for page in range(reader.getNumPages()):
                    writer.addPage(reader.getPage(page))
                writer.write(pdf_file)
            convrt_img(temp_file_loc, path_to_save_img)
        finally:
            # Never leave the decrypted copy behind, even if conversion fails.
            if os.path.exists(temp_file_loc):
                os.remove(temp_file_loc)
def parse_pdf_pypdf2(self, f, fpath):
    """Extract text and IOCs from a PDF, page by page, using PyPDF2.

    :param f: whatever ``PdfFileReader`` accepts as the PDF source
    :param fpath: label passed to ``parse_page`` and the output handler
    :return: ``(text, iocs)`` -- the concatenated page text and the list of
        everything ``self.parse_page`` returned. Errors other than
        KeyboardInterrupt/SystemExit are reported through
        ``self.handler.print_error`` and whatever was collected is returned.
    """
    text = ""
    # Must start as a list: the previous ``None`` made the first
    # ``iocs.extend`` raise, so every PDF ended in the error handler.
    iocs = []

    try:
        pdf = PdfFileReader(f, strict=False)
        if pdf.isEncrypted:
            pdf.decrypt('')

        if self.dedup:
            self.dedup_store = set()

        self.handler.print_header(fpath)
        for page_num, page in enumerate(pdf.pages, 1):
            data = page.extractText()

            # Parse IOCs
            temp_iocs = self.parse_page(fpath, data, page_num)

            # Add IOCs to collection (a falsy result adds nothing).
            if temp_iocs:
                iocs.extend(temp_iocs)

            # Add new page
            text += data
        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)

    return text, iocs
def MergePDF(filepath, outfile):
    """Merge the PDFs listed by ``getFileName(filepath)``, in sorted order.

    The result is written to ``filepath + outfile`` (plain string
    concatenation).

    :param filepath: location passed to ``getFileName``; also the output prefix
    :param outfile: output file name appended to ``filepath``
    """
    output = PdfFileWriter()
    outputPages = 0
    pdf_fileName = sorted(getFileName(filepath))

    # Sources stay open until the merged file is written, then are closed.
    opened = []
    try:
        for each in pdf_fileName:
            stream = open(each, "rb")
            opened.append(stream)
            reader = PdfFileReader(stream, strict=False)
            if reader.isEncrypted:
                reader.decrypt("map")

            # Total number of pages in this source PDF.
            pageCount = reader.getNumPages()
            outputPages += pageCount
            print(pageCount)

            # Add each page to the output.
            for iPage in range(0, pageCount):
                output.addPage(reader.getPage(iPage))

        print("All Pages Number:" + str(outputPages))

        # Finally write the PDF file.
        with open(filepath + outfile, "wb") as outputStream:
            output.write(outputStream)
    finally:
        for stream in opened:
            stream.close()

    print("finished")
def downloadpdf(url):
    """Download ``url`` and return it as a ``PdfFileReader``.

    :param url: address of the document to fetch
    :return: the reader on success; ``None`` when the response is declared
        as exactly ``text/html``; ``2`` when the body cannot be parsed even
        after stripping NUL bytes; or whatever ``Scratcher.handlepdf``
        returns when decrypting with the empty password raises
    Exits the process (``sys.exit``) on a connection error.
    """
    try:
        # NOTE(review): certificate verification is disabled here.
        request = requests.get(url, verify=False)
        if request.headers.get('Content-Type') == 'text/html':
            return None
    except requests.exceptions.ConnectionError:
        sys.exit(
            "\nThere was an error when trying to connect to the domain. Please confirm if the domain is "
            "correctly written.\n")

    try:
        objbyte = BytesIO(request.content)
    except Exception as e:
        Scratcher.log(url, e)
        sys.exit(
            "\nThere was an error when trying to convert the content of the response.Please verify the logs to"
            " see the raised error.\n")

    try:
        pdf = PdfFileReader(objbyte)
    except utils.PdfReadError as e:
        Scratcher.log(url, e)
        # Retry once with leading/trailing NUL bytes removed.
        obje = BytesIO(request.content.strip(b'\x00'))
        try:
            pdf = PdfFileReader(obje)
        except utils.PdfReadError:
            return 2

    if pdf.getIsEncrypted() is True:
        try:
            pdf.decrypt('')
        except Exception:
            # Narrowed from a bare ``except`` so Ctrl-C and SystemExit are
            # no longer swallowed.
            pdf = Scratcher.handlepdf(request.content)

    return pdf
def parse_pdf_pypdf2(self, f, fpath):
    """Extract text and IOCs from a PDF, page by page, using PyPDF2.

    :param f: whatever ``PdfFileReader`` accepts as the PDF source
    :param fpath: label passed to ``parse_page`` and the output handler
    :return: ``(text, iocs)`` -- the concatenated page text and the list of
        everything ``self.parse_page`` returned. Errors other than
        KeyboardInterrupt/SystemExit are reported through
        ``self.handler.print_error`` and whatever was collected is returned.
    """
    text = ""
    # Must start as a list: the previous ``None`` made the first
    # ``iocs.extend`` raise, so every PDF ended in the error handler.
    iocs = []

    try:
        pdf = PdfFileReader(f, strict=False)
        if pdf.isEncrypted:
            pdf.decrypt('')

        if self.dedup:
            self.dedup_store = set()

        self.handler.print_header(fpath)
        for page_num, page in enumerate(pdf.pages, 1):
            data = page.extractText()

            # Parse IOCs
            temp_iocs = self.parse_page(fpath, data, page_num)

            # Add IOCs to collection (a falsy result adds nothing).
            if temp_iocs:
                iocs.extend(temp_iocs)

            # Add new page
            text += data
        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)

    return text, iocs
def save_page(filepath, page_number):
    """Save one page of a PDF beside it as ``page-<n>.pdf``, upright.

    The page is extracted first. If ``get_rotation`` reports a rotation for
    its text, the extracted file is renamed to ``p-<n>_rotated.pdf`` and a
    copy rotated by 90 degrees the other way is written to ``page-<n>.pdf``.

    :param filepath: path of the source PDF
    :param page_number: 1-based number of the page to extract
    """
    outpath = os.path.join(os.path.dirname(filepath),
                           'page-{}.pdf'.format(page_number))

    # The source must stay open until the single page has been written.
    with open(filepath, 'rb') as source:
        infile = PdfFileReader(source, strict=False)
        page = infile.getPage(page_number - 1)
        outfile = PdfFileWriter()
        outfile.addPage(page)
        with open(outpath, 'wb') as f:
            outfile.write(f)

    froot, fext = os.path.splitext(outpath)
    layout, __ = get_page_layout(outpath)

    # fix rotated PDF
    chars = get_text_objects(layout, ltype="char")
    horizontal_text = get_text_objects(layout, ltype="horizontal_text")
    vertical_text = get_text_objects(layout, ltype="vertical_text")
    rotation = get_rotation(chars, horizontal_text, vertical_text)

    if rotation != '':
        outpath_new = ''.join([froot.replace('page', 'p'), '_rotated', fext])
        os.rename(outpath, outpath_new)
        with open(outpath_new, 'rb') as rotated_source:
            infile = PdfFileReader(rotated_source, strict=False)
            if infile.isEncrypted:
                infile.decrypt('')
            outfile = PdfFileWriter()
            p = infile.getPage(0)
            # Rotate against the detected direction to make the page upright.
            if rotation == 'anticlockwise':
                p.rotateClockwise(90)
            elif rotation == 'clockwise':
                p.rotateCounterClockwise(90)
            outfile.addPage(p)
            with open(outpath, 'wb') as f:
                outfile.write(f)
def count_pages(filename: str) -> int:
    """Return the number of pages of the PDF stored at ``filename``.

    An encrypted document is decrypted with the empty password first.
    """
    with open(filename, "rb") as stream:
        reader = PdfFileReader(stream)
        if reader.isEncrypted:
            reader.decrypt("")
        return reader.getNumPages()
def get_edition2(i):
    """Scan the second and third page of a PDF for publication details.

    Updates ``edition_dic`` (not defined in this function; presumably a
    module-level dict -- confirm) with ``Creation_Year``, ``Edition`` and
    ``ISBN`` whenever the matching pattern is found, and returns it.

    :param i: whatever ``PdfFileReader`` accepts as the PDF source
    :return: the updated ``edition_dic``
    """
    pdf = PdfFileReader(i)
    if pdf.isEncrypted:
        pdf.decrypt('')

    page_total = pdf.getNumPages()
    for j in (1, 2):
        # Short documents simply have fewer pages to scan; previously this
        # raised IndexError.
        if j >= page_total:
            break
        text = pdf.getPage(j).extractText()

        year = re.search(r'© \d\d\d\d', text)
        if year:
            # Drop the two-character "© " prefix, keep the four digits.
            edition_dic.update({'Creation_Year': year.group()[2:6]})

        edition = re.search(r'\d*\w\w edition', text)
        if edition:
            edition_dic.update({'Edition': edition.group()})

        match = re.search(r'ISBN [\d*-]*\d*', text)
        if match:
            # Drop the five-character "ISBN " prefix.
            edition_dic.update({'ISBN': match.group()[5:]})

    return edition_dic
def split_pdf_pages(pdf_input_path: str, output_folder: str, max_pages=None):
    """
    Split pdf into individual pages and save to output_folder with name
    filename + _page_x.pdf.

    If max_pages is provided, only take up to the amount provided. Pages
    will be selected at random.

    :param pdf_input_path: path of the PDF to split
    :param output_folder: directory the single-page files are written to
    :param max_pages: optional upper bound on the number of pages written
    """
    # create staging file for filename issues; the context manager closes
    # (and thereby discards) it even when splitting fails part-way.
    with tempfile.TemporaryFile() as staging_file_handle:
        with open(pdf_input_path, "rb") as f:
            shutil.copyfileobj(f, staging_file_handle)
        staging_file_handle.seek(0)

        pdf = PdfFileReader(staging_file_handle, strict=False)
        if pdf.isEncrypted:
            pdf.decrypt("")

        page_numbers = range(pdf.numPages)
        # Take random page numbers if max_pages is provided
        if max_pages and pdf.numPages > max_pages:
            page_numbers = random.sample(page_numbers, max_pages)

        for page_num in page_numbers:
            out_pdf = PdfFileWriter()
            pdf_page_filepath = pageFilename(pdf_input_path, page_num)
            pdf_filename = os.path.basename(pdf_page_filepath)
            output_filepath = os.path.join(output_folder, pdf_filename)
            out_pdf.addPage(pdf.getPage(page_num))
            with open(output_filepath, "wb") as f:
                out_pdf.write(f)
def _parse(self, pdf, password='', **kwargs):
    """Parse the given pdf file and return a mapping of attributes.

    Combines the result of ``self._parsepdf`` with the document info
    dictionary (keys stripped of ``/`` and lower-cased); document info wins
    on key clashes except for ``title``, see below.

    :param pdf: whatever ``PdfFileReader`` and ``self._parsepdf`` accept
    :param password: used to decrypt the document when non-empty
    :return: the result of ``self._fix_metadata`` on the merged mapping
    """
    metadata = self._parsepdf(pdf, password) or {}

    opdf = PdfFileReader(pdf)
    if password:
        opdf.decrypt(password)

    # A document without an info dictionary yields None; treat it as empty
    # instead of failing on ``.items()``.
    info = opdf.getDocumentInfo() or {}
    new_metadata = {key.strip('/').lower(): val for key, val in info.items()}

    # #116365 use title from pdf parsing instead of the data coming
    # from pypdf2 getDocumentInfo method as that method will wrongly
    # encode an mdash found in the title
    parsed_title = metadata.get('title', {}).get('x-default')
    if parsed_title:
        new_metadata['title'] = parsed_title

    metadata.update(new_metadata)

    # Fix some metadata
    metadata = self._fix_metadata(metadata)

    return metadata
def downloadpdf(url):
    """Download ``url`` and return it as a ``PdfFileReader``.

    :param url: address of the document to fetch
    :return: the reader on success; ``None`` when the response is declared
        as exactly ``text/html`` or the body cannot be wrapped/parsed; or
        whatever ``Scratcher.handlepdf`` returns when decrypting with the
        empty password raises
    Exits the process (``sys.exit``) on a connection error.
    """
    try:
        # NOTE(review): certificate verification is disabled here.
        request = requests.get(url, verify=False)
        # .get(): a response without a Content-Type header is not an error.
        if request.headers.get('Content-Type') == 'text/html':
            return None
    except requests.exceptions.ConnectionError:
        sys.exit(
            "\nThere was an error when trying to connect to the domain. Please confirm if the domain is correctly written.\n"
        )

    try:
        objbyte = BytesIO(request.content)
    except Exception as e:
        print(e)
        return None

    try:
        # Silence anything printed while parsing. The ``finally`` guarantees
        # stdout is restored even when parsing fails; before, a failure left
        # stdout redirected for the rest of the process.
        s_stdout = sys.stdout
        sys.stdout = BytesIO()
        try:
            pdf = PdfFileReader(objbyte)
        finally:
            sys.stdout = s_stdout
    except Exception as e:
        print(e)
        return None

    if pdf.getIsEncrypted() is True:
        try:
            pdf.decrypt('')
        except Exception:
            pdf = Scratcher.handlepdf(request.content)

    return pdf
def save_page(filepath, page_number):
    """Save one page of a PDF beside it as ``page-<n>.pdf``, upright.

    The page is extracted first. If ``get_rotation`` reports a rotation for
    its text, the extracted file is renamed to ``p-<n>_rotated.pdf`` and a
    copy rotated by 90 degrees the other way is written to ``page-<n>.pdf``.

    :param filepath: path of the source PDF
    :param page_number: 1-based number of the page to extract
    """
    outpath = os.path.join(os.path.dirname(filepath), f"page-{page_number}.pdf")

    # The source must stay open until the single page has been written.
    with open(filepath, "rb") as source:
        infile = PdfFileReader(source, strict=False)
        page = infile.getPage(page_number - 1)
        outfile = PdfFileWriter()
        outfile.addPage(page)
        with open(outpath, "wb") as f:
            outfile.write(f)

    froot, fext = os.path.splitext(outpath)
    layout, __ = get_page_layout(outpath)

    # fix rotated PDF
    chars = get_text_objects(layout, ltype="char")
    horizontal_text = get_text_objects(layout, ltype="horizontal_text")
    vertical_text = get_text_objects(layout, ltype="vertical_text")
    rotation = get_rotation(chars, horizontal_text, vertical_text)

    if rotation != "":
        outpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
        os.rename(outpath, outpath_new)
        with open(outpath_new, "rb") as rotated_source:
            infile = PdfFileReader(rotated_source, strict=False)
            if infile.isEncrypted:
                infile.decrypt("")
            outfile = PdfFileWriter()
            p = infile.getPage(0)
            # Rotate against the detected direction to make the page upright.
            if rotation == "anticlockwise":
                p.rotateClockwise(90)
            elif rotation == "clockwise":
                p.rotateCounterClockwise(90)
            outfile.addPage(p)
            with open(outpath, "wb") as f:
                outfile.write(f)
def split_pdf(input_file, output_file, pages):
    """Copy the selected pages of ``input_file`` into ``output_file``.

    :param input_file: path of the source PDF. If PyPDF2 cannot decrypt it
        with the empty password, it is decrypted IN PLACE with ``qpdf``.
    :param output_file: path the selection is written to
    :param pages: iterable of strings, each a 1-based page number (``"3"``)
        or an inclusive range (``"2-5"``)
    """
    import shutil
    import subprocess
    import tempfile

    fp = open(input_file, "rb")
    try:
        inputpdf = PdfFileReader(fp)
        if inputpdf.isEncrypted:
            try:
                inputpdf.decrypt('')
                print('File Decrypted (PyPDF2)')
            except Exception:
                # Release the handle before qpdf overwrites the file.
                fp.close()
                fd, temp_copy = tempfile.mkstemp(suffix=".pdf")
                os.close(fd)
                try:
                    shutil.copyfile(input_file, temp_copy)
                    # Argument list instead of a shell string: the file name
                    # is never interpreted by a shell.
                    subprocess.call(["qpdf", "--password=", "--decrypt",
                                     temp_copy, input_file])
                finally:
                    os.remove(temp_copy)
                print('File Decrypted (qpdf)')
                # PDFs are binary; the old text-mode re-open was wrong.
                fp = open(input_file, "rb")
                inputpdf = PdfFileReader(fp)

        outputpdf = PdfFileWriter()

        newpages = []
        for p in pages:
            if '-' in p:
                num_range = p.split('-')
                newpages.extend(range(int(num_range[0]), int(num_range[1]) + 1))
            else:
                newpages.append(p)
        print(newpages)

        pages_int = [int(x) for x in newpages]
        for i in pages_int:
            outputpdf.addPage(inputpdf.getPage(i - 1))

        with open(output_file, "wb") as outf:
            outputpdf.write(outf)
    finally:
        fp.close()
def _get_pages(self, filepath, pages):
    """Expand a pages string into a sorted list of unique page numbers.

    Accepted forms: ``'1'``, ``'2,5,8-all'``, ``'all'``. The page count is
    read from ``self.filepath`` (the ``filepath`` argument is not used by
    this body), decrypting with ``self.password`` when needed.
    """
    with open(self.filepath, 'rb') as stream:
        pdf_reader = PdfFileReader(stream)
        if pdf_reader.isEncrypted:
            pdf_reader.decrypt(self.password)
        total = pdf_reader.getNumPages()

    # Collect inclusive (first, last) spans.
    if pages == "all":
        spans = [(1, total)]
    else:
        spans = []
        for token in pages.split(','):
            if '-' in token:
                first, last = token.split('-')
                if last == 'all':
                    last = total
                spans.append((int(first), int(last)))
            else:
                spans.append((int(token), int(token)))

    selected = set()
    for first, last in spans:
        selected.update(range(first, last + 1))
    return sorted(selected)
def meta_pdf(url_pdf):
    """Download a PDF and print/log its document-info metadata.

    The file is saved under ``./<result.url_s>/`` and every metadata entry
    is printed and appended to ``metadata_results.txt`` in that directory.
    Failures of either step are reported on stdout and not raised.

    :param url_pdf: URL of the PDF to download
    """
    # Bound up front so the error messages below can always refer to it.
    path = None
    try:
        path = "./%s/%s" % (result.url_s, url_pdf.split("/")[-1])
        headers = {'user-agent': result.user_agentt}
        print(tc.bold_yellow("Download: %s" % url_pdf))
        r = requests.get(url_pdf, headers=headers)
        with open(path, 'wb') as f:
            f.write(r.content)
    except Exception:
        print("%s Can't Download or search for metadata" % url_pdf)

    try:
        print(tc.bold_yellow("File %s metadata:" % path))
        # Context managers close both files even when parsing fails.
        with open("./%s/metadata_results.txt" % result.url_s, "a+") as fl:
            fl.write("---Url: %s --- \n" % url_pdf)
            fl.write("--------> File %s metadata: <------------ \n" % path)
            with open(path, 'rb') as fp:
                pdf = PdfFileReader(fp)
                if pdf.isEncrypted:
                    pdf.decrypt('')
                info = pdf.getDocumentInfo()
                for i in info:
                    print(tc.italic_yellow(i + ": " + info[i]))
                    fl_write_line = "   " + i + ": " + info[i] + "\n"
                    fl.write(fl_write_line.encode('utf8'))
    except Exception:
        print(tc.bold_red("Can't read metadata in: %s" % path))
def decrypt(query, pdfs):
    """Decrypt PDF files.

    Each encrypted file is written, decrypted, next to the original as
    ``<name> (decrypted).pdf``; a notification reports the outcome per file.

    :param query: password to decrypt with
    :param pdfs: iterable of PDF file paths
    """
    try:
        for pdf in pdfs:
            reader = PdfFileReader(pdf, strict=False)

            if not reader.isEncrypted:
                notify.notify('Alfred PDF Tools',
                              'The PDF file is not encrypted.')
                continue

            reader.decrypt(query)
            writer = PdfFileWriter()

            # ``range`` instead of the Python-2-only ``xrange``.
            for i in range(reader.numPages):
                writer.addPage(reader.getPage(i))

            noextpath = os.path.splitext(pdf)[0]
            out_file = "{} (decrypted).pdf".format(noextpath)

            with open(out_file, 'wb') as f:
                writer.write(f)

            notify.notify('Alfred PDF Tools',
                          'Decryption successfully completed.')

    except PdfReadError:
        # Raised while reading pages when the password did not unlock the file.
        notify.notify('Alfred PDF Tools',
                      'The entered password is not valid.')
class PdfHelper:
    """Select or split pages of a single PDF file.

    Page numbers are 1-based throughout the public methods.
    """

    def __init__(self, file_name):
        """Open ``file_name`` and try the empty password if it is encrypted."""
        self.file_name = file_name
        self.reader = PdfFileReader(self.file_name, strict=False)
        self.num_pages = self.reader.getNumPages()
        if self.reader.isEncrypted:
            print('Trying to decrypt ...')
            try:
                self.reader.decrypt('')
                print('Success!')
            except Exception:
                print('Failed to decrypt')

    def split_pages(self, start_page, end_page, output_name):
        """Write pages ``start_page``..``end_page`` (inclusive) to ``output_name``.

        Prints an error and writes nothing when the range is not within
        ``1..num_pages`` or is reversed.
        """
        if 1 <= start_page <= end_page <= self.num_pages:
            self.select_pages(range(start_page, end_page + 1), output_name)
        else:
            print(
                'ERROR: page number out of range: start {}, end {}, total {}.'.
                format(start_page, end_page, self.num_pages))

    def select_pages(self, pages, output_name):
        """Write the given pages, in the given order, to ``output_name``.

        :param pages: iterable of 1-based page numbers
        :param output_name: path of the PDF to create
        Prints an error and writes nothing when ``pages`` is empty or any
        number falls outside ``1..num_pages``.
        """
        pages = list(pages)
        if not pages:
            print('ERROR: no pages selected.')
            return

        # Valid numbers are 1..num_pages. The old test (max - 1 <= total)
        # accepted total + 1, and nothing rejected numbers below 1.
        if min(pages) >= 1 and max(pages) <= self.num_pages:
            writer = PdfFileWriter()
            for p in pages:
                writer.addPage(self.reader.getPage(p - 1))
            with open(output_name, 'wb') as outfile:
                writer.write(outfile)
        else:
            print('ERROR: page number out of range: max {}, total {}.'.format(
                max(pages), self.num_pages))
def merge_pdf(self, data, base_date):
    """Merge each company's per-report PDFs into one combined PDF.

    For every row of ``data`` whose three status columns (4, 5, 6) are all
    ``'O'``, the files named after ``self.file_list`` are appended in order
    and written to ``<company>_<base_date>_종합.pdf`` in the company folder.
    Rows with another status, and rows whose merge fails, are logged as
    warnings and skipped.

    :param data: 2-D array indexed as ``data[row, column]``; columns 0-6 are
        read as company name, business number, department, director name
        and the three status flags
    :param base_date: date string used in folder and file names
    """
    import subprocess

    data_length = len(data[:, 0])
    for i in range(data_length):
        company_name = data[i, 0]
        bizno = data[i, 1]
        department = data[i, 2]
        director_name = data[i, 3]
        accounts_balance = data[i, 4]
        accounts_maturity = data[i, 5]
        credit_exposure = data[i, 6]
        action = f'\n{department}/{director_name}/{bizno}-{company_name}의 {base_date} pdf 병합'

        if accounts_balance != 'O' or accounts_maturity != 'O' or credit_exposure != 'O':
            log_str = action + "실패\n"
            self.logger.warning(log_str)
            continue

        file_location = os.path.join(os.getcwd() + "/pdf_file", department,
                                     base_date, director_name,
                                     company_name).replace(' ', '_')
        pdf_merger = PdfFileMerger()
        try:
            for file_name in self.file_list:
                file_path = os.path.join(
                    file_location,
                    f'{company_name}_{base_date}_{file_name}.pdf').replace(
                        ' ', '_')
                tmp_file_path = os.path.join(
                    file_location,
                    f'{company_name}_{base_date}_{file_name}_tmp.pdf'
                ).replace(' ', '_')

                made_tmp_copy = False
                pdf_file_object = open(file_path, 'rb')
                try:
                    pdf_file = PdfFileReader(pdf_file_object)
                    if pdf_file.isEncrypted:
                        try:
                            pdf_file.decrypt('')
                        except Exception:
                            # Fall back to qpdf. An argument list keeps the
                            # paths away from any shell interpretation.
                            subprocess.call(
                                ["qpdf", "--decrypt", file_path, tmp_file_path])
                            made_tmp_copy = True
                            pdf_file_object.close()
                            pdf_file_object = open(tmp_file_path, 'rb')
                            pdf_file = PdfFileReader(pdf_file_object)
                    pdf_merger.append(pdf_file)
                finally:
                    pdf_file_object.close()
                    # Only remove the temporary copy when one was created;
                    # removing a file that never existed would fail the
                    # whole merge.
                    if made_tmp_copy and os.path.exists(tmp_file_path):
                        os.remove(tmp_file_path)

            output_file_path = os.path.join(
                file_location, f'{company_name}_{base_date}_종합.pdf')
            pdf_merger.write(output_file_path)
        except Exception:
            self.logger.warning(f"\n{action} 병합실패\n")
def decrypt_file(self, password):
    """Write a decrypted copy of ``self.read_dir`` to ``self.write_dir``.

    :param password: password used when the source is encrypted
    An unencrypted source is copied as-is rather than failing, because
    ``decrypt`` raises on a document that is not encrypted.
    """
    reader = PdfFileReader(str(self.read_dir))
    writer = PdfFileWriter()

    if reader.isEncrypted:
        reader.decrypt(password)
    writer.appendPagesFromReader(reader)

    with self.write_dir.open(mode='wb') as output_file:
        writer.write(output_file)
def read_data(file_object):
    """Return a ``PdfFileReader`` for ``file_object``.

    An encrypted document is decrypted with the empty password before the
    reader is returned.
    """
    reader = PdfFileReader(file_object)
    if not reader.isEncrypted:
        return reader
    reader.decrypt('')
    return reader
def pdf(self):
    """Return a ``PdfFileReader`` for ``self.data``, or ``None`` if unreadable.

    The raw data is parsed first. If that raises, ``self._fixPdf`` is asked
    to repair the data and parsing is retried once. An encrypted document is
    decrypted with the empty password on a best-effort basis; failures are
    only logged. When no reader could be created,
    ``remove_image_previews(self.context)`` is called before returning.
    """
    pdf = None
    try:
        pdf = PdfFileReader(StringIO(self.data))
    except Exception:
        # ``except Exception`` (not a bare except) lets Ctrl-C/SystemExit
        # through; ``warning`` replaces the deprecated ``warn`` alias.
        logger.warning("Error opening pdf file, trying to fix it...")
        fixed_data = self._fixPdf(self.data)

        # try to reopen the pdf file again
        try:
            pdf = PdfFileReader(StringIO(fixed_data))
        except Exception:
            logger.warning("This pdf file cannot be fixed.")

    if pdf and pdf.isEncrypted:
        try:
            decrypt = pdf.decrypt('')
            # 0 is the value the original code treats as "password did not match".
            if decrypt == 0:
                logger.warning("This pdf is password protected.")
        except Exception:
            logger.warning("Errors while decrypting the pdf file.")

    if pdf is None:
        remove_image_previews(self.context)

    return pdf
def pdf_parser(s):
    """Return the document-info metadata of a PDF given as a string.

    :param s: raw PDF data; surrounding whitespace is stripped first
    :return: dict of the info entries with the first character of each key
        dropped; ``{}`` when decryption is not implemented for the
        document or it carries no info dictionary
    """
    s = s.strip()

    # required to suppress warning messages; the sink stays open for as
    # long as the reader is used so late warnings never hit a closed file.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(StringIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}

        meta = pdf.getDocumentInfo()
        if not meta:
            return {}

        return {key[1:]: meta.get(key) for key in meta.keys()}
def add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out):
    """Open ``pdf_file_in`` and, if it is encrypted, try the empty password.

    Returns ``False`` when ``decrypt('')`` raises.

    NOTE(review): in the visible portion ``pdf_file_mark``, ``pdf_file_out``
    and ``pdf_output`` are never used and ``input_stream`` is never closed;
    the watermarking step itself appears to be cut off -- confirm against
    the full source.
    """
    pdf_output = PdfFileWriter()
    input_stream = file(pdf_file_in, 'rb')
    pdf_input = PdfFileReader(input_stream)

    # The PDF file is encrypted.
    if pdf_input.getIsEncrypted():
        print '该PDF文件被加密了.'
        # Try to decrypt with the empty password.
        try:
            pdf_input.decrypt('')
        except Exception, e:
            print '尝试用空密码解密失败.'
            return False
        else:
            print '用空密码解密成功.'
def add(path, password='', writer=None, rules=RULE_DEFAULT):
    """Add the pages of one or more PDF paths to a ``PdfFileWriter``.

    Args:
        path (str, list): a path, a directory, a glob pattern or a list of
            any of these; a path may carry its own rule suffix
        password (str): password for encrypted files
        writer (PdfFileWriter): writer to extend; created when omitted
        rules (str): comma-separated page and rotation rules

    Returns:
        (PdfFileWriter). The writer holding the merged pages.
    """
    if writer is None:
        writer = PdfFileWriter()

    # A list is merged element by element.
    if isinstance(path, list):
        for entry in path:
            writer = add(entry, password, writer, rules)
        return writer

    embedded = RE_HAS_RULE.search(path)
    if embedded:
        path, rules = embedded.groups()
        rules = re.sub(r'\s', '', rules)  # remove all whitespace

    # A directory stands for every pdf inside it.
    if os.path.isdir(path):
        path = os.path.join(path, '*.pdf')

    # A pattern expands to several files.
    if '*' in path:
        return add(glob(path), password, writer, rules)

    # Base case: a single file.
    assert os.path.isfile(path), ERROR_PATH.format(path)
    # The stream stays open on purpose: the returned writer still reads
    # from it when it is eventually written.
    reader = PdfFileReader(open(path, 'rb'))
    if reader.isEncrypted:
        reader.decrypt(password)

    for rule in rules.split(','):
        parsed = RE_RULE.search(rule)
        assert parsed, ERROR_RULE.format(rule)
        rotate = parsed.groups()[3]
        angle = RULE_ROTATE[rotate]
        for page in rangify(parsed, reader.getNumPages()):
            source_page = reader.getPage(page - 1)
            writer.addPage(source_page.rotateClockwise(angle))

    return writer
def merge_files(local_pdfs):
    """Merge uploaded PDFs into one file and delete the originals.

    :param local_pdfs: names of files inside the configured upload folder
    :return: full path of the merged PDF, created in the same folder
    """
    # time.clock() no longer exists on current Python 3; fall back to
    # perf_counter() where it is missing. The value only seeds the name.
    clock = getattr(time, 'clock', None) or time.perf_counter
    name = 'merge_{0}_output.pdf'.format(str(clock())[2:])

    upload_folder = config().get(section='server', option='upload_folder')
    merged_export = PdfFileMerger()
    for pdfile in local_pdfs:
        filepath = getpath(pdfile, upload_folder)
        # ``open`` replaces the Python-2-only ``file`` builtin; the handle is
        # closed before the file is removed.
        with open(filepath, 'rb') as stream:
            file_bin = PdfFileReader(stream)
            if file_bin.getIsEncrypted():
                file_bin.decrypt('')
            merged_export.append(fileobj=file_bin)
        os.remove(filepath)

    full_output = getpath(name, upload_folder)
    with open(full_output, 'wb') as output:
        merged_export.write(output)
    return full_output
def split_pdf(pdf_filename, temp_dir):
    """Split the PDF into n PDFs (one for each page).

    :param pdf_filename: path of the PDF to split
    :param temp_dir: directory receiving ``<basename>-p<index>.pdf`` files
        (index starts at 0)
    :return: list of the created file paths, in page order
    """
    filenames = []
    base_name = os.path.basename(pdf_filename)

    # The source stays open only while pages are being copied out.
    with open(pdf_filename, "rb") as source:
        inputpdf = PdfFileReader(source)
        if inputpdf.getIsEncrypted():
            inputpdf.decrypt('')

        for i in range(inputpdf.numPages):
            output = PdfFileWriter()
            output.addPage(inputpdf.getPage(i))
            filename = "{0}/{1}-p{2}.pdf".format(temp_dir, base_name, i)
            with open(filename, "wb") as outputStream:
                output.write(outputStream)
            filenames.append(filename)

    return filenames
def extract_creation_date(filename):
    """Return the ``/CreationDate`` of a PDF as a naive ``datetime``.

    Only the ``YYYYMMDDHHmmSS`` part of the PDF date string is parsed; the
    trailing UTC offset is ignored.

    :param filename: path of the PDF
    :raises KeyError: when the document info has no ``/CreationDate``
    :raises AttributeError: when the value does not start with
        ``D:`` followed by 14 digits
    """
    with open(filename, "rb") as stream:
        # Add strict=False in order to avoid 'PdfReadWarning: Xref table not
        # zero-indexed. ID numbers for objects will be corrected.'
        pdf_toread = PdfFileReader(stream, strict=False)

        # "file has not been decrypted" error
        # https://github.com/mstamy2/PyPDF2/issues/51
        if pdf_toread.isEncrypted:
            pdf_toread.decrypt('')

        pdf_info = pdf_toread.getDocumentInfo()

        # PDF Reference, 3.8.3 Dates:
        # a date is an ASCII string of the form (D:YYYYMMDDHHmmSSOHH'mm'),
        # for example D:20170508085336+02'00'
        raw_date = pdf_info['/CreationDate']

    # Raw string: same pattern, without the invalid-escape warning.
    date_str = re.search(r'^D:(\d{14})', raw_date).group(1)
    timestamp = datetime.strptime(date_str, "%Y%m%d%H%M%S")
    return timestamp
def split_pdf(pdf_filename):
    """Split a PDF into one file per page, written beside the source.

    :param pdf_filename: path of the PDF to split
    :return: list of the created ``<dir>/<basename>-p<index>.pdf`` paths
        (index starts at 0), in page order
    """
    filenames = []

    # Both values are the same for every page, so compute them once.
    directory = os.path.dirname(pdf_filename)
    if directory == '':
        directory = '.'
    base_name = os.path.basename(pdf_filename)

    # The source stays open only while pages are being copied out.
    with open(pdf_filename, "rb") as source:
        inputpdf = PdfFileReader(source)
        if inputpdf.getIsEncrypted():
            inputpdf.decrypt('')

        for i in range(inputpdf.numPages):
            output = PdfFileWriter()
            output.addPage(inputpdf.getPage(i))
            filename = "{0}/{1}-p{2}.pdf".format(directory, base_name, i)
            with open(filename, "wb") as outputStream:
                output.write(outputStream)
            filenames.append(filename)

    return filenames
def read_pdf(filename):
    """Open a PDF file with PyPDF2.

    For an encrypted file the user is prompted (``prompt_for_pw``) until a
    password matches.

    :param filename: path of the PDF
    :return: the ``PdfFileReader``
    :raises CommandError: when ``filename`` does not exist
    """
    if not os.path.exists(filename):
        raise CommandError("{} does not exist".format(filename))

    # ``open`` replaces the Python-2-only ``file`` builtin. The stream is
    # left open on purpose: the returned reader keeps reading from it.
    pdf = PdfFileReader(open(filename, "rb"))
    if pdf.isEncrypted:
        while True:
            pw = prompt_for_pw(filename)
            matched = pdf.decrypt(pw)
            if matched:
                break
            print("The password did not match.")
    return pdf
def print_pdf(file_full_path, color_mode):
    """Analyzes the metadata of a .pdf file

    Prints a header, then every document-info entry (or a "no data"
    message). Unreadable or undecryptable files are reported and skipped.

    :param file_full_path: path of the PDF
    :param color_mode: when true, output goes through ``cprint`` with colours
    """
    # Header with file path
    if color_mode:
        cprint("\n[+] Metadata for file: %s" % (file_full_path), "green", attrs=["bold"])
    else:
        print("\n[+] Metadata for file: %s" % (file_full_path))

    stream = None
    try:
        # Open the file
        try:
            stream = open(file_full_path, "rb")
            pdf_file = PdfFileReader(stream)
        except Exception:
            if color_mode:
                cprint("Could not read this file. Sorry!", "red")
            else:
                print("Could not read this file. Sorry!")
            return

        if pdf_file.isEncrypted:
            # Temporary workaround, pdf encrypted with no pass
            try:
                pdf_file.decrypt('')
            except Exception:
                if color_mode:
                    cprint("\tCould not decrypt this file. Sorry!", "red")
                else:
                    print("\tCould not decrypt this file. Sorry!")
                return

        # Data structure with document information
        pdf_info = pdf_file.getDocumentInfo()

        # Print metadata
        if pdf_info:
            for metaItem in pdf_info:
                try:
                    if color_mode:
                        cprint("\t-" + metaItem[1:] + ": ", "cyan", end="")
                        cprint(pdf_info[metaItem])
                    else:
                        print("\t-" + metaItem[1:] + ": " + pdf_info[metaItem])
                except TypeError:
                    # The value could not be concatenated/printed as text.
                    if color_mode:
                        cprint("\t-" + metaItem[1:] + ": " + "Error - Item not readable", "red")
                    else:
                        print("\t-" + metaItem[1:] + ": " + "Error - Item not readable")
        else:
            if color_mode:
                cprint("\t No data found", "red")
            else:
                print("\t No data found")

        print("")
    finally:
        # Close the file on every exit path, including the early returns.
        if stream is not None:
            stream.close()
def pdf(which, page=None):
    """Serve one page of a stored pdf, extracting it to its own file if needed.

    Without ``page`` the page last saved for ``which`` is used.  ``'all'``
    sends the complete pdf file; a negative page redirects to page 0.  The
    requested page is written to the ``pages`` location only when that file
    does not exist yet, then it is remembered as the saved page and
    ``index.html`` is rendered.
    """
    requested = get_saved_page(which) if page is None else page
    if requested == 'all':
        return send_file(build_path(which))

    page = int(requested)
    source_path = build_path(which)
    pages_dir = build_path(which, 'pages')
    single_page_path = build_path(which, 'pages', page)

    if page < 0:
        return redirect('{}/{}'.format(which, 0))

    already_extracted = isfile(single_page_path)
    if not already_extracted:
        makedirs(pages_dir, exist_ok=True)
        writer = PdfFileWriter()
        with open(source_path, 'rb') as source:
            reader = PdfFileReader(source)
            if reader.isEncrypted:
                reader.decrypt('')
            writer.addPage(reader.getPage(page))
            # Written while the source is still open.
            with open(single_page_path, 'wb') as target:
                writer.write(target)

    set_saved_page(which, page)
    return render_template('index.html', which=which, page=page)
def __save_task_pdf(self, task):
    """Download the task's text pdf and save only the task's pages.

    The source is downloaded to a temporary file.  When ``task.text_pdf_url``
    contains ``#`` the download is treated as a zip archive and the pdf is
    obtained through ``__pdf_from_zip``.  The pages listed in ``task.pages``
    (1-based) are copied to ``self.task_pdf_path(task)``.  The temporary file
    is removed afterwards, whether or not an error occurred.

    :param task: the task whose pdf is saved
    """
    # Known before anything can fail, so ``finally`` never hits an unbound
    # name and never hides the real error.
    tmp_pdf_path = None
    try:
        sep = task.text_pdf_url.find('#')
        is_zip = sep != -1
        extension = '.zip' if is_zip else '.pdf'
        tmp_pdf_path = os.path.join(self.__tmp_dir, task.key() + extension)
        task.download_text_pdf(tmp_pdf_path)
        if is_zip:
            # NOTE(review): from here on only the returned path is cleaned
            # up; presumably __pdf_from_zip disposes of the archive - confirm.
            tmp_pdf_path = self.__pdf_from_zip(task, tmp_pdf_path, sep)
        with open(tmp_pdf_path, 'rb') as input_stream:
            input_pdf = PdfFileReader(input_stream)
            if input_pdf.isEncrypted:
                input_pdf.decrypt('')
            output_pdf = PdfFileWriter()
            for page in task.pages:
                output_pdf.addPage(input_pdf.getPage(page - 1))
            with open(self.task_pdf_path(task), 'wb') as output_stream:
                output_pdf.write(output_stream)
    finally:
        # Only remove what actually exists (the download may have failed).
        if tmp_pdf_path is not None and os.path.exists(tmp_pdf_path):
            os.remove(tmp_pdf_path)
def __Get_info(file_path, plain_log, csv_log, analyzed_files, total_files):
    """Read the metadata of one pdf file, print it and optionally log it.

    Unencrypted files and files encrypted with a blank password can be read.
    Any error while reading is printed through ``__Print_error`` together
    with the file name.

    Args:
        - file_path: (string) Absolute file path.
        - plain_log: (None | string) Log file in plain text.
        - csv_log: (None | string) Log file in csv format.
        - analyzed_files: counter of analyzed files (see note in the body).
        - total_files: not used in this function.
    """
    file_name = os.path.basename(file_path)
    file_size = os.path.getsize(file_path)
    encrypted = 'No'
    try:
        # Try to open not password encrypted pdf files and pdf files
        # encrypted with a blank password.  The stream is closed as soon as
        # the metadata has been read.
        with open(file_path, 'rb') as pdf_stream:
            pdf_file = PdfFileReader(pdf_stream)
            if pdf_file.getIsEncrypted() is True:
                dec_res = pdf_file.decrypt('')
                if dec_res == 1:
                    encrypted = 'Yes'
            # Get and parse metadata
            doc_info = pdf_file.getDocumentInfo()
            title, author, creator, subject, producer, c_date, m_date \
                = __Parse_doc_info(doc_info)
            num_pages = pdf_file.getNumPages()
        # Group info
        pdf_meta = pdf_metadata(file_name, title, author, creator, subject,
                                producer, c_date, m_date, encrypted,
                                num_pages, file_size)
        __Print_metadata(pdf_meta)
        if plain_log:
            Log(file_name, pdf_meta, plain_log, 'txt')
        if csv_log:
            # Was the undefined name ``f_log_csv``; the csv log target is the
            # ``csv_log`` parameter, mirroring the plain-text branch.
            Log(file_name, pdf_meta, csv_log, 'csv')
        # NOTE(review): this only rebinds the local name; the caller's
        # counter is not changed and nothing is returned.
        analyzed_files = analyzed_files + 1
    except Exception as e:
        error = file_name + ' ' + str(e)
        __Print_error(error)
def read(self, payload, **kwargs):
    """
    Extract text from a PDF file

    :param bytes payload: Contents of pdf file
    :param **kwargs kwargs: Additional attributes; ``pdf_password`` is read
                            when the PDF is encrypted

    :returns: Extracted text of all pages, concatenated
    :rtype: str

    """
    # Ensure the payload is a BytesIO object
    payload_object = BytesIO(payload)

    # Parse the PDF payload
    pdf_object = PdfFileReader(payload_object, strict=False)

    # Determine if the pdf is encrypted, if so, let's attempt to decrypt
    if pdf_object.isEncrypted:
        try:
            # Returns 0 if the password failed, 1 if the password matched
            # the user password, and 2 if the password matched the owner
            # password.
            decrypt_return = pdf_object.decrypt(kwargs['pdf_password'])
            if decrypt_return == 0:
                self.stoq.log.warn("Incorrect PDF encryption password")
        except NotImplementedError:
            self.stoq.log.warn("Unsupported encryption method")
        except Exception:
            # Includes the KeyError raised when no ``pdf_password`` was
            # passed.  Narrowed from a bare ``except`` so KeyboardInterrupt
            # and SystemExit are no longer swallowed.
            self.stoq.log.error("Unable to decrypt PDF. Was a password provided?")

    # Iterate over the pages and collect the text of each one
    results = [page.extractText() for page in pdf_object.pages]

    return "".join(results)
def pdfobj(doc):
    """Return a ``PdfFileReader`` for the data of ``doc``, or None.

    If the data cannot be opened directly, it is passed through ``_fixPdf``
    and opened again.  Encrypted files are decrypted with the empty password;
    a failure to do so is only logged, the reader is still returned.

    :param doc: object whose pdf data is obtained with ``data(doc)``
    :return: the reader, or None when the file could not be opened at all
    """
    pdf = None
    # ``except Exception`` rather than a bare ``except``: best-effort
    # handling, but KeyboardInterrupt and SystemExit are not swallowed.
    try:
        pdf = PdfFileReader(StringIO(data(doc)))
    except Exception:
        logger.warn('Error opening pdf file, trying to fix it...')
        fixed_data = _fixPdf(data(doc))

        # try to reopen the pdf file again
        try:
            pdf = PdfFileReader(StringIO(fixed_data))
        except Exception:
            logger.warn('This pdf file cannot be fixed.')

    if pdf and pdf.isEncrypted:
        try:
            decrypt = pdf.decrypt('')
            if decrypt == 0:
                logger.warn('This pdf is password protected.')
        except Exception:
            logger.warn('Errors while decrypting the pdf file.')

    return pdf
#!/usr/bin/env python
# coding: utf-8
'''IMBdownloader : merge the downloaded Internet Magazine back-number PDFs.

For every sub-directory of the current directory, all ``*.pdf`` files in it
are concatenated (each without its last page) into one file named
``InternetMag<directory>.pdf`` in the current directory.
'''
import glob

from PyPDF2 import PdfFileWriter, PdfFileReader

magName = 'InternetMag'

dirs = glob.glob('*/')  # every sub-directory of the current directory
for dir_name in dirs:
    pdfName = magName + dir_name[:-1] + '.pdf'  # ex) InternetMag194410.pdf
    outPdf = PdfFileWriter()  # make empty pdf
    opened = []  # input streams; they must stay open until the output is written
    try:
        for path in glob.glob(dir_name + '*.pdf'):
            stream = open(path, "rb")
            opened.append(stream)
            inPdf = PdfFileReader(stream)
            if inPdf.isEncrypted:  # some pdf were encrypted
                inPdf.decrypt("")  # they open with the empty password
            pageNum = inPdf.getNumPages() - 1  # drop the last page of each file
            for p in range(pageNum):  # getPage(0) is page 1
                outPdf.addPage(inPdf.getPage(p))
        # Close (and thereby flush) the merged file instead of leaking it.
        with open(pdfName, "wb") as out_stream:
            outPdf.write(out_stream)
    finally:
        for stream in opened:
            stream.close()
    print(pdfName)
### Settings ###

path = 'practice_files'

############
### MAIN ###
############

# Chapter 11.1 review ex 1: open the file named Walrus.pdf from the
# Chapter 11 practice files; it has to be decrypted with the password
# "IamtheWalrus".
input_file_name = os.path.join(path, 'Walrus.pdf')
input_file = PdfFileReader(open(input_file_name, 'rb'))
input_file.decrypt('IamtheWalrus')  # the file is password protected

output_PDF = PdfFileWriter()

# Chapter 11.1 review ex 2: rotate every page of the input file
# counter-clockwise by 90 degrees.
page_count = input_file.getNumPages()
for current_page in range(page_count):
    page = input_file.getPage(current_page)
    page.rotateClockwise(-90)  # a negative angle rotates to the left

    # Chapter 11.1 review ex 3: split each page in half vertically, such
    # that every column appears on its own separate page.
    page_left = input_file.getPage(current_page)
    page_right = copy.copy(page_left)
    upper_right = page_left.mediaBox.upperRight
def extract(cls, file):
    """Create a ``cls`` instance from ``file`` and the pdf properties read from it.

    The properties are the page count plus the document-info entries that
    are present and not None (title, subject, author, creator, producer,
    created, modified; the two dates go through ``parse_date``).  An empty
    property dict is used when the document cannot be decrypted with the
    empty password or triggers the PyPDF2 ``NumberObject`` error.

    :raises ExtractionError: when the pdf reader cannot be opened
    """
    try:
        reader = PdfFileReader(file)
    except PdfReadError as e:
        raise six.raise_from(ExtractionError("Could not open pdf reader"), e)
    except TypeError as e:
        known_pypdf2_bug = (
            str(e) == "'NumberObject' object has no attribute '__getitem__'"
        )
        if not known_pypdf2_bug:
            raise
        # PyPDF2 fails this way on some valid pdf files
        return cls(file, dict())

    if reader.isEncrypted:
        # try the empty password
        try:
            success = reader.decrypt('')
        except NotImplementedError:
            # Unsupported encryption method: probably still a real pdf
            # document, but its metadata is out of reach without the password.
            return cls(file, dict())
        if success == 0:
            # Wrong password: same situation as above.
            return cls(file, dict())
        # success values 1 and 2 mean the document is readable now

    props = OrderedDict()
    try:
        props['pages'] = reader.numPages
    except PdfReadError:
        pass

    try:
        info = reader.documentInfo
    except PdfReadError:
        info = None
    if info is None:
        return cls(file, props)

    # (document-info key, property name, optional value parser)
    fields = (
        ('Title', 'title', None),
        ('Subject', 'subject', None),
        ('Author', 'author', None),
        ('Creator', 'creator', None),
        ('Producer', 'producer', None),
        ('CreationDate', 'created', parse_date),
        ('ModDate', 'modified', parse_date),
    )
    for key, prop, parser in fields:
        try:
            value = info['/%s' % key]
        except KeyError:
            continue
        if value is None:
            continue
        if parser:
            value = parser(value)
            if value is None:
                continue
        props[prop] = value

    return cls(file, props)
def generate_pdf(self):
    """ Checks all items and linked files to generate a huge PDF with
    all files concatenated. This action might be quite expensive, so it
    should not be called too often. The only reasons when it is called
    should be:
      - when a FileAttachment is modified (see events.concatenate_pdf)
      - when a FileAttachment is deleted (see
        base_agendaitem.manage_delObjects)
      - when an agenda item is deleted (see
        base_meeting.manage_delObjects)

    The result is stored in ``self.pdf`` (``None`` when there is no pdf
    attachment at all). An attachment that cannot be decrypted with the
    empty password is replaced by a generated page naming the attachment.
    A blank page is added after every attachment except the last one
    whenever the page count so far is odd.
    """
    files = []
    for item in self.find_items():
        item = item.getObject()
        for att_id in item.contentIds():
            if item.is_attachment_pdf(att_id):
                files.append(
                    {'file': StringIO(item[att_id].getFile()),
                     'attachment': '%s/%s' % (item.absolute_url(), att_id)})

    if not files:
        self.pdf = None
        return

    self.pdf = PdfFileWriter()

    # Settings when a custom page has to be written.
    font = "Helvetica"
    font_size = 12

    last_index = len(files) - 1
    for index, f in enumerate(files):
        pdf = PdfFileReader(f['file'])
        if pdf.isEncrypted:
            try:
                if pdf.decrypt('') == 0:
                    # There is two cases:
                    # - the decrypt method raise an error because
                    # it can not decrypt
                    # - the decrypt method just returns 0 to tell
                    # it was not able to decrypt (in this case, we
                    # raise an exception ourself to create the
                    # default page)
                    raise Exception('Ho noes, we can not decrypt')
            except Exception:
                # Narrowed from a bare ``except`` so KeyboardInterrupt and
                # SystemExit still propagate.
                logger.info('Could not decrypt pdf file at "%s"',
                            f['attachment'])
                # We generate a simple page to tell the user
                # we were not able to include this file.
                text = f['attachment']
                page = StringIO()
                my_canvas = canvas.Canvas(page)
                my_canvas.linkURL(f['attachment'], 0)
                my_canvas.setFont(font, font_size)
                my_canvas.drawCentredString(
                    4.0 * inch, 8.5 * inch, 'Could not integrate file at:')
                my_canvas.drawCentredString(
                    4.0 * inch, 8.0 * inch, text)
                my_canvas.save()

                pdf = PdfFileReader(page)

        # Plain loop: the pages are added for the side effect only.
        for page_num in range(pdf.numPages):
            self.pdf.addPage(pdf.getPage(page_num))

        # Position in the list identifies the last file; no need to compare
        # the dicts themselves.
        if (self.pdf.getNumPages() % 2) == 1 and index != last_index:
            self.pdf.addBlankPage()
def get_link_text(url, mime_type, data=None, clean=False):
    '''
    Take URL, MIME type, and optional data to produce the link text.

    The default text is "File on <tld>".  Images give "Image on <tld>".
    For PDFs the document title is used, falling back to "PDF on <tld>".
    For HTML the first of og:title, twitter:title, meta name=title, schema
    itemprop=name or the <title> tag is used, falling back to
    "Page on <tld>".  With ``clean`` set, titles taken from the meta title
    or the <title> tag go through ``messy_title_parse``.  Results longer
    than 255 characters are cut to 253 characters plus " …".
    '''
    tld = get_tld(url)
    result = "File on " + tld
    if mime_type.startswith("image"):
        result = "Image on " + tld
    elif "application/pdf" in mime_type:
        logging.debug("PDF detected")
        # The reader needs seek(), so wrap the bytes in a stream
        data = io.BytesIO(data)
        try:
            # PyPDF2 keeps emitting "PdfReadWarning: Xref table not
            # zero-indexed".  Silence warnings only while the PDF is parsed
            # instead of changing the filters of the whole process.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                pdf = PdfFileReader(data, strict=True)
                # PyPDF2 somehow thinks many PDFs are encrypted with the
                # empty string, so deal with that
                if pdf.isEncrypted:
                    pdf.decrypt('')
                doc_info = pdf.getDocumentInfo()
            # A PDF without document information yields None here; treat it
            # like a missing title instead of failing on ``.title``.
            result = doc_info.title if doc_info is not None else None
            if not result or result.strip() == "":
                result = "PDF on " + tld
        except PyPDF2.utils.PdfReadError:
            result = "PDF on " + tld
    elif "text/html" in mime_type:
        try:
            soup = BeautifulSoup(data, 'html.parser')
            meta = soup.find_all("meta")
            og_title_lst = []
            twitter_title_lst = []
            meta_title_lst = []
            schema_lst = []
            for i in meta:
                if i.get("property") == "og:title":
                    og_title_lst.append(i.get("content"))
                elif i.get("property") == "twitter:title":
                    twitter_title_lst.append(i.get("content"))
                elif i.get("name") == "title":
                    meta_title_lst.append(i.get("content"))
                elif i.get("itemprop") == "name":
                    schema_lst.append(i.get("content"))
            if og_title_lst:
                logging.debug("found og:title")
                result = og_title_lst[0].strip()
            elif twitter_title_lst:
                logging.debug("found twitter title")
                result = twitter_title_lst[0].strip()
            elif meta_title_lst:
                logging.debug("found meta name title")
                result = meta_title_lst[0].strip()
                if clean:
                    result = messy_title_parse(result, url)
            elif schema_lst:
                logging.debug("found schema title")
                result = schema_lst[0].strip()
            elif soup.title and soup.title.string:
                logging.debug("found title tag")
                result = html.unescape(soup.title.string)
                if clean:
                    result = messy_title_parse(result, url)
            else:
                logging.debug("no title found; using default")
                result = "Page on " + tld
        except AttributeError:
            # Probably just empty title when trying to get
            # soup.title.string
            logging.debug("FIXME: this isn't supposed to happen")
            result = "Page on " + tld
    if len(result) > 255:
        result = result[:253] + " …"

    return result
def process_file(self,curr_file):
    """Process the provided file and record its metadata in self.container.

    If the file is a PDF, the PyPDF2 library is used. Any other file is
    opened as a zip archive and the docProps/core.xml and docProps/app.xml
    parts of an Office document are read.

    On success one row ``[file name, created, author, producer, modded,
    last_saved]`` is appended to ``self.container``; fields that could not
    be determined hold the string "None".

    Parameters:
    curr_file   The filepath of the file to be processed
    """
    def pretty_pdf_date(raw):
        # "D:YYYYMMDDHHMM..." -> "MM/DD/YYYY HH:MM AM/PM"
        data = raw.strip("D:|'")
        clock = data[8:10] + ":" + data[10:12]
        clock = time.strftime("%I:%M %p",time.strptime(clock,"%H:%M"))
        return data[4:6] + "/" + data[6:8] + "/" + data[0:4] + " " + clock

    def display_name(path):
        # Drop the directory part and any backslashes from the name.
        if "/" in path:
            path = path[path.rfind("/")+1:]
        if "\\" in path:
            path = path.replace("\\","")
        return path

    modded = "None"
    author = "None"
    created = "None"
    producer = "None"
    last_saved = "None"

    # Process the current file as a PDF
    if ".pdf" in curr_file:
        try:
            # The stream is closed again once the metadata has been read.
            with open(curr_file,"rb") as pdf_stream:
                pdf_file = PdfFileReader(pdf_stream)
                if pdf_file.getIsEncrypted():
                    pdf_file.decrypt('')
                # getDocumentInfo() returns something like:
                # {'/Author': 'Chris Maddalena',
                #  '/CreationDate': "D:20131014182824-04'00'",
                #  '/Creator': 'Microsoft® Excel® 2013',
                #  '/ModDate': "D:20131015141200-04'00'",
                #  '/Producer': 'Microsoft® Excel® 2013'}
                doc_info = pdf_file.getDocumentInfo()
                # If there is no info, just return
                if not doc_info:
                    return
                if "/CreationDate" in doc_info:
                    created = pretty_pdf_date(doc_info["/CreationDate"])
                if "/Author" in doc_info:
                    author = doc_info["/Author"]
                if "/Producer" in doc_info:
                    # Remove the literal "(Windows)" marker. str.strip()
                    # with that argument would strip any of its characters
                    # from both ends and eat into the product name.
                    producer = doc_info["/Producer"].replace("(Windows)","")
                    producer = re.sub(r'[^\w]',' ',producer)
                    # Collapse runs of spaces into a single space
                    producer = re.sub(r' {2,}',' ',producer)
                if "/ModDate" in doc_info:
                    modded = pretty_pdf_date(doc_info["/ModDate"])
            # Strips '/' off filename (if it includes directory name)
            curr_file = display_name(curr_file)
            # Add the document info to the container
            self.container.append([curr_file,created,author,producer,modded,last_saved])
        except Exception:
            # Best effort: a PDF that cannot be read is skipped silently.
            return
    # Not a PDF, so treat the current file as an Office doc
    else:
        # The path goes straight to zipfile, not to a shell, so it must not
        # be shell-escaped: escaping made names with spaces or parentheses
        # impossible to open.
        try:
            # Unzip the contents of the document to get the contents of core.xml and app.xml files
            with zipfile.ZipFile(curr_file) as unzipped:
                doc_xml = lxml.etree.fromstring(unzipped.read("docProps/core.xml"))
                app_xml = lxml.etree.fromstring(unzipped.read("docProps/app.xml"))
            # Namespaces for doc.xml
            dc_ns = {"dc":"http://purl.org/dc/elements/1.1/"}
            cp_ns = {"cp":"http://schemas.openxmlformats.org/package/2006/metadata/core-properties"}
            dcterms_ns = {"dcterms":"http://purl.org/dc/terms/"}
            author = doc_xml.xpath('//dc:creator',namespaces=dc_ns)[0].text
            modded = doc_xml.xpath('//cp:lastModifiedBy',namespaces=cp_ns)[0].text
            created = doc_xml.xpath('//dcterms:created',namespaces=dcterms_ns)[0].text
            last_saved = doc_xml.xpath('//dcterms:modified',namespaces=dcterms_ns)[0].text
            # Convert the created time to a prettier format
            created_date = created.split("T")[0]
            created_time = created.split("T")[1].strip("Z")
            modded_time = time.strftime("%I:%M %p",time.strptime(created_time,"%H:%M:%S"))
            created = created_date + " " + modded_time
            # Determine the Office application and version that created this document
            # NOTE(review): an AppVersion outside the four known ones leaves
            # ``version`` unset, so the producer lookup below fails and the
            # file is reported as failed - confirm this is intended.
            for child in app_xml:
                if 'AppVersion' in child.tag:
                    office_version = child.text
                    if "16." in office_version:
                        version = "2016"
                    elif "15." in office_version:
                        version = "2013"
                    elif "14." in office_version:
                        version = "2010"
                    elif "12." in office_version:
                        version = "2007"
            if ".xls" in curr_file:
                producer = "Microsoft Excel " + version
            elif ".doc" in curr_file:
                producer = "Microsoft Word " + version
            elif ".ppt" in curr_file:
                producer = "Microsoft PowerPoint " + version
            # Remove any slashes in the filename
            curr_file = display_name(curr_file)
            # Add the results to the container
            self.container.append([curr_file,created,author,producer,modded,last_saved])
        except Exception as error:
            click.secho("[!] Failed to extract metadata from {}!".format(curr_file),fg="red")
            click.secho("L.. Details: {}".format(error),fg="red")
def tweak(
    self, pdfstream, skip_sections=0, mainsections_count=None,
    reverse_naming=False
):
    """
    Parse the pdf behind ``pdfstream`` and write the split output files.

    The file is reopened by name, its pages are registered, data is
    extracted with ``getdata`` and, when data was found, every chunk yielded
    by ``split_stream`` is written with ``printpages``.

    :param pdfstream: open file object; only its ``name`` is used here
    :param int skip_sections: In order to handle several documents in the
        same file, I introduced skip_sections: this tells the parser that
        previous sections have been handled by another parser
    :param int mainsections_count: same purpose as above; passed to
        ``getdata`` unchanged
    :param bool reverse_naming: defaults to False.
        for Port-Parallele - outline is reversed (analytic code / entr_name)
    """
    # time.clock() no longer exists on Python >= 3.8; default_timer is
    # available on both Python 2 and 3.
    from timeit import default_timer

    self.logger.debug("Writing to {0}".format(self.output_dir))
    mkdir_p(self.output_dir, self.logger)

    filename = pdfstream.name
    with open(filename, 'rb') as duplicate_pdfstream:
        inputpdf = PdfFileReader(duplicate_pdfstream)
        if inputpdf.isEncrypted:
            inputpdf.decrypt('')

        pages_nb = inputpdf.getNumPages()
        if not self.pages_to_process:
            # 0 means no restriction
            self.pages_to_process = pages_nb

        self.logger.info("%s has %d pages", filename, pages_nb)
        self.logger.info(
            "Estimated time for completion of %d pages on "
            "an average computer: %.f seconds. Please stand by while "
            "the parsing takes place.",
            self.pages_to_process,
            self._UNITARY_TIME*self.pages_to_process
        )
        start = default_timer()
        self.register_pages(inputpdf, pages_nb)
        if not self.getdata(
            inputpdf, filename, pages_nb, skip_sections,
            mainsections_count,
        ):
            self.logger.critical(
                "No data could be extracted! "
                "Not splitting, sorry"
            )
            return

        self.logger.debug("Now writing files")
        did_print = False
        for iteration, printinfo in enumerate(self.split_stream(pages_nb)):
            self.printpages(
                iteration, *printinfo, reverse_naming=reverse_naming
            )
            did_print = True
        if not did_print:
            self.logger.critical("No page of output!")

        duration = default_timer() - start
        closing_message(self.logger, duration)
def info(pdf):
    """Collect metadata for the pdf file at path ``pdf``.

    Values come from the document information, then from the ``dc_*`` XMP
    attributes (only for keys not set yet).  Afterwards the data is
    normalized: a valid ``identifier`` becomes ``isbn``, 'Unknown' values
    are dropped, ``language`` goes through ``get_language``, an 8-digit
    ``date`` is rewritten as YYYY-MM-DD and a string ``author`` is split on
    ', '.  ``textsize`` is the length of the extracted text.  A failure
    while parsing the pdf is logged at debug level and the remaining steps
    still run.

    :param pdf: path of the pdf file
    :return: dict with the collected metadata
    """
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                pdfreader.decrypt('')
            info = pdfreader.getDocumentInfo()
            if info:
                for key in info:
                    if info[key]:
                        try:
                            value = info[key]
                            if len(value) == 1:
                                value = value[0]
                            if isinstance(value, bytes):
                                value = value.decode('utf-16')
                            data[key[1:].lower()] = value
                        except Exception:
                            # best effort: skip entries that cannot be read
                            pass

            xmp = pdfreader.getXmpMetadata()
            if xmp:
                for key in dir(xmp):
                    if key.startswith('dc_'):
                        value = getattr(xmp, key)
                        if isinstance(value, dict) and 'x-default' in value:
                            value = value['x-default']
                        elif isinstance(value, list):
                            value = [v.strip() if isinstance(v, str) else v for v in value if v]
                            value = [v.strftime('%Y-%m-%d') if isinstance(v, datetime) else v for v in value]
                            if len(value) == 1:
                                value = value[0]
                        _key = key[3:]
                        if value and _key not in data:
                            data[_key] = value
        except Exception:
            # Narrowed from a bare ``except`` so KeyboardInterrupt and
            # SystemExit are not swallowed.
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)

    if 'identifier' in data:
        value = normalize_isbn(data['identifier'])
        if stdnum.isbn.is_valid(value):
            data['isbn'] = [value]
        del data['identifier']
    for key, value in data.items():
        if isinstance(value, dict):
            value = ' '.join(list(value.values()))
            data[key] = value.strip()
    for key in list(data):
        if data[key] in ('Unknown',):
            del data[key]
            # The key is gone; reading it again below would raise KeyError
            # (this happened for language == 'Unknown').
            continue
        if key == 'language':
            data[key] = get_language(data[key])
    text = extract_text(pdf)
    data['textsize'] = len(text)
    if settings.server['extract_text']:
        if 'isbn' not in data:
            isbn = extract_isbn(text)
            if isbn:
                data['isbn'] = [isbn]
    if 'isbn' in data and isinstance(data['isbn'], str):
        data['isbn'] = [data['isbn']]
    if 'date' in data and len(data['date']) == 8 and data['date'].isdigit():
        d = data['date']
        data['date'] = '%s-%s-%s' % (d[:4], d[4:6], d[6:])
    if 'author' in data and isinstance(data['author'], str):
        data['author'] = data['author'].split(', ')
    return data
# main(): try to match every file found in ../../samples against the data
# provider and print one result per file.
#
# For each file the PDF document info is read (encrypted files are decrypted
# with the empty password). When a usable title and/or author remains after
# removeShortTerms(), searchDataprovider() is queried with them and
# selectBestMatch() picks the best hit (qualityThreshold is 80).
# When there is no metadata or no acceptable hit, the text returned by
# pdf_to_txt() is split into paragraphs; up to the first five paragraphs that
# have more than min_tokens words are searched as titles, the hits are kept
# sorted with bisect and the last (highest) one is used.
# Every file ends up as one dict in overallResult: a match with quality, id,
# participants and title, or {'match': False} with reason 'no match' or
# 'exception' (for the exception types listed in the except clause).
#
# NOTE(review): Python 2 only (print statements).
# NOTE(review): the file handed to PdfFileReader is opened but never closed.
# NOTE(review): this block has lost its line breaks and indentation; spaces
# inside its string literals may have been collapsed as well - verify against
# the original file before relying on the exact output format.
def main(): sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout) mypath = u'../../samples' min_tokens = 2 onlypdfFiles = [ f for f in listdir(mypath) if isfile(join(mypath,f)) ] qualityThreshold = 80 overallResult = [] for f in onlypdfFiles: print f try: pdfFile = PdfFileReader(open(mypath + '/' + f, "rb")) if pdfFile.isEncrypted: pdfFile.decrypt('') metainfo = pdfFile.getDocumentInfo() res = [] hit = [] bestHit = [] title = None author = None #in case there are some metadata if metainfo != None: #removing useless (since they are too short) terms if metainfo.title != None: title = removeShortTerms([metainfo.title], 5) if title == []: title = None else: title = title[0] if metainfo.author != None: author = removeShortTerms([metainfo.author], 5) if author == []: author = None else: author = author[0] #in case there are author and/or title information in the metadata if (author != None or title != None): if author != None and title != None: query = ('title', title),('person', author) else: if author != None: query = (('person', author),) if title != None: query = (('title', title),) res = searchDataprovider(query) if res['hits']['total'] > 0: bestHit = selectBestMatch([q[1] for q in query], res['hits']['hits'], qualityThreshold, creatorWeight=1, titleWeight=1) if bestHit != None: participants = getParticipants(bestHit[1]) overallResult.append({'match': True, 'quality': bestHit[0], 'filename': f, 'id': bestHit[1]['id'], 'participants': [a for a in participants if len(participants) > 0], 'title': bestHit[1]['title']}) # when there are no metainfomation available or # there where no decent results if bestHit == [] or bestHit == None: hits = [] s = pdf_to_txt(mypath + '/' + f, 0, 0) paragraphs = re.split(' *\n+ *', s) paragraphs = removeShortTerms(paragraphs, 5) end = min(5, len(paragraphs)) for a in range(0, end): #search only if there are more than min_tokens words if len(paragraphs[a].split()) > min_tokens: res = searchDataprovider((('title', 
paragraphs[a]),)) if res != None and res['hits']['total'] > 0: hit = selectBestMatch(paragraphs, res['hits']['hits'], qualityThreshold) if hit != None: bisect.insort_left(hits, hit) if len(hits) > 0: bestHit = hits[-1] #creator, person, contributor etc. unionizen und in authos unterbringen participants = getParticipants(bestHit[1]) overallResult.append({'match': True, 'quality': bestHit[0], 'filename': f, 'id': bestHit[1]['id'], 'participants': [a for a in participants if len(participants) > 0], 'title': bestHit[1]['title']}) else: overallResult.append({'match': False, 'reason': 'no match', 'filename': f}) except (AttributeError, PdfReadError, IOError, AssertionError, KeyError, NotImplementedError, PDFTextExtractionNotAllowed, TypeError) as e: print "exception:" for arg in e.args: print arg overallResult.append({'match': False, 'reason': 'exception', 'filename': f}) print("done. Results:") for i, r in enumerate(overallResult): if r['match'] == True: print str(i) + '.' + ' match: True' + '\n' \ ' quality: ' + str(r['quality']) + '\n' + \ ' filename. ' + r['filename'] + '\n' + \ ' id: ' + r['id'] + '\n' + ' title: ' + r['title'] for p in r['participants']: print ' participant: ' + p else: print str(i) + '.' + ' match: False' + '\n' + \ ' reason: ' + r['reason'] + '\n' + \ ' filename: ' + r['filename']
"""
Read a PDF file

Version: 0.1
Author: Luo Hao
Date: 2018-03-26
"""
from PyPDF2 import PdfFileReader

with open('./res/Python课程大纲.pdf', 'rb') as pdf_stream:
    pdf_reader = PdfFileReader(pdf_stream, strict=False)
    # number of pages in the document
    print(pdf_reader.numPages)
    if pdf_reader.isEncrypted:
        pdf_reader.decrypt('')
    # index 5 is the sixth page
    sixth_page = pdf_reader.getPage(5)
    print(sixth_page)
    print(sixth_page.extractText())