def downloadpdf(url):
    """Download *url* and return it wrapped in a PDF reader.

    Returns:
        The reader object for the downloaded document, the result of
        ``Scratcher.handlepdf`` when decrypting with an empty password
        raises, or ``None`` when the response is ``text/html`` or cannot be
        parsed as a PDF.

    Exits the process with a message if the connection fails.
    """
    try:
        # NOTE(review): verify=False disables TLS certificate checks; kept
        # because callers may depend on it.
        request = requests.get(url, verify=False)
        # .get() avoids a KeyError when the server sends no Content-Type.
        if request.headers.get('Content-Type') == 'text/html':
            return None
    except requests.exceptions.ConnectionError:
        sys.exit(
            "\nThere was an error when trying to connect to the domain. "
            "Please confirm if the domain is correctly written.\n"
        )

    try:
        objbyte = BytesIO(request.content)
    except Exception as e:
        print(e)
        return None

    try:
        # Silence anything the parser prints while reading the document.
        s_stdout = sys.stdout
        sys.stdout = BytesIO()
        try:
            pdf = PdfFileReader(objbyte)
        finally:
            # Always restore stdout, otherwise the error below (and every
            # later print) would be written into the throw-away buffer.
            sys.stdout = s_stdout
    except Exception as e:
        print(e)
        return None

    if pdf.getIsEncrypted():
        try:
            pdf.decrypt('')
        except Exception:
            # NOTE(review): decrypt() reports a wrong password by returning
            # 0 rather than raising, so this fallback only runs on errors.
            pdf = Scratcher.handlepdf(request.content)
    return pdf
def add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out):
    """Add a watermark to every page of a PDF.

    The first page of ``pdf_file_mark`` is merged onto each page of
    ``pdf_file_in`` and the result is written to ``pdf_file_out``.

    Returns:
        ``False`` when the input is encrypted and cannot be opened with an
        empty password; ``None`` otherwise.
    """
    pdf_output = PdfFileWriter()
    with open(pdf_file_in, 'rb') as input_stream:
        pdf_input = PdfFileReader(input_stream)

        # The PDF file is encrypted.
        if pdf_input.getIsEncrypted():
            print('该PDF文件被加密了.')
            # Try to decrypt with an empty password. decrypt() returns 0 for
            # a wrong password instead of raising, so check the result too.
            try:
                decrypted = pdf_input.decrypt('')
            except Exception:
                decrypted = 0
            if not decrypted:
                print('尝试用空密码解密失败.')
                return False
            print('用空密码解密成功.')

        # Number of pages in the input document.
        page_num = pdf_input.getNumPages()

        # Read the watermark PDF.
        with open(pdf_file_mark, 'rb') as pdf_watermark_input_stream:
            pdf_watermark = PdfFileReader(pdf_watermark_input_stream)
            watermark_page = pdf_watermark.getPage(0)

            # Stamp the watermark onto every page.
            for i in range(page_num):
                page = pdf_input.getPage(i)
                page.mergePage(watermark_page)
                page.compressContentStreams()  # compress the content
                pdf_output.addPage(page)

            # Write while both inputs are still open: page data is read
            # from them on demand.
            with open(pdf_file_out, "wb") as output_stream:
                pdf_output.write(output_stream)
def add_watermark(pdf_file_mark, pdf_file_in, pdf_file_out):
    """Add a watermark to every page of a PDF.

    The first page of ``pdf_file_mark`` is merged onto each page of
    ``pdf_file_in`` and the result is written to ``pdf_file_out``.

    Returns:
        ``False`` when the input is encrypted and cannot be opened with an
        empty password; ``None`` otherwise.
    """
    with open(pdf_file_in, 'rb') as fp:
        pdf_input = PdfFileReader(fp)

        # The PDF file is encrypted.
        if pdf_input.getIsEncrypted():
            print('该PDF文件被加密了.')
            # Try to decrypt with an empty password. decrypt() returns 0 for
            # a wrong password instead of raising, so check the result too.
            try:
                decrypted = pdf_input.decrypt('')
            except Exception:
                decrypted = 0
            if not decrypted:
                print('尝试用空密码解密失败.')
                return False
            print('用空密码解密成功.')

        # Number of pages in the input document.
        page_num = pdf_input.getNumPages()

        with open(pdf_file_mark, 'rb') as mfp:
            pdf_output = PdfFileWriter()
            # Read the watermark PDF.
            pdf_watermark = PdfFileReader(mfp)
            watermark_page = pdf_watermark.getPage(0)

            # Stamp the watermark onto every page.
            for i in range(page_num):
                page = pdf_input.getPage(i)
                page.mergePage(watermark_page)
                page.compressContentStreams()  # compress the content
                pdf_output.addPage(page)

            with open(pdf_file_out, 'wb') as wfp:
                pdf_output.write(wfp)
def downloadpdf(url):
    """Download *url* and return it wrapped in a PDF reader.

    Returns:
        The reader object for the downloaded document (or the result of
        ``Scratcher.handlepdf`` when decrypting with an empty password
        raises), ``None`` when the response is ``text/html``, or the
        integer ``2`` when the body cannot be parsed as a PDF even after
        stripping NUL padding.

    Exits the process with a message if the connection fails or the
    response body cannot be buffered.
    """
    try:
        # NOTE(review): verify=False disables TLS certificate checks; kept
        # because callers may depend on it.
        request = requests.get(url, verify=False)
        if request.headers.get('Content-Type') == 'text/html':
            return None
    except requests.exceptions.ConnectionError:
        sys.exit(
            "\nThere was an error when trying to connect to the domain. Please confirm if the domain is "
            "correctly written.\n")

    try:
        objbyte = BytesIO(request.content)
    except Exception as e:
        Scratcher.log(url, e)
        sys.exit(
            "\nThere was an error when trying to convert the content of the response.Please verify the logs to"
            " see the raised error.\n")

    try:
        pdf = PdfFileReader(objbyte)
    except utils.PdfReadError as e:
        Scratcher.log(url, e)
        # Retry once with leading/trailing NUL bytes removed.
        obje = BytesIO(request.content.strip(b'\x00'))
        try:
            pdf = PdfFileReader(obje)
        except utils.PdfReadError:
            return 2

    if pdf.getIsEncrypted():
        try:
            pdf.decrypt('')
        except Exception:
            # Narrowed from a bare except so Ctrl-C / SystemExit propagate.
            # NOTE(review): decrypt() reports a wrong password by returning
            # 0 rather than raising, so this fallback only runs on errors.
            pdf = Scratcher.handlepdf(request.content)
    return pdf
def _get_pdf_document_info(fp):
    """Return the document-information dictionary of the PDF at *fp*.

    Encrypted documents are tried with an empty password. ``None`` is
    returned when that decryption is rejected or when a
    ``FileNotFoundError`` is raised while reading.
    """
    try:
        reader = PdfFileReader(fp)
        # Some PDFs are "encrypted" with an empty password, so give that a
        # shot before giving up.
        locked = reader.getIsEncrypted() and not reader.decrypt("")
        return None if locked else reader.getDocumentInfo()
    except FileNotFoundError:
        return None
def add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out):
    """Open the input PDF (non-strict) and make sure it can be read.

    As written, this block only opens ``pdf_file_in`` and, if it is
    encrypted, tries to decrypt it with an empty password;
    ``pdf_file_mark`` and ``pdf_file_out`` are not used here.

    Returns:
        ``False`` when the empty-password decryption fails; ``None``
        otherwise.
    """
    # open() and print(...) behave the same on Python 2 and also run on
    # Python 3, unlike file() and the print statement.
    with open(pdf_file_in, 'rb') as input_stream:
        pdf_input = PdfFileReader(input_stream, strict=False)

        # The PDF file is encrypted.
        if pdf_input.getIsEncrypted():
            print('该PDF文件被加密了.')
            # Try to decrypt with an empty password. decrypt() returns 0 for
            # a wrong password instead of raising, so check the result too.
            try:
                decrypted = pdf_input.decrypt('')
            except Exception:
                decrypted = 0
            if not decrypted:
                return False
            print('用空密码解密成功.')
def add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out):
    """Open the input PDF and make sure it can be read.

    As written, this block only opens ``pdf_file_in`` and, if it is
    encrypted, tries to decrypt it with an empty password;
    ``pdf_file_mark`` and ``pdf_file_out`` are not used here.

    Returns:
        ``False`` when the empty-password decryption fails; ``None``
        otherwise.
    """
    # open() and print(...) behave the same on Python 2 and also run on
    # Python 3, unlike file() and the print statement.
    with open(pdf_file_in, 'rb') as input_stream:
        pdf_input = PdfFileReader(input_stream)

        # The PDF file is encrypted.
        if pdf_input.getIsEncrypted():
            print('该PDF文件被加密了.')
            # Try to decrypt with an empty password. decrypt() returns 0 for
            # a wrong password instead of raising, so check the result too.
            try:
                decrypted = pdf_input.decrypt('')
            except Exception:
                decrypted = 0
            if not decrypted:
                print('尝试用空密码解密失败.')
                return False
            print('用空密码解密成功.')
def __Get_info(file_path, plain_log, csv_log, analyzed_files, total_files):
    """Read one PDF, print its metadata and optionally log it.

    Args:
        - file_path: (string) Absolute file path.
        - plain_log: (None | string) Log file in plain text.
        - csv_log: (None | string) Log file in csv format.
        - analyzed_files: running count of analysed files (see note below).
        - total_files: not used by this function.

    Any exception raised while reading or logging is reported through
    ``__Print_error`` and otherwise swallowed.
    """
    file_name = os.path.basename(file_path)
    file_size = os.path.getsize(file_path)
    encrypted = 'No'
    try:
        # Try to open pdf files that are not password protected and pdf
        # files encrypted with a blank password.
        with open(file_path, 'rb') as pdf_stream:
            pdf_file = PdfFileReader(pdf_stream)
            if pdf_file.getIsEncrypted() is True:
                dec_res = pdf_file.decrypt('')
                if dec_res == 1:
                    encrypted = 'Yes'

            # Get and parse metadata while the file is still open.
            doc_info = pdf_file.getDocumentInfo()
            title, author, creator, subject, producer, c_date, m_date \
                = __Parse_doc_info(doc_info)
            num_pages = pdf_file.getNumPages()

        # Group info
        pdf_meta = pdf_metadata(file_name, title, author, creator, subject,
                                producer, c_date, m_date, encrypted,
                                num_pages, file_size)
        __Print_metadata(pdf_meta)
        if plain_log:
            Log(file_name, pdf_meta, plain_log, 'txt')
        if csv_log:
            # Use the csv_log parameter; the previous name (f_log_csv) is
            # not defined in this function.
            Log(file_name, pdf_meta, csv_log, 'csv')
        # NOTE(review): this only rebinds the local name; the caller's
        # counter is not updated and nothing is returned.
        analyzed_files = analyzed_files + 1
    except Exception as e:
        error = file_name + ' ' + str(e)
        __Print_error(error)
def merge_files(local_pdfs):
    """Merge the uploaded PDFs named in *local_pdfs* into one file.

    Each name is resolved inside the configured upload folder, appended to
    the merged document, and the source files are removed once the merged
    file has been written successfully.

    Returns:
        The full path of the merged PDF.
    """
    upload_folder = config().get(section='server', option='upload_folder')
    # time.clock was removed in Python 3.8; fall back to time.time there.
    clock = getattr(time, 'clock', None) or time.time
    name = 'merge_{0}_output.pdf'.format(str(clock())[2:])
    merged_export = PdfFileMerger()

    opened = []  # (path, handle) pairs kept open until the merge is written
    try:
        for pdfile in local_pdfs:
            filepath = getpath(pdfile, upload_folder)
            handle = open(filepath, 'rb')
            opened.append((filepath, handle))
            file_bin = PdfFileReader(handle)
            if file_bin.getIsEncrypted():
                file_bin.decrypt('')
            merged_export.append(fileobj=file_bin)

        full_ouput = getpath(name, upload_folder)
        with open(full_ouput, 'wb') as output:
            merged_export.write(output)
    finally:
        for _path, handle in opened:
            handle.close()

    # Delete the sources only after the merge succeeded and the handles are
    # closed (deleting an open file fails on Windows).
    for filepath in set(path for path, _handle in opened):
        os.remove(filepath)
    return full_ouput
def split_pdf(pdf_filename, temp_dir):
    """Split the PDF into n PDFs (one for each page).

    Page ``i`` (zero-based) is written to
    ``<temp_dir>/<basename of pdf_filename>-p<i>.pdf``.

    Returns:
        The list of written file names, in page order.
    """
    filenames = []
    base_name = os.path.basename(pdf_filename)
    # Keep the source open only for the duration of the split.
    with open(pdf_filename, "rb") as source:
        inputpdf = PdfFileReader(source)
        if inputpdf.getIsEncrypted():
            inputpdf.decrypt('')
        for i in range(inputpdf.numPages):
            output = PdfFileWriter()
            output.addPage(inputpdf.getPage(i))
            filename = "{0}/{1}-p{2}.pdf".format(temp_dir, base_name, i)
            with open(filename, "wb") as outputStream:
                output.write(outputStream)
            filenames.append(filename)
    return filenames
def split_pdf(pdf_filename):
    """Split the PDF into one single-page PDF per page.

    Page ``i`` (zero-based) is written next to the source file as
    ``<directory>/<basename>-p<i>.pdf``; ``.`` is used when the path has no
    directory part.

    Returns:
        The list of written file names, in page order.
    """
    filenames = []
    directory = os.path.dirname(pdf_filename)
    if directory == '':
        directory = '.'
    base_name = os.path.basename(pdf_filename)
    # Keep the source open only for the duration of the split.
    with open(pdf_filename, "rb") as source:
        inputpdf = PdfFileReader(source)
        if inputpdf.getIsEncrypted():
            inputpdf.decrypt('')
        for i in range(inputpdf.numPages):
            output = PdfFileWriter()
            output.addPage(inputpdf.getPage(i))
            filename = "{0}/{1}-p{2}.pdf".format(directory, base_name, i)
            with open(filename, "wb") as outputStream:
                output.write(outputStream)
            filenames.append(filename)
    return filenames
def extract_pdf_metadata(filepath: Path):
    """Collect file size, encryption flag, page count and Info entries.

    Returns:
        ``(constants.RETURNCODE_OK, metadata)`` on success, where
        ``metadata`` is a dict, or ``(error_message, None)`` when anything
        raises while reading the file.
    """
    metadata = {}
    try:
        metadata['pdf_file_size_bytes'] = filepath.stat().st_size
        with filepath.open('rb') as handle:
            pdf = PdfFileReader(handle)
            metadata['pdf_is_encrypted'] = pdf.getIsEncrypted()
            metadata['pdf_number_of_pages'] = pdf.getNumPages()
            # A PDF without an Info dictionary yields None; treat that as
            # "no extra entries" rather than as a read failure.
            document_info = pdf.getDocumentInfo() or {}
            for key, val in document_info.items():
                metadata[key] = str(val)
    except Exception as e:
        error_msg = f'Failed to read metadata of {filepath}, error: {e}'
        logger.error(error_msg)
        return error_msg, None
    return constants.RETURNCODE_OK, metadata
def main():
    """Try every password from a word list against an encrypted PDF.

    Command-line options: ``-F`` target PDF file, ``-P`` password file (one
    candidate per line). Prints the usage string and exits when either is
    missing.

    Returns:
        ``True`` as soon as ``crackPdf`` reports success; ``None`` otherwise.
    """
    parser = OptionParser('usage %prog -F'+'<target_File>'+'-P <password_File>')
    parser.add_option("-F", dest="targetFile", type='string', help="target PDF File")
    parser.add_option("-P", dest="PasswordFile", type='string', help="Password File")
    (options, args) = parser.parse_args()
    if options.targetFile is None or options.PasswordFile is None:
        print(parser.usage)
        exit(0)

    pdfFile = options.targetFile
    PasswordFile = options.PasswordFile
    with open(pdfFile, 'rb') as pdf_stream:
        pdfFileReader = PdfFileReader(pdf_stream)
        if pdfFileReader.getIsEncrypted():
            with open(PasswordFile, 'r') as fp:
                for line in fp:
                    # Strip both line-ending characters in one call: the
                    # chained strip('\r').strip('\n') left the '\r' of
                    # CRLF-terminated lines on the candidate password.
                    passWord = line.strip('\r\n')
                    if crackPdf(pdfFile, passWord, pdfFileReader):
                        return True
        else:
            print('[*] PDF File '+pdfFile+' no encrypted!')
def print_pdf(file_full_path):
    """Print the metadata of the PDF at *file_full_path* in colour.

    Prints every entry of the document-information dictionary (key without
    its leading character), then the page count and the encryption flag.
    """
    # Header with file path
    cprint("[+] Metadata for file: %s " % (file_full_path), "green", attrs=['bold'])

    # open() replaces the Python-2-only file(); the with block also closes
    # the handle, which was previously leaked.
    with open(file_full_path, 'rb') as pdf_stream:
        pdf_file = PdfFileReader(pdf_stream)
        # Dictionary with the document information
        pdf_info = pdf_file.getDocumentInfo()

        # Print metadata
        if pdf_info:
            for metaItem in pdf_info:
                try:
                    cprint('\t ' + metaItem[1:] + ': ', 'cyan', end="")
                    cprint(pdf_info[metaItem])
                except TypeError:
                    cprint(
                        '\t ' + metaItem[1:] + ': ' + 'Error - Item not redeable',
                        'red')
        else:
            cprint('Not data found', 'red')

        # Print other info
        cprint("\t Number of pages: %s" % pdf_file.getNumPages(), 'cyan')
        cprint("\t Is Encripted: %s" % pdf_file.getIsEncrypted(), 'cyan')
def process_file(self, curr_file):
    """Extract document metadata from *curr_file* into ``self.container``.

    A path containing ``.pdf`` is read with ``PdfFileReader``. Anything
    else is handed to the external ``extract`` command and its output is
    parsed, so ``extract`` must be installed for non-PDF files.

    Nothing is returned. On success one row
    ``[name, created, author, producer, modded, last_saved]`` is appended
    to ``self.container``; errors end processing of the file quietly.
    """
    author = '-'
    date = '-'
    generator = '-'
    created = '-'
    producer = '-'
    modded = '-'
    last_saved = '-'

    if ".pdf" in curr_file:
        try:
            # Keep the file open while the info dictionary is read.
            with open(curr_file, 'rb') as pdf_stream:
                pdf_file = PdfFileReader(pdf_stream)
                if pdf_file.getIsEncrypted():
                    pdf_file.decrypt('')
                doc_info = pdf_file.getDocumentInfo()
                if not doc_info:
                    return

                # Look through the dictionary for the fields of interest.
                if "/CreationDate" in doc_info:
                    data = doc_info["/CreationDate"].strip("D:|'")
                    year = data[0:4]
                    date = data[4:6] + "/" + data[6:8]
                    created_time = data[8:10] + ":" + data[10:12]
                    created_time = time.strftime(
                        "%I:%M %p", time.strptime(created_time, "%H:%M"))
                    created = date + "/" + year + " " + created_time
                if "/Author" in doc_info:
                    author = doc_info["/Author"] + " "
                    if len(author) <= 1:
                        author = "-"
                if "/Producer" in doc_info:
                    # Remove the literal "(Windows)" marker. str.strip()
                    # treats its argument as a set of characters and would
                    # eat letters such as a leading "W" or a trailing "s".
                    producer = doc_info["/Producer"].replace("(Windows)", "")
                    producer = re.sub(r'[^\w]', ' ', producer)
                    if len(producer) == 0:
                        producer = "-"
                    # Collapse runs of spaces into a single space.
                    producer = re.sub(r' {2,}', ' ', producer)
                if "/ModDate" in doc_info:
                    data = doc_info["/ModDate"].strip("D:|'")
                    year = data[0:4]
                    date = data[4:6] + "/" + data[6:8]
                    modded_time = data[8:10] + ":" + data[10:12]
                    modded_time = time.strftime(
                        "%I:%M %p", time.strptime(modded_time, "%H:%M"))
                    modded = date + "/" + year + " " + modded_time

            # Strips '/' off filename (if it includes directory name)
            if "/" in curr_file:
                curr_file = curr_file[curr_file.rfind("/") + 1:]
            if "\\" in curr_file:
                curr_file = curr_file.replace("\\", "")
            if len(producer) > 30:
                producer = producer[:20] + " [snipped] "
            if len(author) > 20:
                author = author[:20] + " [snipped] "

            # Appends each piece of information
            self.container.append(
                [curr_file, created, author, producer, modded, last_saved])
        except Exception:
            return
    else:
        not_installed_msg = (
            "[*] We found an Office document, but 'extract' is not installed "
            "on this system to get the metadata. "
            "It is downloaded for later analysis.")
        try:
            try:
                extract_status = subprocess.getstatusoutput("extract")
            except Exception:
                # Without a status there is nothing more to do for this file.
                print(yellow(not_installed_msg))
                return

            if extract_status[0] == 0:
                # Pass the path as an argument list: no shell is involved,
                # so the file name needs no manual escaping and cannot
                # inject shell syntax.
                output = subprocess.check_output(
                    ["extract", "-V", curr_file]).decode('utf-8').split('\n')
                if "extract: not found" in output[0]:
                    print(red("[!] PyFOCA requires the 'extract' command."))
                    print(
                        red("L.. Please install extract by typing "
                            "'apt-get install extract' in terminal."))
                for i in output:
                    if "creator" in i:
                        author = i[i.find("-") + 2:]
                        author = re.sub(r'\W', ' ', author)
                        # Collapse runs of spaces and drop leading ones.
                        author = re.sub(r' {2,}', ' ', author).lstrip(' ')
                    elif "date" in i and "creation" not in i:
                        year = i[i.find('-') + 2:(i.find('-') + 2) + 4]
                        date = i[i.find(year) + 5:(i.find(year) + 5) +
                                 5].replace("-", "/")
                        modded_time = i[i.find(":") - 2:i.rfind(":") - 1]
                        modded_time = time.strftime(
                            "%I:%M %p", time.strptime(modded_time, "%H:%M"))
                        modded = date + "/" + year + " " + modded_time
                    elif "generator" in i:
                        producer = i[i.find('-') + 2:]
                    elif "creation" in i:
                        year = i[i.find('-') + 2:(i.find('-') + 2) + 4]
                        date = i[i.find(year) + 5:(i.find(year) + 5) +
                                 5].replace("-", "/")
                        created_time = i[i.find(":") - 2:i.rfind(":") - 1]
                        created_time = time.strftime(
                            "%I:%M %p", time.strptime(created_time, "%H:%M"))
                        created = date + "/" + year + " " + created_time
                    elif "last saved" in i:
                        last_saved = i[i.find('-') + 2:]

                if "/" in curr_file:
                    curr_file = curr_file[curr_file.rfind("/") + 1:]
                if "\\" in curr_file:
                    curr_file = curr_file.replace("\\", "")

                # Only record the file when at least one field was found.
                if author != "-" or date != "-" or generator != "-" or \
                        created != "-" or producer != "-" or \
                        modded != "-" or last_saved != "-":
                    self.container.append([
                        " | " + curr_file, created, author, producer, modded,
                        last_saved
                    ])
            else:
                print(yellow(not_installed_msg))
        except Exception as error:
            if "command not found" in str(error):
                print(red("[!] PyFOCA requires the 'extract' command."))
                print(
                    red("L.. Please install on Linux extract by typing "
                        "'apt-get install extract' in terminal."))
            return
def info(pdf):
    """Collect metadata for the PDF file at path *pdf*.

    Reads the page count, the document-information dictionary and the
    ``dc_*`` XMP attributes, then normalises a few well-known keys
    (``identifier``/``isbn``, ``language``, ``date``, ``author``) and adds
    ``textsize`` from the extracted text. Parse failures are logged at
    debug level and leave the affected fields out.

    Returns:
        dict mapping metadata keys to values.
    """
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                pdfreader.decrypt('')
            doc_info = pdfreader.getDocumentInfo()
            if doc_info:
                for key in doc_info:
                    if doc_info[key]:
                        try:
                            value = doc_info[key]
                            if len(value) == 1:
                                value = value[0]
                            if isinstance(value, bytes):
                                value = value.decode('utf-16')
                            data[key[1:].lower()] = value
                        except Exception:
                            # Best effort: skip entries that cannot be read.
                            logger.debug('skipping info field %s of %s',
                                         key, pdf, exc_info=1)

            xmp = pdfreader.getXmpMetadata()
            if xmp:
                for key in dir(xmp):
                    if key.startswith('dc_'):
                        value = getattr(xmp, key)
                        if isinstance(value, dict) and 'x-default' in value:
                            value = value['x-default']
                        elif isinstance(value, list):
                            value = [
                                v.strip() if isinstance(v, str) else v
                                for v in value if v
                            ]
                            value = [
                                v.strftime('%Y-%m-%d')
                                if isinstance(v, datetime) else v
                                for v in value
                            ]
                            if len(value) == 1:
                                value = value[0]
                        _key = key[3:]
                        # Info-dictionary values win over XMP values.
                        if value and _key not in data:
                            data[_key] = value
        except Exception:
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)

    # (A disabled `pdfinfo`-subprocess fallback used to be sketched here.)

    if 'identifier' in data:
        value = normalize_isbn(data['identifier'])
        if stdnum.isbn.is_valid(value):
            data['isbn'] = [value]
            del data['identifier']

    for key, value in data.items():
        if isinstance(value, dict):
            value = ' '.join(list(value.values()))
            data[key] = value.strip()

    for key in list(data):
        if data[key] in ('Unknown', ):
            del data[key]
            # The key is gone; reading it again below would raise KeyError
            # (this happened when the language was 'Unknown').
            continue
        if key == 'language':
            data[key] = get_language(data[key])

    text = extract_text(pdf)
    data['textsize'] = len(text)
    if settings.server['extract_text']:
        if 'isbn' not in data:
            isbn = extract_isbn(text)
            if isbn:
                data['isbn'] = [isbn]
    if 'isbn' in data and isinstance(data['isbn'], str):
        data['isbn'] = [data['isbn']]

    # Turn YYYYMMDD into YYYY-MM-DD.
    if 'date' in data and len(data['date']) == 8 and data['date'].isdigit():
        d = data['date']
        data['date'] = '%s-%s-%s' % (d[:4], d[4:6], d[6:])
    if 'author' in data and isinstance(data['author'], str):
        data['author'] = data['author'].split(', ')
    return data
def process_file(self,curr_file):
    """Process the provided file and record its metadata.

    A path containing ``.pdf`` is read with ``PdfFileReader``. Any other
    file is treated as an Office Open XML document: it is opened as a zip
    archive and ``docProps/core.xml`` / ``docProps/app.xml`` are parsed.

    On success one row
    ``[name, created, author, producer, modded, last_saved]`` is appended
    to ``self.container``. Nothing is returned.

    Parameters:
    curr_file   The filepath of the file to be processed
    """
    date = "None"
    modded = "None"
    author = "None"
    created = "None"
    producer = "None"
    last_saved = "None"

    # Process the current file as a PDF
    if ".pdf" in curr_file:
        try:
            # Keep the file open while the info dictionary is read.
            with open(curr_file,"rb") as pdf_stream:
                pdf_file = PdfFileReader(pdf_stream)
                if pdf_file.getIsEncrypted():
                    pdf_file.decrypt('')
                # getDocumentInfo() returns something like:
                # {'/Author': 'Chris Maddalena',
                #  '/CreationDate': "D:20131014182824-04'00'",
                #  '/Creator': 'Microsoft® Excel® 2013',
                #  '/ModDate': "D:20131015141200-04'00'",
                #  '/Producer': 'Microsoft® Excel® 2013'}
                doc_info = pdf_file.getDocumentInfo()
                # If there is no info, just return
                if not doc_info:
                    return

                # Parse the document info
                if "/CreationDate" in doc_info:
                    data = doc_info["/CreationDate"].strip("D:|'")
                    year = data[0:4]
                    date = data[4:6] + "/" + data[6:8]
                    created_time = data[8:10] + ":" + data[10:12]
                    created_time = time.strftime("%I:%M %p",time.strptime(created_time,"%H:%M"))
                    created = date + "/" + year + " " + created_time
                if "/Author" in doc_info:
                    author = doc_info["/Author"]
                if "/Producer" in doc_info:
                    # Remove the literal "(Windows)" marker. str.strip()
                    # treats its argument as a set of characters and would
                    # eat letters such as a leading "W" or a trailing "s".
                    producer = doc_info["/Producer"].replace("(Windows)","")
                    producer = re.sub(r'[^\w]',' ',producer)
                    # Collapse runs of spaces into a single space.
                    producer = re.sub(r' {2,}',' ',producer)
                if "/ModDate" in doc_info:
                    data = doc_info["/ModDate"].strip("D:|'")
                    year = data[0:4]
                    date = data[4:6] + "/" + data[6:8]
                    modded_time = data[8:10] + ":" + data[10:12]
                    modded_time = time.strftime("%I:%M %p",time.strptime(modded_time,"%H:%M"))
                    modded = date + "/" + year + " " + modded_time

            # Strips '/' off filename (if it includes directory name)
            if "/" in curr_file:
                curr_file = curr_file[curr_file.rfind("/")+1:]
            if "\\" in curr_file:
                curr_file = curr_file.replace("\\","")
            # Add the document info to the container
            self.container.append([curr_file,created,author,producer,modded,last_saved])
        except Exception:
            return
    # Not a PDF, so treat the current file as an Office doc
    else:
        # The path goes straight to zipfile (no shell), so it must not be
        # backslash-escaped; escaping made names with spaces unopenable.
        try:
            # Unzip the document to get the core.xml and app.xml files
            with zipfile.ZipFile(curr_file) as unzipped:
                doc_xml = lxml.etree.fromstring(unzipped.read("docProps/core.xml"))
                app_xml = lxml.etree.fromstring(unzipped.read("docProps/app.xml"))

            # Namespaces for core.xml
            dc_ns = {"dc":"http://purl.org/dc/elements/1.1/"}
            cp_ns = {"cp":"http://schemas.openxmlformats.org/package/2006/metadata/core-properties"}
            dcterms_ns = {"dcterms":"http://purl.org/dc/terms/"}

            author = doc_xml.xpath('//dc:creator',namespaces=dc_ns)[0].text
            modded = doc_xml.xpath('//cp:lastModifiedBy',namespaces=cp_ns)[0].text
            created = doc_xml.xpath('//dcterms:created',namespaces=dcterms_ns)[0].text
            last_saved = doc_xml.xpath('//dcterms:modified',namespaces=dcterms_ns)[0].text

            # Convert the created time to a prettier format
            created_date = created.split("T")[0]
            created_time = created.split("T")[1].strip("Z")
            modded_time = time.strftime("%I:%M %p",time.strptime(created_time,"%H:%M:%S"))
            created = created_date + " " + modded_time

            # Determine the Office application and version of the document
            for child in app_xml:
                if 'AppVersion' in child.tag:
                    office_version = child.text
                    # Stay empty for unrecognised versions instead of
                    # failing the whole file on an unbound name.
                    version = ""
                    if "16." in office_version:
                        version = "2016"
                    elif "15." in office_version:
                        version = "2013"
                    elif "14." in office_version:
                        version = "2010"
                    elif "12." in office_version:
                        version = "2007"
                    if ".xls" in curr_file:
                        producer = ("Microsoft Excel " + version).rstrip()
                    elif ".doc" in curr_file:
                        producer = ("Microsoft Word " + version).rstrip()
                    elif ".ppt" in curr_file:
                        producer = ("Microsoft PowerPoint " + version).rstrip()

            # Remove any slashes in the filename
            if "/" in curr_file:
                curr_file = curr_file[curr_file.rfind("/")+1:]
            if "\\" in curr_file:
                curr_file = curr_file.replace("\\","")
            # Add the results to the container
            self.container.append([curr_file,created,author,producer,modded,last_saved])
        except Exception as error:
            click.secho("[!] Failed to extract metadata from {}!".format(curr_file),fg="red")
            click.secho("L.. Details: {}".format(error),fg="red")
def processFile(self, curr_file):
    """Extract metadata from a PDF and record it in ``self.container``.

    Only paths containing ``.pdf`` are handled; anything else is ignored.
    On success one row
    ``[" | " + name, created, author, producer, modded, last_saved]`` is
    appended to ``self.container``. Errors end processing quietly and
    nothing is returned.
    """
    author = '-'
    date = '-'
    generator = '-'
    created = '-'
    producer = '-'
    modded = '-'
    last_saved = '-'

    if ".pdf" in curr_file:
        try:
            # open() works on Python 2 and 3 (file() is Python 2 only); the
            # with block keeps the file open while the info is read.
            with open(curr_file, 'rb') as pdf_stream:
                pdfFile = PdfFileReader(pdf_stream)
                if pdfFile.getIsEncrypted():
                    pdfFile.decrypt('')
                docInfo = pdfFile.getDocumentInfo()
                if not docInfo:
                    return

                # Look through the dictionary for the fields of interest.
                if "/CreationDate" in docInfo:
                    data = docInfo["/CreationDate"].strip("D:|'")
                    year = data[0:4]
                    date = data[4:6] + "/" + data[6:8]
                    created_time = data[8:10] + ":" + data[10:12]
                    created_time = time.strftime(
                        "%I:%M %p", time.strptime(created_time, "%H:%M"))
                    created = date + "/" + year + " " + created_time
                if "/Author" in docInfo:
                    author = docInfo["/Author"] + " "
                    if len(author) <= 1:
                        author = "-"
                if "/Producer" in docInfo:
                    # Remove the literal "(Windows)" marker. str.strip()
                    # treats its argument as a set of characters and would
                    # eat letters such as a leading "W" or a trailing "s".
                    producer = docInfo["/Producer"].replace("(Windows)", "")
                    producer = re.sub(r'[^\w]', ' ', producer)
                    if len(producer) == 0:
                        producer = "-"
                    # Collapse runs of spaces into a single space.
                    producer = re.sub(r' {2,}', ' ', producer)
                if "/ModDate" in docInfo:
                    data = docInfo["/ModDate"].strip("D:|'")
                    year = data[0:4]
                    date = data[4:6] + "/" + data[6:8]
                    modded_time = data[8:10] + ":" + data[10:12]
                    modded_time = time.strftime(
                        "%I:%M %p", time.strptime(modded_time, "%H:%M"))
                    modded = date + "/" + year + " " + modded_time

            # Strips '/' off file name (if it includes directory name)
            if "/" in curr_file:
                curr_file = curr_file[curr_file.rfind("/") + 1:]
            if "\\" in curr_file:
                curr_file = curr_file.replace("\\", "")

            # Trim information if it's too long
            if len(curr_file) > 15:  # trims file name
                curr_file = curr_file[:15] + "..." + curr_file[-13:]
            if len(producer) > 30:
                producer = producer[:20] + " [snipped] "
            if len(author) > 20:
                author = author[:20] + " [snipped] "

            # Appends each piece of information.
            self.container.append([
                " | " + curr_file, created, author, producer, modded,
                last_saved
            ])
        except Exception:
            return
def process_file(self, curr_file):
    """Process the provided file and record its metadata.

    A path containing ``.pdf`` is read with ``PdfFileReader``. Any other
    file is treated as an Office Open XML document: it is opened as a zip
    archive and ``docProps/core.xml`` / ``docProps/app.xml`` are parsed.

    On success one row
    ``[name, created, author, producer, modded, last_saved]`` is appended
    to ``self.container``. Nothing is returned.

    Parameters:
    curr_file   The filepath of the file to be processed
    """
    date = "None"
    modded = "None"
    author = "None"
    created = "None"
    producer = "None"
    last_saved = "None"

    # Process the current file as a PDF
    if ".pdf" in curr_file:
        try:
            # Keep the file open while the info dictionary is read.
            with open(curr_file, "rb") as pdf_stream:
                pdf_file = PdfFileReader(pdf_stream)
                if pdf_file.getIsEncrypted():
                    pdf_file.decrypt('')
                # getDocumentInfo() returns something like:
                # {'/Author': 'Chris Maddalena',
                #  '/CreationDate': "D:20131014182824-04'00'",
                #  '/Creator': 'Microsoft® Excel® 2013',
                #  '/ModDate': "D:20131015141200-04'00'",
                #  '/Producer': 'Microsoft® Excel® 2013'}
                doc_info = pdf_file.getDocumentInfo()
                # If there is no info, just return
                if not doc_info:
                    return

                # Parse the document info
                if "/CreationDate" in doc_info:
                    data = doc_info["/CreationDate"].strip("D:|'")
                    year = data[0:4]
                    date = data[4:6] + "/" + data[6:8]
                    created_time = data[8:10] + ":" + data[10:12]
                    created_time = time.strftime(
                        "%I:%M %p", time.strptime(created_time, "%H:%M"))
                    created = date + "/" + year + " " + created_time
                if "/Author" in doc_info:
                    author = doc_info["/Author"]
                if "/Producer" in doc_info:
                    # Remove the literal "(Windows)" marker. str.strip()
                    # treats its argument as a set of characters and would
                    # eat letters such as a leading "W" or a trailing "s".
                    producer = doc_info["/Producer"].replace("(Windows)", "")
                    producer = re.sub(r'[^\w]', ' ', producer)
                    # Collapse runs of spaces into a single space.
                    producer = re.sub(r' {2,}', ' ', producer)
                if "/ModDate" in doc_info:
                    data = doc_info["/ModDate"].strip("D:|'")
                    year = data[0:4]
                    date = data[4:6] + "/" + data[6:8]
                    modded_time = data[8:10] + ":" + data[10:12]
                    modded_time = time.strftime(
                        "%I:%M %p", time.strptime(modded_time, "%H:%M"))
                    modded = date + "/" + year + " " + modded_time

            # Strips '/' off filename (if it includes directory name)
            if "/" in curr_file:
                curr_file = curr_file[curr_file.rfind("/") + 1:]
            if "\\" in curr_file:
                curr_file = curr_file.replace("\\", "")
            # Add the document info to the container
            self.container.append(
                [curr_file, created, author, producer, modded, last_saved])
        except Exception:
            return
    # Not a PDF, so treat the current file as an Office doc
    else:
        # The path goes straight to zipfile (no shell), so it must not be
        # backslash-escaped; escaping made names with spaces unopenable.
        try:
            # Unzip the document to get the core.xml and app.xml files
            with zipfile.ZipFile(curr_file) as unzipped:
                doc_xml = lxml.etree.fromstring(
                    unzipped.read("docProps/core.xml"))
                app_xml = lxml.etree.fromstring(
                    unzipped.read("docProps/app.xml"))

            # Namespaces for core.xml
            dc_ns = {"dc": "http://purl.org/dc/elements/1.1/"}
            cp_ns = {
                "cp":
                "http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
            }
            dcterms_ns = {"dcterms": "http://purl.org/dc/terms/"}

            author = doc_xml.xpath('//dc:creator', namespaces=dc_ns)[0].text
            modded = doc_xml.xpath('//cp:lastModifiedBy',
                                   namespaces=cp_ns)[0].text
            created = doc_xml.xpath('//dcterms:created',
                                    namespaces=dcterms_ns)[0].text
            last_saved = doc_xml.xpath('//dcterms:modified',
                                       namespaces=dcterms_ns)[0].text

            # Convert the created time to a prettier format
            created_date = created.split("T")[0]
            created_time = created.split("T")[1].strip("Z")
            modded_time = time.strftime(
                "%I:%M %p", time.strptime(created_time, "%H:%M:%S"))
            created = created_date + " " + modded_time

            # Determine the Office application and version of the document
            for child in app_xml:
                if 'AppVersion' in child.tag:
                    office_version = child.text
                    # Stay empty for unrecognised versions instead of
                    # failing the whole file on an unbound name.
                    version = ""
                    if "16." in office_version:
                        version = "2016"
                    elif "15." in office_version:
                        version = "2013"
                    elif "14." in office_version:
                        version = "2010"
                    elif "12." in office_version:
                        version = "2007"
                    if ".xls" in curr_file:
                        producer = ("Microsoft Excel " + version).rstrip()
                    elif ".doc" in curr_file:
                        producer = ("Microsoft Word " + version).rstrip()
                    elif ".ppt" in curr_file:
                        producer = ("Microsoft PowerPoint " +
                                    version).rstrip()

            # Remove any slashes in the filename
            if "/" in curr_file:
                curr_file = curr_file[curr_file.rfind("/") + 1:]
            if "\\" in curr_file:
                curr_file = curr_file.replace("\\", "")
            # Add the results to the container
            self.container.append(
                [curr_file, created, author, producer, modded, last_saved])
        except Exception as error:
            click.secho("[!] Failed to extract metadata from {}!".format(
                curr_file), fg="red")
            click.secho("L.. Details: {}".format(error), fg="red")
output.close() print("Your hot 'n' ready PDF awaits.") print( "You'll find it this directory with '-reordered' appended to the filename." ) if __name__ == "__main__": pdf_path = input( 'Enter the path to your PDF (e.g. "path/to/your/pdf.pdf"): ').strip() if pdf_path[-1] == '/': pdf_path = pdf_path[0:-1] regex = r"(?P<filename>[\w\-\_]+)(\.|$)" title = re.search(regex, pdf_path).groupdict()['filename'] reader = PdfFileReader(pdf_path) if reader.getIsEncrypted(): try: reader.decrypt('') except: print( "This file was detected as encrypted. Attempted to decrypt with empty password, but failed." ) print( "If this file is not encrypted with a password, you can try decrypting with QPDF." ) print( "Please make sure you have installed QPDF before saying yes to the following prompt..." ) tryWithQPDF = input( "Would you like to try decrypting with QPDF?: ") if re.match(r"(y|Y|Yes|yes)", tryWithQPDF):
#!/usr/bin/python
"""Print basic information about ``file.pdf`` using PyPDF2."""
from PyPDF2 import PdfFileReader

pdf_document = "file.pdf"

# Everything is read while the file is open, because the reader pulls data
# from the stream as it is requested.
with open(pdf_document, "rb") as filehandle:
    pdf = PdfFileReader(filehandle)

    # Document-level metadata and page count.
    info = pdf.getDocumentInfo()
    pages = pdf.getNumPages()
    print('file information: ', info)
    print("number of pages: %d" % pages)

    # Reader attributes, then the first page and its text.
    page1 = pdf.getPage(0)
    print(pdf.getIsEncrypted())
    print(pdf.pageMode)
    print(pdf.getFields())
    print(pdf.stream)
    print(pdf.flattenedPages)
    print(page1)
    print(page1.extractText())
def export_to_file(self, file_out, only_selected=False):
    """Export the pages of the model to the PDF file *file_out*.

    Every document in ``self.pdfqueue`` is opened; each row of
    ``self.model`` then contributes one page, rotated and cropped as the
    row specifies. With ``only_selected`` set, rows that are not part of
    the icon view's current selection are skipped.

    Raises:
        Exception: if a queued document is encrypted and cannot be opened
            with an empty user password.
    """
    selection = self.iconview.get_selected_items()
    pdf_output = PdfFileWriter()
    pdf_input = []
    streams = []  # input handles, kept open until the output is written

    try:
        for pdfdoc in self.pdfqueue:
            stream = open(pdfdoc.copyname, 'rb')
            streams.append(stream)
            pdfdoc_inp = PdfFileReader(stream)
            if pdfdoc_inp.getIsEncrypted():
                try:  # Workaround for lp:#355479
                    stat = pdfdoc_inp.decrypt('')
                except Exception:
                    stat = 0
                if stat != 1:
                    errmsg = _(
                        'File %s is encrypted.\n'
                        'Support for encrypted files has not been implemented yet.\n'
                        'File export failed.') % pdfdoc.filename
                    raise Exception(errmsg)
                # FIXME: otherwise ask for the password and decrypt the file
            pdf_input.append(pdfdoc_inp)

        for row in self.model:
            if only_selected and row.path not in selection:
                continue

            # add pages from input to output document
            nfile = row[2]
            npage = row[3]
            current_page = copy(pdf_input[nfile - 1].getPage(npage - 1))
            angle = row[6]
            angle0 = current_page.get("/Rotate", 0)
            crop = [row[7], row[8], row[9], row[10]]
            if angle != 0:
                current_page.rotateClockwise(angle)
            if crop != [0., 0., 0., 0.]:
                # Remap the crop sides to account for the page rotation.
                rotate_times = int(round(((angle + angle0) % 360) / 90) % 4)
                crop_init = crop
                if rotate_times != 0:
                    perm = [0, 2, 1, 3]
                    for it in range(rotate_times):
                        perm.append(perm.pop(0))
                        perm.insert(1, perm.pop(2))
                    crop = [crop_init[perm[side]] for side in range(4)]
                (x1, y1) = [float(xy) for xy in current_page.mediaBox.lowerLeft]
                (x2, y2) = [float(xy) for xy in current_page.mediaBox.upperRight]
                x1_new = int(x1 + (x2 - x1) * crop[0])
                x2_new = int(x2 - (x2 - x1) * crop[1])
                y1_new = int(y1 + (y2 - y1) * crop[3])
                y2_new = int(y2 - (y2 - y1) * crop[2])
                current_page.mediaBox.lowerLeft = (x1_new, y1_new)
                current_page.mediaBox.upperRight = (x2_new, y2_new)
            pdf_output.addPage(current_page)

        # finally, write the output document; the with block guarantees the
        # file is flushed and closed.
        with open(file_out, 'wb') as out_stream:
            pdf_output.write(out_stream)
    finally:
        for stream in streams:
            stream.close()
def info(pdf):
    """Collect metadata for the PDF file at path *pdf*.

    Reads the page count, the document-information dictionary and the
    ``dc_*`` XMP attributes, then normalises a few well-known keys
    (``identifier``/``isbn``, ``language``, ``date``, ``author``) and adds
    ``textsize`` from the extracted text. Parse failures are logged at
    debug level and leave the affected fields out.

    Returns:
        dict mapping metadata keys to values.
    """
    data = {}
    with open(pdf, 'rb') as fd:
        try:
            pdfreader = PdfFileReader(fd)
            data['pages'] = pdfreader.numPages
            if pdfreader.getIsEncrypted():
                pdfreader.decrypt('')
            doc_info = pdfreader.getDocumentInfo()
            if doc_info:
                for key in doc_info:
                    if doc_info[key]:
                        try:
                            value = doc_info[key]
                            if len(value) == 1:
                                value = value[0]
                            if isinstance(value, bytes):
                                value = value.decode('utf-16')
                            data[key[1:].lower()] = value
                        except Exception:
                            # Best effort: skip entries that cannot be read.
                            logger.debug('skipping info field %s of %s',
                                         key, pdf, exc_info=1)

            xmp = pdfreader.getXmpMetadata()
            if xmp:
                for key in dir(xmp):
                    if key.startswith('dc_'):
                        value = getattr(xmp, key)
                        if isinstance(value, dict) and 'x-default' in value:
                            value = value['x-default']
                        elif isinstance(value, list):
                            value = [v.strip() if isinstance(v, str) else v
                                     for v in value if v]
                            value = [v.strftime('%Y-%m-%d')
                                     if isinstance(v, datetime) else v
                                     for v in value]
                            if len(value) == 1:
                                value = value[0]
                        _key = key[3:]
                        # Info-dictionary values win over XMP values.
                        if value and _key not in data:
                            data[_key] = value
        except Exception:
            logger.debug('FAILED TO PARSE %s', pdf, exc_info=1)

    # (A disabled `pdfinfo`-subprocess fallback used to be sketched here.)

    if 'identifier' in data:
        value = normalize_isbn(data['identifier'])
        if stdnum.isbn.is_valid(value):
            data['isbn'] = [value]
            del data['identifier']

    for key, value in data.items():
        if isinstance(value, dict):
            value = ' '.join(list(value.values()))
            data[key] = value.strip()

    for key in list(data):
        if data[key] in ('Unknown',):
            del data[key]
            # The key is gone; reading it again below would raise KeyError
            # (this happened when the language was 'Unknown').
            continue
        if key == 'language':
            data[key] = get_language(data[key])

    text = extract_text(pdf)
    data['textsize'] = len(text)
    if settings.server['extract_text']:
        if 'isbn' not in data:
            isbn = extract_isbn(text)
            if isbn:
                data['isbn'] = [isbn]
    if 'isbn' in data and isinstance(data['isbn'], str):
        data['isbn'] = [data['isbn']]

    # Turn YYYYMMDD into YYYY-MM-DD.
    if 'date' in data and len(data['date']) == 8 and data['date'].isdigit():
        d = data['date']
        data['date'] = '%s-%s-%s' % (d[:4], d[4:6], d[6:])
    if 'author' in data and isinstance(data['author'], str):
        data['author'] = data['author'].split(', ')
    return data