def create(id, userid):
    """Build the requested PDF document and return it as a file response.

    Args:
        id: Document selector. ``1`` and ``3`` pick a fillable template,
            ``2`` returns a static file unchanged, anything else is rejected.
        userid: Identifier forwarded to ``docChecker`` / ``docWriter``, which
            fill in the checkbox and text-field values.

    Returns:
        The result of ``send_from_directory`` for the produced file, or the
        string ``'BAD ID'`` for an unknown ``id``.
    """
    # Anchor on the absolute module directory: with a bare relative
    # ``__file__`` the old ``dirname(__file__) + '/files/...'`` expression
    # collapsed to '/files/...' (filesystem root).
    files_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'files')

    if id == 1:
        pdf_path = os.path.join(files_dir, 'VTB_anketa.pdf')
    elif id == 2:
        return send_from_directory(directory=files_dir, filename='VTB_accept.pdf')
    elif id == 3:
        pdf_path = os.path.join(files_dir, 'VTB_spravka.pdf')
    else:
        return 'BAD ID'

    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        fields = pdf.getFormTextFields()

        # Parse the field tree once; getFields() returns None when the
        # document has no interactive form.
        all_fields = pdf.getFields() or {}
        checkboxes = {
            name: field for name, field in all_fields.items()
            if 'Check Box' in name
        }

        docChecker(id=id, userid=userid, fields=checkboxes)
        docWriter(id=id, userid=userid, fields=fields)

        pdf_writer = PdfFileWriter()
        for page in range(pdf.getNumPages()):
            pdf_writer.addPage(pdf.getPage(page))
            written_page = pdf_writer.getPage(page)
            pdf_writer.updatePageFormFieldValues(page=written_page, fields=fields)
            updateCheckboxValues(page=written_page, fields=checkboxes)

        # The reader resolves page objects lazily, so the output must be
        # written while the template file is still open.
        # NOTE(review): every request writes the same 'downloaded.pdf';
        # concurrent requests can overwrite each other's result.
        with open(os.path.join(files_dir, 'downloaded.pdf'), 'wb') as out:
            pdf_writer.write(out)

    return send_from_directory(directory=files_dir, filename='downloaded.pdf')
def test_get_form(src, expected, expected_get_fields):
    """Verify both form-reading APIs against the expected data for one PDF.

    ``src`` is a file name relative to ``RESOURCE_ROOT``; ``expected`` is the
    result wanted from ``getFormTextFields`` and ``expected_get_fields`` the
    result wanted from ``getFields``.
    """
    pdf_path = os.path.join(RESOURCE_ROOT, src)
    reader = PdfFileReader(pdf_path)

    text_fields = reader.getFormTextFields()
    assert text_fields == expected

    all_fields = reader.getFields()
    assert all_fields == expected_get_fields
def main(args: List[str]) -> int:
    """Print the name, type and value of every form field of a PDF.

    Args:
        args: Argument vector; ``args[1]`` is the path of the PDF to inspect.

    Returns:
        ``0`` once all fields have been printed.
    """
    filename = args[1]
    reader = PdfFileReader(filename)
    # getFields() returns None for a document without a form; treat that as
    # "no fields" instead of failing on None.keys().
    fields = reader.getFields() or {}
    for name, field in fields.items():
        print(f"Name: '{name}', "
              f"Type: '{readable_type(field)}', "
              f"Value: {get_value(field)}")
    return 0
def addInTemplate(filepath):
    """Add a tab to the 'form_templates' frame listing a PDF's form fields.

    The tab and its list box are both named after the file (last '/'-separated
    path component with '.pdf' removed).

    Args:
        filepath: Path of the PDF template to load.
    """
    with app.tabbedFrame('form_templates'):
        template_name = filepath.split('/')[-1].replace('.pdf', '')
        with open(filepath, 'rb') as intemp:
            template_form = PdfFileReader(intemp)
            # Read the names while the file is open. getFields() yields None
            # for a PDF without a form, which sorted() cannot handle.
            field_names = sorted(template_form.getFields() or {})
        with app.tab(template_name):
            app.setStretch('both')
            app.addListBox(template_name, field_names, 0, 0, 10, 10)
            app.setListBoxGroup(template_name)
            app.setListBoxChangeFunction(template_name, updateFormTemplateEdit)
def main():
    """Extract the form-field values of one or more PDFs into a CSV file.

    Command line: one or more PDF paths, plus ``-o`` for the output file
    (default ``output.csv``). Each PDF becomes one CSV row; each field name
    becomes a column.
    """
    # --- Parse argv ---------------------------------------------------
    arg_parser = argparse.ArgumentParser(description='PDF field parser')
    arg_parser.add_argument('filenames', metavar='file', nargs='+',
                            help='Path to one or more PDF to parse')
    arg_parser.add_argument('-o', metavar='output_file', dest='output',
                            default="output.csv",
                            help='Output filename (default: output.csv)')
    args = arg_parser.parse_args()

    # --- Read in data from all the files ------------------------------
    parsed_data = []
    for file in args.filenames:
        with open(file, "rb") as con:
            pdf = PdfFileReader(con)
            # getFields() returns None when a PDF has no form at all.
            fields = pdf.getFields() or {}
            parsed_data.append({
                column: str(objects["/V"]) if "/V" in objects else ""
                for column, objects in fields.items()
            })

    # Nothing to report if no file yielded any field.
    if not any(parsed_data):
        sys.exit(0)

    # Use the union of all field names (first-seen order) as the header so a
    # later file with extra fields does not make DictWriter raise ValueError.
    fieldnames = list(dict.fromkeys(
        column for row in parsed_data for column in row))

    # --- Write out data to the CSV file ------------------------------
    with open(args.output, "w") as outfile:
        csvwriter = DictWriter(
            outfile,
            delimiter=",",
            quotechar="\"",
            lineterminator="\n",
            quoting=QUOTE_NONNUMERIC,
            fieldnames=fieldnames,
            restval="",  # fields missing from a given PDF become empty cells
        )
        csvwriter.writeheader()
        for row in parsed_data:
            csvwriter.writerow(row)
def displaySourceData(window, fsource):
    '''Post the metadata for a pdf file.

    Updates the page-count, fields, encryption and document-info elements of
    ``window`` from the PDF given by ``fsource`` and returns the reader.
    '''
    # NOTE: setting strict to False prevents PdfReadWarning.
    # See https://github.com/mstamy2/PyPDF2/issues/36
    PDFSource = PdfFileReader(fsource, strict=False)
    info = PDFSource.getDocumentInfo()
    window['-NUMPAGES-'].update(PDFSource.getNumPages())
    # getFields() takes no source argument; its first parameter is the
    # internal field tree, so the file must not be passed here.
    window['-FIELDS-'].update(PDFSource.getFields())
    window['-ENCRYPTED-'].update(PDFSource.isEncrypted)
    window['-AUTHOR-'].update(info.author)
    window['-CREATIONDATE-'].update(str(convertDate(info)))
    window['-CREATOR-'].update(info.creator)
    window['-PRODUCER-'].update(info.producer)
    window['-SUBJECT-'].update(info.subject)
    window['-TITLE-'].update(info.title)
    return PDFSource
def parse_pdf_impl(filenames):
    """Read the form-field values of several PDFs into a pandas DataFrame.

    Args:
        filenames: Iterable of PDF file paths.

    Returns:
        A DataFrame with one row per file and one column per field name.
        Each cell holds the field's ``/V`` value as a string, or ``""`` when
        the field has no value.
    """
    # --- Read in data from all the files ------------------------------
    parsed_data = []
    for file in filenames:
        with open(file, "rb") as con:
            pdf = PdfFileReader(con)
            # A PDF without a form makes getFields() return None; record it
            # as a row without values instead of crashing.
            fields = pdf.getFields() or {}
            parsed_data.append({
                column: str(objects["/V"]) if "/V" in objects else ""
                for column, objects in fields.items()
            })

    # --- return as Pandas DataFrame ------------------------------
    return pd.DataFrame(parsed_data)
def _get_fields_from_pdf(pdf_data_path, radio_btn_group1, radio_btn_group2):
    """Read field values from a PDF and convert radio-group values to indexes.

    Args:
        pdf_data_path: Path-like object with an ``open`` method for the PDF.
        radio_btn_group1: Sequence of the possible values of group 1.
        radio_btn_group2: Sequence of the possible values of group 2.

    Returns:
        The mapping from ``pair_fields_name_and_val``. The entries for groups
        1, 2 and 4 are replaced by their index when the value is recognised.
    """
    # Open through a context manager so the handle is released; it used to be
    # left open for the lifetime of the process.
    with pdf_data_path.open(mode="rb") as pdf_file:
        pdf_data_source = PdfFileReader(pdf_file, strict=False)
        field_values = pair_fields_name_and_val(pdf_data_source.getFields(), True)

        # A value missing from the group leaves the entry untouched.
        try:
            group1_index = radio_btn_group1.index(field_values.get(_NAME_GROUP1))
            field_values[_NAME_GROUP1] = group1_index
        except ValueError:
            pass

        try:
            group2_index = radio_btn_group2.index(field_values.get(_NAME_GROUP2))
            field_values[_NAME_GROUP2] = group2_index
        except ValueError:
            pass

        group4_index = _index_from_btn_group4(field_values.get(_NAME_GROUP4))
        if group4_index >= 0:
            field_values[_NAME_GROUP4] = group4_index

        return field_values
def read_pdf(filedir, stForms, i):
    """Read one PDF form and store its sanitized fields in ``stForms``.

    Args:
        filedir: Path of the PDF file.
        stForms: Dict updated in place. Forms containing
            'employee_responsible_name' are stored under ``'1'``; other forms
            lacking 'colleges-st10' are stored under ``'2' + str(i)``.
        i: Suffix used for the second kind of form.

    Signature fields are skipped. An unreadable PDF is treated as an empty
    form.
    """
    skipped_keys = ('sig_univ_employee', 'sig_dept_chair', 'dean_students')
    form_fields = {}
    clean = CleanUtil()

    pdf_file = None
    try:
        try:
            pdf_file = open(filedir, "rb")
            # The reader gives access to the form data of the PDF.
            pdf_reader = PdfFileReader(pdf_file)
            # Dictionary of field names and their associated input data.
            fields = pdf_reader.getFields()
        except Exception:
            # Best effort: an unreadable file counts as "no fields".
            # Unlike a bare except, this lets KeyboardInterrupt/SystemExit
            # propagate.
            fields = None
        if fields is None:
            fields = {}

        # key is the form's field name, e.g. 'email'.
        for key in fields:
            if key not in skipped_keys:
                fl = fields[key]
                form_fields[fl.name] = clean.sanitize(fl.value, fl.fieldType)
    finally:
        # Close only after all values are read; the handle used to leak.
        if pdf_file is not None:
            pdf_file.close()

    if 'employee_responsible_name' in form_fields:
        stForms['1'] = form_fields
    elif 'colleges-st10' not in form_fields:
        stForms['2' + str(i)] = form_fields
    return
return writer if __name__ == '__main__': csv_filename = "EISAutoFill.csv" pdf_filename = "EIS 3 Certificate - Autofilled.pdf" csvin = os.path.normpath(os.path.join(os.getcwd(), 'in', csv_filename)) pdfin = os.path.normpath(os.path.join(os.getcwd(), 'in', pdf_filename)) pdfout = os.path.normpath(os.path.join(os.getcwd(), 'out')) data = pd.read_csv(csvin) pdf = PdfFileReader(open(pdfin, "rb"), strict=False) if "/AcroForm" in pdf.trailer["/Root"]: pdf.trailer["/Root"]["/AcroForm"].update( {NameObject("/NeedAppearances"): BooleanObject(True)}) pdf_fields = [str(x) for x in pdf.getFields().keys() ] # List of all pdf field names csv_fields = data.columns.tolist() i = 0 #Filename numerical prefix for j, rows in data.iterrows(): i += 1 pdf2 = PdfFileWriter() set_need_appearances_writer(pdf2) if "/AcroForm" in pdf2._root_object: pdf2._root_object["/AcroForm"].update( {NameObject("/NeedAppearances"): BooleanObject(True)}) # Key = pdf_field_name : Value = csv_field_value field_dictionary_1 = { "Full Name":
from collections import OrderedDict
from PyPDF2 import PdfFileWriter, PdfFileReader, pdf
from PyPDF2.generic import BooleanObject, NameObject, IndirectObject
from pprint import pprint
import pandas as pd
import numpy as np
import os

# Source data and the PDF form inspected by this script.
data = pd.read_csv('WestulDatabaseOrd.csv')
pdfpath = os.getcwd()
pdf_new_name = 'Form_Final.pdf'

# The handle is deliberately left open: the reader resolves objects lazily
# from the underlying stream.
pdfread = PdfFileReader(open(pdf_new_name, 'rb'))
page = pdfread.getFields()
# The trailer belongs to the reader instance. `pdf` is the imported
# PyPDF2.pdf module and has no `trailer` attribute.
fields = pdfread.trailer
def pdf(self, fp, csv_row): password = '' extracted_text = '' self.parser = PDFParser(fp) self.document_t = PDFDocument pf = PdfFileReader # isEncrypted try: i = 0 try: thread = Thread(target=self.load_pdf, args=(PDFDocument, password)) thread.start() thread.join(timeout=90) except Exception as e: print('PDF I/O error: ' + e.__str__()) row = [ self.line_count, 'PDF DOCUMENT OBJECT FAILED TO LOAD - ' + e.__str__() + ': ' + self.url, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ] # self.line_count += 1 report_path = self.report_folder + self.report_name # 90 SECONDS or LOAD FAIL with open(report_path, 'a', encoding='utf8', newline='') as csv_file: writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL) writer.dialect.lineterminator.replace('\n', '') writer.writerow(row) stop_event.set() document = PDFDocument document = self.document_t pf = PdfFileReader(BytesIO(open(self.pdf_path, 'rb').read())) # ENCRYPTION if self.parser.doc.encryption is not None: csv_row.insert(4, [self.csv_header[4], 'ENCRYPTED']) csv_row.insert(5, [self.csv_header[5], 'ENCRYPTED']) else: csv_row.insert(4, [self.csv_header[4], 'FALSE']) csv_row.insert(5, [self.csv_header[5], 'NA']) except Exception as e: csv_row.insert(4, [self.csv_header[4], 'FAILED: ' + e.__str__()]) csv_row.insert(5, [self.csv_header[5], 'NA']) exit_call = e.__str__() + ' document failed!!' print(exit_call) pass page_count = 0 # istagged try: pages = PDFPage.get_pages(document) if not document.is_extractable: raise PDFTextExtractionNotAllowed rsrcmgr = PDFResourceManager() laparams = LAParams() page_no = 0 istagged = 'FALSE' try: # document.catalog if document.catalog['MarkInfo']: istagged = 'TRUE' except Exception as e: exit_call = e.__str__() + ' tagged info failed!!' 
print(exit_call) page_count = resolve1(document.catalog['Pages'])['Count'] csv_row.insert(6, [self.csv_header[6], istagged]) csv_row.insert(7, [self.csv_header[7], page_count]) except Exception as e: csv_row.insert(6, [self.csv_header[6], 'IsTagged: ' + e.__str__()]) csv_row.insert(7, [self.csv_header[7], 'Page Count: ' + e.__str__()]) exit_call = e.__str__() + ' tagged info failed!!' print(exit_call) # TOC try: if pf.outlines: csv_row.insert(8, [self.csv_header[8], 'TRUE']) '''pdf_path_toc = self.document_folder + pdf_name + '_toc.txt' places_list = pf.outlines with open(pdf_path_toc, 'w') as filehandle: filehandle.writelines("%s\n" % place for place in places_list) filehandle.close()''' else: csv_row.insert(8, [self.csv_header[8], 'FALSE']) except Exception as e: csv_row.insert(8, [self.csv_header[8], 'TOC FAILED: ' + e.__str__()]) exit_call = e.__str__() + ' toc info failed!!' print(exit_call) # isForm, fields, try: if pf.getFields(): csv_row.insert(9, [self.csv_header[9], 'TRUE']) csv_row.insert(10, [self.csv_header[10], pf.getFields().__len__()]) else: csv_row.insert(9, [self.csv_header[9], 'FALSE']) csv_row.insert(10, [self.csv_header[10], 0]) except Exception as e: csv_row.insert(9, [self.csv_header[9], 'FORMS: ' + e.__str__()]) csv_row.insert(10, [self.csv_header[10], 'FIELDS: ' + e.__str__()]) exit_call = e.__str__() + ' forms failed!!' 
print(exit_call) # tables csv_row.insert(11, [self.csv_header[11], 'NOT RUN']) write_clip = '' word_count = 0 words_per_page = 0 char_count = 0 chars_per_word = 0 image_count = 0 # TODO: write 3 page sample and word count try: if pf.getNumPages() < 50: for page in range(pf.getNumPages()): p = pf.getPage(page) text_clip = p.extractText().encode('UTF-8') text_clip = BytesIO(text_clip).read().__str__()[2:] count_clip = re.findall(r"[^\W_]+", text_clip, re.MULTILINE) word_count += len(count_clip) char_count += len(text_clip) if page <= 3: write_clip += '[ PAGE ' + (page + 1).__str__() + ' START ] ' write_clip += text_clip.replace('\n', '').replace( ',', ' ').replace('"', '') write_clip += '[ PAGE ' + (page + 1).__str__() + ' END ]' else: write_clip = 'OVER 50 PAGES - SAMPLE SKIPPED' except Exception as e: exit_call = e.__str__() + ' :: TEXT sample failed!!' write_clip = exit_call word_count = exit_call char_count = exit_call print(exit_call) # TODO: Words/chars per page try: if not word_count == 0: chars_per_word = char_count / word_count else: chars_per_word = 0 if not page_count == 0: words_per_page = word_count / page_count else: words_per_page = 0 except Exception as e: exit_call = e.__str__() + ' :: WORD METRICS failed!!' 
chars_per_word = exit_call words_per_page = exit_call print(exit_call) # TODO: Add to row i = 12 try: csv_row.insert(i, [self.csv_header[i], word_count.__str__()]) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'WORD_COUNT: ' + e.__str__()]) i = 13 try: csv_row.insert(i, [self.csv_header[i], char_count.__str__()]) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'CHAR_COUNT: ' + e.__str__()]) i = 14 try: csv_row.insert(i, [self.csv_header[i], words_per_page.__str__()]) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'WPP: ' + e.__str__()]) i = 15 try: csv_row.insert(i, [self.csv_header[i], chars_per_word.__str__()]) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'CPP: ' + e.__str__()]) # TODO: IMAGES i = 16 '''try: pdfImages = Globals.base_folder + 'cli-tools\\pdfimages.exe' img_folder = self.document_folder + 'images\\' # + pdf_name[:-4] + '\\' if not os.path.exists(img_folder): os.makedirs(img_folder) # cmd = pdfImages + ' -list ' + '\"' + pdf_path + '\"' # output = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0].split(b'\n') # save images to disk cmd = pdfImages + ' -list \"' + self.pdf_path + '\" \"' + ' ' + '\"' # subprocess.Popen(cmd, stdout=subprocess.PIPE) os.chdir(img_folder) image_list = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0].split(b'\r\n') # os.remove(img_folder) # image_count = output.count('\n') image_count = image_list.__len__() if image_count > 2: # target = open(pdf_path_image, 'w') # target.write(image_list) # target.close() csv_row.insert(i, [self.csv_header[i], (image_count - 2).__str__()]) elif image_count == 0: csv_row.insert(i, [self.csv_header[i], 0]) else: csv_row.insert(i, [self.csv_header[i], 0]) except Exception as e: csv_row.insert(i, [self.csv_header[i], e.__str__() + ' image info failed!!']) exit_call = e.__str__() + ' image info failed!!' 
print(exit_call)''' # TODO: IMAGES per page i = 17 percent_img_per_page = float try: if not image_count == 0 or page_count == 0: percent_img_per_page = (float(image_count) / float(page_count)) * 100 else: percent_img_per_page = 0 csv_row.insert(i, [self.csv_header[i], percent_img_per_page]) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'IMG: ' + e.__str__()]) # TODO: OCR risk i = 18 try: if words_per_page == 0 or percent_img_per_page > 3000: ocr_risk = 5 elif words_per_page < 15 or percent_img_per_page > 2000: ocr_risk = 4 elif words_per_page < 40 or percent_img_per_page > 1000: ocr_risk = 3 elif words_per_page < 70 or percent_img_per_page > 425: ocr_risk = 2 elif words_per_page < 80 or percent_img_per_page > 200: ocr_risk = 1 else: ocr_risk = 0 csv_row.insert(i, [self.csv_header[i], ocr_risk]) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'OCR: ' + e.__str__()]) # author, creator, producer, subject, title, di = pf try: di = pf.documentInfo except Exception as e: exit_call = e.__str__() + ' :: DOCUMENT INFO LOAD failed!!' print(exit_call) # Document info if di: # Author try: i = 19 if di.author: csv_row.insert( i, [self.csv_header[i], di.author.encode('UTF-8')]) else: csv_row.insert(i, [self.csv_header[i], 'NULL']) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'AUTHOR: ' + e.__str__()]) exit_call = e.__str__() + ' doc info failed!!' 
print(exit_call) # Creator try: i = 20 if di.creator: csv_row.insert( i, [self.csv_header[i], di.creator.encode('UTF-8')]) else: csv_row.insert(i, [self.csv_header[i], 'NULL']) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'CREATOR: ' + e.__str__()]) print(exit_call) print('#5.1') # Producer try: i = 21 if di.producer: csv_row.insert( i, [self.csv_header[i], di.producer.encode('UTF-8')]) else: csv_row.insert(i, [self.csv_header[i], 'NULL']) except Exception as e: csv_row.insert( i, [self.csv_header[i], 'PRODUCER: ' + e.__str__()]) print(exit_call) # Subject try: i = 22 if di.subject: csv_row.insert( i, [self.csv_header[i], di.subject.encode('UTF-8')]) else: csv_row.insert(i, [self.csv_header[i], 'NULL']) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'SUBJECT: ' + e.__str__()]) print(exit_call) # Title try: i = 23 if di.title: csv_row.insert( i, [self.csv_header[i], di.title.encode('UTF-8')]) else: csv_row.insert(i, [self.csv_header[i], 'NULL']) except Exception as e: csv_row.insert(i, [self.csv_header[i], 'TITLE: ' + e.__str__()]) print(exit_call) # Document clip i = 24 try: csv_row.insert(i, [self.csv_header[i], write_clip]) except Exception as e: csv_row.insert(i, [self.csv_header[i], e.__str__()]) # Write results row = [] for i in range(csv_row.__len__()): row.append(csv_row[i][1]) report_path = self.report_folder + self.report_name # COPLETE WRITE with open(report_path, 'a', encoding='utf8', newline='') as csv_file: writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL) writer.dialect.lineterminator.replace('\n', '') writer.writerow(row) # csv_file.close() fp.close() os.remove(self.pdf_path) # Log close msg = (' >>>> PDF complete:[' + self.url + '] ' + self.line_count.__str__() + ' ' + (datetime.datetime.now().__str__()[:-7])) print(msg) utils.logline(self.log, msg)
def process_file(self):
    """Read the form fields of ``self.fp`` and store them in ``self.content``.

    ``self.fp`` is the path of the PDF. The raw field mapping from PyPDF2 is
    passed to ``self.parse_fields`` and its result is kept on the instance.
    """
    # Parse while the file is open, then release the handle. It used to stay
    # open for the lifetime of the process.
    with open(self.fp, 'rb') as pdf_file:
        infile = PdfFileReader(pdf_file)
        fields = infile.getFields()
        self.content = self.parse_fields(fields)
#!/usr/bin/python
"""Print basic information about ``file.pdf`` using PyPDF2."""
from PyPDF2 import PdfFileReader

source_path = "file.pdf"

with open(source_path, "rb") as stream:
    reader = PdfFileReader(stream)

    # Document-level metadata.
    doc_info = reader.getDocumentInfo()
    page_total = reader.getNumPages()
    print('file information: ', doc_info)
    print("number of pages: %i" % (page_total,))

    first_page = reader.getPage(0)

    # Reader attributes, in the order the script has always printed them.
    print(reader.getIsEncrypted())
    print(reader.pageMode)
    print(reader.getFields())
    print(reader.stream)
    print(reader.flattenedPages)

    # Raw page object followed by its extracted text.
    print(first_page)
    print(first_page.extractText())
True) return writer except Exception as e: print('set_need_appearances_writer() catch : ', repr(e)) return writer csvin = "H:\\gitprojects\\\PyPDF2-Pandas-PDFFieldUpdater\\in\\data.csv" infile = "H:\\gitprojects\\\PyPDF2-Pandas-PDFFieldUpdater\\in\\PatientIntakeForm.pdf" data = pd.read_csv(csvin) pdf = PdfFileReader(open(infile, "rb"), strict=False) if "/AcroForm" in pdf.trailer["/Root"]: pdf.trailer["/Root"]["/AcroForm"].update( {NameObject("/NeedAppearances"): BooleanObject(True)}) fields = pdf.getFields() # Run in console to see Key names for field entry i = 0 #Filename numerical prefix for j, rows in data.iterrows(): outfile = "H:\\gitprojects\\\PyPDF2-Pandas-PDFFieldUpdater\\out\\" i += 1 pdf2 = PdfFileWriter() set_need_appearances_writer(pdf2) if "/AcroForm" in pdf2._root_object: pdf2._root_object["/AcroForm"].update( {NameObject("/NeedAppearances"): BooleanObject(True)}) if "/AcroForm" in pdf2._root_object: pdf2._root_object["/AcroForm"].update( {NameObject("/NeedAppearances"): BooleanObject(True)})
def get_headers(infile):
    """Print the form-field names of a PDF.

    The names are the headers a matching data file has to provide.

    Args:
        infile: Path of the PDF form.
    """
    # The context manager closes the handle, which used to leak.
    with open(infile, "rb") as inputStream:
        pdf_reader = PdfFileReader(inputStream, strict=False)
        # getFields() is None for a PDF without a form: print an empty list.
        fields = pdf_reader.getFields() or {}
    print('Headers needed for data file: {}'.format(list(fields)))
def PDFForm2Excel(mypath, outfile):
    """Collect the form values of all PDFs in a folder into one Excel file.

    The first PDF defines the master list of fields. Every PDF, including the
    first, then contributes one column of values, left-joined on the field
    name. A PDF that cannot be read gets an empty column and is counted as an
    error.

    Args:
        mypath: Folder to scan. Sub-folders are included when the
            ``boolean_recursive`` command-line option is set.
        outfile: Output file name, appended to the absolute folder path.
    """
    # Parse the command line once instead of on every option lookup.
    options = parser.parse_args()
    errcount = 0
    mypath = abspath(mypath)

    # Collect the PDF files to process.
    pdffiles = []
    # Explicit comparisons are kept so that any value other than True/False
    # still selects neither branch, as before.
    if options.boolean_recursive == False:  # only the input folder
        pdffiles = sorted(
            join(mypath, f) for f in listdir(mypath)
            if f[-4:] == '.pdf' and isfile(join(mypath, f)))
    if options.boolean_recursive == True:  # input folder and sub-folders
        for r, d, f in walk(mypath):
            for file in f:
                # Was the chained comparison `file[-4:] == '.pdf' in file`.
                if file[-4:] == '.pdf':
                    pdffiles.append(join(abspath(r), file))

    # Without any PDF there is no schema; stop with a clear message instead
    # of an IndexError on pdffiles[0].
    if not pdffiles:
        exit("##no pdf files found## " + mypath)

    # The first PDF creates the master schema for all following PDFs. The
    # following forms are expected to have the same fields.
    pdf = pdffiles[0]
    results = pd.DataFrame()
    try:
        fields = PdfFileReader(pdf).getFields()
        df = pd.DataFrame([(k, k1, v1) for k, v in fields.items()
                           for k1, v1 in v.items()],
                          columns=['Field', 'Type', pdf])
        df2 = df.loc[df['Type'] == '/V']  # keep value entries only
        df2 = df2.filter(items=['Field', pdf])
        df2 = df2.reset_index(drop=True)
        results = df2.set_index('Field')
        # Keep only the field index; the values are merged in the loop below.
        results = results.drop(pdf, axis='columns')
    except Exception:
        exit("##initial pdf read error## " + pdf)

    # Loop through all PDF files.
    for pdf in pdffiles:
        producer = None
        try:
            # Opening the file and reading the producer are inside the try so
            # one unreadable PDF no longer aborts the whole batch.
            f = PdfFileReader(pdf)
            producer = f.getDocumentInfo().producer
            fields = f.getFields()
            df = pd.DataFrame([(k, k1, v1) for k, v in fields.items()
                               for k1, v1 in v.items()],
                              columns=['Field', 'Type', pdf])
            df2 = df.loc[df['Type'] == '/V']  # keep value entries only
            df2 = df2.filter(items=['Field', pdf])  # file name as column header
            df2 = df2.set_index('Field')
            # Remove leading '=' so Excel does not treat cells as formulas.
            df2[pdf] = df2[pdf].map(lambda x: x.lstrip('='))
            results = results.merge(df2, on='Field', how='left')
            print("read success " + str(producer) + " " + pdf)
        except Exception:
            results[pdf] = ''  # empty column for the failed PDF
            print("##read error## " + str(producer) + " " + pdf)
            errcount = errcount + 1
            continue

    print('--------------')
    print("Summary: " + str(len(pdffiles)) + " files read with " +
          str(errcount) + " file read Errors")

    # Write results to Excel.
    # NOTE(review): the target is plain string concatenation, so `outfile`
    # must start with a path separator to land inside the folder.
    try:
        if options.boolean_col == False:
            # One line per PDF.
            results.T.to_excel(mypath + outfile, header=True, index=True)
        else:
            # One column per PDF.
            results.to_excel(mypath + outfile, header=True, index=True)
        print("File " + mypath + outfile + " successfully written")
    except Exception:
        print("Error writing File " + mypath + outfile)
def cargoDoc():
    """Walk a fixed manifest PDF's objects, then read the form fields of a second PDF.

    Part 1 opens a hard-coded PDF, visits every object id once and passes
    each object to ``extract``. Part 2 opens ``specificPath`` with PyPDF2 and
    reads per-record values into local variables.

    NOTE(review): ``specificPath`` and ``extract`` are not defined in this
    function; presumably module-level names. The locals filled in the loop
    are not used in the visible code, and nothing is returned. The function
    may be truncated. Indentation was reconstructed from a
    whitespace-collapsed source.
    """
    fp = open(r"C:\Users\ssleep\Documents\Programming\Cargo Docker\Thursday\LCBO\601331975 PARS MANIFESTS.pdf", 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    # Object ids already handled; an id can appear in several xref sections.
    visited = set()
    pars = []
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited:
                continue
            visited.add(objid)
            obj = doc.getobj(objid)
            if obj is None:
                continue
            # NOTE(review): `pars` is overwritten on every object, so only
            # the last result survives.
            pars = extract(objid,obj)
    pdfFileObj = open(specificPath, 'rb')
    pdfReader = PdfFileReader(pdfFileObj)
    fields = pdfReader.getFields()
    # print(len(fields)-15)
    # NOTE(review): the loop count is derived from the field count minus 15;
    # the meaning of 15 is not visible here.
    for i in range(len(fields)-15):
        containerNumber = ""
        weight = ""
        consignee = ""
        shipper = ""
        eta = ""
        portOfLoading = ""
        portOfDischarge = ""
        description = ""
        if i == 0:
            # First record: values are read from the top-level fields.
            # prefix = str(i) + "."
            containerNumber = fields["Container Row1"].value
            weight = float(fields["Weight KGRow1"].value)
            consignee = fields["Consignee"].value
            shipper = fields["Shipper"].value
            eta = fields["ETA DATE"].value
            # The port of loading is read from the field named "undefined".
            portOfLoading = fields["undefined"].value
            portOfDischarge = fields["Port of Discharge"].value
            description = fields["Description of goods"].value
        else:
            # Later records: find the field named str(i) and read the values
            # from its "/Kids" entries, matched by their "/T" name.
            for j in list(fields.keys()):
                if j==str(i):
                    for k in list(fields[j]["/Kids"]):
                        try:
                            if(k.getObject()['/T']=="WO"):
                                wo=k.getObject()['/V']
                            elif(k.getObject()['/T']=="Container Row1"):
                                containerNumber=k.getObject()['/V']
                            elif(k.getObject()['/T']=="SizeRow1"):
                                size=k.getObject()['/V']
                            elif(k.getObject()['/T']=="Weight KGRow1"):
                                weight=float(k.getObject()['/V'])
                            elif(k.getObject()['/T']=="Consignee"):
                                consignee=k.getObject()['/V']
                            elif(k.getObject()['/T']=="Shipper"):
                                shipper=k.getObject()['/V']
                            elif(k.getObject()['/T']=="ETA DATE"):
                                eta=k.getObject()['/V']
                            elif(k.getObject()['/T']=="undefined"):
                                portOfLoading=k.getObject()['/V']
                            elif(k.getObject()['/T']=="Port of Discharge"):
                                portOfDischarge=k.getObject()['/V']
                            elif(k.getObject()['/T']=="Description of goods"):
                                description=k.getObject()['/V']
                        except KeyError:
                            # A kid without '/T' or '/V' is ignored; `True`
                            # is a no-op expression.
                            True
# Needed only for the month-length lookup below.
import calendar

now = datetime.datetime.now()
# Function-call form works on both Python 2 and 3.
print(str(now))
year = now.year
month = now.month
day = now.day

# Date six months ahead. Roll months past December into the next year and
# clamp the day to the target month's length (e.g. 31 August -> 28/29
# February). The old `month + 6` produced month numbers 13..18.
months_ahead = month + 6
year_six_later = year + (months_ahead - 1) // 12
month_six_later = (months_ahead - 1) % 12 + 1
day_six_later = min(day, calendar.monthrange(year_six_later, month_six_later)[1])

date = '{}/{}/{}'.format(day, month, year)
date_six_later = '{}/{}/{}'.format(day_six_later, month_six_later, year_six_later)
print(date)

#infile = "Test-Sheet-3.pdf"
infile = askopenfilename()
pdf_reader = PdfFileReader(open(infile, "rb"))
dictionary = pdf_reader.getFormTextFields()  # returns a python dictionary
dictionary_2 = pdf_reader.getFields(tree=None, retval=None)

patient_last_name = str(dictionary['Pat_LastName'])
patient_first_name = str(dictionary['Pat_FirstName'])
patient_name = '{} {}'.format(patient_first_name, patient_last_name)
#patient_gender = str(dictionary['Pat_Gender'])
patient_DOB = str(dictionary['Pat_DOB'])

# NOTE(review): the gender is taken from character 102 of the field's string
# representation, which depends on the exact layout of that text. Reading the
# field's value directly would be more robust; confirm the expected values
# first.
patient_gender = str(dictionary_2['Pat_Gender'])
if patient_gender[102] == 'F':
    patient_gender = 'Female'
elif patient_gender[102] == 'M':
    patient_gender = 'Male'
knee_for_analysis = str(dictionary_2['Pat_Side'])
def write(client, doc):
    """Fill the PDF template named by ``doc`` with client data and save it.

    Reads the template ``doc.file_name`` from ``PDF_TEMPLATE_DIR``, sets text
    fields and checkboxes, and writes the result to
    ``PDF_GENERATED_RESULT_DIR`` as
    ``"<first_name> <last_name>_<doc.file_name>"``.

    Every checkbox listed below is set to '/Yes'. Most text fields receive
    placeholder strings; only some are filled from ``client``.

    NOTE(review): indentation was reconstructed from a whitespace-collapsed
    source, so the extent of each ``if`` body is an assumption. Every
    checkbox tested by an ``if`` is assigned '/Yes' earlier in this function,
    so each condition holds as long as ``checkboxes`` behaves like a plain
    dict.

    Args:
        client: Object providing name, passport, address, ``snils`` and
            ``inn`` attributes.
        doc: Object whose ``file_name`` names the template.
    """
    path_in_file = os.path.join(PDF_TEMPLATE_DIR, doc.file_name)
    inpt = open(path_in_file, 'rb')
    clients_file_name = str(client.first_name) + ' ' + str(client.last_name) + \
        '_' + str(doc.file_name)  # date or time
    p_file_path = os.path.join(PDF_GENERATED_RESULT_DIR, clients_file_name)
    reads = PdfFileReader(inpt)
    read = reads.getFormTextFields()
    checkboxes = reads.getFields()

    ## Default values, do not change
    read['Text Field 490'] = ' '  # full name if it was changed
    read['Text Field 473'] = ' '  # advertising
    read['Text Field 475'] = ' '  # other
    read['Text Field 493'] = ' '
    read['Text Field 494'] = ' '
    read['Text Field 492'] = ' '
    read['Text Field 496'] = ' '
    read['Text Field 497'] = ' '
    read['Text Field 498'] = ' '
    read['Text Field 50910'] = ' '
    read['Text Field 50610'] = ' '
    read['Text Field 505'] = ' '
    read['Text Field 504'] = ' '
    read['Text Field 506'] = ' '
    # Blank 'Text Field 5011' .. 'Text Field 5020'.
    for i in range(11, 21, 1):
        read['Text Field 50' + str(i)] = ' '

    ## Borrower / co-borrower
    checkboxes['Check Box 136'] = '/Yes'  # borrower
    checkboxes['Check Box 137'] = '/Yes'  # co-borrower
    checkboxes['Check Box 97'] = '/Yes'  # do not remove
    ########## Address
    checkboxes['Check Box 138'] = '/Yes'  # actual address matches registration
    ############ Grounds for residence
    checkboxes['Check Box 101'] = '/Yes'  # social tenancy
    checkboxes['Check Box 102'] = '/Yes'  # commercial tenancy
    checkboxes['Check Box 103'] = '/Yes'  # ownership
    checkboxes['Check Box 104'] = '/Yes'  # with relatives
    checkboxes['Check Box 105'] = '/Yes'  # other, describe in the Supplement sheet
    ###### Marital status
    checkboxes['Check Box 106'] = '/Yes'  # married
    checkboxes['Check Box 107'] = '/Yes'  # divorced
    checkboxes['Check Box 108'] = '/Yes'  # widower / widow
    checkboxes['Check Box 109'] = '/Yes'  # common-law marriage
    checkboxes['Check Box 110'] = '/Yes'  # single
    ## Marriage contract
    checkboxes['Check Box 111'] = '/Yes'  # yes
    checkboxes['Check Box 112'] = '/Yes'  # no
    ## Surname was changed
    checkboxes['Check Box 113'] = '/Yes'  # yes
    checkboxes['Check Box 114'] = '/Yes'  # no
    # Children living together
    # first child
    checkboxes['Check Box 115'] = '/Yes'  # yes
    checkboxes['Check Box 116'] = '/Yes'  # no
    ## second child
    checkboxes['Check Box 117'] = '/Yes'  # yes
    checkboxes['Check Box 118'] = '/Yes'  # no
    ## third child
    checkboxes['Check Box 119'] = '/Yes'  # yes
    checkboxes['Check Box 120'] = '/Yes'  # no
    ############ Education
    checkboxes['Check Box 121'] = '/Yes'  # below secondary
    checkboxes['Check Box 122'] = '/Yes'  # secondary
    checkboxes['Check Box 123'] = '/Yes'  # secondary vocational
    checkboxes['Check Box 124'] = '/Yes'  # incomplete higher
    checkboxes['Check Box 125'] = '/Yes'  # higher
    checkboxes['Check Box 126'] = '/Yes'  # several higher degrees
    checkboxes['Check Box 127'] = '/Yes'  # additional higher
    checkboxes['Check Box 128'] = '/Yes'  # academic degree
    checkboxes['Check Box 129'] = '/Yes'  # MBA
    checkboxes['Check Box 130'] = '/Yes'  # other
    ################ Employment
    checkboxes['Check Box 131'] = '/Yes'  # is part of a salary project
    checkboxes['Check Box 132'] = '/Yes'  # is not
    ############ Place of work
    checkboxes['Check Box 133'] = '/Yes'  # probation period
    checkboxes['Check Box 134'] = '/Yes'  # not on probation
    checkboxes['Check Box 139'] = '/Yes'  # employed, permanent contract
    checkboxes['Check Box 140'] = '/Yes'  # employed, fixed-term contract
    checkboxes['Check Box 141'] = '/Yes'  # individual entrepreneur
    checkboxes['Check Box 142'] = '/Yes'  # business owner
    ###### Organisation's field of activity
    checkboxes['Check Box 144'] = '/Yes'  # army
    checkboxes['Check Box 145'] = '/Yes'  # IT
    checkboxes['Check Box 146'] = '/Yes'  # consulting
    checkboxes['Check Box 147'] = '/Yes'  # medicine
    checkboxes['Check Box 148'] = '/Yes'  # science
    checkboxes['Check Box 149'] = '/Yes'  # education
    checkboxes['Check Box 150'] = '/Yes'  # construction
    checkboxes['Check Box 151'] = '/Yes'  # wholesale / retail (original comment garbled; presumably trade)
    checkboxes['Check Box 152'] = '/Yes'  # government and administration bodies
    checkboxes['Check Box 153'] = '/Yes'  # security services
    checkboxes['Check Box 154'] = '/Yes'  # fuel and energy sector enterprises
    checkboxes['Check Box 155'] = '/Yes'  # industry and mechanical engineering
    checkboxes['Check Box 156'] = '/Yes'  # social sector
    checkboxes['Check Box 157'] = '/Yes'  # transport
    checkboxes['Check Box 158'] = '/Yes'  # tourism
    checkboxes['Check Box 159'] = '/Yes'  # services
    checkboxes['Check Box 160'] = '/Yes'  # finance, banks, insurance
    checkboxes['Check Box 161'] = '/Yes'  # other industries
    ###################### Number of staff
    checkboxes['Check Box 162'] = '/Yes'  # up to 10
    checkboxes['Check Box 163'] = '/Yes'  # 11-50
    checkboxes['Check Box 164'] = '/Yes'  # 51-100
    checkboxes['Check Box 165'] = '/Yes'  # 101-500
    checkboxes['Check Box 166'] = '/Yes'  # 501-1000
    checkboxes['Check Box 167'] = '/Yes'  # >1000
    ################ Age of the organisation
    checkboxes['Check Box 168'] = '/Yes'  # up to 2 years
    checkboxes['Check Box 169'] = '/Yes'  # from 2 to 5 years
    checkboxes['Check Box 170'] = '/Yes'  # over 5 years
    ################ Additional place of work
    checkboxes['Check Box 171'] = '/Yes'  # have
    checkboxes['Check Box 172'] = '/Yes'  # do not have
    ########## Funds (including the down payment)
    checkboxes['Check Box 17310'] = '/Yes'  # have
    checkboxes['Check Box 174'] = '/Yes'  # do not have
    #################### Car
    checkboxes['Check Box 175'] = '/Yes'  # yes
    checkboxes['Check Box 176'] = '/Yes'  # no
    ########## Real estate
    checkboxes['Check Box 1731011'] = '/Yes'  # yes
    checkboxes['Check Box 173'] = '/Yes'  # no
    ############ Grounds on which the property right arose
    checkboxes['Check Box 177'] = '/Yes'  # purchase
    checkboxes['Check Box 178'] = '/Yes'  # privatisation
    checkboxes['Check Box 179'] = '/Yes'  # inheritance
    checkboxes['Check Box 180'] = '/Yes'  # gift
    checkboxes['Check Box 181'] = '/Yes'  # other
    ############## Bankruptcy procedure
    checkboxes['Check Box 182'] = '/Yes'  # was applied
    checkboxes['Check Box 183'] = '/Yes'  # was not applied
    ################ Alimony obligations
    checkboxes['Check Box 184'] = '/Yes'  # yes
    checkboxes['Check Box 185'] = '/Yes'  # no
    ################ Do not edit. Accept the terms of the agreement
    checkboxes['Check Box 186'] = '/Yes'  # other
    checkboxes['Check Box 187'] = '/Yes'  # other
    checkboxes['Check Box 189'] = '/Yes'  # consent to personal data processing
    ###################### Representative
    checkboxes['Check Box 188'] = '/Yes'  # there is a representative

    ################ Client
    read['Text Field 470'] = 'stepen rodstva s zaemschikom'
    read['Text Field 471'] = client.last_name + ' ' + client.first_name + \
        ' ' + client.part_name
    read['Text Field 472'] = client.passport.gender  # male/female
    read['Text Field 474'] = client.snils  # 'snils'
    read['Text Field 476'] = client.inn  # 'INN'
    read['Text Field 477'] = 'index'
    read['Text Field 478'] = 'РФ'
    read['Text Field 479'] = 'oblast'
    read['Text Field 480'] = 'rayon'
    read['Text Field 481'] = client.address.city  # locality
    read['Text Field 482'] = client.address.street  # 'street'
    read['Text Field 483'] = client.address.buildingNumber  # 'number of home'
    read['Text Field 484'] = 'korpus'
    read['Text Field 485'] = client.address.flat  # 'flat'
    read['Text Field 486'] = 'phone'
    read['Text Field 487'] = 'home phone reg'
    read['Text Field 488'] = 'home phone prozhivanie'
    read['Text Field 489'] = 'work phone'
    read['Text Field 490'] = 'e-mail'
    if checkboxes['Check Box 113'] == '/Yes':  # whether the surname was changed
        read['Text Field 491'] = 'FIO'
        read['Text Field 492'] = 'god izmeneniya'
    ################## Children
    read['Text Field 493'] = 'data rozhdeniya 1go rebenka'
    read['Text Field 494'] = 'data rozhdeniya 2go rebenka'
    ################ Salary project
    if checkboxes['Check Box 131'] == '/Yes':
        read['Text Field 496'] = 'nomer karty'
    #################### Work
    if checkboxes['Check Box 136'] == '/Yes':
        read['Text Field 497'] = 's'
        read['Text Field 498'] = 'do'
    if checkboxes['Check Box 138'] == '/Yes':
        read['Text Field 499'] = '% buisness'
    read['Text Field 500'] = 'должность'
    read['Text Field 501'] = 'среднемесячный доход'
    read['Text Field 502'] = "стаж работы на текущем месте, лет"
    read['Text Field 50311'] = 'Стаж по профилю, лет'
    read['Text Field 50411'] = 'Общий стаж работы общий, лет'
    read['Text Field 50510'] = 'Название организации'
    read['Text Field 50610'] = 'инн организации'
    read['Text Field 50710'] = 'фактический адрес'
    read['Text Field 50810'] = 'телефон организации'
    read['Text Field 50910'] = 'добавочный номер'
    read['Text Field 5010'] = 'сайт организации'
    if checkboxes['Check Box 151'] == '/Yes':
        read['Text Field 505'] = 'сфера розничной торговли'
    if checkboxes['Check Box 159'] == '/Yes':
        read['Text Field 504'] = 'уточните сферу'
    if checkboxes['Check Box 161'] == '/Yes':
        read['Text Field 503'] = 'Уточните'
    ###################### Assets
    if checkboxes['Check Box 173'] == '/Yes':
        read['Text Field 5011'] = 'Наличные средства, сумма, руб'
        read['Text Field 5012'] = 'Банк №1'
        read['Text Field 5013'] = 'Банк №2'
        read['Text Field 5014'] = 'Сумма'
        read['Text Field 5015'] = 'Сумма'
    if checkboxes['Check Box 175'] == '/Yes':
        read['Text Field 5016'] = 'марка'
        read['Text Field 5017'] = 'год приобретения'
        read['Text Field 5018'] = 'стоимость по вашей оценке'
    if checkboxes['Check Box 1731011'] == '/Yes':
        read['Text Field 5019'] = 'Тип объекта недвижимости'
        read['Text Field 5020'] = 'Текущая рыночная стоимость(по вашей оценке)'
    if checkboxes['Check Box 181'] == '/Yes':
        read['Text Field 506'] = 'иное'
    ###################### Representative
    if checkboxes['Check Box 188'] == '/Yes':
        read['Text Field 5021'] = 'Фио представителя'
    ######################## Consent to personal data processing
    read['Text Field 5026'] = client.last_name + ' ' + client.first_name + \
        ' ' + client.part_name  # personal data

    outpt = open(p_file_path, 'wb')
    # NOTE: this local shadows the function's own name inside the body.
    write = PdfFileWriter()
    set_need_appearances_writer(write)
    # Copy every page, then apply the checkbox and text values to it.
    for i in range(reads.getNumPages()):
        write.addPage(reads.getPage(i))
        updateCheckboxValues(reads.getPage(i), checkboxes)
        write.updatePageFormFieldValues(reads.getPage(i), read)
    write.write(outpt)
    # NOTE(review): both handles stay open if anything above raises.
    inpt.close()
    outpt.close()
# this is the smallest example of a pdf I could find from the examples at # https://stackoverflow.com/questions/17279712/what-is-the-smallest-possible-valid-pdf. # it does _not_ successfully parse with pypdf2, but it might be enough to exercise the # code enough to get internal imports or caches initialized before the fork. IRL you # might like to use a real pdf for this. r = PdfFileReader( BytesIO( codecs.decode( b"255044462D312E0D747261696C65723C" b"3C2F526F6F743C3C2F50616765733C3C" b"2F4B6964735B3C3C2F4D65646961426F" b"785B302030203320335D3E3E5D3E3E3E" b"3E3E3E", "hex", ))) except PyPdfError: pass import sys from cpytraceafl import fuzz_from_here, crashing_excepthook fuzz_from_here(excepthook=crashing_excepthook) with open(sys.argv[1], "rb") as f: try: r = PdfFileReader(f) r.getFields() r.getXmpMetadata() except PyPdfError: pass
def get_form_fields(infile):
    """Return the form fields of a PDF as an ordered ``name -> value`` mapping.

    Args:
        infile: path of the PDF file to read.

    Returns:
        An ``OrderedDict`` mapping every field name reported by
        ``PdfFileReader.getFields()`` to its ``/V`` entry, or ``''`` when the
        field has no ``/V``. The mapping is empty when the document
        reports no form fields.
    """
    # The mapping is built while the file is still open; the with-block then
    # guarantees the handle is closed, including on parse errors.
    with open(infile, 'rb') as stream:
        fields = PdfFileReader(stream).getFields()
        # getFields() returns None when no form data can be located.
        if not fields:
            return OrderedDict()
        return OrderedDict((k, v.get('/V', '')) for k, v in fields.items())
def write(client, doc):
    """Fill the PDF template named by ``doc.file_name`` and save the result.

    The template is read from ``PDF_TEMPLATE_DIR``. The filled copy is written
    to ``PDF_GENERATED_RESULT_DIR`` under the name
    ``"{first_name} {last_name}_{doc.file_name}"``. Most text fields are still
    filled with placeholder captions; only the name, SNILS, INN and passport
    values are taken from ``client``.

    Args:
        client: object providing ``first_name``, ``last_name``, ``part_name``,
            ``snils``, ``inn`` and ``passport`` (with ``serial``, ``number``
            and ``_from``).
        doc: object whose ``file_name`` names the template file.
    """
    path_in_file = os.path.join(PDF_TEMPLATE_DIR, doc.file_name)
    clients_file_name = (str(client.first_name) + ' ' + str(client.last_name) +
                         '_' + str(doc.file_name))  # TODO: add date or time
    p_file_path = os.path.join(PDF_GENERATED_RESULT_DIR, clients_file_name)

    # The template must stay open until the writer has emitted the output,
    # so everything happens inside this with-block.
    with open(path_in_file, 'rb') as inpt:
        reads = PdfFileReader(inpt)
        read = reads.getFormTextFields()
        checkboxes = reads.getFields()

        checkboxes['chk0'] = '/Yes'
        # TODO: 'chk1' (co-borrower) and 'chk2' (guarantor) are not set yet;
        # when either is '/Yes', 'str0' should hold the borrower's surname.
        checkboxes['untitled9'] = '/Yes'  # flat
        checkboxes['untitled10'] = '/Yes'  # house with a land plot
        checkboxes['untitled11'] = '/Yes'  # apartments
        checkboxes['untitled12'] = '/Yes'  # townhouse
        checkboxes['untitled13'] = '/Yes'  # insurance
        checkboxes['untitled14'] = '/Yes'  # male
        checkboxes['untitled15'] = '/Yes'  # female

        read['str1'] = 'Сумма кредита'
        read['str2'] = 'срок кредита'
        read['str3'] = 'сумма первоначального взноса'
        read['str4'] = 'стоимость объекта'
        read['str5'] = 'запрашиваемая сумма top up'
        read['str6'] = 'регион приобретения недвижимости'
        read['str7'] = client.last_name
        read['str8'] = client.first_name
        read['str9'] = client.part_name
        read['str10'] = 'дата рождения'
        read['str11'] = 'гражданство'
        read['str12'] = 'место рождения'
        read['str13'] = client.snils  # SNILS
        read['str14'] = client.inn  # INN
        read['str15'] = 'фио при изменении'
        # passport series and number
        read['str16'] = client.passport.serial + ' ' + client.passport.number
        read['str17'] = 'дата выдачи'
        read['str18'] = 'код подразделения'
        read['str19'] = client.passport._from  # issuing authority
        read['str20'] = 'адрес регистрации'
        read['str21'] = 'адрес проживания'
        read['str22'] = 'мобильный'
        read['str23'] = 'регистрац'
        read['str24'] = 'тел жит'
        read['str25'] = 'email'
        read['str62'] = 'кол-во детей'
        read['str26'] = 'регистрац'

        # Named ``writer`` so the local no longer shadows this function's name.
        writer = PdfFileWriter()
        set_need_appearances_writer(writer)
        # NOTE(review): the last page is deliberately left out; the original
        # author marked this as "not sure why yet" - confirm before changing.
        for i in range(reads.getNumPages() - 1):
            page = reads.getPage(i)
            writer.addPage(page)
            updateCheckboxValues(page, checkboxes)
        # Text values are only ever applied to page 0, so this is done once
        # here instead of being repeated on every loop iteration.
        writer.updatePageFormFieldValues(reads.getPage(0), read)

        # Opened only once there is something to write, and always closed.
        with open(p_file_path, 'wb') as outpt:
            writer.write(outpt)
def return_infile():
    """Open the hard-coded H-15 PDF and return the pieces needed to inspect it.

    Returns:
        A 4-tuple ``(PdfFileReader, fields, infile, pdf)``: the reader class,
        the result of ``pdf.getFields()``, the path string and the reader
        instance.
    """
    from PyPDF2 import PdfFileReader
    # Raw string: the previous literal contained the invalid escape "\m" and a
    # doubled separator after "Users". Both spellings name the same file.
    infile = r"C:\Users\mark.nations\Desktop\H-15\h15.pdf"
    # The handle is intentionally left open: the reader is returned to the
    # caller and reads from this stream on demand.
    pdf = PdfFileReader(open(infile, "rb"), strict=False)
    fields = pdf.getFields()
    return PdfFileReader, fields, infile, pdf
def arrive(specificPath):
    """Submit one form per container described in a PDF through the browser.

    Reads the form fields of the PDF at ``specificPath`` and, for each
    container entry, fills and submits a web form through the module-level
    Selenium ``driver``. When the page reports "Equipment is already on
    Terminal", the work order and container number are appended to the
    module-level ``testfile``.

    Args:
        specificPath: path of the PDF file to read.
    """
    # NOTE(review): this handle is never closed in this function.
    pdfFileObj = open(specificPath, 'rb')
    pdfReader = PdfFileReader(pdfFileObj)
    fields = pdfReader.getFields()
    # print(len(fields)-15)
    # NOTE(review): the offset 15 presumably discounts the fields that are not
    # per-container groups - confirm against the PDF template.
    for i in range(len(fields) - 15):
        driver.switch_to_default_content()
        driver.switch_to_frame(
            driver.find_element_by_css_selector(
                "frame[src='portals/portal.asp']"))
        # Per-entry defaults. ``wo`` is not reset here, so it keeps the value
        # from a previous iteration when the current entry has none.
        containerNumber = ""
        size = ""
        weight = ""
        otherInfo = ""
        consignee = ""
        if i == 0:
            # The first entry is read from the top-level fields.
            # prefix = str(i) + "."
            wo = fields["WO"].value
            containerNumber = fields["Container Row1"].value
            size = fields["SizeRow1"].value
            weight = float(fields["Weight KGRow1"].value)
            otherInfo = fields["Other info"].value
            consignee = fields["Consignee"].value
        else:
            # Later entries are read from the kids of the field named str(i).
            for j in list(fields.keys()):
                if j == str(i):
                    for k in list(fields[j]["/Kids"]):
                        try:
                            if (k.getObject()['/T'] == "WO"):
                                wo = k.getObject()['/V']
                            elif (k.getObject()['/T'] == "Container Row1"):
                                containerNumber = k.getObject()['/V']
                            elif (k.getObject()['/T'] == "SizeRow1"):
                                size = k.getObject()['/V']
                            elif (k.getObject()['/T'] == "Weight KGRow1"):
                                weight = float(k.getObject()['/V'])
                            elif (k.getObject()['/T'] == "Other info"):
                                otherInfo = k.getObject()['/V']
                            elif (k.getObject()['/T'] == "Consignee"):
                                consignee = k.getObject()['/V']
                        except KeyError:
                            # Kids without /T or /V are skipped; the bare
                            # ``True`` is a no-op expression statement.
                            True
        # The container number is split into its first 4 characters (prefix)
        # and characters 4..10 (number).
        elem = driver.find_element_by_name("container_prefix_dof")
        elem.send_keys(containerNumber[:4])
        elem = driver.find_element_by_name("container_number_dof")
        elem.send_keys(containerNumber[4:11])
        select = Select(driver.find_element_by_name("ddlLoadStatus_dof"))
        select.select_by_visible_text("Load")
        select = Select(driver.find_element_by_name("lineid"))
        select.select_by_visible_text("Hapag-Lloyd Container Line")
        elem = driver.find_element_by_name("ddlSzTyCnt")
        elem.send_keys(size)
        elem = driver.find_element_by_name("cargo_weight")
        elem.send_keys(str(weight))
        select = Select(driver.find_element_by_name("ddWeightUnits"))
        select.select_by_visible_text("Kgs")
        elem = driver.find_element_by_id("CkbCR")
        elem.click()
        # "CkbFR" is clicked unless the consignee mentions LCBO / LIQUOR
        # CONTROL.
        # NOTE(review): the bare except also clicks "CkbFR" after ANY failure
        # in the try body (including a failed click) - confirm this is wanted.
        try:
            if not ("LCBO" in consignee or "LIQUOR CONTROL" in consignee):
                elem = driver.find_element_by_id("CkbFR")
                elem.click()
        except:
            elem = driver.find_element_by_id("CkbFR")
            elem.click()
        reservation = "import"
        if size == "20R86" or size == "40R96":
            # For these two size codes, append the text between
            # "Temperature: " and the first ".<digits> C" match, plus "c".
            # NOTE(review): if either search finds nothing, ``m``/``n`` is None
            # and the attribute access below raises AttributeError.
            m = re.search("Temperature: ", otherInfo)
            n = re.search(r"\.\d+ C", otherInfo[m.end():])
            # print(str(m.end()) + " " + str(n.start()))
            reservation += otherInfo[m.end():n.start() + m.end()] + "c"
        elem = driver.find_element_by_name("bkg_nbr_dof")
        elem.send_keys(reservation)
        select = Select(driver.find_element_by_name("Line"))
        select.select_by_visible_text("Hapag-Lloyd Container Line")
        elem = driver.find_element_by_name("Submit")
        elem.click()
        wait = WebDriverWait(driver, 10)
        # NOTE(review): the EC.element_to_be_clickable(...) object is only
        # constructed here, never called with the driver, so the second half
        # of the condition is satisfied as soon as the "Close" element can be
        # located - confirm whether a real clickability check was intended.
        wait.until(lambda driver: "Equipment is already on Terminal" in driver.
                   page_source or EC.element_to_be_clickable(
                       driver.find_element_by_name("Close")))
        if "Equipment is already on Terminal" in driver.page_source:
            # Log the rejected entry, then follow the VirtualArrive link in
            # the menu frame.
            f = open(testfile, "a+")
            f.write("WO: " + wo + " " + "Container: " + containerNumber + "\n")
            f.close()
            driver.switch_to_default_content()
            driver.switch_to_frame(
                driver.find_element_by_css_selector(
                    "frame[src='MenuNavFrame.asp?MenuID=10']"))
            # NOTE(review): this selector has no closing "]" - verify it is
            # accepted by the target browser.
            elem = driver.find_element_by_css_selector(
                'a[href*="Gate/VirtualArrive/VirtualArrive.asp"')
            elem.click()
        else:
            elem = driver.find_element_by_name("Close")
            elem.click()
def make_pdf(self):
    """Fill one form PDF per page of products and merge them into ``print.pdf``.

    For every entry of ``self.pages`` the values of ``self.details`` followed
    by each product's ``reference``, ``lot``, ``quantity`` and ``description``
    are upper-cased and assigned, in order, to the fields of
    ``pdf_templates/form.pdf``; fields left over are filled with ``""``.
    When ``self.checklist`` is truthy an end page is appended carrying the
    client's full name and the current date in the Australia/Brisbane time
    zone. These go into the third and fourth field when ``self.new`` is
    truthy, otherwise into the first and second. All generated pages are
    concatenated into ``../dynamic/print.pdf`` relative to ``THIS_FOLDER``.
    """
    #Import dependencies
    from PyPDF2 import PdfFileReader
    from datetime import datetime
    import os
    import pypdftk
    import pytz

    def read_field_names(template_path):
        # Return the template's field names, closing the file afterwards.
        with open(template_path, "rb") as stream:
            fields = PdfFileReader(stream).getFields()
            # getFields() returns None when the template has no form data.
            return list(fields or {})

    #The template is the same for every page, so it is parsed only once
    template_name = os.path.join(THIS_FOLDER, "./pdf_templates/form.pdf")
    field_names = read_field_names(template_name)

    pdf_pages = []
    #Cycle through pages
    for page in self.pages:
        #Start from a copy so self.details itself is never extended
        field_values = self.details[:]
        #Add values from each product on the page
        for product in page:
            field_values += [
                product.reference, product.lot, product.quantity,
                product.description
            ]
        #Pad out unused fields, zip into dict for writing
        field_values += [""] * (len(field_names) - len(field_values))
        field_dict = dict(
            zip(field_names, (value.upper() for value in field_values)))
        #Fill the template and remember the generated page
        pdf_pages.append(pypdftk.fill_form(template_name, field_dict))

    if self.checklist:
        end_form_template_name = os.path.join(
            THIS_FOLDER, "./pdf_templates/end_page.pdf")
        end_field_names = read_field_names(end_form_template_name)
        #Populate end field values with name and date, position depending on options
        end_field_values = [""] * 4
        index = 2 if self.new else 0
        tz = pytz.timezone("Australia/Brisbane")
        current_date = datetime.now(tz)
        #Replace exactly two slots; a one-slot slice would grow the list to 5
        end_field_values[index:index + 2] = [
            self.client.first_name + " " + self.client.last_name,
            current_date.strftime("%d/%m/%Y")
        ]
        #Zip end field values and names into dict
        end_field_dict = dict(zip(end_field_names, end_field_values))
        pdf_pages.append(
            pypdftk.fill_form(end_form_template_name, end_field_dict))

    pypdftk.concat(pdf_pages,
                   os.path.join(THIS_FOLDER, "../dynamic/print.pdf"))
def _print_file_banner(path):
    """Print the per-file heading shared by every supported file type."""
    print("[*] Metadatos del archivo: %s " % path)
    print('----------------------------------------------------------')


def _print_pdf_meta(path):
    """Print document info, basic properties and XMP metadata of a PDF."""
    xmp_attributes = ('dc_contributor', 'dc_identifier', 'dc_date',
                      'dc_source', 'dc_subject', 'xmp_modifyDate',
                      'xmp_metadataDate', 'xmpmm_documentId',
                      'xmpmm_instanceId', 'pdf_keywords', 'pdf_pdfversion')
    try:
        with open(path, 'rb') as stream:
            pdfFile = PdfFileReader(stream)
            # Guard against a missing document-info dictionary.
            docInfo = pdfFile.getDocumentInfo() or {}
            for metaItem in docInfo:
                print('[+]' + metaItem + ':' + str(docInfo[metaItem]))
            # (label, value) pairs. The values used to be dictionary KEYS,
            # which failed on the unhashable getFields() result and could
            # also collide (for example 1 page vs. isEncrypted == True).
            extras = [
                ('Numero de paginas: ', pdfFile.getNumPages()),
                ('Modo de la pagina: ', pdfFile.getPageMode()),
                ('Encriptacion: ', pdfFile.isEncrypted),
                ('Campos de texto: ', pdfFile.getFields()),
            ]
            for label, value in extras:
                if value is not None:
                    print('[+]/' + label + str(value))
            xmpinfo = pdfFile.getXmpMetadata()
            if xmpinfo is not None:
                # Every available attribute is reported; the former elif
                # chain stopped after the first one it found.
                for attr in xmp_attributes:
                    value = getattr(xmpinfo, attr, None)
                    if value is not None:
                        print('[+]/' + attr, value)
    except Exception as exc:
        # Best effort: an unreadable PDF must not stop the directory scan,
        # but the failure is reported instead of being silently dropped.
        print('[!]No se pudieron leer los metadatos:', exc)


def _print_image_meta(path):
    """Print the EXIF tags of an image, skipping thumbnail/maker-note tags."""
    with open(path, 'rb') as f:
        tags = exifread.process_file(f)
    if not tags:
        print('[!]No hay metadatos')
    for tag in tags.keys():
        if tag not in ('JPEGThumbnail', 'TIFFThumbnail', 'Filename',
                       'EXIF MakerNote'):
            print("[+]: %s, valor %s" % (tag, tags[tag]))


def _print_mp3_meta(path):
    """Print artist, album, title and track number of an MP3's ID3 tag."""
    tag = eyed3.id3.Tag()
    tag.parse(path)
    entries = [('Artista: ', tag.artist), ('Album: ', tag.album),
               ('Titulo: ', tag.title), ('Track: ', tag.track_num[0])]
    found = False
    for label, value in entries:
        if value is not None:
            print(label, value)
            found = True
    # Reported only when nothing at all was found; it used to depend on the
    # track number alone.
    if not found:
        print('[!]No hay metadatos')


def _print_docx_meta(path):
    """Print the core properties of a Word document."""
    core_properties = docx.Document(docx=path).core_properties
    # Same attributes, in the same order, as previously printed one by one
    # (``keywords`` was printed twice and still is).
    for attr in ('author', 'created', 'last_modified_by', 'last_printed',
                 'modified', 'revision', 'title', 'category', 'comments',
                 'identifier', 'keywords', 'language', 'subject', 'version',
                 'keywords', 'content_status'):
        print(getattr(core_properties, attr))


def printMeta(directorio):
    """Walk ``directorio`` and print the metadata of every supported file.

    Supported extensions (case-insensitive): ``pdf``; ``jpg`` and ``tiff``;
    ``mp3``; ``docx`` (the historical spelling ``docs`` is still accepted).
    An unreadable PDF is reported and skipped. Any other error stops the
    scan and is reported as "Unexpected error".

    Args:
        directorio: path of the directory tree to scan.
    """
    try:
        print('Analizando el directorio: ' + directorio, '\n')
        for dirpath, dirnames, files in os.walk(directorio):
            for name in files:
                ext = name.lower().rsplit('.', 1)[-1]
                path = dirpath + os.path.sep + name
                # Documents
                if ext == 'pdf':
                    _print_file_banner(path)
                    _print_pdf_meta(path)
                    print("\n")
                # Images
                elif ext in ('jpg', 'tiff'):
                    _print_file_banner(path)
                    _print_image_meta(path)
                    print("\n")
                # Music
                elif ext == 'mp3':
                    _print_file_banner(path)
                    _print_mp3_meta(path)
                # Word documents: 'docs' was almost certainly a typo for
                # 'docx', so both are accepted.
                elif ext in ('docx', 'docs'):
                    _print_file_banner(path)
                    _print_docx_meta(path)
        print('[+]Ejecucion finalizada')
    except (KeyboardInterrupt, SystemExit):
        print('[!]Se ha interrumpido la ejecucion')
    except Exception:
        print("Unexpected error:", sys.exc_info()[0])
def main():
    """Print the form fields of ``tt.pdf`` (read non-strictly)."""
    # open() replaces the Python 2-only file() builtin, and the with-block
    # closes the handle once the fields have been printed.
    with open('tt.pdf', 'rb') as stream:
        pdf = PdfFileReader(stream, strict=False)
        # Parenthesised so the call is valid on both Python 2 and Python 3.
        # NOTE(review): {''} is passed as getFields()'s ``tree`` argument;
        # it looks unintended - confirm whether a plain getFields() is meant.
        print(pdf.getFields({''}))
def write(client, doc):
    """Fill the PDF template named by ``doc.file_name`` and save the result.

    The template is read from ``PDF_TEMPLATE_DIR``. The filled copy is written
    to ``PDF_GENERATED_RESULT_DIR`` under the name
    ``"{first_name} {last_name}_{doc.file_name}"``. Every field is currently
    filled with a placeholder value; the trailing comments name the client
    attribute each one is meant to come from.

    Args:
        client: object providing ``first_name`` and ``last_name`` (used for
            the output file name).
        doc: object whose ``file_name`` names the template file.
    """
    path_in_file = os.path.join(PDF_TEMPLATE_DIR, doc.file_name)
    clients_file_name = (str(client.first_name) + ' ' + str(client.last_name) +
                         '_' + str(doc.file_name))  # TODO: add date or time
    p_file_path = os.path.join(PDF_GENERATED_RESULT_DIR, clients_file_name)

    # The template must stay open until the writer has emitted the output,
    # so everything happens inside this with-block.
    with open(path_in_file, 'rb') as inpt:
        reads = PdfFileReader(inpt)
        read = reads.getFormTextFields()
        checkboxes = reads.getFields()

        read['gText1'] = 'Наименование компании-партнёра'
        read['gText2'] = 'ФИО сотрудника компании-партнёра'
        read['gText3'] = '*****@*****.**'
        read['Text1'] = read['Text28'] = 'Фамилия'  # client.last_name
        read['Text2'] = 'Имя'  # client.first_name
        read['Text3'] = 'Отчество'  # client.part_name
        read['gNum1'] = 9379373737  # partner's phone number
        read['Num1'] = 99999  # requested loan amount
        read['Num2'] = 122  # loan term in months
        read['Num3'] = 99999  # preliminary price of the housing
        read['Text32'] = 'РФ'
        read['Text7'] = 'РФ'
        read['Text8'] = 'oblast'
        read['Text9'] = 'rayon'
        read['Num33'] = 'номер квартиры'  # client.address.flat
        read['Text33'] = 'ulitsa'  # client.address.street
        read['Num32'] = 123  # client.address.buildingNumber
        read['Text44'] = 44  # client.address building block (unconfirmed)
        read['Text35'] = 'gorod'  # client.address.city
        read['Num6'] = 433  # client.address.flat
        read['Num4'] = 443531  # client.address.index (unconfirmed)
        read['Num7'] = 9061264537  # landline phone
        read['email'] = '*****@*****.**'  # client.email
        read['Num14'] = 9061264536  # client.phone_number
        read['Text20'] = 'nameOfOrganiz'  # client.OrganizationInfo.full_name
        read['Text21'] = 'address_of_jobs'  # client.OrganizationInfo.address
        read['Num17'] = 'inn'  # client.OrganizationInfo.inn_number
        read['Num18'] = 45523455549  # client.OrganizationInfo.hr_number
        # client.OrganizationInfo.phoneJob (unconfirmed) - work phone
        read['Num19'] = 45523455548
        read['Num20'] = 99  # years of service in the organisation
        read['Num21'] = 11  # months of service in the organisation
        read['Num22'] = 24  # total years of service
        read['Num23'] = 11  # total months of service
        read['Num24'] = 555555  # client.AdditionalClientInfo.average_income
        read['Num25'] = 12222  # client.AdditionalClientInfo.aliment
        # client.AdditionalClientInfo.monetary_obligations
        read['Num26'] = 222222
        read['Num27'] = 3608  # client.passport.serial
        read['Num28'] = 128333  # client.passport.number
        read['Num29'] = 640  # str(client.passport.code_of)[:3]
        read['Num30'] = 128  # str(client.passport.code_of)[4:]
        read['Text31'] = ''  # left empty for now

        # Named ``writer`` so the local no longer shadows this function's name.
        writer = PdfFileWriter()
        set_need_appearances_writer(writer)
        # NOTE(review): the last page is deliberately left out; the original
        # author marked this as "not sure why yet" - confirm before changing.
        for i in range(reads.getNumPages() - 1):
            page = reads.getPage(i)
            writer.addPage(page)
            updateCheckboxValues(page, checkboxes)
            writer.updatePageFormFieldValues(page, read)

        # Write to the computed result path; this used to open the undefined
        # name ``out`` while ``p_file_path`` went unused.
        with open(p_file_path, 'wb') as outpt:
            writer.write(outpt)