def test_split(self):
    """Splitting produces one file per page plus a leading ``doc_data.txt`` entry."""
    expected_page_count = pypdftk.get_num_pages(TEST_PDF_PATH)
    produced = pypdftk.split(TEST_PDF_PATH)

    # One returned entry is not a page (the doc_data.txt report checked below).
    self.assertEqual(len(produced) - 1, expected_page_count)
    report_path = produced[0]
    self.assertTrue('doc_data.txt' in report_path)

    for produced_path in produced:
        self.assertTrue(os.path.exists(produced_path))
def test_split_output_dir(self):
    """Splitting with ``out_dir`` must create every returned file inside that directory."""
    import shutil

    output_dir = mkdtemp()
    # Do not leave the scratch directory behind, whether the test passes or fails.
    self.addCleanup(shutil.rmtree, output_dir, ignore_errors=True)

    total_pages = pypdftk.get_num_pages(TEST_PDF_PATH)
    paths = pypdftk.split(TEST_PDF_PATH, out_dir=output_dir)
    self.assertEqual(len(paths) - 1, total_pages)

    for p in paths:
        out_path = os.path.join(output_dir, os.path.basename(p))
        # Asserting on the path string itself is always true (non-empty str);
        # the point of the test is that the file exists in ``output_dir``.
        self.assertTrue(os.path.exists(out_path))
def add_custom(original_pdf, custom_pdf, output_pdf):
    """Stamp ``custom_pdf`` onto ``original_pdf`` and leave the result at ``output_pdf``.

    The stamped document is first written to ``output_pdf + '.aux'``.  For a
    single-page original that file is simply renamed to ``output_pdf``; for a
    multi-page original it is handed to ``pypdftk.add_custom`` together with
    the original and then deleted.
    """
    stamped_tmp = output_pdf + '.aux'
    pypdftk.stamp(original_pdf, custom_pdf, stamped_tmp)

    page_total = pypdftk.get_num_pages(original_pdf)
    if not page_total > 1:
        os.rename(stamped_tmp, output_pdf)
        return

    # NOTE(review): what pypdftk.add_custom does with these three paths is not
    # visible here - confirm against the pypdftk build in use.
    pypdftk.add_custom(stamped_tmp, original_pdf, output_pdf)
    os.unlink(stamped_tmp)
def admin_print(id):
    """Build the printable PDF for an application and mark it as printed.

    The rendered application is written to a temporary PDF, followed by
    ``3 - n_pages`` copies of the blank A4 page (none when the application
    already has three or more pages, because a list multiplied by a
    non-positive number is empty) and finally by ``protokol.pdf``.

    Args:
        id: Primary key passed to ``ApplicationForm.query.get``.

    Returns:
        A response carrying the concatenated PDF with an inline
        ``Content-Disposition`` header.
    """
    import pypdftk  # noqa
    import tempfile

    app = ApplicationForm.query.get(id)
    rendered = render_app(app, print=True)
    pdf = generate_pdf(rendered, options={'quiet': ''})

    # Leave the ``with`` block before pdftk touches the file so the bytes are
    # flushed and the handle is closed when the external tool reads it.
    with tempfile.NamedTemporaryFile(delete=False) as fp:
        fp.write(pdf)

    concated_file = None
    try:
        n_pages = pypdftk.get_num_pages(fp.name)
        n_blank_pages = 3 - n_pages
        blank_path = consts.DIR + '/data/blank_A4.pdf'

        paths_to_concat = [fp.name]
        paths_to_concat += [blank_path] * n_blank_pages
        paths_to_concat += [consts.DIR + '/data/protokol.pdf']

        # Called without an output path, so the returned path is a file
        # created for us; it is removed in the ``finally`` below.
        concated_file = pypdftk.concat(paths_to_concat)
        with open(concated_file, 'rb') as out_f:
            output = out_f.read()
    finally:
        # remove the temp files even when pdftk or the read fails
        os.unlink(fp.name)
        if concated_file is not None and os.path.exists(concated_file):
            os.unlink(concated_file)

    # mark the application as 'printed' in the DB
    app.state = ApplicationStates.printed
    app.printed_at = datetime.datetime.now()
    db.session.commit()

    response = make_response(output)
    response.headers['Content-Type'] = 'application/pdf'
    disposition = 'inline; filename=application_form_{}.pdf'.format(id)
    response.headers['Content-Disposition'] = disposition
    return response
def test_get_num_pages(self):
    """The fixture PDF must be reported as having 129 pages."""
    self.assertEqual(pypdftk.get_num_pages(TEST_PDF_PATH), 129)
def test_replace_page_at_end(self):
    """Replacing the final page must leave the page count unchanged."""
    replacement_pdf = 'test_files/page_01.pdf'
    pages_before = pypdftk.get_num_pages(TEST_PDF_PATH)
    # The page to replace is the last one, i.e. its number equals the page count.
    final_page_number = pypdftk.get_num_pages(TEST_PDF_PATH)

    pypdftk.replace_page(TEST_PDF_PATH, final_page_number, replacement_pdf)

    pages_after = pypdftk.get_num_pages(TEST_PDF_PATH)
    self.assertEqual(pages_before, pages_after)
def print_f99_pdftk_html(
    stamp_print="",
    paginate=False,
    begin_image_num=None,
    page_count=False,
    file_content=None,
    silent_print=False,
    filing_timestamp=None,
    rep_id=None,
    attachment_file_content=None,
):
    """Render an F99 filing to HTML, convert it to PDF and stamp page numbers.

    The JSON payload is taken from ``file_content`` when ``page_count`` is set,
    or when ``paginate``/``silent_print`` is set together with
    ``begin_image_num``; otherwise (and only when ``paginate`` is false) it is
    read from the uploaded ``json_file`` request part, in which case
    ``silent_print``, ``page_count``, ``begin_image_num`` and
    ``filing_timestamp`` are re-read from the request form.

    Args:
        stamp_print: Not used by this function body.
        paginate: When true, nothing is uploaded and the response additionally
            carries ``txn_img_json`` with the begin/end image numbers.
        begin_image_num: First image number; each F99 page gets the next one
            when ``silent_print`` is true.
        page_count: When true, nothing is uploaded; only ``total_pages`` is
            reported.
        file_content: JSON text of the filing.
        silent_print: When true, ``IMGNO`` is stamped on every page and the
            PDF is uploaded under ``rep_id``.
        filing_timestamp: Stamped on the first F99 page only.
        rep_id: Used to build the S3 key and URL in the ``silent_print`` case.
        attachment_file_content: Attachment payload for the
            ``paginate``/``page_count`` flows.

    Returns:
        ``(flask.jsonify(...), status_code)``: 200 for ``page_count`` or
        ``paginate``, 201 otherwise, 400 when no usable payload is present.
        Any exception is printed and turned into the value of ``error(...)``.
    """
    try:
        txn_img_num = begin_image_num

        use_file_content = (page_count and file_content) or (
            (paginate or silent_print) and file_content and begin_image_num)

        if not use_file_content and not (not paginate
                                         and "json_file" in request.files):
            # No payload at all: an empty message for the programmatic flows,
            # an explicit one for the plain upload flow.
            if paginate or page_count or silent_print:
                envelope = common.get_return_envelope(False, "")
            else:
                envelope = common.get_return_envelope(
                    False, "json_file is missing from your request")
            return flask.jsonify(**envelope), status.HTTP_400_BAD_REQUEST

        if use_file_content:
            # generate md5 for file_content
            json_file_md5 = md5_for_text(file_content)
            json_data = json.loads(file_content)
        else:
            json_file = request.files.get("json_file")
            silent_print = (request.form.get("silent_print")
                            or "").lower() in ["true", "1"]
            page_count = (request.form.get("page_count")
                          or "").lower() in ["true", "1"]

            if silent_print:
                txn_img_num = request.form.get("begin_image_num", None)
                if not txn_img_num:
                    if flask.request.method == "POST":
                        envelope = common.get_return_envelope(
                            "false",
                            "begin_image_num is missing from your request")
                        status_code = status.HTTP_400_BAD_REQUEST
                        return flask.jsonify(**envelope), status_code
                txn_img_num = int(txn_img_num)
                filing_timestamp = request.form.get("filing_timestamp", None)

            json_file_md5 = md5_for_file(json_file)
            json_file.stream.seek(0)
            # save json file as md5 file name
            request_file_path = current_app.config[
                "REQUEST_FILE_LOCATION"].format(json_file_md5)
            json_file.save(request_file_path)
            # load json file (closed deterministically instead of leaking the handle)
            with open(request_file_path) as request_fp:
                json_data = json.load(request_fp)

        md5_directory = current_app.config["OUTPUT_DIR_LOCATION"].format(
            json_file_md5)

        # if paginate or page_count is True and the directory already existed
        # then it is not removed at the end
        is_dir_exist = os.path.isdir(md5_directory)

        os.makedirs(md5_directory, exist_ok=True)
        if not os.path.exists(md5_directory + "images"):
            shutil.copytree("templates/forms/F99/images",
                            md5_directory + "images")
        shutil.copyfile("templates/forms/F99/form-text.css",
                        md5_directory + "form-text.css")

        infile = current_app.config["HTML_FORM_TEMPLATES_LOCATION"].format(
            "template")
        outfile = md5_directory + json_file_md5 + ".html"
        form99_json_data = json_data["data"]

        with open(infile) as inf:
            txt = inf.read()
        soup = bs4.BeautifulSoup(txt, features="html5lib")

        # Labels whose element id equals the JSON key they display.
        for field in ("committeeName", "street1", "street2", "city", "state",
                      "zipCode"):
            soup.find("label", attrs={
                "id": field
            }).string = form99_json_data[field]
        soup.find("span", attrs={
            "id": "committeeId"
        }).string = form99_json_data["committeeId"]

        # Full treasurer name: every non-empty name part, comma separated.
        treasurer_keys = [
            "treasurer" + part for part in
            ["LastName", "FirstName", "MiddleName", "Prefix", "Suffix"]
        ]
        treasurerFullName = ", ".join(form99_json_data[key]
                                      for key in treasurer_keys
                                      if form99_json_data.get(key))
        soup.find("label", attrs={
            "id": "treasurerFullName"
        }).string = treasurerFullName
        soup.find("label", attrs={
            "id": "treasurerName"
        }).string = ((form99_json_data.get("treasurerLastName", "") + ", " +
                      form99_json_data.get("treasurerFirstName", "")
                      ).strip().rstrip(",").strip())

        f99_html_data = form99_json_data["text"]
        soup.find("label", attrs={"id": "text"}).string = f99_html_data
        soup.find("label", attrs={
            "id": form99_json_data["reason"]
        }).string = "X"

        date_array = form99_json_data["dateSigned"].split("/")
        soup.find("span", attrs={
            "id": "dateSignedMonth"
        }).string = str(date_array[0])
        soup.find("span", attrs={
            "id": "dateSignedDate"
        }).string = str(date_array[1])
        soup.find("span", attrs={
            "id": "dateSignedYear"
        }).string = str(date_array[2])

        with open(outfile, "w") as output_file:
            # Assigning to ``.string`` stores the filing text as plain text, so
            # its markup is serialised as entities; turn them back into tags.
            # NOTE(review): the previous replace() calls mapped "<" to "<" and
            # ">" to ">" (no-ops); the entity names appear to have been lost -
            # confirm this is the intended unescaping.
            output_file.write(
                str(soup).replace("&lt;", "<").replace("&gt;", ">"))

        # F99 PDF page padding options
        options = {
            "margin-top": "0.40in",
            "margin-right": "0.20in",
            "margin-bottom": "0.40in",
            "margin-left": "0.20in",
        }
        f99_pdf_path = md5_directory + json_file_md5 + ".pdf"
        pdfkit.from_file(outfile, f99_pdf_path, options=options)
        total_no_of_pages = pypdftk.get_num_pages(f99_pdf_path)

        # checking if attachment_file exist
        has_attachment = (
            (paginate or page_count) and attachment_file_content) or (
                not paginate and "attachment_file" in request.files)

        if has_attachment:
            # reading Attachment title file
            attachment_title_file = current_app.config[
                "FORM_TEMPLATES_LOCATION"].format("Attachment_Title")

            if (paginate or page_count) and attachment_file_content:
                # NOTE(review): the result of json.loads() is later asked to
                # ``.save(...)`` like an uploaded file - confirm what shape
                # ``attachment_file_content`` really has.
                attachment_file = json.loads(attachment_file_content)
            else:
                attachment_file = request.files.get("attachment_file")

            attachment_temp_path = md5_directory + "attachment_temp.pdf"
            attachment_dir = md5_directory + "attachment"
            final_attachment_dir = md5_directory + "final_attachment"

            attachment_file.save(os.path.join(attachment_temp_path))
            os.makedirs(attachment_dir, exist_ok=True)
            os.makedirs(final_attachment_dir, exist_ok=True)
            pypdftk.split(attachment_temp_path, attachment_dir)
            os.remove(attachment_dir + "/doc_data.txt")

            attachment_no_of_pages = pypdftk.get_num_pages(
                os.path.join(attachment_temp_path))
            attachment_page_no = total_no_of_pages
            total_no_of_pages += attachment_no_of_pages

            # we are doing this to assign page numbers to attachment file;
            # os.listdir() order is arbitrary, so sort to number pages in order
            for filename in sorted(os.listdir(attachment_dir)):
                attachment_page_no += 1
                page_dict = {
                    "PAGESTR":
                    "PAGE " + str(attachment_page_no) + " / " +
                    str(total_no_of_pages)
                }
                if silent_print:
                    page_dict["IMGNO"] = txn_img_num + attachment_page_no

                title_page = (attachment_dir + "/attachment_page_" +
                              str(attachment_page_no) + ".pdf")
                # ``page_dict`` was previously omitted here, so the output path
                # was passed as the form data and the title page was never
                # written where stamp() below expects it.
                pypdftk.fill_form(attachment_title_file, page_dict,
                                  title_page)
                pypdftk.stamp(
                    attachment_dir + "/" + filename,
                    title_page,
                    final_attachment_dir + "/attachment_" +
                    str(attachment_page_no) + ".pdf",
                )

            pypdftk.concat(
                directory_files(final_attachment_dir + "/"),
                md5_directory + "attachment.pdf",
            )
            os.remove(attachment_temp_path)

        pages_dir = md5_directory + "pages"
        final_pages_dir = md5_directory + "final_pages"
        os.makedirs(pages_dir, exist_ok=True)
        os.makedirs(final_pages_dir, exist_ok=True)
        pypdftk.split(f99_pdf_path, pages_dir)
        os.remove(pages_dir + "/doc_data.txt")

        page_number_file = current_app.config[
            "FORM_TEMPLATES_LOCATION"].format("Page_Number")

        f99_page_no = 1
        # Sorted for the same reason as above: page N must get number N.
        for filename in sorted(os.listdir(pages_dir)):
            page_dict = {
                "PAGESTR":
                "PAGE " + str(f99_page_no) + " / " + str(total_no_of_pages)
            }
            if silent_print:
                page_dict["IMGNO"] = txn_img_num
                txn_img_num += 1
                # need to print timestamp on first page only
                if filing_timestamp and f99_page_no == 1:
                    page_dict["FILING_TIMESTAMP"] = filing_timestamp

            padded_page_no = str(f99_page_no).zfill(6)
            number_overlay = pages_dir + "/page_number_" + padded_page_no + ".pdf"
            pypdftk.fill_form(page_number_file, page_dict, number_overlay)
            pypdftk.stamp(
                number_overlay,
                pages_dir + "/" + filename,
                final_pages_dir + "/page_" + padded_page_no + ".pdf",
            )
            f99_page_no += 1

        # Written relative to the working directory, then moved/merged below.
        numbered_f99_path = json_file_md5 + "_temp.pdf"
        all_pages_path = md5_directory + "all_pages.pdf"
        pypdftk.concat(directory_files(final_pages_dir + "/"),
                       numbered_f99_path)

        if has_attachment:
            pypdftk.concat(
                [numbered_f99_path, md5_directory + "attachment.pdf"],
                all_pages_path,
            )
            shutil.rmtree(md5_directory + "attachment")
            shutil.rmtree(md5_directory + "final_attachment")
            os.remove(md5_directory + "attachment.pdf")
        else:
            shutil.move(numbered_f99_path, all_pages_path)

        # clean up task
        shutil.rmtree(pages_dir)
        shutil.rmtree(final_pages_dir)
        os.remove(f99_pdf_path)

        response = {
            "total_pages": total_no_of_pages,
        }

        if not page_count and not paginate:
            s3 = boto3.client("s3")
            extraArgs = {
                "ContentType": "application/pdf",
                "ACL": "public-read"
            }
            bucket = current_app.config["AWS_FECFILE_COMPONENTS_BUCKET_NAME"]

            if silent_print:
                response["pdf_url"] = current_app.config[
                    "S3_FILE_URL"] + rep_id + ".pdf"
                s3.upload_file(
                    all_pages_path,
                    bucket,
                    current_app.config["AWS_FECFILE_OUTPUT_DIRECTORY"] + "/" +
                    str(rep_id) + ".pdf",
                    ExtraArgs=extraArgs,
                )
            else:
                # A stray trailing comma used to make this a 1-tuple (a JSON
                # array once serialised); it is a plain URL string like the
                # silent_print branch.
                response["pdf_url"] = (
                    current_app.config["PRINT_OUTPUT_FILE_URL"].format(
                        json_file_md5) + "all_pages.pdf")
                s3.upload_file(
                    all_pages_path,
                    bucket,
                    all_pages_path,
                    ExtraArgs=extraArgs,
                )
        else:
            if not is_dir_exist:
                shutil.rmtree(md5_directory)
            if paginate:
                response["txn_img_json"] = {
                    "summary": {
                        "committeeId":
                        form99_json_data.get("committeeId", None),
                        "begin_image_num": begin_image_num,
                        "end_image_num": txn_img_num
                    }
                }

        envelope = common.get_return_envelope(data=response)
        status_code = (status.HTTP_200_OK
                       if page_count or paginate else status.HTTP_201_CREATED)
        return flask.jsonify(**envelope), status_code
    except Exception as e:
        traceback.print_exception(*sys.exc_info())
        return error("Error generating print preview, error message: " +
                     str(e))
def test_get_pages_single_range(self):
    """Extracting ``[[1], [2, 5]]`` yields as many pages as ``rangeCount`` reports."""
    requested_ranges = [[1], [2, 5]]
    extracted_pdf = pypdftk.get_pages(TEST_PDF_PATH, requested_ranges)
    actual_page_count = pypdftk.get_num_pages(extracted_pdf)
    expected_page_count = rangeCount(requested_ranges)
    self.assertEqual(expected_page_count, actual_page_count)
# Collect every *.pdf file found in the current directory or any directory
# below it.
myDir = "."
files = []
for root, dirnames, filenames in os.walk(myDir):
    files.extend(glob.glob(root + "/*.pdf"))

# Write one "~"-separated record per PDF: running number, path, page count and
# human-readable size.  The ``with`` block guarantees the report is closed even
# if pdftk fails on one of the files.  print() with a single argument behaves
# the same under Python 2 and Python 3.
with open('pdf_stats_' + timestamp + '.csv', 'w') as file_write:
    file_write.write("No~FileName~PageCount~Size~")
    file_write.write("\n")
    file_write.write("\n")
    print("\n\n")
    counter = 1
    for fname in files:
        # giving file path with the name of the file
        data_find = str(counter) + '~' + fname + '~' + str(
            pypdftk.get_num_pages(fname)) + '~' + humanize.naturalsize(
                os.path.getsize(fname)) + '~'
        print(data_find)  # echo the record that is written
        file_write.write(str(data_find))
        file_write.write("\n")
        counter = counter + 1

print("\nWrote the PDF stats to the file " + 'pdf_stats_' + timestamp + '.csv' + '\n\n')
# Gather the paths of all PDFs under the current directory (recursively).
myDir = "."
files = []
for root, dirnames, filenames in os.walk(myDir):
    files.extend(glob.glob(root + "/*.pdf"))

# The report has a header line, a blank line, then one "~"-separated record
# per PDF.  Opening it in a ``with`` block closes it even when reading a PDF
# raises; single-argument print() works on both Python 2 and Python 3.
with open('pdf_stats_' + timestamp + '.csv', 'w') as file_write:
    file_write.write("No~FileName~PageCount~Size~")
    file_write.write("\n")
    file_write.write("\n")
    print("\n\n")
    counter = 1
    for fname in files:
        # number ~ path ~ page count ~ human-readable size ~
        data_find = str(counter) + '~' + fname + '~' + str(pypdftk.get_num_pages(fname)) + '~' + humanize.naturalsize(os.path.getsize(fname)) + '~'
        print(data_find)  # echo the record that is written
        file_write.write(str(data_find))
        file_write.write("\n")
        counter = counter + 1

print("\nWrote the PDF stats to the file " + 'pdf_stats_' + timestamp + '.csv' + '\n\n')
def test_replace_page_at_middle(self):
    """Replacing page 3 must leave the page count unchanged."""
    replacement_pdf = 'test_files/page_01.pdf'
    pages_before = pypdftk.get_num_pages(TEST_PDF_PATH)

    pypdftk.replace_page(TEST_PDF_PATH, 3, replacement_pdf)

    pages_after = pypdftk.get_num_pages(TEST_PDF_PATH)
    self.assertEqual(pages_before, pages_after)
def test_concat(self):
    """Concatenating the fixture with itself three times triples the page count."""
    single_copy_pages = pypdftk.get_num_pages(TEST_PDF_PATH)
    sources = [TEST_PDF_PATH] * 3
    merged_pdf = pypdftk.concat(sources)
    merged_pages = pypdftk.get_num_pages(merged_pdf)
    self.assertEqual(single_copy_pages * 3, merged_pages)
def test_concat(self):
    """Three concatenated copies of the fixture have three times its pages."""
    copies = 3
    pages_per_copy = pypdftk.get_num_pages(TEST_PDF_PATH)
    joined_path = pypdftk.concat([TEST_PDF_PATH for _ in range(copies)])
    self.assertEqual(pages_per_copy * copies,
                     pypdftk.get_num_pages(joined_path))
def test_get_pages_clone(self):
    """``get_pages`` with an empty range list keeps the full page count."""
    source_pages = pypdftk.get_num_pages(TEST_PDF_PATH)
    no_ranges = []
    cloned_pdf = pypdftk.get_pages(TEST_PDF_PATH, no_ranges)
    cloned_pages = pypdftk.get_num_pages(cloned_pdf)
    self.assertEqual(source_pages, cloned_pages)
def print_f99_pdftk(stamp_print):
    """Fill the F99 PDF form from the uploaded ``json_file`` and publish it.

    The main page is filled from the JSON payload, one continuation page is
    appended per entry of ``additional_pages`` returned by
    ``split_f99_text_pages``, an optional ``attachment_file`` upload is
    page-numbered and appended, and the result is uploaded to S3.

    Args:
        stamp_print: Unless equal to ``'stamp'``, ``FILING_TIMESTAMP`` and
            ``IMGNO`` are blanked before the form is filled.

    Returns:
        ``(flask.jsonify(...), status_code)`` for POST requests: 201 with the
        ``pdf_url`` on success, 400 when ``json_file`` is missing.  For any
        other request method nothing is returned.
    """
    # check if json_file is in the request
    if 'json_file' not in request.files:
        if flask.request.method == "POST":
            envelope = common.get_return_envelope(
                'false', 'JSON file is missing from your request')
            status_code = status.HTTP_400_BAD_REQUEST
            return flask.jsonify(**envelope), status_code
        return None

    total_no_of_pages = 1
    page_no = 1
    json_file = request.files.get('json_file')
    # generate md5 for json file
    json_file_md5 = utils.md5_for_file(json_file)
    json_file.stream.seek(0)
    md5_directory = current_app.config['OUTPUT_DIR_LOCATION'].format(
        json_file_md5)
    os.makedirs(md5_directory, exist_ok=True)
    infile = current_app.config['FORM_TEMPLATES_LOCATION'].format('F99')

    # save json file as md5 file name
    request_file_path = current_app.config['REQUEST_FILE_LOCATION'].format(
        json_file_md5)
    json_file.save(request_file_path)
    outfile = md5_directory + json_file_md5 + '_temp.pdf'
    # read it back, closing the handle instead of leaking it
    with open(request_file_path) as request_fp:
        json_data = json.load(request_fp)

    # setting timestamp and imgno to empty as these needs to show up after submission
    if stamp_print != 'stamp':
        json_data['FILING_TIMESTAMP'] = ''
        json_data['IMGNO'] = ''

    f99_pages_text_json = json.loads(split_f99_text_pages(json_data))
    additional_pages = f99_pages_text_json['additional_pages']
    json_data['MISCELLANEOUS_TEXT'] = f99_pages_text_json['main_page']
    total_no_of_pages += len(additional_pages)

    has_attachment = 'attachment_file' in request.files

    # checking if attachment_file exist
    if has_attachment:
        # reading Attachment title file
        attachment_title_file = current_app.config[
            'FORM_TEMPLATES_LOCATION'].format('Attachment_Title')
        attachment_file = request.files.get('attachment_file')

        attachment_temp_path = md5_directory + 'attachment_temp.pdf'
        attachment_dir = md5_directory + 'attachment'
        final_attachment_dir = md5_directory + 'final_attachment'

        attachment_file.save(os.path.join(attachment_temp_path))
        os.makedirs(attachment_dir, exist_ok=True)
        os.makedirs(final_attachment_dir, exist_ok=True)
        pypdftk.split(attachment_temp_path, attachment_dir)
        os.remove(attachment_dir + '/doc_data.txt')

        attachment_no_of_pages = pypdftk.get_num_pages(
            os.path.join(attachment_temp_path))
        attachment_page_no = total_no_of_pages
        total_no_of_pages += attachment_no_of_pages

        # we are doing this to assign page numbers to attachment file;
        # os.listdir() returns names in arbitrary order, so sort them to give
        # the pages their numbers in document order
        for filename in sorted(os.listdir(attachment_dir)):
            attachment_page_no += 1
            title_page = (attachment_dir + '/attachment_page_' +
                          str(attachment_page_no) + '.pdf')
            pypdftk.fill_form(
                attachment_title_file, {
                    "PAGESTR":
                    "PAGE " + str(attachment_page_no) + " / " +
                    str(total_no_of_pages)
                }, title_page)
            pypdftk.stamp(
                attachment_dir + '/' + filename, title_page,
                final_attachment_dir + '/attachment_' +
                str(attachment_page_no) + '.pdf')

        pypdftk.concat(directory_files(final_attachment_dir + '/'),
                       md5_directory + 'attachment.pdf')
        os.remove(attachment_temp_path)
        shutil.rmtree(attachment_dir)
        shutil.rmtree(final_attachment_dir)

    json_data['PAGESTR'] = "PAGE " + str(page_no) + " / " + str(
        total_no_of_pages)
    pypdftk.fill_form(infile, json_data, outfile, flatten=False)

    additional_page_counter = 0
    if len(additional_pages) > 0:
        continuation_file = current_app.config[
            'FORM_TEMPLATES_LOCATION'].format('F99_CONT')
        merged_tmp = md5_directory + json_file_md5 + '_all_pages_temp.pdf'
        os.makedirs(md5_directory + 'merge', exist_ok=True)

        for additional_page in additional_pages:
            page_no += 1
            continuation_outfile = md5_directory + 'merge/' + str(
                additional_page_counter) + '.pdf'
            pypdftk.fill_form(
                continuation_file, {
                    "PAGESTR":
                    "PAGE " + str(page_no) + " / " + str(total_no_of_pages),
                    "CONTINOUS_TEXT":
                    additional_page[str(additional_page_counter)]
                }, continuation_outfile)
            # Grow ``outfile`` by one page: merge into a scratch file, then
            # copy the scratch file back over ``outfile``.
            pypdftk.concat([outfile, continuation_outfile], merged_tmp)
            shutil.copy(merged_tmp, outfile)
            additional_page_counter += 1

        os.remove(merged_tmp)

    # Add the F99 attachment
    all_pages_path = md5_directory + 'all_pages.pdf'
    if has_attachment:
        pypdftk.concat([outfile, md5_directory + 'attachment.pdf'],
                       all_pages_path)
        os.remove(md5_directory + 'attachment.pdf')
    else:
        shutil.copy(outfile, all_pages_path)
    os.remove(md5_directory + json_file_md5 + '_temp.pdf')

    # push output file to AWS
    s3 = boto3.client('s3')
    s3.upload_file(
        all_pages_path,
        current_app.config['AWS_FECFILE_COMPONENTS_BUCKET_NAME'],
        all_pages_path,
        ExtraArgs={
            'ContentType': "application/pdf",
            'ACL': "public-read"
        })

    response = {
        'pdf_url':
        current_app.config['PRINT_OUTPUT_FILE_URL'].format(json_file_md5) +
        'all_pages.pdf'
    }

    if flask.request.method == "POST":
        envelope = common.get_return_envelope(data=response)
        status_code = status.HTTP_201_CREATED
        return flask.jsonify(**envelope), status_code
def test_num_pages():
    """Print the page count pdftk reports for ``./Out/some_file.pdf``."""
    input_file = "./Out/some_file.pdf"
    num_pgs = pypdftk.get_num_pages(input_file)
    # print() with one argument is valid on Python 2 and Python 3 alike;
    # the bare print statement is a syntax error on Python 3.
    print(num_pgs)
def print_f99_pdftk_html(stamp_print):
    """Render the uploaded F99 JSON to HTML, convert it to PDF and publish it.

    The HTML template is filled from ``json_data['data']``, converted with
    pdfkit, split into pages, each page is stamped with ``PAGE n / total``,
    an optional ``attachment_file`` upload is numbered and appended, and the
    result is uploaded to S3.

    Args:
        stamp_print: Not used by this function body.

    Returns:
        ``(flask.jsonify(...), status_code)`` for POST requests: 201 with the
        ``pdf_url`` on success, 400 when ``json_file`` is missing.  For any
        other request method nothing is returned.
    """
    # check if json_file is in the request
    if 'json_file' not in request.files:
        if flask.request.method == "POST":
            envelope = common.get_return_envelope(
                'false', 'JSON file is missing from your request')
            status_code = status.HTTP_400_BAD_REQUEST
            return flask.jsonify(**envelope), status_code
        return None

    json_file = request.files.get('json_file')
    # generate md5 for json file
    json_file_md5 = utils.md5_for_file(json_file)
    json_file.stream.seek(0)
    md5_directory = current_app.config['OUTPUT_DIR_LOCATION'].format(
        json_file_md5)
    os.makedirs(md5_directory, exist_ok=True)
    if not os.path.exists(md5_directory + "images"):
        shutil.copytree("templates/forms/F99/images",
                        md5_directory + "images")
    shutil.copyfile("templates/forms/F99/form-text.css",
                    md5_directory + "form-text.css")

    infile = current_app.config['HTML_FORM_TEMPLATES_LOCATION'].format(
        'template')
    request_file_path = current_app.config['REQUEST_FILE_LOCATION'].format(
        json_file_md5)
    json_file.save(request_file_path)
    outfile = md5_directory + json_file_md5 + '.html'
    # read the saved payload back, closing the handle instead of leaking it
    with open(request_file_path) as request_fp:
        json_data = json.load(request_fp)
    form99_json_data = json_data['data']

    # load the file
    with open(infile) as inf:
        txt = inf.read()
    soup = bs4.BeautifulSoup(txt)

    # Labels whose element id equals the JSON key they display.
    for field in ('committeeName', 'street1', 'street2', 'city', 'state',
                  'zipCode'):
        soup.find('label', attrs={'id': field}).string = form99_json_data[field]
    soup.find('span', attrs={
        'id': 'committeeId'
    }).string = form99_json_data['committeeId']

    soup.find('label', attrs={'id': 'treasurerFullName'}).string = (
        form99_json_data['treasurerLastName']
        + ', ' + form99_json_data['treasurerFirstName']
        + ', ' + form99_json_data['treasurerMiddleName']
        + ', ' + form99_json_data['treasurerPrefix']
        + ', ' + form99_json_data['treasurerSuffix'])
    soup.find('label', attrs={'id': 'treasurerName'}).string = (
        form99_json_data['treasurerLastName']
        + ', ' + form99_json_data['treasurerFirstName'])

    f99_html_data = form99_json_data['text']
    soup.find('label', attrs={'id': 'text'}).string = f99_html_data
    soup.find('label', attrs={
        'id': form99_json_data['reason']
    }).string = 'X'

    date_array = form99_json_data['dateSigned'].split("/")
    soup.find('span', attrs={
        'id': 'dateSignedMonth'
    }).string = str(date_array[0])
    soup.find('span', attrs={
        'id': 'dateSignedDate'
    }).string = str(date_array[1])
    soup.find('span', attrs={
        'id': 'dateSignedYear'
    }).string = str(date_array[2])

    with open(outfile, "w") as output_file:
        # Assigning to ``.string`` stores the filing text as plain text, so
        # its markup is serialised as entities; turn them back into tags.
        # NOTE(review): the previous replace() calls mapped "<" to "<" and
        # ">" to ">" (no-ops); the entity names appear to have been lost -
        # confirm this is the intended unescaping.
        output_file.write(
            str(soup).replace("&lt;", "<").replace("&gt;", ">"))

    # F99 PDF page padding options
    options = {
        'margin-top': '0.36in',
        'margin-right': '0.25in',
        'margin-bottom': '0.39in',
        'margin-left': '0.25in'
    }
    f99_pdf_path = md5_directory + json_file_md5 + '.pdf'
    pdfkit.from_file(outfile, f99_pdf_path, options=options)

    total_no_of_pages = pypdftk.get_num_pages(f99_pdf_path)
    page_number_file = current_app.config['FORM_TEMPLATES_LOCATION'].format(
        'Page_Number')

    has_attachment = 'attachment_file' in request.files

    # checking if attachment_file exist
    if has_attachment:
        # reading Attachment title file
        attachment_title_file = current_app.config[
            'FORM_TEMPLATES_LOCATION'].format('Attachment_Title')
        attachment_file = request.files.get('attachment_file')

        attachment_temp_path = md5_directory + 'attachment_temp.pdf'
        attachment_dir = md5_directory + 'attachment'
        final_attachment_dir = md5_directory + 'final_attachment'

        attachment_file.save(os.path.join(attachment_temp_path))
        os.makedirs(attachment_dir, exist_ok=True)
        os.makedirs(final_attachment_dir, exist_ok=True)
        pypdftk.split(attachment_temp_path, attachment_dir)
        os.remove(attachment_dir + '/doc_data.txt')

        attachment_no_of_pages = pypdftk.get_num_pages(
            os.path.join(attachment_temp_path))
        attachment_page_no = total_no_of_pages
        total_no_of_pages += attachment_no_of_pages

        # we are doing this to assign page numbers to attachment file;
        # os.listdir() returns names in arbitrary order, so sort them to give
        # the pages their numbers in document order
        for filename in sorted(os.listdir(attachment_dir)):
            attachment_page_no += 1
            title_page = (attachment_dir + '/attachment_page_' +
                          str(attachment_page_no) + '.pdf')
            pypdftk.fill_form(
                attachment_title_file, {
                    "PAGESTR":
                    "PAGE " + str(attachment_page_no) + " / " +
                    str(total_no_of_pages)
                }, title_page)
            pypdftk.stamp(
                attachment_dir + '/' + filename, title_page,
                final_attachment_dir + '/attachment_' +
                str(attachment_page_no) + '.pdf')

        pypdftk.concat(directory_files(final_attachment_dir + '/'),
                       md5_directory + 'attachment.pdf')
        os.remove(attachment_temp_path)

    pages_dir = md5_directory + 'pages'
    final_pages_dir = md5_directory + 'final_pages'
    os.makedirs(pages_dir, exist_ok=True)
    os.makedirs(final_pages_dir, exist_ok=True)
    pypdftk.split(f99_pdf_path, pages_dir)
    os.remove(pages_dir + '/doc_data.txt')

    f99_page_no = 1
    # Sorted so that page N of the document is stamped with number N.
    for filename in sorted(os.listdir(pages_dir)):
        # Zero-padded so that name order equals page order beyond page 9
        # (presumably what directory_files() relies on - verify there).
        padded_page_no = str(f99_page_no).zfill(6)
        number_overlay = pages_dir + '/page_number_' + padded_page_no + '.pdf'
        pypdftk.fill_form(
            page_number_file, {
                "PAGESTR":
                "PAGE " + str(f99_page_no) + " / " + str(total_no_of_pages)
            }, number_overlay)
        pypdftk.stamp(number_overlay, pages_dir + '/' + filename,
                      final_pages_dir + '/page_' + padded_page_no + '.pdf')
        f99_page_no += 1

    # Written relative to the working directory, then moved/merged below.
    numbered_f99_path = json_file_md5 + '_temp.pdf'
    all_pages_path = md5_directory + 'all_pages.pdf'
    pypdftk.concat(directory_files(final_pages_dir + '/'), numbered_f99_path)

    if has_attachment:
        pypdftk.concat([numbered_f99_path, md5_directory + 'attachment.pdf'],
                       all_pages_path)
        shutil.rmtree(md5_directory + 'attachment')
        shutil.rmtree(md5_directory + 'final_attachment')
        os.remove(md5_directory + 'attachment.pdf')
    else:
        shutil.move(numbered_f99_path, all_pages_path)

    # clean up task
    shutil.rmtree(pages_dir)
    shutil.rmtree(final_pages_dir)
    os.remove(f99_pdf_path)

    # push output file to AWS
    s3 = boto3.client('s3')
    s3.upload_file(
        all_pages_path,
        current_app.config['AWS_FECFILE_COMPONENTS_BUCKET_NAME'],
        all_pages_path,
        ExtraArgs={
            'ContentType': "application/pdf",
            'ACL': "public-read"
        })

    response = {
        'pdf_url':
        current_app.config['PRINT_OUTPUT_FILE_URL'].format(json_file_md5) +
        'all_pages.pdf'
    }

    if flask.request.method == "POST":
        envelope = common.get_return_envelope(data=response)
        status_code = status.HTTP_201_CREATED
        return flask.jsonify(**envelope), status_code
def main():
    """Download every completed applicant's PDF and record which ones changed.

    Works inside a directory named after ``jobid``.  The pickle file
    ``applicants.<jobid>.pkl`` maps each user id to the page count seen on the
    previous run; an application whose freshly downloaded PDF has the same
    (positive) page count is treated as unchanged and its download is deleted.
    All others are passed to ``update_applicants`` and the record is rewritten.
    """
    jobfile = "applicants.%s.pkl" % jobid
    try:
        os.mkdir(jobid)
    except OSError:
        print("jobid dir already exists")
    chdir(jobid)

    # app_record persists across runs; ``updates`` only covers this run.
    try:
        with open(jobfile, "rb") as f:
            # The pickle is the file this function writes at the end of a run.
            app_record = pickle.load(f)
    except IOError:
        app_record = {}
    updates = {}

    cookie = 'Cookie: ' + curlline
    getthisurl = list_url % jobid
    curlargs = ["curl", "-s", getthisurl, "-H", cookie]
    h = check_output(curlargs)

    for name, url, pdfurl in completed_applicants(h):
        if url in broken:
            print("SKIPING BROKE APP")
            continue

        qs = urlparse.urlparse(url).query
        uid = urlparse.parse_qs(qs)['userID'][0]
        fname = "%s.%s.%s.pdf" % (jobid, uid, ts)

        # need to submit this form to receive the concatenated PDFs
        br = mechanize.Browser()
        br.addheaders = [('Cookie', curlline)]
        br.set_handle_robots(False)
        br.open(pdfurl, timeout=120.0)
        br.select_form(nr=0)
        response = br.submit()
        file_content = response.read()
        with open(fname, 'wb') as f:
            f.write(file_content)

        # accumulate updates for full pass on applicants
        try:
            num_pages = pypdftk.get_num_pages(fname)
        except Exception:
            # An unreadable download counts as "changed" (-1 never matches a
            # stored count).  ``Exception`` rather than a bare ``except`` so
            # Ctrl-C and SystemExit still stop the run.
            num_pages = -1

        if num_pages > 0 and app_record.get(uid, 0) == num_pages:
            print("%s (%s) unchanged number of pages, skipping" % (uid, name))
            os.unlink(fname)
        else:
            print("%s (%s) new/changed" % (uid, name))
            print("oldnum: %d\nnewnum: %d" % (app_record.get(uid, 0), num_pages))
            updates[url] = (name, uid, fname)
            app_record[uid] = num_pages

    update_applicants(updates)
    with open(jobfile, "wb") as f:
        pickle.dump(app_record, f)