import boto3
import ulid
from PyPDF2 import PdfFileReader, PdfFileWriter


def merge_pdfs(link_infos, output):
    pdf_writer = PdfFileWriter()

    # Add the parent PDF first; its first page supplies the dimensions
    # that add_link needs to place the link rectangles
    pdf_reader = PdfFileReader("parent.pdf")
    parent_height = pdf_reader.getPage(0).mediaBox[3]
    parent_width = pdf_reader.getPage(0).mediaBox[2]
    page_count = pdf_reader.getNumPages()
    for page in range(page_count):
        pdf_writer.addPage(pdf_reader.getPage(page))

    # Add the children PDFs, recording the page each child starts on so the
    # parent's links can point at it; children that fail to open are skipped
    # and remembered so we do not link to them later
    bad_children = []
    for link_info in link_infos:
        try:
            pdf_reader = PdfFileReader(link_info["id"])
        except Exception:
            bad_children.append(link_info["id"])
            continue
        link_info["pgTo"] = page_count
        page_count += pdf_reader.getNumPages()
        for page in range(pdf_reader.getNumPages()):
            # Add each page to the writer object
            pdf_writer.addPage(pdf_reader.getPage(page))

    # Drop the link annotations carried over from the source PDFs,
    # then add our own parent-to-child links
    pdf_writer.removeLinks()
    for link_info in link_infos:
        if link_info["id"] in bad_children:
            continue
        try:
            add_link(pdf_writer, link_info["pgNum"], link_info["pgTo"],
                     link_info["coords"], parent_height, parent_width, True)
        except Exception:
            print(f"Failed to add a link for {link_info}")

    # Write out the merged PDF
    with open(output, "wb") as out:
        pdf_writer.write(out)
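
# A minimal usage sketch, assuming "parent.pdf" and "child.pdf" exist on
# disk and that add_link is defined elsewhere in this module. The dict keys
# ("id", "pgNum", "coords") mirror the lookups above; the page index and
# link rectangle below are hypothetical values, not taken from the real
# pipeline.
#
# merge_pdfs(
#     [{"id": "child.pdf", "pgNum": 0, "coords": (72, 640, 300, 660)}],
#     "merged.pdf",
# )
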
def handler(event, context):
    # Extract the links and the S3 URIs of the PDFs from the step function input
    print("Extracting URIs")
    html_pdf_uri = event["htmlPdfUri"] + ".pdf"
    url_pdf_uris = [x + ".pdf" for x in event["urlPdfUris"]]
    links = event["links"]

    # Prep the S3 bucket names and client
    s3_client = boto3.client("s3")
    html_bucket = "html-pdfs"
    url_bucket = "url-pdfs"
    merged_bucket = "merged-pdfs"

    # Download the PDFs from S3 into a buffer, then into a file
    # TODO remove write to file
    print("Downloading HTML PDF")
    html_pdf_obj = s3_client.get_object(Bucket=html_bucket, Key=html_pdf_uri)
    html_pdf_bytes = html_pdf_obj["Body"].read()
    with open("/tmp/html_pdf_file.pdf", "w+b") as f_obj:
        f_obj.write(html_pdf_bytes)

    print("Downloading URL PDFs")
    for i, url_pdf_uri in enumerate(url_pdf_uris):
        url_pdf_obj = s3_client.get_object(Bucket=url_bucket, Key=url_pdf_uri)
        url_pdf_bytes = url_pdf_obj["Body"].read()
        url_pdf_filename = f"/tmp/url_pdf_file_{i}.pdf"
        add_to_links(links, {
            "url_pdf_filename": url_pdf_filename,
            "url_pdf_uri": url_pdf_uri,
        })
        with open(url_pdf_filename, "w+b") as f_obj:
            f_obj.write(url_pdf_bytes)

    # Find the root coordinates of where to place the links
    print("Finding link coordinates")
    find_links("/tmp/html_pdf_file.pdf", links)

    # Add the PDFs to the writer object, starting with the HTML PDF;
    # its first page supplies the dimensions used to place the links
    print("Merging in HTML PDF")
    pdf_writer = PdfFileWriter()
    pdf_reader = PdfFileReader("/tmp/html_pdf_file.pdf")
    height = pdf_reader.getPage(0).mediaBox[3]
    width = pdf_reader.getPage(0).mediaBox[2]
    page_count = pdf_reader.getNumPages()
    for page in range(page_count):
        pdf_writer.addPage(pdf_reader.getPage(page))

    # Now add the URL PDFs, recording the page each one starts on
    print("Merging in URL PDFs")
    for link in links:
        pdf_reader = PdfFileReader(link["url_pdf_filename"])
        link["pg_to"] = page_count
        page_count += pdf_reader.getNumPages()
        for page in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(page))

    # Drop the link annotations carried over from the source PDFs,
    # then add our own links to the merged PDF
    pdf_writer.removeLinks()
    print("Linking links")
    for link in links:
        try:
            add_link(pdf_writer, link["pg_num"], link["pg_to"],
                     link["coords"], height, width, True)
        except Exception:
            print(f"Failed to add a link for {link}")

    # Save the merged PDF to a local file
    print("Saving merged pdf to S3")
    with open("/tmp/merged-pdf.pdf", "wb") as out:
        pdf_writer.write(out)

    # Generate a ULID for the merged PDF's filename
    merged_name = ulid.new().str + ".pdf"

    # Upload the PDF to S3
    with open("/tmp/merged-pdf.pdf", "rb") as pdf:
        s3_client.put_object(Bucket=merged_bucket, Key=merged_name, Body=pdf)

    return {
        "status": 201,
        "message": "created",
    }
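
# A minimal sketch for exercising the handler locally, assuming AWS
# credentials are configured and the html-pdfs/url-pdfs buckets hold the
# referenced objects. The ULID-style keys and the empty links list are
# hypothetical placeholders; in production the event arrives from the step
# function with links already populated by the earlier pipeline steps.
if __name__ == "__main__":
    sample_event = {
        "htmlPdfUri": "01ARZ3NDEKTSV4RRFFQ69G5FAV",
        "urlPdfUris": ["01ARZ3NDEKTSV4RRFFQ69G5FB1"],
        "links": [],
    }
    print(handler(sample_event, None))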