예제 #1
0
def merge_pdfs(link_infos, output):
    """Merge ``parent.pdf`` with its child PDFs and link the parent to them.

    The pages of ``parent.pdf`` (opened relative to the current working
    directory) are written first, followed by the pages of every child PDF
    that could be opened, in the order given by ``link_infos``. Existing
    links are then removed from the merged document with ``removeLinks()``
    and one link per readable child is added through ``add_link``.

    Args:
        link_infos: Re-iterable collection (it is traversed twice) of dicts.
            Each dict is read for ``"id"`` (handed to ``PdfFileReader``),
            ``"pgNum"`` and ``"coords"`` (both forwarded to ``add_link``).
            For every child that opens successfully the key ``"pgTo"`` is
            written into the dict: the zero-based index, in the merged
            output, of that child's first page.
        output: Path of the merged PDF file to write.

    Children whose ``PdfFileReader`` construction fails are skipped and get
    no link. A failure inside ``add_link`` is reported on stdout and does
    not abort the merge.
    """
    pdf_writer = PdfFileWriter()

    # Add parent PDF
    parent_reader = PdfFileReader("parent.pdf")
    first_page_box = parent_reader.getPage(0).mediaBox
    parent_height = first_page_box[3]
    parent_width = first_page_box[2]
    page_count = parent_reader.getNumPages()
    for page in range(page_count):
        pdf_writer.addPage(parent_reader.getPage(page))

    # Add children PDFs
    # A set gives O(1) membership tests below; ids are expected to be
    # hashable (they are used as PdfFileReader sources, e.g. file names).
    bad_children = set()
    for link_info in link_infos:
        try:
            child_reader = PdfFileReader(link_info["id"])
        except Exception:
            # Best effort: an unreadable child is skipped, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            bad_children.add(link_info["id"])
            continue

        # The child's first page lands right after everything added so far.
        link_info["pgTo"] = page_count
        child_pages = child_reader.getNumPages()
        page_count += child_pages
        for page in range(child_pages):
            # Add each page to the writer object
            pdf_writer.addPage(child_reader.getPage(page))

    # Strip the links carried over from the source PDFs before adding the
    # new ones.
    pdf_writer.removeLinks()

    for link_info in link_infos:
        if link_info["id"] in bad_children:
            continue
        try:
            # NOTE(review): the meaning of the trailing ``True`` depends on
            # add_link, which is defined elsewhere - confirm.
            add_link(pdf_writer, link_info["pgNum"], link_info["pgTo"],
                     link_info["coords"], parent_height, parent_width, True)
        except Exception:
            print("Failed to add a link for {}".format(link_info))

    # Write out the merged PDF
    with open(output, 'wb') as out:
        pdf_writer.write(out)
예제 #2
0
def handler(event, context):
    """Merge an HTML-derived PDF with its linked URL PDFs and upload it.

    Steps, in order:
      1. Download the HTML PDF and every URL PDF from S3 into ``/tmp``.
      2. Call ``find_links`` on the HTML PDF with the event's links.
      3. Append all pages (HTML PDF first, then one URL PDF per link) to a
         single writer, recording each link's target page in ``"pg_to"``.
      4. Remove the links carried over from the sources and add one link
         per entry with ``add_link``.
      5. Write the result to ``/tmp`` and upload it to the ``merged-pdfs``
         bucket under a freshly generated ULID name.

    Args:
        event: Dict read for ``'htmlPdfUri'`` (S3 key without the ``.pdf``
            suffix), ``'urlPdfUris'`` (iterable of such keys) and
            ``'links'`` (collection of dicts, mutated in place).
        context: Unused.

    Returns:
        ``{'status': 201, 'message': "created"}`` once the upload call has
        returned.

    A failure inside ``add_link`` is reported on stdout and skipped; any
    other error propagates to the caller.
    """
    html_pdf_path = "/tmp/html_pdf_file.pdf"
    merged_pdf_path = "/tmp/merged-pdf.pdf"

    # Extract links and S3 URI's for PDF's from step function input
    print("Extracting URIs")
    html_pdf_uri = event['htmlPdfUri'] + '.pdf'
    url_pdf_uris = [uri + '.pdf' for uri in event['urlPdfUris']]
    links = event['links']

    # Prep S3 buckets and client
    s3_client = boto3.client('s3')
    html_bucket = "html-pdfs"
    url_bucket = "url-pdfs"
    merged_bucket = "merged-pdfs"

    # Download the PDF's from S3 into buffer then into file
    # TODO remove write to file
    print("Downloading HTML PDF")
    html_pdf_obj = s3_client.get_object(Bucket=html_bucket, Key=html_pdf_uri)
    html_pdf_bytes = html_pdf_obj['Body'].read()
    with open(html_pdf_path, 'wb') as f_obj:
        f_obj.write(html_pdf_bytes)

    print("Downloading URL PDFs")
    for i, url_pdf_uri in enumerate(url_pdf_uris):
        url_pdf_obj = s3_client.get_object(Bucket=url_bucket, Key=url_pdf_uri)
        url_pdf_bytes = url_pdf_obj['Body'].read()
        url_pdf_filename = f"/tmp/url_pdf_file_{i}.pdf"
        # NOTE(review): add_to_links is defined elsewhere; the merge loop
        # below relies on every link ending up with 'url_pdf_filename',
        # presumably set here - confirm.
        add_to_links(links, {
            'url_pdf_filename': url_pdf_filename,
            'url_pdf_uri': url_pdf_uri
        })
        with open(url_pdf_filename, 'wb') as f_obj:
            f_obj.write(url_pdf_bytes)

    # Find root coordinates of where to place links
    # NOTE(review): find_links is defined elsewhere; presumably it fills
    # the 'pg_num' and 'coords' keys read further down - confirm.
    print("Finding link coordinates")
    find_links(html_pdf_path, links)

    # Add the PDF's to the merger object
    pdf_writer = PdfFileWriter()

    # Starting with the html pdf
    print("Merging in HTML PDF")
    html_reader = PdfFileReader(html_pdf_path)
    first_page_box = html_reader.getPage(0).mediaBox
    height = first_page_box[3]
    width = first_page_box[2]
    page_count = html_reader.getNumPages()
    for page in range(page_count):
        pdf_writer.addPage(html_reader.getPage(page))

    # Now add the url pdfs
    print("Merging in URL PDFs")
    for link in links:
        url_reader = PdfFileReader(link['url_pdf_filename'])

        # Zero-based index of this PDF's first page in the merged output.
        link["pg_to"] = page_count
        url_pages = url_reader.getNumPages()
        page_count += url_pages

        for page in range(url_pages):
            pdf_writer.addPage(url_reader.getPage(page))

    # Strip links carried over from the sources before adding the new ones.
    pdf_writer.removeLinks()

    # Add links to the PDF
    print("Linking links")
    for link in links:
        try:
            add_link(pdf_writer,
                     link["pg_num"],
                     link["pg_to"],
                     link["coords"],
                     height,
                     width,
                     True)
        except Exception:
            # Best effort: one bad link must not abort the merge, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            print("Failed to add a link for {}".format(link))

    # Save the PDF to file
    print("Saving merged pdf to S3")
    with open(merged_pdf_path, 'wb') as out:
        pdf_writer.write(out)

    # Generate ulid for merged PDF filename
    merged_name = ulid.new().str + '.pdf'

    # Upload the PDF to S3
    with open(merged_pdf_path, "rb") as pdf:
        s3_client.put_object(Bucket=merged_bucket, Key=merged_name, Body=pdf)
    return {
        'status': 201,
        'message': "created"
    }