Beispiel #1
0
def fixpage(page, watermark):
    """Stamp ``watermark`` onto ``page`` under a fresh XObject name.

    The watermark is registered in the page's XObject resources under the
    first unused ``/Watermark.<n>`` name.  The existing page content is
    bracketed by a ``q`` / ``Q`` pair so the watermark is drawn after the
    initial graphics state has been restored.

    The page is modified in place and also returned.
    """
    # Use the (possibly inherited) resource dictionary; when there is
    # none, attach a new one directly to this page.
    resources = page.inheritable.Resources
    if resources is None:
        resources = PdfDict()
        page.Resources = resources

    # Same find-or-create treatment for the XObject sub-dictionary.
    xobjects = resources.XObject
    if xobjects is None:
        xobjects = PdfDict()
        resources.XObject = xobjects

    # Probe /Watermark.0, /Watermark.1, ... until a free slot turns up,
    # so any number of watermarks can be stacked on the same page.
    slot = 0
    name = '/Watermark.%d' % slot
    while name in xobjects:
        slot += 1
        name = '/Watermark.%d' % slot
    xobjects[name] = watermark

    # Contents must be an array so extra streams can be added around it.
    streams = page.Contents
    if not isinstance(streams, PdfArray):
        streams = PdfArray([streams])
        page.Contents = streams

    # Save the graphics state first, restore it last, then paint the mark.
    streams.insert(0, IndirectPdfDict(stream='q\n'))
    streams.append(IndirectPdfDict(stream='Q %s Do\n' % name))
    return page
def fixpage(page, count=[0]):
    """Rotate ``page`` and repeat its content twice on an enlarged sheet.

    ``count`` is a deliberately shared mutable default: it survives between
    calls and acts as a running call counter, which decides whether the
    current page is treated as an even one.

    The page is modified in place (all ``*Box`` entries are replaced by a
    new MediaBox, ``Rotate`` is advanced by 90 degrees, and ``Contents`` is
    rebuilt) and returned.  A page without contents is returned after the
    geometry changes only.
    """
    count[0] += 1
    is_even = count[0] % 2 == 0

    # For demo purposes only the MediaBox is honoured; it must start at 0,0.
    media = [float(coord) for coord in page.MediaBox]
    assert media[0] == media[1] == 0, "demo won't work on this PDF"

    # Throw away every box entry (MediaBox included); a new one is set below.
    for key, _value in sorted(page.iteritems()):
        if 'box' in key.lower():
            del page[key]

    # New sheet: old height becomes the width, twice the old width the height.
    finalsize = (media[3], 2 * media[2])
    page.MediaBox = PdfArray((0, 0) + finalsize)
    page.Rotate = (int(page.Rotate or 0) + 90) % 360

    contents = page.Contents
    if contents is None:
        return page
    if isinstance(contents, dict):
        # A single stream: wrap it so it can be concatenated as a list.
        contents = [contents]

    # Transformation emitted in front of each copy of the content.
    transform = '0 1 -1 0 %s %s cm\n' % (finalsize[0], 0)
    if is_even:
        transform = ('1 0 0 1 %s %s cm\n' % (0, finalsize[1] / 2)) + transform

    opening = IndirectPdfDict(
        stream=('q\n-1 0 0 -1 %s %s cm\n' % finalsize) + transform)
    between = IndirectPdfDict(stream='\nQ\n' + transform)

    # Two copies of "<prefix> + content"; the very first prefix is then
    # swapped for the opening variant that also saves the graphics state.
    doubled = PdfArray(([between] + contents) * 2)
    doubled[0] = opening
    page.Contents = doubled
    return page
Beispiel #3
0
def upscale(file_name, scale=1.5, margin_x=0, margin_y=0, suffix='scaled', tempdir=None):
    """Upscale a PDF to a large size.

    Every page of ``file_name`` is cropped by ``margin_x`` / ``margin_y`` on
    each side and then scaled by ``scale``.  The Info dictionary of the
    input (if any) is copied to the output.

    The output path is chosen as follows:
      * ``tempdir`` given: a new temporary ``.pdf`` file inside ``tempdir``;
      * otherwise, ``suffix`` given: next to the input file, named by
        ``add_suffix(file_name, suffix)``;
      * otherwise: a new temporary ``.pdf`` file in the default temp dir.

    Returns the path of the written file.
    """
    def adjust(page):
        # First merge only measures the page; the second applies the crop.
        info = PageMerge().add(page)
        x1, y1, x2, y2 = info.xobj_box
        viewrect = (margin_x, margin_y, x2 - x1 - 2 * margin_x, y2 - y1 - 2 * margin_y)
        page = PageMerge().add(page, viewrect=viewrect)
        page[0].scale(scale)
        return page.render()

    # Set output file name.  Temporary files are created with delete=False
    # and closed right away: only the reserved name is needed, and leaving
    # the handle to the garbage collector would either leak it or delete
    # the file at an unpredictable moment.
    if tempdir:
        with NamedTemporaryFile(suffix='.pdf', dir=tempdir, delete=False) as tmp:
            output = tmp.name
    elif suffix:
        output = os.path.join(os.path.dirname(file_name), add_suffix(file_name, suffix))
    else:
        with NamedTemporaryFile(suffix='.pdf', delete=False) as tmp:
            output = tmp.name

    reader = PdfReader(file_name)
    writer = PdfWriter(output)
    for page in reader.pages:
        writer.addpage(adjust(page))
    writer.trailer.Info = IndirectPdfDict(reader.Info or {})
    writer.write()
    return output
Beispiel #4
0
def go(inpfn, outfn):
    """Run ``adjust`` on the single page of ``inpfn`` and write it to ``outfn``.

    The input must contain exactly one page; the tuple unpacking below
    raises ``ValueError`` otherwise.  The input's Info dictionary is copied
    to the output when present.
    """
    reader = PdfReader(inpfn, decompress=False)
    page, = reader.pages
    writer = PdfWriter()
    writer.addpage(adjust(page))
    # reader.Info is None for PDFs without metadata; fall back to an empty
    # dictionary instead of handing None to the constructor.
    writer.trailer.Info = IndirectPdfDict(reader.Info or {})
    writer.write(outfn)
Beispiel #5
0
def fixpage(*pages):
    """Build one new page showing the given pages side by side.

    Each input page is converted with ``pagexobj`` and placed to the right
    of the previous one.  The result's MediaBox is as wide as the sum of
    the page widths and as tall as the tallest page.

    Returns a new indirect page dictionary.
    """
    pages = [pagexobj(x) for x in pages]

    class PageStuff(tuple):
        # A (name, xobject) pair that can also carry a ``stream`` attribute.
        pass

    x = y = 0
    for i, page in enumerate(pages):
        index = '/P%s' % i
        # The first page (x == 0) needs no horizontal shift.
        shift_right = '1 0 0 1 %s 0 cm ' % x if x else ''
        stuff = PageStuff((index, page))
        stuff.stream = 'q %s%s Do Q\n' % (shift_right, index)
        x += page.BBox[2]
        y = max(y, page.BBox[3])
        pages[i] = stuff

    # Multiple copies of first page used as a placeholder to
    # get blank page on back.  Drop every entry whose successor wraps the
    # very same XObject.  The result is collected in a new list rather than
    # removing from ``pages`` while it is being iterated, which would make
    # the iteration skip entries.
    kept = []
    for current, following in zip(pages, pages[1:] + [None]):
        if following is not None and current[1] is following[1]:
            continue
        kept.append(current)
    pages = kept

    return IndirectPdfDict(
        Type=PdfName.Page,
        Contents=PdfDict(stream=''.join(page.stream for page in pages)),
        MediaBox=PdfArray([0, 0, x, y]),
        Resources=PdfDict(XObject=PdfDict(pages), ),
    )
Beispiel #6
0
 def pdfrw(self):
     """Write an adjusted copy of ``self.file_name`` to ``self.output``.

     Each page is passed through ``self._pdfrw_adjust``; the source Info
     dictionary is carried over (an empty one is used when it is missing).
     """
     source = PdfReader(self.file_name)
     sink = PdfWriter(self.output)
     for page in source.pages:
         sink.addpage(self._pdfrw_adjust(page))
     sink.trailer.Info = IndirectPdfDict(source.Info or {})
     sink.write()
Beispiel #7
0
    def make_cid_system_info_object():
        """Build the CIDSystemInfo dictionary.

        :returns PdfDict: indirect dictionary with Registry ``(Adobe)``,
            Ordering ``(UCS)`` and Supplement ``0``.
        """
        registry = PdfString('(Adobe)')
        ordering = PdfString('(UCS)')
        return IndirectPdfDict(Registry=registry,
                               Ordering=ordering,
                               Supplement=0)
Beispiel #8
0
def resize_2_a4(infn):
    """Write a resized copy of ``infn`` next to it as ``<name>-A4.pdf``.

    The output name is formed by cutting the last four characters off
    ``infn`` (presumably the ``.pdf`` extension) and appending ``-A4.pdf``.
    The target size is read from page 0 of ``A4.pdf``; each page is
    transformed by ``adjust`` using the parameters from
    ``get_scale_margin``.
    """
    target = infn[:-4] + '-A4.pdf'
    source = PdfReader(infn)
    sink = PdfWriter(target)

    params = get_scale_margin(infn, get_size('A4.pdf', 0), 0)
    for page in source.pages:
        sink.addpage(adjust(page, params))

    # Keep the original metadata; use an empty dictionary when there is none.
    sink.trailer.Info = IndirectPdfDict(source.Info or {})
    sink.write()
Beispiel #9
0
    def make_font_file_object(tt_font):
        """Wrap the raw bytes of a TrueType font file in a PDF stream object.

        :param TrueTypeFont tt_font: font helper; only its ``ttfPath``
            attribute is read here.
        :returns PdfDict: indirect stream object holding the font program.
        """
        # TODO: make subset font here
        with open(tt_font.ttfPath, 'rb') as handle:
            raw = handle.read()

        # Latin-1 maps every byte to exactly one character, so the binary
        # data survives the conversion to str unchanged.  Compression is
        # left to pdfrw.
        return IndirectPdfDict(stream=raw.decode('Latin-1'))
Beispiel #10
0
def concatenate(paths, output):
    """Join the PDFs listed in ``paths``, in order, and write them to ``output``.

    The result gets a fixed Info dictionary (title, author, subject,
    creator).
    """
    merged = PdfWriter()
    for pdf_path in paths:
        merged.addpages(PdfReader(pdf_path).pages)

    metadata = {
        'Title': 'Combined PDF Title',
        'Author': 'Michael Driscoll',
        'Subject': 'PDF Combinations',
        'Creator': 'The Concatenator',
    }
    merged.trailer.Info = IndirectPdfDict(**metadata)

    merged.write(output)
Beispiel #11
0
    def pdfrw(pdf_files, output):
        """Concatenate ``pdf_files`` into ``output`` and return ``output``.

        Title, Author, Subject and Creator of the result are all set to
        ``'HPA Design'``.
        """
        label = 'HPA Design'
        combined = PdfWriter()
        for source_path in pdf_files:
            source = PdfReader(source_path)
            combined.addpages(source.pages)

        combined.trailer.Info = IndirectPdfDict(
            Title=label,
            Author=label,
            Subject=label,
            Creator=label,
        )
        combined.write(output)
        return output
Beispiel #12
0
    def _pdf_samples(self, request, response, samples):
        """Render every sample into one PDF and write it to ``response``.

        Each sample is added through ``self._pdf_sample``.  The document's
        author is the string form of ``request.user``.
        """
        document = PdfWriter()
        for item in samples:
            self._pdf_sample(document, item)

        info = IndirectPdfDict(
            Title="Sample Labels",
            Author=str(request.user),
            Subject="Sample Labels",
            Creator="Turtleweb",
        )
        document.trailer.Info = info

        document.write(response)
Beispiel #13
0
	def update_field(self, name, value):
		"""Set the value of the form field ``name`` and rebuild its appearance.

		A new form XObject sized to the field's ``Rect`` is installed as the
		field's normal appearance (``AP.N``), showing ``value`` in 8 pt
		Helvetica.  The field's ``V`` entry is then set to ``value`` and
		``self.fields_info`` is refreshed from ``self.read_fields()``.

		Returns the updated field dictionary.
		"""
		file_fields = self.file.Root.AcroForm.Fields
		field = file_fields[self.field_index(name)]

		rct = field.Rect
		height = round(float(rct[3]) - float(rct[1]), 2)
		width = round(float(rct[2]) - float(rct[0]), 2)

		xobj = IndirectPdfDict(
			BBox = [0, 0, width, height],
			FormType = 1,
			# The resource key defined by the PDF specification is "ProcSet".
			Resources = PdfDict(ProcSet = [PdfName.PDF, PdfName.Text]),
			Subtype = PdfName.Form,
			Type = PdfName.XObject
		)

		# Backslashes and parentheses are special inside a PDF literal
		# string; escape them so they cannot break the content stream.
		escaped = value.replace('\\', '\\\\').replace('(', '\\(').replace(')', '\\)')

		# Appearance shown while the field is not focused
		xobj.stream = "/Tx BMC\nBT\n /Helvetica 8.0 Tf\n 1.0 5.0 Td\n 0 g\n (" + escaped + ") Tj\nET EMC"
		field.AP = PdfDict(N = xobj)

		# Value used while the field is focused
		field.update(PdfDict(V=value))
		self.fields_info = self.read_fields()
		return field
Beispiel #14
0
def main(infiles, outfile, rows, cols, title, landscape):
    """Arrange the pages of ``infiles`` on ``rows`` x ``cols`` grids.

    Pages from all input files are collected in order, split into groups
    of ``rows * cols`` by ``grouper`` and each group is laid out by
    ``put_pages_on_grid``.

    The prepared ``PdfWriter`` (bound to ``outfile``) is returned; this
    function does not call ``write()`` itself.  ``landscape`` is accepted
    but not used here.
    """
    pages = [page for pdf in infiles for page in PdfReader(pdf).pages]

    pdf_writer = PdfWriter(outfile)
    for page_group in grouper(rows * cols, pages):
        sheet = put_pages_on_grid(page_group, rows, cols)
        pdf_writer.addpage(sheet)

    info = IndirectPdfDict(Title=title,
                           Author="StaG-mwc",
                           Creator=__file__)
    pdf_writer.trailer.Info = info

    return pdf_writer
Beispiel #15
0
    def make_to_unicode_object():
        """Build the ToUnicode CMap stream for the embedded CID font.

        The CMap lets a PDF reader translate CIDs back to Unicode code
        points so text can be extracted.

        :returns PdfDict: indirect stream object containing the CMap.
        """
        # See section 9.10.3 ToUnicode CMaps of PDF 1.6 Spec
        # TODO: For now we put an empty mapping in.
        cmap_lines = [
            "/CIDInit /ProcSet findresource begin",
            "12 dict begin",
            "begincmap",
            "/CIDSystemInfo",
            "<</Registry (Adobe)",
            "/Ordering (UCS)",
            "/Supplement 0",
            ">> def",
            "/CMapName /Adobe-Identity-UCS def",
            "/CMapType 2 def",
            "1 begincodespacerange",
            "<0000> <FFFF>",
            "endcodespacerange",
            "1 beginbfrange",
            "<0000> <FFFF> <0000>",
            "endbfrange",
            "endcmap",
            "CMapName currentdict /CMap defineresource pop",
            "end",
            "end",
        ]
        return IndirectPdfDict(stream='\n'.join(cmap_lines))
Beispiel #16
0
    def make_composite_font_object(font_file_path):
        """Build the Type0 (composite) font dictionary for a TrueType file.

        :param str font_file_path: path of the TrueType font to embed.
        :returns PdfDict: indirect font dictionary, ready to be placed in
            the 'Font' sub-dictionary of an annotation's Resources.
        """
        # TODO: Get font name from font program itself
        tt_font = get_true_type_font(font_file_path, DEFAULT_BASE_FONT)

        # A composite font points at its single CID font descendant and at
        # the CMap used for text extraction.
        descendants = PdfArray([FreeText.make_cid_font_object(tt_font)])
        to_unicode = FreeText.make_to_unicode_object()

        return IndirectPdfDict(
            Type=PdfName('Font'),
            Subtype=PdfName('Type0'),
            BaseFont=PdfName(tt_font.fontName),
            Encoding=PdfName('Identity-H'),
            DescendantFonts=descendants,
            ToUnicode=to_unicode,
        )
Beispiel #17
0
    def make_cid_font_object(tt_font):
        """Build the CIDFontType2 dictionary used as a Type0 font descendant.

        :param TrueTypeFont tt_font: font helper supplying ``fontName`` and
            ``metrics`` (``defaultWidth`` and ``widths`` are read here).
        :returns PdfDict: indirect CID font dictionary.
        """
        system_info = FreeText.make_cid_system_info_object()
        descriptor = FreeText.make_font_descriptor_object(tt_font)

        metrics = tt_font.metrics
        default_width = int(round(metrics.defaultWidth, 0))
        widths = PdfArray(metrics.widths)

        cid_to_gid = FreeText.make_cid_to_gid_map_object(tt_font)

        return IndirectPdfDict(
            Type=PdfName('Font'),
            Subtype=PdfName('CIDFontType2'),
            BaseFont=PdfName(tt_font.fontName),
            CIDSystemInfo=system_info,
            FontDescriptor=descriptor,
            DW=default_width,
            Widths=widths,
            CIDToGIDMap=cid_to_gid,
        )
Beispiel #18
0
    def update_and_move(self, targetdir: str, doctitle: str, tags: List[str],
                        date: str):
        """Update metadata of pdf and move to target directory.

        The pages of ``self.filepath`` are written to
        ``<targetdir>/<date> <doctitle>.pdf`` with a new Info dictionary
        holding the title and keywords.  When that name is already taken,
        ``-1``, ``-2``, ... is appended to the name until an unused one is
        found.  Afterwards the original file is deleted; a failure to
        delete it is printed, not raised.

        Arguments:
            targetdir {str} -- Target directory where pdf shall be placed.
            doctitle {str} -- New document title of pdf.  A trailing
                ".pdf" is stripped before it is used as the title.
            tags {List[str]} -- Keywords/tags which shall be added to pdf.
            date {str} -- Date which will be entered into pdf filename.

        """
        pdf = PdfReader(self.filepath)

        # The title may or may not carry the file ending; normalise it so
        # the ending is added exactly once below.
        if doctitle[-4:] == ".pdf":
            doctitle = doctitle[0:-4]
        stem = date + " " + doctitle

        # Check for unique filename.  The candidate is rebuilt from the
        # stem each time instead of patching the previous name with a
        # regular expression, which could also rewrite "-<digits>" groups
        # elsewhere in the name (for example inside the date).
        filename = stem + ".pdf"
        n = 1
        while os.path.isfile(os.path.join(targetdir, filename)):
            filename = stem + "-" + str(n) + ".pdf"
            n += 1

        # Write data
        writer = PdfWriter()
        writer.addpages(pdf.pages)
        writer.trailer.Info = IndirectPdfDict(Title=doctitle, Keywords=tags)
        writer.write(os.path.join(targetdir, filename))

        # Try to delete the source file; report a failure back to the user
        try:
            os.remove(self.filepath)
        except OSError as e:
            print("Error: %s - %s." % (e.filename, e.strerror))
Beispiel #19
0
    def make_font_descriptor_object(tt_font):
        """Build the FontDescriptor dictionary from the font's metrics.

        :param TrueTypeFont tt_font: font helper supplying ``fontName`` and
            ``metrics``.
        :returns PdfDict: indirect Font Descriptor dictionary, including the
            embedded font program under ``FontFile2``.
        """
        metrics = tt_font.metrics

        def whole(number):
            # Metric values are stored in the PDF as rounded integers.
            return int(round(number, 0))

        return IndirectPdfDict(
            Type=PdfName('FontDescriptor'),
            FontName=PdfName(tt_font.fontName),
            Flags=metrics.flags,
            FontBBox=metrics.bbox,
            ItalicAngle=int(metrics.italicAngle),
            Ascent=whole(metrics.ascent),
            Descent=whole(metrics.descent),
            CapHeight=whole(metrics.capHeight),
            StemV=whole(metrics.stemV),
            MissingWidth=whole(metrics.defaultWidth),
            FontFile2=FreeText.make_font_file_object(tt_font),
        )
Beispiel #20
0
def concatenate(input_paths, output_path, details=None):
    """Concatenate PDF files, in the given order, into one output file.

    Args:
        input_paths: A sequence of paths to pdf files.
        output_path: The desired path for the concatenated pdf.
        details: Optional dictionary of metadata values for the final pdf.
            Each key is turned into a PDF name and stored in the Info
            dictionary together with its value.
    """
    combined = PdfWriter()
    for source_path in input_paths:
        combined.addpages(PdfReader(source_path).pages)

    # Fill the Info dictionary first, then attach it to the trailer.
    info = IndirectPdfDict()
    entries = {} if details is None else details
    for key, value in entries.items():
        info[PdfName(key)] = value
    combined.trailer.Info = info

    combined.write(output_path)
Beispiel #21
0
    def make_cid_to_gid_map_object(tt_font):
        """Build the CIDToGIDMap stream that maps character ids to glyph ids.

        The table has two characters (high byte, low byte of the glyph id)
        per character code, for every code from 0 up to 65535.  Codes
        without an entry in the font's cmap stay at glyph id 0.

        :param TrueTypeFont tt_font: font helper supplying ``metrics.cmap``
            and ``get_glyph_id``.
        :returns PdfDict: indirect stream object containing the table.
        """
        # Cover the whole two-byte code space for now, it will compress nicely.
        mapping_size = 256 * 256
        table = ["\x00"] * (mapping_size * 2)

        for code, glyph_name in tt_font.metrics.cmap.items():
            # TODO: What is the expectation here since PDF only supports two bytes lookups?
            if code < mapping_size:
                high, low = divmod(tt_font.get_glyph_id(glyph_name), 256)
                table[2 * code] = chr(high)
                table[2 * code + 1] = chr(low)

        # Let's let pdfrw handle the compressing of streams
        return IndirectPdfDict(stream=''.join(table))
Beispiel #22
0
def render(source,
           *,
           progress_cb=lambda x: None,
           expand_pages=True,
           template_alpha=0.3,
           only_annotated=False,
           black='black',
           white='white',
           gray=None,
           highlight=HIGHLIGHT_DEFAULT_COLOR):
    """Render a source document as a PDF file.

    source: The reMarkable document to be rendered.  This may be
              - A filename or pathlib.Path to a zip file containing the
                document, such as is provided by the Cloud API.
              - A filename or pathlib.Path to a root-level file from the
                document, such as might be copied off the device directly.
              - An object implementing the Source API.  See rmrl.sources
                for examples and further documentation.
    progress_cb: A function which will be called with a progress percentage
                 between 0 and 100.  The first 50% indicate rendering the
                 annotations, and the second the merging of these into the
                 base PDF file.  If this callback raises an error, this
                 function will abort gracefully and propagate the error up
                 the stack.
    expand_pages: Boolean value (default True) indicating whether pages
                  should be made larger, to reflect the view provided by
                  the reMarkable device.
    template_alpha: Opacity of the template backgrounds in notebooks.  0
                    makes the templates invisible, 1 makes them fully dark.
    only_annotated: Boolean value (default False) indicating whether only
                    pages with annotations should be output.
    black: A string giving the color to use as "black" in the document.
           Can be a color name or a hex string.  Default: 'black'
    white: A string giving the color to use as "white" in the document.
           See `black` parameter for format.  Default: 'white'
    gray: A string giving the color to use as "gray" in the document.
          See `black` parameter for format.  Default: None, which means to
          pick an average between the "white" and "black" values.
    highlight: A string giving the color to use for the highlighter.
               See `black` parameter for format.

    Returns a binary file object positioned at the start of the PDF data:
    either the untouched base PDF opened from the source (when there is a
    base PDF and no page has stroke data) or a spooled temporary file
    holding the merged result.
    """

    colors = parse_colors(black, white, gray, highlight)

    vector = True  # TODO: Different rendering styles
    source = sources.get_source(source)

    # If this is using a base PDF, the percentage is calculated
    # differently.
    uses_base_pdf = source.exists('{ID}.pdf')

    # Generate page information
    # If a PDF file was uploaded, but never opened, there may not be
    # a .content file. So, just load a barebones one with a 'pages'
    # key of zero length, so it doesn't break the rest of the
    # process.
    pages = []
    if source.exists('{ID}.content'):
        with source.open('{ID}.content', 'r') as f:
            pages = json.load(f).get('pages', [])

    # Render each page as a pdf
    tmpfh = tempfile.TemporaryFile()
    pdf_canvas = canvas.Canvas(tmpfh, (PDFWIDTH, PDFHEIGHT))
    # TODO: check pageCompression

    # Don't load all the pages into memory, because large notebooks
    # about 500 pages could use up to 3 GB of RAM. Create them by
    # iteration so they get released by garbage collector.
    changed_pages = []
    annotations = []
    for i, page_id in enumerate(pages):
        page = document.DocumentPage(source, page_id, i, colors=colors)
        if source.exists(page.rmpath):
            changed_pages.append(i)
        page.render_to_painter(pdf_canvas, vector, template_alpha)
        annotations.append(page.get_grouped_annotations())
        progress_cb((i + 1) / len(pages) * 50)
    pdf_canvas.save()
    tmpfh.seek(0)

    # This new PDF represents just the notebook. If there was a
    # parent PDF, merge it now.
    if uses_base_pdf and not changed_pages:
        # Since there is no stroke data, just return the PDF data.
        # The rendered notebook is not needed on this path, so release
        # its temporary file right away.
        tmpfh.close()
        progress_cb(100)

        log.info('exported pdf')
        return source.open('{ID}.pdf', 'rb')

    # PDF exists, stroke data exists, so mix them together.
    if uses_base_pdf:
        rmpdfr = PdfReader(tmpfh)
        basepdfr = PdfReader(source.open('{ID}.pdf', 'rb'))
    else:
        basepdfr = PdfReader(tmpfh)
        # Alias, which is used for annotations and layers.
        rmpdfr = basepdfr

    # If making a 'layered' PDF (with optional content groups,
    # OCGs), associate the annotations with the layer.

    # This property list is put into the rmpdfr document, which
    # will not have any existing properties.
    ocgprop = IndirectPdfDict(OCGs=PdfArray(), D=PdfDict(Order=PdfArray()))

    # Decided once, before the loop: the flag is also read after the
    # loop, so it must exist even when the document has no pages.
    # TODO configurable? bool(int(QSettings().value(
    # 'pane/notebooks/export_pdf_ocg')))
    apply_ocg = False

    for i in range(0, len(basepdfr.pages)):
        basepage = basepdfr.pages[i]
        rmpage = rmpdfr.pages[i]

        # Apply OCGs
        if apply_ocg:
            ocgorderinner = do_apply_ocg(basepage, rmpage, i, uses_base_pdf,
                                         ocgprop, annotations)
        else:
            ocgorderinner = None

        # Apply annotations to the rmpage. This must come after
        # applying OCGs, because the annotation may belong to
        # one of those groups.
        apply_annotations(rmpage, annotations[i], ocgorderinner)

        # If this is a normal notebook with highlighting,
        # just add the annotations and forget about the rest,
        # which are page geometry transformations.
        if uses_base_pdf:
            merge_pages(basepage, rmpage, i in changed_pages, expand_pages)

        progress_cb(((i + 1) / rmpdfr.numPages * 50) + 50)

    # Apply the OCG order. The basepdf may have already had OCGs
    # and so we must not overwrite them. NOTE: there are other
    # properties that ought to be carried over, but this is the
    # minimum required.
    if apply_ocg:
        if '/OCProperties' in basepdfr.Root:
            basepdfr.Root.OCProperties.OCGs += ocgprop.OCGs
            basepdfr.Root.OCProperties.D.Order += ocgprop.D.Order
        else:
            basepdfr.Root.OCProperties = ocgprop

    stream = tempfile.SpooledTemporaryFile(SPOOL_MAX)
    pdfw = PdfWriter(stream)
    if not only_annotated:
        # We are writing out everything, so we can take this shortcut:
        pdfw.write(trailer=basepdfr)
    else:
        for i, page in enumerate(basepdfr.pages):
            if i in changed_pages:
                pdfw.addpage(page)
        pdfw.write()
    stream.seek(0)

    log.info('exported pdf')
    return stream
Beispiel #23
0
import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict

# Usage: <script> [-o output.pdf] input.pdf underneath.pdf
argv = sys.argv[1:]

# Pull the optional "-o <file>" pair out of the argument list.
if '-o' in argv:
    flag_pos = argv.index('-o')
    outfn = argv[flag_pos + 1]
    del argv[flag_pos:flag_pos + 2]
else:
    outfn = 'output.pdf'

inpfn, underfn = argv
under = PdfReader(underfn)
trailer = PdfReader(inpfn)

# Put each page of the second file underneath the matching input page.
for page, upage in zip(trailer.pages, under.pages):
    PageMerge(page).add(upage, prepend=1).render()

if trailer.Info is None:
    trailer.Info = IndirectPdfDict({})

# meta data comes from underneath.pdf -- but only when that file actually
# has an Info dictionary; otherwise the input's metadata is left alone.
under_info = under.Info
if under_info is not None:
    trailer.Info.Title = under_info.Title
    trailer.Info.Author = under_info.Author
    trailer.Info.Subject = under_info.Subject

PdfWriter(outfn, trailer=trailer).write()
Beispiel #24
0
So she did an 8.5x11" output with 0.5" margin all around
(actual size of useful area 7.5x10") and we scaled it
up by 4.8.

We also copy the Info dict to the new PDF.

'''

import sys
import os

from pdfrw import PdfReader, PdfWriter, PageMerge, IndirectPdfDict


def adjust(page, margin=36, scale=4.8):
    """Crop ``margin`` units off every side of ``page`` and scale the rest.

    Returns the rendered page produced by ``PageMerge``.
    """
    # A throw-away merge is used only to learn the page's bounding box.
    left, bottom, right, top = PageMerge().add(page).xobj_box
    inner_width = right - left - 2 * margin
    inner_height = top - bottom - 2 * margin

    merged = PageMerge().add(
        page, viewrect=(margin, margin, inner_width, inner_height))
    merged[0].scale(scale)
    return merged.render()


# Exactly one command-line argument is expected: the input PDF.
(inpfn,) = sys.argv[1:]
outfn = 'poster.' + os.path.basename(inpfn)

reader = PdfReader(inpfn)
writer = PdfWriter()

# Only the first page is turned into a poster.
poster_page = adjust(reader.pages[0])
writer.addpage(poster_page)

# Carry the Info dict over; use an empty one when the input has none.
writer.trailer.Info = IndirectPdfDict(reader.Info or {})
writer.write(outfn)
Beispiel #25
0
1) Concatenating multiple input PDFs.

2) Adding metadata to the PDF.

If you do not need to add metadata, look at subset.py, which
has a simpler interface to PdfWriter.

'''

import sys
import os

import find_pdfrw
from pdfrw import PdfReader, PdfWriter, IndirectPdfDict

# Every command-line argument is an input PDF; at least one is required.
inputs = sys.argv[1:]
assert inputs
outfn = 'output.pdf'

# Append the pages of each input, in order, without decompressing streams.
writer = PdfWriter()
for inpfn in inputs:
    source = PdfReader(inpfn, decompress=False)
    writer.addpages(source.pages)

# Metadata for the combined document.
info = IndirectPdfDict(
    Title='your title goes here',
    Author='your name goes here',
    Subject='what is it all about?',
    Creator='some script goes here',
)
writer.trailer.Info = info
writer.write(outfn)
Beispiel #26
0
from pdfrw import PdfReader
"""
x = PdfReader('source/07922XXX2258-2017Apr13-2017May15.pdf')
print x.keys()
print x.Info
print x.Root.keys()
print len(x.pages)
print x.pages[0]
print x.pages[0].Contents
print x.pages[0].Contents.stream

"""

#writing pdfs
from pdfrw import PdfWriter
writer = PdfWriter()

# Append every page of every file to the bundle.
# NOTE(review): pdf_filenames is not defined in the visible part of this
# script -- it has to be provided before this loop runs.
for pdf_filename in pdf_filenames:
    source = PdfReader(pdf_filename)
    writer.addpages(source.pages)

from pdfrw import IndirectPdfDict

# Document metadata of the bundle.
bundle_info = IndirectPdfDict(
    Title='pdf bundle',
    Author='Adobe',
    Subject='pdf',
    Creator='Adobe',
)
writer.trailer.Info = bundle_info
writer.write('out.pdf')
def process_pdf_file(inputFilename):
    """Strip provider cover pages and watermark images from one PDF file.

    The file is processed in two passes over its pages:

    1. Survey: for every XObject that has both a ``/Mask`` and a ``/Width``
       entry, the width is tallied.  The most frequent width becomes the
       watermark width, unless it does not occur once per page and fewer
       than four distinct widths were seen; in that case a warning is
       printed and width-based removal is switched off.
    2. Clean-up: pages recognised as provider cover pages are skipped, and
       watermark XObject references are popped from the page resources.
       Remaining pages that have contents are added to a new writer.

    The Info dictionary is copied without its ``/Producer`` entry.  Only if
    something was removed is the original renamed to ``<name>.bak`` and the
    cleaned document written under the original name.

    A file that cannot be opened by ``PdfReader`` is silently skipped; any
    other exception is printed and swallowed.  Nothing is returned.
    """

    try:
        print(f'Checking {inputFilename}')
        skip_this_page = False
        total_watermarks_skipped = 0

        try:
            reader = PdfReader(inputFilename)
        except:
            # Unreadable input: best effort, leave the file untouched.
            pass
        else:
            writer = PdfWriter()

            wm_width = 0  # 0 means "no width-based watermark detected"
            page_count = 0
            counts = dict()  # image width -> number of occurrences
            sample_pages = []  # 1-based numbers of pages with a candidate

            # Look through all pages for a potential primary watermark item
            if reader is not None:
                for idx, page in enumerate(reader.pages):
                    if '/Resources' in page and '/XObject' in page[
                            '/Resources']:
                        for xobj in page['/Resources']['/XObject']:
                            # Warning: Masks may or may not indicate WM presence
                            if '/Mask' in page['/Resources']['/XObject'][str(
                                    xobj)]:
                                if '/Width' in page['/Resources']['/XObject'][
                                        str(xobj)]:
                                    cur_width = int(
                                        page['/Resources']['/XObject'][str(
                                            xobj)]['/Width'])
                                    counts[cur_width] = counts.get(
                                        cur_width, 0) + 1
                                    sample_pages.append(idx + 1)
                    page_count += 1
                if counts:
                    # The most frequent masked-image width is the candidate.
                    wm_width = max(counts, key=lambda key: counts[key])
                    # Not on every page and only a few distinct widths:
                    # report it and do not remove anything by width.
                    if counts[wm_width] != page_count and len(counts) < 4:
                        print('*' * 40)
                        print(
                            f'* Potential watermarks found but only occurs in {counts[wm_width]} of {page_count} pages'
                        )
                        print(f'* Sample pages: {sample_pages[0:9]}')
                        # From here on counts is a list of (width, count)
                        # pairs, most frequent first, used only for printing.
                        counts = sorted(counts.items(),
                                        reverse=True,
                                        key=lambda x: x[1])
                        print(f'* {counts}')
                        print('*' * 40)
                        wm_width = 0

                # Process all pages removing prop pages and watermark objects
                for idx, page in enumerate(reader.pages):
                    skip_this_page = False

                    # ************** Prop Pages **************
                    # Google: first annotation links to a URI containing
                    # 'google'.  Any lookup failure means "not a prop page".
                    try:
                        skip_this_page = 'google' in page['/Annots'][0]['/A'][
                            '/URI']
                    except:
                        pass

                    # HathiTrust (checked on the first page only)
                    # Looks like this may need another method for checking for the existence of the JxCBE
                    if not skip_this_page:
                        try:
                            if idx == 0:
                                skip_this_page = '/JxCBE' in page[
                                    '/Resources']['/XObject']['/CLC'][
                                        '/Resources']['/XObject']
                        except:
                            pass
                    if not skip_this_page:
                        try:
                            if idx == 0:
                                skip_this_page = '/JxCBE' in page[
                                    '/Resources']['/XObject']['/CCA'][
                                        '/Resources']['/XObject']
                        except:
                            pass

                    # Internet Archive / Microsoft (checked on the third
                    # page only, by the length of image /Im001)
                    if not skip_this_page:
                        try:
                            if idx == 2:
                                skip_this_page = page['/Resources'][
                                    '/XObject']['/Im001']['/Length'] == '8420'
                        except:
                            pass

                    # ************** Watermarks **************
                    # Dump Google watermarks
                    if '/Resources' in page and '/XObject' in page[
                            '/Resources'] and '/Wm' in page['/Resources'][
                                '/XObject']:
                        junk = page['/Resources']['/XObject'].pop('/Wm')
                        total_watermarks_skipped += 1
                    if '/Resources' in page and '/XObject' in page[
                            '/Resources']:
                        # Masked images whose width equals the detected
                        # watermark width.
                        for xobj in page['/Resources']['/XObject']:
                            if '/Mask' in page['/Resources']['/XObject'][str(
                                    xobj)]:
                                if '/Width' in page['/Resources']['/XObject'][
                                        str(xobj)]:
                                    cur_width = int(
                                        page['/Resources']['/XObject'][str(
                                            xobj)]['/Width'])
                                    if cur_width == wm_width:
                                        junk = page['/Resources'][
                                            '/XObject'].pop(str(xobj))
                                        total_watermarks_skipped += 1
                        # Any XObject whose /Width is exactly '156'.
                        # NOTE(review): presumably a known watermark size --
                        # confirm where this constant comes from.
                        for xobj in page['/Resources']['/XObject']:
                            if page['/Resources']['/XObject'][str(
                                    xobj)]['/Width'] == '156':
                                junk = page['/Resources']['/XObject'].pop(
                                    str(xobj))
                                total_watermarks_skipped += 1

                    # Dump HathiTrust watermarks: /PxCBA, /PxCBF and /PxCBG
                    # nested in the resources of the /CBJ form XObject.
                    if '/Resources' in page and '/XObject' in page['/Resources'] and \
                       '/CBJ' in page['/Resources']['/XObject'] and \
                        '/Resources' in page['/Resources']['/XObject']['/CBJ'] and \
                        '/XObject' in page['/Resources']['/XObject']['/CBJ']['/Resources'] and \
                        '/PxCBA' in page['/Resources']['/XObject']['/CBJ']['/Resources']['/XObject']:
                        junk = page['/Resources']['/XObject']['/CBJ'][
                            '/Resources']['/XObject'].pop('/PxCBA')
                        total_watermarks_skipped += 1
                    if '/Resources' in page and '/XObject' in page['/Resources'] and \
                       '/CBJ' in page['/Resources']['/XObject'] and \
                        '/Resources' in page['/Resources']['/XObject']['/CBJ'] and \
                        '/XObject' in page['/Resources']['/XObject']['/CBJ']['/Resources'] and \
                        '/PxCBF' in page['/Resources']['/XObject']['/CBJ']['/Resources']['/XObject']:
                        junk = page['/Resources']['/XObject']['/CBJ'][
                            '/Resources']['/XObject'].pop('/PxCBF')
                        total_watermarks_skipped += 1
                    if '/Resources' in page and '/XObject' in page['/Resources'] and \
                       '/CBJ' in page['/Resources']['/XObject'] and \
                        '/Resources' in page['/Resources']['/XObject']['/CBJ'] and \
                        '/XObject' in page['/Resources']['/XObject']['/CBJ']['/Resources'] and \
                        '/PxCBG' in page['/Resources']['/XObject']['/CBJ']['/Resources']['/XObject']:
                        junk = page['/Resources']['/XObject']['/CBJ'][
                            '/Resources']['/XObject'].pop('/PxCBG')
                        total_watermarks_skipped += 1

                    # Add the page unless it's the prop page
                    # (pages without any contents are dropped as well)
                    if not skip_this_page and page.Contents is not None:
                        writer.addpage(page)

                # Copy and clean up the metadata: keep every Info entry
                # except those whose key contains '/Producer'.
                if reader['/Info']:
                    new_meta_dict = {}
                    for info in reader['/Info']:
                        if '/Producer' not in info:
                            new_meta_dict[info] = reader['/Info'].get(info)
                    writer.trailer.Info = IndirectPdfDict(new_meta_dict)

                # Write the new file if cleanup was necessary
                if total_watermarks_skipped or (len(reader.pages) != len(
                        writer.pagearray)):
                    # Keep the original as <name>.bak, then write the
                    # cleaned document under the original name.
                    filename, file_extension = os.path.splitext(inputFilename)
                    os.rename(inputFilename, filename + '.bak')
                    writer.write(inputFilename)
                    if len(reader.pages) != len(writer.pagearray):
                        print(
                            f'  {len(reader.pages) - len(writer.pagearray)} pages deleted',
                        )
                    if total_watermarks_skipped:
                        print(
                            f'  {total_watermarks_skipped} page watermark references removed'
                        )
                    print(f'  Clean file written to {inputFilename}')

    except Exception as e:
        # Top-level boundary: report the problem and carry on.
        print('Exception: ', e)
Beispiel #28
0
from pdfrw import PdfReader, IndirectPdfDict, BookmarkedPdfWriter
from datetime import datetime

# Demo script: concatenate three copies of one source PDF into a single
# document, add a small bookmark tree for each copy, fill in the document
# information dictionary and write the result to 'result.pdf'.
output = BookmarkedPdfWriter()

# range() instead of xrange(): xrange only exists on Python 2 and raises
# NameError on Python 3, while range() works on both.
for i in range(3):
    # Number of pages already in the output, i.e. where this copy starts.
    totalPages = len(output.pagearray)
    output.addpages(
        PdfReader(
            'static_pdfs/global/0ae80b493bc21e6de99f2ff6bbb8bc2c.pdf').pages)

    bmname = 'Bm (%s) - %s' % (i + 1, 'Root')

    # NOTE(review): the third argument is presumably the parent bookmark, so
    # this builds Root -> Child 1 -> Child 1.1 on consecutive pages; confirm
    # against BookmarkedPdfWriter.addBookmark.
    t1 = output.addBookmark(bmname, totalPages)
    t2 = output.addBookmark("Child 1", totalPages + 1, t1)
    output.addBookmark("Child 1.1", totalPages + 2, t2)

# Timestamp rendered as 'D:YYYYMMDDHHMMSS' from the current UTC time.
now = datetime.utcnow()
date = now.strftime('D:%Y%m%d%H%M%S')

info = output.trailer.Info = IndirectPdfDict()
info.Title = 'Test PDF with Bookmarks'
info.Author = 'asdasd'
info.Creator = 'random dude'
info.Producer = 'another random dude'
info.CreationDate = date

output.write('result.pdf')
Beispiel #29
0
def cli(verbose, input, output):
    """
        Ask for a new metadata title for each PDF and write the retitled
        copy into the output folder.

        input: input file or files. For a file, every '.pdf' entry in the
        same folder whose name contains that file's name is processed; for
        a folder, every '.pdf' entry in it is processed.

        output: output folder, will create if not found.
    """
    if verbose:
        click.echo(f"Current args: {input} {output}")

    path = Path(input)
    if path.is_file():
        folder = path.absolute().parent
        file_name = path.name
    else:
        folder = path.resolve()
        # '.' is contained in every name ending in '.pdf', so this matches
        # all PDFs in the folder.
        file_name = '.'
    if verbose:
        click.echo(f"Current path: {path} {folder} {path.name}")

    files = [
        entry.path for entry in os.scandir(folder)
        if file_name in entry.name and entry.name.endswith('.pdf')
    ]

    if verbose:
        click.echo(f"Found {len(files)} files")

    out_path = os.path.realpath(output)
    # exist_ok covers both "already there" and the race where another
    # process creates the folder between a check and the mkdir.
    os.makedirs(out_path, exist_ok=True)

    for number, pdf_path in enumerate(files, start=1):
        # pdf_path is absolute (it comes from scanning an absolute folder).
        # os.path.join drops everything before an absolute component, so
        # joining the full path would point back at the input file and
        # overwrite it. Join only the base name to land in the output folder.
        out_file = os.path.join(out_path, os.path.basename(pdf_path))
        trailer = PdfReader(pdf_path)

        if trailer.Info and trailer.Info.Title:
            click.echo(f'Current title: {trailer.Info.Title}')
        else:
            click.echo("Current file doesn't have an existing title")
            if not trailer.Info:
                # No info dictionary at all: create one with placeholder
                # values; the title is replaced by the prompt just below.
                trailer.Info = IndirectPdfDict(
                    Title='your title goes here',
                    Author='Title change',
                    Subject='This is a file with a changed title',
                    Creator='Title Change 0.1',
                )

        trailer.Info.Title = click.prompt(
            f'Write the new metadata title for {pdf_path}', type=str)

        PdfWriter(out_file, trailer=trailer).write()

        if verbose:
            click.echo(
                f"Wrote {os.path.basename(pdf_path)}, {number}/{len(files)}")

    click.echo('Done!')
Beispiel #30
0
                warn(f"O DRE {dre} está presente na pauta, mas não foi "
                     f"encontrado no lote! Ele vai ficar sem prova!")
                continue
            this_min = min(int(file.stem) for file in dre_to_pages_map[dre])
            this_max = max(int(file.stem) for file in dre_to_pages_map[dre])
            if prev_max is not None:
                assert this_min == prev_max + 1
            prev_max = this_max
        print()

        print("> Gerando os arquivos finais...", end='')
        for dre in dre_to_pages_map:
            prova_writer = PdfWriter()
            for filename in dre_to_pages_map[dre]:
                prova_writer.addpages(PdfReader(filename).pages)
            prova_writer.trailer.Info = IndirectPdfDict(
                Title=f"P1 AlgLin 2020 PLE: {dre}")
            prova_writer.write(os.fspath(final_dir / f"{dre}.pdf"))
        print()

        provas_dir = (pathlib.Path() / args.PROVAS_DIR).resolve()
        print(f"> Colocando as provas no diretório {provas_dir} ...", end='')
        provas_dir.mkdir()
        for filename in final_dir.iterdir():
            shutil.copyfile(os.fspath(filename),
                            os.fspath(provas_dir / filename.name))
        print()

    print("> Gerando o zip...", end='')
    shutil.make_archive(os.fspath(provas_dir),
                        "zip",
                        root_dir=os.fspath(provas_dir.parent),