Example #1
0
def _create_compressed_file_object(source):
    """
    Create a file like object as ``/EmbeddedFile`` compressing it with deflate.

    The source is consumed in 4096-byte chunks; every chunk is fed to both the
    MD5 checksum and the deflate compressor.

    :param source:
        a binary file-like object whose ``read(size)`` returns ``bytes`` and
        ``b''`` once the input is exhausted
    :return:
        the object representing the compressed file stream object, with
        ``Params`` holding the MD5 ``CheckSum`` and the uncompressed ``Size``
    """
    md5 = hashlib.md5()
    compress = zlib.compressobj()

    pdf_file_object = PdfDict(Type=PdfName('EmbeddedFile'),
                              Filter=PdfName('FlateDecode'))

    # Collect the compressed pieces and join them once at the end: appending
    # to ``pdf_file_object.stream`` per chunk copies the whole accumulated
    # string on every iteration.
    compressed_chunks = []
    size = 0
    for data in iter(lambda: source.read(4096), b''):
        size += len(data)
        md5.update(data)
        compressed_chunks.append(compress.compress(data))
    compressed_chunks.append(compress.flush(zlib.Z_FINISH))

    # pdfrw needs Latin-1-decoded unicode strings in object.stream
    pdf_file_object.stream = b''.join(compressed_chunks).decode('latin-1')
    pdf_file_object.Params = PdfDict(CheckSum=PdfString('<{}>'.format(
        md5.hexdigest())),
                                     Size=size)
    return pdf_file_object
Example #2
0
def get_xml(string):
    """
    Get XML soup from 'string'.

    The input is tried, in order, as:

    1. a bare XML document (when it starts with ``<?xml``);
    2. a PDF whose ``/Root`` carries a ``/Data`` stream;
    3. a PDF whose ``/Root`` carries a ``/Metadata`` stream;
    4. the text between the first ``stream`` / ``endstream`` markers;
    5. the whole input.

    :param string: raw document content; it must support
        ``startswith(b'<?xml')``
    :return: the result of ``new_xml`` for the first attempt that succeeds,
        or ``None`` when every attempt fails
    """

    if string.startswith(b'<?xml'):
        try:
            return new_xml(string)
        except Exception:  # pragma: no cover
            return None

    # Parse the PDF once; both catalog lookups below share the result.
    try:
        root = PdfReader(fdata=string)['/Root']
    except Exception:  # pragma: no cover
        root = None

    if root is not None:
        # Everything after '/Data' is a legacy branch; I don't believe such
        # files actually exist
        for key in ('Data', 'Metadata'):
            try:
                return new_xml(
                    root.get(PdfName(key)).stream.encode('latin-1'))
            except Exception:  # pragma: no cover
                continue

    # NOTE(review): these two fallbacks call ``.encode`` on the input, which
    # raises AttributeError for ``bytes`` on Python 3, so they then fall
    # through to ``None``. Left as-is to preserve behaviour; confirm whether
    # they should operate on bytes directly.
    try:
        return new_xml(
            string.encode('latin-1').split('endstream')[0].split(
                'stream')[1])
    except Exception:
        pass
    try:
        return new_xml(string.encode('latin-1'))
    except Exception:
        return None
Example #3
0
def create_bookmarks(bookmarks, pages, parent=None):
    """Build the outline objects for one level of a bookmark tree.

    :param bookmarks: sequence of ``(label, target, children)`` items.
        ``target[0]`` indexes ``pages``; ``target[1]`` and ``target[2]`` are
        placed after ``/XYZ`` in the destination array. ``children`` has the
        same shape as ``bookmarks`` and is processed recursively.
    :param pages: indexable collection whose items expose ``.indirect``
    :param parent: bookmark object to record as ``Parent`` of every entry
        created at this level, or ``None`` for no ``Parent`` entry
    :return: ``(objects, count)`` where ``objects`` are the bookmark objects
        of this level, linked through ``Prev``/``Next``, and ``count`` is the
        number of entries at this level plus the counts of all sub-levels
    """
    total = len(bookmarks)
    siblings = []
    for label, target, children in bookmarks:
        dest = (pages[target[0]].indirect, PdfName('XYZ'), target[1],
                target[2], 0)
        action = PdfDict(Type=PdfName('Action'),
                         S=PdfName('GoTo'),
                         D=PdfArray(dest))
        node = PdfDict(Title=PdfString.encode(label), A=action)
        node.indirect = True

        # Descend first: the node's Count depends on its subtree.
        kids, kids_total = create_bookmarks(children, pages, parent=node)
        node.Count = 1 + kids_total

        # Chain this node to the previously created sibling, if any.
        if siblings:
            previous = siblings[-1]
            node.Prev = previous
            previous.Next = node
        if kids:
            node.First, node.Last = kids[0], kids[-1]
        if parent is not None:
            node.Parent = parent

        total += kids_total
        siblings.append(node)
    return siblings, total
Example #4
0
    def make_image_xobject(image):
        """Construct a PdfDict representing the Image XObject, for inserting
        into the AP Resources dict.

        PNGs and GIFs are treated equally - the raw sample values are included
        using PDF's FlateDecode compression format. JPEGs can be included in
        their original form using the DCTDecode filter.

        PNGs with transparency have the alpha channel split out and included as
        an SMask, since PDFs don't natively support transparent PNGs.

        Details about file formats and allowed modes can be found at
        https://pillow.readthedocs.io/en/5.3.x/handbook/image-file-formats.html

        :param str|ImageFile image: Either a str representing the path to the
            image filename, or a PIL.ImageFile.ImageFile object representing
            the image loaded using the PIL library.
        :returns PdfDict: Image XObject
        :raises ValueError: if the image format is not PNG, GIF or JPEG
        """
        image = Image.resolve_image(image)
        # PILImage.convert drops the format attribute, so remember it now
        image_format = image.format
        width, height = image.size

        # Normalize images to RGB or grayscale color spaces, and split out the
        # alpha layer into a PDF smask XObject
        image, smask_xobj = Image.convert_to_compatible_image(
            image,
            image_format,
        )

        if image_format in ('PNG', 'GIF'):
            content = Image.make_compressed_image_content(image)
            filter_type = 'FlateDecode'  # TODO use a predictor
        elif image_format == 'JPEG':
            content = Image.make_jpeg_image_content(image)
            filter_type = 'DCTDecode'
        else:
            # Report the format saved before conversion; ``image.format`` may
            # no longer be set on the converted image.
            raise ValueError(
                'Unsupported image format: {}. Supported formats are '
                'PNG, JPEG, and GIF'.format(image_format))

        xobj = PdfDict(
            stream=content,
            BitsPerComponent=8,
            Filter=PdfName(filter_type),
            ColorSpace=Image._get_color_space_name(image),
            Width=width,
            Height=height,
            Subtype=PdfName('Image'),
            Type=PdfName('XObject'),
        )
        if smask_xobj is not None:
            xobj.SMask = smask_xobj
        return xobj
def add_payload(name, output, path):
    """Add a ``GoToE`` additional action to the first page of a PDF.

    The first page's ``AA`` entry is replaced by a dict whose ``O`` action has
    ``S=/GoToE``, ``F`` set to ``path`` and ``D=[0, /Fit]``. All pages are then
    written to ``output``. Progress is reported on stdout.

    :param name: PDF to read, passed to ``PdfReader``
    :param output: destination passed to ``PdfWriter.write``
    :param path: value stored in the action's ``F`` entry
    """
    print('[*] Reading PDF file: %s' % name)
    source_pdf = PdfReader(name)

    print('[*] Injecting the payload in the PDF file...')
    open_action = PdfDict(F=r'%s' % path,
                          D=[0, PdfName('Fit')],
                          S=PdfName('GoToE'))
    source_pdf.pages[0].AA = PdfDict(O=open_action)

    target_pdf = PdfWriter()
    target_pdf.addpages(source_pdf.pages)

    print('[*] Saving modified PDF as: %s' % output)
    target_pdf.write(output)
    print('[*] Done!')
Example #6
0
    def make_font_object():
        """Build the Type1 font dictionary used in the annotation's Resources.

        The base font is always ``DEFAULT_BASE_FONT`` and the encoding is
        ``WinAnsiEncoding``.

        :returns PdfDict: font dictionary, ready to be placed in the
            Resources 'Font' subdictionary.
        """
        entries = {
            'Type': PdfName('Font'),
            'Subtype': PdfName('Type1'),
            'BaseFont': PdfName(DEFAULT_BASE_FONT),
            'Encoding': PdfName('WinAnsiEncoding'),
        }
        return PdfDict(**entries)
Example #7
0
 def get_png_smask(image):
     """Build an indirect grayscale Image XObject from the image's 'A' channel.

     The channel is compressed with ``Image.make_compressed_image_content``
     and declared as an 8-bit ``DeviceGray`` image with ``FlateDecode``.

     :param image: object exposing ``size`` and ``getchannel('A')``
     :returns PdfDict: the Image XObject, flagged as indirect
     """
     width, height = image.size
     alpha_content = Image.make_compressed_image_content(
         image.getchannel('A'))
     entries = dict(
         stream=alpha_content,
         Width=width,
         Height=height,
         BitsPerComponent=8,
         Filter=PdfName('FlateDecode'),
         ColorSpace=PdfName('DeviceGray'),
         Subtype=PdfName('Image'),
         Type=PdfName('XObject'),
     )
     mask = PdfDict(**entries)
     mask.indirect = True
     return mask
Example #8
0
def _create_pdf_attachment(attachment, url_fetcher):
    """
    Create an attachment to the PDF stream

    :param attachment:
        an ``Attachment``, a ``(url, description)`` tuple, or any other value,
        which is handed to ``Attachment(guess=...)``
    :param url_fetcher:
        forwarded to ``Attachment`` whenever one has to be built here
    :return:
        the object representing the ``/Filespec`` object or :obj:`None` if the
        attachment couldn't be read.
    """
    try:
        if isinstance(attachment, tuple):
            # Attachments coming from document links (<link> or <a>) can only
            # be URLs, and arrive here as tuples.
            link_url, link_description = attachment
            attachment = Attachment(url=link_url,
                                    url_fetcher=url_fetcher,
                                    description=link_description)
        elif not isinstance(attachment, Attachment):
            attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)

        with attachment.source as (_source_type, source, url, _):
            # Raw bytes are wrapped so they can be read like a file.
            readable = (io.BytesIO(source)
                        if isinstance(source, bytes) else source)
            pdf_file_object = _create_compressed_file_object(readable)
    except URLFetchingError as exc:
        LOGGER.error('Failed to load attachment: %s', exc)
        return None

    # TODO: Use the result object from a URL fetch operation to provide more
    # details on the possible filename
    filespec = PdfDict(
        Type=PdfName('Filespec'),
        F=PdfString.encode(''),
        UF=PdfString.encode(_get_filename_from_result(url, None)),
        EF=PdfDict(F=pdf_file_object),
        Desc=PdfString.encode(attachment.description or ''))
    return filespec
Example #9
0
def get_form_fields_from_fdf(pdf_file_path):
    """Collect the AcroForm field names and values of a PDF into a ``PdfDict``.

    Fields whose ``FT`` is ``/Sig`` and fields without a ``T`` name are
    skipped. A message is printed and ``None`` returned when the file cannot
    be parsed or exposes no form fields.

    :param pdf_file_path: PDF to read, passed to ``PdfReader``
    :return: ``PdfDict`` mapping ``PdfName(<decoded field name>)`` to the
        field's ``V`` entry, or ``None`` on failure
    """
    try:
        pdf_reader = PdfReader(pdf_file_path)
    except errors.PdfParseError:
        print(f'File \'{pdf_file_path}\' not found please specify full path')
        return None

    try:
        # Check if PDF has Form Fields and if we can read them
        pdf_form_fields = pdf_reader.Root.AcroForm.Fields
    except AttributeError:
        pdf_form_fields = None
    # An AcroForm without a Fields entry gives None instead of raising;
    # treat it the same as a missing AcroForm rather than failing in the loop.
    if pdf_form_fields is None:
        print(f'File \'{pdf_file_path}\' no Form Fields')
        return None

    # Create an empty PdfDict to collect Form Fields and Values
    pdf_metadata = PdfDict()

    # Define list of Form Fields to be ignored from transfer to PdfDict
    # For example: field Sig
    ignore_fields = ('/Sig',)

    # Load Form Fields into PdfDict
    for field in pdf_form_fields:
        if field.FT in ignore_fields:
            continue
        field_name = field.T
        if field_name is None:
            continue
        pdf_metadata[PdfName(field_name.decode())] = field.V
    return pdf_metadata
Example #10
0
def fill_form(input_file, output_file, data):
    """Fill the form widgets of a PDF and write the result.

    input_file can be file object or path name
    output_file can be file object or path name
    data is dictionary with keys corresponding to the form fields

    For each widget annotation whose field name is a key of ``data``:
    a ``None`` value leaves the widget untouched, ``True`` sets it to the
    ``/On`` state (checkbox), and any other value is written as text. Every
    widget that was filled gets ``Ff=1``.
    """

    the_pdf = PdfReader(input_file)
    for page in the_pdf.pages:
        # NOTE(review): the lookup is expected to give None for a page that
        # has no annotation array; iterate over nothing in that case.
        annotations = page[ANNOT_KEY] or ()
        for annotation in annotations:
            if annotation[SUBTYPE_KEY] != WIDGET_SUBTYPE_KEY:
                continue
            field_name = annotation[ANNOT_FIELD_KEY]
            if field_name is None:
                # widget without a field name of its own: nothing to match
                continue
            # drop the first and last character (the string delimiters)
            key = field_name[1:-1]
            if key not in data:
                continue
            val = data[key]
            if val is None:
                # skip nulls
                continue
            if val is True:
                # treat booleans as checkboxes; an identity test keeps the
                # numbers 1 and 1.0 (which compare equal to True) as text
                annotation.update(PdfDict(V=PdfName("On")))
            else:
                # set annotation value
                annotation.update(PdfDict(V="{}".format(val)))
                # and empty appearance to make field visible in Apple Preview
                annotation.update(PdfDict(AP=""))
            # mark the fields as un-editable
            annotation.update(PdfDict(Ff=1))

    # set NeedAppearances to ensure the fields are visible in Adobe Reader
    if the_pdf.Root.AcroForm:
        the_pdf.Root.AcroForm.update(
            PdfDict(NeedAppearances=PdfObject("true")))

    PdfWriter().write(output_file, the_pdf)
Example #11
0
    def make_cid_font_object(tt_font):
        """Build the CIDFontType2 dictionary used as the descendant of a
        composite Type 0 font.

        :param TrueTypeFont tt_font: Our utility class used to parse and
            calculate font metrics from a true type font.
        :returns PdfDict: indirect CID font dictionary.
        """
        base_font = PdfName(tt_font.fontName)
        system_info = FreeText.make_cid_system_info_object()
        descriptor = FreeText.make_font_descriptor_object(tt_font)
        # The default width is rounded to a whole number
        default_width = int(round(tt_font.metrics.defaultWidth, 0))
        glyph_widths = PdfArray(tt_font.metrics.widths)
        cid_to_gid = FreeText.make_cid_to_gid_map_object(tt_font)

        return IndirectPdfDict(
            Type=PdfName('Font'),
            Subtype=PdfName('CIDFontType2'),
            BaseFont=base_font,
            CIDSystemInfo=system_info,
            FontDescriptor=descriptor,
            DW=default_width,
            Widths=glyph_widths,
            CIDToGIDMap=cid_to_gid,
        )
Example #12
0
    def make_composite_font_object(font_file_path):
        """Build the Type0 composite font dictionary for the annotation's
        Resources dict.

        The font is loaded with ``get_true_type_font`` and wrapped in a single
        descendant CID font; the encoding is ``Identity-H``.

        :param str font_file_path: The path and filename to the true type
            font we want to embed.
        :returns PdfDict: indirect font dictionary, ready to be included in
            the Resources 'Font' subdictionary.
        """
        # TODO: Get font name from font program itself
        tt_font = get_true_type_font(font_file_path, DEFAULT_BASE_FONT)

        base_font = PdfName(tt_font.fontName)
        descendants = PdfArray([FreeText.make_cid_font_object(tt_font)])
        to_unicode = FreeText.make_to_unicode_object()

        return IndirectPdfDict(
            Type=PdfName('Font'),
            Subtype=PdfName('Type0'),
            BaseFont=base_font,
            Encoding=PdfName('Identity-H'),
            DescendantFonts=descendants,
            ToUnicode=to_unicode,
        )
Example #13
0
def write_fillable_pdf(input_pdf_path, output_pdf_path, data_dict,
                       camposCheckBox):
    """Fill the form widgets of a PDF template and write the result.

    Text fields get their value plus a generated appearance stream that draws
    the value in 8pt Helvetica. Fields listed in ``camposCheckBox`` are
    ticked only when their value equals the string ``'true'`` and never get
    an appearance stream.

    :param input_pdf_path: template PDF, passed to ``PdfReader``
    :param output_pdf_path: destination passed to ``PdfWriter().write``
    :param data_dict: maps field names to the values to fill in
    :param camposCheckBox: container of the field names that are check boxes
    """

    template_pdf = PdfReader(input_pdf_path)
    # Needed so that viewers display the changes; skipped when the document
    # has no AcroForm dictionary to update.
    acro_form = template_pdf.Root.AcroForm
    if acro_form is not None:
        acro_form.update(PdfDict(NeedAppearances=PdfObject('true')))

    # For every page of the PDF
    for page in template_pdf.pages:
        # NOTE(review): the lookup is expected to give None for a page that
        # has no annotation array; iterate over nothing in that case.
        annotations = page[ANNOT_KEY] or ()

        # For every annotation of the page
        for annotation in annotations:
            if annotation[SUBTYPE_KEY] != WIDGET_SUBTYPE_KEY:
                continue
            if not annotation[ANNOT_FIELD_KEY]:
                continue
            key = annotation[ANNOT_FIELD_KEY][1:-1]
            if key not in data_dict:
                continue

            # HACK for check boxes: tick them when the value is 'true',
            # otherwise leave them alone (unchecked by default).
            if key in camposCheckBox:
                if data_dict[key] == 'true':
                    annotation.update(
                        PdfDict(V='{}'.format(data_dict[key]),
                                AS=PdfName('Yes')))
                continue

            value = '{}'.format(data_dict[key])
            # Backslashes and parentheses delimit/escape PDF literal strings;
            # escape them so the value cannot break the content stream.
            escaped = (value.replace('\\', '\\\\')
                       .replace('(', '\\(')
                       .replace(')', '\\)'))

            # Appearance object needed so the filled-in value is visible
            rct = annotation.Rect
            hight = round(float(rct[3]) - float(rct[1]), 2)
            width = round(float(rct[2]) - float(rct[0]), 2)

            xobj = PdfDict(
                BBox=[0, 0, width, hight],
                FormType=1,
                Resources=PdfDict(
                    ProcSet=[PdfName.PDF, PdfName.Text]),
                Subtype=PdfName.Form,
                Type=PdfName.XObject)
            # Assign a stream to it. The continuation lines keep their
            # original leading whitespace so the emitted bytes are unchanged.
            xobj.stream = '''/Tx BMC
                        BT
                        /Helvetica 8.0 Tf
                        1.0 5.0 Td
                        0 g
                        (''' + escaped + ''') Tj
                        ET EMC'''

            # Update the annotation in the PDF
            annotation.update(
                PdfDict(AP=PdfDict(N=xobj),
                        V=value))

    # Write the filled-in PDF to the output path
    PdfWriter().write(output_pdf_path, template_pdf)
Example #14
0
    def make_font_descriptor_object(tt_font):
        """Build the FontDescriptor dictionary from the font's metrics.

        ``ItalicAngle`` is truncated to an integer; the other numeric metrics
        are rounded to whole numbers.

        :param TrueTypeFont tt_font: Our utility class used to parse and
            calculate font metrics from a true type font.
        :returns PdfDict: indirect Font Descriptor dictionary.
        """

        def whole(value):
            # Round a metric to the nearest whole number.
            return int(round(value, 0))

        font_name = PdfName(tt_font.fontName)
        flags = tt_font.metrics.flags
        bbox = tt_font.metrics.bbox
        italic_angle = int(tt_font.metrics.italicAngle)
        ascent = whole(tt_font.metrics.ascent)
        descent = whole(tt_font.metrics.descent)
        cap_height = whole(tt_font.metrics.capHeight)
        stem_v = whole(tt_font.metrics.stemV)
        missing_width = whole(tt_font.metrics.defaultWidth)
        font_file = FreeText.make_font_file_object(tt_font)

        return IndirectPdfDict(
            Type=PdfName('FontDescriptor'),
            FontName=font_name,
            Flags=flags,
            FontBBox=bbox,
            ItalicAngle=italic_angle,
            Ascent=ascent,
            Descent=descent,
            CapHeight=cap_height,
            StemV=stem_v,
            MissingWidth=missing_width,
            FontFile2=font_file)
Example #15
0
def create_highlight(points,
                     color=(1, 0.92, 0.23),
                     author=None,
                     contents=None):
    """Given Quad points, create a highlight object in standard pdf format.

    :param points: iterable of ``(x1, y1, x2, y2)`` boxes to highlight
    :param color: value stored as the annotation's ``C`` entry
    :param author: stored as ``T`` when truthy
    :param contents: stored as ``Contents`` when truthy
    :return: indirect ``PdfDict`` highlight annotation whose ``QuadPoints``
        hold eight numbers per box and whose ``Rect`` is the bounding box of
        all boxes (all zeros when ``points`` is empty)
    """
    new_highlight = PdfDict()
    new_highlight.F = 4
    new_highlight.Type = PdfName('Annot')
    new_highlight.Subtype = PdfName('Highlight')
    if author:
        new_highlight.T = author
    new_highlight.C = color
    if contents:
        new_highlight.Contents = contents
    new_highlight.indirect = True

    #############################################################
    ### Search for bounding coordinates
    #############################################################
    quad_pts = []
    xs = []
    ys = []
    for (x1, y1, x2, y2) in points:
        # this quadpoints specified PDF definition of rect box
        quad_pts.extend([x1, y2, x2, y2, x1, y1, x2, y1])
        xs.extend((x1, x2))
        ys.extend((y1, y2))

    if xs:
        # Take the extremes of the actual coordinates; seeding the maximum
        # with 0 would give a wrong box when every coordinate is negative.
        rect = [min(xs), min(ys), max(xs), max(ys)]
    else:
        # No boxes at all: use an empty rectangle instead of infinities,
        # which are not valid PDF numbers.
        rect = [0.0, 0.0, 0.0, 0.0]

    new_highlight.QuadPoints = PdfArray(quad_pts)
    new_highlight.Rect = PdfArray(rect)
    return new_highlight
Example #16
0
def apply_annotations(rmpage, page_annot, ocgorderinner):
    """Append one annotation dict to ``rmpage.Annots`` per layer annotation.

    :param rmpage: page object; gets an ``Annots`` array if it has none
    :param page_annot: sequence of layers; the annotations of a layer are
        read from index 1 of the layer. Each annotation supplies its subtype
        at index 0 and four coordinates at indices 1-4.
    :param ocgorderinner: when truthy, indexed by layer position to obtain
        the ``OC`` entry of that layer's annotations
    """
    author = 'RCU'  # self.model.device_info['rcuname']
    for layer_index, layer in enumerate(page_annot):
        for annot in layer[1]:
            # PDF origin is in bottom-left, so invert all
            # y-coordinates.
            rect = PdfArray([
                annot[1] * PTPERPX,
                PDFHEIGHT - (annot[2] * PTPERPX),
                annot[3] * PTPERPX,
                PDFHEIGHT - (annot[4] * PTPERPX),
            ])
            pdf_annot = PdfDict(Type=PdfName('Annot'),
                                Rect=rect,
                                T=author,
                                ANN='pdfmark',
                                Subtype=PdfName(annot[0]),
                                P=rmpage)
            # Indirect objects make for a cleaner PDF output.
            pdf_annot.indirect = True
            if ocgorderinner:
                pdf_annot.OC = ocgorderinner[layer_index]
            if '/Annots' not in rmpage:
                rmpage.Annots = PdfArray()
            rmpage.Annots.append(pdf_annot)
def fix_metadata(doc, title=None, creation_date=None):
    """Reset the document's metadata to the project's standard entries.

    ``doc.Root.Metadata`` is cleared and ``Creator``, ``Keywords`` and
    ``ModDate`` (current time) are written to ``doc.Info``. ``Title`` and
    ``CreationDate`` are written only when given.

    :param doc: document exposing ``Root`` and ``Info``
    :param title: optional title
    :param creation_date: optional value passed to ``make_pdf_date``
    """
    # Clear any existing XMP meta data
    doc.Root.Metadata = None

    entries = [
        ('Creator', 'OffeneGesetze.de'),
        ('Keywords', 'Amtliches Werk nach §5 UrhG https://offenegesetze.de'),
        ('ModDate', make_pdf_date(datetime.now())),
    ]
    if title is not None:
        entries.append(('Title', title))
    if creation_date is not None:
        entries.append(('CreationDate', make_pdf_date(creation_date)))

    for key, val in entries:
        # Date entries are stored as produced by make_pdf_date; everything
        # else goes through PdfString.from_unicode.
        is_date = 'Date' in key
        doc.Info[PdfName(key)] = val if is_date else PdfString.from_unicode(val)
Example #18
0
def getPages(allpages, x, y, gap):
    """Combine the next ``x * y`` pages into a single page laid out in a grid.

    The consumed pages are removed from the front of ``allpages``. Each one
    is converted with ``pagexobj``, registered as XObject ``/P<index>`` and
    drawn scaled by ``1/x - gap`` horizontally and ``1/y - gap`` vertically.

    :param allpages: mutable list of source pages; shortened in place
    :param x: number of columns
    :param y: number of rows
    :param gap: amount subtracted from each scale factor; also used for the
        negative origin of the MediaBox
    :return: ``PdfDict`` page containing the combined content
    """
    # Number of pages to combine
    per_sheet = x * y

    # Pull pages off the list
    xobjects = [pagexobj(source) for source in allpages[:per_sheet]]
    del allpages[:per_sheet]

    # Out page size
    width_max = max(xobj.BBox[2] for xobj in xobjects)
    height_max = max(xobj.BBox[3] for xobj in xobjects)

    scale_x = 1./x - gap
    scale_y = 1./y - gap

    commands = []
    xobjdict = PdfDict()

    row = y
    for position, xobj in enumerate(xobjects):
        offset_x = (position % x) * width_max / x
        # A zero horizontal offset moves the layout down to the next row.
        if not offset_x:
            row -= 1
        offset_y = row * height_max / y

        # Page number
        name = PdfName('P{}'.format(position))

        commands.append('q {x} 0 0 {y} {w} {h} cm {i} Do Q\n'.format(
            x=scale_x, y=scale_y, w=offset_x, h=offset_y, i=name))
        xobjdict[name] = xobj

    return PdfDict(
        Type=PdfName.Page,
        Contents=PdfDict(stream=''.join(commands)),
        MediaBox=PdfArray([-1000*gap, -1000*gap, width_max, height_max]),
        Resources=PdfDict(XObject=xobjdict),
    )
Example #19
0
def concatenate(input_paths, output_path, details=None):
    """Join several pdf files, in order, into one output pdf.

    The output's ``Info`` dictionary is always replaced by a fresh one, which
    is then filled from ``details`` when given.

    Args:
        input_paths: A sequence of paths to pdf files.
        output_path: The desired path for the concatenated pdf.
        details: A dictionary of metadata values desired for the final pdf;
            each key is turned into a ``PdfName``.
    """
    writer = PdfWriter()
    for path in input_paths:
        writer.addpages(PdfReader(path).pages)

    info = IndirectPdfDict()
    writer.trailer.Info = info
    if details is not None:
        for metadata, value in details.items():
            info[PdfName(metadata)] = value

    writer.write(output_path)
Example #20
0
 def test_graphics_state(self):
     """A fully specified GraphicsState maps onto the matching ExtGState
     dictionary entries."""
     settings = dict(
         line_width=2,
         line_cap=constants.LINE_CAP_ROUND,
         line_join=constants.LINE_JOIN_MITER,
         miter_limit=1.404,
         dash_array=[[1], 0],
         stroke_transparency=0.7,
         fill_transparency=0.5,
     )
     actual = GraphicsState(**settings).as_pdf_dict()
     expected = PdfDict(
         Type=PdfName('ExtGState'),
         LW=2,
         LC=1,
         LJ=0,
         ML=1.404,
         D=[[1], 0],
         CA=0.7,
         ca=0.5,
     )
     assert actual == expected
Example #21
0
                        1.0 5.0 Td
                        0 g
                        (''' + data_dict[key] + ''') Tj
                        ET EMC'''
                        annotation.update(
                            PdfDict(AP=PdfDict(N=xobj),
                                    V='{}'.format(data_dict[key])))
                        #annotation.update(pdfrw.PdfDict(V='{}'.format(data_dict[key]),AP='{}'.format(data_dict[key])))

    PdfWriter().write(output_pdf_path, template_pdf)


data_dict = {
    'untitled1': '46017675',  #Codigo de centro
    'untitled5': 'IES La Sènia',  #Nombre del centro
    'untitled6': PdfName('Yes'),  #Centro titularidad publica
    'untitled2': 'Paiporta',  #Localidad centro
    'untitled4': 'Valencia',  #Provincia centro
    'untitled8': '961 20 59 55',  #Telefono Centro
    'untitled3': 'Calle Escultor José Capuz, 96',  #Direccion Centro
    'untitled9': '46200',  #Codigo Postal Centro
    'untitled10': 'NIA',  #NIA Alumno
    'untitled11': 'Curso',  #Curso Alumno
    'untitled12': 'Apellidos, Nombre',  #Apellidos, Nombre - Alumnos
    'untitled15': 'Desarrollo de Aplicaciones Web',  #Titulo ciclo
    'untitled16': 'Superior',  #Grado ciclo
    'untitled18': 'Punto 1.1',  #Punto 1.1
    'untitled17': 'Punto 1.2',  #Punto 1.2
    'untitled19': 'Punto 1.3',  #Punto 1.3
    'untitled20': 'Punto 1.4',  #Punto 1.4
    'untitled21': 'true',  #Check Avanzado
Example #22
0
 def pdfdict(self):
     """Return a PageLabel entry to be inserted in the root of a PdfReader
     object.

     The ``Nums`` array is the concatenation of ``label.pdfobjs()`` for every
     label of ``self``, taken in sorted order.
     """
     flattened = []
     for label in sorted(self):
         flattened.extend(label.pdfobjs())
     return PdfDict(Type=PdfName("Catalog"), Nums=PdfArray(flattened))
Example #23
0
#!/usr/bin/env python3
from collections import namedtuple
from pdfrw import PdfName, PdfDict, PdfObject, PdfString

# Plain record behind PageLabelScheme: one field per scheme attribute.
PageLabelTuple = namedtuple(
    "PageLabelScheme",
    ["startpage", "style", "prefix", "firstpagenum"],
)

# Values used for the optional scheme attributes.
defaults = dict(style="arabic", prefix='', firstpagenum=1)

# Human-readable style name -> PDF name object for that style.
styles = {
    style_name: PdfName(style_code)
    for style_name, style_code in (
        ("arabic", 'D'),
        ("roman lowercase", 'r'),
        ("roman uppercase", 'R'),
        ("letters lowercase", 'a'),
        ("letters uppercase", 'A'),
    )
}

# Reverse lookup: PDF name object -> human-readable style name.
stylecodes = {style_code: style_name
              for style_name, style_code in styles.items()}


class PageLabelScheme(PageLabelTuple):
    """Represents a page numbering scheme.
        startpage : the index in the pdf (starting from 0) of the
                    first page the scheme will be applied to.
        style : page numbering style (arabic, roman [lowercase|uppercase], letters [lowercase|uppercase])
        prefix: a prefix to be prepended to all page labels
        firstpagenum : where to start numbering
    """
    __slots__ = tuple()

    def __new__(cls,
                startpage,
                style=defaults["style"],
Example #24
0
def join_files(input_files, output_file):
    """Concatenate PDFs into one file, merging their AcroForm dictionaries.

    input_files is a list of file objects or path names
    output_file can be file object or path name

    The first AcroForm found becomes the output AcroForm. For later ones,
    keys missing from the output are copied over, fonts missing from the
    output's ``/DR /Font`` dictionary are added, and their ``/Fields`` are
    appended to the output's ``/Fields``.
    """

    # standard PdfWriter does not copy AcroForm objects
    # modified from https://stackoverflow.com/a/57687160

    root_key = PdfName("Root")
    acroform_key = PdfName("AcroForm")
    fields_key = PdfName("Fields")
    dr_key = PdfName("DR")
    font_key = PdfName("Font")

    output = PdfWriter()
    output_acroform = None
    for pdf in input_files:
        reader = PdfReader(pdf, verbose=False)
        output.addpages(reader.pages)

        root = reader[root_key]
        if acroform_key not in root.keys():
            # Not all PDFs have an AcroForm node
            continue

        source_acroform = root[acroform_key]
        if fields_key in source_acroform:
            source_fields = source_acroform[fields_key]
        else:
            source_fields = []

        if output_acroform is None:
            # copy the first AcroForm node
            output_acroform = source_acroform
        else:
            for key in source_acroform.keys():
                # Add new AcroForms keys if output_acroform already existing
                if key not in output_acroform:
                    output_acroform[key] = source_acroform[key]
            # Add missing font entries in /DR node of source file
            if (dr_key in source_acroform.keys()
                    and font_key in source_acroform[dr_key].keys()):
                source_fonts = source_acroform[dr_key][font_key]
                output_dr = output_acroform[dr_key]
                if font_key not in output_dr.keys():
                    # if output_acroform is missing entirely the /Font node
                    # under an existing /DR, simply add it
                    output_dr[font_key] = source_fonts
                else:
                    # else add new fonts only
                    output_fonts = output_dr[font_key]
                    for font_name in source_fonts.keys():
                        if font_name not in output_fonts:
                            output_fonts[font_name] = source_fonts[font_name]

        if fields_key not in output_acroform:
            output_acroform[fields_key] = source_fields
        elif output_acroform[fields_key] is not source_fields:
            # Add new fields. When the output already holds this very list
            # (first AcroForm, or /Fields just copied over above), extending
            # it with itself would list every field twice.
            output_acroform[fields_key] += source_fields
    output.trailer[root_key][acroform_key] = output_acroform
    output.write(output_file)
Example #25
0
def do_apply_ocg(basepage, rmpage, i, uses_base_pdf, ocgprop, annotations):
    ocgpage = IndirectPdfDict(Type=PdfName('OCG'), Name='Page ' + str(i + 1))
    ocgprop.OCGs.append(ocgpage)

    # The Order dict is a Page, followed by Inner
    ocgorderinner = PdfArray()

    # Add Template OCG layer
    # If this uses a basepdf, the template is located
    # elsewhere.

    # If using a basepdf, assign its stream as a
    # 'Background' layer under this page. When the page
    # primary OCG is disabled, the background will
    # remain, making it easy to disable all annotations.
    if uses_base_pdf:
        ocgorigdoc = IndirectPdfDict(Type=PdfName('OCG'), Name='Background')
        ocgprop.OCGs.append(ocgorigdoc)
        ocgorderinner.append(ocgorigdoc)

        uncompress.uncompress([basepage.Contents])
        stream = basepage.Contents.stream
        stream = '/OC /ocgorigdoc BDC\n' \
            + stream \
            + 'EMC\n'
        basepage.Contents.stream = stream
        compress.compress([basepage.Contents])

        if '/Properties' in basepage.Resources:
            props = basepage.Resources.Properties
        else:
            props = PdfDict()
        props.ocgorigdoc = ocgorigdoc
        basepage.Resources.Properties = props

    # If not using a basepdf, assign the rmpage's stream
    # as a 'Template' layer under this page. It will be
    # affected by disabling the primary Page OCG (which
    # by itself is kind of useless for exported
    # notebooks).

    # Regardless of using a basepdf or not, put the
    # rmpage layers into their own OCGs.

    # If the template has an XObject, we want to skip
    # the first one. This happens when the template
    # contains a PNG. Question--what happens when the
    # template contains more than one PNG? How do we
    # detect all of those?

    template_xobj_keys = []
    vector_layers = []
    uncompress.uncompress([rmpage.Contents])
    if uses_base_pdf:
        # The entire thing is the page ocg
        stream = '/OC /ocgpage BDC\n'
        stream += rmpage.Contents.stream
        stream += 'EMC\n'
        rmpage.Contents.stream = stream
    else:
        stream = rmpage.Contents.stream
        # Mark the template ocg separate from page ocg
        template_endpos = 0
        page_inatpos = 0
        findkey = '1 w 2 J 2 j []0  d\nq\n'
        # Finds only the first instance, which should be
        # for the template.
        findloc = stream.find(findkey)
        if findloc < 0:
            # May be a vector, which we stick a marker
            # in for.
            # ?? Why is this a half-point off ??
            findkey = '799.500000 85 l\n'
            m = re.search(findkey, rmpage.Contents.stream)
            if m:
                findloc = m.start()
        if findloc > 0:
            template_endpos = findloc + len(findkey)
            # Add vector template OCG
            stream = '/OC /ocgtemplate BDC\n'
            stream += rmpage.Contents.stream[:template_endpos]
            stream += 'EMC\n'
            page_inatpos = len(stream)
            stream += rmpage.Contents.stream[template_endpos:]
            # Save stream
            rmpage.Contents.stream = stream

        # Add template ocg
        ocgtemplate = IndirectPdfDict(Type=PdfName('OCG'), Name='Template')
        ocgprop.OCGs.append(ocgtemplate)
        ocgorderinner.append(ocgtemplate)

        # If a template (which is SVG) has embedded PNG
        # images, those appear as XObjects. This will
        # mess up the layer order, so we will ignore
        # them later.
        template_xobj_keys = \
            re.findall(r'(\/Im[0-9]+)\s',
                        stream[:template_endpos])

        # Page ocg
        stream = rmpage.Contents.stream[:page_inatpos]
        stream += '/OC /ocgpage BDC\n'
        stream += rmpage.Contents.stream[page_inatpos:]
        stream += 'EMC\n'
        # Save stream
        rmpage.Contents.stream = stream

    # Find all other vector layers using the magic
    # point (DocumentPageLayer.render_to_painter()).
    # ?? Why is this a half-point off ??
    while True:
        m = re.search('420.500000 69 m\n', rmpage.Contents.stream)
        if not m:
            break
        stream = ''
        layerid = 'ocglayer{}'.format(len(vector_layers) + 1)
        stream = rmpage.Contents.stream[:m.start()]
        if len(vector_layers):
            # close previous layer
            stream += 'EMC\n'
        stream += '/OC /{} BDC\n'.format(layerid)
        stream += rmpage.Contents.stream[m.end():]
        vector_layers.append(layerid)
        rmpage.Contents.stream = stream
    # If we added vector layers, have to end the
    # first one.
    if len(vector_layers):
        stream = rmpage.Contents.stream + 'EMC\n'
        rmpage.Contents.stream = stream

    # Done--recompress the stream.
    compress.compress([rmpage.Contents])

    # There shouldn't be any Properties there since we
    # generated the rmpage ourselves, so don't bother
    # checking.
    rmpage.Resources.Properties = PdfDict(ocgpage=ocgpage)
    if not uses_base_pdf:
        rmpage.Resources.Properties.ocgtemplate = ocgtemplate

    # Add individual OCG layers (Bitmap)
    was_vector = True
    for n, key in enumerate(rmpage.Resources.XObject):
        if str(key) in template_xobj_keys:
            continue
        was_vector = False
        l = n - len(template_xobj_keys)
        # This would indicate a bug in the handling of a
        # notebook.
        try:
            layer = annotations[i][l]
        except:
            log.error(
                'could not associate XObject with layer: (i, l) ({}, {})'.
                format(i, l))
            log.error(str(annotations))
            log.error('document: {} ()').format('uuid', 'self.visible_name')
            continue
        layername = layer[0]
        ocg = IndirectPdfDict(Type=PdfName('OCG'), Name=layername)
        ocgprop.OCGs.append(ocg)
        ocgorderinner.append(ocg)
        rmpage.Resources.XObject[key].OC = ocg

    # Add individual OCG layers (Vector)
    if was_vector:
        for l, layerid in enumerate(vector_layers):
            # This would indicate a bug in the handling of a
            # notebook.
            try:
                layer = annotations[i][l]
            except:
                log.error(
                    'could not associate layerid with layer: (i, l, layerid) ({}, {}, {})'
                    .format(i, l, layerid))
                log.error('document: {} ()').format('uuid',
                                                    'self.visible_name')
                log.error(str(annotations))
                continue
            layername = layer[0]
            ocg = IndirectPdfDict(Type=PdfName('OCG'), Name=layername)
            ocgprop.OCGs.append(ocg)
            ocgorderinner.append(ocg)
            rmpage.Resources.Properties[PdfName(layerid)] = \
                ocg

    # Add order of OCGs to primary document
    ocgprop.D.Order.append(ocgpage)
    ocgprop.D.Order.append(ocgorderinner)

    return ocgorderinner
Example #26
0
 def test_blank(self):
     """A default GraphicsState exports a dict holding only its Type."""
     exported = GraphicsState().as_pdf_dict()
     # Exactly one entry is expected, and it must be the ExtGState type tag.
     assert len(exported) == 1
     assert exported.Type == PdfName('ExtGState')
Example #27
0
 def add_additional_resources(self, resources):
     """Store a font dictionary under ``/Font`` in ``resources``.

     The font dictionary has a single entry, keyed by
     ``PDF_ANNOTATOR_FONT``, whose value is ``self.make_font_object()``.
     Any value already stored under ``/Font`` is replaced.

     :param resources:
         mapping to receive the ``/Font`` entry; modified in place
     """
     annotator_font = self.make_font_object()
     fonts = PdfDict()
     fonts[PdfName(PDF_ANNOTATOR_FONT)] = annotator_font
     resources[PdfName('Font')] = fonts
Example #28
0
# Image XObject with no inline data (Length=0): its content is described by
# an external file specification (F) pointing at a URL, with FFilter naming
# the filter applied to that external data.
# The key must be spelled "Subtype" (not "SubType"): PDF dictionary keys are
# case-sensitive.
alt_img = PdfDict(Type=PdfName.XObject,
                  Subtype=PdfName.Image,
                  BitsPerComponent=8,
                  ColorSpace=PdfName.DeviceRGB,
                  Height=800,
                  Width=600,
                  Length=0,
                  F=PdfDict(FS=PdfName.URL,
                            F='https://chezsoi.org/lucas/ThePatch.jpg'),
                  FFilter=PdfName.DCTDecode)
# Python's boolean literal is ``True``; the bare name ``true`` is undefined
# and raised NameError here.
alt_img.indirect = True

# Array of alternate-image dictionaries; the single entry marks the external
# image as the default one for printing.
alternates = PdfArray([PdfDict(DefaultForPrinting=True, Image=alt_img)])
alternates.indirect = True

# NOTE(review): img_kid, pdf_kid and pdf are defined outside this excerpt --
# confirm that img_kid and pdf_kid are meant to be two different objects.
img_name = PdfName('Image-9960')
img = img_kid.Resources.XObject[img_name]
img.Alternates = alternates
# Replace the XObject resources with a dict holding only the patched image.
pdf_kid.Resources.XObject = PdfDict()
pdf_kid.Resources.XObject[img_name] = img

out = PdfWriter()
out.addpage(pdf.pages[0])
out.write('out.pdf')

# CONCLUSION: neither the Adobe nor the Sumatra reader visits the link...
# It may be that readers no longer follow this "Alternates" image spec, that HTTPS is not supported, or that I made a mistake in the resulting PDF.
# Anyway, I'm giving up.
# However, Canary Tokens use a similar technique that works well (with Adobe, not Sumatra): https://github.com/sumatrapdfreader/sumatrapdf/issues/1696
Example #29
0
 def _get_color_space_name(image):
     """Return the PDF color space name matching ``image.mode``.

     :param image:
         object exposing a ``mode`` attribute
     :return:
         ``/DeviceRGB`` when the mode equals ``RGB_MODE``, ``/DeviceGray``
         when it equals ``GRAYSCALE_MODE`` or ``SINGLE_CHANNEL_MODE``
     :raises ValueError:
         for any other mode
     """
     mode = image.mode
     if mode == RGB_MODE:
         return PdfName('DeviceRGB')
     if mode in (GRAYSCALE_MODE, SINGLE_CHANNEL_MODE):
         return PdfName('DeviceGray')
     raise ValueError('Image color space not yet supported')
Example #30
0
def write_pdf_metadata(document, fileobj, scale, metadata, attachments,
                       url_fetcher):
    """
    Rewrite the PDF stored in ``fileobj`` with metadata added.

    The PDF is read from the start of ``fileobj``. Bookmarks, embedded
    attachments, link annotations, document information entries and
    TrimBox/BleedBox values are added to it, then the result is written
    back from the start of ``fileobj`` and the file is truncated at the
    current position.

    :param document:
        object whose ``pages`` provide a ``bleed`` mapping per page; also
        handed to ``prepare_metadata``
    :param fileobj:
        seekable file-like object containing the PDF
    :param scale:
        handed to ``prepare_metadata``
    :param metadata:
        object providing ``attachments``, ``title``, ``description``,
        ``generator``, ``authors``, ``keywords``, ``created`` and
        ``modified``
    :param attachments:
        extra attachments appended after ``metadata.attachments``; a falsy
        value means none
    :param url_fetcher:
        handed to ``_create_pdf_attachment``
    """
    fileobj.seek(0)
    trailer = PdfReader(fileobj)
    pages = trailer.Root.Pages.Kids

    bookmarks, links = prepare_metadata(document, scale, pages)
    if bookmarks:
        outline_items, outline_count = create_bookmarks(bookmarks, pages)
        trailer.Root.Outlines = PdfDict(
            Type=PdfName('Outlines'),
            Count=outline_count,
            First=outline_items[0],
            Last=outline_items[-1])

    # Document-level attachments: a flat [name, object, name, object, ...]
    # list stored under /Names /EmbeddedFiles /Names.
    attachments = metadata.attachments + (attachments or [])
    if attachments:
        names = []
        for attachment in attachments:
            file_object = _create_pdf_attachment(attachment, url_fetcher)
            if file_object is None:
                continue
            names.extend((PdfString.encode('attachment'), file_object))
        if names:
            trailer.Root.Names = PdfDict(
                EmbeddedFiles=PdfDict(Names=PdfArray(names)))

    # A single link can be split into multiple regions. We don't want to
    # embed a file multiple times, so every embedded URL is remembered here
    # and its object reused.
    # TODO: If we add support for descriptions this won't always be correct,
    # because two links might have the same href, but different titles.
    annot_files = {}
    for page_links in links:
        for link_type, target, _rectangle in page_links:
            if link_type != 'attachment' or target in annot_files:
                continue
            # TODO: use the title attribute as description
            annot_files[target] = _create_pdf_attachment(
                (target, None), url_fetcher)

    # TODO: splitting a link into multiple independent rectangular annotations
    # works well for pure links, but rather mediocre for other annotations and
    # fails completely for transformed (CSS) or complex link shapes (area).
    # It would be better to use /AP for all links and coalesce link shapes that
    # originate from the same HTML link. This would give a feeling similar to
    # what browsers do with links that span multiple lines.
    for page, page_links in zip(pages, links):
        page_annotations = PdfArray()
        for link_type, target, rectangle in page_links:
            if link_type == 'attachment' and annot_files[target] is not None:
                appearance = PdfDict(N=PdfDict(BBox=PdfArray(rectangle),
                                               Subtype=PdfName('Form'),
                                               Type=PdfName('XObject')))
                # evince needs /T or fails on an internal assertion. PDF
                # doesn't require it.
                annotation = PdfDict(
                    Type=PdfName('Annot'),
                    Subtype=PdfName('FileAttachment'),
                    T=PdfString.encode(''),
                    Rect=PdfArray(rectangle),
                    Border=PdfArray((0, 0, 0)),
                    FS=annot_files[target],
                    AP=appearance)
            else:
                # Plain link; also the fallback for an attachment whose
                # file object could not be created.
                annotation = PdfDict(
                    Type=PdfName('Annot'),
                    Subtype=PdfName('Link'),
                    Rect=PdfArray(rectangle),
                    Border=PdfArray((0, 0, 0)))
                if link_type == 'internal':
                    destination = (
                        target[0], PdfName('XYZ'), target[1], target[2], 0)
                    annotation.A = PdfDict(
                        Type=PdfName('Action'),
                        S=PdfName('GoTo'),
                        D=PdfArray(destination))
                else:
                    annotation.A = PdfDict(
                        Type=PdfName('Action'),
                        S=PdfName('URI'),
                        URI=PdfString.encode(iri_to_uri(target)))
            page_annotations.append(annotation)

        if page_annotations:
            page.Annots = page_annotations

    # Document information entries; metadata fields that are None are
    # skipped.
    trailer.Info.Producer = VERSION_STRING
    text_fields = (
        ('title', 'Title'),
        ('description', 'Subject'),
        ('generator', 'Creator'))
    for attr, key in text_fields:
        text = getattr(metadata, attr)
        if text is not None:
            setattr(trailer.Info, key, text)
    list_fields = (('authors', 'Author'), ('keywords', 'Keywords'))
    for attr, key in list_fields:
        items = getattr(metadata, attr)
        if items is not None:
            setattr(trailer.Info, key, ', '.join(items))
    date_fields = (('created', 'CreationDate'), ('modified', 'ModDate'))
    for attr, key in date_fields:
        pdf_date = w3c_date_to_pdf(getattr(metadata, attr), attr)
        if pdf_date is not None:
            setattr(trailer.Info, key, pdf_date)

    for page, document_page in zip(pages, document.pages):
        left, top, right, bottom = map(float, page.MediaBox)
        # Convert pixels into points
        bleed = {
            side: size * 0.75
            for side, size in document_page.bleed.items()
        }

        trim_left = left + bleed['left']
        trim_top = top + bleed['top']
        trim_right = right - bleed['right']
        trim_bottom = bottom - bleed['bottom']
        page.TrimBox = PdfArray((trim_left, trim_top, trim_right, trim_bottom))

        # Arbitrarily set PDF BleedBox between CSS bleed box (PDF MediaBox)
        # and CSS page box (PDF TrimBox), at most 10 points from the TrimBox.
        page.BleedBox = PdfArray((
            trim_left - min(10, bleed['left']),
            trim_top - min(10, bleed['top']),
            trim_right + min(10, bleed['right']),
            trim_bottom + min(10, bleed['bottom'])))

    # Overwrite the original content, then drop whatever follows the
    # position the file is left at.
    fileobj.seek(0)
    PdfWriter().write(fileobj, trailer=trailer)
    fileobj.truncate()