Python PdfString.from_unicode Examples, pdfrw.objects.PdfString.from_unicode Python Examples

Example #1

0

Show file

File: pdf_redactor.py Project: vitalbeats/pdf-redactor

def update_annotation(annotation, options):
    """Run the content filters over an annotation's text fields, in place.

    Every ``(pattern, function)`` pair in ``options.content_filters`` is
    applied with ``pattern.sub(function, value)`` to each text field that is
    present. Link actions (``A`` and ``PA``) are handed to
    ``update_annotation_action`` and a ``Popup`` annotation is processed
    recursively.

    Raises:
        ValueError: if the annotation carries rich text (``RC``), which is
            not supported.
    """
    from pdfrw.objects import PdfString

    # A rich-text stream. Not implemented. Bail so that we don't
    # accidentally leak something that should be redacted. This check runs
    # before any field is rewritten so the annotation is not left partially
    # modified, and so that an RC value that is not a plain string fails
    # with this message instead of an unrelated error from the loop below.
    if annotation.RC:
        raise ValueError(
            "Annotation rich-text streams (Annot/RC) are not supported.")

    # Contents holds a plain-text representation of the annotation
    # content, such as for accessibility. All annotation types may
    # have a Contents. NM holds the "annotation name" which also
    # could have redactable text, I suppose. Markup annotations have
    # "T" fields that hold a title / text label. Subj holds a
    # comment subject. CA and AC are used in widget annotations.
    # (RC is not listed: it has already been rejected above.)
    for string_field in ("Contents", "NM", "T", "Subj", "CA", "AC"):
        field_value = getattr(annotation, string_field)
        # Only text strings can be filtered. The same key may hold another
        # type -- e.g. per the PDF spec /CA on a markup annotation is a
        # numeric opacity -- and such values have no to_unicode() to call.
        # NOTE(review): assumes pdfrw represents every PDF string as a
        # PdfString instance -- confirm against the pdfrw version in use.
        if field_value and isinstance(field_value, PdfString):
            value = field_value.to_unicode()
            for pattern, function in options.content_filters:
                value = pattern.sub(function, value)
            setattr(annotation, string_field, PdfString.from_unicode(value))

    # An action, usually used for links.
    if annotation.A:
        update_annotation_action(annotation, annotation.A, options)
    if annotation.PA:
        update_annotation_action(annotation, annotation.PA, options)

    # If set, another annotation.
    if annotation.Popup:
        update_annotation(annotation.Popup, options)

Example #2

0

Show file

File: pdf_redactor.py Project: vitalbeats/pdf-redactor

def update_annotation_action(annotation, action, options):
    """Pass an action's URI through the link filters, then follow ``Next``.

    Each callable in ``options.link_filters`` is invoked as
    ``func(uri, annotation)`` and its result is fed to the next one. A final
    result of ``None`` clears the action's URI; anything else is written
    back as a ``PdfString``. Chained actions under ``Next`` are processed
    the same way, recursively.
    """
    from pdfrw.objects import PdfString

    if action.URI and options.link_filters:
        uri = action.URI.to_unicode()
        for link_filter in options.link_filters:
            uri = link_filter(uri, annotation)
        # A None result removes the annotation's link by suppressing the
        # action's URI.
        action.URI = None if uri is None else PdfString.from_unicode(uri)

    if not action.Next:
        return

    # Next may be a single Action or an array of Actions to execute next;
    # normalise the single-action case to a one-element list.
    chained = [action.Next] if isinstance(action.Next, dict) else action.Next
    for chained_action in chained:
        update_annotation_action(annotation, chained_action, options)

Example #3

0

Show file

File: djpdf.py Project: Unrud/djpdf

    def _build_font():
        """Build and return the Type0 "GlyphLessFont" font dictionary.

        The font program is read from FONT_FILENAME and the ToUnicode CMap
        from UNICODE_CMAP_FILENAME. Both, together with a generated
        CIDToGIDMap, are attached as Flate-compressed indirect streams.
        """

        def deflate(raw):
            # Compress at zlib level 9 and carry the compressed bytes as a
            # latin-1 str (one character per byte).
            return zlib.compress(raw, 9).decode("latin-1")

        # Embedded font program, referenced below as FontFile2.
        with open(FONT_FILENAME, "rb") as font_file:
            font_program_bytes = font_file.read()
        font_program = PdfDict()
        font_program.indirect = True
        font_program.Filter = [PdfName.FlateDecode]
        font_program.stream = deflate(font_program_bytes)
        # Length1 is the size of the font program before compression.
        font_program.Length1 = len(font_program_bytes)

        descriptor = PdfDict()
        descriptor.indirect = True
        descriptor.Ascent = 1000
        descriptor.CapHeight = 1000
        descriptor.Descent = -1
        descriptor.Flags = 5  # FixedPitch + Symbolic
        descriptor.FontBBox = PdfArray([0, 0, 1000, 500])
        descriptor.FontFile2 = font_program
        descriptor.FontName = PdfName.GlyphLessFont
        descriptor.ItalicAngle = 0
        descriptor.StemV = 80
        descriptor.Type = PdfName.FontDescriptor

        # Map everything to glyph 1: 65536 two-byte entries, each b"\0\1".
        gid_table_bytes = b"\0\1" * (1 << 16)
        gid_table = PdfDict()
        gid_table.indirect = True
        gid_table.Filter = [PdfName.FlateDecode]
        gid_table.stream = deflate(gid_table_bytes)
        gid_table.Length1 = len(gid_table_bytes)

        system_info = PdfDict()
        system_info.Ordering = PdfString.from_unicode("Identity")
        system_info.Registry = PdfString.from_unicode("Adobe")
        system_info.Supplement = 0

        # The single descendant CIDFont that ties the pieces together.
        descendant = PdfDict()
        descendant.indirect = True
        descendant.CIDToGIDMap = gid_table
        descendant.BaseFont = PdfName.GlyphLessFont
        descendant.CIDSystemInfo = system_info
        descendant.FontDescriptor = descriptor
        descendant.Subtype = PdfName.CIDFontType2
        descendant.Type = PdfName.Font
        descendant.DW = 500

        # ToUnicode CMap, loaded from disk and embedded compressed.
        with open(UNICODE_CMAP_FILENAME, "rb") as cmap_file:
            to_unicode_bytes = cmap_file.read()
        to_unicode = PdfDict()
        to_unicode.indirect = True
        to_unicode.Filter = [PdfName.FlateDecode]
        to_unicode.stream = deflate(to_unicode_bytes)

        # Top-level Type0 font returned to the caller.
        type0_font = PdfDict()
        type0_font.indirect = True
        type0_font.BaseFont = PdfName.GlyphLessFont
        type0_font.DescendantFonts = PdfArray([descendant])
        type0_font.Encoding = PdfName("Identity-H")
        type0_font.Subtype = PdfName.Type0
        type0_font.ToUnicode = to_unicode
        type0_font.Type = PdfName.Font

        return type0_font

Example #4

0

Show file

File: pdf_redactor.py Project: vitalbeats/pdf-redactor

def update_metadata(trailer, options):
    """Apply the configured metadata filters to ``trailer.Info``.

    Updates the PDF's Document Information Dictionary, which contains keys
    like Title, Author, Subject, Keywords, Creator, Producer, CreationDate,
    and ModDate (the latter two containing Date values, the rest strings).

    ``options.metadata_filters`` maps a field name to a list of functions.
    A field with no entry of its own uses the "DEFAULT" list, and the "ALL"
    list is run after the field's own functions for every field. Each
    function receives the current value (a ``PdfString`` is decoded to a
    Python string first) and must return a string, a ``datetime`` or
    ``None``.

    Raises:
        ValueError: if a filter function returns any other type.
    """
    from pdfrw.objects import PdfDict, PdfName, PdfString

    # Create the metadata dict if it doesn't exist, since the caller may be adding fields.
    if not trailer.Info:
        trailer.Info = PdfDict()

    metadata_filters = options.metadata_filters
    all_functions = metadata_filters.get("ALL", [])

    # Get a list of all metadata fields that exist in the PDF plus any fields
    # that there are metadata filters for (since they may insert field values).
    keys = set(str(k)[1:] for k in trailer.Info.keys()) \
      | set(k for k in metadata_filters.keys() if k not in ("DEFAULT", "ALL"))

    # Update each metadata field.
    for key in keys:
        # Get the functions to apply to this field.
        functions = metadata_filters.get(key)
        if functions is None:
            # If nothing is defined for this field, use the DEFAULT functions.
            functions = metadata_filters.get("DEFAULT", [])

        # Append the ALL functions. Build a new list instead of using "+=":
        # in-place extension would modify the list stored in
        # options.metadata_filters, so the ALL functions would pile up in
        # the per-field / DEFAULT lists and run again and again.
        functions = list(functions) + list(all_functions)

        # Run the functions on any existing values.
        value = trailer.Info[PdfName(key)]
        for f in functions:
            # Before passing to the function, convert from a PdfString to a Python string.
            if isinstance(value, PdfString):
                # decode from PDF's "(...)" syntax.
                value = value.decode()

            # Filter the value.
            value = f(value)

            # Convert Python data type to PdfString.
            if isinstance(value, str) or (sys.version_info < (3, )
                                          and isinstance(value, unicode)):  # noqa: F821
                # Convert string to a PdfString instance.
                value = PdfString.from_unicode(value)

            elif isinstance(value, datetime):
                # Convert datetime into a PDF "D" string format.
                value = value.strftime("%Y%m%d%H%M%S%z")
                if len(value) == 19:
                    # If TZ info was included, add an apostrophe between the hour/minutes offsets.
                    value = value[:17] + "'" + value[17:]
                value = PdfString("(D:%s)" % value)

            elif value is None:
                # delete the metadata value: None is stored as-is below.
                # NOTE(review): relies on pdfrw treating a None assignment
                # as removal of the key -- confirm.
                pass

            else:
                # Not every callable has a __name__ (e.g. functools.partial),
                # so look it up defensively while building the message.
                raise ValueError(
                    "Invalid type of value returned by metadata_filter function. %s was returned by %s."
                    % (repr(value),
                       getattr(f, "__name__", None) or "anonymous function"))

            # Replace value.
            trailer.Info[PdfName(key)] = value

Example #5

0

Show file

File: djpdf.py Project: 5l1v3r1/djpdf

    def _build_font():
        """Create the Type0 "GlyphLessFont" dictionary and return it.

        Reads the font program from FONT_FILENAME and the ToUnicode CMap
        from UNICODE_CMAP_FILENAME, generates a CIDToGIDMap, and links all
        of them into the returned font dictionary.
        """

        def flate_stream(payload, record_length1=False):
            # Wrap `payload` in an indirect, Flate-compressed stream dict.
            # The compressed bytes are carried as a latin-1 str; when asked,
            # Length1 is set to the size of the uncompressed payload.
            holder = PdfDict()
            holder.indirect = True
            holder.Filter = [PdfName.FlateDecode]
            holder.stream = zlib.compress(payload, 9).decode("latin-1")
            if record_length1:
                holder.Length1 = len(payload)
            return holder

        with open(FONT_FILENAME, "rb") as src:
            raw_font = src.read()
        font_file2 = flate_stream(raw_font, record_length1=True)

        fd = PdfDict()
        fd.indirect = True
        fd.Ascent = 1000
        fd.CapHeight = 1000
        fd.Descent = -1
        fd.Flags = 5  # FixedPitch + Symbolic
        fd.FontBBox = PdfArray([0, 0, 1000, 500])
        fd.FontFile2 = font_file2
        fd.FontName = PdfName.GlyphLessFont
        fd.ItalicAngle = 0
        fd.StemV = 80
        fd.Type = PdfName.FontDescriptor

        # Map everything to glyph 1
        cid_map = flate_stream(b"\0\1" * (1 << 16), record_length1=True)

        registry_info = PdfDict()
        registry_info.Ordering = PdfString.from_unicode("Identity")
        registry_info.Registry = PdfString.from_unicode("Adobe")
        registry_info.Supplement = 0

        cid_font_dict = PdfDict()
        cid_font_dict.indirect = True
        cid_font_dict.CIDToGIDMap = cid_map
        cid_font_dict.BaseFont = PdfName.GlyphLessFont
        cid_font_dict.CIDSystemInfo = registry_info
        cid_font_dict.FontDescriptor = fd
        cid_font_dict.Subtype = PdfName.CIDFontType2
        cid_font_dict.Type = PdfName.Font
        cid_font_dict.DW = 500

        # The ToUnicode CMap stream carries no Length1 entry.
        with open(UNICODE_CMAP_FILENAME, "rb") as src:
            raw_cmap = src.read()
        cmap_stream = flate_stream(raw_cmap)

        result = PdfDict()
        result.indirect = True
        result.BaseFont = PdfName.GlyphLessFont
        result.DescendantFonts = PdfArray([cid_font_dict])
        result.Encoding = PdfName("Identity-H")
        result.Subtype = PdfName.Type0
        result.ToUnicode = cmap_stream
        result.Type = PdfName.Font

        return result