Example 1
def toUnicode(string, font, fontcache):
    # This is hard!

    if not font:
        # There is no font for this text. Assume Latin-1.
        return string.decode("Latin-1")
    elif font.ToUnicode:
        # Decompress the CMap stream & check that it's not compressed in a way
        # we can't understand.
        from pdfrw.uncompress import uncompress as uncompress_streams
        uncompress_streams([font.ToUnicode])

        # Use the CMap, which maps character codes to Unicode code points.
        if font.ToUnicode.stream not in fontcache:
            fontcache[font.ToUnicode.stream] = CMap(font.ToUnicode)
        cmap = fontcache[font.ToUnicode.stream]

        return cmap.decode(string)
    elif font.Encoding == "/WinAnsiEncoding":
        return string.decode("cp1252", "replace")
    elif font.Encoding == "/MacRomanEncoding":
        return string.decode("mac_roman", "replace")
    else:
        return "?"
Example 2
def toUnicode(string, font, fontcache):
    # This is hard!

    if not font:
        # There is no font for this text. Assume Latin-1.
        return string.decode("Latin-1")
    elif font.ToUnicode:
        # Decompress the CMap stream & check that it's not compressed in a way
        # we can't understand.
        from pdfrw.uncompress import uncompress as uncompress_streams
        uncompress_streams([font.ToUnicode])

        # Use the CMap, which maps character codes to Unicode code points.
        if font.ToUnicode.stream not in fontcache:
            fontcache[font.ToUnicode.stream] = CMap(font.ToUnicode)
        cmap = fontcache[font.ToUnicode.stream]

        try:
            return cmap.decode(string)
        except RedactionException:
            # If the CMap can't decode the string, fall back to the simpler encodings below.
            pass

    encoding = get_encoding(font)
    if encoding == "/WinAnsiEncoding":
        return string.decode("cp1252", "replace")
    elif encoding == "/MacRomanEncoding":
        return string.decode("mac_roman", "replace")
    else:
        current_app.logger.warning(
            f'Unrecognised font with encoding {font.Encoding}, may not be able to redact properly'
        )
        return "?"
Example 3
    def __init__(self, cmap):
        self.bytes_to_unicode = { }
        self.unicode_to_bytes = { }
        self.defns = { }
        self.usecmap = None

        # Decompress the CMap stream & check that it's not compressed in a way
        # we can't understand.
        from pdfrw.uncompress import uncompress as uncompress_streams
        uncompress_streams([cmap])

        # This is based on https://github.com/euske/pdfminer/blob/master/pdfminer/cmapdb.py.
        from pdfrw import PdfString, PdfArray
        in_cmap = False
        operand_stack = []
        codespacerange = []

        def code_to_int(code):
            # Convert the PdfString's bytes to an integer (big-endian).
            code = code.to_bytes()
            if sys.version_info < (3,):
                code = (ord(c) for c in code)
            from functools import reduce
            return reduce(lambda x0, x : x0*256 + x, (b for b in code))

        def add_mapping(code, char, offset=0):
            # Is this a mapping for a one-byte or two-byte character code?
            width = len(codespacerange[0].to_bytes())
            assert len(codespacerange[1].to_bytes()) == width
            if width == 1:
                # one-byte entry
                if sys.version_info < (3,):
                    code = chr(code)
                else:
                    code = bytes([code])
            elif width == 2:
                if sys.version_info < (3,):
                    code = chr(code//256) + chr(code & 255)
                else:
                    code = bytes([code//256, code & 255])
            else:
                raise ValueError("Invalid code space range %s?" % repr(codespacerange))

            # Some range operands take an array.
            if isinstance(char, PdfArray):
                char = char[offset]

            # The Unicode character is given usually as a hex string of one or more
            # two-byte Unicode code points.
            if isinstance(char, PdfString):
                char = char.to_bytes()
                if sys.version_info < (3,): char = (ord(c) for c in char)

                c = ""
                for xh, xl in chunk_pairs(list(char)):
                    c += (chr if sys.version_info >= (3,) else unichr)(xh*256 + xl)
                char = c

                if offset > 0:
                    char = char[0:-1] + (chr if sys.version_info >= (3,) else unichr)(ord(char[-1]) + offset)
            else:
                assert offset == 0

            self.bytes_to_unicode[code] = char
            self.unicode_to_bytes[char] = code

        for token in tokenize_streams([cmap.stream]):
            if token == "begincmap":
                in_cmap = True
                operand_stack[:] = []
                continue
            elif token == "endcmap":
                in_cmap = False
                continue
            if not in_cmap:
                continue
            
            if token == "def":
                name = operand_stack.pop(0)
                value = operand_stack.pop(0)
                self.defns[name] = value

            elif token == "usecmap":
                self.usecmap = operand_stack.pop(0)

            elif token == "begincodespacerange":
                operand_stack[:] = []
            elif token == "endcodespacerange":
                codespacerange = [operand_stack.pop(0), operand_stack.pop(0)]

            elif token in ("begincidrange", "beginbfrange"):
                operand_stack[:] = []
            elif token in ("endcidrange", "endbfrange"):
                for (code1, code2, cid_or_name1) in chunk_triples(operand_stack):
                    if not isinstance(code1, PdfString) or not isinstance(code2, PdfString): continue
                    code1 = code_to_int(code1)
                    code2 = code_to_int(code2)
                    for code in range(code1, code2+1):
                        add_mapping(code, cid_or_name1, code-code1)
                operand_stack[:] = []

            elif token in ("begincidchar", "beginbfchar"):
                operand_stack[:] = []
            elif token in ("endcidchar", "endbfchar"):
                for (code, char) in chunk_pairs(operand_stack):
                    if not isinstance(code, PdfString): continue
                    add_mapping(code_to_int(code), char)
                operand_stack[:] = []

            elif token == "beginnotdefrange":
                operand_stack[:] = []
            elif token == "endnotdefrange":
                operand_stack[:] = []

            else:
                operand_stack.append(token)
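
The parser above walks the operand stack with chunk_pairs and chunk_triples, which are not shown in these examples. A minimal sketch of what they presumably do, grouping a flat sequence into consecutive fixed-size tuples:

def chunk_pairs(seq):
    # [1, 2, 3, 4] -> (1, 2), (3, 4)
    it = iter(seq)
    return zip(it, it)

def chunk_triples(seq):
    # [1, 2, 3, 4, 5, 6] -> (1, 2, 3), (4, 5, 6)
    it = iter(seq)
    return zip(it, it, it)
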
Example 4
def build_text_layer(document):
    # Within each page's content stream, look for text-showing operators to
    # find the text content of the page. Construct a string that contains the
    # entire text content of the document AND a mapping from characters in the
    # text content to tokens in the content streams. That lets us modify the
    # tokens in the content streams when we find text that we want to redact.
    #
    # The text-showing operators are:
    #
    #   (text) Tj      -- show a string of text
    #   (text) '       -- move to next line and show a string of text
    #   aw ac (text) " -- show a string of text with word/character spacing parameters
    #   [ ... ] TJ     -- show text strings from the array, which are interleaved with spacing parameters
    #
    # (These operators appear only within BT ... ET "text objects", although we
    # don't make use of that fact.)
    #
    # But since we don't understand any of the other content stream operators,
    # and in particular we don't know how many operands each (non-text) operator
    # takes, we can never be sure whether what we see in the content stream is
    # an operator or an operand. If we see a "Tj", maybe it is the operand of
    # some other operator?
    #
    # We'll assume we can get by just fine, however, by treating every occurrence
    # of one of these tokens as an operator and not an operand.
    #
    # But TJ remains a little tricky because its operand is an array that precedes
    # it. Arrays are delimited by square brackets and we need to parse that.
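    #
    # For instance, a content stream might contain (an illustrative snippet, not
    # taken from a real document):
    #
    #   [ (Hel) 120 (lo) ] TJ
    #
    # where the numbers are horizontal adjustments, in thousandths of a text-space
    # unit, applied between the strings.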
    #
    # We also have to be concerned with the encoding of the text content, which
    # depends on the active font. With a simple font, the text is a string whose
    # bytes are glyph codes. With a composite font, a CMap maps multi-byte
    # character codes to glyphs. In either case, we must map glyphs to Unicode
    # characters so that we can pattern-match against the text.
    #
    # To know the active font, we look for the "<font> <size> Tf" operator.

    from pdfrw import PdfObject, PdfString, PdfArray
    from pdfrw.uncompress import uncompress as uncompress_streams
    from pdfrw.objects.pdfname import BasePdfName

    text_tokens = []
    fontcache = { }

    class TextToken:
        value = None
        font = None
        def __init__(self, value, font):
            self.font = font
            self.raw_original_value = value
            self.original_value = toUnicode(value, font, fontcache)
            self.value = self.original_value
        def __str__(self):
            # __str__ is used for serialization
            if self.value == self.original_value:
                # If unchanged, return the raw original value without decoding/encoding.
                return PdfString.from_bytes(self.raw_original_value)
            else:
                # If the value changed, encode it from Unicode according to the encoding
                # of the font that is active at the location of this token.
                return PdfString.from_bytes(fromUnicode(self.value, self.font, fontcache))
        def __repr__(self):
            # __repr__ is used for debugging
            return "Token<%s>" % repr(self.value)

    def process_text(token):
        if token.value == "": return
        text_tokens.append(token)

    # For each page...
    page_tokens = []
    for page in document.pages:
        # For each token in the content stream...

        # Remember this page's revised token list.
        token_list = []
        page_tokens.append(token_list)

        if page.Contents is None:
            continue

        prev_token = None
        prev_prev_token = None
        current_font = None

        # The page may have one content stream or an array of content streams.
        # If an array, they are treated as if they are concatenated into a single
        # stream (per the spec).
        if isinstance(page.Contents, PdfArray):
            contents = list(page.Contents)
        else:
            contents = [page.Contents]

        # If a compression Filter is applied, attempt to un-apply it. If an unrecognized
        # filter is present, an error is raised. uncompress_streams expects an array of
        # streams.
        uncompress_streams(contents)

        def make_mutable_string_token(token):
            if isinstance(token, PdfString):
                token = TextToken(token.to_bytes(), current_font)

                # Remember all unicode characters seen in this font so we can
                # avoid inserting characters that the PDF isn't likely to have
                # a glyph for.
                if current_font and current_font.BaseFont:
                    fontcache.setdefault(current_font.BaseFont, set()).update(token.value)
            return token

        # Iterate through the tokens in the page's content streams.
        for token in tokenize_streams(content.stream for content in contents):
            # Replace any string token with our own class that holds a mutable
            # value, which is how we'll rewrite content.
            token = make_mutable_string_token(token)

            # Append the token into a new list that holds all tokens.
            token_list.append(token)

            # If the token is an operator and we're not inside an array...
            if isinstance(token, PdfObject):
                # And it's one that we recognize, process it.
                if token in ("Tj", "'", '"') and isinstance(prev_token, TextToken):
                    # Simple text operators.
                    process_text(prev_token)
                elif token == "TJ" and isinstance(prev_token, PdfArray):
                    # The text array operator.
                    for i in range(len(prev_token)):
                        # (The item may not be a string! Only the strings are text.)
                        prev_token[i] = make_mutable_string_token(prev_token[i])
                        if isinstance(prev_token[i], TextToken):
                            process_text(prev_token[i])

                elif token == "Tf" and isinstance(prev_prev_token, BasePdfName):
                    # Update the current font.
                    # prev_prev_token holds the font 'name'. The name must be looked up
                    # in the content stream's resource dictionary, which is page.Resources,
                    # plus any resource dictionaries above it in the document hierarchy.
                    current_font = None
                    resources = page.Resources
                    while resources and not current_font:
                        current_font = resources.Font[prev_prev_token]
                        resources = resources.Parent

            # Remember the previously seen token in case the next operator is a text-showing
            # operator -- in which case this was the operand. Remember the token before that
            # because it may be a font name for the Tf operator.
            prev_prev_token = prev_token
            prev_token = token

    return (text_tokens, page_tokens)
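
A rough sketch of how the returned (text_tokens, page_tokens) might be consumed. Reading the file with pdfrw.PdfReader is standard pdfrw usage; the input file name, the pattern, and the per-token substitution are illustrative assumptions (matches that span token boundaries, and writing the rewritten tokens back into the content streams, would need additional work not shown here).

import re
from pdfrw import PdfReader

doc = PdfReader("input.pdf")                      # hypothetical input file
text_tokens, page_tokens = build_text_layer(doc)

# The document's visible text in content-stream order, which is what one would
# pattern-match against.
full_text = "".join(t.value for t in text_tokens)

# Each TextToken is mutable, so editing token.value is how matched text gets
# redacted when the tokens are later serialized back via __str__.
for token in text_tokens:
    token.value = re.sub(r"\d{3}-\d{2}-\d{4}", "XXX-XX-XXXX", token.value)
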
Example 5
def build_text_layer(document, options):

    from pdfrw import PdfObject, PdfString, PdfArray
    from pdfrw.uncompress import uncompress as uncompress_streams
    from pdfrw.objects.pdfname import BasePdfName
    global text_tokens
    text_tokens = []
    fontcache = {}

    class TextToken:
        value = None
        font = None

        def __init__(self, value, font):
            self.font = font
            self.raw_original_value = value
            self.original_value = toUnicode(value, font, fontcache)
            self.value = self.original_value

        def __str__(self):
            # __str__ is used for serialization
            if self.value == self.original_value:
                # If unchanged, return the raw original value without decoding/encoding.
                return PdfString.from_bytes(self.raw_original_value)
            else:
                # If the value changed, encode it from Unicode according to the encoding
                # of the font that is active at the location of this token.
                return PdfString.from_bytes(
                    fromUnicode(self.value, self.font, fontcache, options))

        def __repr__(self):
            # __repr__ is used for debugging
            #print ("Token<%s>" % repr(self.value))
            return "Token<%s>" % repr(self.value)

    def process_text(token):
        if token.value == "": return
        text_tokens.append(token)

    # For each page...
    global page_tokens
    page_tokens = []
    for page in document.pages:
        # For each token in the content stream...

        # Remember this page's revised token list.
        token_list = []
        page_tokens.append(token_list)

        if page.Contents is None:
            continue

        prev_token = None
        prev_prev_token = None
        current_font = None

        # The page may have one content stream or an array of content streams.
        # If an array, they are treated as if they are concatenated into a single
        # stream (per the spec).
        if isinstance(page.Contents, PdfArray):
            contents = list(page.Contents)
        else:
            contents = [page.Contents]

        # If a compression Filter is applied, attempt to un-apply it. If an unrecognized
        # filter is present, an error is raised. uncompress_streams expects an array of
        # streams.
        uncompress_streams(contents)

        def make_mutable_string_token(token):
            if isinstance(token, PdfString):
                token = TextToken(token.to_bytes(), current_font)

                # Remember all unicode characters seen in this font so we can
                # avoid inserting characters that the PDF isn't likely to have
                # a glyph for.
                if current_font and current_font.BaseFont:
                    fontcache.setdefault(current_font.BaseFont,
                                         set()).update(token.value)
            return token

        # Iterate through the tokens in the page's content streams.
        for token in tokenize_streams(content.stream for content in contents):
            # Replace any string token with our own class that holds a mutable
            # value, which is how we'll rewrite content.
            token = make_mutable_string_token(token)

            # Append the token into a new list that holds all tokens.
            token_list.append(token)

            # If the token is an operator and we're not inside an array...
            if isinstance(token, PdfObject):
                # And it's one that we recognize, process it.
                if token in ("Tj", "'", '"') and isinstance(
                        prev_token, TextToken):
                    # Simple text operators.
                    process_text(prev_token)
                elif token == "TJ" and isinstance(prev_token, PdfArray):
                    # The text array operator.
                    for i in range(len(prev_token)):
                        # (The item may not be a string! Only the strings are text.)
                        prev_token[i] = make_mutable_string_token(
                            prev_token[i])
                        if isinstance(prev_token[i], TextToken):
                            process_text(prev_token[i])

                elif token == "Tf" and isinstance(prev_prev_token,
                                                  BasePdfName):
                    # Update the current font.
                    # prev_prev_token holds the font 'name'. The name must be looked up
                    # in the content stream's resource dictionary, which is page.Resources,
                    # plus any resource dictionaries above it in the document hierarchy.
                    current_font = None
                    resources = page.Resources
                    while resources and not current_font:
                        current_font = resources.Font[prev_prev_token]
                        resources = resources.Parent

            # Remember the previously seen token in case the next operator is a text-showing
            # operator -- in which case this was the operand. Remember the token before that
            # because it may be a font name for the Tf operator.
            prev_prev_token = prev_token
            prev_token = token

    return (text_tokens, page_tokens)