Example #1
    def _translatePage(self, page, translator):
        def handleText(encoded_text, decodeDict):
            # Decode a <hex> string token to unicode via decodeDict, run it
            # through the translator, then re-encode it as glyph IDs.
            if not decodeDict:
                return encoded_text  # unable to decode the text
            if encoded_text[0] != '<':  # Unhandled case, should never happen
                print encoded_text[:]
                return encoded_text
            b = encoded_text.decode()
            utext0 = u""
            b0 = b
            while len(b):
                code = ord(b[0]) * 256 + ord(b[1]) if len(b) > 1 else ord(b[0])
                if decodeDict.has_key(code):
                    utext0 += decodeDict[code]
                else:
                    utext0 += "??"
                    print "\n??", hex(code), [hex(ord(x)) for x in str(b0)]
                b = b[2:]
            utext = translator(utext0)
            gid_array = []
            for x in utext:
                try:
                    name = self.ttf_cmap[ord(x)]
                    gid = self.dttf.getGlyphID(name)
                    gid_array.append(gid)
                except:
                    print "no gid%d" % ord(x), x
            return "<" + "".join("%04X" % gid for gid in gid_array) + ">"

        self._updatePageFontDecodeDicts(page)
        output = ""
        contents = page.Contents
        fonts = page.Resources.Font
        tokens = PdfTokens(readStream(contents))
        operands = []
        decodeDict = None
        for tok in tokens:
            if str.isalpha(tok[0]) or tok[0] in ['"', "'"]:
                if tok == 'Tf':
                    font_name = operands[0]
                    decodeDict = self.decodeDicts[_id(fonts[font_name])]
                elif tok == "Tj":
                    operands[0] = handleText(operands[0], decodeDict)
                elif tok == "TJ":
                    for n, t in enumerate(operands[1:]):
                        if t == ']':
                            break
                        try:
                            tokNum = float(t)
                        except:
                            tokNum = None
                        if tokNum is None:
                            operands[n + 1] = handleText(t, decodeDict)
                output += " ".join(operands + [tok]) + "\n"
                operands = []
            else:
                operands.append(tok)
        writeStream(contents, output)
Example #2
def complex_watermark_removal(stream, start_offset=1, end_offset=2):
    found = False
    start = None
    end = None
    tokens = list(PdfTokens(stream))
    # Remember where each needle text was last seen in the token stream.
    for i, token in enumerate(tokens):
        if NEEDLE_1 in token:
            start = i
            found = True
            continue
        if NEEDLE_2 in token:
            end = i
    if start is None or end is None:
        return '\n'.join(tokens), False
    # Widen the cut backwards to include the opening '[' of the TJ operand,
    # and forwards through the closing Tj/TJ operator.
    if tokens[start - 1] == '[':
        start = start - 1
    while tokens[end].upper() != 'TJ':
        end += 1
    end += 1
    return '\n'.join(tokens[:start] + tokens[end:]), found
Example #3
def tokenize_streams(streams):
    # pdfrw's tokenizer PdfTokens does lexical analysis only. But we need
    # to collapse arrays ([ .. ]) and dictionaries (<< ... >>) into single
    # token entries.
    from pdfrw import PdfDict, PdfTokens, PdfArray
    # (InlineImage and chunk_pairs are helpers defined elsewhere in the same
    # module as this function.)

    stack = []
    for stream in streams:
        tokens = PdfTokens(stream)
        for token in tokens:
            # Is this a control token?
            if token == "<<":
                # begins a dictionary
                stack.append((PdfDict, []))
                continue
            elif token == "[":
                # begins an array
                stack.append((PdfArray, []))
                continue
            elif token in (">>", "]"):
                # ends a dictionary or array
                constructor, content = stack.pop(-1)
                if constructor == PdfDict:
                    # Turn flat list into key/value pairs.
                    content = chunk_pairs(content)
                token = constructor(content)
            elif token == "BI":
                # begins an inline image's dictionary half
                stack.append((InlineImage, []))
                continue
            elif token == "ID":
                # divides an inline image's dictionary half and data half
                constructor, content = stack[-1]
                content = chunk_pairs(content)
                img = constructor(content)
                img.read_data(tokens)
                stack[-1] = (img, None)
                continue
            elif token == "EI":
                # ends an inline image
                token, _ = stack.pop(-1)

            # If we're inside something, add this token to that thing.
            if len(stack) > 0:
                stack[-1][1].append(token)
                continue

            # Yield it.
            yield token
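
A minimal usage sketch (the file name is hypothetical; chunk_pairs and
InlineImage are assumed to be defined in the same module, and the page's
content stream is assumed to be decompressed first):

from pdfrw import PdfReader
from pdfrw.uncompress import uncompress

doc = PdfReader('input.pdf')  # hypothetical input file
page = doc.pages[0]
uncompress([page.Contents])  # PdfTokens works on the decompressed stream
for token in tokenize_streams([page.Contents.stream]):
    print(token)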
Example #4
def getFontDecodeDict(font):
    """
                Only works for font with ToUnicode and DesendantFont...FontFile2
                This function is a quick ugly function, may not work.
    """
    # Attempt to use the cmap from FontFile2
    ttfbuf = readStream(font.DescendantFonts[0].FontDescriptor.FontFile2)
    tmpttf = fontTools.ttLib.TTFont(StringIO.StringIO(ttfbuf))
    cmap = None
    if tmpttf.has_key('cmap'):
        if tmpttf['cmap'].getcmap(3, 1):
            cmap = tmpttf['cmap'].getcmap(3, 1).cmap
        elif tmpttf['cmap'].getcmap(3, 10):
            cmap10 = tmpttf['cmap'].getcmap(3, 10).cmap
            cmap = {k & 0xffff: v for k, v in cmap10.iteritems()}
    if cmap:
        return {tmpttf.getGlyphID(n): unichr(u) for u, n in cmap.iteritems()}
    # No cmap, use ToUnicode
    print "cmap not found, using /ToUnicode for", autoDecode(font.BaseFont)
    rtn = {}
    tokens = PdfTokens(readStream(font.ToUnicode))
    doRange = False
    working_list = nums = []
    for tok in tokens:
        if tok == 'beginbfrange':
            doRange = True
        elif tok == 'endbfrange':
            doRange = False
        elif tok == '[':
            working_list = []
        elif tok == ']':
            nums.append(working_list)
            working_list = nums
        elif tok[0] == '<' and tok[-1] == '>':
            working_list.append(int(tok[1:-1], 16))
        if doRange and len(nums) >= 3:
            start, end, target = nums[0], nums[1], nums[2]
            if isinstance(target, list):
                for i in range(start, end + 1):
                    rtn[i] = unichr(target[i - start])
            else:
                for i in range(start, end + 1):
                    rtn[i] = unichr(target + i - start)
            working_list = nums = []
        elif not doRange and len(nums) >= 2:
            rtn[nums[0]] = unichr(nums[1])
            working_list = nums = []
    return rtn
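
A minimal usage sketch (hypothetical file name; it assumes every font on the
page is of the CID/ToUnicode kind described in the docstring):

from pdfrw import PdfReader

doc = PdfReader('input.pdf')  # hypothetical input file
page = doc.pages[0]
# Map each font resource name (e.g. /F1) to its code -> unicode dictionary.
decode_dicts = {name: getFontDecodeDict(font)
                for name, font in page.Resources.Font.iteritems()}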
Example #5
    @classmethod
    def parsepage(cls, page, canvas=None):
        self = cls()
        contents = page.Contents
        if contents.Filter is not None:
            raise SystemExit('Cannot parse graphics -- page encoded with %s' %
                             contents.Filter)
        dispatch = cls.dispatch.get
        self.tokens = tokens = iter(PdfTokens(contents.stream))
        self.params = params = []
        self.canv = canvas
        self.gpath = None
        self.tpath = None
        self.fontdict = dict(
            (x, FontInfo(y)) for (x, y) in page.Resources.Font.iteritems())

        # Accumulate operand tokens until an operator arrives; dispatch() then
        # supplies the operator's handler and the converters for its operands.
        for token in self.tokens:
            info = dispatch(token)
            if info is None:
                params.append(token)
                continue
            func, paraminfo = info
            if paraminfo is None:
                func(self, token, ())
                continue
            delta = len(params) - len(paraminfo)
            if delta:
                if delta < 0:
                    print 'Operator %s expected %s parameters, got %s' % (
                        token, len(paraminfo), params)
                    params[:] = []
                    continue
                else:
                    print "Unparsed parameters/commands:", params[:delta]
                del params[:delta]
            paraminfo = zip(paraminfo, params)
            try:
                params[:] = [x(y) for (x, y) in paraminfo]
            except:
                for i, (x, y) in enumerate(paraminfo):
                    try:
                        x(y)
                    except:
                        raise  # For now
                    continue
            func(self, token, params)
            params[:] = []
Example #6
    def __init__(self, source):
        name = source.BaseFont[1:]
        self.name = self.lookup.get(name, name)
        self.remap = chr
        self.twobyte = False
        info = source.ToUnicode
        if not info:
            return
        # Pull the beginbfchar ... endbfchar section out of the ToUnicode CMap
        # and tokenize it; it alternates <character code> / <unicode value>.
        info = info.stream.split('beginbfchar')[1].split('endbfchar')[0]
        info = list(PdfTokens(info))
        assert not len(info) & 1
        info2 = []
        for x in info:
            assert x[0] == '<' and x[-1] == '>' and len(x) in (4, 6), x
            i = int(x[1:-1], 16)
            info2.append(i)
        # Map each character code to the text it should be decoded as.
        self.remap = dict(
            (x, chr(y)) for (x, y) in zip(info2[::2], info2[1::2])).get
        self.twobyte = len(info[0]) > 4
Example #7
import os
import sys

from music21 import *

from pdfrw import PdfReader, PdfWriter, PdfTokens
from pdfrw.findobjs import page_per_xobj

CLEF_MAPPING = {"(&)": clef.TrebleClef,
                "(V)": clef.Treble8vbClef,
                "(?)": clef.BassClef}

inpfn, = sys.argv[1:]
outfn = 'extract.' + os.path.basename(inpfn)
doc = PdfReader(inpfn)
page = doc.pages[0]
# page.Contents.stream = page.Contents.stream[:21000]
tokens = PdfTokens(page.Contents.stream)
indent = 0
commands = ["q", "Q", "ET", "BT", "cm", "Tm", "Tf", "s", "m", "l", "S", "TJ", "f", "Tj", "k", "re", "W", "n", "K", "w", "c"]
params = []
items = []
subcommands = []
for token in tokens:
    if token == "q":
        indent += 1
        if subcommands:
            items.append(subcommands)
        subcommands = []
    elif token == "Q":
        indent -= 1
        items.append(subcommands)
        #print(subcommands)