def pagexobj(page, viewinfo=ViewInfo(), allow_compressed=True):
    ''' Create and return a Form XObject for one view of a page.

        With the default viewinfo the view is the entire page.  A page
        whose /Contents is an array of several streams gets those
        streams decompressed and joined (newline separated) into one
        new stream dictionary.  The result comes from _cache_xobj.
    '''
    inheritable = page.inheritable
    resources = inheritable.Resources
    rotation = get_rotation(inheritable.Rotate)
    mbox, bbox = getrects(inheritable, viewinfo, rotation)
    # The view may add its own rotation on top of the page's.
    rotation += get_rotation(viewinfo.rotate)

    raw = page.Contents
    if not isinstance(raw, PdfArray):
        contents = raw
    elif len(raw) == 1:
        contents = raw[0]
    else:
        # Several content streams: decompress each one, then glue them
        # together into a single stream.
        pieces = list(raw)
        uncompress(pieces)
        joined = '\n'.join(piece.stream for piece in pieces)
        contents = PdfDict(stream=joined)

    # The recorded length has to agree with the actual stream data.
    assert int(contents.Length) == len(contents.stream)
    if not allow_compressed:
        # Exactly one attribute may remain (the length), i.e. every
        # filter must already have been executed.
        assert len(list(contents.iteritems())) == 1
    return _cache_xobj(contents, resources, mbox, bbox, rotation)
def find_images(file):
    ''' Yield the JSON data embedded in one-pixel-high images of a PDF.

        Every object found on every page is examined.  Objects whose
        /Subtype is /Image and whose /Height equals '1' are
        decompressed (leave_raw=True), their stream is decoded as
        UTF-16BE and parsed as JSON.  Any exception raised while doing
        so is ignored and the object is skipped.
    '''
    for page in PdfReader(file).pages:
        for candidate in find_objects(page):
            carries_payload = (candidate.Subtype == PdfName.Image and
                               candidate['/Height'] == '1')
            if not carries_payload:
                continue
            try:
                uncompress([candidate], leave_raw=True)
                text = candidate.stream.decode('utf-16be')
                yield json.loads(text)
            except Exception:
                # Best effort: not every matching image holds JSON.
                pass
def parse_xref_stream(self, source, int=int, range=range,
                      enumerate=enumerate, islice=itertools.islice,
                      defaultdict=collections.defaultdict,
                      hexlify=binascii.hexlify):
    ''' Parse one cross-reference section stored as an xref stream.

        Reads the object header and dictionary from source, loads and
        decompresses the stream, then walks its fixed-width entries.
        Offsets of type-1 entries are recorded in source.obj_offsets
        (existing entries are kept).  Type-2 entries are grouped per
        containing object stream and stored on the returned
        dictionary as obj.private.object_streams.

        Problems are reported through source.exception().
    '''

    def field_values(data, widths):
        # Endless generator over the big-endian integer fields of the
        # stream; a zero-width field is reported as None.
        start = 0
        for width in itertools.cycle(widths):
            stop = start + width
            if width:
                yield int(hexlify(data[start:stop]), 16)
            else:
                yield None
            start = stop

    record_offset = source.obj_offsets.setdefault
    next_token = source.next

    # The section has to open with "<objnum> obj <<".
    header = source.multiple(3)
    header_ok = (len(header) == 3 and header[0].isdigit() and
                 header[1] == 'obj' and header[2] == '<<')
    if not header_ok:
        source.exception('Expected xref stream start')

    obj = self.readdict(source)
    if obj.Type != PdfName.XRef:
        source.exception('Expected dict type of /XRef')

    tok = next_token()
    self.readstream(obj, self.findstream(obj, tok, source), source, True)
    raw_stream = obj.stream
    if not uncompress([obj], True):
        source.exception('Could not decompress Xref stream')
    data = obj.stream
    # Fix for issue #76 -- goofy compressed xref stream
    # that is NOT ACTUALLY COMPRESSED
    if data is raw_stream:
        data = convert_store(raw_stream)

    # /Index lists (first object number, count) pairs; without it a
    # single run starting at 0 and covering /Size objects is used.
    index = [int(x) for x in (obj.Index or PdfArray(['0', obj.Size]))]
    subsections = zip(index[0::2], index[1::2])

    widths = [int(x) for x in obj.W]
    if len(widths) != 3:
        source.exception('Invalid entry size')

    object_streams = defaultdict(list)
    fields = field_values(data, widths)
    for objnum, count in subsections:
        for _ in range(count):
            xtype, p1, p2 = islice(fields, 3)
            if xtype == 2:
                # Lives inside object stream p1, at index p2.
                object_streams[p1].append((objnum, p2))
            elif xtype in (1, None) and p1:
                # Plain object at file offset p1, generation p2.
                record_offset((objnum, p2 or 0), p1)
            objnum += 1

    obj.private.object_streams = object_streams
    return obj
def load_stream_objects(self, object_streams):
    ''' Load the objects stored inside the given object streams.

        object_streams is iterated for the object numbers of the
        streams (generation 0).  Each stream is decrypted when crypt
        filters are configured, then decompressed.  Its header of
        (object number, relative offset) pairs is read up to /First
        and every listed object is parsed and stored in
        self.indirect_objects under (object number, 0).
    '''
    containers = []
    for stream_num in object_streams:
        container = self.findindirect(stream_num, 0).real_value()
        assert container.Type == '/ObjStm'
        containers.append(container)

    if not containers:
        return

    # Decrypt, then decompress, all streams in one go.
    if self.crypt_filters is not None:
        crypt.decrypt_objects(containers, self.stream_crypt_filter,
                              self.crypt_filters)
    uncompress(containers)

    for container in containers:
        tokens = PdfTokens(container.stream, 0, False)
        next_token = tokens.next
        first = int(container.First)

        # The header occupies everything before /First.
        directory = []
        while tokens.floc < first:
            objnum = int(next_token())
            relative = int(next_token())
            directory.append((objnum, first + relative))

        for objnum, position in directory:
            # Read the object, and call special code if it starts
            # an array or dictionary
            tokens.floc = position
            value = next_token()
            handler = self.special.get(value)
            if handler is not None:
                value = handler(tokens)

            key = (objnum, 0)
            self.indirect_objects[key] = value
            if key in self.deferred_objects:
                self.deferred_objects.remove(key)

            # Mark the object as indirect.
            value.indirect = key
def make_popup(page, rect, popupname, popup, code):
    """Attach a link annotation to ``page`` that toggles a popup XObject.

    ``code`` is a pair ``(codeword_on, codeword_off)`` of lists used for
    the SetOCGState action (and ``codeword_on`` for the OCMD's OCGs).
    The link is appended to ``page.Annots``, the popup and its OCMD are
    registered in the page resources, and a drawing command that places
    the popup relative to ``rect`` is appended to the page content
    stream.
    """
    from pdfrw import PdfDict, PdfArray, PdfName
    from pdfrw.uncompress import uncompress

    codeword_on, codeword_off = code

    # Action executed by the link: switch the "off" groups off and the
    # "on" groups on.
    state = PdfArray(
        [PdfName.OFF] + codeword_off + [PdfName.ON] + codeword_on)
    show_action = PdfDict(S=PdfName.SetOCGState, State=state)

    link = PdfDict(
        indirect=True,
        Type=PdfName.Annot,
        H=PdfName.I,
        Subtype=PdfName.Link,
        A=show_action,
        Rect=rect,
    )
    link_color = pdf_popup_config['popuplinkcolor']
    if link_color:
        link.C = PdfArray(link_color)
    else:
        link.Border = [0, 0, 0]
    page.Annots.append(link)

    ocmd = PdfDict(Type=PdfName.OCMD, OCGs=codeword_on, P=PdfName.AllOn)
    popup_pdfname = '/SPopup' + popupname
    ocmd_pdfname = '/SPopupOCMD{}'.format(popup_unique_id())

    # Create the resource sub-dictionaries on demand.
    resources = page.Resources
    if not resources.Properties:
        resources.Properties = PdfDict()
    if not resources.XObject:
        resources.XObject = PdfDict()
    resources.XObject[popup_pdfname] = popup
    resources.Properties[ocmd_pdfname] = ocmd

    if page.Contents.Filter:
        # Important. Otherwise appending to the stream would add plain
        # text to a compressed stream.
        uncompress([page.Contents])

    draw_popup = \
        "q /OC {ocmd} BDC 1 0 0 1 {x} {y} cm {popup} Do EMC Q\n".format(
            x=rect[0],
            y=float(rect[1]) - popup.BBox[3],
            ocmd=ocmd_pdfname,
            popup=popup_pdfname,
        )
    page.Contents.stream += draw_popup
def load_stream_objects(self, object_streams):
    ''' Load the objects stored inside the given object streams.

        object_streams is iterated for the object numbers of the
        streams (generation 0); a dict keyed by object number works,
        as does any other iterable of object numbers.  Each stream is
        decompressed, its header of (object number, relative offset)
        pairs is read, and every listed object is parsed and stored in
        self.indirect_objects under (object number, 0).

        Iteration uses plain dict iteration and .items() rather than
        iterkeys()/iteritems(), so it works on Python 2 and Python 3.
    '''
    # read object streams
    objs = []
    for num in object_streams:
        obj = self.findindirect(num, 0).real_value()
        assert obj.Type == '/ObjStm'
        objs.append(obj)

    # read objects from stream
    if not objs:
        return

    uncompress(objs)
    for obj in objs:
        objsource = PdfTokens(obj.stream, 0, False)
        snext = objsource.next
        offsets = {}
        firstoffset = int(obj.First)

        # Header: alternating object number / offset relative to /First.
        # NOTE(review): the header is considered finished at the first
        # non-digit token, so this presumably relies on the first stored
        # object not starting with a bare integer -- confirm.
        num = snext()
        while num.isdigit():
            offset = int(snext())
            offsets[int(num)] = firstoffset + offset
            num = snext()

        for num, offset in offsets.items():
            # Read the object, and call special code if it starts
            # an array or dictionary
            objsource.floc = offset
            sobj = snext()
            func = self.special.get(sobj)
            if func is not None:
                sobj = func(objsource)

            key = (num, 0)
            self.indirect_objects[key] = sobj
            if key in self.deferred_objects:
                self.deferred_objects.remove(key)

            # Mark the object as indirect.
            sobj.indirect = key
def uncompress(self):
    ''' Read every object, then decompress all of them.

        Calls self.read_all() and passes all values of
        self.indirect_objects to uncompress().  values() is used
        instead of the Python-2-only itervalues() so this also works
        on Python 3.

        NOTE(review): presumably this is a method of a reader class,
        so the inner call reaches the module-level uncompress helper
        rather than recursing into this definition -- confirm.
    '''
    self.read_all()
    uncompress(self.indirect_objects.values())
def parsexref(self, source, int=int, range=range):
    ''' Parse (one of) the cross-reference file section(s)

        Two layouts are handled:

        * a cross-reference stream object ("<num> <gen> obj << ... >>"):
          the stream is decompressed and decoded entry by entry,
          offsets of type-1 entries are recorded, objects held in
          object streams (type 2) are loaded through
          self.load_stream_objects(), and the stream dictionary is
          returned;
        * a plain "xref" table: in-use entries are recorded and the
          trailer dictionary that follows is returned.  If the table
          cannot be tokenized normally, a line-based fallback parse is
          attempted before giving up.

        Offsets are stored with source.obj_offsets.setdefault (so an
        existing entry is kept) and appended to source.all_offsets.
        Problems are reported through source.exception().
    '''

    def _pairs(array):
        # Yield (first object number, entry count) for each subsection.
        i = 0
        while True:
            yield int(array[i]), int(array[i + 1])
            i += 2
            if (i + 1) >= len(array):
                break

    def convert_to_int(d, size):
        # Left-pad the field to 8 bytes and decode it as a big-endian
        # 64-bit integer.
        if size > 8:
            source.exception('Invalid size in convert_to_int')
        d = '\x00\x00\x00\x00\x00\x00\x00\x00' + d
        d = d[-8:]
        return struct.unpack('>q', d)[0]

    def read_trailer():
        tok = next_token()
        if tok != '<<':
            source.exception('Expected "<<" starting catalog')
        return self.readdict(source)

    setdefault = source.obj_offsets.setdefault
    add_offset = source.all_offsets.append
    next_token = source.next

    tok = next_token()
    if tok.isdigit():
        # check for xref stream object
        objid = source.multiple(2)
        ok = (len(objid) == 2 and objid[0].isdigit() and
              objid[1] == 'obj')
        if ok:
            next_token()  # start of dict
            obj = self.readdict(source)
            assert obj.Type == '/XRef'
            tok = next_token()
            end = source.floc + int(obj.Length)
            self.readstream(obj, self.findstream(obj, tok, source), source)
            uncompress([obj])
            num_pairs = obj.Index or PdfArray(['0', obj.Size])
            entry_sizes = [int(x) for x in obj.W]
            object_streams = {}
            stream = obj.stream
            # The entries of all /Index subsections are stored back to
            # back, so the read position must carry over from one
            # subsection to the next instead of restarting at 0.
            stream_offset = 0
            for num, size in _pairs(num_pairs):
                for _ in range(size):
                    for i, width in enumerate(entry_sizes):
                        d = stream[stream_offset:stream_offset + width]
                        stream_offset += width
                        di = convert_to_int(d, width)
                        if i == 0:
                            xref_type = di
                            # A zero-width type field means type 1.
                            if xref_type == 0 and entry_sizes[0] == 0:
                                xref_type = 1
                        elif i == 1:
                            if xref_type == 1:
                                offset = di
                            elif xref_type == 2:
                                objnum = di
                        elif i == 2:
                            if xref_type == 1:
                                generation = di
                            elif xref_type == 2:
                                obstr_idx = di
                    if xref_type == 1 and offset != 0:
                        setdefault((num, generation), offset)
                        add_offset(offset)
                    elif xref_type == 2:
                        object_streams.setdefault(objnum, []).append(
                            obstr_idx)
                    num += 1
            self.load_stream_objects(object_streams)
            source.floc = end
            endit = source.multiple(2)
            if endit != ['endstream', 'endobj']:
                source.exception('Expected endstream endobj')
            return obj
        else:
            source.exception('Expected xref stream')
    elif tok == 'xref':
        # plain xref table
        start = source.floc
        try:
            while True:
                tok = next_token()
                if tok == 'trailer':
                    return read_trailer()
                startobj = int(tok)
                for objnum in range(startobj, startobj + int(next_token())):
                    offset = int(next_token())
                    generation = int(next_token())
                    inuse = next_token()
                    if inuse == 'n':
                        if offset != 0:
                            setdefault((objnum, generation), offset)
                            add_offset(offset)
                    elif inuse != 'f':
                        raise ValueError
        except Exception:
            # Any parse problem falls through to the line-based
            # fallback below.  KeyboardInterrupt and SystemExit are
            # deliberately not intercepted.
            pass
        try:
            # Table formatted incorrectly.
            # See if we can figure it out anyway.
            end = source.fdata.rindex('trailer', start)
            table = source.fdata[start:end].splitlines()
            for line in table:
                tokens = line.split()
                if len(tokens) == 2:
                    # Subsection header: first object number, count.
                    objnum = int(tokens[0])
                elif len(tokens) == 3:
                    offset, generation, inuse = \
                        int(tokens[0]), int(tokens[1]), tokens[2]
                    if offset != 0 and inuse == 'n':
                        setdefault((objnum, generation), offset)
                        add_offset(offset)
                    objnum += 1
                elif tokens:
                    log.error('Invalid line in xref table: %s' % repr(line))
                    raise ValueError
            log.warning('Badly formatted xref table')
            source.floc = end
            next_token()
        except Exception:
            source.floc = start
            source.exception('Invalid table format')
        return read_trailer()
    else:
        source.exception('Expected "xref" keyword or xref stream object')
def do_apply_ocg(basepage, rmpage, i, uses_base_pdf, ocgprop, annotations):
    """Wrap the content of one page in optional content groups (OCGs).

    Args:
        basepage: page of the base PDF; only touched when
            ``uses_base_pdf`` is true, in which case its content stream
            becomes a 'Background' layer.
        rmpage: the generated page whose content stream is split into a
            page layer, an optional 'Template' layer and per-layer OCGs.
        i: zero-based page index, used for the 'Page N' name and to
            look up ``annotations[i]``.
        uses_base_pdf: whether a base PDF supplies the background.
        ocgprop: optional-content properties; new groups are appended
            to ``ocgprop.OCGs`` and the order to ``ocgprop.D.Order``.
        annotations: indexed as ``annotations[i][l]``; element 0 of the
            result is used as the layer name.

    Returns:
        The PdfArray of the OCGs nested under this page's primary OCG.

    A layer that cannot be matched to an entry of ``annotations`` is
    logged and skipped.
    """
    ocgpage = IndirectPdfDict(Type=PdfName('OCG'),
                              Name='Page ' + str(i + 1))
    ocgprop.OCGs.append(ocgpage)

    # The Order dict is a Page, followed by Inner
    ocgorderinner = PdfArray()

    # If using a basepdf, assign its stream as a 'Background' layer
    # under this page. When the page primary OCG is disabled, the
    # background will remain, making it easy to disable all annotations.
    if uses_base_pdf:
        ocgorigdoc = IndirectPdfDict(Type=PdfName('OCG'),
                                     Name='Background')
        ocgprop.OCGs.append(ocgorigdoc)
        ocgorderinner.append(ocgorigdoc)

        uncompress.uncompress([basepage.Contents])
        basepage.Contents.stream = (
            '/OC /ocgorigdoc BDC\n' + basepage.Contents.stream + 'EMC\n')
        compress.compress([basepage.Contents])

        if '/Properties' in basepage.Resources:
            props = basepage.Resources.Properties
        else:
            props = PdfDict()
        props.ocgorigdoc = ocgorigdoc
        basepage.Resources.Properties = props

    # If not using a basepdf, assign the rmpage's stream as a
    # 'Template' layer under this page. Regardless of using a basepdf
    # or not, put the rmpage layers into their own OCGs.
    template_xobj_keys = []
    vector_layers = []
    page_contents = rmpage.Contents
    uncompress.uncompress([page_contents])
    if uses_base_pdf:
        # The entire thing is the page ocg
        page_contents.stream = (
            '/OC /ocgpage BDC\n' + page_contents.stream + 'EMC\n')
    else:
        original = page_contents.stream
        stream = original
        # Mark the template ocg separate from page ocg
        template_endpos = 0
        page_inatpos = 0
        # Finds only the first instance, which should be for the
        # template.
        findkey = '1 w 2 J 2 j []0 d\nq\n'
        findloc = original.find(findkey)
        if findloc < 0:
            # May be a vector, which we stick a marker in for.
            # ?? Why is this a half-point off ??
            # The marker is matched literally (the '.' is not a regex
            # wildcard).
            findkey = '799.500000 85 l\n'
            findloc = original.find(findkey)
        # NOTE(review): a marker at offset 0 is treated like "not
        # found" -- confirm that this is intended.
        if findloc > 0:
            template_endpos = findloc + len(findkey)
            # Add vector template OCG
            stream = ('/OC /ocgtemplate BDC\n'
                      + original[:template_endpos]
                      + 'EMC\n')
            page_inatpos = len(stream)
            stream += original[template_endpos:]
            page_contents.stream = stream

        # Add template ocg
        ocgtemplate = IndirectPdfDict(Type=PdfName('OCG'),
                                      Name='Template')
        ocgprop.OCGs.append(ocgtemplate)
        ocgorderinner.append(ocgtemplate)

        # If a template (which is SVG) has embedded PNG images, those
        # appear as XObjects. This will mess up the layer order, so we
        # will ignore them later.
        template_xobj_keys = re.findall(r'(\/Im[0-9]+)\s',
                                        stream[:template_endpos])

        # Page ocg: everything after the template part.
        stream = (stream[:page_inatpos]
                  + '/OC /ocgpage BDC\n'
                  + stream[page_inatpos:]
                  + 'EMC\n')
        page_contents.stream = stream

    # Find all other vector layers using the magic point
    # (DocumentPageLayer.render_to_painter()).
    # ?? Why is this a half-point off ??
    layer_marker = '420.500000 69 m\n'
    stream = page_contents.stream
    while True:
        marker_pos = stream.find(layer_marker)
        if marker_pos < 0:
            break
        layerid = 'ocglayer{}'.format(len(vector_layers) + 1)
        head = stream[:marker_pos]
        if vector_layers:
            # close previous layer
            head += 'EMC\n'
        stream = (head
                  + '/OC /{} BDC\n'.format(layerid)
                  + stream[marker_pos + len(layer_marker):])
        vector_layers.append(layerid)

    if vector_layers:
        # The last layer that was opened still has to be closed.
        stream += 'EMC\n'
        page_contents.stream = stream

    # Done--recompress the stream.
    compress.compress([page_contents])

    # There shouldn't be any Properties there since we generated the
    # rmpage ourselves, so don't bother checking.
    rmpage.Resources.Properties = PdfDict(ocgpage=ocgpage)
    if not uses_base_pdf:
        rmpage.Resources.Properties.ocgtemplate = ocgtemplate

    # Add individual OCG layers (Bitmap)
    was_vector = True
    for n, key in enumerate(rmpage.Resources.XObject):
        if str(key) in template_xobj_keys:
            continue
        was_vector = False
        l = n - len(template_xobj_keys)
        try:
            layer = annotations[i][l]
        except Exception:
            # This would indicate a bug in the handling of a notebook.
            log.error(
                'could not associate XObject with layer: (i, l) ({}, {})'.
                format(i, l))
            log.error(str(annotations))
            log.error('document: {} ()'.format('uuid', 'self.visible_name'))
            continue
        layername = layer[0]
        ocg = IndirectPdfDict(Type=PdfName('OCG'), Name=layername)
        ocgprop.OCGs.append(ocg)
        ocgorderinner.append(ocg)
        rmpage.Resources.XObject[key].OC = ocg

    # Add individual OCG layers (Vector)
    if was_vector:
        for l, layerid in enumerate(vector_layers):
            try:
                layer = annotations[i][l]
            except Exception:
                # This would indicate a bug in the handling of a
                # notebook.
                log.error(
                    'could not associate layerid with layer: (i, l, layerid) ({}, {}, {})'
                    .format(i, l, layerid))
                log.error(
                    'document: {} ()'.format('uuid', 'self.visible_name'))
                log.error(str(annotations))
                continue
            layername = layer[0]
            ocg = IndirectPdfDict(Type=PdfName('OCG'), Name=layername)
            ocgprop.OCGs.append(ocg)
            ocgorderinner.append(ocg)
            rmpage.Resources.Properties[PdfName(layerid)] = ocg

    # Add order of OCGs to primary document
    ocgprop.D.Order.append(ocgpage)
    ocgprop.D.Order.append(ocgorderinner)

    return ocgorderinner