def test_object_free():
    """Free object 1 in the second xref section, then reuse it as gen 1.

    The original generation-0 reference must resolve to a null object and
    be reported as freed in revision 1, while the generation-1 reference
    must resolve to the offset given in the third section.
    """
    initial_section = [
        b'0 3',
        b'0000000000 65535 f',
        b'0000000100 00000 n',
        b'0000000200 00000 n',
    ]
    freeing_section = [b'0 2', b'0000000000 65535 f', b'0000000000 00001 f']
    reuse_section = [b'0 2', b'0000000000 65535 f', b'0000000300 00001 n']

    dummy_pdf = fmt_dummy_xrefs(
        [initial_section, freeing_section, reuse_section]
    )
    reader = PdfFileReader(BytesIO(dummy_pdf))
    xref_cache = reader.xrefs

    assert xref_cache.xref_sections == 3
    # the gen-0 incarnation of object 1 is gone...
    assert xref_cache[generic.Reference(1, 0)] == generic.NullObject()
    assert generic.Reference(1, 0) in xref_cache.refs_freed_in_revision(1)
    # ...but the gen-1 incarnation points at the new offset
    assert xref_cache[generic.Reference(1, 1)] == 300
def test_object_free_no_override():
    """Free object 1, reuse it as gen 1, then free it again.

    After the final section neither generation may resolve to anything,
    and both freeings must be attributed to the revision that made them.
    """
    sections = [
        # revision 0: objects 1 and 2 in use
        [
            b'0 3',
            b'0000000000 65535 f',
            b'0000000100 00000 n',
            b'0000000200 00000 n',
        ],
        # revision 1: object 1 freed
        [b'0 2', b'0000000000 65535 f', b'0000000000 00001 f'],
        # revision 2: object 1 reintroduced with generation 1
        [b'0 2', b'0000000000 65535 f', b'0000000300 00001 n'],
        # revision 3: object 1 freed once more
        [b'0 2', b'0000000000 65535 f', b'0000000000 00002 f'],
    ]
    reader = PdfFileReader(BytesIO(fmt_dummy_xrefs(sections)))
    xref_cache = reader.xrefs

    assert xref_cache.xref_sections == 4
    assert xref_cache[generic.Reference(1, 0)] is None
    assert xref_cache[generic.Reference(1, 1)] is None
    assert generic.Reference(1, 0) in xref_cache.refs_freed_in_revision(1)
    assert generic.Reference(1, 1) in xref_cache.refs_freed_in_revision(3)
def test_deep_modify():
    """Updating the container of a deeply nested value marks object 3.

    The value is reached through several dictionary levels starting at
    object 3; its ``container_ref`` must still point to object 3, and
    ``update_container`` must register ``(0, 3)`` in the writer's objects.
    """
    writer = IncrementalPdfFileWriter(BytesIO(MINIMAL))
    top_ref = generic.Reference(3, 0, writer)

    # walk down the nested dictionaries one key at a time
    nested = writer.get_object(top_ref)
    for key in ('/Resources', '/Font', '/F1', '/Subtype'):
        nested = nested[key]

    assert nested.container_ref.idnum == top_ref.idnum
    writer.update_container(nested)
    assert (0, 3) in writer.objects
def test_refree_dead_object():
    """Re-freeing an object that started out "dead" counts as a freeing.

    Object 1 is listed as free with generation 0 in the first section,
    and freed again (next generation 1) in the second one.
    """
    # NOTE(review): another function with this exact name appears later in
    # the visible source; if both live in the same module, the later
    # definition shadows this one -- confirm.

    # I've seen the pattern below in Acrobat output.
    first_section = [
        b'0 3',
        b'0000000000 65535 f',
        b'0000000000 00000 f',
        b'0000000200 00000 n',
    ]
    second_section = [b'0 2', b'0000000000 65535 f', b'0000000000 00001 f']

    dummy_pdf = fmt_dummy_xrefs([first_section, second_section])
    reader = PdfFileReader(BytesIO(dummy_pdf))
    xref_cache = reader.xrefs

    assert xref_cache.total_revisions == 2
    assert generic.Reference(1, 0) not in xref_cache.refs_freed_in_revision(0)
    assert generic.Reference(1, 0) in xref_cache.refs_freed_in_revision(1)
    assert generic.Reference(1, 0) in xref_cache.explicit_refs_in_revision(1)
def test_broken_obj_stream_fallback(fname, obj_to_get, expect_null):
    """Non-strict reads of a broken file yield a null or a dictionary.

    :param fname:
        Name of the input file, relative to ``PDF_DATA_DIR``.
    :param obj_to_get:
        Object number to retrieve.
    :param expect_null:
        If true, the retrieved object must be a null object; otherwise it
        must be a dictionary.
    """
    # NOTE(review): the arguments are presumably supplied by a parametrize
    # decorator that is not visible here -- confirm.
    if expect_null:
        expected_type = generic.NullObject
    else:
        # we set up the tests to always point to dictionaries
        expected_type = generic.DictionaryObject

    pdf_path = os.path.join(PDF_DATA_DIR, fname)
    with open(pdf_path, 'rb') as inf:
        reader = PdfFileReader(inf, strict=False)
        obj = reader.get_object(generic.Reference(idnum=obj_to_get))
        assert isinstance(obj, expected_type)
def test_refree_dead_object():
    """A dead object that is re-freed and then reused as generation 1.

    The assertions check that neither of the first two revisions reports
    ``Reference(1, 0)`` as freed or explicitly touched, while the third
    revision explicitly introduces ``Reference(1, 1)``.
    """
    # I've seen the pattern below in Acrobat output.
    # (minus the second update)
    dead_on_arrival = [
        b'0 3',
        b'0000000000 65535 f',
        b'0000000000 00000 f',
        b'0000000200 00000 n',
    ]
    refree = [b'0 2', b'0000000000 65535 f', b'0000000000 00001 f']
    # reintroduce as gen 1
    reintroduce = [b'0 2', b'0000000000 65535 f', b'0000000300 00001 n']

    dummy_pdf = fmt_dummy_xrefs([dead_on_arrival, refree, reintroduce])
    reader = PdfFileReader(BytesIO(dummy_pdf))
    xref_cache = reader.xrefs

    assert xref_cache.xref_sections == 3
    assert generic.Reference(1, 0) not in xref_cache.refs_freed_in_revision(0)
    assert generic.Reference(1, 0) not in xref_cache.refs_freed_in_revision(1)
    assert generic.Reference(1, 0) \
        not in xref_cache.explicit_refs_in_revision(1)
    assert generic.Reference(1, 1) in xref_cache.explicit_refs_in_revision(2)
def _read_xref_stream_object(self):
    """
    Read a cross-reference stream object at the current position of
    ``self.stream``.

    :return:
        A tuple ``(ref, stream_obj)`` with the reference built from the
        object header and the stream object that was read.
    :raises AssertionError:
        If the ``/Type`` entry of the stream is not ``/XRef``.
    """
    in_stream = self.stream
    header = read_object_header(in_stream, strict=self.strict)
    idnum, generation = header
    ref = generic.Reference(idnum, generation, pdf=self.handler)
    stream_obj = generic.StreamObject.read_from_stream(in_stream, ref)
    # the stream is its own container
    stream_obj.container_ref = ref
    # NOTE(review): this check disappears when Python runs with -O;
    # consider whether an explicit exception would be more appropriate.
    assert stream_obj.raw_get("/Type") == "/XRef"
    return ref, stream_obj
def test_tagged_path_count():
    """Exactly three paths to object 7 remain in the reverse xref cache."""
    reader = PdfFileReader(BytesIO(MINIMAL_TWO_FIELDS_TAGGED))
    resolver = reader.get_historical_resolver(0)
    resolver._load_reverse_xref_cache()

    target = generic.Reference(7, 0, resolver)
    paths_to = resolver._indirect_object_access_cache[target]

    # The path simplifier should eliminate all (pseudo-)duplicates refs
    # except these three:
    #  - one from the AcroForm hierarchy
    #  - one from the pages tree (through /Annots)
    #  - one from the structure tree
    assert len(paths_to) == 3
def test_sign_reject_freed(forbid_freeing):
    """Check how the diff policy treats an object freed after signing.

    A signed document gets an extra, hand-written xref section that frees
    object 2. An ad-hoc rule whitelists that reference, so the outcome
    depends only on ``reject_object_freeing``.

    :param forbid_freeing:
        Passed as ``reject_object_freeing`` to the diff policy. When true,
        the expected modification level is ``OTHER``; otherwise it is
        ``LTA_UPDATES``.
    """
    writer = IncrementalPdfFileWriter(BytesIO(MINIMAL_ONE_FIELD))
    sig_meta = signers.PdfSignatureMetadata(field_name='Sig1')
    out = signers.sign_pdf(writer, signature_meta=sig_meta, signer=FROM_CA)

    # free the ref containing the /Info dictionary
    # since we don't have support for freeing objects in the writer (yet),
    # do it manually
    prev_startxref = PdfFileReader(out).last_startxref
    # seeking to the end tells us where the new xref section will start
    new_xref_offset = out.seek(0, os.SEEK_END)
    # NOTE the linked list offsets are dummied out, but our Xref parser
    #  doesn't care
    freeing_section = [
        b'xref',
        b'0 1',
        b'0000000000 65535 f ',
        b'2 1',
        b'0000000000 00001 f ',
        b'trailer<</Prev %d>>' % prev_startxref,
        b'startxref',
        b'%d' % new_xref_offset,
        b'%%EOF',
    ]
    out.write(b'\n'.join(freeing_section))

    reader = PdfFileReader(out)
    newest_revision = reader.xrefs.xref_sections - 1
    freed_ref = generic.Reference(2, 0)
    assert freed_ref in reader.xrefs.refs_freed_in_revision(newest_revision)
    sig = reader.embedded_signatures[0]
    assert sig.signed_revision == 2

    # make a dummy rule that whitelists our freed object ref
    class AdHocRule(QualifiedWhitelistRule):
        def apply_qualified(self, old: HistoricalResolver,
                            new: HistoricalResolver):
            update = ReferenceUpdate(
                freed_ref, paths_checked=RawPdfPath('/Root', '/Pages')
            )
            yield ModificationLevel.LTA_UPDATES, update

    # build the validation context first, then the policy (same order in
    # which the call arguments would otherwise be evaluated)
    v_context = SIMPLE_V_CONTEXT()
    policy = StandardDiffPolicy(
        DEFAULT_DIFF_POLICY.global_rules + [AdHocRule()],
        DEFAULT_DIFF_POLICY.form_rule,
        reject_object_freeing=forbid_freeing,
    )
    val_status = validate_pdf_signature(sig, v_context, diff_policy=policy)

    expected_level = (
        ModificationLevel.OTHER if forbid_freeing
        else ModificationLevel.LTA_UPDATES
    )
    assert val_status.modification_level == expected_level
def explicit_refs_in_revision(self, revision) -> Set[generic.Reference]:
    """
    Collect the references of every object that a given revision
    explicitly adds or overwrites, including those listed in the
    section's hybrid part when there is one.

    :param revision:
        A revision number. The oldest revision is zero.
    :return:
        A set of Reference objects.
    """
    xref_data = self._xref_sections[revision].xref_data
    raw_refs = list(xref_data.explicit_refs_in_revision)

    hybrid = xref_data.hybrid
    if hybrid is not None:
        # make sure we also account for refs in hybrid sections
        raw_refs.extend(hybrid.xref_data.explicit_refs_in_revision)

    return {generic.Reference(*ref, pdf=self.reader) for ref in raw_refs}
def refs_freed_in_revision(self, revision) -> Set[generic.Reference]:
    """
    Collect the references of every object that a given revision
    explicitly frees.

    Each entry in the section's ``freed`` mapping pairs an object number
    with a generation; the reference reported here uses that generation
    minus one.

    :param revision:
        A revision number. The oldest revision is zero.
    :return:
        A set of Reference objects.
    """
    freed_entries = self._xref_sections[revision].xref_data.freed
    freed_refs = set()
    for idnum, gen in freed_entries.items():
        if gen <= 0:
            # don't acknowledge "dead" objects as freeings
            continue
        freed_refs.add(generic.Reference(idnum, gen - 1, pdf=self.reader))
    return freed_refs
def _write_objects(self, stream, object_position_dict):
    """
    Write all registered objects to ``stream`` and record where they went.

    :param stream:
        Writable output stream; its current position is recorded for
        every object written directly.
    :param object_position_dict:
        Dictionary that is filled in place. Keys are
        ``(generation, idnum)`` tuples. The value is the stream offset for
        directly written objects, or ``(object stream idnum, index)`` for
        objects that live in an object stream.
    """
    # deal with objects in object streams first
    for obj_stream in self.object_streams:
        # first, register the object stream object
        #  (will get written later, since it ends up in self.objects)
        stream_ref = self.add_object(obj_stream.as_pdf_object())
        # loop over all objects in the stream, and prepare
        # the data to put in the XRef table
        members = obj_stream._obj_refs.items()
        for position, (member_idnum, _) in enumerate(members):
            object_position_dict[(0, member_idnum)] = (
                stream_ref.idnum, position
            )

    for key in sorted(self.objects.keys()):
        generation, idnum = key
        obj = self.objects[key]
        object_position_dict[key] = stream.tell()

        header = '%d %d obj\n' % (idnum, generation)
        stream.write(header.encode('ascii'))

        # The object referenced by self._encrypt is written without the
        # security handler (presumably the encryption dictionary itself,
        # which must stay readable -- confirm).
        use_handler = (
            self.security_handler is not None
            and idnum != self._encrypt.idnum
        )
        handler = self.security_handler if use_handler else None

        container_ref = generic.Reference(idnum, generation, self)
        obj.write_to_stream(stream, handler, container_ref)
        stream.write(b'\nendobj\n')
def test_xref_access_no_decrypt():
    """Fetching object 7 without transparent decryption yields no proxy."""
    reader = PdfFileReader(BytesIO(MINIMAL_AES256))
    # attempt to access xref stream, turn off transparent decryption
    xref_stream_ref = generic.Reference(7, 0)
    obj = reader.get_object(ref=xref_stream_ref, transparent_decrypt=False)
    assert not isinstance(obj, generic.DecryptedObjectProxy)
def test_broken_objstream(fname, err, obj_to_get):
    """Strict reads of a broken file must fail with a matching read error.

    :param fname:
        Name of the input file, relative to ``PDF_DATA_DIR``.
    :param err:
        Pattern that the error message must match.
    :param obj_to_get:
        Object number to retrieve.
    """
    pdf_path = os.path.join(PDF_DATA_DIR, fname)
    expected_failure = pytest.raises(misc.PdfReadError, match=err)
    # the error may surface while opening the file or while fetching the
    # object, so both steps run inside the raises-context
    with open(pdf_path, 'rb') as inf, expected_failure:
        reader = PdfFileReader(inf, strict=True)
        reader.get_object(generic.Reference(idnum=obj_to_get))
def object_streams_used_in(self, revision):
    """
    Collect references to the object streams used in a given revision.

    :param revision:
        Index into the list of xref sections.
    :return:
        A set of Reference objects, one per entry in the section's
        ``obj_streams_used`` collection.
    """
    xref_data = self._xref_sections[revision].xref_data
    used_streams = set()
    for objstm_id in xref_data.obj_streams_used:
        used_streams.add(generic.Reference(objstm_id, pdf=self.reader))
    return used_streams
def do_check():
    """Validate the first embedded signature of ``out``.

    ``out`` is taken from the enclosing scope. The modification level
    reported by validation must be ``OTHER``.
    """
    reader = PdfFileReader(out)
    # dump the data of object 2 as seen in revision 3 (debugging aid)
    obj_in_rev3 = reader.get_object(
        generic.Reference(2, 0, reader), revision=3
    )
    print(obj_in_rev3.data)

    first_sig = reader.embedded_signatures[0]
    status = validate_pdf_signature(first_sig)
    assert status.modification_level == ModificationLevel.OTHER