def review_file(self, reader: PdfFileReader,
                base_revision: Union[int, HistoricalResolver],
                field_mdp_spec: Optional[FieldMDPSpec] = None,
                doc_mdp: Optional[MDPPerm] = None) \
        -> Union[DiffResult, SuspiciousModification]:
    """
    Implementation of :meth:`.DiffPolicy.review_file` that diffs every
    revision following the base revision against the base revision, one
    revision at a time.

    :param reader:
        The reader from which the historical resolvers for the later
        revisions are obtained.
    :param base_revision:
        Either the index of the base revision, or a historical resolver
        for it (the index is then read from its ``revision`` attribute).
    :param field_mdp_spec:
        Handed to :meth:`apply` unchanged.
    :param doc_mdp:
        Handed to :meth:`apply` unchanged.
    :return:
        A :class:`DiffResult` combining the highest modification level
        and the union of the changed form fields over all reviewed
        revisions. If :meth:`apply` raises
        :class:`SuspiciousModification`, that exception is logged and
        *returned* (not re-raised).
    """
    total_revisions = reader.xrefs.total_revisions

    # Normalise the two accepted forms of ``base_revision`` into a
    # (resolver, index) pair.
    if not isinstance(base_revision, int):
        old_resolver = base_revision
        base_revision = old_resolver.revision
    else:
        old_resolver = reader.get_historical_resolver(base_revision)

    highest_level = ModificationLevel.NONE
    touched_fields = set()

    # Why not ask for everything that changed between the signed revision
    # and the latest one in a single pass?
    #
    # Because an intermediate update can leave no trace in the latest
    # revision: objects it introduced may have been dropped again later.
    # A single consolidated check could then no longer decide whether
    # those (created and subsequently forgotten) objects were legitimate
    # changes.
    # (the test_pades_revinfo tests contain examples of this situation)
    #
    # Short of introducing a reference counter (which has performance
    # problems of its own that may or may not be worse), diffing each
    # intermediate revision separately is the pragmatic option.
    for revision in range(base_revision + 1, total_revisions):
        try:
            # The newer resolver is fetched inside the ``try`` so that a
            # SuspiciousModification raised while obtaining it is handled
            # the same way as one raised by the diff itself.
            newer_resolver = reader.get_historical_resolver(revision)
            outcome = self.apply(
                old=old_resolver, new=newer_resolver,
                field_mdp_spec=field_mdp_spec, doc_mdp=doc_mdp
            )
        except SuspiciousModification as e:
            logger.warning(
                f'Error in diff operation between revision {base_revision} '
                f'and {revision}',
                exc_info=e
            )
            return e

        # Keep the running maximum of the reported modification levels.
        level = outcome.modification_level
        if level > highest_level:
            highest_level = level
        touched_fields |= outcome.changed_form_fields

    return DiffResult(highest_level, touched_fields)
def test_tagged_path_count():
    """
    Check that exactly three access paths are recorded for object ``7 0``
    in revision 0 of the tagged two-field sample document.
    """
    reader = PdfFileReader(BytesIO(MINIMAL_TWO_FIELDS_TAGGED))
    first_revision = reader.get_historical_resolver(0)
    first_revision._load_reverse_xref_cache()

    # After the path simplifier has discarded all (pseudo-)duplicate
    # references, only three paths to the object should be left:
    #  - one via the AcroForm hierarchy
    #  - one via the pages tree (through /Annots)
    #  - one via the structure tree
    access_cache = first_revision._indirect_object_access_cache
    target_ref = generic.Reference(7, 0, first_revision)
    remaining_paths = access_cache[target_ref]
    assert len(remaining_paths) == 3