def apply(self, old: HistoricalResolver, new: HistoricalResolver) \
            -> Iterable[Reference]:
        """
        Clear the cross-reference stream reference(s) tied to the new
        revision, as long as they don't override anything in the old one.

        :param old:
            Resolver for the older revision; used to check that a reference
            was still available there.
        :param new:
            Resolver for the revision under scrutiny.
        :return:
            Yields a :class:`ReferenceUpdate` for the revision's own xref
            stream and, in nonstrict mode, for the xref stream of a hybrid
            section in the next revision.
        """
        xref_cache = new.reader.xrefs
        container_info = xref_cache.get_xref_container_info(new.revision)
        own_stream_ref = container_info.stream_ref
        if own_stream_ref is not None \
                and old.is_ref_available(own_stream_ref):
            yield ReferenceUpdate(own_stream_ref)

        # Hybrid-reference documents are banned in strict mode, so the
        # remainder of this check only matters for nonstrict readers.
        if new.reader.strict:
            return

        # When the revision after this one is a hybrid one, the reference to
        # its hybrid xref stream has to be cleared as well.
        try:
            following_rev_data = xref_cache.get_xref_data(new.revision + 1)
        except IndexError:
            # presumably there is no later revision to look at
            return

        hybrid_section = following_rev_data.hybrid
        if hybrid_section is None:
            return

        hybrid_stream_ref = hybrid_section.meta_info.stream_ref
        if old.is_ref_available(hybrid_stream_ref):
            yield ReferenceUpdate(hybrid_stream_ref)
Esempio n. 2
0
def _find_orphans(hist_rev: HistoricalResolver):
    """
    Determine which references introduced in a revision cannot be reached
    from the trailer or from older references rewritten in that revision.

    :param hist_rev:
        Resolver for the revision to inspect.
    :return:
        The set of newly introduced references for which no dependency path
        was found.
    """

    # Assumption: older revisions never refer to references that only get
    #  defined in a later revision.
    # TODO I might want to put a failsafe in the PdfFileReader class's
    #  dereferencing logic to prevent that.

    # Thanks to that assumption the search stays cheap: only the dependencies
    # of older objects overwritten in this very revision need to be pulled
    # up, and recursion is limited to branches that pass through new objects.

    refs_in_revision = hist_rev.explicit_refs_in_revision()

    prior = hist_rev.reader.get_historical_resolver(hist_rev.revision - 1)

    # Refs that already existed before and were rewritten in this revision;
    #  these serve as starting points for the reachability check.
    rewritten_refs = set()
    # Everything else is brand new, and hence a potential orphan.
    orphans = set()
    for ref in refs_in_revision:
        # "available" in the previous revision means the ref didn't exist yet
        bucket = orphans if prior.is_ref_available(ref) else rewritten_refs
        bucket.add(ref)

    def _roots() -> Iterator[PdfObject]:
        # the trailer is a starting point too
        yield hist_rev.trailer_view
        # resolve through hist_rev so that the historical value is returned
        yield from map(hist_rev, rewritten_refs)

    if orphans:
        for root in _roots():
            orphans -= hist_rev.collect_dependencies(
                root, since_revision=hist_rev.revision)
            if not orphans:
                # everything is accounted for; don't fetch further objects
                break
    return orphans
Esempio n. 3
0
def safe_whitelist(old: HistoricalResolver, old_ref, new_ref) \
        -> Generator[Reference, None, None]:
    """
    Decide whether an indirect reference in a PDF structure may be updated
    without clobbering an older object in a way that has consequences
    at the PDF syntax level.

    The checks performed are:

     - If an old reference is given, it must point to a non-stream object.
     - If the new reference equals the old one, the new reference must point
       to a non-stream object as well.
     - If the new reference differs from the old one, it must be available
       (i.e. not yet in use) in the old revision.

    This is written as a generator for syntactical convenience and
    integration with internal APIs; it yields at most one element.

    :param old:
        Resolver for the older revision.
    :param old_ref:
        The reference as it appeared in the older revision (may be falsy).
    :param new_ref:
        The reference as it appears in the newer revision.
    :raises SuspiciousModification:
        If the new reference differs from the old one while already being in
        use in the old revision.
    """

    if old_ref:
        assert_not_stream(old_ref.get_object())

    if old_ref == new_ref:
        # same object ID: only acceptable for non-stream objects
        assert_not_stream(new_ref.get_object())
    elif not old.is_ref_available(new_ref):
        raise SuspiciousModification(
            f"Update clobbers or reuses {new_ref} in an unexpected way.")

    yield new_ref
def _allow_appearance_update(old_field, new_field, old: HistoricalResolver,
                             new: HistoricalResolver) \
        -> Generator[Reference, None, None]:
    """
    Yield the references that may be touched when a field's /AP entry
    is updated.

    :param old_field:
        The field dictionary in the older revision.
    :param new_field:
        The field dictionary in the newer revision.
    :param old:
        Resolver for the older revision.
    :param new:
        Resolver for the newer revision.
    :raises SuspiciousModification:
        If the new /AP value is present but not a dictionary.
    """

    _, ap_dict = yield from compare_key_refs(
        '/AP', old, old_field, new_field)

    if ap_dict is None:
        return

    if not isinstance(ap_dict, generic.DictionaryObject):
        raise SuspiciousModification('AP entry should point to a dictionary')

    # Updates to an existing stream object are *never* whitelisted (too much
    # potential for abuse); the /N, /R and /D keys are therefore only
    # cleared through objects introduced after the old revision.
    # TODO this could be worked around with a reference counter for
    #  streams, in which case we could allow the stream to be overridden
    #  on the condition that it isn't used anywhere else.

    for ap_key in ('/N', '/R', '/D'):
        try:
            spec = ap_dict.raw_get(ap_key)
        except KeyError:
            # this appearance type isn't defined; nothing to clear
            pass
        else:
            yield from new.collect_dependencies(
                spec, since_revision=old.revision + 1)
def _validate_dss_substructure(old: HistoricalResolver,
                               new: HistoricalResolver, old_dict, new_dict,
                               der_stream_keys, is_vri, path: RawPdfPath):
    """
    Vet the entries of a DSS (sub)dictionary that hold DER stream references
    and yield the reference updates that are acceptable.

    :param old:
        Resolver for the older revision.
    :param new:
        Resolver for the newer revision.
    :param old_dict:
        The dictionary as it appears in the older revision.
    :param new_dict:
        The dictionary as it appears in the newer revision.
    :param der_stream_keys:
        The keys to inspect.
    :param is_vri:
        Passed through to the stream reference assertions.
    :param path:
        Path to the dictionary; the key is appended for each yielded update.
    """
    for key in der_stream_keys:
        to_update = ReferenceUpdate.curry_ref(paths_checked=path + key)
        try:
            new_value = new_dict.raw_get(key)
        except KeyError:
            continue
        _assert_stream_refs(key, new_value, SuspiciousModification, is_vri)
        if isinstance(new_value, generic.IndirectObject):
            new_ref = new_value.reference
            try:
                prior_value = old_dict.raw_get(key)
                if isinstance(prior_value, generic.IndirectObject):
                    cleared_refs = safe_whitelist(
                        old, prior_value.reference, new_ref)
                    for cleared in cleared_refs:
                        yield to_update(cleared)
                _assert_stream_refs(key, prior_value, misc.PdfReadError,
                                    is_vri)
                # The contents of the new array aren't compared against the
                # old one: deleting info is allowed by PAdES, and such a
                # check can get pretty expensive.
            except KeyError:
                pass

        new_deps = new.collect_dependencies(
            new_value, since_revision=old.revision + 1)
        for dep in new_deps:
            yield to_update(dep)
    def apply(self, old: HistoricalResolver, new: HistoricalResolver) \
            -> Iterable[ReferenceUpdate]:
        """
        Clear an update to the document catalog's /Metadata stream.

        /Metadata points to a stream, so object overrides have to be allowed
        with care. When ``check_xml_syntax`` is set, the change is only
        approved if the metadata consists of well-formed XML
        (note: this doesn't validate any XML schemata).

        :param old:
            Resolver for the older revision.
        :param new:
            Resolver for the newer revision.
        :return:
            Yields a single :class:`ReferenceUpdate` for the new metadata
            reference if the update is acceptable, and nothing otherwise.
        :raises SuspiciousModification:
            If /Metadata is present but is not an indirect reference.
        """

        def grab_metadata(root):
            # Returns the /Metadata reference, or None if there is no such
            # entry in the given catalog.
            try:
                return root.get_value_as_reference('/Metadata')
            except misc.IndirectObjectExpected as e:
                raise SuspiciousModification(
                    "/Metadata should be an indirect reference") from e
            except KeyError:
                return None

        new_metadata_ref = grab_metadata(new.root)
        if new_metadata_ref is None:
            return  # nothing to do

        if self.check_xml_syntax:
            MetadataUpdateRule.is_well_formed_xml(new_metadata_ref)

        old_metadata_ref = grab_metadata(old.root)

        # The old revision may not have had a /Metadata entry at all (i.e.
        # the metadata was newly added); in that case there is no old stream
        # to syntax-check, and we must not pass None along.
        if self.check_xml_syntax and old_metadata_ref is not None:
            MetadataUpdateRule.is_well_formed_xml(old_metadata_ref)

        # Overriding the existing metadata stream in place is only tolerated
        # when stream overrides haven't been disabled outright.
        same_ref_ok = (old_metadata_ref == new_metadata_ref
                       and not self.always_refuse_stream_override)
        if same_ref_ok or old.is_ref_available(new_metadata_ref):
            yield ReferenceUpdate(new_metadata_ref,
                                  paths_checked=RawPdfPath(
                                      '/Root', '/Metadata'))
Esempio n. 7
0
    def apply(self,
              old: HistoricalResolver,
              new: HistoricalResolver,
              field_mdp_spec: Optional[FieldMDPSpec] = None,
              doc_mdp: Optional[MDPPerm] = None) -> DiffResult:
        """
        Run the configured rules against two revisions and decide at which
        modification level the newer revision can be justified.

        Every reference that is explicitly present in the new revision must
        be explained by a global rule, the form rule or (if enabled) by being
        an orphan; otherwise the revision is rejected.

        :param old:
            Resolver for the older, base revision.
        :param new:
            Resolver for the newer revision to be vetted.
        :param field_mdp_spec:
            Optional field lock specification; updates to fields it reports
            as locked are rejected.
        :param doc_mdp:
            Optional DocMDP permission level. If set, form updates that are
            not valid when certifying are rejected.
        :return:
            A :class:`DiffResult` holding the modification level and the
            names of the form fields that were changed.
        :raises SuspiciousModification:
            If objects were freed while ``reject_object_freeing`` is set,
            if a locked field was updated, if an update is not allowed
            under a certification signature, or if some references in
            the new revision remain unexplained.
        """

        if doc_mdp == MDPPerm.ANNOTATE:
            logger.warning(
                "StandardDiffPolicy was not designed to support "
                "DocMDP level 3 (MDPPerm.ANNOTATE). Unexpected validation "
                "results may occur.")

        if self.reject_object_freeing:
            freed = new.refs_freed_in_revision()
            if freed:
                raise SuspiciousModification(
                    f"The refs {freed} were freed in the revision provided. "
                    "The configured difference analysis policy does not allow "
                    "object freeing.")
        # we need to verify that there are no xrefs in the revision's xref table
        # other than the ones we can justify.
        new_xrefs = new.explicit_refs_in_revision()

        # maps a modification level to the set of refs cleared at that level
        explained = defaultdict(set)

        # prepare LUT for refs that are used multiple times in the old revision
        # (this is a very expensive operation, since it reads all objects in
        #  the signed revision)
        def _init_multi_lut():
            old._load_reverse_xref_cache()
            for new_ref in new_xrefs:
                usages = old._get_usages_of_ref(new_ref)
                if usages:
                    # start at the lowest level; ingest_ref bumps it as needed
                    yield new_ref, (ModificationLevel.NONE, set(usages))

        # orphaned objects are cleared at LTA update level
        if self.ignore_orphaned_objects:
            for _ref in _find_orphans(new):
                explained[ModificationLevel.LTA_UPDATES].add(_ref)

        # This table records all the overridden refs that already existed
        # in the old revision, together with the different ways they can be
        # reached from the document trailer.
        # Unlike fresh refs, these need to be cleared together with the paths
        # through which they are accessed.
        old_usages_to_clear = dict(_init_multi_lut())

        def ingest_ref(_level: ModificationLevel, _upd: ReferenceUpdate):
            # Record an update reported by a rule. Refs that were in use in
            # the old revision are only marked as explained once all their
            # recorded usages have been cleared.
            ref = _upd.updated_ref
            try:
                current_max_level, usages = old_usages_to_clear[ref]
                if _upd.blanket_approve:
                    # approve all usages at once
                    usages = set()
                else:
                    # remove the paths that have just been cleared from
                    # the checklist
                    paths_checked = _upd.paths_checked or ()
                    if isinstance(paths_checked, RawPdfPath):
                        # single path
                        paths_checked = paths_checked,
                    usages.difference_update(paths_checked)
                # bump the modification level for this reference if necessary
                _level = max(current_max_level, _level)
                old_usages_to_clear[ref] = _level, usages
                if usages:
                    # not all paths/usages have been cleared, so we can't
                    # approve the reference yet
                    return
            except KeyError:
                # ref has no recorded usages in the old revision, so there
                # are no paths to clear
                pass
            explained[_level].add(ref)

        for rule in self.global_rules:
            for level, upd in rule.apply_qualified(old, new):
                ingest_ref(level, upd)

        changed_form_fields = set()

        if self.form_rule:
            form_changes = self.form_rule.apply(old, new)

            def is_locked(fq_name):
                return field_mdp_spec is not None \
                       and field_mdp_spec.is_locked(fq_name)

            for level, fu in form_changes:
                ingest_ref(level, fu)
                field_name = fu.field_name
                if field_name is not None and not fu.valid_when_locked:
                    if is_locked(field_name):
                        raise SuspiciousModification(
                            f"Update of {fu.updated_ref} is not allowed "
                            f"because the form field {field_name} is locked.")
                    changed_form_fields.add(fu.field_name)
                if doc_mdp is not None and not fu.valid_when_certifying:
                    raise SuspiciousModification(
                        f"Update of {fu.updated_ref} is only allowed "
                        f"after an approval signature, not a certification "
                        f"signature.")

        # Peel off the explained refs level by level; whatever is left over
        # after a given level needs at least the next level to be justified.
        unexplained_lta = new_xrefs - explained[ModificationLevel.LTA_UPDATES]
        unexplained_formfill = \
            unexplained_lta - explained[ModificationLevel.FORM_FILLING]
        unexplained_annot = \
            unexplained_formfill - explained[ModificationLevel.ANNOTATIONS]
        if unexplained_annot:
            # some refs can't be justified at any level: report and reject
            msg = misc.LazyJoin('\n', ('%s:%s...' %
                                       (repr(x), repr(x.get_object())[:300])
                                       for x in unexplained_annot))
            logger.debug("Unexplained xrefs in revision %d:\n%s", new.revision,
                         msg)
            unexplained_overrides = [
                f" - {repr(ref)} is also used at "
                f"{', '.join(str(p) for p in paths_remaining)} in the prior "
                f"revision."
                for ref, (_, paths_remaining) in old_usages_to_clear.items()
                if paths_remaining
            ]
            err_msg = (
                f"There are unexplained xrefs in revision {new.revision}: "
                f"{', '.join(repr(x) for x in unexplained_annot)}.")
            if unexplained_overrides:
                unchecked_paths_msg = (
                    f"Some objects from revision {old.revision} were replaced "
                    f"in revision {new.revision} without precise "
                    "justification:\n" + '\n'.join(unexplained_overrides))
                err_msg = "%s\n%s" % (err_msg, unchecked_paths_msg)
                logger.debug(unchecked_paths_msg)

            raise SuspiciousModification(err_msg)
        elif unexplained_formfill:
            level = ModificationLevel.ANNOTATIONS
        elif unexplained_lta:
            level = ModificationLevel.FORM_FILLING
        else:
            level = ModificationLevel.LTA_UPDATES

        return DiffResult(modification_level=level,
                          changed_form_fields=changed_form_fields)
def _walk_page_tree_annots(old_page_root, new_page_root, field_name_dict,
                           old: HistoricalResolver, valid_when_locked,
                           refs_seen):
    """
    Walk two versions of a page tree in lockstep and yield form updates for
    pages (and their /Annots arrays) that gained form field annotations.

    :param old_page_root:
        The /Pages node in the older revision.
    :param new_page_root:
        The corresponding /Pages node in the newer revision.
    :param field_name_dict:
        Mapping from annotation references to form field names; added
        annotations that don't occur as a key are rejected.
    :param old:
        Resolver for the older revision.
    :param valid_when_locked:
        Value used for ``valid_when_locked`` on the yielded updates, provided
        that a unique field name could be determined.
    :param refs_seen:
        References of the /Pages nodes already visited on the way down;
        used to detect cycles.
    :raises SuspiciousModification:
        If the page tree structure changed, if annotations were deleted, or
        if unrecognised annotations were added.
    :raises misc.PdfReadError:
        If the old page tree is malformed or contains a circular reference.
    """
    def _kid_refs(pages_node, exc_class):
        try:
            return _arr_to_refs(pages_node['/Kids'], exc_class)
        except KeyError:
            raise exc_class("No /Kids in /Pages entry")

    old_kids = _kid_refs(old_page_root, misc.PdfReadError)
    new_kids = _kid_refs(new_page_root, SuspiciousModification)

    # /Kids should only contain indirect refs, so comparing them directly is
    # appropriate (__eq__ ignores the attached PDF handler)
    if old_kids != new_kids:
        raise SuspiciousModification(
            "Unexpected change to page tree structure.")

    for new_kid_ref, old_kid_ref in zip(new_kids, old_kids):
        if old_kid_ref in refs_seen:
            raise misc.PdfReadError(
                "Circular reference in page tree during annotation analysis")
        new_kid = new_kid_ref.get_object()
        old_kid = old_kid_ref.get_object()
        try:
            node_type = old_kid['/Type']
        except (KeyError, TypeError) as e:  # pragma: nocover
            raise misc.PdfReadError from e

        if node_type == '/Pages':
            # intermediate node: descend
            yield from _walk_page_tree_annots(
                old_kid,
                new_kid,
                field_name_dict,
                old,
                valid_when_locked,
                refs_seen | {old_kid_ref},
            )
            continue
        if node_type != '/Page':
            continue

        try:
            new_annots, new_annots_ref = _extract_annots_from_page(
                new_kid, SuspiciousModification)
        except KeyError:
            # the new page has no annotations at all, so nothing was added
            continue
        try:
            old_annots, old_annots_ref = _extract_annots_from_page(
                old_kid, misc.PdfReadError)
        except KeyError:
            old_annots, old_annots_ref = set(), None

        # nothing to do unless the set of annotations changed
        if old_annots == new_annots:
            continue

        removed = old_annots - new_annots
        if removed:
            raise SuspiciousModification(
                f"Annotations {removed} were deleted.")
        added = new_annots - old_annots

        # Every added annotation must be tied to a known form field;
        # anything else is an unrelated annotation -> bail
        unrecognised = added - field_name_dict.keys()
        if unrecognised:
            raise SuspiciousModification(
                f"The newly added annotations {unrecognised} were not "
                "recognised.")

        # From here on: there are new annotations, and all of them are
        # expected => cleared to edit

        # With exactly one new annotation the field name can be attached to
        # the resulting FormUpdate objects; otherwise there's not much
        # that can be done.
        if len(added) == 1:
            (sole_annot_ref,) = added
            field_name = field_name_dict[sole_annot_ref]
        else:
            field_name = None
        lock_validity = valid_when_locked and field_name is not None

        # The page dictionaries must be identical apart from /Annots, so
        #  that the page can safely be cleared for modification across ALL
        #  paths (not necessary if both /Annots entries are indirect
        #  references, but adding even more cases is pushing things)
        compare_dicts(old_kid, new_kid, {'/Annots'})
        # Page objects are often referenced from all sorts of places in the
        # file, and attempting to check all possible paths would probably
        # create more problems than it solves.
        yield FormUpdate(updated_ref=new_kid_ref,
                         field_name=field_name,
                         valid_when_locked=lock_validity,
                         blanket_approve=True)

        if not new_annots_ref:
            # /Annots is a direct object; the page update covers it
            continue

        # /Annots is an indirect reference: gather the paths to this page
        #  and append /Annots to each
        #  (recall: old_kid_ref and new_kid_ref should be the same anyhow)
        paths_to_annots = {
            page_path + '/Annots'
            for page_path in old._get_usages_of_ref(old_kid_ref)
        }

        # If the references differ, then either the /Annots array got
        # reassigned to another object ID, or it was moved from a direct
        # object to an indirect one, or the /Annots entry was newly created.
        # This is all fine, provided that the new object ID doesn't clobber
        # an existing one.
        if old_annots_ref == new_annots_ref or \
                old.is_ref_available(new_annots_ref):
            yield FormUpdate(
                updated_ref=new_annots_ref,
                field_name=field_name,
                valid_when_locked=lock_validity,
                paths_checked=paths_to_annots)
Esempio n. 9
0
 def apply(self, old: HistoricalResolver, new: HistoricalResolver) \
         -> Iterable[Reference]:
     """
     Clear the new revision's xref container if it is a reference that
     was still available in the old revision.

     :param old:
         Resolver for the older revision.
     :param new:
         Resolver for the newer revision.
     """
     container_info = new.reader.xrefs.get_xref_container_info(new.revision)
     xref_start, _unused = container_info
     if not isinstance(xref_start, generic.Reference):
         # not a reference, so there is nothing to clear
         return
     if old.is_ref_available(xref_start):
         yield ReferenceUpdate(xref_start)
Esempio n. 10
0
 def apply(self, old: HistoricalResolver, new: HistoricalResolver) \
         -> Iterable[Reference]:
     """
     Clear the object streams used by the new revision, as long as they
     don't override anything from the old one.

     :param old:
         Resolver for the older revision.
     :param new:
         Resolver for the newer revision.
     """
     # Using object streams is fine, overriding existing ones is not: only
     # refs that were still available in the old revision get through.
     yield from (
         ReferenceUpdate(stream_ref)
         for stream_ref in new.object_streams_used()
         if old.is_ref_available(stream_ref)
     )
Esempio n. 11
0
    def apply(self, old: HistoricalResolver, new: HistoricalResolver)\
            -> Iterable[Tuple[ModificationLevel, FormUpdate]]:
        """
        Assess how the document's form changed from one revision to the next.

        :param old:
            The older, base revision.
        :param new:
            The newer revision to be vetted.
        :return:
            Yields pairs of a modification level and a form update.
        """

        acroform_path = RawPdfPath('/Root', '/AcroForm')
        acroform_refs = compare_key_refs('/AcroForm', old, old.root, new.root)
        acroform_update = FormUpdate.curry_ref(
            field_name=None, paths_checked=acroform_path
        )
        old_acroform, new_acroform = yield from qualify(
            ModificationLevel.LTA_UPDATES, acroform_refs,
            transform=acroform_update
        )

        # Start with the AcroForm entries other than the ignored ones
        compare_dicts(old_acroform, new_acroform, self.ignored_acroform_keys)
        assert isinstance(old_acroform, generic.DictionaryObject)
        assert isinstance(new_acroform, generic.DictionaryObject)

        # The /Fields ref is marked as OK if it's an indirect reference.
        # That's acceptable: the _list_fields logic checks that it really
        # contains stuff that looks like form fields, and other rules are
        # responsible for vetting the creation of other form fields anyway.
        fields_path = acroform_path + '/Fields'
        fields_refs = compare_key_refs(
            '/Fields', old, old_acroform, new_acroform
        )
        fields_update = FormUpdate.curry_ref(
            field_name=None, paths_checked=fields_path
        )
        old_fields, new_fields = yield from qualify(
            ModificationLevel.LTA_UPDATES, fields_refs,
            transform=fields_update
        )

        # The default resource dictionary needs attention as well, since
        # Acrobat / Adobe Reader sometimes mess with it
        dr_refs = compare_key_refs('/DR', old, old_acroform, new_acroform)
        dr_update = FormUpdate.curry_ref(
            field_name=None, paths_checked=acroform_path + '/DR'
        )
        old_dr, new_dr = yield from qualify(
            ModificationLevel.FORM_FILLING, dr_refs, transform=dr_update
        )
        if new_dr is not None:
            # also clear whatever the new /DR pulls in since the old revision
            dr_deps = new.collect_dependencies(
                new_dr, since_revision=old.revision + 1
            )
            yield from qualify(
                ModificationLevel.FORM_FILLING, misc._as_gen(dr_deps),
                transform=FormUpdate.curry_ref(field_name=None)
            )

        field_specs = dict(
            _list_fields(old_fields, new_fields, old_path=fields_path)
        )
        context = FieldComparisonContext(
            field_specs=field_specs, old=old, new=new
        )

        # hand the per-field comparison over to the individual field rules
        for field_rule in self.field_rules:
            yield from field_rule.apply(context)