def calculate_flat_provenance_types(
    prov_doc: ProvDocument,
    to_level: int = 0,
    including_primitives_types: bool = True,
    counting_wdf_as_two: bool = False,
    ignored_types: Iterable[str] = ϕ,
) -> MultiLevelTypeDict:
    # flatten all the bundles, if any
    prov_doc = prov_doc.flattened()

    # initialise index structures
    level0_types = defaultdict(
        set)  # type: Dict[QualifiedName, Set[QualifiedName]]
    predecessors = defaultdict(
        set
    )  # type: Dict[QualifiedName, Set[Tuple[QualifiedName, QualifiedName]]]

    types_to_ignore: FrozenSet[str] = frozenset(ignored_types)

    # indexing node types and relations
    for rec in prov_doc.get_records():  # type: ProvRecord
        if rec.is_element():
            level0_types[rec.identifier] |= get_element_types(
                rec, including_primitives_types, types_to_ignore)
        elif rec.is_relation():
            rel_type = rec.get_type()
            attrs, values = zip(*rec.formal_attributes)
            # expecting a QualifiedName from the first argument of a relation
            predecessor, successor = values[:2]
            if predecessor is not None and successor is not None:
                predecessors[successor].add((rel_type, predecessor))

    # the type map for this graph
    fp_types = defaultdict(dict)  # type: MultiLevelTypeDict
    # converting type sets to FlatProvenanceType level 0
    fp_types[0] = {
        node: (frozenset(level0_types[node]), )
        for node in level0_types
    }
    # propagating level-0 types to the specified level
    for k in range(1, to_level + 1):
        # only propagating (k-1) types from nodes that have them
        for node, types in fp_types[k - 1].items():
            # propagating the types to the predecessors
            for rel_type, predecessor in predecessors[node]:
                k_type = types + (frozenset({rel_type}),
                                  )  # type: FlatProvenanceType
                if counting_wdf_as_two and (rel_type == PROV_DERIVATION):
                    k_p1_type = k_type + (frozenset({rel_type}),
                                          )  # type: FlatProvenanceType
                    fp_types[k + 1][predecessor] = (
                        join_flat_types(fp_types[k +
                                                 1][predecessor], k_p1_type)
                        if predecessor in fp_types[k + 1] else k_p1_type)
                else:
                    fp_types[k][predecessor] = (join_flat_types(
                        fp_types[k][predecessor], k_type) if predecessor
                                                in fp_types[k] else k_type)

    return fp_types
Example #2
0
    def declare_directory(self, value: CWLObjectType) -> ProvEntity:
        """Register any nested files/directories."""
        # FIXME: Calculate a hash-like identifier for directory
        # so we get same value if it's the same filenames/hashes
        # in a different location.
        # For now, mint a new UUID to identify this directory, but
        # attempt to keep it inside the value dictionary
        dir_id = cast(str, value.setdefault("@id", uuid.uuid4().urn))

        # New annotation file to keep the ORE Folder listing
        ore_doc_fn = dir_id.replace("urn:uuid:", "directory-") + ".ttl"
        dir_bundle = self.document.bundle(self.metadata_ns[ore_doc_fn])

        coll = self.document.entity(
            dir_id,
            [
                (PROV_TYPE, WFPROV["Artifact"]),
                (PROV_TYPE, PROV["Collection"]),
                (PROV_TYPE, PROV["Dictionary"]),
                (PROV_TYPE, RO["Folder"]),
            ],
        )
        # ORE description of ro:Folder, saved separately
        coll_b = dir_bundle.entity(
            dir_id,
            [(PROV_TYPE, RO["Folder"]), (PROV_TYPE, ORE["Aggregation"])],
        )
        self.document.mentionOf(dir_id + "#ore", dir_id, dir_bundle.identifier)

        # dir_manifest = dir_bundle.entity(
        #     dir_bundle.identifier, {PROV["type"]: ORE["ResourceMap"],
        #                             ORE["describes"]: coll_b.identifier})

        coll_attribs = [(ORE["isDescribedBy"], dir_bundle.identifier)]
        coll_b_attribs = []  # type: List[Tuple[Identifier, ProvEntity]]

        # FIXME: .listing might not be populated yet - hopefully
        # a later call to this method will sort that
        is_empty = True

        if "listing" not in value:
            get_listing(self.fsaccess, value)
        for entry in cast(MutableSequence[CWLObjectType],
                          value.get("listing", [])):
            is_empty = False
            # Declare child-artifacts
            entity = self.declare_artefact(entry)
            self.document.membership(coll, entity)
            # Membership relation aka our ORE Proxy
            m_id = uuid.uuid4().urn
            m_entity = self.document.entity(m_id)
            m_b = dir_bundle.entity(m_id)

            # PROV-O style Dictionary
            # https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition
            # ..as prov.py do not currently allow PROV-N extensions
            # like hadDictionaryMember(..)
            m_entity.add_asserted_type(PROV["KeyEntityPair"])

            m_entity.add_attributes({
                PROV["pairKey"]: entry["basename"],
                PROV["pairEntity"]: entity,
            })

            # As well as a being a
            # http://wf4ever.github.io/ro/2016-01-28/ro/#FolderEntry
            m_b.add_asserted_type(RO["FolderEntry"])
            m_b.add_asserted_type(ORE["Proxy"])
            m_b.add_attributes({
                RO["entryName"]: entry["basename"],
                ORE["proxyIn"]: coll,
                ORE["proxyFor"]: entity,
            })
            coll_attribs.append((PROV["hadDictionaryMember"], m_entity))
            coll_b_attribs.append((ORE["aggregates"], m_b))

        coll.add_attributes(coll_attribs)
        coll_b.add_attributes(coll_b_attribs)

        # Also Save ORE Folder as annotation metadata
        ore_doc = ProvDocument()
        ore_doc.add_namespace(ORE)
        ore_doc.add_namespace(RO)
        ore_doc.add_namespace(UUID)
        ore_doc.add_bundle(dir_bundle)
        ore_doc = ore_doc.flattened()
        ore_doc_path = str(PurePosixPath(METADATA, ore_doc_fn))
        with self.research_object.write_bag_file(
                ore_doc_path) as provenance_file:
            ore_doc.serialize(provenance_file,
                              format="rdf",
                              rdf_format="turtle")
        self.research_object.add_annotation(dir_id, [ore_doc_fn],
                                            ORE["isDescribedBy"].uri)

        if is_empty:
            # Empty directory
            coll.add_asserted_type(PROV["EmptyCollection"])
            coll.add_asserted_type(PROV["EmptyDictionary"])
        self.research_object.add_uri(coll.identifier.uri)
        return coll