Beispiel #1
0
def bundles2():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles2.provn
    #===========================================================================
    # document
    g = ProvDocument()

    #   prefix ex  <http://example.org/example/>
    g.add_namespace("ex", "http://www.example.com/")

    #   prefix alice  <http://example.org/alice/>
    #   prefix bob  <http://example.org/bob/>
    g.add_namespace('alice', 'http://example.org/alice/')
    g.add_namespace('bob', 'http://example.org/bob/')

    #   entity(bob:bundle4, [prov:type='prov:Bundle'])
    #   wasGeneratedBy(bob:bundle4, -, 2012-05-24T10:30:00)
    #   agent(ex:Bob)
    #   wasAttributedTo(bob:bundle4, ex:Bob)
    g.entity('bob:bundle4', {'prov:type': PROV['Bundle']})
    g.wasGeneratedBy('bob:bundle4', time='2012-05-24T10:30:00')
    g.agent('ex:Bob')
    g.wasAttributedTo('bob:bundle4', 'ex:Bob')

    #   entity(alice:bundle5, [ prov:type='prov:Bundle' ])
    #   wasGeneratedBy(alice:bundle5, -, 2012-05-25T11:15:00)
    #   agent(ex:Alice)
    #   wasAttributedTo(alice:bundle5, ex:Alice)
    g.entity('alice:bundle5', {'prov:type': PROV['Bundle']})
    g.wasGeneratedBy('alice:bundle5', time='2012-05-25T11:15:00')
    g.agent('ex:Alice')
    g.wasAttributedTo('alice:bundle5', 'ex:Alice')

    #   bundle bob:bundle4
    #     entity(ex:report1, [ prov:type="report", ex:version=1 ])
    #     wasGeneratedBy(ex:report1, -, 2012-05-24T10:00:01)
    #   endBundle
    b4 = g.bundle('bob:bundle4')
    b4.entity('ex:report1', {'prov:type': "report", 'ex:version': 1})
    b4.wasGeneratedBy('ex:report1', time='2012-05-24T10:00:01')

    #   bundle alice:bundle5
    #     entity(ex:report1bis)
    #     mentionOf(ex:report1bis, ex:report1, bob:bundle4)
    #     entity(ex:report2, [ prov:type="report", ex:version=2 ])
    #     wasGeneratedBy(ex:report2, -, 2012-05-25T11:00:01)
    #     wasDerivedFrom(ex:report2, ex:report1bis)
    #   endBundle
    b5 = g.bundle('alice:bundle5')
    b5.entity('ex:report1bis')
    b5.mentionOf('ex:report1bis', 'ex:report1', 'bob:bundle4')
    b5.entity('ex:report2', [('prov:type', "report"), ('ex:version', 2)])
    b5.wasGeneratedBy('ex:report2', time='2012-05-25T11:00:01')
    b5.wasDerivedFrom('ex:report2', 'ex:report1bis')

    # endDocument
    return g
Beispiel #2
0
def bundles2():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles2.provn
    #===========================================================================
    # document
    g = ProvDocument()

    #   prefix ex  <http://example.org/example/>
    g.add_namespace("ex", "http://www.example.com/")

    #   prefix alice  <http://example.org/alice/>
    #   prefix bob  <http://example.org/bob/>
    g.add_namespace('alice', 'http://example.org/alice/')
    g.add_namespace('bob', 'http://example.org/bob/')

    #   entity(bob:bundle4, [prov:type='prov:Bundle'])
    #   wasGeneratedBy(bob:bundle4, -, 2012-05-24T10:30:00)
    #   agent(ex:Bob)
    #   wasAttributedTo(bob:bundle4, ex:Bob)
    g.entity('bob:bundle4', {'prov:type': PROV['Bundle']})
    g.wasGeneratedBy('bob:bundle4', time='2012-05-24T10:30:00')
    g.agent('ex:Bob')
    g.wasAttributedTo('bob:bundle4', 'ex:Bob')

    #   entity(alice:bundle5, [ prov:type='prov:Bundle' ])
    #   wasGeneratedBy(alice:bundle5, -, 2012-05-25T11:15:00)
    #   agent(ex:Alice)
    #   wasAttributedTo(alice:bundle5, ex:Alice)
    g.entity('alice:bundle5', {'prov:type': PROV['Bundle']})
    g.wasGeneratedBy('alice:bundle5', time='2012-05-25T11:15:00')
    g.agent('ex:Alice')
    g.wasAttributedTo('alice:bundle5', 'ex:Alice')

    #   bundle bob:bundle4
    #     entity(ex:report1, [ prov:type="report", ex:version=1 ])
    #     wasGeneratedBy(ex:report1, -, 2012-05-24T10:00:01)
    #   endBundle
    b4 = g.bundle('bob:bundle4')
    b4.entity('ex:report1', {'prov:type': "report", 'ex:version': 1})
    b4.wasGeneratedBy('ex:report1', time='2012-05-24T10:00:01')

    #   bundle alice:bundle5
    #     entity(ex:report1bis)
    #     mentionOf(ex:report1bis, ex:report1, bob:bundle4)
    #     entity(ex:report2, [ prov:type="report", ex:version=2 ])
    #     wasGeneratedBy(ex:report2, -, 2012-05-25T11:00:01)
    #     wasDerivedFrom(ex:report2, ex:report1bis)
    #   endBundle
    b5 = g.bundle('alice:bundle5')
    b5.entity('ex:report1bis')
    b5.mentionOf('ex:report1bis', 'ex:report1', 'bob:bundle4')
    b5.entity('ex:report2', [('prov:type', "report"), ('ex:version', 2)])
    b5.wasGeneratedBy('ex:report2', time='2012-05-25T11:00:01')
    b5.wasDerivedFrom('ex:report2', 'ex:report1bis')

    # endDocument
    return g
def bundles2():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles2.provn
    # ===========================================================================
    # document
    g = ProvDocument()

    #   prefix ex  <http://example.org/example/>
    g.add_namespace("ex", "http://www.example.com/")

    #   prefix alice  <http://example.org/alice/>
    #   prefix bob  <http://example.org/bob/>
    g.add_namespace("alice", "http://example.org/alice/")
    g.add_namespace("bob", "http://example.org/bob/")

    #   entity(bob:bundle4, [prov:type='prov:Bundle'])
    #   wasGeneratedBy(bob:bundle4, -, 2012-05-24T10:30:00)
    #   agent(ex:Bob)
    #   wasAttributedTo(bob:bundle4, ex:Bob)
    g.entity("bob:bundle4", {"prov:type": PROV["Bundle"]})
    g.wasGeneratedBy("bob:bundle4", time="2012-05-24T10:30:00")
    g.agent("ex:Bob")
    g.wasAttributedTo("bob:bundle4", "ex:Bob")

    #   entity(alice:bundle5, [ prov:type='prov:Bundle' ])
    #   wasGeneratedBy(alice:bundle5, -, 2012-05-25T11:15:00)
    #   agent(ex:Alice)
    #   wasAttributedTo(alice:bundle5, ex:Alice)
    g.entity("alice:bundle5", {"prov:type": PROV["Bundle"]})
    g.wasGeneratedBy("alice:bundle5", time="2012-05-25T11:15:00")
    g.agent("ex:Alice")
    g.wasAttributedTo("alice:bundle5", "ex:Alice")

    #   bundle bob:bundle4
    #     entity(ex:report1, [ prov:type="report", ex:version=1 ])
    #     wasGeneratedBy(ex:report1, -, 2012-05-24T10:00:01)
    #   endBundle
    b4 = g.bundle("bob:bundle4")
    b4.entity("ex:report1", {"prov:type": "report", "ex:version": 1})
    b4.wasGeneratedBy("ex:report1", time="2012-05-24T10:00:01")

    #   bundle alice:bundle5
    #     entity(ex:report1bis)
    #     mentionOf(ex:report1bis, ex:report1, bob:bundle4)
    #     entity(ex:report2, [ prov:type="report", ex:version=2 ])
    #     wasGeneratedBy(ex:report2, -, 2012-05-25T11:00:01)
    #     wasDerivedFrom(ex:report2, ex:report1bis)
    #   endBundle
    b5 = g.bundle("alice:bundle5")
    b5.entity("ex:report1bis")
    b5.mentionOf("ex:report1bis", "ex:report1", "bob:bundle4")
    b5.entity("ex:report2", [("prov:type", "report"), ("ex:version", 2)])
    b5.wasGeneratedBy("ex:report2", time="2012-05-25T11:00:01")
    b5.wasDerivedFrom("ex:report2", "ex:report1bis")

    # endDocument
    return g
def bundles1():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles1.provn
    # ===============================================================================
    # document
    g = ProvDocument()

    #   prefix ex  <http://example.org/example/>
    EX = Namespace("ex", "http://www.example.com/")
    g.add_namespace(EX)

    #   prefix alice  <http://example.org/alice/>
    #   prefix bob  <http://example.org/bob/>
    g.add_namespace("alice", "http://example.org/alice/")
    g.add_namespace("bob", "http://example.org/bob/")

    #   entity(bob:bundle1, [prov:type='prov:Bundle'])
    g.entity("bob:bundle1", {"prov:type": PROV["Bundle"]})
    #   wasGeneratedBy(bob:bundle1, -, 2012-05-24T10:30:00)
    g.wasGeneratedBy("bob:bundle1", time="2012-05-24T10:30:00")
    #   agent(ex:Bob)
    g.agent("ex:Bob")
    #   wasAttributedTo(bob:bundle1, ex:Bob)
    g.wasAttributedTo("bob:bundle1", "ex:Bob")

    #   entity(alice:bundle2, [ prov:type='prov:Bundle' ])
    g.entity("alice:bundle2", {"prov:type": PROV["Bundle"]})
    #   wasGeneratedBy(alice:bundle2, -, 2012-05-25T11:15:00)
    g.wasGeneratedBy("alice:bundle2", time="2012-05-25T11:15:00")
    #   agent(ex:Alice)
    g.agent("ex:Alice")
    #   wasAttributedTo(alice:bundle2, ex:Alice)
    g.wasAttributedTo("alice:bundle2", "ex:Alice")

    #   bundle bob:bundle1
    b1 = g.bundle("bob:bundle1")
    #     entity(ex:report1, [ prov:type="report", ex:version=1 ])
    b1.entity("ex:report1", {"prov:type": "report", "ex:version": 1})
    #     wasGeneratedBy(ex:report1, -, 2012-05-24T10:00:01)
    b1.wasGeneratedBy("ex:report1", time="2012-05-24T10:00:01")
    #   endBundle

    #   bundle alice:bundle2
    b2 = g.bundle("alice:bundle2")
    #     entity(ex:report1)
    b2.entity("ex:report1")
    #     entity(ex:report2, [ prov:type="report", ex:version=2 ])
    b2.entity("ex:report2", {"prov:type": "report", "ex:version": 2})
    #     wasGeneratedBy(ex:report2, -, 2012-05-25T11:00:01)
    b2.wasGeneratedBy("ex:report2", time="2012-05-25T11:00:01")
    #     wasDerivedFrom(ex:report2, ex:report1)
    b2.wasDerivedFrom("ex:report2", "ex:report1")
    #   endBundle

    # endDocument
    return g
Beispiel #5
0
def bundles1():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles1.provn
    #===============================================================================
    # document
    g = ProvDocument()

    #   prefix ex  <http://example.org/example/>
    EX = Namespace("ex", "http://www.example.com/")
    g.add_namespace(EX)

    #   prefix alice  <http://example.org/alice/>
    #   prefix bob  <http://example.org/bob/>
    g.add_namespace('alice', 'http://example.org/alice/')
    g.add_namespace('bob', 'http://example.org/bob/')

    #   entity(bob:bundle1, [prov:type='prov:Bundle'])
    g.entity('bob:bundle1', {'prov:type': PROV['Bundle']})
    #   wasGeneratedBy(bob:bundle1, -, 2012-05-24T10:30:00)
    g.wasGeneratedBy('bob:bundle1', time='2012-05-24T10:30:00')
    #   agent(ex:Bob)
    g.agent('ex:Bob')
    #   wasAttributedTo(bob:bundle1, ex:Bob)
    g.wasAttributedTo('bob:bundle1', 'ex:Bob')

    #   entity(alice:bundle2, [ prov:type='prov:Bundle' ])
    g.entity('alice:bundle2', {'prov:type': PROV['Bundle']})
    #   wasGeneratedBy(alice:bundle2, -, 2012-05-25T11:15:00)
    g.wasGeneratedBy('alice:bundle2', time='2012-05-25T11:15:00')
    #   agent(ex:Alice)
    g.agent('ex:Alice')
    #   wasAttributedTo(alice:bundle2, ex:Alice)
    g.wasAttributedTo('alice:bundle2', 'ex:Alice')

    #   bundle bob:bundle1
    b1 = g.bundle('bob:bundle1')
    #     entity(ex:report1, [ prov:type="report", ex:version=1 ])
    b1.entity('ex:report1', {'prov:type': "report", 'ex:version': 1})
    #     wasGeneratedBy(ex:report1, -, 2012-05-24T10:00:01)
    b1.wasGeneratedBy('ex:report1', time='2012-05-24T10:00:01')
    #   endBundle

    #   bundle alice:bundle2
    b2 = g.bundle('alice:bundle2')
    #     entity(ex:report1)
    b2.entity('ex:report1')
    #     entity(ex:report2, [ prov:type="report", ex:version=2 ])
    b2.entity('ex:report2', {'prov:type': "report", 'ex:version': 2})
    #     wasGeneratedBy(ex:report2, -, 2012-05-25T11:00:01)
    b2.wasGeneratedBy('ex:report2', time='2012-05-25T11:00:01')
    #     wasDerivedFrom(ex:report2, ex:report1)
    b2.wasDerivedFrom('ex:report2', 'ex:report1')
    #   endBundle

    # endDocument
    return g
Beispiel #6
0
    def test_bundle_update_simple(self):
        doc = ProvDocument()
        doc.set_default_namespace(EX_URI)

        b1 = doc.bundle('b1')
        b1.entity('e')

        b2 = doc.bundle('b2')
        b2.entity('e')

        self.assertRaises(ProvException, lambda: b1.update(1))
        self.assertRaises(ProvException, lambda: b1.update(doc))

        b1.update(b2)
        self.assertEqual(len(b1.get_records()), 2)
    def test_bundle_update_simple(self):
        doc = ProvDocument()
        doc.set_default_namespace(EX_URI)

        b1 = doc.bundle('b1')
        b1.entity('e')

        b2 = doc.bundle('b2')
        b2.entity('e')

        self.assertRaises(ProvException, lambda: b1.update(1))
        self.assertRaises(ProvException, lambda: b1.update(doc))

        b1.update(b2)
        self.assertEqual(len(b1.get_records()), 2)
 def test_default_namespace_inheritance(self):
     prov_doc = ProvDocument()
     prov_doc.set_default_namespace("http://www.example.org/")
     bundle = prov_doc.bundle("bundle")
     e1 = bundle.entity("e1")
     self.assertIsNotNone(e1.identifier, "e1's identifier is None!")
     self.do_tests(prov_doc)
Beispiel #9
0
 def test_namespace_inheritance(self):
     prov_doc = ProvDocument()
     prov_doc.add_namespace('ex', 'http://www.example.org/')
     bundle = prov_doc.bundle('ex:bundle')
     e1 = bundle.entity('ex:e1')
     self.assertIsNotNone(e1.identifier, "e1's identifier is None!")
     self.do_tests(prov_doc)
Beispiel #10
0
    def get_document_as_prov(self, document_id=None):
        """
        Get a ProvDocument from the database based on the document_id
        :param document_id: The id as a sting value
        :return: ProvDocument
        """
        if type(document_id) is not str:
            raise InvalidArgumentTypeException()

        raw_doc = self._adapter.get_document(document_id)

        # parse document
        prov_document = ProvDocument()
        for record in raw_doc.document.records:
            self._parse_record(prov_document, record)

        for bundle in raw_doc.bundles:
            prefixed_identifier = bundle.bundle_record.metadata[
                METADATA_KEY_IDENTIFIER]
            # remove prefix
            identifier = prefixed_identifier[
                len(PROV_API_BUNDLE_IDENTIFIER_PREFIX) - 2:]
            prov_bundle = prov_document.bundle(identifier=identifier)

            for record in bundle.records:
                self._parse_record(prov_bundle, record)
        return prov_document
 def test_namespace_inheritance(self):
     prov_doc = ProvDocument()
     prov_doc.add_namespace('ex', 'http://www.example.org/')
     bundle = prov_doc.bundle('ex:bundle')
     e1 = bundle.entity('ex:e1')
     self.assertIsNotNone(e1.identifier, "e1's identifier is None!")
     self.do_tests(prov_doc)
Beispiel #12
0
 def test_default_namespace_inheritance(self):
     prov_doc = ProvDocument()
     prov_doc.set_default_namespace('http://www.example.org/')
     bundle = prov_doc.bundle('bundle')
     e1 = bundle.entity('e1')
     self.assertIsNotNone(e1.identifier, "e1's identifier is None!")
     self.assertRoundTripEquivalence(prov_doc)
def document_with_n_bundles_having_default_namespace(n):
    prov_doc = ProvDocument()
    prov_doc.add_namespace("ex", "http://www.example.org/")
    for i in range(n):
        x = str(i + 1)
        bundle = prov_doc.bundle("ex:bundle/" + x)
        bundle.set_default_namespace("http://www.example.org/default/" + x)
        bundle.entity("e")
    return prov_doc
Beispiel #14
0
def document_with_n_bundles_having_default_namespace(n):
    prov_doc = ProvDocument()
    prov_doc.add_namespace('ex', 'http://www.example.org/')
    for i in range(n):
        x = str(i + 1)
        bundle = prov_doc.bundle('ex:bundle/' + x)
        bundle.set_default_namespace('http://www.example.org/default/' + x)
        bundle.entity('e')
    return prov_doc
def document_with_n_bundles_having_default_namespace(n):
    prov_doc = ProvDocument()
    prov_doc.add_namespace('ex', 'http://www.example.org/')
    for i in range(n):
        x = str(i + 1)
        bundle = prov_doc.bundle('ex:bundle/' + x)
        bundle.set_default_namespace('http://www.example.org/default/' + x)
        bundle.entity('e')
    return prov_doc
    def test_document_update_simple(self):
        d1 = ProvDocument()
        d1.set_default_namespace(EX_URI)
        d1.entity('e')

        b1 = d1.bundle('b1')
        b1.entity('e')

        d2 = ProvDocument()
        d2.set_default_namespace(EX_URI)
        d2.entity('e')

        b1 = d2.bundle('b1')
        b1.entity('e')
        b2 = d2.bundle('b2')
        b2.entity('e')

        self.assertRaises(ProvException, lambda: d1.update(1))

        d1.update(d2)
        self.assertEqual(len(d1.get_records()), 2)
        self.assertEqual(len(d1.bundles), 2)
Beispiel #17
0
    def test_document_update_simple(self):
        d1 = ProvDocument()
        d1.set_default_namespace(EX_URI)
        d1.entity('e')

        b1 = d1.bundle('b1')
        b1.entity('e')

        d2 = ProvDocument()
        d2.set_default_namespace(EX_URI)
        d2.entity('e')

        b1 = d2.bundle('b1')
        b1.entity('e')
        b2 = d2.bundle('b2')
        b2.entity('e')

        self.assertRaises(ProvException, lambda: d1.update(1))

        d1.update(d2)
        self.assertEqual(len(d1.get_records()), 2)
        self.assertEqual(len(d1.bundles), 2)
Beispiel #18
0
def toW3Cprov(ling, bundl, format='w3c-prov-xml'):

    g = ProvDocument()
    vc = Namespace(
        "knmi", "http://knmi.nl"
    )  # namespaces do not need to be explicitly added to a document
    con = Namespace("dfp", "http://dispel4py.org")
    g.add_namespace("dcterms", "http://purl.org/dc/terms/")

    'specify bundle'
    bundle = None
    for trace in bundl:
        'specifing user'
        ag = g.agent(
            vc[trace["username"]],
            other_attributes={"dcterms:author": trace["username"]}
        )  # first time the ex namespace was used, it is added to the document automatically

        if trace['type'] == 'workflow_run':

            trace.update({'runId': trace['_id']})
            bundle = g.bundle(vc[trace["runId"]])
            bundle.actedOnBehalfOf(vc[trace["runId"]], vc[trace["username"]])

            dic = {}
            i = 0

            for key in trace:

                if key != "input":
                    if ':' in key:
                        dic.update({key: trace[key]})
                    else:
                        dic.update({vc[key]: trace[key]})

            dic.update({'prov:type': PROV['Bundle']})
            g.entity(vc[trace["runId"]], dic)

            dic = {}
            i = 0
            if type(trace['input']) != list:
                trace['input'] = [trace['input']]
            for y in trace['input']:
                for key in y:
                    if ':' in key:
                        dic.update({key: y[key]})
                    else:
                        dic.update({vc[key]: y[key]})
                dic.update({'prov:type': 'worklfow_input'})
                bundle.entity(vc[trace["_id"] + "_" + str(i)], dic)
                bundle.used(vc[trace["_id"]],
                            vc[trace["_id"] + "_" + str(i)],
                            identifier=vc["used_" + trace["_id"] + "_" +
                                          str(i)])
                i = i + 1

    'specify lineage'
    for trace in ling:

        #pprint(trace)

        try:
            bundle = g.bundle(vc[trace["runId"]])
            bundle.wasAttributedTo(vc[trace["runId"]],
                                   vc["ag_" + trace["username"]],
                                   identifier=vc["attr_" + trace["runId"]])

        except:
            pass
        'specifing creator of the activity (to be collected from the registy)'

        if 'creator' in trace:
            bundle.agent(
                vc["ag_" + trace["creator"]],
                other_attributes={"dcterms:creator": trace["creator"]}
            )  # first time the ex namespace was used, it is added to the document automatically
            bundle.wasAssociatedWith('process_' + trace["iterationId"],
                                     vc["ag_" + trace["creator"]])
            bundle.wasAttributedTo(vc[trace["runId"]],
                                   vc["ag_" + trace["creator"]])

        'adding activity information for lineage'
        dic = {}
        for key in trace:

            if type(trace[key]) != list:
                if ':' in key:
                    dic.update({key: trace[key]})
                else:

                    if key == 'location':

                        dic.update({"prov:location": trace[key]})
                    else:
                        dic.update({vc[key]: trace[key]})
        bundle.activity(vc["process_" + trace["iterationId"]],
                        trace["startTime"], trace["endTime"],
                        dic.update({'prov:type': trace["name"]}))

        'adding parameters to the document as input entities'
        dic = {}
        for x in trace["parameters"]:
            #print x
            if ':' in x["key"]:
                dic.update({x["key"]: x["val"]})
            else:
                dic.update({vc[x["key"]]: x["val"]})

        dic.update({'prov:type': 'parameters'})
        bundle.entity(vc["parameters_" + trace["instanceId"]], dic)
        bundle.used(vc['process_' + trace["iterationId"]],
                    vc["parameters_" + trace["instanceId"]],
                    identifier=vc["used_" + trace["iterationId"]])

        'adding input dependencies to the document as input entities'
        dic = {}

        for x in trace["derivationIds"]:
            'state could be added'
            #dic.update({'prov:type':'parameters'})
            bundle.used(vc['process_' + trace["iterationId"]],
                        vc[x["DerivedFromDatasetID"]],
                        identifier=vc["used_" + x["DerivedFromDatasetID"]])

        'adding entities to the document as output metadata'
        for x in trace["streams"]:
            i = 0
            parent_dic = {}
            for key in x:
                if key == 'con:immediateAccess':

                    parent_dic.update({vc['immediateAccess']: x[key]})

                elif key == 'location':

                    parent_dic.update({"prov:location": str(x[key])})
                else:
                    parent_dic.update({vc[key]: str(x[key])})

            c1 = bundle.collection(vc[x["id"]], other_attributes=parent_dic)
            bundle.wasGeneratedBy(vc[x["id"]],
                                  vc["process_" + trace["iterationId"]],
                                  identifier=vc["wgb_" + x["id"]])

            for d in trace['derivationIds']:
                bundle.wasDerivedFrom(vc[x["id"]],
                                      vc[d['DerivedFromDatasetID']],
                                      identifier=vc["wdf_" + x["id"]])

            for y in x["content"]:

                dic = {}

                if isinstance(y, dict):
                    val = None
                    for key in y:

                        try:
                            val = num(y[key])

                        except Exception, e:
                            val = str(y[key])

                        if ':' in key:
                            dic.update({key: val})
                        else:
                            dic.update({vc[key]: val})
                else:
                    dic = {vc['text']: y}

                dic.update({"verce:parent_entity": vc["data_" + x["id"]]})

                print x["id"]
                print str(i)
                print dic

                e1 = bundle.entity(vc["data_" + x["id"] + "_" + str(i)], dic)

                bundle.hadMember(c1, e1)
                bundle.wasGeneratedBy(vc["data_" + x["id"] + "_" + str(i)],
                                      vc["process_" + trace["iterationId"]],
                                      identifier=vc["wgb_" + x["id"] + "_" +
                                                    str(i)])

                for d in trace['derivationIds']:
                    bundle.wasDerivedFrom(
                        vc["data_" + x["id"] + "_" + str(i)],
                        vc[d['DerivedFromDatasetID']],
                        identifier=vc["wdf_" + "data_" + x["id"] + "_" +
                                      str(i)])

                i = i + 1
Beispiel #19
0
def toW3Cprov(ling,bundl,format='w3c-prov-xml'):
        
        g = ProvDocument()
        vc = Namespace("knmi", "http://knmi.nl")  # namespaces do not need to be explicitly added to a document
        con = Namespace("dfp", "http://dispel4py.org")
        g.add_namespace("dcterms", "http://purl.org/dc/terms/")
        
        'specify bundle'
        bundle=None
        for trace in bundl:
            'specifing user'
            ag=g.agent(vc[trace["username"]],other_attributes={"dcterms:author":trace["username"]})  # first time the ex namespace was used, it is added to the document automatically
            
            if trace['type']=='workflow_run':
                
                trace.update({'runId':trace['_id']})
                bundle=g.bundle(vc[trace["runId"]])
                bundle.actedOnBehalfOf(vc[trace["runId"]], vc[trace["username"]])
                
                dic={}
                i=0
                
                for key in trace:
                    
                
                    if key != "input": 
                        if ':' in key:
                            dic.update({key: trace[key]})
                        else:
                            dic.update({vc[key]: trace[key]})
                    
            
                dic.update({'prov:type': PROV['Bundle']})
                g.entity(vc[trace["runId"]], dic)
                
                dic={}
                i=0
                if type(trace['input'])!=list:
                    trace['input']=[trace['input']]
                for y in trace['input']:
                    for key in y:
                        if ':' in key:
                            dic.update({key: y[key]})
                        else:
                            dic.update({vc[key]: y[key]})
                    dic.update({'prov:type': 'worklfow_input'})
                    bundle.entity(vc[trace["_id"]+"_"+str(i)], dic)
                    bundle.used(vc[trace["_id"]], vc[trace["_id"]+"_"+str(i)], identifier=vc["used_"+trace["_id"]+"_"+str(i)])
                    i=i+1
                    
                    
        'specify lineage'
        for trace in ling:

            #pprint(trace)

            try:
                bundle=g.bundle(vc[trace["runId"]])
                bundle.wasAttributedTo(vc[trace["runId"]], vc["ag_"+trace["username"]],identifier=vc["attr_"+trace["runId"]])
            
            except:
                pass
            'specifing creator of the activity (to be collected from the registy)'
        
            if 'creator' in trace:
                bundle.agent(vc["ag_"+trace["creator"]],other_attributes={"dcterms:creator":trace["creator"]})  # first time the ex namespace was used, it is added to the document automatically
                bundle.wasAssociatedWith('process_'+trace["iterationId"],vc["ag_"+trace["creator"]])
                bundle.wasAttributedTo(vc[trace["runId"]], vc["ag_"+trace["creator"]])
    
            'adding activity information for lineage'
            dic={}
            for key in trace:
                
                if type(trace[key])!=list:
                    if ':' in key:
                        dic.update({key: trace[key]})
                    else:
                        
                        if key=='location':
                            
                            dic.update({"prov:location": trace[key]})    
                        else:
                            dic.update({vc[key]: trace[key]})
            bundle.activity(vc["process_"+trace["iterationId"]], trace["startTime"], trace["endTime"], dic.update({'prov:type': trace["name"]}))
        
            'adding parameters to the document as input entities'
            dic={}
            for x in trace["parameters"]:
                #print x
                if ':' in x["key"]:
                    dic.update({x["key"]: x["val"]})
                else:
                    dic.update({vc[x["key"]]: x["val"]})
                
            dic.update({'prov:type':'parameters'})        
            bundle.entity(vc["parameters_"+trace["instanceId"]], dic)
            bundle.used(vc['process_'+trace["iterationId"]], vc["parameters_"+trace["instanceId"]], identifier=vc["used_"+trace["iterationId"]])

            'adding input dependencies to the document as input entities'
            dic={}
        
            for x in trace["derivationIds"]:
                'state could be added'   
            #dic.update({'prov:type':'parameters'})        
                bundle.used(vc['process_'+trace["iterationId"]], vc[x["DerivedFromDatasetID"]], identifier=vc["used_"+x["DerivedFromDatasetID"]])


            'adding entities to the document as output metadata'
            for x in trace["streams"]:
                i=0
                parent_dic={}
                for key in x:
                        if key=='con:immediateAccess':
                            
                            parent_dic.update({vc['immediateAccess']: x[key]}) 
                            
                    
                        elif key=='location':
                             
                            parent_dic.update({"prov:location": str(x[key])})    
                        else:
                            parent_dic.update({vc[key]: str(x[key])})    
            
            
                c1=bundle.collection(vc[x["id"]],other_attributes=parent_dic)
                bundle.wasGeneratedBy(vc[x["id"]], vc["process_"+trace["iterationId"]], identifier=vc["wgb_"+x["id"]])
            
                for d in trace['derivationIds']:
                      bundle.wasDerivedFrom(vc[x["id"]], vc[d['DerivedFromDatasetID']],identifier=vc["wdf_"+x["id"]])
        
                for y in x["content"]:
                
                    dic={}
                
                    if isinstance(y, dict):
                        val=None
                        for key in y:
                        
                            try: 
                                val =num(y[key])
                                
                            except Exception,e:
                                val =str(y[key])
                            
                            if ':' in key:
                                dic.update({key: val})
                            else:
                                dic.update({vc[key]: val})
                    else:
                        dic={vc['text']:y}
                
                 
                    dic.update({"verce:parent_entity": vc["data_"+x["id"]]})
                    
                    print  x["id"]
                    print  str(i)
                    print  dic

                    e1=bundle.entity(vc["data_"+x["id"]+"_"+str(i)], dic)
                
                    bundle.hadMember(c1, e1)
                    bundle.wasGeneratedBy(vc["data_"+x["id"]+"_"+str(i)], vc["process_"+trace["iterationId"]], identifier=vc["wgb_"+x["id"]+"_"+str(i)])
                
                    for d in trace['derivationIds']:
                        bundle.wasDerivedFrom(vc["data_"+x["id"]+"_"+str(i)], vc[d['DerivedFromDatasetID']],identifier=vc["wdf_"+"data_"+x["id"]+"_"+str(i)])
        
                    i=i+1
Beispiel #20
0
class BioProvDocument:
    """
    Class containing base provenance information for a Project.
    """
    def __init__(
        self,
        project,
        add_attributes=False,
        add_users=True,
        _add_project_namespaces=True,
        _iter_samples=True,
        _iter_project=True,
    ):
        """
        Constructs the W3C-PROV document for a project.

        :param Project project: instance of bioprov.src.Project.
        :param bool add_attributes: whether to add object attributes.
        :param bool add_users: whether to add users and environments.
        :param bool _add_project_namespaces:
        :param bool _iter_samples:
        :param bool _iter_project:
        """

        # Assert Project is good before constructing instance
        assert isinstance(project,
                          Project), Warnings()["incorrect_type"](project,
                                                                 Project)
        self.ProvDocument = ProvDocument()
        self.project = project
        self.project.document = self.ProvDocument
        self._dot = prov_to_dot(self.ProvDocument)
        self._provn = self.ProvDocument.get_provn()
        self._entities = dict()
        self._activities = dict()
        self._agents = dict()
        self._user_bundles = dict()
        self._provstore_document = None

        # Don't add attributes if you plan on exporting to graphic format
        self.add_attributes = add_attributes

        # Set this before running Namespaces
        if add_users:
            self._create_envs_and_users = True

        else:
            self._create_envs_and_users = False

        # Default actions to create the document
        if _add_project_namespaces:
            self._add_project_namespaces()

        if self._create_envs_and_users:
            self._iter_envs_and_users()

        if _iter_project:
            self._iter_project()

        if _iter_samples:
            self._iter_samples()

    def __repr__(self):
        return "BioProvDocument describing Project '{}' with {} samples.".format(
            self.project.tag, len(self.project))

    @property
    def dot(self):
        self._dot = prov_to_dot(self.ProvDocument)
        return self._dot

    @dot.setter
    def dot(self, value):
        self._dot = value

    @property
    def provn(self):
        self._provn = self.ProvDocument.get_provn()
        return self._provn

    @provn.setter
    def provn(self, value):
        self._provn = value

    @property
    def provstore_document(self):
        self._provstore_document = self.ProvDocument
        return self._provstore_document

    @provstore_document.setter
    def provstore_document(self, value):
        self._provstore_document = value

    def _add_project_namespaces(self):
        """
        Runs the three _add_namespace functions.
        :return:
        """
        self._add_project_namespace()
        if self._create_envs_and_users:
            self._add_env_and_user_namespace()
        self._add_samples_namespace()
        self._add_activities_namespace()

    def _add_project_namespace(self):
        """
        Creates the Project Namespace and Project Entity.
        # Sets the default Namespace of the BioProvDocument as the Project.

        :return: updates self.project and self.ProvDocument.
        """
        self.ProvDocument.add_namespace("project", str(self.project))

    def _add_env_and_user_namespace(self):
        self.ProvDocument.add_namespace(
            "users",
            f"Users associated with BioProv Project '{self.project.tag}'")

    def _add_samples_namespace(self):
        self.ProvDocument.add_namespace(
            "samples",
            f"Samples associated with bioprov Project '{self.project.tag}'",
        )

    def _add_files_namespace(self):
        self.ProvDocument.add_namespace(
            "files",
            f"Files associated with bioprov Project '{self.project.tag}'")

    def _iter_project(self):
        self._create_sample_bundle(self.project, kind="Project")
        self._create_sample_file_entities(self.project, kind="Project")
        self._create_program_entities(self.project, kind="Project")

    def _iter_envs_and_users(self):
        for _user, _env_dict in self.project.users.items():
            _user_preffix = f"users:{_user}"
            _user_bundle = self._user_bundles[
                _user] = self.ProvDocument.bundle(_user_preffix)
            _user_bundle.set_default_namespace(_user)
            _user_bundle.add_namespace(
                "envs", f"Environments associated with User '{_user}'")
            self._agents[_user] = _user_bundle.agent(_user_preffix)

    def _iter_samples(self):
        for _, sample in self.project.samples.items():
            for statement in (
                    self._create_sample_bundle(sample),
                    self._create_sample_file_entities(sample),
                    self._create_program_entities(sample),
            ):
                try:
                    statement
                except KeyError:
                    config.logger.debug(
                        f"Could not run function '{statement.__name__}' for sample {sample.name}."
                    )
                    pass

    def _create_sample_bundle(self, object_, kind="Sample"):
        """
        Creates a ProvBundle for the Sample and associates it to self.ProvDocument.

        :param object_: instance of bioprov.Sample
        :return: updates self.ProvDocument by creating PROV objects for the sample.
        """
        choices = ("Sample", "Project")
        assert kind in choices, Warnings()["choices"](kind, choices, "kind")
        # Sample PROV attributes: bundle, namespace, entity
        object_.ProvBundle = self.ProvDocument.bundle(
            object_.namespace_preffix)
        object_.ProvBundle.set_default_namespace(object_.name)
        self._entities[
            object_.name] = object_.entity = object_.ProvBundle.entity(
                object_.namespace_preffix)
        if kind == "Sample":
            object_.ProvBundle.wasDerivedFrom(self._entities[object_.name],
                                              self.project.entity)

    def _create_sample_file_entities(self, sample, kind="Sample"):
        """
        Creates a ProvBundle for the Sample and associates it to self.ProvDocument.

        :param sample: instance of bioprov.Sample
        :return: updates the sample.ProvBundle by creating PROV objects for the files.

        """
        sample.files_namespace_preffix = "files"
        sample.file_namespace = sample.ProvBundle.add_namespace(
            sample.files_namespace_preffix,
            f"Files associated with {kind} {sample.name}",
        )
        # Files PROV attributes: namespace, entities
        for key, file in sample.files.items():
            # This prevents errors when the file refers to a project csv or JSON
            if file.name == sample.name:
                file.name = file.basename
            # Same function call, but in the first we pass the 'other_attributes' argument
            if self.add_attributes:
                self._entities[file.name] = sample.ProvBundle.entity(
                    f"{sample.files_namespace_preffix}:{file.tag}",
                    other_attributes=build_prov_attributes(
                        file.serializer(), sample.file_namespace),
                )
            else:
                self._entities[file.name] = sample.ProvBundle.entity(
                    f"{sample.files_namespace_preffix}:{file.tag}", )

            # Adding relationships
            sample.ProvBundle.wasDerivedFrom(
                self._entities[file.name],
                self._entities[sample.name],
            )

    def _create_program_entities(self, sample, kind="Sample"):
        # Programs PROV attributes: namespace, entities
        programs_namespace_prefix = f"programs"
        programs_namespace = sample.ProvBundle.add_namespace(
            programs_namespace_prefix,
            f"Programs associated with {kind} {sample.name}",
        )
        for key, program in sample.programs.items():
            last_run = program.runs[str(len(program.runs))]

            # We want to exclude _runs from the program serializer
            # So we put a custom serializer filter
            keys = ("sample", "_runs")
            serialized_program = serializer_filter(program, keys)
            try:
                del serialized_program["params"]
            except KeyError:
                pass

            # Same function call, but in the first we pass the 'other_attributes' argument
            if self.add_attributes:
                self._activities[program.name] = sample.ProvBundle.activity(
                    f"{programs_namespace_prefix}:{program.name}",
                    startTime=last_run.start_time,
                    endTime=last_run.end_time,
                    other_attributes=build_prov_attributes(
                        serialized_program, programs_namespace),
                )
            else:
                self._activities[program.name] = sample.ProvBundle.activity(
                    f"{programs_namespace_prefix}:{program.name}",
                    startTime=last_run.start_time,
                    endTime=last_run.end_time,
                )

            if self._create_envs_and_users:
                for _user, _env_dict in self.project.users.items():
                    _user_bundle = self._user_bundles[_user]
                    for _env_hash, _env in _env_dict.items():
                        if _env_hash == last_run.env:
                            if self.add_attributes:
                                self._agents[_env_hash] = _user_bundle.agent(
                                    f"envs:{_env}",
                                    other_attributes=build_prov_attributes(
                                        _env.env_dict, _env.env_namespace),
                                )
                            else:
                                self._agents[_env_hash] = _user_bundle.agent(
                                    f"envs:{_env}")
                            if not _env.actedOnBehalfOf:
                                _user_bundle.actedOnBehalfOf(
                                    self._agents[_env_hash],
                                    self._agents[_user])
                                _env.actedOnBehalfOf = True
                sample.ProvBundle.wasAssociatedWith(
                    self._activities[program.name], self._agents[last_run.env])

            inputs, outputs = self._get_IO_from_params(program)
            self._add_IO_relationships(sample, program, inputs, "input")
            self._add_IO_relationships(sample, program, outputs, "output")

    def _add_IO_relationships(self, sample, program, io_list, io_type):
        # TODO: replace Sample for Project when implementing Project.files and programs
        """
        Add PROV relationships between Program and input/output files.

        :param sample: instance of bioprov.Sample
        :param program: instance of bioprov.Program
        :param io_list: list of input/output files
        :param io_type: 'input' or 'output'
        :return: Adds relationship between
        """

        # Small assertion block
        choices = ("input", "output")
        assert io_type in choices, Warnings()["choices"](io_type, choices,
                                                         "io_type")

        # Start function
        sample_files = [str(file) for _, file in sample.files.items()]
        for value in io_list:
            if value in sample_files:
                file_obj = [
                    file_ for _, file_ in sample.files.items()
                    if str(file_) == value
                ]
                if file_obj:
                    file_obj, *_ = file_obj
                    if io_type == "input":
                        sample.ProvBundle.used(
                            self._entities[file_obj.name],
                            self._activities[program.name],
                        )
                    elif io_type == "output":
                        sample.ProvBundle.wasGeneratedBy(
                            self._entities[file_obj.name],
                            self._activities[program.name],
                        )

    @staticmethod
    def _get_IO_from_params(program):
        """
        :param program: instance of bioprov.Program
        :return: list of input parameter values and list of output parameter values
        """
        # Relationships based on Parameters
        inputs, outputs = [], []

        for _, parameter in program.params.items():
            assert isinstance(parameter, Parameter), (
                Warnings()["incorrect_type"](parameter, Parameter) +
                "\nPlease check if Programs were correctly deserialized.")
            if parameter.kind == "input":
                # This loop is because some positional arguments may have empty values (value stored in parameter.key)
                if parameter.value:
                    inputs.append(parameter.value)
                else:
                    inputs.append(parameter.key)
            elif parameter.kind == "output":
                if parameter.value:
                    outputs.append(parameter.value)
                else:
                    outputs.append(parameter.key)

        return inputs, outputs

    def _add_activities_namespace(self):
        """
        Add activities Namespace to self.
        :return:
        """

        if len(self.ProvDocument.namespaces) == 0:
            self.ProvDocument.add_namespace(
                "activities",
                f"Activities associated with bioprov Project '{self.project.tag}'",
            )

    def upload_to_provstore(self, api=None):
        """
        Uploads self.ProvDocument. to ProvStore (https://openprovenance.org/store/)

        :param api: provstore.api.Api
        :return: Sends POST request to ProvStore API and updates self.ProvDocument if successful.
        """
        if api is None:
            api = config.provstore_api
        try:
            self.provstore_document = api.document.create(
                self.ProvDocument, name=self.project.tag)
        except ConnectionError:
            logging.error(
                "Could not create remote document. Please check your internet connection and ProvStore credentials."
            )

    def write_provn(self, path=None):
        """
        Writes PROVN output of document.
        :param path: Path to write file.
        :return: Writes file.
        """
        if path is None:
            path = f"./{self.project.tag}_provn"
            if self.add_attributes:
                path += "_attrs"
            path += ".txt"

        path = Path(path)
        assert (
            path.parent.exists()
        ), f"Directory '{path.parent}' not found.\nPlease provide a valid directory."

        if path.exists():
            logging.info(f"Overwriting file at '{path}'")

        with open(path, "w") as f:
            f.write(self.provn)
            if path.exists():
                logging.info(f"Wrote PROVN record to {path}.")
Beispiel #21
0
class ProvenanceProfile:
    """
    Provenance profile.

    Populated as the workflow runs.
    """
    def __init__(
        self,
        research_object: "ResearchObject",
        full_name: str,
        host_provenance: bool,
        user_provenance: bool,
        orcid: str,
        fsaccess: StdFsAccess,
        run_uuid: Optional[uuid.UUID] = None,
    ) -> None:
        """Initialize the provenance profile."""
        self.fsaccess = fsaccess
        self.orcid = orcid
        self.research_object = research_object
        self.folder = self.research_object.folder
        self.document = ProvDocument()
        self.host_provenance = host_provenance
        self.user_provenance = user_provenance
        self.engine_uuid = research_object.engine_uuid  # type: str
        self.add_to_manifest = self.research_object.add_to_manifest
        if self.orcid:
            _logger.debug("[provenance] Creator ORCID: %s", self.orcid)
        self.full_name = full_name
        if self.full_name:
            _logger.debug("[provenance] Creator Full name: %s", self.full_name)
        self.workflow_run_uuid = run_uuid or uuid.uuid4()
        self.workflow_run_uri = self.workflow_run_uuid.urn  # type: str
        self.generate_prov_doc()

    def __str__(self) -> str:
        """Represent this Provenvance profile as a string."""
        return "ProvenanceProfile <{}> in <{}>".format(
            self.workflow_run_uri,
            self.research_object,
        )

    def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
        """Add basic namespaces."""
        def host_provenance(document: ProvDocument) -> None:
            """Record host provenance."""
            document.add_namespace(CWLPROV)
            document.add_namespace(UUID)
            document.add_namespace(FOAF)

            hostname = getfqdn()
            # won't have a foaf:accountServiceHomepage for unix hosts, but
            # we can at least provide hostname
            document.agent(
                ACCOUNT_UUID,
                {
                    PROV_TYPE: FOAF["OnlineAccount"],
                    "prov:location": hostname,
                    CWLPROV["hostname"]: hostname,
                },
            )

        self.cwltool_version = "cwltool %s" % versionstring().split()[-1]
        self.document.add_namespace("wfprov",
                                    "http://purl.org/wf4ever/wfprov#")
        # document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
        self.document.add_namespace("wfdesc",
                                    "http://purl.org/wf4ever/wfdesc#")
        # TODO: Make this ontology. For now only has cwlprov:image
        self.document.add_namespace("cwlprov", "https://w3id.org/cwl/prov#")
        self.document.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")
        self.document.add_namespace("schema", "http://schema.org/")
        self.document.add_namespace("orcid", "https://orcid.org/")
        self.document.add_namespace("id", "urn:uuid:")
        # NOTE: Internet draft expired 2004-03-04 (!)
        #  https://tools.ietf.org/html/draft-thiemann-hash-urn-01
        # TODO: Change to nih:sha-256; hashes
        #  https://tools.ietf.org/html/rfc6920#section-7
        self.document.add_namespace("data", "urn:hash::sha1:")
        # Also needed for docker images
        self.document.add_namespace(SHA256, "nih:sha-256;")

        # info only, won't really be used by prov as sub-resources use /
        self.document.add_namespace("researchobject",
                                    self.research_object.base_uri)
        # annotations
        self.metadata_ns = self.document.add_namespace(
            "metadata", self.research_object.base_uri + METADATA + "/")
        # Pre-register provenance directory so we can refer to its files
        self.provenance_ns = self.document.add_namespace(
            "provenance",
            self.research_object.base_uri + posix_path(PROVENANCE) + "/")
        ro_identifier_workflow = self.research_object.base_uri + "workflow/packed.cwl#"
        self.wf_ns = self.document.add_namespace("wf", ro_identifier_workflow)
        ro_identifier_input = (self.research_object.base_uri +
                               "workflow/primary-job.json#")
        self.document.add_namespace("input", ro_identifier_input)

        # More info about the account (e.g. username, fullname)
        # may or may not have been previously logged by user_provenance()
        # .. but we always know cwltool was launched (directly or indirectly)
        # by a user account, as cwltool is a command line tool
        account = self.document.agent(ACCOUNT_UUID)
        if self.orcid or self.full_name:
            person = {PROV_TYPE: PROV["Person"], "prov:type": SCHEMA["Person"]}
            if self.full_name:
                person["prov:label"] = self.full_name
                person["foaf:name"] = self.full_name
                person["schema:name"] = self.full_name
            else:
                # TODO: Look up name from ORCID API?
                pass
            agent = self.document.agent(self.orcid or uuid.uuid4().urn, person)
            self.document.actedOnBehalfOf(account, agent)
        else:
            if self.host_provenance:
                host_provenance(self.document)
            if self.user_provenance:
                self.research_object.user_provenance(self.document)
        # The execution of cwltool
        wfengine = self.document.agent(
            self.engine_uuid,
            {
                PROV_TYPE: PROV["SoftwareAgent"],
                "prov:type": WFPROV["WorkflowEngine"],
                "prov:label": self.cwltool_version,
            },
        )
        # FIXME: This datetime will be a bit too delayed, we should
        # capture when cwltool.py earliest started?
        self.document.wasStartedBy(wfengine, None, account,
                                   datetime.datetime.now())
        # define workflow run level activity
        self.document.activity(
            self.workflow_run_uri,
            datetime.datetime.now(),
            None,
            {
                PROV_TYPE: WFPROV["WorkflowRun"],
                "prov:label": "Run of workflow/packed.cwl#main",
            },
        )
        # association between SoftwareAgent and WorkflowRun
        main_workflow = "wf:main"
        self.document.wasAssociatedWith(self.workflow_run_uri,
                                        self.engine_uuid, main_workflow)
        self.document.wasStartedBy(self.workflow_run_uri, None,
                                   self.engine_uuid, datetime.datetime.now())
        return (self.workflow_run_uri, self.document)

    def evaluate(
        self,
        process: Process,
        job: JobsType,
        job_order_object: CWLObjectType,
        research_obj: "ResearchObject",
    ) -> None:
        """Evaluate the nature of job."""
        if not hasattr(process, "steps"):
            # record provenance of independent commandline tool executions
            self.prospective_prov(job)
            customised_job = copy_job_order(job, job_order_object)
            self.used_artefacts(customised_job, self.workflow_run_uri)
            research_obj.create_job(customised_job)
        elif hasattr(job, "workflow"):
            # record provenance of workflow executions
            self.prospective_prov(job)
            customised_job = copy_job_order(job, job_order_object)
            self.used_artefacts(customised_job, self.workflow_run_uri)

    def record_process_start(
            self,
            process: Process,
            job: JobsType,
            process_run_id: Optional[str] = None) -> Optional[str]:
        if not hasattr(process, "steps"):
            process_run_id = self.workflow_run_uri
        elif not hasattr(job, "workflow"):
            # commandline tool execution as part of workflow
            name = ""
            if isinstance(job, (CommandLineJob, JobBase, WorkflowJob)):
                name = job.name
            process_name = urllib.parse.quote(name, safe=":/,#")
            process_run_id = self.start_process(process_name,
                                                datetime.datetime.now())
        return process_run_id

    def start_process(
        self,
        process_name: str,
        when: datetime.datetime,
        process_run_id: Optional[str] = None,
    ) -> str:
        """Record the start of each Process."""
        if process_run_id is None:
            process_run_id = uuid.uuid4().urn
        prov_label = "Run of workflow/packed.cwl#main/" + process_name
        self.document.activity(
            process_run_id,
            None,
            None,
            {
                PROV_TYPE: WFPROV["ProcessRun"],
                PROV_LABEL: prov_label
            },
        )
        self.document.wasAssociatedWith(process_run_id, self.engine_uuid,
                                        str("wf:main/" + process_name))
        self.document.wasStartedBy(process_run_id, None, self.workflow_run_uri,
                                   when, None, None)
        return process_run_id

    def record_process_end(
        self,
        process_name: str,
        process_run_id: str,
        outputs: Union[CWLObjectType, MutableSequence[CWLObjectType], None],
        when: datetime.datetime,
    ) -> None:
        self.generate_output_prov(outputs, process_run_id, process_name)
        self.document.wasEndedBy(process_run_id, None, self.workflow_run_uri,
                                 when)

    def declare_file(
            self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, str]:
        if value["class"] != "File":
            raise ValueError("Must have class:File: %s" % value)
        # Need to determine file hash aka RO filename
        entity = None  # type: Optional[ProvEntity]
        checksum = None
        if "checksum" in value:
            csum = cast(str, value["checksum"])
            (method, checksum) = csum.split("$", 1)
            if method == SHA1 and self.research_object.has_data_file(checksum):
                entity = self.document.entity("data:" + checksum)

        if not entity and "location" in value:
            location = str(value["location"])
            # If we made it here, we'll have to add it to the RO
            with self.fsaccess.open(location, "rb") as fhandle:
                relative_path = self.research_object.add_data_file(fhandle)
                # FIXME: This naively relies on add_data_file setting hash as filename
                checksum = PurePath(relative_path).name
                entity = self.document.entity("data:" + checksum,
                                              {PROV_TYPE: WFPROV["Artifact"]})
                if "checksum" not in value:
                    value["checksum"] = f"{SHA1}${checksum}"

        if not entity and "contents" in value:
            # Anonymous file, add content as string
            entity, checksum = self.declare_string(cast(
                str, value["contents"]))

        # By here one of them should have worked!
        if not entity or not checksum:
            raise ValueError(
                "class:File but missing checksum/location/content: %r" % value)

        # Track filename and extension, this is generally useful only for
        # secondaryFiles. Note that multiple uses of a file might thus record
        # different names for the same entity, so we'll
        # make/track a specialized entity by UUID
        file_id = value.setdefault("@id", uuid.uuid4().urn)
        # A specialized entity that has just these names
        file_entity = self.document.entity(
            file_id,
            [(PROV_TYPE, WFPROV["Artifact"]), (PROV_TYPE, WF4EVER["File"])],
        )  # type: ProvEntity

        if "basename" in value:
            file_entity.add_attributes(
                {CWLPROV["basename"]: value["basename"]})
        if "nameroot" in value:
            file_entity.add_attributes(
                {CWLPROV["nameroot"]: value["nameroot"]})
        if "nameext" in value:
            file_entity.add_attributes({CWLPROV["nameext"]: value["nameext"]})
        self.document.specializationOf(file_entity, entity)

        # Check for secondaries
        for sec in cast(MutableSequence[CWLObjectType],
                        value.get("secondaryFiles", [])):
            # TODO: Record these in a specializationOf entity with UUID?
            if sec["class"] == "File":
                (sec_entity, _, _) = self.declare_file(sec)
            elif sec["class"] == "Directory":
                sec_entity = self.declare_directory(sec)
            else:
                raise ValueError(f"Got unexpected secondaryFiles value: {sec}")
            # We don't know how/when/where the secondary file was generated,
            # but CWL convention is a kind of summary/index derived
            # from the original file. As its generally in a different format
            # then prov:Quotation is not appropriate.
            self.document.derivation(
                sec_entity,
                file_entity,
                other_attributes={PROV["type"]: CWLPROV["SecondaryFile"]},
            )

        return file_entity, entity, checksum

    def declare_directory(self, value: CWLObjectType) -> ProvEntity:
        """Register any nested files/directories."""
        # FIXME: Calculate a hash-like identifier for directory
        # so we get same value if it's the same filenames/hashes
        # in a different location.
        # For now, mint a new UUID to identify this directory, but
        # attempt to keep it inside the value dictionary
        dir_id = cast(str, value.setdefault("@id", uuid.uuid4().urn))

        # New annotation file to keep the ORE Folder listing
        ore_doc_fn = dir_id.replace("urn:uuid:", "directory-") + ".ttl"
        dir_bundle = self.document.bundle(self.metadata_ns[ore_doc_fn])

        coll = self.document.entity(
            dir_id,
            [
                (PROV_TYPE, WFPROV["Artifact"]),
                (PROV_TYPE, PROV["Collection"]),
                (PROV_TYPE, PROV["Dictionary"]),
                (PROV_TYPE, RO["Folder"]),
            ],
        )
        # ORE description of ro:Folder, saved separately
        coll_b = dir_bundle.entity(
            dir_id,
            [(PROV_TYPE, RO["Folder"]), (PROV_TYPE, ORE["Aggregation"])],
        )
        self.document.mentionOf(dir_id + "#ore", dir_id, dir_bundle.identifier)

        # dir_manifest = dir_bundle.entity(
        #     dir_bundle.identifier, {PROV["type"]: ORE["ResourceMap"],
        #                             ORE["describes"]: coll_b.identifier})

        coll_attribs = [(ORE["isDescribedBy"], dir_bundle.identifier)]
        coll_b_attribs = []  # type: List[Tuple[Identifier, ProvEntity]]

        # FIXME: .listing might not be populated yet - hopefully
        # a later call to this method will sort that
        is_empty = True

        if "listing" not in value:
            get_listing(self.fsaccess, value)
        for entry in cast(MutableSequence[CWLObjectType],
                          value.get("listing", [])):
            is_empty = False
            # Declare child-artifacts
            entity = self.declare_artefact(entry)
            self.document.membership(coll, entity)
            # Membership relation aka our ORE Proxy
            m_id = uuid.uuid4().urn
            m_entity = self.document.entity(m_id)
            m_b = dir_bundle.entity(m_id)

            # PROV-O style Dictionary
            # https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition
            # ..as prov.py do not currently allow PROV-N extensions
            # like hadDictionaryMember(..)
            m_entity.add_asserted_type(PROV["KeyEntityPair"])

            m_entity.add_attributes({
                PROV["pairKey"]: entry["basename"],
                PROV["pairEntity"]: entity,
            })

            # As well as a being a
            # http://wf4ever.github.io/ro/2016-01-28/ro/#FolderEntry
            m_b.add_asserted_type(RO["FolderEntry"])
            m_b.add_asserted_type(ORE["Proxy"])
            m_b.add_attributes({
                RO["entryName"]: entry["basename"],
                ORE["proxyIn"]: coll,
                ORE["proxyFor"]: entity,
            })
            coll_attribs.append((PROV["hadDictionaryMember"], m_entity))
            coll_b_attribs.append((ORE["aggregates"], m_b))

        coll.add_attributes(coll_attribs)
        coll_b.add_attributes(coll_b_attribs)

        # Also Save ORE Folder as annotation metadata
        ore_doc = ProvDocument()
        ore_doc.add_namespace(ORE)
        ore_doc.add_namespace(RO)
        ore_doc.add_namespace(UUID)
        ore_doc.add_bundle(dir_bundle)
        ore_doc = ore_doc.flattened()
        ore_doc_path = str(PurePosixPath(METADATA, ore_doc_fn))
        with self.research_object.write_bag_file(
                ore_doc_path) as provenance_file:
            ore_doc.serialize(provenance_file,
                              format="rdf",
                              rdf_format="turtle")
        self.research_object.add_annotation(dir_id, [ore_doc_fn],
                                            ORE["isDescribedBy"].uri)

        if is_empty:
            # Empty directory
            coll.add_asserted_type(PROV["EmptyCollection"])
            coll.add_asserted_type(PROV["EmptyDictionary"])
        self.research_object.add_uri(coll.identifier.uri)
        return coll

    def declare_string(self, value: str) -> Tuple[ProvEntity, str]:
        """Save as string in UTF-8."""
        byte_s = BytesIO(str(value).encode(ENCODING))
        data_file = self.research_object.add_data_file(byte_s,
                                                       content_type=TEXT_PLAIN)
        checksum = PurePosixPath(data_file).name
        # FIXME: Don't naively assume add_data_file uses hash in filename!
        data_id = "data:%s" % PurePosixPath(data_file).stem
        entity = self.document.entity(data_id, {
            PROV_TYPE: WFPROV["Artifact"],
            PROV_VALUE: str(value)
        })  # type: ProvEntity
        return entity, checksum

    def declare_artefact(self, value: Optional[CWLOutputType]) -> ProvEntity:
        """Create data artefact entities for all file objects."""
        if value is None:
            # FIXME: If this can happen in CWL, we'll
            # need a better way to represent this in PROV
            return self.document.entity(CWLPROV["None"], {PROV_LABEL: "None"})

        if isinstance(value, (bool, int, float)):
            # Typically used in job documents for flags

            # FIXME: Make consistent hash URIs for these
            # that somehow include the type
            # (so "1" != 1 != "1.0" != true)
            entity = self.document.entity(uuid.uuid4().urn,
                                          {PROV_VALUE: value})
            self.research_object.add_uri(entity.identifier.uri)
            return entity

        if isinstance(value, (str, str)):
            (entity, _) = self.declare_string(value)
            return entity

        if isinstance(value, bytes):
            # If we got here then we must be in Python 3
            byte_s = BytesIO(value)
            data_file = self.research_object.add_data_file(byte_s)
            # FIXME: Don't naively assume add_data_file uses hash in filename!
            data_id = "data:%s" % PurePosixPath(data_file).stem
            return self.document.entity(
                data_id,
                {
                    PROV_TYPE: WFPROV["Artifact"],
                    PROV_VALUE: str(value)
                },
            )

        if isinstance(value, MutableMapping):
            if "@id" in value:
                # Already processed this value, but it might not be in this PROV
                entities = self.document.get_record(value["@id"])
                if entities:
                    return entities[0]
                # else, unknown in PROV, re-add below as if it's fresh

            # Base case - we found a File we need to update
            if value.get("class") == "File":
                (entity, _, _) = self.declare_file(value)
                value["@id"] = entity.identifier.uri
                return entity

            if value.get("class") == "Directory":
                entity = self.declare_directory(value)
                value["@id"] = entity.identifier.uri
                return entity
            coll_id = value.setdefault("@id", uuid.uuid4().urn)
            # some other kind of dictionary?
            # TODO: also Save as JSON
            coll = self.document.entity(
                coll_id,
                [
                    (PROV_TYPE, WFPROV["Artifact"]),
                    (PROV_TYPE, PROV["Collection"]),
                    (PROV_TYPE, PROV["Dictionary"]),
                ],
            )

            if value.get("class"):
                _logger.warning("Unknown data class %s.", value["class"])
                # FIXME: The class might be "http://example.com/somethingelse"
                coll.add_asserted_type(CWLPROV[value["class"]])

            # Let's iterate and recurse
            coll_attribs = []  # type: List[Tuple[Identifier, ProvEntity]]
            for (key, val) in value.items():
                v_ent = self.declare_artefact(val)
                self.document.membership(coll, v_ent)
                m_entity = self.document.entity(uuid.uuid4().urn)
                # Note: only support PROV-O style dictionary
                # https://www.w3.org/TR/prov-dictionary/#dictionary-ontological-definition
                # as prov.py do not easily allow PROV-N extensions
                m_entity.add_asserted_type(PROV["KeyEntityPair"])
                m_entity.add_attributes({
                    PROV["pairKey"]: str(key),
                    PROV["pairEntity"]: v_ent
                })
                coll_attribs.append((PROV["hadDictionaryMember"], m_entity))
            coll.add_attributes(coll_attribs)
            self.research_object.add_uri(coll.identifier.uri)
            return coll

        # some other kind of Collection?
        # TODO: also save as JSON
        try:
            members = []
            for each_input_obj in iter(value):
                # Recurse and register any nested objects
                e = self.declare_artefact(each_input_obj)
                members.append(e)

            # If we reached this, then we were allowed to iterate
            coll = self.document.entity(
                uuid.uuid4().urn,
                [
                    (PROV_TYPE, WFPROV["Artifact"]),
                    (PROV_TYPE, PROV["Collection"]),
                ],
            )
            if not members:
                coll.add_asserted_type(PROV["EmptyCollection"])
            else:
                for member in members:
                    # FIXME: This won't preserve order, for that
                    # we would need to use PROV.Dictionary
                    # with numeric keys
                    self.document.membership(coll, member)
            self.research_object.add_uri(coll.identifier.uri)
            # FIXME: list value does not support adding "@id"
            return coll
        except TypeError:
            _logger.warning("Unrecognized type %s of %r", type(value), value)
            # Let's just fall back to Python repr()
            entity = self.document.entity(uuid.uuid4().urn,
                                          {PROV_LABEL: repr(value)})
            self.research_object.add_uri(entity.identifier.uri)
            return entity

    def used_artefacts(
        self,
        job_order: Union[CWLObjectType, List[CWLObjectType]],
        process_run_id: str,
        name: Optional[str] = None,
    ) -> None:
        """Add used() for each data artefact."""
        if isinstance(job_order, list):
            for entry in job_order:
                self.used_artefacts(entry, process_run_id, name)
        else:
            # FIXME: Use workflow name in packed.cwl, "main" is wrong for nested workflows
            base = "main"
            if name is not None:
                base += "/" + name
            for key, value in job_order.items():
                prov_role = self.wf_ns[f"{base}/{key}"]
                try:
                    entity = self.declare_artefact(value)
                    self.document.used(
                        process_run_id,
                        entity,
                        datetime.datetime.now(),
                        None,
                        {"prov:role": prov_role},
                    )
                except OSError:
                    pass

    def generate_output_prov(
        self,
        final_output: Union[CWLObjectType, MutableSequence[CWLObjectType],
                            None],
        process_run_id: Optional[str],
        name: Optional[str],
    ) -> None:
        """Call wasGeneratedBy() for each output,copy the files into the RO."""
        if isinstance(final_output, MutableSequence):
            for entry in final_output:
                self.generate_output_prov(entry, process_run_id, name)
        elif final_output is not None:
            # Timestamp should be created at the earliest
            timestamp = datetime.datetime.now()

            # For each output, find/register the corresponding
            # entity (UUID) and document it as generated in
            # a role corresponding to the output
            for output, value in final_output.items():
                entity = self.declare_artefact(value)
                if name is not None:
                    name = urllib.parse.quote(str(name), safe=":/,#")
                    # FIXME: Probably not "main" in nested workflows
                    role = self.wf_ns[f"main/{name}/{output}"]
                else:
                    role = self.wf_ns["main/%s" % output]

                if not process_run_id:
                    process_run_id = self.workflow_run_uri

                self.document.wasGeneratedBy(entity, process_run_id, timestamp,
                                             None, {"prov:role": role})

    def prospective_prov(self, job: JobsType) -> None:
        """Create prospective prov recording as wfdesc prov:Plan."""
        if not isinstance(job, WorkflowJob):
            # direct command line tool execution
            self.document.entity(
                "wf:main",
                {
                    PROV_TYPE: WFDESC["Process"],
                    "prov:type": PROV["Plan"],
                    "prov:label": "Prospective provenance",
                },
            )
            return

        self.document.entity(
            "wf:main",
            {
                PROV_TYPE: WFDESC["Workflow"],
                "prov:type": PROV["Plan"],
                "prov:label": "Prospective provenance",
            },
        )

        for step in job.steps:
            stepnametemp = "wf:main/" + str(step.name)[5:]
            stepname = urllib.parse.quote(stepnametemp, safe=":/,#")
            provstep = self.document.entity(
                stepname,
                {
                    PROV_TYPE: WFDESC["Process"],
                    "prov:type": PROV["Plan"]
                },
            )
            self.document.entity(
                "wf:main",
                {
                    "wfdesc:hasSubProcess": provstep,
                    "prov:label": "Prospective provenance",
                },
            )
        # TODO: Declare roles/parameters as well

    def activity_has_provenance(self, activity, prov_ids):
        # type: (str, List[Identifier]) -> None
        """Add http://www.w3.org/TR/prov-aq/ relations to nested PROV files."""
        # NOTE: The below will only work if the corresponding metadata/provenance arcp URI
        # is a pre-registered namespace in the PROV Document
        attribs = [(PROV["has_provenance"], prov_id) for prov_id in prov_ids]
        self.document.activity(activity, other_attributes=attribs)
        # Tip: we can't use https://www.w3.org/TR/prov-links/#term-mention
        # as prov:mentionOf() is only for entities, not activities
        uris = [i.uri for i in prov_ids]
        self.research_object.add_annotation(activity, uris,
                                            PROV["has_provenance"].uri)

    def finalize_prov_profile(self, name):
        # type: (Optional[str]) -> List[Identifier]
        """Transfer the provenance related files to the RO."""
        # NOTE: Relative posix path
        if name is None:
            # main workflow, fixed filenames
            filename = "primary.cwlprov"
        else:
            # ASCII-friendly filename, avoiding % as we don't want %2520 in manifest.json
            wf_name = urllib.parse.quote(str(name), safe="").replace("%", "_")
            # Note that the above could cause overlaps for similarly named
            # workflows, but that's OK as we'll also include run uuid
            # which also covers thhe case of this step being run in
            # multiple places or iterations
            filename = f"{wf_name}.{self.workflow_run_uuid}.cwlprov"

        basename = str(PurePosixPath(PROVENANCE) / filename)

        # TODO: Also support other profiles than CWLProv, e.g. ProvOne

        # list of prov identifiers of provenance files
        prov_ids = []

        # https://www.w3.org/TR/prov-xml/
        with self.research_object.write_bag_file(basename +
                                                 ".xml") as provenance_file:
            self.document.serialize(provenance_file, format="xml", indent=4)
            prov_ids.append(self.provenance_ns[filename + ".xml"])

        # https://www.w3.org/TR/prov-n/
        with self.research_object.write_bag_file(basename +
                                                 ".provn") as provenance_file:
            self.document.serialize(provenance_file, format="provn", indent=2)
            prov_ids.append(self.provenance_ns[filename + ".provn"])

        # https://www.w3.org/Submission/prov-json/
        with self.research_object.write_bag_file(basename +
                                                 ".json") as provenance_file:
            self.document.serialize(provenance_file, format="json", indent=2)
            prov_ids.append(self.provenance_ns[filename + ".json"])

        # "rdf" aka https://www.w3.org/TR/prov-o/
        # which can be serialized to ttl/nt/jsonld (and more!)

        # https://www.w3.org/TR/turtle/
        with self.research_object.write_bag_file(basename +
                                                 ".ttl") as provenance_file:
            self.document.serialize(provenance_file,
                                    format="rdf",
                                    rdf_format="turtle")
            prov_ids.append(self.provenance_ns[filename + ".ttl"])

        # https://www.w3.org/TR/n-triples/
        with self.research_object.write_bag_file(basename +
                                                 ".nt") as provenance_file:
            self.document.serialize(provenance_file,
                                    format="rdf",
                                    rdf_format="ntriples")
            prov_ids.append(self.provenance_ns[filename + ".nt"])

        # https://www.w3.org/TR/json-ld/
        # TODO: Use a nice JSON-LD context
        # see also https://eprints.soton.ac.uk/395985/
        # 404 Not Found on https://provenance.ecs.soton.ac.uk/prov.jsonld :(
        with self.research_object.write_bag_file(basename +
                                                 ".jsonld") as provenance_file:
            self.document.serialize(provenance_file,
                                    format="rdf",
                                    rdf_format="json-ld")
            prov_ids.append(self.provenance_ns[filename + ".jsonld"])

        _logger.debug("[provenance] added provenance: %s", prov_ids)
        return prov_ids
from prov.model import ProvDocument
from provdbconnector import ProvApi
from provdbconnector.db_adapters.in_memory import SimpleInMemoryAdapter

prov_api = ProvApi(adapter=SimpleInMemoryAdapter, auth_info=None)

# create the prov document
prov_document = ProvDocument()
prov_document.add_namespace("ex", "http://example.com")

prov_document.agent("ex:Bob")
prov_document.activity("ex:Alice")

prov_document.association("ex:Alice", "ex:Bob")
# create bundle
b1 = prov_document.bundle("ex:bundle1")
b1.agent("ex:Yoda")

b2 = prov_document.bundle("ex:bundle2")
b2.agent("ex:Jabba the Hutt")

document_id = prov_api.create_document(prov_document)

print(prov_api.get_document_as_provn(document_id))

# Output:
#
# document
#   prefix ex <http://example.com>
#
#   agent(ex:Bob)