Esempio n. 1
0
    def deriveDependency(self, aDO, aRO, derivedList):

        d1 = ProvDocument()  # d1 is now an empty provenance document
        d1.add_namespace("dt", "http://cs.ncl.ac.uk/dtsim/")
        e1 = d1.entity(DTns + aRO.id)  # deriving
        ag1 = d1.agent(DTns + str(aDO.id))
        for der in derivedList:
            # create provlet
            e2 = d1.entity(DTns + der.id)  # derived
            d1.wasAttributedTo(e2, ag1)
            d1.wasDerivedFrom(e2, e1)

            # update upstream pointer
            der.upstream = [(aRO, None)]  # aRO is upstream from aRO with no activity

            # update downstream
            aRO.downstream.append((der, None))  # aR1 is downstream from aR1 with no activity

        # update global graph
        e1 = pGlobal.entity(DTns + aRO.id)  # deriving
        ag1 = pGlobal.agent(DTns + str(aDO.id))
        pGlobal.wasAttributedTo(e2, ag1)
        for der in derivedList:
            e2 = pGlobal.entity(DTns + der.id)  # derived
            pGlobal.wasDerivedFrom(e2, e1)

        # trigger credit recomputation
        for der in derivedList:
            # aRO needs its credit updated with aRO1.credit
            aCreditManager.addDerivationCredit(aRO, der.currentTotalCredit)

        # 		self.notify(d1)
        return d1
Esempio n. 2
0
def bundles2():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles2.provn
    #===========================================================================
    # document
    g = ProvDocument()

    #   prefix ex  <http://example.org/example/>
    g.add_namespace("ex", "http://www.example.com/")

    #   prefix alice  <http://example.org/alice/>
    #   prefix bob  <http://example.org/bob/>
    g.add_namespace('alice', 'http://example.org/alice/')
    g.add_namespace('bob', 'http://example.org/bob/')

    #   entity(bob:bundle4, [prov:type='prov:Bundle'])
    #   wasGeneratedBy(bob:bundle4, -, 2012-05-24T10:30:00)
    #   agent(ex:Bob)
    #   wasAttributedTo(bob:bundle4, ex:Bob)
    g.entity('bob:bundle4', {'prov:type': PROV['Bundle']})
    g.wasGeneratedBy('bob:bundle4', time='2012-05-24T10:30:00')
    g.agent('ex:Bob')
    g.wasAttributedTo('bob:bundle4', 'ex:Bob')

    #   entity(alice:bundle5, [ prov:type='prov:Bundle' ])
    #   wasGeneratedBy(alice:bundle5, -, 2012-05-25T11:15:00)
    #   agent(ex:Alice)
    #   wasAttributedTo(alice:bundle5, ex:Alice)
    g.entity('alice:bundle5', {'prov:type': PROV['Bundle']})
    g.wasGeneratedBy('alice:bundle5', time='2012-05-25T11:15:00')
    g.agent('ex:Alice')
    g.wasAttributedTo('alice:bundle5', 'ex:Alice')

    #   bundle bob:bundle4
    #     entity(ex:report1, [ prov:type="report", ex:version=1 ])
    #     wasGeneratedBy(ex:report1, -, 2012-05-24T10:00:01)
    #   endBundle
    b4 = g.bundle('bob:bundle4')
    b4.entity('ex:report1', {'prov:type': "report", 'ex:version': 1})
    b4.wasGeneratedBy('ex:report1', time='2012-05-24T10:00:01')

    #   bundle alice:bundle5
    #     entity(ex:report1bis)
    #     mentionOf(ex:report1bis, ex:report1, bob:bundle4)
    #     entity(ex:report2, [ prov:type="report", ex:version=2 ])
    #     wasGeneratedBy(ex:report2, -, 2012-05-25T11:00:01)
    #     wasDerivedFrom(ex:report2, ex:report1bis)
    #   endBundle
    b5 = g.bundle('alice:bundle5')
    b5.entity('ex:report1bis')
    b5.mentionOf('ex:report1bis', 'ex:report1', 'bob:bundle4')
    b5.entity('ex:report2', [('prov:type', "report"), ('ex:version', 2)])
    b5.wasGeneratedBy('ex:report2', time='2012-05-25T11:00:01')
    b5.wasDerivedFrom('ex:report2', 'ex:report1bis')

    # endDocument
    return g
Esempio n. 3
0
def bundles2():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles2.provn
    # ===========================================================================
    # document
    g = ProvDocument()

    #   prefix ex  <http://example.org/example/>
    g.add_namespace("ex", "http://www.example.com/")

    #   prefix alice  <http://example.org/alice/>
    #   prefix bob  <http://example.org/bob/>
    g.add_namespace("alice", "http://example.org/alice/")
    g.add_namespace("bob", "http://example.org/bob/")

    #   entity(bob:bundle4, [prov:type='prov:Bundle'])
    #   wasGeneratedBy(bob:bundle4, -, 2012-05-24T10:30:00)
    #   agent(ex:Bob)
    #   wasAttributedTo(bob:bundle4, ex:Bob)
    g.entity("bob:bundle4", {"prov:type": PROV["Bundle"]})
    g.wasGeneratedBy("bob:bundle4", time="2012-05-24T10:30:00")
    g.agent("ex:Bob")
    g.wasAttributedTo("bob:bundle4", "ex:Bob")

    #   entity(alice:bundle5, [ prov:type='prov:Bundle' ])
    #   wasGeneratedBy(alice:bundle5, -, 2012-05-25T11:15:00)
    #   agent(ex:Alice)
    #   wasAttributedTo(alice:bundle5, ex:Alice)
    g.entity("alice:bundle5", {"prov:type": PROV["Bundle"]})
    g.wasGeneratedBy("alice:bundle5", time="2012-05-25T11:15:00")
    g.agent("ex:Alice")
    g.wasAttributedTo("alice:bundle5", "ex:Alice")

    #   bundle bob:bundle4
    #     entity(ex:report1, [ prov:type="report", ex:version=1 ])
    #     wasGeneratedBy(ex:report1, -, 2012-05-24T10:00:01)
    #   endBundle
    b4 = g.bundle("bob:bundle4")
    b4.entity("ex:report1", {"prov:type": "report", "ex:version": 1})
    b4.wasGeneratedBy("ex:report1", time="2012-05-24T10:00:01")

    #   bundle alice:bundle5
    #     entity(ex:report1bis)
    #     mentionOf(ex:report1bis, ex:report1, bob:bundle4)
    #     entity(ex:report2, [ prov:type="report", ex:version=2 ])
    #     wasGeneratedBy(ex:report2, -, 2012-05-25T11:00:01)
    #     wasDerivedFrom(ex:report2, ex:report1bis)
    #   endBundle
    b5 = g.bundle("alice:bundle5")
    b5.entity("ex:report1bis")
    b5.mentionOf("ex:report1bis", "ex:report1", "bob:bundle4")
    b5.entity("ex:report2", [("prov:type", "report"), ("ex:version", 2)])
    b5.wasGeneratedBy("ex:report2", time="2012-05-25T11:00:01")
    b5.wasDerivedFrom("ex:report2", "ex:report1bis")

    # endDocument
    return g
Esempio n. 4
0
def bundles2():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles2.provn
    #===========================================================================
    # document
    g = ProvDocument()

    #   prefix ex  <http://example.org/example/>
    g.add_namespace("ex", "http://www.example.com/")

    #   prefix alice  <http://example.org/alice/>
    #   prefix bob  <http://example.org/bob/>
    g.add_namespace('alice', 'http://example.org/alice/')
    g.add_namespace('bob', 'http://example.org/bob/')

    #   entity(bob:bundle4, [prov:type='prov:Bundle'])
    #   wasGeneratedBy(bob:bundle4, -, 2012-05-24T10:30:00)
    #   agent(ex:Bob)
    #   wasAttributedTo(bob:bundle4, ex:Bob)
    g.entity('bob:bundle4', {'prov:type': PROV['Bundle']})
    g.wasGeneratedBy('bob:bundle4', time='2012-05-24T10:30:00')
    g.agent('ex:Bob')
    g.wasAttributedTo('bob:bundle4', 'ex:Bob')

    #   entity(alice:bundle5, [ prov:type='prov:Bundle' ])
    #   wasGeneratedBy(alice:bundle5, -, 2012-05-25T11:15:00)
    #   agent(ex:Alice)
    #   wasAttributedTo(alice:bundle5, ex:Alice)
    g.entity('alice:bundle5', {'prov:type': PROV['Bundle']})
    g.wasGeneratedBy('alice:bundle5', time='2012-05-25T11:15:00')
    g.agent('ex:Alice')
    g.wasAttributedTo('alice:bundle5', 'ex:Alice')

    #   bundle bob:bundle4
    #     entity(ex:report1, [ prov:type="report", ex:version=1 ])
    #     wasGeneratedBy(ex:report1, -, 2012-05-24T10:00:01)
    #   endBundle
    b4 = g.bundle('bob:bundle4')
    b4.entity('ex:report1', {'prov:type': "report", 'ex:version': 1})
    b4.wasGeneratedBy('ex:report1', time='2012-05-24T10:00:01')

    #   bundle alice:bundle5
    #     entity(ex:report1bis)
    #     mentionOf(ex:report1bis, ex:report1, bob:bundle4)
    #     entity(ex:report2, [ prov:type="report", ex:version=2 ])
    #     wasGeneratedBy(ex:report2, -, 2012-05-25T11:00:01)
    #     wasDerivedFrom(ex:report2, ex:report1bis)
    #   endBundle
    b5 = g.bundle('alice:bundle5')
    b5.entity('ex:report1bis')
    b5.mentionOf('ex:report1bis', 'ex:report1', 'bob:bundle4')
    b5.entity('ex:report2', [('prov:type', "report"), ('ex:version', 2)])
    b5.wasGeneratedBy('ex:report2', time='2012-05-25T11:00:01')
    b5.wasDerivedFrom('ex:report2', 'ex:report1bis')

    # endDocument
    return g
Esempio n. 5
0
def bundles1():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles1.provn
    #===============================================================================
    # document
    g = ProvDocument()

    #   prefix ex  <http://example.org/example/>
    EX = Namespace("ex", "http://www.example.com/")
    g.add_namespace(EX)

    #   prefix alice  <http://example.org/alice/>
    #   prefix bob  <http://example.org/bob/>
    g.add_namespace('alice', 'http://example.org/alice/')
    g.add_namespace('bob', 'http://example.org/bob/')

    #   entity(bob:bundle1, [prov:type='prov:Bundle'])
    g.entity('bob:bundle1', {'prov:type': PROV['Bundle']})
    #   wasGeneratedBy(bob:bundle1, -, 2012-05-24T10:30:00)
    g.wasGeneratedBy('bob:bundle1', time='2012-05-24T10:30:00')
    #   agent(ex:Bob)
    g.agent('ex:Bob')
    #   wasAttributedTo(bob:bundle1, ex:Bob)
    g.wasAttributedTo('bob:bundle1', 'ex:Bob')

    #   entity(alice:bundle2, [ prov:type='prov:Bundle' ])
    g.entity('alice:bundle2', {'prov:type': PROV['Bundle']})
    #   wasGeneratedBy(alice:bundle2, -, 2012-05-25T11:15:00)
    g.wasGeneratedBy('alice:bundle2', time='2012-05-25T11:15:00')
    #   agent(ex:Alice)
    g.agent('ex:Alice')
    #   wasAttributedTo(alice:bundle2, ex:Alice)
    g.wasAttributedTo('alice:bundle2', 'ex:Alice')

    #   bundle bob:bundle1
    b1 = g.bundle('bob:bundle1')
    #     entity(ex:report1, [ prov:type="report", ex:version=1 ])
    b1.entity('ex:report1', {'prov:type': "report", 'ex:version': 1})
    #     wasGeneratedBy(ex:report1, -, 2012-05-24T10:00:01)
    b1.wasGeneratedBy('ex:report1', time='2012-05-24T10:00:01')
    #   endBundle

    #   bundle alice:bundle2
    b2 = g.bundle('alice:bundle2')
    #     entity(ex:report1)
    b2.entity('ex:report1')
    #     entity(ex:report2, [ prov:type="report", ex:version=2 ])
    b2.entity('ex:report2', {'prov:type': "report", 'ex:version': 2})
    #     wasGeneratedBy(ex:report2, -, 2012-05-25T11:00:01)
    b2.wasGeneratedBy('ex:report2', time='2012-05-25T11:00:01')
    #     wasDerivedFrom(ex:report2, ex:report1)
    b2.wasDerivedFrom('ex:report2', 'ex:report1')
    #   endBundle

    # endDocument
    return g
Esempio n. 6
0
def bundles1():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/bundles1.provn
    # ===============================================================================
    # document
    g = ProvDocument()

    #   prefix ex  <http://example.org/example/>
    EX = Namespace("ex", "http://www.example.com/")
    g.add_namespace(EX)

    #   prefix alice  <http://example.org/alice/>
    #   prefix bob  <http://example.org/bob/>
    g.add_namespace("alice", "http://example.org/alice/")
    g.add_namespace("bob", "http://example.org/bob/")

    #   entity(bob:bundle1, [prov:type='prov:Bundle'])
    g.entity("bob:bundle1", {"prov:type": PROV["Bundle"]})
    #   wasGeneratedBy(bob:bundle1, -, 2012-05-24T10:30:00)
    g.wasGeneratedBy("bob:bundle1", time="2012-05-24T10:30:00")
    #   agent(ex:Bob)
    g.agent("ex:Bob")
    #   wasAttributedTo(bob:bundle1, ex:Bob)
    g.wasAttributedTo("bob:bundle1", "ex:Bob")

    #   entity(alice:bundle2, [ prov:type='prov:Bundle' ])
    g.entity("alice:bundle2", {"prov:type": PROV["Bundle"]})
    #   wasGeneratedBy(alice:bundle2, -, 2012-05-25T11:15:00)
    g.wasGeneratedBy("alice:bundle2", time="2012-05-25T11:15:00")
    #   agent(ex:Alice)
    g.agent("ex:Alice")
    #   wasAttributedTo(alice:bundle2, ex:Alice)
    g.wasAttributedTo("alice:bundle2", "ex:Alice")

    #   bundle bob:bundle1
    b1 = g.bundle("bob:bundle1")
    #     entity(ex:report1, [ prov:type="report", ex:version=1 ])
    b1.entity("ex:report1", {"prov:type": "report", "ex:version": 1})
    #     wasGeneratedBy(ex:report1, -, 2012-05-24T10:00:01)
    b1.wasGeneratedBy("ex:report1", time="2012-05-24T10:00:01")
    #   endBundle

    #   bundle alice:bundle2
    b2 = g.bundle("alice:bundle2")
    #     entity(ex:report1)
    b2.entity("ex:report1")
    #     entity(ex:report2, [ prov:type="report", ex:version=2 ])
    b2.entity("ex:report2", {"prov:type": "report", "ex:version": 2})
    #     wasGeneratedBy(ex:report2, -, 2012-05-25T11:00:01)
    b2.wasGeneratedBy("ex:report2", time="2012-05-25T11:00:01")
    #     wasDerivedFrom(ex:report2, ex:report1)
    b2.wasDerivedFrom("ex:report2", "ex:report1")
    #   endBundle

    # endDocument
    return g
Esempio n. 7
0
def addition(graph: ProvDocument, package: CommitModelPackage, action: Addition) -> ProvDocument:
    """Add model for a newly added file."""
    file, file_version = action
    author, commit = package.author, package.commit
    graph.entity(*file)
    graph.entity(*file_version)
    graph.wasGeneratedBy(file.id, commit.id)
    graph.wasGeneratedBy(file_version.id, commit.id)
    graph.wasAttributedTo(file.id, author.id)
    graph.wasAttributedTo(file_version.id, author.id)
    graph.specializationOf(file_version.id, file.id)
    return graph
Esempio n. 8
0
def create_document():
    # Create a new provenance document
    document = ProvDocument()  # d1 is now an empty provenance document
    # Before asserting provenance statements, we need to have a way to refer to the "things"
    # we want to describe provenance (e.g. articles, data sets, people). For that purpose,
    # PROV uses qualified names to identify things, which essentially a shortened representation
    # of a URI in the form of prefix:localpart. Valid qualified names require their prefixes defined,
    # which we is going to do next.

    # Declaring namespaces for various prefixes used in the example
    document.add_namespace('now', 'http://www.provbook.org/nownews/')
    document.add_namespace('nowpeople',
                           'http://www.provbook.org/nownews/people/')
    document.add_namespace('bk', 'http://www.provbook.org/ns/#')

    # Entity: now:employment-article-v1.html
    e1 = document.entity('now:employment-article-v1.html')
    e1.add_attributes({'prov:value': 'Conteudo do HTML'})
    document.agent('nowpeople:Filipe')

    # Attributing the article to the agent
    document.wasAttributedTo(
        e1, 'nowpeople:Filipe_' + str(random.randint(1000, 1070000000)))

    # add more namespace declarations
    document.add_namespace('govftp',
                           'ftp://ftp.bls.gov/pub/special.requests/oes/')
    document.add_namespace('void', 'http://vocab.deri.ie/void#')

    # 'now:employment-article-v1.html' was derived from at dataset at govftp
    document.entity('govftp:oesm11st.zip', {
        'prov:label': 'employment-stats-2011',
        'prov:type': 'void:Dataset'
    })
    document.wasDerivedFrom('now:employment-article-v1.html',
                            'govftp:oesm11st.zip')

    # Adding an activity
    document.add_namespace('is', 'http://www.provbook.org/nownews/is/#')
    document.activity('is:writeArticle')
    # Usage and Generation
    document.used('is:writeArticle', 'govftp:oesm11st.zip')
    document.wasGeneratedBy('now:employment-article-v1.html',
                            'is:writeArticle')

    #print("Document prepared.")
    # What we have so far (in PROV-N)
    logging.debug(document.serialize(indent=2))
    # d1.serialize('article-prov.json') # write to file
    return document
Esempio n. 9
0
def add_resource_creation(graph: ProvDocument, package: ResourceModelPackage) -> ProvDocument:
    """Add model for resource creation."""
    creator, creation, resource, resource_version = package.creation
    graph.activity(*creation)
    graph.entity(*resource)
    graph.entity(*resource_version)
    graph.agent(*creator)
    graph.wasAssociatedWith(creation.id, creator.id)
    graph.wasAttributedTo(resource.id, creator.id)
    graph.wasAttributedTo(resource_version.id, creator.id)
    graph.wasGeneratedBy(resource.id, creation.id)
    graph.wasGeneratedBy(resource_version.id, creation.id)
    graph.specializationOf(resource_version.id, resource.id)
    return graph
Esempio n. 10
0
def modification(graph: ProvDocument, package: CommitModelPackage, action: Modification) -> ProvDocument:
    """Add model for a modified file."""
    file, file_version, previous_versions = action
    author, commit = package.author, package.commit
    graph.entity(*file)
    graph.entity(*file_version)
    graph.wasAttributedTo(file_version.id, author.id)
    graph.wasGeneratedBy(file_version.id, commit.id)
    graph.specializationOf(file_version.id, file.id)
    for version in previous_versions:
        graph.entity(*version)
        graph.used(commit.id, version.id)
        graph.wasRevisionOf(file_version.id, version.id)
        graph.specializationOf(version.id, file.id)
    return graph
Esempio n. 11
0
    def generateProvlet(self, aDO, aRO):
        # create provlet
        d1 = ProvDocument()  # d1 is now an empty provenance document
        d1.add_namespace("dt", "http://cs.ncl.ac.uk/dtsim/")

        e1 = d1.entity(DTns + aRO.id)
        ag1 = d1.agent(DTns + str(aDO.id))
        d1.wasAttributedTo(e1, ag1)

        # update global graph
        e1 = pGlobal.entity(DTns + aRO.id)
        ag1 = pGlobal.agent(DTns + str(aDO.id))
        pGlobal.wasAttributedTo(e1, ag1)

        # 		self.notify(d1)
        return d1
Esempio n. 12
0
def add_event_chain(graph: ProvDocument, package: ResourceModelPackage) -> ProvDocument:
    """Add chain of events beginning at the creation event."""
    previous_event = previous_resource_version = None
    for chain_link in package.event_chain:
        user, event, resource, resource_version = chain_link
        graph.entity(*resource)
        graph.entity(*resource_version)
        graph.activity(*event)
        graph.agent(*user)
        graph.wasAssociatedWith(event.id, user.id)
        graph.wasAttributedTo(resource_version.id, user.id)
        graph.specializationOf(resource_version.id, resource.id)
        if previous_event is not None and previous_resource_version is not None:
            graph.entity(*previous_resource_version)
            graph.activity(*previous_event)
            graph.wasGeneratedBy(resource_version.id, event.id)
            graph.used(event.id, previous_resource_version.id)
            graph.wasDerivedFrom(resource_version.id, previous_resource_version.id)
            graph.wasInformedBy(event.id, previous_event.id)
        previous_event = event
        previous_resource_version = resource_version
    return graph
Esempio n. 13
0
def transform_to_prov(context_model):
    from prov.model import ProvDocument
    from prov.dot import prov_to_dot

    doc = ProvDocument()
    doc.add_namespace('is', 'http://www.provbook.org/nownews/is/#')
    doc.add_namespace('void', 'http://vocab.deri.ie/void#')
    doc.add_namespace('nowpeople', 'http://www.provbook.org/nownews/people/')

    input_data = doc.entity("void:Inputdata")
    backend_agent = doc.agent("nowpeople:EODC")
    user_agent = doc.agent("nowpeople:OpenEO-User")
    doc.wasAttributedTo(input_data, backend_agent)

    process_details = context_model["process_details"]
    prev_key = input_data
    for key in process_details:

        key_entity = doc.entity("void:" + key + "_output")

        key_activity = doc.activity('is:' + key)

        doc.used(key_activity, prev_key)

        doc.wasDerivedFrom(key_entity, prev_key)
        doc.wasGeneratedBy(key_entity,
                           key_activity,
                           time=process_details[key]["timing"]["end"])

        doc.wasStartedBy(key_activity,
                         user_agent,
                         time=process_details[key]["timing"]["start"])

        prev_key = key_entity

    dot = prov_to_dot(doc)
    dot.write_png('output-prov.png')

    return doc
Esempio n. 14
0
def release_tag_model(graph: ProvDocument, packages: ReleaseTagPackage):
    for package in packages:
        if package.release_package is not None:
            r_user, release, release_event, release_evidence, assets = package.release_package
            graph.agent(*r_user)
            graph.entity(*release)
            graph.activity(*release_event)
            graph.entity(*release_evidence)
            for asset in assets:
                graph.entity(*asset)
                graph.hadMember(asset.id, release.id)

            graph.hadMember(release_evidence.id, release.id)
            graph.wasGeneratedBy(release.id, release_event.id)
            graph.wasAttributedTo(release.id, r_user.id)
            graph.wasAssociatedWith(release_event.id, r_user.id)

        if package.tag_package is not None:
            t_user, tag, tag_event = package.tag_package
            graph.agent(*t_user)
            graph.entity(*tag)
            graph.activity(*tag_event)

            if package.release_package is not None:
                graph.hadMember(tag.id, release.id)
            graph.wasGeneratedBy(tag.id, tag_event.id)
            graph.wasAttributedTo(tag.id, t_user.id)
            graph.wasAssociatedWith(tag_event.id, t_user.id)

        if package.commit_package is not None:
            author, commit_event, _, commit, _ = package.commit_package
            graph.agent(*author)
            graph.activity(*commit_event)
            graph.entity(*commit)

            if package.tag_package is not None:
                graph.hadMember(commit.id, tag.id)
            graph.wasGeneratedBy(commit.id, commit_event.id)
            graph.wasAttributedTo(commit.id, author.id)
            graph.wasAssociatedWith(commit_event.id, author.id)
    return graph
Esempio n. 15
0
           other_attributes={
               "prov:type": "file",
               "path_at_addition": ""
           })
add.entity("File Version",
           other_attributes={
               "prov:type": "file_version",
               "old_path": "",
               "new_path": ""
           })
add.wasInformedBy("Commit", "Parent Commit")
add.wasAssociatedWith("Commit", "Committer")
add.wasAssociatedWith("Commit", "Author")
add.wasGeneratedBy("File", "Commit")
add.wasGeneratedBy("File Version", "Commit")
add.wasAttributedTo("File", "Author")
add.wasAttributedTo("File Version", "Author")
add.specializationOf("File Version", "File")

mod = ProvDocument()
mod.set_default_namespace("gitlab2prov:")
mod.activity(
    "Commit",
    other_attributes={
        "prov:type": "commit",
        "title": "",
        "message": "",
        "id": "",
        "short_id": "",
        "prov:startedAt": "",
        "prov:endedAt": ""
def example():

    g = ProvDocument()
    # Local namespace
    # Doesnt exist yet so we are creating it
    ap = Namespace('aip', 'https://araport.org/provenance/')
    # Dublin Core
    g.add_namespace("dcterms", "http://purl.org/dc/terms/")
    # FOAF
    g.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")

    # Add sponsors and contributors as Agents
    # ap['matthew_vaughn']
    # aip:matthew_vaughn
    # https://araport.org/provenance/:matthew_vaughn
    # Learn this from a call to profiles service? Adds a dependency on Agave so I am open to figuring out another way
    me = g.agent(ap['matthew_vaughn'], {
        'prov:type': PROV["Person"], 'foaf:givenName': "Matthew Vaughn", 'foaf:mbox': "<mailto:[email protected]>"
    })
    # Hard coded for now
    walter = g.agent(ap['walter_moreira'], {
        'prov:type': PROV["Person"], 'foaf:givenName': "Walter Moreira", 'foaf:mbox': "<mailto:[email protected]>"
    })
    utexas = g.agent(ap['university_of_texas'], {
        'prov:type': PROV["Organization"], 'foaf:givenName': "University of Texas at Austin"
    })

    # Set delegation to our host University
    # We may have trouble doing this for other users since we don't always capture their host instituion
    g.actedOnBehalfOf(walter, utexas)
    g.actedOnBehalfOf(me, utexas)

    # Include the ADAMA platform as an Agent and set attribution
    # dcterms:title and dcterms:description are hardcoded
    # dcterms:language is hard-coded
    # dcterms:source is the URI of the public git source repository for ADAMA
    # "dcterms:updated": "2015-04-17T09:44:56" - this would actually be the date ADAMA was updated
    adama_platform = g.agent(ap['adama_platform'], {'dcterms:title': "ADAMA", 'dcterms:description': "Araport Data and Microservices API", 'dcterms:language':"en-US", 'dcterms:identifier':"https://api.araport.org/community/v0.3/", 'dcterms:updated': "2015-04-17T09:44:56" })
    g.wasGeneratedBy(adama_platform, walter)

    # Include the ADAMA microservice as an Agent and set attribution+delegation
    # dcterms:title and dcterms:description are inherited from the service's metadata
    # dcterms:language is hard-coded
    # dcterms:identifier is the deployment URI for the service
    # dcterms:source is the URI of the public git source repository. The URL in this example is just a dummy
    #
    # The name for each microservice should be unique. We've decided to
    # use the combination of namespace, service name, and version
    microservice_name = 'mwvaughn/bar_annotation_v1.0.0'
    adama_microservice = g.agent(ap[microservice_name], {'dcterms:title': "BAR Annotation Service", 'dcterms:description': "Returns annotation from locus ID", 'dcterms:language':"en-US", 'dcterms:identifier':"https://api.araport.org/community/v0.3/mwvaughn/bar_annotation_v1.0.0", 'dcterms:source':"https://github.com/Arabidopsis-Information-Portal/prov-enabled-api-sample" })

    # the microservice was generated by me on date X (don't use now, use when the service was updated)
    g.wasGeneratedBy(adama_microservice, me, datetime.datetime.now())
    # The microservice used the platform now
    g.used(adama_microservice, adama_platform, datetime.datetime.now())

    # Sources
    #
    # Define BAR
    # Agents
    nick = g.agent(ap['nicholas_provart'], {
        'prov:type': PROV["Person"], 'foaf:givenName': "Nicholas Provart", 'foaf:mbox': "*****@*****.**"
    })
    utoronto = g.agent(ap['university_of_toronto'], {
        'prov:type': PROV["Organization"], 'foaf:givenName': "University of Toronto", 'dcterms:identifier':"http://www.utoronto.ca/"
    })
    g.actedOnBehalfOf(nick, utoronto)

    # Entity
    # All fields derived from Sources.yml
    # dcterms:title and dcterms:description come straight from the YAML
    # dcterms:identifier - URI pointing to the source's canonical URI representation
    # optional - dcterms:language: Recommended best practice is to use a controlled vocabulary such as RFC 4646
    # optional - dcterms:updated: date the source was published or last updated
    # optional - dcterms:license: Simple string or URI to license. Validate URI if provided?
    datasource1 = g.entity(ap['datasource1'], {'dcterms:title': "BAR Arabidopsis AGI -> Annotation", 'dcterms:description': "Most recent annotation for given AGI", 'dcterms:language':"en-US", 'dcterms:identifier':"http://bar.utoronto.ca/webservices/agiToAnnot.php", 'dcterms:updated':"2015-04-17T09:44:56", 'dcterms:license':"Creative Commons 3.0" })
    # Set up attribution to Nick
    g.wasAttributedTo(datasource1, nick)

    # Define TAIR
    # Agents
    # dcterms:language: Recommended best practice is to use a controlled vocabulary such as RFC 4646
    eva = g.agent(ap['eva_huala'], {
        'prov:type': PROV["Person"], 'foaf:givenName': "Eva Huala"
    })
    phoenix = g.agent(ap['phoenix_bioinformatics'], {
        'prov:type': PROV["Organization"], 'foaf:givenName': "Phoenix Bioinformatics"
    })
    g.actedOnBehalfOf(eva, phoenix)

    # Entity
    # All fields derived from Sources.yml
    # optional - dcterms:citation: Plain text bibliographic citation. If only provided as doi, should we try to validate it?
    datasource2 = g.entity(ap['datasource2'], {'dcterms:title': "TAIR", 'dcterms:description': "The Arabidopsis Information Resource", 'dcterms:language':"en-US", 'dcterms:identifier':"https://www.arabidopsis.org/", 'dcterms:citation':"The Arabidopsis Information Resource (TAIR): improved gene annotation and new tools. Nucleic Acids Research 2011 doi: 10.1093/nar/gkr1090"})
    g.wasAttributedTo(datasource2, eva)

    # In Sources.yml, these two sources are nested. Define that relationship here
    # There are other types of relationships but we will just use derived from for simplicity in this prototype
    g.wasDerivedFrom(ap['datasource1'], ap['datasource2'])

    # Depending on which ADAMA microservice type we are using, define an activity
    # Eventually, break these into more atomic actions in a chain
    action1 = g.activity(ap['do_query'], datetime.datetime.now())
    # action1 = g.activity(ap['do_map'], datetime.datetime.now())
    # action1 = g.activity(ap['do_generic'], datetime.datetime.now())
    # action1 = g.activity(ap['do_passthrough'], datetime.datetime.now())
    # Future... Support for ADAMA-native microservices
    # action1 = g.activity(ap['generate'], datetime.datetime.now())

    # Define current ADAMA response as an Entity
    # This is what's being returned to the user and is thus the subject of the PROV record
    # May be able to add more attributes to it but this is the minimum
    response = g.entity(ap['adama_response'])

    # Response is generated by the process_query action
    # Time-stamp it!
    g.wasGeneratedBy(response, ap['do_query'], datetime.datetime.now())
    # The process_query used the microservice
    g.used(ap['do_query'], adama_microservice, datetime.datetime.now())
    # The microservice used datasource1
    g.used(adama_microservice, datasource1, datetime.datetime.now())

    # Print prov_n
    print(g.get_provn())
    # Print prov-json
    print(g.serialize())
    # Write out as a pretty picture
    graph = prov.dot.prov_to_dot(g)
    graph.write_png('Sources.png')
Esempio n. 17
0
def primer_example():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/primer.pn
    # ===========================================================================
    # document
    g = ProvDocument()

    #    prefix ex <http://example/>
    #    prefix dcterms <http://purl.org/dc/terms/>
    #    prefix foaf <http://xmlns.com/foaf/0.1/>
    ex = Namespace(
        "ex", "http://example/"
    )  # namespaces do not need to be explicitly added to a document
    g.add_namespace("dcterms", "http://purl.org/dc/terms/")
    g.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")

    #    entity(ex:article, [dcterms:title="Crime rises in cities"])
    # first time the ex namespace was used, it is added to the document automatically
    g.entity(ex["article"], {"dcterms:title": "Crime rises in cities"})
    #    entity(ex:articleV1)
    g.entity(ex["articleV1"])
    #    entity(ex:articleV2)
    g.entity(ex["articleV2"])
    #    entity(ex:dataSet1)
    g.entity(ex["dataSet1"])
    #    entity(ex:dataSet2)
    g.entity(ex["dataSet2"])
    #    entity(ex:regionList)
    g.entity(ex["regionList"])
    #    entity(ex:composition)
    g.entity(ex["composition"])
    #    entity(ex:chart1)
    g.entity(ex["chart1"])
    #    entity(ex:chart2)
    g.entity(ex["chart2"])
    #    entity(ex:blogEntry)
    g.entity(ex["blogEntry"])

    #    activity(ex:compile)
    g.activity(
        "ex:compile")  # since ex is registered, it can be used like this
    #    activity(ex:compile2)
    g.activity("ex:compile2")
    #    activity(ex:compose)
    g.activity("ex:compose")
    #    activity(ex:correct, 2012-03-31T09:21:00, 2012-04-01T15:21:00)
    g.activity("ex:correct", "2012-03-31T09:21:00",
               "2012-04-01T15:21:00")  # date time can be provided as strings
    #    activity(ex:illustrate)
    g.activity("ex:illustrate")

    #    used(ex:compose, ex:dataSet1, -,   [ prov:role = "ex:dataToCompose"])
    g.used("ex:compose",
           "ex:dataSet1",
           other_attributes={"prov:role": "ex:dataToCompose"})
    #    used(ex:compose, ex:regionList, -, [ prov:role = "ex:regionsToAggregateBy"])
    g.used(
        "ex:compose",
        "ex:regionList",
        other_attributes={"prov:role": "ex:regionsToAggregateBy"},
    )
    #    wasGeneratedBy(ex:composition, ex:compose, -)
    g.wasGeneratedBy("ex:composition", "ex:compose")

    #    used(ex:illustrate, ex:composition, -)
    g.used("ex:illustrate", "ex:composition")
    #    wasGeneratedBy(ex:chart1, ex:illustrate, -)
    g.wasGeneratedBy("ex:chart1", "ex:illustrate")

    #    wasGeneratedBy(ex:chart1, ex:compile,  2012-03-02T10:30:00)
    g.wasGeneratedBy("ex:chart1", "ex:compile", "2012-03-02T10:30:00")
    #    wasGeneratedBy(ex:chart2, ex:compile2, 2012-04-01T15:21:00)
    #
    #
    #    agent(ex:derek, [ prov:type="prov:Person", foaf:givenName = "Derek",
    #           foaf:mbox= "<mailto:[email protected]>"])
    g.agent(
        "ex:derek",
        {
            "prov:type": PROV["Person"],
            "foaf:givenName": "Derek",
            "foaf:mbox": "<mailto:[email protected]>",
        },
    )
    #    wasAssociatedWith(ex:compose, ex:derek, -)
    g.wasAssociatedWith("ex:compose", "ex:derek")
    #    wasAssociatedWith(ex:illustrate, ex:derek, -)
    g.wasAssociatedWith("ex:illustrate", "ex:derek")
    #
    #    agent(ex:chartgen, [ prov:type="prov:Organization",
    #           foaf:name = "Chart Generators Inc"])
    g.agent(
        "ex:chartgen",
        {
            "prov:type": PROV["Organization"],
            "foaf:name": "Chart Generators Inc"
        },
    )
    #    actedOnBehalfOf(ex:derek, ex:chartgen, ex:compose)
    g.actedOnBehalfOf("ex:derek", "ex:chartgen", "ex:compose")
    #    wasAttributedTo(ex:chart1, ex:derek)
    g.wasAttributedTo("ex:chart1", "ex:derek")

    #    wasGeneratedBy(ex:dataSet2, ex:correct, -)
    g.wasGeneratedBy("ex:dataSet2", "ex:correct")
    #    used(ex:correct, ex:dataSet1, -)
    g.used("ex:correct", "ex:dataSet1")
    #    wasDerivedFrom(ex:dataSet2, ex:dataSet1, [prov:type='prov:Revision'])
    g.wasDerivedFrom("ex:dataSet2",
                     "ex:dataSet1",
                     other_attributes={"prov:type": PROV["Revision"]})
    #    wasDerivedFrom(ex:chart2, ex:dataSet2)
    g.wasDerivedFrom("ex:chart2", "ex:dataSet2")

    #    wasDerivedFrom(ex:blogEntry, ex:article, [prov:type='prov:Quotation'])
    g.wasDerivedFrom("ex:blogEntry",
                     "ex:article",
                     other_attributes={"prov:type": PROV["Quotation"]})
    #    specializationOf(ex:articleV1, ex:article)
    g.specializationOf("ex:articleV1", "ex:article")
    #    wasDerivedFrom(ex:articleV1, ex:dataSet1)
    g.wasDerivedFrom("ex:articleV1", "ex:dataSet1")

    #    specializationOf(ex:articleV2, ex:article)
    g.specializationOf("ex:articleV2", "ex:article")
    #    wasDerivedFrom(ex:articleV2, ex:dataSet2)
    g.wasDerivedFrom("ex:articleV2", "ex:dataSet2")

    #    alternateOf(ex:articleV2, ex:articleV1)
    g.alternateOf("ex:articleV2", "ex:articleV1")

    # endDocument
    return g
Esempio n. 18
0
    def useGenDependency(self, aDO, usedList, genList, throughActivity):

        aID = throughActivity.id

        # create provlet
        d1 = ProvDocument()  # d1 is now an empty provenance document
        d1.add_namespace("dt", "http://cs.ncl.ac.uk/dtsim/")

        usedEntities = []
        for aRO in usedList:
            usedEntities.append(d1.entity(DTns + aRO.id))

        genEntities = []
        for aRO1 in genList:
            genEntities.append(d1.entity(DTns + aRO1.id))

        a = d1.activity(DTns + aID)
        ag1 = d1.agent(DTns + str(aDO.id))

        d1.wasAssociatedWith(a, ag1)
        for ue in usedEntities:
            d1.used(a, ue)

        for gene in genEntities:
            d1.wasAttributedTo(gene, ag1)
            d1.wasGeneratedBy(gene, a)

        # associate this provlet to each generated RO
        for aRO1 in genList:
            aRO1.provlet = d1

        print "event {n}: DO {do}: {ro1} <- wgby <- {act} <- used {ro}".format(
            n=currentReuseCount, do=aDO.id, ro1=aRO1.id, act=aID, ro=aRO.id
        )

        for genRO in genList:
            for uRO in usedList:
                # update upstream pointer
                genRO.upstream.append(
                    (uRO, throughActivity)
                )  # dep on aRO through activity aID   FIXME URGENTLY!!!  not designed for M-M

        for uRO in usedList:
            for genRO in genList:
                # update downstream
                uRO.downstream.append((genRO, throughActivity))  # aR1 is downstream from aR1 through activity aID

        # update global graph
        globalUsedEntities = []
        for aRO in usedList:
            globalUsedEntities.append(pGlobal.entity(DTns + aRO.id))

        globalGenEntities = []
        for aR1 in genList:
            globalGenEntities.append(pGlobal.entity(DTns + aR1.id))

        a = pGlobal.activity(DTns + aID)
        ag1 = pGlobal.agent(DTns + str(aDO.id))

        pGlobal.wasAssociatedWith(a, ag1)
        for ue in globalUsedEntities:
            pGlobal.used(a, ue)

        for gene in globalGenEntities:
            pGlobal.wasAttributedTo(gene, ag1)
            pGlobal.wasGeneratedBy(gene, a)

        # trigger credit recomputation
        # each used RO needs its credit updated with aRO1.credit for each generated aRO1 through activity aID
        aCreditManager.addGenerationCredit(usedList, genList, throughActivity)

        # 		self.notify(d1)
        return d1
Esempio n. 19
0
class Provenance(object):
    def __init__(self, output_dir):
        self.output_dir = output_dir
        self.doc = None
        self.workflow = None

    def start(self, workflow=False):
        from daops import __version__ as daops_version
        from housemartin import __version__ as housemartin_version

        self.doc = ProvDocument()
        # Declaring namespaces for various prefixes
        self.doc.set_default_namespace(uri="http://purl.org/roocs/prov#")
        self.doc.add_namespace("prov", uri="http://www.w3.org/ns/prov#")
        self.doc.add_namespace(
            "provone", uri="http://purl.dataone.org/provone/2015/01/15/ontology#"
        )
        self.doc.add_namespace("dcterms", uri="http://purl.org/dc/terms/")
        # Define entities
        project_cds = self.doc.agent(
            ":copernicus_CDS",
            {
                "prov:type": "prov:Organization",
                "dcterms:title": "Copernicus Climate Data Store",
            },
        )
        self.sw_housemartin = self.doc.agent(
            ":housemartin",
            {
                "prov:type": "prov:SoftwareAgent",
                "dcterms:source": f"https://github.com/cedadev/housemartin/releases/tag/v{housemartin_version}",
            },
        )
        self.doc.wasAttributedTo(self.sw_housemartin, project_cds)
        self.sw_daops = self.doc.agent(
            ":daops",
            {
                "prov:type": "prov:SoftwareAgent",
                "dcterms:source": f"https://github.com/roocs/daops/releases/tag/v{daops_version}",
            },
        )
        # workflow
        if workflow is True:
            self.workflow = self.doc.entity(
                ":workflow", {"prov:type": "provone:Workflow"}
            )
            orchestrate = self.doc.activity(
                ":orchestrate",
                other_attributes={
                    "prov:startedAtTime": "2020-11-26T09:15:00",
                    "prov:endedAtTime": "2020-11-26T09:30:00",
                },
            )
            self.doc.wasAssociatedWith(
                orchestrate, agent=self.sw_housemartin, plan=self.workflow
            )

    def add_operator(self, operator, parameters, collection, output):
        op = self.doc.activity(
            f":{operator}",
            other_attributes={
                ":time": parameters.get("time"),
                ":apply_fixes": parameters.get("apply_fixes"),
            },
        )
        # input data
        ds_in = os.path.basename(collection[0])
        # ds_in_attrs = {
        #     'prov:type': 'provone:Data',
        #     'prov:value': f'{ds_in}',
        # }
        op_in = self.doc.entity(f":{ds_in}")
        # operator started by daops
        if self.workflow:
            self.doc.wasAssociatedWith(op, agent=self.sw_daops, plan=self.workflow)
        else:
            self.doc.start(op, starter=self.sw_daops, trigger=self.sw_housemartin)
        # Generated output file

        ds_out = os.path.basename(output[0])
        # ds_out_attrs = {
        #     'prov:type': 'provone:Data',
        #     'prov:value': f'{ds_out}',
        # }
        op_out = self.doc.entity(f":{ds_out}")
        self.doc.wasDerivedFrom(op_out, op_in, activity=op)

    def write_json(self):
        outfile = os.path.join(self.output_dir, "provenance.json")
        self.doc.serialize(outfile, format="json")
        return outfile

    def write_png(self):
        outfile = os.path.join(self.output_dir, "provenance.png")
        figure = prov_to_dot(self.doc)
        figure.write_png(outfile)
        return outfile
Esempio n. 20
0
def primer_example():
    # https://github.com/lucmoreau/ProvToolbox/blob/master/prov-n/src/test/resources/prov/primer.pn
    #===========================================================================
    # document
    g = ProvDocument()

    #    prefix ex <http://example/>
    #    prefix dcterms <http://purl.org/dc/terms/>
    #    prefix foaf <http://xmlns.com/foaf/0.1/>
    ex = Namespace('ex', 'http://example/')  # namespaces do not need to be explicitly added to a document
    g.add_namespace("dcterms", "http://purl.org/dc/terms/")
    g.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")

    #    entity(ex:article, [dcterms:title="Crime rises in cities"])
    # first time the ex namespace was used, it is added to the document automatically
    g.entity(ex['article'], {'dcterms:title': "Crime rises in cities"})
    #    entity(ex:articleV1)
    g.entity(ex['articleV1'])
    #    entity(ex:articleV2)
    g.entity(ex['articleV2'])
    #    entity(ex:dataSet1)
    g.entity(ex['dataSet1'])
    #    entity(ex:dataSet2)
    g.entity(ex['dataSet2'])
    #    entity(ex:regionList)
    g.entity(ex['regionList'])
    #    entity(ex:composition)
    g.entity(ex['composition'])
    #    entity(ex:chart1)
    g.entity(ex['chart1'])
    #    entity(ex:chart2)
    g.entity(ex['chart2'])
    #    entity(ex:blogEntry)
    g.entity(ex['blogEntry'])

    #    activity(ex:compile)
    g.activity('ex:compile')  # since ex is registered, it can be used like this
    #    activity(ex:compile2)
    g.activity('ex:compile2')
    #    activity(ex:compose)
    g.activity('ex:compose')
    #    activity(ex:correct, 2012-03-31T09:21:00, 2012-04-01T15:21:00)
    g.activity('ex:correct', '2012-03-31T09:21:00', '2012-04-01T15:21:00')  # date time can be provided as strings
    #    activity(ex:illustrate)
    g.activity('ex:illustrate')

    #    used(ex:compose, ex:dataSet1, -,   [ prov:role = "ex:dataToCompose"])
    g.used('ex:compose', 'ex:dataSet1', other_attributes={'prov:role': "ex:dataToCompose"})
    #    used(ex:compose, ex:regionList, -, [ prov:role = "ex:regionsToAggregateBy"])
    g.used('ex:compose', 'ex:regionList', other_attributes={'prov:role': "ex:regionsToAggregateBy"})
    #    wasGeneratedBy(ex:composition, ex:compose, -)
    g.wasGeneratedBy('ex:composition', 'ex:compose')

    #    used(ex:illustrate, ex:composition, -)
    g.used('ex:illustrate', 'ex:composition')
    #    wasGeneratedBy(ex:chart1, ex:illustrate, -)
    g.wasGeneratedBy('ex:chart1', 'ex:illustrate')

    #    wasGeneratedBy(ex:chart1, ex:compile,  2012-03-02T10:30:00)
    g.wasGeneratedBy('ex:chart1', 'ex:compile', '2012-03-02T10:30:00')
    #    wasGeneratedBy(ex:chart2, ex:compile2, 2012-04-01T15:21:00)
    #
    #
    #    agent(ex:derek, [ prov:type="prov:Person", foaf:givenName = "Derek",
    #           foaf:mbox= "<mailto:[email protected]>"])
    g.agent('ex:derek', {
        'prov:type': PROV["Person"], 'foaf:givenName': "Derek", 'foaf:mbox': "<mailto:[email protected]>"
    })
    #    wasAssociatedWith(ex:compose, ex:derek, -)
    g.wasAssociatedWith('ex:compose', 'ex:derek')
    #    wasAssociatedWith(ex:illustrate, ex:derek, -)
    g.wasAssociatedWith('ex:illustrate', 'ex:derek')
    #
    #    agent(ex:chartgen, [ prov:type="prov:Organization",
    #           foaf:name = "Chart Generators Inc"])
    g.agent('ex:chartgen', {'prov:type': PROV["Organization"], 'foaf:name': "Chart Generators Inc"})
    #    actedOnBehalfOf(ex:derek, ex:chartgen, ex:compose)
    g.actedOnBehalfOf('ex:derek', 'ex:chartgen', 'ex:compose')
    #    wasAttributedTo(ex:chart1, ex:derek)
    g.wasAttributedTo('ex:chart1', 'ex:derek')

    #    wasGeneratedBy(ex:dataSet2, ex:correct, -)
    g.wasGeneratedBy('ex:dataSet2', 'ex:correct')
    #    used(ex:correct, ex:dataSet1, -)
    g.used('ex:correct', 'ex:dataSet1')
    #    wasDerivedFrom(ex:dataSet2, ex:dataSet1, [prov:type='prov:Revision'])
    g.wasDerivedFrom('ex:dataSet2', 'ex:dataSet1', other_attributes={'prov:type': PROV['Revision']})
    #    wasDerivedFrom(ex:chart2, ex:dataSet2)
    g.wasDerivedFrom('ex:chart2', 'ex:dataSet2')

    #    wasDerivedFrom(ex:blogEntry, ex:article, [prov:type='prov:Quotation'])
    g.wasDerivedFrom('ex:blogEntry', 'ex:article', other_attributes={'prov:type': PROV['Quotation']})
    #    specializationOf(ex:articleV1, ex:article)
    g.specializationOf('ex:articleV1', 'ex:article')
    #    wasDerivedFrom(ex:articleV1, ex:dataSet1)
    g.wasDerivedFrom('ex:articleV1', 'ex:dataSet1')

    #    specializationOf(ex:articleV2, ex:article)
    g.specializationOf('ex:articleV2', 'ex:article')
    #    wasDerivedFrom(ex:articleV2, ex:dataSet2)
    g.wasDerivedFrom('ex:articleV2', 'ex:dataSet2')

    #    alternateOf(ex:articleV2, ex:articleV1)
    g.alternateOf('ex:articleV2', 'ex:articleV1')

    # endDocument
    return g
def example():

    g = ProvDocument()
    # Local namespace
    # Doesnt exist yet so we are creating it
    ap = Namespace('aip', 'https://araport.org/provenance/')
    # Dublin Core
    g.add_namespace("dcterms", "http://purl.org/dc/terms/")
    # FOAF
    g.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")

    # Add sponsors and contributors as Agents
    # ap['matthew_vaughn']
    # aip:matthew_vaughn
    # https://araport.org/provenance/:matthew_vaughn
    # Learn this from a call to profiles service? Adds a dependency on Agave so I am open to figuring out another way
    me = g.agent(
        ap['matthew_vaughn'], {
            'prov:type': PROV["Person"],
            'foaf:givenName': "Matthew Vaughn",
            'foaf:mbox': "<mailto:[email protected]>"
        })
    # Hard coded for now
    walter = g.agent(
        ap['walter_moreira'], {
            'prov:type': PROV["Person"],
            'foaf:givenName': "Walter Moreira",
            'foaf:mbox': "<mailto:[email protected]>"
        })
    utexas = g.agent(
        ap['university_of_texas'], {
            'prov:type': PROV["Organization"],
            'foaf:givenName': "University of Texas at Austin"
        })

    # Set delegation to our host University
    # We may have trouble doing this for other users since we don't always capture their host instituion
    g.actedOnBehalfOf(walter, utexas)
    g.actedOnBehalfOf(me, utexas)

    # Include the ADAMA platform as an Agent and set attribution
    # dcterms:title and dcterms:description are hardcoded
    # dcterms:language is hard-coded
    # dcterms:source is the URI of the public git source repository for ADAMA
    # "dcterms:updated": "2015-04-17T09:44:56" - this would actually be the date ADAMA was updated
    adama_platform = g.agent(
        ap['adama_platform'], {
            'dcterms:title': "ADAMA",
            'dcterms:description': "Araport Data and Microservices API",
            'dcterms:language': "en-US",
            'dcterms:identifier': "https://api.araport.org/community/v0.3/",
            'dcterms:updated': "2015-04-17T09:44:56"
        })
    g.wasGeneratedBy(adama_platform, walter)

    # Include the ADAMA microservice as an Agent and set attribution+delegation
    # dcterms:title and dcterms:description are inherited from the service's metadata
    # dcterms:language is hard-coded
    # dcterms:identifier is the deployment URI for the service
    # dcterms:source is the URI of the public git source repository. The URL in this example is just a dummy
    #
    # The name for each microservice should be unique. We've decided to
    # use the combination of namespace, service name, and version
    microservice_name = 'mwvaughn/bar_annotation_v1.0.0'
    adama_microservice = g.agent(
        ap[microservice_name], {
            'dcterms:title':
            "BAR Annotation Service",
            'dcterms:description':
            "Returns annotation from locus ID",
            'dcterms:language':
            "en-US",
            'dcterms:identifier':
            "https://api.araport.org/community/v0.3/mwvaughn/bar_annotation_v1.0.0",
            'dcterms:source':
            "https://github.com/Arabidopsis-Information-Portal/prov-enabled-api-sample"
        })

    # the microservice was generated by me on date X (don't use now, use when the service was updated)
    g.wasGeneratedBy(adama_microservice, me, datetime.datetime.now())
    # The microservice used the platform now
    g.used(adama_microservice, adama_platform, datetime.datetime.now())

    # Sources
    #
    # Define BAR
    # Agents
    nick = g.agent(
        ap['nicholas_provart'], {
            'prov:type': PROV["Person"],
            'foaf:givenName': "Nicholas Provart",
            'foaf:mbox': "*****@*****.**"
        })
    utoronto = g.agent(
        ap['university_of_toronto'], {
            'prov:type': PROV["Organization"],
            'foaf:givenName': "University of Toronto",
            'dcterms:identifier': "http://www.utoronto.ca/"
        })
    g.actedOnBehalfOf(nick, utoronto)

    # Entity
    # All fields derived from Sources.yml
    # dcterms:title and dcterms:description come straight from the YAML
    # dcterms:identifier - URI pointing to the source's canonical URI representation
    # optional - dcterms:language: Recommended best practice is to use a controlled vocabulary such as RFC 4646
    # optional - dcterms:updated: date the source was published or last updated
    # optional - dcterms:license: Simple string or URI to license. Validate URI if provided?
    datasource1 = g.entity(
        ap['datasource1'], {
            'dcterms:title': "BAR Arabidopsis AGI -> Annotation",
            'dcterms:description': "Most recent annotation for given AGI",
            'dcterms:language': "en-US",
            'dcterms:identifier':
            "http://bar.utoronto.ca/webservices/agiToAnnot.php",
            'dcterms:updated': "2015-04-17T09:44:56",
            'dcterms:license': "Creative Commons 3.0"
        })
    # Set up attribution to Nick
    g.wasAttributedTo(datasource1, nick)

    # Define TAIR
    # Agents
    # dcterms:language: Recommended best practice is to use a controlled vocabulary such as RFC 4646
    eva = g.agent(ap['eva_huala'], {
        'prov:type': PROV["Person"],
        'foaf:givenName': "Eva Huala"
    })
    phoenix = g.agent(
        ap['phoenix_bioinformatics'], {
            'prov:type': PROV["Organization"],
            'foaf:givenName': "Phoenix Bioinformatics"
        })
    g.actedOnBehalfOf(eva, phoenix)

    # Entity
    # All fields derived from Sources.yml
    # optional - dcterms:citation: Plain text bibliographic citation. If only provided as doi, should we try to validate it?
    datasource2 = g.entity(
        ap['datasource2'], {
            'dcterms:title':
            "TAIR",
            'dcterms:description':
            "The Arabidopsis Information Resource",
            'dcterms:language':
            "en-US",
            'dcterms:identifier':
            "https://www.arabidopsis.org/",
            'dcterms:citation':
            "The Arabidopsis Information Resource (TAIR): improved gene annotation and new tools. Nucleic Acids Research 2011 doi: 10.1093/nar/gkr1090"
        })
    g.wasAttributedTo(datasource2, eva)

    # In Sources.yml, these two sources are nested. Define that relationship here
    # There are other types of relationships but we will just use derived from for simplicity in this prototype
    g.wasDerivedFrom(ap['datasource1'], ap['datasource2'])

    # Depending on which ADAMA microservice type we are using, define an activity
    # Eventually, break these into more atomic actions in a chain
    action1 = g.activity(ap['do_query'], datetime.datetime.now())
    # action1 = g.activity(ap['do_map'], datetime.datetime.now())
    # action1 = g.activity(ap['do_generic'], datetime.datetime.now())
    # action1 = g.activity(ap['do_passthrough'], datetime.datetime.now())
    # Future... Support for ADAMA-native microservices
    # action1 = g.activity(ap['generate'], datetime.datetime.now())

    # Define current ADAMA response as an Entity
    # This is what's being returned to the user and is thus the subject of the PROV record
    # May be able to add more attributes to it but this is the minimum
    response = g.entity(ap['adama_response'])

    # Response is generated by the process_query action
    # Time-stamp it!
    g.wasGeneratedBy(response, ap['do_query'], datetime.datetime.now())
    # The process_query used the microservice
    g.used(ap['do_query'], adama_microservice, datetime.datetime.now())
    # The microservice used datasource1
    g.used(adama_microservice, datasource1, datetime.datetime.now())

    # Print prov_n
    print(g.get_provn())
    # Print prov-json
    print(g.serialize())
    # Write out as a pretty picture
    graph = prov.dot.prov_to_dot(g)
    graph.write_png('Sources.png')
Esempio n. 22
0
class LogProv():
    def __init__(self, log_dic):
        self._prov_doc = ProvDocument()
        vre_namespace = self._prov_doc.add_namespace(
            'vre', 'https://www.vre4eic.eu/log#')
        prov_namespace = self._prov_doc.add_namespace(
            'prov', 'http://www.w3.org/ns/prov#')
        if ('request_url_username' in log_dic
                and log_dic['request_url_username']):
            remote_host = self._prov_doc.agent(
                vre_namespace['ag1'], {
                    prov_namespace['type']: PROV["SoftwareAgent"],
                    vre_namespace['hasIP']: log_dic['remote_host'],
                    vre_namespace['hasUsername']:
                    log_dic['request_url_username']
                })
        else:
            remote_host = self._prov_doc.agent(
                vre_namespace['ag1'], {
                    prov_namespace['type']: PROV["SoftwareAgent"],
                    vre_namespace['hasIP']: log_dic['remote_host']
                })

        if ('request_url_hostname' in log_dic
                and log_dic['request_url_hostname']):
            request_hostname = self._prov_doc.agent(
                vre_namespace['ag2'], {
                    prov_namespace['type']: PROV["SoftwareAgent"],
                    vre_namespace['hasIP']: log_dic['remote_host']
                })

        request_entity = self._prov_doc.entity(
            vre_namespace['en1'], {
                vre_namespace['status']: log_dic['status'],
                vre_namespace['responseBytes']: log_dic['response_bytes_clf']
            })

        received_activity = self._prov_doc.activity(
            vre_namespace['ac1'],
            other_attributes={
                vre_namespace['requestURL']: log_dic['request_url'],
                vre_namespace['requestMethod']: log_dic['request_method'],
                vre_namespace['httpVersion']: log_dic['request_http_ver']
            })
        self._prov_doc.generation(remote_host,
                                  activity=received_activity,
                                  time=log_dic['time_received_tz_isoformat'])
        self._prov_doc.wasAttributedTo(request_entity, received_activity)
        self._prov_doc.wasAssociatedWith(received_activity, remote_host)

    @property
    def prov_doc(self):
        return self._prov_doc

    @prov_doc.setter
    def prov_doc(self, value):
        self._prov_doc = value

    @prov_doc.deleter
    def prov_doc(self):
        del self._prov_doc
Esempio n. 23
0
class Context(object):
    """
    Context is a singlton storing all
    of the run specific data.
    """
    def __init__(self):
        # Warning;
        # If new data is added with a site dimension the
        # clip exposure function may need to be updated
        # so the site data stays consistent.

        # --------------  These variables are saved ----
        #  If new variables are added the save functions
        # will need to be modified.

        # Latitude and longitude values of the exposure data
        # Has a site dimension
        self.exposure_lat = None
        self.exposure_long = None

        # Data with a site dimension
        # key - data name
        # value - A numpy array. First dimension is site. (0 axis)
        # Has a site dimension
        self.exposure_att = None

        # Data for aggregation across sites
        self.exposure_agg = None

        #
        # --------------  The above variables are saved ----

        # key - intensity measure
        # value - One instance of RealisedVulnerabilityCurves.  An att in this
        #         class has a site dimension.
        self.exposure_vuln_curves = None

        # A dictionary of the vulnerability sets.
        # Not associated with exposures.
        # key - vulnerability set ID
        # value - vulnerability set instance
        self.vulnerability_sets = {}

        # A dictionary with keys being vulnerability_set_ids and
        # value being the exposure attribute who's values are vulnerability
        # function ID's.
        self.vul_function_titles = {}

        # A `prov.ProvDocument` to manage provenance information, including
        # adding required namespaces
        self.prov = ProvDocument()
        self.prov.set_default_namespace("")
        self.prov.add_namespace('prov', 'http://www.w3.org/ns/prov#')
        self.prov.add_namespace('xsd', 'http://www.w3.org/2001/XMLSchema#')
        self.prov.add_namespace('foaf', 'http://xmlns.com/foaf/0.1/')
        self.prov.add_namespace('void', 'http://vocab.deri.ie/void#')
        self.prov.add_namespace('dcterms', 'http://purl.org/dc/terms/')

        commit, branch, dt = misc.get_git_commit()
        # Create the fundamental software agent that is this code:
        self.prov.agent(
            ":hazimp", {
                "prov:type": "prov:SoftwareAgent",
                "prov:Revision": commit,
                "prov:branch": branch,
                "prov:date": dt
            })
        self.prov.agent(f":{getpass.getuser()}", {"prov:type": "foaf:Person"})
        self.prov.actedOnBehalfOf(":hazimp", f":{getpass.getuser()}")
        self.provlabel = ''

    def set_prov_label(self, label, title="HazImp analysis"):
        """
        Set the qualified label for the provenance data
        """

        self.provlabel = f":{label}"
        self.prov.activity(f":{label}",
                           datetime.now().strftime(DATEFMT), None, {
                               "dcterms:title": title,
                               "prov:type": "void:Analysis"
                           })
        self.prov.wasAttributedTo(self.provlabel, ":hazimp")

    def get_site_shape(self):
        """
        Get the numpy shape of sites the context is storing.
        It is based on the shape of exposure_long.

        :return: The numpy shape of sites the context is storing.
        """
        if self.exposure_long is None:
            shape = (0)
        else:
            shape = self.exposure_long.shape
        return shape

    def clip_exposure(self, min_long, min_lat, max_long, max_lat):
        """ min_long, min_lat, max_long, max_lat
        Clip the exposure data so only the exposure values within
        the rectangle formed by  max_lat, min_lat, max_long and
        min_long are included.

        Note: This must be called before the exposure_vuln_curves
        are determined, since the curves have a site dimension.
        """
        assert self.exposure_vuln_curves is None

        bad_indexes = set()
        bad_indexes = bad_indexes.union(
            numpy.where(self.exposure_long < min_long)[0])
        bad_indexes = bad_indexes.union(
            numpy.where(self.exposure_long > max_long)[0])
        bad_indexes = bad_indexes.union(
            numpy.where(self.exposure_lat < min_lat)[0])
        bad_indexes = bad_indexes.union(
            numpy.where(self.exposure_lat > max_lat)[0])
        good_indexes = numpy.array(list(
            set(range(self.exposure_lat.size)).difference(bad_indexes)),
                                   dtype=int)

        if good_indexes.shape[0] == 0:
            self.exposure_lat = numpy.array([])
            self.exposure_long = numpy.array([])
        else:
            self.exposure_lat = self.exposure_lat[good_indexes]
            self.exposure_long = self.exposure_long[good_indexes]

        if isinstance(self.exposure_att, dict):
            for key in self.exposure_att:
                if good_indexes.shape[0] == 0:
                    exp_att = numpy.array([])
                else:
                    exp_att = self.exposure_att[key][good_indexes]
                self.exposure_att[key] = exp_att
        else:
            self.exposure_att = self.exposure_att.take(good_indexes)

    def save_exposure_atts(self, filename, use_parallel=True):
        """
        Save the exposure attributes, including latitude and longitude.
        The file type saved is based on the filename extension.
        Options
           '.npz': Save the arrays into a single file in uncompressed .npz
                   format.

        :param use_parallel: Set to True for parallel behaviour
        Which is only node 0 writing to file.
        :param filename: The file to be written.
        :return write_dict: The whole dictionary, returned for testing.
        """
        [filename, bucket_name, bucket_key] = \
            misc.create_temp_file_path_for_s3(filename)
        s1 = self.prov.entity(
            ":HazImp output file", {
                "prov:label": "Full HazImp output file",
                "prov:type": "void:Dataset",
                "prov:atLocation": os.path.basename(filename)
            })
        a1 = self.prov.activity(":SaveImpactData",
                                datetime.now().strftime(DATEFMT), None)
        self.prov.wasGeneratedBy(s1, a1)
        self.prov.wasInformedBy(a1, self.provlabel)
        write_dict = self.exposure_att.copy()
        write_dict[EX_LAT] = self.exposure_lat
        write_dict[EX_LONG] = self.exposure_long

        if use_parallel:
            assert misc.INTID in write_dict
            write_dict = parallel.gather_dict(write_dict,
                                              write_dict[misc.INTID])

        if parallel.STATE.rank == 0 or not use_parallel:
            if filename[-4:] == '.csv':
                save_csv(write_dict, filename)
            else:
                numpy.savez(filename, **write_dict)
            misc.upload_to_s3_if_applicable(filename, bucket_name, bucket_key)
            # The write_dict is returned for testing
            # When running in paralled this is a way of getting all
            # of the context info
            return write_dict

    def save_exposure_aggregation(self, filename, use_parallel=True):
        """
        Save the aggregated exposure attributes.
        The file type saved is based on the filename extension.
        Options
           '.npz': Save the arrays into a single file in uncompressed .npz
                   format.

        :param use_parallel: Set to True for parallel behaviour which
        is only node 0 writing to file.
        :param filename: The file to be written.
        :return write_dict: The whole dictionary, returned for testing.
        """
        write_dict = self.exposure_agg.copy()

        s1 = self.prov.entity(
            ":Aggregated HazImp output file", {
                "prov:label": "Aggregated HazImp output file",
                "prov:type": "void:Dataset",
                "prov:atLocation": os.path.basename(filename)
            })
        a1 = self.prov.activity(":SaveAggregatedImpactData",
                                datetime.now().strftime(DATEFMT), None)
        self.prov.wasGeneratedBy(s1, a1)
        self.prov.wasInformedBy(a1, self.prov.activity(":AggregateLoss"))

        if parallel.STATE.rank == 0 or not use_parallel:
            if filename[-4:] == '.csv':
                save_csv_agg(write_dict, filename)
            else:
                numpy.savez(filename, **write_dict)
            # The write_dict is returned for testing
            # When running in paralled this is a way of getting all
            # of the context info
            return write_dict

    def save_aggregation(self,
                         filename,
                         boundaries,
                         impactcode,
                         boundarycode,
                         categories,
                         use_parallel=True):
        """
        Save data aggregated to geospatial regions

        :param str filename: Destination filename
        :param bool use_parallel: True for parallel behaviout, which
                                  is only node 0 writing to file

        """
        LOGGER.info("Saving aggregated data")
        boundaries = misc.download_file_from_s3_if_needed(boundaries)
        [filename, bucket_name, bucket_key] = \
            misc.create_temp_file_path_for_s3(filename)
        write_dict = self.exposure_att.copy()
        dt = datetime.now().strftime(DATEFMT)
        atts = {
            "prov:type": "void:Dataset",
            "prov:atLocation": os.path.basename(boundaries),
            "prov:generatedAtTime": misc.get_file_mtime(boundaries),
            "void:boundary_code": boundarycode
        }
        bdyent = self.prov.entity(":Aggregation boundaries", atts)
        aggact = self.prov.activity(":AggregationByRegions", dt, None,
                                    {'prov:type': "Spatial aggregation"})
        aggatts = {
            "prov:type": "void:Dataset",
            "prov:atLocation": os.path.basename(filename),
            "prov:generatedAtTime": dt
        }
        aggfileent = self.prov.entity(":AggregationFile", aggatts)
        self.prov.used(aggact, bdyent)
        self.prov.wasInformedBy(aggact, self.provlabel)
        self.prov.wasGeneratedBy(aggfileent, aggact)
        if parallel.STATE.rank == 0 or not use_parallel:
            aggregate.choropleth(write_dict, boundaries, impactcode,
                                 boundarycode, filename, categories)
            misc.upload_to_s3_if_applicable(filename, bucket_name, bucket_key)
            if (bucket_name is not None and bucket_key is not None
                    and bucket_key.endswith('.shp')):
                [rootname, ext] = os.path.splitext(filename)
                base_bucket_key = bucket_key[:-len(ext)]
                misc.upload_to_s3_if_applicable(rootname + '.dbf', bucket_name,
                                                base_bucket_key + '.dbf')
                misc.upload_to_s3_if_applicable(rootname + '.shx', bucket_name,
                                                base_bucket_key + '.shx')
                misc.upload_to_s3_if_applicable(rootname + '.prj', bucket_name,
                                                base_bucket_key + '.prj')
                misc.upload_to_s3_if_applicable(rootname + '.cpg', bucket_name,
                                                base_bucket_key + '.cpg', True)
        else:
            pass

    def aggregate_loss(self, groupby=None, kwargs=None):
        """
        Aggregate data by the `groupby` attribute, using the `kwargs` to
        perform any arithmetic aggregation on fields (e.g. summation,
        mean, etc.)

        :param str groupby: A column in the `DataFrame` that corresponds to
        regions by which to aggregate data
        :param dict kwargs: A `dict` with keys of valid column names (from the
        `DataFrame`) and values being lists of aggregation functions to apply
        to the columns.

        For example::

        kwargs = {'REPLACEMENT_VALUE': ['mean', 'sum'],
                'structural_loss_ratio': ['mean', 'std']}


        See
        https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#aggregation
        for more guidance on using aggregation with `DataFrames`

        """
        LOGGER.info(f"Aggregating loss using {groupby} attribute")
        a1 = self.prov.activity(":AggregateLoss",
                                datetime.now().strftime(DATEFMT), None, {
                                    "prov:type": "Aggregation",
                                    "void:aggregator": repr(groupby)
                                })
        self.prov.wasInformedBy(a1, self.provlabel)
        self.exposure_agg = aggregate.aggregate_loss_atts(
            self.exposure_att, groupby, kwargs)

    def categorise(self, bins, labels, field_name):
        """
        Bin values into discrete intervals.

        :param list bins: Monotonically increasing array of bin edges,
                          including the rightmost edge, allowing for
                          non-uniform bin widths.
        :param labels: Specifies the labels for the returned
                       bins. Must be the same length as the resulting bins.
        :param str field_name: Name of the new column in the `exposure_att`
                                `DataFrame`
        """

        for intensity_key in self.exposure_vuln_curves:
            vc = self.exposure_vuln_curves[intensity_key]
            lct = vc.loss_category_type
        LOGGER.info(f"Categorising {lct} values into {len(labels)} categories")
        self.exposure_att[field_name] = pd.cut(self.exposure_att[lct],
                                               bins,
                                               right=False,
                                               labels=labels)

    def tabulate(self, file_name, index=None, columns=None, aggfunc=None):
        """
        Reshape data (produce a "pivot" table) based on column values. Uses
        unique values from specified `index` / `columns` to form axes of the
        resulting DataFrame, then writes to an Excel file. This function does
        not support data aggregation - multiple values will result in a
        MultiIndex in the columns.
        See
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.pivot_table.html
        for further details.

        Parameters
        ----------
        file_name : destination for the pivot table
        index : column or list of columns
            Keys to group by on the pivot table index.  If an array is passed,
            it is being used as the same manner as column values.
        columns : column, or list of the columns
            Keys to group by on the pivot table column.  If an array is passed,
            it is being used as the same manner as column values.
        aggfunc : function, list of functions, dict, default numpy.mean
            If list of functions passed, the resulting pivot table will have
            hierarchical columns whose top level are the function names
            (inferred from the function objects themselves)
            If dict is passed, the key is column to aggregate and value
            is function or list of functions.
        """
        if index not in self.exposure_att.columns:
            LOGGER.error(f"Cannot tabulate data using {index} as index")
            LOGGER.error(f"{index} is not an attribute of the exposure data")
            return

        if columns not in self.exposure_att.columns:
            LOGGER.error(
                f"Required attribute(s) {columns} not in the exposure data")
            LOGGER.error(
                "Maybe you need to run a categorise job before this one?")
            return

        dt = datetime.now().strftime(DATEFMT)
        a1 = self.prov.activity(
            ":Tabulate", dt, None, {
                "prov:type": "Tabulation",
                "void:aggregator": repr(index),
                "void:attributes": repr(columns),
                "void:aggregation": repr(aggfunc)
            })
        tblatts = {
            "prov:type": "void:Dataset",
            "prov:atLocation": os.path.basename(file_name),
            "prov:generatedAtTime": dt
        }
        tblfileent = self.prov.entity(":TabulationFile", tblatts)

        self.pivot = self.exposure_att.pivot_table(index=index,
                                                   columns=columns,
                                                   aggfunc=aggfunc,
                                                   fill_value=0)
        try:
            self.pivot.to_excel(file_name)
        except TypeError as te:
            LOGGER.error(te)
            raise
        except KeyError as ke:
            LOGGER.error(ke)
            raise
        except ValueError as ve:
            LOGGER.error(f"Unable to save tabulated data to {file_name}")
            LOGGER.error(ve)
        else:
            self.prov.wasGeneratedBy(tblfileent, a1)
            self.prov.wasInformedBy(a1, self.provlabel)