Example #1
0
def _arcp2file(base_path: Path, uri: str) -> Path:
    """Map an arcp URI onto the corresponding path below ``base_path``.

    The UUID of ``uri`` must equal the UUID of the arcp URI that
    ``find_arcp(base_path)`` reports; otherwise the assertion fails.
    """
    uri_parts = arcp.parse_arcp(uri)
    # Only URIs that belong to this research object may be resolved
    assert uri_parts.uuid == arcp.parse_arcp(find_arcp(base_path)).uuid, \
        "arcp URI must be local to the research object"

    # Drop the leading "/" so the remainder is relative to base_path
    relative = Path(uri_parts.path[1:])
    return base_path / relative
Example #2
0
    def _arcp2file(self, uri):
        """Resolve an arcp URI to a local file path below ``self.folder``.

        Fails the test if the UUID of ``uri`` differs from the UUID of the
        arcp URI returned by ``self.find_arcp()``.
        """
        parsed = arcp.parse_arcp(uri)
        # arcp URIs, ensure they are local to our RO
        # (assertEqual: the assertEquals alias was removed in Python 3.12)
        self.assertEqual(parsed.uuid, arcp.parse_arcp(self.find_arcp()).uuid)

        path = parsed.path[1:]  # Strip first /
        # Convert to local path, in case it uses \ on Windows
        lpath = provenance._convert_path(path, posixpath, os.path)
        return os.path.join(self.folder, lpath)
Example #3
0
def _arcp2file(base_path, uri):
    """Translate an arcp URI into a local file path under ``base_path``.

    The URI must carry the same UUID as the arcp URI that
    ``find_arcp(base_path)`` reports; otherwise the assertion fails.
    """
    uri_parts = arcp.parse_arcp(uri)
    # Refuse URIs that belong to a different research object
    assert (
        uri_parts.uuid == arcp.parse_arcp(find_arcp(base_path)).uuid
    ), 'arcp URI must be local to the research object'

    # Drop the leading "/" and let provenance._convert_path translate the
    # POSIX-style URI path for os.path (matters where the separator is \)
    posix_rel = uri_parts.path[1:]
    return os.path.join(
        base_path, provenance._convert_path(posix_rel, posixpath, os.path))
Example #4
0
def _arcp2file(base_path, uri):
    """Return the local file path that an arcp URI designates.

    ``uri`` is parsed with ``arcp.parse_arcp``; its UUID has to match the
    UUID found via ``find_arcp(base_path)``, and its path (minus the
    leading slash) is joined onto ``base_path``.
    """
    target = arcp.parse_arcp(uri)
    # The URI has to point into this very research object
    assert target.uuid == arcp.parse_arcp(find_arcp(base_path)).uuid, (
        'arcp URI must be local to the research object')

    # Leading "/" removed, then converted from posixpath to os.path form
    local_rel = provenance._convert_path(target.path[1:], posixpath, os.path)
    return os.path.join(base_path, local_rel)
Example #5
0
def _arcp2file(base_path: str, uri: str) -> str:
    """Convert an arcp URI to a local path string below ``base_path``.

    Asserts that the UUID of ``uri`` equals the UUID of the arcp URI
    returned by ``find_arcp(base_path)``.
    """
    uri_parts = arcp.parse_arcp(uri)
    # Only accept URIs that are part of this research object
    assert uri_parts.uuid == arcp.parse_arcp(find_arcp(base_path)).uuid, \
        "arcp URI must be local to the research object"

    # Without its leading "/", the URI path is rendered in the platform's
    # own notation via Path (relevant where the separator is \)
    native_rel = str(Path(uri_parts.path[1:]))
    return os.path.join(base_path, native_rel)
Example #6
0
    def check_ro(self):
        """Verify the research object manifest stored in ``self.folder``.

        Parses ``metadata/manifest.json`` as JSON-LD and fails the test
        unless the research object declares the expected cwlprov profile,
        every arcp aggregate exists as a local file, at least one external
        (non-arcp) aggregate is present, and the primary provenance and
        workflow files are aggregated.
        """
        manifest_file = os.path.join(self.folder, "metadata", "manifest.json")
        self.assertTrue(os.path.isfile(manifest_file), "Can't find " + manifest_file)
        arcp_root = self.find_arcp()
        base = urllib.parse.urljoin(arcp_root, "metadata/manifest.json")
        g = Graph()
        with open(manifest_file, "rb") as f:
            # Note: This will use https://w3id.org/bundle/context
            g.parse(file=f, format="json-ld", publicID=base)
        print("Parsed manifest:\n\n")
        g.serialize(sys.stdout, format="nt")

        # Take the first subject described by the manifest, if there is one
        ro = next(iter(g.subjects(ORE.isDescribedBy, URIRef(base))), None)
        self.assertTrue(ro, "Can't find RO with ore:isDescribedBy")

        profile = next(iter(g.objects(ro, DCTERMS.conformsTo)), None)
        self.assertTrue(profile, "Can't find profile with dct:conformsTo")
        # assertEqual: the assertEquals alias was removed in Python 3.12
        self.assertEqual(profile, URIRef("https://w3id.org/cwl/prov/0.3.0"),
                         "Unexpected cwlprov version " + profile)

        paths = []
        externals = []
        for aggregate in g.objects(ro, ORE.aggregates):
            print(aggregate)
            if not arcp.is_arcp_uri(aggregate):
                externals.append(aggregate)
                # Won't check external URIs existence here
                # TODO: Check they are not relative!
                continue
            # arcp URIs - assume they are local to our RO
            path = arcp.parse_arcp(aggregate).path[1:]  # Strip first /
            paths.append(path)
            # Convert to local path, in case it uses \ on Windows
            lpath = provenance._convert_path(path, posixpath, os.path)
            lfile = os.path.join(self.folder, lpath)
            self.assertTrue(os.path.isfile(lfile), "Can't find aggregated " + lfile)

        self.assertTrue(paths, "Didn't find any arcp aggregates")
        self.assertTrue(externals, "Didn't find any data URIs")

        for ext in ["provn", "xml", "json", "jsonld", "nt", "ttl"]:
            f = "metadata/provenance/primary.cwlprov.%s" % ext
            self.assertIn(f, paths, "provenance file missing " + f)

        for f in ["workflow/primary-job.json", "workflow/packed.cwl"]:
            self.assertIn(f, paths, "workflow file missing " + f)
Example #7
0
    def resolve_path(self, uri_path):
        """Resolve a bag-relative path or arcp URI to a local filesystem path.

        ``uri_path`` is either an arcp URI whose root must equal
        ``self.root_uri``, or a relative POSIX-style path. The resulting
        relative path must be listed in ``self.bag.entries``.

        Returns a ``pathlib.Path`` built from ``self.root_path`` and the
        relative path.

        Raises:
            AssertionError: if the arcp root differs from ``self.root_uri``,
                or the path is absolute or contains ``..`` segments.
            IOError: if the path is not listed in ``self.bag.entries``.
            ValueError: if the joined path is not below ``self.root_path``.
        """
        if arcp.is_arcp_uri(str(uri_path)):
            uri = arcp.parse_arcp(uri_path)
            # Ensure same base URI meaning this bagit
            assert urllib.parse.urljoin(uri_path, "/") == self.root_uri
            # Strip initial / so path is relative
            path = pathlib.PurePosixPath(uri.path[1:])
        else:
            path = pathlib.PurePosixPath(uri_path)
        assert not path.is_absolute()

        if str(path) not in self.bag.entries:
            raise IOError("Not found in bag manifest/tagmanifest: %s" %
                          uri_path)
        # relative_to() below compares paths purely lexically and never
        # collapses "..", so parent-directory segments are rejected here
        assert ".." not in path.parts, \
            "Path must not contain '..' segments: %s" % uri_path
        # resolve as OS-specific path
        absolute = pathlib.Path(self.root_path, path)
        # ensure it did not climb out (will throw ValueError if not)
        assert absolute.relative_to(self.root_path)
        return absolute
Example #8
0
def check_prov(base_path,
               nested=False,
               single_tool=False,
               directory=False,
               secondary_files=False):
    """Assert that the primary provenance graph below ``base_path`` is sound.

    Loads ``metadata/provenance/primary.cwlprov.nt`` and checks that the
    master workflow run exists and is associated with exactly one workflow
    engine. The flags select further checks:

    :param nested: also locate and parse a ``*cwlprov.nt`` file referenced
        through ``PROV.has_provenance``.
    :param single_tool: expect exactly one ``PROV.Activity`` instead of
        checking the workflow steps.
    :param directory: check ``RO.Folder`` resources and their entries.
    :param secondary_files: check ``CWLPROV.SecondaryFile`` derivations.
    :raises AssertionError: when an expectation is not met.
    """
    prov_file = os.path.join(base_path, "metadata", "provenance",
                             "primary.cwlprov.nt")
    assert os.path.isfile(prov_file), "Can't find " + prov_file
    arcp_root = find_arcp(base_path)
    # Note: We don't need to include metadata/provenance in base URI
    # as .nt always use absolute URIs
    g = Graph()
    with open(prov_file, "rb") as f:
        g.parse(file=f, format="nt", publicID=arcp_root)
    if os.environ.get("DEBUG"):
        print("Parsed %s:\n\n" % prov_file)
        g.serialize(sys.stdout, format="ttl")
    runs = set(g.subjects(RDF.type, WFPROV.WorkflowRun))

    # master workflow run URI (as urn:uuid:) should correspond to arcp uuid part
    uuid = arcp.parse_arcp(arcp_root).uuid
    master_run = URIRef(uuid.urn)
    assert master_run in runs, "Can't find run %s in %s" % (master_run, runs)
    # TODO: we should not need to parse arcp, but follow
    # the has_provenance annotations in manifest.json instead

    # run should have been started by a wf engine
    engines = set(g.subjects(RDF.type, WFPROV.WorkflowEngine))
    assert engines, "Could not find WorkflowEngine"
    assert len(engines) == 1, "Found too many WorkflowEngines: %s" % engines
    engine = engines.pop()

    assert (master_run, PROV.wasAssociatedWith,
            engine) in g, "Wf run not associated with wf engine"
    assert (engine, RDF.type,
            PROV.SoftwareAgent) in g, "Engine not declared as SoftwareAgent"

    if single_tool:
        activities = set(g.subjects(RDF.type, PROV.Activity))
        assert len(activities) == 1, "Too many activities: %s" % activities
        # single tool exec, there should be no other activities
        # than the tool run
        # (NOTE: the WorkflowEngine is also activity, but not declared explicitly)
    else:
        _check_prov_steps(g, master_run)
    if nested:
        _check_prov_nested(g, base_path)
    if directory:
        _check_prov_directories(g)
    if secondary_files:
        _check_prov_secondary_files(g)


def _check_prov_steps(g, master_run):
    """Assert every ``WFPROV.ProcessRun`` in ``g`` was started by ``master_run``."""
    # Check all process runs were started by the master workflow
    step_activities = set(g.subjects(RDF.type, WFPROV.ProcessRun))
    # Although semantically a WorkflowEngine is also a ProcessRun,
    # we don't declare that,
    # thus only the step activities should be in this set.
    assert master_run not in step_activities
    assert step_activities, "No steps executed in workflow"
    for step in step_activities:
        # Let's check it was started by the master_run. Unfortunately, unlike PROV-N
        # in PROV-O RDF we have to check through the n-ary qualifiedStart relation
        starts = set(g.objects(step, PROV.qualifiedStart))
        assert starts, "Could not find qualifiedStart of step %s" % step
        assert len(starts) == 1, "Too many qualifiedStart for step %s" % step
        start = starts.pop()
        assert (start, PROV.hadActivity, master_run) in g,\
            "Step activity not started by master activity"
        # Tip: Any nested workflow step executions should not be in this prov file,
        # but in separate file


def _check_prov_nested(g, base_path):
    """Find a nested ``*cwlprov.nt`` referenced from ``g`` and parse it."""
    # Find some cwlprov.nt the nested workflow is described in
    prov_ids = set(g.objects(predicate=PROV.has_provenance))
    # FIXME: The above is a bit naive and does not check the subject is
    # one of the steps -- OK for now as this is the only case of prov:has_provenance
    assert prov_ids, "Could not find prov:has_provenance from nested workflow"

    nt_uris = [uri for uri in prov_ids if uri.endswith("cwlprov.nt")]
    # TODO: Look up manifest conformsTo and content-type rather than assuming magic filename
    assert nt_uris, "Could not find *.cwlprov.nt"
    # Load into new graph
    nested_graph = Graph()
    nt_uri = nt_uris.pop()
    with open(_arcp2file(base_path, nt_uri), "rb") as f:
        nested_graph.parse(file=f, format="nt", publicID=nt_uri)
    # TODO: Check nested_graph statements that it's the same UUID activity
    # inside as in the outer step


def _check_prov_directories(g):
    """Assert each ``RO.Folder`` in ``g`` is typed and its entries are consistent."""
    directories = set(g.subjects(RDF.type, RO.Folder))
    assert directories

    for d in directories:
        assert (d, RDF.type, PROV.Dictionary) in g
        assert (d, RDF.type, PROV.Collection) in g
        assert (d, RDF.type, PROV.Entity) in g

        files = set()
        for entry in g.objects(d, PROV.hadDictionaryMember):
            assert (entry, RDF.type, PROV.KeyEntityPair) in g
            # We don't check what that filename is here
            assert set(g.objects(entry, PROV.pairKey))

            # RO:Folder aspect
            assert set(g.objects(entry, RO.entryName))
            assert (d, ORE.aggregates, entry) in g
            assert (entry, RDF.type, RO.FolderEntry) in g
            assert (entry, RDF.type, ORE.Proxy) in g
            assert (entry, ORE.proxyIn, d) in g

            # Which file?
            entities = set(g.objects(entry, PROV.pairEntity))
            assert entities
            f = entities.pop()
            files.add(f)
            assert (entry, ORE.proxyFor, f) in g
            assert (f, RDF.type, PROV.Entity) in g

        if not files:
            assert (d, RDF.type, PROV.EmptyCollection) in g
            assert (d, RDF.type, PROV.EmptyDictionary) in g


def _check_prov_secondary_files(g):
    """Assert each ``CWLPROV.SecondaryFile`` derivation links two named entities."""
    derivations = set(g.subjects(RDF.type, CWLPROV.SecondaryFile))
    assert derivations
    for der in derivations:
        sec = set(g.subjects(PROV.qualifiedDerivation, der)).pop()
        prim = set(g.objects(der, PROV.entity)).pop()

        # UUID specializes a hash checksum
        assert set(g.objects(sec, PROV.specializationOf))
        # extensions etc.
        sec_basename = set(g.objects(sec, CWLPROV.basename)).pop()
        sec_nameroot = set(g.objects(sec, CWLPROV.nameroot)).pop()
        sec_nameext = set(g.objects(sec, CWLPROV.nameext)).pop()
        assert str(sec_basename) == "%s%s" % (sec_nameroot, sec_nameext)
        # TODO: Check hash data file exist in RO

        # The primary entity should have the same, but different values
        assert set(g.objects(prim, PROV.specializationOf))
        prim_basename = set(g.objects(prim, CWLPROV.basename)).pop()
        prim_nameroot = set(g.objects(prim, CWLPROV.nameroot)).pop()
        prim_nameext = set(g.objects(prim, CWLPROV.nameext)).pop()
        assert str(prim_basename) == "%s%s" % (prim_nameroot, prim_nameext)
Example #9
0
def check_ro(base_path, nested=False):
    """Assert that the research object manifest below ``base_path`` is sound.

    Parses ``metadata/manifest.json`` as JSON-LD (with the bundle context
    swapped for a local copy so no network access is needed) and checks the
    declared cwlprov profile, the aggregated files and the OA annotations.

    :param base_path: folder of the research object.
    :param nested: additionally require ``PROV.has_provenance`` annotations
        that target something other than the master workflow run.
    :raises AssertionError: when an expectation is not met.
    """
    manifest_file = os.path.join(base_path, "metadata", "manifest.json")
    assert os.path.isfile(manifest_file), "Can't find " + manifest_file
    arcp_root = find_arcp(base_path)
    base = urllib.parse.urljoin(arcp_root, "metadata/manifest.json")
    g = Graph()

    # Avoid resolving JSON-LD context https://w3id.org/bundle/context
    # so this test works offline
    context = Path(get_data("tests/bundle-context.jsonld")).as_uri()
    with open(manifest_file, "r", encoding="UTF-8") as f:
        jsonld = f.read()
        # replace with file:/// URI
        jsonld = jsonld.replace("https://w3id.org/bundle/context", context)
    g.parse(data=jsonld, format="json-ld", publicID=base)
    if os.environ.get("DEBUG"):
        print("Parsed manifest:\n\n")
        g.serialize(sys.stdout, format="ttl")

    # Take the first subject described by the manifest, if there is one
    ro = next(iter(g.subjects(ORE.isDescribedBy, URIRef(base))), None)
    assert ro is not None, "Can't find RO with ore:isDescribedBy"

    profile = next(iter(g.objects(ro, DCTERMS.conformsTo)), None)
    assert profile is not None, "Can't find profile with dct:conformsTo"
    assert profile == URIRef(provenance.CWLPROV_VERSION),\
        "Unexpected cwlprov version " + profile

    paths = []
    externals = []
    for aggregate in g.objects(ro, ORE.aggregates):
        if not arcp.is_arcp_uri(aggregate):
            externals.append(aggregate)
            # Won't check external URIs existence here
            # TODO: Check they are not relative!
            continue
        lfile = _arcp2file(base_path, aggregate)
        # Record with "/" separators: the expected names below are written
        # POSIX-style, while relpath() uses the platform separator
        paths.append(os.path.relpath(lfile, base_path).replace(os.sep, "/"))
        assert os.path.isfile(lfile), "Can't find aggregated " + lfile

    assert paths, "Didn't find any arcp aggregates"
    assert externals, "Didn't find any data URIs"

    for ext in ["provn", "xml", "json", "jsonld", "nt", "ttl"]:
        f = "metadata/provenance/primary.cwlprov.%s" % ext
        assert f in paths, "provenance file missing " + f

    for f in [
            "workflow/primary-job.json", "workflow/packed.cwl",
            "workflow/primary-output.json"
    ]:
        assert f in paths, "workflow file missing " + f
    # Can't test snapshot/ files directly as their name varies

    # TODO: check urn:hash::sha1 thingies

    # OA annotations
    packed = urllib.parse.urljoin(arcp_root, "/workflow/packed.cwl")
    primary_job = urllib.parse.urljoin(arcp_root, "/workflow/primary-job.json")
    primary_prov_nt = urllib.parse.urljoin(
        arcp_root, "/metadata/provenance/primary.cwlprov.nt")
    uuid = arcp.parse_arcp(arcp_root).uuid

    highlights = set(g.subjects(OA.motivatedBy, OA.highlighting))
    assert highlights, "Didn't find highlights"
    for h in highlights:
        assert (h, OA.hasTarget, URIRef(packed)) in g

    describes = set(g.subjects(OA.motivatedBy, OA.describing))
    for d in describes:
        assert (d, OA.hasBody, URIRef(arcp_root)) in g
        assert (d, OA.hasTarget, URIRef(uuid.urn)) in g

    linked = set(g.subjects(OA.motivatedBy, OA.linking))
    for link in linked:
        assert (link, OA.hasBody, URIRef(packed)) in g
        assert (link, OA.hasBody, URIRef(primary_job)) in g
        assert (link, OA.hasTarget, URIRef(uuid.urn)) in g

    # Media types every has_provenance annotation is expected to list
    expected = set(
        Literal(media_type)
        for media_type in ("application/json", "application/ld+json",
                           "application/n-triples",
                           'text/provenance-notation; charset="UTF-8"',
                           'text/turtle; charset="UTF-8"', "application/xml"))
    has_provenance = set(g.subjects(OA.hasBody, URIRef(primary_prov_nt)))
    for p in has_provenance:
        assert (p, OA.hasTarget, URIRef(uuid.urn)) in g
        assert (p, OA.motivatedBy, PROV.has_provenance) in g
        # Check all prov elements are listed
        formats = set()
        for prov in g.objects(p, OA.hasBody):
            assert (prov, DCTERMS.conformsTo,
                    URIRef(provenance.CWLPROV_VERSION)) in g
            # NOTE: DC.format is a Namespace method and does not resolve like other terms
            formats.update(set(g.objects(prov, DC["format"])))
        assert formats, "Could not find media types"
        assert formats == expected, "Did not match expected PROV media types"

    if nested:
        # Check for additional PROVs
        # Let's try to find the other wf run ID
        other_runs = set()
        for p in g.subjects(OA.motivatedBy, PROV.has_provenance):
            if (p, OA.hasTarget, URIRef(uuid.urn)) in g:
                continue
            other_runs.update(set(g.objects(p, OA.hasTarget)))
        assert other_runs, "Could not find nested workflow run prov annotations"
Example #10
0
def check_prov(base_path, nested=False, single_tool=False, directory=False,
               secondary_files=False):
    """Check the primary provenance N-Triples file of a research object.

    Parses ``metadata/provenance/primary.cwlprov.nt`` below ``base_path``
    and asserts that the master workflow run is present and associated with
    a single workflow engine. Each flag switches on an extra group of
    assertions:

    :param nested: a ``*cwlprov.nt`` file referenced via
        ``PROV.has_provenance`` must exist and parse.
    :param single_tool: exactly one ``PROV.Activity`` is expected; the
        per-step checks are skipped.
    :param directory: ``RO.Folder`` resources and their entries are checked.
    :param secondary_files: ``CWLPROV.SecondaryFile`` derivations are checked.
    :raises AssertionError: when an expectation is not met.
    """
    prov_file = os.path.join(base_path, "metadata", "provenance", "primary.cwlprov.nt")
    assert os.path.isfile(prov_file), "Can't find " + prov_file
    arcp_root = find_arcp(base_path)
    # Note: We don't need to include metadata/provenance in base URI
    # as .nt always use absolute URIs
    g = Graph()
    with open(prov_file, "rb") as f:
        g.parse(file=f, format="nt", publicID=arcp_root)
    if os.environ.get("DEBUG"):
        print("Parsed %s:\n\n" % prov_file)
        g.serialize(sys.stdout, format="ttl")
    runs = set(g.subjects(RDF.type, WFPROV.WorkflowRun))

    # master workflow run URI (as urn:uuid:) should correspond to arcp uuid part
    uuid = arcp.parse_arcp(arcp_root).uuid
    master_run = URIRef(uuid.urn)
    assert master_run in runs, "Can't find run %s in %s" % (master_run, runs)
    # TODO: we should not need to parse arcp, but follow
    # the has_provenance annotations in manifest.json instead

    # run should have been started by a wf engine

    engines = set(g.subjects(RDF.type, WFPROV.WorkflowEngine))
    assert engines, "Could not find WorkflowEngine"
    assert len(engines) == 1, "Found too many WorkflowEngines: %s" % engines
    engine = engines.pop()

    assert (master_run, PROV.wasAssociatedWith, engine) in g, "Wf run not associated with wf engine"
    assert (engine, RDF.type, PROV.SoftwareAgent) in g, "Engine not declared as SoftwareAgent"

    if single_tool:
        activities = set(g.subjects(RDF.type, PROV.Activity))
        assert len(activities) == 1, "Too many activities: %s" % activities
        # single tool exec, there should be no other activities
        # than the tool run
        # (NOTE: the WorkflowEngine is also activity, but not declared explicitly)
    else:
        # Check all process runs were started by the master workflow
        stepActivities = set(g.subjects(RDF.type, WFPROV.ProcessRun))
        # Although semantically a WorkflowEngine is also a ProcessRun,
        # we don't declare that,
        # thus only the step activities should be in this set.
        assert master_run not in stepActivities, \
            "Master run %s is declared as a ProcessRun" % master_run
        assert stepActivities, "No steps executed in workflow"
        for step in stepActivities:
            # Let's check it was started by the master_run. Unfortunately, unlike PROV-N
            # in PROV-O RDF we have to check through the n-ary qualifiedStart relation
            starts = set(g.objects(step, PROV.qualifiedStart))
            assert starts, "Could not find qualifiedStart of step %s" % step
            assert len(starts) == 1, "Too many qualifiedStart for step %s" % step
            start = starts.pop()
            assert (start, PROV.hadActivity, master_run) in g,\
                "Step activity not started by master activity"
            # Tip: Any nested workflow step executions should not be in this prov file,
            # but in separate file
    if nested:
        # Find some cwlprov.nt the nested workflow is described in
        prov_ids = set(g.objects(predicate=PROV.has_provenance))
        # FIXME: The above is a bit naive and does not check the subject is
        # one of the steps -- OK for now as this is the only case of prov:has_provenance
        assert prov_ids, "Could not find prov:has_provenance from nested workflow"

        nt_uris = [uri for uri in prov_ids if uri.endswith("cwlprov.nt")]
        # TODO: Look up manifest conformsTo and content-type rather than assuming magic filename
        assert nt_uris, "Could not find *.cwlprov.nt"
        # Load into new graph
        g2 = Graph()
        nt_uri = nt_uris.pop()
        with open(_arcp2file(base_path, nt_uri), "rb") as f:
            g2.parse(file=f, format="nt", publicID=nt_uri)
        # TODO: Check g2 statements that it's the same UUID activity inside
        # as in the outer step
    if directory:
        directories = set(g.subjects(RDF.type, RO.Folder))
        assert directories, "Could not find any RO.Folder"

        for d in directories:
            assert (d, RDF.type, PROV.Dictionary) in g, \
                "Folder %s not typed PROV.Dictionary" % d
            assert (d, RDF.type, PROV.Collection) in g, \
                "Folder %s not typed PROV.Collection" % d
            assert (d, RDF.type, PROV.Entity) in g, \
                "Folder %s not typed PROV.Entity" % d

            files = set()
            for entry in g.objects(d, PROV.hadDictionaryMember):
                assert (entry, RDF.type, PROV.KeyEntityPair) in g, \
                    "Entry %s not typed PROV.KeyEntityPair" % entry
                # We don't check what that filename is here
                assert set(g.objects(entry, PROV.pairKey)), \
                    "Entry %s has no PROV.pairKey" % entry

                # RO:Folder aspect
                assert set(g.objects(entry, RO.entryName)), \
                    "Entry %s has no RO.entryName" % entry
                assert (d, ORE.aggregates, entry) in g, \
                    "Folder %s does not aggregate entry %s" % (d, entry)
                assert (entry, RDF.type, RO.FolderEntry) in g, \
                    "Entry %s not typed RO.FolderEntry" % entry
                assert (entry, RDF.type, ORE.Proxy) in g, \
                    "Entry %s not typed ORE.Proxy" % entry
                assert (entry, ORE.proxyIn, d) in g, \
                    "Entry %s is not ORE.proxyIn folder %s" % (entry, d)

                # Which file?
                entities = set(g.objects(entry, PROV.pairEntity))
                assert entities, "Entry %s has no PROV.pairEntity" % entry
                f = entities.pop()
                files.add(f)
                assert (entry, ORE.proxyFor, f) in g, \
                    "Entry %s is not ORE.proxyFor %s" % (entry, f)
                assert (f, RDF.type, PROV.Entity) in g, \
                    "%s not typed PROV.Entity" % f

            if not files:
                assert (d, RDF.type, PROV.EmptyCollection) in g, \
                    "Empty folder %s not typed PROV.EmptyCollection" % d
                assert (d, RDF.type, PROV.EmptyDictionary) in g, \
                    "Empty folder %s not typed PROV.EmptyDictionary" % d
    if secondary_files:
        derivations = set(g.subjects(RDF.type, CWLPROV.SecondaryFile))
        assert derivations, "Could not find any CWLPROV.SecondaryFile"
        for der in derivations:
            sec = set(g.subjects(PROV.qualifiedDerivation, der)).pop()
            prim = set(g.objects(der, PROV.entity)).pop()

            # UUID specializes a hash checksum
            assert set(g.objects(sec, PROV.specializationOf)), \
                "%s has no PROV.specializationOf" % sec
            # extensions etc.
            sec_basename = set(g.objects(sec, CWLPROV.basename)).pop()
            sec_nameroot = set(g.objects(sec, CWLPROV.nameroot)).pop()
            sec_nameext = set(g.objects(sec, CWLPROV.nameext)).pop()
            assert str(sec_basename) == "%s%s" % (sec_nameroot, sec_nameext), \
                "basename of %s is not nameroot + nameext" % sec
            # TODO: Check hash data file exist in RO

            # The primary entity should have the same, but different values
            assert set(g.objects(prim, PROV.specializationOf)), \
                "%s has no PROV.specializationOf" % prim
            prim_basename = set(g.objects(prim, CWLPROV.basename)).pop()
            prim_nameroot = set(g.objects(prim, CWLPROV.nameroot)).pop()
            prim_nameext = set(g.objects(prim, CWLPROV.nameext)).pop()
            assert str(prim_basename) == "%s%s" % (prim_nameroot, prim_nameext), \
                "basename of %s is not nameroot + nameext" % prim
Example #11
0
def check_ro(base_path, nested=False):
    """Check the manifest of the research object stored at ``base_path``.

    ``metadata/manifest.json`` is read, its reference to the bundle JSON-LD
    context is replaced by a local file URI (so the check works offline),
    and the result is parsed into a graph. The function then asserts the
    cwlprov profile, the presence of the aggregated files on disk and the
    OA annotations.

    :param base_path: folder of the research object.
    :param nested: when true, annotations motivated by
        ``PROV.has_provenance`` must also exist for a target other than the
        master workflow run.
    :raises AssertionError: when an expectation is not met.
    """
    manifest_file = os.path.join(base_path, "metadata", "manifest.json")
    assert os.path.isfile(manifest_file), "Can't find " + manifest_file
    arcp_root = find_arcp(base_path)
    base = urllib.parse.urljoin(arcp_root, "metadata/manifest.json")
    g = Graph()

    # Avoid resolving JSON-LD context https://w3id.org/bundle/context
    # so this test works offline
    context = Path(get_data("tests/bundle-context.jsonld")).as_uri()
    with open(manifest_file, "r", encoding="UTF-8") as f:
        jsonld = f.read()
        # replace with file:/// URI
        jsonld = jsonld.replace("https://w3id.org/bundle/context", context)
    g.parse(data=jsonld, format="json-ld", publicID=base)
    if os.environ.get("DEBUG"):
        print("Parsed manifest:\n\n")
        g.serialize(sys.stdout, format="ttl")
    ro = None

    for ro in g.subjects(ORE.isDescribedBy, URIRef(base)):
        break
    assert ro is not None, "Can't find RO with ore:isDescribedBy"

    profile = None
    for dc in g.objects(ro, DCTERMS.conformsTo):
        profile = dc
        break
    assert profile is not None, "Can't find profile with dct:conformsTo"
    assert profile == URIRef(provenance.CWLPROV_VERSION),\
        "Unexpected cwlprov version " + profile

    paths = []
    externals = []
    for aggregate in g.objects(ro, ORE.aggregates):
        if not arcp.is_arcp_uri(aggregate):
            externals.append(aggregate)
            # Won't check external URIs existence here
            # TODO: Check they are not relative!
            continue
        lfile = _arcp2file(base_path, aggregate)
        # relpath() yields the platform separator, but the expected names
        # checked below use "/", so normalise before recording
        relative = os.path.relpath(lfile, base_path)
        paths.append(relative.replace(os.sep, "/"))
        assert os.path.isfile(lfile), "Can't find aggregated " + lfile

    assert paths, "Didn't find any arcp aggregates"
    assert externals, "Didn't find any data URIs"

    for ext in ["provn", "xml", "json", "jsonld", "nt", "ttl"]:
        f = "metadata/provenance/primary.cwlprov.%s" % ext
        assert f in paths, "provenance file missing " + f

    for f in ["workflow/primary-job.json", "workflow/packed.cwl", "workflow/primary-output.json"]:
        assert f in paths, "workflow file missing " + f
    # Can't test snapshot/ files directly as their name varies

    # TODO: check urn:hash::sha1 thingies

    # OA annotations
    packed = urllib.parse.urljoin(arcp_root, "/workflow/packed.cwl")
    primary_job = urllib.parse.urljoin(arcp_root, "/workflow/primary-job.json")
    primary_prov_nt = urllib.parse.urljoin(arcp_root, "/metadata/provenance/primary.cwlprov.nt")
    uuid = arcp.parse_arcp(arcp_root).uuid

    highlights = set(g.subjects(OA.motivatedBy, OA.highlighting))
    assert highlights, "Didn't find highlights"
    for h in highlights:
        assert (h, OA.hasTarget, URIRef(packed)) in g

    describes = set(g.subjects(OA.motivatedBy, OA.describing))
    for d in describes:
        assert (d, OA.hasBody, URIRef(arcp_root)) in g
        assert (d, OA.hasTarget, URIRef(uuid.urn)) in g

    linked = set(g.subjects(OA.motivatedBy, OA.linking))
    for link in linked:
        assert (link, OA.hasBody, URIRef(packed)) in g
        assert (link, OA.hasBody, URIRef(primary_job)) in g
        assert (link, OA.hasTarget, URIRef(uuid.urn)) in g

    has_provenance = set(g.subjects(OA.hasBody, URIRef(primary_prov_nt)))
    for p in has_provenance:
        assert (p, OA.hasTarget, URIRef(uuid.urn)) in g
        assert (p, OA.motivatedBy, PROV.has_provenance) in g
        # Check all prov elements are listed
        formats = set()
        for prov in g.objects(p, OA.hasBody):
            assert (prov, DCTERMS.conformsTo, URIRef(provenance.CWLPROV_VERSION)) in g
            # NOTE: DC.format is a Namespace method and does not resolve like other terms
            formats.update(set(g.objects(prov, DC["format"])))
        assert formats, "Could not find media types"
        expected = set(Literal(f) for f in (
            "application/json",
            "application/ld+json",
            "application/n-triples",
            'text/provenance-notation; charset="UTF-8"',
            'text/turtle; charset="UTF-8"',
            "application/xml"
        ))
        assert formats == expected, "Did not match expected PROV media types"

    if nested:
        # Check for additional PROVs
        # Let's try to find the other wf run ID
        otherRuns = set()
        for p in g.subjects(OA.motivatedBy, PROV.has_provenance):
            if (p, OA.hasTarget, URIRef(uuid.urn)) in g:
                continue
            otherRuns.update(set(g.objects(p, OA.hasTarget)))
        assert otherRuns, "Could not find nested workflow run prov annotations"
Example #12
0
    def check_ro(self, nested=False):
        """Verify the research object manifest stored in ``self.folder``.

        Parses ``metadata/manifest.json`` as JSON-LD and fails the test
        unless the cwlprov profile, the aggregated files on disk and the OA
        annotations match what is expected.

        :param nested: additionally require ``PROV.has_provenance``
            annotations whose target is not the master workflow run.
        """
        manifest_file = os.path.join(self.folder, "metadata", "manifest.json")
        self.assertTrue(os.path.isfile(manifest_file),
                        "Can't find " + manifest_file)
        arcp_root = self.find_arcp()
        base = urllib.parse.urljoin(arcp_root, "metadata/manifest.json")
        g = Graph()
        with open(manifest_file, "rb") as f:
            # Note: This will use https://w3id.org/bundle/context
            g.parse(file=f, format="json-ld", publicID=base)
        if os.environ.get("DEBUG"):
            print("Parsed manifest:\n\n")
            g.serialize(sys.stdout, format="nt")

        # Take the first subject described by the manifest, if there is one
        ro = next(iter(g.subjects(ORE.isDescribedBy, URIRef(base))), None)
        self.assertTrue(ro, "Can't find RO with ore:isDescribedBy")

        profile = next(iter(g.objects(ro, DCTERMS.conformsTo)), None)
        self.assertTrue(profile, "Can't find profile with dct:conformsTo")
        # assertEqual: the assertEquals alias was removed in Python 3.12
        self.assertEqual(profile, URIRef(provenance.CWLPROV_VERSION),
                         "Unexpected cwlprov version " + profile)

        paths = []
        externals = []
        for aggregate in g.objects(ro, ORE.aggregates):
            if not arcp.is_arcp_uri(aggregate):
                externals.append(aggregate)
                # Won't check external URIs existence here
                # TODO: Check they are not relative!
                continue
            lfile = self._arcp2file(aggregate)
            # Record with "/" separators: the expected names below are
            # written POSIX-style, while relpath() uses the platform separator
            paths.append(
                os.path.relpath(lfile, self.folder).replace(os.sep, "/"))
            self.assertTrue(os.path.isfile(lfile),
                            "Can't find aggregated " + lfile)

        self.assertTrue(paths, "Didn't find any arcp aggregates")
        self.assertTrue(externals, "Didn't find any data URIs")

        for ext in ["provn", "xml", "json", "jsonld", "nt", "ttl"]:
            f = "metadata/provenance/primary.cwlprov.%s" % ext
            self.assertIn(f, paths, "provenance file missing " + f)

        for f in ["workflow/primary-job.json", "workflow/packed.cwl"]:
            self.assertIn(f, paths, "workflow file missing " + f)
        # Can't test snapshot/ files directly as their name varies

        # TODO: check urn:hash::sha1 thingies

        # OA annotations
        packed = urllib.parse.urljoin(arcp_root, "/workflow/packed.cwl")
        primary_job = urllib.parse.urljoin(arcp_root,
                                           "/workflow/primary-job.json")
        primary_prov_nt = urllib.parse.urljoin(
            arcp_root, "/metadata/provenance/primary.cwlprov.nt")
        uuid = arcp.parse_arcp(arcp_root).uuid

        highlights = set(g.subjects(OA.motivatedBy, OA.highlighting))
        self.assertTrue(highlights, "Didn't find highlights")
        for h in highlights:
            self.assertIn((h, OA.hasTarget, URIRef(packed)), g)

        describes = set(g.subjects(OA.motivatedBy, OA.describing))
        for d in describes:
            self.assertIn((d, OA.hasBody, URIRef(arcp_root)), g)
            self.assertIn((d, OA.hasTarget, URIRef(uuid.urn)), g)

        linked = set(g.subjects(OA.motivatedBy, OA.linking))
        for link in linked:
            self.assertIn((link, OA.hasBody, URIRef(packed)), g)
            self.assertIn((link, OA.hasBody, URIRef(primary_job)), g)
            self.assertIn((link, OA.hasTarget, URIRef(uuid.urn)), g)

        has_provenance = set(g.subjects(OA.hasBody, URIRef(primary_prov_nt)))
        for p in has_provenance:
            self.assertIn((p, OA.hasTarget, URIRef(uuid.urn)), g)
            self.assertIn((p, OA.motivatedBy, PROV.has_provenance), g)
            # Check all prov elements are listed
            formats = set()
            for prov in g.objects(p, OA.hasBody):
                self.assertIn((prov, DCTERMS.conformsTo,
                               URIRef(provenance.CWLPROV_VERSION)), g)
                # NOTE: DC.format is a Namespace method and does not resolve like other terms
                formats.update(set(g.objects(prov, DC["format"])))
            self.assertTrue(formats, "Could not find media types")
            expected = set(
                Literal(f)
                for f in ("application/json", "application/ld+json",
                          "application/n-triples",
                          'text/provenance-notation; charset="UTF-8"',
                          'text/turtle; charset="UTF-8"', "application/xml"))
            self.assertEqual(formats, expected,
                             "Did not match expected PROV media types")

        if nested:
            # Check for additional PROVs
            # Let's try to find the other wf run ID
            otherRuns = set()
            for p in g.subjects(OA.motivatedBy, PROV.has_provenance):
                if (p, OA.hasTarget, URIRef(uuid.urn)) in g:
                    continue
                otherRuns.update(set(g.objects(p, OA.hasTarget)))
            self.assertTrue(
                otherRuns,
                "Could not find nested workflow run prov annotations")