def _arcp2file(base_path: Path, uri: str) -> Path:
    parsed = arcp.parse_arcp(uri)
    # arcp URIs, ensure they are local to our RO
    assert (
        parsed.uuid == arcp.parse_arcp(find_arcp(base_path)).uuid
    ), "arcp URI must be local to the research object"
    path = parsed.path[1:]  # Strip first /
    return base_path / Path(path)
def _arcp2file(self, uri):
    parsed = arcp.parse_arcp(uri)
    # arcp URIs, ensure they are local to our RO
    self.assertEquals(parsed.uuid, arcp.parse_arcp(self.find_arcp()).uuid)
    path = parsed.path[1:]  # Strip first /
    # Convert to local path, in case it uses \ on Windows
    lpath = provenance._convert_path(path, posixpath, os.path)
    return os.path.join(self.folder, lpath)
def _arcp2file(base_path, uri):
    parsed = arcp.parse_arcp(uri)
    # arcp URIs, ensure they are local to our RO
    assert parsed.uuid == arcp.parse_arcp(find_arcp(base_path)).uuid, \
        "arcp URI must be local to the research object"
    path = parsed.path[1:]  # Strip first /
    # Convert to local path, in case it uses \ on Windows
    lpath = provenance._convert_path(path, posixpath, os.path)
    return os.path.join(base_path, lpath)
def _arcp2file(base_path: str, uri: str) -> str:
    parsed = arcp.parse_arcp(uri)
    # arcp URIs, ensure they are local to our RO
    assert (
        parsed.uuid == arcp.parse_arcp(find_arcp(base_path)).uuid
    ), "arcp URI must be local to the research object"
    path = parsed.path[1:]  # Strip first /
    # Convert to local path, in case it uses \ on Windows
    lpath = str(Path(path))
    return os.path.join(base_path, lpath)
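# For orientation, a minimal standalone sketch of the arcp-URI -> local-file
# mapping that the _arcp2file() variants above perform. The UUID and the target
# directory below are made up for the example; only arcp.parse_arcp() and
# arcp.is_arcp_uri(), already used above, are assumed.
def _example_arcp_to_file():
    import uuid as uuid_module
    from pathlib import Path

    import arcp

    ro_uuid = uuid_module.uuid4()
    base_path = Path("/tmp/my-research-object")  # hypothetical RO directory
    member = "arcp://uuid,%s/workflow/packed.cwl" % ro_uuid
    assert arcp.is_arcp_uri(member)
    parsed = arcp.parse_arcp(member)
    assert parsed.uuid == ro_uuid  # the URI is local to this RO
    # Strip the leading "/" exactly as _arcp2file() does
    return base_path / parsed.path.lstrip("/")  # .../workflow/packed.cwl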
def check_ro(self):
    manifest_file = os.path.join(self.folder, "metadata", "manifest.json")
    self.assertTrue(os.path.isfile(manifest_file), "Can't find " + manifest_file)
    arcp_root = self.find_arcp()
    base = urllib.parse.urljoin(arcp_root, "metadata/manifest.json")
    g = Graph()
    with open(manifest_file, "rb") as f:
        # Note: This will use https://w3id.org/bundle/context
        g.parse(file=f, format="json-ld", publicID=base)
    print("Parsed manifest:\n\n")
    g.serialize(sys.stdout, format="nt")
    ro = None
    for ro in g.subjects(ORE.isDescribedBy, URIRef(base)):
        break
    self.assertTrue(ro, "Can't find RO with ore:isDescribedBy")
    profile = None
    for dc in g.objects(ro, DCTERMS.conformsTo):
        profile = dc
        break
    self.assertTrue(profile, "Can't find profile with dct:conformsTo")
    self.assertEquals(profile, URIRef("https://w3id.org/cwl/prov/0.3.0"),
                      "Unexpected cwlprov version " + profile)
    paths = []
    externals = []
    for aggregate in g.objects(ro, ORE.aggregates):
        print(aggregate)
        if not arcp.is_arcp_uri(aggregate):
            externals.append(aggregate)
            # Won't check external URIs existence here
            # TODO: Check they are not relative!
            continue
        # arcp URIs - assume they are local to our RO
        path = arcp.parse_arcp(aggregate).path[1:]  # Strip first /
        paths.append(path)
        # Convert to local path, in case it uses \ on Windows
        lpath = provenance._convert_path(path, posixpath, os.path)
        lfile = os.path.join(self.folder, lpath)
        self.assertTrue(os.path.isfile(lfile), "Can't find aggregated " + lfile)
    self.assertTrue(paths, "Didn't find any arcp aggregates")
    self.assertTrue(externals, "Didn't find any data URIs")
    for ext in ["provn", "xml", "json", "jsonld", "nt", "ttl"]:
        f = "metadata/provenance/primary.cwlprov.%s" % ext
        self.assertTrue(f in paths, "provenance file missing " + f)
    for f in ["workflow/primary-job.json", "workflow/packed.cwl"]:
        self.assertTrue(f in paths, "workflow file missing " + f)
def resolve_path(self, uri_path):
    if arcp.is_arcp_uri(str(uri_path)):
        uri = arcp.parse_arcp(uri_path)
        # Ensure same base URI, i.e. the same bagit
        assert urllib.parse.urljoin(uri_path, "/") == self.root_uri
        # Strip initial / so path is relative
        path = pathlib.PurePosixPath(uri.path[1:])
    else:
        path = pathlib.PurePosixPath(uri_path)
        assert not path.is_absolute()
    if str(path) not in self.bag.entries:
        raise IOError("Not found in bag manifest/tagmanifest: %s" % uri_path)
    # resolve as OS-specific path
    absolute = pathlib.Path(self.root_path, path)
    # ensure it did not climb out (will raise ValueError if it did)
    assert absolute.relative_to(self.root_path)
    return absolute
def check_prov(base_path, nested=False, single_tool=False, directory=False,
               secondary_files=False):
    prov_file = os.path.join(base_path, "metadata", "provenance",
                             "primary.cwlprov.nt")
    assert os.path.isfile(prov_file), "Can't find " + prov_file
    arcp_root = find_arcp(base_path)
    # Note: We don't need to include metadata/provenance in base URI
    # as .nt always use absolute URIs
    g = Graph()
    with open(prov_file, "rb") as f:
        g.parse(file=f, format="nt", publicID=arcp_root)
    if os.environ.get("DEBUG"):
        print("Parsed %s:\n\n" % prov_file)
        g.serialize(sys.stdout, format="ttl")
    runs = set(g.subjects(RDF.type, WFPROV.WorkflowRun))
    # master workflow run URI (as urn:uuid:) should correspond to arcp uuid part
    uuid = arcp.parse_arcp(arcp_root).uuid
    master_run = URIRef(uuid.urn)
    assert master_run in runs, "Can't find run %s in %s" % (master_run, runs)
    # TODO: we should not need to parse arcp, but follow
    # the has_provenance annotations in manifest.json instead

    # run should have been started by a wf engine
    engines = set(g.subjects(RDF.type, WFPROV.WorkflowEngine))
    assert engines, "Could not find WorkflowEngine"
    assert len(engines) == 1, "Found too many WorkflowEngines: %s" % engines
    engine = engines.pop()
    assert (master_run, PROV.wasAssociatedWith, engine) in g, \
        "Wf run not associated with wf engine"
    assert (engine, RDF.type, PROV.SoftwareAgent) in g, \
        "Engine not declared as SoftwareAgent"

    if single_tool:
        activities = set(g.subjects(RDF.type, PROV.Activity))
        assert len(activities) == 1, "Too many activities: %s" % activities
        # single tool exec, there should be no other activities
        # than the tool run
        # (NOTE: the WorkflowEngine is also activity, but not declared explicitly)
    else:
        # Check all process runs were started by the master workflow
        stepActivities = set(g.subjects(RDF.type, WFPROV.ProcessRun))
        # Although semantically a WorkflowEngine is also a ProcessRun,
        # we don't declare that,
        # thus only the step activities should be in this set.
        assert master_run not in stepActivities
        assert stepActivities, "No steps executed in workflow"
        for step in stepActivities:
            # Let's check it was started by the master_run.
            # Unfortunately, unlike PROV-N, in PROV-O RDF we have to check
            # through the n-ary qualifiedStart relation
            starts = set(g.objects(step, PROV.qualifiedStart))
            assert starts, "Could not find qualifiedStart of step %s" % step
            assert len(starts) == 1, "Too many qualifiedStart for step %s" % step
            start = starts.pop()
            assert (start, PROV.hadActivity, master_run) in g, \
                "Step activity not started by master activity"
            # Tip: Any nested workflow step executions should not be in this
            # prov file, but in separate file

    if nested:
        # Find some cwlprov.nt the nested workflow is described in
        prov_ids = set(g.objects(predicate=PROV.has_provenance))
        # FIXME: The above is a bit naive and does not check the subject is
        # one of the steps -- OK for now as this is the only case of
        # prov:has_provenance
        assert prov_ids, "Could not find prov:has_provenance from nested workflow"
        nt_uris = [uri for uri in prov_ids if uri.endswith("cwlprov.nt")]
        # TODO: Look up manifest conformsTo and content-type rather than
        # assuming magic filename
        assert nt_uris, "Could not find *.cwlprov.nt"
        # Load into new graph
        g2 = Graph()
        nt_uri = nt_uris.pop()
        with open(_arcp2file(base_path, nt_uri), "rb") as f:
            g2.parse(file=f, format="nt", publicID=nt_uri)
        # TODO: Check g2 statements that it's the same UUID activity inside
        # as in the outer step

    if directory:
        directories = set(g.subjects(RDF.type, RO.Folder))
        assert directories
        for d in directories:
            assert (d, RDF.type, PROV.Dictionary) in g
            assert (d, RDF.type, PROV.Collection) in g
            assert (d, RDF.type, PROV.Entity) in g
            files = set()
            for entry in g.objects(d, PROV.hadDictionaryMember):
                assert (entry, RDF.type, PROV.KeyEntityPair) in g
                # We don't check what that filename is here
                assert set(g.objects(entry, PROV.pairKey))
                # RO:Folder aspect
                assert set(g.objects(entry, RO.entryName))
                assert (d, ORE.aggregates, entry) in g
                assert (entry, RDF.type, RO.FolderEntry) in g
                assert (entry, RDF.type, ORE.Proxy) in g
                assert (entry, ORE.proxyIn, d) in g
                # Which file?
                entities = set(g.objects(entry, PROV.pairEntity))
                assert entities
                f = entities.pop()
                files.add(f)
                assert (entry, ORE.proxyFor, f) in g
                assert (f, RDF.type, PROV.Entity) in g
            if not files:
                assert (d, RDF.type, PROV.EmptyCollection) in g
                assert (d, RDF.type, PROV.EmptyDictionary) in g

    if secondary_files:
        derivations = set(g.subjects(RDF.type, CWLPROV.SecondaryFile))
        assert derivations
        for der in derivations:
            sec = set(g.subjects(PROV.qualifiedDerivation, der)).pop()
            prim = set(g.objects(der, PROV.entity)).pop()
            # UUID specializes a hash checksum
            assert set(g.objects(sec, PROV.specializationOf))
            # extensions etc.
            sec_basename = set(g.objects(sec, CWLPROV.basename)).pop()
            sec_nameroot = set(g.objects(sec, CWLPROV.nameroot)).pop()
            sec_nameext = set(g.objects(sec, CWLPROV.nameext)).pop()
            assert str(sec_basename) == "%s%s" % (sec_nameroot, sec_nameext)
            # TODO: Check hash data file exist in RO
            # The primary entity should have the same, but different values
            assert set(g.objects(prim, PROV.specializationOf))
            prim_basename = set(g.objects(prim, CWLPROV.basename)).pop()
            prim_nameroot = set(g.objects(prim, CWLPROV.nameroot)).pop()
            prim_nameext = set(g.objects(prim, CWLPROV.nameext)).pop()
            assert str(prim_basename) == "%s%s" % (prim_nameroot, prim_nameext)
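# Sketch of the n-ary PROV-O "start" pattern that check_prov() asserts for each
# step: step --prov:qualifiedStart--> Start --prov:hadActivity--> master run.
# The URNs below are placeholder values, not real run IDs.
def _example_qualified_start_pattern():
    from rdflib import BNode, Graph, Namespace, URIRef
    from rdflib.namespace import RDF

    prov_ns = Namespace("http://www.w3.org/ns/prov#")
    g = Graph()
    step = URIRef("urn:uuid:00000000-0000-0000-0000-000000000001")
    master_run = URIRef("urn:uuid:00000000-0000-0000-0000-000000000002")
    start = BNode()
    g.add((step, prov_ns.qualifiedStart, start))
    g.add((start, RDF.type, prov_ns.Start))
    g.add((start, prov_ns.hadActivity, master_run))
    # This is the triple check_prov() looks for after popping the single Start:
    assert (start, prov_ns.hadActivity, master_run) in g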
def check_ro(base_path, nested=False):
    manifest_file = os.path.join(base_path, "metadata", "manifest.json")
    assert os.path.isfile(manifest_file), "Can't find " + manifest_file
    arcp_root = find_arcp(base_path)
    base = urllib.parse.urljoin(arcp_root, "metadata/manifest.json")
    g = Graph()
    # Avoid resolving JSON-LD context https://w3id.org/bundle/context
    # so this test works offline
    context = Path(get_data("tests/bundle-context.jsonld")).as_uri()
    with open(manifest_file, "r", encoding="UTF-8") as f:
        jsonld = f.read()
        # replace with file:/// URI
        jsonld = jsonld.replace("https://w3id.org/bundle/context", context)
    g.parse(data=jsonld, format="json-ld", publicID=base)
    if os.environ.get("DEBUG"):
        print("Parsed manifest:\n\n")
        g.serialize(sys.stdout, format="ttl")
    ro = None
    for ro in g.subjects(ORE.isDescribedBy, URIRef(base)):
        break
    assert ro is not None, "Can't find RO with ore:isDescribedBy"
    profile = None
    for dc in g.objects(ro, DCTERMS.conformsTo):
        profile = dc
        break
    assert profile is not None, "Can't find profile with dct:conformsTo"
    assert profile == URIRef(provenance.CWLPROV_VERSION), \
        "Unexpected cwlprov version " + profile
    paths = []
    externals = []
    for aggregate in g.objects(ro, ORE.aggregates):
        if not arcp.is_arcp_uri(aggregate):
            externals.append(aggregate)
            # Won't check external URIs existence here
            # TODO: Check they are not relative!
            continue
        lfile = _arcp2file(base_path, aggregate)
        paths.append(os.path.relpath(lfile, base_path))
        assert os.path.isfile(lfile), "Can't find aggregated " + lfile
    assert paths, "Didn't find any arcp aggregates"
    assert externals, "Didn't find any data URIs"
    for ext in ["provn", "xml", "json", "jsonld", "nt", "ttl"]:
        f = "metadata/provenance/primary.cwlprov.%s" % ext
        assert f in paths, "provenance file missing " + f
    for f in [
        "workflow/primary-job.json",
        "workflow/packed.cwl",
        "workflow/primary-output.json",
    ]:
        assert f in paths, "workflow file missing " + f
    # Can't test snapshot/ files directly as their name varies
    # TODO: check urn:hash::sha1 thingies
    # TODO: Check OA annotations

    packed = urllib.parse.urljoin(arcp_root, "/workflow/packed.cwl")
    primary_job = urllib.parse.urljoin(arcp_root, "/workflow/primary-job.json")
    primary_prov_nt = urllib.parse.urljoin(
        arcp_root, "/metadata/provenance/primary.cwlprov.nt")
    uuid = arcp.parse_arcp(arcp_root).uuid

    highlights = set(g.subjects(OA.motivatedBy, OA.highlighting))
    assert highlights, "Didn't find highlights"
    for h in highlights:
        assert (h, OA.hasTarget, URIRef(packed)) in g

    describes = set(g.subjects(OA.motivatedBy, OA.describing))
    for d in describes:
        assert (d, OA.hasBody, URIRef(arcp_root)) in g
        assert (d, OA.hasTarget, URIRef(uuid.urn)) in g

    linked = set(g.subjects(OA.motivatedBy, OA.linking))
    for l in linked:
        assert (l, OA.hasBody, URIRef(packed)) in g
        assert (l, OA.hasBody, URIRef(primary_job)) in g
        assert (l, OA.hasTarget, URIRef(uuid.urn)) in g

    has_provenance = set(g.subjects(OA.hasBody, URIRef(primary_prov_nt)))
    for p in has_provenance:
        assert (p, OA.hasTarget, URIRef(uuid.urn)) in g
        assert (p, OA.motivatedBy, PROV.has_provenance) in g
        # Check all prov elements are listed
        formats = set()
        for prov in g.objects(p, OA.hasBody):
            assert (prov, DCTERMS.conformsTo,
                    URIRef(provenance.CWLPROV_VERSION)) in g
            # NOTE: DC.format is a Namespace method and does not resolve
            # like other terms
            formats.update(set(g.objects(prov, DC["format"])))
        assert formats, "Could not find media types"
        expected = set(
            Literal(f) for f in (
                "application/json",
                "application/ld+json",
                "application/n-triples",
                'text/provenance-notation; charset="UTF-8"',
                'text/turtle; charset="UTF-8"',
                "application/xml",
            )
        )
        assert formats == expected, "Did not match expected PROV media types"

    if nested:
        # Check for additional PROVs
        # Let's try to find the other wf run ID
        otherRuns = set()
        for p in g.subjects(OA.motivatedBy, PROV.has_provenance):
            if (p, OA.hasTarget, URIRef(uuid.urn)) in g:
                continue
            otherRuns.update(set(g.objects(p, OA.hasTarget)))
        assert otherRuns, "Could not find nested workflow run prov annotations"
def check_ro(self, nested=False):
    manifest_file = os.path.join(self.folder, "metadata", "manifest.json")
    self.assertTrue(os.path.isfile(manifest_file), "Can't find " + manifest_file)
    arcp_root = self.find_arcp()
    base = urllib.parse.urljoin(arcp_root, "metadata/manifest.json")
    g = Graph()
    with open(manifest_file, "rb") as f:
        # Note: This will use https://w3id.org/bundle/context
        g.parse(file=f, format="json-ld", publicID=base)
    if os.environ.get("DEBUG"):
        print("Parsed manifest:\n\n")
        g.serialize(sys.stdout, format="nt")
    ro = None
    for ro in g.subjects(ORE.isDescribedBy, URIRef(base)):
        break
    self.assertTrue(ro, "Can't find RO with ore:isDescribedBy")
    profile = None
    for dc in g.objects(ro, DCTERMS.conformsTo):
        profile = dc
        break
    self.assertTrue(profile, "Can't find profile with dct:conformsTo")
    self.assertEquals(profile, URIRef(provenance.CWLPROV_VERSION),
                      "Unexpected cwlprov version " + profile)
    paths = []
    externals = []
    for aggregate in g.objects(ro, ORE.aggregates):
        if not arcp.is_arcp_uri(aggregate):
            externals.append(aggregate)
            # Won't check external URIs existence here
            # TODO: Check they are not relative!
            continue
        lfile = self._arcp2file(aggregate)
        paths.append(os.path.relpath(lfile, self.folder))
        self.assertTrue(os.path.isfile(lfile), "Can't find aggregated " + lfile)
    self.assertTrue(paths, "Didn't find any arcp aggregates")
    self.assertTrue(externals, "Didn't find any data URIs")
    for ext in ["provn", "xml", "json", "jsonld", "nt", "ttl"]:
        f = "metadata/provenance/primary.cwlprov.%s" % ext
        self.assertTrue(f in paths, "provenance file missing " + f)
    for f in ["workflow/primary-job.json", "workflow/packed.cwl"]:
        self.assertTrue(f in paths, "workflow file missing " + f)
    # Can't test snapshot/ files directly as their name varies
    # TODO: check urn:hash::sha1 thingies
    # TODO: Check OA annotations

    packed = urllib.parse.urljoin(arcp_root, "/workflow/packed.cwl")
    primary_job = urllib.parse.urljoin(arcp_root, "/workflow/primary-job.json")
    primary_prov_nt = urllib.parse.urljoin(
        arcp_root, "/metadata/provenance/primary.cwlprov.nt")
    uuid = arcp.parse_arcp(arcp_root).uuid

    highlights = set(g.subjects(OA.motivatedBy, OA.highlighting))
    self.assertTrue(highlights, "Didn't find highlights")
    for h in highlights:
        self.assertTrue((h, OA.hasTarget, URIRef(packed)) in g)

    describes = set(g.subjects(OA.motivatedBy, OA.describing))
    for d in describes:
        self.assertTrue((d, OA.hasBody, URIRef(arcp_root)) in g)
        self.assertTrue((d, OA.hasTarget, URIRef(uuid.urn)) in g)

    linked = set(g.subjects(OA.motivatedBy, OA.linking))
    for l in linked:
        self.assertTrue((l, OA.hasBody, URIRef(packed)) in g)
        self.assertTrue((l, OA.hasBody, URIRef(primary_job)) in g)
        self.assertTrue((l, OA.hasTarget, URIRef(uuid.urn)) in g)

    has_provenance = set(g.subjects(OA.hasBody, URIRef(primary_prov_nt)))
    for p in has_provenance:
        self.assertTrue((p, OA.hasTarget, URIRef(uuid.urn)) in g)
        self.assertTrue((p, OA.motivatedBy, PROV.has_provenance) in g)
        # Check all prov elements are listed
        formats = set()
        for prov in g.objects(p, OA.hasBody):
            self.assertTrue(
                (prov, DCTERMS.conformsTo, URIRef(provenance.CWLPROV_VERSION)) in g)
            # NOTE: DC.format is a Namespace method and does not resolve
            # like other terms
            formats.update(set(g.objects(prov, DC["format"])))
        self.assertTrue(formats, "Could not find media types")
        expected = set(
            Literal(f) for f in (
                "application/json",
                "application/ld+json",
                "application/n-triples",
                'text/provenance-notation; charset="UTF-8"',
                'text/turtle; charset="UTF-8"',
                "application/xml",
            )
        )
        self.assertEquals(formats, expected,
                          "Did not match expected PROV media types")

    if nested:
        # Check for additional PROVs
        # Let's try to find the other wf run ID
        otherRuns = set()
        for p in g.subjects(OA.motivatedBy, PROV.has_provenance):
            if (p, OA.hasTarget, URIRef(uuid.urn)) in g:
                continue
            otherRuns.update(set(g.objects(p, OA.hasTarget)))
        self.assertTrue(
            otherRuns, "Could not find nested workflow run prov annotations")