コード例 #1
0
def find_arcp(base_path):
    """Return the External-Identifier of the bag at ``base_path``.

    The bag is opened with ``bagit.Bag`` and the ``External-Identifier``
    entry is looked up in its ``info`` mapping.

    Raises:
        Exception: if ``arcp.is_arcp_uri`` does not accept the value found.
    """
    external_id = bagit.Bag(base_path).info.get("External-Identifier")
    if not arcp.is_arcp_uri(external_id):
        raise Exception("Can't find External-Identifier")
    return external_id
コード例 #2
0
ファイル: test_provenance.py プロジェクト: denis-yuen/cwltool
def find_arcp(base_path):
    """Look up the arcp URI stored as the bag's External-Identifier.

    Args:
        base_path: Directory handed to ``bagit.Bag``.

    Returns:
        The ``External-Identifier`` value from the bag's ``info`` mapping.

    Raises:
        Exception: if that value is rejected by ``arcp.is_arcp_uri``.
    """
    bag_info = bagit.Bag(base_path).info
    candidate = bag_info.get("External-Identifier")
    if not arcp.is_arcp_uri(candidate):
        raise Exception("Can't find External-Identifier")
    return candidate
コード例 #3
0
ファイル: test_provenance.py プロジェクト: sm0179/cwltool
 def find_arcp(self):
     """Return the bag's arcp External-Identifier, or a random arcp URI.

     Reads ``External-Identifier`` from the ``info`` mapping of the bag at
     ``self.folder``; when ``arcp.is_arcp_uri`` rejects that value, the
     result of ``arcp.arcp_random()`` is returned instead.
     """
     external_id = bagit.Bag(self.folder).info.get("External-Identifier")
     if not arcp.is_arcp_uri(external_id):
         return arcp.arcp_random()
     return external_id
コード例 #4
0
 def format_id(self, identifier):
     """Return ``identifier`` unchanged or prefixed with ``#``.

     The identifier is returned as-is when it is an arcp URI, when it parses
     as a URL with a scheme, a network location and a path, or when it
     already starts with ``#``. Anything else gets a leading ``#``.
     """
     if is_arcp_uri(identifier):
         return identifier
     # check if it's an absolute URL
     parsed = urlparse(identifier)
     is_absolute_url = bool(parsed.scheme and parsed.netloc and parsed.path)
     if is_absolute_url or identifier.startswith('#'):
         return identifier
     return '#' + identifier
コード例 #5
0
ファイル: test_provenance.py プロジェクト: sm0179/cwltool
    def check_ro(self):
        """Check the research object manifest found under ``self.folder``.

        Parses ``metadata/manifest.json`` as JSON-LD and asserts that:

        * a resource is linked to the manifest via ``ore:isDescribedBy``;
        * the first ``dct:conformsTo`` value of that resource is
          ``https://w3id.org/cwl/prov/0.3.0``;
        * every aggregated arcp URI maps to an existing file below
          ``self.folder``, and at least one non-arcp URI is aggregated;
        * the primary provenance files and the workflow files are among
          the aggregated arcp paths.

        The parsed graph and every aggregated URI are written to stdout.
        """
        manifest_file = os.path.join(self.folder, "metadata", "manifest.json")
        self.assertTrue(os.path.isfile(manifest_file), "Can't find " + manifest_file)
        arcp_root = self.find_arcp()
        base = urllib.parse.urljoin(arcp_root, "metadata/manifest.json")
        g = Graph()
        with open(manifest_file, "rb") as f:
            # Note: This will use https://w3id.org/bundle/context
            g.parse(file=f, format="json-ld", publicID=base)
        print("Parsed manifest:\n\n")
        g.serialize(sys.stdout, format="nt")

        # Only the first matching subject / profile is inspected.
        ro = next(iter(g.subjects(ORE.isDescribedBy, URIRef(base))), None)
        self.assertTrue(ro, "Can't find RO with ore:isDescribedBy")

        profile = next(iter(g.objects(ro, DCTERMS.conformsTo)), None)
        self.assertTrue(profile, "Can't find profile with dct:conformsTo")
        # assertEqual, not the assertEquals alias: the alias was removed
        # in Python 3.12.
        self.assertEqual(profile, URIRef("https://w3id.org/cwl/prov/0.3.0"),
            "Unexpected cwlprov version " + profile)

        paths = []
        externals = []
        for aggregate in g.objects(ro, ORE.aggregates):
            print(aggregate)
            if not arcp.is_arcp_uri(aggregate):
                externals.append(aggregate)
                # Won't check external URIs existence here
                # TODO: Check they are not relative!
                continue
            # arcp URIs - assume they are local to our RO
            path = arcp.parse_arcp(aggregate).path[1:]  # Strip first /
            paths.append(path)
            # Convert to local path, in case it uses \ on Windows
            lpath = provenance._convert_path(path, posixpath, os.path)
            lfile = os.path.join(self.folder, lpath)
            self.assertTrue(os.path.isfile(lfile), "Can't find aggregated " + lfile)

        self.assertTrue(paths, "Didn't find any arcp aggregates")
        self.assertTrue(externals, "Didn't find any data URIs")

        for ext in ["provn", "xml", "json", "jsonld", "nt", "ttl"]:
            prov_file = "metadata/provenance/primary.cwlprov.%s" % ext
            self.assertIn(prov_file, paths,
                          "provenance file missing " + prov_file)

        for wf_file in ["workflow/primary-job.json", "workflow/packed.cwl"]:
            self.assertIn(wf_file, paths, "workflow file missing " + wf_file)
コード例 #6
0
ファイル: test_provenance.py プロジェクト: sm0179/cwltool
 def check_bagit(self):
     """Assert that ``self.folder`` holds a complete, valid BagIt bag.

     Checks that the expected tag/manifest files exist, that the bag reports
     an Oxum, that manifests and file system agree, that no tag files are
     reported missing, that ``bag.validate()`` succeeds and that the
     ``External-Identifier`` is an arcp URI.
     """
     # check bagit structure
     required_files = (
         "bagit.txt",
         "bag-info.txt",
         "manifest-sha1.txt",
         "tagmanifest-sha1.txt",
         "tagmanifest-sha256.txt",
     )
     for basename in required_files:
         self.assertTrue(os.path.isfile(os.path.join(self.folder, basename)))

     bag = bagit.Bag(self.folder)
     self.assertTrue(bag.has_oxum())

     manifest_only, fs_only = bag.compare_manifests_with_fs()
     self.assertFalse(list(manifest_only), "Some files only in manifest")
     self.assertFalse(list(fs_only), "Some files only on file system")

     self.assertFalse(list(bag.missing_optional_tagfiles()),
                      "Some files only in tagmanifest")
     bag.validate()
     # TODO: Check other bag-info attributes
     external_id = bag.info.get("External-Identifier")
     self.assertTrue(arcp.is_arcp_uri(external_id))
0
ファイル: ro.py プロジェクト: FarahZKhan/cwlprov-py
    def resolve_path(self, uri_path):
        """Map a bag-relative path or arcp URI to a path under ``self.root_path``.

        Args:
            uri_path: Either an arcp URI whose base equals ``self.root_uri``
                or a relative POSIX-style path.

        Returns:
            A ``pathlib.Path`` built from ``self.root_path`` and the
            relative path.

        Raises:
            IOError: if the relative path is not in ``self.bag.entries``.
            AssertionError: if the arcp base differs from ``self.root_uri``
                or the path is absolute.
            ValueError: from ``relative_to`` if the result is not below
                ``self.root_path``.
        """
        # NOTE(review): the checks below are plain asserts, which Python
        # strips when run with -O -- confirm that is acceptable here.
        if not arcp.is_arcp_uri(str(uri_path)):
            path = pathlib.PurePosixPath(uri_path)
        else:
            uri = arcp.parse_arcp(uri_path)
            # Ensure same base URI meaning this bagit
            assert urllib.parse.urljoin(uri_path, "/") == self.root_uri
            # Strip initial / so path is relative
            path = pathlib.PurePosixPath(uri.path[1:])
        assert not path.is_absolute()

        if str(path) not in self.bag.entries:
            raise IOError("Not found in bag manifest/tagmanifest: %s" %
                          uri_path)
        # resolve as OS-specific path
        absolute = pathlib.Path(self.root_path, path)
        # ensure it did not climb out (will throw ValueError if not)
        assert absolute.relative_to(self.root_path)
        return absolute
コード例 #8
0
ファイル: test_provenance.py プロジェクト: denis-yuen/cwltool
def check_bagit(base_path):
    """Assert that ``base_path`` holds a complete, valid BagIt bag.

    Verifies the presence of the expected tag/manifest files, that the bag
    reports an Oxum, that manifests and file system agree, that no tag files
    are reported missing, that ``bag.validate()`` succeeds and that the
    ``External-Identifier`` is an arcp URI.
    """
    # check bagit structure
    for basename in ("bagit.txt", "bag-info.txt", "manifest-sha1.txt",
                     "tagmanifest-sha1.txt", "tagmanifest-sha256.txt"):
        assert os.path.isfile(os.path.join(base_path, basename))

    bag = bagit.Bag(base_path)
    assert bag.has_oxum()

    manifest_only, fs_only = bag.compare_manifests_with_fs()
    assert not list(manifest_only), "Some files only in manifest"
    assert not list(fs_only), "Some files only on file system"

    assert not list(bag.missing_optional_tagfiles()), \
        "Some files only in tagmanifest"
    bag.validate()
    # TODO: Check other bag-info attributes
    external_id = bag.info.get("External-Identifier")
    assert arcp.is_arcp_uri(external_id)
コード例 #9
0
def check_bagit(base_path):
    """Check the BagIt structure and validity of the bag at ``base_path``.

    Raises:
        AssertionError: if an expected tag/manifest file is absent, the bag
            has no Oxum, manifests and file system disagree, tag files are
            reported missing, or the ``External-Identifier`` is not an
            arcp URI.
    """
    # check bagit structure
    expected_basenames = (
        "bagit.txt",
        "bag-info.txt",
        "manifest-sha1.txt",
        "tagmanifest-sha1.txt",
        "tagmanifest-sha256.txt",
    )
    for name in expected_basenames:
        candidate = os.path.join(base_path, name)
        assert os.path.isfile(candidate)

    bag = bagit.Bag(base_path)
    assert bag.has_oxum()

    in_manifest_only, on_disk_only = bag.compare_manifests_with_fs()
    assert not list(in_manifest_only), "Some files only in manifest"
    assert not list(on_disk_only), "Some files only on file system"

    absent_tagfiles = list(bag.missing_optional_tagfiles())
    assert not absent_tagfiles, "Some files only in tagmanifest"

    bag.validate()
    # TODO: Check other bag-info attributes
    assert arcp.is_arcp_uri(bag.info.get("External-Identifier"))
コード例 #10
0
def check_ro(base_path, nested=False):
    """Assert that the research object manifest under ``base_path`` is sound.

    ``metadata/manifest.json`` is parsed as JSON-LD, with the bundle context
    URL swapped for a local file URI, and checked for: the resource described
    by the manifest, its ``dct:conformsTo`` profile, the aggregated arcp
    files and external URIs, and the OA annotations (highlighting,
    describing, linking and provenance).

    Args:
        base_path: Directory containing the research object.
        nested: When true, additionally require at least one provenance
            annotation whose target is not the UUID URN of the arcp root.

    Raises:
        AssertionError: if any of the checks fails.
    """
    manifest_file = os.path.join(base_path, "metadata", "manifest.json")
    assert os.path.isfile(manifest_file), "Can't find " + manifest_file
    arcp_root = find_arcp(base_path)
    base = urllib.parse.urljoin(arcp_root, "metadata/manifest.json")
    g = Graph()

    # Avoid resolving JSON-LD context https://w3id.org/bundle/context
    # so this test works offline: substitute a file:/// URI for it
    context = Path(get_data("tests/bundle-context.jsonld")).as_uri()
    with open(manifest_file, "r", encoding="UTF-8") as manifest:
        jsonld = manifest.read().replace(
            "https://w3id.org/bundle/context", context)
    g.parse(data=jsonld, format="json-ld", publicID=base)
    if os.environ.get("DEBUG"):
        print("Parsed manifest:\n\n")
        g.serialize(sys.stdout, format="ttl")

    # Only the first subject / profile found is inspected.
    ro = next(iter(g.subjects(ORE.isDescribedBy, URIRef(base))), None)
    assert ro is not None, "Can't find RO with ore:isDescribedBy"

    profile = next(iter(g.objects(ro, DCTERMS.conformsTo)), None)
    assert profile is not None, "Can't find profile with dct:conformsTo"
    assert profile == URIRef(provenance.CWLPROV_VERSION),\
        "Unexpected cwlprov version " + profile

    paths = []
    externals = []
    for aggregate in g.objects(ro, ORE.aggregates):
        if arcp.is_arcp_uri(aggregate):
            lfile = _arcp2file(base_path, aggregate)
            paths.append(os.path.relpath(lfile, base_path))
            assert os.path.isfile(lfile), "Can't find aggregated " + lfile
        else:
            # Won't check external URIs existence here
            # TODO: Check they are not relative!
            externals.append(aggregate)

    assert paths, "Didn't find any arcp aggregates"
    assert externals, "Didn't find any data URIs"

    for ext in ("provn", "xml", "json", "jsonld", "nt", "ttl"):
        prov_file = "metadata/provenance/primary.cwlprov.%s" % ext
        assert prov_file in paths, "provenance file missing " + prov_file

    workflow_files = (
        "workflow/primary-job.json",
        "workflow/packed.cwl",
        "workflow/primary-output.json",
    )
    for wf_file in workflow_files:
        assert wf_file in paths, "workflow file missing " + wf_file
    # Can't test snapshot/ files directly as their name varies

    # TODO: check urn:hash::sha1 thingies
    # TODO: Check OA annotations

    packed = URIRef(urllib.parse.urljoin(arcp_root, "/workflow/packed.cwl"))
    primary_job = URIRef(
        urllib.parse.urljoin(arcp_root, "/workflow/primary-job.json"))
    primary_prov_nt = URIRef(urllib.parse.urljoin(
        arcp_root, "/metadata/provenance/primary.cwlprov.nt"))
    uuid = arcp.parse_arcp(arcp_root).uuid

    highlights = set(g.subjects(OA.motivatedBy, OA.highlighting))
    assert highlights, "Didn't find highlights"
    for highlight in highlights:
        assert (highlight, OA.hasTarget, packed) in g

    for description in set(g.subjects(OA.motivatedBy, OA.describing)):
        assert (description, OA.hasBody, URIRef(arcp_root)) in g
        assert (description, OA.hasTarget, URIRef(uuid.urn)) in g

    for link in set(g.subjects(OA.motivatedBy, OA.linking)):
        assert (link, OA.hasBody, packed) in g
        assert (link, OA.hasBody, primary_job) in g
        assert (link, OA.hasTarget, URIRef(uuid.urn)) in g

    for annotation in set(g.subjects(OA.hasBody, primary_prov_nt)):
        assert (annotation, OA.hasTarget, URIRef(uuid.urn)) in g
        assert (annotation, OA.motivatedBy, PROV.has_provenance) in g
        # Check all prov elements are listed
        formats = set()
        for prov in g.objects(annotation, OA.hasBody):
            assert (prov, DCTERMS.conformsTo,
                    URIRef(provenance.CWLPROV_VERSION)) in g
            # NOTE: DC.format is a Namespace method and does not resolve like other terms
            formats.update(g.objects(prov, DC["format"]))
        assert formats, "Could not find media types"
        expected = {
            Literal(media_type)
            for media_type in (
                "application/json",
                "application/ld+json",
                "application/n-triples",
                'text/provenance-notation; charset="UTF-8"',
                'text/turtle; charset="UTF-8"',
                "application/xml",
            )
        }
        assert formats == expected, "Did not match expected PROV media types"

    if nested:
        # Check for additional PROVs
        # Let's try to find the other wf run ID
        other_runs = set()
        for annotation in g.subjects(OA.motivatedBy, PROV.has_provenance):
            if (annotation, OA.hasTarget, URIRef(uuid.urn)) not in g:
                other_runs.update(g.objects(annotation, OA.hasTarget))
        assert other_runs, "Could not find nested workflow run prov annotations"
コード例 #11
0
ファイル: ro.py プロジェクト: FarahZKhan/cwlprov-py
 def _find_arcp(self):
     """Return the bag's arcp External-Identifier, or a random arcp URI.

     The ``External-Identifier`` from ``self.bag.info`` is used when it is
     non-empty and accepted by ``arcp.is_arcp_uri``; otherwise the result
     of ``arcp.arcp_random()`` is returned.
     """
     external_id = self.bag.info.get("External-Identifier")
     if not external_id or not arcp.is_arcp_uri(external_id):
         return arcp.arcp_random()
     return external_id
コード例 #12
0
ファイル: test_provenance.py プロジェクト: denis-yuen/cwltool
def check_ro(base_path, nested=False):
    """Validate the research object manifest stored below ``base_path``.

    The manifest (``metadata/manifest.json``) is loaded as JSON-LD after its
    bundle context URL has been replaced by a local file URI. The checks
    cover the resource described by the manifest, its ``dct:conformsTo``
    profile, the aggregated arcp files and external URIs, and the OA
    annotations (highlighting, describing, linking and provenance).

    Args:
        base_path: Directory containing the research object.
        nested: If true, also require a provenance annotation targeting
            something other than the UUID URN of the arcp root.

    Raises:
        AssertionError: when one of the checks does not hold.
    """
    manifest_file = os.path.join(base_path, "metadata", "manifest.json")
    assert os.path.isfile(manifest_file), "Can't find " + manifest_file
    arcp_root = find_arcp(base_path)
    base = urllib.parse.urljoin(arcp_root, "metadata/manifest.json")
    g = Graph()

    # Avoid resolving JSON-LD context https://w3id.org/bundle/context
    # so this test works offline
    context = Path(get_data("tests/bundle-context.jsonld")).as_uri()
    with open(manifest_file, "r", encoding="UTF-8") as handle:
        raw_manifest = handle.read()
    # replace with file:/// URI
    jsonld = raw_manifest.replace("https://w3id.org/bundle/context", context)
    g.parse(data=jsonld, format="json-ld", publicID=base)
    if os.environ.get("DEBUG"):
        print("Parsed manifest:\n\n")
        g.serialize(sys.stdout, format="ttl")

    # Take the first resource described by the manifest, if there is one.
    described = g.subjects(ORE.isDescribedBy, URIRef(base))
    ro = next(iter(described), None)
    assert ro is not None, "Can't find RO with ore:isDescribedBy"

    # Likewise only the first declared profile is compared.
    declared_profiles = g.objects(ro, DCTERMS.conformsTo)
    profile = next(iter(declared_profiles), None)
    assert profile is not None, "Can't find profile with dct:conformsTo"
    assert profile == URIRef(provenance.CWLPROV_VERSION),\
        "Unexpected cwlprov version " + profile

    paths = []
    externals = []
    for aggregate in g.objects(ro, ORE.aggregates):
        if arcp.is_arcp_uri(aggregate):
            lfile = _arcp2file(base_path, aggregate)
            paths.append(os.path.relpath(lfile, base_path))
            assert os.path.isfile(lfile), "Can't find aggregated " + lfile
            continue
        # Won't check external URIs existence here
        # TODO: Check they are not relative!
        externals.append(aggregate)

    assert paths, "Didn't find any arcp aggregates"
    assert externals, "Didn't find any data URIs"

    for ext in ("provn", "xml", "json", "jsonld", "nt", "ttl"):
        expected_prov = "metadata/provenance/primary.cwlprov.%s" % ext
        assert expected_prov in paths, \
            "provenance file missing " + expected_prov

    for expected_wf in ("workflow/primary-job.json", "workflow/packed.cwl",
                        "workflow/primary-output.json"):
        assert expected_wf in paths, "workflow file missing " + expected_wf
    # Can't test snapshot/ files directly as their name varies

    # TODO: check urn:hash::sha1 thingies
    # TODO: Check OA annotations

    packed = URIRef(urllib.parse.urljoin(arcp_root, "/workflow/packed.cwl"))
    primary_job = URIRef(
        urllib.parse.urljoin(arcp_root, "/workflow/primary-job.json"))
    primary_prov_nt = URIRef(urllib.parse.urljoin(
        arcp_root, "/metadata/provenance/primary.cwlprov.nt"))
    uuid = arcp.parse_arcp(arcp_root).uuid

    highlights = set(g.subjects(OA.motivatedBy, OA.highlighting))
    assert highlights, "Didn't find highlights"
    for highlight in highlights:
        assert (highlight, OA.hasTarget, packed) in g

    describes = set(g.subjects(OA.motivatedBy, OA.describing))
    for description in describes:
        assert (description, OA.hasBody, URIRef(arcp_root)) in g
        assert (description, OA.hasTarget, URIRef(uuid.urn)) in g

    linked = set(g.subjects(OA.motivatedBy, OA.linking))
    for link in linked:
        assert (link, OA.hasBody, packed) in g
        assert (link, OA.hasBody, primary_job) in g
        assert (link, OA.hasTarget, URIRef(uuid.urn)) in g

    has_provenance = set(g.subjects(OA.hasBody, primary_prov_nt))
    for annotation in has_provenance:
        assert (annotation, OA.hasTarget, URIRef(uuid.urn)) in g
        assert (annotation, OA.motivatedBy, PROV.has_provenance) in g
        # Check all prov elements are listed
        formats = set()
        for prov in g.objects(annotation, OA.hasBody):
            assert (prov, DCTERMS.conformsTo, URIRef(provenance.CWLPROV_VERSION)) in g
            # NOTE: DC.format is a Namespace method and does not resolve like other terms
            formats.update(g.objects(prov, DC["format"]))
        assert formats, "Could not find media types"
        media_types = (
            "application/json",
            "application/ld+json",
            "application/n-triples",
            'text/provenance-notation; charset="UTF-8"',
            'text/turtle; charset="UTF-8"',
            "application/xml",
        )
        expected = {Literal(media_type) for media_type in media_types}
        assert formats == expected, "Did not match expected PROV media types"

    if nested:
        # Check for additional PROVs
        # Let's try to find the other wf run ID
        other_runs = set()
        for annotation in g.subjects(OA.motivatedBy, PROV.has_provenance):
            targets_this_run = (
                annotation, OA.hasTarget, URIRef(uuid.urn)) in g
            if not targets_this_run:
                other_runs.update(g.objects(annotation, OA.hasTarget))
        assert other_runs, "Could not find nested workflow run prov annotations"
コード例 #13
0
    def check_ro(self, nested=False):
        """Check the research object manifest found under ``self.folder``.

        Parses ``metadata/manifest.json`` as JSON-LD and asserts on: the
        resource described by the manifest, its ``dct:conformsTo`` profile,
        the aggregated arcp files and external URIs, and the OA annotations
        (highlighting, describing, linking and provenance).

        Args:
            nested: When true, additionally require at least one provenance
                annotation whose target is not the UUID URN of the arcp
                root.
        """
        manifest_file = os.path.join(self.folder, "metadata", "manifest.json")
        self.assertTrue(os.path.isfile(manifest_file),
                        "Can't find " + manifest_file)
        arcp_root = self.find_arcp()
        base = urllib.parse.urljoin(arcp_root, "metadata/manifest.json")
        g = Graph()
        with open(manifest_file, "rb") as f:
            # Note: This will use https://w3id.org/bundle/context
            g.parse(file=f, format="json-ld", publicID=base)
        if os.environ.get("DEBUG"):
            print("Parsed manifest:\n\n")
            g.serialize(sys.stdout, format="nt")

        # Only the first matching subject / profile is inspected.
        ro = next(iter(g.subjects(ORE.isDescribedBy, URIRef(base))), None)
        self.assertTrue(ro, "Can't find RO with ore:isDescribedBy")

        profile = next(iter(g.objects(ro, DCTERMS.conformsTo)), None)
        self.assertTrue(profile, "Can't find profile with dct:conformsTo")
        # assertEqual, not the assertEquals alias: the alias was removed
        # in Python 3.12.
        self.assertEqual(profile, URIRef(provenance.CWLPROV_VERSION),
                         "Unexpected cwlprov version " + profile)

        paths = []
        externals = []
        for aggregate in g.objects(ro, ORE.aggregates):
            if not arcp.is_arcp_uri(aggregate):
                externals.append(aggregate)
                # Won't check external URIs existence here
                # TODO: Check they are not relative!
                continue
            lfile = self._arcp2file(aggregate)
            paths.append(os.path.relpath(lfile, self.folder))
            self.assertTrue(os.path.isfile(lfile),
                            "Can't find aggregated " + lfile)

        self.assertTrue(paths, "Didn't find any arcp aggregates")
        self.assertTrue(externals, "Didn't find any data URIs")

        for ext in ["provn", "xml", "json", "jsonld", "nt", "ttl"]:
            prov_file = "metadata/provenance/primary.cwlprov.%s" % ext
            self.assertIn(prov_file, paths,
                          "provenance file missing " + prov_file)

        for wf_file in ["workflow/primary-job.json", "workflow/packed.cwl"]:
            self.assertIn(wf_file, paths, "workflow file missing " + wf_file)
        # Can't test snapshot/ files directly as their name varies

        # TODO: check urn:hash::sha1 thingies

        packed = urllib.parse.urljoin(arcp_root, "/workflow/packed.cwl")
        primary_job = urllib.parse.urljoin(arcp_root,
                                           "/workflow/primary-job.json")
        primary_prov_nt = urllib.parse.urljoin(
            arcp_root, "/metadata/provenance/primary.cwlprov.nt")
        uuid = arcp.parse_arcp(arcp_root).uuid

        highlights = set(g.subjects(OA.motivatedBy, OA.highlighting))
        self.assertTrue(highlights, "Didn't find highlights")
        for highlight in highlights:
            self.assertIn((highlight, OA.hasTarget, URIRef(packed)), g)

        describes = set(g.subjects(OA.motivatedBy, OA.describing))
        for description in describes:
            self.assertIn((description, OA.hasBody, URIRef(arcp_root)), g)
            self.assertIn((description, OA.hasTarget, URIRef(uuid.urn)), g)

        linked = set(g.subjects(OA.motivatedBy, OA.linking))
        for link in linked:
            self.assertIn((link, OA.hasBody, URIRef(packed)), g)
            self.assertIn((link, OA.hasBody, URIRef(primary_job)), g)
            self.assertIn((link, OA.hasTarget, URIRef(uuid.urn)), g)

        has_provenance = set(g.subjects(OA.hasBody, URIRef(primary_prov_nt)))
        for p in has_provenance:
            self.assertIn((p, OA.hasTarget, URIRef(uuid.urn)), g)
            self.assertIn((p, OA.motivatedBy, PROV.has_provenance), g)
            # Check all prov elements are listed
            formats = set()
            for prov in g.objects(p, OA.hasBody):
                self.assertIn((prov, DCTERMS.conformsTo,
                               URIRef(provenance.CWLPROV_VERSION)), g)
                # NOTE: DC.format is a Namespace method and does not resolve like other terms
                formats.update(g.objects(prov, DC["format"]))
            self.assertTrue(formats, "Could not find media types")
            expected = {
                Literal(media_type)
                for media_type in (
                    "application/json",
                    "application/ld+json",
                    "application/n-triples",
                    'text/provenance-notation; charset="UTF-8"',
                    'text/turtle; charset="UTF-8"',
                    "application/xml",
                )
            }
            self.assertEqual(formats, expected,
                             "Did not match expected PROV media types")

        if nested:
            # Check for additional PROVs
            # Let's try to find the other wf run ID
            other_runs = set()
            for p in g.subjects(OA.motivatedBy, PROV.has_provenance):
                if (p, OA.hasTarget, URIRef(uuid.urn)) in g:
                    continue
                other_runs.update(g.objects(p, OA.hasTarget))
            self.assertTrue(
                other_runs,
                "Could not find nested workflow run prov annotations")