def __init__(
    self,
    research_object: "ResearchObject",
    full_name: str,
    host_provenance: bool,
    user_provenance: bool,
    orcid: str,
    fsaccess: StdFsAccess,
    run_uuid: Optional[uuid.UUID] = None,
) -> None:
    """Initialize the provenance profile."""
    self.fsaccess = fsaccess
    self.orcid = orcid
    self.research_object = research_object
    self.folder = self.research_object.folder
    self.document = ProvDocument()
    self.host_provenance = host_provenance
    self.user_provenance = user_provenance
    self.engine_uuid = research_object.engine_uuid  # type: str
    self.add_to_manifest = self.research_object.add_to_manifest
    if self.orcid:
        _logger.debug("[provenance] Creator ORCID: %s", self.orcid)
    self.full_name = full_name
    if self.full_name:
        _logger.debug("[provenance] Creator Full name: %s", self.full_name)
    self.workflow_run_uuid = run_uuid or uuid.uuid4()
    self.workflow_run_uri = self.workflow_run_uuid.urn  # type: str
    self.generate_prov_doc()
def calculate_flat_provenance_types(
    prov_doc: ProvDocument,
    to_level: int = 0,
    including_primitives_types: bool = True,
    counting_wdf_as_two: bool = False,
    ignored_types: Iterable[str] = frozenset(),
) -> MultiLevelTypeDict:
    # flatten all the bundles, if any
    prov_doc = prov_doc.flattened()

    # initialise index structures
    level0_types = defaultdict(set)  # type: Dict[QualifiedName, Set[QualifiedName]]
    predecessors = defaultdict(set)  # type: Dict[QualifiedName, Set[Tuple[QualifiedName, QualifiedName]]]
    types_to_ignore: FrozenSet[str] = frozenset(ignored_types)

    # indexing node types and relations
    for rec in prov_doc.get_records():  # type: ProvRecord
        if rec.is_element():
            level0_types[rec.identifier] |= get_element_types(
                rec, including_primitives_types, types_to_ignore
            )
        elif rec.is_relation():
            rel_type = rec.get_type()
            attrs, values = zip(*rec.formal_attributes)
            # expecting a QualifiedName from the first argument of a relation
            predecessor, successor = values[:2]
            if predecessor is not None and successor is not None:
                predecessors[successor].add((rel_type, predecessor))

    # the type map for this graph
    fp_types = defaultdict(dict)  # type: MultiLevelTypeDict
    # converting type sets to FlatProvenanceType level 0
    fp_types[0] = {node: (frozenset(level0_types[node]),) for node in level0_types}
    # propagating level-0 types to the specified level
    for k in range(1, to_level + 1):
        # only propagating (k-1) types from nodes that have them
        for node, types in fp_types[k - 1].items():
            # propagating the types to the predecessors
            for rel_type, predecessor in predecessors[node]:
                k_type = types + (frozenset({rel_type}),)  # type: FlatProvenanceType
                if counting_wdf_as_two and (rel_type == PROV_DERIVATION):
                    k_p1_type = k_type + (frozenset({rel_type}),)  # type: FlatProvenanceType
                    fp_types[k + 1][predecessor] = (
                        join_flat_types(fp_types[k + 1][predecessor], k_p1_type)
                        if predecessor in fp_types[k + 1]
                        else k_p1_type
                    )
                else:
                    fp_types[k][predecessor] = (
                        join_flat_types(fp_types[k][predecessor], k_type)
                        if predecessor in fp_types[k]
                        else k_type
                    )

    return fp_types
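# Usage sketch for calculate_flat_provenance_types (hedged: assumes the
# helpers it relies on — get_element_types, join_flat_types,
# MultiLevelTypeDict — are importable from the same module). Builds a
# two-entity document and computes flat types up to level 1.
from prov.model import ProvDocument

_doc = ProvDocument()
_doc.add_namespace("ex", "http://example.org/")
_doc.entity("ex:e1")
_doc.entity("ex:e2")
_doc.wasDerivedFrom("ex:e2", "ex:e1")
_fp = calculate_flat_provenance_types(_doc, to_level=1)
# _fp[0] maps each node to its level-0 type set;
# _fp[1] extends those types with one step of incoming-relation context.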
def read(source, format=None):
    """
    Convenience function returning a ProvDocument instance.

    It does a lazy format detection by simply using try/except for all
    known formats. The deserializers should fail fairly early when data of
    the wrong type is passed to them, so the try/except is likely cheap.
    One could of course also do some more advanced format auto-detection,
    but I am not sure that is necessary. The downside is that no proper
    error messages will be produced; use the format parameter to get the
    actual traceback.
    """
    # Lazy imports to not clobber the namespace.
    from prov.model import ProvDocument
    from prov.serializers import Registry

    Registry.load_serializers()
    serializers = Registry.serializers.keys()

    if format:
        return ProvDocument.deserialize(source=source, format=format.lower())

    for format in serializers:
        try:
            return ProvDocument.deserialize(source=source, format=format)
        except Exception:
            pass
    raise TypeError(
        "Could not read from the source. To get a proper "
        "error message, specify the format with the 'format' "
        "parameter."
    )
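# Usage sketch for read() (hedged: "doc.json" is a hypothetical local file).
# Passing format explicitly surfaces the real deserialization traceback
# instead of the generic TypeError raised after the auto-detection loop.
doc = read("doc.json", format="json")
print(doc.get_provn())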
def base_connector_record_parameter_example():
    """
    Returns a dict with attributes and metadata for a simple node

    :return: dict with attributes and metadata
    :rtype: dict
    """
    doc = ProvDocument()
    namespaces = dict()
    namespaces.update({"ex": "http://example.com"})
    namespaces.update({"custom": "http://custom.com"})

    type_map = dict()
    type_map.update({"int value": "int"})
    type_map.update({"date value": "xsd:datetime"})

    metadata = dict()
    metadata.update({METADATA_KEY_PROV_TYPE: doc.valid_qualified_name("prov:Activity")})
    metadata.update({METADATA_KEY_IDENTIFIER: doc.valid_qualified_name("prov:example_node")})
    metadata.update({METADATA_KEY_TYPE_MAP: type_map})
    metadata.update({METADATA_KEY_NAMESPACES: namespaces})

    return_data = dict()
    return_data.update({"attributes": attributes_dict_example()})
    return_data.update({"metadata": metadata})
    return return_data
def __init__(self, database_helper, full_provenance=False):
    """
    Initializes the provenance for the mjclawar_rarshad project

    Parameters
    ----------
    database_helper: DatabaseHelper
    full_provenance: bool

    Returns
    -------
    """
    assert isinstance(database_helper, DatabaseHelper)
    self.database_helper = database_helper

    if full_provenance:
        self.prov_doc = ProvDocument.deserialize(dir_info.plan_json)
    else:
        self.prov_doc = ProvDocument()
    self.prov_doc.add_namespace(mcras.BDP_NAMESPACE.name, mcras.BDP_NAMESPACE.link)
    self.prov_doc.add_namespace(mcras.ALG_NAMESPACE.name, mcras.ALG_NAMESPACE.link)
    self.prov_doc.add_namespace(mcras.DAT_NAMESPACE.name, mcras.DAT_NAMESPACE.link)
    self.prov_doc.add_namespace(mcras.LOG_NAMESPACE.name, mcras.LOG_NAMESPACE.link)
    self.prov_doc.add_namespace(mcras.ONT_NAMESPACE.name, mcras.ONT_NAMESPACE.link)
def enforce_uniqueness_constraints(graph: ProvDocument) -> ProvDocument:
    """Enforce model uniqueness constraints.

    Remove node duplicates:
        - ProvDocument.unified takes care of this by removing nodes with
          the same id.

    Remove relation duplicates:
        - Allow only one relation of a certain type between two nodes.

    Enforcing this constraint after having populated the model instead of
    during population simplifies the model creation.
    """
    records, known = [], set()
    for relation in graph.get_records(ProvRelation):
        (_, source), (_, target) = relation.formal_attributes[:2]
        rel_tuple = (type(relation), source, target)
        if rel_tuple in known:
            continue
        known.add(rel_tuple)
        records.append(relation)
    records.extend(graph.get_records(ProvElement))
    g = ProvDocument(records)
    return g.unified()
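# Usage sketch for enforce_uniqueness_constraints (hedged: minimal example;
# assumes ProvRelation is imported as in the function above). Two identical
# wasDerivedFrom relations collapse into one after the pass.
from prov.model import ProvDocument, ProvRelation

_g = ProvDocument()
_g.add_namespace("ex", "http://example.org/")
_g.entity("ex:a")
_g.entity("ex:b")
_g.wasDerivedFrom("ex:b", "ex:a")
_g.wasDerivedFrom("ex:b", "ex:a")  # duplicate relation
_unique = enforce_uniqueness_constraints(_g)
assert len(list(_unique.get_records(ProvRelation))) == 1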
def test_namespace_inheritance(self):
    prov_doc = ProvDocument()
    prov_doc.add_namespace('ex', 'http://www.example.org/')
    bundle = prov_doc.bundle('ex:bundle')
    e1 = bundle.entity('ex:e1')
    self.assertIsNotNone(e1.identifier, "e1's identifier is None!")
    self.do_tests(prov_doc)
def get_provdoc(format, infile):
    if format == "json":
        return ProvDocument.deserialize(infile)
    elif format == "xml":
        return ProvDocument.deserialize(infile, format='xml')
    else:
        print("Error: unsupported format (xml and json are supported)")
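# Usage sketch for get_provdoc (hedged: "provenance.xml" is a hypothetical
# input file; ProvDocument.deserialize also accepts an open file object).
doc = get_provdoc("xml", "provenance.xml")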
def test_default_namespace_inheritance(self):
    prov_doc = ProvDocument()
    prov_doc.set_default_namespace('http://www.example.org/')
    bundle = prov_doc.bundle('bundle')
    e1 = bundle.entity('e1')
    self.assertIsNotNone(e1.identifier, "e1's identifier is None!")
    self.assertRoundTripEquivalence(prov_doc)
def diff(diff: DiffModel, document: provo.ProvDocument):
    print_msg(" Exporting module dependency comparison")

    added, removed, replaced = diff.modules

    for module in added:  # type: Module
        _create_module_dep(module, document, suffix="_a")
        document.wasGeneratedBy("module{}_a".format(module.id),
                                "trial{}Execution".format(diff.trial2.id),
                                None,
                                "module{}AddDep".format(module.id),
                                [(provo.PROV_ROLE, "dependencyAddition")])

    for module in removed:  # type: Module
        _create_module_dep(module, document, suffix="_r")
        document.wasInvalidatedBy("module{}_r".format(module.id),
                                  "trial{}Execution".format(diff.trial2.id),
                                  None,
                                  "module{}RemoveDep".format(module.id),
                                  [(provo.PROV_ROLE, "dependencyRemoval")])

    for (mod_removed, mod_added) in replaced:  # type: Module
        _create_module_dep(mod_added, document, suffix="_a")
        document.wasGeneratedBy("module{}_a".format(mod_added.id),
                                "trial{}Execution".format(diff.trial2.id),
                                None,
                                "module{}AddDep".format(mod_added.id),
                                [(provo.PROV_ROLE, "dependencyAddition")])

        _create_module_dep(mod_removed, document, suffix="_r")
        document.wasInvalidatedBy("module{}_r".format(mod_removed.id),
                                  "trial{}Execution".format(diff.trial2.id),
                                  None,
                                  "module{}RemoveDep".format(mod_removed.id),
                                  [(provo.PROV_ROLE, "dependencyRemoval")])

        document.wasRevisionOf("module{}_a".format(mod_added.id),
                               "module{}_r".format(mod_removed.id),
                               "trial{}Execution".format(diff.trial2.id),
                               None, None, None,
                               [(provo.PROV_TYPE, "dependencyReplacement")])
def test_default_namespace_inheritance(self):
    prov_doc = ProvDocument()
    prov_doc.set_default_namespace("http://www.example.org/")
    bundle = prov_doc.bundle("bundle")
    e1 = bundle.entity("e1")
    self.assertIsNotNone(e1.identifier, "e1's identifier is None!")
    self.do_tests(prov_doc)
def parse(self):
    """
    Parse a result directory to extract the pieces of information to be
    stored in NIDM-Results.
    """
    # Methods: find_software, find_model_fitting, find_contrasts and
    # find_inferences should be defined in the children classes and return
    # a list of NIDM Objects as specified in the objects module

    # Object of type Software describing the neuroimaging software package
    # used for the analysis
    self.software = self._find_software()

    # List of objects of type ModelFitting describing the model fitting
    # step in NIDM-Results (main activity: Model Parameters Estimation)
    self.model_fittings = self._find_model_fitting()

    # Dictionary of (key, value) pairs where key is a tuple containing the
    # identifier of a ModelParametersEstimation object and a tuple of
    # identifiers of ParameterEstimateMap objects, and value is an object
    # of type Contrast describing the contrast estimation step in
    # NIDM-Results (main activity: Contrast Estimation)
    self.contrasts = self._find_contrasts()

    # Inference activity and entities
    # Dictionary of (key, value) pairs where key is the identifier of a
    # ContrastEstimation object and value is an object of type Inference
    # describing the inference step in NIDM-Results (main activity:
    # Inference)
    self.inferences = self._find_inferences()

    # Initialise prov document
    self.doc = ProvDocument()
    self._add_namespaces()
def get_document_as_prov(self, document_id=None):
    """
    Get a ProvDocument from the database based on the document_id

    :param document_id: The id as a string value
    :return: ProvDocument
    """
    if type(document_id) is not str:
        raise InvalidArgumentTypeException()

    raw_doc = self._adapter.get_document(document_id)

    # parse document
    prov_document = ProvDocument()
    for record in raw_doc.document.records:
        self._parse_record(prov_document, record)

    for bundle in raw_doc.bundles:
        prefixed_identifier = bundle.bundle_record.metadata[METADATA_KEY_IDENTIFIER]
        # remove prefix
        identifier = prefixed_identifier[len(PROV_API_BUNDLE_IDENTIFIER_PREFIX) - 2:]
        prov_bundle = prov_document.bundle(identifier=identifier)

        for record in bundle.records:
            self._parse_record(prov_bundle, record)
    return prov_document
def deriveDependency(self, aDO, aRO, derivedList):
    d1 = ProvDocument()  # d1 is now an empty provenance document
    d1.add_namespace("dt", "http://cs.ncl.ac.uk/dtsim/")

    e1 = d1.entity(DTns + aRO.id)  # deriving
    ag1 = d1.agent(DTns + str(aDO.id))
    for der in derivedList:
        # create provlet
        e2 = d1.entity(DTns + der.id)  # derived
        d1.wasAttributedTo(e2, ag1)
        d1.wasDerivedFrom(e2, e1)

        # update upstream pointer
        der.upstream = [(aRO, None)]  # aRO is upstream from der, with no activity

        # update downstream
        aRO.downstream.append((der, None))  # der is downstream from aRO, with no activity

    # update global graph
    e1 = pGlobal.entity(DTns + aRO.id)  # deriving
    ag1 = pGlobal.agent(DTns + str(aDO.id))
    for der in derivedList:
        e2 = pGlobal.entity(DTns + der.id)  # derived
        pGlobal.wasAttributedTo(e2, ag1)
        pGlobal.wasDerivedFrom(e2, e1)

    # trigger credit recomputation
    for der in derivedList:
        # aRO needs its credit updated with der.currentTotalCredit
        aCreditManager.addDerivationCredit(aRO, der.currentTotalCredit)

    # self.notify(d1)
    return d1
def primer():
    a = ProvDocument()
    script_path = os.path.dirname(os.path.abspath(__file__))
    with open(str(script_path) + "/output.json") as json_file:
        line = json_file.readline()
        a = a.deserialize(content=line)
    return a
def provRead(source, format=None):
    from prov.model import ProvDocument
    from prov.serializers import Registry

    Registry.load_serializers()
    serializers = Registry.serializers.keys()

    if format:
        try:
            ret = ProvDocument.deserialize(source=source, format=format.lower())
            return ret
        except Exception as e:
            log.error(e)
            raise TypeError(e)

    for format in serializers:
        source.seek(0)
        try:
            return ProvDocument.deserialize(source=source, format=format)
        except Exception:
            pass
    raise TypeError(
        "Could not read from the source. To get a proper "
        "error message, specify the format with the 'format' "
        "parameter."
    )
def add_parents(graph: ProvDocument, package: CommitModelPackage) -> ProvDocument:
    """Add link between commit activities and their parents."""
    commit = package.commit
    for parent in package.parent_commits:
        graph.activity(*parent)
        graph.activity(*commit)
        graph.wasInformedBy(commit.id, parent.id)
    return graph
def __init__(
    self,
    project,
    add_attributes=False,
    add_users=True,
    _add_project_namespaces=True,
    _iter_samples=True,
    _iter_project=True,
):
    """
    Constructs the W3C-PROV document for a project.

    :param Project project: instance of bioprov.src.Project.
    :param bool add_attributes: whether to add object attributes.
    :param bool add_users: whether to add users and environments.
    :param bool _add_project_namespaces:
    :param bool _iter_samples:
    :param bool _iter_project:
    """
    # Assert Project is good before constructing instance
    assert isinstance(project, Project), Warnings()["incorrect_type"](project, Project)
    self.ProvDocument = ProvDocument()
    self.project = project
    self.project.document = self.ProvDocument
    self._dot = prov_to_dot(self.ProvDocument)
    self._provn = self.ProvDocument.get_provn()
    self._entities = dict()
    self._activities = dict()
    self._agents = dict()
    self._user_bundles = dict()
    self._provstore_document = None

    # Don't add attributes if you plan on exporting to graphic format
    self.add_attributes = add_attributes

    # Set this before running Namespaces
    if add_users:
        self._create_envs_and_users = True
    else:
        self._create_envs_and_users = False

    # Default actions to create the document
    if _add_project_namespaces:
        self._add_project_namespaces()

    if self._create_envs_and_users:
        self._iter_envs_and_users()

    if _iter_project:
        self._iter_project()

    if _iter_samples:
        self._iter_samples()
def test_xsd_qnames(self):
    prov_doc = ProvDocument()
    ex = Namespace('ex', 'http://www.example.org')
    prov_doc.add_namespace(ex)
    an_xsd_qname = XSDQName(ex['a_value'])
    prov_doc.entity('ex:e1', {'prov:value': an_xsd_qname})

    self.assertPROVJSONRoundTripEquivalence(prov_doc)
def document_with_n_bundles_having_default_namespace(n):
    prov_doc = ProvDocument()
    prov_doc.add_namespace("ex", "http://www.example.org/")
    for i in range(n):
        x = str(i + 1)
        bundle = prov_doc.bundle("ex:bundle/" + x)
        bundle.set_default_namespace("http://www.example.org/default/" + x)
        bundle.entity("e")
    return prov_doc
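# Usage sketch: two bundles, each with its own default namespace, so the
# two "e" entities resolve to different URIs in the PROV-N rendering.
doc = document_with_n_bundles_having_default_namespace(2)
print(doc.get_provn())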
def add_commit(graph: ProvDocument, package: CommitModelPackage) -> ProvDocument:
    """Add commit activity, agents for author and committer,
    relations between agents and activity."""
    author, committer, commit = package.author, package.committer, package.commit
    graph.agent(*author)
    graph.agent(*committer)
    graph.activity(*commit)
    graph.wasAssociatedWith(commit.id, author.id)
    graph.wasAssociatedWith(commit.id, committer.id)
    return graph
def diff(diff: DiffModel, document: provo.ProvDocument):
    print_msg(" Exporting basic trial comparison information")
    _create_trial_info(document, diff.trial1, "_{}".format(diff.trial1.id))
    _create_trial_info(document, diff.trial2, "_{}".format(diff.trial2.id))

    document.wasInfluencedBy("trial{}Execution".format(diff.trial2.id),
                             "trial{}Execution".format(diff.trial1.id),
                             "trial{}ComparedTo{}".format(diff.trial2.id, diff.trial1.id),
                             [(provo.PROV_TYPE, "comparison")])
def collections():
    g = ProvDocument()
    ex = Namespace('ex', 'http://example.org/')

    c1 = g.collection(ex['c1'])
    e1 = g.entity('ex:e1')
    g.hadMember(c1, e1)

    return g
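# Usage sketch: the collection membership shows up as a hadMember
# statement in the PROV-N rendering.
print(collections().get_provn())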
def save_provenance(prov_doc: ProvDocument, filepath: Path):
    logging.debug("Saving provenance files:")
    logging.debug(" - %s", filepath)
    with filepath.open("w") as f:
        prov_doc.serialize(f)
    provn_content = prov_doc.get_provn()
    filepath = filepath.with_suffix(".provn")
    logging.debug(" - %s", filepath)
    with filepath.open("w") as f:
        f.write(provn_content)
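# Usage sketch for save_provenance (hedged: writes a hypothetical doc.json
# plus the sibling doc.provn into the current directory).
from pathlib import Path
from prov.model import ProvDocument

_doc = ProvDocument()
_doc.add_namespace("ex", "http://example.org/")
_doc.entity("ex:e1")
save_provenance(_doc, Path("doc.json"))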
def long_literals():
    g = ProvDocument()

    long_uri = "http://Lorem.ipsum/dolor/sit/amet/consectetur/adipiscing/elit/Quisque/vel/sollicitudin/felis/nec/venenatis/massa/Aenean/lectus/arcu/sagittis/sit/amet/nisl/nec/varius/eleifend/sem/In/hac/habitasse/platea/dictumst/Aliquam/eget/fermentum/enim/Curabitur/auctor/elit/non/ipsum/interdum/at/orci/aliquam/"
    ex = Namespace('ex', long_uri)
    g.add_namespace(ex)

    g.entity('ex:e1', {
        'prov:label': 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec pellentesque luctus nulla vel ullamcorper. Donec sit amet ligula sit amet lorem pretium rhoncus vel vel lorem. Sed at consequat metus, eget eleifend massa. Fusce a facilisis turpis. Lorem volutpat.'
    })

    return g
def get_esmvaltool_provenance():
    """Create an esmvaltool run activity."""
    provenance = ProvDocument()
    namespace = 'software'
    create_namespace(provenance, namespace)
    attributes = {}  # TODO: add dependencies with versions here
    activity = provenance.activity(namespace + ':esmvaltool==' + __version__,
                                   other_attributes=attributes)

    return activity
def start(self, workflow=False):
    from daops import __version__ as daops_version
    from flamingo import __version__ as flamingo_version

    self.doc = ProvDocument()
    # Declaring namespaces for various prefixes
    self.doc.set_default_namespace(uri="http://purl.org/roocs/prov#")
    self.doc.add_namespace("prov", uri="http://www.w3.org/ns/prov#")
    self.doc.add_namespace("provone",
                           uri="http://purl.dataone.org/provone/2015/01/15/ontology#")
    self.doc.add_namespace("dcterms", uri="http://purl.org/dc/terms/")
    # Define entities
    project_cds = self.doc.agent(
        ":copernicus_CDS",
        {
            "prov:type": "prov:Organization",
            "dcterms:title": "Copernicus Climate Data Store",
        },
    )
    self.sw_flamingo = self.doc.agent(
        ":flamingo",
        {
            "prov:type": "prov:SoftwareAgent",
            "dcterms:source": f"https://github.com/cedadev/flamingo/releases/tag/v{flamingo_version}",
        },
    )
    self.doc.wasAttributedTo(self.sw_flamingo, project_cds)
    self.sw_daops = self.doc.agent(
        ":daops",
        {
            "prov:type": "prov:SoftwareAgent",
            "dcterms:source": f"https://github.com/roocs/daops/releases/tag/v{daops_version}",
        },
    )
    # workflow
    if workflow is True:
        self.workflow = self.doc.entity(":workflow", {"prov:type": "provone:Workflow"})
        orchestrate = self.doc.activity(
            ":orchestrate",
            other_attributes={
                "prov:startedAtTime": "2020-11-26T09:15:00",
                "prov:endedAtTime": "2020-11-26T09:30:00",
            },
        )
        self.doc.wasAssociatedWith(orchestrate, agent=self.sw_flamingo, plan=self.workflow)
def __init__(self, version, out_dir, zipped=True):
    out_dirname = os.path.basename(out_dir)
    out_path = os.path.dirname(out_dir)

    # Create output path from output name
    self.zipped = zipped
    if not self.zipped:
        out_dirname = out_dirname + ".nidm"
    else:
        out_dirname = out_dirname + ".nidm.zip"
    out_dir = os.path.join(out_path, out_dirname)

    # Quit if output path already exists and user doesn't want to overwrite
    # it
    if os.path.exists(out_dir):
        msg = out_dir + " already exists, overwrite?"
        if not input("%s (y/N) " % msg).lower() == 'y':
            quit("Bye.")
        if os.path.isdir(out_dir):
            shutil.rmtree(out_dir)
        else:
            os.remove(out_dir)
    self.out_dir = out_dir

    if version == "dev":
        self.version = {
            'major': 10000,
            'minor': 0,
            'revision': 0,
            'num': version
        }
    else:
        major, minor, revision = version.split(".")
        if "-rc" in revision:
            revision, rc = revision.split("-rc")
        else:
            rc = -1
        self.version = {
            'major': int(major),
            'minor': int(minor),
            'revision': int(revision),
            'rc': int(rc),
            'num': version
        }

    # Initialise prov document
    self.doc = ProvDocument()
    self._add_namespaces()

    # A temp directory that will contain the exported data
    self.export_dir = tempfile.mkdtemp(prefix="nidm-", dir=out_path)

    self.prepend_path = ''
def test_get_element_invalid(self):
    """
    Test get element with error
    """
    with self.assertRaises(InvalidArgumentTypeException):
        self.provapi.get_element(None)

    with self.assertRaises(NotFoundException):
        doc = ProvDocument()
        name = doc.valid_qualified_name("prov:Some unused name")
        self.provapi.get_element(name)
def prov_db_unknown_prov_typ_example():
    doc = ProvDocument()
    doc.add_namespace("ex", "https://example.com")
    doc.entity(identifier="ex:Entity1")
    doc.entity(identifier="ex:Entity2")
    doc.influence(influencee="ex:Entity1", influencer="ex:Entity2")
    return doc
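# Usage sketch (hedged: relies on ProvDocument equality semantics): the
# fixture should round-trip cleanly through PROV-JSON.
_doc = prov_db_unknown_prov_typ_example()
assert ProvDocument.deserialize(content=_doc.serialize()) == _doc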
def test_loading_all_json(self):
    # self.assertFalse(fails, 'Failed to load/round-trip %d JSON files (%s)' % (len(fails), ', '.join(fails)))

    # Code for debugging the failed tests
    for filename in self.fails:
        # Reload the failed files
        filepath = self.json_path + filename
        # os.rename(json_path + filename, json_path + filename + '-fail')
        with open(filepath) as json_file:
            logger.info("Loading %s...", filepath)
            g1 = ProvDocument.deserialize(json_file)
            json_str = g1.serialize(indent=4)
            g2 = ProvDocument.deserialize(content=json_str)
            self.assertEqual(g1, g2,
                             'Round-trip JSON encoding/decoding failed: %s.' % filename)
def base_connector_relation_parameter_example():
    doc = ProvDocument()
    doc.add_namespace("ex", "http://example.com")
    doc.add_namespace("custom", "http://custom.com")

    namespaces = dict()
    namespaces.update({"ex": "http://example.com"})
    namespaces.update({"custom": "http://custom.com"})

    type_map = dict()
    type_map.update({"int value": "int"})
    type_map.update({"date value": "xsd:datetime"})

    metadata = dict()
    metadata.update({METADATA_KEY_PROV_TYPE: PROV_RECORD_IDS_MAP["mentionOf"]})
    metadata.update({METADATA_KEY_IDENTIFIER: "identifier for the relation"})
    metadata.update({METADATA_KEY_TYPE_MAP: type_map})
    metadata.update({METADATA_KEY_NAMESPACES: namespaces})

    return_data = dict()
    return_data.update({"attributes": attributes_dict_example()})
    return_data.update({"metadata": metadata})
    return_data.update({"from_node": doc.valid_qualified_name("ex:Yoda")})
    return_data.update({"to_node": doc.valid_qualified_name("ex:Luke Skywalker")})
    return_data.update({"doc": doc})
    return return_data
class ProjectProvenance:
    def __init__(self, database_helper, full_provenance=False):
        """
        Initializes the provenance for the mjclawar_rarshad project

        Parameters
        ----------
        database_helper: DatabaseHelper
        full_provenance: bool

        Returns
        -------
        """
        assert isinstance(database_helper, DatabaseHelper)
        self.database_helper = database_helper

        if full_provenance:
            self.prov_doc = ProvDocument.deserialize(dir_info.plan_json)
        else:
            self.prov_doc = ProvDocument()
        self.prov_doc.add_namespace(mcras.BDP_NAMESPACE.name, mcras.BDP_NAMESPACE.link)
        self.prov_doc.add_namespace(mcras.ALG_NAMESPACE.name, mcras.ALG_NAMESPACE.link)
        self.prov_doc.add_namespace(mcras.DAT_NAMESPACE.name, mcras.DAT_NAMESPACE.link)
        self.prov_doc.add_namespace(mcras.LOG_NAMESPACE.name, mcras.LOG_NAMESPACE.link)
        self.prov_doc.add_namespace(mcras.ONT_NAMESPACE.name, mcras.ONT_NAMESPACE.link)

    def write_provenance_json(self):
        self.prov_doc.serialize(dir_info.plan_json)
def test_references(tmp_path, monkeypatch):
    """Test1: references are replaced with bibtex."""
    # Create fake provenance
    provenance = ProvDocument()
    provenance.add_namespace('file', uri=ESMVALTOOL_URI_PREFIX + 'file')
    provenance.add_namespace('attribute', uri=ESMVALTOOL_URI_PREFIX + 'attribute')
    filename = str(tmp_path / 'output.nc')
    attributes = {
        'attribute:references': 'test_tag',
        'attribute:script_file': 'diagnostics.py'
    }
    provenance.entity('file:' + filename, attributes)

    # Create fake bibtex references tag file
    references_path = tmp_path / 'references'
    references_path.mkdir()
    monkeypatch.setattr(esmvalcore._citation.DIAGNOSTICS, 'path', tmp_path)
    fake_bibtex_file = references_path / 'test_tag.bibtex'
    fake_bibtex = "Fake bibtex file content\n"
    fake_bibtex_file.write_text(fake_bibtex)

    _write_citation_files(filename, provenance)
    citation_file = tmp_path / 'output_citation.bibtex'
    citation = citation_file.read_text()
    assert citation == '\n'.join([ESMVALTOOL_PAPER, fake_bibtex])
def test_xsd_qnames(self):
    prov_doc = ProvDocument()
    ex = Namespace('ex', 'http://www.example.org/')
    prov_doc.add_namespace(ex)
    ex1 = Namespace('ex1', 'http://www.example1.org/')
    # ex1 is not added to the document

    an_xsd_qname = XSDQName(ex['a_value'])
    another_xsd_qname = XSDQName(ex1['another_value'])

    e1 = prov_doc.entity('ex:e1', {'prov:value': an_xsd_qname,
                                   'prov:type': another_xsd_qname})
    for _, attr_value in e1.attributes:
        self.assertIsInstance(attr_value, XSDQName)

    self.assertRoundTripEquivalence(prov_doc)
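# Standalone sketch of the behaviour exercised above (hedged: mirrors the
# test rather than library documentation, and assumes Namespace and
# XSDQName are imported as in the surrounding test module). An XSDQName
# attribute is serialized as an xsd:QName-typed literal rather than as a
# PROV qualified name.
_doc = ProvDocument()
_ex = Namespace('ex', 'http://www.example.org/')
_doc.add_namespace(_ex)
_doc.entity('ex:e1', {'prov:value': XSDQName(_ex['a_value'])})
print(_doc.serialize(indent=2))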
def setUp(self):
    self.json_path = os.path.dirname(os.path.abspath(__file__)) + '/json/'
    filenames = os.listdir(self.json_path)
    self.fails = []
    for filename in filenames:
        if filename.endswith('.json'):
            with open(self.json_path + filename) as json_file:
                try:
                    g1 = ProvDocument.deserialize(json_file)
                    json_str = g1.serialize(indent=4)
                    g2 = ProvDocument.deserialize(content=json_str)
                    self.assertEqual(g1, g2,
                                     'Round-trip JSON encoding/decoding failed: %s.' % filename)
                except Exception:
                    self.fails.append(filename)
def test_cmip6_data_citation_url(tmp_path):
    """Test3: CMIP6 info_url is retrieved from ES-DOC."""
    # Create fake provenance
    provenance = ProvDocument()
    provenance.add_namespace('file', uri=ESMVALTOOL_URI_PREFIX + 'file')
    provenance.add_namespace('attribute', uri=ESMVALTOOL_URI_PREFIX + 'attribute')
    attributes = {
        'attribute:mip_era': 'CMIP6',
        'attribute:activity_id': 'activity',
        'attribute:institution_id': 'institution',
        'attribute:source_id': 'source',
        'attribute:experiment_id': 'experiment',
    }
    filename = str(tmp_path / 'output.nc')
    provenance.entity('file:' + filename, attributes)

    _write_citation_files(filename, provenance)
    citation_url = tmp_path / 'output_data_citation_info.txt'

    # Create fake info url
    fake_url_prefix = '.'.join(attributes.values())
    text = '\n'.join([
        "Follow the links below to find more information about CMIP6 data:",
        f"- {CMIP6_URL_STEM}/cmip6?input={fake_url_prefix}",
        '',
    ])
    assert citation_url.read_text() == text
def test_bundle_update_simple(self):
    doc = ProvDocument()
    doc.set_default_namespace(EX_URI)

    b1 = doc.bundle('b1')
    b1.entity('e')

    b2 = doc.bundle('b2')
    b2.entity('e')

    self.assertRaises(ProvException, lambda: b1.update(1))
    self.assertRaises(ProvException, lambda: b1.update(doc))

    b1.update(b2)
    self.assertEqual(len(b1.get_records()), 2)
def main(auth_json_path, full_provenance=False):
    with open(auth_json_path, 'r') as f:
        auth_json = json.load(f)
    api_token = auth_json['services']['cityofbostondataportal']['token']
    username = '******'  # auth_json['services']['cityofbostondataportal']['username']
    mongo_pass = '******'  # auth_json['services']['cityofbostondataportal']['username']
    database_helper = database_helpers.DatabaseHelper(username=username, password=mongo_pass)
    bdp_api = bdp_query.BDPQuery(api_token=api_token)
    if full_provenance:
        with open(plan_json, 'w') as f:
            f.write(json.dumps({}))
    setup_crime_incidents(database_helper, bdp_api, full_provenance=full_provenance)
    setup_property_assessment(database_helper, bdp_api, full_provenance=full_provenance)
    setup_boston_public_schools(database_helper, bdp_api, full_provenance=full_provenance)
    setup_hospital_locations(database_helper, bdp_api, full_provenance=full_provenance)
    setup_crime_centroids(database_helper, full_provenance=full_provenance)
    setup_hospital_distances(database_helper, full_provenance=full_provenance)
    setup_crime_knn(database_helper, full_provenance=full_provenance)
    setup_home_value_model(database_helper, full_provenance=full_provenance)
    setup_hospital_scatter(database_helper, full_provenance=full_provenance)
    setup_school_distances(database_helper, full_provenance=full_provenance)
    setup_school_scatter(database_helper, full_provenance=full_provenance)
    if full_provenance:
        with open(plan_json, 'r') as f:
            prov_doc = ProvDocument.deserialize(f)
        dot = prov_to_dot(prov_doc)
        dot.write_svg(prov_svg)
def datatypes():
    g = ProvDocument()
    ex = Namespace('ex', 'http://example.org/')
    g.add_namespace(ex)

    attributes = {
        'ex:int': 100,
        'ex:float': 100.123456,
        'ex:long': 123456789000,
        'ex:bool': True,
        'ex:str': 'Some string',
        'ex:unicode': u'Some unicode string with accents: Huỳnh Trung Đông',
        'ex:timedate': datetime.datetime(2012, 12, 12, 14, 7, 48),
        'ex:intstr': Literal("PROV Internationalized string",
                             PROV["InternationalizedString"], "en"),
    }
    multiline = """Line1
Line2
Line3"""
    attributes['ex:multi-line'] = multiline
    g.entity('ex:e1', attributes)
    return g
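# Usage sketch: the PROV-N rendering shows how each attribute value is
# typed (xsd:int, xsd:double, xsd:dateTime, language-tagged literal, ...).
print(datatypes().get_provn())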
def prov(self, format='json', filename=None):
    if self.prov_url is None:
        raise APIException('no provenance information found')
    response = self.adama.utils.request(self.prov_url, format=format)
    if format in ('json', 'sources'):
        return response.json()
    elif format == 'prov-n':
        return response.text
    elif format == 'prov':
        return ProvDocument.deserialize(content=json.dumps(response.json()))
    elif format == 'png':
        return png(response.content, filename)
def get_bundle(self, document_id, bundle_id, prov_format=ProvDocument):
    if prov_format == ProvDocument:
        extension = 'json'
    else:
        extension = prov_format
    r = self._request('get',
                      "/documents/%i/bundles/%i.%s" % (document_id, bundle_id, extension),
                      headers=self.headers)
    if prov_format == ProvDocument:
        return ProvDocument.deserialize(content=r.content)
    else:
        return r.content
def get_document(self, doc_id, format=None, flattened=False, view=None):
    """Returns a ProvBundle object of the document with the ID provided
    or raises ApiNotFoundError"""
    extension = format if format is not None else 'json'
    view = "/views/%s" % view if view in ['data', 'process', 'responsibility'] else ""
    url = "documents/%d%s%s.%s" % (doc_id, "/flattened" if flattened else "",
                                   view, extension)
    response = self.request(url, raw=True)
    if format is None:
        # Try to decode it as a ProvDocument
        result = ProvDocument.deserialize(content=response)
    else:
        # return the raw response
        result = response
    return result
def testAllExamples(self):
    num_graphs = len(examples.tests)
    logger.info('PROV-JSON round-trip testing %d example provenance graphs', num_graphs)
    counter = 0
    for name, graph in examples.tests:
        counter += 1
        logger.info('%d. Testing the %s example', counter, name)
        g1 = graph()
        logger.debug('Original graph in PROV-N\n%s', g1.get_provn())
        # json_str = g1.get_provjson(indent=4)
        json_str = g1.serialize(indent=4)
        logger.debug('Original graph in PROV-JSON\n%s', json_str)
        g2 = ProvDocument.deserialize(content=json_str)
        logger.debug('Graph decoded from PROV-JSON\n%s', g2.get_provn())
        self.assertEqual(g1, g2,
                         'Round-trip JSON encoding/decoding failed: %s.' % name)
def test_unifying(self):
    # This is a very trivial test just to exercise the unified() function
    # TODO: Create a proper unification test
    json_path = os.path.dirname(os.path.abspath(__file__)) + '/unification/'
    filenames = os.listdir(json_path)
    for filename in filenames:
        if not filename.endswith('.json'):
            continue
        filepath = json_path + filename
        with open(filepath) as json_file:
            logger.info('Testing unifying: %s', filename)
            logger.debug("Loading %s...", filepath)
            document = ProvDocument.deserialize(json_file)
            flattened = document.flattened()
            unified = flattened.unified()
            self.assertLess(len(unified.get_records()),
                            len(flattened.get_records()))
def test_decoding_unicode_value(self):
    unicode_char = u'\u2019'
    json_content = u'''{
    "prefix": {
        "ex": "http://www.example.org"
    },
    "entity": {
        "ex:unicode_char": {
            "prov:label": "%s"
        }
    }
}''' % unicode_char
    prov_doc = ProvDocument.deserialize(content=json_content, format='json')
    e1 = prov_doc.get_record('ex:unicode_char')[0]
    self.assertIn(unicode_char, e1.get_attribute('prov:label'))
def test_decoding_unicode_value(self):
    unicode_char = u'\u2019'
    rdf_content = u'''
@prefix ex: <http://www.example.org/> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

ex:unicode_char a prov:Entity ;
    rdfs:label "%s"^^xsd:string .
''' % unicode_char
    prov_doc = ProvDocument.deserialize(content=rdf_content,
                                        format='rdf', rdf_format='turtle')
    e1 = prov_doc.get_record('ex:unicode_char')[0]
    self.assertIn(unicode_char, e1.get_attribute('prov:label'))
def assertRoundTripEquivalence(self, prov_doc, msg=None):
    if self.FORMAT is None:
        # This is a dummy test, just return
        return

    with io.BytesIO() as stream:
        prov_doc.serialize(destination=stream, format=self.FORMAT, indent=4)
        stream.seek(0, 0)

        prov_doc_new = ProvDocument.deserialize(source=stream, format=self.FORMAT)
        stream.seek(0, 0)
        # Assume UTF-8 encoding which is forced by the particular
        # PROV XML implementation and should also work for the PROV
        # JSON implementation.
        msg_extra = "'%s' serialization content:\n%s" % (self.FORMAT,
                                                         stream.read().decode("utf-8"))
        msg = "\n".join((msg, msg_extra)) if msg else msg_extra
        self.assertEqual(prov_doc, prov_doc_new, msg)
def generateProvlet(self, aDO, aRO):
    # create provlet
    d1 = ProvDocument()  # d1 is now an empty provenance document
    d1.add_namespace("dt", "http://cs.ncl.ac.uk/dtsim/")

    e1 = d1.entity(DTns + aRO.id)
    ag1 = d1.agent(DTns + str(aDO.id))
    d1.wasAttributedTo(e1, ag1)

    # update global graph
    e1 = pGlobal.entity(DTns + aRO.id)
    ag1 = pGlobal.agent(DTns + str(aDO.id))
    pGlobal.wasAttributedTo(e1, ag1)

    # self.notify(d1)
    return d1
def setUp(self):
    self.export_dir = os.path.join(TEST_FOLDER, 'nidm')
    if not os.path.isdir(self.export_dir):
        os.mkdir(self.export_dir)

    # Retrieve owl file for NIDM-Results
    owl_file = os.path.join(TERM_RESULTS_DIR, 'nidm-results.owl')
    assert owl_file
    self.owl = OwlReader(owl_file)

    self.doc = ProvDocument()
    # self.bundle = ProvBundle(identifier=NIIRI[software_lc+'_results_id'])

    self.provn_file = os.path.join(self.export_dir, 'unit_test.provn')

    namespaces_file = os.path.join(TERM_RESULTS_DIR, "templates",
                                   "Namespaces.txt")
    namespaces_fid = open(namespaces_file)
    self.prefixes = namespaces_fid.read()
    namespaces_fid.close()

    self.to_delete_files = [self.provn_file]
    self.gt_ttl_files = list()
def to_prov(obj, namespace, service):
    """
    :type obj: dict
    :rtype: prov.model.ProvDocument
    """
    g = ProvDocument()
    ap = Namespace('aip', 'https://araport.org/provenance/')
    g.add_namespace("dcterms", "http://purl.org/dc/terms/")
    g.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")

    vaughn = g.agent(ap['matthew_vaughn'], {
        'prov:type': PROV["Person"],
        'foaf:givenName': "Matthew Vaughn",
        'foaf:mbox': "<mailto:[email protected]>"
    })
    # Hard coded for now
    walter = g.agent(ap['walter_moreira'], {
        'prov:type': PROV["Person"],
        'foaf:givenName': "Walter Moreira",
        'foaf:mbox': "<mailto:[email protected]>"
    })
    utexas = g.agent(ap['university_of_texas'], {
        'prov:type': PROV["Organization"],
        'foaf:givenName': "University of Texas at Austin"
    })
    g.actedOnBehalfOf(walter, utexas)
    g.actedOnBehalfOf(vaughn, utexas)

    adama_platform = g.agent(ap['adama_platform'], {
        'dcterms:title': "ADAMA",
        'dcterms:description': "Araport Data And Microservices API",
        'dcterms:language': "en-US",
        'dcterms:identifier': "https://api.araport.org/community/v0.3/",
        'dcterms:updated': "2015-04-17T09:44:56"
    })
    g.wasGeneratedBy(adama_platform, walter)
    g.wasGeneratedBy(adama_platform, vaughn)

    iden = service_iden(namespace, service)
    srv = service_store[iden]['service']
    adama_microservice = g.agent(ap[iden], {
        'dcterms:title': srv.name.title(),
        'dcterms:description': srv.description,
        'dcterms:language': "en-US",
        'dcterms:identifier': api_url_for('service',
                                          namespace=namespace,
                                          service=service),
        'dcterms:source': srv.git_repository
    })
    g.used(adama_microservice, adama_platform, datetime.datetime.now())

    for author in getattr(srv, 'authors', []):
        try:
            author_name = author['name']
            author_email = author['email']
        except KeyError:
            raise APIException('name and email are required in author field')
        author_agent = g.agent(ap[slugify(author_name)], {
            'prov:type': PROV['Person'],
            'foaf:givenName': author_name,
            'foaf:mbox': '<mailto:{}>'.format(author_email)
        })
        sponsor_name = author.get('sponsor_organization_name', None)
        if sponsor_name:
            sponsor_agent = g.agent(ap[slugify(sponsor_name)], {
                'prov:type': PROV['Organization'],
                'foaf:givenName': sponsor_name,
                'dcterms:identifier': author.get('sponsor_uri', '')
            })
            g.actedOnBehalfOf(author_agent, sponsor_agent)
        g.wasGeneratedBy(adama_microservice, author_agent, datetime.datetime.now())

    sources_entities = process_sources(srv.sources, g, ap)
    for src in sources_entities:
        g.used(adama_microservice, src, datetime.datetime.now())

    response = g.entity(ap['adama_response'])
    g.wasGeneratedBy(response, ap[srv.type], datetime.datetime.now())
    g.used(ap[srv.type], adama_microservice, datetime.datetime.now())

    return g
def job2prov(job):
    """
    Create ProvDocument based on job description

    :param job: UWS job
    :return: ProvDocument
    """
    # job.jdl.content = {
    #     'description': description,
    #     'parameters': parameters,
    #     'results': results,
    #     'executionduration': execdur,
    #     'quote': quote
    # }
    # parameters[pname] = {
    #     'type': p.get('type'),
    #     'required': p.get('required'),
    #     'default': p.get('default'),
    #     'description': list(p)[0].text,
    # }
    # results[r.get('value')] = {
    #     'mediaType': r.get('mediaType'),
    #     'default': r.get('default'),
    #     'description': list(r)[0].text,
    # }

    pdoc = ProvDocument()
    # Declaring namespaces for various prefixes used in the example
    pdoc.add_namespace('prov', 'http://www.w3.org/ns/prov#')
    pdoc.add_namespace('voprov', 'http://www.ivoa.net/ns/voprov#')
    pdoc.add_namespace('cta', 'http://www.cta-observatory.org#')
    pdoc.add_namespace('uwsdata', 'https://voparis-uws-test.obspm.fr/rest/'
                       + job.jobname + '/' + job.jobid + '/')
    pdoc.add_namespace('ctajobs', 'http://www.cta-observatory.org#')
    # Adding an activity
    ctbin = pdoc.activity('ctajobs:' + job.jobname, job.start_time, job.end_time)
    # TODO: add job description, version, url, ...
    # Agent
    pdoc.agent('cta:consortium', other_attributes={'prov:type': "Organization"})
    pdoc.wasAssociatedWith(ctbin, 'cta:consortium')
    # Entities, in and out with relations
    e_in = []
    for pname, pdict in job.jdl.content['parameters'].items():
        # if pname.startswith('in'):
        if any(x in pdict['type'] for x in ['file', 'xs:anyURI']):
            e_in.append(pdoc.entity('uwsdata:parameters/' + pname))
            # TODO: use publisher_did? add prov attributes, add voprov attributes?
            ctbin.used(e_in[-1])
    e_out = []
    for rname, rdict in job.jdl.content['results'].items():
        e_out.append(pdoc.entity('uwsdata:results/' + rname))
        # TODO: use publisher_did? add prov attributes, add voprov attributes?
        e_out[-1].wasGeneratedBy(ctbin)
        for e in e_in:
            e_out[-1].wasDerivedFrom(e)
    return pdoc