def test_get_provenance_document_for_id(tmpdir):
    asdf_filename = os.path.join(tmpdir.strpath, "test.h5")
    data_set = ASDFDataSet(asdf_filename)
    filename = os.path.join(data_dir, "example_schematic_processing_chain.xml")
    doc = prov.read(filename)
    data_set.provenance["test_provenance"] = doc

    assert data_set.provenance.get_provenance_document_for_id(
        '{http://seisprov.org/seis_prov/0.1/#}sp002_dt_f87sf7sf78') == \
        {"name": "test_provenance", "document": doc}
    assert data_set.provenance.get_provenance_document_for_id(
        '{http://seisprov.org/seis_prov/0.1/#}sp004_lp_f87sf7sf78') == \
        {"name": "test_provenance", "document": doc}

    # Id not found.
    with pytest.raises(ASDFValueError) as err:
        data_set.provenance.get_provenance_document_for_id(
            '{http://seisprov.org/seis_prov/0.1/#}bogus_id')
    assert err.value.args[0] == (
        "Document containing id "
        "'{http://seisprov.org/seis_prov/0.1/#}bogus_id'"
        " not found in the data set.")

    # Not a qualified id.
    with pytest.raises(ASDFValueError) as err:
        data_set.provenance.get_provenance_document_for_id("bla")
    assert err.value.args[0] == "Not a valid qualified name."

    data_set.__del__()
def __validate_seis_prov(file_object):
    """
    Core validation function.

    :param file_object: Open file or file-like object.
    """
    original_position = file_object.tell()

    # Step 1: Check and read the JSON schema.
    json_schema = _check_json_schema()

    # Determine the file type.
    is_json = _is_json_file(file_object)
    file_object.seek(original_position, 0)
    is_xml = _is_xml_file(file_object)
    file_object.seek(original_position, 0)

    if is_json is False and is_xml is False:
        _log_error("File is neither a valid JSON nor a valid XML file.")
    elif is_json is True and is_xml is True:
        # Should not happen for obvious reasons...
        raise NotImplementedError
    elif is_json:
        fileformat = "json"
    elif is_xml:
        fileformat = "xml"

    # Step 2: Attempt to read the provenance file with the prov Python
    # package.
    try:
        doc = prov.read(file_object, format=fileformat)
    except Exception as e:
        _log_error("Could not parse the file with the prov Python library "
                   "due to the following PROV error message: %s" % repr(e))

    # Step 3: Validate against the PROV XML XSD scheme.
    _validate_against_xsd_scheme(doc)

    # Find the SEIS-PROV namespace.
    for ns in doc.namespaces:
        if ns.uri == SEIS_PROV_NAMESPACE:
            break
    else:
        _log_error("SEIS-PROV namespace not found in document!")

    # Step 4: Custom validation against the JSON schema. Validate the root
    # document as well as any bundles.
    seis_prov_ids = _validate_prov_bundle(doc, json_schema, ns=ns)
    for bundle in doc.bundles:
        seis_prov_ids.extend(
            _validate_prov_bundle(bundle, json_schema, ns=ns))

    if not seis_prov_ids:
        _log_warning("The document is a valid W3C PROV document but not a "
                     "single SEIS-PROV record has been found.")

    # Find duplicate ids.
    duplicates = set(i for i in seis_prov_ids if seis_prov_ids.count(i) > 1)
    if duplicates:
        _log_error("One or more ids have been used more than once: %s" %
                   ", ".join("'%s'" % _i for _i in duplicates))
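# A minimal usage sketch for the validator above (an assumption, not part of
# the original module): the file object must support tell()/seek(), so a
# plain binary file handle works. ``validate_file`` is a hypothetical
# convenience wrapper introduced here for illustration.
def validate_file(filename):
    with open(filename, "rb") as f:
        __validate_seis_prov(f)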
def init():
    with open('plan.json', 'r') as f:
        doc = prov.read(f)
    # The scripts in <folder>/<filename> format.
    doc.add_namespace(
        'alg',
        'https://data-mechanics.s3.amazonaws.com/linshan_luoty/algorithm/')
    # The data sets in <user>/<collection> format.
    doc.add_namespace(
        'dat', 'https://data-mechanics.s3.amazonaws.com/linshan_luoty/data/')
    # 'Extension', 'DataResource', 'DataSet', 'Retrieval', 'Query', or
    # 'Computation'.
    doc.add_namespace(
        'ont', 'https://data-mechanics.s3.amazonaws.com/ontology#')
    # The event log.
    doc.add_namespace('log', 'https://data-mechanics.s3.amazonaws.com/log#')
    doc.add_namespace('bdp', 'https://data.cityofboston.gov/resource/')
    return doc
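# A short usage sketch (an assumption, not from the original source): extend
# the document returned by init() with a single entity and write it back.
# The identifier 'dat:linshan_luoty#example' and the attribute values are
# made up for illustration.
def example_usage():
    doc = init()
    doc.entity('dat:linshan_luoty#example', {
        'prov:label': 'Example data set',
        'ont:Extension': 'json',
    })
    with open('plan.json', 'w') as f:
        f.write(doc.serialize())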
def test_get_ids_from_prov_document():
    filename = os.path.join(data_dir, "example_schematic_processing_chain.xml")
    doc = prov.read(filename, format="xml")
    ids = get_all_ids_for_prov_document(doc)
    assert ids == [
        '{http://seisprov.org/seis_prov/0.1/#}sp001_wf_a34j4didj3',
        '{http://seisprov.org/seis_prov/0.1/#}sp002_dt_f87sf7sf78',
        '{http://seisprov.org/seis_prov/0.1/#}sp003_wf_js83hf34aj',
        '{http://seisprov.org/seis_prov/0.1/#}sp004_lp_f87sf7sf78',
        '{http://seisprov.org/seis_prov/0.1/#}sp005_wf_378f8ks8kd',
        '{http://seisprov.org/seis_prov/0.1/#}sp006_dc_f87sf7sf78',
        '{http://seisprov.org/seis_prov/0.1/#}sp007_wf_jude89du8l']
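# For orientation only, a rough sketch of what an id-collecting helper such
# as get_all_ids_for_prov_document might look like (an assumption, not the
# actual pyasdf implementation). It walks every record in the document and
# renders each qualified name in the '{namespace-uri}localpart' form the
# test above expects.
def collect_record_ids(doc):
    ids = []
    for record in doc.get_records():
        identifier = record.identifier
        if identifier is not None:
            ids.append("{%s}%s" % (identifier.namespace.uri,
                                   identifier.localpart))
    return sorted(ids)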
def test_provenance_list_command(tmpdir):
    asdf_filename = os.path.join(tmpdir.strpath, "test.h5")
    data_set = ASDFDataSet(asdf_filename)
    filename = os.path.join(data_dir, "example_schematic_processing_chain.xml")

    # Add it as a document.
    doc = prov.read(filename, format="xml")
    data_set.add_provenance_document(doc, name="test_provenance")
    assert data_set.provenance.list() == ["test_provenance"]
def test_wps_subset_cmip6_prov():
    client = client_for(Service(processes=[Subset()], cfgfiles=[PYWPS_CFG]))
    datainputs = "collection=CMIP6.CMIP.IPSL.IPSL-CM6A-LR.historical.r1i1p1f1.Amon.rlds.gr.v20180803"
    datainputs += ";time=1860-01-01/1900-12-30;area=1,1,300,89"
    resp = client.get(
        "?service=WPS&request=Execute&version=1.0.0&identifier=subset&datainputs={}".format(
            datainputs))
    assert_response_success(resp)
    # Strip the "file://" scheme to get a local path for prov.read().
    doc = prov.read(get_output(resp.xml)["prov"][len("file://"):])
    assert ('roocs:time="1860-01-01/1900-12-30", roocs:area="1,1,300,89"'
            in doc.get_provn())
    assert (
        "wasDerivedFrom(roocs:rlds_Amon_IPSL-CM6A-LR_historical_r1i1p1f1_gr_18600116-19001216.nc, roocs:CMIP6.CMIP.IPSL.IPSL-CM6A-LR.historical.r1i1p1f1.Amon.rlds.gr.v20180803"  # noqa
        in doc.get_provn())
def test_prov(path):
    def drop_blank(lines):
        return set(line for line in lines if line.strip())

    provenance = prov.read(path, format='json')
    output = StringIO()
    # In production, we won't output PROV-N, but for tests, it's easy for a
    # human to read, and easy to compare.
    serializer = prov.serializers.provn.ProvNSerializer(provenance)
    serializer.serialize(output)
    actual = output.getvalue()
    actual_lines = drop_blank(actual.split('\n'))
    with open(Path(path).parent.parent / 'expected.prov') as prov_fixture:
        expected_lines = drop_blank(prov_fixture.read().split('\n'))
    assert actual_lines == expected_lines
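# Comparing sets of non-blank PROV-N lines makes the test above insensitive
# to record order and blank-line differences. A companion sketch
# (hypothetical, not part of the original tests) for regenerating the
# fixture with the same serializer:
def regenerate_fixture(path):
    from pathlib import Path
    provenance = prov.read(path, format='json')
    with open(Path(path).parent.parent / 'expected.prov', 'w') as f:
        prov.serializers.provn.ProvNSerializer(provenance).serialize(f)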
def test_adding_a_provenance_record(tmpdir):
    """
    Tests adding a provenance record.
    """
    asdf_filename = os.path.join(tmpdir.strpath, "test.h5")
    data_set = ASDFDataSet(asdf_filename)
    filename = os.path.join(data_dir, "example_schematic_processing_chain.xml")

    # Add it as a document.
    doc = prov.read(filename, format="xml")
    data_set.add_provenance_document(doc, name="test_provenance")
    del data_set

    # Read it again.
    data_set = ASDFDataSet(asdf_filename)
    assert data_set.provenance.test_provenance == doc
def test_wps_subset_cmip6_multiple_files_prov():
    client = client_for(Service(processes=[Subset()], cfgfiles=[PYWPS_CFG]))
    datainputs = "collection=CMIP6.CMIP.MPI-M.MPI-ESM1-2-HR.historical.r1i1p1f1.SImon.siconc.gn.latest"
    datainputs += ";time=1850-01-01/2013-12-30"
    resp = client.get(
        "?service=WPS&request=Execute&version=1.0.0&identifier=subset&datainputs={}".format(
            datainputs))
    assert_response_success(resp)
    # Strip the "file://" scheme to get a local path for prov.read().
    doc = prov.read(get_output(resp.xml)["prov"][len("file://"):])
    print(doc.get_provn())
    assert 'roocs:time="1850-01-01/2013-12-30"' in doc.get_provn()
    assert (
        "wasDerivedFrom(roocs:siconc_SImon_MPI-ESM1-2-HR_historical_r1i1p1f1_gn_18500116-18960316.nc, roocs:CMIP6.CMIP.MPI-M.MPI-ESM1-2-HR.historical.r1i1p1f1.SImon.siconc.gn.latest"  # noqa
        in doc.get_provn())
    assert (
        "wasDerivedFrom(roocs:siconc_SImon_MPI-ESM1-2-HR_historical_r1i1p1f1_gn_18960416-19420616.nc, roocs:CMIP6.CMIP.MPI-M.MPI-ESM1-2-HR.historical.r1i1p1f1.SImon.siconc.gn.latest"  # noqa
        in doc.get_provn())
def test_trying_to_add_provenance_record_with_invalid_name_fails(tmpdir):
    """
    The name must be valid according to a particular regular expression.
    """
    asdf_filename = os.path.join(tmpdir.strpath, "test.h5")
    data_set = ASDFDataSet(asdf_filename)
    filename = os.path.join(data_dir, "example_schematic_processing_chain.xml")

    # First try adding it as a prov document.
    doc = prov.read(filename, format="xml")
    with pytest.raises(ASDFValueError) as err:
        data_set.add_provenance_document(doc, name="a-b-c")

    assert err.value.args[0] == (
        "Name 'a-b-c' is invalid. It must validate against the regular "
        "expression '^[0-9a-z][0-9a-z_]*[0-9a-z]$'.")

    # Must sometimes be called to get around some bugs.
    data_set.__del__()
def test_provenance_dictionary_behaviour(tmpdir):
    asdf_filename = os.path.join(tmpdir.strpath, "test.h5")
    data_set = ASDFDataSet(asdf_filename)
    filename = os.path.join(data_dir, "example_schematic_processing_chain.xml")

    # Add it as a document.
    doc = prov.read(filename, format="xml")
    # Setting via setitem.
    data_set.provenance["test_provenance"] = doc

    data_set.__del__()
    del data_set

    new_data_set = ASDFDataSet(asdf_filename)
    assert new_data_set.provenance.list() == ["test_provenance"]
    assert new_data_set.provenance["test_provenance"] == doc
    assert getattr(new_data_set.provenance, "test_provenance") == doc
    assert list(new_data_set.provenance.keys()) == ["test_provenance"]
    assert list(new_data_set.provenance.values()) == [doc]
    assert list(new_data_set.provenance.items()) == [
        ("test_provenance", doc)]
elif o in ("-o", "--outfile"): outfile = a elif o in ("-i", "--infile"): infile = a elif o in ("-b", "--bindings"): bindings = a elif o in ("-3", "--bindver3"): v3=True else: assert False, "unhandled option" if not infile or not bindings: sys.exit() template=prov.read(infile) bindings_dict=None if v3: bindings_dict=dict() v3_dict=json.load(open(bindings, "r")) if "context" in v3_dict: print v3_dict["context"] namespaces=set() for k in v3_dict["context"]: namespaces.add(prov.model.Namespace(k, v3_dict["context"][k])) template=provconv.set_namespaces(namespaces, template) if "var" in v3_dict: for v in v3_dict["var"]: