def provRead(source, format=None):
    from prov.model import ProvDocument
    from prov.serializers import Registry

    Registry.load_serializers()
    serializers = Registry.serializers.keys()

    if format:
        try:
            ret = ProvDocument.deserialize(source=source, format=format.lower())
            return ret
        except Exception as e:
            log.error(e)
            raise TypeError(e)

    for format in serializers:
        source.seek(0)
        try:
            return ProvDocument.deserialize(source=source, format=format)
        except:
            pass
    else:
        raise TypeError("Could not read from the source. To get a proper "
                        "error message, specify the format with the 'format' "
                        "parameter.")
def read(source, format=None):
    """
    Convenience function returning a ProvDocument instance.

    It does a lazy format detection by simply using try/except for all known
    formats. The deserializers should fail fairly early when data of the
    wrong type is passed to them, thus the try/except is likely cheap. One
    could of course also do some more advanced format auto-detection, but I
    am not sure that is necessary. The downside is that no proper error
    messages will be produced; use the 'format' parameter to get the actual
    traceback.
    """
    # Lazy imports to not clobber the namespace.
    from prov.model import ProvDocument
    from prov.serializers import Registry

    Registry.load_serializers()
    serializers = Registry.serializers.keys()

    if format:
        return ProvDocument.deserialize(source=source, format=format.lower())

    for format in serializers:
        try:
            return ProvDocument.deserialize(source=source, format=format)
        except:
            pass
    else:
        raise TypeError("Could not read from the source. To get a proper "
                        "error message, specify the format with the 'format' "
                        "parameter.")
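A minimal usage sketch for the read() helper above; it is not part of the original source, and the file name "example.json" is illustrative:

with open("example.json") as source:
    # An explicit format skips auto-detection, so deserialization errors
    # surface with their real traceback.
    doc = read(source, format="json")
print(doc.get_provn())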
def get_provdoc(format, infile):
    if format == "json":
        return ProvDocument.deserialize(infile)
    elif format == "xml":
        return ProvDocument.deserialize(infile, format='xml')
    else:
        print("Error: unsupported format (xml and json are supported)")
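A hedged usage sketch for get_provdoc() above; the import and the file name are assumptions, not part of the original snippet:

from prov.model import ProvDocument  # get_provdoc() relies on this name being in scope

with open("provenance.xml") as infile:  # illustrative file name
    doc = get_provdoc("xml", infile)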
def test_loading_all_json(self):
    # self.assertFalse(fails, 'Failed to load/round-trip %d JSON files (%s)'
    #                  % (len(fails), ', '.join(fails)))

    # Code for debugging the failed tests
    for filename in self.fails:
        # Reload the failed files
        filepath = self.json_path + filename
        # os.rename(json_path + filename, json_path + filename + '-fail')
        with open(filepath) as json_file:
            logger.info("Loading %s...", filepath)
            g1 = ProvDocument.deserialize(json_file)
            json_str = g1.serialize(indent=4)
            g2 = ProvDocument.deserialize(content=json_str)
            self.assertEqual(g1, g2,
                             'Round-trip JSON encoding/decoding failed: %s.'
                             % filename)
def setUp(self):
    self.json_path = os.path.dirname(os.path.abspath(__file__)) + '/json/'
    filenames = os.listdir(self.json_path)
    self.fails = []
    for filename in filenames:
        if filename.endswith('.json'):
            with open(self.json_path + filename) as json_file:
                try:
                    g1 = ProvDocument.deserialize(json_file)
                    json_str = g1.serialize(indent=4)
                    g2 = ProvDocument.deserialize(content=json_str)
                    self.assertEqual(g1, g2,
                                     'Round-trip JSON encoding/decoding failed: %s.'
                                     % filename)
                except:
                    self.fails.append(filename)
def testLoadAllJSON(self):
    # self.assertFalse(fails, 'Failed to load/round-trip %d JSON files (%s)'
    #                  % (len(fails), ', '.join(fails)))
    logging.basicConfig(level=logging.DEBUG)

    # Code for debugging the failed tests
    for filename in self.fails:
        # Reload the failed files
        filepath = self.json_path + filename
        # os.rename(json_path + filename, json_path + filename + '-fail')
        with open(filepath) as json_file:
            logger.info("Loading %s...", filepath)
            g1 = ProvDocument.deserialize(json_file)
            json_str = g1.serialize(indent=4)
            g2 = ProvDocument.deserialize(content=json_str)
            self.assertEqual(g1, g2,
                             'Round-trip JSON encoding/decoding failed: %s.'
                             % filename)
def main(auth_json_path, full_provenance=False):
    with open(auth_json_path, 'r') as f:
        auth_json = json.load(f)
    api_token = auth_json['services']['cityofbostondataportal']['token']
    username = '******'  # auth_json['services']['cityofbostondataportal']['username']
    mongo_pass = '******'  # auth_json['services']['cityofbostondataportal']['password']
    database_helper = database_helpers.DatabaseHelper(username=username, password=mongo_pass)
    bdp_api = bdp_query.BDPQuery(api_token=api_token)
    if full_provenance:
        with open(plan_json, 'w') as f:
            f.write(json.dumps({}))
    setup_crime_incidents(database_helper, bdp_api, full_provenance=full_provenance)
    setup_property_assessment(database_helper, bdp_api, full_provenance=full_provenance)
    setup_boston_public_schools(database_helper, bdp_api, full_provenance=full_provenance)
    setup_hospital_locations(database_helper, bdp_api, full_provenance=full_provenance)
    setup_crime_centroids(database_helper, full_provenance=full_provenance)
    setup_hospital_distances(database_helper, full_provenance=full_provenance)
    setup_crime_knn(database_helper, full_provenance=full_provenance)
    setup_home_value_model(database_helper, full_provenance=full_provenance)
    setup_hospital_scatter(database_helper, full_provenance=full_provenance)
    setup_school_distances(database_helper, full_provenance=full_provenance)
    setup_school_scatter(database_helper, full_provenance=full_provenance)
    if full_provenance:
        with open(plan_json, 'r') as f:
            prov_doc = ProvDocument.deserialize(f)
        dot = prov_to_dot(prov_doc)
        dot.write_svg(prov_svg)
def extract_pg_data(filepath: Path):
    prov_doc = ProvDocument.deserialize(filepath)
    n_balls_collected = 0
    pokemons_strength = dict()
    pokemons_captured = []
    pokemons_disposed = []
    strength_captured_avg = -1
    strength_disposed_avg = -1
    for record in prov_doc.get_records(ProvElement):
        if isinstance(record, ProvEntity):
            ent_id = str(record.identifier)
            if "pokemons" in ent_id:
                strength_values = record.get_attribute(PGO_strength)  # type: set
                strength = (
                    next(iter(strength_values)) if strength_values else 0
                )  # type: int
                pokemon_id = ent_id[:-2]
                if ent_id.endswith(".0"):
                    pokemons_captured.append(pokemon_id)
                if strength and (pokemon_id not in pokemons_strength):
                    pokemons_strength[pokemon_id] = strength
        elif isinstance(record, ProvActivity):
            act_id = str(record.identifier)
            if "collectballs" in act_id:
                n_balls_collected += 1
    for record in prov_doc.get_records(ProvInvalidation):
        ent_id = str(record.args[0])
        pokemon_id = ent_id[:-2]
        pokemons_disposed.append(pokemon_id)
    n_pokemons_captured = len(pokemons_captured)
    n_pokemons_disposed = len(pokemons_disposed)
    if pokemons_captured:
        strength_captured_avg = np.mean(
            [
                pokemons_strength[pokemon_id]
                for pokemon_id in pokemons_captured
                if pokemon_id in pokemons_strength
            ]
        )
    if pokemons_disposed:
        strength_disposed_avg = np.mean(
            [
                pokemons_strength[pokemon_id]
                for pokemon_id in pokemons_disposed
                if pokemon_id in pokemons_strength
            ]
        )
    return (
        n_balls_collected,
        n_pokemons_captured,
        n_pokemons_disposed,
        strength_captured_avg,
        strength_disposed_avg,
    )
def main(auth_json_path, full_provenance=False):
    with open(auth_json_path, 'r') as f:
        auth_json = json.load(f)
    api_token = auth_json['services']['cityofbostondataportal']['token']
    username = auth_json['services']['cityofbostondataportal']['username']
    mongo_pass = auth_json['services']['cityofbostondataportal']['password']
    database_helper = database_helpers.DatabaseHelper(username=username, password=mongo_pass)
    bdp_api = bdp_query.BDPQuery(api_token=api_token)
    if full_provenance:
        with open(plan_json, 'w') as f:
            f.write(json.dumps({}))
    setup_crime_incidents(database_helper, bdp_api, full_provenance=full_provenance)
    setup_property_assessment(database_helper, bdp_api, full_provenance=full_provenance)
    setup_boston_public_schools(database_helper, bdp_api, full_provenance=full_provenance)
    setup_hospital_locations(database_helper, bdp_api, full_provenance=full_provenance)
    setup_crime_centroids(database_helper, full_provenance=full_provenance)
    setup_hospital_distances(database_helper, full_provenance=full_provenance)
    setup_crime_knn(database_helper, full_provenance=full_provenance)
    setup_home_value_model(database_helper, full_provenance=full_provenance)
    setup_hospital_scatter(database_helper, full_provenance=full_provenance)
    setup_school_distances(database_helper, full_provenance=full_provenance)
    setup_school_scatter(database_helper, full_provenance=full_provenance)
    if full_provenance:
        with open(plan_json, 'r') as f:
            prov_doc = ProvDocument.deserialize(f)
        dot = prov_to_dot(prov_doc)
        dot.write_svg(prov_svg)
def primer():
    a = ProvDocument()
    script_path = os.path.dirname(os.path.abspath(__file__))
    with open(str(script_path) + "/output.json") as json_file:
        line = json_file.readline()
        a = a.deserialize(content=line)
    return a
def count_flatprovenancetypes_for_graphs(
    dataset_path: Path,
    graph_filenames: Collection[str],
    level: int,
    including_primitives_types: bool,
) -> Tuple[List[Dict[int, Dict[str, int]]], List[List[float]]]:
    logger.debug(
        "Calculating flat provenance types up to level %s "
        "(with application types: %s) for %d graphs...",
        level,
        including_primitives_types,
        len(graph_filenames),
    )
    results = []  # type: List[Dict[int, Dict[str, int]]]
    timings = []  # type: List[List[float]]
    for graph_filename in graph_filenames:
        filepath = dataset_path / graph_filename
        prov_doc = ProvDocument.deserialize(filepath)
        durations = []  # type: List[float]
        features = dict()  # type: Dict[int, Dict[str, int]]
        for h in range(level + 1):
            timer = Timer(verbose=False)
            with timer:
                fp_types = calculate_flat_provenance_types(
                    prov_doc, h, including_primitives_types
                )
                # counting only the last level
                features[h] = count_fp_types(fp_types[h].values())
            durations.append(timer.interval)
        results.append(features)
        timings.append(durations)
    return results, timings
def __init__(self, database_helper, full_provenance=False):
    """
    Initializes the provenance for the mjclawar_rarshad project

    Parameters
    ----------
    database_helper: DatabaseHelper
    full_provenance: bool

    Returns
    -------
    """
    assert isinstance(database_helper, DatabaseHelper)
    self.database_helper = database_helper
    if full_provenance:
        self.prov_doc = ProvDocument.deserialize(dir_info.plan_json)
    else:
        self.prov_doc = ProvDocument()
    self.prov_doc.add_namespace(mcras.BDP_NAMESPACE.name, mcras.BDP_NAMESPACE.link)
    self.prov_doc.add_namespace(mcras.ALG_NAMESPACE.name, mcras.ALG_NAMESPACE.link)
    self.prov_doc.add_namespace(mcras.DAT_NAMESPACE.name, mcras.DAT_NAMESPACE.link)
    self.prov_doc.add_namespace(mcras.LOG_NAMESPACE.name, mcras.LOG_NAMESPACE.link)
    self.prov_doc.add_namespace(mcras.ONT_NAMESPACE.name, mcras.ONT_NAMESPACE.link)
def test_get_document_as_json(self):
    example = examples.primer_example()
    document_id = self.provapi.create_document_from_prov(example)
    prov_str = self.provapi.get_document_as_json(document_id)
    self.assertIsNotNone(prov_str)
    self.assertIsInstance(prov_str, str)
    prov_document_reverse = ProvDocument.deserialize(content=prov_str, format="json")
    self.assertEqual(prov_document_reverse, example)
def from_xml(xml_str=None):
    """
    Try to convert an XML string into a ProvDocument

    :param xml_str: The XML string
    :type xml_str: str
    :return: The Prov document
    :rtype: ProvDocument
    """
    if xml_str is None:
        raise NoDocumentException()
    return ProvDocument.deserialize(source=xml_str, format='xml')
def viz_turtle(source=None, content=None, img_file=None, **kwargs):
    prov_doc = ProvDocument.deserialize(source=source, content=content,
                                        format='rdf', rdf_format='turtle')
    # TODO: expose the show-attributes flags as optional args
    dot = prov_to_dot(prov_doc, use_labels=True,
                      show_element_attributes=False,
                      show_relation_attributes=False)
    dot.write_png(img_file)
def form_string(content):
    """
    Take a string or BufferedReader as argument and transform the string
    into a ProvDocument

    :param content: Takes a string or BufferedReader
    :return: ProvDocument
    """
    if isinstance(content, ProvDocument):
        return content
    elif isinstance(content, BufferedReader):
        content = reduce(lambda total, a: total + a, content.readlines())

    if type(content) is six.binary_type:
        content_str = content[0:15].decode()
        if content_str.find("{") > -1:
            return ProvDocument.deserialize(content=content, format='json')
        if content_str.find('<?xml') > -1:
            return ProvDocument.deserialize(content=content, format='xml')
        elif content_str.find('document') > -1:
            return ProvDocument.deserialize(content=content, format='provn')

    raise ParseException("Unsupported input type {}".format(type(content)))
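A sketch (not from the original source) exercising form_string() above with in-memory JSON bytes; the helper sniffs the first 15 bytes to pick a format ("{" selects JSON, "<?xml" XML, "document" PROV-N), and the byte string below is illustrative:

import io

# Wrap the bytes in a BufferedReader so form_string() takes the readlines() path.
json_bytes = b'{"prefix": {"ex": "http://www.example.org/"}}'
doc = form_string(io.BufferedReader(io.BytesIO(json_bytes)))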
def prov(self, format='json', filename=None):
    if self.prov_url is None:
        raise APIException('no provenance information found')
    response = self.adama.utils.request(self.prov_url, format=format)
    if format in ('json', 'sources'):
        return response.json()
    elif format == 'prov-n':
        return response.text
    elif format == 'prov':
        return ProvDocument.deserialize(content=json.dumps(response.json()))
    elif format == 'png':
        return png(response.content, filename)
def get_bundle(self, document_id, bundle_id, prov_format=ProvDocument):
    if prov_format == ProvDocument:
        extension = 'json'
    else:
        extension = prov_format
    r = self._request('get',
                      "/documents/%i/bundles/%i.%s" % (document_id, bundle_id, extension),
                      headers=self.headers)
    if prov_format == ProvDocument:
        return ProvDocument.deserialize(content=r.content)
    else:
        return r.content
def from_json(json=None):
    """
    Try to convert a JSON string into a document

    :param json: The JSON string
    :type json: str
    :return: Prov Document
    :rtype: prov.model.ProvDocument
    :raises: NoDocumentException
    """
    if json is None:
        raise NoDocumentException()
    return ProvDocument.deserialize(source=json, format='json')
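A sketch calling from_json() above. Because the helper forwards its argument as source=, ProvDocument.deserialize treats a plain string as a file path, so a file-like object is passed here; the document content is illustrative:

import io

doc = from_json(io.StringIO('{"prefix": {"ex": "http://www.example.org/"}}'))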
def from_provn(provn_str=None):
    """
    Try to convert a PROV-N string into a ProvDocument

    :param provn_str: The string to convert
    :type provn_str: str
    :return: The Prov document
    :rtype: ProvDocument
    :raises: NoDocumentException
    """
    if provn_str is None:
        raise NoDocumentException()
    return ProvDocument.deserialize(source=provn_str, format='provn')
def get_document_prov(self, document_id, prov_format=ProvDocument):
    if prov_format == ProvDocument:
        extension = 'json'
    else:
        extension = prov_format
    r = self._request('get', "/documents/%i.%s" % (document_id, extension),
                      headers=self.headers)
    if prov_format == ProvDocument:
        return ProvDocument.deserialize(content=r.content)
    else:
        return r.content
def testAllExamples(self):
    num_graphs = len(examples.tests)
    logger.info('PROV-JSON round-trip testing %d example provenance graphs', num_graphs)
    counter = 0
    for name, graph in examples.tests:
        counter += 1
        logger.info('%d. Testing the %s example', counter, name)
        g1 = graph()
        logger.debug('Original graph in PROV-N\n%s', g1.get_provn())
        # json_str = g1.get_provjson(indent=4)
        json_str = g1.serialize(indent=4)
        logger.debug('Original graph in PROV-JSON\n%s', json_str)
        g2 = ProvDocument.deserialize(content=json_str)
        logger.debug('Graph decoded from PROV-JSON\n%s', g2.get_provn())
        self.assertEqual(g1, g2, 'Round-trip JSON encoding/decoding failed: %s.' % name)
def test_get_document_as_json(self):
    """
    Try to get the document as JSON

    :return:
    """
    self.clear_database()
    example = examples.primer_example()
    document_id = self.provapi.save_document_from_prov(example)
    prov_str = self.provapi.get_document_as_json(document_id)
    self.assertIsNotNone(prov_str)
    self.assertIsInstance(prov_str, str)
    prov_document_reverse = ProvDocument.deserialize(content=prov_str, format="json")
    self.assertEqual(prov_document_reverse, example)
def build_grakel_graphs(graphs: pd.DataFrame, dataset_path: Path):
    if "grakel_graphs" in graphs.columns:
        # nothing to do
        return graphs  # unchanged

    # expecting a "graph_file" column in the input DataFrame
    grakel_graphs = []
    for graph_filename in graphs.graph_file:
        filepath = dataset_path / graph_filename
        # load the file
        prov_doc = ProvDocument.deserialize(filepath)
        prov_graph = prov_to_graph(prov_doc)  # type: nx.MultiDiGraph
        grakel_graphs.append(graph_from_prov_networkx_graph(prov_graph))
    graphs["grakel_graphs"] = grakel_graphs
    return graphs
def get_document(self, doc_id, format=None, flattened=False, view=None):
    """Returns a ProvBundle object of the document with the ID provided
    or raises ApiNotFoundError"""
    extension = format if format is not None else 'json'
    view = "/views/%s" % view if view in ['data', 'process', 'responsibility'] else ""
    url = "documents/%d%s%s.%s" % (doc_id, "/flattened" if flattened else "",
                                   view, extension)
    response = self.request(url, raw=True)
    if format is None:
        # Try to decode it as a ProvDocument
        result = ProvDocument.deserialize(content=response)
    else:
        # return the raw response
        result = response
    return result
def test_unifying(self):
    # This is a very trivial test just to exercise the unified() function
    # TODO: Create a proper unification test
    json_path = os.path.dirname(os.path.abspath(__file__)) + '/unification/'
    filenames = os.listdir(json_path)
    for filename in filenames:
        if not filename.endswith('.json'):
            continue
        filepath = json_path + filename
        with open(filepath) as json_file:
            logger.info('Testing unifying: %s', filename)
            logger.debug("Loading %s...", filepath)
            document = ProvDocument.deserialize(json_file)
            flattened = document.flattened()
            unified = flattened.unified()
            self.assertLess(len(unified.get_records()), len(flattened.get_records()))
def test_decoding_unicode_value(self):
    unicode_char = u'\u2019'
    json_content = u'''{
        "prefix": {
            "ex": "http://www.example.org"
        },
        "entity": {
            "ex:unicode_char": {
                "prov:label": "%s"
            }
        }
    }''' % unicode_char
    prov_doc = ProvDocument.deserialize(content=json_content, format='json')
    e1 = prov_doc.get_record('ex:unicode_char')[0]
    self.assertIn(unicode_char, e1.get_attribute('prov:label'))
def test_decoding_unicode_value(self):
    unicode_char = u'\u2019'
    rdf_content = u'''
        @prefix ex: <http://www.example.org/> .
        @prefix prov: <http://www.w3.org/ns/prov#> .
        @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
        @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
        @prefix xml: <http://www.w3.org/XML/1998/namespace> .
        @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

        ex:unicode_char a prov:Entity ;
            rdfs:label "%s"^^xsd:string .
    ''' % unicode_char
    prov_doc = ProvDocument.deserialize(content=rdf_content,
                                        format='rdf', rdf_format='turtle')
    e1 = prov_doc.get_record('ex:unicode_char')[0]
    self.assertIn(unicode_char, e1.get_attribute('prov:label'))
def assertRoundTripEquivalence(self, prov_doc, msg=None):
    if self.FORMAT is None:
        # This is a dummy test, just return
        return
    with io.BytesIO() as stream:
        prov_doc.serialize(destination=stream, format=self.FORMAT, indent=4)
        stream.seek(0, 0)
        prov_doc_new = ProvDocument.deserialize(source=stream, format=self.FORMAT)
        stream.seek(0, 0)
        # Assume UTF-8 encoding which is forced by the particular
        # PROV XML implementation and should also work for the PROV
        # JSON implementation.
        msg_extra = "'%s' serialization content:\n%s" % (
            self.FORMAT, stream.read().decode("utf-8"))
        msg = "\n".join((msg, msg_extra)) if msg else msg_extra
        self.assertEqual(prov_doc, prov_doc_new, msg)
def assertRoundTripEquivalence(self, prov_doc, msg=None):
    if self.FORMAT is None:
        # This is a dummy test, just return
        return
    with io.BytesIO() as stream:
        prov_doc.serialize(destination=stream, format=self.FORMAT, indent=4)
        stream.seek(0, 0)
        prov_doc_new = ProvDocument.deserialize(source=stream, format=self.FORMAT)
        stream.seek(0, 0)
        # Assume UTF-8 encoding which is forced by the particular
        # PROV XML implementation and should also work for the PROV
        # JSON implementation.
        msg_extra = u"'%s' serialization content:\n%s" % (
            self.FORMAT, stream.read().decode("utf-8"))
        msg = u'\n'.join((msg, msg_extra)) if msg else msg_extra
        self.assertEqual(prov_doc, prov_doc_new, msg)
def test_decoding_unicode_value(self):
    unicode_char = "\u2019"
    json_content = (
        """{
        "prefix": {
            "ex": "http://www.example.org"
        },
        "entity": {
            "ex:unicode_char": {
                "prov:label": "%s"
            }
        }
    }"""
        % unicode_char
    )
    prov_doc = ProvDocument.deserialize(content=json_content, format="json")
    e1 = prov_doc.get_record("ex:unicode_char")[0]
    self.assertIn(unicode_char, e1.get_attribute("prov:label"))
def test_decoding_unicode_value(self):
    unicode_char = "\u2019"
    rdf_content = (
        """
    @prefix ex: <http://www.example.org/> .
    @prefix prov: <http://www.w3.org/ns/prov#> .
    @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
    @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
    @prefix xml: <http://www.w3.org/XML/1998/namespace> .
    @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

    ex:unicode_char a prov:Entity ;
        rdfs:label "%s"^^xsd:string .
    """
        % unicode_char
    )
    prov_doc = ProvDocument.deserialize(
        content=rdf_content, format="rdf", rdf_format="turtle"
    )
    e1 = prov_doc.get_record("ex:unicode_char")[0]
    self.assertIn(unicode_char, e1.get_attribute("prov:label"))
def count_flatprovenancetypes_for_graphs(
    dataset_path: Path,
    graph_filenames: Collection[str],
    level: int,
    including_primitives_types: bool,
    counting_wdf_as_two: bool = False,
    ignored_types: FrozenSet[str] = ϕ,
) -> Tuple[List[Dict[int, Dict[FlatProvenanceType, int]]], List[List[float]]]:
    logger.debug(
        "Producing linear provenance types up to level %s "
        "(with application types: %s, counting derivations as 2-length edges: %s) "
        "for %d graphs...",
        level,
        including_primitives_types,
        counting_wdf_as_two,
        len(graph_filenames),
    )
    results = []  # type: List[Dict[int, Dict[FlatProvenanceType, int]]]
    timings = []  # type: List[List[float]]
    for graph_filename in graph_filenames:
        filepath = dataset_path / graph_filename
        prov_doc = ProvDocument.deserialize(filepath)
        durations = []  # type: List[float]
        features = dict()  # type: Dict[int, Dict[FlatProvenanceType, int]]
        for h in range(level + 1):
            timer = Timer(verbose=False)
            with timer:
                fp_types = calculate_flat_provenance_types(
                    prov_doc,
                    h,
                    including_primitives_types,
                    counting_wdf_as_two,
                    ignored_types=ignored_types,
                )
                # counting only the last level
                features[h] = Counter(fp_types[h].values())
            durations.append(timer.interval)
        results.append(features)
        timings.append(durations)
    return results, timings
def calculate_provenance_features_for_file(filepath: Path) -> list:
    # Calculate Provenance Network Metrics (22) and number of edge types
    try:
        # load the file
        prov_doc = ProvDocument.deserialize(filepath)
    except Exception as e:
        logger.error("Cannot deserialize %s", filepath)
        raise e
    try:
        timer = Timer(verbose=False)
        with timer:
            # counting the record types
            rec_type_counts = count_record_types(prov_doc)
            prov_rel_cols = [
                rec_type_counts[rec_type] if rec_type in rec_type_counts else 0
                for rec_type in PROV_RELATION_NAMES
            ]
            mv5 = version5(prov_doc, flat=True)  # calculate
        return mv5[:-4] + prov_rel_cols + [timer.interval]
    except Exception as e:
        logger.error("Cannot calculate metrics for %s", filepath)
        raise e
def assertPROVJSONRoundTripEquivalence(self, prov_doc, msg=None):
    json_str = prov_doc.serialize(indent=4)
    prov_doc_new = ProvDocument.deserialize(content=json_str)
    self.assertEqual(prov_doc, prov_doc_new, msg)
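A standalone round-trip sketch mirroring the assertion above, outside any test class; the document built here is illustrative:

from prov.model import ProvDocument

d1 = ProvDocument()
d1.add_namespace("ex", "http://example.org/")
d1.entity("ex:e1")
# Serialize to PROV-JSON and decode it back; the two documents should compare equal.
d2 = ProvDocument.deserialize(content=d1.serialize(indent=4))
assert d1 == d2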