def write_graph_set(self, name: str, graph_set: GraphSet, compression: Optional[str] = None) -> str:
    """Write a graph artifact to the local filesystem.

    Args:
        name: base name of the artifact file (extension is derived from compression)
        graph_set: GraphSet object to write
        compression: None for plain RDF output, GZIP for gzip-compressed output

    Returns:
        path to written artifact

    Raises:
        ValueError: if compression is not None or GZIP
    """
    logger = Logger()
    os.makedirs(self.output_dir, exist_ok=True)
    # Validate compression exactly once, up front; the serialization branch below
    # relies on this (the original code re-validated with an unreachable raise).
    if compression is None:
        artifact_path = os.path.join(self.output_dir, f"{name}.rdf")
    elif compression == GZIP:
        artifact_path = os.path.join(self.output_dir, f"{name}.rdf.gz")
    else:
        raise ValueError(f"Unknown compression arg {compression}")
    graph = graph_set.to_rdf()
    with logger.bind(artifact_path=artifact_path):
        logger.info(event=LogEvent.WriteToFSStart)
        with open(artifact_path, "wb") as fp:
            if compression == GZIP:
                # Wrap the destination file in a gzip stream.
                with gzip.GzipFile(fileobj=fp, mode="wb") as gz:
                    graph.serialize(gz)
            else:
                graph.serialize(fp)
        logger.info(event=LogEvent.WriteToFSEnd)
    return artifact_path
class TestGraphSetWithValidDataMerging(TestCase):
    """Tests for GraphSet behavior when two resources share an id and are merged."""

    def setUp(self):
        # Two resources share resource_id "123" so their links should merge.
        resources = [
            Resource(resource_id="123", type_name="test:a", links=[SimpleLink(pred="has-foo", obj="goo")]),
            Resource(resource_id="123", type_name="test:a", links=[SimpleLink(pred="has-goo", obj="foo")]),
            Resource(resource_id="abc", type_name="test:b", links=[ResourceLinkLink(pred="has-a", obj="123")]),
            Resource(resource_id="def", type_name="test:b", links=[SimpleLink(pred="name", obj="sue")]),
        ]
        self.graph_set = GraphSet(
            name="test-name",
            version="1",
            start_time=1234,
            end_time=4567,
            resources=resources,
            errors=["test err 1", "test err 2"],
            stats=MultilevelCounter(),
        )

    def test_rdf_a_type(self):
        # The merged "123" node should carry links from both input resources.
        graph = self.graph_set.to_rdf()
        rows = graph.query(
            "select ?p ?o where {?s a <test-name:test:a> ; ?p ?o} order by ?p ?o"
        )
        expected = [
            ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "test-name:test:a"),
            ("test-name:has-foo", "goo"),
            ("test-name:has-goo", "foo"),
            ("test-name:id", "123"),
        ]
        actual = []
        for row in rows:
            self.assertEqual(2, len(row))
            actual.append((str(row[0]), str(row[1])))
        self.assertEqual(expected, actual)

    def test_validate(self):
        self.graph_set.validate()
class TestGraphSetWithValidDataNoMerging(TestCase):
    """Tests for GraphSet behavior when all resource ids are distinct (no merging)."""

    def setUp(self):
        # All resource_ids are unique, so each resource maps to its own node.
        resource_a1 = Resource(resource_id="123", type_name="test:a", links=[SimpleLink(pred="has-foo", obj="goo")])
        resource_a2 = Resource(resource_id="456", type_name="test:a")
        resource_b1 = Resource(
            resource_id="abc", type_name="test:b", links=[ResourceLinkLink(pred="has-a", obj="123")])
        resource_b2 = Resource(resource_id="def", type_name="test:b", links=[SimpleLink(pred="name", obj="sue")])
        resources = [resource_a1, resource_a2, resource_b1, resource_b2]
        self.graph_set = GraphSet(
            name="test-name",
            version="1",
            start_time=1234,
            end_time=4567,
            resources=resources,
            errors=["test err 1", "test err 2"],
            stats=MultilevelCounter(),
        )

    def test_rdf_a_type(self):
        graph = self.graph_set.to_rdf()
        a_results = graph.query(
            "select ?p ?o where {?s a <test-name:test:a> ; ?p ?o} order by ?p ?o"
        )
        expected_a_result_tuples = [
            ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "test-name:test:a"),
            ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "test-name:test:a"),
            ("test-name:has-foo", "goo"),
            ("test-name:id", "123"),
            ("test-name:id", "456"),
        ]
        a_result_tuples = []
        for a_result in a_results:
            self.assertEqual(2, len(a_result))
            a_result_tuples.append((str(a_result[0]), str(a_result[1])))
        self.assertEqual(expected_a_result_tuples, a_result_tuples)

    def test_rdf_b_type(self):
        graph = self.graph_set.to_rdf()
        # BUGFIX: removed stray debug call graph.serialize("/tmp/test.rdf") which
        # wrote to the filesystem on every test run and is non-portable.
        linked_a_node_results = graph.query(
            "select ?s where {?s a <test-name:test:a>; <test-name:id> '123' }")
        self.assertEqual(len(linked_a_node_results), 1)
        # Exactly one result is asserted above, so the bound name is valid after the loop.
        for linked_a_node_result in linked_a_node_results:
            linked_a_node = str(linked_a_node_result[0])
        b_results = graph.query(
            "select ?p ?o where {?s a <test-name:test:b> ; ?p ?o} order by ?p ?o"
        )
        expected_b_result_tuples = [
            ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "test-name:test:b"),
            ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "test-name:test:b"),
            ("test-name:has-a", str(linked_a_node)),
            ("test-name:id", "abc"),
            ("test-name:id", "def"),
            ("test-name:name", "sue"),
        ]
        b_result_tuples = []
        for b_result in b_results:
            self.assertEqual(2, len(b_result))
            b_result_tuples.append((str(b_result[0]), str(b_result[1])))
        self.assertEqual(expected_b_result_tuples, b_result_tuples)

    def test_rdf_error_graphing(self):
        graph = self.graph_set.to_rdf()
        err_results = graph.query(
            "select ?o where { ?s <test-name:error> ?o } order by ?o")
        err_strs = []
        expected_err_strs = ["test err 1", "test err 2"]
        for err_result in err_results:
            self.assertEqual(1, len(err_result))
            err_strs.append(str(err_result[0]))
        self.assertEqual(err_strs, expected_err_strs)

    def test_to_dict(self):
        expected_dict = {
            "name": "test-name",
            "version": "1",
            "start_time": 1234,
            "end_time": 4567,
            "resources": {
                "123": {
                    "type": "test:a",
                    "links": [{
                        "pred": "has-foo",
                        "obj": "goo",
                        "type": "simple"
                    }],
                },
                "456": {
                    "type": "test:a"
                },
                "abc": {
                    "type": "test:b",
                    "links": [{
                        "pred": "has-a",
                        "obj": "123",
                        "type": "resource_link"
                    }],
                },
                "def": {
                    "type": "test:b",
                    "links": [{
                        "pred": "name",
                        "obj": "sue",
                        "type": "simple"
                    }],
                },
            },
            "errors": ["test err 1", "test err 2"],
            "stats": {
                "count": 0
            },
        }
        self.assertDictEqual(expected_dict, self.graph_set.to_dict())

    def test_from_dict(self):
        # Round-trip: from_dict followed by to_dict should reproduce the input.
        input_dict = {
            "name": "test-name",
            "version": "1",
            "start_time": 1234,
            "end_time": 4567,
            "resources": {
                "123": {
                    "type": "test:a",
                    "links": [{
                        "pred": "has-foo",
                        "obj": "goo",
                        "type": "simple"
                    }],
                },
                "456": {
                    "type": "test:a"
                },
                "abc": {
                    "type": "test:b",
                    "links": [{
                        "pred": "has-a",
                        "obj": "123",
                        "type": "resource_link"
                    }],
                },
                "def": {
                    "type": "test:b",
                    "links": [{
                        "pred": "name",
                        "obj": "sue",
                        "type": "simple"
                    }],
                },
            },
            "errors": ["test err 1", "test err 2"],
            "stats": {
                "count": 0
            },
        }
        graph_set = GraphSet.from_dict(input_dict)
        self.assertEqual(graph_set.to_dict(), input_dict)

    def test_validate(self):
        self.graph_set.validate()
def write_graph_set(self, name: str, graph_set: GraphSet, compression: Optional[str] = None) -> str:
    """Write a graph artifact to S3 and tag the object with GraphSet metadata.

    Args:
        name: base name of the artifact key (extension is derived from compression)
        graph_set: GraphSet to write
        compression: None for plain RDF output, GZIP for gzip-compressed output

    Returns:
        s3:// uri of the written artifact

    Raises:
        ValueError: if compression is not None or GZIP
    """
    logger = Logger()
    # Validate compression exactly once, up front; the serialization branch below
    # relies on this (the original code re-validated with an unreachable raise).
    if compression is None:
        key = f"{name}.rdf"
    elif compression == GZIP:
        key = f"{name}.rdf.gz"
    else:
        raise ValueError(f"Unknown compression arg {compression}")
    output_key = "/".join((self.key_prefix, key))
    graph = graph_set.to_rdf()
    with logger.bind(bucket=self.bucket, key_prefix=self.key_prefix, key=key):
        logger.info(event=LogEvent.WriteToS3Start)
        # Serialize into an in-memory buffer, then stream the buffer to S3.
        with io.BytesIO() as rdf_bytes_buf:
            if compression == GZIP:
                with gzip.GzipFile(fileobj=rdf_bytes_buf, mode="wb") as gz:
                    graph.serialize(gz)
            else:
                graph.serialize(rdf_bytes_buf)
            rdf_bytes_buf.flush()
            rdf_bytes_buf.seek(0)
            session = boto3.Session()
            s3_client = session.client("s3")
            s3_client.upload_fileobj(rdf_bytes_buf, self.bucket, output_key)
            # Tag the object so it can be located by GraphSet metadata without
            # downloading it.
            s3_client.put_object_tagging(
                Bucket=self.bucket,
                Key=output_key,
                Tagging={
                    "TagSet": [
                        {"Key": "name", "Value": graph_set.name},
                        {"Key": "version", "Value": graph_set.version},
                        {"Key": "start_time", "Value": str(graph_set.start_time)},
                        {"Key": "end_time", "Value": str(graph_set.end_time)},
                    ]
                },
            )
        logger.info(event=LogEvent.WriteToS3End)
    return f"s3://{self.bucket}/{output_key}"