Exemple #1
0
    def write_graph_set(self,
                        name: str,
                        graph_set: GraphSet,
                        compression: Optional[str] = None) -> str:
        """Write a graph artifact to a file under ``self.output_dir``.

        The graph returned by ``graph_set.to_rdf()`` is written through its
        ``serialize`` method to ``<output_dir>/<name>.rdf``, or to
        ``<output_dir>/<name>.rdf.gz`` wrapped in a gzip stream when GZIP
        compression is requested. The output directory is created if needed.

        Args:
            name: name; used as the base of the output file name
            graph_set: GraphSet object to write
            compression: None to write the serialized graph as-is, or GZIP
                to gzip-compress it

        Returns:
            path to written artifact

        Raises:
            ValueError: if compression is neither None nor GZIP
        """
        logger = Logger()
        # Validate the compression argument once, up front, so that an
        # unsupported value is rejected before anything is created on disk.
        if compression is None:
            file_name = f"{name}.rdf"
        elif compression == GZIP:
            file_name = f"{name}.rdf.gz"
        else:
            raise ValueError(f"Unknown compression arg {compression}")
        os.makedirs(self.output_dir, exist_ok=True)
        artifact_path = os.path.join(self.output_dir, file_name)
        graph = graph_set.to_rdf()
        with logger.bind(artifact_path=artifact_path):
            logger.info(event=LogEvent.WriteToFSStart)
            with open(artifact_path, "wb") as fp:
                if compression is None:
                    graph.serialize(fp)
                else:
                    # Only GZIP can reach this branch (validated above).
                    with gzip.GzipFile(fileobj=fp, mode="wb") as gz:
                        graph.serialize(gz)
            logger.info(event=LogEvent.WriteToFSEnd)
        return artifact_path
Exemple #2
0
class TestGraphSetWithValidDataMerging(TestCase):
    """GraphSet tests where two resources share the same id and type.

    Both ``test:a`` resources use resource_id "123", each with a different
    link, so the RDF output is expected to describe them as a single node.
    """

    def setUp(self):
        """Build a GraphSet from two same-id ``test:a`` and two ``test:b``
        resources."""
        a_with_foo = Resource(
            resource_id="123",
            type_name="test:a",
            links=[SimpleLink(pred="has-foo", obj="goo")],
        )
        a_with_goo = Resource(
            resource_id="123",
            type_name="test:a",
            links=[SimpleLink(pred="has-goo", obj="foo")],
        )
        b_linked_to_a = Resource(
            resource_id="abc",
            type_name="test:b",
            links=[ResourceLinkLink(pred="has-a", obj="123")],
        )
        b_named = Resource(
            resource_id="def",
            type_name="test:b",
            links=[SimpleLink(pred="name", obj="sue")],
        )
        self.graph_set = GraphSet(
            name="test-name",
            version="1",
            start_time=1234,
            end_time=4567,
            resources=[a_with_foo, a_with_goo, b_linked_to_a, b_named],
            errors=["test err 1", "test err 2"],
            stats=MultilevelCounter(),
        )

    def test_rdf_a_type(self):
        """The ``test:a`` subject has one type, one id and both links."""
        rdf_graph = self.graph_set.to_rdf()
        sparql = (
            "select ?p ?o where {?s a <test-name:test:a> ; ?p ?o} order by ?p ?o"
        )
        rows = list(rdf_graph.query(sparql))

        # Each row must be exactly one (?p, ?o) binding pair.
        for row in rows:
            self.assertEqual(2, len(row))
        actual_pairs = [(str(row[0]), str(row[1])) for row in rows]

        expected_pairs = [
            ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
             "test-name:test:a"),
            ("test-name:has-foo", "goo"),
            ("test-name:has-goo", "foo"),
            ("test-name:id", "123"),
        ]
        self.assertEqual(expected_pairs, actual_pairs)

    def test_validate(self):
        """validate() is expected to complete without raising for this data."""
        self.graph_set.validate()
Exemple #3
0
class TestGraphSetWithValidDataNoMerging(TestCase):
    """GraphSet tests where every resource has a distinct id.

    Covers the RDF output for both resource types, the graphing of errors,
    and the dict round trip (``to_dict`` / ``from_dict``).
    """

    def setUp(self):
        """Build a GraphSet from two ``test:a`` and two ``test:b`` resources."""
        resource_a1 = Resource(resource_id="123",
                               type_name="test:a",
                               links=[SimpleLink(pred="has-foo", obj="goo")])
        resource_a2 = Resource(resource_id="456", type_name="test:a")
        resource_b1 = Resource(
            resource_id="abc",
            type_name="test:b",
            links=[ResourceLinkLink(pred="has-a", obj="123")])
        resource_b2 = Resource(resource_id="def",
                               type_name="test:b",
                               links=[SimpleLink(pred="name", obj="sue")])
        resources = [resource_a1, resource_a2, resource_b1, resource_b2]
        self.graph_set = GraphSet(
            name="test-name",
            version="1",
            start_time=1234,
            end_time=4567,
            resources=resources,
            errors=["test err 1", "test err 2"],
            stats=MultilevelCounter(),
        )

    @staticmethod
    def _graph_set_dict():
        """Return the dict form of the GraphSet built in setUp.

        A new dict is built on every call so that a test which hands it to
        the code under test never shares state with another test.
        """
        return {
            "name": "test-name",
            "version": "1",
            "start_time": 1234,
            "end_time": 4567,
            "resources": {
                "123": {
                    "type": "test:a",
                    "links": [{
                        "pred": "has-foo",
                        "obj": "goo",
                        "type": "simple"
                    }],
                },
                "456": {
                    "type": "test:a"
                },
                "abc": {
                    "type": "test:b",
                    "links": [{
                        "pred": "has-a",
                        "obj": "123",
                        "type": "resource_link"
                    }],
                },
                "def": {
                    "type": "test:b",
                    "links": [{
                        "pred": "name",
                        "obj": "sue",
                        "type": "simple"
                    }],
                },
            },
            "errors": ["test err 1", "test err 2"],
            "stats": {
                "count": 0
            },
        }

    def _query_rows(self, graph, sparql, width):
        """Run a SPARQL query and return its rows as tuples of strings.

        Args:
            graph: graph object returned by ``GraphSet.to_rdf()``
            sparql: SPARQL select query to run
            width: number of bound values every result row must have

        Returns:
            list of tuples, one per result row, each value passed to str()
        """
        rows = []
        for result in graph.query(sparql):
            self.assertEqual(width, len(result))
            rows.append(tuple(str(result[i]) for i in range(width)))
        return rows

    def test_rdf_a_type(self):
        """Each ``test:a`` resource appears with its own type, id and links."""
        graph = self.graph_set.to_rdf()

        expected_a_result_tuples = [
            ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
             "test-name:test:a"),
            ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
             "test-name:test:a"),
            ("test-name:has-foo", "goo"),
            ("test-name:id", "123"),
            ("test-name:id", "456"),
        ]
        a_result_tuples = self._query_rows(
            graph,
            "select ?p ?o where {?s a <test-name:test:a> ; ?p ?o} order by ?p ?o",
            2,
        )
        self.assertEqual(expected_a_result_tuples, a_result_tuples)

    def test_rdf_b_type(self):
        """``test:b`` resources appear, with ``has-a`` pointing at node 123."""
        graph = self.graph_set.to_rdf()

        # Look up the node for resource "123" first: the has-a link on
        # resource "abc" is expected to point at that node, not at the
        # literal id.
        linked_a_node_rows = self._query_rows(
            graph,
            "select ?s where {?s a <test-name:test:a>; <test-name:id> '123' }",
            1,
        )
        self.assertEqual(1, len(linked_a_node_rows))
        linked_a_node = linked_a_node_rows[0][0]

        expected_b_result_tuples = [
            ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
             "test-name:test:b"),
            ("http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
             "test-name:test:b"),
            ("test-name:has-a", linked_a_node),
            ("test-name:id", "abc"),
            ("test-name:id", "def"),
            ("test-name:name", "sue"),
        ]
        b_result_tuples = self._query_rows(
            graph,
            "select ?p ?o where {?s a <test-name:test:b> ; ?p ?o} order by ?p ?o",
            2,
        )
        self.assertEqual(expected_b_result_tuples, b_result_tuples)

    def test_rdf_error_graphing(self):
        """Every error string of the GraphSet is present in the RDF graph."""
        graph = self.graph_set.to_rdf()

        expected_err_strs = ["test err 1", "test err 2"]
        err_rows = self._query_rows(
            graph,
            "select ?o where { ?s <test-name:error> ?o } order by ?o",
            1,
        )
        err_strs = [row[0] for row in err_rows]
        self.assertEqual(expected_err_strs, err_strs)

    def test_to_dict(self):
        """to_dict() produces the expected plain-dict representation."""
        self.assertDictEqual(self._graph_set_dict(), self.graph_set.to_dict())

    def test_from_dict(self):
        """from_dict() followed by to_dict() reproduces the input dict."""
        input_dict = self._graph_set_dict()
        graph_set = GraphSet.from_dict(input_dict)
        self.assertEqual(input_dict, graph_set.to_dict())

    def test_validate(self):
        """validate() is expected to complete without raising for this data."""
        self.graph_set.validate()
Exemple #4
0
    def write_graph_set(self,
                        name: str,
                        graph_set: GraphSet,
                        compression: Optional[str] = None) -> str:
        """Write a graph artifact to S3.

        The graph returned by ``graph_set.to_rdf()`` is written through its
        ``serialize`` method into an in-memory buffer (gzip-wrapped when GZIP
        compression is requested) and uploaded to ``self.bucket`` under the
        key ``<key_prefix>/<name>.rdf`` or ``<key_prefix>/<name>.rdf.gz``.
        After the upload, the object is tagged with the GraphSet's name,
        version, start_time and end_time.

        Args:
            name: name; used as the base of the object key
            graph_set: GraphSet to write
            compression: None to upload the serialized graph as-is, or GZIP
                to gzip-compress it

        Returns:
            path to written artifact, as an ``s3://<bucket>/<key>`` URI

        Raises:
            ValueError: if compression is neither None nor GZIP
        """
        logger = Logger()
        # Validate the compression argument once, up front; the write below
        # can then rely on it being either None or GZIP.
        if compression is None:
            key = f"{name}.rdf"
        elif compression == GZIP:
            key = f"{name}.rdf.gz"
        else:
            raise ValueError(f"Unknown compression arg {compression}")
        output_key = "/".join((self.key_prefix, key))
        graph = graph_set.to_rdf()
        with logger.bind(bucket=self.bucket,
                         key_prefix=self.key_prefix,
                         key=key):
            logger.info(event=LogEvent.WriteToS3Start)
            with io.BytesIO() as rdf_bytes_buf:
                if compression is None:
                    graph.serialize(rdf_bytes_buf)
                else:
                    # Only GZIP can reach this branch (validated above).
                    # Leaving the GzipFile context writes the gzip trailer
                    # while keeping the underlying buffer open for upload.
                    with gzip.GzipFile(fileobj=rdf_bytes_buf, mode="wb") as gz:
                        graph.serialize(gz)
                # Rewind so the upload reads the buffer from the beginning.
                rdf_bytes_buf.seek(0)
                session = boto3.Session()
                s3_client = session.client("s3")
                s3_client.upload_fileobj(rdf_bytes_buf, self.bucket,
                                         output_key)
            # Tag values are strings here; the time fields are converted
            # explicitly with str().
            s3_client.put_object_tagging(
                Bucket=self.bucket,
                Key=output_key,
                Tagging={
                    "TagSet": [
                        {
                            "Key": "name",
                            "Value": graph_set.name
                        },
                        {
                            "Key": "version",
                            "Value": graph_set.version
                        },
                        {
                            "Key": "start_time",
                            "Value": str(graph_set.start_time)
                        },
                        {
                            "Key": "end_time",
                            "Value": str(graph_set.end_time)
                        },
                    ]
                },
            )
            logger.info(event=LogEvent.WriteToS3End)
        return f"s3://{self.bucket}/{output_key}"