Esempio n. 1
0
    def test_from_cloud(self):
        """Test that a Manifest can be instantiated from a cloud path.

        A dataset built by `create_dataset_with_two_files` is uploaded to the test bucket, a manifest referencing it
        is persisted with `to_cloud`, and the manifest re-created with `Manifest.from_cloud` is compared against the
        original.
        """
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset = create_dataset_with_two_files(temporary_directory)
            dataset_path = storage.path.generate_gs_path(
                TEST_BUCKET_NAME, "my_nice_dataset")
            dataset.upload(cloud_path=dataset_path)

            manifest = Manifest(datasets={"my-dataset": dataset})
            cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME,
                                                       "my-directory",
                                                       "manifest.json")
            manifest.to_cloud(cloud_path)

            persisted_manifest = Manifest.from_cloud(cloud_path)

            self.assertEqual(persisted_manifest.id, manifest.id)
            self.assertEqual(persisted_manifest.hash_value,
                             manifest.hash_value)

            # Distinct comprehension variable names avoid shadowing the uploaded `dataset` above.
            persisted_names = {
                persisted_dataset.name
                for persisted_dataset in persisted_manifest.datasets.values()
            }
            original_names = {
                original_dataset.name
                for original_dataset in manifest.datasets.values()
            }
            self.assertEqual(persisted_names, original_names)

            for persisted_dataset in persisted_manifest.datasets.values():
                self.assertEqual(persisted_dataset.path, dataset_path)

                # `assertTrue(len(...), 2)` would treat `2` as the failure message and pass for any non-empty
                # dataset, so the file count is compared explicitly.
                self.assertEqual(len(persisted_dataset.files), 2)
                self.assertTrue(
                    all(
                        isinstance(file, Datafile)
                        for file in persisted_dataset.files))
Esempio n. 2
0
    def test_serialisation_and_deserialisation(self):
        """Check that a manifest survives a round trip through `to_primitive` and `Manifest.deserialise`.

        The serialised form is compared against the expected primitive, then the name, ID and per-dataset name, ID
        and path of the deserialised manifest are compared against the original.
        """
        with tempfile.TemporaryDirectory() as temporary_directory:
            first_dataset_path = os.path.join(temporary_directory, "my_dataset_0")
            second_dataset_path = os.path.join(temporary_directory, "my_dataset_1")

            first_dataset = Dataset(
                path=first_dataset_path,
                files=[Datafile(path=os.path.join(first_dataset_path, "my_file_0.txt"))],
            )
            second_dataset = Dataset(
                path=second_dataset_path,
                files=[Datafile(path=os.path.join(second_dataset_path, "my_file_1.txt"))],
            )
            datasets = {"my_dataset_0": first_dataset, "my_dataset_1": second_dataset}

            for local_dataset in datasets.values():
                local_dataset.update_local_metadata()

            manifest = Manifest(
                datasets=datasets,
                id="7e0025cd-bd68-4de6-b48d-2643ebd5effd",
                name="my-manifest",
            )

            serialised_manifest = manifest.to_primitive()

            # Datasets are expected to be serialised as their paths.
            expected_primitive = {
                "id": manifest.id,
                "name": "my-manifest",
                "datasets": {
                    "my_dataset_0": first_dataset_path,
                    "my_dataset_1": second_dataset_path,
                },
            }
            self.assertEqual(serialised_manifest, expected_primitive)

            deserialised_manifest = Manifest.deserialise(serialised_manifest)

        self.assertEqual(manifest.name, deserialised_manifest.name)
        self.assertEqual(manifest.id, deserialised_manifest.id)

        for key, original_dataset in manifest.datasets.items():
            restored_dataset = deserialised_manifest.datasets[key]
            self.assertEqual(original_dataset.name, restored_dataset.name)
            self.assertEqual(original_dataset.id, restored_dataset.id)
            self.assertEqual(original_dataset.path, restored_dataset.path)
Esempio n. 3
0
    def test_all_datasets_are_in_cloud(self):
        """Check the `all_datasets_are_in_cloud` property for three manifests.

        It should be false for the manifest from `create_valid_manifest`, true for a manifest with no datasets, and
        true for a manifest whose only dataset contains files with `gs://` paths.
        """
        valid_manifest = self.create_valid_manifest()
        self.assertFalse(valid_manifest.all_datasets_are_in_cloud)

        empty_manifest = Manifest()
        self.assertTrue(empty_manifest.all_datasets_are_in_cloud)

        cloud_dataset = Dataset(
            files=[
                Datafile(path="gs://hello/file.txt"),
                Datafile(path="gs://goodbye/file.csv"),
            ]
        )
        cloud_manifest = Manifest(datasets={"my_dataset": cloud_dataset})
        self.assertTrue(cloud_manifest.all_datasets_are_in_cloud)
Esempio n. 4
0
    def test_instantiating_from_datasets_from_different_cloud_buckets(self):
        """Check that one manifest can reference cloud datasets that live in two different buckets.

        One file is uploaded to the test bucket and one to a newly created extra bucket; the manifest is then built
        from the two dataset paths and the dataset names and file bucket names are checked.
        """
        storage_client = GoogleCloudStorageClient()

        extra_bucket_name = TEST_BUCKET_NAME + "-another"
        storage_client.create_bucket(name=extra_bucket_name)

        first_file_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "my_dataset_a", "file_0.txt")
        storage_client.upload_from_string("[1, 2, 3]", first_file_path)

        second_file_path = storage.path.generate_gs_path(extra_bucket_name, "my_dataset_b", "the_data.txt")
        storage_client.upload_from_string("[4, 5, 6]", second_file_path)

        dataset_paths = {
            "my_dataset_a": f"gs://{TEST_BUCKET_NAME}/my_dataset_a",
            "my_dataset_b": f"gs://{extra_bucket_name}/my_dataset_b",
        }
        manifest = Manifest(datasets=dataset_paths)

        dataset_names = {cloud_dataset.name for cloud_dataset in manifest.datasets.values()}
        self.assertEqual(dataset_names, {"my_dataset_a", "my_dataset_b"})

        # Take the first file of each dataset and check which bucket it reports.
        first_files = []
        for cloud_dataset in manifest.datasets.values():
            first_files.append(list(cloud_dataset.files)[0])

        bucket_names = {datafile.bucket_name for datafile in first_files}
        self.assertEqual(bucket_names, {TEST_BUCKET_NAME, extra_bucket_name})
Esempio n. 5
0
    def test_instantiating_from_serialised_cloud_datasets_with_no_dataset_json_file(self):
        """Check that a Manifest can be built from a serialised cloud dataset that has no `dataset.json` file.

        This simulates what happens when such a cloud dataset is referred to in a manifest received by a child
        service.
        """
        contents_by_file_name = (("file_0.txt", "[1, 2, 3]"), ("file_1.txt", "[4, 5, 6]"))

        # A fresh client is created for each upload.
        for file_name, contents in contents_by_file_name:
            GoogleCloudStorageClient().upload_from_string(
                contents,
                cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, "my_dataset", file_name),
            )

        cloud_dataset = Dataset(path=f"gs://{TEST_BUCKET_NAME}/my_dataset")
        serialised_cloud_dataset = cloud_dataset.to_primitive()

        manifest = Manifest(datasets={"my_dataset": serialised_cloud_dataset})
        self.assertEqual(len(manifest.datasets), 1)

        self.assertEqual(manifest.datasets["my_dataset"].path, f"gs://{TEST_BUCKET_NAME}/my_dataset")
        self.assertEqual(len(manifest.datasets["my_dataset"].files), 2)
Esempio n. 6
0
    def test_to_cloud(self):
        """Check that `to_cloud` uploads the manifest as JSON in which the dataset appears as its cloud path."""
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset = create_dataset_with_two_files(temporary_directory)
            dataset_cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "my-small-dataset")
            dataset.upload(cloud_path=dataset_cloud_path)

            manifest = Manifest(datasets={"my-dataset": dataset})
            cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "manifest.json")
            manifest.to_cloud(cloud_path)

            # Read the raw JSON back instead of going through `Manifest.from_cloud`.
            raw_manifest = GoogleCloudStorageClient().download_as_string(cloud_path)
            persisted_manifest = json.loads(raw_manifest)

            self.assertEqual(
                persisted_manifest["datasets"]["my-dataset"],
                f"gs://{TEST_BUCKET_NAME}/my-small-dataset",
            )
Esempio n. 7
0
def run(analysis):
    """Populate an analysis with mock outputs.

    Sets fixed output values and then a default-constructed `Manifest` as the output manifest.

    :param octue.resources.analysis.Analysis analysis: the analysis to write the mock outputs to
    :return None:
    """
    mock_output_values = {"width": 3}
    analysis.output_values = mock_output_values

    mock_output_manifest = Manifest()
    analysis.output_manifest = mock_output_manifest
Esempio n. 8
0
    def test_instantiating_from_multiple_local_datasets(self):
        """Check that a manifest built from two local dataset paths exposes datasets named after those paths."""
        dataset_paths = {
            "dataset_0": os.path.join("path", "to", "dataset_0"),
            "dataset_1": os.path.join("path", "to", "dataset_1"),
        }
        manifest = Manifest(datasets=dataset_paths)

        dataset_names = {local_dataset.name for local_dataset in manifest.datasets.values()}
        self.assertEqual(dataset_names, {"dataset_0", "dataset_1"})
Esempio n. 9
0
    def test_ask_with_input_manifest(self):
        """Check that a parent service can send a question with an input manifest to a serving child and get an
        answer back.

        The pub/sub topic, subscription and subscriber are replaced with mocks for the duration of the exchange, and
        signed URL generation is mocked while the question is asked.
        """
        child = self.make_new_child(BACKEND, run_function_returnee=MockAnalysis(), use_mock=True)
        parent = MockService(backend=BACKEND, children={child.id: child})

        dataset_path = f"gs://{TEST_BUCKET_NAME}/my-dataset"
        cloud_dataset = Dataset(
            files=[f"{dataset_path}/hello.txt", f"{dataset_path}/goodbye.csv"],
            path=dataset_path,
        )
        input_manifest = Manifest(datasets={"my-dataset": cloud_dataset})

        with patch("octue.cloud.pub_sub.service.Topic", new=MockTopic), patch(
            "octue.cloud.pub_sub.service.Subscription", new=MockSubscription
        ), patch("google.cloud.pubsub_v1.SubscriberClient", new=MockSubscriber):
            child.serve()

            with patch("google.cloud.storage.blob.Blob.generate_signed_url", new=mock_generate_signed_url):
                answer = self.ask_question_and_wait_for_answer(
                    parent=parent,
                    child=child,
                    input_values={},
                    input_manifest=input_manifest,
                )

        expected_answer = {
            "output_values": MockAnalysis().output_values,
            "output_manifest": MockAnalysis().output_manifest,
        }
        self.assertEqual(answer, expected_answer)
Esempio n. 10
0
    def test_finalise_with_upload(self):
        """Check that `finalise` uploads the output manifest's datasets to a cloud location and replaces the dataset
        path in the manifest with a signed URL that can be used to access it.
        """
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset_path = os.path.join(temporary_directory, "the_dataset")
            datafile_path = os.path.join(dataset_path, "my_file.dat")

            with Datafile(path=datafile_path, mode="w") as (datafile, f):
                f.write("hello")

            local_dataset = Dataset(
                path=dataset_path,
                files={datafile.local_path},
                labels={"one", "two", "three"},
            )
            output_manifest = Manifest(datasets={"the_dataset": local_dataset})

            twine = {
                "output_values_schema": {"type": "object", "properties": {"blah": {"type": "integer"}}},
                "output_manifest": {"datasets": {"the_dataset": {"purpose": "testing"}}},
            }
            analysis = Analysis(twine=twine, output_values={"blah": 3}, output_manifest=output_manifest)

            with patch("google.cloud.storage.blob.Blob.generate_signed_url", new=mock_generate_signed_url):
                analysis.finalise(upload_output_datasets_to=f"gs://{TEST_BUCKET_NAME}/datasets")

        # After finalising, the dataset's path should be a URL rooted at the storage emulator host.
        signed_url_for_dataset = analysis.output_manifest.datasets["the_dataset"].path
        self.assertTrue(storage.path.is_url(signed_url_for_dataset))

        expected_prefix = f"{self.test_result_modifier.storage_emulator_host}/{TEST_BUCKET_NAME}/datasets/the_dataset"
        self.assertTrue(signed_url_for_dataset.startswith(expected_prefix))

        # The signed URL alone should be enough to re-create the dataset and read its file.
        downloaded_dataset = Dataset(path=signed_url_for_dataset)
        self.assertEqual(downloaded_dataset.name, "the_dataset")
        self.assertEqual(len(downloaded_dataset.files), 1)
        self.assertEqual(downloaded_dataset.labels, {"one", "two", "three"})

        with downloaded_dataset.files.one() as (downloaded_datafile, f):
            self.assertEqual(f.read(), "hello")
Esempio n. 11
0
    def test_ask_with_input_manifest_with_local_paths_works_if_allowed_and_child_has_access_to_the_local_paths(
            self):
        """Test that an input manifest referencing local files can be used if the files can be accessed by the child and
        the `allow_local_files` parameter is `True`.

        The child's run function ignores the manifest object itself and simply opens the local file by path, returning
        its contents as the output values.
        """
        # Function-scope import: `os` is only needed here to remove the temporary file afterwards.
        import os

        # Write through the handle returned by `NamedTemporaryFile` so that it is closed deterministically instead of
        # being leaked, and register removal of the file (kept on disk by `delete=False`) once the test finishes.
        with tempfile.NamedTemporaryFile(mode="w", delete=False) as temporary_file:
            temporary_file.write("This is a local file.")

        temporary_local_path = temporary_file.name
        self.addCleanup(os.remove, temporary_local_path)

        local_file = Datafile(path=temporary_local_path)
        self.assertFalse(local_file.exists_in_cloud)

        manifest = Manifest(
            datasets={
                "my-local-dataset":
                Dataset(name="my-local-dataset", files={local_file})
            })

        # Get the child to open the local file itself and return the contents as output.
        def run_function(analysis_id, input_values, input_manifest,
                         analysis_log_handler, handle_monitor_message):
            with open(temporary_local_path) as f:
                return MockAnalysis(output_values=f.read())

        child = MockService(backend=BACKEND, run_function=run_function)
        parent = MockService(backend=BACKEND, children={child.id: child})

        with patch("octue.cloud.pub_sub.service.Topic", new=MockTopic):
            with patch("octue.cloud.pub_sub.service.Subscription",
                       new=MockSubscription):
                with patch("google.cloud.pubsub_v1.SubscriberClient",
                           new=MockSubscriber):
                    child.serve()

                    answer = self.ask_question_and_wait_for_answer(
                        parent=parent,
                        child=child,
                        input_values={},
                        input_manifest=manifest,
                        allow_local_files=True,
                    )

        self.assertEqual(answer["output_values"], "This is a local file.")
Esempio n. 12
0
    def test_app(self):
        """Check that the app accepts input in the expected format and returns an analysis with the expected output
        values and a usable output manifest.
        """
        runner = Runner(app_src=REPOSITORY_ROOT, twine=TWINE_PATH)

        with patch("google.cloud.storage.blob.Blob.generate_signed_url", mock_generate_signed_url):
            analysis = runner.run(input_values={"n_iterations": 3})

        # Output values first.
        self.assertEqual(analysis.output_values, [1, 2, 3, 4, 5])

        # Round-trip the output manifest through its primitive form, as would happen after serialisation, to check
        # that the signed URLs for the dataset and its files can be used to reinstantiate it.
        serialised_output_manifest = analysis.output_manifest.to_primitive()
        downloaded_output_manifest = Manifest.deserialise(serialised_output_manifest)

        # The output dataset's single file should be readable.
        example_dataset = downloaded_output_manifest.datasets["example_dataset"]

        with example_dataset.files.one() as (datafile, f):
            self.assertEqual(f.read(), "This is some example service output.")
Esempio n. 13
0
    def test_finalise_validates_output(self):
        """Check that calling `finalise` with no arguments completes without raising for a valid output manifest and
        valid output values.
        """
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset_path = os.path.join(temporary_directory, "the_dataset")
            datafile_path = os.path.join(dataset_path, "my_file.dat")

            with Datafile(path=datafile_path, mode="w") as (datafile, f):
                f.write("hello")

            local_dataset = Dataset(path=dataset_path, files={datafile.local_path})
            output_manifest = Manifest(datasets={"the_dataset": local_dataset})

            twine = {
                "output_values_schema": {"type": "object", "properties": {"blah": {"type": "integer"}}},
                "output_manifest": {"datasets": {"the_dataset": {"purpose": "testing"}}},
            }
            analysis = Analysis(twine=twine, output_values={"blah": 3}, output_manifest=output_manifest)

            # No assertion is needed: the test passes if this call does not raise.
            analysis.finalise()
Esempio n. 14
0
 def create_valid_manifest(self):
     """Build a manifest holding two datasets, each produced by a separate call to `create_valid_dataset`."""
     return Manifest(
         datasets={
             "my_dataset": self.create_valid_dataset(),
             "another_dataset": self.create_valid_dataset(),
         }
     )
Esempio n. 15
0
class MockAnalysisWithOutputManifest:
    """Mock analysis exposing fixed `output_values` and a default-constructed `Manifest` as `output_manifest`."""
    # Fixed string used as the mock's output values.
    output_values = "This is an analysis with an empty output manifest."
    # Instantiated once when the class body is executed, so the same `Manifest` object is shared by the class and by
    # every instance that does not override the attribute.
    output_manifest = Manifest()