Example #1
    def test_download(self):
        """Test that all files in a dataset can be downloaded with one command."""
        storage_client = GoogleCloudStorageClient()

        dataset_name = "another-dataset"
        storage_client.upload_from_string(
            string=json.dumps([1, 2, 3]),
            cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME,
                                                     dataset_name,
                                                     "file_0.txt"),
        )
        storage_client.upload_from_string(
            string=json.dumps([4, 5, 6]),
            cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME,
                                                     dataset_name,
                                                     "file_1.txt"),
        )

        dataset = Dataset(path=f"gs://{TEST_BUCKET_NAME}/{dataset_name}")

        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset.download(local_directory=temporary_directory)

            with open(os.path.join(temporary_directory, "file_0.txt")) as f:
                self.assertEqual(f.read(), "[1, 2, 3]")

            with open(os.path.join(temporary_directory, "file_1.txt")) as f:
                self.assertEqual(f.read(), "[4, 5, 6]")
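
The tests in this section assume a common set of imports and constants at the top of the test module. Here is a minimal sketch; the octue import paths, and the bucket-name constant's value, are assumptions based on the SDK's public layout rather than lines from the original module:

import json
import os
import tempfile
from unittest.mock import patch

from octue import exceptions
from octue.cloud import storage
from octue.cloud.storage import GoogleCloudStorageClient
from octue.resources import Analysis, Datafile, Dataset, Manifest
from octue.resources.filter_containers import FilterSet

# Test-suite constants such as TEST_BUCKET_NAME, REPOSITORY_ROOT and
# SIGNED_METADATA_DIRECTORY are assumed to come from the suite's own
# settings/helpers; the value below is a placeholder.
TEST_BUCKET_NAME = "octue-test-bucket"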
Example #2
    def test_download_from_nested_dataset(self):
        """Test that all files in a nested dataset can be downloaded with one command."""
        dataset_path = self._create_nested_cloud_dataset()

        dataset = Dataset(path=dataset_path, recursive=True)

        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset.download(local_directory=temporary_directory)

            with open(os.path.join(temporary_directory, "file_0.txt")) as f:
                self.assertEqual(f.read(), "[1, 2, 3]")

            with open(os.path.join(temporary_directory, "file_1.txt")) as f:
                self.assertEqual(f.read(), "[4, 5, 6]")

            with open(
                    os.path.join(temporary_directory, "sub-directory",
                                 "sub_file.txt")) as f:
                self.assertEqual(f.read(), "['a', 'b', 'c']")

            with open(
                    os.path.join(temporary_directory, "sub-directory",
                                 "sub-sub-directory",
                                 "sub_sub_file.txt")) as f:
                self.assertEqual(f.read(), "['blah', 'b', 'c']")
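
Several of these tests call `_create_nested_cloud_dataset`, which isn't shown here. A minimal sketch of what it must do, reconstructed from the file paths and contents the tests read back (the method signature and default dataset name are assumptions):

    def _create_nested_cloud_dataset(self, dataset_name="nested_dataset"):
        """Upload a nested set of files to the test bucket and return the dataset's
        cloud path (a sketch, not the original helper).
        """
        client = GoogleCloudStorageClient()
        dataset_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, dataset_name)

        client.upload_from_string("[1, 2, 3]", cloud_path=storage.path.join(dataset_path, "file_0.txt"))
        client.upload_from_string("[4, 5, 6]", cloud_path=storage.path.join(dataset_path, "file_1.txt"))
        client.upload_from_string(
            "['a', 'b', 'c']",
            cloud_path=storage.path.join(dataset_path, "sub-directory", "sub_file.txt"),
        )
        client.upload_from_string(
            "['blah', 'b', 'c']",
            cloud_path=storage.path.join(dataset_path, "sub-directory", "sub-sub-directory", "sub_sub_file.txt"),
        )

        return dataset_path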
Example #3
    def test_upload_with_nested_dataset_preserves_nested_structure(self):
        """Test that uploading a dataset containing datafiles in a nested directory structure to the cloud preserves
        this structure in the cloud.
        """
        with tempfile.TemporaryDirectory() as temporary_directory:
            local_paths = self._create_files_and_nested_subdirectories(
                temporary_directory)
            dataset = Dataset(path=temporary_directory, recursive=True)

            upload_path = storage.path.generate_gs_path(
                TEST_BUCKET_NAME, "my-dataset")
            dataset.upload(cloud_path=upload_path)

        cloud_datafile_relative_paths = {
            blob.name.split(dataset.name)[-1].strip("/")
            for blob in GoogleCloudStorageClient().scandir(
                upload_path,
                filter=lambda blob: not blob.name.endswith(".octue") and
                SIGNED_METADATA_DIRECTORY not in blob.name,
            )
        }

        # Check that the paths relative to the dataset directory are the same in the cloud as they are locally.
        local_datafile_relative_paths = {
            path.split(temporary_directory)[-1].strip(os.path.sep).replace(
                os.path.sep, "/")
            for path in local_paths
        }

        self.assertEqual(cloud_datafile_relative_paths,
                         local_datafile_relative_paths)
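
`_create_files_and_nested_subdirectories` is another undisplayed helper. From its usage in Examples #3 and #18 it must create files in nested subdirectories of the given directory and return their paths; a sketch follows (the file names mirror the cloud helper above, and the contents written are assumptions):

    def _create_files_and_nested_subdirectories(self, directory_path):
        """Create files in nested subdirectories of the given directory and return
        the files' paths (a sketch, not the original helper).
        """
        paths = [
            os.path.join(directory_path, "file_0.txt"),
            os.path.join(directory_path, "file_1.txt"),
            os.path.join(directory_path, "sub-directory", "sub_file.txt"),
            os.path.join(directory_path, "sub-directory", "sub-sub-directory", "sub_sub_file.txt"),
        ]

        for path in paths:
            os.makedirs(os.path.dirname(path), exist_ok=True)

            with open(path, "w") as f:
                f.write("some data")

        return paths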
Example #4
    def test_download_from_nested_dataset_with_no_local_directory_given(self):
        """Test that, when downloading all files from a nested dataset and no local directory is given, the dataset
        structure is preserved in the temporary directory used.
        """
        dataset_path = self._create_nested_cloud_dataset()

        dataset = Dataset(path=dataset_path, recursive=True)

        # Mock the temporary directory created in `Dataset.download` so we can access it for the test.
        temporary_directory = tempfile.TemporaryDirectory()

        with patch("tempfile.TemporaryDirectory",
                   return_value=temporary_directory):
            dataset.download()

        with open(os.path.join(temporary_directory.name, "file_0.txt")) as f:
            self.assertEqual(f.read(), "[1, 2, 3]")

        with open(os.path.join(temporary_directory.name, "file_1.txt")) as f:
            self.assertEqual(f.read(), "[4, 5, 6]")

        with open(
                os.path.join(temporary_directory.name, "sub-directory",
                             "sub_file.txt")) as f:
            self.assertEqual(f.read(), "['a', 'b', 'c']")

        with open(
                os.path.join(temporary_directory.name, "sub-directory",
                             "sub-sub-directory", "sub_sub_file.txt")) as f:
            self.assertEqual(f.read(), "['blah', 'b', 'c']")
Example #5
    def test_serialisation_and_deserialisation(self):
        """Test that manifests can be serialised and deserialised."""
        with tempfile.TemporaryDirectory() as temporary_directory:
            datasets = {
                "my_dataset_0":
                Dataset(
                    path=os.path.join(temporary_directory, "my_dataset_0"),
                    files=[
                        Datafile(
                            path=os.path.join(temporary_directory,
                                              "my_dataset_0", "my_file_0.txt"))
                    ],
                ),
                "my_dataset_1":
                Dataset(
                    path=os.path.join(temporary_directory, "my_dataset_1"),
                    files=[
                        Datafile(
                            path=os.path.join(temporary_directory,
                                              "my_dataset_1", "my_file_1.txt"))
                    ],
                ),
            }

            for dataset in datasets.values():
                dataset.update_local_metadata()

            manifest = Manifest(datasets=datasets,
                                id="7e0025cd-bd68-4de6-b48d-2643ebd5effd",
                                name="my-manifest")

            serialised_manifest = manifest.to_primitive()

            self.assertEqual(
                serialised_manifest,
                {
                    "id": manifest.id,
                    "name": "my-manifest",
                    "datasets": {
                        "my_dataset_0":
                        os.path.join(temporary_directory, "my_dataset_0"),
                        "my_dataset_1":
                        os.path.join(temporary_directory, "my_dataset_1"),
                    },
                },
            )

            deserialised_manifest = Manifest.deserialise(serialised_manifest)

        self.assertEqual(manifest.name, deserialised_manifest.name)
        self.assertEqual(manifest.id, deserialised_manifest.id)

        for key in manifest.datasets.keys():
            self.assertEqual(manifest.datasets[key].name,
                             deserialised_manifest.datasets[key].name)
            self.assertEqual(manifest.datasets[key].id,
                             deserialised_manifest.datasets[key].id)
            self.assertEqual(manifest.datasets[key].path,
                             deserialised_manifest.datasets[key].path)
Example #6
    def test_upload_works_with_implicit_cloud_location_if_cloud_location_previously_provided(
            self):
        """Test that `Dataset.upload` works with an implicit cloud location if the cloud location has previously
        been provided.
        """
        dataset_path = self._create_nested_cloud_dataset()
        dataset = Dataset(path=dataset_path, recursive=True)
        dataset.upload()
Example #7
    def test_cannot_add_non_datafiles(self):
        """Ensures that exception will be raised if adding a non-datafile object"""
        class NotADatafile:
            pass

        resource = Dataset()
        with self.assertRaises(exceptions.InvalidInputException):
            resource.add(NotADatafile())
Example #8
    def test_error_raised_if_attempting_to_generate_signed_url_for_local_dataset(
            self):
        """Test that an error is raised if trying to generate a signed URL for a local dataset."""
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset = Dataset(path=temporary_directory,
                              tags={"hello": "world"})

            with self.assertRaises(exceptions.CloudLocationNotSpecified):
                dataset.generate_signed_url()
Example #9
    def test_metadata_hash_is_same_for_different_datasets_with_the_same_metadata(
            self):
        """Test that the metadata hash is the same for datasets with different files but the same metadata."""
        first_dataset = Dataset(labels={"a", "b", "c"})
        second_dataset = Dataset(
            files={Datafile(path="blah", hypothetical=True)},
            labels={"a", "b", "c"})
        self.assertEqual(first_dataset.metadata_hash_value,
                         second_dataset.metadata_hash_value)
Example #10
    def test_exists_in_cloud(self):
        """Test whether all files of a dataset are in the cloud or not can be determined."""
        self.assertFalse(self.create_valid_dataset().all_files_are_in_cloud)

        with tempfile.TemporaryDirectory() as temporary_directory:
            self.assertTrue(
                Dataset(path=temporary_directory).all_files_are_in_cloud)

        files = [
            Datafile(path="gs://hello/file.txt"),
            Datafile(path="gs://goodbye/file.csv")
        ]
        self.assertTrue(Dataset(files=files).all_files_are_in_cloud)
Example #11
    def test_exiting_context_manager_of_cloud_dataset_updates_cloud_metadata(
            self):
        """Test that cloud metadata for a cloud dataset is updated on exit of the dataset context manager."""
        dataset_path = self._create_nested_cloud_dataset()
        dataset = Dataset(path=dataset_path, recursive=True)

        with dataset:
            dataset.tags = {"cat": "dog"}
            dataset.labels = {"animals"}

        reloaded_dataset = Dataset(path=dataset_path)
        self.assertEqual(reloaded_dataset.id, dataset.id)
        self.assertEqual(reloaded_dataset.tags, {"cat": "dog"})
        self.assertEqual(reloaded_dataset.labels, {"animals"})
Example #12
    def test_stored_metadata_ignored_if_hypothetical_is_true(self):
        """Test that instantiation metadata is used instead of stored metadata if `hypothetical` is `True`."""
        cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME,
                                                   "existing_dataset")

        # Create a dataset in the cloud and set some metadata on it.
        with Dataset(path=cloud_path) as dataset:
            dataset.tags = {"existing": True}

        # Load it separately from the cloud object and check that the instantiation metadata is used instead of the
        # stored metadata.
        reloaded_dataset = Dataset(path=cloud_path,
                                   tags={"new": "tag"},
                                   hypothetical=True)
        self.assertEqual(reloaded_dataset.tags, {"new": "tag"})
Example #13
    def test_instantiation_metadata_used_if_not_hypothetical_but_no_stored_metadata(
            self):
        """Test that instantiation metadata is used if `hypothetical` is `False` but there's no stored metadata."""
        cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME,
                                                   "non_existing_dataset")
        dataset = Dataset(path=cloud_path, tags={"new": "tag"})
        self.assertEqual(dataset.tags, {"new": "tag"})
Example #14
    def test_from_cloud(self):
        """Test that a Dataset in cloud storage can be accessed via a cloud path."""
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset = create_dataset_with_two_files(temporary_directory)
            dataset.tags = {"a": "b", "c": 1}

            cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME,
                                                       "a_directory",
                                                       dataset.name)
            dataset.upload(cloud_path)
            persisted_dataset = Dataset(path=cloud_path)

            self.assertEqual(
                persisted_dataset.path,
                f"gs://{TEST_BUCKET_NAME}/a_directory/{dataset.name}")
            self.assertEqual(persisted_dataset.id, dataset.id)
            self.assertEqual(persisted_dataset.name, dataset.name)
            self.assertEqual(persisted_dataset.hash_value, dataset.hash_value)
            self.assertEqual(persisted_dataset.tags, dataset.tags)
            self.assertEqual(persisted_dataset.labels, dataset.labels)
            self.assertEqual({file.name for file in persisted_dataset.files},
                             {file.name for file in dataset.files})

            for file in persisted_dataset:
                self.assertEqual(
                    file.cloud_path,
                    f"gs://{TEST_BUCKET_NAME}/a_directory/{dataset.name}/{file.name}"
                )
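
`create_dataset_with_two_files` appears to be a module-level fixture. A sketch consistent with its usage in Example #14 (the file names and contents are assumptions):

def create_dataset_with_two_files(dataset_directory):
    """Create and return a local dataset containing two datafiles (a sketch, not
    the original fixture).
    """
    paths = [
        os.path.join(dataset_directory, "file_1.txt"),
        os.path.join(dataset_directory, "file_2.txt"),
    ]

    for index, path in enumerate(paths):
        with open(path, "w") as f:
            f.write(str(index))

    return Dataset(path=dataset_directory, files=[Datafile(path=path) for path in paths])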
Example #15
    def test_instantiating_from_serialised_cloud_datasets_with_no_dataset_json_file(
            self):
        """Test that a Manifest can be instantiated from a serialized cloud dataset with no `dataset.json` file. This
        simulates what happens when such a cloud dataset is referred to in a manifest received by a child service.
        """
        GoogleCloudStorageClient().upload_from_string(
            "[1, 2, 3]",
            cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME,
                                                     "my_dataset",
                                                     "file_0.txt"),
        )

        GoogleCloudStorageClient().upload_from_string(
            "[4, 5, 6]",
            cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME,
                                                     "my_dataset",
                                                     "file_1.txt"),
        )

        serialised_cloud_dataset = Dataset(
            path=f"gs://{TEST_BUCKET_NAME}/my_dataset").to_primitive()

        manifest = Manifest(datasets={"my_dataset": serialised_cloud_dataset})
        self.assertEqual(len(manifest.datasets), 1)
        self.assertEqual(manifest.datasets["my_dataset"].path,
                         f"gs://{TEST_BUCKET_NAME}/my_dataset")
        self.assertEqual(len(manifest.datasets["my_dataset"].files), 2)
Example #16
    def test_finalise_with_upload(self):
        """Test that the `finalise` method can be used to upload the output manifest's datasets to a cloud location
        and that it updates the manifest with signed URLs for accessing them.
        """
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset_path = os.path.join(temporary_directory, "the_dataset")

            with Datafile(path=os.path.join(dataset_path, "my_file.dat"), mode="w") as (datafile, f):
                f.write("hello")

            output_manifest = Manifest(
                datasets={
                    "the_dataset": Dataset(
                        path=dataset_path, files={datafile.local_path}, labels={"one", "two", "three"}
                    )
                }
            )

            analysis = Analysis(
                twine={
                    "output_values_schema": {"type": "object", "properties": {"blah": {"type": "integer"}}},
                    "output_manifest": {"datasets": {"the_dataset": {"purpose": "testing"}}},
                },
                output_values={"blah": 3},
                output_manifest=output_manifest,
            )

            with patch("google.cloud.storage.blob.Blob.generate_signed_url", new=mock_generate_signed_url):
                analysis.finalise(upload_output_datasets_to=f"gs://{TEST_BUCKET_NAME}/datasets")

        signed_url_for_dataset = analysis.output_manifest.datasets["the_dataset"].path
        self.assertTrue(storage.path.is_url(signed_url_for_dataset))

        self.assertTrue(
            signed_url_for_dataset.startswith(
                f"{self.test_result_modifier.storage_emulator_host}/{TEST_BUCKET_NAME}/datasets/the_dataset"
            )
        )

        downloaded_dataset = Dataset(path=signed_url_for_dataset)
        self.assertEqual(downloaded_dataset.name, "the_dataset")
        self.assertEqual(len(downloaded_dataset.files), 1)
        self.assertEqual(downloaded_dataset.labels, {"one", "two", "three"})

        with downloaded_dataset.files.one() as (downloaded_datafile, f):
            self.assertEqual(f.read(), "hello")
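
Example #16 patches `Blob.generate_signed_url` because the local storage emulator can't produce real signed URLs. Given the URL format asserted above, the mock presumably returns a plain emulator URL for the blob; here is a sketch (the emulator host value and exact signature are assumptions, not the suite's actual mock):

STORAGE_EMULATOR_HOST = "http://localhost:9090"  # assumed emulator address

def mock_generate_signed_url(blob, *args, **kwargs):
    """Return an unsigned emulator URL for the blob in place of a signed URL (a
    sketch, not the test suite's actual mock).
    """
    return f"{STORAGE_EMULATOR_HOST}/{blob.bucket.name}/{blob.name}"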
Example #17
    def test_filter_name_filters_exclude_path(self):
        """Ensure that filters applied to the name don't catch terms in the rest of the path."""
        resource = Dataset(files=[
            Datafile(path="first-path-within-dataset/a_test_file.csv"),
            Datafile(path="second-path-within-dataset/a_test_file.txt"),
        ])
        files = resource.files.filter(name__icontains="second")
        self.assertEqual(0, len(files))
Example #18
    def test_exiting_context_manager_of_local_dataset_updates_local_metadata(
            self):
        """Test that local metadata for a local dataset is updated on exit of the dataset context manager."""
        with tempfile.TemporaryDirectory() as temporary_directory:
            self._create_files_and_nested_subdirectories(temporary_directory)

            dataset = Dataset(path=temporary_directory, recursive=True)

            with dataset:
                dataset.tags = {"cat": "dog"}
                dataset.labels = {"animals"}

            reloaded_dataset = Dataset(path=temporary_directory,
                                       recursive=True)
            self.assertEqual(reloaded_dataset.id, dataset.id)
            self.assertEqual(reloaded_dataset.tags, {"cat": "dog"})
            self.assertEqual(reloaded_dataset.labels, {"animals"})
Example #19
    def test_serialisation_and_deserialisation(self):
        """Test that a dataset can be serialised and deserialised."""
        dataset_id = "e376fb31-8f66-414d-b99f-b43395cebbf1"
        dataset = self.create_valid_dataset(
            id=dataset_id,
            labels=["b", "a"],
            tags={"a": 1, "b": 2},
        )

        serialised_dataset = dataset.to_primitive()

        self.assertEqual(
            serialised_dataset,
            {
                "name": "test-dataset",
                "labels": ["a", "b"],
                "tags": {"a": 1, "b": 2},
                "id": dataset_id,
                "path": os.path.join(REPOSITORY_ROOT, "tests", "data",
                                     "basic_files", "configuration",
                                     "test-dataset"),
                "files": [
                    os.path.join(
                        REPOSITORY_ROOT,
                        "tests",
                        "data",
                        "basic_files",
                        "configuration",
                        "test-dataset",
                        "path-within-dataset",
                        "a_test_file.csv",
                    ),
                    os.path.join(
                        REPOSITORY_ROOT,
                        "tests",
                        "data",
                        "basic_files",
                        "configuration",
                        "test-dataset",
                        "path-within-dataset",
                        "another_test_file.csv",
                    ),
                ],
            },
        )

        deserialised_dataset = Dataset.deserialise(serialised_dataset)
        self.assertEqual(dataset.id, deserialised_dataset.id)
        self.assertEqual(dataset.path, deserialised_dataset.path)
        self.assertEqual(dataset.name, deserialised_dataset.name)
        self.assertEqual(dataset.labels, deserialised_dataset.labels)
        self.assertEqual(dataset.tags, deserialised_dataset.tags)
Example #20
    def test_get_file_by_label(self):
        """Ensure files can be accessed by label from the dataset."""
        files = [
            Datafile(path="path-within-dataset/a_my_file.csv",
                     labels="one a b3 all"),
            Datafile(path="path-within-dataset/a_your_file.csv",
                     labels="two a2 b3 all"),
            Datafile(path="path-within-dataset/a_your_file.csv",
                     labels="three all"),
        ]

        resource = Dataset(files=files)

        # Check working for single result
        self.assertEqual(
            resource.get_file_by_label("three").labels, files[2].labels)

        # Check raises for too many results
        with self.assertRaises(
                exceptions.UnexpectedNumberOfResultsException) as e:
            resource.get_file_by_label("all")

        self.assertIn("More than one result found", e.exception.args[0])

        # Check raises for no result
        with self.assertRaises(
                exceptions.UnexpectedNumberOfResultsException) as e:
            resource.get_file_by_label("billyjeanisnotmylover")

        self.assertIn(
            "No results found for filters {'labels__contains': 'billyjeanisnotmylover'}",
            e.exception.args[0])
Example #21
    def test_adding_cloud_datafile_to_local_dataset(self):
        """Test that when a cloud datafile is added to a local dataset, it is downloaded to the root of the dataset."""
        with Datafile(
                path=storage.path.generate_gs_path(TEST_BUCKET_NAME, "path",
                                                   "to", "datafile.dat"),
                mode="w") as (datafile, f):
            f.write("hello")

        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset = Dataset(path=os.path.join(temporary_directory, "path",
                                                "to", "dataset"))
            dataset.add(datafile)

        self.assertIn(datafile, dataset)
        self.assertEqual(datafile.local_path,
                         os.path.join(dataset.path, "datafile.dat"))
Example #22
    def test_stored_metadata_has_priority_over_instantiation_metadata_if_not_hypothetical(
            self):
        """Test that stored metadata is used instead of instantiation metadata if `hypothetical` is `False`."""
        cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME,
                                                   "existing_dataset")

        # Create a dataset in the cloud and set some metadata on it.
        with Dataset(path=cloud_path) as dataset:
            dataset.tags = {"existing": True}

        # Load it separately from the cloud object and check that the stored metadata is used instead of the
        # instantiation metadata.
        with self.assertLogs() as logging_context:
            reloaded_dataset = Dataset(path=cloud_path, tags={"new": "tag"})

        self.assertEqual(reloaded_dataset.tags, {"existing": True})
        self.assertIn(
            "Overriding metadata given at instantiation with stored metadata",
            logging_context.output[0])
Example #23
    def test_adding_cloud_datafile_to_cloud_dataset_when_file_is_already_in_dataset_directory(
            self):
        """Test that a cloud datafile's path is kept as-is when adding it to a cloud dataset if it is already in the
        dataset directory and no `path_in_dataset` is provided.
        """
        dataset = Dataset(path=storage.path.generate_gs_path(
            TEST_BUCKET_NAME, "path", "to", "dataset"))

        with Datafile(path=storage.path.join(dataset.path, "subfolder",
                                             "datafile.dat"),
                      mode="w") as (datafile, f):
            f.write("hello")

        dataset.add(datafile)

        self.assertIn(datafile, dataset)
        self.assertEqual(
            datafile.cloud_path,
            storage.path.join(dataset.path, "subfolder", "datafile.dat"))
Example #24
    def test_from_cloud_with_no_metadata_file(self):
        """Test that any cloud directory can be accessed as a dataset if it has no `.octue` metadata file in it, the
        cloud dataset doesn't lose any information during serialisation, and a metadata file is uploaded afterwards.
        """
        cloud_storage_client = GoogleCloudStorageClient()

        cloud_storage_client.upload_from_string(
            "[1, 2, 3]",
            cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME,
                                                     "my_dataset",
                                                     "file_0.txt"),
        )

        cloud_storage_client.upload_from_string(
            "[4, 5, 6]",
            cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME,
                                                     "my_dataset",
                                                     "file_1.txt"),
        )

        cloud_dataset = Dataset(path=f"gs://{TEST_BUCKET_NAME}/my_dataset")

        self.assertEqual(cloud_dataset.path,
                         f"gs://{TEST_BUCKET_NAME}/my_dataset")
        self.assertEqual(cloud_dataset.name, "my_dataset")
        self.assertEqual({file.name for file in cloud_dataset.files},
                         {"file_0.txt", "file_1.txt"})

        for file in cloud_dataset:
            self.assertEqual(
                file.cloud_path,
                f"gs://{TEST_BUCKET_NAME}/my_dataset/{file.name}")

        # Test serialisation doesn't lose any information.
        deserialised_dataset = Dataset.deserialise(
            cloud_dataset.to_primitive())
        self.assertEqual(deserialised_dataset.id, cloud_dataset.id)
        self.assertEqual(deserialised_dataset.name, cloud_dataset.name)
        self.assertEqual(deserialised_dataset.path, cloud_dataset.path)
        self.assertEqual(deserialised_dataset.hash_value,
                         cloud_dataset.hash_value)
Example #25
    def test_adding_local_datafile_to_local_dataset_when_file_is_already_in_dataset_directory(
            self):
        """Test that a local datafile's path is kept as-is when adding it to a local dataset if it is already in the
        dataset directory.
        """
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset = Dataset(path=os.path.join(temporary_directory, "path",
                                                "to", "dataset"))

            with Datafile(path=os.path.join(dataset.path, "subfolder",
                                            "datafile.dat"),
                          mode="w") as (datafile, f):
                f.write("hello")

            dataset.add(datafile)

        self.assertIn(datafile, dataset)
        self.assertEqual(
            datafile.local_path,
            os.path.join(dataset.path, "subfolder", "datafile.dat"))
Example #26
    def test_adding_cloud_datafile_to_cloud_dataset(self):
        """Test that a cloud datafile can be added to a cloud dataset and that it's copied into the dataset root if no
        `path_within_dataset` is provided.
        """
        dataset = Dataset(path=storage.path.generate_gs_path(
            TEST_BUCKET_NAME, "path", "to", "dataset"))

        with Datafile(
                path=storage.path.generate_gs_path(TEST_BUCKET_NAME, "path",
                                                   "to", "datafile.dat"),
                mode="w") as (datafile, f):
            f.write("hello")

        dataset.add(datafile)

        self.assertIn(datafile, dataset)
        self.assertEqual(datafile.cloud_path,
                         storage.path.join(dataset.path, "datafile.dat"))
Example #27
    def test_filter_name_filters_include_extension(self):
        """Ensures that filters applied to the name will catch terms in the extension"""
        files = [
            Datafile(path="path-within-dataset/a_test_file.csv"),
            Datafile(path="path-within-dataset/a_test_file.txt"),
        ]

        self.assertEqual(
            Dataset(files=files).files.filter(
                name__icontains="txt").pop().path,
            FilterSet({files[1]}).pop().local_path)
Example #28
    def test_all_datasets_are_in_cloud(self):
        """Test whether all files of all datasets in a manifest are in the cloud or not can be determined."""
        self.assertFalse(
            self.create_valid_manifest().all_datasets_are_in_cloud)
        self.assertTrue(Manifest().all_datasets_are_in_cloud)

        files = [
            Datafile(path="gs://hello/file.txt"),
            Datafile(path="gs://goodbye/file.csv")
        ]
        manifest = Manifest(datasets={"my_dataset": Dataset(files=files)})
        self.assertTrue(manifest.all_datasets_are_in_cloud)
Example #29
    def create_valid_dataset(self, **kwargs):
        """Create a valid dataset with two valid datafiles (they're the same file in this case)."""
        path = os.path.join(self.data_path, "basic_files", "configuration", "test-dataset")

        return Dataset(
            path=path,
            files=[
                Datafile(path=os.path.join(path, "path-within-dataset", "a_test_file.csv")),
                Datafile(path=os.path.join(path, "path-within-dataset", "another_test_file.csv")),
            ],
            **kwargs
        )
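
For the absolute paths asserted in Example #19 to match, `self.data_path` must point at the repository's test-data directory. A plausible definition, inferred from those assertions rather than taken from the original suite:

    def setUp(self):
        """Point `data_path` at the repository's test-data directory (a sketch of
        the assumed fixture).
        """
        super().setUp()
        self.data_path = os.path.join(REPOSITORY_ROOT, "tests", "data")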
Example #30
    def test_adding_local_datafile_to_local_dataset(self):
        """Test that a local datafile can be added to a local dataset and that it is copied to the root of the dataset
        if no `path_within_dataset` is provided.
        """
        with tempfile.TemporaryDirectory() as temporary_directory:
            dataset = Dataset(path=os.path.join(temporary_directory, "path",
                                                "to", "dataset"))

            with Datafile(path=os.path.join(temporary_directory, "path", "to",
                                            "datafile.dat"),
                          mode="w") as (datafile, f):
                f.write("hello")

            dataset.add(datafile)

        self.assertIn(datafile, dataset)
        self.assertEqual(datafile.local_path,
                         os.path.join(dataset.path, "datafile.dat"))