def test_download(self):
    """Test that all files in a dataset can be downloaded with one command."""
    storage_client = GoogleCloudStorageClient()
    dataset_name = "another-dataset"

    storage_client.upload_from_string(
        string=json.dumps([1, 2, 3]),
        cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, dataset_name, "file_0.txt"),
    )

    storage_client.upload_from_string(
        string=json.dumps([4, 5, 6]),
        cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, dataset_name, "file_1.txt"),
    )

    dataset = Dataset(path=f"gs://{TEST_BUCKET_NAME}/{dataset_name}")

    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset.download(local_directory=temporary_directory)

        with open(os.path.join(temporary_directory, "file_0.txt")) as f:
            self.assertEqual(f.read(), "[1, 2, 3]")

        with open(os.path.join(temporary_directory, "file_1.txt")) as f:
            self.assertEqual(f.read(), "[4, 5, 6]")

def test_download_from_nested_dataset(self):
    """Test that all files in a nested dataset can be downloaded with one command."""
    dataset_path = self._create_nested_cloud_dataset()
    dataset = Dataset(path=dataset_path, recursive=True)

    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset.download(local_directory=temporary_directory)

        with open(os.path.join(temporary_directory, "file_0.txt")) as f:
            self.assertEqual(f.read(), "[1, 2, 3]")

        with open(os.path.join(temporary_directory, "file_1.txt")) as f:
            self.assertEqual(f.read(), "[4, 5, 6]")

        with open(os.path.join(temporary_directory, "sub-directory", "sub_file.txt")) as f:
            self.assertEqual(f.read(), "['a', 'b', 'c']")

        with open(
            os.path.join(temporary_directory, "sub-directory", "sub-sub-directory", "sub_sub_file.txt")
        ) as f:
            self.assertEqual(f.read(), "['blah', 'b', 'c']")

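# The `_create_nested_cloud_dataset` helper used by this and later tests isn't shown in this excerpt (it's defined
# elsewhere in the test class). A minimal, hypothetical stand-in is sketched below, inferred purely from the file
# names and contents asserted in the nested-dataset tests; the method name (suffixed with `_sketch`) and the
# `dataset_name` default are assumptions, not the real implementation.
def _create_nested_cloud_dataset_sketch(self, dataset_name="nested_dataset"):
    """Create a nested dataset in the cloud and return its cloud path (illustrative sketch only)."""
    client = GoogleCloudStorageClient()

    client.upload_from_string(
        "[1, 2, 3]",
        cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, dataset_name, "file_0.txt"),
    )
    client.upload_from_string(
        "[4, 5, 6]",
        cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, dataset_name, "file_1.txt"),
    )
    client.upload_from_string(
        "['a', 'b', 'c']",
        cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, dataset_name, "sub-directory", "sub_file.txt"),
    )
    client.upload_from_string(
        "['blah', 'b', 'c']",
        cloud_path=storage.path.generate_gs_path(
            TEST_BUCKET_NAME, dataset_name, "sub-directory", "sub-sub-directory", "sub_sub_file.txt"
        ),
    )

    return f"gs://{TEST_BUCKET_NAME}/{dataset_name}"
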
def test_upload_with_nested_dataset_preserves_nested_structure(self):
    """Test that uploading a dataset containing datafiles in a nested directory structure to the cloud preserves
    this structure in the cloud.
    """
    with tempfile.TemporaryDirectory() as temporary_directory:
        local_paths = self._create_files_and_nested_subdirectories(temporary_directory)
        dataset = Dataset(path=temporary_directory, recursive=True)

        upload_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "my-dataset")
        dataset.upload(cloud_path=upload_path)

        cloud_datafile_relative_paths = {
            blob.name.split(dataset.name)[-1].strip("/")
            for blob in GoogleCloudStorageClient().scandir(
                upload_path,
                filter=lambda blob: not blob.name.endswith(".octue") and SIGNED_METADATA_DIRECTORY not in blob.name,
            )
        }

        # Check that the paths relative to the dataset directory are the same in the cloud as they are locally.
        local_datafile_relative_paths = {
            path.split(temporary_directory)[-1].strip(os.path.sep).replace(os.path.sep, "/") for path in local_paths
        }

        self.assertEqual(cloud_datafile_relative_paths, local_datafile_relative_paths)

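# `_create_files_and_nested_subdirectories` is another helper defined outside this excerpt. A hypothetical sketch is
# given below for orientation; the exact directory layout, file contents, and return value are assumptions (the
# tests only require that it creates files in nested subdirectories and returns their local paths).
def _create_files_and_nested_subdirectories_sketch(self, directory_path):
    """Create files in nested subdirectories of the given directory and return their paths (illustrative sketch)."""
    paths = [
        os.path.join(directory_path, "file_0.txt"),
        os.path.join(directory_path, "file_1.txt"),
        os.path.join(directory_path, "sub-directory", "sub_file.txt"),
        os.path.join(directory_path, "sub-directory", "sub-sub-directory", "sub_sub_file.txt"),
    ]

    for path in paths:
        os.makedirs(os.path.dirname(path), exist_ok=True)

        with open(path, "w") as f:
            f.write("[1, 2, 3]")

    return paths
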
def test_download_from_nested_dataset_with_no_local_directory_given(self):
    """Test that, when downloading all files from a nested dataset and no local directory is given, the dataset
    structure is preserved in the temporary directory used.
    """
    dataset_path = self._create_nested_cloud_dataset()
    dataset = Dataset(path=dataset_path, recursive=True)

    # Mock the temporary directory created in `Dataset.download_all_files` so we can access it for the test.
    temporary_directory = tempfile.TemporaryDirectory()

    with patch("tempfile.TemporaryDirectory", return_value=temporary_directory):
        dataset.download()

    with open(os.path.join(temporary_directory.name, "file_0.txt")) as f:
        self.assertEqual(f.read(), "[1, 2, 3]")

    with open(os.path.join(temporary_directory.name, "file_1.txt")) as f:
        self.assertEqual(f.read(), "[4, 5, 6]")

    with open(os.path.join(temporary_directory.name, "sub-directory", "sub_file.txt")) as f:
        self.assertEqual(f.read(), "['a', 'b', 'c']")

    with open(
        os.path.join(temporary_directory.name, "sub-directory", "sub-sub-directory", "sub_sub_file.txt")
    ) as f:
        self.assertEqual(f.read(), "['blah', 'b', 'c']")

def test_serialisation_and_deserialisation(self):
    """Test that manifests can be serialised and deserialised."""
    with tempfile.TemporaryDirectory() as temporary_directory:
        datasets = {
            "my_dataset_0": Dataset(
                path=os.path.join(temporary_directory, "my_dataset_0"),
                files=[Datafile(path=os.path.join(temporary_directory, "my_dataset_0", "my_file_0.txt"))],
            ),
            "my_dataset_1": Dataset(
                path=os.path.join(temporary_directory, "my_dataset_1"),
                files=[Datafile(path=os.path.join(temporary_directory, "my_dataset_1", "my_file_1.txt"))],
            ),
        }

        for dataset in datasets.values():
            dataset.update_local_metadata()

        manifest = Manifest(datasets=datasets, id="7e0025cd-bd68-4de6-b48d-2643ebd5effd", name="my-manifest")
        serialised_manifest = manifest.to_primitive()

        self.assertEqual(
            serialised_manifest,
            {
                "id": manifest.id,
                "name": "my-manifest",
                "datasets": {
                    "my_dataset_0": os.path.join(temporary_directory, "my_dataset_0"),
                    "my_dataset_1": os.path.join(temporary_directory, "my_dataset_1"),
                },
            },
        )

        deserialised_manifest = Manifest.deserialise(serialised_manifest)
        self.assertEqual(manifest.name, deserialised_manifest.name)
        self.assertEqual(manifest.id, deserialised_manifest.id)

        for key in manifest.datasets:
            self.assertEqual(manifest.datasets[key].name, deserialised_manifest.datasets[key].name)
            self.assertEqual(manifest.datasets[key].id, deserialised_manifest.datasets[key].id)
            self.assertEqual(manifest.datasets[key].path, deserialised_manifest.datasets[key].path)

def test_upload_works_with_implicit_cloud_location_if_cloud_location_previously_provided(self):
    """Test that `Dataset.upload` works with an implicit cloud location if the cloud location has previously been
    provided.
    """
    dataset_path = self._create_nested_cloud_dataset()
    dataset = Dataset(path=dataset_path, recursive=True)
    dataset.upload()

def test_cannot_add_non_datafiles(self):
    """Ensure that an exception is raised if a non-datafile object is added to a dataset."""

    class NotADatafile:
        pass

    resource = Dataset()

    with self.assertRaises(exceptions.InvalidInputException):
        resource.add(NotADatafile())

def test_error_raised_if_attempting_to_generate_signed_url_for_local_dataset(self):
    """Test that an error is raised if trying to generate a signed URL for a local dataset."""
    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset = Dataset(path=temporary_directory, tags={"hello": "world"})

        with self.assertRaises(exceptions.CloudLocationNotSpecified):
            dataset.generate_signed_url()

def test_metadata_hash_is_same_for_different_datasets_with_the_same_metadata(self):
    """Test that the metadata hash is the same for datasets with different files but the same metadata."""
    first_dataset = Dataset(labels={"a", "b", "c"})
    second_dataset = Dataset(files={Datafile(path="blah", hypothetical=True)}, labels={"a", "b", "c"})
    self.assertEqual(first_dataset.metadata_hash_value, second_dataset.metadata_hash_value)

def test_exists_in_cloud(self):
    """Test that whether or not all of a dataset's files are in the cloud can be determined."""
    self.assertFalse(self.create_valid_dataset().all_files_are_in_cloud)

    with tempfile.TemporaryDirectory() as temporary_directory:
        self.assertTrue(Dataset(path=temporary_directory).all_files_are_in_cloud)

    files = [Datafile(path="gs://hello/file.txt"), Datafile(path="gs://goodbye/file.csv")]
    self.assertTrue(Dataset(files=files).all_files_are_in_cloud)

def test_exiting_context_manager_of_cloud_dataset_updates_cloud_metadata(self):
    """Test that cloud metadata for a cloud dataset is updated on exit of the dataset context manager."""
    dataset_path = self._create_nested_cloud_dataset()
    dataset = Dataset(path=dataset_path, recursive=True)

    with dataset:
        dataset.tags = {"cat": "dog"}
        dataset.labels = {"animals"}

    reloaded_dataset = Dataset(path=dataset_path)
    self.assertEqual(reloaded_dataset.id, dataset.id)
    self.assertEqual(reloaded_dataset.tags, {"cat": "dog"})
    self.assertEqual(reloaded_dataset.labels, {"animals"})

def test_stored_metadata_ignored_if_hypothetical_is_true(self):
    """Test that instantiation metadata is used instead of stored metadata if `hypothetical` is `True`."""
    cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "existing_dataset")

    # Create a dataset in the cloud and set some metadata on it.
    with Dataset(path=cloud_path) as dataset:
        dataset.tags = {"existing": True}

    # Load it separately from the cloud object and check that the instantiation metadata is used instead of the
    # stored metadata.
    reloaded_dataset = Dataset(path=cloud_path, tags={"new": "tag"}, hypothetical=True)
    self.assertEqual(reloaded_dataset.tags, {"new": "tag"})

def test_instantiation_metadata_used_if_not_hypothetical_but_no_stored_metadata(self):
    """Test that instantiation metadata is used if `hypothetical` is `False` but there's no stored metadata."""
    cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "non_existing_dataset")
    dataset = Dataset(path=cloud_path, tags={"new": "tag"})
    self.assertEqual(dataset.tags, {"new": "tag"})

def test_from_cloud(self):
    """Test that a Dataset in cloud storage can be accessed via a cloud path."""
    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset = create_dataset_with_two_files(temporary_directory)
        dataset.tags = {"a": "b", "c": 1}

        cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "a_directory", dataset.name)
        dataset.upload(cloud_path)

        persisted_dataset = Dataset(path=cloud_path)
        self.assertEqual(persisted_dataset.path, f"gs://{TEST_BUCKET_NAME}/a_directory/{dataset.name}")
        self.assertEqual(persisted_dataset.id, dataset.id)
        self.assertEqual(persisted_dataset.name, dataset.name)
        self.assertEqual(persisted_dataset.hash_value, dataset.hash_value)
        self.assertEqual(persisted_dataset.tags, dataset.tags)
        self.assertEqual(persisted_dataset.labels, dataset.labels)
        self.assertEqual({file.name for file in persisted_dataset.files}, {file.name for file in dataset.files})

        for file in persisted_dataset:
            self.assertEqual(file.cloud_path, f"gs://{TEST_BUCKET_NAME}/a_directory/{dataset.name}/{file.name}")

def test_instantiating_from_serialised_cloud_datasets_with_no_dataset_json_file(self):
    """Test that a Manifest can be instantiated from a serialised cloud dataset with no `dataset.json` file. This
    simulates what happens when such a cloud dataset is referred to in a manifest received by a child service.
    """
    GoogleCloudStorageClient().upload_from_string(
        "[1, 2, 3]",
        cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, "my_dataset", "file_0.txt"),
    )

    GoogleCloudStorageClient().upload_from_string(
        "[4, 5, 6]",
        cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, "my_dataset", "file_1.txt"),
    )

    serialised_cloud_dataset = Dataset(path=f"gs://{TEST_BUCKET_NAME}/my_dataset").to_primitive()

    manifest = Manifest(datasets={"my_dataset": serialised_cloud_dataset})
    self.assertEqual(len(manifest.datasets), 1)
    self.assertEqual(manifest.datasets["my_dataset"].path, f"gs://{TEST_BUCKET_NAME}/my_dataset")
    self.assertEqual(len(manifest.datasets["my_dataset"].files), 2)

def test_finalise_with_upload(self):
    """Test that the `finalise` method can be used to upload the output manifest's datasets to a cloud location
    and that it updates the manifest with signed URLs for accessing them.
    """
    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset_path = os.path.join(temporary_directory, "the_dataset")

        with Datafile(path=os.path.join(dataset_path, "my_file.dat"), mode="w") as (datafile, f):
            f.write("hello")

        output_manifest = Manifest(
            datasets={
                "the_dataset": Dataset(
                    path=dataset_path, files={datafile.local_path}, labels={"one", "two", "three"}
                )
            }
        )

        analysis = Analysis(
            twine={
                "output_values_schema": {"type": "object", "properties": {"blah": {"type": "integer"}}},
                "output_manifest": {"datasets": {"the_dataset": {"purpose": "testing"}}},
            },
            output_values={"blah": 3},
            output_manifest=output_manifest,
        )

        with patch("google.cloud.storage.blob.Blob.generate_signed_url", new=mock_generate_signed_url):
            analysis.finalise(upload_output_datasets_to=f"gs://{TEST_BUCKET_NAME}/datasets")

        signed_url_for_dataset = analysis.output_manifest.datasets["the_dataset"].path
        self.assertTrue(storage.path.is_url(signed_url_for_dataset))

        self.assertTrue(
            signed_url_for_dataset.startswith(
                f"{self.test_result_modifier.storage_emulator_host}/{TEST_BUCKET_NAME}/datasets/the_dataset"
            )
        )

        downloaded_dataset = Dataset(path=signed_url_for_dataset)
        self.assertEqual(downloaded_dataset.name, "the_dataset")
        self.assertEqual(len(downloaded_dataset.files), 1)
        self.assertEqual(downloaded_dataset.labels, {"one", "two", "three"})

        with downloaded_dataset.files.one() as (downloaded_datafile, f):
            self.assertEqual(f.read(), "hello")

def test_filter_name_filters_exclude_path(self):
    """Ensure that filters applied to the name don't catch terms in the rest of the path."""
    resource = Dataset(
        files=[
            Datafile(path="first-path-within-dataset/a_test_file.csv"),
            Datafile(path="second-path-within-dataset/a_test_file.txt"),
        ]
    )

    files = resource.files.filter(name__icontains="second")
    self.assertEqual(0, len(files))

def test_exiting_context_manager_of_local_dataset_updates_local_metadata(self):
    """Test that local metadata for a local dataset is updated on exit of the dataset context manager."""
    with tempfile.TemporaryDirectory() as temporary_directory:
        self._create_files_and_nested_subdirectories(temporary_directory)
        dataset = Dataset(path=temporary_directory, recursive=True)

        with dataset:
            dataset.tags = {"cat": "dog"}
            dataset.labels = {"animals"}

        reloaded_dataset = Dataset(path=temporary_directory, recursive=True)
        self.assertEqual(reloaded_dataset.id, dataset.id)
        self.assertEqual(reloaded_dataset.tags, {"cat": "dog"})
        self.assertEqual(reloaded_dataset.labels, {"animals"})

def test_serialisation_and_deserialisation(self):
    """Test that a dataset can be serialised and deserialised."""
    dataset_id = "e376fb31-8f66-414d-b99f-b43395cebbf1"
    dataset = self.create_valid_dataset(id=dataset_id, labels=["b", "a"], tags={"a": 1, "b": 2})

    serialised_dataset = dataset.to_primitive()

    self.assertEqual(
        serialised_dataset,
        {
            "name": "test-dataset",
            "labels": ["a", "b"],
            "tags": {"a": 1, "b": 2},
            "id": dataset_id,
            "path": os.path.join(REPOSITORY_ROOT, "tests", "data", "basic_files", "configuration", "test-dataset"),
            "files": [
                os.path.join(
                    REPOSITORY_ROOT,
                    "tests",
                    "data",
                    "basic_files",
                    "configuration",
                    "test-dataset",
                    "path-within-dataset",
                    "a_test_file.csv",
                ),
                os.path.join(
                    REPOSITORY_ROOT,
                    "tests",
                    "data",
                    "basic_files",
                    "configuration",
                    "test-dataset",
                    "path-within-dataset",
                    "another_test_file.csv",
                ),
            ],
        },
    )

    deserialised_dataset = Dataset.deserialise(serialised_dataset)
    self.assertEqual(dataset.id, deserialised_dataset.id)
    self.assertEqual(dataset.path, deserialised_dataset.path)
    self.assertEqual(dataset.name, deserialised_dataset.name)
    self.assertEqual(dataset.labels, deserialised_dataset.labels)
    self.assertEqual(dataset.tags, deserialised_dataset.tags)

def test_get_file_by_label(self):
    """Ensure files can be accessed by label from the dataset."""
    files = [
        Datafile(path="path-within-dataset/a_my_file.csv", labels="one a b3 all"),
        Datafile(path="path-within-dataset/a_your_file.csv", labels="two a2 b3 all"),
        Datafile(path="path-within-dataset/a_your_file.csv", labels="three all"),
    ]

    resource = Dataset(files=files)

    # Check working for single result.
    self.assertEqual(resource.get_file_by_label("three").labels, files[2].labels)

    # Check raises for too many results.
    with self.assertRaises(exceptions.UnexpectedNumberOfResultsException) as e:
        resource.get_file_by_label("all")

    self.assertIn("More than one result found", e.exception.args[0])

    # Check raises for no result.
    with self.assertRaises(exceptions.UnexpectedNumberOfResultsException) as e:
        resource.get_file_by_label("billyjeanisnotmylover")

    self.assertIn(
        "No results found for filters {'labels__contains': 'billyjeanisnotmylover'}",
        e.exception.args[0],
    )

def test_adding_cloud_datafile_to_local_dataset(self):
    """Test that when a cloud datafile is added to a local dataset, it is downloaded to the root of the dataset."""
    with Datafile(path=storage.path.generate_gs_path(TEST_BUCKET_NAME, "path", "to", "datafile.dat"), mode="w") as (
        datafile,
        f,
    ):
        f.write("hello")

    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset = Dataset(path=os.path.join(temporary_directory, "path", "to", "dataset"))
        dataset.add(datafile)

        self.assertIn(datafile, dataset)
        self.assertEqual(datafile.local_path, os.path.join(dataset.path, "datafile.dat"))

def test_stored_metadata_has_priority_over_instantiation_metadata_if_not_hypothetical(self):
    """Test that stored metadata is used instead of instantiation metadata if `hypothetical` is `False`."""
    cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "existing_dataset")

    # Create a dataset in the cloud and set some metadata on it.
    with Dataset(path=cloud_path) as dataset:
        dataset.tags = {"existing": True}

    # Load it separately from the cloud object and check that the stored metadata is used instead of the
    # instantiation metadata.
    with self.assertLogs() as logging_context:
        reloaded_dataset = Dataset(path=cloud_path, tags={"new": "tag"})

    self.assertEqual(reloaded_dataset.tags, {"existing": True})
    self.assertIn("Overriding metadata given at instantiation with stored metadata", logging_context.output[0])

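# Taken together, the three metadata tests above illustrate the following priority rules (a summary for readers, not
# an additional test; `cloud_path` is assumed to point at a dataset that already has stored metadata):
#
#     Dataset(path=cloud_path)                                          # stored metadata is used
#     Dataset(path=cloud_path, tags={"new": "tag"})                     # stored metadata wins; an override message is logged
#     Dataset(path=cloud_path, tags={"new": "tag"}, hypothetical=True)  # instantiation metadata wins
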
def test_adding_cloud_datafile_to_cloud_dataset_when_file_is_already_in_dataset_directory(self):
    """Test that a cloud datafile's path is kept as-is when adding it to a cloud dataset if it is already in the
    dataset directory and no `path_in_dataset` is provided.
    """
    dataset = Dataset(path=storage.path.generate_gs_path(TEST_BUCKET_NAME, "path", "to", "dataset"))

    with Datafile(path=storage.path.join(dataset.path, "subfolder", "datafile.dat"), mode="w") as (datafile, f):
        f.write("hello")

    dataset.add(datafile)

    self.assertIn(datafile, dataset)
    self.assertEqual(datafile.cloud_path, storage.path.join(dataset.path, "subfolder", "datafile.dat"))

def test_from_cloud_with_no_metadata_file(self):
    """Test that any cloud directory can be accessed as a dataset if it has no `.octue` metadata file in it, that
    the cloud dataset doesn't lose any information during serialisation, and that a metadata file is uploaded
    afterwards.
    """
    cloud_storage_client = GoogleCloudStorageClient()

    cloud_storage_client.upload_from_string(
        "[1, 2, 3]",
        cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, "my_dataset", "file_0.txt"),
    )

    cloud_storage_client.upload_from_string(
        "[4, 5, 6]",
        cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, "my_dataset", "file_1.txt"),
    )

    cloud_dataset = Dataset(path=f"gs://{TEST_BUCKET_NAME}/my_dataset")

    self.assertEqual(cloud_dataset.path, f"gs://{TEST_BUCKET_NAME}/my_dataset")
    self.assertEqual(cloud_dataset.name, "my_dataset")
    self.assertEqual({file.name for file in cloud_dataset.files}, {"file_0.txt", "file_1.txt"})

    for file in cloud_dataset:
        self.assertEqual(file.cloud_path, f"gs://{TEST_BUCKET_NAME}/my_dataset/{file.name}")

    # Test serialisation doesn't lose any information.
    deserialised_dataset = Dataset.deserialise(cloud_dataset.to_primitive())
    self.assertEqual(deserialised_dataset.id, cloud_dataset.id)
    self.assertEqual(deserialised_dataset.name, cloud_dataset.name)
    self.assertEqual(deserialised_dataset.path, cloud_dataset.path)
    self.assertEqual(deserialised_dataset.hash_value, cloud_dataset.hash_value)

def test_adding_local_datafile_to_local_dataset_when_file_is_already_in_dataset_directory(self):
    """Test that a local datafile's path is kept as-is when adding it to a local dataset if it is already in the
    dataset directory.
    """
    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset = Dataset(path=os.path.join(temporary_directory, "path", "to", "dataset"))

        with Datafile(path=os.path.join(dataset.path, "subfolder", "datafile.dat"), mode="w") as (datafile, f):
            f.write("hello")

        dataset.add(datafile)

        self.assertIn(datafile, dataset)
        self.assertEqual(datafile.local_path, os.path.join(dataset.path, "subfolder", "datafile.dat"))

def test_adding_cloud_datafile_to_cloud_dataset(self):
    """Test that a cloud datafile can be added to a cloud dataset and that it's copied into the dataset root if no
    `path_within_dataset` is provided.
    """
    dataset = Dataset(path=storage.path.generate_gs_path(TEST_BUCKET_NAME, "path", "to", "dataset"))

    with Datafile(path=storage.path.generate_gs_path(TEST_BUCKET_NAME, "path", "to", "datafile.dat"), mode="w") as (
        datafile,
        f,
    ):
        f.write("hello")

    dataset.add(datafile)

    self.assertIn(datafile, dataset)
    self.assertEqual(datafile.cloud_path, storage.path.join(dataset.path, "datafile.dat"))

def test_filter_name_filters_include_extension(self):
    """Ensure that filters applied to the name will catch terms in the extension."""
    files = [
        Datafile(path="path-within-dataset/a_test_file.csv"),
        Datafile(path="path-within-dataset/a_test_file.txt"),
    ]

    self.assertEqual(
        Dataset(files=files).files.filter(name__icontains="txt").pop().path,
        FilterSet({files[1]}).pop().local_path,
    )

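# The `name__icontains` and `labels__contains` filters used in this excerpt follow a Django-style
# "<attribute>__<operator>" syntax on `dataset.files` (a FilterSet-like collection). A minimal usage sketch — the
# paths and the "processed" label below are illustrative only, not taken from the tests:
#
#     dataset = Dataset(files=[Datafile(path="a/b.csv", hypothetical=True)])
#     csv_files = dataset.files.filter(name__icontains="csv")
#     processed_files = dataset.files.filter(labels__contains="processed")
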
def test_all_datasets_are_in_cloud(self):
    """Test that whether or not all the files of all datasets in a manifest are in the cloud can be determined."""
    self.assertFalse(self.create_valid_manifest().all_datasets_are_in_cloud)
    self.assertTrue(Manifest().all_datasets_are_in_cloud)

    files = [Datafile(path="gs://hello/file.txt"), Datafile(path="gs://goodbye/file.csv")]
    manifest = Manifest(datasets={"my_dataset": Dataset(files=files)})
    self.assertTrue(manifest.all_datasets_are_in_cloud)

def create_valid_dataset(self, **kwargs):
    """Create a valid dataset containing two valid datafiles."""
    path = os.path.join(self.data_path, "basic_files", "configuration", "test-dataset")

    return Dataset(
        path=path,
        files=[
            Datafile(path=os.path.join(path, "path-within-dataset", "a_test_file.csv")),
            Datafile(path=os.path.join(path, "path-within-dataset", "another_test_file.csv")),
        ],
        **kwargs,
    )

def test_adding_local_datafile_to_local_dataset(self):
    """Test that a local datafile can be added to a local dataset and that it is copied to the root of the dataset
    if no `path_within_dataset` is provided.
    """
    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset = Dataset(path=os.path.join(temporary_directory, "path", "to", "dataset"))

        with Datafile(path=os.path.join(temporary_directory, "path", "to", "datafile.dat"), mode="w") as (
            datafile,
            f,
        ):
            f.write("hello")

        dataset.add(datafile)

        self.assertIn(datafile, dataset)
        self.assertEqual(datafile.local_path, os.path.join(dataset.path, "datafile.dat"))