def test_generating_signed_url_from_dataset_and_recreating_dataset_from_it(self):
    """Test that a signed URL can be generated for a dataset that can be used to recreate/get it, its metadata,
    and all its files.
    """
    with tempfile.TemporaryDirectory() as temporary_directory:
        local_dataset_path = os.path.join(temporary_directory, "my-dataset-to-sign")

        # Create a single tagged datafile inside the dataset directory.
        with Datafile(path=os.path.join(local_dataset_path, "my-file.dat"), mode="w") as (datafile, f):
            f.write("hello")
            datafile.tags = {"my": "metadata"}

        original_dataset = Dataset(path=local_dataset_path, tags={"hello": "world"})
        original_dataset.upload(storage.path.generate_gs_path(TEST_BUCKET_NAME, "my-dataset-to-sign"))

        # Sign the URL with a mocked signer so no real credentials are needed.
        with patch("google.cloud.storage.blob.Blob.generate_signed_url", new=mock_generate_signed_url):
            signed_url = original_dataset.generate_signed_url()

        # The signed URL alone should be enough to recreate the dataset, its metadata, and its files.
        recreated_dataset = Dataset(path=signed_url)
        self.assertEqual(recreated_dataset.tags, {"hello": "world"})

        with recreated_dataset.files.one() as (recreated_datafile, f):
            self.assertEqual(f.read(), "hello")
            self.assertEqual(recreated_datafile.name, "my-file.dat")
            self.assertEqual(recreated_datafile.extension, "dat")
def test_serialisation_and_deserialisation(self):
    """Test that manifests can be serialised and deserialised."""
    with tempfile.TemporaryDirectory() as temporary_directory:
        datasets = {}

        # Build two single-file datasets, one per index.
        for i in range(2):
            dataset_name = f"my_dataset_{i}"
            dataset_path = os.path.join(temporary_directory, dataset_name)

            datasets[dataset_name] = Dataset(
                path=dataset_path,
                files=[Datafile(path=os.path.join(dataset_path, f"my_file_{i}.txt"))],
            )

        for dataset in datasets.values():
            dataset.update_local_metadata()

        manifest = Manifest(datasets=datasets, id="7e0025cd-bd68-4de6-b48d-2643ebd5effd", name="my-manifest")
        serialised_manifest = manifest.to_primitive()

        # Serialisation should reduce each dataset to its path.
        self.assertEqual(
            serialised_manifest,
            {
                "id": manifest.id,
                "name": "my-manifest",
                "datasets": {
                    "my_dataset_0": os.path.join(temporary_directory, "my_dataset_0"),
                    "my_dataset_1": os.path.join(temporary_directory, "my_dataset_1"),
                },
            },
        )

        deserialised_manifest = Manifest.deserialise(serialised_manifest)
        self.assertEqual(manifest.name, deserialised_manifest.name)
        self.assertEqual(manifest.id, deserialised_manifest.id)

        # Every dataset should survive the round trip with its identity and location intact.
        for key, original_dataset in manifest.datasets.items():
            round_tripped_dataset = deserialised_manifest.datasets[key]
            self.assertEqual(original_dataset.name, round_tripped_dataset.name)
            self.assertEqual(original_dataset.id, round_tripped_dataset.id)
            self.assertEqual(original_dataset.path, round_tripped_dataset.path)
def test_metadata_hash_is_same_for_different_datasets_with_the_same_metadata(self):
    """Test that the metadata hash is the same for datasets with different files but the same metadata."""
    dataset_without_files = Dataset(labels={"a", "b", "c"})
    dataset_with_a_file = Dataset(files={Datafile(path="blah", hypothetical=True)}, labels={"a", "b", "c"})

    # The files shouldn't influence the metadata hash - only the metadata should.
    self.assertEqual(dataset_without_files.metadata_hash_value, dataset_with_a_file.metadata_hash_value)
def test_exists_in_cloud(self):
    """Test whether all files of a dataset are in the cloud or not can be determined."""
    # A dataset built from local files doesn't count as being in the cloud.
    self.assertFalse(self.create_valid_dataset().all_files_are_in_cloud)

    # An empty local dataset is vacuously "all in the cloud".
    with tempfile.TemporaryDirectory() as temporary_directory:
        self.assertTrue(Dataset(path=temporary_directory).all_files_are_in_cloud)

    # A dataset whose every file has a cloud path is fully in the cloud.
    cloud_files = [Datafile(path="gs://hello/file.txt"), Datafile(path="gs://goodbye/file.csv")]
    self.assertTrue(Dataset(files=cloud_files).all_files_are_in_cloud)
def test_exiting_context_manager_of_cloud_dataset_updates_cloud_metadata(self):
    """Test that cloud metadata for a cloud dataset is updated on exit of the dataset context manager."""
    dataset_path = self._create_nested_cloud_dataset()
    dataset = Dataset(path=dataset_path, recursive=True)

    # Set metadata inside the context manager; it should be written to the cloud on exit.
    with dataset:
        dataset.tags = {"cat": "dog"}
        dataset.labels = {"animals"}

    # Re-instantiating from the same cloud path should pick up the persisted metadata.
    reloaded_dataset = Dataset(path=dataset_path)
    self.assertEqual(reloaded_dataset.id, dataset.id)
    self.assertEqual(reloaded_dataset.tags, {"cat": "dog"})
    self.assertEqual(reloaded_dataset.labels, {"animals"})
def test_update_metadata_with_cloud_dataset(self):
    """Test the `update_metadata` method with a cloud dataset."""
    dataset_path = self._create_nested_cloud_dataset()
    dataset = Dataset(path=dataset_path)

    # Mutating the instance's tags alone mustn't touch the cloud-stored metadata...
    dataset.tags["hello"] = "world"
    self.assertEqual(Dataset(path=dataset.path).tags, {})

    # ...but calling `update_metadata` should persist them to the cloud.
    dataset.update_metadata()
    self.assertEqual(Dataset(path=dataset.path).tags, {"hello": "world"})
def test_stored_metadata_ignored_if_hypothetical_is_true(self):
    """Test that instantiation metadata is used instead of stored metadata if `hypothetical` is `True`."""
    cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "existing_dataset")

    # Create a dataset in the cloud and set some metadata on it.
    with Dataset(path=cloud_path) as dataset:
        dataset.tags = {"existing": True}

    # Load it separately from the cloud object and check that the instantiation metadata is used instead of the
    # stored metadata. (The variable previously named `reloaded_datafile` holds a `Dataset`, so it's renamed
    # `reloaded_dataset` for accuracy and consistency with the sibling tests.)
    reloaded_dataset = Dataset(path=cloud_path, tags={"new": "tag"}, hypothetical=True)
    self.assertEqual(reloaded_dataset.tags, {"new": "tag"})
def test_update_metadata_with_local_dataset(self):
    """Test the `update_metadata` method with a local dataset."""
    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset = Dataset(path=temporary_directory)

        # Changing the in-memory tags alone mustn't write anything to disk...
        dataset.tags["hello"] = "world"
        self.assertEqual(Dataset(path=temporary_directory).tags, {})

        # ...but calling `update_metadata` should store them locally.
        dataset.update_metadata()
        self.assertEqual(Dataset(path=temporary_directory).tags, {"hello": "world"})
def test_download(self):
    """Test that all files in a dataset can be downloaded with one command."""
    storage_client = GoogleCloudStorageClient()
    dataset_name = "another-dataset"

    # Seed the cloud dataset with two JSON files.
    file_contents = {"file_0.txt": [1, 2, 3], "file_1.txt": [4, 5, 6]}

    for filename, contents in file_contents.items():
        storage_client.upload_from_string(
            string=json.dumps(contents),
            cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, dataset_name, filename),
        )

    dataset = Dataset(path=f"gs://{TEST_BUCKET_NAME}/{dataset_name}")

    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset.download(local_directory=temporary_directory)

        with open(os.path.join(temporary_directory, "file_0.txt")) as f:
            self.assertEqual(f.read(), "[1, 2, 3]")

        with open(os.path.join(temporary_directory, "file_1.txt")) as f:
            self.assertEqual(f.read(), "[4, 5, 6]")
def test_upload_with_nested_dataset_preserves_nested_structure(self):
    """Test that uploading a dataset containing datafiles in a nested directory structure to the cloud preserves
    this structure in the cloud.
    """
    with tempfile.TemporaryDirectory() as temporary_directory:
        local_paths = self._create_files_and_nested_subdirectories(temporary_directory)
        dataset = Dataset(path=temporary_directory, recursive=True)

        upload_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "my-dataset")
        dataset.upload(cloud_path=upload_path)

        def is_datafile_blob(blob):
            # Ignore octue metadata blobs and signed-metadata blobs.
            return not blob.name.endswith(".octue") and SIGNED_METADATA_DIRECTORY not in blob.name

        # Collect the uploaded datafiles' paths relative to the dataset root in the cloud.
        cloud_datafile_relative_paths = set()

        for blob in GoogleCloudStorageClient().scandir(upload_path, filter=is_datafile_blob):
            cloud_datafile_relative_paths.add(blob.name.split(dataset.name)[-1].strip("/"))

        # Check that the paths relative to the dataset directory are the same in the cloud as they are locally.
        local_datafile_relative_paths = set()

        for path in local_paths:
            relative_path = path.split(temporary_directory)[-1].strip(os.path.sep)
            local_datafile_relative_paths.add(relative_path.replace(os.path.sep, "/"))

        self.assertEqual(cloud_datafile_relative_paths, local_datafile_relative_paths)
def test_update_cloud_metadata(self):
    """Test that metadata for a cloud dataset can be stored in the cloud and used on re-instantiation of the same
    dataset.
    """
    dataset_path = self._create_nested_cloud_dataset()

    dataset = Dataset(path=dataset_path)
    self.assertEqual(dataset.tags, {})

    dataset.tags = {"some": "tags"}
    dataset.update_cloud_metadata()

    # A fresh instance from the same cloud path should see the stored metadata.
    dataset_reloaded = Dataset(path=dataset_path)

    for attribute in ("id", "tags", "labels", "hash_value"):
        self.assertEqual(getattr(dataset, attribute), getattr(dataset_reloaded, attribute))
def test_from_cloud(self):
    """Test that a Dataset in cloud storage can be accessed via a cloud path."""
    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset = create_dataset_with_two_files(temporary_directory)
        dataset.tags = {"a": "b", "c": 1}

        cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "a_directory", dataset.name)
        dataset.upload(cloud_path)

        persisted_dataset = Dataset(path=cloud_path)
        self.assertEqual(persisted_dataset.path, f"gs://{TEST_BUCKET_NAME}/a_directory/{dataset.name}")

        # The dataset's identity and metadata should survive the round trip to the cloud.
        for attribute in ("id", "name", "hash_value", "tags", "labels"):
            self.assertEqual(getattr(persisted_dataset, attribute), getattr(dataset, attribute))

        self.assertEqual(
            {file.name for file in persisted_dataset.files},
            {file.name for file in dataset.files},
        )

        # Every file should now live under the dataset's cloud directory.
        for file in persisted_dataset:
            self.assertEqual(
                file.cloud_path,
                f"gs://{TEST_BUCKET_NAME}/a_directory/{dataset.name}/{file.name}",
            )
def test_get_file_by_label(self):
    """Ensure files can be accessed by label from the dataset."""
    # NOTE(review): the second and third datafiles share the path "path-within-dataset/a_your_file.csv" - this
    # looks like a copy-paste slip, but it's preserved here as the assertions only inspect labels; confirm intent.
    files = [
        Datafile(path="path-within-dataset/a_my_file.csv", labels="one a b3 all"),
        Datafile(path="path-within-dataset/a_your_file.csv", labels="two a2 b3 all"),
        Datafile(path="path-within-dataset/a_your_file.csv", labels="three all"),
    ]

    dataset = Dataset(files=files)

    # A label matching exactly one file returns that file.
    self.assertEqual(dataset.get_file_by_label("three").labels, files[2].labels)

    # A label shared by several files raises an error.
    with self.assertRaises(exceptions.UnexpectedNumberOfResultsException) as error_context:
        dataset.get_file_by_label("all")

    self.assertIn("More than one result found", error_context.exception.args[0])

    # A label matching no file also raises an error.
    with self.assertRaises(exceptions.UnexpectedNumberOfResultsException) as error_context:
        dataset.get_file_by_label("billyjeanisnotmylover")

    self.assertIn(
        "No results found for filters {'labels__contains': 'billyjeanisnotmylover'}",
        error_context.exception.args[0],
    )
def test_download_from_nested_dataset(self):
    """Test that all files in a nested dataset can be downloaded with one command."""
    dataset_path = self._create_nested_cloud_dataset()
    dataset = Dataset(path=dataset_path, recursive=True)

    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset.download(local_directory=temporary_directory)

        # Map each expected file (as path components relative to the download root) to its expected contents.
        expected_contents = {
            ("file_0.txt",): "[1, 2, 3]",
            ("file_1.txt",): "[4, 5, 6]",
            ("sub-directory", "sub_file.txt"): "['a', 'b', 'c']",
            ("sub-directory", "sub-sub-directory", "sub_sub_file.txt"): "['blah', 'b', 'c']",
        }

        for path_components, contents in expected_contents.items():
            with open(os.path.join(temporary_directory, *path_components)) as f:
                self.assertEqual(f.read(), contents)
def test_download_from_nested_dataset_with_no_local_directory_given(self):
    """Test that, when downloading all files from a nested dataset and no local directory is given, the dataset
    structure is preserved in the temporary directory used.
    """
    dataset_path = self._create_nested_cloud_dataset()
    dataset = Dataset(path=dataset_path, recursive=True)

    # Mock the temporary directory created in `Dataset.download_all_files` so we can access it for the test.
    temporary_directory = tempfile.TemporaryDirectory()

    with patch("tempfile.TemporaryDirectory", return_value=temporary_directory):
        dataset.download()

    # Map each expected file (as path components relative to the download root) to its expected contents.
    expected_contents = {
        ("file_0.txt",): "[1, 2, 3]",
        ("file_1.txt",): "[4, 5, 6]",
        ("sub-directory", "sub_file.txt"): "['a', 'b', 'c']",
        ("sub-directory", "sub-sub-directory", "sub_sub_file.txt"): "['blah', 'b', 'c']",
    }

    for path_components, contents in expected_contents.items():
        with open(os.path.join(temporary_directory.name, *path_components)) as f:
            self.assertEqual(f.read(), contents)
def test_instantiation_metadata_used_if_not_hypothetical_but_no_stored_metadata(self):
    """Test that instantiation metadata is used if `hypothetical` is `False` but there's no stored metadata."""
    cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "non_existing_dataset")

    # With no stored metadata available to override it, the instantiation metadata should win.
    dataset = Dataset(path=cloud_path, tags={"new": "tag"})
    self.assertEqual(dataset.tags, {"new": "tag"})
def test_instantiating_from_serialised_cloud_datasets_with_no_dataset_json_file(self):
    """Test that a Manifest can be instantiated from a serialized cloud dataset with no `dataset.json` file. This
    simulates what happens when such a cloud dataset is referred to in a manifest received by a child service.
    """
    storage_client = GoogleCloudStorageClient()

    # Upload two raw files with no accompanying dataset metadata file.
    for filename, contents in (("file_0.txt", "[1, 2, 3]"), ("file_1.txt", "[4, 5, 6]")):
        storage_client.upload_from_string(
            contents,
            cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, "my_dataset", filename),
        )

    serialised_cloud_dataset = Dataset(path=f"gs://{TEST_BUCKET_NAME}/my_dataset").to_primitive()

    # The manifest should still resolve the dataset and find both of its files.
    manifest = Manifest(datasets={"my_dataset": serialised_cloud_dataset})
    self.assertEqual(len(manifest.datasets), 1)
    self.assertEqual(manifest.datasets["my_dataset"].path, f"gs://{TEST_BUCKET_NAME}/my_dataset")
    self.assertEqual(len(manifest.datasets["my_dataset"].files), 2)
def test_finalise_with_upload(self):
    """Test that the `finalise` method can be used to upload the output manifest's datasets to a cloud location
    and that it updates the manifest with signed URLs for accessing them.
    """
    with tempfile.TemporaryDirectory() as temporary_directory:
        dataset_path = os.path.join(temporary_directory, "the_dataset")

        # Create a single datafile for the output dataset.
        with Datafile(path=os.path.join(dataset_path, "my_file.dat"), mode="w") as (datafile, f):
            f.write("hello")

        output_manifest = Manifest(
            datasets={
                "the_dataset": Dataset(
                    path=dataset_path,
                    files={datafile.local_path},
                    labels={"one", "two", "three"},
                )
            }
        )

        analysis = Analysis(
            twine={
                "output_values_schema": {"type": "object", "properties": {"blah": {"type": "integer"}}},
                "output_manifest": {"datasets": {"the_dataset": {"purpose": "testing"}}},
            },
            output_values={"blah": 3},
            output_manifest=output_manifest,
        )

        # Finalise with a mocked URL signer so no real credentials are needed.
        with patch("google.cloud.storage.blob.Blob.generate_signed_url", new=mock_generate_signed_url):
            analysis.finalise(upload_output_datasets_to=f"gs://{TEST_BUCKET_NAME}/datasets")

        # The dataset's path in the output manifest should now be a signed URL at the upload location.
        signed_url_for_dataset = analysis.output_manifest.datasets["the_dataset"].path
        self.assertTrue(storage.path.is_url(signed_url_for_dataset))

        self.assertTrue(
            signed_url_for_dataset.startswith(
                f"{self.test_result_modifier.storage_emulator_host}/{TEST_BUCKET_NAME}/datasets/the_dataset"
            )
        )

        # The signed URL should allow the dataset to be recreated in full.
        downloaded_dataset = Dataset(path=signed_url_for_dataset)
        self.assertEqual(downloaded_dataset.name, "the_dataset")
        self.assertEqual(len(downloaded_dataset.files), 1)
        self.assertEqual(downloaded_dataset.labels, {"one", "two", "three"})

        with downloaded_dataset.files.one() as (downloaded_datafile, f):
            self.assertEqual(f.read(), "hello")
def test_filter_name_filters_exclude_path(self):
    """Ensures that filters applied to the name will not catch terms in the extension"""
    dataset = Dataset(
        files=[
            Datafile(path="first-path-within-dataset/a_test_file.csv"),
            Datafile(path="second-path-within-dataset/a_test_file.txt"),
        ]
    )

    # "second" only appears in the directory part of a path, so no file names should match.
    matching_files = dataset.files.filter(name__icontains="second")
    self.assertEqual(0, len(matching_files))
def test_cannot_add_non_datafiles(self):
    """Ensures that exception will be raised if adding a non-datafile object"""

    class NotADatafile:
        pass

    # Adding anything that isn't a `Datafile` should be rejected.
    with self.assertRaises(exceptions.InvalidInputException):
        Dataset().add(NotADatafile())
def test_exiting_context_manager_of_local_dataset_updates_local_metadata(self):
    """Test that local metadata for a local dataset is updated on exit of the dataset context manager."""
    with tempfile.TemporaryDirectory() as temporary_directory:
        self._create_files_and_nested_subdirectories(temporary_directory)
        dataset = Dataset(path=temporary_directory, recursive=True)

        # Set metadata inside the context manager; it should be persisted locally on exit.
        with dataset:
            dataset.tags = {"cat": "dog"}
            dataset.labels = {"animals"}

        reloaded_dataset = Dataset(path=temporary_directory, recursive=True)
        self.assertEqual(reloaded_dataset.id, dataset.id)
        self.assertEqual(reloaded_dataset.tags, {"cat": "dog"})
        self.assertEqual(reloaded_dataset.labels, {"animals"})
def test_upload_works_with_implicit_cloud_location_if_cloud_location_previously_provided(self):
    """Test `Dataset.to_cloud` works with an implicit cloud location if the cloud location has previously been
    provided.
    """
    dataset_path = self._create_nested_cloud_dataset()

    # No cloud path is passed to `upload` - the location recorded at instantiation should be reused.
    Dataset(path=dataset_path, recursive=True).upload()
def test_error_raised_if_attempting_to_generate_signed_url_for_local_dataset(self):
    """Test that an error is raised if trying to generate a signed URL for a local dataset."""
    with tempfile.TemporaryDirectory() as temporary_directory:
        local_dataset = Dataset(path=temporary_directory, tags={"hello": "world"})

        # Signed URLs only make sense for datasets with a cloud location.
        with self.assertRaises(exceptions.CloudLocationNotSpecified):
            local_dataset.generate_signed_url()
def test_stored_metadata_has_priority_over_instantiation_metadata_if_not_hypothetical(self):
    """Test that stored metadata is used instead of instantiation metadata if `hypothetical` is `False`."""
    cloud_path = storage.path.generate_gs_path(TEST_BUCKET_NAME, "existing_dataset")

    # Create a dataset in the cloud and set some metadata on it.
    with Dataset(path=cloud_path) as dataset:
        dataset.tags = {"existing": True}

    # Reload it and check that the stored metadata wins over the instantiation metadata, with a log message
    # warning about the override.
    with self.assertLogs() as logging_context:
        reloaded_dataset = Dataset(path=cloud_path, tags={"new": "tag"})

    self.assertEqual(reloaded_dataset.tags, {"existing": True})
    self.assertIn("Overriding metadata given at instantiation with stored metadata", logging_context.output[0])
def test_filter_name_filters_include_extension(self):
    """Ensures that filters applied to the name will catch terms in the extension"""
    files = [
        Datafile(path="path-within-dataset/a_test_file.csv"),
        Datafile(path="path-within-dataset/a_test_file.txt"),
    ]

    # Filtering on "txt" should pick out the second file via its extension.
    filtered_file = Dataset(files=files).files.filter(name__icontains="txt").pop()
    self.assertEqual(filtered_file.path, FilterSet({files[1]}).pop().local_path)
def test_all_datasets_are_in_cloud(self):
    """Test whether all files of all datasets in a manifest are in the cloud or not can be determined."""
    # A manifest containing local files isn't fully in the cloud, but an empty manifest vacuously is.
    self.assertFalse(self.create_valid_manifest().all_datasets_are_in_cloud)
    self.assertTrue(Manifest().all_datasets_are_in_cloud)

    # A manifest whose every datafile has a cloud path counts as fully in the cloud.
    cloud_files = [Datafile(path="gs://hello/file.txt"), Datafile(path="gs://goodbye/file.csv")]
    manifest = Manifest(datasets={"my_dataset": Dataset(files=cloud_files)})
    self.assertTrue(manifest.all_datasets_are_in_cloud)
def create_valid_dataset(self, **kwargs):
    """Create a valid dataset with two valid datafiles (they're the same file in this case)."""
    dataset_path = os.path.join(self.data_path, "basic_files", "configuration", "test-dataset")
    subdirectory_path = os.path.join(dataset_path, "path-within-dataset")

    return Dataset(
        path=dataset_path,
        files=[
            Datafile(path=os.path.join(subdirectory_path, "a_test_file.csv")),
            Datafile(path=os.path.join(subdirectory_path, "another_test_file.csv")),
        ],
        **kwargs,
    )
def test_from_local_directory_recursively(self):
    """Test that a dataset can be instantiated from a local nested directory including its subdirectories."""
    with tempfile.TemporaryDirectory() as temporary_directory:
        expected_paths = self._create_files_and_nested_subdirectories(temporary_directory)
        dataset = Dataset(path=temporary_directory, recursive=True)

        # Every file in the directory tree should appear in the dataset.
        actual_paths = {datafile.local_path for datafile in dataset.files}
        self.assertEqual(actual_paths, set(expected_paths))
def test_filter_catches_single_underscore_mistake(self):
    """Ensure that if the filter name contains only single underscores, an error is raised."""
    dataset = Dataset(
        files=[
            Datafile(path="path-within-dataset/A_Test_file.csv"),
            Datafile(path="path-within-dataset/a_test_file.txt"),
        ]
    )

    # "name_icontains" (single underscore) should be rejected in favour of "name__icontains".
    with self.assertRaises(exceptions.InvalidInputException) as error_context:
        dataset.files.filter(name_icontains="Test")

    self.assertIn(
        "Invalid filter name 'name_icontains'. Filter names should be in the form",
        error_context.exception.args[0],
    )
def test_update_local_metadata(self):
    """Test that metadata for a local dataset can be stored locally and used on re-instantiation of the same
    dataset.
    """
    with tempfile.TemporaryDirectory() as temporary_directory:
        self._create_files_and_nested_subdirectories(temporary_directory)

        dataset = Dataset(
            path=temporary_directory,
            recursive=True,
            id="69253db4-7972-42de-8ccc-61336a28cd50",
            tags={"cat": "dog"},
            labels=["animals"],
        )

        dataset.update_local_metadata()

        # A fresh instance from the same directory should pick up the stored metadata.
        dataset_reloaded = Dataset(path=temporary_directory, recursive=True)

        for attribute in ("id", "tags", "labels", "hash_value"):
            self.assertEqual(getattr(dataset, attribute), getattr(dataset_reloaded, attribute))