def test_serialisation_and_deserialisation(self):
    """Check that a dataset round-trips through `to_primitive` and `Dataset.deserialise`.

    The primitive form is compared against the exact dictionary expected for the test dataset, and the
    deserialised dataset is then compared with the original one attribute by attribute.
    """
    dataset_id = "e376fb31-8f66-414d-b99f-b43395cebbf1"
    original = self.create_valid_dataset(id=dataset_id, labels=["b", "a"], tags={"a": 1, "b": 2})
    primitive = original.to_primitive()

    # Every expected path lives under the same dataset directory, so build that directory once.
    dataset_directory = os.path.join(
        REPOSITORY_ROOT,
        "tests",
        "data",
        "basic_files",
        "configuration",
        "test-dataset",
    )

    expected_files = [
        os.path.join(dataset_directory, "path-within-dataset", filename)
        for filename in ("a_test_file.csv", "another_test_file.csv")
    ]

    # The labels were passed in as ["b", "a"] but are expected back as ["a", "b"].
    expected_primitive = {
        "name": "test-dataset",
        "labels": ["a", "b"],
        "tags": {"a": 1, "b": 2},
        "id": dataset_id,
        "path": dataset_directory,
        "files": expected_files,
    }

    self.assertEqual(primitive, expected_primitive)

    restored = Dataset.deserialise(primitive)

    for attribute in ("id", "path", "name", "labels", "tags"):
        self.assertEqual(getattr(original, attribute), getattr(restored, attribute))
def test_from_cloud_with_no_metadata_file(self):
    """Test that a cloud directory holding only data files (no `.octue` metadata file is uploaded by this test)
    can be accessed as a dataset, and that the cloud dataset doesn't lose any information when it is serialised
    and deserialised.

    NOTE(review): the previous wording also said a metadata file is uploaded afterwards, but nothing in this test
    asserts that - confirm whether such a check is intended.
    """
    client = GoogleCloudStorageClient()

    # Populate the cloud directory with two small files and nothing else.
    for filename, contents in (("file_0.txt", "[1, 2, 3]"), ("file_1.txt", "[4, 5, 6]")):
        client.upload_from_string(
            contents,
            cloud_path=storage.path.generate_gs_path(TEST_BUCKET_NAME, "my_dataset", filename),
        )

    dataset_path = f"gs://{TEST_BUCKET_NAME}/my_dataset"
    cloud_dataset = Dataset(path=dataset_path)

    self.assertEqual(cloud_dataset.path, dataset_path)
    self.assertEqual(cloud_dataset.name, "my_dataset")

    file_names = {datafile.name for datafile in cloud_dataset.files}
    self.assertEqual(file_names, {"file_0.txt", "file_1.txt"})

    for datafile in cloud_dataset:
        self.assertEqual(datafile.cloud_path, f"gs://{TEST_BUCKET_NAME}/my_dataset/{datafile.name}")

    # Test serialisation doesn't lose any information.
    round_tripped = Dataset.deserialise(cloud_dataset.to_primitive())

    for attribute in ("id", "name", "path", "hash_value"):
        self.assertEqual(getattr(round_tripped, attribute), getattr(cloud_dataset, attribute))