def test_functional(tmp_dir_fixture, monkeypatch):  # NOQA
    # MagicMock is assumed to be available at module level in the original
    # test module; the import is added here so the snippet is self-contained.
    from unittest.mock import MagicMock

    from smarttoolbase import SmartTool

    input_admin_metadata = dtoolcore.generate_admin_metadata(
        "my_input_ds", "testing_bot")
    input_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=input_admin_metadata,
        prefix=tmp_dir_fixture,
        storage="file")
    input_dataset.create()
    input_dataset.put_readme("")
    input_dataset.freeze()

    output_admin_metadata = dtoolcore.generate_admin_metadata(
        "my_output_ds", "testing_bot")
    output_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=output_admin_metadata,
        prefix=tmp_dir_fixture,
        storage="file")
    output_dataset.create()
    output_dataset.put_readme("")

    with SmartTool(
        input_uri=input_dataset.uri,
        output_uri=output_dataset.uri,
    ) as smart_tool:
        assert smart_tool.input_dataset.uri == input_dataset.uri
        assert smart_tool.output_proto_dataset.uri == output_dataset.uri

        smart_tool.base_commands = [
            "bowtie2 -x {reference_prefix} -1 {forward_read_fpath} -2 {reverse_read_fpath} -S {output_fpath}",
        ]
        smart_tool.outputs = []
        smart_tool.base_command_props = {
            "reference_prefix": "/tmp/reference/Athaliana",
            "forward_read_fpath": "/tmp/input/data/read1.fq",
            "reverse_read_fpath": "/tmp/input/data/read2.fq",
            "output_fpath": "/tmp/working/output",
        }

        expected_command_list = [
            "bowtie2",
            "-x", "/tmp/reference/Athaliana",
            "-1", "/tmp/input/data/read1.fq",
            "-2", "/tmp/input/data/read2.fq",
            "-S", "/tmp/working/output"
        ]
        # assert smart_tool.command_list("identifier") == expected_command_list

        import subprocess
        subprocess.call = MagicMock()

        smart_tool.pre_run = MagicMock()

        smart_tool("identifier")

        subprocess.call.assert_called_once_with(
            expected_command_list,
            cwd=smart_tool.working_directory)
        smart_tool.pre_run.assert_called_once()
def test_diff_sizes(tmp_uri_fixture):  # NOQA
    from dtoolcore import (
        DataSet,
        generate_admin_metadata,
        generate_proto_dataset,
    )
    from dtoolcore.utils import generate_identifier
    from dtoolcore.compare import diff_sizes

    fpaths = create_test_files(tmp_uri_fixture)

    proto_ds_a = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_1"),
        base_uri=tmp_uri_fixture)
    proto_ds_a.create()
    proto_ds_a.put_item(fpaths["he"], "file.txt")
    proto_ds_a.freeze()

    proto_ds_b = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_2"),
        base_uri=tmp_uri_fixture)
    proto_ds_b.create()
    proto_ds_b.put_item(fpaths["she"], "file.txt")
    proto_ds_b.freeze()

    ds_a = DataSet.from_uri(proto_ds_a.uri)
    ds_b = DataSet.from_uri(proto_ds_b.uri)

    assert diff_sizes(ds_a, ds_a) == []

    expected = [
        (generate_identifier("file.txt"), 2, 3),
    ]
    assert diff_sizes(ds_a, ds_b) == expected
def generate_dataset(base_uri, name, size, num_files):
    # print(
    #     "Generating dataset in {} with {} files of size {} bytes".format(
    #         storage, num_files, size
    #     )
    # )
    admin_metadata = generate_admin_metadata(
        name=name,
        creator_username="******")
    proto_dataset = generate_proto_dataset(admin_metadata, base_uri)
    proto_dataset.create()
    proto_dataset.put_readme("")

    for i in range(num_files):
        handle = "{}.txt".format(i)
        with tempfile.NamedTemporaryFile() as fp:
            fp.write(os.urandom(size))
            fp.flush()
            proto_dataset.put_item(fp.name, handle)
            proto_dataset.add_item_metadata(handle, "number", i)

    start = time.time()
    # cProfile.runctx(
    #     "proto_dataset.freeze()",
    #     {"proto_dataset": proto_dataset}, {}, sort="cumtime")
    proto_dataset.freeze()
    elapsed = time.time() - start

    # print("Freezing {} took: {}s".format(name, elapsed))
    print("{},{}".format(num_files, elapsed))
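# Hypothetical usage sketch (not part of the original script): sweep the
# number of items to see how freeze time scales. The base URI, item size
# and counts below are illustrative assumptions only.
def run_freeze_benchmark(base_uri="file:///tmp/dtool_benchmark"):
    for num_files in (10, 100, 1000):
        generate_dataset(
            base_uri=base_uri,
            name="freeze_timing_{}".format(num_files),
            size=1024,  # 1 KiB of random data per item
            num_files=num_files,
        )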
def test_update_name_of_frozen_dataset(tmp_uri_fixture):  # NOQA
    import dtoolcore

    # Create a dataset.
    admin_metadata = dtoolcore.generate_admin_metadata("test_name")
    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=admin_metadata,
        base_uri=tmp_uri_fixture)
    proto_dataset.create()
    proto_dataset.freeze()

    dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri)
    assert dataset.name == "test_name"

    dataset.update_name("updated_name")
    assert dataset.name == "updated_name"

    dataset_again = dtoolcore.DataSet.from_uri(proto_dataset.uri)
    assert dataset_again.name == "updated_name"

    # Make sure that none of the other admin metadata has been altered.
    for key, value in admin_metadata.items():
        if key == "name":
            continue
        assert dataset_again._admin_metadata[key] == value
def test_http_enable_with_presigned_url(tmp_uuid_and_uri):  # NOQA
    uuid, dest_uri = tmp_uuid_and_uri

    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.put_readme("---\nproject: testing\n")
    proto_dataset.freeze()

    dataset = DataSet.from_uri(dest_uri)

    # Add an annotation.
    dataset.put_annotation("project", "dtool-testing")

    # Add tags.
    dataset.put_tag("amazing")
    dataset.put_tag("stuff")

    with tmp_env_var("DTOOL_S3_PUBLISH_EXPIRY", "120"):
        access_url = dataset._storage_broker.http_enable()

    assert access_url.find("?") != -1  # This is a presigned URL dataset.
    assert access_url.startswith("https://")

    dataset_from_http = DataSet.from_uri(access_url)

    # Assert that the annotation has been copied across.
    assert dataset_from_http.get_annotation("project") == "dtool-testing"

    # Assert that the tags are available.
    assert dataset_from_http.list_tags() == ["amazing", "stuff"]

    from dtoolcore.compare import (
        diff_identifiers,
        diff_sizes,
        diff_content,
    )

    assert len(diff_identifiers(dataset, dataset_from_http)) == 0
    assert len(diff_sizes(dataset, dataset_from_http)) == 0
    assert len(diff_content(dataset_from_http, dataset)) == 0

    # Make sure that all the URLs in the manifest are presigned.
    http_manifest = dataset_from_http._storage_broker.http_manifest
    assert http_manifest["manifest_url"].find("?") != -1
    assert http_manifest["readme_url"].find("?") != -1
    for url in http_manifest["item_urls"].values():
        assert url.find("?") != -1
    for url in http_manifest["annotations"].values():
        assert url.find("?") != -1
def test_basic_workflow_on_first_namespace(tmp_uuid_and_uri):  # NOQA
    uuid, dest_uri = tmp_uuid_and_uri

    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet
    from dtoolcore.utils import generate_identifier

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.freeze()

    # Read in a dataset
    dataset = DataSet.from_uri(dest_uri)

    expected_identifier = generate_identifier('tiny.png')
    assert expected_identifier in dataset.identifiers
    assert len(dataset.identifiers) == 1
def test_copy(tmp_dir_fixture):  # NOQA
    import dtoolcore

    admin_metadata = dtoolcore.generate_admin_metadata("test_name")
    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=admin_metadata,
        prefix=tmp_dir_fixture,
        storage="file")

    assert proto_dataset.name == "test_name"

    proto_dataset.update_name("test_new_name")
    assert proto_dataset.name == "test_new_name"

    proto_dataset.create()

    proto_dataset.update_name("test_another_new_name")
    assert proto_dataset.name == "test_another_new_name"

    read_proto_dataset = dtoolcore.ProtoDataSet.from_uri(proto_dataset.uri)
    assert read_proto_dataset.name == "test_another_new_name"
def test_works_if_all_set(tmp_uuid_and_uri):  # NOQA
    from dtoolcore import ProtoDataSet, generate_admin_metadata

    bucket_name = S3_TEST_BASE_URI[5:]
    endpoint_key = "DTOOL_S3_ENDPOINT_{}".format(bucket_name)
    access_key = "DTOOL_S3_ACCESS_KEY_ID_{}".format(bucket_name)
    secret_access_key = "DTOOL_S3_SECRET_ACCESS_KEY_{}".format(bucket_name)

    uuid, dest_uri = tmp_uuid_and_uri

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    env_vars = {
        endpoint_key: "https://s3.amazonaws.com",
        access_key: S3_TEST_ACCESS_KEY_ID,
        secret_access_key: S3_TEST_SECRET_ACCESS_KEY,
    }
    a, b, c = list(env_vars.keys())
    with tmp_env_var(a, env_vars[a]):
        with tmp_env_var(b, env_vars[b]):
            with tmp_env_var(c, env_vars[c]):
                proto_dataset = ProtoDataSet(dest_uri, admin_metadata)
                proto_dataset.create()
def test_basic_workflow(tmp_dir_fixture):  # NOQA
    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet
    from dtoolcore.utils import generate_identifier
    from dtoolcore.storagebroker import DiskStorageBroker

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(
        name=name,
        uuid=admin_metadata["uuid"],
        base_uri=tmp_dir_fixture)

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.freeze()

    # Read in a dataset
    dataset = DataSet.from_uri(dest_uri)

    expected_identifier = generate_identifier('tiny.png')
    assert expected_identifier in dataset.identifiers
    assert len(dataset.identifiers) == 1
def create_proto_dataset(base_uri, name, username):
    admin_metadata = dtoolcore.generate_admin_metadata(name, username)
    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=admin_metadata,
        base_uri=base_uri)
    proto_dataset.create()
    proto_dataset.put_readme("")
    return proto_dataset
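# Hypothetical usage sketch: the helper above returns a created (but not yet
# frozen) proto dataset with an empty README. The base URI, dataset name and
# username below are illustrative assumptions only.
def example_populate_and_freeze(fpath):
    proto_dataset = create_proto_dataset(
        base_uri="file:///tmp/datasets",
        name="example_ds",
        username="example_user",
    )
    proto_dataset.put_item(fpath, os.path.basename(fpath))
    proto_dataset.freeze()
    return proto_dataset.uri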
def test_copy_resume_fixes_broken_files(tmp_uri_fixture):  # NOQA
    import dtoolcore

    src_dir = os.path.join(uri_to_path(tmp_uri_fixture), "src")
    dest_dir = os.path.join(uri_to_path(tmp_uri_fixture), "dest")
    for directory in [src_dir, dest_dir]:
        os.mkdir(directory)

    # Create the src dataset to be copied.
    admin_metadata = dtoolcore.generate_admin_metadata("test_copy")
    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=admin_metadata,
        base_uri=tmp_uri_fixture + "/src")
    proto_dataset.create()
    src_uri = proto_dataset.uri

    proto_dataset.put_readme("---\nproject: exciting\n")

    overlay = "file_extension"
    for fname in os.listdir(TEST_SAMPLE_DATA):
        _, ext = os.path.splitext(fname)
        item_fpath = os.path.join(TEST_SAMPLE_DATA, fname)
        proto_dataset.put_item(item_fpath, fname)
        proto_dataset.add_item_metadata(fname, overlay, ext)

    proto_dataset.freeze()

    # Create a partial copy.
    src_dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri)
    dest_proto_dataset = dtoolcore._copy_create_proto_dataset(
        src_dataset,
        tmp_uri_fixture + "/dest")
    broken_content_fpath = os.path.join(TEST_SAMPLE_DATA, "another_file.txt")
    dest_proto_dataset.put_item(broken_content_fpath, "random_bytes")

    # Copy resume should work.
    dest_uri = dtoolcore.copy_resume(src_uri, tmp_uri_fixture + "/dest")

    # Compare the two datasets.
    src_ds = dtoolcore.DataSet.from_uri(src_uri)
    dest_ds = dtoolcore.DataSet.from_uri(dest_uri)

    for key, value in src_ds._admin_metadata.items():
        assert dest_ds._admin_metadata[key] == value

    assert src_ds.identifiers == dest_ds.identifiers
    for i in src_ds.identifiers:
        src_item_props = src_ds.item_properties(i)
        dest_item_props = dest_ds.item_properties(i)
        for key, value in src_item_props.items():
            if key == "utc_timestamp":
                tolerance = 2  # seconds (number chosen arbitrarily)
                assert dest_item_props[key] >= value
                assert dest_item_props[key] < value + tolerance
            else:
                assert dest_item_props[key] == value

    assert src_ds.get_readme_content() == dest_ds.get_readme_content()

    assert src_ds.list_overlay_names() == dest_ds.list_overlay_names()
    assert src_ds.get_overlay(overlay) == dest_ds.get_overlay(overlay)
def test_annotations(tmp_uuid_and_uri):  # NOQA
    uuid, dest_uri = tmp_uuid_and_uri

    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.freeze()

    # Read in a dataset
    dataset = DataSet.from_uri(dest_uri)

    assert dataset.list_annotation_names() == []

    dataset.put_annotation("project", "demo")
    assert dataset.get_annotation("project") == "demo"

    assert dataset.list_annotation_names() == ["project"]
def test_writing_of_dtool_readme_file(tmp_uuid_and_uri):  # NOQA
    from dtoolcore import ProtoDataSet, generate_admin_metadata

    # Create a proto dataset.
    uuid, dest_uri = tmp_uuid_and_uri
    name = "test_dtool_readme_file"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None
    )
    proto_dataset.create()

    # Check that the ".dtool/README.txt" file exists.
    expected_s3_key = uuid + '/README.txt'
    assert _key_exists_in_storage_broker(
        proto_dataset._storage_broker,
        expected_s3_key
    )

    actual_content = _get_unicode_from_key(
        proto_dataset._storage_broker,
        expected_s3_key
    )
    assert actual_content.startswith("README")
def test_list_dataset_uris(tmp_uuid_and_uri):  # NOQA
    uuid, dest_uri = tmp_uuid_and_uri

    from dtoolcore import ProtoDataSet, generate_admin_metadata

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=CONFIG_PATH)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny with space.png')
    proto_dataset.freeze()

    from dtool_azure.storagebroker import AzureStorageBroker

    assert len(AzureStorageBroker.list_dataset_uris(
        dest_uri,
        CONFIG_PATH)
    ) > 0
def test_item_local_abspath_with_clean_cache(tmp_uuid_and_uri):  # NOQA
    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet
    from dtoolcore.utils import generate_identifier

    uuid, dest_uri = tmp_uuid_and_uri

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.freeze()

    identifier = generate_identifier('tiny.png')

    with tmp_directory() as cache_dir:
        with tmp_env_var("DTOOL_S3_CACHE_DIRECTORY", cache_dir):
            dataset = DataSet.from_uri(dest_uri)
            fpath = dataset.item_content_abspath(identifier)
            assert os.path.isfile(fpath)
def test_uri_property_when_using_relpath(chdir_fixture):  # NOQA
    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet
    from dtoolcore.storagebroker import DiskStorageBroker

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(
        name=name,
        uuid=admin_metadata["uuid"],
        prefix=".")

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, 'tiny.png')
    proto_dataset.freeze()

    dataset = DataSet.from_uri("./my_dataset")
    expected_uri = "file://" + os.path.abspath("my_dataset")
    assert dataset.uri == expected_uri
def test_fails_if_any_endpoint_is_missing(tmp_uuid_and_uri):  # NOQA
    from dtoolcore import ProtoDataSet, generate_admin_metadata

    bucket_name = S3_TEST_BASE_URI[5:]
    endpoint_key = "DTOOL_S3_ENDPOINT_{}".format(bucket_name)
    access_key = "DTOOL_S3_ACCESS_KEY_ID_{}".format(bucket_name)
    secret_access_key = "DTOOL_S3_SECRET_ACCESS_KEY_{}".format(bucket_name)

    uuid, dest_uri = tmp_uuid_and_uri

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    env_vars = {
        endpoint_key: "https://s3.amazonaws.com",
        access_key: S3_TEST_ACCESS_KEY_ID,
        secret_access_key: S3_TEST_SECRET_ACCESS_KEY,
    }

    from itertools import combinations
    for a, b in combinations(env_vars.keys(), 2):
        with tmp_env_var(a, env_vars[a]):
            with tmp_env_var(b, env_vars[b]):
                with pytest.raises(RuntimeError):
                    ProtoDataSet(dest_uri, admin_metadata)
def test_diff_content(tmp_dir_fixture):  # NOQA
    from dtoolcore import (
        DataSet,
        generate_admin_metadata,
        generate_proto_dataset,
    )
    from dtoolcore.utils import generate_identifier
    from dtoolcore.compare import diff_content
    from dtoolcore.storagebroker import DiskStorageBroker

    fpaths = create_test_files(tmp_dir_fixture)

    proto_ds_a = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_1"),
        prefix=tmp_dir_fixture,
        storage="file")
    proto_ds_a.create()
    proto_ds_a.put_item(fpaths["cat"], "file.txt")
    proto_ds_a.freeze()

    proto_ds_b = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_2"),
        prefix=tmp_dir_fixture,
        storage="file")
    proto_ds_b.create()
    proto_ds_b.put_item(fpaths["she"], "file.txt")
    proto_ds_b.freeze()

    ds_a = DataSet.from_uri(proto_ds_a.uri)
    ds_b = DataSet.from_uri(proto_ds_b.uri)

    assert diff_content(ds_a, ds_a) == []

    identifier = generate_identifier("file.txt")
    expected = [
        (generate_identifier("file.txt"),
         DiskStorageBroker.hasher(ds_a.item_content_abspath(identifier)),
         DiskStorageBroker.hasher(ds_b.item_content_abspath(identifier)))
    ]
    assert diff_content(ds_a, ds_b) == expected
def proto_dataset_from_base_uri(name, base_uri):
    admin_metadata = dtoolcore.generate_admin_metadata(name)
    parsed_base_uri = dtoolcore.utils.generous_parse_uri(base_uri)

    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=admin_metadata,
        base_uri=dtoolcore.utils.urlunparse(parsed_base_uri))

    proto_dataset.create()

    return proto_dataset
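# Hypothetical usage sketch: create a proto dataset under a base URI, add an
# item and a README, then freeze it. The URI, dataset name and file handle
# below are illustrative assumptions only.
def example_create_and_freeze(item_fpath):
    proto_dataset = proto_dataset_from_base_uri(
        "my_raw_data", "file:///tmp/datasets")
    proto_dataset.put_readme("---\ndescription: example\n")
    proto_dataset.put_item(item_fpath, "data.csv")
    proto_dataset.freeze()
    return proto_dataset.uri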
def tmp_uuid_and_uri(request):
    admin_metadata = generate_admin_metadata("test_dataset")
    uuid = admin_metadata["uuid"]

    uri = AzureStorageBroker.generate_uri(
        "test_dataset",
        uuid,
        AZURE_TEST_BASE_URI)

    @request.addfinalizer
    def teardown():
        _remove_dataset(uri)

    return (uuid, uri)
def tmp_uuid_and_uri(request):
    admin_metadata = generate_admin_metadata("test_dataset")
    uuid = admin_metadata["uuid"]

    uri = IrodsStorageBroker.generate_uri("test_dataset", uuid, TEST_ZONE)

    @request.addfinalizer
    def teardown():
        _, irods_path = uri.split(":", 1)
        _rm_if_exists(irods_path)

    return (uuid, uri)
def tmp_uuid_and_uri(request):
    admin_metadata = generate_admin_metadata("test_dataset")
    uuid = admin_metadata["uuid"]

    uri = S3StorageBroker.generate_uri(
        "test_dataset",
        uuid,
        "s3://test-dtool-s3-bucket")

    @request.addfinalizer
    def teardown():
        _remove_dataset(uri)

    return (uuid, uri)
def tmp_uuid_and_uri(request):
    admin_metadata = generate_admin_metadata("test_dataset")
    uuid = admin_metadata["uuid"]

    # The namespace needs to be configured in ~/.config/dtool/dtool.json
    uri = ECSStorageBroker.generate_uri("test_dataset", uuid, "ecs://test1")

    @request.addfinalizer
    def teardown():
        _remove_dataset(uri)

    return (uuid, uri)
def test_tags_functional(tmp_uuid_and_uri):  # NOQA
    uuid, dest_uri = tmp_uuid_and_uri

    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()

    # Test put_tag on proto dataset.
    proto_dataset.put_tag("testing")

    proto_dataset.freeze()

    dataset = DataSet.from_uri(proto_dataset.uri)
    assert dataset.list_tags() == ["testing"]

    dataset.put_tag("amazing")
    dataset.put_tag("stuff")
    assert dataset.list_tags() == ["amazing", "stuff", "testing"]

    dataset.delete_tag("stuff")
    assert dataset.list_tags() == ["amazing", "testing"]

    # Putting the same tag is idempotent.
    dataset.put_tag("amazing")
    dataset.put_tag("amazing")
    dataset.put_tag("amazing")
    assert dataset.list_tags() == ["amazing", "testing"]

    # Tags can only be strings.
    from dtoolcore import DtoolCoreValueError
    with pytest.raises(DtoolCoreValueError):
        dataset.put_tag(1)

    # Tags need to adhere to the utils.name_is_valid() rules.
    from dtoolcore import DtoolCoreInvalidNameError
    with pytest.raises(DtoolCoreInvalidNameError):
        dataset.put_tag("!invalid")

    # Deleting a non-existing tag does not raise. It silently succeeds.
    dataset.delete_tag("dontexist")
def test_status_command_on_proto_dataset_functional(tmp_dir_fixture):  # NOQA
    from dtoolcore import generate_admin_metadata, generate_proto_dataset
    from dtool_info.dataset import status

    admin_metadata = generate_admin_metadata("test_ds")
    proto_dataset = generate_proto_dataset(
        admin_metadata=admin_metadata,
        base_uri=tmp_dir_fixture)
    proto_dataset.create()

    runner = CliRunner()

    result = runner.invoke(status, [proto_dataset.uri])
    assert result.exit_code == 0
    assert result.output.strip() == "proto"
def test_basic_workflow_with_nested_handle(tmp_dir_fixture):  # NOQA
    from dtoolcore import ProtoDataSet, generate_admin_metadata
    from dtoolcore import DataSet
    from dtoolcore.utils import generate_identifier
    from dtoolcore.storagebroker import DiskStorageBroker

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    dest_uri = DiskStorageBroker.generate_uri(
        name=name,
        uuid=admin_metadata["uuid"],
        base_uri=tmp_dir_fixture)

    sample_data_path = os.path.join(TEST_SAMPLE_DATA)
    local_file_path = os.path.join(sample_data_path, 'tiny.png')
    handle = "subdir/tiny.png"

    # Create a minimal dataset
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None)
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, handle)
    proto_dataset.freeze()

    # Read in a dataset
    dataset = DataSet.from_uri(dest_uri)

    expected_identifier = generate_identifier(handle)
    assert expected_identifier in dataset.identifiers
    assert len(dataset.identifiers) == 1

    # Ensure that the file exists in the disk dataset.
    # Particularly on Windows.
    item_abspath = os.path.join(
        tmp_dir_fixture,
        name,
        "data",
        "subdir",
        "tiny.png"
    )
    assert os.path.isfile(item_abspath)
    assert os.path.isfile(dataset.item_content_abspath(expected_identifier))

    # Ensure that the correct abspath is returned.
    # Particularly on Windows.
    assert dataset.item_content_abspath(expected_identifier) == item_abspath  # NOQA
def test_diff_identifiers(tmp_dir_fixture):  # NOQA
    from dtoolcore import (
        DataSet,
        generate_admin_metadata,
        generate_proto_dataset,
    )
    from dtoolcore.utils import generate_identifier
    from dtoolcore.compare import diff_identifiers

    fpaths = create_test_files(tmp_dir_fixture)

    proto_ds_a = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_1"),
        prefix=tmp_dir_fixture,
        storage="file")
    proto_ds_a.create()
    proto_ds_a.put_item(fpaths["cat"], "a.txt")
    proto_ds_a.freeze()

    proto_ds_b = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_2"),
        prefix=tmp_dir_fixture,
        storage="file")
    proto_ds_b.create()
    proto_ds_b.put_item(fpaths["cat"], "b.txt")
    proto_ds_b.freeze()

    ds_a = DataSet.from_uri(proto_ds_a.uri)
    ds_b = DataSet.from_uri(proto_ds_b.uri)

    assert diff_identifiers(ds_a, ds_a) == []

    expected = [
        (generate_identifier("a.txt"), True, False),
        (generate_identifier("b.txt"), False, True),
    ]
    assert diff_identifiers(ds_a, ds_b) == expected
def create_derived_dataset(parent_dataset, dest_location_uri, name_suffix):
    dest_dataset_name = "{}_{}".format(parent_dataset.name, name_suffix)
    admin_metadata = dtoolcore.generate_admin_metadata(dest_dataset_name)
    dest_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=admin_metadata,
        base_uri=dest_location_uri,
        config_path=CONFIG_PATH)
    try:
        dest_dataset.create()
    except dtoolcore.storagebroker.StorageBrokerOSError as err:
        raise click.UsageError(str(err))
    return dest_dataset
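# Hypothetical usage sketch: derive an output proto dataset from an existing
# parent dataset, e.g. from within a click command. The URIs and name suffix
# below are illustrative assumptions only.
def example_derive(parent_uri, output_base_uri="file:///tmp/derived"):
    parent_dataset = dtoolcore.DataSet.from_uri(parent_uri)
    derived = create_derived_dataset(parent_dataset, output_base_uri, "aligned")
    derived.put_readme("---\nderived_from: {}\n".format(parent_dataset.uri))
    return derived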
def test_fails_if_only_endpoint_is_set(tmp_uuid_and_uri):  # NOQA
    from dtoolcore import ProtoDataSet, generate_admin_metadata

    bucket_name = S3_TEST_BASE_URI[5:]
    endpoint_key = "DTOOL_S3_ENDPOINT_{}".format(bucket_name)

    uuid, dest_uri = tmp_uuid_and_uri

    name = "my_dataset"
    admin_metadata = generate_admin_metadata(name)
    admin_metadata["uuid"] = uuid

    with tmp_env_var(endpoint_key, "https://s3.amazonaws.com"):
        with pytest.raises(RuntimeError):
            ProtoDataSet(dest_uri, admin_metadata)
def create(quiet, name, storage, prefix):
    """Create a proto dataset."""
    admin_metadata = dtoolcore.generate_admin_metadata(name)

    # Create the dataset.
    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=admin_metadata,
        prefix=prefix,
        storage=storage,
        config_path=CONFIG_PATH)
    try:
        proto_dataset.create()
    except dtoolcore.storagebroker.StorageBrokerOSError as err:
        raise click.UsageError(str(err))

    proto_dataset.put_readme("")

    if quiet:
        click.secho(proto_dataset.uri)
    else:
        # Give the user some feedback and hints on what to do next.
        click.secho("Created proto dataset ", nl=False, fg="green")
        click.secho(proto_dataset.uri)
        click.secho("Next steps: ")

        step = 1

        click.secho("{}. Add descriptive metadata, e.g: ".format(step))
        click.secho(
            " dtool readme interactive {}".format(proto_dataset.uri),
            fg="cyan")

        if storage != "symlink":
            step = step + 1
            click.secho("{}. Add raw data, eg:".format(step))
            click.secho(
                " dtool add item my_file.txt {}".format(proto_dataset.uri),
                fg="cyan")

            if storage == "file":
                # Find the abspath of the data directory for user feedback.
                data_path = proto_dataset._storage_broker._data_abspath
                click.secho(" Or use your system commands, e.g: ")
                click.secho(
                    " mv my_data_directory {}/".format(data_path),
                    fg="cyan")

        step = step + 1
        click.secho(
            "{}. Convert the proto dataset into a dataset: ".format(step))
        click.secho(" dtool freeze {}".format(proto_dataset.uri), fg="cyan")