def test_functional(tmp_dir_fixture, monkeypatch):  # NOQA
    """End-to-end test of SmartTool.

    Builds a frozen input dataset and a proto output dataset, then checks
    that invoking the tool shells out with the fully expanded command list
    and runs the pre_run hook exactly once.
    """
    from smarttoolbase import SmartTool

    # Frozen input dataset for the tool to read from.
    input_admin_metadata = dtoolcore.generate_admin_metadata(
        "my_input_ds", "testing_bot")
    input_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=input_admin_metadata,
        prefix=tmp_dir_fixture,
        storage="file")
    input_dataset.create()
    input_dataset.put_readme("")
    input_dataset.freeze()

    # Proto output dataset for the tool to write into.
    output_admin_metadata = dtoolcore.generate_admin_metadata(
        "my_output_ds", "testing_bot")
    output_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=output_admin_metadata,
        prefix=tmp_dir_fixture,
        storage="file")
    output_dataset.create()
    output_dataset.put_readme("")

    with SmartTool(
        input_uri=input_dataset.uri,
        output_uri=output_dataset.uri,
    ) as smart_tool:
        assert smart_tool.input_dataset.uri == input_dataset.uri
        assert smart_tool.output_proto_dataset.uri == output_dataset.uri

        smart_tool.base_commands = [
            "bowtie2 -x {reference_prefix} -1 {forward_read_fpath} -2 {reverse_read_fpath} -S {output_fpath}",  # NOQA
        ]
        smart_tool.outputs = []
        smart_tool.base_command_props = {
            "reference_prefix": "/tmp/reference/Athaliana",
            "forward_read_fpath": "/tmp/input/data/read1.fq",
            "reverse_read_fpath": "/tmp/input/data/read2.fq",
            "output_fpath": "/tmp/working/output",
        }
        expected_command_list = [
            "bowtie2",
            "-x", "/tmp/reference/Athaliana",
            "-1", "/tmp/input/data/read1.fq",
            "-2", "/tmp/input/data/read2.fq",
            "-S", "/tmp/working/output",
        ]

        import subprocess

        # Use the monkeypatch fixture (previously accepted but unused) so
        # the patched subprocess.call is restored after the test; the old
        # direct assignment leaked the mock into all subsequent tests.
        monkeypatch.setattr(subprocess, "call", MagicMock())
        smart_tool.pre_run = MagicMock()

        smart_tool("identifier")

        subprocess.call.assert_called_once_with(
            expected_command_list,
            cwd=smart_tool.working_directory)
        smart_tool.pre_run.assert_called_once()
def test_diff_sizes(tmp_uri_fixture):  # NOQA
    """diff_sizes reports items whose sizes differ; a dataset compared
    with itself reports nothing."""
    from dtoolcore import (
        DataSet,
        generate_admin_metadata,
        generate_proto_dataset,
    )
    from dtoolcore.utils import generate_identifier
    from dtoolcore.compare import diff_sizes

    fpaths = create_test_files(tmp_uri_fixture)

    # First dataset holds the 2-byte "he" file.
    proto_a = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_1"),
        base_uri=tmp_uri_fixture)
    proto_a.create()
    proto_a.put_item(fpaths["he"], "file.txt")
    proto_a.freeze()

    # Second dataset holds the 3-byte "she" file under the same relpath.
    proto_b = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_compare_2"),
        base_uri=tmp_uri_fixture)
    proto_b.create()
    proto_b.put_item(fpaths["she"], "file.txt")
    proto_b.freeze()

    dataset_a = DataSet.from_uri(proto_a.uri)
    dataset_b = DataSet.from_uri(proto_b.uri)

    # Identical datasets -> no size differences.
    assert diff_sizes(dataset_a, dataset_a) == []

    # Same identifier, sizes 2 vs 3.
    assert diff_sizes(dataset_a, dataset_b) == [
        (generate_identifier("file.txt"), 2, 3),
    ]
def test_copy(tmp_dir_fixture):  # NOQA
    # NOTE(review): despite its name this test exercises update_name on a
    # proto dataset; a later ``def test_copy`` in the same module would
    # shadow this one -- consider renaming.
    import dtoolcore

    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=dtoolcore.generate_admin_metadata("test_name"),
        prefix=tmp_dir_fixture,
        storage="file")
    assert proto_dataset.name == "test_name"

    # Renaming works before the dataset exists on disk...
    proto_dataset.update_name("test_new_name")
    assert proto_dataset.name == "test_new_name"

    proto_dataset.create()

    # ...and after it has been created.
    proto_dataset.update_name("test_another_new_name")
    assert proto_dataset.name == "test_another_new_name"

    # The final name round-trips through persistence.
    reloaded = dtoolcore.ProtoDataSet.from_uri(proto_dataset.uri)
    assert reloaded.name == "test_another_new_name"
def test_copy_resume_fixes_broken_files(tmp_uri_fixture):  # NOQA
    """copy_resume should replace an incorrectly copied item and produce
    a destination dataset equivalent to the source."""
    import dtoolcore

    src_dir = os.path.join(uri_to_path(tmp_uri_fixture), "src")
    dest_dir = os.path.join(uri_to_path(tmp_uri_fixture), "dest")
    for directory in (src_dir, dest_dir):
        os.mkdir(directory)

    # Build and freeze the source dataset, tagging each item with its
    # file extension in an overlay.
    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=dtoolcore.generate_admin_metadata("test_copy"),
        base_uri=tmp_uri_fixture + "/src")
    proto_dataset.create()
    src_uri = proto_dataset.uri
    proto_dataset.put_readme("---\nproject: exciting\n")
    overlay = "file_extension"
    for fname in os.listdir(TEST_SAMPLE_DATA):
        _, ext = os.path.splitext(fname)
        proto_dataset.put_item(os.path.join(TEST_SAMPLE_DATA, fname), fname)
        proto_dataset.add_item_metadata(fname, overlay, ext)
    proto_dataset.freeze()

    # Simulate an interrupted copy: the destination proto dataset exists
    # and contains an item with the wrong content.
    src_dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri)
    dest_proto_dataset = dtoolcore._copy_create_proto_dataset(
        src_dataset,
        tmp_uri_fixture + "/dest")
    broken_content_fpath = os.path.join(TEST_SAMPLE_DATA, "another_file.txt")
    dest_proto_dataset.put_item(broken_content_fpath, "random_bytes")

    # Resuming the copy should repair the broken item.
    dest_uri = dtoolcore.copy_resume(src_uri, tmp_uri_fixture + "/dest")

    # Source and destination should now be equivalent.
    src_ds = dtoolcore.DataSet.from_uri(src_uri)
    dest_ds = dtoolcore.DataSet.from_uri(dest_uri)

    for key, value in src_ds._admin_metadata.items():
        assert dest_ds._admin_metadata[key] == value
    assert src_ds.identifiers == dest_ds.identifiers
    for i in src_ds.identifiers:
        src_item_props = src_ds.item_properties(i)
        dest_item_props = dest_ds.item_properties(i)
        for key, value in src_item_props.items():
            if key == "utc_timestamp":
                # Re-copied items get a fresh timestamp, so allow a small
                # forward drift relative to the source.
                tolerance = 2  # seconds (number chosen arbitrarily)
                assert value <= dest_item_props[key] < value + tolerance
            else:
                assert dest_item_props[key] == value
    assert src_ds.get_readme_content() == dest_ds.get_readme_content()
    assert src_ds.list_overlay_names() == dest_ds.list_overlay_names()
    assert src_ds.get_overlay(overlay) == dest_ds.get_overlay(overlay)
def create_proto_dataset(base_uri, name, username):
    """Create, initialise and return a proto dataset with an empty README."""
    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=dtoolcore.generate_admin_metadata(name, username),
        base_uri=base_uri)
    proto_dataset.create()
    proto_dataset.put_readme("")
    return proto_dataset
def generate_dataset(base_uri, name, size, num_files):
    """Create a dataset of ``num_files`` random items of ``size`` bytes,
    freeze it, and print "<num_files>,<freeze seconds>" as a CSV row."""
    admin_metadata = generate_admin_metadata(
        name=name, creator_username="******")
    proto_dataset = generate_proto_dataset(admin_metadata, base_uri)
    proto_dataset.create()
    proto_dataset.put_readme("")

    for i in range(num_files):
        handle = "{}.txt".format(i)
        with tempfile.NamedTemporaryFile() as fp:
            fp.write(os.urandom(size))
            fp.flush()  # ensure bytes hit disk before put_item reads the file
            proto_dataset.put_item(fp.name, handle)
            proto_dataset.add_item_metadata(handle, "number", i)

    # Time only the freeze step.
    start = time.time()
    proto_dataset.freeze()
    elapsed = time.time() - start
    print("{},{}".format(num_files, elapsed))
def test_update_name_of_frozen_dataset(tmp_uri_fixture):  # NOQA
    """Renaming a frozen dataset persists and leaves all other admin
    metadata untouched."""
    import dtoolcore

    # Create and freeze a dataset.
    admin_metadata = dtoolcore.generate_admin_metadata("test_name")
    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=admin_metadata,
        base_uri=tmp_uri_fixture)
    proto_dataset.create()
    proto_dataset.freeze()

    dataset = dtoolcore.DataSet.from_uri(proto_dataset.uri)
    assert dataset.name == "test_name"

    dataset.update_name("updated_name")
    assert dataset.name == "updated_name"

    # The rename survives a reload from the URI.
    dataset_again = dtoolcore.DataSet.from_uri(proto_dataset.uri)
    assert dataset_again.name == "updated_name"

    # Only the name changed; every other admin metadata entry is intact.
    for key, value in admin_metadata.items():
        if key != "name":
            assert dataset_again._admin_metadata[key] == value
def test_diff_content(tmp_dir_fixture):  # NOQA
    """diff_content reports items whose content hashes differ."""
    from dtoolcore import (
        DataSet,
        generate_admin_metadata,
        generate_proto_dataset,
    )
    from dtoolcore.utils import generate_identifier
    from dtoolcore.compare import diff_content
    from dtoolcore.storagebroker import DiskStorageBroker

    fpaths = create_test_files(tmp_dir_fixture)

    def frozen_dataset(name, fpath):
        # Helper: single-item frozen dataset wrapping ``fpath``.
        proto = generate_proto_dataset(
            admin_metadata=generate_admin_metadata(name),
            prefix=tmp_dir_fixture,
            storage="file")
        proto.create()
        proto.put_item(fpath, "file.txt")
        proto.freeze()
        return DataSet.from_uri(proto.uri)

    ds_a = frozen_dataset("test_compare_1", fpaths["cat"])
    ds_b = frozen_dataset("test_compare_2", fpaths["she"])

    # A dataset never differs in content from itself.
    assert diff_content(ds_a, ds_a) == []

    identifier = generate_identifier("file.txt")
    expected = [
        (identifier,
         DiskStorageBroker.hasher(ds_a.item_content_abspath(identifier)),
         DiskStorageBroker.hasher(ds_b.item_content_abspath(identifier))),
    ]
    assert diff_content(ds_a, ds_b) == expected
def proto_dataset_from_base_uri(name, base_uri):
    """Return a newly created proto dataset named ``name`` in ``base_uri``.

    The base URI is normalised via generous_parse_uri/urlunparse before use.
    """
    normalised_base_uri = dtoolcore.utils.urlunparse(
        dtoolcore.utils.generous_parse_uri(base_uri))
    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=dtoolcore.generate_admin_metadata(name),
        base_uri=normalised_base_uri)
    proto_dataset.create()
    return proto_dataset
def test_status_command_on_proto_dataset_functional(tmp_dir_fixture):  # NOQA
    """The ``status`` CLI command reports "proto" for a proto dataset."""
    from dtoolcore import generate_admin_metadata, generate_proto_dataset
    from dtool_info.dataset import status

    proto_dataset = generate_proto_dataset(
        admin_metadata=generate_admin_metadata("test_ds"),
        base_uri=tmp_dir_fixture)
    proto_dataset.create()

    result = CliRunner().invoke(status, [proto_dataset.uri])
    assert result.exit_code == 0
    assert result.output.strip() == "proto"
def create_derived_dataset(parent_dataset, dest_location_uri, name_suffix):
    """Create and return a proto dataset named after ``parent_dataset``
    with ``name_suffix`` appended.

    Raises click.UsageError if the storage broker cannot create it.
    """
    derived_name = "{}_{}".format(parent_dataset.name, name_suffix)
    dest_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=dtoolcore.generate_admin_metadata(derived_name),
        base_uri=dest_location_uri,
        config_path=CONFIG_PATH)
    try:
        dest_dataset.create()
    except dtoolcore.storagebroker.StorageBrokerOSError as err:
        # Surface storage-level failures as CLI usage errors.
        raise click.UsageError(str(err))
    return dest_dataset
def test_diff_identifiers(tmp_dir_fixture):  # NOQA
    """diff_identifiers reports identifiers present in only one dataset."""
    from dtoolcore import (
        DataSet,
        generate_admin_metadata,
        generate_proto_dataset,
    )
    from dtoolcore.utils import generate_identifier
    from dtoolcore.compare import diff_identifiers

    fpaths = create_test_files(tmp_dir_fixture)

    def frozen_dataset(name, relpath):
        # Helper: frozen dataset holding the "cat" file under ``relpath``.
        proto = generate_proto_dataset(
            admin_metadata=generate_admin_metadata(name),
            prefix=tmp_dir_fixture,
            storage="file")
        proto.create()
        proto.put_item(fpaths["cat"], relpath)
        proto.freeze()
        return DataSet.from_uri(proto.uri)

    ds_a = frozen_dataset("test_compare_1", "a.txt")
    ds_b = frozen_dataset("test_compare_2", "b.txt")

    # A dataset shares all identifiers with itself.
    assert diff_identifiers(ds_a, ds_a) == []

    # (identifier, in_a, in_b) for each non-shared identifier.
    expected = [
        (generate_identifier("a.txt"), True, False),
        (generate_identifier("b.txt"), False, True),
    ]
    assert diff_identifiers(ds_a, ds_b) == expected
def create(quiet, name, storage, prefix):
    """Create a proto dataset.

    CLI command body (options are bound by click decorators outside this
    view -- TODO confirm). Creates the proto dataset at ``prefix`` using
    the ``storage`` broker, writes an empty README, then prints either
    just the URI (--quiet) or the URI plus next-step hints.
    """
    admin_metadata = dtoolcore.generate_admin_metadata(name)

    # Create the dataset.
    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=admin_metadata,
        prefix=prefix,
        storage=storage,
        config_path=CONFIG_PATH)
    try:
        proto_dataset.create()
    except dtoolcore.storagebroker.StorageBrokerOSError as err:
        # Surface storage-level failures as CLI usage errors.
        raise click.UsageError(str(err))

    proto_dataset.put_readme("")

    if quiet:
        # Machine-friendly output: the URI only.
        click.secho(proto_dataset.uri)
    else:
        # Give the user some feedback and hints on what to do next.
        click.secho("Created proto dataset ", nl=False, fg="green")
        click.secho(proto_dataset.uri)
        click.secho("Next steps: ")

        step = 1
        click.secho("{}. Add descriptive metadata, e.g: ".format(step))
        click.secho(
            "   dtool readme interactive {}".format(proto_dataset.uri),
            fg="cyan")

        if storage != "symlink":
            # Symlink datasets reference existing data, so no "add" step.
            step = step + 1
            click.secho("{}. Add raw data, eg:".format(step))
            click.secho(
                "   dtool add item my_file.txt {}".format(proto_dataset.uri),
                fg="cyan")

            if storage == "file":
                # Find the abspath of the data directory for user feedback.
                data_path = proto_dataset._storage_broker._data_abspath
                click.secho("   Or use your system commands, e.g: ")
                click.secho(
                    "   mv my_data_directory {}/".format(data_path),
                    fg="cyan")

        step = step + 1
        click.secho(
            "{}. Convert the proto dataset into a dataset: ".format(step))
        click.secho("   dtool freeze {}".format(proto_dataset.uri), fg="cyan")
def test_copy(tmp_dir_fixture):  # NOQA
    """dtoolcore.copy produces a destination dataset identical to the
    source: admin metadata, items, README and overlays all match."""
    import dtoolcore

    src_dir = os.path.join(tmp_dir_fixture, "src")
    dest_dir = os.path.join(tmp_dir_fixture, "dest")
    for directory in (src_dir, dest_dir):
        os.mkdir(directory)

    # Build and freeze the source dataset with a README and an overlay.
    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=dtoolcore.generate_admin_metadata("test_copy"),
        prefix=src_dir,
        storage="file")
    proto_dataset.create()
    src_uri = proto_dataset.uri
    proto_dataset.put_readme("---\nproject: exciting\n")
    overlay = "file_extension"
    for fname in os.listdir(TEST_SAMPLE_DATA):
        _, ext = os.path.splitext(fname)
        proto_dataset.put_item(os.path.join(TEST_SAMPLE_DATA, fname), fname)
        proto_dataset.add_item_metadata(fname, overlay, ext)
    proto_dataset.freeze()

    # Copy the src dataset to dest.
    dest_uri = dtoolcore.copy(src_uri, dest_dir, "file")

    # Compare the two datasets.
    src_ds = dtoolcore.DataSet.from_uri(src_uri)
    dest_ds = dtoolcore.DataSet.from_uri(dest_uri)
    assert src_ds._admin_metadata == dest_ds._admin_metadata
    assert src_ds.identifiers == dest_ds.identifiers
    for identifier in src_ds.identifiers:
        assert src_ds.item_properties(identifier) == \
            dest_ds.item_properties(identifier)
    assert src_ds.get_readme_content() == dest_ds.get_readme_content()
    assert src_ds.list_overlay_names() == dest_ds.list_overlay_names()
    assert src_ds.get_overlay(overlay) == dest_ds.get_overlay(overlay)
def test_update_name_raises_DtoolCoreInvalidName(tmp_uri_fixture):  # NOQA
    """A rename to a name containing an invalid character is rejected."""
    import dtoolcore
    from dtoolcore import DtoolCoreInvalidNameError

    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=dtoolcore.generate_admin_metadata("test_name"),
        base_uri=tmp_uri_fixture)
    assert proto_dataset.name == "test_name"

    # A valid rename is accepted.
    proto_dataset.update_name("test_new_name")
    assert proto_dataset.name == "test_new_name"

    proto_dataset.create()

    # ":" is not a valid character in a dataset name.
    with pytest.raises(DtoolCoreInvalidNameError):
        proto_dataset.update_name("test_another:new_name")
def test_list_dataset_uris(tmp_dir_fixture):  # NOQA
    """DiskStorageBroker.list_dataset_uris finds every dataset under a
    prefix, and nothing when the prefix is empty."""
    import dtoolcore
    from dtoolcore.storagebroker import DiskStorageBroker

    # No datasets yet -> no URIs.
    assert DiskStorageBroker.list_dataset_uris(
        prefix=tmp_dir_fixture, config_path=None) == []

    # Create a couple of datasets under the prefix.
    expected_uris = set()
    for name in ("test_ds_1", "test_ds_2"):
        proto_dataset = dtoolcore.generate_proto_dataset(
            admin_metadata=dtoolcore.generate_admin_metadata(name),
            prefix=tmp_dir_fixture,
            storage="file")
        proto_dataset.create()
        expected_uris.add(proto_dataset.uri)

    actual_uris = DiskStorageBroker.list_dataset_uris(
        prefix=tmp_dir_fixture, config_path=None)
    assert set(actual_uris) == expected_uris
def test_update_name(tmp_uri_fixture):  # NOQA
    """update_name works both before and after creation and persists."""
    import dtoolcore

    proto_dataset = dtoolcore.generate_proto_dataset(
        admin_metadata=dtoolcore.generate_admin_metadata("test_name"),
        base_uri=tmp_uri_fixture)
    assert proto_dataset.name == "test_name"

    # Rename before create()...
    proto_dataset.update_name("test_new_name")
    assert proto_dataset.name == "test_new_name"

    proto_dataset.create()

    # ...and again after create().
    proto_dataset.update_name("test_another_new_name")
    assert proto_dataset.name == "test_another_new_name"

    # The final name is visible on reload from the URI.
    reloaded = dtoolcore.ProtoDataSet.from_uri(proto_dataset.uri)
    assert reloaded.name == "test_another_new_name"