def test_diff_identifiers(tmp_uri_fixture):  # NOQA
    """Check diff_identifiers for a dataset against itself and another.

    Two frozen datasets are built from the same source file stored under
    different relpaths ("a.txt" and "b.txt"), so their identifier sets
    differ by exactly one entry on each side.
    """
    from dtoolcore import (
        DataSet,
        generate_admin_metadata,
        generate_proto_dataset,
    )
    from dtoolcore.compare import diff_identifiers
    from dtoolcore.utils import generate_identifier

    fpaths = create_test_files(tmp_uri_fixture)

    # Build and freeze both datasets first; they are only loaded afterwards.
    frozen_uris = []
    for ds_name, relpath in (
        ("test_compare_1", "a.txt"),
        ("test_compare_2", "b.txt"),
    ):
        proto = generate_proto_dataset(
            admin_metadata=generate_admin_metadata(ds_name),
            base_uri=tmp_uri_fixture,
        )
        proto.create()
        proto.put_item(fpaths["cat"], relpath)
        proto.freeze()
        frozen_uris.append(proto.uri)

    uri_a, uri_b = frozen_uris
    ds_a = DataSet.from_uri(uri_a)
    ds_b = DataSet.from_uri(uri_b)

    # A dataset never differs from itself.
    assert diff_identifiers(ds_a, ds_a) == []

    # Each tuple is (identifier, present in first, present in second).
    expected = [
        (generate_identifier("a.txt"), True, False),
        (generate_identifier("b.txt"), False, True),
    ]
    assert diff_identifiers(ds_a, ds_b) == expected
def test_http_enable_with_presigned_url(tmp_uuid_and_uri):  # NOQA
    """Check http_enable when DTOOL_S3_PUBLISH_EXPIRY is set.

    The access URL and every URL in the HTTP manifest are expected to carry
    a query string ("?"), which the test treats as the marker of a presigned
    URL. The dataset read back over HTTP must match the original.
    """
    from dtoolcore import DataSet, ProtoDataSet, generate_admin_metadata
    from dtoolcore.compare import diff_content, diff_identifiers, diff_sizes

    uuid, dest_uri = tmp_uuid_and_uri

    admin_metadata = generate_admin_metadata("my_dataset")
    admin_metadata["uuid"] = uuid

    local_file_path = os.path.join(TEST_SAMPLE_DATA, "tiny.png")

    # Create a minimal dataset.
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None,
    )
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, "tiny.png")
    proto_dataset.put_readme("---\nproject: testing\n")
    proto_dataset.freeze()

    dataset = DataSet.from_uri(dest_uri)

    # Add an annotation.
    dataset.put_annotation("project", "dtool-testing")

    # Add tags.
    for tag in ("amazing", "stuff"):
        dataset.put_tag(tag)

    with tmp_env_var("DTOOL_S3_PUBLISH_EXPIRY", "120"):
        access_url = dataset._storage_broker.http_enable()

    # This is a presigned URL dataset.
    assert "?" in access_url
    assert access_url.startswith("https://")

    dataset_from_http = DataSet.from_uri(access_url)

    # Assert that the annotation has been copied across.
    assert dataset_from_http.get_annotation("project") == "dtool-testing"

    # Assert that the tags are available.
    assert dataset_from_http.list_tags() == ["amazing", "stuff"]

    assert not diff_identifiers(dataset, dataset_from_http)
    assert not diff_sizes(dataset, dataset_from_http)
    assert not diff_content(dataset_from_http, dataset)

    # Make sure that all the URLs in the manifest are presigned.
    http_manifest = dataset_from_http._storage_broker.http_manifest
    assert "?" in http_manifest["manifest_url"]
    assert "?" in http_manifest["readme_url"]
    for item_url in http_manifest["item_urls"].values():
        assert "?" in item_url
    for annotation_url in http_manifest["annotations"].values():
        assert "?" in annotation_url
def diff(full, dataset_uri, reference_dataset_uri):
    """Report the difference between two datasets.

    1. Checks that the identifiers are identical
    2. Checks that the sizes are identical
    3. Checks that the hashes are identical, if the '--full' option is used

    If a difference is detected in step 1, steps 2 and 3 will not be carried
    out. Similarly if a difference is detected in step 2, step 3 will not be
    carried out.

    When checking that the hashes are identical the hashes for the first
    dataset are recalculated using the hashing algorithm of the reference
    dataset.
    """
    # NOTE(review): this docstring is presumably rendered as the command's
    # help text, so parameter notes live in comments rather than in it.
    #
    # full                  -- when truthy, also compare content hashes.
    # dataset_uri           -- URI of the dataset being checked.
    # reference_dataset_uri -- URI of the dataset it is compared against.
    #
    # The process exits with status 1, 2 or 3 when the identifiers, sizes or
    # hashes differ respectively; otherwise the function returns normally.

    def echo_header(desc, ds_name, ref_ds_name, prop):
        """Print the red banner and the column header for one report."""
        click.secho("Different {}".format(desc), fg="red")
        click.secho("ID, {} in '{}', {} in '{}'".format(
            prop, ds_name, prop, ref_ds_name))

    def echo_diff(rows):
        """Print one comma separated line per difference tuple."""
        for row in rows:
            click.secho("{}, {}, {}".format(row[0], row[1], row[2]))

    ds = dtoolcore.DataSet.from_uri(dataset_uri)
    ref_ds = dtoolcore.DataSet.from_uri(reference_dataset_uri)

    # Used as the length of both progress bars below.
    num_items = len(list(ref_ds.identifiers))

    ids_diff = diff_identifiers(ds, ref_ds)
    if ids_diff:
        echo_header("identifiers", ds.name, ref_ds.name, "present")
        echo_diff(ids_diff)
        sys.exit(1)

    with click.progressbar(length=num_items,
                           label="Comparing sizes") as progressbar:
        sizes_diff = diff_sizes(ds, ref_ds, progressbar)
    if sizes_diff:
        echo_header("sizes", ds.name, ref_ds.name, "size")
        echo_diff(sizes_diff)
        sys.exit(2)

    if full:
        with click.progressbar(length=num_items,
                               label="Comparing hashes") as progressbar:
            content_diff = diff_content(ds, ref_ds, progressbar)
        if content_diff:
            echo_header("content", ds.name, ref_ds.name, "hash")
            echo_diff(content_diff)
            sys.exit(3)
def test_http_enable(tmp_uuid_and_uri):  # NOQA
    """Check that an HTTP-enabled dataset matches the original dataset."""
    from dtoolcore import DataSet, ProtoDataSet, generate_admin_metadata
    from dtoolcore.compare import diff_content, diff_identifiers, diff_sizes

    uuid, dest_uri = tmp_uuid_and_uri

    admin_metadata = generate_admin_metadata("my_dataset")
    admin_metadata["uuid"] = uuid

    local_file_path = os.path.join(TEST_SAMPLE_DATA, "tiny.png")

    # Create a minimal dataset.
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None,
    )
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, "tiny.png")
    proto_dataset.put_readme("---\nproject: testing\n")
    proto_dataset.freeze()

    dataset = DataSet.from_uri(dest_uri)

    access_url = dataset._storage_broker.http_enable()
    assert access_url.startswith("https://")

    # Reading the dataset back through the access URL must show no
    # differences in identifiers, sizes or content.
    dataset_from_http = DataSet.from_uri(access_url)

    assert not diff_identifiers(dataset, dataset_from_http)
    assert not diff_sizes(dataset, dataset_from_http)
    assert not diff_content(dataset_from_http, dataset)
def test_copy_and_diff(tmp_uuid_and_uri):  # NOQA
    """Copy a dataset to a local directory and check the copy matches."""
    import dtoolcore
    from dtoolcore import DataSet, ProtoDataSet, generate_admin_metadata
    from dtoolcore.compare import diff_content, diff_identifiers, diff_sizes

    uuid, dest_uri = tmp_uuid_and_uri

    admin_metadata = generate_admin_metadata("my_dataset")
    admin_metadata["uuid"] = uuid

    local_file_path = os.path.join(TEST_SAMPLE_DATA, "tiny.png")

    # Create a minimal dataset.
    proto_dataset = ProtoDataSet(uri=dest_uri, admin_metadata=admin_metadata)
    proto_dataset.create()
    proto_dataset.put_readme(content="---\ndescription: test")
    proto_dataset.put_item(local_file_path, "tiny.png")
    proto_dataset.freeze()

    remote_dataset = DataSet.from_uri(dest_uri)

    # All checks run while the temporary directory holding the copy exists.
    with tmp_directory() as local_dir:
        local_uri = dtoolcore.copy(dest_uri, local_dir)
        assert local_uri.startswith("file:/")

        local_dataset = DataSet.from_uri(local_uri)

        assert not diff_identifiers(local_dataset, remote_dataset)
        assert not diff_sizes(local_dataset, remote_dataset)
        assert not diff_content(local_dataset, remote_dataset)
def test_http_enable(tmp_uuid_and_uri):  # NOQA
    """Check the generated HTTP manifest and the HTTP-enabled dataset.

    First verifies the keys of the manifest produced by the storage broker,
    then enables HTTP access and confirms that annotations, tags and items
    read back through the access URL match the original dataset.
    """
    from dtoolcore import DataSet, ProtoDataSet, generate_admin_metadata
    from dtoolcore.compare import diff_content, diff_identifiers, diff_sizes

    uuid, dest_uri = tmp_uuid_and_uri

    admin_metadata = generate_admin_metadata("my_dataset")
    admin_metadata["uuid"] = uuid

    local_file_path = os.path.join(TEST_SAMPLE_DATA, "tiny.png")

    # Create a minimal dataset.
    proto_dataset = ProtoDataSet(
        uri=dest_uri,
        admin_metadata=admin_metadata,
        config_path=None,
    )
    proto_dataset.create()
    proto_dataset.put_item(local_file_path, "tiny.png")
    proto_dataset.put_readme("---\nproject: testing\n")
    proto_dataset.freeze()

    dataset = DataSet.from_uri(dest_uri)

    # Test HTTP manifest.
    http_manifest = dataset._storage_broker._generate_http_manifest()
    required_keys = (
        "admin_metadata",
        "overlays",
        "readme_url",
        "manifest_url",
        "item_urls",
        "annotations",
        "tags",
    )
    for key in required_keys:
        assert key in http_manifest
    assert http_manifest["admin_metadata"] == dataset._admin_metadata
    assert set(http_manifest["item_urls"]) == set(dataset.identifiers)

    # Add an annotation.
    dataset.put_annotation("project", "dtool-testing")

    # Add tags.
    for tag in ("amazing", "stuff"):
        dataset.put_tag(tag)

    access_url = dataset._storage_broker.http_enable()
    assert access_url.startswith("https://")

    dataset_from_http = DataSet.from_uri(access_url)

    # Assert that the annotation has been copied across.
    assert dataset_from_http.get_annotation("project") == "dtool-testing"

    # Assert that the tags are available.
    assert dataset_from_http.list_tags() == ["amazing", "stuff"]

    assert not diff_identifiers(dataset, dataset_from_http)
    assert not diff_sizes(dataset, dataset_from_http)
    assert not diff_content(dataset_from_http, dataset)