Example #1
def test_load_last_manifest(tmpdir, example_datasets_with_metadata):
    """
    Test that load_last_manifest works
    """
    bucket_name = 'load_lst_manifest_bucket'
    metadatasets = example_datasets_with_metadata['metadata']
    create_bucket(bucket_name,
                  example_datasets_with_metadata['data'],
                  metadatasets=metadatasets)

    cache_dir = pathlib.Path(tmpdir) / 'load_last_cache'
    cache = S3CloudCache(cache_dir, bucket_name, 'project-x')

    # check that load_last_manifest in a new cache loads the
    # latest manifest without emitting a warning
    with pytest.warns(None) as warnings:
        cache.load_last_manifest()
    ct = 0
    for w in warnings.list:
        if w._category_name == 'OutdatedManifestWarning':
            ct += 1
    assert ct == 0
    assert cache.current_manifest == 'project-x_manifest_v15.0.0.json'

    cache.load_manifest('project-x_manifest_v7.0.0.json')

    del cache

    # check that load_last_manifest on an old cache emits the
    # expected warning and loads the correct manifest
    cache = S3CloudCache(cache_dir, bucket_name, 'project-x')
    expected = 'A more up to date version of the '
    expected += 'dataset -- project-x_manifest_v15.0.0.json '
    expected += '-- exists online'
    with pytest.warns(OutdatedManifestWarning, match=expected) as warnings:
        cache.load_last_manifest()

    assert cache.current_manifest == 'project-x_manifest_v7.0.0.json'
    cache.load_manifest('project-x_manifest_v4.0.0.json')
    del cache

    # repeat the above test, making sure the correct manifest is
    # loaded again
    cache = S3CloudCache(cache_dir, bucket_name, 'project-x')
    expected = 'A more up to date version of the '
    expected += 'dataset -- project-x_manifest_v15.0.0.json '
    expected += '-- exists online'
    with pytest.warns(OutdatedManifestWarning, match=expected) as warnings:
        cache.load_last_manifest()

    assert cache.current_manifest == 'project-x_manifest_v4.0.0.json'
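The workflow this test exercises can be sketched as a plain usage example. This is illustrative only; it assumes the same S3CloudCache and OutdatedManifestWarning classes imported by these tests, and the cache directory, bucket, and project names are placeholders.

import pathlib
import warnings

# Hypothetical usage of the API exercised above (names are placeholders).
cache = S3CloudCache(pathlib.Path('/tmp/my_cache'), 'my-bucket', 'project-x')

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    # loads whatever manifest was used last time, or the latest
    # manifest if this cache directory has never been used
    cache.load_last_manifest()

if any(issubclass(w.category, OutdatedManifestWarning) for w in caught):
    # a newer manifest exists online; opt in to it explicitly if desired
    cache.load_latest_manifest()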
Example #2
def test_on_removed_symlinks(tmpdir, example_datasets):
    """
    Test that the CloudCache re-downloads files when the
    symlinks have been removed
    """
    bucket_name = 'corruption_bucket'
    create_bucket(bucket_name, example_datasets)

    cache_dir = pathlib.Path(tmpdir) / 'cache'
    cache = S3CloudCache(cache_dir, bucket_name, 'project-x')

    version_list = ('1.0.0', '2.0.0', '3.0.0')
    file_id_list = ('1', '2', '3')

    for version in version_list:
        cache.load_manifest(f'project-x_manifest_v{version}.json')
        for file_id in file_id_list:
            cache.download_data(file_id)

    # make sure that all files exist
    for version in version_list:
        cache.load_manifest(f'project-x_manifest_v{version}.json')
        for file_id in file_id_list:
            attr = cache.data_path(file_id)
            assert attr['exists']

    hasher = hashlib.blake2b()
    hasher.update(b'4567890')
    true_hash = hasher.hexdigest()

    p1 = cache_dir / 'project-x-1.0.0' / 'data' / 'f2.txt'
    p2 = cache_dir / 'project-x-2.0.0' / 'data' / 'f2.txt'

    # note that f2.txt is identical between v 1.0.0 and 2.0.0
    assert p1.is_file()
    assert not p1.is_symlink()
    assert p2.is_symlink()
    assert p1.resolve() == p2.resolve()

    # remove symlink at p2 and show that the file
    # still exists (and that the symlink gets restored
    # once you ask for the file path)
    p2.unlink()
    assert not p2.exists()
    assert not p2.is_symlink()
    assert p1.is_file()

    cache.load_manifest('project-x_manifest_v2.0.0.json')
    test_path = cache.data_path('2')
    assert test_path['exists']
    p2 = pathlib.Path(test_path['local_path'])
    assert p2.is_symlink()
    assert p2.exists()
    assert p1.absolute() != p2.absolute()
    assert p1.resolve() == p2.resolve()

    hasher = hashlib.blake2b()
    with open(p2, 'rb') as in_file:
        hasher.update(in_file.read())
    assert hasher.hexdigest() == true_hash
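The blake2b pattern used above (and repeated in most of the examples below) could be wrapped in a small helper; this is a hypothetical convenience, not part of the library under test.

import hashlib
import pathlib


def file_hash_matches(path: pathlib.Path, expected_hash: str) -> bool:
    # Hypothetical helper: return True if the blake2b hex digest of the
    # file at `path` equals `expected_hash`. The files in these tests are
    # small, so reading them whole is fine.
    hasher = hashlib.blake2b()
    with open(path, 'rb') as in_file:
        hasher.update(in_file.read())
    return hasher.hexdigest() == expected_hash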
Example #3
def test_compare_manifest_string(tmpdir, example_datasets_with_metadata):
    """
    Test that CloudCacheBase.compare_manifests reports the correct
    changes when comparing two manifests
    """
    bucket_name = 'compare_manifest_bucket'
    create_bucket(bucket_name,
                  example_datasets_with_metadata['data'],
                  metadatasets=example_datasets_with_metadata['metadata'])

    cache_dir = pathlib.Path(tmpdir) / 'cache'
    cache = S3CloudCache(cache_dir, bucket_name, 'project-x')

    msg = cache.compare_manifests('project-x_manifest_v1.0.0.json',
                                  'project-x_manifest_v15.0.0.json')

    expected = 'Changes going from\n'
    expected += 'project-x_manifest_v1.0.0.json\n'
    expected += 'to\n'
    expected += 'project-x_manifest_v15.0.0.json\n\n'
    expected += 'project_metadata/metadata_1.csv deleted\n'
    expected += 'project_metadata/metadata_2.csv renamed '
    expected += 'project_metadata/metadata_4.csv\n'
    expected += 'project_metadata/metadata_3.csv deleted\n'
    expected += 'data/f1.txt renamed data/f4.txt\n'
    expected += 'data/f5.txt created\n'
    expected += 'data/f6.txt created\n'

    assert msg == expected
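As a usage note, the same human-readable diff can be produced directly; the sketch below assumes a cache like the one constructed above that can see both manifests.

# Illustrative only: print the summary of deletions, renames, and creations
# between two manifest versions.
msg = cache.compare_manifests('project-x_manifest_v1.0.0.json',
                              'project-x_manifest_v15.0.0.json')
print(msg)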
Example #4
def test_metadata(tmpdir):
    """
    Test that S3CloudCache.metadata() returns the expected pandas DataFrame
    """
    data = {}
    data['mouse_id'] = [1, 4, 6, 8]
    data['sex'] = ['F', 'F', 'M', 'M']
    data['age'] = ['P50', 'P46', 'P23', 'P40']
    true_df = pd.DataFrame(data)

    with io.StringIO() as stream:
        true_df.to_csv(stream, index=False)
        stream.seek(0)
        data = bytes(stream.read(), 'utf-8')

    hasher = hashlib.blake2b()
    hasher.update(data)
    true_checksum = hasher.hexdigest()

    test_bucket_name = 'bucket_for_metadata'
    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=test_bucket_name, ACL='public-read')

    # turn on bucket versioning
    # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#bucketversioning
    bucket_versioning = conn.BucketVersioning(test_bucket_name)
    bucket_versioning.enable()

    client = boto3.client('s3', region_name='us-east-1')
    client.put_object(Bucket=test_bucket_name,
                      Key='metadata_file.csv',
                      Body=data)

    response = client.list_object_versions(Bucket=test_bucket_name)
    version_id = response['Versions'][0]['VersionId']

    manifest = {}
    manifest['manifest_version'] = '1'
    manifest['project_name'] = "project-X"
    manifest['metadata_file_id_column_name'] = 'file_id'
    url = f'http://{test_bucket_name}.s3.amazonaws.com/metadata_file.csv'
    metadata_file = {
        'url': url,
        'version_id': version_id,
        'file_hash': true_checksum
    }

    manifest['metadata_files'] = {'metadata_file.csv': metadata_file}
    manifest['data_pipeline'] = 'placeholder'

    client.put_object(Bucket=test_bucket_name,
                      Key='proj/manifests/manifest_1.json',
                      Body=bytes(json.dumps(manifest), 'utf-8'))

    cache_dir = pathlib.Path(tmpdir) / "metadata/cache"
    cache = S3CloudCache(cache_dir, test_bucket_name, 'proj')
    cache.load_manifest('manifest_1.json')

    metadata_df = cache.get_metadata('metadata_file.csv')
    assert true_df.equals(metadata_df)
Example #5
def test_list_all_downloaded(tmpdir, example_datasets_with_metadata):
    """
    Test that list_all_downloaded_manifests works
    """

    bucket_name = 'outdated_manifest_bucket'
    metadatasets = example_datasets_with_metadata['metadata']
    create_bucket(bucket_name,
                  example_datasets_with_metadata['data'],
                  metadatasets=metadatasets)

    cache_dir = pathlib.Path(tmpdir) / 'cache'
    cache = S3CloudCache(cache_dir, bucket_name, 'project-x')

    assert cache.list_all_downloaded_manifests() == []

    cache.load_manifest('project-x_manifest_v5.0.0.json')
    assert cache.current_manifest == 'project-x_manifest_v5.0.0.json'
    cache.load_manifest('project-x_manifest_v2.0.0.json')
    assert cache.current_manifest == 'project-x_manifest_v2.0.0.json'
    cache.load_manifest('project-x_manifest_v3.0.0.json')
    assert cache.current_manifest == 'project-x_manifest_v3.0.0.json'

    expected = {
        'project-x_manifest_v5.0.0.json', 'project-x_manifest_v2.0.0.json',
        'project-x_manifest_v3.0.0.json'
    }
    downloaded = set(cache.list_all_downloaded_manifests())
    assert downloaded == expected
Example #6
def test_latest_manifest_warning(tmpdir, example_datasets_with_metadata):
    """
    Test that the correct warning is emitted when the user calls
    load_latest_manifest but the latest manifest has not been downloaded yet
    """

    bucket_name = 'outdated_manifest_bucket'
    metadatasets = example_datasets_with_metadata['metadata']
    create_bucket(bucket_name,
                  example_datasets_with_metadata['data'],
                  metadatasets=metadatasets)

    cache_dir = pathlib.Path(tmpdir) / 'cache'
    cache = S3CloudCache(cache_dir, bucket_name, 'project-x')

    cache.load_manifest('project-x_manifest_v4.0.0.json')

    with pytest.warns(OutdatedManifestWarning) as warnings:
        cache.load_latest_manifest()
    assert len(warnings) == 1
    msg = str(warnings[0].message)
    assert 'project-x_manifest_v4.0.0.json' in msg
    assert 'project-x_manifest_v15.0.0.json' in msg
    assert 'It is possible that some data files' in msg
    cmd = "S3CloudCache.load_manifest('project-x_manifest_v4.0.0.json')"
    assert cmd in msg
Example #7
    def from_s3_cache(cache_dir: Union[str, Path], bucket_name: str,
                      project_name: str,
                      ui_class_name: str) -> "BehaviorProjectCloudApi":
        """instantiates this object with a connection to an s3 bucket and/or
        a local cache related to that bucket.

        Parameters
        ----------
        cache_dir: str or pathlib.Path
            Path to the directory where data will be stored on the local system

        bucket_name: str
            for example, if bucket URI is 's3://mybucket' this value should be
            'mybucket'

        project_name: str
            the name of the project this cache is supposed to access. This
            project name is the first part of the prefix of the release data
            objects. I.e. s3://<bucket_name>/<project_name>/<object tree>

        ui_class_name: str
            Name of user interface class (used to populate error messages)

        Returns
        -------
        BehaviorProjectCloudApi instance

        """
        cache = S3CloudCache(cache_dir,
                             bucket_name,
                             project_name,
                             ui_class_name=ui_class_name)
        return BehaviorProjectCloudApi(cache)
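A minimal usage sketch of this constructor, assuming it is exposed on BehaviorProjectCloudApi as the docstring's return type suggests; all argument values below are placeholders.

# Illustrative call; the directory, bucket, and project names are placeholders.
api = BehaviorProjectCloudApi.from_s3_cache(
    cache_dir='/data/behavior_cache',
    bucket_name='mybucket',
    project_name='my-project',
    ui_class_name='MyProjectCache')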
Example #8
def test_download_data(tmpdir):
    """
    Test that S3CloudCache.download_data() correctly downloads files from S3
    """

    hasher = hashlib.blake2b()
    data = b'11235813kjlssergwesvsdd'
    hasher.update(data)
    true_checksum = hasher.hexdigest()

    test_bucket_name = 'bucket_for_download_data'
    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=test_bucket_name, ACL='public-read')

    # turn on bucket versioning
    # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#bucketversioning
    bucket_versioning = conn.BucketVersioning(test_bucket_name)
    bucket_versioning.enable()

    client = boto3.client('s3', region_name='us-east-1')
    client.put_object(Bucket=test_bucket_name,
                      Key='data/data_file.txt',
                      Body=data)

    response = client.list_object_versions(Bucket=test_bucket_name)
    version_id = response['Versions'][0]['VersionId']

    manifest = {}
    manifest['manifest_version'] = '1'
    manifest['project_name'] = "project-z"
    manifest['metadata_file_id_column_name'] = 'file_id'
    manifest['metadata_files'] = {}
    url = f'http://{test_bucket_name}.s3.amazonaws.com/project-z/data/data_file.txt'  # noqa: E501
    data_file = {
        'url': url,
        'version_id': version_id,
        'file_hash': true_checksum
    }

    manifest['data_files'] = {'only_data_file': data_file}
    manifest['data_pipeline'] = 'placeholder'

    client.put_object(Bucket=test_bucket_name,
                      Key='proj/manifests/manifest_v1.0.0.json',
                      Body=bytes(json.dumps(manifest), 'utf-8'))

    cache_dir = pathlib.Path(tmpdir) / "data/path/cache"
    cache = S3CloudCache(cache_dir, test_bucket_name, 'proj')

    cache.load_manifest('manifest_v1.0.0.json')

    expected_path = cache_dir / 'project-z-1' / 'data/data_file.txt'
    assert not expected_path.exists()

    # test data_path
    attr = cache.data_path('only_data_file')
    assert attr['local_path'] == expected_path
    assert not attr['exists']
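An illustrative continuation of the snippet above (not part of the original test): once the file is downloaded, data_path reports it as existing.

# Hypothetical follow-up using the same cache object as above.
local_path = cache.download_data('only_data_file')
assert local_path.exists()
attr = cache.data_path('only_data_file')
assert attr['exists']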
Example #9
def test_re_download_file(tmpdir):
    """
    Test that S3CloudCache._download_file will re-download a file
    when it has been removed from the local system
    """

    hasher = hashlib.blake2b()
    data = b'11235813kjlssergwesvsdd'
    hasher.update(data)
    true_checksum = hasher.hexdigest()

    test_bucket_name = 'bucket_for_re_download'
    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=test_bucket_name, ACL='public-read')

    # turn on bucket versioning
    # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#bucketversioning
    bucket_versioning = conn.BucketVersioning(test_bucket_name)
    bucket_versioning.enable()

    client = boto3.client('s3', region_name='us-east-1')
    client.put_object(Bucket=test_bucket_name,
                      Key='data/data_file.txt',
                      Body=data)

    response = client.list_object_versions(Bucket=test_bucket_name)
    version_id = response['Versions'][0]['VersionId']

    cache_dir = pathlib.Path(tmpdir) / 'download/test/cache'
    cache = S3CloudCache(cache_dir, test_bucket_name, 'proj')

    expected_path = cache_dir / true_checksum / 'data/data_file.txt'

    url = f'http://{test_bucket_name}.s3.amazonaws.com/data/data_file.txt'
    good_attributes = CacheFileAttributes(url, version_id, true_checksum,
                                          expected_path)

    assert not expected_path.exists()
    cache._download_file(good_attributes)
    assert expected_path.exists()
    hasher = hashlib.blake2b()
    with open(expected_path, 'rb') as in_file:
        hasher.update(in_file.read())
    assert hasher.hexdigest() == true_checksum

    # now, remove the file, and see if it gets re-downloaded
    expected_path.unlink()
    assert not expected_path.exists()

    cache._download_file(good_attributes)
    assert expected_path.exists()
    hasher = hashlib.blake2b()
    with open(expected_path, 'rb') as in_file:
        hasher.update(in_file.read())
    assert hasher.hexdigest() == true_checksum
Example #10
def test_corrupted_load_last_manifest(tmpdir, example_datasets_with_metadata):
    """
    Test that load_last_manifest works when the record of the last
    manifest has been corrupted
    """
    bucket_name = 'load_lst_manifest_bucket'
    metadatasets = example_datasets_with_metadata['metadata']
    create_bucket(bucket_name,
                  example_datasets_with_metadata['data'],
                  metadatasets=metadatasets)

    cache_dir = pathlib.Path(tmpdir) / 'load_last_cache'
    cache = S3CloudCache(cache_dir, bucket_name, 'project-x')
    cache.load_manifest('project-x_manifest_v9.0.0.json')
    fname = cache._manifest_last_used.resolve()
    del cache
    with open(fname, 'w') as out_file:
        out_file.write('babababa')
    cache = S3CloudCache(cache_dir, bucket_name, 'project-x')
    expected = 'Loading latest version -- project-x_manifest_v15.0.0.json'
    with pytest.warns(UserWarning, match=expected):
        cache.load_last_manifest()
    assert cache.current_manifest == 'project-x_manifest_v15.0.0.json'
Example #11
def test_file_exists(tmpdir):
    """
    Test that cache._file_exists behaves correctly
    """

    data = b'aakderasjklsafetss77123523asf'
    hasher = hashlib.blake2b()
    hasher.update(data)
    true_checksum = hasher.hexdigest()
    test_file_path = pathlib.Path(tmpdir) / 'junk.txt'
    with open(test_file_path, 'wb') as out_file:
        out_file.write(data)

    # need to populate a bucket in order for
    # S3CloudCache to be instantiated
    test_bucket_name = 'silly_bucket'
    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=test_bucket_name, ACL='public-read')

    cache = S3CloudCache(tmpdir, test_bucket_name, 'proj')

    # should be true
    good_attribute = CacheFileAttributes('http://silly.url.com', '12345',
                                         true_checksum, test_file_path)
    assert cache._file_exists(good_attribute)

    # test when checksum is wrong
    bad_attribute = CacheFileAttributes('http://silly.url.com', '12345',
                                        'probably_not_the_checksum',
                                        test_file_path)
    assert not cache._file_exists(bad_attribute)

    # test when file path is wrong
    bad_path = pathlib.Path('definitely/not/a/file.txt')
    bad_attribute = CacheFileAttributes('http://silly.url.com', '12345',
                                        true_checksum, bad_path)

    assert not cache._file_exists(bad_attribute)

    # test when path exists but is not a file
    bad_attribute = CacheFileAttributes('http://silly.url.com', '12345',
                                        true_checksum, pathlib.Path(tmpdir))
    with pytest.raises(RuntimeError) as context:
        cache._file_exists(bad_attribute)
    assert 'but is not a file' in context.value.args[0]
Example #12
def test_local_cache_file_access(tmpdir, example_datasets):
    """
    Create a cache; download some, but not all of the files
    with S3CloudCache; verify that we can access the files
    with LocalCache
    """

    bucket_name = 'local_cache_bucket'
    create_bucket(bucket_name, example_datasets)
    cache_dir = pathlib.Path(tmpdir) / 'cache'
    cloud_cache = S3CloudCache(cache_dir, bucket_name, 'project-x')

    cloud_cache.load_manifest('project-x_manifest_v1.0.0.json')
    cloud_cache.download_data('1')
    cloud_cache.download_data('3')

    cloud_cache.load_manifest('project-x_manifest_v3.0.0.json')
    cloud_cache.download_data('2')

    del cloud_cache

    local_cache = LocalCache(cache_dir, 'project-x')

    manifest_set = set(local_cache.manifest_file_names)
    assert manifest_set == {
        'project-x_manifest_v1.0.0.json', 'project-x_manifest_v3.0.0.json'
    }

    local_cache.load_manifest('project-x_manifest_v1.0.0.json')
    attr = local_cache.data_path('1')
    assert attr['exists']
    attr = local_cache.data_path('2')
    assert not attr['exists']
    attr = local_cache.data_path('3')
    assert attr['exists']

    local_cache.load_manifest('project-x_manifest_v3.0.0.json')
    attr = local_cache.data_path('1')
    assert attr['exists']  # because file 1 is the same in v1.0 and v3.0
    attr = local_cache.data_path('2')
    assert attr['exists']
    attr = local_cache.data_path('3')
    assert not attr['exists']
Example #13
def test_local_cache_symlink(tmpdir, example_datasets):
    """
    Test that a LocalCache is smart enough to construct
    a symlink where appropriate
    """
    test_bucket_name = 'local_cache_test_bucket'
    create_bucket(test_bucket_name, example_datasets)

    cache_dir = pathlib.Path(tmpdir) / 'cache'

    # create an online cache and download some data
    online_cache = S3CloudCache(cache_dir, test_bucket_name, 'project-x')
    online_cache.load_manifest('project-x_manifest_v1.0.0.json')
    p0 = online_cache.download_data('1')
    online_cache.load_manifest('project-x_manifest_v3.0.0.json')

    # path to file we intend to download
    # (just making sure it wasn't accidentally created early
    # by the online cache)
    shld_be = cache_dir / 'project-x-3.0.0/data/f1.txt'
    assert not shld_be.exists()

    del online_cache

    # create a local cache pointing to the same cache directory
    # and try to access a data file that, while not downloaded,
    # is identical to a file that has been downloaded
    local_cache = LocalCache(cache_dir, test_bucket_name, 'project-x')
    local_cache.load_manifest('project-x_manifest_v3.0.0.json')
    attr = local_cache.data_path('1')
    assert attr['exists']
    assert attr['local_path'].absolute() == shld_be.absolute()
    assert attr['local_path'].is_symlink()
    assert attr['local_path'].resolve() == p0.resolve()

    # test that LocalCache does not have access to data that
    # has not been downloaded
    attr = local_cache.data_path('2')
    assert not attr['exists']
    with pytest.raises(NotImplementedError):
        local_cache.download_data('2')
Example #14
def test_list_all_manifests(tmpdir):
    """
    Test that S3CloudCache.manifest_file_names returns the correct result
    """

    test_bucket_name = 'list_manifest_bucket'

    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=test_bucket_name)

    client = boto3.client('s3', region_name='us-east-1')
    client.put_object(Bucket=test_bucket_name,
                      Key='proj/manifests/manifest_1.json',
                      Body=b'123456')
    client.put_object(Bucket=test_bucket_name,
                      Key='proj/manifests/manifest_2.json',
                      Body=b'123456')
    client.put_object(Bucket=test_bucket_name, Key='junk.txt', Body=b'123456')

    cache = S3CloudCache(tmpdir, test_bucket_name, 'proj')

    assert cache.manifest_file_names == ['manifest_1.json', 'manifest_2.json']
Example #15
def test_latest_manifest(tmpdir, example_datasets_with_metadata):
    """
    Test that the methods which return the latest and latest downloaded
    manifest file names work correctly
    """
    bucket_name = 'latest_manifest_bucket'
    create_bucket(bucket_name,
                  example_datasets_with_metadata['data'],
                  metadatasets=example_datasets_with_metadata['metadata'])

    cache_dir = pathlib.Path(tmpdir) / 'cache'
    cache = S3CloudCache(cache_dir, bucket_name, 'project-x')

    assert cache.latest_downloaded_manifest_file == ''

    cache.load_manifest('project-x_manifest_v7.0.0.json')
    cache.load_manifest('project-x_manifest_v3.0.0.json')
    cache.load_manifest('project-x_manifest_v2.0.0.json')

    assert cache.latest_manifest_file == 'project-x_manifest_v15.0.0.json'

    expected = 'project-x_manifest_v7.0.0.json'
    assert cache.latest_downloaded_manifest_file == expected
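A common pattern these properties enable is to prefer whatever has already been downloaded; the sketch below is illustrative and assumes only the behavior shown in this test (latest_downloaded_manifest_file returns an empty string when nothing is cached).

# Illustrative only: reuse the newest manifest already on disk,
# otherwise fall back to the newest manifest available online.
name = cache.latest_downloaded_manifest_file
if name:
    cache.load_manifest(name)
else:
    cache.load_latest_manifest()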
Example #16
def test_outdated_manifest_warning(tmpdir, example_datasets_with_metadata):
    """
    Test that a warning is raised the first time you try to load an outdated
    manifest
    """

    bucket_name = 'outdated_manifest_bucket'
    metadatasets = example_datasets_with_metadata['metadata']
    create_bucket(bucket_name,
                  example_datasets_with_metadata['data'],
                  metadatasets=metadatasets)

    cache_dir = pathlib.Path(tmpdir) / 'cache'
    cache = S3CloudCache(cache_dir, bucket_name, 'project-x')

    m_warn_type = 'OutdatedManifestWarning'

    with pytest.warns(OutdatedManifestWarning) as warnings:
        cache.load_manifest('project-x_manifest_v7.0.0.json')
    ct = 0
    for w in warnings.list:
        if w._category_name == m_warn_type:
            msg = str(w.message)
            assert 'is not the most up to date' in msg
            assert 'S3CloudCache.compare_manifests' in msg
            assert 'load_latest_manifest' in msg
            ct += 1
    assert ct > 0

    # assert no warning is raised the second time by catching
    # any warnings that are emitted and making sure they are
    # not OutdatedManifestWarnings
    with pytest.warns(None) as warnings:
        cache.load_manifest('project-x_manifest_v11.0.0.json')
    if len(warnings) > 0:
        for w in warnings.list:
            assert w._category_name != 'OutdatedManifestWarning'
Example #17
def test_list_all_manifests_many(tmpdir):
    """
    Test the extreme case when there are more manifests than list_objects_v2
    can return at a time
    """

    test_bucket_name = 'list_manifest_bucket'

    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=test_bucket_name)

    client = boto3.client('s3', region_name='us-east-1')
    for ii in range(2000):
        client.put_object(Bucket=test_bucket_name,
                          Key=f'proj/manifests/manifest_{ii}.json',
                          Body=b'123456')

    client.put_object(Bucket=test_bucket_name, Key='junk.txt', Body=b'123456')

    cache = S3CloudCache(tmpdir, test_bucket_name, 'proj')

    expected = [f'manifest_{ii}.json' for ii in range(2000)]
    expected.sort()
    assert cache.manifest_file_names == expected
Example #18
def test_corrupted_download_manifest(tmpdir, example_datasets):
    """
    Test that CloudCache can handle the case where the
    JSON record at _downloaded_data_path gets corrupted
    """
    bucket_name = 'manifest_corruption_bucket'
    create_bucket(bucket_name, example_datasets)

    cache_dir = pathlib.Path(tmpdir) / 'cache'
    cache = S3CloudCache(cache_dir, bucket_name, 'project-x')

    version_list = ('1.0.0', '2.0.0', '3.0.0')
    file_id_list = ('1', '2', '3')

    for version in version_list:
        cache.load_manifest(f'project-x_manifest_v{version}.json')
        for file_id in file_id_list:
            cache.download_data(file_id)

    with open(cache._downloaded_data_path, 'rb') as in_file:
        src_data = json.load(in_file)

    # write a corrupted downloaded_data_path
    for k in src_data:
        src_data[k] = ''
    with open(cache._downloaded_data_path, 'w') as out_file:
        out_file.write(json.dumps(src_data, indent=2))

    hasher = hashlib.blake2b()
    hasher.update(b'4567890')
    true_hash = hasher.hexdigest()

    cache.load_manifest('project-x_manifest_v1.0.0.json')
    attr = cache.data_path('2')

    # the assert below passes because the file still exists on disk,
    # so CloudCache does not need to consult _downloaded_data_path
    assert attr['exists']

    # now remove one of the data files
    attr['local_path'].unlink()

    # now that the file has been removed, 'exists' is False
    attr = cache.data_path('2')
    assert not attr['exists']

    # note that f2.txt is identical between v2.0.0 and v1.0.0
    cache.load_manifest('project-x_manifest_v2.0.0.json')
    attr = cache.data_path('2')
    assert not attr['exists']

    # re-download the file
    cache.download_data('2')
    attr = cache.data_path('2')
    downloaded_path = attr['local_path']

    assert attr['exists']
    hasher = hashlib.blake2b()
    with open(attr['local_path'], 'rb') as in_file:
        hasher.update(in_file.read())
    test_hash = hasher.hexdigest()
    assert test_hash == true_hash

    # check that the v1.0.0 version of the file, which should be
    # identical to the v2.0.0 version of the file, is also
    # fixed
    cache.load_manifest('project-x_manifest_v1.0.0.json')
    attr = cache.data_path('2')
    assert attr['exists']
    assert attr['local_path'].resolve() == downloaded_path.resolve()
    assert attr['local_path'].absolute() != downloaded_path.absolute()
Example #19
def test_loading_manifest(tmpdir):
    """
    Test loading manifests with S3CloudCache
    """

    test_bucket_name = 'list_manifest_bucket'

    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=test_bucket_name, ACL='public-read')

    client = boto3.client('s3', region_name='us-east-1')

    manifest_1 = {
        'manifest_version': '1',
        'metadata_file_id_column_name': 'file_id',
        'data_pipeline': 'placeholder',
        'project_name': 'sam-beckett',
        'data_files': {},
        'metadata_files': {
            'a.csv': {
                'url': 'http://www.junk.com',
                'version_id': '1111',
                'file_hash': 'abcde'
            },
            'b.csv': {
                'url': 'http://silly.com',
                'version_id': '2222',
                'file_hash': 'fghijk'
            }
        }
    }

    manifest_2 = {
        'manifest_version': '2',
        'metadata_file_id_column_name': 'file_id',
        'data_pipeline': 'placeholder',
        'project_name': 'al',
        'data_files': {},
        'metadata_files': {
            'c.csv': {
                'url': 'http://www.absurd.com',
                'version_id': '3333',
                'file_hash': 'lmnop'
            },
            'd.csv': {
                'url': 'http://nonsense.com',
                'version_id': '4444',
                'file_hash': 'qrstuv'
            }
        }
    }

    client.put_object(Bucket=test_bucket_name,
                      Key='proj/manifests/manifest_v1.0.0.json',
                      Body=bytes(json.dumps(manifest_1), 'utf-8'))

    client.put_object(Bucket=test_bucket_name,
                      Key='proj/manifests/manifest_v2.0.0.json',
                      Body=bytes(json.dumps(manifest_2), 'utf-8'))

    cache = S3CloudCache(pathlib.Path(tmpdir), test_bucket_name, 'proj')
    assert cache.current_manifest is None
    cache.load_manifest('manifest_v1.0.0.json')
    assert cache._manifest._data == manifest_1
    assert cache.version == '1'
    assert cache.file_id_column == 'file_id'
    assert cache.metadata_file_names == ['a.csv', 'b.csv']
    assert cache.current_manifest == 'manifest_v1.0.0.json'

    cache.load_manifest('manifest_v2.0.0.json')
    assert cache._manifest._data == manifest_2
    assert cache.version == '2'
    assert cache.file_id_column == 'file_id'
    assert cache.metadata_file_names == ['c.csv', 'd.csv']

    with pytest.raises(ValueError) as context:
        cache.load_manifest('manifest_v3.0.0.json')
    msg = 'is not one of the valid manifest names'
    assert msg in context.value.args[0]
Example #20
def test_on_corrupted_files(tmpdir, example_datasets):
    """
    Test that the CloudCache re-downloads files when they have been
    corrupted
    """
    bucket_name = 'corruption_bucket'
    create_bucket(bucket_name, example_datasets)

    cache_dir = pathlib.Path(tmpdir) / 'cache'
    cache = S3CloudCache(cache_dir, bucket_name, 'project-x')

    version_list = ('1.0.0', '2.0.0', '3.0.0')
    file_id_list = ('1', '2', '3')

    for version in version_list:
        cache.load_manifest(f'project-x_manifest_v{version}.json')
        for file_id in file_id_list:
            cache.download_data(file_id)

    # make sure that all files exist
    for version in version_list:
        cache.load_manifest(f'project-x_manifest_v{version}.json')
        for file_id in file_id_list:
            attr = cache.data_path(file_id)
            assert attr['exists']

    hasher = hashlib.blake2b()
    hasher.update(b'4567890')
    true_hash = hasher.hexdigest()

    # Check that, when a file on disk gets removed,
    # all of the symlinks that point back to that file
    # get marked as `not exists`

    cache.load_manifest('project-x_manifest_v1.0.0.json')
    attr = cache.data_path('2')
    attr['local_path'].unlink()

    attr = cache.data_path('2')
    assert not attr['exists']

    # note that f2.txt is identical between v2.0.0 and v1.0.0
    # in the example data set
    cache.load_manifest('project-x_manifest_v2.0.0.json')
    attr = cache.data_path('2')
    assert not attr['exists']

    # re-download one of the identical files, and verify
    # that both datasets are restored
    cache.download_data('2')
    attr = cache.data_path('2')
    assert attr['exists']
    redownloaded_path = attr['local_path']

    cache.load_manifest('project-x_manifest_v1.0.0.json')
    attr = cache.data_path('2')
    assert attr['exists']
    other_path = attr['local_path']

    hasher = hashlib.blake2b()
    with open(other_path, 'rb') as in_file:
        hasher.update(in_file.read())
    assert hasher.hexdigest() == true_hash

    # The file is downloaded to other_path because that was
    # the first path originally downloaded and stored
    # in CloudCache._downloaded_data_path

    assert other_path.resolve() == redownloaded_path.resolve()
    assert other_path.absolute() != redownloaded_path.absolute()
Example #21
def test_download_metadata(tmpdir):
    """
    Test that S3CloudCache.download_metadata() correctly
    downloads files from S3
    """

    hasher = hashlib.blake2b()
    data = b'11235813kjlssergwesvsdd'
    hasher.update(data)
    true_checksum = hasher.hexdigest()

    test_bucket_name = 'bucket_for_download_metadata'
    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=test_bucket_name, ACL='public-read')

    # turn on bucket versioning
    # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#bucketversioning
    bucket_versioning = conn.BucketVersioning(test_bucket_name)
    bucket_versioning.enable()

    client = boto3.client('s3', region_name='us-east-1')
    meta_version = client.put_object(Bucket=test_bucket_name,
                                     Key='metadata_file.csv',
                                     Body=data)["VersionId"]

    response = client.list_object_versions(Bucket=test_bucket_name)
    version_id = response['Versions'][0]['VersionId']

    manifest = {}
    manifest['manifest_version'] = '1'
    manifest['project_name'] = "project4"
    manifest['metadata_file_id_column_name'] = 'file_id'
    url = f'http://{test_bucket_name}.s3.amazonaws.com/project4/metadata_file.csv'  # noqa: E501
    metadata_file = {
        'url': url,
        'version_id': version_id,
        'file_hash': true_checksum
    }

    manifest['metadata_files'] = {'metadata_file.csv': metadata_file}
    manifest['data_pipeline'] = 'placeholder'

    client.put_object(Bucket=test_bucket_name,
                      Key='proj/manifests/manifest_1.json',
                      Body=bytes(json.dumps(manifest), 'utf-8'))

    cache_dir = pathlib.Path(tmpdir) / "metadata/path/cache"
    cache = S3CloudCache(cache_dir, test_bucket_name, 'proj')

    cache.load_manifest('manifest_1.json')

    expected_path = cache_dir / "project4-1" / 'metadata_file.csv'
    assert not expected_path.exists()

    # test that metadata_path also works
    attr = cache.metadata_path('metadata_file.csv')
    assert attr['local_path'] == expected_path
    assert not attr['exists']

    def response_fun(Bucket, Prefix):
        # moto doesn't cover list_object_versions
        return {
            "Versions": [{
                "VersionId": meta_version,
                "Key": "metadata_file.csv",
                "Size": 12
            }]
        }
Example #22
def test_download_file_multiple_versions(tmpdir):
    """
    Test that S3CloudCache._download_file behaves as expected
    when there are multiple versions of the same file in the
    bucket

    (This is really just testing that S3's versioning behaves the
    way we think it does)
    """

    hasher = hashlib.blake2b()
    data_1 = b'11235813kjlssergwesvsdd'
    hasher.update(data_1)
    true_checksum_1 = hasher.hexdigest()

    hasher = hashlib.blake2b()
    data_2 = b'zzzzxxxxyyyywwwwjjjj'
    hasher.update(data_2)
    true_checksum_2 = hasher.hexdigest()

    assert true_checksum_2 != true_checksum_1

    test_bucket_name = 'bucket_for_download_versions'
    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=test_bucket_name, ACL='public-read')

    # turn on bucket versioning
    # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#bucketversioning
    bucket_versioning = conn.BucketVersioning(test_bucket_name)
    bucket_versioning.enable()

    client = boto3.client('s3', region_name='us-east-1')
    client.put_object(Bucket=test_bucket_name,
                      Key='data/data_file.txt',
                      Body=data_1)

    response = client.list_object_versions(Bucket=test_bucket_name)
    version_id_1 = response['Versions'][0]['VersionId']

    client = boto3.client('s3', region_name='us-east-1')
    client.put_object(Bucket=test_bucket_name,
                      Key='data/data_file.txt',
                      Body=data_2)

    response = client.list_object_versions(Bucket=test_bucket_name)
    version_id_2 = None
    for v in response['Versions']:
        if v['IsLatest']:
            version_id_2 = v['VersionId']
    assert version_id_2 is not None
    assert version_id_2 != version_id_1

    cache_dir = pathlib.Path(tmpdir) / 'download/test/cache'
    cache = S3CloudCache(cache_dir, test_bucket_name, 'proj')

    url = f'http://{test_bucket_name}.s3.amazonaws.com/data/data_file.txt'

    # download first version of file
    expected_path = cache_dir / true_checksum_1 / 'data/data_file.txt'

    good_attributes = CacheFileAttributes(url, version_id_1, true_checksum_1,
                                          expected_path)

    assert not expected_path.exists()
    cache._download_file(good_attributes)
    assert expected_path.exists()
    hasher = hashlib.blake2b()
    with open(expected_path, 'rb') as in_file:
        hasher.update(in_file.read())
    assert hasher.hexdigest() == true_checksum_1

    # download second version of file
    expected_path = cache_dir / true_checksum_2 / 'data/data_file.txt'

    good_attributes = CacheFileAttributes(url, version_id_2, true_checksum_2,
                                          expected_path)

    assert not expected_path.exists()
    cache._download_file(good_attributes)
    assert expected_path.exists()
    hasher = hashlib.blake2b()
    with open(expected_path, 'rb') as in_file:
        hasher.update(in_file.read())
    assert hasher.hexdigest() == true_checksum_2
Example #23
def test_reconstruction_of_local_manifest(tmpdir):
    """
    Test that, if _downloaded_data.json gets lost, it can be reconstructed
    so that the CloudCache does not automatically download new copies of files
    """

    # define a cache class that cannot download from S3
    class DummyCache(S3CloudCache):
        def _download_file(self, file_attributes: CacheFileAttributes):
            if not self._file_exists(file_attributes):
                raise RuntimeError("Cannot download files")
            return True

    # first two versions of dataset are identical;
    # third differs
    example_data = {}
    example_data['1.0.0'] = {}
    example_data['1.0.0']['f1.txt'] = {'file_id': '1', 'data': b'abc'}
    example_data['1.0.0']['f2.txt'] = {'file_id': '2', 'data': b'def'}

    example_data['2.0.0'] = {}
    example_data['2.0.0']['f1.txt'] = {'file_id': '1', 'data': b'abc'}
    example_data['2.0.0']['f2.txt'] = {'file_id': '2', 'data': b'def'}

    example_data['3.0.0'] = {}
    example_data['3.0.0']['f1.txt'] = {'file_id': '1', 'data': b'tuv'}
    example_data['3.0.0']['f2.txt'] = {'file_id': '2', 'data': b'wxy'}

    test_bucket_name = 'cache_from_scratch_bucket'
    create_bucket(test_bucket_name, example_data)

    cache_dir = pathlib.Path(tmpdir) / 'cache'

    # read in v1.0.0 data files using normal S3 cache class
    with pytest.warns(None) as warnings:
        cache = S3CloudCache(cache_dir, test_bucket_name, 'project-x')

    # make sure no MissingLocalManifestWarnings were raised
    w_type = 'MissingLocalManifestWarning'
    for w in warnings.list:
        if w._category_name == w_type:
            msg = 'Raised MissingLocalManifestWarning on empty '
            msg += 'cache dir'
            assert False, msg

    expected_hash = {}
    cache.load_manifest('project-x_manifest_v1.0.0.json')
    for file_id in ('1', '2'):
        local_path = cache.download_data(file_id)
        hasher = hashlib.blake2b()
        with open(local_path, 'rb') as in_file:
            hasher.update(in_file.read())
        expected_hash[file_id] = hasher.hexdigest()

    # load the other manifests so that DummyCache can load them later
    cache.load_manifest('project-x_manifest_v2.0.0.json')
    cache.load_manifest('project-x_manifest_v3.0.0.json')

    # delete the JSON file that maps local path to file hash
    lookup_path = cache._downloaded_data_path
    assert lookup_path.exists()
    lookup_path.unlink()
    assert not lookup_path.exists()

    del cache

    # Reload the data using the cache class that cannot download
    # files. Verify that paths to files with the correct hashes
    # are returned. This will mean that the local manifest mapping
    # filename to file hash was correctly reconstructed.
    with pytest.warns(MissingLocalManifestWarning) as warnings:
        dummy = DummyCache(cache_dir, test_bucket_name, 'project-x')

    dummy.construct_local_manifest()

    dummy.load_manifest('project-x_manifest_v2.0.0.json')
    for file_id in ('1', '2'):
        local_path = dummy.download_data(file_id)
        hasher = hashlib.blake2b()
        with open(local_path, 'rb') as in_file:
            hasher.update(in_file.read())
        assert hasher.hexdigest() == expected_hash[file_id]

    # make sure that dummy really is unable to download by trying
    # (and failing) to get data from v3.0.0
    dummy.load_manifest('project-x_manifest_v3.0.0.json')
    with pytest.raises(RuntimeError):
        dummy.download_data('1')
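Outside of pytest, the recovery path this test exercises might look like the sketch below; cache_dir, bucket_name, and project_name are placeholders, and only the behavior demonstrated above (a MissingLocalManifestWarning at construction followed by construct_local_manifest()) is assumed.

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    cache = S3CloudCache(cache_dir, bucket_name, project_name)

if any(issubclass(w.category, MissingLocalManifestWarning) for w in caught):
    # the record of previously downloaded files was lost; rebuild it from
    # the files already on disk so they are not downloaded a second time
    cache.construct_local_manifest()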
Example #24
def test_on_removed_files(tmpdir, example_datasets):
    """
    Test that the CloudCache re-downloads files when the
    files at the root of the symlinks have been removed
    """
    bucket_name = 'corruption_bucket'
    create_bucket(bucket_name, example_datasets)

    cache_dir = pathlib.Path(tmpdir) / 'cache'
    cache = S3CloudCache(cache_dir, bucket_name, 'project-x')

    version_list = ('1.0.0', '2.0.0', '3.0.0')
    file_id_list = ('1', '2', '3')

    for version in version_list:
        cache.load_manifest(f'project-x_manifest_v{version}.json')
        for file_id in file_id_list:
            cache.download_data(file_id)

    # make sure that all files exist
    for version in version_list:
        cache.load_manifest(f'project-x_manifest_v{version}.json')
        for file_id in file_id_list:
            attr = cache.data_path(file_id)
            assert attr['exists']

    hasher = hashlib.blake2b()
    hasher.update(b'4567890')
    true_hash = hasher.hexdigest()

    p1 = cache_dir / 'project-x-1.0.0' / 'data' / 'f2.txt'
    p2 = cache_dir / 'project-x-2.0.0' / 'data' / 'f2.txt'

    # note that f2.txt is identical between v 1.0.0 and 2.0.0
    assert p1.is_file()
    assert not p1.is_symlink()
    assert p2.is_symlink()
    assert p1.resolve() == p2.resolve()

    # remove p1
    p1.unlink()
    assert not p1.exists()
    assert not p1.is_file()
    assert not p2.is_file()
    assert p2.is_symlink()

    # make sure that the file which has been removed is now
    # marked as not existing
    cache.load_manifest('project-x_manifest_v1.0.0.json')
    test_path = cache.data_path('2')
    assert not test_path['exists']

    cache.load_manifest('project-x_manifest_v2.0.0.json')
    test_path = cache.data_path('2')
    assert not test_path['exists']

    # now, re-download the data by way of manifest 2
    # and verify that the symlink relationship is
    # re-established
    p2 = cache.download_data('2')
    assert p2.is_file()
    assert p2.is_symlink()  # because the symlink was not removed

    cache.load_manifest('project-x_manifest_v1.0.0.json')
    p1 = cache.download_data('2')

    assert p1.is_file()
    assert not p1.is_symlink()
    assert p1.resolve() == p2.resolve()
    assert p1.absolute() != p2.absolute()

    hasher = hashlib.blake2b()
    with open(p2, 'rb') as in_file:
        hasher.update(in_file.read())
    assert hasher.hexdigest() == true_hash
Example #25
def test_full_cache_system(tmpdir):
    """
    Test the process of loading different versions of the same dataset,
    each of which involves different versions of files
    """

    test_bucket_name = 'full_cache_bucket'

    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=test_bucket_name, ACL='public-read')

    # turn on bucket versioning
    # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#bucketversioning
    bucket_versioning = conn.BucketVersioning(test_bucket_name)
    bucket_versioning.enable()

    s3_client = boto3.client('s3', region_name='us-east-1')

    # generate data and expected hashes

    true_hashes = {}
    version_id_lookup = {}

    data1_v1 = b'12345678'
    data1_v2 = b'45678901'
    data2_v1 = b'abcdefghijk'
    data2_v2 = b'lmnopqrstuv'
    data3_v1 = b'jklmnopqrst'

    metadata1_v1 = pd.DataFrame({'mouse': [1, 2, 3], 'sex': ['F', 'F', 'M']})

    metadata2_v1 = pd.DataFrame({
        'experiment': [5, 6, 7],
        'file_id': ['data1', 'data2', 'data3']
    })

    metadata1_v2 = pd.DataFrame({'mouse': [8, 9, 0], 'sex': ['M', 'F', 'M']})

    v1_hashes = {}
    for data, key in zip((data1_v1, data2_v1, data3_v1),
                         ('data1', 'data2', 'data3')):

        hasher = hashlib.blake2b()
        hasher.update(data)
        v1_hashes[key] = hasher.hexdigest()
        s3_client.put_object(Bucket=test_bucket_name,
                             Key=f'proj/data/{key}',
                             Body=data)

    for df, key in zip((metadata1_v1, metadata2_v1),
                       ('proj/metadata1.csv', 'proj/metadata2.csv')):

        with io.StringIO() as stream:
            df.to_csv(stream, index=False)
            stream.seek(0)
            data = bytes(stream.read(), 'utf-8')

        hasher = hashlib.blake2b()
        hasher.update(data)
        v1_hashes[key.replace('proj/', '')] = hasher.hexdigest()
        s3_client.put_object(Bucket=test_bucket_name, Key=key, Body=data)

    true_hashes['v1'] = v1_hashes
    v1_version_id = {}
    response = s3_client.list_object_versions(Bucket=test_bucket_name)
    for v in response['Versions']:
        vkey = v['Key'].replace('proj/', '').replace('data/', '')
        v1_version_id[vkey] = v['VersionId']

    version_id_lookup['v1'] = v1_version_id

    v2_hashes = {}
    v2_version_id = {}
    for data, key in zip((data1_v2, data2_v2), ('data1', 'data2')):

        hasher = hashlib.blake2b()
        hasher.update(data)
        v2_hashes[key] = hasher.hexdigest()
        s3_client.put_object(Bucket=test_bucket_name,
                             Key=f'proj/data/{key}',
                             Body=data)

    s3_client.delete_object(Bucket=test_bucket_name, Key='proj/data/data3')

    with io.StringIO() as stream:
        metadata1_v2.to_csv(stream, index=False)
        stream.seek(0)
        data = bytes(stream.read(), 'utf-8')

    hasher = hashlib.blake2b()
    hasher.update(data)
    v2_hashes['metadata1.csv'] = hasher.hexdigest()
    s3_client.put_object(Bucket=test_bucket_name,
                         Key='proj/metadata1.csv',
                         Body=data)

    s3_client.delete_object(Bucket=test_bucket_name, Key='proj/metadata2.csv')

    true_hashes['v2'] = v2_hashes
    v2_version_id = {}
    response = s3_client.list_object_versions(Bucket=test_bucket_name)
    for v in response['Versions']:
        if not v['IsLatest']:
            continue
        vkey = v['Key'].replace('proj/', '').replace('data/', '')
        v2_version_id[vkey] = v['VersionId']
    version_id_lookup['v2'] = v2_version_id

    # check that data3 and metadata2.csv do not occur in v2 of
    # the dataset, but other data/metadata files do

    assert 'data3' in version_id_lookup['v1']
    assert 'data3' not in version_id_lookup['v2']
    assert 'data1' in version_id_lookup['v1']
    assert 'data2' in version_id_lookup['v1']
    assert 'data1' in version_id_lookup['v2']
    assert 'data2' in version_id_lookup['v2']
    assert 'metadata1.csv' in version_id_lookup['v1']
    assert 'metadata2.csv' in version_id_lookup['v1']
    assert 'metadata1.csv' in version_id_lookup['v2']
    assert 'metadata2.csv' not in version_id_lookup['v2']

    # build manifests

    manifest_1 = {}
    manifest_1['manifest_version'] = 'A'
    manifest_1['project_name'] = "project-A1"
    manifest_1['metadata_file_id_column_name'] = 'file_id'
    manifest_1['data_pipeline'] = 'placeholder'
    data_files_1 = {}
    for k in ('data1', 'data2', 'data3'):
        obj = {}
        obj['url'] = f'http://{test_bucket_name}.s3.amazonaws.com/proj/data/{k}'  # noqa: E501
        obj['file_hash'] = true_hashes['v1'][k]
        obj['version_id'] = version_id_lookup['v1'][k]
        data_files_1[k] = obj
    manifest_1['data_files'] = data_files_1
    metadata_files_1 = {}
    for k in ('metadata1.csv', 'metadata2.csv'):
        obj = {}
        obj['url'] = f'http://{test_bucket_name}.s3.amazonaws.com/proj/{k}'
        obj['file_hash'] = true_hashes['v1'][k]
        obj['version_id'] = version_id_lookup['v1'][k]
        metadata_files_1[k] = obj
    manifest_1['metadata_files'] = metadata_files_1

    manifest_2 = {}
    manifest_2['manifest_version'] = 'B'
    manifest_2['project_name'] = "project-B2"
    manifest_2['metadata_file_id_column_name'] = 'file_id'
    manifest_2['data_pipeline'] = 'placeholder'
    data_files_2 = {}
    for k in ('data1', 'data2'):
        obj = {}
        obj['url'] = f'http://{test_bucket_name}.s3.amazonaws.com/proj/data/{k}'  # noqa: E501
        obj['file_hash'] = true_hashes['v2'][k]
        obj['version_id'] = version_id_lookup['v2'][k]
        data_files_2[k] = obj
    manifest_2['data_files'] = data_files_2
    metadata_files_2 = {}
    for k in ['metadata1.csv']:
        obj = {}
        obj['url'] = f'http://{test_bucket_name}.s3.amazonaws.com/proj/{k}'
        obj['file_hash'] = true_hashes['v2'][k]
        obj['version_id'] = version_id_lookup['v2'][k]
        metadata_files_2[k] = obj
    manifest_2['metadata_files'] = metadata_files_2

    s3_client.put_object(Bucket=test_bucket_name,
                         Key='proj/manifests/manifest_v1.0.0.json',
                         Body=bytes(json.dumps(manifest_1), 'utf-8'))

    s3_client.put_object(Bucket=test_bucket_name,
                         Key='proj/manifests/manifest_v2.0.0.json',
                         Body=bytes(json.dumps(manifest_2), 'utf-8'))

    # Use S3CloudCache to interact with dataset
    cache_dir = pathlib.Path(tmpdir) / 'my/test/cache'
    cache = S3CloudCache(cache_dir, test_bucket_name, 'proj')

    # load the first version of the dataset

    cache.load_manifest('manifest_v1.0.0.json')
    assert cache.version == 'A'

    # check that metadata dataframes have expected contents
    m1 = cache.get_metadata('metadata1.csv')
    assert metadata1_v1.equals(m1)
    m2 = cache.get_metadata('metadata2.csv')
    assert metadata2_v1.equals(m2)

    # check that data files have expected hashes
    for k in ('data1', 'data2', 'data3'):

        attr = cache.data_path(k)
        assert not attr['exists']

        local_path = cache.download_data(k)
        assert local_path.exists()
        hasher = hashlib.blake2b()
        with open(local_path, 'rb') as in_file:
            hasher.update(in_file.read())
        assert hasher.hexdigest() == true_hashes['v1'][k]

        attr = cache.data_path(k)
        assert attr['exists']

    # now load the second version of the dataset

    cache.load_manifest('manifest_v2.0.0.json')
    assert cache.version == 'B'

    # metadata2.csv should not exist in this version of the dataset
    with pytest.raises(ValueError) as context:
        cache.get_metadata('metadata2.csv')
    assert 'is not in self.metadata_file_names' in context.value.args[0]

    # check that metadata1 has expected contents
    m1 = cache.get_metadata('metadata1.csv')
    assert metadata1_v2.equals(m1)

    # data3 should not exist in this version of the dataset
    with pytest.raises(ValueError) as context:
        _ = cache.download_data('data3')
    assert 'not a data file listed' in context.value.args[0]

    with pytest.raises(ValueError) as context:
        _ = cache.data_path('data3')
    assert 'not a data file listed' in context.value.args[0]

    # check that data1, data2 have expected hashes
    for k in ('data1', 'data2'):
        attr = cache.data_path(k)
        assert not attr['exists']

        local_path = cache.download_data(k)
        assert local_path.exists()
        hasher = hashlib.blake2b()
        with open(local_path, 'rb') as in_file:
            hasher.update(in_file.read())
        assert hasher.hexdigest() == true_hashes['v2'][k]

        attr = cache.data_path(k)
        assert attr['exists']
Example #26
def test_summarize_comparison(tmpdir, example_datasets_with_metadata):
    """
    Test that CloudCacheBase.summarize_comparison reports the correct
    changes when comparing two manifests
    """
    bucket_name = 'summarizing_bucket'
    create_bucket(bucket_name,
                  example_datasets_with_metadata['data'],
                  metadatasets=example_datasets_with_metadata['metadata'])

    cache_dir = pathlib.Path(tmpdir) / 'cache'
    cache = S3CloudCache(cache_dir, bucket_name, 'project-x')

    log = cache.summarize_comparison('project-x_manifest_v1.0.0.json',
                                     'project-x_manifest_v2.0.0.json')

    assert len(log['metadata_changes']) == 0
    assert len(log['data_changes']) == 1
    assert ('data/f2.txt', 'data/f2.txt deleted') in log['data_changes']

    log = cache.summarize_comparison('project-x_manifest_v1.0.0.json',
                                     'project-x_manifest_v3.0.0.json')

    assert len(log['metadata_changes']) == 0
    assert len(log['data_changes']) == 1
    assert ('data/f2.txt',
            'data/f2.txt renamed data/f4.txt') in log['data_changes']

    log = cache.summarize_comparison('project-x_manifest_v1.0.0.json',
                                     'project-x_manifest_v4.0.0.json')

    assert len(log['metadata_changes']) == 0
    assert len(log['data_changes']) == 1
    assert ('data/f3.txt', 'data/f3.txt changed') in log['data_changes']

    log = cache.summarize_comparison('project-x_manifest_v1.0.0.json',
                                     'project-x_manifest_v5.0.0.json')

    assert len(log['metadata_changes']) == 0
    assert len(log['data_changes']) == 1
    assert ('data/f4.txt', 'data/f4.txt created') in log['data_changes']

    log = cache.summarize_comparison('project-x_manifest_v1.0.0.json',
                                     'project-x_manifest_v6.0.0.json')

    assert len(log['metadata_changes']) == 0
    assert len(log['data_changes']) == 2
    assert ('data/f2.txt', 'data/f2.txt deleted') in log['data_changes']
    assert ('data/f1.txt', 'data/f1.txt changed') in log['data_changes']

    log = cache.summarize_comparison('project-x_manifest_v1.0.0.json',
                                     'project-x_manifest_v7.0.0.json')

    assert len(log['metadata_changes']) == 0
    assert len(log['data_changes']) == 2
    assert ('data/f2.txt', 'data/f2.txt deleted') in log['data_changes']
    assert ('data/f3.txt', 'data/f3.txt '
            'renamed data/f5.txt') in log['data_changes']

    log = cache.summarize_comparison('project-x_manifest_v1.0.0.json',
                                     'project-x_manifest_v8.0.0.json')

    assert len(log['metadata_changes']) == 0
    assert len(log['data_changes']) == 2
    assert ('data/f2.txt', 'data/f2.txt deleted') in log['data_changes']
    assert ('data/f5.txt', 'data/f5.txt created') in log['data_changes']

    log = cache.summarize_comparison('project-x_manifest_v1.0.0.json',
                                     'project-x_manifest_v9.0.0.json')

    assert len(log['metadata_changes']) == 0
    assert len(log['data_changes']) == 2
    assert ('data/f3.txt', 'data/f3.txt renamed '
            'data/f4.txt') in log['data_changes']
    assert ('data/f5.txt', 'data/f5.txt created') in log['data_changes']

    log = cache.summarize_comparison('project-x_manifest_v1.0.0.json',
                                     'project-x_manifest_v10.0.0.json')

    assert len(log['data_changes']) == 0
    assert len(log['metadata_changes']) == 1
    assert ('project_metadata/metadata_2.csv',
            'project_metadata/metadata_2.csv '
            'deleted') in log['metadata_changes']

    log = cache.summarize_comparison('project-x_manifest_v1.0.0.json',
                                     'project-x_manifest_v11.0.0.json')

    assert len(log['data_changes']) == 0
    assert len(log['metadata_changes']) == 1
    assert ('project_metadata/metadata_2.csv',
            'project_metadata/metadata_2.csv renamed '
            'project_metadata/metadata_4.csv') in log['metadata_changes']

    log = cache.summarize_comparison('project-x_manifest_v1.0.0.json',
                                     'project-x_manifest_v12.0.0.json')

    assert len(log['data_changes']) == 0
    assert len(log['metadata_changes']) == 1
    assert ('project_metadata/metadata_3.csv',
            'project_metadata/metadata_3.csv '
            'changed') in log['metadata_changes']

    log = cache.summarize_comparison('project-x_manifest_v1.0.0.json',
                                     'project-x_manifest_v13.0.0.json')

    assert len(log['data_changes']) == 0
    assert len(log['metadata_changes']) == 1
    assert ('project_metadata/metadata_4.csv',
            'project_metadata/metadata_4.csv '
            'created') in log['metadata_changes']

    log = cache.summarize_comparison('project-x_manifest_v1.0.0.json',
                                     'project-x_manifest_v14.0.0.json')
    assert len(log['data_changes']) == 1
    assert len(log['metadata_changes']) == 1
    assert ('data/f2.txt', 'data/f2.txt deleted') in log['data_changes']
    assert ('project_metadata/metadata_3.csv',
            'project_metadata/metadata_3.csv renamed '
            'project_metadata/metadata_4.csv') in log['metadata_changes']

    log = cache.summarize_comparison('project-x_manifest_v1.0.0.json',
                                     'project-x_manifest_v15.0.0.json')
    assert len(log['data_changes']) == 3
    assert len(log['metadata_changes']) == 3

    ans1 = ('data/f1.txt', 'data/f1.txt renamed data/f4.txt')
    ans2 = ('data/f5.txt', 'data/f5.txt created')
    ans3 = ('data/f6.txt', 'data/f6.txt created')

    assert set(log['data_changes']) == {ans1, ans2, ans3}

    ans1 = ('project_metadata/metadata_2.csv',
            'project_metadata/metadata_2.csv renamed '
            'project_metadata/metadata_4.csv')
    ans2 = ('project_metadata/metadata_1.csv',
            'project_metadata/metadata_1.csv deleted')
    ans3 = ('project_metadata/metadata_3.csv',
            'project_metadata/metadata_3.csv deleted')

    assert set(log['metadata_changes']) == {ans1, ans2, ans3}
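Given the structure asserted above (a dict with 'data_changes' and 'metadata_changes', each holding (file_name, description) tuples), a small hypothetical reporting helper could render the log for a user.

def print_change_log(log):
    # Hypothetical helper: pretty-print the dict returned by
    # CloudCacheBase.summarize_comparison, as asserted in the test above.
    for section in ('data_changes', 'metadata_changes'):
        print(f'{section}:')
        for _file_name, description in sorted(log[section]):
            print(f'    {description}')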
Example #27
def test_smart_file_downloading(tmpdir, example_datasets):
    """
    Test that the CloudCache is smart enough to build symlinks
    where possible
    """
    test_bucket_name = 'smart_download_bucket'
    create_bucket(test_bucket_name, example_datasets)

    cache_dir = pathlib.Path(tmpdir) / 'cache'
    cache = S3CloudCache(cache_dir, test_bucket_name, 'project-x')

    # download all data files from all versions, keeping track
    # of the paths to the downloaded data files
    downloaded = {}
    for version in ('1.0.0', '2.0.0', '3.0.0'):
        downloaded[version] = {}
        cache.load_manifest(f'project-x_manifest_v{version}.json')
        for file_id in ('1', '2', '3'):
            downloaded[version][file_id] = cache.download_data(file_id)

    # check that the version 1.0.0 of all files are actual files
    for file_id in ('1', '2', '3'):
        assert downloaded['1.0.0'][file_id].is_file()
        assert not downloaded['1.0.0'][file_id].is_symlink()

    # check that v2.0.0 f1.txt is a new file
    assert downloaded['2.0.0']['1'].is_file()
    assert not downloaded['2.0.0']['1'].is_symlink()

    # check that v2.0.0 f2.txt and f3.txt are symlinks to
    # the correct v1.0.0 files
    for file_id in ('2', '3'):
        assert downloaded['2.0.0'][file_id].is_file()
        assert downloaded['2.0.0'][file_id].is_symlink()

        # check that symlink points to the correct file
        test = downloaded['2.0.0'][file_id].resolve()
        control = downloaded['1.0.0'][file_id].resolve()
        if test != control:
            test = downloaded['2.0.0'][file_id].resolve()
            control = downloaded['1.0.0'][file_id].resolve()
            raise RuntimeError(f'{test} != {control}\n'
                               'even though the first is a symlink')

        # check that the absolute paths of the files are different,
        # even though one is a symlink
        test = downloaded['2.0.0'][file_id].absolute()
        control = downloaded['1.0.0'][file_id].absolute()
        if test == control:
            test = downloaded['2.0.0'][file_id].absolute()
            control = downloaded['1.0.0'][file_id].absolute()
            raise RuntimeError(f'{test} == {control}\n'
                               'even though they should be '
                               'different absolute paths')

    # repeat the above tests for v3.0.0, f1.txt
    assert downloaded['3.0.0']['1'].is_file()
    assert downloaded['3.0.0']['1'].is_symlink()

    res3 = downloaded['3.0.0']['1'].resolve()
    res1 = downloaded['1.0.0']['1'].resolve()
    if res3 != res1:
        test = downloaded['3.0.0']['1'].resolve()
        control = downloaded['1.0.0']['1'].resolve()
        raise RuntimeError(f'{test} != {control}\n'
                           'even though the first is a symlink')

    abs3 = downloaded['3.0.0']['1'].absolute()
    abs1 = downloaded['1.0.0']['1'].absolute()
    if abs3 == abs1:
        test = downloaded['3.0.0']['1'].absolute()
        control = downloaded['1.0.0']['1'].absolute()
        raise RuntimeError(f'{test} == {control}\n'
                           'even though they should be '
                           'different absolute paths')

    # check that v3.0.0 f2.txt and f3.txt are not symlinks
    assert downloaded['3.0.0']['2'].is_file()
    assert not downloaded['3.0.0']['2'].is_symlink()
    assert downloaded['3.0.0']['3'].is_file()
    assert not downloaded['3.0.0']['3'].is_symlink()