Example 1
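These examples are excerpted from a pytest suite and omit their shared setup. A minimal sketch of the imports they rely on follows; the module name in the final import is hypothetical, and the S3-backed tests (which create buckets without real credentials) presumably run under a mocked S3 endpoint such as moto's mock_s3 decorator, an assumption not visible in the excerpts:

import hashlib
import pathlib
import platform

import boto3
import pytest
from moto import mock_s3  # assumed: lets the tests create S3 buckets locally

# hypothetical module name; replace with wherever these classes actually live
from cloud_cache import CacheFileAttributes, S3CloudCache
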
def test_cache_file_attributes():
    attr = CacheFileAttributes(url='http://my/url',
                               version_id='aaabbb',
                               file_hash='12345',
                               local_path=pathlib.Path('/my/local/path'))

    assert attr.url == 'http://my/url'
    assert attr.version_id == 'aaabbb'
    assert attr.file_hash == '12345'
    assert attr.local_path == pathlib.Path('/my/local/path')

    # test that the correct ValueErrors are raised
    # when you pass invalid arguments

    with pytest.raises(ValueError) as context:
        attr = CacheFileAttributes(url=5.0,
                                   version_id='aaabbb',
                                   file_hash='12345',
                                   local_path=pathlib.Path('/my/local/path'))

    msg = "url must be str; got <class 'float'>"
    assert context.value.args[0] == msg

    with pytest.raises(ValueError) as context:
        attr = CacheFileAttributes(url='http://my/url/',
                                   version_id=5.0,
                                   file_hash='12345',
                                   local_path=pathlib.Path('/my/local/path'))

    msg = "version_id must be str; got <class 'float'>"
    assert context.value.args[0] == msg

    with pytest.raises(ValueError) as context:
        attr = CacheFileAttributes(url='http://my/url/',
                                   version_id='aaabbb',
                                   file_hash=5.0,
                                   local_path=pathlib.Path('/my/local/path'))

    msg = "file_hash must be str; got <class 'float'>"
    assert context.value.args[0] == msg

    with pytest.raises(ValueError) as context:
        attr = CacheFileAttributes(url='http://my/url/',
                                   version_id='aaabbb',
                                   file_hash='12345',
                                   local_path='/my/local/path')

    msg = "local_path must be pathlib.Path; got <class 'str'>"
    assert context.value.args[0] == msg
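
A minimal sketch of a class that would satisfy the assertions above, assuming plain attribute storage with type validation in __init__; every error message is taken from the test itself, but this is an illustration, not the actual CacheFileAttributes implementation:

class CacheFileAttributes:
    """Sketch only: validate argument types exactly as the test expects."""

    def __init__(self, url, version_id, file_hash, local_path):
        if not isinstance(url, str):
            raise ValueError(f"url must be str; got {type(url)}")
        if not isinstance(version_id, str):
            raise ValueError(f"version_id must be str; got {type(version_id)}")
        if not isinstance(file_hash, str):
            raise ValueError(f"file_hash must be str; got {type(file_hash)}")
        if not isinstance(local_path, pathlib.Path):
            raise ValueError(
                f"local_path must be pathlib.Path; got {type(local_path)}")
        self.url = url
        self.version_id = version_id
        self.file_hash = file_hash
        self.local_path = local_path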
Example 2
def test_file_exists(tmpdir):
    """
    Test that cache._file_exists behaves correctly
    """

    data = b'aakderasjklsafetss77123523asf'
    hasher = hashlib.blake2b()
    hasher.update(data)
    true_checksum = hasher.hexdigest()
    test_file_path = pathlib.Path(tmpdir) / 'junk.txt'
    with open(test_file_path, 'wb') as out_file:
        out_file.write(data)

    # need to populate a bucket in order for
    # S3CloudCache to be instantiated
    test_bucket_name = 'silly_bucket'
    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=test_bucket_name, ACL='public-read')

    cache = S3CloudCache(tmpdir, test_bucket_name, 'proj')

    # file present with the correct checksum: should be True
    good_attribute = CacheFileAttributes('http://silly.url.com', '12345',
                                         true_checksum, test_file_path)
    assert cache._file_exists(good_attribute)

    # test when checksum is wrong
    bad_attribute = CacheFileAttributes('http://silly.url.com', '12345',
                                        'probably_not_the_checksum',
                                        test_file_path)
    assert not cache._file_exists(bad_attribute)

    # test when file path is wrong
    bad_path = pathlib.Path('definitely/not/a/file.txt')
    bad_attribute = CacheFileAttributes('http://silly.url.com', '12345',
                                        true_checksum, bad_path)

    assert not cache._file_exists(bad_attribute)

    # test when path exists but is not a file
    bad_attribute = CacheFileAttributes('http://silly.url.com', '12345',
                                        true_checksum, pathlib.Path(tmpdir))
    with pytest.raises(RuntimeError) as context:
        cache._file_exists(bad_attribute)
    assert 'but is not a file' in context.value.args[0]
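
The cases exercised above (matching hash, mismatched hash, missing path, and a path that is not a file) suggest _file_exists logic along the following lines; a sketch only, assuming the hash is recomputed with blake2b as the test does:

    def _file_exists(self, file_attributes) -> bool:
        """Sketch: True only if local_path is a file whose hash matches."""
        local_path = file_attributes.local_path
        if not local_path.exists():
            return False
        if not local_path.is_file():
            raise RuntimeError(f'{local_path} exists but is not a file')
        hasher = hashlib.blake2b()
        with open(local_path, 'rb') as in_file:
            hasher.update(in_file.read())
        return hasher.hexdigest() == file_attributes.file_hash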
Example 3
def test_re_download_file(tmpdir):
    """
    Test that S3CloudCache._download_file will re-download a file
    when it has been removed from the local system
    """

    hasher = hashlib.blake2b()
    data = b'11235813kjlssergwesvsdd'
    hasher.update(data)
    true_checksum = hasher.hexdigest()

    test_bucket_name = 'bucket_for_re_download'
    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=test_bucket_name, ACL='public-read')

    # turn on bucket versioning
    # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#bucketversioning
    bucket_versioning = conn.BucketVersioning(test_bucket_name)
    bucket_versioning.enable()

    client = boto3.client('s3', region_name='us-east-1')
    client.put_object(Bucket=test_bucket_name,
                      Key='data/data_file.txt',
                      Body=data)

    response = client.list_object_versions(Bucket=test_bucket_name)
    version_id = response['Versions'][0]['VersionId']

    cache_dir = pathlib.Path(tmpdir) / 'download/test/cache'
    cache = S3CloudCache(cache_dir, test_bucket_name, 'proj')

    expected_path = cache_dir / true_checksum / 'data/data_file.txt'

    url = f'http://{test_bucket_name}.s3.amazonaws.com/data/data_file.txt'
    good_attributes = CacheFileAttributes(url, version_id, true_checksum,
                                          expected_path)

    assert not expected_path.exists()
    cache._download_file(good_attributes)
    assert expected_path.exists()
    hasher = hashlib.blake2b()
    with open(expected_path, 'rb') as in_file:
        hasher.update(in_file.read())
    assert hasher.hexdigest() == true_checksum

    # now, remove the file, and see if it gets re-downloaded
    expected_path.unlink()
    assert not expected_path.exists()

    cache._download_file(good_attributes)
    assert expected_path.exists()
    hasher = hashlib.blake2b()
    with open(expected_path, 'rb') as in_file:
        hasher.update(in_file.read())
    assert hasher.hexdigest() == true_checksum
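
The re-download behavior implies that _download_file consults _file_exists and fetches from S3 only when the local copy is absent or invalid. A sketch of that logic; the _s3_client attribute and the bucket_and_key_from_url helper are assumptions, not names from the source:

    def _download_file(self, file_attributes) -> None:
        """Sketch: download only when the local file is missing or stale."""
        if self._file_exists(file_attributes):
            return
        local_path = file_attributes.local_path
        local_path.parent.mkdir(parents=True, exist_ok=True)
        # hypothetical helper that splits the URL into bucket and key
        bucket, key = bucket_and_key_from_url(file_attributes.url)
        self._s3_client.download_file(
            Bucket=bucket,
            Key=key,
            Filename=str(local_path),
            ExtraArgs={'VersionId': file_attributes.version_id})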
Example 4
def test_str():
    """
    Test the string representation of CacheFileAttributes
    (which, per the assertions below, renders under the name
    CacheFileParameters)
    """
    attr = CacheFileAttributes(url='http://my/url',
                               version_id='aaabbb',
                               file_hash='12345',
                               local_path=pathlib.Path('/my/local/path'))

    s = f'{attr}'
    assert "CacheFileParameters{" in s
    assert '"file_hash": "12345"' in s
    assert '"url": "http://my/url"' in s
    assert '"version_id": "aaabbb"' in s
    if platform.system().lower() != 'windows':
        assert '"local_path": "/my/local/path"' in s
Example 5
    def _create_file_attributes(self, remote_path: str, version_id: str,
                                file_hash: str) -> CacheFileAttributes:
        """
        Create the cache_file_attributes describing a file.
        This method does the work of assigning a local_path for a remote file.

        Parameters
        ----------
        remote_path: str
            The full URL to a file
        version_id: str
            The string specifying the version of the file
        file_hash: str
            The (hexadecimal) file hash of the file

        Returns
        -------
        CacheFileAttributes
        """

        if self._use_static_project_dir:
            # Support only one version of the project on disk,
            # e.g. when mounting the project S3 bucket as a file system
            project_dir_name = f"{self._project_name}"
        else:
            # If we want to support multiple versions of the project on disk
            # paths should be built like:
            # {cache_dir} / {project_name}-{manifest_version} / relative_path
            # Example:
            # my_cache_dir/visual-behavior-ophys-1.0.0/behavior_sessions/etc...
            project_dir_name = f"{self._project_name}-{self._version}"

        project_dir = self._cache_dir / project_dir_name

        # By convention, the data release tool prefixes every relative_path
        # from remote with the project name; strip that leading component
        # here, since project_dir_name already encodes it
        relative_path = relative_path_from_url(remote_path)
        shaved_rel_path = "/".join(relative_path.split("/")[1:])

        local_path = project_dir / shaved_rel_path

        obj = CacheFileAttributes(remote_path, version_id, file_hash,
                                  local_path)

        return obj
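
relative_path_from_url is not shown in these excerpts; a plausible sketch, assuming the URL's path component maps directly onto the remote relative path:

from urllib.parse import urlparse

def relative_path_from_url(url: str) -> str:
    """Sketch: 'http://host.amazonaws.com/proj/data/f.txt' -> 'proj/data/f.txt'."""
    return urlparse(url).path.lstrip('/')

With that, shaved_rel_path drops the leading project-name component ('proj/data/f.txt' becomes 'data/f.txt') before it is joined onto project_dir.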
Example 6
def test_download_file_multiple_versions(tmpdir):
    """
    Test that S3CloudCache._download_file behaves as expected
    when there are multiple versions of the same file in the
    bucket

    (This is really just testing that S3's versioning behaves the
    way we think it does)
    """

    hasher = hashlib.blake2b()
    data_1 = b'11235813kjlssergwesvsdd'
    hasher.update(data_1)
    true_checksum_1 = hasher.hexdigest()

    hasher = hashlib.blake2b()
    data_2 = b'zzzzxxxxyyyywwwwjjjj'
    hasher.update(data_2)
    true_checksum_2 = hasher.hexdigest()

    assert true_checksum_2 != true_checksum_1

    test_bucket_name = 'bucket_for_download_versions'
    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=test_bucket_name, ACL='public-read')

    # turn on bucket versioning
    # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#bucketversioning
    bucket_versioning = conn.BucketVersioning(test_bucket_name)
    bucket_versioning.enable()

    client = boto3.client('s3', region_name='us-east-1')
    client.put_object(Bucket=test_bucket_name,
                      Key='data/data_file.txt',
                      Body=data_1)

    response = client.list_object_versions(Bucket=test_bucket_name)
    version_id_1 = response['Versions'][0]['VersionId']

    client = boto3.client('s3', region_name='us-east-1')
    client.put_object(Bucket=test_bucket_name,
                      Key='data/data_file.txt',
                      Body=data_2)

    response = client.list_object_versions(Bucket=test_bucket_name)
    version_id_2 = None
    for v in response['Versions']:
        if v['IsLatest']:
            version_id_2 = v['VersionId']
    assert version_id_2 is not None
    assert version_id_2 != version_id_1

    cache_dir = pathlib.Path(tmpdir) / 'download/test/cache'
    cache = S3CloudCache(cache_dir, test_bucket_name, 'proj')

    url = f'http://{test_bucket_name}.s3.amazonaws.com/data/data_file.txt'

    # download first version of file
    expected_path = cache_dir / true_checksum_1 / 'data/data_file.txt'

    good_attributes = CacheFileAttributes(url, version_id_1, true_checksum_1,
                                          expected_path)

    assert not expected_path.exists()
    cache._download_file(good_attributes)
    assert expected_path.exists()
    hasher = hashlib.blake2b()
    with open(expected_path, 'rb') as in_file:
        hasher.update(in_file.read())
    assert hasher.hexdigest() == true_checksum_1

    # download second version of file
    expected_path = cache_dir / true_checksum_2 / 'data/data_file.txt'

    good_attributes = CacheFileAttributes(url, version_id_2, true_checksum_2,
                                          expected_path)

    assert not expected_path.exists()
    cache._download_file(good_attributes)
    assert expected_path.exists()
    hasher = hashlib.blake2b()
    with open(expected_path, 'rb') as in_file:
        hasher.update(in_file.read())
    assert hasher.hexdigest() == true_checksum_2
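
To confirm the version pinning at the boto3 level, the same mocked bucket also supports fetching each version explicitly; a short sketch continuing from the setup above:

# fetch each pinned version directly and confirm the bodies differ
for v_id, expected_data in [(version_id_1, data_1), (version_id_2, data_2)]:
    response = client.get_object(Bucket=test_bucket_name,
                                 Key='data/data_file.txt',
                                 VersionId=v_id)
    assert response['Body'].read() == expected_data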