def test_cache_file_attributes():
    """
    Check that CacheFileAttributes stores its constructor arguments and
    raises ValueError (with the expected message) on wrongly-typed input.
    """
    attr = CacheFileAttributes(url='http://my/url',
                               version_id='aaabbb',
                               file_hash='12345',
                               local_path=pathlib.Path('/my/local/path'))
    assert attr.url == 'http://my/url'
    assert attr.version_id == 'aaabbb'
    assert attr.file_hash == '12345'
    assert attr.local_path == pathlib.Path('/my/local/path')

    # test that the correct ValueErrors are raised
    # when you pass invalid arguments
    valid_kwargs = {'url': 'http://my/url/',
                    'version_id': 'aaabbb',
                    'file_hash': '12345',
                    'local_path': pathlib.Path('/my/local/path')}

    # each case: one invalid override and the error message it must produce
    bad_cases = [
        ({'url': 5.0},
         "url must be str; got <class 'float'>"),
        ({'version_id': 5.0},
         "version_id must be str; got <class 'float'>"),
        ({'file_hash': 5.0},
         "file_hash must be str; got <class 'float'>"),
        ({'local_path': '/my/local/path'},
         "local_path must be pathlib.Path; got <class 'str'>"),
    ]

    for override, msg in bad_cases:
        kwargs = dict(valid_kwargs, **override)
        with pytest.raises(ValueError) as context:
            CacheFileAttributes(**kwargs)
        assert context.value.args[0] == msg
def test_file_exists(tmpdir):
    """
    Test that cache._file_exists behaves correctly
    """
    payload = b'aakderasjklsafetss77123523asf'
    digest = hashlib.blake2b()
    digest.update(payload)
    true_checksum = digest.hexdigest()

    local_file = pathlib.Path(tmpdir) / 'junk.txt'
    with open(local_file, 'wb') as out_file:
        out_file.write(payload)

    # S3CloudCache cannot be instantiated until the bucket exists
    bucket = 'silly_bucket'
    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=bucket, ACL='public-read')
    cache = S3CloudCache(tmpdir, bucket, 'proj')

    # matching checksum and an existing file -> True
    assert cache._file_exists(
        CacheFileAttributes('http://silly.url.com', '12345',
                            true_checksum, local_file))

    # wrong checksum -> False
    assert not cache._file_exists(
        CacheFileAttributes('http://silly.url.com', '12345',
                            'probably_not_the_checksum', local_file))

    # nonexistent path -> False
    missing_path = pathlib.Path('definitely/not/a/file.txt')
    assert not cache._file_exists(
        CacheFileAttributes('http://silly.url.com', '12345',
                            true_checksum, missing_path))

    # path exists but is a directory, not a file -> RuntimeError
    dir_attribute = CacheFileAttributes('http://silly.url.com', '12345',
                                        true_checksum, pathlib.Path(tmpdir))
    with pytest.raises(RuntimeError) as context:
        cache._file_exists(dir_attribute)
    assert 'but is not a file' in context.value.args[0]
def test_re_download_file(tmpdir):
    """
    Test that S3CloudCache._download_file will re-download a file
    when it has been removed from the local system
    """
    def hash_of_file(path):
        # blake2b hex digest of the file currently on disk
        h = hashlib.blake2b()
        with open(path, 'rb') as in_file:
            h.update(in_file.read())
        return h.hexdigest()

    data = b'11235813kjlssergwesvsdd'
    hasher = hashlib.blake2b()
    hasher.update(data)
    true_checksum = hasher.hexdigest()

    bucket = 'bucket_for_re_download'
    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=bucket, ACL='public-read')

    # turn on bucket versioning
    # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#bucketversioning
    conn.BucketVersioning(bucket).enable()

    client = boto3.client('s3', region_name='us-east-1')
    client.put_object(Bucket=bucket,
                      Key='data/data_file.txt',
                      Body=data)
    response = client.list_object_versions(Bucket=bucket)
    version_id = response['Versions'][0]['VersionId']

    cache_dir = pathlib.Path(tmpdir) / 'download/test/cache'
    cache = S3CloudCache(cache_dir, bucket, 'proj')

    expected_path = cache_dir / true_checksum / 'data/data_file.txt'
    url = f'http://{bucket}.s3.amazonaws.com/data/data_file.txt'
    attributes = CacheFileAttributes(url, version_id,
                                     true_checksum, expected_path)

    # first download
    assert not expected_path.exists()
    cache._download_file(attributes)
    assert expected_path.exists()
    assert hash_of_file(expected_path) == true_checksum

    # now, remove the file, and see if it gets re-downloaded
    expected_path.unlink()
    assert not expected_path.exists()
    cache._download_file(attributes)
    assert expected_path.exists()
    assert hash_of_file(expected_path) == true_checksum
def test_str():
    """
    Test the string representation of CacheFileParameters
    """
    attr = CacheFileAttributes(url='http://my/url',
                               version_id='aaabbb',
                               file_hash='12345',
                               local_path=pathlib.Path('/my/local/path'))
    rep = f'{attr}'

    expected_fragments = ('CacheFileParameters{',
                          '"file_hash": "12345"',
                          '"url": "http://my/url"',
                          '"version_id": "aaabbb"')
    for fragment in expected_fragments:
        assert fragment in rep

    # path separators render differently on Windows, so only
    # check the local_path fragment on non-Windows platforms
    if platform.system().lower() != 'windows':
        assert '"local_path": "/my/local/path"' in rep
def _create_file_attributes(self,
                            remote_path: str,
                            version_id: str,
                            file_hash: str) -> CacheFileAttributes:
    """
    Create the cache_file_attributes describing a file.
    This method does the work of assigning a local_path for a remote file.

    Parameters
    ----------
    remote_path: str
        The full URL to a file
    version_id: str
        The string specifying the version of the file
    file_hash: str
        The (hexadecimal) file hash of the file

    Returns
    -------
    CacheFileAttributes
    """
    if self._use_static_project_dir:
        # Only one copy of the project is kept on disk, e.g. when the
        # project S3 bucket is mounted as a file system
        project_dir_name = f"{self._project_name}"
    else:
        # Multiple project versions may coexist on disk, laid out as:
        # {cache_dir} / {project_name}-{manifest_version} / relative_path
        # Example:
        # my_cache_dir/visual-behavior-ophys-1.0.0/behavior_sessions/etc...
        project_dir_name = f"{self._project_name}-{self._version}"

    # The data release tool prefixes every relative path with the project
    # name; drop that leading component since project_dir_name already
    # encodes the project
    relative_path = relative_path_from_url(remote_path)
    _, _, shaved_rel_path = relative_path.partition("/")

    local_path = self._cache_dir / project_dir_name / shaved_rel_path
    return CacheFileAttributes(remote_path,
                               version_id,
                               file_hash,
                               local_path)
def test_download_file_multiple_versions(tmpdir):
    """
    Test that S3CloudCache._download_file behaves as expected
    when there are multiple versions of the same file in the
    bucket

    (This is really just testing that S3's versioning behaves the
    way we think it does)
    """
    def checksum_of(blob):
        # blake2b hex digest of a bytes payload
        h = hashlib.blake2b()
        h.update(blob)
        return h.hexdigest()

    data_1 = b'11235813kjlssergwesvsdd'
    true_checksum_1 = checksum_of(data_1)

    data_2 = b'zzzzxxxxyyyywwwwjjjj'
    true_checksum_2 = checksum_of(data_2)
    assert true_checksum_2 != true_checksum_1

    bucket = 'bucket_for_download_versions'
    conn = boto3.resource('s3', region_name='us-east-1')
    conn.create_bucket(Bucket=bucket, ACL='public-read')

    # turn on bucket versioning so both uploads are retained
    # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#bucketversioning
    conn.BucketVersioning(bucket).enable()

    client = boto3.client('s3', region_name='us-east-1')
    client.put_object(Bucket=bucket,
                      Key='data/data_file.txt',
                      Body=data_1)
    response = client.list_object_versions(Bucket=bucket)
    version_id_1 = response['Versions'][0]['VersionId']

    client = boto3.client('s3', region_name='us-east-1')
    client.put_object(Bucket=bucket,
                      Key='data/data_file.txt',
                      Body=data_2)
    response = client.list_object_versions(Bucket=bucket)
    version_id_2 = None
    for version in response['Versions']:
        if version['IsLatest']:
            version_id_2 = version['VersionId']
    assert version_id_2 is not None
    assert version_id_2 != version_id_1

    cache_dir = pathlib.Path(tmpdir) / 'download/test/cache'
    cache = S3CloudCache(cache_dir, bucket, 'proj')
    url = f'http://{bucket}.s3.amazonaws.com/data/data_file.txt'

    def fetch_and_verify(version_id, checksum):
        # download one version and confirm the bytes on disk match it
        path = cache_dir / checksum / 'data/data_file.txt'
        attributes = CacheFileAttributes(url, version_id, checksum, path)
        assert not path.exists()
        cache._download_file(attributes)
        assert path.exists()
        with open(path, 'rb') as in_file:
            assert checksum_of(in_file.read()) == checksum

    # download first version of file, then second
    fetch_and_verify(version_id_1, true_checksum_1)
    fetch_and_verify(version_id_2, true_checksum_2)