def construct_local_manifest(self) -> None:
    """
    Construct the dict mapping each file's absolute local path to its
    file hash. Save it to self._downloaded_data_path
    """
    lookup = {}
    files_to_hash = set()
    c_dir = pathlib.Path(self._cache_dir)
    file_iterator = c_dir.glob('**/*')
    for file_name in file_iterator:
        if file_name.is_file():
            if 'json' not in file_name.name:
                if file_name != self._manifest_last_used:
                    files_to_hash.add(file_name.resolve())

    with tqdm.tqdm(files_to_hash,
                   total=len(files_to_hash),
                   unit='(files hashed)') as pbar:
        for local_path in pbar:
            hsh = file_hash_from_path(local_path)
            lookup[str(local_path.absolute())] = hsh

    with open(self._downloaded_data_path, 'w') as out_file:
        out_file.write(json.dumps(lookup, indent=2, sort_keys=True))
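# Illustrative sketch only: construct_local_manifest() leaves a JSON object
# keyed by absolute local path, valued by file hash, at
# self._downloaded_data_path. The `cache` instance, path, and hash below are
# made-up placeholders, not values produced by the real code.
import json

with open(cache._downloaded_data_path) as manifest_file:  # hypothetical cache instance
    local_manifest = json.load(manifest_file)

# local_manifest might look like:
# {
#   "/home/user/cache_dir/project_data/data_1.nwb": "4ac9f3..."
# }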
def test_file_hash_from_path(tmpdir):
    rng = np.random.RandomState(881)
    alphabet = list('abcdefghijklmnopqrstuvwxyz')
    fname = tmpdir / 'hash_dummy.txt'
    with open(fname, 'w') as out_file:
        for ii in range(10):
            out_file.write(''.join(rng.choice(alphabet, size=10)))
            out_file.write('\n')

    hasher = hashlib.blake2b()
    with open(fname, 'rb') as in_file:
        chunk = in_file.read(7)
        while len(chunk) > 0:
            hasher.update(chunk)
            chunk = in_file.read(7)

    ans = utils.file_hash_from_path(fname)
    assert ans == hasher.hexdigest()
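# Hedged sketch of the helper exercised by the test above. The real
# file_hash_from_path lives in the repo's utils module; this version only
# mirrors the behavior the test asserts (a blake2b digest over chunked
# reads). The chunk size is an assumption, not the repo's actual value.
import hashlib
import pathlib
from typing import Union


def file_hash_from_path_sketch(file_path: Union[str, pathlib.Path]) -> str:
    """Return the blake2b hex digest of the file at file_path."""
    hasher = hashlib.blake2b()
    with open(file_path, 'rb') as in_file:
        chunk = in_file.read(1000000)  # chunk size chosen for illustration
        while len(chunk) > 0:
            hasher.update(chunk)
            chunk = in_file.read(1000000)
    return hasher.hexdigest()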
def _file_exists(self, file_attributes: CacheFileAttributes) -> bool:
    """
    Given a CacheFileAttributes describing a file, assess whether or
    not that file exists locally and is valid (i.e. has the expected
    file hash)

    Parameters
    ----------
    file_attributes: CacheFileAttributes
        Description of the file to look for

    Returns
    -------
    bool
        True if the file exists and is valid; False otherwise

    Raises
    ------
    RuntimeError
        If file_attributes.local_path exists but is not a file.
        It would be unclear how the cache should proceed in this case.
    """
    if not file_attributes.local_path.exists():
        return False
    if not file_attributes.local_path.is_file():
        raise RuntimeError(f"{file_attributes.local_path}\n"
                           "exists, but is not a file;\n"
                           "unsure how to proceed")

    full_path = file_attributes.local_path.resolve()
    test_checksum = file_hash_from_path(full_path)
    if test_checksum != file_attributes.file_hash:
        return False

    return True
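# Hedged sketch of the container _file_exists() and _download_file() rely on.
# The repo's actual CacheFileAttributes class may carry more state and
# validation; only the four fields accessed in the methods here are shown.
import pathlib
from dataclasses import dataclass


@dataclass
class CacheFileAttributesSketch:
    url: str                   # S3 URL the file can be downloaded from
    version_id: str            # S3 object version to request
    file_hash: str             # expected hash of the file contents
    local_path: pathlib.Path   # where the file should live on disk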
def _download_file(self, file_attributes: CacheFileAttributes) -> bool:
    """
    Check if a file exists locally. If it does not, download it
    and return True. Return False otherwise.

    Parameters
    ----------
    file_attributes: CacheFileAttributes
        Describes the file to download

    Returns
    -------
    bool
        True if the file was downloaded; False otherwise

    Raises
    ------
    RuntimeError
        If the path to the directory where the file is to be saved
        points to something that is not a directory.

    RuntimeError
        If it is not able to successfully download the file after
        10 iterations
    """
    was_downloaded = False
    local_path = file_attributes.local_path

    local_dir = pathlib.Path(safe_system_path(str(local_path.parents[0])))

    # make sure Windows references to Allen Institute
    # local networked file system get handled correctly
    local_path = pathlib.Path(safe_system_path(str(local_path)))

    # using os here rather than pathlib because safe_system_path
    # returns a str
    os.makedirs(local_dir, exist_ok=True)
    if not os.path.isdir(local_dir):
        raise RuntimeError(f"{local_dir}\n"
                           "is not a directory")

    bucket_name = bucket_name_from_url(file_attributes.url)
    obj_key = relative_path_from_url(file_attributes.url)

    n_iter = 0
    max_iter = 10  # maximum number of times to try download

    version_id = file_attributes.version_id

    pbar = None
    if not self._file_exists(file_attributes):
        response = self.s3_client.list_object_versions(Bucket=bucket_name,
                                                       Prefix=str(obj_key))
        object_info = [i for i in response["Versions"]
                       if i["VersionId"] == version_id][0]
        pbar = tqdm.tqdm(desc=object_info["Key"].split("/")[-1],
                         total=object_info["Size"],
                         unit_scale=True,
                         unit_divisor=1000.,
                         unit="MB")

    while not self._file_exists(file_attributes):
        was_downloaded = True
        response = self.s3_client.get_object(Bucket=bucket_name,
                                             Key=str(obj_key),
                                             VersionId=version_id)

        if 'Body' in response:
            with open(local_path, 'wb') as out_file:
                for chunk in response['Body'].iter_chunks():
                    out_file.write(chunk)
                    pbar.update(len(chunk))

        # Verify the hash of the downloaded file; on a mismatch, discard
        # the corrupted download so the next iteration retries it
        full_path = file_attributes.local_path.resolve()
        test_checksum = file_hash_from_path(full_path)
        if test_checksum != file_attributes.file_hash:
            if file_attributes.local_path.exists():
                file_attributes.local_path.unlink()

        n_iter += 1
        if n_iter > max_iter:
            pbar.close()
            raise RuntimeError("Could not download\n"
                               f"{file_attributes}\n"
                               f"in {max_iter} iterations")

    if pbar is not None:
        pbar.close()

    return was_downloaded
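# Hedged sketch of the URL helpers used by _download_file(). The repo's real
# bucket_name_from_url/relative_path_from_url live elsewhere; these versions
# only assume virtual-hosted-style URLs of the form used by the fixture
# below, i.e. http://{bucket}.s3.amazonaws.com/{key}.
from urllib.parse import urlparse


def bucket_name_from_url_sketch(url: str) -> str:
    """Return the bucket name from a virtual-hosted-style S3 URL."""
    hostname = urlparse(url).netloc
    return hostname.split('.s3')[0]


def relative_path_from_url_sketch(url: str) -> str:
    """Return the object key (the URL path without its leading slash)."""
    return urlparse(url).path.lstrip('/')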
def mounted_s3_dataset_fixture(tmp_path, request) -> Tuple[Path, str, dict]:
    """A fixture which simulates a project s3 bucket that has been
    mounted as a local directory.
    """
    # Get fixture parameters
    project_name = request.param.get("project_name", "test_project_name_1")
    dataset_version = request.param.get("dataset_version", "0.3.0")
    metadata_file_id_column_name = request.param.get(
        "metadata_file_id_column_name", "file_id"
    )
    metadata_files_contents = request.param.get(
        "metadata_files_contents",
        # Each item in list is a tuple of:
        # (metadata_filename, metadata_contents)
        [
            ("metadata_1.csv", {"mouse": [1, 2, 3], "sex": ["F", "F", "M"]}),
            (
                "metadata_2.csv",
                {
                    "experiment": [4, 5, 6],
                    metadata_file_id_column_name: ["data1", "data2", "data3"]
                }
            )
        ]
    )
    data_files_contents = request.param.get(
        "data_files_contents",
        # Each item in list is a tuple of:
        # (data_filename, data_contents)
        [
            ("data_1.nwb", "123456"),
            ("data_2.nwb", "abcdef"),
            ("data_3.nwb", "ghijkl")
        ]
    )

    # Create mock mounted s3 directory structure
    mock_mounted_base_dir = tmp_path / "mounted_remote_data"
    mock_mounted_base_dir.mkdir()
    mock_project_dir = mock_mounted_base_dir / project_name
    mock_project_dir.mkdir()

    # Create metadata files and manifest entries
    mock_metadata_dir = mock_project_dir / "project_metadata"
    mock_metadata_dir.mkdir()

    manifest_meta_entries = dict()
    for meta_fname, meta_contents in metadata_files_contents:
        meta_save_path = mock_metadata_dir / meta_fname
        df_to_save = pd.DataFrame(meta_contents)
        df_to_save.to_csv(str(meta_save_path), index=False)

        manifest_meta_entries[meta_fname.rstrip(".csv")] = {
            "url": (
                f"http://{project_name}.s3.amazonaws.com/{project_name}"
                f"/project_metadata/{meta_fname}"
            ),
            "version_id": "test_placeholder",
            "file_hash": file_hash_from_path(meta_save_path)
        }

    # Create data files and manifest entries
    mock_data_dir = mock_project_dir / "project_data"
    mock_data_dir.mkdir()

    manifest_data_entries = dict()
    for file_fname, file_contents in data_files_contents:
        file_save_path = mock_data_dir / file_fname
        with file_save_path.open('w') as f:
            f.write(file_contents)

        manifest_data_entries[file_fname.rstrip(".nwb")] = {
            "url": (
                f"http://{project_name}.s3.amazonaws.com/{project_name}"
                f"/project_data/{file_fname}"
            ),
            "version_id": "test_placeholder",
            "file_hash": file_hash_from_path(file_save_path)
        }

    # Create manifest dir and manifest
    mock_manifests_dir = mock_project_dir / "manifests"
    mock_manifests_dir.mkdir()
    manifest_fname = f"test_manifest_v{dataset_version}.json"
    manifest_path = mock_manifests_dir / manifest_fname

    manifest_contents = {
        "project_name": project_name,
        "manifest_version": dataset_version,
        "data_pipeline": [
            {
                "name": "AllenSDK",
                "version": "2.11.0",
                "comment": "This is a test entry. NOT REAL."
            }
        ],
        "metadata_file_id_column_name": metadata_file_id_column_name,
        "metadata_files": manifest_meta_entries,
        "data_files": manifest_data_entries
    }

    with manifest_path.open('w') as f:
        json.dump(manifest_contents, f, indent=4)

    expected = {
        "expected_metadata": metadata_files_contents,
        "expected_data": data_files_contents
    }

    return mock_mounted_base_dir, project_name, expected
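# Example of driving the fixture above with pytest's indirect parametrization;
# the fixture reads its configuration from request.param. The project name,
# version, and test body shown here are illustrative placeholders, not part
# of the repo's actual test suite.
import pytest


@pytest.mark.parametrize(
    "mounted_s3_dataset_fixture",
    [{"project_name": "my_project", "dataset_version": "0.3.0"}],
    indirect=["mounted_s3_dataset_fixture"],
)
def test_mounted_dataset_layout(mounted_s3_dataset_fixture):
    base_dir, project_name, expected = mounted_s3_dataset_fixture
    assert (base_dir / project_name / "manifests").exists()
    assert len(expected["expected_data"]) == 3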