def test_pull_objects_all(self, mock_dataset_with_manifest):
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content 2")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path
    obj2_target = obj_to_push[1].object_path

    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)
    obj2_source = os.path.join('/tmp', uuid.uuid4().hex)

    check_info = {obj1_target: obj1_source,
                  obj2_target: obj2_source}

    assert os.path.exists(obj1_target) is True
    assert os.path.exists(obj2_target) is True
    helper_compress_file(obj1_target, obj1_source)
    helper_compress_file(obj2_target, obj2_source)
    assert os.path.isfile(obj1_target) is False
    assert os.path.isfile(obj2_target) is False
    assert os.path.isfile(obj1_source) is True
    assert os.path.isfile(obj2_source) is True

    # remove data from the local file cache
    os.remove(os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision, "test1.txt"))
    os.remove(os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision, "test2.txt"))
    shutil.rmtree(os.path.join(manifest.cache_mgr.cache_root, 'objects'))
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, 'objects'))

    with aioresponses() as mocked_responses:
        mocked_responses.get(f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                             payload={
                                 "presigned_url": f"https://dummyurl.com/{obj_id_1}?params=1",
                                 "namespace": ds.namespace,
                                 "obj_id": obj_id_1,
                                 "dataset": ds.name
                             },
                             status=200)
        with open(obj1_source, 'rb') as data1:
            mocked_responses.get(f"https://dummyurl.com/{obj_id_1}?params=1",
                                 body=data1.read(), status=200,
                                 content_type='application/octet-stream')

        mocked_responses.get(f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_2}',
                             payload={
                                 "presigned_url": f"https://dummyurl.com/{obj_id_2}?params=1",
                                 "namespace": ds.namespace,
                                 "obj_id": obj_id_2,
                                 "dataset": ds.name
                             },
                             status=200)
        with open(obj2_source, 'rb') as data2:
            mocked_responses.get(f"https://dummyurl.com/{obj_id_2}?params=1",
                                 body=data2.read(), status=200,
                                 content_type='application/octet-stream')

        iom.dataset.backend.set_default_configuration("test-user", "abcd", '1234')

        result = iom.pull_all()
        assert len(result.success) == 2
        assert len(result.failure) == 0
        assert result.success[0].object_path != result.success[1].object_path
        assert result.success[0].object_path in [obj_to_push[0].object_path, obj_to_push[1].object_path]
        assert result.success[1].object_path in [obj_to_push[0].object_path, obj_to_push[1].object_path]

        assert os.path.isfile(obj1_target) is True
        assert os.path.isfile(obj2_target) is True

        decompressor = snappy.StreamDecompressor()
        for r in result.success:
            with open(check_info[r.object_path], 'rb') as dd:
                source1 = decompressor.decompress(dd.read())
                source1 += decompressor.flush()
            with open(r.object_path, 'rt') as dd:
                dest1 = dd.read()
            assert source1.decode("utf-8") == dest1
def test_pull_objects_all_partial_download(self, mock_dataset_with_manifest):
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "other_dir/test3.txt", "1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content 2")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 3
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
    _, obj_id_3 = obj_to_push[2].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path
    obj2_target = obj_to_push[1].object_path
    obj3_target = obj_to_push[2].object_path

    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)

    assert "test3.txt" in obj_to_push[0].dataset_path
    assert os.path.exists(obj1_target) is True
    assert os.path.exists(obj2_target) is True
    assert os.path.exists(obj3_target) is True

    # Completely remove other_dir/test3.txt object
    os.remove(os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision, "other_dir", "test3.txt"))
    helper_compress_file(obj1_target, obj1_source)

    # Remove link for test1.txt
    os.remove(os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision, "test1.txt"))

    assert os.path.isfile(obj1_target) is False
    assert os.path.isfile(obj2_target) is True
    assert os.path.isfile(obj3_target) is True

    with aioresponses() as mocked_responses:
        mocked_responses.get(f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                             payload={
                                 "presigned_url": f"https://dummyurl.com/{obj_id_1}?params=1",
                                 "namespace": ds.namespace,
                                 "obj_id": obj_id_1,
                                 "dataset": ds.name
                             },
                             status=200)
        with open(obj1_source, 'rb') as data1:
            mocked_responses.get(f"https://dummyurl.com/{obj_id_1}?params=1",
                                 body=data1.read(), status=200,
                                 content_type='application/octet-stream')

        iom.dataset.backend.set_default_configuration("test-user", "abcd", '1234')

        result = iom.pull_all()
        assert len(result.success) == 1
        assert len(result.failure) == 0
        assert result.success[0].object_path == obj1_target
        assert "test3.txt" in result.success[0].dataset_path

        assert os.path.isfile(obj1_target) is True
        assert os.path.isfile(obj2_target) is True
        assert os.path.isfile(obj3_target) is True

        filename = os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision, "other_dir", "test3.txt")
        assert os.path.isfile(filename) is True
        with open(filename, 'rt') as dd:
            assert dd.read() == "1"

        filename = os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision, "test1.txt")
        assert os.path.isfile(filename) is True
        with open(filename, 'rt') as dd:
            assert dd.read() == "test content 1"

        filename = os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision, "test2.txt")
        assert os.path.isfile(filename) is True
        with open(filename, 'rt') as dd:
            assert dd.read() == "test content 2"

        # Try pulling all again with nothing to pull
        result = iom.pull_all()
        assert len(result.success) == 0
        assert len(result.failure) == 0
        assert result.message == "Dataset already downloaded."
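Both tests above rely on helper_append_file and helper_compress_file, which are defined elsewhere in this test module. As a rough illustration of what they need to do for these tests to hold (append text into the revision directory of the file cache, and snappy stream-compress an object out of the object cache so pull_all() has something to restore), a minimal sketch under those assumptions might look like this; the names and exact behavior are assumptions, not the project's actual fixtures.

# Illustrative sketches only; the real helpers live elsewhere in this test module.
import os
import snappy


def helper_append_file(cache_root: str, revision: str, rel_path: str, content: str) -> None:
    # Append text content to a file inside the revision directory of the file cache
    with open(os.path.join(cache_root, revision, rel_path), 'at') as f:
        f.write(content)


def helper_compress_file(source: str, destination: str) -> None:
    # Snappy stream-compress the object out of the object cache, then remove the
    # original so the test can verify it is re-created by pull_all()
    with open(source, 'rb') as src, open(destination, 'wb') as dst:
        snappy.stream_compress(src, dst)
    os.remove(source)

The stream framing matters here: the tests verify downloads with snappy.StreamDecompressor(), which expects the framed format produced by snappy.stream_compress().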
def download_dataset_files(logged_in_username: str, access_token: str, id_token: str,
                           dataset_owner: str, dataset_name: str,
                           labbook_owner: Optional[str] = None, labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False, keys: Optional[List[str]] = None):
    """Method to download files for a dataset, either standalone or linked to a Project

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download

    Returns:
        None
    """
    def update_meta(msg):
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
                    f" dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys})")

        im = InventoryManager()

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # This is a normal dataset. Load repo from the working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        if all_keys:
            result = iom.pull_all(status_update_fn=update_meta)
        elif keys:
            result = iom.pull_objects(keys=keys, status_update_fn=update_meta)
        else:
            raise ValueError("Must provide a list of keys or set all_keys=True")

        # Save the keys that succeeded and failed to the job metadata so the UI can re-fetch as needed
        job = get_current_job()
        if job:
            job.meta['success_keys'] = [x.dataset_path for x in result.success]
            job.meta['failure_keys'] = [x.dataset_path for x in result.failure]
            job.save_meta()

        if len(result.failure) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            sys.exit(-1)

    except Exception as err:
        logger.exception(err)
        raise
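Because download_dataset_files reports progress through the current job's meta dictionary (via get_current_job), it is intended to run inside a background worker rather than be called inline. The service has its own dispatch layer, so the following is only a generic RQ-style sketch of how such a job could be enqueued and its feedback polled; the queue name, Redis connection, and credentials are placeholders and not the project's actual dispatch mechanism.

# Illustrative only: enqueue download_dataset_files on a plain RQ queue and poll its progress.
# Queue name, connection, and tokens below are placeholder assumptions.
from redis import Redis
from rq import Queue

q = Queue('gigantum-default-queue', connection=Redis())
job = q.enqueue(download_dataset_files,
                logged_in_username='test-user',
                access_token='<access-token>',
                id_token='<id-token>',
                dataset_owner='test-user',
                dataset_name='my-dataset',
                all_keys=True)

# Later, refresh the job and read back the metadata written by update_meta()
job.refresh()
print(job.get_status(), job.meta.get('feedback'))
print(job.meta.get('success_keys'), job.meta.get('failure_keys'))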