def test_objects_to_push(self, mock_dataset_with_manifest):
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content 2")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "other_dir/test4.txt", "test content 4")
    manifest.sweep_all_changes()

    # Modify file to have 2 objects with the same key
    helper_append_file(manifest.cache_mgr.cache_root, iom.manifest.dataset_revision,
                       "test2.txt", "test content 22")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 4
    assert obj_to_push[0].dataset_path == "other_dir/test4.txt"
    assert obj_to_push[1].dataset_path == "test1.txt"
    assert obj_to_push[2].dataset_path == "test2.txt"
    assert obj_to_push[3].dataset_path == "test2.txt"
    assert obj_to_push[2].revision != obj_to_push[3].revision

    assert iom.num_objects_to_push() == 4
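# The tests in this module rely on a `helper_append_file` utility that is defined
# elsewhere. A minimal sketch, inferred only from how it is called here
# (cache root, revision, relative path, content) -- the real helper may differ:
def helper_append_file(cache_root: str, revision: str, rel_path: str, content: str) -> None:
    """Append `content` to a file inside the dataset revision directory."""
    with open(os.path.join(cache_root, revision, rel_path), 'at') as f:
        f.write(content)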
def test_compute_push_batches(self, mock_dataset_with_manifest_bg_tests):
    """Test compute push batches, verifying it works OK when you've deleted some files"""
    ds, manifest, working_dir = mock_dataset_with_manifest_bg_tests
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "other_dir/test3.txt", "test content 3")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test" * 4300000)
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content 2")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test4.txt", "test content 4")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test5.txt", "test content 5")
    manifest.sweep_all_changes()
    assert len(manifest.manifest) == 6

    # Remove a file from the manifest
    manifest.delete(['test5.txt'])
    assert len(manifest.manifest) == 5

    key_batches, total_bytes, num_files = iom.compute_push_batches()

    assert num_files == 5
    assert total_bytes == (4 * 4300000) + (14 * 4)
    assert len(key_batches) == 2
    assert len(key_batches[0]) == 4
    assert len(key_batches[1]) == 1
    assert key_batches[1][0].dataset_path == 'test1.txt'
def test_objects_to_push_deduped(self, mock_dataset_with_manifest):
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test content dup")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content dup")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test3.txt", "test content dup")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "other_dir/test4.txt", "test content 4")
    manifest.sweep_all_changes()

    # Write a .DS_Store file in the objects dir to make sure it gets skipped
    with open(os.path.join(manifest.cache_mgr.cache_root, 'objects', '.push', '.DS_Store'), 'wt') as ff:
        ff.write("")

    obj_to_push = iom.objects_to_push(remove_duplicates=True)
    assert len(obj_to_push) == 2
    assert obj_to_push[0].dataset_path == "other_dir/test4.txt"
    assert obj_to_push[1].dataset_path == "test1.txt"

    assert iom.num_objects_to_push(remove_duplicates=True) == 2
def test_push_objects_with_failure(self, mock_dataset_with_manifest):
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content 2")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2
    _, obj1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj2 = obj_to_push[1].object_path.rsplit('/', 1)

    with aioresponses() as mocked_responses:
        mocked_responses.put(
            f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj1}',
            payload={
                "presigned_url": f"https://dummyurl.com/{obj1}?params=1",
                "namespace": ds.namespace,
                "key_id": "hghghg",
                "obj_id": obj1,
                "dataset": ds.name
            },
            status=200)
        mocked_responses.put(f"https://dummyurl.com/{obj1}?params=1", payload={}, status=200)

        mocked_responses.put(
            f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj2}',
            payload={
                "presigned_url": f"https://dummyurl.com/{obj2}?params=1",
                "namespace": ds.namespace,
                "key_id": "hghghg",
                "obj_id": obj2,
                "dataset": ds.name
            },
            status=200)
        # The second presigned upload returns a 400 to simulate a failure
        mocked_responses.put(f"https://dummyurl.com/{obj2}?params=1", payload={}, status=400)

        assert len(glob.glob(f'{iom.push_dir}/*')) == 1
        iom.dataset.backend.set_default_configuration("test-user", "abcd", '1234')

        result = iom.push_objects()
        assert len(glob.glob(f'{iom.push_dir}/*')) == 1

        assert len(result.success) == 1
        assert len(result.failure) == 1
        assert result.success[0].object_path == obj_to_push[0].object_path
        assert result.failure[0].object_path == obj_to_push[1].object_path
def _push_dataset_objects(self, dataset: Dataset, logged_in_username: str,
                          feedback_callback: Callable, access_token, id_token) -> None:
    dataset.backend.set_default_configuration(logged_in_username, access_token, id_token)
    m = Manifest(dataset, logged_in_username)
    iom = IOManager(dataset, m)
    iom.push_objects(status_update_fn=feedback_callback)
    iom.manifest.link_revision()
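# The `feedback_callback` passed above shares the signature used by the
# `update_feedback` helpers later in this file (msg, has_failures,
# failure_detail, percent_complete). A minimal console-logging example,
# hypothetical and for illustration only:
def print_feedback(msg: str, has_failures: Optional[bool] = None,
                   failure_detail: Optional[str] = None,
                   percent_complete: Optional[float] = None) -> None:
    """Print progress feedback instead of writing job metadata."""
    pct = f" [{round(percent_complete)}%]" if percent_complete is not None else ""
    detail = f" ({failure_detail})" if failure_detail else ""
    print(f"{msg}{pct}{detail}")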
def test_objects_to_push_ignore_other_branch(self, mock_dataset_with_manifest):
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "fdsfgfd")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2
    assert obj_to_push[0].dataset_path == "test1.txt"
    assert obj_to_push[1].dataset_path == "test2.txt"

    # Create a new branch and add a file there
    bm = BranchManager(ds, username=USERNAME)
    starting_branch = bm.active_branch
    bm.create_branch(title="test-branch")
    assert bm.active_branch == "test-branch"
    assert ds.is_repo_clean is True

    helper_append_file(manifest.cache_mgr.cache_root, iom.manifest.dataset_revision,
                       "test3.txt", "fdsfgfd")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 3
    assert obj_to_push[0].dataset_path == "test1.txt"
    assert obj_to_push[1].dataset_path == "test2.txt"
    assert obj_to_push[2].dataset_path == "test3.txt"

    # Go back to the original branch; you shouldn't have to push the file added on the other branch
    bm.workon_branch(starting_branch)
    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2
    assert obj_to_push[0].dataset_path == "test1.txt"
    assert obj_to_push[1].dataset_path == "test2.txt"
def test_compute_pull_batches(self, mock_dataset_with_manifest_bg_tests):
    ds, manifest, working_dir = mock_dataset_with_manifest_bg_tests
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "other_dir/test3.txt", "test content 3")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test" * 4300000)
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content 2")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test4.txt", "test content 4")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test5.txt", "test content 5")
    manifest.sweep_all_changes()

    # Calling without keys or pull_all set should raise
    with pytest.raises(ValueError):
        iom.compute_pull_batches()

    # Remove all files so everything needs to be pulled
    rev_dir = os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision)
    object_dir = os.path.join(manifest.cache_mgr.cache_root, 'objects')
    shutil.rmtree(rev_dir)
    shutil.rmtree(object_dir)

    key_batches, total_bytes, num_files = iom.compute_pull_batches(pull_all=True)

    # Four 14-byte files plus one (4 * 4300000)-byte file; the big file gets its own batch
    assert num_files == 5
    assert total_bytes == (4 * 4300000) + (14 * 4)
    assert len(key_batches) == 2
    assert len(key_batches[0]) == 4
    assert len(key_batches[1]) == 1
    assert key_batches[1][0] == 'test1.txt'
def test_sync__dataset(self, mock_config_file):
    def update_feedback(msg: str, has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None):
        """Method to update the job's metadata and provide feedback to the UI"""
        assert has_failures is None or has_failures is False
        assert failure_detail is None

    def dispatch_query_mock(self, job_key):
        JobStatus = namedtuple("JobStatus", ['status', 'meta'])
        return JobStatus(status='finished', meta={'completed_bytes': '100'})

    def dispatch_mock(self, method_reference, kwargs, metadata, persist):
        return "afakejobkey"

    username = '******'
    im = InventoryManager(mock_config_file[0])
    ds = im.create_dataset(username, username, 'dataset-1', 'gigantum_object_v1')
    m = Manifest(ds, username)
    wf = DatasetWorkflow(ds)
    iom = IOManager(ds, m)
    assert len(glob.glob(f'{iom.push_dir}/*')) == 0
    wf.publish(username=username, feedback_callback=update_feedback)

    # Put a file into the dataset that needs to be pushed
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "asdfadfsdf")
    m.sweep_all_changes()
    assert len(glob.glob(f'{iom.push_dir}/*')) == 1

    with patch.object(Dispatcher, 'dispatch_task', dispatch_mock):
        with patch.object(Dispatcher, 'query_task', dispatch_query_mock):
            wf.sync(username=username, feedback_callback=update_feedback)

    assert os.path.exists(wf.remote)
    assert len(glob.glob(f'{iom.push_dir}/*')) == 0
def _push_dataset_objects(self, logged_in_username: str, feedback_callback: Callable,
                          access_token, id_token) -> None:
    """Method to schedule a push operation for a dataset's objects

    Args:
        logged_in_username: username for the currently logged in user
        feedback_callback: function used to provide feedback to the UI
        access_token: bearer token
        id_token: identity token

    Returns:
        None
    """
    dispatcher_obj = Dispatcher()

    try:
        self.dataset.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(self.dataset, logged_in_username)
        iom = IOManager(self.dataset, m)

        obj_batches, total_bytes, num_files = iom.compute_push_batches()

        if obj_batches:
            # Schedule jobs for batches
            feedback_callback(f"Preparing to upload {num_files} files. Please wait...")
            bg_jobs = list()
            for objs in obj_batches:
                job_kwargs = {
                    'objs': objs,
                    'logged_in_username': logged_in_username,
                    'access_token': access_token,
                    'id_token': id_token,
                    'dataset_owner': self.dataset.namespace,
                    'dataset_name': self.dataset.name,
                    'config_file': self.dataset.client_config.config_file,
                }
                job_metadata = {
                    'dataset': f"{logged_in_username}|{self.dataset.namespace}|{self.dataset.name}",
                    'method': 'push_dataset_objects'
                }

                job_key = dispatcher_obj.dispatch_task(
                    method_reference=gtmcore.dispatcher.dataset_jobs.push_dataset_objects,
                    kwargs=job_kwargs,
                    metadata=job_metadata,
                    persist=True)
                bg_jobs.append(BackgroundUploadJob(dispatcher_obj, objs, job_key))
                logger.info(f"Scheduled dataset object upload job for"
                            f" {logged_in_username}/{self.dataset.namespace}/{self.dataset.name} with"
                            f" {len(objs)} objects to upload")

            while sum([(x.is_complete or x.is_failed) for x in bg_jobs]) != len(bg_jobs):
                # Refresh all job statuses and update status feedback
                [j.refresh_status() for j in bg_jobs]
                total_completed_bytes = sum([j.completed_bytes for j in bg_jobs])
                if total_completed_bytes > 0:
                    pc = (float(total_completed_bytes) / float(total_bytes)) * 100
                    feedback_callback(
                        f"Please wait - Uploading {num_files} files ({format_size(total_completed_bytes)}"
                        f" of {format_size(total_bytes)}) - {round(pc)}% complete",
                        percent_complete=pc)
                time.sleep(1)

            # If you get here, all jobs are done or failed.
            # Remove all the push files so they can be regenerated if needed
            for f in glob.glob(f'{iom.push_dir}/*'):
                os.remove(f)

            # Aggregate failures if they exist
            failure_keys: List[str] = list()
            for j in bg_jobs:
                if j.is_failed:
                    # The background job hard failed. Assume the entire batch should get re-uploaded
                    for obj in j.objs:
                        failure_keys.append(f"{obj.dataset_path} at {obj.revision[0:8]}")
                        m.queue_to_push(obj.object_path, obj.dataset_path, obj.revision)
                else:
                    for obj in j.get_failed_objects():
                        # Some individual objects failed
                        failure_keys.append(f"{obj.dataset_path} at {obj.revision[0:8]}")
                        m.queue_to_push(obj.object_path, obj.dataset_path, obj.revision)

            # Set final status for the UI
            if len(failure_keys) == 0:
                feedback_callback("Upload complete!", percent_complete=100, has_failures=False)
            else:
                failure_str = "\n".join(failure_keys)
                failure_detail_str = f"Files that failed to upload:\n{failure_str}"
                feedback_callback("", percent_complete=100, has_failures=True,
                                  failure_detail=failure_detail_str)

            # Finish up by linking everything, just in case
            iom.manifest.link_revision()

            if len(failure_keys) > 0:
                # If any uploads failed, exit non-zero so the UI knows there was an error
                raise IOError(f"{len(failure_keys)} file(s) failed to upload. Check message detail"
                              " for more information and try to sync again.")
    except Exception as err:
        logger.exception(err)
        raise
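# `_push_dataset_objects` polls a collection of BackgroundUploadJob instances,
# which are defined elsewhere. A minimal sketch of the interface the method
# above depends on (refresh_status / is_complete / is_failed / completed_bytes /
# get_failed_objects), inferred from usage and from the Dispatcher.query_task
# mocks in the tests -- the internals here are assumptions:
class BackgroundUploadJobSketch:
    def __init__(self, dispatcher, objs, job_key):
        self.dispatcher = dispatcher
        self.objs = objs                 # PushObjects handled by this job
        self.job_key = job_key
        self.is_complete = False
        self.is_failed = False
        self.completed_bytes = 0
        self._failed_objects: list = []  # assumed to be populated from job metadata

    def refresh_status(self) -> None:
        # Pull the latest status/metadata for this job from the dispatcher
        status = self.dispatcher.query_task(self.job_key)
        self.is_complete = status.status == 'finished'
        self.is_failed = status.status == 'failed'
        self.completed_bytes = int(status.meta.get('completed_bytes', 0))

    def get_failed_objects(self):
        # Objects that failed individually even though the job itself finished
        return self._failed_objects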
def test_pull_objects_all_partial_download(self, mock_dataset_with_manifest):
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "other_dir/test3.txt", "1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content 2")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 3
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
    _, obj_id_3 = obj_to_push[2].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path
    obj2_target = obj_to_push[1].object_path
    obj3_target = obj_to_push[2].object_path
    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)

    assert "test3.txt" in obj_to_push[0].dataset_path
    assert os.path.exists(obj1_target) is True
    assert os.path.exists(obj2_target) is True
    assert os.path.exists(obj3_target) is True

    # Completely remove the other_dir/test3.txt object
    os.remove(os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision,
                           "other_dir", "test3.txt"))
    helper_compress_file(obj1_target, obj1_source)

    # Remove the link for test1.txt
    os.remove(os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision, "test1.txt"))

    assert os.path.isfile(obj1_target) is False
    assert os.path.isfile(obj2_target) is True
    assert os.path.isfile(obj3_target) is True

    with aioresponses() as mocked_responses:
        mocked_responses.get(
            f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
            payload={
                "presigned_url": f"https://dummyurl.com/{obj_id_1}?params=1",
                "namespace": ds.namespace,
                "obj_id": obj_id_1,
                "dataset": ds.name
            },
            status=200)
        with open(obj1_source, 'rb') as data1:
            mocked_responses.get(f"https://dummyurl.com/{obj_id_1}?params=1",
                                 body=data1.read(),
                                 status=200,
                                 content_type='application/octet-stream')

        iom.dataset.backend.set_default_configuration("test-user", "abcd", '1234')

        result = iom.pull_all()
        assert len(result.success) == 1
        assert len(result.failure) == 0
        assert result.success[0].object_path == obj1_target
        assert "test3.txt" in result.success[0].dataset_path

        assert os.path.isfile(obj1_target) is True
        assert os.path.isfile(obj2_target) is True
        assert os.path.isfile(obj3_target) is True

        filename = os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision,
                                "other_dir", "test3.txt")
        assert os.path.isfile(filename) is True
        with open(filename, 'rt') as dd:
            assert dd.read() == "1"

        filename = os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision, "test1.txt")
        assert os.path.isfile(filename) is True
        with open(filename, 'rt') as dd:
            assert dd.read() == "test content 1"

        filename = os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision, "test2.txt")
        assert os.path.isfile(filename) is True
        with open(filename, 'rt') as dd:
            assert dd.read() == "test content 2"

        # Try pulling all again with nothing to pull
        result = iom.pull_all()
        assert len(result.success) == 0
        assert len(result.failure) == 0
        assert result.message == "Dataset already downloaded."
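# Several tests use a `helper_compress_file` utility (defined elsewhere) to move
# an object out of the cache into a snappy-compressed "remote" copy. A minimal
# sketch, inferred from usage: the source file is gone afterwards and the copy
# is snappy stream-compressed (the tests decode it with snappy.StreamDecompressor).
# The real helper may differ:
def helper_compress_file(source_path: str, destination_path: str) -> None:
    """Stream-compress `source_path` into `destination_path`, then remove the source."""
    compressor = snappy.StreamCompressor()
    with open(source_path, 'rb') as src, open(destination_path, 'wb') as dst:
        dst.write(compressor.compress(src.read()))
        dst.write(compressor.flush())
    os.remove(source_path)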
def test_pull_objects_all(self, mock_dataset_with_manifest):
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content 2")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path
    obj2_target = obj_to_push[1].object_path
    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)
    obj2_source = os.path.join('/tmp', uuid.uuid4().hex)
    check_info = {obj1_target: obj1_source, obj2_target: obj2_source}

    assert os.path.exists(obj1_target) is True
    assert os.path.exists(obj2_target) is True
    helper_compress_file(obj1_target, obj1_source)
    helper_compress_file(obj2_target, obj2_source)
    assert os.path.isfile(obj1_target) is False
    assert os.path.isfile(obj2_target) is False
    assert os.path.isfile(obj1_source) is True
    assert os.path.isfile(obj2_source) is True

    # Remove data from the local file cache
    os.remove(os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision, "test1.txt"))
    os.remove(os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision, "test2.txt"))
    shutil.rmtree(os.path.join(manifest.cache_mgr.cache_root, 'objects'))
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, 'objects'))

    with aioresponses() as mocked_responses:
        mocked_responses.get(
            f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
            payload={
                "presigned_url": f"https://dummyurl.com/{obj_id_1}?params=1",
                "namespace": ds.namespace,
                "obj_id": obj_id_1,
                "dataset": ds.name
            },
            status=200)
        with open(obj1_source, 'rb') as data1:
            mocked_responses.get(f"https://dummyurl.com/{obj_id_1}?params=1",
                                 body=data1.read(),
                                 status=200,
                                 content_type='application/octet-stream')

        mocked_responses.get(
            f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_2}',
            payload={
                "presigned_url": f"https://dummyurl.com/{obj_id_2}?params=1",
                "namespace": ds.namespace,
                "obj_id": obj_id_2,
                "dataset": ds.name
            },
            status=200)
        with open(obj2_source, 'rb') as data2:
            mocked_responses.get(f"https://dummyurl.com/{obj_id_2}?params=1",
                                 body=data2.read(),
                                 status=200,
                                 content_type='application/octet-stream')

        iom.dataset.backend.set_default_configuration("test-user", "abcd", '1234')

        result = iom.pull_all()
        assert len(result.success) == 2
        assert len(result.failure) == 0
        assert result.success[0].object_path != result.success[1].object_path
        assert result.success[0].object_path in [obj_to_push[0].object_path, obj_to_push[1].object_path]
        assert result.success[1].object_path in [obj_to_push[0].object_path, obj_to_push[1].object_path]

        assert os.path.isfile(obj1_target) is True
        assert os.path.isfile(obj2_target) is True

        decompressor = snappy.StreamDecompressor()
        for r in result.success:
            with open(check_info[r.object_path], 'rb') as dd:
                source1 = decompressor.decompress(dd.read())
                source1 += decompressor.flush()
            with open(r.object_path, 'rt') as dd:
                dest1 = dd.read()
            assert source1.decode("utf-8") == dest1
def test_pull_objects(self, mock_dataset_with_manifest):
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content 2")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path
    obj2_target = obj_to_push[1].object_path
    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)
    obj2_source = os.path.join('/tmp', uuid.uuid4().hex)

    assert os.path.exists(obj1_target) is True
    assert os.path.exists(obj2_target) is True
    helper_compress_file(obj1_target, obj1_source)
    helper_compress_file(obj2_target, obj2_source)
    assert os.path.isfile(obj1_target) is False
    assert os.path.isfile(obj2_target) is False
    assert os.path.isfile(obj1_source) is True
    assert os.path.isfile(obj2_source) is True

    with aioresponses() as mocked_responses:
        mocked_responses.get(
            f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
            payload={
                "presigned_url": f"https://dummyurl.com/{obj_id_1}?params=1",
                "namespace": ds.namespace,
                "obj_id": obj_id_1,
                "dataset": ds.name
            },
            status=200)
        with open(obj1_source, 'rb') as data1:
            mocked_responses.get(f"https://dummyurl.com/{obj_id_1}?params=1",
                                 body=data1.read(),
                                 status=200,
                                 content_type='application/octet-stream')

        mocked_responses.get(
            f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_2}',
            payload={
                "presigned_url": f"https://dummyurl.com/{obj_id_2}?params=1",
                "namespace": ds.namespace,
                "obj_id": obj_id_2,
                "dataset": ds.name
            },
            status=200)
        with open(obj2_source, 'rb') as data2:
            mocked_responses.get(f"https://dummyurl.com/{obj_id_2}?params=1",
                                 body=data2.read(),
                                 status=200,
                                 content_type='application/octet-stream')

        assert len(glob.glob(f'{iom.push_dir}/*')) == 1
        iom.dataset.backend.set_default_configuration("test-user", "abcd", '1234')

        result = iom.pull_objects(keys=["test1.txt"])
        assert len(glob.glob(f'{iom.push_dir}/*')) == 1
        assert len(result.success) == 1
        assert len(result.failure) == 0
        assert result.success[0].object_path == obj_to_push[0].object_path
        assert os.path.isfile(obj1_target) is True
        assert os.path.isfile(obj2_target) is False
        with open(obj1_target, 'rt') as dd:
            assert "test content 1" == dd.read()

        result = iom.pull_objects(keys=["test2.txt"])
        assert len(glob.glob(f'{iom.push_dir}/*')) == 1
        assert len(result.success) == 1
        assert len(result.failure) == 0
        assert result.success[0].object_path == obj_to_push[1].object_path
        assert os.path.isfile(obj1_target) is True
        assert os.path.isfile(obj2_target) is True
        with open(obj1_target, 'rt') as dd:
            assert "test content 1" == dd.read()
        with open(obj2_target, 'rt') as dd:
            assert "test content 2" == dd.read()
def test__get_pull_all_keys(self, mock_dataset_with_manifest):
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "other_dir/test3.txt", "dummy content")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content 2")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 3
    obj3 = obj_to_push[0].object_path
    obj1 = obj_to_push[1].object_path
    obj2 = obj_to_push[2].object_path

    rev_dir = os.path.join(manifest.cache_mgr.cache_root, manifest.dataset_revision)
    file3 = os.path.join(rev_dir, obj_to_push[0].dataset_path)
    file1 = os.path.join(rev_dir, obj_to_push[1].dataset_path)
    file2 = os.path.join(rev_dir, obj_to_push[2].dataset_path)

    assert os.path.exists(obj1) is True
    assert os.path.exists(obj2) is True
    assert os.path.exists(obj3) is True
    assert os.path.exists(file1) is True
    assert os.path.exists(file2) is True
    assert os.path.exists(file3) is True

    result = iom._get_pull_all_keys()
    assert len(result) == 0

    # Completely remove the other_dir/test3.txt object
    os.remove(obj3)
    os.remove(file3)

    # Remove the link for test1.txt; it should relink automatically and not need to be pulled
    os.remove(file1)

    assert os.path.exists(obj1) is True
    assert os.path.exists(obj2) is True
    assert os.path.exists(obj3) is False
    assert os.path.exists(file1) is False
    assert os.path.exists(file2) is True
    assert os.path.exists(file3) is False

    result = iom._get_pull_all_keys()
    assert len(result) == 1
    assert result[0] == 'other_dir/test3.txt'

    assert os.path.exists(obj1) is True
    assert os.path.exists(obj2) is True
    assert os.path.exists(obj3) is False
    assert os.path.exists(file1) is True
    assert os.path.exists(file2) is True
    assert os.path.exists(file3) is False
def test_init(self, mock_dataset_with_manifest):
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)

    assert isinstance(iom, IOManager)
    assert isinstance(iom.push_dir, str)
def download_dataset_files(logged_in_username: str, access_token: str, id_token: str,
                           dataset_owner: str, dataset_name: str,
                           labbook_owner: Optional[str] = None,
                           labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False,
                           keys: Optional[List[str]] = None,
                           config_file: str = None) -> None:
    """Method to download files from a dataset in the background and provide status to the UI.

    This job schedules `pull_objects` jobs after splitting up the download work into batches. At the end, the
    job removes any partially downloaded files (due to failures) and links all the files for the dataset.

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    dispatcher_obj = Dispatcher()

    def update_feedback(msg: str, has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if has_failures:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete:
            current_job.meta['percent_complete'] = percent_complete

        current_job.meta['feedback'] = msg
        current_job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
                    f" dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")
        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load the repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # This is a normal dataset, load the repo from the working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        key_batches, total_bytes, num_files = iom.compute_pull_batches(keys, pull_all=all_keys)

        failure_keys = list()
        if key_batches:
            # Schedule jobs for batches (loop variable renamed from `keys` to avoid shadowing the parameter)
            bg_jobs = list()
            for key_batch in key_batches:
                job_kwargs = {
                    'keys': key_batch,
                    'logged_in_username': logged_in_username,
                    'access_token': access_token,
                    'id_token': id_token,
                    'dataset_owner': dataset_owner,
                    'dataset_name': dataset_name,
                    'labbook_owner': labbook_owner,
                    'labbook_name': labbook_name,
                    'config_file': config_file,
                }
                job_metadata = {
                    'dataset': f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                    'method': 'pull_objects'
                }

                job_key = dispatcher_obj.dispatch_task(method_reference=pull_objects,
                                                       kwargs=job_kwargs,
                                                       metadata=job_metadata,
                                                       persist=True)
                bg_jobs.append(BackgroundDownloadJob(dispatcher_obj, key_batch, job_key))

            update_feedback(f"Please wait - Downloading {num_files} files ({format_size(total_bytes)})"
                            f" - 0% complete", percent_complete=0, has_failures=False)
            logger.info(f"(Job {p}) Starting file downloads for"
                        f" {logged_in_username}/{dataset_owner}/{dataset_name} with {len(key_batches)} jobs")

            while sum([(x.is_complete or x.is_failed) for x in bg_jobs]) != len(bg_jobs):
                # Refresh all job statuses and update status feedback
                [j.refresh_status() for j in bg_jobs]
                total_completed_bytes = sum([j.completed_bytes for j in bg_jobs])
                pc = (float(total_completed_bytes) / float(total_bytes)) * 100
                update_feedback(f"Please wait - Downloading {num_files} files"
                                f" ({format_size(total_completed_bytes)} of {format_size(total_bytes)})"
                                f" - {round(pc)}% complete",
                                percent_complete=pc)
                time.sleep(1)

            # Aggregate failures if they exist
            for j in bg_jobs:
                if j.is_failed:
                    # The whole job failed; assume the entire batch should get re-downloaded for now
                    failure_keys.extend(j.keys)
                else:
                    failure_keys.extend(j.get_failed_keys())

        # Set final status for the UI
        if len(failure_keys) == 0:
            update_feedback("Download complete!", percent_complete=100, has_failures=False)
        else:
            failure_str = ""
            for f in failure_keys:
                # If any failed files were partially downloaded, remove them
                abs_dataset_path = os.path.join(m.current_revision_dir, f)
                abs_object_path = m.dataset_to_object_path(f)
                if os.path.exists(abs_dataset_path):
                    os.remove(abs_dataset_path)
                if os.path.exists(abs_object_path):
                    os.remove(abs_object_path)
                failure_str = f"{failure_str}{f}\n"

            failure_detail_str = f"Files that failed to download:\n{failure_str}"
            update_feedback("", has_failures=True, failure_detail=failure_detail_str)

        # Link dataset files, so anything that was successfully pulled will materialize
        m.link_revision()

        if len(failure_keys) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            raise IOError(f"{len(failure_keys)} file(s) failed to download. Check message detail"
                          " and try again.")
    except Exception as err:
        logger.exception(err)
        raise
def pull_objects(keys: List[str], logged_in_username: str, access_token: str, id_token: str,
                 dataset_owner: str, dataset_name: str,
                 labbook_owner: Optional[str] = None,
                 labbook_name: Optional[str] = None,
                 config_file: str = None) -> None:
    """Method to pull a collection of objects from a dataset's backend.

    This runs the IOManager.pull_objects() method with `link_revision=False`, because this job can run in
    parallel multiple times with different sets of keys. You don't want to link until the very end, which is
    handled by the `download_dataset_files` job that scheduled this one.

    Args:
        keys: List of file keys to download
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    def progress_update_callback(completed_bytes: int) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if 'completed_bytes' not in current_job.meta:
            current_job.meta['completed_bytes'] = 0

        current_job.meta['completed_bytes'] = int(current_job.meta['completed_bytes']) + completed_bytes
        current_job.save_meta()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting pull_objects(logged_in_username={logged_in_username},"
                    f" dataset_owner={dataset_owner}, dataset_name={dataset_name},"
                    f" labbook_owner={labbook_owner}, labbook_name={labbook_name}")

        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load the repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # This is a normal dataset, load the repo from the working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        result = iom.pull_objects(keys=keys,
                                  progress_update_fn=progress_update_callback,
                                  link_revision=False)

        job = get_current_job()
        if job:
            job.meta['failure_keys'] = ",".join([x.dataset_path for x in result.failure])
            job.meta['message'] = result.message
            job.save_meta()
    except Exception as err:
        logger.exception(err)
        raise
def test_push_objects_deduplicate(self, mock_dataset_with_manifest, mock_dataset_head):
    ds, manifest, working_dir = mock_dataset_with_manifest
    iom = IOManager(ds, manifest)
    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content dup")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test3.txt", "test content dup")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 3
    _, obj1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj2 = obj_to_push[1].object_path.rsplit('/', 1)
    _, obj3 = obj_to_push[2].object_path.rsplit('/', 1)
    # test2.txt and test3.txt have identical content, so they share the same object
    assert obj1 != obj2
    assert obj2 == obj3

    with aioresponses() as mocked_responses:
        mocked_responses.put(
            f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj1}',
            payload={
                "presigned_url": f"https://dummyurl.com/{obj1}?params=1",
                "namespace": ds.namespace,
                "key_id": "hghghg",
                "obj_id": obj1,
                "dataset": ds.name
            },
            status=200)
        mocked_responses.put(f"https://dummyurl.com/{obj1}?params=1",
                             headers={'Etag': 'asdfasdf'},
                             status=200)

        mocked_responses.put(
            f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj2}',
            payload={
                "presigned_url": f"https://dummyurl.com/{obj2}?params=1",
                "namespace": ds.namespace,
                "key_id": "hghghg",
                "obj_id": obj2,
                "dataset": ds.name
            },
            status=200)
        mocked_responses.put(f"https://dummyurl.com/{obj2}?params=1",
                             headers={'Etag': '12341234'},
                             status=200)

        assert len(glob.glob(f'{iom.push_dir}/*')) == 1
        iom.dataset.backend.set_default_configuration("test-user", "abcd", '1234')

        obj_to_push = iom.objects_to_push(remove_duplicates=True)
        result = iom.push_objects(obj_to_push, chunk_update_callback)
        assert len(glob.glob(f'{iom.push_dir}/*')) == 1

        assert len(result.success) == 2
        assert len(result.failure) == 0
        assert isinstance(result, PushResult) is True
        assert isinstance(result.success[0], PushObject) is True
        assert result.success[0].object_path != result.success[1].object_path
        assert result.success[0].object_path in [obj_to_push[0].object_path, obj_to_push[1].object_path]
        assert result.success[1].object_path in [obj_to_push[0].object_path, obj_to_push[1].object_path]
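# `chunk_update_callback`, passed to `push_objects` above, is defined elsewhere
# in the test module. Based on the `progress_update_fn` hooks used by the
# dispatcher jobs (they receive a count of completed bytes), a plausible
# minimal stand-in -- an assumption, since the test doesn't assert on progress:
def chunk_update_callback(completed_bytes: int) -> None:
    """No-op per-chunk progress callback for tests that ignore progress."""
    pass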
def test_download_dataset_files_file_fail(self, mock_config_file_background_tests):
    def dispatch_query_mock(self, job_key):
        # Mock the job actually running and returning status
        JobStatus = namedtuple("JobStatus", ['status', 'meta'])
        return JobStatus(status='finished',
                         meta={'completed_bytes': '0', 'failure_keys': 'test1.txt'})

    def dispatch_mock(self, method_reference, kwargs, metadata, persist):
        gtmcore.dispatcher.dataset_jobs.pull_objects(**kwargs)
        return "afakejobkey"

    im = InventoryManager(mock_config_file_background_tests[0])
    ds = im.create_dataset('default', 'default', "dataset100",
                           storage_type="gigantum_object_v1", description="100")
    m = Manifest(ds, 'default')
    iom = IOManager(ds, m)

    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "asdfadfsdf")
    m.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 1
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path
    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)

    assert os.path.exists(obj1_target) is True
    helper_compress_file(obj1_target, obj1_source)
    assert os.path.isfile(obj1_target) is False
    assert os.path.isfile(obj1_source) is True

    # Clear out from linked dir
    os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt'))

    with patch.object(Configuration, 'find_default_config',
                      lambda self: mock_config_file_background_tests[0]):
        with patch.object(Dispatcher, 'dispatch_task', dispatch_mock):
            with patch.object(Dispatcher, 'query_task', dispatch_query_mock):
                dl_kwargs = {
                    'logged_in_username': "******",
                    'access_token': "asdf",
                    'id_token': "1234",
                    'dataset_owner': "default",
                    'dataset_name': "dataset100",
                    'labbook_owner': None,
                    'labbook_name': None,
                    'keys': ["test1.txt"],
                    'config_file': mock_config_file_background_tests[0]
                }

                with pytest.raises(IOError):
                    gtmcore.dispatcher.dataset_jobs.download_dataset_files(**dl_kwargs)

                assert os.path.isfile(obj1_target) is False
def test_download_dataset_files(self, mock_config_file_background_tests, mock_dataset_head):
    def dispatch_query_mock(self, job_key):
        JobStatus = namedtuple("JobStatus", ['status', 'meta'])
        return JobStatus(status='finished', meta={'completed_bytes': '500'})

    def dispatch_mock(self, method_reference, kwargs, metadata, persist):
        # Run the pull synchronously inside the mocked responses so the download succeeds
        with aioresponses() as mocked_responses:
            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj_id_1}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_1,
                    "dataset": ds.name
                },
                status=200)
            with open(obj1_source, 'rb') as data1:
                mocked_responses.get(f"https://dummyurl.com/{obj_id_1}?params=1",
                                     body=data1.read(),
                                     status=200,
                                     content_type='application/octet-stream')

            gtmcore.dispatcher.dataset_jobs.pull_objects(**kwargs)

        return "afakejobkey"

    im = InventoryManager(mock_config_file_background_tests[0])
    ds = im.create_dataset('default', 'default', "dataset100",
                           storage_type="gigantum_object_v1", description="100")
    m = Manifest(ds, 'default')
    iom = IOManager(ds, m)

    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "asdfadfsdf")
    m.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 1
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path
    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)

    assert os.path.exists(obj1_target) is True
    helper_compress_file(obj1_target, obj1_source)
    assert os.path.isfile(obj1_target) is False
    assert os.path.isfile(obj1_source) is True

    # Clear out from linked dir
    os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt'))

    with patch.object(Configuration, 'find_default_config',
                      lambda self: mock_config_file_background_tests[0]):
        with patch.object(Dispatcher, 'dispatch_task', dispatch_mock):
            with patch.object(Dispatcher, 'query_task', dispatch_query_mock):
                dl_kwargs = {
                    'logged_in_username': "******",
                    'access_token': "asdf",
                    'id_token': "1234",
                    'dataset_owner': "default",
                    'dataset_name': "dataset100",
                    'labbook_owner': None,
                    'labbook_name': None,
                    'keys': ["test1.txt"],
                    'config_file': mock_config_file_background_tests[0]
                }

                gtmcore.dispatcher.dataset_jobs.download_dataset_files(**dl_kwargs)
                assert os.path.isfile(obj1_target) is True

                decompressor = snappy.StreamDecompressor()
                with open(obj1_source, 'rb') as dd:
                    source1 = decompressor.decompress(dd.read())
                    source1 += decompressor.flush()
                with open(obj1_target, 'rt') as dd:
                    dest1 = dd.read()
                assert source1.decode("utf-8") == dest1
def test_pull_objects(self, mock_config_file, mock_dataset_head):
    im = InventoryManager(mock_config_file[0])
    ds = im.create_dataset('default', 'default', "dataset100",
                           storage_type="gigantum_object_v1", description="100")
    m = Manifest(ds, 'default')
    iom = IOManager(ds, m)

    os.makedirs(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, "other_dir"))
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test1.txt", "asdfadfsdf")
    helper_append_file(m.cache_mgr.cache_root, m.dataset_revision, "test2.txt", "fdsfgfd")
    m.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2
    _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
    obj1_target = obj_to_push[0].object_path
    obj2_target = obj_to_push[1].object_path
    obj1_source = os.path.join('/tmp', uuid.uuid4().hex)
    obj2_source = os.path.join('/tmp', uuid.uuid4().hex)

    assert os.path.exists(obj1_target) is True
    assert os.path.exists(obj2_target) is True
    helper_compress_file(obj1_target, obj1_source)
    helper_compress_file(obj2_target, obj2_source)
    assert os.path.isfile(obj1_target) is False
    assert os.path.isfile(obj2_target) is False
    assert os.path.isfile(obj1_source) is True
    assert os.path.isfile(obj2_source) is True

    # Clear out from linked dir
    os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test1.txt'))
    os.remove(os.path.join(m.cache_mgr.cache_root, m.dataset_revision, 'test2.txt'))

    with patch.object(Configuration, 'find_default_config', lambda self: mock_config_file[0]):
        with aioresponses() as mocked_responses:
            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj_id_1}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_1,
                    "dataset": ds.name
                },
                status=200)
            with open(obj1_source, 'rb') as data1:
                mocked_responses.get(f"https://dummyurl.com/{obj_id_1}?params=1",
                                     body=data1.read(),
                                     status=200,
                                     content_type='application/octet-stream')

            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_2}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj_id_2}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_2,
                    "dataset": ds.name
                },
                status=200)
            with open(obj2_source, 'rb') as data2:
                mocked_responses.get(f"https://dummyurl.com/{obj_id_2}?params=1",
                                     body=data2.read(),
                                     status=200,
                                     content_type='application/octet-stream')

            dl_kwargs = {
                'logged_in_username': "******",
                'access_token': "asdf",
                'id_token': "1234",
                'dataset_owner': "default",
                'dataset_name': "dataset100",
                'labbook_owner': None,
                'labbook_name': None,
                'keys': ["test1.txt"]
            }

            gtmcore.dispatcher.dataset_jobs.pull_objects(**dl_kwargs)

            # Manually link since this is disabled by default in the job (because in real use,
            # multiple jobs run in parallel and you only want to link once)
            m.link_revision()

            assert os.path.isfile(obj1_target) is True
            assert os.path.isfile(obj2_target) is False

            decompressor = snappy.StreamDecompressor()
            with open(obj1_source, 'rb') as dd:
                source1 = decompressor.decompress(dd.read())
                source1 += decompressor.flush()
            with open(obj1_target, 'rt') as dd:
                dest1 = dd.read()
            assert source1.decode("utf-8") == dest1

            # Download the other file
            dl_kwargs = {
                'logged_in_username': "******",
                'access_token': "asdf",
                'id_token': "1234",
                'dataset_owner': "default",
                'dataset_name': "dataset100",
                'labbook_owner': None,
                'labbook_name': None,
                'keys': ["test2.txt"]
            }

            gtmcore.dispatcher.dataset_jobs.pull_objects(**dl_kwargs)

            # Manually link since this is disabled by default in the job (because in real use,
            # multiple jobs run in parallel and you only want to link once)
            m.link_revision()

            assert os.path.isfile(obj1_target) is True
            assert os.path.isfile(obj2_target) is True

            with open(obj1_source, 'rb') as dd:
                source1 = decompressor.decompress(dd.read())
                source1 += decompressor.flush()
            with open(obj1_target, 'rt') as dd:
                dest1 = dd.read()
            assert source1.decode("utf-8") == dest1

            with open(obj2_source, 'rb') as dd:
                source1 = decompressor.decompress(dd.read())
                source1 += decompressor.flush()
            with open(obj2_target, 'rt') as dd:
                dest1 = dd.read()
            assert source1.decode("utf-8") == dest1
def test_push_objects(self, mock_config_file, mock_dataset_head):
    im = InventoryManager(mock_config_file[0])
    ds = im.create_dataset('default', 'default', "dataset100",
                           storage_type="gigantum_object_v1", description="100")
    manifest = Manifest(ds, 'default')
    iom = IOManager(ds, manifest)

    revision = manifest.dataset_revision
    os.makedirs(os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test1.txt", "test content 1")
    helper_append_file(manifest.cache_mgr.cache_root, revision, "test2.txt", "test content 2")
    manifest.sweep_all_changes()

    obj_to_push = iom.objects_to_push()
    assert len(obj_to_push) == 2
    _, obj1 = obj_to_push[0].object_path.rsplit('/', 1)
    _, obj2 = obj_to_push[1].object_path.rsplit('/', 1)

    with aioresponses() as mocked_responses:
        mocked_responses.put(
            f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj1}',
            payload={
                "presigned_url": f"https://dummyurl.com/{obj1}?params=1",
                "namespace": ds.namespace,
                "key_id": "hghghg",
                "obj_id": obj1,
                "dataset": ds.name
            },
            status=200)
        mocked_responses.put(f"https://dummyurl.com/{obj1}?params=1", payload={}, status=200)

        mocked_responses.put(
            f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj2}',
            payload={
                "presigned_url": f"https://dummyurl.com/{obj2}?params=1",
                "namespace": ds.namespace,
                "key_id": "hghghg",
                "obj_id": obj2,
                "dataset": ds.name
            },
            status=200)
        mocked_responses.put(f"https://dummyurl.com/{obj2}?params=1", payload={}, status=200)

        job_kwargs = {
            'objs': obj_to_push,
            'logged_in_username': "******",
            'access_token': "faketoken",
            'id_token': "faketoken",
            'dataset_owner': ds.namespace,
            'dataset_name': ds.name,
            'config_file': ds.client_config.config_file,
        }

        gtmcore.dispatcher.dataset_jobs.push_dataset_objects(**job_kwargs)
def push_dataset_objects(objs: List[PushObject], logged_in_username: str, access_token: str,
                         id_token: str, dataset_owner: str, dataset_name: str,
                         config_file: str = None) -> None:
    """Method to push a collection of objects to a dataset's backend

    Args:
        objs: List of PushObject instances to push
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to push
        dataset_name: Name of the dataset containing the files to push
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    def progress_update_callback(completed_bytes: int) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if 'completed_bytes' not in current_job.meta:
            current_job.meta['completed_bytes'] = 0

        current_job.meta['completed_bytes'] = int(current_job.meta['completed_bytes']) + completed_bytes
        current_job.save_meta()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting push_dataset_objects(logged_in_username={logged_in_username},"
                    f" dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        im = InventoryManager(config_file=config_file)
        ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        result = iom.push_objects(objs, progress_update_fn=progress_update_callback)

        job = get_current_job()
        if job:
            job.meta['failures'] = ",".join([f"{x.object_path}|{x.dataset_path}|{x.revision}"
                                             for x in result.failure])
            job.meta['message'] = result.message
            job.save_meta()
    except Exception as err:
        logger.exception(err)
        raise
def download_dataset_files(logged_in_username: str, access_token: str, id_token: str,
                           dataset_owner: str, dataset_name: str,
                           labbook_owner: Optional[str] = None,
                           labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False,
                           keys: Optional[List[str]] = None):
    """Method to download files from a dataset in the background

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download

    Returns:
        None
    """
    def update_meta(msg):
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
                    f" dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager()
        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load the repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # This is a normal dataset, load the repo from the working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        if all_keys:
            result = iom.pull_all(status_update_fn=update_meta)
        elif keys:
            result = iom.pull_objects(keys=keys, status_update_fn=update_meta)
        else:
            raise ValueError("Must provide a list of keys or set all_keys=True")

        # Save the success/failure keys to the job metadata so the UI can re-fetch as needed
        job = get_current_job()
        if job:
            job.meta['success_keys'] = [x.dataset_path for x in result.success]
            job.meta['failure_keys'] = [x.dataset_path for x in result.failure]
            job.save_meta()

        if len(result.failure) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            sys.exit(-1)
    except Exception as err:
        logger.exception(err)
        raise