def test_load_dataset_from_file(self, mock_config_file):
    """A dataset moved to an arbitrary directory can still be loaded from that
    directory, while a directory that is not a dataset raises InventoryException."""
    inv_manager = InventoryManager(mock_config_file[0])
    author = GitAuthor(name="username", email="*****@*****.**")
    ds = inv_manager.create_dataset("test", "test", "dataset1", "gigantum_object_v1",
                                    description="my first dataset", author=author)

    with tempfile.TemporaryDirectory() as tempdir:
        # shutil.move returns the destination path of the relocated repo
        moved_root = shutil.move(ds.root_dir, tempdir)
        inv_manager.load_dataset_from_directory(moved_root)

        # Test failing case - invalid dir
        with pytest.raises(InventoryException):
            inv_manager.load_dataset_from_directory('/tmp')
def mutate_and_get_payload(cls, root, info, dataset_owner, dataset_name, labbook_name=None,
                           labbook_owner=None, all_keys=None, keys=None, client_mutation_id=None):
    """Schedule a background job that downloads dataset files.

    Loads the dataset (either linked inside a Project or standalone) to validate it
    exists, then dispatches `download_dataset_files` and returns its job key.
    """
    logged_in_username = get_logged_in_username()
    im = InventoryManager()

    lb = None
    if labbook_name:
        # This is a linked dataset, load repo from the Project
        lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
        dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                   dataset_owner, dataset_name)
        ds = im.load_dataset_from_directory(dataset_dir)
    else:
        # this is a normal dataset. Load repo from working dir
        ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

    dl_kwargs = {
        'logged_in_username': logged_in_username,
        'access_token': flask.g.access_token,
        'id_token': flask.g.id_token,
        'dataset_owner': dataset_owner,
        'dataset_name': dataset_name,
        'labbook_owner': labbook_owner,
        'labbook_name': labbook_name,
        'all_keys': all_keys,
        'keys': keys,
    }

    # Gen unique keys for tracking jobs
    lb_key = f"{logged_in_username}|{labbook_owner}|{labbook_name}" if lb else None
    ds_key = f"{logged_in_username}|{dataset_owner}|{dataset_name}"
    if lb_key:
        ds_key = f"{lb_key}|LINKED|{ds_key}"

    metadata = {
        'dataset': ds_key,
        'labbook': lb_key,
        'method': 'download_dataset_files',
    }

    dispatcher = Dispatcher()
    res = dispatcher.dispatch_task(jobs.download_dataset_files,
                                   kwargs=dl_kwargs, metadata=metadata)

    return DownloadDatasetFiles(background_job_key=res.key_str)
def test_put_dataset(self, mock_config_file):
    """A dataset moved out of the working directory disappears from the inventory
    and can be placed back via put_dataset."""
    inv_manager = InventoryManager(mock_config_file[0])
    author = GitAuthor(name="username", email="*****@*****.**")
    ds = inv_manager.create_dataset("test", "test", "dataset1", "gigantum_object_v1",
                                    description="my first dataset", author=author)
    ds.namespace = 'test'
    orig_location = ds.root_dir

    with tempfile.TemporaryDirectory() as tempdir:
        # Relocate the dataset repo outside of the inventory working dir
        moved_root = shutil.move(ds.root_dir, tempdir)
        inv_manager.load_dataset_from_directory(moved_root)

        assert not os.path.exists(orig_location)
        assert orig_location not in [d.root_dir for d in inv_manager.list_datasets('test')]

        placed_ds = inv_manager.put_dataset(moved_root, 'test', 'test')
        assert placed_ds.root_dir in [d.root_dir for d in inv_manager.list_datasets('test')]
def verify_dataset_contents(logged_in_username: str, access_token: str, id_token: str,
                            dataset_owner: str, dataset_name: str,
                            labbook_owner: Optional[str] = None,
                            labbook_name: Optional[str] = None) -> None:
    """Method to update/populate an unmanaged dataset from its local state

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked

    Returns:
        None
    """
    job = get_current_job()

    def update_meta(msg):
        """Append a feedback message to the current job's metadata for the UI."""
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting verify_dataset_contents(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name},"
            f"labbook_owner={labbook_owner}, labbook_name={labbook_name}")

        im = InventoryManager()
        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)

        result = ds.backend.verify_contents(ds, update_meta)

        # Fix: the original assigned job.meta unconditionally, which raised
        # AttributeError when run outside a worker (job is None), and never called
        # save_meta(), so the result was not persisted to the job store.
        if job:
            job.meta['modified_keys'] = result
            job.save_meta()
    except Exception as err:
        logger.exception(err)
        raise
def download_dataset_files(logged_in_username: str, access_token: str, id_token: str,
                           dataset_owner: str, dataset_name: str,
                           labbook_owner: Optional[str] = None, labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False, keys: Optional[List[str]] = None,
                           config_file: str = None) -> None:
    """Method to download files from a dataset in the background and provide status to the UI.

    This job schedules `pull_objects` jobs after splitting up the download work into batches. At the end, the job
    removes any partially downloaded files (due to failures) and links all the files for the dataset.

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List if file keys to download
        config_file: config file (used for test mocking)

    Raises:
        IOError: if any file failed to download, so the job exits non-zero for the UI

    Returns:
        None
    """
    dispatcher_obj = Dispatcher()

    def update_feedback(msg: str, has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        # Fix: test against None, not truthiness, so explicit `has_failures=False`
        # and `percent_complete=0` values are recorded instead of silently dropped.
        if has_failures is not None:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete is not None:
            current_job.meta['percent_complete'] = percent_complete
        current_job.meta['feedback'] = msg
        current_job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
            f" dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
            f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager(config_file=config_file)
        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        key_batches, total_bytes, num_files = iom.compute_pull_batches(keys, pull_all=all_keys)

        failure_keys = list()
        if key_batches:
            # Schedule jobs for batches.
            # Fix: loop variable renamed so it no longer shadows the `keys` parameter.
            bg_jobs = list()
            for key_batch in key_batches:
                job_kwargs = {
                    'keys': key_batch,
                    'logged_in_username': logged_in_username,
                    'access_token': access_token,
                    'id_token': id_token,
                    'dataset_owner': dataset_owner,
                    'dataset_name': dataset_name,
                    'labbook_owner': labbook_owner,
                    'labbook_name': labbook_name,
                    'config_file': config_file,
                }
                job_metadata = {
                    'dataset': f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                    'method': 'pull_objects'
                }
                job_key = dispatcher_obj.dispatch_task(method_reference=pull_objects,
                                                       kwargs=job_kwargs,
                                                       metadata=job_metadata,
                                                       persist=True)
                bg_jobs.append(BackgroundDownloadJob(dispatcher_obj, key_batch, job_key))

            update_feedback(f"Please wait - Downloading {num_files} files ({format_size(total_bytes)}) - 0% complete",
                            percent_complete=0, has_failures=False)
            logger.info(
                f"(Job {p}) Starting file downloads for"
                f" {logged_in_username}/{dataset_owner}/{dataset_name} with {len(key_batches)} jobs")

            while sum([(x.is_complete or x.is_failed) for x in bg_jobs]) != len(bg_jobs):
                # Refresh all job statuses and update status feedback
                for j in bg_jobs:
                    j.refresh_status()
                total_completed_bytes = sum([j.completed_bytes for j in bg_jobs])
                pc = (float(total_completed_bytes) / float(total_bytes)) * 100
                update_feedback(f"Please wait - Downloading {num_files} files ({format_size(total_completed_bytes)} of "
                                f"{format_size(total_bytes)}) - {round(pc)}% complete",
                                percent_complete=pc)
                time.sleep(1)

            # Aggregate failures if they exist
            for j in bg_jobs:
                if j.is_failed:
                    # Whole job failed...assume entire batch should get re-uploaded for now
                    failure_keys.extend(j.keys)
                else:
                    failure_keys.extend(j.get_failed_keys())

        # Set final status for UI
        if len(failure_keys) == 0:
            update_feedback(f"Download complete!", percent_complete=100, has_failures=False)
        else:
            failure_str = ""
            for f in failure_keys:
                # If any failed files partially downloaded, remove them.
                abs_dataset_path = os.path.join(m.current_revision_dir, f)
                abs_object_path = m.dataset_to_object_path(f)
                if os.path.exists(abs_dataset_path):
                    os.remove(abs_dataset_path)
                if os.path.exists(abs_object_path):
                    os.remove(abs_object_path)
                failure_str = f"{failure_str}{f}\n"

            failure_detail_str = f"Files that failed to download:\n{failure_str}"
            update_feedback("", has_failures=True, failure_detail=failure_detail_str)

        # Link dataset files, so anything that was successfully pulled will materialize
        m.link_revision()

        if len(failure_keys) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            raise IOError(
                f"{len(failure_keys)} file(s) failed to download. Check message detail and try again.")
    except Exception as err:
        logger.exception(err)
        raise
def pull_objects(keys: List[str], logged_in_username: str, access_token: str, id_token: str,
                 dataset_owner: str, dataset_name: str, labbook_owner: Optional[str] = None,
                 labbook_name: Optional[str] = None, config_file: str = None) -> None:
    """Method to pull a collection of objects from a dataset's backend.

    This runs the IOManager.pull_objects() method with `link_revision=False`. This is because
    this job can be run in parallel multiple times with different sets of keys. You don't want
    to link until the very end, which is handled in the `download_dataset_files` job, which is
    what scheduled this job.

    Args:
        keys: List if file keys to download
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    def progress_update_callback(completed_bytes: int) -> None:
        """Accumulate downloaded byte counts into the job's metadata for UI feedback."""
        current_job = get_current_job()
        if not current_job:
            return
        if 'completed_bytes' not in current_job.meta:
            current_job.meta['completed_bytes'] = 0
        current_job.meta['completed_bytes'] = (int(current_job.meta['completed_bytes'])
                                              + completed_bytes)
        current_job.save_meta()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting pull_objects(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
            f" labbook_name={labbook_name}")

        inventory = InventoryManager(config_file=config_file)
        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            project = inventory.load_labbook(logged_in_username, labbook_owner, labbook_name)
            linked_dir = os.path.join(project.root_dir, '.gigantum', 'datasets',
                                      dataset_owner, dataset_name)
            ds = inventory.load_dataset_from_directory(linked_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = inventory.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)

        manifest = Manifest(ds, logged_in_username)
        io_manager = IOManager(ds, manifest)
        result = io_manager.pull_objects(keys=keys,
                                         progress_update_fn=progress_update_callback,
                                         link_revision=False)

        current_job = get_current_job()
        if current_job:
            current_job.meta['failure_keys'] = ",".join([x.dataset_path for x in result.failure])
            current_job.meta['message'] = result.message
            current_job.save_meta()
    except Exception as err:
        logger.exception(err)
        raise
def test_verify_contents_linked_dataset(self, mock_dataset_with_local_dir):
    """verify_dataset_contents should report files modified on disk for a dataset
    linked into a Project, surfacing results through the mocked job metadata."""
    class JobMock():
        def __init__(self):
            self.meta = dict()

        def save_meta(self):
            pass

    CURRENT_JOB = JobMock()

    def get_current_job_mock():
        return CURRENT_JOB

    with patch('gtmcore.dispatcher.jobs.get_current_job', side_effect=get_current_job_mock):
        ds = mock_dataset_with_local_dir[0]
        im = InventoryManager()
        ds.backend.update_from_remote(ds, lambda x: print(x))

        m = Manifest(ds, 'tester')
        assert len(m.manifest.keys()) == 4
        revision_dir = os.path.join(m.cache_mgr.cache_root, m.dataset_revision)
        assert os.path.isfile(os.path.join(revision_dir, 'test1.txt'))
        assert os.path.isfile(os.path.join(revision_dir, 'test2.txt'))
        assert os.path.isfile(os.path.join(revision_dir, 'subdir', 'test3.txt'))

        # Nothing has changed yet, so verification should find no modified files
        modified_items = ds.backend.verify_contents(ds, lambda x: print(x))
        assert len(modified_items) == 0

        # Link the dataset into a Project and reload it from the linked location
        lb = im.create_labbook("tester", "tester", 'test-labbook')
        im.link_dataset_to_labbook(f"{ds.root_dir}/.git", "tester", ds.name, lb)
        dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', 'tester', ds.name)
        ds = im.load_dataset_from_directory(dataset_dir)

        # Mutate a file on disk so verification detects it
        test_dir = os.path.join(mock_dataset_with_local_dir[1], "local_data", "test_dir")
        with open(os.path.join(test_dir, 'test1.txt'), 'wt') as tf:
            tf.write("This file got changed in the filesystem")

        kwargs = {
            'logged_in_username': "******",
            'access_token': "asdf",
            'id_token': "1234",
            'dataset_owner': "tester",
            'dataset_name': 'dataset-1',
            'labbook_owner': "tester",
            'labbook_name': 'test-labbook'
        }
        jobs.verify_dataset_contents(**kwargs)
        job = gtmcore.dispatcher.jobs.get_current_job()

        assert 'modified_keys' in job.meta
        assert job.meta['modified_keys'] == ["test1.txt"]
        assert 'Validating contents of 3 files.' in job.meta['feedback']
def download_dataset_files(logged_in_username: str, access_token: str, id_token: str,
                           dataset_owner: str, dataset_name: str,
                           labbook_owner: Optional[str] = None, labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False, keys: Optional[List[str]] = None):
    """Method to download files from a dataset in the background and provide status to the UI.

    (Docstring fixed: the original said "import a dataset from a zip file" and claimed a
    string return value — both copy/paste errors; this job downloads files and returns None.)

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List if file keys to download

    Returns:
        None
    """
    def update_meta(msg):
        """Append a feedback message to the current job's metadata for the UI."""
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager()
        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        if all_keys:
            result = iom.pull_all(status_update_fn=update_meta)
        elif keys:
            result = iom.pull_objects(keys=keys, status_update_fn=update_meta)
        else:
            raise ValueError("Must provide a list of keys or set all_keys=True")

        # Save the downloaded file keys to the job metadata so the UI can re-fetch as needed
        job = get_current_job()
        if job:
            job.meta['success_keys'] = [x.dataset_path for x in result.success]
            job.meta['failure_keys'] = [x.dataset_path for x in result.failure]
            job.save_meta()

        if len(result.failure) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error.
            # NOTE(review): SystemExit is not caught by the `except Exception` below,
            # so this propagates as intended.
            sys.exit(-1)
    except Exception as err:
        logger.exception(err)
        raise