def test_check_and_import_dataset(self, mock_config_file):
    im = InventoryManager(mock_config_file[0])
    ds = im.create_dataset('default', 'default', "dataset100",
                           storage_type="gigantum_object_v1", description="100")

    # Fake publish to a local bare repo
    _MOCK_create_remote_repo2(ds, 'test', None, None)
    remote_url = ds.remote

    im.delete_dataset('default', 'default', "dataset100")

    with pytest.raises(InventoryException):
        im.load_dataset('default', 'default', "dataset100")

    kwargs = {
        'logged_in_username': "******",
        'dataset_owner': "default",
        'dataset_name': "dataset100",
        'remote_url': remote_url,
        'config_file': mock_config_file[0]
    }

    gtmcore.dispatcher.dataset_jobs.check_and_import_dataset(**kwargs)

    ds = im.load_dataset('default', 'default', "dataset100")
    assert ds.name == 'dataset100'
    assert ds.namespace == 'default'
def clean_dataset_file_cache(logged_in_username: str, dataset_owner: str, dataset_name: str,
                             cache_location: str, config_file: str = None) -> None:
    """Method to clean a dataset's file cache after the dataset has been deleted, provided the dataset is
    no longer referenced by any project

    Args:
        logged_in_username: username for the currently logged in user
        dataset_owner: Owner of the dataset
        dataset_name: Name of the dataset
        cache_location: Absolute path to the file cache (inside the container) for this dataset
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()
    p = os.getpid()
    try:
        logger.info(f"(Job {p}) Starting clean_dataset_file_cache(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        im = InventoryManager(config_file=config_file)

        # Check for dataset
        try:
            im.load_dataset(logged_in_username, dataset_owner, dataset_name)
            logger.info(f"{logged_in_username}/{dataset_owner}/{dataset_name} still exists. "
                        f"Skipping file cache clean.")
            return
        except InventoryException:
            # Dataset not found, move along
            pass

        # Check for submodule references
        for lb in im.list_labbooks(logged_in_username):
            for ds in im.get_linked_datasets(lb):
                if ds.namespace == dataset_owner and ds.name == dataset_name:
                    logger.info(f"{logged_in_username}/{dataset_owner}/{dataset_name} still referenced by {str(lb)}."
                                f" Skipping file cache clean.")
                    return

        # If you get here the dataset no longer exists and is not used by any projects, clear files
        shutil.rmtree(cache_location)

    except Exception as err:
        logger.error(f"(Job {p}) Error in clean_dataset_file_cache job")
        logger.exception(err)
        raise
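# Hedged usage sketch (not from the source): clean_dataset_file_cache is written as a background
# job, so a caller would normally schedule it through the Dispatcher after deleting a dataset
# rather than invoking it directly. The dispatch_task() call mirrors the other jobs in this
# section; the helper name, the metadata dict, and how cache_location is obtained are
# illustrative assumptions, not the project's actual wiring.
def _example_schedule_cache_clean(logged_in_username, dataset_owner, dataset_name,
                                  cache_location, config_file=None):
    d = Dispatcher()
    job_kwargs = {
        'logged_in_username': logged_in_username,
        'dataset_owner': dataset_owner,
        'dataset_name': dataset_name,
        'cache_location': cache_location,
        'config_file': config_file
    }
    job_metadata = {
        'dataset': f"{logged_in_username}|{dataset_owner}|{dataset_name}",
        'method': 'clean_dataset_file_cache'
    }
    # Schedule the job; the returned key can be used to track it, as in the mutations below
    return d.dispatch_task(clean_dataset_file_cache, kwargs=job_kwargs, metadata=job_metadata)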
def test_pagination_sort_modified(self, fixture_working_dir_dataset_populated_scoped, snapshot):
    query = """
            {
            datasetList{
                localDatasets(orderBy: "modified_on", sort: "desc") {
                    edges {
                        node {
                            id
                            name
                            description
                        }
                        cursor
                    }
                    pageInfo {
                        hasNextPage
                        hasPreviousPage
                    }
                }
            }
            }
            """
    snapshot.assert_match(fixture_working_dir_dataset_populated_scoped[2].execute(query))

    im = InventoryManager(fixture_working_dir_dataset_populated_scoped[0])
    ds = im.load_dataset("default", "default", "dataset4")
    with open(os.path.join(ds.root_dir, "test.txt"), 'wt') as tf:
        tf.write("asdfasdf")
    ds.git.add_all()
    ds.git.commit("Changing the repo")

    # Run query again
    snapshot.assert_match(fixture_working_dir_dataset_populated_scoped[2].execute(query))
def test_list_datasets_modified_on(self, mock_config_file):
    """Test listing datasets sorted by modified date"""
    inv_manager = InventoryManager(mock_config_file[0])
    inv_manager.create_dataset("user1", "user1", "dataset2", "gigantum_object_v1", description="my dataset")
    time.sleep(1)
    inv_manager.create_dataset("user1", "user2", "a-dataset3", "gigantum_object_v1", description="my dataset")
    time.sleep(1)
    inv_manager.create_dataset("user1", "user1", "dataset12", "gigantum_object_v1", description="my dataset")
    time.sleep(1)
    inv_manager.create_dataset("user2", "user1", "dataset1", "gigantum_object_v1", description="my dataset")

    datasets = inv_manager.list_datasets(username="******", sort_mode="modified_on")
    assert len(datasets) == 3

    assert datasets[0].name == 'dataset2'
    assert datasets[1].name == 'a-dataset3'
    assert datasets[2].name == 'dataset12'

    # modify a repo
    time.sleep(1.2)
    ds = inv_manager.load_dataset('user1', 'user1', 'dataset2')
    with open(os.path.join(ds.root_dir, "manifest", "test.txt"), 'wt') as tf:
        tf.write("asdfasdf")
    ds.git.add_all()
    ds.git.commit("Changing the repo")

    datasets = inv_manager.list_datasets(username="******", sort_mode="modified_on")
    assert len(datasets) == 3

    assert datasets[0].name == 'a-dataset3'
    assert datasets[1].name == 'dataset12'
    assert datasets[2].name == 'dataset2'
def mutate_and_get_payload(cls, root, info, dataset_owner, dataset_name, labbook_name=None,
                           labbook_owner=None, all_keys=None, keys=None, client_mutation_id=None):
    logged_in_username = get_logged_in_username()

    lb = None
    im = InventoryManager()
    if labbook_name:
        # This is a linked dataset, load repo from the Project
        lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
        dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
        ds = im.load_dataset_from_directory(dataset_dir)
    else:
        # this is a normal dataset. Load repo from working dir
        ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

    d = Dispatcher()
    dl_kwargs = {
        'logged_in_username': logged_in_username,
        'access_token': flask.g.access_token,
        'id_token': flask.g.id_token,
        'dataset_owner': dataset_owner,
        'dataset_name': dataset_name,
        'labbook_owner': labbook_owner,
        'labbook_name': labbook_name,
        'all_keys': all_keys,
        'keys': keys
    }

    # Gen unique keys for tracking jobs
    lb_key = f"{logged_in_username}|{labbook_owner}|{labbook_name}" if lb else None
    ds_key = f"{logged_in_username}|{dataset_owner}|{dataset_name}"
    if lb_key:
        ds_key = f"{lb_key}|LINKED|{ds_key}"

    metadata = {
        'dataset': ds_key,
        'labbook': lb_key,
        'method': 'download_dataset_files'
    }

    res = d.dispatch_task(jobs.download_dataset_files, kwargs=dl_kwargs, metadata=metadata)

    return DownloadDatasetFiles(background_job_key=res.key_str)
def update_unmanaged_dataset_from_remote(logged_in_username: str, access_token: str, id_token: str,
                                         dataset_owner: str, dataset_name: str) -> None:
    """Method to update/populate an unmanaged dataset from its remote automatically

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download

    Returns:
        None
    """
    def update_meta(msg):
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting update_unmanaged_dataset_from_remote(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        im = InventoryManager()
        ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)

        if not isinstance(ds.backend, UnmanagedStorageBackend):
            raise ValueError("Can only auto-update unmanaged dataset types")

        if not ds.backend.can_update_from_remote:
            raise ValueError("Storage backend cannot update automatically from remote.")

        ds.backend.update_from_remote(ds, update_meta)

    except Exception as err:
        logger.exception(err)
        raise
def mock_legacy_dataset(mock_dataset_with_cache_dir):
    """A pytest fixture that imports the legacy dataset"""
    archive_path = os.path.join(resource_filename('gtmcore.dataset.tests', 'data'), 'test-legacy-dataset.zip')
    temp_path = os.path.join(tempfile.gettempdir(), 'test-legacy-dataset.zip')
    shutil.copyfile(archive_path, temp_path)
    conf_file = mock_dataset_with_cache_dir[0].client_config.config_file
    import_dataset_from_zip(archive_path=temp_path, username=USERNAME, owner=USERNAME, config_file=conf_file)

    im = InventoryManager()
    ds = im.load_dataset(USERNAME, USERNAME, 'test-legacy-dataset')
    m = Manifest(ds, USERNAME)

    # yield dataset, manifest, working_dir
    yield ds, m, mock_dataset_with_cache_dir[1]
def verify_dataset_contents(logged_in_username: str, access_token: str, id_token: str,
                            dataset_owner: str, dataset_name: str,
                            labbook_owner: Optional[str] = None, labbook_name: Optional[str] = None) -> None:
    """Method to verify a dataset's local contents, recording any modified file keys in the job metadata

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to verify
        dataset_name: Name of the dataset containing the files to verify
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked

    Returns:
        None
    """
    job = get_current_job()

    def update_meta(msg):
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting verify_dataset_contents(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name},"
                    f"labbook_owner={labbook_owner}, labbook_name={labbook_name}")

        im = InventoryManager()
        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)

        result = ds.backend.verify_contents(ds, update_meta)
        job.meta['modified_keys'] = result

    except Exception as err:
        logger.exception(err)
        raise
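# Hedged usage sketch (not from the source): verify_dataset_contents is a background job, so
# callers would typically schedule it through the Dispatcher rather than calling it directly.
# The kwargs and metadata below mirror the conventions used by the other dataset jobs and
# mutations in this section; the helper name is an illustrative assumption.
def _example_schedule_verify(logged_in_username, access_token, id_token, dataset_owner, dataset_name):
    d = Dispatcher()
    kwargs = {
        'logged_in_username': logged_in_username,
        'access_token': access_token,
        'id_token': id_token,
        'dataset_owner': dataset_owner,
        'dataset_name': dataset_name
    }
    metadata = {
        'dataset': f"{logged_in_username}|{dataset_owner}|{dataset_name}",
        'method': 'verify_dataset_contents'
    }
    job_response = d.dispatch_task(verify_dataset_contents, kwargs=kwargs, metadata=metadata)
    # The returned key can be handed back to the UI to poll job status and read `modified_keys`
    return job_response.key_str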
def download_dataset_files(logged_in_username: str, access_token: str, id_token: str,
                           dataset_owner: str, dataset_name: str,
                           labbook_owner: Optional[str] = None, labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False, keys: Optional[List[str]] = None,
                           config_file: str = None) -> None:
    """Method to download files from a dataset in the background and provide status to the UI.

    This job schedules `pull_objects` jobs after splitting up the download work into batches. At the end, the job
    removes any partially downloaded files (due to failures) and links all the files for the dataset.

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    dispatcher_obj = Dispatcher()

    def update_feedback(msg: str, has_failures: Optional[bool] = None, failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if has_failures:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete:
            current_job.meta['percent_complete'] = percent_complete

        current_job.meta['feedback'] = msg
        current_job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
                    f" dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        key_batches, total_bytes, num_files = iom.compute_pull_batches(keys, pull_all=all_keys)

        failure_keys = list()
        if key_batches:
            # Schedule jobs for batches
            bg_jobs = list()
            for keys in key_batches:
                job_kwargs = {
                    'keys': keys,
                    'logged_in_username': logged_in_username,
                    'access_token': access_token,
                    'id_token': id_token,
                    'dataset_owner': dataset_owner,
                    'dataset_name': dataset_name,
                    'labbook_owner': labbook_owner,
                    'labbook_name': labbook_name,
                    'config_file': config_file,
                }
                job_metadata = {
                    'dataset': f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                    'method': 'pull_objects'
                }

                job_key = dispatcher_obj.dispatch_task(method_reference=pull_objects,
                                                       kwargs=job_kwargs,
                                                       metadata=job_metadata,
                                                       persist=True)
                bg_jobs.append(BackgroundDownloadJob(dispatcher_obj, keys, job_key))

            update_feedback(f"Please wait - Downloading {num_files} files ({format_size(total_bytes)}) - 0% complete",
                            percent_complete=0,
                            has_failures=False)
            logger.info(f"(Job {p}) Starting file downloads for"
                        f" {logged_in_username}/{dataset_owner}/{dataset_name} with {len(key_batches)} jobs")

            while sum([(x.is_complete or x.is_failed) for x in bg_jobs]) != len(bg_jobs):
                # Refresh all job statuses and update status feedback
                [j.refresh_status() for j in bg_jobs]
                total_completed_bytes = sum([j.completed_bytes for j in bg_jobs])
                pc = (float(total_completed_bytes) / float(total_bytes)) * 100
                update_feedback(f"Please wait - Downloading {num_files} files ({format_size(total_completed_bytes)} of "
                                f"{format_size(total_bytes)}) - {round(pc)}% complete",
                                percent_complete=pc)
                time.sleep(1)

            # Aggregate failures if they exist
            for j in bg_jobs:
                if j.is_failed:
                    # Whole job failed...assume entire batch should get re-uploaded for now
                    failure_keys.extend(j.keys)
                else:
                    failure_keys.extend(j.get_failed_keys())

        # Set final status for UI
        if len(failure_keys) == 0:
            update_feedback(f"Download complete!", percent_complete=100, has_failures=False)
        else:
            failure_str = ""
            for f in failure_keys:
                # If any failed files partially downloaded, remove them.
                abs_dataset_path = os.path.join(m.current_revision_dir, f)
                abs_object_path = m.dataset_to_object_path(f)
                if os.path.exists(abs_dataset_path):
                    os.remove(abs_dataset_path)
                if os.path.exists(abs_object_path):
                    os.remove(abs_object_path)
                failure_str = f"{failure_str}{f}\n"

            failure_detail_str = f"Files that failed to download:\n{failure_str}"
            update_feedback("", has_failures=True, failure_detail=failure_detail_str)

        # Link dataset files, so anything that was successfully pulled will materialize
        m.link_revision()

        if len(failure_keys) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            raise IOError(f"{len(failure_keys)} file(s) failed to download. Check message detail and try again.")

    except Exception as err:
        logger.exception(err)
        raise
def pull_objects(keys: List[str], logged_in_username: str, access_token: str, id_token: str,
                 dataset_owner: str, dataset_name: str,
                 labbook_owner: Optional[str] = None, labbook_name: Optional[str] = None,
                 config_file: str = None) -> None:
    """Method to pull a collection of objects from a dataset's backend.

    This runs the IOManager.pull_objects() method with `link_revision=False`. This is because this job can be run in
    parallel multiple times with different sets of keys. You don't want to link until the very end, which is handled
    in the `download_dataset_files` job, which is what scheduled this job.

    Args:
        keys: List of file keys to download
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    def progress_update_callback(completed_bytes: int) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if 'completed_bytes' not in current_job.meta:
            current_job.meta['completed_bytes'] = 0

        current_job.meta['completed_bytes'] = int(current_job.meta['completed_bytes']) + completed_bytes
        current_job.save_meta()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting pull_objects(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}")

        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        result = iom.pull_objects(keys=keys, progress_update_fn=progress_update_callback, link_revision=False)

        job = get_current_job()
        if job:
            job.meta['failure_keys'] = ",".join([x.dataset_path for x in result.failure])
            job.meta['message'] = result.message
            job.save_meta()

    except Exception as err:
        logger.exception(err)
        raise
def push_dataset_objects(objs: List[PushObject], logged_in_username: str, access_token: str, id_token: str,
                         dataset_owner: str, dataset_name: str, config_file: str = None) -> None:
    """Method to push a collection of objects to a dataset's backend

    Args:
        objs: List of PushObject instances to push
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to push
        dataset_name: Name of the dataset containing the files to push
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    def progress_update_callback(completed_bytes: int) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if 'completed_bytes' not in current_job.meta:
            current_job.meta['completed_bytes'] = 0

        current_job.meta['completed_bytes'] = int(current_job.meta['completed_bytes']) + completed_bytes
        current_job.save_meta()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting push_dataset_objects(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        im = InventoryManager(config_file=config_file)
        ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        result = iom.push_objects(objs, progress_update_fn=progress_update_callback)

        job = get_current_job()
        if job:
            job.meta['failures'] = ",".join([f"{x.object_path}|{x.dataset_path}|{x.revision}"
                                             for x in result.failure])
            job.meta['message'] = result.message
            job.save_meta()

    except Exception as err:
        logger.exception(err)
        raise
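# Hedged usage sketch (not from the source): push_dataset_objects is the push-side counterpart to
# pull_objects and is likewise intended to run as a background job, typically one job per batch of
# PushObjects. The scheduling below mirrors the batch pattern used in download_dataset_files above;
# the helper name and how `obj_batches` is computed are illustrative assumptions.
def _example_schedule_push(obj_batches, logged_in_username, access_token, id_token,
                           dataset_owner, dataset_name, config_file=None):
    d = Dispatcher()
    job_keys = list()
    for batch in obj_batches:
        job_kwargs = {
            'objs': batch,
            'logged_in_username': logged_in_username,
            'access_token': access_token,
            'id_token': id_token,
            'dataset_owner': dataset_owner,
            'dataset_name': dataset_name,
            'config_file': config_file,
        }
        job_metadata = {
            'dataset': f"{logged_in_username}|{dataset_owner}|{dataset_name}",
            'method': 'push_dataset_objects'
        }
        # persist=True keeps the job record around so progress can be aggregated afterwards
        job_keys.append(d.dispatch_task(method_reference=push_dataset_objects,
                                        kwargs=job_kwargs,
                                        metadata=job_metadata,
                                        persist=True))
    return job_keys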
def check_and_import_dataset(logged_in_username: str, dataset_owner: str, dataset_name: str, remote_url: str,
                             access_token: Optional[str] = None, config_file: Optional[str] = None) -> None:
    """Job to check if a dataset exists in the user's working directory, and if not import it. This is primarily used
    when importing, syncing, or switching branches on a project with linked datasets

    Args:
        logged_in_username: username for the currently logged in user
        dataset_owner: Owner of the dataset to import if needed
        dataset_name: Name of the dataset to import if needed
        remote_url: URL of the dataset to import if needed
        access_token: The current user's access token, needed to initialize git credentials in certain situations
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()
    p = os.getpid()
    try:
        logger.info(f"(Job {p}) Starting check_and_import_dataset(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        im = InventoryManager(config_file=config_file)

        try:
            # Check for dataset already existing in the user's working directory
            im.load_dataset(logged_in_username, dataset_owner, dataset_name)
            logger.info(f"{logged_in_username}/{dataset_owner}/{dataset_name} exists. Skipping auto-import.")
            return
        except InventoryException:
            # Dataset not found, import it
            logger.info(f"{logged_in_username}/{dataset_owner}/{dataset_name} not found. "
                        f"Auto-importing remote dataset from {remote_url}")
            config_obj = Configuration(config_file=config_file)

            if access_token:
                # If the access token is set, git creds should be configured
                remote_parts = urlsplit(remote_url)
                if remote_parts.netloc:
                    remote_target = f"{remote_parts.scheme}://{remote_parts.netloc}/"
                else:
                    remote_target = remote_parts.path

                admin_service = None
                for remote in config_obj.config['git']['remotes']:
                    if remote == remote_target:
                        admin_service = config_obj.config['git']['remotes'][remote]['admin_service']
                        break

                if not admin_service:
                    raise ValueError(f"Failed to configure admin service URL based on target remote: {remote_target}")

                gl_mgr = GitLabManager(remote_target, admin_service=admin_service, access_token=access_token)
                gl_mgr.configure_git_credentials(remote_target, logged_in_username)

            gitworkflows_utils.clone_repo(remote_url=remote_url, username=logged_in_username, owner=dataset_owner,
                                          load_repository=im.load_dataset_from_directory,
                                          put_repository=im.put_dataset)
            logger.info(f"{logged_in_username}/{dataset_owner}/{dataset_name} auto-imported successfully")

    except Exception as err:
        logger.error(f"(Job {p}) Error in check_and_import_dataset job")
        logger.exception(err)
        raise
def mutate_and_get_payload(cls, root, info, dataset_owner, dataset_name, from_local=False, from_remote=False,
                           client_mutation_id=None):
    logged_in_username = get_logged_in_username()

    im = InventoryManager()
    ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name, get_logged_in_author())
    ds.backend.set_default_configuration(logged_in_username,
                                         bearer_token=flask.g.access_token,
                                         id_token=flask.g.id_token)

    if not ds.backend.is_configured:
        raise ValueError("Dataset is not fully configured. Cannot update.")

    d = Dispatcher()
    kwargs = {
        'logged_in_username': logged_in_username,
        'access_token': flask.g.access_token,
        'id_token': flask.g.id_token,
        'dataset_owner': dataset_owner,
        'dataset_name': dataset_name,
    }

    background_job_key = None
    if from_remote is True:
        if ds.backend.can_update_from_remote:
            # Gen unique keys for tracking jobs
            metadata = {
                'dataset': f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                'method': 'update_unmanaged_dataset_from_remote'
            }

            job_response = d.dispatch_task(jobs.update_unmanaged_dataset_from_remote,
                                           kwargs=kwargs, metadata=metadata)
            background_job_key = job_response.key_str
        else:
            raise ValueError("This dataset type does not support automatic update via querying its remote")
    elif from_local is True:
        # Gen unique keys for tracking jobs
        metadata = {
            'dataset': f"{logged_in_username}|{dataset_owner}|{dataset_name}",
            'method': 'update_unmanaged_dataset_from_local'
        }

        job_response = d.dispatch_task(jobs.update_unmanaged_dataset_from_local, kwargs=kwargs, metadata=metadata)
        background_job_key = job_response.key_str
    else:
        raise ValueError("Either `fromRemote` or `fromLocal` must be True.")

    return UpdateUnmanagedDataset(dataset=Dataset(id="{}&{}".format(dataset_owner, dataset_name),
                                                  name=dataset_name, owner=dataset_owner),
                                  background_job_key=background_job_key)
def mutate_and_get_payload(cls, root, info, dataset_owner, dataset_name, parameters=None, confirm=None,
                           client_mutation_id=None):
    logged_in_username = get_logged_in_username()

    im = InventoryManager()
    ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name, get_logged_in_author())
    ds.backend.set_default_configuration(logged_in_username,
                                         bearer_token=flask.g.access_token,
                                         id_token=flask.g.id_token)

    should_confirm = False
    error_message = None
    confirm_message = None
    background_job_key = None
    is_configured = None

    if confirm is None:
        if parameters:
            # Update the configuration
            current_config = ds.backend_config
            for param in parameters:
                current_config[param.parameter] = param.value
            ds.backend_config = current_config

        # Validate the configuration
        try:
            confirm_message = ds.backend.confirm_configuration(ds)
            if confirm_message is not None:
                should_confirm = True
        except ValueError as err:
            error_message = f"{err}"
            is_configured = False
    else:
        if confirm is False:
            # Clear configuration
            current_config = ds.backend_config
            for param in parameters:
                current_config[param.parameter] = None
            ds.backend_config = current_config
        else:
            if ds.backend.can_update_from_remote:
                d = Dispatcher()
                kwargs = {
                    'logged_in_username': logged_in_username,
                    'access_token': flask.g.access_token,
                    'id_token': flask.g.id_token,
                    'dataset_owner': dataset_owner,
                    'dataset_name': dataset_name,
                }

                # Gen unique keys for tracking jobs
                metadata = {
                    'dataset': f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                    'method': 'update_unmanaged_dataset_from_remote'
                }

                job_response = d.dispatch_task(jobs.update_unmanaged_dataset_from_remote,
                                               kwargs=kwargs, metadata=metadata)
                background_job_key = job_response.key_str

    if is_configured is None:
        is_configured = ds.backend.is_configured

    return ConfigureDataset(dataset=Dataset(id="{}&{}".format(dataset_owner, dataset_name),
                                            name=dataset_name, owner=dataset_owner),
                            is_configured=is_configured,
                            should_confirm=should_confirm,
                            confirm_message=confirm_message,
                            error_message=error_message,
                            has_background_job=ds.backend.can_update_from_remote,
                            background_job_key=background_job_key)
def test_checkout__linked_dataset(self, mock_labbook_lfs_disabled, mock_config_file):
    """ test checking out a branch in a project that pulls in a linked dataset"""
    def dispatcher_mock(self, function_ref, kwargs, metadata):
        assert kwargs['logged_in_username'] == 'other-test-user2'
        assert kwargs['dataset_owner'] == 'testuser'
        assert kwargs['dataset_name'] == 'test-ds'

        # Inject mocked config file
        kwargs['config_file'] = mock_config_file[0]

        # Stop patching so job gets scheduled for real
        dispatcher_patch.stop()

        # Call same method as in mutation
        d = Dispatcher()
        res = d.dispatch_task(gtmcore.dispatcher.dataset_jobs.check_and_import_dataset,
                              kwargs=kwargs, metadata=metadata)

        return res

    username = '******'
    lb = mock_labbook_lfs_disabled[2]
    im = InventoryManager(config_file=mock_labbook_lfs_disabled[0])
    ds = im.create_dataset(username, username, 'test-ds', storage_type='gigantum_object_v1')

    # Publish dataset
    dataset_wf = DatasetWorkflow(ds)
    dataset_wf.publish(username=username)

    # Publish project
    labbook_wf = LabbookWorkflow(lb)
    labbook_wf.publish(username=username)

    # Switch branches
    labbook_wf.labbook.checkout_branch(branch_name="dataset-branch", new=True)

    # Link to project
    im.link_dataset_to_labbook(dataset_wf.remote, username, username, labbook_wf.labbook)

    # Publish branch
    labbook_wf.sync(username=username)

    # Import project
    other_user = '******'
    wf_other = LabbookWorkflow.import_from_remote(labbook_wf.remote,
                                                  username=other_user,
                                                  config_file=mock_config_file[0])

    # The remotes must be the same, cause it's the same remote repo
    assert wf_other.remote == labbook_wf.remote
    assert wf_other.repository != labbook_wf.repository
    assert f'{other_user}/{username}/labbooks/labbook1' in wf_other.repository.root_dir

    with pytest.raises(InventoryException):
        im_other_user = InventoryManager(config_file=mock_config_file[0])
        ds = im_other_user.load_dataset(other_user, username, 'test-ds')

    # Patch dispatch_task so you can inject the mocked config file
    dispatcher_patch = patch.object(Dispatcher, 'dispatch_task', dispatcher_mock)
    dispatcher_patch.start()

    # Checkout the branch
    assert wf_other.labbook.active_branch == "master"
    wf_other.checkout(username=other_user, branch_name="dataset-branch")

    cnt = 0
    while cnt < 20:
        try:
            im_other_user = InventoryManager(config_file=mock_config_file[0])
            ds = im_other_user.load_dataset(other_user, username, 'test-ds')
            break
        except InventoryException:
            cnt += 1
            time.sleep(1)

    assert cnt < 20
    assert ds.name == 'test-ds'
    assert ds.namespace == username
    assert mock_config_file[1] in ds.root_dir
    assert wf_other.labbook.active_branch == "dataset-branch"
def test_import_from_remote__linked_dataset(self, mock_labbook_lfs_disabled, mock_config_file):
    """ test importing a project with a linked dataset"""
    def dispatcher_mock(self, function_ref, kwargs, metadata):
        assert kwargs['logged_in_username'] == 'other-test-user2'
        assert kwargs['dataset_owner'] == 'testuser'
        assert kwargs['dataset_name'] == 'test-ds'

        # Inject mocked config file
        kwargs['config_file'] = mock_config_file[0]

        # Stop patching so job gets scheduled for real
        dispatcher_patch.stop()

        # Call same method as in mutation
        d = Dispatcher()
        res = d.dispatch_task(gtmcore.dispatcher.dataset_jobs.check_and_import_dataset,
                              kwargs=kwargs, metadata=metadata)

        return res

    username = '******'
    lb = mock_labbook_lfs_disabled[2]
    im = InventoryManager(config_file=mock_labbook_lfs_disabled[0])
    ds = im.create_dataset(username, username, 'test-ds', storage_type='gigantum_object_v1')

    # Publish dataset
    dataset_wf = DatasetWorkflow(ds)
    dataset_wf.publish(username=username)

    # Link to project
    im.link_dataset_to_labbook(dataset_wf.remote, username, username, lb)

    # Publish project
    labbook_wf = LabbookWorkflow(lb)
    labbook_wf.publish(username=username)

    # Patch dispatch_task so you can inject the mocked config file
    dispatcher_patch = patch.object(Dispatcher, 'dispatch_task', dispatcher_mock)
    dispatcher_patch.start()

    # Import project, triggering an auto-import of the dataset
    other_user = '******'
    wf_other = LabbookWorkflow.import_from_remote(labbook_wf.remote,
                                                  username=other_user,
                                                  config_file=mock_config_file[0])

    # The remotes must be the same, cause it's the same remote repo
    assert wf_other.remote == labbook_wf.remote
    # The actual path on disk will be different, though
    assert wf_other.repository != labbook_wf.repository
    # Check imported into namespace of original owner (testuser)
    assert f'{other_user}/{username}/labbooks/labbook1' in wf_other.repository.root_dir

    cnt = 0
    while cnt < 20:
        try:
            im_other_user = InventoryManager(config_file=mock_config_file[0])
            ds = im_other_user.load_dataset(other_user, username, 'test-ds')
            break
        except InventoryException:
            cnt += 1
            time.sleep(1)

    assert cnt < 20
    assert ds.name == 'test-ds'
    assert ds.namespace == username
    assert mock_config_file[1] in ds.root_dir
def mutate_and_get_payload(cls, root, info, labbook_owner, labbook_name, dataset_owner, dataset_name, action,
                           dataset_url=None, client_mutation_id=None):
    logged_in_username = get_logged_in_username()
    im = InventoryManager()
    lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name, author=get_logged_in_author())

    with lb.lock():
        if action == 'link':
            if dataset_url:
                remote_domain = cls._get_remote_domain(dataset_url, dataset_owner, dataset_name)

                if remote_domain:
                    # Make sure git creds are configured for the remote
                    admin_service = None
                    for remote in lb.client_config.config['git']['remotes']:
                        if remote_domain == remote:
                            admin_service = lb.client_config.config['git']['remotes'][remote]['admin_service']
                            break
                    if "HTTP_AUTHORIZATION" in info.context.headers.environ:
                        token = parse_token(info.context.headers.environ["HTTP_AUTHORIZATION"])
                    else:
                        raise ValueError("Authorization header not provided."
                                         " Must have a valid session to query for collaborators")
                    mgr = GitLabManager(remote_domain, admin_service, token)
                    mgr.configure_git_credentials(remote_domain, logged_in_username)
            else:
                # Link to local dataset
                ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)
                dataset_url = f"{ds.root_dir}/.git"

            # Link the dataset to the labbook
            ds = im.link_dataset_to_labbook(dataset_url, dataset_owner, dataset_name, lb)
            ds.namespace = dataset_owner

            # Preload the dataloader
            info.context.dataset_loader.prime(f"{get_logged_in_username()}&{dataset_owner}&{dataset_name}", ds)

            # Relink the revision
            m = Manifest(ds, logged_in_username)
            m.link_revision()
        elif action == 'unlink':
            im.unlink_dataset_from_labbook(dataset_owner, dataset_name, lb)
        elif action == 'update':
            ds = im.update_linked_dataset_reference(dataset_owner, dataset_name, lb)
            m = Manifest(ds, logged_in_username)
            m.force_reload()

            info.context.dataset_loader.prime(f"{get_logged_in_username()}&{dataset_owner}&{dataset_name}", ds)
        else:
            raise ValueError("Unsupported action. Use `link`, `unlink`, or `update`")

        info.context.labbook_loader.prime(f"{get_logged_in_username()}&{labbook_owner}&{labbook_name}", lb)

    edge = LabbookConnection.Edge(node=Labbook(owner=labbook_owner, name=labbook_name),
                                  cursor=base64.b64encode(f"{0}".encode('utf-8')))

    return ModifyDatasetLink(new_labbook_edge=edge)
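# Hedged usage sketch (not from the source): the ModifyDatasetLink mutation above would be invoked
# via GraphQL. Assuming a graphene test client like the one used in the pagination test earlier in
# this section, and assuming the schema exposes the mutation as `modifyDatasetLink` with camelCased
# input fields (the graphene default), a link request might look roughly like this. The owner,
# labbook, and dataset names are illustrative placeholders.
def _example_link_dataset(client):
    query = """
            mutation myMutation {
              modifyDatasetLink(input: {labbookOwner: "default", labbookName: "labbook1",
                                        datasetOwner: "default", datasetName: "dataset100",
                                        action: "link"}) {
                newLabbookEdge {
                  node {
                    name
                  }
                }
              }
            }
            """
    return client.execute(query)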
def download_dataset_files(logged_in_username: str, access_token: str, id_token: str,
                           dataset_owner: str, dataset_name: str,
                           labbook_owner: Optional[str] = None, labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False, keys: Optional[List[str]] = None):
    """Method to download files from a dataset in the background and provide status to the UI

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download

    Returns:
        None
    """
    def update_meta(msg):
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager()

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        if all_keys:
            result = iom.pull_all(status_update_fn=update_meta)
        elif keys:
            result = iom.pull_objects(keys=keys, status_update_fn=update_meta)
        else:
            raise ValueError("Must provide a list of keys or set all_keys=True")

        # Save the Relay node IDs to the job metadata so the UI can re-fetch as needed
        job = get_current_job()
        if job:
            job.meta['success_keys'] = [x.dataset_path for x in result.success]
            job.meta['failure_keys'] = [x.dataset_path for x in result.failure]
            job.save_meta()

        if len(result.failure) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            sys.exit(-1)

    except Exception as err:
        logger.exception(err)
        raise