def test_get_linked_datasets(self, mock_labbook):
    inv_manager = InventoryManager(mock_labbook[0])
    lb = mock_labbook[2]

    datasets = inv_manager.get_linked_datasets(lb)
    assert len(datasets) == 0

    ds = inv_manager.create_dataset("test", "test", "dataset100", "gigantum_object_v1",
                                    description="my dataset")

    # Fake publish to a local bare repo
    _MOCK_create_remote_repo2(ds, 'test', None, None)

    assert os.path.exists(os.path.join(lb.root_dir, '.gitmodules')) is False

    inv_manager.link_dataset_to_labbook(ds.remote, 'test', 'dataset100', lb)

    assert os.path.exists(os.path.join(lb.root_dir, '.gitmodules')) is True
    dataset_submodule_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', 'test', 'dataset100')
    assert os.path.exists(dataset_submodule_dir) is True
    assert os.path.exists(os.path.join(dataset_submodule_dir, '.gigantum')) is True

    datasets = inv_manager.get_linked_datasets(lb)
    assert len(datasets) == 1
    assert datasets[0].name == ds.name
    assert datasets[0].namespace == ds.namespace
def process_linked_datasets(labbook: LabBook, logged_in_username: str) -> None:
    """Method to update or init any linked dataset submodule references, clean up lingering files,
    and schedule jobs to auto-import if needed

    Args:
        labbook: the labbook to analyze
        logged_in_username: the current logged in username

    Returns:
        None
    """
    im = InventoryManager(config_file=labbook.client_config.config_file)

    # Update linked datasets inside the Project or clean them out if needed
    im.update_linked_datasets(labbook, logged_in_username, init=True)

    # Check for linked datasets, and schedule auto-imports
    d = Dispatcher()
    datasets = im.get_linked_datasets(labbook)
    for ds in datasets:
        kwargs = {
            'logged_in_username': logged_in_username,
            'dataset_owner': ds.namespace,
            'dataset_name': ds.name,
            'remote_url': ds.remote,
        }
        metadata = {
            'dataset': f"{logged_in_username}|{ds.namespace}|{ds.name}",
            'method': 'dataset_jobs.check_and_import_dataset'
        }

        d.dispatch_task(gtmcore.dispatcher.dataset_jobs.check_and_import_dataset,
                        kwargs=kwargs, metadata=metadata)
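A minimal usage sketch follows. The username, owner, and project name are illustrative assumptions, not part of the snippet above; it assumes InventoryManager can be constructed with the default config and that load_labbook(username, owner, name) is available, as elsewhere in gtmcore.

from gtmcore.inventory.inventory import InventoryManager

# Hypothetical identifiers for illustration only
username = 'demo-user'
im = InventoryManager()
lb = im.load_labbook(username, username, 'my-project')

# Refresh linked dataset submodules and schedule background auto-import jobs
process_linked_datasets(lb, logged_in_username=username)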
def clean_dataset_file_cache(logged_in_username: str, dataset_owner: str, dataset_name: str,
                             cache_location: str, config_file: str = None) -> None:
    """Method to remove a dataset's file cache once the dataset has been deleted and is no longer
    referenced by any Project

    Args:
        logged_in_username: username for the currently logged in user
        dataset_owner: Owner of the dataset
        dataset_name: Name of the dataset
        cache_location: Absolute path to the file cache (inside the container) for this dataset
        config_file: Optional path to the Client config file

    Returns:
        None
    """
    logger = LMLogger.get_logger()
    p = os.getpid()
    try:
        logger.info(f"(Job {p}) Starting clean_dataset_file_cache(logged_in_username={logged_in_username}, "
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name})")

        im = InventoryManager(config_file=config_file)

        # Check for the dataset itself; if it still exists, the cache is still in use
        try:
            im.load_dataset(logged_in_username, dataset_owner, dataset_name)
            logger.info(f"{logged_in_username}/{dataset_owner}/{dataset_name} still exists."
                        f" Skipping file cache clean.")
            return
        except InventoryException:
            # Dataset not found, move along
            pass

        # Check for submodule references in the user's Projects
        for lb in im.list_labbooks(logged_in_username):
            for ds in im.get_linked_datasets(lb):
                if ds.namespace == dataset_owner and ds.name == dataset_name:
                    logger.info(f"{logged_in_username}/{dataset_owner}/{dataset_name} still referenced by {str(lb)}."
                                f" Skipping file cache clean.")
                    return

        # If you get here the dataset no longer exists and is not used by any projects, clear files
        shutil.rmtree(cache_location)
    except Exception as err:
        logger.error(f"(Job {p}) Error in clean_dataset_file_cache job")
        logger.exception(err)
        raise
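This routine is written to run as a background job. A minimal dispatch sketch is shown below, mirroring the kwargs/metadata pattern used in process_linked_datasets above; the names and cache path are assumptions made up for illustration.

from gtmcore.dispatcher import Dispatcher

d = Dispatcher()
d.dispatch_task(clean_dataset_file_cache,
                kwargs={'logged_in_username': 'demo-user',    # assumed username
                        'dataset_owner': 'demo-user',         # assumed owner
                        'dataset_name': 'dataset100',         # assumed dataset name
                        # assumed cache path inside the container
                        'cache_location': '/mnt/gigantum/.labmanager/datasets/demo-user/dataset100'},
                metadata={'method': 'clean_dataset_file_cache'})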
def test_linked_to(self, mock_config_file):
    im = InventoryManager(mock_config_file[0])
    lb = im.create_labbook("test", "test", "lb1", "testing")
    ds = im.create_dataset("test", "test", "dataset1", "gigantum_object_v1",
                           description="my first dataset",
                           author=GitAuthor(name="test", email="*****@*****.**"))

    assert ds.linked_to() is None

    im.link_dataset_to_labbook(f"{ds.root_dir}/.git", "test", "dataset1", lb)
    assert ds.linked_to() is None

    linked_datasets = im.get_linked_datasets(lb)
    assert len(linked_datasets) == 1
    assert linked_datasets[0].linked_to() == "test|test|lb1"