    def test_get_linked_datasets(self, mock_labbook):
        inv_manager = InventoryManager(mock_labbook[0])
        lb = mock_labbook[2]

        datasets = inv_manager.get_linked_datasets(lb)
        assert len(datasets) == 0

        ds = inv_manager.create_dataset("test",
                                        "test",
                                        "dataset100",
                                        "gigantum_object_v1",
                                        description="my dataset")

        # Fake publish to a local bare repo
        _MOCK_create_remote_repo2(ds, 'test', None, None)

        assert os.path.exists(os.path.join(lb.root_dir,
                                           '.gitmodules')) is False

        inv_manager.link_dataset_to_labbook(ds.remote, 'test', 'dataset100',
                                            lb)

        assert os.path.exists(os.path.join(lb.root_dir, '.gitmodules')) is True
        dataset_submodule_dir = os.path.join(lb.root_dir, '.gigantum',
                                             'datasets', 'test', 'dataset100')
        assert os.path.exists(dataset_submodule_dir) is True
        assert os.path.exists(os.path.join(dataset_submodule_dir,
                                           '.gigantum')) is True

        datasets = inv_manager.get_linked_datasets(lb)
        assert len(datasets) == 1
        assert datasets[0].name == ds.name
        assert datasets[0].namespace == ds.namespace
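
# ---------------------------------------------------------------------------
# The test above exercises the on-disk layout that link_dataset_to_labbook
# produces: a .gitmodules file at the Project root and a submodule checkout
# under .gigantum/datasets/<owner>/<dataset>. The helper below is a minimal,
# illustrative sketch (not part of the gtmcore API) for inspecting that layout
# directly, using only the paths asserted in the test.
import os


def show_linked_dataset_layout(labbook_root: str) -> None:
    """Print .gitmodules contents and any linked dataset folders for a Project."""
    gitmodules = os.path.join(labbook_root, '.gitmodules')
    if os.path.exists(gitmodules):
        with open(gitmodules) as f:
            print(f.read())

    datasets_dir = os.path.join(labbook_root, '.gigantum', 'datasets')
    if os.path.isdir(datasets_dir):
        for owner in sorted(os.listdir(datasets_dir)):
            for name in sorted(os.listdir(os.path.join(datasets_dir, owner))):
                print(f"linked dataset: {owner}/{name}")
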
def process_linked_datasets(labbook: LabBook, logged_in_username: str) -> None:
    """Method to update or init any linked dataset submodule references, clean up lingering files, and schedule
    jobs to auto-import if needed

    Args:
        labbook: the labbook to analyze
        logged_in_username: the current logged in username

    Returns:
        None
    """
    im = InventoryManager(config_file=labbook.client_config.config_file)

    # Update linked datasets inside the Project or clean them out if needed
    im.update_linked_datasets(labbook, logged_in_username, init=True)

    # Check for linked datasets, and schedule auto-imports
    d = Dispatcher()
    datasets = im.get_linked_datasets(labbook)
    for ds in datasets:
        kwargs = {
            'logged_in_username': logged_in_username,
            'dataset_owner': ds.namespace,
            'dataset_name': ds.name,
            'remote_url': ds.remote,
        }
        metadata = {
            'dataset': f"{logged_in_username}|{ds.namespace}|{ds.name}",
            'method': 'dataset_jobs.check_and_import_dataset'
        }

        d.dispatch_task(
            gtmcore.dispatcher.dataset_jobs.check_and_import_dataset,
            kwargs=kwargs,
            metadata=metadata)
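
# A minimal usage sketch for process_linked_datasets, assuming the usual
# gtmcore import path for InventoryManager, that it can be constructed with its
# default configuration, and that a Dispatcher worker is running to pick up the
# scheduled check_and_import_dataset jobs. The username value is hypothetical.
from gtmcore.inventory.inventory import InventoryManager

username = "test"
im = InventoryManager()
for lb in im.list_labbooks(username):
    # Refresh submodule references and queue any needed dataset auto-imports
    process_linked_datasets(lb, username)
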
def clean_dataset_file_cache(logged_in_username: str,
                             dataset_owner: str,
                             dataset_name: str,
                             cache_location: str,
                             config_file: str = None) -> None:
    """Method to import a dataset from a zip file

    Args:
        logged_in_username: username for the currently logged in user
        dataset_owner: Owner of the dataset whose file cache should be cleaned
        dataset_name: Name of the dataset whose file cache should be cleaned
        cache_location: Absolute path to the file cache (inside the container) for this dataset
        config_file: Optional path to the configuration file used when constructing the InventoryManager

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    p = os.getpid()
    try:
        logger.info(
            f"(Job {p}) Starting clean_dataset_file_cache(logged_in_username={logged_in_username}, "
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name})")

        im = InventoryManager(config_file=config_file)

        # Check for dataset
        try:
            im.load_dataset(logged_in_username, dataset_owner, dataset_name)
            logger.info(
                f"{logged_in_username}/{dataset_owner}/{dataset_name} still exists. Skipping file cache clean."
            )
            return
        except InventoryException:
            # Dataset not found, move along
            pass

        # Check for submodule references
        for lb in im.list_labbooks(logged_in_username):
            for ds in im.get_linked_datasets(lb):
                if ds.namespace == dataset_owner and ds.name == dataset_name:
                    logger.info(
                        f"{logged_in_username}/{dataset_owner}/{dataset_name} still referenced by {str(lb)}."
                        f" Skipping file cache clean.")
                    return

        # If you get here the dataset no longer exists and is not used by any projects, clear files
        shutil.rmtree(cache_location)

    except Exception as err:
        logger.error(f"(Job {p}) Error in clean_dataset_file_cache job")
        logger.exception(err)
        raise
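
# A hedged sketch of how this cleanup could be scheduled through the Dispatcher,
# mirroring the dispatch_task pattern used in process_linked_datasets above.
# The module path for clean_dataset_file_cache and the cache_location value are
# assumptions for illustration only; the real cache path is derived from the
# dataset's file-cache configuration.
from gtmcore.dispatcher import Dispatcher
import gtmcore.dispatcher.dataset_jobs

d = Dispatcher()
d.dispatch_task(
    gtmcore.dispatcher.dataset_jobs.clean_dataset_file_cache,
    kwargs={
        'logged_in_username': 'test',
        'dataset_owner': 'test',
        'dataset_name': 'dataset100',
        'cache_location': '/tmp/dataset-file-cache/test/test/dataset100',  # assumed path
    },
    metadata={
        'dataset': "test|test|dataset100",
        'method': 'dataset_jobs.clean_dataset_file_cache'
    })
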
    def test_linked_to(self, mock_config_file):
        im = InventoryManager(mock_config_file[0])
        lb = im.create_labbook("test", "test", "lb1", "testing")
        ds = im.create_dataset("test", "test", "dataset1", "gigantum_object_v1",
                               description="my first dataset",
                               author=GitAuthor(name="test", email="*****@*****.**"))

        assert ds.linked_to() is None

        im.link_dataset_to_labbook(f"{ds.root_dir}/.git", "test", "dataset1", lb)

        # Linking creates a separate submodule copy inside the Project; this
        # standalone dataset instance still reports no parent Project.
        assert ds.linked_to() is None

        linked_datasets = im.get_linked_datasets(lb)
        assert len(linked_datasets) == 1
        assert linked_datasets[0].linked_to() == "test|test|lb1"