Example 1
    def test_check_and_import_dataset(self, mock_config_file):
        im = InventoryManager(mock_config_file[0])
        ds = im.create_dataset('default',
                               'default',
                               "dataset100",
                               storage_type="gigantum_object_v1",
                               description="100")

        # Fake publish to a local bare repo
        _MOCK_create_remote_repo2(ds, 'test', None, None)
        remote_url = ds.remote

        im.delete_dataset('default', 'default', "dataset100")

        with pytest.raises(InventoryException):
            im.load_dataset('default', 'default', "dataset100")

        kwargs = {
            'logged_in_username': "******",
            'dataset_owner': "default",
            'dataset_name': "dataset100",
            'remote_url': remote_url,
            'config_file': mock_config_file[0]
        }

        gtmcore.dispatcher.dataset_jobs.check_and_import_dataset(**kwargs)

        ds = im.load_dataset('default', 'default', "dataset100")
        assert ds.name == 'dataset100'
        assert ds.namespace == 'default'
Example 2
def clean_dataset_file_cache(logged_in_username: str,
                             dataset_owner: str,
                             dataset_name: str,
                             cache_location: str,
                             config_file: str = None) -> None:
    """Method to import a dataset from a zip file

    Args:
        logged_in_username: username for the currently logged in user
        dataset_owner: Owner of the labbook if this dataset is linked
        dataset_name: Name of the labbook if this dataset is linked
        cache_location: Absolute path to the file cache (inside the container) for this dataset
        config_file:

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    p = os.getpid()
    try:
        logger.info(
            f"(Job {p}) Starting clean_dataset_file_cache(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        im = InventoryManager(config_file=config_file)

        # Check for dataset
        try:
            im.load_dataset(logged_in_username, dataset_owner, dataset_name)
            logger.info(
                f"{logged_in_username}/{dataset_owner}/{dataset_name} still exists. Skipping file cache clean."
            )
            return
        except InventoryException:
            # Dataset not found, move along
            pass

        # Check for submodule references
        for lb in im.list_labbooks(logged_in_username):
            for ds in im.get_linked_datasets(lb):
                if ds.namespace == dataset_owner and ds.name == dataset_name:
                    logger.info(
                        f"{logged_in_username}/{dataset_owner}/{dataset_name} still referenced by {str(lb)}."
                        f" Skipping file cache clean.")
                    return

        # If you get here the dataset no longer exists and is not used by any projects, clear files
        shutil.rmtree(cache_location)

    except Exception as err:
        logger.error(f"(Job {p}) Error in clean_dataset_file_cache job")
        logger.exception(err)
        raise
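
Note: nothing above actually schedules the cleanup. The sketch below shows one way it might be dispatched as a background job after a dataset has been deleted, reusing the Dispatcher/metadata pattern seen in the other examples; the import paths, the module the job lives in, and how cache_location is obtained are assumptions rather than details taken from the source.

# Hedged sketch: dispatch clean_dataset_file_cache as a background job (assumed module paths)
from gtmcore.dispatcher import Dispatcher, dataset_jobs  # assumption: the job lives in dataset_jobs

cache_location = "/path/to/dataset/file/cache"  # placeholder: capture this before deleting the dataset

d = Dispatcher()
cleanup_kwargs = {
    'logged_in_username': 'default',
    'dataset_owner': 'default',
    'dataset_name': 'dataset100',
    'cache_location': cache_location,
    'config_file': None,
}
job_metadata = {
    'dataset': "default|default|dataset100",
    'method': 'clean_dataset_file_cache'
}
d.dispatch_task(dataset_jobs.clean_dataset_file_cache,
                kwargs=cleanup_kwargs,
                metadata=job_metadata)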
Example 3
    def test_pagination_sort_modified(self, fixture_working_dir_dataset_populated_scoped, snapshot):

        query = """
                {
                datasetList{
                    localDatasets(orderBy: "modified_on", sort: "desc") {
                        edges {
                            node {
                                id
                                name
                                description
                            }
                            cursor
                        }
                        pageInfo {
                            hasNextPage
                            hasPreviousPage
                        }
                    }
                }
                }
                """
        snapshot.assert_match(fixture_working_dir_dataset_populated_scoped[2].execute(query))

        im = InventoryManager(fixture_working_dir_dataset_populated_scoped[0])
        ds = im.load_dataset("default", "default", "dataset4")
        with open(os.path.join(ds.root_dir, "test.txt"), 'wt') as tf:
            tf.write("asdfasdf")
        ds.git.add_all()
        ds.git.commit("Changing the repo")

        # Run query again
        snapshot.assert_match(fixture_working_dir_dataset_populated_scoped[2].execute(query))
Example 4
    def test_list_datasets_modified_on(self, mock_config_file):
        """Test list az datasets"""
        inv_manager = InventoryManager(mock_config_file[0])
        inv_manager.create_dataset("user1", "user1", "dataset2", "gigantum_object_v1", description="my dataset")
        time.sleep(1)
        inv_manager.create_dataset("user1", "user2", "a-dataset3", "gigantum_object_v1", description="my dataset")
        time.sleep(1)
        inv_manager.create_dataset("user1", "user1", "dataset12", "gigantum_object_v1", description="my dataset")
        time.sleep(1)
        inv_manager.create_dataset("user2", "user1", "dataset1", "gigantum_object_v1", description="my dataset")

        datasets = inv_manager.list_datasets(username="******", sort_mode="modified_on")
        assert len(datasets) == 3
        assert datasets[0].name == 'dataset2'
        assert datasets[1].name == 'a-dataset3'
        assert datasets[2].name == 'dataset12'

        # modify a repo
        time.sleep(1.2)
        ds = inv_manager.load_dataset('user1', 'user1', 'dataset2')
        with open(os.path.join(ds.root_dir, "manifest", "test.txt"), 'wt') as tf:
            tf.write("asdfasdf")
        ds.git.add_all()
        ds.git.commit("Changing the repo")

        datasets = inv_manager.list_datasets(username="******", sort_mode="modified_on")
        assert len(datasets) == 3
        assert datasets[0].name == 'a-dataset3'
        assert datasets[1].name == 'dataset12'
        assert datasets[2].name == 'dataset2'
Example 5
    def mutate_and_get_payload(cls,
                               root,
                               info,
                               dataset_owner,
                               dataset_name,
                               labbook_name=None,
                               labbook_owner=None,
                               all_keys=None,
                               keys=None,
                               client_mutation_id=None):
        logged_in_username = get_logged_in_username()

        lb = None
        im = InventoryManager()
        if labbook_name:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner,
                                 labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner,
                                 dataset_name)

        d = Dispatcher()
        dl_kwargs = {
            'logged_in_username': logged_in_username,
            'access_token': flask.g.access_token,
            'id_token': flask.g.id_token,
            'dataset_owner': dataset_owner,
            'dataset_name': dataset_name,
            'labbook_owner': labbook_owner,
            'labbook_name': labbook_name,
            'all_keys': all_keys,
            'keys': keys
        }

        # Gen unique keys for tracking jobs
        lb_key = f"{logged_in_username}|{labbook_owner}|{labbook_name}" if lb else None
        ds_key = f"{logged_in_username}|{dataset_owner}|{dataset_name}"
        if lb_key:
            ds_key = f"{lb_key}|LINKED|{ds_key}"

        metadata = {
            'dataset': ds_key,
            'labbook': lb_key,
            'method': 'download_dataset_files'
        }

        res = d.dispatch_task(jobs.download_dataset_files,
                              kwargs=dl_kwargs,
                              metadata=metadata)

        return DownloadDatasetFiles(background_job_key=res.key_str)
Example 6
def update_unmanaged_dataset_from_remote(logged_in_username: str,
                                         access_token: str, id_token: str,
                                         dataset_owner: str,
                                         dataset_name: str) -> None:
    """Method to update/populate an unmanaged dataset from its remote automatically

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download

    Returns:
        None
    """
    def update_meta(msg):
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting update_unmanaged_dataset_from_remote(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        im = InventoryManager()
        ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)

        if not isinstance(ds.backend, UnmanagedStorageBackend):
            raise ValueError("Can only auto-update unmanaged dataset types")

        if not ds.backend.can_update_from_remote:
            raise ValueError(
                "Storage backend cannot update automatically from remote.")

        ds.backend.update_from_remote(ds, update_meta)

    except Exception as err:
        logger.exception(err)
        raise
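
Note: this job only relies on a small contract from the storage backend: set_default_configuration(), the can_update_from_remote property, and update_from_remote(ds, status_fn). The standalone sketch below illustrates that assumed contract; in the real code the object must also subclass UnmanagedStorageBackend to pass the isinstance() check, and the base class likely requires more members than shown here.

# Hypothetical, standalone sketch of the backend contract used by update_unmanaged_dataset_from_remote
class ExampleUnmanagedBackendContract:
    def set_default_configuration(self, username: str, access_token: str, id_token: str) -> None:
        # Stores the caller's identity/credentials for later remote calls
        self._username = username

    @property
    def can_update_from_remote(self) -> bool:
        # If this is False, update_unmanaged_dataset_from_remote raises a ValueError
        return True

    def update_from_remote(self, dataset, status_update_fn) -> None:
        # status_update_fn is the update_meta closure: each call appends a line to the
        # job's 'feedback' metadata, which the UI can poll for progress messages
        status_update_fn("Listing remote objects...")
        status_update_fn("Update from remote complete")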
Example 7
def mock_legacy_dataset(mock_dataset_with_cache_dir):
    """A pytest fixture that imports the legacy dataset"""
    archive_path = os.path.join(
        resource_filename('gtmcore.dataset.tests', 'data'),
        'test-legacy-dataset.zip')
    temp_path = os.path.join(tempfile.gettempdir(), 'test-legacy-dataset.zip')
    shutil.copyfile(archive_path, temp_path)
    conf_file = mock_dataset_with_cache_dir[0].client_config.config_file
    import_dataset_from_zip(archive_path=temp_path,
                            username=USERNAME,
                            owner=USERNAME,
                            config_file=conf_file)

    im = InventoryManager()
    ds = im.load_dataset(USERNAME, USERNAME, 'test-legacy-dataset')
    m = Manifest(ds, USERNAME)

    # yield dataset, manifest, working_dir
    yield ds, m, mock_dataset_with_cache_dir[1]
Example 8
def verify_dataset_contents(logged_in_username: str,
                            access_token: str,
                            id_token: str,
                            dataset_owner: str,
                            dataset_name: str,
                            labbook_owner: Optional[str] = None,
                            labbook_name: Optional[str] = None) -> None:
    """Method to update/populate an unmanaged dataset from it local state

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset to verify
        dataset_name: Name of the dataset to verify
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked

    Returns:
        None
    """
    job = get_current_job()

    def update_meta(msg):
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting verify_dataset_contents(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name},"
            f"labbook_owner={labbook_owner}, labbook_name={labbook_name}")

        im = InventoryManager()
        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner,
                                 labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner,
                                 dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)

        result = ds.backend.verify_contents(ds, update_meta)
        if job:
            # Only persist results when running inside a background worker
            job.meta['modified_keys'] = result
            job.save_meta()

    except Exception as err:
        logger.exception(err)
        raise
Example 9
def download_dataset_files(logged_in_username: str,
                           access_token: str,
                           id_token: str,
                           dataset_owner: str,
                           dataset_name: str,
                           labbook_owner: Optional[str] = None,
                           labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False,
                           keys: Optional[List[str]] = None,
                           config_file: str = None) -> None:
    """Method to download files from a dataset in the background and provide status to the UI.

    This job schedules `pull_objects` jobs after splitting up the download work into batches. At the end, the job
    removes any partially downloaded files (due to failures) and links all the files for the dataset.

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    dispatcher_obj = Dispatcher()

    def update_feedback(msg: str,
                        has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if has_failures:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete:
            current_job.meta['percent_complete'] = percent_complete

        current_job.meta['feedback'] = msg
        current_job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
            f" dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
            f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner,
                                 labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner,
                                 dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        key_batches, total_bytes, num_files = iom.compute_pull_batches(
            keys, pull_all=all_keys)

        failure_keys = list()
        if key_batches:
            # Schedule jobs for batches
            bg_jobs = list()
            for keys in key_batches:
                job_kwargs = {
                    'keys': keys,
                    'logged_in_username': logged_in_username,
                    'access_token': access_token,
                    'id_token': id_token,
                    'dataset_owner': dataset_owner,
                    'dataset_name': dataset_name,
                    'labbook_owner': labbook_owner,
                    'labbook_name': labbook_name,
                    'config_file': config_file,
                }
                job_metadata = {
                    'dataset':
                    f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                    'method': 'pull_objects'
                }

                job_key = dispatcher_obj.dispatch_task(
                    method_reference=pull_objects,
                    kwargs=job_kwargs,
                    metadata=job_metadata,
                    persist=True)
                bg_jobs.append(
                    BackgroundDownloadJob(dispatcher_obj, keys, job_key))

            update_feedback(
                f"Please wait - Downloading {num_files} files ({format_size(total_bytes)}) - 0% complete",
                percent_complete=0,
                has_failures=False)
            logger.info(
                f"(Job {p}) Starting file downloads for"
                f" {logged_in_username}/{dataset_owner}/{dataset_name} with {len(key_batches)} jobs"
            )

            while sum([(x.is_complete or x.is_failed)
                       for x in bg_jobs]) != len(bg_jobs):
                # Refresh all job statuses and update status feedback
                [j.refresh_status() for j in bg_jobs]
                total_completed_bytes = sum(
                    [j.completed_bytes for j in bg_jobs])
                pc = (float(total_completed_bytes) / float(total_bytes)) * 100
                update_feedback(
                    f"Please wait - Downloading {num_files} files ({format_size(total_completed_bytes)} of "
                    f"{format_size(total_bytes)}) - {round(pc)}% complete",
                    percent_complete=pc)
                time.sleep(1)

            # Aggregate failures if they exist
            for j in bg_jobs:
                if j.is_failed:
                    # Whole job failed...assume the entire batch should be re-downloaded for now
                    failure_keys.extend(j.keys)
                else:
                    failure_keys.extend(j.get_failed_keys())

        # Set final status for UI
        if len(failure_keys) == 0:
            update_feedback(f"Download complete!",
                            percent_complete=100,
                            has_failures=False)
        else:
            failure_str = ""
            for f in failure_keys:
                # If any failed files partially downloaded, remove them.
                abs_dataset_path = os.path.join(m.current_revision_dir, f)
                abs_object_path = m.dataset_to_object_path(f)
                if os.path.exists(abs_dataset_path):
                    os.remove(abs_dataset_path)
                if os.path.exists(abs_object_path):
                    os.remove(abs_object_path)
                failure_str = f"{failure_str}{f}\n"

            failure_detail_str = f"Files that failed to download:\n{failure_str}"
            update_feedback("",
                            has_failures=True,
                            failure_detail=failure_detail_str)

        # Link dataset files, so anything that was successfully pulled will materialize
        m.link_revision()

        if len(failure_keys) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            raise IOError(
                f"{len(failure_keys)} file(s) failed to download. Check message detail and try again."
            )

    except Exception as err:
        logger.exception(err)
        raise
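
Note: the monitoring loop above only uses a small surface of BackgroundDownloadJob: keys, is_complete, is_failed, completed_bytes, refresh_status(), and get_failed_keys(). The sketch below illustrates that assumed interface; query_task() is a hypothetical accessor, and the 'completed_bytes' and 'failure_keys' metadata fields mirror what pull_objects (next example) writes for each batch.

# Hedged sketch of the tracking helper assumed by the loop above (not the real implementation)
from typing import List


class BackgroundDownloadJobSketch:
    def __init__(self, dispatcher, keys: List[str], job_key) -> None:
        self.dispatcher = dispatcher
        self.keys = keys                      # the batch of file keys handled by this pull_objects job
        self.job_key = job_key
        self.is_complete = False
        self.is_failed = False
        self.completed_bytes = 0              # mirrors the 'completed_bytes' meta written by pull_objects
        self._failed_keys: List[str] = []

    def refresh_status(self) -> None:
        # Hypothetical accessor: fetch the job's current status and metadata by key
        status = self.dispatcher.query_task(self.job_key)
        self.is_complete = status.status == 'finished'
        self.is_failed = status.status == 'failed'
        self.completed_bytes = int(status.meta.get('completed_bytes', 0))
        failure_str = status.meta.get('failure_keys', '')
        self._failed_keys = [k for k in failure_str.split(',') if k]

    def get_failed_keys(self) -> List[str]:
        return self._failed_keys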
Example 10
def pull_objects(keys: List[str],
                 logged_in_username: str,
                 access_token: str,
                 id_token: str,
                 dataset_owner: str,
                 dataset_name: str,
                 labbook_owner: Optional[str] = None,
                 labbook_name: Optional[str] = None,
                 config_file: str = None) -> None:
    """Method to pull a collection of objects from a dataset's backend.

    This runs the IOManager.pull_objects() method with `link_revision=False` because this job can run multiple times
    in parallel with different sets of keys. Linking should only happen once, at the very end, and is handled by the
    `download_dataset_files` job that scheduled this one.

    Args:
        keys: List of file keys to download
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    def progress_update_callback(completed_bytes: int) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if 'completed_bytes' not in current_job.meta:
            current_job.meta['completed_bytes'] = 0

        current_job.meta['completed_bytes'] = int(
            current_job.meta['completed_bytes']) + completed_bytes
        current_job.save_meta()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting pull_objects(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
            f" labbook_name={labbook_name}")

        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner,
                                 labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner,
                                 dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        result = iom.pull_objects(keys=keys,
                                  progress_update_fn=progress_update_callback,
                                  link_revision=False)

        job = get_current_job()
        if job:
            job.meta['failure_keys'] = ",".join(
                [x.dataset_path for x in result.failure])
            job.meta['message'] = result.message
            job.save_meta()

    except Exception as err:
        logger.exception(err)
        raise
Example 11
def push_dataset_objects(objs: List[PushObject],
                         logged_in_username: str,
                         access_token: str,
                         id_token: str,
                         dataset_owner: str,
                         dataset_name: str,
                         config_file: str = None) -> None:
    """Method to pull a collection of objects from a dataset's backend

    Args:
        objs: List if file PushObject to push
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        config_file: config file (used for test mocking)

    Returns:
        str: directory path of imported labbook
    """
    logger = LMLogger.get_logger()

    def progress_update_callback(completed_bytes: int) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if 'completed_bytes' not in current_job.meta:
            current_job.meta['completed_bytes'] = 0

        current_job.meta['completed_bytes'] = int(
            current_job.meta['completed_bytes']) + completed_bytes
        current_job.save_meta()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting push_dataset_objects(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        im = InventoryManager(config_file=config_file)
        ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        result = iom.push_objects(objs,
                                  progress_update_fn=progress_update_callback)

        job = get_current_job()
        if job:
            job.meta['failures'] = ",".join([
                f"{x.object_path}|{x.dataset_path}|{x.revision}"
                for x in result.failure
            ])
            job.meta['message'] = result.message
            job.save_meta()

    except Exception as err:
        logger.exception(err)
        raise
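
Note: the 'failures' metadata written above is a flat string in which each failed push is serialized as object_path|dataset_path|revision, with entries separated by commas. The hedged helper below decodes that string on the consumer side; fetching the metadata back from the worker is out of scope here, and the example values are purely illustrative.

from typing import List, Tuple


def parse_push_failures(failures_meta: str) -> List[Tuple[str, str, str]]:
    """Decode the 'failures' job metadata into (object_path, dataset_path, revision) tuples."""
    if not failures_meta:
        return []
    return [tuple(entry.split('|')) for entry in failures_meta.split(',')]


# Illustrative round trip using made-up paths
meta_value = "objects/ab12|data/file1.bin|rev1,objects/cd34|data/file2.bin|rev1"
assert parse_push_failures(meta_value) == [
    ('objects/ab12', 'data/file1.bin', 'rev1'),
    ('objects/cd34', 'data/file2.bin', 'rev1'),
]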
Example 12
def check_and_import_dataset(logged_in_username: str,
                             dataset_owner: str,
                             dataset_name: str,
                             remote_url: str,
                             access_token: Optional[str] = None,
                             config_file: Optional[str] = None) -> None:
    """Job to check if a dataset exists in the user's working directory, and if not import it. This is primarily used
    when importing, syncing, or switching branches on a project with linked datasets

    Args:
        logged_in_username: username for the currently logged in user
        dataset_owner: Owner of the dataset to check for and, if needed, import
        dataset_name: Name of the dataset to check for and, if needed, import
        remote_url: URL of the dataset to import if needed
        access_token: The current user's access token, needed to initialize git credentials in certain situations
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    p = os.getpid()
    try:
        logger.info(
            f"(Job {p}) Starting check_and_import_dataset(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}")

        im = InventoryManager(config_file=config_file)

        try:
            # Check for dataset already existing in the user's working directory
            im.load_dataset(logged_in_username, dataset_owner, dataset_name)
            logger.info(
                f"{logged_in_username}/{dataset_owner}/{dataset_name} exists. Skipping auto-import."
            )
            return
        except InventoryException:
            # Dataset not found, import it
            logger.info(
                f"{logged_in_username}/{dataset_owner}/{dataset_name} not found. "
                f"Auto-importing remote dataset from {remote_url}")
            config_obj = Configuration(config_file=config_file)

            if access_token:
                # If the access token is set, git creds should be configured
                remote_parts = urlsplit(remote_url)
                if remote_parts.netloc:
                    remote_target = f"{remote_parts.scheme}://{remote_parts.netloc}/"
                else:
                    remote_target = remote_parts.path

                admin_service = None
                for remote in config_obj.config['git']['remotes']:
                    if remote == remote_target:
                        admin_service = config_obj.config['git']['remotes'][
                            remote]['admin_service']
                        break

                if not admin_service:
                    raise ValueError(
                        f"Failed to configure admin service URL based on target remote: {remote_target}"
                    )

                gl_mgr = GitLabManager(remote_target,
                                       admin_service=admin_service,
                                       access_token=access_token)
                gl_mgr.configure_git_credentials(remote_target,
                                                 logged_in_username)

            gitworkflows_utils.clone_repo(
                remote_url=remote_url,
                username=logged_in_username,
                owner=dataset_owner,
                load_repository=im.load_dataset_from_directory,
                put_repository=im.put_dataset)
            logger.info(
                f"{logged_in_username}/{dataset_owner}/{dataset_name} auto-imported successfully"
            )

    except Exception as err:
        logger.error(f"(Job {p}) Error in check_and_import_dataset job")
        logger.exception(err)
        raise
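
Note: the credential branch above assumes a client configuration in which git remotes are keyed by their base URL and each entry carries an admin_service host. The sketch below reproduces that lookup with placeholder hostnames rather than real endpoints.

from urllib.parse import urlsplit

# Placeholder configuration shape assumed by check_and_import_dataset
example_config = {
    'git': {
        'remotes': {
            'https://repo.example.com/': {'admin_service': 'api.example.com'}
        }
    }
}

remote_url = "https://repo.example.com/default/dataset100.git"
parts = urlsplit(remote_url)
remote_target = f"{parts.scheme}://{parts.netloc}/" if parts.netloc else parts.path

# Same lookup as the loop in the job: find the admin service for the matching remote
admin_service = example_config['git']['remotes'].get(remote_target, {}).get('admin_service')
assert remote_target == 'https://repo.example.com/'
assert admin_service == 'api.example.com'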
Example 13
    def mutate_and_get_payload(cls,
                               root,
                               info,
                               dataset_owner,
                               dataset_name,
                               from_local=False,
                               from_remote=False,
                               client_mutation_id=None):
        logged_in_username = get_logged_in_username()
        im = InventoryManager()
        ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name,
                             get_logged_in_author())
        ds.backend.set_default_configuration(logged_in_username,
                                             bearer_token=flask.g.access_token,
                                             id_token=flask.g.id_token)

        if not ds.backend.is_configured:
            raise ValueError("Dataset is not fully configured. Cannot update.")

        d = Dispatcher()
        kwargs = {
            'logged_in_username': logged_in_username,
            'access_token': flask.g.access_token,
            'id_token': flask.g.id_token,
            'dataset_owner': dataset_owner,
            'dataset_name': dataset_name,
        }

        background_job_key = None

        if from_remote is True:
            if ds.backend.can_update_from_remote:
                # Gen unique keys for tracking jobs
                metadata = {
                    'dataset':
                    f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                    'method': 'update_unmanaged_dataset_from_remote'
                }

                job_response = d.dispatch_task(
                    jobs.update_unmanaged_dataset_from_remote,
                    kwargs=kwargs,
                    metadata=metadata)
                background_job_key = job_response.key_str
            else:
                raise ValueError(
                    "This dataset type does not support automatic update via querying its remote"
                )

        elif from_local is True:
            # Gen unique keys for tracking jobs
            metadata = {
                'dataset':
                f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                'method': 'update_unmanaged_dataset_from_local'
            }

            job_response = d.dispatch_task(
                jobs.update_unmanaged_dataset_from_local,
                kwargs=kwargs,
                metadata=metadata)
            background_job_key = job_response.key_str
        else:
            ValueError("Either `fromRemote` or `fromLocal` must be True.")

        return UpdateUnmanagedDataset(dataset=Dataset(id="{}&{}".format(
            dataset_owner, dataset_name),
                                                      name=dataset_name,
                                                      owner=dataset_owner),
                                      background_job_key=background_job_key)
Example 14
    def mutate_and_get_payload(cls,
                               root,
                               info,
                               dataset_owner,
                               dataset_name,
                               parameters=None,
                               confirm=None,
                               client_mutation_id=None):
        logged_in_username = get_logged_in_username()
        im = InventoryManager()
        ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name,
                             get_logged_in_author())
        ds.backend.set_default_configuration(logged_in_username,
                                             bearer_token=flask.g.access_token,
                                             id_token=flask.g.id_token)

        should_confirm = False
        error_message = None
        confirm_message = None
        background_job_key = None
        is_configured = None

        if confirm is None:
            if parameters:
                # Update the configuration
                current_config = ds.backend_config
                for param in parameters:
                    current_config[param.parameter] = param.value
                ds.backend_config = current_config

            # Validate the configuration
            try:
                confirm_message = ds.backend.confirm_configuration(ds)
                if confirm_message is not None:
                    should_confirm = True
            except ValueError as err:
                error_message = f"{err}"
                is_configured = False
        else:
            if confirm is False:
                # Clear configuration
                current_config = ds.backend_config
                for param in parameters:
                    current_config[param.parameter] = None
                ds.backend_config = current_config

            else:
                if ds.backend.can_update_from_remote:
                    d = Dispatcher()
                    kwargs = {
                        'logged_in_username': logged_in_username,
                        'access_token': flask.g.access_token,
                        'id_token': flask.g.id_token,
                        'dataset_owner': dataset_owner,
                        'dataset_name': dataset_name,
                    }

                    # Gen unique keys for tracking jobs
                    metadata = {
                        'dataset':
                        f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                        'method': 'update_unmanaged_dataset_from_remote'
                    }
                    job_response = d.dispatch_task(
                        jobs.update_unmanaged_dataset_from_remote,
                        kwargs=kwargs,
                        metadata=metadata)

                    background_job_key = job_response.key_str

        if is_configured is None:
            is_configured = ds.backend.is_configured

        return ConfigureDataset(
            dataset=Dataset(id="{}&{}".format(dataset_owner, dataset_name),
                            name=dataset_name,
                            owner=dataset_owner),
            is_configured=is_configured,
            should_confirm=should_confirm,
            confirm_message=confirm_message,
            error_message=error_message,
            has_background_job=ds.backend.can_update_from_remote,
            background_job_key=background_job_key)
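
Note: the configuration path above only requires each entry in parameters to expose .parameter and .value, which are copied straight into ds.backend_config. The hypothetical stand-in below shows how those inputs translate into config updates; the parameter names are placeholders, not real backend settings.

from dataclasses import dataclass


@dataclass
class BackendParameterInput:
    parameter: str
    value: str


def apply_parameters(current_config: dict, parameters) -> dict:
    """Mirrors the update step in the confirm-is-None branch above."""
    for param in parameters:
        current_config[param.parameter] = param.value
    return current_config


config = apply_parameters({}, [BackendParameterInput('url', 'https://data.example.com'),
                               BackendParameterInput('prefix', 'raw/')])
assert config == {'url': 'https://data.example.com', 'prefix': 'raw/'}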
Example 15
    def test_checkout__linked_dataset(self, mock_labbook_lfs_disabled,
                                      mock_config_file):
        """ test checking out a branch in a project that pulls in a linked dataset"""
        def dispatcher_mock(self, function_ref, kwargs, metadata):
            assert kwargs['logged_in_username'] == 'other-test-user2'
            assert kwargs['dataset_owner'] == 'testuser'
            assert kwargs['dataset_name'] == 'test-ds'

            # Inject mocked config file
            kwargs['config_file'] = mock_config_file[0]

            # Stop patching so job gets scheduled for real
            dispatcher_patch.stop()

            # Call same method as in mutation
            d = Dispatcher()
            res = d.dispatch_task(
                gtmcore.dispatcher.dataset_jobs.check_and_import_dataset,
                kwargs=kwargs,
                metadata=metadata)

            return res

        username = '******'
        lb = mock_labbook_lfs_disabled[2]
        im = InventoryManager(config_file=mock_labbook_lfs_disabled[0])
        ds = im.create_dataset(username,
                               username,
                               'test-ds',
                               storage_type='gigantum_object_v1')

        # Publish dataset
        dataset_wf = DatasetWorkflow(ds)
        dataset_wf.publish(username=username)

        # Publish project
        labbook_wf = LabbookWorkflow(lb)
        labbook_wf.publish(username=username)

        # Switch branches
        labbook_wf.labbook.checkout_branch(branch_name="dataset-branch",
                                           new=True)

        # Link to project
        im.link_dataset_to_labbook(dataset_wf.remote, username, username,
                                   labbook_wf.labbook)

        # Publish branch
        labbook_wf.sync(username=username)

        # Import project
        other_user = '******'
        wf_other = LabbookWorkflow.import_from_remote(
            labbook_wf.remote,
            username=other_user,
            config_file=mock_config_file[0])

        # The remotes must be the same, because it's the same remote repo
        assert wf_other.remote == labbook_wf.remote
        assert wf_other.repository != labbook_wf.repository
        assert f'{other_user}/{username}/labbooks/labbook1' in wf_other.repository.root_dir

        with pytest.raises(InventoryException):
            im_other_user = InventoryManager(config_file=mock_config_file[0])
            ds = im_other_user.load_dataset(other_user, username, 'test-ds')

        # Patch dispatch_task so you can inject the mocked config file
        dispatcher_patch = patch.object(Dispatcher, 'dispatch_task',
                                        dispatcher_mock)
        dispatcher_patch.start()

        # Checkout the branch
        assert wf_other.labbook.active_branch == "master"
        wf_other.checkout(username=other_user, branch_name="dataset-branch")

        cnt = 0
        while cnt < 20:
            try:
                im_other_user = InventoryManager(
                    config_file=mock_config_file[0])
                ds = im_other_user.load_dataset(other_user, username,
                                                'test-ds')
                break
            except InventoryException:
                cnt += 1
                time.sleep(1)

        assert cnt < 20
        assert ds.name == 'test-ds'
        assert ds.namespace == username
        assert mock_config_file[1] in ds.root_dir
        assert wf_other.labbook.active_branch == "dataset-branch"
Example 16
    def test_import_from_remote__linked_dataset(self,
                                                mock_labbook_lfs_disabled,
                                                mock_config_file):
        """ test importing a project with a linked dataset"""
        def dispatcher_mock(self, function_ref, kwargs, metadata):
            assert kwargs['logged_in_username'] == 'other-test-user2'
            assert kwargs['dataset_owner'] == 'testuser'
            assert kwargs['dataset_name'] == 'test-ds'

            # Inject mocked config file
            kwargs['config_file'] = mock_config_file[0]

            # Stop patching so job gets scheduled for real
            dispatcher_patch.stop()

            # Call same method as in mutation
            d = Dispatcher()
            res = d.dispatch_task(
                gtmcore.dispatcher.dataset_jobs.check_and_import_dataset,
                kwargs=kwargs,
                metadata=metadata)

            return res

        username = '******'
        lb = mock_labbook_lfs_disabled[2]
        im = InventoryManager(config_file=mock_labbook_lfs_disabled[0])
        ds = im.create_dataset(username,
                               username,
                               'test-ds',
                               storage_type='gigantum_object_v1')

        # Publish dataset
        dataset_wf = DatasetWorkflow(ds)
        dataset_wf.publish(username=username)

        # Link to project
        im.link_dataset_to_labbook(dataset_wf.remote, username, username, lb)

        # Publish project
        labbook_wf = LabbookWorkflow(lb)
        labbook_wf.publish(username=username)

        # Patch dispatch_task so you can inject the mocked config file
        dispatcher_patch = patch.object(Dispatcher, 'dispatch_task',
                                        dispatcher_mock)
        dispatcher_patch.start()

        # Import project, triggering an auto-import of the dataset
        other_user = '******'
        wf_other = LabbookWorkflow.import_from_remote(
            labbook_wf.remote,
            username=other_user,
            config_file=mock_config_file[0])

        # The remotes must be the same, because it's the same remote repo
        assert wf_other.remote == labbook_wf.remote
        # The actual path on disk will be different, though
        assert wf_other.repository != labbook_wf.repository
        # Check imported into namespace of original owner (testuser)
        assert f'{other_user}/{username}/labbooks/labbook1' in wf_other.repository.root_dir

        cnt = 0
        while cnt < 20:
            try:
                im_other_user = InventoryManager(
                    config_file=mock_config_file[0])
                ds = im_other_user.load_dataset(other_user, username,
                                                'test-ds')
                break
            except InventoryException:
                cnt += 1
                time.sleep(1)

        assert cnt < 20
        assert ds.name == 'test-ds'
        assert ds.namespace == username
        assert mock_config_file[1] in ds.root_dir
Example 17
    def mutate_and_get_payload(cls,
                               root,
                               info,
                               labbook_owner,
                               labbook_name,
                               dataset_owner,
                               dataset_name,
                               action,
                               dataset_url=None,
                               client_mutation_id=None):
        logged_in_username = get_logged_in_username()
        im = InventoryManager()
        lb = im.load_labbook(logged_in_username,
                             labbook_owner,
                             labbook_name,
                             author=get_logged_in_author())

        with lb.lock():
            if action == 'link':
                if dataset_url:
                    remote_domain = cls._get_remote_domain(
                        dataset_url, dataset_owner, dataset_name)

                    if remote_domain:
                        # Make sure git creds are configured for the remote
                        admin_service = None
                        for remote in lb.client_config.config['git'][
                                'remotes']:
                            if remote_domain == remote:
                                admin_service = lb.client_config.config['git'][
                                    'remotes'][remote]['admin_service']
                                break
                        if "HTTP_AUTHORIZATION" in info.context.headers.environ:
                            token = parse_token(info.context.headers.
                                                environ["HTTP_AUTHORIZATION"])
                        else:
                            raise ValueError(
                                "Authorization header not provided."
                                " Must have a valid session to configure git credentials"
                            )
                        mgr = GitLabManager(remote_domain, admin_service,
                                            token)
                        mgr.configure_git_credentials(remote_domain,
                                                      logged_in_username)
                else:
                    # Link to local dataset
                    ds = im.load_dataset(logged_in_username, dataset_owner,
                                         dataset_name)
                    dataset_url = f"{ds.root_dir}/.git"

                # Link the dataset to the labbook
                ds = im.link_dataset_to_labbook(dataset_url, dataset_owner,
                                                dataset_name, lb)
                ds.namespace = dataset_owner

                # Preload the dataloader
                info.context.dataset_loader.prime(
                    f"{get_logged_in_username()}&{dataset_owner}&{dataset_name}",
                    ds)

                # Relink the revision
                m = Manifest(ds, logged_in_username)
                m.link_revision()
            elif action == 'unlink':
                im.unlink_dataset_from_labbook(dataset_owner, dataset_name, lb)
            elif action == 'update':
                ds = im.update_linked_dataset_reference(
                    dataset_owner, dataset_name, lb)
                m = Manifest(ds, logged_in_username)
                m.force_reload()

                info.context.dataset_loader.prime(
                    f"{get_logged_in_username()}&{dataset_owner}&{dataset_name}",
                    ds)
            else:
                raise ValueError(
                    "Unsupported action. Use `link`, `unlink`, or `update`")

            info.context.labbook_loader.prime(
                f"{get_logged_in_username()}&{labbook_owner}&{labbook_name}",
                lb)
            edge = LabbookConnection.Edge(node=Labbook(owner=labbook_owner,
                                                       name=labbook_name),
                                          cursor=base64.b64encode(
                                              f"{0}".encode('utf-8')))

        return ModifyDatasetLink(new_labbook_edge=edge)
Example 18
def download_dataset_files(logged_in_username: str, access_token: str, id_token: str,
                           dataset_owner: str, dataset_name: str,
                           labbook_owner: Optional[str] = None, labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False, keys: Optional[List[str]] = None):
    """Method to import a dataset from a zip file

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download

    Returns:
        None
    """
    def update_meta(msg):
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager()

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        if all_keys:
            result = iom.pull_all(status_update_fn=update_meta)
        elif keys:
            result = iom.pull_objects(keys=keys, status_update_fn=update_meta)
        else:
            raise ValueError("Must provide a list of keys or set all_keys=True")

        # Save the successful and failed keys to the job metadata so the UI can re-fetch as needed
        job = get_current_job()
        if job:
            job.meta['success_keys'] = [x.dataset_path for x in result.success]
            job.meta['failure_keys'] = [x.dataset_path for x in result.failure]
            job.save_meta()

        if len(result.failure) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            sys.exit(-1)

    except Exception as err:
        logger.exception(err)
        raise