def test_delete_labbook_linked_dataset(self, mock_config_file):
        """Test trying to create a labbook with a name that already exists locally"""
        inv_manager = InventoryManager(mock_config_file[0])
        inv_manager.create_labbook("test",
                                   "test",
                                   "labbook1",
                                   description="my first labbook")
        lb = inv_manager.load_labbook("test", "test", "labbook1")

        auth = GitAuthor(name="test", email="*****@*****.**")
        ds = inv_manager.create_dataset("test",
                                        "test",
                                        "dataset1",
                                        "gigantum_object_v1",
                                        description="my first dataset",
                                        author=auth)

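        # Link the dataset into the labbook; deleting the labbook should then return a cleanup job for it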
        inv_manager.link_dataset_to_labbook(f"{ds.root_dir}/.git", "test",
                                            "dataset1", lb)

        dataset_delete_jobs = inv_manager.delete_labbook(
            "test", "test", "labbook1")
        assert len(dataset_delete_jobs) == 1
        assert dataset_delete_jobs[0].namespace == "test"
        assert dataset_delete_jobs[0].name == "dataset1"

        with pytest.raises(InventoryException):
            inv_manager.load_labbook("test", "test", "labbook1")

    def test_delete_labbook_no_linked_datasets(self, mock_config_file):
        """Test deleting a labbook that has no linked datasets"""
        inv_manager = InventoryManager(mock_config_file[0])
        inv_manager.create_labbook("test",
                                   "test",
                                   "labbook1",
                                   description="my first labbook")
        inv_manager.load_labbook("test", "test", "labbook1")

        dataset_delete_jobs = inv_manager.delete_labbook(
            "test", "test", "labbook1")
        assert dataset_delete_jobs == []

        with pytest.raises(InventoryException):
            inv_manager.load_labbook("test", "test", "labbook1")
Example #3
    def test_publish_basic(self, fixture_working_dir,
                           mock_create_labbooks_no_lfs):

        # Mock the request context so a fake authorization header is present
        builder = EnvironBuilder(path='/labbook',
                                 method='POST',
                                 headers={'Authorization': 'Bearer AJDFHASD'})
        env = builder.get_environ()
        req = Request(environ=env)

        im = InventoryManager(mock_create_labbooks_no_lfs[0])
        test_user_lb = im.load_labbook('default', 'default', 'labbook1')

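        # GraphQL mutation to publish the labbook, executed below via the schema client provided by the fixture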
        publish_query = f"""
        mutation c {{
            publishLabbook(input: {{
                labbookName: "labbook1",
                owner: "default"
            }}) {{
                jobKey
            }}
        }}
        """

        r = mock_create_labbooks_no_lfs[2].execute(publish_query,
                                                   context_value=req)
        assert 'errors' not in r
Example #4
    def test_reset_branch_to_remote(self, fixture_working_dir, mock_create_labbooks_no_lfs):
        # Mock the request context so a fake authorization header is present
        builder = EnvironBuilder(path='/labbook', method='POST', headers={'Authorization': 'Bearer AJDFHASD'})
        env = builder.get_environ()
        req = Request(environ=env)

        im = InventoryManager(mock_create_labbooks_no_lfs[0])
        test_user_lb = im.load_labbook('default', 'default', 'labbook1')
        wf = LabbookWorkflow(test_user_lb)
        wf.publish(username='******')
        hash_original = wf.labbook.git.commit_hash

        new_file_path = os.path.join(wf.labbook.root_dir, 'input', 'new-file')
        with open(new_file_path, 'w') as f:
            f.write('File data')
        wf.labbook.sweep_uncommitted_changes()
        hash_before_reset = wf.labbook.git.commit_hash

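        # GraphQL mutation to reset the active branch back to the published (remote) state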
        publish_query = f"""
        mutation c {{
            resetBranchToRemote(input: {{
                labbookName: "labbook1",
                owner: "default"
            }}) {{
                labbook {{
                    activeBranchName
                }}
            }}
        }}
        """

        r = mock_create_labbooks_no_lfs[2].execute(publish_query, context_value=req)
        assert 'errors' not in r
        assert wf.labbook.git.commit_hash == hash_original
Example #5
    def test_sync_1(self, mock_create_labbooks_no_lfs, mock_config_file):

        # Setup responses mock for this test
        responses.add(responses.GET, 'https://usersrv.gigantum.io/key',
                      json={'key': 'afaketoken'}, status=200)

        im = InventoryManager(mock_create_labbooks_no_lfs[0])
        test_user_lb = im.load_labbook('default', 'default', 'labbook1')
        test_user_wf = LabbookWorkflow(test_user_lb)
        test_user_wf.publish('default')

        # Mock the request context so a fake authorization header is present
        builder = EnvironBuilder(path='/labbook', method='POST', headers={'Authorization': 'Bearer AJDFHASD'})
        env = builder.get_environ()
        req = Request(environ=env)


        sally_wf = LabbookWorkflow.import_from_remote(test_user_wf.remote, 'sally', config_file=mock_config_file[0])
        sally_lb = sally_wf.labbook
        FileOperations.makedir(sally_lb, relative_path='code/sally-dir', create_activity_record=True)
        sally_wf.sync('sally')

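        # GraphQL mutation issued as the 'default' user to sync the labbook after sally's change was pushed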
        sync_query = """
        mutation x {
            syncLabbook(input: {
                labbookName: "labbook1",
                owner: "default"
            }) {
                jobKey
            }
        }
        """
        r = mock_create_labbooks_no_lfs[2].execute(sync_query, context_value=req)

        assert 'errors' not in r
Example #6
    def mutate_and_get_payload(cls,
                               root,
                               info,
                               dataset_owner,
                               dataset_name,
                               labbook_name=None,
                               labbook_owner=None,
                               all_keys=None,
                               keys=None,
                               client_mutation_id=None):
        logged_in_username = get_logged_in_username()

        lb = None
        im = InventoryManager()
        if labbook_name:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner,
                                 labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner,
                                 dataset_name)

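        # Assemble kwargs for the background download job and dispatch it via the Dispatcher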
        d = Dispatcher()
        dl_kwargs = {
            'logged_in_username': logged_in_username,
            'access_token': flask.g.access_token,
            'id_token': flask.g.id_token,
            'dataset_owner': dataset_owner,
            'dataset_name': dataset_name,
            'labbook_owner': labbook_owner,
            'labbook_name': labbook_name,
            'all_keys': all_keys,
            'keys': keys
        }

        # Gen unique keys for tracking jobs
        lb_key = f"{logged_in_username}|{labbook_owner}|{labbook_name}" if lb else None
        ds_key = f"{logged_in_username}|{dataset_owner}|{dataset_name}"
        if lb_key:
            ds_key = f"{lb_key}|LINKED|{ds_key}"

        metadata = {
            'dataset': ds_key,
            'labbook': lb_key,
            'method': 'download_dataset_files'
        }

        res = d.dispatch_task(jobs.download_dataset_files,
                              kwargs=dl_kwargs,
                              metadata=metadata)

        return DownloadDatasetFiles(background_job_key=res.key_str)
Example #7
    def test_load_labbook(self, mock_config_file):
        """Test loading a labbook from a directory"""
        inv_manager = InventoryManager(mock_config_file[0])
        lb = inv_manager.create_labbook("test",
                                        "test",
                                        "labbook1",
                                        description="my first labbook")
        labbook_dir = lb.root_dir

        assert labbook_dir == os.path.join(mock_config_file[1], "test", "test",
                                           "labbooks", "labbook1")
        assert type(lb) == LabBook

        # Validate directory structure
        assert os.path.isdir(os.path.join(labbook_dir, "code")) is True
        assert os.path.isdir(os.path.join(labbook_dir, "input")) is True
        assert os.path.isdir(os.path.join(labbook_dir, "output")) is True
        assert os.path.isdir(os.path.join(labbook_dir, ".gigantum")) is True
        assert os.path.isdir(os.path.join(labbook_dir, ".gigantum",
                                          "env")) is True
        assert os.path.isdir(os.path.join(labbook_dir, ".gigantum",
                                          "activity")) is True
        assert os.path.isdir(
            os.path.join(labbook_dir, ".gigantum", "activity", "log")) is True
        assert os.path.isdir(
            os.path.join(labbook_dir, ".gigantum", "activity",
                         "index")) is True

        # Validate labbook data file
        with open(os.path.join(labbook_dir, ".gigantum", "project.yaml"),
                  "rt") as data_file:
            data = yaml.safe_load(data_file)

        assert data["name"] == "labbook1"
        assert data["description"] == "my first labbook"
        assert "id" in data

        lb_loaded = inv_manager.load_labbook("test", "test", "labbook1")

        assert lb_loaded.root_dir == os.path.join(mock_config_file[1], "test",
                                                  "test", "labbooks",
                                                  "labbook1")
        assert type(lb_loaded) == LabBook

        # Validate labbook data file
        assert lb_loaded.root_dir == lb.root_dir
        assert lb_loaded.id == lb.id
        assert lb_loaded.name == lb.name
        assert lb_loaded.description == lb.description
        assert lb_loaded.key == 'test|test|labbook1'
Example #8
    def test_change_properties(self, mock_config_file):
        """Test loading a labbook from a directory"""
        im = InventoryManager(mock_config_file[0])
        lb = im.create_labbook('test',
                               'test',
                               'labbook1',
                               description="my first labbook")
        lb.description = "an updated description"

        # Reload and see changes
        lb_loaded = im.load_labbook("test", "test", 'labbook1')

        # Validate labbook data file
        assert lb_loaded.id == lb.id
        assert lb_loaded.description == "an updated description"
Example #9
    def __init__(self,
                 user: str,
                 owner: str,
                 labbook_name: str,
                 monitor_key: str,
                 config_file: str = None,
                 author_name: Optional[str] = None,
                 author_email: Optional[str] = None) -> None:
        """Constructor requires info to load the lab book

        Args:
            user(str): current logged in user
            owner(str): owner of the lab book
            labbook_name(str): name of the lab book
            monitor_key(str): Unique key for the activity monitor in redis
            author_name(str): Name of the user starting this activity monitor
            author_email(str): Email of the user starting this activity monitor
        """
        self.monitor_key = monitor_key

        # List of processor classes that will be invoked in order
        self.processors: List[ActivityProcessor] = []

        # Populate GitAuthor instance if available
        if author_name:
            author: Optional[GitAuthor] = GitAuthor(name=author_name,
                                                    email=author_email)
        else:
            author = None

        # Load Lab Book instance
        im = InventoryManager(config_file)
        self.labbook = im.load_labbook(user,
                                       owner,
                                       labbook_name,
                                       author=author)
        self.user = user
        self.owner = owner
        self.labbook_name = labbook_name

        # Create ActivityStore instance
        self.activity_store = ActivityStore(self.labbook)

        # A flag indicating if the activity record is OK to store
        self.can_store_activity_record = False

    def test_delete_dir(self, mock_create_labbooks):
        im = InventoryManager(mock_create_labbooks[0])
        lb = im.load_labbook('default', 'default', 'labbook1')
        FileOperations.makedir(lb, 'code/subdir')

        test_file = os.path.join(lb.root_dir, 'code', 'subdir', 'test.txt')
        with open(test_file, 'wt') as tf:
            tf.write("puppers")

        lb.git.add_all('code/')
        lb.git.commit("blah")

        dir_path = os.path.join(lb.root_dir, 'code', 'subdir')
        assert os.path.exists(dir_path) is True
        assert os.path.exists(test_file) is True

        # Note: deletion should work with or without a trailing / on the path.
        query = """
        mutation deleteLabbookFiles {
          deleteLabbookFiles(
            input: {
              owner: "default",
              labbookName: "labbook1",
              section: "code",
              filePaths: ["subdir/"]
            }) {
              success
            }
        }
        """
        res = mock_create_labbooks[2].execute(query)
        print(res)
        assert res['data']['deleteLabbookFiles']['success'] is True

        assert os.path.exists(dir_path) is False
        assert os.path.exists(test_file) is False
        assert os.path.exists(os.path.join(lb.root_dir, 'code')) is True
Example #11
def verify_dataset_contents(logged_in_username: str,
                            access_token: str,
                            id_token: str,
                            dataset_owner: str,
                            dataset_name: str,
                            labbook_owner: Optional[str] = None,
                            labbook_name: Optional[str] = None) -> None:
    """Method to update/populate an unmanaged dataset from it local state

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked

    Returns:
        None
    """
    job = get_current_job()

    def update_meta(msg):
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting verify_dataset_contents(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name},"
            f"labbook_owner={labbook_owner}, labbook_name={labbook_name}")

        im = InventoryManager()
        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner,
                                 labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner,
                                 dataset_name)

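        # Configure the dataset backend with the caller's credentials before verifying contents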
        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)

        result = ds.backend.verify_contents(ds, update_meta)
        if job:
            job.meta['modified_keys'] = result
            job.save_meta()

    except Exception as err:
        logger.exception(err)
        raise
Example #12
def download_dataset_files(logged_in_username: str,
                           access_token: str,
                           id_token: str,
                           dataset_owner: str,
                           dataset_name: str,
                           labbook_owner: Optional[str] = None,
                           labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False,
                           keys: Optional[List[str]] = None,
                           config_file: str = None) -> None:
    """Method to download files from a dataset in the background and provide status to the UI.

    This job schedules `pull_objects` jobs after splitting up the download work into batches. At the end, the job
    removes any partially downloaded files (due to failures) and links all the files for the dataset.

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    dispatcher_obj = Dispatcher()

    def update_feedback(msg: str,
                        has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if has_failures:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete:
            current_job.meta['percent_complete'] = percent_complete

        current_job.meta['feedback'] = msg
        current_job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
            f" dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
            f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner,
                                 labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner,
                                 dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)
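        # Build the manifest and IO manager used to split the download into pull batches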
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        key_batches, total_bytes, num_files = iom.compute_pull_batches(
            keys, pull_all=all_keys)

        failure_keys = list()
        if key_batches:
            # Schedule jobs for batches
            bg_jobs = list()
            for keys in key_batches:
                job_kwargs = {
                    'keys': keys,
                    'logged_in_username': logged_in_username,
                    'access_token': access_token,
                    'id_token': id_token,
                    'dataset_owner': dataset_owner,
                    'dataset_name': dataset_name,
                    'labbook_owner': labbook_owner,
                    'labbook_name': labbook_name,
                    'config_file': config_file,
                }
                job_metadata = {
                    'dataset':
                    f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                    'method': 'pull_objects'
                }

                job_key = dispatcher_obj.dispatch_task(
                    method_reference=pull_objects,
                    kwargs=job_kwargs,
                    metadata=job_metadata,
                    persist=True)
                bg_jobs.append(
                    BackgroundDownloadJob(dispatcher_obj, keys, job_key))

            update_feedback(
                f"Please wait - Downloading {num_files} files ({format_size(total_bytes)}) - 0% complete",
                percent_complete=0,
                has_failures=False)
            logger.info(
                f"(Job {p}) Starting file downloads for"
                f" {logged_in_username}/{dataset_owner}/{dataset_name} with {len(key_batches)} jobs"
            )

            while sum([(x.is_complete or x.is_failed)
                       for x in bg_jobs]) != len(bg_jobs):
                # Refresh all job statuses and update status feedback
                [j.refresh_status() for j in bg_jobs]
                total_completed_bytes = sum(
                    [j.completed_bytes for j in bg_jobs])
                pc = (float(total_completed_bytes) / float(total_bytes)) * 100
                update_feedback(
                    f"Please wait - Downloading {num_files} files ({format_size(total_completed_bytes)} of "
                    f"{format_size(total_bytes)}) - {round(pc)}% complete",
                    percent_complete=pc)
                time.sleep(1)

            # Aggregate failures if they exist
            for j in bg_jobs:
                if j.is_failed:
                    # Whole job failed...assume the entire batch should be re-downloaded for now
                    failure_keys.extend(j.keys)
                else:
                    failure_keys.extend(j.get_failed_keys())

        # Set final status for UI
        if len(failure_keys) == 0:
            update_feedback(f"Download complete!",
                            percent_complete=100,
                            has_failures=False)
        else:
            failure_str = ""
            for f in failure_keys:
                # If any failed files partially downloaded, remove them.
                abs_dataset_path = os.path.join(m.current_revision_dir, f)
                abs_object_path = m.dataset_to_object_path(f)
                if os.path.exists(abs_dataset_path):
                    os.remove(abs_dataset_path)
                if os.path.exists(abs_object_path):
                    os.remove(abs_object_path)
                failure_str = f"{failure_str}{f}\n"

            failure_detail_str = f"Files that failed to download:\n{failure_str}"
            update_feedback("",
                            has_failures=True,
                            failure_detail=failure_detail_str)

        # Link dataset files, so anything that was successfully pulled will materialize
        m.link_revision()

        if len(failure_keys) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            raise IOError(
                f"{len(failure_keys)} file(s) failed to download. Check message detail and try again."
            )

    except Exception as err:
        logger.exception(err)
        raise
Example #13
def pull_objects(keys: List[str],
                 logged_in_username: str,
                 access_token: str,
                 id_token: str,
                 dataset_owner: str,
                 dataset_name: str,
                 labbook_owner: Optional[str] = None,
                 labbook_name: Optional[str] = None,
                 config_file: str = None) -> None:
    """Method to pull a collection of objects from a dataset's backend.

    This runs the IOManager.pull_objects() method with `link_revision=False`, because this job can run multiple times
    in parallel with different sets of keys. Revision linking happens only at the very end, in the
    `download_dataset_files` job that scheduled this one.

    Args:
        keys: List of file keys to download
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    def progress_update_callback(completed_bytes: int) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if 'completed_bytes' not in current_job.meta:
            current_job.meta['completed_bytes'] = 0

        current_job.meta['completed_bytes'] = int(
            current_job.meta['completed_bytes']) + completed_bytes
        current_job.save_meta()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting pull_objects(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
            f" labbook_name={labbook_name}")

        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner,
                                 labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner,
                                 dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

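        # Pull only this batch of keys; revision linking is deferred to the parent download_dataset_files job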
        result = iom.pull_objects(keys=keys,
                                  progress_update_fn=progress_update_callback,
                                  link_revision=False)

        job = get_current_job()
        if job:
            job.meta['failure_keys'] = ",".join(
                [x.dataset_path for x in result.failure])
            job.meta['message'] = result.message
            job.save_meta()

    except Exception as err:
        logger.exception(err)
        raise
Example #14
    def mutate_and_get_payload(cls,
                               root,
                               info,
                               labbook_owner,
                               labbook_name,
                               dataset_owner,
                               dataset_name,
                               action,
                               dataset_url=None,
                               client_mutation_id=None):
        logged_in_username = get_logged_in_username()
        im = InventoryManager()
        lb = im.load_labbook(logged_in_username,
                             labbook_owner,
                             labbook_name,
                             author=get_logged_in_author())

        with lb.lock():
            if action == 'link':
                if dataset_url:
                    remote_domain = cls._get_remote_domain(
                        dataset_url, dataset_owner, dataset_name)

                    if remote_domain:
                        # Make sure git creds are configured for the remote
                        admin_service = None
                        for remote in lb.client_config.config['git'][
                                'remotes']:
                            if remote_domain == remote:
                                admin_service = lb.client_config.config['git'][
                                    'remotes'][remote]['admin_service']
                                break
                        if "HTTP_AUTHORIZATION" in info.context.headers.environ:
                            token = parse_token(info.context.headers.
                                                environ["HTTP_AUTHORIZATION"])
                        else:
                            raise ValueError(
                                "Authorization header not provided."
                                " Must have a valid session to query for collaborators"
                            )
                        mgr = GitLabManager(remote_domain, admin_service,
                                            token)
                        mgr.configure_git_credentials(remote_domain,
                                                      logged_in_username)
                else:
                    # Link to local dataset
                    ds = im.load_dataset(logged_in_username, dataset_owner,
                                         dataset_name)
                    dataset_url = f"{ds.root_dir}/.git"

                # Link the dataset to the labbook
                ds = im.link_dataset_to_labbook(dataset_url, dataset_owner,
                                                dataset_name, lb)
                ds.namespace = dataset_owner

                # Preload the dataloader
                info.context.dataset_loader.prime(
                    f"{get_logged_in_username()}&{dataset_owner}&{dataset_name}",
                    ds)

                # Relink the revision
                m = Manifest(ds, logged_in_username)
                m.link_revision()
            elif action == 'unlink':
                im.unlink_dataset_from_labbook(dataset_owner, dataset_name, lb)
            elif action == 'update':
                ds = im.update_linked_dataset_reference(
                    dataset_owner, dataset_name, lb)
                m = Manifest(ds, logged_in_username)
                m.force_reload()

                info.context.dataset_loader.prime(
                    f"{get_logged_in_username()}&{dataset_owner}&{dataset_name}",
                    ds)
            else:
                raise ValueError(
                    "Unsupported action. Use `link`, `unlink`, or `update`")

            info.context.labbook_loader.prime(
                f"{get_logged_in_username()}&{labbook_owner}&{labbook_name}",
                lb)
            edge = LabbookConnection.Edge(node=Labbook(owner=labbook_owner,
                                                       name=labbook_name),
                                          cursor=base64.b64encode(
                                              f"{0}".encode('utf-8')))

        return ModifyDatasetLink(new_labbook_edge=edge)
Example #15
def download_dataset_files(logged_in_username: str, access_token: str, id_token: str,
                           dataset_owner: str, dataset_name: str,
                           labbook_owner: Optional[str] = None, labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False, keys: Optional[List[str]] = None):
    """Method to import a dataset from a zip file

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download

    Returns:
        None
    """
    def update_meta(msg):
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager()

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

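        # Pull everything or just the requested keys, streaming progress back to the UI via update_meta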
        if all_keys:
            result = iom.pull_all(status_update_fn=update_meta)
        elif keys:
            result = iom.pull_objects(keys=keys, status_update_fn=update_meta)
        else:
            raise ValueError("Must provide a list of keys or set all_keys=True")

        # Save the success and failure keys to the job metadata so the UI can report results
        job = get_current_job()
        if job:
            job.meta['success_keys'] = [x.dataset_path for x in result.success]
            job.meta['failure_keys'] = [x.dataset_path for x in result.failure]
            job.save_meta()

        if len(result.failure) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            sys.exit(-1)

    except Exception as err:
        logger.exception(err)
        raise