    def test_pull_objects_all(self, mock_dataset_with_manifest):
        ds, manifest, working_dir = mock_dataset_with_manifest
        iom = IOManager(ds, manifest)

        revision = manifest.dataset_revision
        os.makedirs(
            os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test1.txt", "test content 1")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test2.txt", "test content 2")
        manifest.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 2
        _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
        _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
        obj1_target = obj_to_push[0].object_path
        obj2_target = obj_to_push[1].object_path

        obj1_source = os.path.join('/tmp', uuid.uuid4().hex)
        obj2_source = os.path.join('/tmp', uuid.uuid4().hex)

        check_info = {obj1_target: obj1_source, obj2_target: obj2_source}

        assert os.path.exists(obj1_target) is True
        assert os.path.exists(obj2_target) is True

        helper_compress_file(obj1_target, obj1_source)
        helper_compress_file(obj2_target, obj2_source)

        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj2_target) is False
        assert os.path.isfile(obj1_source) is True
        assert os.path.isfile(obj2_source) is True

        # remove data from the local file cache
        os.remove(
            os.path.join(manifest.cache_mgr.cache_root,
                         manifest.dataset_revision, "test1.txt"))
        os.remove(
            os.path.join(manifest.cache_mgr.cache_root,
                         manifest.dataset_revision, "test2.txt"))
        shutil.rmtree(os.path.join(manifest.cache_mgr.cache_root, 'objects'))
        os.makedirs(os.path.join(manifest.cache_mgr.cache_root, 'objects'))

        with aioresponses() as mocked_responses:
            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                payload={
                    "presigned_url":
                    f"https://dummyurl.com/{obj_id_1}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_1,
                    "dataset": ds.name
                },
                status=200)

            with open(obj1_source, 'rb') as data1:
                mocked_responses.get(
                    f"https://dummyurl.com/{obj_id_1}?params=1",
                    body=data1.read(),
                    status=200,
                    content_type='application/octet-stream')

            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_2}',
                payload={
                    "presigned_url":
                    f"https://dummyurl.com/{obj_id_2}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_2,
                    "dataset": ds.name
                },
                status=200)

            with open(obj2_source, 'rb') as data2:
                mocked_responses.get(
                    f"https://dummyurl.com/{obj_id_2}?params=1",
                    body=data2.read(),
                    status=200,
                    content_type='application/octet-stream')

            iom.dataset.backend.set_default_configuration(
                "test-user", "abcd", '1234')

            result = iom.pull_all()
            assert len(result.success) == 2
            assert len(result.failure) == 0
            assert result.success[0].object_path != result.success[
                1].object_path
            assert result.success[0].object_path in [
                obj_to_push[0].object_path, obj_to_push[1].object_path
            ]
            assert result.success[1].object_path in [
                obj_to_push[0].object_path, obj_to_push[1].object_path
            ]

            assert os.path.isfile(obj1_target) is True
            assert os.path.isfile(obj2_target) is True

            decompressor = snappy.StreamDecompressor()
            for r in result.success:
                with open(check_info[r.object_path], 'rb') as dd:
                    source1 = decompressor.decompress(dd.read())
                    source1 += decompressor.flush()
                with open(r.object_path, 'rt') as dd:
                    dest1 = dd.read()
                assert source1.decode("utf-8") == dest1

    def test_pull_objects_all_partial_download(self, mock_dataset_with_manifest):
        ds, manifest, working_dir = mock_dataset_with_manifest
        iom = IOManager(ds, manifest)

        revision = manifest.dataset_revision
        os.makedirs(
            os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "other_dir/test3.txt", "1")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test1.txt", "test content 1")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test2.txt", "test content 2")
        manifest.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 3
        _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
        _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
        _, obj_id_3 = obj_to_push[2].object_path.rsplit('/', 1)
        obj1_target = obj_to_push[0].object_path
        obj2_target = obj_to_push[1].object_path
        obj3_target = obj_to_push[2].object_path

        obj1_source = os.path.join('/tmp', uuid.uuid4().hex)

        assert "test3.txt" in obj_to_push[0].dataset_path

        assert os.path.exists(obj1_target) is True
        assert os.path.exists(obj2_target) is True
        assert os.path.exists(obj3_target) is True

        # Completely remove other_dir/test3.txt object
        os.remove(
            os.path.join(manifest.cache_mgr.cache_root,
                         manifest.dataset_revision, "other_dir", "test3.txt"))
        helper_compress_file(obj1_target, obj1_source)

        # Remove link for test1.txt
        os.remove(
            os.path.join(manifest.cache_mgr.cache_root,
                         manifest.dataset_revision, "test1.txt"))

        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj2_target) is True
        assert os.path.isfile(obj3_target) is True

        with aioresponses() as mocked_responses:
            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                payload={
                    "presigned_url":
                    f"https://dummyurl.com/{obj_id_1}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_1,
                    "dataset": ds.name
                },
                status=200)

            with open(obj1_source, 'rb') as data1:
                mocked_responses.get(
                    f"https://dummyurl.com/{obj_id_1}?params=1",
                    body=data1.read(),
                    status=200,
                    content_type='application/octet-stream')

            iom.dataset.backend.set_default_configuration(
                "test-user", "abcd", '1234')

            result = iom.pull_all()
            assert len(result.success) == 1
            assert len(result.failure) == 0
            assert result.success[0].object_path == obj1_target
            assert "test3.txt" in result.success[0].dataset_path

            assert os.path.isfile(obj1_target) is True
            assert os.path.isfile(obj2_target) is True
            assert os.path.isfile(obj3_target) is True

            filename = os.path.join(manifest.cache_mgr.cache_root,
                                    manifest.dataset_revision, "other_dir",
                                    "test3.txt")
            assert os.path.isfile(filename) is True
            with open(filename, 'rt') as dd:
                assert dd.read() == "1"

            filename = os.path.join(manifest.cache_mgr.cache_root,
                                    manifest.dataset_revision, "test1.txt")
            assert os.path.isfile(filename) is True
            with open(filename, 'rt') as dd:
                assert dd.read() == "test content 1"

            filename = os.path.join(manifest.cache_mgr.cache_root,
                                    manifest.dataset_revision, "test2.txt")
            assert os.path.isfile(filename) is True
            with open(filename, 'rt') as dd:
                assert dd.read() == "test content 2"

            # Try pulling all again with nothing to pull
            result = iom.pull_all()
            assert len(result.success) == 0
            assert len(result.failure) == 0
            assert result.message == "Dataset already downloaded."
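
These tests rely on two helpers that are not shown in this snippet. A minimal sketch of what they might look like, assuming snappy stream framing to match the StreamDecompressor usage above (names and signatures are inferred from the call sites in the tests):

import os
import snappy

def helper_append_file(cache_root, revision, rel_path, content):
    # Append text content to a file inside the revision directory of the cache.
    with open(os.path.join(cache_root, revision, rel_path), 'at') as f:
        f.write(content)

def helper_compress_file(object_path, source_path):
    # Hypothetical reconstruction: snappy stream-compress the object file to
    # `source_path`, then remove the original so the tests can verify that
    # pull_all() restores it from the mocked download.
    with open(object_path, 'rb') as src, open(source_path, 'wb') as dst:
        snappy.stream_compress(src, dst)
    os.remove(object_path)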
Example #3

def download_dataset_files(logged_in_username: str, access_token: str, id_token: str,
                           dataset_owner: str, dataset_name: str,
                           labbook_owner: Optional[str] = None, labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False, keys: Optional[List[str]] = None):
    """Method to import a dataset from a zip file

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download

    Returns:
        None
    """
    def update_meta(msg):
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager()

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset; load the repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # This is a normal dataset; load the repo from the working directory
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        if all_keys:
            result = iom.pull_all(status_update_fn=update_meta)
        elif keys:
            result = iom.pull_objects(keys=keys, status_update_fn=update_meta)
        else:
            raise ValueError("Must provide a list of keys or set all_keys=True")

        # Save the Relay node IDs to the job metadata so the UI can re-fetch as needed
        job = get_current_job()
        if job:
            job.meta['success_keys'] = [x.dataset_path for x in result.success]
            job.meta['failure_keys'] = [x.dataset_path for x in result.failure]
            job.save_meta()

        if len(result.failure) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            sys.exit(-1)

    except Exception as err:
        logger.exception(err)
        raise
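
For illustration, a hypothetical direct invocation; in the Gigantum client this function normally runs as a background rq job (hence the get_current_job() progress reporting), and all names and tokens below are placeholders:

if __name__ == '__main__':
    # Placeholder credentials and dataset name; real invocations are
    # dispatched through the rq job queue rather than called directly.
    download_dataset_files(logged_in_username="test-user",
                           access_token="abcd",
                           id_token="1234",
                           dataset_owner="test-user",
                           dataset_name="my-dataset",
                           all_keys=True)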