Example #1
    def test_objects_to_push(self, mock_dataset_with_manifest):
        ds, manifest, working_dir = mock_dataset_with_manifest
        iom = IOManager(ds, manifest)

        revision = manifest.dataset_revision
        os.makedirs(
            os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test1.txt", "test content 1")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test2.txt", "test content 2")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "other_dir/test4.txt", "test content 4")
        manifest.sweep_all_changes()

        # Modify file to have 2 objects with same key
        helper_append_file(manifest.cache_mgr.cache_root,
                           iom.manifest.dataset_revision, "test2.txt",
                           "test content 22")
        manifest.sweep_all_changes()

        obj_to_push = iom.objects_to_push()

        assert len(obj_to_push) == 4
        assert obj_to_push[0].dataset_path == "other_dir/test4.txt"
        assert obj_to_push[1].dataset_path == "test1.txt"
        assert obj_to_push[2].dataset_path == "test2.txt"
        assert obj_to_push[3].dataset_path == "test2.txt"
        assert obj_to_push[2].revision != obj_to_push[3].revision

        assert iom.num_objects_to_push() == 4
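
These tests rely on a `helper_append_file` test helper that is not shown in this section. A minimal sketch of what it is assumed to do, inferred from the calls above (append text to a file under the given revision directory of the cache); only the signature is taken from the tests, the body is an assumption:

import os


def helper_append_file(cache_root: str, revision: str, rel_path: str, content: str) -> None:
    """Sketch (assumed behavior): append `content` to `rel_path` inside the revision directory."""
    target = os.path.join(cache_root, revision, rel_path)
    os.makedirs(os.path.dirname(target), exist_ok=True)  # parent dirs may already exist
    with open(target, 'at') as fh:
        fh.write(content)
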
Example #2
    def test_compute_push_batches(self, mock_dataset_with_manifest_bg_tests):
        """Test compute push batches, verifying it works OK when you've deleted some files"""
        ds, manifest, working_dir = mock_dataset_with_manifest_bg_tests
        iom = IOManager(ds, manifest)

        revision = manifest.dataset_revision
        os.makedirs(
            os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "other_dir/test3.txt", "test content 3")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test1.txt", "test" * 4300000)
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test2.txt", "test content 2")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test4.txt", "test content 4")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test5.txt", "test content 5")
        manifest.sweep_all_changes()

        assert len(manifest.manifest) == 6

        # remove a file from the manifest
        manifest.delete(['test5.txt'])
        assert len(manifest.manifest) == 5

        key_batches, total_bytes, num_files = iom.compute_push_batches()
        assert num_files == 5
        assert total_bytes == (4 * 4300000) + (14 * 4)
        assert len(key_batches) == 2
        assert len(key_batches[0]) == 4
        assert len(key_batches[1]) == 1
        assert key_batches[1][0].dataset_path == 'test1.txt'
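
The `total_bytes` assertion above is plain arithmetic: `"test" * 4300000` is 4 bytes repeated 4,300,000 times (17,200,000 bytes), and each of the four small `"test content N"` payloads is 14 bytes. A quick check:

# Arithmetic behind the total_bytes assertion above.
large_file_bytes = len("test") * 4300000   # 17,200,000 bytes for test1.txt
small_file_bytes = len("test content 2")   # 14 bytes per small file
assert (4 * 4300000) + (14 * 4) == large_file_bytes + 4 * small_file_bytes  # 17,200,056 bytes
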
Example #3
    def test_objects_to_push_deduped(self, mock_dataset_with_manifest):
        ds, manifest, working_dir = mock_dataset_with_manifest
        iom = IOManager(ds, manifest)

        revision = manifest.dataset_revision
        os.makedirs(
            os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test1.txt", "test content dup")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test2.txt", "test content dup")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test3.txt", "test content dup")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "other_dir/test4.txt", "test content 4")
        manifest.sweep_all_changes()

        # Write a .DS_Store file in the objects dir to make sure it gets skipped
        with open(
                os.path.join(manifest.cache_mgr.cache_root, 'objects', '.push',
                             '.DS_Store'), 'wt') as ff:
            ff.write("")

        obj_to_push = iom.objects_to_push(remove_duplicates=True)

        assert len(obj_to_push) == 2
        assert obj_to_push[0].dataset_path == "other_dir/test4.txt"
        assert obj_to_push[1].dataset_path == "test1.txt"

        assert iom.num_objects_to_push(remove_duplicates=True) == 2
Example #4
    def test_push_objects_with_failure(self, mock_dataset_with_manifest):
        ds, manifest, working_dir = mock_dataset_with_manifest
        iom = IOManager(ds, manifest)

        revision = manifest.dataset_revision
        os.makedirs(
            os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test1.txt", "test content 1")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test2.txt", "test content 2")
        manifest.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 2
        _, obj1 = obj_to_push[0].object_path.rsplit('/', 1)
        _, obj2 = obj_to_push[1].object_path.rsplit('/', 1)

        with aioresponses() as mocked_responses:
            mocked_responses.put(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj1}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj1}?params=1",
                    "namespace": ds.namespace,
                    "key_id": "hghghg",
                    "obj_id": obj1,
                    "dataset": ds.name
                },
                status=200)
            mocked_responses.put(f"https://dummyurl.com/{obj1}?params=1",
                                 payload={},
                                 status=200)

            mocked_responses.put(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj2}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj2}?params=1",
                    "namespace": ds.namespace,
                    "key_id": "hghghg",
                    "obj_id": obj2,
                    "dataset": ds.name
                },
                status=200)
            mocked_responses.put(f"https://dummyurl.com/{obj2}?params=1",
                                 payload={},
                                 status=400)

            assert len(glob.glob(f'{iom.push_dir}/*')) == 1
            iom.dataset.backend.set_default_configuration(
                "test-user", "abcd", '1234')

            result = iom.push_objects()
            assert len(glob.glob(f'{iom.push_dir}/*')) == 1

            assert len(result.success) == 1
            assert len(result.failure) == 1
            assert result.success[0].object_path == obj_to_push[0].object_path
            assert result.failure[0].object_path == obj_to_push[1].object_path
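
The `PushResult` returned by `push_objects` carries parallel `success` and `failure` lists. A hedged sketch of how failed objects could be re-queued for a later attempt, mirroring the `queue_to_push` calls made in the `_push_dataset_objects` method later in this section:

# Illustrative: re-queue anything that failed so a subsequent push can retry it.
result = iom.push_objects()
for obj in result.failure:
    manifest.queue_to_push(obj.object_path, obj.dataset_path, obj.revision)
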
Example #5
 def _push_dataset_objects(self, dataset: Dataset, logged_in_username: str,
                           feedback_callback: Callable, access_token,
                           id_token) -> None:
     dataset.backend.set_default_configuration(logged_in_username,
                                               access_token, id_token)
     m = Manifest(dataset, logged_in_username)
     iom = IOManager(dataset, m)
     iom.push_objects(status_update_fn=feedback_callback)
     iom.manifest.link_revision()
Example #6
    def test_objects_to_push_ignore_other_branch(self,
                                                 mock_dataset_with_manifest):
        ds, manifest, working_dir = mock_dataset_with_manifest
        iom = IOManager(ds, manifest)

        revision = manifest.dataset_revision
        os.makedirs(
            os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test1.txt", "test content 1")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test2.txt", "fdsfgfd")
        manifest.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 2
        assert obj_to_push[0].dataset_path == "test1.txt"
        assert obj_to_push[1].dataset_path == "test2.txt"

        # Create new branch and add a file there
        bm = BranchManager(ds, username=USERNAME)
        starting_branch = bm.active_branch
        bm.create_branch(title="test-branch")
        assert bm.active_branch == "test-branch"
        assert ds.is_repo_clean is True

        helper_append_file(manifest.cache_mgr.cache_root,
                           iom.manifest.dataset_revision, "test3.txt",
                           "fdsfgfd")
        manifest.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 3
        assert obj_to_push[0].dataset_path == "test1.txt"
        assert obj_to_push[1].dataset_path == "test2.txt"
        assert obj_to_push[2].dataset_path == "test3.txt"

        # Go back to the original branch; the file added on the other branch should not need to be pushed
        bm.workon_branch(starting_branch)

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 2
        assert obj_to_push[0].dataset_path == "test1.txt"
        assert obj_to_push[1].dataset_path == "test2.txt"
Example #7
    def test_sync__dataset(self, mock_config_file):
        def update_feedback(msg: str,
                            has_failures: Optional[bool] = None,
                            failure_detail: Optional[str] = None,
                            percent_complete: Optional[float] = None):
            """Method to update the job's metadata and provide feedback to the UI"""
            assert has_failures is None or has_failures is False
            assert failure_detail is None

        def dispatch_query_mock(self, job_key):
            JobStatus = namedtuple("JobStatus", ['status', 'meta'])
            return JobStatus(status='finished',
                             meta={'completed_bytes': '100'})

        def dispatch_mock(self, method_reference, kwargs, metadata, persist):
            return "afakejobkey"

        username = '******'
        im = InventoryManager(mock_config_file[0])
        ds = im.create_dataset(username, username, 'dataset-1',
                               'gigantum_object_v1')
        m = Manifest(ds, username)
        wf = DatasetWorkflow(ds)

        iom = IOManager(ds, m)
        assert len(glob.glob(f'{iom.push_dir}/*')) == 0
        wf.publish(username=username, feedback_callback=update_feedback)

        # Put a file into the dataset that needs to be pushed
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test1.txt", "asdfadfsdf")
        m.sweep_all_changes()

        assert len(glob.glob(f'{iom.push_dir}/*')) == 1
        with patch.object(Dispatcher, 'dispatch_task', dispatch_mock):
            with patch.object(Dispatcher, 'query_task', dispatch_query_mock):
                wf.sync(username=username, feedback_callback=update_feedback)
                assert os.path.exists(wf.remote)
                assert len(glob.glob(f'{iom.push_dir}/*')) == 0
Example #8
    def test_compute_pull_batches(self, mock_dataset_with_manifest_bg_tests):
        ds, manifest, working_dir = mock_dataset_with_manifest_bg_tests
        iom = IOManager(ds, manifest)

        revision = manifest.dataset_revision
        os.makedirs(
            os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "other_dir/test3.txt", "test content 3")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test1.txt", "test" * 4300000)
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test2.txt", "test content 2")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test4.txt", "test content 4")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test5.txt", "test content 5")
        manifest.sweep_all_changes()

        with pytest.raises(ValueError):
            iom.compute_pull_batches()

        # Remove all files so everything needs to be pulled
        rev_dir = os.path.join(manifest.cache_mgr.cache_root,
                               manifest.dataset_revision)
        object_dir = os.path.join(manifest.cache_mgr.cache_root, 'objects')
        shutil.rmtree(rev_dir)
        shutil.rmtree(object_dir)

        key_batches, total_bytes, num_files = iom.compute_pull_batches(
            pull_all=True)
        assert num_files == 5
        assert total_bytes == (4 * 4300000) + (14 * 4)
        assert len(key_batches) == 2
        assert len(key_batches[0]) == 4
        assert len(key_batches[1]) == 1
        assert key_batches[1][0] == 'test1.txt'
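
As the `pytest.raises(ValueError)` check shows, `compute_pull_batches` needs either an explicit key list or `pull_all=True`. A hedged usage sketch based on the call patterns in this section:

# Pull everything that is missing locally...
key_batches, total_bytes, num_files = iom.compute_pull_batches(pull_all=True)

# ...or batch only specific keys (positional `keys` argument, as used in download_dataset_files below).
key_batches, total_bytes, num_files = iom.compute_pull_batches(['test1.txt'])
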
Example #9
    def _push_dataset_objects(self, logged_in_username: str,
                              feedback_callback: Callable, access_token,
                              id_token) -> None:
        """Method to schedule a push operta

        Args:
            logged_in_username:
            feedback_callback:
            access_token:
            id_token:

        Returns:

        """
        dispatcher_obj = Dispatcher()

        try:
            self.dataset.backend.set_default_configuration(
                logged_in_username, access_token, id_token)
            m = Manifest(self.dataset, logged_in_username)
            iom = IOManager(self.dataset, m)

            obj_batches, total_bytes, num_files = iom.compute_push_batches()

            if obj_batches:
                # Schedule jobs for batches
                bg_jobs = list()
                for objs in obj_batches:
                    job_kwargs = {
                        'objs': objs,
                        'logged_in_username': logged_in_username,
                        'access_token': access_token,
                        'id_token': id_token,
                        'dataset_owner': self.dataset.namespace,
                        'dataset_name': self.dataset.name,
                        'config_file': self.dataset.client_config.config_file,
                    }
                    job_metadata = {
                        'dataset':
                        f"{logged_in_username}|{self.dataset.namespace}|{self.dataset.name}",
                        'method': 'pull_objects'
                    }

                    feedback_callback(
                        f"Preparing to upload {num_files} files. Please wait..."
                    )
                    job_key = dispatcher_obj.dispatch_task(
                        method_reference=gtmcore.dispatcher.dataset_jobs.
                        push_dataset_objects,
                        kwargs=job_kwargs,
                        metadata=job_metadata,
                        persist=True)
                    bg_jobs.append(
                        BackgroundUploadJob(dispatcher_obj, objs, job_key))
                    logger.info(
                        f"Schedule dataset object upload job for"
                        f" {logged_in_username}/{self.dataset.namespace}/{self.dataset.name} with"
                        f" {len(objs)} objects to upload")

                while sum([(x.is_complete or x.is_failed)
                           for x in bg_jobs]) != len(bg_jobs):
                    # Refresh all job statuses and update status feedback
                    [j.refresh_status() for j in bg_jobs]
                    total_completed_bytes = sum(
                        [j.completed_bytes for j in bg_jobs])
                    if total_completed_bytes > 0:
                        pc = (float(total_completed_bytes) /
                              float(total_bytes)) * 100
                        feedback_callback(
                            f"Please wait - Uploading {num_files} files ({format_size(total_completed_bytes)}"
                            f" of {format_size(total_bytes)}) - {round(pc)}% complete",
                            percent_complete=pc)
                    time.sleep(1)

                # if you get here, all jobs are done or failed.
                # Remove all the push files so they can be regenerated if needed
                for f in glob.glob(f'{iom.push_dir}/*'):
                    os.remove(f)

                # Aggregate failures if they exist
                failure_keys: List[str] = list()
                for j in bg_jobs:
                    if j.is_failed:
                        # Background job hard failed. Assume entire batch should get re-uploaded
                        for obj in j.objs:
                            failure_keys.append(
                                f"{obj.dataset_path} at {obj.revision[0:8]}")
                            m.queue_to_push(obj.object_path, obj.dataset_path,
                                            obj.revision)
                    else:
                        for obj in j.get_failed_objects():
                            # Some individual objects failed
                            failure_keys.append(
                                f"{obj.dataset_path} at {obj.revision[0:8]}")
                            m.queue_to_push(obj.object_path, obj.dataset_path,
                                            obj.revision)

                # Set final status for UI
                if len(failure_keys) == 0:
                    feedback_callback(f"Upload complete!",
                                      percent_complete=100,
                                      has_failures=False)
                else:
                    failure_str = "\n".join(failure_keys)
                    failure_detail_str = f"Files that failed to upload:\n{failure_str}"
                    feedback_callback("",
                                      percent_complete=100,
                                      has_failures=True,
                                      failure_detail=failure_detail_str)

                # Finish up by linking everything just in case
                iom.manifest.link_revision()

                if len(failure_keys) > 0:
                    # If any uploads failed, exit non-zero so the UI knows there was an error
                    raise IOError(
                        f"{len(failure_keys)} file(s) failed to upload. Check message detail for more information"
                        " and try to sync again.")
        except Exception as err:
            logger.exception(err)
            raise
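
`_push_dataset_objects` assumes a `feedback_callback` with the same keyword interface as the `update_feedback` helpers shown elsewhere in this section (a message plus optional `has_failures`, `failure_detail`, and `percent_complete`). A minimal, illustrative stand-in for local experimentation:

from typing import Optional


def feedback_callback(msg: str,
                      has_failures: Optional[bool] = None,
                      failure_detail: Optional[str] = None,
                      percent_complete: Optional[float] = None) -> None:
    """Illustrative stand-in: forward status to wherever the UI reads it (here, stdout)."""
    if percent_complete is not None:
        msg = f"{msg} [{round(percent_complete)}%]"
    if has_failures:
        msg = f"{msg} (failures: {failure_detail})"
    print(msg)
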
Example #10
    def test_pull_objects_all_partial_download(self,
                                               mock_dataset_with_manifest):
        ds, manifest, working_dir = mock_dataset_with_manifest
        iom = IOManager(ds, manifest)

        revision = manifest.dataset_revision
        os.makedirs(
            os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "other_dir/test3.txt", "1")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test1.txt", "test content 1")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test2.txt", "test content 2")
        manifest.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 3
        _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
        _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
        _, obj_id_3 = obj_to_push[2].object_path.rsplit('/', 1)
        obj1_target = obj_to_push[0].object_path
        obj2_target = obj_to_push[1].object_path
        obj3_target = obj_to_push[2].object_path

        obj1_source = os.path.join('/tmp', uuid.uuid4().hex)

        assert "test3.txt" in obj_to_push[0].dataset_path

        assert os.path.exists(obj1_target) is True
        assert os.path.exists(obj2_target) is True
        assert os.path.exists(obj3_target) is True

        # Completely remove other_dir/test3.txt object
        os.remove(
            os.path.join(manifest.cache_mgr.cache_root,
                         manifest.dataset_revision, "other_dir", "test3.txt"))
        helper_compress_file(obj1_target, obj1_source)

        # Remove link for test1.txt
        os.remove(
            os.path.join(manifest.cache_mgr.cache_root,
                         manifest.dataset_revision, "test1.txt"))

        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj2_target) is True
        assert os.path.isfile(obj3_target) is True

        with aioresponses() as mocked_responses:
            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                payload={
                    "presigned_url":
                    f"https://dummyurl.com/{obj_id_1}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_1,
                    "dataset": ds.name
                },
                status=200)

            with open(obj1_source, 'rb') as data1:
                mocked_responses.get(
                    f"https://dummyurl.com/{obj_id_1}?params=1",
                    body=data1.read(),
                    status=200,
                    content_type='application/octet-stream')

            iom.dataset.backend.set_default_configuration(
                "test-user", "abcd", '1234')

            result = iom.pull_all()
            assert len(result.success) == 1
            assert len(result.failure) == 0
            assert result.success[0].object_path == obj1_target
            assert "test3.txt" in result.success[0].dataset_path

            assert os.path.isfile(obj1_target) is True
            assert os.path.isfile(obj2_target) is True
            assert os.path.isfile(obj3_target) is True

            filename = os.path.join(manifest.cache_mgr.cache_root,
                                    manifest.dataset_revision, "other_dir",
                                    "test3.txt")
            assert os.path.isfile(filename) is True
            with open(filename, 'rt') as dd:
                assert dd.read() == "1"

            filename = os.path.join(manifest.cache_mgr.cache_root,
                                    manifest.dataset_revision, "test1.txt")
            assert os.path.isfile(filename) is True
            with open(filename, 'rt') as dd:
                assert dd.read() == "test content 1"

            filename = os.path.join(manifest.cache_mgr.cache_root,
                                    manifest.dataset_revision, "test2.txt")
            assert os.path.isfile(filename) is True
            with open(filename, 'rt') as dd:
                assert dd.read() == "test content 2"

            # Try pulling all again with nothing to pull
            result = iom.pull_all()
            assert len(result.success) == 0
            assert len(result.failure) == 0
            assert result.message == "Dataset already downloaded."
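
`helper_compress_file` is another test helper that is not shown here. From its usage it appears to snappy stream-compress the first path into the second and remove the original (the source objects are asserted missing afterwards, and later tests decompress the result with `snappy.StreamDecompressor`). A sketch under those assumptions:

import os

import snappy


def helper_compress_file(source_path: str, dest_path: str) -> None:
    """Sketch (assumed behavior): stream-compress `source_path` into `dest_path`, then remove the original."""
    compressor = snappy.StreamCompressor()
    with open(source_path, 'rb') as src, open(dest_path, 'wb') as dst:
        dst.write(compressor.compress(src.read()))
        dst.write(compressor.flush())
    os.remove(source_path)
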
Example #11
    def test_pull_objects_all(self, mock_dataset_with_manifest):
        ds, manifest, working_dir = mock_dataset_with_manifest
        iom = IOManager(ds, manifest)

        revision = manifest.dataset_revision
        os.makedirs(
            os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test1.txt", "test content 1")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test2.txt", "test content 2")
        manifest.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 2
        _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
        _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
        obj1_target = obj_to_push[0].object_path
        obj2_target = obj_to_push[1].object_path

        obj1_source = os.path.join('/tmp', uuid.uuid4().hex)
        obj2_source = os.path.join('/tmp', uuid.uuid4().hex)

        check_info = {obj1_target: obj1_source, obj2_target: obj2_source}

        assert os.path.exists(obj1_target) is True
        assert os.path.exists(obj2_target) is True

        helper_compress_file(obj1_target, obj1_source)
        helper_compress_file(obj2_target, obj2_source)

        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj2_target) is False
        assert os.path.isfile(obj1_source) is True
        assert os.path.isfile(obj2_source) is True

        # remove data from the local file cache
        os.remove(
            os.path.join(manifest.cache_mgr.cache_root,
                         manifest.dataset_revision, "test1.txt"))
        os.remove(
            os.path.join(manifest.cache_mgr.cache_root,
                         manifest.dataset_revision, "test2.txt"))
        shutil.rmtree(os.path.join(manifest.cache_mgr.cache_root, 'objects'))
        os.makedirs(os.path.join(manifest.cache_mgr.cache_root, 'objects'))

        with aioresponses() as mocked_responses:
            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                payload={
                    "presigned_url":
                    f"https://dummyurl.com/{obj_id_1}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_1,
                    "dataset": ds.name
                },
                status=200)

            with open(obj1_source, 'rb') as data1:
                mocked_responses.get(
                    f"https://dummyurl.com/{obj_id_1}?params=1",
                    body=data1.read(),
                    status=200,
                    content_type='application/octet-stream')

            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_2}',
                payload={
                    "presigned_url":
                    f"https://dummyurl.com/{obj_id_2}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_2,
                    "dataset": ds.name
                },
                status=200)

            with open(obj2_source, 'rb') as data2:
                mocked_responses.get(
                    f"https://dummyurl.com/{obj_id_2}?params=1",
                    body=data2.read(),
                    status=200,
                    content_type='application/octet-stream')

            iom.dataset.backend.set_default_configuration(
                "test-user", "abcd", '1234')

            result = iom.pull_all()
            assert len(result.success) == 2
            assert len(result.failure) == 0
            assert result.success[0].object_path != result.success[
                1].object_path
            assert result.success[0].object_path in [
                obj_to_push[0].object_path, obj_to_push[1].object_path
            ]
            assert result.success[1].object_path in [
                obj_to_push[0].object_path, obj_to_push[1].object_path
            ]

            assert os.path.isfile(obj1_target) is True
            assert os.path.isfile(obj2_target) is True

            decompressor = snappy.StreamDecompressor()
            for r in result.success:
                with open(check_info[r.object_path], 'rb') as dd:
                    source1 = decompressor.decompress(dd.read())
                    source1 += decompressor.flush()
                with open(r.object_path, 'rt') as dd:
                    dest1 = dd.read()
                assert source1.decode("utf-8") == dest1
Example #12
    def test_pull_objects(self, mock_dataset_with_manifest):
        ds, manifest, working_dir = mock_dataset_with_manifest
        iom = IOManager(ds, manifest)

        revision = manifest.dataset_revision
        os.makedirs(
            os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test1.txt", "test content 1")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test2.txt", "test content 2")
        manifest.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 2
        _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
        _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
        obj1_target = obj_to_push[0].object_path
        obj2_target = obj_to_push[1].object_path

        obj1_source = os.path.join('/tmp', uuid.uuid4().hex)
        obj2_source = os.path.join('/tmp', uuid.uuid4().hex)

        assert os.path.exists(obj1_target) is True
        assert os.path.exists(obj2_target) is True

        helper_compress_file(obj1_target, obj1_source)
        helper_compress_file(obj2_target, obj2_source)

        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj2_target) is False
        assert os.path.isfile(obj1_source) is True
        assert os.path.isfile(obj2_source) is True

        with aioresponses() as mocked_responses:
            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                payload={
                    "presigned_url":
                    f"https://dummyurl.com/{obj_id_1}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_1,
                    "dataset": ds.name
                },
                status=200)

            with open(obj1_source, 'rb') as data1:
                mocked_responses.get(
                    f"https://dummyurl.com/{obj_id_1}?params=1",
                    body=data1.read(),
                    status=200,
                    content_type='application/octet-stream')

            mocked_responses.get(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_2}',
                payload={
                    "presigned_url":
                    f"https://dummyurl.com/{obj_id_2}?params=1",
                    "namespace": ds.namespace,
                    "obj_id": obj_id_2,
                    "dataset": ds.name
                },
                status=200)

            with open(obj2_source, 'rb') as data2:
                mocked_responses.get(
                    f"https://dummyurl.com/{obj_id_2}?params=1",
                    body=data2.read(),
                    status=200,
                    content_type='application/octet-stream')

            assert len(glob.glob(f'{iom.push_dir}/*')) == 1
            iom.dataset.backend.set_default_configuration(
                "test-user", "abcd", '1234')

            result = iom.pull_objects(keys=["test1.txt"])
            assert len(glob.glob(f'{iom.push_dir}/*')) == 1
            assert len(result.success) == 1
            assert len(result.failure) == 0
            assert result.success[0].object_path == obj_to_push[0].object_path

            assert os.path.isfile(obj1_target) is True
            assert os.path.isfile(obj2_target) is False
            with open(obj1_target, 'rt') as dd:
                assert "test content 1" == dd.read()

            result = iom.pull_objects(keys=["test2.txt"])
            assert len(glob.glob(f'{iom.push_dir}/*')) == 1
            assert len(result.success) == 1
            assert len(result.failure) == 0
            assert result.success[0].object_path == obj_to_push[1].object_path

            assert os.path.isfile(obj1_target) is True
            assert os.path.isfile(obj2_target) is True
            with open(obj1_target, 'rt') as dd:
                assert "test content 1" == dd.read()
            with open(obj2_target, 'rt') as dd:
                assert "test content 2" == dd.read()
Example #13
    def test__get_pull_all_keys(self, mock_dataset_with_manifest):
        ds, manifest, working_dir = mock_dataset_with_manifest
        iom = IOManager(ds, manifest)

        revision = manifest.dataset_revision
        os.makedirs(
            os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "other_dir/test3.txt", "dummy content")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test1.txt", "test content 1")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test2.txt", "test content 2")
        manifest.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 3
        obj3 = obj_to_push[0].object_path
        obj1 = obj_to_push[1].object_path
        obj2 = obj_to_push[2].object_path

        rev_dir = os.path.join(manifest.cache_mgr.cache_root,
                               manifest.dataset_revision)
        file3 = os.path.join(rev_dir, obj_to_push[0].dataset_path)
        file1 = os.path.join(rev_dir, obj_to_push[1].dataset_path)
        file2 = os.path.join(rev_dir, obj_to_push[2].dataset_path)

        assert os.path.exists(obj1) is True
        assert os.path.exists(obj2) is True
        assert os.path.exists(obj3) is True
        assert os.path.exists(file1) is True
        assert os.path.exists(file2) is True
        assert os.path.exists(file3) is True

        result = iom._get_pull_all_keys()
        assert len(result) == 0

        # Completely remove other_dir/test3.txt object
        os.remove(obj3)
        os.remove(file3)

        # Remove link for test1.txt, should relink automatically and not need to be pulled
        os.remove(file1)

        assert os.path.exists(obj1) is True
        assert os.path.exists(obj2) is True
        assert os.path.exists(obj3) is False
        assert os.path.exists(file1) is False
        assert os.path.exists(file2) is True
        assert os.path.exists(file3) is False

        result = iom._get_pull_all_keys()
        assert len(result) == 1
        assert result[0] == 'other_dir/test3.txt'

        assert os.path.exists(obj1) is True
        assert os.path.exists(obj2) is True
        assert os.path.exists(obj3) is False
        assert os.path.exists(file1) is True
        assert os.path.exists(file2) is True
        assert os.path.exists(file3) is False
Example #14
 def test_init(self, mock_dataset_with_manifest):
     ds, manifest, working_dir = mock_dataset_with_manifest
     iom = IOManager(ds, manifest)
     assert isinstance(iom, IOManager)
     assert isinstance(iom.push_dir, str)
Example #15
def download_dataset_files(logged_in_username: str,
                           access_token: str,
                           id_token: str,
                           dataset_owner: str,
                           dataset_name: str,
                           labbook_owner: Optional[str] = None,
                           labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False,
                           keys: Optional[List[str]] = None,
                           config_file: str = None) -> None:
    """Method to download files from a dataset in the background and provide status to the UI.

    This job schedules `pull_objects` jobs after splitting up the download work into batches. At the end, the job
    removes any partially downloaded files (due to failures) and links all the files for the dataset.

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List of file keys to download
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    dispatcher_obj = Dispatcher()

    def update_feedback(msg: str,
                        has_failures: Optional[bool] = None,
                        failure_detail: Optional[str] = None,
                        percent_complete: Optional[float] = None) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if has_failures:
            current_job.meta['has_failures'] = has_failures
        if failure_detail:
            current_job.meta['failure_detail'] = failure_detail
        if percent_complete:
            current_job.meta['percent_complete'] = percent_complete

        current_job.meta['feedback'] = msg
        current_job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
            f" dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
            f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner,
                                 labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner,
                                 dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        key_batches, total_bytes, num_files = iom.compute_pull_batches(
            keys, pull_all=all_keys)

        failure_keys = list()
        if key_batches:
            # Schedule jobs for batches
            bg_jobs = list()
            for keys in key_batches:
                job_kwargs = {
                    'keys': keys,
                    'logged_in_username': logged_in_username,
                    'access_token': access_token,
                    'id_token': id_token,
                    'dataset_owner': dataset_owner,
                    'dataset_name': dataset_name,
                    'labbook_owner': labbook_owner,
                    'labbook_name': labbook_name,
                    'config_file': config_file,
                }
                job_metadata = {
                    'dataset':
                    f"{logged_in_username}|{dataset_owner}|{dataset_name}",
                    'method': 'pull_objects'
                }

                job_key = dispatcher_obj.dispatch_task(
                    method_reference=pull_objects,
                    kwargs=job_kwargs,
                    metadata=job_metadata,
                    persist=True)
                bg_jobs.append(
                    BackgroundDownloadJob(dispatcher_obj, keys, job_key))

            update_feedback(
                f"Please wait - Downloading {num_files} files ({format_size(total_bytes)}) - 0% complete",
                percent_complete=0,
                has_failures=False)
            logger.info(
                f"(Job {p}) Starting file downloads for"
                f" {logged_in_username}/{dataset_owner}/{dataset_name} with {len(key_batches)} jobs"
            )

            while sum([(x.is_complete or x.is_failed)
                       for x in bg_jobs]) != len(bg_jobs):
                # Refresh all job statuses and update status feedback
                [j.refresh_status() for j in bg_jobs]
                total_completed_bytes = sum(
                    [j.completed_bytes for j in bg_jobs])
                pc = (float(total_completed_bytes) / float(total_bytes)) * 100
                update_feedback(
                    f"Please wait - Downloading {num_files} files ({format_size(total_completed_bytes)} of "
                    f"{format_size(total_bytes)}) - {round(pc)}% complete",
                    percent_complete=pc)
                time.sleep(1)

            # Aggregate failures if they exist
            for j in bg_jobs:
                if j.is_failed:
                    # Whole job failed; assume the entire batch should get re-downloaded for now
                    failure_keys.extend(j.keys)
                else:
                    failure_keys.extend(j.get_failed_keys())

        # Set final status for UI
        if len(failure_keys) == 0:
            update_feedback(f"Download complete!",
                            percent_complete=100,
                            has_failures=False)
        else:
            failure_str = ""
            for f in failure_keys:
                # If any failed files partially downloaded, remove them.
                abs_dataset_path = os.path.join(m.current_revision_dir, f)
                abs_object_path = m.dataset_to_object_path(f)
                if os.path.exists(abs_dataset_path):
                    os.remove(abs_dataset_path)
                if os.path.exists(abs_object_path):
                    os.remove(abs_object_path)
                failure_str = f"{failure_str}{f}\n"

            failure_detail_str = f"Files that failed to download:\n{failure_str}"
            update_feedback("",
                            has_failures=True,
                            failure_detail=failure_detail_str)

        # Link dataset files, so anything that was successfully pulled will materialize
        m.link_revision()

        if len(failure_keys) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            raise IOError(
                f"{len(failure_keys)} file(s) failed to download. Check message detail and try again."
            )

    except Exception as err:
        logger.exception(err)
        raise
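
In production, `download_dataset_files` would itself typically be scheduled through the `Dispatcher` rather than called directly. A hedged usage sketch, with kwargs mirroring the `dl_kwargs` dictionaries used in the tests later in this section (the `Dispatcher` import path and the metadata values are assumptions based on the references in these examples):

from gtmcore.dispatcher import Dispatcher
from gtmcore.dispatcher.dataset_jobs import download_dataset_files

dispatcher = Dispatcher()
job_key = dispatcher.dispatch_task(
    method_reference=download_dataset_files,
    kwargs={
        'logged_in_username': 'default',
        'access_token': 'asdf',
        'id_token': '1234',
        'dataset_owner': 'default',
        'dataset_name': 'dataset100',
        'labbook_owner': None,
        'labbook_name': None,
        'keys': ['test1.txt'],
        'config_file': None,
    },
    metadata={'dataset': 'default|default|dataset100',
              'method': 'download_dataset_files'},  # metadata values assumed
    persist=True)
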
Example #16
def pull_objects(keys: List[str],
                 logged_in_username: str,
                 access_token: str,
                 id_token: str,
                 dataset_owner: str,
                 dataset_name: str,
                 labbook_owner: Optional[str] = None,
                 labbook_name: Optional[str] = None,
                 config_file: str = None) -> None:
    """Method to pull a collection of objects from a dataset's backend.

    This runs IOManager.pull_objects() with `link_revision=False` because this job can be run in parallel
    multiple times with different sets of keys. Linking is deferred until the very end and is handled by the
    `download_dataset_files` job that scheduled this one.

    Args:
        keys: List of file keys to download
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        config_file: config file (used for test mocking)

    Returns:
        None
    """
    logger = LMLogger.get_logger()

    def progress_update_callback(completed_bytes: int) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if 'completed_bytes' not in current_job.meta:
            current_job.meta['completed_bytes'] = 0

        current_job.meta['completed_bytes'] = int(
            current_job.meta['completed_bytes']) + completed_bytes
        current_job.save_meta()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting pull_objects(logged_in_username={logged_in_username},"
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
            f" labbook_name={labbook_name}")

        im = InventoryManager(config_file=config_file)

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner,
                                 labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets',
                                       dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner,
                                 dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        result = iom.pull_objects(keys=keys,
                                  progress_update_fn=progress_update_callback,
                                  link_revision=False)

        job = get_current_job()
        if job:
            job.meta['failure_keys'] = ",".join(
                [x.dataset_path for x in result.failure])
            job.meta['message'] = result.message
            job.save_meta()

    except Exception as err:
        logger.exception(err)
        raise
Example #17
    def test_push_objects_deduplicate(self, mock_dataset_with_manifest,
                                      mock_dataset_head):
        ds, manifest, working_dir = mock_dataset_with_manifest
        iom = IOManager(ds, manifest)

        revision = manifest.dataset_revision
        os.makedirs(
            os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test1.txt", "test content 1")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test2.txt", "test content dup")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test3.txt", "test content dup")
        manifest.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 3
        _, obj1 = obj_to_push[0].object_path.rsplit('/', 1)
        _, obj2 = obj_to_push[1].object_path.rsplit('/', 1)
        _, obj3 = obj_to_push[2].object_path.rsplit('/', 1)
        assert obj1 != obj2
        assert obj2 == obj3

        with aioresponses() as mocked_responses:
            mocked_responses.put(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj1}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj1}?params=1",
                    "namespace": ds.namespace,
                    "key_id": "hghghg",
                    "obj_id": obj1,
                    "dataset": ds.name
                },
                status=200)
            mocked_responses.put(f"https://dummyurl.com/{obj1}?params=1",
                                 headers={'Etag': 'asdfasdf'},
                                 status=200)

            mocked_responses.put(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj2}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj2}?params=1",
                    "namespace": ds.namespace,
                    "key_id": "hghghg",
                    "obj_id": obj2,
                    "dataset": ds.name
                },
                status=200)
            mocked_responses.put(f"https://dummyurl.com/{obj2}?params=1",
                                 headers={'Etag': '12341234'},
                                 status=200)

            assert len(glob.glob(f'{iom.push_dir}/*')) == 1
            iom.dataset.backend.set_default_configuration(
                "test-user", "abcd", '1234')

            obj_to_push = iom.objects_to_push(remove_duplicates=True)
            result = iom.push_objects(obj_to_push, chunk_update_callback)
            assert len(glob.glob(f'{iom.push_dir}/*')) == 1

            assert len(result.success) == 2
            assert len(result.failure) == 0
            assert isinstance(result, PushResult) is True
            assert isinstance(result.success[0], PushObject) is True
            assert result.success[0].object_path != result.success[
                1].object_path
            assert result.success[0].object_path in [
                obj_to_push[0].object_path, obj_to_push[1].object_path
            ]
            assert result.success[1].object_path in [
                obj_to_push[0].object_path, obj_to_push[1].object_path
            ]
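
`chunk_update_callback` in the test above is defined elsewhere in the test module. Judging by the `progress_update_callback` used with `pull_objects` in this section, it is assumed to be a simple byte-count callback, e.g.:

def chunk_update_callback(completed_bytes: int) -> None:
    """Illustrative stand-in (assumed signature): record bytes transferred as each chunk completes."""
    print(f"{completed_bytes} bytes transferred")
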
Example #18
    def test_download_dataset_files_file_fail(
            self, mock_config_file_background_tests):
        def dispatch_query_mock(self, job_key):
            # mock the job actually running and returning status
            JobStatus = namedtuple("JobStatus", ['status', 'meta'])
            return JobStatus(status='finished',
                             meta={
                                 'completed_bytes': '0',
                                 'failure_keys': 'test1.txt'
                             })

        def dispatch_mock(self, method_reference, kwargs, metadata, persist):
            gtmcore.dispatcher.dataset_jobs.pull_objects(**kwargs)
            return "afakejobkey"

        im = InventoryManager(mock_config_file_background_tests[0])
        ds = im.create_dataset('default',
                               'default',
                               "dataset100",
                               storage_type="gigantum_object_v1",
                               description="100")
        m = Manifest(ds, 'default')
        iom = IOManager(ds, m)

        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test1.txt", "asdfadfsdf")
        m.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 1
        _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
        obj1_target = obj_to_push[0].object_path

        obj1_source = os.path.join('/tmp', uuid.uuid4().hex)

        assert os.path.exists(obj1_target) is True
        helper_compress_file(obj1_target, obj1_source)
        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj1_source) is True

        # Clear out from linked dir
        os.remove(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test1.txt'))

        with patch.object(Configuration, 'find_default_config',
                          lambda self: mock_config_file_background_tests[0]):
            with patch.object(Dispatcher, 'dispatch_task', dispatch_mock):
                with patch.object(Dispatcher, 'query_task',
                                  dispatch_query_mock):
                    dl_kwargs = {
                        'logged_in_username': "******",
                        'access_token': "asdf",
                        'id_token': "1234",
                        'dataset_owner': "default",
                        'dataset_name': "dataset100",
                        'labbook_owner': None,
                        'labbook_name': None,
                        'keys': ["test1.txt"],
                        'config_file': mock_config_file_background_tests[0]
                    }

                    with pytest.raises(IOError):
                        gtmcore.dispatcher.dataset_jobs.download_dataset_files(
                            **dl_kwargs)
                    assert os.path.isfile(obj1_target) is False
Example #19
    def test_download_dataset_files(self, mock_config_file_background_tests,
                                    mock_dataset_head):
        def dispatch_query_mock(self, job_key):
            JobStatus = namedtuple("JobStatus", ['status', 'meta'])
            return JobStatus(status='finished',
                             meta={'completed_bytes': '500'})

        def dispatch_mock(self, method_reference, kwargs, metadata, persist):
            with aioresponses() as mocked_responses:
                mocked_responses.get(
                    f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                    payload={
                        "presigned_url":
                        f"https://dummyurl.com/{obj_id_1}?params=1",
                        "namespace": ds.namespace,
                        "obj_id": obj_id_1,
                        "dataset": ds.name
                    },
                    status=200)

                with open(obj1_source, 'rb') as data1:
                    mocked_responses.get(
                        f"https://dummyurl.com/{obj_id_1}?params=1",
                        body=data1.read(),
                        status=200,
                        content_type='application/octet-stream')
                gtmcore.dispatcher.dataset_jobs.pull_objects(**kwargs)

                return "afakejobkey"

        im = InventoryManager(mock_config_file_background_tests[0])
        ds = im.create_dataset('default',
                               'default',
                               "dataset100",
                               storage_type="gigantum_object_v1",
                               description="100")
        m = Manifest(ds, 'default')
        iom = IOManager(ds, m)

        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test1.txt", "asdfadfsdf")
        m.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 1
        _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
        obj1_target = obj_to_push[0].object_path

        obj1_source = os.path.join('/tmp', uuid.uuid4().hex)

        assert os.path.exists(obj1_target) is True
        helper_compress_file(obj1_target, obj1_source)
        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj1_source) is True

        # Clear out from linked dir
        os.remove(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test1.txt'))

        with patch.object(Configuration, 'find_default_config',
                          lambda self: mock_config_file_background_tests[0]):
            with patch.object(Dispatcher, 'dispatch_task', dispatch_mock):
                with patch.object(Dispatcher, 'query_task',
                                  dispatch_query_mock):
                    dl_kwargs = {
                        'logged_in_username': "******",
                        'access_token': "asdf",
                        'id_token': "1234",
                        'dataset_owner': "default",
                        'dataset_name': "dataset100",
                        'labbook_owner': None,
                        'labbook_name': None,
                        'keys': ["test1.txt"],
                        'config_file': mock_config_file_background_tests[0]
                    }

                    gtmcore.dispatcher.dataset_jobs.download_dataset_files(
                        **dl_kwargs)
                    assert os.path.isfile(obj1_target) is True

                    decompressor = snappy.StreamDecompressor()
                    with open(obj1_source, 'rb') as dd:
                        source1 = decompressor.decompress(dd.read())
                        source1 += decompressor.flush()
                    with open(obj1_target, 'rt') as dd:
                        dest1 = dd.read()
                    assert source1.decode("utf-8") == dest1
Example No. 20
0
    def test_pull_objects(self, mock_config_file, mock_dataset_head):
        im = InventoryManager(mock_config_file[0])
        ds = im.create_dataset('default',
                               'default',
                               "dataset100",
                               storage_type="gigantum_object_v1",
                               description="100")
        m = Manifest(ds, 'default')
        iom = IOManager(ds, m)

        os.makedirs(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         "other_dir"))
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test1.txt", "asdfadfsdf")
        helper_append_file(m.cache_mgr.cache_root, m.dataset_revision,
                           "test2.txt", "fdsfgfd")
        m.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 2
        _, obj_id_1 = obj_to_push[0].object_path.rsplit('/', 1)
        _, obj_id_2 = obj_to_push[1].object_path.rsplit('/', 1)
        obj1_target = obj_to_push[0].object_path
        obj2_target = obj_to_push[1].object_path

        obj1_source = os.path.join('/tmp', uuid.uuid4().hex)
        obj2_source = os.path.join('/tmp', uuid.uuid4().hex)

        assert os.path.exists(obj1_target) is True
        assert os.path.exists(obj2_target) is True
        helper_compress_file(obj1_target, obj1_source)
        helper_compress_file(obj2_target, obj2_source)
        assert os.path.isfile(obj1_target) is False
        assert os.path.isfile(obj2_target) is False
        assert os.path.isfile(obj1_source) is True
        assert os.path.isfile(obj2_source) is True

        # Clear out from linked dir
        os.remove(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test1.txt'))
        os.remove(
            os.path.join(m.cache_mgr.cache_root, m.dataset_revision,
                         'test2.txt'))

        with patch.object(Configuration, 'find_default_config',
                          lambda self: mock_config_file[0]):
            with aioresponses() as mocked_responses:
                mocked_responses.get(
                    f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_1}',
                    payload={
                        "presigned_url":
                        f"https://dummyurl.com/{obj_id_1}?params=1",
                        "namespace": ds.namespace,
                        "obj_id": obj_id_1,
                        "dataset": ds.name
                    },
                    status=200)

                with open(obj1_source, 'rb') as data1:
                    mocked_responses.get(
                        f"https://dummyurl.com/{obj_id_1}?params=1",
                        body=data1.read(),
                        status=200,
                        content_type='application/octet-stream')

                mocked_responses.get(
                    f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj_id_2}',
                    payload={
                        "presigned_url":
                        f"https://dummyurl.com/{obj_id_2}?params=1",
                        "namespace": ds.namespace,
                        "obj_id": obj_id_2,
                        "dataset": ds.name
                    },
                    status=200)

                with open(obj2_source, 'rb') as data2:
                    mocked_responses.get(
                        f"https://dummyurl.com/{obj_id_2}?params=1",
                        body=data2.read(),
                        status=200,
                        content_type='application/octet-stream')

                dl_kwargs = {
                    'logged_in_username': "******",
                    'access_token': "asdf",
                    'id_token': "1234",
                    'dataset_owner': "default",
                    'dataset_name': "dataset100",
                    'labbook_owner': None,
                    'labbook_name': None,
                    'keys': ["test1.txt"]
                }

                gtmcore.dispatcher.dataset_jobs.pull_objects(**dl_kwargs)

                # Manually link since this is disabled by default in the job (because in real use, multiple jobs run
                # in parallel and you only want to link once).
                m.link_revision()

                assert os.path.isfile(obj1_target) is True
                assert os.path.isfile(obj2_target) is False

                decompressor = snappy.StreamDecompressor()
                with open(obj1_source, 'rb') as dd:
                    source1 = decompressor.decompress(dd.read())
                    source1 += decompressor.flush()
                with open(obj1_target, 'rt') as dd:
                    dest1 = dd.read()
                assert source1.decode("utf-8") == dest1

                # Download other file
                dl_kwargs = {
                    'logged_in_username': "******",
                    'access_token': "asdf",
                    'id_token': "1234",
                    'dataset_owner': "default",
                    'dataset_name': "dataset100",
                    'labbook_owner': None,
                    'labbook_name': None,
                    'keys': ["test2.txt"]
                }

                gtmcore.dispatcher.dataset_jobs.pull_objects(**dl_kwargs)

                # Manually link since this is disabled by default in the job (because in real use, multiple jobs run
                # in parallel and you only want to link once).
                m.link_revision()

                assert os.path.isfile(obj1_target) is True
                assert os.path.isfile(obj2_target) is True

                with open(obj1_source, 'rb') as dd:
                    source1 = decompressor.decompress(dd.read())
                    source1 += decompressor.flush()
                with open(obj1_target, 'rt') as dd:
                    dest1 = dd.read()
                assert source1.decode("utf-8") == dest1

                with open(obj2_source, 'rb') as dd:
                    source2 = decompressor.decompress(dd.read())
                    source2 += decompressor.flush()
                with open(obj2_target, 'rt') as dd:
                    dest2 = dd.read()
                assert source2.decode("utf-8") == dest2
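
The mock setup in this test registers two GET responses per object: the object service endpoint, which returns a presigned URL, and the presigned URL itself, which serves the snappy-compressed bytes. A minimal sketch of a hypothetical helper that bundles that pair for reuse across tests (the name register_object_mock and its parameters are illustrative, not part of gtmcore):

def register_object_mock(mocked_responses, namespace, dataset, obj_id, compressed_path):
    """Hypothetical helper: register the metadata GET and the presigned-URL GET
    for one object, mirroring the pattern used in the test above."""
    presigned_url = f"https://dummyurl.com/{obj_id}?params=1"

    # Object service responds with the presigned URL for this object
    mocked_responses.get(
        f'https://api.gigantum.com/object-v1/{namespace}/{dataset}/{obj_id}',
        payload={"presigned_url": presigned_url,
                 "namespace": namespace,
                 "obj_id": obj_id,
                 "dataset": dataset},
        status=200)

    # The presigned URL serves the compressed object bytes
    with open(compressed_path, 'rb') as data:
        mocked_responses.get(presigned_url,
                             body=data.read(),
                             status=200,
                             content_type='application/octet-stream')

Inside the with aioresponses() as mocked_responses: block above, each pair of registrations would then collapse to a single register_object_mock(mocked_responses, ds.namespace, ds.name, obj_id_1, obj1_source) call.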
Example No. 21
0
    def test_push_objects(self, mock_config_file, mock_dataset_head):
        im = InventoryManager(mock_config_file[0])
        ds = im.create_dataset('default',
                               'default',
                               "dataset100",
                               storage_type="gigantum_object_v1",
                               description="100")
        manifest = Manifest(ds, 'default')
        iom = IOManager(ds, manifest)

        revision = manifest.dataset_revision
        os.makedirs(
            os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test1.txt", "test content 1")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test2.txt", "test content 2")
        manifest.sweep_all_changes()

        obj_to_push = iom.objects_to_push()
        assert len(obj_to_push) == 2
        _, obj1 = obj_to_push[0].object_path.rsplit('/', 1)
        _, obj2 = obj_to_push[1].object_path.rsplit('/', 1)

        with aioresponses() as mocked_responses:
            mocked_responses.put(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj1}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj1}?params=1",
                    "namespace": ds.namespace,
                    "key_id": "hghghg",
                    "obj_id": obj1,
                    "dataset": ds.name
                },
                status=200)
            mocked_responses.put(f"https://dummyurl.com/{obj1}?params=1",
                                 payload={},
                                 status=200)

            mocked_responses.put(
                f'https://api.gigantum.com/object-v1/{ds.namespace}/{ds.name}/{obj2}',
                payload={
                    "presigned_url": f"https://dummyurl.com/{obj2}?params=1",
                    "namespace": ds.namespace,
                    "key_id": "hghghg",
                    "obj_id": obj2,
                    "dataset": ds.name
                },
                status=200)
            mocked_responses.put(f"https://dummyurl.com/{obj2}?params=1",
                                 payload={},
                                 status=200)

            job_kwargs = {
                'objs': obj_to_push,
                'logged_in_username': "******",
                'access_token': "faketoken",
                'id_token': "faketoken",
                'dataset_owner': ds.namespace,
                'dataset_name': ds.name,
                'config_file': ds.client_config.config_file,
            }
            gtmcore.dispatcher.dataset_jobs.push_dataset_objects(**job_kwargs)
Example No. 22
0
def push_dataset_objects(objs: List[PushObject],
                         logged_in_username: str,
                         access_token: str,
                         id_token: str,
                         dataset_owner: str,
                         dataset_name: str,
                         config_file: str = None) -> None:
    """Method to pull a collection of objects from a dataset's backend

    Args:
        objs: List if file PushObject to push
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        config_file: config file (used for test mocking)

    Returns:
        str: directory path of imported labbook
    """
    logger = LMLogger.get_logger()

    def progress_update_callback(completed_bytes: int) -> None:
        """Method to update the job's metadata and provide feedback to the UI"""
        current_job = get_current_job()
        if not current_job:
            return
        if 'completed_bytes' not in current_job.meta:
            current_job.meta['completed_bytes'] = 0

        current_job.meta['completed_bytes'] = int(
            current_job.meta['completed_bytes']) + completed_bytes
        current_job.save_meta()

    try:
        p = os.getpid()
        logger.info(
            f"(Job {p}) Starting push_dataset_objects(logged_in_username={logged_in_username}, "
            f"dataset_owner={dataset_owner}, dataset_name={dataset_name})")

        im = InventoryManager(config_file=config_file)
        ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token,
                                             id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        result = iom.push_objects(objs,
                                  progress_update_fn=progress_update_callback)

        job = get_current_job()
        if job:
            job.meta['failures'] = ",".join([
                f"{x.object_path}|{x.dataset_path}|{x.revision}"
                for x in result.failure
            ])
            job.meta['message'] = result.message
            job.save_meta()

    except Exception as err:
        logger.exception(err)
        raise
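
push_dataset_objects reports per-object failures through the rq job metadata: each entry in result.failure is serialized into job.meta['failures'] as a comma-separated list of object_path|dataset_path|revision records. A minimal sketch, assuming the caller already holds the finished job's meta, of turning that string back into tuples (the helper name and sample values are illustrative):

def parse_push_failures(failures_meta):
    """Split the comma-separated 'object_path|dataset_path|revision' records
    written by push_dataset_objects into (object_path, dataset_path, revision) tuples."""
    if not failures_meta:
        return []
    return [tuple(entry.split('|')) for entry in failures_meta.split(',')]


# e.g. parse_push_failures("objects/ab12|test1.txt|abcdef") would return
# [('objects/ab12', 'test1.txt', 'abcdef')]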
Example No. 23
0
def download_dataset_files(logged_in_username: str, access_token: str, id_token: str,
                           dataset_owner: str, dataset_name: str,
                           labbook_owner: Optional[str] = None, labbook_name: Optional[str] = None,
                           all_keys: Optional[bool] = False, keys: Optional[List[str]] = None):
    """Method to import a dataset from a zip file

    Args:
        logged_in_username: username for the currently logged in user
        access_token: bearer token
        id_token: identity token
        dataset_owner: Owner of the dataset containing the files to download
        dataset_name: Name of the dataset containing the files to download
        labbook_owner: Owner of the labbook if this dataset is linked
        labbook_name: Name of the labbook if this dataset is linked
        all_keys: Boolean indicating if all remaining files should be downloaded
        keys: List if file keys to download

    Returns:
        str: directory path of imported labbook
    """
    def update_meta(msg):
        job = get_current_job()
        if not job:
            return
        if 'feedback' not in job.meta:
            job.meta['feedback'] = msg
        else:
            job.meta['feedback'] = job.meta['feedback'] + f'\n{msg}'
        job.save_meta()

    logger = LMLogger.get_logger()

    try:
        p = os.getpid()
        logger.info(f"(Job {p}) Starting download_dataset_files(logged_in_username={logged_in_username},"
                    f"dataset_owner={dataset_owner}, dataset_name={dataset_name}, labbook_owner={labbook_owner},"
                    f" labbook_name={labbook_name}, all_keys={all_keys}, keys={keys}")

        im = InventoryManager()

        if labbook_owner is not None and labbook_name is not None:
            # This is a linked dataset, load repo from the Project
            lb = im.load_labbook(logged_in_username, labbook_owner, labbook_name)
            dataset_dir = os.path.join(lb.root_dir, '.gigantum', 'datasets', dataset_owner, dataset_name)
            ds = im.load_dataset_from_directory(dataset_dir)
        else:
            # this is a normal dataset. Load repo from working dir
            ds = im.load_dataset(logged_in_username, dataset_owner, dataset_name)

        ds.namespace = dataset_owner
        ds.backend.set_default_configuration(logged_in_username, access_token, id_token)
        m = Manifest(ds, logged_in_username)
        iom = IOManager(ds, m)

        if all_keys:
            result = iom.pull_all(status_update_fn=update_meta)
        elif keys:
            result = iom.pull_objects(keys=keys, status_update_fn=update_meta)
        else:
            raise ValueError("Must provide a list of keys or set all_keys=True")

        # Save the successful and failed file keys to the job metadata so the UI can re-fetch as needed
        job = get_current_job()
        if job:
            job.meta['success_keys'] = [x.dataset_path for x in result.success]
            job.meta['failure_keys'] = [x.dataset_path for x in result.failure]
            job.save_meta()

        if len(result.failure) > 0:
            # If any downloads failed, exit non-zero so the UI knows there was an error
            sys.exit(-1)

    except Exception as err:
        logger.exception(err)
        raise
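
download_dataset_files requires either all_keys=True or an explicit keys list (it raises ValueError otherwise), records per-file outcomes in job.meta['success_keys'] and job.meta['failure_keys'], and exits non-zero if any download failed so the wrapping job is marked as errored. A minimal sketch of calling it directly with illustrative credential and name values (in normal operation it runs as a dispatched background job):

from gtmcore.dispatcher.dataset_jobs import download_dataset_files

# Illustrative values only; tokens and owner/dataset names depend on the deployment.
download_dataset_files(logged_in_username="default",
                       access_token="<access token>",
                       id_token="<id token>",
                       dataset_owner="default",
                       dataset_name="dataset100",
                       keys=["test1.txt"])  # or all_keys=True to fetch everything remaining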