    def test_compute_push_batches(self, mock_dataset_with_manifest_bg_tests):
        """Test compute push batches, verifying it works OK when you've deleted some files"""
        ds, manifest, working_dir = mock_dataset_with_manifest_bg_tests
        iom = IOManager(ds, manifest)

        revision = manifest.dataset_revision
        os.makedirs(
            os.path.join(manifest.cache_mgr.cache_root, revision, "other_dir"))
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "other_dir/test3.txt", "test content 3")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test1.txt", "test" * 4300000)
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test2.txt", "test content 2")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test4.txt", "test content 4")
        helper_append_file(manifest.cache_mgr.cache_root, revision,
                           "test5.txt", "test content 5")
        manifest.sweep_all_changes()

        # The manifest should now track the five files plus the new directory entry
        assert len(manifest.manifest) == 6

        # remove a file from the manifest
        manifest.delete(['test5.txt'])
        assert len(manifest.manifest) == 5

        key_batches, total_bytes, num_files = iom.compute_push_batches()
        assert num_files == 5
        assert total_bytes == (4 * 4300000) + (14 * 4)
        assert len(key_batches) == 2
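        # The four small files share the first batch; the large ~17 MB file gets a batch of its own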
        assert len(key_batches[0]) == 4
        assert len(key_batches[1]) == 1
        assert key_batches[1][0].dataset_path == 'test1.txt'
Example #2
    def _push_dataset_objects(self, logged_in_username: str,
                              feedback_callback: Callable, access_token: str,
                              id_token: str) -> None:
        """Method to schedule a push operta

        Args:
            logged_in_username: username of the currently logged in user
            feedback_callback: callable used to relay progress and status messages to the UI
            access_token: access token used to configure the dataset backend
            id_token: ID token used to configure the dataset backend

        Returns:
            None
        """
        dispatcher_obj = Dispatcher()

        try:
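            # Configure the backend with the current user's credentials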
            self.dataset.backend.set_default_configuration(
                logged_in_username, access_token, id_token)
            m = Manifest(self.dataset, logged_in_username)
            iom = IOManager(self.dataset, m)

            # Compute the batches of objects that need to be pushed, along with totals for progress reporting
            obj_batches, total_bytes, num_files = iom.compute_push_batches()

            if obj_batches:
                # Schedule jobs for batches
                bg_jobs = list()
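                # Track one background upload job per dispatched batch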
                for objs in obj_batches:
                    job_kwargs = {
                        'objs': objs,
                        'logged_in_username': logged_in_username,
                        'access_token': access_token,
                        'id_token': id_token,
                        'dataset_owner': self.dataset.namespace,
                        'dataset_name': self.dataset.name,
                        'config_file': self.dataset.client_config.config_file,
                    }
                    job_metadata = {
                        'dataset': f"{logged_in_username}|{self.dataset.namespace}|{self.dataset.name}",
                        'method': 'push_dataset_objects'
                    }

                    feedback_callback(
                        f"Preparing to upload {num_files} files. Please wait..."
                    )
                    job_key = dispatcher_obj.dispatch_task(
                        method_reference=gtmcore.dispatcher.dataset_jobs.push_dataset_objects,
                        kwargs=job_kwargs,
                        metadata=job_metadata,
                        persist=True)
                    bg_jobs.append(
                        BackgroundUploadJob(dispatcher_obj, objs, job_key))
                    logger.info(
                        f"Scheduled dataset object upload job for"
                        f" {logged_in_username}/{self.dataset.namespace}/{self.dataset.name} with"
                        f" {len(objs)} objects to upload")

                while not all(j.is_complete or j.is_failed for j in bg_jobs):
                    # Refresh all job statuses and update status feedback
                    for j in bg_jobs:
                        j.refresh_status()
                    total_completed_bytes = sum(
                        [j.completed_bytes for j in bg_jobs])
                    if total_completed_bytes > 0:
                        pc = (float(total_completed_bytes) /
                              float(total_bytes)) * 100
                        feedback_callback(
                            f"Please wait - Uploading {num_files} files ({format_size(total_completed_bytes)}"
                            f" of {format_size(total_bytes)}) - {round(pc)}% complete",
                            percent_complete=pc)
                    time.sleep(1)

                # At this point, all jobs have either completed or failed.
                # Remove all the push files so they can be regenerated if needed
                for f in glob.glob(f'{iom.push_dir}/*'):
                    os.remove(f)

                # Aggregate failures if they exist
                failure_keys: List[str] = list()
                for j in bg_jobs:
                    if j.is_failed:
                        # Background job hard failed. Assume entire batch should get re-uploaded
                        for obj in j.objs:
                            failure_keys.append(
                                f"{obj.dataset_path} at {obj.revision[0:8]}")
                            m.queue_to_push(obj.object_path, obj.dataset_path,
                                            obj.revision)
                    else:
                        for obj in j.get_failed_objects():
                            # Some individual objects failed
                            failure_keys.append(
                                f"{obj.dataset_path} at {obj.revision[0:8]}")
                            m.queue_to_push(obj.object_path, obj.dataset_path,
                                            obj.revision)

                # Set final status for UI
                if len(failure_keys) == 0:
                    feedback_callback(f"Upload complete!",
                                      percent_complete=100,
                                      has_failures=False)
                else:
                    failure_str = "\n".join(failure_keys)
                    failure_detail_str = f"Files that failed to upload:\n{failure_str}"
                    feedback_callback("",
                                      percent_complete=100,
                                      has_failures=True,
                                      failure_detail=failure_detail_str)

                # Finish up by linking everything just in case
                iom.manifest.link_revision()

                if len(failure_keys) > 0:
                    # If any uploads failed, exit non-zero so the UI knows there was an error
                    raise IOError(
                        f"{len(failure_keys)} file(s) failed to upload. Check message detail for more information"
                        " and try to sync again.")
        except Exception as err:
            logger.exception(err)
            raise