Example #1
def test_init_with_crashed_worker():
    max_episode_length = 16
    env = GarageEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    tasks = SetTaskSampler(lambda: GarageEnv(PointEnv()))
    n_workers = 2
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)

    class CrashingPolicy:
        def reset(self, **kwargs):
            raise Exception('Intentional subprocess crash')

    bad_policy = CrashingPolicy()

    # This causes worker 2 to crash.
    sampler = MultiprocessingSampler.from_worker_factory(
        workers, [policy, bad_policy], envs=tasks.sample(n_workers))
    rollouts = sampler.obtain_samples(0, 160, None)
    assert sum(rollouts.lengths) >= 160
    sampler.shutdown_worker()
    env.close()
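
The FixedPolicy used above is a scripted helper policy from the test suite and is not reproduced in this listing. A minimal sketch of a policy playing the same role (a sketch only; all names here are assumed and this is not the library's implementation):

import numpy as np


class ScriptedPolicy:
    """Plays back a fixed list of actions, one per time step (illustrative)."""

    def __init__(self, env_spec, scripted_actions):
        self._spec = env_spec
        self._actions = scripted_actions
        self._step = 0

    def reset(self, **kwargs):
        # Replay the script from the start of every episode.
        self._step = 0

    def get_action(self, observation):
        action = self._actions[self._step]
        self._step += 1
        return action, {}

    def get_param_values(self):
        # Samplers ship policy parameters to workers; a scripted policy has none.
        return np.empty(0)

    def set_param_values(self, params):
        pass
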
def test_obtain_exact_episodes():
    max_episode_length = 15
    n_workers = 8
    env = PointEnv()
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_episode_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = MultiprocessingSampler.from_worker_factory(workers,
                                                         policies,
                                                         envs=env)
    n_eps_per_worker = 3
    episodes = sampler.obtain_exact_episodes(n_eps_per_worker,
                                             agent_update=policies)
    # At least one action per episode.
    assert sum(episodes.lengths) >= n_workers * n_eps_per_worker
    # All of the episodes.
    assert len(episodes.lengths) == n_workers * n_eps_per_worker
    worker = -1
    for count, eps in enumerate(episodes.split()):
        if count % n_eps_per_worker == 0:
            worker += 1
        assert (eps.actions == per_worker_actions[worker]).all()
    sampler.shutdown_worker()
    env.close()
def test_update_envs_env_update():
    max_episode_length = 16
    env = PointEnv(max_episode_length=max_episode_length)
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    tasks = SetTaskSampler(PointEnv)
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = MultiprocessingSampler.from_worker_factory(workers, policy, env)
    episodes = sampler.obtain_samples(0,
                                      161,
                                      np.asarray(policy.get_param_values()),
                                      env_update=tasks.sample(n_workers))
    mean_rewards = []
    goals = []
    for eps in episodes.split():
        mean_rewards.append(eps.rewards.mean())
        goals.append(eps.env_infos['task'][0]['goal'])
    assert np.var(mean_rewards) > 0
    assert np.var(goals) > 0
    with pytest.raises(ValueError):
        sampler.obtain_samples(0,
                               10,
                               np.asarray(policy.get_param_values()),
                               env_update=tasks.sample(n_workers + 1))
    sampler.shutdown_worker()
    env.close()
    def setup_method(self):
        self.env = PointEnv()
        self.base_len = len(self.env.reset())
        self.n_total_tasks = 5
        self.task_index = 1
        self.wrapped = TaskOnehotWrapper(self.env, self.task_index,
                                         self.n_total_tasks)
Example #5
def multi_env_trpo(ctxt=None, seed=1):
    """Train TRPO on two different PointEnv instances.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.
        seed (int): Used to seed the random number generator to produce
            determinism.

    """
    set_seed(seed)
    with LocalTFRunner(ctxt) as runner:
        env1 = TfEnv(normalize(PointEnv(goal=(-1., 0.))))
        env2 = TfEnv(normalize(PointEnv(goal=(1., 0.))))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=2048, plot=False)
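
Assuming multi_env_trpo is decorated with garage's wrap_experiment (the decorator is not shown in this listing), the experiment is typically launched like this:

from garage import wrap_experiment

# wrap_experiment supplies the ExperimentContext (ctxt) and sets up the log
# and snapshot directories before calling the wrapped function.
multi_env_trpo = wrap_experiment(multi_env_trpo)

if __name__ == '__main__':
    multi_env_trpo(seed=1)
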
Example #6
def run_task(snapshot_config, *_):
    """Run task.

    Args:
        snapshot_config (garage.experiment.SnapshotConfig): The snapshot
            configuration used by LocalRunner to create the snapshotter.

        _ (object): Ignored by this function.

    """
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env1 = TfEnv(normalize(PointEnv(goal=(-1., 0.))))
        env2 = TfEnv(normalize(PointEnv(goal=(1., 0.))))
        env = MultiEnvWrapper([env1, env2])

        policy = GaussianMLPPolicy(env_spec=env.spec)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=100,
                    discount=0.99,
                    gae_lambda=0.95,
                    lr_clip_range=0.2,
                    policy_ent_coeff=0.0)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=2048, plot=False)
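
This variant targets the older garage API in which run_task is handed to run_experiment, which constructs the snapshot_config for it. Assuming that API, a typical launch is:

from garage.experiment import run_experiment

if __name__ == '__main__':
    # run_experiment builds the SnapshotConfig passed to run_task above.
    run_experiment(run_task, snapshot_mode='last', seed=1)
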
Example #7
def test_pickle_meta_evaluator():
    set_seed(100)
    tasks = SetTaskSampler(lambda: GarageEnv(PointEnv()))
    max_path_length = 200
    env = GarageEnv(PointEnv())
    n_traj = 3
    with tempfile.TemporaryDirectory() as log_dir_name:
        runner = LocalRunner(
            SnapshotConfig(snapshot_dir=log_dir_name,
                           snapshot_mode='last',
                           snapshot_gap=1))
        meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                  max_path_length=max_path_length,
                                  n_test_tasks=10,
                                  n_exploration_traj=n_traj)
        policy = RandomPolicy(env.spec.action_space)
        algo = MockAlgo(env, policy, max_path_length, n_traj, meta_eval)
        runner.setup(algo, env)
        log_file = tempfile.NamedTemporaryFile()
        csv_output = CsvOutput(log_file.name)
        logger.add_output(csv_output)
        meta_eval.evaluate(algo)
        meta_eval_pickle = cloudpickle.dumps(meta_eval)
        meta_eval2 = cloudpickle.loads(meta_eval_pickle)
        meta_eval2.evaluate(algo)
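
RandomPolicy and MockAlgo above are test fixtures that are not part of this listing: MockAlgo is an algorithm stub that wires the policy, max_path_length, and MetaEvaluator together, and RandomPolicy simply samples from the action space. A minimal sketch of such a random policy (names assumed, not the fixture's actual code):

class UniformRandomPolicy:
    """Draws every action uniformly from the action space (illustrative)."""

    def __init__(self, action_space):
        self._action_space = action_space

    def reset(self, **kwargs):
        pass

    def get_action(self, observation):
        return self._action_space.sample(), {}
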
def test_pickle():
    max_episode_length = 16
    env = PointEnv()
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    tasks = SetTaskSampler(PointEnv)
    n_workers = 4
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = MultiprocessingSampler.from_worker_factory(workers, policy, env)
    sampler_pickled = pickle.dumps(sampler)
    sampler.shutdown_worker()
    sampler2 = pickle.loads(sampler_pickled)
    episodes = sampler2.obtain_samples(0,
                                       500,
                                       np.asarray(policy.get_param_values()),
                                       env_update=tasks.sample(n_workers))
    mean_rewards = []
    goals = []
    for eps in episodes.split():
        mean_rewards.append(eps.rewards.mean())
        goals.append(eps.env_infos['task'][0]['goal'])
    assert np.var(mean_rewards) > 0
    assert np.var(goals) > 0
    sampler2.shutdown_worker()
    env.close()
class TestSingleWrappedEnv:
    def setup_method(self):
        self.env = PointEnv()
        obs, _ = self.env.reset()
        self.base_len = len(obs)
        self.n_total_tasks = 5
        self.task_index = 1
        self.wrapped = TaskOnehotWrapper(self.env, self.task_index,
                                         self.n_total_tasks)

    def test_produces_correct_onehots(self):
        obs, _ = self.wrapped.reset()
        assert len(obs) == self.base_len + self.n_total_tasks
        assert (obs[-self.n_total_tasks:] == np.array([0, 1, 0, 0, 0])).all()

    def test_spec_obs_space(self):
        obs, _ = self.wrapped.reset()
        assert self.wrapped.observation_space.contains(obs)
        assert self.wrapped.spec.observation_space.contains(obs)
        assert (self.wrapped.spec.observation_space ==
                self.wrapped.observation_space)

    def test_visualization(self):
        assert self.env.render_modes == self.wrapped.render_modes
        mode = self.env.render_modes[0]
        assert self.env.render(mode) == self.wrapped.render(mode)
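
The one-hot checks above rely on TaskOnehotWrapper appending a task indicator to each observation. The augmentation amounts to the following (a plain-numpy sketch, not the wrapper's source):

import numpy as np


def append_task_onehot(obs, task_index, n_total_tasks):
    # With task_index=1 and n_total_tasks=5 this appends [0, 1, 0, 0, 0],
    # which is exactly what test_produces_correct_onehots checks.
    onehot = np.zeros(n_total_tasks)
    onehot[task_index] = 1.
    return np.concatenate([obs, onehot])
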
Example #10
def test_meta_evaluator():
    set_seed(100)
    tasks = SetTaskSampler(lambda: GarageEnv(PointEnv()))
    max_path_length = 200
    with tempfile.TemporaryDirectory() as log_dir_name:
        runner = LocalRunner(
            SnapshotConfig(snapshot_dir=log_dir_name,
                           snapshot_mode='last',
                           snapshot_gap=1))
        env = GarageEnv(PointEnv())
        algo = OptimalActionInference(env=env, max_path_length=max_path_length)
        runner.setup(algo, env)
        meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                  max_path_length=max_path_length,
                                  n_test_tasks=10)
        log_file = tempfile.NamedTemporaryFile()
        csv_output = CsvOutput(log_file.name)
        logger.add_output(csv_output)
        meta_eval.evaluate(algo)
        logger.log(tabular)
        meta_eval.evaluate(algo)
        logger.log(tabular)
        logger.dump_output_type(CsvOutput)
        logger.remove_output_type(CsvOutput)
        with open(log_file.name, 'r') as file:
            rows = list(csv.DictReader(file))
        assert len(rows) == 2
        assert float(
            rows[0]['MetaTest/__unnamed_task__/TerminationRate']) < 1.0
        assert float(rows[0]['MetaTest/__unnamed_task__/Iteration']) == 0
        assert (float(rows[0]['MetaTest/__unnamed_task__/MaxReturn']) >= float(
            rows[0]['MetaTest/__unnamed_task__/AverageReturn']))
        assert (float(rows[0]['MetaTest/__unnamed_task__/AverageReturn']) >=
                float(rows[0]['MetaTest/__unnamed_task__/MinReturn']))
        assert float(rows[1]['MetaTest/__unnamed_task__/Iteration']) == 1
Example #11
def test_meta_evaluator_with_tf():
    set_seed(100)
    tasks = SetTaskSampler(lambda: GarageEnv(PointEnv()))
    max_path_length = 200
    env = GarageEnv(PointEnv())
    n_traj = 3
    with tempfile.TemporaryDirectory() as log_dir_name:
        ctxt = SnapshotConfig(snapshot_dir=log_dir_name,
                              snapshot_mode='none',
                              snapshot_gap=1)
        with LocalTFRunner(ctxt) as runner:
            meta_eval = MetaEvaluator(test_task_sampler=tasks,
                                      max_path_length=max_path_length,
                                      n_test_tasks=10,
                                      n_exploration_traj=n_traj)
            policy = GaussianMLPPolicy(env.spec)
            algo = MockAlgo(env, policy, max_path_length, n_traj, meta_eval)
            runner.setup(algo, env)
            log_file = tempfile.NamedTemporaryFile()
            csv_output = CsvOutput(log_file.name)
            logger.add_output(csv_output)
            meta_eval.evaluate(algo)
            algo_pickle = cloudpickle.dumps(algo)
        tf.compat.v1.reset_default_graph()
        with LocalTFRunner(ctxt) as runner:
            algo2 = cloudpickle.loads(algo_pickle)
            runner.setup(algo2, env)
            runner.train(10, 0)
Example #12
    def test_observation_dimension_with_max_obs_dim(self):
        env = PointEnv()
        wrapped_env = RL2Env(PointEnv(), max_obs_dim=10)
        assert wrapped_env.spec.observation_space.shape[
            0] == 10 + env.action_space.shape[0] + 2
        obs = wrapped_env.reset()
        assert 10 + env.action_space.shape[0] + 2 == obs.shape[0]
        obs, _, _, _ = wrapped_env.step(env.action_space.sample())
        assert 10 + env.action_space.shape[0] + 2 == obs.shape[0]
    def test_visualization(self):
        inner_env = PointEnv(goal=(1., 2.))
        env = NormalizedEnv(inner_env)

        env.visualize()
        env.reset()
        assert inner_env.render_modes == env.render_modes
        mode = inner_env.render_modes[0]
        assert inner_env.render(mode) == env.render(mode)
Example #14
    def test_observation_dimension(self):
        env = PointEnv()
        wrapped_env = RL2Env(PointEnv())
        assert wrapped_env.spec.observation_space.shape[0] == (
            env.observation_space.shape[0] + env.action_space.shape[0] + 2)
        obs = env.reset()
        obs2 = wrapped_env.reset()
        assert obs.shape[0] + env.action_space.shape[0] + 2 == obs2.shape[0]
        obs, _, _, _ = env.step(env.action_space.sample())
        obs2, _, _, _ = wrapped_env.step(env.action_space.sample())
        assert obs.shape[0] + env.action_space.shape[0] + 2 == obs2.shape[0]
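
The "+ 2" in the dimension checks above comes from RL2-style observation augmentation: each observation is extended with the previous action, the previous reward, and a done flag, giving obs_dim + action_dim + 2 entries. A plain-numpy sketch of that layout (illustrative, not RL2Env's actual code):

import numpy as np


def rl2_augment(obs, prev_action, prev_reward, done):
    # Layout: [observation, previous action, previous reward, done flag].
    return np.concatenate([obs, prev_action, [prev_reward], [float(done)]])
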
Example #15
    def test_visualization(self):
        env = PointEnv()
        wrapped_env = RL2Env(env)

        assert env.render_modes == wrapped_env.render_modes
        mode = env.render_modes[0]
        assert env.render(mode) == wrapped_env.render(mode)

        wrapped_env.reset()
        wrapped_env.visualize()
        wrapped_env.step(wrapped_env.action_space.sample())
        wrapped_env.close()
def test_wrapped_env_list_produces_correct_onehots():
    envs = [PointEnv(), PointEnv(), PointEnv(), PointEnv()]
    base_len = len(envs[0].reset())
    n_total_tasks = len(envs)
    wrapped = TaskOnehotWrapper.wrap_env_list(envs)
    assert len(wrapped) == n_total_tasks
    for i, env in enumerate(wrapped):
        obs = env.reset()
        assert len(obs) == base_len + n_total_tasks
        onehot = np.zeros(n_total_tasks)
        onehot[i] = 1.
        assert (obs[-n_total_tasks:] == onehot).all()
        next_obs, _, _, _ = env.step(env.action_space.sample())
        assert (next_obs[-n_total_tasks:] == onehot).all()
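
TaskOnehotWrapper.wrap_env_list builds one wrapper per environment, giving each a distinct task index and the shared task count. Conceptually it does something like the following (a sketch, not the wrapper's actual source):

def wrap_env_list_sketch(envs):
    # Observations from the i-th environment then carry the one-hot for task i.
    n_total_tasks = len(envs)
    return [
        TaskOnehotWrapper(env, i, n_total_tasks)
        for i, env in enumerate(envs)
    ]
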
def test_obtain_exact_episodes(ray_local_session_fixture):
    del ray_local_session_fixture
    assert ray.is_initialized()
    max_episode_length = 15
    n_workers = 8
    env = PointEnv()
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_episode_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = RaySampler.from_worker_factory(workers, policies, envs=env)
    n_eps_per_worker = 3
    episodes = sampler.obtain_exact_episodes(n_eps_per_worker, policies)
    # At least one action per episode.
    assert sum(episodes.lengths) >= n_workers * n_eps_per_worker
    # All of the episodes.
    assert len(episodes.lengths) == n_workers * n_eps_per_worker
    worker = -1
    for count, eps in enumerate(episodes.split()):
        if count % n_eps_per_worker == 0:
            worker += 1
        assert (eps.actions == per_worker_actions[worker]).all()
Example #18
def bc_point(ctxt=None, loss='log_prob'):
    """Run Behavioral Cloning on garage.envs.PointEnv.

    Args:
        ctxt (ExperimentContext): Provided by wrap_experiment.
        loss (str): Either 'log_prob' or 'mse'.

    """
    trainer = Trainer(ctxt)
    goal = np.array([1., 1.])
    env = PointEnv(goal=goal, max_episode_length=200)
    expert = OptimalPolicy(env.spec, goal=goal)
    policy = GaussianMLPPolicy(env.spec, [8, 8])
    batch_size = 1000
    sampler = RaySampler(agents=expert,
                         envs=env,
                         max_episode_length=env.spec.max_episode_length)
    algo = BC(env.spec,
              policy,
              batch_size=batch_size,
              source=expert,
              sampler=sampler,
              policy_lr=1e-2,
              loss=loss)
    trainer.setup(algo, env)
    trainer.train(100, batch_size=batch_size)
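
OptimalPolicy above is the hand-coded expert that BC clones. A sketch of what such an expert might do for PointEnv, stepping straight toward the goal (illustrative; the class name and details here are assumed, not the library's implementation):

import numpy as np


class GreedyPointExpert:
    """Moves directly toward the goal, clipped to the action bounds."""

    def __init__(self, env_spec, goal):
        self._spec = env_spec
        self._goal = np.asarray(goal)

    def reset(self, **kwargs):
        pass

    def get_action(self, observation):
        # Assumes the first two observation entries are the agent's (x, y).
        delta = self._goal - observation[:2]
        space = self._spec.action_space
        return np.clip(delta, space.low, space.high), {}
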
def test_update_envs_env_update():
    max_path_length = 16
    env = TfEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(PointEnv)
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policy, env)
    rollouts = sampler.obtain_samples(0,
                                      160,
                                      np.asarray(policy.get_param_values()),
                                      env_update=tasks.sample(n_workers))
    mean_rewards = []
    goals = []
    for rollout in rollouts.split():
        mean_rewards.append(rollout.rewards.mean())
        goals.append(rollout.env_infos['task'][0]['goal'])
    assert np.var(mean_rewards) > 0
    assert np.var(goals) > 0
    with pytest.raises(ValueError):
        sampler.obtain_samples(0,
                               10,
                               np.asarray(policy.get_param_values()),
                               env_update=tasks.sample(n_workers + 1))
def test_obtain_exact_trajectories():
    max_path_length = 15
    n_workers = 8
    env = TfEnv(PointEnv())
    per_worker_actions = [env.action_space.sample() for _ in range(n_workers)]
    policies = [
        FixedPolicy(env.spec, [action] * max_path_length)
        for action in per_worker_actions
    ]
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policies, envs=env)
    n_traj_per_worker = 3
    rollouts = sampler.obtain_exact_trajectories(n_traj_per_worker,
                                                 agent_update=policies)
    # At least one action per trajectory.
    assert sum(rollouts.lengths) >= n_workers * n_traj_per_worker
    # All of the trajectories.
    assert len(rollouts.lengths) == n_workers * n_traj_per_worker
    worker = -1
    for count, rollout in enumerate(rollouts.split()):
        if count % n_traj_per_worker == 0:
            worker += 1
        assert (rollout.actions == per_worker_actions[worker]).all()
def test_init_with_env_updates():
    max_path_length = 16
    env = TfEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(lambda: TfEnv(PointEnv()))
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers,
                                               policy,
                                               envs=tasks.sample(n_workers))
    rollouts = sampler.obtain_samples(0, 160, policy)
    assert sum(rollouts.lengths) >= 160
    def test_does_not_modify_action(self):
        inner_env = PointEnv(goal=(1., 2.))
        env = NormalizedEnv(inner_env, scale_reward=10.)
        a = env.action_space.high + 1.
        a_copy = a.copy()
        env.reset()
        env.step(a)
        assert np.array_equal(a, a_copy)
        env.close()
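
The test above guards against in-place mutation: NormalizedEnv is expected to map actions from [-1, 1] into the wrapped environment's bounds on a fresh array, leaving the caller's array untouched. A plain-numpy sketch of that rescaling (illustrative, not the wrapper's actual code):

import numpy as np


def scale_action(action, low, high):
    # The arithmetic produces a new array, so the caller's action is unchanged.
    scaled = low + (np.asarray(action) + 1.0) * 0.5 * (high - low)
    return np.clip(scaled, low, high)
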
Example #23
def test_init_with_env_updates():
    max_episode_length = 16
    env = GarageEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    tasks = SetTaskSampler(lambda: GarageEnv(PointEnv()))
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = MultiprocessingSampler.from_worker_factory(
        workers, policy, envs=tasks.sample(n_workers))
    rollouts = sampler.obtain_samples(0, 160, policy)
    assert sum(rollouts.lengths) >= 160
    sampler.shutdown_worker()
    env.close()
    def test_pickleable(self):
        inner_env = PointEnv(goal=(1., 2.))
        env = NormalizedEnv(inner_env, scale_reward=10.)
        round_trip = pickle.loads(pickle.dumps(env))
        assert round_trip
        assert round_trip._scale_reward == env._scale_reward
        assert np.array_equal(round_trip.env._goal, env.env._goal)
        step_env(round_trip)
        round_trip.close()
        env.close()
Example #25
def test_init_with_env_updates(ray_local_session_fixture):
    del ray_local_session_fixture
    assert ray.is_initialized()
    max_path_length = 16
    env = GarageEnv(PointEnv())
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_path_length)
                         ])
    tasks = SetTaskSampler(lambda: GarageEnv(PointEnv()))
    n_workers = 8
    workers = WorkerFactory(seed=100,
                            max_path_length=max_path_length,
                            n_workers=n_workers)
    sampler = RaySampler.from_worker_factory(workers,
                                             policy,
                                             envs=tasks.sample(n_workers))
    rollouts = sampler.obtain_samples(0, 160, policy)
    assert sum(rollouts.lengths) >= 160
Example #26
def test_init_without_worker_factory():
    max_episode_length = 16
    env = PointEnv()
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    sampler = MultiprocessingSampler(agents=policy,
                                     envs=env,
                                     seed=100,
                                     max_episode_length=max_episode_length)
    worker_factory = WorkerFactory(seed=100,
                                   max_episode_length=max_episode_length)
    assert sampler._factory._seed == worker_factory._seed
    assert (sampler._factory._max_episode_length ==
            worker_factory._max_episode_length)
    with pytest.raises(TypeError, match='Must construct a sampler from'):
        MultiprocessingSampler(agents=policy, envs=env)
    sampler.shutdown_worker()
    env.close()
Example #27
def tutorial_vpg(ctxt=None):
    """Train VPG with PointEnv environment.

    Args:
        ctxt (ExperimentContext): The experiment configuration used by
            :class:`~LocalRunner` to create the :class:`~Snapshotter`.

    """
    set_seed(100)
    runner = LocalRunner(ctxt)
    env = PointEnv()
    policy = GaussianMLPPolicy(env.spec)
    algo = SimpleVPG(env.spec, policy)
    runner.setup(algo, env)
    runner.train(n_epochs=200, batch_size=4000)
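
SimpleVPG is the algorithm built step by step in garage's tutorial and is not reproduced here. The runner only relies on an object of roughly the following shape; this is a skeleton with assumed details, not the tutorial's full implementation:

from garage.sampler import RaySampler


class MinimalAlgo:
    """Bare-bones algorithm skeleton showing what the runner drives."""

    sampler_cls = RaySampler  # sampler class used by the runner (assumed here)

    def __init__(self, env_spec, policy):
        self.env_spec = env_spec
        self.policy = policy
        self.max_path_length = 200  # illustrative value

    def train(self, runner):
        # The runner owns the epoch loop; the algorithm requests samples and
        # performs its own update on each batch.
        for epoch in runner.step_epochs():
            samples = runner.obtain_samples(epoch)
            self._train_once(samples)

    def _train_once(self, samples):
        # The policy-gradient update would go here (omitted in this sketch).
        pass
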
Example #28
def tutorial_vpg(ctxt=None):
    """Train VPG with PointEnv environment.

    Args:
        ctxt (garage.experiment.ExperimentContext): The experiment
            configuration used by LocalRunner to create the snapshotter.

    """
    set_seed(100)
    with LocalTFRunner(ctxt) as runner:
        env = GarageEnv(PointEnv())
        policy = GaussianMLPPolicy(env.spec)
        algo = SimpleVPG(env.spec, policy)
        runner.setup(algo, env)
        runner.train(n_epochs=200, batch_size=4000)
Example #29
def test_no_seed():
    max_episode_length = 16
    env = PointEnv()
    policy = FixedPolicy(env.spec,
                         scripted_actions=[
                             env.action_space.sample()
                             for _ in range(max_episode_length)
                         ])
    n_workers = 8
    workers = WorkerFactory(seed=None,
                            max_episode_length=max_episode_length,
                            n_workers=n_workers)
    sampler = LocalSampler.from_worker_factory(workers, policy, env)
    episodes = sampler.obtain_samples(0, 160, policy)
    assert sum(episodes.lengths) >= 160
Example #30
def test_bc_point_sample_batches():
    deterministic.set_seed(100)
    runner = LocalRunner(snapshot_config)
    goal = np.array([1., 1.])
    env = PointEnv(goal=goal)
    max_episode_length = 100
    source = list(expert_source(env, goal, max_episode_length, 5))
    policy = DeterministicMLPPolicy(env.spec, hidden_sizes=[8, 8])
    batch_size = 600
    algo = BC(env.spec,
              policy,
              batch_size=batch_size,
              source=source,
              policy_lr=1e-2,
              loss='mse')
    runner.setup(algo, env)
    run_bc(runner, algo, batch_size)
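
The expert_source and run_bc helpers come from the surrounding test module and are not shown here; expert_source supplies batches of expert demonstrations for BC to clone. A rough sketch of the idea using gym-style reset/step conventions (all names, shapes, and conventions here are illustrative):

import numpy as np


def scripted_demonstrations(env, goal, max_episode_length, n_episodes):
    """Yield (observations, actions) arrays from a greedy expert (illustrative)."""
    for _ in range(n_episodes):
        observations, actions = [], []
        obs = env.reset()
        for _ in range(max_episode_length):
            # Greedy expert: move straight toward the goal, clipped to the
            # action bounds (same idea as the expert in bc_point above).
            action = np.clip(goal - obs[:2], env.action_space.low,
                             env.action_space.high)
            observations.append(obs)
            actions.append(action)
            obs, _, done, _ = env.step(action)
            if done:
                break
        yield np.asarray(observations), np.asarray(actions)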