Example 1
def learn_cartpole():
    """Train an agent."""
    env = gym.make('CartPole-v0')
    try:
        agent = ActorCritic(gym_space_distribution(env.action_space),
                            gym_space_vectorizer(env.observation_space))
        with tf.Session() as sess:
            a2c = A2C(sess, agent, target_kl=0.03)
            roller = BasicRoller(env, agent, min_episodes=8, min_steps=1024)
            while True:
                with agent.frozen():
                    rollouts = roller.rollouts()
                print('mean=%f' % (mean_total_reward(rollouts), ))
                agent.actor.extend(
                    a2c.policy_update(rollouts,
                                      STEP_SIZE,
                                      NUM_STEPS,
                                      min_leaf=30))
                agent.critic.extend(
                    a2c.value_update(rollouts,
                                     VAL_STEP,
                                     NUM_STEPS,
                                     min_leaf=30))
    finally:
        env.close()
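
The snippet above references STEP_SIZE, NUM_STEPS, and VAL_STEP without defining them; they are module-level hyperparameters in the original source. A minimal sketch of what those definitions might look like (the names come from the snippet, the values are assumptions):

# Hypothetical values for the hyperparameters learn_cartpole() expects;
# the real numbers are defined elsewhere in the original module.
STEP_SIZE = 0.02   # step size for the policy (actor) update
VAL_STEP = 0.05    # step size for the value-function (critic) update
NUM_STEPS = 25     # update iterations per batch of rollouts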
Example 2
def test_truncation(stateful, state_tuple):
    """
    Test sequence truncation for TruncatedRoller with a
    batch of one environment.
    """
    def env_fn():
        return SimpleEnv(7, (5, 3), 'uint8')

    env = env_fn()
    model = SimpleModel(env.action_space.low.shape,
                        stateful=stateful,
                        state_tuple=state_tuple)
    basic_roller = BasicRoller(env, model, min_episodes=5)
    expected = basic_roller.rollouts()
    total_timesteps = sum([x.num_steps for x in expected])

    batched_env = batched_gym_env([env_fn], sync=True)
    trunc_roller = TruncatedRoller(batched_env, model,
                                   total_timesteps // 2 + 1)
    actual1 = trunc_roller.rollouts()
    assert actual1[-1].trunc_end
    actual2 = trunc_roller.rollouts()
    expected1, expected2 = _artificial_truncation(expected,
                                                  len(actual1) - 1,
                                                  actual1[-1].num_steps)
    assert len(actual2) == len(expected2) + 1
    actual2 = actual2[:-1]
    _compare_rollout_batch(actual1, expected1)
    _compare_rollout_batch(actual2, expected2)
Example 3
def run_ppo():
    """
    Run a training worker.
    """
    env = gym.make('CartPole-v0')
    action_dist = gym_space_distribution(env.action_space)
    obs_vectorizer = gym_space_vectorizer(env.observation_space)

    with tf.Session() as sess:
        model = MLP(sess, action_dist, obs_vectorizer, layer_sizes=[32])

        # Deal with CartPole-v0 reward scale.
        model.scale_outputs(20)

        roller = BasicRoller(env, model, min_episodes=30)
        ppo = PPO(model)
        optimizer = MPIOptimizer(tf.train.AdamOptimizer(learning_rate=1e-3),
                                 -ppo.objective)

        sess.run(tf.global_variables_initializer())
        optimizer.sync_from_root(sess)
        for i in range(50):
            rollouts = roller.rollouts()
            # pylint: disable=E1101
            print('batch %d: rank=%d mean=%f' %
                  (i, MPI.COMM_WORLD.Get_rank(), mean_total_reward(rollouts)))
            mpi_ppo(ppo, optimizer, rollouts, log_fn=print)
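
run_ppo() is written as the work done by a single MPI rank. A minimal sketch of an entry point, assuming the function lives in a script launched under MPI (the file name and process count are placeholders):

# Hypothetical entry point; every rank runs the same script and
# MPIOptimizer averages gradients across ranks.
# Typical launch: mpiexec -n 4 python run_ppo.py
if __name__ == '__main__':
    run_ppo()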
Example 4
def test_ep_batches(stateful, state_tuple, limits):
    """
    Test that EpisodeRoller is equivalent to a
    BasicRoller when run on a batch of envs.
    """
    def env_fn():
        return SimpleEnv(3, (4, 5), 'uint8')

    model = SimpleModel((4, 5), stateful=stateful, state_tuple=state_tuple)

    batched_env = batched_gym_env([env_fn] * 21, num_sub_batches=7, sync=True)
    ep_roller = EpisodeRoller(batched_env, model, **limits)
    actual = ep_roller.rollouts()

    total_steps = sum([r.num_steps for r in actual])
    assert len(actual) >= ep_roller.min_episodes
    assert total_steps >= ep_roller.min_steps

    if 'min_steps' not in limits:
        num_eps = ep_roller.min_episodes + batched_env.num_envs - 1
        assert len(actual) == num_eps

    basic_roller = BasicRoller(env_fn(), model, min_episodes=len(actual))
    expected = basic_roller.rollouts()

    _compare_rollout_batch(actual, expected)
Example 5
    def _test_truncation_case(self, stateful, state_tuple):
        """
        Test rollout truncation and continuation for a
        specific set of model parameters.
        """
        env_fn = lambda: SimpleEnv(7, (5, 3), 'uint8')
        env = env_fn()
        model = SimpleModel(env.action_space.low.shape,
                            stateful=stateful,
                            state_tuple=state_tuple)
        basic_roller = BasicRoller(env, model, min_episodes=5)
        expected = basic_roller.rollouts()
        total_timesteps = sum([x.num_steps for x in expected])

        batched_env = batched_gym_env([env_fn], sync=True)
        trunc_roller = TruncatedRoller(batched_env, model,
                                       total_timesteps // 2 + 1)
        actual1 = trunc_roller.rollouts()
        self.assertTrue(actual1[-1].trunc_end)
        actual2 = trunc_roller.rollouts()
        expected1, expected2 = _artificial_truncation(expected,
                                                      len(actual1) - 1,
                                                      actual1[-1].num_steps)
        self.assertEqual(len(actual2), len(expected2) + 1)
        actual2 = actual2[:-1]
        _compare_rollout_batch(self, actual1, expected1)
        _compare_rollout_batch(self, actual2, expected2)
Example 6
    def _test_batch_equivalence_case(self, stateful, state_tuple,
                                     **roller_kwargs):
        """
        Test BasicRoller equivalence when using a batch of
        environments.
        """
        env_fn = lambda: SimpleEnv(3, (4, 5), 'uint8')
        model = SimpleModel((4, 5), stateful=stateful, state_tuple=state_tuple)

        batched_env = batched_gym_env([env_fn] * 21,
                                      num_sub_batches=7,
                                      sync=True)
        ep_roller = EpisodeRoller(batched_env, model, **roller_kwargs)
        actual = ep_roller.rollouts()

        total_steps = sum([r.num_steps for r in actual])
        self.assertTrue(len(actual) >= ep_roller.min_episodes)
        self.assertTrue(total_steps >= ep_roller.min_steps)

        if 'min_steps' not in roller_kwargs:
            num_eps = ep_roller.min_episodes + batched_env.num_envs - 1
            self.assertTrue(len(actual) == num_eps)

        basic_roller = BasicRoller(env_fn(), model, min_episodes=len(actual))
        expected = basic_roller.rollouts()

        _compare_rollout_batch(self, actual, expected)
Example 7
def test_ep_basic_equivalence(stateful, state_tuple, limits):
    """
    Test that EpisodeRoller is equivalent to a
    BasicRoller when run on a single environment.
    """
    env_fn = lambda: SimpleEnv(3, (4, 5), 'uint8')
    env = env_fn()
    model = SimpleModel(env.action_space.low.shape,
                        stateful=stateful,
                        state_tuple=state_tuple)
    basic_roller = BasicRoller(env, model, **limits)
    expected = basic_roller.rollouts()

    batched_env = batched_gym_env([env_fn], sync=True)
    ep_roller = EpisodeRoller(batched_env, model, **limits)
    actual = ep_roller.rollouts()
    _compare_rollout_batch(actual, expected)
Example 8
def learn_setup(env_id=None,
                timesteps=int(5e6),
                env_name=None,
                param_scale=1,
                name="test",
                expnum=0,
                env=None,
                n_episodes=None,
                n_steps_per_episode=None,
                reward_threshold=0,
                CMA_mu=None,
                CMA_cmean=None,
                CMA_rankmu=None,
                CMA_rankone=None,
                log_file=None):
    """
    Build the session, environment, model, roller, and CMA trainer,
    and return the objects needed by the training loop in a dict.
    """
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    if env_id is None:
        env_id = env_name
    if env is None:
        env = make_vec_env(env_id,
                           "mujoco",
                           1,
                           None,
                           reward_scale=1.0,
                           flatten_dict_observations=True)

    if log_file is None:
        log_file = os.path.join(
            'results', "recent" + name + "_" + str(expnum) + ".monitor.csv")
        log_npy = os.path.join('results',
                               "recent" + name + '_' + str(expnum) + '.npy')
    #env = LoggedEnv(env, log_file, log_npy)

    model = ContinuousMLP(sess, env.action_space,
                          gym_space_vectorizer(env.observation_space))
    roller = BasicRoller(env,
                         model,
                         min_episodes=1,
                         min_steps=n_steps_per_episode)
    sess.run(tf.global_variables_initializer())
    trainer = CMATrainer(sess,
                         scale=param_scale,
                         CMA_mu=CMA_mu,
                         CMA_cmean=CMA_cmean,
                         CMA_rankmu=CMA_rankmu,
                         CMA_rankone=CMA_rankone)  #, popsize=n_episodes)
    rewards = []
    local_variables = {
        'roller': roller,
        'trainer': trainer,
        'env_id': env_id,
        'reward_threshold': reward_threshold,
        'rewards': rewards
    }
    return local_variables
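
learn_setup() only constructs the training objects. A minimal sketch of a driver loop that consumes the returned dictionary, reusing the trainer.train(roller) interface shown in Example 14 below (the environment name and step budget are placeholders):

# Hypothetical driver; assumes the CMATrainer.train(roller) API from
# Example 14, which returns (steps_taken, episode_rewards).
local_vars = learn_setup(env_name='Hopper-v2', n_steps_per_episode=1000)
roller = local_vars['roller']
trainer = local_vars['trainer']
steps = 0
while steps < int(1e6):
    sub_steps, sub_rewards = trainer.train(roller)
    steps += sub_steps
    local_vars['rewards'].extend(sub_rewards)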
Example 9
    def _test_basic_equivalence_case(self, stateful, state_tuple):
        """
        Test BasicRoller equivalence for a specific set of
        model settings.
        """
        env_fn = lambda: SimpleEnv(3, (4, 5), 'uint8')
        env = env_fn()
        model = SimpleModel(env.action_space.low.shape,
                            stateful=stateful,
                            state_tuple=state_tuple)
        basic_roller = BasicRoller(env, model, min_episodes=5)
        expected = basic_roller.rollouts()
        total_timesteps = sum([x.num_steps for x in expected])

        batched_env = batched_gym_env([env_fn], sync=True)
        trunc_roller = TruncatedRoller(batched_env, model, total_timesteps)
        actual = trunc_roller.rollouts()
        _compare_rollout_batch(self, actual, expected)
Example 10
    def _test_basic_equivalence_case(self, stateful, state_tuple,
                                     **roller_kwargs):
        """
        Test BasicRoller equivalence for a single env in a
        specific case.
        """
        env_fn = lambda: SimpleEnv(3, (4, 5), 'uint8')
        env = env_fn()
        model = SimpleModel(env.action_space.low.shape,
                            stateful=stateful,
                            state_tuple=state_tuple)
        basic_roller = BasicRoller(env, model, **roller_kwargs)
        expected = basic_roller.rollouts()

        batched_env = batched_gym_env([env_fn], sync=True)
        ep_roller = EpisodeRoller(batched_env, model, **roller_kwargs)
        actual = ep_roller.rollouts()
        _compare_rollout_batch(self, actual, expected)
Example 11
    def _test_batches_consistency(self, batch_size, trunc_start):
        """
        Make sure that batches() produces the same outputs
        that we got with step().
        """
        env = TupleCartPole()
        try:
            roller = BasicRoller(env, self.model, min_episodes=7)
            rollouts = roller.rollouts()
            if trunc_start:
                rollouts = self._truncate_first(rollouts)
            num_batches = 10
            for batch in self.model.batches(rollouts, batch_size=batch_size):
                num_batches -= 1
                if num_batches == 0:
                    break
                self._test_batch(rollouts, batch)
        finally:
            env.close()
Example 12
def test_trunc_basic_equivalence(stateful, state_tuple):
    """
    Test that TruncatedRoller is equivalent to BasicRoller
    for batches of one environment when the episodes end
    cleanly.
    """
    env_fn = lambda: SimpleEnv(3, (4, 5), 'uint8')
    env = env_fn()
    model = SimpleModel(env.action_space.low.shape,
                        stateful=stateful,
                        state_tuple=state_tuple)
    basic_roller = BasicRoller(env, model, min_episodes=5)
    expected = basic_roller.rollouts()
    total_timesteps = sum([x.num_steps for x in expected])

    batched_env = batched_gym_env([env_fn], sync=True)
    trunc_roller = TruncatedRoller(batched_env, model, total_timesteps)
    actual = trunc_roller.rollouts()
    _compare_rollout_batch(actual, expected)
Example 13
def run_algorithm(algo_name):
    """
    Run the specified training algorithm.
    """
    env = gym.make('CartPole-v0')
    action_dist = gym_space_distribution(env.action_space)
    obs_vectorizer = gym_space_vectorizer(env.observation_space)

    with tf.Session() as sess:
        model = MLP(sess, action_dist, obs_vectorizer, layer_sizes=[32])

        # Deal with CartPole-v0 reward scale.
        model.scale_outputs(20)

        roller = BasicRoller(env, model, min_episodes=30)
        inner_loop = algorithm_inner_loop(algo_name, model)

        sess.run(tf.global_variables_initializer())
        print('running algorithm:', algo_name)
        for i in range(50):
            rollouts = roller.rollouts()
            print('batch %d: mean=%f' % (i, mean_total_reward(rollouts)))
            inner_loop(rollouts)
Example 14
def training_loop(env_id=None,
                  timesteps=int(5e6),
                  param_scale=1,
                  log_file=None):
    """
    Run CMA on the environment.
    """
    if log_file is None:
        log_file = os.path.join('results', env_id + '.monitor.csv')
    env = LoggedEnv(gym.make(env_id), log_file)
    with tf.Session() as sess:
        model = ContinuousMLP(sess, env.action_space,
                              gym_space_vectorizer(env.observation_space))
        roller = BasicRoller(env, model, min_episodes=4, min_steps=500)
        sess.run(tf.global_variables_initializer())
        trainer = CMATrainer(sess, scale=param_scale)
        steps = 0
        rewards = []
        while steps < timesteps:
            sub_steps, sub_rewards = trainer.train(roller)
            steps += sub_steps
            rewards.extend(sub_rewards)
            print('%s: steps=%d mean=%f batch_mean=%f' %
                  (env_id, steps, np.mean(rewards), np.mean(sub_rewards)))
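
A single call drives the whole loop; the environment id and step budget below are placeholders rather than values from the source:

# Hypothetical invocation; any continuous-control Gym id would do.
training_loop(env_id='HalfCheetah-v2', timesteps=int(1e6))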
Example 15
if __name__ == '__main__':
    # Evaluate a random agent on Super Mario Bros. and report the
    # highest-scoring episodes.
    monitor = "results/unbiased_random/1"
    action_repeat = True
    single_life = True
    render = None
    env = retro.make("SuperMarioBros-Nes")
    env = MarioDiscretizer(env)
    if single_life:
        env = SingleLifeEnv(env)
    if monitor is not None:
        env = Monitor(env, monitor, video_callable=lambda i: False)
    if render is not None:
        env = AutoRenderer(env, auto_render_period=render)
    if action_repeat:
        env = FrameStack(env, 4)
    # model = WeightedRandomAgent()
    model = RandomAgent(lambda: env.action_space.sample())
    player = BasicRoller(env, model, min_episodes=1)
    # total_rollouts = [player.rollouts() for rollout_i in trange(40)]
    # flat_rollouts = reduce(list.__add__, total_rollouts)
    # total_rewards = map(lambda r: r.total_reward, flat_rollouts)
    # [filename for path in dirs for filename in os.listdir(path)]
    total_rewards = []
    for i in tqdm(range(150)):
        rollouts = player.rollouts()
        total_rewards += [roll.total_reward for roll in rollouts]
    print(total_rewards)
    rewards_numbers = list(zip(count(), total_rewards))
    sorted_reward_numbers = sorted(rewards_numbers, key=lambda t: t[1])
    print(sorted_reward_numbers[-5:])