Example #1
    def _test_abc(self,
                  steps=100000,
                  require_success=True,
                  gpu=-1,
                  load_model=False):

        if self.recurrent and gpu >= 0:
            self.skipTest(
                'NStepLSTM does not support double backprop with GPU.')
        if self.recurrent and chainer.__version__ == '7.0.0b3':
            self.skipTest(
                'chainer==7.0.0b3 has a bug in double backprop of LSTM.'
                ' See https://github.com/chainer/chainer/pull/8037')

        env, _ = self.make_env_and_successful_return(test=False)
        test_env, successful_return = self.make_env_and_successful_return(
            test=True)

        agent = self.make_agent(env, gpu)

        if load_model:
            print('Load agent from', self.agent_dirname)
            agent.load(self.agent_dirname)

        max_episode_len = None if self.episodic else 2

        # Train
        train_agent_with_evaluation(
            agent=agent,
            env=env,
            eval_env=test_env,
            steps=steps,
            outdir=self.tmpdir,
            eval_interval=200,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=successful_return,
            train_max_episode_len=max_episode_len,
        )

        agent.stop_episode()

        # Test
        n_test_runs = 10
        eval_returns = run_evaluation_episodes(
            test_env,
            agent,
            n_steps=None,
            n_episodes=n_test_runs,
            max_episode_len=max_episode_len,
        )
        if require_success:
            n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
            self.assertEqual(n_succeeded, n_test_runs)

        # Save
        agent.save(self.agent_dirname)
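
The success check in _test_abc counts how many evaluation returns reach successful_return and requires all n_test_runs of them to succeed. A standalone illustration of that numpy idiom (the values below are made up for demonstration):

import numpy as np

eval_returns = [1.0, 0.8, 1.0]   # hypothetical evaluation returns
successful_return = 0.9          # hypothetical success threshold
# Count how many returns reach the threshold.
n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
assert n_succeeded == 2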
Example #2
    def test_timesteps(self):
        agent = mock.Mock()
        env = mock.Mock()
        # First episode: 0 -> 1 -> 2 -> 3 (reset)
        # Second episode: 4 -> 5 -> 6 -> 7 (done)
        env.reset.side_effect = [('state', 0), ('state', 4)]
        env.step.side_effect = [
            (('state', 1), 0.1, False, {}),
            (('state', 2), 0.2, False, {}),
            (('state', 3), 0.3, False, {
                'needs_reset': True
            }),
            (('state', 5), -0.5, False, {}),
            (('state', 6), 0, False, {}),
            (('state', 7), 1, True, {}),
        ]

        if self.n_episodes:
            with self.assertRaises(AssertionError):
                scores = evaluator.run_evaluation_episodes(
                    env,
                    agent,
                    n_steps=self.n_timesteps,
                    n_episodes=self.n_episodes)
        else:
            scores = evaluator.run_evaluation_episodes(
                env,
                agent,
                n_steps=self.n_timesteps,
                n_episodes=self.n_episodes)
            if self.n_timesteps == 2:
                self.assertEqual(len(scores), 1)
                self.assertAlmostEqual(scores[0], 0.3)
                self.assertEqual(agent.stop_episode.call_count, 1)
            elif self.n_timesteps == 5:
                self.assertEqual(len(scores), 1)
                self.assertAlmostEqual(scores[0], 0.6)
                self.assertEqual(agent.stop_episode.call_count, 2)
            else:
                self.assertEqual(len(scores), 2)
                self.assertAlmostEqual(scores[0], 0.6)
                self.assertAlmostEqual(scores[1], 0.5)
                self.assertEqual(agent.stop_episode.call_count, 2)
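
The expected values in test_timesteps follow directly from the mocked rewards: the first episode ends via info['needs_reset'] after three transitions with return 0.1 + 0.2 + 0.3 = 0.6, and the second ends via done=True with return -0.5 + 0 + 1 = 0.5; with n_steps=2 the test instead expects the partial return 0.1 + 0.2 = 0.3. A minimal arithmetic check of those sums (assuming per-episode returns are simple reward sums, as the test asserts):

first_episode_rewards = [0.1, 0.2, 0.3]   # episode ends via info['needs_reset']
second_episode_rewards = [-0.5, 0, 1]     # episode ends via done=True
assert abs(sum(first_episode_rewards) - 0.6) < 1e-9
assert abs(sum(second_episode_rewards) - 0.5) < 1e-9
assert abs(sum(first_episode_rewards[:2]) - 0.3) < 1e-9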
Example #3
    def _test_abc(self,
                  steps=100000,
                  require_success=True,
                  gpu=-1,
                  load_model=False):

        env, _ = self.make_env_and_successful_return(test=False)
        test_env, successful_return = self.make_env_and_successful_return(
            test=True)

        agent = self.make_agent(env, gpu)

        if load_model:
            print('Load agent from', self.agent_dirname)
            agent.load(self.agent_dirname)

        max_episode_len = None if self.episodic else 2

        # Train
        train_agent_with_evaluation(
            agent=agent,
            env=env,
            eval_env=test_env,
            steps=steps,
            outdir=self.tmpdir,
            eval_interval=200,
            eval_n_steps=None,
            eval_n_episodes=5,
            successful_score=successful_return,
            train_max_episode_len=max_episode_len,
        )

        agent.stop_episode()

        # Test
        n_test_runs = 5
        eval_returns = run_evaluation_episodes(
            test_env,
            agent,
            n_steps=None,
            n_episodes=n_test_runs,
            max_episode_len=max_episode_len,
        )
        if require_success:
            n_succeeded = np.sum(np.asarray(eval_returns) >= successful_return)
            self.assertEqual(n_succeeded, n_test_runs)

        # Save
        agent.save(self.agent_dirname)
Example #4
    def test_needs_reset(self):
        agent = mock.Mock()
        env = mock.Mock()
        # First episode: 0 -> 1 -> 2 -> 3 (reset)
        # Second episode: 4 -> 5 -> 6 -> 7 (done)
        env.reset.side_effect = [('state', 0), ('state', 4)]
        env.step.side_effect = [
            (('state', 1), 0, False, {}),
            (('state', 2), 0, False, {}),
            (('state', 3), 0, False, {'needs_reset': True}),
            (('state', 5), -0.5, False, {}),
            (('state', 6), 0, False, {}),
            (('state', 7), 1, True, {}),
        ]
        scores = evaluator.run_evaluation_episodes(
            env, agent, n_steps=None, n_episodes=2)
        self.assertEqual(len(scores), 2)
        self.assertAlmostEqual(scores[0], 0)
        self.assertAlmostEqual(scores[1], 0.5)
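
Both evaluator tests above rely on info['needs_reset'] being treated like a terminal signal in addition to done. A minimal sketch of such an evaluation episode loop, written against the standard (obs, reward, done, info) step contract; the function name and structure below are illustrative, not ChainerRL's actual implementation:

def run_one_eval_episode(env, agent, max_episode_len=None):
    # Accumulate reward until the episode terminates via done,
    # info['needs_reset'], or the optional step limit.
    obs = env.reset()
    episode_return = 0.0
    t = 0
    while True:
        obs, reward, done, info = env.step(agent.act(obs))
        episode_return += reward
        t += 1
        if done or info.get('needs_reset', False) or (
                max_episode_len is not None and t >= max_episode_len):
            agent.stop_episode()
            return episode_return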
Example #5
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--processes', type=int, default=16)
    parser.add_argument('--env', type=str, default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', type=int, default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--outdir', type=str, default='results',
                        help='Directory path to save output files.'
                             ' If it does not exist, it will be created.')
    parser.add_argument('--t-max', type=int, default=5)
    parser.add_argument('--beta', type=float, default=1e-2)
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--steps', type=int, default=8 * 10 ** 7)
    parser.add_argument('--max-frames', type=int,
                        default=30 * 60 * 60,  # 30 minutes at 60 fps
                        help='Maximum number of frames for each episode.')
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--eval-interval', type=int, default=250000)
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--eval-eps', type=int, default=320)
    parser.add_argument('--weight-decay', type=float, default=0.0)
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load-pretrained', action='store_true',
                        default=False)
    parser.add_argument('--load', type=str, default='')
    parser.add_argument('--disadvantage', action='store_true', default=False,
                        help='Set this option to disable the advantage function.')
    parser.add_argument('--logging-level', type=int, default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render', action='store_true', default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor', action='store_true', default=False,
                        help='Monitor env. Videos and additional information'
                             ' are saved as output files.')
    parser.add_argument('--random-agent', action='store_true', default=False, 
                        help='Use with demo to get random results.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    # If you use more than one process, the results will no longer be
    # deterministic even with the same random seed.
    misc.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.processes) + args.seed * args.processes
    assert process_seeds.max() < 2 ** 31

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    n_actions = gym.make(args.env).action_space.n

    model = A3CFF(n_actions)

    # Draw the computational graph and save it in the output directory.
    fake_obs = chainer.Variable(
        np.zeros((4, 84, 84), dtype=np.float32)[None],
        name='observation')
    with chainerrl.recurrent.state_reset(model):
        # The state of the model is reset again after drawing the graph
        chainerrl.misc.draw_computational_graph(
            [model(fake_obs)],
            os.path.join(args.outdir, 'model'))

    opt = rmsprop_async.RMSpropAsync(lr=args.lr, eps=1e-1, alpha=0.99)
    opt.setup(model)
    opt.add_hook(chainer.optimizer.GradientClipping(40))
    if args.weight_decay > 0:
        opt.add_hook(NonbiasWeightDecay(args.weight_decay))

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    agent = a3c.A3C(model, opt,
                    t_max=args.t_max,
                    gamma=0.99,
                    beta=args.beta,
                    phi=phi,
                    disadvantage=args.disadvantage,
                    use_average_reward=True)

    if args.load or args.load_pretrained:
        # --load and --load-pretrained cannot be used together.
        assert not args.load or not args.load_pretrained
        if args.load:
            agent.load(args.load)
        else:
            agent.load(misc.download_model("A3C", args.env,
                                           model_type="final")[0])

    def make_env(process_idx, test):
        # Use different random seeds for train and test envs
        process_seed = process_seeds[process_idx]
        env_seed = 2 ** 31 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test)
        env.seed(int(env_seed))
        if args.monitor:
            env = chainerrl.wrappers.Monitor(
                env, args.outdir,
                mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    if args.demo:
        env = make_env(0, True)
        if not args.random_agent:
            eval_stats = experiments.eval_performance(
                env=env,
                agent=agent,
                n_steps=args.eval_n_steps,
                n_episodes=None)
            print('n_steps: {} mean: {} median: {} stdev: {}'.format(
                args.eval_n_steps, eval_stats['mean'], eval_stats['median'],
                eval_stats['stdev']))
        else:
            agent = random_agent(envname=args.env)
            results = run_evaluation_episodes(
                env=env,
                agent=agent,
                max_episode_len=args.max_frames,
                n_steps=None,
                n_episodes=args.eval_eps)
            
            with open(os.path.join(args.outdir, 'scores.txt'), 'a+') as f:
                print('\n'.join(str(result) for result in results), file=f)

            print('episodes: {} mean: {} min: {} max: {}'.format(
                len(results), sum(results)/len(results), min(results), max(results)))
    else:

        # Linearly decay the learning rate to zero
        def lr_setter(env, agent, value):
            agent.optimizer.lr = value

        lr_decay_hook = experiments.LinearInterpolationHook(
            args.steps, args.lr, 0, lr_setter)

        experiments.train_agent_async(
            agent=agent,
            outdir=args.outdir,
            processes=args.processes,
            make_env=make_env,
            profile=args.profile,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            global_step_hooks=[lr_decay_hook],
            save_best_so_far_agent=True,
        )
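
The lr_decay_hook above decays the learning rate from args.lr down to zero over args.steps global steps before handing the current value to lr_setter. A sketch of that schedule as a plain function, assuming LinearInterpolationHook performs straight linear interpolation between its start and stop values:

def linearly_decayed_lr(step, total_steps, start_lr, final_lr=0.0):
    # Linear interpolation: start_lr at step 0, final_lr at total_steps.
    fraction = min(step / total_steps, 1.0)
    return start_lr + fraction * (final_lr - start_lr)

# For example, halfway through 8 * 10 ** 7 steps with --lr 7e-4,
# the learning rate is 3.5e-4.
assert abs(linearly_decayed_lr(4 * 10 ** 7, 8 * 10 ** 7, 7e-4) - 3.5e-4) < 1e-12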