Example #1
def parse_arch(arch, n_actions):
    if arch == 'nature':
        return links.Sequence(links.NatureDQNHead(n_input_channels=3),
                              L.Linear(512, n_actions), DiscreteActionValue)

    elif arch == 'doubledqn':

        class SingleSharedBias(chainer.Chain):
            """Single shared bias used in the Double DQN paper.
            You can add this link after a Linear layer with nobias=True to implement a
            Linear layer with a single shared bias parameter.
            See http://arxiv.org/abs/1509.06461.
            """
            def __init__(self):
                super().__init__()
                with self.init_scope():
                    self.bias = chainer.Parameter(0, shape=1)

            def __call__(self, x):
                return x + F.broadcast_to(self.bias, x.shape)

        return links.Sequence(links.NatureDQNHead(n_input_channels=3),
                              L.Linear(512, n_actions, nobias=True),
                              SingleSharedBias(), DiscreteActionValue)

    elif arch == 'nips':
        return links.Sequence(links.NIPSDQNHead(n_input_channels=3),
                              L.Linear(256, n_actions), DiscreteActionValue)

    elif arch == 'dueling':
        return DuelingDQN(n_actions, n_input_channels=3)
    else:
        raise RuntimeError('Not supported architecture: {}'.format(arch))
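A minimal usage sketch for this parse_arch (not part of the original example), assuming the ChainerRL/Chainer imports the snippet itself relies on; the 3-channel 84x84 observation shape follows from n_input_channels=3 and the Nature DQN head:

# Usage sketch under assumed imports; names below match what the example uses.
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L
from chainerrl import links
from chainerrl.action_value import DiscreteActionValue
from chainerrl.q_functions import DuelingDQN

q_func = parse_arch('nature', n_actions=4)
# Forward pass on a dummy batch with one 3-channel 84x84 observation.
q_values = q_func(np.zeros((1, 3, 84, 84), dtype=np.float32))
print(q_values.greedy_actions)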
Example #2
def parse_arch(arch, n_actions, activation):
    if arch == 'nature':
        return links.Sequence(links.NatureDQNHead(activation=activation),
                              L.Linear(512, n_actions), DiscreteActionValue)
    elif arch == 'nips':
        return links.Sequence(links.NIPSDQNHead(activation=activation),
                              L.Linear(256, n_actions), DiscreteActionValue)
    elif arch == 'dueling':
        return DuelingDQN(n_actions)
    else:
        raise RuntimeError('Not supported architecture: {}'.format(arch))
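A one-line usage sketch for the activation-parameterized variant above (same assumed imports as in the sketch after Example #1); the architecture name and action count are illustrative:

# Pass a Chainer activation through to the convolutional head.
q_func = parse_arch('nips', n_actions=6, activation=F.relu)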
Example #3
def parse_arch(arch, n_actions):
    if arch == 'nature':
        return links.Sequence(links.NatureDQNHead(), L.Linear(512, n_actions),
                              DiscreteActionValue)
    elif arch == 'doubledqn':
        return links.Sequence(links.NatureDQNHead(),
                              L.Linear(512, n_actions, nobias=True),
                              SingleSharedBias(), DiscreteActionValue)
    elif arch == 'nips':
        return links.Sequence(links.NIPSDQNHead(), L.Linear(256, n_actions),
                              DiscreteActionValue)
    elif arch == 'dueling':
        return DuelingDQN(n_actions)
    else:
        raise RuntimeError('Not supported architecture: {}'.format(arch))
Example #4
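Note: this example relies on imports and helpers defined at module level in its source script (e.g. argparse, numpy as np, chainer, chainerrl, gym, os, json, DQfD, PrioritizedDemoReplayBuffer, LogScaleReward, log_scale_reward, atari_wrappers), which are not shown here.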
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='HeroNoFrameskip-v4',
                        help='OpenAI Atari domain to perform algorithm on.')
    parser.add_argument('--outdir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--monitor',
                        action='store_true',
                        default=False,
                        help='Monitor env. Videos and additional information'
                        ' are saved as output files.')
    parser.add_argument('--steps',
                        type=int,
                        default=5 * 10**7,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument(
        '--max-frames',
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help='Maximum number of frames for each episode.')
    # replay-start-size is set low so that expert demos are used for the first few steps
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=1000,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--eval-n-steps', type=int, default=125000)
    parser.add_argument('--eval-interval', type=int, default=250000)
    parser.add_argument('--n-best-episodes', type=int, default=200)
    parser.add_argument('--target-update-interval',
                        type=int,
                        default=10**4,
                        help='Frequency (in timesteps) at which ' +
                        'the target network is updated.')
    parser.add_argument('--update-interval',
                        type=int,
                        default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--minibatch-size', type=int, default=32)
    parser.add_argument('--replay-buffer-size', type=int, default=10**6)

    parser.add_argument('--num_step_return', type=int, default=10)
    parser.set_defaults(clip_delta=True)
    parser.add_argument('--no-clip-delta',
                        dest='clip_delta',
                        action='store_false')
    parser.add_argument("--batch-accumulator", type=str, default="sum")

    # DQfD specific parameters
    parser.add_argument('--expert-demo-path',
                        type=str,
                        required=True,
                        help="Path to expert demonstrations saved by \
                        chainerrl.experiments.collect_demonstrations")
    parser.add_argument('--n-pretrain-steps', type=int, default=750000)
    parser.add_argument('--demo-supervised-margin', type=float, default=0.8)
    parser.add_argument('--loss-coeff-l2', type=float, default=1e-5)
    parser.add_argument('--loss-coeff-nstep', type=float, default=1.0)
    parser.add_argument('--loss-coeff-supervised', type=float, default=1.0)
    parser.add_argument('--bonus-priority-agent', type=float, default=0.001)
    parser.add_argument('--bonus-priority-demo', type=float, default=1.0)
    parser.add_argument('--priority-error-max', type=float, default=1.0)
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print('Output files are saved in {}'.format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(atari_wrappers.make_atari(
            args.env, max_frames=args.max_frames),
                                           episode_life=not test,
                                           clip_rewards=False)
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = chainerrl.wrappers.RandomizeAction(env, 0.001)
        else:
            # Apply the log-scale reward transformation to the training env
            env = LogScaleReward(env)
        if args.monitor:
            env = gym.wrappers.Monitor(
                env, args.outdir, mode='evaluation' if test else 'training')
        if args.render:
            env = chainerrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_func = DuelingDQN(n_actions)

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((4, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.outdir, 'model'))

    # Use the same hyperparameters as the Nature paper
    opt = chainer.optimizers.RMSpropGraves(lr=2.5e-4,
                                           alpha=0.95,
                                           momentum=0.0,
                                           eps=1e-2)

    opt.setup(q_func)

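    # Anneal the prioritized-replay importance-sampling exponent (beta) from
    # beta0 to 1 over the expected number of gradient updates.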
    betasteps = args.steps / args.update_interval
    replay_buffer = PrioritizedDemoReplayBuffer(
        args.replay_buffer_size,
        alpha=0.6,
        beta0=0.4,
        betasteps=betasteps,
        error_max=args.priority_error_max,
        num_steps=args.num_step_return)

    # Fill the demo buffer with expert transitions
    n_demo_transitions = 0
    with chainer.datasets.open_pickle_dataset(args.expert_demo_path) as dset:
        for transition in dset:
            (obs, a, r, new_obs, done, info) = transition
            n_demo_transitions += 1
            r = log_scale_reward(r)
            replay_buffer.append(state=obs,
                                 action=a,
                                 reward=r,
                                 next_state=new_obs,
                                 next_action=None,
                                 is_state_terminal=done,
                                 demo=True)
            if ("needs_reset" in info and info["needs_reset"]):
                replay_buffer.stop_current_episode(demo=True)
    print("Demo buffer loaded with %d (1 and n-step) transitions from "
          "%d expert demonstration transitions" %
          (len(replay_buffer), n_demo_transitions))

    explorer = explorers.LinearDecayEpsilonGreedy(
        start_epsilon=1.0,
        end_epsilon=0.01,
        decay_steps=10**6,
        random_action_func=lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor: scale uint8 pixels to float32 in [0, 1]
        return np.asarray(x, dtype=np.float32) / 255

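    # DQfD pre-trains on the demonstration buffer for n_pretrain_steps updates
    # before interacting with the environment, combining 1-step, n-step and
    # large-margin supervised losses (plus L2 regularization), weighted by the
    # loss-coeff-* arguments.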
    agent = DQfD(q_func,
                 opt,
                 replay_buffer,
                 gamma=0.99,
                 explorer=explorer,
                 n_pretrain_steps=args.n_pretrain_steps,
                 demo_supervised_margin=args.demo_supervised_margin,
                 bonus_priority_agent=args.bonus_priority_agent,
                 bonus_priority_demo=args.bonus_priority_demo,
                 loss_coeff_nstep=args.loss_coeff_nstep,
                 loss_coeff_supervised=args.loss_coeff_supervised,
                 loss_coeff_l2=args.loss_coeff_l2,
                 gpu=args.gpu,
                 replay_start_size=args.replay_start_size,
                 target_update_interval=args.target_update_interval,
                 clip_delta=args.clip_delta,
                 update_interval=args.update_interval,
                 batch_accumulator=args.batch_accumulator,
                 phi=phi,
                 minibatch_size=args.minibatch_size)

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_steps=args.eval_n_steps,
                                                  n_episodes=None)
        print('n_episodes: {} mean: {} median: {} stdev {}'.format(
            eval_stats['episodes'], eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        logger = logging.getLogger(__name__)
        evaluator = experiments.Evaluator(agent=agent,
                                          n_steps=args.eval_n_steps,
                                          n_episodes=None,
                                          eval_interval=args.eval_interval,
                                          outdir=args.outdir,
                                          max_episode_len=None,
                                          env=eval_env,
                                          step_offset=0,
                                          save_best_so_far_agent=True,
                                          logger=logger)

        # Evaluate the agent BEFORE training begins
        evaluator.evaluate_and_update_max_score(t=0, episodes=0)

        experiments.train_agent(agent=agent,
                                env=env,
                                steps=args.steps,
                                outdir=args.outdir,
                                max_episode_len=None,
                                step_offset=0,
                                evaluator=evaluator,
                                successful_score=None,
                                step_hooks=[])

        dir_of_best_network = os.path.join(args.outdir, "best")
        agent.load(dir_of_best_network)

        # Evaluate the best network for args.n_best_episodes episodes,
        # each capped at 27000 steps (roughly 30 minutes of play)
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=27000,
            logger=None)
        with open(os.path.join(args.outdir, 'bestscores.json'), 'w') as f:
            # Temporary hack for Python 2/3 compatibility:
            # json.dumps does not support non-string dict keys.
            json_stats = json.dumps(stats)
            print(str(json_stats), file=f)
        print("The results of the best scoring network:")
        for stat in stats:
            print(str(stat) + ":" + str(stats[stat]))
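Note that --expert-demo-path is a required argument: the script expects a pickle dataset written by chainerrl.experiments.collect_demonstrations and will not start without it.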