예제 #1
0
 def create_agent(self, env):
     """Assemble a DQN agent for *env* with a fresh Adam optimizer and buffer."""
     q_function = create_state_q_function_for_env(env)
     optimizer = optimizers.Adam()
     optimizer.setup(q_function)
     buf = replay_buffer.ReplayBuffer(10 ** 5)
     # Fixed 20% random-action exploration drawn from the env's action space.
     eps_greedy = explorers.ConstantEpsilonGreedy(
         0.2, random_action_func=lambda: env.action_space.sample())
     return agents.DQN(
         q_function, optimizer, buf, gamma=0.99, explorer=eps_greedy)
예제 #2
0
    def _test_load_dqn(self, gpu):
        """Download a pretrained Breakout DQN snapshot and load it into a fresh agent."""
        # Nature-DQN convolutional trunk, linear head over Breakout's 4 actions.
        q_func = links.Sequence(
            links.NatureDQNHead(),
            L.Linear(512, 4),
            DiscreteActionValue,
        )

        # RMSprop variant and hyperparameters matching the Nature DQN paper.
        opt = optimizers.RMSpropGraves(
            lr=2.5e-4, alpha=0.95, momentum=0.0, eps=1e-2)
        opt.setup(q_func)

        rbuf = replay_buffer.ReplayBuffer(100)

        # Epsilon anneals 1.0 -> 0.1 over 1M steps; random actions are uniform.
        explorer = explorers.LinearDecayEpsilonGreedy(
            start_epsilon=1.0,
            end_epsilon=0.1,
            decay_steps=10 ** 6,
            random_action_func=lambda: np.random.randint(4))

        agent = agents.DQN(
            q_func, opt, rbuf,
            gpu=gpu,
            gamma=0.99,
            explorer=explorer,
            replay_start_size=50,
            target_update_interval=10 ** 4,
            clip_delta=True,
            update_interval=4,
            batch_accumulator='sum',
            phi=lambda x: x)

        model, exists = download_model(
            "DQN", "BreakoutNoFrameskip-v4",
            model_type=self.pretrained_type)
        agent.load(model)
        # In CI we can require that the snapshot came from the local cache.
        if os.environ.get('CHAINERRL_ASSERT_DOWNLOADED_MODEL_IS_CACHED'):
            assert exists
예제 #3
0
def main():
    """CLI entry point: train (or demo) a DQN agent on the env built by
    ``make_env``, writing logs and snapshots under ``--out_dir``.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add('--out_dir', type=str, default='results',
        help='Directory path to save output files.'
             ' If it does not exist, it will be created.')
    add('--seed', type=int, default=0, help='Random seed [0, 2 ** 31)')
    add('--gpu', type=int, default=0,
        help='GPU to use, set to -1 if no GPU.')
    add('--demo', action='store_true', default=False)
    add('--load', type=str, default=None)
    add('--final-exploration-frames', type=int, default=10 ** 5,
        help='Timesteps after which we stop annealing exploration rate')
    add('--final-epsilon', type=float, default=0.1,
        help='Final value of epsilon during training.')
    add('--eval-epsilon', type=float, default=0.05,
        help='Exploration epsilon used during eval episodes.')
    add('--steps', type=int, default=10 ** 6,
        help='Total number of timesteps to train the agent.')
    # Default cap: 30 minutes of play at 60/4 frames per second.
    add('--max-episode-len', type=int, default=30 * 60 * 60 // 4,
        help='Maximum number of timesteps for each episode.')
    add('--replay-start-size', type=int, default=1000,
        help='Minimum replay buffer size before performing gradient updates.')
    add('--target-update-interval', type=int, default=1 * 10 ** 4,
        help='Frequency (in timesteps) at which the target network is updated.')
    add('--eval-interval', type=int, default=10 ** 5,
        help='Frequency (in timesteps) of evaluation phase.')
    add('--update-interval', type=int, default=4,
        help='Frequency (in timesteps) of network updates.')
    add('--eval-n-runs', type=int, default=10)
    add('--logging-level', type=int, default=20,
        help='Logging level. 10:DEBUG, 20:INFO etc.')
    add('--lr', type=float, default=2.5e-4, help='Learning rate.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu,))

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    experiments.set_log_base_dir(args.out_dir)
    print('Output files are saved in {}'.format(args.out_dir))

    env = make_env(env_seed=args.seed)
    n_actions = env.action_space.n

    # Nature-DQN trunk over 3-channel frames, linear head, Q-value wrapper.
    q_function = links.Sequence(
        links.NatureDQNHead(n_input_channels=3),
        L.Linear(512, n_actions),
        DiscreteActionValue,
    )

    # Use the same hyper parameters as the Nature paper's
    optimizer = optimizers.RMSpropGraves(
        lr=args.lr, alpha=0.95, momentum=0.0, eps=1e-2)
    optimizer.setup(q_function)

    buf = replay_buffer.ReplayBuffer(10 ** 6)

    # Epsilon decays linearly from 1.0 to --final-epsilon.
    eps_greedy = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon,
        args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def preprocess(frame):
        # HWC -> CHW, then scale pixel values into [0, 1] as float32.
        return np.asarray(frame.transpose(2, 0, 1), dtype=np.float32) / 255

    agent = agents.DQN(
        q_function, optimizer, buf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=eps_greedy,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator='sum',
        phi=preprocess,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        # Evaluation only: run the agent and report episode-return statistics.
        eval_stats = experiments.eval_performance(
            env=env, agent=agent, n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.out_dir,
            save_best_so_far_agent=False,
            max_episode_len=args.max_episode_len,
            eval_env=env,
        )
예제 #4
0
    def __init__(self, alg, env, model_path):
        """Build the network for algorithm *alg*, draw its computational
        graph once with a fake observation, then load pretrained weights.

        Args:
            alg (str): Algorithm id; one of "DQN-C", "PPO", "C51", "ACER",
                "A3C", "Rainbow" or "IQN".
            env (str): Gym env id (e.g. "BreakoutNoFrameskip-v4"); used only
                to read the discrete action count and to locate the snapshot
                directory.
            model_path (str): Root directory containing the pretrained
                ``chainer/<alg>/<game>/final`` snapshots.
        """
        self.alg = alg
        seed = 0
        # An env instance is created only to read the number of actions.
        n_actions = gym.make(env).action_space.n
        gpus = [-1]
        gpu = None  # CPU only
        misc.set_random_seed(seed, gpus=gpus)
        # --- Per-algorithm model construction (alg values are mutually
        # exclusive, so at most one branch assigns `model`). ---
        if alg == "DQN-C":
            model = links.Sequence(
                links.NatureDQNHead(),
                L.Linear(512, n_actions),
                DiscreteActionValue)
        if alg == "PPO":
            # Small-scale (1e-2) init on the final policy layer.
            winit_last = chainer.initializers.LeCunNormal(1e-2)
            model = chainer.Sequential(
                L.Convolution2D(None, 32, 8, stride=4),
                F.relu,
                L.Convolution2D(None, 64, 4, stride=2),
                F.relu,
                L.Convolution2D(None, 64, 3, stride=1),
                F.relu,
                L.Linear(None, 512),
                F.relu,
                # Two heads on a shared trunk: softmax policy and scalar value.
                links.Branched(
                    chainer.Sequential(
                        L.Linear(None, n_actions, initialW=winit_last),
                        SoftmaxDistribution,
                    ),
                    L.Linear(None, 1),
                )
            )
        if alg == "C51":
            # Categorical DQN: 51 atoms spanning returns in [-10, 10].
            n_atoms = 51
            v_max = 10
            v_min = -10
            model = links.Sequence(
                links.NatureDQNHead(),
                DistributionalFCStateQFunctionWithDiscreteAction(
                    None, n_actions, n_atoms, v_min, v_max,
                    n_hidden_channels=0, n_hidden_layers=0),
            )
        if alg == "ACER":
            # Shared recurrent trunk with separate policy and Q heads.
            model = agents.acer.ACERSharedModel(
                shared=links.Sequence(
                    links.NIPSDQNHead(),
                    L.LSTM(256, 256)),
                pi=links.Sequence(
                    L.Linear(256, n_actions),
                    SoftmaxDistribution),
                q=links.Sequence(
                    L.Linear(256, n_actions),
                    DiscreteActionValue),
            )
        if alg == "A3C":
            model = A3CFF(n_actions)
        if alg == "Rainbow":
            # Distributional dueling net with factorized noisy layers.
            n_atoms = 51
            v_max = 10
            v_min = -10
            model = DistributionalDuelingDQN(n_actions, n_atoms, v_min, v_max)
            links.to_factorized_noisy(model, sigma_scale=0.5)
        if alg == "IQN":
            model = agents.iqn.ImplicitQuantileQFunction(
                # psi: conv feature extractor flattened to 3136 features.
                psi=chainerrl.links.Sequence(
                    L.Convolution2D(None, 32, 8, stride=4),
                    F.relu,
                    L.Convolution2D(None, 64, 4, stride=2),
                    F.relu,
                    L.Convolution2D(None, 64, 3, stride=1),
                    F.relu,
                    functools.partial(F.reshape, shape=(-1, 3136)),
                ),
                # phi: cosine embedding of the sampled quantile fractions.
                phi=chainerrl.links.Sequence(
                    chainerrl.agents.iqn.CosineBasisLinear(64, 3136),
                    F.relu,
                ),
                # f: maps combined features to per-action quantile values.
                f=chainerrl.links.Sequence(
                    L.Linear(None, 512),
                    F.relu,
                    L.Linear(None, n_actions),
                ),
            )
        # --- Trace the model once so its graph (and lazy shapes) are built. ---
        if alg in ["A3C"]:
            fake_obs = chainer.Variable(
                np.zeros((4, 84, 84), dtype=np.float32)[None],
                name='observation')
            with chainerrl.recurrent.state_reset(model):
                # The state of the model is reset again after drawing the graph
                variables = misc.collect_variables([model(fake_obs)])
                chainer.computational_graph.build_computational_graph(variables)
        elif alg in ["Rainbow", "DQN-C", "C51", "ACER", "PPO"]:
            variables = misc.collect_variables([model(np.zeros((4, 84, 84), dtype=np.float32)[None])])
            chainer.computational_graph.build_computational_graph(variables)
        else:
            # IQN path: the model is called twice, with observations and taus.
            fake_obs = np.zeros((4, 84, 84), dtype=np.float32)[None]
            fake_taus = np.zeros(32, dtype=np.float32)[None]
            variables = misc.collect_variables([model(fake_obs)(fake_taus)])

        def phi(x):
            # Feature extractor: scale uint8 pixels into [0, 1] as float32.
            return np.asarray(x, dtype=np.float32) / 255

        # Optimizer and a size-1 replay buffer are placeholders: the agent is
        # only used for inference after loading pretrained weights.
        opt = optimizers.RMSpropGraves()
        opt.setup(model)
        rbuf = replay_buffer.ReplayBuffer(1)
        # --- Per-algorithm agent construction around the model built above. ---
        if alg == "IQN":
            self.agent = agents.IQN(model, opt, rbuf, gpu=gpu, gamma=0.99, act_deterministically=True, explorer=None,
                                    replay_start_size=1, minibatch_size=1, target_update_interval=None, clip_delta=True,
                                    update_interval=4, phi=phi)
        if alg == "A3C":
            self.agent = a3c.A3C(model, opt, t_max=5, gamma=0.99, phi=phi, act_deterministically=True)
        if alg == "Rainbow":
            self.agent = agents.CategoricalDoubleDQN(model, opt, rbuf, gpu=gpu, gamma=0.99, explorer=None,
                                                     replay_start_size=1, minibatch_size=1, target_update_interval=None,
                                                     clip_delta=True, update_interval=4, phi=phi)
        if alg == "DQN-C":
            self.agent = agents.DQN(model, opt, rbuf, gpu=gpu, gamma=0.99, explorer=None, replay_start_size=1,
                                    minibatch_size=1, target_update_interval=None, clip_delta=True, update_interval=4,
                                    phi=phi)
        if alg == "C51":
            self.agent = agents.CategoricalDQN(
                model, opt, rbuf, gpu=gpu, gamma=0.99,
                explorer=None, replay_start_size=1,
                minibatch_size=1,
                target_update_interval=None,
                clip_delta=True,
                update_interval=4,
                phi=phi,
            )
        if alg == "ACER":
            self.agent = agents.acer.ACER(model, opt, t_max=5, gamma=0.99,
                                          replay_buffer=rbuf,
                                          n_times_replay=4,
                                          replay_start_size=1,
                                          act_deterministically=True,
                                          phi=phi
                                          )
        if alg == "PPO":
            self.agent = agents.PPO(model, opt, gpu=gpu, phi=phi, update_interval=4, minibatch_size=1, clip_eps=0.1,
                                    recurrent=False, act_deterministically=True)
        # Snapshot layout: <model_path>/chainer/<alg>/<game-without-suffix>/final
        # NOTE(review): if alg is not one of the ids handled above, no agent was
        # built and this line raises AttributeError.
        self.agent.load(os.path.join(model_path, 'chainer', alg, env.replace("NoFrameskip-v4", ""), 'final'))
예제 #5
0
def main():
    """CLI entry point: train (or demo) a DQN agent on a MarLo Minecraft
    env, writing logs and snapshots under ``--out_dir``.

    NOTE(review): ``--env``, ``--eval-epsilon`` and ``--arch`` are parsed
    but never used below — the env is hard-coded to "MarLo-FindTheGoal-v0"
    and the architecture to the NatureDQNHead.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env',
                        type=str,
                        default='BreakoutNoFrameskip-v4',
                        help='OpenAI Atari domain to perform algorithm on.')
    parser.add_argument('--out_dir',
                        type=str,
                        default='results',
                        help='Directory path to save output files.'
                        ' If it does not exist, it will be created.')
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help='Random seed [0, 2 ** 31)')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='GPU to use, set to -1 if no GPU.')
    parser.add_argument('--demo', action='store_true', default=False)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-exploration-frames',
                        type=int,
                        default=10**5,
                        help='Timesteps after which we stop ' +
                        'annealing exploration rate')
    parser.add_argument('--final-epsilon',
                        type=float,
                        default=0.1,
                        help='Final value of epsilon during training.')
    parser.add_argument('--eval-epsilon',
                        type=float,
                        default=0.05,
                        help='Exploration epsilon used during eval episodes.')
    parser.add_argument('--arch',
                        type=str,
                        default='doubledqn',
                        choices=['nature', 'nips', 'dueling', 'doubledqn'],
                        help='Network architecture to use.')
    parser.add_argument('--steps',
                        type=int,
                        default=10**6,
                        help='Total number of timesteps to train the agent.')
    parser.add_argument(
        '--max-episode-len',
        type=int,
        default=30 * 60 * 60 // 4,  # 30 minutes with 60/4 fps
        help='Maximum number of timesteps for each episode.')
    parser.add_argument('--replay-start-size',
                        type=int,
                        default=1000,
                        help='Minimum replay buffer size before ' +
                        'performing gradient updates.')
    parser.add_argument('--target-update-interval',
                        type=int,
                        default=1 * 10**4,
                        help='Frequency (in timesteps) at which ' +
                        'the target network is updated.')
    parser.add_argument('--eval-interval',
                        type=int,
                        default=10**5,
                        help='Frequency (in timesteps) of evaluation phase.')
    parser.add_argument('--update-interval',
                        type=int,
                        default=4,
                        help='Frequency (in timesteps) of network updates.')
    parser.add_argument('--eval-n-runs', type=int, default=100)
    parser.add_argument('--logging-level',
                        type=int,
                        default=20,
                        help='Logging level. 10:DEBUG, 20:INFO etc.')
    parser.add_argument('--render',
                        action='store_true',
                        default=False,
                        help='Render env states in a GUI window.')
    parser.add_argument('--lr',
                        type=float,
                        default=2.5e-4,
                        help='Learning rate.')
    args = parser.parse_args()

    import logging
    logging.basicConfig(level=args.logging_level)

    # Set a random seed used in ChainerRL.
    misc.set_random_seed(args.seed, gpus=(args.gpu, ))

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    experiments.set_log_base_dir(args.out_dir)
    print('Output files are saved in {}'.format(args.out_dir))

    def make_env(render=False, env_seed=0):
        """Join a MarLo FindTheGoal match, warm it up, and return the env."""
        join_tokens = marlo.make("MarLo-FindTheGoal-v0",
                                 params=dict(
                                     allowContinuousMovement=["move", "turn"],
                                     videoResolution=[84, 84],
                                     kill_clients_after_num_rounds=500))
        env = marlo.init(join_tokens[0])

        # Warm up the env with a reset and one random step before use.
        obs = env.reset()
        if render:
            env.render(mode="rgb_array")
        action = env.action_space.sample()
        obs, r, done, info = env.step(action)
        # NOTE(review): the seed is set after reset/step, so the first
        # (discarded) episode above is not seeded — confirm this is intended.
        env.seed(int(env_seed))
        return env

    env = make_env(render=args.render, env_seed=args.seed)

    n_actions = env.action_space.n
    # Nature-DQN trunk over 3-channel frames, linear head, Q-value wrapper.
    q_func = links.Sequence(links.NatureDQNHead(n_input_channels=3),
                            L.Linear(512, n_actions), DiscreteActionValue)

    # Draw the computational graph and save it in the output directory.
    chainerrl.misc.draw_computational_graph(
        [q_func(np.zeros((3, 84, 84), dtype=np.float32)[None])],
        os.path.join(args.out_dir, 'model'))

    # Use the same hyper parameters as the Nature paper's
    opt = optimizers.RMSpropGraves(lr=args.lr,
                                   alpha=0.95,
                                   momentum=0.0,
                                   eps=1e-2)

    opt.setup(q_func)

    rbuf = replay_buffer.ReplayBuffer(10**6)

    # Epsilon decays linearly from 1.0 to --final-epsilon over
    # --final-exploration-frames steps.
    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0, args.final_epsilon, args.final_exploration_frames,
        lambda: np.random.randint(n_actions))

    def phi(x):
        # Feature extractor: HWC -> CHW, then scale pixels into [0, 1].
        x = x.transpose(2, 0, 1)
        return np.asarray(x, dtype=np.float32) / 255

    agent = agents.DQN(q_func,
                       opt,
                       rbuf,
                       gpu=args.gpu,
                       gamma=0.99,
                       explorer=explorer,
                       replay_start_size=args.replay_start_size,
                       target_update_interval=args.target_update_interval,
                       update_interval=args.update_interval,
                       batch_accumulator='sum',
                       phi=phi)

    if args.load:
        agent.load(args.load)

    if args.demo:
        # Evaluation only: run the agent and report episode-return statistics.
        eval_stats = experiments.eval_performance(env=env,
                                                  agent=agent,
                                                  n_runs=args.eval_n_runs)
        print('n_runs: {} mean: {} median: {} stdev {}'.format(
            args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
            eval_stats['stdev']))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_runs=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.out_dir,
            save_best_so_far_agent=False,
            max_episode_len=args.max_episode_len,
            eval_env=env,
        )