Example #1
import chainer
import chainer.functions as F
import chainerrl
import gym
import numpy as np
from chainerrl.agents.ddpg import DDPG, DDPGModel

# QFunction and PolicyNetwork are user-defined networks assumed to be defined
# elsewhere in the original project; they are not shown in this excerpt.


def main():

    # Reinforcement learning parameters
    gamma = 0.995
    num_episodes = 100  # total number of episodes

    # DDPGセットアップ
    q_func = QFunction()  # Q-function
    policy = PolicyNetwork()  # policy network
    model = DDPGModel(q_func=q_func, policy=policy)
    optimizer_p = chainer.optimizers.Adam(alpha=1e-4)
    optimizer_q = chainer.optimizers.Adam(alpha=1e-3)
    optimizer_p.setup(model['policy'])
    optimizer_q.setup(model['q_function'])

    explorer = chainerrl.explorers.AdditiveOU(sigma=1.0)  # sigma controls the strength of the added exploration noise
    replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10**6)
    phi = lambda x: x.astype(np.float32, copy=False)

    agent = DDPG(model,
                 optimizer_p,
                 optimizer_q,
                 replay_buffer,
                 gamma=gamma,
                 explorer=explorer,
                 replay_start_size=1000,
                 target_update_method='soft',
                 target_update_interval=1,
                 update_interval=4,
                 soft_update_tau=0.01,
                 n_times_update=1,
                 phi=phi,
                 gpu=-1,
                 minibatch_size=200)

    def reward_filter(r):  # scale the reward down so it falls roughly in the 0-1 range
        return r * 0.01

    outdir = 'result'
    chainerrl.misc.set_random_seed(0)
    env = gym.make('SpaceInvaders-v0')  # create the Space Invaders environment
    env.seed(0)
    chainerrl.misc.env_modifiers.make_reward_filtered(env, reward_filter)
    env = gym.wrappers.Monitor(env, outdir)  # save videos of the episodes

    # Run the episodes and start reinforcement learning
    for episode in range(1, num_episodes + 1):  # repeat for the number of episodes
        done = False
        reward = 0
        n_steps = 0
        total_reward = 0
        obs = env.reset()
        obs = np.asarray(obs.transpose(2, 0, 1), dtype=np.float32)
        while not done:
            action = agent.act_and_train(obs, reward)  # action is a vector of continuous values
            action = F.argmax(action).data  # pick the action whose output value is largest
            obs, reward, done, info = env.step(action)  # execute the action
            total_reward += reward
            n_steps += 1
            obs = np.asarray(obs.transpose(2, 0, 1), dtype=np.float32)
            print('{0:4d}: action {1}, reward {2}, done? {3}, {4}'.format(
                n_steps, action, reward, done, info))
        agent.stop_episode_and_train(obs, reward, done)
        print('Episode {0:4d}: total reward {1}, n_steps {2}, statistics: {3}'.
              format(episode, total_reward, n_steps, agent.get_statistics()))
        if episode % 10 == 0:
            agent.save('agent_DDPG_spaceinvaders_' + str(episode))
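
The main() above is only an excerpt, so no entry-point guard is shown; a minimal sketch of how it would typically be invoked follows (this guard is not part of the original example). An agent saved with agent.save(dirname) can later be restored onto an identically constructed agent with agent.load(dirname).

if __name__ == '__main__':
    main()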
Example #2
import chainer
import numpy as np
from chainer import optimizers
from chainerrl import explorers, replay_buffer
from chainerrl.agents.ddpg import DDPG, DDPGModel
from chainerrl.policies import FCDeterministicPolicy
from chainerrl.q_functions import FCSAQFunction

# DDPGStep is a project-specific DDPG variant assumed to be defined elsewhere;
# it is not part of ChainerRL.


def make_agent_ddpg(args, env):
    obs_size = np.asarray(env.observation_space.shape).prod()
    action_space = env.action_space

    action_size = np.asarray(action_space.shape).prod()
    q_func = FCSAQFunction(
        obs_size, action_size,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)
    pi = FCDeterministicPolicy(
        obs_size, action_size=action_size,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers,
        min_action=action_space.low, max_action=action_space.high,
        bound_action=True)
    if args.gpu > -1:
        q_func.to_gpu(args.gpu)
        pi.to_gpu(args.gpu)
    else:
        q_func.to_cpu()
        pi.to_cpu()
    model = DDPGModel(q_func=q_func, policy=pi)
    opt_a = optimizers.Adam(alpha=args.actor_lr)
    opt_c = optimizers.Adam(alpha=args.critic_lr)
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])
    opt_a.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
    opt_c.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')

    rbuf = replay_buffer.ReplayBuffer(5 * 10 ** 5)

    def phi(obs):
        return obs.astype(np.float32)

    # def random_action():
    #    a = action_space.sample()
    #    if isinstance(a, np.ndarray):
    #        a = a.astype(np.float32)
    #    return a

    ou_sigma = (action_space.high - action_space.low) * 0.2
    explorer = explorers.AdditiveOU(sigma=ou_sigma)
    if args.skip_step == 0:
        agent = DDPG(model, opt_a, opt_c, rbuf, gamma=args.gamma,
                     explorer=explorer, replay_start_size=args.replay_start_size,
                     target_update_method=args.target_update_method,
                     target_update_interval=args.target_update_interval,
                     update_interval=args.update_interval,
                     soft_update_tau=args.soft_update_tau,
                     n_times_update=args.n_update_times,
                     phi=phi, gpu=args.gpu, minibatch_size=args.minibatch_size)
    else:
        agent = DDPGStep(model, opt_a, opt_c, rbuf, gamma=args.gamma,
                         explorer=explorer, replay_start_size=args.replay_start_size,
                         target_update_method=args.target_update_method,
                         target_update_interval=args.target_update_interval,
                         update_interval=args.update_interval,
                         soft_update_tau=args.soft_update_tau,
                         n_times_update=args.n_update_times,
                         phi=phi, gpu=args.gpu, minibatch_size=args.minibatch_size, skip_step=args.skip_step)
        if args.model_dir is not None:
            agent.save(args.model_dir)
    return agent
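
make_agent_ddpg reads its hyperparameters from an args object. A minimal sketch of a caller follows; the environment name and all numeric values are placeholders chosen for illustration, not the original author's settings:

from argparse import Namespace
import gym

# Hypothetical caller; every value below is a placeholder.
env = gym.make('Pendulum-v0')
args = Namespace(
    n_hidden_channels=300, n_hidden_layers=3, gpu=-1,
    actor_lr=1e-4, critic_lr=1e-3, gamma=0.995,
    replay_start_size=5000, target_update_method='soft',
    target_update_interval=1, update_interval=4,
    soft_update_tau=1e-2, n_update_times=1,
    minibatch_size=200, skip_step=0, model_dir=None)
agent = make_agent_ddpg(args, env)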
Example #3
            print("Episode: ", ep)
            print("Rewards: ", episode_rewards_sum)
            print("Max reward so far: ", maximumReturn)
            # Mean reward
            total_reward_mean = np.divide(total_G, ep + 1)
            G_mean.append(total_reward_mean)
            print("Mean Reward", total_reward_mean)
            # Statistics
            print('Statistics Alan:', agent.get_statistics())

        if ep % 10 == 0:
            print(velocity(env))

    if episode_rewards_sum > best_reward:
        best_reward = episode_rewards_sum
        agent.save("DDPG_best_model")
        print('new best', ep)

    # Save the model every 100 episodes.
    if ep % 100 == 0:
        agent.save("DDPG_last_model")

    # Generate a graph of rewards vs. episodes.
    if ep % 50 == 0:
        graph_reward(G, ep, 'DDPGargs')
    agent.stop_episode_and_train(obs, reward, done)

print('Good job Alan')

plt.plot(G, color='cadetblue')
plt.ylabel('Returns')
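
graph_reward and velocity are helper functions defined elsewhere in the original project and are not shown in this excerpt. A minimal sketch of what a graph_reward helper could look like, assuming it only plots the per-episode returns and saves the figure (an assumption, not the original implementation):

import matplotlib.pyplot as plt

def graph_reward(returns, episode, label):
    # Hypothetical helper: plot per-episode returns and save the figure to disk.
    plt.figure()
    plt.plot(returns, color='cadetblue')
    plt.xlabel('Episode')
    plt.ylabel('Returns')
    plt.title('{} (returns up to episode {})'.format(label, episode))
    plt.savefig('rewards_{}_{}.png'.format(label, episode))
    plt.close()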