import chainer
import chainer.functions as F
import chainerrl
import gym
import numpy as np
from chainerrl.agents.ddpg import DDPG, DDPGModel


def main():
    # Reinforcement learning parameters
    gamma = 0.995
    num_episodes = 100  # total number of episodes

    # DDPG setup
    q_func = QFunction()      # Q-function (critic)
    policy = PolicyNetwork()  # policy network (actor)
    model = DDPGModel(q_func=q_func, policy=policy)
    optimizer_p = chainer.optimizers.Adam(alpha=1e-4)
    optimizer_q = chainer.optimizers.Adam(alpha=1e-3)
    optimizer_p.setup(model['policy'])
    optimizer_q.setup(model['q_function'])
    explorer = chainerrl.explorers.AdditiveOU(sigma=1.0)  # sigma sets the strength of the exploration noise
    replay_buffer = chainerrl.replay_buffer.ReplayBuffer(capacity=10 ** 6)
    phi = lambda x: x.astype(np.float32, copy=False)
    agent = DDPG(model, optimizer_p, optimizer_q, replay_buffer, gamma=gamma,
                 explorer=explorer, replay_start_size=1000,
                 target_update_method='soft', target_update_interval=1,
                 update_interval=4, soft_update_tau=0.01, n_times_update=1,
                 phi=phi, gpu=-1, minibatch_size=200)

    def reward_filter(r):
        # Scale rewards down so they fall roughly into the 0-1 range
        return r * 0.01

    outdir = 'result'
    chainerrl.misc.set_random_seed(0)
    env = gym.make('SpaceInvaders-v0')  # create the Space Invaders environment
    env.seed(0)
    chainerrl.misc.env_modifiers.make_reward_filtered(env, reward_filter)
    env = gym.wrappers.Monitor(env, outdir)  # record videos of each episode

    # Run the episodes and train the agent
    for episode in range(1, num_episodes + 1):
        done = False
        reward = 0
        n_steps = 0
        total_reward = 0
        obs = env.reset()
        obs = np.asarray(obs.transpose(2, 0, 1), dtype=np.float32)  # HWC -> CHW
        while not done:
            action = agent.act_and_train(obs, reward)  # DDPG outputs a continuous action vector
            action = F.argmax(action).data  # pick the discrete action with the largest output
            obs, reward, done, info = env.step(action)  # execute the action
            total_reward += reward
            n_steps += 1
            obs = np.asarray(obs.transpose(2, 0, 1), dtype=np.float32)
            print('{0:4d}: action {1}, reward {2}, done? {3}, {4}'.format(
                n_steps, action, reward, done, info))
        agent.stop_episode_and_train(obs, reward, done)
        print('Episode {0:4d}: total reward {1}, n_steps {2}, statistics: {3}'.format(
            episode, total_reward, n_steps, agent.get_statistics()))
        if episode % 10 == 0:
            agent.save('agent_DDPG_spaceinvaders_' + str(episode))
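Note that main() references QFunction and PolicyNetwork classes that are defined elsewhere and not shown in this section. Purely as an illustration (the layer sizes, and the assumption that SpaceInvaders-v0 exposes 6 discrete actions, are mine rather than the author's), a Chainer critic/actor pair compatible with ChainerRL's DDPGModel for the transposed (3, 210, 160) image observations could look like this:

import chainer
import chainer.functions as F
import chainer.links as L
from chainerrl.distribution import ContinuousDeterministicDistribution

N_ACTIONS = 6  # assumption: SpaceInvaders-v0 has 6 discrete actions


class QFunction(chainer.Chain):
    """Critic: maps a (state, action) pair to a scalar Q-value."""

    def __init__(self, n_actions=N_ACTIONS):
        super(QFunction, self).__init__()
        with self.init_scope():
            self.conv1 = L.Convolution2D(3, 16, ksize=8, stride=4)
            self.conv2 = L.Convolution2D(16, 32, ksize=4, stride=2)
            self.fc1 = L.Linear(None, 256)
            self.out = L.Linear(256 + n_actions, 1)

    def __call__(self, state, action):
        h = F.relu(self.conv1(state))
        h = F.relu(self.conv2(h))
        h = F.relu(self.fc1(h))
        # A DDPG critic conditions on the action: concatenate it with
        # the state features before the final layer
        return self.out(F.concat((h, action), axis=1))


class PolicyNetwork(chainer.Chain):
    """Actor: maps a state to a continuous action vector in [-1, 1]."""

    def __init__(self, n_actions=N_ACTIONS):
        super(PolicyNetwork, self).__init__()
        with self.init_scope():
            self.conv1 = L.Convolution2D(3, 16, ksize=8, stride=4)
            self.conv2 = L.Convolution2D(16, 32, ksize=4, stride=2)
            self.fc1 = L.Linear(None, 256)
            self.out = L.Linear(256, n_actions)

    def __call__(self, state):
        h = F.relu(self.conv1(state))
        h = F.relu(self.conv2(h))
        h = F.relu(self.fc1(h))
        # ChainerRL's DDPG expects the policy to return a distribution,
        # so wrap the deterministic output accordingly
        return ContinuousDeterministicDistribution(F.tanh(self.out(h)))

The argmax in the training loop then converts this continuous output into a single button press, which is the trick this code uses to run DDPG, a continuous-control algorithm, on a discrete-action Atari game.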
import chainer
import numpy as np
from chainer import optimizers
from chainerrl import explorers, replay_buffer
from chainerrl.agents.ddpg import DDPG, DDPGModel
from chainerrl.policies import FCDeterministicPolicy
from chainerrl.q_functions import FCSAQFunction


def make_agent_ddpg(args, env):
    obs_size = np.asarray(env.observation_space.shape).prod()
    action_space = env.action_space
    action_size = np.asarray(action_space.shape).prod()
    q_func = FCSAQFunction(
        obs_size, action_size,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers)
    pi = FCDeterministicPolicy(
        obs_size, action_size=action_size,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers,
        min_action=action_space.low, max_action=action_space.high,
        bound_action=True)
    if args.gpu > -1:
        q_func.to_gpu(args.gpu)
        pi.to_gpu(args.gpu)
    else:
        q_func.to_cpu()
        pi.to_cpu()
    model = DDPGModel(q_func=q_func, policy=pi)
    opt_a = optimizers.Adam(alpha=args.actor_lr)   # actor optimizer
    opt_c = optimizers.Adam(alpha=args.critic_lr)  # critic optimizer
    opt_a.setup(model['policy'])
    opt_c.setup(model['q_function'])
    opt_a.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_a')
    opt_c.add_hook(chainer.optimizer.GradientClipping(1.0), 'hook_c')
    rbuf = replay_buffer.ReplayBuffer(5 * 10 ** 5)

    def phi(obs):
        return obs.astype(np.float32)

    # def random_action():
    #     a = action_space.sample()
    #     if isinstance(a, np.ndarray):
    #         a = a.astype(np.float32)
    #     return a

    # Ornstein-Uhlenbeck exploration noise, scaled to 20% of each
    # action dimension's range
    ou_sigma = (action_space.high - action_space.low) * 0.2
    explorer = explorers.AdditiveOU(sigma=ou_sigma)
    if args.skip_step == 0:
        agent = DDPG(model, opt_a, opt_c, rbuf, gamma=args.gamma,
                     explorer=explorer,
                     replay_start_size=args.replay_start_size,
                     target_update_method=args.target_update_method,
                     target_update_interval=args.target_update_interval,
                     update_interval=args.update_interval,
                     soft_update_tau=args.soft_update_tau,
                     n_times_update=args.n_update_times,
                     phi=phi, gpu=args.gpu,
                     minibatch_size=args.minibatch_size)
    else:
        # DDPGStep is a custom DDPG variant (defined elsewhere) that
        # additionally takes a skip_step argument
        agent = DDPGStep(model, opt_a, opt_c, rbuf, gamma=args.gamma,
                         explorer=explorer,
                         replay_start_size=args.replay_start_size,
                         target_update_method=args.target_update_method,
                         target_update_interval=args.target_update_interval,
                         update_interval=args.update_interval,
                         soft_update_tau=args.soft_update_tau,
                         n_times_update=args.n_update_times,
                         phi=phi, gpu=args.gpu,
                         minibatch_size=args.minibatch_size,
                         skip_step=args.skip_step)
    if args.model_dir is not None:
        agent.load(args.model_dir)  # resume from a previously saved model
    return agent
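Every hyperparameter in make_agent_ddpg arrives through the args namespace. The section does not show the author's parser or default values, so the defaults below are placeholders; this sketch only enumerates the fields the function actually reads:

import argparse


def parse_ddpg_args():
    # Placeholder defaults for illustration; the author's actual
    # values are not shown in this section.
    p = argparse.ArgumentParser()
    p.add_argument('--gpu', type=int, default=-1)
    p.add_argument('--n-hidden-channels', type=int, default=300)
    p.add_argument('--n-hidden-layers', type=int, default=3)
    p.add_argument('--actor-lr', type=float, default=1e-4)
    p.add_argument('--critic-lr', type=float, default=1e-3)
    p.add_argument('--gamma', type=float, default=0.995)
    p.add_argument('--replay-start-size', type=int, default=5000)
    p.add_argument('--target-update-method', type=str, default='soft')
    p.add_argument('--target-update-interval', type=int, default=1)
    p.add_argument('--update-interval', type=int, default=4)
    p.add_argument('--soft-update-tau', type=float, default=0.01)
    p.add_argument('--n-update-times', type=int, default=1)
    p.add_argument('--minibatch-size', type=int, default=128)
    p.add_argument('--skip-step', type=int, default=0)
    p.add_argument('--model-dir', type=str, default=None)
    return p.parse_args()


# e.g. agent = make_agent_ddpg(parse_ddpg_args(), gym.make('Pendulum-v0'))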
print("Episode: ", ep) print("Rewards: ", episode_rewards_sum) print("Max reward so far: ", maximumReturn) # Mean reward total_reward_mean = np.divide(total_G, ep + 1) G_mean.append(total_reward_mean) print("Mean Reward", total_reward_mean) # Statistics print('Statistics Alan:', agent.get_statistics()) if ep % 10 == 0: print(velocity(env)) if episode_rewards_sum > best_reward: best_reward = episode_rewards_sum agent.save("DDPG_best_model") print('new best', ep) # Save the model every 100 episode. if ep % 100 == 0: agent.save("DDPG_last_model") #generate graph of rewards vs episodes if ep % 50 == 0: graph_reward(G, ep, 'DDPGargs') agent.stop_episode_and_train(obs, reward, done) print('Good job Alan') plt.plot(G, color='cadetblue') plt.ylabel('Returns')