import gym
import tensorflow as tf

# Network, Agent, Replay_Buffer and Parameter_Set are project-local classes;
# EPISODE_NUM, INIT_EXPLORATION, BATCH_SIZE, REWARD_EVALUATION_SIZE,
# REWARD_SAVE_EVALUATION_SIZE, SAVE_DIRECTORY and SAVE_FILE are module-level
# constants defined elsewhere in the project.


def dqn_argo(param_set: Parameter_Set, max_reward):
    # Create the agent (online network plus target network)
    network = Network(action_dim=2)
    target_network = Network(action_dim=2)
    agent = Agent(network=network,
                  target_network=target_network,
                  eps_start=param_set.eps_init,
                  eps_anneal=param_set.eps_anneal,
                  eps_min=param_set.eps_min,
                  lr=param_set.lr,
                  gamma=param_set.gamma)

    # Create the environment and the replay buffer
    env = gym.make('CartPole-v0')
    replay_buffer = Replay_Buffer(param_set.cap)

    # Rolling windows of recent episode rewards
    save_reward_list = [0] * REWARD_SAVE_EVALUATION_SIZE
    reward_list = [0] * REWARD_EVALUATION_SIZE

    # Collect data (number of episodes to play)
    for i in range(EPISODE_NUM):
        # Get the initial state from the environment
        state = env.reset()
        done = False
        # Reset the episode reward
        episode_reward = 0
        # Play one episode to completion (the environment signals termination)
        while not done:
            if i > INIT_EXPLORATION:
                # Choose the action epsilon-greedily
                action = agent.get_action(state)
            else:
                action = env.action_space.sample()
            # Step the environment to obtain s', r and the done flag
            next_state, reward, done, info = env.step(action)
            # Accumulate the episode reward
            episode_reward += reward
            # Add the transition to the replay buffer
            replay_buffer.add(state, action, next_state, reward, done)
            # Advance the state (assign s_{t+1} to s_t)
            state = next_state
            loss = tf.constant(0)
            if i > INIT_EXPLORATION:
                # Train the neural network on one sampled minibatch,
                # sampling once and reusing the same batch for the update
                sample = replay_buffer.sample(BATCH_SIZE)
                if sample:
                    loss = agent.update(sample)

        # Periodically copy the online weights into the target network
        if i % param_set.q_update == 0:
            agent.network_synchronize()

        reward_list[i % REWARD_EVALUATION_SIZE] = episode_reward
        save_reward_list[i % REWARD_SAVE_EVALUATION_SIZE] = episode_reward
        if sum(save_reward_list) / len(save_reward_list) >= max_reward:
            print("New best score!!!")
            agent.save(SAVE_DIRECTORY + SAVE_FILE)
            max_reward = sum(save_reward_list) / len(save_reward_list)

    return sum(reward_list) / len(reward_list), max_reward
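A minimal invocation sketch follows. Parameter_Set is defined by the project; the dataclass stand-in below is hypothetical, its field names simply mirror the attribute accesses above, and the values are purely illustrative.

# Hypothetical stand-in for the project's Parameter_Set, for illustration only.
from dataclasses import dataclass


@dataclass
class Parameter_Set:
    eps_init: float = 1.0      # initial epsilon for epsilon-greedy exploration
    eps_anneal: float = 1e-4   # per-step epsilon decay
    eps_min: float = 0.05      # floor for epsilon
    lr: float = 1e-3           # learning rate
    gamma: float = 0.99        # discount factor
    cap: int = 10_000          # replay buffer capacity
    q_update: int = 10         # episodes between target-network syncs


params = Parameter_Set()
best_so_far = 0.0
avg_reward, best_so_far = dqn_argo(params, best_so_far)
print(f"mean reward over the window: {avg_reward:.1f}, "
      f"best saved mean: {best_so_far:.1f}")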
import os

import numpy as np
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

# Agent, Sampling_Pool and the game module are project-local dependencies.


class DQN:
    """Double-DQN trainer: gathers transitions from the game environment,
    fits the evaluation network on sampled minibatches, and periodically
    transfers its weights to the target network."""

    def __init__(self, state_shape, n_action, net, model_path='model/dqn'):
        self.state_shape = state_shape
        self.n_action = n_action
        self.lr = 1e-4
        self.gamma = 0.9
        self.sampling_size = 20000
        self.agent = Agent(self.state_shape, self.n_action, self.lr,
                           self.gamma, net)
        self.sampling_pool = Sampling_Pool(self.sampling_size)
        self.cum_r = []
        self.model_path = model_path

    def train_agent(self):
        state, reward, done, action, next_state = self.sampling_pool.get_sample(
            self.batch_size)
        q_target = self.agent.q_target(next_state)
        q = self.agent.q_eval(state)
        q_next = self.agent.q_eval(next_state)
        for i in range(self.batch_size):
            # Actions are stored one-hot (see train), so recover the index
            a = int(np.argmax(action[i]))
            if done[i]:
                q[i, a] = reward[i]
            else:
                # Double DQN target: the online network selects the action,
                # the target network evaluates it
                max_action = np.argmax(q_next[i, :])
                q[i, a] = reward[i] + self.gamma * q_target[i, max_action]
        self.agent.update(state, q)

    def train(self, episode, batch_size=64, freq=100):
        self.batch_size = batch_size
        tqdm_e = tqdm(range(episode))
        env = game.GameState()
        for i in tqdm_e:
            state = env.reset()
            cum_r = 0
            done = False
            STATUS = "explore"
            while not done:
                state_newaxis = state[np.newaxis, :]
                action = self.agent.e_greedy_action(state_newaxis)
                action_array = np.array([0, 0])
                action_array[action] = 1
                next_state, reward, done = env.step(action_array)
                action_onehot = to_categorical(action, self.n_action)
                ob = (state, reward, done, action_onehot, next_state)
                self.sampling_pool.add_to_buffer(ob)
                state = next_state
                cum_r += reward
                if self.sampling_pool.get_size() > self.batch_size:
                    self.train_agent()
                    STATUS = "train"
                if i % freq == 0:
                    self.agent.transfer_weights()
                    STATUS = "transfer weights"
            self.cum_r.append(cum_r)
            if i > 10000 and i % 10000 == 0:
                self.save_model(f"{i}-eps-.h5")
            tqdm_e.set_description(f"Score: {cum_r} | Status: {STATUS}")
            tqdm_e.refresh()
        self.save_model(f"final-{i}-eps-.h5")

    def save_model(self, save_name):
        path = self.model_path
        if not os.path.exists(path):
            os.makedirs(path)
        self.agent.q_eval_net.save(os.path.join(path, save_name))
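For reference, a hypothetical way to drive this class. The state shape and action count below are illustrative, and build_net stands in for whatever network constructor the project's Agent expects; none of these values come from the original code.

# Hypothetical usage sketch: state_shape, n_action and build_net are
# illustrative placeholders, not values from the original project.
dqn = DQN(state_shape=(80, 80, 4), n_action=2, net=build_net,
          model_path='model/dqn')
dqn.train(episode=30000, batch_size=64, freq=100)
print("mean score over the last 100 episodes:",
      sum(dqn.cum_r[-100:]) / 100)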