Code example #1
File: dqn_main.py  Project: waderaku/DQN
# Assumes module-level imports of gym and tensorflow as tf, plus the project's
# Network, Agent and Replay_Buffer classes and the constants EPISODE_NUM,
# INIT_EXPLORATION, BATCH_SIZE, REWARD_EVALUATION_SIZE,
# REWARD_SAVE_EVALUATION_SIZE, SAVE_DIRECTORY and SAVE_FILE.
def dqn_argo(param_set: Parameter_Set, max_reward):
    # Create the agent and its target network
    network = Network(action_dim=2)
    target_network = Network(action_dim=2)
    agent = Agent(network=network,
                  target_network=target_network,
                  eps_start=param_set.eps_init,
                  eps_anneal=param_set.eps_anneal,
                  eps_min=param_set.eps_min,
                  lr=param_set.lr,
                  gamma=param_set.gamma)

    # Create the environment
    env = gym.make('CartPole-v0')

    replay_buffer = Replay_Buffer(param_set.cap)

    # Rolling windows of recent episode rewards (for the save criterion and the
    # returned evaluation score)
    save_reward_list = [0] * REWARD_SAVE_EVALUATION_SIZE
    reward_list = [0] * REWARD_EVALUATION_SIZE

    # Collect data (number of episodes to play)
    for i in range(EPISODE_NUM):

        # Get the initial state from the environment
        state = env.reset()
        done = False

        # Initialize the episode reward
        episode_reward = 0

        # Play one game to completion (the environment signals when it is done)
        while not done:

            if i > INIT_EXPLORATION:
                # Choose the action with the ε-greedy policy
                action = agent.get_action(state)
            else:
                action = env.action_space.sample()

            # Step the environment with the action to get next state, reward and done flag
            next_state, reward, done, info = env.step(action)

            # Accumulate the episode reward
            episode_reward += reward

            # Add the transition to the replay buffer
            replay_buffer.add(state, action, next_state, reward, done)

            # The next state becomes the current state (update step)
            state = next_state
        loss = tf.constant(0)

        if i > INIT_EXPLORATION:
            # Train the neural network on a minibatch from the replay buffer
            sample = replay_buffer.sample(BATCH_SIZE)
            if sample:
                loss = agent.update(sample)

            # Periodically copy the online network weights to the target network
            if i % param_set.q_update == 0:
                agent.network_synchronize()

            # Track recent episode rewards for evaluation and for the save criterion
            reward_list[i % REWARD_EVALUATION_SIZE] = episode_reward

            save_reward_list[i % REWARD_SAVE_EVALUATION_SIZE] = episode_reward

            if sum(save_reward_list) / len(save_reward_list) >= max_reward:
                print("最高記録更新!!!")
                agent.save(SAVE_DIRECTORY + SAVE_FILE)
                max_reward = sum(save_reward_list) / len(save_reward_list)
    return sum(reward_list) / len(reward_list), max_reward
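
The function above depends on a Replay_Buffer class that is not shown in this listing. Below is a minimal sketch, assuming only the interface used in dqn_argo (a capacity constructor argument, add(state, action, next_state, reward, done), and a sample(batch_size) that is falsy until a full batch is available); the actual class in waderaku/DQN may differ.

# Minimal replay buffer sketch, matching only the calls made in dqn_argo above.
import random
from collections import deque

class Replay_Buffer:
    def __init__(self, capacity):
        # Oldest transitions are dropped once the capacity is reached
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, next_state, reward, done):
        self.buffer.append((state, action, next_state, reward, done))

    def sample(self, batch_size):
        # Falsy until a full batch is available, matching the `if sample:` check above
        if len(self.buffer) < batch_size:
            return None
        return random.sample(self.buffer, batch_size)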
Code example #2
class DQN:
    """
    """
    def __init__(self, state_shape, n_action, net, model_path='model/dqn'):
        self.state_shape = state_shape
        self.n_action = n_action
        self.lr = 1e-4              # learning rate
        self.gamma = 0.9            # discount factor
        self.sampling_size = 20000  # replay pool capacity
        self.agent = Agent(self.state_shape, self.n_action, self.lr, 0.9, net)
        self.sampling_pool = Sampling_Pool(self.sampling_size)
        self.cum_r = []
        self.model_path = model_path

    def train_agent(self):
        # Sample a minibatch of transitions from the replay pool
        state, reward, done, action, next_state = self.sampling_pool.get_sample(
            self.batch_size)
        # Q-values from the target network and from the online (evaluation) network
        q_target = self.agent.q_target(next_state)
        q = self.agent.q_eval(state)
        q_next = self.agent.q_eval(next_state)

        # Build the training targets: for terminal transitions the target is the
        # reward; otherwise it is reward + gamma * Q_target(s', argmax_a Q_eval(s', a))
        # (Double-DQN-style action selection)
        for i in range(self.batch_size):
            if done[i]:
                q[i, action[i]] = reward[i]
            else:
                max_action = np.argmax(q_next[i, :])
                q[i, action[i]] = reward[i] + self.gamma * q_target[i, max_action]
        self.agent.update(state, q)

    def train(self, episode, batch_size=64, freq=100):
        self.batch_size = batch_size
        tqdm_e = tqdm(range(episode))
        env = game.GameState()

        for i in tqdm_e:
            state = env.reset()
            cum_r = 0
            done = False
            while not done:
                STATUS = "explore"  # default status; overwritten once training starts
                state_newaxis = state[np.newaxis, :]
                action = self.agent.e_greedy_action(state_newaxis)
                # The environment expects a one-hot action vector of length 2
                action_array = np.array([0, 0])
                action_array[action] = 1
                next_state, reward, done = env.step(action_array)
                action_onehot = to_categorical(action, self.n_action)
                ob = (state, reward, done, action_onehot, next_state)
                self.sampling_pool.add_to_buffer(ob)
                state = next_state
                cum_r += reward

                if (self.sampling_pool.get_size() > self.batch_size):
                    self.train_agent()
                    STATUS = "train"
                    if i % freq == 0:
                        # Copy the online network weights to the target network
                        self.agent.transfer_weights()
                        STATUS = "transfer weights"
            self.cum_r.append(cum_r)
            if i > 10000 and i % 10000 == 0:
                self.save_model(f"{i}-eps-.h5")
            tqdm_e.set_description(f"Score: {cum_r} | Status: {STATUS}")
            tqdm_e.refresh()
        self.save_model(f"final-{i}-eps-.h5")

    def save_model(self, save_name):
        path = self.model_path
        if not os.path.exists(path):
            os.makedirs(path)
        self.agent.q_eval_net.save(os.path.join(path, save_name))
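
A hypothetical usage sketch of the class above; build_q_network, the (84, 84, 4) state shape and the hyperparameter values are assumptions, since the project's network builder and game environment are not shown in this listing.

# Hypothetical usage sketch: build_q_network and the state shape are assumptions,
# not part of the original project code.
state_shape = (84, 84, 4)
q_net = build_q_network(state_shape, n_action=2)   # hypothetical Keras model factory
dqn = DQN(state_shape=state_shape, n_action=2, net=q_net, model_path='model/dqn')
dqn.train(episode=20000, batch_size=64, freq=100)  # freq: target-network sync interval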