class Agent:
    """REINFORCE-style policy-gradient agent.

    Buffers (state, action, reward) transitions and, once ``sample_num``
    episodes have been collected, trains the wrapped ``Policy`` using the
    discounted reward-to-go of each step as the per-sample weight.

    NOTE(review): nothing in this class ever appends to ``self.step_list``;
    the caller is presumably expected to record each episode's length there
    before ``train()`` is invoked -- confirm against the calling code.
    """

    def __init__(self, state_size, action_size, sample_num):
        """Build the TF session and policy network and init the buffers.

        Args:
            state_size: dimensionality of the observation vector.
            action_size: number of discrete actions.
            sample_num: number of episodes to accumulate before training.
        """
        sess = tf.Session()
        # Keep a handle on the session so it stays reachable from the agent,
        # not only through the Policy object.
        self.sess = sess
        self.policy = Policy(sess, state_size, action_size, sample_num)
        self.state_batch = []    # one stored state per step
        self.action_batch = []   # one stored action per step
        self.reward_list = []    # one stored reward per step
        self.step_list = []      # per-episode step counts (filled by caller)
        self.weight_batch = []   # discounted returns, built in train()
        self.sample_num = sample_num
        sess.run(tf.global_variables_initializer())

    def choose_action(self, state):
        """Delegate action selection to the underlying policy."""
        return self.policy.choose_action(state)

    def store(self, state, action, reward):
        """Record one (state, action, reward) transition."""
        self.state_batch.append(state)
        self.action_batch.append(action)
        self.reward_list.append(reward)

    def train(self):
        """Train the policy on the buffered episodes, then clear the buffers.

        For episode i (of length ``step_list[i]``) the weight of step t is
        the discounted return G_t = sum_k gamma^k * r_{t+k}.  It is computed
        with an O(T) backward pass (G_t = r_t + gamma * G_{t+1}) instead of
        the original O(T^2) forward double loop.
        """
        state_batch = np.vstack(self.state_batch)
        action_batch = np.vstack(self.action_batch)
        t = 0
        for steps in self.step_list:
            # Backward suffix accumulation of the discounted return.
            returns = [0.0] * steps
            running = 0.0
            for k in range(steps - 1, -1, -1):
                running = self.reward_list[t + k] + gamma * running
                returns[k] = running
            self.weight_batch.extend(returns)
            t += steps
        weight_batch = np.vstack(self.weight_batch)
        self.policy.train(state_batch, action_batch, weight_batch)
        # Reset every buffer for the next batch of episodes.
        self.state_batch = []
        self.action_batch = []
        self.reward_list = []
        self.step_list = []
        self.weight_batch = []
# Fragment of the per-episode training loop: the matching `if` of this
# `else:` (presumably an episode-interval check that enables rendering)
# lies outside the visible chunk -- confirm against the full script.
else:
    render = False
# Periodically print progress; `SHOW_INFOS` is the reporting interval.
if episode % SHOW_INFOS == 1:
    show_infos(episode)
init_game()
over = False
if render:
    show_render()
# Run one episode: act, step the environment, store the transition,
# and train the policy online every step until the episode ends.
while not over:
    action = policy.choose_action()
    env.apply_action(action)
    over = is_over()
    policy.update_replay_memory(over)
    policy.train(over)
    # NOTE(review): `steps_remaining` is decremented but never checked
    # here -- presumably enforced elsewhere; verify a step limit exists.
    steps_remaining -= 1
    if render:
        show_render()
# Linear epsilon decay while the episode index is inside the decay window.
if policy.END_EPSILON_DECAYING >= episode >= policy.START_EPSILON_DECAYING:
    policy.epsilon -= policy.epsilon_decay_value
# Evaluate the current model after the episode.
policy.test_model()