Example 1
import numpy as np
import tensorflow as tf

gamma = 0.99  # discount factor; referenced but not defined in the original listing (value assumed)


class Agent:
    def __init__(self, state_size, action_size, sample_num):
        # Policy is the policy-network wrapper defined elsewhere in the project.
        sess = tf.Session()
        self.policy = Policy(sess, state_size, action_size, sample_num)
        self.state_batch = []
        self.action_batch = []
        self.reward_list = []
        self.step_list = []      # number of steps in each sampled episode (populated elsewhere in the original code)
        self.weight_batch = []   # discounted return weight for every visited state
        self.sample_num = sample_num
        sess.run(tf.global_variables_initializer())

    def choose_action(self, state):
        # Sample an action from the current policy for the given state.
        return self.policy.choose_action(state)

    def store(self, state, action, reward):
        # Record one transition of the current episode.
        self.state_batch.append(state)
        self.action_batch.append(action)
        self.reward_list.append(reward)

    def train(self):
        state_batch = np.vstack(self.state_batch)
        action_batch = np.vstack(self.action_batch)
        # For every visited state, compute the discounted return from that state
        # to the end of its episode (reward-to-go), used as the weight of the
        # corresponding term in the policy update.
        t = 0
        for i in range(self.sample_num):
            tlast = t + self.step_list[i]
            for _ in range(self.step_list[i]):
                weight = 0.0
                for n in range(t, tlast):
                    weight += self.reward_list[n] * np.power(gamma, (n - t))
                self.weight_batch.append(weight)
                t += 1
        weight_batch = np.vstack(self.weight_batch)
        self.policy.train(state_batch, action_batch, weight_batch)
        # Clear the buffers for the next batch of episodes.
        self.state_batch = []
        self.action_batch = []
        self.reward_list = []
        self.step_list = []
        self.weight_batch = []
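
The nested loops in Agent.train assign each visited state a weight equal to the discounted return accumulated from that state to the end of its episode (the reward-to-go used in REINFORCE-style policy gradients). The following self-contained sketch reproduces that computation for a single short episode so the weighting can be inspected in isolation; the reward values and the 0.99 discount factor are illustrative assumptions, not values from the original program.

import numpy as np

gamma = 0.99                      # assumed discount factor
rewards = [1.0, 0.0, 0.5, 2.0]    # illustrative per-step rewards of one episode

# Reward-to-go: weight[t] = sum over n >= t of gamma**(n - t) * rewards[n],
# which is exactly what the inner loops of Agent.train compute per state.
weights = []
for t in range(len(rewards)):
    weight = 0.0
    for n in range(t, len(rewards)):
        weight += rewards[n] * gamma ** (n - t)
    weights.append(weight)

print(np.round(weights, 3))  # approximately [3.431, 2.455, 2.48, 2.0]
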
Example 2
# Fragment of the main training loop; the enclosing `for episode in ...:` loop and
# the `if` branch that enables rendering for selected episodes are omitted in the
# original listing.
    else:
        render = False

    # Periodically print training statistics.
    if episode % SHOW_INFOS == 1:
        show_infos(episode)

    # Reset the environment for a new episode.
    init_game()

    over = False

    if render:
        show_render()

    while not over:

        # Select an action with the current (epsilon-controlled) policy,
        # apply it to the environment and check for episode termination.
        action = policy.choose_action()
        env.apply_action(action)
        over = is_over()

        # Store the transition in the replay memory and run one training step.
        policy.update_replay_memory(over)
        policy.train(over)

        steps_remaining -= 1  # step budget is handled elsewhere in the original code

        if render:
            show_render()

    # Linear epsilon decay between the configured start and end episodes.
    if policy.END_EPSILON_DECAYING >= episode >= policy.START_EPSILON_DECAYING:
        policy.epsilon -= policy.epsilon_decay_value

policy.test_model()
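
The per-episode update in Example 2 suggests a linear epsilon-decay schedule: policy.epsilon_decay_value is presumably the starting exploration rate divided by the number of decaying episodes. A minimal, self-contained sketch of such a schedule follows; all constants (EPISODES, the decay window, the starting epsilon) and the clamping at zero are illustrative assumptions, not values from the original program.

# Minimal sketch of a linear epsilon-decay schedule (all values assumed).
EPISODES = 2000
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES // 2

epsilon = 1.0  # initial exploration rate
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

for episode in range(EPISODES):
    # ... one episode of interaction would run here ...
    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        epsilon = max(epsilon - epsilon_decay_value, 0.0)  # clamp at zero

print(epsilon)  # ~0 once the decaying window has passed
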