Example #1
import random

import cv2
import numpy as np
import tensorflow as tf

# ReplayMemory is provided elsewhere in the repository


class BaseAgent:

    # must be implemented by each agent
    def update(self):
        return

    def __init__(self, config, session):
        # build the net
        self.config = config
        self.sess = session
        self.RM = ReplayMemory(config)
        self.step_count = 0
        self.episode = 0
        self.isTesting = False
        self.game_state = np.zeros(
            (1, 84, 84, self.config.buff_size), dtype=np.uint8)
        self.reset_game()
        self.timeout_option = tf.RunOptions(timeout_in_ms=5000)

        # if the new agent needs other action modes define a different dict
        self.action_modes = {
            str(config.epsilon) + "_greedy": self.e_greedy_action}
        self.default_action_mode = next(iter(self.action_modes))
        self.action_mode = self.default_action_mode

        self.representations = []

    def step(self, screen, reward):
        # in training mode: store the previous transition, observe, then act
        if not self.isTesting:
            # add the last transition
            self.RM.add(self.game_state[:, :, :, -1],
                        self.game_action, self.game_reward, False)
            self.observe(screen, reward)
            self.game_action = self.e_greedy_action(self.epsilon())
            if self.step_count > self.config.steps_before_training:
                self.update()
            self.step_count += 1
        else:
            # if the agent is testing
            self.observe(screen, reward)
            self.game_action = self.e_greedy_action(0.01)
        return self.game_action

    # Add the final transition to the RM and reset the internal state for the next
    # episode
    def terminal(self):
        if not self.isTesting:
            self.RM.add(
                self.game_state[:, :, :, -1],
                self.game_action, self.game_reward, True)
        self.reset_game()

    def observe(self, screen, reward):
        self.game_reward = max(-1, min(1, reward))
        screen = cv2.resize(screen, (84, 84))
        screen = cv2.cvtColor(screen, cv2.COLOR_RGB2GRAY)
        self.game_state = np.roll(self.game_state, -1, axis=3)
        self.game_state[0, :, :, -1] = screen

    def e_greedy_action(self, epsilon):
        ops = [self.Q] + self.representations
        res = self.sess.run(ops, feed_dict={self.state_ph: self.game_state})

        self.Q_np = res[0]
        self.representations_np = res[1:]

        action = np.argmax(self.Q_np)
        if np.random.uniform() < epsilon:
            action = random.randint(0, self.config.action_num - 1)
        return action

    def testing(self, t=True):
        self.isTesting = t

    def set_action_mode(self, mode):
        if mode not in self.action_modes:
            raise ValueError(str(mode) + " is not a valid action mode")
        self.action_mode = mode

    def reset_game(self):
        self.game_state.fill(0)
        self.game_action = 0
        self.game_reward = 0
        if not self.isTesting:
            # add initial black screens for next episode
            for i in range(self.config.buff_size - 1):
                self.RM.add(np.zeros((84, 84)), 0, 0, False)

    def epsilon(self):
        if self.step_count < self.config.exploration_steps:
            return self.config.initial_epsilon - \
                ((self.config.initial_epsilon - self.config.final_epsilon) /
                 self.config.exploration_steps) * self.step_count
        else:
            return self.config.final_epsilon
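
The class above only defines the agent side of the interaction. Below is a minimal sketch of the emulator loop that could drive it; `env` and its act/game_over/getScreenRGB/reset_game methods are assumptions modelled on the ALE interface used in the other examples, and `agent` stands in for any concrete subclass of BaseAgent.

# Hypothetical driver loop for the step/terminal/testing interface of BaseAgent.
# Neither `env` nor a concrete agent subclass is part of the original snippet.
def run_episode(agent, env, testing=False):
    agent.testing(testing)
    reward = 0
    screen = env.getScreenRGB()
    while not env.game_over():
        action = agent.step(screen, reward)  # agent stores the transition and acts
        reward = env.act(action)             # emulator applies the chosen action
        screen = env.getScreenRGB()
    agent.terminal()                         # flush the final transition, reset state
    env.reset_game()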
Example #2
 terminal = False
 pseudo_terminal = False
 lives = ale.lives()
 episode_begining_step = global_step
 while not terminal:
     action = e_greedy_action(get_epsilon(), state)
     reward = ale.act(action_map[action])
     clipped_reward = max(-1, min(1, reward))
     R += reward
     pseudo_terminal = False
     if ale.game_over():
         terminal = True
     if lives != ale.lives() or terminal:
         lives = ale.lives()
         pseudo_terminal = True
     RM.add(state[0, :, :, config.buff_size - 1], action, clipped_reward,
            pseudo_terminal)
     update_params()
     state = preprocess(ale.getScreenGrayscale(), state)
     global_step += 1
 ep_duration = time.time() - ep_begin_t
 if logging and ((episode % 100 == 0 and episode != 0) or num_episodes == episode):
     episode_online_summary = tf.Summary(value=[
         tf.Summary.Value(tag="online/epsilon", simple_value=get_epsilon()),
         tf.Summary.Value(tag="online/R", simple_value=R),
         tf.Summary.Value(tag="online/steps_in_episode",
                          simple_value=global_step - episode_begining_step),
         tf.Summary.Value(tag="online/global_step", simple_value=global_step),
         tf.Summary.Value(tag="online/ep_duration_seconds",
                          simple_value=ep_duration)
     ])
     summary_writter.add_summary(episode_online_summary, global_episode)
 # log progress percentage
 if logging and ((episode % 500 == 0 and episode != 0) or num_episodes == episode):
     percent = int(float(episode - initial_episode) / num_episodes * 100)
     print("%i%% -- epsilon:%.2f" % (percent, get_epsilon()))
Example #3
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# ReplayMemory and build_summaries are provided elsewhere in the repository


def train(sess, env, args, actors, critics, noise):
    summary_ops, summary_vars = build_summaries()
    # summary_ops,episode_reward1 = build_summaries()
    init = tf.global_variables_initializer()
    sess.run(init)
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    for a in actors:
        a.update_target()
    for b in critics:
        b.update_target()

    replayMemory = ReplayMemory(int(args['buffer_size']),
                                int(args['random_seed']))

    for ep in range(int(args['max_episodes']) + 1):
        print('starting running')
        print('this is epoch {}'.format(ep))
        s = env.reset()
        episode_reward = np.zeros((env.n, ))

        if ep % 1000 == 0:
            for k in range(env.n):
                file1 = 'results/actor' + str(k) + str(ep) + '.h5'
                # file2 = 'results/actor'+str(k)+'/target'+str(ep)+'.h5'
                file3 = 'results/critic' + str(k) + str(ep) + '.h5'
                # file4 = 'results/critic'+str(k)+'/target'+str(ep)+'.h5'
                actor = actors[k]
                critic = critics[k]
                actor.mainModel.save(file1)
                # actor.targetModel.save(file2)
                critic.mainModel.save(file3)
                # critic.targetModel.save(file4)
        plt.close()
        plt.figure()
        for stp in range(int(args['max_episode_len'])):
            if args['render_env']:
                env.render(s)
                plt.clf()
            a = []  # shape=(n,actor.action_dim)
            for i in range(env.n):
                actor = actors[i]
                a.append(
                    actor.act(np.reshape(s[i], (-1, actor.state_dim)),
                              noise[i]()).reshape(actor.action_dim, ))
            s2, r, done = env.step(
                a)  # a is a list with each element being an array
            replayMemory.add(s, a, r, done, s2)
            s = s2
            action_dims_done = 0
            for i in range(env.n):
                actor = actors[i]
                critic = critics[i]
                if replayMemory.size() > int(args['minibatch_size']):

                    s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                        int(args['minibatch_size']))
                    a = []
                    for j in range(env.n):
                        state_batch_j = np.asarray(
                            [x for x in s_batch[:, j]]
                        )  # batch processing will be much more efficient even though reshaping will have to be done
                        a.append(actors[j].predict_target(state_batch_j))

                    a_temp = np.transpose(np.asarray(a), (1, 0, 2))
                    a_for_critic = np.asarray([x.flatten() for x in a_temp])
                    s2_batch_i = np.asarray([
                        x for x in s2_batch[:, i]
                    ])  # Checked till this point, should be fine.
                    targetQ = critic.predict_target(
                        s2_batch_i, a_for_critic)  # Should  work, probably

                    yi = []
                    for k in range(int(args['minibatch_size'])):
                        if d_batch[:, i][k]:
                            yi.append(r_batch[:, i][k])
                        else:
                            yi.append(r_batch[:, i][k] +
                                      critic.gamma * targetQ[k])
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])
                    critic.train(
                        s_batch_i, np.asarray([x.flatten() for x in a_batch]),
                        np.reshape(yi, (int(args['minibatch_size']), 1)))

                    actions_pred = []
                    for j in range(env.n):
                        state_batch_j = np.asarray([x for x in s2_batch[:, j]])
                        actions_pred.append(
                            actors[j].predict(state_batch_j)
                        )  # Should work till here, roughly, probably

                    a_temp = np.transpose(np.asarray(actions_pred), (1, 0, 2))
                    a_for_critic_pred = np.asarray(
                        [x.flatten() for x in a_temp])
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])
                    grads = critic.action_gradients(
                        s_batch_i,
                        a_for_critic_pred)[:,
                                           action_dims_done:action_dims_done +
                                           actor.action_dim]
                    actor.train(s_batch_i, grads)
                    actor.update_target()
                    critic.update_target()

                action_dims_done = action_dims_done + actor.action_dim
            episode_reward += r
            # print(done)
            if sum(done):
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: episode_reward[0],
                                           summary_vars[1]: episode_reward[2]
                                       })
                writer.add_summary(summary_str, ep)
                writer.flush()
                break
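
train() relies on a build_summaries() helper that is not shown. Below is a minimal TF1-style sketch compatible with how its return values are used (two scalar reward variables fed through feed_dict and a merged summary op); the variable and tag names are assumptions, not the original code.

# Assumed build_summaries() matching the usage in train(); a sketch only.
import tensorflow as tf

def build_summaries():
    # scalar reward trackers fed via feed_dict at the end of each episode
    episode_reward_0 = tf.Variable(0.)
    tf.summary.scalar("Reward_agent_0", episode_reward_0)
    episode_reward_2 = tf.Variable(0.)
    tf.summary.scalar("Reward_agent_2", episode_reward_2)
    summary_vars = [episode_reward_0, episode_reward_2]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars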