import random

import cv2
import numpy as np
import tensorflow as tf

from replay_memory import ReplayMemory  # project-specific replay buffer (assumed import path)


class BaseAgent:
    # must be implemented by each agent
    def update(self):
        return

    def __init__(self, config, session):
        # build the net
        self.config = config
        self.sess = session
        self.RM = ReplayMemory(config)
        self.step_count = 0
        self.episode = 0
        self.isTesting = False
        self.game_state = np.zeros(
            (1, 84, 84, self.config.buff_size), dtype=np.uint8)
        self.reset_game()
        self.timeout_option = tf.RunOptions(timeout_in_ms=5000)
        # if the new agent needs other action modes, define a different dict
        self.action_modes = {
            str(config.epsilon) + "_greedy": self.e_greedy_action}
        self.default_action_mode = next(iter(self.action_modes))
        self.action_mode = self.default_action_mode
        self.representations = []

    def step(self, screen, reward):
        if not self.isTesting:
            # add the last transition
            self.RM.add(self.game_state[:, :, :, -1],
                        self.game_action, self.game_reward, False)
            self.observe(screen, reward)
            self.game_action = self.e_greedy_action(self.epsilon())
            if self.step_count > self.config.steps_before_training:
                self.update()
            self.step_count += 1
        else:
            # the agent is testing: act almost greedily, do not store or learn
            self.observe(screen, reward)
            self.game_action = self.e_greedy_action(0.01)
        return self.game_action

    # Add the final transition to the RM and reset the internal state for the
    # next episode
    def terminal(self):
        if not self.isTesting:
            self.RM.add(self.game_state[:, :, :, -1],
                        self.game_action, self.game_reward, True)
        self.reset_game()

    def observe(self, screen, reward):
        # clip the reward and push the new frame onto the state buffer
        self.game_reward = max(-1, min(1, reward))
        screen = cv2.resize(screen, (84, 84))
        screen = cv2.cvtColor(screen, cv2.COLOR_RGB2GRAY)
        self.game_state = np.roll(self.game_state, -1, axis=3)
        self.game_state[0, :, :, -1] = screen

    def e_greedy_action(self, epsilon):
        # self.Q and self.state_ph are built by the concrete agent subclass
        ops = [self.Q] + self.representations
        res = self.sess.run(
            ops, feed_dict={self.state_ph: self.game_state})
        self.Q_np = res[0]
        self.representations_np = res[1:]
        action = np.argmax(self.Q_np)
        if np.random.uniform() < epsilon:
            action = random.randint(0, self.config.action_num - 1)
        return action

    def testing(self, t=True):
        self.isTesting = t

    def set_action_mode(self, mode):
        if mode not in self.action_modes:
            raise Exception(str(mode) + " is not a valid action mode")
        self.select_action = self.action_modes[mode]
        self.action_mode = mode

    def reset_game(self):
        self.game_state.fill(0)
        self.game_action = 0
        self.game_reward = 0
        if not self.isTesting:
            # add initial black screens for the next episode
            for _ in range(self.config.buff_size - 1):
                self.RM.add(np.zeros((84, 84)), 0, 0, False)

    def epsilon(self):
        # linear anneal from initial_epsilon to final_epsilon over exploration_steps
        if self.step_count < self.config.exploration_steps:
            return self.config.initial_epsilon - \
                ((self.config.initial_epsilon - self.config.final_epsilon) /
                 self.config.exploration_steps) * self.step_count
        else:
            return self.config.final_epsilon
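# --- Example: the linear epsilon schedule implemented by BaseAgent.epsilon() ---
# A minimal, self-contained sketch of the same annealing formula. The config
# values below are illustrative assumptions, not the project's defaults.
def linear_epsilon(step, initial_epsilon=1.0, final_epsilon=0.1,
                   exploration_steps=1000000):
    """Anneal epsilon linearly, then hold it at final_epsilon."""
    if step < exploration_steps:
        slope = (initial_epsilon - final_epsilon) / exploration_steps
        return initial_epsilon - slope * step
    return final_epsilon


if __name__ == "__main__":
    for step in (0, 250000, 500000, 1000000, 2000000):
        print(step, round(linear_epsilon(step), 3))
    # prints: 0 1.0, 250000 0.775, 500000 0.55, 1000000 0.1, 2000000 0.1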
terminal = False
pseudo_terminal = False
lives = ale.lives()
episode_beginning_step = global_step
while not terminal:
    action = e_greedy_action(get_epsilon(), state)
    reward = ale.act(action_map[action])
    clipped_reward = max(-1, min(1, reward))
    R += reward
    pseudo_terminal = False
    if ale.game_over():
        terminal = True
    # treat a lost life (or a real game over) as an episode boundary for the
    # replay memory
    if lives != ale.lives() or terminal:
        lives = ale.lives()
        pseudo_terminal = True
    RM.add(state[0, :, :, config.buff_size - 1],
           action, clipped_reward, pseudo_terminal)
    update_params()
    state = preprocess(ale.getScreenGrayscale(), state)
    global_step += 1
ep_duration = time.time() - ep_begin_t
if logging and (episode % 100 == 0 and episode != 0 or num_episodes == episode):
    episode_online_summary = tf.Summary(value=[
        tf.Summary.Value(tag="online/epsilon", simple_value=get_epsilon()),
        tf.Summary.Value(tag="online/R", simple_value=R),
        tf.Summary.Value(tag="online/steps_in_episode",
                         simple_value=global_step - episode_beginning_step),
        tf.Summary.Value(tag="online/global_step", simple_value=global_step),
        tf.Summary.Value(tag="online/ep_duration_seconds",
                         simple_value=ep_duration)
    ])
    summary_writter.add_summary(episode_online_summary, global_episode)
# log training progress as a percentage of the requested episodes
if logging and (episode % 500 == 0 and episode != 0 or num_episodes == episode):
    percent = int(float(episode - initial_episode) / num_episodes * 100)
    print("%i%% -- epsilon:%.2f" % (percent, get_epsilon()))
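# --- Hypothetical sketch of the preprocess() helper called above ---
# The real implementation lives elsewhere in the project; this version assumes
# the ALE screen is a grayscale (H, W) or (H, W, 1) array and that `state` has
# shape (1, 84, 84, buff_size) with the newest frame in the last channel.
import cv2
import numpy as np


def preprocess(screen, state):
    frame = cv2.resize(np.squeeze(screen), (84, 84),
                       interpolation=cv2.INTER_AREA)
    state = np.roll(state, -1, axis=3)   # shift out the oldest frame
    state[0, :, :, -1] = frame           # append the newest frame
    return state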
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from replay_memory import ReplayMemory  # project-specific replay buffer (assumed import path)


def train(sess, env, args, actors, critics, noise):
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # initialise the target networks from the main networks
    for actor in actors:
        actor.update_target()
    for critic in critics:
        critic.update_target()

    replayMemory = ReplayMemory(int(args['buffer_size']),
                                int(args['random_seed']))

    for ep in range(int(args['max_episodes']) + 1):
        print('starting episode {}'.format(ep))
        s = env.reset()
        episode_reward = np.zeros((env.n,))

        # periodically checkpoint every agent's actor and critic networks
        if ep % 1000 == 0:
            for k in range(env.n):
                actor_file = 'results/actor' + str(k) + str(ep) + '.h5'
                critic_file = 'results/critic' + str(k) + str(ep) + '.h5'
                actors[k].mainModel.save(actor_file)
                critics[k].mainModel.save(critic_file)

        plt.close()
        plt.figure()

        for stp in range(int(args['max_episode_len'])):
            if args['render_env']:
                env.render(s)
                plt.clf()

            # one exploratory action per agent, shape (n, actor.action_dim)
            a = []
            for i in range(env.n):
                actor = actors[i]
                a.append(actor.act(
                    np.reshape(s[i], (-1, actor.state_dim)),
                    noise[i]()).reshape(actor.action_dim,))

            s2, r, done = env.step(a)  # a is a list with one array per agent
            replayMemory.add(s, a, r, done, s2)
            s = s2

            action_dims_done = 0
            for i in range(env.n):
                actor = actors[i]
                critic = critics[i]
                if replayMemory.size() > int(args['minibatch_size']):
                    s_batch, a_batch, r_batch, d_batch, s2_batch = \
                        replayMemory.miniBatch(int(args['minibatch_size']))

                    # target actions of all agents on the next-state batch
                    # (batch processing is much more efficient even though
                    # reshaping has to be done)
                    a = []
                    for j in range(env.n):
                        state_batch_j = np.asarray(
                            [x for x in s2_batch[:, j]])
                        a.append(actors[j].predict_target(state_batch_j))
                    a_temp = np.transpose(np.asarray(a), (1, 0, 2))
                    a_for_critic = np.asarray([x.flatten() for x in a_temp])
                    s2_batch_i = np.asarray([x for x in s2_batch[:, i]])
                    targetQ = critic.predict_target(s2_batch_i, a_for_critic)

                    # TD targets for agent i
                    yi = []
                    for k in range(int(args['minibatch_size'])):
                        if d_batch[:, i][k]:
                            yi.append(r_batch[:, i][k])
                        else:
                            yi.append(r_batch[:, i][k] +
                                      critic.gamma * targetQ[k])
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])
                    critic.train(
                        s_batch_i,
                        np.asarray([x.flatten() for x in a_batch]),
                        np.reshape(yi, (int(args['minibatch_size']), 1)))

                    # actor update: actions predicted on the current-state
                    # batch, then the centralized critic's gradient with
                    # respect to agent i's slice of the joint action
                    actions_pred = []
                    for j in range(env.n):
                        state_batch_j = np.asarray([x for x in s_batch[:, j]])
                        actions_pred.append(actors[j].predict(state_batch_j))
                    a_temp = np.transpose(np.asarray(actions_pred), (1, 0, 2))
                    a_for_critic_pred = np.asarray(
                        [x.flatten() for x in a_temp])
                    grads = critic.action_gradients(
                        s_batch_i, a_for_critic_pred
                    )[:, action_dims_done:action_dims_done + actor.action_dim]
                    actor.train(s_batch_i, grads)

                    actor.update_target()
                    critic.update_target()
                action_dims_done = action_dims_done + actor.action_dim

            episode_reward += r
            if sum(done):
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: episode_reward[0],
                    summary_vars[1]: episode_reward[2]
                })
                writer.add_summary(summary_str, ep)
                writer.flush()
                break
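# --- Example: the TD target built in the inner loop above, in vectorized form ---
# y_k = r_k                       if the transition is terminal
# y_k = r_k + gamma * targetQ_k   otherwise
# The numbers below are made-up data purely for illustration.
import numpy as np

gamma = 0.95
r_i = np.array([1.0, 0.0, -1.0, 0.5])        # rewards for agent i
d_i = np.array([False, True, False, False])  # done flags for agent i
targetQ = np.array([2.0, 3.0, 1.5, 0.5])     # target-critic values at s2

yi = r_i + gamma * targetQ * (~d_i)
print(yi)  # [2.9, 0.0, 0.425, 0.975]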