def main():
    from experience import Experience
    from visualization import hist_classes, scatter_classes

    class_count_list = []
    agents = [
        'RainbowAgent',
        'SimpleAgent',
        'SecondAgent',
        'ProbabilisticAgent'
    ]
    for agent in agents:
        exp = Experience(agent, load=True)
        labels, _, examples, _ = exp.load()
        class_count, _ = divide_and_count(examples, labels)
        class_count_list.append(class_count)
    scatter_classes(class_count_list, agents)
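# NOTE: hedged sketch, not part of the original file. divide_and_count is
# called in main() above but is not defined or imported in this section; the
# assumption here is that it tallies replay examples per class label and also
# returns the examples grouped by label. The real helper may differ.
def divide_and_count(examples, labels):
    class_count = {}
    class_examples = {}
    for example, label in zip(examples, labels):
        # Count one example for its label and keep the example grouped by label.
        class_count[label] = class_count.get(label, 0) + 1
        class_examples.setdefault(label, []).append(example)
    return class_count, class_examples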
import os
import cPickle

import tensorflow as tf
import tensorflow.contrib.slim as slim

from experience import Experience


class Game(object):

    def __init__(self, env=None, agent=None, logdir=None, should_render=None, should_load=None):
        self.env = env
        self.agent = agent
        self.config = self.agent.config
        self.logdir = logdir
        self.should_render = should_render
        self.experience = Experience(self.config)
        if should_load:
            self.load()
        else:
            self.step = 0
            self.epsilon = 0.3
            self.train_rewards = [0] * 100
            self.current_episode = 0

    def save(self):
        # Persist training progress alongside the replay experience.
        with open(os.path.join(self.logdir, "variables.save"), "wb") as w:
            save_list = [self.step, self.epsilon, self.train_rewards, self.current_episode]
            cPickle.dump(save_list, w, protocol=cPickle.HIGHEST_PROTOCOL)
        self.experience.save(self.logdir)

    def load(self):
        # Restore training progress and the replay experience from logdir.
        with open(os.path.join(self.logdir, "variables.save"), "rb") as f:
            self.step, self.epsilon, self.train_rewards, self.current_episode = cPickle.load(f)
        self.experience.load(self.logdir)

    def runEpisode(self, sess, is_training=True):
        # Reset environment.
        state = self.env.reset()

        # Initialize stats.
        num_steps = 0
        total_reward = 0.
        total_loss = 0.
        old_step = self.step
        self.epsilon = max(self.epsilon * 0.99, 0.1) if is_training else self.config.TEST_EPSILON

        for t in xrange(self.config.NUM_STEPS):
            # Render environment.
            if self.should_render:
                self.env.render()

            # Agent plans according to the current action values.
            queried_state = self.experience.getLastestState()
            action = self.agent.plan(sess, queried_state, self.env.action_space.sample(), self.epsilon)

            # Environment executes the action.
            next_state, reward, done, info = self.env.step(action)
            total_reward += reward

            # Add observation to experience.
            self.experience.add(state, action, next_state, reward, 1 if done else 0)

            # If training, agent updates parameters according to the observation.
            if is_training and self.step > self.config.START_LEARN:
                # Update parameters of the current training network.
                if self.step % self.config.TRAIN_FREQUENCY == 0:
                    (batch_states, batch_actions, batch_next_states,
                     batch_rewards, batch_is_terminal) = self.experience.sample()
                    total_loss += self.agent.observe(
                        sess, batch_states, batch_actions, batch_next_states,
                        batch_rewards, batch_is_terminal)

                # Update parameters of the target network.
                if self.step % self.config.UPDATE_TARGET_FREQUENCY == 0:
                    self.agent.update_target(sess)

            # End of episode.
            if done:
                break

            # Update state.
            state = next_state

            # Increment time.
            self.step += 1

        # Message at the end of the episode.
        num_steps = self.step - old_step + 1
        return num_steps, total_reward, total_loss

    def train(self, num_episodes):
        global_step = slim.get_or_create_global_step()
        sv = tf.train.Supervisor(logdir=self.logdir, save_model_secs=20, global_step=global_step)
        with sv.managed_session("") as sess:
            for i in xrange(num_episodes):
                ep_num_steps, ep_reward, ep_loss = self.runEpisode(sess, is_training=True)
                self.train_rewards[self.current_episode % 100] = ep_reward
                if self.current_episode % 20 == 0:
                    self.save()
                self.current_episode += 1
                print "Episode", self.current_episode, "has finished in", ep_num_steps, "steps"
                print "  Reward: {:10}  Loss: {:.6f}  Epsilon: {:.3f}".format(ep_reward, ep_loss, self.epsilon)
                print "Running average reward for the last 100 episodes:", \
                    sum(self.train_rewards) / min(100, self.current_episode)
            sv.saver.save(sess, self.logdir, global_step=sv.global_step)
            self.save()

    def play(self, num_episodes):
        sv = tf.train.Supervisor(logdir=self.logdir)
        with sv.managed_session("") as sess:
            total_reward = 0.
            self.experience.reset()
            for i in xrange(num_episodes):
                # runEpisode returns (num_steps, reward, loss); the loss is unused when playing.
                ep_num_steps, ep_reward, _ = self.runEpisode(sess, is_training=False)
                total_reward += ep_reward
                print "Episode", i, "has finished in", ep_num_steps, "steps"
                print "  Reward: {:10}".format(ep_reward)
                print "Running average reward:", total_reward / (i + 1)
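# Hedged usage sketch (not part of the original file): one way Game might be
# wired up with an OpenAI Gym environment and one of the project's agents.
# The agents module path, the SimpleAgent constructor arguments, the CartPole
# environment, and the log directory are all assumptions for illustration.
#
# import gym
# from agents import SimpleAgent          # assumed module path
#
# env = gym.make("CartPole-v0")           # assumed environment
# agent = SimpleAgent()                   # assumed constructor
# game = Game(env=env, agent=agent, logdir="/tmp/game_logs",
#             should_render=False, should_load=False)
# game.train(num_episodes=1000)           # checkpoints every 20 episodes
# game.play(num_episodes=10)              # evaluates with config.TEST_EPSILON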