def run_n_steps(self, n, sess):
    transitions = []
    for _ in range(n):
        # Take a step
        action_probs = self._policy_net_predict(self.state, sess)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = self.env.step(action)
        next_state = atari_helpers.atari_make_next_state(
            self.state, self.sp.process(next_state))

        # Store transition
        transitions.append(Transition(
            state=self.state, action=action, reward=reward,
            next_state=next_state, done=done))

        # Increase local and global counters
        local_t = next(self.local_counter)
        global_t = next(self.global_counter)

        if local_t % 100 == 0:
            tf.logging.info("{}: local step {}, global step {}".format(
                self.name, local_t, global_t))

        if done:
            self.state = atari_helpers.atari_make_initial_state(
                self.sp.process(self.env.reset()))
            break
        else:
            self.state = next_state
    return transitions, local_t, global_t
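# A minimal sketch of the _policy_net_predict helper assumed above: it feeds a
# single state through the worker's local policy network and returns the
# softmax action probabilities. The attribute names (self.policy_net.states,
# self.policy_net.probs) are assumptions, not confirmed by this code.
def _policy_net_predict(self, state, sess):
    feed_dict = {self.policy_net.states: [state]}
    probs = sess.run(self.policy_net.probs, feed_dict)
    return probs[0]  # probabilities for the single input state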
def eval_once(self, sess):
    with sess.as_default(), sess.graph.as_default():
        # Copy params to local model
        global_step, _ = sess.run([tf.contrib.framework.get_global_step(),
                                   self.copy_params_op])

        # Run an episode
        done = False
        state = atari_helpers.atari_make_initial_state(
            self.sp.process(self.env.reset()))
        total_reward = 0.0
        episode_length = 0
        while not done:
            action_probs = self._policy_net_predict(state, sess)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = self.env.step(action)
            next_state = atari_helpers.atari_make_next_state(
                state, self.sp.process(next_state))
            total_reward += reward
            episode_length += 1
            state = next_state

        # Add summaries
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=total_reward, tag="eval/total_reward")
        episode_summary.value.add(simple_value=episode_length, tag="eval/episode_length")
        self.summary_writer.add_summary(episode_summary, global_step)
        self.summary_writer.flush()

        if self.saver is not None:
            self.saver.save(sess, self.checkpoint_path)

        tf.logging.info("Eval results at step {}: total_reward {}, episode_length {}".format(
            global_step, total_reward, episode_length))

        # f_reward is assumed to be a log-file handle opened elsewhere
        # (e.g. when the evaluator is constructed); it is not defined in
        # this function.
        f_reward.write("{} {} {}\n".format(global_step, total_reward, episode_length))

        return total_reward, episode_length
def eval_once(self, sess):
    with sess.as_default(), sess.graph.as_default():
        # Copy params to local model
        global_step, _ = sess.run([tf.contrib.framework.get_global_step(),
                                   self.copy_params_op])

        # Run an episode
        done = False
        state = atari_helpers.atari_make_initial_state(
            self.sp.process(self.env.reset()))
        total_reward = 0.0
        episode_length = 0
        while not done:
            action_probs = self._policy_net_predict(state, sess)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = self.env.step(action)
            next_state = atari_helpers.atari_make_next_state(
                state, self.sp.process(next_state))
            total_reward += reward
            episode_length += 1
            state = next_state

        # Add summaries
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=total_reward, tag="eval/total_reward")
        episode_summary.value.add(simple_value=episode_length, tag="eval/episode_length")
        self.summary_writer.add_summary(episode_summary, global_step)
        self.summary_writer.flush()

        if self.saver is not None:
            self.saver.save(sess, self.checkpoint_path)

        tf.logging.info("Eval results at step {}: total_reward {}, episode_length {}".format(
            global_step, total_reward, episode_length))

        return total_reward, episode_length
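# A sketch of the atari_helpers functions the functions above rely on, under
# the assumption that a "state" is a stack of the 4 most recent preprocessed
# frames: the initial state repeats the first frame, and each next state drops
# the oldest frame and appends the newest. This is a sketch under that
# assumption, not necessarily the exact implementation.
import numpy as np

def atari_make_initial_state(frame):
    # Stack the first processed frame 4 times along the channel axis
    return np.stack([frame] * 4, axis=2)

def atari_make_next_state(state, frame):
    # Shift the window: drop the oldest frame, append the new one
    return np.append(state[:, :, 1:], np.expand_dims(frame, 2), axis=2)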
def run_n_steps(self, n, sess):
    transitions = []
    for _ in range(n):
        # Take a step
        action_probs = self._policy_net_predict(self.state, sess)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = self.env.step(action)
        next_state = atari_helpers.atari_make_next_state(
            self.state, self.sp.process(next_state))
        self.total_reward += reward
        self.episode_length += 1

        # Store transition
        transitions.append(Transition(
            state=self.state, action=action, reward=reward,
            next_state=next_state, done=done))

        # Increase local and global counters
        local_t = next(self.local_counter)
        global_t = next(self.global_counter)

        if local_t % 100 == 0:
            tf.logging.info("{}: local step {}, global step {}".format(
                self.name, local_t, global_t))

        if done:
            self.state = atari_helpers.atari_make_initial_state(
                self.sp.process(self.env.reset()))
            # Log the finished episode. Note: write the integer step counts
            # (local_t, global_t), not the itertools counter objects themselves.
            with open('logs_policy.out', 'a') as f:
                f.write("agent {}, local {}, global {}, total_reward {}, episode_length {}\n".format(
                    self.name, local_t, global_t, self.total_reward, self.episode_length))
            self.total_reward = 0
            self.episode_length = 0
            break
        else:
            self.state = next_state
    return transitions, local_t, global_t
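# A sketch of how the transitions returned by run_n_steps are typically
# consumed in A3C: accumulate discounted n-step returns, bootstrapping from a
# value estimate of the last next_state unless the episode ended there. The
# helper name _value_net_predict and the discount factor are assumptions.
def process_transitions(self, transitions, sess, discount_factor=0.99):
    # Bootstrap from the last seen state unless the episode finished
    reward = 0.0
    if not transitions[-1].done:
        reward = self._value_net_predict(transitions[-1].next_state, sess)

    states, actions, targets = [], [], []
    for transition in reversed(transitions):
        # n-step return: immediate reward plus discounted future return
        reward = transition.reward + discount_factor * reward
        states.append(transition.state)
        actions.append(transition.action)
        targets.append(reward)  # used as the value / advantage target
    return states, actions, targets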
def run_n_steps(self, n, sess):
    transitions = []
    for _ in range(n):
        # Choose an action and how many times to repeat it
        action_probs = self._policy_net_predict(self.state, sess)
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        repetition_probs = self._repetition_net_predict(self.state, sess)
        repetition = np.random.choice(np.arange(len(repetition_probs)), p=repetition_probs)

        rewards_collected = []
        for rep in range(repetition + 1):
            next_state, reward, done, _ = self.env.step(action)
            next_state = atari_helpers.atari_make_next_state(
                self.state, self.sp.process(next_state))
            rewards_collected.append(reward)

            # Increase local and global counters
            local_t = next(self.local_counter)
            global_t = next(self.global_counter)

            if local_t % 100 == 0:
                tf.logging.info("{}: local step {}, global step {}".format(
                    self.name, local_t, global_t))

            if done:
                # Store a single transition for the repeated action, with the
                # mean of the rewards collected so far
                transitions.append(Transition(
                    state=self.state, action=action, repetition=repetition,
                    reward=sum(rewards_collected) / len(rewards_collected),
                    next_state=next_state, done=done))
                self.state = atari_helpers.atari_make_initial_state(
                    self.sp.process(self.env.reset()))
                break
            else:
                if rep == repetition:
                    # Last repetition: store the transition
                    transitions.append(Transition(
                        state=self.state, action=action, repetition=repetition,
                        reward=sum(rewards_collected) / len(rewards_collected),
                        next_state=next_state, done=done))
                self.state = next_state
    return transitions, local_t, global_t
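# The Transition record the functions above append to. The repetition-aware
# variant needs an extra `repetition` field; the plain variants omit it. A
# minimal sketch with collections.namedtuple (the field order is an assumption):
from collections import namedtuple

Transition = namedtuple(
    "Transition",
    ["state", "action", "repetition", "reward", "next_state", "done"])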