import numpy as np


def run_n_steps(self, num_steps, max_env=None):
    """Collect experience for ``num_steps`` environment steps (possibly
    spanning several episodes), aggregate it in ``self.data_agg`` and return
    it together with ``self.runner_position``. ``max_env`` optionally
    overrides the environment's internal step limit."""
    # lazy import so TensorFlow is only loaded when the runner executes
    import tensorflow as tf

    if max_env is not None:
        self.env.__num_steps = max_env
    # clear the data aggregator from previous runs
    self.reset_data()
    state = self.env.reset()
    step = 0
    while step < num_steps:
        done = False
        new_state = self.env.reset()
        episode_rewards = []
        while not done:
            state = new_state
            agent_out = self.agent.act_experience(
                np.expand_dims(state, axis=0), self.return_log_prob)

            # S: store the current state
            self.data_agg["state"].append(state)
            if self.return_feature_state:
                self.data_agg["feature_state"].append(
                    self.agent.get_state())

            # A: convert the agent's action to something the env accepts
            action = agent_out["action"]
            if tf.is_tensor(action):
                action = action.numpy()
            if self.discrete_env:
                action = int(action)
            elif action.shape == ():
                # scalar continuous actions become 1-D arrays
                action = np.expand_dims(action, 0)

            new_state, reward, done, _ = self.env.step(action)
            self.data_agg["action"].append(action)

            # R: store the reward
            self.data_agg["reward"].append(reward)
            episode_rewards.append(reward)

            # S': store the successor state
            self.data_agg["state_new"].append(new_state)

            # not_done flag: 0.0 on terminal transitions, 1.0 otherwise
            self.data_agg["not_done"].append(float(not done))

            # append optional per-step values to the data
            if self.return_log_prob:
                self.data_agg["log_prob"].append(
                    agent_out["log_probability"])
            if self.return_value_estimate:
                self.data_agg["value_estimate"].append(
                    agent_out["value_estimate"])

            step += 1
            if step == num_steps:
                # stop mid-episode once the requested number of steps is reached
                break

        if self.return_monte_carlo:
            # discounted return targets for the (possibly truncated) episode
            self.data_agg["monte_carlo"].extend(
                discount_cumsum(episode_rewards, self.gamma))

    return self.data_agg, self.runner_position
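
# ``run_n_steps`` above and ``run_n_episodes`` below hand reward sequences to
# ``discount_cumsum`` to build Monte Carlo return targets. That helper is
# defined elsewhere in the package and is not part of this excerpt; the sketch
# below is an assumption about what it presumably computes -- the standard
# reverse discounted cumulative sum G_t = r_t + gamma * G_{t+1} -- and not the
# library's actual implementation.
def discount_cumsum(rewards, gamma):
    """Discounted cumulative sums of ``rewards`` with discount factor ``gamma``."""
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns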
def run_n_episodes(self, num_episodes, max_env=None):
    """Collect experience for ``num_episodes`` full episodes, aggregate it in
    ``self.data_agg`` and return it together with ``self.runner_position``.
    ``max_env`` optionally overrides the environment's internal step limit."""
    # lazy import so TensorFlow is only loaded when the runner executes
    import tensorflow as tf

    if max_env is not None:
        self.env.__num_steps = max_env
    state = self.env.reset()
    for e in range(num_episodes):
        done = False
        new_state = self.env.reset()
        while not done:
            state = new_state
            agent_out = self.agent.act_experience(
                np.expand_dims(state, axis=0), self.return_log_prob)

            # S: store the current state
            self.data_agg["state"].append(state)

            # A: convert the agent's action to something the env accepts
            action = agent_out["action"]
            if tf.is_tensor(action):
                action = action.numpy()
            if self.discrete_env:
                action = int(action)

            new_state, reward, done, _ = self.env.step(action)
            self.data_agg["action"].append(action)

            # R: store the reward
            self.data_agg["reward"].append(reward)

            # S': store the successor state
            self.data_agg["state_new"].append(new_state)

            # not_done flag: 0 on terminal transitions, 1 otherwise
            self.data_agg["not_done"].append(int(not done))

            # append optional per-step values to the data
            if self.return_log_prob:
                self.data_agg["log_prob"].append(
                    agent_out["log_probability"])
            if self.return_value_estimate:
                self.data_agg["value_estimate"].append(
                    agent_out["value_estimate"])

    if self.return_monte_carlo:
        # discounted returns over the concatenated rewards of all episodes
        self.data_agg["monte_carlo"] = discount_cumsum(
            self.data_agg["reward"], self.gamma)

    return self.data_agg, self.runner_position
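
# Minimal usage sketch. It assumes a runner-box instance exposing the two
# methods above and that ``data_agg`` maps keys (e.g. "state", "action",
# "reward") to per-step Python lists; the names ``collect_batch`` and
# ``runner_box`` are illustrative, not part of the library.
def collect_batch(runner_box, num_steps):
    """Run ``num_steps`` environment steps and stack the results into arrays."""
    data_agg, runner_position = runner_box.run_n_steps(num_steps)
    batch = {key: np.asarray(values) for key, values in data_agg.items()}
    return batch, runner_position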