def gen_user_traj(max_steps=100, path=None, record=False):
    """Play one episode with user-supplied actions and collect the transitions.

    Arguments:
        max_steps: (int) Upper bound on the number of environment steps.
        path: (str/None) Destination file for the optional video recording.
        record: (bool) When True, capture a video of the episode via
            VideoRecorder (enabled only if `path` is not None).

    Returns:
        (list) One entry per step: [symbolic_obs, action, reward, terminate].
    """
    # Recorder exists only when recording was requested; otherwise stays None.
    recorder = VideoRecorder(env, path, enabled=path is not None) if record else None

    trajectory = []
    # Work in the symbolic observation space throughout.
    state = get_pong_symbolic(env.reset())

    for step_idx in range(max_steps):
        chosen = get_user_action(state)
        raw_next, reward, terminate, _ = env.step(chosen)
        # Store the pre-step symbolic observation with the action taken from it.
        trajectory.append([state, chosen, reward, terminate])
        state = get_pong_symbolic(raw_next)
        env.render()  # Note: rendering increases step time.
        if recorder is not None:
            recorder.capture_frame()
        if terminate:
            print('Total Steps:', step_idx)
            break

    if recorder is not None:
        recorder.close()
    return trajectory
def sample(self, horizon, policy, record_fname=None):
    """Samples a rollout from the agent.

    Arguments:
        horizon: (int) The length of the rollout to generate from the agent.
        policy: (policy) The policy that the agent will use for actions.
        record_fname: (str/None) The name of the file to which a recording of
            the rollout will be saved. If None, the rollout will not be recorded.

    Returns:
        (dict) A dictionary containing data from the rollout. The keys of the
        dictionary are 'obs', 'ac', 'reward_sum', and 'rewards'.
    """
    # Only instantiate a recorder when a filename was supplied.
    recorder = None
    if record_fname is not None:
        recorder = VideoRecorder(self.env, record_fname)

    act_times = []       # per-step wall-clock cost of policy.act
    observations = [self.env.reset()]
    actions = []
    step_rewards = []
    total_reward = 0
    policy.reset()

    for t in range(horizon):
        if recorder is not None:
            recorder.capture_frame()
        tic = time.time()
        actions.append(policy.act(observations[t], t))
        act_times.append(time.time() - tic)
        obs, reward, done, info = self.env.step(actions[t])
        observations.append(obs)
        total_reward += reward
        step_rewards.append(reward)
        if done:
            break

    if recorder is not None:
        # Capture the terminal frame before finalizing the video file.
        recorder.capture_frame()
        recorder.close()

    print("Average action selection time: ", np.mean(act_times))
    print("Rollout length: ", len(actions))

    return {
        "obs": np.array(observations),
        "ac": np.array(actions),
        "reward_sum": total_reward,
        "rewards": np.array(step_rewards),
    }