def sample_trajectory(env, policy: BasePolicy, max_path_length, render=False, render_mode=('rgb_array',)):
    # initialize env for the beginning of a new rollout
    ob = env.reset()

    # init vars
    obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
    steps = 0
    while True:
        # render an image of the simulated env
        if render:
            if 'rgb_array' in render_mode:
                if hasattr(env, 'sim'):
                    image_obs.append(
                        env.sim.render(camera_name='track', height=500, width=500)[::-1])
                else:
                    image_obs.append(env.render(mode=render_mode))
            if 'human' in render_mode:
                env.render(mode=render_mode)
                time.sleep(env.model.opt.timestep)

        # use the most recent ob to decide what to do
        obs.append(ob)
        ac = policy.get_action(ob)  # query the policy's get_action function
        # the policy returns a torch tensor; move it to the CPU and take the
        # first (batch) element as a numpy array before stepping the env
        ac = ac.cpu()
        ac = ac[0].detach().numpy()
        acs.append(ac)

        # take that action and record results
        ob, rew, done, _ = env.step(ac)  # _ = info

        # record result of taking that action
        steps += 1
        next_obs.append(ob)
        rewards.append(rew)

        # end the rollout if the env signalled done or max_path_length was reached
        rollout_done = done or steps >= max_path_length
        terminals.append(rollout_done)

        if rollout_done:
            break

    # obs lags one element behind next_obs
    return Path(obs, image_obs, acs, rewards, next_obs, terminals)
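# The rollout functions in this section package their lists via a Path(...)
# helper and return a PathDict. That helper is not shown here; the sketch
# below is a minimal, assumed version (the key names and dtypes are my
# assumptions, not the project's confirmed API), chosen to match how
# paths[i]["observation"] and paths[i]["action"] are indexed later on.
def Path(obs, image_obs, acs, rewards, next_obs, terminals):
    """Bundle one rollout's lists into a dict of numpy arrays (assumed layout)."""
    if image_obs != []:
        image_obs = np.stack(image_obs, axis=0)
    return {
        "observation": np.array(obs, dtype=np.float32),
        "image_obs": np.array(image_obs, dtype=np.uint8),
        "action": np.array(acs, dtype=np.float32),
        "reward": np.array(rewards, dtype=np.float32),
        "next_observation": np.array(next_obs, dtype=np.float32),
        "terminal": np.array(terminals, dtype=np.float32),
    }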
def sample_trajectory(
        env,
        policy: BasePolicy,
        max_path_length: int,
        render: bool = False,
        render_mode=('rgb_array',),
) -> PathDict:
    # initialize env for the beginning of a new rollout
    ob: np.ndarray = env.reset()

    # init vars
    obs: List[np.ndarray] = []
    acs: List[np.ndarray] = []
    rewards: List[float] = []
    next_obs: List[np.ndarray] = []
    terminals: List[bool] = []
    image_obs: List[np.ndarray] = []
    steps = 0
    while True:
        # render an image of the simulated env
        if render:
            if 'rgb_array' in render_mode:
                if hasattr(env, 'sim'):
                    image_obs.append(
                        env.sim.render(camera_name='track', height=500, width=500)[::-1])
                else:
                    image_obs.append(env.render(mode=render_mode))
            if 'human' in render_mode:
                env.render(mode=render_mode)
                time.sleep(env.model.opt.timestep)

        # use the most recent ob to decide what to do
        obs.append(ob)
        ac = policy.get_action(ob)
        # this variant targets a discrete action space, so the sampled action
        # is cast to a plain int before stepping the env
        ac = int(ac)
        acs.append(ac)

        # take that action and record results
        ob, rew, done, _ = env.step(ac)

        # record result of taking that action
        steps += 1
        next_obs.append(ob)
        rewards.append(rew)

        # the rollout ends when the env signals done or max_path_length is reached
        rollout_done = bool(done) or steps >= max_path_length
        terminals.append(rollout_done)

        if rollout_done:
            break

    return Path(obs, image_obs, acs, rewards, next_obs, terminals)
def do_relabel_with_expert(self, expert_policy: BasePolicy, paths):
    print("\nRelabelling collected observations with labels from an expert policy...")

    # relabel the collected observations (from our policy) with labels from an
    # expert policy: query the expert (via get_action) with paths[i]["observation"]
    # and replace paths[i]["action"] with these expert labels
    for i in range(len(paths)):
        obs = paths[i]['observation']
        paths[i]['action'] = expert_policy.get_action(obs)

    return paths
def sample_trajectory(
        env,
        policy: BasePolicy,
        max_path_length: int,
        render: bool = False,
        render_mode=('rgb_array',),
) -> PathDict:
    ob = env.reset()

    obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
    steps = 0
    while True:
        if render:
            if 'rgb_array' in render_mode:
                if hasattr(env, 'sim'):
                    if 'track' in env.env.model.camera_names:
                        image_obs.append(
                            env.sim.render(camera_name='track', height=500, width=500)[::-1])
                    else:
                        image_obs.append(
                            env.sim.render(height=500, width=500)[::-1])
                else:
                    image_obs.append(env.render(mode=render_mode))
            if 'human' in render_mode:
                env.render(mode=render_mode)
                time.sleep(env.model.opt.timestep)

        obs.append(ob)
        ac = policy.get_action(ob)
        ac = ac[0]
        acs.append(ac)

        ob, rew, done, _ = env.step(ac)

        # add the observation after taking a step to next_obs
        next_obs.append(ob)
        rewards.append(rew)
        steps += 1

        # if the episode ended (done, or max_path_length reached), the
        # corresponding terminal value is 1; otherwise it is 0
        if done or steps >= max_path_length:
            terminals.append(1)
            break
        else:
            terminals.append(0)

    return Path(obs, image_obs, acs, rewards, next_obs, terminals)
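# A hedged usage sketch: the sample_trajectory variants above are typically
# driven by a batch collector that keeps sampling rollouts until a minimum
# number of environment steps has been gathered. The sample_trajectories and
# get_pathlength functions below are illustrative assumptions, not code taken
# from this repo.
def get_pathlength(path):
    # number of transitions stored in one rollout
    return len(path["reward"])

def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False):
    timesteps_this_batch = 0
    paths = []
    while timesteps_this_batch < min_timesteps_per_batch:
        path = sample_trajectory(env, policy, max_path_length, render)
        paths.append(path)
        timesteps_this_batch += get_pathlength(path)
    return paths, timesteps_this_batch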
def do_relabel_with_expert(self, expert_policy: BasePolicy,
                           paths: List[PathDict]) -> List[PathDict]:
    print("\nRelabelling collected observations with labels from an expert policy...")

    # relabel the collected observations (from our policy) with labels from an
    # expert policy: query the expert (via get_action) with each observation in
    # a path and replace the stored action with the expert label; the input
    # paths are left untouched and relabeled copies are returned
    relabeled_paths: List[PathDict] = []
    for path in paths:
        relabeled_path = copy.deepcopy(path)
        for t, observation in enumerate(path['observation']):
            relabeled_path['action'][t] = expert_policy.get_action(observation)
        relabeled_paths.append(relabeled_path)

    return relabeled_paths
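# A hedged sketch of where do_relabel_with_expert sits in a DAgger-style loop:
# after the first iteration, trajectories are collected with the learner's own
# policy and then relabeled with expert actions before being added to the
# replay buffer. sample_trajectories is the sketch above; names such as
# self.actor, self.agent and add_to_replay_buffer are assumptions about the
# surrounding trainer, not confirmed API.
def run_dagger_iteration(self, itr, expert_policy, batch_size, max_path_length):
    # collect on-policy data with the current learner policy
    paths, envsteps = sample_trajectories(
        self.env, self.actor, batch_size, max_path_length)
    if itr > 0:
        # DAgger: overwrite the learner's actions with expert labels
        paths = self.do_relabel_with_expert(expert_policy, paths)
    self.agent.add_to_replay_buffer(paths)
    return paths, envsteps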
def collect_loss(
        self,
        actions,
        labels,
        criterion,
        expert_policy: BasePolicy,
):
    print("Start collecting path and expert actions...")
    total_envsteps = 0
    emp_action = []
    done = False
    reward = 0
    loss = 0
    for _ in range(self.params['batch_size']):
        if total_envsteps == 0:
            obs = self.env.reset()
        else:
            action_to_step = map_utils.restore_continuous_action(
                emp_action, self.params['bins'], -5, 5)
            obs, reward, done, info = self.env.step(action_to_step)

        obs_tensor = torch.tensor(obs, device=ptu.device, dtype=torch.float64)
        emp_action = self.gm_net(torch.flatten(obs_tensor).float())
        expert_labels = np.squeeze(expert_policy.get_action(obs))
        discretized_labels = torch.tensor(
            map_utils.discretize_action(expert_labels, 3, self.params['bins'], -5, 5),
            device=ptu.device, dtype=torch.float64)

        # print debug logs
        # print("step's reward is ", reward)
        # print("obs is", obs)
        print("expert_labels is", expert_labels)
        # print("action is", map_utils.restore_continuous_action(emp_action, self.params['bins'], -1, 1))
        # print("discrete labels is", discretized_labels)
        print("emp_action is", map_utils.restore_continuous_action(emp_action, self.params['bins'], -5, 5))
        # print("discrete action is", emp_action)

        loss += criterion(emp_action, discretized_labels)
        if done:
            print("Ending current training iteration")
            break
        else:
            total_envsteps += 1

    return loss
def collect_trajactory(
        self,
        dataset,
        expert_policy: BasePolicy,
):
    print("Start collecting path and expert actions...")
    total_envsteps = 0
    emp_action = None
    done = False
    rewards = []
    reward = 0
    for _ in range(self.params['batch_size']):
        if total_envsteps == 0:
            obs = self.env.reset()
        else:
            action_to_step = map_utils.restore_continuous_action_from_binary(
                emp_action, self.params['bins'], -4, 4)
            obs, reward, done, info = self.env.step(action_to_step)

        obs_tensor = torch.tensor(obs, device=ptu.device, dtype=torch.float64)
        emp_action = self.gm_net(torch.flatten(obs_tensor).float())
        expert_labels = np.squeeze(expert_policy.get_action(obs))
        dataset["expert_action"].append(expert_labels)
        dataset["observation"].append(obs)
        rewards.append(reward)
        # print("expert action", expert_labels)
        # print("emp action", map_utils.restore_continuous_action_from_binary(emp_action, self.params['bins'], -4, 4))
        # print("step's reward is ", reward)
        if done:
            print("Ending current training iteration")
            break
        else:
            total_envsteps += 1

    print("obs", len(dataset["observation"]))
    print("expert_action", len(dataset["expert_action"]))
    return dataset, rewards
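# The two trainers above rely on map_utils helpers that map continuous expert
# actions onto a fixed grid of bins and back. Those helpers are not shown in
# this section; the sketch below is an assumed, simplified illustration of the
# idea (uniform binning over [low, high], bin centers on decode) and the names
# carry a _sketch suffix to make clear they are not the project's map_utils.
import numpy as np

def discretize_action_sketch(action, bins, low, high):
    # clip each action dimension into [low, high] and map it to a bin index
    action = np.clip(np.asarray(action, dtype=np.float64), low, high)
    width = (high - low) / bins
    idx = np.minimum(((action - low) / width).astype(int), bins - 1)
    return idx  # one integer bin index per action dimension

def restore_continuous_action_sketch(bin_idx, bins, low, high):
    # map each bin index back to the center of its bin
    width = (high - low) / bins
    return low + (np.asarray(bin_idx, dtype=np.float64) + 0.5) * width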