# Shared imports assumed by the snippets below. The `ptu` / `utl` module
# paths follow rlkit-style conventions and are assumptions; adjust them to
# this repo's actual layout. Repo-local helpers (load_transitions,
# extract_goal_from_path, make_env, create_rewards_arr, get_heatmap_params,
# sample_random_normal, NUM_EVAL_TASKS, logger, old_add_scalar) are assumed
# to be importable from elsewhere in the codebase.
import os
from collections import OrderedDict

import gym
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Rectangle
from torch.autograd import Variable

import torchkit.pytorch_utils as ptu  # assumed module path
import utils.helpers as utl           # assumed module path


def collect_rollouts_per_task(task_idx, agent, policy_storage, env, num_rollouts):
    for rollout in range(num_rollouts):
        obs = ptu.from_numpy(env.reset(task_idx))
        obs = obs.reshape(-1, obs.shape[-1])
        done_rollout = False

        while not done_rollout:
            action, _, _, _ = agent.act(obs=obs)  # SAC

            # observe reward and next obs
            next_obs, reward, done, info = utl.env_step(env, action.squeeze(dim=0))
            done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True

            # add data to policy buffer - (s+, a, r, s'+, term)
            term = env.unwrapped.is_goal_state() if "is_goal_state" in dir(env.unwrapped) else False
            rew_to_buffer = ptu.get_numpy(reward.squeeze(dim=0))
            policy_storage.add_sample(
                task=0,  # task_idx
                observation=ptu.get_numpy(obs.squeeze(dim=0)),
                action=ptu.get_numpy(action.squeeze(dim=0)),
                reward=rew_to_buffer,
                terminal=np.array([term], dtype=float),
                next_observation=ptu.get_numpy(next_obs.squeeze(dim=0)))

            # set: obs <- next_obs
            obs = next_obs.clone()
def transform_mdps_ds_to_bamdp_ds(dataset, vae, args):
    '''
    :param dataset: list (one entry per task) of [s, a, r, s', done] arrays,
                    each of shape (traj_len, n_trajs, dim)
    :param vae: trained VAE model
    :return: list (one entry per task) of [augmented_s, a, belief_r, augmented_s', done]
    '''
    bamdp_dataset = []
    for i, task_data in enumerate(dataset):
        obs, actions, rewards, next_obs, terminals = task_data
        augmented_obs, belief_rewards, augmented_next_obs = \
            transform_mdp_to_bamdp_rollouts(vae, args,
                                            ptu.FloatTensor(obs),
                                            ptu.FloatTensor(actions),
                                            ptu.FloatTensor(rewards),
                                            ptu.FloatTensor(next_obs),
                                            ptu.FloatTensor(terminals))
        rewards = belief_rewards if belief_rewards is not None else ptu.FloatTensor(rewards)
        bamdp_dataset.append([
            ptu.get_numpy(augmented_obs),
            actions,
            ptu.get_numpy(rewards),
            ptu.get_numpy(augmented_next_obs),
            terminals
        ])
        print('{} datasets were processed.'.format(i + 1))
    return bamdp_dataset
def load_replaying_dataset(data_dir, args, num_tasks=None):
    dataset = []
    env_dir = args.env_name
    exps_dir = os.path.join(args.main_data_dir, env_dir, data_dir)
    goals = []
    all_dirs = os.listdir(exps_dir)
    policies_per_task = 20

    if num_tasks is None:
        tasks = np.random.permutation(len(all_dirs))
    else:
        tasks = np.random.choice(len(all_dirs), num_tasks)

    for i, task in enumerate(tasks):
        task_dir = os.path.join(exps_dir, all_dirs[task])
        all_policies = os.listdir(task_dir)
        policies_to_load = np.random.choice(len(all_policies), policies_per_task, replace=False)
        goals.append(extract_goal_from_path(all_dirs[task]))

        task_obs, task_actions, task_rewards, task_next_obs, task_terminals = [], [], [], [], []
        for j, policy in enumerate(policies_to_load):
            exp_dir = os.path.join(exps_dir, all_dirs[task], all_policies[policy])
            obs, actions, rewards, next_obs, terminals = load_transitions(exp_dir)

            # reshape flat transitions to (traj_len, n_trajs, dim)
            obs = obs.reshape(-1, args.trajectory_len, obs.shape[-1]).transpose(0, 1)
            actions = actions.reshape(-1, args.trajectory_len, actions.shape[-1]).transpose(0, 1)
            rewards = rewards.reshape(-1, args.trajectory_len, rewards.shape[-1]).transpose(0, 1)
            next_obs = next_obs.reshape(-1, args.trajectory_len, next_obs.shape[-1]).transpose(0, 1)
            terminals = terminals.reshape(-1, args.trajectory_len, terminals.shape[-1]).transpose(0, 1)

            task_obs.append(ptu.get_numpy(obs))
            task_actions.append(ptu.get_numpy(actions))
            task_rewards.append(ptu.get_numpy(rewards))
            task_next_obs.append(ptu.get_numpy(next_obs))
            task_terminals.append(ptu.get_numpy(terminals))

        # stack trajectories from all policies along the trajectory axis
        obs = np.concatenate(task_obs, axis=1)
        actions = np.concatenate(task_actions, axis=1)
        rewards = np.concatenate(task_rewards, axis=1)
        next_obs = np.concatenate(task_next_obs, axis=1)
        terminals = np.concatenate(task_terminals, axis=1)
        dataset.append([obs, actions, rewards, next_obs, terminals])
        print('{} experiments loaded.'.format(i + 1))

    goals = np.vstack(goals)
    return dataset, goals
def evaluate(self, tasks):
    num_episodes = self.args.max_rollouts_per_task
    num_steps_per_episode = self.env.unwrapped._max_episode_steps
    returns_per_episode = np.zeros((len(tasks), num_episodes))
    success_rate = np.zeros(len(tasks))

    if self.args.policy == 'dqn':
        values = np.zeros((len(tasks), self.args.max_trajectory_len))
    else:
        obs_size = self.env.unwrapped.observation_space.shape[0]
        observations = np.zeros((len(tasks), self.args.max_trajectory_len + 1, obs_size))
        log_probs = np.zeros((len(tasks), self.args.max_trajectory_len))

    for task_idx, task in enumerate(tasks):
        obs = ptu.from_numpy(self.env.reset(task))
        obs = obs.reshape(-1, obs.shape[-1])
        step = 0

        if self.args.policy == 'sac':
            observations[task_idx, step, :] = ptu.get_numpy(obs[0, :obs_size])

        for episode_idx in range(num_episodes):
            running_reward = 0.
            for step_idx in range(num_steps_per_episode):
                # add distribution parameters to observation - policy is conditioned on posterior
                if self.args.policy == 'dqn':
                    action, value = self.agent.act(obs=obs, deterministic=True)
                else:
                    action, _, _, log_prob = self.agent.act(
                        obs=obs,
                        deterministic=self.args.eval_deterministic,
                        return_log_prob=True)

                # observe reward and next obs
                next_obs, reward, done, info = utl.env_step(self.env, action.squeeze(dim=0))
                running_reward += reward.item()

                if self.args.policy == 'dqn':
                    values[task_idx, step] = value.item()
                else:
                    observations[task_idx, step + 1, :] = ptu.get_numpy(next_obs[0, :obs_size])
                    log_probs[task_idx, step] = ptu.get_numpy(log_prob[0])

                if "is_goal_state" in dir(self.env.unwrapped) and self.env.unwrapped.is_goal_state():
                    success_rate[task_idx] = 1.

                # set: obs <- next_obs
                obs = next_obs.clone()
                step += 1

            returns_per_episode[task_idx, episode_idx] = running_reward

    if self.args.policy == 'dqn':
        return returns_per_episode, success_rate, values
    else:
        return returns_per_episode, success_rate, log_probs, observations
def vis_rew_pred(args, rew_pred_arr, goal, **kwargs):
    env = gym.make(args.env_name)
    if args.env_name.startswith('GridNavi'):
        fig = plt.figure(figsize=(6, 6))
    else:  # 'TwoRooms'
        fig = plt.figure(figsize=(12, 6))
    ax = plt.gca()
    cmap = plt.cm.viridis

    # draw one colored cell per state, annotated with its predicted reward
    for state in env.states:
        pred = rew_pred_arr[ptu.get_numpy(env.task_to_id(ptu.FloatTensor(state)))[0]]
        cell = Rectangle((state[0], state[1]), width=1, height=1, fc=cmap(pred))
        ax.add_patch(cell)
        ax.text(state[0] + 0.5, state[1] + 0.5, pred,
                ha="center", va="center", color="w")

    plt.xlim(env.observation_space.low[0] - 0.1, env.observation_space.high[0] + 1 + 0.1)
    plt.ylim(env.observation_space.low[1] - 0.1, env.observation_space.high[1] + 1 + 0.1)

    # add goal's position on grid (drawn as an X)
    line = Line2D([goal[0] + 0.3, goal[0] + 0.7], [goal[1] + 0.3, goal[1] + 0.7],
                  lw=5, color='black', axes=ax)
    ax.add_line(line)
    line = Line2D([goal[0] + 0.3, goal[0] + 0.7], [goal[1] + 0.7, goal[1] + 0.3],
                  lw=5, color='black', axes=ax)
    ax.add_line(line)

    if 'title' in kwargs:
        plt.title(kwargs['title'])
    if args.env_name.startswith('GridNavi'):
        ax.axis('equal')
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.tick_params(axis='both', which='both', length=0)
    fig.tight_layout()
    return fig
def relabel_rollout(env, goal, observations, actions):
    env.set_goal(goal)
    # convert to numpy if given torch tensors
    obs_np = observations if isinstance(observations, np.ndarray) else ptu.get_numpy(observations)
    act_np = actions if isinstance(actions, np.ndarray) else ptu.get_numpy(actions)
    rewards = [env.reward(obs, action) for obs, action in zip(obs_np, act_np)]
    # return the same array type that was passed in
    if isinstance(observations, np.ndarray):
        return np.vstack(rewards)
    else:
        return ptu.FloatTensor(np.vstack(rewards))
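# Hedged usage sketch for relabel_rollout: recompute a stored trajectory's
# rewards under a different goal (hindsight-style relabelling). `env`,
# `stored_obs`, and `stored_acts` are assumed to come from an earlier rollout;
# the goal value here is purely illustrative.
new_goal = np.array([2., 3.])
relabelled_rewards = relabel_rollout(env, new_goal, stored_obs, stored_acts)  # same type as inputs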
def collect_rollouts(self, num_rollouts, random_actions=False):
    '''
    :param num_rollouts: number of rollouts to collect
    :param random_actions: whether to use policy to sample actions, or randomly sample action space
    :return:
    '''
    for rollout in range(num_rollouts):
        obs = ptu.from_numpy(self.env.reset(self.task_idx))
        obs = obs.reshape(-1, obs.shape[-1])
        done_rollout = False

        while not done_rollout:
            if random_actions:
                if self.args.policy == 'dqn':
                    action = ptu.FloatTensor([[[self.env.action_space.sample()]]]).long()  # Sample random action
                else:
                    action = ptu.FloatTensor([self.env.action_space.sample()])  # Sample random action
            else:
                if self.args.policy == 'dqn':
                    action, _ = self.agent.act(obs=obs)  # DQN
                else:
                    action, _, _, _ = self.agent.act(obs=obs)  # SAC

            # observe reward and next obs
            next_obs, reward, done, info = utl.env_step(self.env, action.squeeze(dim=0))
            done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True

            # add data to policy buffer - (s+, a, r, s'+, term)
            term = self.env.unwrapped.is_goal_state() if "is_goal_state" in dir(self.env.unwrapped) else False
            if self.args.dense_train_sparse_test:
                rew_to_buffer = {rew_type: rew for rew_type, rew in info.items()
                                 if rew_type.startswith('reward')}
            else:
                rew_to_buffer = ptu.get_numpy(reward.squeeze(dim=0))
            self.policy_storage.add_sample(
                task=self.task_idx,
                observation=ptu.get_numpy(obs.squeeze(dim=0)),
                action=ptu.get_numpy(action.squeeze(dim=0)),
                reward=rew_to_buffer,
                terminal=np.array([term], dtype=float),
                next_observation=ptu.get_numpy(next_obs.squeeze(dim=0)))

            # set: obs <- next_obs
            obs = next_obs.clone()

            # update statistics
            self._n_env_steps_total += 1
            if "is_goal_state" in dir(self.env.unwrapped) and self.env.unwrapped.is_goal_state():
                # count successes
                self._successes_in_buffer += 1

        self._n_rollouts_total += 1
def evaluate_vae(encoder, decoder, actions, rewards, states):
    '''
    :param encoder: RNN encoder network
    :param decoder: reward decoder
    :param actions: array of actions of shape: (T, batch, action_dim)
    :param rewards: array of rewards of shape: (T, batch, 1)
    :param states: array of states of shape: (T, batch, state_dim)
    :return: per-step posterior means, logvars, hidden states, and reward predictions
    '''
    if actions.dim() != 3:
        actions = actions.unsqueeze(dim=0)
        states = states.unsqueeze(dim=0)
        rewards = rewards.unsqueeze(dim=0)
    T, batch_size, _ = actions.size()

    means, logvars, hidden_states, reward_preds = [], [], [], []

    # start from the prior belief
    with torch.no_grad():
        task_sample, task_mean, task_logvar, hidden_state = encoder.prior(batch_size)
    means.append(task_mean)
    logvars.append(task_logvar)
    hidden_states.append(hidden_state)
    reward_preds.append(ptu.get_numpy(decoder(task_sample, None)))

    # update the posterior one transition at a time
    for action, reward, state in zip(actions, rewards, states):
        action = action.unsqueeze(dim=0)
        state = state.unsqueeze(dim=0)
        reward = reward.unsqueeze(dim=0)
        with torch.no_grad():
            task_sample, task_mean, task_logvar, hidden_state = encoder(
                actions=action.float(),
                states=state,
                rewards=reward,
                hidden_state=hidden_state,
                return_prior=False)
        means.append(task_mean.unsqueeze(dim=0))
        logvars.append(task_logvar.unsqueeze(dim=0))
        hidden_states.append(hidden_state)
        reward_preds.append(ptu.get_numpy(decoder(task_sample.unsqueeze(dim=0), None)))

    means = torch.cat(means, dim=0)
    logvars = torch.cat(logvars, dim=0)
    hidden_states = torch.cat(hidden_states, dim=0)
    reward_preds = np.vstack(reward_preds)
    return means, logvars, hidden_states, reward_preds
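# Hedged usage sketch for evaluate_vae: feed one stored trajectory (sliced to
# a single batch element) through a trained VAE. `vae` and `dataset` are
# assumptions; indices follow the (s, a, r, s', done) layout used elsewhere
# in this file, and the decoder must accept the (task_sample, None) signature.
means, logvars, hiddens, reward_preds = evaluate_vae(
    encoder=vae.encoder,
    decoder=vae.reward_decoder,
    actions=ptu.FloatTensor(dataset[0][1][:, :1]),
    rewards=ptu.FloatTensor(dataset[0][2][:, :1]),
    states=ptu.FloatTensor(dataset[0][0][:, :1]))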
def trajectories_to_batch(dataset):
    traj_dataset = []
    for task_data in dataset:
        obs, actions, rewards, next_obs, terminals = task_data
        # flatten (traj_len, n_trajs, dim) tensors into (traj_len * n_trajs, dim) batches
        obs = ptu.get_numpy(obs.transpose(0, 1).reshape(-1, obs.shape[-1]))
        actions = ptu.get_numpy(actions.transpose(0, 1).reshape(-1, actions.shape[-1]))
        rewards = ptu.get_numpy(rewards.transpose(0, 1).reshape(-1, rewards.shape[-1]))
        next_obs = ptu.get_numpy(next_obs.transpose(0, 1).reshape(-1, next_obs.shape[-1]))
        terminals = ptu.get_numpy(terminals.transpose(0, 1).reshape(-1, terminals.shape[-1]))
        traj_dataset.append([obs, actions, rewards, next_obs, terminals])
    return traj_dataset
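# Hedged usage sketch for trajectories_to_batch: flatten a tensor dataset
# (e.g. as returned by load_dataset below with arr_type='tensor') into numpy
# transition batches, for instance before replay-buffer insertion.
batch_dataset = trajectories_to_batch(dataset)
obs_batch = batch_dataset[0][0]  # first task's observations, shape (traj_len * n_trajs, obs_dim)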
def visualize_latent_space(latent_dim, n_samples, decoder):
    from sklearn.manifold import TSNE

    latents = ptu.FloatTensor(sample_random_normal(latent_dim, n_samples))
    pred_rewards = ptu.get_numpy(decoder(latents, None))
    goal_locations = np.argmax(pred_rewards, axis=-1)
    latents_np = ptu.get_numpy(latents)  # t-SNE and pandas expect numpy (and fail on GPU tensors)

    # embed to lower dim space - if dim > 2
    if latent_dim > 2:
        tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
        tsne_results = tsne.fit_transform(latents_np)

    # create DataFrame
    data = tsne_results if latent_dim > 2 else latents_np
    df = pd.DataFrame(data, columns=['x1', 'x2'])
    df["y"] = goal_locations

    fig = plt.figure(figsize=(6, 6))
    sns.scatterplot(x="x1", y="x2", hue="y", s=30,
                    palette=sns.color_palette("hls", len(np.unique(df["y"]))),
                    data=df, legend="full", ax=plt.gca())
    fig.show()
    return data, goal_locations
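# Hedged usage sketch for visualize_latent_space: embed prior samples with
# t-SNE and color them by the decoder's predicted goal cell. The latent size
# and sample count are illustrative; `vae` is an assumption.
data, goal_locations = visualize_latent_space(latent_dim=5, n_samples=500,
                                              decoder=vae.reward_decoder)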
def predict_rewards(learner, means, logvars):
    reward_preds = ptu.zeros([means.shape[0], learner.env.num_states])
    for t in range(reward_preds.shape[0]):
        # average reward predictions over 50 samples from the posterior at step t
        task_samples = learner.vae.encoder._sample_gaussian(
            ptu.FloatTensor(means[t]), ptu.FloatTensor(logvars[t]), num=50)
        reward_preds[t, :] = learner.vae.reward_decoder(
            ptu.FloatTensor(task_samples), None).mean(dim=0).detach()
    return ptu.get_numpy(reward_preds)
def env_step(env, action):
    # action should be of size: batch x 1
    action = ptu.get_numpy(action.squeeze(dim=-1))
    next_obs, reward, done, info = env.step(action)
    # move to torch
    next_obs = ptu.from_numpy(next_obs).view(-1, next_obs.shape[0])
    reward = ptu.FloatTensor([reward]).view(-1, 1)
    done = ptu.from_numpy(np.array(done, dtype=int)).view(-1, 1)
    return next_obs, reward, done, info
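# Hedged usage sketch for env_step inside a rollout loop: the helper keeps
# the loop in torch land by converting the action to numpy for env.step and
# the results back to tensors. `env` and `agent` are assumptions.
obs = ptu.from_numpy(env.reset()).reshape(1, -1)
action, _, _, _ = agent.act(obs=obs)  # SAC-style agent
next_obs, reward, done, info = env_step(env, action.squeeze(dim=0))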
def eval_vae(dataset, vae, args):
    num_tasks = len(dataset)
    reward_preds = np.zeros((num_tasks, args.trajectory_len))
    rewards = np.zeros((num_tasks, args.trajectory_len))
    random_tasks = np.random.choice(len(dataset), 10)  # which tasks to evaluate
    states, actions = get_heatmap_params()
    state_preds = np.zeros((num_tasks, states.shape[0]))

    for task_idx, task in enumerate(random_tasks):
        # which trajectory to evaluate (indexing by `task`, as in the variant below)
        traj_idx_random = np.random.choice(dataset[task][0].shape[1])

        # get prior parameters
        with torch.no_grad():
            task_sample, task_mean, task_logvar, hidden_state = vae.encoder.prior(batch_size=1)

        for step in range(args.trajectory_len):
            # update encoding
            task_sample, task_mean, task_logvar, hidden_state = utl.update_encoding(
                encoder=vae.encoder,
                obs=ptu.FloatTensor(dataset[task][3][step, traj_idx_random]).unsqueeze(0),
                action=ptu.FloatTensor(dataset[task][1][step, traj_idx_random]).unsqueeze(0),
                reward=ptu.FloatTensor(dataset[task][2][step, traj_idx_random]).unsqueeze(0),
                done=ptu.FloatTensor(dataset[task][4][step, traj_idx_random]).unsqueeze(0),
                hidden_state=hidden_state)

            rewards[task_idx, step] = dataset[task][2][step, traj_idx_random].item()
            reward_preds[task_idx, step] = ptu.get_numpy(
                vae.reward_decoder(
                    task_sample.unsqueeze(0),
                    ptu.FloatTensor(dataset[task][3][step, traj_idx_random]).unsqueeze(0).unsqueeze(0),
                    ptu.FloatTensor(dataset[task][0][step, traj_idx_random]).unsqueeze(0).unsqueeze(0),
                    ptu.FloatTensor(dataset[task][1][step, traj_idx_random]).unsqueeze(0).unsqueeze(0))[0, 0])

        # threshold state-decoder predictions at the heatmap query points
        states, actions = get_heatmap_params()
        prediction = ptu.get_numpy(
            vae.state_decoder(task_sample.expand((1, 30, task_sample.shape[-1])),
                              ptu.FloatTensor(states).unsqueeze(0),
                              ptu.FloatTensor(actions).unsqueeze(0))).squeeze()
        for i in range(30):
            state_preds[task_idx, i] = 1 if np.linalg.norm(prediction[i, :]) > 1 else 0

    return rewards, reward_preds, state_preds, random_tasks
def eval_vae(dataset, vae, args):
    num_tasks = len(dataset)
    reward_preds = np.zeros((num_tasks, args.trajectory_len))
    rewards = np.zeros((num_tasks, args.trajectory_len))
    random_tasks = np.random.choice(len(dataset), NUM_EVAL_TASKS)  # which tasks to evaluate

    for task_idx, task in enumerate(random_tasks):
        traj_idx_random = np.random.choice(dataset[task][0].shape[1])  # which trajectory to evaluate
        # traj_idx_random = np.random.choice(np.min([d[0].shape[1] for d in dataset]))

        # get prior parameters
        with torch.no_grad():
            task_sample, task_mean, task_logvar, hidden_state = vae.encoder.prior(batch_size=1)

        for step in range(args.trajectory_len):
            # update encoding
            task_sample, task_mean, task_logvar, hidden_state = utl.update_encoding(
                encoder=vae.encoder,
                obs=ptu.FloatTensor(dataset[task][3][step, traj_idx_random]).unsqueeze(0),
                action=ptu.FloatTensor(dataset[task][1][step, traj_idx_random]).unsqueeze(0),
                reward=ptu.FloatTensor(dataset[task][2][step, traj_idx_random]).unsqueeze(0),
                done=ptu.FloatTensor(dataset[task][4][step, traj_idx_random]).unsqueeze(0),
                hidden_state=hidden_state)

            rewards[task_idx, step] = dataset[task][2][step, traj_idx_random].item()
            reward_preds[task_idx, step] = ptu.get_numpy(
                vae.reward_decoder(
                    task_sample.unsqueeze(0),
                    ptu.FloatTensor(dataset[task][3][step, traj_idx_random]).unsqueeze(0).unsqueeze(0),
                    ptu.FloatTensor(dataset[task][0][step, traj_idx_random]).unsqueeze(0).unsqueeze(0),
                    ptu.FloatTensor(dataset[task][1][step, traj_idx_random]).unsqueeze(0).unsqueeze(0))[0, 0])

    return rewards, reward_preds
def np_ify(tensor_or_other):
    if isinstance(tensor_or_other, Variable):
        return ptu.get_numpy(tensor_or_other)
    else:
        return tensor_or_other
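# Minimal check of np_ify's contract: tensors come back as numpy arrays,
# anything else passes through unchanged (runnable as-is given the imports above).
assert isinstance(np_ify(torch.ones(3)), np.ndarray)
assert np_ify([1, 2, 3]) == [1, 2, 3]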
def load_dataset(data_dir, args, num_tasks=None, allow_dense_data_loading=True, arr_type='tensor'):
    dataset = []
    env_dir = args.env_name.replace('Sparse', '') \
        if 'dense_train_sparse_test' in args and \
           args.dense_train_sparse_test is True and \
           allow_dense_data_loading \
        else args.env_name
    exps_dir = os.path.join(args.main_data_dir, env_dir, data_dir)
    goals = []
    all_dirs = os.listdir(exps_dir)

    if num_tasks is None:
        tasks = np.random.permutation(len(all_dirs))
    else:
        tasks = np.random.choice(len(all_dirs), num_tasks)

    for i, task in enumerate(tasks):
        exp_dir = os.path.join(exps_dir, all_dirs[task])
        goals.append(extract_goal_from_path(all_dirs[task]))
        if 'rewards.npy' not in os.listdir(exp_dir):
            print('rewards.npy file doesn\'t exist. Creating it..')
            env = make_env(args.env_name, args.max_rollouts_per_task, n_tasks=1)
            create_rewards_arr(env, path=exp_dir)
            print('Created rewards.npy file.')
        obs, actions, rewards, next_obs, terminals = load_transitions(exp_dir)

        if obs.dim() < 3:
            # flat transitions: reshape to (traj_len, n_trajs, dim)
            obs = obs.reshape(-1, args.trajectory_len, obs.shape[-1]).transpose(0, 1)
            actions = actions.reshape(-1, args.trajectory_len, actions.shape[-1]).transpose(0, 1)
            rewards = rewards.reshape(-1, args.trajectory_len, rewards.shape[-1]).transpose(0, 1)
            next_obs = next_obs.reshape(-1, args.trajectory_len, next_obs.shape[-1]).transpose(0, 1)
            terminals = terminals.reshape(-1, args.trajectory_len, terminals.shape[-1]).transpose(0, 1)
            if args.num_trajs_per_task is not None:
                obs = obs[:, :args.num_trajs_per_task, :]
                actions = actions[:, :args.num_trajs_per_task, :]
                rewards = rewards[:, :args.num_trajs_per_task, :]
                next_obs = next_obs[:, :args.num_trajs_per_task, :]
                terminals = terminals[:, :args.num_trajs_per_task, :]
        else:
            if args.num_trajs_per_task is not None:
                obs = obs[:, :args.num_trajs_per_task, :]
                actions = actions[:, :args.num_trajs_per_task, :]
                rewards = rewards[:, :args.num_trajs_per_task, :]
                next_obs = next_obs[:, :args.num_trajs_per_task, :]
                terminals = terminals[:, :args.num_trajs_per_task, :]
            # already (traj_len, n_trajs, dim): flatten back to transition batches
            obs = obs.transpose(0, 1).reshape(-1, obs.shape[-1])
            actions = actions.transpose(0, 1).reshape(-1, actions.shape[-1])
            rewards = rewards.transpose(0, 1).reshape(-1, rewards.shape[-1])
            next_obs = next_obs.transpose(0, 1).reshape(-1, next_obs.shape[-1])
            terminals = terminals.transpose(0, 1).reshape(-1, terminals.shape[-1])

        if arr_type == 'numpy':
            obs = ptu.get_numpy(obs)
            actions = ptu.get_numpy(actions)
            rewards = ptu.get_numpy(rewards)
            next_obs = ptu.get_numpy(next_obs)
            terminals = ptu.get_numpy(terminals)

        dataset.append([obs, actions, rewards, next_obs, terminals])
        print('{} experiments loaded.'.format(i + 1))

    goals = np.vstack(goals)
    return dataset, goals
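# Hedged end-to-end sketch: load raw MDP trajectories and lift them to BAMDP
# trajectories with a trained VAE (see transform_mdps_ds_to_bamdp_ds above).
# `args` must provide env_name, main_data_dir, trajectory_len, etc.; `vae`
# and the directory name 'train' are assumptions.
dataset, goals = load_dataset('train', args, num_tasks=8)
bamdp_dataset = transform_mdps_ds_to_bamdp_ds(dataset, vae, args)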
def evaluate(self):
    num_episodes = self.args.max_rollouts_per_task
    num_steps_per_episode = self.env.unwrapped._max_episode_steps
    num_tasks = self.args.num_eval_tasks
    obs_size = self.env.unwrapped.observation_space.shape[0]

    returns_per_episode = np.zeros((num_tasks, num_episodes))
    success_rate = np.zeros(num_tasks)
    rewards = np.zeros((num_tasks, self.args.trajectory_len))
    reward_preds = np.zeros((num_tasks, self.args.trajectory_len))
    observations = np.zeros((num_tasks, self.args.trajectory_len + 1, obs_size))
    if self.args.policy == 'sac':
        log_probs = np.zeros((num_tasks, self.args.trajectory_len))

    # This part is very specific for the Semi-Circle env
    # if self.args.env_name == 'PointRobotSparse-v0':
    #     reward_belief = np.zeros((num_tasks, self.args.trajectory_len))
    #
    #     low_x, high_x, low_y, high_y = -2., 2., -1., 2.
    #     resolution = 0.1
    #     grid_x = np.arange(low_x, high_x + resolution, resolution)
    #     grid_y = np.arange(low_y, high_y + resolution, resolution)
    #     centers_x = (grid_x[:-1] + grid_x[1:]) / 2
    #     centers_y = (grid_y[:-1] + grid_y[1:]) / 2
    #     yv, xv = np.meshgrid(centers_y, centers_x, sparse=False, indexing='ij')
    #     centers = np.vstack([xv.ravel(), yv.ravel()]).T
    #     n_grid_points = centers.shape[0]
    #     reward_belief_discretized = np.zeros((num_tasks, self.args.trajectory_len, centers.shape[0]))

    for task_loop_i, task in enumerate(self.env.unwrapped.get_all_eval_task_idx()):
        obs = ptu.from_numpy(self.env.reset(task))
        obs = obs.reshape(-1, obs.shape[-1])
        step = 0

        # get prior parameters
        with torch.no_grad():
            task_sample, task_mean, task_logvar, hidden_state = self.vae.encoder.prior(batch_size=1)

        observations[task_loop_i, step, :] = ptu.get_numpy(obs[0, :obs_size])

        for episode_idx in range(num_episodes):
            running_reward = 0.
            for step_idx in range(num_steps_per_episode):
                # add distribution parameters to observation - policy is conditioned on posterior
                augmented_obs = self.get_augmented_obs(obs, task_mean, task_logvar)
                if self.args.policy == 'dqn':
                    action, value = self.agent.act(obs=augmented_obs, deterministic=True)
                else:
                    action, _, _, log_prob = self.agent.act(
                        obs=augmented_obs,
                        deterministic=self.args.eval_deterministic,
                        return_log_prob=True)

                # observe reward and next obs
                next_obs, reward, done, info = utl.env_step(self.env, action.squeeze(dim=0))
                running_reward += reward.item()
                # done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True

                # update encoding
                task_sample, task_mean, task_logvar, hidden_state = self.update_encoding(
                    obs=next_obs,
                    action=action,
                    reward=reward,
                    done=done,
                    hidden_state=hidden_state)

                rewards[task_loop_i, step] = reward.item()
                reward_preds[task_loop_i, step] = ptu.get_numpy(
                    self.vae.reward_decoder(task_sample, next_obs, obs, action)[0, 0])

                # This part is very specific for the Semi-Circle env
                # if self.args.env_name == 'PointRobotSparse-v0':
                #     reward_belief[task, step] = ptu.get_numpy(
                #         self.vae.compute_belief_reward(task_mean, task_logvar, obs, next_obs, action)[0])
                #
                #     reward_belief_discretized[task, step, :] = ptu.get_numpy(
                #         self.vae.compute_belief_reward(task_mean.repeat(n_grid_points, 1),
                #                                        task_logvar.repeat(n_grid_points, 1),
                #                                        None,
                #                                        torch.cat((ptu.FloatTensor(centers),
                #                                                   ptu.zeros(centers.shape[0], 1)),
                #                                                  dim=-1).unsqueeze(0),
                #                                        None)[:, 0])

                observations[task_loop_i, step + 1, :] = ptu.get_numpy(next_obs[0, :obs_size])
                if self.args.policy != 'dqn':
                    log_probs[task_loop_i, step] = ptu.get_numpy(log_prob[0])

                if "is_goal_state" in dir(self.env.unwrapped) and self.env.unwrapped.is_goal_state():
                    success_rate[task_loop_i] = 1.

                # set: obs <- next_obs
                obs = next_obs.clone()
                step += 1

            returns_per_episode[task_loop_i, episode_idx] = running_reward

    if self.args.policy == 'dqn':
        return returns_per_episode, success_rate, observations, rewards, reward_preds
    # This part is very specific for the Semi-Circle env
    # elif self.args.env_name == 'PointRobotSparse-v0':
    #     return returns_per_episode, success_rate, log_probs, observations, \
    #            rewards, reward_preds, reward_belief, reward_belief_discretized, centers
    else:
        return returns_per_episode, success_rate, log_probs, observations, rewards, reward_preds
def rollout_policy(env, learner):
    is_vae_exist = "vae" in dir(learner)

    observations = []
    actions = []
    rewards = []
    values = []
    if is_vae_exist:
        latent_samples = []
        latent_means = []
        latent_logvars = []

    obs = ptu.from_numpy(env.reset())
    obs = obs.reshape(-1, obs.shape[-1])
    observations.append(obs)
    done_rollout = False

    if is_vae_exist:
        # get prior parameters
        with torch.no_grad():
            task_sample, task_mean, task_logvar, hidden_state = learner.vae.encoder.prior(batch_size=1)
        # store
        latent_samples.append(ptu.get_numpy(task_sample[0, 0]))
        latent_means.append(ptu.get_numpy(task_mean[0, 0]))
        latent_logvars.append(ptu.get_numpy(task_logvar[0, 0]))

    while not done_rollout:
        if is_vae_exist:
            # add distribution parameters to observation - policy is conditioned on posterior
            augmented_obs = learner.get_augmented_obs(obs=obs, task_mu=task_mean, task_std=task_logvar)
            with torch.no_grad():
                action, value = learner.agent.act(obs=augmented_obs, deterministic=True)
        else:
            action, _, _, _ = learner.agent.act(obs=obs)
            value = None  # SAC-style agent returns no state value here (avoids a NameError below)

        # observe reward and next obs
        next_obs, reward, done, info = utl.env_step(env, action.squeeze(dim=0))

        # store
        observations.append(next_obs)
        actions.append(action)
        values.append(value)
        rewards.append(reward.item())
        done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True

        if is_vae_exist:
            # update encoding
            task_sample, task_mean, task_logvar, hidden_state = learner.vae.encoder(
                action, next_obs, reward.reshape((1, 1)), hidden_state, return_prior=False)
            latent_samples.append(ptu.get_numpy(task_sample[0]))
            latent_means.append(ptu.get_numpy(task_mean[0]))
            latent_logvars.append(ptu.get_numpy(task_logvar[0]))

        # set: obs <- next_obs
        obs = next_obs.clone()

    if is_vae_exist:
        return observations, actions, rewards, values, \
               latent_samples, latent_means, latent_logvars
    else:
        return observations, actions, rewards, values
def get_param_values_np(self):
    state_dict = self.state_dict()
    np_dict = OrderedDict()
    for key, tensor in state_dict.items():
        np_dict[key] = ptu.get_numpy(tensor)
    return np_dict
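# Hedged usage sketch for get_param_values_np: snapshot a network's weights
# as numpy arrays, e.g. to save them without pickling torch tensors.
# `policy` is any module exposing this method (an assumption).
np_params = policy.get_param_values_np()
np.savez('policy_weights.npz', **np_params)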
def new_add_scalar(key, value, step):
    # accept torch scalars as well as plain numbers
    if isinstance(value, torch.Tensor):
        value = ptu.get_numpy(value)
    old_add_scalar(key, value, step)
    logger.record_tabular(key, value)
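# Hedged usage sketch for new_add_scalar: torch scalars are converted before
# being mirrored to both the original summary writer and the tabular logger.
# The metric name is illustrative.
new_add_scalar('vae/elbo_loss', torch.tensor(0.42), step=100)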
def collect_rollouts(self, num_rollouts, random_actions=False):
    '''
    :param num_rollouts: number of rollouts to collect
    :param random_actions: whether to use policy to sample actions, or randomly sample action space
    :return:
    '''
    for rollout in range(num_rollouts):
        obs = ptu.from_numpy(self.env.reset(self.task_idx))
        obs = obs.reshape(-1, obs.shape[-1])
        done_rollout = False
        # self.policy_storage.reset_running_episode(self.task_idx)

        # if self.args.fixed_latent_params:
        #     assert 2 ** self.args.task_embedding_size >= self.args.num_tasks
        #     task_mean = ptu.FloatTensor(utl.vertices(self.args.task_embedding_size)[self.task_idx])
        #     task_logvar = -2. * ptu.ones_like(task_logvar)  # arbitrary negative enough number

        # add distribution parameters to observation - policy is conditioned on posterior
        augmented_obs = self.get_augmented_obs(obs=obs)

        while not done_rollout:
            if random_actions:
                if self.args.policy == 'dqn':
                    action = ptu.FloatTensor([[self.env.action_space.sample()]]).type(torch.long)  # Sample random action
                else:
                    action = ptu.FloatTensor([self.env.action_space.sample()])
            else:
                if self.args.policy == 'dqn':
                    action, _ = self.agent.act(obs=augmented_obs)  # DQN
                else:
                    action, _, _, _ = self.agent.act(obs=augmented_obs)  # SAC

            # observe reward and next obs
            next_obs, reward, done, info = utl.env_step(self.env, action.squeeze(dim=0))
            done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True

            # get augmented next obs
            augmented_next_obs = self.get_augmented_obs(obs=next_obs)

            # add data to policy buffer - (s+, a, r, s'+, term)
            term = self.env.unwrapped.is_goal_state() if "is_goal_state" in dir(self.env.unwrapped) else False
            self.policy_storage.add_sample(
                task=self.task_idx,
                observation=ptu.get_numpy(augmented_obs.squeeze(dim=0)),
                action=ptu.get_numpy(action.squeeze(dim=0)),
                reward=ptu.get_numpy(reward.squeeze(dim=0)),
                terminal=np.array([term], dtype=float),
                next_observation=ptu.get_numpy(augmented_next_obs.squeeze(dim=0)))
            if not random_actions:
                self.current_experience_storage.add_sample(
                    task=self.task_idx,
                    observation=ptu.get_numpy(augmented_obs.squeeze(dim=0)),
                    action=ptu.get_numpy(action.squeeze(dim=0)),
                    reward=ptu.get_numpy(reward.squeeze(dim=0)),
                    terminal=np.array([term], dtype=float),
                    next_observation=ptu.get_numpy(augmented_next_obs.squeeze(dim=0)))

            # set: obs <- next_obs
            obs = next_obs.clone()
            augmented_obs = augmented_next_obs.clone()

            # update statistics
            self._n_env_steps_total += 1
            if "is_goal_state" in dir(self.env.unwrapped) and self.env.unwrapped.is_goal_state():
                # count successes
                self._successes_in_buffer += 1

        self._n_rollouts_total += 1
def collect_rollouts(self):
    self.training_mode(False)

    num_episodes = self.args.max_rollouts_per_task
    num_steps_per_episode = self.env.unwrapped._max_episode_steps
    num_tasks = self.args.num_eval_tasks
    obs_size = self.env.unwrapped.observation_space.shape[0]

    returns_per_episode = np.zeros((num_tasks, num_episodes))
    success_rate = np.zeros(num_tasks)
    rewards = np.zeros((num_tasks, self.args.trajectory_len))
    observations = np.zeros((num_tasks, self.args.trajectory_len + 1, obs_size))
    actions = np.zeros((num_tasks, self.args.trajectory_len, self.args.action_dim))
    log_probs = np.zeros((num_tasks, self.args.trajectory_len))

    for task in self.env.unwrapped.get_all_task_idx():
        obs = ptu.from_numpy(self.env.reset(task))
        obs = obs.reshape(-1, obs.shape[-1])
        step = 0

        # get prior parameters
        task_sample, task_mean, task_logvar, hidden_state = self.vae.encoder.prior(batch_size=1)

        observations[task, step, :] = ptu.get_numpy(obs[0, :obs_size])

        for episode_idx in range(num_episodes):
            running_reward = 0.
            for step_idx in range(num_steps_per_episode):
                # add distribution parameters to observation - policy is conditioned on posterior
                augmented_obs = self.get_augmented_obs(obs, task_mean, task_logvar)
                action, _, _, log_prob = self.agent.act(
                    obs=augmented_obs,
                    deterministic=self.args.eval_deterministic,
                    return_log_prob=True)

                # observe reward and next obs
                next_obs, reward, done, info = utl.env_step(self.env, action.squeeze(dim=0))
                running_reward += reward.item()

                # update encoding
                task_sample, task_mean, task_logvar, hidden_state = self.update_encoding(
                    obs=next_obs,
                    action=action,
                    reward=reward,
                    done=done,
                    hidden_state=hidden_state)

                rewards[task, step] = reward.item()
                # reward_preds[task, step] = ptu.get_numpy(
                #     self.vae.reward_decoder(task_sample, next_obs, obs, action)[0, 0])

                observations[task, step + 1, :] = ptu.get_numpy(next_obs[0, :obs_size])
                actions[task, step, :] = ptu.get_numpy(action[0, :])
                log_probs[task, step] = ptu.get_numpy(log_prob[0])

                if "is_goal_state" in dir(self.env.unwrapped) and self.env.unwrapped.is_goal_state():
                    success_rate[task] = 1.

                # set: obs <- next_obs
                obs = next_obs.clone()
                step += 1

            returns_per_episode[task, episode_idx] = running_reward

    return returns_per_episode, success_rate, log_probs, observations, rewards, actions