Example #1
def collect_rollouts_per_task(task_idx, agent, policy_storage, env,
                              num_rollouts):
    for rollout in range(num_rollouts):
        obs = ptu.from_numpy(env.reset(task_idx))
        obs = obs.reshape(-1, obs.shape[-1])
        done_rollout = False

        while not done_rollout:
            action, _, _, _ = agent.act(obs=obs)  # SAC
            # observe reward and next obs
            next_obs, reward, done, info = utl.env_step(
                env, action.squeeze(dim=0))
            done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True

            # add data to policy buffer - (s+, a, r, s'+, term)
            term = env.unwrapped.is_goal_state() if "is_goal_state" in dir(
                env.unwrapped) else False
            rew_to_buffer = ptu.get_numpy(reward.squeeze(dim=0))
            policy_storage.add_sample(
                task=0,  #task_idx,
                observation=ptu.get_numpy(obs.squeeze(dim=0)),
                action=ptu.get_numpy(action.squeeze(dim=0)),
                reward=rew_to_buffer,
                terminal=np.array([term], dtype=float),
                next_observation=ptu.get_numpy(next_obs.squeeze(dim=0)))

            # set: obs <- next_obs
            obs = next_obs.clone()
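Most of these snippets lean on a small ptu helper module for moving data between NumPy and PyTorch. Below is a minimal, self-contained sketch of what such helpers typically look like; this is an assumption modeled on rlkit-style pytorch_util, and the real module in this codebase may differ.

import numpy as np
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def from_numpy(arr):
    # numpy array -> float32 tensor on the chosen device
    return torch.as_tensor(np.asarray(arr), dtype=torch.float32, device=device)

def get_numpy(tensor):
    # tensor -> numpy array on the CPU, detached from the autograd graph
    return tensor.detach().cpu().numpy()

def FloatTensor(*args, **kwargs):
    # convenience constructor that places the tensor on the chosen device
    return torch.FloatTensor(*args, **kwargs).to(device)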
Example #2
def transform_mdps_ds_to_bamdp_ds(dataset, vae, args):
    '''
    Transform a per-task MDP dataset into a BAMDP dataset by augmenting
    observations with belief parameters inferred by the VAE encoder.

    :param dataset: list of per-task datasets; each is a list of arrays
        (s, a, r, s', done), each of shape (traj_len, n_trajs, dim)
    :param vae: trained VAE model
    :param args: experiment arguments passed through to the transform
    :return: list of per-task datasets with augmented observations
    '''

    bamdp_dataset = []

    for i, task_data in enumerate(dataset):
        obs, actions, rewards, next_obs, terminals = task_data
        augmented_obs, belief_rewards, augmented_next_obs = \
            transform_mdp_to_bamdp_rollouts(vae, args,
                                            ptu.FloatTensor(obs),
                                            ptu.FloatTensor(actions),
                                            ptu.FloatTensor(rewards),
                                            ptu.FloatTensor(next_obs),
                                            ptu.FloatTensor(terminals))
        rewards = belief_rewards if belief_rewards is not None else ptu.FloatTensor(
            rewards)

        bamdp_dataset.append([
            ptu.get_numpy(augmented_obs), actions,
            ptu.get_numpy(rewards),
            ptu.get_numpy(augmented_next_obs), terminals
        ])
        print('{} datasets were processed.'.format(i + 1))
    return bamdp_dataset
def load_replaying_dataset(data_dir, args, num_tasks=None):
    dataset = []
    env_dir = args.env_name
    exps_dir = os.path.join(args.main_data_dir, env_dir, data_dir)
    goals = []
    all_dirs = os.listdir(exps_dir)
    policies_per_task = 20
    if num_tasks is None:
        tasks = np.random.permutation(len(all_dirs))
    else:
        tasks = np.random.choice(len(all_dirs), num_tasks)
    for i, task in enumerate(tasks):
        task_dir = os.path.join(exps_dir, all_dirs[task])
        all_policies = os.listdir(task_dir)
        policies_to_load = np.random.choice(len(all_policies),
                                            policies_per_task,
                                            replace=False)
        goals.append(extract_goal_from_path(all_dirs[task]))
        task_obs, task_actions, task_rewards, task_next_obs, task_terminals = [], [], [], [], []
        for j, policy in enumerate(policies_to_load):
            exp_dir = os.path.join(exps_dir, all_dirs[task],
                                   all_policies[policy])
            obs, actions, rewards, next_obs, terminals = load_transitions(
                exp_dir)

            obs = obs.reshape(-1, args.trajectory_len,
                              obs.shape[-1]).transpose(0, 1)
            actions = actions.reshape(-1, args.trajectory_len,
                                      actions.shape[-1]).transpose(0, 1)
            rewards = rewards.reshape(-1, args.trajectory_len,
                                      rewards.shape[-1]).transpose(0, 1)
            next_obs = next_obs.reshape(-1, args.trajectory_len,
                                        next_obs.shape[-1]).transpose(0, 1)
            terminals = terminals.reshape(-1, args.trajectory_len,
                                          terminals.shape[-1]).transpose(0, 1)

            obs = ptu.get_numpy(obs)
            actions = ptu.get_numpy(actions)
            rewards = ptu.get_numpy(rewards)
            next_obs = ptu.get_numpy(next_obs)
            terminals = ptu.get_numpy(terminals)

            task_obs.append(obs)
            task_actions.append(actions)
            task_rewards.append(rewards)
            task_next_obs.append(next_obs)
            task_terminals.append(terminals)

        obs = np.concatenate(task_obs, axis=1)
        actions = np.concatenate(task_actions, axis=1)
        rewards = np.concatenate(task_rewards, axis=1)
        next_obs = np.concatenate(task_next_obs, axis=1)
        terminals = np.concatenate(task_terminals, axis=1)
        dataset.append([obs, actions, rewards, next_obs, terminals])
    print('{} experiments loaded.'.format(i + 1))
    goals = np.vstack(goals)

    return dataset, goals
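The loader above reshapes a flat transition array of shape (n_trajs * traj_len, dim) into a (traj_len, n_trajs, dim) tensor via reshape followed by transpose(0, 1). A tiny NumPy sketch of the same shape manipulation, with made-up sizes for illustration:

import numpy as np

n_trajs, traj_len, dim = 4, 5, 3
flat = np.arange(n_trajs * traj_len * dim, dtype=np.float32).reshape(-1, dim)

# reshape to (n_trajs, traj_len, dim), then swap the first two axes,
# which is what .transpose(0, 1) does on the torch tensors above
per_step = flat.reshape(-1, traj_len, dim).transpose(1, 0, 2)
assert per_step.shape == (traj_len, n_trajs, dim)
# the first traj_len rows of the flat array are the steps of trajectory 0
assert np.array_equal(per_step[:, 0, :], flat[:traj_len])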
Example #4
    def evaluate(self, tasks):
        num_episodes = self.args.max_rollouts_per_task
        num_steps_per_episode = self.env.unwrapped._max_episode_steps

        returns_per_episode = np.zeros((len(tasks), num_episodes))
        success_rate = np.zeros(len(tasks))

        if self.args.policy == 'dqn':
            values = np.zeros((len(tasks), self.args.max_trajectory_len))
        else:
            obs_size = self.env.unwrapped.observation_space.shape[0]
            observations = np.zeros((len(tasks), self.args.max_trajectory_len + 1, obs_size))
            log_probs = np.zeros((len(tasks), self.args.max_trajectory_len))

        for task_idx, task in enumerate(tasks):

            obs = ptu.from_numpy(self.env.reset(task))
            obs = obs.reshape(-1, obs.shape[-1])
            step = 0

            if self.args.policy == 'sac':
                observations[task_idx, step, :] = ptu.get_numpy(obs[0, :obs_size])

            for episode_idx in range(num_episodes):
                running_reward = 0.
                for step_idx in range(num_steps_per_episode):
                    # add distribution parameters to observation - policy is conditioned on posterior
                    if self.args.policy == 'dqn':
                        action, value = self.agent.act(obs=obs, deterministic=True)
                    else:
                        action, _, _, log_prob = self.agent.act(obs=obs,
                                                                deterministic=self.args.eval_deterministic,
                                                                return_log_prob=True)
                    # observe reward and next obs
                    next_obs, reward, done, info = utl.env_step(self.env, action.squeeze(dim=0))
                    running_reward += reward.item()
                    if self.args.policy == 'dqn':
                        values[task_idx, step] = value.item()
                    else:
                        observations[task_idx, step + 1, :] = ptu.get_numpy(next_obs[0, :obs_size])
                        log_probs[task_idx, step] = ptu.get_numpy(log_prob[0])

                    if "is_goal_state" in dir(self.env.unwrapped) and self.env.unwrapped.is_goal_state():
                        success_rate[task_idx] = 1.
                    # set: obs <- next_obs
                    obs = next_obs.clone()
                    step += 1

                returns_per_episode[task_idx, episode_idx] = running_reward

        if self.args.policy == 'dqn':
            return returns_per_episode, success_rate, values
        else:
            return returns_per_episode, success_rate, log_probs, observations
Example #5
def vis_rew_pred(args, rew_pred_arr, goal, **kwargs):
    env = gym.make(args.env_name)
    if args.env_name.startswith('GridNavi'):
        fig = plt.figure(figsize=(6, 6))
    else:  # 'TwoRooms'
        fig = plt.figure(figsize=(12, 6))

    ax = plt.gca()
    cmap = plt.cm.viridis
    for state in env.states:
        cell = Rectangle((state[0], state[1]),
                         width=1,
                         height=1,
                         fc=cmap(rew_pred_arr[ptu.get_numpy(
                             env.task_to_id(ptu.FloatTensor(state)))[0]]))
        ax.add_patch(cell)
        ax.text(state[0] + 0.5,
                state[1] + 0.5,
                rew_pred_arr[ptu.get_numpy(
                    env.task_to_id(ptu.FloatTensor(state)))[0]],
                ha="center",
                va="center",
                color="w")

    plt.xlim(env.observation_space.low[0] - 0.1,
             env.observation_space.high[0] + 1 + 0.1)
    plt.ylim(env.observation_space.low[1] - 0.1,
             env.observation_space.high[1] + 1 + 0.1)

    # add goal's position on grid
    line = Line2D([goal[0] + 0.3, goal[0] + 0.7],
                  [goal[1] + 0.3, goal[1] + 0.7],
                  lw=5,
                  color='black',
                  axes=ax)
    ax.add_line(line)
    line = Line2D([goal[0] + 0.3, goal[0] + 0.7],
                  [goal[1] + 0.7, goal[1] + 0.3],
                  lw=5,
                  color='black',
                  axes=ax)
    ax.add_line(line)
    if 'title' in kwargs:
        plt.title(kwargs['title'])

    if args.env_name.startswith('GridNavi'):
        ax.axis('equal')

    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.tick_params(axis='both', which='both', length=0)

    fig.tight_layout()
    return fig
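The goal marker above is drawn as an "X" from two crossed Line2D segments inside the goal cell. A standalone matplotlib sketch of that trick, with a made-up goal cell and grid limits:

import matplotlib
matplotlib.use('Agg')  # headless backend so the sketch runs in scripts
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Rectangle

fig, ax = plt.subplots(figsize=(4, 4))
goal = (2, 3)  # hypothetical goal cell
ax.add_patch(Rectangle(goal, width=1, height=1, fc='lightgray'))
ax.add_line(Line2D([goal[0] + 0.3, goal[0] + 0.7],
                   [goal[1] + 0.3, goal[1] + 0.7], lw=5, color='black'))
ax.add_line(Line2D([goal[0] + 0.3, goal[0] + 0.7],
                   [goal[1] + 0.7, goal[1] + 0.3], lw=5, color='black'))
ax.set_xlim(0, 5)
ax.set_ylim(0, 5)
fig.savefig('goal_marker.png')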
Example #6
def relabel_rollout(env, goal, observations, actions):
    env.set_goal(goal)
    obs_np = observations if type(observations) is np.ndarray else ptu.get_numpy(observations)
    act_np = actions if type(actions) is np.ndarray else ptu.get_numpy(actions)
    rewards = [env.reward(obs, action) for obs, action in zip(obs_np, act_np)]
    if type(observations) is np.ndarray:
        return np.vstack(rewards)
    else:
        return ptu.FloatTensor(np.vstack(rewards))
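relabel_rollout recomputes rewards for stored transitions under a new goal (hindsight-style relabeling). The same idea in plain NumPy, with a toy environment whose set_goal/reward interface is an assumption for illustration only:

import numpy as np

class ToyGoalEnv:
    def set_goal(self, goal):
        self.goal = np.asarray(goal, dtype=np.float32)

    def reward(self, obs, action):
        # sparse reward: 1 if the (x, y) position is within 0.5 of the goal
        return float(np.linalg.norm(obs[:2] - self.goal) < 0.5)

env = ToyGoalEnv()
env.set_goal([0.0, 0.0])
observations = np.random.randn(10, 2).astype(np.float32)
actions = np.zeros((10, 2), dtype=np.float32)
rewards = np.vstack([env.reward(o, a) for o, a in zip(observations, actions)])
print(rewards.shape)  # (10, 1)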
Example #7
    def collect_rollouts(self, num_rollouts, random_actions=False):
        '''
        Collect rollouts in the environment and add them to the policy buffer.

        :param num_rollouts: number of rollouts to collect
        :param random_actions: if True, sample actions uniformly from the
            action space instead of querying the policy
        :return:
        '''

        for rollout in range(num_rollouts):
            obs = ptu.from_numpy(self.env.reset(self.task_idx))
            obs = obs.reshape(-1, obs.shape[-1])
            done_rollout = False

            while not done_rollout:
                if random_actions:
                    if self.args.policy == 'dqn':
                        action = ptu.FloatTensor([[[self.env.action_space.sample()]]]).long()   # Sample random action
                    else:
                        action = ptu.FloatTensor([self.env.action_space.sample()])  # Sample random action
                else:
                    if self.args.policy == 'dqn':
                        action, _ = self.agent.act(obs=obs)   # DQN
                    else:
                        action, _, _, _ = self.agent.act(obs=obs)   # SAC
                # observe reward and next obs
                next_obs, reward, done, info = utl.env_step(self.env, action.squeeze(dim=0))
                done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True

                # add data to policy buffer - (s+, a, r, s'+, term)
                term = self.env.unwrapped.is_goal_state() if "is_goal_state" in dir(self.env.unwrapped) else False
                if self.args.dense_train_sparse_test:
                    rew_to_buffer = {rew_type: rew for rew_type, rew in info.items()
                                     if rew_type.startswith('reward')}
                else:
                    rew_to_buffer = ptu.get_numpy(reward.squeeze(dim=0))
                self.policy_storage.add_sample(task=self.task_idx,
                                               observation=ptu.get_numpy(obs.squeeze(dim=0)),
                                               action=ptu.get_numpy(action.squeeze(dim=0)),
                                               reward=rew_to_buffer,
                                               terminal=np.array([term], dtype=float),
                                               next_observation=ptu.get_numpy(next_obs.squeeze(dim=0)))

                # set: obs <- next_obs
                obs = next_obs.clone()

                # update statistics
                self._n_env_steps_total += 1
                if "is_goal_state" in dir(self.env.unwrapped) and self.env.unwrapped.is_goal_state():  # count successes
                    self._successes_in_buffer += 1
            self._n_rollouts_total += 1
Example #8
def evaluate_vae(encoder, decoder, actions, rewards, states):
    '''
    Roll the encoder over a batch of trajectories and decode a reward
    prediction at every step.

    :param encoder: RNN encoder network
    :param decoder: reward decoder
    :param actions: array of actions of shape (T, batch, action_dim)
    :param rewards: array of rewards of shape (T, batch, 1)
    :param states: array of states of shape (T, batch, state_dim)
    :return: per-step means, logvars, hidden states and reward predictions
    '''

    if actions.dim() != 3:
        actions = actions.unsqueeze(dim=0)
        states = states.unsqueeze(dim=0)
        rewards = rewards.unsqueeze(dim=0)

    T, batch_size, _ = actions.size()

    means, logvars, hidden_states, reward_preds = [], [], [], []
    with torch.no_grad():
        task_sample, task_mean, task_logvar, hidden_state = encoder.prior(
            batch_size)
    means.append(task_mean)
    logvars.append(task_logvar)
    hidden_states.append(hidden_state)
    reward_preds.append(ptu.get_numpy(decoder(task_sample, None)))

    for action, reward, state in zip(actions, rewards, states):
        action = action.unsqueeze(dim=0)
        state = state.unsqueeze(dim=0)
        reward = reward.unsqueeze(dim=0)
        with torch.no_grad():
            task_sample, task_mean, task_logvar, hidden_state = encoder(
                actions=action.float(),
                states=state,
                rewards=reward,
                hidden_state=hidden_state,
                return_prior=False)
        means.append(task_mean.unsqueeze(dim=0))
        logvars.append(task_logvar.unsqueeze(dim=0))
        hidden_states.append(hidden_state)
        reward_preds.append(
            ptu.get_numpy(decoder(task_sample.unsqueeze(dim=0), None)))

    means = torch.cat(means, dim=0)
    logvars = torch.cat(logvars, dim=0)
    hidden_states = torch.cat(hidden_states, dim=0)
    reward_preds = np.vstack(reward_preds)
    return means, logvars, hidden_states, reward_preds
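evaluate_vae feeds one transition at a time into the recurrent encoder under torch.no_grad(), collecting per-step posterior parameters. A minimal stand-in with a toy GRU encoder and linear head (not the project's actual modules):

import torch
import torch.nn as nn

T, batch, in_dim, hid_dim, latent_dim = 6, 2, 8, 16, 5
rnn = nn.GRU(input_size=in_dim, hidden_size=hid_dim)
to_mean = nn.Linear(hid_dim, latent_dim)

inputs = torch.randn(T, batch, in_dim)   # stand-in for (s, a, r) features per step
hidden = torch.zeros(1, batch, hid_dim)  # prior hidden state
means = []
with torch.no_grad():
    for x_t in inputs:                               # x_t: (batch, in_dim)
        out, hidden = rnn(x_t.unsqueeze(0), hidden)  # one RNN step
        means.append(to_mean(out))                   # (1, batch, latent_dim)
means = torch.cat(means, dim=0)                      # (T, batch, latent_dim)
print(means.shape)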
Example #9
def trajectories_to_batch(dataset):
    traj_dataset = []
    for batch in dataset:
        obs, actions, rewards, next_obs, terminals = batch
        obs = ptu.get_numpy(obs.transpose(0, 1).reshape(-1, obs.shape[-1]))
        actions = ptu.get_numpy(
            actions.transpose(0, 1).reshape(-1, actions.shape[-1]))
        rewards = ptu.get_numpy(
            rewards.transpose(0, 1).reshape(-1, rewards.shape[-1]))
        next_obs = ptu.get_numpy(
            next_obs.transpose(0, 1).reshape(-1, next_obs.shape[-1]))
        terminals = ptu.get_numpy(
            terminals.transpose(0, 1).reshape(-1, terminals.shape[-1]))
        traj_dataset.append([obs, actions, rewards, next_obs, terminals])
    return traj_dataset
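trajectories_to_batch flattens (T, n_trajs, dim) tensors into a single (n_trajs * T, dim) batch, transposing first so each trajectory's steps stay contiguous. A quick shape check of that pattern:

import torch

T, B, D = 5, 3, 2
x = torch.arange(T * B * D, dtype=torch.float32).reshape(T, B, D)
flat = x.transpose(0, 1).reshape(-1, D)   # (B * T, D)
assert flat.shape == (B * T, D)
# the first T rows of `flat` are exactly the T steps of trajectory 0
assert torch.equal(flat[:T], x[:, 0, :])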
Example #10
def visualize_latent_space(latent_dim, n_samples, decoder):
    from sklearn.manifold import TSNE
    latents = ptu.FloatTensor(sample_random_normal(latent_dim, n_samples))

    pred_rewards = ptu.get_numpy(decoder(latents, None))
    goal_locations = np.argmax(pred_rewards, axis=-1)

    # embed to lower dim space - if dim > 2
    if latent_dim > 2:
        tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
        tsne_results = tsne.fit_transform(ptu.get_numpy(latents))

    # create DataFrame (sklearn and pandas expect numpy arrays, not device tensors)
    data = tsne_results if latent_dim > 2 else ptu.get_numpy(latents)

    df = pd.DataFrame(data, columns=['x1', 'x2'])
    df["y"] = goal_locations

    fig = plt.figure(figsize=(6, 6))
    sns.scatterplot(x="x1",
                    y="x2",
                    hue="y",
                    s=30,
                    palette=sns.color_palette("hls", len(np.unique(df["y"]))),
                    data=df,
                    legend="full",
                    ax=plt.gca())
    fig.show()

    return data, goal_locations
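When the latent dimension exceeds two, visualize_latent_space projects the samples with t-SNE before plotting. A minimal, self-contained t-SNE call on random 5-D latents (shapes here are illustrative):

import numpy as np
from sklearn.manifold import TSNE

latents = np.random.randn(200, 5).astype(np.float32)  # 5-D latent samples
tsne = TSNE(n_components=2, perplexity=40, init='random')
embedded = tsne.fit_transform(latents)                 # (200, 2) embedding
print(embedded.shape)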
Example #11
def predict_rewards(learner, means, logvars):
    reward_preds = ptu.zeros([means.shape[0], learner.env.num_states])
    for t in range(reward_preds.shape[0]):
        task_samples = learner.vae.encoder._sample_gaussian(
            ptu.FloatTensor(means[t]), ptu.FloatTensor(logvars[t]), num=50)
        reward_preds[t, :] = learner.vae.reward_decoder(
            ptu.FloatTensor(task_samples), None).mean(dim=0).detach()

    return ptu.get_numpy(reward_preds)
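predict_rewards averages decoder predictions over samples drawn from the encoder's Gaussian posterior. A sketch of that Monte-Carlo step with a toy linear decoder (the decoder and dimensions are assumptions):

import torch

latent_dim, num_states, num_samples = 5, 25, 50
mean = torch.zeros(latent_dim)
logvar = torch.zeros(latent_dim)                    # unit-variance posterior
decoder = torch.nn.Linear(latent_dim, num_states)   # toy reward decoder

std = torch.exp(0.5 * logvar)
samples = mean + std * torch.randn(num_samples, latent_dim)  # reparameterized draws
with torch.no_grad():
    reward_pred = decoder(samples).mean(dim=0)               # (num_states,)
print(reward_pred.shape)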
Example #12
def env_step(env, action):
    # action should be of size: batch x 1
    action = ptu.get_numpy(action.squeeze(dim=-1))
    next_obs, reward, done, info = env.step(action)
    # move to torch
    next_obs = ptu.from_numpy(next_obs).view(-1, next_obs.shape[0])
    reward = ptu.FloatTensor([reward]).view(-1, 1)
    done = ptu.from_numpy(np.array(done, dtype=int)).view(-1, 1)

    return next_obs, reward, done, info
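env_step assumes the classic Gym API in which step returns a 4-tuple. A hypothetical usage with a stub environment, assuming env_step above and the ptu helpers it relies on are in scope:

import numpy as np
import torch

class StubEnv:
    def step(self, action):
        next_obs = np.zeros(3, dtype=np.float32)
        reward, done, info = 0.0, False, {}
        return next_obs, reward, done, info

action = torch.zeros(1, 1)  # batch x 1, as the comment above requires
next_obs, reward, done, info = env_step(StubEnv(), action)
print(next_obs.shape, reward.shape, done.shape)
# torch.Size([1, 3]) torch.Size([1, 1]) torch.Size([1, 1])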
def eval_vae(dataset, vae, args):

    num_tasks = len(dataset)
    reward_preds = np.zeros((num_tasks, args.trajectory_len))
    rewards = np.zeros((num_tasks, args.trajectory_len))
    random_tasks = np.random.choice(len(dataset), 10)  # which tasks to evaluate
    states, actions = get_heatmap_params()
    state_preds = np.zeros((num_tasks, states.shape[0]))

    for task_idx, task in enumerate(random_tasks):
        traj_idx_random = np.random.choice(dataset[task][0].shape[1])  # which trajectory to evaluate
        # get prior parameters
        with torch.no_grad():
            task_sample, task_mean, task_logvar, hidden_state = vae.encoder.prior(batch_size=1)
        for step in range(args.trajectory_len):
            # update encoding
            task_sample, task_mean, task_logvar, hidden_state = utl.update_encoding(
                encoder=vae.encoder,
                obs=ptu.FloatTensor(dataset[task][3][step, traj_idx_random]).unsqueeze(0),
                action=ptu.FloatTensor(dataset[task][1][step, traj_idx_random]).unsqueeze(0),
                reward=ptu.FloatTensor(dataset[task][2][step, traj_idx_random]).unsqueeze(0),
                done=ptu.FloatTensor(dataset[task][4][step, traj_idx_random]).unsqueeze(0),
                hidden_state=hidden_state
            )

            rewards[task_idx, step] = dataset[task][2][step, traj_idx_random].item()
            reward_preds[task_idx, step] = ptu.get_numpy(
                vae.reward_decoder(task_sample.unsqueeze(0),
                                   ptu.FloatTensor(dataset[task][3][step, traj_idx_random]).unsqueeze(0).unsqueeze(0),
                                   ptu.FloatTensor(dataset[task][0][step, traj_idx_random]).unsqueeze(0).unsqueeze(0),
                                   ptu.FloatTensor(dataset[task][1][step, traj_idx_random]).unsqueeze(0).unsqueeze(0))[0, 0])

        states, actions = get_heatmap_params()
        prediction = ptu.get_numpy(vae.state_decoder(task_sample.expand((1, 30, task_sample.shape[-1])),
                                                     ptu.FloatTensor(states).unsqueeze(0),
                                                     ptu.FloatTensor(actions).unsqueeze(0))).squeeze()
        for i in range(30):
            state_preds[task_idx, i] = 1 if np.linalg.norm(prediction[i, :]) > 1 else 0

    return rewards, reward_preds, state_preds, random_tasks
Example #14
def eval_vae(dataset, vae, args):

    num_tasks = len(dataset)
    reward_preds = np.zeros((num_tasks, args.trajectory_len))
    rewards = np.zeros((num_tasks, args.trajectory_len))
    random_tasks = np.random.choice(
        len(dataset), NUM_EVAL_TASKS)  # which tasks to evaluate

    for task_idx, task in enumerate(random_tasks):
        traj_idx_random = np.random.choice(
            dataset[task][0].shape[1])  # which trajectory to evaluate
        # traj_idx_random = np.random.choice(np.min([d[0].shape[1] for d in dataset]))
        # get prior parameters
        with torch.no_grad():
            task_sample, task_mean, task_logvar, hidden_state = vae.encoder.prior(
                batch_size=1)
        for step in range(args.trajectory_len):
            # update encoding
            task_sample, task_mean, task_logvar, hidden_state = utl.update_encoding(
                encoder=vae.encoder,
                obs=ptu.FloatTensor(
                    dataset[task][3][step, traj_idx_random]).unsqueeze(0),
                action=ptu.FloatTensor(
                    dataset[task][1][step, traj_idx_random]).unsqueeze(0),
                reward=ptu.FloatTensor(
                    dataset[task][2][step, traj_idx_random]).unsqueeze(0),
                done=ptu.FloatTensor(
                    dataset[task][4][step, traj_idx_random]).unsqueeze(0),
                hidden_state=hidden_state)

            rewards[task_idx, step] = dataset[task][2][step,
                                                       traj_idx_random].item()
            reward_preds[task_idx, step] = ptu.get_numpy(
                vae.reward_decoder(
                    task_sample.unsqueeze(0),
                    ptu.FloatTensor(dataset[task][3][
                        step, traj_idx_random]).unsqueeze(0).unsqueeze(0),
                    ptu.FloatTensor(dataset[task][0][
                        step, traj_idx_random]).unsqueeze(0).unsqueeze(0),
                    ptu.FloatTensor(dataset[task][1][
                        step, traj_idx_random]).unsqueeze(0).unsqueeze(0))[0,
                                                                           0])

    return rewards, reward_preds
Example #15
def np_ify(tensor_or_other):
    if isinstance(tensor_or_other, Variable):
        return ptu.get_numpy(tensor_or_other)
    else:
        return tensor_or_other
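np_ify is a convenience wrapper: tensors are converted, everything else passes through. A usage sketch, assuming np_ify and its ptu/Variable imports are in scope (in recent PyTorch every Tensor satisfies isinstance(x, Variable)):

import numpy as np
import torch

arr = np.ones(3)
tensor = torch.ones(3)
print(type(np_ify(arr)))     # <class 'numpy.ndarray'> (returned unchanged)
print(type(np_ify(tensor)))  # <class 'numpy.ndarray'> (converted via ptu.get_numpy)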
Example #16
def load_dataset(data_dir,
                 args,
                 num_tasks=None,
                 allow_dense_data_loading=True,
                 arr_type='tensor'):
    dataset = []
    env_dir = args.env_name.replace('Sparse', '') \
        if 'dense_train_sparse_test' in args and \
           args.dense_train_sparse_test is True and \
           allow_dense_data_loading \
        else args.env_name
    exps_dir = os.path.join(args.main_data_dir, env_dir, data_dir)
    goals = []
    all_dirs = os.listdir(exps_dir)
    if num_tasks is None:
        tasks = np.random.permutation(len(all_dirs))
    else:
        tasks = np.random.choice(len(all_dirs), num_tasks)
    for i, task in enumerate(tasks):
        exp_dir = os.path.join(exps_dir, all_dirs[task])
        goals.append(extract_goal_from_path(all_dirs[task]))
        if 'rewards.npy' not in os.listdir(exp_dir):
            print('rewards.npy file doesn\'t exist. Creating it..')
            env = make_env(args.env_name,
                           args.max_rollouts_per_task,
                           n_tasks=1)
            create_rewards_arr(env, path=exp_dir)
            print('Created rewards.npy file.')
        obs, actions, rewards, next_obs, terminals = load_transitions(exp_dir)

        if obs.dim() < 3:
            obs = obs.reshape(-1, args.trajectory_len,
                              obs.shape[-1]).transpose(0, 1)
            actions = actions.reshape(-1, args.trajectory_len,
                                      actions.shape[-1]).transpose(0, 1)
            rewards = rewards.reshape(-1, args.trajectory_len,
                                      rewards.shape[-1]).transpose(0, 1)
            next_obs = next_obs.reshape(-1, args.trajectory_len,
                                        next_obs.shape[-1]).transpose(0, 1)
            terminals = terminals.reshape(-1, args.trajectory_len,
                                          terminals.shape[-1]).transpose(0, 1)
            if args.num_trajs_per_task is not None:
                obs = obs[:, :args.num_trajs_per_task, :]
                actions = actions[:, :args.num_trajs_per_task, :]
                rewards = rewards[:, :args.num_trajs_per_task, :]
                next_obs = next_obs[:, :args.num_trajs_per_task, :]
                terminals = terminals[:, :args.num_trajs_per_task, :]
        else:
            if args.num_trajs_per_task is not None:
                obs = obs[:, :args.num_trajs_per_task, :]
                actions = actions[:, :args.num_trajs_per_task, :]
                rewards = rewards[:, :args.num_trajs_per_task, :]
                next_obs = next_obs[:, :args.num_trajs_per_task, :]
                terminals = terminals[:, :args.num_trajs_per_task, :]
            obs = obs.transpose(0, 1).reshape(-1, obs.shape[-1])
            actions = actions.transpose(0, 1).reshape(-1, actions.shape[-1])
            rewards = rewards.transpose(0, 1).reshape(-1, rewards.shape[-1])
            next_obs = next_obs.transpose(0, 1).reshape(-1, next_obs.shape[-1])
            terminals = terminals.transpose(0,
                                            1).reshape(-1, terminals.shape[-1])

        if arr_type == 'numpy':
            obs = ptu.get_numpy(obs)
            actions = ptu.get_numpy(actions)
            rewards = ptu.get_numpy(rewards)
            next_obs = ptu.get_numpy(next_obs)
            terminals = ptu.get_numpy(terminals)

        dataset.append([obs, actions, rewards, next_obs, terminals])
        # print(exp_dir)
        # print('Obs shape: ' + str(np.shape(dataset[-1][0])) +
        #       '. Act shape: ' + str(np.shape(dataset[-1][1])) +
        #       '. Reward shape: ' + str(np.shape(dataset[-1][2])) +
        #       '. Next obs shape: ' + str(np.shape(dataset[-1][3])))
    print('{} experiments loaded.'.format(i + 1))
    goals = np.vstack(goals)

    return dataset, goals
Example #17
    def evaluate(self):
        num_episodes = self.args.max_rollouts_per_task
        num_steps_per_episode = self.env.unwrapped._max_episode_steps
        num_tasks = self.args.num_eval_tasks
        obs_size = self.env.unwrapped.observation_space.shape[0]

        returns_per_episode = np.zeros((num_tasks, num_episodes))
        success_rate = np.zeros(num_tasks)

        rewards = np.zeros((num_tasks, self.args.trajectory_len))
        reward_preds = np.zeros((num_tasks, self.args.trajectory_len))
        observations = np.zeros(
            (num_tasks, self.args.trajectory_len + 1, obs_size))
        if self.args.policy == 'sac':
            log_probs = np.zeros((num_tasks, self.args.trajectory_len))

        # This part is very specific for the Semi-Circle env
        # if self.args.env_name == 'PointRobotSparse-v0':
        #     reward_belief = np.zeros((num_tasks, self.args.trajectory_len))
        #
        #     low_x, high_x, low_y, high_y = -2., 2., -1., 2.
        #     resolution = 0.1
        #     grid_x = np.arange(low_x, high_x + resolution, resolution)
        #     grid_y = np.arange(low_y, high_y + resolution, resolution)
        #     centers_x = (grid_x[:-1] + grid_x[1:]) / 2
        #     centers_y = (grid_y[:-1] + grid_y[1:]) / 2
        #     yv, xv = np.meshgrid(centers_y, centers_x, sparse=False, indexing='ij')
        #     centers = np.vstack([xv.ravel(), yv.ravel()]).T
        #     n_grid_points = centers.shape[0]
        #     reward_belief_discretized = np.zeros((num_tasks, self.args.trajectory_len, centers.shape[0]))

        for task_loop_i, task in enumerate(
                self.env.unwrapped.get_all_eval_task_idx()):
            obs = ptu.from_numpy(self.env.reset(task))
            obs = obs.reshape(-1, obs.shape[-1])
            step = 0

            # get prior parameters
            with torch.no_grad():
                task_sample, task_mean, task_logvar, hidden_state = self.vae.encoder.prior(
                    batch_size=1)

            observations[task_loop_i,
                         step, :] = ptu.get_numpy(obs[0, :obs_size])

            for episode_idx in range(num_episodes):
                running_reward = 0.
                for step_idx in range(num_steps_per_episode):
                    # add distribution parameters to observation - policy is conditioned on posterior
                    augmented_obs = self.get_augmented_obs(
                        obs, task_mean, task_logvar)
                    if self.args.policy == 'dqn':
                        action, value = self.agent.act(obs=augmented_obs,
                                                       deterministic=True)
                    else:
                        action, _, _, log_prob = self.agent.act(
                            obs=augmented_obs,
                            deterministic=self.args.eval_deterministic,
                            return_log_prob=True)

                    # observe reward and next obs
                    next_obs, reward, done, info = utl.env_step(
                        self.env, action.squeeze(dim=0))
                    running_reward += reward.item()
                    # done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True
                    # update encoding
                    task_sample, task_mean, task_logvar, hidden_state = self.update_encoding(
                        obs=next_obs,
                        action=action,
                        reward=reward,
                        done=done,
                        hidden_state=hidden_state)
                    rewards[task_loop_i, step] = reward.item()
                    reward_preds[task_loop_i, step] = ptu.get_numpy(
                        self.vae.reward_decoder(task_sample, next_obs, obs,
                                                action)[0, 0])

                    # This part is very specific for the Semi-Circle env
                    # if self.args.env_name == 'PointRobotSparse-v0':
                    #     reward_belief[task, step] = ptu.get_numpy(
                    #         self.vae.compute_belief_reward(task_mean, task_logvar, obs, next_obs, action)[0])
                    #
                    #     reward_belief_discretized[task, step, :] = ptu.get_numpy(
                    #         self.vae.compute_belief_reward(task_mean.repeat(n_grid_points, 1),
                    #                                        task_logvar.repeat(n_grid_points, 1),
                    #                                        None,
                    #                                        torch.cat((ptu.FloatTensor(centers),
                    #                                                   ptu.zeros(centers.shape[0], 1)), dim=-1).unsqueeze(0),
                    #                                        None)[:, 0])

                    observations[task_loop_i, step + 1, :] = ptu.get_numpy(
                        next_obs[0, :obs_size])
                    if self.args.policy != 'dqn':
                        log_probs[task_loop_i,
                                  step] = ptu.get_numpy(log_prob[0])

                    if "is_goal_state" in dir(
                            self.env.unwrapped
                    ) and self.env.unwrapped.is_goal_state():
                        success_rate[task_loop_i] = 1.
                    # set: obs <- next_obs
                    obs = next_obs.clone()
                    step += 1

                returns_per_episode[task_loop_i, episode_idx] = running_reward

        if self.args.policy == 'dqn':
            return returns_per_episode, success_rate, observations, rewards, reward_preds
        # This part is very specific for the Semi-Circle env
        # elif self.args.env_name == 'PointRobotSparse-v0':
        #     return returns_per_episode, success_rate, log_probs, observations, \
        #            rewards, reward_preds, reward_belief, reward_belief_discretized, centers
        else:
            return returns_per_episode, success_rate, log_probs, observations, rewards, reward_preds
Example #18
def rollout_policy(env, learner):
    is_vae_exist = "vae" in dir(learner)

    observations = []
    actions = []
    rewards = []
    values = []
    if is_vae_exist:
        latent_samples = []
        latent_means = []
        latent_logvars = []

    obs = ptu.from_numpy(env.reset())
    obs = obs.reshape(-1, obs.shape[-1])
    observations.append(obs)
    done_rollout = False
    if is_vae_exist:
        # get prior parameters
        with torch.no_grad():
            task_sample, task_mean, task_logvar, hidden_state = learner.vae.encoder.prior(
                batch_size=1)
        # store
        latent_samples.append(ptu.get_numpy(task_sample[0, 0]))
        latent_means.append(ptu.get_numpy(task_mean[0, 0]))
        latent_logvars.append(ptu.get_numpy(task_logvar[0, 0]))

    while not done_rollout:
        if is_vae_exist:
            # add distribution parameters to observation - policy is conditioned on posterior
            augmented_obs = learner.get_augmented_obs(obs=obs,
                                                      task_mu=task_mean,
                                                      task_std=task_logvar)
            with torch.no_grad():
                action, value = learner.agent.act(obs=augmented_obs,
                                                  deterministic=True)
        else:
            action, _, _, _ = learner.agent.act(obs=obs)
            value = None  # non-critic agents return no value estimate; keep the lists aligned

        # observe reward and next obs
        next_obs, reward, done, info = utl.env_step(env, action.squeeze(dim=0))
        # store
        observations.append(next_obs)
        actions.append(action)
        values.append(value)
        rewards.append(reward.item())
        done_rollout = False if ptu.get_numpy(done[0][0]) == 0. else True

        if is_vae_exist:
            # update encoding
            task_sample, task_mean, task_logvar, hidden_state = learner.vae.encoder(
                action,
                next_obs,
                reward.reshape((1, 1)),
                hidden_state,
                return_prior=False)

            # values.append(value.item())
            latent_samples.append(ptu.get_numpy(task_sample[0]))
            latent_means.append(ptu.get_numpy(task_mean[0]))
            latent_logvars.append(ptu.get_numpy(task_logvar[0]))
        # set: obs <- next_obs
        obs = next_obs.clone()
    if is_vae_exist:
        return observations, actions, rewards, values, \
               latent_samples, latent_means, latent_logvars
    else:
        return observations, actions, rewards, values
Example #19
    def get_param_values_np(self):
        state_dict = self.state_dict()
        np_dict = OrderedDict()
        for key, tensor in state_dict.items():
            np_dict[key] = ptu.get_numpy(tensor)
        return np_dict
Example #20
    def new_add_scalar(key, value, step):
        if isinstance(value, torch.Tensor):
            value = ptu.get_numpy(value)
        old_add_scalar(key, value, step)
        logger.record_tabular(key, value)
Example #21
    def collect_rollouts(self, num_rollouts, random_actions=False):
        '''
        Collect rollouts with belief-augmented observations and add them to
        the policy buffer.

        :param num_rollouts: number of rollouts to collect
        :param random_actions: if True, sample actions uniformly from the
            action space instead of querying the policy
        :return:
        '''

        for rollout in range(num_rollouts):
            obs = ptu.from_numpy(self.env.reset(self.task_idx))
            obs = obs.reshape(-1, obs.shape[-1])
            done_rollout = False
            # self.policy_storage.reset_running_episode(self.task_idx)

            # if self.args.fixed_latent_params:
            #     assert 2 ** self.args.task_embedding_size >= self.args.num_tasks
            #     task_mean = ptu.FloatTensor(utl.vertices(self.args.task_embedding_size)[self.task_idx])
            #     task_logvar = -2. * ptu.ones_like(task_logvar)   # arbitrary negative enough number
            # add distribution parameters to observation - policy is conditioned on posterior
            augmented_obs = self.get_augmented_obs(obs=obs)

            while not done_rollout:
                if random_actions:
                    if self.args.policy == 'dqn':
                        action = ptu.FloatTensor([[
                            self.env.action_space.sample()
                        ]]).type(torch.long)  # Sample random action
                    else:
                        action = ptu.FloatTensor(
                            [self.env.action_space.sample()])
                else:
                    if self.args.policy == 'dqn':
                        action, _ = self.agent.act(obs=augmented_obs)  # DQN
                    else:
                        action, _, _, _ = self.agent.act(
                            obs=augmented_obs)  # SAC
                # observe reward and next obs
                next_obs, reward, done, info = utl.env_step(
                    self.env, action.squeeze(dim=0))
                done_rollout = False if ptu.get_numpy(
                    done[0][0]) == 0. else True

                # get augmented next obs
                augmented_next_obs = self.get_augmented_obs(obs=next_obs)

                # add data to policy buffer - (s+, a, r, s'+, term)
                term = self.env.unwrapped.is_goal_state(
                ) if "is_goal_state" in dir(self.env.unwrapped) else False
                self.policy_storage.add_sample(
                    task=self.task_idx,
                    observation=ptu.get_numpy(augmented_obs.squeeze(dim=0)),
                    action=ptu.get_numpy(action.squeeze(dim=0)),
                    reward=ptu.get_numpy(reward.squeeze(dim=0)),
                    terminal=np.array([term], dtype=float),
                    next_observation=ptu.get_numpy(
                        augmented_next_obs.squeeze(dim=0)))
                if not random_actions:
                    self.current_experience_storage.add_sample(
                        task=self.task_idx,
                        observation=ptu.get_numpy(
                            augmented_obs.squeeze(dim=0)),
                        action=ptu.get_numpy(action.squeeze(dim=0)),
                        reward=ptu.get_numpy(reward.squeeze(dim=0)),
                        terminal=np.array([term], dtype=float),
                        next_observation=ptu.get_numpy(
                            augmented_next_obs.squeeze(dim=0)))

                # set: obs <- next_obs
                obs = next_obs.clone()
                augmented_obs = augmented_next_obs.clone()

                # update statistics
                self._n_env_steps_total += 1
                if "is_goal_state" in dir(
                        self.env.unwrapped
                ) and self.env.unwrapped.is_goal_state():  # count successes
                    self._successes_in_buffer += 1
            self._n_rollouts_total += 1
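Several of these rollout collectors condition the policy on the posterior by appending the belief parameters to the raw observation. get_augmented_obs is project code; the snippet below is a guess at its core operation, with made-up dimensions:

import torch

obs = torch.randn(1, 4)          # raw observation, batch of 1
task_mean = torch.randn(1, 5)    # posterior mean over the latent task
task_logvar = torch.randn(1, 5)  # posterior log-variance

augmented_obs = torch.cat((obs, task_mean, task_logvar), dim=-1)  # (1, 14)
print(augmented_obs.shape)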
Example #22
    def collect_rollouts(self):
        self.training_mode(False)
        num_episodes = self.args.max_rollouts_per_task
        num_steps_per_episode = self.env.unwrapped._max_episode_steps
        num_tasks = self.args.num_eval_tasks
        obs_size = self.env.unwrapped.observation_space.shape[0]

        returns_per_episode = np.zeros((num_tasks, num_episodes))
        success_rate = np.zeros(num_tasks)

        rewards = np.zeros((num_tasks, self.args.trajectory_len))
        observations = np.zeros(
            (num_tasks, self.args.trajectory_len + 1, obs_size))
        actions = np.zeros(
            (num_tasks, self.args.trajectory_len, self.args.action_dim))

        log_probs = np.zeros((num_tasks, self.args.trajectory_len))

        for task in self.env.unwrapped.get_all_task_idx():
            obs = ptu.from_numpy(self.env.reset(task))
            obs = obs.reshape(-1, obs.shape[-1])
            step = 0

            # get prior parameters
            task_sample, task_mean, task_logvar, hidden_state = self.vae.encoder.prior(
                batch_size=1)

            observations[task, step, :] = ptu.get_numpy(obs[0, :obs_size])

            for episode_idx in range(num_episodes):
                running_reward = 0.
                for step_idx in range(num_steps_per_episode):
                    # add distribution parameters to observation - policy is conditioned on posterior
                    augmented_obs = self.get_augmented_obs(
                        obs, task_mean, task_logvar)
                    action, _, _, log_prob = self.agent.act(
                        obs=augmented_obs,
                        deterministic=self.args.eval_deterministic,
                        return_log_prob=True)

                    # observe reward and next obs
                    next_obs, reward, done, info = utl.env_step(
                        self.env, action.squeeze(dim=0))
                    running_reward += reward.item()

                    # update encoding
                    task_sample, task_mean, task_logvar, hidden_state = self.update_encoding(
                        obs=next_obs,
                        action=action,
                        reward=reward,
                        done=done,
                        hidden_state=hidden_state)
                    rewards[task, step] = reward.item()
                    #reward_preds[task, step] = ptu.get_numpy(
                    #    self.vae.reward_decoder(task_sample, next_obs, obs, action)[0, 0])

                    observations[task, step + 1, :] = ptu.get_numpy(
                        next_obs[0, :obs_size])
                    actions[task, step, :] = ptu.get_numpy(action[0, :])
                    log_probs[task, step] = ptu.get_numpy(log_prob[0])

                    if "is_goal_state" in dir(
                            self.env.unwrapped
                    ) and self.env.unwrapped.is_goal_state():
                        success_rate[task] = 1.
                    # set: obs <- next_obs
                    obs = next_obs.clone()
                    step += 1

                returns_per_episode[task, episode_idx] = running_reward

        return returns_per_episode, success_rate, log_probs, observations, rewards, actions