Example #1
 def get_action(self, obs):
     obs, goals, taus = split_flat_obs(obs[None],
                                       self.env.observation_space.low.size,
                                       self.env.goal_dim)
     sampled_actions = self.sample_actions()
     first_sampled_actions = sampled_actions.copy()
     actions = ptu.np_to_var(sampled_actions)
     next_obs = self.expand_np_to_var(obs[0])
     goals = self.expand_np_to_var(goals[0])
     taus = self.expand_np_to_var(taus[0])
     costs = 0
     for i in range(self.mpc_horizon):
         curr_obs = next_obs
         if i > 0:
             sampled_actions = self.sample_actions()
             actions = ptu.np_to_var(sampled_actions)
         flat_obs = merge_into_flat_obs(
             curr_obs,
             goals,
             taus,
         )
         obs_delta = self.debug_qf(flat_obs,
                                   actions,
                                   return_internal_prediction=True)
         next_obs = curr_obs + obs_delta
         next_features = self.env.convert_obs_to_goals(next_obs)
         costs += (next_features[:, :7] - goals[:, :7])**2
     costs_np = ptu.get_numpy(costs).sum(1)
     min_i = np.argmin(costs_np)
     return first_sampled_actions[min_i], {}
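
Example #1 implements random-shooting MPC: it samples candidate actions, uses the Q-network's internal prediction (return_internal_prediction=True) as a one-step dynamics model, accumulates the squared distance between the predicted goal-space features and the goal (first 7 goal dimensions) over mpc_horizon steps, re-sampling actions after the first step, and returns the first-step action with the lowest total cost. Both split_flat_obs and merge_into_flat_obs appear here; the sketch below shows the assumed layout, a plain concatenation of [observation | goal | tau] along the last axis. The real helpers in the source repository may differ, and the torch variants would use torch.cat rather than np.hstack.

import numpy as np

# Minimal sketch (assumption): a "flat obs" is [obs | goal | num_steps_left]
# concatenated along the last axis; the repository's actual helpers may differ.
def merge_into_flat_obs(obs, goals, num_steps_left):
    return np.hstack((obs, goals, num_steps_left))

def split_flat_obs(flat_obs, obs_dim, goal_dim):
    obs = flat_obs[:, :obs_dim]
    goals = flat_obs[:, obs_dim:obs_dim + goal_dim]
    num_steps_left = flat_obs[:, obs_dim + goal_dim:]
    return obs, goals, num_steps_left
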
Example #2
    def get_batch(self, training=True):
        if self.replay_buffer_is_split:
            replay_buffer = self.replay_buffer.get_replay_buffer(training)
        else:
            replay_buffer = self.replay_buffer
        batch = replay_buffer.random_batch(self.batch_size)
        """
        Update the goal states/rewards
        """
        num_steps_left = np.random.randint(0, self.max_tau + 1,
                                           (self.batch_size, 1))
        terminals = 1 - (1 - batch['terminals']) * (num_steps_left != 0)
        batch['terminals'] = terminals

        obs = batch['observations']
        next_obs = batch['next_observations']
        if self.sample_train_goals_from == 'her':
            goals = batch['goals']
        else:
            goals = self._sample_goals_for_training()
        goal_differences = np.abs(
            self.env.convert_obs_to_goals(next_obs)
            # - self.env.convert_obs_to_goals(obs)
            - goals)
        batch['goal_differences'] = goal_differences * self.reward_scale
        batch['goals'] = goals
        """
        Update the observations
        """
        batch['observations'] = merge_into_flat_obs(
            obs=batch['observations'],
            goals=batch['goals'],
            num_steps_left=num_steps_left,
        )
        batch['next_observations'] = merge_into_flat_obs(
            obs=batch['next_observations'],
            goals=batch['goals'],
            num_steps_left=num_steps_left - 1,
        )

        return np_to_pytorch_batch(batch)
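
Example #2 relabels a replay-buffer batch for TDM training: it draws a horizon num_steps_left uniformly from [0, max_tau], marks a transition terminal when the environment terminated or the sampled horizon is 0, takes goals either from the batch (HER) or from a fresh sample, stores the per-dimension distance |convert_obs_to_goals(next_obs) - goals| scaled by reward_scale, and folds goal and horizon into the observations (num_steps_left - 1 for the next observation). The terminal-relabeling expression is compact; a small self-contained numpy illustration with made-up values:

import numpy as np

# A transition is terminal if the environment terminated OR the sampled
# horizon (num_steps_left) is 0; otherwise it stays non-terminal.
env_terminals = np.array([[0], [0], [1], [0]])
num_steps_left = np.array([[3], [0], [2], [0]])
relabeled = 1 - (1 - env_terminals) * (num_steps_left != 0)
print(relabeled.ravel())  # [0 1 1 1]
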
Example #3
 def forward(self, obs, goals, taus, actions):
     flat_obs = merge_into_flat_obs(obs, goals, taus)
     if self.vf is None:
         return self.qf(flat_obs, actions)
     else:
         return self.qf(flat_obs, actions) - self.vf(flat_obs)
Example #4
def rollout(env,
            agent,
            qf,
            init_tau,
            max_path_length=np.inf,
            animated=False,
            cycle_tau=False):
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    taus = []
    o = env.reset()
    next_o = None
    path_length = 0
    tau = init_tau
    if animated:
        env.render()
    current_xy = env.convert_ob_to_goal(o)
    delta = env.multitask_goal - current_xy
    reachable_to_current_distances = []
    tdm_distances = []
    while path_length < max_path_length:
        current_xy = env.convert_ob_to_goal(o)
        # new_goal = current_xy + delta
        new_goal = env.multitask_goal
        env.set_goal(new_goal)
        agent.set_goal(new_goal)
        a, agent_info = agent.get_action(o)
        flat_obs = merge_into_flat_obs(o, new_goal, np.array([tau]))
        pred_xy = qf.eval_np(flat_obs[None],
                             a[None],
                             return_internal_prediction=True)
        tdm_values = qf.eval_np(flat_obs[None], a[None])
        reachable_goal = find_reachable_goal_state(env, o, tau, qf, agent)
        print("tau", tau)
        # print("new_goal", new_goal)
        # print("pred_xy", pred_xy)
        # print("tdm values", tdm_values)
        # print("tdm distance", np.sum(np.abs(tdm_values)))
        # print("actual distance", sum(np.abs(current_xy - new_goal)))
        print("current_xy", current_xy)
        print("reachable xy", reachable_goal)
        reachable_to_current_distances.append(
            np.linalg.norm(current_xy - reachable_goal))
        tdm_distances.append(np.sum(np.abs(tdm_values)))
        next_o, r, d, env_info = env.step(a)
        agent.set_tau(tau)
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        taus.append(np.array([tau]))
        path_length += 1
        if cycle_tau:
            tau -= 1
            if tau < 0:
                tau = init_tau
        if d:
            break
        o = next_o
        if animated:
            env.render()
            # input("Press Enter to continue...")

    plt.subplot(2, 1, 1)
    plt.plot(np.array(reachable_to_current_distances))
    plt.subplot(2, 1, 2)
    plt.plot(np.array(tdm_distances))
    plt.show()

    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (observations[1:, :], np.expand_dims(next_o, 0)))

    return dict(
        observations=observations,
        actions=actions,
        rewards=np.array(rewards).reshape(-1, 1),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=np.array(agent_infos),
        env_infos=np.array(env_infos),
        # final_observation=next_o,
        num_steps_left=np.array(taus),
    )
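
Example #4 is a diagnostic rollout: at every step it queries the TDM for its internal goal-space prediction and for a reachable goal state (find_reachable_goal_state), records the distance from the current goal-space position to that reachable state along with the summed |Q| values, and plots both curves when the path ends. With cycle_tau=True the horizon counts down each step and wraps back to init_tau; a self-contained illustration of that schedule for init_tau=3:

# The tau countdown used in the rollout loop above, shown in isolation.
init_tau, tau, schedule = 3, 3, []
for _ in range(8):
    schedule.append(tau)
    tau -= 1
    if tau < 0:
        tau = init_tau
print(schedule)  # [3, 2, 1, 0, 3, 2, 1, 0]
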
Example #5
 def forward(self, states, actions, next_states):
     taus = ptu.np_to_var(self.tau * np.ones((states.shape[0], 1)))
     goals = self.env.convert_obs_to_goals(next_states)
     flat_obs = merge_into_flat_obs(states, goals, taus)
     return self.qf(flat_obs, actions).sum(1) * self.output_scale
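
The final example's forward pass uses the TDM Q-function as an implicit model: the goal is taken to be the goal-space projection of next_states, so the summed Q output presumably measures how close (states, actions) can get to next_states within tau steps. The other examples suggest the Q-network is vectorized, one output per goal dimension, which is why the result is reduced with .sum(1); a self-contained illustration of that reduction with made-up values:

import torch

# A vectorized TDM critic returns one value per goal dimension; summing over
# dim 1 collapses them into a single score per sample.
per_dim_q = torch.tensor([[-0.5, -1.0, -0.2],
                          [-0.1, -0.3, -0.4]])
print(per_dim_q.sum(1))  # tensor([-1.7000, -0.8000])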