Example #1
    def step(self, action):
        """
        Perform a step in Gazebo. When end of episode
        is reached, reset() should be called to reset
        the environment's internal state.
        Input
        -----
        action : an action provided by the agent
        Outputs
        -------
        (observation, reward, done, info)
        observation :
            agent's observation of the current environment
        reward: float
            amount of reward due to the previous action
        done :
            a boolean, indicating whether the episode has ended
        info :
            a dictionary containing other diagnostic information
            from the previous action
        """
        self._robot.send_command(action)

        obs = self.get_observation()

        reward = self.reward(obs.achieved_goal, self.goal)
        done = self.done(obs.achieved_goal, self.goal)
        next_observation = obs.observation
        return Step(observation=next_observation, reward=reward, done=done)
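All of these examples return a `Step` value. A minimal sketch of how such a helper can be defined is shown below; it mirrors the rllab/garage-style namedtuple, and the exact definition in `garage.envs.base` may differ.

import collections

# Sketch of a Step helper (rllab/garage style); extra keyword arguments are
# collected into the info dictionary. Treat this as an illustration, not the
# exact upstream definition.
_Step = collections.namedtuple('Step', ['observation', 'reward', 'done', 'info'])


def Step(observation, reward, done, **kwargs):
    return _Step(observation, reward, done, kwargs)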
Example #2
    def step(self, action):
        """
        action map:
        0: left
        1: down
        2: right
        3: up
        :param action: should be a one-hot vector encoding the action
        :return:
        """
        possible_next_states = self.get_possible_next_states(
            self.state, action)

        probs = [x[1] for x in possible_next_states]
        next_state_idx = np.random.choice(len(probs), p=probs)
        next_state = possible_next_states[next_state_idx][0]

        next_x = next_state // self.n_col
        next_y = next_state % self.n_col

        next_state_type = self.desc[next_x, next_y]
        if next_state_type == 'H':
            done = True
            reward = 0
        elif next_state_type in ['F', 'S']:
            done = False
            reward = 0
        elif next_state_type == 'G':
            done = True
            reward = 1
        else:
            raise NotImplementedError
        self.state = next_state
        return Step(observation=self.state, reward=reward, done=done)
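The transition model behind `get_possible_next_states` is not shown in this example. Below is a hypothetical free-function version for a slippery grid world; the parameters `n_row`, `n_col` and `slip_prob` are assumptions, and it accepts either an integer action or the one-hot vector mentioned in the docstring.

import numpy as np


def get_possible_next_states(state, action, n_row, n_col, slip_prob=0.2):
    # Accept an integer index or a one-hot vector (as in the docstring above).
    a = int(np.argmax(action)) if np.ndim(action) > 0 else int(action)
    x, y = state // n_col, state % n_col
    # Row/column increments for left, down, right, up (same action map).
    increments = [(0, -1), (1, 0), (0, 1), (-1, 0)]

    def move(dx, dy):
        nx = min(max(x + dx, 0), n_row - 1)
        ny = min(max(y + dy, 0), n_col - 1)
        return nx * n_col + ny

    # Intended move with probability 1 - slip_prob, perpendicular slips otherwise.
    outcomes = [(a, 1.0 - slip_prob),
                ((a - 1) % 4, slip_prob / 2),
                ((a + 1) % 4, slip_prob / 2)]
    return [(move(*increments[act]), p) for act, p in outcomes]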
Example #3
    def step(self, action):
        """Step the environment state.

        Args:
            action (np.ndarray): The action to take in the environment.

        Returns:
            np.ndarray: Observation. The observation of the environment.
            float: Reward. The reward acquired at this time step.
            boolean: Done. Whether the environment was completed at this
                time step (always False when the environment is configured
                to never terminate).

        """
        # enforce action space
        a = action.copy()  # NOTE: we MUST copy the action before modifying it
        a = np.clip(a, self.action_space.low, self.action_space.high)

        self._point = np.clip(self._point + a, -self._arena_size,
                              self._arena_size)
        dist = np.linalg.norm(self._point - self._goal)
        done = dist < np.linalg.norm(self.action_space.low)

        # dense reward
        reward = -dist
        # done bonus
        if done:
            reward += self._done_bonus

        # sometimes we don't want to terminate
        done = done and not self._never_done

        return Step(np.copy(self._point), reward, done, task=self._task)
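A short usage sketch for any environment whose `step()` behaves like the one above. `env` and `policy` are placeholders, not objects defined in this listing; the `Step` namedtuple is assumed to expose .observation, .reward and .done as in the sketch after Example #1.

def run_episode(env, policy, max_path_length=100):
    # Roll out one episode and accumulate the reward.
    obs = env.reset()
    total_reward = 0.0
    for _ in range(max_path_length):
        step = env.step(policy(obs))
        obs = step.observation
        total_reward += step.reward
        if step.done:
            break
    return total_reward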
Example #4
    def step(self, action):
        """
        Perform a step in Gazebo. When end of episode
        is reached, reset() should be called to reset
        the environment's internal state.
        Input
        -----
        action : an action provided by the agent
        Outputs
        -------
        (observation, reward, done, info)
        observation :
            agent's observation of the current environment
        reward: float
            amount of reward due to the previous action
        done :
            a boolean, indicating whether the episode has ended
        info :
            a dictionary containing other diagnostic information
            from the previous action
        """
        assert action.shape == self.action_space.shape
        a = action.copy()  # Note: you MUST copy the action if you modify it
        a *= self._action_scale
        a = np.clip(a, self.action_space.low, self.action_space.high)
        self._robot.send_command(a)

        obs = self.get_observation()

        reward = self.reward(obs.achieved_goal, self.goal)
        reward -= self._control_cost_coeff * np.linalg.norm(a)
        done = self.done(obs.achieved_goal, self.goal)
        next_observation = obs.observation
        return Step(observation=next_observation, reward=reward, done=done)
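The copy-scale-clip pattern above can be factored into a small helper. This is an illustrative sketch, not a function from the original code.

import numpy as np


def scale_and_clip(action, scale, low, high):
    # Copy before modifying (as the example stresses), scale, then clip
    # into the action space bounds.
    a = np.array(action, dtype=float)
    a *= scale
    return np.clip(a, low, high)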
Example #5
    def step(self, action):
        r"""
        Run one timestep of the environment's dynamics. When end of episode
        is reached, reset() should be called to reset the environment's internal state.

        Parameters
        ----------
        action : array_like
            An action provided by the agent.

        Returns
        -------
        : :py:func:`garage.envs.base.Step`
            A step in the rollout.
            Contains the following information:
                - observation (array_like): Agent's observation of the current environment.
                - reward (float): Amount of reward due to the previous action.
                - done (bool): Is the current step a terminal or goal state, ending the rollout.
                - actions (array_like): The action taken at the current step.
                - state (array_like): The cloned simulator state before the action, used for resetting if this cell is chosen to start a rollout.
                - is_terminal (bool): Whether or not the current cell is a terminal state.
                - is_goal (bool): Whether or not the current cell is a goal state.
        """
        self._env_state_before_action = self._env_state.copy()

        self._action = action
        self._actions.append(action)
        action_return = self._action

        # Update simulation step
        obs = self.simulator.step(self._action)
        if (obs is None) or (self.open_loop is True) or (self.blackbox_sim_state):
            obs = np.array(self._init_state)

        if self.simulator.is_terminal() or self.simulator.is_goal():
            self._done = True

        # Calculate the reward for this step
        self._reward = self.reward_function.give_reward(
            action=self._action,
            info=self.simulator.get_reward_info())
        self._cum_reward += self._reward

        # Update instance attributes
        self._step = self._step + 1

        self._simulator_state = self.simulator.clone_state()
        self._env_state = np.concatenate((self._simulator_state,
                                          np.array([self._cum_reward]),
                                          np.array([self._step])),
                                         axis=0)

        return Step(observation=obs,
                    reward=self._reward,
                    done=self._done,
                    actions=action_return,
                    state=self._env_state_before_action,
                    is_terminal=self.simulator.is_terminal(),
                    is_goal=self.simulator.is_goal())
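The environment state packed by `np.concatenate` above can be unpacked again. A hypothetical inverse helper is sketched below; the layout simply follows the concatenation order in the example.

import numpy as np


def split_env_state(env_state):
    # Layout used above: [cloned simulator state..., cumulative reward, step count].
    env_state = np.asarray(env_state)
    simulator_state = env_state[:-2]
    cum_reward = float(env_state[-2])
    step = int(env_state[-1])
    return simulator_state, cum_reward, step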
Example #6
    def step(self, action):
        """
        Run one timestep of the environment's dynamics. When end of episode
        is reached, reset() should be called to reset the environment's internal state.
        Input
        -----
        action : an action provided by the agent
        Outputs
        -------
        (observation, reward, done, info)
        observation : agent's observation of the current environment
        reward [Float] : amount of reward due to the previous action
        done : a boolean, indicating whether the episode has ended
        info : a dictionary containing other diagnostic information from the previous action
        """
        self._action = action
        self._actions.append(action)
        action_return = self._action
        # Update simulation step
        obs = self.simulator.step(self._action)
        if (obs is None) or (self.open_loop is True) or (self.blackbox_sim_state):
            obs = np.array(self._init_state)
        # if self.simulator.is_goal():
        if self.simulator.is_terminal() or self.simulator.is_goal():
            self._done = True
        # Calculate the reward for this step
        self._reward = self.reward_function.give_reward(
            action=self._action, info=self.simulator.get_reward_info())
        self._cum_reward += self._reward
        # Update instance attributes
        self._step = self._step + 1
        self._simulator_state = self.simulator.clone_state()
        self._env_state = np.concatenate(
            (self._simulator_state, np.array([self._cum_reward]),
             np.array([self._step])),
            axis=0)
        if self._done:
            # Replay the recorded actions from the initial state as a sanity check.
            self.simulator.simulate(self._actions, self._init_state)
            if not (self.simulator.is_goal() or self.simulator.is_terminal()):
                pdb.set_trace()
        return Step(
            observation=obs,
            reward=self._reward,
            done=self._done,
            cache=self._info,
            actions=action_return,
            # step = self._step -1,
            # real_actions=self._action,
            state=self._env_state,
            # root_action=self.root_action,
            is_terminal=self.simulator.is_terminal(),
            is_goal=self.simulator.is_goal())
Example #7
    def step(self, action):
        # no clipping / rescaling of actions
        # action = np.clip(action, self.action_space.low, self.action_space.high)
        # rot_ctrl = np.array([1., 0., 1., 0.])
        # action = np.concatenate([action, rot_ctrl])
        # action, _ = np.split(action, (self.sim.model.nmocap * 7,))
        # action = action.reshape(self.sim.model.nmocap, 7)

        # pos_delta = action[:, :3]
        if self._control_method == "torque_control":
            self.forward_dynamics(action)
            self.sim.forward()
        else:
            reset_mocap2body_xpos(self.sim)
            self.sim.data.mocap_pos[:] = self.sim.data.mocap_pos + action
            for _ in range(50):
                self.sim.step()
            self._step += 1

        # obs = self.get_current_obs()
        # achieved_goal = obs['achieved_goal']
        # goal = obs['desired_goal']
        achieved_goal = self.position
        # reward = self._compute_reward(achieved_goal, goal)

        obs = self.get_obs()

        achieved_dist = self._goal_distance(achieved_goal, self._goal)
        # reward = rewards._sigmoids(self._goal_distance(achieved_goal, goal) / self._goal_distance(self.initial_pos, goal), 0., "cosine")
        # reward = 1. - achieved_dist / self._goal_distance(self._start_pos, self._goal) / 2.  # before
        # reward = 1. - achieved_dist / self._goal_distance(np.zeros(3), self._goal) / 2.
        # TODO sparse reward
        reward = 1. - achieved_dist / self._goal_distance(
            self._start_pos, self._goal)

        # print(self.initial_pos, achieved_goal)

        done = (achieved_dist < self._distance_threshold)

        if done:
            reward = 20.  # 20.

        info = {
            "episode": {
                "l": self._step,
                "r": reward,
                "d": done,
                "position": self.position,
                "task": np.copy(self.onehot)
            }
        }
        # just take gripper position as observation
        return Step(obs, reward, False, **info)
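The `_goal_distance` helper used above is not shown in this example. A common stand-in is the Euclidean distance between achieved and desired goal positions, sketched below as an assumption rather than the original implementation.

import numpy as np


def goal_distance(goal_a, goal_b):
    # Hypothetical stand-in for _goal_distance: Euclidean distance between goals.
    goal_a, goal_b = np.asarray(goal_a), np.asarray(goal_b)
    assert goal_a.shape == goal_b.shape
    return np.linalg.norm(goal_a - goal_b, axis=-1)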
Example #8
    def step(self, action):
        # enforce action space
        a = action.copy()  # NOTE: we MUST copy the action before modifying it
        a = np.clip(a, self.action_space.low, self.action_space.high)

        # apply the clipped action and keep the point inside the arena
        self._point = np.clip(self._point + a, -self._arena_size,
                              self._arena_size)
        dist = np.linalg.norm(self._point - self._goal)
        done = dist < np.linalg.norm(self.action_space.low)

        # dense reward
        reward = -dist
        # done bonus
        if done:
            reward += self._done_bonus

        # sometimes we don't want to terminate
        done = done and not self._never_done

        return Step(np.copy(self._point), reward, done)
Example #9
    def step(self, action):
        """
        :param action: power on left & right motor
        :type action: numpy.ndarray
        """
        if self.discretized:
            # discretization from {<-1,-0.33> , <-0.33,0.33> , <0.33,1>} to [-1, 0, 1]
            action = np.clip(np.round(action * 1.5), a_min=-1, a_max=1)
        # Set new state
        next_pos, next_ori = self._get_new_position(action)
        self.agent_pos = next_pos
        self.agent_ori = next_ori
        obs = self.get_current_obs()
        # Determine reward and termination
        next_state_type = self._tile_type_at_pos(next_pos)
        if next_state_type == 'H':
            done = True
            reward = -1
        elif next_state_type in ['F', 'S']:
            done = False
            reward = -self.STEP_PENALTY
        elif next_state_type == 'G':
            done = True
            reward = 1
        else:
            raise NotImplementedError

        # Cache observation
        if self.do_caching:
            k = tuple(np.array(obs, dtype='int8'))
            v = (self.current_map_idx, tuple(self.agent_pos), self.agent_ori)
            if k not in self.states_cache:
                self.states_cache[k] = {v}
            else:
                self.states_cache[k].add(v)

        # Return observation and others
        return Step(observation=obs,
                    reward=reward,
                    done=done,
                    agent_pos=self.agent_pos,
                    agent_ori=self.agent_ori,
                    map=self.current_map_idx)
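The read side of the observation cache built above is not shown. A minimal lookup sketch, assuming the same key construction as in the example, is below.

import numpy as np


def lookup_cached_states(states_cache, obs):
    # Same key construction as in the example: the observation cast to int8
    # and turned into a hashable tuple.
    k = tuple(np.array(obs, dtype='int8'))
    return states_cache.get(k, set())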
Example #10
    def step(self, action):
        assert (self.hrl_policy is not None)
        skill_path = skill_rollout(
            env=TfEnv(self.env),
            agent=self.hrl_policy.get_skill_policy(action),
            skill_stopping_func=self.hrl_policy.get_skill_stopping_func(
                action),
            max_path_length=self.hrl_policy.skill_max_timesteps,
            reset_start_rollout=False  # do not reset the env, continue from current state
        )
        next_obs = AsaEnv.get_current_obs_wrapped(self.env)
        reward = np.sum(skill_path['rewards'])
        term = skill_path['terminated'][-1]

        return Step(observation=next_obs,
                    reward=reward,
                    done=term,
                    subpath_infos=SubpolicyPathInfo(skill_path,
                                                    store=self.subpath_infos))
Example #11
    def step(self, action):
        # no clipping / rescaling of actions
        if self._for_her:
            # actions are in [-1, 1]
            action = (action + 1.) / 2.
            # actions are in [0, 1]
            action = action * (self.action_space.high - self.action_space.low) + self.action_space.low
            action = np.clip(action, self.action_space.low, self.action_space.high)
        # rot_ctrl = np.array([1., 0., 1., 0.])
        # action = np.concatenate([action, rot_ctrl])
        # action, _ = np.split(action, (self.sim.model.nmocap * 7,))
        # action = action.reshape(self.sim.model.nmocap, 7)

        # pos_delta = action[:, :3]
        # action = np.concatenate((np.zeros(2), np.array(action).flatten()))
        action = np.array(action).flatten()
        if self._control_method == "torque_control":
            self.forward_dynamics(action)
            self.sim.forward()
        else:
            reset_mocap2body_xpos(self.sim)
            self.sim.data.mocap_pos[0, :3] = self.sim.data.mocap_pos[0, :3] + action[:3]
            self.set_gripper_state(action[3])
            for _ in range(1):
                self.sim.step()
            self.sim.forward()

        self._step += 1

        # obs = self.get_current_obs()
        # achieved_goal = obs['achieved_goal']
        # goal = obs['desired_goal']


        grasped = self._grasp()  # TODO fix grasp recognition (always returns False)
        object_pos = self.sim.data.get_site_xpos('object0')

        desired_goal = object_pos

        if self._for_her:
            if not grasped:
                achieved_goal = self.position
            else:
                # print("Grasped!")
                achieved_goal = np.squeeze(object_pos.copy())
                desired_goal = self.goal
        else:
            achieved_goal = self.position
        # reward = self._compute_reward(achieved_goal, goal)

        obs = self.get_obs()

        # penalize x/y movement of object
        penalize_2d_motion = np.linalg.norm(object_pos[:2] - np.array(TASKS[self._task][:2]))
        # reward positive change in z direction (up)
        lifting = object_pos[2] - self.init_box_height

        # achieved_dist = self._goal_distance(achieved_goal, self._goal)
        achieved_dist = self._goal_distance(achieved_goal, desired_goal)
        # reward = rewards._sigmoids(self._goal_distance(achieved_goal, goal) / self._goal_distance(self.initial_pos, goal), 0., "cosine")
        # reward = 1. - achieved_dist / self._goal_distance(self._start_pos, self._goal) / 2.  # before
        # reward = 1. - achieved_dist / self._goal_distance(np.zeros(3), self._goal) / 2.
        # TODO sparse reward
        # if grasped:
        #     print("Grasped!")

        # print(self.initial_pos, achieved_goal)

        done = grasped and lifting > 0.005  # (action[3] < 0.2 and not grasped) or grasped and lifting > 0.02  #(achieved_dist < self._distance_threshold)

        if self._for_her:
            reward = self.compute_reward(achieved_goal, desired_goal, {})
        else:
            if self._sparse_reward:
                reward = 0. - penalize_2d_motion + lifting + float(grasped) * 0.3
            else:
                reward = (.3 + lifting * 30.) * float(grasped) + 1. - achieved_dist / self._goal_distance(
                    self._start_pos, object_pos) - penalize_2d_motion

        if done:  # done:
            reward = 20.  # 20.

        if self._for_her:
            info = {
                "l": self._step,
                "r": reward,
                "d": done,
                "grasped": grasped,
                "is_success": grasped and lifting > 0.005
            }
        else:
            info = {
                "episode": {
                    "l": self._step,
                    "r": reward,
                    "d": done,
                    "grasped": grasped,
                    "position": self.position,
                    "task": np.copy(self.onehot)
                }
            }

        # just take gripper position as observation
        return Step(obs, reward, done, **info)
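The `_for_her` branch at the top of this example rescales actions from [-1, 1] into the environment's action range. The same transformation is sketched below as a standalone helper for reference; the function name is an illustration, not part of the original code.

import numpy as np


def unscale_action(action, low, high):
    # Map an action in [-1, 1] to [low, high], then clip, mirroring the
    # _for_her branch above.
    action = (np.asarray(action, dtype=float) + 1.0) / 2.0   # now in [0, 1]
    action = action * (high - low) + low                     # now in [low, high]
    return np.clip(action, low, high)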
Example #12
    def step(self, action):
        next_obs, reward, done, info = self.env.step(action)
        return Step(next_obs, reward, done, **info)