Example #1
 def step(self, action):
     _, inner_rew, done, info = self.wrapped_env.step(action)
     info['inner_rew'] = inner_rew
     info['outer_rew'] = 0
     if done:
         return Step(self.get_current_obs(), self.dying_cost, done,
                     **info)  # give a -10 rew if the robot dies
     com = self.wrapped_env.get_body_com("torso")
     x, y = com[:2]
     reward = self.coef_inner_rew * inner_rew
     new_objs = []
     for obj in self.objects:
         ox, oy, typ = obj
         # object within zone!
         if (ox - x)**2 + (oy - y)**2 < self.catch_range**2:
             if typ == APPLE:
                 reward = reward + 1
                 info['outer_rew'] = 1
             else:
                 reward = reward - 1
                 info['outer_rew'] = -1
         else:
             new_objs.append(obj)
     self.objects = new_objs
     done = len(self.objects) == 0
     return Step(self.get_current_obs(), reward, done, **info)
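All of these examples return an rllab/garage-style Step, built from the next observation, the reward, the termination flag, and any extra diagnostics. For reference, a minimal sketch of that helper, assuming the rllab convention of packing extra keyword arguments into an info dict:

import collections

# Extra keyword arguments (e.g. inner_rew, outer_rew) end up in the info dict.
_Step = collections.namedtuple('Step', ['observation', 'reward', 'done', 'info'])


def Step(observation, reward, done, **kwargs):
    return _Step(observation, reward, done, kwargs)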
Example #2
    def step(self, action):
        """
        action map:
        0: left
        1: down
        2: right
        3: up
        :param action: should be a one-hot vector encoding the action
        :return:
        """
        possible_next_states = self.get_possible_next_states(
            self.state, action)

        probs = [x[1] for x in possible_next_states]
        next_state_idx = np.random.choice(len(probs), p=probs)
        next_state = possible_next_states[next_state_idx][0]

        next_x = next_state // self.n_col
        next_y = next_state % self.n_col

        next_state_type = self.desc[next_x, next_y]
        if next_state_type == 'H':
            done = True
            reward = 0
        elif next_state_type in ['F', 'S']:
            done = False
            reward = 0
        elif next_state_type == 'G':
            done = True
            reward = 1
        else:
            raise NotImplementedError
        self.state = next_state
        return Step(observation=self.state, reward=reward, done=done)
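Example #2 delegates the transition dynamics to a get_possible_next_states(state, action) helper that returns (next_state, probability) pairs whose probabilities sum to one. That helper is not shown here; a hypothetical deterministic version (no slipping), just to illustrate the expected return format, treating action as the integer index from the action map above:

    def get_possible_next_states(self, state, action):
        # Hypothetical sketch: deterministic moves on an n_row x n_col grid.
        x, y = state // self.n_col, state % self.n_col
        increments = [(0, -1), (1, 0), (0, 1), (-1, 0)]  # left, down, right, up
        dx, dy = increments[action]
        next_x = min(max(x + dx, 0), self.n_row - 1)
        next_y = min(max(y + dy, 0), self.n_col - 1)
        return [(next_x * self.n_col + next_y, 1.0)]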
Example #3
 def step(self, action, animate=False, markers=()):
     latent = self._latents[action]
     accumulated_r = 0
     # print("action", action)
     for _ in range(self._skip_steps):
         action, agent_info = self._wrapped_policy.get_action_from_latent(
             latent, np.copy(self._last_obs))
         # a = action
         if self._deterministic:
             a = agent_info['mean']
         else:
             a = action
         if animate:
             for m in markers:
                 self._wrapped_env.env.get_viewer().add_marker(**m)
             self._wrapped_env.render()
             timestep = 0.05
             speedup = 1.
             time.sleep(timestep / speedup)
         # scale = np.random.normal()
         # a += scale * 0.
         obs, reward, done, info = self._wrapped_env.step(a)
         accumulated_r += reward
         self._last_obs = obs
     return Step(obs, reward, done, **info)
Example #4
 def step(self, action):
     time_step = self._env.step(action)
     if time_step.reward:
         self._total_reward += time_step.reward
     return Step(flatten_observation(time_step.observation),
                 time_step.reward, time_step.step_type == StepType.LAST,
                 **time_step.observation)
Example #5
 def step(self, action):
     if self.MANUAL_COLLISION:
         old_pos = self.env.get_xy()
         inner_next_obs, inner_rew, done, info = self.env.step(action)
         new_pos = self.env.get_xy()
         if self._is_in_collision(new_pos):
             self.env.set_xy(old_pos)
             done = False
     else:
         inner_next_obs, inner_rew, done, info = self.env.step(action)
     next_obs = self.get_current_obs()
     x, y = self.env.get_body_com("torso")[:2]
     # ref_x = x + self._init_torso_x
     # ref_y = y + self._init_torso_y
     info['outer_rew'] = 0
     info['inner_rew'] = inner_rew
     reward = self.coef_inner_rew * inner_rew
     minx, maxx, miny, maxy = self._goal_range
     if minx <= x <= maxx and miny <= y <= maxy:
         done = True
         reward += self.goal_rew
          # we keep here the original one, so that the AvgReturn is directly
          # the freq of success
         info['rew_rew'] = 1
     return Step(next_obs, reward, done, **info)
Example #6
    def step(self, action):
        """
        Perform one step with action.

        :param action: the action to be performed; (x, y, z) only for position control
        :return: next_obs, reward, done, info
        """
        action = np.clip(action, self.action_space.low, self.action_space.high)
        if self._control_method == 'torque_control':
            self.forward_dynamics(action)
        elif self._control_method == 'position_control':
            assert action.shape == (3, )
            action = action.copy()
            action *= 0.1  # limit the action
            reset_mocap2body_xpos(self.sim)
            self.sim.data.mocap_pos[:] = self.sim.data.mocap_pos + action
            self.sim.step()
        else:
            raise NotImplementedError

        obs = self.get_current_obs()
        next_obs = obs['observation']
        achieved_goal = obs['achieved_goal']
        goal = obs['desired_goal']
        reward = self._compute_reward(achieved_goal, goal)
        done = (self._goal_distance(achieved_goal, goal) <
                self._distance_threshold)
        return Step(next_obs, reward, done)
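Example #6 derives both the reward and the termination flag from the distance between the achieved and desired goal (Example #7 below does the same, with an extra gripper argument and a collision penalty). Those helpers are not shown; a hypothetical sketch assuming a dense reward equal to the negative Euclidean distance:

    def _goal_distance(self, goal_a, goal_b):
        # Hypothetical helper: Euclidean distance between the two goal positions.
        return np.linalg.norm(goal_a - goal_b, axis=-1)

    def _compute_reward(self, achieved_goal, desired_goal):
        # Hypothetical dense reward: the closer to the goal, the less negative.
        return -self._goal_distance(achieved_goal, desired_goal)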
Example #7
    def step(self, action: np.ndarray):
        action = np.clip(action, self.action_space.low, self.action_space.high)
        if self._control_method == 'torque_control':
            self.forward_dynamics(action)
        elif self._control_method == 'position_control':
            assert action.shape == (4, )
            action = action.copy()
            pos_ctrl, gripper_ctrl = action[:3], action[3]
            pos_ctrl *= 0.1  # limit the action
            rot_ctrl = np.array([0., 1., 1., 0.])
            gripper_ctrl = -50 if gripper_ctrl < 0 else 50
            gripper_ctrl = np.array([gripper_ctrl, -gripper_ctrl])
            action = np.concatenate([pos_ctrl, rot_ctrl, gripper_ctrl])
            ctrl_set_action(self.sim, action)  # For gripper
            mocap_set_action(self.sim,
                             action)  # For pos control of the end effector
            self.sim.step()

        obs = self.get_current_obs()
        next_obs = obs['observation']
        achieved_goal = obs['achieved_goal']
        goal = obs['desired_goal']
        gripper_pos = obs['gripper_pos']
        reward = self._compute_reward(achieved_goal, goal, gripper_pos)
        collided = self._is_collided()
        if collided:
            reward -= 200
        done = (self._goal_distance(achieved_goal, goal) <
                self._distance_threshold) or collided

        return Step(next_obs, reward, done)
Example #8
    def step(self, action):
        self.forward_dynamics(action)
        obs = self.get_current_obs()
        reward = self.compute_reward()
        done = self._done

        return Step(obs, reward, done)
Example #9
 def step(self, action):
     self._state = self._state + action
     x, y = self._state
     reward = -(x**2 + y**2)**0.5
     done = abs(x) < 0.01 and abs(y) < 0.01
     next_observation = np.copy(self._state)
     return Step(observation=next_observation, reward=reward, done=done)
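Example #9 is a minimal 2D point-mass environment. The sketch below assumes a hypothetical PointEnv class that exposes this step() together with a reset() returning the initial observation, and shows how the fields of the returned Step are typically consumed in a rollout loop:

import numpy as np

env = PointEnv()           # hypothetical class wrapping the step() above
obs = env.reset()          # assumed to return the initial (x, y) observation
total_reward = 0.0
for _ in range(100):
    action = -0.1 * obs    # crude proportional step toward the origin
    step = env.step(action)
    obs = step.observation
    total_reward += step.reward
    if step.done:
        break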
Example #10
    def step(self, action):
        """
        Action map = {0: up, 1: right, 2: down, 3: left}
        :param action: scalar encoding the action
        """
        prev_pos_xy = self._get_pos_as_xy()
        # Set new state
        moves = np.array([[-1, 0], [0, 1], [1, 0], [0, -1]])
        next_pos = np.clip(self.agent_pos + moves[action],
                           a_min=[0, 0],
                           a_max=[self.n_row - 1, self.n_col - 1])
        next_state_type = self.map[next_pos[0], next_pos[1]]
        if next_state_type != 'W':
            self.agent_pos = next_pos
        else:
            next_state_type = 'F'  # agent stays on free tile

        # Determine reward and termination
        reward = -self.STEP_PENALTY  # default reward
        done = False  # default done state

        if next_state_type == 'H':  # hole
            # fall into abyss
            done = True
            reward = -1

        elif next_state_type == 'C':  # coin
            coin_idx = np.where(
                np.all(self.coins_pos == self.agent_pos, axis=1))[0]
            if not self.coins_picked[coin_idx] and not self.coin_holding:
                # pick a coin if 1) it's still there, and 2) I'm not holding another coin
                self.coin_holding = True
                self.coins_picked[coin_idx] = True

        elif next_state_type == 'G':  # goal (coin drop-off area)
            if self.coin_holding:
                # drop the coin
                self.coin_holding = False
                reward = 1
            if np.all(self.coins_picked):
                # finish if all coins have been collected and delivered
                done = True

        elif next_state_type in ['F', 'S']:  # free space
            pass  # default step penalty

        else:
            raise NotImplementedError

        # Return observation and others
        obs = self.get_current_obs()
        return Step(observation=obs,
                    reward=reward,
                    done=done,
                    prev_pos_xy=prev_pos_xy,
                    next_pos_xy=self._get_pos_as_xy())
Example #11
 def step(self, action):
     self.forward_dynamics(action)
     next_obs = self.get_current_obs()
     action = np.clip(action, *self.action_bounds)
     ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(action))
     run_cost = -1 * self.get_body_comvel("torso")[0]
     cost = ctrl_cost + run_cost
     reward = -cost
     done = False
     return Step(next_obs, reward, done)
Example #12
 def step(self, action):
     self.forward_dynamics(action)
     next_obs = self.get_current_obs()
     x, _, y = self.sim.data.site_xpos[0]
     dist_penalty = 0.01 * x**2 + (y - 2)**2
     v1, v2 = self.sim.data.qvel[1:3]
     vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2
     alive_bonus = 10
     r = float(alive_bonus - dist_penalty - vel_penalty)
     done = y <= 1
     return Step(next_obs, r, done)
Example #13
 def step(self, action):
     self.forward_dynamics(action)
     next_obs = self.get_current_obs()
     lb, ub = self.action_bounds
     scaling = (ub - lb) * 0.5
     ctrl_cost = 0.5 * self.ctrl_cost_coeff * np.sum(
         np.square(action / scaling))
     forward_reward = self.get_body_comvel("torso")[0]
     reward = forward_reward - ctrl_cost
     done = False
     return Step(next_obs, reward, done)
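In Example #13 (and likewise in #16, #19 and #27 below) the action is divided by scaling = (ub - lb) * 0.5 before squaring, so the control penalty is computed on actions normalized to roughly [-1, 1] regardless of how wide each actuator's range is. A small self-contained illustration with made-up bounds and a made-up coefficient:

import numpy as np

lb, ub = np.array([-2.0, -0.5]), np.array([2.0, 0.5])  # made-up actuator bounds
action = np.array([1.0, 0.25])                         # half of each range
scaling = (ub - lb) * 0.5                              # -> [2.0, 0.5]
ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling))
# action / scaling == [0.5, 0.5], so ctrl_cost == 0.0025 and both
# actuators contribute equally despite their different ranges.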
Example #14
    def step(self, action):
        self.forward_dynamics(action)
        next_obs = self.get_current_obs()

        distance_to_go = self.finger_to_target_dist()
        vel_cost = 1e-2 * np.linalg.norm(self.joint_velocities())
        reward = -distance_to_go - vel_cost

        done = self.finger_to_target_dist() < self.target_size()

        return Step(next_obs, reward, done)
Example #15
    def step(self, action):
        # Copy action first to remove side effects
        action_copy = copy(action)

        self.forward_dynamics(action_copy)
        next_obs = self.get_current_obs()
        action_copy = np.clip(action_copy, *self.action_bounds)
        ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(action_copy))
        assert action.all() == action_copy.all()
        run_cost = -1 * self.get_body_comvel("torso")[0]
        cost = ctrl_cost + run_cost
        reward = -cost
        done = False
        return Step(next_obs, reward, done)
Example #16
 def step(self, action):
     self.forward_dynamics(action)
     next_obs = self.get_current_obs()
     lb, ub = self.action_bounds
     scaling = (ub - lb) * 0.5
     vel = self.get_body_comvel("torso")[0]
     reward = vel + self.alive_coeff - \
         0.5 * self.ctrl_cost_coeff * np.sum(np.square(action / scaling))
     state = self._state
     notdone = np.isfinite(state).all() and \
         (np.abs(state[3:]) < 100).all() and (state[0] > .7) and \
         (abs(state[2]) < .2)
     done = not notdone
     return Step(next_obs, reward, done)
Example #17
    def step(self, action):
        """Step the environment.

        Args:
            action (object): input action

        Returns:
            Step: The time step after applying this action.
        """
        time_step = self._env.step(action)
        return Step(
            flatten_observation(time_step.observation)['observations'],
            time_step.reward, time_step.step_type == StepType.LAST,
            **time_step.observation)
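Examples #4 and #17 wrap a dm_control environment and flatten its dict-valued observation before building the Step. flatten_observation comes from the wrapped library; a hypothetical stand-in with the same overall behaviour, concatenating every entry of the dm_env observation dict under a single 'observations' key:

import collections

import numpy as np


def flatten_observation(observation, output_key='observations'):
    # Hypothetical stand-in: flatten and concatenate all observation entries.
    flat = np.concatenate([np.asarray(v).ravel() for v in observation.values()])
    return collections.OrderedDict([(output_key, flat)])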
Example #18
 def step(self, action):
     qpos = np.copy(self.sim.data.qpos)
     qpos[2] += action[1]
     ori = qpos[2]
     # compute increment in each direction
     dx = math.cos(ori) * action[0]
     dy = math.sin(ori) * action[0]
     # ensure that the robot is within reasonable range
     qpos[0] = np.clip(qpos[0] + dx, -7, 7)
     qpos[1] = np.clip(qpos[1] + dy, -7, 7)
     self.sim.data.qpos[:] = qpos
     self.sim.forward()
     next_obs = self.get_current_obs()
     return Step(next_obs, 0, False)
Example #19
 def step(self, action):
     self.forward_dynamics(action)
     next_obs = self.get_current_obs()
     action = np.clip(action, *self.action_bounds)
     lb, ub = self.action_bounds
     scaling = (ub - lb) * 0.5
     ctrl_cost = 0.5 * self.ctrl_cost_coeff * \
         np.sum(np.square(action / scaling))
     forward_reward = self.get_body_comvel("torso")[0]
     reward = forward_reward - ctrl_cost
     qpos = self.sim.data.qpos
     done = not (qpos[0] > 0.8 and qpos[0] < 2.0 and qpos[2] > -1.0
                 and qpos[2] < 1.0)
     return Step(next_obs, reward, done)
Example #20
 def step(self, latent, use_mean=True):
     latent = latent.copy()
     latent = np.clip(latent, self.action_space.low, self.action_space.high)
     accumulated_r = 0
     for _ in range(1):
         action, agent_info = self._wrapped_policy.get_action_from_latent(
             latent, self._last_obs)
         if use_mean:
             a = agent_info['mean']
         else:
             a = action
         scale = np.random.normal()
         a += scale * 0.
         obs, reward, done, info = self._wrapped_env.step(a)
         accumulated_r += reward
         self._last_obs = obs
     return Step(obs, accumulated_r, done, **info)
Example #21
 def step(self, action):
     self.forward_dynamics(action)
     comvel = self.get_body_comvel("torso")
     forward_reward = comvel[0]
     lb, ub = self.action_bounds
     scaling = (ub - lb) * 0.5
     ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling))
      contact_cost = 0.5 * 1e-3 * np.sum(
          np.square(np.clip(self.sim.data.cfrc_ext, -1, 1)))
     survive_reward = 0.05
     reward = forward_reward - ctrl_cost - contact_cost + survive_reward
     state = self._state
     notdone = np.isfinite(state).all() \
         and state[2] >= 0.2 and state[2] <= 1.0
     done = not notdone
     ob = self.get_current_obs()
     return Step(ob, float(reward), done)
Example #22
 def step(self, action):
     """
     Note: override this method with great care, as it post-processes the
     observations, etc.
     """
     reward_computer = self.compute_reward(action)
     # forward the state
     action = self._inject_action_noise(action)
     for _ in range(self.frame_skip):
         self.forward_dynamics(action)
     # notifies that we have stepped the world
     next(reward_computer)
     # actually get the reward
     reward = next(reward_computer)
     self._invalidate_state_caches()
     done = self.is_current_done()
     next_obs = self.get_current_obs()
     return Step(observation=next_obs, reward=reward, done=done)
Example #23
    def step(self, action):
        # enforce action space
        a = action.copy()  # NOTE: we MUST copy the action before modifying it
        a = np.clip(a, self.action_space.low, self.action_space.high)

        dist = np.linalg.norm(self._point - self._goal)
        done = dist < np.linalg.norm(self.action_space.low)

        # dense reward
        reward = -dist
        # done bonus
        if done:
            reward += self._done_bonus

        # sometimes we don't want to terminate
        done = done and not self._never_done

        return Step(np.copy(self._point), reward, done)
Example #24
    def step(self, action, animate=False, markers=()):
        latent = self._latents[action]
        latent_index = action
        accumulated_r = 0

        seq_info = {
            "latents": [],
            "latent_indices": [],
            "observations": [],
            "actions": [],
            "infos": [],
            "rewards": [],
            "dones": []
        }

        for _ in range(self._skip_steps):
            action, agent_info = self._wrapped_policy.get_action_from_latent(
                latent, np.copy(self._last_obs))
            if self._deterministic:
                a = agent_info['mean']
            else:
                a = action
            if animate:
                for m in markers:
                    self._wrapped_env.env.get_viewer().add_marker(**m)
                self._wrapped_env.render()
                timestep = 0.05
                speedup = 1.
                time.sleep(timestep / speedup)

            obs, reward, done, info = self._wrapped_env.step(a)

            seq_info["latents"].append(latent)
            seq_info["latent_indices"].append(latent_index)
            seq_info["observations"].append(np.copy(self._last_obs))
            seq_info["actions"].append(action)
            seq_info["infos"].append(copy.deepcopy(info))
            seq_info["rewards"].append(reward)
            seq_info["dones"].append(done)

            accumulated_r += reward
            self._last_obs = obs
        return Step(obs, reward, done, **seq_info)
Example #25
 def step(self, action, use_mean=False):
     action = np.copy(action)
     if self._normalize:
         action /= np.linalg.norm(action)
     latent = np.sum(action * self._latents.T, axis=1)
     accumulated_r = 0
     for _ in range(1):
         action, agent_info = self._wrapped_policy.get_action_from_latent(
             latent, np.copy(self._last_obs))
         if use_mean:
             a = agent_info['mean']
         else:
             a = action
     # scale = np.random.normal()
     # a += scale * 0.
     obs, reward, done, info = self._wrapped_env.step(a)
     accumulated_r += reward
     self._last_obs = obs
     return Step(obs, accumulated_r, done, **info)
Example #26
    def step(self, action, use_mean=True):
        action = np.clip(action, self.action_space.low, self.action_space.high)
        idx = int(action[0])
        if idx == self.action_space.high[0]:
            idx -= 1
        latents = self._latents_combination_hash[idx]
        latent = action[1] * latents[0] + (1 - action[1]) * latents[1]
        # TODO: Make this step size a param..
        for _ in range(10):
            action, agent_info = self._wrapped_policy.get_action_from_latent(
                latent, self._last_obs)
            if use_mean:
                a = agent_info['mean']
            else:
                a = action
            scale = np.random.normal()
            a += scale * 0
            obs, reward, done, info = self._wrapped_env.step(a)
            self._last_obs = obs

        return Step(obs, reward, done, **info)
Example #27
    def step(self, action):
        self.forward_dynamics(action)
        next_obs = self.get_current_obs()

        alive_bonus = self.alive_bonus
        data = self.sim.data

        comvel = self.get_body_comvel("torso")

        lin_vel_reward = comvel[0]
        lb, ub = self.action_bounds
        scaling = (ub - lb) * 0.5
        ctrl_cost = .5 * self.ctrl_cost_coeff * np.sum(
            np.square(action / scaling))
        impact_cost = .5 * self.impact_cost_coeff * np.sum(
            np.square(np.clip(data.cfrc_ext, -1, 1)))
        vel_deviation_cost = 0.5 * self.vel_deviation_cost_coeff * np.sum(
            np.square(comvel[1:]))
        reward = lin_vel_reward + alive_bonus - ctrl_cost - \
            impact_cost - vel_deviation_cost
        done = data.qpos[2] < 0.8 or data.qpos[2] > 2.0

        return Step(next_obs, reward, done)
Example #28
    def step(self, action):
        # enforce action space
        a = action.copy()  # NOTE: we MUST copy the action before modifying it
        a *= self._action_scale
        a = np.clip(a, self.action_space.low, self.action_space.high)

        self._point = np.clip(self._point + a, -5, 5)
        self._traces[-1].append(tuple(self._point))

        dist = np.linalg.norm(self._point - self._goal)
        done = dist < np.linalg.norm(self.action_space.low)

        # dense reward
        reward = -dist
        is_success = False
        # completion bonus
        if done:
            is_success = True
            reward += self._completion_bonus

        # sometimes we don't want to terminate
        done = done and not self._never_done

        return Step(np.copy(self._point), reward, done, is_success=is_success)
Example #29
 def step(self, action):
     obs, reward, done, info = self.active_env.step(action)
     info['task'] = self.active_task_one_hot
     return Step(obs, reward, done, **info)
Example #30
 def step(self, action):
     obs, reward, done, info = super().step(action)
     oh_obs = self._obs_with_one_hot(obs)
     return Step(oh_obs, reward, done, **info)
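Example #30 augments the wrapped environment's observation with a one-hot encoding of the active task. _obs_with_one_hot is not shown; a hypothetical sketch assuming the wrapper tracks _active_task_index and _num_tasks:

    def _obs_with_one_hot(self, obs):
        # Hypothetical helper: append a one-hot task id to the raw observation.
        one_hot = np.zeros(self._num_tasks)
        one_hot[self._active_task_index] = 1.0
        return np.concatenate([obs, one_hot])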