def step(self, action):
        """
        action map:
        0: left
        1: down
        2: right
        3: up
        :param action: should be a one-hot vector encoding the action
        :return: a Step namedtuple (observation, reward, done)
        """
        possible_next_states = self.get_possible_next_states(
            self.state, action)

        probs = [x[1] for x in possible_next_states]
        next_state_idx = np.random.choice(len(probs), p=probs)
        next_state = possible_next_states[next_state_idx][0]

        next_x = next_state // self.n_col
        next_y = next_state % self.n_col

        next_state_type = self.desc[next_x, next_y]
        if next_state_type == 'H':
            done = True
            reward = 0
        elif next_state_type in ['F', 'S']:
            done = False
            reward = 0
        elif next_state_type == 'G':
            done = True
            reward = 1
        else:
            raise NotImplementedError
        self.state = next_state
        return Step(observation=self.state, reward=reward, done=done)
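A minimal rollout against this grid-world step() could look like the following sketch. It assumes the method belongs to rllab's GridWorldEnv (the import path, the desc="4x4" map name, and reset() are assumptions here), and it passes actions as the integers 0-3 from the docstring's action map.
from rllab.envs.grid_world_env import GridWorldEnv  # assumed location of this method
import numpy as np

env = GridWorldEnv(desc="4x4")        # map name is an assumption
obs = env.reset()
total_reward = 0.0
for _ in range(1000):                 # cap the episode length for the sketch
    action = np.random.randint(4)     # 0: left, 1: down, 2: right, 3: up
    step_result = env.step(action)
    total_reward += step_result.reward
    obs = step_result.observation
    if step_result.done:
        break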
Example #2
 def step(self, action):
     self._state = self._state + action
     x, y = self._state
     x -= self._goal[0]
     y -= self._goal[1]
     reward = -(x**2 + y**2)**0.5
     done = abs(x) < 0.01 and abs(y) < 0.01
     next_observation = np.copy(self._state)
     return Step(observation=next_observation, reward=reward, done=done)
Example #3
 def step(self, action):
     self.forward_dynamics(action)
     next_obs = self.get_current_obs()
     action = np.clip(action, *self.action_bounds)
     ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(action))
     run_cost = 1.*np.abs(self.get_body_comvel("torso")[0] - self._goal_vel)
     cost = ctrl_cost + run_cost
     reward = -cost
     done = False
     return Step(next_obs, reward, done)
Example #4
 def step(self, action):
     self.forward_dynamics(action)
     next_obs = self.get_current_obs()
     x, _, y = self.model.data.site_xpos[0]
     dist_penalty = 0.01 * x**2 + (y - 2)**2
     v1, v2 = self.model.data.qvel[1:3]
     vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2
     alive_bonus = 10
     r = float(alive_bonus - dist_penalty - vel_penalty)
     done = y <= 1
     return Step(next_obs, r, done)
Example #5
 def step(self, action):
     _, _, done, info = self.inner_env.step(action)
     if done:
         return Step(self.get_current_obs(), -10, done, **info)
     com = self.inner_env.get_body_com("torso")
     x, y = com[:2]
     reward = 0
     new_objs = []
     for obj in self.objects:
         ox, oy, typ = obj
         # object within zone!
         if (ox - x)**2 + (oy - y)**2 < self.catch_range**2:
             if typ == APPLE:
                 reward = reward + 1
             else:
                 reward = reward - 1
         else:
             new_objs.append(obj)
     self.objects = new_objs
     done = len(self.objects) == 0
     return Step(self.get_current_obs(), reward, done, **info)
Example #6
 def step(self, action):
     self.forward_dynamics(action)
     next_obs = self.get_current_obs()
     lb, ub = self.action_bounds
     scaling = (ub - lb) * 0.5
     ctrl_cost = 0.5 * self.ctrl_cost_coeff * np.sum(
         np.square(action / scaling))
     forward_reward = -1.5 * np.abs(
         self.get_body_comvel("torso")[0] - self._goal_vel)
     reward = forward_reward - ctrl_cost
     done = False
     return Step(next_obs, reward, done)
Example #7
 def step(self, action):
     self.forward_dynamics(action)
     next_obs = self.get_current_obs()
     lb, ub = self.action_bounds
     scaling = (ub - lb) * 0.5
     ctrl_cost = 0.5 * self.ctrl_cost_coeff * np.sum(
         np.square(action / scaling))
     forward_reward = self.get_body_comvel("torso")[0]
     #forward_reward = -1.5*np.abs(self.get_body_comvel("torso")[0] - 0.15)
     # max achievable vel is around 0.20 for vpg.
     reward = forward_reward - ctrl_cost
     done = False
     return Step(next_obs, reward, done)
Example #8
 def step(self, action):
     self.forward_dynamics(action)
     next_obs = self.get_current_obs()
     lb, ub = self.action_bounds
     scaling = (ub - lb) * 0.5
     vel = self.get_body_comvel("torso")[0]
     reward = vel + self.alive_coeff - \
         0.5 * self.ctrl_cost_coeff * np.sum(np.square(action / scaling))
     state = self._state
     notdone = np.isfinite(state).all() and \
         (np.abs(state[3:]) < 100).all() and (state[0] > .7) and \
         (abs(state[2]) < .2)
     done = not notdone
     return Step(next_obs, reward, done)
Example #9
 def step(self, action):
     self.forward_dynamics(action)
     next_obs = self.get_current_obs()
     action = np.clip(action, *self.action_bounds)
     lb, ub = self.action_bounds
     scaling = (ub - lb) * 0.5
     ctrl_cost = 0.5 * self.ctrl_cost_coeff * \
         np.sum(np.square(action / scaling))
     forward_reward = self.get_body_comvel("torso")[0]
     reward = forward_reward - ctrl_cost
     qpos = self.model.data.qpos
     done = not (qpos[0] > 0.8 and qpos[0] < 2.0 and qpos[2] > -1.0
                 and qpos[2] < 1.0)
     return Step(next_obs, reward, done)
Example #10
 def step(self, action):
     qpos = np.copy(self.model.data.qpos)
     qpos[2, 0] += action[1]
     ori = qpos[2, 0]
     # compute increment in each direction
     dx = math.cos(ori) * action[0]
     dy = math.sin(ori) * action[0]
     # ensure that the robot is within reasonable range
     qpos[0, 0] = np.clip(qpos[0, 0] + dx, -7, 7)
     qpos[1, 0] = np.clip(qpos[1, 0] + dy, -7, 7)
     self.model.data.qpos = qpos
     self.model.forward()
     next_obs = self.get_current_obs()
     return Step(next_obs, 0, False)
Example #11
 def step(self, action):
     if isinstance(self._wrapped_env.action_space, Box):
         # rescale the action
         lb, ub = self._wrapped_env.action_space.bounds
         scaled_action = lb + (action + 1.) * 0.5 * (ub - lb)
         scaled_action = np.clip(scaled_action, lb, ub)
     else:
         scaled_action = action
     wrapped_step = self._wrapped_env.step(scaled_action)
     next_obs, reward, done, info = wrapped_step
     if self._normalize_obs:
         next_obs = self._apply_normalize_obs(next_obs)
     if self._normalize_reward:
         reward = self._apply_normalize_reward(reward)
     return Step(next_obs, reward * self._scale_reward, done, **info)
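The Box branch above maps a policy action from [-1, 1] affinely onto the wrapped environment's action bounds and then clips it. A stand-alone sketch of that mapping, with made-up bounds rather than anything from the source, is:
import numpy as np

lb, ub = np.array([-2.0, 0.0]), np.array([2.0, 1.0])   # illustrative bounds
action = np.array([0.0, -1.0])                         # policy output in [-1, 1]
scaled_action = lb + (action + 1.0) * 0.5 * (ub - lb)  # -> array([0., 0.])
scaled_action = np.clip(scaled_action, lb, ub)         # guard against actions outside [-1, 1]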
Example #12
 def step(self, action):
     self.forward_dynamics(action)
     comvel = self.get_body_comvel("torso")
     forward_reward = self.goal_direction * comvel[0]
     lb, ub = self.action_bounds
     scaling = (ub - lb) * 0.5
     ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling))
     contact_cost = 0.5 * 1e-3 * np.sum(
         np.square(np.clip(self.model.data.cfrc_ext, -1, 1)))
     survive_reward = 0.05
     reward = forward_reward - ctrl_cost - contact_cost + survive_reward
     state = self._state
     notdone = np.isfinite(state).all() \
         and state[2] >= 0.2 and state[2] <= 1.0
     done = not notdone
     ob = self.get_current_obs()
     return Step(ob, float(reward), done)
Example #13
 def step(self, action):
     """
     Note: override this method with great care, as it post-processes the
     observations, etc.
     """
     reward_computer = self.compute_reward(action)
     # forward the state
     action = self._inject_action_noise(action)
     for _ in range(self.frame_skip):
         self.forward_dynamics(action)
     # notifies that we have stepped the world
     next(reward_computer)
     # actually get the reward
     reward = next(reward_computer)
     self._invalidate_state_caches()
     done = self.is_current_done()
     next_obs = self.get_current_obs()
     return Step(observation=next_obs, reward=reward, done=done)
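Here compute_reward is expected to return a generator that the caller advances twice: the first next() is reached once the dynamics have been stepped, and the second next() must produce the reward. A hypothetical subclass implementation compatible with that protocol might look like the sketch below; the get_body_comvel helper and the cost weight are assumptions borrowed from the other examples, not part of this class.
import numpy as np

def compute_reward(self, action):
    # First advance: nothing to do yet; the caller reaches this yield
    # right after stepping the dynamics (see the next() calls above).
    yield
    # Second advance: compute the reward from the post-step state.
    forward_reward = self.get_body_comvel("torso")[0]  # assumed helper
    ctrl_cost = 1e-2 * np.sum(np.square(action))
    yield forward_reward - ctrl_cost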
Example #14
 def step(self, action):
     self.forward_dynamics(action)
     com = self.get_body_com("torso")
     # ref_x = x + self._init_torso_x
     goal_reward = np.exp(-np.sum(
         np.abs(com[:2] - self._goal_pos)))  # make it happy, not suicidal
     lb, ub = self.action_bounds
     scaling = (ub - lb) * 0.5
     ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling))
     contact_cost = 0.5 * 1e-3 * np.sum(
         np.square(np.clip(self.model.data.cfrc_ext, -1, 1)))
     survive_reward = 0.05
     reward = goal_reward - ctrl_cost - contact_cost + survive_reward
     state = self._state
     notdone = np.isfinite(state).all() \
         and state[2] >= 0.2 and state[2] <= 1.0
     done = not notdone
     ob = self.get_current_obs()
     return Step(ob, float(reward), done)
Example #15
 def step(self, action, **kwargs):
     """
     Run one timestep of the environment's dynamics. When end of episode
     is reached, reset() should be called to reset the environment's internal state.
     Input
     -----
     action : an action provided by the agent
     Outputs
     -------
     (observation, reward, done, info)
     observation : agent's observation of the current environment
     reward [Float] : amount of reward due to the previous action
     done : a boolean, indicating whether the episode has ended
     info : a dictionary containing other diagnostic information from the previous action
     """
     prev_state = self._state
     self._state = prev_state + np.clip(action, -0.1, 0.1)
     reward = self.reward(prev_state, action, self._state)
     done = self.done(self._state)
     next_observation = np.copy(self._state)
     return Step(next_observation, reward, done)
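A minimal interaction loop against the contract documented in this docstring could look like the sketch below; the env object, the horizon, and the action_space.sample() call are assumptions for illustration, while the four-tuple unpacking matches how the wrapper examples in this listing consume step().
obs = env.reset()                              # `env`: any environment exposing this interface
for _ in range(100):
    action = env.action_space.sample()         # assumes an action space with sample()
    next_obs, reward, done, info = env.step(action)
    obs = env.reset() if done else next_obs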
Example #16
 def step(self, action):
     if self.MANUAL_COLLISION:
         old_pos = self.wrapped_env.get_xy()
         _, _, done, info = self.wrapped_env.step(action)
         new_pos = self.wrapped_env.get_xy()
         if self._is_in_collision(new_pos):
             self.wrapped_env.set_xy(old_pos)
             done = False
     else:
         _, _, done, info = self.wrapped_env.step(action)
     next_obs = self.get_current_obs()
     x, y = self.wrapped_env.get_body_com("torso")[:2]
     # ref_x = x + self._init_torso_x
     # ref_y = y + self._init_torso_y
     reward = 0
     minx, maxx, miny, maxy = self._goal_range
     # print "goal range: x [%s,%s], y [%s,%s], now [%s,%s]" % (str(minx), str(maxx), str(miny), str(maxy),
     #                                                          str(x), str(y))
     if minx <= x <= maxx and miny <= y <= maxy:
         done = True
         reward = 1
     return Step(next_obs, reward, done, **info)
Example #17
    def step(self, action):
        self.forward_dynamics(action)
        next_obs = self.get_current_obs()

        alive_bonus = self.alive_bonus
        data = self.model.data

        comvel = self.get_body_comvel("torso")

        lin_vel_reward = comvel[0]
        lb, ub = self.action_bounds
        scaling = (ub - lb) * 0.5
        ctrl_cost = .5 * self.ctrl_cost_coeff * np.sum(
            np.square(action / scaling))
        impact_cost = .5 * self.impact_cost_coeff * np.sum(
            np.square(np.clip(data.cfrc_ext, -1, 1)))
        vel_deviation_cost = 0.5 * self.vel_deviation_cost_coeff * np.sum(
            np.square(comvel[1:]))
        reward = lin_vel_reward + alive_bonus - ctrl_cost - \
            impact_cost - vel_deviation_cost
        done = data.qpos[2] < 0.8 or data.qpos[2] > 2.0

        return Step(next_obs, reward, done)
Example #18
 def step(self, action):
     next_obs, reward, done, info = self._wrapped_env.step(action)
     return Step(self.inject_obs_noise(next_obs), reward, done, **info)
Example #19
 def step(self, action):
     queued_action = self._queued_actions[:self.action_dim]
     next_obs, reward, done, info = self._wrapped_env.step(queued_action)
     self._queued_actions = np.concatenate(
         [self._queued_actions[self.action_dim:], action])
     return Step(next_obs, reward, done, **info)
Example #20
 def step(self, action):
     next_obs, reward, done, info = self.env.step(action)
     return Step(next_obs, reward, done, **info)
Example #21
 def step(self, action):
     next_obs, reward, done, info = self._wrapped_env.step(action)
     self.add_to_buffer(next_obs)
     return Step(self.buffer, reward, done, **info)