def step(self, action):
    _, inner_rew, done, info = self.wrapped_env.step(action)
    info['inner_rew'] = inner_rew
    info['outer_rew'] = 0
    if done:
        # give a -10 rew if the robot dies
        return Step(self.get_current_obs(), self.dying_cost, done, **info)
    com = self.wrapped_env.get_body_com("torso")
    x, y = com[:2]
    reward = self.coef_inner_rew * inner_rew
    new_objs = []
    for obj in self.objects:
        ox, oy, typ = obj
        # object within zone!
        if (ox - x)**2 + (oy - y)**2 < self.catch_range**2:
            if typ == APPLE:
                reward = reward + 1
                info['outer_rew'] = 1
            else:
                reward = reward - 1
                info['outer_rew'] = -1
        else:
            new_objs.append(obj)
    self.objects = new_objs
    done = len(self.objects) == 0
    return Step(self.get_current_obs(), reward, done, **info)

def step(self, action): """ action map: 0: left 1: down 2: right 3: up :param action: should be a one-hot vector encoding the action :return: """ possible_next_states = self.get_possible_next_states( self.state, action) probs = [x[1] for x in possible_next_states] next_state_idx = np.random.choice(len(probs), p=probs) next_state = possible_next_states[next_state_idx][0] next_x = next_state // self.n_col next_y = next_state % self.n_col next_state_type = self.desc[next_x, next_y] if next_state_type == 'H': done = True reward = 0 elif next_state_type in ['F', 'S']: done = False reward = 0 elif next_state_type == 'G': done = True reward = 1 else: raise NotImplementedError self.state = next_state return Step(observation=self.state, reward=reward, done=done)
def step(self, action, animate=False, markers=()):
    latent = self._latents[action]
    accumulated_r = 0
    # print("action", action)
    for _ in range(self._skip_steps):
        action, agent_info = self._wrapped_policy.get_action_from_latent(
            latent, np.copy(self._last_obs))
        if self._deterministic:
            a = agent_info['mean']
        else:
            a = action
        if animate:
            for m in markers:
                self._wrapped_env.env.get_viewer().add_marker(**m)
            self._wrapped_env.render()
            timestep = 0.05
            speedup = 1.
            time.sleep(timestep / speedup)
        # scale = np.random.normal()
        # a += scale * 0.
        obs, reward, done, info = self._wrapped_env.step(a)
        accumulated_r += reward
        self._last_obs = obs
    return Step(obs, reward, done, **info)

def step(self, action):
    time_step = self._env.step(action)
    if time_step.reward:
        self._total_reward += time_step.reward
    return Step(flatten_observation(time_step.observation),
                time_step.reward,
                time_step.step_type == StepType.LAST,
                **time_step.observation)

def step(self, action):
    if self.MANUAL_COLLISION:
        old_pos = self.env.get_xy()
        inner_next_obs, inner_rew, done, info = self.env.step(action)
        new_pos = self.env.get_xy()
        if self._is_in_collision(new_pos):
            self.env.set_xy(old_pos)
            done = False
    else:
        inner_next_obs, inner_rew, done, info = self.env.step(action)
    next_obs = self.get_current_obs()
    x, y = self.env.get_body_com("torso")[:2]
    # ref_x = x + self._init_torso_x
    # ref_y = y + self._init_torso_y
    info['outer_rew'] = 0
    info['inner_rew'] = inner_rew
    reward = self.coef_inner_rew * inner_rew
    minx, maxx, miny, maxy = self._goal_range
    if minx <= x <= maxx and miny <= y <= maxy:
        done = True
        reward += self.goal_rew
        # we keep here the original one, so that the AvgReturn is directly
        # the freq of success
        info['rew_rew'] = 1
    return Step(next_obs, reward, done, **info)

def step(self, action): """ Perform one step with action. :param action: the action to be performed, (x, y, z) only for pos_ctrl :return: next_obs, reward, done, info """ action = np.clip(action, self.action_space.low, self.action_space.high) if self._control_method == 'torque_control': self.forward_dynamics(action) elif self._control_method == 'position_control': assert action.shape == (3, ) action = action.copy() action *= 0.1 # limit the action reset_mocap2body_xpos(self.sim) self.sim.data.mocap_pos[:] = self.sim.data.mocap_pos + action self.sim.step() else: raise NotImplementedError obs = self.get_current_obs() next_obs = obs['observation'] achieved_goal = obs['achieved_goal'] goal = obs['desired_goal'] reward = self._compute_reward(achieved_goal, goal) done = (self._goal_distance(achieved_goal, goal) < self._distance_threshold) return Step(next_obs, reward, done)
def step(self, action: np.ndarray):
    action = np.clip(action, self.action_space.low, self.action_space.high)
    if self._control_method == 'torque_control':
        self.forward_dynamics(action)
    elif self._control_method == 'position_control':
        assert action.shape == (4, )
        action = action.copy()
        pos_ctrl, gripper_ctrl = action[:3], action[3]
        pos_ctrl *= 0.1  # limit the action
        rot_ctrl = np.array([0., 1., 1., 0.])
        gripper_ctrl = -50 if gripper_ctrl < 0 else 50
        gripper_ctrl = np.array([gripper_ctrl, -gripper_ctrl])
        action = np.concatenate([pos_ctrl, rot_ctrl, gripper_ctrl])
        ctrl_set_action(self.sim, action)  # For gripper
        mocap_set_action(self.sim, action)  # For pos control of the end effector
        self.sim.step()
    obs = self.get_current_obs()
    next_obs = obs['observation']
    achieved_goal = obs['achieved_goal']
    goal = obs['desired_goal']
    gripper_pos = obs['gripper_pos']
    reward = self._compute_reward(achieved_goal, goal, gripper_pos)
    collided = self._is_collided()
    if collided:
        reward -= 200
    done = (self._goal_distance(achieved_goal, goal) <
            self._distance_threshold) or collided
    return Step(next_obs, reward, done)

def step(self, action):
    self.forward_dynamics(action)
    obs = self.get_current_obs()
    reward = self.compute_reward()
    done = self._done
    return Step(obs, reward, done)

def step(self, action):
    self._state = self._state + action
    x, y = self._state
    reward = -(x**2 + y**2)**0.5
    done = abs(x) < 0.01 and abs(y) < 0.01
    next_observation = np.copy(self._state)
    return Step(observation=next_observation, reward=reward, done=done)

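# Hedged usage sketch (not from the source): a short random-action rollout
# against an environment built around the step above. The reset() method and
# the choice of action range are assumptions for illustration only.
def rollout(env, max_steps=100):
    obs = env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        action = np.random.uniform(-0.1, 0.1, size=2)  # small random 2-D step
        obs, reward, done, _ = env.step(action)
        total_reward += reward
        if done:  # terminated within 0.01 of the origin
            break
    return total_reward
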
def step(self, action): """ Action map = {0: up, 1: right, 2: down, 3: left} :param action: scalar encoding the action """ prev_pos_xy = self._get_pos_as_xy() # Set new state moves = np.array([[-1, 0], [0, 1], [1, 0], [0, -1]]) next_pos = np.clip(self.agent_pos + moves[action], a_min=[0, 0], a_max=[self.n_row - 1, self.n_col - 1]) next_state_type = self.map[next_pos[0], next_pos[1]] if next_state_type != 'W': self.agent_pos = next_pos else: next_state_type = 'F' # agent stays on free tile # Determine reward and termination reward = -self.STEP_PENALTY # default reward done = False # default done state if next_state_type == 'H': # hole # fall into abyss done = True reward = -1 elif next_state_type == 'C': # coin coin_idx = np.where( np.all(self.coins_pos == self.agent_pos, axis=1))[0] if not self.coins_picked[coin_idx] and not self.coin_holding: # pick a coin if 1) it's still there, and 2) I'm not holding another coin self.coin_holding = True self.coins_picked[coin_idx] = True elif next_state_type == 'G': # goal (coin drop-off area) if self.coin_holding: # drop the coin self.coin_holding = False reward = 1 if np.all(self.coins_picked): # finish if all coins have been collected and delivered done = True elif next_state_type in ['F', 'S']: # free space pass # default step penalty else: raise NotImplementedError # Return observation and others obs = self.get_current_obs() return Step(observation=obs, reward=reward, done=done, prev_pos_xy=prev_pos_xy, next_pos_xy=self._get_pos_as_xy())
def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    action = np.clip(action, *self.action_bounds)
    ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(action))
    run_cost = -1 * self.get_body_comvel("torso")[0]
    cost = ctrl_cost + run_cost
    reward = -cost
    done = False
    return Step(next_obs, reward, done)

def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    x, _, y = self.sim.data.site_xpos[0]
    dist_penalty = 0.01 * x**2 + (y - 2)**2
    v1, v2 = self.sim.data.qvel[1:3]
    vel_penalty = 1e-3 * v1**2 + 5e-3 * v2**2
    alive_bonus = 10
    r = float(alive_bonus - dist_penalty - vel_penalty)
    done = y <= 1
    return Step(next_obs, r, done)

def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    ctrl_cost = 0.5 * self.ctrl_cost_coeff * np.sum(
        np.square(action / scaling))
    forward_reward = self.get_body_comvel("torso")[0]
    reward = forward_reward - ctrl_cost
    done = False
    return Step(next_obs, reward, done)

def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    distance_to_go = self.finger_to_target_dist()
    vel_cost = 1e-2 * np.linalg.norm(self.joint_velocities())
    reward = -distance_to_go - vel_cost
    done = self.finger_to_target_dist() < self.target_size()
    return Step(next_obs, reward, done)

def step(self, action):
    # Copy action first to remove side effects
    action_copy = copy(action)
    self.forward_dynamics(action_copy)
    next_obs = self.get_current_obs()
    action_copy = np.clip(action_copy, *self.action_bounds)
    ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(action_copy))
    # note: .all() == .all() only compares truthiness of the two arrays,
    # not element-wise equality
    assert action.all() == action_copy.all()
    run_cost = -1 * self.get_body_comvel("torso")[0]
    cost = ctrl_cost + run_cost
    reward = -cost
    done = False
    return Step(next_obs, reward, done)

def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    vel = self.get_body_comvel("torso")[0]
    reward = vel + self.alive_coeff - \
        0.5 * self.ctrl_cost_coeff * np.sum(np.square(action / scaling))
    state = self._state
    notdone = np.isfinite(state).all() and \
        (np.abs(state[3:]) < 100).all() and (state[0] > .7) and \
        (abs(state[2]) < .2)
    done = not notdone
    return Step(next_obs, reward, done)

def step(self, action): """Step the environment. Args: action (object): input action Returns: Step: The time step after applying this action. """ time_step = self._env.step(action) return Step( flatten_observation(time_step.observation)['observations'], time_step.reward, time_step.step_type == StepType.LAST, **time_step.observation)
def step(self, action):
    qpos = np.copy(self.sim.data.qpos)
    qpos[2] += action[1]
    ori = qpos[2]
    # compute increment in each direction
    dx = math.cos(ori) * action[0]
    dy = math.sin(ori) * action[0]
    # ensure that the robot is within reasonable range
    qpos[0] = np.clip(qpos[0] + dx, -7, 7)
    qpos[1] = np.clip(qpos[1] + dy, -7, 7)
    self.sim.data.qpos[:] = qpos
    self.sim.forward()
    next_obs = self.get_current_obs()
    return Step(next_obs, 0, False)

def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    action = np.clip(action, *self.action_bounds)
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    ctrl_cost = 0.5 * self.ctrl_cost_coeff * \
        np.sum(np.square(action / scaling))
    forward_reward = self.get_body_comvel("torso")[0]
    reward = forward_reward - ctrl_cost
    qpos = self.sim.data.qpos
    done = not (qpos[0] > 0.8 and qpos[0] < 2.0
                and qpos[2] > -1.0 and qpos[2] < 1.0)
    return Step(next_obs, reward, done)

def step(self, latent, use_mean=True):
    latent = latent.copy()
    latent = np.clip(latent, self.action_space.low, self.action_space.high)
    accumulated_r = 0
    for _ in range(1):
        action, agent_info = self._wrapped_policy.get_action_from_latent(
            latent, self._last_obs)
        if use_mean:
            a = agent_info['mean']
        else:
            a = action
        scale = np.random.normal()
        a += scale * 0.
        obs, reward, done, info = self._wrapped_env.step(a)
        accumulated_r += reward
        self._last_obs = obs
    return Step(obs, accumulated_r, done, **info)

def step(self, action): self.forward_dynamics(action) comvel = self.get_body_comvel("torso") forward_reward = comvel[0] lb, ub = self.action_bounds scaling = (ub - lb) * 0.5 ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) contact_cost = 0.5 * 1e-3 * np.sum( np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))), survive_reward = 0.05 reward = forward_reward - ctrl_cost - contact_cost + survive_reward state = self._state notdone = np.isfinite(state).all() \ and state[2] >= 0.2 and state[2] <= 1.0 done = not notdone ob = self.get_current_obs() return Step(ob, float(reward), done)
def step(self, action): """ Note: override this method with great care, as it post-processes the observations, etc. """ reward_computer = self.compute_reward(action) # forward the state action = self._inject_action_noise(action) for _ in range(self.frame_skip): self.forward_dynamics(action) # notifies that we have stepped the world next(reward_computer) # actually get the reward reward = next(reward_computer) self._invalidate_state_caches() done = self.is_current_done() next_obs = self.get_current_obs() return Step(observation=next_obs, reward=reward, done=done)
def step(self, action):
    # enforce action space
    a = action.copy()  # NOTE: we MUST copy the action before modifying it
    a = np.clip(a, self.action_space.low, self.action_space.high)

    dist = np.linalg.norm(self._point - self._goal)
    done = dist < np.linalg.norm(self.action_space.low)

    # dense reward
    reward = -dist
    # done bonus
    if done:
        reward += self._done_bonus

    # sometimes we don't want to terminate
    done = done and not self._never_done

    return Step(np.copy(self._point), reward, done)

def step(self, action, animate=False, markers=()):
    latent = self._latents[action]
    latent_index = action
    accumulated_r = 0
    seq_info = {
        "latents": [],
        "latent_indices": [],
        "observations": [],
        "actions": [],
        "infos": [],
        "rewards": [],
        "dones": []
    }
    for _ in range(self._skip_steps):
        action, agent_info = self._wrapped_policy.get_action_from_latent(
            latent, np.copy(self._last_obs))
        if self._deterministic:
            a = agent_info['mean']
        else:
            a = action
        if animate:
            for m in markers:
                self._wrapped_env.env.get_viewer().add_marker(**m)
            self._wrapped_env.render()
            timestep = 0.05
            speedup = 1.
            time.sleep(timestep / speedup)
        obs, reward, done, info = self._wrapped_env.step(a)
        seq_info["latents"].append(latent)
        seq_info["latent_indices"].append(latent_index)
        seq_info["observations"].append(np.copy(self._last_obs))
        seq_info["actions"].append(action)
        seq_info["infos"].append(copy.deepcopy(info))
        seq_info["rewards"].append(reward)
        seq_info["dones"].append(done)
        accumulated_r += reward
        self._last_obs = obs
    return Step(obs, reward, done, **seq_info)

def step(self, action, use_mean=False):
    action = np.copy(action)
    if self._normalize:
        action /= np.linalg.norm(action)
    latent = np.sum(action * self._latents.T, axis=1)
    accumulated_r = 0
    for _ in range(1):
        action, agent_info = self._wrapped_policy.get_action_from_latent(
            latent, np.copy(self._last_obs))
        if use_mean:
            a = agent_info['mean']
        else:
            a = action
        # scale = np.random.normal()
        # a += scale * 0.
        obs, reward, done, info = self._wrapped_env.step(a)
        accumulated_r += reward
        self._last_obs = obs
    return Step(obs, accumulated_r, done, **info)

def step(self, action, use_mean=True):
    action = np.clip(action, self.action_space.low, self.action_space.high)
    idx = int(action[0])
    if idx == self.action_space.high[0]:
        idx -= 1
    latents = self._latents_combination_hash[idx]
    latent = action[1] * latents[0] + (1 - action[1]) * latents[1]
    # TODO: Make this step size a param..
    for _ in range(10):
        action, agent_info = self._wrapped_policy.get_action_from_latent(
            latent, self._last_obs)
        if use_mean:
            a = agent_info['mean']
        else:
            a = action
        scale = np.random.normal()
        a += scale * 0
        obs, reward, done, info = self._wrapped_env.step(a)
        self._last_obs = obs
    return Step(obs, reward, done, **info)

def step(self, action):
    self.forward_dynamics(action)
    next_obs = self.get_current_obs()
    alive_bonus = self.alive_bonus
    data = self.sim.data
    comvel = self.get_body_comvel("torso")
    lin_vel_reward = comvel[0]
    lb, ub = self.action_bounds
    scaling = (ub - lb) * 0.5
    ctrl_cost = .5 * self.ctrl_cost_coeff * np.sum(
        np.square(action / scaling))
    impact_cost = .5 * self.impact_cost_coeff * np.sum(
        np.square(np.clip(data.cfrc_ext, -1, 1)))
    vel_deviation_cost = 0.5 * self.vel_deviation_cost_coeff * np.sum(
        np.square(comvel[1:]))
    reward = lin_vel_reward + alive_bonus - ctrl_cost - \
        impact_cost - vel_deviation_cost
    done = data.qpos[2] < 0.8 or data.qpos[2] > 2.0
    return Step(next_obs, reward, done)

def step(self, action):
    # enforce action space
    a = action.copy()  # NOTE: we MUST copy the action before modifying it
    a *= self._action_scale
    a = np.clip(a, self.action_space.low, self.action_space.high)

    self._point = np.clip(self._point + a, -5, 5)
    self._traces[-1].append(tuple(self._point))

    dist = np.linalg.norm(self._point - self._goal)
    done = dist < np.linalg.norm(self.action_space.low)

    # dense reward
    reward = -dist
    is_success = False
    # completion bonus
    if done:
        is_success = True
        reward += self._completion_bonus

    # sometimes we don't want to terminate
    done = done and not self._never_done

    return Step(np.copy(self._point), reward, done, is_success=is_success)

def step(self, action):
    obs, reward, done, info = self.active_env.step(action)
    info['task'] = self.active_task_one_hot
    return Step(obs, reward, done, **info)

def step(self, action):
    obs, reward, done, info = super().step(action)
    oh_obs = self._obs_with_one_hot(obs)
    return Step(oh_obs, reward, done, **info)

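# Hedged sketch (not from the source): _obs_with_one_hot is assumed here to
# append the active task's one-hot encoding to the raw observation. The
# attribute name active_task_one_hot is borrowed from the multi-task wrapper
# above and is an assumption for this class.
def _obs_with_one_hot(self, obs):
    return np.concatenate([obs, self.active_task_one_hot])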