def step(self, action):
    done = False
    action = np.asarray(action).squeeze()
    action[0] = np.clip(action[0], -self.max_F, self.max_F)
    action[1] = np.clip(action[1], -self.max_M, self.max_M)

    # Integrate the dynamics, holding the action fixed for act_hold simulator steps
    for _ in range(self.act_hold):
        ns = rk4(self._derivs, action, 0, self.dt, self.state)
        self.state[0] = ns[0]
        self.state[1] = ns[1]
        self.state[2] = wrap(ns[2], -2 * pi, 2 * pi)  # We might not need to wrap ...
        self.state[3] = ns[3]
        self.state[4] = ns[4]
        self.state[5] = ns[5]

    # Did it hit the wall? term is False if not, True if it did
    term = any(self.state[0] >= self.deadzone_x[:, 0]) and any(
        self.state[0] <= self.deadzone_x[:, 1]) and any(
        self.state[1] >= self.deadzone_y[:, 0]) and any(
        self.state[1] <= self.deadzone_y[:, 1])

    # Clipped quadratic penalties on the distance to the x, y, and theta targets
    xpen = -0.01 * np.clip((self.state[0] - self.xtarg) ** 2, -4, 4)
    ypen = -0.01 * np.clip((self.state[1] - self.ytarg) ** 2, -4, 4)
    thpen = -0.01 * np.clip((self.state[2] - self.theta_targ) ** 2, -2, 2)
    reward = xpen + ypen + thpen - 10 * term

    self.cur_step += 1
    if self.cur_step > self.num_steps or term:
        done = True

    return self._get_obs(), reward, done, {}
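# --- Reference sketch, for context only ---
# The step() methods in this file assume `wrap`, `euler`, and `rk4` helpers imported
# from elsewhere in the repo. Minimal versions are sketched below so the usage above
# is self-explanatory; the call signatures match how they are used here, but the
# argument order passed to `derivs` (assumed to be derivs(t, state, action)) and the
# exact implementations are assumptions, not the repo's actual code.

def wrap(x, low, high):
    """Wrap the scalar x into the interval [low, high) by shifting in multiples of the width."""
    width = high - low
    return ((x - low) % width) + low


def euler(derivs, action, t, dt, state):
    """Single explicit-Euler step of state' = derivs(t, state, action) over dt."""
    return state + dt * derivs(t, state, action)


def rk4(derivs, action, t, dt, state):
    """Single classical fourth-order Runge-Kutta step of the same ODE over dt."""
    k1 = derivs(t, state, action)
    k2 = derivs(t + dt / 2, state + dt / 2 * k1, action)
    k3 = derivs(t + dt / 2, state + dt / 2 * k2, action)
    k4 = derivs(t + dt, state + dt * k3, action)
    return state + (dt / 6) * (k1 + 2 * k2 + 2 * k3 + k4)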
def step(self, action):
    done = False

    # RL algorithms aware of the action space won't need this, but things like
    # imitation learning or energy shaping controllers might try feeding in
    # something above the torque limit
    torque = np.clip(action, -self.TORQUE_MAX, self.TORQUE_MAX)
    # torque = action

    # Add noise to the force action
    if self.torque_noise_max > 0:
        torque += self.np_random.uniform(-self.torque_noise_max, self.torque_noise_max)

    # Integrate the dynamics; the simulator runs at a higher rate than the controller
    for _ in range(5):
        # ns = rk4(self._derivs, torque, 0, self.dt, self.state)
        ns = euler(self._derivs, torque, 0, self.dt, self.state)

        self.state[0] = wrap(ns[0], -2 * pi, 2 * pi)
        # self.state[0] = ns[0]
        self.state[1] = ns[1]
        # self.state[1] = np.clip(ns[1], -self.X_MAX, self.X_MAX)
        self.state[2] = ns[2]
        self.state[3] = ns[3]
        # self.state[2] = np.clip(ns[2], -self.DTHETA_MAX, self.DTHETA_MAX)
        # self.state[3] = np.clip(ns[3], -self.DX_MAX, self.DX_MAX)

    # Should reward be something we pass in? I do like to mess with them a lot...
    # Sparse reward: 1 while the pole is within 0.2 rad of upright, otherwise 0 and the episode ends
    if (pi - 0.2) < self.state[0] < (pi + 0.2):
        reward = 1.0
    else:
        reward = 0.0
        done = True

    # if (np.pi - .1 < self.state[0] < np.pi + .1) and (-.1 < np.
    # upright = (np.cos(self.state[0]) + 1) / 2
    # centered = rewards.tolerance(self.state[1], margin=2)
    # centered = (1 + centered) / 2
    # small_control = rewards.tolerance(torque.item(), margin=1,
    #                                   value_at_margin=0,
    #                                   sigmoid='quadratic')
    # small_control = (4 + small_control) / 5
    # small_velocity = rewards.tolerance(self.state[2], margin=5)
    # small_velocity = (1 + small_velocity) / 2
    # reward = upright.mean() * small_control * small_velocity * centered

    self.cur_step += 1
    if self.cur_step > self.num_steps:
        done = True
    elif np.abs(self.state[1]) > self.X_MAX:
        # done = True
        reward -= 5

    return self.state.copy(), reward, done, {}
def step(self, action):
    done = False

    # RL algorithms aware of the action space won't need this, but things like
    # imitation learning or energy shaping controllers might try feeding in
    # something above the torque limit
    # torque = np.clip(action, -self.TORQUE_MAX, self.TORQUE_MAX)
    torque = np.clip(action * 1, -self.TORQUE_MAX, self.TORQUE_MAX)

    # Add noise to the force action
    if self.torque_noise_max > 0:
        torque += self.np_random.uniform(-self.torque_noise_max, self.torque_noise_max)

    # Randomly apply a perturbation push to the pole angle while it is near upright
    push = 0
    if (self.state[0] > 145 * pi / 180) and (self.state[0] < 215 * pi / 180):
        rand = random.random()
        if rand > 0.99:
            push = random.gauss(0, 0.2)
    self.state[0] += push

    # Do the integration update; we update the simulator at a higher rate than we update the control
    for _ in range(5):
        ns = rk4(self._derivs, torque, 0, self.dt, self.state)
        # ns = euler(self._derivs, torque, 0, self.dt, self.state)

        self.state[0] = wrap(ns[0], -2 * pi, 2 * pi)
        # self.state[0] = ns[0]
        self.state[1] = ns[1]
        # self.state[1] = np.clip(ns[1], -self.X_MAX, self.X_MAX)
        self.state[2] = ns[2]
        self.state[3] = ns[3]
        # self.state[2] = np.clip(ns[2], -self.DTHETA_MAX, self.DTHETA_MAX)
        # self.state[3] = np.clip(ns[3], -self.DX_MAX, self.DX_MAX)

    # Should reward be something we pass in? I do like to mess with them a lot...
    # Dense reward: largest when the pole is upright (theta = pi)
    reward = -np.cos(self.state[0])

    self.cur_step += 1
    if self.cur_step > self.num_steps:
        done = True
    elif np.abs(self.state[1]) > self.X_MAX:
        reward -= 5

    return self.state, reward, done, {}
def step(self, action):
    done = False

    # RL algorithms aware of the action space won't need this, but things like
    # imitation learning or energy shaping controllers might try feeding in
    # something above the torque limit
    torque = np.clip(action, -self.TORQUE_MAX, self.TORQUE_MAX)
    # torque = action

    # Add noise to the force action
    if self.torque_noise_max > 0:
        torque += self.np_random.uniform(-self.torque_noise_max, self.torque_noise_max)

    # Integrate the dynamics; the simulator runs at a higher rate than the controller
    for _ in range(5):
        # ns = euler(self._derivs, torque, 0, self.dt, self.state)
        ns = rk4(self._derivs, torque, 0, self.dt, self.state)

        self.state[0] = wrap(ns[0], -pi, pi)
        # self.state[0] = ns[0]
        self.state[1] = ns[1]
        # self.state[1] = np.clip(ns[1], -self.X_MAX, self.X_MAX)
        self.state[2] = ns[2]
        self.state[3] = ns[3]
        # self.state[2] = np.clip(ns[2], -self.DTHETA_MAX, self.DTHETA_MAX)
        # self.state[3] = np.clip(ns[3], -self.DX_MAX, self.DX_MAX)

    # Should reward be something we pass in? I do like to mess with them a lot...
    # reward = -np.cos(self.state[0]) + 2
    reward = (-5 * np.cos(self.state[0])) + 2

    self.cur_step += 1
    if self.cur_step > self.num_steps:
        done = True
    elif np.abs(self.state[1]) > self.X_MAX:
        done = True

    return self.state, reward, done, {}
def _get_obs(self):
    obs = self.state.copy()
    obs[0] = wrap(obs[0], self.th1_range[0], self.th1_range[1])
    obs[1] = wrap(obs[1], self.th2_range[0], self.th2_range[1])
    return obs
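# Example usage, for context only: a generic gym-style rollout loop driving any of the
# environments above. The (obs, reward, done, info) return signature is taken from the
# step() methods; the existence of a standard reset() and the `rollout`/`policy` names
# used here are assumptions for illustration, not part of this repo.

def rollout(env, policy, max_steps=1000):
    """Run one episode with the given policy and return the total reward collected."""
    obs = env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        act = policy(obs)  # e.g. a learned policy or an energy shaping controller
        obs, reward, done, _ = env.step(act)
        total_reward += reward
        if done:
            break
    return total_reward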