# Common imports assumed by every task variant below. PhysicsSim comes from
# the project's physics_sim module; math is used by some variants further down.
import math

import numpy as np

from physics_sim import PhysicsSim


class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None,
                 action_repeat=1):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = action_repeat

        self.state_size = self.action_repeat * 6
        self.action_low = 1
        self.action_high = 900
        # One set of four rotor speeds per repeat.
        self.action_size = self.action_repeat * 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        reward = 10.
        if self.sim.pose[2] < (self.target_pos[2] - 3.):
            # Still well below the target height: reward climbing, penalize x-y drift.
            reward += 2 * (self.sim.pose[2] - self.sim.init_pose[2])
            reward -= (abs(self.sim.pose[:2] - self.target_pos[:2]) > 3.).sum()
        else:
            reward += 10.
            reward -= 2 * (abs(self.sim.pose[:3] - self.target_pos) > 3.).sum()
            reward -= np.int32(abs(self.sim.pose[2] - self.target_pos[2]) > 3.)
        if self.sim.done:
            if self.sim.time < self.sim.runtime:
                reward -= 10  # episode ended early: crash penalty
        reward = reward / 10
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for i in range(self.action_repeat):
            # Each repeat consumes its own slice of four rotor speeds.
            done = self.sim.next_timestep(rotor_speeds[(i * 4):((i * 4) + 4)])  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, (reward / self.action_repeat), done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
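
# --- Usage sketch (illustrative, not part of the original task files) ---
# A minimal smoke test for the Task API shared by all the variants below,
# assuming only numpy and the physics_sim module imported above. The episode
# loop and the random rotor speeds are placeholders, not a training procedure.
if __name__ == '__main__':
    task = Task(action_repeat=1)
    state = task.reset()
    total_reward, done = 0.0, False
    while not done:
        # Sample one speed per rotor within the task's action bounds.
        rotor_speeds = np.random.uniform(task.action_low, task.action_high,
                                         size=task.action_size)
        state, reward, done = task.step(rotor_speeds)
        total_reward += reward
    print('episode return: {:.3f}'.format(total_reward))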
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        reward = 1. - .003 * abs(self.sim.pose[:3] - self.target_pos).sum()
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        # Penalty for deviating from the target position in the x, y, z dimensions.
        # (Note: the sum is taken before abs(), so deviations of opposite sign
        # can cancel each other out.)
        dist_penalty = abs((self.sim.pose[:3] - self.target_pos).sum())
        # Penalty for deviating from level flight, measured on the Euler angles.
        angular_penalty = abs((self.sim.pose[3:]).sum())
        # Overall reward: 10 points for staying up and not crashing, minus
        # (0.03 * dist_penalty + 0.06 * angular_penalty).
        reward_points = 10.0 - (0.03 * dist_penalty + 0.06 * angular_penalty)
        return reward_points

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3  # the paper uses 5 as the action repeat

        self.state_size = self.action_repeat * 6
        # Based on the random agent, the hover/neutral rotor speed appears to
        # be around 450, so tighten the action range around that value.
        self.action_low = 400
        self.action_high = 500
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # Negative tanh of the Euclidean distance to the target, shifted and
        # scaled so the base reward lies in (-1, 1].
        reward = 1 - 2 * np.tanh(0.3 * np.linalg.norm(self.sim.pose[:3] - self.target_pos[:3]))
        if self.sim.time < 1:
            # Early-episode bonus for time alive and upward velocity.
            reward += self.sim.time + self.sim.v[2]
        if self.sim.done and self.sim.time < self.sim.runtime:
            reward -= 10  # punish crashing before the runtime expires
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0.
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
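
# --- Reward-shape check for the class above (illustrative) ---
# Evaluating 1 - 2*tanh(0.3*d) at a few distances d shows how quickly the
# base reward decays from +1 (on target) toward -1 (far away).
for d in [0.0, 1.0, 3.0, 10.0]:
    print('d = {:4.1f} -> base reward = {:+.3f}'.format(d, 1 - 2 * np.tanh(0.3 * d)))
# d = 0.0 -> +1.000, d = 1.0 -> +0.417, d = 3.0 -> -0.433, d = 10.0 -> -0.990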
class Takeoff():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_height=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_height: target/goal height (z) for the agent; x and y keep their initial values
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal: vertical movement to the target height only, keeping the
        # x and y coordinates the same.
        if target_height is not None:
            self.target_pos = np.array([self.sim.init_pose[0], self.sim.init_pose[1], target_height])
        else:
            self.target_pos = np.array([self.sim.init_pose[0], self.sim.init_pose[1], 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        d_euclid_xy = np.sqrt(((self.sim.pose[:2] - self.target_pos[:2])**2).sum())
        d_z = abs(self.sim.pose[2] - self.target_pos[2])
        reward = np.tanh(1 - 0.002 * d_z - 0.001 * d_euclid_xy)
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Takeoff():
    def __init__(self, init_pose=np.array([0., 0., 0., .1, .1, .1]),
                 init_velocities=None, init_angle_velocities=None,
                 runtime=5., target_height=10.):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_height: target/goal height (z) for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_x = self.sim.pose[0]
        self.target_y = self.sim.pose[1]
        self.target_z = target_height

    def get_reward(self):
        reward = 1. - .3 * (abs(self.sim.pose[2] - self.target_z)).sum()
        return reward

    def step(self, rotor_speeds):
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6  # 6 states x 3 repeats = 18
        self.action_low = 350   # originally 0
        self.action_high = 700  # originally 900
        self.action_size = 1    # originally 4; one speed is shared by all rotors

        # Goal: taking off, hovering, flying to a position, or landing.
        self.target_pos = target_pos

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        reward = (1.0
                  - 0.06 * abs(self.target_pos[2] - self.sim.pose[2])
                  - 0.04 * abs(self.sim.pose[0:1] - self.target_pos[0:1]).sum()
                  - 0.02 * abs(self.sim.pose[3:6]).sum())
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        # Replicate the single learned speed across all four rotors.
        rotor_speeds = [rotor_speeds[0]] * 4
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class TakeoffTask():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_height, target_height, runtime=5):
        """Initialize a Task object.
        Params
        ======
            init_height: initial height of the quadcopter in z dimension to start from
            target_height: target height of the quadcopter in z dimension to reach for successful takeoff
            runtime: time limit for each episode
        """
        # Simulation
        self.init_height = init_height
        self.target_height = target_height
        self.runtime = runtime
        self.sim = PhysicsSim(init_pose=np.array([0., 0., init_height, 0., 0., 0.]),
                              init_velocities=np.array([0., 0., 0.]),
                              init_angle_velocities=np.array([0., 0., 0.]),
                              runtime=runtime)
        self.state_size = len(self.create_state())
        self.action_low = 0
        self.action_high = 900
        self.action_size = 1

    def get_reward(self, actual_height):
        # The reward is just the (negative absolute) difference between the
        # current and the target height.
        return -abs(actual_height - self.target_height)

    def step(self, rotor_speed):
        """Uses action to obtain next state, reward, done.

        rotor_speed is a one-element list; `rotor_speed * 4` replicates it so
        the same speed drives all four rotors.
        """
        done = self.sim.next_timestep(rotor_speed * 4)
        done = self.sim.pose[2] >= self.target_height or done
        return self.create_state(), self.get_reward(self.sim.pose[2]), done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        # Return the state array directly (it was previously wrapped in a
        # list, which did not match state_size).
        return self.create_state()

    def create_state(self):
        return np.array([self.sim.pose[2], self.sim.v[2], self.sim.linear_accel[2]])
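
# --- Usage sketch for TakeoffTask (illustrative) ---
# The action is a one-element Python list, so `rotor_speed * 4` in step()
# performs list replication, not numeric multiplication. Passing a numpy
# array instead would silently quadruple the speed, so keep it a list.
task = TakeoffTask(init_height=1., target_height=10.)
state = task.reset()
state, reward, done = task.step([450.])  # one shared speed for all four rotors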
class Task():
    def __init__(self):
        # TARGET_POSE and RUNTIME are module-level constants (see the sketch
        # after this class).
        self.sim = PhysicsSim(init_pose=TARGET_POSE, runtime=RUNTIME)
        self.state_size = self.state.shape[0]
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

    def get_reward(self):
        # Root-mean-square of the combined position and orientation error.
        error = np.concatenate([
            self.get_position_error(),
            self.get_orientation_error(),
        ])
        reward = -np.sqrt(np.mean(error**2))
        return reward

    def get_position_error(self):
        error = TARGET_POSE[:3] - self.sim.pose[:3]
        error = np.abs(error)
        return error

    def get_orientation_error(self):
        # Wrap angle errors into (-pi, pi] before taking the magnitude.
        error = TARGET_POSE[3:6] - self.sim.pose[3:6]
        error = np.remainder(error, 2.0 * np.pi)
        error[error > np.pi] -= 2.0 * np.pi
        error = np.abs(error)
        return error

    @property
    def state(self):
        return np.concatenate([
            self.sim.pose, self.sim.v, self.sim.angular_v,
            self.sim.linear_accel, self.sim.angular_accels
        ])

    def step(self, rotorSpeeds):
        done = self.sim.next_timestep(rotorSpeeds)
        reward = self.get_reward()
        nextState = self.state
        return nextState, reward, done

    def reset(self):
        self.sim.reset()
        return self.state
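
# --- Assumed module-level constants for the class above (illustrative) ---
# This variant hovers at its initial pose, so the target pose doubles as the
# initial pose. The concrete values here are placeholders, not the author's.
TARGET_POSE = np.array([0., 0., 10., 0., 0., 0.])  # x, y, z, phi, theta, psi
RUNTIME = 5.                                       # seconds per episode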
class Task():
    """Main class that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose, init_velocities, init_angle_velocities,
                 runtime, target_pos):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        # Environment
        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Target position
        self.target_pos = target_pos

    def _step(self, rotor_speeds, reward_func):
        """Uses action to obtain next state, reward, done.

        reward_func is a callable supplied by a concrete task; see the
        subclass sketch after this class.
        """
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            # update the sim pose and velocities
            done = self.sim.next_timestep(rotor_speeds)
            reward += reward_func()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
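
# --- Subclass sketch for the reward_func-based base class above ---
# The base class leaves get_reward()/step() to concrete tasks. This hover
# example (the name and reward weights are illustrative, not the author's)
# shows the intended plumbing: step() forwards its own reward function to
# _step().
class HoverTask(Task):
    def get_reward(self):
        # Simple distance-based shaping toward the target position.
        return 1. - .3 * abs(self.sim.pose[:3] - self.target_pos).sum()

    def step(self, rotor_speeds):
        return self._step(rotor_speeds, self.get_reward)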
class Task():
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        reward = 1. - .1 * (abs(self.sim.pose[:3] - self.target_pos)).sum()
        return reward

    def step(self, rotor_speeds):
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None,
                 initial_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        # 12 values per repeat: pose (6) + velocity (3) + angular acceleration (3).
        self.state_size = self.action_repeat * 12
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 100.])
        self.init_pose = init_pose[:3] if init_pose is not None else np.array([0., 0., 10.])
        self.distance_to_cover = self.distanceBetweenPoints(self.init_pose, self.target_pos)

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # Starter reward, kept for reference:
        # reward = 1. - .003 * (abs(self.sim.pose[:3] - self.target_pos)).sum()

        # Reward grows as the remaining distance d shrinks, with bonus
        # multipliers as the quadcopter gets very close to the target.
        d = self.distanceBetweenPoints(self.target_pos, self.sim.pose[:3])
        reward = self.distance_to_cover / d
        if d < 0.1:
            reward += 1.6 * reward
        elif d < 0.2:
            reward += 0.8 * reward
        elif d < 0.3:
            reward += 0.4 * reward
        else:
            reward += 0.2 * reward
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
            pose_all.append(self.sim.v)
            pose_all.append(self.sim.angular_accels)
        next_state = np.concatenate(pose_all)
        info = {}
        if done:
            info = {
                "target_pos": self.target_pos,
                "final_pose": self.sim.pose[:3],
                "init_pose": self.init_pose
            }
        return next_state, reward, done, info

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate(
            [self.sim.pose, self.sim.v, self.sim.angular_accels] * self.action_repeat)
        return state

    def distanceBetweenPoints(self, p1, p2):
        return math.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2 + (p1[2] - p2[2])**2)
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # Weights for each velocity axis; the z weight is set to zero so that
        # upward velocity is not penalized.
        self.dimension_weights = np.array([1., 1., 0.])
        vel_vector = self.sim.v
        # Scale velocities according to the weights.
        distance = np.multiply(vel_vector, self.dimension_weights) / np.linalg.norm(self.dimension_weights)
        # Reward shaped as a slightly modified derivative of tanh(): it
        # returns 1 at zero and decays to -1 at both infinities. Together with
        # the velocity weights, this discourages movement in the x-y plane.
        reward = 2 * (1 - np.tanh(np.linalg.norm(2 * distance))**2) - 1
        reward = reward / 3
        reward += (np.clip(self.sim.v[2], -5, 5) / 5) * 2 / 3

        # Penalize if the quadcopter crashes.
        if self.sim.done and self.sim.runtime > self.sim.time:
            reward = -10
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3
        self.init_velocities = init_velocities
        self.init_angle_velocities = init_angle_velocities

        # 7 values per repeat: pose (6) + vertical approach speed (1).
        self.state_size = self.action_repeat * 7
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        self.init_pose = np.array(init_pose if init_pose is not None
                                  else np.array([25., 25., 120., 0, 0, 0]))

        # Goal
        self.target_pos = np.array(target_pos if target_pos is not None
                                   else np.array([0., 0., 10.]))
        self.dist = abs(self.init_pose[:3] - self.target_pos).sum()
        self.last_dist = self.dist
        self.init_dist = abs(self.init_pose[:3] - self.target_pos).sum()
        self.init_vdist = abs(self.sim.pose[2] - self.target_pos[2])
        self.init_hdist = abs(self.sim.pose[:2] - self.target_pos[:2]).sum()
        self.last_vdist = self.init_vdist
        self.last_hdist = self.init_hdist
        self.last_pos = np.array(self.init_pose[:3])
        self.speed = 0
        self.proximity = 0
        self.speed_limit = 0.1

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        self.dist = abs(self.sim.pose[:3] - self.target_pos).sum()
        self.vdist = abs(self.sim.pose[2] - self.target_pos[2])
        self.hdist = abs(self.sim.pose[:2] - self.target_pos[:2]).sum()
        self.speed = abs(self.last_vdist - self.vdist)

        if not self.proximity:
            if self.vdist < 20:
                self.proximity = 1

        # GENTLE LANDING
        # return 1 - (self.vdist / self.init_vdist) * 1.5
        # Reward approaching the target height, damped by a speed penalty that
        # bites harder the closer the quadcopter gets.
        proximity_reward = 1 - (self.vdist / self.init_vdist)
        speed_penalty = (1 - max(self.speed, 0.05) / 1)**(1 - (self.vdist / self.init_vdist))
        if np.isnan(speed_penalty):
            speed_penalty = 0.01

        self.last_dist = self.dist
        self.last_vdist = self.vdist
        self.last_hdist = self.hdist

        return proximity_reward * speed_penalty / self.action_repeat

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        # Clip the requested speeds into the valid action range; the clipped
        # values were previously computed but never passed to the sim.
        self.rotor_speeds = self.clip(rotor_speeds)
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(self.rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            state = list(self.sim.pose)
            state.append(self.speed)
            pose_all.append(state)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        self.dist = abs(self.init_pose[:3] - self.target_pos).sum()
        self.last_dist = self.dist
        self.init_dist = self.dist
        self.init_vdist = abs(self.sim.pose[2] - self.target_pos[2])
        self.init_hdist = abs(self.sim.pose[:2] - self.target_pos[:2]).sum()
        self.last_vdist = self.init_vdist
        self.last_hdist = self.init_hdist
        self.last_pos = np.array(self.init_pose[:3])
        self.speed = 0
        self.state = list(self.sim.pose)
        self.state.append(self.speed)
        self.state = np.concatenate([self.state] * self.action_repeat)
        return self.state

    def new_target(self, target_pose):
        self.target_pos = target_pose
        print('Destination updated.')

    def clip(self, action):
        return np.clip(np.array(action), self.action_low, self.action_high)
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal: the quadcopter should lower from start coordinates (0, 0, 10)
        # to (0, 0, 8), and then hover in position. The episode ends when the
        # time expires or when the quadcopter goes out of bounds.
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 8.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # Big reward if the target location (with some error margin) has been
        # reached; bad reward if the drone gets very close to crashing.
        if np.all(abs(self.sim.pose[:3] - self.target_pos) <= 0.3):
            reward = 100
        elif self.sim.pose[2] < 1:
            reward = -100
        else:
            reward = 1. - .3 * (abs(self.sim.pose[:3] - self.target_pos)).sum()
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        # Stopping condition: the target position has been reached.
        if np.all(abs(self.sim.pose[:3] - self.target_pos) <= 0.3):
            print("target reached, at position ", self.sim.pose[:3])
            # done = True
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class LandingTask():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4
        self.sim_init_pose = self.sim.pose  # save the initial position of the quadcopter

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 0.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # Reward descending toward the target height, minus the deviation
        # between the current and the target position in the x-y plane.
        reward = (self.target_pos[2] - self.sim.pose[2]) \
            - 0.5 * (abs(self.target_pos - self.sim.pose[:3]))[:2].sum()
        if self.sim.pose[2] <= self.target_pos[2]:
            reward += 140.0  # positive points for passing the target height
        else:
            reward -= 10  # punish each step the agent is still above the ground
        # Positive reward at each step the current position is below the
        # initial position.
        reward += (self.sim_init_pose - self.sim.pose)[2]
        # Squash the whole reward into the range [-1, 1].
        reward = np.tanh(reward).sum()
        # Previous reward function, kept for reference:
        # reward = np.tanh(1 - 0.003 * (abs(self.sim.pose[:3] - self.target_pos))).sum()
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        # Also pass back the final x, y, z values.
        return next_state, reward, self.sim.pose[0], self.sim.pose[1], self.sim.pose[2], done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
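
# --- Unpacking LandingTask.step() (illustrative) ---
# Unlike the other variants, this step() returns six values: the current x,
# y and z coordinates are threaded through alongside the usual transition.
# The start pose and rotor speeds below are placeholders.
task = LandingTask(init_pose=np.array([0., 0., 10., 0., 0., 0.]))
state = task.reset()
next_state, reward, x, y, z, done = task.step(np.array([450.] * 4))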
class Takeoff():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None,
                 position_noise=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
            position_noise: std-dev of Gaussian noise added to the start position on reset
        """
        print("########2 Task to Takeoff")
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4
        self.runtime = runtime
        self.position_noise = position_noise
        self.ang_noise = None  # angular noise is not used yet, but reset() checks for it

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # reward = 1. - .3 * (abs(self.sim.pose[:3] - self.target_pos)).sum()
        reward = 0.0
        # Encourage acceleration in the +z direction (disabled):
        # reward += self.sim.linear_accel[2]
        # reward += 0.5 * self.sim.linear_accel[2] * self.sim.dt**2
        # Encourage velocity in the +z direction.
        if self.sim.v[2] > 0:
            reward = 10 * self.sim.v[2]
        else:
            reward = -10
        # Encourage smaller position errors while keeping the denominator
        # away from zero (disabled):
        # posDiff = -abs(self.sim.pose[2] - self.target_pos[2])
        # if posDiff > 0.1:
        #     reward += 10 / posDiff
        # else:
        #     reward += 100
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        # Sparse reward at the end if the target height is reached.
        if self.sim.pose[2] >= self.target_pos[2]:
            print("\nsuccessful takeoff to target!")
            reward += 10
            done = True
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim with noise to start a new episode."""
        self.sim.reset()
        # Add a random start position, as suggested by some papers, to make
        # the agent generalize better.
        if self.position_noise is not None or self.ang_noise is not None:
            rand_pose = np.copy(self.sim.init_pose)
            if self.position_noise is not None and self.position_noise > 0:
                rand_pose[:3] += np.random.normal(0.0, self.position_noise, 3)
            self.sim.pose = np.copy(rand_pose)
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=np.array([0.0, 0.0, 10.0, 0.0, 0.0, 0.0]),
                 init_velocities=np.array([0.0, 0.0, 0.1]),
                 init_angle_velocities=np.array([0.0, 0.0, 0.0]),
                 runtime=5., target_pos=np.array([0.0, 0.0, 50.0])):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 10
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

        # Used to calculate the reward.
        self.pos_diff_init = None

    def get_reward(self, done):
        """Uses current pose of sim to return reward."""
        reward = 0
        self.calc_pos_diff_ratio()
        reward = self.calc_base_reward_2(reward)
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            # done = self.check_episode_end(done)
            reward += self.get_reward(done)
            pose_all.append(self.sim.pose)
            if done:
                # Pad the state with the last pose so its size stays fixed
                # even when the episode ends mid-repeat.
                missing = self.action_repeat - len(pose_all)
                pose_all.extend([pose_all[-1]] * missing)
                break
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state

    # ========================================================================

    def calc_pos_diff_ratio(self):
        if self.pos_diff_init is None:
            self.pos_diff_init = self.sim.init_pose[:3] - self.target_pos
            self.pos_diff_init = (sum(self.pos_diff_init**2))**0.5
        pos_diff = self.sim.pose[:3] - self.target_pos
        pos_diff = (sum(pos_diff**2))**0.5
        # Normalized distance.
        self.pos_diff_ratio = pos_diff / (self.pos_diff_init + 0.001)

    def calc_base_reward_2(self, reward):
        # reward += 1 - self.pos_diff_ratio * 1.0
        reward += 1 - self.pos_diff_ratio * (1.0 / 50.0)
        return reward

    def calc_episode_end_reward(self, reward, done):
        if done and self.check_near_goal():
            reward += 100
        elif done and self.sim.pose[2] <= 0.:
            reward -= 50
        elif done and self.check_out_of_range():
            reward -= 30
        elif done and not self.check_out_of_range() and self.sim.runtime > self.sim.time:
            reward -= 20
        return reward  # the adjusted reward was previously computed but never returned

    def check_episode_end(self, done):
        if self.check_near_goal():
            done = True
        return done

    def check_near_goal(self):
        return np.abs(self.target_pos[2] - self.sim.pose[2]) < 1.0

    def check_out_of_range(self):
        return self.pos_diff_ratio > 2.0
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 1

        # State is (normalized z, normalized z velocity) per repeat.
        self.state_size = self.action_repeat * 2
        self.action_low = 390
        self.action_high = 420
        self.action_range = self.action_high - self.action_low
        self.action_size = 1

        # Goal: hover at z = 10 with x, y = 0.
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # reward = 1. - 0.3 * (abs(self.sim.pose[2] - self.target_pos[2])).sum()
        reward = np.tanh(1 - 0.005 * (abs(self.sim.pose[:3] - self.target_pos))).sum()
        # Penalize being too far from the target position (disabled):
        # distance = np.linalg.norm(self.target_pos[2] - self.sim.pose[2])
        # if distance > 2:
        #     reward = reward - min(1, 1 / (distance + 1)**2)
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            # Scale height and vertical velocity for input to the network.
            z_norm = (self.sim.pose[2] - 10) / 3
            z_v_norm = self.sim.v[2] / 3
            pose_all.append(z_norm)
            pose_all.append(z_v_norm)
        next_state = np.array(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode.

        Start each episode with the quadcopter a random (vertical) distance
        away from the target height.
        """
        self.sim.reset()
        # Perturb the start height by up to +/- perturb_unit.
        perturb_unit = 0.1
        self.sim.pose[2] += (2 * np.random.random() - 1) * perturb_unit
        z_norm = (self.sim.pose[2] - 10) / 3  # match the scaling used in step()
        z_v_norm = 0
        state = np.array([z_norm, z_v_norm] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6  # 6 states x 3 repeats = 18
        self.action_low = 500   # originally 0
        self.action_high = 850  # originally 900
        # Reduced action range; keep all 4 rotor speeds.
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        reward = 1.0
        reward += 5.0 - (abs(self.target_pos[2] - self.sim.pose[2]) / self.target_pos[2])**0.4
        reward -= .02 * abs(self.sim.pose[:2] - self.target_pos[:2]).sum()
        reward -= .01 * abs(self.sim.pose[3:6]).sum()
        # Large bonus for hitting the target height exactly (rarely triggers,
        # since the comparison is on floats).
        if self.target_pos[2] == self.sim.pose[2]:
            reward += 100.0
        # Earlier reward experiments, kept for reference:
        # reward = 1.0
        # reward += 15.0 * (1.0 - (abs(self.target_pos[2] - self.sim.pose[2]) / self.target_pos[2]))
        # reward += 0.1 * (1.0 - (abs(self.sim.pose[0:1] - self.target_pos[0:1]).sum() / 100))
        # reward += 0.1 * (1.0 - (abs(self.sim.pose[3:6]).sum() / 100))
        # reward += 1 * (1 - np.linalg.norm(self.sim.pose[:2] - [0., 0.]))
        # reward += 1 * (1 - np.linalg.norm(self.sim.pose[3:] - [0., 0., 0.]))
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward and a task-level done flag."""
        # reward = 1. - .3 * (abs(self.sim.pose[:3] - self.target_pos)).sum()
        reward = 0
        # Penalty for the squared distance from the target position, plus a
        # stability penalty on the Euler angles.
        penalty = abs(np.square(self.sim.pose[:3] - self.target_pos)).sum() \
            + abs(self.sim.pose[3:6]).sum()
        # Proximity to the target (currently unused; see the disabled bonus below).
        distance = np.sqrt((self.sim.pose[0] - self.target_pos[0])**2
                           + (self.sim.pose[1] - self.target_pos[1])**2
                           + (self.sim.pose[2] - self.target_pos[2])**2)
        # if distance < 5:
        #     reward += 100
        done = False
        if self.sim.pose[2] > self.target_pos[2]:
            reward += 50
            done = True
        # Was `reward += reward - penalty * 0.01`, which doubled the reward.
        reward -= penalty * 0.01
        return reward, done

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            delta_reward, reached_target = self.get_reward()
            reward += delta_reward
            done = done or reached_target  # the task-level done flag was previously dropped
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        # State: pose (6) + velocity (3) per repeat.
        self.state_size = self.action_repeat * 9
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 150.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        reward = 0
        penalty = 0
        current_position = self.sim.pose[:3]

        # PENALTIES
        # Squared distance from the target on the x axis.
        penalty += abs(current_position[0] - self.target_pos[0])**2
        # Squared distance from the target on the y axis.
        penalty += abs(current_position[1] - self.target_pos[1])**2
        # Squared distance from the target on the z axis, weighted more
        # heavily since height matters most for hovering.
        penalty += 12 * abs(current_position[2] - self.target_pos[2])**2
        # Penalty for uneven takeoff (non-zero Euler angles).
        penalty += abs(self.sim.pose[3:6]).sum()
        # Penalty for being far from the target while travelling fast.
        penalty += 50 * abs(abs(current_position - self.target_pos).sum() - abs(self.sim.v).sum())
        # Penalty for high variance across the body's angular velocities.
        penalty += 13 * np.var(self.sim.angular_v)

        # REWARDS
        # Ongoing reward for being airborne.
        if current_position[2] > 0.0:
            reward += 100
        # Additional reward for flying near the target: each of the x, y, z
        # axes must individually be within 10 units of the target point.
        if (np.sqrt((current_position[0] - self.target_pos[0])**2) < 10
                and np.sqrt((current_position[1] - self.target_pos[1])**2) < 10
                and np.sqrt((current_position[2] - self.target_pos[2])**2) < 10):
            reward += 1000

        # TOTAL
        return reward - (penalty * 0.0002)

    def cur_state(self):
        return np.concatenate([np.array(self.sim.pose), np.array(self.sim.v)])

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.cur_state())
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.cur_state()] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=[0., 0., 50.],
                 init_angle_velocities=None, runtime=5., target_height=100.):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_height: target/goal height (z) for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 1

        # State: pose (6) + velocity (3) per repeat.
        self.state_size = self.action_repeat * 9
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_height = target_height

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # reward = self.sim.v[2]**2
        reward = 10.0 / (1 + abs(self.sim.pose[2] - self.target_height))**2
        # Earlier experiments, kept for reference:
        # reward += 100. / (1 + abs(np.linalg.norm(self.sim.pose[:2]) - self.target_circle_radius))
        # reward -= 10.0 * np.linalg.norm(self.sim.pose[6:9])**2
        # reward += 1 * np.linalg.norm(self.sim.pose[3:6])**2 if self.is_in_circle() else -0.5
        # reward = 1 - np.tanh(abs(150 - self.sim.pose[2]) / 2 + abs(self.sim.v[2] / 2))
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(np.concatenate((self.sim.pose, self.sim.v), axis=None))
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate(
            [np.concatenate((self.sim.pose, self.sim.v), axis=None)] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # If position B (the target) is not specified, assume a default position.
        if target_pos is None:
            target_pos = np.array([0., 0., 100.])

        # Isolate the coordinates of the destination.
        x_tar = round(np.around(target_pos[0], decimals=0))
        y_tar = round(np.around(target_pos[1], decimals=0))
        z_tar = round(np.around(target_pos[2], decimals=0))

        # If position A (the start) is not specified, assume a default position.
        if init_pose is None:
            init_pose = np.array([0., 0., 0., 0., 0., 0.])

        # If the initial velocity is not specified, point the velocity of each
        # of the X, Y and Z axes toward the destination coordinates.
        if init_velocities is None:
            init_velocities = np.array([x_tar * 0.7, y_tar * 0.7, z_tar * 0.7])

        # If the initial angular velocity is not specified, assume a default.
        if init_angle_velocities is None:
            init_angle_velocities = np.array([0., 0., 0.])

        # Default values.
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3
        self.action_size = 4
        self.action_low = 0
        self.action_high = 900
        self.state_size = self.action_repeat * 6
        self.target_pos = target_pos

    def get_distance(self, x_tar, y_tar, z_tar, x_pos, y_pos, z_pos):
        """Compute the distance between two points using analytic geometry
        and vector calculus."""
        # Hypotenuse in the X-Y plane.
        cat_x = (x_tar - x_pos)**2
        cat_y = (y_tar - y_pos)**2
        hip_xy = (cat_x + cat_y)**(1 / 2)
        cat_xy = hip_xy**2
        # Hypotenuse between the XY plane and the Z axis.
        cat_z = (z_tar - z_pos)**2
        hip_xyz = (cat_xy + cat_z)**(1 / 2)
        # Return the distance from the hypotenuse projected on the X, Y and Z axes.
        hip_xyz = round(hip_xyz)
        return hip_xyz

    def get_reward(self):
        # Separate the values of the current axes.
        x_pos = round(np.around(self.sim.pose[0], decimals=0))
        y_pos = round(np.around(self.sim.pose[1], decimals=0))
        z_pos = round(np.around(self.sim.pose[2], decimals=0))
        # Separate the values of the desired axes.
        x_tar = round(np.around(self.target_pos[0], decimals=0))
        y_tar = round(np.around(self.target_pos[1], decimals=0))
        z_tar = round(np.around(self.target_pos[2], decimals=0))

        # If the quadcopter is on the ground and the ground is not the goal,
        # penalize the agent.
        if (z_tar != 0) and (z_pos < 1):
            reward = -100000000
        else:
            # Reward based on the distance from point "A" to point "B".
            dist_pB = self.get_distance(x_tar, y_tar, z_tar, x_pos, y_pos, z_pos)
            # Adjustment when the quadcopter is at the exact point.
            if dist_pB < 2:
                reward = 30000000
            else:
                reward = ((1 / dist_pB) * 10000)**2
        # Divide by 3 because of `self.action_repeat`, so the rewards follow
        # the planned structure.
        reward = reward / 3
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
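
# --- Note on get_distance above (illustrative) ---
# Apart from the final round(), get_distance is just the Euclidean norm, so
# the helper could be replaced by numpy directly. A quick check with a
# 3-4-12 right-triangle triple:
p1, p2 = np.array([0., 0., 0.]), np.array([3., 4., 12.])
assert round(np.linalg.norm(p1 - p2)) == 13  # same result as get_distance()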
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None,
                 posn_tolerance=0.1, ang_pos_tolerance=0.1,
                 target_vel=None, vel_tolerance=0.01,
                 target_ang_vel=None, ang_vel_tolerance=0.01):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
            posn_tolerance: tolerance on the position in all of x, y and z
            ang_pos_tolerance: tolerance on the Euler angles
            target_vel: target/goal velocities in x, y, z for the agent
            vel_tolerance: tolerance on the velocity in all of x, y and z
            target_ang_vel: target angular velocities for the three Euler angles
            ang_vel_tolerance: tolerance on the angular velocity in all 3 of the Euler angles
        """
        # Simulation
        # Not amending the sim, but time is useful in the reward, hence these
        # hard-coded values mirroring the sim's clock.
        self.sim_time = 0.0
        self.sim_dt = 1 / 50.0
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])
        self.target_vel = target_vel if target_vel is not None else np.array([0., 0., 0.])
        self.target_ang_vel = target_ang_vel if target_ang_vel is not None else np.array([0., 0., 0.])
        # Tolerances (previously referenced as undefined bare names in get_reward).
        self.posn_tolerance = posn_tolerance
        self.ang_pos_tolerance = ang_pos_tolerance
        self.vel_tolerance = vel_tolerance
        self.ang_vel_tolerance = ang_vel_tolerance

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # Starter reward:
        # reward = 1. - .3 * (abs(self.sim.pose[:3] - self.target_pos)).sum()

        # Goal is to hover at the start position:
        # be close to the start position,
        distance_from_goal = np.linalg.norm(self.sim.pose[:3] - self.target_pos)
        # not be moving up/down/left/right/forward/backward,
        speed = abs(self.sim.v).sum()
        # and not be spinning.
        ang_speed = abs(self.sim.angular_v).sum()

        if distance_from_goal <= self.posn_tolerance:
            # Reward for being within tolerance of the goal.
            if speed <= self.vel_tolerance and ang_speed <= self.ang_vel_tolerance:
                # The quad is near the goal and broadly stationary.
                reward = 10.0
            else:
                # Reduce the reward proportionally to speed/ang_speed.
                reward = 10.0 - 0.2 * speed - 0.2 * ang_speed
        else:
            # The quad is not near enough to the goal, so it should be moving;
            # reward based solely on distance at this point.
            reward = -1.0 * distance_from_goal
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0.0
        pose_all = []
        for _ in range(self.action_repeat):
            # Update the time like the sim does.
            self.sim_time += self.sim_dt
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""
    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        print('')
        print('init_pose: ', init_pose)
        print('init_velocities: ', init_velocities)
        print('target_pos: ', target_pos)
        print('')
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 130   # originally 0
        self.action_high = 730  # originally 900
        self.action_size = 1    # one shared speed instead of 4 rotor speeds
        self.alpha = 1e-4  # learning rate for the actor / policy update
        self.beta = 1e-3   # learning rate for the critic / value update

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self, done):
        """Reward function for the takeoff task."""
        # Rewards depending on position: penalize moving away from the target
        # band (descending while below it, or climbing while above it).
        if (self.sim.pose[2] < self.target_pos[2] + 1) and (self.sim.v[2] < 0):
            reward = -.5
        elif (self.sim.pose[2] > self.target_pos[2] - 1) and (self.sim.v[2] > 0):
            reward = -.5
        else:
            reward = 1. - .125 * abs(self.target_pos[2] - self.sim.pose[2])
        reward = np.clip(reward, -1, 1)
        # Penalize a crash.
        if done and self.sim.time < self.sim.runtime:
            reward += -1
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            # `* np.ones(4)` replicates the single learned speed to all 4 rotors;
            # use it only when working with one shared rotor speed.
            done = self.sim.next_timestep(rotor_speeds * np.ones(4))  # update the sim pose and velocities
            # Note: the reward is overwritten (not accumulated) on each
            # repeat, so only the last repeat's reward is returned.
            reward = self.get_reward(done)
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) self.action_repeat = 3 self.state_size = self.action_repeat * 9 self.action_low = 0 self.action_high = 900 self.action_size = 4 # Goal self.target_pos = target_pos if target_pos is not None else np.array( [0., 0., 0.]) def get_reward(self): """Uses current pose of sim to return reward.""" # Calculate distance from target dist_from_target = np.sqrt( np.square(self.sim.pose[:3] - self.target_pos).sum()) #dist_from_target_squared = np.square(self.sim.pose[:3] - self.target_pos).sum() # Calculate distance from vertical axis #dist_from_axis = np.sqrt(np.square(self.sim.pose[:2] - self.target_pos[:2]).sum()) # Calculate angular deviation angular_deviation = np.sqrt(np.square(self.sim.pose[3:]).sum()) # Calculate velocity in xy plane #non_z_v = np.square(self.sim.v[:2]).sum() # Calculate overall velocity #vel = np.square(self.sim.v[:3]).sum() penalty = 0.004 * dist_from_target * dist_from_target #penalty = 0.015 * dist_from_target_squared # Penalty based on angular deviation to encourage the quadcopter to remain upright penalty += 0.008 * angular_deviation # Penalty based on movement in the xy plane to encourage the quadcopter to fly vertically #if dist_from_axis > 4: # penalty += 0.1 # Penalty for high velocity to encourage quadcopter to fly slower #if vel > 10.0: # penalty += 0.1 #if self.sim.pose[2] > self.target_pos[2] + 5: # penalty += 0.01 bonus = 1.0 #if dist_from_target < 0.5: # bonus += 0.01 # Calculate reward reward = bonus - penalty # Reward for time to encourage quadcopter to remain in the air #reward += 0.001 * self.sim.time # clamp reward to [-1, 1] #return min(1.0, max(-1.0, reward)) return np.tanh(reward) def step(self, rotor_speeds): """Uses action to obtain next state, reward, done.""" reward = 0 pose_all = [] for _ in range(self.action_repeat): done = self.sim.next_timestep( rotor_speeds) # update the sim pose and velocities reward += self.get_reward() pose_all.append(self.sim.pose) pose_all.append(self.sim.v) next_state = np.concatenate(pose_all) return next_state, reward, done def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() pose_all = np.append(self.sim.pose, self.sim.v) state = np.concatenate([pose_all] * self.action_repeat) return state
class TaskHover():
    """Hover task exposing gym-style action/observation spaces.
    Requires `from gym import spaces` and a PhysicsSim variant whose reset()
    accepts positionNoise/angleNoise/angleRateNoise parameters."""

    def __init__(self, positionStart=None, positionStartNoise=0, positionTarget=None,
                 nActionRepeats=1, runtime=10.,
                 factorPositionZ=0.15, factorPositionXY=0.0, factorVelocityXY=0.3,
                 factorAngles=0.5, factorAngleRates=0.0, factorAngleAccels=0.0,
                 factorActions=0.05, factorGlobal=1.0,
                 angleNoise=0, angleRateNoise=0):
        # Internalize parameters.
        # Positions.
        self.positionTarget = positionTarget if positionTarget is not None else np.array([0., 0., 10.])
        self.positionStart = positionStart if positionStart is not None else np.array([0., 0., 0.])

        # Reward factors.
        self.factorPositionZ = factorPositionZ
        self.factorPositionXY = factorPositionXY
        self.factorAngles = factorAngles
        self.factorAngleRates = factorAngleRates
        self.factorAngleAccels = factorAngleAccels
        self.factorActions = factorActions
        self.factorGlobal = factorGlobal
        self.factorVelocityXY = factorVelocityXY
        self.positionStartNoise = positionStartNoise
        self.angleNoise = angleNoise
        self.angleRateNoise = angleRateNoise

        # Number of action repeats.
        # For each agent time step we step the simulation multiple times and stack the states.
        self.nActionRepeats = nActionRepeats

        # Define action and state spaces.
        # Use OpenAI gym spaces to make this task behave like a gym environment,
        # so we can use the same agent as for the pendulum.

        # Init action space. Two actions, one for each side of the copter.
        # Limit actions to a range around hovering.
        self.action_low = 390
        self.action_high = 440
        self.action_space = spaces.Box(low=self.action_low, high=self.action_high, shape=(2,))

        # The state comprises the target vector XZ, velocity XY, cos(theta),
        # sin(theta) and the theta angular velocity (7 values).
        # Positions are bounded by the environment bounds, angles by +-pi and
        # angular velocities by 40 rad/s (happens with two rotors at 0 and two at 900).
        # Because we step multiple times we receive a multiple of the state vector.
        low = np.tile(np.hstack(([-10, -10], [-20, -20], -1, -1, -40)), self.nActionRepeats)
        high = np.tile(np.hstack(([10, 10], [20, 20], 1, 1, 40)), self.nActionRepeats)
        self.observation_space = spaces.Box(low=low, high=high)

        # Init initial conditions; position/angle noise is applied in reset().
        init_pose = np.hstack((self.positionStart, np.array([0., 0., 0.])))
        # Initial velocity 0,0,0.
        init_velocities = np.array([0., 0., 0.])
        # Initial angular velocity 0,0,0.
        init_angle_velocities = np.array([0., 0., 0.])

        # Simulation.
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)

    def get_reward(self, rotor_speeds):
        return (self.rewardVelocityXY(self.sim.v)
                + self.rewardAction(rotor_speeds)
                + self.rewardPositionXY(self.sim.pose)
                + self.rewardPositionZ(self.sim.pose)
                + self.rewardAngles(self.sim.pose, self.sim.angular_v)
                + self.rewardAngleRates(self.sim.pose[3:6], self.sim.angular_v)
                + self.rewardAngleAccels(self.sim.angular_v, self.sim.angular_accels))

    def rewardPositionZ(self, pose):
        # Exponential falloff with distance from the target height.
        # cGauss = 2
        # yGauss = np.exp(-((pose[2] - self.positionTarget[2])**2 / (2 * cGauss**2)))
        cExp = -0.2
        yExp = np.exp(cExp * np.abs(self.positionTarget[2] - pose[2]))
        return self.factorPositionZ * yExp * self.factorGlobal

    def rewardPositionXY(self, pose):
        a = .5
        return self.factorPositionXY * np.exp(
            -np.power(np.linalg.norm(self.positionTarget[:2] - pose[:2]), a)) * self.factorGlobal

    def rewardAngles(self, pose, angular_v):
        # Convert angles from 0..2*pi to -pi..pi and take the largest magnitude
        # of roll and pitch.
        angles = np.max(np.abs([a if a <= np.pi else a - 2 * np.pi for a in pose[3:5]]))
        # Only give reward for small angles if the angular velocity is small as well;
        # thereby we avoid giving reward while the copter is tumbling.
        cV = .8
        factorV = np.max(np.abs(angular_v[0:2]))
        factorV = np.exp(-((factorV**2) / (2 * cV**2)))
        cA = 0.5
        return np.exp(-((angles**2) / (2 * cA**2))) * factorV * self.factorAngles * self.factorGlobal

    def rewardVelocityXY(self, v):
        c = 4
        vNorm = np.linalg.norm(v)
        return np.exp(-((vNorm**2) / (2 * c**2))) * self.factorVelocityXY * self.factorGlobal

    def rewardAngleRates(self, angles, angular_v):
        # Abandoned alternative formulations:
        # return (np.exp(-((x**2)/(2*c**2)))*2-1) * self.factorAngleRates
        # return np.tanh(np.sum(-np.sign([a if a <= np.pi else a-2*np.pi for a in angles[0:2]]) * angular_v[0:2])/2.) * self.factorAngleRates
        # Reward rotation that reduces the tilt: positive when the angular velocity
        # points opposite to the (wrapped) angle.
        rewards = -np.tanh([a if a <= np.pi else a - 2 * np.pi for a in angles[0:2]]) * np.tanh(angular_v[0:2])
        rewardMax = rewards[np.argmax(np.abs(rewards))]
        return rewardMax * self.factorAngleRates * self.factorGlobal

    def rewardAngleAccels(self, angular_v, angular_accels):
        # Only reward acceleration if it reduces the absolute angle rate:
        # if the angle rate is positive, a negative acceleration is rewarded and vice versa.
        return np.tanh(np.sum(-np.sign(angular_v[0:2]) * angular_accels[0:2]) / 2.) \
            * self.factorAngleAccels * self.factorGlobal

    def rewardAction(self, rotor_speeds):
        # Penalize high mean rotor speed, normalized to the action range.
        return -(np.mean(rotor_speeds) - self.action_low) / (self.action_high - self.action_low) * self.factorActions

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.nActionRepeats):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward(rotor_speeds) / self.nActionRepeats
            # Direction to the target in the XZ plane, with magnitude capped at 10.
            vectorTarget = self.positionTarget[[0, 2]] - self.sim.pose[[0, 2]]
            distanceTarget = np.max((np.linalg.norm(vectorTarget), 1e-10))
            pose_all.append(vectorTarget / distanceTarget * np.min((10.0, distanceTarget)))
            pose_all.append(self.sim.v[[0, 2]])
            # pose_all.append([self.sim.pose[4] if self.sim.pose[4] <= np.pi else self.sim.pose[4]-2*np.pi])
            pose_all.append([np.cos(self.sim.pose[4])])
            pose_all.append([np.sin(self.sim.pose[4])])
            pose_all.append([np.max((np.min((self.sim.angular_v[1], np.pi)), -np.pi))])
        next_state = np.concatenate(pose_all)
        return next_state, reward, done, None

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset(positionNoise=self.positionStartNoise,
                       angleNoise=self.angleNoise,
                       angleRateNoise=self.angleRateNoise)
        vectorTarget = self.positionTarget[[0, 2]] - self.sim.pose[[0, 2]]
        distanceTarget = np.max((np.linalg.norm(vectorTarget), 1e-10))
        state = np.concatenate([
            vectorTarget / distanceTarget * np.min((10.0, distanceTarget)),
            self.sim.v[[0, 2]],
            [np.cos(self.sim.pose[4])],
            [np.sin(self.sim.pose[4])],
            [self.sim.angular_v[1]]
        ] * self.nActionRepeats)
        return state
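# Because TaskHover exposes gym-style action_space/observation_space and a
# 4-tuple step() return, an agent written for gym environments can drive it
# directly. A minimal episode loop; this sketch assumes gym is installed and
# the noise-aware PhysicsSim variant mentioned above is available:
#
# env = TaskHover(nActionRepeats=3, positionStartNoise=0.5)
# state = env.reset()
# done = False
# while not done:
#     action = env.action_space.sample()  # two rotor speeds in [390, 440]
#     state, reward, done, _ = env.step(action)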
class Task(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) self.action_repeat = 3 self.state_size = self.action_repeat * 6 self.action_low = 0 self.action_high = 900 self.action_size = 1 # All four propellant with same thrust # Goal self.target_pos = target_pos if target_pos is not None else np.array( [0., 0., 10.]) def get_reward(self, lv, av): """Uses current pose, v and angular_v of sim to return reward.""" reward = 0 """current pose of sim to calculate reward.""" reward += (10. - .2 * (abs(self.sim.pose[:3] - self.target_pos)).sum()) """current linear velocity of sim to calculate reward.""" reward += 5 - .3 * (abs(self.sim.v - lv)).sum() """current angular velocity of sim to calculate reward.""" #reward += -1*(abs(self.sim.angular_v - av)).sum() return reward def step(self, rotor_speeds): """Uses action to obtain next state, reward, done.""" reward = 0 pose_all = [] prev_v = [] prev_ang_v = [] for _ in range(self.action_repeat): prev_v = self.sim.v prev_ang_v = self.sim.angular_v # update the sim pose and velocities done = self.sim.next_timestep(rotor_speeds) reward += self.get_reward(prev_v, prev_ang_v) pose_all.append(self.sim.pose[:-3]) pose_all.append( self.sim.angular_v ) #pose_all.append(self.sim.v) # Added velocities to the states next_state = np.concatenate(pose_all) return next_state, reward, done def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() state = np.concatenate( [self.sim.pose[:-3], self.sim.angular_v] * self.action_repeat) # Added velocities to the states return state
class Task(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., sel_state_variables=['pose', 'v', 'angular_v'], target_state_variables=None, repeat=2): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime, sel_state_variables) self.action_repeat = repeat # Goal, current state is target state if not specified if target_state_variables == None: self.target_pose = self.sim.pose self.target_velocity = self.sim.v self.target_angular_velocity = self.sim.angular_v target_state_variables = self.sim.get_state_variables() self.target_state_variables = target_state_variables self.state_size = self.action_repeat * len(target_state_variables) self.action_low = 0 self.action_high = 900 self.action_size = 4 def get_state_variables(self): return self.sim.get_state_variables() def get_state_labels(self): return self.sim.get_state_labels() def get_time(self): return self.sim.get_time() def get_reward(self): # quadratic reward function dif = self.sim.get_state_variables() - self.target_state_variables reward = 5 - dif.dot(dif) / 25.0 return reward def step(self, rotor_speeds): """Uses action to obtain next state, reward, done.""" reward = 0 pose_all = [] for _ in range(self.action_repeat): done = self.sim.next_timestep(rotor_speeds) # update the sim pose and velocities reward += self.get_reward() pose_all.append(self.sim.get_state_variables()) next_state = np.concatenate(pose_all) return next_state, reward, done def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() state = np.array(self.get_state_variables().tolist() * self.action_repeat) return state
class Task(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) self.action_repeat = 3 self.state_size = self.action_repeat * 6 #self.state_size = self.action_repeat * 9 self.action_low = 0 self.action_high = 900 self.action_size = 4 # Goal self.target_pos = target_pos if target_pos is not None else np.array( [0., 0., 1.]) self.init_pose = init_pose if init_pose is not None else np.array( [0., 0., 20.]) self.last_pose = self.init_pose def get_reward(self): """Uses current pose of sim to return reward.""" xy_penalty = 0.01 * abs(self.sim.v[0]) if abs( self.sim.v[0]) > 0. else 0 xy_penalty += 0.01 * abs(self.sim.v[1]) if abs( self.sim.v[1]) > 0. else 0 downward_penalty = 0.3 * abs( self.sim.pose[2] - self.init_pose[2]) if self.init_pose[2] > self.sim.pose[2] else 0 z_displacement = self.sim.pose[2] - self.last_pose[ 2] #To appoarch value 0.3 per 0.02s, 5m per second z_displacement *= 10 #scale up upward_reward = 0.5 * norm( loc=3, scale=1.).pdf(z_displacement) if z_displacement > 0. else 0 #max 0.195 upward_reward += min(1., 0.3 * abs(self.sim.pose[2] - self.init_pose[2]) ) if self.init_pose[2] < self.sim.pose[2] else 0 #upward_reward += 0.3 * abs(self.sim.pose[2] - self.init_pose[2]) if self.init_pose[2] < self.sim.pose[2] else 0 #max 1 reward = -1 * (xy_penalty + downward_penalty) + upward_reward reward = np.tanh(reward) # if self.sim.pose[2] > self.target_pos[2]: # reward += 10 return reward def step(self, rotor_speeds): """Uses action to obtain next state, reward, done.""" reward = 0 pose_all = [] self.last_pose = np.copy(self.sim.pose) for _ in range(self.action_repeat): done = self.sim.next_timestep( rotor_speeds) # update the sim pose and velocities reward += self.get_reward() pose_all.append(self.sim.pose) #pose_all.append(self.sim.v) next_state = np.concatenate(pose_all) return next_state, reward, done def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() state = np.concatenate([self.sim.pose] * self.action_repeat) #state = np.concatenate((self.sim.pose,self.sim.v)) #state = np.concatenate([state] * self.action_repeat) self.last_pose = np.copy(self.init_pose) return state
class Landing(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) self.action_repeat = 3 self.state_size = self.action_repeat * 6 self.action_low = 0 self.action_high = 900 self.action_size = 4 self.runtime = runtime # Goal self.target_velocity = np.array([0.0, 0.0, 0.0]) # ideally zero velocity self.last_timestamp = 0 self.last_position = np.array([0.0, 0.0, 0.0]) self.target_pos = target_pos if target_pos is not None else np.array( [0., 0., 10.]) def istargetzone(self): """Checks whether the copter has landed in target zone or not""" flag = False cntr = 0 position = self.sim.pose[:3] #Set upper bound and lower bound for target zone target_bounds = 40 lower_bounds = np.array([-target_bounds / 2, -target_bounds / 2, 0]) upper_bounds = np.array( [target_bounds / 2, target_bounds / 2, target_bounds]) #Set boundary conditions lower_pos = (self.target_pos + lower_bounds) upper_pos = (self.target_pos + upper_bounds) #Check whether the copter has landed with the boundaries of target zone for j in range(3): #Check for the boundary conditions if (lower_pos[j] <= position[j] and position[j] < upper_pos[j]): cntr = cntr + 1 #Check if all 3 conditions have been satisfied if cntr == 3: flag = True return flag def get_reward(self): """Uses current pose of sim to return reward.""" #Calculate distance between current position and target position distance = np.linalg.norm((self.sim.pose[:3] - self.target_pos)) distance_max = np.linalg.norm(self.sim.upper_bounds) #Calculate velocity velocity = np.linalg.norm((self.sim.v - self.target_velocity)) # Calculate distance factor and velocity factor distance_factor = 1 / max(distance, 0.1) vel_discount = (1.0 - max(velocity, 0.1)**(distance_factor)) reward = 0 # Penalize agent running out of time if self.sim.time >= self.runtime: reward = -10.0 self.sim.done = True else: # Agent has touched the ground surface (i.e. z=0) if (self.sim.pose[2] == self.target_pos[2]): # If velocity is less than the specified threshold # it implies that the agent has landed successfulyy if (self.sim.v[2] <= 1): if (self.istargetzone() == True): #Landed safely. 
Give bonus points for landing in the target zone landing_reward = 100.0 print('Agent has landed in the target zone') else: reward = -100.0 #Landed outside target zone print('outside') else: #Penalize agent for crashing reward = -100 # Crashed self.sim.done = True else: if (np.isnan(self.sim.v[2]) == False): # Depending upon the distance of the copter from the target position a normal penalty has been applied distance_reward = 0.2 - (distance / distance_max)**0.1 reward = vel_discount * distance_reward else: #Penalize agent for crashing reward = -100 # Crashed self.sim.done = True #Apply tanh to avoid instability in training due to exploding gradients reward = np.tanh(reward) return reward def step(self, rotor_speeds): """Uses action to obtain next state, reward, done.""" reward = 0 pose_all = [] for _ in range(self.action_repeat): #print(rotor_speeds) done = self.sim.next_timestep( rotor_speeds) # update the sim pose and velocities reward += self.get_reward() pose_all.append(self.sim.pose) next_state = np.concatenate(pose_all) return next_state, reward, done def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() self.last_timestamp = 0 self.last_position = np.array([0.0, 0.0, 0.0]) state = np.concatenate([self.sim.pose] * self.action_repeat) return state
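# A minimal episode loop for the Landing task above, assuming the project's
# PhysicsSim is importable. The constant rotor speeds are a hypothetical
# stand-in for a trained agent's actions:
#
# import numpy as np
#
# task = Landing(init_pose=np.array([0., 0., 50., 0., 0., 0.]),
#                target_pos=np.array([0., 0., 10.]))
# state = task.reset()
# done, total_reward = False, 0.0
# while not done:
#     state, reward, done = task.step(np.array([400., 400., 400., 400.]))
#     total_reward += reward
# print('episode return:', total_reward)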