import numpy as np

# PhysicsSim is the quadcopter simulator provided with the project
# (physics_sim.py in the Udacity quadcopter project).
from physics_sim import PhysicsSim


class Task:
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(
        self,
        init_pose=None,
        init_velocities=None,
        init_angle_velocities=None,
        runtime=5.0,
        target_pos=None,
    ):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities,
                              init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_pos = (target_pos if target_pos is not None
                           else np.array([0.0, 0.0, 10.0]))

    @staticmethod
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def get_reward(self):
        """Uses the current pose of the sim to return the reward."""
        # The farther from the target position, the more we subtract, scaled
        # by scale_factor.
        # scale_factor = 0.3  # positive reward within a distance of 3.333
        scale_factor = 0.8  # positive reward within a distance of 1.25

        # Position is a much more important factor, so it has a higher scale
        # than velocity. The reward defines what the "best position" is, so the
        # best position should be determined mostly by the physical location,
        # not by merely holding a very small velocity.
        scale_factor_vel = 0.001

        # Penalize high body velocity to keep the flight smooth.
        reward = (
            1.0
            - scale_factor * ((abs(self.sim.pose[:3] - self.target_pos)).sum()**2)
            - scale_factor_vel * (self.sim.find_body_velocity().sum()**2)
            # - scale_factor_vel * abs(self.sim.find_body_velocity().sum())
        )

        # # My task is moving from (0,0,0) to (0,10,0). To keep the quadcopter
        # # from climbing too high, subtract the sigmoid of the z position:
        # # close to the ground this subtracts roughly 0; too high, it
        # # subtracts up to 1. Dividing z by 3 allows for some height;
        # # subtracting 0.5 and multiplying by 2 maps the values to (0, 1).
        # reward = (
        #     1.0
        #     - scale_factor * (abs(self.sim.pose[:3] - self.target_pos)).sum()
        #     - 2 * (Task.sigmoid(self.sim.pose[2] / 3) - 0.5)
        # )
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
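# A minimal usage sketch for the Task above, assuming the project's PhysicsSim
# from physics_sim.py. The random rotor speeds are purely illustrative; a
# learning agent would supply actions within [action_low, action_high].
def run_random_episode():
    """Roll out one episode with uniformly random rotor speeds (demo only)."""
    task = Task(target_pos=np.array([0.0, 0.0, 10.0]))
    state = task.reset()
    done = False
    episode_return = 0.0
    while not done:
        rotor_speeds = np.random.uniform(task.action_low, task.action_high,
                                         size=task.action_size)
        state, reward, done = task.step(rotor_speeds)
        episode_return += reward
    return episode_return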
class TaskTakeOff():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])
        self.init_position = init_pose[:3] if init_pose is not None else np.array([0., 0., 0.])
        self.orig_distances = abs(self.target_pos - self.init_position)
        #print("target_pos = {} init_position = {} orig_distances = {}".format(
        #    self.target_pos, self.init_position, self.orig_distances))

    def get_reward_orig(self):
        """Uses current pose of sim to return reward."""
        reward = 1. - .3 * (abs(self.sim.pose[:3] - self.target_pos)).sum()
        return reward

    def get_reward_basic(self):
        """Uses current pose of sim to return (reward, done)."""
        # indices of self.sim.pose --> X=0, Y=1, Z=2, PSI=3, THETA=4, PHI=5
        X = self.sim.pose[0]
        Y = self.sim.pose[1]
        Z = self.sim.pose[2]

        # Set rewards: positive reward for positive Z, penalties for X & Y
        # motion. Euler angles and velocities are neglected, as their effects
        # will reflect upon the (x,y,z) coordinates anyway and will then be
        # followed through.
        reward = 0.0 + 1 * Z - 0.3 * (abs(X) + abs(Y))

        # End the episode if the maximum height is breached.
        done = False
        if self.sim.pose[2] >= self.target_pos[2]:  # agent has crossed the target height
            reward += 50.0  # bonus reward
            done = True

        # Normalize the reward to avoid overflows in other computations such as
        # gradients. Since an episode ends once the agent reaches a height of 10
        # and then receives a bonus reward of 50, the maximum positive reward is
        # 60: (0, 0, 10) gives the reward of 10 calculated above plus the bonus
        # of 50. Taking -60 as the minimum, normalize with
        # (value - min) / (max - min), where (max - min) = (60 - (-60)) = 120.
        reward = (reward + 60) / 120
        return reward, done

    def get_reward_complex(self):
        """Uses current pose of sim to return reward."""
        # indices of self.sim.pose --> X=0, Y=1, Z=2, PSI=3, THETA=4, PHI=5
        X = self.sim.pose[0]
        Y = self.sim.pose[1]
        PSI = self.sim.pose[3]
        THETA = self.sim.pose[4]
        PHI = self.sim.pose[5]

        # Set rewards.
        euler_angle_sum = abs(PSI) + abs(THETA) + abs(PHI)
        reward = 100.0
        reward -= 0.3 * (abs(X) + abs(Y))  # penalty for movement along the x and y planes
        reward -= 0.5 * euler_angle_sum    # penalty for Euler angle movement
        return round(reward, 2)

    def get_reward_complex2(self):
        """Uses current pose of sim to return reward."""
        # indices of self.sim.pose --> X=0, Y=1, Z=2, PSI=3, THETA=4, PHI=5
        X = self.sim.pose[0]
        Y = self.sim.pose[1]
        Z = self.sim.pose[2]
        PSI = self.sim.pose[3]
        THETA = self.sim.pose[4]
        PHI = self.sim.pose[5]

        # Set rewards: positive reward for positive Z movement,
        # penalty for movement along the x and y planes.
        euler_angle_sum = abs(PSI) + abs(THETA) + abs(PHI)
        reward = 0.0 + 1 * Z - 0.3 * (abs(X) + abs(Y))
        reward -= 0.5 * euler_angle_sum  # penalty for Euler angle movement
        return round(reward, 2)

    # Switch to select the particular get_reward needed. Note that step()
    # expects a (reward, done) pair, which only get_reward_basic returns.
    get_reward = get_reward_basic

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            # Check whether the episode is done because the desired height was reached.
            delta_reward, done_height = self.get_reward()
            reward += delta_reward
            if done_height:
                done = done_height
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
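# The class-level switch `get_reward = get_reward_basic` is resolved when the
# class body executes, and step() unpacks a (reward, done) pair that only
# get_reward_basic returns. A hedged sketch of how one of the scalar reward
# functions could be adapted to the same interface; the wrapper and its reuse
# of the height cutoff are illustrative, not part of the original code.
def _get_reward_complex_with_done(self):
    """Adapt the scalar get_reward_complex to step()'s (reward, done) contract."""
    reward = self.get_reward_complex()
    done = self.sim.pose[2] >= self.target_pos[2]  # same cutoff as get_reward_basic
    return reward, done

# To switch reward functions, rebind the attribute before training, e.g.:
# TaskTakeOff.get_reward = _get_reward_complex_with_done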
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 100
        self.action_high = 800
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self, done):
        """Uses current pose of sim to return reward."""
        r0 = self.target_pos[2]                  # target height
        r = self.sim.pose[2]                     # current height
        rdot = self.sim.find_body_velocity()[2]  # vertical body velocity

        if r > 0:
            # Weighted sum of tanh-shaped terms: height error, vertical speed,
            # Euler angles, and angular velocities.
            reward = (1 - np.tanh(0.1 * np.abs(r - r0))) * 0.4
            reward += (1 - np.tanh(0.0625 * np.abs(rdot))) * 0.4
            reward += (1 - np.tanh(0.1 * np.abs(self.sim.pose[3:].sum()))) * 0.1
            reward += (1 - np.tanh(0.025 * np.abs(self.sim.angular_v.sum()))) * 0.1
            # Bonus if the episode ends without overshooting far past the target.
            if done and (r < 2.5 * r0):
                reward = 50
        else:
            reward = -30  # crashed into the ground
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            # Record the body velocity before the timestep.
            self.body_velocity_bstep = self.sim.find_body_velocity()
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward(done)
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
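# A quick standalone check of the tanh shaping above, for inspection only.
# Each 1 - tanh(...) term lies in (0, 1], so with weights 0.4 + 0.4 + 0.1 + 0.1
# the per-timestep shaped reward is bounded by 1 (the 50-point terminal bonus
# and the -30 crash penalty override it). This helper mirrors the formula and
# is not part of the original class.
def shaped_reward(r, r0=10.0, rdot=0.0, euler_sum=0.0, angular_v_sum=0.0):
    """Mirror of get_reward()'s shaping for r > 0 (illustrative helper)."""
    return ((1 - np.tanh(0.1 * abs(r - r0))) * 0.4
            + (1 - np.tanh(0.0625 * abs(rdot))) * 0.4
            + (1 - np.tanh(0.1 * abs(euler_sum))) * 0.1
            + (1 - np.tanh(0.025 * abs(angular_v_sum))) * 0.1)

# e.g. hovering at the target height with everything else ideal scores 1.0,
# while starting near the ground (with the other terms still ideal) scores
# about 0.70: shaped_reward(10.0) -> 1.0, shaped_reward(0.5) -> ~0.70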
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, physicsSim_Params, target_pos=None):
        """Initialize a Task object.
        Params
        ======
            physicsSim_Params: bundle of PhysicsSim settings (initial pose,
                initial velocities, initial angle velocities, and runtime)
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(physicsSim_Params)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self, rotor_speeds):
        """Uses current pose of sim to return reward."""
        target_closeness_reward = 1. - .3 * (abs(self.sim.pose[:3] - self.target_pos)).sum()

        # Severely penalize the quadcopter having a 'body' velocity that is
        # too large, regardless of direction.
        velocity_penalties = abs(self.sim.find_body_velocity()).sum()

        # Per the Slack room, the idea is to minimize the Euler angles.
        euler_bias = 5
        eulers_angle_penalty = abs(self.sim.pose[3:]).sum() - euler_bias

        # The episode ends when hitting the ground or running out of time;
        # we want to fly as long as possible.
        flight_time = self.sim.time

        # Penalize big differences in rotor speed!
        rotor_diff_penalty = np.std(rotor_speeds)

        # Reward going higher, since that's what we want, but not by more than
        # the distance term.
        height_reward = self.sim.pose[2] / 100.

        total = target_closeness_reward \
            + flight_time \
            - eulers_angle_penalty \
            - velocity_penalties \
            - rotor_diff_penalty \
            + height_reward

        #print('rewards: ', target_closeness_reward, velocity_penalties,
        #      eulers_angle_penalty, flight_time, rotor_diff_penalty, total)
        #hyperbolic_activation = np.tanh(total)
        #if hyperbolic_activation > -1.:
        #    print('Found rewards that crept above -1!: ', total, hyperbolic_activation)
        #return hyperbolic_activation
        return total

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward(rotor_speeds)
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
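# This variant assumes a PhysicsSim modified to accept a single parameter
# bundle rather than the stock four arguments. The dict below is a hypothetical
# shape for that bundle, purely illustrative; match whatever your PhysicsSim
# actually expects.
def make_takeoff_task():
    """Sketch of constructing this Task variant (assumed parameter bundle)."""
    physicsSim_Params = {
        'init_pose': np.array([0., 0., 0., 0., 0., 0.]),
        'init_velocities': np.array([0., 0., 0.]),
        'init_angle_velocities': np.array([0., 0., 0.]),
        'runtime': 5.,
    }
    return Task(physicsSim_Params, target_pos=np.array([0., 0., 10.]))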