class Task(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None, action_repeat=1): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) self.action_repeat = action_repeat self.state_size = self.action_repeat * 6 self.action_low = 1 self.action_high = 900 self.action_size = self.action_repeat * 4 # Goal self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.]) def get_reward(self): """Uses current pose of sim to return reward.""" reward = 10. if self.sim.pose[2] < (self.target_pos[2] - 3.): reward += 2*(self.sim.pose[2] - self.sim.init_pose[2]) reward -= (abs(self.sim.pose[:2] - self.target_pos[:2]) > 3.).sum() else: reward += 10. reward -= 2*(abs(self.sim.pose[:3] - self.target_pos) > 3.).sum() reward -= np.int32(abs(self.sim.pose[2] - self.target_pos[2]) > 3.) if self.sim.done: if self.sim.time < self.sim.runtime: reward -= 10 reward = reward / 10 return reward def step(self, rotor_speeds): """Uses action to obtain next state, reward, done.""" reward = 0 pose_all = [] for _ in range(self.action_repeat): done = self.sim.next_timestep(rotor_speeds[(_*4):((_*4)+4)]) # update the sim pose and velocities reward += self.get_reward() pose_all.append(self.sim.pose) next_state = np.concatenate(pose_all) return next_state, (reward / self.action_repeat), done def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() state = np.concatenate([self.sim.pose] * self.action_repeat) return state
def __init__(self, init_pose=None, init_velocities=None,
             init_angle_velocities=None, runtime=5., target_pos=None):
    """Initialize a Task object.
    Params
    ======
        init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
        init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
        init_angle_velocities: initial radians/second for each of the three Euler angles
        runtime: time limit for each episode
        target_pos: target/goal (x,y,z) position for the agent
    """
    # Simulation
    self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
    self.action_repeat = 3

    self.state_size = self.action_repeat * 6
    self.action_low = 0      # default 0
    self.action_high = 900   # default 900
    self.action_size = 4

    # Goal
    self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])
class Task(): """ Task (environment) that defines the goal and provides feedback to the agent. The task to consider here is Take-off. The quadcoputer starts a little above of (x, y, z) = (0, 0, 0). The target is set to (x, y, z) = (0, 0, 10.0). """ def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None): """ Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) self.action_repeat = 3 self.state_size = self.action_repeat * 6 # size of the state space (with action repeats) self.action_size = 4 # size of the action space self.action_low = 0.0 # lower bound for the action space self.action_high = 900.0 # upper bound for the action space # the target position self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.]) # the target angle self.target_angle = np.array([0., 0., 0.]) def get_reward_ddgp(self): """ Uses current pose of sim to return reward and done (so that once the quadcopter reaches the height of the target, the episode ends. ) """ ## the following one is the best one March 26 reward = - 0.20*(abs(self.sim.pose[:2] - self.target_pos[:2])).sum() reward += 2.4 - 1.2*abs(self.sim.pose[2] - self.target_pos[2]) # reward += 2.4 - 1.2*abs(self.sim.pose[2] - self.target_pos[2]) # since the angles in self.sim.pose[3:] take value in [0, 2*pi] # I will rewrite them such that they take value in range [-pi, pi] # and then introduce the reward angles_around_0 = np.array([0.0, 0.0, 0.0]) for i in range(3): if self.sim.pose[i+3] <= np.pi: angles_around_0[i] = self.sim.pose[i+3] else: angles_around_0[i] = self.sim.pose[i+3] - 2.0*np.pi reward += - 0.20*(abs(angles_around_0[0:3]-self.target_angle)).sum() done = False if(self.sim.pose[2] >= self.target_pos[2]): reward += 200.0 done = True return reward, done def step_ddpg(self, rotor_speeds): """ Uses action to obtain next state, reward, done. For DDPG algorithm. """ reward = 0.0 pose_all = [] for _ in range(self.action_repeat): done = self.sim.next_timestep(rotor_speeds) # update the sim pose and velocities delta_reward, done_height = self.get_reward_ddgp() reward += delta_reward # once the quadcoper reaches the height of the target, end the episode if done_height: done = done_height pose_all.append(self.sim.pose) next_state = np.concatenate(pose_all) return next_state, reward, done def reset(self): """ Reset the sim to start a new episode. """ self.sim.reset() state = np.concatenate([self.sim.pose] * self.action_repeat) return state def get_reward(self): """ THIS ONE IS USED FOR THE DEMOMSTRATION PART OF THE NOTEBOOK. Uses current pose of sim to return reward and done (so that once the quadcopter reaches the height of the target, the episode ends. ) """ reward = 1.-.3*(abs(self.sim.pose[:3] - self.target_pos)).sum() return reward def step(self, rotor_speeds): """ THIS ONE IS USED FOR THE DEMONSTRATION PART OF THE NOTEBOOK. Uses action to obtain next state, reward, done. 
""" reward = 0.0 pose_all = [] for _ in range(self.action_repeat): done = self.sim.next_timestep(rotor_speeds) # update the sim pose and velocities reward += self.get_reward() pose_all.append(self.sim.pose) next_state = np.concatenate(pose_all) return next_state, reward, done
class Task(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation self.init_pose = init_pose self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) self.action_repeat = 3 self.state_size = self.action_repeat * 6 self.action_low = 0 self.action_high = 900 self.action_size = 4 # Goal self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.]) def get_reward(self, done): """Uses current pose of sim to return reward.""" #pos = np.array(self.sim.pose[:3]) #target = np.array(self.target_pos[:3]) #reward = np.tanh(2 - 4e-3 * np.sum((self.target_pos - self.sim.pose[2])**2) - 3e-4 * np.sum(self.sim.v[2])) #reward += self.sim.v[2]*10 #done = False #if distance < 0.5: # done = True #reward = np.tanh(self.sim.pose[2] * 10 +self.sim.v[2] *10) #reward = np.tanh(1 + self.sim.v[2] + self.sim.pose[2] ) '''- np.sqrt(np.square(self.sim.v[:2]).sum())''' #reward = np.tanh(1 - 0.003 * (abs(self.target_pos - self.sim.pose[:3]))).sum() #np.tanh(1 - 0.003*(abs(self.sim.pose[:3] - self.target_pos))).sum() #dist_from_target = np.sqrt(np.square(self.sim.pose[:3] - self.target_pos).sum()) # Want the quadcopter to hover straight up, measure angular deviation #angular_dev = np.sqrt(np.square(self.sim.pose[3:]).sum()) # Also want to penalise movement not in z-axis (DeepMind Locomotion) #non_z_v = np.square(self.sim.v[:2]).sum() #reward = 1. - .001*dist_from_target - .0005*angular_dev - .0005*non_z_v #reward = np.tanh(1.-.2*(abs(self.sim.pose[:3] - self.target_pos)).sum()) reward = 0 #if self.sim.pose[2] >= self.target_pos[2]: # reward += 10 #reward += np.tanh(np.linalg.norm(self.sim.pose[:3] -self.target_pos) + self.sim.v[2]) # initial reward - fraction of the z-velocity reward = 0.5 * self.sim.v[2] # additional reward if the agent is close in vertical coordinates # to the target pose if abs(self.sim.pose[2] - self.target_pos[2]) < 3: reward += 15. done = True # penalize a crash if done and self.sim.time < self.sim.runtime: reward = -1 return reward, done # penalize the downward movement relative to the starting position if self.sim.pose[2] < self.init_pose[2]: reward -= 1 return np.tanh((1 - .001 * np.linalg.norm(self.sim.pose[:3] - self.target_pos)) + reward + 0.1 * self.sim.v[2]), done def step(self, rotor_speeds): """Uses action to obtain next state, reward, done.""" reward = 0 pose_all = [] for _ in range(self.action_repeat): done = self.sim.next_timestep(rotor_speeds) # update the sim pose and velocities reward, done = self.get_reward(done) pose_all.append(self.sim.pose) next_state = np.concatenate(pose_all) return next_state, reward, done def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() state = np.concatenate([self.sim.pose] * self.action_repeat) return state
class Task(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) self.action_repeat = 3 self.state_size = self.action_repeat * 6 self.action_low = 0 self.action_high = 900 self.action_size = 4 # Goal self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.]) def get_reward(self): """Uses current pose of sim to return reward.""" reward = 1.-.001*(abs(self.sim.pose[:3] - self.target_pos)).sum() if (abs(self.sim.pose[0] - self.target_pos[0])) < 0.25: reward += 0.03 if (abs(self.sim.pose[1] - self.target_pos[1])) < 0.25: reward += 0.03 if (abs(self.sim.pose[2] - self.target_pos[2])) < 0.25: reward += 0.03 if self.sim.time < self.sim.runtime and self.sim.done == True: reward -= 10 #reward = 1 #penalty = 0 #current_position = self.sim.pose[:3] #distance = np.sqrt(((current_position - self.target_pos)**2).sum()) # penalty for euler angles to avoid unecessary rotations #penalty += abs(self.sim.pose[3:6]).sum() #penalty for being distant from the target coordinates #penalty += .003*(abs(self.sim.pose[:3] - self.target_pos)).sum() #penalty for increasing total distance to the target #penalty += distance**2 return reward def step(self, rotor_speeds): """Uses action to obtain next state, reward, done.""" reward = 0 pose_all = [] for _ in range(self.action_repeat): done = self.sim.next_timestep(rotor_speeds) # update the sim pose and velocities reward += self.get_reward() pose_all.append(self.sim.pose) next_state = np.concatenate(pose_all) return next_state, reward, done def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() state = np.concatenate([self.sim.pose] * self.action_repeat) return state
class TaskModified():
    """Task (environment) that defines the goal and provides feedback to the agent.

    The new goal is to land the quadcopter. The goal-position variable was changed
    from 'target_pos' to 'target_pos_x_y', holding only the 'x' and 'y' positions;
    the 'z' position will always be zero, representing the height at the landing point.
    """

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5.,
                 target_pos_x_y=None):  # target_pos=None
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos_x_y: target/goal (x,y) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        # self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])
        # The 'z' altitude and the Euler angles will always be 0 for the landing goal,
        # so only the 'x' and 'y' coordinate values need to be passed.
        self.target_pos = np.append(target_pos_x_y, 0.) if target_pos_x_y is not None \
            else np.array([0., 0., 0.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # The reward should score not only the target position but also landing
        # stability, so the Euler angles also enter the calculation, splitting
        # the score weight that used to be 0.3.
        # reward = 1.-.3*(abs(self.sim.pose[:3] - self.target_pos)).sum()
        reward = 1. - .3 * (abs(self.sim.pose[:3] - self.target_pos)).sum() \
                    - .2 * (abs(self.sim.pose[3:] - [0., 0., 0.])).sum()
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        # rotor_speeds_suavised = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            # Pass the speeds in a smoothed way when approaching the target:
            # for speed in rotor_speeds:
            #     rotor_speeds_suavised.append((speed / (self.init_height * 1.25)) + .2)
            # done = self.sim.next_timestep(np.concatenate(rotor_speeds_suavised))
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) self.action_repeat = 3 self.state_size = self.action_repeat * 6 self.action_low = 0.0 self.action_high = 900.0 self.action_size = 4 # Goal self.target_pos = target_pos if target_pos is not None else np.array( [0., 0., 150.]) def get_reward(self, rotor_speeds, init_pose): """Uses current pose of sim to return reward.""" #reward = 1.-.3*(abs(self.sim.pose[:3] - self.target_pos)).sum() #REWARD FOR EVERY TIMESTEP reward_plus = 0.050 #0.05 #REWARD / PUNISHMENT FOR TRAVELLING IN THE RIGHT DIRECTION if self.sim.pose[2] < init_pose[2]: reward_pose = -1 * 0.1 else: reward_pose = (0.01 / 150) * (self.sim.pose[2] - init_pose[2] ) #0.05 #PUNISHMENTS FOR MAKING ACTIONS THAT MAKE THE DRONE UNSTABIL #reward_pose = -1* (0.05000/10) * sum(abs(self.target_pos - self.sim.pose[:3])) #reward_pose = -1* (0.01200/10) * (abs(self.target_pos[2] - self.sim.pose[2]) reward_angularv = -1 * (0.0100 / 30) * (max(abs(self.sim.angular_v))) reward_pose2 = -1 * (0.0100 / 6) * sum(abs(self.sim.pose[3:])) reward_rs = -1 * (0.0100 / 900) * (max(rotor_speeds) - min(rotor_speeds)) reward_pose3 = -1 * (0.01000 / 10) * sum( abs(self.target_pos[:2] - self.sim.pose[:2])) reward = reward_plus + reward_pose + reward_angularv + reward_pose2 + reward_rs + reward_pose3 #reward = np.clip(reward,-1.0,1.0) #return reward rewards = [ reward, reward_pose, reward_angularv, reward_pose2, reward_rs ] return rewards def step(self, rotor_speeds, init_pose): """Uses action to obtain next state, reward, done.""" rewards = np.array([0.0, 0.0, 0.0, 0.0, 0.0]) pose_all = [] for _ in range(self.action_repeat): done = self.sim.next_timestep( rotor_speeds) # update the sim pose and velocities rewards += self.get_reward(rotor_speeds, init_pose) #reward += self.get_reward(rotor_speeds) pose_all.append(self.sim.pose) next_state = np.concatenate(pose_all) return next_state, rewards, done def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() state = np.concatenate([self.sim.pose] * self.action_repeat) return state
class Task(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) self.action_repeat = 3 self.state_size = self.action_repeat * 6 self.action_low = 0 self.action_high = 900 self.action_size = 1 self.runtime = runtime # Goal self.target_pos = target_pos if target_pos is not None else np.array( [10., 10., 10.]) def get_reward(self, done): """Uses current pose of sim to return reward.""" x, y, z = self.sim.pose[0:3] x_a, y_a, z_a = self.sim.pose[3:6] xdot, ydot, zdot = self.sim.v[0:3] xdot_a, ydot_a, zdot_a = self.sim.angular_v[0:3] time = self.sim.time target_z = self.sim.pose[2] reward = 1. - .3 * (abs(self.sim.pose[:3] - self.target_pos[:3])).sum() # reward = 1. # reward = .1-.03*abs(self.sim.pose[:3]-#self.target_pos[:3]).sum()-.01*abs(ydot_a)-.01*abs(zdot_a) # if time > self.runtime: # reward += 1. # if z > target_z and z <(target_z + 10): # reward += 10. # if done and time < self.runtime: # reward += - 1 / time return reward def step(self, rotor_speeds): """Uses action to obtain next state, reward, done.""" reward = 0 pose_all = [] for _ in range(self.action_repeat): done = self.sim.next_timestep( rotor_speeds) # update the sim pose and velocities reward += self.get_reward(done) pose_all.append(self.sim.pose) next_state = np.concatenate(pose_all) return next_state, reward, done def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() state = np.concatenate([self.sim.pose] * self.action_repeat) return state
class Task(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) self.action_repeat = 3 self.state_size = self.action_repeat * 6 self.action_low = 0 self.action_high = 900 self.action_size = 4 # Goal self.target_pos = target_pos if target_pos is not None else np.array( [0., 0., 10.]) def get_reward(self): #pCenterXYZ = 0 #pStability = -(abs(self.sim.pose[3:6]).sum()**2) #pCenterXYZ += -(abs(current_position[0]-self.target_pos[0])**2) #pCenterXYZ += -(abs(current_position[1]-self.target_pos[1])**2) #pCenterXYZ += -(abs(current_position[2]-self.target_pos[2])**2) #sum_acceleration = -np.linalg.norm(self.sim.linear_accel) #distance = np.sqrt((current_position[0]-self.target_pos[0])**2 + (current_position[1]-self.target_pos[1])**2 + (current_position[2]-self.target_pos[2])**2) reward = 1 - 0.003 * (abs(self.sim.pose[:3] - self.target_pos)).sum() #reward = -abs(self.target_pos[2] - self.sim.pose[2]) def is_equal(x, y, delta=0.0): return abs(x - y) <= delta if is_equal(self.target_pos[2], self.sim.pose[2], delta=1): reward += 10.0 return reward def step(self, rotor_speeds): """Uses action to obtain next state, reward, done.""" reward = 0 pose_all = [] for _ in range(self.action_repeat): done = self.sim.next_timestep( rotor_speeds) # update the sim pose and velocities reward += self.get_reward() pose_all.append(self.sim.pose) if done: reward += 10.0 next_state = np.concatenate(pose_all) return next_state, reward, done def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() state = np.concatenate([self.sim.pose] * self.action_repeat) return state
class Task(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) self.action_repeat = 3 self.state_size = self.action_repeat * 6 self.action_low = 0 self.action_high = 900 self.action_size = 4 # Goal self.target_pos = target_pos if target_pos is not None else np.array( [0., 0., 10.]) def get_reward(self, done): """Uses current pose of sim to return reward.""" reward = 0 if done and self.sim.time < self.sim.runtime: reward += -1 if self.sim.v[0] > 0 or self.sim.v[0] < 0: reward -= 10 if self.sim.v[1] > 0 or self.sim.v[1] < 0: reward -= 10 if self.sim.v[2] > 0: reward += 50 euler_angles = self.sim.pose[3:6].sum() reward -= euler_angles lateral_distance = ((self.sim.pose[0] - self.target_pos[0]) + (self.sim.pose[1] - self.target_pos[1]))**2 reward -= lateral_distance vertical_distance = (self.sim.pose[2] - self.target_pos[2])**2 if vertical_distance != 0: reward += 100 / vertical_distance #reward += 1.-.003*(abs(self.sim.pose[:3] - self.target_pos)).sum() reward = np.tanh(reward) return reward def step(self, rotor_speeds): """Uses action to obtain next state, reward, done.""" reward = 0 pose_all = [] for _ in range(self.action_repeat): done = self.sim.next_timestep( rotor_speeds) # update the sim pose and velocities reward += self.get_reward(done) pose_all.append(self.sim.pose) next_state = np.concatenate(pose_all) return next_state, reward, done def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() state = np.concatenate([self.sim.pose] * self.action_repeat) return state
class Task(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) self.action_repeat = 3 self.state_size = self.action_repeat * 22 self.action_low = 325 #400-500 flies, 400 doesn't self.action_high = 425 self.action_size = 4 # Goal self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.]) def get_reward(self, rotor_speeds): # """Uses current pose of sim to return reward.""" pose_error = abs(self.target_pos - self.sim.pose[:3]) distance = np.linalg.norm(self.target_pos - self.sim.pose[:3]) avg_angle = np.mean(self.sim.pose[3:]) linear_accel = np.linalg.norm(self.sim.linear_accel) angular_accels = np.linalg.norm(self.sim.angular_accels) reward = 0 # reward -= 10 * pose_error[2] # reward -= 10 * pose_error[1] # reward -= 10 * pose_error[0] # reward -= 0.1 * linear_accel.sum() # reward -= 0.1 * angular_accels.sum() # reward -= 1.0 * np.std(rotor_speeds) # reward -= 1.0 * avg_angle # print('rotor speed {}'.format(np.std(rotor_speeds))) reward += 1000 reward -= 0.05 * (distance ** 2) return reward / 1000.0 def step(self, rotor_speeds): """Uses action to obtain next state, reward, done.""" # make sure it's an actual array rotor_speeds = np.array(rotor_speeds) #rescale from -1 to +1 to full range rotor_speeds = (self.action_high - self.action_low)/2 * (rotor_speeds + 1) + self.action_low rotor_speeds = np.ones_like(rotor_speeds) * rotor_speeds[0] # print('\r rotor speeds {}'.format(rotor_speeds), end='') reward = 0 pose_all = [] for _ in range(self.action_repeat): done = self.sim.next_timestep(rotor_speeds) # update the sim pose and velocities reward += self.get_reward(rotor_speeds) pose_all.append(np.concatenate([self.sim.pose, self.sim.v, self.sim.angular_v, self.sim.linear_accel, self.sim.angular_accels, self.sim.prop_wind_speed])) next_state = np.concatenate(pose_all) return next_state, reward, done def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() state = np.concatenate([self.sim.pose, self.sim.v, self.sim.angular_v, self.sim.linear_accel, self.sim.angular_accels, self.sim.prop_wind_speed] * self.action_repeat) return state def dump(self): print("time {:4.1f} x {:=+04.1f} y {:=+04.1f} z {:=+04.1f}".format(self.sim.time, self.sim.pose[0], self.sim.pose[1], self.sim.pose[2]))
class Task(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent // changed to target only z """ # Simulation self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) self.action_repeat = 3 #default state_size #self.state_size = self.action_repeat * len(self.sim.pose) #simplified state self.state_size = self.action_repeat self.action_low = 0 #default 0 self.action_high = 900 #default 900 self.action_size = 1 #default 4 # Goal self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.]) def get_reward(self): """Uses current pose of sim to return reward.""" #default value #reward = 1.-.3*(abs(self.sim.pose[:3] - self.target_pos)).sum() #my reward reward = 1 - 0.01 * abs(self.sim.pose[2] - self.target_pos) - 0.01*abs(self.sim.v[2]) return reward def step(self, rotor_speeds): """Uses action to obtain next state, reward, done.""" reward = 0.0 pose_all = [] for _ in range(self.action_repeat): done = self.sim.next_timestep(rotor_speeds) # update the sim pose and velocities reward += self.get_reward() #default pose_all #pose_all.append(self.sim.pose) #simple state pose_all.append(self.sim.pose[2]) #default next_state #next_state = np.concatenate(pose_all) #simple next_state next_state = np.array(pose_all) return next_state, reward, done def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() #default state, uses x,y,z,phi,theta,psi repeated n times #state = np.concatenate([self.sim.pose] * self.action_repeat) #simple state state = [self.sim.pose[2]] * self.action_repeat return state
class Takeoff():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None,
                 position_noise=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        print("########2 Task to Takeoff")
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4
        self.runtime = runtime
        self.position_noise = position_noise

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # reward = 1.-.3*(abs(self.sim.pose[:3] - self.target_pos)).sum()
        reward = 0.0

        # Encourage acceleration in the +z direction:
        # reward += self.sim.linear_accel[2]
        # reward += 0.5 * self.sim.linear_accel[2] * self.sim.dt**2

        # Encourage velocity in the +z direction.
        if self.sim.v[2] > 0:
            reward = 10 * self.sim.v[2]
        else:
            reward = -10

        # Encourage smaller position errors, limiting the denominator so it
        # does not get too close to zero:
        # posDiff = -abs(self.sim.pose[2] - self.target_pos[2])  # **2
        # if posDiff > 0.1:
        #     reward += 10 / posDiff
        # else:
        #     reward += 100
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        # Sparse reward at the end if the target is reached.
        if self.sim.pose[2] >= self.target_pos[2]:
            print("\n successful takeoff to target!!!!!!!!!")
            reward += 10
            done = True
        # if np.isclose(self.target_pos[2], self.sim.pose[2], 1):
        #     reward += 100
        #     done = True
        # if self.sim.pose[2] >= self.target_pos[2] and self.sim.time >= self.runtime:
        # if np.allclose(self.sim.pose[:-3], self.target_pos, rtol=1):
        #     reward += 100
        #     done = True
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim, with noise, to start a new episode."""
        self.sim.reset()
        # ::os:: add a random start position, as suggested by some papers,
        # to make the agent more generalized.
        if self.position_noise is not None and self.position_noise > 0:
            rand_pose = np.copy(self.sim.init_pose)
            rand_pose[:3] += np.random.normal(0.0, self.position_noise, 3)
            # print("start rand_pose =", rand_pose)
            self.sim.pose = np.copy(rand_pose)
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class TaskTakeOff():
    '''Task (environment) that defines the goal and provides feedback to the agent.'''

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        '''Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocities of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent

        Footnote
        ========
            Adjust action_low and action_high to shrink the continuous action space.
        '''
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 410
        self.action_high = 420
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        '''Use the current position of the simulation to return the reward.

        Footnote
        ========
            1. Include vertical velocity as part of the reward, encouraging the copter to fly vertically.
            2. Penalize crashes.
            3. Clip the reward to the range (-1, 1).
            4. Or use np.tanh() to squeeze the reward into the range (-1, 1).
        '''
        # Version 01 - Udacity task template:
        # reward = 1.-.3*(abs(self.sim.pose[:3] - self.target_pos)).sum() + 0.3*(self.sim.v[2])
        # reward = np.clip(reward, -1, 1)

        # Version 02 - Udacity takeoff task template:
        # reward = zero for matching target z, negative as you go farther, down to -20
        # reward = -min(abs(self.target_pos[2] - self.sim.pose[2]), 20.)
        # if self.sim.pose[2] > self.target_pos[2]:
        #     reward += 1.  # bonus reward
        # elif self.sim.done and self.sim.time < self.sim.runtime:
        #     reward -= 1.  # crash before timeout

        # Version 03 - use np.tanh():
        reward = (3.0 * self.sim.v[2] - 0.025 * abs(self.sim.v[:2]).sum()
                  + 1.5 * self.sim.pose[2] - 0.25 * abs(self.sim.pose[:2]).sum())
        reward = np.tanh(reward)
        return reward

    def step(self, rotor_speeds):
        '''Use action to obtain next state, reward, done.

        Footnote
        ========
            self.sim.next_timestep(): takes in the action inputs, updates sim.pose
            to the new position, accumulates the system run-time, and returns the
            done-status flag.
        '''
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the simulation position and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)  # total state_size: action_repeat * pose size
        return next_state, reward, done

    def reset(self):
        '''Reset the simulation to start a new episode.'''
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None, action_repeat=None): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) self.action_repeat = action_repeat if action_repeat is not None else 3 self.init_pos = init_pose self.state_size = self.action_repeat * 6 self.action_low = 10 self.action_high = 900 self.action_size = 4 # Goal self.target_pos = target_pos if target_pos is not None else np.array( [0., 0., 10.]) self.flight_path = self.target_pos - self.init_pos[:3] self.num_steps = 0 #np.seterr(all='raise') def get_reward(self): """Uses current pose of sim to return reward.""" reward = 1. - .2 * (abs(self.sim.pose[:3] - self.target_pos)).sum() # reward = 0 # if not np.all(self.target_pos == self.init_pos[:3]): # a = np.cross(self.target_pos - self.sim.pose[:3], self.flight_path) # reward -= np.linalg.norm(a) / np.linalg.norm(self.flight_path) # reward -= np.linalg.norm([self.sim.pose[:3] - self.target_pos]) # print('task reward : ',reward) return reward def step(self, rotor_speeds): """Uses action to obtain next state, reward, done.""" self.num_steps += 1 reward = 0 pose_all = [] for _ in range(self.action_repeat): done = self.sim.next_timestep( rotor_speeds) # update the sim pose and velocities reward += self.get_reward() pose_all.append(self.sim.pose) next_state = np.concatenate(pose_all) return next_state, reward, done def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() self.num_steps = 0 state = np.concatenate([self.sim.pose] * self.action_repeat) return state
class Task_Hover():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 10
        self.action_high = 100
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        reward = 0
        factor = 3
        dis = self.sim.pose[2] - self.target_pos[2]
        if dis >= 0:
            # Agent is above or at the target.
            reward += dis * factor
        else:
            # Agent is below the target.
            reward += (1 / np.abs(dis)) * factor
        # Squash into the range [-1, 1].
        reward = np.tanh(reward)
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class TaskLanderWithPhy():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, env=None, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 4

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 40

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

        # -------------------------------------------------------------
        self.env = env
        self.env.continuous = False
        # self.action_repeat = 3
        # self.state_size = self.action_repeat * np.prod(env.observation_space.shape)
        # self.action_size = np.prod(env.action_space.shape)
        # -------------------------------------------------------------
        # lunder.demo_heuristic_lander(self.env, render=True)

    # def step(self, action):
    #     """Uses action to obtain next state, reward, done."""
    #     total_reward = 0
    #     reward = 0
    #     done = 0
    #     obs = []
    #     for _ in range(self.action_repeat):
    #         ob, reward, done, info = self.env.step(action)
    #         done = self.sim.next_timestep(action)
    #         total_reward += reward
    #         obs.append(ob)
    #     next_state = np.concatenate(obs)
    #     return next_state, reward, done

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        reward = 1.3 * (abs(self.sim.v[2]).sum())
        # print('velocity is', self.sim.v[2], 'position', self.sim.pose[:3], 'target', self.target_pos)
        # print('reward', reward)
        # if self.env.game_over or abs(self.env.state[0]) >= 1.0:
        #     done = True
        #     reward = -100
        # if not self.env.lander.awake:
        #     done = True
        #     reward = +100
        # print(self.sim.pose[:3])
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        # total_reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            # ob, reward, done, info = self.env.step(action)
            # total_reward += reward
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            # self.sim.pose = ob
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        ob = self.env.reset()
        state = np.concatenate([ob] * self.action_repeat)
        return state
class Task(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) self.action_repeat = 3 self.state_size = self.action_repeat * 6 self.action_low = 0 self.action_high = 900 self.action_size = 4 # Goal self.target_pos = target_pos if target_pos is not None else np.array( [0., 0., 10.]) def get_reward(self): """Uses current pose of sim to return reward.""" #reward = 1.-.3*(abs(self.sim.pose[:3] - self.target_pos)).sum() reward = penalty = 0 curr_pos = self.sim.pose[:3] #distance from current position to target distance = np.sqrt((curr_pos[0] - self.target_pos[0])**2 + (curr_pos[1] - self.target_pos[1])**2 + (curr_pos[2] - self.target_pos[2])**2) if distance < 5: reward += 50 #reward += 1000 euler_angles = self.sim.pose[3:6] sum_of_euler_angles = abs(euler_angles).sum() # add the sum of euler angles metric to penalty penalty += sum_of_euler_angles if self.sim.v[2] > 0: reward += 100 done = False if self.sim.pose[2] >= self.target_pos[ 2]: # agent has crossed the target height # raise TypeError reward += 100 # bonus reward (set value as per your expeience on the project) done = True return reward - (penalty * 0.1), done def step(self, rotor_speeds): """Uses action to obtain next state, reward, done.""" reward = 0 pose_all = [] for _ in range(self.action_repeat): done = self.sim.next_timestep( rotor_speeds) # update the sim states (pose,velocities) & delta_reward, done_height = self.get_reward() reward += delta_reward if done_height: done = done_height pose_all.append(self.sim.pose) next_state = np.concatenate(pose_all) return next_state, reward, done def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() state = np.concatenate([self.sim.pose] * self.action_repeat) return state
class Task(): """Task (environment) that defines the goal and provides feedback to the agent.""" def __init__(self, init_pose=None, init_velocities=None, init_angle_velocities=None, runtime=5., target_pos=None): """Initialize a Task object. Params ====== init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions init_angle_velocities: initial radians/second for each of the three Euler angles runtime: time limit for each episode target_pos: target/goal (x,y,z) position for the agent """ # Simulation # The simulator is initialized as an instance of the PhysicsSim class (from physics_sim.py). self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime) #Inspired by the methodology in the original DDPG paper, we make use of action repeats. For each timestep of the agent, we step the simulation action_repeats timesteps. # If you are not familiar with action repeats, please read the Results section in the DDPG paper. self.action_repeat = 3 #We set the number of elements in the state vector. For the sample task, we only work with the 6-dimensional pose information. T # To set the size of the state (state_size), we must take action repeats into account. self.state_size = self.action_repeat * 6 #The environment will always have a 4-dimensional action space, with one entry for each rotor (action_size=4). # You can set the minimum (action_low) and maximum (action_high) values of each entry here. self.action_low = 0 self.action_high = 900 self.action_size = 4 #TARGET/GOAL: The sample task in this provided file is for the agent to reach a target position. We specify that target position as a variable. # Goal self.target_pos = target_pos if target_pos is not None else np.array( [0., 0., 10.]) #Then, the reward is computed from get_reward(). The episode is considered done if the time limit has been exceeded, # or the quadcopter has travelled outside of the bounds of the simulation. def get_reward(self): raise NotImplementedError("{} must override get_reward()".format( self.__class__.__name__)) #The step() method is perhaps the most important. It accepts the agent's choice of action rotor_speeds, which is used to prepare the next state to pass on to the agent def step(self, rotor_speeds): """Uses action to obtain next state, reward, done.""" reward = 0 pose_all = [] for _ in range(self.action_repeat): done = self.sim.next_timestep( rotor_speeds) # update the sim pose and velocities reward += self.get_reward() pose_all.append(self.sim.pose) next_state = np.concatenate(pose_all) return next_state, reward, done #The reset() method resets the simulator. The agent should call this method every time the episode ends. def reset(self): """Reset the sim to start a new episode.""" self.sim.reset() state = np.concatenate([self.sim.pose] * self.action_repeat) return state
class Land():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal height (z) for the agent
        """
        # Simulation: start from a fixed pose 10 units above the ground.
        self.sim = PhysicsSim([0.0, 0.0, 10.0, 0.0, 0.0, 0.0], init_velocities,
                              init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        if target_pos is None:
            print("Setting default target height")
            self.target_pos = 0.0
        else:
            self.target_pos = target_pos
        self.max_duration = 5.0  # sec

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # reward = np.tanh(1 - 0.003 * (abs(self.sim.pose[:3] - self.target_pos))).sum()
        if abs(self.target_pos - self.sim.pose[2]) < 0.003:
            reward = -abs(self.target_pos - self.sim.pose[2]) / 100.0
        elif abs(self.target_pos - self.sim.pose[2]) > 5:
            reward = -abs(self.target_pos - self.sim.pose[2]) / 5.0
        else:
            reward = -abs(self.target_pos - self.sim.pose[2]) / 10.0
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
            if done:
                if (self.sim.pose[2] - self.target_pos) < 0.5:  # agent has reached the target height
                    reward += 7.0   # bonus reward
                if (self.sim.pose[2] - self.target_pos) > 3:    # agent stopped far above the target
                    reward -= 1.0   # penalty
                if self.sim.time > self.max_duration:           # agent has run out of time
                    reward -= 10.0  # extra penalty
                if self.sim.time < self.max_duration:           # agent finished before the time limit
                    reward += 3.0   # bonus reward
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pos=None, init_velocities=None, init_angle_velocities=None,
                 runtime=3., target_pos=None, action_repeat=3, state_size=6,
                 action_low=0, action_high=900, goal="takeoff", min_accuracy=1.5,
                 distance_factor=2.3, angle_factor=6.3, v_factor=4.5, rotor_factor=0.1,
                 time_factor=0.1, elevation_factor=2.4, crash_factor=1., target_factor=1.):
        """Initialize a Task object.
        Params
        ======
            init_pos (array): initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities (array): initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities (array): initial radians/second for each of the three Euler angles
            runtime (float): time limit for each episode
            target_pos (array): target/goal (x,y,z) position for the agent
            action_repeat (int): number of timesteps to step the agent
            state_size (int): dimension of each state
            action_low (float): min value of each action dimension
            action_high (float): max value of each action dimension
            goal (string): specifies what type of task the agent is attempting and sets
                standard initial values if not otherwise specified.
                Options: 'hover', 'target', 'land', 'takeoff' (default)
            min_accuracy (float): maximum acceptable average deviation from the target point
            distance_factor (float): scalar controlling the relative reward for the distance measurement  # DEPRECATED
            angle_factor (float): scalar controlling the relative reward for the angles off axis  # DEPRECATED
            v_factor (float): scalar controlling the relative reward for the angular velocity  # DEPRECATED
            rotor_factor (float): scalar controlling the relative reward for the relative rotor speed  # DEPRECATED
            time_factor (float): scalar controlling the relative reward for the time bonus  # DEPRECATED
            elevation_factor (float): scalar controlling the relative reward for the elevation measurement  # DEPRECATED
            crash_factor (float): scalar controlling the relative reward for the crash detector  # DEPRECATED
            target_factor (float): scalar controlling the relative reward for the goal conditions  # DEPRECATED
        """
        ## Set target and initial conditions based on the goal,
        ## but don't override conditions that were explicitly specified.
        ## (Bug fix: the string comparisons originally used `is`, which tests
        ## identity, not equality; `==` is the correct operator here.)
        if goal == "target":
            self.target_pos = np.array([10., 0., 10.]) if target_pos is None else target_pos
        elif goal == "hover":
            self.target_pos = np.array([0., 0., 10.]) if target_pos is None else target_pos
        elif goal == "land":
            self.target_pos = np.array([0., 0., 0.01]) if target_pos is None else target_pos
        elif goal == "takeoff":
            self.target_pos = np.array([0., 0., 10.]) if target_pos is None else target_pos
            self.init_pos = np.array([0.0, 0.0, 0.1, 0.0, 0.0, 0.0]) if init_pos is None else init_pos
        else:
            # Raising (instead of the original print) avoids leaving the object
            # half-initialized when an unknown goal is passed.
            raise ValueError("It seems an improper goal was specified. "
                             "Please input one of the following options:\n"
                             "* 'hover'\n* 'target'\n* 'land'\n* 'takeoff'")
        if goal != "takeoff":
            self.init_pos = np.array([0.0, 0.0, 10., 0.0, 0.0, 0.0]) if init_pos is None else init_pos
        self.goal = goal

        ## Simulation
        self.sim = PhysicsSim(self.init_pos, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = action_repeat
        self.state_size = self.action_repeat * state_size
        self.action_low = action_low
        self.action_high = action_high
        self.action_size = 4  # dimension of each action (one for each rotor)
        self.min_accuracy = min_accuracy
        self.distance_factor = distance_factor    # DEPRECATED
        self.angle_factor = angle_factor          # DEPRECATED
        self.v_factor = v_factor                  # DEPRECATED
        self.rotor_factor = rotor_factor          # DEPRECATED
        self.time_factor = time_factor            # DEPRECATED
        self.elevation_factor = elevation_factor  # DEPRECATED
        self.crash_factor = crash_factor          # DEPRECATED
        self.target_factor = target_factor        # DEPRECATED

        ## Numerical representation of the distance to the target
        self.target_proximity = (abs(self.sim.pose[:3] - self.target_pos)).sum()
        self.total_proximity = []
        self.average_proximity = 0.
        self.best_proximity = np.inf
        self.previous_proximity = self.target_proximity

        ## DEPRECATED
        self.params = {
            "Distance": self.distance_factor,
            "Angle": self.angle_factor,
            "Angular V": self.v_factor,
            "Rotor": self.rotor_factor,
            "Time": self.time_factor,
            "Elevation": self.elevation_factor,
            "Crash": self.crash_factor,
            "Target": self.target_factor
        }

    def get_reward(self, previous_reward, done, rotor_speeds):
        """
        Returns a reward based on a number of positive and negative factors guiding
        the agent on how to perform. These values seemed like appropriate numbers to
        encourage good behavior, but I added scalar variables to try various relations
        between the weights. Overall, the goal was for the agent to have a negative
        score if it crashes, a positive score if it doesn't, and a very positive
        score if it achieves its goal.
        """
        ## Update the distance to target; calculate total, average & best if done
        self.target_proximity = (abs(self.sim.pose[:3] - self.target_pos)).sum()
        if done:
            self.total_proximity.append(self.target_proximity)
            self.average_proximity = np.mean(self.total_proximity[-10:])
            if self.target_proximity < self.best_proximity:
                self.best_proximity = self.target_proximity

        reward = 1. - .3 * (abs(self.sim.pose[:3] - self.target_pos)).sum()  # default reward function

        ## The following calculations are DEPRECATED, to simplify the reward function.

        ## Subtracts the difference between the current position & target
        # reward = (1 - self.target_proximity) * self.distance_factor

        ## Penalizes moving further from the target than before, within a margin of error
        # if self.previous_proximity < (self.target_proximity * 1.2):
        #     reward -= 50 * self.distance_factor
        # self.previous_proximity = self.target_proximity

        ## Adds a reward amount if the yaw, roll or pitch are less than about 28 degrees;
        ## otherwise, it's a negative reward
        # for angle in self.sim.pose[3:]:
        #     if angle >= 6.3:
        #         print("Something's wrong with the angles.")
        #     if angle >= 3.1415:
        #         angle = 6.283 - angle
        #     reward += (.5 - angle) * self.angle_factor

        ## Adds a reward amount if the yaw, roll or pitch velocities are less than
        ## about 28 degrees per second; otherwise, it's a negative reward
        # for v in self.sim.angular_v:
        #     reward += (.5 - abs(v)) * self.v_factor

        ## Adds a penalty if the rotors differ in speed too drastically
        # for rotor_a, rotor_b in itertools.combinations(rotor_speeds, 2):
        #     if abs(rotor_a - rotor_b) > 500:
        #         reward -= abs(rotor_a - rotor_b) * self.rotor_factor

        ## Sets a time factor that rewards the agent for each timestep it hasn't crashed,
        ## plus a major bonus if it makes it to the penultimate timestep
        # reward += 100 * self.time_factor
        # reward = reward + (100 * self.time_factor) if self.sim.time > 4 else reward

        ## Further penalizes deviation in elevation
        # if self.goal != "land":
        #     reward -= abs(self.target_pos[2] - self.sim.pose[2]) * 10 * self.elevation_factor

        ## Major penalty if the agent crashes early (i.e., the episode ends far from the target)
        if done and np.average(abs(self.sim.pose[:3] - self.target_pos)) > self.min_accuracy * 2:
            reward -= 10000 * self.crash_factor
            # print("\nCrashed!! Current proximity: {:7.3f} || Current timestep: {:7.3f}".format(
            #     self.target_proximity, self.sim.time))  # useful for visualizing behavior while training

        ## A decent reward if the agent gets close to the target, to help it hone in on what works
        if self.target_proximity < self.min_accuracy * 4:
            reward += 50 * self.target_factor

        ## Major reward if the agent reaches the target location (within the minimum accuracy).
        ## Most importantly, this stops the simulation at that time, except for hover,
        ## which can't stop early.
        if self.target_proximity < self.min_accuracy:
            ## An additional reward factor for reaching the target around the penultimate
            ## timestep, but not so quickly as to zoom past uncontrollably
            reward += 100000 * (self.sim.time / (self.sim.runtime - 1)) * self.target_factor
            done = self.goal != "hover"
            # print("\nSuccess! (•̀ᴗ•́)و")

        ## DEPRECATED
        ## Normalizes the reward to a range of -1 to 1 by implementing a floor of -1.
        ## This was suggested as a fix, but did not really improve the situation for me.
        # reward = -1. if reward < -1. else reward

        return reward + previous_reward, done

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward, done = self.get_reward(reward, done, rotor_speeds)
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
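# A minimal usage sketch (not from the original file): it drives one episode of the
# goal-configurable Task above with uniformly random rotor speeds, just to show the
# calling convention. It assumes Task and its PhysicsSim dependency are importable
# from the surrounding file.
import numpy as np

task = Task(goal="takeoff", runtime=3.)
state = task.reset()
done = False
while not done:
    # one speed per rotor, sampled inside the allowed action range
    rotor_speeds = np.random.uniform(task.action_low, task.action_high, task.action_size)
    state, reward, done = task.step(rotor_speeds)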
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # Initialize counter variables
        reward = 0
        penalty = 0

        # Define current position
        current_position = self.sim.pose[:3]

        # Penalty for distance from target
        penalty += abs(current_position[0] - self.target_pos[0])
        penalty += abs(current_position[1] - self.target_pos[1])
        penalty += abs(current_position[2] - self.target_pos[2])

        # Get current offsets from the target
        x_pos = (current_position[0] - self.target_pos[0])
        y_pos = (current_position[1] - self.target_pos[1])
        z_pos = (current_position[2] - self.target_pos[2])
        distance = np.sqrt(x_pos**2 + y_pos**2 + z_pos**2)

        # Structured reward system in the hope the agent will pick up the trail.
        # Note the tiers are cumulative: a small distance earns every bonus below it.
        if distance < 5:
            reward += 10000
        if distance < 10:
            reward += 5000
        if distance < 20:
            reward += 2000
        if distance < 40:
            reward += 1000
        if distance < 60:
            reward += 500
        if distance < 100:
            reward += 250
        if distance < 200:
            reward += 100

        return reward - 0.05 * penalty

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
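# A quick sanity check (hypothetical numbers, not from the original) showing that
# the tiered bonuses above stack: at distance 4 every tier fires, so the bonuses
# sum to 10000 + 5000 + 2000 + 1000 + 500 + 250 + 100 = 18850, dwarfing the
# 0.05-weighted distance penalty.
distance, penalty = 4.0, 12.0
tiers = [(5, 10000), (10, 5000), (20, 2000), (40, 1000), (60, 500), (100, 250), (200, 100)]
reward = sum(bonus for limit, bonus in tiers if distance < limit)
assert abs((reward - 0.05 * penalty) - 18849.4) < 1e-9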
class TakeOff():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Start Pos
        self.init_pose = init_pose if init_pose is not None else np.array([0., 0., 0., 0., 0., 0.])

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # Bug fix: the original measured height with self.init_pose[2], which never
        # changes during an episode; the current simulated height is self.sim.pose[2].
        # (A local `done` flag was also set here but never returned; it is dropped.)
        reward = -min(abs(self.target_pos[2] - self.sim.pose[2]), 20.0)
        if self.sim.pose[2] >= self.target_pos[2]:
            reward += 10.0
        elif self.sim.time > self.sim.runtime:
            reward -= 10.0
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 1

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # reward = 1.-.3*(abs(self.sim.pose[:3] - self.target_pos)).sum()
        punish_x = abs(self.sim.pose[0] - self.target_pos[0])
        punish_y = abs(self.sim.pose[1] - self.target_pos[1])
        reward_z = 1.0 - (self.target_pos[2] - self.sim.pose[2])
        punish_rot1 = abs(self.sim.pose[3])
        punish_rot2 = abs(self.sim.pose[4])
        punish_rot3 = abs(self.sim.pose[5])
        reward_vz = self.sim.v[2]
        reward = reward_z - 0.1 * (punish_x + punish_y) \
            - 0.1 * (punish_rot1 + punish_rot2 + punish_rot3) \
            + 0.1 * reward_vz
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        # State is pose (6) + velocity (3) + angular velocity (3) per repeat
        self.state_size = self.action_repeat * 12
        self.action_low = 100
        self.action_high = 900
        self.action_size = 4
        self.runtime = runtime

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])
        self.upp_bounds = self.target_pos + 2
        self.low_bounds = self.target_pos - 2

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        distance = np.tanh(3. - 0.003 * (abs(self.sim.pose[:3] - self.target_pos)).sum())
        # angular_vel = abs(np.tanh(self.sim.angular_v.sum()))
        # angles_pos = abs(np.tanh(self.sim.pose[3:].sum()))
        # height = self.sim.pose[2]
        reward = 1 + distance
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
            pose_all.append(self.sim.v)
            pose_all.append(self.sim.angular_v)
        # Success when the final pose lands inside a +/-2 box around the target
        condition = (self.upp_bounds > self.sim.pose[:3]).all() and \
                    (self.sim.pose[:3] > self.low_bounds).all()
        if condition:
            print("Target reached!", end="")
            reward += 40
            done = True
        elif done:
            if self.sim.pose[2] <= self.sim.lower_bounds[2]:
                print("Crashed!", end="")
                reward -= 20
            elif (self.sim.pose[:3] <= self.sim.lower_bounds).any() or \
                 (self.sim.pose[:3] >= self.sim.upper_bounds).any():
                print("Out!", end="")
            else:
                print("Time out!", end="")
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate((self.sim.pose, self.sim.v, self.sim.angular_v) * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4
        self.t = 0  # timestep counter across the episode

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self, done):
        """Uses current pose of sim to return reward."""
        penalty = 0
        # NOTE: this branch assigns 0, so the early-termination penalty is
        # effectively disabled; see the sketch after this class for the
        # presumably intended version.
        if done and self.t < 240:
            penalty = 0
        reward = penalty + 1 - 0.01 * (np.linalg.norm(self.sim.pose[:3] - self.target_pos))
        # Terms that were tried and commented out:
        # - .001*(np.square(self.sim.pose[3:])).sum()
        # alt_reward = penalty + 1 - 0.01*(np.linalg.norm(self.sim.pose[:3] - self.target_pos)) \
        #     - .001*(np.square(self.sim.pose[3:])).sum()
        # - .005*(abs(self.sim.v)).sum()
        # - .005*(abs(self.sim.pose[3:])).sum()
        # - .3*(abs(self.sim.angular_v)).sum() - .3*(abs(self.sim.pose[3:])).sum()
        # - .01*(abs(self.sim.angular_v)).sum()
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            self.t += 1
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward(done)
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.t = 0
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
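# The `if done and self.t < 240` branch above assigns penalty = 0 and is therefore
# a no-op. A minimal sketch of what it presumably intended -- an actual penalty for
# episodes that terminate before timestep 240 -- where the -10 magnitude is an
# assumption, not taken from the source:
import numpy as np

def get_reward_with_crash_penalty(self, done):
    """Drop-in variant of get_reward with the early-termination penalty enabled."""
    penalty = -10 if (done and self.t < 240) else 0  # assumed magnitude
    return penalty + 1 - 0.01 * np.linalg.norm(self.sim.pose[:3] - self.target_pos)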
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal - take off from (0, 0, 0) and reach position (0, 0, 150).
        # Bug fix: the original assigned self.target_pos = target_pos twice with no
        # fallback, so the default of None would crash get_reward; default to the
        # goal height stated in the comment above.
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 150.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        reward = np.tanh(1. - .03 * (abs(self.sim.pose[:3] - self.target_pos))).sum()
        # Penalize the crashes (episodes that end before the runtime is up)
        if self.sim.done and self.sim.runtime > self.sim.time:
            reward += -100
        if self.sim.pose[2] >= self.target_pos[2]:
            reward = 200
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Takeoff():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., height=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            height: target/goal height (z) for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 30

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4
        self.max_reward = -np.inf

        # Goal (normalized to an np.array in both branches; the original used a
        # plain list when a height was given)
        self.target_pos = np.array([0., 0., height]) if height is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # Earlier attempts, kept for reference:
        # height_reward = np.exp(-abs(self.sim.pose[2] - self.target_pos[2]))
        # error = 1 * np.exp(-abs(self.sim.pose[2] - self.target_pos[2]))
        # reward = np.tanh(1 - np.exp(-1/(abs(self.sim.pose[2] - self.target_pos[2])/self.target_pos[2])))
        # reward = np.exp(-np.sqrt((abs(self.sim.pose[2] - self.target_pos[2]))))
        reward = 1 - (abs(self.sim.pose[2] - self.target_pos[2])) / self.target_pos[2]
        # if reward < 0.1:
        #     reward += 20
        # self.max_reward = max(self.max_reward, reward)
        # phi_error = 0.1*(1 - np.exp(-1/self.sim.pose[3]))
        # theta_error = 0.1*(1 - np.exp(-1/self.sim.pose[4]))
        # phi_error = 0.000004 * (1 - abs(np.sin(self.sim.pose[3])))
        # theta_error = 0.000004 * (1 - abs(np.sin(self.sim.pose[4])))
        # penalty_phi = np.exp(-abs(self.sim.pose[3] - 0)*0.01) + 0.5
        # penalty_theta = np.exp(-abs(self.sim.pose[4] - 0)*0.01) + 0.5
        # reward = np.exp(-abs(self.sim.pose[3] - 0))/2   # phi
        # reward += np.exp(-abs(self.sim.pose[4] - 0))/2  # theta
        # reward = np.tanh(reward/3*2 - 1)
        return reward  # + phi_error + theta_error

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 1

        # State is a 2-vector: normalized height and normalized vertical velocity
        self.state_size = self.action_repeat * 2
        self.action_low = 390
        self.action_high = 420
        self.action_size = 1
        self.action_range = self.action_high - self.action_low

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # Get max reward of 1 if within 1 unit from hover spot
        # Get min reward of -1 if 6+ units from hover spot
        distance = np.linalg.norm(self.sim.pose[:3] - self.target_pos)
        # print(self.sim.pose[:3])
        # print(self.target_pos)
        # print(distance)
        # Earlier attempts:
        # if distance < 4:
        #     reward = 1 - (distance/50)**0.4
        #     reward = min(1, 1/(distance + 1)**2)
        # else:
        #     reward = 0
        reward = 1 - (distance / 50)**0.4
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            # NOTE: the height is normalized with /3 here but /5 in reset();
            # the inconsistency is preserved from the original.
            z_norm = (self.sim.pose[2] - 10) / 3
            z_v_norm = self.sim.v[2] / 3
            rotor_norm = (rotor_speeds[0] - self.action_low) / self.action_range * 2 - 1  # computed but unused
            pose_all.append(z_norm)
            pose_all.append(z_v_norm)
        # print(rotor_speeds)
        next_state = np.array(pose_all)
        # print(next_state)
        return next_state, reward, done

        # A previous version of step(), kept for reference:
        # reward = 0
        # pose_all = []
        # for _ in range(self.action_repeat):
        #     done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
        #     reward += self.get_reward()
        #     pose_all.append(self.sim.pose)
        # next_state = np.concatenate(pose_all)
        # return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode.
        (A stray trailing triple-quote in the original, which made the file
        unparseable, has been removed.)"""
        self.sim.reset()
        # Perturb the starting height by +/- perturb_unit units
        perturb_unit = 2
        self.sim.pose[2] += (2 * np.random.random() - 1) * perturb_unit
        # print("Starting height {:7.3f}".format(self.sim.pose[2]))
        z_norm = (self.sim.pose[2] - 10) / 5
        z_v_norm = 0
        rotor_norm = 0  # unused
        state = np.array([z_norm, z_v_norm] * self.action_repeat)
        return state
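# This variant exposes a single scalar action (action_size = 1) while PhysicsSim
# still drives four rotors, so the one command presumably has to be broadcast to
# all rotors before stepping. A hedged sketch of that glue (the broadcasting
# convention is an assumption, not shown in the source):
def broadcast_action(action):
    """Duplicate the single scalar rotor command across all four rotors."""
    return [float(action)] * 4

# e.g., stepping with the midpoint of the allowed range, assuming `task` is an
# instance of the class above:
# next_state, reward, done = task.step(broadcast_action(task.action_low + 0.5 * task.action_range))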
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # Tried reward functions that are linear or squared sums of the difference
        # between current and target coordinates:
        # reward = 1. - .3*(abs(self.sim.pose[:3] - self.target_pos)).sum()
        # reward = 10. - ((abs(self.sim.pose[:3] - self.target_pos)**2).sum())
        # reward = 10. - 0.03*((self.sim.pose[:3] - self.target_pos).sum())**2

        # A reward function that incorporates rewards for hovering along each
        # coordinate plane, weighted more heavily towards vertical (z-axis) hovering
        xr = 0.02 * (abs(self.sim.pose[0] - self.target_pos[0]))**2 / 100
        yr = 0.02 * (abs(self.sim.pose[1] - self.target_pos[1]))**2 / 100
        zr = 0.05 * (abs(self.sim.pose[2] - self.target_pos[2]))**2 / 100
        reward = 10 - xr - yr - zr
        return reward

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task():
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        if target_pos is None:
            # (message fixed: it announces the default target, not the init pose)
            print("Setting default target position")
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        """Uses current pose of sim to return reward."""
        # reward = (1 - 0.003 * (abs(self.sim.pose[2] - self.target_pos[2]))).sum()
        delta_x = abs(self.sim.pose[0] - self.target_pos[0])
        delta_y = abs(self.sim.pose[1] - self.target_pos[1])
        delta_z = abs(self.sim.pose[2] - self.target_pos[2])
        reward = 1 - .252 * delta_z
        reward = reward - (0.003 * delta_x) if delta_x > 0 else reward
        reward = reward - (0.003 * delta_y) if delta_y > 0 else reward
        return np.tanh(reward)

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        if done:
            reward += 10
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state
class Task:
    """Task (environment) that defines the goal and provides feedback to the agent."""

    def __init__(self, init_pose=None, init_velocities=None,
                 init_angle_velocities=None, runtime=5., target_pos=None):
        """Initialize a Task object.
        Params
        ======
            init_pose: initial position of the quadcopter in (x,y,z) dimensions and the Euler angles
            init_velocities: initial velocity of the quadcopter in (x,y,z) dimensions
            init_angle_velocities: initial radians/second for each of the three Euler angles
            runtime: time limit for each episode
            target_pos: target/goal (x,y,z) position for the agent
        """
        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities, init_angle_velocities, runtime)
        self.action_repeat = 3

        self.state_size = self.action_repeat * 6
        self.action_low = 0
        self.action_high = 900
        self.action_size = 4

        # Goal
        self.target_pos = target_pos if target_pos is not None else np.array([0., 0., 10.])

    def get_reward(self):
        return self.everything_considered()

    def step(self, rotor_speeds):
        """Uses action to obtain next state, reward, done."""
        reward = 0
        pose_all = []
        done = False
        for _ in range(self.action_repeat):
            done = self.sim.next_timestep(rotor_speeds)  # update the sim pose and velocities
            reward += self.get_reward()
            pose_all.append(self.sim.pose)
        next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset(self):
        """Reset the sim to start a new episode."""
        self.sim.reset()
        state = np.concatenate([self.sim.pose] * self.action_repeat)
        return state

    # Rewards (sigmoid, abs_arctan, abs_arctan_inv and pi2 are helpers not defined
    # in this file; see the sketch after this class) ...
    def reward_speed(self):
        remain_s = self.target_pos - self.sim.pose[:3]
        target_s = remain_s / (self.sim.runtime - self.sim.time)
        delta_s = abs(self.sim.v - target_s).sum()
        return 2 - sigmoid(delta_s)

    def reward_basic(self):
        return np.tanh(1 - 0.003 * (abs(self.sim.pose[:3] - self.target_pos))).sum()

    def everything_considered(self):
        euler = self.sim.pose[3:6]
        current_pos = self.sim.pose[:3]
        start_pos = self.sim.init_pose[:3]
        speed = self.sim.v
        s_max = speed[2]    # speed in z
        s_min = speed[:2]   # speed in x, y
        angular_speed = self.sim.angular_v
        delta_position = current_pos - start_pos
        dp_max = delta_position[2]    # max this (Z)
        dp_min = delta_position[:2]   # min this (X, Y)

        euler = (abs_arctan(euler - np.pi) * 1.25).sum() / 3     # min = 0, max = 1
        angular_speed = abs_arctan_inv(angular_speed).sum() / 3  # min = 0, max = 1
        dp_min = abs_arctan_inv(dp_min).sum() / 2                # min = 0, max = 1
        dp_max = abs_arctan(dp_max)                              # min = 0, max = 1
        s_min = abs_arctan_inv(s_min).sum() / 2                  # min = 0, max = 1
        s_max = np.arctan(s_max) / pi2                           # min = -1, max = 1

        # Renamed from `sum`, which shadowed the built-in of the same name
        total = euler + angular_speed + dp_max + dp_min + s_max + s_min
        mx = 6
        mn = -1
        delta = mx - mn
        reward = (total + abs(mn)) / delta
        return reward
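# The class above calls four helpers (sigmoid, abs_arctan, abs_arctan_inv, pi2)
# that are not defined in this file. These are hedged reconstructions inferred
# from how they are used and from the "min = 0, max = 1" comments -- assumptions,
# not the original definitions:
import numpy as np

pi2 = np.pi / 2  # arctan's asymptote, used to normalize into (-1, 1)

def sigmoid(x):
    """Standard logistic function."""
    return 1. / (1. + np.exp(-x))

def abs_arctan(x):
    """Squash |x| into [0, 1): larger magnitudes map closer to 1."""
    return np.arctan(np.abs(x)) / pi2

def abs_arctan_inv(x):
    """Complement of abs_arctan: 1 at x = 0, falling toward 0 as |x| grows."""
    return 1. - np.arctan(np.abs(x)) / pi2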