class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world by using Q-Learning"""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.possible_actions = Environment.valid_actions
        self.ai = QLearn(epsilon=0.05, alpha=0.1, gamma=0.9)

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Index the sensed inputs by key; relying on dict.items() ordering is not reliable
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
        action = self.ai.Q_move(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        inputs2 = self.env.sense(self)
        next_state = (inputs2['light'], inputs2['oncoming'], inputs2['left'], self.next_waypoint)
        self.ai.Q_post(self.state, action, next_state, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
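# The QLearn helper used above is not included in the snippet. A minimal
# sketch of what it might look like, assuming epsilon-greedy selection over
# the four smartcab actions and a standard one-step Q-update; everything
# beyond the calling convention above (Q_move / Q_post) is an assumption,
# not the author's code:
import random


class QLearn(object):
    """Tabular Q-learning: Q(s,a) moves toward reward + gamma * max_a' Q(s',a')."""

    def __init__(self, epsilon, alpha, gamma, actions=(None, 'forward', 'left', 'right')):
        self.q = {}              # maps (state, action) -> value
        self.epsilon = epsilon   # exploration rate
        self.alpha = alpha       # learning rate
        self.gamma = gamma       # discount factor
        self.actions = actions

    def Q_move(self, state):
        """Pick a random action with probability epsilon, else a greedy one."""
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        q_vals = [self.q.get((state, a), 0.0) for a in self.actions]
        best = max(q_vals)
        # break ties randomly among equally good actions
        return random.choice([a for a, v in zip(self.actions, q_vals) if v == best])

    def Q_post(self, state, action, next_state, reward):
        """One-step Q-learning update."""
        old = self.q.get((state, action), 0.0)
        future = max(self.q.get((next_state, a), 0.0) for a in self.actions)
        self.q[(state, action)] = old + self.alpha * (reward + self.gamma * future - old)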
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state

        # TODO: Select action according to your policy
        action = None

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world by using Q-Learning"""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.learner = QLearn(epsilon=1, alpha=0.3, gamma=0.6)

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = self.learner.action(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        next_inputs = self.env.sense(self)
        self.future_next_way_out = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], self.future_next_way_out)
        # next_state = (next_inputs[0], next_inputs[1], next_inputs[3], self.planner.next_waypoint())
        self.learner.learn(self.state, action, next_state, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.q_matrix = {}
        self.alpha = 0.95
        self.gamma = 0.05
        self.explore = 0.05
        self.state = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        deadline = self.env.get_deadline(self)
        state = self.observe_state()
        self.state = state
        knowledge = self.knowledge_from_state(state)

        # TODO: Select action according to your policy
        if random() < self.explore:
            action = choice(OPTIONS)
        else:
            action = sorted(knowledge.items(), key=lambda x: x[1], reverse=True)[0][0]
        old_value = knowledge[action]

        # Execute action and get reward
        reward = self.env.act(self, action)

        new_state = self.observe_state()
        new_state_knowledge = self.knowledge_from_state(new_state)
        self.q_matrix[state][action] = (1 - self.alpha) * old_value + \
            self.alpha * (reward + self.gamma * max(new_state_knowledge.values()))

        # print "LearningAgent.update(): deadline = {}, state = {}, action = {}, reward = {}".format(deadline, state, action, reward)  # [debug]

    def knowledge_from_state(self, state):
        """ Gets what we know about the given state """
        if state not in self.q_matrix:
            self.q_matrix[state] = {option: 0 for option in OPTIONS}
        return self.q_matrix[state]

    def observe_state(self):
        """ Observe the current state """
        inputs = self.env.sense(self)
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        return (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
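# The agent above calls random(), choice() and OPTIONS without defining them;
# presumably they live at module level. A plausible preamble (the OPTIONS name
# is taken from the calls above, its contents are assumed to be the smartcab's
# four valid actions):
from random import random, choice

OPTIONS = [None, 'forward', 'left', 'right']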
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # location, heading, delta, deadline, oncoming, traffic light state
        inputs['waypoint'] = self.next_waypoint
        self.state = tuple(sorted(inputs.items()))  # performs the actual update

        # TODO: Select action according to your policy
        actions = [None, 'forward', 'left', 'right']
        action = actions[random.randint(0, 3)]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class BasicAgent(Agent):

    def __init__(self, env):
        super(BasicAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.color = 'red'  # override color

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        action = random.choice(self.env.valid_actions)
        reward = self.env.act(self, action)

        light = inputs['light']
        oncoming = inputs['oncoming']
        left = inputs['left']
        right = inputs['right']

        # Update the current observed state
        self.state = (light, oncoming, left, right, self.next_waypoint)

        print "Basic.update(): next move = {}".format(action)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.alpha = 0.2
        self.Qtable = {}

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.reward = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # Inform Driving Agent
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'], self.next_waypoint)

        # TODO: Select action according to your policy
        valid_actions = [None, 'forward', 'left', 'right']
        # Random pick of actions:
        ### action = random.choice(valid_actions)
        # Get all Q values for the current state
        all_actions = {action: self.Qtable.get((self.state, action), 0) for action in valid_actions}
        # Find the actions that maximize the Q value
        best_actions = [action for action in valid_actions if all_actions[action] == max(all_actions.values())]
        # Return best action
        action = random.choice(best_actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        # To simplify the problem, I set gamma equal to 0.
        self.Qtable[(self.state, action)] = \
            (1 - self.alpha) * self.Qtable.get((self.state, action), 0) + self.alpha * reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = inputs['location']
        self.heading = inputs['heading']

        action = random.choice(Environment.valid_actions)
        reward = self.env.act(self, action)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        # Actions
        self.A = ['forward', 'left', 'right', None]  # all the actions available
        self.trial = 0
        # Initialize Q table indexed by (light, oncoming, next_waypoint)
        self.Q = {}
        for i in ['green', 'red']:  # possible lights
            for j in [None, 'forward', 'left', 'right']:  # possible oncoming-agent actions
                for k in ['forward', 'left', 'right']:  # possible next_waypoints
                    self.Q[(i, j, k)] = [1] * len(self.A)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], self.next_waypoint)

        # Find the index of the max Q value for the current state
        max_Q = self.Q[self.state].index(max(self.Q[self.state]))

        # Assign action: p < 1.5 over randrange(0, 5) gives a 40% chance to act randomly
        p = random.randrange(0, 5)
        epsilon = 1.5
        if p < epsilon:
            action = random.choice(self.env.valid_actions)
        else:
            action = self.A[max_Q]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        gamma = 0.30
        alp_tune = 500  # tuning parameter
        # decay learning rate (float division; under Python 2 integer division
        # self.trial / alp_tune would stay 0 for the first 500 steps)
        alpha = 1.0 / (1.1 + self.trial / float(alp_tune))
        self.trial = self.trial + 1

        # Get the next state for Q(s', a')
        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'], next_next_waypoint)

        # Update Q table; next_waypoint is None once the destination is reached,
        # a state the table was never seeded with, so fall back to a zero row
        future_Q = max(self.Q.get(next_state, [0.0]))
        self.Q[self.state][self.A.index(action)] = \
            (1 - alpha) * self.Q[self.state][self.A.index(action)] + \
            (alpha * (reward + gamma * future_Q))
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, learning_rate=0.6, discount_factor=0.35):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.alpha = learning_rate
        self.gamma = discount_factor
        self.state = None
        self.Q_values = self.initialize_Q_values()

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def initialize_Q_values(self, val=10.0):
        Q_values = {}
        for waypoint in ['left', 'right', 'forward']:
            for light in ['red', 'green']:
                for action in self.env.valid_actions:
                    Q_values[((waypoint, light), action)] = val
        return Q_values

    def find_max_Q(self, state):
        action = None
        max_Q = 0.0
        for a in self.env.valid_actions:
            Q = self.Q_values[(state, a)]
            if Q > max_Q:
                action = a
                max_Q = Q
        return (max_Q, action)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (self.next_waypoint, inputs['light'])

        # TODO: Select action according to your policy
        # action = random.choice(self.env.valid_actions)
        (Q, action) = self.find_max_Q(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        state_prime = (next_waypoint, inputs['light'])
        (Q_prime, action_prime) = self.find_max_Q(state_prime)
        Q += self.alpha * (reward + self.gamma * Q_prime - Q)
        self.Q_values[(self.state, action)] = Q

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        # simple route planner to get next_waypoint
        self.planner = RoutePlanner(self.env, self)
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        # from route planner, also displayed by simulator
        self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = self.next_waypoint, inputs['light']
        # self.state = self.next_waypoint, (inputs['oncoming'] == None or inputs['light'] == 'green')

        # TODO: Select action according to your policy
        # Define actions
        actions = [None, 'forward', 'left', 'right']
        # QUESTION 1 - select random action
        action = random.choice(actions)

        # Execute action and get reward
        reward = self.env.act(self, action)
class LearningAgent(Agent):

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.next_waypoint = None
        self.total_reward = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.next_waypoint = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Select action according to your policy
        loc, hed = self.get_my_location()
        action = self.get_next_waypoint_given_location(loc, hed)
        action_okay = self.check_if_action_is_ok(inputs)
        if not action_okay:
            action = None

        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward

        # TODO: Learn policy based on state, action, reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
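# The three helpers called above are not included in the snippet. Sketches of
# what they might look like as methods of this class, assuming the
# environment's agent_states bookkeeping and the simulator's usual US
# right-of-way rules; these bodies are guesses, not the author's code:

    def get_my_location(self):
        # The environment tracks each agent's location and heading.
        state = self.env.agent_states[self]
        return state['location'], state['heading']

    def get_next_waypoint_given_location(self, loc, hed):
        # With the stock planner, location and heading are already accounted for.
        return self.planner.next_waypoint()

    def check_if_action_is_ok(self, inputs):
        # True when the intended waypoint is legal under the traffic rules:
        # green light (left turns also need no oncoming traffic), or a right
        # turn on red with no cross traffic approaching from the left.
        if inputs['light'] == 'green':
            if self.next_waypoint == 'left':
                return inputs['oncoming'] not in ('forward', 'right')
            return True
        return self.next_waypoint == 'right' and inputs['left'] != 'forward'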
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        valid_waypoints = ['forward', 'right', 'left']
        valid_lights = ['red', 'green']
        valid_actions = ['forward', 'right', 'left', None]
        self.Q_hat = {}
        states = [(next_waypoint, light, oncoming, right, left)
                  for next_waypoint in valid_waypoints
                  for light in valid_lights
                  for oncoming in valid_actions
                  for right in valid_actions
                  for left in valid_actions]
        states.append('Destination')
        for state in states:
            self.Q_hat[state] = {}
            for action in valid_actions:
                self.Q_hat[state][action] = random.random()

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        if self.next_waypoint == None:
            self.state = 'Destination'
        else:
            self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'])

        # TODO: Select action according to your policy
        valid_actions = ['forward', 'right', 'left', None]
        epsilon = 0.1  # 0.01
        best_action = max(self.Q_hat[self.state], key=self.Q_hat[self.state].get)
        random_action = choice(valid_actions)
        # choice here must be numpy.random.choice: the stdlib random.choice takes no p argument
        action = choice([best_action, random_action], p=[1 - epsilon, epsilon])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        new_next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        new_inputs = self.env.sense(self)
        # Map the arrived-at-destination case to the same sentinel state used
        # above, or the Q_hat lookup below raises a KeyError
        if new_next_waypoint == None:
            new_state = 'Destination'
        else:
            new_state = (new_next_waypoint, new_inputs['light'], new_inputs['oncoming'], new_inputs['right'], new_inputs['left'])
        alpha = 0.5  # opt 0.7
        gamma = 0.5  # opt 0.1
        max_Qhat_ahat = max(self.Q_hat[new_state].values())
        self.Q_hat[self.state][action] = (1 - alpha) * self.Q_hat[self.state][action] + alpha * (reward + gamma * max_Qhat_ahat)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.prevReward = 0
        self.prevAction = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.prevReward = 0
        self.prevAction = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        validActions = []
        if inputs['light'] == 'red':
            if inputs['oncoming'] != 'left':
                validActions = ['right', None]
        else:
            if inputs['oncoming'] == 'forward':
                validActions = ['forward', 'right']
            else:
                validActions = ['right', 'forward', 'left']

        # TODO: Select action according to your policy
        if validActions != []:
            action = random.choice(validActions)
        else:
            action = random.choice(Environment.valid_actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        # (both branches of the original reward comparison were identical,
        # so the comparison is dropped)
        if action != None:
            self.state = (inputs, self.next_waypoint, deadline)
            self.prevAction = action
            self.prevReward = reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.qtable = defaultdict(dict)
        self.epsilon = .4  # Randomness factor
        self.alpha = .4  # Learning rate.

    def reset(self, destination=None):
        # Prepare for a new trip; reset any variables here, if required
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = self.define_state(inputs, self.next_waypoint)

        # Select action according to policy
        action = self.select_action()

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.update_qtable(action, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]

    def select_action(self):
        if random.random() < self.epsilon or self.state not in self.qtable:
            action = random.choice(self.env.valid_actions)
        else:
            action = max(self.qtable[self.state], key=self.qtable[self.state].get)
        if self.state not in self.qtable:
            for a in self.env.valid_actions:  # init action:reward dict.
                self.qtable[self.state][a] = 0
        return action

    def update_qtable(self, action, reward):
        old_q = self.qtable[self.state].get(action, 0)
        self.qtable[self.state][action] = ((1 - self.alpha) * old_q) + (self.alpha * reward)
        # Decay alpha and epsilon until they reach 0.001 and 0.0, respectively.
        self.alpha = max(0.001, self.alpha - 0.002)
        self.epsilon = max(0.0, self.epsilon - 0.002)

    @staticmethod
    def define_state(inputs, next_waypoint):
        return tuple(inputs.values()) + (next_waypoint, )
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Qtable = {}
        self.alpha = 0.8
        self.gamma = 0.6

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        old_state = self.state

        # TODO: Select action according to your policy
        # Seed unseen (state, action) pairs with 0, then act greedily,
        # breaking ties at random
        for a in [None, 'left', 'forward', 'right']:
            if (old_state, a) not in self.Qtable:
                self.Qtable[(old_state, a)] = 0
        max_q = max(self.Qtable[(old_state, a)] for a in [None, 'left', 'forward', 'right'])
        action = random.choice([a for a in ['left', 'forward', 'right', None] if self.Qtable[(old_state, a)] == max_q])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        # self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        new_state = self.state
        for a in [None, 'left', 'forward', 'right']:
            if (new_state, a) not in self.Qtable:
                self.Qtable[(new_state, a)] = 0
        self.Qtable[(old_state, action)] = (1 - self.alpha) * self.Qtable[(old_state, action)] + \
            self.alpha * (reward + self.gamma * max(self.Qtable[(new_state, a)] for a in [None, 'left', 'forward', 'right']))
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.actions = [None, 'forward', 'left', 'right']
        self.learning_rate = 0.3
        self.state = None
        self.q = {}
        self.trips = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.trips += 1
        if self.trips >= 100:
            self.learning_rate = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        state = (inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'], self.next_waypoint)
        self.state = state

        # Lazy initialization of Q-values
        for action in self.actions:
            if (state, action) not in self.q:
                self.q[(state, action)] = 1

        # Select action according to the policy
        probabilities = [self.q[(state, None)], self.q[(state, 'forward')], self.q[(state, 'left')], self.q[(state, 'right')]]
        # Use the softmax function so that the values actually behave like probabilities
        probabilities = np.exp(probabilities) / np.sum(np.exp(probabilities), axis=0)

        # Float division; Python 2 integer division would zero this out after the first trip
        adventurousness = max((100 - self.trips) / 100.0, 0)
        if random.random() < adventurousness:
            action = np.random.choice(self.actions, p=probabilities)
        else:
            action = self.actions[np.argmax(probabilities)]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.q[(state, action)] = self.learning_rate * reward + (1 - self.learning_rate) * self.q[(state, action)]

        # print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]

    def get_state(self):
        return self.state
class BasicAgent(Agent):
    """A basic agent upon which to build learning agents."""

    def __init__(self, env):
        super(BasicAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.qvals = {}  # mapping (state, action) to q-values
        self.time = 0  # number of moves performed
        self.possible_actions = (None, 'forward', 'left', 'right')
        self.n_dest_reached = 0  # number of destinations reached
        self.last_dest_fail = 0  # last time agent failed to reach destination
        self.sum_time_left = 0  # sum of time left upon reaching destination over all trials
        self.n_penalties = 0  # number of penalties incurred
        self.last_penalty = 0  # last trial in which the agent incurred a penalty

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def best_action(self, state):
        """ Return a random action (other agents will have different policies) """
        return random.choice(self.possible_actions)

    def update_qvals(self, state, action, reward):
        """ Keeps track of visited (state, action) pairs.
        (other agents will use reward to update the mapping
        from (state, action) pairs to q-values) """
        self.qvals[(state, action)] = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # update time
        self.time += 1

        # Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Pick an action
        action = self.best_action(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward < 0:
            self.n_penalties += 1

        # Update the q-value of the (state, action) pair
        self.update_qvals(self.state, action, reward)

        # print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.total_reward = 0
        self.next_waypoint = None
        self.QLearner = QAlgorithm(epsilon=0.03, alpha=0.1, gamma=0.9)
        self.next_state = None
        self.time_step = 1.0

    def get_decay_rate(self, t):
        # Decay rate for epsilon
        return 1.0 / float(t)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        self.time_step = 1.0
        # TODO: Prepare for a new trip; reset any variables here, if required
        '''
        self.previous_action = None
        self.next_state = None
        self.total_reward = 0
        self.next_waypoint = None
        self.QLearner = QAlgorithm(epsilon=0.03, alpha=0.1, gamma=0.9)
        self.next_state = None
        '''

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        # env_states = self.env.agent_states[self]

        self.time_step += 1
        epsilon = self.get_decay_rate(self.time_step)
        self.QLearner.set_eps(epsilon)

        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
        # self.state = (inputs['light'], self.next_waypoint)
        action = self.QLearner.q_move(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward

        new_inputs = self.env.sense(self)
        # env_states = self.env.agent_states[self]
        new_deadline = self.env.get_deadline(self)
        self.next_state = (new_inputs['light'], new_inputs['oncoming'], new_inputs['left'], self.next_waypoint)
        # self.next_state = (new_inputs['light'], self.next_waypoint)

        # TODO: Learn policy based on state, action, reward
        self.QLearner.q_future_reward(self.state, action, self.next_state, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
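# QAlgorithm is not part of the snippet. A sketch consistent with the three
# calls above (set_eps, q_move, q_future_reward); the internals are an
# assumption, not the author's code:
import random


class QAlgorithm(object):
    def __init__(self, epsilon, alpha, gamma, actions=(None, 'forward', 'left', 'right')):
        self.q = {}  # (state, action) -> value
        self.epsilon, self.alpha, self.gamma = epsilon, alpha, gamma
        self.actions = actions

    def set_eps(self, epsilon):
        # Lets the agent decay exploration as 1/t from outside
        self.epsilon = epsilon

    def q_move(self, state):
        # Epsilon-greedy action selection
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        return max(self.actions, key=lambda a: self.q.get((state, a), 0.0))

    def q_future_reward(self, state, action, next_state, reward):
        # One-step Q-learning update toward reward + gamma * max_a' Q(s', a')
        old = self.q.get((state, action), 0.0)
        future = max(self.q.get((next_state, a), 0.0) for a in self.actions)
        self.q[(state, action)] = old + self.alpha * (reward + self.gamma * future - old)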
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, alpha, q_initial):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.policy = {}
        self.trip_log = []
        self.trip = None
        self.alpha = alpha
        self.q_initial = q_initial

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        if self.trip:
            self.trip_log.append(self.trip)
        self.trip = {}
        self.trip['Deadline'] = self.env.get_deadline(self)
        self.trip['Reward'] = 0
        self.trip['Penalty'] = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        inputs.pop('right')
        self.state = inputs
        self.state['waypoint'] = self.next_waypoint

        # TODO: Select action according to your policy
        state_hash = hashabledict(frozenset(self.state.iteritems()))
        q = self.policy.get(state_hash, {
            None: self.q_initial,
            'forward': self.q_initial,
            'left': self.q_initial,
            'right': self.q_initial,
        })
        action = max(q.iteritems(), key=operator.itemgetter(1))[0]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Update trip stats
        self.trip['Reward'] += reward
        self.trip['Remaining'] = self.env.get_deadline(self)
        self.trip['Success'] = self.planner.next_waypoint() == None
        if reward < 0:
            self.trip['Penalty'] += reward

        # TODO: Learn policy based on state, action, reward
        q[action] = (1 - self.alpha) * q[action] + self.alpha * reward
        self.policy[state_hash] = q
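# hashabledict and operator are used above without being defined or imported
# in the snippet. operator is the stdlib module; hashabledict is presumably
# the common recipe below (an assumption), which lets a dict serve as a
# Q-table key by hashing its frozen item set:
import operator


class hashabledict(dict):
    """A dict usable as a dictionary key."""
    def __hash__(self):
        return hash(frozenset(self.iteritems()))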
class BasicLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(BasicLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.last_reward = 0
        self.last_action = None
        self.last_state = None
        self.state = None
        self.total_reward = 0
        self.deadline = self.env.get_deadline(self)
        self.actions = ['forward', 'left', 'right', None]
        self.reached_destination = 0
        self.penalties = 0
        self.movements = 0

    def tuple_state(self, state):
        State = namedtuple("State", ["light", "next_waypoint"])
        return State(light=state['light'], next_waypoint=self.planner.next_waypoint())

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.last_reward = 0
        self.last_action = None
        self.last_state = None
        self.state = None
        self.total_reward = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = self.tuple_state(inputs)

        # TODO: Select action according to your policy
        action = random.choice(self.actions)

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward >= 10:
            self.reached_destination += 1
        if reward < 0:
            self.penalties += 1
        self.movements += 1

        # TODO: Learn policy based on state, action, reward
        self.last_action = action
        self.last_state = self.state
        self.last_reward = reward
        self.total_reward += reward
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}
        self.learning_rate = 0.9
        self.states = None
        self.possible_actions = [None, 'forward', 'left', 'right']

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'], deadline)
        state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'])
        # state = (self.next_waypoint, inputs['light'])
        self.state = state

        for action in self.possible_actions:
            if (state, action) not in self.Q:
                self.Q[(state, action)] = 20

        # TODO: Select action according to your policy
        Q_best_value = [self.Q[(state, None)], self.Q[(state, 'forward')], self.Q[(state, 'left')], self.Q[(state, 'right')]]
        # Softmax
        Q_best_value = np.exp(Q_best_value) / np.sum(np.exp(Q_best_value), axis=0)
        # action = np.random.choice(self.possible_actions, p=probabilities)
        action = self.possible_actions[np.argmax(Q_best_value)]

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward == -10.0:
            print "Invalid move"
        if reward == -0.5:
            print "valid, but is it correct?"

        # TODO: Learn policy based on state, action, reward
        # self.Q[(state, action)] = reward + self.learning_rate * self.Q[(state, action)]
        self.Q[(state, action)] = reward * self.learning_rate + (1 - self.learning_rate) * self.Q[(state, action)]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.acts = ['left', 'right', 'forward', 'stay']
        df_tmp = pd.DataFrame(0 * np.ones((2, 4)), index=['red', 'green'], columns=self.acts)
        self.Q = df_tmp
        self.deadline = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        self.deadline = deadline

        # TODO: Update state
        self.state = inputs['light']  # in this case the state is simply defined by the color of the traffic light at the intersection

        # TODO: Select action according to your policy
        # Q-Learner
        act_opt = self.Q.ix[self.state].argmax()
        if act_opt == 'stay':
            act_opt = None
        # print "Current state = %s, qlearner action = %s" % (self.state, act_opt)
        action = act_opt

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        # Maximize Q conditional on the state we are in = S
        q_S = self.Q.ix[self.state]  # Q conditional on state S
        q_S_mx = q_S[q_S == q_S.max()]  # in case the action that maximizes Q(S, a) is not unique
        ix_mx = np.random.randint(0, len(q_S_mx))  # pick one of the equally good actions
        q_S_mx = q_S_mx.iloc[ix_mx]

        # Update Q
        act_tmp = action if action != None else 'stay'
        global alpha, alpha0, gamma
        alpha *= alpha0
        self.Q.ix[self.state, act_tmp] = \
            (1 - alpha) * self.Q.ix[self.state, act_tmp] + \
            alpha * (reward + gamma * q_S_mx)
        # note: we update the entry for the action actually taken, but the future-reward
        # contribution uses the best action we could have taken had we acted greedily
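# alpha, alpha0 and gamma above are module-level globals that the snippet
# mutates but never defines. Plausible definitions (the values are
# illustrative, not the author's):
alpha = 0.5    # learning rate, shrinks geometrically via alpha *= alpha0
alpha0 = 0.99  # per-step multiplicative decay factor
gamma = 0.2    # discount factor on the future-reward term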
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.QLearning = QLearning(epsilon=0.3, alpha=0.95, gamma=0.9)
        # self.QLearning = pickle.load(open('./QLearning.pkl'))

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def is_action_okay(self, inputs):
        action_okay = True
        if self.next_waypoint == 'right':
            if inputs['light'] == 'red' and inputs['left'] == 'forward':
                action_okay = False
        elif self.next_waypoint == 'forward':
            if inputs['light'] == 'red':
                action_okay = False
        elif self.next_waypoint == 'left':
            if inputs['light'] == 'red' or (inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right'):
                action_okay = False
        return action_okay

    def update(self, t):
        # Gather states
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        action_okay = self.is_action_okay(inputs)
        self.state = (action_okay, self.next_waypoint, deadline < 3)
        # self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = self.QLearning.policy(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Gather next states
        next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        next_inputs = self.env.sense(self)
        next_deadline = self.env.get_deadline(self)
        next_action_okay = self.is_action_okay(next_inputs)
        next_state = (next_action_okay, next_waypoint, next_deadline < 3)
        # next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], next_waypoint)

        # TODO: Learn policy based on state, action, reward
        self.QLearning.update_QTable(self.state, action, reward, next_state)
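# The QLearning class is external to this snippet. A minimal sketch matching
# the constructor and the two calls above (policy, update_QTable); the method
# bodies are assumptions, not the author's code:
import random


class QLearning(object):
    def __init__(self, epsilon, alpha, gamma, actions=(None, 'forward', 'left', 'right')):
        self.Q = {}  # (state, action) -> value
        self.epsilon, self.alpha, self.gamma = epsilon, alpha, gamma
        self.actions = actions

    def policy(self, state):
        """Epsilon-greedy over the tabular Q-values."""
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        return max(self.actions, key=lambda a: self.Q.get((state, a), 0.0))

    def update_QTable(self, state, action, reward, next_state):
        old = self.Q.get((state, action), 0.0)
        future = max(self.Q.get((next_state, a), 0.0) for a in self.actions)
        self.Q[(state, action)] = (1 - self.alpha) * old + self.alpha * (reward + self.gamma * future)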
class RandomAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(RandomAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.availableAction = ['forward', 'left', 'right', None]
        self.goal = 0
        self.steps = 0
        self.features = []
        self.deadline = []
        self.total_reward = [0]

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        # print "RESET, Final state:\n", self.state
        try:
            if self.deadline[len(self.features) - 1] > 0:  # deadline still positive when the trip ended
                self.goal += 1  # FIXME - order
                print "PASS! {} steps to goal, Goal reached {} times out of {}!".format(
                    self.deadline[len(self.features) - 1], self.goal, len(self.features))
            else:
                print "FAIL! {} steps to goal, Goal reached {} times out of {}!".format(
                    self.deadline[len(self.features) - 1], self.goal, len(self.features))
        except:
            print "Trial 0 - Goal reached {} times out of {}!".format(self.goal, len(self.features))
        print "----------------------------------------------------------"
        self.features.append({})
        self.deadline.append(None)
        self.total_reward.append(0)
        self.steps = 0

    def update(self, t):
        # Gather inputs
        self.steps += 1
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        # self.deadline[len(self.features)] = self.env.get_deadline(self)

        self.state = inputs
        self.features[len(self.features) - 1][self.steps] = inputs
        self.deadline[len(self.deadline) - 1] = self.env.get_deadline(self)

        # TODO: Select action according to your policy
        action = self.availableAction[random.randint(0, 3)]

        # Execute action and get reward
        reward = self.env.act(self, action)
        self.lastReward = reward

        # TODO: Learn policy based on state, action, reward
        self.total_reward[len(self.total_reward) - 1] = self.total_reward[len(self.total_reward) - 1] + reward
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, learning_rate=0.78, discount_factor=0.36):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.actions = Environment.valid_actions
        self.oldAction = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.oldAction = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # ------------------------------------------------
        # action = np.random.choice(self.actions)
        # ------------------------------------------------

        # TODO: Select action according to your policy
        current_state = tuple(inputs.values() + [self.next_waypoint])
        self.state = (inputs, self.next_waypoint)
        if current_state not in self.Q:
            self.Q[current_state] = [3, 3, 3, 3]
        Q_max = self.Q[current_state].index(np.max(self.Q[current_state]))

        # Execute action and get reward
        action = self.actions[Q_max]
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        if t != 0:
            oldvalue = self.Q[self.oldAction[0]][self.oldAction[1]]
            newvalue = ((1 - self.learning_rate) * oldvalue) + \
                (self.learning_rate * (self.oldAction[2] + self.discount_factor * self.Q[current_state][Q_max]))
            self.Q[self.oldAction[0]][self.oldAction[1]] = newvalue
        self.oldAction = (current_state, self.actions.index(action), reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.debugOn = False
        self.reporter = Reporter(driver_type, self.env.valid_actions)
        if driver_type == 'random':
            self.driver = RandomDriver(self.env.valid_actions)
        elif driver_type == 'greedy':
            self.driver = GreedyDriver(self.env.valid_actions)
        elif driver_type == 'greedy2':
            self.driver = Greedy2Driver(self.env.valid_actions)
        elif driver_type == 'smart':
            self.driver = SmartDriver(self.env.valid_actions, self.reporter)
        elif driver_type == 'smart2':
            self.driver = Smart2Driver(self.env.valid_actions, self.reporter)
        elif driver_type == 'smart3':
            self.driver = Smart3Driver(self.env.valid_actions, self.reporter)
        else:
            self.driver = Smart2Driver(self.env.valid_actions, self.reporter)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.reporter.reset(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = State.make(self.next_waypoint, inputs)

        # TODO: Select action according to your policy
        action = self.driver.decide(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        newWaypoint = self.planner.next_waypoint()
        newState = State.make(newWaypoint, self.env.sense(self))
        self.driver.learn(t, deadline, self.state, action, reward, newState)

        location = self.env.agent_states[self]['location']
        if self.debugOn:
            print "LearningAgent.update(): deadline={} inputs={} action={} reward={} location={}".format(deadline, inputs, action, reward, location)  # [debug]
        self.reporter.update(t, deadline, self.state, action, reward, location, self.driver)
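# driver_type, Reporter, State and the *Driver classes above live elsewhere in
# the author's project. The agent only relies on the interface reconstructed
# below (an assumption drawn from the decide/learn calls, not the author's
# code):
class Driver(object):
    def decide(self, state):
        """Return one of the valid actions for the given state."""
        raise NotImplementedError

    def learn(self, t, deadline, state, action, reward, new_state):
        """Update any internal value estimates after acting."""
        pass  # the random/greedy drivers may not learn at all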
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, trials):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.trials = trials
        self.success = 0  # How many times was the agent able to reach the target in the given trials?
        self.penalties = 0
        self.total_rewards = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.penalties = 0
        self.total_rewards = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # self.state = inputs
        # self.state = (inputs['light'], inputs['oncoming'], inputs['left'])
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = random.choice(self.env.valid_actions)  # Initial random movement / random action

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".\
            format(deadline, inputs, action, reward)  # [debug]

        # Statistics
        self.total_rewards += reward
        if reward < 0:
            self.penalties += 1
        if reward > 5:
            self.success += 1
            print "\nTrial info: Number of Penalties = {}, Net rewards: {}. \nTotal success/trials: {}/{}. \n".\
                format(self.penalties, self.total_rewards, self.success, self.trials)
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.actions = ['forward', 'left', 'right', None]
        self.q_table = Counter()  # collections.Counter defaults unseen (state, action) pairs to 0
        self.alpha = 0.5
        self.gamma = 0.2

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def get_best_action(self, state):
        best_profit = -999
        best_action = None
        for a in self.actions:
            profit = self.q_table[(state, a)]
            if profit > best_profit:
                best_profit = profit
                best_action = a
        return best_action, best_profit

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Explore with probability 3/11, otherwise exploit the best known action
        if randint(0, 10) < 3:
            action = self.actions[randint(0, 3)]
        else:
            action, _ = self.get_best_action(self.state)

        reward = self.env.act(self, action)

        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], next_next_waypoint)
        _, best_profit = self.get_best_action(next_state)
        self.q_table[(self.state, action)] = (1 - self.alpha) * self.q_table[(self.state, action)] + \
            (self.alpha * (reward + self.gamma * best_profit))

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.total_reward = 0
        # Q matrix
        self.Q = {}
        # initial Q values
        self.initial_q = 30
        # Initialize state
        self.states = None
        # Dictionary of possible actions to index states from Q matrix
        self.actions = {None: 0, 'forward': 1, 'left': 2, 'right': 3}

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Check if this state is included in the Q matrix. If not, create a new row in the matrix.
        if self.state not in self.Q:
            self.Q[self.state] = [self.initial_q for i in range(len(self.env.valid_actions))]

        # TODO: Select action according to your policy
        learning_rate = 0.8
        action = self.env.valid_actions[np.argmax(self.Q[self.state])]
        discount = 0

        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward

        # TODO: Learn policy based on state, action, reward
        # Note: this reuses the pre-action inputs, so future_state equals the
        # current state; with discount = 0 the future term has no effect anyway.
        future_state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Update the Q matrix (add consideration of future reward)
        # self.Q[self.state][self.actions[action]] = self.Q[self.state][self.actions[action]] * (1 - learning_rate) + reward * learning_rate
        self.Q[self.state][self.actions[action]] = self.Q[self.state][self.actions[action]] + \
            learning_rate * (reward + discount * self.Q[future_state][self.actions[action]] - self.Q[self.state][self.actions[action]])
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        # Set any additional class parameters as needed
        self.trial = 1

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            # self.epsilon -= 0.05  ## To be used in default learner
            self.epsilon = -math.pow(-0.03 * self.trial, 2) + 1
            self.alpha = -math.pow(-0.02 * self.trial, 2) + 1
            if self.learning:
                self.trial += 1

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (inputs['light'], inputs['oncoming'], waypoint)

        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if state not in self.Q.keys():
                state_dict = {}
                for action in self.valid_actions:
                    state_dict[action] = 0.0
                self.Q[state] = state_dict

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in.
""" # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() action = None ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability if (self.learning and random.random() < self.epsilon) or state not in self.Q.keys(): action = random.choice(self.valid_actions) else: Q_val = self.get_maxQ(state) for a in self.valid_actions: if self.Q[state][a] == Q_val: action = a break # Otherwise, choose an action with the highest Q-value for the current state return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') if self.learning: current_Q = self.Q[state][action] self.Q[state][action] = ( 1 - self.alpha) * current_Q + self.alpha * reward return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
import random


class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.q_values = {}
        self.penalties = []
        self.penalty = 0

    def reset(self, destination=None):
        self.penalties.append(self.penalty)
        self.penalty = 0
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        curr_state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        self.state = curr_state

        old_q, action = self.choose_best_action(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Count penalties (negative rewards)
        if reward < 0:
            self.penalty += 1

        # Sense the environment, and obtain the new state
        new_inputs = self.env.sense(self)
        new_state = (self.next_waypoint, new_inputs['light'], new_inputs['oncoming'], new_inputs['left'], new_inputs['right'])

        # Update the Q table
        new_q_val = self.calculate_q_val(new_state, old_q, reward)
        self.q_values[(curr_state, action)] = new_q_val

        print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}, waypoint = {}".format(
            deadline, inputs, action, reward, self.next_waypoint))  # [debug]

    def get_max_q(self, new_state):
        q = [self.get_q_value(new_state, action) for action in Environment.valid_actions]
        max_q = max(q)
        return max_q, q

    def choose_best_action(self, state):
        max_q, q_lst = self.get_max_q(state)
        # If several actions tie for max_q, pick one of them at random
        if q_lst.count(max_q) > 1:
            max_q_lst = [i for i in range(len(Environment.valid_actions)) if q_lst[i] == max_q]
            idx = random.choice(max_q_lst)
        else:
            idx = q_lst.index(max_q)
        action = Environment.valid_actions[idx]
        return max_q, action

    def get_q_value(self, state, action):
        # Unseen (state, action) pairs default to an optimistic 1.0 to encourage exploration
        return self.q_values.get((state, action), 1.0)

    def calculate_q_val(self, new_state, old_q, reward):
        # learning rate
        alpha = 0.5
        # discount factor
        gamma = 0.25
        # learned value: the Q-learning target minus the old estimate
        learned_val = reward + gamma * self.get_max_q(new_state)[0] - old_q
        new_q = old_q + alpha * learned_val
        return new_q
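# A small standalone illustration of the optimistic default in get_q_value
# above (state tuple and numbers are made up): unseen (state, action) pairs
# start at 1.0, so against mostly smaller step rewards every untried action
# looks attractive at least once and gets explored.
q_values = {}
state = ('forward', 'green', None, None, None)
print(q_values.get((state, 'left'), 1.0))   # 1.0 before any update
q_values[(state, 'left')] = -0.5
print(q_values.get((state, 'left'), 1.0))   # -0.5 once learned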
import random


class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.state = {}
        self.learning_rate = 0.6
        self.exploration_rate = 0.1
        self.exploration_degradation_rate = 0.001
        self.discount_rate = 0.4
        self.q_values = {}
        self.valid_actions = [None, 'forward', 'left', 'right']

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        self.state = self.build_state(inputs)

        # Select action according to the policy
        action = self.choose_action_from_policy(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.update_q_value(self.state, action, reward)

        # print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]

    def build_state(self, inputs):
        return {
            "light": inputs["light"],
            "oncoming": inputs["oncoming"],
            "left": inputs["left"],
            "direction": self.next_waypoint
        }

    def choose_action_from_policy(self, state):
        if random.random() < self.exploration_rate:
            self.exploration_rate -= self.exploration_degradation_rate
            return random.choice(self.valid_actions)
        best_action = self.valid_actions[0]
        best_value = None   # start undefined so that all-negative Q-values are still compared
        for action in self.valid_actions:
            cur_value = self.q_value_for(state, action)
            if best_value is None or cur_value > best_value:
                best_action = action
                best_value = cur_value
            elif cur_value == best_value:
                best_action = random.choice([best_action, action])
        return best_action

    def max_q_value(self, state):
        max_value = None
        for action in self.valid_actions:
            cur_value = self.q_value_for(state, action)
            if max_value is None or cur_value > max_value:
                max_value = cur_value
        return max_value

    def q_value_for(self, state, action):
        q_key = self.q_key_for(state, action)
        if q_key in self.q_values:
            return self.q_values[q_key]
        return 0

    def update_q_value(self, state, action, reward):
        q_key = self.q_key_for(state, action)
        cur_value = self.q_value_for(state, action)
        inputs = self.env.sense(self)
        self.next_waypoint = self.planner.next_waypoint()
        new_state = self.build_state(inputs)
        learned_value = reward + (self.discount_rate * self.max_q_value(new_state))
        new_q_value = cur_value + (self.learning_rate * (learned_value - cur_value))
        self.q_values[q_key] = new_q_value

    def q_key_for(self, state, action):
        return "{}|{}|{}|{}|{}".format(state["light"], state["direction"],
                                       state["oncoming"], state["left"], action)
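# A standalone sketch of the flat string keys q_key_for builds above (the
# field values here are made up): each light/direction/oncoming/left/action
# combination maps to one dictionary entry, equivalent to a tuple key but
# easier to eyeball when the table is printed.
state = {"light": "green", "direction": "forward", "oncoming": None, "left": None}
key = "{}|{}|{}|{}|{}".format(state["light"], state["direction"],
                              state["oncoming"], state["left"], "forward")
print(key)   # green|forward|None|None|forward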
import random


class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=0.99, alpha=0.8):
        super(LearningAgent, self).__init__(env)      # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)   # Create a route planner
        self.valid_actions = self.env.valid_actions   # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()           # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon    # Random exploration factor
        self.alpha = alpha        # Learning factor

        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            self.epsilon = self.epsilon * 0.99

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)            # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)   # Remaining deadline

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['left'], inputs['right'], inputs['oncoming'])

        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        # Calculate the maximum Q-value of all actions for a given state
        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if state not in self.Q:
            self.Q[state] = {action: 0.0 for action in self.valid_actions}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if random.random() < self.epsilon:
                action = random.choice(self.valid_actions)
            else:
                maxQ = self.get_maxQ(state)
                best = [key for key, value in self.Q[state].items() if value == maxQ]
                action = random.choice(best)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * reward

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()           # Get current state
        self.createQ(state)                  # Create 'state' in Q-table
        action = self.choose_action(state)   # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)    # Q-learn

        return
import random


class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)      # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)   # Create a route planner
        self.valid_actions = self.env.valid_actions   # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()           # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon    # Random exploration factor
        self.alpha = alpha        # Learning factor

        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            self.epsilon = self.epsilon - 0.002

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)            # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)   # Remaining deadline

        # NOTE: you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus
        # learn about the balance between exploration and exploitation.
        # With hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])

        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        # Calculate the maximum Q-value of all actions for a given state
        stateactions = self.Q[state]
        maxQ = max(stateactions.values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if state not in self.Q:
                self.Q[state] = {'left': 0.0, 'right': 0.0, 'forward': 0.0, None: 0.0}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state.
        # Be sure that when choosing an action with the highest Q-value you randomly
        # select between actions that "tie".
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if self.epsilon > random.random():
                action = random.choice(self.valid_actions)
            else:
                maxq = self.get_maxQ(state)
                maxqactions = [act for act, qvalue in self.Q[state].items() if qvalue == maxq]
                action = random.choice(maxqactions)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = self.Q[state][action] * (1.0 - self.alpha) + self.alpha * reward

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()           # Get current state
        self.createQ(state)                  # Create 'state' in Q-table
        action = self.choose_action(state)   # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)    # Q-learn

        return
import math
import random


class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)      # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)   # Create a route planner
        self.valid_actions = self.env.valid_actions   # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()           # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon    # Random exploration factor
        self.alpha = alpha        # Learning factor

        # Set any additional class parameters as needed
        self.t = 0
        random.seed(1177)

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            self.t += 1.0
            # other schedules that were tried:
            # self.epsilon = self.epsilon - 0.05
            # self.epsilon = 1.0/(self.t**2)
            # self.epsilon = 1.0/(self.t**2 + self.alpha*self.t)
            # self.epsilon = 1.0/(self.t**2 - self.alpha*self.t)
            # self.epsilon = math.fabs(math.cos(self.alpha*self.t))
            # self.epsilon = math.fabs(math.cos(self.alpha*self.t))/(self.t**2)
            self.epsilon = math.exp(-self.alpha * self.t)

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)            # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)   # Remaining deadline

        # Set 'state' as a string of relevant data for the agent
        def xstr(s):
            return 'None' if s is None else str(s)

        state = xstr(waypoint) + "_" + inputs['light'] + "_" + xstr(inputs['left']) + "_" + xstr(inputs['oncoming'])
        if self.learning:
            self.Q[state] = self.Q.get(state, {None: 0.0, 'forward': 0.0, 'left': 0.0, 'right': 0.0})

        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        # Calculate the maximum Q-value of all actions for a given state
        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            self.Q[state] = self.Q.get(state, {None: 0.0, 'forward': 0.0, 'left': 0.0, 'right': 0.0})

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if self.epsilon > 0.01 and self.epsilon > random.random():
                action = random.choice(self.valid_actions)
            else:
                best_actions = []
                maxQ = self.get_maxQ(state)
                for act in self.Q[state]:
                    if maxQ == self.Q[state][act]:
                        best_actions.append(act)
                action = random.choice(best_actions)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = self.Q[state][action] + self.alpha * (reward - self.Q[state][action])

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()           # Get current state
        self.createQ(state)                  # Create 'state' in Q-table
        action = self.choose_action(state)   # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)    # Q-learn

        return
import math
import random


class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)      # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)   # Create a route planner
        self.valid_actions = self.env.valid_actions   # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()           # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon    # Random exploration factor
        self.alpha = alpha        # Learning factor

        # Set any additional class parameters as needed
        self.times = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            self.times = self.times + 1
            # self.epsilon = pow(0.996, self.times)
            # self.epsilon = (self.epsilon - 0.05) if (self.epsilon - 0.05) > 0 else 0
            self.epsilon = math.cos(0.0015 * self.times)

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)            # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)   # Remaining deadline

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['left'], inputs['right'], inputs['oncoming'])

        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        # Calculate the maximum Q-value of all actions for a given state
        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        # if state not in self.Q:
        #     self.Q[state] = {'left': 0.0, 'right': 0.0, 'forward': 0.0, None: 0.0}
        if self.learning:
            self.Q.setdefault(state, {action: 0.0 for action in self.valid_actions})

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state,
        # picking at random among actions that share the maximum value
        if not self.learning:
            action = random.choice(self.valid_actions)
        elif random.random() < self.epsilon:
            action = random.choice(self.valid_actions)
        else:
            maxQ_value = self.get_maxQ(self.state)
            # extract the actions sharing the maximum value into a waiting list
            same_maxQ = [key for key, value in self.Q[self.state].items() if value == maxQ_value]
            # choose an action randomly from the waiting list
            action = random.choice(same_maxQ)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * reward

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()           # Get current state
        self.createQ(state)                  # Create 'state' in Q-table
        action = self.choose_action(state)   # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)    # Q-learn

        return
import math
import random


class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)      # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)   # Create a route planner
        self.valid_actions = self.env.valid_actions   # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()           # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon    # Random exploration factor
        self.alpha = alpha        # Learning factor

        self.trial_count = 0                  # Used as input for the epsilon decay functions
        self.overall_states = 384             # Size of Q-table = combinations of all features = 2 * 4 * 4 * 4 * 3
        self.overall_state_action = 384 * 4   # Combinations of all possible states x valid actions = 1536
        self.state_action_count = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        # Update epsilon using a decay function of your choice

        # Linear: a = 0.05, 20 training trials
        def decay_linear(a):
            self.epsilon -= a

        # Polynomial: 20 training trials
        def decay_frac():
            self.epsilon = 1.0 / self.trial_count ** 2

        # Exponential: a = 0.995, 600 training trials
        def decay_exponential(a):
            self.epsilon = a ** self.trial_count

        def decay_exponential_e(a):
            self.epsilon = math.e ** (-a * self.trial_count)

        def decay_cosine(a):
            self.epsilon = math.cos(a * self.trial_count)

        # Hold epsilon at 1 during all training trials
        # (in order to explore state-action combinations more efficiently)
        def decay_step(total_trials):
            if self.trial_count > total_trials:
                self.epsilon = 0.0

        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0, making sure the agent
        # retrieves from the Q-table instead of doing a random walk.
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            self.trial_count += 1
            decay_exponential(0.995)
            # decay_linear(0.05)
            # decay_step(600)

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)            # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)   # Remaining deadline

        # Pick 5 base features; the state is the combination of all their values,
        # 384 in total (waypoint only ever takes 3 values).
        state = (inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'], waypoint)

        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        # Calculate the maximum Q-value of all actions for a given state
        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if state not in self.Q:
                self.Q[state] = {None: 0.0, 'forward': 0.0, 'left': 0.0, 'right': 0.0}  # Init default Q score.

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state.
        # "Choose a random action with epsilon probability" means: there are two
        # options, opt1 picks any action at random and opt2 picks the action with the
        # highest Q-value. epsilon lies in [0, 1] and shrinks over time; if epsilon is
        # 0.7, opt1 happens with probability 0.7 and opt2 with probability 0.3.
        if not self.learning:
            action = random.choice(self.valid_actions)  # Pick a random action from all 4 valid actions.
        else:
            if random.random() < self.epsilon:
                action = random.choice(self.valid_actions)  # Pick a random action from all 4 valid actions.
            else:
                # action = max(self.Q[state], key=self.Q[state].get)  # Key with max Q score as action.
                # Several actions may share the maximum value; pick one of them at random.
                max_Q = max(self.Q[state].values())
                max_candidate = [x for x in self.Q[state] if self.Q[state][x] == max_Q]
                action = random.choice(max_candidate)

        # Hard-coded driving logic, for fun:
        # if inputs['light'] == 'red':
        #     if self.next_waypoint == 'right' and inputs['oncoming'] != 'left' and inputs['left'] != 'forward':
        #         action = self.next_waypoint
        #     else:
        #         action = None
        # else:
        #     if self.next_waypoint == 'left' and (inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right'):
        #         action = 'forward'
        #     else:
        #         action = self.next_waypoint

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            prev_Q = self.Q[state][action]
            self.Q[state][action] = prev_Q * (1 - self.alpha) + reward * self.alpha

            # Report learning progress: coverage of states and of state-action pairs.
            if prev_Q == 0 and reward != 0:
                self.state_action_count += 1
            print('Trial Count =', self.trial_count)
            print('Q-Table Size = {} / {}'.format(len(self.Q), self.overall_states))
            print('Q-Table Non-zero Item Count = {} / {}'.format(self.state_action_count, self.overall_state_action))

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()           # Get current state
        self.createQ(state)                  # Create 'state' in Q-table
        action = self.choose_action(state)   # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)    # Q-learn

        return
import math
import random


class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)      # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)   # Create a route planner
        self.valid_actions = self.env.valid_actions   # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()           # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon    # Random exploration factor
        self.alpha = alpha        # Learning factor

        # Set any additional class parameters as needed
        self.time_step = 1

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.alpha = 0
            self.epsilon = 0
        else:
            self.alpha = 0.007
            self.epsilon = math.exp(-self.time_step * self.alpha)
            self.time_step = self.time_step + 1

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)            # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)   # Remaining deadline

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'])

        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        # Calculate the maximum Q-value of all actions for a given state
        st = self.Q[state]
        # adapted from Lucretiel's answer here:
        # http://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
        max_key = max(st, key=lambda key: st[key])

        return st[max_key]

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if state not in self.Q:
                self.Q[state] = {None: 0, 'forward': 0, 'left': 0, 'right': 0}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        if (not self.learning) or (self.epsilon > random.random()):
            return random.choice(self.valid_actions)
        else:
            max_val = self.get_maxQ(state)
            # collect the actions that share the maximum value
            acts = [act for act, q in self.Q[state].items() if q == max_val]
            return random.choice(acts)

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * reward

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()           # Get current state
        self.createQ(state)                  # Create 'state' in Q-table
        action = self.choose_action(state)   # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)    # Q-learn

        return
import math
import random


class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)      # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)   # Create a route planner
        self.valid_actions = self.env.valid_actions   # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()           # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon    # Random exploration factor
        self.alpha = alpha        # Learning factor

        # Set any additional class parameters as needed
        self.trial = 1
        self.epsilonDecayVariable = epsilon
        self.alphaDecayVariable = alpha

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        # Update epsilon using a decay function of your choice
        # self.epsilon = self.epsilon - .05
        # epsilon = e^(-a*t), for 0 < a < 1:
        # self.epsilon = math.exp(-self.epsilonDecayVariable * self.trial)
        # epsilon = cos(a*t), for 0 < a < 1:
        # self.epsilon = math.cos(self.epsilonDecayVariable * self.trial)
        trialNum = self.trial
        ParmA = -5E-11
        ParmB = 0.104
        ParmC = 210
        self.epsilon = 1 * math.exp(-ParmA * trialNum) / (1 + math.exp(ParmB * (trialNum - ParmC)))
        # self.epsilon = math.exp(-0.0005*self.trial) * (4 + math.cos(.7*self.trial)) / 5  # = EXP(-lambda*A1)*(4+COS(freq*A1))/5
        # decay learning rate
        # self.alpha = .75  # math.exp(-self.alphaDecayVariable*self.trial)
        # self.epsilon = math.pow(self.epsilonDecayVariable, self.trial)

        # Update additional class parameters as needed
        # (possible additions: time remaining, success rate, etc.)
        self.trial = self.trial + 1

        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)            # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)   # Remaining deadline

        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        # If it is not, create a dictionary in the Q-table for the current 'state'
        # For each action, set the Q-value for the state-action pair to 0
        state = (inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'], waypoint)
        if self.learning:
            if state not in self.Q:  # state is not in the Q-table yet
                self.Q[state] = {None: 0, 'left': 0, 'right': 0, 'forward': 0}

        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        # Get the highest Q-value from the dict of actions for this state
        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if state not in self.Q:
                self.Q[state] = {None: 0, 'left': 0, 'right': 0, 'forward': 0}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        if not self.learning:
            action = random.choice(self.valid_actions)
        elif random.random() < self.epsilon:
            # the random draw fell within the epsilon probability, so choose randomly
            action = random.choice(self.valid_actions)
        else:
            maxQ = self.get_maxQ(state)
            best = [act for act, q in self.Q[state].items() if q == maxQ]
            action = random.choice(best)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + (reward * self.alpha)

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()           # Get current state
        self.createQ(state)                  # Create 'state' in Q-table
        action = self.choose_action(state)   # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)    # Q-learn

        return
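# A standalone sketch of the schedule in reset above (ParmA/ParmB/ParmC copied
# from there): with ParmA ~ -5e-11 the exp(-ParmA*t) factor is effectively 1,
# so epsilon follows a logistic drop 1 / (1 + exp(0.104*(t - 210))) -- near 1
# early on, exactly 0.5 at trial 210, and close to 0 soon after.
import math
A, B, C = -5e-11, 0.104, 210
for t in (1, 100, 210, 300, 400):
    print(t, round(math.exp(-A * t) / (1 + math.exp(B * (t - C))), 4))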
import math
import random


class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5,
                 epsilon_threshold=0.005, a=0.5, experiment="na"):
        super(LearningAgent, self).__init__(env)      # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)   # Create a route planner
        self.valid_actions = self.env.valid_actions   # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()           # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon    # Random exploration factor
        self.alpha = alpha        # Learning factor

        # Set any additional class parameters as needed
        self.epsilon_threshold = epsilon_threshold
        self.t = 0
        self.a = a
        self.experiment = experiment

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            if self.experiment == "2":
                self.epsilon = self.a ** self.t
            elif self.experiment == "3":
                if self.t == 0:
                    self.t = 1.0
                self.epsilon = 1.0 / (self.t ** 2)
            elif self.experiment == "4":
                self.epsilon = math.exp(-self.a * self.t)
            elif self.experiment == "5":
                self.epsilon = math.cos(self.a * self.t)
            else:
                self.epsilon = self.epsilon - self.epsilon_threshold
            self.t = self.t + 1.0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)            # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)   # Remaining deadline

        # NOTE: you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus
        # learn about the balance between exploration and exploitation.
        # With hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        # (the inputs dict is only ever used through key_state below,
        # which flattens it into a hashable string)
        state = (waypoint, inputs)

        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        # Calculate the maximum Q-value of all actions for a given state
        maxQ = max(self.Q[self.key_state(state)].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if self.Q.get(self.key_state(state)) is None:
                actions_ = {}
                for a in self.valid_actions:
                    actions_[a] = 0.0
                self.Q[self.key_state(state)] = actions_

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state.
        # Be sure that when choosing an action with the highest Q-value you randomly
        # select between actions that "tie".
        if not self.learning:
            return random.choice(self.valid_actions)

        if self.epsilon > 0.01 and self.epsilon > random.random():
            action = random.choice(self.valid_actions)
        else:
            max_value = self.get_maxQ(state)
            actions_ = [action_ for (action_, value_) in self.Q[self.key_state(state)].items()
                        if value_ == max_value]
            action = random.choice(actions_)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            s_a = self.Q[self.key_state(state)][action]
            self.Q[self.key_state(state)][action] = (1 - self.alpha) * s_a + self.alpha * reward

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()           # Get current state
        self.createQ(state)                  # Create 'state' in Q-table
        action = self.choose_action(state)   # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)    # Q-learn

        return

    def key_state(self, state):
        """ Receives a state and returns a string that serves as a key for the Q-table. """
        return "{0}_LI:{1}_O:{2}_R:{3}_L:{4}".format(state[0], state[1]['light'],
                                                     state[1]['oncoming'], state[1]['right'],
                                                     state[1]['left'])
import random


class QLearningAgent(Agent):
    """An agent that learns to drive using Q-learning."""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)
        self.color = 'red'
        self.planner = RoutePlanner(self.env, self)
        self.deadline = self.env.get_deadline(self)
        self.next_waypoint = None
        self.moves = 0
        self.qDict = dict()
        self.alpha = 0.9     # learning rate
        self.epsilon = 0.05  # probability of flipping the coin
        self.gamma = 0.2     # discount factor
        self.state = None
        self.new_state = None
        self.reward = None
        self.cum_reward = 0
        self.possible_actions = Environment.valid_actions
        self.action = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        self.next_waypoint = None
        self.moves = 0
        self.state = None
        self.new_state = None
        self.reward = None   # None (not 0) so that no update happens on the first step of a trial
        self.cum_reward = 0
        self.action = None

    def getQvalue(self, state, action):
        key = (state, action)
        return self.qDict.get(key, 5.0)   # optimistic default for unseen pairs

    def getMaxQ(self, state):
        q = [self.getQvalue(state, a) for a in self.possible_actions]
        return max(q)

    def get_action(self, state):
        """ epsilon-greedy approach to choose an action given the state """
        if random.random() < self.epsilon:
            action = random.choice(self.possible_actions)
        else:
            q = [self.getQvalue(state, a) for a in self.possible_actions]
            if q.count(max(q)) > 1:
                best_actions = [i for i in range(len(self.possible_actions)) if q[i] == max(q)]
                index = random.choice(best_actions)
            else:
                index = q.index(max(q))
            action = self.possible_actions[index]
        return action

    def qlearning(self, state, action, nextState, reward):
        """ use the Q-learning algorithm to update q values """
        key = (state, action)
        if key not in self.qDict:
            # initialize the q value on first visit
            self.qDict[key] = 5.0
        else:
            self.qDict[key] = self.qDict[key] + self.alpha * (
                reward + self.gamma * self.getMaxQ(nextState) - self.qDict[key])

    def update(self, t):
        self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # take the following variables as states:
        # - environment (oncoming, left, right)
        # - next_waypoint
        # - traffic light
        self.new_state = inputs
        self.new_state['next_waypoint'] = self.next_waypoint
        self.new_state = tuple(sorted(self.new_state.items()))

        # for the current state, choose an action based on the epsilon policy
        action = self.get_action(self.new_state)

        # observe the reward
        new_reward = self.env.act(self, action)

        # update the q value of the *previous* step, now that its successor state is known
        if self.reward is not None:
            self.qlearning(self.state, self.action, self.new_state, self.reward)

        # set the state to the new state
        self.action = action
        self.state = self.new_state
        self.reward = new_reward
        self.cum_reward = self.cum_reward + new_reward
        self.moves = self.moves + 1
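# Toy numbers (made up) for the update rule in qlearning above,
# Q <- Q + alpha * (reward + gamma * maxQ(next) - Q): with the optimistic 5.0
# initialisation, the first visit only creates the entry, and later visits
# pull the value toward the discounted target.
alpha, gamma = 0.9, 0.2
q, reward, max_next = 5.0, 2.0, 5.0
q = q + alpha * (reward + gamma * max_next - q)
print(round(q, 2))   # 3.2: even a positive reward pulls the optimistic value down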
import math
import random

from environment import Agent
from planner import RoutePlanner


class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)     # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()           # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon    # Random exploration factor
        self.alpha = alpha        # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.trial_num = 1  # Trial counter used by the decay schedule in reset()

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        # Logarithmic decay: scale epsilon by the ratio of successive log terms,
        # so it shrinks slowly on early trials and approaches 0 as the trial
        # count approaches nr.
        if self.epsilon > 0.0:
            nr = 10000.0
            self.epsilon = self.epsilon * math.log(nr / (self.trial_num + 1)) / math.log(nr / self.trial_num)
        else:
            self.epsilon = 0.0

        # Tie the learning rate to the exploration rate, capped at 1.0
        self.alpha = min(self.epsilon + 0.2, 1.0)

        self.trial_num += 1

        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)            # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)   # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # NOTE: you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have
        # placed constraints in order for you to learn how to adjust epsilon and alpha,
        # and thus learn about the balance between exploration and exploitation.
        # With hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        maxQ = None
        if state in self.Q:
            maxQ = max(self.Q[state].values())
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if state not in self.Q:
            self.Q[state] = {"left": 0.0, "right": 0.0, "forward": 0.0, None: 0.0}
            # Optimistic initialization: state[0] is the waypoint, so the agent is
            # biased toward following the planner before it has learned anything.
            self.Q[state][state[0]] = 1.0
        return self.Q

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing the action with the highest Q-value, you
        # randomly select between actions that "tie".
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(self.valid_actions)
        if not self.learning:
            return random.choice(self.valid_actions)

        # Collect the best value and every action that ties for it: [value, action, ...]
        maxQ = None
        for action, value in self.Q[state].iteritems():
            if maxQ is None:
                maxQ = [value, action]
            elif maxQ[0] == value:
                maxQ.append(action)
            elif maxQ[0] < value:
                maxQ = [value, action]
        return random.choice(maxQ[1:])

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = self.alpha * reward + (1 - self.alpha) * self.Q[state][action]
        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent state,
            choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()           # Get current state
        self.createQ(state)                  # Create 'state' in Q-table
        action = self.choose_action(state)   # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)    # Q-learn

        return
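The logarithmic decay used in reset() above has a convenient closed form: the per-trial factor log(nr/(t+1)) / log(nr/t) telescopes, so epsilon at trial t collapses to roughly epsilon_0 * log(nr/t) / log(nr). A minimal standalone sketch of that schedule follows; the helper name decayed_epsilon and the sampled trial numbers are illustrative, not part of the agent.

import math

def decayed_epsilon(trial, epsilon_0=1.0, nr=10000.0):
    # Closed form of the telescoped product: falls linearly in log(trial)
    # and reaches 0.0 when trial equals nr.
    return max(0.0, epsilon_0 * math.log(nr / trial) / math.log(nr))

for trial in (1, 10, 100, 1000, 10000):
    print("trial %5d -> epsilon %.3f" % (trial, decayed_epsilon(trial)))

With nr = 10000.0 this prints 1.000, 0.750, 0.500, 0.250, 0.000, which shows why the agent needs several hundred trials before exploitation starts to dominate exploration.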
import random

from environment import Agent
from planner import RoutePlanner


class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}
        self.policy = {}
        self.alpha = 0.4
        self.gamma = 0.3
        self.actions = [None, 'forward', 'right', 'left']

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # The state is the current environment, i.e. the traffic light and the
        # intersection situation, plus the location of the next waypoint.
        # (Hardcoding states is a bad idea, as there are many of them.)
        # Note: self.state and inputs share the same dict here, so inputs also
        # gains the 'next_waypoint' key before being printed below.
        self.state = inputs
        self.state['next_waypoint'] = self.next_waypoint
        self.state = tuple(self.state.items())

        # TODO: Select action according to your policy
        # Select a random action if the state is not in the policy, i.e. the agent
        # has not seen this state yet.
        action = self.actions[random.randrange(4)] if self.state not in self.policy else self.policy[self.state]

        reward = self.env.act(self, action)

        state_prime = self.env.sense(self)
        state_prime['next_waypoint'] = self.planner.next_waypoint()
        state_prime = tuple(state_prime.items())

        # We now have every input to the Q-learning formula: state, action, reward,
        # and next state. Next we adjust the Q-value for the present state-action
        # pair and then choose the action that maximizes Q for the state (the
        # greedy policy). The formula needs the utility of the next state,
        # max_a' Q(s', a'). The policy holds argmax_a Q(s, a), so as Q-learning
        # converges, a policy lookup yields max_a' Q(s', a') whenever the policy
        # has seen that state; there is no need to recompute it on every step,
        # which would otherwise recurse indefinitely.
        action_prime = self.actions[random.randrange(4)] if state_prime not in self.policy else self.policy[state_prime]

        # TODO: Learn policy based on state, action, reward
        # Q-learning update
        state_act_pair = (self.state, action)
        self.Q[state_act_pair] = (1 - self.alpha) * self.Q.get(state_act_pair, 0) \
            + self.alpha * (reward + self.gamma * self.Q.get((state_prime, action_prime), 0))

        # Update the policy: take the action that maximizes the utility of the
        # present state.
        Q_max_candidates = [self.Q.get((self.state, act)) for act in self.actions]
        try:
            if None not in Q_max_candidates:
                # Performs very poorly (11/100) when not all actions for a state
                # have been seen: the policy should not be updated until every
                # action for the state has been tried, or the agent keeps following
                # the same policy and never learns further. An alternative is the
                # exploration-exploitation method, where the policy action is taken
                # with probability 1-e and a random action with probability e.
                Q_max = max(Q_max_candidates)
                actions_candidates = [act for act in self.actions if Q_max == self.Q.get((self.state, act))]
                self.policy[self.state] = random.choice(actions_candidates)
        except Exception as e:
            print "Could not update the policy by choosing the max Q: {}".format(e)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
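The comment in the policy-update branch above points to epsilon-greedy selection as the standard fix for a policy that stops improving once set. A minimal sketch of that rule, assuming the same policy dict and actions list as the class; the helper name epsilon_greedy is hypothetical, not part of the original agent.

import random

def epsilon_greedy(state, policy, actions, epsilon=0.1):
    # Explore with probability epsilon (or when the state is unseen),
    # otherwise exploit the learned policy.
    if state not in policy or random.random() < epsilon:
        return random.choice(actions)
    return policy[state]

Swapping this in for the bare policy lookup in update() would keep the agent sampling under-explored actions instead of locking onto the first action that happens to score well.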
import random

from environment import Agent
from planner import RoutePlanner


class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.state = None
        self.q_table = {}
        self.frame = 0
        self.last_state_tuple = None
        self.last_action = None
        self.alpha = .5

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.last_state_tuple = None
        self.last_action = None

    def update(self, t):
        self.frame += 1

        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # e.g. left, forward, right
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        env_state = self.env.sense(self)
        state_tuple = (env_state['light'], self.next_waypoint)
        self.state = "Light: %s, Waypoint: %s" % state_tuple

        # TODO: Select action according to your policy
        # TODO: Learn policy based on state, action, reward

        # Default to a random action
        action = random.choice([None, 'forward', 'left', 'right'])

        # Look for an action prescribed by the Q-table
        if state_tuple in self.q_table:
            # Calculate epsilon: a linear decay that starts at 0.5.
            # n_trials is assumed to be a module-level global set where the
            # simulation is run.
            epsilon = 1 - (((1. / (n_trials * 30.)) * self.frame) + .5)
            if epsilon >= 1.:
                epsilon = .99
            print "EP: ", epsilon

            # Randomly decide whether to act randomly, based on epsilon
            roll = random.random()
            if roll < epsilon:
                reward = self.env.act(self, action)
                self.q_table[state_tuple].append((action, reward))
            else:
                # Act according to the highest recorded Q-value for the state
                # (keeps the random default if no recorded value exceeds 0)
                q_temp = 0
                for recorded_action, val in self.q_table[state_tuple]:
                    if val > q_temp:
                        q_temp = val
                        action = recorded_action
                reward = self.env.act(self, action)
        else:
            # The Q-value for a state is "initialized" during the simulation
            # (i.e. when the state/action pair is first encountered).
            # It is initialized to the immediate reward.
            reward = self.env.act(self, action)
            self.q_table[state_tuple] = [(action, reward)]

        # Update the Q-value for the last state
        if self.last_state_tuple is not None:
            for c, (last_act, last_val) in enumerate(self.q_table[self.last_state_tuple]):
                if last_act == self.last_action:
                    updated_val = (last_val * (1 - self.alpha)) + (self.alpha * reward)
                    self.q_table[self.last_state_tuple][c] = (last_act, updated_val)

        # Save the last state_tuple and action, so they can be updated during the
        # next iteration
        self.last_state_tuple = state_tuple
        self.last_action = action

        print "FRAME: ", self.frame
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, env_state, action, reward)  # [debug]
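This last agent stores its Q-table as a list of (action, reward) tuples per state, so the exploration branch can append duplicate entries for the same action and every greedy lookup is a linear scan. A sketch of the more conventional dict-of-dicts layout, under the same state and action conventions as above; the helper names are illustrative, not part of the original agent.

VALID_ACTIONS = [None, 'forward', 'left', 'right']

def init_state(q_table, state_tuple):
    # Create exactly one Q-value slot per action the first time a state is seen.
    if state_tuple not in q_table:
        q_table[state_tuple] = dict((a, 0.0) for a in VALID_ACTIONS)

def best_action(q_table, state_tuple):
    # Highest-valued action for the state; ties go to whichever max() sees first.
    q_values = q_table[state_tuple]
    return max(q_values, key=q_values.get)

def q_update(q_table, state_tuple, action, reward, alpha=0.5):
    # Blend the immediate reward into the stored value (no discount term,
    # matching the agent above).
    q_table[state_tuple][action] = (1 - alpha) * q_table[state_tuple][action] + alpha * reward

One entry per action means the update touches a single slot instead of scanning the whole list, and duplicate (action, reward) rows can never accumulate.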