class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world by using Q-Learning."""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.learner = QLearn(epsilon=1, alpha=0.3, gamma=0.6)

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Select action according to the learned policy
        action = self.learner.action(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward: sense the resulting
        # state and pass the full transition to the learner
        next_inputs = self.env.sense(self)
        self.future_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], self.future_next_waypoint)
        self.learner.learn(self.state, action, next_state, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.alpha = 0.2
        self.Qtable = {}

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.reward = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state to inform the driving agent
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'], self.next_waypoint)

        # Select action according to the policy
        valid_actions = [None, 'forward', 'left', 'right']

        # Get the Q-value of every action in the current state (default 0)
        all_actions = {action: self.Qtable.get((self.state, action), 0) for action in valid_actions}
        # Find the actions that maximize the Q-value
        best_actions = [action for action in valid_actions if all_actions[action] == max(all_actions.values())]
        # Break ties randomly among the best actions
        action = random.choice(best_actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward.
        # To simplify the problem, gamma is set to 0 (no future-reward term).
        self.Qtable[(self.state, action)] = \
            (1 - self.alpha) * self.Qtable.get((self.state, action), 0) + self.alpha * reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class BasicAgent(Agent):

    def __init__(self, env):
        super(BasicAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.color = 'red'  # override color

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Choose a random action and execute it
        action = random.choice(self.env.valid_actions)
        reward = self.env.act(self, action)

        # Update the current observed state
        light = inputs['light']
        oncoming = inputs['oncoming']
        left = inputs['left']
        right = inputs['right']
        self.state = (light, oncoming, left, right, self.next_waypoint)

        print "Basic.update(): next move = {}".format(action)  # [debug]
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world by using Q-Learning."""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.possible_actions = Environment.valid_actions
        self.ai = QLearn(epsilon=0.05, alpha=0.1, gamma=0.9)

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Index the sensed inputs by key rather than by items() position, since
        # dict ordering is not guaranteed; positions 0, 1, 3 presumably meant
        # light, oncoming, and right
        self.state = (inputs['light'], inputs['oncoming'], inputs['right'], self.next_waypoint)

        action = self.ai.Q_move(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Sense the resulting state and feed the transition back to the learner
        inputs2 = self.env.sense(self)
        next_state = (inputs2['light'], inputs2['oncoming'], inputs2['right'], self.next_waypoint)
        self.ai.Q_post(self.state, action, next_state, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, learning_rate=0.6, discount_factor=0.35):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.alpha = learning_rate
        self.gamma = discount_factor
        self.state = None
        self.Q_values = self.initialize_Q_values()

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def initialize_Q_values(self, val=10.0):
        # Optimistic initialization: start every Q-value high to encourage exploration
        Q_values = {}
        for waypoint in ['left', 'right', 'forward']:
            for light in ['red', 'green']:
                for action in self.env.valid_actions:
                    Q_values[((waypoint, light), action)] = val
        return Q_values

    def find_max_Q(self, state):
        action = None
        max_Q = 0.0
        for a in self.env.valid_actions:
            Q = self.Q_values[(state, a)]
            if Q > max_Q:
                action = a
                max_Q = Q
        return (max_Q, action)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = (self.next_waypoint, inputs['light'])

        # Select the greedy action under the current Q-values
        (Q, action) = self.find_max_Q(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        if next_waypoint is None:
            # Destination reached: there is no successor state in the table
            Q_prime = 0.0
        else:
            state_prime = (next_waypoint, inputs['light'])
            (Q_prime, action_prime) = self.find_max_Q(state_prime)
        Q += self.alpha * (reward + self.gamma * Q_prime - Q)
        self.Q_values[(self.state, action)] = Q

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state. Note: sense() reports only the light and traffic, so
        # location and heading are read from the environment's agent state.
        self.state = self.env.agent_states[self]['location']
        self.heading = self.env.agent_states[self]['heading']

        # Random action
        action = random.choice(Environment.valid_actions)
        reward = self.env.act(self, action)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state: fold the waypoint into the sensed inputs and use the
        # sorted (key, value) pairs as a hashable state representation
        inputs['waypoint'] = self.next_waypoint
        self.state = tuple(sorted(inputs.items()))

        # Select a random action
        actions = [None, 'forward', 'left', 'right']
        action = random.choice(actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Actions
        self.A = ['forward', 'left', 'right', None]  # all the actions available
        self.trial = 0
        # Initialize Q table keyed by (light, oncoming, next_waypoint)
        self.Q = {}
        for i in ['green', 'red']:                        # possible lights
            for j in [None, 'forward', 'left', 'right']:  # possible oncoming agents
                for k in ['forward', 'left', 'right']:    # possible next_waypoints
                    self.Q[(i, j, k)] = [1] * len(self.A)

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = (inputs['light'], inputs['oncoming'], self.next_waypoint)

        # Find the index of the action with the max Q-value for the current state
        max_Q = self.Q[self.state].index(max(self.Q[self.state]))

        # Epsilon-greedy action selection: p is drawn from {0..4}, so p < 1.5
        # makes the agent act randomly 40% of the time
        p = random.randrange(0, 5)
        epsilon = 1.5
        if p < epsilon:
            action = random.choice(self.env.valid_actions)
        else:
            action = self.A[max_Q]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        gamma = 0.30
        alp_tune = 500  # tuning parameter
        alpha = 1 / (1.1 + self.trial / float(alp_tune))  # decaying learning rate (float division)
        self.trial += 1

        # Get the next state s' for Q(s', a')
        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'], next_next_waypoint)
        if next_state in self.Q:
            future_reward = max(self.Q[next_state])
        else:
            future_reward = 0.0  # destination reached: no table entry for a None waypoint

        # Update Q table
        self.Q[self.state][self.A.index(action)] = \
            (1 - alpha) * self.Q[self.state][self.A.index(action)] + \
            alpha * (reward + gamma * future_reward)
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        # simple route planner to get next_waypoint
        self.planner = RoutePlanner(self.env, self)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs from route planner (also displayed by simulator)
        self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = (self.next_waypoint, inputs['light'])
        # Alternative: self.state = (self.next_waypoint, inputs['oncoming'] is None or inputs['light'] == 'green')

        # Select a random action (Question 1)
        actions = [None, 'forward', 'left', 'right']
        action = random.choice(actions)

        # Execute action and get reward
        reward = self.env.act(self, action)
class LearningAgent(Agent):

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.next_waypoint = None
        self.total_reward = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.next_waypoint = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Select action according to the policy. The helpers get_my_location,
        # get_next_waypoint_given_location, and check_if_action_is_ok are
        # assumed to be defined elsewhere on this class.
        loc, hed = self.get_my_location()
        action = self.get_next_waypoint_given_location(loc, hed)
        action_okay = self.check_if_action_is_ok(inputs)
        if not action_okay:
            action = None

        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.q_matrix = {}
        self.alpha = 0.95
        self.gamma = 0.05
        self.explore = 0.05
        self.state = None

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        deadline = self.env.get_deadline(self)
        state = self.observe_state()
        self.state = state
        knowledge = self.knowledge_from_state(state)

        # Select action epsilon-greedily (OPTIONS is the module-level list of
        # valid actions, defined elsewhere)
        if random() < self.explore:
            action = choice(OPTIONS)
        else:
            action = sorted(knowledge.items(), key=lambda x: x[1], reverse=True)[0][0]
        old_value = knowledge[action]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Q-learning update from the observed transition
        new_state = self.observe_state()
        new_state_knowledge = self.knowledge_from_state(new_state)
        self.q_matrix[state][action] = (1 - self.alpha) * old_value + \
            self.alpha * (reward + self.gamma * max(new_state_knowledge.values()))

        # print "LearningAgent.update(): deadline = {}, state = {}, action = {}, reward = {}".format(deadline, state, action, reward)  # [debug]

    def knowledge_from_state(self, state):
        """Gets what we know about the given state."""
        if state not in self.q_matrix:
            self.q_matrix[state] = {option: 0 for option in OPTIONS}
        return self.q_matrix[state]

    def observe_state(self):
        """Observe the current state."""
        inputs = self.env.sense(self)
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        return (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state

        # TODO: Select action according to your policy
        action = None

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.prevReward = 0
        self.prevAction = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.prevReward = 0
        self.prevAction = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Determine which actions obey the traffic rules in the current state.
        # Note: on a red light with oncoming traffic turning left, the list
        # stays empty and the agent falls back to a fully random action.
        validActions = []
        if inputs['light'] == 'red':
            if inputs['oncoming'] != 'left':
                validActions = ['right', None]
        else:
            if inputs['oncoming'] == 'forward':
                validActions = ['forward', 'right']
            else:
                validActions = ['right', 'forward', 'left']

        # Select action according to the policy
        if validActions:
            action = random.choice(validActions)
        else:
            action = random.choice(Environment.valid_actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Record the latest state, action, and reward
        if action is not None:
            self.state = (inputs, self.next_waypoint, deadline)
            self.prevAction = action
            self.prevReward = reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.qtable = defaultdict(dict)
        self.epsilon = .4  # Randomness factor
        self.alpha = .4    # Learning rate

    def reset(self, destination=None):
        # Prepare for a new trip; reset any variables here, if required
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = self.define_state(inputs, self.next_waypoint)

        # Select action according to policy
        action = self.select_action()

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.update_qtable(action, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]

    def select_action(self):
        if random.random() < self.epsilon or self.state not in self.qtable:
            action = random.choice(self.env.valid_actions)
        else:
            action = max(self.qtable[self.state], key=self.qtable[self.state].get)
        if self.state not in self.qtable:
            for a in self.env.valid_actions:  # init action:reward dict
                self.qtable[self.state][a] = 0
        return action

    def update_qtable(self, action, reward):
        old_q = self.qtable[self.state].get(action, 0)
        self.qtable[self.state][action] = ((1 - self.alpha) * old_q) + (self.alpha * reward)
        # Decay alpha and epsilon until they reach 0.001 and 0.0, respectively
        self.alpha = max(0.001, self.alpha - 0.002)
        self.epsilon = max(0.0, self.epsilon - 0.002)

    @staticmethod
    def define_state(inputs, next_waypoint):
        return tuple(inputs.values()) + (next_waypoint, )
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint

        # Enumerate every state and give each (state, action) pair a random initial Q-value
        valid_waypoints = ['forward', 'right', 'left']
        valid_lights = ['red', 'green']
        valid_actions = ['forward', 'right', 'left', None]
        self.Q_hat = {}
        states = [(next_waypoint, light, oncoming, right, left)
                  for next_waypoint in valid_waypoints
                  for light in valid_lights
                  for oncoming in valid_actions
                  for right in valid_actions
                  for left in valid_actions]
        states.append('Destination')
        for state in states:
            self.Q_hat[state] = {}
            for action in valid_actions:
                self.Q_hat[state][action] = random.random()

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        if self.next_waypoint is None:
            self.state = 'Destination'
        else:
            self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'])

        # Select action epsilon-greedily (choice here is numpy.random.choice,
        # which accepts the probability vector p)
        valid_actions = ['forward', 'right', 'left', None]
        epsilon = 0.1  # 0.01
        best_action = max(self.Q_hat[self.state], key=self.Q_hat[self.state].get)
        random_action = choice(valid_actions)
        action = choice([best_action, random_action], p=[1 - epsilon, epsilon])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        new_next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        new_inputs = self.env.sense(self)
        if new_next_waypoint is None:
            new_state = 'Destination'  # destination reached: map to the terminal state
        else:
            new_state = (new_next_waypoint, new_inputs['light'], new_inputs['oncoming'], new_inputs['right'], new_inputs['left'])
        alpha = 0.5  # opt 0.7
        gamma = 0.5  # opt 0.1
        max_Qhat_ahat = max(self.Q_hat[new_state].values())
        self.Q_hat[self.state][action] = (1 - alpha) * self.Q_hat[self.state][action] + alpha * (reward + gamma * max_Qhat_ahat)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.actions = [None, 'forward', 'left', 'right']
        self.learning_rate = 0.3
        self.state = None
        self.q = {}
        self.trips = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; stop learning after 100 trips
        self.trips += 1
        if self.trips >= 100:
            self.learning_rate = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        state = (inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'], self.next_waypoint)
        self.state = state

        # Lazy initialization of Q-values
        for action in self.actions:
            if (state, action) not in self.q:
                self.q[(state, action)] = 1

        # Select action according to the policy
        probabilities = [self.q[(state, None)], self.q[(state, 'forward')],
                         self.q[(state, 'left')], self.q[(state, 'right')]]
        # Use the softmax function so that the values actually behave like probabilities
        probabilities = np.exp(probabilities) / np.sum(np.exp(probabilities), axis=0)

        # Float division so adventurousness decays smoothly from 1 to 0 over 100 trips
        adventurousness = max((100 - self.trips) / 100.0, 0)
        if random.random() < adventurousness:
            action = np.random.choice(self.actions, p=probabilities)
        else:
            action = self.actions[np.argmax(probabilities)]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.q[(state, action)] = self.learning_rate * reward + (1 - self.learning_rate) * self.q[(state, action)]

        # print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]

    def get_state(self):
        return self.state
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.Qtable = {}
        self.alpha = 0.8
        self.gamma = 0.6

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def init_state_entries(self, state):
        """Ensure every (state, action) pair has a Q-table entry."""
        for a in [None, 'left', 'forward', 'right']:
            if (state, a) not in self.Qtable:
                self.Qtable[(state, a)] = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        old_state = self.state
        self.init_state_entries(old_state)

        # Select action according to the policy, breaking ties randomly among
        # the actions with the highest Q-value
        max_q = max(self.Qtable[(old_state, a)] for a in [None, 'left', 'forward', 'right'])
        action = random.choice([a for a in ['left', 'forward', 'right', None]
                                if self.Qtable[(old_state, a)] == max_q])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        inputs = self.env.sense(self)
        self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        new_state = self.state
        self.init_state_entries(new_state)

        self.Qtable[(old_state, action)] = (1 - self.alpha) * self.Qtable[(old_state, action)] + \
            self.alpha * (reward + self.gamma * max(self.Qtable[(new_state, a)] for a in [None, 'left', 'forward', 'right']))
class BasicAgent(Agent):
    """A basic agent upon which to build learning agents."""

    def __init__(self, env):
        super(BasicAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.qvals = {}  # mapping (state, action) to q-values
        self.time = 0  # number of moves performed
        self.possible_actions = (None, 'forward', 'left', 'right')
        self.n_dest_reached = 0  # number of destinations reached
        self.last_dest_fail = 0  # last time agent failed to reach destination
        self.sum_time_left = 0  # sum of time left upon reaching destination over all trials
        self.n_penalties = 0  # number of penalties incurred
        self.last_penalty = 0  # last trial in which the agent incurred a penalty

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def best_action(self, state):
        """Return a random action (other agents will have different policies)."""
        return random.choice(self.possible_actions)

    def update_qvals(self, state, action, reward):
        """Keep track of visited (state, action) pairs.
        (Other agents will use the reward to update the mapping from
        (state, action) pairs to q-values.)"""
        self.qvals[(state, action)] = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update time
        self.time += 1

        # Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Pick an action
        action = self.best_action(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward < 0:
            self.n_penalties += 1

        # Update the q-value of the (state, action) pair
        self.update_qvals(self.state, action, reward)

        # print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.total_reward = 0
        self.next_waypoint = None
        self.QLearner = QAlgorithm(epsilon=0.03, alpha=0.1, gamma=0.9)
        self.next_state = None
        self.time_step = 1.0

    def get_decay_rate(self, t):
        """Decay rate for epsilon."""
        return 1.0 / float(t)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        self.time_step = 1.0
        # Optionally reset the learner between trips:
        # self.previous_action = None
        # self.next_state = None
        # self.total_reward = 0
        # self.next_waypoint = None
        # self.QLearner = QAlgorithm(epsilon=0.03, alpha=0.1, gamma=0.9)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Decay epsilon over time
        self.time_step += 1
        epsilon = self.get_decay_rate(self.time_step)
        self.QLearner.set_eps(epsilon)

        # Update state (alternative tried: (light, next_waypoint))
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
        action = self.QLearner.q_move(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward

        # Observe the resulting state
        new_inputs = self.env.sense(self)
        self.next_state = (new_inputs['light'], new_inputs['oncoming'], new_inputs['left'], self.next_waypoint)

        # Learn policy based on state, action, reward
        self.QLearner.q_future_reward(self.state, action, self.next_state, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, alpha, q_initial):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.policy = {}
        self.trip_log = []
        self.trip = None
        self.alpha = alpha
        self.q_initial = q_initial

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip: archive the previous trip's stats and start fresh
        if self.trip:
            self.trip_log.append(self.trip)
        self.trip = {}
        self.trip['Deadline'] = self.env.get_deadline(self)
        self.trip['Reward'] = 0
        self.trip['Penalty'] = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state (the 'right' input is dropped to shrink the state space)
        inputs.pop('right')
        self.state = inputs
        self.state['waypoint'] = self.next_waypoint

        # Select action according to the policy
        state_hash = hashabledict(frozenset(self.state.iteritems()))
        q = self.policy.get(state_hash, {
            None: self.q_initial,
            'forward': self.q_initial,
            'left': self.q_initial,
            'right': self.q_initial,
        })
        action = max(q.iteritems(), key=operator.itemgetter(1))[0]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Update trip stats
        self.trip['Reward'] += reward
        self.trip['Remaining'] = self.env.get_deadline(self)
        self.trip['Success'] = self.planner.next_waypoint() is None
        if reward < 0:
            self.trip['Penalty'] += reward

        # Learn policy based on state, action, reward
        q[action] = (1 - self.alpha) * q[action] + self.alpha * reward
        self.policy[state_hash] = q
class BasicLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(BasicLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.last_reward = 0
        self.last_action = None
        self.last_state = None
        self.state = None
        self.total_reward = 0
        self.deadline = self.env.get_deadline(self)
        self.actions = ['forward', 'left', 'right', None]
        self.reached_destination = 0
        self.penalties = 0
        self.movements = 0

    def tuple_state(self, state):
        """Represent the state as a namedtuple of (light, next_waypoint)."""
        State = namedtuple("State", ["light", "next_waypoint"])
        return State(light=state['light'], next_waypoint=self.planner.next_waypoint())

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.last_reward = 0
        self.last_action = None
        self.last_state = None
        self.state = None
        self.total_reward = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = self.tuple_state(inputs)

        # Select a random action
        action = random.choice(self.actions)

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward >= 10:
            self.reached_destination += 1
        if reward < 0:
            self.penalties += 1
        self.movements += 1

        # Record the last state, action, and reward
        self.last_action = action
        self.last_state = self.state
        self.last_reward = reward
        self.total_reward += reward
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Q-table as a DataFrame: one row per light color, one column per
        # action ('stay' stands in for the None action)
        self.acts = ['left', 'right', 'forward', 'stay']
        self.Q = pd.DataFrame(np.zeros((2, 4)), index=['red', 'green'], columns=self.acts)
        self.deadline = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        self.deadline = deadline

        # Update state: here the state is simply the color of the traffic
        # light at the intersection
        self.state = inputs['light']

        # Select action according to the policy (greedy Q-learner)
        act_opt = self.Q.ix[self.state].argmax()
        if act_opt == 'stay':
            act_opt = None
        action = act_opt

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward.
        # Maximize Q conditional on the state S we are in.
        q_S = self.Q.ix[self.state]  # Q conditional on state S
        q_S_mx = q_S[q_S == q_S.max()]  # the maximizing action may not be unique
        ix_mx = np.random.randint(0, len(q_S_mx))  # pick one of the equally good actions
        q_S_mx = q_S_mx.iloc[ix_mx]

        # Update Q; alpha, alpha0, and gamma are module-level globals defined elsewhere
        act_tmp = action if action is not None else 'stay'
        global alpha, alpha0, gamma
        alpha *= alpha0
        self.Q.ix[self.state, act_tmp] = \
            (1 - alpha) * self.Q.ix[self.state, act_tmp] + \
            alpha * (reward + gamma * q_S_mx)
        # Note: we update the action actually taken, but the future-reward term
        # uses the greedy (maximal) Q-value for the current state.
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.QLearning = QLearning(epsilon=0.3, alpha=0.95, gamma=0.9)
        # self.QLearning = pickle.load(open('./QLearning.pkl'))

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def is_action_okay(self, inputs):
        """Check whether the planned waypoint obeys the right-of-way rules."""
        action_okay = True
        if self.next_waypoint == 'right':
            if inputs['light'] == 'red' and inputs['left'] == 'forward':
                action_okay = False
        elif self.next_waypoint == 'forward':
            if inputs['light'] == 'red':
                action_okay = False
        elif self.next_waypoint == 'left':
            if inputs['light'] == 'red' or (inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right'):
                action_okay = False
        return action_okay

    def update(self, t):
        # Gather states
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        action_okay = self.is_action_okay(inputs)
        self.state = (action_okay, self.next_waypoint, deadline < 3)
        # Alternative: self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Select action according to the policy
        action = self.QLearning.policy(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Gather next states; use the new waypoint when checking the next state
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        next_inputs = self.env.sense(self)
        next_deadline = self.env.get_deadline(self)
        next_action_okay = self.is_action_okay(next_inputs)
        next_state = (next_action_okay, self.next_waypoint, next_deadline < 3)

        # Learn policy based on state, action, reward
        self.QLearning.update_QTable(self.state, action, reward, next_state)
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.Q = {}
        self.learning_rate = 0.9
        self.states = None
        self.possible_actions = [None, 'forward', 'left', 'right']

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state (deadline was also tried as a feature, but it blows up
        # the state space; (waypoint, light) alone was tried as well)
        state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'])
        self.state = state

        # Optimistic initialization: unseen (state, action) pairs start at 20
        for action in self.possible_actions:
            if (state, action) not in self.Q:
                self.Q[(state, action)] = 20

        # Select action according to the policy: softmax over the Q-values,
        # then take the action with the highest probability
        q_values = [self.Q[(state, None)], self.Q[(state, 'forward')],
                    self.Q[(state, 'left')], self.Q[(state, 'right')]]
        probabilities = np.exp(q_values) / np.sum(np.exp(q_values), axis=0)
        action = self.possible_actions[np.argmax(probabilities)]

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward == -10.0:
            print "Invalid move"
        if reward == -0.5:
            print "Valid move, but is it correct?"

        # Learn policy based on state, action, reward
        self.Q[(state, action)] = reward * self.learning_rate + (1 - self.learning_rate) * self.Q[(state, action)]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, learning_rate=0.78, discount_factor=0.36):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.Q = {}
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.actions = Environment.valid_actions
        self.oldAction = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        self.oldAction = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        current_state = tuple(inputs.values() + [self.next_waypoint])
        self.state = (inputs, self.next_waypoint)

        # Select action according to the policy: greedy over the Q-values,
        # with unseen states optimistically initialized to 3
        if current_state not in self.Q:
            self.Q[current_state] = [3, 3, 3, 3]
        Q_max = self.Q[current_state].index(np.max(self.Q[current_state]))
        action = self.actions[Q_max]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on the previous (state, action, reward), now that
        # the maximal Q-value of the successor state is known
        if self.oldAction is not None:
            oldvalue = self.Q[self.oldAction[0]][self.oldAction[1]]
            newvalue = ((1 - self.learning_rate) * oldvalue) + \
                (self.learning_rate * (self.oldAction[2] + self.discount_factor * self.Q[current_state][Q_max]))
            self.Q[self.oldAction[0]][self.oldAction[1]] = newvalue
        self.oldAction = (current_state, self.actions.index(action), reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class RandomAgent(Agent):
    """An agent that drives randomly in the smartcab world and tracks its statistics."""

    def __init__(self, env):
        super(RandomAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.availableAction = ['forward', 'left', 'right', None]
        self.goal = 0
        self.steps = 0
        self.features = []
        self.deadline = []
        self.total_reward = [0]

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Report on the trial that just ended, then reset per-trial bookkeeping
        try:
            if self.deadline[len(self.features) - 1] > 0:  # time remaining means the goal was reached
                self.goal += 1
                print "PASS! {} steps to goal, goal reached {} times out of {}!".format(
                    self.deadline[len(self.features) - 1], self.goal, len(self.features))
            else:
                print "FAIL! {} steps to goal, goal reached {} times out of {}!".format(
                    self.deadline[len(self.features) - 1], self.goal, len(self.features))
        except IndexError:
            print "Trial 0 - goal reached {} times out of {}!".format(self.goal, len(self.features))
        print "----------------------------------------------------------"
        self.features.append({})
        self.deadline.append(None)
        self.total_reward.append(0)
        self.steps = 0

    def update(self, t):
        # Gather inputs
        self.steps += 1
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        self.state = inputs
        self.features[len(self.features) - 1][self.steps] = inputs
        self.deadline[len(self.deadline) - 1] = self.env.get_deadline(self)

        # Select a random action
        action = random.choice(self.availableAction)

        # Execute action and get reward
        reward = self.env.act(self, action)
        self.lastReward = reward

        # Accumulate the total reward for this trial
        self.total_reward[len(self.total_reward) - 1] += reward
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.actions = ['forward', 'left', 'right', None]
        self.q_table = Counter()  # defaults unseen (state, action) pairs to 0
        self.alpha = 0.5
        self.gamma = 0.2

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def get_best_action(self, state):
        best_profit = -999
        best_action = None
        for a in self.actions:
            profit = self.q_table[(state, a)]
            if profit > best_profit:
                best_profit = profit
                best_action = a
        return best_action, best_profit

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Epsilon-greedy: explore with probability 3/11, otherwise exploit
        if randint(0, 10) < 3:
            action = self.actions[randint(0, 3)]
        else:
            action, _ = self.get_best_action(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Q-learning update using the best Q-value of the successor state
        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], next_next_waypoint)
        _, best_profit = self.get_best_action(next_state)
        self.q_table[(self.state, action)] = (1 - self.alpha) * self.q_table[(self.state, action)] + \
            (self.alpha * (reward + self.gamma * best_profit))

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.debugOn = False
        # driver_type is a module-level setting (defined elsewhere) that
        # selects the driving policy
        self.reporter = Reporter(driver_type, self.env.valid_actions)
        if driver_type == 'random':
            self.driver = RandomDriver(self.env.valid_actions)
        elif driver_type == 'greedy':
            self.driver = GreedyDriver(self.env.valid_actions)
        elif driver_type == 'greedy2':
            self.driver = Greedy2Driver(self.env.valid_actions)
        elif driver_type == 'smart':
            self.driver = SmartDriver(self.env.valid_actions, self.reporter)
        elif driver_type == 'smart2':
            self.driver = Smart2Driver(self.env.valid_actions, self.reporter)
        elif driver_type == 'smart3':
            self.driver = Smart3Driver(self.env.valid_actions, self.reporter)
        else:
            self.driver = Smart2Driver(self.env.valid_actions, self.reporter)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        self.reporter.reset(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = State.make(self.next_waypoint, inputs)

        # Select action according to the driver's policy
        action = self.driver.decide(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        newWaypoint = self.planner.next_waypoint()
        newState = State.make(newWaypoint, self.env.sense(self))
        self.driver.learn(t, deadline, self.state, action, reward, newState)

        location = self.env.agent_states[self]['location']
        if self.debugOn:
            print "LearningAgent.update(): deadline={} inputs={} action={} reward={} location={}".format(deadline, inputs, action, reward, location)  # [debug]
        self.reporter.update(t, deadline, self.state, action, reward, location, self.driver)
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, trials):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.trials = trials
        self.success = 0  # how many times the agent reached the target over the given trials
        self.penalties = 0
        self.total_rewards = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset per-trip statistics
        self.penalties = 0
        self.total_rewards = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state (alternatives tried: inputs alone; (light, oncoming, left))
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Select action according to the policy (initially a random action)
        action = random.choice(self.env.valid_actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".\
            format(deadline, inputs, action, reward)  # [debug]

        # Statistics
        self.total_rewards += reward
        if reward < 0:
            self.penalties += 1
        if reward > 5:
            self.success += 1
        print "\nTrial info: Number of Penalties = {}, Net rewards: {}. \nTotal success/trials: {}/{}. \n".\
            format(self.penalties, self.total_rewards, self.success, self.trials)
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.total_reward = 0
        # Q matrix
        self.Q = {}
        # Initial (optimistic) Q values
        self.initial_q = 30
        # Initialize state
        self.states = None
        # Dictionary mapping possible actions to column indices of the Q matrix
        self.actions = {None: 0, 'forward': 1, 'left': 2, 'right': 3}

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # If this state is not yet in the Q matrix, create a new row for it
        if self.state not in self.Q:
            self.Q[self.state] = [self.initial_q for i in range(len(self.env.valid_actions))]

        # Select action according to the policy (greedy)
        learning_rate = 0.8
        discount = 0
        action = self.env.valid_actions[np.argmax(self.Q[self.state])]

        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward

        # Learn policy based on state, action, reward. The future state reuses
        # the pre-action inputs; with discount = 0 the future-reward term
        # contributes nothing to the update.
        future_state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
        self.Q[self.state][self.actions[action]] = self.Q[self.state][self.actions[action]] + \
            learning_rate * (reward + discount * self.Q[future_state][self.actions[action]] - self.Q[self.state][self.actions[action]])
class LearningAgent(Agent):
    """An agent that learns to drive in the Smartcab world.
    This is the object you will be modifying."""

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Q-table: a dictionary mapping states to action:value dicts
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        # Additional class parameters
        self.t = 0  # Trial counter used by the epsilon decay schedule

    def reset(self, destination=None, testing=False):
        """The reset function is called at the beginning of each trial.
        'testing' is set to True if testing trials are being used once
        training trials have completed."""

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        # Update epsilon using a decay function; if 'testing' is True,
        # set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0
        if self.learning and not testing:
            # Other schedules tried: linear (epsilon - 0.05), inverse (1/t),
            # and exponential (exp(-0.1 * t))
            self.epsilon = abs(math.cos(0.01 * self.t))
            self.t += 1.0
        return None

    def build_state(self):
        """The build_state function is called when the agent requests data
        from the environment. The next waypoint, the intersection inputs,
        and the deadline are all features available to the agent."""

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        # NOTE: you are not allowed to engineer features outside of the inputs
        # available. Because the aim of this project is to teach Reinforcement
        # Learning, constraints are placed so that you learn how to adjust
        # epsilon and alpha, and thus learn about the balance between
        # exploration and exploitation. With hand-engineered features, this
        # learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['left'], inputs['oncoming'])
        return state

    def get_maxQ(self, state):
        """The get_maxQ function is called when the agent is asked to find all
        actions that tie for the maximum Q-value of the 'state' the smartcab is in."""

        # Find the highest Q-value, then collect every action that attains it
        max_val = max(self.Q[state].values())
        maxQ = [action for action in self.Q[state] if self.Q[state][action] == max_val]
        return maxQ

    def createQ(self, state):
        """The createQ function is called when a state is generated by the agent."""

        # When learning, check if the 'state' is not in the Q-table. If it is
        # not, create a new dictionary for that state and set the initial
        # Q-value for each action to 0.0
        if self.learning and state not in self.Q:
            self.Q[state] = {action: 0.0 for action in self.valid_actions}
        return

    def choose_action(self, state):
        """The choose_action function is called when the agent is asked to
        choose which action to take, based on the 'state' the smartcab is in."""

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        # When not learning, choose a random action.
        # When learning, choose a random action with 'epsilon' probability;
        # otherwise choose an action with the highest Q-value for the current
        # state, selecting randomly between actions that tie.
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if random.uniform(0, 1) <= self.epsilon:
                action = random.choice(self.valid_actions)
            else:
                action = random.choice(self.get_maxQ(state))
        return action

    def learn(self, state, action, reward):
        """The learn function is called after the agent completes an action and
        receives a reward. This function does not consider future rewards
        when conducting learning."""

        # When learning, implement the value iteration update rule.
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (self.Q[state][action] * (1.0 - self.alpha)) + (reward * self.alpha)
        return

    def update(self):
        """The update function is called when a time step is completed in the
        environment for a given trial. This function will build the agent state,
        choose an action, receive a reward, and learn if enabled."""

        state = self.build_state()           # Get current state
        self.createQ(state)                  # Create 'state' in Q-table
        action = self.choose_action(state)   # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)    # Q-learn
        return
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'yellow'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.reward = 0
        self.next_waypoint = None
        # Declaring the variable to calculate the success percentage rate
        self.S = 0
        # Declaring the trial counter variable
        self.var_trial = 0
        # Declaring the variable to track success rate
        self.var_trial_S = {}

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.next_waypoint = None
        # Incrementing the success counter before resetting the trial;
        # dict.get avoids a KeyError when the trial recorded no outcome
        if self.var_trial_S.get(self.var_trial) == 1:
            self.S = self.S + 1
        # resetting the trial
        self.var_trial = self.var_trial + 1

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Select action according to your policy
        action = random.choice(Environment.valid_actions)
        '''
        # TODO: Update state
        action_okay = True
        if self.next_waypoint == 'right':
            if inputs['light'] == 'red' and inputs['left'] == 'forward':
                action_okay = False
        elif self.next_waypoint == 'forward':
            if inputs['light'] == 'red':
                action_okay = False
        elif self.next_waypoint == 'left':
            if inputs['light'] == 'red' or inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right':
                action_okay = False
        if not action_okay:
            action = None
        '''
        self.state = inputs
        self.state['next_waypoint'] = self.next_waypoint
        self.state = tuple(sorted(self.state.items()))

        # Execute action and get reward
        reward = self.env.act(self, action)
        self.reward = self.reward + reward

        # TODO: Learn policy based on state, action, reward
        # at the end of each trial we see the reward; reward >= 10 means the trial succeeded, otherwise it failed
        if (reward >= 10):
            self.var_trial_S[self.var_trial] = 1
        else:
            self.var_trial_S[self.var_trial] = 0

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
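# Aside, an illustrative sketch: the per-trial success bookkeeping above can be
# gathered into a small standalone helper. The reward >= 10 success heuristic is
# carried over from the agent above, where reaching the destination pays a large
# bonus; the class and method names here are illustrative.
class SuccessTracker(object):
    """Tracks per-trial success flags and a running success count."""

    def __init__(self):
        self.trial = 0
        self.successes = 0
        self.trial_outcomes = {}  # trial number -> 1 (success) or 0 (failure)

    def record_step(self, reward):
        # A large positive reward marks reaching the destination;
        # setdefault keeps an earlier success from being overwritten.
        if reward >= 10:
            self.trial_outcomes[self.trial] = 1
        else:
            self.trial_outcomes.setdefault(self.trial, 0)

    def end_trial(self):
        # dict.get avoids the KeyError a bare try/except would be guarding against.
        self.successes += self.trial_outcomes.get(self.trial, 0)
        self.trial += 1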
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        # Set any additional class parameters as needed
        self.trial = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon, self.alpha = 0, 0
        else:
            # self.epsilon = self.epsilon - 0.05  # for question 6
            self.trial += 1
            self.epsilon = math.exp(-0.01 * self.trial)
            # Decay alpha as well, but keep it from collapsing below 0.1
            if self.alpha >= 0.1:
                self.alpha = math.exp(-0.07 * self.trial)
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        # Originally set state as a tuple, but suspect encoding Q-table keys as a
        # tuple of tuples possibly resulted in key-name collisions, so the state
        # is encoded as a flat string instead.
        # state = (("lights", inputs['light']),
        #          ("oncoming", inputs['oncoming']),
        #          ("left", inputs['left']),
        #          ("waypoint", self.next_waypoint))
        state = str(waypoint) + "_" + inputs['light'] + "_" + str(inputs['left']) + "_" + str(inputs['oncoming'])
        if self.learning:
            # Make sure the state has a row of zero-initialized Q-values
            self.Q[state] = self.Q.get(state, {
                None: 0.0,
                'forward': 0.0,
                'left': 0.0,
                'right': 0.0
            })
        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        # Calculate the maximum Q-value of all actions for a given state
        maxQ = max(self.Q[state].values())
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent.
""" ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if self.learning and state not in self.Q: self.Q[state] = { None: 0.0, 'left': 0.0, 'right': 0.0, 'forward': 0.0 } return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. """ # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() action = None ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state if not self.learning: action = random.choice(self.valid_actions) else: coin_toss = random.random() if coin_toss <= self.epsilon: action = random.choice(self.valid_actions) else: actions_list = [] highest_Qvalue = self.get_maxQ(state) for act in self.Q[state]: if highest_Qvalue == self.Q[state][act]: actions_list.append(act) action = random.choice(actions_list) return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') if self.learning: self.Q[state][action] = self.alpha * reward + ( 1 - self.alpha) * self.Q[state][action] return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=True, epsilon=0.5, alpha=0.4): super(LearningAgent, self).__init__(env) # Set the agent in the environment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict() # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor ########### ## TO DO ## ########### # Set any additional class parameters as needed self.decay_rate = 0.001 self.a = 0.01 self.n_trials = 0 def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 self.n_trials += 1 if self.learning is not True: self.epsilon = 0 self.alpha = 0 elif self.learning is True: # self.epsilon = self.epsilon - self.decay_rate self.epsilon = np.exp(-(self.a*self.n_trials)) return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense(self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # Set 'state' as a tuple of relevant data for the agent input1, input2, input3, input4 = inputs.items() state = (waypoint, input1, input2, input3, input4) return state def get_maxQ(self, state): """ The get_maxQ function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ ########### ## TO DO ## ########### # Calculate the maximum Q-value of all actions for a given state # from https://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary maxQ = max(self.Q[state], key=self.Q[state].get) action_list = [] for action in self.Q[state]: if self.Q[state][action] == self.Q[state][maxQ]: action_list.append(action) if len(action_list) > 0: maxQ = random.choice(action_list) return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. """ ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if self.learning: if state not in self.Q.keys(): self.Q[state] = {action: 0.0 for action in self.valid_actions} return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. 
""" # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() action = None ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state if not self.learning: action_number = random.randint(0, 3) action = self.valid_actions[action_number] elif self.learning: rand_num = random.random() if rand_num < self.epsilon: action_number = random.randint(0, 3) action = self.valid_actions[action_number] else: action = self.get_maxQ(state=state) return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') # Set discount factor to zero *from Kevin Robertson # Transitions <s,a,r,s'> # sought some help from the forums and lecture videos. # update Q-value = current Q-value * (1 - alpha) + alpha * (current reward + discount factor * expected future reward) self.Q[state][action] = self.Q[state][action] * (1 - self.alpha) + self.alpha * reward return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent_v2(Agent): """An agent that learns to drive in the smartcab world.""" def __init__(self, env): super(LearningAgent_v2, self).__init__( env ) # sets self.env = env, state = None, next_waypoint = None, and a default color self.color = 'red' # override color self.planner = RoutePlanner( self.env, self) # simple route planner to get next_waypoint # TODO: Initialize any additional variables here self.trip_history = [] self.debug = True self.gamma = 0.2 #upper bound of discount #self.alpha = 0.5 #uppder bound of learning rate self.epsilon = 0.1 #lower bound of proportion of random steps self.reg = 0.001 # regularization param for regression self.lr = 0.1 # learning rate for regression self.clock_update = 0 # store number of updates self.init_params_scale = 1e-4 # scale of initial params setting self.max_memory = 400 # number of rows for state_action_experience self.batch_size = 20 # size of batch self.batch_step = 20 # extract a batch for each batch_step steps self.param_step = self.max_memory # how many step should update w self.state_feature = [ 'right_no', 'forward_no', 'left_no', 'next_right', 'next_forward', 'next_left' ] self.action_feature = ['right', 'forward', 'left'] self.state = None self.num_action_space = np.concatenate( (np.diag(np.ones(3)), np.zeros(3)[np.newaxis, :])) self.state_action_feature = self.state_feature + self.action_feature + [ x + "_action_" + y for x in self.state_feature for y in self.action_feature ] #self.state_action_df = pd.DataFrame(columns = (self.state_action_feature + ['Q_score']) ) self.state_action_experience = np.zeros( (1, len(self.state_action_feature))) self.Q_score_experience = np.zeros(1) self.ex_state = np.zeros(len(self.state_feature)) self.ex_state_action = np.zeros(len(self.state_action_feature)) self.ex_reward = 0 self.params = { 'b': np.random.randn(1), 'w': self.init_params_scale * np.random.randn(len(self.state_action_feature), 1) } self.params_update = self.params self.reward_history = np.zeros(1) def reset(self, destination=None): self.planner.route_to(destination) # TODO: Prepare for a new trip; reset any variables here, if required self.ex_reward = 0 self.ex_state_action = np.zeros(len(self.state_action_feature)) self.ex_state = np.zeros(len(self.state_feature)) if (len(self.trip_history) < 150): print 'Current success rate is {}'.format( sum(self.trip_history) / (len(self.trip_history) + 0.000001)) else: print 'Success rate for recent 100 trials is {}'.format( sum(self.trip_history[-100:]) / (len(self.trip_history[-100:]) + 0.000001)) print 'Average reward for recent moves is {}'.format( np.mean(self.reward_history)) if (self.reward_history.shape[0] > 1000): self.reward_history = np.delete(self.reward_history, range(100)) def numeric_state(self, inputs=None, deadline=0, next_waypoint=None): #print 'inputs is {}, deadline is {}, next_waypoint is {}'.format(str(inputs), str(deadline), str(next_waypoint)) col_name = self.state_feature state = np.zeros(len(col_name)) state += np.array( map(lambda x: x == 'next_' + str(next_waypoint), col_name)) if inputs['light'] == 'red' and inputs['left'] == 'forward': #state += np.array( map(lambda x: x=='right_no', col_name) ) state[0] = 1 if inputs['light'] == 'red': state[1] = 1 if inputs['light'] == 'red' or (inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right'): state[2] = 1 #state[len(col_name)-1] = deadline if False: print 'inputs is {}, deadline is {}, next_waypoint is {}\n'.format( str(inputs), str(deadline), str(next_waypoint)) print zip(col_name, state) 
raw_input("Press Enter to continue...") return state def numeric_action(self, action=None): col_name = self.action_feature return np.array(map(lambda x: x == str(action), col_name)) def numeric_state_action(self, num_state=None, num_action=None): return np.concatenate( (num_state, num_action, np.outer(num_state, num_action).flatten()), axis=0) def max_Q_param(self, num_state): X = np.apply_along_axis( lambda x: self.numeric_state_action(num_state, x), axis=1, arr=self.num_action_space) score = X.dot(self.params['w']) + self.params['b'] if False: print '\nX are\n {}\n, Params are\n {}\n'.format( str(X), str(self.params)) raw_input("Press Enter to continue...") choose = np.argmax(score) opt_action = None if choose < 3: opt_action = self.action_feature[choose] num_state_action = X[choose] max_Q_hat = score[choose] if False: print '\nScores are\n {}\n, opt action are\n {}\n'.format( str(score), str(opt_action)) raw_input("Press Enter to continue...") return opt_action, max_Q_hat, num_state_action def gradient(self, X, y, reg=0.01): if False: print '\nX are\n {}\n and y are\n {}\n'.format(str(X), str(y)) raw_input("Press Enter to continue...") w, b = self.params_update['w'], self.params_update['b'] scores = X.dot(w) + b y = y.flatten()[:, np.newaxis] loss = np.mean((y - scores)**2) + 0.5 * reg * np.sum(w**2) if False: print '\ny are\n {}\n and scores are\n {}\n and loss is\n {}\n'.format( str(y), str(scores), str(loss)) raw_input("Press Enter to continue...") d_w = np.mean( (X * ((scores - y) * 2)), axis=0)[:, np.newaxis] + reg * w d_b = np.mean((scores - y) * 2) return d_w, d_b, loss def sample_X_y(self, size=10): idx = np.random.randint(self.state_action_experience.shape[0], size=size) X = self.state_action_experience[idx, :] y = self.Q_score_experience[idx] return X, y def update(self, t): # Gather inputs self.next_waypoint = self.planner.next_waypoint( ) # from route planner, also displayed by simulator inputs = self.env.sense(self) deadline = self.env.get_deadline(self) #may need to take deadline into account? 
# TODO: Update state self.state_action_experience = np.concatenate( (self.state_action_experience, self.ex_state_action[np.newaxis, :])) num_state = self.numeric_state(inputs=inputs, deadline=deadline, next_waypoint=self.next_waypoint) self.state = zip(self.state_feature, num_state) # TODO: Select action according to your policy action, max_Q_hat, num_state_action = self.max_Q_param(num_state) if (random.uniform(0, 1) < self.epsilon): action = random.choice(Environment.valid_actions[:]) num_action = self.numeric_action(action) num_state_action = self.numeric_state_action(num_state=num_state, num_action=num_action) if False: print "\n Use a random action, {}".format(str(action)) #debug raw_input("Press Enter to continue...") true_Q_score = self.ex_reward + self.gamma * max_Q_hat self.Q_score_experience = np.append(self.Q_score_experience, true_Q_score) self.clock_update += 1 if False: print '\nShape of State Action expreience Matrix is {}\n'.format( self.state_action_experience.shape) print '\nShape of Q score experience is {}\n'.format( self.Q_score_experience.shape) raw_input("Press Enter to continue...") # TODO: Learn policy based on state, action, reward reward = self.env.act(self, action) self.ex_reward = reward self.ex_state_action = num_state_action self.reward_history = np.append(self.reward_history, reward) if reward > 9: self.trip_history.append(1) #need to write down something here self.state_action_experience = np.concatenate( (self.state_action_experience, self.ex_state_action[np.newaxis, :])) self.Q_score_experience = np.append(self.Q_score_experience, reward) self.clock_update += 1 elif deadline == 0: self.trip_history.append(0) self.state_action_experience = np.concatenate( (self.state_action_experience, self.ex_state_action[np.newaxis, :])) self.Q_score_experience = np.append(self.Q_score_experience, reward) self.clock_update += 1 if (self.clock_update > self.max_memory + 2): self.state_action_experience = np.delete( self.state_action_experience, range(self.state_action_experience.shape[0] - self.max_memory), 0) self.Q_score_experience = np.delete( self.Q_score_experience, range(len(self.Q_score_experience) - self.max_memory)) if False: print '\nShape of State Action expreience Matrix is {}\n'.format( self.state_action_experience.shape) print '\nShape of Q score experience is {}\n'.format( self.Q_score_experience.shape) raw_input("Press Enter to continue...") if (self.clock_update % self.batch_step == 0): for i in xrange(2): if False: print '\nUpdated Parameters are {}\n'.format( str(self.params_update)) raw_input("Press Enter to continue...") data_X, data_y = self.sample_X_y(size=self.batch_size) d_w, d_b, loss = self.gradient(data_X, data_y, reg=self.reg) if False: print '\nGradiants are {} and {}\n'.format( str(d_w), str(d_b)) raw_input("Press Enter to continue...") if False: print '\nloss is {}\n'.format(loss) raw_input("Press Enter to continue...") self.params_update[ 'w'] = self.params_update['w'] - self.lr * d_w self.params_update[ 'b'] = self.params_update['b'] - self.lr * d_b if self.clock_update % self.param_step == 0: self.params = self.params_update if True: print '\nBias for regression is {}\n'.format( str(self.params['b'])) weight_df = pd.DataFrame(data=self.params['w'].T, columns=self.state_action_feature) print '\nWeights for regression is\n{}\n'.format( weight_df.T)
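# Aside, an illustrative sketch: one subtlety in the delayed-parameter scheme
# above is that `self.params_update = self.params` binds both names to the same
# dictionary, so the "delayed" parameters are never actually behind the live
# ones. The standalone sketch below shows the pattern with explicit copies; the
# feature count and gradients are placeholders, not the agent's real values.
import numpy as np

def make_params(n_features, scale=1e-4):
    return {'b': np.random.randn(1), 'w': scale * np.random.randn(n_features, 1)}

def copy_params(params):
    # A real copy: plain assignment would alias the dict, so an update to one
    # set of parameters would be visible through the other immediately.
    return {'b': params['b'].copy(), 'w': params['w'].copy()}

if __name__ == '__main__':
    n_features = 15                       # placeholder dimension
    params = make_params(n_features)      # parameters used to act / compute targets
    params_update = copy_params(params)   # parameters the gradient steps touch
    for step in range(1, 401):
        d_w = np.zeros((n_features, 1))   # stand-in for a real gradient
        d_b = 0.0
        params_update['w'] -= 0.1 * d_w
        params_update['b'] -= 0.1 * d_b
        if step % 400 == 0:
            params = copy_params(params_update)  # sync, again by copy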
class LearningAgent(Agent): """An agent that learns to drive in the smartcab world.""" def __init__(self, env, discount_rate=0.1, epsilon=0.15, epsilon_decay=.99): super(LearningAgent, self).__init__( env ) # sets self.env = env, state = None, next_waypoint = None, and a default color self.color = 'red' # override color self.planner = RoutePlanner( self.env, self) # simple route planner to get next_waypoint # TODO: Initialize any additional variables here self.actions = (None, 'forward', 'left', 'right') self.Q = {} #hashable Q-Table self.default_Q = 1 #initially tried zero self.epsilon = epsilon #controls frequency of random actions [note: not used when implementing epsilon decay] self.epsilon_decay = epsilon_decay #decay rate (multiplier) for GLIE self.alpha = 1 # learning rate [note: not used when updating/decaying alpha] self.initial_alpha = 1 #starting point for alpha-updating model self.discount_rate = discount_rate #controls extent to which we value future rewards # keep track of net reward, trial no., penalties self.net_reward = 0 self.trial = 0 self.success = 0 self.net_penalty = 0 self.penalty_tracker = [] self.reward_tracker = [] self.success_tracker = [] self.trial_tracker = [] #initialize Q-table with default values for light_state in ['green', 'red']: for oncoming_traffic_state in self.actions: for right_traffic_state in self.actions: for left_traffic_state in self.actions: for waypoint_state in self.actions[ 1:]: #ignore NONE - this is not a waypoint possibility # record each state state = (light_state, oncoming_traffic_state, right_traffic_state, left_traffic_state, waypoint_state) self.Q[state] = {} for action in self.actions: self.Q[state][action] = self.default_Q def reset(self, destination=None): self.planner.route_to(destination) # TODO: Prepare for a new trip; reset any variables here, if required self.state = None self.trial += 1 self.net_reward = 0 self.net_penalty = 0 #for the random agent def get_random_action(self, state): return random.choice(self.actions) def update(self, t): # Gather inputs self.next_waypoint = self.planner.next_waypoint( ) # from route planner, also displayed by simulator inputs = self.env.sense(self) deadline = self.env.get_deadline(self) # TODO: Update state self.state = (inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'], self.next_waypoint) # TODO: Select action according to your policy #random agent approach #best_action = self.get_random_action(self.state) #Q-learning approach if random.random() < self.epsilon: # add randomness to the choice best_action = random.choice(self.actions) else: best_action = None max_Q = None #find action with best Q value for available_action in self.actions: if self.Q[self.state][available_action] > max_Q: best_action = available_action max_Q = self.Q[self.state][available_action] #EPSILON DECAY. COMMENT OUT FOR INITIAL PARAMETER TUNING self.epsilon = 1 / (self.trial + self.epsilon_decay) # Execute action and get reward reward = self.env.act(self, best_action) self.net_reward += reward #successful if reward is large. count this and also store other stats from this trial. 
if reward > 8: self.success += 1 if reward < 0: self.net_penalty += reward #if done, update stats if self.env.done == True or deadline == 0: self.penalty_tracker.append(self.net_penalty) self.reward_tracker.append(self.net_reward) if deadline == 0: self.success_tracker.append(0) else: self.success_tracker.append(1) self.trial_tracker.append( 40 - deadline) #number of turns it took to reach the end #if all trials done, show progression of performance indicators over time if self.trial == 100: print "net penalties over time: " + str(self.penalty_tracker) print "net rewards over time: " + str(self.reward_tracker) print "success pattern: " + str(self.success_tracker) print "trials required per round: " + str(self.trial_tracker) # TODO: Learn policy based on state, action, reward #get next state next_state = self.env.sense(self) next_waypoint = self.planner.next_waypoint() #re-represent this as a tuple since that's how the Q dict's keys are formatted next_state = (next_state['light'], next_state['oncoming'], next_state['left'], next_state['right'], next_waypoint) #add state to q dict if not already there if next_state not in self.Q: self.Q[next_state] = {} for next_action in self.actions: self.Q[next_state][next_action] = self.default_Q utility_of_next_state = None #search through available actions, find one with higest Q for next_action in self.Q[next_state]: if self.Q[next_state][next_action] > utility_of_next_state: utility_of_next_state = self.Q[next_state][next_action] #next get utility of state utility_of_state = reward + self.discount_rate * utility_of_next_state # update Q Table self.Q[self.state][best_action] = (1 - self.alpha) * \ self.Q[self.state][best_action] + self.alpha * utility_of_state #alpha update. comment this out when testing other inputs in isolation. #decay needed to ensure Q convergence. source: http://dreuarchive.cra.org/2001/manfredi/weeklyJournal/pricebot/node10.html if self.trial != 0: #self.alpha = 1 / (self.trial + self.epsilon_decay) num_trials = 100 self.alpha = (self.initial_alpha * (num_trials / 10.0)) / ( (num_trials / 10.0) + self.trial) print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}, successes = {}".format( deadline, inputs, best_action, reward, self.success) # [debug]
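# Aside, an illustrative sketch: the alpha schedule above (sometimes called a
# "search-then-converge" decay) is easier to judge numerically. With the same
# constants the agent uses (initial_alpha = 1, num_trials = 100), it keeps
# alpha near 1 early and lands near 0.1 by the final trial.
def alpha_schedule(trial, initial_alpha=1.0, num_trials=100):
    """alpha = a0 * (N/10) / ((N/10) + trial)."""
    scale = num_trials / 10.0
    return (initial_alpha * scale) / (scale + trial)

if __name__ == '__main__':
    for trial in (1, 10, 25, 50, 100):
        print("trial %3d -> alpha %.3f" % (trial, alpha_schedule(trial)))
    # trial 1 -> 0.909, trial 10 -> 0.500, trial 100 -> 0.091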
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.A = ['forward', 'left', 'right', None]  # all available actions
        self.trial = 0  # the number of trials
        self.Q = {}  # init Q table keyed on (light, next_waypoint)
        self.init_value = 1.5  # the value to initialize the Q table with
        for i in ['green', 'red']:  # possible lights
            for j in ['forward', 'left', 'right']:  # possible next_waypoints
                self.Q[(i, j)] = [self.init_value] * len(self.A)  # one Q value per action
        self.gamma = 0.3  # discount factor
        self.alpha = 0.5  # learning rate
        self.epsilon = 0.1  # prob to act randomly; higher value means more exploration, more random action
        self.reward_holder = ''  # string accumulating negative rewards plus a trial-separator marker
        self.breaker = 0  # last seen deadline, used to detect when a new trial starts

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], self.next_waypoint)

        # TODO: Select action according to your policy
        # action = random.choice(self.env.valid_actions)  # random action
        # Find the index of the max Q value for the current state
        max_Q = self.Q[self.state].index(max(self.Q[self.state]))
        # assign action
        p = random.random()  # generate random value between 0 and 1
        # epsilon-greedy approach to avoid local optima, mainly to avoid staying in the same place
        if p <= self.epsilon:
            action = random.choice(self.env.valid_actions)
        else:
            action = self.A[max_Q]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        '''
        # updating learning rate
        alpha_tune = 500  # tuning parameter
        alpha = 1/(1.1+self.trial/alpha_tune)  # decay learning rate
        self.trial = self.trial+1
        '''
        # get the next state, action Q(s',a')
        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_next_waypoint)

        # update Q table
        self.Q[self.state][self.A.index(action)] = \
            (1-self.alpha)*self.Q[self.state][self.A.index(action)] + \
            (self.alpha * (reward + self.gamma * max(self.Q[next_state])))

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]

        # to get an idea of penalties in later trials:
        # when the deadline jumps back up, a new trial has started, so append a
        # '3 ' separator marker; the string is later split on those markers,
        # leaving only the negative rewards recorded within each trial
        if deadline >= self.breaker:
            self.reward_holder += '3 '
        self.breaker = deadline
        if reward < 0:
            self.reward_holder += str(reward) + ' '
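# Aside, an illustrative sketch: keying the Q-table on only (light, next_waypoint)
# keeps it tiny, which is a large part of why an agent like the one above can
# fill in its table within a handful of trials. A quick check of the table size
# under that encoding:
if __name__ == '__main__':
    lights = ['green', 'red']
    waypoints = ['forward', 'left', 'right']
    actions = [None, 'forward', 'left', 'right']
    n_states = len(lights) * len(waypoints)
    print("%d states, %d state-action Q-values" % (n_states, n_states * len(actions)))
    # -> 6 states, 24 state-action Q-values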
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        # Initialize Q
        # One dimension for state, another dimension for actions
        # By doing so, you can just map the corresponding state to choose action in the future
        self.Q = {}
        for i in ['green', 'red']:
            for j in [None, 'forward', 'left', 'right']:
                for k in self.env.valid_actions:
                    self.Q[(i, j, k)] = [3] * len(self.env.valid_actions)
        # Initialize basic parameters of the Q-learning equation
        # Learning rate
        self.alpha = 0.3
        # Discount rate
        self.gamma = 0.3
        # Probability of taking a random action ("simulated annealing"-style exploration)
        self.epsilon = 0.1
        # Trials for plotting
        self.trials = -1
        self.max_trials = 20
        self.x_trials = range(0, self.max_trials)
        self.y_trials = range(0, self.max_trials)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.trials = self.trials + 1

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # These are the valid inputs from valid_inputs in the class Environment in environment.py
        # 'light', 'oncoming', 'left', 'right'
        # Using 'light', 'oncoming' and next_waypoint, we've tuples here
        self.state = (inputs['light'], inputs['oncoming'], self.next_waypoint)

        # TODO: Select action according to your policy
        # 1. We can find the action corresponding to the maximum value of Q for the current state
        #    self.Q[self.state] accesses the current state
        #    index(max(self.Q[self.state])) finds the index of the action maximizing Q for that state
        max_Q = self.Q[self.state].index(max(self.Q[self.state]))
        # 2. Then we select the action to maximize the value of Q, taking a random
        #    action with probability epsilon (10%) so the agent still explores the
        #    environment and does not just stick to the most conservative approach
        # action = random.choice(self.env.valid_actions)
        a = random.random()
        if a < self.epsilon:
            action = random.choice(self.env.valid_actions)
        else:
            action = self.env.valid_actions[max_Q]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        # The subsequent next of state and action, Q(s',a')
        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'], next_next_waypoint)

        # Update Q
        Q_old = self.Q[self.state][self.env.valid_actions.index(action)]
        Q_next_utility = reward + self.gamma * max(self.Q[next_state])
        self.Q[self.state][self.env.valid_actions.index(action)] = \
            (1 - self.alpha) * Q_old + (self.alpha * Q_next_utility)

        # Determine if trial is successful (1) or not (0);
        # guard against running more trials than max_trials
        if self.trials < self.max_trials:
            if (deadline == 0) and (reward < 10):
                self.y_trials[self.trials] = 0
            else:
                self.y_trials[self.trials] = 1
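# Aside, an illustrative sketch: the x_trials/y_trials lists above are collected
# for plotting. One minimal way to turn them into a success-rate curve; matplotlib
# is assumed to be available and is not part of the project's environment.
import matplotlib.pyplot as plt

def plot_success(x_trials, y_trials):
    """Plot per-trial success flags plus a cumulative success rate."""
    cumulative = []
    wins = 0.0
    for i, outcome in enumerate(y_trials):
        wins += outcome
        cumulative.append(wins / (i + 1))
    plt.step(x_trials, y_trials, where='mid', label='trial outcome (0/1)')
    plt.plot(x_trials, cumulative, label='cumulative success rate')
    plt.xlabel('trial')
    plt.legend()
    plt.show()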
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'black'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.state = None
        self.action = None
        self.alpha = 0.50
        self.gamma = 0.05
        self.epsilon = .01
        self.q_table = {}
        self.actions = [None, 'forward', 'left', 'right']
        self.trips = 0
        self.previous_state = None
        self.last_action = None
        self.last_reward = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.trips += 1
        # widen, then remove, the softmax sampling as trips accumulate
        # (note: assignment, not ==, so the schedule actually takes effect)
        if self.trips >= 10:
            self.epsilon = .10
        if self.trips >= 20:
            self.epsilon = 0.00

    def get_action(self, state, softmax_probabilities):
        # limited randomness on action choice per decreasing self.epsilon--avoid local min/max
        if random.random() < self.epsilon:
            action = np.random.choice(self.actions, p=softmax_probabilities)
        else:
            action = self.actions[np.argmax(softmax_probabilities)]
        return action

    def update(self, t):
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        state = (inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'], self.next_waypoint)
        self.state = state

        # Initialization of q_table values to .75
        for action in self.actions:
            if (state, action) not in self.q_table:
                self.q_table[(state, action)] = .75

        # I used the softmax function of q values in q_table--so that 0-1 values for q-score
        softmax_probabilities = [self.q_table[(state, None)], self.q_table[(state, 'forward')],
                                 self.q_table[(state, 'left')], self.q_table[(state, 'right')]]
        # softmax = (e ^ x) / sum(e ^ x)
        softmax_probabilities = np.exp(softmax_probabilities) / np.sum(np.exp(softmax_probabilities), axis=0)

        # Execute the action, get reward
        action = self.get_action(self.state, softmax_probabilities)
        # Get a reward for the action
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        if self.previous_state is None:
            # first step ever: no previous state to back up, learn from the reward alone
            self.q_table[(state, action)] = ((1-self.alpha) * self.q_table[(state, action)]) + (self.alpha * reward)
        else:
            current_q = self.q_table[(self.previous_state, self.last_action)]
            future_q = self.q_table[(state, action)]
            current_q = (1 - self.alpha) * current_q + self.alpha * (self.last_reward + self.gamma * future_q)
            self.q_table[(self.previous_state, self.last_action)] = current_q

        # use previous_state to store the last state so that I can lag between current state and next
        self.previous_state = self.state
        self.last_action = action
        self.last_reward = reward
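# Aside, an illustrative sketch: np.exp overflows for large Q-values, and the
# standard fix for a softmax like the one used above is to subtract the maximum
# before exponentiating; the resulting probabilities are unchanged.
import numpy as np

def stable_softmax(q_values):
    """softmax(x) = exp(x - max(x)) / sum(exp(x - max(x)))."""
    q = np.asarray(q_values, dtype=float)
    exps = np.exp(q - q.max())
    return exps / exps.sum()

if __name__ == '__main__':
    probs = stable_softmax([0.75, 0.75, 1000.0, 0.75])  # naive np.exp(1000.0) overflows
    print(probs.round(3))  # [0. 0. 1. 0.]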
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor ########### ## TO DO ## ########### # Set any additional class parameters as needed self.t = 0 def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 #self.epsilon = self.epsilon -0.05 self.a = 0.01 self.t = self.t + 1 if testing: self.epsilon, self.alpha = 0, 0 else: #self.epsilon = (self.a)**self.t #self.epsilon =1.0/(self.t**2) self.epsilon = math.exp(-(self.a * self.t)) return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # Set 'state' as a tuple of relevant data for the agent # When learning, check if the state is in the Q-table # If it is not, create a dictionary in the Q-table for the current 'state' # For each action, set the Q-value for the state-action pair to 0 state = (waypoint, inputs['light'], inputs['left'], inputs['right'], inputs['oncoming']) if self.learning: self.createQ(state) return state def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ ########### ## TO DO ## ########### # Calculate the maximum Q-value of all actions for a given state ## My example for easy understanding #Q = {'left':10.0, 'right':11.0, 'forward':-5.0, None :0.0 } #max(Q)---> right #row = self.Q[state] #print row #maxkey = max(row) #maxQ = self.Q[state][maxkey] #maxQ = max(self.Q[state][action] for action in self.valid_actions) maxQ = max(self.Q[state].values()) return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. 
""" ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 ## My example for easy understanding #a = {'One':1, 'Two':2} #keyword = 'Two' #print keyword in a #if self.learning: # if state not in self.Q: # self.Q[state] = {'left':0.0, 'right':0.0, 'forward':0.0, None :0.0 } if (self.learning) and (state not in self.Q): self.Q[state] = {action: 0 for action in self.valid_actions} return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. """ # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() #action = None ########### ## TO DO ## ########### ## My example for easy understanding #import random #foo = ['a', 'b', 'c', 'd', 'e'] #print(random.choice(foo) if self.learning: if self.epsilon > random.random(): action = random.choice(self.valid_actions) else: #action = max(self.Q[state]) #action = self.get_maxQ(state) maxQ = self.get_maxQ(state) action = random.choice([ action for action in self.valid_actions if self.Q[state][action] == maxQ ]) else: action = random.choice(self.valid_actions) # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') #next_waypoint = self.planner.next_waypoint() #next_inputs = self.env.sense(self) #next_light = next_inputs['light'] #next_left = next_inputs['left'] #next_right = next_inputs['right'] #next_oncoming = next_inputs['oncoming'] #next_state = (next_waypoint, next_light, next_left, next_right, next_oncoming) self.Q[state][action] = ( 1 - self.alpha) * self.Q[state][action] + self.alpha * reward #self.Q[state][action] = (1-self.alpha)*self.Q[state][action] + self.alpha*(reward+self.get_maxQ(next_state)) #self.Q[state][action] = (1-self.alpha)*self.Q[state][action] + self.alpha*(reward + self.get_maxQ(next_state)- self.Q[state][action]) #self.Q[state][action] = reward*self.alpha + (1-self.alpha)*self.Q[state][action] return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor self.t = 0 self.a = 0.000001 # self.a = 0.00001, alpha = 0.2, epsilon = 1-a*t**2 --> A+ Safety/A+ Reliability, suboptimal grid ########### ## TO DO ## ########### # Set any additional class parameters as needed def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed self.t = self.t + 1 self.epsilon = 1 - self.a * self.t**2 # If 'testing' is True, set epsilon and alpha to 0 if testing: self.epsilon = 0 self.alpha = 0 return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # NOTE : you are not allowed to engineer features outside of the inputs available. # Because the aim of this project is to teach Reinforcement Learning, we have placed # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn about the balance between exploration and exploitation. # With the hand-engineered features, this learning process gets entirely negated. # Set 'state' as a tuple of relevant data for the agent #state = None state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left']) return state def get_maxQ(self, state): """ The get_maxQ function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ ########### ## TO DO ## ########### # Calculate the maximum Q-value of all actions for a given state maxQ = -100000000 for each in self.Q[state]: val = self.Q[state][each] if val > maxQ: maxQ = val return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. 
""" ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 temp = ['None' if x == None else str(x) for x in state] strState = '_'.join(temp) if self.learning: temp = {} for action in self.valid_actions: temp[action] = 0.0 self.Q[state] = self.Q.get(state, temp) return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. """ # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() action = None ########### ## TO DO ## ########### # When not learning, choose a random action if not self.learning: action = random.choice(self.valid_actions) else: p = random.uniform(0, 1) if p < self.epsilon: action = random.choice(self.valid_actions) else: maxVal = -100000000 for each in self.Q[state]: val = self.Q[state][each] if val > maxVal: action = each maxVal = val elif val == maxVal: print action, each action = random.choice([action, each]) # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie". return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives a reward. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') if self.learning: # Q-value iteration update rule # (1-alpha)*Q(st,at) + alpha (rt+1 + gamma*maxQ(st+1,a)) self.Q[state][action] = ( 1 - self.alpha) * self.Q[state][action] + self.alpha * (reward) return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor # Set any additional class parameters as needed self.t = 0 random.seed(1000) def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 if testing: self.epsilon = 0.0 self.alpha = 0.0 else: # commented out testing parameters # self.epsilon = self.epsilon - 0.05 self.t += 1.0 # self.epsilon = 1.0/(self.t**2) # self.epsilon = 1.0/(self.t**2 + self.alpha*self.t) # self.epsilon = math.fabs(math.cos(self.alpha*self.t)) self.epsilon = math.fabs(math.cos(self.alpha * self.t)) return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline # Set 'state' as a tuple of relevant data for the agent # When learning, check if the state is in the Q-table # If it is not, create a dictionary in the Q-table for the current 'state' # For each action, set the Q-value for the state-action pair to 0 # helper to create state string def xstr(s): if s is None: return 'None' else: return str(s) state = xstr(waypoint) + "_" + inputs['light'] + "_" + xstr( inputs['left']) + "_" + xstr(inputs['oncoming']) if self.learning: self.Q[state] = self.Q.get(state, { None: 0.0, 'forward': 0.0, 'left': 0.0, 'right': 0.0 }) return state def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ # Calculate the maximum Q-value of all actions for a given state # preset an initialization value that should be replaced by a more valid Q value in the loop. maxQ = -1000.0 for action in self.Q[state]: if maxQ < self.Q[state][action]: maxQ = self.Q[state][action] return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. 
""" # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if self.learning: self.Q[state] = self.Q.get(state, { None: 0.0, 'forward': 0.0, 'left': 0.0, 'right': 0.0 }) return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. """ # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() action = None # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state if not self.learning: action = random.choice(self.valid_actions) else: if self.epsilon > 0.01 and self.epsilon > random.random(): action = random.choice(self.valid_actions) else: valid_actions = [] maxQ = self.get_maxQ(state) for act in self.Q[state]: if maxQ == self.Q[state][act]: valid_actions.append(act) action = random.choice(valid_actions) return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') if self.learning: self.Q[state][action] = self.Q[state][action] + self.alpha * ( reward - self.Q[state][action]) return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """An agent that learns to drive in the smartcab world.""" def __init__(self, env): super(LearningAgent, self).__init__( env ) # sets self.env = env, state = None, next_waypoint = None, and a default color self.color = 'black' # override color self.planner = RoutePlanner( self.env, self) # simple route planner to get next_waypoint self.valid_actions = Environment.valid_actions # Intializing previous action, state, reward. self.prev_action = None self.prev_state = None self.prev_reward = None #initialize the Q_table self.Q = {} #Parameters for Q-Learning self.alpha = 0.85 self.gamma = 0.45 self.epsilon = 0.001 self.default_Q = 10 def reset(self, destination=None): self.planner.route_to(destination) # TODO: Prepare for a new trip; reset any variables here, if required self.state = None self.next_waypoint = None self.prev_action = None self.prev_state = None self.prev_reward = None def update(self, t): # Gather inputs self.next_waypoint = self.planner.next_waypoint( ) # from route planner, also displayed by simulator inputs = self.env.sense(self) deadline = self.env.get_deadline(self) #Update inputs to include all relevant variables (next_waypoint, status, light, left) inputs['waypoint'] = self.next_waypoint del inputs['right'] #Convert dictionary values of inputs into tuple to reflect state self.state = inputs state = tuple(inputs.values()) # TODO: Select action according to your policy best_action = None #Exploration if random.random() <= self.epsilon: best_action = random.choice(self.valid_actions) if (state, best_action) not in self.Q.keys(): self.Q[(state, best_action)] = self.default_Q Q_value = self.Q[(state, best_action)] else: Q_values = [] for action in self.valid_actions: if (state, action) not in self.Q.keys(): self.Q[(state, action)] = self.default_Q Q_values.append(self.Q[(state, action)]) #Find best_action and Q_value max_Q = max(Q_values) index = [] for i, x in enumerate(Q_values): if x == max(Q_values): index.append(i) i = random.choice(index) best_action = self.valid_actions[i] Q_value = Q_values[i] # Execute action and get reward action = best_action reward = self.env.act(self, action) # TODO: Learn policy based on state, action, reward if self.prev_state != None: self.Q[(self.prev_state, self.prev_action)] = (1 - self.alpha) * self.Q[ (self.prev_state, self.prev_action)] + self.alpha * ( self.prev_reward + self.gamma * (self.Q[(state, action)])) #Update previous state, action, and reward self.prev_action = action self.prev_state = state self.prev_reward = reward print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format( deadline, inputs, action, reward) # [debug]
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=True, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor ########### ## TO DO ## ########### # Set any additional class parameters as needed def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) ########### ## TO DO ## global t t = t + 1 # self.epsilon = self.epsilon - 0.05 self.epsilon = math.exp(-0.01 * t) # self.alpha =.5* math.exp(-0.01* t) if testing == True: self.epsilon = 0 self.alpha = 0 ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # NOTE : you are not allowed to engineer eatures outside of the inputs available. # Because the aim of this project is to teach Reinforcement Learning, we have placed # constraints in order for you to learn how to adjust epsilon and alpha, and thus # learn about the balance between exploration and exploitation. # With the hand-engineered features, this learning process gets entirely negated. # Set 'state' as a tuple of relevant data for the agent state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left']) return state def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ ########### ## TO DO ## ########### # Calculate the maximum Q-value of all actions for a given state maxQ = max(self.Q[state].values()) return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. """ ########### ## TO DO ## ########### if self.learning == True: if state not in self.Q.keys(): self.Q[state] = {el: 0 for el in self.valid_actions} # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. 
""" # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() max_value = self.get_maxQ(state) best_actions = [ key for key, value in self.Q[state].items() if value == max_value ] if self.learning == False: action = random.choice(self.valid_actions) elif self.epsilon > random.random(): action = random.choice(self.valid_actions) else: action = random.choice(best_actions) ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie". return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives a reward. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### if self.learning == True: self.Q[state][action] = self.alpha * reward + \ (1-self.alpha) * self.Q[state][action] # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """An agent that learns to drive in the smartcab world.""" def __init__(self, env): super(LearningAgent, self).__init__( env ) # sets self.env = env, state = None, next_waypoint = None, and a default color self.color = 'red' # override color self.planner = RoutePlanner( self.env, self) # simple route planner to get next_waypoint # TODO: Initialize any additional variables here self.alpha = 0.5 self.gamma = 0.5 self.epsilon = 1.0 self.floor = 0.05 self.log = True self.qinit = {'wait': .0, 'forward': .0, 'left': .0, 'right': .0} self.qdf = pd.DataFrame() def reset(self, destination=None): self.planner.route_to(destination) # TODO: Prepare for a new trip; reset any variables here, if required self.env.totalreward = 0.0 def update(self, t): # Gather inputs self.next_waypoint = self.planner.next_waypoint( ) # from route planner, also displayed by simulator inputs = self.env.sense(self) deadline = self.env.get_deadline(self) # TODO: Update state # Filter state space # Can ignore traffic when red light except for potential collision on right turn on red # Reason: Crossing red and accident have same penalty if inputs['light'] == 'red': if inputs['oncoming'] != 'left': inputs['oncoming'] = None if inputs['left'] != 'forward': inputs['left'] = None inputs['right'] = None if inputs['light'] == 'green': # ignore oncoming right turn if inputs['oncoming'] == 'right': inputs['oncoming'] = None # ignore right turn from left if inputs['left'] == 'right': inputs['left'] = None # Select State Space #state = ':'.join((inputs['light'], self.next_waypoint)) state = ':'.join( (inputs['light'], str(inputs['oncoming']), str(inputs['left']), str(inputs['right']), self.next_waypoint)) self.state = state if self.log: print "[DEBUG]: %s" % state if state not in self.qdf.index: newq = pd.DataFrame(self.qinit, index=[state]) self.qdf = self.qdf.append(newq) # TODO: Select action according to your policy # select best action from Q matrix qaction = self.qdf.loc[state].idxmax() # random selection of action randaction = random.choice(self.env.valid_actions) # Action Policy sample = random.random() if self.log: print "[DEBUG] epsilon: %s" % self.epsilon if sample <= self.epsilon: action = randaction if self.log: print "[DEBUG] randomaction: %s" % action else: if qaction == 'wait': action = None else: action = qaction if self.log: print "[DEBUG] bestaction: %s" % action # Slowly decrease epsilon, leave at 5% floor if self.epsilon > self.floor: self.epsilon -= .00075 # Override selection. DEBUG only ! #action = qaction #action = randaction # Execute action and get reward reward = self.env.act(self, action) # TODO: Learn policy based on state, action, reward if action == None: nowstate = self.qdf['wait'][state] else: nowstate = self.qdf[action][state] nextstate = nowstate * (1 - self.alpha) + self.alpha * ( reward + self.gamma * 2.0) if self.log: print "[DEBUG] nextstate: %s" % nextstate # Update Q matrix if action == None: self.qdf['wait'][state] = nextstate else: self.qdf[action][state] = nextstate if self.log: print "[DEBUG] qdf:" if self.log: print self.qdf self.env.totalreward += reward if self.log: print "[DEBUG] totalreward: %s" % self.env.totalreward if self.log: print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format( deadline, inputs, action, reward) # [debug]
class LearningAgent(Agent): """An agent that learns to drive in the smartcab world.""" def __init__(self, env, trials=1): super(LearningAgent, self).__init__(env) # sets self.env = env, state = None, next_waypoint = None, and a default color self.color = 'red' # override color self.planner = RoutePlanner(self.env, self) # simple route planner to get next_waypoint # TODO: Initialize any additional variables here self.learning_rate = 0.9 self.Q = {} # Standard default Q value of 0 # self.default_Q = 0 # Optimal default Q value. self.default_Q = 1 # discount factor is denoted by Beta in official Bellman equation formula # (http://web.stanford.edu/~pkurlat/teaching/5%20-%20The%20Bellman%20Equation.pdf), # or gamma in Udacity course. self.discount_factor = 0.33 # How likely are we to pick random action / explore new paths? self.epsilon = 0.1 # How many times agent able to reach the target for given trials? self.success = 0 self.total = 0 self.trials = trials # How many penalties does an agent get? self.penalties = 0 self.moves = 0 self.net_reward = 0 def reset(self, destination=None): self.planner.route_to(destination) # TODO: Prepare for a new trip; reset any variables here, if required self.prev_state = None self.prev_action = None self.prev_reward = None def update(self, t): # Gather inputs self.next_waypoint = self.planner.next_waypoint() # from route planner, also displayed by simulator inputs = self.env.sense(self) deadline = self.env.get_deadline(self) # TODO: Update state inputs['waypoint'] = self.next_waypoint # Tried deleting some inputs to see how it affected performance. # Success rate did improve significantly, and counterintuitively, less penalty. del inputs['oncoming'] del inputs['left'] del inputs['right'] # Tried to see if deadline info would improve performance turns out it got worse. # inputs['deadline'] = deadline self.state = tuple(sorted(inputs.items())) # TODO: Select action according to your policy # Random action # action = random.choice(Environment.valid_actions) # Q learning chosen action _Q, action = self._select_Q_action(self.state) # Execute action and get reward reward = self.env.act(self, action) # Some stats self.net_reward += reward self.moves += 1 if reward < 0: self.penalties+= 1 add_total = False if deadline == 0: add_total = True if reward > 5: self.success += 1 add_total = True if add_total: self.total += 1 print self._more_stats() # TODO: Learn policy based on state, action, reward # Note that we are updating for the previous state's Q value since Utility function is always +1 future state. if self.prev_state != None: if (self.prev_state, self.prev_action) not in self.Q: self.Q[(self.prev_state, self.prev_action)] = self.default_Q # Update to Q matrix as described in this lesson: # https://www.udacity.com/course/viewer#!/c-ud728-nd/l-5446820041/m-634899057 # WRONG! `- self.Q[(self.prev_state, self.prev_action)]` should not be there. 
            # self.Q[(self.prev_state,self.prev_action)] = (1 - self.learning_rate) * self.Q[(self.prev_state,self.prev_action)] + \
            #     self.learning_rate * (self.prev_reward + self.discount_factor * \
            #     self._select_Q_action(self.state)[0] - self.Q[(self.prev_state, self.prev_action)])
            # Correct method:
            self.Q[(self.prev_state,self.prev_action)] = (1 - self.learning_rate) * self.Q[(self.prev_state,self.prev_action)] + \
                self.learning_rate * (self.prev_reward + self.discount_factor * \
                self._select_Q_action(self.state)[0])
            # pdb.set_trace()

        self.prev_state = self.state
        self.prev_action = action
        self.prev_reward = reward

        self.env.status_text += ' ' + self._more_stats()
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]

    def _more_stats(self):
        """Get additional stats."""
        return "success/total = {}/{} of {} trials (net reward: {})\npenalties/moves (penalty rate): {}/{} ({})".format(
            self.success, self.total, self.trials, self.net_reward,
            self.penalties, self.moves,
            round(float(self.penalties) / float(self.moves), 2))

    def _select_Q_action(self, state):
        """Select max Q and action based on the given state.

        Args:
            state(tuple): Tuple of state e.g. (('heading', 'forward'), ('light', 'green'), ...).

        Returns:
            tuple: max Q value and best action.
        """
        best_action = random.choice(Environment.valid_actions)
        if self._random_pick(self.epsilon):
            # Explore: keep the random action and report its Q-value
            max_Q = self._get_Q(state, best_action)
        else:
            # Exploit: scan all actions for the highest Q-value,
            # flipping a fair coin on ties
            max_Q = -999999
            for action in Environment.valid_actions:
                Q = self._get_Q(state, action)
                if Q > max_Q:
                    max_Q = Q
                    best_action = action
                elif Q == max_Q:
                    if self._random_pick(0.5):
                        best_action = action
        return (max_Q, best_action)

    def _get_Q(self, state, action):
        """Get the Q value for a given state and action. If the Q value is not
        found in self.Q, self.default_Q is returned instead.

        Args:
            state(tuple): Tuple of state e.g. (('heading', 'forward'), ('light', 'green'), ...).
            action(string): None, 'forward', 'left', or 'right'.
        """
        return self.Q.get((state, action), self.default_Q)

    def _random_pick(self, epsilon=0.5):
        """Random pick with epsilon as bias. The larger epsilon is, the more
        likely a random action becomes. An explanation is available in this course:
        https://www.udacity.com/course/viewer#!/c-ud728-nd/l-5446820041/m-634899065
        starting at 2:10. One may read this function as: when True, take a random
        action. For an equal chance, use epsilon = 0.5.

        Args:
            epsilon(float): Likelihood of True.
        """
        return random.random() < epsilon
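# Flipping a fair coin on each tie (as in _select_Q_action above) is not uniform
# once three or more actions tie: the later an action appears in the scan, the
# better its odds. Reservoir sampling keeps the selection uniform in one pass;
# a minimal sketch under the same loop structure (the example Q-table at the
# bottom is an illustrative stub):

import random

def argmax_uniform(actions, q_of):
    """One-pass argmax that picks uniformly among tied actions."""
    best_action, max_q, ties = None, float('-inf'), 0
    for a in actions:
        q = q_of(a)
        if q > max_q:
            max_q, best_action, ties = q, a, 1
        elif q == max_q:
            ties += 1
            if random.random() < 1.0 / ties:  # keep each new tie with probability 1/k
                best_action = a
    return best_action

# e.g.:
# argmax_uniform([None, 'forward', 'left', 'right'],
#                {None: 1, 'forward': 1, 'left': 1, 'right': 0}.get)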
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor ########### ## TO DO ## ########### # Set any additional class parameters as needed self.t = 0 def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 # self.epsilon *= 0.995 self.epsilon = math.cos(float(self.t) / 500) self.alpha = 0.5 self.t += 1 return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint # ex. waypoint = ['forward', 'left', 'right'] inputs = self.env.sense( self) # Visual input - intersection light and traffic # ex. inputs = {'light': 'red', 'oncoming': None, 'left': 'right', 'right': 'forward'} deadline = self.env.get_deadline(self) # Remaining deadline # ex. deadline = ########### ## TO DO ## ########### # Set 'state' as a tuple of relevant data for the agent state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right']) return state def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ ########### ## TO DO ## ########### # Calculate the maximum Q-value of all actions for a given state maxQ = max(self.Q[state].values()) return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. """ ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if self.learning: self.Q.setdefault(state, {action: 0.0 for action in self.valid_actions}) return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. 
""" # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() action = None ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state probability = random.random() if not self.learning: action = random.choice(self.valid_actions) else: if self.epsilon > probability: action = random.choice(self.valid_actions) else: maxQ = self.get_maxQ(state) action_maximum = [ action for action, Q in self.Q[state].iteritems() if Q == maxQ ] action = random.choice(action_maximum) return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') # use following function to update q value # Q(S,A) - (1-alpha)*Q(S,A) + alpha*[R + gamma*maxQ(S',a)] # gamma = 0 # Q_next = self.get_maxQ(state) # tuple (action , qvalue) Q_actual = self.Q[state][action] # qvalue self.Q[state][action] = (1 - self.alpha) * Q_actual + self.alpha * (reward) return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the first step at making a Q-Learning Agent which begins to learn. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Exploration rate self.alpha = alpha # Learning rate self.gamma = 1.0 # Discount rate ########### ## TO DO ## ########### # Set any additional class parameters as needed self.trial_count = 0 # def getDecay(self, decay): # self.epsilon = 1 / math.pow(self.trial_count, 2) # self.epsilon = math.exp(1) ** (-self.alpha * self.trial_count) # self.epsilon - math.cos(a * self.trial_count) # self.epsilon = self.epsilon / math.sqrt(self.trial_count) # self.epsilon = 1 / math.pow(self.trial_count, 2) def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.trial_count += 1 self.planner.route_to(destination) ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 # self.epsilon -= 0.05 if self.epsilon > 0: self.epsilon = math.pow(self.alpha, self.trial_count) if testing: self.epsilon = 0.0 self.alpha = 0.0 return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic # deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # Set 'state' as a tuple of relevant data for the agent state = (waypoint, inputs['light'], inputs['oncoming']) return state def createQ(self, state): """ The createQ function is called when a state is generated by the agent. """ ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 # if self.learning: if state not in self.Q: self.Q[state] = dict() for action in self.valid_actions: self.Q[state][action] = 0.0 return def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ ########### ## TO DO ## ########### # Calculate the maximum Q-value of all actions for a given state maxQ = max(self.Q[state].values()) return maxQ def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. 
""" # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state action = None if self.learning: if random.random < self.epsilon: # choose action with 'epsilon probability' action = random.choice(self.valid_actions) else: action = random.choice([ i for i in self.Q[self.state].keys() if self.Q[self.state][i] == self.get_maxQ(self.state) ]) # bestQ = None # bestAction = None # for action in self.valid_actions: # thisQ = self.Q[state][action] # if thisQ > bestQ: # bestQ = thisQ # bestAction = action # action = bestAction else: # choose random action action = random.choice(self.valid_actions) return action # if self.learning: # if random.random() < self.epsilon: # action = random.choice(self.valid_actions) # else: # action = random.choice([i for i in self.Q[self.state].keys() if self.Q[self.state][i] == self.get_maxQ(self.state)]) # else: # action = random.choice(self.valid_actions) def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') if self.learning: self.Q[state][action] = ( 1 - self.alpha) * self.Q[state][action] + (reward * self.alpha) return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """ An agent that learns to drive in the Smartcab world. This is the object you will be modifying. """ def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5): super(LearningAgent, self).__init__(env) # Set the agent in the evironment self.planner = RoutePlanner(self.env, self) # Create a route planner self.valid_actions = self.env.valid_actions # The set of valid actions # Set parameters of the learning agent self.learning = learning # Whether the agent is expected to learn self.Q = dict( ) # Create a Q-table which will be a dictionary of tuples self.epsilon = epsilon # Random exploration factor self.alpha = alpha # Learning factor self.gamma = 0.9 # Discount factor ########### ## TO DO ## ########### # Set any additional class parameters as needed self.trial_step = 0 def reset(self, destination=None, testing=False): """ The reset function is called at the beginning of each trial. 'testing' is set to True if testing trials are being used once training trials have completed. """ # Select the destination as the new location to route to self.planner.route_to(destination) ########### ## TO DO ## ########### # Update epsilon using a decay function of your choice # Update additional class parameters as needed # If 'testing' is True, set epsilon and alpha to 0 if testing: self.epsilon = 0 self.alpha = 0 else: self.trial_step += 1 #Default #self.epsilon = self.epsilon - 0.05 # Gave excellent results self.epsilon = 0.95**self.trial_step #self.epsilon = 1/(self.trial_step**2) #self.epsilon = math.cos(self.trial_step/19.0) #print "*************{}*************".format(self.trial_step) #self.gamma = math.pow(0.9,self.trial_step) return None def build_state(self): """ The build_state function is called when the agent requests data from the environment. The next waypoint, the intersection inputs, and the deadline are all features available to the agent. """ # Collect data about the environment waypoint = self.planner.next_waypoint() # The next waypoint inputs = self.env.sense( self) # Visual input - intersection light and traffic deadline = self.env.get_deadline(self) # Remaining deadline ########### ## TO DO ## ########### # Set 'state' as a tuple of relevant data for the agent #deadline seems relevant but dramatically increases state space. state = (waypoint, inputs['light'], inputs['oncoming']) return state def get_maxQ(self, state): """ The get_max_Q function is called when the agent is asked to find the maximum Q-value of all actions based on the 'state' the smartcab is in. """ ########### ## TO DO ## ########### # Calculate the maximum Q-value of all actions for a given state """ THIS CODE RETURNS ACTION ASSOCIATED WITH maxQ value maxQ_val = 0 #Choose action with max Q value for act in self.Q[state]: print "Action = {} Q_val = {}".format(act,self.Q[state][act]) if self.Q[state][act] > maxQ_val: action = act maxQ_val = self.Q[state][act] print "Action with > Q" if maxQ_val == 0: action = random.choice(self.valid_actions) print "Random Action because Q all 0" """ maxQ = max(self.Q[state].values()) return maxQ def createQ(self, state): """ The createQ function is called when a state is generated by the agent. 
""" ########### ## TO DO ## ########### # When learning, check if the 'state' is not in the Q-table # If it is not, create a new dictionary for that state # Then, for each action available, set the initial Q-value to 0.0 if self.learning == True: if self.Q.get(state) == None: self.Q[state] = {a: 0 for a in self.valid_actions} #print "Creating state Actions: " #for acts in self.Q[state]: # print "actions = {} Q vals = {}".format(acts, self.Q[state][acts]) else: pass #print "State previously visited" return def choose_action(self, state): """ The choose_action function is called when the agent is asked to choose which action to take, based on the 'state' the smartcab is in. """ # Set the agent state and default action self.state = state self.next_waypoint = self.planner.next_waypoint() action = None ########### ## TO DO ## ########### # When not learning, choose a random action # When learning, choose a random action with 'epsilon' probability # Otherwise, choose an action with the highest Q-value for the current state #for debugging purposes randy = random.random() if ((self.learning == False) or (randy < self.epsilon)): action = random.choice(self.valid_actions) #print "randy < epsilon; {} < {}".format(randy, self.epsilon) print "Random Action Epsilon" else: print "Getting Max Q" maxQ = self.get_maxQ(state) print "maxQ = {}".format(maxQ) # Kind of clever solution but only returns 1 key/action #action = self.Q[state].keys()[self.Q[state].values().index(maxQ)] maxQ_actions = [] for action, q_val in self.Q[state].items(): if q_val == maxQ: maxQ_actions.append(action) print "bucha actions = {}".format(maxQ_actions) #print "type of thing = {}".format(type(maxQ_actions)) action = random.choice(maxQ_actions) print action return action def learn(self, state, action, reward): """ The learn function is called after the agent completes an action and receives an award. This function does not consider future rewards when conducting learning. """ ########### ## TO DO ## ########### # When learning, implement the value iteration update rule # Use only the learning rate 'alpha' (do not use the discount factor 'gamma') #self.get_maxQ(state) # This implementation does not use gamma... but if it did # self.Q[state][action] += self.alpha(reward + gamma * argmax(R(s',a')) - self.Q[state][action]) if self.learning == True: self.Q[state][action] += self.alpha * (reward - self.Q[state][action]) print "new Q = {}".format(self.Q[state][action]) return def update(self): """ The update function is called when a time step is completed in the environment for a given trial. This function will build the agent state, choose an action, receive a reward, and learn if enabled. """ state = self.build_state() # Get current state self.createQ(state) # Create 'state' in Q-table action = self.choose_action(state) # Choose an action reward = self.env.act(self, action) # Receive a reward self.learn(state, action, reward) # Q-learn return
class LearningAgent(Agent): """An agent that learns to drive in the smartcab world.""" def __init__(self, env): super(LearningAgent, self).__init__( env ) # sets self.env = env, state = None, next_waypoint = None, and a default color self.color = 'red' # override color self.planner = RoutePlanner( self.env, self) # simple route planner to get next_waypoint # TODO: Initialize any additional variables here self.Q = {} # Discount # self.gamma = 0.8 self.gamma = 0.5 # self.gamma = 0.1 # self.gamma = 0.5 self.epsilon = 0.95 # <=== # self.epsilon = 0.75 # self.epsilon = 0.5 # self.epsilon = 0.5 # self.epsilon = 0.05 # self.epsilon = 0.25 # self.epsilon = 0.99 # self.epsilon = 0.95 # Learning Rate # self.alpha = 1. # self.alpha = 0.8 # <=== self.alpha = 0.5 # self.alpha = 0.25 # self.alpha = 0.5 self.trips_rewards = {} self.trips_timers = {} self.trips_steps = {} self.trips_finished = {} self.trip_reward = 0 self.total_reward = 0 self.total_finished = 0 self.current_step = 1 self.total_steps = 1 self.trip_number = 0 # self.max_trips = 20 self.max_trips = 100 self.valid_actions = [None, 'forward', 'left', 'right'] print '' print CSI + "7;30;43m" + "Welcome to SmartCab simulation" + CSI + "0m" print '' print '==================================================' print '-------------------- config ----------------------' print 'gamma: ' + str(self.gamma) print 'epsilon: ' + str(self.epsilon) print 'alpha: ' + str(self.alpha) print '--------------------------------------------------' print '' self.timer = time.time() def reset(self, destination=None): self.planner.route_to(destination) # TODO: Prepare for a new trip; reset any variables here, if required self.trip_number += 1 self.current_step = 0 self.trip_reward = 0 self.trip_begin_time = time.time() if self.trip_number > self.max_trips: # 20 # Recording results to log.txt print 'self.results_log' print self.results_log # # Output to the log file # f = open('log.txt','ab+') # f.write(self.results_log) # f.write("\n") # f.close() # sys.exit(0) # exit if reached the self.max_trips from config print '[SmartCab] New trip - awesome!' 
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        state = {}
        state['action'] = self.next_waypoint  # desired action
        state['light'] = inputs['light']
        state['oncoming_traffic'] = inputs['oncoming']
        state['traffic_on_the_left'] = inputs['left']
        state = tuple(state.items())
        self.state = state

        print ''
        print '=================================================='
        print '-------------------- stats -----------------------'
        print 'trip_number:  ' + str(self.trip_number)
        print 'current_step: ' + str(self.current_step)
        print 'deadline:     ' + str(deadline)
        print 'total_steps:  ' + str(self.total_steps)
        print 'trip_reward:  ' + str(self.trip_reward)
        print 'total_reward: ' + str(self.total_reward)
        print 'total_reward/total_steps: ' + str(self.total_reward / self.total_steps)
        print 'timer: ' + str(round(time.time() - self.timer, 2))
        print '-------------------- STATE -----------------------'
        print state
        print '--------------------------------------------------'

        # Select action based on previous outcomes.
        # Check whether this state has occurred before.
        print '[SmartCab] Check if this state occurred before'
        if state in self.Q:
            print '=> State is found in Q-table'
            print '=> State occurred ' + str(self.Q[state]['count']) + ' time(s)'
            print '[SmartCab] Compare the preconfigured parameter epsilon with a random value:'
            random_value = random.random()
            if random_value > self.epsilon:
                print '=> random_value: ' + str(random_value) + ' > epsilon: ' + str(self.epsilon)
                action = random.choice(self.valid_actions)
                print '=> Taking random action: ' + str(action)
            else:
                print '=> random_value: ' + str(random_value) + ' < epsilon: ' + str(self.epsilon)
                print '[SmartCab] Find available_actions with maximum Q^ score:'
                available_actions = [
                    k for k, v in self.Q[state]['actions_Q^'].items()
                    if v == max(self.Q[state]['actions_Q^'].values())
                ]
                print '=> ' + str(available_actions)
                action = random.choice(available_actions)
                print '[SmartCab] Take random action from the list of available_actions with max Q^: ' + str(action)
            # # ======================================================
            # # Alternative model: take the action toward the waypoint if reward > 0, else wait (None)
            # # ------------------------------------------------------
            # # Check previous reward for the desired action (self.next_waypoint) in a similar state
            # if self.Q[state]['reward'] > 0:
            #     action = self.next_waypoint
            # else:
            #     action = None
            # # ======================================================
        else:
            print '=> State NOT found in Q-table'
            print '[SmartCab] Assigning default values for this state inside the Q-table'
            self.Q[state] = {}
            self.Q[state]['count'] = 0
            self.Q[state]['actions_rewards'] = {}
            self.Q[state]['actions_counts'] = {}
            self.Q[state]['actions_Q^'] = {}
            # action = random.choice(self.valid_actions)
            # Alternative: 'None' may be removed to motivate exploring (as advised by a Udacity review):
            action = random.choice(['forward', 'left', 'right'])
            # action = self.next_waypoint
            print '=> Taking random action: ' + str(action)
            # # ======================================================
            # # Alternative model: take a random action
            # # ------------------------------------------------------
            # action = random.choice(self.valid_actions)
            # print '=> Taking random action: ' + str(action)
            # # ======================================================
        # Count the occurrence of the current state
        self.Q[state]['count'] += 1

        # Execute action and get reward
        print '--------------------------------------------------'
        reward = self.env.act(self, action)
        print '--------------------------------------------------'
        print '=> Reward: ' + str(reward)
        # A reward of 12 is treated as the destination-reached payout in this environment
        if reward == 12:
            self.trips_finished[self.trip_number] = 1
            self.total_finished += 1
        else:
            self.trips_finished[self.trip_number] = 0

        print ''
        print '[SmartCab] Calculating and recording current results to the Q-table'
        # Learn policy based on state, action, reward.
        # Note: the future-value term uses the max Q^ of the *current* state,
        # since the successor state is not sensed until the next update.
        if action in self.Q[state]['actions_rewards']:
            self.Q[state]['actions_rewards'][action] = reward
            self.Q[state]['actions_counts'][action] += 1
            Qh = self.Q[state]['actions_Q^'][action]
            Qh = Qh + (self.alpha * (reward + (self.gamma * (max(self.Q[state]['actions_Q^'].values()))) - Qh))
            # # ======================================================
            # # Alternative models: Q^ = 0.5 ; Q^ = 1 ; Q^ = reward
            # # ------------------------------------------------------
            # Qh = 0.5
            # Qh = 1
            # Qh = reward
            # # ------------------------------------------------------
            self.Q[state]['actions_Q^'][action] = Qh
        else:
            self.Q[state]['actions_rewards'][action] = reward
            self.Q[state]['actions_counts'][action] = 1
            # Optimistic initial Q^ (also tried: 0.0, 0.5, 1.0, 2.0, 2.5, reward)
            self.Q[state]['actions_Q^'][action] = 1.5

        # Reward for driving in the direction of the waypoint
        self.Q[state]['reward'] = reward

        print '--------------------------------------------------'
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]

        self.total_reward += reward
        self.trip_reward += reward
        self.current_step += 1
        self.total_steps += 1

        print '-----------------self.Q[state]--------------------'
        print self.Q[state]
        print '=================trips_results===================='
        self.trips_steps[self.trip_number] = self.current_step
        print 'Steps:    ' + str(self.trips_steps)
        self.trips_rewards[self.trip_number] = self.trip_reward
        print 'Rewards:  ' + str(self.trips_rewards)
        self.trips_timers[self.trip_number] = round(time.time() - self.trip_begin_time, 2)
        print 'Timers:   ' + str(self.trips_timers)
        print 'Finished: ' + str(self.trips_finished)
        self.results_log = 'Total steps: ' + str(self.total_steps) + \
            '; reward: ' + str(self.total_reward) + \
            '; finished: ' + str(self.total_finished) + \
            '; seconds: ' + str(round(time.time() - self.timer, 4))
        print self.results_log
        print '=================================================='
class LearningAgent_v0(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent_v0, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # Initialize any additional variables here
        self.gamma = 0.2    # upper bound of the discount factor
        self.alpha = 0.1    # upper bound of the learning rate
        self.epsilon = 0.2  # lower bound of the proportion of random steps
        # self.state = {'deadline': None, 'forward_ok': True, 'left_ok': True, 'right_ok': True, 'next_waypoint': None}
        self.state = {'light': 'green', 'oncoming': None, 'left': None, 'right': None, 'next_waypoint': None}
        self.exreward = 0
        # self.exstate = {'deadline': None, 'forward_ok': True, 'left_ok': True, 'right_ok': True, 'next_waypoint': None}
        self.exstate = {'light': 'green', 'oncoming': None, 'left': None, 'right': None, 'next_waypoint': None}
        self.exaction = None
        self.debug = False
        self.trip_history = []
        self.reward_history = np.zeros(1)
        self.Q = OrderedDict()

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.state = {'light': 'green', 'oncoming': None, 'left': None, 'right': None, 'next_waypoint': None}
        self.exreward = 0
        self.exstate = {'light': 'green', 'oncoming': None, 'left': None, 'right': None, 'next_waypoint': None}
        self.exaction = None
        if len(self.trip_history) < 150:
            print 'Current success rate is {}'.format(
                sum(self.trip_history) / (len(self.trip_history) + 0.000001))
        else:
            print 'Success rate for recent 100 trials is {}'.format(
                sum(self.trip_history[-100:]) / (len(self.trip_history[-100:]) + 0.000001))
        print 'Average reward for recent moves is {}'.format(np.mean(self.reward_history))
        if self.reward_history.shape[0] > 1000:
            self.reward_history = np.delete(self.reward_history, range(100))
        # print "number of parameters is {0}, sum of Q function is {1}".format(len(self.Q.keys()), sum(self.Q.values()))

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)  # may need to take the deadline into account
        # Update state
        # self.state = {'inputs': inputs, 'deadline': deadline, 'next_waypoint': self.next_waypoint}
        # Annealing schedules that were tried:
        # epsilon = self.epsilon + (1 - self.epsilon) / (t + 1) * 5
        # gamma = (1 - 10 / (t + 10)) * self.gamma
        # alpha = self.alpha / (t + 1.0)
        gamma = self.gamma
        epsilon = self.epsilon
        alpha = self.alpha
        self.state['next_waypoint'] = self.next_waypoint
        # self.state['deadline'] = int(deadline > 5) + int(deadline > 25)
        for k in inputs.keys():
            self.state[k] = inputs[k]

        # Select action according to the policy.
        # Q-table keys are "str(state values):str(action)" strings.
        newkey = str(self.exstate.values()) + ':' + str(self.exaction)
        if self.debug:
            print "\n New key is {}".format(newkey)
            raw_input("Press Enter to continue...")

        # Collect the Q entries that belong to the current state
        tmp_Q = dict([(x, self.Q[x]) for x in self.Q.keys() if str(self.state.values()) in x])
        if self.debug:
            print "\n Q value for future state is {}".format(str(tmp_Q))
            raw_input("Press Enter to continue...")

        action = random.choice(Environment.valid_actions[:])
        tmp_max_Q = 0
        if len(tmp_Q) == 0:
            tmp_max_Q = 0
            action = random.choice(Environment.valid_actions[:])
        else:
            tmp_idx = max(tmp_Q.iterkeys(), key=(lambda key: tmp_Q[key]))
            tmp_max_Q = tmp_Q[tmp_idx]
            if tmp_max_Q > 0 or len(tmp_Q) == 4:
                # Exploit the best known action for this state
                tmp_Q_split = tmp_idx.split(':')
                action = tmp_Q_split[1]
                if action == 'None':
                    action = None
            else:
                # Try an action that has not been recorded for this state yet
                exist_actions = [x.split(':')[1] for x in tmp_Q.keys()]
                all_actions = ['None', 'forward', 'left', 'right']
                remaining_actions = [x for x in all_actions if not (x in exist_actions)]
                if self.debug:
                    print "Remaining actions are {}".format(str(remaining_actions))
                action = random.choice(remaining_actions)
                tmp_max_Q = 0
                if action == 'None':
                    action = None

        if self.debug:
            print "\n future optimum action is {}".format(str(action))
            raw_input("Press Enter to continue...")

        if random.uniform(0, 1) < epsilon:
            action = random.choice(Environment.valid_actions[:])
            if self.debug:
                print "\n Instead use a random action, {}".format(str(action))
                raw_input("Press Enter to continue...")
        # A purely random policy has a success rate of about 0.20
        # action = random.choice(Environment.valid_actions[:])

        # Update the Q-value of the previous (state, action) pair
        newval = self.exreward + gamma * tmp_max_Q
        if self.debug:
            print "\n current reward is {0}, gamma is {1}, and estimated max future Q is {2}".format(self.exreward, gamma, tmp_max_Q)
            raw_input("Press Enter to continue...")
        if newkey in self.Q.keys():
            self.Q[newkey] = self.Q[newkey] * (1 - alpha) + alpha * newval
        else:
            self.Q[newkey] = self.alpha * newval
        if self.debug:
            print "updated Q values {}".format(str(self.Q))
            raw_input("Press Enter to continue...")

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward > 9:
            # Destination reached: record the success and give the final move its reward
            self.trip_history.append(1)
            newkey = str(self.state.values()) + ':' + str(action)
            newval = reward  # + deadline
            if newkey in self.Q.keys():
                self.Q[newkey] = self.Q[newkey] * (1 - alpha) + alpha * newval
            else:
                self.Q[newkey] = self.alpha * newval
        elif deadline == 0:
            # Deadline expired: record the failure
            self.trip_history.append(0)
            newkey = str(self.state.values()) + ':' + str(action)
            newval = reward
            if newkey in self.Q.keys():
                self.Q[newkey] = self.Q[newkey] * (1 - alpha) + alpha * newval
            else:
                self.Q[newkey] = self.alpha * newval

        # Learn policy based on state, action, reward
        self.exreward = reward
        self.exstate = self.state
        self.exaction = action
        self.reward_history = np.append(self.reward_history, reward)
class LearningAgent(Agent): """An agent that learns to drive in the smartcab world.""" def __init__(self, env): super(LearningAgent, self).__init__( env ) # sets self.env = env, state = None, next_waypoint = None, and a default color self.color = 'red' # override color self.planner = RoutePlanner( self.env, self) # simple route planner to get next_waypoint # TODO: Initialize any additional variables here self.Q_table = {} self.gamma = 0.8 self.alpha = 0.5 self.epsilon = 0.1 self.epsilon_decay_rate = 0.0001 self.valid_actions = [None, 'forward', 'left', 'right'] def reset(self, destination=None): self.planner.route_to(destination) # TODO: Prepare for a new trip; reset any variables here, if required # build a state by taking parameters from inputs and next_way point def return_state(self, inputs): return ( inputs['light'], inputs['oncoming'], inputs['left'], # inputs['right'], self.next_waypoint, # self.env.get_deadline(self) ) # read Q_value from Q_table based on state and action def get_Q_value(self, state, action): if (state, action) in self.Q_table: return self.Q_table[(state, action)] else: return 0 # return the max Q_value based on the state and all possible actions def get_Max_Q(self, state): max_Q_value = 0 for action in self.valid_actions: if max_Q_value < self.get_Q_value(state, action): max_Q_value = self.get_Q_value(state, action) return max_Q_value # update Q_value in Q_Table def update_Q_value(self, state, action, reward): exist_Q_value = self.get_Q_value(state, action) self.next_waypoint = self.planner.next_waypoint() next_state = self.return_state(self.env.sense(self)) Q_value = exist_Q_value + self.alpha * ( reward + self.gamma * self.get_Max_Q(next_state) - exist_Q_value) self.Q_table[(state, action)] = Q_value # define action policy that take the action which result in the max Q_value from the current state def action_policy(self, state): action_to_take = None best_Q_value = 0 for action in self.valid_actions: if self.get_Q_value(state, action) > best_Q_value: best_Q_value = self.get_Q_value(state, action) action_to_take = action elif self.get_Q_value(state, action) == best_Q_value: action_to_take = action return action_to_take def update(self, t): # Gather inputs self.next_waypoint = self.planner.next_waypoint( ) # from route planner, also displayed by simulator inputs = self.env.sense(self) deadline = self.env.get_deadline(self) # get the state parameters and display them. self.state = self.return_state(inputs) # TODO: Select action according to your policy # action = random.choice(Environment.valid_actions[1:]) # select action according to the policy with greedy strategy if self.epsilon > random.random(): action = random.choice(self.valid_actions) else: action = self.action_policy(self.state) self.epsilon -= self.epsilon_decay_rate # Execute action and get reward reward = self.env.act(self, action) # TODO: Learn policy based on state, action, reward self.update_Q_value(self.state, action, reward) print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format( deadline, inputs, action, reward) # [debug]
class LearningAgent(Agent): """An agent that learns to drive in the smartcab world.""" def __init__(self, env): super(LearningAgent, self).__init__( env ) # sets self.env = env, state = None, next_waypoint = None, and a default color self.color = 'red' # override color self.planner = RoutePlanner( self.env, self) # simple route planner to get next_waypoint # TODO: Initialize any additional variables here self.Q = defaultdict(list) self.alphaLearn = .3 self.gammaDiscount = .99 self.stateAction = None self.epsilon = .1 self.penalty = 0 self.oldPenalty = None def reset(self, destination=None): self.stateAction = None self.planner.route_to(destination) print 'penalty = {}'.format( self.penalty if self.oldPenalty is None else (self.penalty - self.oldPenalty)) self.oldPenalty = self.penalty # TODO: Prepare for a new trip; reset any variables here, if required def update(self, t): # Gather inputs self.next_waypoint = self.planner.next_waypoint( ) # from route planner, also displayed by simulator inputs = self.env.sense(self) deadline = self.env.get_deadline(self) # TODO: Update state self.state = [ inputs['light'], inputs['right'], inputs['left'], self.next_waypoint ] # print 'this is waypoint {}'.format(self.next_waypoint) # TODO: Select action according to your policy action = None maxQ = 0 if random.random() < self.epsilon: action = random.choice([None, 'forward', 'left', 'right']) else: for k, v in self.Q.items(): if k[0:3] == self.state and v > maxQ: action = k[4] maxQ = v if action == None: action = random.choice([None, 'forward', 'left', 'right']) # Execute action and get reward reward = self.env.act(self, action) if reward < 0: self.penalty += 1 # print 'violation', self.penalty # TODO: Learn policy based on state, action, reward if self.stateAction is not None: if self.Q[tuple(self.stateAction)]: self.Q[tuple(self.stateAction)] = self.Q[tuple( self.stateAction)] + self.alphaLearn * ( reward + self.gammaDiscount * maxQ - self.Q[tuple(self.stateAction)]) else: self.Q[tuple(self.stateAction)] = self.alphaLearn * ( reward + self.gammaDiscount * maxQ) else: print 'Q learning start' #print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward) # [debug] self.stateAction = [ inputs['light'], inputs['right'], inputs['left'], self.next_waypoint, action ]
class LearningAgent(Agent): """An agent that learns to drive in the smartcab world.""" def __init__(self, env): super(LearningAgent, self).__init__( env ) # sets self.env = env, state = None, next_waypoint = None, and a default color self.color = 'taxi' # override color self.planner = RoutePlanner( self.env, self) # simple route planner to get next_waypoint # TODO: Initialize any additional variables here # Simple statistical counter variables self.trial_count = 0 self.trial_summary = {} for i in range(2): self.trial_summary[i] = 0 self.cycles = 0 self.time_count_pos = {} self.time_count_neg = {} self.deadline_start_col = [0] * 100 self.deadline_end_col = [0] * 100 def deadline_stats(self, start, deadline): if (start): self.deadline_start_col.append(deadline) elif not start: self.deadline_end_col.append(deadline) def success_stats(self, suc): if (suc): self.trial_summary[1] += 1 self.time_count_pos[self.cycles] += 1 else: self.trial_summary[0] += 1 self.time_count_neg[self.cycles] += 1 def reset(self, destination=None): self.planner.route_to(destination) # TODO: Prepare for a new trip; reset any variables here, if required self.trial_count = 0 self.cycles += 1 def update(self, t): # Gather inputs self.next_waypoint = self.planner.next_waypoint( ) # from route planner, also displayed by simulator inputs = self.env.sense(self) deadline = self.env.get_deadline(self) # TODO: Update state # TODO: Select action according to your policy action = random.choice([None, 'forward', 'left', 'right']) # Execute action and get reward reward = self.env.act(self, action) # TODO: Learn policy based on state, action, reward print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format( deadline, inputs, action, reward) # [debug]
class LearningAgent(Agent): """An agent that learns to drive in the smartcab world.""" # Learning rate ALPHA = 0.66 # Discount factor GAMMA = 0.003 def __init__(self, env): super(LearningAgent, self).__init__( env ) # sets self.env = env, state = None, next_waypoint = None, and a default color self.color = 'red' # override color self.planner = RoutePlanner( self.env, self) # simple route planner to get next_waypoint # TODO: Initialize any additional variables here self.state_prev = None self.action_prev = None self.reward_prev = None # Initialize Q-values w random values self.Q = {} for nwp, l, o, a in product(self.env.valid_actions, ('green', 'red'), self.env.valid_actions, self.env.valid_actions): self.Q[LAState(nwp, l, o), a] = random.uniform(1, 5) # Keep track of how many times the primary agent reached destination self.num_reached = 0 def reset(self, destination=None): self.planner.route_to(destination) # TODO: Prepare for a new trip; reset any variables here, if required self.state_prev = None self.action_prev = None self.reward_prev = None def update(self, t): # Gather inputs self.next_waypoint = self.planner.next_waypoint( ) # from route planner, also displayed by simulator inputs = self.env.sense(self) deadline = self.env.get_deadline(self) # TODO: Update state state = LAState(next_waypoint=self.next_waypoint, light=inputs['light'], oncoming=inputs['oncoming']) # TODO: Select action according to your policy # Basic driving agent #action = random.choice(self.env.valid_actions) # Action with highest Q score for current state action = max(self.env.valid_actions, key=lambda a: self.Q[state, a]) # Execute action and get reward reward = self.env.act(self, action) # TODO: Learn policy based on state, action, reward if self.state_prev is not None: self.Q[self.state_prev, self.action_prev] = \ (1-self.ALPHA) * self.Q[self.state_prev, self.action_prev] + \ self.ALPHA * (self.reward_prev + self.GAMMA * self.Q[state, action]) # Update previous values for the next iteration self.state_prev = state self.action_prev = action self.reward_prev = reward # Update number of times that agent reached destination self.num_reached += self.env.done print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format( deadline, inputs, action, reward) # [debug]
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0  # (fixed typo: was 'self.aplha')
        else:
            self.epsilon = self.epsilon - 0.005

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs["light"], inputs["oncoming"], inputs["left"])

        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        if state not in self.Q:
            maxQ = None
        else:
            maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if state not in self.Q:
                self.Q[state] = {action: 0.0 for action in self.valid_actions}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        if not self.learning or random.random() < self.epsilon:
            action = random.choice(self.valid_actions)
        else:
            maxQval = self.get_maxQ(state)
            best_actions = []
            for a in self.valid_actions:
                if self.Q[state][a] == maxQval:
                    best_actions.append(a)
            action = random.choice(best_actions)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (1 - self.alpha) * self.Q[state][action] + self.alpha * reward

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.train_no = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        import math  # used by the exponential decay experiments below

        self.train_no = self.train_no + 1

        # For Question 5: linear decay function
        self.epsilon -= 0.05

        # For Question 6, decay functions that were tried, with results:
        #   self.epsilon = 0.7 ** self.train_no
        #       -> Trial Aborted! Agent did not reach the destination.
        #   self.epsilon = 1 / self.train_no ** 2
        #       (note: Python 2 integer division makes this 0 for train_no >= 2)
        #       -> Trial Aborted! Agent did not reach the destination.
        #   self.epsilon = math.e ** (-0.01 * self.train_no)
        #       (alpha decay math.e ** (-0.001 * self.train_no) was not used)
        #       -> Trial Completed! Agent reached the destination.

        if testing:
            self.epsilon = 0
            self.alpha = 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # NOTE: you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn
        # about the balance between exploration and exploitation.
        # With hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (inputs['light'], inputs['oncoming'], inputs['left'], waypoint)
        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        q = [self.Q[state][action] for action in self.Q[state]]
        if q:
            maxQ = max(q)
        else:
            maxQ = 0.0

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if state not in self.Q:
                self.Q[state] = {}
                for action in self.valid_actions:
                    self.Q[state][action] = 0.0

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with the highest Q-value, you randomly
        # select between actions that "tie".
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if random.random() < self.epsilon:
                action = random.choice(self.valid_actions)
            else:
                maxQ = self.get_maxQ(state)
                max_actions = []
                for action in self.Q[state]:
                    if self.Q[state][action] == maxQ:
                        max_actions.append(action)
                action = random.choice(max_actions)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = self.Q[state][action] * (1 - self.alpha) + reward * self.alpha

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.total_trials = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        self.epsilon = max(self.epsilon - 0.01, 0.0)  # linear decay, floored at 0
        self.total_trials += 1
        # Other schedules tried (require `import math` if uncommented):
        #   self.epsilon = self.epsilon - 0.05
        #   self.epsilon = 1.5 * math.exp(-0.6 * self.total_trials)
        #   self.epsilon = math.cos(0.5 * self.total_trials)
        #   self.epsilon = 1.0 / float(self.total_trials * self.total_trials)
        #   self.epsilon = math.pow(0.75, self.total_trials)

        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'])  # inputs['right'] deliberately excluded

        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        actions = self.Q[state]
        maxQ = max(actions.values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if self.learning and state not in self.Q:
            self.Q[state] = {None: 0.0, 'left': 0.0, 'right': 0.0, 'forward': 0.0}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if random.random() < self.epsilon:
                action = random.choice(self.valid_actions)
            else:
                maxQ = self.get_maxQ(state)
                candidate_actions = [action for action in self.valid_actions
                                     if self.Q[state][action] == maxQ]
                action = random.choice(candidate_actions)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        # We want to use the alpha update: v = (1 - alpha) * v + alpha * x
        if self.learning:
            previous = self.Q[state][action]
            self.Q[state][action] = (1.0 - self.alpha) * previous + self.alpha * reward

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, epsilon=0.8, alpha=0.8, gamma=0.9, tno=1):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.epsilon = epsilon  # exploration rate
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount rate
        self.reward_previous = None
        self.action_previous = None
        self.state_previous = None
        self.q = {}  # Q-table, keyed by (state, action)
        # An earlier experiment seeded the table by hand, e.g.:
        # self.q = {(('red', None), 'right'): 10, (('green', None), 'forward'): 15,
        #           (('green', None), 'left'): 20, (('red', None), 'left'): 25}
        self.tno = 0  # trial number, used to decay the exploration rate

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.tno += 1

    def getQ(self, state, action):
        # Helper to read the Q-table; unseen pairs default to an optimistic 10.0,
        # which biases the agent toward trying untested actions
        return self.q.get((state, action), 10.0)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        location = self.env.agent_states[self]["location"]
        destination = self.env.agent_states[self]["destination"]

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        self.actions = [None, 'left', 'right', 'forward']
        # Epsilon-greedy strategy; the exploration probability is decayed by the
        # trial number (an undecayed `random.random() < self.epsilon` was also tried)
        if random.random() < self.epsilon * 1. / self.tno:
            action = random.choice(self.actions)
        else:
            q = [self.getQ(self.state, a) for a in self.actions]  # Q-values for every action in the current state
            maxQ = max(q)  # maximum Q-value in the list
            count = q.count(maxQ)  # number of actions that tie for the maximum
            if count > 1:
                best = [i for i in range(len(self.actions)) if q[i] == maxQ]
                i = random.choice(best)  # break ties randomly
            else:
                i = q.index(maxQ)
            action = self.actions[i]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        if t == 0:
            # First step of the trial: just materialize the (optimistic) Q-value
            oldv = self.q.get((self.state, action), 10.0)
            self.q[(self.state, action)] = oldv
        else:
            # Later steps: update from the previous state-action pair's value toward
            # the reward plus the discounted current estimate. (Simpler variants were
            # also tried, e.g. oldv + self.alpha * (reward - oldv), and a try/except
            # version that seeded unseen states with reward + 10.)
            oldv = self.q.get((self.state_previous, self.action_previous), 0.0)
            self.q[(self.state, action)] = oldv + self.alpha * (reward - oldv + self.gamma * self.q.get((self.state, action), 10.0))

        uQ = self.q
        self.state_previous = self.state
        self.action_previous = action

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}, uQ = {}, location = {}, destination = {}, tno = {}".format(deadline, inputs, action, reward, uQ, location, destination, 1. / self.tno)  # [debug]
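# --- For contrast: the textbook Q-learning update ---
# A hedged sketch (not taken from any of the agents above) of the standard rule
#     Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)),
# which the last agent approximates with its previous-state bookkeeping. The
# optimistic default of 10.0 mirrors that agent's getQ() helper.
def q_learning_update(q, state, action, reward, next_state, actions,
                      alpha=0.8, gamma=0.9, default=10.0):
    oldv = q.get((state, action), default)
    next_max = max(q.get((next_state, a), default) for a in actions)
    q[(state, action)] = oldv + alpha * (reward + gamma * next_max - oldv)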