Example #1
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world by using Q-Learning"""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self) # simple route planner to get next_waypoint
        self.learner = QLearn(epsilon=1, alpha=0.3, gamma=0.6)        
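        # QLearn is a helper class defined elsewhere in this submission (not shown
        # here); presumably epsilon is the exploration rate, alpha the learning rate
        # and gamma the discount factor of its epsilon-greedy Q-learning policy.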

    def reset(self, destination=None):
        self.planner.route_to(destination)      

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self) 
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = self.learner.action(self.state)
        
        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        next_inputs = self.env.sense(self) 
        self.future_next_way_out = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], self.future_next_way_out)
        #next_state = (next_inputs[0], next_inputs[1], next_inputs[3], self.planner.next_waypoint())
        self.learner.learn(self.state, action, next_state, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.alpha = 0.2
        self.Qtable = {}
        

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.reward = 0
        
        
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state

        # Inform Driving Agent
        self.state = (inputs['light'], inputs['oncoming'], 
                      inputs['left'], inputs['right'], self.next_waypoint)
        
        
        # TODO: Select action according to your policy
        valid_actions = [None, 'forward', 'left', 'right']
        
        # Random Pick Actions
        ### action = random.choice(valid_actions)
        
        # Get all Q values for all state
        all_actions = {action: self.Qtable.get((self.state, action), 0)
                        for action in valid_actions}
                           
        # Find action that maximizes Q values
        best_actions = [action for action in valid_actions 
                        if all_actions[action] == max(all_actions.values())]
           
        # Return Best Action
        action = random.choice(best_actions)    
         
        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        # To simplify the problem, I set gamma to 0.
        self.Qtable[(self.state, action)] = \
            (1 - self.alpha) * self.Qtable.get((self.state, action), 0) + self.alpha * reward
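        # For reference: the general tabular update would be
        #   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a'))
        # so with gamma = 0 the Q-value reduces to an exponentially weighted average
        # of the immediate rewards observed for this (state, action) pair.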
     
           
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class BasicAgent(Agent):
    def __init__(self, env):
        super(BasicAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.color = 'red'  # override color

    def reset(self, destination=None):
        self.planner.route_to(destination)  
        
    def update(self, t):
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        action = random.choice(self.env.valid_actions)
        reward = self.env.act(self, action)
        
        light = inputs['light']
        oncoming = inputs['oncoming']
        left = inputs['left']
        right = inputs['right']
       
        #Update the current observed state
        self.state = (light, oncoming, left, right, self.next_waypoint)
        
        print "Basic.update(): next move = {}".format(action)  # [debug]
Example #4
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world by using Q-Learning"""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.possible_actions= Environment.valid_actions
        self.ai = QLearn(epsilon=.05, alpha=0.1, gamma=0.9)        

    def reset(self, destination=None):
        self.planner.route_to(destination)      

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self) 
        inputs = inputs.items()
        deadline = self.env.get_deadline(self)

        self.state = (inputs[0],inputs[1],inputs[3],self.next_waypoint)
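        # Note: indexing into dict.items() relies on Python 2's implementation-specific
        # key ordering; indexing by explicit keys (e.g. inputs['light']) would make the
        # intended state features unambiguous.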
        
        action = self.ai.Q_move(self.state)
        
        # Execute action and get reward
        reward = self.env.act(self, action)

        inputs2 = self.env.sense(self) 
        inputs2 = inputs2.items()
        next_state = (inputs2[0],inputs2[1],inputs2[3],self.next_waypoint)

        self.ai.Q_post(self.state, action, next_state, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, learning_rate=0.6, discount_factor=0.35):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.alpha = learning_rate
        self.gamma = discount_factor
        self.state = None
        self.Q_values = self.initialize_Q_values()
 
    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
       
    def initialize_Q_values(self, val=10.0):
        Q_values = {}
        for waypoint in ['left', 'right', 'forward']:
            for light in ['red', 'green']:
                for action in self.env.valid_actions:
                    Q_values[((waypoint, light), action)] = val
        return Q_values
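    # Initializing every Q-value to a relatively large constant (10.0 by default) acts
    # as optimistic initialization: unexplored (state, action) pairs look attractive,
    # which nudges the agent to try them before it settles on a policy.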

    def find_max_Q(self, state):
        action = None
        max_Q = 0.0
        for a in self.env.valid_actions:
            Q = self.Q_values[(state, a)]
            if Q > max_Q:
                action = a
                max_Q = Q
        return (max_Q, action)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (self.next_waypoint, inputs['light'])

        # TODO: Select action according to your policy
        #action = random.choice(self.env.valid_actions) 
        (Q, action) = self.find_max_Q(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        state_prime = (next_waypoint, inputs['light'])
        (Q_prime, action_prime) = self.find_max_Q(state_prime)        
        Q += self.alpha * (reward + self.gamma*Q_prime - Q) 
        self.Q_values[(self.state, action)] = Q

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #6
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here



    def reset(self, destination=None):
        self.planner.route_to(destination)



    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = inputs['location']
        self.heading=inputs['heading']

        action = random.choice(Environment.valid_actions)
        reward = self.env.act(self, action)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs,
                                                                                                    action,
                                                                                                    reward)  # [debug]
Example #7
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # location, heading, delta, deadline, oncoming, traffic light state
        inputs['waypoint'] = self.next_waypoint

        self.state = tuple(sorted(inputs.items())) # performs the actual update

        # TODO: Select action according to your policy
        actions = [None,'forward','left','right']
        action = actions[random.randint(0,3)]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        #Actions
        self.A = ['forward', 'left', 'right', None] #all the actions available
        self.trial = 0

        #Initialize Q table(light, oncoming, next_waypoint)
        self.Q={}
        for i in ['green', 'red']:  #possible lights
            for j in [None, 'forward', 'left', 'right']: #possible oncoming_agent
                for k in ['forward', 'left', 'right']: #possible next_waypoints
                    self.Q[(i,j,k)] = [1] * len(self.A)


    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], self.next_waypoint)
        #Find the max Q value for the current state
        max_Q = self.Q[self.state].index(max(self.Q[self.state]))

        #assign action
        p = random.randrange(0, 5)  # integer drawn uniformly from {0, 1, 2, 3, 4}
        epsilon = 1.5  # p < 1.5 holds for p in {0, 1}, so the agent explores roughly 40% of the time
        if p < epsilon:
            action = random.choice(self.env.valid_actions)
        else:
            action = self.A[max_Q]
        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        gamma = 0.30
        alp_tune = 500.0  # tuning parameter (a float, so the division below is not truncated in Python 2)
        alpha = 1/(1.1+self.trial/alp_tune) # decay learning rate
        self.trial = self.trial+1

        #get the next action state Q(s',a')
        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'],next_inputs['oncoming'], next_next_waypoint)

        #Update Q Table
        self.Q[self.state][self.A.index(action)] = \
        (1-alpha)*self.Q[self.state][self.A.index(action)] + \
        (alpha * (reward + gamma * max(self.Q[next_state])))
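        # This is the standard tabular Q-learning update: blend the old estimate with
        # the observed reward plus the discounted best value attainable from the next state.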
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        # simple route planner to get next_waypoint
        self.planner = RoutePlanner(self.env, self)
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        # # from route planner, also displayed by simulator
        self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = self.next_waypoint, inputs['light']
        # self.state = self.next_waypoint, (inputs['oncoming'] == None or inputs['light'] == 'green')

        # TODO: Select action according to your policy
        # Define actions
        actions = [None, 'forward', 'left', 'right']

        # QUESTION 1- select random action
        action = random.choice(actions)

        # Execute action and get reward
        reward = self.env.act(self, action)
class LearningAgent(Agent):
    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.next_waypoint = None
        self.total_reward = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.next_waypoint = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Select action according to your policy
        loc, hed = self.get_my_location()
        action = self.get_next_waypoint_given_location( loc, hed)
        action_okay = self.check_if_action_is_ok(inputs)
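        # get_my_location, get_next_waypoint_given_location and check_if_action_is_ok
        # are assumed to be helper methods defined elsewhere in this submission (not shown).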
        if not action_okay:
            action = None
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward

        #TODO: Learn policy based on state, action, reward
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #11
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.q_matrix = {}
        self.alpha = 0.95
        self.gamma = 0.05
        self.explore = 0.05
        self.state = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        deadline = self.env.get_deadline(self)

        state = self.observe_state()
        self.state = state
        
        knowledge = self.knowledge_from_state(state)

        # TODO: Select action according to your policy
        if random() < self.explore:
            action = choice(OPTIONS)
        else:
            action = sorted(knowledge.items(), key=lambda x: x[1], reverse=True)[0][0]
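        # Note: OPTIONS is assumed to be a module-level list of the valid actions
        # (None, 'forward', 'left', 'right'), with random() and choice() imported from
        # the random module elsewhere in this file.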

        old_value = knowledge[action]

        # Execute action and get reward
        reward = self.env.act(self, action)
        new_state = self.observe_state()
        new_state_knowledge = self.knowledge_from_state(new_state)

        self.q_matrix[state][action] = (1-self.alpha) * old_value +\
                                       self.alpha * (reward + self.gamma * max(new_state_knowledge.values()))
        

        #print "LearningAgent.update(): deadline = {}, state = {}, action = {}, reward = {}".format(deadline, state, action, reward)  # [debug]

    def knowledge_from_state(self, state):
        """
        Gets what we know about the given state
        """
        if not state in self.q_matrix:
            self.q_matrix[state] = {option: 0 for option in OPTIONS}
        return self.q_matrix[state]

    def observe_state(self):
        """
        Observe the current state
        """
        inputs = self.env.sense(self)
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        return (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
Example #12
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        
        # TODO: Select action according to your policy
        action = None

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.prevReward = 0
        self.prevAction = None

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.prevReward = 0
        self.prevAction = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        state = None
        validActions = []
        
        if inputs['light'] == 'red':
            if inputs['oncoming'] != 'left':
                validActions = ['right', None]
        else:
            if inputs['oncoming'] == 'forward':
                validActions = [ 'forward','right']
            else:
                validActions = ['right','forward', 'left']

        # TODO: Select action according to your policy
        if validActions != []:
            action = random.choice(validActions)
        else:
            action = random.choice(Environment.valid_actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        if action != None:
            if reward > self.prevReward:
                self.state = (inputs,self.next_waypoint,deadline) 
                self.prevAction = action
                self.prevReward = reward
            else:
                self.state = (inputs,self.next_waypoint,deadline)
                self.prevAction = action
                self.prevReward = reward


        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #14
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        # sets self.env = env, state = None, next_waypoint = None, and a default color
        super(LearningAgent, self).__init__(env)
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.qtable = defaultdict(dict)
        self.epsilon = .4  # Randomness factor
        self.alpha = .4  # Learning rate.

    def reset(self, destination=None):
        # Prepare for a new trip; reset any variables here, if required
        self.planner.route_to(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        self.state = self.define_state(inputs, self.next_waypoint)

        # Select action according policy
        action = self.select_action()

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.update_qtable(action, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs,
                                                                                                    action,
                                                                                                    reward)  # [debug]

    def select_action(self):
        if random.random() < self.epsilon or self.state not in self.qtable:
            action = random.choice(self.env.valid_actions)
        else:
            action = max(self.qtable[self.state], key=self.qtable[self.state].get)

        if self.state not in self.qtable:
            for a in self.env.valid_actions:  # init action:reward dict.
                self.qtable[self.state][a] = 0
        return action

    def update_qtable(self, action, reward):
        old_q = self.qtable[self.state].get(action, 0)
        self.qtable[self.state][action] = ((1 - self.alpha) * old_q) + (self.alpha * reward)
        # Decay alpha and epsilon until it reaches 0.1 and 0.0, respectively.
        self.alpha = max(0.001, self.alpha - 0.002)
        self.epsilon = max(0.0, self.epsilon - 0.002)

    @staticmethod
    def define_state(inputs, next_waypoint):
        return tuple(inputs.values()) + (next_waypoint, )
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint

        valid_waypoints = ['forward','right','left']
        valid_lights = ['red','green']
        valid_actions = ['forward','right','left', None]

        self.Q_hat={}
        states = [(next_waypoint, light, oncoming, right, left)
                  for next_waypoint in valid_waypoints
                  for light in valid_lights
                  for oncoming in valid_actions
                  for right in valid_actions
                  for left in valid_actions]
        states.append('Destination')
        for state in states:
            self.Q_hat[state] = {}
            for action in valid_actions:
                self.Q_hat[state][action] = random.random()

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        if self.next_waypoint == None:
            self.state = 'Destination'
        else:
            self.state = (self.next_waypoint, inputs['light'],inputs['oncoming'],inputs['right'],inputs['left'])

        # TODO: Select action according to your policy

        valid_actions = ['forward','right','left', None]
        epsilon = 0.1 #0.01

        best_action = max(self.Q_hat[self.state],key=self.Q_hat[self.state].get)
        random_action = choice(valid_actions)
        action = choice([best_action, random_action],p=[1-epsilon,epsilon])
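        # Note: the p= keyword implies this 'choice' is numpy.random.choice imported
        # under that name; the standard library's random.choice does not accept
        # probability weights.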

        # Execute action and get reward

        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward

        new_next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        new_inputs = self.env.sense(self)
        new_state = (new_next_waypoint, new_inputs['light'],new_inputs['oncoming'],new_inputs['right'],new_inputs['left'])
        alpha = 0.5 #opt 0.7
        gamma = 0.5 #opt 0.1
        max_Qhat_ahat = max(self.Q_hat[new_state].values())
        self.Q_hat[self.state][action] = (1-alpha)*self.Q_hat[self.state][action]+alpha*(reward+gamma*max_Qhat_ahat)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #16
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.actions = [None, 'forward', 'left', 'right']
        self.learning_rate = 0.3
        self.state = None
        self.q = {}
        self.trips = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # Prepare for a new trip; reset any variables here, if required
        self.trips += 1
        if self.trips >= 100:
            self.learning_rate = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # Update state
        state = (inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'],
                 self.next_waypoint)
        self.state = state

        # Lazy initialization of Q-values
        for action in self.actions:
            if (state, action) not in self.q:
                self.q[(state, action)] = 1
        
        # Select action according to the policy

        probabilities = [self.q[(state, None)], self.q[(state, 'forward')], self.q[(state, 'left')], self.q[(state, 'right')]]
        # Use the softmax function so that the values actually behave like probabilities
        probabilities = np.exp(probabilities) / np.sum(np.exp(probabilities), axis=0)
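        # The softmax maps the Q-values to a probability distribution over actions, so
        # higher-valued actions are sampled more often while exploration still occurs.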

        adventurousness = max((100 - self.trips) / 100.0, 0)  # decays from 1 to 0 over the first 100 trips (float division avoids Python 2 truncation)
        if random.random() < adventurousness:
            action = np.random.choice(self.actions, p=probabilities)
        else:
            action = self.actions[np.argmax(probabilities)]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Learn policy based on state, action, reward
        self.q[(state, action)] = self.learning_rate * reward + (1 - self.learning_rate) * self.q[(state, action)]

        #print("LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward))  # [debug]

    def get_state(self):
        return self.state
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Qtable = {}
        self.alpha = 0.8
        self.gamma = 0.6

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        old_state = self.state
        
        # TODO: Select action according to your policy
        if (old_state, None) not in self.Qtable.keys():
            self.Qtable[(old_state, None)] = 0
        if (old_state, 'left') not in self.Qtable.keys():
            self.Qtable[(old_state, 'left')] = 0
        if (old_state, 'forward') not in self.Qtable.keys():
            self.Qtable[(old_state, 'forward')] = 0
        if (old_state, 'right') not in self.Qtable.keys():
            self.Qtable[(old_state, 'right')] = 0
        
        max_action = max(self.Qtable[(old_state, None)], self.Qtable[(old_state, 'left')],
                         self.Qtable[(old_state, 'forward')], self.Qtable[(old_state, 'right')])
        action = random.choice([i for i in ['left', 'forward', 'right', None]
                                if self.Qtable[(old_state, i)] == max_action])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        #self.next_waypoint = self.planner.next_waypoint()
        inputs = self.env.sense(self)
        self.state = (self.next_waypoint, inputs['light'], inputs['oncoming'], inputs['left'], inputs['right'])
        new_state = self.state

        if (new_state, None) not in self.Qtable.keys():
            self.Qtable[(new_state, None)] = 0
        if (new_state, 'left') not in self.Qtable.keys():
            self.Qtable[(new_state, 'left')] = 0
        if (new_state, 'forward') not in self.Qtable.keys():
            self.Qtable[(new_state, 'forward')] = 0
        if (new_state, 'right') not in self.Qtable.keys():
            self.Qtable[(new_state, 'right')] = 0

        self.Qtable[(old_state, action)] = (1 - self.alpha) * self.Qtable[(old_state, action)] + \
            self.alpha * (reward + self.gamma * max(self.Qtable[(new_state, i)] for i in [None, 'left', 'forward', 'right']))
class BasicAgent(Agent):
    """A basic agent upon which to build learning agents."""

    def __init__(self, env):
        super(BasicAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        self.qvals = {} # mapping (state, action) to q-values
        self.time = 0 # number of moves performed
        self.possible_actions = (None, 'forward', 'left', 'right')
        self.n_dest_reached = 0 # number of destinations reached
        self.last_dest_fail = 0 # last time agent failed to reach destination
        self.sum_time_left = 0 # sum of time left upon reaching destination over all trials
        self.n_penalties = 0 # number of penalties incurred
        self.last_penalty = 0 # last trial in which the agent incurred in a penalty

    def reset(self, destination=None):
        self.planner.route_to(destination)

    def best_action(self, state):
        """
        Return a random action (other agents will have different policies)
        """
        return random.choice(self.possible_actions)

    def update_qvals(self, state, action, reward):
        """
        Keeps track of visited (state, action) pairs. 
        (in other agents will use reward to update 
        the mapping from (state, action) pairs to q-values)
        """
        self.qvals[(state, action)] = 0
    
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        
        # update time
        self.time += 1

        # Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'],
                      self.next_waypoint)

        # Pick an action
        action = self.best_action(self.state)
        
        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward < 0:
            self.n_penalties += 1
        
        # Update the q-value of the (state, action) pair
        self.update_qvals(self.state, action, reward)
        
        # print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #19
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.total_reward = 0
        self.next_waypoint = None
        self.QLearner = QAlgorithm(epsilon=0.03, alpha = 0.1, gamma = 0.9)
        self.next_state = None
        self.time_step = 1.0
    
    def get_decay_rate(self, t):  # decay rate for epsilon
        return 1.0 / float(t)
       
    def reset(self, destination=None):
        self.planner.route_to(destination)
        self.time_step = 1.0
        # TODO: Prepare for a new trip; reset any variables here, if required
        '''
        self.previous_action = None
        self.next_state = None
        self.total_reward = 0
        self.next_waypoint = None
        self.QLearner = QAlgorithm(epsilon=0.03, alpha = 0.1, gamma = 0.9)
        self.next_state = None
        '''
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        #env_states = self.env.agent_states[self]
        self.time_step += 1
        epsilon = self.get_decay_rate(self.time_step)
        self.QLearner.set_eps(epsilon)
        
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
        #self.state = (inputs['light'], self.next_waypoint)

        action = self.QLearner.q_move(self.state)        
        
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward
        
        new_inputs = self.env.sense(self)
        #env_states = self.env.agent_states[self]
        new_deadline = self.env.get_deadline(self)
        self.next_state =  (new_inputs['light'], new_inputs['oncoming'], new_inputs['left'], self.next_waypoint)
        #self.next_state =  (new_inputs['light'], self.next_waypoint)
        
        # TODO: Learn policy based on state, action, reward

        self.QLearner.q_future_reward(self.state, action, self.next_state, reward)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #20
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, alpha, q_initial):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.policy = {}
        self.trip_log = []
        self.trip = None
        self.alpha = alpha
        self.q_initial = q_initial

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        if self.trip:
            self.trip_log.append(self.trip)
        self.trip = {}
        self.trip['Deadline'] = self.env.get_deadline(self)
        self.trip['Reward'] = 0
        self.trip['Penalty'] = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        inputs.pop('right')
        self.state = inputs
        self.state['waypoint'] = self.next_waypoint

        # TODO: Select action according to your policy
        state_hash = hashabledict(frozenset(self.state.iteritems()))
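        # 'hashabledict' is assumed to be a dict subclass (defined elsewhere) that
        # implements __hash__, so the sensed state can be used as a dictionary key.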
        q = self.policy.get(state_hash, {
            None: self.q_initial,
            'forward': self.q_initial,
            'left': self.q_initial,
            'right': self.q_initial,
        })
        action = max(q.iteritems(), key=operator.itemgetter(1))[0]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Update trip stats
        self.trip['Reward'] += reward
        self.trip['Remaining'] = self.env.get_deadline(self)
        self.trip['Success'] = self.planner.next_waypoint() == None
        if reward < 0: self.trip['Penalty'] += reward

        # TODO: Learn policy based on state, action, reward
        q[action] = (1 - self.alpha) * q[action] + self.alpha * reward
        self.policy[state_hash] = q
Example #21
class BasicLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(BasicLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.last_reward = 0
        self.last_action = None
        self.last_state = None
        self.state = None
        self.total_reward = 0
        self.deadline = self.env.get_deadline(self)
        self.actions = ['forward', 'left', 'right', None]
        self.reached_destination = 0
        self.penalties = 0
        self.movements = 0

    def tuple_state(self, state):
        State = namedtuple("State", ["light", "next_waypoint"])
        return State(light = state['light'], next_waypoint = self.planner.next_waypoint()) 
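    # A namedtuple keeps the state hashable (usable as a Q-table key) while its fields
    # remain readable in debug output; namedtuple is assumed to be imported from
    # collections at the top of the file.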

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.last_reward = 0
        self.last_action = None
        self.last_state = None
        self.state = None
        self.total_reward = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        
        # TODO: Update state
        self.state = self.tuple_state(inputs)

        # TODO: Select action according to your policy
        action = random.choice(self.actions)

        # Execute action and get reward
        reward = self.env.act(self, action)

        if reward >= 10: self.reached_destination += 1
        if reward < 0: self.penalties += 1
        self.movements += 1

        # TODO: Learn policy based on state, action, reward 
        self.last_action = action
        self.last_state = self.state
        self.last_reward = reward
        self.total_reward += reward
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        
        # TODO: Initialize any additional variables here
        self.acts = ['left', 'right', 'forward', 'stay']
        df_tmp = pd.DataFrame(0*np.ones((2, 4)), index=['red', 'green'], columns=self.acts)
        self.Q = df_tmp
        self.deadline = 0
        

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        self.deadline = deadline

        # TODO: Update state
        self.state = inputs['light'] # in this case the state is simply defined by the color of the traffic light at the intersection
        
        # TODO: Select action according to your policy
        # Q-Learner
        act_opt = self.Q.ix[self.state].argmax()
        if act_opt=='stay': act_opt=None
#            print "Current state = %s, qlearner action = %s" % (self.state, act_opt)
        action = act_opt

        # Execute action and get reward
        reward = self.env.act(self, action)


        # TODO: Learn policy based on state, action, reward

        # Maximize Q conditional on the state we are in = S
        q_S = self.Q.ix[self.state] # Q conditional on state S
        q_S_mx = q_S[q_S==q_S.max()] # in case there are ties and the action that maximizes Q(S, a) is not unique
        ix_mx = np.random.randint(0,len(q_S_mx)) # pick one of the equally good actions at random
        q_S_mx = q_S_mx.iloc[ix_mx]

        # Update Q
        act_tmp = action if action !=None else 'stay'
        global alpha, alpha0, gamma
        alpha *= alpha0
        self.Q.ix[self.state, act_tmp] = \
            (1-alpha)*self.Q.ix[self.state, act_tmp] + \
            alpha*(reward + gamma*q_S_mx) # note: we update the action actually taken, but the future-value term uses the action we would have chosen had we behaved greedily
Example #23
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.QLearning = QLearning(epsilon=0.3, alpha=0.95, gamma=0.9)
        # self.QLearning = pickle.load(open('./QLearning.pkl'))

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def is_action_okay(self, inputs):
        action_okay = True
        if self.next_waypoint == 'right':
            if inputs['light'] == 'red' and inputs['left'] == 'forward':
                action_okay = False
        elif self.next_waypoint == 'forward':
            if inputs['light'] == 'red':
                action_okay = False
        elif self.next_waypoint == 'left':
            if inputs['light'] == 'red' or (inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right'):
                action_okay = False

        return action_okay

    def update(self, t):
        # Gather states
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        action_okay = self.is_action_okay(inputs)
        self.state = (action_okay, self.next_waypoint, deadline < 3)
        # self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = self.QLearning.policy(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Gather next states
        next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        next_inputs = self.env.sense(self)
        next_deadline = self.env.get_deadline(self)
        next_action_okay = self.is_action_okay(next_inputs)
        next_state = (next_action_okay, next_waypoint, next_deadline < 3)
        # next_state = (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], next_waypoint)

        # TODO: Learn policy based on state, action, reward
        self.QLearning.update_QTable(self.state, action, reward, next_state)
Example #24
class QLearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    

    def __init__(self, env):
        super(QLearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}
        self.learning_rate = 0.9
        self.states = None
        self.possible_actions = [None, 'forward', 'left', 'right']
        

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)



        # TODO: Update state
        # state = (self.next_waypoint,inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'],deadline)
        state = (self.next_waypoint,inputs['light'], inputs['oncoming'], inputs['right'], inputs['left'])
        # state = (self.next_waypoint,inputs['light'])
        self.state = state

        for action in self.possible_actions:
            if(state, action) not in self.Q: 
                self.Q[(state, action)] = 20

        # TODO: Select action according to your policy
        Q_best_value = [self.Q[(state, None)], self.Q[(state, 'forward')], self.Q[(state, 'left')], self.Q[(state, 'right')]]
        # Softmax
        Q_best_value = np.exp(Q_best_value) / np.sum(np.exp(Q_best_value), axis=0)
        
        #action = np.random.choice(self.possible_actions, p=probabilities)
        action = self.possible_actions[np.argmax(Q_best_value)]

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward == -10.0 :
            print "Invalid move"
        if reward == -0.5 :
            print "valid, but is it correct?"
        

        # TODO: Learn policy based on state, action, reward
        # self.Q[(state,action)] = reward + self.learning_rate * self.Q[(state,action)]
        self.Q[(state,action)] = reward * self.learning_rate + ( 1 - self.learning_rate ) * self.Q[(state,action)]
Example #25
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, learning_rate=0.78, discount_factor=0.36):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.actions = Environment.valid_actions
        self.oldAction = None


    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.oldAction=None
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        #------------------------------------------------
        #action = np.random.choice(self.actions)
        #------------------------------------------------
        


        # TODO: Select action according to your policy
        current_state = tuple(inputs.values() + [self.next_waypoint])
        self.state = (inputs, self.next_waypoint)
        
        if current_state not in self.Q:
            self.Q[current_state] = [3, 3, 3, 3]

        Q_max = self.Q[current_state].index(np.max(self.Q[current_state]))
        # Execute action and get reward
        action = self.actions[Q_max]
        reward = self.env.act(self, action)



        # TODO: Learn policy based on state, action, reward
        
        if t != 0:
            oldvalue = self.Q[self.oldAction[0]][self.oldAction[1]]
            newvalue = ((1 - self.learning_rate) * oldvalue) + \
                (self.learning_rate * (self.oldAction[2] + self.discount_factor * self.Q[current_state][Q_max]))
            self.Q[self.oldAction[0]][self.oldAction[1]] = newvalue
        
        self.oldAction = (current_state, self.actions.index(action), reward)
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #26
class RandomAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(RandomAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.availableAction = ['forward', 'left', 'right',None]   
        self.goal=0
        self.steps=0
        self.features=[]
        self.deadline=[]
        self.total_reward=[0]

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        #print"RESET, Final state:\n", self.state
        try:
            if self.deadline[len(self.features)-1] > 0: # time still remaining, i.e. the destination was reached before the deadline
                self.goal+=1 #FIXME - order
                print "PASS! {} steps to goal,Goal reached {} times out of {}!".format(
                                                        self.deadline[len(self.features)-1],self.goal,len(self.features))
            else:
                print "FAIL! {} steps to goal,Goal reached {} times out of {}!".format(
                                                        self.deadline[len(self.features)-1],self.goal,len(self.features))
                pass
        except:
            print "Trial 0 - Goal reached {} times out of {}!".format(self.goal,len(self.features))
            pass
        print "----------------------------------------------------------"
        self.features.append({})
        self.deadline.append(None)
        self.total_reward.append(0)
        self.steps=0

    def update(self, t):
        # Gather inputs
        self.steps+=1
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        #self.deadline[len(self.features)] = self.env.get_deadline(self)
        self.state=inputs
        self.features[len(self.features)-1][self.steps]=inputs
        self.deadline[len(self.deadline)-1] = self.env.get_deadline(self)

        # TODO: Select action according to your policy
        action = self.availableAction[random.randint(0,3)]    
        
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.lastReward=reward
        # TODO: Learn policy based on state, action, reward
        self.total_reward[len(self.total_reward)-1] =self.total_reward[len(self.total_reward)-1]+reward
Example #27
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here        
        self.actions = ['forward', 'left', 'right',None]        
        self.q_table = Counter()        
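        # Counter (presumably collections.Counter, imported at the top of the file)
        # returns 0 for missing keys, so unseen (state, action) pairs implicitly start
        # at a Q-value of 0 without explicit initialization.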
        self.alpha = 0.5
        self.gamma = 0.2


    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def get_best_action(self, state):
        best_profit = -999
        best_action = None
        for a in self.actions:
            profit = self.q_table[(state, a)]            
            if profit > best_profit:
                best_profit = profit
                best_action = a
        return best_action, best_profit


    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state        
        self.state = (inputs['light'],inputs['oncoming'], inputs['left'], self.next_waypoint)       

        if randint(0,10) < 3:
            action = self.actions[randint(0, 3)]
        else:        
            action, _ = self.get_best_action(self.state)
        
        reward = self.env.act(self, action)

        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'],next_inputs['oncoming'], next_inputs['left'], next_next_waypoint)      
       
        _, best_profit = self.get_best_action(next_state)
        self.q_table[(self.state, action)] = (1-self.alpha)*self.q_table[(self.state, action)] + (self.alpha * (reward + self.gamma * best_profit))

       
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #28
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.debugOn = False
        self.reporter = Reporter(driver_type, self.env.valid_actions)
        if driver_type=='random':
            self.driver = RandomDriver(self.env.valid_actions)
        elif driver_type=='greedy':
            self.driver = GreedyDriver(self.env.valid_actions)
        elif driver_type=='greedy2':
            self.driver = Greedy2Driver(self.env.valid_actions)
        elif driver_type=='smart':
            self.driver = SmartDriver(self.env.valid_actions, self.reporter)
        elif driver_type=='smart2':
            self.driver = Smart2Driver(self.env.valid_actions, self.reporter)
        elif driver_type=='smart3':
            self.driver = Smart3Driver(self.env.valid_actions, self.reporter)
        else:
            self.driver = Smart2Driver(self.env.valid_actions, self.reporter)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.reporter.reset(destination)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = State.make(self.next_waypoint, inputs)
        
        # TODO: Select action according to your policy
        action = self.driver.decide(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        newWaypoint = self.planner.next_waypoint()
        newState = State.make(newWaypoint, self.env.sense(self))
        self.driver.learn(t, deadline, self.state, action, reward, newState)

        location = self.env.agent_states[self]['location']
        if self.debugOn:
            print "LearningAgent.update(): deadline={} inputs={} action={} reward={} location={}".format(deadline, inputs, action, reward, location)  # [debug]
        self.reporter.update(t, deadline, self.state, action, reward, location, self.driver)
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, trials):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint

        # TODO: Initialize any additional variables here
        self.trials = trials
        self.success = 0  # How many times agent able to reach the target for given trials?

        self.penalties = 0
        self.total_rewards = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)

        # TODO: Prepare for a new trip; reset any variables here, if required
        self.penalties = 0
        self.total_rewards = 0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # self.state = inputs
        # self.state = (inputs['light'], inputs['oncoming'], inputs['left'])
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # TODO: Select action according to your policy
        action = random.choice(self.env.valid_actions)  # Initial random movement / random action

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".\
            format(deadline, inputs, action, reward)  # [debug]

        # Statistics
        self.total_rewards += reward

        if reward < 0:
            self.penalties += 1

        if reward > 5:
            self.success += 1
            print "\nTrial info: Number of Penalties =  {}, Net rewards: {}. \nTotal success/trials: {}/{}. \n".\
                format(self.penalties, self.total_rewards, self.success, self.trials)
Example #30
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.total_reward = 0

        #Q matrix
        self.Q = {}

        # initial Q values
        self.initial_q = 30

        # Initialize state
        self.states = None

        # Dictionary of possible actions to index states from Q matrix
        self.actions = {None : 0, 'forward' : 1, 'left' : 2, 'right' : 3}

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Check if this state is included in the Q matrix. If not, create a new row in the matrix.
        if self.state not in self.Q:
            self.Q[self.state] = [self.initial_q for i  in range(len(self.env.valid_actions))]

        # TODO: Select action according to your policy
        learning_rate = 0.8
        action = self.env.valid_actions[np.argmax(self.Q[self.state])]
        discount = 0
        # Execute action and get reward
        reward = self.env.act(self, action)
        self.total_reward += reward
        # TODO: Learn policy based on state, action, reward

        # get the "future" state (note: this reuses the pre-action inputs and waypoint, so it is
        # identical to the current state; re-sensing the environment after acting would give the
        # true next state)
        future_state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

        # Update the Q matrix (add consideration of future reward)
        #self.Q[self.state][self.actions[action]] = self.Q[self.state][self.actions[action]] * (1 - learning_rate) + reward * learning_rate
        self.Q[self.state][self.actions[action]] = self.Q[self.state][self.actions[action]] + learning_rate * (reward + discount * self.Q[future_state][self.actions[action]] - self.Q[self.state][self.actions[action]])
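
To make the update on the last line concrete, here is a small worked sketch of the same temporal-difference rule, Q <- Q + alpha * (r + gamma * max Q(s') - Q), using this example's optimistic initial value of 30, learning_rate = 0.8 and discount = 0; the reward value is illustrative only.

def td_update(q_sa, reward, max_q_next, learning_rate=0.8, discount=0.0):
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    return q_sa + learning_rate * (reward + discount * max_q_next - q_sa)

q = 30.0                                       # optimistic initial Q value, as in initial_q above
q = td_update(q, reward=2.0, max_q_next=30.0)  # one step with an illustrative reward of 2
print q                                        # 30 + 0.8 * (2 + 0 - 30) = 7.6
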
Example #31
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the evironment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.t = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        if testing:
            self.epsilon = 0
            self.alpha = 0

        if self.learning and not testing:
            # self.epsilon = self.epsilon - 0.05
            # self.epsilon = 1.0/(self.t)
            # self.t = self.t + 1.0
            # system("echo "+str(self.epsilon)+" >> eps")
            # self.epsilon = math.cos(self.t)
            #self.epsilon = math.exp(-0.1 * self.t)
            self.epsilon = abs(math.cos(0.01 * self.t))
            self.t = self.t + 1.0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE : you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        #if waypoint == 'right':
        #    state = (waypoint, inputs['left'], inputs['oncoming'])
        #if waypoint == 'forward':
        #    state = (waypoint, inputs['light'], inputs['left'])
        #if waypoint == 'left':
        #    state = (waypoint, inputs['light'], inputs['oncoming'])
        state = (waypoint, inputs['light'], inputs['left'], inputs['oncoming'])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = ['']
        max_val = -0xFFFFFF  # sentinel value, lower than any Q-value we expect to see
        print "\nChoose Max Q"

        for action in self.Q[state].keys():
            print "Max", max_val,
            print "Action", action,

            if self.Q[state][action] > max_val:
                print "Q Action Val", self.Q[state][action],
                max_val = self.Q[state][action]
                print "Max", max_val,
                maxQ[0] = action
                print "maxQ", maxQ,
            print
        print "maxQ", maxQ
        print

        for action in self.Q[state].keys():
            print "Action", action,
            if self.Q[state][action] == max_val and action != maxQ[0]:
                print "Q Action Val", self.Q[state][action],
                maxQ.append(action)
                print "maxQ", maxQ,
            print
        print
        print "maxQ", maxQ
        #s = raw_input("MaxQEh")
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if self.learning:
            if state not in self.Q.keys():
                self.Q[state] = {}
                for action in self.valid_actions:
                    self.Q[state][action] = 0

                print "State", state
                print "QDict", self.Q

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".

        if not self.learning:
            print "Not Learning",
            action = self.valid_actions[random.randint(
                0,
                len(self.valid_actions) - 1)]

        if self.learning:
            print "Learning",
            print "Epsilon", self.epsilon,
            action_probability = random.uniform(0, 1)
            print "Action Probability", action_probability,
            if action_probability <= self.epsilon:
                action = self.valid_actions[random.randint(
                    0,
                    len(self.valid_actions) - 1)]
                print "random action", action,

            else:
                print "Finding maxQ", self.Q[state],
                actions = self.get_maxQ(state)
                print "Actions", actions,
                if len(actions) == 1:
                    action = actions[0]
                else:
                    action = actions[random.randint(0, len(actions) - 1)]
                print "Action", action,

        print
        print "Given Env", self.env.sense(self)
        print "Next waypoint", self.next_waypoint
        #s = raw_input()
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (self.Q[state][action] *
                                     (1.0 - self.alpha)) + (reward *
                                                            self.alpha)

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
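
Stripped of the debug printing, the maxQ lookup and the tie-breaking choice in this example reduce to a few lines. The sketch below is one compact, hypothetical way to express the same logic over a Q-table shaped like self.Q[state][action]; the sample row is made up.

import random

def choose_greedy_action(q_row):
    """q_row: dict mapping action -> Q value for a single state."""
    best_value = max(q_row.values())
    # collect every action tied for the best value, then break the tie at random
    best_actions = [a for a, v in q_row.items() if v == best_value]
    return random.choice(best_actions)

row = {None: 0.0, 'forward': 1.5, 'left': 1.5, 'right': -0.5}
print choose_greedy_action(row)  # 'forward' or 'left', chosen uniformly
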
Example #32
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'yellow'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.reward = 0
        self.next_waypoint = None

        # Declaring the variable to calculate the success percentage rate

        self.S = 0

        #Declaring the trial counter variable

        self.var_trial = 0

        #Declaring the variable to track success rate

        self.var_trial_S = {}

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.next_waypoint = None

        # Incrementing the success percentage counter before resetting the trial

        try:
            if self.var_trial_S[self.var_trial] == 1:
                self.S = self.S + 1
        except KeyError:
            pass

        # resetting the trial

        self.var_trial = self.var_trial + 1


    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

               
        # TODO: Select action according to your policy
        action = None
        action = random.choice(Environment.valid_actions)
        
        '''
        # TODO: Update state
        action_okay = True
        if self.next_waypoint == 'right':
            if inputs['light'] == 'red' and inputs['left'] == 'forward':
                action_okay = False

        elif self.next_waypoint == 'straight':
            if inputs['light'] == 'red':
                action_okay = False

        elif self.next_waypoint == 'left':
            if inputs['light'] == 'red' or inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right':
                action_okay = False

        if not action_okay:
            action = None
        '''

        self.state = inputs
        self.state['next_waypoint'] = self.next_waypoint
        self.state = tuple(sorted(self.state.items()))

        # Execute action and get reward
        reward = self.env.act(self, action)
        self.reward = self.reward + reward
        # TODO: Learn policy based on state, action, reward

        # at the end of each trial we check the reward; if reward >= 10 the trial is successful, otherwise it has failed

        if (reward >= 10):
            self.var_trial_S[self.var_trial] = 1
        else:
            self.var_trial_S[self.var_trial] = 0

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
Example #33
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the evironment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        # Set any additional class parameters as needed
        self.trial = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        if testing:
            self.epsilon, self.alpha = 0, 0
        else:
            #self.epsilon = self.epsilon - 0.05 #for question 6
            self.trial += 1
            self.epsilon = math.exp(-0.01 * self.trial)
            if self.alpha >= 0.1:
                self.alpha = math.exp(-0.07 * self.trial)
            else:
                pass  # alpha has already decayed below 0.1, so leave it unchanged

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # Set 'state' as a tuple of relevant data for the agent

        # Originally set state as a tuple, but suspect encoding Q-table key as a tuple of tuples possibly resulted in key name collisions.
        # state = (("lights", inputs['light']),
        #("oncoming", inputs['oncoming']),
        #("left", inputs['left']),
        #("waypoint", self.next_waypoint))

        state = str(waypoint) + "_" + inputs['light'] + "_" + str(
            inputs['left']) + "_" + str(inputs['oncoming'])

        if self.learning:
            # ensure this state has a Q-table entry with every action initialised to 0.0
            self.Q[state] = self.Q.get(state, {
                None: 0.0,
                'forward': 0.0,
                'left': 0.0,
                'right': 0.0
            })

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        # Calculate the maximum Q-value of all actions for a given state
        maxQ = max(self.Q[state].values())
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if self.learning and state not in self.Q:
            self.Q[state] = {
                None: 0.0,
                'left': 0.0,
                'right': 0.0,
                'forward': 0.0
            }
        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            coin_toss = random.random()
            if coin_toss <= self.epsilon:
                action = random.choice(self.valid_actions)
            else:
                actions_list = []
                highest_Qvalue = self.get_maxQ(state)
                for act in self.Q[state]:
                    if highest_Qvalue == self.Q[state][act]:
                        actions_list.append(act)
                action = random.choice(actions_list)
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')

        if self.learning:
            self.Q[state][action] = self.alpha * reward + (
                1 - self.alpha) * self.Q[state][action]
        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
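
The decay schedules used in reset() above (epsilon = exp(-0.01 * trial), alpha = exp(-0.07 * trial) held near a floor of 0.1) are easy to inspect on their own. A small sketch, assuming the same constants; applying the floor with max() is a slight simplification of the original if/else:

import math

def decayed_params(trial, alpha_floor=0.1):
    # exponential decay of the exploration rate and the learning rate
    epsilon = math.exp(-0.01 * trial)
    alpha = max(alpha_floor, math.exp(-0.07 * trial))
    return epsilon, alpha

for t in (1, 10, 50, 100, 200):
    eps, alpha = decayed_params(t)
    print "trial {:>3}: epsilon = {:.3f}, alpha = {:.3f}".format(t, eps, alpha)
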
Example #34
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=True, epsilon=0.5, alpha=0.4):
        super(LearningAgent, self).__init__(env)     # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()
        # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon   # Random exploration factor
        self.alpha = alpha       # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.decay_rate = 0.001
        self.a = 0.01
        self.n_trials = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        self.n_trials += 1

        if self.learning is not True:
            self.epsilon = 0
            self.alpha = 0
        elif self.learning is True:
            # self.epsilon = self.epsilon - self.decay_rate
            self.epsilon = np.exp(-(self.a*self.n_trials))

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)           # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        input1, input2, input3, input4 = inputs.items()
        state = (waypoint, input1, input2, input3, input4)

        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        # from https://stackoverflow.com/questions/268272/getting-key-with-maximum-value-in-dictionary
        # note: despite the name, this returns the best *action* (the key with the highest Q-value)
        maxQ = max(self.Q[state], key=self.Q[state].get)

        action_list = []

        for action in self.Q[state]:
            if self.Q[state][action] == self.Q[state][maxQ]:
                action_list.append(action)
            
        if len(action_list) > 0:
            maxQ = random.choice(action_list)

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if self.learning:
            if state not in self.Q.keys():
                self.Q[state] = {action: 0.0 for action in self.valid_actions}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        if not self.learning:
            action_number = random.randint(0, 3)
            action = self.valid_actions[action_number]
        elif self.learning:
            rand_num = random.random()
            if rand_num < self.epsilon:
                action_number = random.randint(0, 3)
                action = self.valid_actions[action_number]
            else:
                action = self.get_maxQ(state=state)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        #       Set discount factor to zero *from Kevin Robertson

        # Transitions <s,a,r,s'>

        # sought some help from the forums and lecture videos.
        # update Q-value = current Q-value * (1 - alpha) + alpha * (current reward + discount factor * expected future reward)


        self.Q[state][action] = self.Q[state][action] * (1 - self.alpha) + self.alpha * reward

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()           # Get current state
        self.createQ(state)                  # Create 'state' in Q-table
        action = self.choose_action(state)   # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)    # Q-learn

        return
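
One detail worth noting in Example #34 is that build_state() unpacks inputs.items() directly, so the layout of the state tuple depends on the dictionary's key ordering. A minimal, order-independent alternative is sketched below, assuming the usual smartcab input keys ('light', 'oncoming', 'left', 'right'); the sample inputs dict is made up.

def build_state_tuple(waypoint, inputs):
    # name each field explicitly so the tuple layout never depends on dict ordering
    return (waypoint,
            inputs['light'],
            inputs['oncoming'],
            inputs['left'],
            inputs['right'])

example_inputs = {'light': 'green', 'oncoming': None, 'left': None, 'right': 'forward'}
print build_state_tuple('forward', example_inputs)
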
Example #35
0
class LearningAgent_v2(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent_v2, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.trip_history = []
        self.debug = True
        self.gamma = 0.2  #upper bound of discount
        #self.alpha = 0.5 #upper bound of learning rate
        self.epsilon = 0.1  #lower bound of proportion of random steps
        self.reg = 0.001  # regularization param for regression
        self.lr = 0.1  # learning rate for regression
        self.clock_update = 0  # store number of updates
        self.init_params_scale = 1e-4  # scale of initial params setting

        self.max_memory = 400  # number of rows for state_action_experience
        self.batch_size = 20  # size of batch
        self.batch_step = 20  # extract a batch for each batch_step steps
        self.param_step = self.max_memory  # how many step should update w

        self.state_feature = [
            'right_no', 'forward_no', 'left_no', 'next_right', 'next_forward',
            'next_left'
        ]
        self.action_feature = ['right', 'forward', 'left']

        self.state = None

        self.num_action_space = np.concatenate(
            (np.diag(np.ones(3)), np.zeros(3)[np.newaxis, :]))

        self.state_action_feature = self.state_feature + self.action_feature + [
            x + "_action_" + y for x in self.state_feature
            for y in self.action_feature
        ]
        #self.state_action_df = pd.DataFrame(columns = (self.state_action_feature + ['Q_score']) )
        self.state_action_experience = np.zeros(
            (1, len(self.state_action_feature)))
        self.Q_score_experience = np.zeros(1)

        self.ex_state = np.zeros(len(self.state_feature))
        self.ex_state_action = np.zeros(len(self.state_action_feature))
        self.ex_reward = 0

        self.params = {
            'b':
            np.random.randn(1),
            'w':
            self.init_params_scale *
            np.random.randn(len(self.state_action_feature), 1)
        }

        self.params_update = self.params

        self.reward_history = np.zeros(1)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.ex_reward = 0
        self.ex_state_action = np.zeros(len(self.state_action_feature))
        self.ex_state = np.zeros(len(self.state_feature))

        if (len(self.trip_history) < 150):
            print 'Current success rate is {}'.format(
                sum(self.trip_history) / (len(self.trip_history) + 0.000001))
        else:
            print 'Success rate for recent 100 trials is {}'.format(
                sum(self.trip_history[-100:]) /
                (len(self.trip_history[-100:]) + 0.000001))
        print 'Average reward for recent moves is {}'.format(
            np.mean(self.reward_history))
        if (self.reward_history.shape[0] > 1000):
            self.reward_history = np.delete(self.reward_history, range(100))

    def numeric_state(self, inputs=None, deadline=0, next_waypoint=None):
        #print 'inputs is {}, deadline is {}, next_waypoint is {}'.format(str(inputs), str(deadline), str(next_waypoint))
        col_name = self.state_feature
        state = np.zeros(len(col_name))
        state += np.array(
            map(lambda x: x == 'next_' + str(next_waypoint), col_name))

        if inputs['light'] == 'red' and inputs['left'] == 'forward':
            #state += np.array( map(lambda x: x=='right_no', col_name) )
            state[0] = 1
        if inputs['light'] == 'red':
            state[1] = 1
        if inputs['light'] == 'red' or (inputs['oncoming'] == 'forward'
                                        or inputs['oncoming'] == 'right'):
            state[2] = 1
        #state[len(col_name)-1] = deadline

        if False:
            print 'inputs is {}, deadline is {}, next_waypoint is {}\n'.format(
                str(inputs), str(deadline), str(next_waypoint))
            print zip(col_name, state)
            raw_input("Press Enter to continue...")

        return state

    def numeric_action(self, action=None):
        col_name = self.action_feature
        return np.array(map(lambda x: x == str(action), col_name))

    def numeric_state_action(self, num_state=None, num_action=None):
        return np.concatenate(
            (num_state, num_action, np.outer(num_state, num_action).flatten()),
            axis=0)

    def max_Q_param(self, num_state):
        X = np.apply_along_axis(
            lambda x: self.numeric_state_action(num_state, x),
            axis=1,
            arr=self.num_action_space)
        score = X.dot(self.params['w']) + self.params['b']

        if False:
            print '\nX are\n {}\n, Params are\n {}\n'.format(
                str(X), str(self.params))
            raw_input("Press Enter to continue...")

        choose = np.argmax(score)
        opt_action = None
        if choose < 3:
            opt_action = self.action_feature[choose]
        num_state_action = X[choose]
        max_Q_hat = score[choose]

        if False:
            print '\nScores are\n {}\n, opt action are\n {}\n'.format(
                str(score), str(opt_action))
            raw_input("Press Enter to continue...")

        return opt_action, max_Q_hat, num_state_action

    def gradient(self, X, y, reg=0.01):
        if False:
            print '\nX are\n {}\n and y are\n {}\n'.format(str(X), str(y))
            raw_input("Press Enter to continue...")

        w, b = self.params_update['w'], self.params_update['b']
        scores = X.dot(w) + b
        y = y.flatten()[:, np.newaxis]
        loss = np.mean((y - scores)**2) + 0.5 * reg * np.sum(w**2)

        if False:
            print '\ny are\n {}\n and scores are\n {}\n and loss is\n {}\n'.format(
                str(y), str(scores), str(loss))
            raw_input("Press Enter to continue...")

        d_w = np.mean(
            (X * ((scores - y) * 2)), axis=0)[:, np.newaxis] + reg * w
        d_b = np.mean((scores - y) * 2)

        return d_w, d_b, loss

    def sample_X_y(self, size=10):
        idx = np.random.randint(self.state_action_experience.shape[0],
                                size=size)
        X = self.state_action_experience[idx, :]
        y = self.Q_score_experience[idx]

        return X, y

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        #may need to take deadline into account?

        # TODO: Update state
        self.state_action_experience = np.concatenate(
            (self.state_action_experience,
             self.ex_state_action[np.newaxis, :]))
        num_state = self.numeric_state(inputs=inputs,
                                       deadline=deadline,
                                       next_waypoint=self.next_waypoint)

        self.state = zip(self.state_feature, num_state)

        # TODO: Select action according to your policy

        action, max_Q_hat, num_state_action = self.max_Q_param(num_state)

        if (random.uniform(0, 1) < self.epsilon):
            action = random.choice(Environment.valid_actions[:])
            num_action = self.numeric_action(action)
            num_state_action = self.numeric_state_action(num_state=num_state,
                                                         num_action=num_action)
            if False:
                print "\n Use a random action, {}".format(str(action))
                #debug
                raw_input("Press Enter to continue...")

        true_Q_score = self.ex_reward + self.gamma * max_Q_hat
        self.Q_score_experience = np.append(self.Q_score_experience,
                                            true_Q_score)
        self.clock_update += 1

        if False:
            print '\nShape of State Action experience Matrix is {}\n'.format(
                self.state_action_experience.shape)
            print '\nShape of Q score experience is {}\n'.format(
                self.Q_score_experience.shape)
            raw_input("Press Enter to continue...")

        # TODO: Learn policy based on state, action, reward
        reward = self.env.act(self, action)

        self.ex_reward = reward
        self.ex_state_action = num_state_action
        self.reward_history = np.append(self.reward_history, reward)

        if reward > 9:
            self.trip_history.append(1)
            #need to write down something here
            self.state_action_experience = np.concatenate(
                (self.state_action_experience,
                 self.ex_state_action[np.newaxis, :]))
            self.Q_score_experience = np.append(self.Q_score_experience,
                                                reward)
            self.clock_update += 1
        elif deadline == 0:
            self.trip_history.append(0)
            self.state_action_experience = np.concatenate(
                (self.state_action_experience,
                 self.ex_state_action[np.newaxis, :]))
            self.Q_score_experience = np.append(self.Q_score_experience,
                                                reward)
            self.clock_update += 1

        if (self.clock_update > self.max_memory + 2):
            self.state_action_experience = np.delete(
                self.state_action_experience,
                range(self.state_action_experience.shape[0] - self.max_memory),
                0)
            self.Q_score_experience = np.delete(
                self.Q_score_experience,
                range(len(self.Q_score_experience) - self.max_memory))

            if False:
                print '\nShape of State Action experience Matrix is {}\n'.format(
                    self.state_action_experience.shape)
                print '\nShape of Q score experience is {}\n'.format(
                    self.Q_score_experience.shape)
                raw_input("Press Enter to continue...")

            if (self.clock_update % self.batch_step == 0):
                for i in xrange(2):
                    if False:
                        print '\nUpdated Parameters are {}\n'.format(
                            str(self.params_update))
                        raw_input("Press Enter to continue...")
                    data_X, data_y = self.sample_X_y(size=self.batch_size)
                    d_w, d_b, loss = self.gradient(data_X,
                                                   data_y,
                                                   reg=self.reg)
                    if False:
                        print '\nGradients are {} and {}\n'.format(
                            str(d_w), str(d_b))
                        raw_input("Press Enter to continue...")

                    if False:
                        print '\nloss is {}\n'.format(loss)
                        raw_input("Press Enter to continue...")

                    self.params_update[
                        'w'] = self.params_update['w'] - self.lr * d_w
                    self.params_update[
                        'b'] = self.params_update['b'] - self.lr * d_b

            if self.clock_update % self.param_step == 0:
                self.params = self.params_update
                if True:
                    print '\nBias for regression is {}\n'.format(
                        str(self.params['b']))
                    weight_df = pd.DataFrame(data=self.params['w'].T,
                                             columns=self.state_action_feature)
                    print '\nWeights for regression is\n{}\n'.format(
                        weight_df.T)
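
Example #35 replaces the tabular Q-table with a linear model, Q(s, a) ~ w . phi(s, a) + b, fitted by stochastic gradient descent toward targets of the form r + gamma * max Q(s', a'). The core of that idea fits in a short sketch; the feature vector, constants and function names below are illustrative, not the ones used above.

import numpy as np

GAMMA, LR = 0.2, 0.1
n_features = 6
w = 1e-4 * np.random.randn(n_features)   # weights of the linear Q approximation
b = 0.0

def q_hat(phi):
    # linear approximation of Q for one state-action feature vector
    return np.dot(w, phi) + b

def sgd_step(phi, reward, max_q_next):
    # move the prediction toward the bootstrapped target r + gamma * max_a' Q(s', a')
    global w, b
    target = reward + GAMMA * max_q_next
    error = q_hat(phi) - target
    w -= LR * error * phi
    b -= LR * error

phi = np.array([1., 0., 0., 0., 1., 0.])  # made-up state-action features
sgd_step(phi, reward=2.0, max_q_next=0.5)
print q_hat(phi)
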
Example #36
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self,
                 env,
                 discount_rate=0.1,
                 epsilon=0.15,
                 epsilon_decay=.99):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.actions = (None, 'forward', 'left', 'right')
        self.Q = {}  #hashable Q-Table
        self.default_Q = 1  #initially tried zero
        self.epsilon = epsilon  #controls frequency of random actions [note: not used when implementing epsilon decay]
        self.epsilon_decay = epsilon_decay  #decay rate (multiplier) for GLIE
        self.alpha = 1  # learning rate [note: not used when updating/decaying alpha]
        self.initial_alpha = 1  #starting point for alpha-updating model
        self.discount_rate = discount_rate  #controls extent to which we value future rewards

        # keep track of net reward, trial no., penalties
        self.net_reward = 0
        self.trial = 0

        self.success = 0
        self.net_penalty = 0
        self.penalty_tracker = []
        self.reward_tracker = []
        self.success_tracker = []
        self.trial_tracker = []

        #initialize Q-table with default values
        for light_state in ['green', 'red']:
            for oncoming_traffic_state in self.actions:
                for right_traffic_state in self.actions:
                    for left_traffic_state in self.actions:
                        for waypoint_state in self.actions[
                                1:]:  #ignore NONE - this is not a waypoint possibility
                            # record each state
                            state = (light_state, oncoming_traffic_state,
                                     right_traffic_state, left_traffic_state,
                                     waypoint_state)
                            self.Q[state] = {}
                            for action in self.actions:
                                self.Q[state][action] = self.default_Q

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.trial += 1
        self.net_reward = 0
        self.net_penalty = 0

    #for the random agent
    def get_random_action(self, state):
        return random.choice(self.actions)

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'],
                      inputs['right'], self.next_waypoint)

        # TODO: Select action according to your policy

        #random agent approach
        #best_action = self.get_random_action(self.state)

        #Q-learning approach
        if random.random() < self.epsilon:  # add randomness to the choice
            best_action = random.choice(self.actions)

        else:
            best_action = None
            max_Q = None
            #find action with best Q value
            for available_action in self.actions:
                if self.Q[self.state][available_action] > max_Q:
                    best_action = available_action
                    max_Q = self.Q[self.state][available_action]

        #EPSILON DECAY. COMMENT OUT FOR INITIAL PARAMETER TUNING
        self.epsilon = 1 / (self.trial + self.epsilon_decay)

        # Execute action and get reward
        reward = self.env.act(self, best_action)
        self.net_reward += reward

        #successful if reward is large. count this and also store other stats from this trial.
        if reward > 8:
            self.success += 1

        if reward < 0:
            self.net_penalty += reward

        #if done, update stats
        if self.env.done == True or deadline == 0:
            self.penalty_tracker.append(self.net_penalty)
            self.reward_tracker.append(self.net_reward)
            if deadline == 0:
                self.success_tracker.append(0)
            else:
                self.success_tracker.append(1)
            self.trial_tracker.append(
                40 - deadline)  #number of turns it took to reach the end

        #if all trials done, show progression of performance indicators over time
        if self.trial == 100:
            print "net penalties over time: " + str(self.penalty_tracker)
            print "net rewards over time: " + str(self.reward_tracker)
            print "success pattern: " + str(self.success_tracker)
            print "trials required per round: " + str(self.trial_tracker)

        # TODO: Learn policy based on state, action, reward

        #get next state
        next_state = self.env.sense(self)
        next_waypoint = self.planner.next_waypoint()

        #re-represent this as a tuple since that's how the Q dict's keys are formatted
        next_state = (next_state['light'], next_state['oncoming'],
                      next_state['left'], next_state['right'], next_waypoint)
        #add state to q dict if not already there
        if next_state not in self.Q:
            self.Q[next_state] = {}
            for next_action in self.actions:
                self.Q[next_state][next_action] = self.default_Q

        utility_of_next_state = None
        #search through available actions, find one with higest Q
        for next_action in self.Q[next_state]:
            if self.Q[next_state][next_action] > utility_of_next_state:
                utility_of_next_state = self.Q[next_state][next_action]

        #next get utility of state
        utility_of_state = reward + self.discount_rate * utility_of_next_state

        # update Q Table
        self.Q[self.state][best_action] = (1 - self.alpha) * \
            self.Q[self.state][best_action] + self.alpha * utility_of_state

        #alpha update. comment this out when testing other inputs in isolation.
        #decay needed to ensure Q convergence. source: http://dreuarchive.cra.org/2001/manfredi/weeklyJournal/pricebot/node10.html
        if self.trial != 0:
            #self.alpha = 1 / (self.trial + self.epsilon_decay)
            num_trials = 100
            self.alpha = (self.initial_alpha * (num_trials / 10.0)) / (
                (num_trials / 10.0) + self.trial)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}, successes = {}".format(
            deadline, inputs, best_action, reward, self.success)  # [debug]
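
The nested loops that pre-populate the Q-table in Example #36 enumerate every combination of light, traffic and waypoint by hand. A slightly more compact construction of the same table using itertools.product is sketched below, keeping the example's state layout and default value.

import itertools

ACTIONS = (None, 'forward', 'left', 'right')
DEFAULT_Q = 1

Q = {}
# state = (light, oncoming, right, left, waypoint); waypoint is never None
for state in itertools.product(['green', 'red'], ACTIONS, ACTIONS, ACTIONS, ACTIONS[1:]):
    Q[state] = {action: DEFAULT_Q for action in ACTIONS}

print len(Q)  # 2 * 4 * 4 * 4 * 3 = 384 states
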
Example #37
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.A = ['forward', 'left', 'right', None]  # all available actions
        self.trial = 0  # the number of trials
        self.Q = {}  # Init Q table keyed by (light, next_waypoint)
        self.init_value = 1.5  # the value to initialize the Q table with
        for i in ['green', 'red']:  # possible lights
            for j in ['forward', 'left', 'right']:  ## possible next_waypoints
                self.Q[(i, j)] = [self.init_value] * len(self.A)  ## linearized Q table
        self.gamma = 0.3  # discount factor
        self.alpha = 0.5  # learning rate
        self.epsilon = 0.1  # prob to act randomly, higher value means more exploration, more random action
        self.reward_holder = ''  # str to hold neg rewards and a split term
        self.breaker = 0  # value to detect when a new trial occurs

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs['light'], self.next_waypoint)
        
        # TODO: Select action according to your policy
        #action = random.choice(self.env.valid_actions) # random action
        
        # Find the index of the max Q value for the current state
        max_Q = self.Q[self.state].index(max(self.Q[self.state]))

        # assign action
        p = random.random()  # generate random value between 0 and 1
        # simulated annealing approach to avoid local opt, mainly to avoid staying in the same place
        if p <= self.epsilon:
            action = random.choice(self.env.valid_actions)
        else:
            action = self.A[max_Q]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        '''
        #updating learning rate
        alpha_tune = 500  # tuning parameter
        alpha = 1/(1.1+self.trial/alpha_tune)  # decay learning rate
        self.trial = self.trial+1
        '''
        ## get the next state,action Q(s',a')
        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_next_waypoint)

        ## update Q table
        self.Q[self.state][self.A.index(action)] = \
            (1-self.alpha)*self.Q[self.state][self.A.index(action)] + \
            (self.alpha * (reward + self.gamma * max(self.Q[next_state])))

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]

        # to get an idea of penalties in later trials
        if deadline >= self.breaker:
            self.reward_holder += '3 '  # marker for the start of a new trial; splitting on it later leaves only the negative rewards for each trial
        self.breaker = deadline
        if reward < 0:
            self.reward_holder += str(reward) + ' '
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

        # Initialize Q
        # One dimension for state, another dimension for actions
        # By doing so, you can just map the corresponding state to choose action in the future
        self.Q = {}
        for i in ['green', 'red']:
            for j in [None, 'forward', 'left', 'right']:
                for k in self.env.valid_actions:
                    self.Q[(i, j, k)] = [3] * len(self.env.valid_actions)

        # Initialize basic parameters of the Q-learning equation
        # Learning rate
        self.alpha = 0.3
        # Discount rate
        self.gamma = 0.3
        # Simulated annealing: epsilon is a percentage chance of taking a random action
        self.epsilon = 10
        # Trials for plotting
        self.trials = -1
        self.max_trials = 20
        self.x_trials = range(0, self.max_trials)
        self.y_trials = range(0, self.max_trials)

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.trials = self.trials + 1

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        # These are the valid inputs from valid_inputs in the class Environment in environment.py:
        # 'light', 'oncoming', 'left', 'right'
        # The state is a tuple of 'light', 'oncoming', and next_waypoint
        self.state = (inputs['light'], inputs['oncoming'], self.next_waypoint)

        # TODO: Select action according to your policy

        # 1. Find the action corresponding to the maximum value of Q for the current state
        # self.Q[self.state] is the list of Q-values for the current state
        # index(max(self.Q[self.state])) gives the index of the action with the highest Q-value
        max_Q = self.Q[self.state].index(max(self.Q[self.state]))

        # 2. Then we select the action to maximize the value of Q

        # "Simulated annealing"
        # to allow the agent to still explore the environment
        # and not just still the the most conservative approach
        # It is a probabilistic technique for approximating the global optimum of a given function

        # This implies epsilon (probability of random action) is 10%
        # 0 and 1 can be the only values less than epsilon, 2

        #action = random.choice(self.env.valid_actions)

        a = random.randrange(0, 2)

        if a < self.epsilon:
            action = random.choice(self.env.valid_actions)
        else:
            action = self.env.valid_actions[max_Q]

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        # The subsequent next of state and action, Q(s',a')
        next_inputs = self.env.sense(self)
        next_next_waypoint = self.planner.next_waypoint()
        next_state = (next_inputs['light'], next_inputs['oncoming'],
                      next_next_waypoint)

        # Update Q
        Q_old = self.Q[self.state][self.env.valid_actions.index(action)]
        Q_next_utility = reward + self.gamma * max(self.Q[next_state])
        self.Q[self.state][self.env.valid_actions.index(action)] = \
            (1 - self.alpha) * Q_old + (self.alpha * Q_next_utility)

        # Determine if trial is successful (1) or not (0)
        if (deadline == 0) & (reward < 10):
            self.y_trials[self.trials] = 0
        else:
            self.y_trials[self.trials] = 1
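A minimal sketch of the epsilon-greedy choice made above, with epsilon expressed directly as a probability in [0, 1]; it assumes Q[state] is a list of values aligned with valid_actions (names are illustrative).

import random

def choose_epsilon_greedy(Q, valid_actions, state, epsilon=0.1):
    """With probability epsilon pick a random action, otherwise the greedy one."""
    if random.random() < epsilon:
        return random.choice(valid_actions)
    best_index = Q[state].index(max(Q[state]))  # index of the highest Q-value
    return valid_actions[best_index]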
Example #39
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env):
        super(LearningAgent, self).__init__(env)# sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'black'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        
        # TODO: Initialize any additional variables here
        self.state = None
        self.action = None
        self.alpha = 0.50
        self.gamma = 0.05
        self.epsilon = .01
        self.q_table = {}
        self.actions = [None, 'forward', 'left', 'right']
        self.trips = 0 
        softmax_probabilities = {} 
        self.previous_state = None
        self.last_action = None 
        self.last_reward = None
        
    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        # 
        self.trips += 1
        if self.trips >= 10:
            self.epsilon = .10
            if self.trips >= 20:
                self.epsilon = 0.00


    def get_action(self, state,softmax_probabilities):
        # With probability self.epsilon, sample an action from the softmax distribution;
        # otherwise take the argmax action (helps avoid local optima)
        if random.random()< self.epsilon:
            action = np.random.choice(self.actions, p = softmax_probabilities) 
        else: 
            action = self.actions[np.argmax(softmax_probabilities)]
        return action 

    def update(self, t):
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
    
        #TODO: Update state
        state = (inputs['light'], inputs['oncoming'], inputs['right'],inputs['left'], self.next_waypoint)
        self.state = state
        
        # Initialization of q_table values to .75
        for action in self.actions:
            if(state,action) not in self.q_table:
                self.q_table[(state,action)] = .75
            
        # Apply a softmax to the Q-values in q_table so the scores become probabilities in [0, 1]
        softmax_probabilities =[self.q_table[(state,None)],self.q_table[(state,'forward')], self.q_table[(state,'left')], self.q_table[(state, 'right')]] 
        # softmax = (e ^ x) / sum(e ^ x) 
        softmax_probabilities = np.exp(softmax_probabilities)/ np.sum(np.exp(softmax_probabilities), axis = 0)
        #Execute the action get reward
        action = self.get_action(self.state, softmax_probabilities)
        # Get a reward for the action
        reward = self.env.act(self, action)
        

        # TODO: Learn policy based on state, action, reward
        if self.previous_state is None:
            self.q_table[(state, action)] = ((1-self.alpha) * reward) + (self.alpha * self.q_table[(state, action)])
        else: 
            current_q = self.q_table[(self.previous_state, self.last_action)]
            future_q = self.q_table[(state,action)]
            current_q = (1 - self.alpha) * current_q + self.alpha * (self.last_reward + self.gamma * future_q)
            self.q_table[(self.previous_state, self.last_action)] = current_q
        
        # use previous_state to store the last state so that I can lag between current state and next 
        self.previous_state = self.state
        self.last_action = action 
        self.last_reward = reward
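A minimal sketch of the softmax step used above, assuming numpy is available and the q_table is keyed by (state, action) pairs; it subtracts the maximum before exponentiating for numerical stability.

import numpy as np

def softmax_action_probs(q_table, state, actions):
    """Turn the Q-values of one state into a probability distribution over actions."""
    q = np.array([q_table[(state, a)] for a in actions], dtype=float)
    q -= q.max()  # shift for numerical stability
    expq = np.exp(q)
    return expq / expq.sum()

# e.g. probs = softmax_action_probs(q_table, state, [None, 'forward', 'left', 'right'])
#      action = np.random.choice([None, 'forward', 'left', 'right'], p=probs)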
Example #40
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.t = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        #self.epsilon = self.epsilon -0.05

        self.a = 0.01
        self.t = self.t + 1

        if testing:
            self.epsilon, self.alpha = 0, 0

        else:
            #self.epsilon = (self.a)**self.t
            #self.epsilon =1.0/(self.t**2)
            self.epsilon = math.exp(-(self.a * self.t))

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        #   If it is not, create a dictionary in the Q-table for the current 'state'
        #   For each action, set the Q-value for the state-action pair to 0

        state = (waypoint, inputs['light'], inputs['left'], inputs['right'],
                 inputs['oncoming'])

        if self.learning:
            self.createQ(state)

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        ## My example for easy understanding
        #Q = {'left':10.0, 'right':11.0, 'forward':-5.0, None :0.0 }
        #max(Q)---> right

        #row = self.Q[state]
        #print row
        #maxkey = max(row)
        #maxQ = self.Q[state][maxkey]
        #maxQ = max(self.Q[state][action] for action in self.valid_actions)
        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        ## My example for easy understanding
        #a = {'One':1, 'Two':2}
        #keyword = 'Two'
        #print keyword in a

        #if self.learning:
        #    if state not in self.Q:
        #        self.Q[state] = {'left':0.0, 'right':0.0, 'forward':0.0, None :0.0 }

        if (self.learning) and (state not in self.Q):
            self.Q[state] = {action: 0 for action in self.valid_actions}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        #action = None

        ###########
        ## TO DO ##
        ###########

        ## My example for easy understanding
        #import random
        #foo = ['a', 'b', 'c', 'd', 'e']
        #print(random.choice(foo)

        if self.learning:
            if self.epsilon > random.random():
                action = random.choice(self.valid_actions)
            else:
                #action = max(self.Q[state])
                #action = self.get_maxQ(state)
                maxQ = self.get_maxQ(state)
                action = random.choice([
                    action for action in self.valid_actions
                    if self.Q[state][action] == maxQ
                ])
        else:
            action = random.choice(self.valid_actions)

        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')

        #next_waypoint = self.planner.next_waypoint()
        #next_inputs = self.env.sense(self)
        #next_light = next_inputs['light']
        #next_left = next_inputs['left']
        #next_right = next_inputs['right']
        #next_oncoming = next_inputs['oncoming']
        #next_state = (next_waypoint, next_light, next_left, next_right, next_oncoming)

        self.Q[state][action] = (
            1 - self.alpha) * self.Q[state][action] + self.alpha * reward
        #self.Q[state][action] = (1-self.alpha)*self.Q[state][action] + self.alpha*(reward+self.get_maxQ(next_state))
        #self.Q[state][action] = (1-self.alpha)*self.Q[state][action] + self.alpha*(reward + self.get_maxQ(next_state)- self.Q[state][action])
        #self.Q[state][action] = reward*self.alpha + (1-self.alpha)*self.Q[state][action]

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
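The commented-out decay lines in the examples above try several epsilon schedules. A small sketch comparing a few of them as functions of the trial counter t (the parameter a and the schedule names are assumptions for illustration):

import math

def epsilon_schedule(t, kind='exp', a=0.01):
    """Return epsilon for trial t under a few common decay schedules."""
    if kind == 'exp':          # smooth exponential decay, e^(-a*t)
        return math.exp(-a * t)
    if kind == 'inverse_sq':   # aggressive 1/t^2 decay
        return 1.0 / (t ** 2) if t > 0 else 1.0
    if kind == 'linear':       # linear decay clipped at zero
        return max(0.0, 1.0 - a * t)
    raise ValueError('unknown schedule: %s' % kind)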
Example #41
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor
        self.t = 0
        self.a = 0.000001
        # self.a = 0.00001, alpha = 0.2, epsilon = 1-a*t**2 --> A+ Safety/A+ Reliability, suboptimal grid
        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        self.t = self.t + 1
        self.epsilon = 1 - self.a * self.t**2
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE : you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        #state = None
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'])

        return state

    def get_maxQ(self, state):
        """ The get_maxQ function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        maxQ = -100000000
        for each in self.Q[state]:
            val = self.Q[state][each]
            if val > maxQ:
                maxQ = val

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        temp = ['None' if x == None else str(x) for x in state]
        strState = '_'.join(temp)
        if self.learning:
            temp = {}
            for action in self.valid_actions:
                temp[action] = 0.0
            self.Q[state] = self.Q.get(state, temp)

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            p = random.uniform(0, 1)
            if p < self.epsilon:
                action = random.choice(self.valid_actions)
            else:
                maxVal = -100000000
                for each in self.Q[state]:
                    val = self.Q[state][each]
                    if val > maxVal:
                        action = each
                        maxVal = val
                    elif val == maxVal:
                        print action, each
                        action = random.choice([action, each])

        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            # Q-value iteration update rule
            # (1-alpha)*Q(st,at) + alpha (rt+1 + gamma*maxQ(st+1,a))
            self.Q[state][action] = (
                1 - self.alpha) * self.Q[state][action] + self.alpha * (reward)

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
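The explicit max loop in choose_action above can be condensed; a minimal sketch assuming Q[state] is a dict mapping actions to values, with ties broken uniformly at random:

import random

def best_action_with_ties(Q, state):
    """Pick uniformly among all actions whose Q-value equals the maximum."""
    max_q = max(Q[state].values())
    best = [a for a, q in Q[state].items() if q == max_q]
    return random.choice(best)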
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        # Set any additional class parameters as needed
        self.t = 0
        random.seed(1000)

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0
        else:
            # commented out testing parameters
            # self.epsilon = self.epsilon - 0.05
            self.t += 1.0
            # self.epsilon = 1.0/(self.t**2)
            # self.epsilon = 1.0/(self.t**2 + self.alpha*self.t)
            # self.epsilon = math.fabs(math.cos(self.alpha*self.t))
            self.epsilon = math.fabs(math.cos(self.alpha * self.t))

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        #   If it is not, create a dictionary in the Q-table for the current 'state'
        #   For each action, set the Q-value for the state-action pair to 0

        # helper to create state string
        def xstr(s):
            if s is None:
                return 'None'
            else:
                return str(s)

        state = xstr(waypoint) + "_" + inputs['light'] + "_" + xstr(
            inputs['left']) + "_" + xstr(inputs['oncoming'])
        if self.learning:
            self.Q[state] = self.Q.get(state, {
                None: 0.0,
                'forward': 0.0,
                'left': 0.0,
                'right': 0.0
            })
        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        # Calculate the maximum Q-value of all actions for a given state
        # preset an initialization value that should be replaced by a more valid Q value in the loop.
        maxQ = -1000.0
        for action in self.Q[state]:
            if maxQ < self.Q[state][action]:
                maxQ = self.Q[state][action]
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            self.Q[state] = self.Q.get(state, {
                None: 0.0,
                'forward': 0.0,
                'left': 0.0,
                'right': 0.0
            })

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if self.epsilon > 0.01 and self.epsilon > random.random():
                action = random.choice(self.valid_actions)
            else:
                valid_actions = []
                maxQ = self.get_maxQ(state)
                for act in self.Q[state]:
                    if maxQ == self.Q[state][act]:
                        valid_actions.append(act)
                action = random.choice(valid_actions)
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = self.Q[state][action] + self.alpha * (
                reward - self.Q[state][action])

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
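The learn rule above is the incremental form of the gamma-free update; a minimal sketch showing its equivalence to the (1 - alpha) blend used in other examples, assuming a dict-of-dicts Q-table (names are illustrative):

def learn_no_discount(Q, state, action, reward, alpha=0.5):
    """Q(s,a) <- Q(s,a) + alpha * (r - Q(s,a)),
    algebraically identical to (1 - alpha) * Q(s,a) + alpha * r."""
    Q[state][action] += alpha * (reward - Q[state][action])
    return Q[state][action]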
Example #43
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'black'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        self.valid_actions = Environment.valid_actions

        # Intializing previous action, state, reward.
        self.prev_action = None
        self.prev_state = None
        self.prev_reward = None

        #initialize the Q_table
        self.Q = {}

        #Parameters for Q-Learning
        self.alpha = 0.85
        self.gamma = 0.45
        self.epsilon = 0.001
        self.default_Q = 10

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = None
        self.next_waypoint = None
        self.prev_action = None
        self.prev_state = None
        self.prev_reward = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        #Update inputs to include all relevant variables (next_waypoint, status, light, left)
        inputs['waypoint'] = self.next_waypoint
        del inputs['right']

        #Convert dictionary values of inputs into tuple to reflect state
        self.state = inputs
        state = tuple(inputs.values())

        # TODO: Select action according to your policy
        best_action = None

        #Exploration
        if random.random() <= self.epsilon:
            best_action = random.choice(self.valid_actions)
            if (state, best_action) not in self.Q.keys():
                self.Q[(state, best_action)] = self.default_Q
            Q_value = self.Q[(state, best_action)]

        else:
            Q_values = []
            for action in self.valid_actions:
                if (state, action) not in self.Q.keys():
                    self.Q[(state, action)] = self.default_Q
                Q_values.append(self.Q[(state, action)])

            #Find best_action and Q_value
            max_Q = max(Q_values)
            index = []
            for i, x in enumerate(Q_values):
                if x == max(Q_values):
                    index.append(i)
            i = random.choice(index)
            best_action = self.valid_actions[i]
            Q_value = Q_values[i]

        # Execute action and get reward
        action = best_action
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        if self.prev_state is not None:
            self.Q[(self.prev_state,
                    self.prev_action)] = (1 - self.alpha) * self.Q[
                        (self.prev_state, self.prev_action)] + self.alpha * (
                            self.prev_reward + self.gamma *
                            (self.Q[(state, action)]))

        #Update previous state, action, and reward
        self.prev_action = action
        self.prev_state = state
        self.prev_reward = reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(
            deadline, inputs, action, reward)  # [debug]
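A minimal sketch of the lazy, optimistically initialized Q-table access pattern used above, with the table keyed by (state, action) tuples; default_q is an illustrative stand-in for the example's self.default_Q.

def get_q(q_table, state, action, default_q=10.0):
    """Read a Q-value, treating unseen (state, action) pairs as default_q."""
    return q_table.get((state, action), default_q)

def set_q(q_table, state, action, value):
    """Write a Q-value, creating the entry if it does not exist yet."""
    q_table[(state, action)] = value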
Example #44
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=True, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        global t
        t = t + 1
        #        self.epsilon = self.epsilon - 0.05
        self.epsilon = math.exp(-0.01 * t)
        #        self.alpha =.5* math.exp(-0.01* t)

        if testing == True:
            self.epsilon = 0
            self.alpha = 0
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE : you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus
        # learn about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########

        if self.learning == True:
            if state not in self.Q.keys():
                self.Q[state] = {el: 0 for el in self.valid_actions}
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        max_value = self.get_maxQ(state)
        best_actions = [
            key for key, value in self.Q[state].items() if value == max_value
        ]
        if self.learning == False:
            action = random.choice(self.valid_actions)
        elif self.epsilon > random.random():
            action = random.choice(self.valid_actions)
        else:
            action = random.choice(best_actions)

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        if self.learning == True:
            self.Q[state][action] = self.alpha * reward + \
            (1-self.alpha) * self.Q[state][action]
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
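Several of these agents collapse the sensed inputs plus the planner waypoint into a hashable state. A minimal sketch of one way to do that step, assuming the usual inputs dict with 'light', 'oncoming', 'left', and 'right' keys; the ignore parameter and helper name are assumptions for illustration:

def build_state(inputs, waypoint, ignore=('right',)):
    """Return a hashable state tuple from the sense() dict and the planner waypoint."""
    kept = {k: v for k, v in inputs.items() if k not in ignore}
    return (waypoint,) + tuple(sorted(kept.items()))

# e.g. build_state({'light': 'green', 'oncoming': None, 'left': None, 'right': 'forward'}, 'left')
#      -> ('left', ('left', None), ('light', 'green'), ('oncoming', None))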
Example #45
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.alpha = 0.5
        self.gamma = 0.5
        self.epsilon = 1.0
        self.floor = 0.05
        self.log = True

        self.qinit = {'wait': .0, 'forward': .0, 'left': .0, 'right': .0}
        self.qdf = pd.DataFrame()

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.env.totalreward = 0.0

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state

        # Filter state space
        # Can ignore traffic when red light except for potential collision on right turn on red
        # Reason: Crossing red and accident have same penalty
        if inputs['light'] == 'red':
            if inputs['oncoming'] != 'left':
                inputs['oncoming'] = None
            if inputs['left'] != 'forward':
                inputs['left'] = None
            inputs['right'] = None

        if inputs['light'] == 'green':
            # ignore oncoming right turn
            if inputs['oncoming'] == 'right':
                inputs['oncoming'] = None
            # ignore right turn from left
            if inputs['left'] == 'right':
                inputs['left'] = None

        # Select State Space
        #state = ':'.join((inputs['light'], self.next_waypoint))
        state = ':'.join(
            (inputs['light'], str(inputs['oncoming']), str(inputs['left']),
             str(inputs['right']), self.next_waypoint))

        self.state = state
        if self.log: print "[DEBUG]: %s" % state

        if state not in self.qdf.index:
            newq = pd.DataFrame(self.qinit, index=[state])
            self.qdf = self.qdf.append(newq)

        # TODO: Select action according to your policy

        # select best action from Q matrix
        qaction = self.qdf.loc[state].idxmax()

        # random selection of action
        randaction = random.choice(self.env.valid_actions)

        # Action Policy
        sample = random.random()
        if self.log: print "[DEBUG] epsilon: %s" % self.epsilon
        if sample <= self.epsilon:
            action = randaction
            if self.log: print "[DEBUG] randomaction: %s" % action
        else:
            if qaction == 'wait':
                action = None
            else:
                action = qaction
            if self.log: print "[DEBUG] bestaction: %s" % action

        # Slowly decrease epsilon, leave at 5% floor
        if self.epsilon > self.floor:
            self.epsilon -= .00075

        # Override selection. DEBUG only !
        #action = qaction
        #action = randaction

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        if action is None:
            nowstate = self.qdf['wait'][state]
        else:
            nowstate = self.qdf[action][state]

        nextstate = nowstate * (1 - self.alpha) + self.alpha * (
            reward + self.gamma * 2.0)
        if self.log: print "[DEBUG] nextstate: %s" % nextstate

        # Update Q matrix
        if action is None:
            self.qdf['wait'][state] = nextstate
        else:
            self.qdf[action][state] = nextstate

        if self.log: print "[DEBUG] qdf:"
        if self.log: print self.qdf

        self.env.totalreward += reward
        if self.log: print "[DEBUG] totalreward: %s" % self.env.totalreward
        if self.log:
            print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(
                deadline, inputs, action, reward)  # [debug]
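A minimal sketch of the DataFrame-backed Q-table used above, written with .loc indexing; it assumes pandas is available and that the state is a string key as in the example.

import pandas as pd

actions = ['wait', 'forward', 'left', 'right']
qdf = pd.DataFrame(columns=actions, dtype=float)

def ensure_row(qdf, state, init=0.0):
    """Add a row of initial Q-values the first time a state string is seen."""
    if state not in qdf.index:
        qdf.loc[state] = init
    return qdf

state = 'green:None:None:None:forward'
qdf = ensure_row(qdf, state)
best = qdf.loc[state].idxmax()       # column label with the highest Q-value
qdf.loc[state, 'forward'] = 2.0      # update a single (state, action) cell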
Example #46
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, trials=1):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.learning_rate = 0.9
        self.Q = {}

        # Standard default Q value of 0
        # self.default_Q = 0

        # Optimal default Q value.
        self.default_Q = 1

        # discount factor is denoted by Beta in official Bellman equation formula
        # (http://web.stanford.edu/~pkurlat/teaching/5%20-%20The%20Bellman%20Equation.pdf),
        # or gamma in Udacity course.
        self.discount_factor = 0.33
        
        # How likely are we to pick random action / explore new paths?
        self.epsilon = 0.1 

        # How many times agent able to reach the target for given trials?
        self.success = 0
        self.total = 0
        self.trials = trials

        # How many penalties does an agent get?
        self.penalties = 0
        self.moves = 0

        self.net_reward = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.prev_state = None
        self.prev_action = None
        self.prev_reward = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        inputs['waypoint'] = self.next_waypoint

        # Tried deleting some inputs to see how it affected performance.
        # Success rate improved significantly and, counterintuitively, there were fewer penalties.
        del inputs['oncoming']
        del inputs['left']
        del inputs['right']

        # Tried to see if deadline info would improve performance; it turned out worse.
        # inputs['deadline'] = deadline

        self.state = tuple(sorted(inputs.items()))

        # TODO: Select action according to your policy

        # Random action 
        # action = random.choice(Environment.valid_actions)

        # Q learning chosen action
        _Q, action = self._select_Q_action(self.state)

        # Execute action and get reward
        reward = self.env.act(self, action)

        # Some stats
        self.net_reward += reward
        self.moves += 1
        if reward < 0:
            self.penalties+= 1

        add_total = False
        if deadline == 0:
            add_total = True
        if reward > 5:
            self.success += 1
            add_total = True
        if add_total:
            self.total += 1
            print self._more_stats()

        # TODO: Learn policy based on state, action, reward

        # Note that we update the previous state's Q-value, since the utility estimate always refers to the next (future) state.
        if self.prev_state is not None:
            if (self.prev_state, self.prev_action) not in self.Q:
                self.Q[(self.prev_state, self.prev_action)] = self.default_Q

            # Update to Q matrix as described in this lesson:
            # https://www.udacity.com/course/viewer#!/c-ud728-nd/l-5446820041/m-634899057

            # WRONG! `- self.Q[(self.prev_state, self.prev_action)]` should not be there.
            # self.Q[(self.prev_state,self.prev_action)] = (1 - self.learning_rate) * self.Q[(self.prev_state,self.prev_action)] + \
            # self.learning_rate * (self.prev_reward + self.discount_factor * \
            #     self._select_Q_action(self.state)[0] - self.Q[(self.prev_state, self.prev_action)])

            # Correct method:
            self.Q[(self.prev_state,self.prev_action)] = (1 - self.learning_rate) * self.Q[(self.prev_state,self.prev_action)] + \
            self.learning_rate * (self.prev_reward + self.discount_factor * \
                self._select_Q_action(self.state)[0])


        # pdb.set_trace()
        self.prev_state = self.state
        self.prev_action = action
        self.prev_reward = reward

        self.env.status_text += ' ' + self._more_stats()

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]

    def _more_stats(self):
        """Get additional stats"""
        return "success/total = {}/{} of {} trials (net reward: {})\npenalties/moves (penalty rate): {}/{} ({})".format(
                self.success, self.total, self.trials, self.net_reward, self.penalties, self.moves, round(float(self.penalties)/float(self.moves), 2))

    def _select_Q_action(self, state):
        """Select max Q and action based on given state.

        Args:
            state(tuple): Tuple of state e.g. (('heading', 'forward'), ('light', 'green'), ...).

        Returns:
            tuple: max Q value and best action
        """
        best_action = random.choice(Environment.valid_actions)
        if self._random_pick(self.epsilon):
            max_Q = self._get_Q(state, best_action)
        else:
            max_Q = -999999
            for action in Environment.valid_actions:
                Q = self._get_Q(state, action)
                if Q > max_Q:
                    max_Q = Q
                    best_action = action
                elif Q == max_Q:
                    if self._random_pick(0.5):
                        best_action = action
        return (max_Q, best_action)


    def _get_Q(self, state, action):
        """Get Q value given state and action.

        If Q value not found in self.Q, self.default_Q will be returned instead.

        Args:
            state(tuple): Tuple of state e.g. (('heading', 'forward'), ('light', 'green'), ...).
            action(string): None, 'forward', 'left', or 'right'.
        """
        return self.Q.get((state, action), self.default_Q)

    def _random_pick(self, epsilon=0.5):
        """Random pick with epsilon as bias.

        The larger epsilon is, the more likely it is to pick random action.

        Explanation is available at this course:
        https://www.udacity.com/course/viewer#!/c-ud728-nd/l-5446820041/m-634899065 starting at 2:10.

        One may consider this function as: When True, do random action.

        For equal chance, use epsilon = 0.5

        Args:
            epsilon(float): Likelihood of True.
        """
        return random.random() < epsilon
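The statistics bookkeeping in the example above can be kept separate from the learning logic; a small illustrative sketch (class and method names are assumptions, not part of the project code):

class TrialStats(object):
    """Track successes, moves, and penalties across trials."""

    def __init__(self):
        self.success = 0
        self.total = 0
        self.moves = 0
        self.penalties = 0

    def record_step(self, reward):
        self.moves += 1
        if reward < 0:
            self.penalties += 1

    def record_trial(self, reached_destination):
        self.total += 1
        if reached_destination:
            self.success += 1

    def penalty_rate(self):
        return float(self.penalties) / self.moves if self.moves else 0.0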
Example #47
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.t = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        # self.epsilon *= 0.995
        self.epsilon = math.cos(float(self.t) / 500)
        self.alpha = 0.5
        self.t += 1

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        # ex. waypoint is one of 'forward', 'left', 'right'
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        # ex. inputs = {'light': 'red', 'oncoming': None, 'left': 'right', 'right': 'forward'}
        deadline = self.env.get_deadline(self)  # Remaining deadline
        # ex. deadline =

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'],
                 inputs['right'])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        maxQ = max(self.Q[state].values())

        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if self.learning:
            self.Q.setdefault(state,
                              {action: 0.0
                               for action in self.valid_actions})

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        probability = random.random()
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if self.epsilon > probability:
                action = random.choice(self.valid_actions)
            else:
                maxQ = self.get_maxQ(state)
                action_maximum = [
                    action for action, Q in self.Q[state].iteritems()
                    if Q == maxQ
                ]
                action = random.choice(action_maximum)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')

        # use following function to update q value
        # Q(S,A) - (1-alpha)*Q(S,A) + alpha*[R + gamma*maxQ(S',a)]
        # gamma = 0

        # Q_next = self.get_maxQ(state) # tuple (action , qvalue)
        Q_actual = self.Q[state][action]  # qvalue
        self.Q[state][action] = (1 -
                                 self.alpha) * Q_actual + self.alpha * (reward)

        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
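Some examples above pre-populate the Q-table over an enumerated state space, while others create entries lazily. A minimal sketch of both patterns; the enumerated feature values are assumptions for illustration.

import itertools

valid_actions = [None, 'forward', 'left', 'right']

# Eager: enumerate every (light, waypoint) state up front.
lights = ['green', 'red']
waypoints = ['forward', 'left', 'right']
Q_eager = {(light, wp): {a: 0.0 for a in valid_actions}
           for light, wp in itertools.product(lights, waypoints)}

# Lazy: create a state's entry only the first time it is visited.
Q_lazy = {}

def create_q(Q, state):
    """Return the action-value dict for state, creating it with zeros if missing."""
    return Q.setdefault(state, {a: 0.0 for a in valid_actions})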
Example #48
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the first step at making a Q-Learning Agent which begins to learn. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Exploration rate
        self.alpha = alpha  # Learning rate
        self.gamma = 1.0  # Discount rate

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.trial_count = 0

    # def getDecay(self, decay):

    # self.epsilon = 1 / math.pow(self.trial_count, 2)
    # self.epsilon = math.exp(1) ** (-self.alpha * self.trial_count)
    # self.epsilon - math.cos(a * self.trial_count)
    # self.epsilon = self.epsilon / math.sqrt(self.trial_count)
    # self.epsilon = 1 / math.pow(self.trial_count, 2)

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.trial_count += 1
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        # self.epsilon -= 0.05
        if self.epsilon > 0:
            self.epsilon = math.pow(self.alpha, self.trial_count)
        if testing:
            self.epsilon = 0.0
            self.alpha = 0.0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        # deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs['light'], inputs['oncoming'])
        return state

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        #
        if self.learning:
            if state not in self.Q:
                self.Q[state] = dict()
                for action in self.valid_actions:
                    self.Q[state][action] = 0.0
        return

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        maxQ = max(self.Q[state].values())
        return maxQ

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        action = None
        if self.learning:
            if random.random() < self.epsilon:
                # choose action with 'epsilon probability'
                action = random.choice(self.valid_actions)
            else:
                action = random.choice([
                    i for i in self.Q[self.state].keys()
                    if self.Q[self.state][i] == self.get_maxQ(self.state)
                ])
                # bestQ = None
                # bestAction = None
                # for action in self.valid_actions:
                #     thisQ = self.Q[state][action]
                #     if thisQ > bestQ:
                #         bestQ = thisQ
                #         bestAction = action
                # action = bestAction

        else:
            # choose random action
            action = random.choice(self.valid_actions)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')

        if self.learning:
            self.Q[state][action] = (
                1 - self.alpha) * self.Q[state][action] + (reward * self.alpha)
        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
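Example #48 decays epsilon as alpha**trial_count and leaves several alternative schedules in comments. A small sketch of how those candidate schedules fall off with the trial number (the constants are assumptions for illustration only):

import math

def decay_schedules(trial, alpha=0.5):
    # Candidate epsilon values for a given training trial number.
    return {
        'power_of_alpha': alpha ** trial,         # alpha ** t (used above)
        'inverse_square': 1.0 / (trial ** 2),     # 1 / t^2
        'exponential': math.exp(-alpha * trial),  # e^(-alpha * t)
        'inverse_sqrt': 1.0 / math.sqrt(trial),   # stand-in for epsilon / sqrt(t)
    }

for t in (1, 5, 10, 20):
    print((t, decay_schedules(t)))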
Example #49
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):

        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor
        self.gamma = 0.9  # Discount factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.trial_step = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0

        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            self.trial_step += 1
            #Default
            #self.epsilon = self.epsilon - 0.05

            # Gave excellent results
            self.epsilon = 0.95**self.trial_step

            #self.epsilon = 1/(self.trial_step**2)
            #self.epsilon = math.cos(self.trial_step/19.0)
            #print "*************{}*************".format(self.trial_step)
            #self.gamma = math.pow(0.9,self.trial_step)
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        #deadline seems relevant but dramatically increases state space.
        state = (waypoint, inputs['light'], inputs['oncoming'])
        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        """ THIS CODE RETURNS ACTION ASSOCIATED WITH maxQ value
        maxQ_val = 0
 		#Choose action with max Q value
        for act in self.Q[state]:
            print "Action = {}	Q_val = {}".format(act,self.Q[state][act])
            if self.Q[state][act] > maxQ_val:
 				action = act
 				maxQ_val = self.Q[state][act]
 				print "Action with > Q"
 				
        if maxQ_val == 0:
 			action = random.choice(self.valid_actions)
 			print "Random Action because Q all 0" 		
        """
        maxQ = max(self.Q[state].values())
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if self.Q.get(state) is None:
                self.Q[state] = {a: 0 for a in self.valid_actions}
                #print "Creating state Actions: "
                #for acts in self.Q[state]:
                #	print "actions = {}	Q vals = {}".format(acts, self.Q[state][acts])
            else:
                pass  #print "State previously visited"

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        #for debugging purposes
        randy = random.random()
        if (not self.learning) or (randy < self.epsilon):
            action = random.choice(self.valid_actions)
            #print "randy < epsilon; {} < {}".format(randy, self.epsilon)
            print "Random Action Epsilon"
        else:
            print "Getting Max Q"
            maxQ = self.get_maxQ(state)
            print "maxQ = {}".format(maxQ)
            # Kind of clever solution but only returns 1 key/action
            #action = self.Q[state].keys()[self.Q[state].values().index(maxQ)]
            maxQ_actions = []
            for action, q_val in self.Q[state].items():
                if q_val == maxQ:
                    maxQ_actions.append(action)

            print "bucha actions = {}".format(maxQ_actions)
            #print "type of thing = {}".format(type(maxQ_actions))
            action = random.choice(maxQ_actions)
            print action

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        #self.get_maxQ(state)
        # This implementation does not use gamma... but if it did:
        # self.Q[state][action] += self.alpha * (reward + gamma * max_a' Q(s', a') - self.Q[state][action])
        if self.learning:
            self.Q[state][action] += self.alpha * (reward -
                                                   self.Q[state][action])
            print "new Q = {}".format(self.Q[state][action])

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
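The learn() methods in these examples apply the reduced update Q(s,a) <- (1-alpha)*Q(s,a) + alpha*r, as the project brief requires (no discount factor). For contrast, a generic one-step Q-learning update that does include gamma might look like the sketch below; q_table is assumed to map each state to a dict of action Q-values:

def q_update(q_table, state, action, reward, next_state, alpha=0.5, gamma=0.9):
    # One-step Q-learning: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    future = max(q_table[next_state].values()) if next_state in q_table else 0.0
    old = q_table[state][action]
    q_table[state][action] = old + alpha * (reward + gamma * future - old)
    return q_table[state][action]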
Example #50
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = {}

        # Discount
        # self.gamma = 0.8
        self.gamma = 0.5
        # self.gamma = 0.1
        # self.gamma = 0.5

        self.epsilon = 0.95  # <===
        # self.epsilon = 0.75

        # self.epsilon = 0.5

        # self.epsilon = 0.5
        # self.epsilon = 0.05
        # self.epsilon = 0.25
        # self.epsilon = 0.99
        # self.epsilon = 0.95

        # Learning Rate
        # self.alpha = 1.
        # self.alpha = 0.8  # <===
        self.alpha = 0.5

        # self.alpha = 0.25
        # self.alpha = 0.5

        self.trips_rewards = {}
        self.trips_timers = {}
        self.trips_steps = {}
        self.trips_finished = {}

        self.trip_reward = 0
        self.total_reward = 0
        self.total_finished = 0

        self.current_step = 1
        self.total_steps = 1
        self.trip_number = 0

        # self.max_trips = 20
        self.max_trips = 100

        self.valid_actions = [None, 'forward', 'left', 'right']

        print ''
        print CSI + "7;30;43m" + "Welcome to SmartCab simulation" + CSI + "0m"
        print ''
        print '=================================================='
        print '-------------------- config ----------------------'
        print 'gamma: ' + str(self.gamma)
        print 'epsilon: ' + str(self.epsilon)
        print 'alpha: ' + str(self.alpha)
        print '--------------------------------------------------'
        print ''

        self.timer = time.time()

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.trip_number += 1
        self.current_step = 0
        self.trip_reward = 0
        self.trip_begin_time = time.time()

        if self.trip_number > self.max_trips:  # 20
            # Recording results to log.txt
            print 'self.results_log'
            print self.results_log

            # # Output to the log file
            # f = open('log.txt','ab+')
            # f.write(self.results_log)
            # f.write("\n")
            # f.close()

            # sys.exit(0) # exit if reached the self.max_trips from config

        print '[SmartCab] New trip - awesome!'

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        state = {}
        state['action'] = self.next_waypoint  # desired action
        state['light'] = inputs['light']
        state['oncoming_traffic'] = inputs['oncoming']
        state['traffic_on_the_left'] = inputs['left']
        state = tuple(state.items())
        self.state = state

        print ''
        print '=================================================='
        print '-------------------- stats -----------------------'
        print 'trip_number: ' + str(self.trip_number)
        print 'current_step: ' + str(self.current_step)
        print 'deadline: ' + str(deadline)
        print 'total_steps: ' + str(self.total_steps)
        print 'trip_reward: ' + str(self.trip_reward)
        print 'total_reward: ' + str(self.total_reward)
        print 'total_reward/total_steps: ' + str(
            self.total_reward / self.total_steps)
        print 'timer: ' + str(round(time.time() - self.timer, 2))
        print '-------------------- STATE -----------------------'
        # ap(state)
        print state
        print '--------------------------------------------------'

        # TODO: Select action according to your policy
        # Select action based on previous outcomes

        # Check if this state occurred before
        print '[SmartCab] Check if this state occurred before'
        if state in self.Q:
            print '=> State is found in Q-table'
            print '=> State occurred ' + str(
                self.Q[state]['count']) + ' time(s)'

            print '[SmartCab] Compare the pre-configured parameter epsilon with a random value: '
            random_value = random.random()

            if random_value > self.epsilon:
                print '=> random_value: ' + str(
                    random_value) + ' > epsilon: ' + str(self.epsilon)
                action = random.choice(self.valid_actions)
                print '=> Taking random action: ' + str(action)
            else:
                print '=> random_value: ' + str(
                    random_value) + ' < epsilon: ' + str(self.epsilon)
                print '[SmartCab] Find available_actions with maximum Q^ score:'
                available_actions = [
                    k for k, v in self.Q[state]['actions_Q^'].items()
                    if v == max(self.Q[state]['actions_Q^'].values())
                ]
                print '=> ' + str(available_actions)
                action = random.choice(available_actions)

                print '[SmartCab] Take random action from the list of available_actions with max Q^: ' + str(
                    action)

            # # ======================================================
            # # Alternative model: Take action in the direction of Waypoint if reward > 0 else wait (None)
            # # ------------------------------------------------------
            # # Check previous reward for desired action ( self.next_waypoint ) in similar state ( state )
            # if self.Q[state]['reward'] > 0:
            #     action = self.next_waypoint
            # else:
            #     action = None
            # # ======================================================

        else:
            print '=> State NOT found in Q-table'

            print '[SmartCab] Assigning default values for this state inside Qtable'

            self.Q[state] = {}
            self.Q[state]['count'] = 0
            self.Q[state]['actions_rewards'] = {}
            self.Q[state]['actions_counts'] = {}
            self.Q[state]['actions_Q^'] = {}

            # action = random.choice(self.valid_actions)

            # Alternative:
            # 'None' may be removed to encourage exploration (as advised by the Udacity review):
            action = random.choice(['forward', 'left', 'right'])

            # action = self.next_waypoint
            print '=> Taking random action: ' + str(action)

        # # ======================================================
        # # Alternative model: Take random action
        # # ------------------------------------------------------
        # action = random.choice(self.valid_actions)
        # print '=> Taking random action: ' + str(action)
        # # ------------------------------------------------------

        # Count the occurrence of the current state
        self.Q[state]['count'] += 1

        # Execute action and get reward
        print '--------------------------------------------------'
        reward = self.env.act(self, action)
        print '--------------------------------------------------'
        print '=> Reward: ' + str(reward)

        if reward == 12:
            self.trips_finished[self.trip_number] = 1
            self.total_finished += 1
        else:
            self.trips_finished[self.trip_number] = 0

        print ''
        print '[SmartCab] Calculating and recording current results to Qtable'

        # TODO: Learn policy based on state, action, reward
        if action in self.Q[state]['actions_rewards']:
            self.Q[state]['actions_rewards'][action] = reward
            self.Q[state]['actions_counts'][action] += 1

            Qh = self.Q[state]['actions_Q^'][action]
            Qh = Qh + (self.alpha *
                       (reward +
                        (self.gamma *
                         (max(self.Q[state]['actions_Q^'].values()))) - Qh))

            # # # ======================================================
            # # # Alternative models: Q^ = 0.5 ; Q^ = 1 ; Q^ = reward
            # # # ------------------------------------------------------
            # Qh = 0.5
            # Qh = 1
            # Qh = reward
            # # # ------------------------------------------------------

            self.Q[state]['actions_Q^'][action] = Qh

        else:
            self.Q[state]['actions_rewards'][action] = reward
            self.Q[state]['actions_counts'][action] = 1
            # self.Q[state]['actions_Q^'][action] = 0.5
            # self.Q[state]['actions_Q^'][action] = 1
            # self.Q[state]['actions_Q^'][action] = reward
            # self.Q[state]['actions_Q^'][action] = 0.0
            # self.Q[state]['actions_Q^'][action] = 2.5
            # self.Q[state]['actions_Q^'][action] = 2.0
            self.Q[state]['actions_Q^'][action] = 1.5
            # self.Q[state]['actions_Q^'][action] = 1.0
            # self.Q[state]['actions_Q^'][action] = 0.5

        # Reward for driving in the direction of the waypoint
        self.Q[state]['reward'] = reward

        print '--------------------------------------------------'
        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(
            deadline, inputs, action, reward)  # [debug]

        self.total_reward += reward
        self.trip_reward += reward
        self.current_step += 1
        self.total_steps += 1

        print '-----------------self.Q[state]--------------------'

        # ap(self.Q[state])
        print self.Q[state]

        print '=================trips_results===================='
        self.trips_steps[self.trip_number] = self.current_step
        print 'Steps: ' + str(self.trips_steps)

        self.trips_rewards[self.trip_number] = self.trip_reward
        print 'Rewards: ' + str(self.trips_rewards)

        self.trips_timers[self.trip_number] = round(
            time.time() - self.trip_begin_time, 2)
        print 'Timers: ' + str(self.trips_timers)

        print 'Finished: ' + str(self.trips_finished)
        # print 'Timers: ' + str(self.trips_timers)

        self.results_log = 'Total steps: ' + str(
            self.total_steps) + '; reward: ' + str(
                self.total_reward) + '; finished: ' + str(
                    self.total_finished) + '; seconds: ' + str(
                        round(time.time() - self.timer, 4))
        print self.results_log
        print '=================================================='
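The example above seeds unseen (state, action) entries with a positive Q^ value (1.5) rather than 0.0, a form of optimistic initialization: untried actions look attractive until experience pulls their estimates down. A minimal sketch of the idea (the constant and names are illustrative assumptions):

def init_state(q_table, state, valid_actions, optimistic_value=1.5):
    # Give every untried action in a newly seen state an optimistic starting value.
    if state not in q_table:
        q_table[state] = {a: optimistic_value for a in valid_actions}
    return q_table[state]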
Example #51
0
class LearningAgent_v0(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent_v0, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.gamma = 0.2  #upper bound of discount
        self.alpha = 0.1  #upper bound of learning rate
        self.epsilon = 0.2  #lower bound of proportion of random steps
        #self.state = {'deadline': None, 'forward_ok': True, 'left_ok': True, 'right_ok': True, 'next_waypoint': None }
        self.state = {
            'light': 'green',
            'oncoming': None,
            'left': None,
            'right': None,
            'next_waypoint': None
        }
        self.exreward = 0
        #self.exstate = {'deadline': None, 'forward_ok': True, 'left_ok': True, 'right_ok': True, 'next_waypoint': None }
        self.exstate = {
            'light': 'green',
            'oncoming': None,
            'left': None,
            'right': None,
            'next_waypoint': None
        }
        self.exaction = None
        self.debug = False
        self.trip_history = []

        self.reward_history = np.zeros(1)

        self.Q = OrderedDict()

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state = {
            'light': 'green',
            'oncoming': None,
            'left': None,
            'right': None,
            'next_waypoint': None
        }
        self.exreward = 0
        self.exstate = {
            'light': 'green',
            'oncoming': None,
            'left': None,
            'right': None,
            'next_waypoint': None
        }
        self.exaction = None

        if (len(self.trip_history) < 150):
            print 'Current success rate is {}'.format(
                sum(self.trip_history) / (len(self.trip_history) + 0.000001))
        else:
            print 'Success rate for recent 100 trials is {}'.format(
                sum(self.trip_history[-100:]) /
                (len(self.trip_history[-100:]) + 0.000001))
        print 'Average reward for recent moves is {}'.format(
            np.mean(self.reward_history))
        if (self.reward_history.shape[0] > 1000):
            self.reward_history = np.delete(self.reward_history, range(100))

        #print "number of parameter is {0}, sum of Qfunction is {1}".format( len(self.Q.keys()), sum(self.Q.values()) )

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        #may need to take deadline into account?

        # TODO: Update state
        #self.state = {'inputs': inputs, 'deadline': deadline, 'next_waypoint':self.next_waypoint}
        #self.state = {'inputs': inputs, 'next_waypoint':self.next_waypoint}

        #epsilon = self.epsilon + (1-self.epsilon)/(t+1)*5
        #gamma = ( 1- 10/(t+10) ) * self.gamma
        #alpha = self.alpha/(t+1.0)
        gamma = self.gamma
        epsilon = self.epsilon
        alpha = self.alpha

        self.state['next_waypoint'] = self.next_waypoint
        #self.state['deadline'] = int(deadline>5) + int(deadline>25)

        for k in inputs.keys():
            self.state[k] = inputs[k]

        # TODO: Select action according to your policy
        #action = random.choice(Environment.valid_actions[1:])

        newkey = str(self.exstate.values()) + ':' + str(self.exaction)

        if (self.debug):
            print "\n New key is {}".format(newkey)
            #debug
            raw_input("Press Enter to continue...")

        tmp_Q = dict([(x, self.Q[x]) for x in self.Q.keys()
                      if str(self.state.values()) in x])

        #print tmp_Q

        if self.debug:
            print "\n Q value for future state is {}".format(str(tmp_Q))
            #debug
            raw_input("Press Enter to continue...")

        action = random.choice(Environment.valid_actions[:])
        tmp_max_Q = 0
        if (len(tmp_Q) == 0):
            tmp_max_Q = 0
            action = random.choice(Environment.valid_actions[:])
        else:
            #tmp_idx = max(tmp_Q)
            tmp_idx = max(tmp_Q.iterkeys(), key=(lambda key: tmp_Q[key]))
            tmp_max_Q = tmp_Q[tmp_idx]
            if (tmp_max_Q > 0 or len(tmp_Q) == 4):
                #print tmp_idx
                tmp_Q_split = tmp_idx.split(':')
                #print tmp_Q_split
                #print tmp_Q_split
                action = tmp_Q_split[1]
                if action == 'None':
                    action = None
            else:
                exist_actions = [x.split(':')[1] for x in tmp_Q.keys()]
                all_actions = ['None', 'forward', 'left', 'right']
                remaining_actions = [
                    x for x in all_actions if not (x in exist_actions)
                ]
                if self.debug:
                    print "Remaining actions are {}".format(
                        str(remaining_actions))
                action = random.choice(remaining_actions)
                tmp_max_Q = 0
                if action == 'None':
                    action = None

        if self.debug:
            print "\n future optimum action is {}".format(str(action))
            #debug
            raw_input("Press Enter to continue...")

        if (random.uniform(0, 1) < epsilon):
            action = random.choice(Environment.valid_actions[:])
            if self.debug:
                print "\n Instead use a random action, {}".format(str(action))
                #debug
                raw_input("Press Enter to continue...")

        #print 'now ' + str(action)
        #random guessing has a success rate of about 0.20
        #action = random.choice(Environment.valid_actions[:])

        newval = self.exreward + gamma * tmp_max_Q

        if self.debug:
            print "\n current reward is {0}, gamma is {1}, and estimated max future Q is {2}".format(
                self.exreward, gamma, tmp_max_Q)
            #debug
            raw_input("Press Enter to continue...")

        if newkey in self.Q.keys():
            self.Q[newkey] = self.Q[newkey] * (1 - alpha) + alpha * newval
        else:
            self.Q[newkey] = self.alpha * newval

        if self.debug:
            print "updated Q values {}".format(str(self.Q))
            #debug
            raw_input("Press Enter to continue...")

        #print t
        # Execute action and get reward
        reward = self.env.act(self, action)

        if reward > 9:
            self.trip_history.append(1)
            #need to write down something here
            newkey = str(self.state.values()) + ':' + str(action)
            newval = reward  # + deadline
            if newkey in self.Q.keys():
                self.Q[newkey] = self.Q[newkey] * (1 - alpha) + alpha * newval
            else:
                self.Q[newkey] = self.alpha * newval
        elif deadline == 0:
            self.trip_history.append(0)
            newkey = str(self.state.values()) + ':' + str(action)
            newval = reward
            if newkey in self.Q.keys():
                self.Q[newkey] = self.Q[newkey] * (1 - alpha) + alpha * newval
            else:
                self.Q[newkey] = self.alpha * newval

        # TODO: Learn policy based on state, action, reward
        self.exreward = reward
        self.exstate = self.state
        self.exaction = action
        self.reward_history = np.append(self.reward_history, reward)
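Example #51 keys its Q dictionary with strings of the form str(state.values()) + ':' + str(action), which later forces string parsing (split(':')) to recover the action. Keying on a (state, action) tuple avoids that round trip; a brief sketch under that assumption (names are illustrative):

def make_key(state_dict, action):
    # Sort the items so the key does not depend on dict iteration order.
    return (tuple(sorted(state_dict.items())), action)

Q = {}
Q[make_key({'light': 'green', 'oncoming': None}, 'forward')] = 2.0
print(Q)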
Example #52
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q_table = {}
        self.gamma = 0.8
        self.alpha = 0.5
        self.epsilon = 0.1
        self.epsilon_decay_rate = 0.0001
        self.valid_actions = [None, 'forward', 'left', 'right']

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required

    # build a state from the inputs and the next_waypoint
    def return_state(self, inputs):
        return (
            inputs['light'],
            inputs['oncoming'],
            inputs['left'],
            # inputs['right'],
            self.next_waypoint,
            # self.env.get_deadline(self)
        )

    # read Q_value from Q_table based on state and action
    def get_Q_value(self, state, action):
        if (state, action) in self.Q_table:
            return self.Q_table[(state, action)]
        else:
            return 0

    # return the max Q_value based on the state and all possible actions
    def get_Max_Q(self, state):
        max_Q_value = 0
        for action in self.valid_actions:
            if max_Q_value < self.get_Q_value(state, action):
                max_Q_value = self.get_Q_value(state, action)
        return max_Q_value

    # update Q_value in Q_Table
    def update_Q_value(self, state, action, reward):
        exist_Q_value = self.get_Q_value(state, action)
        self.next_waypoint = self.planner.next_waypoint()
        next_state = self.return_state(self.env.sense(self))
        Q_value = exist_Q_value + self.alpha * (
            reward + self.gamma * self.get_Max_Q(next_state) - exist_Q_value)
        self.Q_table[(state, action)] = Q_value

    # define the action policy: take the action which results in the max Q_value from the current state
    def action_policy(self, state):
        action_to_take = None
        best_Q_value = 0
        for action in self.valid_actions:
            if self.get_Q_value(state, action) > best_Q_value:
                best_Q_value = self.get_Q_value(state, action)
                action_to_take = action
            elif self.get_Q_value(state, action) == best_Q_value:
                action_to_take = action
        return action_to_take

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # get the state parameters and display them.
        self.state = self.return_state(inputs)

        # TODO: Select action according to your policy

        # action  = random.choice(Environment.valid_actions[1:])

        # select an action according to the policy, with an epsilon-greedy exploration strategy
        if self.epsilon > random.random():
            action = random.choice(self.valid_actions)
        else:
            action = self.action_policy(self.state)
            self.epsilon -= self.epsilon_decay_rate

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        self.update_Q_value(self.state, action, reward)

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(
            deadline, inputs, action, reward)  # [debug]
Example #53
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.Q = defaultdict(list)
        self.alphaLearn = .3
        self.gammaDiscount = .99
        self.stateAction = None
        self.epsilon = .1
        self.penalty = 0
        self.oldPenalty = None

    def reset(self, destination=None):
        self.stateAction = None
        self.planner.route_to(destination)
        print 'penalty = {}'.format(
            self.penalty if self.oldPenalty is None else (self.penalty -
                                                          self.oldPenalty))
        self.oldPenalty = self.penalty
        # TODO: Prepare for a new trip; reset any variables here, if required

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = [
            inputs['light'], inputs['right'], inputs['left'],
            self.next_waypoint
        ]
        # print 'this is waypoint {}'.format(self.next_waypoint)
        # TODO: Select action according to your policy

        action = None
        maxQ = 0
        if random.random() < self.epsilon:
            action = random.choice([None, 'forward', 'left', 'right'])
        else:
            for k, v in self.Q.items():
                # Keys are 5-tuples: the 4-element state followed by the action.
                if list(k[0:4]) == self.state and v > maxQ:
                    action = k[4]
                    maxQ = v
            if action is None:
                action = random.choice([None, 'forward', 'left', 'right'])

        # Execute action and get reward
        reward = self.env.act(self, action)
        if reward < 0:
            self.penalty += 1
            # print 'violation', self.penalty

        # TODO: Learn policy based on state, action, reward
        if self.stateAction is not None:
            if self.Q[tuple(self.stateAction)]:
                self.Q[tuple(self.stateAction)] = self.Q[tuple(
                    self.stateAction)] + self.alphaLearn * (
                        reward + self.gammaDiscount * maxQ -
                        self.Q[tuple(self.stateAction)])
            else:
                self.Q[tuple(self.stateAction)] = self.alphaLearn * (
                    reward + self.gammaDiscount * maxQ)
        else:
            print 'Q learning start'

        #print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]

        self.stateAction = [
            inputs['light'], inputs['right'], inputs['left'],
            self.next_waypoint, action
        ]
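Example #53 declares its Q-table as defaultdict(list) but stores floats in it, relying on the empty list being falsy the first time a key is checked. defaultdict(float) expresses the same zero default more directly; a minimal sketch:

from collections import defaultdict

Q = defaultdict(float)  # unseen (state..., action) keys default to 0.0
key = ('green', None, None, 'forward', 'left')
Q[key] += 0.3 * (2.0 + 0.99 * 0.0 - Q[key])  # incremental update toward the target
print(Q[key])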
Example #54
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""
    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'taxi'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here

        # Simple statistical counter variables
        self.trial_count = 0
        self.trial_summary = {}
        for i in range(2):
            self.trial_summary[i] = 0
        self.cycles = 0
        self.time_count_pos = {}
        self.time_count_neg = {}
        self.deadline_start_col = [0] * 100
        self.deadline_end_col = [0] * 100

    def deadline_stats(self, start, deadline):
        if (start):
            self.deadline_start_col.append(deadline)
        elif not start:
            self.deadline_end_col.append(deadline)

    def success_stats(self, suc):
        if (suc):
            self.trial_summary[1] += 1
            self.time_count_pos[self.cycles] += 1
        else:
            self.trial_summary[0] += 1
            self.time_count_neg[self.cycles] += 1

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.trial_count = 0
        self.cycles += 1

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state

        # TODO: Select action according to your policy
        action = random.choice([None, 'forward', 'left', 'right'])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(
            deadline, inputs, action, reward)  # [debug]
Example #56
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    # Learning rate
    ALPHA = 0.66
    # Discount factor
    GAMMA = 0.003

    def __init__(self, env):
        super(LearningAgent, self).__init__(
            env
        )  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(
            self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.state_prev = None
        self.action_prev = None
        self.reward_prev = None
        # Initialize Q-values w random values
        self.Q = {}
        for nwp, l, o, a in product(self.env.valid_actions, ('green', 'red'),
                                    self.env.valid_actions,
                                    self.env.valid_actions):
            self.Q[LAState(nwp, l, o), a] = random.uniform(1, 5)
        # Keep track of how many times the primary agent reached destination
        self.num_reached = 0

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        self.state_prev = None
        self.action_prev = None
        self.reward_prev = None

    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint(
        )  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        state = LAState(next_waypoint=self.next_waypoint,
                        light=inputs['light'],
                        oncoming=inputs['oncoming'])

        # TODO: Select action according to your policy
        # Basic driving agent
        #action = random.choice(self.env.valid_actions)
        # Action with highest Q score for current state
        action = max(self.env.valid_actions, key=lambda a: self.Q[state, a])

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward
        if self.state_prev is not None:
            self.Q[self.state_prev, self.action_prev] = \
                (1-self.ALPHA) * self.Q[self.state_prev, self.action_prev] + \
                self.ALPHA * (self.reward_prev + self.GAMMA * self.Q[state, action])

        # Update previous values for the next iteration
        self.state_prev = state
        self.action_prev = action
        self.reward_prev = reward

        # Update number of times that agent reached destination
        self.num_reached += self.env.done

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(
            deadline, inputs, action, reward)  # [debug]
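Example #56 defers its update by one step: it stores (state, action, reward) and, on the next call, updates that stored pair using the value of the action just chosen in the new state. A compact sketch of that bookkeeping, assuming a Q dict keyed by (state, action) pairs (names are illustrative):

class DelayedQUpdater(object):
    # Apply each Q update one step late, once the successor state is observed.
    def __init__(self, q, alpha=0.66, gamma=0.003):
        self.q, self.alpha, self.gamma = q, alpha, gamma
        self.prev = None  # (state, action, reward) from the previous step

    def step(self, state, action, reward):
        if self.prev is not None:
            ps, pa, pr = self.prev
            old = self.q[(ps, pa)]
            # Bootstrap on the Q-value of the action chosen in the new state.
            self.q[(ps, pa)] = (1 - self.alpha) * old + \
                self.alpha * (pr + self.gamma * self.q[(state, action)])
        self.prev = (state, action, reward)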
Example #57
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)
        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0
        else:
            self.epsilon = self.epsilon - 0.005
        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        state = (waypoint, inputs["light"], inputs["oncoming"], inputs["left"])

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        if state not in self.Q:
            maxQ = None
        else:
            maxQ = max(self.Q[state].values())
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            if state not in self.Q:
                self.Q[state] = {action: 0.0 for action in self.valid_actions}

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()
        action = None
        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state
        if not self.learning or random.random() < self.epsilon:
            action = random.choice(self.valid_actions)
        else:
            maxQval = self.get_maxQ(state)
            best_actions = []
            for action in self.valid_actions:
                if self.Q[state][action] == maxQval:
                    best_actions.append(action)
            action = random.choice(best_actions)
        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = (
                1 - self.alpha) * self.Q[state][action] + self.alpha * (reward)

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
Example #58
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """
    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent,
              self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict(
        )  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.train_no = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ###########
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice
        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        import math
        self.train_no = self.train_no + 1
        # self.epsilon = math.e**(-0.01 * self.train_no)
        # self.alpha = math.e**(-0.001 * (self.train_no))
        # self.epsilon = self.epsilon - 0.05*self.train_no
        # self.alpha = self.alpha - 0.05*self.train_no
        # self.alpha -= 0.005

        # for Question 5, decay function
        self.epsilon -= 0.05

        # for Question 6, use exponential function
        # self.epsilon = 0.7**self.train_no
        # results
        # Trial Aborted!
        # Agent did not reach the destination.

        # for Question 6, use 1/train_no/train_no
        # self.epsilon = 1/self.train_no **2
        # results
        # Trial Aborted!
        # Agent did not reach the destination.

        # for Question 6, use exponential function
        # self.epsilon = math.e**(-0.01 * self.train_no)
        # self.alpha = math.e**(-0.001 * (self.train_no))did not use
        # results
        # Trial Completed!
        # Agent reached the destination.

        if testing:
            self.epsilon = 0
            self.alpha = 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the
            environment. The next waypoint, the intersection inputs, and the deadline
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(
            self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ###########
        ## TO DO ##
        ###########

        # NOTE: you are not allowed to engineer features outside of the inputs available.
        # Because the aim of this project is to teach Reinforcement Learning, we have placed
        # constraints in order for you to learn how to adjust epsilon and alpha, and thus learn
        # about the balance between exploration and exploitation.
        # With the hand-engineered features, this learning process gets entirely negated.

        # Set 'state' as a tuple of relevant data for the agent
        state = (inputs['light'], inputs['oncoming'], inputs['left'], waypoint)

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ###########
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state
        # q = [self.Q.get((state,action), 0.0) for action in self.valid_actions]
        q = [self.Q[state][action] for action in self.Q[state]]
        if q:
            maxQ = max(q)
        else:
            maxQ = 0.0
        #maxAction = max(self.Q[state], key=self.Q[state].get)

        #return self.Q[state][maxAction]
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ###########
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        # Then, for each action available, set the initial Q-value to 0.0
        if self.learning:
            #if self.Q.get(state, None) is None:
            if state not in self.Q:
                self.Q[state] = {}
                for action in self.valid_actions:
                    # self.Q[(state,action)] = 0.0
                    self.Q[state][action] = 0.0

        return

    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        ###########
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        # Otherwise, choose an action with the highest Q-value for the current state
        # Be sure that when choosing an action with highest Q-value that you randomly select between actions that "tie".
        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if random.random() < self.epsilon:
                action = random.choice(self.valid_actions)
            else:
                # maxQ = self.get_maxQ(state)
                # maxActions = []
                # for action_key in self.Q[state]:
                #     if self.Q[state][action_key] == maxQ:
                #         maxActions.append(action_key)
                #action = random.choice(maxActions)
                maxQ = self.get_maxQ(state)
                max_actions = []
                for action in self.Q[state]:
                    if self.Q[state][action] == maxQ:
                        max_actions.append(action)
                action = random.choice(max_actions)

        return action

    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards
            when conducting learning. """

        ###########
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        # Use only the learning rate 'alpha' (do not use the discount factor 'gamma')
        if self.learning:
            self.Q[state][action] = self.Q[state][action] * (
                1 - self.alpha) + reward * self.alpha
        return

    def update(self):
        """ The update function is called when a time step is completed in the
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
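The reset method above records several epsilon decay schedules that were tried: a fixed linear subtraction, an exponential e**(-0.01 * t), and 1/t**2. The standalone sketch below is not part of any submission; the 0.05 tolerance and the parameter values are illustrative assumptions, chosen only to show how quickly each schedule stops exploring, which is the trade-off the comments describe between finishing training quickly and exploring long enough to keep reaching the destination.

import math

def linear_decay(trial, step=0.05):
    # epsilon = 1 - step * trial, clipped at zero (matches the "subtract 0.05 each reset" idea)
    return max(1.0 - step * trial, 0.0)

def exponential_decay(trial, rate=0.01):
    # epsilon = e**(-rate * trial)
    return math.exp(-rate * trial)

def inverse_square_decay(trial):
    # epsilon = 1 / trial**2
    return 1.0 / (trial * trial) if trial > 0 else 1.0

# Illustrative comparison: training trials before epsilon drops below a 0.05 tolerance
for name, schedule in [('linear', linear_decay),
                       ('exponential', exponential_decay),
                       ('1/t^2', inverse_square_decay)]:
    trial = 1
    while schedule(trial) >= 0.05:
        trial += 1
    print("{} decay drops below 0.05 after {} trials".format(name, trial))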
Example #59
0
class LearningAgent(Agent):
    """ An agent that learns to drive in the Smartcab world.
        This is the object you will be modifying. """

    def __init__(self, env, learning=False, epsilon=1.0, alpha=0.5):
        super(LearningAgent, self).__init__(env)  # Set the agent in the environment
        self.planner = RoutePlanner(self.env, self)  # Create a route planner
        self.valid_actions = self.env.valid_actions  # The set of valid actions

        # Set parameters of the learning agent
        self.learning = learning  # Whether the agent is expected to learn
        self.Q = dict()  # Create a Q-table which will be a dictionary of tuples
        self.epsilon = epsilon  # Random exploration factor
        self.alpha = alpha  # Learning factor

        ###########
        ## TO DO ##
        ###########
        # Set any additional class parameters as needed
        self.total_trials = 0

    def reset(self, destination=None, testing=False):
        """ The reset function is called at the beginning of each trial.
            'testing' is set to True if testing trials are being used
            once training trials have completed. """

        # Select the destination as the new location to route to
        self.planner.route_to(destination)

        ########### 
        ## TO DO ##
        ###########
        # Update epsilon using a decay function of your choice

        # self.epsilon = self.epsilon - 0.05
        self.epsilon = max(self.epsilon - 0.01, 0.0)
        self.total_trials += 1
        # self.epsilon = 1.5 * math.exp(-0.6 * self.total_trials)
        # self.epsilon = math.cos(0.5 * self.total_trials)
        # self.epsilon = 1.0 / float(self.total_trials * self.total_trials)
        # self.epsilon = math.pow(0.75, self.total_trials)



        # Update additional class parameters as needed
        # If 'testing' is True, set epsilon and alpha to 0
        if testing:
            self.epsilon = 0
            self.alpha = 0

        return None

    def build_state(self):
        """ The build_state function is called when the agent requests data from the 
            environment. The next waypoint, the intersection inputs, and the deadline 
            are all features available to the agent. """

        # Collect data about the environment
        waypoint = self.planner.next_waypoint()  # The next waypoint
        inputs = self.env.sense(self)  # Visual input - intersection light and traffic
        deadline = self.env.get_deadline(self)  # Remaining deadline

        ########### 
        ## TO DO ##
        ###########
        # Set 'state' as a tuple of relevant data for the agent
        # When learning, check if the state is in the Q-table
        #   If it is not, create a dictionary in the Q-table for the current 'state'
        #   For each action, set the Q-value for the state-action pair to 0

        # state = None
        state = (waypoint, inputs['light'], inputs['oncoming'], inputs['left'])  # , inputs['right']

        return state

    def get_maxQ(self, state):
        """ The get_max_Q function is called when the agent is asked to find the
            maximum Q-value of all actions based on the 'state' the smartcab is in. """

        ########### 
        ## TO DO ##
        ###########
        # Calculate the maximum Q-value of all actions for a given state

        actions = self.Q[state]
        maxQ = max(actions.values())
        return maxQ

    def createQ(self, state):
        """ The createQ function is called when a state is generated by the agent. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, check if the 'state' is not in the Q-table
        # If it is not, create a new dictionary for that state
        #   Then, for each action available, set the initial Q-value to 0.0

        if self.learning and state not in self.Q:
            self.Q[state] = {None: 0.0, 'left': 0.0, 'right': 0.0, 'forward': 0.0}

        return


    def choose_action(self, state):
        """ The choose_action function is called when the agent is asked to choose
            which action to take, based on the 'state' the smartcab is in. """

        # Set the agent state and default action
        self.state = state
        self.next_waypoint = self.planner.next_waypoint()

        ########### 
        ## TO DO ##
        ###########
        # When not learning, choose a random action
        # When learning, choose a random action with 'epsilon' probability
        #   Otherwise, choose an action with the highest Q-value for the current state

        if not self.learning:
            action = random.choice(self.valid_actions)
        else:
            if random.random() < self.epsilon:
                action = random.choice(self.valid_actions)
            else:
                maxQ = self.get_maxQ(state)
                candidate_actions = [action for action in self.valid_actions if self.Q[state][action] == maxQ]
                action = random.choice(candidate_actions)
        return action



    def learn(self, state, action, reward):
        """ The learn function is called after the agent completes an action and
            receives a reward. This function does not consider future rewards 
            when conducting learning. """

        ########### 
        ## TO DO ##
        ###########
        # When learning, implement the value iteration update rule
        #   Use only the learning rate 'alpha' (do not use the discount factor 'gamma')

        # we want to use the alpha update: v = (1 - alpha) v + alpha * x
        if self.learning:
            previous = self.Q[state][action]
            self.Q[state][action] = (1.0 - self.alpha) * previous + self.alpha * reward

        return

    def update(self):
        """ The update function is called when a time step is completed in the 
            environment for a given trial. This function will build the agent
            state, choose an action, receive a reward, and learn if enabled. """

        state = self.build_state()  # Get current state
        self.createQ(state)  # Create 'state' in Q-table
        action = self.choose_action(state)  # Choose an action
        reward = self.env.act(self, action)  # Receive a reward
        self.learn(state, action, reward)  # Q-learn

        return
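Both submissions above use the same alpha-only update, Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * reward, with no discounted future-reward term. A minimal numeric sketch of how a Q-value drifts toward a constant reward under that rule; the alpha and reward values here are arbitrary and not taken from either submission.

def alpha_only_update(q, reward, alpha):
    # Single-step update used above: no gamma / future-reward term
    return (1.0 - alpha) * q + alpha * reward

q = 0.0
alpha = 0.5
reward = 2.0
for step in range(1, 6):
    q = alpha_only_update(q, reward, alpha)
    print("after step {}: Q = {}".format(step, q))
# Q converges geometrically toward the reward: 1.0, 1.5, 1.75, 1.875, 1.9375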
Example #60
0
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, epsilon=0.8, alpha=0.8, gamma=0.9,tno=1):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        #self.q = {}  # initialize q table dictionary
        self.epsilon = epsilon  # exploration rate
        self.alpha = alpha  # learning rate
        self.gamma = gamma  # discount rate
        #self.lesson_counter = 0  # counts number of steps learned
        #self.steps_counter = 0  # counts steps in the trial
        self.reward_previous = None
        self.action_previous = None
        self.state_previous = None
        #self.q = {(('red',None),'right'):10,(('green',None),'forward'):15, (('green',None),'left'):20,(('red',None),'left'):25}
        self.q = {}
        self.tno = 0
        #self.success = []

    def reset(self, destination=None):
        self.planner.route_to(destination)
        # TODO: Prepare for a new trip; reset any variables here, if required
        #self.steps_counter = 0  # reset the step counter at the start of every trial
        self.tno += 1


    def getQ(self, state, action):
        return self.q.get((state, action), 10.0)  #helper function to get the q-value from the q-table.
	

 
    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)
        location = self.env.agent_states[self]["location"]
        destination = self.env.agent_states[self]["destination"]
        trial_num = t
        #self.tno += 1
        #sucess = self.env.sucess()

        # TODO: Update state
        self.state = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)
        
        # TODO: Select action according to your policy
        #action = None
        self.actions = [None, 'left', 'right', 'forward']
        #action = self.next_waypoint
        #action = random.choice(Environment.valid_actions)
        #if random.random() < self.epsilon:  # epsilon-greedy exploration
        if random.random() < self.epsilon * 1. / self.tno:  # epsilon-greedy exploration, decayed by trial number
            action = random.choice(self.actions)
        else:
            q = [self.getQ(self.state, a) for a in self.actions]  # Q-values of the current state for every action
            maxQ = max(q)  # highest Q-value in the list
            count = q.count(maxQ)  # number of actions that tie for the maximum
            if count > 1:
                best = [i for i in range(len(self.actions)) if q[i] == maxQ]
                i = random.choice(best)  # break ties by picking one of the maximising actions at random
            else:
                i = q.index(maxQ)
            action = self.actions[i]
        #action = random.choice(Environment.valid_actions)

        #decay rate
        #Environment.valid_actions
        #defaultdict

        # Execute action and get reward
        reward = self.env.act(self, action)

        # TODO: Learn policy based on state, action, reward

        if t == 0:
            oldv = self.q.get((self.state, action), 10.0)
            self.q[(self.state, action)] = oldv  # first step of the trial: just initialise the Q-value
        else:
            oldv = self.q.get((self.state_previous, self.action_previous), 0.0)  # later steps: start from the previous state-action Q-value
            self.q[(self.state, action)] = oldv + self.alpha * (reward - oldv + self.gamma * self.q.get((self.state, action), 10.0))
            #try:
            #    self.q[(self.state,action)] = oldv + self.alpha * (reward - oldv + self.gamma * self.q[(self.state,action)])
            #except Exception:
            #    self.q[(self.state,action)] = reward + 10  # if the state has not been visited before, the except branch runs; otherwise the try branch runs
            #if oldv is None:
            #    self.q[(self.state,action)] = reward
            #else:
            #    self.q[(self.state,action)] = oldv + self.alpha * (reward - oldv)  # + self.gamma * self.q[(self.state, action)] - oldv)
            #    self.q[(self.state,action)] = oldv + self.alpha * (reward - oldv + self.gamma)  # * self.q[(self.state, action)] - oldv)

        uQ = self.q
        self.state_previous = self.state
        self.action_previous = action
        #self.steps_counter += 1
        #self.trial_num += 1

        print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}, uQ = {}, location = {}, destination = {}, tno = {}".format(deadline, inputs, action, reward, uQ, location,destination, 1./self.tno)  # [debug]