Example #1
 def __init__(self,
              model,
              k=2,
              discount_rate=0.9,
              u0=1,
              std0=0.2,
              sampling_interval=20):
     self.model = model
     self.discount_rate = discount_rate
     self.keepr = Keeper()
     # priority queue for ML
     self.ML_queue = UniquePriorityQueue()
     # comparison constant
     self.delta = 0.001
     # number of back-ups per action
     self.k = k
     # default mean and std
     self.u0 = u0
     self.std0 = std0
     # draw initial hypothesis
     self.hypothesis = Hypothesis.draw_init_hypothesis(
         model, self.u0, self.std0)
     # maximum-likelihood V
     self.ML_V = {}
     # interval to draw samples
     self.sampling_interval = sampling_interval
 def test_1(self):
     queue = UniquePriorityQueue()
     queue.push(2, 3)
     queue.push(0, 5)
     queue.push(8, 1)
     (p, v) = queue.pop()
     self.assertEqual(5, v)        
     (p, v) = queue.pop()
     self.assertEqual(3, v)        
     (p, v) = queue.pop()
     self.assertEqual(1, v)  
Example #3
 def __init__(self,
              model,
              k=2,
              epsilon=1,
              degrading_constant=0.99,
              discount_rate=0.9):
     self.model = model
     # value model
     self.V = {}
     # book-keeping keeper
     self.keepr = Keeper()
     # parameters for the algorithm
     self.k = k
     self.epsilon = epsilon
     self.degrading_constant = degrading_constant
     self.discount_rate = discount_rate
     # priority queue
     self.queue = UniquePriorityQueue()
     self.delta = 0.001
Example #4
 def __init__(self, model, k = 2, epsilon = 1, degrading_constant = 0.99, discount_rate = 0.9):
     self.model = model
     # value model
     self.V = {}
     # book-keeping keeper
     self.keepr = Keeper()
     # parameters for the algorithm
     self.k = k
     self.epsilon = epsilon
     self.degrading_constant = degrading_constant
     self.discount_rate = discount_rate
     # priority queue
     self.queue = UniquePriorityQueue()
     self.delta = 0.001
 def __init__(self, model, k = 2 , discount_rate = 0.9, u0 = 1, std0 = 0.2, sampling_interval = 20):
     self.model = model
     self.discount_rate = discount_rate
     self.keepr = Keeper()
     # priority queue for ML
     self.ML_queue = UniquePriorityQueue()
     # comparison constant
     self.delta = 0.001
     # number of back-ups per action
     self.k = k
     # default mean and std
     self.u0 = u0
     self.std0 = std0
     # draw initial hypothesis
     self.hypothesis = Hypothesis.draw_init_hypothesis(model, self.u0, self.std0)
     # maximum-likelihood V
     self.ML_V = {}
     # interval to draw samples
     self.sampling_interval = sampling_interval
Example #6
class PrioritizedSweeping(RLAlgorithm):
    # model: the input model
    # epsilon: the parameter for randomization
    def __init__(self,
                 model,
                 k=2,
                 epsilon=1,
                 degrading_constant=0.99,
                 discount_rate=0.9):
        self.model = model
        # value model
        self.V = {}
        # book-keeping keeper
        self.keepr = Keeper()
        # parameters for the algorithm
        self.k = k
        self.epsilon = epsilon
        self.degrading_constant = degrading_constant
        self.discount_rate = discount_rate
        # priority queue
        self.queue = UniquePriorityQueue()
        self.delta = 0.001

    # compute the value function V(s)
    def get_v(self, state):
        return self.V.get(state, 0)

    def print_v_table(self):
        for state in self.model.states:
            print(state, self.get_v(state))

    # compute the best reward from one state to another
    def get_best_reward(self, state, next_state):
        actions = self.model.get_actions(state)
        reward = self.get_reward(state, actions[0], next_state)
        for action in actions:
            reward = max(reward, self.get_reward(state, action, next_state))
        return reward

    # get the next best state (highest value plus best one-step reward)
    # if several states tie for the best, return one of them at random
    def get_next_best_state(self, state):
        L = self.model.get_next_states(state)
        best_state = [L[0]]
        m = self.get_v(L[0]) + self.get_best_reward(state, L[0])
        for s in L[1:]:
            # first check for a tie, then check for a greater value
            temp = self.get_v(s) + self.get_best_reward(state, s)
            if abs(temp - m) < self.delta * m:
                best_state.append(s)
            elif temp > m:
                m = temp
                best_state = [s]
        return random.choice(best_state)

    # for any state, get the best action with the highest expected reward
    # if there are actions with equal rewards, return one randomly
    def get_best_action(self, state, next_state):
        actions = self.model.get_actions(state)
        # small constant to differentiate between an action with zero reward and an unrelated state
        constant = 0.1
        p = self.get_transition(state, actions[0], next_state)
        r = self.get_reward(state, actions[0], next_state) + constant
        # expected reward
        m = p * r
        best_action = [actions[0]]
        for a in actions[1:]:
            # first check for a tie,
            # then check for a greater expected reward
            temp = self.get_transition(state, a, next_state) * (
                self.get_reward(state, a, next_state) + constant)
            if abs(temp - m) < self.delta * m:
                best_action.append(a)
            elif temp > m:
                m = temp
                best_action = [a]
        return random.choice(best_action)

    # for any state, get the best action to get into that state from the current state
    # if there are actions with equal probability, choose a random one
    def get_best_action_probability(self, state, next_state):
        actions = self.model.get_actions(state)
        p = self.get_transition(state, actions[0], next_state)
        action = [actions[0]]
        for a in actions[1:]:
            # first check for a tie,
            # then check for a greater probability
            if abs(self.get_transition(state, a, next_state) - p) < self.delta:
                action.append(a)
            elif self.get_transition(state, a, next_state) > p:
                p = self.get_transition(state, a, next_state)
                action = [a]
        return random.choice(action)

    # compute impact C(s, s*) = sum over a P(s|s*,a)*delta(s)
    # s1: current state, s0: predecessor
    def compute_impact(self, s1, s0, delta):
        s = 0
        for action in self.model.get_actions(s0):
            s += self.get_transition(s0, action, s1) * delta
        return s

    # V(s, a) = sum over s' P(s'|s,a)*(R(s,a,s') + V(s')*discount_rate)
    def compute_v_per_action(self, state, action):
        s = 0
        for next_state in self.model.get_next_states(state):
            s += self.get_transition(state, action, next_state) * (
                self.get_reward(state, action, next_state) +
                self.get_v(next_state) * self.discount_rate)
        return s

    # perform a Bellman backup on that state
    def sweep(self, state):
        actions = self.model.get_actions(state)
        V_new = self.compute_v_per_action(state, actions[0])
        for action in actions[1:]:
            V_new = max(V_new, self.compute_v_per_action(state, action))
        delta_change = abs(self.get_v(state) - V_new)
        # update the dictionary
        self.V[state] = V_new
        # now compute the priority queue for the predecessor
        for s0 in self.model.get_prev_states(state):
            capacity = self.compute_impact(state, s0, delta_change)
            self.queue.push_or_update(-capacity, s0)

    def choose_action(self, state):
        # with some probability, choose a random action
        action = None
        if random.random() < self.epsilon:
            actions = self.model.get_actions(state)
            action = random.choice(actions)
            self.epsilon *= self.degrading_constant
            # make sure that we do still explore at the minimum level
            self.epsilon = max(self.epsilon, 0.01)
        else:
            best_next_state = self.get_next_best_state(state)
            # action = self.get_best_action(state, best_next_state)
            action = self.get_best_action_probability(state, best_next_state)
        return action

    def sweep_queue(self):
        for i in range(self.k - 1):
            (v, state) = self.queue.pop()
            self.sweep(state)

    def next(self, action=None):
        if action is None:
            action = self.choose_action(self.model.current_state)
        current_state = self.model.current_state
        reward = self.model.perform(action)
        next_state = self.model.current_state
        self.update_transition(current_state, action, next_state)
        self.update_reward(current_state, action, next_state, reward)
        self.sweep(current_state)
        self.sweep_queue()
        # print self.V
        #if (current_state.id == 8):
        #print "state 8 has been reached"
        #print (current_state, action, next_state, reward)
        #print "reward = ", self.get_reward(current_state, action, next_state)
        return (action, reward, next_state)
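
For context, a hedged usage sketch that is not part of the example above: PrioritizedSweeping needs import random plus a model object exposing the interface the class calls (current_state, states, perform, get_actions, get_next_states, get_prev_states, and the get_transition/get_reward/update_transition/update_reward bookkeeping inherited from RLAlgorithm). GridModel below is a hypothetical stand-in for such a model.

# Hypothetical usage sketch; GridModel is a stand-in for any model that
# implements the interface listed above and is not defined in these examples.
model = GridModel()
agent = PrioritizedSweeping(model, k=3, epsilon=1, discount_rate=0.9)
for step in range(10000):
    # choose an action (epsilon-greedy), perform it, then run k prioritized backups
    action, reward, next_state = agent.next()
agent.print_v_table()
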
 def test_2(self):
     queue = UniquePriorityQueue()
     queue.push_or_update(2, 3)
     queue.push_or_update(10, 5)
     queue.push_or_update(8, 1)
     queue.push_or_update(0, 5)
     queue.push_or_update(4, 6)
     (p, v) = queue.pop()
     self.assertEqual(5, v)        
     (p, v) = queue.pop()
     self.assertEqual(3, v)        
     (p, v) = queue.pop()
     self.assertEqual(6, v)  
     (p, v) = queue.pop()
     self.assertEqual(1, v)          
     queue.push_or_update(10, 5)
     (p, v) = queue.pop()
     self.assertEqual(5, v)          
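
test_1 and test_2 above pin down the behavior expected of UniquePriorityQueue: pop returns the entry with the lowest priority, each value is stored at most once, and push_or_update replaces the priority of a value that is already queued (which is why sweep pushes -capacity, so the largest impact pops first). The class itself never appears in these examples; the following is a minimal heapq-based sketch that satisfies both tests, under the assumption that always overwriting the stored priority is acceptable.

import heapq
import itertools

# Minimal sketch of a UniquePriorityQueue consistent with test_1 and test_2.
# Assumptions: lowest priority pops first, each value is queued at most once,
# and push_or_update simply overwrites the stored priority; stale heap entries
# left behind by an update are skipped lazily inside pop().
class UniquePriorityQueue:
    def __init__(self):
        self.heap = []                    # entries: (priority, tie_breaker, value)
        self.priorities = {}              # value -> its current priority
        self.counter = itertools.count()  # tie-breaker so values are never compared

    def push(self, priority, value):
        self.priorities[value] = priority
        heapq.heappush(self.heap, (priority, next(self.counter), value))

    def push_or_update(self, priority, value):
        # overwrite the priority; any older heap entry for this value becomes stale
        self.push(priority, value)

    def pop(self):
        while self.heap:
            priority, _, value = heapq.heappop(self.heap)
            # skip entries superseded by a later push_or_update
            if self.priorities.get(value) == priority:
                del self.priorities[value]
                return (priority, value)
        raise IndexError("pop from an empty UniquePriorityQueue")
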
Example #8
class PrioritizedSweeping(RLAlgorithm):
    # model: the input model
    # epsilon: the parameter for randomization
    def __init__(self, model, k = 2, epsilon = 1, degrading_constant = 0.99, discount_rate = 0.9):
        self.model = model
        # value model
        self.V = {}
        # book-keeping keeper
        self.keepr = Keeper()
        # parameters for the algorithm
        self.k = k
        self.epsilon = epsilon
        self.degrading_constant = degrading_constant
        self.discount_rate = discount_rate
        # priority queue
        self.queue = UniquePriorityQueue()
        self.delta = 0.001

    # compute the value function V(s)
    def get_v(self, state):
        return self.V.get(state, 0)

    def print_v_table(self):
        for state in self.model.states:
            print (state, self.get_v(state))

    # compute the best reward from one state to another
    def get_best_reward(self, state, next_state):
        actions = self.model.get_actions(state)
        reward = self.get_reward(state, actions[0], next_state)
        for action in actions:
            reward = max(reward, self.get_reward(state, action, next_state))
        return reward

    # get the next best state (highest value plus best one-step reward)
    # if several states tie for the best, return one of them at random
    def get_next_best_state(self, state):
        L = self.model.get_next_states(state)
        best_state = [L[0]]
        m = self.get_v(L[0]) + self.get_best_reward(state, L[0])
        for s in L[1:]:
            # first check for a tie, then check for a greater value
            temp = self.get_v(s) + self.get_best_reward(state, s)
            if abs(temp - m)  < self.delta*m:
                best_state.append(s)
            elif temp > m:
                m = temp
                best_state = [s]
        return random.choice(best_state)

    # for any state, get the best action with the highest expected reward
    # if there are actions with equal rewards, return one randomly
    def get_best_action(self, state, next_state):
        actions = self.model.get_actions(state)
        # small constant to differentiate between an action with zero reward and an unrelated state
        constant = 0.1
        p = self.get_transition(state, actions[0], next_state)
        r = self.get_reward(state, actions[0], next_state) + constant
        # expected reward
        m = p*r
        best_action = [actions[0]]
        for a in actions[1:]:
            # first check for a tie,
            # then check for a greater expected reward
            temp = self.get_transition(state, a, next_state)*(self.get_reward(state, a, next_state) + constant)
            if abs(temp - m) < self.delta*m:
                best_action.append(a)
            elif temp > m:
                m = temp
                best_action = [a]
        return random.choice(best_action)

    # for any state, get the best action to get into that state from the current state
    # if there are actions with equal probability, choose a random one
    def get_best_action_probability(self, state, next_state):
        actions = self.model.get_actions(state)
        p = self.get_transition(state, actions[0], next_state)
        action = [actions[0]]
        for a in actions[1:]:
            # first check for a tie,
            # then check for a greater probability
            if abs(self.get_transition(state, a, next_state) - p) < self.delta:
                action.append(a)
            elif self.get_transition(state, a, next_state) > p:
                p = self.get_transition(state, a, next_state)
                action = [a]
        return random.choice(action)

    # compute impact C(s, s*) = sum over a P(s|s*,a)*delta(s)
    # s1: current state, s0: predecessor
    def compute_impact(self, s1, s0, delta):
        s = 0
        for action in self.model.get_actions(s0):
            s += self.get_transition(s0, action, s1)*delta
        return s

    # V(s, a) = sum over s' P(s'|s,a)*(R(s,a,s') + V(s')*discount_rate)
    def compute_v_per_action(self, state, action):
        s = 0
        for next_state in self.model.get_next_states(state):
            s += self.get_transition(state, action, next_state)*(
                self.get_reward(state, action, next_state) + self.get_v(next_state)*self.discount_rate)
        return s

    # perform a Bellman backup on that state
    def sweep(self, state):
        actions = self.model.get_actions(state)
        V_new = self.compute_v_per_action(state, actions[0])
        for action in actions[1:]:
            V_new = max(V_new, self.compute_v_per_action(state, action))
        delta_change = abs(self.get_v(state) - V_new)
        # update the dictionary
        self.V[state] = V_new
        # now compute the priority queue for the predecessor
        for s0 in self.model.get_prev_states(state):
            capacity = self.compute_impact(state, s0, delta_change)
            self.queue.push_or_update(-capacity, s0)

    def choose_action(self, state):
        # with some probability, choose a random action
        action = None
        if random.random() < self.epsilon:
            actions = self.model.get_actions(state)
            action = random.choice(actions)
            self.epsilon *= self.degrading_constant
            # make sure that we do still explore at the minimum level
            self.epsilon = max(self.epsilon, 0.01)
        else:
            best_next_state = self.get_next_best_state(state)
            # action = self.get_best_action(state, best_next_state)
            action = self.get_best_action_probability(state, best_next_state)
        return action
    
    def sweep_queue(self):
        for i in range(self.k - 1):
            (v, state) = self.queue.pop()
            self.sweep(state)        

    def next(self, action = None):
        if action is None:
            action = self.choose_action(self.model.current_state)
        current_state = self.model.current_state
        reward = self.model.perform(action)
        next_state = self.model.current_state
        self.update_transition(current_state, action, next_state)
        self.update_reward(current_state, action, next_state, reward)
        self.sweep(current_state)
        self.sweep_queue()
        # print self.V
        #if (current_state.id == 8):
            #print "state 8 has been reached"
            #print (current_state, action, next_state, reward)
            #print "reward = ", self.get_reward(current_state, action, next_state)            
        return (action, reward, next_state)
class BayesPrioritizedSweeping(RLAlgorithm):
    def __init__(self, model, k = 2 , discount_rate = 0.9, u0 = 1, std0 = 0.2, sampling_interval = 20):
        self.model = model
        self.discount_rate = discount_rate
        self.keepr = Keeper()
        # priority queue for ML
        self.ML_queue = UniquePriorityQueue()
        # comparison constant
        self.delta = 0.001
        # number of back-ups per action
        self.k = k
        # default mean and std
        self.u0 = u0
        self.std0 = std0
        # draw initial hypothesis
        self.hypothesis = Hypothesis.draw_init_hypothesis(model, self.u0, self.std0)
        # maximum-likelihood V
        self.ML_V = {}
        # interval to draw samples
        self.sampling_interval = sampling_interval
    
    def get_ML_transition(self, s1, a, s2):
        return RLAlgorithm.get_transition(self, s1, a, s2)
    
    def get_ML_reward(self, s1, a, s2):
        return RLAlgorithm.get_reward(self, s1, a, s2)
    
    def get_ML_v(self, state):
        return self.ML_V.get(state, 0)
    
    def update_ML_v(self, state, value):
        self.ML_V[state] = value
    
    # compute impact C(s, s*) = sum over a P(s|s*,a)*delta(s)
    # s1: current state, s0: predecessor
    def compute_impact(self, s1, s0, delta, transition_func):
        s = 0
        for action in self.model.get_actions(s0):
            s += transition_func(s0, action, s1)*delta
        return s    
    
    # V(s, a) = sum over s' P(s'|s,a)*(R(s,a,s') + V(s')*discount_rate)
    def compute_v_per_action(self, state, action, transition_func, reward_func, v_func):
        s = 0
        for next_state in self.model.get_next_states(state):
            s += transition_func(state, action, next_state)*(
                reward_func(state, action, next_state) + v_func(next_state)*self.discount_rate)
        return s
    
    def sweep_ML(self, state):
        self.sweep(state,
                   self.get_ML_transition,
                   self.get_ML_reward,
                   self.get_ML_v,
                   self.update_ML_v,
                   self.ML_queue)
    
    def sweep_hypothesis(self, state):
        self.sweep(state,
                   self.hypothesis.get_transition,
                   self.hypothesis.get_reward,
                   self.hypothesis.get_v,
                   self.hypothesis.update_v,
                   self.hypothesis.queue)
    
    # perform a Bellman backup on that state
    def sweep(self, state, transition, reward, get_v, update_v, queue):
        actions = self.model.get_actions(state)
        V_new = self.compute_v_per_action(state, actions[0], transition,
                                          reward, get_v)
        for action in actions[1:]:
            V_new = max(V_new, self.compute_v_per_action(
                state, action, transition, reward, get_v))
        delta_change = abs(get_v(state) - V_new)
        # update the dictionary
        update_v(state, V_new)
        # now compute the priority queue for the predecessor
        for s0 in self.model.get_prev_states(state):
            capacity = self.compute_impact(state, s0, delta_change, transition)
            queue.push_or_update(-capacity, s0)

    # sweep the Bellman queue for ML estimate
    #def sweep_ML_queue(self):
        #for i in range(self.k - 1):
            #(priority, state) = self.ML_queue.pop()
            #self.sweep_ML(state)
    
    #def sweep_hypothesis_queue(self):
        #for i in range(self.k - 1):
            #(priority, state) = self.hypothesis.queue.pop()
            #self.sweep_hypothesis(state)
            
    def sweep_queue(self):
        for i in range(self.k - 1):
            (priority, state) = self.hypothesis.queue.pop()
            self.sweep_hypothesis(state)        
            (priority, state) = self.ML_queue.pop()
            self.sweep_ML(state)
    
    def draw_hypothesis(self):
        # use the optimistic mode: assume unseen (state, action) pairs have the maximum observed reward
        hypothesis = Hypothesis.draw_hypothesis(self.model, self.keepr, self.keepr.max_reward, self.std0)
        # initialize the hypothesis' v function with ML approximate
        hypothesis.V = dict(self.ML_V)
        return hypothesis
        
    # short cut to compute v per action for the hypothesis
    def compute_action_hypothesis(self, state, action):
        return self.compute_v_per_action(state, action,
                                         self.hypothesis.get_transition,
                                         self.get_ML_reward,
                                         self.hypothesis.get_v)
    
    def choose_action(self, state):
        # get the best action using the value-iteration formula
        # https://stellar.mit.edu/S/course/6/fa13/6.S078/courseMaterial/topics/topic1/lectureNotes/mdp_vi/mdp_vi.pdf
        actions = self.model.get_actions(state)
        m = self.compute_action_hypothesis(state, actions[0])
        best_action = [actions[0]]
        for action in actions[1:]:
            temp = self.compute_action_hypothesis(state, action)
            if abs(temp - m) < self.delta*m:
                best_action.append(action)
            elif temp > m:
                m = temp
                best_action = [action]
        return random.choice(best_action)
    
    def choose_random_choice(self, state):
        return random.choice(self.model.get_actions(state))
        
    def check_to_draw_hypothesis(self):
        # draw the hypothesis at every start state
        # if self.model.current_state == self.model.start_state:
            # draw a new hypothesis
            # self.hypothesis = self.draw_hypothesis()
        # draw a new hypothesis every sampling_interval steps
        if self.model.num_steps() % self.sampling_interval == 0:
            self.hypothesis = self.draw_hypothesis()
            # self.hypothesis.print_complete_reward()
            # self.hypothesis.print_complete_transition()
            
        
    def next(self, action=None):
        # check to draw a new hypothesis
        self.check_to_draw_hypothesis()
        if action is None:
            action = self.choose_action(self.model.current_state)
        current_state = self.model.current_state
        reward = self.model.perform(action)
        next_state = self.model.current_state
        # do book-keeping
        self.keepr.update_reward_and_transition(current_state, action, next_state, reward)
        # do sweeping
        self.sweep_ML(current_state)
        self.sweep_hypothesis(current_state)
        self.sweep_queue()
        if (self.model.current_state.id == 8):
            # print (action, reward, next_state)
            pass
        
        if (self.model.current_state.id == 1):
            # print (action, reward, next_state)
            pass
                
        # print self.ML_V
        # print (current_state, action, next_state, reward)
        # print self.hypothesis.V
        return (action, reward, next_state)
        
    def get_transition(self, s1, a, s2):
        raise Exception("this method is discontinued")
    
    def get_reward(self, s1, a, s2):
        raise Exception("this method is discontinued")
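
The comment headers above state the two quantities that drive each sweep: the per-action backup V(s, a) = sum over s' of P(s'|s,a) * (R(s,a,s') + discount_rate * V(s')), and the predecessor priority C(s, s*) = sum over a of P(s|s*,a) * delta(s). The short standalone check below recomputes both with hand-written tables; the numbers are invented purely for illustration and come from none of the examples.

# Standalone numeric check of the formulas used by compute_v_per_action()
# and compute_impact(); the tiny tables below are made up for illustration.
discount_rate = 0.9
P = {('s0', 'a', 's1'): 0.8, ('s0', 'a', 's2'): 0.2}   # P(s'|s,a)
R = {('s0', 'a', 's1'): 1.0, ('s0', 'a', 's2'): 0.0}   # R(s,a,s')
V = {'s1': 2.0, 's2': 0.5}                             # current value estimates

# V(s0, a) = sum over s' of P(s'|s0,a) * (R(s0,a,s') + discount_rate * V(s'))
backup = sum(P[('s0', 'a', s)] * (R[('s0', 'a', s)] + discount_rate * V[s])
             for s in ('s1', 's2'))
print(backup)   # 0.8*(1.0 + 1.8) + 0.2*(0.0 + 0.45) = 2.33

# C(s1, s0) = sum over a of P(s1|s0,a) * delta(s1); -C is pushed as the priority
delta = 0.7                              # absolute change in V(s1) after a backup
impact = P[('s0', 'a', 's1')] * delta    # only one action in this toy table
print(impact)   # 0.8 * 0.7 = 0.56
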
Example #10
class BayesPrioritizedSweeping(RLAlgorithm):
    def __init__(self,
                 model,
                 k=2,
                 discount_rate=0.9,
                 u0=1,
                 std0=0.2,
                 sampling_interval=20):
        self.model = model
        self.discount_rate = discount_rate
        self.keepr = Keeper()
        # priority queue for ML
        self.ML_queue = UniquePriorityQueue()
        # comparison constant
        self.delta = 0.001
        # number of back-ups per action
        self.k = k
        # default mean and std
        self.u0 = u0
        self.std0 = std0
        # draw initial hypothesis
        self.hypothesis = Hypothesis.draw_init_hypothesis(
            model, self.u0, self.std0)
        # maximum-likelihood V
        self.ML_V = {}
        # interval to draw samples
        self.sampling_interval = sampling_interval

    def get_ML_transition(self, s1, a, s2):
        return RLAlgorithm.get_transition(self, s1, a, s2)

    def get_ML_reward(self, s1, a, s2):
        return RLAlgorithm.get_reward(self, s1, a, s2)

    def get_ML_v(self, state):
        return self.ML_V.get(state, 0)

    def update_ML_v(self, state, value):
        self.ML_V[state] = value

    # compute impact C(s, s*) = sum over a P(s|s*,a)*delta(s)
    # s1: current state, s0: predecessor
    def compute_impact(self, s1, s0, delta, transition_func):
        s = 0
        for action in self.model.get_actions(s0):
            s += transition_func(s0, action, s1) * delta
        return s

    # V(s, a) = sum over s' P(s'|s,a)*(R(s,a,s') + V(s')*discount_rate)
    def compute_v_per_action(self, state, action, transition_func, reward_func,
                             v_func):
        s = 0
        for next_state in self.model.get_next_states(state):
            s += transition_func(state, action, next_state) * (
                reward_func(state, action, next_state) +
                v_func(next_state) * self.discount_rate)
        return s

    def sweep_ML(self, state):
        self.sweep(state, self.get_ML_transition, self.get_ML_reward,
                   self.get_ML_v, self.update_ML_v, self.ML_queue)

    def sweep_hypothesis(self, state):
        self.sweep(state, self.hypothesis.get_transition,
                   self.hypothesis.get_reward, self.hypothesis.get_v,
                   self.hypothesis.update_v, self.hypothesis.queue)

    # perform a Bellman backup on that state
    def sweep(self, state, transition, reward, get_v, update_v, queue):
        actions = self.model.get_actions(state)
        V_new = self.compute_v_per_action(state, actions[0], transition,
                                          reward, get_v)
        for action in actions[1:]:
            V_new = max(
                V_new,
                self.compute_v_per_action(state, action, transition, reward,
                                          get_v))
        delta_change = abs(get_v(state) - V_new)
        # update the dictionary
        update_v(state, V_new)
        # now compute the priority queue for the predecessor
        for s0 in self.model.get_prev_states(state):
            capacity = self.compute_impact(state, s0, delta_change, transition)
            queue.push_or_update(-capacity, s0)

    # sweep the Bellman queue for ML estimate
    #def sweep_ML_queue(self):
    #    for i in range(self.k - 1):
    #        (priority, state) = self.ML_queue.pop()
    #        self.sweep_ML(state)

    #def sweep_hypothesis_queue(self):
    #    for i in range(self.k - 1):
    #        (priority, state) = self.hypothesis.queue.pop()
    #        self.sweep_hypothesis(state)

    def sweep_queue(self):
        for i in range(self.k - 1):
            (priority, state) = self.hypothesis.queue.pop()
            self.sweep_hypothesis(state)
            (priority, state) = self.ML_queue.pop()
            self.sweep_ML(state)

    def draw_hypothesis(self):
        # use the optimistic mode: assume unseen (state, action) pairs have the maximum observed reward
        hypothesis = Hypothesis.draw_hypothesis(self.model, self.keepr,
                                                self.keepr.max_reward,
                                                self.std0)
        # initialize the hypothesis' v function with ML approximate
        hypothesis.V = dict(self.ML_V)
        return hypothesis

    # short cut to compute v per action for the hypothesis
    def compute_action_hypothesis(self, state, action):
        return self.compute_v_per_action(state, action,
                                         self.hypothesis.get_transition,
                                         self.get_ML_reward,
                                         self.hypothesis.get_v)

    def choose_action(self, state):
        # get the best action using the value-iteration formula
        # https://stellar.mit.edu/S/course/6/fa13/6.S078/courseMaterial/topics/topic1/lectureNotes/mdp_vi/mdp_vi.pdf
        actions = self.model.get_actions(state)
        m = self.compute_action_hypothesis(state, actions[0])
        best_action = [actions[0]]
        for action in actions[1:]:
            temp = self.compute_action_hypothesis(state, action)
            if abs(temp - m) < self.delta * m:
                best_action.append(action)
            elif temp > m:
                m = temp
                best_action = [action]
        return random.choice(best_action)

    def choose_random_choice(self, state):
        return random.choice(self.model.get_actions(state))

    def check_to_draw_hypothesis(self):
        # draw the hypothesis at every start state
        # if self.model.current_state == self.model.start_state:
        # draw a new hypothesis
        # self.hypothesis = self.draw_hypothesis()
        # draw a new hypothesis every sampling_interval steps
        if self.model.num_steps() % self.sampling_interval == 0:
            self.hypothesis = self.draw_hypothesis()
            # self.hypothesis.print_complete_reward()
            # self.hypothesis.print_complete_transition()

    def next(self, action=None):
        # check to draw a new hypothesis
        self.check_to_draw_hypothesis()
        if action is None:
            action = self.choose_action(self.model.current_state)
        current_state = self.model.current_state
        reward = self.model.perform(action)
        next_state = self.model.current_state
        # do book-keeping
        self.keepr.update_reward_and_transition(current_state, action,
                                                next_state, reward)
        # do sweeping
        self.sweep_ML(current_state)
        self.sweep_hypothesis(current_state)
        self.sweep_queue()
        if (self.model.current_state.id == 8):
            # print (action, reward, next_state)
            pass

        if (self.model.current_state.id == 1):
            # print (action, reward, next_state)
            pass

        # print self.ML_V
        # print (current_state, action, next_state, reward)
        # print self.hypothesis.V
        return (action, reward, next_state)

    def get_transition(self, s1, a, s2):
        raise Exception("this method is discontinued")

    def get_reward(self, s1, a, s2):
        raise Exception("this method is discontinued")
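
BayesPrioritizedSweeping also depends on a Hypothesis class that none of the examples include. The skeleton below is inferred purely from the calls made above; the actual sampling logic (how a candidate MDP is drawn from the Keeper statistics) is not shown anywhere here, so those methods are left as placeholders rather than guessed at.

# Interface skeleton for Hypothesis, reconstructed only from the calls that
# BayesPrioritizedSweeping makes above; the sampling bodies are placeholders.
class Hypothesis:
    def __init__(self):
        self.V = {}                          # value estimates under this sampled MDP
        self.queue = UniquePriorityQueue()   # per-hypothesis sweep queue

    @staticmethod
    def draw_init_hypothesis(model, u0, std0):
        # sample an initial MDP from the default (u0, std0) prior
        raise NotImplementedError

    @staticmethod
    def draw_hypothesis(model, keeper, mean_reward, std0):
        # sample an MDP from the statistics accumulated in the Keeper
        raise NotImplementedError

    def get_transition(self, s1, a, s2):
        # P(s2 | s1, a) under the sampled MDP
        raise NotImplementedError

    def get_reward(self, s1, a, s2):
        # R(s1, a, s2) under the sampled MDP
        raise NotImplementedError

    def get_v(self, state):
        return self.V.get(state, 0)

    def update_v(self, state, value):
        self.V[state] = value
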