class BellmanDPSolver(object):
    def __init__(self, discountRate=1):
        self.MDP = MDP()
        self.gamma = discountRate
        self.initVs()

    def initVs(self):
        self.state_values = {pair: 0 for pair in self.MDP.S}

    def BellmanUpdate(self):
        # Synchronous sweep: back up every state from a frozen copy of the current values.
        prev_version = self.state_values.copy()
        for state in self.MDP.S:
            total_val = dict()
            for action in self.MDP.A:
                sub_total = 0
                for next_state, prob in self.MDP.probNextStates(state, action).items():
                    sub_total += prob * (self.MDP.getRewards(state, action, next_state)
                                         + self.gamma * prev_version[next_state])
                total_val[action] = sub_total
            self.state_values[state] = max(total_val.values())
        return self.state_values, self.compute_greedy_policy()

    def compute_greedy_policy(self):
        policy = dict()
        for state in self.MDP.S:
            q_sa = dict()
            for action in self.MDP.A:
                q_sa[action] = sum(prob*(self.MDP.getRewards(state, action, next_state) +
                                         self.gamma * self.state_values[next_state])
                                   for next_state, prob in self.MDP.probNextStates(state, action).items())
            policy[state] = [action for action in self.MDP.A if q_sa[action] == max(q_sa.values())]
        return policy
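A minimal driver sketch for the solver above, assuming the shared MDP class these examples rely on (exposing S, A, probNextStates and getRewards); the sweep cap and tolerance are arbitrary illustrative choices. Repeated BellmanUpdate sweeps implement value iteration, stopping once the state values settle.

# Usage sketch (assumes the MDP class used throughout these examples is importable).
solver = BellmanDPSolver(discountRate=0.9)
for sweep in range(100):                                  # arbitrary cap on the number of sweeps
    old_values = dict(solver.state_values)                # snapshot before the sweep
    values, policy = solver.BellmanUpdate()
    if max(abs(values[s] - old_values[s]) for s in values) < 1e-6:   # illustrative tolerance
        break
print(values)
print(policy)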
class BellmanDPSolver(object):
    def __init__(self):
        self.MDP = MDP()
        self.initVs()

    def initVs(self):
        self.stateValueTable = {state: 0 for state in self.MDP.S}
        self.statePolicyTable = {state: self.MDP.A for state in self.MDP.S}

    def BellmanUpdate(self, discount_rate):
        for state in self.MDP.S:
            action_dict = {
                action: sum([
                    prob * (self.MDP.getRewards(state, action, nextState) +
                            discount_rate * self.stateValueTable[nextState])
                    for nextState, prob in self.MDP.probNextStates(
                        state, action).items()
                ])
                for action in self.MDP.A
            }

            self.stateValueTable[state] = max(action_dict.values())
            self.statePolicyTable[state] = [
                action for action, value in action_dict.items()
                if value == self.stateValueTable[state]
            ]
        return self.stateValueTable, self.statePolicyTable
Example #3
class BellmanDPSolver(object):
	def __init__(self, discountRate):
		self.MDP = MDP()
		self.discountRate = discountRate
		self.initVs()

	def initVs(self):
		self.values = {}
		self.policy = {}
		for state in self.MDP.S:
			self.values[state] = 0

	def BellmanUpdate(self):
		for state in self.MDP.S:
			self.policy[state] = []
			values_all = []
			for action in self.MDP.A:
				s_r_sum = 0
				prob_next_states = self.MDP.probNextStates(state,action)
				for state_2 in prob_next_states.keys():
					s_r_sum = s_r_sum + prob_next_states[state_2] * (self.MDP.getRewards(state,action,state_2)+self.discountRate*self.values[state_2])
				values_all.append(s_r_sum)
				
			self.values[state] = max(values_all)
			for i in range(len(values_all)):
				if values_all[i] == self.values[state]:
					self.policy[state].append(self.MDP.A[i])

		return (self.values,self.policy)

Example #4
class BellmanDPSolver(object):
    def __init__(self, discountRate=0.99):
        self.MDP = MDP()
        self._gamma = discountRate
        self.initVs()

    def initVs(self):
        self.v = dict()
        self.pi = dict()
        for s in self.MDP.S:
            self.v[s] = 0
            self.pi[s] = list(self.MDP.A)

    def BellmanUpdate(self):
        for s in self.MDP.S:
            max_val = float("-inf")
            opt_act = []
            for a in self.MDP.A:
                probs = self.MDP.probNextStates(s, a)
                val = 0
                for s_prime, p in probs.items():
                    r = self.MDP.getRewards(s, a, s_prime)
                    val += p * (r + self._gamma * self.v[s_prime])
                if val > max_val:
                    max_val = val
                    opt_act = [a]
                elif val == max_val:
                    opt_act.append(a)
            self.v[s] = max_val
            self.pi[s] = opt_act
        return self.v, self.pi
Example #5
class BellmanDPSolver(object):
	def __init__(self,discountRate):
		self.MDP = MDP()
		self.gamma = discountRate
		self.initVs()
	def initVs(self):
		self.values = {s: 0 for s in self.MDP.S}
		self.policy = {s: self.MDP.A for s in self.MDP.S}
	
	def BellmanUpdate(self):
		for s in self.MDP.S:
			best_v = float("-inf")
			best_a = []
			n_value = {a: 0 for a in self.MDP.A}
			for a in self.MDP.A:
				next_probs = self.MDP.probNextStates(s, a)
				for s_, prob in next_probs.items():
					n_value[a] += prob * (self.MDP.getRewards(s, a, s_) + self.gamma * self.values[s_])
				if n_value[a] > best_v:
					best_v = n_value[a]
			self.values[s] = best_v
			for a in self.MDP.A:
				if n_value[a] == best_v:
					best_a += [a]
			self.policy[s] = best_a
		return self.values, self.policy
class BellmanDPSolver(object):
	def __init__(self, discount_rate):
		self.mdp = MDP()
		self.actions = self.mdp.A
		self.gamma = discount_rate
		self.values = {}
		self.policy = {}

	def initVs(self):
		# one entry per grid cell plus the absorbing 'GOAL' and 'OUT' states
		self.values = {state: 0 for state in self.mdp.S}
		self.policy = {state: [] for state in self.mdp.S}

	
	def BellmanUpdate(self):
		for init_state in self.values:
			# Q-value of every action under the current value estimates
			action_values = {}
			for action in self.actions:
				temp = 0
				# Transition probabilities p(s' | s, a)
				next_states = self.mdp.probNextStates(init_state, action)
				for new_state, prob in next_states.items():
					temp += prob * (self.mdp.getRewards(init_state, action, new_state) + self.gamma * self.values[new_state])
				action_values[action] = temp

			best_value = max(action_values.values())
			self.values[init_state] = best_value

			# Greedily keep every action whose Q-value attains the maximum
			self.policy[init_state] = [action for action, value in action_values.items() if value == best_value]

		return self.values, self.policy
Example #7
class BellmanDPSolver(object):
	def __init__(self, discountRate=0.9):
		self.MDP = MDP()
		self.discountRate = discountRate
		self.initVs()

	def initVs(self):		
		self.Vs = dict()
		self.policy = dict()
		for state in self.MDP.S:
			self.Vs[state] = 0
			self.policy[state] = self.MDP.A


	def action_return(self, state, action):		
		# for each next state:
		# get the state probability given current state and action
		# get the reward for the s, r, s' combination
		# sum the s, r, s' rewards by weighting them by their probability

		state_prob = self.MDP.probNextStates(state, action)

		expected_reward = 0
		for next_state in state_prob:
			prob = state_prob[next_state]
			reward = self.MDP.getRewards(state, action, next_state)
			expected_reward += prob * (reward + self.discountRate * self.Vs[next_state])

		return expected_reward


	def max_action_return(self, state):
		# finds the actions with the highest expected return
		# and returns those actions together with that return

		max_return = None
		best_actions = []
		for action in self.MDP.A:
			# get expected return for the action
			a_return = self.action_return(state, action)
			if max_return is None or max_return < a_return:
				max_return = a_return
				best_actions = [action]
			elif max_return == a_return:
				best_actions.append(action)

		return best_actions, max_return


	def BellmanUpdate(self):		
		for state in self.MDP.S:
			self.policy[state], self.Vs[state] = self.max_action_return(state)

		return self.Vs, self.policy
class BellmanDPSolver(object):
    def __init__(self, discountRate):
        self.MDP = MDP()
        self.states = self.MDP.S
        self.Actions = self.MDP.A
        self.state_values = self.initVs()
        self.policy = {}
        self.gamma = discountRate

    def initVs(self):
        state_values = {}
        for state in self.states:
            state_values[state] = 0.0
        return state_values

    def one_step_ahead(self, state):
        """
		Function that calculates the value for all actions in a given state
		Args: state to be considered
		Returns: A dictionary with keys the actions that can be taken and as values the expected value of each action 
		"""
        action_values = {}
        for action in self.Actions:
            transition_prob = self.MDP.probNextStates(state, action)
            total = 0.0
            for next_state, probability in transition_prob.items():
                reward = self.MDP.getRewards(state, action, next_state)
                total += probability * (
                    reward + self.gamma * self.state_values[next_state])
            action_values[action] = total
        return action_values

    def BellmanUpdate(self):
        for state in self.states:
            Action_values = self.one_step_ahead(state)
            max_value = max(Action_values.values())
            self.state_values[state] = max_value
            actions = []
            for action in self.Actions:
                if Action_values[action] == max_value:
                    actions.append(action)
            self.policy[state] = actions
        return (self.state_values, self.policy)
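A short usage sketch for the one_step_ahead helper above; the state (2, 3) and the discount of 0.9 are arbitrary illustrative choices, and the MDP class is assumed to be the same one the other examples use.

# Inspect the one-step action values of a single state, then run one sweep
# (the chosen state and discount are arbitrary).
solver = BellmanDPSolver(discountRate=0.9)
print(solver.one_step_ahead((2, 3)))        # dict mapping each action to its backed-up value
values, policy = solver.BellmanUpdate()
print(policy[(2, 3)])                       # greedy action(s) for that state after one sweep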
class BellmanDPSolver(object):
    def __init__(self, discountRate):
        self.MDP = MDP()
        self.states = self.MDP.S
        self.action = self.MDP.A
        self.discountRate = discountRate
        self.Values = {}
        self.Values['GOAL'] = 0
        self.Values['OUT'] = 0
        self.Policy = {}
        self.initVs()

    def initVs(self):
        for x in range(5):
            for y in range(5):
                self.Values[(x, y)] = 0
                self.Policy[(x, y)] = [
                    "DRIBBLE_UP", "DRIBBLE_DOWN", "DRIBBLE_LEFT",
                    "DRIBBLE_RIGHT", "SHOOT"
                ]

    def BellmanUpdate(self):
        values = {}
        for i in range(5):
            for j in range(5):
                values[(i, j)] = -np.inf
                for action in self.action:
                    tmp_values = 0.0
                    nextstateprob = self.MDP.probNextStates((i, j), action)
                    for nextstate in nextstateprob.keys():
                        reward = self.MDP.getRewards((i, j), action, nextstate)
                        prob = nextstateprob[nextstate]
                        tmp_values = tmp_values + (
                            prob *
                            (reward +
                             self.discountRate * self.Values[nextstate]))
                    if values[(i, j)] < tmp_values:
                        values[(i, j)] = tmp_values
                        self.Policy[(i, j)] = [action]
                    elif values[(i, j)] == tmp_values:
                        self.Policy[(i, j)].append(action)
                self.Values[(i, j)] = values[(i, j)]
        return self.Values, self.Policy
Example #10
class BellmanDPSolver(object):
    def __init__(self, discount=0.9, theta=1e-4):
        self.MDP = MDP()
        self.discount = discount
        self.theta = theta
        self.initval, self.policy = self.initVs()

    def initVs(self):
        initval = {}
        policy = {}
        L1 = self.MDP.S
        for i in L1:
            initval[i] = 0
            # initially, every action is considered optimal
            policy[i] = self.MDP.A

        return initval, policy

    def BellmanUpdate(self):
        for state in self.MDP.S:
            nextV = {}
            for action in self.MDP.A:
                nextStateProb = self.MDP.probNextStates(state, action)

                value = 0
                for nextsta in nextStateProb:
                    immr = self.MDP.getRewards(state, action, nextsta)

                    value += nextStateProb[nextsta] * (
                        immr + self.discount * self.initval[nextsta])

                nextV[action] = value

            best = max(nextV.values())
            self.initval[state] = best
            # keep every action whose backup equals the maximum and store it in the policy dict
            self.policy[state] = [
                key for key, value in nextV.items() if value == best
            ]

        return self.initval, self.policy
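The theta attribute stored by this solver is never consulted inside BellmanUpdate; a minimal outer loop that uses it as a stopping tolerance could look like the following sketch (the sweep cap is an arbitrary safety limit).

# Sketch of an outer value-iteration loop driven by self.theta.
solver = BellmanDPSolver(discount=0.9, theta=1e-4)
for sweep in range(1000):                    # arbitrary safety cap
    previous = dict(solver.initval)          # copy, since BellmanUpdate mutates initval in place
    values, policy = solver.BellmanUpdate()
    if max(abs(values[s] - previous[s]) for s in values) < solver.theta:
        break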
class BellmanDPSolver(object):
	def __init__(self, discountRate):
		self.MDP = MDP()
		self.dr = discountRate
		self.S = [(x,y) for x in range(5) for y in range(5)]
		self.S.append("GOAL")
		self.S.append("OUT")
		self.A = ["DRIBBLE_UP","DRIBBLE_DOWN","DRIBBLE_LEFT","DRIBBLE_RIGHT","SHOOT"]
		self.oppositions = [(2,2), (4,2)]
		self.goalProbs = [[0.00,0.00,0.0,0.00,0.00],[0.0, 0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0],[0.0,0.3,0.5,0.3,0.0],[0.0,0.8,0.9,0.8,0.0]]
		self.vs ={}
		self.act = {}


	def initVs(self):
		for s in self.S:
			self.vs[s] = 0.0

	def BellmanUpdate(self):
		for s in self.S:
			value_max = []
			action_max = []
			for a in self.A:
				nextState = self.MDP.probNextStates(s, a)
				value_a = 0.0
				for s_next, prob in nextState.items():
					reward = self.MDP.getRewards(s, a, s_next)
					value_next = self.vs[s_next]
					value_a += prob * (reward + self.dr * value_next)
				value_max.append(value_a)
			best = max(value_max)
			self.vs[s] = best
			for i in range(len(self.A)):
				if value_max[i] == best:
					action_max.append(self.A[i])
			self.act[s] = action_max
		return self.vs, self.act
class BellmanDPSolver(object):
    def __init__(self, discountRate=1.0):
        self.MDP = MDP()
        self.discountRate = discountRate

        self.V = None
        self.Policy = None

    def initVs(self):

        # initialize state values to zero
        self.V = {s: 0 for s in self.MDP.S}

        # initialize to uniform policy (so all actions are 'optimal')
        self.Policy = {s: list(self.MDP.A) for s in self.MDP.S}

    def BellmanUpdate(self):

        V_new = {s: 0
                 for s in self.MDP.S
                 }  # make updates in a new value function (i.e. not in place)

        for s in self.V.keys():  # loop through all the states

            max_value = float("-inf")

            for a in self.MDP.A:  # loop through all actions

                estimated_value = 0
                next_states = self.MDP.probNextStates(
                    s, a)  # get next state probabilities

                for next_state in next_states:
                    probability = next_states[next_state]
                    reward = self.MDP.getRewards(
                        s, a, next_state)  # next state reward
                    estimated_value += probability * (
                        reward + self.discountRate * self.V[next_state])

                if estimated_value > max_value:
                    # we've found a new best optimal action
                    self.Policy[s] = [a]  # update policy
                    max_value = estimated_value
                elif estimated_value == max_value:
                    # we've found another optimal action with same value
                    self.Policy[s].append(a)  # update policy

            V_new[s] = max_value  # update value function estimate

        self.V = V_new
        return (self.V, self.Policy)

    def pretty_print(self):

        'Pretty print in grid with (0,0) as the top left corner'

        states = [(x, y) for x in range(5) for y in range(5)]
        print("\nState Values")
        for counter, (y, x) in enumerate(states):
            print("{:+.4f}  ".format(self.V[(x, y)]), end='')

            if ((counter + 1) % 5 == 0 and counter != 0):
                print("")
        print("\n State Policies")
        for counter, (y, x) in enumerate(states):
            print("{:25} ".format(', '.join(self.Policy[(x, y)])), end='')
            if ((counter + 1) % 5 == 0 and counter != 0):
                print("")