class BellmanDPSolver(object):
    def __init__(self, discountRate=1):
        self.MDP = MDP()
        self.gamma = discountRate
        self.initVs()

    def initVs(self):
        self.state_values = {pair: 0 for pair in self.MDP.S}

    def BellmanUpdate(self):
        prev_version = self.state_values.copy()
        for state in self.MDP.S:
            total_val = dict()
            for action in self.MDP.A:
                sub_total = 0
                for next_state, prob in self.MDP.probNextStates(state, action).items():
                    sub_total += prob * (self.MDP.getRewards(state, action, next_state)
                                         + self.gamma * prev_version.get(next_state))
                total_val[action] = sub_total
            self.state_values[state] = max(total_val.values())
        return self.state_values, self.compute_greedy_policy()

    def compute_greedy_policy(self):
        policy = dict()
        for state in self.MDP.S:
            q_sa = dict()
            for action in self.MDP.A:
                q_sa[action] = sum(prob * (self.MDP.getRewards(state, action, next_state)
                                           + self.gamma * self.state_values[next_state])
                                   for next_state, prob in self.MDP.probNextStates(state, action).items())
            policy[state] = [action for action in self.MDP.A if q_sa[action] == max(q_sa.values())]
        return policy
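# A minimal driver sketch for the implementation above (not part of the
# original submission). It assumes the MDP class from the accompanying
# environment file exposes S, A, probNextStates and getRewards as used in the
# code; the discount, iteration budget, and 1e-6 tolerance are arbitrary
# illustrative choices.
if __name__ == "__main__":
    solver = BellmanDPSolver(discountRate=0.9)
    for sweep in range(10000):
        old_values = dict(solver.state_values)
        values, policy = solver.BellmanUpdate()
        # Stop once a full sweep changes no state value by more than the tolerance.
        if max(abs(values[s] - old_values[s]) for s in values) < 1e-6:
            break
    print("Values:", values)
    print("Policy:", policy)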
class BellmanDPSolver(object):
    def __init__(self):
        self.MDP = MDP()
        self.initVs()

    def initVs(self):
        self.stateValueTable = {state: 0 for state in self.MDP.S}
        self.statePolicyTable = {state: self.MDP.A for state in self.MDP.S}

    def BellmanUpdate(self, discount_rate):
        for state in self.MDP.S:
            # Expected return of each action under the current value estimates.
            action_dict = {
                action: sum(
                    prob * (self.MDP.getRewards(state, action, nextState)
                            + discount_rate * self.stateValueTable[nextState])
                    for nextState, prob in self.MDP.probNextStates(state, action).items()
                )
                for action in self.MDP.A
            }
            self.stateValueTable[state] = max(action_dict.values())
            self.statePolicyTable[state] = [
                action for action, value in action_dict.items()
                if value == self.stateValueTable[state]
            ]
        return self.stateValueTable, self.statePolicyTable
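# Usage sketch for the variant above (not part of the original): this version
# does not store a discount rate, so it is supplied on every BellmanUpdate
# call. The 0.9 discount and the fixed number of sweeps are arbitrary choices.
solver = BellmanDPSolver()
for _ in range(100):
    values, policy = solver.BellmanUpdate(0.9)
print(values)
print(policy)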
class BellmanDPSolver(object):
    def __init__(self, discountRate):
        self.MDP = MDP()
        self.discountRate = discountRate
        self.initVs()

    def initVs(self):
        self.values = {}
        self.policy = {}
        for state in self.MDP.S:
            self.values[state] = 0

    def BellmanUpdate(self):
        for state in self.MDP.S:
            self.policy[state] = []
            values_all = []
            for action in self.MDP.A:
                s_r_sum = 0
                prob_next_states = self.MDP.probNextStates(state, action)
                for state_2 in prob_next_states.keys():
                    s_r_sum += prob_next_states[state_2] * (
                        self.MDP.getRewards(state, action, state_2)
                        + self.discountRate * self.values[state_2])
                values_all.append(s_r_sum)
            self.values[state] = max(values_all)
            # Greedy policy: keep every action whose value attains the maximum.
            for i in range(len(values_all)):
                if values_all[i] == self.values[state]:
                    self.policy[state].append(self.MDP.A[i])
        return (self.values, self.policy)
from copy import copy


class BellmanDPSolver(object):
    def __init__(self, discountRate=0.99):
        self.MDP = MDP()
        self._gamma = discountRate
        self.initVs()

    def initVs(self):
        self.v = dict()
        self.pi = dict()
        for s in self.MDP.S:
            self.v[s] = 0
            self.pi[s] = copy(self.MDP.A)

    def BellmanUpdate(self):
        for s in self.MDP.S:
            max_val = float("-inf")
            opt_act = []
            for a in self.MDP.A:
                probs = self.MDP.probNextStates(s, a)
                val = 0
                for s_prime, p in probs.items():
                    r = self.MDP.getRewards(s, a, s_prime)
                    val += p * (r + self._gamma * self.v[s_prime])
                if val > max_val:
                    max_val = val
                    opt_act = [a]
                elif val == max_val:
                    opt_act.append(a)
            self.v[s] = max_val
            self.pi[s] = opt_act
        return self.v, self.pi
class BellmanDPSolver(object):
    def __init__(self, discountRate):
        self.MDP = MDP()
        self.gamma = discountRate
        self.initVs()

    def initVs(self):
        self.values = {s: 0 for s in self.MDP.S}
        self.policy = {s: self.MDP.A for s in self.MDP.S}

    def BellmanUpdate(self):
        for s in self.MDP.S:
            best_v = -10**20
            best_a = []
            n_value = {a: 0 for a in self.MDP.A}
            for a in self.MDP.A:
                next_states = self.MDP.probNextStates(s, a)
                for s_, prob in next_states.items():
                    n_value[a] += prob * (self.MDP.getRewards(s, a, s_)
                                          + self.gamma * self.values[s_])
                if n_value[a] > best_v:
                    best_v = n_value[a]
            self.values[s] = best_v
            for a in self.MDP.A:
                if n_value[a] == best_v:
                    best_a += [a]
            self.policy[s] = best_a
        return self.values, self.policy
class BellmanDPSolver(object):
    def __init__(self, discount_rate):
        self.mdp = MDP()
        self.actions = self.mdp.A
        self.gamma = discount_rate
        self.policy = {}
        self.current_position = -1

    def initVs(self):
        # All 25 grid cells plus the absorbing GOAL and OUT states start at 0.
        self.values = {state: 0 for state in self.mdp.S}
        self.policy = {state: [] for state in self.mdp.S}

    def BellmanUpdate(self):
        for init_state in self.values:
            best_value = None
            for action in self.actions:
                temp = 0
                # Transition table p(s', r | s, a)
                next_states = self.mdp.probNextStates(init_state, action)
                for new_state, prob in next_states.items():
                    temp += prob * (self.mdp.getRewards(init_state, action, new_state)
                                    + self.gamma * self.values[new_state])
                if best_value is None or temp >= best_value:
                    best_value = temp
            self.values[init_state] = best_value

            # Greedily compute the new policy.
            policy_list = []
            for action in self.actions:
                temp = 0
                next_states = self.mdp.probNextStates(init_state, action)
                for new_state, prob in next_states.items():
                    temp += prob * (self.mdp.getRewards(init_state, action, new_state)
                                    + self.gamma * self.values[new_state])
                if temp == best_value:
                    policy_list.append(action)
            self.policy[init_state] = policy_list
        return self.values, self.policy
class BellmanDPSolver(object):
    def __init__(self, discountRate=0.9):
        self.MDP = MDP()
        self.discountRate = discountRate
        self.initVs()

    def initVs(self):
        self.Vs = dict()
        self.policy = dict()
        for state in self.MDP.S:
            self.Vs[state] = 0
            self.policy[state] = self.MDP.A

    def action_return(self, state, action):
        # For each next state:
        #   get its probability given the current state and action,
        #   get the reward for the (s, a, s') combination,
        #   and accumulate the probability-weighted discounted return.
        state_prob = self.MDP.probNextStates(state, action)
        expected_reward = 0
        for next_state in state_prob:
            prob = state_prob[next_state]
            reward = self.MDP.getRewards(state, action, next_state)
            expected_reward += prob * (reward + self.discountRate * self.Vs[next_state])
        return expected_reward

    def max_action_return(self, state):
        # Finds the actions with the highest expected return
        # and returns those actions together with that return.
        max_return = None
        best_actions = []
        for action in self.MDP.A:
            # Expected return for this action.
            a_return = self.action_return(state, action)
            if max_return is None or max_return < a_return:
                max_return = a_return
                best_actions = [action]
            elif max_return == a_return:
                best_actions.append(action)
        return best_actions, max_return

    def BellmanUpdate(self):
        for state in self.MDP.S:
            self.policy[state], self.Vs[state] = self.max_action_return(state)
        return self.Vs, self.policy
class BellmanDPSolver(object):
    def __init__(self, discountRate):
        self.MDP = MDP()
        self.states = self.MDP.S
        self.Actions = self.MDP.A
        self.state_values = self.initVs()
        self.policy = {}
        self.gamma = discountRate

    def initVs(self):
        state_values = {}
        for state in self.states:
            state_values[state] = 0.0
        return state_values

    def one_step_ahead(self, state):
        """Calculate the value of all actions in a given state.

        Args:
            state: the state to be considered.

        Returns:
            A dictionary with keys the actions that can be taken and as values
            the expected value of each action.
        """
        action_values = {}
        for action in self.Actions:
            transition_prob = self.MDP.probNextStates(state, action)
            total = 0.0
            for next_state, probability in transition_prob.items():
                reward = self.MDP.getRewards(state, action, next_state)
                total += probability * (reward + self.gamma * self.state_values[next_state])
            action_values[action] = total
        return action_values

    def BellmanUpdate(self):
        for state in self.states:
            Action_values = self.one_step_ahead(state)
            max_value = max(Action_values.values())
            self.state_values[state] = max_value
            actions = []
            for action in self.Actions:
                if Action_values[action] == max_value:
                    actions.append(action)
            self.policy[state] = actions
        return (self.state_values, self.policy)
import numpy as np


class BellmanDPSolver(object):
    def __init__(self, discountRate):
        self.MDP = MDP()
        self.states = self.MDP.S
        self.action = self.MDP.A
        self.discountRate = discountRate
        self.Values = {}
        self.Values['GOAL'] = 0
        self.Values['OUT'] = 0
        self.Policy = {}
        self.initVs()

    def initVs(self):
        for x in range(5):
            for y in range(5):
                self.Values[(x, y)] = 0
                self.Policy[(x, y)] = [
                    "DRIBBLE_UP", "DRIBBLE_DOWN", "DRIBBLE_LEFT",
                    "DRIBBLE_RIGHT", "SHOOT"
                ]

    def BellmanUpdate(self):
        values = {}
        for i in range(5):
            for j in range(5):
                values[(i, j)] = -np.inf
                for action in self.action:
                    tmp_values = 0.0
                    nextstateprob = self.MDP.probNextStates((i, j), action)
                    for nextstate in nextstateprob.keys():
                        reward = self.MDP.getRewards((i, j), action, nextstate)
                        prob = nextstateprob[nextstate]
                        tmp_values += prob * (reward + self.discountRate * self.Values[nextstate])
                    if values[(i, j)] < tmp_values:
                        values[(i, j)] = tmp_values
                        self.Policy[(i, j)] = [action]
                    elif values[(i, j)] == tmp_values:
                        self.Policy[(i, j)].append(action)
                self.Values[(i, j)] = values[(i, j)]
        return self.Values, self.Policy
class BellmanDPSolver(object):
    def __init__(self, discount=0.9, theta=1e-4):
        self.MDP = MDP()
        self.discount = discount
        self.theta = theta
        self.initval, self.policy = self.initVs()

    def initVs(self):
        initval = {}
        policy = {}
        for s in self.MDP.S:
            initval[s] = 0
            # Start with all actions allowed in every state.
            policy[s] = self.MDP.A
        return initval, policy

    def BellmanUpdate(self):
        for state in self.MDP.S:
            nextV = {}
            for action in self.MDP.A:
                nextStateProb = self.MDP.probNextStates(state, action)
                value = 0
                for nextsta in nextStateProb:
                    immr = self.MDP.getRewards(state, action, nextsta)
                    value += nextStateProb[nextsta] * (immr + self.discount * self.initval[nextsta])
                nextV[action] = value
            self.initval[state] = max(nextV.values())
            # Select the corresponding optimal actions and fill in the policy dict.
            self.policy[state] = [
                key for key, value in nextV.items()
                if value == max(nextV.values())
            ]
        return self.initval, self.policy
import numpy as np


class BellmanDPSolver(object):
    def __init__(self, discountRate):
        self.MDP = MDP()
        self.dr = discountRate
        self.S = [(x, y) for x in range(5) for y in range(5)]
        self.S.append("GOAL")
        self.S.append("OUT")
        self.A = ["DRIBBLE_UP", "DRIBBLE_DOWN", "DRIBBLE_LEFT", "DRIBBLE_RIGHT", "SHOOT"]
        self.oppositions = [(2, 2), (4, 2)]
        self.goalProbs = [[0.00, 0.00, 0.0, 0.00, 0.00],
                          [0.0, 0.0, 0.0, 0.0, 0.0],
                          [0.0, 0.0, 0.0, 0.0, 0.0],
                          [0.0, 0.3, 0.5, 0.3, 0.0],
                          [0.0, 0.8, 0.9, 0.8, 0.0]]
        self.vs = {}
        self.act = {}

    def initVs(self):
        for s in self.S:
            self.vs[s] = 0.0

    def BellmanUpdate(self):
        for s in self.S:
            value_max = []
            action_max = []
            for a in self.A:
                nextState = self.MDP.probNextStates(s, a)
                value_a = 0.0
                for s_next, prob in nextState.items():
                    reward = self.MDP.getRewards(s, a, s_next)
                    value_next = self.vs[s_next]
                    value_a += prob * (reward + self.dr * value_next)
                value_max.append(value_a)
            self.vs[s] = np.max(value_max)
            for i in range(len(self.A)):
                if value_max[i] == np.max(value_max):
                    action_max.append(self.A[i])
            self.act[s] = action_max
        return self.vs, self.act
from math import inf


class BellmanDPSolver(object):
    def __init__(self, discountRate=1.0):
        self.MDP = MDP()
        self.discountRate = discountRate
        self.V = None
        self.Policy = None

    def initVs(self):
        # Initialize state values to zero.
        self.V = {s: 0 for s in self.MDP.S}
        # Initialize to a uniform policy (so all actions are 'optimal').
        self.Policy = {s: self.MDP.A for s in self.MDP.S}

    def BellmanUpdate(self):
        # Make updates in a new value function (i.e. not in place).
        V_new = {s: 0 for s in self.MDP.S}
        for s in self.V.keys():  # loop through all the states
            max_value = -inf
            for a in self.MDP.A:  # loop through all actions
                estimated_value = 0
                next_states = self.MDP.probNextStates(s, a)  # next state probabilities
                for next_state in next_states:
                    probability = next_states[next_state]
                    reward = self.MDP.getRewards(s, a, next_state)  # next state reward
                    estimated_value += probability * (
                        reward + self.discountRate * self.V[next_state])
                if estimated_value > max_value:
                    # We've found a new best action.
                    self.Policy[s] = [a]  # update policy
                    max_value = estimated_value
                elif estimated_value == max_value:
                    # We've found another optimal action with the same value.
                    self.Policy[s].append(a)  # update policy
            V_new[s] = max_value  # update value function estimate
        self.V = V_new
        return (self.V, self.Policy)

    def pretty_print(self):
        """Pretty print in a grid with (0,0) as the top left corner."""
        states = [(x, y) for x in range(5) for y in range(5)]
        print("\nState Values")
        for counter, (y, x) in enumerate(states):
            print("{:+.4f} ".format(self.V[(x, y)]), end='')
            if (counter + 1) % 5 == 0 and counter != 0:
                print("")
        print("\n State Policies")
        for counter, (y, x) in enumerate(states):
            print("{:25} ".format(', '.join(self.Policy[(x, y)])), end='')
            if (counter + 1) % 5 == 0 and counter != 0:
                print("")
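# Usage sketch for the implementation above (not part of the original). Unlike
# most of the other variants, it performs a synchronous sweep (writing into
# V_new instead of updating self.V in place) and does not call initVs from the
# constructor, so initVs must be called explicitly first. The discount and the
# number of sweeps below are arbitrary illustrative choices.
if __name__ == "__main__":
    solver = BellmanDPSolver(discountRate=0.9)
    solver.initVs()                # must run before the first BellmanUpdate()
    for _ in range(100):
        V, policy = solver.BellmanUpdate()
    solver.pretty_print()          # grid view of the values and greedy policy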