def greedyQPolicy(Qs):
    """ Build the greedy policy implied by a table of Q-values.

    Qs is indexed first by state and then by action; the number of actions
    is taken from the first row. The result is an array of shape
    (len(Qs), len(Qs[0])) in which, for every state, the actions returned
    by all_argmax for that state's row share the probability mass equally
    and all other entries stay at zero. """
    numStates = len(Qs)
    numActions = len(Qs[0])
    result = zeros((numStates, numActions))
    for state, qRow in enumerate(Qs):
        best = all_argmax(qRow)
        # The division stays inside the loop so that an empty `best`
        # simply leaves the row at zero instead of raising.
        for action in best:
            result[state, action] = 1. / len(best)
    return result
# NOTE(review): greedyPolicy is defined a second time further down with the
# same logic; the later definition rebinds the name. Consider keeping one.
def greedyPolicy(Ts, R, discountFactor, V):
    """ Derive the greedy policy (ties are split evenly) from a value
    function and a full transition model.

    Ts is a sequence with one transition matrix per action, each indexed as
    T[state, :]; R and V are combined as V * discountFactor + R to score
    successor states. Returns a pair: the policy array of shape
    (len(V), len(Ts)) and the result of collapsedTransitions(Ts, policy). """
    numStates = len(V)
    numActions = len(Ts)
    successorValue = V * discountFactor + R
    result = zeros((numStates, numActions))
    for state in range(numStates):
        # One score per action: that action's transition row for this
        # state dotted with the successor values.
        scores = [dot(T[state, :], successorValue) for T in Ts]
        best = all_argmax(scores)
        for action in best:
            result[state, action] = 1. / len(best)
    transitions = collapsedTransitions(Ts, result)
    return result, transitions
# NOTE(review): an earlier definition of greedyPolicy with the same logic
# precedes this one; this later definition is the one that stays bound to
# the name. Consider removing the duplicate.
def greedyPolicy(Ts, R, discountFactor, V):
    """ Derive the greedy policy (ties are split evenly) from a value
    function and a full transition model.

    Ts is a sequence with one transition matrix per action, each indexed as
    T[state, :]; R and V are combined as V*discountFactor+R to score
    successor states. Returns a pair: the policy array of shape
    (len(V), len(Ts)) and the result of collapsedTransitions(Ts, policy). """
    numStates = len(V)
    numActions = len(Ts)
    successorValue = V * discountFactor + R
    result = zeros((numStates, numActions))
    for state in range(numStates):
        # One score per action: that action's transition row for this
        # state dotted with the successor values.
        scores = [dot(T[state, :], successorValue) for T in Ts]
        best = all_argmax(scores)
        for action in best:
            result[state, action] = 1. / len(best)
    transitions = collapsedTransitions(Ts, result)
    return result, transitions