def JN(domain: Domain, policy: Policy.Policy, N: int):
    """Return the discounted sum of rewards collected over ``N`` turns.

    Starting from ``domain.state``, each turn asks ``policy`` for an action,
    collects ``domain.reward(state, action)`` and then applies the action with
    ``domain.moves(action)``.  The result is
    ``R_0 + gamma * (R_1 + gamma * (R_2 + ...))`` with ``gamma = domain.gamma``.

    This is a single rollout: it only equals the expected return when the
    domain and the policy are deterministic.

    Args:
        domain: Environment exposing ``state``, ``gamma``, ``reward(state,
            action)`` and ``moves(action)``.  ``moves`` is called once per
            turn, so the domain is left in whatever state those calls produce.
        policy: Object exposing ``action(state)``.
        N: Number of turns to play; must be a non-negative whole number.

    Returns:
        ``0`` when ``N`` is zero, otherwise the discounted reward sum.

    Raises:
        ValueError: If ``N`` is negative or not a whole number.
    """
    steps = int(N)
    if steps != N or steps < 0:
        raise ValueError("N must be a non-negative integer, got {!r}".format(N))

    gamma = domain.gamma
    rewards = []
    for _ in range(steps):
        # Query the policy exactly once per turn so that the action that is
        # rewarded is the same action that is played, even when the policy is
        # stochastic or stateful.
        action = policy.action(domain.state)
        rewards.append(domain.reward(domain.state, action))
        domain.moves(action)

    # Fold from the last reward backwards; this reproduces the recursive
    # definition R + gamma * J(N - 1) term for term without using the call
    # stack, so long horizons cannot hit the recursion limit.
    total = 0
    for reward in reversed(rewards):
        total = reward + gamma * total
    return total
def MatrixJN(domain: Domain, policy: Policy.Policy, N):
    """Return the per-state value matrices of ``policy`` for growing horizons.

    The first matrix is all zeros.  Each following matrix is computed from the
    previous one, for every grid cell ``(i, j)`` with action
    ``a = policy.action([i, j])``::

        J[j][i] = reward([i, j], a)
                  + gamma * (1 - beta) * J_prev[clamp(j + a[1])][clamp(i + a[0])]
                  + gamma * beta       * J_prev[0][0]

    where the target indices are clamped to the grid and ``gamma`` / ``beta``
    are read from ``domain``.  Matrices are indexed ``[row j][column i]`` and
    have shape ``(domain.m, domain.n)``.

    Args:
        domain: Environment exposing ``n``, ``m``, ``gamma``, ``beta`` and
            ``reward(state, action)``.
        policy: Object exposing ``action(state)`` returning a pair of offsets
            ``(di, dj)``.
        N: Number of matrices to produce.

    Returns:
        A list of ``numpy`` arrays ``[J_0, J_1, ..., J_{N-1}]``; for ``N <= 1``
        the list holds only the zero matrix ``J_0``.
    """
    # NOTE(review): the loop runs over range(1, N), so the matrix for horizon
    # N itself is not part of the result -- confirm this is the intended
    # convention with the callers.
    n_cols, n_rows = domain.n, domain.m
    gamma, beta = domain.gamma, domain.beta
    move_weight = gamma * (1 - beta)
    reset_weight = gamma * beta

    matrices = [np.zeros((n_rows, n_cols))]
    for _ in range(1, N):
        previous = matrices[-1]
        current = np.zeros((n_rows, n_cols))
        for i in range(n_cols):
            for j in range(n_rows):
                # Ask the policy once per cell: reward and both move offsets
                # must come from the same action, which matters for stochastic
                # policies and avoids two redundant calls otherwise.
                action = policy.action([i, j])
                next_row = min(max(j + action[1], 0), n_rows - 1)
                next_col = min(max(i + action[0], 0), n_cols - 1)

                value = domain.reward([i, j], action)
                value += move_weight * previous[next_row][next_col]
                value += reset_weight * previous[0][0]
                current[j][i] = value
        matrices.append(current)
    return matrices