import numpy as np


def value_iteration(mdp, gamma=0.9, epsilon=0.0001):
    """Solve an MDP with value iteration; return (optimal values, greedy policy).

    States are assumed to be integer indices 0..len(mdp.states)-1, so they can
    index the value array directly. `Policy` is assumed to be defined elsewhere
    in the module.
    """
    states = mdp.states
    actions = mdp.actions
    policy = Policy(states, actions)
    Vcurrent = np.zeros(len(states))

    def q_value(V, fromstate, action):
        """Expected one-step return of taking `action` in `fromstate` under values V."""
        return sum(
            mdp.get_probability(action, fromstate, tostate)
            * (mdp.get_reward(action, fromstate, tostate) + gamma * V[tostate])
            for tostate in mdp.get_neighbors(fromstate)
        )

    # Apply Bellman backups until the value function reaches a fixed point.
    fix_point = False
    while not fix_point:
        Vprevious = Vcurrent.copy()
        for fromstate in states:
            # Back up each state to the best achievable Q-value.
            Vcurrent[fromstate] = max(
                q_value(Vprevious, fromstate, action) for action in actions
            )
        # Converged once the largest per-state change drops below epsilon.
        fix_point = np.linalg.norm(Vcurrent - Vprevious, np.inf) < epsilon

    # Extract a greedy policy, splitting probability uniformly over ties.
    for fromstate in states:
        values = np.array([q_value(Vcurrent, fromstate, action) for action in actions])
        # Indices of all maximizing actions (there may be ties), mapped back to
        # the actions themselves rather than their positions in `actions`.
        best = np.argwhere(values == np.amax(values)).flatten().tolist()
        best_actions = [actions[i] for i in best]
        for action in actions:
            if action in best_actions:
                policy.set_probability(1.0 / len(best_actions), fromstate, action)
            else:
                policy.set_probability(0.0, fromstate, action)

    return Vcurrent, policy
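
# --- Usage sketch (illustrative, not from the original) ----------------------
# A minimal example of the interface value_iteration() assumes: an MDP object
# exposing `states`, `actions`, `get_neighbors`, `get_probability`, and
# `get_reward`, plus a `Policy` with `set_probability`. The two-state chain and
# this Policy implementation are hypothetical stand-ins; any objects with these
# methods would do.

class Policy:
    """Tabular stochastic policy: probs[state][action] = P(action | state)."""

    def __init__(self, states, actions):
        self.probs = {s: {a: 0.0 for a in actions} for s in states}

    def set_probability(self, p, state, action):
        self.probs[state][action] = p


class TwoStateMDP:
    """Toy MDP: states {0, 1}, actions {0: stay, 1: move to the other state}.

    Transitions are deterministic; reward is 1 for landing in state 1.
    """

    states = [0, 1]
    actions = [0, 1]

    def get_neighbors(self, fromstate):
        return self.states  # both states are reachable in this toy chain

    def get_probability(self, action, fromstate, tostate):
        target = fromstate if action == 0 else 1 - fromstate
        return 1.0 if tostate == target else 0.0

    def get_reward(self, action, fromstate, tostate):
        return 1.0 if tostate == 1 else 0.0


if __name__ == "__main__":
    V, pi = value_iteration(TwoStateMDP(), gamma=0.9, epsilon=1e-6)
    print(V)         # both states are worth ~10 with gamma=0.9
    print(pi.probs)  # greedy policy: move toward state 1, then stay there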