Example 1
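The function below implements value iteration for a finite MDP: it repeatedly applies the Bellman optimality backup to a tabular value function until the largest per-state change drops below epsilon, and then extracts a greedy policy that spreads probability uniformly over the tied best actions.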
from copy import deepcopy

import numpy as np


def value_iteration(mdp, gamma=0.9, epsilon=0.0001):
    states = mdp.states
    actions = mdp.actions

    # Policy and the mdp interface used below (states, actions, get_neighbors,
    # get_probability, get_reward) are assumed to be defined elsewhere in the
    # accompanying code.
    policy = Policy(states, actions)
    Vcurrent = np.zeros(len(states))
    Vprevious = None
    fix_point = False
    # Iterate Bellman backups until the value function stops changing.
    while not fix_point:
        Vprevious = deepcopy(Vcurrent)
        for fromstate in states:
            values = []
            for action in actions:
                # Expected return of taking `action` in `fromstate`:
                # sum over successors of p * (r + gamma * V_previous).
                value = 0.
                for tostate in mdp.get_neighbors(fromstate):
                    p = mdp.get_probability(action, fromstate, tostate)
                    r = mdp.get_reward(action, fromstate, tostate)
                    v = Vprevious[tostate]
                    value += p * (r + gamma * v)
                values.append(value)
            # Bellman optimality backup: keep the value of the best action.
            Vcurrent[fromstate] = max(values)
        # Converged when the largest per-state change is below epsilon.
        fix_point = np.linalg.norm(Vcurrent - Vprevious, np.inf) < epsilon

    # After convergence, extract a greedy policy: spread probability
    # uniformly over the actions whose expected value is maximal.
    for fromstate in states:
        values = []
        for action in actions:
            value = 0.
            for tostate in mdp.get_neighbors(fromstate):
                p = mdp.get_probability(action, fromstate, tostate)
                r = mdp.get_reward(action, fromstate, tostate)
                v = Vcurrent[tostate]
                value += p * (r + gamma * v)
            values.append(value)
        best = np.argwhere(np.array(values) == np.amax(values)).flatten()
        acts = [actions[i] for i in best]
        for a in acts:
            policy.set_probability(1. / len(acts), fromstate, a)
        for a in [ac for ac in actions if ac not in acts]:
            policy.set_probability(0., fromstate, a)
    return Vcurrent, policy
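As a quick check, here is a minimal usage sketch. The Policy and TwoStateMDP classes below are hypothetical stand-ins, written only to reproduce the interface the function calls (states, actions, get_neighbors, get_probability, get_reward, set_probability); they are not the original implementation.

import numpy as np


class Policy:
    """Tabular stochastic policy, stored as probs[state][action]."""
    def __init__(self, states, actions):
        self.probs = {s: {a: 1. / len(actions) for a in actions} for s in states}

    def set_probability(self, p, state, action):
        self.probs[state][action] = p


class TwoStateMDP:
    """Toy deterministic MDP: action 1 always moves to state 1, which pays reward 1."""
    states = [0, 1]
    actions = [0, 1]

    def get_neighbors(self, fromstate):
        return self.states

    def get_probability(self, action, fromstate, tostate):
        target = 1 if action == 1 else 0
        return 1. if tostate == target else 0.

    def get_reward(self, action, fromstate, tostate):
        return 1. if tostate == 1 else 0.


V, policy = value_iteration(TwoStateMDP(), gamma=0.9, epsilon=1e-4)
print(V)             # both entries approach 1 / (1 - 0.9) = 10
print(policy.probs)  # the greedy policy puts all probability on action 1

With gamma = 0.9, staying in state 1 forever earns the geometric series 1 + 0.9 + 0.81 + ... = 10, which is what both printed values converge to (up to the stopping tolerance).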