Example 1
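This and the following examples are excerpts from larger scripts, so their imports are not shown. As a rough assumption, most of them rely on something along these lines (the exact set varies per example, and a few appear to use a modified or extended mdptoolbox rather than the stock pymdptoolbox):

import collections
import pickle
import time
from timeit import default_timer

import numpy as np
import pandas as pd
import mdptoolbox.mdp as mdp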
def fit_value(st, rm, gamma, num_states):
    iterations = list(range(1, 1000, 10))
    data_value = {}
    data_value['convergence'] = {}
    for n_iter in iterations:
        print('Current Iteration: {}'.format(n_iter))
        data_value[str(n_iter)] = {}

        tot_time_start = time.time()
        vi = mdp.ValueIteration(st,
                                rm,
                                gamma,
                                max_iter=10000000,
                                epsilon=0.0001)
        # vi.setVerbose()
        # Note: the stock pymdptoolbox ValueIteration.run() takes no arguments
        # and returns None, so this snippet assumes a customised toolbox whose
        # run() accepts max_iter and returns per-iteration statistics.
        time_iter, iter_value, variation, policies = vi.run(max_iter=n_iter)
        tot_time_end = time.time()
        tot_time = tot_time_end - tot_time_start

        if iter_value > n_iter:
            raise ValueError(
                'ValueIteration is not stopping at maximum iterations')

        data_value[str(n_iter)]['tot_time'] = tot_time
        data_value[str(n_iter)]['time_iter'] = time_iter
        data_value[str(n_iter)]['value_iter'] = iter_value
        data_value[str(n_iter)]['variation'] = variation

    print('Convergence')
    tot_time_start = time.time()
    vi = mdp.ValueIteration(st, rm, gamma, max_iter=10000, epsilon=0.0001)
    time_iter, iter_value, variation, policies = vi.run(max_iter=10000)
    tot_time_end = time.time()

    optimal_policy = vi.policy
    expected_values = vi.V
    policies = [tuple(int(x) for x in opt_policy) for opt_policy in policies]
    optimal_policy = tuple(int(x) for x in optimal_policy)
    expected_values = tuple(float(x) for x in expected_values)

    optimal_policy = dict(zip(list(range(num_states)), list(optimal_policy)))
    expected_values = list(expected_values)
    policies = [
        dict(zip(list(range(num_states)), list(opt_policy)))
        for opt_policy in policies
    ]

    data_value['convergence']['tot_time'] = tot_time_end - tot_time_start
    data_value['convergence']['time_iter'] = time_iter
    data_value['convergence']['value_iter'] = iter_value
    data_value['convergence']['variation'] = variation
    data_value['convergence']['optimal_policy'] = optimal_policy
    data_value['convergence']['expected_values'] = expected_values
    data_value['convergence']['policies'] = policies

    return data_value
Example 2
def solve_mdp_value():
    """Solve the problem as a value iteration Markov decision process.
    """
    P, R = get_transition_and_reward_arrays()
    sdp = mdp.ValueIteration(P, R, 0.96, epsilon=0.01, max_iter=1000)
    sdp.run()
    return sdp
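For reference, the same pattern run against the forest-management problem bundled with pymdptoolbox; a minimal sketch assuming the stock API (get_transition_and_reward_arrays is defined elsewhere in the original project):

import mdptoolbox.example
import mdptoolbox.mdp as mdp

P, R = mdptoolbox.example.forest()   # P: (A, S, S) transition array, R: (S, A) reward array
sdp = mdp.ValueIteration(P, R, 0.96, epsilon=0.01, max_iter=1000)
sdp.run()
print(sdp.policy)         # tuple with one action per state
print(sdp.V)              # tuple of expected discounted values
print(sdp.iter, sdp.time)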
Example 3
def tictactoe(gamma=0.95):
    outdir = mktmpdir('a4_ttt')
    timings = {}
    print('====== Running Tic Tac Toe =======')
    P, R = ttt.getTransitionAndRewardArrays()

    print('\nValue Iteration')
    ttt_vi = mdp.ValueIteration(P, R, gamma)
    ttt_vi.setVerbose()
    vi_time = default_timer()
    ttt_vi.run()
    vi_time = default_timer() - vi_time
    print(f'MDP Toolbox VI finished in {ttt_vi.iter} iterations')
    print(f'Number of recorded rewards: {len(ttt_vi.rewards)}')
    print(f'Rewards: {ttt_vi.rewards}')
    save_stats(outdir, f'vi', ttt_vi)

    print('\nPolicy Iteration')
    ttt_pi = mdp.PolicyIteration(P, R, gamma)
    ttt_pi.setVerbose()
    pi_time = default_timer()
    ttt_pi.run()
    pi_time = default_timer() - pi_time
    print(f'MDP Toolbox PI finished in {ttt_pi.iter} iterations')
    print(f'Number of recorded rewards: {len(ttt_pi.rewards)}')
    print(f'Rewards: {ttt_pi.rewards}')
    save_stats(outdir, 'pi', ttt_pi)

    print('PI/VI same policy?: {}'.format(
        np.all(ttt_vi.policy == ttt_pi.policy)))
    save_stats(outdir, 'pi_policy', ttt_pi.policy)
    save_stats(outdir, 'vi_policy', ttt_vi.policy)

    # Q vs random
    epsilons = [0.4, 0.9]
    rewards = []
    agents = []
    qtimes = []
    for i, epsilon in enumerate(epsilons):
        qtimes.append(default_timer())
        r, agent = ttt.train_agents('random', 500000, epsilon, 0.9, 0.4, 0.9,
                                    0.99, False)
        qtimes[i] = default_timer() - qtimes[i]
        rewards.append(r)
        agents.append(agent)
        qpolicy = agent.policy()

        save_stats(outdir, f'ttt_agents{epsilon}', agent)
        save_stats(outdir, f'ttt_rewards{epsilon}', r)
        save_stats(outdir, f'q_policy_{epsilon}', qpolicy)
        # print(f'{epsilon} policy same as vi?: {np.all(ttt_vi.policy == qpolicy)}')

    timings = {
        'vi': vi_time,
        'pi': pi_time,
        'q_eps4': qtimes[0],
        'q_eps9': qtimes[1]
    }
    print(timings)
Example 4
def main():
	transitions, reward, discount, lake = get_environement()
	
	# Policy iteration
	policy_iteration = mdp.PolicyIteration(transitions, reward, discount, policy0=None, max_iter=1000, eval_type=0)
	policy_iteration.run()
	print_as_grid(policy_iteration.policy, lake, 5)
	print(policy_iteration.time)
	print(policy_iteration.iter)

	# Value iteration
	value_iteration = mdp.ValueIteration(transitions, reward, discount, epsilon=0.01, max_iter=1000, initial_value=0)
	value_iteration.run()
	print_as_grid(value_iteration.policy, lake, 5)
	print(value_iteration.time)
	print(value_iteration.iter)

	# Q-learning
	q_learning = mdp.QLearning(transitions, reward, discount, n_iter=20000000)
	q_learning.run()
	print_as_grid(q_learning.policy, lake, 5)
	print(q_learning.time)
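The get_environement and print_as_grid helpers are not shown. A hypothetical sketch of a grid printer, assuming the Gym FrozenLake action encoding (0=left, 1=down, 2=right, 3=up) and a lake layout passed as a flat string of 'S'/'F'/'H'/'G' cells in row-major order (the original helper may differ):

ARROWS = {0: '<', 1: 'v', 2: '>', 3: '^'}

def print_policy_grid(policy, lake, width):
    # Print one row per line, showing the greedy action for each state
    # while keeping holes (H) and the goal (G) visible.
    for row_start in range(0, len(policy), width):
        row = ''
        for s in range(row_start, row_start + width):
            cell = lake[s]
            row += cell if cell in 'HG' else ARROWS[policy[s]]
        print(row)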
Example 5
def run_program(N, B):
    # Arbitrary threshold to consider realistic horizon
    threshold = 0.01

    # Iterator variables
    max_timestep = 0

    # Calculate maximum likely horizon using arbitrary threshold
    while True:
        p = pow(1. * (N - sum(B)) / N, max_timestep)
        if p < threshold:
            break
        max_timestep += 1

    # State is bankroll, from 0 to N*max_timesteps (inclusive)
    states = list(range(N * max_timestep + 1))

    # Extra state for each possible bankroll to indicate terminal state
    states *= 2

    # Actions are always roll or quit, encoded to {0, 1}
    actions = [0, 1]

    T = build_transition_matrix(len(states), N, B)
    R = build_reward_matrix(len(states))

    # Gamma is 1 since we don't value future reward any less than immediate
    gamma = 1.0

    # Arbitrary threshold epsilon
    epsilon = 0.01

    vi = mdp.ValueIteration(T, R, gamma, epsilon, max_iter=1000)
    vi.run()

    print('N={} ... output={}'.format(N, vi.V[0]))
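Because T and R are built by hand here, it can help to validate them before solving. A small sketch, assuming pymdptoolbox's mdptoolbox.util.check helper (it raises an exception when the transition matrices are not square and stochastic, or when the shapes of T and R disagree):

import mdptoolbox.util

def checked_value_iteration(T, R, gamma, epsilon=0.01):
    # Validate the transition and reward arrays, then solve as above.
    mdptoolbox.util.check(T, R)
    vi = mdp.ValueIteration(T, R, gamma, epsilon, max_iter=1000)
    vi.run()
    return vi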
Example 6
    
    """
    for w in WINS:
        S = sum(1 if (w[k] == 1 and state[k] == who) else 0
                for k in range(ACTIONS))
        if S == 3:
            # We have a win
            return True
    # There were no wins so return False
    return False

def isValid(state):
    """Return True if the player/opponent cell counts are a valid combination."""
    # S1 is the number of cells occupied by the player
    S1 = sum(1 if x == PLAYER else 0 for x in state)
    # S2 is the number of cells occupied by the opponent
    S2 = sum(1 if x == OPPONENT else 0 for x in state)
    return (S1, S2) in OWNED_CELLS

if __name__ == "__main__":
    P, R = getTransitionAndRewardArrays()
    ttt = mdp.ValueIteration(P, R, 1)
    ttt.setVerbose()
    ttt.run()
    f = "tictactoe.pkl"
    pickle.dump(ttt.policy, open(f, "wb"))
    print("Optimal policy pickled as '%s' in current directory." % f)
Example 7
    def solveVI(self, discount, epsilon):
        T, R = self.get_transition_and_reward_arrays(0.5)
        vi = mdp.ValueIteration(T, R, discount=discount, epsilon=epsilon)
        # vi.setVerbose()
        vi.run()
        return vi
Example 8
def run_vi_pi():
    """Solves the Maze aka Theseus and the Minotaur MDP."""
    MZ = MazeEnv()
    T = MZ.transition()
    R = MZ.rewards()

    # Initial runs with discount=0.9 and epsilon=0.01
    vi = mdp.ValueIteration(T, R, discount=0.9, epsilon=0.01)
    pi = mdp.PolicyIterationModified(T, R, discount=0.9, epsilon=0.01)
    #vi.setVerbose()
    #pi.setVerbose()
    vi.run()
    pi.run()
    print(MZ.print_policy(vi.policy))
    print("\n")
    print(MZ.print_policy(pi.policy))
    plot('MZ_ValueIteration_Iter_Vvar', 'Iterations', 'V-variation')
    plot('MZ_PolicyIteration_Iter_Vvar', 'Iterations', 'V-variation')

    # Discount sweep
    discount = np.arange(0.01, 0.99, 0.01)
    vi_time_d = []
    vi_iter_d = []
    pi_time_d = []
    pi_iter_d = []
    for d in discount:
        vi = mdp.ValueIteration(T, R, discount=d, epsilon=0.01)
        pi = mdp.PolicyIterationModified(T, R, discount=d, epsilon=0.01)
        vi.run()
        pi.run()
        vi_time_d.append(vi.time)
        vi_iter_d.append(vi.iter)
        pi_time_d.append(pi.time)
        pi_iter_d.append(pi.iter)

    pd.DataFrame(pd.concat(
        [pd.Series(discount), pd.Series(vi_time_d)],
        axis=1)).to_csv('../plot_data/MZ_ValueIteration_Discount_vs_Time.csv')
    pd.DataFrame(pd.concat(
        [pd.Series(discount), pd.Series(vi_iter_d)],
        axis=1)).to_csv('../plot_data/MZ_ValueIteration_Discount_vs_Iter.csv')
    pd.DataFrame(pd.concat(
        [pd.Series(discount), pd.Series(pi_time_d)],
        axis=1)).to_csv('../plot_data/MZ_PolicyIteration_Discount_vs_Time.csv')
    pd.DataFrame(pd.concat(
        [pd.Series(discount), pd.Series(pi_iter_d)],
        axis=1)).to_csv('../plot_data/MZ_PolicyIteration_Discount_vs_Iter.csv')
    plot('MZ_ValueIteration_Discount_vs_Time', 'Discount', 'Run Time')
    plot('MZ_ValueIteration_Discount_vs_Iter', 'Discount', 'Iterations')
    plot('MZ_PolicyIteration_Discount_vs_Time', 'Discount', 'Run Time')
    plot('MZ_PolicyIteration_Discount_vs_Iter', 'Discount', 'Iterations')

    # Epsilon sweep
    epsilon = np.arange(0.05, 2, 0.05)
    vi_time_e = []
    vi_iter_e = []
    pi_time_e = []
    pi_iter_e = []
    for e in epsilon:
        vi = mdp.ValueIteration(T, R, discount=0.9, epsilon=e)
        pi = mdp.PolicyIterationModified(T, R, discount=0.9, epsilon=e)
        vi.run()
        pi.run()
        vi_time_e.append(vi.time)
        vi_iter_e.append(vi.iter)
        pi_time_e.append(pi.time)
        pi_iter_e.append(pi.iter)

    pd.DataFrame(pd.concat(
        [pd.Series(epsilon), pd.Series(vi_time_e)],
        axis=1)).to_csv('../plot_data/MZ_ValueIteration_Epsilon_vs_Time.csv')
    pd.DataFrame(pd.concat(
        [pd.Series(epsilon), pd.Series(vi_iter_e)],
        axis=1)).to_csv('../plot_data/MZ_ValueIteration_Epsilon_vs_Iter.csv')
    pd.DataFrame(pd.concat(
        [pd.Series(epsilon), pd.Series(pi_time_e)],
        axis=1)).to_csv('../plot_data/MZ_PolicyIteration_Epsilon_vs_Time.csv')
    pd.DataFrame(pd.concat(
        [pd.Series(epsilon), pd.Series(pi_iter_e)],
        axis=1)).to_csv('../plot_data/MZ_PolicyIteration_Epsilon_vs_Iter.csv')
    plot('MZ_ValueIteration_Epsilon_vs_Time', 'Epsilon', 'Run Time')
    plot('MZ_ValueIteration_Epsilon_vs_Iter', 'Epsilon', 'Iterations')
    plot('MZ_PolicyIteration_Epsilon_vs_Time', 'Epsilon', 'Run Time')
    plot('MZ_PolicyIteration_Epsilon_vs_Iter', 'Epsilon', 'Iterations')
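The plot helper used above is not shown. A plausible reconstruction for the sweep plots, assuming it reads the CSV of the same name from ../plot_data/ and saves a figure next to it (hypothetical; the original may differ):

import pandas as pd
import matplotlib.pyplot as plt

def plot(name, xlabel, ylabel):
    # Read '<name>.csv' (first column = x, second column = y) and save a line plot.
    df = pd.read_csv('../plot_data/{}.csv'.format(name), index_col=0)
    plt.figure()
    plt.plot(df.iloc[:, 0], df.iloc[:, 1])
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(name)
    plt.savefig('../plot_data/{}.png'.format(name))
    plt.close()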
Example 9
print("PolicyIterationModified duration:", pim_class.time)
print("PolicyIterationModified iterations:", pim_class.iter)

print("_________________")

#RelativeValueIteration (average-reward criterion: the constructor takes no discount factor)
rvi_class = mdp.RelativeValueIteration(T, R, max_iter=iterations)
rvi_class.run()
all_policies["RelativeValueIteration"] = rvi_class.policy
print("RelativeValueIteration duration:", rvi_class.time)
print("RelativeValueIteration iterations:", rvi_class.iter)

print("_________________")

#ValueIteration
vi_class = mdp.ValueIteration(T, R, discountFactor, max_iter=iterations)
vi_class.run()
all_policies["ValueIteration"] = vi_class.policy
print("ValueIteration duration:", vi_class.time)
print("ValueIteration iterations:", vi_class.iter)

print("_________________")

#ValueIterationGS
vigs_class = mdp.ValueIterationGS(T, R, discountFactor, max_iter=iterations)
vigs_class.run()
all_policies["ValueIterationGS"] = vigs_class.policy
print("ValueIterationGS duration:", vigs_class.time)
print("ValueIterationGS iterations:", vigs_class.iter)

#print policies for visualization purposes 
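As hinted by the final comment, one simple way to inspect the collected results is to check which solvers agree; a sketch using the all_policies dict built above:

names = sorted(all_policies)
for i, a in enumerate(names):
    for b in names[i + 1:]:
        same = all_policies[a] == all_policies[b]
        print('{} vs {}: {} policy'.format(a, b, 'same' if same else 'different'))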
Example 10
def value_iteration(T, R, gamma=0.99):
    vi = mdp.ValueIteration(T, R, gamma)
    vi.run()

    return vi
Example 11
        return False

    def isValid(state):
        """Return True if the player/opponent cell counts are a valid combination."""
        # S1 is the number of cells occupied by the player
        S1 = sum(1 if x == PLAYER else 0 for x in state)
        # S2 is the number of cells occupied by the opponent
        S2 = sum(1 if x == OPPONENT else 0 for x in state)
        return (S1, S2) in OWNED_CELLS

    P, R = getTransitionAndRewardArrays()
    for discount in np.arange(.1, 1, .2):
        ttt = mdp.ValueIteration(P, R, discount)
        ttt.setVerbose()
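        # Note: clock() here presumably comes from the time module; time.clock()
        # was removed in Python 3.8, and time.perf_counter() is the usual replacement.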
        start = clock()
        ttt.run()
        elapsed = clock() - start

    for discount in np.arange(.1, 1, .2):
        ttt = mdp.PolicyIteration(P, R, discount)
        ttt.setVerbose()
        start = clock()
        ttt.run()
        elapsed = clock() - start

    for discount in np.arange(.1, 1, .2):
        qlearner_stats = collections.defaultdict(list)
        ttt = hmdp.QLearning(P, R, discount)