def fit_value(st, rm, gamma, num_states):
    iterations = list(range(1, 1000, 10))
    data_value = {}
    data_value['convergence'] = {}

    for iter in iterations:
        print('Current Iteration: {}'.format(iter))
        data_value[str(iter)] = {}
        tot_time_start = time.time()
        vi = mdp.ValueIteration(st, rm, gamma, max_iter=10000000, epsilon=0.0001)
        # vi.setVerbose()
        time_iter, iter_value, variation, policies = vi.run(max_iter=iter)
        tot_time_end = time.time()
        tot_time = tot_time_end - tot_time_start
        if iter_value > iter:
            raise ValueError('ValueIteration is not stopping at maximum iterations')
        data_value[str(iter)]['tot_time'] = tot_time
        data_value[str(iter)]['time_iter'] = time_iter
        data_value[str(iter)]['value_iter'] = iter_value
        data_value[str(iter)]['variation'] = variation

    print('Convergence')
    tot_time_start = time.time()
    vi = mdp.ValueIteration(st, rm, gamma, max_iter=10000, epsilon=0.0001)
    time_iter, iter_value, variation, policies = vi.run(max_iter=10000)
    tot_time_end = time.time()
    optimal_policy = vi.policy
    expected_values = vi.V
    policies = [tuple(int(x) for x in opt_policy) for opt_policy in policies]
    optimal_policy = tuple(int(x) for x in optimal_policy)
    expected_values = tuple(float(x) for x in expected_values)
    optimal_policy = dict(zip(list(range(num_states)), list(optimal_policy)))
    expected_values = list(expected_values)
    policies = [
        dict(zip(list(range(num_states)), list(opt_policy)))
        for opt_policy in policies
    ]
    data_value['convergence']['tot_time'] = tot_time_end - tot_time_start
    data_value['convergence']['time_iter'] = time_iter
    data_value['convergence']['value_iter'] = iter_value
    data_value['convergence']['variation'] = variation
    data_value['convergence']['optimal_policy'] = optimal_policy
    data_value['convergence']['expected_values'] = expected_values
    data_value['convergence']['policies'] = policies
    return data_value
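# Illustrative consumer sketch (not from the original code): summarize the
# per-iteration records that fit_value builds. Note that fit_value assumes a
# modified mdptoolbox in which ValueIteration.run() returns
# (time_iter, iter_value, variation, policies); the stock run() returns None.
def summarize_fit(data_value):
    for key, stats in data_value.items():
        if key == 'convergence':
            continue
        print('max_iter={:>4}  wall time={:.4f}s  iterations run={}'.format(
            key, stats['tot_time'], stats['value_iter']))
    print('converged value function length:',
          len(data_value['convergence']['expected_values']))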
def solve_mdp_value():
    """Solve the problem as a value iteration Markov decision process."""
    P, R = get_transition_and_reward_arrays()
    sdp = mdp.ValueIteration(P, R, 0.96, epsilon=0.01, max_iter=1000)
    sdp.run()
    return sdp
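# Minimal driver sketch, assuming get_transition_and_reward_arrays() and the
# mdptoolbox import from this module are available. After run(), the solver
# exposes the standard policy, V, iter and time attributes.
if __name__ == '__main__':
    sdp = solve_mdp_value()
    print('Optimal policy:', sdp.policy)   # one action per state
    print('Value function:', sdp.V)        # expected discounted return per state
    print('Converged in {} iterations ({:.3f}s)'.format(sdp.iter, sdp.time))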
def tictactoe(gamma=0.95):
    outdir = mktmpdir('a4_ttt')
    timings = {}
    print('====== Running Tic Tac Toe =======')
    gamma = 0.95
    P, R = ttt.getTransitionAndRewardArrays()

    print('\nValue Iteration')
    ttt_vi = mdp.ValueIteration(P, R, gamma)
    ttt_vi.setVerbose()
    vi_time = default_timer()
    ttt_vi.run()
    vi_time = default_timer() - vi_time
    print(f'MDP Toolbox VI finished in {ttt_vi.iter} iterations')
    print(f'Accumulated reward: {len(ttt_vi.rewards)}')
    print(f'Rewards: {ttt_vi.rewards}')
    save_stats(outdir, 'vi', ttt_vi)

    print('\nPolicy Iteration')
    ttt_pi = mdp.PolicyIteration(P, R, gamma)
    ttt_pi.setVerbose()
    pi_time = default_timer()
    ttt_pi.run()
    pi_time = default_timer() - pi_time
    print(f'MDP Toolbox PI finished in {ttt_pi.iter} iterations')
    print(f'Accumulated reward: {len(ttt_pi.rewards)}')
    print(f'Rewards: {ttt_pi.rewards}')
    save_stats(outdir, 'pi', ttt_pi)

    print('PI/VI same policy?: {}'.format(np.all(ttt_vi.policy == ttt_pi.policy)))
    save_stats(outdir, 'pi_policy', ttt_pi.policy)
    save_stats(outdir, 'vi_policy', ttt_vi.policy)

    # Q vs random
    epsilons = [0.4, 0.9]
    rewards = []
    agents = []
    qtimes = []
    for i, epsilon in enumerate(epsilons):
        qtimes.append(default_timer())
        r, agent = ttt.train_agents('random', 500000, epsilon, 0.9, 0.4, 0.9, 0.99, False)
        qtimes[i] = default_timer() - qtimes[i]
        rewards.append(r)
        agents.append(agent)
        qpolicy = agent.policy()
        save_stats(outdir, f'ttt_agents{epsilon}', agent)
        save_stats(outdir, f'ttt_rewards{epsilon}', r)
        save_stats(outdir, f'q_policy_{epsilon}', qpolicy)
        # print(f'{epsilon} policy same as vi?: {np.all(ttt_vi.policy == qpolicy)}')

    timings = {
        # 'vi': vi_time,
        # 'pi': pi_time,
        'q_eps4': qtimes[0],
        'q_eps7': qtimes[1],
    }
    print(timings)
def main():
    transitions, reward, discount, lake = get_environement()

    # Policy iteration
    policy_iteration = mdp.PolicyIteration(transitions, reward, discount,
                                           policy0=None, max_iter=1000, eval_type=0)
    policy_iteration.run()
    print_as_grid(policy_iteration.policy, lake, 5)
    print(policy_iteration.time)
    print(policy_iteration.iter)

    # Value iteration
    value_iteration = mdp.ValueIteration(transitions, reward, discount,
                                         epsilon=0.01, max_iter=1000, initial_value=0)
    value_iteration.run()
    print_as_grid(value_iteration.policy, lake, 5)
    print(value_iteration.time)
    print(value_iteration.iter)

    # Q-learning
    q_learning = mdp.QLearning(transitions, reward, discount, n_iter=20000000)
    q_learning.run()
    print_as_grid(q_learning.policy, lake, 5)
    print(q_learning.time)
def run_program(N, B):
    # Arbitrary threshold to consider a realistic horizon
    threshold = 0.01

    # Iterator variable
    max_timestep = 0

    # Calculate the maximum likely horizon using the arbitrary threshold
    while True:
        p = pow(1. * (N - sum(B)) / N, max_timestep)
        if p < threshold:
            break
        max_timestep += 1

    # State is bankroll, from 0 to N*max_timestep (inclusive)
    states = list(range(N * max_timestep + 1))
    # Extra state for each possible bankroll to indicate a terminal state
    states *= 2

    # Actions are always roll or quit, encoded as {0, 1}
    actions = [0, 1]

    T = build_transition_matrix(len(states), N, B)
    R = build_reward_matrix(len(states))

    # Gamma is 1 since we don't value future reward any less than immediate reward
    gamma = 1.0
    # Arbitrary threshold epsilon
    epsilon = 0.01

    vi = mdp.ValueIteration(T, R, gamma, epsilon, max_iter=1000)
    vi.run()

    print('N={} ... output={}'.format(N, vi.V[0]))
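# Hypothetical driver (values are illustrative, not from the original code):
# run_program reads B as a 0/1 indicator over the N die sides, where a 1 marks
# a side that ends the game, so (N - sum(B)) / N is the probability of
# surviving a roll. Assumes build_transition_matrix and build_reward_matrix
# from the same module are importable.
if __name__ == '__main__':
    N = 6                      # assumed number of die sides
    B = [1, 0, 0, 0, 0, 0]     # assumed indicator of "bust" sides
    run_program(N, B)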
""" for w in WINS: S = sum(1 if (w[k] == 1 and state[k] == who) else 0 for k in range(ACTIONS)) if S == 3: # We have a win return True # There were no wins so return False return False def isValid(state): """""" # S1 is the sum of the player's cells S1 = sum(1 if x == PLAYER else 0 for x in state) # S2 is the sum of the opponent's cells S2 = sum(1 if x == OPPONENT else 0 for x in state) if (S1, S2) in OWNED_CELLS: return True else: return False if __name__ == "__main__": P, R = getTransitionAndRewardArrays() ttt = mdp.ValueIteration(P, R, 1) ttt.setVerbose() ttt.run() f = "tictactoe.pkl" pickle.dump(ttt.policy, open(f, "wb")) print("Optimal policy pickled as '%s' in current directory." % f)
def solveVI(self, discount, epsilon):
    T, R = self.get_transition_and_reward_arrays(0.5)
    vi = mdp.ValueIteration(T, R, discount=discount, epsilon=epsilon)
    # vi.setVerbose()
    vi.run()
    return vi
def run_vi_pi():
    """Solves the Maze aka Theseus and the Minotaur MDP."""
    MZ = MazeEnv()
    T = MZ.transition()
    R = MZ.rewards()

    # Initial values
    vi = mdp.ValueIteration(T, R, discount=0.9, epsilon=0.01)
    pi = mdp.PolicyIterationModified(T, R, discount=0.9, epsilon=0.01)
    # vi.setVerbose()
    # pi.setVerbose()
    vi.run()
    pi.run()
    print(MZ.print_policy(vi.policy))
    print("\n")
    print(MZ.print_policy(pi.policy))
    plot('MZ_ValueIteration_Iter_Vvar', 'Iterations', 'V-variation')
    plot('MZ_PolicyIteration_Iter_Vvar', 'Iterations', 'V-variation')

    # Discount
    discount = np.arange(0.01, 0.99, 0.01)
    vi_time_d = []
    vi_iter_d = []
    pi_time_d = []
    pi_iter_d = []
    for d in discount:
        vi = mdp.ValueIteration(T, R, discount=d, epsilon=0.01)
        pi = mdp.PolicyIterationModified(T, R, discount=d, epsilon=0.01)
        vi.run()
        pi.run()
        vi_time_d.append(vi.time)
        vi_iter_d.append(vi.iter)
        pi_time_d.append(pi.time)
        pi_iter_d.append(pi.iter)
    pd.DataFrame(pd.concat(
        [pd.Series(discount), pd.Series(vi_time_d)],
        axis=1)).to_csv('../plot_data/MZ_ValueIteration_Discount_vs_Time.csv')
    pd.DataFrame(pd.concat(
        [pd.Series(discount), pd.Series(vi_iter_d)],
        axis=1)).to_csv('../plot_data/MZ_ValueIteration_Discount_vs_Iter.csv')
    pd.DataFrame(pd.concat(
        [pd.Series(discount), pd.Series(pi_time_d)],
        axis=1)).to_csv('../plot_data/MZ_PolicyIteration_Discount_vs_Time.csv')
    pd.DataFrame(pd.concat(
        [pd.Series(discount), pd.Series(pi_iter_d)],
        axis=1)).to_csv('../plot_data/MZ_PolicyIteration_Discount_vs_Iter.csv')
    plot('MZ_ValueIteration_Discount_vs_Time', 'Discount', 'Run Time')
    plot('MZ_ValueIteration_Discount_vs_Iter', 'Discount', 'Iterations')
    plot('MZ_PolicyIteration_Discount_vs_Time', 'Discount', 'Run Time')
    plot('MZ_PolicyIteration_Discount_vs_Iter', 'Discount', 'Iterations')

    # Epsilon
    epsilon = np.arange(0.05, 2, 0.05)
    vi_time_e = []
    vi_iter_e = []
    pi_time_e = []
    pi_iter_e = []
    for e in epsilon:
        vi = mdp.ValueIteration(T, R, discount=0.9, epsilon=e)
        pi = mdp.PolicyIterationModified(T, R, discount=0.9, epsilon=e)
        vi.run()
        pi.run()
        vi_time_e.append(vi.time)
        vi_iter_e.append(vi.iter)
        pi_time_e.append(pi.time)
        pi_iter_e.append(pi.iter)
    pd.DataFrame(pd.concat(
        [pd.Series(epsilon), pd.Series(vi_time_e)],
        axis=1)).to_csv('../plot_data/MZ_ValueIteration_Epsilon_vs_Time.csv')
    pd.DataFrame(pd.concat(
        [pd.Series(epsilon), pd.Series(vi_iter_e)],
        axis=1)).to_csv('../plot_data/MZ_ValueIteration_Epsilon_vs_Iter.csv')
    pd.DataFrame(pd.concat(
        [pd.Series(epsilon), pd.Series(pi_time_e)],
        axis=1)).to_csv('../plot_data/MZ_PolicyIteration_Epsilon_vs_Time.csv')
    pd.DataFrame(pd.concat(
        [pd.Series(epsilon), pd.Series(pi_iter_e)],
        axis=1)).to_csv('../plot_data/MZ_PolicyIteration_Epsilon_vs_Iter.csv')
    plot('MZ_ValueIteration_Epsilon_vs_Time', 'Epsilon', 'Run Time')
    plot('MZ_ValueIteration_Epsilon_vs_Iter', 'Epsilon', 'Iterations')
    plot('MZ_PolicyIteration_Epsilon_vs_Time', 'Epsilon', 'Run Time')
    plot('MZ_PolicyIteration_Epsilon_vs_Iter', 'Epsilon', 'Iterations')
print("PolicyIterationModified duration:", pim_class.time) print("PolicyIterationModified iterations:", pim_class.iter) print("_________________") #RelativeValueIteration rvi_class = mdp.RelativeValueIteration(T, R, discountFactor, max_iter=iterations) rvi_class.run() all_policies["RelativeValueIteration"] = rvi_class.policy print("RelativeValueIteration duration:", rvi_class.time) print("RelativeValueIteration iterations:", rvi_class.iter) print("_________________") #ValueIteration vi_class = mdp.ValueIteration(T, R, discountFactor, max_iter=iterations) vi_class.run() all_policies["ValueIteration"] = vi_class.policy print("ValueIteration duration:", vi_class.time) print("ValueIteration iterations:", vi_class.iter) print("_________________") #ValueIterationGS vigs_class = mdp.ValueIterationGS(T, R, discountFactor, max_iter=iterations) vigs_class.run() all_policies["ValueIterationGS"] = vigs_class.policy print("ValueIterationGS duration:", vigs_class.time) print("ValueIterationGS iterations:", vigs_class.iter) #print policies for visualization purposes
def value_iteration(T, R, gamma=0.99):
    vi = mdp.ValueIteration(T, R, gamma)
    vi.run()
    return vi
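# Usage sketch (illustrative, not from the original module): exercise
# value_iteration() on the small forest-management example bundled with
# pymdptoolbox. Assumes this module already imports mdptoolbox.mdp as mdp.
import mdptoolbox.example

P, R = mdptoolbox.example.forest()   # 3-state toy transition/reward arrays
vi = value_iteration(P, R, gamma=0.9)
print(vi.policy)   # tuple with one action per state
print(vi.V)        # expected discounted return per state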
    return False


def isValid(state):
    """Check whether the counts of player and opponent cells form a valid board state."""
    # S1 is the sum of the player's cells
    S1 = sum(1 if x == PLAYER else 0 for x in state)
    # S2 is the sum of the opponent's cells
    S2 = sum(1 if x == OPPONENT else 0 for x in state)
    if (S1, S2) in OWNED_CELLS:
        return True
    else:
        return False


P, R = getTransitionAndRewardArrays()

for discount in np.arange(.1, 1, .2):
    ttt = mdp.ValueIteration(P, R, discount)
    ttt.setVerbose()
    start = clock()
    ttt.run()
    elapsed = clock() - start

for discount in np.arange(.1, 1, .2):
    ttt = mdp.PolicyIteration(P, R, discount)
    ttt.setVerbose()
    start = clock()
    ttt.run()
    elapsed = clock() - start

for discount in np.arange(.1, 1, .2):
    qlearner_stats = collections.defaultdict(list)
    ttt = hmdp.QLearning(P, R, discount)