def policy_iteration():
    # Sweep problem size: run policy iteration on forests of increasing size.
    deltas = {}
    rewards = {}
    for size in PROBLEM_SIZES:
        P, R = forest(S=size, r1=1, r2=5, p=.1)
        pi = PolicyIteration(P, R, 0.9, max_iter=10)
        pi.run()
        delta = [stat['Error'] for stat in pi.run_stats]
        reward = [stat['Reward'] for stat in pi.run_stats]
        deltas[size] = delta
        rewards[size] = reward
        print(pi.policy)
        print(pi.S)
    # forest_plot.plot_pi_forest_convergence_size(rewards)

    # Sweep fire probability on a fixed-size forest.
    deltas = {}
    rewards = {}
    for p in [.2, .1, .05, .01]:
        P, R = forest(S=10, r1=1, r2=5, p=p)
        pi = PolicyIteration(P, R, 0.9, max_iter=10)
        pi.run()
        delta = [stat['Error'] for stat in pi.run_stats]
        reward = [stat['Reward'] for stat in pi.run_stats]
        deltas[p] = delta
        rewards[p] = reward
        print(pi.policy)
        print(pi.S)
    forest_plot.plot_pi_forest_convergence_p(rewards)
def process_environment(self, config, tune_param=0):
    """Build an Environment instance for the configured problem."""
    print(f"Processing inputs for {self.type} in {self.environment}")
    # Generate environment class
    env_class = Environment(self.name, self.environment, tune_param)
    # Populate with instance
    if self.environment == "forest":
        # Generate forest (fire) management problem
        env_class.env = forest(S=config['states'],
                               r1=config['reward1'],
                               r2=config['reward2'],
                               p=config['prob'],
                               is_sparse=config['is_sparse'])
        env_class.transition = env_class.env[0]
        env_class.reward = env_class.env[1]
        self.states = config['states']
    else:
        # Create FrozenLake and process matrices
        lake = frozen_lake.generate_random_map(size=config['size'], p=config['prob'])
        env_class.env = frozen_lake.FrozenLakeEnv(lake)
        env_class.process_matrices()
        self.states = config['size'] ** 2
    return env_class
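# For reference, a minimal usage sketch of process_environment. The config
# keys mirror those read above; the `runner` object and the exact parameter
# values are hypothetical, not taken from this codebase.
forest_config = {
    'states': 500,       # number of forest states
    'reward1': 100,      # reward for WAIT in the oldest state
    'reward2': 10,       # reward for CUT in the oldest state
    'prob': 0.01,        # fire probability
    'is_sparse': False,  # dense transition matrices
}
env = runner.process_environment(forest_config)
print(env.transition.shape, env.reward.shape)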
def gamma_iter_value():
    # Sweep gamma and compare convergence for two fire probabilities.
    gamma = np.arange(0.1, 1.0, 0.1)
    v1_iter = []
    v2_iter = []
    v1_v_mean = []
    v2_v_mean = []
    for g in gamma:
        P, R = forest(num_states, r1, r2, p_fire)
        P2, R2 = forest(num_states, r1, r2, 0.9)
        vi = ValueIteration(P, R, g, 1e-20)
        vi.run()
        vi2 = ValueIteration(P2, R2, g, 1e-20)
        vi2.run()
        v1_iter.append(len(vi.run_stats))
        v2_iter.append(len(vi2.run_stats))
        v1_v_mean.append(vi.run_stats[-1]["Mean V"])
        v2_v_mean.append(vi2.run_stats[-1]["Mean V"])

    # plt.plot(gamma, v1_iter, linestyle='--', marker='o', color='b',
    #          label="fire probability = 0.1")
    # plt.plot(gamma, v2_iter, linestyle='--', marker='o', color='r',
    #          label="fire probability = 0.9")
    # plt.xlabel("Gamma")
    # plt.ylabel("Iterations to converge")
    # plt.title("Iterations to converge vs gamma")
    # plt.legend(('fire probability = 0.1', 'fire probability = 0.9'), loc="upper left")
    # plt.show()

    plt.plot(gamma, v1_v_mean, linestyle='--', marker='o', color='b',
             label="fire probability = 0.1")
    plt.plot(gamma, v2_v_mean, linestyle='--', marker='o', color='r',
             label="fire probability = 0.9")
    plt.xlabel("Gamma")
    plt.ylabel("Converged mean value")
    plt.title("Converged mean value vs gamma")
    plt.legend(('fire probability = 0.1', 'fire probability = 0.9'), loc="upper left")
    plt.show()
def value_iteration():
    # Sweep problem size: run value iteration on forests of increasing size.
    deltas = {}
    rewards = {}
    for size in PROBLEM_SIZES:
        P, R = forest(S=size, r1=1, r2=5, p=.1)
        vi = ValueIteration(P, R, 0.9, max_iter=10)
        vi.run()
        delta = [stat['Error'] for stat in vi.run_stats]
        reward = [stat['Reward'] for stat in vi.run_stats]
        deltas[size] = delta
        rewards[size] = reward
        print(vi.policy)
        print(vi.S)
def qlearning():
    # Sweep problem size: run Q-learning on forests of increasing size.
    deltas = {}
    rewards = {}
    for size in [10, 20, 40, 80]:
        P, R = forest(S=size, r1=1, r2=5, p=.1)
        ql = QLearning(P, R, 0.90, epsilon_decay=.998)
        ql.run()
        delta = [stat['Error'] for stat in ql.run_stats]
        reward = [stat['Reward'] for stat in ql.run_stats]
        epsilon = [stat['Epsilon'] for stat in ql.run_stats]
        deltas[size] = delta
        rewards[size] = reward
        print(ql.policy)
    forest_plot.plot_ql_forest_convergence_size(deltas)
def run(verbose=False):
    # MDP Forest Problem
    # transitions, reward = example.forest()
    nS = 1000
    # transitions, reward = example.forest(S=nS, r1=250, r2=120, p=0.01, is_sparse=False)
    transitions, reward = example.forest(S=nS, r1=1045, r2=1025, p=0.01, is_sparse=False)
    # print(transitions)
    # print(reward)
    # return

    print('~~~~~~~~~~ Forest - Policy Iteration ~~~~~~~~~~')
    pi = mdp.PolicyIteration(transitions, reward, 0.75, max_iter=10000)
    if verbose:
        pi.setVerbose()
    pi.run()
    util.print_debugs(pi)
    # print(pi.run_stats)
    # return

    print('~~~~~~~~~~ Forest - Value Iteration ~~~~~~~~~~')
    vi = mdp.ValueIteration(transitions, reward, 0.75, max_iter=100000)
    if verbose:
        vi.setVerbose()
    vi.run()
    util.print_debugs(vi)

    if vi.policy == pi.policy:
        print('Forest - Value and Policy Iteration policies are the same!')
    else:
        print('Forest - Value and Policy Iteration policies are NOT the same.')

    print('~~~~~~~~~~ Forest - Q-Learning ~~~~~~~~~~')
    # QLearning(transitions, reward, gamma,
    #           alpha=0.1, alpha_decay=0.99, alpha_min=0.001,
    #           epsilon=1.0, epsilon_min=0.1, epsilon_decay=0.99,
    #           n_iter=10000, skip_check=False, iter_callback=None,
    #           run_stat_frequency=None)
    ql = mdp.QLearning(transitions, reward, 0.75, alpha=0.3, epsilon_min=0.005, n_iter=500000)
    if verbose:
        ql.setVerbose()
    start_t = time.process_time()
    ql.run()
    end_t = time.process_time()

    # Output
    print('~~~~~~~~~~ Forest - Policy Iteration ~~~~~~~~~~')
    util.print_debugs(pi)
    print('~~~~~~~~~~ Forest - Value Iteration ~~~~~~~~~~')
    util.print_debugs(vi)
    print('~~~~~~~~~~ Forest - Q-Learning ~~~~~~~~~~')
    print(ql.policy)
    print('Q-Learning # of Iterations: %i' % ql.run_stats[-1]['Iteration'])
    print('Clock time')
    print(end_t - start_t)

    if vi.policy == pi.policy:
        print('Forest - Value and Policy Iteration policies are the same!')
    else:
        print('Forest - Value and Policy Iteration policies are NOT the same.')
    if vi.policy == ql.policy:
        print('Forest - QL and VI policies are the same!')
    else:
        print('Forest - QL and VI policies are NOT the same.')
    if pi.policy == ql.policy:
        print('Forest - PI and QL policies are the same!')
    else:
        print('Forest - PI and QL policies are NOT the same.')

# A Q-Learning algorithm
# Source:
# https://www.oreilly.com/radar/introduction-to-reinforcement-learning-and-openai-gym/
V_all = {}
policy_all = {}
vi_res = {
    "Iteration to converge": [],
    "Time to converge": [],
    "Max V": [],
    "Mean V": [],
}
pi_res = {
    "Iteration to converge": [],
    "Time to converge": [],
    "Max V": [],
    "Mean V": [],
}
for s in N_STATES:
    print(f"Running nS {s}...")
    P, R = forest(S=s, p=0.001, r1=100, r2=10)

    vi = ValueIteration(P, R, gamma=0.99, epsilon=0.001)
    vi.run()
    vi_res["Iteration to converge"].append(vi.iter)
    vi_res["Time to converge"].append(vi.time)
    vi_res["Max V"].append(vi.run_stats[-1]["Max V"])
    vi_res["Mean V"].append(vi.run_stats[-1]["Mean V"])
    V_all[("vi", s)] = vi.V
    policy_all[("vi", s)] = vi.policy

    pi = PolicyIteration(P, R, gamma=0.99, eval_type=1, max_iter=1000)
    pi.run()
    pi_res["Iteration to converge"].append(pi.iter)
    pi_res["Time to converge"].append(pi.time)
    pi_res["Max V"].append(pi.run_stats[-1]["Max V"])
    pi_res["Mean V"].append(pi.run_stats[-1]["Mean V"])
    V_all[("pi", s)] = pi.V
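# A convenient follow-up (a sketch, assuming pandas is available as in the
# other experiments here): tabulate each results dict against the sweep of
# state counts for side-by-side comparison.
import pandas as pd

vi_df = pd.DataFrame(vi_res, index=list(N_STATES))
pi_df = pd.DataFrame(pi_res, index=list(N_STATES))
print(vi_df)
print(pi_df)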
def init_forest(self):
    # Build the forest management MDP from the instance parameters.
    return mdpExm.forest(self.states, self.reward_wait, self.reward_cut, self.prob_fire)
label="fire possibility = 0.9") plt.xlabel("Gamma") plt.ylabel("Converged Mean Value") plt.title("converged Mean Value vs gamma") plt.legend(('fire possibility = 0.1', 'fire possibility = 0.9'), loc="upper left") plt.show() if __name__ == '__main__': gamma = 0.9 num_states = 20 r1 = 4 r2 = 2 p_fire = 0.1 P, R = forest(num_states, r1, r2, p_fire) vi = ValueIteration(P, R, 0.96, 1e-20) vi.run() P2, R2 = forest(num_states, r1, r2, 0.8) vi2 = ValueIteration(P2, R2, 0.96, 1e-20) vi2.run() # # calculate and plot the v_mean # iter_score(vi, vi2) # gamma_iter_value() # # # pi = PolicyIteration(P, R, 0.96)
if __name__ == '__main__':
    if not os.path.isdir('figures/'):
        os.mkdir('figures')
    y = input('''Choose environment:
    S=250, r1=10, r2=5: 1,
    S=1000, r1=15, r2=5: 2: ''')
    x = input('''Choose algorithm:
    VI: v,
    PI: p,
    Q: q: ''')
    if y == '1':
        P, R = forest(S=250, r1=10, r2=5)
        if x == 'v':
            value_iteration(P, R, id='250')
        elif x == 'p':
            policy_iteration(P, R, id='250')
        elif x == 'q':
            Q_learner(P, R, id='250')
    elif y == '2':
        P, R = forest(S=1000, r1=15, r2=5)
        if x == 'v':
            value_iteration(P, R, id='1000')
        elif x == 'p':
            policy_iteration(P, R, id='1000')
        elif x == 'q':
            Q_learner(P, R, id='1000')
def run_forest():
    np.random.seed(0)
    P, R = example.forest(S=5, r1=3, r2=15, p=0.2)
    print("Transition Array: ")
    print(P.shape)
    print(P)  # Transition array A x S x S
    print("Reward Array: ")
    print(R.shape)
    print(R)  # Reward array S x A

    # TODO
    gamma_range = np.array([0.1, 0.9, 0.99])
    alpha_range = np.array([0.01, 0.5, 0.99])
    epsilon_range = np.array([0.1, 0.5, 0.95])
    e_decay_range = np.array([0.1, 0.5, 0.999])
    # gamma_range = np.append(np.linspace(0.1, 0.9, 9), np.linspace(0.91, 0.99, 9))
    # alpha_range = np.append(np.linspace(0.01, 0.1, 9), np.linspace(0.2, 0.99, 4))
    # epsilon_range = np.linspace(0.1, 1.0, 10)
    # e_decay_range = np.append(np.linspace(0.1, 0.9, 4), np.linspace(0.91, 0.99, 9))

    difference_list = np.zeros(gamma_range.shape)
    value_iteration_list = np.zeros(gamma_range.shape)
    value_time_list = np.zeros(gamma_range.shape)
    value_reward_list = np.zeros(gamma_range.shape)
    value_error_list = np.zeros(gamma_range.shape)
    policy_iteration_list = np.zeros(gamma_range.shape)
    policy_time_list = np.zeros(gamma_range.shape)
    policy_reward_list = np.zeros(gamma_range.shape)
    policy_error_list = np.zeros(gamma_range.shape)

    for i, gamma in enumerate(gamma_range):
        print('Gamma %0.2f' % gamma)
        vi = mdp.ValueIteration(transitions=P, reward=R, gamma=gamma,
                                epsilon=0.0001, max_iter=10000)
        vi.setVerbose()
        vi.run()
        vi_stats = vi.run_stats
        value_iteration_list[i] = vi_stats[-1]['Iteration']
        value_time_list[i] = vi_stats[-1]['Time']
        value_reward_list[i] = vi_stats[-1]['Reward']
        value_error_list[i] = vi_stats[-1]['Error']
        plot_stats(vi_stats, ('vi_forest_%0.2f' % gamma))

        pi = mdp.PolicyIteration(transitions=P, reward=R, gamma=gamma,
                                 max_iter=10000, eval_type=1)
        pi.setVerbose()
        pi.run()
        stats = pi.run_stats
        policy_iteration_list[i] = stats[-1]['Iteration']
        policy_time_list[i] = stats[-1]['Time']
        policy_reward_list[i] = stats[-1]['Reward']
        policy_error_list[i] = stats[-1]['Error']
        plot_stats(stats, ('pi_forest_%0.2f' % gamma))

        print('Policies Found')
        print('Value Iteration: ' + str(vi.policy))
        print('Policy Iteration: ' + str(pi.policy))
        difference1 = sum([abs(x - y) for x, y in zip(pi.policy, vi.policy)])
        difference_list[i] = difference1
        print("Discrepancy in Policy and Value Iteration: ", difference1)
        print()

    # Plotting
    # Error v Iteration
    plt.clf()
    plt.title('Value Iteration: Error v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Error')
    plt.plot(list(value_iteration_list), list(value_error_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_error_v_iteration.png')

    # Reward v Gamma
    plt.clf()
    plt.title('Value Iteration: Reward v Gamma')
    plt.xlabel('Gamma')
    plt.ylabel('Reward')
    plt.plot(list(gamma_range), list(value_reward_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_reward_v_gamma.png')

    # Gamma v Iterations
    plt.clf()
    plt.title('Value Iteration: Gamma v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Gamma')
    plt.plot(list(value_iteration_list), list(gamma_range))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_gamma_v_iterations.png')

    # Gamma v Time
    plt.clf()
    plt.title('Value Iteration: Gamma v Time')
    plt.xlabel('Time')
    plt.ylabel('Gamma')
    plt.plot(list(value_time_list), list(gamma_range))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_gamma_v_time.png')

    # Reward v Iterations
    plt.clf()
    plt.title('Value Iteration: Reward v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Reward')
    plt.plot(list(value_iteration_list), list(value_reward_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_reward_v_iterations.png')

    # Policy Iteration
    # Error v Iteration
    plt.clf()
    plt.title('Policy Iteration: Error v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Error')
    plt.plot(list(policy_iteration_list), list(policy_error_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_error_v_iteration.png')

    # Reward v Gamma
    plt.clf()
    plt.title('Policy Iteration: Reward v Gamma')
    plt.xlabel('Gamma')
    plt.ylabel('Reward')
    plt.plot(list(gamma_range), list(policy_reward_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_reward_v_gamma.png')

    # Gamma v Iterations
    plt.clf()
    plt.title('Policy Iteration: Gamma v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Gamma')
    plt.plot(list(policy_iteration_list), list(gamma_range))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_gamma_v_iterations.png')

    # Gamma v Time
    plt.clf()
    plt.title('Policy Iteration: Gamma v Time')
    plt.xlabel('Time')
    plt.ylabel('Gamma')
    plt.plot(list(policy_time_list), list(gamma_range))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_gamma_v_time.png')

    # Reward v Iterations
    plt.clf()
    plt.title('Policy Iteration: Reward v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Reward')
    plt.plot(list(policy_iteration_list), list(policy_reward_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_reward_v_iterations.png')

    # Gamma v Policy Differences
    plt.clf()
    plt.title('Gamma v Policy Differences')
    plt.xlabel('Gamma')
    plt.ylabel('Policy Differences')
    plt.plot(list(gamma_range), list(difference_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/gamma_v_differences.png')
    plt.close('all')

    prev_Q = None
    thresh = 1e-4
    print('== Q Learning ==')
    # Grid search over Q-learning hyperparameters; stop early once the
    # Q-table stabilizes or the windowed reward clears the threshold.
    for i, gamma in enumerate(gamma_range):
        for j, alpha in enumerate(alpha_range):
            for k, ep in enumerate(epsilon_range):
                for l, ed in enumerate(e_decay_range):
                    # print('ql: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'.format(gamma, alpha, ep, ed))
                    ql = mdp.QLearning(transitions=P, reward=R, gamma=gamma,
                                       alpha=alpha, alpha_decay=1.0, alpha_min=0.001,
                                       epsilon=ep, epsilon_min=0.1, epsilon_decay=ed,
                                       n_iter=100000)
                    stats = ql.run()
                    plot_stats(stats, ('ql_forest_%0.2f_%0.2f_%0.2f_%0.2f' % (gamma, alpha, ep, ed)))
                    # print('Policy: ')
                    # print(ql.policy)
                    # print(ql.run_stats)
                    df = pd.DataFrame.from_records(ql.run_stats)
                    iteration_list = df['Iteration'][-100:]
                    windowed_reward = df['Reward'][-100:].mean()
                    error_list = df['Error'][-100:].mean()
                    if prev_Q is None:
                        prev_Q = ql.Q
                    else:
                        variation = np.absolute(np.subtract(np.asarray(ql.Q), np.asarray(prev_Q))).max()
                        res = np.abs(np.subtract(np.asarray(prev_Q), np.asarray(ql.Q)))
                        print('Result: ')
                        print(res)
                        print('Variation: ')
                        print(variation)
                        print('Mean Reward for Last 100 Iterations:')
                        print(windowed_reward)
                        if np.all(res < thresh) or variation < thresh or windowed_reward > 1.0:
                            print('Breaking! Below Thresh')
                            print('Found at: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'.format(
                                gamma, alpha, ep, ed))
                            print('Optimal Policy: ')
                            print(ql.policy)
                            break
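# Note that the bare `break` above only exits the innermost (e_decay) loop.
# A minimal sketch of one way to stop the whole grid search, by wrapping it
# in a function and returning on convergence (names and thresholds assumed):
import numpy as np
import hiive.mdptoolbox.mdp as mdp

def ql_grid_search(P, R, gamma_range, alpha_range, epsilon_range, e_decay_range, thresh=1e-4):
    prev_Q = None
    for gamma in gamma_range:
        for alpha in alpha_range:
            for ep in epsilon_range:
                for ed in e_decay_range:
                    ql = mdp.QLearning(transitions=P, reward=R, gamma=gamma,
                                       alpha=alpha, alpha_decay=1.0, alpha_min=0.001,
                                       epsilon=ep, epsilon_min=0.1, epsilon_decay=ed,
                                       n_iter=100000)
                    ql.run()
                    if prev_Q is not None:
                        variation = np.abs(np.asarray(ql.Q) - np.asarray(prev_Q)).max()
                        if variation < thresh:
                            # return exits all four loops at once
                            return (gamma, alpha, ep, ed), ql.policy
                    prev_Q = ql.Q
    return None, None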
from hiive.mdptoolbox import example, mdp

# Small forest example from the mdptoolbox documentation.
P, R = example.forest()
vi = mdp.ValueIteration(P, R, 0.96)
print(vi.verbose)
vi.run()
expected = (5.93215488, 9.38815488, 13.38815488)
print(all(abs(expected[k] - vi.V[k]) < 1e-12 for k in range(len(expected))))
print(vi.policy)
print(vi.iter)

from hiive import mdptoolbox
import numpy as np

# Two-state, two-action MDP specified with explicit matrices.
P = np.array([[[0.5, 0.5], [0.8, 0.2]], [[0, 1], [0.1, 0.9]]])
R = np.array([[5, 10], [-1, 2]])
vi = mdptoolbox.mdp.ValueIteration(P, R, 0.9)
vi.run()
expected = (40.048625392716815, 33.65371175967546)
print(all(abs(expected[k] - vi.V[k]) < 1e-12 for k in range(len(expected))))
print(vi.policy)
print(vi.iter)

from hiive import mdptoolbox
import numpy as np
from scipy.sparse import csr_matrix as sparse
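# The chunk ends right after importing csr_matrix. In the mdptoolbox
# documentation this import is followed by a sparse version of the same
# two-state example; roughly (reusing R from the dense run above):
P = [None] * 2
P[0] = sparse([[0.5, 0.5], [0.8, 0.2]])
P[1] = sparse([[0, 1], [0.1, 0.9]])
vi = mdptoolbox.mdp.ValueIteration(P, R, 0.9)
vi.run()
print(vi.policy)  # expected (0, 0), matching the dense run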
# In[1]:

from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
import numpy as np
from numpy.random import choice
import pandas as pd
from matplotlib import pyplot as plt

# In[2]:

np.random.seed(100)
P, R = forest(S=500, r1=100, r2=25, p=0.01)

# In[3]:

def test_policy(P, R, policy, test_count=100, gamma=0.9):
    num_state = P.shape[-1]
    total_episode = num_state * test_count  # start in each state
    total_reward = 0
    for state in range(num_state):
        state_reward = 0
        for state_episode in range(test_count):
            episode_reward = 0
            disc_rate = 1
import numpy as np
import pandas as pd
import gym
from hiive.mdptoolbox.example import forest
import matplotlib.pyplot as plt

PROBS = {}
PROBS["forest"] = forest(S=1000, p=0.001, r1=100, r2=10)

# Convert the gym FrozenLake dynamics into mdptoolbox-style P (A x S x S)
# and R (A x S x S) arrays.
env = gym.make("FrozenLake8x8-v0")
nA, nS = env.nA, env.nS
P = np.zeros([nA, nS, nS])
R = np.zeros([nA, nS, nS])
DONE = np.zeros(nS)
for s in range(nS):
    for a in range(nA):
        transitions = env.P[s][a]
        for p_trans, next_s, reward, done in transitions:
            P[a, s, next_s] += p_trans
            R[a, s, next_s] += reward
            DONE[next_s] = done
        P[a, s, :] /= np.sum(P[a, s, :])  # renormalize each row to a distribution
PROBS["frozen_lake"] = (P, R)

nA, nS = env.nA, env.nS
P = np.zeros([nA, nS, nS])
R = np.zeros([nA, nS, nS])
DONE = np.zeros(nS)
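# Since mdptoolbox validates that each P[a] is row-stochastic, a quick sanity
# check on the extracted matrices (a minimal sketch using the arrays built
# above) catches normalization mistakes early:
P_check, R_check = PROBS["frozen_lake"]
assert np.allclose(P_check.sum(axis=2), 1.0), "P is not row-stochastic"
print("Transition matrices OK:", P_check.shape, R_check.shape)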
import hiive.mdptoolbox.example as example
import hiive.mdptoolbox.mdp as mdp

states = 50
P, R = example.forest(S=states)
# pi = mdp.QLearning(P, R, 0.99, n_iter=500000, alpha=0.3, alpha_min=0.1, epsilon_min=0.1, epsilon_decay=0.9999)
pi = mdp.ValueIteration(P, R, 0.99)
pi.run()
# print("deltas_" + str(gamma)[2:] + " = " + str(pi.deltas))
for x in pi.run_stats:
    print(x)
print(pi.policy)
last = pi.run_stats[-1]
print('Time: ', last['Time'], "Reward: ", last['Reward'])
                         alpha_decay=0.99, epsilon_decay=0.99, n_iter=max_iter).run()
        q_stats.append(q)
    return q_stats, gammas


def vi_pi_q_comp(P, R):
    vi = mdp.ValueIteration(P, R, 0.60, epsilon=0.001).run()
    pi = mdp.PolicyIteration(P, R, 0.60, eval_type=1).run()
    q = mdp.QLearning(P, R, 0.6, alpha=0.2).run()
    return vi, pi, q


P1, R1 = example.forest(10000, p=0.5)
# P1, R1 = example.forest(10000)
env = gym.make('Taxi-v3')
states = env.observation_space.n
actions = env.action_space.n
P2, R2 = build_matrix(env, states, actions)

# VI Gamma
vi_gamma_forest, gamma_forest = vi_experiment_gamma(P1, R1)
vi_gamma_taxi, gamma_taxi = vi_experiment_gamma(P2, R2)
print(vi_gamma_taxi[-2][-1], vi_gamma_forest[-2][-1])
plot_vi_gamma(vi_gamma_taxi, vi_gamma_forest, gamma_taxi, gamma_forest)

# VI Epsilon
vi_epsilon_forest, epsilon_forest = vi_experiment_epsilon(P1, R1)
# # A forest is managed by two actions, WAIT and CUT. An action is decided each
# year with two objectives: maintain the forest for wildlife (WAIT) or cut the
# forest for wood (CUT). Cutting yields an immediate reward of 1 and returns
# the forest to its initial state. The agent can instead wait, letting the
# forest grow one state per year with probability 1-p, in the hope of reaching
# the maximum reward in the final, oldest state. With probability p, however,
# a wildfire burns the forest down and it returns to the initial state with no
# reward.

# In[1]:

from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
import numpy as np
from numpy.random import choice
import pandas as pd
from matplotlib import pyplot as plt

# In[2]:

np.random.seed(100)
P, R = forest(S=25, r1=20, r2=5, p=0.1)

# In[3]:

def test_policy(P, R, policy, test_count=1000, gamma=0.9):
    num_state = P.shape[-1]
    total_episode = num_state * test_count  # start in each state
    total_reward = 0
    for state in range(num_state):
        state_reward = 0
        for state_episode in range(test_count):
            episode_reward = 0
            disc_rate = 1
            while True:
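# The cell above is cut off inside test_policy. Below is a plausible
# completion of the episode loop, sketched under the conventions used here
# (P indexed [action, state, next_state], R indexed [state, action]); this is
# an assumed reconstruction, not the author's original code.
import numpy as np
from numpy.random import choice

def test_policy_sketch(P, R, policy, test_count=1000, gamma=0.9, tol=1e-4):
    """Estimate the mean discounted reward of `policy` by simulation."""
    num_state = P.shape[-1]
    total_episode = num_state * test_count  # start once in each state
    total_reward = 0.0
    for start_state in range(num_state):
        for _ in range(test_count):
            state = start_state
            episode_reward = 0.0
            disc_rate = 1.0
            while disc_rate > tol:  # stop once further rewards are negligible
                action = policy[state]
                episode_reward += R[state, action] * disc_rate
                disc_rate *= gamma
                # sample the next state from the transition distribution
                state = choice(num_state, p=P[action, state])
            total_reward += episode_reward
    return total_reward / total_episode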
    for i in range(lake_size):
        for j in range(lake_size):
            text = ax.text(j, i, pol_matrix[i, j],
                           ha="center", va="center", color="w")
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title("QL policy")
    plt.savefig("charts/lake_ql_viz")

# Non-grid-world problem: forest management, large, 5000 states
transitions, rewards = example.forest(S=5000)

# Tune PI/VI gamma values
tune_gamma = False
if tune_gamma:
    gamma_range = np.linspace(0.01, 0.99, 99)
    vi_iter = []
    pi_iter = []
    vi_time = []
    pi_time = []
    vi_max_v = []
    pi_max_v = []
    for gamma in gamma_range:
        vi = mdp.ValueIteration(transitions, rewards,
from hiive.mdptoolbox.mdp import ValueIteration, QLearning, PolicyIteration
from hiive.mdptoolbox.example import forest
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

P, R = forest(2000)
compare_VI_QI_policy = []  # True or False
compare_VI_PI_policy = []

Gamma = 1
Epsilon = 0.0000000000000000000000000000000000000000000000000000000000000000000000000001  # effectively zero
Max_Iterations = 200000

# run VI
VI = ValueIteration(P, R, Gamma, Epsilon, Max_Iterations)
VI.setVerbose()
VI.run()
print('VI')
print(VI.iter)
print(VI.time)
print(VI.run_stats[-1:])

iterations = np.zeros(len(VI.run_stats))
reward = np.zeros(len(VI.run_stats))
for i, stat in enumerate(VI.run_stats):
    iterations[i] = stat['Iteration']
    reward[i] = stat['Reward']
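# The chunk cuts off while collecting run statistics. A plausible next step,
# given the arrays built above, is plotting reward per recorded iteration
# (a sketch; the figure filename is assumed):
plt.figure()
plt.plot(iterations, reward, label='Value Iteration')
plt.xlabel('Iteration')
plt.ylabel('Reward')
plt.title('VI: Reward vs Iteration (forest, S=2000)')
plt.legend()
plt.savefig('vi_forest_reward.png')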