def run_agent_Q_RMS_param(num_runs, num_episodes, discount, step_size, step=1, agent_type="Sarsa"):
    """ Run the chosen n-step agent and return the average Q-value RMS error over episodes and runs """
    mdp = RandomWalk(19, -1)
    s = mdp.init()
    # ground truth Q-values under the equiprobable policy (terminal states excluded)
    gt_Q = np.asarray(mdp.Q_equiprobable(discount)[1:-1])
    # per-episode RMS error over all non-terminal states
    rms_err = np.zeros(num_episodes)
    sum_rms_err = 0.0
    # create the n-step agent
    print("Starting agent {}-step {}".format(step, agent_type))
    if agent_type.lower() == "sarsa":
        agent = Sarsa(mdp, s, step)
    elif agent_type.lower() == "expsarsa":
        agent = ExpSARSA(mdp, s, step)
    elif agent_type.lower() == "treebackup":
        agent = TreeBackup(mdp, s, step)
    elif agent_type.lower() == "qsigma":
        agent = QSigma(mdp, 0.5, s, step)
    else:
        raise ValueError("Unknown agent type: {}".format(agent_type))
    for run in range(num_runs):
        for i in range(num_episodes):
            agent.episode(discount, step_size, 10000)
            agent.init()
            rms_err[i] = np.sqrt(np.mean(np.square(np.asarray(agent.Q[1:-1]) - gt_Q)))
        sum_rms_err += np.sum(rms_err)
        # reset Q after each run
        agent.reset_Q()
    # averaged over num_runs and num_episodes
    return sum_rms_err / (num_runs * num_episodes)
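# A minimal usage sketch for run_agent_Q_RMS_param above: it sweeps the step size for
# each of the four agent types and prints the averaged RMS errors. The sweep settings
# (50 runs, 10 episodes, n = 3) and the helper itself are illustrative assumptions,
# not taken from the repo.
def sweep_step_sizes_example():
    """ Hypothetical helper: compare the n-step agents over a range of step sizes """
    step_sizes = np.linspace(0.1, 1.0, 10)
    for agent_type in ("Sarsa", "ExpSarsa", "TreeBackup", "QSigma"):
        avg_errs = [run_agent_Q_RMS_param(num_runs=50, num_episodes=10, discount=1.0,
                                          step_size=a, step=3, agent_type=agent_type)
                    for a in step_sizes]
        print(agent_type, ["{:.3f}".format(e) for e in avg_errs])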
def run_agent_RMS_value(num_runs, num_episodes, discount, step_size, step=1):
    """ Run an n-step Sarsa agent and return the per-episode RMS error of the state values """
    mdp = RandomWalk(19, -1)
    s = mdp.init()
    # ground truth state values under the equiprobable policy (terminal states excluded)
    gt_v = np.asarray(mdp.value_equiprobable(discount)[1:-1])
    # initial value estimates
    init_v = np.asarray([0.5] * mdp.num_states())[1:-1]
    # arrays for the RMS error over all non-terminal states
    rms_err = np.zeros(num_episodes + 1)
    sum_rms_err = np.zeros(num_episodes + 1)
    rms_err[0] = np.sqrt(np.mean(np.square(init_v - gt_v)))
    # create the n-step Sarsa agent
    agent = Sarsa(mdp, s, step)
    for run in range(num_runs):
        for i in range(num_episodes):
            agent.episode(discount, step_size, 10000)
            agent.init()
            rms_err[i + 1] = np.sqrt(np.mean(np.square(np.asarray(agent.Q_to_value()[1:-1]) - gt_v)))
        sum_rms_err += rms_err
        # reset Q after each run
        agent.reset_Q()
    # averaged over num_runs
    return sum_rms_err / num_runs
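# A minimal usage sketch for run_agent_RMS_value above: it plots the per-episode value
# RMS error for 1-step Sarsa. The settings (100 runs, 10 episodes, alpha = 0.4) and the
# matplotlib plotting are illustrative assumptions, not taken from the repo.
def plot_value_rms_example():
    """ Hypothetical helper: plot the per-episode value RMS error for 1-step Sarsa """
    import matplotlib.pyplot as plt  # assumed available; not shown in the repo's imports
    rms_curve = run_agent_RMS_value(100, 10, discount=1.0, step_size=0.4, step=1)
    plt.plot(np.arange(len(rms_curve)), rms_curve, label="1-step Sarsa")
    plt.xlabel("Episode")
    plt.ylabel("RMS error of state values")
    plt.legend()
    plt.show()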
def example_randomwalk():
    """ An example on the random walk MDP with 1-step Sarsa """
    # create an MDP
    env = RandomWalk(19)
    # create 1-step Sarsa agents
    agent = Sarsa(env, env.init(), 1)
    agent2 = Sarsa(env, env.init(), 1)
    # act using the equiprobable random policy with discount = 0.9 and step size = 0.1
    num_episode = 100
    for _ in range(num_episode):
        agent.episode(0.9, 0.1)
        agent.init()
    # act using an eps-greedy policy with eps = 0.5
    agent2.set_policy_eps_greedy(0.5)
    for _ in range(num_episode):
        agent2.episode(0.9, 0.1)
        agent2.init()
    print('Equiprobable Q_SARSA[s][a]', agent.Q)
    print('Eps greedy Q_SARSA[s][a]', agent2.Q)
def run_agent_value(num_episodes, discount, step_size, step=1):
    """ Run an n-step Sarsa agent for num_episodes and return the learned state values """
    mdp = RandomWalk(19)
    s = mdp.init()
    # create the n-step Sarsa agent
    agent = Sarsa(mdp, s, step)
    for i in range(num_episodes):
        agent.episode(discount, step_size)
        agent.init()
    return agent.Q_to_value()
def example_randomwalk():
    """ An example on the random walk MDP """
    # create an MDP
    env = RandomWalk(19, -1)
    # create n-step TreeBackup agents (n = 3)
    agent = TreeBackup(env, env.init(), 3)
    agent2 = TreeBackup(env, env.init(), 3)
    # act using the equiprobable random policy with discount = 0.9 and step size = 0.1
    num_episode = 1000
    for _ in range(num_episode):
        agent.episode(0.9, 0.1)
        agent.init()
    agent2.set_policy_eps_greedy(0.1)
    for _ in range(num_episode):
        agent2.episode(0.9, 0.1)
        agent2.init()
    print('Q_DP[s][a] ', env.Q_equiprobable(0.9))
    print('Q_eps_greedy[s][a] ', env.Q_eps_greedy(0.1, 0.9))
    print('Equiprobable Q_TreeBackup[s][a]', agent.Q)
    print('Eps greedy Q_TreeBackup[s][a]', agent2.Q)
def example_randomwalk():
    """ An example on the random walk MDP """
    # create an MDP
    env = RandomWalk(19, -1)
    # create n-step QSigma agents (Psigma = 0.5, init_state = env.init(), steps = 3)
    agent = QSigma(env, 0.5, env.init(), 3)
    agent2 = QSigma(env, 0.5, env.init(), 3)
    # act using the equiprobable random policy with discount = 0.9 and step size = 0.1
    num_episode = 1000
    for _ in range(num_episode):
        agent.episode(0.9, 0.1)
        agent.init()
    agent2.set_policy_eps_greedy(0.1)
    for _ in range(num_episode):
        agent2.episode(0.9, 0.1)
        agent2.init()
    print('Q_DP[s][a] ', env.Q_equiprobable(0.9))
    print('Q_eps_greedy[s][a] ', env.Q_eps_greedy(0.1, 0.9))
    print('Equiprobable Q_Q(sigma)[s][a]', agent.Q)
    print('Eps greedy Q_Q(sigma)[s][a]', agent2.Q)
def decay_agent(n=1, alpha=0.5, episodes=100, ep_start=30, decay=0.7):
    """ Run an n-step Q(sigma) agent with sigma decay and return the average Q-value RMS error """
    mdp = RandomWalk(19, -1)
    s = mdp.init()
    num_runs = 250
    num_episodes = episodes
    discount = 1.0
    step_size = alpha
    steps = n
    # ground truth Q-values under the equiprobable policy
    gt_Q = mdp.Q_equiprobable(1.0)
    rms_err = 0.0
    # create the n-step Q(sigma) agent
    agent = QSigma(mdp, 1.0, s, steps)
    agent.set_policy_equiprobable()
    for run in range(num_runs):
        sqerr = 0.0
        agent._Psigma = 1.0
        for i in range(num_episodes):
            # decay sigma after ep_start episodes
            if i > ep_start:
                agent._Psigma *= decay
            agent.episode(discount, step_size)
            agent.init()
        # incremental mean of the squared Q-value error over all state-action pairs
        count = 0
        for s in range(mdp.num_states()):
            for a in range(mdp.num_actions(s)):
                count += 1
                sqerr += (1 / count) * ((agent.Q[s][a] - gt_Q[s][a])**2 - sqerr)
        rms_err += sqerr**0.5
        # reset Q after each run
        agent.reset_Q()
    # averaged over num_runs
    rms_err /= num_runs
    return rms_err
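# A minimal usage sketch for decay_agent above: it compares the averaged RMS error for a
# few sigma decay rates. The decay rates and other settings are illustrative assumptions,
# not taken from the repo.
def compare_sigma_decay_example():
    """ Hypothetical helper: compare average RMS error for several sigma decay rates """
    for decay in (0.5, 0.7, 0.9, 0.95):
        err = decay_agent(n=3, alpha=0.4, episodes=100, ep_start=30, decay=decay)
        print("decay = {:.2f}: average RMS error = {:.4f}".format(decay, err))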
def run_agent_RMS_Q(num_runs, num_episodes, discount, step_size, step=1):
    """ Run an n-step Sarsa agent and return the per-episode RMS errors of the Q-values for the left and right actions """
    mdp = RandomWalk(19)
    s = mdp.init()
    # ground truth Q-values under the equiprobable policy (terminal states excluded)
    gt_Q = np.asarray(mdp.Q_equiprobable(discount)[1:-1])
    gt_Q_left = gt_Q[:, 0]
    gt_Q_right = gt_Q[:, 1]
    # initial Q-values derived from a constant initial value estimate of 0.5
    v = np.asarray([0.5] * mdp.num_states())
    v[0], v[-1] = 0.0, 0.0
    init_Q_left = np.asarray(mdp.value_to_Q(v, discount)[1:-1])[:, 0]
    init_Q_right = np.asarray(mdp.value_to_Q(v, discount)[1:-1])[:, 1]
    # arrays for the RMS error over all non-terminal states
    rms_err_left = np.zeros(num_episodes + 1)   # Q[left]
    rms_err_right = np.zeros(num_episodes + 1)  # Q[right]
    sum_rms_err_left = np.zeros(num_episodes + 1)
    sum_rms_err_right = np.zeros(num_episodes + 1)
    rms_err_left[0] = np.sqrt(np.mean(np.square(init_Q_left - gt_Q_left)))
    rms_err_right[0] = np.sqrt(np.mean(np.square(init_Q_right - gt_Q_right)))
    # create the n-step Sarsa agent
    agent = Sarsa(mdp, s, step)
    for run in range(num_runs):
        for i in range(num_episodes):
            agent.episode(discount, step_size, 10000)
            agent.init()
            rms_err_left[i + 1] = np.sqrt(np.mean(np.square(np.asarray(agent.Q[1:-1])[:, 0] - gt_Q_left)))
            rms_err_right[i + 1] = np.sqrt(np.mean(np.square(np.asarray(agent.Q[1:-1])[:, 1] - gt_Q_right)))
        sum_rms_err_left += rms_err_left
        sum_rms_err_right += rms_err_right
        # reset Q after each run
        agent.reset_Q()
    # averaged over num_runs
    return sum_rms_err_left / num_runs, sum_rms_err_right / num_runs