def main(args):
    # environment
    env = GridWorld()
    # agent
    agent = TDAgent(
        env,
        epsilon=args.epsilon,
        gamma=args.discount,
        alpha=0.05,
        lamda=0.7)
    agent.control(method=args.algorithm)
                G = Qs[tau % n]
                for k in range(tau, min(tau + n - 1, T - 1)):
                    G += Z * deltas[k % n]
                    Z = gamma * Z * ((1 - sigma) * pis[(k + 1) % n] + sigma)
                    p = p * (1 - sigma + sigma * ratios[k % n])
                s = states[tau % n]
                a = actions[tau % n]
                # Update state-action value function.
                Q[s, a] += alpha * p * (G - Q[s, a])
                action_values = [Q[s, i] for i in range(4)]
                policy[s] = np.argmax(action_values)
            t += 1
        epsilon = decay(epsilon)
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy


if __name__ == '__main__':
    n = 4
    alpha = 0.0001
    gamma = 1
    sigma = 0.5
    epsilon = 1
    n_episodes = 50000
    n_tests = 10

    env = GridWorld()
    policy = n_step_Q_sigma(env, n, alpha, gamma, sigma, epsilon, n_episodes)
    test_policy(env, policy, n_tests)
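# --- Illustration (not part of the original file) -------------------------
# A minimal, self-contained sketch of the one-step Q(sigma) target that the
# n-step loop above generalizes: sigma = 1 recovers a Sarsa-style sampled
# backup, sigma = 0 recovers a tree-backup-style expected backup. The
# function name and arguments below are hypothetical, for illustration only.
def one_step_q_sigma_target(r, q_next, pi_next, a_next, sigma, gamma):
    # Expected action value under the target policy (tree-backup component).
    expected_q = sum(pi_next[a] * q_next[a] for a in range(len(q_next)))
    # Action value of the action actually taken (Sarsa component).
    sampled_q = q_next[a_next]
    # Interpolate between the sampled and expected backups with sigma.
    return r + gamma * (sigma * sampled_q + (1 - sigma) * expected_q)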
        r_t = env.step(a_t)
        # updates
        cumulative_reward += r_t * gamma**step
        step += 1
        locs.append(env.get_agent_loc())
        # termination condition
        if env.is_terminal():
            break
    return cumulative_reward, step, locs


'''train
'''
# define env and agent
env = GridWorld(has_bomb=True)
state_dim = env.height * env.width
n_actions = len(ACTIONS)
agent = Agent(dim_input=state_dim, dim_output=n_actions)

# training params
max_steps = 100
gamma = .9
alpha = 0.03
n_epochs = 1000
n_trials = 5

log_return = np.zeros((n_epochs, 2, n_trials))
log_steps = np.zeros((n_epochs, 2, n_trials))

for epoch_id in range(n_epochs):
def main(args):
    env = GridWorld()
    agent = TDAgent(env, epsilon=args.epsilon, gamma=args.discount, alpha=args.lr)
    agent.control(method=args.algorithm)
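# --- Illustration (not part of the original file) -------------------------
# A minimal sketch of how the `args` namespace used by main() could be
# built. The flag names below are assumptions inferred from the attributes
# accessed above (args.epsilon, args.discount, args.lr, args.algorithm),
# not the repository's actual CLI.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description='Tabular TD control on GridWorld.')
    parser.add_argument('--algorithm', default='q_learning',
                        help='control method passed to TDAgent.control()')
    parser.add_argument('--epsilon', type=float, default=0.1,
                        help='exploration rate for the epsilon-greedy policy')
    parser.add_argument('--discount', type=float, default=0.99,
                        help='discount factor gamma')
    parser.add_argument('--lr', type=float, default=0.05,
                        help='learning rate alpha')
    return parser.parse_args()


if __name__ == '__main__':
    main(parse_args())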