Example #1
def main(args):
    # environment
    env = GridWorld()
    # agent ('lamda' is the trace-decay parameter, spelled that way to avoid
    # shadowing Python's 'lambda' keyword)
    agent = TDAgent(env, epsilon=args.epsilon, gamma=args.discount,
                    alpha=0.05, lamda=0.7)
    agent.control(method=args.algorithm)
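The `args` object is presumably built with argparse. A minimal sketch of a parser exposing the fields this snippet reads (the flag names and defaults are assumptions, not taken from the source):

import argparse

if __name__ == '__main__':
    # Hypothetical CLI: only the attribute names (epsilon, discount,
    # algorithm) are implied by the snippet; flags and defaults are guesses.
    parser = argparse.ArgumentParser()
    parser.add_argument('--epsilon', type=float, default=0.1)
    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--algorithm', choices=['sarsa', 'qlearning'],
                        default='sarsa')
    main(parser.parse_args())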
Example #2
                # n-step return estimate, built from the stored TD errors.
                G = Qs[tau % n]
                Z = 1.0  # decaying weight applied to each TD error
                p = 1.0  # cumulative importance-sampling correction (rho)
                # k runs inclusively from tau to min(tau + n - 1, T - 1).
                for k in range(tau, min(tau + n, T)):
                    G += Z * deltas[k % n]
                    Z = gamma * Z * ((1 - sigma) * pis[(k + 1) % n] + sigma)
                    p = p * (1 - sigma + sigma * ratios[k % n])
                s = states[tau % n]
                a = actions[tau % n]
                # Update the state-action value function.
                Q[s, a] += alpha * p * (G - Q[s, a])
                # Greedy policy improvement over the four actions.
                policy[s] = np.argmax([Q[s, i] for i in range(4)])
            t += 1
        epsilon = decay(epsilon)
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy


if __name__ == '__main__':
    n = 4
    alpha = 0.0001
    gamma = 1
    sigma = 0.5
    epsilon = 1
    n_episodes = 50000
    n_tests = 10
    env = GridWorld()
    policy = n_step_Q_sigma(env, n, alpha, gamma, sigma, epsilon, n_episodes)
    test_policy(env, policy, n_tests)
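The tau-update above consumes circular buffers (`Qs`, `deltas`, `pis`, `ratios`) that the time loop fills in as the episode unrolls. A hedged sketch of how those entries are typically computed for n-step Q(sigma); the buffer names match the snippet, while `pi_probs` and `mu_probs` (target- and behavior-policy action probabilities) are hypothetical:

def store_step_quantities(t, r, s_next, a_next, Q, pi_probs, mu_probs,
                          gamma, sigma, n, Qs, deltas, pis, ratios):
    """Fill the length-n circular buffers consumed by the tau-update."""
    pis[(t + 1) % n] = pi_probs[s_next][a_next]
    ratios[(t + 1) % n] = pi_probs[s_next][a_next] / mu_probs[s_next][a_next]
    Qs[(t + 1) % n] = Q[s_next, a_next]
    # Expected (tree-backup) value of the next state under the target policy.
    v_bar = sum(pi_probs[s_next][a] * Q[s_next, a] for a in range(4))
    # sigma blends the Sarsa sample (sigma=1) with the expectation (sigma=0).
    deltas[t % n] = (r + gamma * (sigma * Qs[(t + 1) % n]
                                  + (1 - sigma) * v_bar)
                     - Qs[t % n])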
Example #3
        r_t = env.step(a_t)
        # accumulate the discounted return and advance the step counter
        cumulative_reward += r_t * gamma**step
        step += 1
        locs.append(env.get_agent_loc())
        # stop the rollout once a terminal state is reached
        if env.is_terminal():
            break
    return cumulative_reward, step, locs
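The fragment above is the tail of a rollout loop. A self-contained version of such a function, assuming a hypothetical `agent.act(state)` plus `env.reset()` and `env.get_state()`, alongside the `env.step`, `env.get_agent_loc`, and `env.is_terminal` calls visible in the snippet:

def run_rollout(env, agent, gamma, max_steps):
    """Roll out one episode; return (discounted return, steps, visited locs).

    A sketch only: reset/get_state/act are guesses about the interface.
    """
    env.reset()
    cumulative_reward, step = 0.0, 0
    locs = [env.get_agent_loc()]
    for _ in range(max_steps):
        a_t = agent.act(env.get_state())
        r_t = env.step(a_t)
        cumulative_reward += r_t * gamma**step
        step += 1
        locs.append(env.get_agent_loc())
        if env.is_terminal():
            break
    return cumulative_reward, step, locs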


'''train
'''

# define env and agent
env = GridWorld(has_bomb=True)
state_dim = env.height * env.width
n_actions = len(ACTIONS)
agent = Agent(dim_input=state_dim, dim_output=n_actions)

# training params
max_steps = 100
gamma = .9
alpha = 0.03
n_epochs = 1000
n_trials = 5

log_return = np.zeros((n_epochs, 2, n_trials))
log_steps = np.zeros((n_epochs, 2, n_trials))

for epoch_id in range(n_epochs):
    pass  # loop body truncated in the source; see the sketch below
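The body of this training loop is cut off in the source. Purely as an illustration of a shape consistent with the logging arrays above (the middle dimension of 2 is assumed to separate a training rollout from an evaluation rollout; `agent.update` is hypothetical, and `run_rollout` is the sketch defined earlier):

for epoch_id in range(n_epochs):
    for trial_id in range(n_trials):
        # exploratory training rollout followed by a learning step
        ret, n_steps, _ = run_rollout(env, agent, gamma, max_steps)
        agent.update(alpha)  # hypothetical policy/value update
        log_return[epoch_id, 0, trial_id] = ret
        log_steps[epoch_id, 0, trial_id] = n_steps
        # evaluation rollout, logged in the second slot
        eval_ret, eval_steps, _ = run_rollout(env, agent, gamma, max_steps)
        log_return[epoch_id, 1, trial_id] = eval_ret
        log_steps[epoch_id, 1, trial_id] = eval_steps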
Example #4
def main(args):
    # environment
    env = GridWorld()

    # TD-control agent configured entirely from command-line arguments
    agent = TDAgent(env, epsilon=args.epsilon, gamma=args.discount, alpha=args.lr)
    agent.control(method=args.algorithm)
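Both this example and Example #1 hand everything to `TDAgent.control`, which the source does not show. A minimal sketch of what such an agent could look like, with the SARSA vs. Q-learning dispatch implied by `method=args.algorithm` (every name and signature below is an assumption):

import numpy as np

class TDAgent:
    """Sketch of a tabular TD-control agent; the interface is guessed
    from how the examples above use it, not taken from the source."""

    def __init__(self, env, epsilon, gamma, alpha):
        self.env, self.epsilon, self.gamma, self.alpha = env, epsilon, gamma, alpha
        self.Q = np.zeros((env.n_states, env.n_actions))  # assumed env attributes

    def act(self, s):
        # epsilon-greedy behavior policy
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.Q.shape[1])
        return int(np.argmax(self.Q[s]))

    def control(self, method, n_episodes=1000):
        # 'method' selects the bootstrap target, as in method=args.algorithm
        for _ in range(n_episodes):
            s = self.env.reset()                 # assumed reset signature
            a = self.act(s)
            done = False
            while not done:
                s2, r, done = self.env.step(a)   # assumed step signature
                a2 = self.act(s2)
                if method == 'sarsa':            # on-policy: bootstrap on A'
                    target = r + self.gamma * self.Q[s2, a2] * (not done)
                else:                            # 'qlearning': bootstrap on max
                    target = r + self.gamma * self.Q[s2].max() * (not done)
                self.Q[s, a] += self.alpha * (target - self.Q[s, a])
                s, a = s2, a2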