def test1(self):
        print('Test 1 -- Regular Case')
        # 3x4 grid: '0' = empty cell, 'x' = wall, '1'/'-1' = terminal rewards
        grid = [['0', '0', '0', '1'], ['0', 'x', '0', '-1'],
                ['0', '0', '0', '0']]

        # terminal cells match the +1/-1 reward positions in the grid
        gw = gridworld.GridWorld(grid, {(0, 3), (1, 3)}, 0.8)

        agent = qlearning.QLearningAgent(gw.get_actions,
                                         epsilon=0.2,
                                         alpha=0.5,
                                         gamma=0.9)

        # Training
        episodes = 5000
        for i in range(episodes):
            gw.reset((2, 0))
            cur_s = gw.get_current_state()
            is_done = False
            while not is_done:
                a = agent.get_action(cur_s)  # epsilon-greedy action selection
                last_state, action, next_state, reward, is_done = gw.step(a)
                agent.learn(last_state, action, next_state, reward, is_done)  # Q-value update
                cur_s = next_state

        # show optimal policy
        opt_policy = gw.get_optimal_policy(agent)
        gw.display_policy_grid(opt_policy)
        gw.display_value_grid(gw.get_values(agent))
        gw.display_qvalue_grid(gw.get_qvalues(agent))
Example 2
def get_agent(name, *args, **kwargs):
    if name == 'QLearningAgent':
        import qlearning
        return qlearning.QLearningAgent(*args, **kwargs)
    elif name == 'DQNAgent':
        import dqn
        return dqn.DQNAgent(*args, **kwargs)
    else:
        raise ValueError('Invalid agent name: {}'.format(name))
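# A hypothetical call to the factory above (no call site appears in this
# excerpt; the arguments mirror the QLearningAgent construction below):
# agent = get_agent('QLearningAgent', get_actions,
#                   epsilon=0.2, alpha=0.5, gamma=0.9)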
# Imports assumed by the script below. RECORD, EPSILON, LEARNING_RATE,
# DISCOUNT_FACTOR and EPSILON_DECAY are constants defined earlier in the
# original file and are not shown in this excerpt.
import gym
from gym import wrappers
import numpy
import matplotlib.pyplot as plt

import qlearning

env = gym.make('CartPole-v0')
if RECORD:
    env = wrappers.Monitor(
        env,
        '/home/vbalogh/git/reinforcement_learning-stormmax/cartpole-experiment-1',
        force=True)


def get_actions(state):
    # CartPole-v0 has two discrete actions: 0 = push cart left, 1 = push right
    return [0, 1]


agent = qlearning.QLearningAgent(get_actions,
                                 epsilon=EPSILON,
                                 alpha=LEARNING_RATE,
                                 gamma=DISCOUNT_FACTOR,
                                 epsilon_decay=EPSILON_DECAY)

history = []
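
# `train` is not defined in this excerpt. Below is a minimal sketch, assuming
# the classic Gym step API (obs, reward, done, info) and the agent interface
# used in the test above (get_action / learn). How the original project maps
# CartPole's continuous observations to Q-table keys is not shown; the
# rounding here is only a stand-in.
def train(agent, env, history, episodes=2000):  # episode count is an assumption
    for _ in range(episodes):
        obs = env.reset()
        state = tuple(numpy.round(obs, 1))  # crude discretization (assumed)
        total_reward = 0.0
        done = False
        while not done:
            action = agent.get_action(state)
            obs, reward, done, _ = env.step(action)
            next_state = tuple(numpy.round(obs, 1))
            agent.learn(state, action, next_state, reward, done)
            state = next_state
            total_reward += reward
        history.append(total_reward)  # track per-episode return
    return agent, history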

agent, history = train(agent, env, history)

# if RECORD:
#   env.monitor.close()

# Average reward over consecutive 100-episode windows
avg_reward = [
    numpy.mean(history[i * 100:(i + 1) * 100])
    for i in range(len(history) // 100)
]
f_reward = plt.figure(1)
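# The excerpt cuts off here. A plausible continuation (assumed, not from the
# original source) plots the windowed average reward:
plt.plot(avg_reward)
plt.xlabel('100-episode window')
plt.ylabel('average reward')
plt.show()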