Example #1
def main(_):
    with tf.Session() as sess:
        env = gym.make(ENV_NAME)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        print(env.observation_space)
        print(env.action_space)

        state_dim = env.observation_space.shape[0]

        try:
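            # A continuous Box action space exposes .shape; indexing a Discrete space's shape raises IndexError below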
            action_dim = env.action_space.shape[0]
            action_bound = env.action_space.high
            # Ensure action bound is symmetric
            assert np.all(env.action_space.high == -env.action_space.low)
            discrete = False
            print('Continuous Action Space')
        except IndexError:
            action_dim = env.action_space.n
            action_bound = 1
            discrete = True
            print('Discrete Action Space')

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)

        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

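        # Exploration noise (the OU_A / OU_MU parameters suggest an Ornstein-Uhlenbeck process) and a discounted-reward helper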
        noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        reward = Reward(REWARD_FACTOR, GAMMA)

        if GYM_MONITOR_EN:
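            # Record results with a gym Monitor; disable video capture when the env is not rendered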
            if not RENDER_ENV:
                env = Monitor(env,
                              MONITOR_DIR,
                              video_callable=False,
                              force=True)
            else:
                env = Monitor(env, MONITOR_DIR, force=True)

        try:
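            # Run training; a KeyboardInterrupt (Ctrl-C) stops it cleanly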
            train(sess, env, actor, critic, noise, reward, discrete)
        except KeyboardInterrupt:
            pass

        if GYM_MONITOR_EN:
            env.close()
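
The Noise and Reward helpers are defined elsewhere in the source. As a rough guide, a minimal Ornstein-Uhlenbeck noise sketch, assuming DELTA, SIGMA, OU_A and OU_MU map to the time step, volatility, mean-reversion rate and long-run mean, could look like the following (an illustrative sketch, not the original class):

import numpy as np

class Noise:
    # Hypothetical Ornstein-Uhlenbeck process used for exploration noise
    def __init__(self, delta, sigma, ou_a, ou_mu):
        self.delta = delta    # time step
        self.sigma = sigma    # volatility
        self.ou_a = ou_a      # mean-reversion rate
        self.ou_mu = ou_mu    # long-run mean the process reverts to

    def ornstein_uhlenbeck_level(self, x):
        # x_{t+1} = x_t + a * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
        drift = self.ou_a * (self.ou_mu - x) * self.delta
        diffusion = self.sigma * np.sqrt(self.delta) * np.random.normal(size=np.shape(x))
        return x + drift + diffusion
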
Example #2
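        # Tail of the action-selection loop: track the best-valued and the least-visited action seen so far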
        if stateValues[maxValueAction]['value'] < stateValues[action]['value']:
            maxValueAction = action
        if stateValues[minCountAction]['count'] > stateValues[action]['count']:
            minCountAction = action
    # Compute the decay of the exploration
    decayX = 0.5
    decayY = 50
    decay = max(-i_episode * decayX + decayY, 10 / (i_episode + 1))
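    # With probability ~decay %, explore by returning the least-visited action; otherwise exploit the best-valued one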
    if randint(0, 100) < decay:
        explorationHistory[i_episode] += 1
        return minCountAction
    else:
        return maxValueAction


nbEpisodes = 1000
stepsHistory = [0] * nbEpisodes
env = gym.make('LunarLander-v2')
env = Monitor(env, 'tmp/cart-pole', force=True)
for i in range(6):
    print(i)
    history = {}  # 'state' ==> [{'count': int, 'value': float}]
    explorationHistory = [0] * nbEpisodes
    learn(nbEpisodes, i)
env.close()
# gym.upload('tmp/cart-pole', api_key='sk_QoYvL963TwnAqSJXZLOQ')
plt.plot(range(nbEpisodes), stepsHistory, range(nbEpisodes),
         explorationHistory, range(nbEpisodes), [195] * nbEpisodes)
plt.ylabel('Number of rewards')
plt.show()
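
The learn function that fills history is not part of this excerpt. Given the comment 'state' ==> [{'count': int, 'value': float}], one plausible update rule is an incremental running average per (state, action) pair; the helper below is a sketch under that assumption, and updateStateValues is a hypothetical name, not the original code:

def updateStateValues(history, state, action, observedReturn, nbActions):
    # Hypothetical helper: one {'count', 'value'} entry per action for each visited state
    if state not in history:
        history[state] = [{'count': 0, 'value': 0.0} for _ in range(nbActions)]
    entry = history[state][action]
    entry['count'] += 1
    # Incremental mean of the returns observed after taking `action` in `state`
    entry['value'] += (observedReturn - entry['value']) / entry['count']
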
Example #3
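            # Pre-fill the replay memory with transitions collected by a random agent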
            randomAgent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
    print("Finish to full the random agent memory")
    agent.memory = randomAgent.memory
    randomAgent = None

    env = Monitor(env, 'tmp/cart-pole-ddqn-2', force=True)
    for e in range(EPISODES):
        if DEBUG and e >= EPISODES - 10:
            agent.stopExploration()

        state = env.reset()
        for time in range(500):
            # env.render()

            # act on one input (one state)
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)

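            # Store the transition, then learn from a sampled mini-batch (experience replay)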
            agent.remember(state, action, reward, next_state, done)
            agent.replay(batch_size)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}"
                      .format(e, EPISODES, time, agent.epsilon))
                break

    env.close()
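
The agent's remember and replay methods are not shown here. As a rough sketch of the usual double-DQN replay step, assuming a Keras-style model and target_model with predict/fit and a deque replay memory (names and structure are illustrative, not the original agent):

import random
import numpy as np
from collections import deque

class DDQNAgentSketch:
    def __init__(self, model, target_model, gamma=0.99, memory_size=100000):
        self.model = model                 # online network (Keras-style predict/fit)
        self.target_model = target_model   # periodically synced copy of the online network
        self.gamma = gamma                 # discount factor
        self.memory = deque(maxlen=memory_size)

    def remember(self, state, action, reward, next_state, done):
        # Store one transition for later replay
        self.memory.append((state, action, reward, next_state, done))

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        for state, action, reward, next_state, done in random.sample(self.memory, batch_size):
            target = self.model.predict(state)
            if done:
                target[0][action] = reward
            else:
                # Double DQN: the online network picks the next action,
                # the target network evaluates it
                best_next = np.argmax(self.model.predict(next_state)[0])
                target[0][action] = reward + self.gamma * self.target_model.predict(next_state)[0][best_next]
            self.model.fit(state, target, epochs=1, verbose=0)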