def main(_):
    with tf.Session() as sess:
        env = gym.make(ENV_NAME)
        np.random.seed(RANDOM_SEED)
        tf.set_random_seed(RANDOM_SEED)
        env.seed(RANDOM_SEED)

        print(env.observation_space)
        print(env.action_space)

        state_dim = env.observation_space.shape[0]

        try:
            action_dim = env.action_space.shape[0]
            action_bound = env.action_space.high
            # Ensure the action bound is symmetric; .all() collapses the
            # element-wise array comparison into a single boolean
            assert (env.action_space.high == -env.action_space.low).all()
            discrete = False
            print('Continuous Action Space')
        except IndexError:
            action_dim = env.action_space.n
            action_bound = 1
            discrete = True
            print('Discrete Action Space')

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             ACTOR_LEARNING_RATE, TAU)
        critic = CriticNetwork(sess, state_dim, action_dim,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())

        noise = Noise(DELTA, SIGMA, OU_A, OU_MU)
        reward = Reward(REWARD_FACTOR, GAMMA)

        if GYM_MONITOR_EN:
            if not RENDER_ENV:
                env = Monitor(env, MONITOR_DIR, video_callable=False, force=True)
            else:
                env = Monitor(env, MONITOR_DIR, force=True)

        try:
            train(sess, env, actor, critic, noise, reward, discrete)
        except KeyboardInterrupt:
            pass

        if GYM_MONITOR_EN:
            env.close()
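
# The Noise class instantiated above is not defined in this excerpt. Below is
# a minimal sketch of what it might look like, assuming it implements
# Ornstein-Uhlenbeck exploration noise (the usual choice for DDPG). The
# constructor signature mirrors the Noise(DELTA, SIGMA, OU_A, OU_MU) call
# above, but the internals are an illustration, not the original code.
import numpy as np

class Noise(object):
    def __init__(self, delta, sigma, ou_a, ou_mu):
        self.delta = delta    # time-step size
        self.sigma = sigma    # scale of the random-walk (Wiener) term
        self.ou_a = ou_a      # mean-reversion rate
        self.ou_mu = ou_mu    # long-run mean the process drifts back toward

    def brownian_motion_log_returns(self):
        # One Wiener-process increment: N(0, sigma^2 * delta)
        sqrt_delta_sigma = np.sqrt(self.delta) * self.sigma
        return np.random.normal(loc=0.0, scale=sqrt_delta_sigma)

    def ornstein_uhlenbeck_level(self, prev_ou_level):
        # dx = a * (mu - x) * dt + dW: drift toward mu plus random noise,
        # which yields temporally correlated exploration noise
        drift = self.ou_a * (self.ou_mu - prev_ou_level) * self.delta
        randomness = self.brownian_motion_log_returns()
        return prev_ou_level + drift + randomness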
    # Tail of the action-selection helper: track the greedy (highest-value)
    # action and the least-tried action while scanning the actions
    if stateValues[maxValueAction]['value'] < stateValues[action]['value']:
        maxValueAction = action
    if stateValues[minCountAction]['count'] > stateValues[action]['count']:
        minCountAction = action

    # Compute the decay of the exploration probability
    decayX = 0.5
    decayY = 50
    decay = max(-i_episode * decayX + decayY, 10.0 / (i_episode + 1))

    if randint(0, 100) < decay:
        # Explore: take the action we have tried the least
        explorationHistory[i_episode] += 1
        return minCountAction
    else:
        # Exploit: take the action with the highest estimated value
        return maxValueAction

nbEpisodes = 1000
stepsHistory = [0] * nbEpisodes

env = gym.make('LunarLander-v2')
env = Monitor(env, 'tmp/cart-pole', force=True)

for i in range(6):
    print(i)
    history = {}  # 'state' ==> [{'count': int, 'value': float}]
    explorationHistory = [0] * nbEpisodes
    learn(nbEpisodes, i)

env.close()
# gym.upload('tmp/cart-pole', api_key='sk_QoYvL963TwnAqSJXZLOQ')

plt.plot(range(nbEpisodes), stepsHistory,
         range(nbEpisodes), explorationHistory,
         range(nbEpisodes), [195] * nbEpisodes)
plt.ylabel('Number of rewards')
plt.show()
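
# A quick, standalone way to inspect the exploration schedule used above:
# decay starts at decayY (50%), falls linearly by decayX (0.5) per episode,
# and after roughly 100 episodes the 10 / (i_episode + 1) floor takes over,
# so the exploration probability never quite reaches zero. The milestone
# episodes below are arbitrary, chosen only to show the shape of the curve.
decayX, decayY = 0.5, 50
for i_episode in (0, 50, 100, 200, 500):
    decay = max(-i_episode * decayX + decayY, 10.0 / (i_episode + 1))
    print('episode %4d -> explore with probability ~%.2f%%' % (i_episode, decay))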
        randomAgent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            break

print("Finished filling the random agent's memory")

# Seed the learning agent's replay buffer with the random agent's experience
agent.memory = randomAgent.memory
randomAgent = None

env = Monitor(env, 'tmp/cart-pole-ddqn-2', force=True)

for e in range(EPISODES):
    if DEBUG and e >= EPISODES - 10:
        agent.stopExploration()
    state = env.reset()
    for time in range(500):
        # env.render()
        # Act on one input (one state)
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        agent.replay(batch_size)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, time, agent.epsilon))
            break

env.close()
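
# agent.replay() above is where the Double-DQN update happens, but its body
# is not shown in this excerpt. Below is a minimal sketch of such a method,
# assuming a Keras online network (self.model), a separate target network
# (self.target_model), states already shaped (1, state_size), and a memory
# of (state, action, reward, next_state, done) tuples. The attribute names
# (self.gamma, self.epsilon, self.epsilon_min, self.epsilon_decay) are
# illustrative assumptions, not the original implementation.
import random
import numpy as np

def replay(self, batch_size):
    if len(self.memory) < batch_size:
        return
    minibatch = random.sample(self.memory, batch_size)
    for state, action, reward, next_state, done in minibatch:
        target = self.model.predict(state)
        if done:
            target[0][action] = reward
        else:
            # Double DQN: the online network selects the next action,
            # the target network evaluates it, which reduces the
            # overestimation bias of plain Q-learning
            best_action = np.argmax(self.model.predict(next_state)[0])
            target[0][action] = reward + self.gamma * \
                self.target_model.predict(next_state)[0][best_action]
        self.model.fit(state, target, epochs=1, verbose=0)
    # Decay exploration after each training step
    if self.epsilon > self.epsilon_min:
        self.epsilon *= self.epsilon_decay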