Example #1
import numpy as np
import gym

from ddpg_agent import Agent   # assumed import; the original Agent module is not shown

# NOTE: the original snippet began mid-call, so the environment and the leading
# Agent arguments below are reconstructed placeholders (classic gym API assumed:
# reset() returns the observation, step() returns a 4-tuple).
env = gym.make('Hopper-v2')    # placeholder continuous-control env with a 3-dim action space
agent = Agent(alpha=0.0001, beta=0.001,
              input_dims=env.observation_space.shape, tau=0.001, env=env,
              batch_size=64,
              layer1_size=256,
              layer2_size=128,
              n_actions=3)

#agent.load_models()
# np.random.seed(1)

score_history = []
for i in range(50):
    obs = env.reset()
    done = False
    score = 0
    while not done:
        act = agent.choose_action(obs)
        print(act)
        new_state, reward, done, info = env.step(act)
        agent.remember(obs, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        obs = new_state
        #env.render()
    score_history.append(score)

    if i % 10 == 0:
        agent.save_models()
        env.render()

    print('episode ', i, 'score %.2f' % score,
          'trailing 25 games avg %.3f' % np.mean(score_history[-25:]))
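The loop above records one score per episode in score_history. Below is a minimal follow-up sketch for plotting that learning curve together with its trailing 25-episode average; it assumes matplotlib is installed and is not part of the original example.

import matplotlib.pyplot as plt

def plot_scores(score_history, window=25):
    # Raw episode scores plus a trailing-window average, matching the
    # "trailing 25 games avg" printed during training.
    trailing = [np.mean(score_history[max(0, i - window + 1):i + 1])
                for i in range(len(score_history))]
    plt.plot(score_history, label='episode score')
    plt.plot(trailing, label='trailing %d-episode avg' % window)
    plt.xlabel('episode')
    plt.ylabel('score')
    plt.legend()
    plt.show()

plot_scores(score_history)

The separate example below wraps a full DDPG training run in a main() function built on the TensorFlow 1.x session API.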
import numpy as np
import tensorflow as tf   # TensorFlow 1.x-style API (tf.Session, tf.global_variables_initializer)
from tqdm import tqdm

from ddpg import Agent    # assumed import; the original DDPG Agent module is not shown


def main(env, episodes=500, max_steps=500, eps_decay=.99,
         actor_lr=10**-6, critic_lr=10**-3, gamma=.9,
         base_nodes=64, batch_size=128, theta=.4, sigma=.25):

	with tf.Session() as sess:

		# Initialize environment and constants
		input_dim   = env.state_dim   
		output_dim  = env.action_dim  
		action_high = env.action_high 
		action_low  = env.action_low 

		# Create DDPG Agent
		agent = Agent(input_dim, output_dim, action_high, action_low, 
		              actor_lr=actor_lr, critic_lr=critic_lr, gamma=gamma, 
		              base_nodes=base_nodes, eps_decay=eps_decay,
		              batch_size=batch_size, theta=theta, sigma=sigma,
		              sess=sess)

		sess.run(tf.global_variables_initializer())
		agent.actor.update_target_network()
		agent.critic.update_target_network()

		# Prepare for episodes
		c_losses, rewards, actions, Qs, states = [np.array([]) for i in range(5)]

		for e in tqdm(range(episodes)):

			# Reset episode
			state = env.reset()
			state = np.reshape(state, (-1, len(state)))
			agent.noise.reset()

			done         = False
			step_count   = 0
			total_reward = 0

			while not done and step_count < max_steps:

				# Action
				action = agent.act(state)
				next_state, reward, done = env.step(action)
				next_state = np.reshape(next_state, (-1, len(next_state)))

				# Learn
				c_loss = agent.learn(state, action, reward, done, next_state)
				
				# Save results
				c_losses = np.append(c_losses, c_loss)
				actions  = np.append(actions, action)
				states   = np.append(states, state[0])
				Qs       = np.append(Qs, agent.critic.predict(state, action))
				
				# Loop
				state         = next_state
				step_count   += 1
				total_reward += reward

			# Reduce exploration
			if agent.eps > agent.min_eps:
				agent.eps *= agent.eps_decay

			rewards = np.append(rewards, total_reward)


		return rewards, c_losses, actions, Qs
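main() expects an environment object exposing state_dim, action_dim, action_high and action_low, plus reset() returning a state vector and step() returning a (next_state, reward, done) triple. The wrapper below is a purely illustrative sketch of such an adapter around gym's Pendulum task; the PendulumWrapper name, the gym dependency, and the classic gym API (reset() returns the observation, step() returns a 4-tuple) are assumptions, not part of the original code.

import gym

class PendulumWrapper:
    # Illustrative adapter exposing the attributes and the 3-tuple step()
    # that main() reads; the original environment class is not shown.
    def __init__(self):
        self._env = gym.make('Pendulum-v1')
        self.state_dim   = self._env.observation_space.shape[0]
        self.action_dim  = self._env.action_space.shape[0]
        self.action_high = self._env.action_space.high
        self.action_low  = self._env.action_space.low

    def reset(self):
        return self._env.reset()

    def step(self, action):
        next_state, reward, done, _ = self._env.step(action)
        return next_state, reward, done

rewards, c_losses, actions, Qs = main(PendulumWrapper(), episodes=100)
print('best episode reward: %.2f' % np.max(rewards))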