              batch_size=64, layer1_size=256, layer2_size=128, n_actions=3)
# agent.load_models()
# np.random.seed(1)

score_history = []
for i in range(50):
    obs = env.reset()
    done = False
    score = 0
    while not done:
        act = agent.choose_action(obs)
        print(act)
        new_state, reward, done, info = env.step(act)
        agent.remember(obs, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        obs = new_state
        # env.render()
    score_history.append(score)
    if i % 10 == 0:
        agent.save_models()
        env.render()
    print('episode ', i, 'score %.2f' % score,
          'trailing 25 games avg %.3f' % np.mean(score_history[-25:]))
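# The Agent class itself is defined elsewhere; agent.remember() and agent.learn()
# imply experience replay. The sketch below is only an assumption about those
# mechanics (class name, max_size, and uniform sampling are not taken from the
# original code), not the implementation used above.
import numpy as np

class ReplayBuffer:
    def __init__(self, max_size=1_000_000):
        self.buffer = []          # stored (state, action, reward, next_state, done) tuples
        self.max_size = max_size  # oldest transitions are dropped past this size

    def store(self, state, action, reward, next_state, done):
        # Drop the oldest transition once the buffer is full
        if len(self.buffer) >= self.max_size:
            self.buffer.pop(0)
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniformly sample a minibatch for one learning step
        # (a real learn() would first check len(self.buffer) >= batch_size)
        idx = np.random.choice(len(self.buffer), batch_size, replace=False)
        return [self.buffer[i] for i in idx]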
def main(env, episodes=500, max_steps=500, eps_decay=.99, actor_lr=10**-6,
         critic_lr=10**-3, gamma=.9, base_nodes=64, batch_size=128,
         theta=.4, sigma=.25):
    with tf.Session() as sess:
        # Initialize environment and constants
        input_dim = env.state_dim
        output_dim = env.action_dim
        action_high = env.action_high
        action_low = env.action_low

        # Create DDPG Agent
        agent = Agent(input_dim, output_dim, action_high, action_low,
                      actor_lr=actor_lr, critic_lr=critic_lr, gamma=gamma,
                      base_nodes=base_nodes, eps_decay=eps_decay,
                      batch_size=batch_size, theta=theta, sigma=sigma, sess=sess)
        sess.run(tf.global_variables_initializer())
        agent.actor.update_target_network()
        agent.critic.update_target_network()

        # Prepare for episodes
        c_losses, rewards, actions, Qs, states = [np.array([]) for _ in range(5)]
        for e in tqdm(range(episodes)):
            # Reset episode
            state = env.reset()
            state = np.reshape(state, (-1, len(state)))
            agent.noise.reset()
            done = False
            step_count = 0
            total_reward = 0

            while not done and step_count < max_steps:
                # Action
                action = agent.act(state)
                next_state, reward, done = env.step(action)
                next_state = np.reshape(next_state, (-1, len(next_state)))

                # Learn
                c_loss = agent.learn(state, action, reward, done, next_state)

                # Save results
                c_losses = np.append(c_losses, c_loss)
                actions = np.append(actions, action)
                states = np.append(states, state[0])
                Qs = np.append(Qs, agent.critic.predict(state, action))

                # Loop
                state = next_state
                step_count += 1
                total_reward += reward

            # Reduce exploration
            if agent.eps > agent.min_eps:
                agent.eps *= agent.eps_decay

            rewards = np.append(rewards, total_reward)

    return rewards, c_losses, actions, Qs
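# The theta and sigma arguments and the agent.noise.reset() call suggest
# Ornstein-Uhlenbeck exploration noise, which is standard for DDPG's continuous
# actions. This is a minimal sketch under that assumption; the class name,
# defaults, and dt handling are illustrative and not taken from the code above.
import numpy as np

class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.4, sigma=0.25, dt=1.0):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta    # strength of the pull back toward mu
        self.sigma = sigma    # scale of the random perturbation
        self.dt = dt
        self.reset()

    def reset(self):
        # Start each episode from the long-run mean, as agent.noise.reset() does above
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        dx = (self.theta * (self.mu - self.state) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(len(self.mu)))
        self.state = self.state + dx
        return self.state

# Hypothetical usage inside agent.act(), clipped to the environment's bounds:
#   noise = OUNoise(action_dim=env.action_dim, theta=theta, sigma=sigma)
#   action = np.clip(actor_output + agent.eps * noise.sample(), action_low, action_high)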