def train(
    self,
    env: gym.Env,
    agent: Agent,
    network: Network,
    optimizer,
    window_size: int,
    nb_self_play: int,
    num_unroll_steps: int,
    td_steps: int,
    discount: float,
    batch_size: int,
    nb_train_update: int,
    nb_train_epochs: int,
    max_grad_norm: float,
    filename: str,
    ent_c: float,
):
    replay_buffer = ReplayBuffer(window_size, batch_size)

    for epoch in range(nb_train_epochs):
        # Self-play phase: collect games with the current network in eval mode
        network.eval()
        rewards = []
        for _ in range(nb_self_play):
            game_buffer = self._play_one_game(env, agent)
            # game_buffer.print_buffer()
            replay_buffer.append(game_buffer)
            rewards.append(np.sum(game_buffer.rewards))

        # Training phase: sample unrolled trajectories and update the network
        network.train()
        losses = []
        for _ in range(nb_train_update):
            batch = replay_buffer.sample_batch(num_unroll_steps, td_steps, discount)
            losses.append(
                self._update_weights(network, optimizer, batch, max_grad_norm, ent_c))

        v_loss, r_loss, p_loss, entropy = np.mean(losses, axis=0)
        print(
            f"Epoch[{epoch+1}]: Reward[{np.mean(rewards)}], "
            f"Loss: V[{v_loss:.6f}]/R[{r_loss:.6f}]/P[{p_loss:.6f}]/E[{entropy:.6f}]"
        )

        # Periodically checkpoint the agent
        if (epoch + 1) % 10 == 0:
            agent.save_model(filename)
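The replay buffer's sample_batch(num_unroll_steps, td_steps, discount) is not shown here; in MuZero-style training it typically builds an n-step bootstrapped value target for each unrolled position. The following is a minimal sketch of that target computation, not the buffer's actual implementation; the helper name make_value_target and the rewards/root_values lists are assumptions for illustration.

def make_value_target(rewards, root_values, index, td_steps, discount):
    # Hypothetical helper: n-step bootstrapped value target for position `index`.
    # `rewards` and `root_values` are per-step lists collected during self-play (assumed).
    bootstrap_index = index + td_steps
    if bootstrap_index < len(root_values):
        # Bootstrap from the search value td_steps ahead, discounted accordingly
        value = root_values[bootstrap_index] * discount ** td_steps
    else:
        value = 0.0
    # Add the discounted rewards observed before the bootstrap point
    for k, r in enumerate(rewards[index:bootstrap_index]):
        value += r * discount ** k
    return value

For example, make_value_target([1, 1, 1, 1], [0.5, 0.5, 0.5, 0.5], 0, 2, 0.99) returns 1 + 0.99 * 1 + 0.99**2 * 0.5.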
import numpy as np
import tensorflow as tf


def train(sess, env, actor, critic, noise, reward, discrete):
    # Set up the summary writer
    summary_writer = tf.summary.FileWriter("ddpg_summary")

    sess.run(tf.global_variables_initializer())

    # Initialize the actor and critic target networks
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # Initialize exploration noise
    ou_level = 0.

    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0
        episode_buffer = np.empty((0, 5), float)

        for j in range(MAX_EP_STEPS):
            if RENDER_ENV:
                env.render()

            a = actor.predict(np.reshape(s, (1, actor.s_dim)))

            # Add exploration noise
            if i < NOISE_MAX_EP:
                ou_level = noise.ornstein_uhlenbeck_level(ou_level)
                a = a + ou_level

            # Set action for discrete and continuous action spaces
            if discrete:
                action = np.argmax(a)
            else:
                action = a[0]

            s2, r, terminal, info = env.step(action)

            # Accumulate the episode reward and store the transition
            ep_reward += r
            episode_buffer = np.append(episode_buffer, [[s, a, r, terminal, s2]], axis=0)

            # Train once enough experience has been collected
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.max(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            # Set previous state for next step
            s = s2

            if terminal:
                # Reward shaping for the episode
                # episode_buffer = reward.discount(episode_buffer)

                # Add the episode's transitions to the replay buffer
                for step in episode_buffer:
                    replay_buffer.add(np.reshape(step[0], (actor.s_dim, )),
                                      np.reshape(step[1], (actor.a_dim, )),
                                      step[2],
                                      step[3],
                                      np.reshape(step[4], (actor.s_dim, )))

                # summary = tf.Summary()
                # summary.value.add(tag="Perf/Reward", simple_value=float(ep_reward))
                # summary.value.add(tag="Perf/Qmax", simple_value=float(ep_ave_max_q / float(j)))
                # summary_writer.add_summary(summary, i)
                # summary_writer.flush()

                # Average the per-step max Q over the number of steps taken
                print("| Reward: %.2i | Episode: %d | Qmax: %.4f" % (
                    int(ep_reward), i, ep_ave_max_q / float(j + 1)))

                break
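The noise object's ornstein_uhlenbeck_level is assumed to implement a standard Ornstein-Uhlenbeck process, which produces temporally correlated exploration noise for DDPG's continuous actions. Below is a minimal sketch of such a process; the class name and the mu/theta/sigma/dt parameters are illustrative assumptions, not this repo's actual API.

import numpy as np


class OUNoise(object):
    # Illustrative Ornstein-Uhlenbeck noise process (parameter names are assumptions)
    def __init__(self, mu=0.0, theta=0.15, sigma=0.2, dt=1.0):
        self.mu = mu          # long-run mean the noise reverts to
        self.theta = theta    # mean-reversion rate
        self.sigma = sigma    # diffusion scale
        self.dt = dt          # time step

    def ornstein_uhlenbeck_level(self, level):
        # One mean-reverting step: drift toward mu plus Gaussian diffusion
        return (level
                + self.theta * (self.mu - level) * self.dt
                + self.sigma * np.sqrt(self.dt) * np.random.randn())

Because each sample drifts from the previous one rather than being drawn independently, the resulting actions explore the environment more smoothly than uncorrelated Gaussian noise.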