def train(
    self,
    env: gym.Env,
    agent: Agent,
    network: Network,
    optimizer,
    window_size: int,
    nb_self_play: int,
    num_unroll_steps: int,
    td_steps: int,
    discount: float,
    batch_size: int,
    nb_train_update: int,
    nb_train_epochs: int,
    max_grad_norm: float,
    filename: str,
    ent_c: float,
):
    """Alternate self-play data collection and network weight updates.

    Each epoch plays ``nb_self_play`` games into a replay buffer, then runs
    ``nb_train_update`` gradient steps on sampled batches, printing the mean
    episode reward and the averaged loss components.  The agent's model is
    checkpointed to ``filename`` every 10 epochs.
    """
    replay_buffer = ReplayBuffer(window_size, batch_size)

    for epoch in range(nb_train_epochs):
        # --- Collection phase: play games with the network in eval mode ---
        network.eval()
        rewards = []
        for _ in range(nb_self_play):
            game = self._play_one_game(env, agent)
            replay_buffer.append(game)
            rewards.append(np.sum(game.rewards))

        # --- Learning phase: sample batches and apply gradient updates ---
        network.train()
        losses = [
            self._update_weights(
                network,
                optimizer,
                replay_buffer.sample_batch(num_unroll_steps, td_steps, discount),
                max_grad_norm,
                ent_c,
            )
            for _ in range(nb_train_update)
        ]
        # Each entry is (value, reward, policy, entropy); average column-wise.
        v_loss, r_loss, p_loss, entropy = np.mean(losses, axis=0)
        print(
            f"Epoch[{epoch+1}]: Reward[{np.mean(rewards)}], Loss: V[{v_loss:.6f}]/R[{r_loss:.6f}]/P[{p_loss:.6f}]/E[{entropy:.6f}]"
        )

        # Periodic checkpoint (epochs are 1-based for the modulus).
        if (epoch + 1) % 10 == 0:
            agent.save_model(filename)
# --- Example #2 ---
def train(sess, env, actor, critic, noise, reward, discrete):
    """Run the DDPG training loop.

    Args:
        sess: TensorFlow session used to run all graph operations.
        env: Gym-style environment exposing reset()/step()/render().
        actor: Actor network (predict, predict_target, train,
            update_target_network; attributes s_dim, a_dim).
        critic: Critic network (predict_target, train, action_gradients,
            update_target_network).
        noise: Ornstein-Uhlenbeck noise source for exploration.
        reward: Object whose discount() post-processes an episode's
            transitions before they enter the replay buffer.
        discrete: If True, the argmax of the actor output is taken as a
            discrete action; otherwise the raw action vector is used.
    """
    # Set up summary writer.
    # FIX: was `summary_write`, which left the commented-out summary code
    # below referencing a nonexistent `summary_writer`.
    summary_writer = tf.summary.FileWriter("ddpg_summary")

    sess.run(tf.global_variables_initializer())

    # Initialize target networks to match the online networks.
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory.
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # Initialize exploration-noise level.
    ou_level = 0.

    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        # Per-episode transitions, rows of (s, a, r, terminal, s2).
        episode_buffer = np.empty((0, 5), float)

        for j in range(MAX_EP_STEPS):
            if RENDER_ENV:
                env.render()
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))

            # Add exploration noise during the first NOISE_MAX_EP episodes.
            if i < NOISE_MAX_EP:
                ou_level = noise.ornstein_uhlenbeck_level(ou_level)
                a = a + ou_level

            # Set action for discrete and continuous action spaces.
            if discrete:
                action = np.argmax(a)
            else:
                action = a[0]

            s2, r, terminal, info = env.step(action)

            ep_reward += r

            episode_buffer = np.append(episode_buffer,
                                       [[s, a, r, terminal, s2]],
                                       axis=0)

            # Learn once the buffer holds more than one minibatch.
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Bellman targets from the target networks.
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        # Terminal transition: no bootstrapped future value.
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets.
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))
                ep_ave_max_q += np.max(predicted_q_value)

                # Update the actor policy using the sampled gradient.
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks.
                actor.update_target_network()
                critic.update_target_network()

            # Set previous state for next step.
            s = s2
            if terminal:
                # Apply the episode-level reward post-processing, then push
                # each transition into the replay memory.
                episode_buffer = reward.discount(episode_buffer)
                for step in episode_buffer:
                    replay_buffer.add(np.reshape(step[0], (actor.s_dim, )),
                                      np.reshape(step[1], (actor.a_dim, )),
                                      step[2], step[3],
                                      np.reshape(step[4], (actor.s_dim, )))

                # summary = tf.summary()
                # summary.value.add(tag="Perf/Reward", simple_value=float(ep_reward))
                # summary.value.add(tag="Perf/Qmax", simple_value=float(ep_ave_max_q / float(j)))
                # summary_writer.add_summary(summary, i)

                # summary_writer.flush()

                if i != 0:
                    # FIX: average Qmax over the steps of THIS episode (j),
                    # not the episode index (i) — consistent with the
                    # commented summary code above.  max(j, 1) guards against
                    # a terminal on the very first step.  The print form
                    # `print(("…" % args))` behaves identically under
                    # Python 2 and is valid Python 3.
                    print("|Reward: %.2i | Episode: %d | Qmax: %.4f" % (
                        int(ep_reward), i, (ep_ave_max_q / float(max(j, 1)))))
                break