Example #1
                    MINI_BATCH)
                next_state_action_values = np.max(
                    target_dqn.predict(next_states / 255.0), axis=1)
                # y_true.shape: (MINI_BATCH, num_actions), i.e., (32, 6)
                y_true = dqn.predict(states / 255.0)
                # Bellman update for the actions actually taken; np.invert(dones)
                # zeroes the bootstrapped term on terminal transitions
                y_true[range(MINI_BATCH), actions] = (
                    rewards + GAMMA * next_state_action_values * np.invert(dones))
                dqn.train(states / 255.0, y_true)
            step += 1
        total_episode_rewards.append(cur_episode_reward)
        if episode % 100 == 0:
            dqn.save(MODEL_DIR, 'dqn-{}'.format(episode))
        if np.mean(total_episode_rewards[-30:]) > 19:
            dqn.save(MODEL_DIR, 'dqn-{}'.format(episode))
            break
    np.save(os.path.join(RES_DIR, 'episode_rewards.npy'),
            np.array(total_episode_rewards))

    # Plot the per-episode reward curve
    plt.figure()
    plt.title('EPISODE - REWARD')
    plt.plot(range(len(total_episode_rewards)), total_episode_rewards, linewidth=2)
    plt.xlabel('episode')
    plt.ylabel('reward')
    plt.savefig(os.path.join(IMG_DIR, 'episode_reward.png'))
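
The snippet in Example #1 starts mid-statement: the dangling MINI_BATCH) on its first line is the tail of a call that samples a transition mini-batch, so states, actions, rewards, next_states and dones presumably come from an experience-replay buffer. A minimal sketch of such a buffer (hypothetical names, not part of the original example) could look like this:

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Minimal experience-replay buffer (illustrative sketch only)."""

    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniform random sampling; returns one NumPy array per field
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

# Usage matching the training loop above (MINI_BATCH assumed to be 32):
# states, actions, rewards, next_states, dones = replay_buffer.sample(MINI_BATCH)
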
Example #2
            # Prepare data batch
            for i in range(batch_size):
                states[i] = experiences_batch[i][0]
                actions.append(experiences_batch[i][1])
                next_states[i] = experiences_batch[i][2]
                rewards.append(experiences_batch[i][3])

            current_q_values = policy_net.predict(states)
            target_q_values = target_net.predict(next_states)

            # Create Q-targets: start from the policy network's own predictions
            # so only the Q-value of the action actually taken is overwritten
            q_targets = current_q_values
            for i in range(batch_size):
                # Q_max = max_a' Q_target(s', a')
                q_targets[i][actions[i]] = rewards[i] + gamma * np.amax(
                    target_q_values[i])

            # Train Policy Network on the updated targets
            policy_net.train(states, q_targets)

        if environment_manager.done:
            max_reward = max(max_reward, max_episode_reward)
            print(f"Episode: {episode} Episode reward: {max_episode_reward} "
                  f"Max Reward: {max_reward} "
                  f"Epsilon value {strategy.get_actual_exploration_rate()}")
            break
    # update target network and save network
    if episode % target_update == 0:
        target_net.copy_weights_from_nn(policy_net)
        policy_net.save(episode, strategy.get_actual_exploration_rate())
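
Example #2 synchronises the target network through copy_weights_from_nn every target_update episodes. If policy_net and target_net wrap Keras models (the self.model attribute below is an assumption, not taken from the original), a hard weight copy can be sketched as:

    def copy_weights_from_nn(self, other):
        # Hard update: overwrite this (target) network's weights with the
        # policy network's current weights
        self.model.set_weights(other.model.get_weights())

Keeping the target network fixed between these periodic copies is what stabilises the bootstrapped Q-targets computed in the batch loop above.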