Example #1
0
# Main episode loop (excerpt — the epsilon-greedy action-selection branch is
# truncated past the end of this chunk). Runs until the configured budget
# `settings.episodes` is exhausted.
# NOTE(review): `episode`, `epsilon`, `env`, `q_table`, `settings` and `np`
# are defined before this excerpt — presumably `episode` starts at 0 and
# `epsilon` at `settings.initial_epsilon`; confirm against the full file.
while settings.episodes > episode:
    # Prepare environment for playing
    env.reset()

    # Reset or increment values
    terminal = False
    episode += 1
    step = 0
    # Per-episode statistics collectors (one entry expected per step).
    q_max_arr = []
    reward_arr = []
    epsilon_arr = []

    # Play one episode, capped at settings.train_step_limit steps or until
    # the environment signals a terminal state.
    while not terminal and step < settings.train_step_limit:
        step += 1
        # Get the Q-values of the current state
        state_row = env.actor_state_row()
        q_values = q_table.get_state_q(state_row)
        # Save max(Q(s,a)) for stats
        q_max = np.max(q_values)

        # Anneal epsilon
        # Linear schedule driven by the episode counter (not the step
        # counter), so epsilon is effectively constant within one episode.
        # NOTE(review): with the factor 2, epsilon reaches
        # initial_epsilon - 2 by the final episode, so it can drop below
        # final_epsilon (or go negative) mid-run; it is only clamped on the
        # *next* iteration once the check below fails — verify this
        # one-step overshoot is intended.
        if epsilon > settings.final_epsilon:
            epsilon = settings.initial_epsilon - (2 * episode /
                                                  float(settings.episodes))
        else:
            # Final epsilon reached, stop annealing.
            epsilon = settings.final_epsilon

        # Select action
        # Epsilon-greedy exploration: with probability epsilon take a random
        # action (branch body continues past this excerpt).
        if (np.random.random() < epsilon):
            # Choose random action