Example no. 1
    # TD target for the critic: y_t = r_t + GAMMA * Q'(s_{t+1}, a_{t+1}) from the target networks
    y_t = rewards_vector + GAMMA * target_q_values
    loss = critic.model.train_on_batch([states, actions], y_t)
    losses.append(loss)
    loss_writer.writerow({
        'episode': episode,
        'avg_reward': r_t,
        'critic_loss': loss
    })  # log this episode's metrics (episode, avg_reward, critic_loss) to the loss CSV

    # Update actor: nudge its action outputs in the direction that increases the critic's Q-value
    a_for_grad = actor.model.predict(states)
    grads = critic.gradients(states, a_for_grad)  # action gradients dQ/da from the critic
    actor.train(states, grads)

    # Update target networks
    actor.update_actor_target()
    critic.update_critic_target()

    # Gradually decrease exploration
    epsilon *= EPSILON_DECAY

    # Print to terminal
    print("Episode: ", episode)
    print("Epsilon: ", epsilon)
    # print("S_t", s_t)
    print("Defender mu_sigma (a_t): ", a_t)
    # print("Defender locations list: ", def_coords_list)
    print("Defender average coords (row, col): ", def_avg_coords)
    print("Average reward (r_t): ", r_t)
    print("Critic Loss: ", loss)
    print()
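The listing calls actor.update_actor_target() and critic.update_critic_target() without showing their bodies. In DDPG this step is commonly a soft (Polyak) update that moves the target weights a small step toward the online weights each episode. The helper below is a minimal sketch of that idea only, assuming Keras models, a hypothetical TAU rate, and a hypothetical target_model attribute, none of which are confirmed by this excerpt.

TAU = 0.001  # assumed soft-update rate, not taken from the original code

def soft_update(online_model, target_model, tau=TAU):
    # Polyak update: theta_target <- tau * theta_online + (1 - tau) * theta_target
    online_weights = online_model.get_weights()
    target_weights = target_model.get_weights()
    mixed = [tau * w + (1.0 - tau) * w_t
             for w, w_t in zip(online_weights, target_weights)]
    target_model.set_weights(mixed)

# Hypothetical usage mirroring the calls in the listing:
# soft_update(actor.model, actor.target_model)
# soft_update(critic.model, critic.target_model)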