# Update critic: regress Q(s, a) toward the Bellman target
y_t = rewards_vector + GAMMA * target_q_values
loss = critic.model.train_on_batch([states, actions], y_t)
losses.append(loss)
loss_writer.writerow({
    'episode': episode,
    'avg_reward': r_t,
    'critic_loss': loss
})  # record losses

# Update actor
a_for_grad = actor.model.predict(states)
grads = critic.gradients(states, a_for_grad)
actor.train(states, grads)

# Update target networks
actor.update_actor_target()
critic.update_critic_target()

# Gradually decrease exploration
epsilon *= EPSILON_DECAY

# Print to terminal
print("Episode: ", episode)
print("Epsilon: ", epsilon)
# print("S_t", s_t)
print("Defender mu_sigma (a_t): ", a_t)
# print("Defender locations list: ", def_coords_list)
print("Defender average coords (row, col): ", def_avg_coords)
print("Average reward (r_t): ", r_t)
print("Critic Loss: ", loss)
print()
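The actor update above relies on critic.gradients returning the action gradient dQ/da and on actor.train applying the deterministic policy gradient, while update_actor_target and update_critic_target suggest Polyak soft updates. Below is a minimal sketch of how those four helpers might look, assuming TensorFlow 2 Keras models and an illustrative soft-update rate TAU; the class layout and TAU value are assumptions for illustration, not the original implementation.

# Hypothetical sketch of the DDPG helper methods used above (TF2 / Keras).
import tensorflow as tf

TAU = 0.001  # assumed soft-update rate, not taken from the original code


class Critic:
    def __init__(self, model, target_model):
        self.model = model                # Q(s, a), inputs: [states, actions]
        self.target_model = target_model  # slowly tracking copy

    def gradients(self, states, actions):
        # dQ/da: gradient of the critic's Q-value w.r.t. the actions,
        # evaluated at the actor's current policy output.
        states = tf.convert_to_tensor(states, dtype=tf.float32)
        actions = tf.convert_to_tensor(actions, dtype=tf.float32)
        with tf.GradientTape() as tape:
            tape.watch(actions)
            q_values = self.model([states, actions])
        return tape.gradient(q_values, actions)

    def update_critic_target(self):
        # Polyak averaging: target <- TAU * online + (1 - TAU) * target
        for w, w_t in zip(self.model.weights, self.target_model.weights):
            w_t.assign(TAU * w + (1.0 - TAU) * w_t)


class Actor:
    def __init__(self, model, target_model, lr=1e-4):
        self.model = model                # deterministic policy mu(s)
        self.target_model = target_model
        self.optimizer = tf.keras.optimizers.Adam(lr)

    def train(self, states, action_grads):
        # Deterministic policy gradient: chain dQ/da through the actor.
        # action_grads comes from critic.gradients, so it is a constant
        # w.r.t. this tape; minimizing the negated surrogate ascends Q.
        states = tf.convert_to_tensor(states, dtype=tf.float32)
        with tf.GradientTape() as tape:
            actions = self.model(states)
            surrogate_loss = -tf.reduce_sum(actions * action_grads)
        grads = tape.gradient(surrogate_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

    def update_actor_target(self):
        for w, w_t in zip(self.model.weights, self.target_model.weights):
            w_t.assign(TAU * w + (1.0 - TAU) * w_t)

Negating the surrogate loss turns gradient ascent on Q into an ordinary minimization step, and the soft updates keep the target networks changing slowly, which is what makes the Bellman target y_t above stable enough to regress against.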