training_info["epoch mean rewards"].append(epoch_rewards[-1]) training_info["value net loss"].append(critic_loss_total) training_info["AE Hash loss"].append(ae_hash_loss) if (i_epoch + 1) % num_avg_epoch: training_info["past %d epochs mean reward" % (num_avg_epoch)] = \ (sum(training_info["epoch mean rewards"][-num_avg_epoch:]) / num_avg_epoch) \ if len(training_info["epoch mean rewards"]) >= num_avg_epoch else 0 # Print stats print("\n\n============= Epoch: %d =============" % (i_epoch + 1)) print("epoch mean durations: %f" % (epoch_durations[-1])) print("epoch mean rewards: %f" % (epoch_rewards[-1])) print("Max reward achieved: %f" % training_info["max reward achieved"]) print("value net loss: %f" % critic_loss_total) print("Autoencoder Hashing model loss: %f" % ae_hash_loss) # Plot stats if plot: # plot_durations(training_info["epoch mean rewards"], training_info["value net loss"]) plot_durations(training_info["epoch mean rewards"], training_info["value net loss"], training_info["AE Hash loss"]) # Update counter i_epoch += 1 # Every save_ckpt_interval, save a checkpoint according to current i_episode. if i_epoch % save_ckpt_interval == 0: save_checkpoint(ckpt_dir, actor_critic, ae_hash, ae_hash_optim, i_epoch, **training_info)
training_info["epoch mean durations"].append(epoch_durations[-1]) training_info["epoch mean rewards"].append(epoch_rewards[-1]) training_info["extrinsic value net loss"].append(ex_value_net_mse) training_info["intrinsic value net loss"].append(in_value_net_mse) if (i_epoch + 1) % num_avg_epoch: training_info["past %d epochs mean reward" % (num_avg_epoch)] = \ (sum(training_info["epoch mean rewards"][-num_avg_epoch:]) / num_avg_epoch) \ if len(training_info["epoch mean rewards"]) >= num_avg_epoch else 0 # Print stats print("\n\n============= Epoch: %d =============" % (i_epoch + 1)) print("epoch mean durations: %f" % (epoch_durations[-1])) print("epoch mean rewards: %f" % (epoch_rewards[-1])) print("Max reward achieved: %f" % training_info["max reward achieved"]) print("extrinsic value net loss: %f" % ex_value_net_mse) print("intrinsic value net loss: %f" % in_value_net_mse) # Plot stats if plot: plot_durations(training_info["epoch mean rewards"], training_info["extrinsic value net loss"], training_info["intrinsic value net loss"]) # Update counter i_epoch += 1 # Every save_ckpt_interval, save a checkpoint according to current i_episode. # if i_epoch % save_ckpt_interval == 0: # save_checkpoint(ckpt_dir, policy_net, value_net_in, value_net_ex, # valuenet_in_optimizer, valuenet_ex_optimizer, simhash, i_epoch, **training_info)
# Record stats
training_info["epoch mean durations"].append(epoch_durations[-1])
training_info["epoch mean rewards"].append(epoch_rewards[-1])
# Every num_avg_epoch epochs, refresh the running mean reward over the last num_avg_epoch epochs
if (i_epoch + 1) % num_avg_epoch == 0:
    training_info["past %d epochs mean reward" % (num_avg_epoch)] = \
        (sum(training_info["epoch mean rewards"][-num_avg_epoch:]) / num_avg_epoch) \
        if len(training_info["epoch mean rewards"]) >= num_avg_epoch else 0

# Print stats
print("\n\n============= Epoch: %d =============" % (i_epoch + 1))
print("epoch mean durations: %f" % (epoch_durations[-1]))
print("epoch mean rewards: %f" % (epoch_rewards[-1]))
print("Max reward achieved: %f" % training_info["max reward achieved"])

# Plot stats
if plot:
    plot_durations(training_info["epoch mean rewards"])

# Update counter
i_epoch += 1

# Every save_ckpt_interval epochs, save a checkpoint at the current i_epoch
if i_epoch % save_ckpt_interval == 0:
    save_checkpoint(ckpt_dir, policy_net, policynet_optimizer, i_epoch,
                    policy_lr=policy_lr, **training_info)
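# For completeness, a matching routine to resume training from a checkpoint saved as
# above. The dict keys are assumptions mirroring the save_checkpoint sketch earlier;
# adapt them to whatever the repo's save_checkpoint actually stores.
import torch

def load_checkpoint(ckpt_path, policy_net, policynet_optimizer):
    """Restore network and optimizer state, returning the epoch counter and stats."""
    checkpoint = torch.load(ckpt_path)
    policy_net.load_state_dict(checkpoint["policy_net_state_dict"])
    policynet_optimizer.load_state_dict(checkpoint["policynet_optimizer_state_dict"])
    return checkpoint["i_epoch"], checkpoint["training_info"]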