# --- inside the per-episode loop of a hyperparameter test run (test_i) ---
env.reset()
done = False
curr_score = 0
for act_i in range(max_ep_len):
    # take one epsilon-greedy step and accumulate the reward
    exp = agent.act(env, eps)
    curr_score = curr_score + exp.reward
    if exp.done:
        break
    # train on a minibatch of 64 transitions every learn_every steps
    if act_i % learn_every == 0:
        agent.learn(64)
score_list.append(curr_score)
score_window.append(curr_score)
# periodically sync the target network with the local one
if episode % update_every == 0:
    agent.update_target()
if episode % 20 == 0:
    print("episode " + str(episode) + ", mean score: "
          + str(np.mean(score_window)))
if episode % 100 == 0:
    mean_score_list.append(np.mean(score_window))

# --- after the episode loop: log and save the outcome of this test run ---
print("test completed with scores: " + str(mean_score_list))
agent.save_checkpoint(local_checkpoint="test_out_3/qnet_" + env_sel
                      + "_local_test_" + str(test_i) + ".ckp")
pickle.dump((score_list, mean_score_list, pars),
            open("test_out_3/qnet_" + env_sel + "_scores_and_pars_test_"
                 + str(test_i) + ".p", "wb"))
results.append(max(mean_score_list))
result_pars.append(pars)
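Both this test loop and the training loop below lean on the same agent object. As a reading aid, here is a minimal sketch of the interface they assume; the method names mirror the calls in the snippets, but the bodies and the Experience record are hypothetical stand-ins for whatever the real class implements.

from collections import namedtuple

# Hypothetical transition record matching the fields the loop reads (exp.reward, exp.done).
Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])

class Agent:
    """Assumed interface only; the actual class is defined elsewhere in the project."""

    def act(self, env, eps):
        """Take one epsilon-greedy step in env, store the transition, return an Experience."""
        ...

    def learn(self, batch_size):
        """Sample batch_size transitions from the replay buffer and do one gradient step."""
        ...

    def update_target(self):
        """Copy (or soft-update) the local Q-network weights into the target network."""
        ...

    def save_checkpoint(self, target_checkpoint=None, local_checkpoint=None,
                        delayer_checkpoint=None):
        """Serialize the weights of whichever networks a filename was passed for."""
        ...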
    # --- still inside the main training loop: track scores, plot, checkpoint ---
    score_window.append(curr_score)

    # print and save
    if episode % 20 == 0:
        print("Episode " + str(episode) + ". Eps = " + str(eps)
              + ", mean_score: " + str(np.mean(score_window)))
        # redraw the live score plot
        ax.clear()
        ax.plot(np.arange(len(score_list)), score_list)
        plt.ylabel('Score')
        plt.xlabel('Episode #')
        plt.draw()
        plt.pause(.001)
    # checkpoint all three networks once the rolling mean beats the target score
    if np.mean(score_window) > env_pars.target_score:
        agent.save_checkpoint(
            target_checkpoint="qnet_" + env_sel + "_target_episode_" + str(episode) + ".ckp",
            local_checkpoint="qnet_" + env_sel + "_local_episode_" + str(episode) + ".ckp",
            delayer_checkpoint="qnet_" + env_sel + "_delayer_episode_" + str(episode) + ".ckp")
        pickle.dump(score_list,
                    open("qnet_" + env_sel + "_scores_" + str(episode) + ".p", "wb"))

# final print and save
agent.save_checkpoint(target_checkpoint="qnet_" + env_sel + "_target_final.ckp")
pickle.dump(score_list, open("qnet_" + env_sel + "_scores_final.p", "wb"))
plt.plot(np.arange(len(score_list)), score_list)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.draw()
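For reference, the loop above assumes some state was prepared before it starts. A sketch of that setup, under stated assumptions: the window length of 100 is a guess (the usual "solved" criterion averages the last 100 episode scores), and the interactive-plot handles fig/ax are whatever the script created up front.

from collections import deque

import numpy as np
import matplotlib.pyplot as plt

score_list = []                    # every episode score, for the final plot
score_window = deque(maxlen=100)   # rolling window of recent scores; maxlen=100 is assumed
plt.ion()                          # interactive mode, so plt.draw()/plt.pause() refresh live
fig, ax = plt.subplots()           # the axes the loop clears and redraws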