import numpy as np
import matplotlib.pyplot as plt

# Greedy rollout of the current policy in the test environment.
# (state, done, step_idx, and total_rew are initialized earlier,
# inside test_agent.)
while not done:
    log(test_env, iteration, step_idx, total_rew)
    p, _ = network.step(np.array([state]))
    # print(p)
    action = np.argmax(p)  # act greedily on the policy head
    state, reward, done, _ = test_env.step(action)
    step_idx += 1
    total_rew += reward
log(test_env, iteration, step_idx, total_rew)

value_losses = []
policy_losses = []

for i in range(1000):
    # Every 50 iterations, evaluate the agent and plot the loss curves so far.
    if i % 50 == 0:
        test_agent(i)
        plt.plot(value_losses, label="value loss")
        plt.plot(policy_losses, label="policy loss")
        plt.legend()
        plt.show()

    # Generate a new self-play episode and store its training targets.
    obs, pis, returns, total_reward, done_state = execute_episode(
        network, 32, HillClimbingEnv)
    mem.add_all({"ob": obs, "pi": pis, "return": returns})

    # Train on a sampled minibatch and record both losses.
    batch = mem.get_minibatch()
    vl, pl = trainer.train(batch["ob"], batch["pi"], batch["return"])
    value_losses.append(vl)
    policy_losses.append(pl)
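For reference, here is a minimal sketch of the replay memory that mem could be, inferred only from the add_all and get_minibatch calls in the loop above. The class name, capacity, and batch size are illustrative assumptions, not the actual implementation used here:

import numpy as np

class ReplayMemory:
    # Hypothetical replay buffer; only the add_all/get_minibatch
    # interface is taken from the training loop above.
    def __init__(self, capacity=10_000, batch_size=32):
        self.capacity = capacity
        self.batch_size = batch_size
        self.data = {"ob": [], "pi": [], "return": []}

    def add_all(self, samples):
        # samples maps each key to one episode's per-step values
        for key, values in samples.items():
            self.data[key].extend(values)
        # Evict the oldest entries once capacity is exceeded
        overflow = len(self.data["ob"]) - self.capacity
        if overflow > 0:
            for key in self.data:
                self.data[key] = self.data[key][overflow:]

    def get_minibatch(self):
        # Uniformly sample a batch of (observation, policy, return) targets
        n = len(self.data["ob"])
        idx = np.random.choice(n, size=min(self.batch_size, n), replace=False)
        return {key: np.asarray(values)[idx] for key, values in self.data.items()}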