        ep_ret += rew
        ep_len += 1

        # End of trajectory: either the environment signalled done, or the
        # epoch ran out of steps and the trajectory is cut off.
        if done or (t == local_steps_per_epoch - 1):
            # if not done:
            #     print("WARNING: trajectory cut off by epoch at %d steps." % ep_len)
            # Bootstrap with the value estimate if the trajectory was cut off,
            # otherwise close the path with the final reward.
            last_val = rew if done else v_t
            buffer.finish_path(last_val)
            if done:
                rewards.append(ep_ret)
            obs, rew, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0

    # One policy/value update per epoch, on the full buffer.
    agent.update(buffer.get())

# Evaluate the trained agent for 10 rendered episodes.
for i in range(10):
    obs, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    rewards = []
    # Run until the episode ends or hits the 1000-step cap.
    while not (d or ep_len == 1000):
        act, _, _ = agent.get_action(obs)
        obs, r, d, _ = env.step(act[0])
        ep_len += 1
        ep_ret += r
        rewards.append(r)
        env.render()
    obs, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    print(np.mean(np.array(rewards)))
    print(rewards)
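# The loop above assumes the buffer exposes a finish_path(last_val) method in the
# style of Spinning Up's GAE buffer: it bootstraps a finished (or cut-off) trajectory
# with last_val, then computes GAE-Lambda advantages and rewards-to-go. The sketch
# below illustrates that assumed interface; the class name, attribute names, and the
# discount_cumsum helper are illustrative, not taken from the code above.
import numpy as np
import scipy.signal


def discount_cumsum(x, discount):
    # Discounted cumulative sum along the trajectory: y[t] = sum_k discount^k * x[t+k].
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]


class GAEBufferSketch:
    def __init__(self, size, gamma=0.99, lam=0.95):
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.ptr, self.path_start_idx = 0, 0

    def store(self, rew, val):
        # Record one timestep's reward and value estimate.
        self.rew_buf[self.ptr] = rew
        self.val_buf[self.ptr] = val
        self.ptr += 1

    def finish_path(self, last_val=0.0):
        # Slice out the trajectory that just ended (or was cut off by the epoch).
        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)
        # GAE-Lambda advantage estimates.
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam)
        # Rewards-to-go, used as targets for the value function.
        self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1]
        self.path_start_idx = self.ptr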
        obs = torch.from_numpy(obs)
        rollouts.insert(obs, action, action_log_prob, value, reward, masks, bad_masks)

    # Bootstrap value for the last observation of the rollout.
    with torch.no_grad():
        agent.model.eval()
        next_value = agent.get_value(rollouts.obs[-1], rollouts.masks[-1], device)

    if using_pcnt:
        pcnt_dist.add_pcnt(rollouts, device)

    rollouts.compute_returns(next_value, use_gae, gamma, gae_lambda, use_proper_time_limits)

    agent.model.train()
    value_loss, action_loss, dist_entropy = agent.update(rollouts, device)
    rollouts.after_update()

    # if j % args.log_interval == 0 and len(episode_rewards) > 1:
    all_return.append(np.mean(cumul_return))
    all_length.append(np.mean(episo_length))

    # Log training statistics every update.
    if True:
        total_num_steps = (j + 1) * num_processes * num_steps
        end = time.time()
        print(
            "Updates {}, num timesteps {},\n"
            "Last {} training episodes: mean/median reward {:.1f}/{:.1f}, "
            "min/max reward {:.1f}/{:.1f},\n"
            "entropy loss {}, value_loss {}, action_loss {}".format(
                j, total_num_steps, len(cumul_return),
                np.mean(cumul_return), np.median(cumul_return),
                np.min(cumul_return), np.max(cumul_return),
                dist_entropy, value_loss, action_loss))
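# rollouts.compute_returns(next_value, use_gae, gamma, gae_lambda, ...) above is
# assumed to follow the pytorch-a2c-ppo-acktr convention: when use_gae is set, it
# walks the stored rewards backwards and builds GAE(gamma, lambda) returns, masking
# out terminal steps. The standalone sketch below shows that recursion on plain
# tensors; the argument names (rewards, value_preds, masks) are illustrative and not
# the actual RolloutStorage attributes.
import torch


def compute_gae_returns(rewards, value_preds, masks, next_value, gamma=0.99, gae_lambda=0.95):
    # rewards, value_preds, masks: [num_steps, num_envs, 1]; next_value: [num_envs, 1].
    # masks[step] is 0 where the episode ended at that step, 1 otherwise.
    num_steps = rewards.size(0)
    values = torch.cat([value_preds, next_value.unsqueeze(0)], dim=0)
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for step in reversed(range(num_steps)):
        # TD residual; masks[step] zeroes the bootstrap when the episode ended there.
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * gae_lambda * masks[step] * gae
        returns[step] = gae + values[step]
    return returns


# Example call on dummy tensors:
# returns = compute_gae_returns(torch.zeros(5, 2, 1), torch.zeros(5, 2, 1),
#                               torch.ones(5, 2, 1), torch.zeros(2, 1))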