for mb_obs, mb_rewards, mb_actions, mb_values, mb_probs, done_rewards, done_steps in \
        common.iterate_batches(envs, net_i2a, device):
    if len(done_rewards) > 0:
        total_steps += sum(done_steps)
        speed = total_steps / (time.time() - ts_start)
        if best_reward is None:
            best_reward = done_rewards.max()
        elif best_reward < done_rewards.max():
            best_reward = done_rewards.max()
        tb_tracker.track("total_reward_max", best_reward, step_idx)
        tb_tracker.track("total_reward", done_rewards, step_idx)
        tb_tracker.track("total_steps", done_steps, step_idx)
        print("%d: done %d episodes, mean_reward=%.2f, best_reward=%.2f, speed=%.2f f/s" % (
            step_idx, len(done_rewards), done_rewards.mean(), best_reward, speed))

    obs_v = common.train_a2c(net_i2a, mb_obs, mb_rewards, mb_actions, mb_values,
                             optimizer, tb_tracker, step_idx, device=device)
    # policy distillation
    probs_v = torch.FloatTensor(mb_probs).to(device)
    policy_opt.zero_grad()
    logits_v, _ = net_policy(obs_v)
    policy_loss_v = -F.log_softmax(logits_v, dim=1) * probs_v.view_as(logits_v)
    policy_loss_v = policy_loss_v.sum(dim=1).mean()
    policy_loss_v.backward()
    policy_opt.step()
    tb_tracker.track("loss_distill", policy_loss_v, step_idx)

    step_idx += 1
    if step_idx % TEST_EVERY_BATCH == 0:
        test_reward, test_steps = common.test_model(test_env, net_i2a, device=device)
        writer.add_scalar("test_reward", test_reward, step_idx)
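The distillation step above minimizes the cross-entropy between the I2A policy's action probabilities (mb_probs, the teacher) and the small rollout policy's logits (the student). Below is a minimal, self-contained sketch of just that loss term; the function name distillation_loss and the toy tensors are illustrative assumptions, not part of the chapter's code.

    import torch
    import torch.nn.functional as F

    def distillation_loss(student_logits: torch.Tensor,
                          teacher_probs: torch.Tensor) -> torch.Tensor:
        # H(p_teacher, p_student) = -sum_a p_teacher(a) * log p_student(a),
        # averaged over the batch
        log_p = F.log_softmax(student_logits, dim=1)
        return -(teacher_probs * log_p).sum(dim=1).mean()

    if __name__ == "__main__":
        batch, n_actions = 4, 6
        student_logits = torch.randn(batch, n_actions, requires_grad=True)
        teacher_probs = F.softmax(torch.randn(batch, n_actions), dim=1)
        loss = distillation_loss(student_logits, teacher_probs)
        loss.backward()
        print("distillation loss:", loss.item())

Gradients flow only into the student's logits here, which mirrors the training loop: the distillation loss updates net_policy through policy_opt while the I2A network is trained separately by train_a2c.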
for mb_obs, mb_rewards, mb_actions, mb_values, _, done_rewards, done_steps in \
        common.iterate_batches(envs, net, device=device):
    if len(done_rewards) > 0:
        total_steps += sum(done_steps)
        speed = total_steps / (time.time() - ts_start)
        if best_reward is None:
            best_reward = done_rewards.max()
        elif best_reward < done_rewards.max():
            best_reward = done_rewards.max()
        tb_tracker.track("total_reward_max", best_reward, step_idx)
        tb_tracker.track("total_reward", done_rewards, step_idx)
        tb_tracker.track("total_steps", done_steps, step_idx)
        print("%d: done %d episodes, mean_reward=%.2f, best_reward=%.2f, speed=%.2f" % (
            step_idx, len(done_rewards), done_rewards.mean(), best_reward, speed))

    common.train_a2c(net, mb_obs, mb_rewards, mb_actions, mb_values,
                     optimizer, tb_tracker, step_idx, device=device)
    step_idx += 1
    if args.steps is not None and args.steps < step_idx:
        break

    if step_idx % TEST_EVERY_BATCH == 0:
        test_reward, test_steps = common.test_model(test_env, net, device=device)
        writer.add_scalar("test_reward", test_reward, step_idx)
        writer.add_scalar("test_steps", test_steps, step_idx)
        if best_test_reward is None or best_test_reward < test_reward:
            if best_test_reward is not None:
                fname = os.path.join(saves_path, "best_%08.3f_%d.dat" % (test_reward, step_idx))
                torch.save(net.state_dict(), fname)
            best_test_reward = test_reward
        print("%d: test reward=%.2f, steps=%.2f, best_reward=%.2f" % (
            step_idx, test_reward, test_steps, best_test_reward))
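Every TEST_EVERY_BATCH iterations the loop runs the current network on a separate test environment and checkpoints the weights when the test reward improves. The sketch below shows what a test_model-style evaluation could look like; it is not the chapter's common.test_model, and the greedy action choice plus the classic four-value gym step API are assumptions. The only thing taken from the listings is that the network returns a (policy logits, value) pair.

    import numpy as np
    import torch

    @torch.no_grad()
    def evaluate(env, net, device="cpu"):
        # Play one episode greedily and return (total reward, number of steps).
        obs = env.reset()
        total_reward, total_steps = 0.0, 0
        while True:
            obs_v = torch.FloatTensor(np.expand_dims(obs, 0)).to(device)
            logits_v, _ = net(obs_v)              # actor-critic net: (logits, value)
            action = int(logits_v.argmax(dim=1))  # greedy action for evaluation
            obs, reward, done, _ = env.step(action)  # classic gym API assumed
            total_reward += reward
            total_steps += 1
            if done:
                return total_reward, total_steps

Evaluating greedily (rather than sampling from the policy) gives a less noisy signal for the "best test reward" checkpointing above.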
best_reward = None
best_test_reward = None
with ptan.common.utils.TBMeanTracker(writer, batch_size=100) as tb_tracker:
    for mb_obs, mb_rewards, mb_actions, mb_values, done_rewards, done_steps in \
            common.iterate_batches(envs, net, cuda=args.cuda):
        if len(done_rewards) > 0:
            if best_reward is None:
                best_reward = done_rewards.max()
            elif best_reward < done_rewards.max():
                best_reward = done_rewards.max()
            tb_tracker.track("total_reward_max", best_reward, step_idx)
            tb_tracker.track("total_reward", done_rewards, step_idx)
            tb_tracker.track("total_steps", done_steps, step_idx)
            print("%d: done %d episodes, mean_reward=%.2f, best_reward=%.2f" % (
                step_idx, len(done_rewards), done_rewards.mean(), best_reward))

        common.train_a2c(net, mb_obs, mb_rewards, mb_actions, mb_values,
                         optimizer, tb_tracker, step_idx, cuda=args.cuda)
        step_idx += 1
        if args.steps is not None and args.steps < step_idx:
            break

        if step_idx % TEST_EVERY_BATCH == 0:
            test_reward, test_steps = common.test_model(test_env, net, cuda=args.cuda)
            writer.add_scalar("test_reward", test_reward, step_idx)
            writer.add_scalar("test_steps", test_steps, step_idx)
            if best_test_reward is None or best_test_reward < test_reward:
                if best_test_reward is not None:
                    fname = os.path.join(saves_path, "best_%08.3f_%d.dat" % (test_reward, step_idx))
                    torch.save(net.state_dict(), fname)
                best_test_reward = test_reward
            print("%d: test reward=%.2f, steps=%.2f, best_reward=%.2f" % (
                step_idx, test_reward, test_steps, best_test_reward))
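The metrics above are written through ptan's TBMeanTracker, which smooths noisy per-batch values before they reach TensorBoard. The class below is not ptan's implementation; it is only a minimal sketch of the idea, under the assumption that buffering the last batch_size values and writing their mean is all that is needed.

    import collections
    import numpy as np
    from torch.utils.tensorboard import SummaryWriter

    class MeanTracker:
        # Buffer scalar values per metric name and emit their mean to
        # TensorBoard every batch_size calls, smoothing per-batch noise.
        def __init__(self, writer: SummaryWriter, batch_size: int = 100):
            self.writer = writer
            self.batch_size = batch_size
            self.buffers = collections.defaultdict(list)

        def track(self, name: str, value, step: int):
            buf = self.buffers[name]
            buf.append(float(np.mean(value)))   # accepts Python scalars or numpy arrays
            if len(buf) >= self.batch_size:
                self.writer.add_scalar(name, float(np.mean(buf)), step)
                buf.clear()

Averaging over 100 batches is why the TensorBoard curves in the chapter look much smoother than the raw per-batch prints in the console output.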