# Log the expert's performance as a reference score for the learner.
expert_rewards = [np.sum(epi['rews']) for epi in expert_epis]
expert_mean_rew = np.mean(expert_rewards)
logger.log('expert_score={}'.format(expert_mean_rew))
logger.log('expert_num_epi={}'.format(expert_traj.num_epi))

total_epi = 0
total_step = 0
max_rew = -1e6
kl_beta = args.init_kl_beta

# Optionally warm-start the policy with behavior cloning on the expert data.
if args.pretrain:
    with measure('bc pretrain'):
        for _ in range(args.bc_epoch):
            _ = behavior_clone.train(
                expert_traj, pol, optim_pol, args.bc_batch_size)
        torch.save(pol.state_dict(), os.path.join(
            args.log, 'models', 'pol_bc.pkl'))

while args.max_epis > total_epi:
    with measure('sample'):
        epis = sampler.sample(pol, max_steps=args.max_steps_per_iter)
    with measure('train'):
        traj = Traj()
        traj.add_epis(epis)

        # Annotate the sampled trajectory with value estimates, discounted
        # returns, centered GAE advantages, and hidden-state masks before
        # the policy update.
        traj = ef.compute_vs(traj, vf)
        traj = ef.compute_rets(traj, args.gamma)
        traj = ef.compute_advs(traj, args.gamma, args.lam)
        traj = ef.centerize_advs(traj)
        traj = ef.compute_h_masks(traj)
        traj.register_epis()
    total_epi += traj.num_epi
    step = traj.num_step
    total_step += step

    rewards = [np.sum(epi['rews']) for epi in epis]
    mean_rew = np.mean(rewards)
    logger.record_results(args.log, result_dict, score_file,
                          total_epi, step, total_step,
                          rewards,
                          plot_title=args.env_name)

    # Snapshot the models and optimizers whenever the mean episode reward
    # reaches a new maximum.
    if mean_rew > max_rew:
        torch.save(pol.state_dict(), os.path.join(
            args.log, 'models', 'pol_max.pkl'))
        torch.save(vf.state_dict(), os.path.join(
            args.log, 'models', 'vf_max.pkl'))
        torch.save(optim_pol.state_dict(), os.path.join(
            args.log, 'models', 'optim_pol_max.pkl'))
        torch.save(optim_vf.state_dict(), os.path.join(
            args.log, 'models', 'optim_vf_max.pkl'))
        max_rew = mean_rew

    # Always keep the most recent models as well.
    torch.save(pol.state_dict(), os.path.join(
        args.log, 'models', 'pol_last.pkl'))
    torch.save(vf.state_dict(), os.path.join(
        args.log, 'models', 'vf_last.pkl'))
    torch.save(optim_pol.state_dict(), os.path.join(
        args.log, 'models', 'optim_pol_last.pkl'))
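
# A minimal sketch of what the `ef.compute_advs(traj, args.gamma, args.lam)`
# step above computes: Generalized Advantage Estimation (GAE) over a single
# episode. This is an illustrative standalone version, not machina's API;
# `rews` and `vs` are assumed per-step arrays, and `last_v` bootstraps the
# value of the state after the final step (0.0 for a terminated episode).
def gae_advantages(rews, vs, gamma, lam, last_v=0.0):
    advs = np.zeros_like(rews)
    gae = 0.0
    for t in reversed(range(len(rews))):
        next_v = vs[t + 1] if t + 1 < len(vs) else last_v
        delta = rews[t] + gamma * next_v - vs[t]  # one-step TD error
        gae = delta + gamma * lam * gae           # discounted sum of TD errors
        advs[t] = gae
    return advs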
        # (continuation of the recurrent SAC trainer call; its opening is
        # before this excerpt)
            off_traj,
            pol, qfs, targ_qfs, log_alpha,
            optim_pol, optim_qfs, optim_alpha,
            step // 50,
            args.rnn_batch_size, args.seq_length, args.burn_in_length,
            args.tau, args.gamma, args.sampling, not args.no_reparam
        )

    rewards = [np.sum(epi['rews']) for epi in epis]
    mean_rew = np.mean(rewards)
    logger.record_results(args.log, result_dict, score_file,
                          total_epi, step, total_step,
                          rewards,
                          plot_title=args.env_name)

    # Snapshot the best-performing models; SAC keeps twin Q-functions.
    if mean_rew > max_rew:
        torch.save(pol.state_dict(), os.path.join(
            args.log, 'models', 'pol_max.pkl'))
        torch.save(qf1.state_dict(), os.path.join(
            args.log, 'models', 'qf1_max.pkl'))
        torch.save(qf2.state_dict(), os.path.join(
            args.log, 'models', 'qf2_max.pkl'))
        torch.save(optim_pol.state_dict(), os.path.join(
            args.log, 'models', 'optim_pol_max.pkl'))
        torch.save(optim_qf1.state_dict(), os.path.join(
            args.log, 'models', 'optim_qf1_max.pkl'))
        torch.save(optim_qf2.state_dict(), os.path.join(
            args.log, 'models', 'optim_qf2_max.pkl'))
        max_rew = mean_rew

    torch.save(pol.state_dict(), os.path.join(
        args.log, 'models', 'pol_last.pkl'))
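
# Sketch of the Polyak ("soft") target update that the `args.tau` argument
# above controls: each target Q-network tracks its online counterpart with
# an exponential moving average, which stabilizes the bootstrapped targets.
# `soft_update` is an illustrative helper, not part of the script.
def soft_update(targ_net, net, tau):
    for targ_p, p in zip(targ_net.parameters(), net.parameters()):
        targ_p.data.copy_((1.0 - tau) * targ_p.data + tau * p.data)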
    logger.record_results(args.log, result_dict, score_file,
                          total_epi, step, total_step,
                          rewards,
                          plot_title=args.env_name)

    # save models at regular step intervals
    steps_as = str(int(
        int(total_step / args.steps_per_save_models + 1)
        * args.steps_per_save_models))
    if 'prev_as' in locals():
        if prev_as != steps_as:
            torch.save(pol.state_dict(), os.path.join(
                args.log, 'models', 'pol_' + steps_as + '.pkl'))
            torch.save(qf1.state_dict(), os.path.join(
                args.log, 'models', 'qf1_' + steps_as + '.pkl'))
            torch.save(qf2.state_dict(), os.path.join(
                args.log, 'models', 'qf2_' + steps_as + '.pkl'))
            torch.save(discrim.state_dict(), os.path.join(
                args.log, 'models', 'discrim_' + steps_as + '.pkl'))
            torch.save(optim_pol.state_dict(), os.path.join(
                args.log, 'models', 'optim_pol_' + steps_as + '.pkl'))
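
# Hypothetical helper equivalent to the `steps_as` computation above, shown
# for clarity; e.g. next_save_label(123456, 50000) == '150000', the smallest
# multiple of the save interval strictly greater than the current step count,
# so each interval's checkpoint is written once, when the label changes.
def next_save_label(total_step, interval):
    return str(int(int(total_step / interval + 1) * interval))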