    optim_pol, optim_qfs, optim_alpha, step // 50,
    args.rnn_batch_size, args.seq_length, args.burn_in_length,
    args.tau, args.gamma, args.sampling, not args.no_reparam)

rewards = [np.sum(epi['rews']) for epi in epis]
mean_rew = np.mean(rewards)
logger.record_results(args.log, result_dict, score_file,
                      total_epi, step, total_step,
                      rewards,
                      plot_title=args.env_name)

if mean_rew > max_rew:
    torch.save(pol.state_dict(), os.path.join(
        args.log, 'models', 'pol_max.pkl'))
    torch.save(qf1.state_dict(), os.path.join(
        args.log, 'models', 'qf1_max.pkl'))
    torch.save(qf2.state_dict(), os.path.join(
        args.log, 'models', 'qf2_max.pkl'))
    torch.save(optim_pol.state_dict(), os.path.join(
        args.log, 'models', 'optim_pol_max.pkl'))
    torch.save(optim_qf1.state_dict(), os.path.join(
        args.log, 'models', 'optim_qf1_max.pkl'))
    torch.save(optim_qf2.state_dict(), os.path.join(
        args.log, 'models', 'optim_qf2_max.pkl'))
    max_rew = mean_rew

torch.save(pol.state_dict(), os.path.join(
    args.log, 'models', 'pol_last.pkl'))
torch.save(qf1.state_dict(), os.path.join(
    args.log, 'models', 'qf1_last.pkl'))
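# --- Illustrative sketch, not part of the original script: how the snapshot
# files written above can be restored. Assumes `pol`, `qf1`, `qf2` and the
# optimizers have already been rebuilt with the same constructors used for
# training; `load_snapshot` is a hypothetical helper name. `torch.load` and
# `load_state_dict` are the standard PyTorch calls for this.
def load_snapshot(suffix='max'):
    for name, obj in [('pol', pol), ('qf1', qf1), ('qf2', qf2),
                      ('optim_pol', optim_pol),
                      ('optim_qf1', optim_qf1),
                      ('optim_qf2', optim_qf2)]:
        path = os.path.join(args.log, 'models',
                            '{}_{}.pkl'.format(name, suffix))
        obj.load_state_dict(torch.load(path))

# usage: load_snapshot('max') resumes from the best-scoring snapshot,
#        load_snapshot('last') from the most recent one.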
# Save the logs
logger.record_results(args.log, result_dict, score_file,
                      total_epi, step, total_step,
                      rewards,
                      plot_title=args.env_name)

if mean_rew > max_rew:
    # Save a snapshot whenever the mean reward reaches a new maximum
    # policy
    torch.save(pol.state_dict(),
               os.path.join(args.log, 'models', 'pol_max.pkl'))
    # Q-function
    torch.save(qf.state_dict(),
               os.path.join(args.log, 'models', 'qf_max.pkl'))
    # target Q-function theta 1
    torch.save(targ_qf1.state_dict(),
               os.path.join(args.log, 'models', 'targ_qf1_max.pkl'))
    # target Q-function theta 2
    torch.save(targ_qf2.state_dict(),
               os.path.join(args.log, 'models', 'targ_qf2_max.pkl'))
    # training parameters (optimizer state)
    torch.save(optim_qf.state_dict(),
               os.path.join(args.log, 'models', 'optim_qf_max.pkl'))
    # update the best reward obtained so far
    max_rew = mean_rew

# Save the model from the last episode
torch.save(pol.state_dict(),
           os.path.join(args.log, 'models', 'pol_last.pkl'))
rewards = [np.sum(epi['real_rews']) for epi in epis]
mean_rew = np.mean(rewards)
pseudo_rewards = [np.sum(epi['rews']) for epi in epis]
result_dict['PseudoReward'] = pseudo_rewards
logger.record_results(args.log, result_dict, score_file,
                      total_epi, step, total_step,
                      rewards,
                      plot_title=args.env_name)

if mean_rew > max_rew:
    torch.save(pol.state_dict(), os.path.join(
        args.log, 'models', 'pol_max.pkl'))
    torch.save(vf.state_dict(), os.path.join(
        args.log, 'models', 'vf_max.pkl'))
    torch.save(discrim.state_dict(), os.path.join(
        args.log, 'models', 'discrim_max.pkl'))
    torch.save(optim_pol.state_dict(), os.path.join(
        args.log, 'models', 'optim_pol_max.pkl'))
    torch.save(optim_vf.state_dict(), os.path.join(
        args.log, 'models', 'optim_vf_max.pkl'))
    torch.save(optim_discrim.state_dict(), os.path.join(
        args.log, 'models', 'optim_discrim_max.pkl'))
    max_rew = mean_rew

torch.save(pol.state_dict(), os.path.join(
    args.log, 'models', 'pol_last.pkl'))
torch.save(vf.state_dict(), os.path.join(
    args.log, 'models', 'vf_last.pkl'))
torch.save(discrim.state_dict(), os.path.join(
    args.log, 'models', 'discrim_last.pkl'))
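# --- Hypothetical refactor sketch: the repeated torch.save calls in these
# save blocks can be collapsed into a single helper. `save_models` is an
# assumed name, not part of the original code; behavior matches the inline
# calls above.
def save_models(named_objects, suffix):
    for name, obj in named_objects.items():
        torch.save(obj.state_dict(), os.path.join(
            args.log, 'models', '{}_{}.pkl'.format(name, suffix)))

# equivalent to the block above:
# save_models({'pol': pol, 'vf': vf, 'discrim': discrim,
#              'optim_pol': optim_pol, 'optim_vf': optim_vf,
#              'optim_discrim': optim_discrim}, 'max')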
logger.record_results(args.log, result_dict, score_file,
                      total_epi, step, total_step,
                      rewards,
                      plot_title=args.env_name)
mean_rew = np.mean([np.sum(path['real_rews']) for path in epis])

if mean_rew > max_rew:
    torch.save(pol.state_dict(), os.path.join(
        args.log, 'models', 'pol_max.pkl'))
    torch.save(vf.state_dict(), os.path.join(
        args.log, 'models', 'vf_max.pkl'))
    if args.rew_type == 'rew':
        torch.save(rewf.state_dict(), os.path.join(
            args.log, 'models', 'rewf_max.pkl'))
        torch.save(shaping_vf.state_dict(), os.path.join(
            args.log, 'models', 'shaping_vf_max.pkl'))
    else:
        torch.save(advf.state_dict(), os.path.join(
            args.log, 'models', 'advf_max.pkl'))
    torch.save(optim_pol.state_dict(), os.path.join(
        args.log, 'models', 'optim_pol_max.pkl'))
    torch.save(optim_vf.state_dict(), os.path.join(
        args.log, 'models', 'optim_vf_max.pkl'))
    torch.save(optim_discrim.state_dict(), os.path.join(
        args.log, 'models', 'optim_discrim_max.pkl'))
    max_rew = mean_rew

torch.save(pol.state_dict(), os.path.join(
    args.log, 'models', 'pol_last.pkl'))
torch.save(vf.state_dict(), os.path.join(
    args.log, 'models', 'vf_last.pkl'))
if args.rew_type == 'rew':
    torch.save(rewf.state_dict(), os.path.join(
        args.log, 'models', 'rewf_last.pkl'))
if total_grad_step >= args.lag * num_update_lagged:
    logger.log('Updated lagged qf!!')
    lagged_qf_net.load_state_dict(qf_net.state_dict())
    num_update_lagged += 1

rewards = [np.sum(epi['rews']) for epi in epis]
mean_rew = np.mean(rewards)
logger.record_results(args.log, result_dict, score_file,
                      total_epi, step, total_step,
                      rewards,
                      plot_title=args.env_name)

if mean_rew > max_rew:
    torch.save(pol.state_dict(), os.path.join(
        args.log, 'models', 'pol_max.pkl'))
    torch.save(qf.state_dict(), os.path.join(
        args.log, 'models', 'qf_max.pkl'))
    torch.save(targ_qf1.state_dict(), os.path.join(
        args.log, 'models', 'targ_qf1_max.pkl'))
    torch.save(targ_qf2.state_dict(), os.path.join(
        args.log, 'models', 'targ_qf2_max.pkl'))
    torch.save(optim_qf.state_dict(), os.path.join(
        args.log, 'models', 'optim_qf_max.pkl'))
    max_rew = mean_rew

torch.save(pol.state_dict(), os.path.join(
    args.log, 'models', 'pol_last.pkl'))
torch.save(qf.state_dict(), os.path.join(
    args.log, 'models', 'qf_last.pkl'))
torch.save(targ_qf1.state_dict(), os.path.join(
    args.log, 'models', 'targ_qf1_last.pkl'))
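# --- Aside (sketch): the hard copy above replaces the lagged network
# wholesale once every args.lag gradient steps. Target networks such as
# targ_qf1/targ_qf2 are more commonly maintained with a soft (Polyak)
# update, parameterized by the `tau` that appears elsewhere in these
# scripts. A minimal version, assuming both nets share an architecture:
def soft_update(targ_net, net, tau):
    for targ_p, p in zip(targ_net.parameters(), net.parameters()):
        targ_p.data.copy_(tau * p.data + (1 - tau) * targ_p.data)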
logger.record_results(args.log, result_dict, score_file,
                      total_epi, step, total_step,
                      rewards,
                      plot_title=args.env_name)

# save models at regular intervals
steps_as = str(
    int(int(total_step / args.steps_per_save_models + 1)
        * args.steps_per_save_models))
if 'prev_as' in locals():
    if prev_as != steps_as:
        torch.save(
            pol.state_dict(),
            os.path.join(args.log, 'models',
                         'pol_' + steps_as + '.pkl'))
        torch.save(
            qf1.state_dict(),
            os.path.join(args.log, 'models',
                         'qf1_' + steps_as + '.pkl'))
        torch.save(
            qf2.state_dict(),
            os.path.join(args.log, 'models',
                         'qf2_' + steps_as + '.pkl'))
        torch.save(
            discrim.state_dict(),
            os.path.join(args.log, 'models',
                         'discrim_' + steps_as + '.pkl'))
        torch.save(
            optim_pol.state_dict(),
            os.path.join(args.log, 'models',
                         'optim_pol_' + steps_as + '.pkl'))
        torch.save(
            optim_qf1.state_dict(),
            os.path.join(args.log, 'models',
                         'optim_qf1_' + steps_as + '.pkl'))
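# --- Sketch of the same interval test with integer division, in place of
# the str/int round-trip above. `interval_suffix` is an assumed helper
# name; for positive step counts it returns the same value as steps_as.
def interval_suffix(total_step, interval):
    # upper edge of the interval bucket that total_step falls into
    return str((total_step // interval + 1) * interval)

# a snapshot is due whenever the suffix changes between iterations:
# suffix = interval_suffix(total_step, args.steps_per_save_models)
# if suffix != prev_as: ...save...; prev_as = suffix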