Example #1
            optim_pol, optim_qfs, optim_alpha,
            step//50, args.rnn_batch_size, args.seq_length, args.burn_in_length,
            args.tau, args.gamma, args.sampling, not args.no_reparam
        )

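    # per-episode returns and their mean over this batch of episodes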
    rewards = [np.sum(epi['rews']) for epi in epis]
    mean_rew = np.mean(rewards)
    logger.record_results(args.log, result_dict, score_file,
                          total_epi, step, total_step,
                          rewards,
                          plot_title=args.env_name)

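    # checkpoint the policy, twin Q functions, and optimizers whenever the mean reward sets a new best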
    if mean_rew > max_rew:
        torch.save(pol.state_dict(), os.path.join(
            args.log, 'models', 'pol_max.pkl'))
        torch.save(qf1.state_dict(), os.path.join(
            args.log, 'models', 'qf1_max.pkl'))
        torch.save(qf2.state_dict(), os.path.join(
            args.log, 'models', 'qf2_max.pkl'))
        torch.save(optim_pol.state_dict(), os.path.join(
            args.log, 'models', 'optim_pol_max.pkl'))
        torch.save(optim_qf1.state_dict(), os.path.join(
            args.log, 'models', 'optim_qf1_max.pkl'))
        torch.save(optim_qf2.state_dict(), os.path.join(
            args.log, 'models', 'optim_qf2_max.pkl'))
        max_rew = mean_rew

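    # also overwrite the *_last snapshots every iteration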
    torch.save(pol.state_dict(), os.path.join(
        args.log, 'models', 'pol_last.pkl'))
    torch.save(qf1.state_dict(), os.path.join(
        args.log, 'models', 'qf1_last.pkl'))
Example #2
    # save logs
    logger.record_results(args.log,
                          result_dict,
                          score_file,
                          total_epi,
                          step,
                          total_step,
                          rewards,
                          plot_title=args.env_name)

    if mean_rew > max_rew:  # save when the best mean reward is updated
        # policy
        torch.save(pol.state_dict(),
                   os.path.join(args.log, 'models', 'pol_max.pkl'))
        # Q function
        torch.save(qf.state_dict(),
                   os.path.join(args.log, 'models', 'qf_max.pkl'))
        # target Q theta1
        torch.save(targ_qf1.state_dict(),
                   os.path.join(args.log, 'models', 'targ_qf1_max.pkl'))
        # target Q theta 2
        torch.save(targ_qf2.state_dict(),
                   os.path.join(args.log, 'models', 'targ_qf2_max.pkl'))
        # training parameters (optimizer state)
        torch.save(optim_qf.state_dict(),
                   os.path.join(args.log, 'models', 'optim_qf_max.pkl'))
        # update the best reward record
        max_rew = mean_rew

    # save the model from the last episode
    torch.save(pol.state_dict(),
Example #3
    rewards = [np.sum(epi['rews']) for epi in epis]
    mean_rew = np.mean(rewards)
    logger.record_results(args.log,
                          result_dict,
                          score_file,
                          total_epi,
                          step,
                          total_step,
                          rewards,
                          plot_title=args.env_name)

    if mean_rew > max_rew:
        torch.save(pol.state_dict(),
                   os.path.join(args.log, 'models', 'pol_max.pkl'))
        torch.save(qf1.state_dict(),
                   os.path.join(args.log, 'models', 'qf1_max.pkl'))
        torch.save(qf2.state_dict(),
                   os.path.join(args.log, 'models', 'qf2_max.pkl'))
        torch.save(optim_pol.state_dict(),
                   os.path.join(args.log, 'models', 'optim_pol_max.pkl'))
        torch.save(optim_qf1.state_dict(),
                   os.path.join(args.log, 'models', 'optim_qf1_max.pkl'))
        torch.save(optim_qf2.state_dict(),
                   os.path.join(args.log, 'models', 'optim_qf2_max.pkl'))
        max_rew = mean_rew

    torch.save(pol.state_dict(),
               os.path.join(args.log, 'models', 'pol_last.pkl'))
    torch.save(qf1.state_dict(),
               os.path.join(args.log, 'models', 'qf1_last.pkl'))
Example #4
    logger.record_results(args.log,
                          result_dict,
                          score_file,
                          total_epi,
                          step,
                          total_step,
                          rewards,
                          plot_title=args.env_name)

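    # in this example the best checkpoint also covers the discriminator and its optimizer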
    mean_rew = np.mean([np.sum(path['real_rews']) for path in epis])
    if mean_rew > max_rew:
        torch.save(pol.state_dict(),
                   os.path.join(args.log, 'models', 'pol_max.pkl'))
        torch.save(vf.state_dict(),
                   os.path.join(args.log, 'models', 'vf_max.pkl'))
        torch.save(discrim.state_dict(),
                   os.path.join(args.log, 'models', 'discrim_max.pkl'))
        torch.save(optim_pol.state_dict(),
                   os.path.join(args.log, 'models', 'optim_pol_max.pkl'))
        torch.save(optim_vf.state_dict(),
                   os.path.join(args.log, 'models', 'optim_vf_max.pkl'))
        torch.save(optim_discrim.state_dict(),
                   os.path.join(args.log, 'models', 'optim_discrim_max.pkl'))
        max_rew = mean_rew

    torch.save(pol.state_dict(),
               os.path.join(args.log, 'models', 'pol_last.pkl'))
    torch.save(vf.state_dict(), os.path.join(args.log, 'models',
                                             'vf_last.pkl'))
    torch.save(discrim.state_dict(),
               os.path.join(args.log, 'models', 'discrim_last.pkl'))
Example #5
    rewards = [np.sum(epi['real_rews']) for epi in epis]
    mean_rew = np.mean(rewards)
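    # returns under 'rews' are logged separately as 'PseudoReward'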
    pseudo_rewards = [np.sum(epi['rews']) for epi in epis]
    result_dict['PseudoReward'] = pseudo_rewards
    logger.record_results(args.log, result_dict, score_file,
                          total_epi, step, total_step,
                          rewards,
                          plot_title=args.env_name)

    mean_rew = np.mean([np.sum(path['real_rews']) for path in epis])
    if mean_rew > max_rew:
        torch.save(pol.state_dict(), os.path.join(
            args.log, 'models', 'pol_max.pkl'))
        torch.save(vf.state_dict(), os.path.join(
            args.log, 'models', 'vf_max.pkl'))
        torch.save(discrim.state_dict(), os.path.join(
            args.log, 'models', 'discrim_max.pkl'))
        torch.save(optim_pol.state_dict(), os.path.join(
            args.log, 'models', 'optim_pol_max.pkl'))
        torch.save(optim_vf.state_dict(), os.path.join(
            args.log, 'models', 'optim_vf_max.pkl'))
        torch.save(optim_discrim.state_dict(), os.path.join(
            args.log, 'models', 'optim_discrim_max.pkl'))
        max_rew = mean_rew

    torch.save(pol.state_dict(), os.path.join(
        args.log, 'models', 'pol_last.pkl'))
    torch.save(vf.state_dict(), os.path.join(
        args.log, 'models', 'vf_last.pkl'))
    torch.save(discrim.state_dict(), os.path.join(
        args.log, 'models', 'discrim_last.pkl'))
Example #6
                          rewards,
                          plot_title=args.env_name)

    mean_rew = np.mean([np.sum(path['real_rews']) for path in epis])
    if mean_rew > max_rew:
        torch.save(pol.state_dict(),
                   os.path.join(args.log, 'models', 'pol_max.pkl'))
        torch.save(vf.state_dict(),
                   os.path.join(args.log, 'models', 'vf_max.pkl'))
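        # save either the reward/shaping-value networks or the advantage network, depending on args.rew_type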
        if args.rew_type == 'rew':
            torch.save(rewf.state_dict(),
                       os.path.join(args.log, 'models', 'rewf_max.pkl'))
            torch.save(shaping_vf.state_dict(),
                       os.path.join(args.log, 'models', 'shaping_vf_max.pkl'))
        else:
            torch.save(advf.state_dict(),
                       os.path.join(args.log, 'models', 'advf_max.pkl'))
        torch.save(optim_pol.state_dict(),
                   os.path.join(args.log, 'models', 'optim_pol_max.pkl'))
        torch.save(optim_vf.state_dict(),
                   os.path.join(args.log, 'models', 'optim_vf_max.pkl'))
        torch.save(optim_discrim.state_dict(),
                   os.path.join(args.log, 'models', 'optim_discrim_max.pkl'))
        max_rew = mean_rew

    torch.save(pol.state_dict(),
               os.path.join(args.log, 'models', 'pol_last.pkl'))
    torch.save(vf.state_dict(), os.path.join(args.log, 'models',
                                             'vf_last.pkl'))
    if args.rew_type == 'rew':
        torch.save(rewf.state_dict(),
Example #7
                          rewards,
                          plot_title=args.env_name)

    mean_rew = np.mean([np.sum(path['real_rews']) for path in epis])
    if mean_rew > max_rew:
        torch.save(pol.state_dict(), os.path.join(
            args.log, 'models', 'pol_max.pkl'))
        torch.save(vf.state_dict(), os.path.join(
            args.log, 'models', 'vf_max.pkl'))
        if args.rew_type == 'rew':
            torch.save(rewf.state_dict(), os.path.join(
                args.log, 'models', 'rewf_max.pkl'))
            torch.save(shaping_vf.state_dict(), os.path.join(
                args.log, 'models', 'shaping_vf_max.pkl'))
        else:
            torch.save(advf.state_dict(), os.path.join(
                args.log, 'models', 'advf_max.pkl'))
        torch.save(optim_pol.state_dict(), os.path.join(
            args.log, 'models', 'optim_pol_max.pkl'))
        torch.save(optim_vf.state_dict(), os.path.join(
            args.log, 'models', 'optim_vf_max.pkl'))
        torch.save(optim_discrim.state_dict(), os.path.join(
            args.log, 'models', 'optim_discrim_max.pkl'))
        max_rew = mean_rew

    torch.save(pol.state_dict(), os.path.join(
        args.log, 'models', 'pol_last.pkl'))
    torch.save(vf.state_dict(), os.path.join(
        args.log, 'models', 'vf_last.pkl'))
    if args.rew_type == 'rew':
        torch.save(rewf.state_dict(), os.path.join(
Example #8
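    # sync the lagged Q network with the online Q network every args.lag gradient steps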
    if total_grad_step >= args.lag * num_update_lagged:
        logger.log('Updated lagged qf!!')
        lagged_qf_net.load_state_dict(qf_net.state_dict())
        num_update_lagged += 1

    rewards = [np.sum(epi['rews']) for epi in epis]
    mean_rew = np.mean(rewards)
    logger.record_results(args.log, result_dict, score_file,
                          total_epi, step, total_step,
                          rewards,
                          plot_title=args.env_name)

    if mean_rew > max_rew:
        torch.save(pol.state_dict(), os.path.join(
            args.log, 'models', 'pol_max.pkl'))
        torch.save(qf.state_dict(), os.path.join(
            args.log, 'models', 'qf_max.pkl'))
        torch.save(targ_qf1.state_dict(), os.path.join(
            args.log, 'models', 'targ_qf1_max.pkl'))
        torch.save(targ_qf2.state_dict(), os.path.join(
            args.log, 'models', 'targ_qf2_max.pkl'))
        torch.save(optim_qf.state_dict(), os.path.join(
            args.log, 'models', 'optim_qf_max.pkl'))
        max_rew = mean_rew

    torch.save(pol.state_dict(), os.path.join(
        args.log, 'models', 'pol_last.pkl'))
    torch.save(qf.state_dict(), os.path.join(
        args.log, 'models', 'qf_last.pkl'))
    torch.save(targ_qf1.state_dict(), os.path.join(
        args.log, 'models', 'targ_qf1_last.pkl'))
Example #9
                          total_step,
                          rewards,
                          plot_title=args.env_name)

    # save models at regular intervals
    steps_as = str(
        int(
            int(total_step / args.steps_per_save_models + 1) *
            args.steps_per_save_models))
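    # write a numbered snapshot only when the rounded step count has advanced past the previous save point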
    if 'prev_as' in locals():
        if not prev_as == steps_as:
            torch.save(
                pol.state_dict(),
                os.path.join(args.log, 'models', 'pol_' + steps_as + '.pkl'))
            torch.save(
                qf1.state_dict(),
                os.path.join(args.log, 'models', 'qf1_' + steps_as + '.pkl'))
            torch.save(
                qf2.state_dict(),
                os.path.join(args.log, 'models', 'qf2_' + steps_as + '.pkl'))
            torch.save(
                discrim.state_dict(),
                os.path.join(args.log, 'models',
                             'discrim_' + steps_as + '.pkl'))
            torch.save(
                optim_pol.state_dict(),
                os.path.join(args.log, 'models',
                             'optim_pol_' + steps_as + '.pkl'))
            torch.save(
                optim_qf1.state_dict(),
                os.path.join(args.log, 'models',