Code example #1
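These excerpts begin mid-file. In the raisimGymTorch runners they are typically preceded by imports along the following lines; the exact module paths are assumptions based on the raisimLib repository layout and are not shown in the excerpts themselves.

import time
import torch.nn as nn
import raisimGymTorch.algo.ppo.module as ppo_module
import raisimGymTorch.algo.ppo.ppo as PPO
from raisimGymTorch.helper.raisim_gym_helper import ConfigurationSaver, load_param, tensorboard_launcher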
# Actor: MLP policy with a diagonal-Gaussian action distribution (initial std 1.0), on the GPU
actor = ppo_module.Actor(
    ppo_module.MLP(cfg['architecture']['policy_net'], nn.LeakyReLU, ob_dim,
                   act_dim),
    ppo_module.MultivariateGaussianDiagonalCovariance(act_dim, 1.0), 'cuda')

# Critic: MLP value network with a single scalar output
critic = ppo_module.Critic(
    ppo_module.MLP(cfg['architecture']['value_net'], nn.LeakyReLU, ob_dim, 1),
    'cuda')

# PPO trainer wiring the actor and critic to the vectorized environments
ppo = PPO.PPO(
    actor=actor,
    critic=critic,
    num_envs=cfg['environment']['num_envs'],
    num_transitions_per_env=n_steps,
    num_learning_epochs=4,
    gamma=0.996,
    lam=0.95,
    num_mini_batches=4,
    device='cuda',
    log_dir=saver.data_dir,
    mini_batch_sampling='in_order',
)

if not test_mode:
    # Training loop (only the live-plot labelling survives in this excerpt);
    # `ax` holds matplotlib axes created earlier in the original runner.
    for update in range(1000000):
        ax[0].set(xlabel='iteration',
                  ylabel='avg performance',
                  title='average performance')
        ax[1].set(xlabel='iteration',
                  ylabel='avg dones',
                  title='average dones')
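Example #1 labels `ax[0]` and `ax[1]`, but the figure creation is not part of the excerpt. Below is a minimal sketch of the matplotlib setup that would make those calls work; the two-row layout is an assumption, not taken from the original runner.

import matplotlib.pyplot as plt

# two stacked axes: ax[0] for average performance, ax[1] for average dones
fig, ax = plt.subplots(2, 1)
plt.ion()  # interactive mode so the plots can be refreshed while training runs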
Code example #2
File: runner.py  Project: sehoon74/raisimLib
# Critic: MLP value network (the `critic = ...` opening is cut off in this excerpt;
# restored here to match the pattern in example #1; the actor is built earlier, not shown)
critic = ppo_module.Critic(
    ppo_module.MLP(cfg['architecture']['value_net'], nn.LeakyReLU, ob_dim, 1),
    device)

# Save the run config and Environment.hpp alongside the training logs
saver = ConfigurationSaver(
    log_dir=home_path + "/raisimGymTorch/data/" + task_name,
    save_items=[task_path + "/cfg.yaml", task_path + "/Environment.hpp"])
tensorboard_launcher(saver.data_dir + "/..")  # press refresh (F5) after the first ppo update

ppo = PPO.PPO(
    actor=actor,
    critic=critic,
    num_envs=cfg['environment']['num_envs'],
    num_transitions_per_env=n_steps,
    num_learning_epochs=4,
    gamma=0.996,
    lam=0.95,
    num_mini_batches=4,
    device=device,
    log_dir=saver.data_dir,
    shuffle_batch=False,
)

if mode == 'retrain':
    # resume training: restore actor, critic, and optimizer state from a previous run
    load_param(weight_path, env, actor, critic, ppo.optimizer, saver.data_dir)

# Training loop: reset the environments and accumulate reward/termination statistics
for update in range(1000000):
    start = time.time()
    env.reset()
    reward_ll_sum = 0
    done_sum = 0
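    # Hedged sketch (not part of the excerpt) of how such a runner typically continues:
    # collect n_steps transitions per environment, then run one PPO update.
    # The method names and signatures below (env.observe, ppo.observe, ppo.step,
    # ppo.update) are assumptions about the raisimGymTorch API, not taken from this file.
    for step in range(n_steps):
        obs = env.observe()                    # observations from all parallel envs
        action = ppo.observe(obs)              # sample actions from the current policy
        reward, dones = env.step(action)       # advance the simulation one step
        ppo.step(value_obs=obs, rews=reward, dones=dones)
        done_sum += sum(dones)
        reward_ll_sum += sum(reward)

    obs = env.observe()
    ppo.update(actor_obs=obs, value_obs=obs,
               log_this_iteration=update % 10 == 0, update=update)

    avg_performance = reward_ll_sum / (n_steps * cfg['environment']['num_envs'])
    avg_dones = done_sum / (n_steps * cfg['environment']['num_envs'])
    print('iter {}: avg reward {:.3f}, avg dones {:.3f}, {:.2f}s'.format(
        update, avg_performance, avg_dones, time.time() - start))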