# Build the actor and critic networks from the architecture specified in cfg.yaml.
actor = ppo_module.Actor(ppo_module.MLP(cfg['architecture']['policy_net'], nn.LeakyReLU, ob_dim, act_dim),
                         ppo_module.MultivariateGaussianDiagonalCovariance(act_dim, 1.0),
                         device)
critic = ppo_module.Critic(ppo_module.MLP(cfg['architecture']['value_net'], nn.LeakyReLU, ob_dim, 1),
                           device)

# Save the config and environment source alongside the training logs, then launch TensorBoard.
saver = ConfigurationSaver(log_dir=home_path + "/raisimGymTorch/data/" + task_name,
                           save_items=[task_path + "/cfg.yaml", task_path + "/Environment.hpp"])
tensorboard_launcher(saver.data_dir + "/..")  # press refresh (F5) after the first ppo update

ppo = PPO.PPO(actor=actor,
              critic=critic,
              num_envs=cfg['environment']['num_envs'],
              num_transitions_per_env=n_steps,
              num_learning_epochs=4,
              gamma=0.996,
              lam=0.95,
              num_mini_batches=4,
              device=device,
              log_dir=saver.data_dir,
              shuffle_batch=False,
              )

# When retraining, restore the network weights and optimizer state from a previous run.
if mode == 'retrain':
    load_param(weight_path, env, actor, critic, ppo.optimizer, saver.data_dir)
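# Each iteration of the loop below rolls the current policy out in all parallel
# environments for n_steps, accumulates reward/done statistics, and then runs a
# PPO update. A minimal sketch of such a loop body, assuming the PPO object
# exposes observe/step/update methods as in typical raisimGymTorch examples
# (the exact body in the original script may differ):
#
#   for step in range(n_steps):
#       obs = env.observe()
#       action = ppo.observe(obs)
#       reward, dones = env.step(action)
#       ppo.step(value_obs=obs, rews=reward, dones=dones)
#       done_sum += np.sum(dones)
#       reward_ll_sum += np.sum(reward)
#
#   obs = env.observe()
#   ppo.update(actor_obs=obs, value_obs=obs,
#              log_this_iteration=update % 10 == 0, update=update)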
for update in range(1000000):
    start = time.time()
    env.reset()
    reward_ll_sum = 0
    done_sum = 0