# Training

```python
# One PPO iteration collects n_steps transitions per environment: one episode of
# max_time seconds sampled at the control rate control_dt.
n_steps = math.floor(cfg['environment']['max_time'] / cfg['environment']['control_dt'])
total_steps = n_steps * env.num_envs

avg_rewards = []
avg_dones = []
fig, ax = plt.subplots(1, 2, constrained_layout=True, sharex=True, figsize=[10.8, 4.8])

# Actor: policy MLP with a diagonal-covariance Gaussian head. Critic: value MLP.
actor = ppo_module.Actor(
    ppo_module.MLP(cfg['architecture']['policy_net'], nn.LeakyReLU, ob_dim, act_dim),
    ppo_module.MultivariateGaussianDiagonalCovariance(act_dim, 1.0),
    'cuda')
critic = ppo_module.Critic(
    ppo_module.MLP(cfg['architecture']['value_net'], nn.LeakyReLU, ob_dim, 1),
    'cuda')

ppo = PPO.PPO(
    actor=actor,
    critic=critic,
    num_envs=cfg['environment']['num_envs'],
    num_transitions_per_env=n_steps,
    num_learning_epochs=4,
    gamma=0.996,
    lam=0.95,
    num_mini_batches=4,
```
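For reference, `ppo_module.MLP(cfg['architecture']['policy_net'], nn.LeakyReLU, ob_dim, act_dim)` builds a plain fully connected network from the list of hidden-layer sizes stored in the config. The sketch below shows an equivalent construction in plain PyTorch; the hidden sizes `[128, 128]` and the input/output dimensions are placeholder assumptions, not values taken from this config.

```python
import torch
import torch.nn as nn

def build_mlp(hidden_sizes, activation, in_dim, out_dim):
    """Stack Linear + activation pairs, finishing with a plain Linear output layer."""
    layers, last = [], in_dim
    for h in hidden_sizes:
        layers += [nn.Linear(last, h), activation()]
        last = h
    layers.append(nn.Linear(last, out_dim))
    return nn.Sequential(*layers)

# Placeholder sizes standing in for cfg['architecture']['policy_net'], ob_dim and act_dim.
policy = build_mlp(hidden_sizes=[128, 128], activation=nn.LeakyReLU, in_dim=34, out_dim=8)
print(policy(torch.zeros(1, 34)).shape)  # torch.Size([1, 8])
```

The actor wraps such a network together with a diagonal Gaussian distribution, so the network output serves as the action mean while the covariance supplies exploration noise during rollouts; the `1.0` passed to `MultivariateGaussianDiagonalCovariance` is presumably the initial standard deviation.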
"Can't find trained weight, please provide a trained weight with --weight switch\n" ) else: print("Loaded weight from {}\n".format(weight_path)) start = time.time() env.reset() reward_ll_sum = 0 done_sum = 0 average_dones = 0. n_steps = math.floor(cfg['environment']['max_time'] / cfg['environment']['control_dt']) total_steps = n_steps * 1 start_step_id = 0 print("Visualizing and evaluating the policy: ", weight_path) loaded_graph = ppo_module.MLP(cfg['architecture']['policy_net'], torch.nn.LeakyReLU, ob_dim, act_dim) loaded_graph.load_state_dict( torch.load(weight_path)['actor_architecture_state_dict']) env.load_scaling(weight_dir, int(iteration_number)) env.turn_on_visualization() # max_steps = 1000000 max_steps = 1000 ## 10 secs for step in range(max_steps): time.sleep(0.01) obs = env.observe(False) action_ll = loaded_graph.architecture(torch.from_numpy(obs).cpu()) reward_ll, dones = env.step(action_ll.cpu().detach().numpy()) reward_ll_sum = reward_ll_sum + reward_ll[0]