import os

import gym
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

# make_sa_dataloader and advil_training are project-local helpers assumed
# to be importable from the surrounding codebase.


def train_advil(env, n=0):
    venv = gym.make(env)
    for i in range(n):
        mean_rewards = []
        std_rewards = []
        # Sweep over the number of expert trajectories: 0, 5, 10, 15, 20, 25.
        for num_trajs in range(0, 26, 5):
            if num_trajs == 0:
                # No expert data: run zero training iterations for a baseline.
                expert_data = make_sa_dataloader(env, normalize=True)
                pi = advil_training(expert_data, venv, iters=0)
            else:
                expert_data = make_sa_dataloader(env, max_trajs=num_trajs,
                                                 normalize=True,
                                                 batch_size=1024)
                pi = advil_training(expert_data, venv)

            # PPO instantiates its policy argument by calling it; returning
            # the pre-trained policy lets us reuse SB3's evaluation utilities.
            def get_policy(*args, **kwargs):
                return pi

            model = PPO(get_policy, env, verbose=1)
            mean_reward, std_reward = evaluate_policy(model, model.get_env(),
                                                      n_eval_episodes=10)
            mean_rewards.append(mean_reward)
            std_rewards.append(std_reward)
            print("{0} Trajs: {1}".format(num_trajs, mean_reward))
        np.savez(os.path.join("learners", env, "advil_rewards_{0}".format(i)),
                 means=mean_rewards, stds=std_rewards)
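# The get_policy closure above works because stable-baselines3 instantiates
# its `policy` argument by calling it with the usual constructor arguments;
# a callable that ignores them and returns an already-built policy therefore
# lets evaluate_policy run against any pre-trained policy. A minimal,
# self-contained sketch of the same pattern (CartPole-v1 and the throwaway
# source policy are illustrative assumptions, not part of the code above):
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

env = gym.make("CartPole-v1")

# Stand-in for a pre-trained policy (here: an untrained one).
pretrained = PPO("MlpPolicy", env).policy

def get_policy(*args, **kwargs):
    # Called by PPO as if it were a policy class; discard the constructor
    # arguments and hand back the existing policy object.
    return pretrained

model = PPO(get_policy, env)
mean_reward, std_reward = evaluate_policy(model, model.get_env(),
                                          n_eval_episodes=10)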
from gym.spaces import Discrete
from imitation.algorithms import bc
from imitation.util import util
from stable_baselines3.common import policies


def train_bc(env, n=0):
    venv = util.make_vec_env(env, n_envs=8)
    # Smaller MLP for discrete action spaces, wider for continuous control.
    if isinstance(venv.action_space, Discrete):
        w = 64
    else:
        w = 256
    for i in range(n):
        mean_rewards = []
        std_rewards = []
        for num_trajs in range(0, 26, 5):
            if num_trajs == 0:
                expert_data = make_sa_dataloader(env, normalize=False)
            else:
                expert_data = make_sa_dataloader(env, max_trajs=num_trajs,
                                                 normalize=False)
            bc_trainer = bc.BC(venv.observation_space, venv.action_space,
                               expert_data=expert_data,
                               policy_class=policies.ActorCriticPolicy,
                               ent_weight=0., l2_weight=0.,
                               policy_kwargs=dict(net_arch=[w, w]))
            # With zero trajectories there is nothing to fit; keep the
            # untrained policy as a baseline.
            if num_trajs > 0:
                bc_trainer.train(n_batches=int(5e5))

            def get_policy(*args, **kwargs):
                return bc_trainer.policy

            model = PPO(get_policy, env, verbose=1)
            model.save(os.path.join("learners", env,
                                    "bc_{0}_{1}".format(i, num_trajs)))
            mean_reward, std_reward = evaluate_policy(model, model.get_env(),
                                                      n_eval_episodes=10)
            mean_rewards.append(mean_reward)
            std_rewards.append(std_reward)
            print("{0} Trajs: {1}".format(num_trajs, mean_reward))
        np.savez(os.path.join("learners", env, "bc_rewards_{0}".format(i)),
                 means=mean_rewards, stds=std_rewards)
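# Both train_advil and train_bc dump one .npz of means/stds per run, keyed
# by the same trajectory sweep. A post-processing sketch for plotting those
# curves (the matplotlib usage and the plot_rewards helper are additions,
# not part of the training code; `algo` selects the "advil" or "bc" prefix):
import matplotlib.pyplot as plt

def plot_rewards(env, algo="bc", n=1):
    num_trajs = np.arange(0, 26, 5)  # matches the sweep in train_*()
    # np.savez appends the .npz extension, so load with it here.
    runs = np.stack([
        np.load(os.path.join("learners", env,
                             "{0}_rewards_{1}.npz".format(algo, i)))["means"]
        for i in range(n)
    ])
    plt.errorbar(num_trajs, runs.mean(axis=0), yerr=runs.std(axis=0))
    plt.xlabel("expert trajectories")
    plt.ylabel("mean episode reward")
    plt.savefig("{0}_{1}.png".format(algo, env))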
import numpy as np
import gym
import gym_fishing
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

env = gym.make("fishing-v0")
model = PPO("MlpPolicy", env, verbose=0)
model.learn(total_timesteps=100000)

## simulate and plot results
df = env.simulate(model, reps=10)
env.plot(df, "ppo.png")

## Evaluate model
mean_reward, std_reward = evaluate_policy(model, model.get_env(),
                                          n_eval_episodes=50)
print("mean reward:", mean_reward, "std:", std_reward)

# save trained agent for future use, if desired
# model.save("ppo")
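# If the trained agent was saved (see the commented-out model.save("ppo")
# above), it can be restored later without retraining. A sketch using the
# standard stable-baselines3 loading API; the "ppo" file name assumes the
# save call was actually run:
model = PPO.load("ppo", env=gym.make("fishing-v0"))
mean_reward, std_reward = evaluate_policy(model, model.get_env(),
                                          n_eval_episodes=50)
print("reloaded mean reward:", mean_reward, "std:", std_reward)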
verbose=1)

# Train the agent.
# Evaluate the model every 1000 steps on 5 test episodes and save the
# evaluation results to the "logs/" folder.
# total_timesteps: number of agent-environment interactions
# (one step == one transition). Each rollout of n_steps (2048) transitions
# can span several episodes; those n_steps transitions are then used for
# training (1 epoch == n_steps transitions).
model.learn(total_timesteps=100000, eval_freq=1000, n_eval_episodes=5,
            eval_log_path="./logs/")

# Save the model
model.save("{}/model".format(results_root))

# Get the policy
policy = model.policy

# Retrieve the environment
env = model.get_env()

# Evaluate the policy
mean_reward, std_reward = evaluate_policy(policy, env, n_eval_episodes=10,
                                          deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

# # load model
# del model
# # the policy_kwargs are automatically loaded
# model = PPO.load("ppo_cartpole", env=env)
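# Note: recent stable-baselines3 releases removed the eval_freq /
# n_eval_episodes / eval_log_path keyword arguments from learn(); the
# equivalent there is an EvalCallback. A sketch, assuming a separate
# evaluation environment `eval_env` has been created:
from stable_baselines3.common.callbacks import EvalCallback

eval_callback = EvalCallback(eval_env, n_eval_episodes=5, eval_freq=1000,
                             log_path="./logs/",
                             best_model_save_path="./logs/")
model.learn(total_timesteps=100000, callback=eval_callback)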