Example #1
def train_advil(env, n=0):
    # Sweep over 0, 5, ..., 25 expert trajectories for n seeds, train an AdVIL
    # policy for each setting, and record mean/std evaluation rewards.
    venv = gym.make(env)
    for i in range(n):
        mean_rewards = []
        std_rewards = []
        for num_trajs in range(0, 26, 5):
            if num_trajs == 0:
                expert_data = make_sa_dataloader(env, normalize=True)
                pi = advil_training(expert_data, venv, iters=0)
            else:
                expert_data = make_sa_dataloader(env,
                                                 max_trajs=num_trajs,
                                                 normalize=True,
                                                 batch_size=1024)
                pi = advil_training(expert_data, venv)

            def get_policy(*args, **kwargs):
                return pi

            model = PPO(get_policy, env, verbose=1)
            mean_reward, std_reward = evaluate_policy(model,
                                                      model.get_env(),
                                                      n_eval_episodes=10)
            mean_rewards.append(mean_reward)
            std_rewards.append(std_reward)
            print("{0} Trajs: {1}".format(num_trajs, mean_reward))
            np.savez(os.path.join("learners", env,
                                  "advil_rewards_{0}".format(i)),
                     means=mean_rewards,
                     stds=std_rewards)
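A minimal usage sketch for the function above; the environment id, seed count, and the reload step are assumptions rather than part of the original example.

# Hypothetical call: run three seeds of the AdVIL sweep on one environment.
train_advil("HalfCheetah-v2", n=3)
# Reload the reward curves written by np.savez (".npz" is appended automatically).
data = np.load(os.path.join("learners", "HalfCheetah-v2", "advil_rewards_0.npz"))
print(data["means"], data["stds"])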
Example #2
def train_bc(env, n=0):
    venv = util.make_vec_env(env, n_envs=8)
    # Use a smaller policy network for discrete action spaces, a wider one otherwise.
    if isinstance(venv.action_space, Discrete):
        w = 64
    else:
        w = 256
    for i in range(n):
        mean_rewards = []
        std_rewards = []
        for num_trajs in range(0, 26, 5):
            if num_trajs == 0:
                expert_data = make_sa_dataloader(env, normalize=False)
            else:
                expert_data = make_sa_dataloader(env,
                                                 max_trajs=num_trajs,
                                                 normalize=False)
            bc_trainer = bc.BC(venv.observation_space,
                               venv.action_space,
                               expert_data=expert_data,
                               policy_class=policies.ActorCriticPolicy,
                               ent_weight=0.,
                               l2_weight=0.,
                               policy_kwargs=dict(net_arch=[w, w]))
            if num_trajs > 0:
                bc_trainer.train(n_batches=int(5e5))

            def get_policy(*args, **kwargs):
                return bc_trainer.policy

            model = PPO(get_policy, env, verbose=1)
            model.save(
                os.path.join("learners", env,
                             "bc_{0}_{1}".format(i, num_trajs)))
            mean_reward, std_reward = evaluate_policy(model,
                                                      model.get_env(),
                                                      n_eval_episodes=10)
            mean_rewards.append(mean_reward)
            std_rewards.append(std_reward)
            print("{0} Trajs: {1}".format(num_trajs, mean_reward))
            np.savez(os.path.join("learners", env, "bc_rewards_{0}".format(i)),
                     means=mean_rewards,
                     stds=std_rewards)
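A hedged usage sketch for the BC variant; the environment id and seed count are assumptions.

# Hypothetical call: behavioral-cloning sweep over trajectory counts, two seeds.
train_bc("CartPole-v1", n=2)
# Reward curves land in learners/CartPole-v1/bc_rewards_<seed>.npz and the
# saved learners in learners/CartPole-v1/bc_<seed>_<num_trajs>.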
Example #3
import numpy as np
import gym
import gym_fishing
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

env = gym.make("fishing-v0")
model = PPO("MlpPolicy", env, verbose=0)
model.learn(total_timesteps=100000)

## simulate and plot results
df = env.simulate(model, reps=10)
env.plot(df, "ppo.png")

## Evaluate model
mean_reward, std_reward = evaluate_policy(model,
                                          model.get_env(),
                                          n_eval_episodes=50)
print("mean reward:", mean_reward, "std:", std_reward)

# save trained agent for future use, if desired
# model.save("ppo")
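A short follow-up sketch, assuming the commented-out model.save("ppo") above has been run: the trained agent can be restored and re-evaluated later.

# model = PPO.load("ppo", env=env)
# mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=50)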
Example #4
# The opening of this example was truncated; the setup below is a minimal
# reconstruction, and the environment id, results_root path, and imports
# are assumptions rather than part of the original snippet.
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

results_root = "./results"
os.makedirs(results_root, exist_ok=True)
env = gym.make("CartPole-v1")
model = PPO("MlpPolicy", env,
            verbose=1)

# Train the agent
# Evaluate the model every 1000 steps on 5 test episodes
# and save the evaluation to the "logs/" folder
# total_timesteps: number of interactions between the agent and the environment
# (one step == one transition). Each rollout of n_steps (2048) can span several
# episodes; those n_steps transitions are then used for training
# (1 epoch == n_steps transitions).
model.learn(total_timesteps=100000,
            eval_freq=1000,
            n_eval_episodes=5,
            eval_log_path="./logs/")
# save the model
model.save("{}/model".format(results_root))

# Retrieve the trained policy
policy = model.policy
# Retrieve the environment
env = model.get_env()
# Evaluate the policy
mean_reward, std_reward = evaluate_policy(policy,
                                          env,
                                          n_eval_episodes=10,
                                          deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

# # load model

# del model
# # the policy_kwargs are automatically loaded
# model = PPO.load("ppo_cartpole", env=env)
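Optionally, the same model and the vectorized environment returned by model.get_env() can be used for a short greedy rollout; a minimal sketch using the objects defined above.

obs = env.reset()
for _ in range(100):
    # Deterministic action from the trained policy; the VecEnv auto-resets on done.
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, infos = env.step(action)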