Example #1
 def create_model(self, n_envs=1):
     """ Create env and agent model """
     env_cls = SprEnv
     self.env = make_vec_env(env_cls,
                             n_envs=n_envs,
                             env_kwargs={"params": self.params},
                             seed=self.params.seed)
     self.model = ACKTR(
         self.policy,
         self.env,
         gamma=self.params.agent_config['gamma'],
         n_steps=self.params.agent_config['n_steps'],
         ent_coef=self.params.agent_config['ent_coef'],
         vf_coef=self.params.agent_config['vf_coef'],
         vf_fisher_coef=self.params.agent_config['vf_fisher_coef'],
         max_grad_norm=self.params.agent_config['max_grad_norm'],
         learning_rate=self.params.agent_config['learning_rate'],
         gae_lambda=self.params.agent_config['gae_lambda'],
         lr_schedule=self.params.agent_config['lr_schedule'],
         kfac_clip=self.params.agent_config['kfac_clip'],
         kfac_update=self.params.agent_config['kfac_update'],
         async_eigen_decomp=self.params.agent_config['async_eigen_decomp'],
         verbose=self.params.agent_config['verbose'],
         tensorboard_log="./tb/acktr/",
         seed=self.params.seed,
         policy_kwargs={"params": self.params})
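The hyperparameter keys above come from `self.params.agent_config`, which is not shown here. A minimal sketch of such a dict, with placeholder values loosely based on the stable-baselines ACKTR defaults (not this project's actual configuration):

# Hypothetical agent_config; keys mirror the constructor call above,
# values are illustrative placeholders only.
agent_config = {
    'gamma': 0.99,
    'n_steps': 20,
    'ent_coef': 0.01,
    'vf_coef': 0.25,
    'vf_fisher_coef': 1.0,
    'max_grad_norm': 0.5,
    'learning_rate': 0.25,
    'gae_lambda': 1.0,
    'lr_schedule': 'linear',
    'kfac_clip': 0.001,
    'kfac_update': 1,
    'async_eigen_decomp': False,
    'verbose': 1,
}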
Example #2
def fed_and_eval(base_index, w):
    base_env = make_vec_env(f"selected-bipedal-{subenv_dict[base_index]}-v0",
                            n_envs=1,
                            seed=seed)
    base_agent = ACKTR.load(
        f"./base_agent/{subenv_dict[base_index]}/model.zip")
    base_parameter_dict = base_agent.get_parameters()

    sub_model_parameters = []
    for subenv in subenv_dict.values():
        client_policy = ACKTR.load(
            f"./base{base_index}_client_model/{subenv}/policy.zip")
        sub_model_parameters.append(client_policy.get_parameters())

    aligned_agent = base_agent
    base_parameter_dict = aligned_agent.get_parameters()

    model_align(w, base_parameter_dict, sub_model_parameters, alpha=alpha)

    aligned_agent.load_parameters(base_parameter_dict)
    avg_reward, reward_std = evaluate_policy(aligned_agent,
                                             base_env,
                                             n_eval_episodes=100)

    print(f"base {base_index}, weight {w} done")
    return (avg_reward, reward_std)
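`model_align` is defined elsewhere in this project. A hypothetical sketch of what it might do, assuming it blends each base parameter in place toward a w-weighted average of the client parameters (the real implementation may differ):

import numpy as np

def model_align(w, base_params, client_params_list, alpha=0.5):
    # Assumed semantics: move each base parameter toward the w-weighted
    # average of the corresponding client parameters, modifying in place.
    for key, base_value in base_params.items():
        client_avg = np.average(
            [client[key] for client in client_params_list], axis=0, weights=w)
        base_params[key] = (1 - alpha) * base_value + alpha * client_avg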
Example #3
 def load_model(self, path=None):
     """ Load the model from a zip archive """
     if path is not None:
         self.model = ACKTR.load(path)
     else:
         self.model = ACKTR.load(self.params.model_path)
         # Copy the model to the new directory
         self.model.save(self.params.model_path)
Example #4
def train(env_id, num_timesteps, seed, num_cpu):
    """
    train an ACKTR model on atari

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param num_cpu: (int) The number of cpu to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    model = ACKTR(CnnPolicy, env, nprocs=num_cpu)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
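A hedged usage sketch for the function above (the environment ID and counts are arbitrary placeholders):

# Example invocation with placeholder arguments
train('BreakoutNoFrameskip-v4', num_timesteps=int(1e6), seed=0, num_cpu=4)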
Example #5
def run():
    torch.multiprocessing.freeze_support()
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    model = ACKTR(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
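The `make_env` helper used above is not shown in this snippet; a minimal sketch following the usual stable-baselines multiprocessing pattern (an assumption, not necessarily this project's version):

import gym
from stable_baselines.common import set_global_seeds

def make_env(env_id, rank, seed=0):
    """Return a thunk that creates and seeds a single environment."""
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init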
Example #6
def acktr(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = ACKTR(MlpPolicy, env, verbose=0)
    # Train the agent
    print("Beginning training episodes with ACKTR.")
    model.learn(total_timesteps=timesteps)

    env.close()
Example #7
def test_action_mask_run_acktr(vec_env, policy, env_class):
    env = vec_env([env_class])

    model = ACKTR(policy, env, verbose=0)

    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)

        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)

    env.close()
Example #8
def get_intrinsic_reward(base_index):
    intrinsic_rewards = [[] for _ in range(len(subenv_dict))]
    # base env
    base_name = subenv_dict[base_index]
    base_env = make_vec_env(f"selected-bipedal-{base_name}-v0",
                            n_envs=1,
                            seed=seed)
    base_agent = ACKTR.load(f"./base_agent/{base_name}/model.zip")

    # rnd model
    rnd_dict = {}
    for client_env in subenv_dict.values():
        rnd = RandomNetworkDistillation(input_size=24)
        rnd.load(f"./base{base_index}_client_model/{client_env}/rnd")
        rnd_dict[client_env] = rnd
    obs = base_env.reset()
    for _ in range(num_test):
        for i, client_env in subenv_dict.items():
            intrinsic_rewards[i].append(
                rnd_dict[client_env].get_intrinsic_reward(obs))
        action = base_agent.predict(obs)
        obs, reward, done, info = base_env.step(action[0])
        if done:
            obs = base_env.reset()
    return intrinsic_rewards
Example #9
def NewPotential(current_window, algorithm='PPO'):

    # Determine the pretrained agent
    if algorithm == 'A2C':
        model = A2C.load("pretrained_A2C")
    elif algorithm == 'PPO':
        model = PPO2.load("pretrained_PPO")
    elif algorithm == 'ACKTR':
        model = ACKTR.load("pretrained_ACKTR")
    elif algorithm == 'ACER':
        model = ACER.load("pretrained_ACER")
    else:
        raise ValueError("%s is not a valid algorithm." % algorithm)

    if len(current_window) != model.observation_space.shape[0]:
        raise ValueError("%s does not match the model's window size." %
                         len(current_window))

    action, _states = model.predict(current_window, deterministic=False)

    voltages = np.linspace(0, 1, num=model.action_space.n)
    if action >= 0 and action <= model.action_space.n - 1:
        voltage = voltages[action]
    else:
        raise ValueError(
            "Received invalid action={} which is not part of the action space".
            format(action))

    return voltage
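A hedged usage sketch (the window length must match the pretrained model's observation space; 10 here is only a placeholder):

import numpy as np

current_window = np.zeros(10)  # placeholder; must match model.observation_space
voltage = NewPotential(current_window, algorithm='ACKTR')
print(voltage)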
Example #10
def make_new_model(model_type, policy, env, tensorboard_log=None):
    if model_type.lower() == 'dqn':
        model = DQN(policy, env, tensorboard_log=tensorboard_log)
    elif model_type.lower() == 'ppo2':
        model = PPO2(policy, env, tensorboard_log=tensorboard_log)
    elif model_type.lower() == 'a2c':
        model = A2C(policy, env, tensorboard_log=tensorboard_log)
    elif model_type.lower() == 'acktr':
        model = ACKTR(policy, env, tensorboard_log=tensorboard_log)
    return model
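A hedged usage sketch for the factory above (CartPole and the MlpPolicy import are assumptions for illustration). Note that an unrecognised `model_type` falls through every branch and raises `UnboundLocalError`:

import gym
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv

# Build an ACKTR learner via the factory and train briefly
env = DummyVecEnv([lambda: gym.make('CartPole-v1')])
model = make_new_model('acktr', MlpPolicy, env, tensorboard_log='./tb/')
model.learn(total_timesteps=10000)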
Example #11
def run_illegal_move_training(
                    exp_name,exp_path,
                    basicdate,
                    model_type='PPO2',
                    n_eval_episodes=10,
                    training_intervals=100,
                    max_steps=10000,
                    reward_margin=10,
                    log_to_tb=False,
                    pelican_agent_filepath=False):
    
    # set up logging
    if log_to_tb:
        writer = SummaryWriter(exp_path)
        tb_log_name = 'Illegal_move_prevention_training'
    else:
        writer = None
        tb_log_name = None
    
    if pelican_agent_filepath:
        logger.info('Loading agent from file: ' + pelican_agent_filepath)
        # env = plark_env_illegal_move.PlarkEnvIllegalMove( config_file_path='/Components/plark-game/plark_game/game_config/10x10/balanced.json')
        env = gym.make('plark-env-illegal-move-v0')

        if model_type.lower() == 'dqn':
            model = DQN.load(pelican_agent_filepath)
            model.set_env(env)
            
        elif model_type.lower() == 'ppo2':
            model = PPO2.load(pelican_agent_filepath)
            model.set_env(DummyVecEnv([lambda: env]))
            
        elif model_type.lower() == 'a2c':
            model = A2C.load(pelican_agent_filepath)
            model.set_env(env)
            
        elif model_type.lower() == 'acktr':
            model = ACKTR.load(pelican_agent_filepath)
            model.set_env(env)

    else:   
        # Instantiate the env and model
        env = gym.make('plark-env-illegal-move-v0')
        model = PPO2('CnnPolicy', env)

    # Start training 
    train_agent(exp_path,model,env,training_intervals,max_steps,model_type,basicdate,writer,tb_log_name,reward_margin)
                
    # Evaluate
    mean_reward, n_steps = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes, deterministic=False, render=False, callback=None, reward_threshold=None, return_episode_rewards=False)
    logger.info('Evaluation finished')
    logger.info('Mean Reward is ' + str(mean_reward))
    logger.info('Number of steps is ' + str(n_steps))
Example #12
def run_sonobuoy_training(
                    exp_name,exp_path,
                    basicdate,
                    model_type='PPO2',
                    n_eval_episodes=10,
                    training_intervals=100,
                    max_steps=10000,
                    reward_margin=10,
                    log_to_tb=False,
                    pelican_agent_filepath=False):

    # set up logging 
    if log_to_tb:
        writer = SummaryWriter(exp_path)
        tb_log_name = 'sonobuoy_training'
    else:
        writer = None
        tb_log_name = None

        
    env = gym.make('plark-env-v0', panther_agent_filepath='/data/agents/models/PPO2_20200429_073132_panther/')
    
    if pelican_agent_filepath:
        logger.info('Loading agent from file: ' + pelican_agent_filepath)

        if model_type.lower() == 'dqn':
            model = DQN.load(pelican_agent_filepath)
            model.set_env(env)
            
        elif model_type.lower() == 'ppo2':
            model = PPO2.load(pelican_agent_filepath)
            model.set_env(DummyVecEnv([lambda: env]))
            
        elif model_type.lower() == 'a2c':
            model = A2C.load(pelican_agent_filepath)
            model.set_env(env)
            
        elif model_type.lower() == 'acktr':
            model = ACKTR.load(pelican_agent_filepath)
            model.set_env(env)

    else:   
        # Instantiate the env and model
        model = PPO2('CnnPolicy', env)

    # Start training 
    train_agent(exp_path,model,env,training_intervals,max_steps,model_type,basicdate,writer,tb_log_name,reward_margin)
                
    # Evaluate
    mean_reward, n_steps = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes, deterministic=False, render=False, callback=None, reward_threshold=None, return_episode_rewards=False)
    logger.info('Evaluation finished')
    logger.info('Mean Reward is ' + str(mean_reward))
    logger.info('Number of steps is ' + str(n_steps))
Example #13
 def loadAgent(self, filepath, algorithm_type):
     try:
         if algorithm_type.lower() == 'dqn':
             self.model = DQN.load(filepath)
         elif algorithm_type.lower() == 'ppo2':
             self.model = PPO2.load(filepath)
         elif algorithm_type.lower() == 'a2c':
             self.model = A2C.load(filepath)
         elif algorithm_type.lower() == 'acktr':
             self.model = ACKTR.load(filepath)
     except:
         raise ValueError('Error loading pelican agent. File : "' +
                          filepath + '" does not exist')
Example #14
def optimize_agent(trial):
    """ Train the model and optimise
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    model_params = optimize_acktr(trial)
    seed = trial.suggest_int('numpyseed', 1, 429496729)
    np.random.seed(seed)
    original_env = gym.make('rustyblocks-v0')
    original_env.max_invalid_tries = 3
    env = DummyVecEnv([lambda: original_env])
    model = ACKTR("MlpPolicy", env, nprocs=1, verbose=0, **model_params)
    print("DOING LEARING acer")
    original_env.force_progression = False
    model.learn(int(2e4), seed=seed)
    print("DONE LEARING acer")
    original_env.max_invalid_tries = -1

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    original_env.force_progression = True
    original_env.invalid_try_limit = 5000
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(last_reward)

    return last_reward
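`optimize_acktr` is defined elsewhere; a hypothetical sketch that samples a few ACKTR hyperparameters from the Optuna trial (the real search space may differ):

def optimize_acktr(trial):
    # Hypothetical ACKTR search space; names and ranges are assumptions
    return {
        'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999),
        'n_steps': trial.suggest_categorical('n_steps', [16, 32, 64, 128]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1.0),
        'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 1e-1),
    }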
Example #15
def train_acktr(seed):
    """
    test ACKTR on the uav_env(cartesian,discrete) 
    """
    """
    ACKTR(policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01, vf_coef=0.25, 
    vf_fisher_coef=1.0, learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001, 
    lr_schedule='linear', verbose=0, tensorboard_log=None, _init_setup_model=True, 
    async_eigen_decomp=False)
    """
    algo = 'ACKTR'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    model = ACKTR(policy=MlpPolicy, env=env, gamma=0.99, nprocs=1, n_steps=20,
                  ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25,
                  max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0,
                  tensorboard_log="./logs/{}/tensorboard/{}/".format(EXPERIMENT_NATURE, algo),
                  _init_setup_model=True)
    # , async_eigen_decomp=False)

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    model = ACKTR.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)
    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo), exist_ok=True)
    os.rename('/tmp/gym/monitor.csv', "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))
    env.close()
    del model, env
    gc.collect()
    return evaluation
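The `callback`, `set_up_env`, and `evaluate_model` helpers are defined elsewhere in this project. A hedged sketch of a typical best-model-saving callback in the stable-baselines 2 style, assuming the environment is wrapped in a `Monitor` writing to `log_dir` (interval and filenames are placeholders):

import numpy as np
from stable_baselines.results_plotter import load_results, ts2xy

def callback(_locals, _globals):
    global best_mean_reward, n_steps
    # Every 1000 calls, check the rolling mean reward from the Monitor logs
    if (n_steps + 1) % 1000 == 0:
        x, y = ts2xy(load_results(log_dir), 'timesteps')
        if len(x) > 0:
            mean_reward = np.mean(y[-100:])
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Save the best model so far
                _locals['self'].save(log_dir + 'best_model.pkl')
    n_steps += 1
    return True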
Example #16
 def loadAgent(self, filepath, algorithm_type):
     try:
         if algorithm_type.lower() == "dqn":
             self.model = DQN.load(filepath)
         elif algorithm_type.lower() == "ppo2":
             self.model = PPO2.load(filepath)
         elif algorithm_type.lower() == "ppo":
             self.model = PPO.load(filepath)
         elif algorithm_type.lower() == "a2c":
             self.model = A2C.load(filepath)
         elif algorithm_type.lower() == "acktr":
             self.model = ACKTR.load(filepath)
     except:
         raise ValueError('Error loading panther agent. File : "' +
                          filepath + '" does not exist')
Example #17
def stable_baseline_test(env_origin):
    env = make_vec_env(lambda: env_origin, n_envs=1)
    model = ACKTR('CnnPolicy', env_origin, verbose=1)
    model.learn(total_timesteps=2000000)
    print("Stable_baseline evaluation starts.....\n")
    #NOTE:evaluate_policy needs vec_env
    reward_mean, reward_std = evaluate_policy(model,
                                              env,
                                              n_eval_episodes=20,
                                              deterministic=False)

    print("mean reward:" + str(reward_mean) + '\n')
    print("reward std:" + str(reward_std) + '\n')

    print("custom evaluation begin\n")

    env = env_origin
    obs = env.reset()
    reward_list_total = []
    epilen_list = []
    reward_list = []
    last_end = 0
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        reward_list.append(rewards)
        if dones:
            obs = env.reset()
            epilen_list.append(i - last_end)
            last_end = i
            reward_list_total.append(np.sum(reward_list))
            reward_list = []
            if i > 900:
                break
    print("mean reward:{}\n".format(np.mean(reward_list_total)))
    print("mean epilen:{}\n".format(np.mean(epilen_list)))
Example #18
def save_client(base_index, subenv_id):
    base_agent = ACKTR.load(
        f"./base_agent/{subenv_dict[base_index]}/model.zip")

    subenv = subenv_dict[subenv_id]
    env = make_vec_env(f"selected-bipedal-{subenv}-v0",
                       n_envs=n_envs,
                       seed=seed)
    learner = base_agent
    learner.env = env
    learner.verbose = 0
    callback = SaveRNDDatasetCallback(base_index=base_index)
    learner.learn(
        total_timesteps=client_timesteps,
        callback=callback,
    )

    dir_name = f"base{base_index}_client_model/{subenv}"
    Path(dir_name).mkdir(parents=True, exist_ok=True)
    learner.save(f"{dir_name}/policy.zip")
    print(f"base {base_index} sub-env {subenv} done")
Example #19
def eval_base_agent(agent_index):
    mean_result = []
    std_result = []
    agent = ACKTR.load(f"./base_agent/{subenv_dict[agent_index]}/model.zip")
    for env_index in range(4):
        env = gym.make(f"selected-bipedal-{subenv_dict[env_index]}-v0")
        env.seed(seed)
        mean, std = evaluate_policy(agent, env, n_eval_episodes=100)
        mean_result.append(mean)
        std_result.append(std)
    Path("log").mkdir(parents=True, exist_ok=True)
    file = open(f"log/agent{agent_index}_simple_agent_test.csv",
                "w",
                newline="")
    writer = csv.writer(file)
    writer.writerow(mean_result)
    writer.writerow(std_result)
    file.close()
    print(f">>> Agent {agent_index}:")
    print(mean_result)
    print(std_result)
    return
Example #20
def optimize_agent(trial):
    agent = PPO2
    policy = MlpLstmPolicy
    train_env, test_env = optimize_envs(trial)

    if agent == ACKTR:
        params = optimize_acktr(trial)
        model = ACKTR(policy, train_env, verbose=1,
                      tensorboard_log="./tensorboard", **params)
    elif agent == PPO2:
        params = optimize_ppo2(trial)
        model = PPO2(policy, train_env, verbose=1, nminibatches=1,
                     tensorboard_log="./tensorboard", **params)

    model.test_env = test_env
    model.trial = trial

    try:
        model.learn(n_timesteps, callback=learn_callback)
        model.env.close()
        test_env.close()
    except AssertionError:
        # Sometimes, random hyperparams can generate NaN
        model.env.close()
        model.test_env.close()
        raise

    is_pruned = False
    cost = np.inf

    if hasattr(model, 'is_pruned'):
        is_pruned = model.is_pruned  # pylint: disable=no-member
        cost = -1 * model.last_mean_test_reward  # pylint: disable=no-member

    del model.env, model.test_env
    del model

    if is_pruned:
        raise optuna.structs.TrialPruned()

    return cost
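A hedged driver for the objective above, assuming a standard Optuna study that minimises the returned cost (`n_timesteps` and `optimize_envs` are defined elsewhere in this project):

import optuna

# Default study direction is minimisation, matching cost = -reward
study = optuna.create_study()
study.optimize(optimize_agent, n_trials=100)
print(study.best_params)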
Example #21
import numpy as np
import pytest

from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, SAC, PPO1, PPO2, TD3, TRPO
from stable_baselines.ddpg import NormalActionNoise
from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.evaluation import evaluate_policy


# Hyperparameters for learning identity for each RL model
LEARN_FUNC_DICT = {
    'a2c': lambda e: A2C(policy="MlpPolicy", learning_rate=1e-3, n_steps=1,
                         gamma=0.7, env=e, seed=0).learn(total_timesteps=10000),
    'acer': lambda e: ACER(policy="MlpPolicy", env=e, seed=0,
                           n_steps=1, replay_ratio=1).learn(total_timesteps=15000),
    'acktr': lambda e: ACKTR(policy="MlpPolicy", env=e, seed=0,
                             learning_rate=5e-4, n_steps=1).learn(total_timesteps=20000),
    'dqn': lambda e: DQN(policy="MlpPolicy", batch_size=16, gamma=0.1,
                         exploration_fraction=0.001, env=e, seed=0).learn(total_timesteps=40000),
    'ppo1': lambda e: PPO1(policy="MlpPolicy", env=e, seed=0, lam=0.5,
                           optim_batchsize=16, optim_stepsize=1e-3).learn(total_timesteps=15000),
    'ppo2': lambda e: PPO2(policy="MlpPolicy", env=e, seed=0,
                           learning_rate=1.5e-3, lam=0.8).learn(total_timesteps=20000),
    'trpo': lambda e: TRPO(policy="MlpPolicy", env=e, seed=0,
                           max_kl=0.05, lam=0.7).learn(total_timesteps=10000),
}


@pytest.mark.slow
@pytest.mark.parametrize("model_name", ['a2c', 'acer', 'acktr', 'dqn', 'ppo1', 'ppo2', 'trpo'])
def test_identity(model_name):
    """
Example #22
import gym

from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import ACKTR

# multiprocess environment
n_cpu = 4
env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

model = ACKTR(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("acktr_cartpole")

del model  # remove to demonstrate saving and loading

model = ACKTR.load("acktr_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
Example #23
import pytest

from stable_baselines import A2C, ACER, ACKTR, DeepQ, DDPG, PPO1, PPO2, TRPO
from stable_baselines.ddpg import AdaptiveParamNoiseSpec
from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox
from stable_baselines.common.vec_env import DummyVecEnv

PARAM_NOISE_DDPG = AdaptiveParamNoiseSpec(initial_stddev=float(0.2),
                                          desired_action_stddev=float(0.2))

# Hyperparameters for learning identity for each RL model
LEARN_FUNC_DICT = {
    'a2c':
    lambda e: A2C(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'acer':
    lambda e: ACER(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'acktr':
    lambda e: ACKTR(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'deepq':
    lambda e: DeepQ(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'ddpg':
    lambda e: DDPG(policy="MlpPolicy", env=e, param_noise=PARAM_NOISE_DDPG).
    learn(total_timesteps=1000),
    'ppo1':
    lambda e: PPO1(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'ppo2':
    lambda e: PPO2(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'trpo':
    lambda e: TRPO(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
}


@pytest.mark.slow
Example #24
#! /usr/bin/env python

import gym
gym.logger.set_level(40)
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

from env import GoLeftEnv
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.evaluation import evaluate_policy

env = GoLeftEnv(grid_size=10)
env = make_vec_env(lambda: env, n_envs=1)

model = ACKTR.load("models/acktr_goleft", env=env)

obs = env.reset()
n_steps = 20
for step in range(n_steps):
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
    if done:
        # Note that the VecEnv resets automatically
        # when a done signal is encountered
        print("Goal reached!", "reward=", reward)
        break
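The `evaluate_policy` import above is unused in this fragment; a short follow-up evaluation of the loaded model might look like this (the episode count is arbitrary):

# Evaluate the loaded ACKTR model on the vectorized GoLeft environment
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=20)
print("mean_reward: {:.2f} +/- {:.2f}".format(mean_reward, std_reward))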
Example #25
def train_initial_policy(
        model_name,
        algo=ALGO,
        env_name=ENV_NAME,
        time_steps=TIME_STEPS):
    """Uses the specified algorithm on the target environment"""
    print("Using algorithm : ", algo.__name__)
    print("Model saved as : ", "data/models/" +algo.__name__+"_initial_policy_"+env_name+"_.pkl")

    # define the environment here
    env = gym.make(env_name)
    env.seed(SEED)
    if NOISE_VALUE>0 : env = NoisyRealEnv(env, noise_value=NOISE_VALUE)

    if MUJOCO_NORMALIZE:
        env = MujocoNormalized(env)

    print('~~ ENV Obs RANGE : ', env.observation_space.low, env.observation_space.high)
    print('~~~ ENV Action RANGE : ', env.action_space.low, env.action_space.high)

    if algo.__name__  == "ACKTR":
        print('Using SubprocVecEnv')
        env = SubprocVecEnv([lambda: env for i in range(8)])
    elif algo.__name__ == "SAC":
        print('Using standard gym environment')
        env = env
    else:
        print('Using Dummy Vec Env')
        env = DummyVecEnv([lambda : env])

    if NORMALIZE :
        env = VecNormalize(env,
                           training=True,
                           norm_obs=True,
                           norm_reward=False,
                           clip_reward=1e6,
                           )


    with open('data/target_policy_params.yaml') as file:
        args = yaml.load(file, Loader=yaml.FullLoader)
    args = args[algo.__name__][PARAMS_ENV]
    print('~~ Loaded args file ~~')

    if algo.__name__ == "SAC":
        print('Initializing SAC with RLBaselinesZoo hyperparameters .. ')
        print('using 256 node architecture as in the paper')

        class CustomPolicy(ffp_sac):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp", layers=[256, 256])

        model = SAC(CustomPolicy, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )
    elif algo.__name__ == "TD3":
        print('Initializing TD3 with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/td3/HopperBulletEnv-v0/config.yml
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=float(args['noise_std']) * np.ones(n_actions))
        class CustomPolicy2(ffp_td3):
            def __init__(self, *args, **kwargs):
                super(CustomPolicy2, self).__init__(*args, **kwargs,
                                                   feature_extraction="mlp", layers=[400, 300])
        model = TD3(CustomPolicy2, env,
                    verbose = 1,
                    tensorboard_log = 'data/TBlogs/initial_policy_training',
                    batch_size = args['batch_size'],
                    buffer_size = args['buffer_size'],
                    gamma = args['gamma'],
                    gradient_steps = args['gradient_steps'],
                    learning_rate = args['learning_rate'],
                    learning_starts = args['learning_starts'],
                    action_noise = action_noise,
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    elif algo.__name__ == "TRPO":
        print('Initializing TRPO with RLBaselinesZoo hyperparameters .. ')
        # hyperparameters suggestions from :
        # https://github.com/araffin/rl-baselines-zoo/blob/master/trained_agents/sac/HopperBulletEnv-v0/config.yml
        model = TRPO(mlp_standard, env,
                    verbose=1,
                    tensorboard_log='data/TBlogs/initial_policy_training',
                    timesteps_per_batch=args['timesteps_per_batch'],
                    lam=args['lam'],
                    max_kl=args['max_kl'],
                    gamma=args['gamma'],
                    vf_iters=args['vf_iters'],
                    vf_stepsize=args['vf_stepsize'],
                    entcoeff=args['entcoeff'],
                    cg_damping=args['cg_damping'],
                    cg_iters=args['cg_iters'],
                     seed=SEED,
                    )

    elif algo.__name__ == "ACKTR":
        print('Initializing ACKTR')
        model = ACKTR(mlp_standard,
                      env,
                      verbose=1,
                      n_steps=128,
                      ent_coef=0.01,
                      lr_schedule='constant',
                      learning_rate=0.0217,
                      max_grad_norm=0.5,
                      gamma=0.99,
                      vf_coef=0.946,
                      seed=SEED)

    elif algo.__name__ == "PPO2":
        print('Initializing PPO2')
        print('Num envs : ', env.num_envs)
        model = PPO2(mlp_standard,
                     env,
                     n_steps=int(args['n_steps']/env.num_envs),
                     nminibatches=args['nminibatches'],
                     lam=args['lam'],
                     gamma=args['gamma'],
                     ent_coef=args['ent_coef'],
                     noptepochs=args['noptepochs'],
                     learning_rate=args['learning_rate'],
                     cliprange=args['cliprange'],
                     verbose=1,
                     tensorboard_log='data/TBlogs/initial_policy_training',
                     seed=SEED,
                     )

    else:
        print('No algorithm matched. Using SAC .. ')
        model = SAC(CustomPolicy, env,
                    verbose=1,
                    batch_size=args['batch_size'],
                    buffer_size=args['buffer_size'],
                    ent_coef=args['ent_coef'],
                    learning_starts=args['learning_starts'],
                    learning_rate=args['learning_rate'],
                    train_freq=args['train_freq'],
                    seed=SEED,
                    )

    # change model name if using normalization
    if NORMALIZE:
        model_name = model_name.replace('.pkl', 'normalized_.pkl')

    elif MUJOCO_NORMALIZE:
        model_name = model_name.replace('.pkl', 'mujoco_norm_.pkl')

    if SAVE_BEST_FOR_20:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name,
                    log_interval=10,
                    callback=eval_callback)
        save_the_model()
        model_name = model_name.replace('best_', '')
        model.save(model_name)
    elif SAVE_INTERMEDIATE:
        check_callback = CheckpointCallback(save_freq=SAVE_FREQ,
                                            save_path=model_name[:-4],
                                            name_prefix=ENV_NAME + '_' + str(SEED),
                                            verbose=1,
                                            )
        eval_env = DummyVecEnv([lambda: gym.make(ENV_NAME)])
        eval_env.seed(SEED)
        eval_callback = EvalCallback(eval_env,
                                     n_eval_episodes=10,
                                     eval_freq=SAVE_FREQ,
                                     log_path=model_name[:-4],
                                     deterministic=False,
                                     render=False,
                                     verbose=1)

        callbacks = CallbackList([check_callback, eval_callback])
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,
                    callback=callbacks)
        model.save(model_name)
        npzfile = np.load(model_name[:-4] + '/evaluations.npz')
        average_rewards = np.mean(npzfile['results'], axis=1)[:, 0]
        with open(model_name[:-4] + "/eval_results.txt", "a") as f:
            for i in range(np.shape(average_rewards)[0]):
                f.write("{}, {}\n".format(npzfile['timesteps'][i], average_rewards[i]))
        evaluate_policy_on_env(env, model, render=False, iters=50)
    else:
        model.learn(total_timesteps=time_steps,
                    tb_log_name=model_name.split('/')[-1],
                    log_interval=10,)
        model.save(model_name)
        evaluate_policy_on_env(env, model, render=False, iters=50)

    # save the environment params
    if NORMALIZE:
        # env.save(model_name.replace('.pkl', 'stats_.pkl'))
        env.save('data/models/env_stats/'+env_name+'.pkl')

    print('done :: ', model_name)
    exit()
Example #26
#       extension: .py
#       format_name: light
#       format_version: '1.4'
#       jupytext_version: 1.2.4
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# +
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import ACKTR

# multiprocess environment
n_cpu = 4
env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

model = ACKTR(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=250000)
# -

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
Example #27
envTmp = gym.make('Battleships-v0', config=config)

#Wrap environment into a vector environment
env = DummyVecEnv([lambda: envTmp])

# Choose to display board
print("Diplay board: Yes (1), No (0)")
choiceRender = bool(int(input()))

# Choose Model
randomAgent = True
print("Choose Agent: Radom (1), ACKTR (2), DQN (3)")
choice = int(input())
if choice == 2:
    # Load ACKTR Model
    model = ACKTR.load("./ACKTR_Models/ACKTR_5x5_3_2_2_Dynamic.zip", verbose=0, env=env)
    # Disable Random Agent
    randomAgent = False

elif choice == 3:
    # load DQN Model
    model = DQN.load("./DQN_Models/DQN_5x5_3_2_2_Dynamic.zip", verbose=0, env=env)
    # Disable Random Agent
    randomAgent = False

# Inits result Array
results = []
# Iteration: Amount of played Games
for iteration in range(10):
    score = 0
    print('Iteration', iteration)
Example #28
     elif args.model == 'acer':
         model = ACER(policy,
                      env,
                      verbose=1,
                      n_steps=64,
                      tensorboard_log=out_dir)
     elif args.model == 'ppo':
         model = PPO2(policy,
                      env,
                      verbose=1,
                      n_steps=64,
                      tensorboard_log=out_dir)
     elif args.model == 'acktr':
         model = ACKTR(policy,
                       env,
                       n_steps=4,
                       verbose=1,
                       tensorboard_log=out_dir)
     elif args.model == 'ddpg':
         model = DDPG(policy, env, verbose=1, tensorboard_log=out_dir)
     elif args.model == 'a2c':
         model = A2C(policy,
                     env,
                     n_steps=64,
                     verbose=1,
                     tensorboard_log=out_dir)
     elif args.model == 'sac':
         model = SAC("CnnPolicy", env)
     train(model, env, out_dir)
 else:
     #results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "rl")
Example #29
#! /usr/bin/env python

import gym
gym.logger.set_level(40)
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

import env_yaw
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env
from stable_baselines.common.evaluation import evaluate_policy

env = gym.make("Yaw-v0")
env = make_vec_env(lambda: env, n_envs=1)

# model = ACKTR.load("models/acktr_goleft", env=env)
model = ACKTR('MlpPolicy', env, verbose=1)

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=100)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
Example #30
        if (n_steps + 1) % 100000 == 0:
            print("Saving checkpoint model")
            _locals['self'].save(model_dir + 'model_{}_steps.pkl'.format(n_steps + 1))

        n_steps += 1
        return True


    print('Starting Training')
    """
    ACKTR(policy, env, gamma=0.99, nprocs=1, n_steps=20, ent_coef=0.01, vf_coef=0.25,
     vf_fisher_coef=1.0, learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001,
      lr_schedule='linear', verbose=0, tensorboard_log=None, _init_setup_model=True,
       async_eigen_decomp=False, policy_kwargs=None, full_tensorboard_log=False)
    """

    model = ACKTR(policy=MlpPolicy, env=env, gamma=0.99, nprocs=1, n_steps=20,
                  ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25,
                  max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear', verbose=0,
                  tensorboard_log=None, _init_setup_model=True)

    model.learn(total_timesteps=num_timesteps, callback=custom_callback, seed=seed,
                log_interval=100)

    print('Starting evaluation')
    env = setup_env_cart_discrete(seed, log_dir)
    model.set_env(env)

    get_trajectories(model, trajectory_dir, n_trajectories=100)