Example #1
import copy
import random

import numpy as np

# assumed import: OrnsteinUhlenbeckActionNoise as provided by Stable-Baselines3
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise


class OUNoise:
    """Ornstein-Uhlenbeck process."""
    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """Initialize parameters and noise process."""
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.baseline_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(size),
                                                           sigma=sigma *
                                                           np.ones(size))
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        # self.decay()
        self.state = copy.copy(self.mu)
        self.baseline_noise.reset()

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.array(
            [random.random() for i in range(len(x))])
        self.state = x + dx
        return self.baseline_noise()  #self.state

    def decay(self):
        self.sigma = max(0.35, self.sigma * 0.99)
        self.theta = max(0.15, self.theta * 0.995)
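A minimal usage sketch for the class above (a hedged example: it assumes the imports added at the top of this snippet, and the policy call is only a placeholder):

noise = OUNoise(size=4, seed=0)
for step in range(5):
    sample = noise.sample()   # one noise value per action dimension
    # action = np.clip(policy(state) + sample, -1.0, 1.0)  # hypothetical policy call
noise.reset()                 # reset the process at the start of each episode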
Example #2
 def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
     """Initialize parameters and noise process."""
     self.mu = mu * np.ones(size)
     self.theta = theta
     self.sigma = sigma
     self.seed = random.seed(seed)
     self.baseline_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(size),
                                                        sigma=sigma *
                                                        np.ones(size))
     self.reset()
Example #3
    def _preprocess_action_noise(
        self, hyperparams: Dict[str, Any], saved_hyperparams: Dict[str, Any], env: VecEnv
    ) -> Dict[str, Any]:
        # Special case for HER
        algo = saved_hyperparams["model_class"] if self.algo == "her" else self.algo
        # Parse noise string
        if algo in ["ddpg", "sac", "td3", "tqc", "d3pg"] and hyperparams.get("noise_type") is not None:
            noise_type = hyperparams["noise_type"].strip()
            noise_std = hyperparams["noise_std"]

            # Save for later (hyperparameter optimization)
            self.n_actions = env.action_space.shape[0]

            if "normal" in noise_type:
                hyperparams["action_noise"] = NormalActionNoise(
                    mean=np.zeros(self.n_actions),
                    sigma=noise_std * np.ones(self.n_actions),
                )
            elif "ornstein-uhlenbeck" in noise_type:
                hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(self.n_actions),
                    sigma=noise_std * np.ones(self.n_actions),
                )
            else:
                raise RuntimeError(f'Unknown noise type "{noise_type}"')

            print(f"Applying {noise_type} noise with std {noise_std}")

            del hyperparams["noise_type"]
            del hyperparams["noise_std"]

        return hyperparams
Example #4
def test_vec_noise():
    num_envs = 4
    num_actions = 10
    mu = np.zeros(num_actions)
    sigma = np.ones(num_actions) * 0.4
    base: ActionNoise = OrnsteinUhlenbeckActionNoise(mu, sigma)
    with pytest.raises(ValueError):
        vec = VectorizedActionNoise(base, -1)
    with pytest.raises(ValueError):
        vec = VectorizedActionNoise(base, None)
    with pytest.raises(ValueError):
        vec = VectorizedActionNoise(base, "whatever")

    vec = VectorizedActionNoise(base, num_envs)
    assert vec.n_envs == num_envs
    assert vec().shape == (num_envs, num_actions)
    assert not (vec() == base()).all()
    with pytest.raises(ValueError):
        vec = VectorizedActionNoise(None, num_envs)
    with pytest.raises(TypeError):
        vec = VectorizedActionNoise(12, num_envs)
    with pytest.raises(AssertionError):
        vec.noises = []
    with pytest.raises(TypeError):
        vec.noises = None
    with pytest.raises(ValueError):
        vec.noises = [None] * vec.n_envs
    with pytest.raises(AssertionError):
        vec.noises = [base] * (num_envs - 1)
    assert all(isinstance(noise, type(base)) for noise in vec.noises)
    assert len(vec.noises) == num_envs
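In normal use the wrapper exercised by this test is built once and handed to an off-policy algorithm running on a vectorized environment. A hedged sketch of the construction (shapes follow the assertions above; whether the algorithm accepts a pre-vectorized noise object depends on the Stable-Baselines3 version):

import numpy as np

from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise, VectorizedActionNoise

n_envs, n_actions = 4, 2
base = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
vec_noise = VectorizedActionNoise(base, n_envs)  # one independent copy of the base noise per env

batch = vec_noise()   # shape (n_envs, n_actions): one noise row per parallel environment
vec_noise.reset()     # resets every underlying noise process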
Example #5
    def train_DDPG(self, model_name, model_params=DDPG_PARAMS):
        """DDPG model"""
        from stable_baselines3.ddpg.ddpg import DDPG
        # from stable_baselines3.ddpg.policies import DDPGPolicy
        from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise

        env_train = self.env

        n_actions = env_train.action_space.shape[-1]
        # param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) *
                                                    np.ones(n_actions))

        start = time.time()
        model = DDPG(
            'MlpPolicy',
            env_train,
            batch_size=model_params['batch_size'],
            buffer_size=model_params['buffer_size'],
            # param_noise=param_noise,
            action_noise=action_noise,
            verbose=model_params['verbose'],
            tensorboard_log=f"{zvt_env['log_path']}/{model_name}")
        model.learn(total_timesteps=model_params['timesteps'],
                    tb_log_name="DDPG_run")
        end = time.time()

        model.save(f"{zvt_env['model_path']}/{model_name}")
        print('Training time (DDPG): ', (end - start) / 60, ' minutes')
        return model
Example #6
 def create_model(env, algorithm, save_path):
     # the noise object
     n_actions = env.action_space.shape[-1]
     action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                 sigma=float(0.2) *
                                                 np.ones(n_actions),
                                                 theta=0.15)
     if algorithm == "ddpg":
         return DDPG(DDPG_MlpPolicy,
                     env,
                     learning_rate=0.001,
                     buffer_size=1000000,
                     batch_size=64,
                     tau=0.001,
                     gamma=0.99,
                     train_freq=(10, "step"),
                     action_noise=action_noise,
                     policy_kwargs=dict(optimizer_class=th.optim.AdamW),
                     tensorboard_log=save_path)
     elif algorithm == "td3":
         return TD3(TD3_MlpPolicy,
                    env,
                    action_noise=action_noise,
                    tensorboard_log=save_path)
     elif algorithm == "sac":
         return SAC(SAC_MlpPolicy,
                    env,
                    action_noise=action_noise,
                    tensorboard_log=save_path)
     else:
         raise Exception("--> Alican's LOG: Unknown agent type!")
Example #7
def sample_td3_params(trial):
    """
    Sampler for TD3 hyperparams.
    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical(
        'gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size',
                                           [16, 32, 64, 100, 128, 256, 512])
    buffer_size = trial.suggest_categorical(
        'buffer_size', [int(1e4), int(1e5), int(1e6)])

    episodic = trial.suggest_categorical('episodic', [True, False])

    if episodic:
        n_episodes_rollout = 1
        train_freq, gradient_steps = -1, -1
    else:
        train_freq = trial.suggest_categorical('train_freq',
                                               [1, 16, 128, 256, 1000, 2000])
        gradient_steps = train_freq
        n_episodes_rollout = -1

    noise_type = trial.suggest_categorical(
        'noise_type', ['ornstein-uhlenbeck', 'normal', None])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)

    net_arch = trial.suggest_categorical('net_arch',
                                         ["small", "medium", "big"])
    # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])

    net_arch = {
        'small': [64, 64],
        'medium': [256, 256],
        'big': [400, 300],
    }[net_arch]

    hyperparams = {
        'gamma': gamma,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'buffer_size': buffer_size,
        'train_freq': train_freq,
        'gradient_steps': gradient_steps,
        'n_episodes_rollout': n_episodes_rollout,
        'policy_kwargs': dict(net_arch=net_arch),
    }

    if noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))

    return hyperparams
Example #8
    def _preprocess_action_noise(self, hyperparams: Dict[str, Any],
                                 saved_hyperparams: Dict[str, Any],
                                 env: VecEnv) -> Dict[str, Any]:
        # Parse noise string
        # Note: only off-policy algorithms are supported
        if hyperparams.get("noise_type") is not None:
            noise_type = hyperparams["noise_type"].strip()
            noise_std = hyperparams["noise_std"]

            # Save for later (hyperparameter optimization)
            self.n_actions = env.action_space.shape[0]

            if "normal" in noise_type:
                hyperparams["action_noise"] = NormalActionNoise(
                    mean=np.zeros(self.n_actions),
                    sigma=noise_std * np.ones(self.n_actions),
                )
            elif "ornstein-uhlenbeck" in noise_type:
                hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(self.n_actions),
                    sigma=noise_std * np.ones(self.n_actions),
                )
            else:
                raise RuntimeError(f'Unknown noise type "{noise_type}"')

            print(f"Applying {noise_type} noise with std {noise_std}")

            del hyperparams["noise_type"]
            del hyperparams["noise_std"]

        return hyperparams
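For reference, the method above expects the noise to arrive as two plain entries in the hyperparameter dictionary and replaces them with a ready-made action_noise object. A hedged sketch of such an input (keys follow the code above; the values are illustrative only):

hyperparams = {
    "policy": "MlpPolicy",
    "noise_type": "ornstein-uhlenbeck",  # or any string containing "normal"
    "noise_std": 0.5,
    # ... remaining algorithm keyword arguments ...
}
# after _preprocess_action_noise(hyperparams, saved_hyperparams, env):
#   - "noise_type" and "noise_std" are deleted
#   - hyperparams["action_noise"] holds an OrnsteinUhlenbeckActionNoise instance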
Example #9
def sample_ddpg_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for DDPG hyperparams.

    :param trial:
    :return:
    """
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("lr", 1e-5, 1)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 256, 512, 1024, 2048])
    buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)])
    # Polyak coeff
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02])

    episodic = trial.suggest_categorical("episodic", [True, False])

    if episodic:
        n_episodes_rollout = 1
        train_freq, gradient_steps = -1, -1
    else:
        train_freq = trial.suggest_categorical("train_freq", [1, 16, 128, 256, 1000, 2000])
        gradient_steps = train_freq
        n_episodes_rollout = -1

    noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None])
    noise_std = trial.suggest_uniform("noise_std", 0, 1)

    # NOTE: Add "verybig" to net_arch when tuning HER (see TD3)
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
    # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])

    net_arch = {
        "small": [64, 64],
        "medium": [256, 256],
        "big": [400, 300],
    }[net_arch]

    hyperparams = {
        "gamma": gamma,
        "tau": tau,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "buffer_size": buffer_size,
        "train_freq": train_freq,
        "gradient_steps": gradient_steps,
        "n_episodes_rollout": n_episodes_rollout,
        "policy_kwargs": dict(net_arch=net_arch),
    }

    if noise_type == "normal":
        hyperparams["action_noise"] = NormalActionNoise(
            mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)
        )
    elif noise_type == "ornstein-uhlenbeck":
        hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)
        )

    return hyperparams
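Note that trial.n_actions is not a built-in Optuna attribute: it has to be attached to the trial before the sampler runs, mirroring how Examples #3 and #8 store the action dimension for later hyperparameter optimization. A hedged sketch of that wiring (the objective body is a placeholder):

import optuna

def make_objective(env, sampler):
    def objective(trial: optuna.Trial) -> float:
        # expose the action dimension so sample_ddpg_params / sample_td3_params can build the noise
        trial.n_actions = env.action_space.shape[0]
        hyperparams = sampler(trial)
        # ... build, train and evaluate the model with hyperparams ...
        return 0.0
    return objective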
Example #10
def main():
    """
   # Example with Vectorized env
   num_cpu = 4  # Number of processes to use
   my_env_kwargs={'renders': False}
   env = make_vec_env('panda-ip-reach-v0', n_envs=num_cpu, env_kwargs=my_env_kwargs)
   """

    # Example with a simple Dummy vec env
    env = gym.envs.make('panda-ip-reach-v0', renders=False)
    env = DummyVecEnv([lambda: env])

    #check_env(pandaenv)

    # The noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    print("n_actions = {0}".format(n_actions))

    #action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=0.1 * np.ones(n_actions))

    model = DDPG(policy='MlpPolicy',
                 env=env,
                 learning_rate=0.001,
                 buffer_size=1000000,
                 learning_starts=100,
                 batch_size=100,
                 tau=0.005,
                 gamma=0.99,
                 train_freq=1,
                 gradient_steps=-1,
                 action_noise=action_noise,
                 optimize_memory_usage=False,
                 tensorboard_log="./ddpg_panda_reach_tensorboard/",
                 create_eval_env=False,
                 policy_kwargs=None,
                 verbose=1,
                 seed=None,
                 device='auto',
                 _init_setup_model=True)
    """
   print("start model evaluation without learning !")
   mean_reward_before, std_reward_before = evaluate_policy(model, env, n_eval_episodes=1)
   print("end model evaluation !")
   """
    print("start model learning !")
    model.learn(total_timesteps=200000, log_interval=10)
    print("end model learning !")

    print("-> model saved !!")
    model.save("ddpg_panda_reach")
    """
   print("start model evaluation with learning !")
   mean_reward_after, std_reward_after = evaluate_policy(model, env, n_eval_episodes=1)
   print("end model evaluation !")
   """
    """
Example #11
def sample_td3_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for TD3 hyperparams.

    :param trial:
    :return:
    """
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 256, 512, 1024, 2048])
    buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)])
    # Polyak coeff
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02, 0.05, 0.08])

    train_freq = trial.suggest_categorical("train_freq", [1, 4, 8, 16, 32, 64, 128, 256, 512])
    gradient_steps = train_freq

    noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None])
    noise_std = trial.suggest_uniform("noise_std", 0, 1)

    # NOTE: Add "verybig" to net_arch when tuning HER
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
    # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])

    net_arch = {
        "small": [64, 64],
        "medium": [256, 256],
        "big": [400, 300],
        # Uncomment for tuning HER
        # "verybig": [256, 256, 256],
    }[net_arch]

    hyperparams = {
        "gamma": gamma,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "buffer_size": buffer_size,
        "train_freq": train_freq,
        "gradient_steps": gradient_steps,
        "policy_kwargs": dict(net_arch=net_arch),
        "tau": tau,
    }

    if noise_type == "normal":
        hyperparams["action_noise"] = NormalActionNoise(
            mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)
        )
    elif noise_type == "ornstein-uhlenbeck":
        hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)
        )

    if trial.using_her_replay_buffer:
        hyperparams = sample_her_params(trial, hyperparams)

    return hyperparams
Example #12
def objective(trial):
    noise = trial.suggest_uniform('Noise', 0.1, 0.8)
    timesteps = trial.suggest_int('Timesteps', 10, 100)

    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(noise) *
                                                np.ones(n_actions))
    model = DDPG('MlpPolicy', env, action_noise=action_noise)
    model.learn(total_timesteps=timesteps * 1000, log_interval=1000)

    return test_model(env, model, '')
Example #13
def train_DDPG(env_train, model_name, timesteps=10000):
    """DDPG model"""

    # add the noise objects for DDPG
    n_actions = env_train.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

    start = time.time()
    model = DDPG('MlpPolicy', env_train, action_noise=action_noise)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end-start)/60,' minutes')
    return model
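The archive saved above can later be reloaded for evaluation. A minimal sketch (the path, environment and deterministic flag are placeholders, not part of the original snippet):

from stable_baselines3 import DDPG

loaded = DDPG.load(f"{config.TRAINED_MODEL_DIR}/{model_name}", env=env_train)
obs = env_train.reset()
action, _states = loaded.predict(obs, deterministic=True)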
Example #14
def train_DDPG(env_train, model_name, timesteps=10000):
    """DDPG model"""
    # the noise objects for DDPG
    n_actions = env_train.action_space.shape[-1]
    # action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))

    start = time.time()
    # note: the "param_noise" keyword from stable-baselines (v2) was removed;
    # stable_baselines3 does not use it
    model = DDPG('MlpPolicy', env_train, action_noise=action_noise)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end - start) / 60, ' minutes')
    return model
Example #15
def action_noise(hyper, algo, n_actions):
    """
    Configure Action Noise from hyperparameter logs
    """
    if hyper['params_episodic']:
        hyper['params_train_freq'] = (1, "episode")
    else:
        hyper['params_train_freq'] = (int(hyper['params_train_freq']), "step")

    if hyper["params_noise_type"] == "normal":
        hyper["params_action_noise"] = NormalActionNoise(
            mean=np.zeros(n_actions),
            sigma=hyper['params_noise_std'] * np.ones(n_actions))
    elif hyper["params_noise_type"] == "ornstein-uhlenbeck":
        hyper["params_action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions),
            sigma=hyper['params_noise_std'] * np.ones(n_actions))
    else:
        hyper["params_action_noise"] = None
    return hyper
Example #16
def action_noise(hyper, algo, n_actions):
    """
    Configure Action Noise from hyperparameter logs
    """
    if hyper['params_episodic']:
        hyper['params_n_episodes_rollout'] = 1
        hyper['params_train_freq'], hyper['params_gradient_steps'] = -1, -1
    else:
        hyper['params_gradient_steps'] = hyper['params_train_freq']
        hyper['params_n_episodes_rollout'] = -1

    if hyper["params_noise_type"] == "normal":
        hyper["params_action_noise"] = NormalActionNoise(
            mean=np.zeros(n_actions),
            sigma=hyper['params_noise_std'] * np.ones(n_actions))
    elif hyper["params_noise_type"] == "ornstein-uhlenbeck":
        hyper["params_action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions),
            sigma=hyper['params_noise_std'] * np.ones(n_actions))
    else:
        hyper["params_action_noise"] = None
    return hyper
Example #17
def make_model(config, env):
    policy = config["policy_name"]

    if config["policy_name"] == "CustomTCNPolicy":
        policy = customActorCriticPolicyWrapper(
            env.observation_space.shape[0] // config["obs_input"],
            config["obs_input"])

    tb_log = None
    if config["tensorboard_log"]:
        tb_log = "./tb/{}/".format(config["session_ID"])

    ou_noise = None
    if config["ou_noise"]:
        ou_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(env.action_space.shape[0]),
            sigma=config["ou_sigma"] * np.ones(env.action_space.shape[0]),
            theta=config["ou_theta"],
            dt=config["ou_dt"],
            initial_noise=None)

    model = TD3(policy=policy,
                env=env,
                buffer_size=config["buffer_size"],
                learning_starts=config["learning_starts"],
                action_noise=ou_noise,
                target_policy_noise=config["target_policy_noise"],
                target_noise_clip=config["target_noise_clip"],
                gamma=config["gamma"],
                tau=config["tau"],
                learning_rate=eval(config["learning_rate"]),
                verbose=config["verbose"],
                tensorboard_log=tb_log,
                device="cpu",
                policy_kwargs=dict(net_arch=[
                    int(config["policy_hid_dim"]),
                    int(config["policy_hid_dim"])
                ]))

    return model
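make_model only reads a handful of keys from config. A hedged sketch of a matching configuration dictionary (all values are illustrative and not taken from the original project):

config = {
    "policy_name": "MlpPolicy",
    "obs_input": 1,                  # only used by the CustomTCNPolicy branch
    "tensorboard_log": False,
    "session_ID": "run_000",
    "ou_noise": True,
    "ou_sigma": 0.2,
    "ou_theta": 0.15,
    "ou_dt": 0.01,
    "buffer_size": 100_000,
    "learning_starts": 1_000,
    "target_policy_noise": 0.2,
    "target_noise_clip": 0.5,
    "gamma": 0.99,
    "tau": 0.005,
    "learning_rate": "0.001",        # passed through eval() by make_model
    "verbose": 1,
    "policy_hid_dim": 128,
}
# model = make_model(config, env)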
Example #18
import numpy as np
import pytest

from stable_baselines3 import A2C, PPO, SAC, TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

normal_action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))


@pytest.mark.parametrize('action_noise', [normal_action_noise, OrnsteinUhlenbeckActionNoise(np.zeros(1), 0.1 * np.ones(1))])
def test_td3(action_noise):
    model = TD3('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]),
                learning_starts=100, verbose=1, create_eval_env=True, action_noise=action_noise)
    model.learn(total_timesteps=1000, eval_freq=500)


@pytest.mark.parametrize("env_id", ['CartPole-v1', 'Pendulum-v0'])
def test_a2c(env_id):
    model = A2C('MlpPolicy', env_id, seed=0, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True)
    model.learn(total_timesteps=1000, eval_freq=500)


@pytest.mark.parametrize("env_id", ['CartPole-v1', 'Pendulum-v0'])
@pytest.mark.parametrize("clip_range_vf", [None, 0.2, -0.2])
def test_ppo(env_id, clip_range_vf):
    if clip_range_vf is not None and clip_range_vf < 0:
        # Should throw an error
        with pytest.raises(AssertionError):
            model = PPO('MlpPolicy', env_id, seed=0, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True,
                        clip_range_vf=clip_range_vf)
    else:
Example #19
import gym
import numpy as np
import pytest

from stable_baselines3 import A2C, DDPG, DQN, PPO, SAC, TD3
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

normal_action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))


@pytest.mark.parametrize("model_class", [TD3, DDPG])
@pytest.mark.parametrize("action_noise", [
    normal_action_noise,
    OrnsteinUhlenbeckActionNoise(np.zeros(1), 0.1 * np.ones(1))
])
def test_deterministic_pg(model_class, action_noise):
    """
    Test for DDPG and variants (TD3).
    """
    model = model_class(
        "MlpPolicy",
        "Pendulum-v0",
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_starts=100,
        verbose=1,
        create_eval_env=True,
        buffer_size=250,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=300, eval_freq=250)
Example #20
def run_model_stablebaseline(flow_params,
                             num_cpus=1,
                             rollout_size=50,
                             num_steps=50,
                             algorithm="ppo",
                             exp_config=None):
    """Run the model for num_steps if provided.
    Parameters
    ----------
    flow_params : dict
        flow-specific parameters
    num_cpus : int
        number of CPUs used during training
    rollout_size : int
        length of a single rollout
    num_steps : int
        total number of training steps
    algorithm : str
        training algorithm to use ("PPO" or "DDPG")
    exp_config : str, optional
        name of the experiment configuration (used to pick DDPG settings)
    The total rollout length is rollout_size.
    Returns
    -------
    stable_baselines.*
        the trained model
    """
    from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

    if num_cpus == 1:
        constructor = env_constructor(params=flow_params, version=0)()
        # The algorithms require a vectorized environment to run
        env = DummyVecEnv([lambda: constructor])
    else:
        env = SubprocVecEnv([
            env_constructor(params=flow_params, version=i)
            for i in range(num_cpus)
        ])
    if algorithm == "PPO":
        from stable_baselines3 import PPO
        train_model = PPO('MlpPolicy', env, verbose=1, n_steps=rollout_size)
        train_model.learn(total_timesteps=num_steps)
        print("Learning Process is Done.")
        return train_model

    elif algorithm == "DDPG":
        from stable_baselines3 import DDPG
        from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
        import numpy as np
        if exp_config == 'singleagent_figure_eight':
            train_model = DDPG(
                'MlpPolicy',
                env,
                verbose=1,
                n_episodes_rollout=rollout_size,
                learning_starts=3000,
                learning_rate=0.0001,
                action_noise=OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(1),
                    sigma=0.15 * np.ones(1),
                    initial_noise=0.7 * np.ones(1)),
                tau=0.005,
                batch_size=128,
                tensorboard_log='tensorboard_ddpg',
                device='cuda',
            )
        else:
            train_model = DDPG(
                'MlpPolicy',
                env,
                verbose=1,
                n_episodes_rollout=rollout_size,
                learning_starts=1200,
                tensorboard_log='tensorboard_ddpg',
                learning_rate=0.0001,
                action_noise=OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(1),
                    sigma=0.15 * np.ones(1),
                    initial_noise=0.7 * np.ones(1)),
                tau=0.005,
                batch_size=512,
                device='cpu',
            )

        from tensorboard_baselines.callbacks_ddpg import TensorboardCallback
        train_model.learn(
            total_timesteps=num_steps,
            log_interval=2,
            eval_log_path='ddpg_log',
            eval_freq=10,
            #callback=[TensorboardCallback],
        )
        print("Learning Process is Done.")
        return train_model
Example #21
        n_actions = env.action_space.shape[0]
        if 'normal' in noise_type:
            if 'lin' in noise_type:
                final_sigma = hyperparams.get('noise_std_final',
                                              0.0) * np.ones(n_actions)
                hyperparams['action_noise'] = LinearNormalActionNoise(
                    mean=np.zeros(n_actions),
                    sigma=noise_std * np.ones(n_actions),
                    final_sigma=final_sigma,
                    max_steps=n_timesteps)
            else:
                hyperparams['action_noise'] = NormalActionNoise(
                    mean=np.zeros(n_actions),
                    sigma=noise_std * np.ones(n_actions))
        elif 'ornstein-uhlenbeck' in noise_type:
            hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions))
        else:
            raise RuntimeError(f'Unknown noise type "{noise_type}"')
        print(f"Applying {noise_type} noise with std {noise_std}")
        del hyperparams['noise_type']
        del hyperparams['noise_std']
        if 'noise_std_final' in hyperparams:
            del hyperparams['noise_std_final']

    if args.trained_agent.endswith('.zip') and os.path.isfile(
            args.trained_agent):
        # Continue training
        print("Loading pretrained agent")
        # Policy should not be changed
        del hyperparams['policy']
Example #22
import gym
import gym_conservation
import numpy as np

from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.vec_env import VecNormalize

env_id = "conservation-v6"  #"fishing-v1"
algo = "td3"
outdir = "results"
total_timesteps = 1500000
verbose = 0
seed = 0
tensorboard_log = "/var/log/tensorboard/single"
log_dir = "logs"

noise_std = 0.4805935357322933
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(1),
                                            sigma=noise_std * np.ones(1))
hyper = {
    "gamma": 0.995,
    "learning_rate": 8.315382409902049e-05,
    "batch_size": 512,
    "buffer_size": 10000,
    "train_freq": 1000,
    "gradient_steps": 1000,
    "n_episodes_rollout": -1,
    "action_noise": action_noise,
    "policy_kwargs": {
        "net_arch": [64, 64]
    }
}

#norm_env = VecNormalize(make_vec_env(env_id), gamma = hyper["gamma"])
Example #23
    f.close()

# A2C algorithm
for i in range(n_tests):
    test_name = 'saved_models/a2c_soccer_actions_env_1_' + str(i)
    n_actions = env.action_space.shape[-1]
    model = A2C('MlpPolicy', env)
    model.learn(total_timesteps=25000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

# DDPG algorithm
for i in range(n_tests):
    test_name = 'saved_models/ddpg_soccer_actions_env_1_' + str(i)
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.3) * np.ones(n_actions))
    model = DDPG('MlpPolicy', env, action_noise=action_noise)
    model.learn(total_timesteps=10000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

for i in range(n_tests):
    test_name = 'saved_models/ddpg_soccer_actions_env_2_' + str(i)
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.3) * np.ones(n_actions))
    policy_kwargs = dict(net_arch=[400, 300])
    model = DDPG('MlpPolicy', env, action_noise=action_noise, policy_kwargs=policy_kwargs)
    model.learn(total_timesteps=10000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)
Example #24
if hyper['episodic']:
    hyper['n_episodes_rollout'] = 1
    hyper['train_freq'], hyper['gradient_steps'] = -1, -1
else:
    hyper['train_freq'] = hyper['train_freq']
    hyper['gradient_steps'] = hyper['train_freq']
    hyper['n_episodes_rollout'] = -1

n_actions = env.action_space.shape[0]
if hyper["noise_type"] == "normal":
    hyper["action_noise"] = NormalActionNoise(mean=np.zeros(n_actions),
                                              sigma=hyper['noise_std'] *
                                              np.ones(n_actions))
elif hyper["noise_type"] == "ornstein-uhlenbeck":
    hyper["action_noise"] = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=hyper['noise_std'] * np.ones(n_actions))

model = DDPG('MlpPolicy',
             env,
             verbose=0,
             tensorboard_log=tensorboard_log,
             seed=seed,
             gamma=hyper['gamma'],
             learning_rate=hyper['lr'],
             batch_size=hyper['batch_size'],
             buffer_size=hyper['buffer_size'],
             action_noise=hyper['action_noise'],
             train_freq=hyper['train_freq'],
             gradient_steps=hyper['train_freq'],
             n_episodes_rollout=hyper['n_episodes_rollout'],
Example #25
    #### Create custom policy ##########################################################################
    CustomPolicy = MlpPolicy
    CustomPolicy.layers = [64, 64, 32]  # actor network has layers [64, 64, 32]

    #### Check the environment's spaces ################################################################
    env = RLTetherAviary(gui=False, record=False)
    env = Monitor(env, log_dir)
    print("[INFO] Action space:", env.action_space)
    print("[INFO] Observation space:", env.observation_space)
    print("[INFO] Checking Environment...")
    check_env(env, warn=True, skip_render_check=True)

    ####
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(env.N_ACTIONS),
                                                sigma=0.1 *
                                                np.ones(env.N_ACTIONS),
                                                dt=0.005)

    #### Create the callback: check every 1000 steps
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                log_dir=log_dir)

    #### Train the model ###############################################################################
    model = DDPG(CustomPolicy,
                 env,
                 verbose=1,
                 batch_size=64,
                 action_noise=action_noise)

    for i in range(step_iters):  # run for step_iters * training_timesteps
Example #26
import gym
import numpy as np

from stable_baselines3 import TD3
from stable_baselines3.td3.policies import MlpPolicy
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

from stable_baselines3.common.evaluation import evaluate_policy

env = gym.make('Pendulum-v0')

# check env
#from stable_baselines3.common.env_checker import check_env
#check_env(env)

# The noise objects for TD3
n_actions = env.action_space.shape[-1]
#action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

model = TD3(MlpPolicy, env, action_noise=action_noise, verbose=1)

print("start model evaluation without learning !")
mean_reward_before, std_reward_before = evaluate_policy(model, env, n_eval_episodes=100)
print("end model evaluation !")

print("start model learning !")
model.learn(total_timesteps=10000, log_interval=10)
print("end model learning !")

print("-> model saved !!")
model.save("td3_pendulum")

print("start model evaluation with learning !")