def action_noise(hyper, algo, n_actions):
    """
  Configure Action Noise from hyperparameter logs
  """
    if hyper['params_episodic']:
        hyper['params_train_freq'] = (1, "episode")
    else:
        hyper['params_train_freq'] = (int(hyper['params_train_freq']), "step")

    if hyper["params_noise_type"] == "normal":
        hyper["params_action_noise"] = NormalActionNoise(
            mean=np.zeros(n_actions),
            sigma=hyper['params_noise_std'] * np.ones(n_actions))
    elif hyper["params_noise_type"] == "ornstein-uhlenbeck":
        hyper["params_action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(n_actions),
            sigma=hyper['params_noise_std'] * np.ones(n_actions))
    else:
        hyper["params_action_noise"] = None
    return hyper
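A minimal usage sketch for the helper above, assuming an SB3 release that accepts tuple train_freq values, a hypothetical hyperparameter record read from an Optuna study export, and Pendulum-v0 as a stand-in continuous-control task:

import gym
import numpy as np
from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

env = gym.make('Pendulum-v0')

# Hypothetical hyperparameter record (keys prefixed with 'params_', as in an Optuna export)
hyper = {
    'params_episodic': False,
    'params_train_freq': 64,
    'params_noise_type': 'normal',
    'params_noise_std': 0.1,
}
n_actions = env.action_space.shape[-1]
hyper = action_noise(hyper, 'td3', n_actions)

model = TD3('MlpPolicy',
            env,
            train_freq=hyper['params_train_freq'],
            action_noise=hyper['params_action_noise'],
            verbose=1)
model.learn(total_timesteps=10000)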
Example #2
def test_stable_DDPG(env_name, request):

    env = request.getfixturevalue(env_name)
    # DDPG must fail in discrete environments
    if env_name == 'env_demo':
        with pytest.raises(IndexError):
            env.action_space.shape[-1]
        with pytest.raises(AssertionError):
            model = stable_baselines3.DDPG("MlpPolicy", env, verbose=1)
    else:
        # Action noise
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        # model
        model = stable_baselines3.DDPG("MlpPolicy",
                                       env,
                                       action_noise=action_noise,
                                       verbose=1)

        model.learn(total_timesteps=TIMESTEPS)

        # Check model state
        assert model.action_space == env.action_space
        assert model.env.action_space == env.action_space

        assert isinstance(model.policy,
                          stable_baselines3.td3.policies.TD3Policy)

        # Check model works

        obs = env.reset()
        a, _ = model.predict(obs)
        obs, reward, done, info = env.step(a)

        assert reward is not None and reward < 0
        assert a is not None
        assert isinstance(done, bool)
        assert info['timestep'] == 1

        env.close()
Example #3
def train_td3():

    log_dir = f"model_save/"
    env = ENV_CONTINUE(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True,
    #                clip_obs=10.)

    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    # model = TD3("MlpPolicy", env, action_noise=action_noise, verbose=1, batch_size=2048, seed=1)

    model = TD3('MlpPolicy', env, verbose=1, batch_size=2048, seed=1)
    callback = SaveOnBestTrainingRewardCallback(check_freq=480,
                                                log_dir=log_dir)
    model.learn(total_timesteps=int(100000),
                callback=callback,
                log_interval=100)
    model.save('model_save/td3')
Example #4
def test_sac(ent_coef, i):
    model = SAC(
        "MlpPolicy",
        "Pendulum-v0",
        policy_kwargs=dict(net_arch=[64, 64]),
        learning_starts=5e3,
        verbose=1,
        create_eval_env=True,
        buffer_size=1000000,
        ent_coef=ent_coef,
        action_noise=NormalActionNoise(np.zeros(1), np.zeros(1))  #,
        #tensorboard_log="./sac_pendulum_tensorboard/"
    )
    eval_env = gym.make('Pendulum-v0')
    eval_callback = EvalCallback(eval_env,
                                 best_model_save_path='./logs/',
                                 log_path='./logs/',
                                 eval_freq=250,
                                 deterministic=True,
                                 render=False)
    model.learn(total_timesteps=20000, callback=eval_callback)
def test_continuous(model_class):
    env = IdentityEnvBox(eps=0.5)

    n_steps = {
        A2C: 3500,
        PPO: 3000,
        SAC: 700,
        TD3: 500,
        DDPG: 500
    }[model_class]

    kwargs = dict(policy_kwargs=dict(net_arch=[64, 64]), seed=0, gamma=0.95)
    if model_class in [TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        kwargs["action_noise"] = action_noise

    model = model_class("MlpPolicy", env, **kwargs).learn(n_steps)

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
def action_noise(hyper, algo, n_actions):
  """
  Configure Action Noise from hyperparameter logs
  """
  if hyper['params_episodic']:
      hyper['params_n_episodes_rollout'] = 1
      hyper['params_train_freq'], hyper['params_gradient_steps'] = -1, -1
  else:
      hyper['params_train_freq'] = hyper['params_train_freq']
      hyper['params_gradient_steps'] = hyper['params_train_freq']
      hyper['params_n_episodes_rollout'] = -1
      
  if hyper["params_noise_type"] == "normal":  
    hyper["params_action_noise"] = NormalActionNoise(
      mean=np.zeros(n_actions), sigma= hyper['params_noise_std'] * np.ones(n_actions))
  elif hyper["params_noise_type"] == "ornstein-uhlenbeck":
    hyper["params_action_noise"] = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions), sigma= hyper['params_noise_std'] * np.ones(n_actions))
  else:
    hyper["params_action_noise"] = None
  return hyper
def test_sac2():
    reward = []
    for i in [6000, 8000, 10000]:
        model = SAC("MlpPolicy",
                    "Pendulum-v0",
                    policy_kwargs=dict(net_arch=[64, 64]),
                    learning_starts=5000,
                    verbose=0,
                    create_eval_env=True,
                    buffer_size=i,
                    ent_coef=0,
                    action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
                    batch_size=32)
        eval_env = gym.make('Pendulum-v0')
        eval_callback = EvalCallback(eval_env,
                                     best_model_save_path='./logs/',
                                     log_path='./logs/alpha4_histogram',
                                     eval_freq=250,
                                     n_eval_episodes=5,
                                     deterministic=True,
                                     render=False)
        model.learn(total_timesteps=20000, callback=eval_callback)
        reward.append(eval_callback.last_mean_reward)
        hist, bins = np.histogram(model.replay_buffer.rewards, bins=500)
        x = []
        for h in range(len(hist)):
            for j in range(hist[h]):
                x.append(bins[h])
        plt.hist(x, bins=bins)
        plt.xlabel("reward")
        plt.ylabel("population")
        plt.title(
            "last mean reward = {:.2f} +/- {:.2f}, replay size = {}".format(
                reward[-1], eval_callback.last_std, i))
        plt.legend()
        plt.show()

    return reward
Example #8
def main():
    # Create log dir
    log_dir = './sac_data'
    os.makedirs(log_dir, exist_ok=True)

    vix_env = trading_vix_env.trading_vix_env()
    env = Monitor(vix_env, log_dir)

    # Create action noise (a TD3/DDPG-style recipe; SAC already explores via its stochastic policy)
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    # Create the callback: check every 20000 steps
    callback = custom_call_back.CustomCallback(check_freq=20000,
                                               log_dir=log_dir)
    # Create RL model
    model = SAC('MlpPolicy',
                env,
                action_noise=action_noise,
                verbose=2,
                batch_size=10000)
    # Train the agent
    model.learn(total_timesteps=int(5e9), callback=callback)
def test_goal_selection_strategy(goal_selection_strategy, online_sampling):
    """
    Test different goal strategies.
    """
    env = BitFlippingEnv(continuous=True)

    normal_action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))

    model = HER(
        "MlpPolicy",
        env,
        SAC,
        goal_selection_strategy=goal_selection_strategy,
        online_sampling=online_sampling,
        gradient_steps=1,
        train_freq=4,
        max_episode_length=10,
        policy_kwargs=dict(net_arch=[64]),
        learning_starts=100,
        action_noise=normal_action_noise,
    )
    assert model.action_noise is not None
    model.learn(total_timesteps=300)
Example #10
def train_DDPG(env):

    print(f"action space shape -1:{env.action_space.shape[-1]}")

    # The noise object for DDPG (a deterministic-policy algorithm)
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.02 * np.ones(n_actions))

    model = DDPG(
        'MlpPolicy',
        env,
        learning_rate=0.0003,
        learning_starts=5,
        train_freq=10,
        n_episodes_rollout=-1,
        buffer_size=100000,
        action_noise=action_noise,
        batch_size=128,
        verbose=2,
    )
    model.learn(total_timesteps=1000000, log_interval=1)

    model.save("DDPG_pkl")
mdp = OvercookedGridworld.from_layout_name("cramped_room_single")
base_env = OvercookedEnv.from_mdp(mdp, horizon=1e4)
env = gym.make('Overcooked-single-v0')
env.custom_init(base_env, base_env.lossless_state_encoding_mdp_single)
env = Monitor(env, "./her_overcooked/", allow_early_resets=True)

# Available strategies (cf paper): future, final, episode
# goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# If True the HER transitions will get sampled online
online_sampling = True
# Time limit for the episodes
max_episode_length = 50

action_noise = NormalActionNoise(mean=np.zeros(1), sigma=0.3 * np.ones(1))

# Initialize the model
model = HER(
    "MlpPolicy",
    env,
    model_class,
    n_sampled_goal=4,
    goal_selection_strategy=goal_selection_strategy,
    # IMPORTANT: because the env is not wrapped with a TimeLimit wrapper
    # we have to manually specify the max number of steps per episode
    max_episode_length=max_episode_length,
    verbose=1,
    buffer_size=int(1e6),
    learning_rate=1e-3,
    gamma=0.95,
Example #12
                    policy_kwargs=onpolicy_kwargs,
                    tensorboard_log=filename+'/tb/',
                    verbose=1
                    ) if ARGS.obs == ObservationType.KIN else PPO(a2cppoCnnPolicy,
                                                                  train_env,
                                                                  policy_kwargs=onpolicy_kwargs,
                                                                  tensorboard_log=filename+'/tb/',
                                                                  verbose=1
                                                                  )

    #### Off-policy algorithms #################################
    offpolicy_kwargs = dict(activation_fn=torch.nn.ReLU,
                            # net_arch=[512, 512, 256, 128]
                            net_arch=[400, 300]
                            )  # or None # or dict(net_arch=dict(qf=[256, 128, 64, 32], pi=[256, 128, 64, 32]))
    action_noise = NormalActionNoise(np.array([0.0]), np.array([0.2]))

    if ARGS.algo == 'sac':
        model = SAC(sacMlpPolicy,
                    train_env,
                    policy_kwargs=offpolicy_kwargs,
                    tensorboard_log=filename+'/tb/',
                    verbose=1
                    ) if ARGS.obs == ObservationType.KIN else SAC(sacCnnPolicy,
                                                                  train_env,
                                                                  policy_kwargs=offpolicy_kwargs,
                                                                  tensorboard_log=filename+'/tb/',
                                                                  verbose=1
                                                                  )
    if ARGS.algo == 'td3':
        model = TD3(td3ddpgMlpPolicy,
Example #13
def using_callback_example():
    # Using Callback: Monitoring Training.

    class SaveOnBestTrainingRewardCallback(BaseCallback):
        """
		Callback for saving a model (the check is done every 'check_freq' steps)
		based on the training reward (in practice, we recommend using 'EvalCallback').

		:param check_freq:
		:param log_dir: Path to the folder where the model will be saved. It must contains the file created by the 'Monitor' wrapper.
		:param verbose: Verbosity level.
		"""
        def __init__(self, check_freq: int, log_dir: str, verbose: int = 1):
            super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
            self.check_freq = check_freq
            self.log_dir = log_dir
            self.save_path = os.path.join(log_dir, "best_model")
            self.best_mean_reward = -np.inf

        def _init_callback(self) -> None:
            # Create folder if needed.
            if self.save_path is not None:
                os.makedirs(self.save_path, exist_ok=True)

        def _on_step(self) -> bool:
            if self.n_calls % self.check_freq == 0:
                # Retrieve training reward.
                x, y = ts2xy(load_results(self.log_dir), "timesteps")
                if len(x) > 0:
                    # Mean training reward over the last 100 episodes.
                    mean_reward = np.mean(y[-100:])
                    if self.verbose > 0:
                        print(f"Num timesteps: {self.num_timesteps}")
                        print(
                            f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}"
                        )

                    # New best model, you could save the agent here.
                    if mean_reward > self.best_mean_reward:
                        self.best_mean_reward = mean_reward
                        # Example for saving best model.
                        if self.verbose > 0:
                            print(f"Saving new best model to {self.save_path}")
                        self.model.save(self.save_path)

            return True

    # Create log dir.
    log_dir = "tmp/"
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment.
    env = gym.make("LunarLanderContinuous-v2")
    env = Monitor(env, log_dir)

    # Add some action noise for exploration.
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    # TD3 learns a deterministic policy, so the Gaussian action noise above provides exploration.
    model = TD3("MlpPolicy", env, action_noise=action_noise, verbose=0)
    # Create the callback: check every 1000 steps.
    callback = SaveOnBestTrainingRewardCallback(check_freq=1000,
                                                log_dir=log_dir)
    # Train the agent.
    timesteps = 1e5
    model.learn(total_timesteps=int(timesteps), callback=callback)

    plot_results([log_dir], timesteps, results_plotter.X_TIMESTEPS,
                 "TD3 LunarLander")
    plt.show()
Example #14
from stable_baselines3.td3.policies import MlpPolicy
from TD3_torch import TD3
from Config import Config
arg = Config()
import numpy as np
from numpy import pi
import time
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from reward_functions import reward_singleff
from FireflyEnv import ffacc_real

action_noise = NormalActionNoise(mean=0., sigma=float(0.3))
arg.init_action_noise = 0.5
arg.goal_distance_range = [0.3, 1]
arg.mag_action_cost_range = [0.1, 1.]
arg.dev_action_cost_range = [0.1, 1.]
arg.dev_v_cost_range = [0.1, 1.]
arg.dev_w_cost_range = [0.1, 1.]
# arg.goal_distance_range=[0.01,0.99]
arg.gains_range = [0.35, 0.45, pi / 2 - 0.1, pi / 2 + 0.1]
# arg.goal_radius_range=[0.07,0.2]
arg.std_range = [0.01, 0.07, 0.01, 0.07]
# arg.mag_action_cost_range= [0.0001,0.0005]
# arg.dev_action_cost_range= [0.0001,0.0005]
arg.reward_amount = 100
arg.terminal_vel = 0.05
arg.dt = 0.1
arg.episode_len = 100
arg.training = True
arg.presist_phi = False
arg.agent_knows_phi = True
def train(experiment_name: str = typer.Option(...),
          total_timesteps: int = 3000000,
          input_path: Optional[str] = None,
          agent_type: SingleOrMultiAgent = SingleOrMultiAgent.single_agent,
          env_seed: int = random.randint(0, int(1e6)),
          environment_port: int = 5005,
          device: str = 'cuda',
          gamma: float = 0.99,
          learning_rate: float = 5e-5,
          policy_layers_comma_sep: str = '128,128,128',
          value_layers_comma_sep: str = '128,128,128',
          eval_freq: int = 100000,
          n_eval_episodes: int = 40,
          rl_algorithm: RLAlgorithm = RLAlgorithm.ppo,
          n_envs: Optional[int] = None,
          batch_size: Optional[int] = None,
          n_steps: Optional[int] = None,
          ppo_target_kl: Optional[float] = 0.1,
          ppo_a2c_gae_lambda: float = 0.95,
          ppo_n_epochs: int = 10,
          ppo_clip_range: float = 0.2,
          log_std_init: Optional[float] = None,
          ppo_a2c_ortho_init: Optional[bool] = None,
          td3_sac_buffer_size: Optional[int] = None,
          sac_tau: Optional[float] = None,
          sac_train_freq: Optional[int] = None,
          td3_sac_gradient_steps: Optional[int] = None,
          td3_sac_learning_starts: Optional[int] = None,
          td3_noise_type: Optional[str] = None,
          td3_noise_std: Optional[float] = None,
          use_sde: Optional[bool] = None,
          sde_sample_freq: Optional[int] = None,
          normalize=False,
          normalize_advantage: Optional[bool] = None,
          use_rms_prop: Optional[bool] = None,
          activation_function: Optional[str] = None):
    """Train an agent in the reacher environment.

    Args:
        experiment_name: the name of the experiment which will be used to create a directory under 'experiments' and
            store there all training artifacts along with the final and best models
        total_timesteps: the number of timesteps to run before stopping training
        input_path: if provided, the model at that path is loaded - this is used to continue a previous training run
        agent_type: specifies whether to use the environment with one agent or the environment with 20 agents
        env_seed: a seed for the environment random initialization - if not set, defaults to random
        environment_port: this is the port used by the unity environment to communicate with the C# backend. One needs
            to set different ports to different environments which run in parallel.
        device: the device used to train the model, can be 'cpu' or 'cuda:x'
        gamma: the discount rate applied to future actions
        learning_rate: the learning rate used by the policy and value network optimizer
        ppo_target_kl: an upper limit on the target KL divergence. This somewhat goes against PPO's aim of reducing the
            number of hyper-parameters, but it can still be useful since the agents can experience catastrophic
            forgetting if this value becomes too high. The idea is to use it as a safeguard rather than a tunable
            hyper-parameter.
        policy_layers_comma_sep: a sequence of layer width for the policy network as a comma-separated list
        value_layers_comma_sep: a sequence of layer width for the value network as a comma-separated list
        eval_freq: the number of steps after which a validation round will take place. Whenever there is an improvement,
            the best model will be saved under the 'eval' directory in the experiment. Available only for the single
            agent environment.
        n_eval_episodes: number of episodes run during evaluation, available only for the single agent environment
        rl_algorithm: the algorithm used to train an agent
        n_envs: the number of agents used during training. This is applicable only in multi agent training and the
            maximum number of agents is 20. In fact all 20 agents of the unity environment will be active but only
            the first 'n_envs' will take active part in training.
        batch_size: the batch size used during training
        n_steps: number of steps run during rollout
    """
    experiment_path = EXPERIMENTS_DIR / experiment_name
    model_path = experiment_path / 'model'
    eval_path = experiment_path / 'eval'
    tensorboard_log_path = experiment_path / 'tensorboard_logs'
    for path in [experiment_path, eval_path, tensorboard_log_path]:
        path.mkdir(exist_ok=True, parents=True)

    env = create_environment(agent_type=agent_type,
                             normalize=normalize,
                             n_envs=n_envs,
                             env_seed=env_seed,
                             environment_port=environment_port,
                             training_mode=True)

    algorithm_class, policy = algorithm_and_policy[rl_algorithm]

    if input_path:
        model = algorithm_class.load(input_path, env=env)
    else:
        policy_layers = [
            int(layer_width)
            for layer_width in policy_layers_comma_sep.split(',')
        ]
        value_layers = [
            int(layer_width)
            for layer_width in value_layers_comma_sep.split(',')
        ]

        net_arch = (policy_layers if rl_algorithm in [
            RLAlgorithm.td3, RLAlgorithm.sac
        ] else [dict(vf=value_layers, pi=policy_layers)])

        policy_kwargs = remove_none_entries(
            dict(activation_fn=nn.ReLU
                 if activation_function == 'ReLU' else None,
                 net_arch=net_arch,
                 log_std_init=log_std_init,
                 ortho_init=ppo_a2c_ortho_init))

        if rl_algorithm == RLAlgorithm.ppo:
            algorithm_specific_parameters = dict(target_kl=ppo_target_kl,
                                                 gae_lambda=ppo_a2c_gae_lambda,
                                                 n_epochs=ppo_n_epochs,
                                                 clip_range=ppo_clip_range)
        elif rl_algorithm == RLAlgorithm.a2c:
            algorithm_specific_parameters = dict(
                normalize_advantage=normalize_advantage,
                use_rms_prop=use_rms_prop)
        elif rl_algorithm == RLAlgorithm.sac:
            algorithm_specific_parameters = dict(
                buffer_size=td3_sac_buffer_size,
                tau=sac_tau,
                train_freq=sac_train_freq,
                gradient_steps=td3_sac_gradient_steps,
                learning_starts=td3_sac_learning_starts)
        elif rl_algorithm == RLAlgorithm.td3:
            action_shape = (env.num_envs, env.action_space.shape[0])
            action_noise = (NormalActionNoise(
                np.zeros(action_shape, dtype=np.float32),
                td3_noise_std * np.ones(action_shape, dtype=np.float32))
                            if td3_noise_type == 'normal' else None)
            algorithm_specific_parameters = remove_none_entries(
                dict(buffer_size=td3_sac_buffer_size,
                     gradient_steps=td3_sac_gradient_steps,
                     learning_starts=td3_sac_learning_starts,
                     action_noise=action_noise))
        else:
            algorithm_specific_parameters = dict()

        model_optional_parameters = remove_none_entries(
            dict(batch_size=batch_size,
                 n_steps=n_steps,
                 use_sde=use_sde,
                 sde_sample_freq=sde_sample_freq))

        model = algorithm_class(
            policy,
            env,
            verbose=1,
            tensorboard_log=str(tensorboard_log_path),
            device=device,
            gamma=gamma,
            policy_kwargs=policy_kwargs,
            learning_rate=learning_rate,
            **model_optional_parameters,
            **remove_none_entries(algorithm_specific_parameters))

    eval_callback = ReacherEvaluationCallback(eval_env=env,
                                              eval_freq=eval_freq,
                                              n_eval_episodes=n_eval_episodes,
                                              n_agents=n_envs if n_envs else 1,
                                              eval_path=eval_path,
                                              normalization=normalize)

    model.learn(total_timesteps=total_timesteps, callback=[eval_callback])

    model.save(str(model_path))
    model.get_vec_normalize_env().save(str(model_path / 'vecnormalize.pkl'))
Example #16
def sample_td3_params(trial):
    """
    Sampler for TD3 hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical(
        'gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size',
                                           [16, 32, 64, 100, 128, 256, 512])
    buffer_size = trial.suggest_categorical(
        'buffer_size', [int(1e4), int(1e5), int(1e6)])

    episodic = trial.suggest_categorical('episodic', [True, False])

    if episodic:
        n_episodes_rollout = 1
        train_freq, gradient_steps = -1, -1
    else:
        train_freq = trial.suggest_categorical('train_freq',
                                               [1, 16, 128, 256, 1000, 2000])
        gradient_steps = train_freq
        n_episodes_rollout = -1

    noise_type = trial.suggest_categorical(
        'noise_type', ['ornstein-uhlenbeck', 'normal', None])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)

    net_arch = trial.suggest_categorical('net_arch',
                                         ["small", "medium", "big"])
    # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])

    net_arch = {
        'small': [64, 64],
        'medium': [256, 256],
        'big': [400, 300],
    }[net_arch]

    hyperparams = {
        'gamma': gamma,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'buffer_size': buffer_size,
        'train_freq': train_freq,
        'gradient_steps': gradient_steps,
        'n_episodes_rollout': n_episodes_rollout,
        'policy_kwargs': dict(net_arch=net_arch),
    }

    if noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))

    return hyperparams
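A sketch of how a sampler like sample_td3_params might be wired into an Optuna study, rl-zoo style. The n_actions attribute is attached to the trial by hand (plain Optuna trials have no such field), Pendulum-v0 and the step budget are arbitrary, and passing 'n_episodes_rollout' assumes an SB3 release old enough to still accept that keyword:

import gym
import optuna
from stable_baselines3 import TD3
from stable_baselines3.common.evaluation import evaluate_policy

def objective(trial: optuna.Trial) -> float:
    env = gym.make('Pendulum-v0')
    # The sampler reads trial.n_actions, so attach it before sampling
    trial.n_actions = env.action_space.shape[-1]
    hyperparams = sample_td3_params(trial)
    model = TD3('MlpPolicy', env, **hyperparams)
    model.learn(total_timesteps=10000)
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
    return mean_reward

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)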
Example #17
def sample_td3_params(trial: optuna.Trial,
                      octree_observations: bool = True,
                      octree_depth: int = 4,
                      octree_full_depth: int = 2,
                      octree_channels_in: int = 7,
                      octree_fast_conv: bool = True,
                      octree_batch_norm: bool = True) -> Dict[str, Any]:
    """
    Sampler for TD3 hyperparameters
    """

    buffer_size = 150000
    # learning_starts = trial.suggest_categorical(
    #     "learning_starts", [5000, 10000, 20000])
    learning_starts = 5000

    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])
    learning_rate = trial.suggest_float(
        "learning_rate", low=0.000001, high=0.001, log=True)

    gamma = trial.suggest_float("gamma", low=0.98, high=1.0, log=True)
    tau = trial.suggest_float("tau", low=0.001, high=0.025, log=True)

    target_policy_noise = trial.suggest_float(
        "target_policy_noise", low=0.0, high=0.5, log=True)
    target_noise_clip = 0.5

    noise_std = trial.suggest_float("noise_std", low=0.025, high=0.5, log=True)
    action_noise = NormalActionNoise(mean=np.zeros(trial.n_actions),
                                     sigma=np.ones(trial.n_actions)*noise_std)

    train_freq = 1
    gradient_steps = trial.suggest_categorical("gradient_steps", [1, 2])

    policy_kwargs = dict()
    net_arch = trial.suggest_categorical("net_arch", ["small [256, 128]",
                                                      "medium [384, 256]",
                                                      "big [512, 384]"])
    policy_kwargs["net_arch"] = {"small [256, 128]": [256, 128],
                                 "medium [384, 256]": [384, 256],
                                 "big [512, 384]": [512, 384]}[net_arch]
    if octree_observations:
        features_extractor_kwargs = dict()

        features_extractor_kwargs["depth"] = octree_depth
        features_extractor_kwargs["full_depth"] = octree_full_depth
        features_extractor_kwargs["channels_in"] = octree_channels_in

        features_extractor_kwargs["channel_multiplier"] = \
            trial.suggest_categorical("channel_multiplier", [8, 16, 32, 64])

        features_extractor_kwargs["features_dim"] = \
            trial.suggest_categorical("features_dim", [256, 512, 768])

        features_extractor_kwargs["fast_conv"] = octree_fast_conv
        features_extractor_kwargs["batch_normalization"] = octree_batch_norm

        policy_kwargs["features_extractor_kwargs"] = features_extractor_kwargs

    return {
        "buffer_size": buffer_size,
        "learning_starts": learning_starts,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "gamma": gamma,
        "tau": tau,
        "target_policy_noise": target_policy_noise,
        "target_noise_clip": target_noise_clip,
        "action_noise": action_noise,
        "train_freq": train_freq,
        "gradient_steps": gradient_steps,
        "policy_kwargs": policy_kwargs,
    }
Example #18
def sample_tqc_params(trial: optuna.Trial,
                      octree_observations: bool = True,
                      octree_depth: int = 4,
                      octree_full_depth: int = 2,
                      octree_channels_in: int = 7) -> Dict[str, Any]:
    """
    Sampler for TQC hyperparameters
    """

    buffer_size = 25000
    learning_starts = 0

    batch_size = 32
    learning_rate = trial.suggest_float("learning_rate",
                                        low=0.000025, high=0.00075, log=True)

    gamma = 1.0 - trial.suggest_float("gamma",
                                      low=0.0001, high=0.025, log=True)
    tau = trial.suggest_float("tau", low=0.0005, high=0.025, log=True)

    ent_coef = "auto_0.1_0.05"
    target_entropy = "auto"

    noise_std = trial.suggest_float("noise_std", low=0.01, high=0.1, log=True)
    action_noise = NormalActionNoise(mean=np.zeros(trial.n_actions),
                                     sigma=np.ones(trial.n_actions)*noise_std)

    train_freq = 1
    gradient_steps = trial.suggest_categorical("gradient_steps", [1, 2])

    policy_kwargs = dict()
    net_arch = trial.suggest_categorical("net_arch", [128, 256, 384, 512])
    policy_kwargs["net_arch"] = [net_arch] * 2
    policy_kwargs["n_quantiles"] = trial.suggest_int("n_quantiles",
                                                     low=20, high=40)
    top_quantiles_to_drop_per_net = round(0.08*policy_kwargs["n_quantiles"])
    policy_kwargs["n_critics"] = trial.suggest_categorical("n_critics", [2, 3])

    if octree_observations:
        features_extractor_kwargs = dict()

        features_extractor_kwargs["depth"] = octree_depth
        features_extractor_kwargs["full_depth"] = octree_full_depth
        features_extractor_kwargs["channels_in"] = octree_channels_in

        features_extractor_kwargs["channel_multiplier"] = \
            trial.suggest_categorical("channel_multiplier", [8, 16, 32])

        features_extractor_kwargs["full_depth_channels"] = \
            trial.suggest_categorical("full_depth_channels", [4, 8, 16])

        features_extractor_kwargs["features_dim"] = \
            trial.suggest_categorical("features_dim", [64, 128, 256])

        features_extractor_kwargs["batch_normalization"] = trial.suggest_categorical("batch_normalization",
                                                                                     [True, False])

        policy_kwargs["features_extractor_kwargs"] = features_extractor_kwargs

    return {
        "buffer_size": buffer_size,
        "learning_starts": learning_starts,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "gamma": gamma,
        "tau": tau,
        "ent_coef": ent_coef,
        "target_entropy": target_entropy,
        "top_quantiles_to_drop_per_net": top_quantiles_to_drop_per_net,
        "action_noise": action_noise,
        "train_freq": train_freq,
        "gradient_steps": gradient_steps,
        "policy_kwargs": policy_kwargs,
    }
Example #19
def sample_ddpg_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for DDPG hyperparams.

    :param trial:
    :return:
    """
    gamma = trial.suggest_categorical(
        "gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    batch_size = trial.suggest_categorical(
        "batch_size", [16, 32, 64, 100, 128, 256, 512, 1024, 2048])
    buffer_size = trial.suggest_categorical(
        "buffer_size", [int(1e4), int(1e5), int(1e6)])
    # Polyak coeff
    tau = trial.suggest_categorical("tau",
                                    [0.001, 0.005, 0.01, 0.02, 0.05, 0.08])

    train_freq = trial.suggest_categorical(
        "train_freq", [1, 4, 8, 16, 32, 64, 128, 256, 512])
    gradient_steps = train_freq

    noise_type = trial.suggest_categorical(
        "noise_type", ["ornstein-uhlenbeck", "normal", None])
    noise_std = trial.suggest_uniform("noise_std", 0, 1)

    # NOTE: Add "verybig" to net_arch when tuning HER (see TD3)
    net_arch = trial.suggest_categorical("net_arch",
                                         ["small", "medium", "big"])
    # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])

    net_arch = {
        "small": [64, 64],
        "medium": [256, 256],
        "big": [400, 300],
    }[net_arch]

    hyperparams = {
        "gamma": gamma,
        "tau": tau,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "buffer_size": buffer_size,
        "train_freq": train_freq,
        "gradient_steps": gradient_steps,
        "policy_kwargs": dict(net_arch=net_arch),
    }

    if noise_type == "normal":
        hyperparams["action_noise"] = NormalActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == "ornstein-uhlenbeck":
        hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))

    if trial.using_her_replay_buffer:
        hyperparams = sample_her_params(trial, hyperparams)

    return hyperparams
Example #20
def sample_ddpg_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for DDPG hyperparams.

    :param trial:
    :return:
    """
    gamma = trial.suggest_categorical(
        "gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("lr", 1e-5, 1)
    batch_size = trial.suggest_categorical("batch_size",
                                           [16, 32, 64, 100, 128, 256, 512])
    buffer_size = trial.suggest_categorical(
        "buffer_size", [int(1e4), int(1e5), int(1e6)])
    # Polyak coeff
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02])

    episodic = trial.suggest_categorical("episodic", [True, False])

    if episodic:
        n_episodes_rollout = 1
        train_freq, gradient_steps = -1, -1
    else:
        train_freq = trial.suggest_categorical("train_freq",
                                               [1, 16, 128, 256, 1000, 2000])
        gradient_steps = train_freq
        n_episodes_rollout = -1

    noise_type = trial.suggest_categorical(
        "noise_type", ["ornstein-uhlenbeck", "normal", None])
    noise_std = trial.suggest_uniform("noise_std", 0, 1)

    net_arch = trial.suggest_categorical("net_arch",
                                         ["small", "medium", "big"])
    # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])

    net_arch = {
        "small": [64, 64],
        "medium": [256, 256],
        "big": [400, 300],
    }[net_arch]

    hyperparams = {
        "gamma": gamma,
        "tau": tau,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "buffer_size": buffer_size,
        "train_freq": train_freq,
        "gradient_steps": gradient_steps,
        "n_episodes_rollout": n_episodes_rollout,
        "policy_kwargs": dict(net_arch=net_arch),
    }

    if noise_type == "normal":
        hyperparams["action_noise"] = NormalActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == "ornstein-uhlenbeck":
        hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))

    return hyperparams
Example #21
        # print(self.model)
        return True


if __name__ == '__main__':
    # Instantiate Environment
    env_id = 'gym_spm:spm-v0'
    env = gym.make('gym_spm:spm-v0')

    # HyperParameters
    lr = 3e-4

    # Instantiate Model
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=-30 * np.zeros(n_actions),
                                     sigma=.75 * np.ones(n_actions))
    model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=1)
    # model = PPO('MlpPolicy', env, tensorboard_log=log_dir)

    # Train OR Load Model
    model.learn(total_timesteps=25000)

    # model.save(model_dir_description)

    mean_reward, std_reward = evaluate_policy(model,
                                              model.get_env(),
                                              n_eval_episodes=10)

    print("Mean Reward = ", mean_reward)

    epsi_sp_list = []
Example #22
import numpy as np
import pytest

from stable_baselines3 import A2C, PPO, SAC, TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

normal_action_noise = NormalActionNoise(np.zeros(1), 0.1 * np.ones(1))


@pytest.mark.parametrize('action_noise', [normal_action_noise, OrnsteinUhlenbeckActionNoise(np.zeros(1), 0.1 * np.ones(1))])
def test_td3(action_noise):
    model = TD3('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]),
                learning_starts=100, verbose=1, create_eval_env=True, action_noise=action_noise)
    model.learn(total_timesteps=1000, eval_freq=500)


@pytest.mark.parametrize("env_id", ['CartPole-v1', 'Pendulum-v0'])
def test_a2c(env_id):
    model = A2C('MlpPolicy', env_id, seed=0, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True)
    model.learn(total_timesteps=1000, eval_freq=500)


@pytest.mark.parametrize("env_id", ['CartPole-v1', 'Pendulum-v0'])
@pytest.mark.parametrize("clip_range_vf", [None, 0.2, -0.2])
def test_ppo(env_id, clip_range_vf):
    if clip_range_vf is not None and clip_range_vf < 0:
        # Should throw an error
        with pytest.raises(AssertionError):
            model = PPO('MlpPolicy', env_id, seed=0, policy_kwargs=dict(net_arch=[16]), verbose=1, create_eval_env=True,
                        clip_range_vf=clip_range_vf)
    else:
Example #23
    test_model(env, model, test_name)

for i in range(n_tests):
    test_name = 'saved_models/ddpg_soccer_actions_env_2_' + str(i)
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.3) * np.ones(n_actions))
    policy_kwargs = dict(net_arch=[400, 300])
    model = DDPG('MlpPolicy', env, action_noise=action_noise, policy_kwargs=policy_kwargs)
    model.learn(total_timesteps=10000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

for i in range(n_tests):
    test_name = 'saved_models/ddpg_soccer_actions_env_3_' + str(i)
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=float(0.3) * np.ones(n_actions))
    model = DDPG('MlpPolicy', env, action_noise=action_noise)
    model.learn(total_timesteps=10000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

# PPO algorithm
for i in range(n_tests):
    test_name = 'saved_models/ppo_soccer_actions_env_1_' + str(i)
    n_actions = env.action_space.shape[-1]
    model = PPO('MlpPolicy', env)
    model.learn(total_timesteps=10000, log_interval=1000)
    model.save(test_name)
    test_model(env, model, test_name)

# SAC algorithm
def train(experiment_name: str = typer.Option(...),
          total_timesteps: int = int(5e5),
          env_seed: int = random.randint(0, int(1e6)),
          port: int = 6005,
          device: str = 'cuda',
          gamma: float = 0.98,
          learning_rate: float = 7.3e-4,
          layers_comma_sep: str = '400,300',
          eval_freq: int = 100000,
          n_eval_episodes: int = 5,
          rl_algorithm: RLAlgorithm = RLAlgorithm.sac,
          batch_size: int = 256,
          buffer_size: int = 300000,
          gradient_steps: int = 64,
          learning_starts: int = 10000,
          sac_tau: float = 0.02,
          sac_train_freq: int = 64,
          td3_noise_type: Optional[str] = None,
          td3_noise_std: Optional[float] = None):
    """Train two agent in the tennis environment. Training is using single agent algorithms to train both agents with
    the union of their observations.

    Args:
        experiment_name: the name of the experiment which will be used to create a directory under 'experiments' and
            store there all training artifacts along with the final and best models
        total_timesteps: the number of timesteps to run before stopping training
        env_seed: a seed for the environment random initialization - if not set, defaults to random
        port: this is the port used by the unity environment to communicate with the C# backend. One needs
            to set different ports to different environments which run in parallel.
        device: the device used to train the model, can be 'cpu' or 'cuda:x'
        gamma: the discount rate applied to future actions
        learning_rate: the learning rate used by the policy and value network optimizer
        layers_comma_sep: a sequence of layer width for the networks as a comma-separated list
        eval_freq: the number of steps after which a validation round will take place. Whenever there is an improvement,
            the best model will be saved under the 'eval' directory in the experiment. Available only for the single
            agent environment.
        n_eval_episodes: number of episodes run during evaluation, available only for the single agent environment
        rl_algorithm: the algorithm used to train an agent
        batch_size: the batch size used during training
    """
    experiment_path = EXPERIMENTS_DIR / experiment_name
    model_path = experiment_path / 'model'
    eval_path = experiment_path / 'eval'
    tensorboard_log_path = experiment_path / 'tensorboard_logs'
    for path in [experiment_path, eval_path, tensorboard_log_path]:
        path.mkdir(exist_ok=True, parents=True)

    environment_parameters = dict(seed=env_seed,
                                  no_graphics=True,
                                  train_mode=True,
                                  port=port)

    env = UnityEnvironmentWrapperToGym(**environment_parameters)

    algorithm_class, policy = algorithm_and_policy[rl_algorithm]

    layers = [int(layer_width) for layer_width in layers_comma_sep.split(',')]

    policy_kwargs = remove_none_entries(
        dict(activation_fn=nn.ReLU, net_arch=layers))

    if rl_algorithm == RLAlgorithm.sac:
        algorithm_specific_parameters = dict(buffer_size=buffer_size,
                                             tau=sac_tau,
                                             train_freq=sac_train_freq,
                                             gradient_steps=gradient_steps,
                                             learning_starts=learning_starts)
    elif rl_algorithm == RLAlgorithm.td3:
        action_shape = (env.num_envs, env.action_space.shape[0])
        action_noise = (NormalActionNoise(
            np.zeros(action_shape, dtype=np.float32),
            td3_noise_std * np.ones(action_shape, dtype=np.float32))
                        if td3_noise_type == 'normal' else None)
        algorithm_specific_parameters = remove_none_entries(
            dict(buffer_size=buffer_size,
                 gradient_steps=gradient_steps,
                 learning_starts=learning_starts,
                 action_noise=action_noise))
    else:
        raise ValueError(f'Unknown algorithm: {rl_algorithm}')

    model = algorithm_class(
        policy,
        env,
        verbose=1,
        tensorboard_log=str(tensorboard_log_path),
        device=device,
        gamma=gamma,
        policy_kwargs=policy_kwargs,
        learning_rate=learning_rate,
        batch_size=batch_size,
        **remove_none_entries(algorithm_specific_parameters))

    model.learn(total_timesteps=total_timesteps,
                eval_env=env,
                eval_freq=eval_freq,
                n_eval_episodes=n_eval_episodes,
                eval_log_path=str(eval_path))

    model.save(str(model_path))
Example #25
                 ] and hyperparams.get('noise_type') is not None:
        noise_type = hyperparams['noise_type'].strip()
        noise_std = hyperparams['noise_std']
        n_actions = env.action_space.shape[0]
        if 'normal' in noise_type:
            if 'lin' in noise_type:
                final_sigma = hyperparams.get('noise_std_final',
                                              0.0) * np.ones(n_actions)
                hyperparams['action_noise'] = LinearNormalActionNoise(
                    mean=np.zeros(n_actions),
                    sigma=noise_std * np.ones(n_actions),
                    final_sigma=final_sigma,
                    max_steps=n_timesteps)
            else:
                hyperparams['action_noise'] = NormalActionNoise(
                    mean=np.zeros(n_actions),
                    sigma=noise_std * np.ones(n_actions))
        elif 'ornstein-uhlenbeck' in noise_type:
            hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions))
        else:
            raise RuntimeError(f'Unknown noise type "{noise_type}"')
        print(f"Applying {noise_type} noise with std {noise_std}")
        del hyperparams['noise_type']
        del hyperparams['noise_std']
        if 'noise_std_final' in hyperparams:
            del hyperparams['noise_std_final']

    if args.trained_agent.endswith('.zip') and os.path.isfile(
            args.trained_agent):
        # Continue training
def test_sac_phase():
    reward = []
    for i in [2000, 4000, 6000, 8000, 10000]:
        model = SAC("MlpPolicy",
                    "Pendulum-v0",
                    policy_kwargs=dict(net_arch=[64, 64]),
                    learning_starts=5000,
                    verbose=0,
                    create_eval_env=True,
                    buffer_size=i,
                    ent_coef=0,
                    action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)),
                    batch_size=32)
        env = model.env
        eval_callback = EvalCallback(env,
                                     best_model_save_path='./logs/',
                                     log_path='./logs/alpha5_phase',
                                     eval_freq=250,
                                     n_eval_episodes=100,
                                     deterministic=True,
                                     render=False)
        model.learn(total_timesteps=20000, callback=eval_callback)
        reward.append(eval_callback.last_mean_reward)
        definition = 200
        portrait = np.zeros((definition, definition))
        state_min = env.observation_space.low
        state_max = env.observation_space.high
        for index_t, t in enumerate(np.linspace(-np.pi, np.pi,
                                                num=definition)):
            for index_td, td in enumerate(
                    np.linspace(state_min[2], state_max[2], num=definition)):
                state = torch.Tensor([[np.cos(t), np.sin(t), td]])
                action = model.policy.forward(state)
                portrait[definition - (1 + index_td),
                         index_t] = model.critic.q1_forward(state, action)
        plt.figure(figsize=(10, 10))
        plt.imshow(portrait,
                   cmap="inferno",
                   extent=[-180, 180, state_min[2], state_max[2]],
                   aspect='auto')
        plt.rc('axes', titlesize=12)
        plt.xlabel('angle')
        plt.ylabel('velocity')
        plt.title(
            "critic, last mean reward = {:.2f} +/- {:.2f}, replay size = {}".
            format(reward[-1], eval_callback.last_std, i))
        plt.colorbar(label="critic value")
        plt.scatter([0], [0])
        plt.show()
        definition = 200
        portrait = np.zeros((definition, definition))
        state_min = env.observation_space.low
        state_max = env.observation_space.high
        portrait = np.zeros((definition, definition))
        for index_t, t in enumerate(np.linspace(-np.pi, np.pi,
                                                num=definition)):
            for index_td, td in enumerate(
                    np.linspace(state_min[2], state_max[2], num=definition)):
                state = torch.Tensor([[np.cos(t), np.sin(t), td]])
                probs = model.policy.forward(state)
                action = probs.data.numpy().astype(float)
                portrait[definition - (1 + index_td), index_t] = action
        plt.figure(figsize=(10, 10))
        plt.imshow(portrait,
                   cmap="coolwarm",
                   extent=[-180, 180, state_min[2], state_max[2]],
                   aspect='auto')
        plt.title(
            "action, last mean reward = {:.2f} +/- {:.2f}, replay size = {}".
            format(reward[-1], eval_callback.last_std, i))
        plt.colorbar(label="action")
        plt.rc('axes', titlesize=12)
        plt.xlabel('angle')
        plt.ylabel('velocity')
        plt.scatter([0], [0])
        plt.show()

    return reward
Example #27
from torchvision import transforms
import numpy as np
from model import BehaviorCloneNet, CarModel
from logloader import LogLoader
import time
from torchvision.transforms import Compose, ToTensor, Normalize
from custom_arch import CustomCNN, CustomActorCriticPolicy

env = make_vec_env(DeepwatchEnv2)

policy_kwargs = dict(features_extractor_class=CustomCNN)
#check_env(env)

n_actions = env.action_space.shape[-1]

action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.1 * np.ones(n_actions))

#model = TD3(CnnPolicy, env, action_noise=action_noise, buffer_size=50000, verbose=1) # optimize_memory_usage=True
#model = SAC(CnnPolicy, env, buffer_size=50000, action_noise=action_noise, learning_rate=0.0005, tensorboard_log='./tensorboard', verbose=1)
#model = SAC.load("deepwatch_evolution_sac_7", env)
model = A2C(MlpPolicy, env, verbose=1,
            n_steps=5)  #, policy_kwargs=policy_kwargs)
model.load("deepwatch_evolution_a2c_2")

for i in range(100):
    model.learn(total_timesteps=1000)
    model.save("deepwatch_evolution_a2c_3")
    print("Saved Checkpoint")

#model.learn(total_timesteps=10000)
#model.save("deepwatch_evolution")
    'noise_std': 0.513787888663763,
    'net_arch': 'medium'
}
policy_kwargs = dict(net_arch=[256, 256])  # medium
if hyper['episodic']:
    hyper['n_episodes_rollout'] = 1
    hyper['train_freq'], hyper['gradient_steps'] = -1, -1
else:
    hyper['train_freq'] = hyper['train_freq']
    hyper['gradient_steps'] = hyper['train_freq']
    hyper['n_episodes_rollout'] = -1

n_actions = env.action_space.shape[0]
if hyper["noise_type"] == "normal":
    hyper["action_noise"] = NormalActionNoise(mean=np.zeros(n_actions),
                                              sigma=hyper['noise_std'] *
                                              np.ones(n_actions))
elif hyper["noise_type"] == "ornstein-uhlenbeck":
    hyper["action_noise"] = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=hyper['noise_std'] * np.ones(n_actions))

model = DDPG('MlpPolicy',
             env,
             verbose=0,
             tensorboard_log=tensorboard_log,
             seed=seed,
             gamma=hyper['gamma'],
             learning_rate=hyper['lr'],
             batch_size=hyper['batch_size'],
             buffer_size=hyper['buffer_size'],
Example #29
def test_sac(ent_coef):
    model = SAC('MlpPolicy', 'Pendulum-v0', policy_kwargs=dict(net_arch=[64, 64]),
                learning_starts=100, verbose=1, create_eval_env=True, ent_coef=ent_coef,
                action_noise=NormalActionNoise(np.zeros(1), np.zeros(1)))
    model.learn(total_timesteps=1000, eval_freq=500)
    if is_ipython:
        display.clear_output(wait=True)
        display.display(plt.gcf())


env = PortfolioHedgingEnv(use_skew=False,
                          hedger_verbose=False,
                          corr=0.0,
                          instr_weight=0.5,
                          save_figs=True)
env.model_name = "sac_autohedger_portfolio_common_c_0_w_05"

policy_args = {"net_arch": [8000, 8000]}

reward_history = []
noise = NormalActionNoise(0, 50)
model = SAC(MlpPolicy,
            env,
            verbose=2,
            learning_rate=5e-6,
            target_update_interval=32,
            learning_starts=0,
            use_sde_at_warmup=True,
            use_sde=False,
            policy_kwargs=policy_args,
            buffer_size=int(10e6))
model.learn(total_timesteps=20000,
            log_interval=50,
            n_eval_episodes=100,
            callback=callback)
#model.save(env.model_name)