Code Example #1
def sample_td3_params(trial):
    """
    Sampler for TD3 hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 100, 128, 256, 512])
    buffer_size = trial.suggest_categorical('buffer_size', [int(1e4), int(1e5), int(1e6)])
    train_freq = trial.suggest_categorical('train_freq', [1, 10, 100, 1000, 2000])
    gradient_steps = train_freq
    noise_type = trial.suggest_categorical('noise_type', ['ornstein-uhlenbeck', 'normal'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)

    hyperparams = {
        'gamma': gamma,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'buffer_size': buffer_size,
        'train_freq': train_freq,
        'gradient_steps': gradient_steps,
    }

    if noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(trial.n_actions),
                                                        sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(trial.n_actions),
                                                                   sigma=noise_std * np.ones(trial.n_actions))

    return hyperparams
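
A minimal sketch of how this sampler could be plugged into an Optuna study, assuming the objective attaches `n_actions` to the trial (the sampler reads `trial.n_actions`) and that `np` and the noise classes are already imported where the sampler is defined; the environment, timestep budget, and `evaluate_policy` call are illustrative choices, not part of the original snippet.

import gym
import optuna
from stable_baselines import TD3
from stable_baselines.common.evaluation import evaluate_policy


def objective(trial):
    env = gym.make('Pendulum-v0')
    # sample_td3_params reads trial.n_actions, so attach it here.
    trial.n_actions = env.action_space.shape[0]
    model = TD3('MlpPolicy', env, verbose=0, **sample_td3_params(trial))
    model.learn(total_timesteps=20000)
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
    return mean_reward


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)
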
Code Example #2
def main(env):

    n_actions = env.action_space.shape[0]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))

    # Using only one expert trajectory;
    # specify `traj_limitation=-1` to use the whole dataset
    file_dir = "/home/vignesh/Thesis_Suture_data/trial2/ambf_data/"
    dataset = ExpertDataset(expert_path=file_dir + 'expert_psm_data.npz',
                            traj_limitation=1,
                            batch_size=32)

    model = DDPG(MlpPolicy,
                 env,
                 gamma=0.95,
                 verbose=1,
                 nb_train_steps=300,
                 nb_rollout_steps=150,
                 param_noise=param_noise,
                 batch_size=128,
                 action_noise=action_noise,
                 random_exploration=0.05,
                 normalize_observations=True,
                 tensorboard_log="./ddpg_dvrk_tensorboard/",
                 observation_range=(-1.5, 1.5))

    model.pretrain(dataset, n_epochs=1000)
    model.save("./gail_robot_env")
Code Example #3
File: models.py  Project: ioneliabuzatu/rl-benchmarks
def ddpg(env_id,
         timesteps,
         policy="MlpPolicy",
         log_interval=None,
         tensorboard_log=None,
         seed=None,
         load_weights=None):
    from stable_baselines import DDPG

    env = gym.make(env_id)

    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))

    if load_weights is not None:
        model = DDPG.load(load_weights, env=env)
    else:
        model = DDPG(policy,
                     env,
                     verbose=1,
                     param_noise=param_noise,
                     action_noise=action_noise,
                     tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="ddpg", env_name=env_id)

    model.learn(total_timesteps=timesteps,
                log_interval=log_interval,
                callback=callback)
    save_model_weights(model, "ddpg", env_id, policy, seed=seed, path=".")
Code Example #4
def main(env: PSMCartesianDDPGEnv):
    # the noise objects for DDPG
    n_actions = env.action.action_space.shape[0]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))

    model = DDPG(MlpPolicy,
                 env,
                 gamma=0.95,
                 verbose=1,
                 nb_train_steps=300,
                 nb_rollout_steps=150,
                 param_noise=param_noise,
                 batch_size=128,
                 action_noise=action_noise,
                 random_exploration=0.05,
                 normalize_observations=True,
                 tensorboard_log="./ddpg_dvrk_tensorboard/",
                 observation_range=(-1.5, 1.5),
                 critic_l2_reg=0.01)

    model.learn(total_timesteps=4000000,
                log_interval=100,
                callback=CheckpointCallback(
                    save_freq=100000, save_path="./ddpg_dvrk_tensorboard/"))
    model.save("./ddpg_robot_env")
Code Example #5
File: models.py  Project: wjjmjh/FinRL-Library
    def train_DDPG(self, model_name, model_params=config.DDPG_PARAMS):
        """DDPG model"""
        from stable_baselines import DDPG
        from stable_baselines.ddpg.policies import DDPGPolicy
        from stable_baselines.common.noise import OrnsteinUhlenbeckActionNoise

        env_train = self.env

        n_actions = env_train.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) *
                                                    np.ones(n_actions))

        start = time.time()
        model = DDPG('MlpPolicy',
                     env_train,
                     batch_size=model_params['batch_size'],
                     buffer_size=model_params['buffer_size'],
                     param_noise=param_noise,
                     action_noise=action_noise,
                     verbose=model_params['verbose'])
        model.learn(total_timesteps=model_params['timesteps'])
        end = time.time()

        model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
        print('Training time (DDPG): ', (end - start) / 60, ' minutes')
        return model
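
The snippet relies on `config.DDPG_PARAMS`; an illustrative dictionary with the four keys the method actually reads might look like the following (values are placeholders, not the project's defaults).

# Placeholder values; only these keys are read by train_DDPG above.
DDPG_PARAMS = {
    'batch_size': 128,
    'buffer_size': 50000,
    'verbose': 0,
    'timesteps': 10000,
}
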
Code Example #6
def sample_ddpg_params(trial):
    """
    Sampler for DDPG hyperparams.
    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    # actor_lr = trial.suggest_loguniform('actor_lr', 1e-5, 1)
    # critic_lr = trial.suggest_loguniform('critic_lr', 1e-5, 1)
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 100, 128, 256, 512])
    buffer_size = trial.suggest_categorical('memory_limit', [int(1e4), int(1e5), int(1e6)])
    noise_type = trial.suggest_categorical('noise_type', ['ornstein-uhlenbeck', 'normal'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)
    normalize_observations = trial.suggest_categorical('normalize_observations', [True, False])
    normalize_returns = trial.suggest_categorical('normalize_returns', [True, False])

    hyperparams = {
        'gamma': gamma,
        'actor_lr': learning_rate,
        'critic_lr': learning_rate,
        'batch_size': batch_size,
        'memory_limit': buffer_size,
        'normalize_observations': normalize_observations,
        'normalize_returns': normalize_returns
    }


    if noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(1),
                                                        sigma=noise_std * np.ones(1))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(1),
                                                                   sigma=noise_std * np.ones(1))
    return hyperparams
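
Because the returned keys mirror keyword arguments of the stable-baselines `DDPG` constructor (`actor_lr`, `critic_lr`, `memory_limit`, `action_noise`, and so on), the sampled dictionary can be unpacked directly; a minimal sketch, assuming a continuous-action gym environment and a `trial` supplied by an Optuna objective.

import gym
from stable_baselines import DDPG

env = gym.make('Pendulum-v0')
hyperparams = sample_ddpg_params(trial)   # trial comes from an Optuna objective
model = DDPG('MlpPolicy', env, verbose=0, **hyperparams)
model.learn(total_timesteps=20000)
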
Code Example #7
def main(
  training_env: PSMCartesianHERDDPGEnv,
  eval_env: PSMCartesianHERDDPGEnv = None,
  log_dir='./.logs/results'
):

  os.makedirs(log_dir, exist_ok=True)

  # training_env = Monitor(training_env, log_dir)

  n_actions = training_env.action_space.shape[0]
  noise_std = 0.2
  # Currently using OU noise
  action_noise = OrnsteinUhlenbeckActionNoise(
    mean=np.zeros(n_actions),
    sigma=noise_std * np.ones(n_actions)
  )
  model_class = DDPG  # works also with SAC and TD3

  rl_model_kwargs = {
    'actor_lr': 1e-3,
    'critic_lr': 1e-3,
    'action_noise': action_noise,
    'nb_train_steps': 300,
    'nb_rollout_steps': 100,
    'gamma': 0.95,
    'observation_range': (-1.5,
                          1.5),
    'random_exploration': 0.05,
    'normalize_observations': True,
    'critic_l2_reg': 0.01
  }

  # Available strategies (cf paper): future, final, episode, random
  model = HER(
    'MlpPolicy',
    training_env,
    model_class,
    verbose=1,
    n_sampled_goal=4,
    goal_selection_strategy='future',
    buffer_size=int(1e5),
    batch_size=128,
    tensorboard_log="./ddpg_dvrk_tensorboard/",
    **rl_model_kwargs
  )
  # Reset the model
  training_env.reset()
  # Create callbacks
  checkpoint_callback = CheckpointCallback(
    save_freq=100000,
    save_path="./ddpg_dvrk_tensorboard/"
  )  # save_path="./.model/model_checkpoint/") #save_freq=100000
  # eval_callback = EvalCallback(training_env, best_model_save_path='./ddpg_dvrk_tensorboard/best_model',
  #                             log_path=log_dir, eval_freq=500)
  callback = CallbackList([checkpoint_callback])  # , eval_callback])
  # Train the model
  model.learn(4000000, log_interval=100, callback=callback)
  model.save("./her_robot_env")
Code Example #8
def ddpg(env, seed):
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.1) *
                                                np.ones(n_actions))

    return DDPG('MlpPolicy',
                env,
                action_noise=action_noise,
                verbose=1,
                tensorboard_log="./data/runs",
                seed=seed)
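
A short usage sketch for this factory, assuming `env` is any continuous-action gym environment; the timestep budget and save path are illustrative.

import gym

env = gym.make('Pendulum-v0')
model = ddpg(env, seed=0)
model.learn(total_timesteps=100000)
model.save("./data/ddpg_pendulum")
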
Code Example #9
def DDPGAgent(multi_stock_env, num_episodes):
    models_folder = 'saved_models'
    rewards_folder = 'saved_rewards'

    env = DummyVecEnv([lambda: multi_stock_env])
    
    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    
    # Hyper parameters
    GAMMA = 0.99
    TAU = 0.001
    BATCH_SIZE = 16
    ACTOR_LEARNING_RATE = 0.0001
    CRITIC_LEARNING_RATE = 0.001
    BUFFER_SIZE = 500

    print("\nRunning DDPG Agent...\n")
    model = DDPG(MlpPolicy, env, 
                gamma = GAMMA, tau = TAU, batch_size = BATCH_SIZE,
                actor_lr = ACTOR_LEARNING_RATE, critic_lr = CRITIC_LEARNING_RATE,
                buffer_size = BUFFER_SIZE, verbose=1, 
                param_noise=param_noise, action_noise=action_noise)
    model.learn(total_timesteps=50000)
    model.save(f'{models_folder}/rl/ddpg.h5')

    del model
    
    model = DDPG.load(f'{models_folder}/rl/ddpg.h5')
    obs = env.reset()
    portfolio_value = []

    for e in range(num_episodes):
        action, _states = model.predict(obs)
        # feed the new observation back in for the next prediction
        obs, reward, done, info = env.step(action)
        print(f"episode: {e + 1}/{num_episodes}, episode end value: {info[0]['cur_val']:.2f}")
        portfolio_value.append(round(info[0]['cur_val'], 3))

    # save portfolio value for each episode
    np.save(f'{rewards_folder}/rl/ddpg.npy', portfolio_value)

    print("\nDDPG Agent run complete and saved!")

    a = np.load(f'./saved_rewards/rl/ddpg.npy')

    print(f"\nCumulative Portfolio Value Average reward: {a.mean():.2f}, Min: {a.min():.2f}, Max: {a.max():.2f}")
    plt.plot(a)
    plt.title("Portfolio Value Per Episode (DDPG)")
    plt.ylabel("Portfolio Value")
    plt.xlabel("Episodes")
    plt.show()
Code Example #10
def train_DDPG(env_train, model_name, timesteps=10000):
    """DDPG model"""
    # the noise objects for DDPG
    n_actions = env_train.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

    start = time.time()
    model = DDPG('MlpPolicy', env_train, param_noise=param_noise, action_noise=action_noise)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (DDPG): ', (end - start) / 60, ' minutes')
    return model
Code Example #11
def sample_ddpg_params(trial):
    """
    Sampler for DDPG hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical(
        'gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    # actor_lr = trial.suggest_loguniform('actor_lr', 1e-5, 1)
    # critic_lr = trial.suggest_loguniform('critic_lr', 1e-5, 1)
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size',
                                           [16, 32, 64, 128, 256])
    buffer_size = trial.suggest_categorical(
        'memory_limit', [int(1e4), int(1e5), int(1e6)])
    noise_type = trial.suggest_categorical(
        'noise_type', ['ornstein-uhlenbeck', 'normal', 'adaptive-param'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)
    normalize_observations = trial.suggest_categorical(
        'normalize_observations', [True, False])
    normalize_returns = trial.suggest_categorical('normalize_returns',
                                                  [True, False])

    hyperparams = {
        'gamma': gamma,
        'actor_lr': learning_rate,
        'critic_lr': learning_rate,
        'batch_size': batch_size,
        'memory_limit': buffer_size,
        'normalize_observations': normalize_observations,
        'normalize_returns': normalize_returns
    }

    if noise_type == 'adaptive-param':
        hyperparams['param_noise'] = AdaptiveParamNoiseSpec(
            initial_stddev=noise_std, desired_action_stddev=noise_std)
        # Apply layer normalization when using parameter perturbation
        hyperparams['policy_kwargs'] = dict(layer_norm=True)
    elif noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(trial.n_actions),
            sigma=noise_std * np.ones(trial.n_actions))
    return hyperparams
Code Example #12
File: train2.py  Project: 3neutronstar/flow_RL
def train_stable_baselines(submodule, flags):
    """Train policies using the PPO algorithm in stable-baselines."""
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines import DDPG
    flow_params = submodule.flow_params
    # Path to the saved files
    exp_tag = flow_params['exp_tag']
    result_name = '{}/{}'.format(exp_tag, strftime("%Y-%m-%d-%H:%M:%S"))

    # Perform training.
    print('Beginning training.')
    model = run_model_stablebaseline(
        flow_params, flags.num_cpus, flags.rollout_size, flags.num_steps)

    # Save the model to a desired folder and then delete it to demonstrate
    # loading.
    print('Saving the trained model!')
    path = os.path.realpath(os.path.expanduser('~/baseline_results'))
    ensure_dir(path)
    save_path = os.path.join(path, result_name)
    model.save(save_path)

    # dump the flow params
    with open(os.path.join(path, result_name) + '.json', 'w') as outfile:
        json.dump(flow_params, outfile,
                  cls=FlowParamsEncoder, sort_keys=True, indent=4)

    # Replay the result by loading the model
    print('Loading the trained model and testing it out!')
    model = DDPG.load(save_path)
    flow_params = get_flow_params(os.path.join(path, result_name) + '.json')
    flow_params['sim'].render = True
    env = env_constructor(params=flow_params, version=0)()
    
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

    # The algorithms require a vectorized environment to run
    eval_env = DummyVecEnv([lambda: env])
    obs = eval_env.reset()
    reward = 0
    for _ in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = eval_env.step(action)
        reward += rewards
    print('the final reward is {}'.format(reward))
Code Example #13
def main(env):

    n_actions = env.action_space.shape[0]
    noise_std = 0.2
    # Currently using OU noise
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=noise_std *
                                                np.ones(n_actions))
    model_class = DDPG  # works also with SAC and TD3

    rl_model_kwargs = {
        'actor_lr': 1e-3,
        'critic_lr': 1e-3,
        'action_noise': action_noise,
        'nb_train_steps': 300,
        'nb_rollout_steps': 100,
        'gamma': 0.95,
        'observation_range': (-1.5, 1.5),
        'random_exploration': 0.05,
        'normalize_observations': True,
        'critic_l2_reg': 0.01
    }

    # Available strategies (cf paper): future, final, episode, random
    model = HER('MlpPolicy',
                env,
                model_class,
                verbose=1,
                n_sampled_goal=4,
                goal_selection_strategy='future',
                buffer_size=int(1e5),
                batch_size=128,
                tensorboard_log="./ddpg_dvrk_tensorboard/",
                **rl_model_kwargs)
    # Reset the model
    env.reset()
    # Train the model
    model.learn(4000000,
                log_interval=100,
                callback=CheckpointCallback(
                    save_freq=100000, save_path="./ddpg_dvrk_tensorboard/"))
    model.save("./her_robot_env")
Code Example #14
File: ddpg.py  Project: thias15/CausalWorld
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, ddpg_config, total_time_steps,
                 validate_every_timesteps, task_name):
    print("Using MPI for multiprocessing with {} workers".format(
        MPI.COMM_WORLD.Get_size()))
    rank = MPI.COMM_WORLD.Get_rank()
    print("Worker rank: {}".format(rank))
    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=np.array(
                             [250, 0, 125, 0, 750, 0, 0, 0.005]),
                         fractional_reward_weight=1,
                         goal_height=0.15,
                         tool_block_mass=0.02)
    env = CausalWorld(task=task,
                      skip_frame=skip_frame,
                      enable_visualization=False,
                      seed=0,
                      max_episode_length=maximum_episode_length,
                      normalize_actions=False,
                      normalize_observations=False)
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))
    policy_kwargs = dict(layers=[256, 256])
    checkpoint_callback = CheckpointCallback(save_freq=int(
        validate_every_timesteps / num_of_envs),
                                             save_path=log_relative_path,
                                             name_prefix='model')
    model = DDPG(MlpPolicy,
                 env,
                 verbose=2,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 policy_kwargs=policy_kwargs,
                 **ddpg_config)
    model.learn(total_timesteps=total_time_steps,
                tb_log_name="ddpg",
                callback=checkpoint_callback)
    return
Code Example #15
def training(env):
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))

    model = DDPG(MlpPolicy,
                 env,
                 verbose=1,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 render=True,
                 return_range=[-1.0, 1.0],
                 observation_range=[-2.0, 2.0])
    model.learn(total_timesteps=40000)
    time = datetime.now().strftime("%m%d_%H%M%S")
    model.save("models\\ddpg_sbl_" + time)

    del model  # remove to demonstrate saving and loading
    testing(env, time)
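
The `testing` helper called at the end is not part of this snippet; a hypothetical version consistent with the call signature (it receives the env and the timestamp used in the save path) might look like this.

from stable_baselines import DDPG

def testing(env, time):
    # Hypothetical counterpart to training(): reload the model saved above and roll it out.
    model = DDPG.load("models\\ddpg_sbl_" + time)
    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        if done:
            obs = env.reset()
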
Code Example #16
    def __call__(self):

        policy_kwargs = dict(layers=[400, 300, 200, 100])
        n_actions = self.env.action_space.shape[-1]
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.1) * np.ones(n_actions))

        # check_env(self.env)
        model = DDPG(MlpPolicy,
                     self.env,
                     policy_kwargs=policy_kwargs,
                     action_noise=action_noise,
                     memory_limit=50000,
                     tensorboard_log="/home/dfki.uni-bremen.de/mpatil/Documents/baselines_log",
                     verbose=1)

        time_steps = 3e4
        model.learn(total_timesteps=int(time_steps),
                    log_interval=50,
                    tb_log_name="ddpg_Docker_" + self.expt_name)
        model.save("/home/dfki.uni-bremen.de/mpatil/Documents/ddpg_stable_baselines_" + self.expt_name)

        print("Closing environment")
        self.env.close()
Code Example #17
def init_ddpg(env_id,
              timesteps,
              policy="MlpPolicy",
              log_interval=None,
              tensorboard_log=None,
              seed=None):
    from stable_baselines import DDPG

    env = gym.make(env_id)

    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))

    model = DDPG(policy,
                 env,
                 verbose=1,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 tensorboard_log=tensorboard_log)

    return model
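
A usage sketch for `init_ddpg`; note that the `timesteps` argument is accepted but not used inside the function, so training still has to be triggered by the caller (environment name, step count, and paths are illustrative).

model = init_ddpg("Pendulum-v0", timesteps=50000, tensorboard_log="./ddpg_tb/")
model.learn(total_timesteps=50000, log_interval=10)
model.save("./ddpg_pendulum")
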
Code Example #18
def run(Model, Policy, gamma):
    env = gym.make('Stock-v0')
    env._init_data(train_data)

    if gamma != 0:
        n_actions = env.action_space.shape[-1]
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(gamma) * np.ones(n_actions))
        model = Model(Policy, env, verbose=1, action_noise=action_noise)
        model.gamma = 0.2
    else:
        model = Model(Policy, env, verbose=1)

    model.learn(total_timesteps=total_timesteps, log_interval=10)

    print("test model")
    env = gym.make('TestStock-v0')
    env._init_data(test_data)
    obs = env.reset()

    for _ in range(686):
        action, _ = model.predict(obs)
        obs, _, _, _ = env.step(action)

    return env.asset_memory
Code Example #19
def DRL() -> None:
    ### PREPARATION
    # callback for validation
    eval_callback = EvalCallback(val_env,
                                 best_model_save_path=config.val_path,
                                 log_path=config.val_path,
                                 eval_freq=config.val_freq,
                                 deterministic=config.deterministic,
                                 n_eval_episodes=config.val_eps)

    ### SETUP AND TRAIN
    # Setup model
    if config.MODEL_NAME == "A2C":
        model = A2C(config.POLICY,
                    train_env,
                    verbose=1,
                    tensorboard_log=config.tb_path,
                    seed=config.seed)
    elif config.MODEL_NAME == "PPO":
        model = PPO2(config.POLICY,
                     train_env,
                     verbose=1,
                     tensorboard_log=config.tb_path,
                     nminibatches=1,
                     seed=config.seed)
    elif config.MODEL_NAME == "DDPG":
        # the noise objects for DDPG
        n_actions = train_env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) *
                                                    np.ones(n_actions))
        model = DDPG(config.POLICY,
                     train_env,
                     param_noise=param_noise,
                     action_noise=action_noise,
                     verbose=1,
                     tensorboard_log=config.tb_path,
                     seed=config.seed)
        print("DDPG does not provice training output...")

    ###
    # Train Model
    model = model.learn(total_timesteps=config.learn_steps,
                        callback=eval_callback)

    # Load best model after training
    if config.MODEL_NAME == "A2C":
        model = A2C.load(load_path=config.val_path.joinpath("best_model.zip"))
    elif config.MODEL_NAME == "PPO":
        model = PPO2.load(load_path=config.val_path.joinpath("best_model.zip"))
    elif config.MODEL_NAME == "DDPG":
        model = DDPG.load(load_path=config.val_path.joinpath("best_model.zip"))

    ### EVAL MODEL
    # Make prediction in test_env
    test_mean, test_std = evaluate_policy(model=model,
                                          env=test_env,
                                          deterministic=config.deterministic,
                                          n_eval_episodes=config.test_eps,
                                          return_episode_rewards=False)

    print(f"Test Mean:{test_mean}\n"+ \
          f"Test Std:{test_std}")
Code Example #20
            save_path = 'logs/agent_{}/models/'.format(args.agent_id)
            env = Monitor(env, 'logs/agent_{}/'.format(args.agent_id))    # logging monitor

            repo = git.Repo(search_parent_directories=False)
            commit_id = repo.head.object.hexsha
            with open('logs/agent_{}/reproduction_info.txt'.format(args.agent_id), 'w') as f:  # Use file to refer to the file object
                f.write('Git commit id: {}\n\n'.format(commit_id))
                f.write('Program arguments:\n\n{}'.format(args))
                f.close()
        else:
            save_path = '../logs/'
            env = Monitor(env, '../logs/')                                   # logging monitor
        model_dir = save_path + '{}_final_model'.format(args.alg)                                       # model save/load directory

        if args.alg == 'ddpg':
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                        sigma=args.action_noise * np.ones(n_actions))

            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(args.param_noise_stddev),
                                                 desired_action_stddev=float(args.param_noise_stddev))
            model = DDPG(DDPGPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise,
                         render=args.play)
        elif args.alg == 'ppo2':
            model = PPO2(CommonMlpPolicy, env, verbose=1)
        elif args.alg == 'trpo':
            model = TRPO(CommonMlpPolicy, env, verbose=1, model_dir=save_path)
        elif args.alg =='a2c':
            model = A2C(CommonMlpPolicy, env, verbose=1)
        else:
            print(args.alg)
            raise Exception('Algorithm name is not defined!')
Code Example #21
File: sbmain_new_cord.py  Project: yizhoucc/ffsb
import gym
from stable_baselines.ddpg.policies import MlpPolicy
from stable_baselines import A2C
from stable_baselines import DDPG
from FireflyEnv import ffenv_new_cord
from Config import Config
arg = Config()
import numpy as np
import time
import torch
from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(2),
                                            sigma=float(0.2) * np.ones(2))
arg.std_range = [0.0001, 0.001, 0.0001, 0.001]
env = ffenv_new_cord.FireflyEnv(arg)
model = DDPG(MlpPolicy,
             env,
             verbose=1,
             tensorboard_log="./DDPG_tb/",
             action_noise=action_noise,
             gamma=0.99,
             memory_policy=None,
             eval_env=None,
             nb_train_steps=50,
             nb_rollout_steps=100,
             nb_eval_steps=100,
             param_noise=None,
             normalize_observations=False,
             tau=0.001,
             batch_size=128,
Code Example #22
def run_process(study_name, alg_param, env_param, log_path='.'):
    study_path = os.path.join(log_path, study_name)
    make_sure_path_exists(study_path)
    trial_path, trial_id = generate_trial_path(study_path)
    make_sure_path_exists(trial_path)

    with open(trial_path + '/alg_param.pkl', "wb+") as outfile:
        pickle.dump(alg_param, outfile)

    with open(trial_path + '/env_param.pkl', "wb+") as outfile:
        pickle.dump(env_param, outfile)

    num_nodes = alg_param['num_nodes']
    num_layers = alg_param['num_layers']
    learning_rate = alg_param['learning_rate']
    alg = alg_param['alg']
    nenv = alg_param['nenv']
    env = build_env(trial_path, env_param, nenv=nenv)

    if alg == 'dqn':
        from stable_baselines.deepq.policies import MlpPolicy
        from stable_baselines import DQN
        call_iter = 1000
        policy_kwargs = dict(layers=[num_nodes for _ in range(num_layers)])
        model = DQN(MlpPolicy,
                    env,
                    verbose=1,
                    policy_kwargs=policy_kwargs,
                    tensorboard_log=trial_path)
    #DDPG calls back every step of every rollout
    elif alg == 'ddpg':
        from stable_baselines.ddpg.policies import MlpPolicy
        from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
        from stable_baselines import DDPG
        call_iter = 1000
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) *
                                                    np.ones(n_actions))
        policy_kwargs = dict(layers=[num_nodes for _ in range(num_layers)])
        model = DDPG(MlpPolicy,
                     env,
                     verbose=1,
                     param_noise=param_noise,
                     action_noise=action_noise,
                     policy_kwargs=policy_kwargs,
                     tensorboard_log=trial_path)

    elif alg == 'td3':
        from stable_baselines import TD3
        from stable_baselines.td3.policies import MlpPolicy
        from stable_baselines.common.vec_env import DummyVecEnv
        from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
        call_iter = 1000
        n_actions = env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        policy_kwargs = dict(layers=[num_nodes for _ in range(num_layers)])
        model = TD3(MlpPolicy,
                    env,
                    verbose=1,
                    action_noise=action_noise,
                    learning_rate=learning_rate,
                    policy_kwargs=policy_kwargs,
                    tensorboard_log=trial_path)

    #PPO1 calls back only after every rollout
    elif alg == 'ppo2':
        from stable_baselines.common.policies import MlpPolicy
        from stable_baselines import PPO2
        call_iter = 100
        policy_kwargs = dict(net_arch=[num_nodes for _ in range(num_layers)])
        model = PPO2(MlpPolicy,
                     env,
                     policy_kwargs=policy_kwargs,
                     verbose=1,
                     learning_rate=learning_rate,
                     tensorboard_log=trial_path,
                     n_steps=alg_param['n_steps'],
                     noptepochs=alg_param['noptepochs'],
                     nminibatches=alg_param['nminibatches'],
                     gamma=alg_param['gamma'],
                     ent_coef=alg_param['ent_coef'],
                     cliprange=alg_param['cliprange'],
                     lam=alg_param['lam'])

    best_mean_reward, n_steps = -np.inf, 0

    #callback frequency differs among algorithms
    def callback(_locals, _globals):
        from stable_baselines.results_plotter import load_results, ts2xy
        nonlocal n_steps, best_mean_reward, call_iter
        # Print stats every 1000 call
        if (n_steps + 1) % call_iter == 0:
            # Evaluate policy training performance
            x, y = ts2xy(load_results(trial_path), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-200:])
                print(x[-1], 'timesteps')
                print(
                    "Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}"
                    .format(best_mean_reward, mean_reward))
                # New best model, you could save the agent here
                if mean_reward > best_mean_reward:
                    best_mean_reward = mean_reward
                    # Example for saving best model
                    print("Saving new best model")
                    _locals['self'].save(trial_path + '/best_model.pkl')
        n_steps += 1
        return True

    # model= DDPG.load('log/A00/best_model.pkl')
    # model.set_env(env)
    print(f"Starting to train {trial_id}")
    model.learn(total_timesteps=int(1e6),
                tb_log_name='tb_log',
                callback=callback)

    model.save(trial_path + '/fully_trained_model')
Code Example #23
File: 2_train.py  Project: PierreExeter/RL_reacher
            hyperparams['param_noise'] = AdaptiveParamNoiseSpec(
                initial_stddev=noise_std, desired_action_stddev=noise_std)
        elif 'normal' in noise_type:
            if 'lin' in noise_type:
                hyperparams['action_noise'] = LinearNormalActionNoise(
                    mean=np.zeros(n_actions),
                    sigma=noise_std * np.ones(n_actions),
                    final_sigma=hyperparams.get('noise_std_final', 0.0) *
                    np.ones(n_actions),
                    max_steps=n_timesteps)
            else:
                hyperparams['action_noise'] = NormalActionNoise(
                    mean=np.zeros(n_actions),
                    sigma=noise_std * np.ones(n_actions))
        elif 'ornstein-uhlenbeck' in noise_type:
            hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions))
        else:
            raise RuntimeError('Unknown noise type "{}"'.format(noise_type))
        print("Applying {} noise with std {}".format(noise_type, noise_std))
        del hyperparams['noise_type']
        del hyperparams['noise_std']
        if 'noise_std_final' in hyperparams:
            del hyperparams['noise_std_final']

    if ALGOS[args.algo] is None:
        raise ValueError('{} requires MPI to be installed'.format(args.algo))

    if os.path.isfile(args.trained_agent):
        # Continue training
        print("Loading pretrained agent")
        # Policy should not be changed
Code Example #24
from stable_baselines import TRPO

import numpy as np

import math
import matplotlib.pyplot as plt

# # Create environment
env = gym.make('QuadGym-v0')
# env = gym.make('HalfCheetah-v2')
# the noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.ones(n_actions) * 0.15,
                                            sigma=float(0.1) *
                                            np.ones(n_actions))

model = DDPG(MlpPolicy2,
             env,
             verbose=1,
             param_noise=param_noise,
             action_noise=action_noise,
             render=False,
             buffer_size=1000000,
             random_exploration=0.0)
# model = DDPG(MlpPolicy2, env, gamma=0.99, memory_policy=None, nb_train_steps=500, nb_rollout_steps=50, nb_eval_steps=300, param_noise=None, action_noise=action_noise, normalize_observations=False, tau=0.002, batch_size=250, normalize_returns=False, enable_popart=False, observation_range=(-10.0,10.0), critic_l2_reg=0.0, actor_lr=0.0005, critic_lr=0.0005, clip_norm=None, render=False, render_eval=False, buffer_size=1000000, verbose=1, _init_setup_model=True)

model.learn(total_timesteps=1000000)
model.save("ddpg_quad")
qpos0_hist = np.ones((1, 49))
Code Example #25
            original_adr = currentPath + '/tools/cfgs/' + args.cfg_file.split(
                '/')[-1]
            target_adr = currentPath + '/logs/agent_{}/'.format(
                args.agent_id) + args.cfg_file.split('/')[-1]
            shutil.copyfile(original_adr, target_adr)

        else:
            save_path = 'logs/'
            env = Monitor(env, 'logs/',
                          info_keywords=('reserved', ))  # logging monitor
        model_dir = save_path + '{}_final_model'.format(
            cfg.POLICY.NAME)  # model save/load directory

        if cfg.POLICY.NAME == 'DDPG':
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions),
                sigma=float(cfg.POLICY.ACTION_NOISE) * np.ones(n_actions))

            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(cfg.POLICY.PARAM_NOISE_STD),
                desired_action_stddev=float(cfg.POLICY.PARAM_NOISE_STD))
            model = DDPG(policy[cfg.POLICY.NET],
                         env,
                         verbose=1,
                         param_noise=param_noise,
                         action_noise=action_noise,
                         policy_kwargs={
                             'cnn_extractor': eval(cfg.POLICY.CNN_EXTRACTOR)
                         })
        elif cfg.POLICY.NAME == 'PPO2':
            model = PPO2(policy[cfg.POLICY.NET],
Code Example #26
from rl_visualization.visualization_env import VisualizationEnv

if __name__ == '__main__':

    env = gym.make('MountainCarContinuous-v0')

    env = VisualizationEnv(
        env,
        steps_lookback=10000,
        refresh_time=30,
        features_names=['Car Position', 'Car Velocity'],
        actions_names=[
            'Push car to the left (negative value) or to the right (positive value)'
        ])

    model = SAC(MlpPolicy,
                env,
                verbose=1,
                action_noise=OrnsteinUhlenbeckActionNoise(mean=np.zeros(1),
                                                          sigma=0.5 *
                                                          np.ones(1)))
    model.learn(total_timesteps=60000)

    obs = env.reset()
    for i in range(100000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()

    env.close()
    env.join()
Code Example #27
def main():
    args = parse_args()
    algorithm = args.algo
    agent = args.agent
    output = args.output
    use_encoder = './results/' + args.use_encoder + '/'
    time_steps = int(1e7)
    env_name = 'HalfCheetah-v2'

    tf.reset_default_graph()

    with tf.Session() as sess:

        def make_env(use_encoder=True, env_name=env_name):

            if use_encoder:
                return WrappedEnv(sess=sess,
                                  env_name=env_name,
                                  feature_dim=10,
                                  encoder_gamma=0.98,
                                  encoder_hidden_size=128,
                                  dynamics_hidden_size=256,
                                  invdyn_hidden_size=256,
                                  encoder_lr=0.0003,
                                  dynamics_lr=0.0003,
                                  invdyn_lr=0.0003)
            else:
                return gym.make(env_name)

        if algorithm == 'ddpg':

            from stable_baselines.common.cmd_util import SubprocVecEnv
            from stable_baselines.common.callbacks import CheckpointCallback
            from stable_baselines.ddpg.policies import MlpPolicy
            from stable_baselines.common.noise import OrnsteinUhlenbeckActionNoise
            from stable_baselines import DDPG

            env = make_env(use_encoder=use_encoder)
            sess.run(tf.global_variables_initializer())
            if use_encoder:
                env = random_run_for_encoder_training(env,
                                                      num_epochs=200,
                                                      num_iters=500)

            n_actions = env.action_space.shape[-1]
            param_noise = None
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions),
                sigma=float(0.22) * np.ones(n_actions))

            policy_kwargs = dict(act_fun=tf.nn.tanh, layers=[64, 64])

            model = DDPG(MlpPolicy,
                         env,
                         gamma=0.99,
                         batch_size=128,
                         verbose=1,
                         param_noise=param_noise,
                         action_noise=action_noise,
                         policy_kwargs=policy_kwargs,
                         tensorboard_log=output + algorithm + '_' +
                         str(use_encoder) + '/' + 'log/')

            checkpoint_callback = CheckpointCallback(
                save_freq=4000,
                save_path=output + algorithm + '_' + str(use_encoder) + '/',
                name_prefix='agent')
            model.set_env(env)
            model.learn(total_timesteps=time_steps,
                        callback=checkpoint_callback,
                        reset_num_timesteps=False)
            model.save(output + algorithm + '_' + str(use_encoder) + '/' +
                       agent)
            env.close()
            del model

        elif algorithm == 'ppo':

            from stable_baselines.common.policies import MlpPolicy, MlpLnLstmPolicy
            from stable_baselines import PPO2
            from stable_baselines.common.cmd_util import SubprocVecEnv
            from stable_baselines.common.callbacks import CheckpointCallback

            env = make_env(use_encoder=use_encoder)
            sess.run(tf.global_variables_initializer())
            if use_encoder:
                env = random_run_for_encoder_training(env,
                                                      num_epochs=200,
                                                      num_iters=500)

            policy_kwargs = dict(act_fun=tf.nn.tanh, layers=[64, 64])

            model = PPO2(MlpPolicy,
                         env,
                         n_steps=2048,
                         nminibatches=32,
                         lam=0.95,
                         gamma=0.99,
                         noptepochs=10,
                         verbose=1,
                         policy_kwargs=policy_kwargs,
                         tensorboard_log=output + algorithm + '_' +
                         str(use_encoder) + '/' + 'log/')

            checkpoint_callback = CheckpointCallback(
                save_freq=4096,
                save_path=output + algorithm + '_' + str(use_encoder) + '/',
                name_prefix='agent')
            model.learn(total_timesteps=time_steps,
                        callback=checkpoint_callback,
                        reset_num_timesteps=False)
            model.save(output + algorithm + '_' + str(use_encoder) + '/' +
                       agent)
            env.close()
            del model
Code Example #28
def train_decision(config=None,
                   save=False,
                   load=False,
                   calender=None,
                   history=None,
                   predict_results_dict=None,
                   test_mode=False,
                   start_date=None,
                   stop_date=None,
                   episode_steps=1000,
                   model='DDPG'):
    """
    训练决策模型,从数据库读取数据并进行决策训练

    参数:
        config:配置文件, 
        save:保存结果, 
        calender:交易日日历, 
        history:行情信息, 
        all_quotes:拼接之后的行情信息
        predict_results_dict:预测结果信息
    """
    # First, convert the string dates in the prediction results

    MODEL = model

    predict_dict = {}
    for k, v in predict_results_dict.items():
        assert isinstance(v['predict_date'].iloc[0], str)
        tmp = v['predict_date'].apply(
            lambda x: arrow.get(x, 'YYYY-MM-DD').date())
        predict_dict[k] = v.rename(index=tmp)

    env = Portfolio_Prediction_Env(config=config,
                                   calender=calender,
                                   stock_history=history,
                                   window_len=1,
                                   prediction_history=predict_dict,
                                   start_trade_date=start_date,
                                   stop_trade_date=stop_date,
                                   save=save)

    # Test mode
    if test_mode:
        obs = env.reset()
        # check_env(env)
        for i in range(1000):
            W = np.random.uniform(0.0, 1.0, size=(6, ))
            offer = np.random.uniform(-10.0, 10.0, size=(6, ))
            obs, reward, done, infos = env.step(np.hstack((W, offer)))
            # env.render()
            if done:
                env.save_history()
                break
        env.close()

    # Training mode
    if MODEL == "DDPG":
        # Add noise
        n_actions = env.action_space.shape
        param_noise = None
        # OU noise, well suited to controlling inertial systems
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) *
                                                    np.ones(n_actions))

        model_path = search_file(
            os.path.join(sys.path[0], 'saved_models', MODEL), MODEL)
        if len(model_path) > 0 and load:
            model = DDPG.load(
                model_path[0],
                env=env,
                policy=CustomDDPGPolicy,
                param_noise=param_noise,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        else:
            model = DDPG(
                policy=CustomDDPGPolicy,
                env=env,
                verbose=1,
                param_noise=param_noise,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        # Number of training timesteps
        model.learn(total_timesteps=episode_steps, )
        model.save(
            os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))

    elif MODEL == 'TD3':
        n_actions = env.action_space.shape[-1]
        # OU noise, well suited to controlling inertial systems
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) *
                                                    np.ones(n_actions))

        model_path = search_file(
            os.path.join(sys.path[0], 'saved_models', MODEL), MODEL)
        if len(model_path) > 0 and load:
            model = TD3.load(
                model_path[0],
                env=env,
                policy=CustomTD3Policy,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        else:
            model = TD3(
                policy=CustomTD3Policy,
                env=env,
                verbose=1,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        # Number of training timesteps
        model.learn(total_timesteps=episode_steps, )
        model.save(
            os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))

    elif MODEL == "HER":
        """
        env must be a GoalEnv
        """
        model_class = DDPG

        # Available strategies (cf paper): future, final, episode, random
        goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

        # Wrap the model
        model = HER(policy=CustomDDPGPolicy,
                    env=env,
                    model_class=model_class,
                    n_sampled_goal=4,
                    goal_selection_strategy=goal_selection_strategy,
                    verbose=1)
        model.learn(total_timesteps=episode_steps, )
        model.save(
            os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))

    obs = env.reset()
    # Live evaluation mode
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # env.render(info=info)
        if done:
            if save:
                env.save_history()
            env.reset()
            break

    env.close()
Code Example #29
                        log_interval=1,
                        tb_log_name=tensorboard_log_name)
            model.save(model_save_name)
elif algorithm == "DDPG":
    if train:
        for i in range(model_num):
            from stable_baselines.ddpg.policies import MlpPolicy
            from stable_baselines.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise, AdaptiveParamNoiseSpec
            from stable_baselines import DDPG
            env = gym.make(env_name)

            # the noise objects for DDPG
            n_actions = env.action_space.shape[-1]
            param_noise = None
            action_noise = OrnsteinUhlenbeckActionNoise(
                mean=np.zeros(n_actions),
                sigma=float(0.5) * np.ones(n_actions))

            model = DDPG(MlpPolicy,
                         env,
                         verbose=1,
                         param_noise=param_noise,
                         action_noise=action_noise,
                         tensorboard_log=tensorboard_log_dir)
            model.learn(total_timesteps=total_timesteps_,
                        log_interval=1,
                        tb_log_name=tensorboard_log_name)
            model.save(model_save_name)

del model  # remove to demonstrate saving and loading
Code Example #30
    def _preprocess_hyperparams(self, _hyperparams):
        # Convert to python object if needed
        if "policy_kwargs" in _hyperparams.keys() and isinstance(_hyperparams["policy_kwargs"], str):
            _hyperparams["policy_kwargs"] = eval(_hyperparams["policy_kwargs"])

        n_timesteps = _hyperparams.pop("n_timesteps", None)
        n_envs = _hyperparams.pop("n_envs", None)
        log_every = _hyperparams.pop("log_every", None)
        if not self.continue_learning:
            if not log_every:
                self.logger.debug("log_every not defined in yml file: using command line log_every {}".format(self.log_every))
                log_every = self.log_every
            else:
                self.logger.debug("using log_every as defined in yml file: {}".format(log_every))
        else:
            self.logger.debug("priority to command line log_every {}".format(self.log_every))
            log_every = self.log_every

        # Parse noise string
        if self.algo_name in ["ddpg", "sac", "td3"] and _hyperparams.get("noise_type") is not None:
            noise_type = _hyperparams["noise_type"].strip()
            noise_std = _hyperparams["noise_std"]
            n_actions = get_n_actions(env_name=self.env_name, env_variables=self.env_kwargs)
            self.logger.debug("n_actions: {}".format(n_actions))
            if "adaptive-param" in noise_type:
                assert self.algo_name == "ddpg", "Parameter is not supported by SAC"
                _hyperparams["param_noise"] = AdaptiveParamNoiseSpec(initial_stddev=noise_std, desired_action_stddev=noise_std)
            elif "normal" in noise_type:
                if "lin" in noise_type:
                    _hyperparams["action_noise"] = LinearNormalActionNoise(
                        mean=np.zeros(n_actions),
                        sigma=noise_std * np.ones(n_actions),
                        final_sigma=_hyperparams.get("noise_std_final", 0.0) * np.ones(n_actions),
                        max_steps=n_timesteps,
                    )
                else:
                    _hyperparams["action_noise"] = NormalActionNoise(
                        mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
                    )
            elif "ornstein-uhlenbeck" in noise_type:
                _hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions)
                )
            else:
                raise RuntimeError('Unknown noise type "{}"'.format(noise_type))
            self.logger.debug("Applying {} noise with std {}".format(noise_type, noise_std))
            del _hyperparams["noise_type"]
            del _hyperparams["noise_std"]
            if "noise_std_final" in _hyperparams:
                del _hyperparams["noise_std_final"]

        normalize_kwargs = _parse_normalize(dictionary=_hyperparams)

        if n_envs is None:
            self.logger.debug("n_envs not defined in yml file: using command line n_envs {}".format(self.num_envs))
            n_envs = self.num_envs
        else:
            self.logger.debug("using n_envs as num of envs defined in yml file:".format(n_envs))

        if not self.continue_learning:
            # priority to yml defined n_timesteps
            if n_timesteps is None:
                self.logger.debug(
                    "n_timesteps not defined in yml file: using command line n_timesteps {}".format(self.train_total_timesteps)
                )
                n_timesteps = self.train_total_timesteps
            else:
                self.logger.debug("using n_timesteps as total timesteps defined in yml file: {}".format(n_timesteps))
                n_timesteps = int(n_timesteps)
        else:
            if self.train_total_timesteps and self.train_total_timesteps != -1:
                assert self.train_total_timesteps <= int(n_timesteps), "train_total_timesteps <= n_timesteps: {}, {}".format(
                    self.train_total_timesteps, n_timesteps
                )
                # priority to command line n_timesteps
                self.logger.debug("priority to command line n_timesteps {}".format(self.train_total_timesteps))
                n_timesteps = self.train_total_timesteps
            elif self.train_total_timesteps == -1:
                assert n_timesteps, "n_timesteps should have a value: {}".format(n_timesteps)
                n_timesteps = int(n_timesteps)
                self.logger.info("training in continual learning = training from scratch. n_timesteps {}".format(n_timesteps))
            else:
                assert n_timesteps, "n_timesteps should have a value: {}".format(n_timesteps)
                n_timesteps = int(n_timesteps // 2)
                self.logger.debug(
                    "train_total_timesteps not specified in continue_learning: "
                    "taking half of original n_timesteps defined in yml file {}".format(n_timesteps)
                )

        assert n_timesteps % log_every == 0, "it should be possible to divide n_timesteps for log_every: {}, {}".format(
            n_timesteps, log_every
        )
        return normalize_kwargs, n_envs, n_timesteps, log_every, _hyperparams
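
For reference, an illustrative `_hyperparams` dictionary, as it might arrive from the YAML file, containing the keys this method pops and parses; the values are placeholders only.

# Placeholder values; keys match what _preprocess_hyperparams pops and parses.
_hyperparams = {
    'n_timesteps': 1000000,
    'n_envs': 1,
    'log_every': 100000,
    'policy_kwargs': "dict(layer_norm=True)",  # string, parsed with eval()
    'noise_type': 'ornstein-uhlenbeck',
    'noise_std': 0.5,
    # any remaining keys are passed through to the algorithm constructor
    'gamma': 0.99,
    'batch_size': 128,
}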