def run_experiment(args):
    hyperparam_file = os.path.join(HYPERPARAM_DIR, args.agent + ".yml")
    with open(hyperparam_file) as f:
        hyperparams = yaml.safe_load(f)

    hyperparams = hyperparams[args.env]

    n_envs = hyperparams.pop("n_envs", 1)
    n_timesteps = int(hyperparams.pop("n_timesteps"))
    policy = hyperparams.pop("policy")
    normalize = hyperparams.pop("normalize", None)

    vecEnv = []
    for i in range(n_envs):
        # Wrap "i" in an extra lambda so that each environment factory
        # captures its own index instead of referring to the same "i"
        vecEnv.append((lambda idx: lambda: create_env(args, idx))(i))

    if args.subprocenv:
        vecEnv = SubprocVecEnv(vecEnv)
    else:
        vecEnv = DummyVecEnv(vecEnv)

    # Handle learning rates
    # Taken from rl-zoo/train.py
    for key in ['learning_rate', 'cliprange', 'cliprange_vf']:
        if key not in hyperparams or args.agent == "dqn":
            continue
        if key == 'learning_rate' and args.agent == "a2c":
            continue
        if isinstance(hyperparams[key], str):
            schedule, initial_value = hyperparams[key].split('_')
            initial_value = float(initial_value)
            hyperparams[key] = linear_schedule(initial_value)
        elif isinstance(hyperparams[key], (float, int)):
            # Negative value: ignore (ex: for clipping)
            if hyperparams[key] < 0:
                continue
            hyperparams[key] = constfn(float(hyperparams[key]))

    if args.forced_cliprange is not None:
        hyperparams["cliprange"] = args.forced_cliprange

    agent_class = AVAILABLE_ALGORITHMS[args.agent]
    agent = agent_class(policy, vecEnv, verbose=1, **hyperparams)

    # Prepare callback
    checkpoint_dir = os.path.join(args.output, CHECKPOINT_DIR)
    os.makedirs(checkpoint_dir)
    # Note that save_freq is counted in number of agent step-calls,
    # not env step-calls.
    save_freq = n_timesteps // (args.num_snapshots * n_envs)

    checkpoint_callback = CheckpointCallback(save_freq, checkpoint_dir)

    agent.learn(total_timesteps=n_timesteps, callback=checkpoint_callback)

    vecEnv.close()
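
All of the snippets on this page lean on two small rl-baselines-zoo style helpers, linear_schedule and constfn, which turn YAML hyperparameter values into Stable-Baselines schedules. A minimal sketch of what they typically do (a sketch of the usual zoo utilities, not necessarily the exact implementation behind each example):

def linear_schedule(initial_value):
    """Schedule that decays linearly from initial_value to 0.

    Stable-Baselines calls schedules with the remaining training
    progress, which goes from 1.0 (start) down to 0.0 (end).
    """
    if isinstance(initial_value, str):
        initial_value = float(initial_value)

    def schedule(progress):
        return progress * initial_value

    return schedule


def constfn(value):
    """Schedule that always returns the same value."""
    def schedule(_progress):
        return value

    return schedule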
Example #2
    log_path, "{}_{}".format(ENV_ID,
                             get_latest_run_id(log_path, ENV_ID) + 1))
params_path = os.path.join(save_path, ENV_ID)
os.makedirs(params_path, exist_ok=True)

# Create learning rate schedules for ppo2 and sac
if args.algo in ["ppo2", "sac"]:
    for key in ['learning_rate', 'cliprange']:
        if key not in hyperparams:
            continue
        if isinstance(hyperparams[key], str):
            schedule, initial_value = hyperparams[key].split('_')
            initial_value = float(initial_value)
            hyperparams[key] = linear_schedule(initial_value)
        elif isinstance(hyperparams[key], float):
            hyperparams[key] = constfn(hyperparams[key])
        else:
            raise ValueError('Invalid value for {}: {}'.format(
                key, hyperparams[key]))

# Should we overwrite the number of timesteps?
if args.n_timesteps > 0:
    n_timesteps = args.n_timesteps
else:
    n_timesteps = int(hyperparams['n_timesteps'])
del hyperparams['n_timesteps']

normalize = False
normalize_kwargs = {}
if 'normalize' in hyperparams.keys():
    normalize = hyperparams['normalize']
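
For reference, the hyperparams dict these scripts load from the per-algorithm YAML file typically looks like the following (illustrative values only; the "lin_" prefix is exactly what the schedule-parsing loop above splits on):

# Illustrative ppo2 entry for a single environment; values are examples, not tuned settings.
hyperparams = {
    "n_envs": 8,
    "n_timesteps": 1e6,
    "policy": "MlpPolicy",
    "n_steps": 128,
    "learning_rate": "lin_2.5e-4",  # becomes linear_schedule(2.5e-4)
    "cliprange": 0.2,               # becomes constfn(0.2)
    "normalize": True,
}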
Example #3
        print("Using {} environments".format(n_envs))

    # Create learning rate schedules for ppo2 and sac
    if algo_ in ["ppo2", "sac", "td3"]:
        for key in ['learning_rate', 'cliprange', 'cliprange_vf']:
            if key not in hyperparams:
                continue
            if isinstance(hyperparams[key], str):
                schedule, initial_value = hyperparams[key].split('_')
                initial_value = float(initial_value)
                hyperparams[key] = linear_schedule(initial_value)
            elif isinstance(hyperparams[key], (float, int)):
                # Negative value: ignore (ex: for clipping)
                if hyperparams[key] < 0:
                    continue
                hyperparams[key] = constfn(float(hyperparams[key]))
            else:
                raise ValueError('Invalid value for {}: {}'.format(
                    key, hyperparams[key]))

    # Should we overwrite the number of timesteps?
    if args.n_timesteps > 0:
        if args.verbose:
            print("Overwriting n_timesteps with n={}".format(args.n_timesteps))
        n_timesteps = args.n_timesteps
    else:
        n_timesteps = int(hyperparams['n_timesteps'])

    normalize = False
    normalize_kwargs = {}
    if 'normalize' in hyperparams.keys():
        normalize = hyperparams['normalize']
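
Examples #2 and #3 stop right after reading the normalize flag; in the zoo-style scripts that flag is usually consumed by wrapping the vectorized environment in VecNormalize. A minimal sketch under that assumption (env stands for the vectorized training environment created earlier; the exact zoo handling may differ):

from stable_baselines.common.vec_env import VecNormalize

if normalize:
    if isinstance(normalize, str):
        # The YAML value may be a dict literal such as "{'norm_reward': False}"
        normalize_kwargs = eval(normalize)
        normalize = True
    del hyperparams['normalize']
    env = VecNormalize(env, **normalize_kwargs)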
Example #4
def train_HER(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir, log_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    policy = kwargs['policy']
    algo_name = kwargs['algo_name']
    n_timesteps = kwargs['n_timesteps']
    noise_type = None
    if 'noise_type' in kwargs:
        noise_type = kwargs['noise_type']
        del kwargs['noise_type']

    # Available HER strategies (cf. the paper): future, final, episode, random
    goal_selection_strategy = kwargs['goal_selection_strategy']
    n_sampled_goal = kwargs['n_sampled_goal']

    del kwargs['policy']
    del kwargs['algo_name']
    del kwargs['n_timesteps']
    del kwargs['goal_selection_strategy']
    del kwargs['n_sampled_goal']

    # Set agent algorithm
    agent = set_agent(algo_name)
    if not agent:
        print("invalid algorithm for HER")
        return

    # the noise objects
    nb_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = None

    if noise_type:

        for current_noise_type in noise_type.split(','):

            current_noise_type = current_noise_type.strip()

            if 'adaptive-param' in current_noise_type and algo_name == 'ddpg':
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))

            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))

            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))

            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    # Create learning rate schedule
    for key in ['learning_rate', 'learning_rate_pi', 'cliprange']:
        if key in kwargs:
            if isinstance(kwargs[key], str):
                schedule, initial_value = kwargs[key].split('_')
                initial_value = float(initial_value)
                kwargs[key] = linear_schedule(initial_value)
            elif isinstance(kwargs[key], float):
                kwargs[key] = constfn(kwargs[key])
            else:
                raise ValueError('Invalid value for {}: {}'.format(
                    key, kwargs[key]))

    kwargs['tensorboard_log'] = os.path.join(log_dir, 'tb')
    kwargs['full_tensorboard_log'] = False
    kwargs['seed'] = seed
    kwargs['action_noise'] = action_noise
    if algo_name == 'ddpg':
        kwargs['param_noise'] = param_noise

    if 'continue' in kwargs and kwargs['continue'] is True:
        # Continue training
        print("Loading pretrained agent")
        # Policy should not be changed
        for key in ['policy', 'policy_kwargs']:
            if key in kwargs:
                del kwargs[key]

        model = HER.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         verbose=1,
                         **kwargs)
    else:
        if 'continue' in kwargs:
            del kwargs['continue']
        model = HER(policy,
                    env,
                    agent,
                    goal_selection_strategy=goal_selection_strategy,
                    n_sampled_goal=n_sampled_goal,
                    verbose=1,
                    **kwargs)

    model.learn(total_timesteps=n_timesteps, callback=log_callback)

    return model
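
A hypothetical call of train_HER, just to show how the kwargs dict is consumed (the environment name and values are placeholders, not settings from the original project):

import gym

env = gym.make("FetchReach-v1")  # any goal-based Gym environment
model = train_HER(
    env,
    out_dir="logs/her_sac",
    seed=0,
    policy="MlpPolicy",
    algo_name="sac",
    n_timesteps=50000,
    goal_selection_strategy="future",  # cf. the HER paper: future / final / episode / random
    n_sampled_goal=4,
    noise_type="normal_0.1",           # parsed into NormalActionNoise with sigma 0.1
)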
Example #5
    def create_model(
        self,
        seed,
        algo_name,
        env,
        tensorboard_log_dir,
        hyperparams,
        best_model_save_path=None,
        model_to_load=None,
        continue_learning=False,
        env_name="CartPole-v1",
        n_timesteps=-1,
        save_replay_buffer: bool = True,
    ):

        old_hyperparams = dict()

        # Create learning rate schedules for ppo2 and sac
        if algo_name in ["ppo2", "sac", "td3"]:
            for key in ["learning_rate", "cliprange", "cliprange_vf"]:
                if key not in hyperparams:
                    continue
                if isinstance(hyperparams[key], str):
                    self.logger.debug("Key {}, value {}".format(key, hyperparams[key]))
                    old_hyperparams[key] = hyperparams[key]
                    schedule, initial_value = hyperparams[key].split("_")
                    initial_value = float(initial_value)
                    hyperparams[key] = linear_schedule(initial_value)
                elif isinstance(hyperparams[key], (float, int)):
                    # Negative value: ignore (ex: for clipping)
                    if hyperparams[key] < 0:
                        continue
                    old_hyperparams[key] = float(hyperparams[key])
                    hyperparams[key] = constfn(float(hyperparams[key]))
                else:
                    raise ValueError("Invalid value for {}: {}".format(key, hyperparams[key]))

        if algo_name == "ppo2":

            if self.sb_version == "sb3":
                raise NotImplementedError("PPO still in sb2")

            if best_model_save_path and continue_learning:
                model = PPO2.load(
                    self.load_model(best_model_save_path, model_to_load),
                    env=env,
                    tensorboard_log=tensorboard_log_dir,
                    verbose=1,
                )
                key = "cliprange"
                cl_cliprange_value = 0.08  # new policy can be a bit different from the old one
                if key in old_hyperparams:
                    if isinstance(old_hyperparams[key], str):
                        self.logger.debug("Setting cliprange to lin_{}".format(cl_cliprange_value))
                        model.cliprange = linear_schedule(cl_cliprange_value)
                    elif isinstance(old_hyperparams[key], (float, int)):
                        self.logger.debug("Setting cliprange to value {}".format(cl_cliprange_value))
                        model.cliprange = constfn(cl_cliprange_value)
                else:
                    # default value is too high for continual learning (0.2)
                    self.logger.debug("Setting cliprange to value {}".format(cl_cliprange_value))
                    model.cliprange = cl_cliprange_value

                return model
            elif best_model_save_path:
                return PPO2.load(
                    self.load_model(best_model_save_path, model_to_load),
                    env=env,
                    tensorboard_log=tensorboard_log_dir,
                    verbose=1,
                    n_cpu_tf_sess=n_cpu_tf_sess,
                )
            return PPO2(
                env=env,
                verbose=1,
                tensorboard_log=tensorboard_log_dir,
                **hyperparams,
                n_cpu_tf_sess=n_cpu_tf_sess,
            )

        elif algo_name == "sac":
            if self.sb_version == "sb3":
                if best_model_save_path and continue_learning:
                    model = stable_baselines3.SAC.load(
                        self.load_model(best_model_save_path, model_to_load),
                        env=env,
                        seed=seed,
                        tensorboard_log=tensorboard_log_dir,
                        verbose=1,
                    )
                    model.load_replay_buffer(path=best_model_save_path + "/replay_buffer")
                    self.logger.debug("Model replay buffer size: {}".format(model.replay_buffer.size()))
                    self.logger.debug("Setting learning_starts to 0")
                    model.learning_starts = 0

                    value = get_value_given_key(best_model_save_path + "/progress.csv", "ent_coef")
                    if value:
                        ent_coef = float(value)
                        self.logger.debug("Restore model old ent_coef: {}".format("auto_" + str(ent_coef)))
                        model.ent_coef = "auto_" + str(ent_coef)
                        model.target_entropy = str(ent_coef)

                    return model
                elif best_model_save_path:
                    return stable_baselines3.SAC.load(
                        self.load_model(best_model_save_path, model_to_load),
                        env=env,
                        seed=seed,
                        tensorboard_log=tensorboard_log_dir,
                        verbose=1,
                        n_cpu_tf_sess=n_cpu_tf_sess,
                    )
                assert n_timesteps > 0, "n_timesteps must be > 0, got {}".format(n_timesteps)
                return stable_baselines3.SAC(env=env, verbose=0, seed=seed, tensorboard_log=tensorboard_log_dir, **hyperparams)

            else:
                if best_model_save_path and continue_learning:
                    model = CustomSAC.load(
                        self.load_model(best_model_save_path, model_to_load),
                        env=env,
                        tensorboard_log=tensorboard_log_dir,
                        verbose=1,
                    )
                    self.logger.debug("Model replay buffer size: {}".format(len(model.replay_buffer)))
                    self.logger.debug("Setting learning_starts to 0")
                    model.learning_starts = 0
                    if not save_replay_buffer:
                        self.logger.debug("Setting save_replay_buffer to False")
                        model.save_replay_buffer = False

                    value = get_value_given_key(best_model_save_path + "/progress.csv", "ent_coef")
                    if value:
                        ent_coef = float(value)
                        self.logger.debug("Restore model old ent_coef: {}".format("auto_" + str(ent_coef)))
                        model.ent_coef = "auto_" + str(ent_coef)
                        model.target_entropy = str(ent_coef)

                    return model

                elif best_model_save_path:
                    # do not load replay buffer since we are in testing mode (no continue_learning)
                    return SAC.load(
                        self.load_model(best_model_save_path, model_to_load),
                        env=env,
                        tensorboard_log=tensorboard_log_dir,
                        verbose=1,
                        n_cpu_tf_sess=n_cpu_tf_sess,
                    )
                return CustomSAC(
                    total_timesteps=n_timesteps,
                    env=env,
                    verbose=1,
                    tensorboard_log=tensorboard_log_dir,
                    **hyperparams,
                    n_cpu_tf_sess=n_cpu_tf_sess,
                    save_replay_buffer=save_replay_buffer,
                )

        elif algo_name == "dqn":

            if self.sb_version == "sb3":

                if best_model_save_path:
                    if continue_learning:
                        model = stable_baselines3.DQN.load(
                            self.load_model(best_model_save_path, model_to_load),
                            env=env,
                            seed=seed,
                            tensorboard_log=tensorboard_log_dir,
                            verbose=0,
                        )
                        model.load_replay_buffer(path=best_model_save_path + "/replay_buffer")
                        model.learning_starts = 0
                        model.exploration_fraction = 0.0005
                        model.exploration_initial_eps = model.exploration_final_eps
                        model.exploration_schedule = get_linear_fn(
                            model.exploration_initial_eps, model.exploration_final_eps, model.exploration_fraction
                        )
                        self.logger.debug("Model replay buffer size: {}".format(model.replay_buffer.size()))
                        self.logger.debug("Setting learning_starts to {}".format(model.learning_starts))
                        self.logger.debug("Setting exploration_fraction to {}".format(model.exploration_fraction))
                        self.logger.debug("Setting exploration_initial_eps to {}".format(model.exploration_initial_eps))
                        return model
                    return stable_baselines3.DQN.load(
                        self.load_model(best_model_save_path, model_to_load),
                        env=env,
                        seed=seed,
                        tensorboard_log=tensorboard_log_dir,
                        verbose=1,
                    )
                return stable_baselines3.DQN(env=env, verbose=0, seed=seed, tensorboard_log=tensorboard_log_dir, **hyperparams)
            else:
                if best_model_save_path:
                    if continue_learning:
                        model = CustomDQN.load(
                            self.load_model(best_model_save_path, model_to_load),
                            env=env,
                            tensorboard_log=tensorboard_log_dir,
                            verbose=1,
                        )
                        self.logger.debug("Model replay buffer size: {}".format(len(model.replay_buffer)))
                        self.logger.debug(
                            "Setting exploration initial eps to exploration final eps {}".format(model.exploration_final_eps)
                        )
                        self.logger.debug("Setting learning_starts to 0")
                        if not save_replay_buffer:
                            self.logger.debug("Setting save_replay_buffer to False")
                            model.save_replay_buffer = False
                        model.learning_starts = 0
                        model.exploration_fraction = 0.005
                        model.exploration_initial_eps = model.exploration_final_eps
                        return model
                    return DQN.load(
                        self.load_model(best_model_save_path, model_to_load),
                        env=env,
                        tensorboard_log=tensorboard_log_dir,
                        verbose=1,
                        n_cpu_tf_sess=n_cpu_tf_sess,
                    )
                return CustomDQN(
                    env=env,
                    save_replay_buffer=save_replay_buffer,
                    verbose=1,
                    tensorboard_log=tensorboard_log_dir,
                    **hyperparams,
                    n_cpu_tf_sess=n_cpu_tf_sess,
                )
        raise NotImplementedError("algo_name {} not supported yet".format(algo_name))
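
get_value_given_key is a project-specific helper used above to recover the last logged ent_coef from progress.csv; one plausible minimal implementation (an assumption, not the project's actual code) is:

import csv

def get_value_given_key(csv_path, key):
    """Return the last non-empty value logged under `key` in a CSV file, or None."""
    last_value = None
    with open(csv_path, newline="") as f:
        for row in csv.DictReader(f):
            if row.get(key):
                last_value = row[key]
    return last_value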
Example #6
def train_SAC(env, eval_env, out_dir, seed=None, **kwargs):

    # Delete keys so the dict can be passed to the model constructor
    policy = kwargs['policy']
    n_timesteps = kwargs['n_timesteps']
    noise_type = None
    if 'noise_type' in kwargs:
        noise_type = kwargs['noise_type']
        del kwargs['noise_type']
    del kwargs['policy']
    del kwargs['n_timesteps']

    save_frequency = 10000
    eval_frequency = 50000
    eval_episodes = 1000
    if 'save_freq' in kwargs:
        save_frequency = kwargs['save_freq']
        del kwargs['save_freq']

    if 'eval_freq' in kwargs:
        eval_frequency = kwargs['eval_freq']
        del kwargs['eval_freq']

    if 'eval_episodes' in kwargs:
        eval_episodes = kwargs['eval_episodes']
        del kwargs['eval_episodes']

    # the noise objects - usually not necessary for SAC but can help for hard exploration tasks
    nb_actions = env.action_space.shape[-1]
    action_noise = None
    if noise_type:

        for current_noise_type in noise_type.split(','):

            current_noise_type = current_noise_type.strip()

            if 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                                 sigma=float(stddev) *
                                                 np.ones(nb_actions))

            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))

            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    # Create learning rate schedule
    for key in ['learning_rate', 'learning_rate_pi', 'cliprange']:
        if key in kwargs:
            if isinstance(kwargs[key], str):
                schedule, initial_value = kwargs[key].split('_')
                initial_value = float(initial_value)
                kwargs[key] = linear_schedule(initial_value)
            elif isinstance(kwargs[key], float):
                kwargs[key] = constfn(kwargs[key])
            else:
                raise ValueError('Invalid value for {}: {}'.format(
                    key, kwargs[key]))

    if 'continue' in kwargs and kwargs['continue'] is True:
        print("Loading pretrained agent")
        list_of_models = glob.glob(os.path.join(out_dir, '*.zip'))
        last_saved_model = max(list_of_models, key=os.path.getctime)
        # Pop num_timesteps first so it is not forwarded to load() as a model kwarg
        num_timesteps = kwargs.pop('num_timesteps', None)
        model = SAC_residual.load(last_saved_model,
                                  env=env,
                                  tensorboard_log=os.path.join(out_dir, 'tb'),
                                  verbose=1,
                                  **kwargs)
        reset_num_timesteps = False
        if num_timesteps is not None:
            model.num_timesteps = num_timesteps
    else:
        if 'continue' in kwargs:
            del kwargs['continue']
        # create model
        model = SAC(policy,
                    env,
                    action_noise=action_noise,
                    seed=seed,
                    verbose=1,
                    tensorboard_log=os.path.join(out_dir, 'tb'),
                    full_tensorboard_log=False,
                    **kwargs)
        reset_num_timesteps = True

    # start training
    train_callback = get_train_callback(eval_env,
                                        seed,
                                        out_dir,
                                        save_f=save_frequency,
                                        eval_f=eval_frequency,
                                        eval_ep=eval_episodes)
    model.learn(total_timesteps=n_timesteps,
                callback=train_callback,
                log_interval=10,
                reset_num_timesteps=reset_num_timesteps)

    return model
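
get_train_callback is another project-specific helper; a plausible sketch that assembles it from stable-baselines' built-in checkpoint and evaluation callbacks (the signature mirrors the call above, but the real helper may differ):

from stable_baselines.common.callbacks import (CallbackList, CheckpointCallback,
                                               EvalCallback)

def get_train_callback(eval_env, seed, out_dir, save_f=10000, eval_f=50000, eval_ep=1000):
    """Combine periodic checkpointing with periodic evaluation on eval_env."""
    checkpoint_cb = CheckpointCallback(save_freq=save_f,
                                       save_path=out_dir,
                                       name_prefix='rl_model_seed{}'.format(seed))
    eval_cb = EvalCallback(eval_env,
                           best_model_save_path=out_dir,
                           eval_freq=eval_f,
                           n_eval_episodes=eval_ep)
    return CallbackList([checkpoint_cb, eval_cb])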