def run_experiment(args):
    hyperparam_file = os.path.join(HYPERPARAM_DIR, args.agent + ".yml")
    with open(hyperparam_file) as f:
        hyperparams = yaml.safe_load(f)
    hyperparams = hyperparams[args.env]

    n_envs = hyperparams.pop("n_envs", 1)
    n_timesteps = int(hyperparams.pop("n_timesteps"))
    policy = hyperparams.pop("policy")
    normalize = hyperparams.pop("normalize", None)

    env_fns = []
    for i in range(n_envs):
        # Bind the current value of "i" through an outer lambda so each
        # thunk captures its own index instead of the shared loop variable.
        env_fns.append((lambda idx: lambda: create_env(args, idx))(i))
    if args.subprocenv:
        vec_env = SubprocVecEnv(env_fns)
    else:
        vec_env = DummyVecEnv(env_fns)

    # Turn learning-rate / clip-range entries into schedule callables
    # (adapted from rl-zoo/train.py).
    for key in ['learning_rate', 'cliprange', 'cliprange_vf']:
        if key not in hyperparams or args.agent == "dqn":
            continue
        if key == 'learning_rate' and args.agent == "a2c":
            continue
        if isinstance(hyperparams[key], str):
            schedule, initial_value = hyperparams[key].split('_')
            initial_value = float(initial_value)
            hyperparams[key] = linear_schedule(initial_value)
        elif isinstance(hyperparams[key], (float, int)):
            # Negative value: ignore (e.g. to disable clipping)
            if hyperparams[key] < 0:
                continue
            hyperparams[key] = constfn(float(hyperparams[key]))

    if args.forced_cliprange is not None:
        hyperparams["cliprange"] = args.forced_cliprange

    agent_class = AVAILABLE_ALGORITHMS[args.agent]
    agent = agent_class(policy, vec_env, verbose=1, **hyperparams)

    # Prepare the checkpoint callback. Note that save_freq is counted in
    # agent step-calls, not env step-calls.
    checkpoint_dir = os.path.join(args.output, CHECKPOINT_DIR)
    os.makedirs(checkpoint_dir)
    save_freq = n_timesteps // (args.num_snapshots * n_envs)
    checkpoint_callback = CheckpointCallback(save_freq, checkpoint_dir)

    agent.learn(total_timesteps=n_timesteps, callback=checkpoint_callback)
    vec_env.close()
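# linear_schedule and constfn are used above but not defined in this file.
# A minimal sketch consistent with how stable-baselines consumes them (it
# calls the schedule with the remaining-progress fraction, which anneals
# from 1 at the start of training to 0 at the end); an illustrative
# assumption, not necessarily the repo's exact helpers:

def linear_schedule(initial_value):
    """Schedule that decays linearly from initial_value to 0."""
    def schedule(progress):
        # progress goes from 1 (beginning of training) to 0 (end).
        return progress * initial_value
    return schedule


def constfn(value):
    """Schedule that always returns the same value."""
    def schedule(_):
        return value
    return schedule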
log_path, "{}_{}".format(ENV_ID, get_latest_run_id(log_path, ENV_ID) + 1)) params_path = os.path.join(save_path, ENV_ID) os.makedirs(params_path, exist_ok=True) # Create learning rate schedules for ppo2 and sac if args.algo in ["ppo2", "sac"]: for key in ['learning_rate', 'cliprange']: if key not in hyperparams: continue if isinstance(hyperparams[key], str): schedule, initial_value = hyperparams[key].split('_') initial_value = float(initial_value) hyperparams[key] = linear_schedule(initial_value) elif isinstance(hyperparams[key], float): hyperparams[key] = constfn(hyperparams[key]) else: raise ValueError('Invalid valid for {}: {}'.format( key, hyperparams[key])) # Should we overwrite the number of timesteps? if args.n_timesteps > 0: n_timesteps = args.n_timesteps else: n_timesteps = int(hyperparams['n_timesteps']) del hyperparams['n_timesteps'] normalize = False normalize_kwargs = {} if 'normalize' in hyperparams.keys(): normalize = hyperparams['normalize']
print("Using {} environments".format(n_envs)) # Create learning rate schedules for ppo2 and sac if algo_ in ["ppo2", "sac", "td3"]: for key in ['learning_rate', 'cliprange', 'cliprange_vf']: if key not in hyperparams: continue if isinstance(hyperparams[key], str): schedule, initial_value = hyperparams[key].split('_') initial_value = float(initial_value) hyperparams[key] = linear_schedule(initial_value) elif isinstance(hyperparams[key], (float, int)): # Negative value: ignore (ex: for clipping) if hyperparams[key] < 0: continue hyperparams[key] = constfn(float(hyperparams[key])) else: raise ValueError('Invalid value for {}: {}'.format( key, hyperparams[key])) # Should we overwrite the number of timesteps? if args.n_timesteps > 0: if args.verbose: print("Overwriting n_timesteps with n={}".format(args.n_timesteps)) n_timesteps = args.n_timesteps else: n_timesteps = int(hyperparams['n_timesteps']) normalize = False normalize_kwargs = {} if 'normalize' in hyperparams.keys():
def train_HER(env, out_dir, seed=None, **kwargs):
    # Logs will be saved in log_dir/monitor.csv
    global output_dir, log_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    # Pop entries that must not be forwarded to the model constructor.
    policy = kwargs.pop('policy')
    algo_name = kwargs.pop('algo_name')
    n_timesteps = kwargs.pop('n_timesteps')
    noise_type = kwargs.pop('noise_type', None)
    # HER available strategies (cf. paper): future, final, episode, random
    goal_selection_strategy = kwargs.pop('goal_selection_strategy')
    n_sampled_goal = kwargs.pop('n_sampled_goal')
    continue_training = kwargs.pop('continue', False)

    # Set agent algorithm
    agent = set_agent(algo_name)
    if not agent:
        print("invalid algorithm for HER")
        return

    # The noise objects
    nb_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = None
    if noise_type:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if 'adaptive-param' in current_noise_type and algo_name == 'ddpg':
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(
                    mean=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    # Create learning rate schedule
    for key in ['learning_rate', 'learning_rate_pi', 'cliprange']:
        if key in kwargs:
            if isinstance(kwargs[key], str):
                schedule, initial_value = kwargs[key].split('_')
                initial_value = float(initial_value)
                kwargs[key] = linear_schedule(initial_value)
            elif isinstance(kwargs[key], float):
                kwargs[key] = constfn(kwargs[key])
            else:
                raise ValueError('Invalid value for {}: {}'.format(
                    key, kwargs[key]))

    kwargs['tensorboard_log'] = os.path.join(log_dir, 'tb')
    kwargs['full_tensorboard_log'] = False
    kwargs['seed'] = seed
    kwargs['action_noise'] = action_noise
    if algo_name == 'ddpg':
        kwargs['param_noise'] = param_noise

    if continue_training:
        # Continue training from a saved model.
        print("Loading pretrained agent")
        # Policy should not be changed
        for key in ['policy', 'policy_kwargs']:
            if key in kwargs:
                del kwargs[key]
        model = HER.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env, verbose=1, **kwargs)
    else:
        model = HER(policy, env, agent,
                    goal_selection_strategy=goal_selection_strategy,
                    n_sampled_goal=n_sampled_goal,
                    verbose=1, **kwargs)

    model.learn(total_timesteps=n_timesteps, callback=log_callback)
    return model
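# The noise_type string parsed above encodes comma-separated
# "<kind>_<stddev>" specs. A hypothetical example for DDPG:
#
#   noise_type: "adaptive-param_0.2,ou_0.3"
#
# yields AdaptiveParamNoiseSpec(initial_stddev=0.2,
# desired_action_stddev=0.2) plus an OrnsteinUhlenbeckActionNoise with
# sigma = 0.3 on every action dimension; if several action-noise specs
# appear, the last one wins because action_noise is reassigned each time.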
def create_model(
    self,
    seed,
    algo_name,
    env,
    tensorboard_log_dir,
    hyperparams,
    best_model_save_path=None,
    model_to_load=None,
    continue_learning=False,
    env_name="CartPole-v1",
    n_timesteps=-1,
    save_replay_buffer: bool = True,
):
    old_hyperparams = dict()
    # Create learning rate schedules for ppo2, sac and td3
    if algo_name in ["ppo2", "sac", "td3"]:
        for key in ["learning_rate", "cliprange", "cliprange_vf"]:
            if key not in hyperparams:
                continue
            if isinstance(hyperparams[key], str):
                self.logger.debug("Key {}, value {}".format(key, hyperparams[key]))
                old_hyperparams[key] = hyperparams[key]
                schedule, initial_value = hyperparams[key].split("_")
                initial_value = float(initial_value)
                hyperparams[key] = linear_schedule(initial_value)
            elif isinstance(hyperparams[key], (float, int)):
                # Negative value: ignore (e.g. to disable clipping)
                if hyperparams[key] < 0:
                    continue
                old_hyperparams[key] = float(hyperparams[key])
                hyperparams[key] = constfn(float(hyperparams[key]))
            else:
                raise ValueError("Invalid value for {}: {}".format(key, hyperparams[key]))

    if algo_name == "ppo2":
        if self.sb_version == "sb3":
            raise NotImplementedError("PPO still in sb2")
        if best_model_save_path and continue_learning:
            model = PPO2.load(
                self.load_model(best_model_save_path, model_to_load),
                env=env,
                tensorboard_log=tensorboard_log_dir,
                verbose=1,
            )
            key = "cliprange"
            # The new policy can be a bit different than the old one, so use
            # a small clip range when continuing to learn.
            cl_cliprange_value = 0.08
            if key in old_hyperparams:
                if isinstance(old_hyperparams[key], str):
                    self.logger.debug("Setting cliprange to lin_{}".format(cl_cliprange_value))
                    model.cliprange = linear_schedule(cl_cliprange_value)
                elif isinstance(old_hyperparams[key], (float, int)):
                    self.logger.debug("Setting cliprange to value {}".format(cl_cliprange_value))
                    model.cliprange = constfn(cl_cliprange_value)
            else:
                # The default clip range (0.2) is too high for continual learning.
                self.logger.debug("Setting cliprange to value {}".format(cl_cliprange_value))
                model.cliprange = cl_cliprange_value
            return model
        elif best_model_save_path:
            return PPO2.load(
                self.load_model(best_model_save_path, model_to_load),
                env=env,
                tensorboard_log=tensorboard_log_dir,
                verbose=1,
                n_cpu_tf_sess=n_cpu_tf_sess,
            )
        return PPO2(
            env=env,
            verbose=1,
            tensorboard_log=tensorboard_log_dir,
            **hyperparams,
            n_cpu_tf_sess=n_cpu_tf_sess,
        )

    elif algo_name == "sac":
        if self.sb_version == "sb3":
            if best_model_save_path and continue_learning:
                model = stable_baselines3.SAC.load(
                    self.load_model(best_model_save_path, model_to_load),
                    env=env,
                    seed=seed,
                    tensorboard_log=tensorboard_log_dir,
                    verbose=1,
                )
                model.load_replay_buffer(path=best_model_save_path + "/replay_buffer")
                self.logger.debug("Model replay buffer size: {}".format(model.replay_buffer.size()))
                self.logger.debug("Setting learning_starts to 0")
                model.learning_starts = 0
                value = get_value_given_key(best_model_save_path + "/progress.csv", "ent_coef")
                if value:
                    ent_coef = float(value)
                    self.logger.debug("Restore model old ent_coef: {}".format("auto_" + str(ent_coef)))
                    model.ent_coef = "auto_" + str(ent_coef)
                    model.target_entropy = str(ent_coef)
                return model
            elif best_model_save_path:
                return stable_baselines3.SAC.load(
                    self.load_model(best_model_save_path, model_to_load),
                    env=env,
                    seed=seed,
                    tensorboard_log=tensorboard_log_dir,
                    verbose=1,
                )
            assert n_timesteps > 0, "n_timesteps > 0: {}".format(n_timesteps)
            return stable_baselines3.SAC(
                env=env, verbose=0, seed=seed,
                tensorboard_log=tensorboard_log_dir, **hyperparams)
        else:
            if best_model_save_path and continue_learning:
                model = CustomSAC.load(
                    self.load_model(best_model_save_path, model_to_load),
                    env=env,
                    tensorboard_log=tensorboard_log_dir,
                    verbose=1,
                )
                self.logger.debug("Model replay buffer size: {}".format(len(model.replay_buffer)))
                self.logger.debug("Setting learning_starts to 0")
                model.learning_starts = 0
                if not save_replay_buffer:
                    self.logger.debug("Setting save_replay_buffer to False")
                    model.save_replay_buffer = False
                value = get_value_given_key(best_model_save_path + "/progress.csv", "ent_coef")
                if value:
                    ent_coef = float(value)
                    self.logger.debug("Restore model old ent_coef: {}".format("auto_" + str(ent_coef)))
                    model.ent_coef = "auto_" + str(ent_coef)
                    model.target_entropy = str(ent_coef)
                return model
            elif best_model_save_path:
                # Do not load the replay buffer since we are in testing mode
                # (no continue_learning).
                return SAC.load(
                    self.load_model(best_model_save_path, model_to_load),
                    env=env,
                    tensorboard_log=tensorboard_log_dir,
                    verbose=1,
                    n_cpu_tf_sess=n_cpu_tf_sess,
                )
            return CustomSAC(
                total_timesteps=n_timesteps,
                env=env,
                verbose=1,
                tensorboard_log=tensorboard_log_dir,
                **hyperparams,
                n_cpu_tf_sess=n_cpu_tf_sess,
                save_replay_buffer=save_replay_buffer,
            )

    elif algo_name == "dqn":
        if self.sb_version == "sb3":
            if best_model_save_path:
                if continue_learning:
                    model = stable_baselines3.DQN.load(
                        self.load_model(best_model_save_path, model_to_load),
                        env=env,
                        seed=seed,
                        tensorboard_log=tensorboard_log_dir,
                        verbose=0,
                    )
                    model.load_replay_buffer(path=best_model_save_path + "/replay_buffer")
                    model.learning_starts = 0
                    model.exploration_fraction = 0.0005
                    model.exploration_initial_eps = model.exploration_final_eps
                    model.exploration_schedule = get_linear_fn(
                        model.exploration_initial_eps,
                        model.exploration_final_eps,
                        model.exploration_fraction,
                    )
                    self.logger.debug("Model replay buffer size: {}".format(model.replay_buffer.size()))
                    self.logger.debug("Setting learning_starts to {}".format(model.learning_starts))
                    self.logger.debug("Setting exploration_fraction to {}".format(model.exploration_fraction))
                    self.logger.debug("Setting exploration_initial_eps to {}".format(model.exploration_initial_eps))
                    return model
                return stable_baselines3.DQN.load(
                    self.load_model(best_model_save_path, model_to_load),
                    env=env,
                    seed=seed,
                    tensorboard_log=tensorboard_log_dir,
                    verbose=1,
                )
            return stable_baselines3.DQN(
                env=env, verbose=0, seed=seed,
                tensorboard_log=tensorboard_log_dir, **hyperparams)
        else:
            if best_model_save_path:
                if continue_learning:
                    model = CustomDQN.load(
                        self.load_model(best_model_save_path, model_to_load),
                        env=env,
                        tensorboard_log=tensorboard_log_dir,
                        verbose=1,
                    )
                    self.logger.debug("Model replay buffer size: {}".format(len(model.replay_buffer)))
                    self.logger.debug(
                        "Setting exploration initial eps to exploration final eps {}".format(model.exploration_final_eps)
                    )
                    self.logger.debug("Setting learning_starts to 0")
                    if not save_replay_buffer:
                        self.logger.debug("Setting save_replay_buffer to False")
                        model.save_replay_buffer = False
                    model.learning_starts = 0
                    model.exploration_fraction = 0.005
                    model.exploration_initial_eps = model.exploration_final_eps
                    return model
                return DQN.load(
                    self.load_model(best_model_save_path, model_to_load),
                    env=env,
                    tensorboard_log=tensorboard_log_dir,
                    verbose=1,
                    n_cpu_tf_sess=n_cpu_tf_sess,
                )
            return CustomDQN(
                env=env,
                save_replay_buffer=save_replay_buffer,
                verbose=1,
                tensorboard_log=tensorboard_log_dir,
                **hyperparams,
                n_cpu_tf_sess=n_cpu_tf_sess,
            )

    raise NotImplementedError("algo_name {} not supported yet".format(algo_name))
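# get_value_given_key (used above to restore ent_coef from progress.csv)
# is not defined in this excerpt. A minimal sketch, assuming it returns
# the value of the given column in the last row of the CSV (None if the
# column never appears); an illustration of the expected behaviour only:

import csv


def get_value_given_key(csv_file, key):
    value = None
    with open(csv_file) as f:
        for row in csv.DictReader(f):
            value = row.get(key, value)
    return value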
def train_SAC(env, eval_env, out_dir, seed=None, **kwargs):
    # Pop entries so the remaining dict can be passed to the model
    # constructor.
    policy = kwargs.pop('policy')
    n_timesteps = kwargs.pop('n_timesteps')
    noise_type = kwargs.pop('noise_type', None)
    save_frequency = kwargs.pop('save_freq', 10000)
    eval_frequency = kwargs.pop('eval_freq', 50000)
    eval_episodes = kwargs.pop('eval_episodes', 1000)
    continue_training = kwargs.pop('continue', False)

    # The noise objects - usually not necessary for SAC but can help for
    # hard exploration tasks.
    nb_actions = env.action_space.shape[-1]
    action_noise = None
    if noise_type:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if 'normal' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = NormalActionNoise(
                    mean=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            elif 'ou' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                action_noise = OrnsteinUhlenbeckActionNoise(
                    mean=np.zeros(nb_actions),
                    sigma=float(stddev) * np.ones(nb_actions))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    # Create learning rate schedule
    for key in ['learning_rate', 'learning_rate_pi', 'cliprange']:
        if key in kwargs:
            if isinstance(kwargs[key], str):
                schedule, initial_value = kwargs[key].split('_')
                initial_value = float(initial_value)
                kwargs[key] = linear_schedule(initial_value)
            elif isinstance(kwargs[key], float):
                kwargs[key] = constfn(kwargs[key])
            else:
                raise ValueError('Invalid value for {}: {}'.format(
                    key, kwargs[key]))

    if continue_training:
        # Resume from the most recently saved model.
        print("Loading pretrained agent")
        list_of_models = glob.glob(os.path.join(out_dir, '*.zip'))
        last_saved_model = max(list_of_models, key=os.path.getctime)
        model = SAC_residual.load(last_saved_model, env=env,
                                  tensorboard_log=os.path.join(out_dir, 'tb'),
                                  verbose=1, **kwargs)
        reset_num_timesteps = False
        if 'num_timesteps' in kwargs:
            model.num_timesteps = kwargs['num_timesteps']
            del kwargs['num_timesteps']
    else:
        # Create a fresh model.
        model = SAC(policy, env, action_noise=action_noise, seed=seed,
                    verbose=1, tensorboard_log=os.path.join(out_dir, 'tb'),
                    full_tensorboard_log=False, **kwargs)
        reset_num_timesteps = True

    # Start training.
    train_callback = get_train_callback(eval_env, seed, out_dir,
                                        save_f=save_frequency,
                                        eval_f=eval_frequency,
                                        eval_ep=eval_episodes)
    model.learn(total_timesteps=n_timesteps, callback=train_callback,
                log_interval=10, reset_num_timesteps=reset_num_timesteps)
    return model
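# get_train_callback is referenced above but not defined in this excerpt.
# A hedged sketch built from the standard stable-baselines callbacks,
# assuming it simply combines periodic checkpointing with periodic
# evaluation (the seed argument is accepted only for interface
# compatibility); the repo's real helper may differ:

from stable_baselines.common.callbacks import (
    CallbackList, CheckpointCallback, EvalCallback)


def get_train_callback(eval_env, seed, out_dir, save_f, eval_f, eval_ep):
    checkpoint_callback = CheckpointCallback(
        save_freq=save_f, save_path=out_dir, name_prefix='model')
    eval_callback = EvalCallback(
        eval_env, best_model_save_path=out_dir,
        eval_freq=eval_f, n_eval_episodes=eval_ep)
    return CallbackList([checkpoint_callback, eval_callback])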