class Q_Trainer(object): def __init__(self, params): self.params = params train_args = { "num_agent_train_steps_per_iter": params["num_agent_train_steps_per_iter"], "num_critic_updates_per_agent_update": params[ "num_critic_updates_per_agent_update" ], "train_batch_size": params["batch_size"], "double_q": params["double_q"], } env_args = get_env_kwargs(params["env_name"]) self.agent_params = {**train_args, **env_args, **params} self.params["agent_class"] = ExplorationOrExploitationAgent self.params["agent_params"] = self.agent_params self.params["train_batch_size"] = params["batch_size"] self.params["env_wrappers"] = self.agent_params["env_wrappers"] self.rl_trainer = RL_Trainer(self.params) def run_training_loop(self): self.rl_trainer.run_training_loop( self.agent_params["num_timesteps"], collect_policy=self.rl_trainer.agent.actor, eval_policy=self.rl_trainer.agent.actor, )
class AC_Trainer(object):
    def __init__(self, params):

        #####################
        ## SET AGENT PARAMS
        #####################

        computation_graph_args = {
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
            # TODO: what does num_target_updates mean?
            # (It sets how many times the critic's bootstrap targets are recomputed per critic
            # update; num_grad_steps_per_target_update gradient steps are taken per target set.)
            'num_target_updates': params['num_target_updates'],
            'num_grad_steps_per_target_update': params['num_grad_steps_per_target_update'],
        }

        estimate_advantage_args = {
            'gamma': params['discount'],
            'standardize_advantages': not (params['dont_standardize_advantages']),
        }

        train_args = {
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
            'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'],
            'num_actor_updates_per_agent_update': params['num_actor_updates_per_agent_update'],
        }

        agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args}

        self.params = params
        self.params['agent_class'] = ACAgent
        self.params['agent_params'] = agent_params
        self.params['batch_size_initial'] = self.params['batch_size']

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        self.rl_trainer.run_training_loop(
            self.params['n_iter'],
            collect_policy=self.rl_trainer.agent.actor,
            eval_policy=self.rl_trainer.agent.actor,
        )
class SAC_Trainer(object):
    def __init__(self, params):

        #####################
        ## SET AGENT PARAMS
        #####################

        computation_graph_args = {
            'n_layers': params['n_layers'],
            'size': params['size'],
            'num_target_updates': params['num_target_updates'],
        }

        sac_update_args = {
            'gamma': params['discount'],
            'polyak_tau': params['polyak_tau'],
            'learning_rate_valuefn': params['learning_rate_valuefn'],
            'learning_rate_policyfn': params['learning_rate_policyfn'],
            'learning_rate_alpha': params['learning_rate_alpha'],
        }

        train_args = {
            'exploration_steps': params['exploration_steps'],
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
            'l2_reg': params['l2_reg'],
            'learning_starts': params['learning_starts'],
            'learning_freq': params['learning_freq'],
            'target_update_freq': params['target_update_freq'],
        }

        agent_params = {**computation_graph_args, **sac_update_args, **train_args}

        self.params = params
        self.params['agent_class'] = SACAgent
        self.params['agent_params'] = agent_params
        self.params['batch_size_initial'] = self.params['batch_size']

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        self.rl_trainer.run_training_loop(
            self.params['total_timesteps'],
            collect_policy=self.rl_trainer.agent.actor,
            eval_policy=self.rl_trainer.agent.actor,
        )
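# `polyak_tau` above usually parameterizes a soft (Polyak-averaged) target-network update rather
# than a hard copy. A minimal PyTorch sketch, assuming `net` and `target_net` are hypothetical
# nn.Module instances with matching parameter shapes; note that conventions differ on whether
# tau is the fraction of the online weights mixed in (as here) or the fraction retained.
import torch


def polyak_update(net, target_net, tau):
    """Soft target update: target <- tau * online + (1 - tau) * target."""
    with torch.no_grad():
        for p, p_target in zip(net.parameters(), target_net.parameters()):
            p_target.mul_(1.0 - tau).add_(tau * p)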
class Q_Trainer(object):
    def __init__(self, params):
        self.params = params

        train_args = {
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
            'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'],
            'train_batch_size': params['batch_size'],
            'double_q': params['double_q'],
        }

        env_args = get_env_kwargs(params['env_name'])

        self.agent_params = {**train_args, **env_args, **params}

        self.params['agent_class'] = DQNAgent
        self.params['agent_params'] = self.agent_params
        self.params['train_batch_size'] = params['batch_size']
        self.params['env_wrappers'] = self.agent_params['env_wrappers']

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        self.rl_trainer.run_training_loop(
            self.agent_params['num_timesteps'],
            collect_policy=self.rl_trainer.agent.actor,
            eval_policy=self.rl_trainer.agent.actor,
        )
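# For context, trainer classes like the one above are normally driven by a small command-line
# entry point that turns parsed arguments into the `params` dict. The sketch below shows the
# general shape only; the argument names and defaults are illustrative assumptions, and the real
# scripts pass many more options (logging directory, seed, scalar-save frequency, and so on).
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1)
    parser.add_argument('--num_critic_updates_per_agent_update', type=int, default=1)
    parser.add_argument('--double_q', action='store_true')
    args = parser.parse_args()

    params = vars(args)  # the trainers above expect a plain dict of hyperparameters
    trainer = Q_Trainer(params)
    trainer.run_training_loop()


if __name__ == '__main__':
    main()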
class PG_Trainer(object):
    def __init__(self, params):

        #####################
        ## SET AGENT PARAMS
        #####################

        computation_graph_args = {
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
        }

        estimate_advantage_args = {
            'gamma': params['discount'],
            'standardize_advantages': params['standardize_advantages'],
            'reward_to_go': params['reward_to_go'],
            'nn_baseline': params['nn_baseline'],
            'gae': params['gae'],
            'gae_gamma': params['gae_gamma'],
            'gae_lambda': params['gae_lambda'],
        }

        train_args = {
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
        }

        agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args}

        self.params = params
        self.params['agent_class'] = PGAgent
        self.params['agent_params'] = agent_params
        self.params['batch_size_initial'] = self.params['batch_size']

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        self.rl_trainer.run_training_loop(
            self.params['n_iter'],
            collect_policy=self.rl_trainer.agent.actor,
            eval_policy=self.rl_trainer.agent.actor,
        )
        if self.params['render_after_training'] == 1:
            self.rl_trainer.eval_render(self.rl_trainer.agent.actor)

    def load_trained_agent_render(self):
        self.rl_trainer.agent.actor.restore(
            '/home/kim/cs285_ws/homework_fall2019/hw2/cs285/data/pg_todo_CartPole-v0_15-01-2020_15-42-29/policy_itr_99'
        )
        self.rl_trainer.eval_render(self.rl_trainer.agent.actor)
class PG_Trainer(object):
    def __init__(self, params):

        #####################
        ## SET AGENT PARAMS
        #####################

        computation_graph_args = {
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
        }

        estimate_advantage_args = {
            'gamma': params['discount'],
            'standardize_advantages': not (params['dont_standardize_advantages']),
            'reward_to_go': params['reward_to_go'],
            'nn_baseline': params['nn_baseline'],
            'generalized_advantage_estimation': params['generalized_advantage_estimation'],
            'gae_lambda': params['gae_lambda'],
        }

        train_args = {
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
        }

        agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args}

        self.params = params
        self.params['agent_class'] = PGAgent
        self.params['agent_params'] = agent_params
        self.params['batch_size_initial'] = self.params['batch_size']

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        self.rl_trainer.run_training_loop(
            self.params['n_iter'],
            collect_policy=self.rl_trainer.agent.actor,
            eval_policy=self.rl_trainer.agent.actor,
        )
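# The `generalized_advantage_estimation` / `gae_lambda` options above refer to Generalized
# Advantage Estimation (GAE): A_t = sum_l (gamma * lambda)^l * delta_{t+l}, with
# delta_t = r_t + gamma * V(s_{t+1}) - V(s_t). A minimal NumPy sketch of the backward recursion
# over a single trajectory; `rewards` and `values` are illustrative names, terminal-state
# masking is omitted for brevity, and `values` is assumed to hold T + 1 entries (the last one
# bootstraps the final state).
import numpy as np


def compute_gae(rewards, values, gamma, lam):
    """Backward recursion: gae_t = delta_t + gamma * lam * gae_{t+1}."""
    T = len(rewards)
    advantages = np.zeros(T)
    gae = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lam * gae
        advantages[t] = gae
    return advantages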
class PPO_Trainer(object):
    def __init__(self, params):

        #####################
        ## SET AGENT PARAMS
        #####################

        computation_graph_args = {
            'n_layers': params['n_layers'],
            'size': params['size'],
        }

        estimate_advantage_args = {
            'gamma': params['discount'],
            'use_gae': params['use_gae'],
            'gae_lam': params['gae_lam'],
            'standardize_advantages': not (params['dont_standardize_advantages']),
        }

        ppo_update_args = {
            'clip_eps': params['clip_epsilon'],
            'ent_coeff': params['ent_coeff'],
            'max_grad_norm': params['max_grad_norm'],
            'ppo_epochs': params['ppo_epochs'],
            'ppo_min_batch_size': params['ppo_min_batch_size'],
        }

        train_args = {
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
            'l2_reg': params['l2_reg'],
            'learning_rate_valuefn': params['learning_rate_valuefn'],
            'learning_rate_policyfn': params['learning_rate_policyfn'],
            'num_target_updates': params['num_target_updates'],
        }

        agent_params = {**computation_graph_args, **estimate_advantage_args, **ppo_update_args, **train_args}

        self.params = params
        self.params['agent_class'] = PPOAgent
        self.params['agent_params'] = agent_params
        self.params['batch_size_initial'] = self.params['batch_size']

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        self.rl_trainer.run_training_loop(
            self.params['n_iter'],
            collect_policy=self.rl_trainer.agent.actor,
            eval_policy=self.rl_trainer.agent.actor,
        )
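# `clip_eps` above is the epsilon in PPO's clipped surrogate objective,
# L = -E[min(r_t(theta) * A_t, clip(r_t(theta), 1 - eps, 1 + eps) * A_t)]. A minimal PyTorch
# sketch of that loss; the tensor names (new_log_probs, old_log_probs, advantages) are
# illustrative assumptions, and the entropy and value terms weighted by `ent_coeff` etc. are
# left out.
import torch


def ppo_clipped_loss(new_log_probs, old_log_probs, advantages, clip_eps):
    """Negative clipped surrogate objective (to be minimized)."""
    ratio = torch.exp(new_log_probs - old_log_probs)  # r_t(theta)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    return -torch.min(unclipped, clipped).mean()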
def train_AC(params):
    computation_graph_args = {
        'n_layers': params['n_layers'],
        'size': params['size'],
        'device': params['device'],
        'learning_rate': params['learning_rate'],
        'num_target_updates': params['num_target_updates'],
        'num_grad_steps_per_target_update': params['num_grad_steps_per_target_update'],
    }

    train_args = {
        'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
        'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'],
        'num_actor_updates_per_agent_update': params['num_actor_updates_per_agent_update'],
        'gamma': params['discount'],
        'standardize_advantages': not (params['dont_standardize_advantages']),
    }

    exploration_args = {
        'density_model': params['density_model'],
        'bonus_coeff': params['bonus_coeff'],
        'kl_weight': params['kl_weight'],
        'density_lr': params['density_lr'],
        'density_train_iters': params['density_train_iters'],
        'density_batch_size': params['density_batch_size'],
        'density_hiddim': params['density_hiddim'],
        'replay_size': params['replay_size'],
        'sigma': params['sigma'],
    }

    params['agent_params'] = {**computation_graph_args, **train_args, **exploration_args}
    params['agent_class'] = Exploratory_ACAgent

    rl_trainer = RL_Trainer(params)
    rl_trainer.run_training_loop(params['n_iter'], policy=rl_trainer.agent.actor)
class BC_Trainer(object):
    def __init__(self, params):

        #######################
        ## AGENT PARAMS
        #######################

        agent_params = {
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
            'max_replay_buffer_size': params['max_replay_buffer_size'],
        }

        self.params = params
        self.params['agent_class'] = BCAgent  ## TODO: look in here and implement this
        self.params['agent_params'] = agent_params

        ################
        ## RL TRAINER
        ################

        import roboschool
        self.rl_trainer = RL_Trainer(self.params)  ## TODO: look in here and implement this

        #######################
        ## LOAD EXPERT POLICY
        #######################

        print('Loading expert policy from...', self.params['expert_policy_file'])
        # self.loaded_expert_policy = Loaded_Gaussian_Policy(self.rl_trainer.sess, self.params['expert_policy_file'])
        # Strip the '.py' extension explicitly; str.rstrip('.py') removes trailing *characters*,
        # not a suffix, and would also eat a trailing 'p', 'y', or '.' from the module name.
        module_name = self.params['expert_policy_file'].replace('/', '.')
        if module_name.endswith('.py'):
            module_name = module_name[:-3]
        policy_module = importlib.import_module(module_name)
        _, policy = policy_module.get_env_and_policy()
        self.loaded_expert_policy = PolicyWrapper(policy)
        print('Done restoring expert policy...')

    def run_training_loop(self):
        self.rl_trainer.run_training_loop(
            n_iter=self.params['n_iter'],
            initial_expertdata=self.params['expert_data'],
            collect_policy=self.rl_trainer.agent.actor,
            eval_policy=self.rl_trainer.agent.actor,
            relabel_with_expert=self.params['do_dagger'],
            expert_policy=self.loaded_expert_policy,
        )
class PG_Trainer(object): def __init__(self, params): ##################### ## SET AGENT PARAMS ##################### computation_graph_args = { "n_layers": params["n_layers"], "size": params["size"], "learning_rate": params["learning_rate"], } estimate_advantage_args = { "gamma": params["discount"], "standardize_advantages": not (params["dont_standardize_advantages"]), "reward_to_go": params["reward_to_go"], "nn_baseline": params["nn_baseline"], } train_args = { "num_agent_train_steps_per_iter": params["num_agent_train_steps_per_iter"], } agent_params = { **computation_graph_args, **estimate_advantage_args, **train_args, } self.params = params self.params["agent_class"] = PGAgent self.params["agent_params"] = agent_params self.params["batch_size_initial"] = self.params["batch_size"] ################ ## RL TRAINER ################ self.rl_trainer = RL_Trainer(self.params) def run_training_loop(self): self.rl_trainer.run_training_loop( self.params["n_iter"], collect_policy=self.rl_trainer.agent.actor, eval_policy=self.rl_trainer.agent.actor, )
class BC_Trainer(object):
    def __init__(self, params):

        #######################
        ## AGENT PARAMS
        #######################

        agent_params = {
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
            'max_replay_buffer_size': params['max_replay_buffer_size'],
        }

        self.params = params
        self.params['agent_class'] = BCAgent  ## TODO: look in here and implement this
        self.params['agent_params'] = agent_params

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)  ## TODO: look in here and implement this

        #######################
        ## LOAD EXPERT POLICY
        #######################

        # Both the agent and the expert are Gaussian policies, since the action space is
        # continuous (a multivariate Gaussian with independent dimensions).
        print('Loading expert policy from...', self.params['expert_policy_file'])
        self.loaded_expert_policy = Loaded_Gaussian_Policy(
            self.rl_trainer.sess, self.params['expert_policy_file'])
        # Shares the same session, so both computation graphs are defined together.
        print('Done restoring expert policy...')

    def run_training_loop(self):
        self.rl_trainer.run_training_loop(
            n_iter=self.params['n_iter'],
            initial_expertdata=self.params['expert_data'],
            collect_policy=self.rl_trainer.agent.actor,
            eval_policy=self.rl_trainer.agent.actor,
            relabel_with_expert=self.params['do_dagger'],
            expert_policy=self.loaded_expert_policy,
        )
class MB_Trainer(object):
    def __init__(self, params):

        #####################
        ## SET AGENT PARAMS
        #####################

        computation_graph_args = {
            'ensemble_size': params['ensemble_size'],
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
            'device': params['device'],
        }

        train_args = {
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
        }

        controller_args = {
            'mpc_horizon': params['mpc_horizon'],
            'mpc_num_action_sequences': params['mpc_num_action_sequences'],
        }

        agent_params = {**computation_graph_args, **train_args, **controller_args}

        self.params = params
        self.params['agent_class'] = MBAgent
        self.params['agent_params'] = agent_params

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        self.rl_trainer.run_training_loop(
            self.params['n_iter'],
            collect_policy=self.rl_trainer.agent.actor,
            eval_policy=self.rl_trainer.agent.actor,
        )
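# `mpc_horizon` and `mpc_num_action_sequences` above parameterize the MPC controller inside the
# model-based agent. A minimal random-shooting sketch of how such a controller typically picks an
# action; `model.predict(obs_batch, ac_batch) -> (next_obs_batch, reward_batch)` and the gym-style
# `action_space.sample()` interface are illustrative assumptions, not the agent's actual API.
import numpy as np


def mpc_random_shooting(obs, model, action_space, horizon, num_sequences):
    """Sample random action sequences, roll them through the learned model,
    and return the first action of the highest-return sequence."""
    candidates = np.stack([
        np.stack([action_space.sample() for _ in range(horizon)])
        for _ in range(num_sequences)
    ])  # shape: (num_sequences, horizon, ac_dim)
    returns = np.zeros(num_sequences)
    sim_obs = np.repeat(obs[None], num_sequences, axis=0)
    for t in range(horizon):
        sim_obs, rewards = model.predict(sim_obs, candidates[:, t])
        returns += rewards
    return candidates[np.argmax(returns), 0]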
class BC_Trainer(object):
    def __init__(self, params):

        #######################
        # AGENT PARAMS
        #######################

        agent_params = {
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
            'max_replay_buffer_size': params['max_replay_buffer_size'],
        }

        self.params = params
        self.params['agent_class'] = BCAgent  # HW1: you will modify this
        self.params['agent_params'] = agent_params

        ################
        # RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)  # HW1: you will modify this

        #######################
        # LOAD EXPERT POLICY
        #######################

        print('Loading expert policy from...', self.params['expert_policy_file'])
        self.loaded_expert_policy = LoadedGaussianPolicy(
            self.params['expert_policy_file'])
        print('Done restoring expert policy...')

    def run_training_loop(self):
        self.rl_trainer.run_training_loop(
            n_iter=self.params['n_iter'],
            initial_expertdata=self.params['expert_data'],
            collect_policy=self.rl_trainer.agent.actor,
            eval_policy=self.rl_trainer.agent.actor,
            relabel_with_expert=self.params['do_dagger'],
            expert_policy=self.loaded_expert_policy,
        )
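# In the BC trainers above, `do_dagger` is what turns plain behavior cloning into DAgger: when it
# is set, newly collected observations are relabeled with the expert policy's actions before
# training. A hedged usage sketch; every value below is illustrative only, and RL_Trainer will
# expect additional logging / sampling options not shown here.
params = {
    'expert_policy_file': 'cs285/policies/experts/Ant.pkl',     # illustrative path
    'expert_data': 'cs285/expert_data/expert_data_Ant-v2.pkl',  # illustrative path
    'env_name': 'Ant-v2',
    'n_iter': 1,               # n_iter > 1 is only useful with DAgger
    'do_dagger': False,        # True -> relabel collected states with the expert
    'n_layers': 2,
    'size': 64,
    'learning_rate': 5e-3,
    'max_replay_buffer_size': 1000000,
}
trainer = BC_Trainer(params)
trainer.run_training_loop()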
class BC_Trainer(object): def __init__(self, params): ####################### ## AGENT PARAMS ####################### agent_params = { "n_layers": params["n_layers"], "size": params["size"], "learning_rate": params["learning_rate"], "max_replay_buffer_size": params["max_replay_buffer_size"], } self.params = params self.params["agent_class"] = BCAgent ## HW1: you will modify this self.params["agent_params"] = agent_params ################ ## RL TRAINER ################ self.rl_trainer = RL_Trainer(self.params) ## HW1: you will modify this ####################### ## LOAD EXPERT POLICY ####################### print("Loading expert policy from...", self.params["expert_policy_file"]) self.loaded_expert_policy = LoadedGaussianPolicy( self.params["expert_policy_file"] ) print("Done restoring expert policy...") def run_training_loop(self): self.rl_trainer.run_training_loop( n_iter=self.params["n_iter"], initial_expertdata=self.params["expert_data"], collect_policy=self.rl_trainer.agent.actor, eval_policy=self.rl_trainer.agent.actor, relabel_with_expert=self.params["do_dagger"], expert_policy=self.loaded_expert_policy, )
class BC_Trainer(object):
    def __init__(self, params, logger=None):

        #######################
        ## AGENT PARAMS
        #######################

        agent_params = {
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
            'max_replay_buffer_size': params['max_replay_buffer_size'],
        }

        self.params = params
        self.params['agent_class'] = BCAgent  ## TODO: look in here and implement this
        self.params['agent_params'] = agent_params

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params, logger)  ## TODO: look in here and implement this

        #######################
        ## LOAD EXPERT POLICY
        #######################

        print('Loading expert policy from...', self.params['expert_policy_file'])
        self.loaded_expert_policy = Loaded_Gaussian_Policy(
            self.rl_trainer.sess, self.params['expert_policy_file'])
        print('Done restoring expert policy...')

    def run_training_loop(self):
        return self.rl_trainer.run_training_loop(
            n_iter=self.params['n_iter'],
            initial_expertdata=self.params['expert_data'],
            collect_policy=self.rl_trainer.agent.actor,
            eval_policy=self.rl_trainer.agent.actor,
            relabel_with_expert=self.params['do_dagger'],
            expert_policy=self.loaded_expert_policy,
        )

    def run_logging_loop(self, itr):
        return self.rl_trainer.run_logging_loop(
            n_iter=itr,
            initial_expertdata=self.params['expert_data'],
            collect_policy=self.rl_trainer.agent.actor,
            eval_policy=self.rl_trainer.agent.actor,
        )
class BC_Trainer(object):
    def __init__(self, params):

        #######################
        ## AGENT PARAMS
        #######################

        agent_params = {
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
            'max_replay_buffer_size': params['max_replay_buffer_size'],
            'siren': params['siren'],
            'train_separate_params': params['train_separate_params'],
            'supervision_mode': params['supervision_mode'],
            'offset_learning_rate': params['offset_learning_rate'],
            'epsilon_s': params['epsilon_s'],
            'auto_cast': params['auto_cast'],
            'gradient_loss_scale': params['gradient_loss_scale'],
            'additional_activation': params['additional_activation'],
            'omega': params['omega'],
        }

        self.params = params
        self.params['agent_class'] = BCAgent  ## HW1: you will modify this
        self.params['agent_params'] = agent_params

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)  ## HW1: you will modify this

        #######################
        ## LOAD EXPERT POLICY
        #######################

        print('Loading expert policy from...', self.params['expert_policy_file'])
        self.loaded_expert_policy = LoadedGaussianPolicy(
            self.params['expert_policy_file'])
        print('Done restoring expert policy...')

    def run_training_loop(self):
        return self.rl_trainer.run_training_loop(
            n_iter=self.params['n_iter'],
            initial_expertdata=self.params['expert_data'],
            collect_policy=self.rl_trainer.agent.actor,
            eval_policy=self.rl_trainer.agent.actor,
            relabel_with_expert=self.params['do_dagger'],
            expert_policy=self.loaded_expert_policy,
        )
class TRPO_Trainer(object):
    def __init__(self, params):

        #####################
        ## SET AGENT PARAMS
        #####################

        computation_graph_args = {
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
            'num_target_updates': params['num_target_updates'],
        }

        estimate_advantage_args = {
            'gamma': params['discount'],
            'use_gae': params['use_gae'],
            'gae_lam': params['gae_lam'],
            'standardize_advantages': not (params['dont_standardize_advantages']),
        }

        trpo_update_args = {
            'cg_steps': params['cg_steps'],
            'damping': params['damping'],
            'max_kl_increment': params['max_kl_increment'],
            'max_backtracks': params['max_backtracks'],
        }

        train_args = {
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
            'l2_reg': params['l2_reg'],
        }

        agent_params = {**computation_graph_args, **estimate_advantage_args, **trpo_update_args, **train_args}

        self.params = params
        self.params['agent_class'] = TRPOAgent
        self.params['agent_params'] = agent_params
        self.params['batch_size_initial'] = self.params['batch_size']

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        self.rl_trainer.run_training_loop(
            self.params['n_iter'],
            collect_policy=self.rl_trainer.agent.actor,
            eval_policy=self.rl_trainer.agent.actor,
        )