# Example no. 1
class Q_Trainer(object):
    """Wraps RL_Trainer with an ExplorationOrExploitationAgent configuration."""

    def __init__(self, params):
        self.params = params

        # Training-schedule hyperparameters forwarded to the agent.
        train_args = {
            "num_agent_train_steps_per_iter": params["num_agent_train_steps_per_iter"],
            "num_critic_updates_per_agent_update": params["num_critic_updates_per_agent_update"],
            "train_batch_size": params["batch_size"],
            "double_q": params["double_q"],
        }

        # Merge order matters: later dicts win, so entries in `params`
        # override the env defaults, which override `train_args`.
        self.agent_params = {**train_args, **get_env_kwargs(params["env_name"]), **params}

        self.params["agent_class"] = ExplorationOrExploitationAgent
        self.params["agent_params"] = self.agent_params
        self.params["train_batch_size"] = params["batch_size"]
        self.params["env_wrappers"] = self.agent_params["env_wrappers"]

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        # The same actor both collects data and is evaluated.
        actor = self.rl_trainer.agent.actor
        self.rl_trainer.run_training_loop(
            self.agent_params["num_timesteps"],
            collect_policy=actor,
            eval_policy=actor,
        )
# Example no. 2
class AC_Trainer(object):
    """Configures an ACAgent and delegates training to RL_Trainer."""

    def __init__(self, params):

        #####################
        ## SET AGENT PARAMS
        #####################

        # Network architecture / optimization settings.
        computation_graph_args = {
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
            # TODO: clarify what num_target_updates means
            'num_target_updates': params['num_target_updates'],
            'num_grad_steps_per_target_update': params['num_grad_steps_per_target_update'],
        }

        # Advantage-estimation settings.
        estimate_advantage_args = {
            'gamma': params['discount'],
            'standardize_advantages': not params['dont_standardize_advantages'],
        }

        # Per-iteration update counts.
        train_args = {
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
            'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'],
            'num_actor_updates_per_agent_update': params['num_actor_updates_per_agent_update'],
        }

        self.params = params
        self.params['agent_class'] = ACAgent
        self.params['agent_params'] = {
            **computation_graph_args,
            **estimate_advantage_args,
            **train_args,
        }
        self.params['batch_size_initial'] = self.params['batch_size']

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        # The same actor both collects data and is evaluated.
        actor = self.rl_trainer.agent.actor
        self.rl_trainer.run_training_loop(
            self.params['n_iter'],
            collect_policy=actor,
            eval_policy=actor,
        )
# Example no. 3
class SAC_Trainer(object):
    """Builds SACAgent hyperparameters and hands control to RL_Trainer."""

    def __init__(self, params):

        #####################
        ## SET AGENT PARAMS
        #####################

        # Network-architecture settings.
        graph_args = {
            'n_layers': params['n_layers'],
            'size': params['size'],
            'num_target_updates': params['num_target_updates'],
        }

        # Soft actor-critic update settings.
        sac_update_args = {
            'gamma': params['discount'],
            'polyak_tau': params['polyak_tau'],
            'learning_rate_valuefn': params['learning_rate_valuefn'],
            'learning_rate_policyfn': params['learning_rate_policyfn'],
            'learning_rate_alpha': params['learning_rate_alpha'],
        }

        # Training-schedule settings.
        train_args = {
            'exploration_steps': params['exploration_steps'],
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
            'l2_reg': params['l2_reg'],
            'learning_starts': params['learning_starts'],
            'learning_freq': params['learning_freq'],
            'target_update_freq': params['target_update_freq'],
        }

        self.params = params
        self.params['agent_class'] = SACAgent
        self.params['agent_params'] = {**graph_args, **sac_update_args, **train_args}
        self.params['batch_size_initial'] = self.params['batch_size']

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        # The same actor both collects data and is evaluated.
        actor = self.rl_trainer.agent.actor
        self.rl_trainer.run_training_loop(
            self.params['total_timesteps'],
            collect_policy=actor,
            eval_policy=actor,
        )
# Example no. 4
class Q_Trainer(object):
    """Configures a DQNAgent and delegates training to RL_Trainer."""

    def __init__(self, params):
        self.params = params

        # Training-schedule hyperparameters forwarded to the agent.
        train_args = {
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
            'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'],
            'train_batch_size': params['batch_size'],
            'double_q': params['double_q'],
        }

        # Merge order matters: later dicts win, so entries in `params`
        # override the env defaults, which override `train_args`.
        self.agent_params = {**train_args, **get_env_kwargs(params['env_name']), **params}

        self.params['agent_class'] = DQNAgent
        self.params['agent_params'] = self.agent_params
        self.params['train_batch_size'] = params['batch_size']
        self.params['env_wrappers'] = self.agent_params['env_wrappers']

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        # The same actor both collects data and is evaluated.
        actor = self.rl_trainer.agent.actor
        self.rl_trainer.run_training_loop(
            self.agent_params['num_timesteps'],
            collect_policy=actor,
            eval_policy=actor,
        )
class PG_Trainer(object):
    """Configures a PGAgent, trains it via RL_Trainer, and can render a
    trained policy from a saved checkpoint."""

    # Default checkpoint for load_trained_agent_render(); kept identical to
    # the previously hard-coded path for backward compatibility.
    DEFAULT_POLICY_PATH = (
        '/home/kim/cs285_ws/homework_fall2019/hw2/cs285/data/'
        'pg_todo_CartPole-v0_15-01-2020_15-42-29/policy_itr_99'
    )

    def __init__(self, params):

        #####################
        ## SET AGENT PARAMS
        #####################

        # Policy-network architecture / optimizer settings.
        computation_graph_args = {
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
        }

        # Advantage-estimation settings (reward-to-go, baseline, GAE).
        estimate_advantage_args = {
            'gamma': params['discount'],
            'standardize_advantages': params['standardize_advantages'],
            'reward_to_go': params['reward_to_go'],
            'nn_baseline': params['nn_baseline'],
            'gae': params['gae'],
            'gae_gamma': params['gae_gamma'],
            'gae_lambda': params['gae_lambda'],
        }

        # Per-iteration training schedule.
        train_args = {
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
        }

        agent_params = {
            **computation_graph_args,
            **estimate_advantage_args,
            **train_args,
        }

        self.params = params
        self.params['agent_class'] = PGAgent
        self.params['agent_params'] = agent_params
        self.params['batch_size_initial'] = self.params['batch_size']

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        """Train the agent, then optionally render if requested via params."""
        self.rl_trainer.run_training_loop(
            self.params['n_iter'],
            collect_policy=self.rl_trainer.agent.actor,
            eval_policy=self.rl_trainer.agent.actor,
        )
        if self.params['render_after_training'] == 1:
            self.rl_trainer.eval_render(self.rl_trainer.agent.actor)

    def load_trained_agent_render(self, policy_path=DEFAULT_POLICY_PATH):
        """Restore actor weights from ``policy_path`` and render rollouts.

        Args:
            policy_path: checkpoint to restore.  Defaults to the original
                hard-coded path so existing no-argument callers keep working;
                new callers can point at any saved policy.
        """
        self.rl_trainer.agent.actor.restore(policy_path)
        self.rl_trainer.eval_render(self.rl_trainer.agent.actor)
# Example no. 6
class PG_Trainer(object):
    """Configures a PGAgent (policy gradient) and trains via RL_Trainer."""

    def __init__(self, params):

        #####################
        ## SET AGENT PARAMS
        #####################

        # Policy-network architecture / optimizer settings.
        graph_args = {
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
        }

        # Advantage-estimation settings.
        advantage_args = {
            'gamma': params['discount'],
            'standardize_advantages': not params['dont_standardize_advantages'],
            'reward_to_go': params['reward_to_go'],
            'nn_baseline': params['nn_baseline'],
            'generalized_advantage_estimation': params['generalized_advantage_estimation'],
            'gae_lambda': params['gae_lambda'],
        }

        # Per-iteration training schedule.
        train_args = {
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
        }

        self.params = params
        self.params['agent_class'] = PGAgent
        self.params['agent_params'] = {**graph_args, **advantage_args, **train_args}
        self.params['batch_size_initial'] = self.params['batch_size']

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        # The same actor both collects data and is evaluated.
        actor = self.rl_trainer.agent.actor
        self.rl_trainer.run_training_loop(
            self.params['n_iter'],
            collect_policy=actor,
            eval_policy=actor,
        )
# Example no. 7
class PPO_Trainer(object):
    """Configures a PPOAgent and runs it through RL_Trainer."""

    def __init__(self, params):

        #####################
        ## SET AGENT PARAMS
        #####################

        # Policy/value network architecture.
        graph_args = {
            'n_layers': params['n_layers'],
            'size': params['size'],
        }

        # Advantage-estimation settings (optionally GAE).
        advantage_args = {
            'gamma': params['discount'],
            'use_gae': params['use_gae'],
            'gae_lam': params['gae_lam'],
            'standardize_advantages': not params['dont_standardize_advantages'],
        }

        # PPO clipped-objective settings.
        ppo_update_args = {
            'clip_eps': params['clip_epsilon'],
            'ent_coeff': params['ent_coeff'],
            'max_grad_norm': params['max_grad_norm'],
            'ppo_epochs': params['ppo_epochs'],
            'ppo_min_batch_size': params['ppo_min_batch_size'],
        }

        # Optimization schedule.
        train_args = {
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
            'l2_reg': params['l2_reg'],
            'learning_rate_valuefn': params['learning_rate_valuefn'],
            'learning_rate_policyfn': params['learning_rate_policyfn'],
            'num_target_updates': params['num_target_updates'],
        }

        self.params = params
        self.params['agent_class'] = PPOAgent
        self.params['agent_params'] = {
            **graph_args,
            **advantage_args,
            **ppo_update_args,
            **train_args,
        }
        self.params['batch_size_initial'] = self.params['batch_size']

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        # The same actor both collects data and is evaluated.
        actor = self.rl_trainer.agent.actor
        self.rl_trainer.run_training_loop(
            self.params['n_iter'],
            collect_policy=actor,
            eval_policy=actor,
        )
def train_AC(params):
    """Assemble hyperparameters for an Exploratory_ACAgent and train it."""

    # Network / critic-update configuration.
    computation_graph_args = {
        'n_layers': params['n_layers'],
        'size': params['size'],
        'device': params['device'],
        'learning_rate': params['learning_rate'],
        'num_target_updates': params['num_target_updates'],
        'num_grad_steps_per_target_update': params['num_grad_steps_per_target_update'],
    }

    # Per-iteration update schedule and advantage settings.
    train_args = {
        'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
        'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'],
        'num_actor_updates_per_agent_update': params['num_actor_updates_per_agent_update'],
        'gamma': params['discount'],
        'standardize_advantages': not params['dont_standardize_advantages'],
    }

    # Exploration-bonus (density model) configuration, copied straight
    # through from `params` under the same key names.
    exploration_keys = (
        'density_model', 'bonus_coeff', 'kl_weight', 'density_lr',
        'density_train_iters', 'density_batch_size', 'density_hiddim',
        'replay_size', 'sigma',
    )
    exploration_args = {key: params[key] for key in exploration_keys}

    params['agent_params'] = {
        **computation_graph_args,
        **train_args,
        **exploration_args,
    }
    params['agent_class'] = Exploratory_ACAgent

    trainer = RL_Trainer(params)
    trainer.run_training_loop(params['n_iter'], policy=trainer.agent.actor)
# Example no. 9
class BC_Trainer(object):
    """Behavior-cloning trainer: builds a BCAgent, an RL_Trainer, and loads
    the expert policy used for DAgger relabeling."""

    def __init__(self, params):

        #######################
        ## AGENT PARAMS
        #######################

        agent_params = {
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
            'max_replay_buffer_size': params['max_replay_buffer_size'],
        }

        self.params = params
        self.params['agent_class'] = BCAgent  ## TODO: look in here and implement this
        self.params['agent_params'] = agent_params

        ################
        ## RL TRAINER
        ################
        import roboschool  # side-effect import: registers roboschool gym envs

        self.rl_trainer = RL_Trainer(self.params)  ## TODO: look in here and implement this

        #######################
        ## LOAD EXPERT POLICY
        #######################

        print('Loading expert policy from...', self.params['expert_policy_file'])
        # Convert a file path like "experts/my_policy.py" into a dotted module
        # name for importlib.
        # BUG FIX: the original used .rstrip('.py'), which strips ANY trailing
        # run of '.', 'p', or 'y' characters (not the suffix), corrupting
        # module names ending in those letters (e.g. "my_policy" -> "my_polic").
        # Strip the extension explicitly instead.
        expert_file = self.params['expert_policy_file']
        if expert_file.endswith('.py'):
            expert_file = expert_file[:-len('.py')]
        module_name = expert_file.replace('/', '.')
        policy_module = importlib.import_module(module_name)
        _, policy = policy_module.get_env_and_policy()
        self.loaded_expert_policy = PolicyWrapper(policy)
        print('Done restoring expert policy...')

    def run_training_loop(self):
        """Run BC/DAgger training; the agent's actor both collects data and
        is evaluated."""
        self.rl_trainer.run_training_loop(
            n_iter=self.params['n_iter'],
            initial_expertdata=self.params['expert_data'],
            collect_policy=self.rl_trainer.agent.actor,
            eval_policy=self.rl_trainer.agent.actor,
            relabel_with_expert=self.params['do_dagger'],
            expert_policy=self.loaded_expert_policy,
        )
# Example no. 10
class PG_Trainer(object):
    """Configures a PGAgent and trains it via RL_Trainer."""

    def __init__(self, params):

        #####################
        ## SET AGENT PARAMS
        #####################

        # Policy-network architecture / optimizer settings.
        graph_args = {
            "n_layers": params["n_layers"],
            "size": params["size"],
            "learning_rate": params["learning_rate"],
        }

        # Advantage-estimation settings.
        advantage_args = {
            "gamma": params["discount"],
            "standardize_advantages": not params["dont_standardize_advantages"],
            "reward_to_go": params["reward_to_go"],
            "nn_baseline": params["nn_baseline"],
        }

        # Per-iteration training schedule.
        train_args = {
            "num_agent_train_steps_per_iter": params["num_agent_train_steps_per_iter"],
        }

        self.params = params
        self.params["agent_class"] = PGAgent
        self.params["agent_params"] = {**graph_args, **advantage_args, **train_args}
        self.params["batch_size_initial"] = self.params["batch_size"]

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        # The same actor both collects data and is evaluated.
        actor = self.rl_trainer.agent.actor
        self.rl_trainer.run_training_loop(
            self.params["n_iter"],
            collect_policy=actor,
            eval_policy=actor,
        )
class BC_Trainer(object):
    """Behavior-cloning trainer: wires up a BCAgent, an RL_Trainer, and the
    expert policy used for DAgger relabeling."""

    def __init__(self, params):

        #######################
        ## AGENT PARAMS
        #######################

        agent_params = {
            key: params[key]
            for key in ('n_layers', 'size', 'learning_rate', 'max_replay_buffer_size')
        }

        self.params = params
        self.params['agent_class'] = BCAgent  ## TODO: look in here and implement this
        self.params['agent_params'] = agent_params

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)  ## TODO: look in here and implement this

        #######################
        ## LOAD EXPERT POLICY
        #######################

        # Both the agent and the expert are Gaussian policies, since the
        # action space is continuous (independent multivariate Gaussians).
        print('Loading expert policy from...',
              self.params['expert_policy_file'])
        # The expert graph shares the trainer's session, so both graphs are
        # defined together in one session.
        self.loaded_expert_policy = Loaded_Gaussian_Policy(
            self.rl_trainer.sess,
            self.params['expert_policy_file'])
        print('Done restoring expert policy...')

    def run_training_loop(self):
        # The same actor both collects data and is evaluated.
        actor = self.rl_trainer.agent.actor
        self.rl_trainer.run_training_loop(
            n_iter=self.params['n_iter'],
            initial_expertdata=self.params['expert_data'],
            collect_policy=actor,
            eval_policy=actor,
            relabel_with_expert=self.params['do_dagger'],
            expert_policy=self.loaded_expert_policy,
        )
class MB_Trainer(object):
    """Model-based trainer: configures an MBAgent and runs RL_Trainer."""

    def __init__(self, params):

        #####################
        ## SET AGENT PARAMS
        #####################

        # Dynamics-model ensemble / network configuration.
        model_args = {
            'ensemble_size': params['ensemble_size'],
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
            'device': params['device'],
        }

        # Per-iteration training schedule.
        train_args = {
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
        }

        # MPC controller configuration.
        controller_args = {
            'mpc_horizon': params['mpc_horizon'],
            'mpc_num_action_sequences': params['mpc_num_action_sequences'],
        }

        self.params = params
        self.params['agent_class'] = MBAgent
        self.params['agent_params'] = {**model_args, **train_args, **controller_args}

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        # The same actor both collects data and is evaluated.
        actor = self.rl_trainer.agent.actor
        self.rl_trainer.run_training_loop(
            self.params['n_iter'],
            collect_policy=actor,
            eval_policy=actor,
        )
# Example no. 13
class BC_Trainer(object):
    """Behavior-cloning trainer: builds a BCAgent, an RL_Trainer, and loads
    the expert policy used for DAgger relabeling."""

    def __init__(self, params):

        #######################
        # AGENT PARAMS
        #######################

        agent_params = {
            key: params[key]
            for key in ('n_layers', 'size', 'learning_rate', 'max_replay_buffer_size')
        }

        self.params = params
        self.params['agent_class'] = BCAgent  # HW1: you will modify this
        self.params['agent_params'] = agent_params

        ################
        # RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)  # HW1: you will modify this

        #######################
        # LOAD EXPERT POLICY
        #######################

        print('Loading expert policy from...',
              self.params['expert_policy_file'])
        self.loaded_expert_policy = LoadedGaussianPolicy(
            self.params['expert_policy_file'])
        print('Done restoring expert policy...')

    def run_training_loop(self):
        # The same actor both collects data and is evaluated.
        actor = self.rl_trainer.agent.actor
        self.rl_trainer.run_training_loop(
            n_iter=self.params['n_iter'],
            initial_expertdata=self.params['expert_data'],
            collect_policy=actor,
            eval_policy=actor,
            relabel_with_expert=self.params['do_dagger'],
            expert_policy=self.loaded_expert_policy,
        )
# Example no. 14
class BC_Trainer(object):
    """Behavior-cloning trainer: builds a BCAgent, an RL_Trainer, and loads
    the expert policy used for DAgger relabeling."""

    def __init__(self, params):

        #######################
        ## AGENT PARAMS
        #######################

        agent_params = {
            key: params[key]
            for key in ("n_layers", "size", "learning_rate", "max_replay_buffer_size")
        }

        self.params = params
        self.params["agent_class"] = BCAgent  ## HW1: you will modify this
        self.params["agent_params"] = agent_params

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)  ## HW1: you will modify this

        #######################
        ## LOAD EXPERT POLICY
        #######################

        print("Loading expert policy from...", self.params["expert_policy_file"])
        self.loaded_expert_policy = LoadedGaussianPolicy(self.params["expert_policy_file"])
        print("Done restoring expert policy...")

    def run_training_loop(self):
        # The same actor both collects data and is evaluated.
        actor = self.rl_trainer.agent.actor
        self.rl_trainer.run_training_loop(
            n_iter=self.params["n_iter"],
            initial_expertdata=self.params["expert_data"],
            collect_policy=actor,
            eval_policy=actor,
            relabel_with_expert=self.params["do_dagger"],
            expert_policy=self.loaded_expert_policy,
        )
class BC_Trainer(object):
    """Behavior-cloning trainer with an optional externally supplied logger;
    also exposes a separate logging-only loop."""

    def __init__(self, params, logger=None):

        #######################
        ## AGENT PARAMS
        #######################

        agent_params = {
            key: params[key]
            for key in ('n_layers', 'size', 'learning_rate', 'max_replay_buffer_size')
        }

        self.params = params
        self.params['agent_class'] = BCAgent  ## TODO: look in here and implement this
        self.params['agent_params'] = agent_params

        ################
        ## RL TRAINER
        ################

        # Forward the caller-supplied logger (may be None) to the trainer.
        self.rl_trainer = RL_Trainer(
            self.params, logger)  ## TODO: look in here and implement this

        #######################
        ## LOAD EXPERT POLICY
        #######################

        print('Loading expert policy from...',
              self.params['expert_policy_file'])
        self.loaded_expert_policy = Loaded_Gaussian_Policy(
            self.rl_trainer.sess, self.params['expert_policy_file'])
        print('Done restoring expert policy...')

    def run_training_loop(self):
        # The same actor both collects data and is evaluated; the trainer's
        # return value is propagated to the caller.
        actor = self.rl_trainer.agent.actor
        return self.rl_trainer.run_training_loop(
            n_iter=self.params['n_iter'],
            initial_expertdata=self.params['expert_data'],
            collect_policy=actor,
            eval_policy=actor,
            relabel_with_expert=self.params['do_dagger'],
            expert_policy=self.loaded_expert_policy,
        )

    def run_logging_loop(self, itr):
        # Logging-only pass for iteration `itr`; no expert relabeling here.
        actor = self.rl_trainer.agent.actor
        return self.rl_trainer.run_logging_loop(
            n_iter=itr,
            initial_expertdata=self.params['expert_data'],
            collect_policy=actor,
            eval_policy=actor,
        )
# Example no. 16
class BC_Trainer(object):
    """Behavior-cloning trainer with extended agent options (SIREN networks,
    separate parameter training, gradient supervision, etc.)."""

    def __init__(self, params):

        #######################
        ## AGENT PARAMS
        #######################

        # Copy the agent's hyperparameters straight through from `params`
        # under the same key names.
        agent_keys = (
            'n_layers', 'size', 'learning_rate', 'max_replay_buffer_size',
            'siren', 'train_separate_params', 'supervision_mode',
            'offset_learning_rate', 'epsilon_s', 'auto_cast',
            'gradient_loss_scale', 'additional_activation', 'omega',
        )
        agent_params = {key: params[key] for key in agent_keys}

        self.params = params
        self.params['agent_class'] = BCAgent  ## HW1: you will modify this
        self.params['agent_params'] = agent_params

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)  ## HW1: you will modify this

        #######################
        ## LOAD EXPERT POLICY
        #######################

        print('Loading expert policy from...',
              self.params['expert_policy_file'])
        self.loaded_expert_policy = LoadedGaussianPolicy(
            self.params['expert_policy_file'])
        print('Done restoring expert policy...')

    def run_training_loop(self):
        # The same actor both collects data and is evaluated; the trainer's
        # return value is propagated to the caller.
        actor = self.rl_trainer.agent.actor
        return self.rl_trainer.run_training_loop(
            n_iter=self.params['n_iter'],
            initial_expertdata=self.params['expert_data'],
            collect_policy=actor,
            eval_policy=actor,
            relabel_with_expert=self.params['do_dagger'],
            expert_policy=self.loaded_expert_policy,
        )
# Example no. 17
class TRPO_Trainer(object):
    """Configures a TRPOAgent and runs it through RL_Trainer."""

    def __init__(self, params):

        #####################
        ## SET AGENT PARAMS
        #####################

        # Policy/critic network architecture and optimizer settings.
        graph_args = {
            'n_layers': params['n_layers'],
            'size': params['size'],
            'learning_rate': params['learning_rate'],
            'num_target_updates': params['num_target_updates'],
        }

        # Advantage-estimation settings (optionally GAE).
        advantage_args = {
            'gamma': params['discount'],
            'use_gae': params['use_gae'],
            'gae_lam': params['gae_lam'],
            'standardize_advantages': not params['dont_standardize_advantages'],
        }

        # Trust-region / conjugate-gradient update settings.
        trpo_update_args = {
            'cg_steps': params['cg_steps'],
            'damping': params['damping'],
            'max_kl_increment': params['max_kl_increment'],
            'max_backtracks': params['max_backtracks'],
        }

        # Optimization schedule.
        train_args = {
            'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
            'l2_reg': params['l2_reg'],
        }

        self.params = params
        self.params['agent_class'] = TRPOAgent
        self.params['agent_params'] = {
            **graph_args,
            **advantage_args,
            **trpo_update_args,
            **train_args,
        }
        self.params['batch_size_initial'] = self.params['batch_size']

        ################
        ## RL TRAINER
        ################

        self.rl_trainer = RL_Trainer(self.params)

    def run_training_loop(self):
        # The same actor both collects data and is evaluated.
        actor = self.rl_trainer.agent.actor
        self.rl_trainer.run_training_loop(
            self.params['n_iter'],
            collect_policy=actor,
            eval_policy=actor,
        )