def __init__(
        self,
        env,
        agent,
        train_tasks,
        eval_tasks,
        meta_batch=64,
        num_iterations=100,
        num_train_steps_per_itr=1000,
        num_initial_steps=100,
        num_tasks_sample=100,
        num_steps_prior=100,
        num_steps_posterior=100,
        num_extra_rl_steps_posterior=100,
        num_evals=10,
        num_steps_per_eval=1000,
        batch_size=1024,
        embedding_batch_size=1024,
        embedding_mini_batch_size=1024,
        max_path_length=1000,
        discount=0.99,
        replay_buffer_size=1000000,
        reward_scale=1,
        num_exp_traj_eval=1,
        update_post_train=1,
        eval_deterministic=True,
        render=False,
        save_replay_buffer=False,
        save_algorithm=False,
        save_environment=False,
        render_eval_paths=False,
        dump_eval_paths=False,
        plotter=None,
        dyna=False,
        dyna_num_train_itr=50,
        dyna_num_train_steps_per_itr=50,
        dyna_tandem_train=True,
        dyna_n_layers=3,
        dyna_hidden_layer_size=64,
        dyna_learning_rate=1e-3,
    ):
        """
        :param env: training env
        :param agent: agent conditioned on a latent variable z, which this algorithm is responsible for feeding in
        :param train_tasks: list of tasks used for training
        :param eval_tasks: list of tasks used for eval

        See the default experiment config file for descriptions of the remaining arguments.
        """
        self.env = env
        self.agent = agent
        self.exploration_agent = agent  # could be a separate exploration-only policy; currently the same agent is used
        self.train_tasks = train_tasks
        self.eval_tasks = eval_tasks
        self.meta_batch = meta_batch
        self.num_iterations = num_iterations
        self.num_train_steps_per_itr = num_train_steps_per_itr
        self.num_initial_steps = num_initial_steps
        self.num_tasks_sample = num_tasks_sample
        self.num_steps_prior = num_steps_prior
        self.num_steps_posterior = num_steps_posterior
        self.num_extra_rl_steps_posterior = num_extra_rl_steps_posterior
        self.num_evals = num_evals
        self.num_steps_per_eval = num_steps_per_eval
        self.batch_size = batch_size
        self.embedding_batch_size = embedding_batch_size
        self.embedding_mini_batch_size = embedding_mini_batch_size
        self.max_path_length = max_path_length
        self.discount = discount
        self.replay_buffer_size = replay_buffer_size
        self.reward_scale = reward_scale
        self.update_post_train = update_post_train
        self.num_exp_traj_eval = num_exp_traj_eval
        self.eval_deterministic = eval_deterministic
        self.render = render
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment

        self.eval_statistics = None
        self.render_eval_paths = render_eval_paths
        self.dump_eval_paths = dump_eval_paths
        self.plotter = plotter

        self.dyna = dyna
        self.dyna_num_train_itr = dyna_num_train_itr
        self.dyna_num_train_steps_per_itr = dyna_num_train_steps_per_itr
        self.dyna_tandem_train = dyna_tandem_train
        self.dyna_n_layers = dyna_n_layers
        self.dyna_hidden_layer_size = dyna_hidden_layer_size
        self.dyna_learning_rate = dyna_learning_rate

        if dyna:
            self.sampler = DynamicsSampler(
                env=env,
                policy=agent,
                max_path_length=self.max_path_length,
                num_train_itr=dyna_num_train_itr,
                num_train_steps_per_itr=dyna_num_train_steps_per_itr,
                tandem_train=dyna_tandem_train,
                n_layers=dyna_n_layers,
                hidden_layer_size=dyna_hidden_layer_size,
                learning_rate=dyna_learning_rate,
            )
        else:
            self.sampler = InPlacePathSampler(
                env=env,
                policy=agent,
                max_path_length=self.max_path_length,
            )

        # separate replay buffers for
        # - training RL update
        # - training encoder update
        self.replay_buffer = MultiTaskReplayBuffer(
            self.replay_buffer_size,
            env,
            self.train_tasks,
        )

        self.enc_replay_buffer = MultiTaskReplayBuffer(
            self.replay_buffer_size,
            env,
            self.train_tasks,
        )

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []
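
For orientation, here is a minimal sketch of what an InPlacePathSampler-style rollout collector could look like. It assumes a simple interface (env.reset() returns an observation, env.step(action) returns (obs, reward, done, info), policy.get_action(obs) returns an action); the class name and method signatures below are illustrative, not the library's actual API.

# Minimal sketch of an in-place path sampler (illustrative; the real
# InPlacePathSampler used above may differ).
class SketchPathSampler:
    def __init__(self, env, policy, max_path_length=1000):
        self.env = env
        self.policy = policy
        self.max_path_length = max_path_length

    def obtain_samples(self, max_samples):
        paths, n_steps = [], 0
        while n_steps < max_samples:
            obs = self.env.reset()
            path = {"observations": [], "actions": [], "rewards": []}
            for _ in range(self.max_path_length):
                action = self.policy.get_action(obs)
                next_obs, reward, done, _ = self.env.step(action)
                path["observations"].append(obs)
                path["actions"].append(action)
                path["rewards"].append(reward)
                obs = next_obs
                n_steps += 1
                if done or n_steps >= max_samples:
                    break
            paths.append(path)
        return paths, n_steps
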
Example #2
    def __init__(self,
                 env,
                 agent,
                 train_tasks,
                 eval_tasks,
                 goal_radius,
                 eval_deterministic=True,
                 render=False,
                 render_eval_paths=False,
                 plotter=None,
                 **kwargs):
        """
        :param env: training env
        :param agent: agent conditioned on a latent variable z, which this algorithm is responsible for feeding in
        :param train_tasks: list of tasks used for training
        :param eval_tasks: list of tasks used for eval
        :param goal_radius: reward threshold for defining sparse rewards

        See the default experiment config file for descriptions of the remaining arguments.
        """
        self.env = env
        self.agent = agent
        self.train_tasks = train_tasks
        self.eval_tasks = eval_tasks
        self.goal_radius = goal_radius

        self.meta_batch = kwargs['meta_batch']
        self.batch_size = kwargs['batch_size']
        self.num_iterations = kwargs['num_iterations']
        self.num_train_steps_per_itr = kwargs['num_train_steps_per_itr']
        self.num_initial_steps = kwargs['num_initial_steps']
        self.num_tasks_sample = kwargs['num_tasks_sample']
        self.num_steps_prior = kwargs['num_steps_prior']
        self.num_steps_posterior = kwargs['num_steps_posterior']
        self.num_extra_rl_steps_posterior = kwargs[
            'num_extra_rl_steps_posterior']
        self.num_evals = kwargs['num_evals']
        self.num_steps_per_eval = kwargs['num_steps_per_eval']
        self.embedding_batch_size = kwargs['embedding_batch_size']
        self.embedding_mini_batch_size = kwargs['embedding_mini_batch_size']
        self.max_path_length = kwargs['max_path_length']
        self.discount = kwargs['discount']
        self.replay_buffer_size = kwargs['replay_buffer_size']
        self.reward_scale = kwargs['reward_scale']
        self.update_post_train = kwargs['update_post_train']
        self.num_exp_traj_eval = kwargs['num_exp_traj_eval']
        self.save_replay_buffer = kwargs['save_replay_buffer']
        self.save_algorithm = kwargs['save_algorithm']
        self.save_environment = kwargs['save_environment']
        self.dump_eval_paths = kwargs['dump_eval_paths']
        self.data_dir = kwargs['data_dir']
        self.train_epoch = kwargs['train_epoch']
        self.eval_epoch = kwargs['eval_epoch']
        self.sample = kwargs['sample']
        self.n_trj = kwargs['n_trj']
        self.allow_eval = kwargs['allow_eval']
        self.mb_replace = kwargs['mb_replace']

        self.eval_deterministic = eval_deterministic
        self.render = render
        self.eval_statistics = None
        self.render_eval_paths = render_eval_paths
        self.plotter = plotter

        self.train_buffer = MultiTaskReplayBuffer(self.replay_buffer_size, env,
                                                  self.train_tasks,
                                                  self.goal_radius)
        self.eval_buffer = MultiTaskReplayBuffer(self.replay_buffer_size, env,
                                                 self.eval_tasks,
                                                 self.goal_radius)
        self.replay_buffer = MultiTaskReplayBuffer(self.replay_buffer_size,
                                                   env, self.train_tasks,
                                                   self.goal_radius)
        self.enc_replay_buffer = MultiTaskReplayBuffer(self.replay_buffer_size,
                                                       env, self.train_tasks,
                                                       self.goal_radius)
        # offline sampler which samples from the train/eval buffer
        self.offline_sampler = OfflineInPlacePathSampler(
            env=env, policy=agent, max_path_length=self.max_path_length)
        # online sampler for evaluation when collecting on-policy context; for offline context, use self.offline_sampler
        self.sampler = InPlacePathSampler(env=env,
                                          policy=agent,
                                          max_path_length=self.max_path_length)

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []
        self.init_buffer()
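
The goal_radius argument suggests rewards are sparsified by thresholding on distance to the goal. A rough, hypothetical sketch of that idea (sparsify_reward is not a function from the code above):

def sparsify_reward(dense_reward, distance_to_goal, goal_radius):
    # Hypothetical helper: keep the reward only within goal_radius of the
    # goal; otherwise the transition earns nothing.
    return dense_reward if distance_to_goal <= goal_radius else 0.0

print(sparsify_reward(1.5, distance_to_goal=0.1, goal_radius=0.2))  # 1.5
print(sparsify_reward(1.5, distance_to_goal=0.5, goal_radius=0.2))  # 0.0
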
Example #3
    def __init__(
            self,
            env,
            policy,
            train_tasks,
            eval_tasks,
            meta_batch=64,
            num_iterations=100,
            num_train_steps_per_itr=1000,
            num_tasks_sample=100,
            num_steps_per_task=100,
            num_evals=10,
            num_steps_per_eval=1000,
            batch_size=1024,
            embedding_batch_size=1024,
            embedding_mini_batch_size=1024,
            max_path_length=1000,
            discount=0.99,
            replay_buffer_size=1000000,
            reward_scale=1,
            train_embedding_source='posterior_only',
            eval_embedding_source='initial_pool',
            eval_deterministic=True,
            render=False,
            save_replay_buffer=False,
            save_algorithm=False,
            save_environment=False,
            obs_emb_dim=0):
        """
        Base class for Meta RL Algorithms
        :param env: training env
        :param policy: policy conditioned on a latent variable z, which this algorithm is responsible for feeding in
        :param train_tasks: list of tasks used for training
        :param eval_tasks: list of tasks used for eval
        :param meta_batch: number of tasks used for meta-update
        :param num_iterations: number of training iterations (outer loop)
        :param num_train_steps_per_itr: number of meta-gradient steps taken per iteration
        :param num_tasks_sample: number of train tasks to sample to collect data for
        :param num_steps_per_task: number of transitions to collect per task
        :param num_evals: number of independent evaluation runs, with separate task encodings
        :param num_steps_per_eval: number of transitions to sample for evaluation
        :param batch_size: size of batches used to compute RL update
        :param embedding_batch_size: size of batches used to compute embedding
        :param embedding_mini_batch_size: size of batch used for encoder update
        :param max_path_length: max episode length
        :param discount: discount factor used for RL updates
        :param replay_buffer_size: max replay buffer size
        :param reward_scale: scaling factor applied to rewards
        :param render: whether to render the environment during rollouts
        :param save_replay_buffer: whether to include the replay buffer in snapshots
        :param save_algorithm: whether to include the algorithm itself in snapshots
        :param save_environment: whether to include the environment in snapshots
        """
        self.env = env
        self.policy = policy
        self.exploration_policy = policy  # could be a separate exploration-only policy; currently the same policy is used
        self.train_tasks = train_tasks
        self.eval_tasks = eval_tasks
        self.meta_batch = meta_batch
        self.num_iterations = num_iterations
        self.num_train_steps_per_itr = num_train_steps_per_itr
        self.num_tasks_sample = num_tasks_sample
        self.num_steps_per_task = num_steps_per_task
        self.num_evals = num_evals
        self.num_steps_per_eval = num_steps_per_eval
        self.batch_size = batch_size
        self.embedding_batch_size = embedding_batch_size
        self.embedding_mini_batch_size = embedding_mini_batch_size
        self.max_path_length = max_path_length
        self.discount = discount
        self.replay_buffer_size = min(
            int(replay_buffer_size / (len(train_tasks))), 1000)
        self.reward_scale = reward_scale
        self.train_embedding_source = train_embedding_source
        self.eval_embedding_source = eval_embedding_source  # TODO: add options for computing embeddings on train tasks too
        self.eval_deterministic = eval_deterministic
        self.render = render
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment

        self.eval_sampler = InPlacePathSampler(
            env=env,
            policy=policy,
            max_samples=self.num_steps_per_eval,
            max_path_length=self.max_path_length,
        )

        # separate replay buffers for
        # - training RL update
        # - training encoder update
        # - testing encoder
        self.replay_buffer = MultiTaskReplayBuffer(self.replay_buffer_size,
                                                   env,
                                                   self.train_tasks,
                                                   state_dim=obs_emb_dim)

        self.enc_replay_buffer = MultiTaskReplayBuffer(self.replay_buffer_size,
                                                       env,
                                                       self.train_tasks,
                                                       state_dim=obs_emb_dim)
        self.eval_enc_replay_buffer = MultiTaskReplayBuffer(
            self.replay_buffer_size,
            env,
            self.eval_tasks,
            state_dim=obs_emb_dim)

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []
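
Note the per-task buffer sizing above: the total budget is split evenly across train tasks and then capped at 1000 transitions per task. A quick sketch of that arithmetic (the helper name is illustrative; the cap comes from the code above):

def per_task_buffer_size(total_size, num_train_tasks, cap=1000):
    # Mirrors the sizing logic above: divide the total budget evenly across
    # tasks, then clamp to the cap.
    return min(int(total_size / num_train_tasks), cap)

print(per_task_buffer_size(1000000, 100))  # 1000 -- the cap binds here
print(per_task_buffer_size(50000, 100))    # 500
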
Example #4
    def __init__(
            self,
            env,
            agent,
            train_tasks,
            eval_tasks,
            meta_batch=64,
            num_iterations=100,
            num_train_steps_per_itr=1000,
            num_initial_steps=100,
            num_tasks_sample=100,
            num_steps_prior=100,
            num_steps_posterior=100,
            num_extra_rl_steps_posterior=100,
            num_evals=10,
            num_steps_per_eval=1000,
            batch_size=1024,
            low_batch_size=2048,  # TODO: tune this batch size
            embedding_batch_size=1024,
            embedding_mini_batch_size=1024,
            max_path_length=1000,
            discount=0.99,
            replay_buffer_size=1000000,
            reward_scale=1,
            num_exp_traj_eval=1,
            update_post_train=1,
            eval_deterministic=True,
            render=False,
            save_replay_buffer=False,
            save_algorithm=False,
            save_environment=False,
            render_eval_paths=False,
            dump_eval_paths=False,
            plotter=None,
            use_goals=False):
        """
        :param env: training env
        :param agent: agent conditioned on a latent variable z, which this algorithm is responsible for feeding in
        :param train_tasks: list of tasks used for training
        :param eval_tasks: list of tasks used for eval

        See the default experiment config file for descriptions of the remaining arguments.
        """
        self.env = env
        self.agent = agent
        self.use_goals = use_goals
        assert agent.use_goals == self.use_goals
        self.exploration_agent = agent  # could be a separate exploration-only policy; currently the same agent is used
        self.train_tasks = train_tasks
        self.eval_tasks = eval_tasks
        self.meta_batch = meta_batch
        self.num_iterations = num_iterations
        self.num_train_steps_per_itr = num_train_steps_per_itr
        self.num_initial_steps = num_initial_steps
        self.num_tasks_sample = num_tasks_sample
        self.num_steps_prior = num_steps_prior
        self.num_steps_posterior = num_steps_posterior
        self.num_extra_rl_steps_posterior = num_extra_rl_steps_posterior
        self.num_evals = num_evals
        self.num_steps_per_eval = num_steps_per_eval
        self.batch_size = batch_size
        self.embedding_batch_size = embedding_batch_size
        self.embedding_mini_batch_size = embedding_mini_batch_size
        self.low_batch_size = low_batch_size
        self.max_path_length = max_path_length
        self.discount = discount
        self.replay_buffer_size = replay_buffer_size
        self.reward_scale = reward_scale
        self.update_post_train = update_post_train
        self.num_exp_traj_eval = num_exp_traj_eval
        self.eval_deterministic = eval_deterministic
        self.render = render
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment

        self.eval_statistics = None
        self.render_eval_paths = render_eval_paths
        self.dump_eval_paths = dump_eval_paths
        self.plotter = plotter

        obs_dim = int(np.prod(env.observation_space.shape))
        action_dim = int(np.prod(env.action_space.shape))

        self.sampler = InPlacePathSampler(
            env=env,
            policy=agent,
            max_path_length=self.max_path_length,
        )

        # separate replay buffers for
        # - training RL update
        # - training encoder update

        self.enc_replay_buffer = MultiTaskReplayBuffer(
            self.replay_buffer_size,
            env,
            self.train_tasks,
        )
        if self.use_goals:
            self.high_buffer = MultiTaskReplayBuffer(self.replay_buffer_size,
                                                     env, self.train_tasks)
            # Hack to change the obs and action dimensions of the internal
            # buffers, since they differ from those of the original environment
            internal_buffers = dict([
                (idx,
                 SimpleReplayBuffer(
                     max_replay_buffer_size=self.replay_buffer_size,
                     observation_dim=obs_dim,
                     action_dim=obs_dim,
                 )) for idx in self.train_tasks
            ])
            self.high_buffer.task_buffers = internal_buffers

            self.low_buffer = SimpleReplayBuffer(
                max_replay_buffer_size=replay_buffer_size,
                observation_dim=2 * obs_dim,
                action_dim=action_dim,
            )
        else:
            self.replay_buffer = MultiTaskReplayBuffer(
                self.replay_buffer_size,
                env,
                self.train_tasks,
            )

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []
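
A minimal sketch of the observation/action layout implied by the hierarchical (use_goals) buffers above: the high-level policy emits a goal with the same dimensionality as the observation (hence action_dim=obs_dim for the per-task high-level buffers), and the low-level policy conditions on the current observation concatenated with that goal (hence observation_dim=2 * obs_dim). The dimensions below are illustrative only, not taken from any particular environment.

import numpy as np

obs_dim, action_dim = 4, 2  # illustrative dimensions

# High-level transition: the "action" is a goal in observation space.
high_obs = np.zeros(obs_dim)
goal = np.ones(obs_dim)                      # high-level action, shape (obs_dim,)

# Low-level transition: observation is [obs, goal], action is the env action.
low_obs = np.concatenate([high_obs, goal])   # shape (2 * obs_dim,)
low_action = np.zeros(action_dim)

assert low_obs.shape == (2 * obs_dim,)
assert goal.shape == (obs_dim,)
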