Example #1
    def setup(self, algo, env):
        """Set up trainer for algorithm and environment.

        This method saves algo and env within trainer and creates a sampler.

        Note:
            After setup() is called all variables in session should have been
            initialized. setup() respects existing values in session so
            policy weights can be loaded before setup().

        Args:
            algo (RLAlgorithm): An algorithm instance.
            env (Environment): An environment instance.

        Raises:
            ValueError: If sampler_cls is passed and the algorithm doesn't
                contain a `max_episode_length` field.

        """
        self._algo = algo
        self._env = env

        self._seed = get_seed()
        self._sampler = self._algo.sampler

        self._has_setup = True
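
The setup() above reads the global seed via get_seed(), so the seed has to be fixed before setup() runs for it to be captured. Below is a minimal sketch of that ordering, assuming garage's set_seed/get_seed helpers; the trainer/algo/env lines are placeholders, not a complete experiment.

from garage.experiment.deterministic import get_seed, set_seed

set_seed(42)               # fix the global seed first
assert get_seed() == 42    # setup() will read this value via get_seed()

# Placeholders -- construct an algorithm and environment as usual, then:
# trainer.setup(algo, env)
# trainer.train(n_epochs=100, batch_size=4000)
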
Example #2
    def make_sampler(self,
                     sampler_cls,
                     *,
                     seed=None,
                     n_workers=psutil.cpu_count(logical=False),
                     max_path_length=None,
                     worker_class=DefaultWorker,
                     sampler_args=None,
                     worker_args=None):
        """Construct a Sampler from a Sampler class.

        Args:
            sampler_cls (type): The type of sampler to construct.
            seed (int): Seed to use in sampler workers.
            max_path_length (int): Maximum path length to be sampled by the
                sampler. Paths longer than this will be truncated.
            n_workers (int): The number of workers the sampler should use.
            worker_class (type): Type of worker the Sampler should use.
            sampler_args (dict or None): Additional arguments that should be
                passed to the sampler.
            worker_args (dict or None): Additional arguments that should be
                passed to the worker.

        Raises:
            ValueError: If `max_path_length` isn't passed and the algorithm
                doesn't contain a `max_path_length` field, or if the algorithm
                doesn't have a policy field.

        Returns:
            sampler_cls: An instance of the sampler class.

        """
        if not hasattr(self._algo, 'policy'):
            raise ValueError('If the runner is used to construct a sampler, '
                             'the algorithm must have a `policy` field.')
        if max_path_length is None:
            if hasattr(self._algo, 'max_path_length'):
                max_path_length = self._algo.max_path_length
            else:
                raise ValueError('If `sampler_cls` is specified in '
                                 'runner.setup, the algorithm must have '
                                 'a `max_path_length` field.')
        if seed is None:
            seed = get_seed()
        if sampler_args is None:
            sampler_args = {}
        if worker_args is None:
            worker_args = {}

        if issubclass(sampler_cls, BaseSampler):
            return sampler_cls(self._algo, self._env, **sampler_args)
        else:
            return sampler_cls.from_worker_factory(WorkerFactory(
                seed=seed,
                max_path_length=max_path_length,
                n_workers=n_workers,
                worker_class=worker_class,
                worker_args=worker_args),
                                                   agents=self._algo.policy,
                                                   envs=self._env)
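
The issubclass(sampler_cls, BaseSampler) branch above keeps two construction paths apart: samplers derived from BaseSampler are built directly from (algo, env), while other samplers are built from a WorkerFactory. A stripped-down, self-contained sketch of that dispatch using stub classes (not garage types):

class BaseSampler:                      # stand-in for the legacy base class
    def __init__(self, algo, env):
        self.algo, self.env = algo, env


class LegacySampler(BaseSampler):
    pass


class FactorySampler:                   # stand-in for a WorkerFactory-based sampler
    @classmethod
    def from_worker_factory(cls, factory, agents, envs):
        sampler = cls()
        sampler.factory, sampler.agents, sampler.envs = factory, agents, envs
        return sampler


def make_sampler(sampler_cls, algo, env, factory):
    if issubclass(sampler_cls, BaseSampler):
        return sampler_cls(algo, env)                    # legacy path
    return sampler_cls.from_worker_factory(factory, agents=algo, envs=env)


print(type(make_sampler(LegacySampler, 'algo', 'env', None)).__name__)
print(type(make_sampler(FactorySampler, 'algo', 'env', 'factory')).__name__)
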
Example #3
    def setup(self, algo, env, sampler_cls=None, sampler_args=None):
        """Set up runner for algorithm and environment.

        This method saves algo and env within runner and creates a sampler.

        Note:
            After setup() is called all variables in session should have been
            initialized. setup() respects existing values in session so
            policy weights can be loaded before setup().

        Args:
            algo (garage.np.algos.RLAlgorithm): An algorithm instance.
            env (garage.envs.GarageEnv): An environment instance.
            sampler_cls (garage.sampler.Sampler): A sampler class.
            sampler_args (dict): Arguments to be passed to sampler constructor.

        """
        self._algo = algo
        self._env = env
        self._policy = self._algo.policy

        if sampler_args is None:
            sampler_args = {}
        if sampler_cls is None:
            sampler_cls = algo.sampler_cls
        self._sampler = sampler_cls(algo, env, **sampler_args)

        self._has_setup = True

        self._setup_args = SetupArgs(sampler_cls=sampler_cls,
                                     sampler_args=sampler_args,
                                     seed=get_seed())
Example #4
    def __init__(self, snapshot_config, max_cpus=1):
        self._snapshotter = Snapshotter(snapshot_config.snapshot_dir,
                                        snapshot_config.snapshot_mode,
                                        snapshot_config.snapshot_gap)

        parallel_sampler.initialize(max_cpus)

        seed = get_seed()
        if seed is not None:
            parallel_sampler.set_seed(seed)

        self._has_setup = False
        self._plot = False

        self._setup_args = None
        self._train_args = None
        self._stats = ExperimentStats(total_itr=0,
                                      total_env_steps=0,
                                      total_epoch=0,
                                      last_path=None)

        self._algo = None
        self._env = None
        self._policy = None
        self._sampler = None
        self._plotter = None

        self._start_time = None
        self._itr_start_time = None
        self.step_itr = None
        self.step_path = None
Example #5
    def setup(self, algo, env):
        """Set up trainer for algorithm and environment.

        This method saves algo and env within trainer and creates a sampler.

        Note:
            After setup() is called all variables in session should have been
            initialized. setup() respects existing values in session so
            policy weights can be loaded before setup().

        Args:
            algo (RLAlgorithm): An algorithm instance. If this algo wants to
                use samplers, it should have a `_sampler` field.
            env (Environment): An environment instance.

        """
        self._algo = algo
        self._env = env

        self._seed = get_seed()

        if hasattr(self._algo, '_sampler'):
            # pylint: disable=protected-access
            self._sampler = self._algo._sampler

        self._has_setup = True
Example #6
 def __init__(
         self,
         agents,
         envs,
         *,  # After this require passing by keyword.
         worker_factory=None,
         max_episode_length=None,
         is_tf_worker=False,
         seed=get_seed(),
         n_workers=psutil.cpu_count(logical=False),
         worker_class=DefaultWorker,
         worker_args=None):
     # pylint: disable=super-init-not-called
     if not ray.is_initialized():
         ray.init(log_to_driver=False, ignore_reinit_error=True)
     if worker_factory is None and max_episode_length is None:
          raise TypeError('Must construct a sampler from WorkerFactory or '
                         'parameters (at least max_episode_length)')
     if isinstance(worker_factory, WorkerFactory):
         self._worker_factory = worker_factory
     else:
         self._worker_factory = WorkerFactory(
             max_episode_length=max_episode_length,
             is_tf_worker=is_tf_worker,
             seed=seed,
             n_workers=n_workers,
             worker_class=worker_class,
             worker_args=worker_args)
     self._sampler_worker = ray.remote(SamplerWorker)
     self._agents = agents
     self._envs = self._worker_factory.prepare_worker_messages(envs)
     self._all_workers = defaultdict(None)
     self._workers_started = False
     self.start_worker()
     self.total_env_steps = 0
Example #7
    def setup(self,
              algo,
              env,
              sampler_cls=None,
              sampler_args=None,
              n_workers=psutil.cpu_count(logical=False),
              worker_class=None,
              worker_args=None):
        """Set up trainer for algorithm and environment.

        This method saves algo and env within trainer and creates a sampler.

        Note:
            After setup() is called all variables in session should have been
            initialized. setup() respects existing values in session so
            policy weights can be loaded before setup().

        Args:
            algo (RLAlgorithm): An algorithm instance.
            env (Environment): An environment instance.
            sampler_cls (type): A class which implements :class:`Sampler`.
            sampler_args (dict): Arguments to be passed to sampler constructor.
            n_workers (int): The number of workers the sampler should use.
            worker_class (type): Type of worker the sampler should use.
            worker_args (dict or None): Additional arguments that should be
                passed to the worker.

        Raises:
            ValueError: If sampler_cls is passed and the algorithm doesn't
                contain a `max_episode_length` field.

        """
        self._algo = algo
        self._env = env
        self._n_workers = n_workers
        self._worker_class = worker_class
        if sampler_args is None:
            sampler_args = {}
        if sampler_cls is None:
            sampler_cls = getattr(algo, 'sampler_cls', None)
        if worker_class is None:
            worker_class = getattr(algo, 'worker_cls', DefaultWorker)
        if worker_args is None:
            worker_args = {}

        self._worker_args = worker_args
        if sampler_cls is None:
            self._sampler = None
        else:
            self._sampler = self.make_sampler(sampler_cls,
                                              sampler_args=sampler_args,
                                              n_workers=n_workers,
                                              worker_class=worker_class,
                                              worker_args=worker_args)

        self._has_setup = True

        self._setup_args = SetupArgs(sampler_cls=sampler_cls,
                                     sampler_args=sampler_args,
                                     seed=get_seed())
Example #8
    def _setup_worker(self, env_indices, tasks):
        """Setup workers.

        Args:
            env_indices (List[Int]): Indices of environments to be assigned
                to workers for sampling.
            tasks (List[dict]): List of tasks to assign.

        """
        if self._vec_env is not None:
            self._vec_env.close()

        vec_envs = []
        for env_ind in env_indices:
            for _ in range(self._envs_per_worker):
                vec_env = copy.deepcopy(self.env)
                vec_env.set_task(tasks[env_ind])
                vec_envs.append(vec_env)
        seed0 = deterministic.get_seed()
        if seed0 is not None:
            for (i, e) in enumerate(vec_envs):
                e.seed(seed0 + i)

        self._vec_env = VecEnvExecutor(
            envs=vec_envs, max_path_length=self.algo.max_path_length)
Example #9
    def evaluate(self, algo, test_rollouts_per_task=None):
        """Evaluate the Meta-RL algorithm on the test tasks.

        Args:
            algo (garage.np.algos.MetaRLAlgorithm): The algorithm to evaluate.
            test_rollouts_per_task (int or None): Number of rollouts per task.

        """
        if test_rollouts_per_task is None:
            test_rollouts_per_task = self._n_test_rollouts
        adapted_trajectories = []
        logger.log('Sampling for adaptation and meta-testing...')
        if self._test_sampler is None:
            self._test_sampler = self._sampler_class.from_worker_factory(
                WorkerFactory(seed=get_seed(),
                              max_path_length=self._max_path_length,
                              n_workers=1,
                              worker_class=self._worker_class,
                              worker_args=self._worker_args),
                agents=algo.get_exploration_policy(),
                envs=self._test_task_sampler.sample(1))
        for env_up in self._test_task_sampler.sample(self._n_test_tasks):
            policy = algo.get_exploration_policy()
            traj = self._trajectory_batch_class.concatenate(*[
                self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                                  env_up)
                for _ in range(self._n_exploration_traj)
            ])
            adapted_policy = algo.adapt_policy(policy, traj)
            adapted_traj = self._test_sampler.obtain_samples(
                self._eval_itr, test_rollouts_per_task * self._max_path_length,
                adapted_policy)
            adapted_trajectories.append(adapted_traj)
        logger.log('Finished meta-testing...')

        if self._test_task_names is not None:
            name_map = dict(enumerate(self._test_task_names))
        else:
            name_map = None

        with tabular.prefix(self._prefix + '/' if self._prefix else ''):
            log_multitask_performance(
                self._eval_itr,
                self._trajectory_batch_class.concatenate(
                    *adapted_trajectories),
                getattr(algo, 'discount', 1.0),
                trajectory_class=self._trajectory_batch_class,
                name_map=name_map)
        self._eval_itr += 1

        if self._trajectory_batch_class == TrajectoryBatch:
            rewards = self._trajectory_batch_class.concatenate(
                *adapted_trajectories).rewards
        else:
            rewards = self._trajectory_batch_class.concatenate(
                *adapted_trajectories).env_rewards

        return sum(rewards) / len(rewards)
Example #10
    def __init__(
            self,
            agents,
            envs,
            *,  # After this require passing by keyword.
            worker_factory=None,
            max_episode_length=None,
            is_tf_worker=False,
            seed=get_seed(),
            n_workers=psutil.cpu_count(logical=False),
            worker_class=DefaultWorker,
            worker_args=None):
        # pylint: disable=super-init-not-called
        if worker_factory is None and max_episode_length is None:
            raise TypeError('Must construct a sampler from WorkerFactory or '
                            'parameters (at least max_episode_length)')
        if isinstance(worker_factory, WorkerFactory):
            self._factory = worker_factory
        else:
            self._factory = WorkerFactory(
                max_episode_length=max_episode_length,
                is_tf_worker=is_tf_worker,
                seed=seed,
                n_workers=n_workers,
                worker_class=worker_class,
                worker_args=worker_args)

        self._agents = self._factory.prepare_worker_messages(
            agents, cloudpickle.dumps)
        self._envs = self._factory.prepare_worker_messages(envs)
        self._to_sampler = mp.Queue(2 * self._factory.n_workers)
        self._to_worker = [mp.Queue(1) for _ in range(self._factory.n_workers)]
        # If we crash from an exception, with full queues, we would rather not
        # hang forever, so we would like the process to close without flushing
        # the queues.
        # That's what cancel_join_thread does.
        for q in self._to_worker:
            q.cancel_join_thread()
        self._workers = [
            mp.Process(target=run_worker,
                       kwargs=dict(
                           factory=self._factory,
                           to_sampler=self._to_sampler,
                           to_worker=self._to_worker[worker_number],
                           worker_number=worker_number,
                           agent=self._agents[worker_number],
                           env=self._envs[worker_number],
                       ),
                       daemon=False)
            for worker_number in range(self._factory.n_workers)
        ]
        self._agent_version = 0
        for w in self._workers:
            w.start()
        self.total_env_steps = 0
Example #11
    def start_worker(self):
        """Initialize the sampler."""
        n_envs = self.n_envs
        envs = [pickle.loads(pickle.dumps(self.env)) for _ in range(n_envs)]

        # Deterministically set environment seeds based on the global seed.
        for (i, e) in enumerate(envs):
            e.seed(deterministic.get_seed() + i)

        self.vec_env = VecEnvExecutor(
            envs=envs, max_path_length=self.algo.max_path_length)
Example #12
    def setup(self,
              algo,
              env,
              sampler_cls=None,
              sampler_args=None,
              n_workers=psutil.cpu_count(logical=False),
              worker_class=DefaultWorker,
              worker_args=None):
        """Set up runner for algorithm and environment.

        This method saves algo and env within runner and creates a sampler.

        Note:
            After setup() is called all variables in session should have been
            initialized. setup() respects existing values in session so
            policy weights can be loaded before setup().

        Args:
            algo (garage.np.algos.RLAlgorithm): An algorithm instance.
            env (garage.envs.GarageEnv): An environment instance.
            sampler_cls (garage.sampler.Sampler): A sampler class.
            sampler_args (dict): Arguments to be passed to sampler constructor.
            n_workers (int): The number of workers the sampler should use.
            worker_class (type): Type of worker the sampler should use.
            worker_args (dict or None): Additional arguments that should be
                passed to the worker.

        """
        self._algo = algo
        self._env = env
        self._policy = self._algo.policy
        self._n_workers = n_workers
        self._worker_class = worker_class
        if sampler_args is None:
            sampler_args = {}
        if sampler_cls is None:
            sampler_cls = algo.sampler_cls
        if worker_args is None:
            worker_args = {}

        self._worker_args = worker_args
        self._sampler = self.make_sampler(sampler_cls,
                                          sampler_args=sampler_args,
                                          n_workers=n_workers,
                                          worker_class=worker_class,
                                          worker_args=worker_args)

        self._has_setup = True

        self._setup_args = SetupArgs(sampler_cls=sampler_cls,
                                     sampler_args=sampler_args,
                                     seed=get_seed())
Example #13
    def start_worker(self):
        """Start workers."""
        n_envs = self._n_envs
        envs = [pickle.loads(pickle.dumps(self.env)) for _ in range(n_envs)]

        # Deterministically set environment seeds based on the global seed.
        seed0 = deterministic.get_seed()
        if seed0 is not None:
            for (i, e) in enumerate(envs):
                e.seed(seed0 + i)

        self._vec_env = VecEnvExecutor(
            envs=envs, max_path_length=self.algo.max_path_length)
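
The pattern above derives one distinct but reproducible seed per environment copy from the single global seed (seed0 + i). A self-contained illustration with a dummy environment class (DummyEnv is a stand-in, not a garage type):

class DummyEnv:
    """Stand-in for an environment exposing a seed() method."""

    def seed(self, value):
        self._seed = value


seed0 = 1000                          # would normally come from deterministic.get_seed()
envs = [DummyEnv() for _ in range(4)]
for i, env in enumerate(envs):
    env.seed(seed0 + i)               # env 0 -> 1000, env 1 -> 1001, ...

print([env._seed for env in envs])    # [1000, 1001, 1002, 1003]
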
Example #14
    def evaluate(self, algo, test_episodes_per_task=None):
        """Evaluate the Meta-RL algorithm on the test tasks.

        Args:
            algo (MetaRLAlgorithm): The algorithm to evaluate.
            test_episodes_per_task (int or None): Number of episodes per task.

        """
        if test_episodes_per_task is None:
            test_episodes_per_task = self._n_test_episodes
        adapted_episodes = []
        logger.log('Sampling for adaptation and meta-testing...')
        env_updates = self._test_task_sampler.sample(self._n_test_tasks)
        if self._test_sampler is None:
            env = env_updates[0]()
            self._max_episode_length = env.spec.max_episode_length
            self._test_sampler = LocalSampler.from_worker_factory(
                WorkerFactory(seed=get_seed(),
                              max_episode_length=self._max_episode_length,
                              n_workers=1,
                              worker_class=self._worker_class,
                              worker_args=self._worker_args),
                agents=algo.get_exploration_policy(),
                envs=env)
        for env_up in env_updates:
            policy = algo.get_exploration_policy()
            eps = EpisodeBatch.concatenate(*[
                self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                                  env_up)
                for _ in range(self._n_exploration_eps)
            ])
            adapted_policy = algo.adapt_policy(policy, eps)
            adapted_eps = self._test_sampler.obtain_samples(
                self._eval_itr,
                test_episodes_per_task * self._max_episode_length,
                adapted_policy)
            adapted_episodes.append(adapted_eps)
        logger.log('Finished meta-testing...')

        if self._test_task_names is not None:
            name_map = dict(enumerate(self._test_task_names))
        else:
            name_map = None

        with tabular.prefix(self._prefix + '/' if self._prefix else ''):
            log_multitask_performance(
                self._eval_itr,
                EpisodeBatch.concatenate(*adapted_episodes),
                getattr(algo, 'discount', 1.0),
                name_map=name_map)
        self._eval_itr += 1
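
Note how the sample budget for meta-testing is derived: test_episodes_per_task * self._max_episode_length transitions, i.e. enough steps for the requested number of full-length episodes per task. A quick arithmetic illustration with made-up numbers:

test_episodes_per_task = 10
max_episode_length = 200

# Mirrors the obtain_samples() call above: episodes * max length = transitions requested.
num_samples = test_episodes_per_task * max_episode_length
print(num_samples)   # 2000
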
Example #15
    def make_sampler(self,
                     sampler_cls,
                     *,
                     seed=None,
                     n_workers=psutil.cpu_count(logical=False),
                     max_path_length=None,
                     worker_class=DefaultWorker,
                     sampler_args=None,
                     worker_args=None):
        """Construct a Sampler from a Sampler class.

        Args:
            sampler_cls (type): The type of sampler to construct.
            seed (int): Seed to use in sampler workers.
            max_path_length (int): Maximum path length to be sampled by the
                sampler. Paths longer than this will be truncated.
            n_workers (int): The number of workers the sampler should use.
            worker_class (type): Type of worker the Sampler should use.
            sampler_args (dict or None): Additional arguments that should be
                passed to the sampler.
            worker_args (dict or None): Additional arguments that should be
                passed to the worker.

        Returns:
            sampler_cls: An instance of the sampler class.

        """
        if max_path_length is None:
            max_path_length = self._algo.max_path_length
        if seed is None:
            seed = get_seed()
        if sampler_args is None:
            sampler_args = {}
        if worker_args is None:
            worker_args = {}
        if issubclass(sampler_cls, BaseSampler):
            return sampler_cls(self._algo, self._env, **sampler_args)
        else:
            return sampler_cls.from_worker_factory(WorkerFactory(
                seed=seed,
                max_path_length=max_path_length,
                n_workers=n_workers,
                worker_class=worker_class,
                worker_args=worker_args),
                                                   agents=self._algo.policy,
                                                   envs=self._env)
Example #16
 def __init__(
         self,
         *,  # Require passing by keyword.
         max_episode_length,
         is_tf_worker=False,
         seed=get_seed(),
         n_workers=psutil.cpu_count(logical=False),
         worker_class=DefaultWorker,
         worker_args=None):
     self.n_workers = n_workers
     self._seed = seed
     self._max_episode_length = max_episode_length
     if is_tf_worker:
         worker_class = TFWorkerClassWrapper(worker_class)
     self._worker_class = worker_class
     if worker_args is None:
         self._worker_args = {}
     else:
         self._worker_args = worker_args
Example #17
 def __init__(
         self,
         *,  # Require passing by keyword.
         max_episode_length,
         is_tf_worker=False,
         seed=get_seed(),
         n_workers=psutil.cpu_count(logical=False),
         worker_class=DefaultWorker,
         worker_args=None):
     self.n_workers = n_workers
     self._seed = seed
     self._max_episode_length = max_episode_length
     if is_tf_worker:
         # Import here to avoid hard dependency on TF.
         # pylint: disable=import-outside-toplevel
         from garage.tf.samplers import TFWorkerClassWrapper
         worker_class = TFWorkerClassWrapper(worker_class)
     self._worker_class = worker_class
     if worker_args is None:
         self._worker_args = {}
     else:
         self._worker_args = worker_args
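
One subtlety in the signature above: the default seed=get_seed() is evaluated once, when the def statement runs (typically at import time), not when the factory is constructed, so a seed set later with set_seed() is not reflected in the default. This is presumably why call sites such as Examples #9, #14 and #24 pass seed=get_seed() explicitly. A small pure-Python illustration of the evaluation order (the helpers below are stand-ins, not garage code):

_seed = None

def get_seed():
    return _seed

def set_seed(value):
    global _seed
    _seed = value

# The default argument is evaluated when this def runs, capturing None.
def make_factory(seed=get_seed()):
    return seed

set_seed(7)
print(make_factory())                   # None -- the stale default
print(make_factory(seed=get_seed()))    # 7 -- value read at call time
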
Example #18
    def __init__(self, snapshot_config, max_cpus=1):
        self._snapshotter = Snapshotter(
            snapshot_config.snapshot_dir,
            snapshot_config.snapshot_mode,
            snapshot_config.snapshot_gap,
        )

        parallel_sampler.initialize(max_cpus)

        seed = get_seed()
        if seed is not None:
            parallel_sampler.set_seed(seed)

        self._has_setup = False
        self._plot = False

        self._setup_args = None
        self._train_args = None
        self._stats = ExperimentStats(
            total_itr=0, total_env_steps=0, total_epoch=0, last_path=None
        )

        self._algo = None
        self._env = None
        self._sampler = None
        self._plotter = None

        self._start_time = None
        self._itr_start_time = None
        self.step_itr = None
        self.step_path = None

        # only used for off-policy algorithms
        self.enable_logging = True

        self._n_workers = None
        self._worker_class = None
        self._worker_args = None
Example #19
    def __init__(
            self,
            agents,
            envs,
            *,  # After this require passing by keyword.
            worker_factory=None,
            max_episode_length=None,
            is_tf_worker=False,
            seed=get_seed(),
            n_workers=psutil.cpu_count(logical=False),
            worker_class=DefaultWorker,
            worker_args=None):
        # pylint: disable=super-init-not-called
        if worker_factory is None and max_episode_length is None:
            raise TypeError('Must construct a sampler from WorkerFactory or '
                            'parameters (at least max_episode_length)')
        if isinstance(worker_factory, WorkerFactory):
            self._factory = worker_factory
        else:
            self._factory = WorkerFactory(
                max_episode_length=max_episode_length,
                is_tf_worker=is_tf_worker,
                seed=seed,
                n_workers=n_workers,
                worker_class=worker_class,
                worker_args=worker_args)

        self._agents = self._factory.prepare_worker_messages(agents)
        self._envs = self._factory.prepare_worker_messages(
            envs, preprocess=copy.deepcopy)
        self._workers = [
            self._factory(i) for i in range(self._factory.n_workers)
        ]
        for worker, agent, env in zip(self._workers, self._agents, self._envs):
            worker.update_agent(agent)
            worker.update_env(env)
        self.total_env_steps = 0
Example #20
    def _build(self, state_input):
        action_dim = self._output_dim

        with tf.variable_scope('dist_params'):
            if self._std_share_network:
                # mean and std networks share an MLP
                b = np.concatenate([
                    np.zeros(action_dim),
                    np.full(action_dim, self._init_std_param)
                ], axis=0)  # yapf: disable
                b = tf.constant_initializer(b)
                mean_std_network = mlp(
                    state_input,
                    output_dim=action_dim * 2,
                    hidden_sizes=self._hidden_sizes,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    output_nonlinearity=self._output_nonlinearity,
                    output_b_init=b,
                    name='mean_std_network')
                with tf.variable_scope('mean_network'):
                    mean_network = mean_std_network[..., :action_dim]
                with tf.variable_scope('std_network'):
                    std_network = mean_std_network[..., action_dim:]

            else:
                # separate MLPs for mean and std networks
                # mean network
                mean_network = mlp(
                    state_input,
                    output_dim=action_dim,
                    hidden_sizes=self._hidden_sizes,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    output_nonlinearity=self._output_nonlinearity,
                    name='mean_network')

                # std network
                if self._adaptive_std:
                    b = tf.constant_initializer(self._init_std_param)
                    std_network = mlp(
                        state_input,
                        output_dim=action_dim,
                        hidden_sizes=self._std_hidden_sizes,
                        hidden_nonlinearity=self._std_hidden_nonlinearity,
                        output_nonlinearity=self._std_output_nonlinearity,
                        output_b_init=b,
                        name='std_network')
                else:
                    p = tf.constant_initializer(self._init_std_param)
                    std_network = parameter(state_input,
                                            length=action_dim,
                                            initializer=p,
                                            trainable=self._learn_std,
                                            name='std_network')

        mean_var = mean_network
        std_param_var = std_network

        with tf.variable_scope('std_parameterization'):
            # build std_var with std parameterization
            if self._std_parameterization == 'exp':
                std_param_var = std_param_var
            elif self._std_parameterization == 'softplus':
                std_param_var = tf.log(1. + tf.exp(std_param_var))
            else:
                raise NotImplementedError

        with tf.variable_scope('std_limits'):
            std_var = std_param_var
            if self._min_std_param:
                std_var = tf.maximum(std_var, self._min_std_param)
            if self._max_std_param:
                std_var = tf.minimum(std_var, self._max_std_param)

        dist = DiagonalGaussian(action_dim)

        rnd = tf.random.normal(shape=mean_var.get_shape().as_list()[1:],
                               seed=deterministic.get_seed())
        action_var = rnd * tf.exp(std_var) + mean_var

        return action_var, mean_var, std_var, std_param_var, dist
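
The final sampling step is the standard reparameterization of a diagonal Gaussian: draw standard-normal noise (seeded via deterministic.get_seed()) and scale and shift it by the predicted std and mean. The same computation in NumPy with illustrative values:

import numpy as np

action_dim = 3
rng = np.random.default_rng(seed=42)      # analogue of the seeded tf.random.normal

mean = np.array([0.0, 1.0, -1.0])         # mean_var from the mean network
log_std = np.array([-0.5, -0.5, -0.5])    # std_var (a log std) from the std network

noise = rng.standard_normal(action_dim)
action = noise * np.exp(log_std) + mean   # rnd * tf.exp(std_var) + mean_var
print(action)
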
Example #21
    def _build(self, state_input):
        action_dim = self._output_dim

        with tf.variable_scope('dist_params'):
            if self._std_share_network:
                # mean and std networks share an MLP
                b = np.concatenate([
                    np.zeros(action_dim),
                    np.full(action_dim, self._init_std_param)
                ], axis=0)  # yapf: disable
                b = tf.constant_initializer(b)
                mean_std_network = mlp(
                    state_input,
                    output_dim=action_dim * 2,
                    hidden_sizes=self._hidden_sizes,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    output_nonlinearity=self._output_nonlinearity,
                    output_b_init=b,
                    name='mean_std_network')
                with tf.variable_scope('mean_network'):
                    mean_network = mean_std_network[..., :action_dim]
                with tf.variable_scope('std_network'):
                    std_network = mean_std_network[..., action_dim:]

            else:
                # separate MLPs for mean and std networks
                # mean network
                mean_network = mlp(
                    state_input,
                    output_dim=action_dim,
                    hidden_sizes=self._hidden_sizes,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    output_nonlinearity=self._output_nonlinearity,
                    name='mean_network')

                # std network
                if self._adaptive_std:
                    b = tf.constant_initializer(self._init_std_param)
                    std_network = mlp(
                        state_input,
                        output_dim=action_dim,
                        hidden_sizes=self._std_hidden_sizes,
                        hidden_nonlinearity=self._std_hidden_nonlinearity,
                        output_nonlinearity=self._std_output_nonlinearity,
                        output_b_init=b,
                        name='std_network')
                else:
                    p = tf.constant_initializer(self._init_std_param)
                    std_network = parameter(state_input,
                                            length=action_dim,
                                            initializer=p,
                                            trainable=self._learn_std,
                                            name='std_network')

        mean_var = mean_network
        log_std_var = std_network

        with tf.variable_scope('std_parameterization'):
            # build std_var with std parameterization
            if self._std_parameterization == 'exp':
                pass
            elif self._std_parameterization == 'softplus':
                softplus_std_var = tf.log(1. + tf.exp(log_std_var))
                log_std_var = tf.log(softplus_std_var)
            else:
                raise NotImplementedError

        with tf.variable_scope('std_limits'):
            if self._min_std_param:
                log_std_var = tf.maximum(log_std_var, self._min_std_param)
            if self._max_std_param:
                log_std_var = tf.minimum(log_std_var, self._max_std_param)

        distribution = tfp.distributions.MultivariateNormalDiag(
            mean_var, tf.exp(log_std_var))

        action_var = distribution.sample(seed=deterministic.get_seed())

        return action_var, log_std_var, distribution
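
The two std_parameterization branches above map the raw std-network output to a standard deviation differently: 'exp' treats the output directly as a log std, while 'softplus' first applies softplus and then takes the log so downstream code keeps working in log-std space. A quick numeric comparison in NumPy (illustrative values only):

import numpy as np

raw = np.array([-2.0, 0.0, 2.0])          # raw std-network output

std_exp = np.exp(raw)                     # 'exp': raw is already the log std
std_softplus = np.log1p(np.exp(raw))      # 'softplus': std = log(1 + e^raw)

# Example #21 stores log(softplus(raw)) so that tf.exp() later recovers std_softplus.
log_std_softplus = np.log(std_softplus)
print(std_exp, std_softplus, np.exp(log_std_softplus))
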
Example #22
import copy

from garage.experiment.deterministic import get_seed
from garage.sampler import FragmentWorker, LocalSampler
from garage.sampler.worker_factory import WorkerFactory

seed = get_seed()


class SingleVecWorkSampler(LocalSampler):
    """Sampler class which contains 1 vectorized worker which contains all the envs.

    The sampler need to be created either from a worker factory or from
    parameters which can construct a worker factory. See the __init__ method
    of WorkerFactory for the detail of these parameters.

    Args:
        agents (Policy or List[Policy]): Agent(s) to use to sample episodes.
            If a list is passed in, it must have length exactly
            `worker_factory.n_workers`, and will be spread across the
            workers.
        envs (Environment or List[Environment]): Environment from which
            episodes are sampled.
    """
Example #23
    def _build(self, state_input):
        action_dim = self._output_dim

        with tf.variable_scope('dist_params'):
            if self._std_share_network:
                # mean and std networks share an MLP
                b = np.concatenate([
                    np.zeros(action_dim),
                    np.full(action_dim, self._init_std_param)
                ], axis=0)  # yapf: disable

                mean_std_network = mlp(
                    state_input,
                    output_dim=action_dim * 2,
                    hidden_sizes=self._hidden_sizes,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    hidden_w_init=self._hidden_w_init,
                    hidden_b_init=self._hidden_b_init,
                    output_nonlinearity=self._output_nonlinearity,
                    output_w_init=self._output_w_init,
                    output_b_init=tf.constant_initializer(b),
                    name='mean_std_network',
                    layer_normalization=self._layer_normalization)
                with tf.variable_scope('mean_network'):
                    mean_network = mean_std_network[..., :action_dim]
                with tf.variable_scope('log_std_network'):
                    log_std_network = mean_std_network[..., action_dim:]

            else:
                # separate MLPs for mean and std networks
                # mean network
                mean_network = mlp(
                    state_input,
                    output_dim=action_dim,
                    hidden_sizes=self._hidden_sizes,
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    hidden_w_init=self._hidden_w_init,
                    hidden_b_init=self._hidden_b_init,
                    output_nonlinearity=self._output_nonlinearity,
                    output_w_init=self._output_w_init,
                    output_b_init=self._output_b_init,
                    name='mean_network',
                    layer_normalization=self._layer_normalization)

                # std network
                if self._adaptive_std:
                    log_std_network = mlp(
                        state_input,
                        output_dim=action_dim,
                        hidden_sizes=self._std_hidden_sizes,
                        hidden_nonlinearity=self._std_hidden_nonlinearity,
                        hidden_w_init=self._std_hidden_w_init,
                        hidden_b_init=self._std_hidden_b_init,
                        output_nonlinearity=self._std_output_nonlinearity,
                        output_w_init=self._std_output_w_init,
                        output_b_init=tf.constant_initializer(
                            self._init_std_param),
                        name='log_std_network',
                        layer_normalization=self._layer_normalization)
                else:
                    log_std_network = parameter(
                        state_input,
                        length=action_dim,
                        initializer=tf.constant_initializer(
                            self._init_std_param),
                        trainable=self._learn_std,
                        name='log_std_network')

        mean_var = mean_network
        std_param = log_std_network

        with tf.variable_scope('std_parameterization'):
            # build std_var with std parameterization
            if self._std_parameterization == 'exp':
                log_std_var = std_param
            else:  # we know it must be softplus here
                log_std_var = tf.log(1. + tf.exp(std_param))

        with tf.variable_scope('std_limits'):
            if self._min_std_param is not None:
                log_std_var = tf.maximum(log_std_var, self._min_std_param)
            if self._max_std_param is not None:
                log_std_var = tf.minimum(log_std_var, self._max_std_param)

        dist = DiagonalGaussian(self._output_dim)

        rnd = tf.random.normal(shape=mean_var.get_shape().as_list()[1:],
                               seed=deterministic.get_seed())
        action_var = rnd * tf.exp(log_std_var) + mean_var

        return action_var, mean_var, log_std_var, std_param, dist
Example #24
    def make_sampler(self,
                     sampler_cls,
                     *,
                     seed=None,
                     n_workers=psutil.cpu_count(logical=False),
                     max_episode_length=None,
                     worker_class=None,
                     sampler_args=None,
                     worker_args=None):
        """Construct a Sampler from a Sampler class.

        Args:
            sampler_cls (type): The type of sampler to construct.
            seed (int): Seed to use in sampler workers.
            max_episode_length (int): Maximum episode length to be sampled by
                the sampler. Episodes longer than this will be truncated.
            n_workers (int): The number of workers the sampler should use.
            worker_class (type): Type of worker the Sampler should use.
            sampler_args (dict or None): Additional arguments that should be
                passed to the sampler.
            worker_args (dict or None): Additional arguments that should be
                passed to the worker.

        Raises:
            ValueError: If `max_episode_length` isn't passed and the algorithm
                doesn't contain a `max_episode_length` field, or if the
                algorithm doesn't have a policy field.

        Returns:
            sampler_cls: An instance of the sampler class.

        """
        policy = getattr(self._algo, 'exploration_policy', None)
        if policy is None:
            policy = getattr(self._algo, 'policy', None)
        if policy is None:
            raise ValueError('If the trainer is used to construct a sampler, '
                             'the algorithm must have a `policy` or '
                             '`exploration_policy` field.')
        if max_episode_length is None:
            if hasattr(self._algo, 'max_episode_length'):
                max_episode_length = self._algo.max_episode_length
        if max_episode_length is None:
            raise ValueError('If `sampler_cls` is specified in trainer.setup, '
                             'the algorithm must specify `max_episode_length`')
        if worker_class is None:
            worker_class = getattr(self._algo, 'worker_cls', DefaultWorker)
        if seed is None:
            seed = get_seed()
        if sampler_args is None:
            sampler_args = {}
        if worker_args is None:
            worker_args = {}
        return sampler_cls.from_worker_factory(WorkerFactory(
            seed=seed,
            max_episode_length=max_episode_length,
            n_workers=n_workers,
            worker_class=worker_class,
            worker_args=worker_args),
                                               agents=policy,
                                               envs=self._env)
Example #25
    def _build_graph(self, from_input):
        latent_dim = self.latent_space.flat_dim
        small = 1e-5

        with self._variable_scope:
            with tf.variable_scope("dist_params"):
                if self._std_share_network:
                    # mean and std networks share an MLP
                    b = np.concatenate([
                        np.zeros(latent_dim),
                        np.full(latent_dim, self._init_std_param)
                    ],
                                       axis=0)
                    b = tf.constant_initializer(b)
                    # b = tf.truncated_normal_initializer(
                    #     mean=b, stddev=small)
                    mean_std_network = mlp(
                        with_input=from_input,
                        output_dim=latent_dim * 2,
                        hidden_sizes=self._hidden_sizes,
                        hidden_nonlinearity=self._hidden_nonlinearity,
                        output_nonlinearity=self._output_nonlinearity,
                        output_b_init=b,
                        name="mean_std_network")
                    with tf.variable_scope("mean_network"):
                        mean_network = mean_std_network[..., :latent_dim]
                    with tf.variable_scope("std_network"):
                        std_network = mean_std_network[..., latent_dim:]
                else:
                    # separate MLPs for mean and std networks
                    # mean network
                    mean_network = mlp(
                        with_input=from_input,
                        output_dim=latent_dim,
                        hidden_sizes=self._hidden_sizes,
                        hidden_nonlinearity=self._hidden_nonlinearity,
                        output_nonlinearity=self._output_nonlinearity,
                        name="mean_network")

                    # std network
                    if self._adaptive_std:
                        b = tf.constant_initializer(self._init_std_param)
                        # b = tf.truncated_normal_initializer(
                        #     mean=self._init_std_param, stddev=small)
                        std_network = mlp(
                            with_input=from_input,
                            output_dim=latent_dim,
                            hidden_sizes=self._std_hidden_sizes,
                            hidden_nonlinearity=self._std_hidden_nonlinearity,
                            output_nonlinearity=self._output_nonlinearity,
                            output_b_init=b,
                            name="std_network")
                    else:
                        p = tf.constant_initializer(self._init_std_param)
                        # p = tf.truncated_normal_initializer(
                        #     mean=self._init_std_param, stddev=small)
                        std_network = parameter(with_input=from_input,
                                                length=latent_dim,
                                                initializer=p,
                                                trainable=self._learn_std,
                                                name="std_network")

                mean_var = mean_network
                std_param_var = std_network

                with tf.variable_scope("std_limits"):
                    if self._min_std_param:
                        std_param_var = tf.maximum(std_param_var,
                                                   self._min_std_param)
                    if self._max_std_param:
                        std_param_var = tf.minimum(std_param_var,
                                                   self._max_std_param)

            with tf.variable_scope("std_parameterization"):
                # build std_var with std parameterization
                if self._std_parameterization == "exp":
                    std_var = tf.exp(std_param_var)
                elif self._std_parameterization == "softplus":
                    std_var = tf.log(1. + tf.exp(std_param_var))
                else:
                    raise NotImplementedError

            dist = tf.contrib.distributions.MultivariateNormalDiag(
                mean_var, std_var)

            latent_var = dist.sample(seed=deterministic.get_seed())

            return latent_var, mean_var, std_param_var, dist
Example #26
def run_task(snapshot_config, exp_config):
    logger.log(f"Config of this experiment is {exp_config}")
    env_config = exp_config["env"]
    env_name = env_config["name"]
    replay_buffer_size = exp_config.get("replay_buffer_size")
    n_epochs = exp_config.get("n_epochs")
    steps_per_epoch = exp_config.get("steps_per_epoch")
    sampler_batch_size = exp_config.get("sampler_batch_size")
    n_train_steps = exp_config.get("n_train_steps")
    learning_rate = exp_config.get("learning_rate")
    buffer_batch_size = exp_config.get("buffer_batch_size")
    target_network_update_freq = exp_config.get("target_network_update_freq")
    min_buffer_size = exp_config.get("min_buffer_size")
    net_config = exp_config.get("q-net")
    loss_weights = exp_config.get("loss_weights")
    deepmdp_config = exp_config.get("deepmdp")
    epsilon_greedy_config = exp_config.get("epsilon_greedy")
    plots = exp_config.get("plots")
    steps = n_epochs * steps_per_epoch * sampler_batch_size
    num_frames = env_config.get("num_frames")
    if num_frames is None:
        num_frames = 4
    snapshot_config = SnapshotConfig(
        os.path.join(os.getcwd(), f'runs/{get_info()}/snapshots'),
        snapshot_config["snapshot_mode"], snapshot_config["snapshot_gap"])

    if "LunarLander-v2" in env_name:
        # Pass either LunarLander-v2 or LunarLander-v2-img to have the env
        # return image or semantic observations.
        if env_name[-4:] == "-img":
            env = setup_lunar_lander_with_image_obs(
                env_name[:-4], num_frames, do_noops=env_config["do_noops"])
        elif env_name[-4:] == "-obf":
            env = setup_lunar_lander_with_obfuscated_states(
                env_name[:-4], num_frames, do_noops=env_config["do_noops"])
        elif env_name[-4:] == "-stk":
            env = setup_stacked_lunar_lander_env(
                env_name[:-4], num_frames, normalize=env_config["normalize"])
        else:
            env = GarageEnv(gym.make(env_name))
    elif "SpaceInvaders-v0" == env_name:
        env = setup_atari_env(env_name, num_frames)
    else:
        raise ValueError("Env name not known")

    # Set env seed
    env.seed(get_seed())
    # Set seed for action space (needed for epsilon-greedy reproducibility)
    env.action_space.seed(get_seed())

    # Init visualizer
    visualizer = Visualizer(get_info() + "_main", plots)
    visualizer.publish_config(exp_config)

    runner = LocalRunner(snapshot_config)
    replay_buffer = SimpleReplayBuffer(env.spec,
                                       size_in_transitions=replay_buffer_size,
                                       time_horizon=1)

    strategy = EpsilonGreedyStrategy(env.spec, steps, **epsilon_greedy_config)
    qf = DiscreteCNNQFunction(env_spec=env.spec, **net_config)

    aux_objectives = []
    if deepmdp_config["use"]:
        reward_objective = RewardAuxiliaryObjective(
            env.spec, qf.embedding_size, deepmdp_config["reward_head"])
        transition_objective = TransitionAuxiliaryObjective(
            env.spec, qf.embedding_size, deepmdp_config["transition_head"])
        aux_objectives.append(reward_objective)
        aux_objectives.append(transition_objective)

    policy = DiscreteQfDerivedPolicy(env.spec, qf)
    algo = DQN(policy=policy,
               qf=qf,
               env_spec=env.spec,
               experiment_id=get_info(),
               plot_list=plots,
               visualizer=visualizer,
               replay_buffer=replay_buffer,
               qf_optimizer=torch.optim.Adam,
               exploration_strategy=strategy,
               n_train_steps=n_train_steps,
               buffer_batch_size=buffer_batch_size,
               min_buffer_size=min_buffer_size,
               n_epoch_cycles=steps_per_epoch,
               target_network_update_freq=target_network_update_freq,
               qf_lr=learning_rate,
               max_path_length=1000,
               auxiliary_objectives=aux_objectives,
               **loss_weights)

    # Use the modified off-policy sampler to generate summary statistics
    # about episode Q-values in the algo object.
    runner.setup(algo=algo, env=env, sampler_cls=OffPolicyVectorizedSampler)
    runner.train(n_epochs=n_epochs, batch_size=sampler_batch_size)

    # Bypass GarageEnv>>close as this requires a display
    env.env.close()
Example #27
    def _build_graph(self, from_input):
        latent_dim = self.latent_space.flat_dim
        small = 1e-5

        with self._variable_scope:
            with tf.variable_scope("word2vec"):
                lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(
                    num_units=self._sentence_embedding_dict_dim)
                self.sentence_embedding_dict = tf.Variable(np.zeros(
                    (self._sentence_code_dim,
                     self._sentence_embedding_dict_dim)),
                                                           dtype=tf.float32)

                # (bs, max_sentence_len, sentence_embedding_dict_dim)
                self.sentence_embedding = tf.nn.embedding_lookup(
                    params=self.sentence_embedding_dict, ids=from_input)

                data_mask = tf.cast(from_input, tf.bool)
                data_len = tf.reduce_sum(tf.cast(data_mask, tf.int32), axis=1)
                initial_state = lstm_cell.zero_state(
                    tf.shape(self.sentence_embedding)[0], tf.float32)
                input_vec = tf.nn.dynamic_rnn(
                    lstm_cell,
                    self.sentence_embedding,
                    sequence_length=data_len,
                    initial_state=initial_state)[0][:, -1]

            with tf.variable_scope("dist_params"):
                if self._std_share_network:
                    # mean and std networks share an MLP
                    b = np.concatenate([
                        np.zeros(latent_dim),
                        np.full(latent_dim, self._init_std_param)
                    ],
                                       axis=0)
                    b = tf.constant_initializer(b)
                    mean_std_network = mlp(
                        with_input=input_vec,
                        output_dim=latent_dim * 2,
                        hidden_sizes=self._hidden_sizes,
                        hidden_nonlinearity=self._hidden_nonlinearity,
                        output_nonlinearity=self._output_nonlinearity,
                        # hidden_w_init=tf.orthogonal_initializer(1.0),
                        # output_w_init=tf.orthogonal_initializer(1.0),
                        output_b_init=b,
                        name="mean_std_network")
                    with tf.variable_scope("mean_network"):
                        mean_network = mean_std_network[..., :latent_dim]
                    with tf.variable_scope("std_network"):
                        std_network = mean_std_network[..., latent_dim:]
                else:
                    # separate MLPs for mean and std networks
                    # mean network
                    mean_network = mlp(
                        with_input=input_vec,
                        output_dim=latent_dim,
                        hidden_sizes=self._hidden_sizes,
                        hidden_nonlinearity=self._hidden_nonlinearity,
                        output_nonlinearity=self._output_nonlinearity,
                        name="mean_network")

                    # std network
                    if self._adaptive_std:
                        b = tf.constant_initializer(self._init_std_param)
                        std_network = mlp(
                            with_input=input_vec,
                            output_dim=latent_dim,
                            hidden_sizes=self._std_hidden_sizes,
                            hidden_nonlinearity=self._std_hidden_nonlinearity,
                            output_nonlinearity=self._output_nonlinearity,
                            output_b_init=b,
                            name="std_network")
                    else:
                        p = tf.constant_initializer(self._init_std_param)
                        std_network = parameter(with_input=input_vec,
                                                length=latent_dim,
                                                initializer=p,
                                                trainable=self._learn_std,
                                                name="std_network")

                if self._mean_scale != 1.:
                    mean_var = tf.identity(mean_network * self._mean_scale,
                                           "mean_scale")
                else:
                    mean_var = mean_network

                if self._mean_output_nonlinearity is not None:
                    mean_var = self._mean_output_nonlinearity(mean_var)

                std_param_var = std_network

                with tf.variable_scope("std_limits"):
                    if self._min_std_param:
                        std_param_var = tf.maximum(std_param_var,
                                                   self._min_std_param)
                    if self._max_std_param:
                        std_param_var = tf.minimum(std_param_var,
                                                   self._max_std_param)

            with tf.variable_scope("std_parameterization"):
                # build std_var with std parameterization
                if self._std_parameterization == "exp":
                    std_var = tf.exp(std_param_var)
                elif self._std_parameterization == "softplus":
                    std_var = tf.log(1. + tf.exp(std_param_var))
                else:
                    raise NotImplementedError

            if self._normalize:
                mean_var = tf.nn.l2_normalize(mean_var)
                #std_var = tf.nn.l2_normalize(std_var)

            dist = tf.contrib.distributions.MultivariateNormalDiag(
                mean_var, std_var)

            latent_var = dist.sample(seed=deterministic.get_seed())

            return latent_var, mean_var, std_param_var, dist
Example #28
    def setup(self,
              algo,
              env,
              irl,
              baseline,
              n_itr=200,
              start_itr=0,
              sampler_cls=None,
              sampler_args=None,
              n_workers=psutil.cpu_count(logical=False),
              worker_class=None,
              worker_args=None,
              discount=0.99,
              gae_lambda=1,
              discrim_train_itrs=10,
              discrim_batch_size=32,
              irl_model_wt=1.0,
              zero_environment_reward=False):
        """
        :param discount(float): Discount
        :param irl_model_wt(float): weight of IRL model
        """
        self._algo = algo
        self._env = env
        self._irl = irl
        self._baseline = baseline
        self._n_workers = n_workers
        self._worker_class = worker_class

        self.n_itr = n_itr
        self.start_itr = start_itr

        if sampler_args is None:
            sampler_args = {}
        if sampler_cls is None:
            sampler_cls = getattr(algo, 'sampler_cls', None)
        if worker_class is None:
            worker_class = getattr(algo, 'worker_cls', DefaultWorker)
        if worker_args is None:
            worker_args = {}

        self._worker_args = worker_args
        if sampler_cls is None:
            self._sampler = None
        else:
            self._sampler = self.make_sampler(sampler_cls,
                                              sampler_args=sampler_args,
                                              n_workers=n_workers,
                                              worker_class=worker_class,
                                              worker_args=worker_args)

        self._has_setup = True

        self._setup_args = SetupArgs(sampler_cls=sampler_cls,
                                     sampler_args=sampler_args,
                                     seed=get_seed())

        self.irl_model_wt = irl_model_wt
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.discrim_train_itrs = discrim_train_itrs
        self.discrim_batch_size = discrim_batch_size
        self.no_reward = zero_environment_reward