Code Example #1
File: rl2env.py Project: andCelli/garage
    def __init__(self, env, max_obs_dim=None):
        super().__init__(env)
        self._max_obs_dim = max_obs_dim
        action_space = akro.from_gym(self.env.action_space)
        observation_space = self._create_rl2_obs_space(env)
        self._spec = EnvSpec(action_space=action_space,
                             observation_space=observation_space)
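For reference, a minimal self-contained sketch of the same pattern — building an EnvSpec directly from akro spaces. The bounds, shapes, and the garage.envs import path here are illustrative assumptions, not taken from the project above.

import akro
import numpy as np
from garage.envs import EnvSpec

action_space = akro.Box(low=-1.0, high=1.0, shape=(2, ), dtype=np.float32)
observation_space = akro.Box(low=-np.inf, high=np.inf, shape=(4, ),
                             dtype=np.float32)
# EnvSpec simply bundles the two spaces so downstream components can read them.
spec = EnvSpec(action_space=action_space, observation_space=observation_space)
assert spec.action_space is action_space
assert spec.observation_space is observation_space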
Code Example #2
    def __init__(self,
                 goal_reward=10,
                 actuation_cost_coeff=30,
                 distance_cost_coeff=1,
                 init_sigma=0.1):
        super().__init__()
        Serializable.quick_init(self, locals())

        self.dynamics = PointDynamics(dim=2, sigma=0)
        self.init_mu = np.zeros(2, dtype=np.float32)
        self.init_sigma = init_sigma
        self.goal_positions = np.array([[5, 0], [-5, 0], [0, 5], [0, -5]],
                                       dtype=np.float32)
        self.goal_threshold = 1.
        self.goal_reward = goal_reward
        self.action_cost_coeff = actuation_cost_coeff
        self.distance_cost_coeff = distance_cost_coeff
        self.xlim = (-7, 7)
        self.ylim = (-7, 7)
        self.vel_bound = 1.
        self.reset()
        self.observation = None

        self._ax = None
        self._env_lines = list()
        self.fixed_plots = None
        self.dynamic_plots = []
        self.__spec = EnvSpec(action_space=self.action_space,
                              observation_space=self.observation_space)
Code Example #3
    def test_pickleable(self):
        env_spec = EnvSpec(akro.Box(-1, 1, (1, )), akro.Box(-2, 2, (2, )), 500)
        round_trip = pickle.loads(pickle.dumps(env_spec))
        assert round_trip
        assert round_trip.action_space == env_spec.action_space
        assert round_trip.observation_space == env_spec.observation_space
        assert round_trip.max_episode_length == env_spec.max_episode_length
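Examples #12 and #14 below pass the observation space first positionally, and this test adds an optional max_episode_length as the third argument. A hedged sketch of that positional form (the spaces are illustrative, and the argument order is an assumption worth verifying against your garage version):

import akro
from garage.envs import EnvSpec

spec = EnvSpec(akro.Box(-1, 1, (3, )), akro.Discrete(2), 100)
assert spec.max_episode_length == 100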
Code Example #4
File: bullet_env.py Project: songanz/garage
    def __init__(self, env=None, env_name='', is_image=False):
        """Returns a Garage wrapper class for bullet-based gym.Env.

        Args:
            env (gym.wrappers.time_limit): A gym.wrappers.time_limit.TimeLimit
                object wrapping a gym.Env created via gym.make().
            env_name (str): If env_name is specified, a gym environment
                with that name will be created. If such an environment does not
                exist, a `gym.error` is raised.
            is_image (bool): True if observations contain pixel values,
                False otherwise. Setting this to True converts a gym.spaces.Box
                obs space to an akro.Image and normalizes pixel values.

        """
        if not env:
            # 'RacecarZedBulletEnv-v0' environment enables rendering by
            # default, while pybullet allows only one GUI connection at a time.
            # Setting renders to False avoids potential error when multiple
            # of these envs are tested at the same time.
            if env_name == 'RacecarZedBulletEnv-v0':
                env = gym.make(env_name, renders=False)
            else:
                env = gym.make(env_name)

        # Needed for deserialization
        self._env = env
        self._env_name = env_name

        super().__init__(env)
        self.action_space = akro.from_gym(self.env.action_space)
        self.observation_space = akro.from_gym(self.env.observation_space,
                                               is_image=is_image)
        self._spec = EnvSpec(action_space=self.action_space,
                             observation_space=self.observation_space)
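A hedged usage sketch for the wrapper above. The import path, the pybullet environment id, and the need to import pybullet_envs for registration are assumptions about the surrounding project, not guaranteed API.

import gym
import pybullet_envs  # noqa: F401  (registers the Bullet gym environments)
from garage.envs.bullet import BulletEnv

env = BulletEnv(gym.make('CartPoleBulletEnv-v1'))
print(env.action_space)        # akro space converted via akro.from_gym
print(env.observation_space)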
Code Example #5
File: garage_env.py Project: maciejwolczyk/garage-1
    def __init__(self, env=None, env_name='', is_image=False):
        """Initializes a GarageEnv.

        Args:
            env (gym.wrappers.time_limit): A gym.wrappers.time_limit.TimeLimit
                object wrapping a gym.Env created via gym.make().
            env_name (str): If env_name is specified, a gym environment
                with that name will be created. If such an environment does not
                exist, a `gym.error` is raised.
            is_image (bool): True if observations contain pixel values,
                False otherwise. Setting this to True converts a gym.spaces.Box
                obs space to an akro.Image and normalizes pixel values.
        """
        # Needed for deserialization
        self._env_name = env_name
        self._env = env

        if env_name:
            super().__init__(gym.make(env_name))
        else:
            super().__init__(env)

        self.action_space = akro.from_gym(self.env.action_space)
        self.observation_space = akro.from_gym(self.env.observation_space,
                                               is_image=is_image)
        self._spec = EnvSpec(action_space=self.action_space,
                             observation_space=self.observation_space)
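A hedged sketch of how the spec built above is typically read back. The env.spec attribute, the flat_dim property, and the CartPole id are assumptions that may vary across garage/akro versions.

import gym
from garage.envs import GarageEnv

env = GarageEnv(gym.make('CartPole-v1'))
spec = env.spec                              # the EnvSpec assembled in __init__
obs_dim = spec.observation_space.flat_dim    # akro spaces expose flat_dim
act_dim = spec.action_space.flat_dim
print(obs_dim, act_dim)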
Code Example #6
    def spec(self):
        """
        Returns an EnvSpec.

        Returns:
            spec (garage.envs.EnvSpec)
        """
        return EnvSpec(observation_space=self.observation_space,
                       action_space=self.action_space)
Code Example #7
    def spec(self):
        r"""Returns a garage environment specification.

        Returns
        -------
        :py:class:`garage.envs.env_spec.EnvSpec`
            A garage environment specification.
        """
        return EnvSpec(
            observation_space=self.observation_space,
            action_space=self.action_space)
Code Example #8
    def __init__(self, env, task_index, n_total_tasks):
        assert 0 <= task_index < n_total_tasks
        super().__init__(env)
        self._task_index = task_index
        self._n_total_tasks = n_total_tasks
        env_lb = self.env.observation_space.low
        env_ub = self.env.observation_space.high
        one_hot_ub = np.ones(self._n_total_tasks)
        one_hot_lb = np.zeros(self._n_total_tasks)
        self.observation_space = akro.Box(np.concatenate([env_lb, one_hot_lb]),
                                          np.concatenate([env_ub, one_hot_ub]))
        self.__spec = EnvSpec(action_space=self.action_space,
                              observation_space=self.observation_space)
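A small numeric sketch of what the wrapper above produces: the base observation gets a one-hot task vector appended, and the Box bounds are widened accordingly (toy values only):

import numpy as np

n_total_tasks, task_index = 3, 1
base_obs = np.array([0.2, -0.5])             # observation from the wrapped env
one_hot = np.zeros(n_total_tasks)
one_hot[task_index] = 1.0
augmented_obs = np.concatenate([base_obs, one_hot])
assert augmented_obs.shape == (base_obs.shape[0] + n_total_tasks, )
# augmented_obs -> [ 0.2 -0.5  0.   1.   0. ]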
Code Example #9
File: base.py Project: wjssx/garage
    def __init__(self, env=None, env_name=''):
        # Needed for deserialization
        self._env_name = env_name
        self._env = env

        if env_name:
            super().__init__(gym.make(env_name))
        else:
            super().__init__(env)

        self.action_space = akro.from_gym(self.env.action_space)
        self.observation_space = akro.from_gym(self.env.observation_space)
        self.__spec = EnvSpec(action_space=self.action_space,
                              observation_space=self.observation_space)
Code Example #10
File: base.py Project: ZaneH1992/garage
    def __init__(self, env=None, env_name=''):
        if env_name:
            super().__init__(gym.make(env_name))
        else:
            super().__init__(env)

        self.action_space = akro.from_gym(self.env.action_space)
        self.observation_space = akro.from_gym(self.env.observation_space)
        if self.spec:
            self.spec.action_space = self.action_space
            self.spec.observation_space = self.observation_space
        else:
            self.spec = EnvSpec(action_space=self.action_space,
                                observation_space=self.observation_space)

        Serializable.quick_init(self, locals())
Code Example #11
    def __init__(self, env=None, env_name='', is_image=False):
        # Needed for deserialization
        self._env_name = env_name
        self._env = env

        if env_name:
            print(env_name)
            # @TODO fix this line blowing things up
            super().__init__(gym.make(env_name))
        else:
            super().__init__(env)

        self.action_space = akro.from_gym(self.env.action_space)
        self.observation_space = akro.from_gym(self.env.observation_space,
                                               is_image=is_image)
        self.__spec = EnvSpec(action_space=self.action_space,
                              observation_space=self.observation_space)
Code Example #12
    def setup_method(self):
        """Setup for all test methods."""
        self.latent_dim = 5
        self.env_spec = TfEnv(DummyBoxEnv())
        latent_space = akro.Box(low=-1,
                                high=1,
                                shape=(self.latent_dim, ),
                                dtype=np.float32)

        # add latent space to observation space to create a new space
        augmented_obs_space = akro.Tuple(
            (self.env_spec.observation_space, latent_space))
        augmented_env_spec = EnvSpec(augmented_obs_space,
                                     self.env_spec.action_space)

        self.obs_dim = int(np.prod(self.env_spec.observation_space.shape))
        self.action_dim = int(np.prod(self.env_spec.action_space.shape))
        reward_dim = 1
        self.encoder_input_dim = self.obs_dim + self.action_dim + reward_dim
        encoder_output_dim = self.latent_dim * 2
        encoder_hidden_sizes = (3, 2, encoder_output_dim)

        context_encoder = MLPEncoder(input_dim=self.encoder_input_dim,
                                     output_dim=encoder_output_dim,
                                     hidden_nonlinearity=None,
                                     hidden_sizes=encoder_hidden_sizes,
                                     hidden_w_init=nn.init.ones_,
                                     output_w_init=nn.init.ones_)

        context_policy = TanhGaussianMLPPolicy(env_spec=augmented_env_spec,
                                               hidden_sizes=(3, 5, 7),
                                               hidden_nonlinearity=F.relu,
                                               output_nonlinearity=None)

        self.module = ContextConditionedPolicy(latent_dim=self.latent_dim,
                                               context_encoder=context_encoder,
                                               policy=context_policy,
                                               use_information_bottleneck=True,
                                               use_next_obs=False)
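The dimension bookkeeping from the setup above, isolated with illustrative numbers (DummyBoxEnv's real shapes may differ): the context encoder consumes a flattened (observation, action, reward) transition and, with use_information_bottleneck=True, emits a mean and a variance per latent dimension.

obs_dim, action_dim, reward_dim = 4, 2, 1     # illustrative only
latent_dim = 5
encoder_input_dim = obs_dim + action_dim + reward_dim   # one context transition
encoder_output_dim = latent_dim * 2                     # mean and variance of q(z | context)
assert (encoder_input_dim, encoder_output_dim) == (7, 10)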
Code Example #13
    def __init__(self, env=None, env_name='', is_image=False):
        # Needed for deserialization
        self._env_name = env_name
        self._env = env

        if env_name:
            super().__init__(gym.make(env_name))
        else:
            super().__init__(env)

        if isinstance(self.env.action_space, Box):
            self.action_space = akro.Box(low=self.env.action_space.low,
                                         high=self.env.action_space.high)
            self.observation_space = akro.Image(
                shape=self.env.observation_space.shape)
        else:
            self.action_space = akro.from_gym(self.env.action_space)
            self.observation_space = akro.from_gym(self.env.observation_space,
                                                   is_image=is_image)

        self.__spec = EnvSpec(action_space=self.action_space,
                              observation_space=self.observation_space)
Code Example #14
    def test_init_std(self):
        sess = tf.Session()
        sess.__enter__()

        task_space = Box(
            np.zeros(8, dtype=np.float32),
            np.ones(8, dtype=np.float32))
        latent_space = Box(
            np.zeros(4, dtype=np.float32),
            np.ones(4, dtype=np.float32))
        embed_spec = EmbeddingSpec(task_space, latent_space)
        embedding = GaussianMLPEmbedding(
            name="embedding",
            embedding_spec=embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=1.0,
            max_std=2.0,
        )

        std_parameterization = "exp"
        observation_space = Box(np.full(100, 0.0), np.full(100, 100.0))
        action_space = Box(np.full(10, 0.0), np.full(10, 10.0))
        env_spec = EnvSpec(observation_space, action_space)
        policy = GaussianMLPMultitaskPolicy(
            name="policy",
            env_spec=env_spec,
            task_space=task_space,
            embedding=embedding,
            hidden_sizes=(200, 100),
            std_share_network=True,
            # max_std=10.0,
            init_std=6.0,
            std_parameterization=std_parameterization,
        )

        sess.run(tf.global_variables_initializer())

        z = latent_space.sample()
        # z = np.ones_like(latent_space.low)
        print("|z| = {}".format(np.linalg.norm(z)))
        # z = z / np.linalg.norm(z)

        o = observation_space.sample()
        # o = np.ones_like(observation_space.low)
        print("|o| = {}".format(np.linalg.norm(o)))
        # o = o / np.linalg.norm(o)

        a, info = policy.get_action_from_latent(z, o)

        log_stds = info["log_std"]
        if std_parameterization == "exp":
            stds = np.exp(log_stds)
        elif std_parameterization == "softplus":
            stds = np.log(1. + np.exp(log_stds))
        else:
            raise NotImplementedError

        print("log_stds = {}".format(log_stds))
        print("stds = {}".format(stds))
        print("mean(stds) = {}".format(np.mean(stds)))
        print("std(stds) = {}".format(np.std(stds)))

        assert np.allclose(stds, 1.0), "stds: {}".format(stds)
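For reference, the two std parameterizations the test branches on, as a standalone sketch (same formulas as above, toy values):

import numpy as np

log_stds = np.array([-0.5, 0.0, 0.5])
stds_exp = np.exp(log_stds)                       # "exp": std = e**log_std
stds_softplus = np.log(1. + np.exp(log_stds))     # "softplus": std = log(1 + e**x)
print(stds_exp, stds_softplus)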
Code Example #15
def run_task(v):
    v = SimpleNamespace(**v)

    task_names = sorted(v.tasks.keys())
    task_args = [v.tasks[t]['args'] for t in task_names]
    task_kwargs = [v.tasks[t]['kwargs'] for t in task_names]

    with TaskEmbeddingRunner() as runner:
        # Environment
        env = TfEnv(
                normalize(
                  MultiTaskEnv(
                    task_env_cls=FlatTorqueReacher,
                    task_args=task_args,
                    task_kwargs=task_kwargs)))

        # Latent space and embedding specs
        # TODO(gh/10): this should probably be done in Embedding or Algo
        latent_lb = np.zeros(v.latent_length, )
        latent_ub = np.ones(v.latent_length, )
        latent_space = Box(latent_lb, latent_ub)

        # trajectory space is (TRAJ_ENC_WINDOW, act_obs) where act_obs is a stacked
        # vector of flattened actions and observations
        act_lb, act_ub = env.action_space.bounds
        act_lb_flat = env.action_space.flatten(act_lb)
        act_ub_flat = env.action_space.flatten(act_ub)
        obs_lb, obs_ub = env.observation_space.bounds
        obs_lb_flat = env.observation_space.flatten(obs_lb)
        obs_ub_flat = env.observation_space.flatten(obs_ub)
        # act_obs_lb = np.concatenate([act_lb_flat, obs_lb_flat])
        # act_obs_ub = np.concatenate([act_ub_flat, obs_ub_flat])
        act_obs_lb = obs_lb_flat
        act_obs_ub = obs_ub_flat
        # act_obs_lb = act_lb_flat
        # act_obs_ub = act_ub_flat
        traj_lb = np.stack([act_obs_lb] * v.inference_window)
        traj_ub = np.stack([act_obs_ub] * v.inference_window)
        traj_space = Box(traj_lb, traj_ub)

        task_embed_spec = EmbeddingSpec(env.task_space, latent_space)
        traj_embed_spec = EmbeddingSpec(traj_space, latent_space)
        task_obs_space = concat_spaces(env.task_space, env.observation_space)
        env_spec_embed = EnvSpec(task_obs_space, env.action_space)

        # TODO(): rename to inference_network
        traj_embedding = GaussianMLPEmbedding(
            name="inference",
            embedding_spec=traj_embed_spec,
            hidden_sizes=(64, 64),
            std_share_network=True,
            init_std=1.0,
        )

        # Embeddings
        task_embedding = GaussianMLPEmbedding(
            name="embedding",
            embedding_spec=task_embed_spec,
            hidden_sizes=(64, 64),
            std_share_network=True,
            init_std=v.embedding_init_std,  # 1.0
            max_std=v.embedding_max_std,  # 2.0
            # std_parameterization="softplus",
        )

        # Multitask policy
        policy = GaussianMLPMultitaskPolicy(
            name="policy",
            env_spec=env.spec,
            task_space=env.task_space,
            embedding=task_embedding,
            hidden_sizes=(64, 32),
            std_share_network=True,
            init_std=v.policy_init_std,
            max_std=v.policy_max_std,
            # std_parameterization="softplus",
        )

        # baseline = MultiTaskLinearFeatureBaseline(env_spec=env_spec_embed)
        extra = v.latent_length + len(v.tasks)
        baseline = MultiTaskGaussianMLPBaseline(
            env_spec=env.spec, extra_dims=extra)

        algo = PPOTaskEmbedding(
            env=env,
            policy=policy,
            baseline=baseline,
            inference=traj_embedding,
            batch_size=v.batch_size,  # 4096
            max_path_length=v.max_path_length,
            discount=0.99,
            lr_clip_range=0.2,
            policy_ent_coeff=v.policy_ent_coeff,
            embedding_ent_coeff=v.embedding_ent_coeff,
            inference_ce_coeff=v.inference_ce_coeff,
            #optimizer_args=dict(max_grad_norm=0.5)
        )
        runner.setup(algo, env, batch_size=v.batch_size,
            max_path_length=v.max_path_length)
        runner.train(n_epochs=1000, plot=False)
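A minimal sketch of the trajectory-space construction used in these run_task scripts, with placeholder bounds (the real bounds come from the flattened observation space and v.inference_window):

import numpy as np

inference_window = 3
obs_lb_flat = np.full(6, -1.0)    # placeholder flattened lower bound
obs_ub_flat = np.full(6, 1.0)     # placeholder flattened upper bound
traj_lb = np.stack([obs_lb_flat] * inference_window)
traj_ub = np.stack([obs_ub_flat] * inference_window)
assert traj_lb.shape == (inference_window, 6)     # (window, flattened act_obs)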
Code Example #16
def run_task(v):
    v = SimpleNamespace(**v)

    task_names = sorted(v.tasks.keys())
    task_args = [v.tasks[t]['args'] for t in task_names]
    task_kwargs = [v.tasks[t]['kwargs'] for t in task_names]

    with TaskEmbeddingRunner() as runner:
        # Environment
        env = TfEnv(
            MultiTaskEnv(task_env_cls=SimpleReacherEnv,
                         task_args=task_args,
                         task_kwargs=task_kwargs))

        # Latent space and embedding specs
        # TODO(gh/10): this should probably be done in Embedding or Algo
        latent_lb = np.zeros(v.latent_length, )
        latent_ub = np.ones(v.latent_length, )
        latent_space = Box(latent_lb, latent_ub)

        # trajectory space is (TRAJ_ENC_WINDOW, act_obs) where act_obs is a stacked
        # vector of flattened actions and observations
        act_lb, act_ub = env.action_space.bounds
        act_lb_flat = env.action_space.flatten(act_lb)
        act_ub_flat = env.action_space.flatten(act_ub)
        obs_lb, obs_ub = env.observation_space.bounds
        obs_lb_flat = env.observation_space.flatten(obs_lb)
        obs_ub_flat = env.observation_space.flatten(obs_ub)
        # act_obs_lb = np.concatenate([act_lb_flat, obs_lb_flat])
        # act_obs_ub = np.concatenate([act_ub_flat, obs_ub_flat])
        act_obs_lb = obs_lb_flat
        act_obs_ub = obs_ub_flat
        # act_obs_lb = act_lb_flat
        # act_obs_ub = act_ub_flat
        traj_lb = np.stack([act_obs_lb] * v.inference_window)
        traj_ub = np.stack([act_obs_ub] * v.inference_window)
        traj_space = Box(traj_lb, traj_ub)

        task_embed_spec = EmbeddingSpec(env.task_space, latent_space)
        traj_embed_spec = EmbeddingSpec(traj_space, latent_space)
        task_obs_space = concat_spaces(env.task_space, env.observation_space)
        env_spec_embed = EnvSpec(task_obs_space, env.action_space)

        # TODO(): rename to inference_network
        traj_embedding = GaussianMLPEmbedding(
            name="inference",
            embedding_spec=traj_embed_spec,
            hidden_sizes=(200,
                          100),  # was the same size as policy in Karol's paper
            std_share_network=True,
            init_std=2.0,
        )

        # Embeddings
        task_embedding = GaussianMLPEmbedding(
            name="embedding",
            embedding_spec=task_embed_spec,
            hidden_sizes=(200, 200),
            std_share_network=True,
            init_std=v.embedding_init_std,
            max_std=v.embedding_max_std,
        )

        # Multitask policy
        policy = GaussianMLPMultitaskPolicy(
            name="policy",
            env_spec=env.spec,
            task_space=env.task_space,
            embedding=task_embedding,
            hidden_sizes=(200, 100),
            std_share_network=True,
            init_std=v.policy_init_std,
        )

        extra = v.latent_length + len(v.tasks)
        baseline = MultiTaskGaussianMLPBaseline(
            env_spec=env.spec,
            extra_dims=extra,
            regressor_args=dict(hidden_sizes=(200, 100)),
        )

        algo = PPOTaskEmbedding(
            env=env,
            policy=policy,
            baseline=baseline,
            inference=traj_embedding,
            max_path_length=v.max_path_length,
            n_itr=2000,
            discount=0.99,
            lr_clip_range=0.2,
            policy_ent_coeff=v.policy_ent_coeff,
            embedding_ent_coeff=v.embedding_ent_coeff,
            inference_ce_coeff=v.inference_ce_coeff,
            use_softplus_entropy=True,
        )
        runner.setup(algo,
                     env,
                     batch_size=v.batch_size,
                     max_path_length=v.max_path_length)
        runner.train(n_epochs=2000, plot=False)
Code Example #17
    data = [np.sin(matrices[i]) for i in range(100)]
    obs = [{'observations': [x], 'returns': [np.mean(x)]} for x in data]

    observations = np.concatenate([p['observations'] for p in obs])
    returns = np.concatenate([p['returns'] for p in obs])
    returns = returns.reshape((-1, 1))

    paths = {'observations': [np.sin(matrices[i]) for i in range(100, 110)]}

    expected = [[np.mean(x)] for x in paths['observations']]

    return (obs, observations, returns), (paths, expected)


test_env_spec = EnvSpec(observation_space=akro.Box(low=-1,
                                                   high=1,
                                                   shape=(10, 10, 3)),
                        action_space=None)


class TestGaussianCNNBaseline(TfGraphTestCase):

    @pytest.mark.large
    def test_fit_normalized(self):
        gcr = GaussianCNNBaseline(env_spec=test_env_spec,
                                  filters=((3, (3, 3)), (6, (3, 3))),
                                  strides=(1, 1),
                                  padding='SAME',
                                  hidden_sizes=(32, ),
                                  adaptive_std=False,
                                  use_trust_region=True)
Code Example #18
    def test_pickleable(self):
        env_spec = EnvSpec(Box(-1, 1, (1, )), Box(-2, 2, (2, )))
        round_trip = pickle.loads(pickle.dumps(env_spec))
        assert round_trip
        assert round_trip.action_space == env_spec.action_space
        assert round_trip.observation_space == env_spec.observation_space
Code Example #19
def run_task(v):
    v = SimpleNamespace(**v)

    task_names = sorted(v.tasks.keys())
    task_args = [v.tasks[t]['args'] for t in task_names]
    task_kwargs = [v.tasks[t]['kwargs'] for t in task_names]

    # Environment
    env = TfEnv(
        normalize(
            MultiTaskEnv(
                task_env_cls=Point3dEnv,
                task_args=task_args,
                task_kwargs=task_kwargs)))

    # Latent space and embedding specs
    # TODO(gh/10): this should probably be done in Embedding or Algo
    latent_lb = np.zeros(v.latent_length, )
    latent_ub = np.ones(v.latent_length, )
    latent_space = Box(latent_lb, latent_ub)

    # trajectory space is (TRAJ_ENC_WINDOW, act_obs) where act_obs is a stacked
    # vector of flattened actions and observations
    act_lb, act_ub = env.action_space.bounds
    act_lb_flat = env.action_space.flatten(act_lb)
    act_ub_flat = env.action_space.flatten(act_ub)
    obs_lb, obs_ub = env.observation_space.bounds
    obs_lb_flat = env.observation_space.flatten(obs_lb)
    obs_ub_flat = env.observation_space.flatten(obs_ub)
    # act_obs_lb = np.concatenate([act_lb_flat, obs_lb_flat])
    # act_obs_ub = np.concatenate([act_ub_flat, obs_ub_flat])
    act_obs_lb = obs_lb_flat
    act_obs_ub = obs_ub_flat
    # act_obs_lb = act_lb_flat
    # act_obs_ub = act_ub_flat
    traj_lb = np.stack([act_obs_lb] * v.inference_window)
    traj_ub = np.stack([act_obs_ub] * v.inference_window)
    traj_space = Box(traj_lb, traj_ub)

    task_embed_spec = EmbeddingSpec(env.task_space, latent_space)
    traj_embed_spec = EmbeddingSpec(traj_space, latent_space)
    task_obs_space = concat_spaces(env.task_space, env.observation_space)
    env_spec_embed = EnvSpec(task_obs_space, env.action_space)

    # TODO(): rename to inference_network
    traj_embedding = GaussianMLPEmbedding(
        name="inference",
        embedding_spec=traj_embed_spec,
        hidden_sizes=(20, 20),
        std_share_network=True,
        init_std=1.0,
    )

    # Embeddings
    task_embedding = GaussianMLPEmbedding(
        name="embedding",
        embedding_spec=task_embed_spec,
        hidden_sizes=(20, 20),
        std_share_network=True,
        init_std=1.0,
        max_std=2.0,
        # normalize=True,
    )

    # Multitask policy
    policy = GaussianMLPMultitaskPolicy(
        name="policy",
        env_spec=env.spec,
        task_space=env.task_space,
        embedding=task_embedding,
        hidden_sizes=(20, 10),
        std_share_network=True,
        # max_std=6.0,
        init_std=6.0,
    )

    # baseline = MultiTaskLinearFeatureBaseline(env_spec=env_spec_embed)
    extra = v.latent_length + len(v.tasks)
    baseline = MultiTaskGaussianMLPBaseline(
        env_spec=env.spec, extra_dims=extra)

    algo = PPOTaskEmbedding(
        env=env,
        policy=policy,
        baseline=baseline,
        inference=traj_embedding,
        batch_size=v.batch_size,  # 4096
        max_path_length=50,
        n_itr=500,
        discount=0.99,
        step_size=0.2,
        plot=True,
        policy_ent_coeff=v.policy_ent_coeff,
        embedding_ent_coeff=v.embedding_ent_coeff,
        inference_ce_coeff=v.inference_ce_coeff,
        num_tasks_held_out=1,
    )
    algo.train()
Code Example #20
def run_task(*_):
    with TaskEmbeddingRunner() as runner:
        # Environment
        env = TfEnv(
            MultiTaskEnv(
                task_env_cls=PointEnv,
                task_args=TASK_ARGS,
                task_kwargs=TASK_KWARGS))

        # Latent space and embedding specs
        # TODO(gh/10): this should probably be done in Embedding or Algo
        latent_lb = np.zeros(LATENT_LENGTH, )
        latent_ub = np.ones(LATENT_LENGTH, )
        latent_space = Box(latent_lb, latent_ub)

        # trajectory space is (TRAJ_ENC_WINDOW, act_obs) where act_obs is a stacked
        # vector of flattened actions and observations
        act_lb, act_ub = env.action_space.bounds
        act_lb_flat = env.action_space.flatten(act_lb)
        act_ub_flat = env.action_space.flatten(act_ub)
        obs_lb, obs_ub = env.observation_space.bounds
        obs_lb_flat = env.observation_space.flatten(obs_lb)
        obs_ub_flat = env.observation_space.flatten(obs_ub)
        # act_obs_lb = np.concatenate([act_lb_flat, obs_lb_flat])
        # act_obs_ub = np.concatenate([act_ub_flat, obs_ub_flat])
        act_obs_lb = obs_lb_flat
        act_obs_ub = obs_ub_flat
        # act_obs_lb = act_lb_flat
        # act_obs_ub = act_ub_flat
        traj_lb = np.stack([act_obs_lb] * TRAJ_ENC_WINDOW)
        traj_ub = np.stack([act_obs_ub] * TRAJ_ENC_WINDOW)
        traj_space = Box(traj_lb, traj_ub)

        task_embed_spec = EmbeddingSpec(env.task_space, latent_space)
        traj_embed_spec = EmbeddingSpec(traj_space, latent_space)
        task_obs_space = concat_spaces(env.task_space, env.observation_space)
        env_spec_embed = EnvSpec(task_obs_space, env.action_space)

        # Embeddings
        task_embedding = GaussianMLPEmbedding(
            name="embedding",
            embedding_spec=task_embed_spec,
            hidden_sizes=(20, 20),
            std_share_network=True,
            init_std=3.0,  # 2.0
        )

        # TODO(): rename to inference_network
        traj_embedding = GaussianMLPEmbedding(
            name="inference",
            embedding_spec=traj_embed_spec,
            hidden_sizes=(20, 10),  # was the same size as policy in Karol's paper
            std_share_network=True,
        )

        # Multitask policy
        policy = GaussianMLPMultitaskPolicy(
            name="policy",
            env_spec=env.spec,
            task_space=env.task_space,
            embedding=task_embedding,
            hidden_sizes=(20, 10),
            std_share_network=True,  # Must be True for embedding learning
            init_std=6.0,  # 4.5 6.0
        )

        baseline = MultiTaskLinearFeatureBaseline(env_spec=env_spec_embed)

        max_path_length = 50
        algo = TRPOTaskEmbedding(
            env=env,
            policy=policy,
            baseline=baseline,
            inference=traj_embedding,
            max_path_length=max_path_length,
            discount=0.99,
            max_kl_step=0.2,
            policy_ent_coeff=1e-7,  # 1e-7
            embedding_ent_coeff=1e-3,  # 1e-3
            inference_ce_coeff=1e-7,  # 1e-7
            # kl_constraint=KLConstraint.SOFT,
            # optimizer_args=dict(max_penalty=1e9),
        )
        runner.setup(algo, env, batch_size=20000,
            max_path_length=max_path_length)
        runner.train(n_epochs=1000, plot=False)
Code Example #21
File: test_pearl_worker.py Project: fangqyi/garage
def test_methods():
    """Test PEARLWorker methods."""
    env_spec = TfEnv(DummyBoxEnv())
    latent_dim = 5
    latent_space = akro.Box(low=-1,
                            high=1,
                            shape=(latent_dim, ),
                            dtype=np.float32)

    # add latent space to observation space to create a new space
    augmented_obs_space = akro.Tuple(
        (env_spec.observation_space, latent_space))
    augmented_env_spec = EnvSpec(augmented_obs_space, env_spec.action_space)

    obs_dim = int(np.prod(env_spec.observation_space.shape))
    action_dim = int(np.prod(env_spec.action_space.shape))
    reward_dim = 1
    encoder_input_dim = obs_dim + action_dim + reward_dim
    encoder_output_dim = latent_dim * 2
    encoder_hidden_sizes = (3, 2, encoder_output_dim)

    context_encoder = MLPEncoder(input_dim=encoder_input_dim,
                                 output_dim=encoder_output_dim,
                                 hidden_nonlinearity=None,
                                 hidden_sizes=encoder_hidden_sizes,
                                 hidden_w_init=nn.init.ones_,
                                 output_w_init=nn.init.ones_)

    policy = TanhGaussianMLPPolicy(env_spec=augmented_env_spec,
                                   hidden_sizes=(3, 5, 7),
                                   hidden_nonlinearity=F.relu,
                                   output_nonlinearity=None)

    context_policy = ContextConditionedPolicy(latent_dim=latent_dim,
                                              context_encoder=context_encoder,
                                              policy=policy,
                                              use_information_bottleneck=True,
                                              use_next_obs=False)

    max_path_length = 20
    worker1 = PEARLWorker(seed=1,
                          max_path_length=max_path_length,
                          worker_number=1)
    worker1.update_agent(context_policy)
    worker1.update_env(env_spec)
    rollouts = worker1.rollout()

    assert rollouts.observations.shape == (max_path_length, obs_dim)
    assert rollouts.actions.shape == (max_path_length, action_dim)
    assert rollouts.rewards.shape == (max_path_length, )

    worker2 = PEARLWorker(seed=1,
                          max_path_length=max_path_length,
                          worker_number=1,
                          deterministic=True,
                          accum_context=True)
    worker2.update_agent(context_policy)
    worker2.update_env(env_spec)
    rollouts = worker2.rollout()

    assert context_policy.context.shape == (1, max_path_length,
                                            encoder_input_dim)
    assert rollouts.observations.shape == (max_path_length, obs_dim)
    assert rollouts.actions.shape == (max_path_length, action_dim)
    assert rollouts.rewards.shape == (max_path_length, )