Example #1
    def __init__(self,
                 name,
                 observation_shape,
                 action_shape,
                 hid_size,
                 num_hid_layers,
                 stochastic=True):
        with tf.variable_scope(name):
            self.stochastic = stochastic
            self.hid_size, self.num_hid_layers = hid_size, num_hid_layers
            self.action_shape, self.observation_shape = action_shape, observation_shape
            self.scope = tf.get_variable_scope().name
            self.pdtype = DiagGaussianPdType(action_shape[0])

            observations_ph = U.get_placeholder(name='ob',
                                                dtype=tf.float32,
                                                shape=[None] +
                                                list(observation_shape))
            stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())

            with tf.variable_scope('obfilter'):
                self.ob_rms = RunningMeanStd(shape=observation_shape)

            with tf.variable_scope('pol'):
                last_out = tf.clip_by_value(
                    (observations_ph - self.ob_rms.mean) / self.ob_rms.std,
                    -5.0, 5.0)
                for i in range(num_hid_layers):
                    last_out = tf.nn.tanh(
                        tf.layers.dense(
                            last_out,
                            hid_size,
                            name='fc%i' % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))

                mean = tf.layers.dense(
                    last_out,
                    self.pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name='logstd',
                    shape=[1, self.pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)

            self.pd = self.pdtype.pdfromflat(pdparam)

            action_op = U.switch(stochastic_ph, self.pd.sample(),
                                 self.pd.mode())
            self._act = U.function([stochastic_ph, observations_ph], action_op)
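The `mean * 0.0 + logstd` concatenation above is a broadcasting trick: it tiles the single (1, ac_dim) logstd row across the batch so that every row of pdparam reads [mean, logstd], the flat layout that DiagGaussianPdType.pdfromflat splits back into its two halves. A small numpy illustration of that layout (hypothetical numbers):

import numpy as np

mean = np.array([[0.1, -0.2],                         # batch of 2 means, ac_dim = 2
                 [0.3,  0.0]], dtype=np.float32)
logstd = np.array([[-0.5, -0.5]], dtype=np.float32)   # one shared (1, ac_dim) row
pdparam = np.concatenate([mean, mean * 0.0 + logstd], axis=1)
# pdparam.shape == (2, 4); each row is [mu_1, mu_2, logstd_1, logstd_2]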
Example #2
def make_pdtype(ac_space):
    from cadm import spaces as custom_spaces
    from gym import spaces
    if isinstance(ac_space, custom_spaces.Box):
        assert len(ac_space.shape) == 1
        return DiagGaussianPdType(ac_space.shape[0])
    elif isinstance(ac_space, spaces.Box):
        assert len(ac_space.shape) == 1
        return DiagGaussianPdType(ac_space.shape[0])
    elif isinstance(ac_space, spaces.Discrete):
        return CategoricalPdType(ac_space.n)
    elif isinstance(ac_space, spaces.MultiDiscrete):
        return MultiCategoricalPdType(ac_space.nvec)
    elif isinstance(ac_space, spaces.MultiBinary):
        return BernoulliPdType(ac_space.n)
    else:
        raise NotImplementedError
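A brief illustration of the dispatch, assuming gym is installed and that the cadm package imported unconditionally inside make_pdtype is importable (the custom Box branch is not exercised here):

import numpy as np
from gym import spaces

print(make_pdtype(spaces.Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32)))
# -> DiagGaussianPdType for a 3-dimensional diagonal Gaussian
print(make_pdtype(spaces.Discrete(5)))
# -> CategoricalPdType over 5 discrete actions
print(make_pdtype(spaces.MultiDiscrete([2, 3, 4])))
# -> MultiCategoricalPdType parameterised by nvec = [2, 3, 4]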
Example #3
    def _init(self, ob_space, ac_space, hid_layers=[],
              deterministic=True, diagonal=True, trainable_std=True,
              use_bias=True, use_critic=False,
              seed=None, verbose=True,
              hidden_W_init=U.normc_initializer(1.0),
              higher_mean_init=None,
              higher_logstd_init=tf.constant_initializer(np.log(0.11)),
              const_std_init=False):
        """Params:
            ob_space: task observation space
            ac_space : task action space
            hid__layers: list with width of each hidden layer
            deterministic: whether the actor is deterministic
            diagonal: whether the higher order policy has a diagonal covariance
            matrix
            use_bias: whether to include bias in neurons
            use_critic: whether to include a critic network
            seed: optional random seed
        """
        # Check environment's shapes
        assert isinstance(ob_space, gym.spaces.Box)
        assert len(ac_space.shape) == 1
        # Set seed
        if seed is not None:
            set_global_seeds(seed)
        # Set some attributes
        self.diagonal = diagonal
        self.use_bias = use_bias
        batch_length = None  # Accepts a sequence of episodes of arbitrary length
        self.ac_dim = ac_space.shape[0]
        self.ob_dim = ob_space.shape[0]
        self.linear = not hid_layers
        self.verbose = verbose
        self._ob = ob = U.get_placeholder(
            name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))

        # Actor (N.B.: weight initialization is irrelevant)
        with tf.variable_scope('actor'):
            last_out = ob
            for i, hid_size in enumerate(hid_layers):
                # Mlp feature extraction
                last_out = tf.nn.tanh(
                    tf.layers.dense(last_out, hid_size,
                                    name='fc%i' % (i+1),
                                    kernel_initializer=hidden_W_init,
                                    use_bias=use_bias))
            if deterministic and isinstance(ac_space, gym.spaces.Box):
                # Deterministic action selection
                self.actor_mean = actor_mean = \
                    tf.layers.dense(last_out, ac_space.shape[0],
                                    name='action',
                                    kernel_initializer=hidden_W_init,
                                    use_bias=use_bias)
            else:
                raise NotImplementedError

        # Get actor flatten weights
        with tf.variable_scope('actor') as scope:
            self.actor_weights = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                scope=scope.name)
            # flatten weights
            self.flat_actor_weights = tf.concat(
                [tf.reshape(w, [-1]) for w in self.actor_weights], axis=0)
            self._n_actor_weights = n_actor_weights = \
                self.flat_actor_weights.shape[0]

        # Higher order policy (Gaussian)
        with tf.variable_scope('higher'):
            if higher_mean_init is None:
                # Initial means sampled from a zero-mean normal with std 0.01
                higher_mean_init = tf.where(
                    tf.not_equal(self.flat_actor_weights,
                                 tf.constant(0, dtype=tf.float32)),
                    tf.random_normal(shape=[n_actor_weights.value],
                                     stddev=0.01),
                    tf.zeros(shape=[n_actor_weights]))  # bias init always zero
            self.higher_mean = tf.get_variable(
                name='higher_mean',
                initializer=higher_mean_init,
                shape=self.flat_actor_weights.get_shape())
            # Keep the weights' domain compact
            # self.higher_mean = higher_mean = tf.clip_by_value(
            #     self.higher_mean, -1, 1, 'higher_mean_clipped')
            higher_mean = self.higher_mean
            if diagonal:
                if const_std_init:
                    self.higher_logstd = higher_logstd = \
                        tf.get_variable(
                            name='higher_logstd',
                            initializer=higher_logstd_init,
                            trainable=trainable_std)
                else:
                    self.higher_logstd = higher_logstd = \
                        tf.get_variable(
                            name='higher_logstd',
                            shape=[n_actor_weights],
                            initializer=higher_logstd_init,
                            trainable=trainable_std)
                pdparam = tf.concat([higher_mean,
                                     higher_mean * 0. + higher_logstd],
                                    axis=0)
                self.pdtype = pdtype = \
                    DiagGaussianPdType(n_actor_weights.value)
            else:
                # Cholesky covariance matrix
                self.higher_logstd = higher_logstd = tf.get_variable(
                    name='higher_logstd',
                    shape=[n_actor_weights*(n_actor_weights + 1)//2],
                    initializer=tf.initializers.constant(0.))
                pdparam = tf.concat([higher_mean, higher_logstd],
                                    axis=0)
                self.pdtype = pdtype = CholeskyGaussianPdType(
                    n_actor_weights.value)

        # Sample actor weights
        self.pd = pdtype.pdfromflat(pdparam)
        sampled_actor_params = self.pd.sample()
        symm_sampled_actor_params = self.pd.sample_symmetric()
        self._sample_actor_params = U.function([], [sampled_actor_params])
        self._sample_symm_actor_params = U.function(
            [], list(symm_sampled_actor_params))

        # Assign actor weights
        with tf.variable_scope('actor') as scope:
            actor_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                             scope=scope.name)
            self._use_sampled_actor_params = \
                U.assignFromFlat(actor_params, sampled_actor_params)
            self._get_actor_params = U.GetFlat(actor_params)
            self._set_actor_params = U.SetFromFlat(actor_params)

        # Act
        self._action = action = actor_mean
        self._act = U.function([ob], [action])

        # Manage higher policy weights
        with tf.variable_scope('higher') as scope:
            self._higher_params = higher_params = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
            self.flat_higher_params = tf.concat([tf.reshape(w, [-1]) for w in
                                                 self._higher_params], axis=0)
            self._n_higher_params = self.flat_higher_params.shape[0]
            self._get_flat_higher_params = U.GetFlat(higher_params)
            self._set_higher_params = U.SetFromFlat(self._higher_params)

        # Evaluating
        self._actor_params_in = actor_params_in = \
            U.get_placeholder(name='actor_params_in',
                              dtype=tf.float32,
                              shape=[batch_length] + [n_actor_weights])
        self._rets_in = rets_in = \
            U.get_placeholder(name='returns_in',
                              dtype=tf.float32,
                              shape=[batch_length])
        ret_mean, ret_var = tf.nn.moments(rets_in, axes=[0])
        ret_std = tf.sqrt(ret_var)  # tf.nn.moments returns variance, not std
        self._get_ret_mean = U.function([self._rets_in], [ret_mean])
        self._get_ret_std = U.function([self._rets_in], [ret_std])
        self._logprobs = logprobs = self.pd.logp(actor_params_in)
        pgpe_times_n = U.flatgrad(logprobs*rets_in, higher_params)
        self._get_pgpe_times_n = U.function([actor_params_in, rets_in],
                                            [pgpe_times_n])
        self._get_actor_mean = U.function([ob], [self.actor_mean])
        self._get_higher_mean = U.function([ob], [self.higher_mean])
        self._get_higher_std = U.function([], tf.exp([self.higher_logstd]))

        # Batch off-policy PGPE
        self._probs = tf.exp(logprobs)
        self._behavioral = None
        self._renyi_other = None

        # Renyi computation
        self._det_sigma = tf.exp(tf.reduce_sum(self.higher_logstd))

        # Fisher computation (diagonal case)
        mean_fisher_diag = tf.exp(-2*self.higher_logstd)
        if trainable_std:
            cov_fisher_diag = mean_fisher_diag*0 + 2
            self._fisher_diag = tf.concat(
                [mean_fisher_diag, cov_fisher_diag], axis=0)
        else:
            self._fisher_diag = mean_fisher_diag
        self._get_fisher_diag = U.function([], [self._fisher_diag])
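A hedged sketch of how the handles defined above could drive one vanilla PGPE update. Assumptions not taken from the original code: `pol` is an instance of this policy class, a default TF session is active, `rollout(env, pol)` is a user-supplied function that runs one episode and returns its return, and `lr` is a step size.

import numpy as np

def pgpe_update(pol, env, rollout, batch_size=20, lr=1e-2):
    thetas, rets = [], []
    for _ in range(batch_size):
        theta = pol._sample_actor_params()[0]   # draw actor weights from the higher policy
        pol._set_actor_params(theta)            # load them into the actor network
        thetas.append(theta)
        rets.append(rollout(env, pol))          # episodic return under these weights
    # flat gradient of sum_i log p(theta_i) * R_i w.r.t. the higher-policy parameters
    grad = pol._get_pgpe_times_n(np.array(thetas), np.array(rets))[0] / batch_size
    rho = pol._get_flat_higher_params()
    pol._set_higher_params(rho + lr * grad)     # plain gradient ascent step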
Example #4
    def _init(self,
              ob_space,
              ac_space,
              hid_layers=[],
              deterministic=True,
              diagonal=True,
              use_bias=False,
              use_critic=False,
              seed=None,
              verbose=True,
              zero_init=False):
        """Params:
            ob_space: task observation space
            ac_space : task action space
            hid__layers: list with width of each hidden layer
            deterministic: whether the actor is deterministic
            diagonal: whether the higher order policy has a diagonal covariance
            matrix
            use_bias: whether to include bias in neurons
            use_critic: whether to include a critic network
            seed: optional random seed
        """
        assert isinstance(ob_space, gym.spaces.Box)
        assert len(ac_space.shape) == 1
        self.diagonal = diagonal
        self.use_bias = use_bias
        batch_length = None  #Accepts a sequence of episodes of arbitrary length
        self.observation_space = ob_space
        self.action_space = ac_space
        self.ac_dim = ac_space.shape[0]
        self.ob_dim = ob_space.shape[0]
        self.hid_layers = hid_layers
        self.deterministic = deterministic
        self.use_critic = use_critic
        self.linear = not hid_layers
        self.verbose = verbose

        if seed is not None:
            set_global_seeds(seed)

        self._ob = ob = U.get_placeholder(name="ob",
                                          dtype=tf.float32,
                                          shape=[None] + list(ob_space.shape))

        #Critic (normally not used)
        if use_critic:
            with tf.variable_scope('critic'):
                last_out = ob
                for i, hid_size in enumerate(hid_layers):
                    last_out = tf.nn.tanh(
                        tf.layers.dense(
                            last_out,
                            hid_size,
                            name="fc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))
                self.vpred = tf.layers.dense(
                    last_out,
                    1,
                    name='final',
                    kernel_initializer=U.normc_initializer(1.0))[:, 0]

        #Actor (N.B.: weight initialization is irrelevant)
        with tf.variable_scope('actor'):
            last_out = ob
            for i, hid_size in enumerate(hid_layers):
                #Mlp feature extraction
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name='fc%i' % (i + 1),
                        kernel_initializer=tf.initializers.constant(0.),
                        use_bias=use_bias))
            if deterministic and isinstance(ac_space, gym.spaces.Box):
                #Deterministic action selection
                self.actor_mean = actor_mean = tf.layers.dense(
                    last_out,
                    ac_space.shape[0],
                    name='action',
                    kernel_initializer=tf.initializers.constant(0.),
                    use_bias=use_bias)
            else:
                raise NotImplementedError  #Currently supports only deterministic action policies

        #Get flattened actor weights (used to size the higher-order policy below)
        with tf.variable_scope('actor') as scope:
            self.actor_weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, \
                                         scope=scope.name)
            self.flat_actor_weights = tf.concat([tf.reshape(w, [-1]) for w in \
                                            self.actor_weights], axis=0) #flatten
            self._n_actor_weights = n_actor_weights = self.flat_actor_weights.shape[
                0]

        with tf.variable_scope('higher'):
            if zero_init:
                higher_mean_init = tf.where(
                    tf.not_equal(self.flat_actor_weights,
                                 tf.constant(0, dtype=tf.float32)),
                    tf.zeros(shape=[n_actor_weights.value]),
                    tf.zeros(shape=[n_actor_weights]))
            else:
                #Initial means sampled from a zero-mean normal with std 0.01
                higher_mean_init = tf.where(
                    tf.not_equal(self.flat_actor_weights,
                                 tf.constant(0, dtype=tf.float32)),
                    tf.random_normal(shape=[n_actor_weights.value],
                                     stddev=0.01),
                    tf.zeros(shape=[n_actor_weights]))
            self.higher_mean = higher_mean = tf.get_variable(
                name='higher_mean', initializer=higher_mean_init)

            if diagonal:
                #Diagonal covariance matrix; all log-stds initialized to 0 (std = 1)
                self.higher_logstd = higher_logstd = tf.get_variable(
                    name='higher_logstd',
                    shape=[n_actor_weights],
                    initializer=tf.initializers.constant(0.))
                pdparam = tf.concat(
                    [higher_mean, higher_mean * 0. + higher_logstd], axis=0)
                self.pdtype = pdtype = DiagGaussianPdType(
                    n_actor_weights.value)
            else:
                #Cholesky covariance matrix
                self.higher_logstd = higher_logstd = tf.get_variable(
                    name='higher_logstd',
                    shape=[n_actor_weights * (n_actor_weights + 1) // 2],
                    initializer=tf.initializers.constant(0.))
                pdparam = tf.concat([higher_mean, higher_logstd], axis=0)
                self.pdtype = pdtype = CholeskyGaussianPdType(
                    n_actor_weights.value)

        #Sample actor weights
        self.pd = pdtype.pdfromflat(pdparam)
        sampled_actor_params = self.pd.sample()
        symm_sampled_actor_params = self.pd.sample_symmetric()
        self._sample_symm_actor_params = U.function(
            [], list(symm_sampled_actor_params))
        self._sample_actor_params = U.function([], [sampled_actor_params])

        #Assign actor weights
        with tf.variable_scope('actor') as scope:
            actor_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, \
                                         scope=scope.name)

            self._use_sampled_actor_params = U.assignFromFlat(
                actor_params, sampled_actor_params)

            self._set_actor_params = U.SetFromFlat(actor_params)

            self._get_actor_params = U.GetFlat(actor_params)

        #Act
        self._action = action = actor_mean
        self._act = U.function([ob], [action])

        #Higher policy weights
        with tf.variable_scope('higher') as scope:
            self._higher_params = higher_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, \
                                         scope=scope.name)
            self.flat_higher_params = tf.concat([tf.reshape(w, [-1]) for w in \
                                            self._higher_params], axis=0) #flatten
            self._n_higher_params = self.flat_higher_params.shape[0]
            self._get_flat_higher_params = U.GetFlat(higher_params)
            self._set_higher_params = U.SetFromFlat(self._higher_params)

        #Batch PGPE
        self._actor_params_in = actor_params_in = \
                U.get_placeholder(name='actor_params_in',
                                  dtype=tf.float32,
                                  shape=[batch_length] + [n_actor_weights])
        self._rets_in = rets_in = U.get_placeholder(name='returns_in',
                                                    dtype=tf.float32,
                                                    shape=[batch_length])
        ret_mean, ret_var = tf.nn.moments(rets_in, axes=[0])
        ret_std = tf.sqrt(ret_var)  # tf.nn.moments returns variance, not std
        self._get_ret_mean = U.function([self._rets_in], [ret_mean])
        self._get_ret_std = U.function([self._rets_in], [ret_std])
        self._logprobs = logprobs = self.pd.logp(actor_params_in)
        pgpe_times_n = U.flatgrad(logprobs * rets_in, higher_params)
        self._get_pgpe_times_n = U.function([actor_params_in, rets_in],
                                            [pgpe_times_n])

        #One-episode PGPE
        #Used N times to compute the baseline -> can we do better?
        self._one_actor_param_in = one_actor_param_in = U.get_placeholder(
            name='one_actor_param_in',
            dtype=tf.float32,
            shape=[n_actor_weights])
        one_logprob = self.pd.logp(one_actor_param_in)
        score = U.flatgrad(one_logprob, higher_params)
        score_norm = tf.norm(score)
        self._get_score = U.function([one_actor_param_in], [score])
        self._get_score_norm = U.function([one_actor_param_in], [score_norm])

        #Batch off-policy PGPE
        self._probs = tf.exp(logprobs)
        self._behavioral = None
        self._renyi_other = None

        #One episode off-PGPE
        self._one_prob = tf.exp(one_logprob)

        #Renyi computation
        self._det_sigma = tf.exp(tf.reduce_sum(self.higher_logstd))

        #Fisher computation (diagonal case)
        mean_fisher_diag = tf.exp(-2 * self.higher_logstd)
        cov_fisher_diag = mean_fisher_diag * 0 + 2
        self._fisher_diag = tf.concat([mean_fisher_diag, cov_fisher_diag],
                                      axis=0)
        self._get_fisher_diag = U.function([], [self._fisher_diag])

        #Multiple importance sampling
        self._memory = None
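The Fisher diagonal exposed at the end of this example makes a natural-gradient variant cheap: for the diagonal Gaussian higher policy, preconditioning is just elementwise division by exp(-2*logstd) on the mean block and by 2 on the log-std block. A hedged sketch, assuming `pol` is an instance of this class, `grad` is a flat PGPE gradient over the higher-policy parameters (e.g. from _get_pgpe_times_n) whose layout matches the [mean, logstd] order in which the variables are created above, a default TF session is active, and `lr` is a hypothetical step size:

import numpy as np

def natural_pgpe_step(pol, grad, lr=1e-2):
    fisher_diag = pol._get_fisher_diag()[0]      # [exp(-2*logstd) | 2] blocks
    nat_grad = grad / (fisher_diag + 1e-12)      # diagonal F^{-1} g
    rho = pol._get_flat_higher_params()
    pol._set_higher_params(rho + lr * nat_grad)  # ascent step on the higher policy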
Example #5
    def _init(self,
              np_random,
              flavor,
              dim,
              hid_size=32,
              n_hid=2,
              alpha_sysid=0.1,
              test=False):

        print("obs dim:", dim.ob)

        # inputs & hyperparameters
        self.flavor = flavor
        self.dim = dim
        self.alpha_sysid = alpha_sysid
        self.ob = U.get_placeholder(name="ob",
                                    dtype=tf.float32,
                                    shape=(None, dim.ob_concat))
        self.ob_traj = U.get_placeholder(name="ob_traj",
                                         dtype=tf.float32,
                                         shape=[None, dim.window, dim.ob])
        self.ac_traj = U.get_placeholder(name="ac_traj",
                                         dtype=tf.float32,
                                         shape=[None, dim.window, dim.ac])

        # regular inputs whitening
        ob, sysid = tf.split(self.ob, [dim.ob, dim.sysid], axis=1)
        with tf.variable_scope("ob_filter"):
            self.ob_rms = RunningMeanStd(shape=(dim.ob_concat,))
            obz_all = tf.clip_by_value(
                (self.ob - self.ob_rms.mean) / self.ob_rms.std,
                -5.0,
                5.0,
                name="ob_normalizer")
        obz, sysidz = tf.split(obz_all, [dim.ob, dim.sysid], axis=1)
        print("obz dim:", obz.shape, "sysidz dim:", sysidz.shape)
        with tf.variable_scope("ob_white"):
            obz = tf.identity(obz)
        with tf.variable_scope("sysid_white"):
            self.sysidz = tf.identity(sysidz)

        # trajectory inputs for SysID
        # NOTE: the environment should be defined such that
        # actions are relatively close to Normal(0,1)
        ob_trajz = tf.clip_by_value(
            (self.ob_traj - self.ob_rms.mean[:dim.ob]) /
            self.ob_rms.std[:dim.ob],
            -5.0,
            5.0,
            name="ob_traj_white")
        trajs = tf.concat([ob_trajz, self.ac_traj], axis=2)

        # these rewards will be optimized via direct gradient-based optimization
        # (not RL reward), in the same place as e.g. the entropy regularization
        self.extra_rewards = []
        self.extra_reward_names = []

        with tf.variable_scope("sysid"):
            if flavor == PLAIN:
                self.traj2sysid = sysid_convnet(np_random, trajs, dim.sysid)
            elif flavor == EXTRA:
                self.traj2sysid = sysid_convnet(np_random, trajs, dim.sysid)
            elif flavor == EMBED:
                self.traj2embed = sysid_convnet(np_random, trajs, dim.embed)

        EMBED_N_HID = 2
        EMBED_HID_SZ = 2 * dim.sysid

        # policy
        with tf.variable_scope("pol"):
            if flavor == BLIND:
                policy_input = obz
                self.sysid_err_supervised = tf.constant(0.0)
            elif flavor == PLAIN:
                self.sysid_err_supervised = tf.losses.mean_squared_error(
                    tf.stop_gradient(sysidz), self.traj2sysid)
                policy_input = tf.concat([obz, self.traj2sysid],
                                         axis=1) if test else obz_all
            elif flavor == EXTRA:
                sysid_processor_input = self.traj2sysid if test else sysidz
                sysid_processor = MLPModule(np_random, sysid_processor_input,
                                            EMBED_N_HID, EMBED_HID_SZ, 1.0,
                                            dim.embed, "sysid_processor")
                policy_input = tf.concat([obz, sysid_processor],
                                         axis=1,
                                         name="input_concat")
                self.sysid_err_supervised = tf.losses.mean_squared_error(
                    tf.stop_gradient(sysidz), self.traj2sysid)
            elif flavor == EMBED:
                self.embed = MLPModule(np_random, sysidz, EMBED_N_HID,
                                       EMBED_HID_SZ, 1.0, dim.embed, "embed")
                embed_input = self.traj2embed if test else self.embed
                policy_input = tf.concat([obz, embed_input],
                                         axis=1,
                                         name="input_concat")
                self.sysid_err_supervised = tf.losses.mean_squared_error(
                    tf.stop_gradient(self.embed), self.traj2embed)
                mean, var = tf.nn.moments(self.embed, 0)
                dist = tf.distributions.Normal(loc=mean, scale=tf.sqrt(var))
                std_dist = tf.distributions.Normal(loc=0.0, scale=1.0)
                embed_KL = tf.reduce_mean(
                    tf.distributions.kl_divergence(dist, std_dist))
                self.extra_rewards.append(-0.1 * embed_KL)
                self.extra_reward_names.append("neg_embed_KL")
            elif flavor == TRAJ:
                self.traj_conv = sysid_convnet(np_random, trajs, dim.embed)
                policy_input = tf.concat([obz, self.traj_conv],
                                         axis=1,
                                         name="input_concat")
                self.sysid_err_supervised = tf.constant(0.0)
            else:
                raise ValueError("flavor '{}' does not exist".format(flavor))

            # main policy MLP. outputs mean and logstd of stochastic Gaussian policy
            with tf.variable_scope("policy"):
                print("policy input dimensionality:",
                      policy_input.get_shape().as_list())
                mean = MLPModule(np_random, policy_input, n_hid, hid_size,
                                 0.01, dim.ac, "pol")
                logstd = tf.maximum(
                    tf.get_variable(name="logstd",
                                    shape=[1, dim.ac],
                                    initializer=tf.constant_initializer(-0.3)),
                    -1.0)

            with tf.variable_scope("policy_to_gaussian"):
                pdparam = tf.concat([mean, mean * 0.0 + logstd], 1)
                self.pdtype = DiagGaussianPdType(dim.ac)
                self.pd = self.pdtype.pdfromflat(pdparam)

        # value function
        with tf.variable_scope("vf"):
            self.vpred = MLPModule(np_random, tf.stop_gradient(policy_input),
                                   n_hid, hid_size, 0.1, 1, "vf")[:, 0]

        # switch between stochastic and deterministic policy
        with tf.variable_scope("stochastic_switch"):
            self.stochastic = tf.placeholder(dtype=tf.bool,
                                             shape=(),
                                             name="stochastic")
            self.ac = U.switch(self.stochastic, self.pd.sample(),
                               self.pd.mode())

        # function we'll call when interacting with environment
        self._act = U.function([self.stochastic, self.ob],
                               [self.ac, self.vpred])

        # for test time, the trajectory is fed in
        self._act_traj = U.function(
            [self.stochastic, self.ob, self.ob_traj, self.ac_traj],
            [self.ac, self.vpred])
Example #6
class SysIDPolicy(object):

    recurrent = False

    def __init__(self, name, *args, **kwargs):
        with tf.variable_scope(name):
            self._init(*args, **kwargs)
            self.scope = tf.get_variable_scope().name

    # set up the network
    # NOTE: due to normalization of SysID values and KL-regularization of embedding space,
    # alpha_sysid shouldn't need to vary between environments - but we'll see...
    def _init(self,
              np_random,
              flavor,
              dim,
              hid_size=32,
              n_hid=2,
              alpha_sysid=0.1,
              test=False):

        print("obs dim:", dim.ob)

        # inputs & hyperparameters
        self.flavor = flavor
        self.dim = dim
        self.alpha_sysid = alpha_sysid
        self.ob = U.get_placeholder(name="ob",
                                    dtype=tf.float32,
                                    shape=(None, dim.ob_concat))
        self.ob_traj = U.get_placeholder(name="ob_traj",
                                         dtype=tf.float32,
                                         shape=[None, dim.window, dim.ob])
        self.ac_traj = U.get_placeholder(name="ac_traj",
                                         dtype=tf.float32,
                                         shape=[None, dim.window, dim.ac])

        # regular inputs whitening
        ob, sysid = tf.split(self.ob, [dim.ob, dim.sysid], axis=1)
        with tf.variable_scope("ob_filter"):
            self.ob_rms = RunningMeanStd(shape=(dim.ob_concat,))
            obz_all = tf.clip_by_value(
                (self.ob - self.ob_rms.mean) / self.ob_rms.std,
                -5.0,
                5.0,
                name="ob_normalizer")
        obz, sysidz = tf.split(obz_all, [dim.ob, dim.sysid], axis=1)
        print("obz dim:", obz.shape, "sysidz dim:", sysidz.shape)
        with tf.variable_scope("ob_white"):
            obz = tf.identity(obz)
        with tf.variable_scope("sysid_white"):
            self.sysidz = tf.identity(sysidz)

        # trajectory inputs for SysID
        # NOTE: the environment should be defined such that
        # actions are relatively close to Normal(0,1)
        ob_trajz = tf.clip_by_value(
            (self.ob_traj - self.ob_rms.mean[:dim.ob]) /
            self.ob_rms.std[:dim.ob],
            -5.0,
            5.0,
            name="ob_traj_white")
        trajs = tf.concat([ob_trajz, self.ac_traj], axis=2)

        # these rewards will be optimized via direct gradient-based optimization
        # (not RL reward), in the same place as e.g. the entropy regularization
        self.extra_rewards = []
        self.extra_reward_names = []

        with tf.variable_scope("sysid"):
            if flavor == PLAIN:
                self.traj2sysid = sysid_convnet(np_random, trajs, dim.sysid)
            elif flavor == EXTRA:
                self.traj2sysid = sysid_convnet(np_random, trajs, dim.sysid)
            elif flavor == EMBED:
                self.traj2embed = sysid_convnet(np_random, trajs, dim.embed)

        EMBED_N_HID = 2
        EMBED_HID_SZ = 2 * dim.sysid

        # policy
        with tf.variable_scope("pol"):
            if flavor == BLIND:
                policy_input = obz
                self.sysid_err_supervised = tf.constant(0.0)
            elif flavor == PLAIN:
                self.sysid_err_supervised = tf.losses.mean_squared_error(
                    tf.stop_gradient(sysidz), self.traj2sysid)
                policy_input = tf.concat([obz, self.traj2sysid],
                                         axis=1) if test else obz_all
            elif flavor == EXTRA:
                sysid_processor_input = self.traj2sysid if test else sysidz
                sysid_processor = MLPModule(np_random, sysid_processor_input,
                                            EMBED_N_HID, EMBED_HID_SZ, 1.0,
                                            dim.embed, "sysid_processor")
                policy_input = tf.concat([obz, sysid_processor],
                                         axis=1,
                                         name="input_concat")
                self.sysid_err_supervised = tf.losses.mean_squared_error(
                    tf.stop_gradient(sysidz), self.traj2sysid)
            elif flavor == EMBED:
                self.embed = MLPModule(np_random, sysidz, EMBED_N_HID,
                                       EMBED_HID_SZ, 1.0, dim.embed, "embed")
                embed_input = self.traj2embed if test else self.embed
                policy_input = tf.concat([obz, embed_input],
                                         axis=1,
                                         name="input_concat")
                self.sysid_err_supervised = tf.losses.mean_squared_error(
                    tf.stop_gradient(self.embed), self.traj2embed)
                mean, var = tf.nn.moments(self.embed, 0)
                dist = tf.distributions.Normal(loc=mean, scale=tf.sqrt(var))
                std_dist = tf.distributions.Normal(loc=0.0, scale=1.0)
                embed_KL = tf.reduce_mean(
                    tf.distributions.kl_divergence(dist, std_dist))
                self.extra_rewards.append(-0.1 * embed_KL)
                self.extra_reward_names.append("neg_embed_KL")
            elif flavor == TRAJ:
                self.traj_conv = sysid_convnet(np_random, trajs, dim.embed)
                policy_input = tf.concat([obz, self.traj_conv],
                                         axis=1,
                                         name="input_concat")
                self.sysid_err_supervised = tf.constant(0.0)
            else:
                raise ValueError("flavor '{}' does not exist".format(flavor))

            # main policy MLP. outputs mean and logstd of stochastic Gaussian policy
            with tf.variable_scope("policy"):
                print("policy input dimensionality:",
                      policy_input.get_shape().as_list())
                mean = MLPModule(np_random, policy_input, n_hid, hid_size,
                                 0.01, dim.ac, "pol")
                logstd = tf.maximum(
                    tf.get_variable(name="logstd",
                                    shape=[1, dim.ac],
                                    initializer=tf.constant_initializer(-0.3)),
                    -1.0)

            with tf.variable_scope("policy_to_gaussian"):
                pdparam = tf.concat([mean, mean * 0.0 + logstd], 1)
                self.pdtype = DiagGaussianPdType(dim.ac)
                self.pd = self.pdtype.pdfromflat(pdparam)

        # value function
        with tf.variable_scope("vf"):
            self.vpred = MLPModule(np_random, tf.stop_gradient(policy_input),
                                   n_hid, hid_size, 0.1, 1, "vf")[:, 0]

        # switch between stochastic and deterministic policy
        with tf.variable_scope("stochastic_switch"):
            self.stochastic = tf.placeholder(dtype=tf.bool,
                                             shape=(),
                                             name="stochastic")
            self.ac = U.switch(self.stochastic, self.pd.sample(),
                               self.pd.mode())

        # function we'll call when interacting with environment
        self._act = U.function([self.stochastic, self.ob],
                               [self.ac, self.vpred])

        # for test time, the trajectory is fed in
        self._act_traj = U.function(
            [self.stochastic, self.ob, self.ob_traj, self.ac_traj],
            [self.ac, self.vpred])

    # given the actual dynamics parameters, compute the embedding
    def sysid_to_embedded(self, sysid_vals):
        if self.flavor in [BLIND, TRAJ]:
            # could also just return sysid_vals, but this draws attention to lack of sysid
            return 0 * sysid_vals

        # pass val[None,:] if needing to evaluate for just one sysid val
        assert len(sysid_vals.shape) == 2
        k = sysid_vals.shape[0]
        sysid_vals = np.concatenate([np.zeros((k, self.dim.ob)), sysid_vals],
                                    axis=1)
        sess = tf.get_default_session()

        if self.flavor == EMBED:
            embed = sess.run(self.embed, feed_dict={self.ob: sysid_vals})
            return embed
        else:
            sysidz = sess.run(self.sysidz, feed_dict={self.ob: sysid_vals})
            return sysidz

    # given the ob/ac trajectories, estimate the embedding.
    # it's also part of the main policy, but needed on its own for TRPO.
    def estimate_sysid(self, ob_trajs, ac_trajs):
        sess = tf.get_default_session()
        N = ob_trajs.shape[0]
        k = N // 2048 + 1

        if self.flavor in [BLIND, TRAJ]:
            return np.zeros((N, self.dim.sysid))

        # TODO use tf.data or something to do this automatically!
        def gen(ob_splits, ac_splits):
            for o, a in zip(ob_splits, ac_splits):
                feed = {
                    self.ob_traj: o,
                    self.ac_traj: a,
                }
                if self.flavor == EMBED:
                    yield sess.run(self.traj2embed, feed_dict=feed)
                else:
                    yield sess.run(self.traj2sysid, feed_dict=feed)

        est = np.vstack(
            gen(np.array_split(ob_trajs, k), np.array_split(ac_trajs, k)))
        return est

    # act - ob is concat(ob, sysid)
    def act(self, stochastic, ob):
        ac1, vpred1 = self._act(stochastic, ob)
        return ac1, vpred1

    def act_traj(self, stochastic, ob, ob_traj, ac_traj):
        return self._act_traj(stochastic, ob, ob_traj, ac_traj)

    # for OpenAI Baselines compatibility
    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def get_initial_state(self):
        return []
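A hedged usage sketch for the class above. Assumptions not in the original: `dim` is a namespace carrying the fields the code reads (ob, ac, sysid, embed, window, and ob_concat = ob + sysid), the EMBED flavor constant and the sysid_convnet/MLPModule helpers are importable from the same module, and the calls run inside a default TF session.

import numpy as np
import tensorflow as tf

with tf.Session() as sess:
    pi = SysIDPolicy("pi", np_random=np.random.RandomState(0),
                     flavor=EMBED, dim=dim, hid_size=32, n_hid=2)
    sess.run(tf.global_variables_initializer())

    ob = np.zeros((1, dim.ob_concat), dtype=np.float32)         # concat(ob, sysid)
    ac, vpred = pi.act(stochastic=True, ob=ob)

    ob_traj = np.zeros((1, dim.window, dim.ob), dtype=np.float32)
    ac_traj = np.zeros((1, dim.window, dim.ac), dtype=np.float32)
    embed_est = pi.estimate_sysid(ob_traj, ac_traj)             # trajectory -> embedding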
Example #7
class MlpPolicy(object):
    """A multilayer perceptron to map state observations into actions.

    Note:
        The last layer of this network parameterises a diagonal Gaussian distribution,
        so the output can be stochastic by sampling from the distribution or deterministic
        by taking the mean.

    Args:
        name: Name of the scope under which to declare all the network's tf variables
        observation_shape: Shape of the observation space
        action_shape: Shape of the action space
        hid_size: Number of neurons per hidden layer
        num_hid_layers: Number of hidden layers
        stochastic: Whether to sample the output distribution or take its mean when generating actions

    """
    def __init__(self,
                 name,
                 observation_shape,
                 action_shape,
                 hid_size,
                 num_hid_layers,
                 stochastic=True):
        with tf.variable_scope(name):
            self.stochastic = stochastic
            self.hid_size, self.num_hid_layers = hid_size, num_hid_layers
            self.action_shape, self.observation_shape = action_shape, observation_shape
            self.scope = tf.get_variable_scope().name
            self.pdtype = DiagGaussianPdType(action_shape[0])

            observations_ph = U.get_placeholder(name='ob',
                                                dtype=tf.float32,
                                                shape=[None] +
                                                list(observation_shape))
            stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())

            with tf.variable_scope('obfilter'):
                self.ob_rms = RunningMeanStd(shape=observation_shape)

            with tf.variable_scope('pol'):
                last_out = tf.clip_by_value(
                    (observations_ph - self.ob_rms.mean) / self.ob_rms.std,
                    -5.0, 5.0)
                for i in range(num_hid_layers):
                    last_out = tf.nn.tanh(
                        tf.layers.dense(
                            last_out,
                            hid_size,
                            name='fc%i' % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))

                mean = tf.layers.dense(
                    last_out,
                    self.pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name='logstd',
                    shape=[1, self.pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)

            self.pd = self.pdtype.pdfromflat(pdparam)

            action_op = U.switch(stochastic_ph, self.pd.sample(),
                                 self.pd.mode())
            self._act = U.function([stochastic_ph, observations_ph], action_op)

    def act(self, observation):
        """Convenience function for generating a single action given an observation

        Args:
            observation: A state observation

        """
        return self._act(self.stochastic, np.array(observation)[None])[0]

    def get_variables(self):
        """Gets all the tf variables associated with this network."""
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        """Gets all the trainable tf variables associated with this network."""
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def make_target_network(self, name):
        """Creates a network which periodically updates its weights by copying them from this network.

        Args:
            name: Name of the scope under which to declare all the target network's tf variables

        """
        return TargetMlpPolicy(name, self)
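A hedged end-to-end sketch for MlpPolicy, assuming the class and the TargetMlpPolicy it instantiates live in the same module as U, DiagGaussianPdType and RunningMeanStd, and that any gym environment with Box observation and action spaces is available:

import gym
import numpy as np
import tensorflow as tf

env = gym.make("Pendulum-v0")                       # any continuous-control env would do
pi = MlpPolicy("pi",
               observation_shape=env.observation_space.shape,
               action_shape=env.action_space.shape,
               hid_size=64, num_hid_layers=2, stochastic=True)
target_pi = pi.make_target_network("target_pi")     # weight-copying counterpart

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    ob = env.reset()
    ac = pi.act(ob)                                 # samples from the diagonal Gaussian
    assert ac.shape == env.action_space.shape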