def build_graph(self, obs_ph, reuse=False):
        """Build the network that maps observations to a single logit.

        The observation input is standardized with the mean and std of a
        ``RunningMeanStd`` created under the "obfilter" sub-scope, then passed
        through two tanh fully connected layers of width ``self.hidden_size``
        and a final single-unit layer with identity activation.

        Note that ``self.obs_rms`` is (re)assigned on every call.

        Args:
            obs_ph: Observation input to the network.
            reuse: When true, the current variable scope is switched to
                variable-reuse mode before any layer is created.

        Returns:
            The output of the final single-unit layer.
        """
        fully_connected = tf.contrib.layers.fully_connected

        with tf.variable_scope(self.scope):
            if reuse:
                tf.get_variable_scope().reuse_variables()

            with tf.variable_scope("obfilter"):
                self.obs_rms = RunningMeanStd(shape=self.observation_shape)

            # Standardize the observations before they enter the network.
            hidden = (obs_ph - self.obs_rms.mean) / self.obs_rms.std

            # Two identical tanh hidden layers.
            for _ in range(2):
                hidden = fully_connected(hidden, self.hidden_size, activation_fn=tf.nn.tanh)

            logits = fully_connected(hidden, 1, activation_fn=tf.identity)
        return logits
Esempio n. 2
0
    def build_graph(self, obs_ph, acs_ph, reuse=False):
        """Build the network that maps an (observation, action) pair to one logit.

        The observation input is standardized with the mean and std of a
        ``RunningMeanStd`` created under the "obfilter" sub-scope, concatenated
        with the action input along axis 1, and passed through two tanh fully
        connected layers of width ``self.hidden_size`` followed by a final
        single-unit layer with identity activation.

        Note that ``self.obs_rms`` is (re)assigned on every call.

        Args:
            obs_ph: Observation input.
            acs_ph: Action input; concatenated with the standardized
                observations along axis 1.
            reuse: When true, the current variable scope is switched to
                variable-reuse mode before any layer is created.

        Returns:
            The output of the final single-unit layer.
        """
        fully_connected = tf.contrib.layers.fully_connected

        with tf.variable_scope(self.scope):
            if reuse:
                tf.get_variable_scope().reuse_variables()

            with tf.variable_scope("obfilter"):
                self.obs_rms = RunningMeanStd(shape=self.observation_shape)

            normalized_obs = (obs_ph - self.obs_rms.mean) / self.obs_rms.std

            # Observation and action side by side form one transition input.
            hidden = tf.concat([normalized_obs, acs_ph], axis=1)

            # Two identical tanh hidden layers.
            for _ in range(2):
                hidden = fully_connected(hidden, self.hidden_size, activation_fn=tf.nn.tanh)

            logits = fully_connected(hidden, 1, activation_fn=tf.identity)
        return logits
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        """Build the observation filter, value head, action distribution and act function.

        Sets ``self.pdtype``, ``self.obs``, ``self.ob_rms``, ``self.v_preds``,
        ``self.pd``, ``self.pi``, ``self.ac`` and ``self._act``.

        Args:
            ob_space: Observation space; asserted to be a ``gym.spaces.Box``.
            ac_space: Action space, passed to ``make_pdtype``.
            hid_size: Width of each hidden layer.
            num_hid_layers: Number of tanh hidden layers ("vffc1", "vffc2", ...).
            gaussian_fixed_var: Not read by the active code; it was only used
                by the separate policy head that is currently disabled.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype

        # The leading dimension of the observation placeholder is left open.
        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))
        self.obs = ob

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # Standardize the observations and clip them to [-5, 5].
        standardized_ob = (ob - self.ob_rms.mean) / self.ob_rms.std
        hidden = tf.clip_by_value(standardized_ob, -5.0, 5.0)

        for layer_idx in range(1, num_hid_layers + 1):
            pre_activation = dense(hidden, hid_size, "vffc%i" % layer_idx, weight_init=U.normc_initializer(1.0))
            hidden = tf.nn.tanh(pre_activation)

        value_out = dense(hidden, 1, "vffinal", weight_init=U.normc_initializer(1.0))
        self.v_preds = value_out[:, 0]

        # The action distribution is derived from the same hidden features as
        # the value head.
        # NOTE: this code previously carried a commented-out separate policy
        # tower ("polfc%i" layers plus a "polfinal" head with an optional
        # fixed-variance Gaussian "logstd" variable, fed to
        # pdtype.pdfromflat). It is not built here.
        self.pd, self.pi = pdtype.pdfromlatent(hidden)

        # change for BC
        stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
        # Sample from the distribution when `stochastic` is true, otherwise
        # take its mode.
        action = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.ac = action
        self._act = U.function([stochastic, ob], [action, self.v_preds])
def _normalize_clip_observation(x, clip_range=(-5.0, 5.0)):
    """Standardize ``x`` with a new ``RunningMeanStd`` and clip the result.

    Args:
        x: Input to normalize. A ``RunningMeanStd`` is created with shape
            ``x.shape[1:]``, i.e. every dimension except the first.
        clip_range: Iterable of clipping bounds; its smallest element is used
            as the lower bound and its largest as the upper bound. The default
            is an immutable tuple so it cannot be mutated across calls.

    Returns:
        A ``(norm_x, rms)`` tuple: the standardized and clipped input, and the
        ``RunningMeanStd`` instance whose mean/std were used.
    """
    rms = RunningMeanStd(shape=x.shape[1:])
    lower, upper = min(clip_range), max(clip_range)
    norm_x = tf.clip_by_value((x - rms.mean) / rms.std, lower, upper)
    return norm_x, rms
    def __init__(self, env, hidden_size, expert_dataset):
        """Build the guidance network, its loss, optimizer op and loss/grad function.

        All graph pieces are created under the 'guidance' variable scope. The
        network embeds the standardized state and the agent action separately,
        concatenates the two embeddings, and maps them through one more hidden
        layer to an output with ``env.action_space.shape[0]`` units.

        Args:
            env: Environment; its ``observation_space.shape`` and
                ``action_space.shape`` size the placeholders and the output.
            hidden_size: Width of every hidden layer.
            expert_dataset: Stored on the instance as ``self.expert_dataset``;
                not otherwise used here.
        """
        self.hidden_size = hidden_size
        self.expert_dataset = expert_dataset

        def _leaky_dense(inputs, name):
            # Hidden layer of width `self.hidden_size` with leaky-ReLU activation.
            return tf.layers.dense(inputs=inputs,
                                   units=self.hidden_size,
                                   activation=tf.nn.leaky_relu,
                                   name=name)

        with tf.variable_scope('guidance'):
            self.scope = tf.get_variable_scope().name

            # Placeholders: the leading dimension is left open.
            state_shape = [None] + list(env.observation_space.shape)
            action_shape = [None] + list(env.action_space.shape)
            self.agent_s = tf.placeholder(dtype=tf.float32,
                                          shape=state_shape,
                                          name='ph_agent_s')
            self.agent_a = tf.placeholder(dtype=tf.float32,
                                          shape=action_shape,
                                          name='ph_agent_a')
            self.expert_a = tf.placeholder(dtype=tf.float32,
                                           shape=action_shape,
                                           name='ph_expert_a')

            with tf.variable_scope("obfilter"):
                self.obs_rms = RunningMeanStd(
                    shape=env.observation_space.shape)
            normalized_s = (self.agent_s - self.obs_rms.mean) / self.obs_rms.std

            # Separate embeddings for state and action, then a joint layer.
            state_feat = _leaky_dense(normalized_s, 'layer_s')
            action_feat = _leaky_dense(self.agent_a, 'layer_a')
            joint_feat = _leaky_dense(
                tf.concat([state_feat, action_feat], axis=1), 'layer1')

            output = tf.layers.dense(inputs=joint_feat,
                                     units=env.action_space.shape[0],
                                     activation=tf.identity,
                                     name='layer2')

            # NOTE(review): the original author flagged this loss with a "BUG"
            # banner and left a reference to
            # tf.contrib.gan.losses.wargs.mutual_information_penalty as the
            # alternative. The loss is kept exactly as it was: the expert
            # actions are passed through a softmax and used as labels.
            expert_probs = tf.nn.softmax(self.expert_a)
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                labels=expert_probs, logits=output)
            self.loss = tf.reduce_mean(cross_entropy)

            self.train_op = tf.train.AdamOptimizer().minimize(self.loss)

        self.loss_name = ["guidance_loss"]
        var_list = self.get_trainable_variables()
        flat_grad = U.flatgrad(self.loss, var_list)
        self.lossandgrad = U.function(
            [self.agent_s, self.agent_a, self.expert_a],
            [self.loss, flat_grad])
    def __init__(self, env, hidden_size, expert_dataset):
        """Build the discrete-action guidance network, loss and training ops.

        All graph pieces are created under the 'guidance' variable scope. The
        agent and expert actions are integer placeholders that are one-hot
        encoded with depth ``env.action_space.n``. The network embeds the
        standardized state and the one-hot agent action separately,
        concatenates the embeddings, and maps them through one more hidden
        layer to a softmax output over the actions. The loss is the mean
        categorical cross-entropy between the one-hot expert action and that
        output.

        Args:
            env: Environment; ``observation_space.shape`` sizes the state
                placeholder and ``action_space.n`` sizes the one-hot encodings
                and the output layer.
            hidden_size: Width of every hidden layer.
            expert_dataset: Its ``inputs`` and ``labels`` attributes are
                stored as ``self.obs`` and ``self.acs``.
        """
        self.obs = expert_dataset.inputs
        self.acs = expert_dataset.labels

        def _leaky_dense(inputs, name):
            # Hidden layer of width `hidden_size` with leaky-ReLU activation.
            return tf.layers.dense(inputs=inputs,
                                   units=hidden_size,
                                   activation=tf.nn.leaky_relu,
                                   name=name)

        with tf.variable_scope('guidance'):
            self.scope = tf.get_variable_scope().name
            num_actions = env.action_space.n

            self.agent_s = tf.placeholder(
                dtype=tf.float32,
                shape=[None] + list(env.observation_space.shape),
                name='ph_agent_s')

            # Action placeholders hold integer indices; each is one-hot
            # encoded right after it is created.
            self.agent_a = tf.placeholder(dtype=tf.int32,
                                          shape=[None],
                                          name='ph_agent_a')
            agent_one_hot = tf.one_hot(self.agent_a, depth=num_actions)

            self.expert_a = tf.placeholder(dtype=tf.int32,
                                           shape=[None],
                                           name='ph_expert_a')
            expert_one_hot = tf.one_hot(self.expert_a, depth=num_actions)

            with tf.variable_scope("obfilter"):
                self.obs_rms = RunningMeanStd(
                    shape=env.observation_space.shape)
            normalized_s = (self.agent_s - self.obs_rms.mean) / self.obs_rms.std

            # Separate embeddings for state and action, then a joint layer.
            state_feat = _leaky_dense(normalized_s, 'layer_s')
            action_feat = _leaky_dense(agent_one_hot, 'layer_a')
            joint_feat = _leaky_dense(
                tf.concat([state_feat, action_feat], axis=1), 'layer1')

            # The output layer already applies softmax, so the Keras loss
            # below receives probabilities rather than logits.
            action_probs = tf.layers.dense(inputs=joint_feat,
                                           units=num_actions,
                                           activation=tf.nn.softmax,
                                           name='layer2')

            per_sample_loss = tf.keras.losses.categorical_crossentropy(
                y_true=expert_one_hot, y_pred=action_probs)
            self.loss = tf.reduce_mean(per_sample_loss)
            # NOTE(review): the original author left a "BUG" banner here next
            # to a disabled alternative loss,
            # tf.contrib.gan.losses.wargs.mutual_information_penalty, and a
            # disabled tf.nn.softmax_cross_entropy_with_logits variant.

            self.train_op = tf.train.AdamOptimizer().minimize(self.loss)

        self.loss_name = ["guidance_loss"]
        var_list = self.get_trainable_variables()
        flat_grad = U.flatgrad(self.loss, var_list)
        self.lossandgrad = U.function(
            [self.agent_s, self.agent_a, self.expert_a],
            [self.loss, flat_grad])