Code Example #1
File: q_function.py  Project: zzyunzhi/vds
    def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats, hidden, layers,
                 **kwargs):
        """The actor-critic network and related training code.

        Args:
            inputs_tf (dict of tensors): all necessary inputs for the network: the
                observation (o), the goal (g), the action (u),
                the next observation (o_2), the next goal (g_2)
            dimo (int): the dimension of the observations
            dimg (int): the dimension of the goals
            dimu (int): the dimension of the actions
            max_u (float): the maximum magnitude of actions; action outputs will be scaled
                accordingly
            o_stats (baselines.her.Normalizer): normalizer for observations
            g_stats (baselines.her.Normalizer): normalizer for goals
            hidden (int): number of hidden units that should be used in hidden layers
            layers (int): number of hidden layers
        """
        self.o_tf = inputs_tf['o']
        self.g_tf = inputs_tf['g']
        self.o_2_tf = inputs_tf['o_2']
        self.g_2_tf = inputs_tf['g_2']

        # Prepare inputs for actor and critic.
        o = self.o_stats.normalize(self.o_tf)
        g = self.g_stats.normalize(self.g_tf)
        o_2 = self.o_stats.normalize(self.o_2_tf)
        g_2 = self.g_stats.normalize(self.g_2_tf)

        # Networks.
        with tf.variable_scope('V'):
            input_V = tf.concat(axis=1, values=[o, g])
            self.V_tf = nn(input_V, [self.hidden] * self.layers + [1])
            input_V_2 = tf.concat(axis=1, values=[o_2, g_2])
            self.V_2_tf = nn(input_V_2, [self.hidden] * self.layers + [1], reuse=True)
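
Note: these excerpts assume the constructor arguments (o_stats, g_stats, hidden, layers, max_u, dimu, ...) are stored as attributes on self (for example via baselines' store_args decorator), and they all call a small MLP helper named nn. A minimal sketch of that helper, modeled on baselines.her.util.nn:

import tensorflow as tf

def nn(input, layers_sizes, reuse=None, flatten=False, name=""):
    """Simple MLP: ReLU hidden layers followed by a linear output layer."""
    for i, size in enumerate(layers_sizes):
        activation = tf.nn.relu if i < len(layers_sizes) - 1 else None
        input = tf.layers.dense(inputs=input,
                                units=size,
                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                reuse=reuse,
                                name=name + '_' + str(i))
        if activation:
            input = activation(input)
    if flatten:
        assert layers_sizes[-1] == 1
        input = tf.reshape(input, [-1])
    return input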
Code Example #2
    def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats,
                 hidden, layers, sac, **kwargs):
        """The actor-critic network and related training code.

        Args:
            inputs_tf (dict of tensors): all necessary inputs for the network: the
                observation (o), the latent variable (z), the goal (g), and the action (u)
            dimo (int): the dimension of the observations
            dimg (int): the dimension of the goals
            dimu (int): the dimension of the actions
            max_u (float): the maximum magnitude of actions; action outputs will be scaled
                accordingly
            o_stats (baselines.her.Normalizer): normalizer for observations
            g_stats (baselines.her.Normalizer): normalizer for goals
            hidden (int): number of hidden units that should be used in hidden layers
            layers (int): number of hidden layers
            sac (bool): if True, build a squashed-Gaussian (SAC-style) policy;
                otherwise a deterministic DDPG-style policy
        """
        self.o_tf = inputs_tf['o']
        self.z_tf = inputs_tf['z']
        self.g_tf = inputs_tf['g']
        self.u_tf = inputs_tf['u']

        # Prepare inputs for actor and critic.
        o = self.o_stats.normalize(self.o_tf)
        g = self.g_stats.normalize(self.g_tf)
        z = self.z_tf

        input_pi = tf.concat(axis=1, values=[o, z, g])  # for actor

        # policy net
        if sac:
            with tf.variable_scope('pi'):
                mu, pi, logp_pi = mlp_gaussian_policy(input_pi, self.dimu,
                                                      self.hidden, self.layers)
                mu, pi, self.logp_pi_tf = apply_squashing_func(mu, pi, logp_pi)
                # make sure actions are in correct range
                self.mu_tf = mu * self.max_u
                self.pi_tf = pi * self.max_u
                self.neg_logp_pi_tf = -self.logp_pi_tf

        else:  # ddpg
            with tf.variable_scope('pi'):
                self.pi_tf = self.max_u * tf.tanh(
                    nn(input_pi, [self.hidden] * self.layers + [self.dimu]))

        # Q value net
        with tf.variable_scope('Q'):
            # for policy training
            input_Q = tf.concat(axis=1,
                                values=[o, z, g, self.pi_tf / self.max_u])
            self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
            # for critic training
            input_Q = tf.concat(axis=1,
                                values=[o, z, g, self.u_tf / self.max_u])
            self._input_Q = input_Q  # exposed for tests
            self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1],
                           reuse=True)
Code Example #3
    def __init__(self, inputs_tf, image_input_shapes, dimo, dimg, dimu, max_u,
                 o_stats, g_stats, hidden, layers, **kwargs):
        """The actor-critic network and related training code.

        Args:
            inputs_tf (dict of tensors): all necessary inputs for the network: the
                observation (o), the goal (g), and the action (u)
            image_input_shapes (dict): target shapes used to un-flatten the image
                observation (o) and goal (g)
            dimo (int): the dimension of the observations
            dimg (int): the dimension of the goals
            dimu (int): the dimension of the actions
            max_u (float): the maximum magnitude of actions; action outputs will be scaled
                accordingly
            o_stats (baselines.her.Normalizer): normalizer for observations
            g_stats (baselines.her.Normalizer): normalizer for goals
            hidden (int): number of hidden units that should be used in hidden layers
            layers (int): number of hidden layers
        """
        self.o_tf = inputs_tf['o']
        self.g_tf = inputs_tf['g']
        self.u_tf = inputs_tf['u']

        # Prepare inputs for actor and critic.
        o = self.o_stats.normalize(self.o_tf)
        g = self.g_stats.normalize(self.g_tf)
        o = tf.reshape(o, [-1, *image_input_shapes['o']])
        g = tf.reshape(g, [-1, *image_input_shapes['g']])

        #print(o.shape)
        #input("--------------------")
        # input_pi = tf.concat(axis=1, values=[o, g])  # for actor

        # Networks.

        x_o = cnn_one_stream(o, scope='phi', reuse=False)
        #print(x_o.shape)
        #input("----------------")
        #x_g = cnn_one_stream(g, scope='phi', reuse=True)
        x_g = g

        x_concat = tf.concat(axis=1, values=[x_o, x_g])

        with tf.variable_scope('pi'):
            self.pi_tf = self.max_u * tf.tanh(
                nn(x_concat, [self.hidden] * self.layers + [self.dimu]))

        with tf.variable_scope('Q'):
            # for policy training
            input_Q = tf.concat(axis=1,
                                values=[x_concat, self.pi_tf / self.max_u])
            self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
            # for critic training
            input_Q = tf.concat(axis=1,
                                values=[x_concat, self.u_tf / self.max_u])
            self._input_Q = input_Q  # exposed for tests
            self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1],
                           reuse=True)
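
Code Example #3 relies on a cnn_one_stream feature extractor whose definition is not shown. A hypothetical sketch of such a single-stream CNN (filter sizes and strides are illustrative, not taken from the project):

import tensorflow as tf

def cnn_one_stream(x, scope, reuse=False):
    """Hypothetical sketch: map an image tensor to a flat feature vector."""
    with tf.variable_scope(scope, reuse=reuse):
        h = tf.layers.conv2d(x, filters=32, kernel_size=8, strides=4, activation=tf.nn.relu)
        h = tf.layers.conv2d(h, filters=64, kernel_size=4, strides=2, activation=tf.nn.relu)
        h = tf.layers.conv2d(h, filters=64, kernel_size=3, strides=1, activation=tf.nn.relu)
        return tf.layers.flatten(h)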
Code Example #4
File: actor_critic.py  Project: wwxFromTju/curious
    def __init__(self,
                 inputs_tf,
                 dimo,
                 dimg,
                 dimu,
                 max_u,
                 o_stats,
                 g_stats,
                 hidden,
                 layers,
                 normalize_obs=True,
                 **kwargs):
        """The actor-critic network and related training code.

        Args:
            inputs_tf (dict of tensors): all necessary inputs for the network: the
                observation (o), the goal (g), and the action (u)
            dimo (int): the dimension of the observations
            dimg (int): the dimension of the goals
            dimu (int): the dimension of the actions
            max_u (float): the maximum magnitude of actions; action outputs will be scaled
                accordingly
            o_stats (baselines.her.Normalizer): normalizer for observations
            g_stats (baselines.her.Normalizer): normalizer for goals
            hidden (int): number of hidden units that should be used in hidden layers
            layers (int): number of hidden layers
            normalize_obs (bool): whether to normalize observations and goals with
                o_stats and g_stats
        """
        self.o_tf = inputs_tf['o']
        self.g_tf = inputs_tf['g']
        self.u_tf = inputs_tf['u']

        # Prepare inputs for actor and critic.
        if normalize_obs:
            o = self.o_stats.normalize(self.o_tf)
            g = self.g_stats.normalize(self.g_tf)
        else:
            o = self.o_tf
            g = self.g_tf
        input_pi = tf.concat(axis=1, values=[o, g])  # for actor

        # Networks.
        with tf.variable_scope('pi'):
            self.pi_tf = self.max_u * tf.tanh(
                nn(input_pi, [self.hidden] * self.layers + [self.dimu]))
        with tf.variable_scope('Q'):
            # for policy training
            input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u])
            self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
            # for critic training
            input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u])
            self._input_Q = input_Q  # exposed for tests
            self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1],
                           reuse=True)
Code Example #5
def mlp_gaussian_policy(x, act_dim, hidden, layers):
    net = nn(x, [hidden] * (layers + 1))
    mu = tf.layers.dense(net, act_dim, activation=None)

    log_std = tf.layers.dense(net, act_dim, activation=tf.tanh)
    log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)

    std = tf.exp(log_std)
    pi = mu + tf.random_normal(tf.shape(mu)) * std
    logp_pi = gaussian_likelihood(pi, mu, log_std)
    return mu, pi, logp_pi
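
mlp_gaussian_policy depends on gaussian_likelihood, the LOG_STD_MIN/LOG_STD_MAX bounds, and (in Code Example #2) apply_squashing_func. A sketch of these helpers following the OpenAI Spinning Up SAC convention, which the snippets appear to assume:

import numpy as np
import tensorflow as tf

LOG_STD_MAX = 2
LOG_STD_MIN = -20
EPS = 1e-8

def gaussian_likelihood(x, mu, log_std):
    """Log-probability of x under a diagonal Gaussian N(mu, exp(log_std)^2)."""
    pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS)) ** 2
                      + 2 * log_std + np.log(2 * np.pi))
    return tf.reduce_sum(pre_sum, axis=1)

def apply_squashing_func(mu, pi, logp_pi):
    """Squash mu and pi through tanh and correct logp_pi for the change of
    variables, using the numerically stable softplus form."""
    logp_pi -= tf.reduce_sum(2 * (np.log(2) - pi - tf.nn.softplus(-2 * pi)), axis=1)
    return tf.tanh(mu), tf.tanh(pi), logp_pi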
Code Example #6
    def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats,
                 hidden, layers, **kwargs):
        """The actor-critic network and related training code.

        Args:
            inputs_tf (dict of tensors): all necessary inputs for the network: the
                observation (o), the goal (g), and the action (u)
            dimo (int): the dimension of the observations
            dimg (int): the dimension of the goals
            dimu (int): the dimension of the actions
            max_u (float): the maximum magnitude of actions; action outputs will be scaled
                accordingly
            o_stats (baselines.her.Normalizer): normalizer for observations
            g_stats (baselines.her.Normalizer): normalizer for goals
            hidden (int): number of hidden units that should be used in hidden layers
            layers (int): number of hidden layers
        """
        self.o_tf = inputs_tf['o']
        self.g_tf = inputs_tf['g']
        self.u_tf = inputs_tf['u']

        # Prepare inputs for actor and critic.
        o = self.o_stats.normalize(self.o_tf)
        g = self.g_stats.normalize(self.g_tf)

        num_blocks = (o.get_shape().as_list()[1] -
                      ENV_FEATURES) // BLOCK_FEATURES

        obs_env = tf.slice(o, [0, 0], [-1, ENV_FEATURES])
        obs_blocks = tf.slice(o, [0, ENV_FEATURES], [-1, -1])
        #print('######################', obs_blocks)

        input_blocks = tf.reshape(obs_blocks, [-1, num_blocks, BLOCK_FEATURES])
        #print('######################', input_blocks)
        to_concat = []
        batch_size = tf.shape(obs_blocks)[0]

        with tf.variable_scope('Q'):
            for _ in range(ATTENTION_CNT):
                block_mlp = [64]
                obs_blocks = input_blocks
                for num_hidden in block_mlp:
                    obs_blocks = tf.layers.dense(obs_blocks,
                                                 num_hidden,
                                                 activation=tf.nn.relu)
                    #print('###########', obs_blocks)

                obs_blocks = tf.layers.dense(obs_blocks,
                                             FEATURE_SIZE,
                                             activation=None)
                rnn_input = tf.unstack(tf.transpose(obs_blocks, perm=[1, 0,
                                                                      2]))
                RNN_HIDDEN = FEATURE_SIZE
                lstm = tf.contrib.rnn.LSTMCell(RNN_HIDDEN, state_is_tuple=True)

                #print('###########batch', batch_size, RNN_HIDDEN)
                hid_state = tf.zeros([batch_size, RNN_HIDDEN])
                cell_state = tf.zeros([batch_size, RNN_HIDDEN])
                state = (hid_state, cell_state)

                #out = tf.scan(lambda a, x: lstm(x, a), rnn_input, initializer=hid_state)

                blocks = []
                for block in rnn_input:
                    output, state = lstm(block, state)
                    blocks.append(output)
                    #print('#####', output)
                    #print('#####', state)

                blocks = tf.stack(blocks)
                blocks = tf.transpose(blocks, perm=[1, 0, 2])

                # Add all the blocks together
                # (?, n)
                sum_blocks = tf.reduce_sum(blocks, axis=1)
                #attention_input = tf.concat(axis=2, values=[obs_blocks, sum_blocks])
                #print('$$$$$$$', attention_input)

                sum_mlp = [64]
                for num_hidden in sum_mlp:
                    sum_blocks = tf.layers.dense(sum_blocks,
                                                 num_hidden,
                                                 activation=tf.nn.tanh)

                sum_blocks = tf.layers.dense(sum_blocks,
                                             FEATURE_SIZE,
                                             activation=None)
                print(sum_blocks)
                # (?, 1, n)
                attention = tf.expand_dims(sum_blocks, 1)

                #print('###########', attention)

                # (?, ?, n)
                attention = tf.tile(attention, [1, num_blocks, 1])
                #print('###########', attention)

                attention = tf.nn.l2_normalize(attention, axis=2)
                #print('###########', attention)

                # (?, ?)
                norm_block_emb = tf.nn.l2_normalize(blocks, axis=2)
                #print('###########', attention)
                #print('###########', norm_block_emb)

                weights = tf.reduce_sum(attention * norm_block_emb, axis=2)
                weights = tf.nn.softmax(weights, axis=1)
                print('###########', weights)
                sindex = tf.argmax(weights, axis=1, output_type=tf.int32)
                print('###########', sindex)

                findex = tf.range(tf.shape(sindex)[0])
                #index = tf.stack(tf.meshgrid(tf.range(0,batch_size), tf.range(0,batch_size)) + [ sindex ], axis=2)

                print('###########', findex)

                index = tf.stack([findex, sindex])
                index = tf.transpose(index, perm=[1, 0])
                #sind = tf.expand_dims(ind, axis=1)
                print('###########', index)
                chosen_block = tf.gather_nd(input_blocks, index)

                print('###########', chosen_block)

                self.block_weights = weights
                # (?, ?, 1)
                weights = tf.expand_dims(weights, 2)
                # (?, ?, n)
                weights = tf.tile(weights, [1, 1, FEATURE_SIZE])
                weighted = weights * blocks
                # (?, n)
                gated_obs = tf.reduce_sum(weighted, axis=1)
                to_concat.append(gated_obs)
                to_concat.append(chosen_block)
            gated_obs = tf.concat(axis=1, values=to_concat)

            input_pi = tf.concat(axis=1, values=[obs_env, gated_obs,
                                                 g])  # for actor

        # Networks.
        with tf.variable_scope('pi'):
            self.pi_tf = self.max_u * tf.tanh(
                nn(input_pi, [self.hidden] * self.layers + [self.dimu]))
        with tf.variable_scope('Q'):
            # for policy training
            input_Q = tf.concat(
                axis=1,
                values=[obs_env, gated_obs, g, self.pi_tf / self.max_u])
            self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
            # for critic training
            input_Q = tf.concat(
                axis=1, values=[obs_env, gated_obs, g, self.u_tf / self.max_u])
            self._input_Q = input_Q  # exposed for tests
            self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1],
                           reuse=True)
Code Example #7
    def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats,
                 hidden, layers, **kwargs):
        """The actor-critic network and related training code.

        Args:
            inputs_tf (dict of tensors): all necessary inputs for the network: the
                observation (o), the goal (g), and the action (u)
            dimo (int): the dimension of the observations
            dimg (int): the dimension of the goals
            dimu (int): the dimension of the actions
            max_u (float): the maximum magnitude of actions; action outputs will be scaled
                accordingly
            o_stats (baselines.her.Normalizer): normalizer for observations
            g_stats (baselines.her.Normalizer): normalizer for goals
            hidden (int): number of hidden units that should be used in hidden layers
            layers (int): number of hidden layers
        """
        self.o_tf = inputs_tf['o']
        self.g_tf = inputs_tf['g']
        self.u_tf = inputs_tf['u']

        # Prepare inputs for actor and critic.
        o = self.o_stats.normalize(self.o_tf)
        g = self.g_stats.normalize(self.g_tf)

        num_blocks = (o.get_shape().as_list()[1] -
                      ENV_FEATURES) // BLOCK_FEATURES

        obs_env = tf.slice(o, [0, 0], [-1, ENV_FEATURES])
        obs_blocks = tf.slice(o, [0, ENV_FEATURES], [-1, -1])

        batch_size = tf.shape(obs_blocks)[0]

        print(obs_blocks)

        with tf.variable_scope('pi'):
            # (?, b)
            # hidden = tf.layers.dense(obs_blocks, FEATURE_SIZE, activation=tf.nn.relu)
            # attention_weights = tf.layers.dense(hidden, num_blocks, activation=tf.sigmoid)
            attention_weights = tf.layers.dense(obs_blocks,
                                                num_blocks,
                                                activation=tf.tanh)
            self.block_weights = attention_weights
            # (?, b, f)
            input_blocks = tf.reshape(obs_blocks,
                                      [-1, num_blocks, BLOCK_FEATURES])
            # (?, b, 1)
            weights = tf.expand_dims(attention_weights, 2)
            # (?, b, f)
            weights = tf.tile(weights, [1, 1, BLOCK_FEATURES])
            weighted = weights * input_blocks
            # (?, b * f)
            gated_obs = tf.reshape(weighted, [-1, num_blocks * BLOCK_FEATURES])
            print(gated_obs)
            input_pi = tf.concat(axis=1, values=[obs_env,
                                                 gated_obs])  # for actor

            # Networks.
            # with tf.variable_scope('pi'):
            self.pi_tf = self.max_u * tf.tanh(
                nn(input_pi, [self.hidden] * self.layers + [self.dimu]))
        with tf.variable_scope('Q'):
            # for policy training
            input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u])
            self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
            # for critic training
            input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u])
            self._input_Q = input_Q  # exposed for tests
            self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1],
                           reuse=True)
Code Example #8
    def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats,
                 hidden, layers, **kwargs):
        """The actor-critic network and related training code.

        Args:
            inputs_tf (dict of tensors): all necessary inputs for the network: the
                observation (o), the goal (g), and the action (u)
            dimo (int): the dimension of the observations
            dimg (int): the dimension of the goals
            dimu (int): the dimension of the actions
            max_u (float): the maximum magnitude of actions; action outputs will be scaled
                accordingly
            o_stats (baselines.her.Normalizer): normalizer for observations
            g_stats (baselines.her.Normalizer): normalizer for goals
            hidden (int): number of hidden units that should be used in hidden layers
            layers (int): number of hidden layers
        """
        self.o_tf = inputs_tf['o']
        self.g_tf = inputs_tf['g']
        self.u_tf = inputs_tf['u']

        o = self.o_tf

        env_size = tf.constant(ENV_FEATURES, tf.int32)
        block_size = tf.constant(BLOCK_FEATURES, tf.int32)
        batch_size = tf.shape(o)[0]
        obs_shape = tf.shape(o)[1]

        max_num_blocks = tf.cast((obs_shape - env_size) / block_size, tf.int32)
        num_blocks = tf.reshape(tf.slice(o, [0, 0], [-1, 1]), [-1])
        num_blocks = tf.cast(num_blocks, tf.int32)
        o = tf.slice(o, [0, 1], [-1, -1])

        o = self.o_stats.normalize(o)

        obs_env = tf.slice(o, [0, 0], [-1, ENV_FEATURES])
        obs_blocks = tf.slice(o, [0, ENV_FEATURES], [-1, -1])
        #print('######################', obs_blocks)

        input_blocks = tf.reshape(obs_blocks,
                                  [-1, max_num_blocks, BLOCK_FEATURES])
        #print('######################', input_blocks)
        to_concat = []

        with tf.variable_scope('Q'):
            for _ in range(ATTENTION_CNT):
                block_mlp = [64]
                obs_blocks = input_blocks
                for num_hidden in block_mlp:
                    obs_blocks = tf.layers.dense(obs_blocks,
                                                 num_hidden,
                                                 activation=tf.nn.relu)
                    #print('###########', obs_blocks)

                obs_blocks = tf.layers.dense(obs_blocks,
                                             FEATURE_SIZE,
                                             activation=None)
                # rnn_input = tf.transpose(obs_blocks, perm=[1,0,2])
                RNN_HIDDEN = FEATURE_SIZE
                lstm = tf.contrib.rnn.LSTMCell(RNN_HIDDEN, state_is_tuple=True)
                # For loop doesn't work! Use tf.nn.dynamic_rnn instead!
                # https://stackoverflow.com/questions/43341374/tensorflow-dynamic-rnn-lstm-how-to-format-input
                blocks, _ = tf.nn.dynamic_rnn(lstm,
                                              obs_blocks,
                                              sequence_length=num_blocks,
                                              dtype=tf.float32)

                # Add all the blocks together
                # (?, n)
                sum_blocks = tf.reduce_sum(blocks, axis=1)

                sum_mlp = [64]
                for num_hidden in sum_mlp:
                    sum_blocks = tf.layers.dense(sum_blocks,
                                                 num_hidden,
                                                 activation=tf.nn.tanh)

                sum_blocks = tf.layers.dense(sum_blocks,
                                             FEATURE_SIZE,
                                             activation=None)
                print(sum_blocks)
                # (?, 1, n)
                attention = tf.expand_dims(sum_blocks, 1)
                #print('###########', attention)

                # (?, ?, n)
                attention = tf.tile(attention, [1, max_num_blocks, 1])
                #print('###########', attention)

                attention = tf.nn.l2_normalize(attention, axis=2)
                #print('###########', attention)

                # (?, ?)
                norm_block_emb = tf.nn.l2_normalize(blocks, axis=2)
                #print('###########', attention)
                #print('###########', norm_block_emb)

                weights = tf.reduce_sum(attention * norm_block_emb, axis=2)
                weights = tf.nn.softmax(weights, axis=1)
                print('###########', weights)
                sindex = tf.argmax(weights, axis=1, output_type=tf.int32)
                print('###########', sindex)

                findex = tf.range(tf.shape(sindex)[0])
                #index = tf.stack(tf.meshgrid(tf.range(0,batch_size), tf.range(0,batch_size)) + [ sindex ], axis=2)

                print('###########', findex)

                index = tf.stack([findex, sindex])
                index = tf.transpose(index, perm=[1, 0])
                #sind = tf.expand_dims(ind, axis=1)
                print('###########', index)
                chosen_block = tf.gather_nd(input_blocks, index)

                print('###########', chosen_block)

                self.block_weights = weights
                # (?, ?, 1)
                weights = tf.expand_dims(weights, 2)
                # (?, ?, n)
                weights = tf.tile(weights, [1, 1, FEATURE_SIZE])
                weighted = weights * blocks
                # (?, n)
                gated_obs = tf.reduce_sum(weighted, axis=1)
                to_concat.append(gated_obs)
                to_concat.append(chosen_block)
            gated_obs = tf.concat(axis=1, values=to_concat)
            input_pi = tf.concat(axis=1, values=[obs_env,
                                                 gated_obs])  # for actor

        # Networks.
        with tf.variable_scope('pi'):
            self.pi_tf = self.max_u * tf.tanh(
                nn(input_pi, [self.hidden] * self.layers + [self.dimu]))
        with tf.variable_scope('Q'):
            # for policy training
            input_Q = tf.concat(
                axis=1, values=[obs_env, gated_obs, self.pi_tf / self.max_u])
            self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
            # for critic training
            input_Q = tf.concat(
                axis=1, values=[obs_env, gated_obs, self.u_tf / self.max_u])
            self._input_Q = input_Q  # exposed for tests
            self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1],
                           reuse=True)
Code Example #9
File: discriminator.py  Project: phymucs/misc
    def __init__(self, inputs_tf, dimo, dimz, dimg, dimu, max_u, o_stats,
                 g_stats, hidden, layers, env_name, **kwargs):
        """The discriminator network and related training code.

        Args:
            inputs_tf (dict of tensors): all necessary inputs for the network: the
                observation (o), the goal (g), and the action (u)
            dimo (int): the dimension of the observations
            dimz (int): the dimension of the latent variable (z)
            dimg (int): the dimension of the goals
            dimu (int): the dimension of the actions
            max_u (float): the maximum magnitude of actions; action outputs will be scaled
                accordingly
            o_stats (baselines.her.Normalizer): normalizer for observations
            g_stats (baselines.her.Normalizer): normalizer for goals
            hidden (int): number of hidden units that should be used in hidden layers
            layers (int): number of hidden layers
        """

        self.o_tf = tf.placeholder(tf.float32, shape=(None, self.dimo))
        self.z_tf = tf.placeholder(tf.float32, shape=(None, self.dimz))
        self.g_tf = tf.placeholder(tf.float32, shape=(None, self.dimg))

        obs_tau_excludes_goal, obs_tau_achieved_goal = split_observation_tf(
            self.env_name, self.o_tau_tf)

        obs_excludes_goal, obs_achieved_goal = split_observation_tf(
            self.env_name, self.o_tf)

        # Discriminator networks

        with tf.variable_scope('state_mi'):
            # Mutual Information Neural Estimation
            # shuffle and concatenate
            x_in = obs_tau_excludes_goal
            y_in = obs_tau_achieved_goal
            y_in_tran = tf.transpose(y_in, perm=[1, 0, 2])
            y_shuffle_tran = tf.random_shuffle(y_in_tran)
            y_shuffle = tf.transpose(y_shuffle_tran, perm=[1, 0, 2])
            x_conc = tf.concat([x_in, x_in], axis=-2)
            y_conc = tf.concat([y_in, y_shuffle], axis=-2)

            # propagate the forward pass
            layerx = tf_layers.linear(x_conc, int(self.hidden / 2))
            layery = tf_layers.linear(y_conc, int(self.hidden / 2))
            layer2 = tf.nn.relu(layerx + layery)
            output = tf_layers.linear(layer2, 1)
            output = tf.nn.tanh(output)

            # split in T_xy and T_x_y predictions
            N_samples = tf.shape(x_in)[-2]
            T_xy = output[:, :N_samples, :]
            T_x_y = output[:, N_samples:, :]

            # compute the negative loss (maximise loss == minimise -loss)
            mean_exp_T_x_y = tf.reduce_mean(tf.math.exp(T_x_y), axis=-2)
            neg_loss = -(tf.reduce_mean(T_xy, axis=-2) -
                         tf.math.log(mean_exp_T_x_y))
            neg_loss = tf.check_numerics(neg_loss,
                                         'check_numerics caught bad neg_loss')
            self.mi_tf = neg_loss

        with tf.variable_scope('skill_ds'):
            self.logits_tf = nn(obs_achieved_goal,
                                [int(self.hidden / 2)] * self.layers +
                                [self.dimz])
            self.sk_tf = tf.nn.softmax_cross_entropy_with_logits(
                labels=self.z_tf, logits=self.logits_tf)
            self.sk_r_tf = -1 * self.sk_tf
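
The discriminator assumes a split_observation_tf helper that separates an observation into a goal-independent part and an achieved-goal part for a given environment. A hypothetical sketch (the index layout is a placeholder; the real helper presumably hard-codes per-environment slices):

import tensorflow as tf

# Placeholder mapping from env name to the number of trailing observation
# dimensions that form the achieved goal (illustrative values only).
ACHIEVED_GOAL_DIMS = {'MyEnv-v0': 3}

def split_observation_tf(env_name, o):
    """Hypothetical sketch: split the last axis of an observation tensor into
    a goal-independent part and an achieved-goal part."""
    n = ACHIEVED_GOAL_DIMS[env_name]
    return o[..., :-n], o[..., -n:]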
Code Example #10
    def __init__(self,
                 inputs_tf,
                 dimo,
                 dimg,
                 dimu,
                 max_u,
                 o_stats,
                 g_stats,
                 hidden,
                 layers,
                 env=None,
                 n_arms=None,
                 normalized=False,
                 **kwargs):
        """The actor-critic network and related training code.

        Args:
            inputs_tf (dict of tensors): all necessary inputs for the network: the
                observation (o), the goal (g), and the action (u)
            dimo (int): the dimension of the observations
            dimg (int): the dimension of the goals
            dimu (int): the dimension of the actions
            max_u (float): the maximum magnitude of actions; action outputs will be scaled
                accordingly
            o_stats (baselines.her.Normalizer): normalizer for observations
            g_stats (baselines.her.Normalizer): normalizer for goals
            hidden (int): number of hidden units that should be used in hidden layers
            layers (int): number of hidden layers
        """
        self.o_tf = inputs_tf['o']
        self.g_tf = inputs_tf['g']
        self.u_tf = inputs_tf['u']

        # Un-linearize the observation
        self.env = env.unwrapped
        o_normed = self.o_stats.normalize(self.o_tf)
        obs_dict = self.env.reshaper.unlinearize(self.o_tf)
        obs_dict_normed = self.env.reshaper.unlinearize(o_normed)

        # Prepare inputs for actor and critic.
        if not normalized:
            o = tf.layers.Flatten()(obs_dict['observation'])
            g = self.g_tf
        else:
            o = tf.layers.Flatten()(obs_dict_normed['observation'])
            g = self.g_stats.normalize(self.g_tf)
            #g = tf.stop_gradient(g)

        input_pi = tf.concat(axis=1, values=[o, g])  # for actor

        # Networks.
        with tf.variable_scope('pi'):
            self.pi_tf = self.max_u * tf.tanh(
                nn(input_pi, [self.hidden] * self.layers + [self.dimu]))

        with tf.variable_scope('Q'):
            # for policy training
            input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u])
            self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
            # for critic training
            input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u])
            self._input_Q = input_Q  # exposed for tests
            self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1],
                           reuse=True)

        total_params()
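
Code Examples #10 and #11 end with a call to total_params(), which is not defined in the excerpts. A plausible sketch of such a utility (an assumption, not the project's actual code):

import numpy as np
import tensorflow as tf

def total_params():
    """Plausible sketch: print the number of trainable parameters in the
    current default graph."""
    n = sum(int(np.prod(v.get_shape().as_list())) for v in tf.trainable_variables())
    print('Total trainable parameters:', n)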
Code Example #11
    def __init__(self,
                 inputs_tf,
                 dimo,
                 dimg,
                 dimu,
                 max_u,
                 o_stats,
                 g_stats,
                 hidden,
                 layers,
                 n_arms,
                 learn_kin=False,
                 conn_type='sums',
                 env=None,
                 normalized=False,
                 **kwargs):
        """The actor-critic network and related training code.

        Args:
            inputs_tf (dict of tensors): all necessary inputs for the network: the
                observation (o), the goal (g), and the action (u)
            dimo (int): the dimension of the observations
            dimg (int): the dimension of the goals
            dimu (int): the dimension of the actions
            max_u (float): the maximum magnitude of actions; action outputs will be scaled
                accordingly
            o_stats (baselines.her.Normalizer): normalizer for observations
            g_stats (baselines.her.Normalizer): normalizer for goals
            hidden (int): number of hidden units that should be used in hidden layers
            layers (int): number of hidden layers
        """
        self.o_tf = inputs_tf['o']
        self.g_tf = inputs_tf['g']
        self.u_tf = inputs_tf['u']

        # Calculate the gradients of g prior to normalization
        loss = calculate_loss(self.g_tf)
        dl_dg = tf.gradients(loss, self.g_tf)
        # import pdb; pdb.set_trace()  # debugging breakpoint

        # Access to the environment type
        self.env = env.unwrapped

        # N-Arms
        self.n_arms = n_arms

        # Un-linearize the observation
        self.env = env.unwrapped

        # Reshape inputs for the number of arms
        u = narm_reshape(self.u_tf, n_arms)

        # Normalize the observations and gradients
        observations = self.o_tf
        if normalized:
            observations = self.o_stats.normalize(observations)

        # Extract observations specifics
        obs_dict = self.env.reshaper.unlinearize(observations)
        o = obs_dict['observation']
        gradL = [obs_dict['jacp{}'.format(i)] for i in range(n_arms)]

        # import pdb; pdb.set_trace()  # debugging breakpoint

        ########################################################################
        # Solve a quadratic to get equal no of params #FIXME not working right
        ########################################################################
        hidden = solve_quadratic(self.layers,
                                 self.dimo,
                                 self.dimg,
                                 self.dimu,
                                 o2=o[:, 0].shape.as_list()[-1],
                                 g2=1,
                                 u2=u[:, 0].shape.as_list()[-1],
                                 H=self.hidden,
                                 n=n_arms) - 1

        ########################################################################

        # Outputs
        pi_tfs = [None] * n_arms
        Q_pi_tfs = [None] * n_arms
        Q_tfs = [None] * n_arms

        for i in range(n_arms):
            # Observations and actions
            o_i = o[:, i]
            u_i = u[:, i]

            # Differentiation chain for method
            g_i = gradL[i]

            # Input Pi
            input_pis_i = tf.concat(axis=1, values=[o_i, g_i])

            with tf.variable_scope('pi{}'.format(i)):
                pi_tfs[i] = self.max_u * tf.tanh(
                    nn(input_pis_i, [hidden] * self.layers + [1]))

            with tf.variable_scope('Q{}'.format(i)):
                # for policy training
                input_Q_1_i = tf.concat(
                    axis=1, values=[o_i, g_i, pi_tfs[i] / self.max_u])
                Q_pi_tfs[i] = nn(input_Q_1_i, [hidden] * self.layers + [1])

                # for critic training
                input_Q_2_i = tf.concat(axis=1,
                                        values=[o_i, g_i, u_i / self.max_u])
                Q_tfs[i] = nn(input_Q_2_i, [hidden] * self.layers + [1],
                              reuse=True)
            #total_params()
            #tp = 2*(self.layers-1)*hidden*hidden + (2*(2 + 1 + 1 + self.layers) + 1)*hidden + (1 + 1)

        with tf.variable_scope('pi'):
            self.pi_tf = tf.concat(axis=1, values=pi_tfs)

        with tf.variable_scope('Q'):
            # for policy training
            self.Q_pi_tf = sum(Q_pi_tfs)
            self.Q_tf = sum(Q_tfs)

        total_params()
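
Code Example #11 also calls narm_reshape and solve_quadratic, which are not shown. A hypothetical sketch of narm_reshape, consistent with how u[:, i] is indexed above (solve_quadratic, which sizes the per-arm hidden layers, is omitted):

import tensorflow as tf

def narm_reshape(x, n_arms):
    """Hypothetical sketch: reshape a flat (batch, n_arms * dim) tensor into
    per-arm slices of shape (batch, n_arms, dim)."""
    total_dim = x.get_shape().as_list()[-1]
    assert total_dim % n_arms == 0
    return tf.reshape(x, [-1, n_arms, total_dim // n_arms])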