def _build(self, state_input, name=None):
    """Build the dueling Q-function model given input placeholder(s).

    Builds separate action-value and state-value MLP heads over the same
    state input and combines them into Q-values.

    Args:
        state_input (tf.Tensor): Tensor input for state.
        name (str): Inner model name, also the variable scope of the
            inner model, if exist. One example is
            metarl.tf.models.Sequential.

    Return:
        tf.Tensor: Tensor output of the model (Q-values per action).
    """
    del name
    action_out = mlp(input_var=state_input,
                     output_dim=self._output_dim,
                     hidden_sizes=self._hidden_sizes,
                     name='action_value',
                     hidden_nonlinearity=self._hidden_nonlinearity,
                     hidden_w_init=self._hidden_w_init,
                     hidden_b_init=self._hidden_b_init,
                     output_nonlinearity=self._output_nonlinearity,
                     output_w_init=self._output_w_init,
                     output_b_init=self._output_b_init,
                     layer_normalization=self._layer_normalization)
    state_out = mlp(input_var=state_input,
                    output_dim=1,
                    hidden_sizes=self._hidden_sizes,
                    name='state_value',
                    hidden_nonlinearity=self._hidden_nonlinearity,
                    hidden_w_init=self._hidden_w_init,
                    hidden_b_init=self._hidden_b_init,
                    output_nonlinearity=self._output_nonlinearity,
                    output_w_init=self._output_w_init,
                    output_b_init=self._output_b_init,
                    layer_normalization=self._layer_normalization)
    action_out_mean = tf.reduce_mean(action_out, 1)
    # calculate the advantage of performing certain action
    # over other action in a particular state; subtracting the mean
    # keeps the value/advantage decomposition identifiable
    action_out_advantage = action_out - tf.expand_dims(action_out_mean, 1)
    # Q(s, a) = V(s) + A(s, a)
    q_func_out = state_out + action_out_advantage
    return q_func_out
def _build(self, state_input, name=None):
    """Build model given input placeholder(s).

    Args:
        state_input (tf.Tensor): Tensor input for state.
        name (str): Inner model name, also the variable scope of the
            inner model, if exist. One example is
            metarl.tf.models.Sequential.

    Return:
        tf.Tensor: Tensor output of the model.
    """
    del name
    # Collect the configuration in one place, then delegate to mlp().
    mlp_kwargs = dict(
        input_var=state_input,
        output_dim=self._output_dim,
        hidden_sizes=self._hidden_sizes,
        name='mlp',
        hidden_nonlinearity=self._hidden_nonlinearity,
        hidden_w_init=self._hidden_w_init,
        hidden_b_init=self._hidden_b_init,
        output_nonlinearity=self._output_nonlinearity,
        output_w_init=self._output_w_init,
        output_b_init=self._output_b_init,
        layer_normalization=self._layer_normalization)
    return mlp(**mlp_kwargs)
def test_different_mlp(self):
    """A separately-named MLP must not share weights with the default one."""
    # Build a second MLP under the same outer scope but a new inner name.
    with tf.compat.v1.variable_scope('MLP_Concat'):
        self.mlp_different_copy = mlp(
            input_var=self._obs_input,
            output_dim=self._output_shape,
            hidden_sizes=(32, 32),
            input_var2=self._act_input,
            concat_layer=0,
            hidden_nonlinearity=self.hidden_nonlinearity,
            name='mlp2')

    # Initialize the freshly created variables.
    self.sess.run(tf.compat.v1.global_variables_initializer())

    # Perturb the default network's first-layer kernel; only 'mlp1'
    # should see the change, so the two outputs must differ.
    with tf.compat.v1.variable_scope('MLP_Concat', reuse=True):
        kernel = tf.compat.v1.get_variable('mlp1/hidden_0/kernel')
        self.sess.run(kernel.assign(kernel + 1))

    feed = {
        self._obs_input: self.obs_input,
        self._act_input: self.act_input
    }
    default_out = self.sess.run(self.mlp_f, feed_dict=feed)
    copy_out = self.sess.run(self.mlp_different_copy, feed_dict=feed)
    assert not np.array_equal(default_out, copy_out)
def test_concat_layer(self, concat_idx):
    """Concatenating input_var2 widens exactly the chosen layer."""
    with tf.compat.v1.variable_scope('mlp_concat_test'):
        _ = mlp(input_var=self._obs_input,
                output_dim=self._output_shape,
                hidden_sizes=(64, 32),
                input_var2=self._act_input,
                concat_layer=concat_idx,
                hidden_nonlinearity=self.hidden_nonlinearity,
                name='mlp2')

    obs_dim = self._obs_input.shape[1].value
    act_dim = self._act_input.shape[1].value

    # Only the layer that receives the concatenation grows by act_dim.
    expected_units = [obs_dim, 64, 32]
    expected_units[concat_idx] += act_dim

    with tf.compat.v1.variable_scope('mlp_concat_test', reuse=True):
        kernels = [
            tf.compat.v1.get_variable('mlp2/hidden_0/kernel'),
            tf.compat.v1.get_variable('mlp2/hidden_1/kernel'),
            tf.compat.v1.get_variable('mlp2/output/kernel'),
        ]
        # Kernel fan-in (dimension 0) reveals each layer's input width.
        actual_units = [k.shape[0].value for k in kernels]

    assert np.array_equal(expected_units, actual_units)
def test_multiple_same_mlp(self):
    """Rebuilding with reuse=True and the same name shares the weights."""
    # Build a second graph over the existing 'mlp1' variables.
    with tf.compat.v1.variable_scope('MLP_Concat', reuse=True):
        self.mlp_same_copy = mlp(
            input_var=self._obs_input,
            output_dim=self._output_shape,
            hidden_sizes=(32, 32),
            input_var2=self._act_input,
            concat_layer=0,
            hidden_nonlinearity=self.hidden_nonlinearity,
            name='mlp1')

    # Perturb the shared first-layer kernel; both graphs see the change,
    # so their outputs must stay equal.
    with tf.compat.v1.variable_scope('MLP_Concat', reuse=True):
        kernel = tf.compat.v1.get_variable('mlp1/hidden_0/kernel')
        self.sess.run(kernel.assign(kernel + 1))

    feed = {
        self._obs_input: self.obs_input,
        self._act_input: self.act_input
    }
    out_default = self.sess.run(self.mlp_f, feed_dict=feed)
    out_reused = self.sess.run(self.mlp_same_copy, feed_dict=feed)
    np.testing.assert_array_almost_equal(out_default, out_reused)
def test_invalid_concat_args(self, concat_idx):
    """Without input_var2, concat_layer must have no effect on widths."""
    with tf.compat.v1.variable_scope('mlp_concat_test'):
        _ = mlp(input_var=self._obs_input,
                output_dim=self._output_shape,
                hidden_sizes=(64, 32),
                concat_layer=concat_idx,
                hidden_nonlinearity=self.hidden_nonlinearity,
                name='mlp_no_input2')

    obs_dim = self._obs_input.shape[1].value

    # concat_layer argument should be silently ignored.
    expected_units = [obs_dim, 64, 32]

    with tf.compat.v1.variable_scope('mlp_concat_test', reuse=True):
        kernels = [
            tf.compat.v1.get_variable('mlp_no_input2/hidden_0/kernel'),
            tf.compat.v1.get_variable('mlp_no_input2/hidden_1/kernel'),
            tf.compat.v1.get_variable('mlp_no_input2/output/kernel'),
        ]
        # Kernel fan-in (dimension 0) reveals each layer's input width.
        actual_units = [k.shape[0].value for k in kernels]

    assert np.array_equal(expected_units, actual_units)
def setup_method(self):
    """Create fixture inputs, placeholders and the default concat MLP."""
    super(TestMLPConcat, self).setup_method()
    self.obs_input = np.array([[1, 2, 3, 4]])
    self.act_input = np.array([[1, 2, 3, 4]])
    obs_shape = self.obs_input.shape[1:]  # (4,)
    act_shape = self.act_input.shape[1:]  # (4,)

    self.hidden_nonlinearity = tf.nn.relu
    self._obs_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, ) + obs_shape,
                                               name='input')
    self._act_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, ) + act_shape,
                                               name='input')
    self._output_shape = 2

    # Default MLP under test: concatenates the action input at layer 0.
    with tf.compat.v1.variable_scope('MLP_Concat'):
        self.mlp_f = mlp(input_var=self._obs_input,
                         output_dim=self._output_shape,
                         hidden_sizes=(32, 32),
                         input_var2=self._act_input,
                         concat_layer=0,
                         hidden_nonlinearity=self.hidden_nonlinearity,
                         name='mlp1')

    self.sess.run(tf.compat.v1.global_variables_initializer())
def test_layer_normalization(self):
    """Hand-computed layer-normalized forward pass matches mlp() output.

    Rebuilds the two hidden layers (dense -> relu -> layer norm) and the
    output layer manually from the mlp's own variables, then checks the
    result against the mlp graph itself.
    """
    # Create a mlp with layer normalization
    with tf.compat.v1.variable_scope('MLP_Concat'):
        self.mlp_f_w_n = mlp(input_var=self._obs_input,
                             output_dim=self._output_shape,
                             hidden_sizes=(32, 32),
                             input_var2=self._act_input,
                             concat_layer=0,
                             hidden_nonlinearity=self.hidden_nonlinearity,
                             name='mlp2',
                             layer_normalization=True)

    # Initialize the new mlp variables
    self.sess.run(tf.compat.v1.global_variables_initializer())

    # Fetch the mlp's own kernels/biases and the layer-norm beta/gamma
    # pairs so the forward pass can be replicated exactly.
    with tf.compat.v1.variable_scope('MLP_Concat', reuse=True):
        h1_w = tf.compat.v1.get_variable('mlp2/hidden_0/kernel')
        h1_b = tf.compat.v1.get_variable('mlp2/hidden_0/bias')
        h2_w = tf.compat.v1.get_variable('mlp2/hidden_1/kernel')
        h2_b = tf.compat.v1.get_variable('mlp2/hidden_1/bias')
        out_w = tf.compat.v1.get_variable('mlp2/output/kernel')
        out_b = tf.compat.v1.get_variable('mlp2/output/bias')
        beta_1 = tf.compat.v1.get_variable('mlp2/LayerNorm/beta')
        gamma_1 = tf.compat.v1.get_variable('mlp2/LayerNorm/gamma')
        beta_2 = tf.compat.v1.get_variable('mlp2/LayerNorm_1/beta')
        gamma_2 = tf.compat.v1.get_variable('mlp2/LayerNorm_1/gamma')

    # First layer: concat_layer=0, so obs and act are joined before the
    # first dense layer.
    y = tf.matmul(tf.concat([self._obs_input, self._act_input], 1),
                  h1_w) + h1_b
    y = self.hidden_nonlinearity(y)
    # NOTE(review): keep_dims is the TF1 spelling (renamed keepdims in
    # TF2); assumes the pinned TF version still accepts it — confirm.
    mean, variance = tf.nn.moments(y, [1], keep_dims=True)
    # 1e-12 matches the layer-norm epsilon assumed for mlp()'s
    # implementation — presumably tf.contrib/tf.keras defaults; verify.
    normalized_y = (y - mean) / tf.sqrt(variance + 1e-12)
    y_out = normalized_y * gamma_1 + beta_1

    # Second layer
    y = tf.matmul(y_out, h2_w) + h2_b
    y = self.hidden_nonlinearity(y)
    mean, variance = tf.nn.moments(y, [1], keep_dims=True)
    normalized_y = (y - mean) / tf.sqrt(variance + 1e-12)
    y_out = normalized_y * gamma_2 + beta_2

    # Output layer: linear, no normalization.
    y = tf.matmul(y_out, out_w) + out_b

    out = self.sess.run(y,
                        feed_dict={
                            self._obs_input: self.obs_input,
                            self._act_input: self.act_input
                        })
    mlp_output = self.sess.run(self.mlp_f_w_n,
                               feed_dict={
                                   self._obs_input: self.obs_input,
                                   self._act_input: self.act_input
                               })
    np.testing.assert_array_almost_equal(out, mlp_output)
def _build(self, state_input, name=None):
    """Build model given input placeholder(s).

    Args:
        state_input (tf.Tensor): Tensor input for state.
        name (str): Inner model name, also the variable scope of the
            inner model, if exist. One example is
            metarl.tf.models.Sequential.

    Return:
        tf.Tensor: Tensor output of the model.
    """
    # 'name' is part of the Model._build signature but unused here;
    # deleted for consistency with the other _build implementations.
    del name
    return mlp(input_var=state_input,
               output_dim=self._output_dim,
               hidden_sizes=self._hidden_sizes,
               name='mlp',
               hidden_nonlinearity=self._hidden_nonlinearity,
               hidden_w_init=self._hidden_w_init,
               hidden_b_init=self._hidden_b_init,
               output_nonlinearity=self._output_nonlinearity,
               output_w_init=self._output_w_init,
               output_b_init=self._output_b_init,
               layer_normalization=self._layer_normalization)
def test_no_hidden(self, concat_idx):
    """With no hidden layers, the concat falls back to the input layer."""
    with tf.compat.v1.variable_scope('mlp_concat_test'):
        _ = mlp(input_var=self._obs_input,
                output_dim=self._output_shape,
                hidden_sizes=(),
                input_var2=self._act_input,
                concat_layer=concat_idx,
                hidden_nonlinearity=self.hidden_nonlinearity,
                name='mlp2')

    obs_dim = self._obs_input.shape[1].value
    act_dim = self._act_input.shape[1].value

    # concat_layer argument should be reset to point to input_var.
    expected_units = [obs_dim + act_dim]

    with tf.compat.v1.variable_scope('mlp_concat_test', reuse=True):
        out_kernel = tf.compat.v1.get_variable('mlp2/output/kernel')
        actual_units = [out_kernel.shape[0].value]

    assert np.array_equal(expected_units, actual_units)
def _build(self, state_input, name=None):
    """Build the mean and (parameterized) std networks of the policy.

    Args:
        state_input (tf.Tensor): Tensor input for state.
        name (str): Inner model name, also the variable scope of the
            inner model, if exist. One example is
            metarl.tf.models.Sequential.

    Return:
        tf.Tensor: Mean of the distribution.
        tf.Tensor: Log of standard deviation (after parameterization).
        tf.Tensor: Clipped std parameter (before parameterization).
        metarl.tf.distributions.DiagonalGaussian: Policy distribution.
    """
    action_dim = self._output_dim

    with tf.compat.v1.variable_scope('dist_params'):
        if self._std_share_network:
            # mean and std networks share an MLP
            # Output bias: zeros for the mean half, init_std_param for
            # the std half, so the shared head starts at the desired std.
            b = np.concatenate([
                np.zeros(action_dim),
                np.full(action_dim, self._init_std_param)
            ], axis=0)  # yapf: disable

            mean_std_network = mlp(
                state_input,
                output_dim=action_dim * 2,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                hidden_w_init=self._hidden_w_init,
                hidden_b_init=self._hidden_b_init,
                output_nonlinearity=self._output_nonlinearity,
                output_w_init=self._output_w_init,
                output_b_init=tf.constant_initializer(b),
                name='mean_std_network',
                layer_normalization=self._layer_normalization)
            # First action_dim outputs are the mean, the rest the std.
            with tf.compat.v1.variable_scope('mean_network'):
                mean_network = mean_std_network[..., :action_dim]
            with tf.compat.v1.variable_scope('log_std_network'):
                log_std_network = mean_std_network[..., action_dim:]
        else:
            # separate MLPs for mean and std networks
            # mean network
            mean_network = mlp(
                state_input,
                output_dim=action_dim,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                hidden_w_init=self._hidden_w_init,
                hidden_b_init=self._hidden_b_init,
                output_nonlinearity=self._output_nonlinearity,
                output_w_init=self._output_w_init,
                output_b_init=self._output_b_init,
                name='mean_network',
                layer_normalization=self._layer_normalization)

            # std network
            if self._adaptive_std:
                # State-dependent std, its own MLP.
                log_std_network = mlp(
                    state_input,
                    output_dim=action_dim,
                    hidden_sizes=self._std_hidden_sizes,
                    hidden_nonlinearity=self._std_hidden_nonlinearity,
                    hidden_w_init=self._std_hidden_w_init,
                    hidden_b_init=self._std_hidden_b_init,
                    output_nonlinearity=self._std_output_nonlinearity,
                    output_w_init=self._std_output_w_init,
                    output_b_init=tf.constant_initializer(
                        self._init_std_param),
                    name='log_std_network',
                    layer_normalization=self._layer_normalization)
            else:
                # State-independent std: a single (optionally trainable)
                # parameter vector broadcast over the batch.
                log_std_network = parameter(
                    input_var=state_input,
                    length=action_dim,
                    initializer=tf.constant_initializer(
                        self._init_std_param),
                    trainable=self._learn_std,
                    name='log_std_network')

    mean_var = mean_network
    std_param = log_std_network

    # Clip the std parameter (not the final log-std) to configured bounds.
    with tf.compat.v1.variable_scope('std_limits'):
        if self._min_std_param is not None:
            std_param = tf.maximum(std_param, self._min_std_param)
        if self._max_std_param is not None:
            std_param = tf.minimum(std_param, self._max_std_param)

    with tf.compat.v1.variable_scope('std_parameterization'):
        # build std_var with std parameterization
        if self._std_parameterization == 'exp':
            # std_param already represents log(std).
            log_std_var = std_param
        else:
            # we know it must be softplus here:
            # std = log(1 + exp(param)), hence log_std = log(softplus).
            log_std_var = tf.math.log(tf.math.log(1. + tf.exp(std_param)))

    dist = DiagonalGaussian(self._output_dim)

    return mean_var, log_std_var, std_param, dist
def _build(self, state_input, name=None):
    """Build model.

    Args:
        state_input (tf.Tensor): Entire time-series observation input.
        name (str): Inner model name, also the variable scope of the
            inner model, if exist. One example is
            metarl.tf.models.Sequential.

    Returns:
        tfp.distributions.MultivariateNormalDiag: Distribution.
        tf.Tensor: Mean.
        tf.Tensor: Log of standard deviation.
    """
    del name
    action_dim = self._output_dim

    with tf.compat.v1.variable_scope('dist_params'):
        if self._std_share_network:
            # mean and std networks share an MLP
            # Output bias: zeros for the mean half, init_std_param for
            # the std half, so the shared head starts at the desired std.
            b = np.concatenate([
                np.zeros(action_dim),
                np.full(action_dim, self._init_std_param)
            ], axis=0)  # yapf: disable

            mean_std_network = mlp(
                state_input,
                output_dim=action_dim * 2,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                hidden_w_init=self._hidden_w_init,
                hidden_b_init=self._hidden_b_init,
                output_nonlinearity=self._output_nonlinearity,
                output_w_init=self._output_w_init,
                output_b_init=tf.constant_initializer(b),
                name='mean_std_network',
                layer_normalization=self._layer_normalization)
            # First action_dim outputs are the mean, the rest the std.
            with tf.compat.v1.variable_scope('mean_network'):
                mean_network = mean_std_network[..., :action_dim]
            with tf.compat.v1.variable_scope('log_std_network'):
                log_std_network = mean_std_network[..., action_dim:]
        else:
            # separate MLPs for mean and std networks
            # mean network
            mean_network = mlp(
                state_input,
                output_dim=action_dim,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                hidden_w_init=self._hidden_w_init,
                hidden_b_init=self._hidden_b_init,
                output_nonlinearity=self._output_nonlinearity,
                output_w_init=self._output_w_init,
                output_b_init=self._output_b_init,
                name='mean_network',
                layer_normalization=self._layer_normalization)

            # std network
            if self._adaptive_std:
                # State-dependent std, its own MLP.
                log_std_network = mlp(
                    state_input,
                    output_dim=action_dim,
                    hidden_sizes=self._std_hidden_sizes,
                    hidden_nonlinearity=self._std_hidden_nonlinearity,
                    hidden_w_init=self._std_hidden_w_init,
                    hidden_b_init=self._std_hidden_b_init,
                    output_nonlinearity=self._std_output_nonlinearity,
                    output_w_init=self._std_output_w_init,
                    output_b_init=tf.constant_initializer(
                        self._init_std_param),
                    name='log_std_network',
                    layer_normalization=self._layer_normalization)
            else:
                # State-independent std parameter vector.
                log_std_network = parameter(
                    input_var=state_input,
                    length=action_dim,
                    initializer=tf.constant_initializer(
                        self._init_std_param),
                    trainable=self._learn_std,
                    name='log_std_network')
                # Insert an axis so the parameter broadcasts against the
                # time-series mean — presumably (batch, time, dim);
                # TODO(review): confirm against callers.
                log_std_network = tf.expand_dims(log_std_network, 1)

    mean_var = mean_network
    std_param = log_std_network

    # Clip the std parameter (not the final log-std) to configured bounds.
    with tf.compat.v1.variable_scope('std_limits'):
        if self._min_std_param is not None:
            std_param = tf.maximum(std_param, self._min_std_param)
        if self._max_std_param is not None:
            std_param = tf.minimum(std_param, self._max_std_param)

    with tf.compat.v1.variable_scope('std_parameterization'):
        # build std_var with std parameterization
        if self._std_parameterization == 'exp':
            # std_param already represents log(std).
            log_std_var = std_param
        else:
            # we know it must be softplus here:
            # std = log(1 + exp(param)), hence log_std = log(softplus).
            log_std_var = tf.math.log(tf.math.log(1. + tf.exp(std_param)))

    return tfp.distributions.MultivariateNormalDiag(
        loc=mean_var,
        scale_diag=tf.exp(log_std_var)), mean_var, log_std_var
def _build(self, state_input, name=None):
    """Build model given input placeholder(s).

    Args:
        state_input (tf.Tensor): Place holder for state input.
        name (str): Inner model name, also the variable scope of the
            inner model, if exist. One example is
            metarl.tf.models.Sequential.

    Return:
        tf.Tensor: Sampled action.
        tf.Tensor: Mean.
        tf.Tensor: Parameterized log_std.
        tf.Tensor: log_std.
        metarl.tf.distributions.DiagonalGaussian: Policy distribution.
    """
    del name
    action_dim = self._output_dim

    with tf.compat.v1.variable_scope('dist_params'):
        if self._std_share_network:
            # mean and std networks share a CNN
            # Output bias: zeros for the mean half, init_std_param for
            # the std half, so the shared head starts at the desired std.
            b = np.concatenate([
                np.zeros(action_dim),
                np.full(action_dim, self._init_std_param)
            ], axis=0)  # yapf: disable

            # Shared CNN feature extractor feeding a single MLP head.
            mean_std_conv = cnn(
                input_var=state_input,
                filters=self._filters,
                hidden_nonlinearity=self._hidden_nonlinearity,
                hidden_w_init=self._hidden_w_init,
                hidden_b_init=self._hidden_b_init,
                strides=self._strides,
                padding=self._padding,
                name='mean_std_cnn')
            mean_std_network = mlp(
                mean_std_conv,
                output_dim=action_dim * 2,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                hidden_w_init=self._hidden_w_init,
                hidden_b_init=self._hidden_b_init,
                output_nonlinearity=self._output_nonlinearity,
                output_w_init=self._output_w_init,
                output_b_init=tf.constant_initializer(b),
                name='mean_std_network',
                layer_normalization=self._layer_normalization)
            # First action_dim outputs are the mean, the rest the std.
            with tf.compat.v1.variable_scope('mean_network'):
                mean_network = mean_std_network[..., :action_dim]
            with tf.compat.v1.variable_scope('log_std_network'):
                log_std_network = mean_std_network[..., action_dim:]
        else:
            # separate MLPs for mean and std networks
            # mean network
            mean_conv = cnn(input_var=state_input,
                            filters=self._filters,
                            hidden_nonlinearity=self._hidden_nonlinearity,
                            hidden_w_init=self._hidden_w_init,
                            hidden_b_init=self._hidden_b_init,
                            strides=self._strides,
                            padding=self._padding,
                            name='mean_cnn')
            mean_network = mlp(
                mean_conv,
                output_dim=action_dim,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                hidden_w_init=self._hidden_w_init,
                hidden_b_init=self._hidden_b_init,
                output_nonlinearity=self._output_nonlinearity,
                output_w_init=self._output_w_init,
                output_b_init=self._output_b_init,
                name='mean_network',
                layer_normalization=self._layer_normalization)

            # std network
            if self._adaptive_std:
                # State-dependent std: its own CNN + MLP stack.
                log_std_conv = cnn(
                    input_var=state_input,
                    filters=self._std_filters,
                    hidden_nonlinearity=self._std_hidden_nonlinearity,
                    hidden_w_init=self._std_hidden_w_init,
                    hidden_b_init=self._std_hidden_b_init,
                    strides=self._std_strides,
                    padding=self._std_padding,
                    name='log_std_cnn')
                log_std_network = mlp(
                    log_std_conv,
                    output_dim=action_dim,
                    hidden_sizes=self._std_hidden_sizes,
                    hidden_nonlinearity=self._std_hidden_nonlinearity,
                    hidden_w_init=self._std_hidden_w_init,
                    hidden_b_init=self._std_hidden_b_init,
                    output_nonlinearity=self._std_output_nonlinearity,
                    output_w_init=self._std_output_w_init,
                    output_b_init=tf.constant_initializer(
                        self._init_std_param),
                    name='log_std_network',
                    layer_normalization=self._layer_normalization)
            else:
                # State-independent std parameter vector.
                log_std_network = parameter(
                    input_var=state_input,
                    length=action_dim,
                    initializer=tf.constant_initializer(
                        self._init_std_param),
                    trainable=self._learn_std,
                    name='log_std_network')

    mean_var = mean_network
    std_param = log_std_network

    # Clip the std parameter (not the final log-std) to configured bounds.
    with tf.compat.v1.variable_scope('std_limits'):
        if self._min_std_param is not None:
            std_param = tf.maximum(std_param, self._min_std_param)
        if self._max_std_param is not None:
            std_param = tf.minimum(std_param, self._max_std_param)

    with tf.compat.v1.variable_scope('std_parameterization'):
        # build std_var with std parameterization
        if self._std_parameterization == 'exp':
            # std_param already represents log(std).
            log_std_var = std_param
        else:
            # we know it must be softplus here:
            # std = log(1 + exp(param)), hence log_std = log(softplus).
            log_std_var = tf.math.log(tf.math.log(1.
                                                  + tf.exp(std_param)))

    dist = DiagonalGaussian(self._output_dim)
    # Reparameterized sample: a = mean + std * eps, eps ~ N(0, I).
    rnd = tf.random.normal(shape=mean_var.get_shape().as_list()[1:])
    action_var = rnd * tf.exp(log_std_var) + mean_var

    return action_var, mean_var, log_std_var, std_param, dist
def _build(self, obs_input, name=None):
    """Build a single MLP head over the observation input.

    Args:
        obs_input (tf.Tensor): Observation input tensor.
        name (str): Inner model name; unused here.

    Return:
        tf.Tensor: Output of the MLP.
    """
    del name
    # NOTE(review): the network scope is 'state' even though the result
    # is used as an action — presumably intentional (e.g. to exercise
    # scope sharing in tests); confirm against callers.
    return mlp(obs_input, self._output_dim, self._hidden_sizes, 'state')
def _build(self, obs_input, name=None):
    """Build two parallel MLP heads over the observation input.

    Args:
        obs_input (tf.Tensor): Observation input tensor.
        name (str): Inner model name; unused here.

    Return:
        tf.Tensor: Output of the 'state' MLP.
        tf.Tensor: Output of the 'action' MLP.
    """
    # Build 'state' first, then 'action', preserving variable creation
    # order in the graph.
    heads = [
        mlp(obs_input, self._output_dim, self._hidden_sizes, head_name)
        for head_name in ('state', 'action')
    ]
    return heads[0], heads[1]