def _build(self, state_input, step_input, hidden_input, name=None):
    action_dim = self._output_dim

    with tf.variable_scope('dist_params'):
        if self._std_share_network:
            # mean and std networks share a GRU
            (outputs, step_outputs, step_hidden, hidden_init_var) = gru(
                name='mean_std_network',
                gru_cell=self._mean_std_gru_cell,
                all_input_var=state_input,
                step_input_var=step_input,
                step_hidden_var=hidden_input,
                hidden_state_init=self._hidden_state_init,
                hidden_state_init_trainable=self._hidden_state_init_trainable,
                output_nonlinearity_layer=self._mean_std_output_nonlinearity_layer)
            with tf.variable_scope('mean_network'):
                mean_var = outputs[..., :action_dim]
                step_mean_var = step_outputs[..., :action_dim]
            with tf.variable_scope('log_std_network'):
                log_std_var = outputs[..., action_dim:]
                step_log_std_var = step_outputs[..., action_dim:]
        else:
            # separate networks for mean and std
            # mean network
            (mean_var, step_mean_var, step_hidden, hidden_init_var) = gru(
                name='mean_network',
                gru_cell=self._mean_gru_cell,
                all_input_var=state_input,
                step_input_var=step_input,
                step_hidden_var=hidden_input,
                hidden_state_init=self._hidden_state_init,
                hidden_state_init_trainable=self._hidden_state_init_trainable,
                output_nonlinearity_layer=self._mean_output_nonlinearity_layer)
            # log-std is a state-independent trainable parameter,
            # broadcast to the batch shapes of both inputs
            log_std_var = parameter(
                state_input,
                length=action_dim,
                initializer=tf.constant_initializer(self._init_std_param),
                trainable=self._learn_std,
                name='log_std_param')
            step_log_std_var = parameter(
                step_input,
                length=action_dim,
                initializer=tf.constant_initializer(self._init_std_param),
                trainable=self._learn_std,
                name='step_log_std_param')

    dist = DiagonalGaussian(self._output_dim)

    # sample one action with the reparameterization trick
    rnd = tf.random.normal(shape=step_mean_var.get_shape().as_list()[1:])
    action_var = rnd * tf.exp(step_log_std_var) + step_mean_var

    return (action_var, mean_var, step_mean_var, log_std_var,
            step_log_std_var, step_hidden, hidden_init_var, dist)
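# A minimal NumPy sketch (not from the source) of the head split used
# above: with a shared network the last axis holds 2 * action_dim
# units, and the first half is read as the mean, the second half as
# the log-std.
import numpy as np

action_dim = 2
outputs = np.arange(8.0).reshape(2, 2 * action_dim)  # batch of 2 steps
mean = outputs[..., :action_dim]      # first half of the last axis
log_std = outputs[..., action_dim:]   # second half of the last axis
assert mean.shape == log_std.shape == (2, action_dim)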
def test_param(self):
    param = parameter(input_var=self.input_vars,
                      length=3,
                      initializer=tf.constant_initializer(
                          self.initial_params))
    self.sess.run(tf.global_variables_initializer())
    p = self.sess.run(param, feed_dict=self.feed_dict)

    assert p.shape == (5, 3)
    assert np.all(p == self.initial_params)
def test_param(self):
    input_vars = tf.placeholder(shape=[None, 2, 3, 4], dtype=tf.float32)
    initial_params = np.array([48, 21, 33])

    params = parameter(input_var=input_vars,
                       length=3,
                       initializer=tf.constant_initializer(initial_params))

    data = np.zeros(shape=[5, 2, 3, 4])
    feed_dict = {input_vars: data}

    self.sess.run(tf.global_variables_initializer())

    p = self.sess.run(params, feed_dict=feed_dict)

    assert p.shape[:-1] == data.shape[:-1]
    assert np.all(p[0, 0, 0, :] == initial_params)
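# A hedged sketch of the contract the two tests above exercise: as
# assumed here, parameter() creates one trainable vector of size
# `length` and broadcasts it across every leading (batch) dimension of
# input_var. The real implementation may differ in detail.
import tensorflow as tf

def parameter_sketch(input_var, length, initializer, trainable=True,
                     name='parameter'):
    with tf.variable_scope(name):
        p = tf.get_variable('parameter',
                            shape=(length, ),
                            initializer=initializer,
                            trainable=trainable)
        # build a (..., 1) tensor of ones so p broadcasts to (..., length)
        batch_shape = tf.shape(input_var)[:-1]
        ones = tf.ones(tf.concat([batch_shape, [1]], axis=0), dtype=p.dtype)
        return ones * p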
def _build(self, state_input):
    action_dim = self._output_dim

    with tf.variable_scope('dist_params'):
        if self._std_share_network:
            # mean and std networks share an MLP
            b = np.concatenate([
                np.zeros(action_dim),
                np.full(action_dim, self._init_std_param)
            ], axis=0)  # yapf: disable
            b = tf.constant_initializer(b)
            mean_std_network = mlp(
                state_input,
                output_dim=action_dim * 2,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                output_nonlinearity=self._output_nonlinearity,
                output_b_init=b,
                name='mean_std_network')
            with tf.variable_scope('mean_network'):
                mean_network = mean_std_network[..., :action_dim]
            with tf.variable_scope('std_network'):
                std_network = mean_std_network[..., action_dim:]
        else:
            # separate MLPs for mean and std networks
            # mean network
            mean_network = mlp(
                state_input,
                output_dim=action_dim,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                output_nonlinearity=self._output_nonlinearity,
                name='mean_network')

            # std network
            if self._adaptive_std:
                b = tf.constant_initializer(self._init_std_param)
                std_network = mlp(
                    state_input,
                    output_dim=action_dim,
                    hidden_sizes=self._std_hidden_sizes,
                    hidden_nonlinearity=self._std_hidden_nonlinearity,
                    output_nonlinearity=self._std_output_nonlinearity,
                    output_b_init=b,
                    name='std_network')
            else:
                p = tf.constant_initializer(self._init_std_param)
                std_network = parameter(state_input,
                                        length=action_dim,
                                        initializer=p,
                                        trainable=self._learn_std,
                                        name='std_network')

    mean_var = mean_network
    std_param_var = std_network

    with tf.variable_scope('std_parameterization'):
        # build std_var with std parameterization
        if self._std_parameterization == 'exp':
            pass  # the network output already parameterizes the std
        elif self._std_parameterization == 'softplus':
            std_param_var = tf.log(1. + tf.exp(std_param_var))
        else:
            raise NotImplementedError

    with tf.variable_scope('std_limits'):
        # clip only when a limit is given; compare against None so a
        # limit of 0. is not silently skipped, and keep std_var defined
        # when no limits are set at all
        std_var = std_param_var
        if self._min_std_param is not None:
            std_var = tf.maximum(std_var, self._min_std_param)
        if self._max_std_param is not None:
            std_var = tf.minimum(std_var, self._max_std_param)

    dist = DiagonalGaussian(action_dim)

    # sample with the reparameterization trick; std_var is treated as
    # log(std) here
    rnd = tf.random.normal(shape=mean_var.get_shape().as_list()[1:],
                           seed=deterministic.get_seed())
    action_var = rnd * tf.exp(std_var) + mean_var

    return action_var, mean_var, std_var, std_param_var, dist
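# A small worked example (NumPy, not from the source) of the
# 'std_limits' clipping above: values below the minimum or above the
# maximum are clamped, everything else passes through unchanged.
import numpy as np

std_param = np.array([-10.0, 0.5, 10.0])
min_std_param, max_std_param = -2.0, 2.0
clipped = np.minimum(np.maximum(std_param, min_std_param), max_std_param)
assert np.all(clipped == np.array([-2.0, 0.5, 2.0]))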
def _build(self, state_input):
    action_dim = self._output_dim

    with tf.variable_scope('dist_params'):
        if self._std_share_network:
            # mean and std networks share an MLP
            b = np.concatenate([
                np.zeros(action_dim),
                np.full(action_dim, self._init_std_param)
            ], axis=0)  # yapf: disable

            mean_std_network = mlp(
                state_input,
                output_dim=action_dim * 2,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                hidden_w_init=self._hidden_w_init,
                hidden_b_init=self._hidden_b_init,
                output_nonlinearity=self._output_nonlinearity,
                output_w_init=self._output_w_init,
                output_b_init=tf.constant_initializer(b),
                name='mean_std_network',
                layer_normalization=self._layer_normalization)
            with tf.variable_scope('mean_network'):
                mean_network = mean_std_network[..., :action_dim]
            with tf.variable_scope('log_std_network'):
                log_std_network = mean_std_network[..., action_dim:]
        else:
            # separate MLPs for mean and std networks
            # mean network
            mean_network = mlp(
                state_input,
                output_dim=action_dim,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                hidden_w_init=self._hidden_w_init,
                hidden_b_init=self._hidden_b_init,
                output_nonlinearity=self._output_nonlinearity,
                output_w_init=self._output_w_init,
                output_b_init=self._output_b_init,
                name='mean_network',
                layer_normalization=self._layer_normalization)

            # std network
            if self._adaptive_std:
                log_std_network = mlp(
                    state_input,
                    output_dim=action_dim,
                    hidden_sizes=self._std_hidden_sizes,
                    hidden_nonlinearity=self._std_hidden_nonlinearity,
                    hidden_w_init=self._std_hidden_w_init,
                    hidden_b_init=self._std_hidden_b_init,
                    output_nonlinearity=self._std_output_nonlinearity,
                    output_w_init=self._std_output_w_init,
                    output_b_init=tf.constant_initializer(
                        self._init_std_param),
                    name='log_std_network',
                    layer_normalization=self._layer_normalization)
            else:
                log_std_network = parameter(
                    state_input,
                    length=action_dim,
                    initializer=tf.constant_initializer(
                        self._init_std_param),
                    trainable=self._learn_std,
                    name='log_std_network')

    mean_var = mean_network
    std_param = log_std_network

    with tf.variable_scope('std_parameterization'):
        # build std_var with std parameterization
        if self._std_parameterization == 'exp':
            log_std_var = std_param
        else:  # we know it must be softplus here
            log_std_var = tf.log(1. + tf.exp(std_param))

    with tf.variable_scope('std_limits'):
        if self._min_std_param is not None:
            log_std_var = tf.maximum(log_std_var, self._min_std_param)
        if self._max_std_param is not None:
            log_std_var = tf.minimum(log_std_var, self._max_std_param)

    dist = DiagonalGaussian(self._output_dim)

    rnd = tf.random.normal(shape=mean_var.get_shape().as_list()[1:],
                           seed=deterministic.get_seed())
    action_var = rnd * tf.exp(log_std_var) + mean_var

    return action_var, mean_var, log_std_var, std_param, dist
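# The sampling line above is the reparameterization trick: draw unit
# Gaussian noise, scale by the std, and shift by the mean. A NumPy
# sketch (values are illustrative only):
import numpy as np

rng = np.random.default_rng(0)
mean = np.array([0.0, 1.0])
log_std = np.array([-1.0, 0.0])
rnd = rng.standard_normal(mean.shape)
action = rnd * np.exp(log_std) + mean  # ~ N(mean, exp(log_std)**2)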
def _build(self, state_input):
    action_dim = self._output_dim

    with tf.variable_scope('dist_params'):
        if self._std_share_network:
            # mean and std networks share an MLP
            b = np.concatenate([
                np.zeros(action_dim),
                np.full(action_dim, self._init_std_param)
            ], axis=0)  # yapf: disable
            b = tf.constant_initializer(b)
            mean_std_network = mlp(
                state_input,
                output_dim=action_dim * 2,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                output_nonlinearity=self._output_nonlinearity,
                output_b_init=b,
                name='mean_std_network')
            with tf.variable_scope('mean_network'):
                mean_network = mean_std_network[..., :action_dim]
            with tf.variable_scope('std_network'):
                std_network = mean_std_network[..., action_dim:]
        else:
            # separate MLPs for mean and std networks
            # mean network
            mean_network = mlp(
                state_input,
                output_dim=action_dim,
                hidden_sizes=self._hidden_sizes,
                hidden_nonlinearity=self._hidden_nonlinearity,
                output_nonlinearity=self._output_nonlinearity,
                name='mean_network')

            # std network
            if self._adaptive_std:
                b = tf.constant_initializer(self._init_std_param)
                std_network = mlp(
                    state_input,
                    output_dim=action_dim,
                    hidden_sizes=self._std_hidden_sizes,
                    hidden_nonlinearity=self._std_hidden_nonlinearity,
                    output_nonlinearity=self._std_output_nonlinearity,
                    output_b_init=b,
                    name='std_network')
            else:
                p = tf.constant_initializer(self._init_std_param)
                std_network = parameter(state_input,
                                        length=action_dim,
                                        initializer=p,
                                        trainable=self._learn_std,
                                        name='std_network')

    mean_var = mean_network
    log_std_var = std_network

    with tf.variable_scope('std_parameterization'):
        # build std_var with std parameterization
        if self._std_parameterization == 'exp':
            pass  # network output is already log(std)
        elif self._std_parameterization == 'softplus':
            # convert the softplus std back to log-std so downstream
            # code can keep treating the tensor as log(std)
            softplus_std_var = tf.log(1. + tf.exp(log_std_var))
            log_std_var = tf.log(softplus_std_var)
        else:
            raise NotImplementedError

    with tf.variable_scope('std_limits'):
        # compare against None so a limit of 0. is not silently skipped
        if self._min_std_param is not None:
            log_std_var = tf.maximum(log_std_var, self._min_std_param)
        if self._max_std_param is not None:
            log_std_var = tf.minimum(log_std_var, self._max_std_param)

    distribution = tfp.distributions.MultivariateNormalDiag(
        mean_var, tf.exp(log_std_var))
    action_var = distribution.sample(seed=ext.get_seed())

    return action_var, log_std_var, distribution
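# A sketch (NumPy, not from the source) contrasting the two std
# parameterizations handled above: under 'exp' the network output is
# log(std) directly, while under 'softplus' std = log(1 + exp(output))
# and this version recovers log(std) by taking a final log.
import numpy as np

def log_std_from_param(param, parameterization='exp'):
    if parameterization == 'exp':
        return param                   # output already is log(std)
    std = np.log1p(np.exp(param))      # softplus keeps std positive
    return np.log(std)                 # back to log(std) for sampling

assert np.isclose(np.exp(log_std_from_param(0.0, 'softplus')), np.log(2.0))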