def build_network(self):
    self._build_ph()
    self._tensor = {}

    # Important parameters
    self._ppo_clip = self.args.ppo_clip
    self._kl_eta = self.args.kl_eta
    self._current_kl_lambda = 1
    self._current_lr = self.args.policy_lr
    self._timesteps_so_far = 0

    # construct the input to the forward network; we normalize the state
    # input
    self._tensor['normalized_start_state'] = (
        self._input_ph['start_state'] -
        self._whitening_operator['state_mean']
    ) / self._whitening_operator['state_std']
    self._tensor['net_input'] = self._tensor['normalized_start_state']

    # the mlp for the policy
    network_shape = [self._observation_size] + \
        self.args.policy_network_shape + [self._action_size]
    num_layer = len(network_shape) - 1
    act_type = \
        [self.args.policy_activation_type] * (num_layer - 1) + [None]
    norm_type = \
        [self.args.policy_normalizer_type] * (num_layer - 1) + [None]
    init_data = []
    for _ in range(num_layer):
        init_data.append({
            'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0},
            'b_init_method': 'constant', 'b_init_para': {'val': 0.0}
        })
    init_data[-1]['w_init_para']['stddev'] = 0.01  # the output layer std

    self._MLP = tf_networks.MLP(
        dims=network_shape, scope='policy_mlp', train=True,
        activation_type=act_type, normalizer_type=norm_type,
        init_data=init_data
    )

    # the output policy of the network: the mean of the action
    # distribution, plus a state-independent trainable logstd
    self._tensor['action_dist_mu'] = self._MLP(self._tensor['net_input'])
    self._tensor['action_logstd'] = tf.Variable(
        (0 * self._npr.randn(1, self._action_size)).astype(np.float32),
        name="action_logstd", trainable=True
    )
    # tile the logstd so its size matches [batch, num_action]
    self._tensor['action_dist_logstd'] = tf.tile(
        self._tensor['action_logstd'],
        tf.stack((tf.shape(self._tensor['action_dist_mu'])[0], 1))
    )

    # fetch all the trainable variables
    self._set_var_list()
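
# A minimal numpy sketch (not part of the class above) of the diagonal
# Gaussian policy that `action_dist_mu` and `action_dist_logstd`
# parameterize. `mu` and `logstd` stand in for fetched values of those
# tensors; the function names are hypothetical, for illustration only.
import numpy as np

def sample_action(mu, logstd, npr=np.random):
    # a ~ N(mu, exp(logstd)^2), elementwise per action dimension
    return mu + np.exp(logstd) * npr.randn(*mu.shape)

def gaussian_log_prob(action, mu, logstd):
    # sum of the per-dimension log densities of the diagonal Gaussian
    std = np.exp(logstd)
    return np.sum(
        -0.5 * np.log(2.0 * np.pi) - logstd
        - 0.5 * ((action - mu) / std) ** 2,
        axis=-1
    )
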
def build_network(self):
    # the placeholders
    self._build_ph()
    self._tensor = {}

    # construct the input to the forward network; we normalize the state
    # input and concatenate it with the action
    self._tensor['normalized_start_state'] = (
        self._input_ph['start_state'] -
        self._whitening_operator['state_mean']
    ) / self._whitening_operator['state_std']
    self._tensor['net_input'] = tf.concat(
        [self._tensor['normalized_start_state'], self._input_ph['action']],
        1
    )

    # the per-layer settings of the mlp (given as lists): the activation
    # function, normalization function and initialization of every layer
    network_shape = [self._observation_size + self._action_size] + \
        self.args.dynamics_network_shape + [self._observation_size]
    num_layer = len(network_shape) - 1
    act_type = \
        [self.args.dynamics_activation_type] * (num_layer - 1) + [None]
    norm_type = \
        [self.args.dynamics_normalizer_type] * (num_layer - 1) + [None]
    init_data = []
    for _ in range(num_layer):
        init_data.append({
            'w_init_method': 'xavier', 'w_init_para': {'uniform': False},
            'b_init_method': 'xavier', 'b_init_para': {'uniform': False}
        })

    self._MLP = tf_networks.MLP(
        dims=network_shape, scope='dynamics_mlp', train=True,
        activation_type=act_type, normalizer_type=norm_type,
        init_data=init_data
    )
    self._tensor['net_output'] = self._MLP(self._tensor['net_input'])

    # the network predicts the whitened state difference; un-whiten it and
    # add the start state to get the predicted next state
    self._tensor['pred_output'] = self._tensor['net_output'] * \
        self._whitening_operator['diff_state_std'] + \
        self._whitening_operator['diff_state_mean'] + \
        self._input_ph['start_state']

    # fetch all the trainable variables
    self._set_var_list()
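
# A minimal numpy sketch of the prediction convention above, assuming
# `diff_state_mean`/`diff_state_std` are the running whitening statistics of
# the state differences: the mlp outputs a whitened difference, which is
# un-whitened and added to the start state. The inverse mapping gives the
# regression target for training; both helpers here are illustrative only.
import numpy as np

def predicted_next_state(net_output, start_state,
                         diff_state_mean, diff_state_std):
    # mirrors `pred_output` in the method above
    return net_output * diff_state_std + diff_state_mean + start_state

def training_target(start_state, next_state,
                    diff_state_mean, diff_state_std):
    # the whitened state difference the network should regress to
    return (next_state - start_state - diff_state_mean) / diff_state_std
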
def build_network(self):
    self._build_ph()
    self._tensor = {}

    # normalize the state input
    self._tensor['normalized_start_state'] = (
        self._input_ph['start_state'] -
        self._whitening_operator['state_mean']
    ) / self._whitening_operator['state_std']
    self._tensor['net_input'] = self._tensor['normalized_start_state']

    # the mlp for the discriminator
    network_shape = [self._observation_size] + \
        self.args.reward_network_shape + [1]
    num_layer = len(network_shape) - 1
    act_type = \
        [self.args.reward_activation_type] * (num_layer - 1) + [None]
    norm_type = \
        [self.args.reward_normalizer_type] * (num_layer - 1) + [None]
    init_data = []
    for _ in range(num_layer):
        init_data.append(
            {'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0},
             'b_init_method': 'constant', 'b_init_para': {'val': 0.0}}
        )
    # init_data[-1]['w_init_para']['stddev'] = 0.01  # the output layer std

    self._MLP = tf_networks.MLP(
        dims=network_shape, scope='discriminator_mlp', train=True,
        activation_type=act_type, normalizer_type=norm_type,
        init_data=init_data
    )

    self._tensor['logits'] = self._MLP(self._tensor['net_input'])
    # discriminator_output is sigmoid(logits)
    self._tensor['discriminator_output'] = \
        tf.nn.sigmoid(self._tensor['logits'])
    self._tensor['logOfD'] = \
        tf.log(self._tensor['discriminator_output'] + 1e-8)
    self._tensor['logOf1minusD'] = \
        tf.log(1 - self._tensor['discriminator_output'] + 1e-8)

    # the GAN-style reward -log(1 - D), clipped from above
    self._tensor['reward_output'] = tf.minimum(
        -self._tensor['logOf1minusD'], self.args.GAN_reward_clip_value
    )
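
# A minimal numpy sketch of the reward computed above: the discriminator
# probability D = sigmoid(logits), and the clipped GAN-style reward
# -log(1 - D). `clip_value` stands in for `args.GAN_reward_clip_value`;
# the function name is illustrative only.
import numpy as np

def gan_reward(logits, clip_value, eps=1e-8):
    d = 1.0 / (1.0 + np.exp(-logits))      # discriminator output D(s)
    reward = -np.log(1.0 - d + eps)        # large when D(s) is close to 1
    return np.minimum(reward, clip_value)  # clip to keep rewards bounded
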
def _build_value_network_and_loss(self):
    """ @brief: build the value network and the graph to update its loss

        @NOTE: it is different from my ppo repo... (I used 0.01 as stddev)
    """
    # the placeholder for the value-function training targets
    self._input_ph['value_target'] = \
        tf.placeholder(tf.float32, [None, 1], name='value_target')

    # build the baseline value function
    network_shape = [self._observation_size] + \
        self.args.value_network_shape + [1]
    num_layer = len(network_shape) - 1
    act_type = \
        [self.args.value_activation_type] * (num_layer - 1) + [None]
    norm_type = \
        [self.args.value_normalizer_type] * (num_layer - 1) + [None]
    init_data = []
    for _ in range(num_layer):
        init_data.append(
            {'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0},
             'b_init_method': 'constant', 'b_init_para': {'val': 0.0}}
        )
    self._baseline_MLP = tf_networks.MLP(
        dims=network_shape, scope='value_mlp', train=True,
        activation_type=act_type, normalizer_type=norm_type,
        init_data=init_data
    )

    self._tensor['pred_value'] = \
        self._baseline_MLP(self._tensor['net_input'])

    # the mean-squared-error loss of the value network and its update op
    self._update_operator['vf_loss'] = tf.reduce_mean(
        tf.square(self._tensor['pred_value'] -
                  self._input_ph['value_target']),
        name='vf_loss'
    )
    self._update_operator['vf_update_op'] = tf.train.AdamOptimizer(
        learning_rate=self.args.value_lr,
        beta1=0.5, beta2=0.99, epsilon=1e-4
    ).minimize(self._update_operator['vf_loss'])
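
# The targets fed into `value_target` are computed outside this method; a
# common choice (an assumption here, not shown in this source) is the
# discounted return along each trajectory. A minimal numpy sketch:
import numpy as np

def discounted_returns(rewards, gamma=0.99):
    # returns[t] = rewards[t] + gamma * returns[t + 1]
    returns = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns.reshape(-1, 1)  # matches the [None, 1] placeholder shape
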
def build_network(self):
    # the placeholders
    self._build_ph()
    self._tensor = {}

    # construct the input to the forward network; we normalize the state
    # input
    self._tensor['normalized_start_state'] = (
        self._input_ph['start_state'] -
        self._whitening_operator['state_mean']
    ) / self._whitening_operator['state_std']
    self._tensor['net_input'] = self._tensor['normalized_start_state']

    # the mlp for the policy
    network_shape = [self._observation_size] + \
        self.args.policy_network_shape + [self._action_size]
    num_layer = len(network_shape) - 1
    act_type = \
        [self.args.policy_activation_type] * (num_layer - 1) + [None]
    norm_type = \
        [self.args.policy_normalizer_type] * (num_layer - 1) + [None]
    init_data = []
    for _ in range(num_layer):
        init_data.append({
            'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0},
            'b_init_method': 'constant', 'b_init_para': {'val': 0.0}
        })
    init_data[-1]['w_init_para']['stddev'] = 0.01  # the output layer std

    self._MLP = tf_networks.MLP(
        dims=network_shape, scope='policy_mlp', train=True,
        activation_type=act_type, normalizer_type=norm_type,
        init_data=init_data
    )
    self._tensor['action_dist_mu'] = self._MLP(self._tensor['net_input'])

    # fetch all the trainable variables
    self._set_var_list()

    # the gmm approximation, fit on concatenated
    # [state, action, next_state] vectors
    self._gmm = GaussianMixture(
        n_components=self.args.gmm_num_cluster, covariance_type='full',
        max_iter=self.args.gmm_max_iteration, random_state=self.args.seed,
        warm_start=True
    )
    self._gmm_weights = {'mean': None, 'cov': None}
    self._gmm_vec_size = self._observation_size * 2 + self._action_size

    # the normal-inverse-Wishart prior on the gmm clusters
    self._NIW_prior = {
        'm': self.args.gmm_prior_strength,
        'n0': (self.args.gmm_batch_size - 2.0 - self._gmm_vec_size) /
              self.args.gmm_batch_size * self.args.gmm_prior_strength
    }
    self._policy_cov_data = {
        'flat_cov_L': np.ones([self._action_size]),
        'sig': np.eye(self._action_size),
    }
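
# A minimal sklearn sketch of fitting a gmm configured as above: each row is
# the concatenation [state, action, next_state], matching
# `_gmm_vec_size = observation_size * 2 + action_size`. The sizes, random
# data, and hyperparameter values below are illustrative assumptions.
import numpy as np
from sklearn.mixture import GaussianMixture

obs_size, act_size, batch = 4, 2, 256
data = np.concatenate(
    [np.random.randn(batch, obs_size),   # states
     np.random.randn(batch, act_size),   # actions
     np.random.randn(batch, obs_size)],  # next states
    axis=1
)  # shape [batch, obs_size * 2 + act_size]

gmm = GaussianMixture(n_components=3, covariance_type='full',
                      max_iter=100, random_state=0, warm_start=True)
gmm.fit(data)
print(gmm.means_.shape)  # (3, 10): one mean vector per cluster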