Example #1
    def build_network(self):
        self._build_ph()

        self._tensor = {}

        # Important parameters
        self._ppo_clip = self.args.ppo_clip
        self._kl_eta = self.args.kl_eta
        self._current_kl_lambda = 1
        self._current_lr = self.args.policy_lr
        self._timesteps_so_far = 0

        # construct the input to the forward network: normalize the state
        # input (no action is concatenated for the policy network)
        self._tensor['normalized_start_state'] = (
            self._input_ph['start_state'] -
            self._whitening_operator['state_mean']
        ) / self._whitening_operator['state_std']
        self._tensor['net_input'] = self._tensor['normalized_start_state']
        # the mlp for policy
        network_shape = [self._observation_size] + \
            self.args.policy_network_shape + [self._action_size]
        num_layer = len(network_shape) - 1
        act_type = \
            [self.args.policy_activation_type] * (num_layer - 1) + [None]
        norm_type = \
            [self.args.policy_normalizer_type] * (num_layer - 1) + [None]
        init_data = []
        for _ in range(num_layer):
            init_data.append({
                'w_init_method': 'normc',
                'w_init_para': {
                    'stddev': 1.0
                },
                'b_init_method': 'constant',
                'b_init_para': {
                    'val': 0.0
                }
            })
        init_data[-1]['w_init_para']['stddev'] = 0.01  # the output layer std
        self._MLP = tf_networks.MLP(dims=network_shape,
                                    scope='policy_mlp',
                                    train=True,
                                    activation_type=act_type,
                                    normalizer_type=norm_type,
                                    init_data=init_data)
        # the output of the policy network: the mean of the action distribution
        self._tensor['action_dist_mu'] = self._MLP(self._tensor['net_input'])
        # the log-std is a separate trainable variable, initialized to zero
        self._tensor['action_logstd'] = tf.Variable(
            (0 * self._npr.randn(1, self._action_size)).astype(np.float32),
            name="action_logstd",
            trainable=True)
        self._tensor['action_dist_logstd'] = tf.tile(
            self._tensor['action_logstd'],
            tf.stack(
                (tf.shape(self._tensor['action_dist_mu'])[0],
                 1)))  # make sure the size is matched to [batch, num_action]
        # fetch all the trainable variables
        self._set_var_list()
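The policy head above outputs a state-dependent mean (`action_dist_mu`) and a state-independent, trainable log-std (`action_dist_logstd`). Below is a minimal NumPy sketch of how such a diagonal Gaussian is typically sampled and scored; the helper names and shapes are illustrative only and are not taken from the class above.

import numpy as np

def sample_action(mu, logstd, rng):
    # draw action = mu + std * eps with eps ~ N(0, I), std = exp(logstd)
    return mu + np.exp(logstd) * rng.standard_normal(mu.shape)

def gaussian_log_prob(action, mu, logstd):
    # log-density of a diagonal Gaussian, summed over action dimensions
    std = np.exp(logstd)
    return np.sum(
        -0.5 * np.log(2.0 * np.pi) - logstd
        - 0.5 * ((action - mu) / std) ** 2,
        axis=-1)

rng = np.random.default_rng(0)
mu = np.zeros((4, 6), dtype=np.float32)       # stands in for action_dist_mu
logstd = np.zeros((4, 6), dtype=np.float32)   # stands in for action_dist_logstd
actions = sample_action(mu, logstd, rng)
print(gaussian_log_prob(actions, mu, logstd))  # shape: (4,)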
Example #2
    def build_network(self):
        # the placeholders
        self._build_ph()

        self._tensor = {}

        # construct the input to the forward network: normalize the state
        # input and concatenate it with the action
        self._tensor['normalized_start_state'] = (
            self._input_ph['start_state'] -
            self._whitening_operator['state_mean']
        ) / self._whitening_operator['state_std']

        self._tensor['net_input'] = tf.concat(
            [self._tensor['normalized_start_state'], self._input_ph['action']],
            1)

        # per-layer settings of the mlp network (given as lists): the
        # activation function, normalization function and initialization
        network_shape = [self._observation_size + self._action_size] + \
            self.args.dynamics_network_shape + [self._observation_size]
        num_layer = len(network_shape) - 1
        act_type = \
            [self.args.dynamics_activation_type] * (num_layer - 1) + [None]
        norm_type = \
            [self.args.dynamics_normalizer_type] * (num_layer - 1) + [None]
        init_data = []
        for _ in range(num_layer):
            init_data.append({
                'w_init_method': 'xavier',
                'w_init_para': {
                    'uniform': False
                },
                'b_init_method': 'xavier',
                'b_init_para': {
                    'uniform': False
                }
            })

        self._MLP = tf_networks.MLP(dims=network_shape,
                                    scope='dynamics_mlp',
                                    train=True,
                                    activation_type=act_type,
                                    normalizer_type=norm_type,
                                    init_data=init_data)

        self._tensor['net_output'] = self._MLP(self._tensor['net_input'])

        # get the predicted next state: un-whiten the predicted state
        # difference and add it to the start state
        self._tensor['pred_output'] = self._tensor['net_output'] * \
            self._whitening_operator['diff_state_std'] + \
            self._whitening_operator['diff_state_mean'] + \
            self._input_ph['start_state']

        # fetch all the trainable variables
        self._set_var_list()
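The dynamics network above predicts the whitened difference between the next state and the start state, and `pred_output` inverts that whitening. Below is a small NumPy sketch of the same convention, assuming the regression target is the whitened state difference; the function and variable names are illustrative.

import numpy as np

def dynamics_target(start_state, next_state, diff_mean, diff_std):
    # the regression target in the network's whitened output space
    return (next_state - start_state - diff_mean) / diff_std

def pred_next_state(net_output, start_state, diff_mean, diff_std):
    # invert the whitening of the predicted difference and add the start state
    return net_output * diff_std + diff_mean + start_state

obs_size = 11
start = np.random.randn(32, obs_size)
nxt = start + 0.1 * np.random.randn(32, obs_size)
mean, std = np.zeros(obs_size), np.ones(obs_size)
target = dynamics_target(start, nxt, mean, std)
assert np.allclose(pred_next_state(target, start, mean, std), nxt)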
Example #3
    def build_network(self):
        self._build_ph()
        self._tensor = {}

        self._tensor['normalized_start_state'] = (
            self._input_ph['start_state'] -
            self._whitening_operator['state_mean']
        ) / self._whitening_operator['state_std']
        self._tensor['net_input'] = self._tensor['normalized_start_state']

        # the mlp for the reward discriminator
        network_shape = [self._observation_size] + \
            self.args.reward_network_shape + [1]
        num_layer = len(network_shape) - 1
        act_type = \
            [self.args.reward_activation_type] * (num_layer - 1) + [None]
        norm_type = \
            [self.args.reward_normalizer_type] * (num_layer - 1) + [None]
        init_data = []
        for _ in range(num_layer):
            init_data.append(
                {'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0},
                 'b_init_method': 'constant', 'b_init_para': {'val': 0.0}}
            )
        # init_data[-1]['w_init_para']['stddev'] = 0.01  # the output layer std
        self._MLP = tf_networks.MLP(
            dims=network_shape, scope='discriminator_mlp', train=True,
            activation_type=act_type, normalizer_type=norm_type,
            init_data=init_data
        )

        self._tensor['logits'] = self._MLP(self._tensor['net_input'])
        self._tensor['discriminator_output'] = \
            tf.nn.sigmoid(self._tensor['logits'])
        # discriminator_output is sigmoid(logits); the 1e-8 terms below are
        # added for numerical stability
        self._tensor['logOfD'] = \
            tf.log(self._tensor['discriminator_output'] + 1e-8)
        self._tensor['logOf1minusD'] = \
            tf.log(1 - self._tensor['discriminator_output'] + 1e-8)

        # GAIL-style reward: -log(1 - D), clipped from above
        self._tensor['reward_output'] = tf.minimum(
            -self._tensor['logOf1minusD'], self.args.GAN_reward_clip_value
        )
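The discriminator above yields a GAIL-style reward, -log(1 - D), clipped from above. Below is a minimal NumPy sketch of these quantities, under the assumption that the discriminator is trained to output 1 on expert data and 0 on policy data; the function names are illustrative only.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gan_reward(logits, clip_value):
    # reward = min(-log(1 - D), clip_value), matching reward_output above
    d = sigmoid(logits)
    return np.minimum(-np.log(1.0 - d + 1e-8), clip_value)

def discriminator_loss(expert_logits, policy_logits):
    # cross-entropy loss: push D(expert) towards 1 and D(policy) towards 0
    d_expert = sigmoid(expert_logits)
    d_policy = sigmoid(policy_logits)
    return (-np.mean(np.log(d_expert + 1e-8))
            - np.mean(np.log(1.0 - d_policy + 1e-8)))

logits = np.array([-2.0, 0.0, 2.0])
print(gan_reward(logits, clip_value=5.0))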
Example #4
    def _build_value_network_and_loss(self):
        """ @brief:
                build the value (baseline) network and the graph used to
                update its loss
            @NOTE: it is different from my ppo repo... (I used 0.01 as stddev)
        """
        # build the placeholder for training the value function
        self._input_ph['value_target'] = \
            tf.placeholder(tf.float32, [None, 1], name='value_target')
        # build the baseline value function
        network_shape = [self._observation_size] + \
            self.args.value_network_shape + [1]
        num_layer = len(network_shape) - 1
        act_type = \
            [self.args.value_activation_type] * (num_layer - 1) + [None]
        norm_type = \
            [self.args.value_normalizer_type] * (num_layer - 1) + [None]
        init_data = []
        for _ in range(num_layer):
            init_data.append(
                {'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0},
                 'b_init_method': 'constant', 'b_init_para': {'val': 0.0}}
            )
        self._baseline_MLP = tf_networks.MLP(
            dims=network_shape, scope='value_mlp', train=True,
            activation_type=act_type, normalizer_type=norm_type,
            init_data=init_data
        )
        self._tensor['pred_value'] = \
            self._baseline_MLP(self._tensor['net_input'])
        # build the loss for the value network
        self._update_operator['vf_loss'] = tf.reduce_mean(
            tf.square(self._tensor['pred_value'] -
                      self._input_ph['value_target']), name='vf_loss'
        )
        self._update_operator['vf_update_op'] = tf.train.AdamOptimizer(
            learning_rate=self.args.value_lr,
            beta1=0.5, beta2=0.99, epsilon=1e-4
        ).minimize(self._update_operator['vf_loss'])
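The `vf_loss` above regresses the predicted value onto the `value_target` placeholder. One common choice for that target is the discounted return of each rollout; below is a minimal NumPy sketch under that assumption (the function name and gamma value are illustrative, not taken from the class).

import numpy as np

def discounted_returns(rewards, gamma=0.99):
    # compute the discounted return for every timestep of a single rollout
    returns = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

print(discounted_returns(np.array([1.0, 1.0, 1.0])))  # [2.9701, 1.99, 1.0]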
Example #5
    def build_network(self):
        # the placeholders
        self._build_ph()

        self._tensor = {}

        # construct the input to the forward network: normalize the state
        # input (no action is concatenated for the policy network)
        self._tensor['normalized_start_state'] = (
            self._input_ph['start_state'] -
            self._whitening_operator['state_mean']
        ) / self._whitening_operator['state_std']

        self._tensor['net_input'] = self._tensor['normalized_start_state']

        # the mlp for policy
        network_shape = [self._observation_size] + \
            self.args.policy_network_shape + [self._action_size]
        num_layer = len(network_shape) - 1
        act_type = \
            [self.args.policy_activation_type] * (num_layer - 1) + [None]
        norm_type = \
            [self.args.policy_normalizer_type] * (num_layer - 1) + [None]
        init_data = []
        for _ in range(num_layer):
            init_data.append({
                'w_init_method': 'normc',
                'w_init_para': {
                    'stddev': 1.0
                },
                'b_init_method': 'constant',
                'b_init_para': {
                    'val': 0.0
                }
            })
        init_data[-1]['w_init_para']['stddev'] = 0.01  # the output layer std

        self._MLP = tf_networks.MLP(dims=network_shape,
                                    scope='policy_mlp',
                                    train=True,
                                    activation_type=act_type,
                                    normalizer_type=norm_type,
                                    init_data=init_data)
        self._tensor['action_dist_mu'] = self._MLP(self._tensor['net_input'])

        # fetch all the trainable variables
        self._set_var_list()

        # the gmm approximation
        self._gmm = GaussianMixture(n_components=self.args.gmm_num_cluster,
                                    covariance_type='full',
                                    max_iter=self.args.gmm_max_iteration,
                                    random_state=self.args.seed,
                                    warm_start=True)
        self._gmm_weights = {'mean': None, 'cov': None}
        self._gmm_vec_size = self._observation_size * 2 + self._action_size

        self._NIW_prior = {
            'm': self.args.gmm_prior_strength,
            'n0': (self.args.gmm_batch_size - 2.0 - self._gmm_vec_size) /
                self.args.gmm_batch_size * self.args.gmm_prior_strength
        }
        self._policy_cov_data = {
            'flat_cov_L': np.ones([self._action_size]),
            'sig': np.eye(self._action_size),
        }
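The GaussianMixture above is set up to model joint transition vectors of size `2 * observation_size + action_size`. Below is a minimal sketch of the assumed fitting workflow on concatenated [state, action, next_state] samples; the shapes, sizes, and random data are illustrative, and the NIW prior correction is omitted.

import numpy as np
from sklearn.mixture import GaussianMixture

obs_size, act_size, batch = 11, 3, 256
data = np.concatenate(
    [np.random.randn(batch, obs_size),    # state
     np.random.randn(batch, act_size),    # action
     np.random.randn(batch, obs_size)],   # next state
    axis=1)                               # shape: [batch, 2 * obs + act]

gmm = GaussianMixture(n_components=5, covariance_type='full',
                      max_iter=100, warm_start=True)
gmm.fit(data)
print(gmm.means_.shape, gmm.covariances_.shape)  # (5, 25) (5, 25, 25)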