Example #1
    def _create_actor_network(self, joints_input, is_online, reuse_flag):
        name_prefix = '{}_actor_{}'.format(self.name_prefix,
                                           'online' if is_online else 'target')
        activation = get_activation(
            self.config['action_predictor']['activation'])
        layers = self.config['action_predictor']['layers'] + [
            self.number_of_joints
        ]
        current = self._generate_policy_features(joints_input, name_prefix,
                                                 reuse_flag)
        for i, layer_size in enumerate(layers[:-1]):
            current = tf.layers.dense(current,
                                      layer_size,
                                      activation=activation,
                                      name='{}_{}'.format(name_prefix, i),
                                      reuse=reuse_flag)
        tanh_preactivation = tf.layers.dense(
            current,
            layers[-1],
            activation=None,
            name='{}_tanh'.format(name_prefix),
            reuse=reuse_flag)
        action = tf.nn.l2_normalize(tf.nn.tanh(tanh_preactivation), 1)

        return action, tanh_preactivation
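
The actor head above squashes the pre-activation through tanh and then l2-normalizes along axis 1, so every emitted action is a unit-length direction in joint space. Below is a minimal, standalone sketch of just that output head, assuming TensorFlow 1.x; the feature width, joint count, and layer names are illustrative and not taken from the original repository.

import numpy as np
import tensorflow as tf

number_of_joints = 4   # hypothetical joint count
feature_size = 64      # hypothetical policy-feature width

features = tf.placeholder(tf.float32, shape=(None, feature_size), name='policy_features')
tanh_preactivation = tf.layers.dense(features, number_of_joints,
                                     activation=None, name='actor_tanh')
# tanh bounds each component to (-1, 1); l2_normalize then rescales the whole
# vector to unit length, so the action encodes only a direction in joint space
action = tf.nn.l2_normalize(tf.nn.tanh(tanh_preactivation), 1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    out = sess.run(action, {features: np.random.randn(2, feature_size)})
    print(np.linalg.norm(out, axis=1))  # ~[1. 1.]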
Example #2
    def _create_actor_network(self, joints_input, is_online, reuse_flag):
        name_prefix = "{}_actor_{}".format(self.name_prefix,
                                           "online" if is_online else "target")
        activation = get_activation(
            self.config["action_predictor"]["activation"])
        layers = self.config["action_predictor"]["layers"] + [
            self.number_of_joints
        ]
        current = self._generate_policy_features(joints_input, name_prefix,
                                                 reuse_flag)
        for i, layer_size in enumerate(layers[:-1]):
            current = tf.layers.dense(
                current,
                layer_size,
                activation=activation,
                name="{}_{}".format(name_prefix, i),
                reuse=reuse_flag,
            )
        tanh_preactivation = tf.layers.dense(
            current,
            layers[-1],
            activation=None,
            name="{}_tanh".format(name_prefix),
            reuse=reuse_flag,
        )
        action = tf.nn.l2_normalize(tf.nn.tanh(tanh_preactivation), 1)

        return action, tanh_preactivation
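
Both actor examples build every layer under a role-specific name_prefix and take a reuse flag, so the online and target copies live in separate variable scopes while repeated calls with the same prefix rebind the existing weights instead of creating new ones. A minimal sketch of that tf.layers name/reuse behaviour, assuming TensorFlow 1.x and illustrative names:

import tensorflow as tf

x = tf.placeholder(tf.float32, shape=(None, 8))

# first call creates 'shared_dense/kernel' and 'shared_dense/bias'
h_first = tf.layers.dense(x, 16, activation=tf.nn.relu, name='shared_dense', reuse=False)
# second call with reuse=True binds to the same two variables instead of creating new ones
h_second = tf.layers.dense(x, 16, activation=tf.nn.relu, name='shared_dense', reuse=True)

print(len(tf.trainable_variables()))  # 2: one kernel and one bias shared by both calls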
Example #3
    def create_reward_network(
            self, joints_inputs, action_inputs, goal_joints_inputs, goal_pose_inputs, images_3d):
        name_prefix = 'reward'
        # get the next joints
        clipped_next_joints, unclipped_next_joints = self._next_state_model(joints_inputs, action_inputs)

        # predict the transition classification
        layers = self.config['reward']['layers'] + [3]
        scale = 0.0
        if 'l2_regularization_coefficient' in self.config['reward']:
            scale = self.config['reward']['l2_regularization_coefficient']
        current = tf.concat(
            (clipped_next_joints, self._generate_goal_features(goal_joints_inputs, goal_pose_inputs)), axis=1)
        # add vision if needed
        if self.is_vision_enabled:
            visual_inputs = DqnModel(name_prefix).predict(images_3d, self._reuse_flag)
            current = tf.concat((current, visual_inputs), axis=1)
        for i, layer_size in enumerate(layers):
            _activation = None if i == len(layers) - 1 else get_activation(self.config['reward']['activation'])
            current = tf.layers.dense(
                current, layer_size, activation=_activation, name='{}_layers_{}'.format(name_prefix, i),
                kernel_regularizer=tf_layers.l2_regularizer(scale), reuse=self._reuse_flag
            )
        softmax_logits = current
        softmax_res = tf.nn.softmax(softmax_logits)

        # if a one-hot label is fed, is_labeled will be 1.0; otherwise it will be 0.0
        is_labeled = tf.expand_dims(tf.reduce_max(self.transition_label, axis=1), axis=1)
        reward_calculation_input = self.transition_label + tf.multiply(1.0 - is_labeled, softmax_res)

        # get the classification reward
        classification_reward = tf.layers.dense(
            reward_calculation_input, 1, activation=None, use_bias=False,
            name='{}_classification_reward'.format(name_prefix), reuse=self._reuse_flag
            )

        # get the clipping-related reward
        # clipped_difference = tf.expand_dims(tf.norm(unclipped_next_joints - clipped_next_joints, axis=1), axis=1)  # this is the original
        # clipped_difference = tf.expand_dims(tf.reduce_sum(tf.zeros_like(clipped_next_joints), axis=1), axis=1)  # this variant passes no gradient back
        clipped_difference = tf.expand_dims(tf.reduce_sum(tf.abs(unclipped_next_joints - clipped_next_joints), axis=1), axis=1)

        clipping_reward = tf.layers.dense(
            clipped_difference, 1, activation=None, use_bias=False, name='{}_clipping_weight'.format(name_prefix),
            reuse=self._reuse_flag
        )

        total_reward = classification_reward + clipping_reward
        self._reuse_flag = True
        return total_reward, softmax_logits
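
The is_labeled construction above relies on transition_label being either a one-hot row (maximum 1.0) or an all-zero row (maximum 0.0): labeled transitions keep their ground-truth class, unlabeled ones fall back to the softmax prediction. A small sketch of just that mixing step, assuming TensorFlow 1.x and made-up values:

import tensorflow as tf

transition_label = tf.constant([[0.0, 1.0, 0.0],    # labeled transition (one-hot)
                                [0.0, 0.0, 0.0]])   # unlabeled transition (all zeros)
softmax_res = tf.constant([[0.2, 0.5, 0.3],
                           [0.1, 0.7, 0.2]])

# 1.0 for labeled rows, 0.0 for unlabeled rows
is_labeled = tf.expand_dims(tf.reduce_max(transition_label, axis=1), axis=1)
# labeled rows keep the one-hot label, unlabeled rows use the softmax prediction
reward_calculation_input = transition_label + tf.multiply(1.0 - is_labeled, softmax_res)

with tf.Session() as sess:
    print(sess.run(reward_calculation_input))
    # [[0.  1.  0. ]
    #  [0.1 0.7 0.2]]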
Example #4
    def _create_critic_network(self, joints_input, action_input, is_online,
                               reuse_flag, add_regularization_loss):
        name_prefix = '{}_critic_{}'.format(
            self.name_prefix, 'online' if is_online else 'target')
        layers_before_action = self.config['critic']['layers_before_action']
        layers_after_action = self.config['critic']['layers_after_action']
        activation = get_activation(self.config['critic']['activation'])

        current = self._generate_policy_features(joints_input, name_prefix,
                                                 reuse_flag)
        scale = self.config['critic'][
            'l2_regularization_coefficient'] if add_regularization_loss else 0.0
        for i, layer_size in enumerate(layers_before_action):
            current = tf.layers.dense(
                current,
                layer_size,
                activation=activation,
                name='{}_before_action_{}'.format(name_prefix, i),
                reuse=reuse_flag,
                kernel_regularizer=layers.l2_regularizer(scale))
        # concatenate the action with the joint features before the remaining layers
        current = tf.concat((current, action_input), axis=1)
        for i, layer_size in enumerate(layers_after_action):
            _activation = None if i == len(
                layers_after_action) - 1 else activation
            current = tf.layers.dense(
                current,
                layer_size,
                activation=_activation,
                name='{}_after_action_{}'.format(name_prefix, i),
                reuse=reuse_flag,
                kernel_regularizer=layers.l2_regularizer(scale))

        if self.config['critic']['last_layer_tanh']:
            q_val = tf.layers.dense(
                current,
                1,
                activation=tf.nn.tanh,
                name='{}_tanh_layer'.format(name_prefix),
                reuse=reuse_flag,
                kernel_regularizer=layers.l2_regularizer(scale))
            # a bias-free dense layer over a tensor of ones is a single trainable
            # scalar; the tf.abs activation keeps the learned stretch factor positive
            q_val_with_stretch = tf.layers.dense(
                tf.ones_like(q_val),
                1,
                activation=tf.abs,
                use_bias=False,
                name='{}_stretch'.format(name_prefix),
                reuse=reuse_flag,
                kernel_regularizer=layers.l2_regularizer(scale)) * q_val
            return q_val_with_stretch
            # gamma = self.config['model']['gamma']
            # stretch = 1.0 / (1.0 - gamma)
            # q_val_with_stretch = stretch * q_val
        else:
            q_val = tf.layers.dense(
                current,
                1,
                activation=None,
                name='{}_linear_layer'.format(name_prefix),
                reuse=reuse_flag,
                kernel_regularizer=layers.l2_regularizer(scale))
            return q_val
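
With last_layer_tanh enabled, the critic above bounds its output in (-1, 1) and multiplies it by a learned positive scale: applying a bias-free dense layer with a tf.abs activation to tf.ones_like(q_val) creates exactly one trainable scalar, which plays the role of the fixed 1 / (1 - gamma) stretch shown in the commented-out lines. A minimal sketch of that output head in isolation, assuming TensorFlow 1.x and illustrative names:

import tensorflow as tf

features = tf.placeholder(tf.float32, shape=(None, 32), name='critic_features')

# bounded q-value in (-1, 1)
q_val = tf.layers.dense(features, 1, activation=tf.nn.tanh, name='q_tanh')
# one trainable scalar, kept positive by the tf.abs activation
stretch = tf.layers.dense(tf.ones_like(q_val), 1, activation=tf.abs,
                          use_bias=False, name='q_stretch')
q_val_with_stretch = stretch * q_val

# the stretch layer holds a single [1, 1] kernel
print([v.shape.as_list() for v in tf.trainable_variables(scope='q_stretch')])  # [[1, 1]]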