    def _build_model(self, data, proj_multiple=2):
        model = tf.keras.models.Sequential()
        model.add(
            TNExpandCondense(proj_multiplier=proj_multiple,
                             use_bias=True,
                             activation='relu',
                             input_shape=(data.shape[-1], )))
        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
        return model
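
    def _example_compile_and_fit(self, input_dim=768, proj_multiple=2):
        # Illustrative helper (not part of the original test file): the model
        # returned by _build_model is an ordinary Keras model, so it can be
        # compiled and trained directly. The input width, labels, and training
        # settings here are assumed values; the parameter-count test below
        # suggests the input width should be a multiple of 128.
        data = np.random.uniform(size=(100, input_dim)).astype('float32')
        labels = np.random.randint(2, size=(100, 1))
        model = self._build_model(data, proj_multiple)
        model.compile(optimizer='adam', loss='binary_crossentropy')
        model.fit(data, labels, epochs=1, batch_size=10, verbose=0)
        return model
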
    def test_config(self, input_dim, proj_multiple):
        data = np.random.randint(10, size=(100, input_dim))
        model = self._build_model(data, proj_multiple)

        expected_num_parameters = model.layers[0].count_params()

        # Serialize the model and use the layer's config to create a new layer.
        # Index 0 of the Sequential config's 'layers' list is the InputLayer
        # added for `input_shape`, so index 1 is the TNExpandCondense layer.
        model_config = model.get_config()
        layer_config = model_config['layers'][1]['config']

        new_model = TNExpandCondense.from_config(layer_config)

        # Build the layer so we can count params below
        new_model.build(layer_config['batch_input_shape'])

        # Check that original layer had same num params as layer built from config
        self.assertEqual(expected_num_parameters, new_model.count_params())
    def test_expandcondense_num_parameters(self, input_dim, proj_multiple):
        data = np.random.randint(10, size=(100, input_dim))
        proj_size = proj_multiple * data.shape[-1]
        model = tf.keras.models.Sequential()
        model.add(
            TNExpandCondense(proj_multiplier=proj_multiple,
                             use_bias=True,
                             activation='relu',
                             input_shape=(data.shape[-1], )))

        w1_params = data.shape[-1]**2
        w2_params = 128 * 128 * (proj_size // data.shape[-1])
        w3_params = 128 * 128 * (proj_size // data.shape[-1])
        w4_params = (data.shape[-1] // 128) * 128 * data.shape[-1]
        bias_params = ((data.shape[-1] // 128) * 128 *
                       (proj_size // data.shape[-1]))

        expected_num_parameters = (w1_params + w2_params + w3_params +
                                   w4_params) + bias_params

        self.assertEqual(expected_num_parameters, model.count_params())
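
    # Worked example (illustrative, not part of the original test): with an
    # assumed input_dim of 768 and proj_multiple of 2, proj_size is 1536 and
    # the formula above evaluates to
    #   w1_params   = 768**2                     = 589824
    #   w2_params   = 128 * 128 * (1536 // 768)  =  32768
    #   w3_params   = 128 * 128 * (1536 // 768)  =  32768
    #   w4_params   = (768 // 128) * 128 * 768   = 589824
    #   bias_params = (768 // 128) * 128 * 2     =   1536
    # for a total of 1246720 parameters, versus roughly 2.36 million for a
    # comparable pair of Dense layers (768 -> 1536 -> 768).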
Example #4
    def build(self, input_shape):
        input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape
        input_tensor_shape = tf.TensorShape(input_tensor)
        if len(input_tensor_shape.as_list()) != 3:
            raise ValueError(
                "TNTransformerExpandCondense expects a three-dimensional input of "
                "shape [batch, sequence, width].")
        batch_size, sequence_length, hidden_size = input_tensor_shape

        if len(input_shape) == 2:
            mask_tensor_shape = tf.TensorShape(input_shape[1])
            expected_mask_tensor_shape = tf.TensorShape(
                [batch_size, sequence_length, sequence_length])
            if not expected_mask_tensor_shape.is_compatible_with(
                    mask_tensor_shape):
                raise ValueError(
                    "When passing a mask tensor to TNTransformerExpandCondense, the "
                    "mask tensor must be of shape [batch, "
                    "sequence_length, sequence_length] (here %s). Got a "
                    "mask tensor of shape %s." %
                    (expected_mask_tensor_shape, mask_tensor_shape))
        if hidden_size % self._num_heads != 0:
            raise ValueError(
                "The input size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (hidden_size, self._num_heads))
        self._attention_head_size = int(hidden_size // self._num_heads)
        common_kwargs = dict(kernel_regularizer=self._kernel_regularizer,
                             bias_regularizer=self._bias_regularizer,
                             activity_regularizer=self._activity_regularizer,
                             kernel_constraint=self._kernel_constraint,
                             bias_constraint=self._bias_constraint)
        self._attention_layer = tf.keras.layers.MultiHeadAttention(
            num_heads=self._num_heads,
            key_dim=self._attention_head_size,
            dropout=self._attention_dropout_rate,
            use_bias=self._use_bias,
            kernel_initializer=self._attention_initializer,
            bias_initializer=tf_utils.clone_initializer(
                self._bias_initializer),
            name="self_attention",
            **common_kwargs)
        self._attention_dropout = tf.keras.layers.Dropout(
            rate=self._dropout_rate)
        # Use float32 in layernorm for numeric stability.
        # It is probably safe in mixed_float16, but we haven't validated this yet.
        self._attention_layer_norm = (tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm",
            axis=-1,
            epsilon=self._norm_epsilon,
            dtype=tf.float32))

        # Substitute the feed-forward Dense layers with a single
        # Expand-Condense layer.
        self._output_dense = TNExpandCondense(
            4,
            use_bias=True,
            activation=self._intermediate_activation,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer)

        self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
        # Use float32 in layernorm for numeric stability.
        self._output_layer_norm = tf.keras.layers.LayerNormalization(
            name="output_layer_norm",
            axis=-1,
            epsilon=self._norm_epsilon,
            dtype=tf.float32)

        super(TNTransformerExpandCondense, self).build(input_shape)
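
    # Usage sketch (illustrative, not part of the original source): the shape
    # checks in build() mean the layer can be called either on a single
    # [batch, sequence, width] tensor or on a (tensor, mask) pair whose mask
    # has shape [batch, sequence, sequence]. The constructor argument names and
    # values below are assumed for illustration.
    #
    #   layer = TNTransformerExpandCondense(num_attention_heads=12,
    #                                       intermediate_activation='gelu')
    #   data = tf.ones((2, 128, 768))
    #   mask = tf.ones((2, 128, 128))
    #   output = layer([data, mask])  # expected output shape: (2, 128, 768)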