Ejemplo n.º 1
0
    def test_embeddings_as_inputs(self):
        hidden_size = 32
        sequence_length = 21
        # Create a small BertEncoder for testing.
        test_network = bert_encoder.BertEncoderV2(vocab_size=100,
                                                  hidden_size=hidden_size,
                                                  num_attention_heads=2,
                                                  num_layers=3)
        # Create the inputs (note that the first dimension is implicit).
        word_ids = tf.keras.Input(shape=(sequence_length), dtype=tf.int32)
        mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        type_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        test_network.build(
            dict(input_word_ids=word_ids,
                 input_mask=mask,
                 input_type_ids=type_ids))
        embeddings = test_network.get_embedding_layer()(word_ids)
        # Calls with the embeddings.
        dict_outputs = test_network(
            dict(input_word_embeddings=embeddings,
                 input_mask=mask,
                 input_type_ids=type_ids))
        all_encoder_outputs = dict_outputs["encoder_outputs"]
        pooled = dict_outputs["pooled_output"]

        expected_data_shape = [None, sequence_length, hidden_size]
        expected_pooled_shape = [None, hidden_size]
        self.assertLen(all_encoder_outputs, 3)
        for data in all_encoder_outputs:
            self.assertAllEqual(expected_data_shape, data.shape.as_list())
        self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())

        # The default output dtype is float32.
        self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype)
        self.assertAllEqual(tf.float32, pooled.dtype)
  def test_weights_forward_compatible(self):
    batch_size = 3

    hidden_size = 32
    sequence_length = 21
    vocab_size = 57
    num_types = 7

    kwargs = dict(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=3,
        type_vocab_size=num_types,
        output_range=None)

    word_id_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    type_id_data = np.random.randint(
        num_types, size=(batch_size, sequence_length))
    data = dict(
        input_word_ids=word_id_data,
        input_mask=mask_data,
        input_type_ids=type_id_data)

    # Create small BertEncoders for testing.
    new_net = bert_encoder.BertEncoderV2(**kwargs)
    _ = new_net(data)
    kwargs["dict_outputs"] = True
    old_net = bert_encoder.BertEncoder(**kwargs)
    _ = old_net(data)
    new_net._embedding_layer.set_weights(old_net._embedding_layer.get_weights())
    new_net._position_embedding_layer.set_weights(
        old_net._position_embedding_layer.get_weights())
    new_net._type_embedding_layer.set_weights(
        old_net._type_embedding_layer.get_weights())
    new_net._embedding_norm_layer.set_weights(
        old_net._embedding_norm_layer.get_weights())
    # embedding_dropout has no weights.

    if hasattr(old_net, "_embedding_projection"):
      new_net._embedding_projection.set_weights(
          old_net._embedding_projection.get_weights())
    # attention_mask_layer has no weights.

    new_net._pooler_layer.set_weights(old_net._pooler_layer.get_weights())

    for otl, ntl in zip(old_net._transformer_layers,
                        new_net._transformer_layers):
      ntl.set_weights(otl.get_weights())

    def check_output_close(data, net1, net2):
      output1 = net1(data)
      output2 = net2(data)
      for key in output1:
        self.assertAllClose(output1[key], output2[key])

    check_output_close(data, old_net, new_net)
Ejemplo n.º 3
0
  def test_dict_outputs_network_creation(self):
    hidden_size = 32
    sequence_length = 21
    dense_sequence_length = 20
    # Create a small dense BertEncoderV2 for testing.
    kwargs = {}
    test_network = bert_encoder.BertEncoderV2(
        vocab_size=100,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=3,
        with_dense_inputs=True,
        **kwargs)
    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)

    dense_inputs = tf.keras.Input(
        shape=(dense_sequence_length, hidden_size), dtype=tf.float32)
    dense_mask = tf.keras.Input(shape=(dense_sequence_length,), dtype=tf.int32)
    dense_type_ids = tf.keras.Input(
        shape=(dense_sequence_length,), dtype=tf.int32)

    dict_outputs = test_network(
        dict(
            input_word_ids=word_ids,
            input_mask=mask,
            input_type_ids=type_ids,
            dense_inputs=dense_inputs,
            dense_mask=dense_mask,
            dense_type_ids=dense_type_ids))
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]

    self.assertIsInstance(test_network.transformer_layers, list)
    self.assertLen(test_network.transformer_layers, 3)
    self.assertIsInstance(test_network.pooler_layer, tf.keras.layers.Dense)

    expected_data_shape = [
        None, sequence_length + dense_sequence_length, hidden_size
    ]
    expected_pooled_shape = [None, hidden_size]
    self.assertAllEqual(expected_data_shape, data.shape.as_list())
    self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())

    # The default output dtype is float32.
    self.assertAllEqual(tf.float32, data.dtype)
    self.assertAllEqual(tf.float32, pooled.dtype)
  def test_keras_model_checkpoint_forward_compatible(self):
    batch_size = 3

    hidden_size = 32
    sequence_length = 21
    vocab_size = 57
    num_types = 7

    kwargs = dict(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=3,
        type_vocab_size=num_types,
        output_range=None)

    word_id_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    type_id_data = np.random.randint(
        num_types, size=(batch_size, sequence_length))
    data = dict(
        input_word_ids=word_id_data,
        input_mask=mask_data,
        input_type_ids=type_id_data)

    kwargs["dict_outputs"] = True
    old_net = bert_encoder.BertEncoder(**kwargs)
    inputs = old_net.inputs
    outputs = old_net(inputs)
    old_model = tf.keras.Model(inputs=inputs, outputs=outputs)
    old_model_outputs = old_model(data)
    ckpt = tf.train.Checkpoint(net=old_model)
    path = ckpt.save(self.get_temp_dir())
    del kwargs["dict_outputs"]
    new_net = bert_encoder.BertEncoderV2(**kwargs)
    inputs = new_net.inputs
    outputs = new_net(inputs)
    new_model = tf.keras.Model(inputs=inputs, outputs=outputs)
    new_ckpt = tf.train.Checkpoint(net=new_model)
    status = new_ckpt.restore(path)

    status.assert_existing_objects_matched()
    new_model_outputs = new_model(data)

    self.assertAllEqual(old_model_outputs.keys(), new_model_outputs.keys())
    for key in old_model_outputs:
      self.assertAllClose(old_model_outputs[key], new_model_outputs[key])
Ejemplo n.º 5
0
  def test_dict_outputs_all_encoder_outputs_network_creation(self):
    hidden_size = 32
    sequence_length = 21
    dense_sequence_length = 20
    # Create a small BertEncoder for testing.
    test_network = bert_encoder.BertEncoderV2(
        vocab_size=100,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=3,
        dict_outputs=True,
        with_dense_inputs=True)
    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)

    dense_inputs = tf.keras.Input(
        shape=(dense_sequence_length, hidden_size), dtype=tf.float32)
    dense_mask = tf.keras.Input(shape=(dense_sequence_length,), dtype=tf.int32)
    dense_type_ids = tf.keras.Input(
        shape=(dense_sequence_length,), dtype=tf.int32)

    dict_outputs = test_network(
        dict(
            input_word_ids=word_ids,
            input_mask=mask,
            input_type_ids=type_ids,
            dense_inputs=dense_inputs,
            dense_mask=dense_mask,
            dense_type_ids=dense_type_ids))

    all_encoder_outputs = dict_outputs["encoder_outputs"]
    pooled = dict_outputs["pooled_output"]

    expected_data_shape = [
        None, sequence_length + dense_sequence_length, hidden_size
    ]
    expected_pooled_shape = [None, hidden_size]
    self.assertLen(all_encoder_outputs, 3)
    for data in all_encoder_outputs:
      self.assertAllEqual(expected_data_shape, data.shape.as_list())
    self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())

    # The default output dtype is float32.
    self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype)
    self.assertAllEqual(tf.float32, pooled.dtype)
Ejemplo n.º 6
0
  def test_dict_outputs_network_creation_with_float16_dtype(self):
    hidden_size = 32
    sequence_length = 21
    dense_sequence_length = 20
    tf.keras.mixed_precision.set_global_policy("mixed_float16")
    # Create a small BertEncoder for testing.
    test_network = bert_encoder.BertEncoderV2(
        vocab_size=100,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=3,
        dict_outputs=True,
        with_dense_inputs=True)
    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)

    dense_inputs = tf.keras.Input(
        shape=(dense_sequence_length, hidden_size), dtype=tf.float32)
    dense_mask = tf.keras.Input(shape=(dense_sequence_length,), dtype=tf.int32)
    dense_type_ids = tf.keras.Input(
        shape=(dense_sequence_length,), dtype=tf.int32)

    dict_outputs = test_network(
        dict(
            input_word_ids=word_ids,
            input_mask=mask,
            input_type_ids=type_ids,
            dense_inputs=dense_inputs,
            dense_mask=dense_mask,
            dense_type_ids=dense_type_ids))

    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]

    expected_data_shape = [
        None, sequence_length + dense_sequence_length, hidden_size
    ]
    expected_pooled_shape = [None, hidden_size]
    self.assertAllEqual(expected_data_shape, data.shape.as_list())
    self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list())

    # If float_dtype is set to float16, the data output is float32 (from a layer
    # norm) and pool output should be float16.
    self.assertAllEqual(tf.float32, data.dtype)
    self.assertAllEqual(tf.float16, pooled.dtype)
Ejemplo n.º 7
0
    def test_checkpoint_forward_compatible(self):
        batch_size = 3

        hidden_size = 32
        sequence_length = 21
        vocab_size = 57
        num_types = 7

        kwargs = dict(vocab_size=vocab_size,
                      hidden_size=hidden_size,
                      num_attention_heads=2,
                      num_layers=3,
                      type_vocab_size=num_types,
                      output_range=None)

        word_id_data = np.random.randint(vocab_size,
                                         size=(batch_size, sequence_length))
        mask_data = np.random.randint(2, size=(batch_size, sequence_length))
        type_id_data = np.random.randint(num_types,
                                         size=(batch_size, sequence_length))
        data = dict(input_word_ids=word_id_data,
                    input_mask=mask_data,
                    input_type_ids=type_id_data)

        old_net = bert_encoder.BertEncoderV2(**kwargs)
        old_net_outputs = old_net(data)
        ckpt = tf.train.Checkpoint(net=old_net)
        path = ckpt.save(self.get_temp_dir())
        new_net = encoder.TokenDropBertEncoder(token_keep_k=sequence_length,
                                               token_allow_list=(),
                                               token_deny_list=(),
                                               **kwargs)
        new_ckpt = tf.train.Checkpoint(net=new_net)
        status = new_ckpt.restore(path)
        status.assert_existing_objects_matched()
        # assert_consumed will fail because the old model has redundant nodes.
        new_net_outputs = new_net(data)

        self.assertAllEqual(old_net_outputs.keys(), new_net_outputs.keys())
        for key in old_net_outputs:
            self.assertAllClose(old_net_outputs[key], new_net_outputs[key])
Ejemplo n.º 8
0
  def test_dict_outputs_network_invocation(
      self, encoder_cls, output_range, out_seq_len):
    hidden_size = 32
    sequence_length = 21
    dense_sequence_length = 20
    vocab_size = 57
    num_types = 7
    # Create a small BertEncoder for testing.
    test_network = encoder_cls(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        num_attention_heads=2,
        num_layers=3,
        type_vocab_size=num_types,
        output_range=output_range,
        dict_outputs=True,
        with_dense_inputs=True)
    # Create the inputs (note that the first dimension is implicit).
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    dense_inputs = tf.keras.Input(
        shape=(dense_sequence_length, hidden_size), dtype=tf.float32)
    dense_mask = tf.keras.Input(shape=(dense_sequence_length,), dtype=tf.int32)
    dense_type_ids = tf.keras.Input(
        shape=(dense_sequence_length,), dtype=tf.int32)

    dict_outputs = test_network(
        dict(
            input_word_ids=word_ids,
            input_mask=mask,
            input_type_ids=type_ids,
            dense_inputs=dense_inputs,
            dense_mask=dense_mask,
            dense_type_ids=dense_type_ids))
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]

    # Create a model based off of this network:
    model = tf.keras.Model(
        [word_ids, mask, type_ids, dense_inputs, dense_mask, dense_type_ids],
        [data, pooled])

    # Invoke the model. We can't validate the output data here (the model is too
    # complex) but this will catch structural runtime errors.
    batch_size = 3
    word_id_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    type_id_data = np.random.randint(
        num_types, size=(batch_size, sequence_length))

    dense_input_data = np.random.rand(batch_size, dense_sequence_length,
                                      hidden_size)
    dense_mask_data = np.random.randint(
        2, size=(batch_size, dense_sequence_length))
    dense_type_ids_data = np.random.randint(
        num_types, size=(batch_size, dense_sequence_length))

    outputs = model.predict([
        word_id_data, mask_data, type_id_data, dense_input_data,
        dense_mask_data, dense_type_ids_data
    ])
    self.assertEqual(outputs[0].shape[1], out_seq_len)

    # Creates a BertEncoder with max_sequence_length != sequence_length
    max_sequence_length = 128
    test_network = encoder_cls(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        max_sequence_length=max_sequence_length,
        num_attention_heads=2,
        num_layers=3,
        type_vocab_size=num_types,
        dict_outputs=True)
    dict_outputs = test_network(
        dict(
            input_word_ids=word_ids,
            input_mask=mask,
            input_type_ids=type_ids,
            dense_inputs=dense_inputs,
            dense_mask=dense_mask,
            dense_type_ids=dense_type_ids))
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]
    model = tf.keras.Model(
        [word_ids, mask, type_ids, dense_inputs, dense_mask, dense_type_ids],
        [data, pooled])
    outputs = model.predict([
        word_id_data, mask_data, type_id_data, dense_input_data,
        dense_mask_data, dense_type_ids_data
    ])
    self.assertEqual(outputs[0].shape[1],
                     sequence_length + dense_sequence_length)

    # Creates a BertEncoder with embedding_width != hidden_size
    embedding_width = 16
    test_network = bert_encoder.BertEncoderV2(
        vocab_size=vocab_size,
        hidden_size=hidden_size,
        max_sequence_length=max_sequence_length,
        num_attention_heads=2,
        num_layers=3,
        type_vocab_size=num_types,
        embedding_width=embedding_width,
        dict_outputs=True)

    dense_inputs = tf.keras.Input(
        shape=(dense_sequence_length, embedding_width), dtype=tf.float32)
    dense_input_data = np.zeros(
        (batch_size, dense_sequence_length, embedding_width), dtype=float)

    dict_outputs = test_network(
        dict(
            input_word_ids=word_ids,
            input_mask=mask,
            input_type_ids=type_ids,
            dense_inputs=dense_inputs,
            dense_mask=dense_mask,
            dense_type_ids=dense_type_ids))
    data = dict_outputs["sequence_output"]
    pooled = dict_outputs["pooled_output"]
    model = tf.keras.Model(
        [word_ids, mask, type_ids, dense_inputs, dense_mask, dense_type_ids],
        [data, pooled])
    outputs = model.predict([
        word_id_data, mask_data, type_id_data, dense_input_data,
        dense_mask_data, dense_type_ids_data
    ])
    self.assertEqual(outputs[0].shape[-1], hidden_size)
    self.assertTrue(hasattr(test_network, "_embedding_projection"))