Example #1
  def test_strategy_with_file(self, strategy):
    # TODO(b/180614455): remove this check when MLIR bridge is always enabled.
    if backend.is_tpu_strategy(strategy):
      self.skipTest("This test needs MLIR bridge on TPU.")

    vocab_data = ["earth", "wind", "and", "fire"]
    vocab_file = self._write_to_temp_file("temp", vocab_data)

    input_array = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_array).batch(
        2, drop_remainder=True)
    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

    config.set_soft_device_placement(True)

    with strategy.scope():
      input_data = keras.Input(shape=(None,), dtype=dtypes.string)
      layer = index_lookup.IndexLookup(
          max_tokens=None,
          num_oov_indices=1,
          mask_token="",
          oov_token="[OOV]",
          dtype=dtypes.string,
          vocabulary=vocab_file)
      int_data = layer(input_data)
      model = keras.Model(inputs=input_data, outputs=int_data)
    model.compile(loss="mse")
    output_dataset = model.predict(input_dataset)
    self.assertAllEqual(expected_output, output_dataset)
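The test assumes a _write_to_temp_file helper on the test class. A minimal sketch, assuming the helper writes one vocabulary token per line and returns the file path (the name and signature come from the call above; the body is hypothetical):

import os
import tempfile

def _write_to_temp_file(self, file_name, vocab_list):
    # Hypothetical helper: write one token per line and return the
    # resulting file path for IndexLookup's vocabulary argument.
    vocab_path = os.path.join(tempfile.mkdtemp(), file_name + ".txt")
    with open(vocab_path, "w") as f:
        f.write("\n".join(vocab_list) + "\n")
    return vocab_path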
Example #2
 def bm_adapt_implementation(self, num_elements, batch_size, k):
     """Test the KPL adapt implementation."""
     ds = dataset_ops.Dataset.from_generator(word_gen, dtypes.string,
                                             tensor_shape.TensorShape([]))
     batched_ds = ds.take(num_elements).batch(batch_size)
     input_t = keras.Input(shape=(), dtype=dtypes.string)
     layer = index_lookup.IndexLookup(max_tokens=k,
                                      num_oov_indices=0,
                                      mask_token=None,
                                      oov_token="OOV",
                                      dtype=dtypes.string)
     _ = layer(input_t)
     num_repeats = 5
     starts = []
     ends = []
     for _ in range(num_repeats):
         starts.append(time.time())
         layer.adapt(batched_ds)
         ends.append(time.time())
     avg_time = np.mean(np.array(ends) - np.array(starts))
     name = "index_lookup_adapt|%s_elements|vocab_size_%s|batch_%s" % (
         num_elements, k, batch_size)
     baseline = self.run_numpy_implementation(num_elements, batch_size, k)
     extras = {
         "numpy implementation baseline": baseline,
         "delta seconds": (baseline - avg_time),
         "delta percent": ((baseline - avg_time) / baseline) * 100
     }
     self.report_benchmark(iters=num_repeats,
                           wall_time=avg_time,
                           extras=extras,
                           name=name)
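This benchmark relies on a word_gen generator that is not shown. A minimal sketch, assuming word_gen yields an endless stream of short random strings (only the name and the scalar-string shape come from the from_generator call above; the body is hypothetical):

import random
import string

def word_gen():
    # Hypothetical generator: yield an endless stream of random
    # two-letter "words"; ds.take(num_elements) bounds the stream.
    while True:
        yield "".join(random.choice(string.ascii_lowercase) for _ in range(2))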
Example #3
  def DISABLED_test_tpu_distribution_with_file(self, distribution):
    vocab_data = ["earth", "wind", "and", "fire"]
    vocab_file = self._write_to_temp_file("temp", vocab_data)

    input_array = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_array).batch(
        2, drop_remainder=True)
    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

    config.set_soft_device_placement(True)

    with distribution.scope():
      input_data = keras.Input(shape=(None,), dtype=dtypes.string)
      layer = index_lookup.IndexLookup(
          max_tokens=None,
          num_oov_indices=1,
          mask_token="",
          oov_token="[OOV]",
          dtype=dtypes.string,
          vocabulary=vocab_file)
      int_data = layer(input_data)
      model = keras.Model(inputs=input_data, outputs=int_data)
    model.compile(loss="mse")
    output_dataset = model.predict(input_dataset)
    self.assertAllEqual(expected_output, output_dataset)
Example #4
    def test_strategy(self, strategy):
        vocab_data = [[
            "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and",
            "and", "fire"
        ]]
        vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
        input_array = np.array([["earth", "wind", "and", "fire"],
                                ["fire", "and", "earth", "michigan"]])
        input_dataset = dataset_ops.Dataset.from_tensor_slices(
            input_array).batch(2, drop_remainder=True)
        expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

        config.set_soft_device_placement(True)

        with strategy.scope():
            input_data = keras.Input(shape=(None,), dtype=dtypes.string)
            layer = index_lookup.IndexLookup(max_tokens=None,
                                             num_oov_indices=1,
                                             mask_token="",
                                             oov_token="[OOV]",
                                             dtype=dtypes.string)
            layer.adapt(vocab_dataset)
            int_data = layer(input_data)
            model = keras.Model(inputs=input_data, outputs=int_data)
        model.compile(loss="mse")
        output_dataset = model.predict(input_dataset)
        self.assertAllEqual(expected_output, output_dataset)
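The expected indices follow from the layer's slot layout: index 0 holds the mask token "", index 1 the single OOV slot, and adapted tokens are assigned indices in descending frequency order (earth=2, wind=3, and=4, fire=5), so the unseen "michigan" maps to 1. The same mapping can be reproduced with the public StringLookup layer; a minimal sketch, assuming a recent TF release (the module path varies across versions):

import numpy as np
import tensorflow as tf

lookup = tf.keras.layers.StringLookup(mask_token="", oov_token="[OOV]")
lookup.adapt(np.array(["earth"] * 4 + ["wind"] * 3 + ["and"] * 2 + ["fire"]))
print(lookup.get_vocabulary())
# ['', '[OOV]', 'earth', 'wind', 'and', 'fire']
print(lookup(np.array([["earth", "wind", "and", "fire"],
                       ["fire", "and", "earth", "michigan"]])))
# [[2 3 4 5] [5 4 2 1]]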
Example #5
    def test_tpu_with_multiple_oov(self, strategy):
        # TODO(b/180614455): remove this check when MLIR bridge is always enabled.
        if "TPU" in type(strategy).__name__:
            self.skipTest("This test needs MLIR bridge on TPU.")

        vocab_data = [[
            "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and",
            "and", "fire"
        ]]
        vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
        input_array = np.array([["earth", "wind", "and", "fire"],
                                ["fire", "and", "earth", "michigan"]])
        input_dataset = dataset_ops.Dataset.from_tensor_slices(
            input_array).batch(2, drop_remainder=True)
        expected_output = [[3, 4, 5, 6], [6, 5, 3, 1]]

        config.set_soft_device_placement(True)

        with strategy.scope():
            input_data = keras.Input(shape=(None,), dtype=dtypes.string)
            layer = index_lookup.IndexLookup(max_tokens=None,
                                             num_oov_indices=2,
                                             mask_token="",
                                             oov_token="[OOV]",
                                             dtype=dtypes.string)
            layer.adapt(vocab_dataset)
            int_data = layer(input_data)
            model = keras.Model(inputs=input_data, outputs=int_data)
        model.compile(loss="mse")
        output_dataset = model.predict(input_dataset)
        self.assertAllEqual(expected_output, output_dataset)
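With num_oov_indices=2 the whole vocabulary shifts by one slot (mask=0, OOV slots 1-2, earth=3, and so on), and an unseen token such as "michigan" is assigned to one of the two OOV slots by a deterministic hash of the token, which here lands on index 1. A minimal sketch of that bucketing, assuming a FarmHash-style bucket function like the one TF exposes (illustrative only; the layer's exact hash may differ):

import tensorflow as tf

num_oov_indices = 2
mask_offset = 1  # slot 0 is reserved for the mask token
bucket = tf.strings.to_hash_bucket_fast(tf.constant("michigan"), num_oov_indices)
oov_index = mask_offset + bucket  # one of {1, 2}, stable across runs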
Example #6
 def __init__(self, encoding, **kwargs):
     super().__init__(**kwargs)
     self.encoding = encoding
     self.encoding_layers = []
     for enc in self.encoding:
         if enc == NONE:
             self.encoding_layers.append(None)
         elif enc == INT:
             self.encoding_layers.append(index_lookup.IndexLookup())
         elif enc == ONE_HOT:
             self.encoding_layers.append(None)
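The constructor compares each entry against NONE, INT, and ONE_HOT constants defined elsewhere in the module. A minimal sketch of what those might look like (the values are hypothetical; only the names come from the code above):

# Hypothetical module-level encoding constants.
NONE = "none"
INT = "int"
ONE_HOT = "one-hot"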
Example #7
 def bm_adapt_implementation(self, num_elements, batch_size):
     """Test the KPL adapt implementation."""
     vocab = get_vocab()
     vocab_file = self._write_to_temp_file("vocab", vocab)
     vocabulary_initializer = lookup_ops.TextFileInitializer(
         filename=vocab_file,
         key_dtype=dtypes.string,
         key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
         value_dtype=dtypes.int64,
         value_index=lookup_ops.TextFileIndex.LINE_NUMBER,
         value_index_offset=2)
     input_t = keras.Input(shape=(), dtype=dtypes.string)
     layer = index_lookup.IndexLookup(vocabulary=vocabulary_initializer,
                                      max_tokens=None,
                                      num_oov_indices=1,
                                      mask_token="",
                                      oov_token="OOV",
                                      dtype=dtypes.string)
     out_t = layer(input_t)
     model = keras.Model(input_t, out_t)
     num_repeats = 5
     starts = []
     ends = []
     data = tensor_gen(batch_size, num_elements)
     _ = model(data)
     for _ in range(num_repeats):
         starts.append(time.time())
         _ = model(data)
         ends.append(time.time())
     avg_time = np.mean(np.array(ends) - np.array(starts))
     baseline, _ = self.run_numpy_implementation(data, vocab)
     extras = {
         "numpy implementation baseline": baseline,
         "delta seconds": (baseline - avg_time),
         "delta percent": ((baseline - avg_time) / baseline) * 100
     }
     name = "index_lookup_forward|%s_elements|batch_%s" % (num_elements,
                                                           batch_size)
     self.report_benchmark(iters=num_repeats,
                           wall_time=avg_time,
                           extras=extras,
                           name=name)
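This forward-pass benchmark leans on two helpers that are not shown, get_vocab and tensor_gen. A minimal sketch, assuming get_vocab returns a list of unique tokens and tensor_gen builds a dense (batch, num_elements) string tensor (both bodies are hypothetical; only the names and call shapes come from the code above):

import random
import string

import tensorflow as tf

def get_vocab():
    # Hypothetical: return a sorted list of unique random tokens.
    vocab = set()
    while len(vocab) < 100:
        vocab.add("".join(random.sample(string.ascii_lowercase, 5)))
    return sorted(vocab)

def tensor_gen(batch, num_elements):
    # Hypothetical: build a dense (batch, num_elements) string tensor.
    rows = []
    for _ in range(batch):
        rows.append(["".join(random.sample(string.ascii_lowercase, 5))
                     for _ in range(num_elements)])
    return tf.constant(rows)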
Example #8
 def run_numpy_implementation(self, num_elements, batch_size, k):
     """Test the python implementation."""
     ds = dataset_ops.Dataset.from_generator(word_gen, dtypes.string,
                                             tensor_shape.TensorShape([]))
     batched_ds = ds.take(num_elements).batch(batch_size)
     input_t = keras.Input(shape=(), dtype=dtypes.string)
     layer = index_lookup.IndexLookup(max_tokens=k,
                                      num_oov_indices=0,
                                      mask_token=None,
                                      oov_token="OOV",
                                      dtype=dtypes.string)
     _ = layer(input_t)
     num_repeats = 5
     starts = []
     ends = []
     for _ in range(num_repeats):
         starts.append(time.time())
         vocab = get_top_k(batched_ds, k)
         layer.set_vocabulary(vocab)
         ends.append(time.time())
     avg_time = np.mean(np.array(ends) - np.array(starts))
     return avg_time
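The baseline relies on a get_top_k helper. A minimal sketch, assuming it counts token frequencies across the batched dataset and returns the k most common tokens (the body is hypothetical):

import collections

def get_top_k(dataset, k):
    # Hypothetical: count token frequencies over the batched dataset
    # and return the k most frequent tokens for set_vocabulary.
    counts = collections.Counter()
    for batch in dataset:
        counts.update(t.decode("utf-8") for t in batch.numpy())
    return [token for token, _ in counts.most_common(k)]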
Example #9
 def run_numpy_implementation(self, data, vocab):
     """Test the python implementation."""
     input_t = keras.Input(shape=(), dtype=dtypes.string)
     layer = index_lookup.IndexLookup(vocabulary=vocab,
                                      max_tokens=None,
                                      num_oov_indices=1,
                                      mask_token="",
                                      oov_token="OOV",
                                      dtype=dtypes.string)
     out_t = layer(input_t)
     model = keras.Model(input_t, out_t)
     num_repeats = 5
     starts = []
     ends = []
     _ = model(data)
     for _ in range(num_repeats):
         starts.append(time.time())
         out = model(data)
         ends.append(time.time())
     avg_time = np.mean(np.array(ends) - np.array(starts))
     return avg_time, out
Example #10
  def test_tpu_distribution(self):
    vocab_data = [[
        "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and",
        "and", "fire"
    ]]
    vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data)
    input_array = np.array([["earth", "wind", "and", "fire"],
                            ["fire", "and", "earth", "michigan"]])
    input_dataset = dataset_ops.Dataset.from_tensor_slices(input_array).batch(
        2, drop_remainder=True)
    expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]]

    config.set_soft_device_placement(True)
    strategy = tpu_strategy_test_utils.get_tpu_strategy()

    with strategy.scope():
      input_data = keras.Input(shape=(None,), dtype=dtypes.string)
      layer = index_lookup.IndexLookup(
          max_tokens=None,
          num_oov_indices=1,
          mask_token="",
          oov_token="[OOV]",
          dtype=dtypes.string)
      layer.adapt(vocab_dataset)
      int_data = layer(input_data)
      model = keras.Model(inputs=input_data, outputs=int_data)
    output_dataset = model.predict(input_dataset)
    self.assertAllEqual(expected_output, output_dataset)
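tpu_strategy_test_utils.get_tpu_strategy is an internal test helper; the equivalent setup on public APIs looks roughly like the sketch below (the resolver arguments depend on the environment; this is an assumption, not the helper's actual body):

import tensorflow as tf

def get_tpu_strategy():
    # Hypothetical stand-in for the internal helper, built on public APIs:
    # resolve the TPU cluster, connect, initialize, and return a strategy.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="")
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    return tf.distribute.TPUStrategy(resolver)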