Beispiel #1
0
    def test_hash_ragged_string_input_siphash(self):
        """Hash ragged string input with two different salts.

        Each salted layer is applied directly to the ragged tensor and again
        through a functional model; both paths must produce the same bins.
        """
        ragged_words = tf.ragged.constant(
            [
                ["omar", "stringer", "marlo", "wire"],
                ["marlo", "skywalker", "wire"],
            ],
            dtype=tf.string,
        )
        first_layer = hashing.Hashing(num_bins=2, salt=[133, 137])
        direct_bins = first_layer(ragged_words)
        # Per-token bins agree with test_hash_dense_input_siphash.
        self.assertAllEqual([[0, 1, 0, 1], [0, 0, 1]], direct_bins)

        symbolic_in = input_layer.Input(
            shape=(None, ), ragged=True, dtype=tf.string)
        first_model = training.Model(
            inputs=symbolic_in, outputs=first_layer(symbolic_in))
        self.assertAllClose(direct_bins, first_model.predict(ragged_words))

        # Repeat the whole check with a different salt.
        second_layer = hashing.Hashing(num_bins=2, salt=[211, 137])
        direct_bins = second_layer(ragged_words)
        self.assertAllEqual([[1, 0, 1, 0], [1, 1, 0]], direct_bins)

        second_model = training.Model(
            inputs=symbolic_in, outputs=second_layer(symbolic_in))
        self.assertAllClose(direct_bins, second_model.predict(ragged_words))
Beispiel #2
0
 def test_legacy_dtype_compat(self):
     """Check the output dtype is int64 whatever `dtype` the layer is given."""
     string_input = keras.Input(batch_size=16, shape=(4, ), dtype="string")

     float_dtype_layer = hashing.Hashing(num_bins=3, dtype="float32")
     hashed = float_dtype_layer(string_input)
     self.assertAllEqual(hashed.dtype, tf.int64)

     # In TF1 we sometimes face an explicit dtype=None in the config.
     none_dtype_layer = hashing.Hashing(num_bins=3, dtype=None)
     hashed = none_dtype_layer(string_input)
     self.assertAllEqual(hashed.dtype, tf.int64)
Beispiel #3
0
 def test_invalid_inputs(self):
   """Verify that invalid `num_bins` and `salt` values raise ValueError."""
   bins_error = 'cannot be `None`'
   salt_error = 'can only be a tuple of size 2'
   # Constructors are wrapped in lambdas so that every argument is evaluated
   # inside the assertRaisesRegex context, one case at a time.
   failing_cases = (
       (bins_error, lambda: hashing.Hashing(num_bins=None)),
       (bins_error, lambda: hashing.Hashing(num_bins=-1)),
       (salt_error, lambda: hashing.Hashing(num_bins=2, salt='string')),
       (salt_error, lambda: hashing.Hashing(num_bins=2, salt=[1])),
       (salt_error,
        lambda: hashing.Hashing(num_bins=1, salt=tf.constant([133, 137]))),
   )
   for expected_message, build_layer in failing_cases:
     with self.assertRaisesRegex(ValueError, expected_message):
       _ = build_layer()
Beispiel #4
0
 def test_hash_dense_input_mask_value_farmhash(self):
     """Hash dense strings with a mask value set on the layer."""
     blank_masked = hashing.Hashing(num_bins=3, mask_value="")
     omar_masked = hashing.Hashing(num_bins=3, mask_value="omar")
     words = np.asarray([
         ["omar"],
         ["stringer"],
         ["marlo"],
         ["wire"],
         ["skywalker"],
     ])
     blank_masked_bins = blank_masked(words)
     omar_masked_bins = omar_masked(words)
     # Bin 0 is reserved for the mask, so every value is one higher than in
     # test_hash_dense_input_farmhash.
     self.assertAllClose([[1], [1], [2], [1], [1]], blank_masked_bins)
     # The masked token 'omar' lands in bin 0.
     self.assertAllClose([[0], [1], [2], [1], [1]], omar_masked_bins)
Beispiel #5
0
    def test_hash_dense_input_siphash(self):
        """Hash a dense string column with two salts and compare the bins."""
        first_layer = hashing.Hashing(num_bins=2, salt=[133, 137])
        words = np.asarray([
            ["omar"],
            ["stringer"],
            ["marlo"],
            ["wire"],
            ["skywalker"],
        ])
        first_bins = first_layer(words)
        # These bins should hold on all platforms; note that they differ
        # from the FarmHash results.
        self.assertAllClose([[0], [1], [0], [1], [0]], first_bins)

        second_layer = hashing.Hashing(num_bins=2, salt=[211, 137])
        second_bins = second_layer(words)
        # A different salt gives different bins than (133, 137).
        self.assertAllClose([[1], [0], [1], [0], [1]], second_bins)
Beispiel #6
0
    def test_hash_dense_multi_inputs_siphash(self):
        """Hash a pair of dense string columns together with two salts."""
        first_layer = hashing.Hashing(num_bins=2, salt=[133, 137])
        words = np.asarray([
            ['omar'],
            ['stringer'],
            ['marlo'],
            ['wire'],
            ['skywalker'],
        ])
        letters = np.asarray([['A'], ['B'], ['C'], ['D'], ['E']])
        column_pair = [words, letters]
        first_bins = first_layer(column_pair)
        # These bins should hold on all platforms; note that they differ
        # from the FarmHash results.
        self.assertAllClose([[0], [1], [0], [0], [1]], first_bins)

        second_layer = hashing.Hashing(num_bins=2, salt=[211, 137])
        second_bins = second_layer(column_pair)
        # A different salt gives different bins than (133, 137).
        self.assertAllClose([[1], [1], [1], [0], [1]], second_bins)
Beispiel #7
0
 def test_hash_ragged_input_mask_value(self):
     """Hash ragged strings with a mask value set on the layer."""
     blank_masked = hashing.Hashing(num_bins=3, mask_value='')
     omar_masked = hashing.Hashing(num_bins=3, mask_value='omar')
     ragged_words = tf.ragged.constant(
         [
             ['omar', 'stringer', 'marlo', 'wire'],
             ['marlo', 'skywalker', 'wire'],
         ],
         dtype=tf.string,
     )
     blank_masked_bins = blank_masked(ragged_words)
     omar_masked_bins = omar_masked(ragged_words)
     # Bin 0 is reserved for the mask, so every value is one higher than in
     # test_hash_ragged_string_input_farmhash.
     self.assertAllClose([[1, 1, 2, 1], [2, 1, 1]], blank_masked_bins)
     # The masked token 'omar' lands in bin 0.
     self.assertAllClose([[0, 1, 2, 1], [2, 1, 1]], omar_masked_bins)
Beispiel #8
0
 def test_hash_dense_multi_inputs_mask_value_farmhash(self):
     """Expect a ValueError when a masked layer receives a list of inputs."""
     masked_layer = hashing.Hashing(num_bins=3, mask_value='omar')
     words = np.asarray([
         ['omar'],
         ['stringer'],
         ['marlo'],
         ['wire'],
         ['skywalker'],
     ])
     letters = np.asarray([['A'], ['B'], ['C'], ['D'], ['E']])
     column_pair = [words, letters]
     with self.assertRaisesRegex(ValueError, 'not supported yet'):
         _ = masked_layer(column_pair)
  def bm_layer_implementation(self, batch_size):
    """Benchmark the Hashing layer and report it against the dataset baseline.

    Runs `num_repeats` timed passes; each pass feeds `num_batches` shuffled
    batches from `word_gen` through the layer. The reported wall time is the
    mean pass duration divided by `num_batches`.

    Args:
      batch_size: Number of elements per batch; also scales the shuffle
        buffer (`batch_size * 100`) and is embedded in the benchmark name.
    """
    input_1 = keras.Input(shape=(None,), dtype=tf.string, name="word")
    layer = hashing.Hashing(num_bins=2)
    # Call the layer once on a symbolic input before timing anything.
    _ = layer(input_1)

    num_repeats = 5
    # Loop-invariant, and also needed after the loop to compute the average,
    # so it is defined up front rather than inside the loop body.
    num_batches = 5
    starts = []
    ends = []
    for _ in range(num_repeats):
      ds = tf.data.Dataset.from_generator(word_gen, tf.string,
                                          tf.TensorShape([]))
      ds = ds.shuffle(batch_size * 100)
      ds = ds.batch(batch_size)
      ds = ds.take(num_batches)
      ds = ds.prefetch(num_batches)
      starts.append(time.time())
      # Benchmarked code begins here.
      for batch in ds:
        layer(batch)
      # Benchmarked code ends here.
      ends.append(time.time())

    avg_time = np.mean(np.array(ends) - np.array(starts)) / num_batches
    name = "hashing|batch_%s" % batch_size
    baseline = self.run_dataset_implementation(batch_size)
    extras = {
        "dataset implementation baseline": baseline,
        "delta seconds": (baseline - avg_time),
        "delta percent": ((baseline - avg_time) / baseline) * 100
    }
    self.report_benchmark(
        iters=num_repeats, wall_time=avg_time, extras=extras, name=name)
Beispiel #10
0
    def test_hash_sparse_input_siphash(self):
        """Hash sparse string input with two salts; indices must be kept."""
        first_layer = hashing.Hashing(num_bins=2, salt=[133, 137])
        positions = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
        sparse_words = tf.SparseTensor(
            indices=positions,
            values=['omar', 'stringer', 'marlo', 'wire', 'skywalker'],
            dense_shape=[3, 2],
        )
        hashed = first_layer(sparse_words)
        self.assertAllClose(hashed.indices, positions)
        # Values agree with test_hash_dense_input_siphash.
        self.assertAllClose([0, 1, 0, 1, 0], hashed.values)

        second_layer = hashing.Hashing(num_bins=2, salt=[211, 137])
        hashed = second_layer(sparse_words)
        # Values agree with the second salt in test_hash_dense_input_siphash.
        self.assertAllClose([1, 0, 1, 0, 1], hashed.values)
Beispiel #11
0
    def test_tensor_like_inputs(self, data_fn):
        """Hash integers 0..4, wrapped by `data_fn`, into three bins."""
        wrapped_ints = data_fn([0, 1, 2, 3, 4])
        want = [1, 0, 1, 0, 2]

        three_bin_layer = hashing.Hashing(num_bins=3)
        got = three_bin_layer(wrapped_ints)
        self.assertAllEqual(got, want)
Beispiel #12
0
 def test_hash_dense_input_farmhash(self):
     """Hash a dense string column into two bins without a salt."""
     unsalted_layer = hashing.Hashing(num_bins=2)
     words = np.asarray([
         ["omar"],
         ["stringer"],
         ["marlo"],
         ["wire"],
         ["skywalker"],
     ])
     bins = unsalted_layer(words)
     # These bins should hold on all platforms.
     self.assertAllClose([[0], [0], [1], [0], [0]], bins)
Beispiel #13
0
def embedding_varlen(batch_size, max_length):
  """Time hashing of string data: Keras `Hashing` vs. a hash-bucket column.

  Args:
    batch_size: Batch size forwarded to both benchmark runners; also scales
      the amount of generated data (`batch_size * NUM_REPEATS`).
    max_length: Forwarded to `fc_bm.create_string_data` and used as the
      Keras input shape.

  Returns:
    A `(k_avg_time, fc_avg_time)` tuple holding the results of
    `fc_bm.run_keras` and `fc_bm.run_fc` respectively.
  """
  # Data and constants.
  num_buckets = 10000
  vocab = fc_bm.create_vocabulary(32768)
  num_examples = batch_size * NUM_REPEATS
  data = fc_bm.create_string_data(max_length, num_examples, vocab, pct_oov=0.0)

  # Keras implementation
  model = keras.Sequential()
  ragged_input = keras.Input(
      shape=(max_length,), name="data", ragged=True, dtype=tf.string)
  model.add(ragged_input)
  model.add(hashing.Hashing(num_buckets))

  # FC implementation
  fc = tf.feature_column.categorical_column_with_hash_bucket(
      "data", num_buckets)

  # Wrap the FC implementation in a tf.function for a fair comparison
  @tf_function()
  def fc_fn(tensors):
    cache = tf.__internal__.feature_column.FeatureTransformationCache(tensors)
    fc.transform_feature(cache, None)

  # Benchmark runs
  k_avg_time = fc_bm.run_keras({"data": data}, model, batch_size, NUM_REPEATS)
  fc_avg_time = fc_bm.run_fc(
      {"data": data.to_sparse()}, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time
Beispiel #14
0
 def test_hash_compute_output_signature(self):
     """Check the output spec keeps the input shape and has dtype int64."""
     shape_in = tf.TensorShape([2, 3])
     spec_in = tf.TensorSpec(shape_in, tf.string)
     spec_out = hashing.Hashing(num_bins=2).compute_output_signature(spec_in)
     self.assertEqual(spec_out.shape.dims, shape_in.dims)
     self.assertEqual(spec_out.dtype, tf.int64)
Beispiel #15
0
 def test_hash_dense_multi_inputs_farmhash(self):
     """Hash a pair of dense string columns together without a salt."""
     unsalted_layer = hashing.Hashing(num_bins=2)
     words = np.asarray([
         ['omar'],
         ['stringer'],
         ['marlo'],
         ['wire'],
         ['skywalker'],
     ])
     letters = np.asarray([['A'], ['B'], ['C'], ['D'], ['E']])
     bins = unsalted_layer([words, letters])
     # These bins should hold on all platforms.
     self.assertAllClose([[0], [0], [1], [1], [0]], bins)
Beispiel #16
0
 def test_hash_dense_list_inputs_mixed_int_string_farmhash(self):
     """Hash a string column paired with an int64 column without a salt."""
     unsalted_layer = hashing.Hashing(num_bins=2)
     words = np.asarray([
         ['omar'],
         ['stringer'],
         ['marlo'],
         ['wire'],
         ['skywalker'],
     ])
     numbers = np.asarray([[1], [2], [3], [4], [5]]).astype(np.int64)
     bins = unsalted_layer([words, numbers])
     # These bins should hold on all platforms.
     self.assertAllClose([[0], [1], [1], [1], [0]], bins)
Beispiel #17
0
 def test_hash_sparse_int_input_siphash(self):
   """Hash sparse integer input with a salt; indices must be kept."""
   salted_layer = hashing.Hashing(num_bins=3, salt=[133, 137])
   positions = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
   sparse_ints = tf.SparseTensor(
       indices=positions,
       values=[0, 1, 2, 3, 4],
       dense_shape=[3, 2],
   )
   hashed = salted_layer(sparse_ints)
   self.assertAllClose(positions, hashed.indices)
   self.assertAllClose([1, 1, 2, 0, 1], hashed.values)
Beispiel #18
0
 def test_hash_sparse_input_mask_value_farmhash(self):
     """Hash sparse strings with a mask value set on the layer."""
     blank_masked = hashing.Hashing(num_bins=3, mask_value='')
     omar_masked = hashing.Hashing(num_bins=3, mask_value='omar')
     positions = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
     sparse_words = tf.SparseTensor(
         indices=positions,
         values=['omar', 'stringer', 'marlo', 'wire', 'skywalker'],
         dense_shape=[3, 2],
     )
     blank_masked_out = blank_masked(sparse_words)
     omar_masked_out = omar_masked(sparse_words)
     # Both layers must leave the sparse indices untouched.
     self.assertAllClose(positions, omar_masked_out.indices)
     self.assertAllClose(positions, blank_masked_out.indices)
     # Bin 0 is reserved for the mask, so every value is one higher than in
     # test_hash_sparse_input_farmhash.
     self.assertAllClose([1, 1, 2, 1, 1], blank_masked_out.values)
     # The masked token 'omar' lands in bin 0.
     self.assertAllClose([0, 1, 2, 1, 1], omar_masked_out.values)
Beispiel #19
0
    def test_hash_sparse_multi_inputs_siphash(self):
        """Hash a pair of sparse string inputs together with two salts."""
        first_layer = hashing.Hashing(num_bins=2, salt=[133, 137])
        positions = [[0, 0], [1, 0], [2, 0]]
        sparse_words = tf.SparseTensor(
            indices=positions,
            values=['omar', 'stringer', 'marlo'],
            dense_shape=[3, 1],
        )
        sparse_letters = tf.SparseTensor(
            indices=positions,
            values=['A', 'B', 'C'],
            dense_shape=[3, 1],
        )
        sparse_pair = [sparse_words, sparse_letters]
        hashed = first_layer(sparse_pair)
        # Values agree with the first three rows of
        # test_hash_dense_multi_inputs_siphash.
        self.assertAllClose(positions, hashed.indices)
        self.assertAllClose([0, 1, 0], hashed.values)

        second_layer = hashing.Hashing(num_bins=2, salt=[211, 137])
        hashed = second_layer(sparse_pair)
        # Again the first three rows of test_hash_dense_multi_inputs_siphash,
        # this time for the second salt.
        self.assertAllClose([1, 1, 1], hashed.values)
Beispiel #20
0
 def test_hash_ragged_string_multi_inputs_siphash(self):
     """Expect a ValueError when a list of ragged inputs is passed."""
     salted_layer = hashing.Hashing(num_bins=2, salt=[133, 137])
     first_ragged = tf.ragged.constant(
         [
             ['omar', 'stringer', 'marlo', 'wire'],
             ['marlo', 'skywalker', 'wire'],
         ],
         dtype=tf.string,
     )
     second_ragged = tf.ragged.constant(
         [
             ['omar', 'stringer', 'marlo', 'wire'],
             ['marlo', 'skywalker', 'wire'],
         ],
         dtype=tf.string,
     )
     ragged_pair = [first_ragged, second_ragged]
     with self.assertRaisesRegex(ValueError, 'not supported yet'):
         _ = salted_layer(ragged_pair)
Beispiel #21
0
 def test_hash_sparse_input_farmhash(self):
     """Hash sparse string input without a salt; indices must be kept."""
     unsalted_layer = hashing.Hashing(num_bins=2)
     positions = [[0, 0], [1, 0], [1, 1], [2, 0], [2, 1]]
     sparse_words = tf.SparseTensor(
         indices=positions,
         values=['omar', 'stringer', 'marlo', 'wire', 'skywalker'],
         dense_shape=[3, 2],
     )
     hashed = unsalted_layer(sparse_words)
     self.assertAllClose(positions, hashed.indices)
     self.assertAllClose([0, 0, 1, 0, 0], hashed.values)
Beispiel #22
0
    def test_hash_dense_list_input_farmhash(self):
        """Hash plain Python lists, both nested (2-D) and flat (1-D)."""
        unsalted_layer = hashing.Hashing(num_bins=2)

        nested_words = [['omar'], ['stringer'], ['marlo'], ['wire'],
                        ['skywalker']]
        nested_bins = unsalted_layer(nested_words)
        # These bins should hold on all platforms.
        self.assertAllClose([[0], [0], [1], [0], [0]], nested_bins)

        flat_words = ['omar', 'stringer', 'marlo', 'wire', 'skywalker']
        flat_bins = unsalted_layer(flat_words)
        # Same bins as above, without the inner dimension.
        self.assertAllClose([0, 0, 1, 0, 0], flat_bins)
Beispiel #23
0
 def test_hash_sparse_multi_inputs_farmhash(self):
     """Hash a pair of sparse string inputs together without a salt."""
     unsalted_layer = hashing.Hashing(num_bins=2)
     positions = [[0, 0], [1, 0], [2, 0]]
     sparse_words = tf.SparseTensor(
         indices=positions,
         values=['omar', 'stringer', 'marlo'],
         dense_shape=[3, 1],
     )
     sparse_letters = tf.SparseTensor(
         indices=positions,
         values=['A', 'B', 'C'],
         dense_shape=[3, 1],
     )
     hashed = unsalted_layer([sparse_words, sparse_letters])
     self.assertAllClose(positions, hashed.indices)
     self.assertAllClose([0, 0, 1], hashed.values)
Beispiel #24
0
  def test_hash_ragged_int_input_siphash(self):
    """Hash ragged int64 input with a salt, directly and through a model."""
    salted_layer = hashing.Hashing(num_bins=3, salt=[133, 137])
    ragged_ints = tf.ragged.constant(
        [[0, 1, 3, 4], [2, 1, 0]], dtype=tf.int64)
    direct_bins = salted_layer(ragged_ints)
    # Per-value bins agree with test_hash_sparse_int_input_siphash.
    self.assertAllEqual([[1, 1, 0, 1], [2, 1, 1]], direct_bins)

    symbolic_in = input_layer.Input(
        shape=(None,), ragged=True, dtype=tf.int64)
    model = training.Model(
        inputs=symbolic_in, outputs=salted_layer(symbolic_in))
    self.assertAllClose(direct_bins, model.predict(ragged_ints))
Beispiel #25
0
    def test_hash_ragged_string_input_farmhash(self):
        """Hash ragged string input without a salt, directly and via a model."""
        unsalted_layer = hashing.Hashing(num_bins=2)
        ragged_words = tf.ragged.constant(
            [
                ['omar', 'stringer', 'marlo', 'wire'],
                ['marlo', 'skywalker', 'wire'],
            ],
            dtype=tf.string,
        )
        direct_bins = unsalted_layer(ragged_words)
        # Per-token bins agree with test_hash_sparse_input_farmhash.
        self.assertAllEqual([[0, 0, 1, 0], [1, 0, 0]], direct_bins)

        symbolic_in = input_layer.Input(
            shape=(None, ), ragged=True, dtype=tf.string)
        model = training.Model(
            inputs=symbolic_in, outputs=unsalted_layer(symbolic_in))
        self.assertAllClose(direct_bins, model.predict(ragged_words))
def embedding_varlen(batch_size, max_length):
    """Time crossing + hashing: Keras layers vs. a crossed feature column.

    Args:
        batch_size: Batch size forwarded to both benchmark runners; also
            scales the generated data and fixes the dense tensor shape.
        max_length: Forwarded to `fc_bm.create_string_data` and used as the
            second dimension of the dense tensors.

    Returns:
        A `(k_avg_time, fc_avg_time)` tuple holding the results of
        `fc_bm.run_keras` and `fc_bm.run_fc` respectively.
    """
    # Data and constants.
    num_buckets = 10000
    vocab = fc_bm.create_vocabulary(32768)

    def _make_string_data():
        # Both features are generated with identical settings.
        return fc_bm.create_string_data(max_length,
                                        batch_size * NUM_REPEATS,
                                        vocab,
                                        pct_oov=0.0)

    data_a = _make_string_data()
    data_b = _make_string_data()

    def _dense_features():
        # Builds fresh dense tensors on every call, so each benchmark run
        # gets its own feature dict.
        dense_shape = (batch_size, max_length)
        return {
            "data_a": data_a.to_tensor(default_value="", shape=dense_shape),
            "data_b": data_b.to_tensor(default_value="", shape=dense_shape),
        }

    # Keras implementation
    input_1 = keras.Input(shape=(None, ), name="data_a", dtype=tf.string)
    input_2 = keras.Input(shape=(None, ), name="data_b", dtype=tf.string)
    cross_layer = category_crossing.CategoryCrossing()
    crossed_data = cross_layer([input_1, input_2])
    hash_layer = hashing.Hashing(num_buckets)
    model = keras.Model([input_1, input_2], hash_layer(crossed_data))

    # FC implementation
    fc = tf.feature_column.crossed_column(["data_a", "data_b"], num_buckets)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        cache = tf.__internal__.feature_column.FeatureTransformationCache(
            tensors)
        fc.transform_feature(cache, None)

    # Benchmark runs
    k_avg_time = fc_bm.run_keras(
        _dense_features(), model, batch_size, NUM_REPEATS)
    fc_avg_time = fc_bm.run_fc(
        _dense_features(), fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
Beispiel #27
0
    def test_multi_hot_output(self):
        """Check shape and values produced with `output_mode='multi_hot'`."""
        raw_ints = np.array([0, 1, 2, 3, 4])
        want_values = [1., 1., 1.]
        want_shape = [None, 3]

        symbolic_in = keras.Input(shape=(3, ), dtype='int32')
        multi_hot_layer = hashing.Hashing(num_bins=3, output_mode='multi_hot')
        symbolic_out = multi_hot_layer(symbolic_in)
        self.assertAllEqual(want_shape, symbolic_out.shape.as_list())

        model = keras.Model(symbolic_in, symbolic_out)
        self.assertAllEqual(want_values, model(raw_ints))
Beispiel #28
0
    def test_count_output(self):
        """Check shape and values produced with `output_mode="count"`."""
        raw_ints = np.array([0, 1, 2, 3, 4])
        want_values = [2.0, 2.0, 1.0]
        want_shape = [None, 3]

        symbolic_in = keras.Input(shape=(3, ), dtype="int32")
        count_layer = hashing.Hashing(num_bins=3, output_mode="count")
        symbolic_out = count_layer(symbolic_in)
        self.assertAllEqual(want_shape, symbolic_out.shape.as_list())

        model = keras.Model(symbolic_in, symbolic_out)
        self.assertAllEqual(want_values, model(raw_ints))
    def test_distribution(self, distribution):
        """Build a Hashing model inside `distribution.scope()` and predict."""
        words = np.asarray([["omar"], ["stringer"], ["marlo"], ["wire"]])
        batched_words = tf.data.Dataset.from_tensor_slices(words).batch(
            2, drop_remainder=True)
        want = [[0], [0], [1], [0]]

        tf.config.set_soft_device_placement(True)

        # Only model construction happens inside the scope; predict runs
        # after the scope is left.
        with distribution.scope():
            symbolic_in = keras.Input(shape=(None, ), dtype=tf.string)
            hash_layer = hashing.Hashing(num_bins=2)
            symbolic_out = hash_layer(symbolic_in)
            model = keras.Model(inputs=symbolic_in, outputs=symbolic_out)
        predictions = model.predict(batched_words)
        self.assertAllEqual(want, predictions)
Beispiel #30
0
    def test_strategy(self, strategy):
        """Build a Hashing model inside `strategy.scope()` and predict.

        Skipped for TPU strategies when the MLIR bridge is not enabled.
        """
        needs_mlir_bridge = backend.is_tpu_strategy(strategy)
        if needs_mlir_bridge and not tf_test_utils.is_mlir_bridge_enabled():
            self.skipTest("TPU tests require MLIR bridge")

        words = np.asarray([["omar"], ["stringer"], ["marlo"], ["wire"]])
        batched_words = tf.data.Dataset.from_tensor_slices(words).batch(
            2, drop_remainder=True)
        want = [[0], [0], [1], [0]]

        tf.config.set_soft_device_placement(True)

        # Only model construction happens inside the scope; predict runs
        # after the scope is left.
        with strategy.scope():
            symbolic_in = keras.Input(shape=(None, ), dtype=tf.string)
            hash_layer = hashing.Hashing(num_bins=2)
            symbolic_out = hash_layer(symbolic_in)
            model = keras.Model(inputs=symbolic_in, outputs=symbolic_out)
        predictions = model.predict(batched_words)
        self.assertAllEqual(want, predictions)