def embedding_varlen(self, batch_size, max_length):
    """Benchmark a variable-length embedding."""
    # Data and constants.
    vocab = fc_bm.create_vocabulary(32768)
    path = self._write_to_temp_file("tmp", vocab)

    data = fc_bm.create_string_data(
        max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15)

    # Keras implementation
    model = keras.Sequential()
    model.add(
        keras.Input(
            shape=(max_length,), name="data", ragged=True, dtype=dt.string))
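    # Map each string token to its index in the vocabulary file written above.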
    model.add(string_lookup.StringLookup(vocabulary=path, mask_token=None))

    # FC implementation
    fc = sfc.sequence_categorical_column_with_vocabulary_list(
        key="data", vocabulary_list=vocab, num_oov_buckets=1)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
      fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    keras_data = {"data": data}
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {"data": data.to_sparse()}
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
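
# ---------------------------------------------------------------------------
# Note: the snippet above is a method of a benchmark class (hence `self` and
# the `self._write_to_temp_file` helper); the snippets below are module-level
# functions. All of them rely on module-level imports and a NUM_REPEATS
# constant roughly like the sketch below. These module paths follow the
# TF 2.4-era internal layout the benchmarks appear to come from and are an
# assumption, not the original header; adjust them to your TensorFlow version.
#
#   import numpy as np
#   from tensorflow.python import keras
#   from tensorflow.python.eager.def_function import function as tf_function
#   from tensorflow.python.feature_column import feature_column_v2 as fcv2
#   from tensorflow.python.feature_column import sequence_feature_column as sfc
#   from tensorflow.python.framework import dtypes as dt
#   from tensorflow.python.keras.layers.preprocessing import category_crossing
#   from tensorflow.python.keras.layers.preprocessing import category_encoding
#   from tensorflow.python.keras.layers.preprocessing import discretization
#   from tensorflow.python.keras.layers.preprocessing import hashing
#   from tensorflow.python.keras.layers.preprocessing import string_lookup
#   from tensorflow.python.keras.layers.preprocessing.benchmarks import (
#       feature_column_benchmark as fc_bm)
#   from tensorflow.python.ops import array_ops
#   from tensorflow.python.ops import math_ops
#
#   NUM_REPEATS = 10  # assumed value; the original constant may differ
# ---------------------------------------------------------------------------
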
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding."""
    # Data and constants.
    embedding_size = 32768
    data = fc_bm.create_data(max_length,
                             batch_size * NUM_REPEATS,
                             embedding_size - 1,
                             dtype=int)

    # Keras implementation
    model = keras.Sequential()
    model.add(
        keras.Input(shape=(None, ), ragged=True, name="data", dtype=dt.int64))
    model.add(keras.layers.Embedding(embedding_size, 256))
    model.add(keras.layers.Lambda(lambda x: math_ops.reduce_mean(x, axis=-1)))

    # FC implementation
    fc = fcv2.embedding_column(fcv2.categorical_column_with_identity(
        "data", num_buckets=embedding_size - 1),
                               dimension=256)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    keras_data = {"data": data}
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {"data": data.to_sparse()}
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
def embedding_varlen(batch_size, max_length):
  """Benchmark a variable-length embedding."""
  # Data and constants.
  max_value = 25.0
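  # Bucket boundaries 1.0, 2.0, ..., 24.0, shared by both implementations below.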
  bins = np.arange(1.0, max_value)
  data = fc_bm.create_data(
      max_length, batch_size * NUM_REPEATS, 100000, dtype=float)

  # Keras implementation
  model = keras.Sequential()
  model.add(keras.Input(shape=(max_length,), name="data", dtype=dt.float32))
  model.add(discretization.Discretization(bins))

  # FC implementation
  fc = fcv2.bucketized_column(
      fcv2.numeric_column("data"), boundaries=list(bins))

  # Wrap the FC implementation in a tf.function for a fair comparison
  @tf_function()
  def fc_fn(tensors):
    fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

  # Benchmark runs
  keras_data = {"data": data.to_tensor(default_value=0.0)}
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

  fc_data = {"data": data.to_tensor(default_value=0.0)}
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding."""
    # Data and constants.
    num_buckets = 10000
    vocab = fc_bm.create_vocabulary(32768)
    data_a = fc_bm.create_string_data(max_length,
                                      batch_size * NUM_REPEATS,
                                      vocab,
                                      pct_oov=0.0)
    data_b = fc_bm.create_string_data(max_length,
                                      batch_size * NUM_REPEATS,
                                      vocab,
                                      pct_oov=0.0)

    # Keras implementation
    input_1 = keras.Input(shape=(None, ), name="data_a", dtype=dt.string)
    input_2 = keras.Input(shape=(None, ), name="data_b", dtype=dt.string)
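    # Cross the two string features, then hash each crossed token into num_buckets ids.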
    crossed_data = category_crossing.CategoryCrossing()([input_1, input_2])
    hashed_data = hashing.Hashing(num_buckets)(crossed_data)
    model = keras.Model([input_1, input_2], hashed_data)

    # FC implementation
    fc = fcv2.crossed_column(["data_a", "data_b"], num_buckets)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    keras_data = {
        "data_a":
        data_a.to_tensor(default_value="", shape=(batch_size, max_length)),
        "data_b":
        data_b.to_tensor(default_value="", shape=(batch_size, max_length)),
    }
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {
        "data_a":
        data_a.to_tensor(default_value="", shape=(batch_size, max_length)),
        "data_b":
        data_b.to_tensor(default_value="", shape=(batch_size, max_length)),
    }
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding."""
    # Data and constants.
    embedding_size = 32768
    data = fc_bm.create_data(max_length,
                             batch_size * NUM_REPEATS,
                             embedding_size - 1,
                             dtype=int)
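    # Per-token weights: all ones, with the same ragged structure as the data.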
    weight = array_ops.ones_like_v2(data, dtype=dt.float32)

    # Keras implementation
    data_input = keras.Input(shape=(None, ),
                             ragged=True,
                             name="data",
                             dtype=dt.int64)
    weight_input = keras.Input(shape=(None, ),
                               ragged=True,
                               name="weight",
                               dtype=dt.float32)
    embedded_data = keras.layers.Embedding(embedding_size, 256)(data_input)
    weighted_embedding = math_ops.multiply(
        embedded_data, array_ops.expand_dims(weight_input, -1))
    reduced_embedding = math_ops.reduce_sum(weighted_embedding, axis=1)
    model = keras.Model([data_input, weight_input], reduced_embedding)

    # FC implementation
    fc = fcv2.embedding_column(fcv2.weighted_categorical_column(
        fcv2.categorical_column_with_identity("data",
                                              num_buckets=embedding_size - 1),
        weight_feature_key="weight"),
                               dimension=256)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    keras_data = {"data": data, "weight": weight}
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {"data": data.to_sparse(), "weight": weight.to_sparse()}
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding."""
    # Data and constants.
    vocab_size = 32768
    vocab = fc_bm.create_vocabulary(vocab_size)
    data = fc_bm.create_string_data(max_length,
                                    batch_size * NUM_REPEATS,
                                    vocab,
                                    pct_oov=0.15)

    # Keras implementation
    model = keras.Sequential()
    model.add(keras.Input(shape=(max_length, ), name="data", dtype=dt.string))
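    # Map strings to vocabulary ids, then encode each example as per-id counts.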
    model.add(string_lookup.StringLookup(vocabulary=vocab, mask_token=None))
    model.add(
        category_encoding.CategoryEncoding(num_tokens=vocab_size + 1,
                                           output_mode="count"))

    # FC implementation
    fc = fcv2.indicator_column(
        fcv2.categorical_column_with_vocabulary_list(key="data",
                                                     vocabulary_list=vocab,
                                                     num_oov_buckets=1))

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    keras_data = {
        "data": data.to_tensor(default_value="",
                               shape=(batch_size, max_length))
    }
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {
        "data": data.to_tensor(default_value="",
                               shape=(batch_size, max_length))
    }
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding."""
    # Data and constants.
    num_buckets = 10000
    vocab = fc_bm.create_vocabulary(32768)
    data = fc_bm.create_string_data(max_length,
                                    batch_size * NUM_REPEATS,
                                    vocab,
                                    pct_oov=0.0)

    # Keras implementation
    model = keras.Sequential()
    model.add(keras.Input(shape=(max_length, ), name="data", dtype=dt.string))
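    # Hash each string token into one of num_buckets ids.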
    model.add(hashing.Hashing(num_buckets))

    # FC implementation
    fc = sfc.sequence_categorical_column_with_hash_bucket("data", num_buckets)

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    keras_data = {
        "data": data.to_tensor(default_value="",
                               shape=(batch_size, max_length))
    }
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {
        "data": data.to_tensor(default_value="",
                               shape=(batch_size, max_length))
    }
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
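
# ---------------------------------------------------------------------------
# Minimal driver sketch (not part of the original benchmarks). Because this
# collection defines embedding_varlen several times, only the most recent
# definition is callable here; the batch sizes and max_length below are
# illustrative assumptions. It reports the Keras preprocessing-layer time
# against the feature-column time returned by the benchmark.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    for batch_size in (32, 256, 2048):
        k_time, fc_time = embedding_varlen(batch_size=batch_size, max_length=50)
        print("batch_size=%d  keras=%.5fs  fc=%.5fs  keras/fc=%.2f" %
              (batch_size, k_time, fc_time, k_time / fc_time))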