# Assumed imports, reconstructed from the symbols used below; the exact module
# paths follow the TF 2.x internal benchmark layout and may need adjusting to
# your checkout.
import numpy as np

from tensorflow.python import keras
from tensorflow.python.eager.def_function import function as tf_function
from tensorflow.python.feature_column import feature_column_v2 as fcv2
from tensorflow.python.feature_column import sequence_feature_column as sfc
from tensorflow.python.framework import dtypes as dt
from tensorflow.python.keras.layers.preprocessing import category_crossing
from tensorflow.python.keras.layers.preprocessing import category_encoding
from tensorflow.python.keras.layers.preprocessing import discretization
from tensorflow.python.keras.layers.preprocessing import hashing
from tensorflow.python.keras.layers.preprocessing import string_lookup
from tensorflow.python.keras.layers.preprocessing.benchmarks import (
    feature_column_benchmark as fc_bm)
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops

NUM_REPEATS = 10  # Assumed value; the constant is referenced but not defined in this section.


def embedding_varlen(self, batch_size, max_length):
  """Benchmark a variable-length string vocabulary lookup."""
  # Data and constants.
  vocab = fc_bm.create_vocabulary(32768)
  path = self._write_to_temp_file("tmp", vocab)
  data = fc_bm.create_string_data(
      max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15)

  # Keras implementation
  model = keras.Sequential()
  model.add(
      keras.Input(
          shape=(max_length,), name="data", ragged=True, dtype=dt.string))
  model.add(string_lookup.StringLookup(vocabulary=path, mask_token=None))

  # FC implementation
  fc = sfc.sequence_categorical_column_with_vocabulary_list(
      key="data", vocabulary_list=vocab, num_oov_buckets=1)

  # Wrap the FC implementation in a tf.function for a fair comparison
  @tf_function()
  def fc_fn(tensors):
    fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

  # Benchmark runs
  keras_data = {"data": data}
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

  fc_data = {"data": data.to_sparse()}
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time
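

# A minimal driver sketch, not part of the original section: embedding_varlen
# above takes `self` and calls self._write_to_temp_file, so it reads as a
# method of a benchmark fixture. The fc_bm.LayerBenchmark base class and its
# report() helper are assumptions about fc_bm's API, not given in this section.
class BenchmarkLayer(fc_bm.LayerBenchmark):
  """Sweeps batch size and sequence length over the lookup benchmark above."""

  # Reuse the module-level function as a method of the fixture.
  embedding_varlen = embedding_varlen

  def benchmark_layer(self):
    for batch_size in [32, 256, 8192]:
      for max_length in [10, 100, 1000]:
        k_time, f_time = self.embedding_varlen(
            batch_size=batch_size, max_length=max_length)
        self.report("vocab_list_varlen", k_time, f_time, NUM_REPEATS)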


def embedding_varlen(batch_size, max_length):
  """Benchmark a variable-length embedding."""
  # Data and constants.
  embedding_size = 32768
  data = fc_bm.create_data(
      max_length, batch_size * NUM_REPEATS, embedding_size - 1, dtype=int)

  # Keras implementation
  model = keras.Sequential()
  model.add(
      keras.Input(shape=(None,), ragged=True, name="data", dtype=dt.int64))
  model.add(keras.layers.Embedding(embedding_size, 256))
  model.add(keras.layers.Lambda(lambda x: math_ops.reduce_mean(x, axis=-1)))

  # FC implementation
  fc = fcv2.embedding_column(
      fcv2.categorical_column_with_identity(
          "data", num_buckets=embedding_size - 1),
      dimension=256)

  # Wrap the FC implementation in a tf.function for a fair comparison
  @tf_function()
  def fc_fn(tensors):
    fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

  # Benchmark runs
  keras_data = {"data": data}
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

  fc_data = {"data": data.to_sparse()}
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time


def embedding_varlen(batch_size, max_length):
  """Benchmark variable-length bucketization."""
  # Data and constants.
  max_value = 25.0
  bins = np.arange(1.0, max_value)
  data = fc_bm.create_data(
      max_length, batch_size * NUM_REPEATS, 100000, dtype=float)

  # Keras implementation
  model = keras.Sequential()
  model.add(keras.Input(shape=(max_length,), name="data", dtype=dt.float32))
  model.add(discretization.Discretization(bins))

  # FC implementation
  fc = fcv2.bucketized_column(
      fcv2.numeric_column("data"), boundaries=list(bins))

  # Wrap the FC implementation in a tf.function for a fair comparison
  @tf_function()
  def fc_fn(tensors):
    fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

  # Benchmark runs
  keras_data = {"data": data.to_tensor(default_value=0.0)}
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

  fc_data = {"data": data.to_tensor(default_value=0.0)}
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time
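

# Illustration only, not part of the benchmark above: both paths bucket each
# value against the same boundaries, so with bins = [1.0, 2.0, ..., 24.0] a
# value below 1.0 lands in bucket 0 and a value at or above 24.0 lands in
# bucket 24. A minimal eager check (assumes TF 2.x eager execution):
def _bucketize_example():
  layer = discretization.Discretization(np.arange(1.0, 25.0))
  return layer([[0.5, 1.5, 30.0]])  # Expected bucket indices: [[0, 1, 24]]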


def embedding_varlen(batch_size, max_length):
  """Benchmark a variable-length category cross with hashing."""
  # Data and constants.
  num_buckets = 10000
  vocab = fc_bm.create_vocabulary(32768)
  data_a = fc_bm.create_string_data(
      max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0)
  data_b = fc_bm.create_string_data(
      max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0)

  # Keras implementation
  input_1 = keras.Input(shape=(None,), name="data_a", dtype=dt.string)
  input_2 = keras.Input(shape=(None,), name="data_b", dtype=dt.string)
  crossed_data = category_crossing.CategoryCrossing()([input_1, input_2])
  hashed_data = hashing.Hashing(num_buckets)(crossed_data)
  model = keras.Model([input_1, input_2], hashed_data)

  # FC implementation
  fc = fcv2.crossed_column(["data_a", "data_b"], num_buckets)

  # Wrap the FC implementation in a tf.function for a fair comparison
  @tf_function()
  def fc_fn(tensors):
    fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

  # Benchmark runs
  keras_data = {
      "data_a": data_a.to_tensor(
          default_value="", shape=(batch_size, max_length)),
      "data_b": data_b.to_tensor(
          default_value="", shape=(batch_size, max_length)),
  }
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

  fc_data = {
      "data_a": data_a.to_tensor(
          default_value="", shape=(batch_size, max_length)),
      "data_b": data_b.to_tensor(
          default_value="", shape=(batch_size, max_length)),
  }
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time


def embedding_varlen(batch_size, max_length):
  """Benchmark a variable-length weighted embedding."""
  # Data and constants.
  embedding_size = 32768
  data = fc_bm.create_data(
      max_length, batch_size * NUM_REPEATS, embedding_size - 1, dtype=int)
  weight = array_ops.ones_like_v2(data, dtype=dt.float32)

  # Keras implementation
  data_input = keras.Input(
      shape=(None,), ragged=True, name="data", dtype=dt.int64)
  weight_input = keras.Input(
      shape=(None,), ragged=True, name="weight", dtype=dt.float32)
  embedded_data = keras.layers.Embedding(embedding_size, 256)(data_input)
  weighted_embedding = math_ops.multiply(
      embedded_data, array_ops.expand_dims(weight_input, -1))
  reduced_embedding = math_ops.reduce_sum(weighted_embedding, axis=1)
  model = keras.Model([data_input, weight_input], reduced_embedding)

  # FC implementation
  fc = fcv2.embedding_column(
      fcv2.weighted_categorical_column(
          fcv2.categorical_column_with_identity(
              "data", num_buckets=embedding_size - 1),
          weight_feature_key="weight"),
      dimension=256)

  # Wrap the FC implementation in a tf.function for a fair comparison
  @tf_function()
  def fc_fn(tensors):
    fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

  # Benchmark runs
  keras_data = {"data": data, "weight": weight}
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

  fc_data = {"data": data.to_sparse(), "weight": weight.to_sparse()}
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time


def embedding_varlen(batch_size, max_length):
  """Benchmark a variable-length vocabulary lookup with count encoding."""
  # Data and constants.
  vocab_size = 32768
  vocab = fc_bm.create_vocabulary(vocab_size)
  data = fc_bm.create_string_data(
      max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.15)

  # Keras implementation
  model = keras.Sequential()
  model.add(keras.Input(shape=(max_length,), name="data", dtype=dt.string))
  model.add(string_lookup.StringLookup(vocabulary=vocab, mask_token=None))
  model.add(
      category_encoding.CategoryEncoding(
          num_tokens=vocab_size + 1, output_mode="count"))

  # FC implementation
  fc = fcv2.indicator_column(
      fcv2.categorical_column_with_vocabulary_list(
          key="data", vocabulary_list=vocab, num_oov_buckets=1))

  # Wrap the FC implementation in a tf.function for a fair comparison
  @tf_function()
  def fc_fn(tensors):
    fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

  # Benchmark runs
  keras_data = {
      "data": data.to_tensor(default_value="", shape=(batch_size, max_length))
  }
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

  fc_data = {
      "data": data.to_tensor(default_value="", shape=(batch_size, max_length))
  }
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time


def embedding_varlen(batch_size, max_length):
  """Benchmark variable-length string hashing."""
  # Data and constants.
  num_buckets = 10000
  vocab = fc_bm.create_vocabulary(32768)
  data = fc_bm.create_string_data(
      max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0)

  # Keras implementation
  model = keras.Sequential()
  model.add(keras.Input(shape=(max_length,), name="data", dtype=dt.string))
  model.add(hashing.Hashing(num_buckets))

  # FC implementation
  fc = sfc.sequence_categorical_column_with_hash_bucket("data", num_buckets)

  # Wrap the FC implementation in a tf.function for a fair comparison
  @tf_function()
  def fc_fn(tensors):
    fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

  # Benchmark runs
  keras_data = {
      "data": data.to_tensor(default_value="", shape=(batch_size, max_length))
  }
  k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

  fc_data = {
      "data": data.to_tensor(default_value="", shape=(batch_size, max_length))
  }
  fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

  return k_avg_time, fc_avg_time
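

# A minimal harness sketch: in the TF tree each of these benchmarks runs under
# the platform test runner. The import below is an assumption about that
# environment, not something given in this section.
if __name__ == "__main__":
  from tensorflow.python.platform import test as tf_test  # Assumed runner.
  tf_test.main()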