def test_get_sparse_tensors(self, inputs_args, expected_args):
    """Verifies id tensors produced by a sequence hash-bucket column.

    Builds SparseTensorValues from the parameterized args, runs them through
    a `sequence_categorical_column_with_hash_bucket`, and checks the
    resulting id tensor's indices/shape against the expected value.
    """
    sparse_input = sparse_tensor.SparseTensorValue(**inputs_args)
    expected_output = sparse_tensor.SparseTensorValue(**expected_args)
    hash_column = sfc.sequence_categorical_column_with_hash_bucket(
        'aaa', hash_bucket_size=10)
    pair = _get_sparse_tensors(hash_column, {'aaa': sparse_input})
    # Hash-bucket columns produce ids only — there is never a weight tensor.
    self.assertIsNone(pair.weight_tensor)
    _assert_sparse_tensor_indices_shape(
        self, expected_output, self.evaluate(pair.id_tensor))
def _build_feature_columns(self):
    """Returns (context_columns, sequence_columns) used by the test model."""
    # Context features: one embedded identity column, one numeric column.
    identity_ctx = fc.categorical_column_with_identity(
        'int_ctx', num_buckets=100)
    context_columns = [
        fc.embedding_column(identity_ctx, dimension=10),
        fc.numeric_column('float_ctx'),
    ]
    # Sequence features: embedded identity and hash-bucket columns.
    sequence_columns = [
        fc.embedding_column(
            sfc.sequence_categorical_column_with_identity(
                'int_list', num_buckets=10),
            dimension=10),
        fc.embedding_column(
            sfc.sequence_categorical_column_with_hash_bucket(
                'bytes_list', hash_bucket_size=100),
            dimension=20),
    ]
    return context_columns, sequence_columns
def _build_feature_columns(self):
    """Builds the context and sequence feature columns for this test.

    Returns:
      A (context_columns, sequence_columns) tuple of feature-column lists.
    """
    ctx_categorical = fc.categorical_column_with_identity(
        'int_ctx', num_buckets=100)
    int_seq = sfc.sequence_categorical_column_with_identity(
        'int_list', num_buckets=10)
    bytes_seq = sfc.sequence_categorical_column_with_hash_bucket(
        'bytes_list', hash_bucket_size=100)

    context = [
        fc.embedding_column(ctx_categorical, dimension=10),
        fc.numeric_column('float_ctx'),
    ]
    sequence = [
        fc.embedding_column(int_seq, dimension=10),
        fc.embedding_column(bytes_seq, dimension=20),
    ]
    return context, sequence
def _get_sequence_categorical_column(params: dict) -> fc.SequenceCategoricalColumn:
    """Builds a sequence categorical column from a config dict.

    The column type is selected by the first matching key, checked in this
    order: 'vocabulary' (in-memory vocabulary list), 'bucket_size'
    (hash bucket), 'file' (vocabulary file), 'num_buckets' (identity).

    Args:
      params: Config dict containing 'key' (the feature name) plus exactly
        one of the selector keys above.

    Returns:
      A `SequenceCategoricalColumn` for the requested feature.

    Raises:
      ValueError: If no recognized selector key is present.
    """
    key = params['key']
    if 'vocabulary' in params:
        return sfc.sequence_categorical_column_with_vocabulary_list(
            key,
            vocabulary_list=_parse_vocabulary(params['vocabulary']),
            default_value=0)
    if 'bucket_size' in params:
        return sfc.sequence_categorical_column_with_hash_bucket(
            key, hash_bucket_size=params['bucket_size'])
    if 'file' in params:
        return sfc.sequence_categorical_column_with_vocabulary_file(
            key, vocabulary_file=params['file'], default_value=0)
    if 'num_buckets' in params:
        return sfc.sequence_categorical_column_with_identity(
            key, num_buckets=params['num_buckets'])
    # ValueError is a subclass of Exception, so existing `except Exception`
    # handlers still work while the message now says what was wrong.
    raise ValueError(
        "params must contain one of 'vocabulary', 'bucket_size', 'file' or "
        f"'num_buckets'; got keys {sorted(params)}")
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding."""
    # Data and constants.
    num_buckets = 10000
    vocab = fc_bm.create_vocabulary(32768)
    data = fc_bm.create_string_data(
        max_length, batch_size * NUM_REPEATS, vocab, pct_oov=0.0)

    # Keras implementation: a Hashing layer over string input.
    model = keras.Sequential()
    model.add(keras.Input(shape=(max_length,), name="data", dtype=dt.string))
    model.add(hashing.Hashing(num_buckets))

    # Feature-column implementation of the same transform.
    hash_column = sfc.sequence_categorical_column_with_hash_bucket(
        "data", num_buckets)

    @tf_function()
    def fc_fn(tensors):
        # Wrapped in tf.function so the comparison with Keras is fair.
        hash_column.transform_feature(
            fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs.
    keras_data = {
        "data":
            data.to_tensor(default_value="", shape=(batch_size, max_length)),
    }
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {
        "data":
            data.to_tensor(default_value="", shape=(batch_size, max_length)),
    }
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time