def get_feature_columns_and_inputs():
    """Build matching feature columns and Keras input layers.

    Returns:
        A ``(feature_columns, feature_input_layers)`` pair: the columns to
        feed a DenseFeatures layer, and a dict mapping feature name to the
        corresponding ``tf.keras.Input``.
    """
    columns = []
    inputs = {}

    # Plain numeric features pass through unchanged.
    for name in ["trestbps", "chol", "thalach", "oldpeak", "slope", "ca"]:
        columns.append(tf.feature_column.numeric_column(name))
        inputs[name] = tf.keras.Input(shape=(1,), name=name)

    # Age is bucketized into coarse ranges rather than used raw.
    age_numeric = tf.feature_column.numeric_column("age")
    age_boundaries = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
    columns.append(
        tf.feature_column.bucketized_column(age_numeric, boundaries=age_boundaries)
    )
    inputs["age"] = tf.keras.Input(shape=(1,), name="age")

    # "thal" is a string feature: hash it, then embed the hash buckets.
    # NOTE: this uses the project-local (ElasticDL) embedding_column, not TF's.
    hashed_thal = tf.feature_column.categorical_column_with_hash_bucket(
        "thal", hash_bucket_size=100
    )
    columns.append(feature_column.embedding_column(hashed_thal, dimension=8))
    inputs["thal"] = tf.keras.Input(shape=(1,), name="thal", dtype=tf.string)

    return columns, inputs
def test_call_embedding_column_with_weights(self):
    """A weighted categorical column with 'sum' combiner accumulates
    per-id weights into the one-hot embedding slots."""
    dimension = 8
    weighted_column = tf.feature_column.weighted_categorical_column(
        tf.feature_column.categorical_column_with_identity(
            "item_id", num_buckets=128
        ),
        weight_feature_key="frequency",
    )
    embedding = feature_column.embedding_column(
        weighted_column,
        dimension=dimension,
        initializer=tf.initializers.identity,
        combiner="sum",
    )
    # Mock the lookup so each id maps to a deterministic one-hot vector.
    embedding.lookup_embedding = lambda unique_ids: (
        generate_vectors_with_one_hot_value(unique_ids, embedding.dimension)
    )

    output = call_feature_columns(
        [embedding],
        {
            "item_id": [[2, 6, 5], [3, 1, 1]],
            "frequency": [[0.33, 5.0, 1.024], [2.048, 0.5, 1.0]],
        },
    )

    # Row 0: ids 2/6/5 carry weights 0.33/5.0/1.024 into their slots.
    # Row 1: id 1 appears twice (0.5 + 1.0 = 1.5), id 3 once (2.048).
    expected = np.array(
        [
            [0.0, 0.0, 0.33, 0.0, 0.0, 1.024, 5.0, 0.0],
            [0.0, 1.5, 0.0, 2.048, 0.0, 0.0, 0.0, 0.0],
        ],
        dtype=np.float32,
    )
    self.assertTrue(np.array_equal(output.numpy(), expected))
def test_call_embedding_column(self):
    """Calling an embedding column returns the (mocked) vectors for its ids."""
    dimension = 32
    embedding = feature_column.embedding_column(
        tf.feature_column.categorical_column_with_identity(
            "item_id", num_buckets=128
        ),
        dimension=dimension,
    )
    # Stub out the real lookup with deterministic mock vectors.
    embedding.lookup_embedding = lambda unique_ids: (
        generate_mock_embedding_vectors(unique_ids, embedding.dimension)
    )

    output = call_feature_columns([embedding], {"item_id": [1, 2, 3]})

    expected = generate_mock_embedding_vectors([1, 2, 3], dimension)
    self.assertTrue(np.array_equal(output.numpy(), expected))
def test_call_embedding_column(self):
    """The embedding column routes lookups through the registered
    lookup function and returns its vectors."""
    dimension = 32
    embedding = embedding_column(
        tf.feature_column.categorical_column_with_identity(
            "item_id", num_buckets=128
        ),
        dimension=dimension,
    )

    # Replace the parameter-server lookup with a deterministic one-hot mock.
    embedding.set_lookup_embedding_func(
        lambda name, ids: generate_vectors_with_one_hot_value(ids, dimension)
    )

    output = call_feature_columns([embedding], {"item_id": [1, 2, 3]})

    expected = generate_vectors_with_one_hot_value([1, 2, 3], dimension)
    self.assertTrue(np.array_equal(output.numpy(), expected))
def test_embedding_column_gradients(self):
    """Check gradients w.r.t. the looked-up batch embedding for each combiner.

    With mock embeddings, the gradient for each unique id equals how many
    times it appears in the batch, normalized per the combiner:
    sum -> count, mean -> count / row_size, sqrtn -> count / sqrt(row_size).
    """
    dimension = 8
    # 4 rows of 2 ids each; id 10 appears 3 times, id 2 twice, rest once.
    inputs = {"item_id": [[1, 2], [10, 10], [6, 3], [10, 2]]}
    # unique_ids: 1, 2, 10, 6, 3
    expected_sum_grads = [1, 2, 3, 1, 1]
    # Every row has 2 ids, so mean divides by 2 and sqrtn by sqrt(2).
    expected_mean_grads = [1 / 2.0, 2 / 2.0, 3 / 2.0, 1 / 2.0, 1 / 2.0]
    expected_sqrtn_grads = [
        1 / np.sqrt(2.0),
        2 / np.sqrt(2.0),
        3 / np.sqrt(2.0),
        1 / np.sqrt(2.0),
        1 / np.sqrt(2.0),
    ]
    combiner_to_expected_grads = {
        "sum": expected_sum_grads,
        "mean": expected_mean_grads,
        "sqrtn": expected_sqrtn_grads,
    }
    for combiner, expected_grads in combiner_to_expected_grads.items():
        item_id_embedding = embedding_column(
            tf.feature_column.categorical_column_with_identity(
                "item_id", num_buckets=128
            ),
            dimension=dimension,
            combiner=combiner,
        )

        # Deterministic lookup: vectors filled with the id value.
        def _mock_gather_embedding(name, ids):
            return generate_vectors_fill_with_id_value(ids, dimension)

        item_id_embedding.set_lookup_embedding_func(_mock_gather_embedding)
        dense_features = tf.keras.layers.DenseFeatures([item_id_embedding])
        # Exercise both eager and traced (tf.function) execution paths.
        call_fns = [dense_features.call, tf.function(dense_features.call)]
        for call_fn in call_fns:
            with tf.GradientTape() as tape:
                # The column records onto the tape so the looked-up
                # batch_embedding participates in differentiation.
                item_id_embedding.set_tape(tape)
                output = call_fn(inputs)
            batch_embedding = item_id_embedding.embedding_and_ids[
                0
            ].batch_embedding
            grads = tape.gradient(output, batch_embedding)
            # Clear recorded state so the next call_fn starts fresh.
            item_id_embedding.reset()
            grads = grads.numpy()
            # One gradient row per unique id (5 of them).
            for i in range(5):
                self.assertTrue(
                    np.isclose(
                        grads[i],
                        np.full(dimension, expected_grads[i]),
                    ).all()
                )
def get_feature_columns():
    """Return numeric columns plus hashed-and-embedded categorical columns.

    Numeric keys come from NUMERIC_FEATURE_KEYS; each categorical key in
    CATEGORICAL_FEATURE_KEYS is hashed into 64 buckets and embedded into a
    16-dimensional vector via the project-local embedding_column.
    """
    numeric_columns = [
        tf.feature_column.numeric_column(key) for key in NUMERIC_FEATURE_KEYS
    ]
    categorical_columns = [
        feature_column.embedding_column(
            tf.feature_column.categorical_column_with_hash_bucket(
                key, hash_bucket_size=64
            ),
            dimension=16,
        )
        for key in CATEGORICAL_FEATURE_KEYS
    ]
    return numeric_columns + categorical_columns
def _replace_tf_embedding_column_with_edl(dense_features_layer):
    """Rebuild a DenseFeatures layer, swapping large TF embedding columns
    for their ElasticDL equivalents.

    Columns that are TF ``EmbeddingColumn`` instances AND satisfy
    ``_need_partition_embedding`` are replaced; all others are kept as-is.
    The returned layer reuses the original layer's name.
    """
    rebuilt_columns = []
    # NOTE: reads the layer's private _feature_columns attribute.
    for column in dense_features_layer._feature_columns:
        needs_replacement = isinstance(
            column, fc_lib.EmbeddingColumn
        ) and _need_partition_embedding(column)
        if not needs_replacement:
            rebuilt_columns.append(column)
            continue
        logger.info(
            "Replace embedding_column {} from TensorFlow "
            "version to ElasticDL version".format(column.name)
        )
        edl_column = embedding_column(
            column.categorical_column, dimension=column.dimension
        )
        # Tag the new column with its owning layer so lookups can be routed.
        edl_column.set_dense_features_layer_name(dense_features_layer.name)
        rebuilt_columns.append(edl_column)
    return tf.keras.layers.DenseFeatures(
        feature_columns=rebuilt_columns, name=dense_features_layer.name
    )
def test_embedding_column_gradients_with_weights(self):
    """Check sparse gradients for a weighted categorical embedding column.

    With per-id weights, the gradient for each (row, id) occurrence equals
    the occurrence's weight normalized per the combiner:
    sum -> w, mean -> w / sum(row weights),
    sqrtn -> w / sqrt(sum(row weights squared)).
    Gradients come back as an IndexedSlices whose indices map each
    occurrence to its unique-id position.
    """
    dimension = 8
    inputs = {
        "item_id": [[1, 2], [10, 10], [6, 3], [10, 2]],
        "frequency": [[0.3, 0.6], [0.1, 0.2], [0.8, 0.1], [0.9, 0.6]],
    }
    # Flatten ids and compute the unique-id index of each occurrence; the
    # sparse gradient indices are expected to match this exactly.
    item_ids = tf.reshape(inputs["item_id"], shape=[-1])
    _, item_id_idx = tf.unique(item_ids)
    # unique_ids: 1, 2, 10, 6, 3
    # One gradient row (constant vector) per occurrence, in flattened order.
    expected_sum_grads = np.array([
        np.full(dimension, value)
        for value in [0.3, 0.6, 0.1, 0.2, 0.8, 0.1, 0.9, 0.6]
    ])
    # mean: each weight divided by its row's weight sum
    # (rows: 0.3+0.6=0.9, 0.1+0.2=0.3, 0.8+0.1=0.9, 0.9+0.6=1.5).
    expected_mean_grads = np.array([
        np.full(dimension, value)
        for value in [
            0.3 / 0.9,
            0.6 / 0.9,
            0.1 / 0.3,
            0.2 / 0.3,
            0.8 / 0.9,
            0.1 / 0.9,
            0.9 / 1.5,
            0.6 / 1.5,
        ]
    ])
    # sqrtn: each weight divided by the L2 norm of its row's weights.
    expected_sqrtn_grads = np.array([
        np.full(dimension, value)
        for value in [
            0.3 / np.sqrt(np.power(0.3, 2) + np.power(0.6, 2)),
            0.6 / np.sqrt(np.power(0.3, 2) + np.power(0.6, 2)),
            0.1 / np.sqrt(np.power(0.1, 2) + np.power(0.2, 2)),
            0.2 / np.sqrt(np.power(0.1, 2) + np.power(0.2, 2)),
            0.8 / np.sqrt(np.power(0.8, 2) + np.power(0.1, 2)),
            0.1 / np.sqrt(np.power(0.8, 2) + np.power(0.1, 2)),
            0.9 / np.sqrt(np.power(0.9, 2) + np.power(0.6, 2)),
            0.6 / np.sqrt(np.power(0.9, 2) + np.power(0.6, 2)),
        ]
    ])
    combiner_to_expected_grads = {
        "sum": expected_sum_grads,
        "mean": expected_mean_grads,
        "sqrtn": expected_sqrtn_grads,
    }
    for combiner, expected_grads in combiner_to_expected_grads.items():
        item_id_embedding = feature_column.embedding_column(
            tf.feature_column.weighted_categorical_column(
                tf.feature_column.categorical_column_with_identity(
                    "item_id", num_buckets=128
                ),
                weight_feature_key="frequency",
            ),
            dimension=dimension,
            initializer=tf.initializers.identity,
            combiner=combiner,
        )

        # Deterministic lookup: vectors filled with the id value.
        def _mock_gather_embedding(name, ids):
            return generate_vectors_fill_with_id_value(ids, dimension)

        item_id_embedding.set_lookup_embedding_func(_mock_gather_embedding)
        dense_features = tf.keras.layers.DenseFeatures([item_id_embedding])
        # Exercise both eager and traced (tf.function) execution paths.
        call_fns = [dense_features.call, tf.function(dense_features.call)]
        for call_fn in call_fns:
            with tf.GradientTape() as tape:
                # The column records onto the tape so the looked-up
                # batch_embedding participates in differentiation.
                item_id_embedding.set_tape(tape)
                output = call_fn(inputs)
            batch_embedding = item_id_embedding.embedding_and_ids[
                0
            ].batch_embedding
            grads = tape.gradient(output, batch_embedding)
            # Clear recorded state so the next call_fn starts fresh.
            item_id_embedding.reset()
            # Sparse gradient: indices map occurrences to unique ids;
            # values hold the per-occurrence gradient vectors.
            grad_indices = grads.indices
            grad_values = grads.values
            self.assertTrue(
                np.array_equal(grad_indices.numpy(), item_id_idx.numpy())
            )
            self.assertTrue(
                np.isclose(grad_values.numpy(), expected_grads).all()
            )