def transform(inputs):
    """Apply the generated feature transforms to `inputs`.

    Walks FEATURE_TRANSFORM_INFO_EXECUTE_ARRAY in execution order,
    dispatches on each entry's op_type, and stores every intermediate
    result in `transformed` under the entry's output name.
    """
    transformed = inputs.copy()
    for feature_transform_info in FEATURE_TRANSFORM_INFO_EXECUTE_ARRAY:
        if feature_transform_info.op_type == TransformOpType.HASH:
            # Convert to sparse, then hash values into fixed-size buckets.
            transformed[feature_transform_info.input] = ToSparse()(
                transformed[feature_transform_info.input]
            )
            transformed[feature_transform_info.output] = Hashing(
                feature_transform_info.hash_bucket_size
            )(transformed[feature_transform_info.input])
        elif feature_transform_info.op_type == TransformOpType.BUCKETIZE:
            # Convert to sparse, then bucketize numeric values by boundaries.
            transformed[feature_transform_info.input] = ToSparse()(
                transformed[feature_transform_info.input]
            )
            transformed[feature_transform_info.output] = Discretization(
                feature_transform_info.boundaries
            )(transformed[feature_transform_info.input])
        elif feature_transform_info.op_type == TransformOpType.LOOKUP:
            # Convert to sparse, then map values to vocabulary indices.
            transformed[feature_transform_info.input] = ToSparse()(
                transformed[feature_transform_info.input]
            )
            transformed[feature_transform_info.output] = IndexLookup(
                feature_transform_info.vocabulary_list
            )(transformed[feature_transform_info.input])
        elif feature_transform_info.op_type == TransformOpType.CONCAT:
            # Concatenate several id tensors, offsetting each input's ids
            # so they occupy disjoint id ranges.
            inputs_to_concat = [
                transformed[name] for name in feature_transform_info.input
            ]
            transformed[feature_transform_info.output] = ConcatenateWithOffset(
                feature_transform_info.id_offsets
            )(inputs_to_concat)
        elif feature_transform_info.op_type == TransformOpType.EMBEDDING:
            transformed[feature_transform_info.output] = SparseEmbedding(
                input_dim=feature_transform_info.input_dim,
                output_dim=feature_transform_info.output_dim,
            )(transformed[feature_transform_info.input])
        elif feature_transform_info.op_type == TransformOpType.ARRAY:
            # Group several transformed tensors into a list output.
            transformed[feature_transform_info.output] = [
                transformed[name] for name in feature_transform_info.input
            ]
    return tuple(transformed[name] for name in TRANSFORM_OUTPUTS)
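# The loop above dispatches on the op_type field of each entry in
# FEATURE_TRANSFORM_INFO_EXECUTE_ARRAY. A minimal sketch of the metadata it
# assumes is below; the real entries come from the code generator, and this
# flat, single-namedtuple field layout is a hypothetical simplification.
import collections

FeatureTransformInfo = collections.namedtuple(
    "FeatureTransformInfo",
    [
        "op_type",           # a TransformOpType member
        "input",             # input name, or a list of names for CONCAT/ARRAY
        "output",            # key under which the result is stored
        "hash_bucket_size",  # used by HASH only
        "boundaries",        # used by BUCKETIZE only
        "vocabulary_list",   # used by LOOKUP only
        "id_offsets",        # used by CONCAT only
        "input_dim",         # used by EMBEDDING only
        "output_dim",        # used by EMBEDDING only
    ],
)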
def test_model_with_to_sparse(self):
    """ToSparse inside a Keras model drops entries equal to ignore_value."""
    inputs = tf.keras.Input(shape=(1,), dtype=tf.int32)
    sparse_inputs = ToSparse(ignore_value=-1)(inputs)
    model = tf.keras.Model(inputs=inputs, outputs=sparse_inputs)
    out = model.call(tf.constant([[1], [-1], [2], [3]]))
    # The -1 at row 1 is dropped, so only rows 0, 2, and 3 remain.
    expect_out = tf.SparseTensor(
        indices=tf.constant([[0, 0], [2, 0], [3, 0]], dtype=tf.int64),
        values=tf.constant([1, 2, 3], dtype=tf.int32),
        dense_shape=(4, 1),
    )
    self.assertTrue(sparse_tensor_equal(out, expect_out))
def test_to_sparse(self):
    # String input: empty strings are the default ignore value and are
    # dropped from the resulting SparseTensor.
    layer = ToSparse()
    inp = tf.constant([["A", ""], ["B", "C"]], tf.string)
    output = layer.call(inp)
    expected_out = tf.SparseTensor(
        indices=np.array([[0, 0], [1, 0], [1, 1]]),
        values=np.array(["A", "B", "C"]),
        dense_shape=(2, 2),
    )
    self.assertTrue(sparse_tensor_equal(output, expected_out))

    # Integer input: -1 is the default ignore value for numeric tensors.
    layer = ToSparse()
    inp = tf.constant([[12, -1], [45, 78]], tf.int64)
    output = layer.call(inp)
    expected_out = tf.SparseTensor(
        indices=np.array([[0, 0], [1, 0], [1, 1]]),
        values=np.array([12, 45, 78]),
        dense_shape=(2, 2),
    )
    self.assertTrue(sparse_tensor_equal(output, expected_out))
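# Both tests above rely on a sparse_tensor_equal helper. A minimal sketch,
# assuming it compares dense_shape, indices, dtype, and values component-wise
# in eager mode, is:
import numpy as np
import tensorflow as tf


def sparse_tensor_equal(sp_a, sp_b):
    """Return True iff two tf.SparseTensor objects hold identical data."""
    if not np.array_equal(sp_a.dense_shape.numpy(), sp_b.dense_shape.numpy()):
        return False
    if not np.array_equal(sp_a.indices.numpy(), sp_b.indices.numpy()):
        return False
    if sp_a.values.dtype != sp_b.values.dtype:
        return False
    return np.array_equal(sp_a.values.numpy(), sp_b.values.numpy())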
def transform_from_code_gen(source_inputs):
    """Code-generated equivalent of `transform` for the census features."""
    inputs = source_inputs.copy()

    # Hash string features into fixed-size buckets.
    education_hash_out = Hashing(education_hash.hash_bucket_size)(
        ToSparse()(inputs["education"])
    )
    occupation_hash_out = Hashing(occupation_hash.hash_bucket_size)(
        ToSparse()(inputs["occupation"])
    )
    native_country_hash_out = Hashing(native_country_hash.hash_bucket_size)(
        ToSparse()(inputs["native_country"])
    )

    # Map categorical features to indices in their vocabularies.
    workclass_lookup_out = IndexLookup(workclass_lookup.vocabulary_list)(
        ToSparse()(inputs["workclass"])
    )
    marital_status_lookup_out = IndexLookup(
        marital_status_lookup.vocabulary_list
    )(ToSparse()(inputs["marital_status"]))
    relationship_lookup_out = IndexLookup(relationship_lookup.vocabulary_list)(
        ToSparse()(inputs["relationship"])
    )
    race_lookup_out = IndexLookup(race_lookup.vocabulary_list)(
        ToSparse()(inputs["race"])
    )
    sex_lookup_out = IndexLookup(sex_lookup.vocabulary_list)(
        ToSparse()(inputs["sex"])
    )

    # Bucketize numeric features by their boundary lists.
    age_bucketize_out = Discretization(age_bucketize.boundaries)(
        ToSparse()(inputs["age"])
    )
    capital_gain_bucketize_out = Discretization(
        capital_gain_bucketize.boundaries
    )(ToSparse()(inputs["capital_gain"]))
    capital_loss_bucketize_out = Discretization(
        capital_loss_bucketize.boundaries
    )(ToSparse()(inputs["capital_loss"]))
    hours_per_week_bucketize_out = Discretization(
        hours_per_week_bucketize.boundaries
    )(ToSparse()(inputs["hours_per_week"]))

    # Concatenate the id tensors of each group, offsetting ids so that
    # every input occupies a disjoint id range.
    group1_out = ConcatenateWithOffset(group1.id_offsets)(
        [
            workclass_lookup_out,
            hours_per_week_bucketize_out,
            capital_gain_bucketize_out,
            capital_loss_bucketize_out,
        ]
    )
    group2_out = ConcatenateWithOffset(group2.id_offsets)(
        [
            education_hash_out,
            marital_status_lookup_out,
            relationship_lookup_out,
            occupation_hash_out,
        ]
    )
    group3_out = ConcatenateWithOffset(group3.id_offsets)(
        [
            age_bucketize_out,
            sex_lookup_out,
            race_lookup_out,
            native_country_hash_out,
        ]
    )

    # Embed each group for the wide and deep parts of the model.
    group1_embedding_wide_out = SparseEmbedding(
        input_dim=group1_embedding_wide.input_dim,
        output_dim=group1_embedding_wide.output_dim,
    )(group1_out)
    group2_embedding_wide_out = SparseEmbedding(
        input_dim=group2_embedding_wide.input_dim,
        output_dim=group2_embedding_wide.output_dim,
    )(group2_out)
    group1_embedding_deep_out = SparseEmbedding(
        input_dim=group1_embedding_deep.input_dim,
        output_dim=group1_embedding_deep.output_dim,
    )(group1_out)
    group2_embedding_deep_out = SparseEmbedding(
        input_dim=group2_embedding_deep.input_dim,
        output_dim=group2_embedding_deep.output_dim,
    )(group2_out)
    group3_embedding_deep_out = SparseEmbedding(
        input_dim=group3_embedding_deep.input_dim,
        output_dim=group3_embedding_deep.output_dim,
    )(group3_out)

    wide_embeddings_out = [
        group1_embedding_wide_out,
        group2_embedding_wide_out,
    ]
    deep_embeddings_out = [
        group1_embedding_deep_out,
        group2_embedding_deep_out,
        group3_embedding_deep_out,
    ]
    return wide_embeddings_out, deep_embeddings_out
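# A minimal sketch of how the two returned embedding groups might feed a
# wide-and-deep model. The layer sizes, the Add/Concatenate combination, and
# the assumption that source_inputs is a dict of tf.keras.Input tensors are
# all hypothetical; they are not part of the generated code above.
def build_wide_and_deep_model(source_inputs):
    wide_embeddings, deep_embeddings = transform_from_code_gen(source_inputs)

    # Wide part: sum the per-group wide embeddings (assumed to share a shape).
    wide = tf.keras.layers.Add()(wide_embeddings)

    # Deep part: concatenate the deep embeddings and run them through an MLP.
    deep = tf.keras.layers.Concatenate()(deep_embeddings)
    for units in [64, 32]:  # hypothetical hidden-layer sizes
        deep = tf.keras.layers.Dense(units, activation="relu")(deep)

    # Combine both parts into a single binary-classification output.
    logits = tf.keras.layers.Dense(1)(
        tf.keras.layers.Concatenate()([wide, deep])
    )
    output = tf.keras.layers.Activation("sigmoid")(logits)
    return tf.keras.Model(inputs=list(source_inputs.values()), outputs=output)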