def testWeightedSparseFeatures(self): """LinearClassifier with LinearSDCA and weighted sparse features.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': sparse_tensor.SparseTensor(values=[2., 3., 1.], indices=[[0, 0], [1, 0], [2, 0]], dense_shape=[3, 5]), 'country': sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 0], [2, 0]], dense_shape=[3, 5]) }, constant_op.constant([[1], [0], [1]]) country = feature_column_lib.categorical_column_with_hash_bucket( 'country', hash_bucket_size=5) country_weighted_by_price = ( feature_column_lib.weighted_categorical_column(country, 'price')) optimizer = linear.LinearSDCA(example_id_column='example_id', symmetric_l2_regularization=0.01) classifier = linear.LinearClassifierV2( feature_columns=[country_weighted_by_price], optimizer=optimizer) classifier.train(input_fn=input_fn, steps=100) loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss'] self.assertLess(loss, 0.2)
def test_denylisted_column(self): # HashedCategoricalColumn is denylisted and so will raise an exception. categorical_column = fc_lib.categorical_column_with_hash_bucket( key='aaa', hash_bucket_size=3) embedding_dimension = 2 with self.assertRaises(TypeError): tpu_fc.embedding_column(categorical_column, dimension=embedding_dimension)
def test_string_input(self): x = {'age': np.random.random((1024, 1)), 'cabin': np.array(['a'] * 1024)} y = np.random.randint(2, size=(1024, 1)) ds1 = dataset_ops.Dataset.from_tensor_slices(x) ds2 = dataset_ops.Dataset.from_tensor_slices(y) dataset = dataset_ops.Dataset.zip((ds1, ds2)).batch(4) categorical_cols = [fc.categorical_column_with_hash_bucket('cabin', 10)] feature_cols = ([fc.numeric_column('age')] + [fc.indicator_column(cc) for cc in categorical_cols]) layers = [fc.DenseFeatures(feature_cols), keras.layers.Dense(128), keras.layers.Dense(1)] model = keras.models.Sequential(layers) model.compile(optimizer='sgd', loss=keras.losses.BinaryCrossentropy()) model.fit(dataset)
def sequence_categorical_column_with_hash_bucket( key, hash_bucket_size, dtype=dtypes.string): """A sequence of categorical terms where ids are set by hashing. Pass this to `embedding_column` or `indicator_column` to convert sequence categorical data into dense representation for input to sequence NN, such as RNN. Example: ```python tokens = sequence_categorical_column_with_hash_bucket( 'tokens', hash_bucket_size=1000) tokens_embedding = embedding_column(tokens, dimension=10) columns = [tokens_embedding] features = tf.parse_example(..., features=make_parse_example_spec(columns)) sequence_feature_layer = SequenceFeatures(columns) sequence_input, sequence_length = sequence_feature_layer(features) sequence_length_mask = tf.sequence_mask(sequence_length) rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size) rnn_layer = tf.keras.layers.RNN(rnn_cell) outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask) ``` Args: key: A unique string identifying the input feature. hash_bucket_size: An int > 1. The number of buckets. dtype: The type of features. Only string and integer types are supported. Returns: A `SequenceCategoricalColumn`. Raises: ValueError: `hash_bucket_size` is not greater than 1. ValueError: `dtype` is neither string nor integer. """ return fc.SequenceCategoricalColumn( fc.categorical_column_with_hash_bucket( key=key, hash_bucket_size=hash_bucket_size, dtype=dtype))
def sequence_categorical_column_with_hash_bucket(key, hash_bucket_size, dtype=dtypes.string): """A sequence of categorical terms where ids are set by hashing. Pass this to `embedding_column` or `indicator_column` to convert sequence categorical data into dense representation for input to sequence NN, such as RNN. Example: ```python tokens = sequence_categorical_column_with_hash_bucket( 'tokens', hash_bucket_size=1000) tokens_embedding = embedding_column(tokens, dimension=10) columns = [tokens_embedding] features = tf.parse_example(..., features=make_parse_example_spec(columns)) sequence_feature_layer = SequenceFeatures(columns) sequence_input, sequence_length = sequence_feature_layer(features) sequence_length_mask = tf.sequence_mask(sequence_length) rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size) rnn_layer = tf.keras.layers.RNN(rnn_cell) outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask) ``` Args: key: A unique string identifying the input feature. hash_bucket_size: An int > 1. The number of buckets. dtype: The type of features. Only string and integer types are supported. Returns: A `SequenceCategoricalColumn`. Raises: ValueError: `hash_bucket_size` is not greater than 1. ValueError: `dtype` is neither string nor integer. """ return fc.SequenceCategoricalColumn( fc.categorical_column_with_hash_bucket( key=key, hash_bucket_size=hash_bucket_size, dtype=dtype))
def testPartitionedVariables(self): """Tests LinearClassifier with LinearSDCA with partitioned variables.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': constant_op.constant([[0.6], [0.8], [0.3]]), 'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]), 'country': sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 3], [2, 1]], dense_shape=[3, 5]), 'weights': constant_op.constant([[3.0], [1.0], [1.0]]) }, constant_op.constant([[1], [0], [1]]) price = feature_column_lib.numeric_column('price') sq_footage_bucket = feature_column_lib.bucketized_column( feature_column_lib.numeric_column('sq_footage'), boundaries=[650.0, 800.0]) country = feature_column_lib.categorical_column_with_hash_bucket( 'country', hash_bucket_size=5) sq_footage_country = feature_column_lib.crossed_column( [sq_footage_bucket, 'country'], hash_bucket_size=10) optimizer = linear.LinearSDCA(example_id_column='example_id', symmetric_l2_regularization=0.01) classifier = linear.LinearClassifierV2( feature_columns=[ price, sq_footage_bucket, country, sq_footage_country ], weight_column='weights', partitioner=partitioned_variables.fixed_size_partitioner( num_shards=2, axis=0), optimizer=optimizer) classifier.train(input_fn=input_fn, steps=100) loss = classifier.evaluate(input_fn=input_fn, steps=1)['loss'] self.assertLess(loss, 0.2)
def testMixedFeaturesArbitraryWeights(self): """Tests LinearRegressor with LinearSDCA and a mix of features.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': constant_op.constant([0.6, 0.8, 0.3]), 'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]), 'country': sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 3], [2, 1]], dense_shape=[3, 5]), 'weights': constant_op.constant([[3.0], [5.0], [7.0]]) }, constant_op.constant([[1.55], [-1.25], [-3.0]]) price = feature_column_lib.numeric_column('price') sq_footage_bucket = feature_column_lib.bucketized_column( feature_column_lib.numeric_column('sq_footage'), boundaries=[650.0, 800.0]) country = feature_column_lib.categorical_column_with_hash_bucket( 'country', hash_bucket_size=5) sq_footage_country = feature_column_lib.crossed_column( [sq_footage_bucket, 'country'], hash_bucket_size=10) optimizer = linear.LinearSDCA(example_id_column='example_id', symmetric_l2_regularization=0.1) regressor = linear.LinearRegressorV2(feature_columns=[ price, sq_footage_bucket, country, sq_footage_country ], weight_column='weights', optimizer=optimizer) regressor.train(input_fn=input_fn, steps=20) loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss'] self.assertLess(loss, 0.05)
def test_one_shot_prediction_head_export(self, estimator_factory): def _new_temp_dir(): return os.path.join(test.get_temp_dir(), str(ops.uid())) model_dir = _new_temp_dir() categorical_column = feature_column.categorical_column_with_hash_bucket( key="categorical_exogenous_feature", hash_bucket_size=16) exogenous_feature_columns = [ feature_column.numeric_column( "2d_exogenous_feature", shape=(2,)), feature_column.embedding_column( categorical_column=categorical_column, dimension=10)] estimator = estimator_factory( model_dir=model_dir, exogenous_feature_columns=exogenous_feature_columns, head_type=ts_head_lib.OneShotPredictionHead) train_features = { feature_keys.TrainEvalFeatures.TIMES: numpy.arange( 20, dtype=numpy.int64), feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange( 20, dtype=numpy.float32)[:, None], [1, 5]), "2d_exogenous_feature": numpy.ones([20, 2]), "categorical_exogenous_feature": numpy.array( ["strkey"] * 20)[:, None] } train_input_fn = input_pipeline.RandomWindowInputFn( input_pipeline.NumpyReader(train_features), shuffle_seed=2, num_threads=1, batch_size=16, window_size=16) estimator.train(input_fn=train_input_fn, steps=5) result = estimator.evaluate(input_fn=train_input_fn, steps=1) self.assertIn("average_loss", result) self.assertNotIn(feature_keys.State.STATE_TUPLE, result) input_receiver_fn = estimator.build_raw_serving_input_receiver_fn() export_location = estimator.export_saved_model(_new_temp_dir(), input_receiver_fn) graph = ops.Graph() with graph.as_default(): with session_lib.Session() as session: signatures = loader.load( session, [tag_constants.SERVING], export_location) self.assertEqual([feature_keys.SavedModelLabels.PREDICT], list(signatures.signature_def.keys())) predict_signature = signatures.signature_def[ feature_keys.SavedModelLabels.PREDICT] six.assertCountEqual( self, [feature_keys.FilteringFeatures.TIMES, feature_keys.FilteringFeatures.VALUES, "2d_exogenous_feature", "categorical_exogenous_feature"], predict_signature.inputs.keys()) features = { feature_keys.TrainEvalFeatures.TIMES: numpy.tile( numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1]), feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange( 20, dtype=numpy.float32)[None, :, None], [2, 1, 5]), "2d_exogenous_feature": numpy.ones([2, 35, 2]), "categorical_exogenous_feature": numpy.tile(numpy.array( ["strkey"] * 35)[None, :, None], [2, 1, 1]) } feeds = { graph.as_graph_element(input_value.name): features[input_key] for input_key, input_value in predict_signature.inputs.items()} fetches = {output_key: graph.as_graph_element(output_value.name) for output_key, output_value in predict_signature.outputs.items()} output = session.run(fetches, feed_dict=feeds) self.assertEqual((2, 15, 5), output["mean"].shape) # Build a parsing input function, then make a tf.Example for it to parse. export_location = estimator.export_saved_model( _new_temp_dir(), estimator.build_one_shot_parsing_serving_input_receiver_fn( filtering_length=20, prediction_length=15)) graph = ops.Graph() with graph.as_default(): with session_lib.Session() as session: example = example_pb2.Example() times = example.features.feature[feature_keys.TrainEvalFeatures.TIMES] values = example.features.feature[feature_keys.TrainEvalFeatures.VALUES] times.int64_list.value.extend(range(35)) for i in range(20): values.float_list.value.extend( [float(i) * 2. + feature_number for feature_number in range(5)]) real_feature = example.features.feature["2d_exogenous_feature"] categortical_feature = example.features.feature[ "categorical_exogenous_feature"] for i in range(35): real_feature.float_list.value.extend([1, 1]) categortical_feature.bytes_list.value.append(b"strkey") # Serialize the tf.Example for feeding to the Session examples = [example.SerializeToString()] * 2 signatures = loader.load( session, [tag_constants.SERVING], export_location) predict_signature = signatures.signature_def[ feature_keys.SavedModelLabels.PREDICT] ((_, input_value),) = predict_signature.inputs.items() feeds = {graph.as_graph_element(input_value.name): examples} fetches = {output_key: graph.as_graph_element(output_value.name) for output_key, output_value in predict_signature.outputs.items()} output = session.run(fetches, feed_dict=feeds) self.assertEqual((2, 15, 5), output["mean"].shape)
def testSparseFeaturesWithL1Reg(self): """Tests LinearRegressor with LinearSDCA and sparse features.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': constant_op.constant([[0.4], [0.6], [0.3]]), 'country': sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 3], [2, 1]], dense_shape=[3, 5]), 'weights': constant_op.constant([[10.0], [10.0], [10.0]]) }, constant_op.constant([[1.4], [-0.8], [2.6]]) price = feature_column_lib.numeric_column('price') country = feature_column_lib.categorical_column_with_hash_bucket( 'country', hash_bucket_size=5) # Regressor with no L1 regularization. optimizer = linear.LinearSDCA(example_id_column='example_id', symmetric_l2_regularization=0.1) regressor = linear.LinearRegressorV2(feature_columns=[price, country], weight_column='weights', optimizer=optimizer) regressor.train(input_fn=input_fn, steps=20) no_l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss'] variable_names = regressor.get_variable_names() self.assertIn('linear/linear_model/price/weights', variable_names) self.assertIn('linear/linear_model/country/weights', variable_names) no_l1_reg_weights = { 'linear/linear_model/price/weights': regressor.get_variable_value('linear/linear_model/price/weights'), 'linear/linear_model/country/weights': regressor.get_variable_value( 'linear/linear_model/country/weights'), } # Regressor with L1 regularization. optimizer = linear.LinearSDCA(example_id_column='example_id', symmetric_l1_regularization=1.0, symmetric_l2_regularization=0.1) regressor = linear.LinearRegressorV2(feature_columns=[price, country], weight_column='weights', optimizer=optimizer) regressor.train(input_fn=input_fn, steps=20) l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss'] l1_reg_weights = { 'linear/linear_model/price/weights': regressor.get_variable_value('linear/linear_model/price/weights'), 'linear/linear_model/country/weights': regressor.get_variable_value( 'linear/linear_model/country/weights'), } # Unregularized loss is lower when there is no L1 regularization. self.assertLess(no_l1_reg_loss, l1_reg_loss) self.assertLess(no_l1_reg_loss, 0.05) # But weights returned by the regressor with L1 regularization have smaller # L1 norm. l1_reg_weights_norm, no_l1_reg_weights_norm = 0.0, 0.0 for var_name in sorted(l1_reg_weights): l1_reg_weights_norm += sum( np.absolute(l1_reg_weights[var_name].flatten())) no_l1_reg_weights_norm += sum( np.absolute(no_l1_reg_weights[var_name].flatten())) print('Var name: %s, value: %s' % (var_name, no_l1_reg_weights[var_name].flatten())) self.assertLess(l1_reg_weights_norm, no_l1_reg_weights_norm)