def testJointLinearModel(self):
  """Checks that further training lowers the joint linear model's loss.

  The estimator is fit with steps=1000 and evaluated, then fit again with
  steps=2000 and evaluated; the second loss must be below both the first
  loss and 0.01.
  """

  def input_fn():
    # A single example: two sparse string features and the label 1.
    age_feature = sparse_tensor.SparseTensor(
        values=['1'], indices=[[0, 0]], dense_shape=[1, 1])
    language_feature = sparse_tensor.SparseTensor(
        values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
    features = {'age': age_feature, 'language': language_feature}
    labels = constant_op.constant([[1]])
    return features, labels

  def current_loss(estimator):
    # One evaluation step over the same single-example input.
    return estimator.evaluate(input_fn=input_fn, steps=1)['loss']

  language_col = feature_column.sparse_column_with_hash_bucket('language', 100)
  age_col = feature_column.sparse_column_with_hash_bucket('age', 2)
  binary_head = head_lib._multi_class_head(n_classes=2)
  estimator = _joint_linear_estimator(
      binary_head, feature_columns=[age_col, language_col])

  estimator.fit(input_fn=input_fn, steps=1000)
  first_loss = current_loss(estimator)
  estimator.fit(input_fn=input_fn, steps=2000)
  second_loss = current_loss(estimator)

  self.assertLess(second_loss, first_loss)
  self.assertLess(second_loss, 0.01)
def testCrossedFeatures(self):
  """Tests SDCALogisticClassifier trained on a single crossed column.

  The 'language' and 'country' sparse columns are crossed into one column;
  after fitting with steps=10 the evaluated accuracy must exceed 0.9.
  """

  def input_fn():
    # Three examples, each with one language value and one country value.
    example_ids = constant_op.constant(['1', '2', '3'])
    languages = sparse_tensor.SparseTensor(
        values=['english', 'italian', 'spanish'],
        indices=[[0, 0], [1, 0], [2, 0]],
        dense_shape=[3, 1])
    countries = sparse_tensor.SparseTensor(
        values=['US', 'IT', 'MX'],
        indices=[[0, 0], [1, 0], [2, 0]],
        dense_shape=[3, 1])
    features = {
        'example_id': example_ids,
        'language': languages,
        'country': countries,
    }
    return features, constant_op.constant([[0], [0], [1]])

  language_col = feature_column_lib.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=5)
  country_col = feature_column_lib.sparse_column_with_hash_bucket(
      'country', hash_bucket_size=5)
  crossed_col = feature_column_lib.crossed_column(
      [language_col, country_col], hash_bucket_size=10)

  estimator = sdca_estimator.SDCALogisticClassifier(
      example_id_column='example_id', feature_columns=[crossed_col])
  estimator.fit(input_fn=input_fn, steps=10)

  eval_metrics = estimator.evaluate(input_fn=input_fn, steps=1)
  self.assertGreater(eval_metrics['accuracy'], 0.9)
def testSparseFeaturesWithDuplicates(self):
  """Tests SDCALogisticClassifier when sparse entries repeat an index.

  Each of the two examples lists the same (row, 0) index five times for both
  'age' and 'gender'. After fitting with steps=50 the loss must be below
  0.060.
  """

  def input_fn():
    # Five identical entries for row 0 followed by five for row 1.
    example_ids = constant_op.constant(['1', '2'])
    ages = sparse_tensor.SparseTensor(
        values=['20-29'] * 5 + ['31-40'] * 5,
        indices=[[0, 0]] * 5 + [[1, 0]] * 5,
        dense_shape=[2, 1])
    genders = sparse_tensor.SparseTensor(
        values=['m'] * 5 + ['f'] * 5,
        indices=[[0, 0]] * 5 + [[1, 0]] * 5,
        dense_shape=[2, 1])
    features = {'example_id': example_ids, 'age': ages, 'gender': genders}
    return features, constant_op.constant([[1], [0]])

  with self._single_threaded_test_session():
    age_col = feature_column_lib.sparse_column_with_hash_bucket(
        'age', hash_bucket_size=10)
    gender_col = feature_column_lib.sparse_column_with_hash_bucket(
        'gender', hash_bucket_size=10)
    estimator = sdca_estimator.SDCALogisticClassifier(
        example_id_column='example_id',
        feature_columns=[age_col, gender_col])
    estimator.fit(input_fn=input_fn, steps=50)
    eval_metrics = estimator.evaluate(input_fn=input_fn, steps=1)
    self.assertLess(eval_metrics['loss'], 0.060)
def testCrossedColumnNameCreatesSortedNames(self):
  """Checks that a crossed column's name and members follow sorted names.

  The input columns are handed over as an unordered set; the resulting name
  and the first three entries of `columns` must appear in name order.
  """
  col_aaa = fc.sparse_column_with_hash_bucket("aaa", hash_bucket_size=100)
  col_bbb = fc.sparse_column_with_hash_bucket("bbb", hash_bucket_size=100)
  cost_bucket = fc.bucketized_column(fc.real_valued_column("cost"), [0, 4])

  crossed = fc.crossed_column(
      {col_bbb, cost_bucket, col_aaa}, hash_bucket_size=10000)

  self.assertEqual("aaa_X_bbb_X_cost_bucketized", crossed.name,
                   "name should be generated by sorted column names")
  expected_names = ("aaa", "bbb", "cost_bucketized")
  for position, expected_name in enumerate(expected_names):
    self.assertEqual(expected_name, crossed.columns[position].name)
def testSparseColumnWithHashBucket(self):
  """Checks name, dtype and dtype validation of hash-bucket sparse columns.

  The dtype is expected to be string when not given, int64 is accepted, and
  float32 must raise ValueError.
  """
  string_col = fc.sparse_column_with_hash_bucket("aaa", hash_bucket_size=100)
  self.assertEqual(string_col.name, "aaa")
  self.assertEqual(string_col.dtype, dtypes.string)

  int_col = fc.sparse_column_with_hash_bucket(
      "aaa", hash_bucket_size=100, dtype=dtypes.int64)
  self.assertEqual(int_col.name, "aaa")
  self.assertEqual(int_col.dtype, dtypes.int64)

  with self.assertRaisesRegexp(ValueError, "dtype must be string or integer"):
    fc.sparse_column_with_hash_bucket(
        "aaa", hash_bucket_size=100, dtype=dtypes.float32)
def testCreateSequenceFeatureSpec(self):
  """Checks the parsing spec built for a mixed set of sequence columns.

  Builds sparse, embedding, weighted-sparse and several real-valued columns,
  then compares the result of `_create_sequence_feature_spec_for_parsing`
  with a hand-written dictionary keyed by feature name.
  """
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_source = fc.sparse_column_with_hash_bucket(
      "sparse_column_for_embedding", hash_bucket_size=10)
  embedding_col = fc.embedding_column(embedding_source, dimension=4)
  id_col = fc.sparse_column_with_keys("id_column",
                                      ["marlo", "omar", "stringer"])
  weighted_id_col = fc.weighted_sparse_column(id_col, "id_weights_column")
  fixed_real_col = fc.real_valued_column("real_valued_column", dimension=2)
  defaulted_real_col = fc.real_valued_column(
      "real_valued_default_column", dimension=5, default_value=3.0)
  var_len_sparse_col = fc._real_valued_var_len_column(
      "real_valued_var_len_column", default_value=3.0, is_sparse=True)
  var_len_dense_col = fc._real_valued_var_len_column(
      "real_valued_var_len_dense_column", default_value=4.0, is_sparse=False)

  feature_columns = {
      sparse_col, embedding_col, weighted_id_col, fixed_real_col,
      defaulted_real_col, var_len_sparse_col, var_len_dense_col
  }
  actual_spec = fc._create_sequence_feature_spec_for_parsing(feature_columns)

  expected_spec = {}
  # The three string-valued sparse features.
  for string_feature in ("sparse_column", "sparse_column_for_embedding",
                         "id_column"):
    expected_spec[string_feature] = parsing_ops.VarLenFeature(dtypes.string)
  expected_spec["id_weights_column"] = parsing_ops.VarLenFeature(
      dtypes.float32)
  expected_spec["real_valued_column"] = parsing_ops.FixedLenSequenceFeature(
      shape=[2], dtype=dtypes.float32, allow_missing=False)
  expected_spec["real_valued_default_column"] = (
      parsing_ops.FixedLenSequenceFeature(
          shape=[5], dtype=dtypes.float32, allow_missing=True))
  expected_spec["real_valued_var_len_column"] = parsing_ops.VarLenFeature(
      dtype=dtypes.float32)
  expected_spec["real_valued_var_len_dense_column"] = (
      parsing_ops.FixedLenSequenceFeature(
          shape=[],
          dtype=dtypes.float32,
          allow_missing=True,
          default_value=4.0))

  self.assertDictEqual(expected_spec, actual_spec)
def testRegression_TensorData(self):
  """Tests DNNRegressor fed with tensor data built inside the input_fn.

  Only checks that evaluation reports a 'loss' entry after fitting with
  steps=200.
  """

  def _input_fn(num_epochs=None):
    # Dense 'age' and the values of sparse 'language' both go through
    # limit_epochs with the requested num_epochs.
    age_tensor = input_lib.limit_epochs(
        constant_op.constant([[.8], [.15], [0.]]), num_epochs=num_epochs)
    language_values = input_lib.limit_epochs(
        ['en', 'fr', 'zh'], num_epochs=num_epochs)
    language_tensor = sparse_tensor.SparseTensor(
        values=language_values,
        indices=[[0, 0], [0, 1], [2, 0]],
        dense_shape=[3, 2])
    features = {'age': age_tensor, 'language': language_tensor}
    targets = constant_op.constant([1., 0., 0.2], dtype=dtypes.float32)
    return features, targets

  language_col = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  language_embedding = feature_column.embedding_column(
      language_col, dimension=1)
  age_col = feature_column.real_valued_column('age')

  regressor = dnn.DNNRegressor(
      feature_columns=[language_embedding, age_col],
      hidden_units=[3, 3],
      config=run_config.RunConfig(tf_random_seed=1))
  regressor.fit(input_fn=_input_fn, steps=200)

  eval_scores = regressor.evaluate(input_fn=_input_fn, steps=1)
  self.assertIn('loss', eval_scores)
def testExport(self):
  """Tests export model for servo.

  Trains a small DNNClassifier with steps=5 and exports it into a fresh
  temporary directory. The directory is removed again after the export call
  returns or raises, so the test leaves nothing behind on disk.
  """
  import shutil  # Local import: only needed to clean up the export dir.

  def input_fn():
    # One example: dense 'age', sparse 'language', label 1.
    features = {
        'age':
            constant_op.constant([1]),
        'language':
            sparse_tensor.SparseTensor(
                values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
    }
    return features, constant_op.constant([[1]])

  language = feature_column.sparse_column_with_hash_bucket('language', 100)
  feature_columns = [
      feature_column.real_valued_column('age'),
      feature_column.embedding_column(language, dimension=1)
  ]

  classifier = dnn.DNNClassifier(
      feature_columns=feature_columns, hidden_units=[3, 3])
  classifier.fit(input_fn=input_fn, steps=5)

  export_dir = tempfile.mkdtemp()
  try:
    classifier.export(export_dir)
  finally:
    # mkdtemp() never deletes what it creates; do it explicitly.
    shutil.rmtree(export_dir, ignore_errors=True)
def testCrossedColumnNotSupportRealValuedColumn(self):
  """Checks that crossing with a real-valued column raises TypeError."""
  sparse_col = fc.sparse_column_with_hash_bucket("bbb", hash_bucket_size=100)
  expected_message = (
      "columns must be a set of _SparseColumn, _CrossedColumn, "
      "or _BucketizedColumn instances")
  with self.assertRaisesRegexp(TypeError, expected_message):
    # The real-valued column is built inside the guarded block, as before.
    invalid_columns = {sparse_col, fc.real_valued_column("real")}
    fc.crossed_column(invalid_columns, hash_bucket_size=10000)
def testMakePlaceHolderTensorsForBaseFeatures(self):
  """Checks the placeholders created for a set of base feature columns.

  Four columns are passed in (sparse, fixed-size real-valued, real-valued
  with dimension=None, and bucketized). The test expects four placeholders
  keyed by base feature name: SparseTensor instances for the sparse and
  dimension=None columns, and float32 placeholders with shapes [None, 5] and
  [None, 1] for the other two.
  """
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  real_valued_col = fc.real_valued_column("real_valued_column", 5)
  vlen_real_valued_col = fc.real_valued_column(
      "vlen_real_valued_column", dimension=None)
  bucketized_col = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization"), [0, 4])
  feature_columns = {
      sparse_col, real_valued_col, vlen_real_valued_col, bucketized_col
  }

  placeholders = (
      fc.make_place_holder_tensors_for_base_features(feature_columns))

  self.assertEqual(4, len(placeholders))
  # assertIsInstance reports the actual type on failure, unlike
  # assertTrue(isinstance(...)).
  self.assertIsInstance(placeholders["sparse_column"],
                        sparse_tensor_lib.SparseTensor)
  self.assertIsInstance(placeholders["vlen_real_valued_column"],
                        sparse_tensor_lib.SparseTensor)

  placeholder = placeholders["real_valued_column"]
  # assertIn shows both strings on failure instead of a bare "-1 < 0".
  self.assertIn(u"Placeholder_real_valued_column", placeholder.name)
  self.assertEqual(dtypes.float32, placeholder.dtype)
  self.assertEqual([None, 5], placeholder.get_shape().as_list())

  placeholder = placeholders["real_valued_column_for_bucketization"]
  self.assertIn(u"Placeholder_real_valued_column_for_bucketization",
                placeholder.name)
  self.assertEqual(dtypes.float32, placeholder.dtype)
  self.assertEqual([None, 1], placeholder.get_shape().as_list())
def testWeightedSparseFeatures(self):
  """Tests SDCALogisticClassifier on a weighted sparse column.

  The sparse 'country' column is weighted by the sparse 'price' feature;
  after fitting with steps=50 the evaluated accuracy must exceed 0.9.
  """

  def input_fn():
    # 'price' and 'country' use the same indices and dense shape.
    example_ids = constant_op.constant(['1', '2', '3'])
    prices = sparse_tensor.SparseTensor(
        values=[2., 3., 1.],
        indices=[[0, 0], [1, 0], [2, 0]],
        dense_shape=[3, 5])
    countries = sparse_tensor.SparseTensor(
        values=['IT', 'US', 'GB'],
        indices=[[0, 0], [1, 0], [2, 0]],
        dense_shape=[3, 5])
    features = {
        'example_id': example_ids,
        'price': prices,
        'country': countries,
    }
    return features, constant_op.constant([[1], [0], [1]])

  country_col = feature_column_lib.sparse_column_with_hash_bucket(
      'country', hash_bucket_size=5)
  weighted_country_col = feature_column_lib.weighted_sparse_column(
      country_col, 'price')

  estimator = sdca_estimator.SDCALogisticClassifier(
      example_id_column='example_id',
      feature_columns=[weighted_country_col])
  estimator.fit(input_fn=input_fn, steps=50)

  eval_metrics = estimator.evaluate(input_fn=input_fn, steps=1)
  self.assertGreater(eval_metrics['accuracy'], 0.9)
def testExport(self):
  """Tests export model for servo.

  Fits a DebugClassifier with steps=5 and exports it with an input_fn that
  parses serialized examples according to `feature_columns`. The export goes
  to a fresh temporary directory, which is removed after the export call
  returns or raises so the test leaves nothing behind on disk.
  """
  import shutil  # Local import: only needed to clean up the export dir.

  def input_fn():
    # One example: dense 'age', sparse 'language', label 1.
    features = {
        'age':
            constant_op.constant([1]),
        'language':
            sparse_tensor.SparseTensor(
                values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
    }
    return features, constant_op.constant([[1]])

  language = feature_column.sparse_column_with_hash_bucket('language', 100)
  # Only used by default_input_fn below; the classifier takes no columns.
  feature_columns = [
      feature_column.real_valued_column('age'),
      feature_column.embedding_column(language, dimension=1)
  ]

  classifier = debug.DebugClassifier(
      config=run_config.RunConfig(tf_random_seed=1))
  classifier.fit(input_fn=input_fn, steps=5)

  def default_input_fn(unused_estimator, examples):
    return feature_column_ops.parse_feature_columns_from_examples(
        examples, feature_columns)

  export_dir = tempfile.mkdtemp()
  try:
    classifier.export(export_dir, input_fn=default_input_fn)
  finally:
    # mkdtemp() never deletes what it creates; do it explicitly.
    shutil.rmtree(export_dir, ignore_errors=True)
def testLinearModel(self):
  """Checks that further training lowers the linear model's loss.

  The estimator is fit with steps=1000 and evaluated, then fit again with
  steps=2000 and evaluated; the second loss must be below both the first
  loss and 0.01.
  """

  def input_fn():
    # A single example: dense 'age', sparse 'language', label 1.
    age_feature = constant_op.constant([1])
    language_feature = sparse_tensor.SparseTensor(
        values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
    features = {'age': age_feature, 'language': language_feature}
    return features, constant_op.constant([[1]])

  language_col = feature_column.sparse_column_with_hash_bucket('language', 100)
  age_col = feature_column.real_valued_column('age')
  binary_head = head_lib._multi_class_head(n_classes=2)
  estimator = _linear_estimator(
      binary_head, feature_columns=[age_col, language_col])

  losses = []
  for train_steps in (1000, 2000):
    estimator.fit(input_fn=input_fn, steps=train_steps)
    losses.append(estimator.evaluate(input_fn=input_fn, steps=1)['loss'])
  first_loss, second_loss = losses

  self.assertLess(second_loss, first_loss)
  self.assertLess(second_loss, 0.01)
def testSparseFeatures(self):
  """Tests SDCALogisticClassifier with one dense and one sparse feature.

  Runs inside the single-threaded test session; after fitting with steps=50
  the evaluated accuracy must exceed 0.9.
  """

  def input_fn():
    # Three examples with per-example weights of 1.0.
    example_ids = constant_op.constant(['1', '2', '3'])
    prices = constant_op.constant([[0.4], [0.6], [0.3]])
    countries = sparse_tensor.SparseTensor(
        values=['IT', 'US', 'GB'],
        indices=[[0, 0], [1, 3], [2, 1]],
        dense_shape=[3, 5])
    example_weights = constant_op.constant([[1.0], [1.0], [1.0]])
    features = {
        'example_id': example_ids,
        'price': prices,
        'country': countries,
        'weights': example_weights,
    }
    return features, constant_op.constant([[1], [0], [1]])

  with self._single_threaded_test_session():
    price_col = feature_column_lib.real_valued_column('price')
    country_col = feature_column_lib.sparse_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    estimator = sdca_estimator.SDCALogisticClassifier(
        example_id_column='example_id',
        feature_columns=[price_col, country_col],
        weight_column_name='weights')
    estimator.fit(input_fn=input_fn, steps=50)
    eval_metrics = estimator.evaluate(input_fn=input_fn, steps=1)
    self.assertGreater(eval_metrics['accuracy'], 0.9)
def testSparseFeatures(self):
  """Tests the SVM classifier with a dense and a hashed sparse feature.

  After fitting with steps=30 the evaluated accuracy must equal 1.0 to three
  decimal places.
  """

  def input_fn():
    # Three examples, one country value each.
    example_ids = constant_op.constant(['1', '2', '3'])
    prices = constant_op.constant([[0.8], [0.6], [0.3]])
    countries = sparse_tensor.SparseTensor(
        values=['IT', 'US', 'GB'],
        indices=[[0, 0], [1, 0], [2, 0]],
        dense_shape=[3, 1])
    features = {
        'example_id': example_ids,
        'price': prices,
        'country': countries,
    }
    return features, constant_op.constant([[0], [1], [1]])

  price_col = feature_column.real_valued_column('price')
  country_col = feature_column.sparse_column_with_hash_bucket(
      'country', hash_bucket_size=5)

  model = svm.SVM(
      feature_columns=[price_col, country_col],
      example_id_column='example_id',
      l1_regularization=0.0,
      l2_regularization=1.0)
  model.fit(input_fn=input_fn, steps=30)

  eval_metrics = model.evaluate(input_fn=input_fn, steps=1)
  self.assertAlmostEqual(eval_metrics['accuracy'], 1.0, places=3)
def testEmbeddingColumn(self):
  """Checks the attributes of an embedding column over a sparse column.

  The embedding is created with combiner="mean" on top of a sparse column
  created with combiner="sum"; the embedding must report "mean".
  """
  sparse_col = fc.sparse_column_with_hash_bucket(
      "aaa", hash_bucket_size=100, combiner="sum")
  embedding = fc.embedding_column(sparse_col, dimension=4, combiner="mean")

  self.assertEqual(embedding.sparse_id_column.name, "aaa")
  self.assertEqual(embedding.dimension, 4)
  self.assertEqual(embedding.combiner, "mean")
def testExport(self):
  """Tests export model for servo.

  Fits a DebugClassifier with steps=5 and exports it with an input_fn that
  parses serialized examples according to `feature_columns`. The export goes
  to a fresh temporary directory, which is removed after the export call
  returns or raises so the test leaves nothing behind on disk.
  """
  import shutil  # Local import: only needed to clean up the export dir.

  def input_fn():
    # One example: dense 'age', sparse 'language', label 1.
    features = {
        'age':
            constant_op.constant([1]),
        'language':
            sparse_tensor.SparseTensor(
                values=['english'], indices=[[0, 0]], dense_shape=[1, 1])
    }
    return features, constant_op.constant([[1]])

  language = feature_column.sparse_column_with_hash_bucket('language', 100)
  # Only used by default_input_fn below; the classifier takes no columns.
  feature_columns = [
      feature_column.real_valued_column('age'),
      feature_column.embedding_column(language, dimension=1)
  ]

  classifier = debug.DebugClassifier(
      config=run_config.RunConfig(tf_random_seed=1))
  classifier.fit(input_fn=input_fn, steps=5)

  def default_input_fn(unused_estimator, examples):
    return feature_column_ops.parse_feature_columns_from_examples(
        examples, feature_columns)

  export_dir = tempfile.mkdtemp()
  try:
    classifier.export(export_dir, input_fn=default_input_fn)
  finally:
    # mkdtemp() never deletes what it creates; do it explicitly.
    shutil.rmtree(export_dir, ignore_errors=True)
def testSparseFeatures(self):
  """Tests the SVM classifier with a dense and a hashed sparse feature.

  After fitting with steps=30 the evaluated accuracy must equal 1.0 to three
  decimal places.
  """

  def input_fn():
    # Three examples, one country value each.
    example_ids = constant_op.constant(['1', '2', '3'])
    prices = constant_op.constant([[0.8], [0.6], [0.3]])
    countries = sparse_tensor.SparseTensor(
        values=['IT', 'US', 'GB'],
        indices=[[0, 0], [1, 0], [2, 0]],
        dense_shape=[3, 1])
    features = {
        'example_id': example_ids,
        'price': prices,
        'country': countries,
    }
    return features, constant_op.constant([[0], [1], [1]])

  price_col = feature_column.real_valued_column('price')
  country_col = feature_column.sparse_column_with_hash_bucket(
      'country', hash_bucket_size=5)

  model = svm.SVM(
      feature_columns=[price_col, country_col],
      example_id_column='example_id',
      l1_regularization=0.0,
      l2_regularization=1.0)
  model.fit(input_fn=input_fn, steps=30)

  eval_metrics = model.evaluate(input_fn=input_fn, steps=1)
  self.assertAlmostEqual(eval_metrics['accuracy'], 1.0, places=3)
def testWeightedSparseFeatures(self):
  """Tests SDCALogisticClassifier on a weighted sparse column.

  The sparse 'country' column is weighted by the sparse 'price' feature;
  after fitting with steps=50 the evaluated accuracy must exceed 0.9.
  """

  def input_fn():
    # 'price' and 'country' use the same indices and dense shape.
    example_ids = constant_op.constant(['1', '2', '3'])
    prices = sparse_tensor.SparseTensor(
        values=[2., 3., 1.],
        indices=[[0, 0], [1, 0], [2, 0]],
        dense_shape=[3, 5])
    countries = sparse_tensor.SparseTensor(
        values=['IT', 'US', 'GB'],
        indices=[[0, 0], [1, 0], [2, 0]],
        dense_shape=[3, 5])
    features = {
        'example_id': example_ids,
        'price': prices,
        'country': countries,
    }
    return features, constant_op.constant([[1], [0], [1]])

  country_col = feature_column_lib.sparse_column_with_hash_bucket(
      'country', hash_bucket_size=5)
  weighted_country_col = feature_column_lib.weighted_sparse_column(
      country_col, 'price')

  estimator = sdca_estimator.SDCALogisticClassifier(
      example_id_column='example_id',
      feature_columns=[weighted_country_col])
  estimator.fit(input_fn=input_fn, steps=50)

  eval_metrics = estimator.evaluate(input_fn=input_fn, steps=1)
  self.assertGreater(eval_metrics['accuracy'], 0.9)
def testMakePlaceHolderTensorsForBaseFeatures(self):
  """Checks the placeholders created for a set of base feature columns.

  Four columns are passed in (sparse, fixed-size real-valued, real-valued
  with dimension=None, and bucketized). The test expects four placeholders
  keyed by base feature name: SparseTensor instances for the sparse and
  dimension=None columns, and float32 placeholders with shapes [None, 5] and
  [None, 1] for the other two.
  """
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  real_valued_col = fc.real_valued_column("real_valued_column", 5)
  vlen_real_valued_col = fc.real_valued_column(
      "vlen_real_valued_column", dimension=None)
  bucketized_col = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization"), [0, 4])
  feature_columns = {
      sparse_col, real_valued_col, vlen_real_valued_col, bucketized_col
  }

  placeholders = (
      fc.make_place_holder_tensors_for_base_features(feature_columns))

  self.assertEqual(4, len(placeholders))
  # assertIsInstance reports the actual type on failure, unlike
  # assertTrue(isinstance(...)).
  self.assertIsInstance(placeholders["sparse_column"],
                        sparse_tensor_lib.SparseTensor)
  self.assertIsInstance(placeholders["vlen_real_valued_column"],
                        sparse_tensor_lib.SparseTensor)

  placeholder = placeholders["real_valued_column"]
  # assertIn shows both strings on failure instead of a bare "-1 < 0".
  self.assertIn(u"Placeholder_real_valued_column", placeholder.name)
  self.assertEqual(dtypes.float32, placeholder.dtype)
  self.assertEqual([None, 5], placeholder.get_shape().as_list())

  placeholder = placeholders["real_valued_column_for_bucketization"]
  self.assertIn(u"Placeholder_real_valued_column_for_bucketization",
                placeholder.name)
  self.assertEqual(dtypes.float32, placeholder.dtype)
  self.assertEqual([None, 1], placeholder.get_shape().as_list())
def testMixedFeatures(self):
  """Tests SDCALogisticClassifier with dense, bucketized, sparse and crossed
  columns plus a weight column.

  After fitting with steps=50 the evaluated accuracy must exceed 0.9.
  """

  def input_fn():
    # Three examples; the first carries weight 3.0, the others 1.0.
    example_ids = constant_op.constant(['1', '2', '3'])
    prices = constant_op.constant([[0.6], [0.8], [0.3]])
    square_footage = constant_op.constant([[900.0], [700.0], [600.0]])
    countries = sparse_tensor.SparseTensor(
        values=['IT', 'US', 'GB'],
        indices=[[0, 0], [1, 3], [2, 1]],
        dense_shape=[3, 5])
    example_weights = constant_op.constant([[3.0], [1.0], [1.0]])
    features = {
        'example_id': example_ids,
        'price': prices,
        'sq_footage': square_footage,
        'country': countries,
        'weights': example_weights,
    }
    return features, constant_op.constant([[1], [0], [1]])

  price_col = feature_column_lib.real_valued_column('price')
  sq_footage_col = feature_column_lib.real_valued_column('sq_footage')
  sq_footage_bucket_col = feature_column_lib.bucketized_column(
      sq_footage_col, boundaries=[650.0, 800.0])
  country_col = feature_column_lib.sparse_column_with_hash_bucket(
      'country', hash_bucket_size=5)
  sq_footage_country_col = feature_column_lib.crossed_column(
      [sq_footage_bucket_col, country_col], hash_bucket_size=10)

  all_columns = [
      price_col, sq_footage_bucket_col, country_col, sq_footage_country_col
  ]
  estimator = sdca_estimator.SDCALogisticClassifier(
      example_id_column='example_id',
      feature_columns=all_columns,
      weight_column_name='weights')
  estimator.fit(input_fn=input_fn, steps=50)

  eval_metrics = estimator.evaluate(input_fn=input_fn, steps=1)
  self.assertGreater(eval_metrics['accuracy'], 0.9)
def testSparseFeatures(self):
  """Tests SDCALogisticClassifier with one dense and one sparse feature.

  After fitting with steps=50 the evaluated accuracy must exceed 0.9.
  """

  def input_fn():
    # Three examples with per-example weights of 1.0.
    example_ids = constant_op.constant(['1', '2', '3'])
    prices = constant_op.constant([[0.4], [0.6], [0.3]])
    countries = sparse_tensor.SparseTensor(
        values=['IT', 'US', 'GB'],
        indices=[[0, 0], [1, 3], [2, 1]],
        dense_shape=[3, 5])
    example_weights = constant_op.constant([[1.0], [1.0], [1.0]])
    features = {
        'example_id': example_ids,
        'price': prices,
        'country': countries,
        'weights': example_weights,
    }
    return features, constant_op.constant([[1], [0], [1]])

  price_col = feature_column_lib.real_valued_column('price')
  country_col = feature_column_lib.sparse_column_with_hash_bucket(
      'country', hash_bucket_size=5)

  estimator = sdca_estimator.SDCALogisticClassifier(
      example_id_column='example_id',
      feature_columns=[price_col, country_col],
      weight_column_name='weights')
  estimator.fit(input_fn=input_fn, steps=50)

  eval_metrics = estimator.evaluate(input_fn=input_fn, steps=1)
  self.assertGreater(eval_metrics['accuracy'], 0.9)
def testSparseColumnWithHashBucket(self):
  """Checks name, dtype and dtype validation of hash-bucket sparse columns.

  The dtype is expected to be string when not given, int64 is accepted, and
  float32 must raise ValueError.
  """
  string_col = fc.sparse_column_with_hash_bucket("aaa", hash_bucket_size=100)
  self.assertEqual(string_col.name, "aaa")
  self.assertEqual(string_col.dtype, dtypes.string)

  int_col = fc.sparse_column_with_hash_bucket(
      "aaa", hash_bucket_size=100, dtype=dtypes.int64)
  self.assertEqual(int_col.name, "aaa")
  self.assertEqual(int_col.dtype, dtypes.int64)

  with self.assertRaisesRegexp(ValueError, "dtype must be string or integer"):
    fc.sparse_column_with_hash_bucket(
        "aaa", hash_bucket_size=100, dtype=dtypes.float32)
def testEmbeddingColumn(self):
  """Checks the attributes of an embedding column over a sparse column.

  The embedding is created with combiner="mean" on top of a sparse column
  created with combiner="sum"; the embedding must report "mean".
  """
  sparse_col = fc.sparse_column_with_hash_bucket(
      "aaa", hash_bucket_size=100, combiner="sum")
  embedding = fc.embedding_column(sparse_col, dimension=4, combiner="mean")

  self.assertEqual(embedding.sparse_id_column.name, "aaa")
  self.assertEqual(embedding.dimension, 4)
  self.assertEqual(embedding.combiner, "mean")
def testCrossedColumnNotSupportRealValuedColumn(self):
  """Checks that crossing with a real-valued column raises TypeError."""
  sparse_col = fc.sparse_column_with_hash_bucket("bbb", hash_bucket_size=100)
  expected_message = (
      "columns must be a set of _SparseColumn, _CrossedColumn, "
      "or _BucketizedColumn instances")
  with self.assertRaisesRegexp(TypeError, expected_message):
    # The real-valued column is built inside the guarded block, as before.
    invalid_columns = {sparse_col, fc.real_valued_column("real")}
    fc.crossed_column(invalid_columns, hash_bucket_size=10000)
def testSparseColumnHashBucketDeepCopy(self):
  """Checks that copy.deepcopy keeps a hash-bucket sparse column's fields.

  The copy must keep the name "a" and bucket_size 10, and must report
  is_integerized as false.
  """
  source_col = fc.sparse_column_with_hash_bucket("a", 10)
  self.assertEqual("a", source_col.name)

  cloned_col = copy.deepcopy(source_col)

  self.assertEqual("a", cloned_col.name)
  self.assertEqual(10, cloned_col.bucket_size)
  self.assertFalse(cloned_col.is_integerized)
def testMultipliesGradient(self):
  """Checks that an embedding learning-rate multiplier of 0.0 freezes it.

  Two embeddings start at the constant 0.1. Only the 'language' embedding
  gets a multiplier (0.0). After running the train op twice, the language
  embedding must still be close to 0.1 everywhere while the 'wire' embedding
  must not be.
  """
  language_embedding = feature_column.embedding_column(
      feature_column.sparse_column_with_hash_bucket('language', 10),
      dimension=1,
      initializer=init_ops.constant_initializer(0.1))
  wire_embedding = feature_column.embedding_column(
      feature_column.sparse_column_with_hash_bucket('wire', 10),
      dimension=1,
      initializer=init_ops.constant_initializer(0.1))

  # A multiplier of 0. keeps the language embedding constant.
  lr_multipliers = {language_embedding: 0.0}
  params = {
      'feature_columns': [language_embedding, wire_embedding],
      'head': head_lib._multi_class_head(2),
      'hidden_units': [1],
      'embedding_lr_multipliers': lr_multipliers,
  }

  language_input = sparse_tensor.SparseTensor(
      values=['en', 'fr', 'zh'],
      indices=[[0, 0], [1, 0], [2, 0]],
      dense_shape=[3, 1])
  wire_input = sparse_tensor.SparseTensor(
      values=['omar', 'stringer', 'marlo'],
      indices=[[0, 0], [1, 0], [2, 0]],
      dense_shape=[3, 1])
  features = {'language': language_input, 'wire': wire_input}
  labels = constant_op.constant([[0], [0], [0]], dtype=dtypes.int32)

  model_ops = dnn._dnn_model_fn(features, labels, model_fn.ModeKeys.TRAIN,
                                params)

  with monitored_session.MonitoredSession() as sess:
    language_var = dnn_linear_combined._get_embedding_variable(
        language_embedding, 'dnn', 'dnn/input_from_feature_columns')
    wire_var = dnn_linear_combined._get_embedding_variable(
        wire_embedding, 'dnn', 'dnn/input_from_feature_columns')
    fetches = [model_ops.train_op, language_var, wire_var]
    # Two training steps; the values from the last run are checked below.
    for _ in range(2):
      _, language_value, wire_value = sess.run(fetches)

  initial_value = np.full_like(language_value, 0.1)
  language_unchanged = np.all(np.isclose(language_value, initial_value))
  wire_unchanged = np.all(np.isclose(wire_value, initial_value))
  self.assertTrue(language_unchanged)
  self.assertFalse(wire_unchanged)
def testEmbeddingColumnDeepCopy(self):
  """Checks that copy.deepcopy keeps an embedding column's attributes."""
  sparse_col = fc.sparse_column_with_hash_bucket(
      "aaa", hash_bucket_size=100, combiner="sum")
  embedding = fc.embedding_column(sparse_col, dimension=4, combiner="mean")

  cloned_embedding = copy.deepcopy(embedding)

  self.assertEqual(cloned_embedding.name, "aaa_embedding")
  self.assertEqual(cloned_embedding.sparse_id_column.name, "aaa")
  self.assertEqual(cloned_embedding.dimension, 4)
  self.assertEqual(cloned_embedding.combiner, "mean")
def test_make_parsing_export_strategy(self):
  """Only tests that an ExportStrategy instance is created.

  Passes a sparse, an embedding, a real-valued and a bucketized column to
  `make_parsing_export_strategy` and checks the type of the result.
  """
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  bucketized_col1 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
  feature_columns = [
      sparse_col, embedding_col, real_valued_col1, bucketized_col1
  ]

  export_strategy = saved_model_export_utils.make_parsing_export_strategy(
      feature_columns=feature_columns)

  # assertIsInstance reports the actual type on failure, unlike
  # assertTrue(isinstance(...)), which only says "False is not true".
  self.assertIsInstance(export_strategy, export_strategy_lib.ExportStrategy)
def testInitEmbeddingColumnWeightsFromCkpt(self):
  """Checks that an embedding column can load its weights from a checkpoint.

  An embedding is built under scope "run_1", evaluated and saved. A second
  embedding column pointing at that checkpoint is built under "run_2"; its
  evaluated output must be close to the saved one.
  """
  sparse_col = fc.sparse_column_with_hash_bucket(
      column_name="object_in_image", hash_bucket_size=4)
  # First embedding column: no checkpoint is given (bucket size 4,
  # dimension 16).
  source_embedding_col = fc.embedding_column(sparse_col, dimension=16)

  # One id per row, covering ids 0 through 3.
  input_tensor = sparse_tensor_lib.SparseTensor(
      indices=[[0, 0], [1, 1], [2, 2], [3, 3]],
      values=[0, 1, 2, 3],
      dense_shape=[4, 4])

  # Building the input layer creates the embedding variable. The "run_1"
  # scope keeps its name from clashing with the variable built later.
  with variable_scope.variable_scope("run_1"):
    with variable_scope.variable_scope(source_embedding_col.name):
      source_embeddings = feature_column_ops.input_from_feature_columns(
          {source_embedding_col: input_tensor}, [source_embedding_col])

  checkpoint_saver = saver.Saver()
  ckpt_dir_prefix = os.path.join(self.get_temp_dir(),
                                 "init_embedding_col_w_from_ckpt")
  ckpt_dir = tempfile.mkdtemp(prefix=ckpt_dir_prefix)
  checkpoint_path = os.path.join(ckpt_dir, "model.ckpt")

  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    saved_embedding = source_embeddings.eval()
    checkpoint_saver.save(sess, checkpoint_path)

  restored_embedding_col = fc.embedding_column(
      sparse_id_column=sparse_col,
      dimension=16,
      ckpt_to_load_from=checkpoint_path,
      tensor_name_in_ckpt=("run_1/object_in_image_embedding/"
                           "input_from_feature_columns/object"
                           "_in_image_embedding/weights"))

  with variable_scope.variable_scope("run_2"):
    # This embedding is initialized from the checkpoint; nothing modified
    # the saved weights, so the output should match 'saved_embedding'.
    restored_embeddings = feature_column_ops.input_from_feature_columns(
        {restored_embedding_col: input_tensor}, [restored_embedding_col])

  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    loaded_embedding = restored_embeddings.eval()

  self.assertAllClose(saved_embedding, loaded_embedding)
def testOneHotColumn(self):
  """Checks one-hot columns over a keyed and a hash-bucket sparse column.

  The expected length is the number of keys (4) for the keyed column and the
  hash bucket size (100) for the hashed column.
  """
  keyed_col = fc.sparse_column_with_keys("a", ["a", "b", "c", "d"])
  keyed_onehot = fc.one_hot_column(keyed_col)
  self.assertEqual(keyed_onehot.sparse_id_column.name, "a")
  self.assertEqual(keyed_onehot.length, 4)

  hashed_col = fc.sparse_column_with_hash_bucket(
      "b", hash_bucket_size=100, combiner="sum")
  hashed_onehot = fc.one_hot_column(hashed_col)
  self.assertEqual(hashed_onehot.sparse_id_column.name, "b")
  self.assertEqual(hashed_onehot.length, 100)
def testEmbeddingMultiplier(self):
  """Checks that DNNClassifier stores embedding_lr_multipliers in params."""
  language_col = feature_column.sparse_column_with_hash_bucket('language', 10)
  embedding_language = feature_column.embedding_column(
      language_col,
      dimension=1,
      initializer=init_ops.constant_initializer(0.1))

  classifier = dnn.DNNClassifier(
      feature_columns=[embedding_language],
      hidden_units=[3, 3],
      embedding_lr_multipliers={embedding_language: 0.8})

  # A separate dict is built for the expectation so the comparison does not
  # rely on object identity with the argument passed above.
  expected_multipliers = {embedding_language: 0.8}
  stored_multipliers = classifier._estimator.params[
      'embedding_lr_multipliers']
  self.assertEqual(expected_multipliers, stored_multipliers)
def testTrainWithPartitionedVariables(self):
  """Tests DNNClassifier training with partitioned variables.

  A TF_CONFIG naming two fake parameter servers is patched into os.environ
  while the RunConfig is created. The classifier is then fit with steps=5;
  the evaluated accuracy must lie in [0.0, 1.0] and a 'loss' entry must be
  reported.
  """

  def _input_fn(num_epochs=None):
    age_tensor = input_lib.limit_epochs(
        constant_op.constant([[.8], [.2], [.1]]), num_epochs=num_epochs)
    language_values = input_lib.limit_epochs(
        ['en', 'fr', 'zh'], num_epochs=num_epochs)
    language_tensor = sparse_tensor.SparseTensor(
        values=language_values,
        indices=[[0, 0], [0, 1], [2, 0]],
        dense_shape=[3, 2])
    features = {'age': age_tensor, 'language': language_tensor}
    labels = constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)
    return features, labels

  # With this hash_bucket_size the variables exceed the default
  # min_slice_size, so they get partitioned.
  language_col = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=2e7)
  language_embedding = feature_column.embedding_column(
      language_col, dimension=1)

  fake_ps_tasks = ['fake_ps_0', 'fake_ps_1']
  tf_config = {'cluster': {run_config.TaskType.PS: fake_ps_tasks}}
  patched_env = {'TF_CONFIG': json.dumps(tf_config)}
  with test.mock.patch.dict('os.environ', patched_env):
    config = run_config.RunConfig(tf_random_seed=1)
    # No distributed cluster was started, so an empty ClusterSpec is set;
    # otherwise the device setter would look for jobs such as "/job:ps"
    # that do not exist.
    config._cluster_spec = server_lib.ClusterSpec({})

  classifier = dnn.DNNClassifier(
      n_classes=3,
      feature_columns=[language_embedding],
      hidden_units=[3, 3],
      config=config)
  classifier.fit(input_fn=_input_fn, steps=5)

  eval_scores = classifier.evaluate(input_fn=_input_fn, steps=1)
  self._assertInRange(0.0, 1.0, eval_scores['accuracy'])
  self.assertIn('loss', eval_scores)
def testExtractFeaturesWithTransformation(self):
  """Tests gbdt_batch.extract_features when a feature column is transformed.

  One dense float, one sparse float and one sparse string feature (all
  zero-filled) are extracted through matching feature columns. The sparse
  string feature is expected back as integer values [397263, 397263]; every
  other output must equal the corresponding input.
  """
  with self.test_session():
    dense_float = array_ops.zeros([2, 1], dtypes.float32)
    sparse_float = sparse_tensor.SparseTensor(
        array_ops.zeros([2, 2], dtypes.int64),
        array_ops.zeros([2], dtypes.float32),
        array_ops.zeros([2], dtypes.int64))
    sparse_categorical = sparse_tensor.SparseTensor(
        array_ops.zeros([2, 2], dtypes.int64),
        array_ops.zeros([2], dtypes.string),
        array_ops.zeros([2], dtypes.int64))
    features = {
        "dense_float": dense_float,
        "sparse_float": sparse_float,
        "sparse_categorical": sparse_categorical,
    }

    feature_columns = set()
    feature_columns.add(layers.real_valued_column("dense_float"))
    feature_columns.add(
        layers.feature_column._real_valued_var_len_column(
            "sparse_float", is_sparse=True))
    feature_columns.add(
        feature_column_lib.sparse_column_with_hash_bucket(
            "sparse_categorical", hash_bucket_size=1000000))

    extracted = gbdt_batch.extract_features(features, feature_columns)
    (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
     sparse_float_shapes, sparse_int_indices, sparse_int_values,
     sparse_int_shapes) = extracted

    self.assertEqual(len(fc_names), 3)
    self.assertAllEqual(fc_names,
                        ["dense_float", "sparse_float", "sparse_categorical"])
    # Exactly one tensor is expected in each per-kind output list.
    for output_list in (dense_floats, sparse_float_indices,
                        sparse_float_values, sparse_float_shapes,
                        sparse_int_indices, sparse_int_values,
                        sparse_int_shapes):
      self.assertEqual(len(output_list), 1)

    self.assertAllEqual(dense_floats[0].eval(),
                        features["dense_float"].eval())
    self.assertAllEqual(sparse_float_indices[0].eval(),
                        features["sparse_float"].indices.eval())
    self.assertAllEqual(sparse_float_values[0].eval(),
                        features["sparse_float"].values.eval())
    self.assertAllEqual(sparse_float_shapes[0].eval(),
                        features["sparse_float"].dense_shape.eval())
    self.assertAllEqual(sparse_int_indices[0].eval(),
                        features["sparse_categorical"].indices.eval())
    self.assertAllEqual(sparse_int_values[0].eval(), [397263, 397263])
    self.assertAllEqual(sparse_int_shapes[0].eval(),
                        features["sparse_categorical"].dense_shape.eval())
def benchmarkTensorData(self):
  """Benchmarks DNNLinearCombinedClassifier on iris data fed as tensors.

  The four iris columns are used as real-valued DNN features and, bucketized,
  as linear features, together with a dummy sparse column. The classifier is
  fit with steps=_ITERS, evaluated with steps=100, and the metrics are handed
  to `_assertSingleClassMetrics`.
  """

  def _input_fn():
    iris = test_data.prepare_iris_data_for_logistic_regression()
    features = {}
    for column_index in range(4):
      # Tensor data for the real-valued column named str(column_index).
      column_values = constant_op.constant(
          iris.data[:, column_index], dtype=dtypes.float32)
      features[str(column_index)] = array_ops.reshape(column_values, (-1, 1))

    # SparseTensor data for the sparse column.
    features['dummy_sparse_column'] = sparse_tensor.SparseTensor(
        values=('en', 'fr', 'zh'),
        indices=((0, 0), (0, 1), (60, 0)),
        dense_shape=(len(iris.target), 2))

    label_values = constant_op.constant(iris.target, dtype=dtypes.int32)
    labels = array_ops.reshape(label_values, (-1, 1))
    return features, labels

  iris = test_data.prepare_iris_data_for_logistic_regression()
  cont_features = [
      feature_column.real_valued_column(str(i)) for i in range(4)
  ]

  linear_features = []
  for column_index, real_col in enumerate(cont_features):
    boundaries = test_data.get_quantile_based_buckets(
        iris.data[:, column_index], 10)
    linear_features.append(
        feature_column.bucketized_column(real_col, boundaries))
  linear_features.append(
      feature_column.sparse_column_with_hash_bucket(
          'dummy_sparse_column', hash_bucket_size=100))

  classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
      model_dir=tempfile.mkdtemp(),
      linear_feature_columns=linear_features,
      dnn_feature_columns=cont_features,
      dnn_hidden_units=(3, 3))

  # The original chains evaluate() on the value returned by fit().
  fitted = classifier.fit(input_fn=_input_fn, steps=_ITERS)
  metrics = fitted.evaluate(input_fn=_input_fn, steps=100)
  self._assertSingleClassMetrics(metrics)
def testExtractFeaturesWithTransformation(self):
  """Verifies the tensors and names returned by extract_features.

  Builds an all-zeros dense float feature, a sparse float feature and a
  sparse string feature, runs them through `gbdt_batch.extract_features`
  with matching feature columns, and compares the eight returned values
  against the inputs. The string feature is expected to evaluate to the
  integer ids [397263, 397263].
  """
  with self.test_session():
    features = {
        "dense_float":
            array_ops.zeros([2, 1], dtypes.float32),
        "sparse_float":
            sparse_tensor.SparseTensor(
                array_ops.zeros([2, 2], dtypes.int64),
                array_ops.zeros([2], dtypes.float32),
                array_ops.zeros([2], dtypes.int64)),
        "sparse_categorical":
            sparse_tensor.SparseTensor(
                array_ops.zeros([2, 2], dtypes.int64),
                array_ops.zeros([2], dtypes.string),
                array_ops.zeros([2], dtypes.int64)),
    }
    feature_columns = {
        layers.real_valued_column("dense_float"),
        layers.feature_column._real_valued_var_len_column(
            "sparse_float", is_sparse=True),
        feature_column_lib.sparse_column_with_hash_bucket(
            "sparse_categorical", hash_bucket_size=1000000),
    }

    (fc_names, dense_floats, sparse_float_indices, sparse_float_values,
     sparse_float_shapes, sparse_int_indices, sparse_int_values,
     sparse_int_shapes) = gbdt_batch.extract_features(features,
                                                      feature_columns)

    self.assertEqual(len(fc_names), 3)
    self.assertAllEqual(
        fc_names, ["dense_float", "sparse_float", "sparse_categorical"])

    # Each group of outputs should contain a single tensor.
    single_tensor_groups = (dense_floats, sparse_float_indices,
                            sparse_float_values, sparse_float_shapes,
                            sparse_int_indices, sparse_int_values,
                            sparse_int_shapes)
    for group in single_tensor_groups:
      self.assertEqual(len(group), 1)

    float_input = features["sparse_float"]
    categorical_input = features["sparse_categorical"]
    self.assertAllEqual(dense_floats[0].eval(),
                        features["dense_float"].eval())
    self.assertAllEqual(sparse_float_indices[0].eval(),
                        float_input.indices.eval())
    self.assertAllEqual(sparse_float_values[0].eval(),
                        float_input.values.eval())
    self.assertAllEqual(sparse_float_shapes[0].eval(),
                        float_input.dense_shape.eval())
    self.assertAllEqual(sparse_int_indices[0].eval(),
                        categorical_input.indices.eval())
    self.assertAllEqual(sparse_int_values[0].eval(), [397263, 397263])
    self.assertAllEqual(sparse_int_shapes[0].eval(),
                        categorical_input.dense_shape.eval())
def benchmarkTensorData(self):
  """Benchmarks DNNLinearCombinedClassifier with Tensor/SparseTensor inputs.

  Dense iris columns feed both the DNN part and (bucketized) the linear
  part; a dummy hashed sparse column is added to the linear part only.
  """

  def _input_fn():
    dataset = test_data.prepare_iris_data_for_logistic_regression()
    features = {}
    for column_index in range(4):
      # The following shows how to provide the Tensor data for
      # RealValuedColumns.
      column_values = constant_op.constant(
          dataset.data[:, column_index], dtype=dtypes.float32)
      features[str(column_index)] = array_ops.reshape(column_values, (-1, 1))

    # The following shows how to provide the SparseTensor data for
    # a SparseColumn.
    features['dummy_sparse_column'] = sparse_tensor.SparseTensor(
        values=('en', 'fr', 'zh'),
        indices=((0, 0), (0, 1), (60, 0)),
        dense_shape=(len(dataset.target), 2))

    target = constant_op.constant(dataset.target, dtype=dtypes.int32)
    return features, array_ops.reshape(target, (-1, 1))

  iris = test_data.prepare_iris_data_for_logistic_regression()
  cont_features = [
      feature_column.real_valued_column(str(index)) for index in range(4)
  ]

  linear_features = []
  for index, continuous in enumerate(cont_features):
    boundaries = test_data.get_quantile_based_buckets(iris.data[:, index], 10)
    linear_features.append(
        feature_column.bucketized_column(continuous, boundaries))
  linear_features.append(
      feature_column.sparse_column_with_hash_bucket(
          'dummy_sparse_column', hash_bucket_size=100))

  classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
      model_dir=tempfile.mkdtemp(),
      linear_feature_columns=linear_features,
      dnn_feature_columns=cont_features,
      dnn_hidden_units=(3, 3))

  trained = classifier.fit(input_fn=_input_fn, steps=_ITERS)
  metrics = trained.evaluate(input_fn=_input_fn, steps=100)
  self._assertSingleClassMetrics(metrics)
def testPredict_AsIterable(self):
  """Checks predict and predict_proba when called with as_iterable=True.

  A three-class DNNClassifier is trained on three examples; the iterable
  predictions must equal the labels and the iterable probabilities must be
  within 0.3 of the one-hot encoding of the labels.
  """

  def _input_fn(num_epochs=None):
    age = input_lib.limit_epochs(
        constant_op.constant([[.8], [.2], [.1]]), num_epochs=num_epochs)
    language_values = input_lib.limit_epochs(
        ['en', 'fr', 'zh'], num_epochs=num_epochs)
    language = sparse_tensor.SparseTensor(
        values=language_values,
        indices=[[0, 0], [0, 1], [2, 0]],
        dense_shape=[3, 2])
    labels = constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)
    return {'age': age, 'language': language}, labels

  language_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  feature_columns = [
      feature_column.embedding_column(language_column, dimension=1),
      feature_column.real_valued_column('age'),
  ]

  classifier = dnn.DNNClassifier(
      n_classes=3,
      feature_columns=feature_columns,
      hidden_units=[3, 3],
      config=run_config.RunConfig(tf_random_seed=1))
  classifier.fit(input_fn=_input_fn, steps=200)

  scores = classifier.evaluate(input_fn=_input_fn, steps=1)
  self._assertInRange(0.0, 1.0, scores['accuracy'])
  self.assertIn('loss', scores)

  # A single epoch makes the iterable predictions terminate.
  predict_input_fn = functools.partial(_input_fn, num_epochs=1)

  predicted_classes = list(
      classifier.predict(input_fn=predict_input_fn, as_iterable=True))
  self.assertListEqual(predicted_classes, [1, 0, 0])

  predicted_probabilities = list(
      classifier.predict_proba(input_fn=predict_input_fn, as_iterable=True))
  self.assertAllClose(
      predicted_probabilities,
      [[0., 1., 0.], [1., 0., 0.], [1., 0., 0.]],
      atol=0.3)
def benchmarkLogisticFloatLabel(self):
  """Benchmarks a two-class DNNClassifier trained on float labels.

  After 1000 training steps the global step is range-checked, then metrics
  and predictions are reported through the benchmark helpers.
  """

  def _input_fn(num_epochs=None):
    age = input_lib.limit_epochs(
        constant_op.constant(((50,), (20,), (10,))), num_epochs=num_epochs)
    language = sparse_tensor.SparseTensor(
        values=input_lib.limit_epochs(
            ('en', 'fr', 'zh'), num_epochs=num_epochs),
        indices=((0, 0), (0, 1), (2, 0)),
        dense_shape=(3, 2))
    labels = constant_op.constant(
        ((0.8,), (0.,), (0.2,)), dtype=dtypes.float32)
    return {'age': age, 'language': language}, labels

  lang_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  n_classes = 2
  columns = (
      feature_column.embedding_column(lang_column, dimension=1),
      feature_column.real_valued_column('age'),
  )
  classifier = dnn.DNNClassifier(
      n_classes=n_classes,
      feature_columns=columns,
      hidden_units=(3, 3),
      config=run_config.RunConfig(tf_random_seed=1))

  steps = 1000
  fitted = classifier.fit(input_fn=_input_fn, steps=steps)
  metrics = fitted.evaluate(input_fn=_input_fn, steps=1)
  estimator_test_utils.assert_in_range(steps, steps + 5, 'global_step',
                                       metrics)

  # Prediction probabilities mirror the labels column, which proves that the
  # classifier learns from float input.
  self._report_metrics(metrics)
  self._report_predictions(
      classifier=classifier,
      input_fn=functools.partial(_input_fn, num_epochs=1),
      iters=metrics['global_step'],
      n_examples=3,
      n_classes=n_classes,
      expected_probabilities=((0.2, 0.8), (1., 0.), (0.8, 0.2)),
      expected_classes=(1, 0, 0),
      benchmark_name_override=(
          'DNNClassifierBenchmark.benchmarkLogisticFloatLabel_predictions'))
def benchmarkLogisticFloatLabel(self):
  """Trains a binary DNNClassifier on float-valued labels and reports results.

  The labels are 0.8, 0.0 and 0.2; the expected class-1 probabilities passed
  to `_report_predictions` are the same values.
  """

  def _input_fn(num_epochs=None):
    feature_map = {
        'age':
            input_lib.limit_epochs(
                constant_op.constant(((50,), (20,), (10,))),
                num_epochs=num_epochs),
        'language':
            sparse_tensor.SparseTensor(
                values=input_lib.limit_epochs(
                    ('en', 'fr', 'zh'), num_epochs=num_epochs),
                indices=((0, 0), (0, 1), (2, 0)),
                dense_shape=(3, 2)),
    }
    float_labels = constant_op.constant(
        ((0.8,), (0.,), (0.2,)), dtype=dtypes.float32)
    return feature_map, float_labels

  n_classes = 2
  steps = 1000
  lang_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  embedded_language = feature_column.embedding_column(
      lang_column, dimension=1)
  age_column = feature_column.real_valued_column('age')

  classifier = dnn.DNNClassifier(
      n_classes=n_classes,
      feature_columns=(embedded_language, age_column),
      hidden_units=(3, 3),
      config=run_config.RunConfig(tf_random_seed=1))
  trained = classifier.fit(input_fn=_input_fn, steps=steps)
  metrics = trained.evaluate(input_fn=_input_fn, steps=1)
  estimator_test_utils.assert_in_range(steps, steps + 5, 'global_step',
                                       metrics)

  # Prediction probabilities mirror the labels column, which proves that the
  # classifier learns from float input.
  self._report_metrics(metrics)
  single_epoch_input_fn = functools.partial(_input_fn, num_epochs=1)
  self._report_predictions(
      classifier=classifier,
      input_fn=single_epoch_input_fn,
      iters=metrics['global_step'],
      n_examples=3,
      n_classes=n_classes,
      expected_probabilities=((0.2, 0.8), (1., 0.), (0.8, 0.2)),
      expected_classes=(1, 0, 0),
      benchmark_name_override=(
          'DNNClassifierBenchmark.benchmarkLogisticFloatLabel_predictions'))
def testTrainSaveLoad(self):
  """Ensures a trained classifier can be saved and reloaded from model_dir.

  A DNNClassifier is trained for a few steps, then a second classifier is
  built on the same `model_dir`; both must produce the same predictions.
  """

  def _input_fn(num_epochs=None):
    age = input_lib.limit_epochs(
        constant_op.constant([[.8], [.2], [.1]]), num_epochs=num_epochs)
    language = sparse_tensor.SparseTensor(
        values=input_lib.limit_epochs(
            ['en', 'fr', 'zh'], num_epochs=num_epochs),
        indices=[[0, 0], [0, 1], [2, 0]],
        dense_shape=[3, 2])
    labels = constant_op.constant([[1], [0], [0]], dtype=dtypes.int32)
    return {'age': age, 'language': language}, labels

  sparse_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  feature_columns = [
      feature_column.embedding_column(sparse_column, dimension=1)
  ]
  model_dir = tempfile.mkdtemp()

  def _new_classifier():
    # Both estimators share the same directory and hyper-parameters.
    return dnn.DNNClassifier(
        model_dir=model_dir,
        n_classes=3,
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=run_config.RunConfig(tf_random_seed=1))

  classifier = _new_classifier()
  classifier.fit(input_fn=_input_fn, steps=5)
  predict_input_fn = functools.partial(_input_fn, num_epochs=1)
  predictions1 = classifier.predict(input_fn=predict_input_fn)
  del classifier

  classifier2 = _new_classifier()
  predictions2 = classifier2.predict(input_fn=predict_input_fn)
  self.assertEqual(list(predictions1), list(predictions2))
def testTrainSaveLoad(self):
  """Ensures a trained regressor can be saved and reloaded from model_dir.

  A DNNRegressor is trained for a few steps, then a second regressor is
  built on the same `model_dir`; both must produce close predictions.
  """

  def _input_fn(num_epochs=None):
    age = input_lib.limit_epochs(
        constant_op.constant([[0.8], [0.15], [0.]]), num_epochs=num_epochs)
    language = sparse_tensor.SparseTensor(
        values=input_lib.limit_epochs(
            ['en', 'fr', 'zh'], num_epochs=num_epochs),
        indices=[[0, 0], [0, 1], [2, 0]],
        dense_shape=[3, 2])
    labels = constant_op.constant([1., 0., 0.2], dtype=dtypes.float32)
    return {'age': age, 'language': language}, labels

  sparse_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  feature_columns = [
      feature_column.embedding_column(sparse_column, dimension=1),
      feature_column.real_valued_column('age'),
  ]
  model_dir = tempfile.mkdtemp()

  def _new_regressor():
    # Both estimators share the same directory and hyper-parameters.
    return dnn.DNNRegressor(
        model_dir=model_dir,
        feature_columns=feature_columns,
        hidden_units=[3, 3],
        config=run_config.RunConfig(tf_random_seed=1))

  regressor = _new_regressor()
  regressor.fit(input_fn=_input_fn, steps=5)
  predict_input_fn = functools.partial(_input_fn, num_epochs=1)
  predictions = list(regressor.predict(input_fn=predict_input_fn))
  del regressor

  regressor2 = _new_regressor()
  predictions2 = list(regressor2.predict(input_fn=predict_input_fn))
  self.assertAllClose(predictions, predictions2)
def benchmarkLogisticTensorData(self):
  """Benchmarks a DNNClassifier on integer labels fed as tensors.

  After 100 training steps the global step, accuracy and loss are
  range-checked, then metrics and predictions are reported.
  """

  def _input_fn(num_epochs=None):
    age = input_lib.limit_epochs(
        constant_op.constant(((.8,), (0.2,), (.1,))), num_epochs=num_epochs)
    language = sparse_tensor.SparseTensor(
        values=input_lib.limit_epochs(
            ('en', 'fr', 'zh'), num_epochs=num_epochs),
        indices=((0, 0), (0, 1), (2, 0)),
        dense_shape=(3, 2))
    labels = constant_op.constant(((1,), (0,), (0,)), dtype=dtypes.int32)
    return {'age': age, 'language': language}, labels

  lang_column = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=20)
  columns = (
      feature_column.embedding_column(lang_column, dimension=1),
      feature_column.real_valued_column('age'),
  )
  classifier = dnn.DNNClassifier(
      feature_columns=columns,
      hidden_units=(3, 3),
      config=run_config.RunConfig(tf_random_seed=1))

  steps = 100
  fitted = classifier.fit(input_fn=_input_fn, steps=steps)
  metrics = fitted.evaluate(input_fn=_input_fn, steps=1)

  # (lower bound, upper bound, metric key), checked in this order.
  expected_ranges = (
      (steps, steps + 5, 'global_step'),
      (0.9, 1.0, 'accuracy'),
      (0.0, 0.3, 'loss'),
  )
  for lower, upper, key in expected_ranges:
    estimator_test_utils.assert_in_range(lower, upper, key, metrics)

  self._report_metrics(metrics)
  self._report_predictions(
      classifier=classifier,
      input_fn=functools.partial(_input_fn, num_epochs=1),
      iters=metrics['global_step'],
      n_examples=3,
      n_classes=2,
      expected_classes=(1, 0, 0),
      benchmark_name_override=(
          'DNNClassifierBenchmark.benchmarkLogisticTensorData_predictions'))
def benchmarkPartitionedVariables(self):
  """Benchmarks DNNLinearCombinedClassifier with a very large hashed column.

  A RunConfig is created while TF_CONFIG advertises two fake parameter
  servers, and its cluster spec is then replaced by an empty one before the
  classifier is trained and evaluated.
  """

  def _input_fn():
    language = sparse_tensor.SparseTensor(
        values=('en', 'fr', 'zh'),
        indices=((0, 0), (0, 1), (2, 0)),
        dense_shape=(3, 2))
    labels = constant_op.constant(((1,), (0,), (0,)))
    return {'language': language}, labels

  # The given hash_bucket_size results in variables larger than the
  # default min_slice_size attribute, so the variables are partitioned.
  sparse_feature = feature_column.sparse_column_with_hash_bucket(
      'language', hash_bucket_size=2e7)
  embedding_feature = feature_column.embedding_column(
      sparse_feature, dimension=1)

  fake_ps_tasks = ['fake_ps_0', 'fake_ps_1']
  tf_config = {'cluster': {run_config.TaskType.PS: fake_ps_tasks}}
  patched_environ = {'TF_CONFIG': json.dumps(tf_config)}
  with test.mock.patch.dict('os.environ', patched_environ):
    config = run_config.RunConfig()
    # Because we did not start a distributed cluster, we need to pass an
    # empty ClusterSpec, otherwise the device_setter will look for
    # distributed jobs, such as "/job:ps" which are not present.
    config._cluster_spec = server_lib.ClusterSpec({})

  classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
      linear_feature_columns=(sparse_feature,),
      dnn_feature_columns=(embedding_feature,),
      dnn_hidden_units=(3, 3),
      config=config)

  fitted = classifier.fit(input_fn=_input_fn, steps=_ITERS)
  metrics = fitted.evaluate(input_fn=_input_fn, steps=100)
  self._assertCommonMetrics(metrics)
def testMixedFeaturesArbitraryWeightsPartitioned(self):
  """Checks SDCALinearRegressor on mixed features with partitioned variables.

  Uses real-valued, bucketized, hashed sparse and crossed columns together
  with per-example weights and a two-shard fixed-size partitioner, and
  expects the evaluation loss to drop below 0.05 after 20 steps.
  """

  def input_fn():
    features = {
        'example_id': constant_op.constant(['1', '2', '3']),
        'price': constant_op.constant([[0.6], [0.8], [0.3]]),
        'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
        'country': sparse_tensor.SparseTensor(
            values=['IT', 'US', 'GB'],
            indices=[[0, 0], [1, 3], [2, 1]],
            dense_shape=[3, 5]),
        'weights': constant_op.constant([[3.0], [5.0], [7.0]]),
    }
    labels = constant_op.constant([[1.55], [-1.25], [-3.0]])
    return features, labels

  with self._single_threaded_test_session():
    price = feature_column_lib.real_valued_column('price')
    sq_footage_bucket = feature_column_lib.bucketized_column(
        feature_column_lib.real_valued_column('sq_footage'),
        boundaries=[650.0, 800.0])
    country = feature_column_lib.sparse_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    sq_footage_country = feature_column_lib.crossed_column(
        [sq_footage_bucket, country], hash_bucket_size=10)
    all_columns = [price, sq_footage_bucket, country, sq_footage_country]

    two_shards = partitioned_variables.fixed_size_partitioner(
        num_shards=2, axis=0)
    regressor = sdca_estimator.SDCALinearRegressor(
        example_id_column='example_id',
        feature_columns=all_columns,
        l2_regularization=1.0,
        weight_column_name='weights',
        partitioner=two_shards)
    regressor.fit(input_fn=input_fn, steps=20)

    evaluation = regressor.evaluate(input_fn=input_fn, steps=1)
    self.assertLess(evaluation['loss'], 0.05)
def testPartitionedMixedFeatures(self):
  """Checks SDCALogisticClassifier on mixed features with partitioning.

  Uses real-valued, bucketized, hashed sparse and crossed columns together
  with per-example weights and a two-shard fixed-size partitioner, and
  expects accuracy above 0.9 after 50 steps.
  """

  def input_fn():
    features = {
        'example_id': constant_op.constant(['1', '2', '3']),
        'price': constant_op.constant([[0.6], [0.8], [0.3]]),
        'sq_footage': constant_op.constant([900.0, 700.0, 600.0]),
        'country': sparse_tensor.SparseTensor(
            values=['IT', 'US', 'GB'],
            indices=[[0, 0], [1, 3], [2, 1]],
            dense_shape=[3, 5]),
        'weights': constant_op.constant([[3.0], [1.0], [1.0]]),
    }
    labels = constant_op.constant([[1], [0], [1]])
    return features, labels

  with self._single_threaded_test_session():
    price = feature_column_lib.real_valued_column('price')
    sq_footage_bucket = feature_column_lib.bucketized_column(
        feature_column_lib.real_valued_column('sq_footage'),
        boundaries=[650.0, 800.0])
    country = feature_column_lib.sparse_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    sq_footage_country = feature_column_lib.crossed_column(
        [sq_footage_bucket, country], hash_bucket_size=10)
    all_columns = [price, sq_footage_bucket, country, sq_footage_country]

    two_shards = partitioned_variables.fixed_size_partitioner(
        num_shards=2, axis=0)
    classifier = sdca_estimator.SDCALogisticClassifier(
        example_id_column='example_id',
        feature_columns=all_columns,
        weight_column_name='weights',
        partitioner=two_shards)
    classifier.fit(input_fn=input_fn, steps=50)

    metrics = classifier.evaluate(input_fn=input_fn, steps=1)
    self.assertGreater(metrics['accuracy'], 0.9)
def testMixedFeatures(self):
  """Checks the SVM classifier on a mix of feature column types.

  Uses real-valued, bucketized, hashed sparse and crossed columns with
  per-example weights plus L1 and L2 regularization, and expects accuracy
  of 1.0 (to three places) after 30 steps.
  """

  def input_fn():
    features = {
        'example_id': constant_op.constant(['1', '2', '3']),
        'price': constant_op.constant([[0.6], [0.8], [0.3]]),
        'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
        'country': sparse_tensor.SparseTensor(
            values=['IT', 'US', 'GB'],
            indices=[[0, 0], [1, 3], [2, 1]],
            dense_shape=[3, 5]),
        'weights': constant_op.constant([[3.0], [1.0], [1.0]]),
    }
    labels = constant_op.constant([[1], [0], [1]])
    return features, labels

  price = feature_column.real_valued_column('price')
  sq_footage_bucket = feature_column.bucketized_column(
      feature_column.real_valued_column('sq_footage'),
      boundaries=[650.0, 800.0])
  country = feature_column.sparse_column_with_hash_bucket(
      'country', hash_bucket_size=5)
  sq_footage_country = feature_column.crossed_column(
      [sq_footage_bucket, country], hash_bucket_size=10)
  all_columns = [price, sq_footage_bucket, country, sq_footage_country]

  svm_classifier = svm.SVM(
      feature_columns=all_columns,
      example_id_column='example_id',
      weight_column_name='weights',
      l1_regularization=0.1,
      l2_regularization=1.0)
  svm_classifier.fit(input_fn=input_fn, steps=30)

  evaluation = svm_classifier.evaluate(input_fn=input_fn, steps=1)
  self.assertAlmostEqual(evaluation['accuracy'], 1.0, places=3)
def testInitCrossedColumnWeightsFromCkpt(self):
  """Checks that crossed-column weights can be initialized from a checkpoint.

  The test first builds crossed-column weights under the "run_1" scope,
  bumps them away from their initial value and saves a checkpoint. It then
  builds a second crossed column that points at that checkpoint under the
  "run_2" scope and verifies that the loaded weights equal the saved ones.
  """
  sparse_col_1 = fc.sparse_column_with_hash_bucket(
      column_name="col_1", hash_bucket_size=4)
  sparse_col_2 = fc.sparse_column_with_hash_bucket(
      column_name="col_2", hash_bucket_size=4)

  crossed_col = fc.crossed_column(
      columns=[sparse_col_1, sparse_col_2], hash_bucket_size=4)

  input_tensor = sparse_tensor_lib.SparseTensor(
      indices=[[0, 0], [1, 1], [2, 2], [3, 3]],
      values=[0, 1, 2, 3],
      dense_shape=[4, 4])

  # Invoking 'weighted_sum_from_feature_columns' will create the crossed
  # column weights variable.
  with variable_scope.variable_scope("run_1"):
    with variable_scope.variable_scope(crossed_col.name):
      # Returns looked up column weights which is same as crossed column
      # weights as well as actual references to weights variables.
      _, col_weights, _ = (
          feature_column_ops.weighted_sum_from_feature_columns({
              sparse_col_1.name: input_tensor,
              sparse_col_2.name: input_tensor
          }, [crossed_col], 1))
      # Update the weights since default initializer initializes all weights
      # to 0.0. Every assign op is kept (not just the last one created) so
      # that all returned weights are updated before saving.
      assign_ops = [
          state_ops.assign(weight[0], weight[0] + 0.5)
          for weight in col_weights.values()
      ]

  save = saver.Saver()
  ckpt_dir_prefix = os.path.join(self.get_temp_dir(),
                                 "init_crossed_col_w_from_ckpt")
  ckpt_dir = tempfile.mkdtemp(prefix=ckpt_dir_prefix)
  checkpoint_path = os.path.join(ckpt_dir, "model.ckpt")

  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(assign_ops)
    saved_col_weights = col_weights[crossed_col][0].eval()
    save.save(sess, checkpoint_path)

  crossed_col_initialized = fc.crossed_column(
      columns=[sparse_col_1, sparse_col_2],
      hash_bucket_size=4,
      ckpt_to_load_from=checkpoint_path,
      tensor_name_in_ckpt=("run_1/col_1_X_col_2/"
                           "weighted_sum_from_feature_columns/"
                           "col_1_X_col_2/weights"))

  with variable_scope.variable_scope("run_2"):
    # This will initialize the crossed column weights from provided checkpoint
    # and return a [4, 1] tensor which is same as weights variable. Since we
    # won't modify weights, this should be same as 'saved_col_weights'.
    _, col_weights, _ = (
        feature_column_ops.weighted_sum_from_feature_columns({
            sparse_col_1.name: input_tensor,
            sparse_col_2.name: input_tensor
        }, [crossed_col_initialized], 1))
    col_weights_from_ckpt = col_weights[crossed_col_initialized][0]

  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    loaded_col_weights = col_weights_from_ckpt.eval()

  self.assertAllClose(saved_col_weights, loaded_col_weights)
def testImmutability(self):
  """Assigning to an attribute of a hashed sparse column must fail."""
  column = fc.sparse_column_with_hash_bucket("aaa", hash_bucket_size=100)
  with self.assertRaises(AttributeError):
    column.column_name = "bbb"
def testCreateFeatureSpec(self):
  """Checks the parsing spec produced for a mix of feature column types.

  The spec returned by `fc.create_feature_spec_for_parsing` must match the
  expected one whether the columns are passed as a set or as a dictionary.
  """

  def _fixed_float(length):
    return parsing_ops.FixedLenFeature([length], dtype=dtypes.float32)

  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  sparse_id_col = fc.sparse_column_with_keys(
      "id_column", ["marlo", "omar", "stringer"])
  weighted_id_col = fc.weighted_sparse_column(sparse_id_col,
                                              "id_weights_column")
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
  real_valued_col3 = fc.real_valued_column(
      "real_valued_column3", dimension=None)
  bucketized_col1 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
  bucketized_col2 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization2", 4),
      [0, 4])
  cross_left = fc.sparse_column_with_hash_bucket(
      "cross_aaa", hash_bucket_size=100)
  cross_right = fc.sparse_column_with_hash_bucket(
      "cross_bbb", hash_bucket_size=100)
  cross_col = fc.crossed_column(
      {cross_left, cross_right}, hash_bucket_size=10000)

  feature_columns = {
      sparse_col, embedding_col, weighted_id_col, real_valued_col1,
      real_valued_col2, real_valued_col3, bucketized_col1, bucketized_col2,
      cross_col
  }
  expected_config = {
      "sparse_column": parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding": parsing_ops.VarLenFeature(dtypes.string),
      "id_column": parsing_ops.VarLenFeature(dtypes.string),
      "id_weights_column": parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1": _fixed_float(1),
      "real_valued_column2": _fixed_float(5),
      "real_valued_column3": parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_column_for_bucketization1": _fixed_float(1),
      "real_valued_column_for_bucketization2": _fixed_float(4),
      "cross_aaa": parsing_ops.VarLenFeature(dtypes.string),
      "cross_bbb": parsing_ops.VarLenFeature(dtypes.string),
  }

  spec_from_set = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertDictEqual(expected_config, spec_from_set)

  # Test that the same config is parsed out if we pass a dictionary.
  feature_columns_dict = {
      str(position): column
      for position, column in enumerate(feature_columns)
  }
  spec_from_dict = fc.create_feature_spec_for_parsing(feature_columns_dict)
  self.assertDictEqual(expected_config, spec_from_dict)
def testCreateFeatureSpec(self):
  """Checks the parsing spec produced for a mix of feature column types.

  Covers hashed, keyed (string/int32/int64), weighted, real-valued,
  variable-length, bucketized, crossed, one-hot and scattered-embedding
  columns. The spec returned by `fc.create_feature_spec_for_parsing` must
  match the expected one for both a set and a dictionary of columns.
  """

  def _fixed_float(length):
    return parsing_ops.FixedLenFeature([length], dtype=dtypes.float32)

  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  str_sparse_id_col = fc.sparse_column_with_keys(
      "str_id_column", ["marlo", "omar", "stringer"])
  int32_sparse_id_col = fc.sparse_column_with_keys(
      "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
  int64_sparse_id_col = fc.sparse_column_with_keys(
      "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
  weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                              "str_id_weights_column")
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
  real_valued_col3 = fc._real_valued_var_len_column(
      "real_valued_column3", is_sparse=True)
  real_valued_col4 = fc._real_valued_var_len_column(
      "real_valued_column4",
      dtype=dtypes.int64,
      default_value=0,
      is_sparse=False)
  bucketized_col1 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
  bucketized_col2 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization2", 4),
      [0, 4])
  cross_left = fc.sparse_column_with_hash_bucket(
      "cross_aaa", hash_bucket_size=100)
  cross_right = fc.sparse_column_with_hash_bucket(
      "cross_bbb", hash_bucket_size=100)
  cross_col = fc.crossed_column(
      {cross_left, cross_right}, hash_bucket_size=10000)
  one_hot_col = fc.one_hot_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_one_hot", hash_bucket_size=100))
  scattered_embedding_col = fc.scattered_embedding_column(
      "scattered_embedding_column", size=100, dimension=10, hash_key=1)

  feature_columns = {
      sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
      int64_sparse_id_col, real_valued_col1, real_valued_col2,
      real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2,
      cross_col, one_hot_col, scattered_embedding_col
  }
  expected_config = {
      "sparse_column": parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding": parsing_ops.VarLenFeature(dtypes.string),
      "str_id_column": parsing_ops.VarLenFeature(dtypes.string),
      "int32_id_column": parsing_ops.VarLenFeature(dtypes.int32),
      "int64_id_column": parsing_ops.VarLenFeature(dtypes.int64),
      "str_id_weights_column": parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1": _fixed_float(1),
      "real_valued_column2": _fixed_float(5),
      "real_valued_column3": parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_column4": parsing_ops.FixedLenSequenceFeature(
          [], dtype=dtypes.int64, allow_missing=True, default_value=0),
      "real_valued_column_for_bucketization1": _fixed_float(1),
      "real_valued_column_for_bucketization2": _fixed_float(4),
      "cross_aaa": parsing_ops.VarLenFeature(dtypes.string),
      "cross_bbb": parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_one_hot": parsing_ops.VarLenFeature(dtypes.string),
      "scattered_embedding_column": parsing_ops.VarLenFeature(dtypes.string),
  }

  spec_from_set = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertDictEqual(expected_config, spec_from_set)

  # Test that the same config is parsed out if we pass a dictionary.
  feature_columns_dict = {
      str(position): column
      for position, column in enumerate(feature_columns)
  }
  spec_from_dict = fc.create_feature_spec_for_parsing(feature_columns_dict)
  self.assertDictEqual(expected_config, spec_from_dict)
def testCreateFeatureSpec(self):
  """Checks the parsing spec produced for a mix of feature column types.

  The same expected spec must be returned by
  `fc.create_feature_spec_for_parsing` (for a set and for a dictionary of
  columns) and by `fc_core.make_parse_example_spec`.
  """

  def _fixed_float(length):
    return parsing_ops.FixedLenFeature([length], dtype=dtypes.float32)

  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  str_sparse_id_col = fc.sparse_column_with_keys(
      "str_id_column", ["marlo", "omar", "stringer"])
  int32_sparse_id_col = fc.sparse_column_with_keys(
      "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
  int64_sparse_id_col = fc.sparse_column_with_keys(
      "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
  weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                              "str_id_weights_column")
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
  bucketized_col1 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
  bucketized_col2 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization2", 4),
      [0, 4])
  cross_left = fc.sparse_column_with_hash_bucket(
      "cross_aaa", hash_bucket_size=100)
  cross_right = fc.sparse_column_with_hash_bucket(
      "cross_bbb", hash_bucket_size=100)
  cross_col = fc.crossed_column(
      {cross_left, cross_right}, hash_bucket_size=10000)
  one_hot_col = fc.one_hot_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_one_hot", hash_bucket_size=100))
  scattered_embedding_col = fc.scattered_embedding_column(
      "scattered_embedding_column", size=100, dimension=10, hash_key=1)

  feature_columns = {
      sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
      int64_sparse_id_col, real_valued_col1, real_valued_col2,
      bucketized_col1, bucketized_col2, cross_col, one_hot_col,
      scattered_embedding_col
  }
  expected_config = {
      "sparse_column": parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding": parsing_ops.VarLenFeature(dtypes.string),
      "str_id_column": parsing_ops.VarLenFeature(dtypes.string),
      "int32_id_column": parsing_ops.VarLenFeature(dtypes.int32),
      "int64_id_column": parsing_ops.VarLenFeature(dtypes.int64),
      "str_id_weights_column": parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1": _fixed_float(1),
      "real_valued_column2": _fixed_float(5),
      "real_valued_column_for_bucketization1": _fixed_float(1),
      "real_valued_column_for_bucketization2": _fixed_float(4),
      "cross_aaa": parsing_ops.VarLenFeature(dtypes.string),
      "cross_bbb": parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_one_hot": parsing_ops.VarLenFeature(dtypes.string),
      "scattered_embedding_column": parsing_ops.VarLenFeature(dtypes.string),
  }

  spec_from_set = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertDictEqual(expected_config, spec_from_set)

  # Tests that contrib feature columns work with core library:
  config_core = fc_core.make_parse_example_spec(feature_columns)
  self.assertDictEqual(expected_config, config_core)

  # Test that the same config is parsed out if we pass a dictionary.
  feature_columns_dict = {
      str(position): column
      for position, column in enumerate(feature_columns)
  }
  spec_from_dict = fc.create_feature_spec_for_parsing(feature_columns_dict)
  self.assertDictEqual(expected_config, spec_from_dict)
def testSdcaOptimizerSparseFeaturesWithL1Reg(self):
  """SDCALinearRegressor works with sparse features and L1 regularization.

  Two regressors are trained on the same data, one without and one with L1
  regularization. The unregularized model must reach the lower loss, while
  the regularized model must end up with the smaller L1 weight norm.
  """

  def input_fn():
    features = {
        'example_id': constant_op.constant(['1', '2', '3']),
        'price': constant_op.constant([0.4, 0.6, 0.3]),
        'country': sparse_tensor.SparseTensor(
            values=['IT', 'US', 'GB'],
            indices=[[0, 0], [1, 3], [2, 1]],
            dense_shape=[3, 5]),
        'weights': constant_op.constant([[10.0], [10.0], [10.0]]),
    }
    labels = constant_op.constant([[1.4], [-0.8], [2.6]])
    return features, labels

  with self._single_threaded_test_session():
    price = feature_column_lib.real_valued_column('price')
    country = feature_column_lib.sparse_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    weight_variable_names = ('linear/price/weight', 'linear/country/weights')

    def _read_weights(model):
      return {
          name: model.get_variable_value(name)
          for name in weight_variable_names
      }

    # Regressor with no L1 regularization.
    regressor = sdca_estimator.SDCALinearRegressor(
        example_id_column='example_id',
        feature_columns=[price, country],
        weight_column_name='weights')
    regressor.fit(input_fn=input_fn, steps=20)
    no_l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
    variable_names = regressor.get_variable_names()
    for name in weight_variable_names:
      self.assertIn(name, variable_names)
    no_l1_reg_weights = _read_weights(regressor)

    # Regressor with L1 regularization.
    regressor = sdca_estimator.SDCALinearRegressor(
        example_id_column='example_id',
        feature_columns=[price, country],
        l1_regularization=1.0,
        weight_column_name='weights')
    regressor.fit(input_fn=input_fn, steps=20)
    l1_reg_loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
    l1_reg_weights = _read_weights(regressor)

    # Unregularized loss is lower when there is no L1 regularization.
    self.assertLess(no_l1_reg_loss, l1_reg_loss)
    self.assertLess(no_l1_reg_loss, 0.05)

    # But weights returned by the regressor with L1 regularization have
    # smaller L1 norm.
    l1_reg_weights_norm = 0.0
    no_l1_reg_weights_norm = 0.0
    for var_name in sorted(l1_reg_weights):
      regularized = l1_reg_weights[var_name].flatten()
      unregularized = no_l1_reg_weights[var_name].flatten()
      l1_reg_weights_norm += sum(np.absolute(regularized))
      no_l1_reg_weights_norm += sum(np.absolute(unregularized))
      print('Var name: %s, value: %s' %
            (var_name, no_l1_reg_weights[var_name].flatten()))
    self.assertLess(l1_reg_weights_norm, no_l1_reg_weights_norm)