def testBucketizedFeatures(self): """Tests SVM classifier with bucketized features.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': constant_op.constant([[600.0], [800.0], [400.0]]), 'sq_footage': constant_op.constant([[1000.0], [800.0], [500.0]]), 'weights': constant_op.constant([[1.0], [1.0], [1.0]]) }, constant_op.constant([[1], [0], [1]]) price_bucket = feature_column.bucketized_column( feature_column.real_valued_column('price'), boundaries=[500.0, 700.0]) sq_footage_bucket = feature_column.bucketized_column( feature_column.real_valued_column('sq_footage'), boundaries=[650.0]) svm_classifier = svm.SVM( feature_columns=[price_bucket, sq_footage_bucket], example_id_column='example_id', l1_regularization=0.1, l2_regularization=1.0) svm_classifier.fit(input_fn=input_fn, steps=30) accuracy = svm_classifier.evaluate(input_fn=input_fn, steps=1)['accuracy'] self.assertAlmostEqual(accuracy, 1.0, places=3)
def testBucketizedFeatures(self): """Tests SDCALogisticClassifier with bucketized features.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': constant_op.constant([600.0, 1000.0, 400.0]), 'sq_footage': constant_op.constant([[1000.0], [600.0], [700.0]]), 'weights': constant_op.constant([[1.0], [1.0], [1.0]]) }, constant_op.constant([[1], [0], [1]]) with self._single_threaded_test_session(): price_bucket = feature_column_lib.bucketized_column( feature_column_lib.real_valued_column('price'), boundaries=[500.0, 700.0]) sq_footage_bucket = feature_column_lib.bucketized_column( feature_column_lib.real_valued_column('sq_footage'), boundaries=[650.0]) classifier = sdca_estimator.SDCALogisticClassifier( example_id_column='example_id', feature_columns=[price_bucket, sq_footage_bucket], weight_column_name='weights', l2_regularization=1.0) classifier.fit(input_fn=input_fn, steps=50) metrics = classifier.evaluate(input_fn=input_fn, steps=1) self.assertGreater(metrics['accuracy'], 0.9)
def testBucketizedColumnRequiresRealValuedColumn(self):
  with self.assertRaisesRegexp(
      TypeError, "source_column must be an instance of _RealValuedColumn"):
    fc.bucketized_column("bbb", [0])
  with self.assertRaisesRegexp(
      TypeError, "source_column must be an instance of _RealValuedColumn"):
    fc.bucketized_column(
        fc.sparse_column_with_integerized_feature(
            column_name="bbb", bucket_size=10), [0])
def testMakePlaceHolderTensorsForBaseFeatures(self): sparse_col = fc.sparse_column_with_hash_bucket( "sparse_column", hash_bucket_size=100) real_valued_col = fc.real_valued_column("real_valued_column", 5) vlen_real_valued_col = fc.real_valued_column( "vlen_real_valued_column", dimension=None) bucketized_col = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization"), [0, 4]) feature_columns = set( [sparse_col, real_valued_col, vlen_real_valued_col, bucketized_col]) placeholders = ( fc.make_place_holder_tensors_for_base_features(feature_columns)) self.assertEqual(4, len(placeholders)) self.assertTrue( isinstance(placeholders["sparse_column"], sparse_tensor_lib.SparseTensor)) self.assertTrue( isinstance(placeholders["vlen_real_valued_column"], sparse_tensor_lib.SparseTensor)) placeholder = placeholders["real_valued_column"] self.assertGreaterEqual( placeholder.name.find(u"Placeholder_real_valued_column"), 0) self.assertEqual(dtypes.float32, placeholder.dtype) self.assertEqual([None, 5], placeholder.get_shape().as_list()) placeholder = placeholders["real_valued_column_for_bucketization"] self.assertGreaterEqual( placeholder.name.find( u"Placeholder_real_valued_column_for_bucketization"), 0) self.assertEqual(dtypes.float32, placeholder.dtype) self.assertEqual([None, 1], placeholder.get_shape().as_list())
def testMixedFeatures(self): """Tests SDCALogisticClassifier with a mix of features.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': constant_op.constant([[0.6], [0.8], [0.3]]), 'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]), 'country': sparse_tensor.SparseTensor( values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 3], [2, 1]], dense_shape=[3, 5]), 'weights': constant_op.constant([[3.0], [1.0], [1.0]]) }, constant_op.constant([[1], [0], [1]]) price = feature_column_lib.real_valued_column('price') sq_footage_bucket = feature_column_lib.bucketized_column( feature_column_lib.real_valued_column('sq_footage'), boundaries=[650.0, 800.0]) country = feature_column_lib.sparse_column_with_hash_bucket( 'country', hash_bucket_size=5) sq_footage_country = feature_column_lib.crossed_column( [sq_footage_bucket, country], hash_bucket_size=10) classifier = sdca_estimator.SDCALogisticClassifier( example_id_column='example_id', feature_columns=[price, sq_footage_bucket, country, sq_footage_country], weight_column_name='weights') classifier.fit(input_fn=input_fn, steps=50) metrics = classifier.evaluate(input_fn=input_fn, steps=1) self.assertGreater(metrics['accuracy'], 0.9)
def testMakePlaceHolderTensorsForBaseFeatures(self): sparse_col = fc.sparse_column_with_hash_bucket("sparse_column", hash_bucket_size=100) real_valued_col = fc.real_valued_column("real_valued_column", 5) vlen_real_valued_col = fc.real_valued_column("vlen_real_valued_column", dimension=None) bucketized_col = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization"), [0, 4]) feature_columns = set([ sparse_col, real_valued_col, vlen_real_valued_col, bucketized_col ]) placeholders = ( fc.make_place_holder_tensors_for_base_features(feature_columns)) self.assertEqual(4, len(placeholders)) self.assertTrue( isinstance(placeholders["sparse_column"], sparse_tensor_lib.SparseTensor)) self.assertTrue( isinstance(placeholders["vlen_real_valued_column"], sparse_tensor_lib.SparseTensor)) placeholder = placeholders["real_valued_column"] self.assertGreaterEqual( placeholder.name.find(u"Placeholder_real_valued_column"), 0) self.assertEqual(dtypes.float32, placeholder.dtype) self.assertEqual([None, 5], placeholder.get_shape().as_list()) placeholder = placeholders["real_valued_column_for_bucketization"] self.assertGreaterEqual( placeholder.name.find( u"Placeholder_real_valued_column_for_bucketization"), 0) self.assertEqual(dtypes.float32, placeholder.dtype) self.assertEqual([None, 1], placeholder.get_shape().as_list())
def testMixedFeatures(self): """Tests SDCALogisticClassifier with a mix of features.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': constant_op.constant([[0.6], [0.8], [0.3]]), 'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]), 'country': sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 3], [2, 1]], dense_shape=[3, 5]), 'weights': constant_op.constant([[3.0], [1.0], [1.0]]) }, constant_op.constant([[1], [0], [1]]) price = feature_column_lib.real_valued_column('price') sq_footage_bucket = feature_column_lib.bucketized_column( feature_column_lib.real_valued_column('sq_footage'), boundaries=[650.0, 800.0]) country = feature_column_lib.sparse_column_with_hash_bucket( 'country', hash_bucket_size=5) sq_footage_country = feature_column_lib.crossed_column( [sq_footage_bucket, country], hash_bucket_size=10) classifier = SDCALogisticClassifier(example_id_column='example_id', feature_columns=[ price, sq_footage_bucket, country, sq_footage_country ], weight_column_name='weights') classifier.fit(input_fn=input_fn, steps=50) metrics = classifier.evaluate(input_fn=input_fn, steps=1) self.assertGreater(metrics['accuracy'], 0.9)
def testCrossedColumnNameCreatesSortedNames(self):
  a = fc.sparse_column_with_hash_bucket("aaa", hash_bucket_size=100)
  b = fc.sparse_column_with_hash_bucket("bbb", hash_bucket_size=100)
  bucket = fc.bucketized_column(fc.real_valued_column("cost"), [0, 4])
  crossed = fc.crossed_column(set([b, bucket, a]), hash_bucket_size=10000)

  self.assertEqual("aaa_X_bbb_X_cost_bucketized", crossed.name,
                   "name should be generated by sorted column names")
  self.assertEqual("aaa", crossed.columns[0].name)
  self.assertEqual("bbb", crossed.columns[1].name)
  self.assertEqual("cost_bucketized", crossed.columns[2].name)
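# A minimal pure-Python sketch of the naming convention the test above
# checks: the cross name is assumed to be the sorted component names joined
# with "_X_", which makes fc.crossed_column insensitive to the order in which
# its component columns are passed. The helper name is hypothetical, for
# illustration only.
def _crossed_name_sketch(component_names):
  return "_X_".join(sorted(component_names))

# _crossed_name_sketch(["bbb", "cost_bucketized", "aaa"]) returns
# "aaa_X_bbb_X_cost_bucketized", matching the assertion above.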
def testCrossedColumnDeepCopy(self):
  a = fc.sparse_column_with_hash_bucket("aaa", hash_bucket_size=100)
  b = fc.sparse_column_with_hash_bucket("bbb", hash_bucket_size=100)
  bucket = fc.bucketized_column(fc.real_valued_column("cost"), [0, 4])
  crossed = fc.crossed_column(set([b, bucket, a]), hash_bucket_size=10000)
  crossed_copy = copy.deepcopy(crossed)

  self.assertEqual("aaa_X_bbb_X_cost_bucketized", crossed_copy.name,
                   "name should be generated by sorted column names")
  self.assertEqual("aaa", crossed_copy.columns[0].name)
  self.assertEqual("bbb", crossed_copy.columns[1].name)
  self.assertEqual("cost_bucketized", crossed_copy.columns[2].name)
def testBucketizedColumnDeepCopy(self): """Tests that we can do a deepcopy of a bucketized column. This test requires that the bucketized column also accept boundaries as tuples. """ bucketized = fc.bucketized_column(fc.real_valued_column("a"), [1., 2., 2., 3., 3.]) self.assertEqual(bucketized.name, "a_bucketized") self.assertTupleEqual(bucketized.boundaries, (1., 2., 3.)) bucketized_copy = copy.deepcopy(bucketized) self.assertEqual(bucketized_copy.name, "a_bucketized") self.assertTupleEqual(bucketized_copy.boundaries, (1., 2., 3.))
def testBucketizedColumnDeepCopy(self): """Tests that we can do a deepcopy of a bucketized column. This test requires that the bucketized column also accept boundaries as tuples. """ bucketized = fc.bucketized_column( fc.real_valued_column("a"), [1., 2., 2., 3., 3.]) self.assertEqual(bucketized.name, "a_bucketized") self.assertTupleEqual(bucketized.boundaries, (1., 2., 3.)) bucketized_copy = copy.deepcopy(bucketized) self.assertEqual(bucketized_copy.name, "a_bucketized") self.assertTupleEqual(bucketized_copy.boundaries, (1., 2., 3.))
def testBucketizedFeatures(self): """Tests SVM classifier with bucketized features.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': constant_op.constant([[600.0], [800.0], [400.0]]), 'sq_footage': constant_op.constant([[1000.0], [800.0], [500.0]]), 'weights': constant_op.constant([[1.0], [1.0], [1.0]]) }, constant_op.constant([[1], [0], [1]]) price_bucket = feature_column.bucketized_column( feature_column.real_valued_column('price'), boundaries=[500.0, 700.0]) sq_footage_bucket = feature_column.bucketized_column( feature_column.real_valued_column('sq_footage'), boundaries=[650.0]) svm_classifier = svm.SVM(feature_columns=[price_bucket, sq_footage_bucket], example_id_column='example_id', l1_regularization=0.1, l2_regularization=1.0) svm_classifier.fit(input_fn=input_fn, steps=30) accuracy = svm_classifier.evaluate(input_fn=input_fn, steps=1)['accuracy'] self.assertAlmostEqual(accuracy, 1.0, places=3)
def benchmarkTensorData(self):

  def _input_fn():
    iris = test_data.prepare_iris_data_for_logistic_regression()
    features = {}
    for i in range(4):
      # The following shows how to provide the Tensor data for
      # RealValuedColumns.
      features.update({
          str(i):
              array_ops.reshape(
                  constant_op.constant(iris.data[:, i], dtype=dtypes.float32),
                  (-1, 1))
      })
    # The following shows how to provide the SparseTensor data for
    # a SparseColumn.
    features['dummy_sparse_column'] = sparse_tensor.SparseTensor(
        values=('en', 'fr', 'zh'),
        indices=((0, 0), (0, 1), (60, 0)),
        dense_shape=(len(iris.target), 2))
    labels = array_ops.reshape(
        constant_op.constant(iris.target, dtype=dtypes.int32), (-1, 1))
    return features, labels

  iris = test_data.prepare_iris_data_for_logistic_regression()
  cont_features = [
      feature_column.real_valued_column(str(i)) for i in range(4)
  ]
  linear_features = [
      feature_column.bucketized_column(
          cont_features[i],
          test_data.get_quantile_based_buckets(iris.data[:, i], 10))
      for i in range(4)
  ]
  linear_features.append(
      feature_column.sparse_column_with_hash_bucket(
          'dummy_sparse_column', hash_bucket_size=100))

  classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
      model_dir=tempfile.mkdtemp(),
      linear_feature_columns=linear_features,
      dnn_feature_columns=cont_features,
      dnn_hidden_units=(3, 3))
  metrics = classifier.fit(input_fn=_input_fn, steps=_ITERS).evaluate(
      input_fn=_input_fn, steps=100)
  self._assertSingleClassMetrics(metrics)
def benchmarkMatrixData(self):
  iris = test_data.prepare_iris_data_for_logistic_regression()
  cont_feature = feature_column.real_valued_column('feature', dimension=4)
  bucketized_feature = feature_column.bucketized_column(
      cont_feature, test_data.get_quantile_based_buckets(iris.data, 10))

  classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
      model_dir=tempfile.mkdtemp(),
      linear_feature_columns=(bucketized_feature,),
      dnn_feature_columns=(cont_feature,),
      dnn_hidden_units=(3, 3))

  input_fn = test_data.iris_input_logistic_fn
  metrics = classifier.fit(input_fn=input_fn, steps=_ITERS).evaluate(
      input_fn=input_fn, steps=100)
  self._assertSingleClassMetrics(metrics)
def benchmarkMultiClass(self):
  iris = base.load_iris()
  cont_feature = feature_column.real_valued_column('feature', dimension=4)
  bucketized_feature = feature_column.bucketized_column(
      cont_feature, test_data.get_quantile_based_buckets(iris.data, 10))

  classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
      n_classes=3,
      linear_feature_columns=(bucketized_feature,),
      dnn_feature_columns=(cont_feature,),
      dnn_hidden_units=(3, 3))

  input_fn = test_data.iris_input_multiclass_fn
  metrics = classifier.fit(input_fn=input_fn, steps=_ITERS).evaluate(
      input_fn=input_fn, steps=100)
  self._assertCommonMetrics(metrics)
def test_make_parsing_export_strategy(self):
  """Only tests that an ExportStrategy instance is created."""
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_col = fc.embedding_column(
      fc.sparse_column_with_hash_bucket(
          "sparse_column_for_embedding", hash_bucket_size=10),
      dimension=4)
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  bucketized_col1 = fc.bucketized_column(
      fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4])
  feature_columns = [
      sparse_col, embedding_col, real_valued_col1, bucketized_col1
  ]

  export_strategy = saved_model_export_utils.make_parsing_export_strategy(
      feature_columns=feature_columns)
  self.assertTrue(
      isinstance(export_strategy, export_strategy_lib.ExportStrategy))
def testMixedFeaturesArbitraryWeightsPartitioned(self):
  """Tests SDCALinearRegressor works with a mix of features (partitioned)."""

  def input_fn():
    return {
        'example_id': constant_op.constant(['1', '2', '3']),
        'price': constant_op.constant([[0.6], [0.8], [0.3]]),
        'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]),
        'country':
            sparse_tensor.SparseTensor(
                values=['IT', 'US', 'GB'],
                indices=[[0, 0], [1, 3], [2, 1]],
                dense_shape=[3, 5]),
        'weights': constant_op.constant([[3.0], [5.0], [7.0]])
    }, constant_op.constant([[1.55], [-1.25], [-3.0]])

  with self._single_threaded_test_session():
    price = feature_column_lib.real_valued_column('price')
    sq_footage_bucket = feature_column_lib.bucketized_column(
        feature_column_lib.real_valued_column('sq_footage'),
        boundaries=[650.0, 800.0])
    country = feature_column_lib.sparse_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    sq_footage_country = feature_column_lib.crossed_column(
        [sq_footage_bucket, country], hash_bucket_size=10)
    regressor = sdca_estimator.SDCALinearRegressor(
        example_id_column='example_id',
        feature_columns=[
            price, sq_footage_bucket, country, sq_footage_country
        ],
        l2_regularization=1.0,
        weight_column_name='weights',
        partitioner=partitioned_variables.fixed_size_partitioner(
            num_shards=2, axis=0))
    regressor.fit(input_fn=input_fn, steps=20)
    loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss']
    self.assertLess(loss, 0.05)
def testMixedFeatures(self): """Tests SVM classifier with a mix of features.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': constant_op.constant([[0.6], [0.8], [0.3]]), 'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]), 'country': sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 3], [2, 1]], dense_shape=[3, 5]), 'weights': constant_op.constant([[3.0], [1.0], [1.0]]) }, constant_op.constant([[1], [0], [1]]) price = feature_column.real_valued_column('price') sq_footage_bucket = feature_column.bucketized_column( feature_column.real_valued_column('sq_footage'), boundaries=[650.0, 800.0]) country = feature_column.sparse_column_with_hash_bucket( 'country', hash_bucket_size=5) sq_footage_country = feature_column.crossed_column( [sq_footage_bucket, country], hash_bucket_size=10) svm_classifier = svm.SVM(feature_columns=[ price, sq_footage_bucket, country, sq_footage_country ], example_id_column='example_id', weight_column_name='weights', l1_regularization=0.1, l2_regularization=1.0) svm_classifier.fit(input_fn=input_fn, steps=30) accuracy = svm_classifier.evaluate(input_fn=input_fn, steps=1)['accuracy'] self.assertAlmostEqual(accuracy, 1.0, places=3)
def testMixedFeatures(self): """Tests SVM classifier with a mix of features.""" def input_fn(): return { 'example_id': constant_op.constant(['1', '2', '3']), 'price': constant_op.constant([0.6, 0.8, 0.3]), 'sq_footage': constant_op.constant([[900.0], [700.0], [600.0]]), 'country': sparse_tensor.SparseTensor( values=['IT', 'US', 'GB'], indices=[[0, 0], [1, 3], [2, 1]], dense_shape=[3, 5]), 'weights': constant_op.constant([[3.0], [1.0], [1.0]]) }, constant_op.constant([[1], [0], [1]]) price = feature_column.real_valued_column('price') sq_footage_bucket = feature_column.bucketized_column( feature_column.real_valued_column('sq_footage'), boundaries=[650.0, 800.0]) country = feature_column.sparse_column_with_hash_bucket( 'country', hash_bucket_size=5) sq_footage_country = feature_column.crossed_column( [sq_footage_bucket, country], hash_bucket_size=10) svm_classifier = svm.SVM( feature_columns=[price, sq_footage_bucket, country, sq_footage_country], example_id_column='example_id', weight_column_name='weights', l1_regularization=0.1, l2_regularization=1.0) svm_classifier.fit(input_fn=input_fn, steps=30) accuracy = svm_classifier.evaluate(input_fn=input_fn, steps=1)['accuracy'] self.assertAlmostEqual(accuracy, 1.0, places=3)
def testBucketizedColumnWithSameBucketBoundaries(self):
  a_bucketized = fc.bucketized_column(
      fc.real_valued_column("a"), [1., 2., 2., 3., 3.])
  self.assertEqual(a_bucketized.name, "a_bucketized")
  self.assertTupleEqual(a_bucketized.boundaries, (1., 2., 3.))
def testBucketizedColumnRequiresSortedBuckets(self):
  with self.assertRaisesRegexp(ValueError,
                               "boundaries must be a sorted list"):
    fc.bucketized_column(fc.real_valued_column("ccc"), [5, 0, 4])
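# A minimal pure-Python sketch of why sorted boundaries matter: bucketization
# is assumed to assign a value to the bucket whose lower boundary it meets,
# which is only well defined when the boundary list is sorted. The helper
# name is hypothetical, for illustration only.
import bisect

def _bucketize_sketch(value, boundaries):
  # Boundaries [0, 4] yield three buckets: value < 0, 0 <= value < 4,
  # and value >= 4.
  return bisect.bisect_right(boundaries, value)

assert _bucketize_sketch(-1.0, [0, 4]) == 0
assert _bucketize_sketch(2.0, [0, 4]) == 1
assert _bucketize_sketch(5.0, [0, 4]) == 2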
def testBucketizedColumnRequiresRealValuedColumnDimension(self):
  with self.assertRaisesRegexp(
      ValueError, "source_column must have a defined dimension"):
    fc.bucketized_column(fc.real_valued_column("bbb", dimension=None), [0])
def testBucketizedColumnNameEndsWithUnderscoreBucketized(self):
  a = fc.bucketized_column(fc.real_valued_column("aaa"), [0, 4])
  self.assertEqual(a.name, "aaa_bucketized")
def testCreateFeatureSpec(self): sparse_col = fc.sparse_column_with_hash_bucket( "sparse_column", hash_bucket_size=100) embedding_col = fc.embedding_column( fc.sparse_column_with_hash_bucket( "sparse_column_for_embedding", hash_bucket_size=10), dimension=4) sparse_id_col = fc.sparse_column_with_keys("id_column", ["marlo", "omar", "stringer"]) weighted_id_col = fc.weighted_sparse_column(sparse_id_col, "id_weights_column") real_valued_col1 = fc.real_valued_column("real_valued_column1") real_valued_col2 = fc.real_valued_column("real_valued_column2", 5) real_valued_col3 = fc.real_valued_column( "real_valued_column3", dimension=None) bucketized_col1 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4]) bucketized_col2 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization2", 4), [0, 4]) a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100) b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100) cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000) feature_columns = set([ sparse_col, embedding_col, weighted_id_col, real_valued_col1, real_valued_col2, real_valued_col3, bucketized_col1, bucketized_col2, cross_col ]) expected_config = { "sparse_column": parsing_ops.VarLenFeature(dtypes.string), "sparse_column_for_embedding": parsing_ops.VarLenFeature(dtypes.string), "id_column": parsing_ops.VarLenFeature(dtypes.string), "id_weights_column": parsing_ops.VarLenFeature(dtypes.float32), "real_valued_column1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), "real_valued_column2": parsing_ops.FixedLenFeature( [5], dtype=dtypes.float32), "real_valued_column3": parsing_ops.VarLenFeature(dtype=dtypes.float32), "real_valued_column_for_bucketization1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), "real_valued_column_for_bucketization2": parsing_ops.FixedLenFeature( [4], dtype=dtypes.float32), "cross_aaa": parsing_ops.VarLenFeature(dtypes.string), "cross_bbb": parsing_ops.VarLenFeature(dtypes.string) } config = fc.create_feature_spec_for_parsing(feature_columns) self.assertDictEqual(expected_config, config) # Test that the same config is parsed out if we pass a dictionary. feature_columns_dict = { str(i): val for i, val in enumerate(feature_columns) } config = fc.create_feature_spec_for_parsing(feature_columns_dict) self.assertDictEqual(expected_config, config)
def testBucketizedColumnRequiresRealValuedColumnDimension(self):
  with self.assertRaisesRegexp(
      TypeError, "source_column must be an instance of _RealValuedColumn.*"):
    fc.bucketized_column(
        fc._real_valued_var_len_column("bbb", is_sparse=True), [0])
def testCreateFeatureSpec(self): sparse_col = fc.sparse_column_with_hash_bucket( "sparse_column", hash_bucket_size=100) embedding_col = fc.embedding_column( fc.sparse_column_with_hash_bucket( "sparse_column_for_embedding", hash_bucket_size=10), dimension=4) str_sparse_id_col = fc.sparse_column_with_keys( "str_id_column", ["marlo", "omar", "stringer"]) int32_sparse_id_col = fc.sparse_column_with_keys( "int32_id_column", [42, 1, -1000], dtype=dtypes.int32) int64_sparse_id_col = fc.sparse_column_with_keys( "int64_id_column", [42, 1, -1000], dtype=dtypes.int64) weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col, "str_id_weights_column") real_valued_col1 = fc.real_valued_column("real_valued_column1") real_valued_col2 = fc.real_valued_column("real_valued_column2", 5) real_valued_col3 = fc._real_valued_var_len_column( "real_valued_column3", is_sparse=True) real_valued_col4 = fc._real_valued_var_len_column( "real_valued_column4", dtype=dtypes.int64, default_value=0, is_sparse=False) bucketized_col1 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4]) bucketized_col2 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization2", 4), [0, 4]) a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100) b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100) cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000) one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket( "sparse_column_for_one_hot", hash_bucket_size=100)) scattered_embedding_col = fc.scattered_embedding_column( "scattered_embedding_column", size=100, dimension=10, hash_key=1) feature_columns = set([ sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col, int64_sparse_id_col, real_valued_col1, real_valued_col2, real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2, cross_col, one_hot_col, scattered_embedding_col ]) expected_config = { "sparse_column": parsing_ops.VarLenFeature(dtypes.string), "sparse_column_for_embedding": parsing_ops.VarLenFeature(dtypes.string), "str_id_column": parsing_ops.VarLenFeature(dtypes.string), "int32_id_column": parsing_ops.VarLenFeature(dtypes.int32), "int64_id_column": parsing_ops.VarLenFeature(dtypes.int64), "str_id_weights_column": parsing_ops.VarLenFeature(dtypes.float32), "real_valued_column1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), "real_valued_column2": parsing_ops.FixedLenFeature( [5], dtype=dtypes.float32), "real_valued_column3": parsing_ops.VarLenFeature(dtype=dtypes.float32), "real_valued_column4": parsing_ops.FixedLenSequenceFeature( [], dtype=dtypes.int64, allow_missing=True, default_value=0), "real_valued_column_for_bucketization1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), "real_valued_column_for_bucketization2": parsing_ops.FixedLenFeature( [4], dtype=dtypes.float32), "cross_aaa": parsing_ops.VarLenFeature(dtypes.string), "cross_bbb": parsing_ops.VarLenFeature(dtypes.string), "sparse_column_for_one_hot": parsing_ops.VarLenFeature(dtypes.string), "scattered_embedding_column": parsing_ops.VarLenFeature(dtypes.string), } config = fc.create_feature_spec_for_parsing(feature_columns) self.assertDictEqual(expected_config, config) # Test that the same config is parsed out if we pass a dictionary. feature_columns_dict = { str(i): val for i, val in enumerate(feature_columns) } config = fc.create_feature_spec_for_parsing(feature_columns_dict) self.assertDictEqual(expected_config, config)
def testCreateFeatureSpec(self): sparse_col = fc.sparse_column_with_hash_bucket("sparse_column", hash_bucket_size=100) embedding_col = fc.embedding_column(fc.sparse_column_with_hash_bucket( "sparse_column_for_embedding", hash_bucket_size=10), dimension=4) sparse_id_col = fc.sparse_column_with_keys( "id_column", ["marlo", "omar", "stringer"]) weighted_id_col = fc.weighted_sparse_column(sparse_id_col, "id_weights_column") real_valued_col1 = fc.real_valued_column("real_valued_column1") real_valued_col2 = fc.real_valued_column("real_valued_column2", 5) real_valued_col3 = fc.real_valued_column("real_valued_column3", dimension=None) bucketized_col1 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4]) bucketized_col2 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization2", 4), [0, 4]) a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100) b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100) cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000) feature_columns = set([ sparse_col, embedding_col, weighted_id_col, real_valued_col1, real_valued_col2, real_valued_col3, bucketized_col1, bucketized_col2, cross_col ]) expected_config = { "sparse_column": parsing_ops.VarLenFeature(dtypes.string), "sparse_column_for_embedding": parsing_ops.VarLenFeature(dtypes.string), "id_column": parsing_ops.VarLenFeature(dtypes.string), "id_weights_column": parsing_ops.VarLenFeature(dtypes.float32), "real_valued_column1": parsing_ops.FixedLenFeature([1], dtype=dtypes.float32), "real_valued_column2": parsing_ops.FixedLenFeature([5], dtype=dtypes.float32), "real_valued_column3": parsing_ops.VarLenFeature(dtype=dtypes.float32), "real_valued_column_for_bucketization1": parsing_ops.FixedLenFeature([1], dtype=dtypes.float32), "real_valued_column_for_bucketization2": parsing_ops.FixedLenFeature([4], dtype=dtypes.float32), "cross_aaa": parsing_ops.VarLenFeature(dtypes.string), "cross_bbb": parsing_ops.VarLenFeature(dtypes.string) } config = fc.create_feature_spec_for_parsing(feature_columns) self.assertDictEqual(expected_config, config) # Test that the same config is parsed out if we pass a dictionary. feature_columns_dict = { str(i): val for i, val in enumerate(feature_columns) } config = fc.create_feature_spec_for_parsing(feature_columns_dict) self.assertDictEqual(expected_config, config)
def testCreateFeatureSpec(self): sparse_col = fc.sparse_column_with_hash_bucket( "sparse_column", hash_bucket_size=100) embedding_col = fc.embedding_column( fc.sparse_column_with_hash_bucket( "sparse_column_for_embedding", hash_bucket_size=10), dimension=4) str_sparse_id_col = fc.sparse_column_with_keys( "str_id_column", ["marlo", "omar", "stringer"]) int32_sparse_id_col = fc.sparse_column_with_keys( "int32_id_column", [42, 1, -1000], dtype=dtypes.int32) int64_sparse_id_col = fc.sparse_column_with_keys( "int64_id_column", [42, 1, -1000], dtype=dtypes.int64) weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col, "str_id_weights_column") real_valued_col1 = fc.real_valued_column("real_valued_column1") real_valued_col2 = fc.real_valued_column("real_valued_column2", 5) bucketized_col1 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization1"), [0, 4]) bucketized_col2 = fc.bucketized_column( fc.real_valued_column("real_valued_column_for_bucketization2", 4), [0, 4]) a = fc.sparse_column_with_hash_bucket("cross_aaa", hash_bucket_size=100) b = fc.sparse_column_with_hash_bucket("cross_bbb", hash_bucket_size=100) cross_col = fc.crossed_column(set([a, b]), hash_bucket_size=10000) one_hot_col = fc.one_hot_column(fc.sparse_column_with_hash_bucket( "sparse_column_for_one_hot", hash_bucket_size=100)) scattered_embedding_col = fc.scattered_embedding_column( "scattered_embedding_column", size=100, dimension=10, hash_key=1) feature_columns = set([ sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col, int64_sparse_id_col, real_valued_col1, real_valued_col2, bucketized_col1, bucketized_col2, cross_col, one_hot_col, scattered_embedding_col ]) expected_config = { "sparse_column": parsing_ops.VarLenFeature(dtypes.string), "sparse_column_for_embedding": parsing_ops.VarLenFeature(dtypes.string), "str_id_column": parsing_ops.VarLenFeature(dtypes.string), "int32_id_column": parsing_ops.VarLenFeature(dtypes.int32), "int64_id_column": parsing_ops.VarLenFeature(dtypes.int64), "str_id_weights_column": parsing_ops.VarLenFeature(dtypes.float32), "real_valued_column1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), "real_valued_column2": parsing_ops.FixedLenFeature( [5], dtype=dtypes.float32), "real_valued_column_for_bucketization1": parsing_ops.FixedLenFeature( [1], dtype=dtypes.float32), "real_valued_column_for_bucketization2": parsing_ops.FixedLenFeature( [4], dtype=dtypes.float32), "cross_aaa": parsing_ops.VarLenFeature(dtypes.string), "cross_bbb": parsing_ops.VarLenFeature(dtypes.string), "sparse_column_for_one_hot": parsing_ops.VarLenFeature(dtypes.string), "scattered_embedding_column": parsing_ops.VarLenFeature(dtypes.string), } config = fc.create_feature_spec_for_parsing(feature_columns) self.assertDictEqual(expected_config, config) # Tests that contrib feature columns work with core library: config_core = fc_core.make_parse_example_spec(feature_columns) self.assertDictEqual(expected_config, config_core) # Test that the same config is parsed out if we pass a dictionary. feature_columns_dict = { str(i): val for i, val in enumerate(feature_columns) } config = fc.create_feature_spec_for_parsing(feature_columns_dict) self.assertDictEqual(expected_config, config)