def testInt32WeightedSparseInt64ColumnDtypes(self):
  """Checks the parsing config for int32 weights over an int64 keyed column.

  Also verifies that asking for string weights raises a ValueError.
  """
  id_column = fc.sparse_column_with_keys(
      "ids", [42, 1, -1000], dtype=dtypes.int64)
  weighted_column = fc.weighted_sparse_column(
      id_column, "weights", dtype=dtypes.int32)

  expected_config = {
      "ids": parsing_ops.VarLenFeature(dtypes.int64),
      "weights": parsing_ops.VarLenFeature(dtypes.int32),
  }
  self.assertDictEqual(expected_config, weighted_column.config)

  # A string dtype is rejected for the weight feature.
  with self.assertRaisesRegexp(ValueError,
                               "dtype is not convertible to float"):
    fc.weighted_sparse_column(id_column, "weights", dtype=dtypes.string)
def testFloat32WeightedSparseStringColumnDtypes(self):
  """Checks the parsing config of default weights over string keys.

  With no explicit dtype, the weight feature is expected to be float32.
  """
  id_column = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
  weighted_column = fc.weighted_sparse_column(id_column, "weights")

  expected_config = {
      "ids": parsing_ops.VarLenFeature(dtypes.string),
      "weights": parsing_ops.VarLenFeature(dtypes.float32),
  }
  self.assertDictEqual(expected_config, weighted_column.config)
def testFloat32WeightedSparseInt32ColumnDtypes(self):
  """Checks the parsing config of default weights over int32 keys.

  With no explicit dtype, the weight feature is expected to be float32.
  """
  id_column = fc.sparse_column_with_keys(
      "ids", [42, 1, -1000], dtype=dtypes.int32)
  weighted_column = fc.weighted_sparse_column(id_column, "weights")

  expected_config = {
      "ids": parsing_ops.VarLenFeature(dtypes.int32),
      "weights": parsing_ops.VarLenFeature(dtypes.float32),
  }
  self.assertDictEqual(expected_config, weighted_column.config)
def testWeightedSparseColumnDeepCopy(self):
  """Checks that a deep-copied weighted sparse column keeps its names."""
  id_column = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
  original = fc.weighted_sparse_column(id_column, "weights")

  duplicate = copy.deepcopy(original)

  self.assertEqual(duplicate.sparse_id_column.name, "ids")
  self.assertEqual(duplicate.weight_column_name, "weights")
  self.assertEqual(duplicate.name, "ids_weighted_by_weights")
def testOneHotColumnForWeightedSparseColumn(self):
  """Checks a one-hot column built on top of a weighted sparse column."""
  id_column = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
  weighted_column = fc.weighted_sparse_column(id_column, "weights")

  one_hot_column = fc.one_hot_column(weighted_column)

  self.assertEqual(one_hot_column.sparse_id_column.name,
                   "ids_weighted_by_weights")
  # Three keys were supplied, so the one-hot length is expected to be 3.
  self.assertEqual(one_hot_column.length, 3)
def testWeightedSparseFeatures(self):
  """Tests SDCALogisticClassifier with weighted sparse features."""

  def input_fn():
    # Three examples; each sparse feature has one entry per example, stored
    # in column 0 of a [3, 5] sparse tensor.
    features = {
        'example_id': constant_op.constant(['1', '2', '3']),
        'price': sparse_tensor.SparseTensor(
            values=[2., 3., 1.],
            indices=[[0, 0], [1, 0], [2, 0]],
            dense_shape=[3, 5]),
        'country': sparse_tensor.SparseTensor(
            values=['IT', 'US', 'GB'],
            indices=[[0, 0], [1, 0], [2, 0]],
            dense_shape=[3, 5]),
    }
    labels = constant_op.constant([[1], [0], [1]])
    return features, labels

  hashed_country = feature_column_lib.sparse_column_with_hash_bucket(
      'country', hash_bucket_size=5)
  # The 'price' feature supplies the weights for the hashed country ids.
  weighted_country = feature_column_lib.weighted_sparse_column(
      hashed_country, 'price')

  estimator = sdca_estimator.SDCALogisticClassifier(
      example_id_column='example_id',
      feature_columns=[weighted_country])
  estimator.fit(input_fn=input_fn, steps=50)

  eval_metrics = estimator.evaluate(input_fn=input_fn, steps=1)
  self.assertGreater(eval_metrics['accuracy'], 0.9)
def testWeightedSparseFeatures(self):
  """Tests SDCALogisticClassifier with weighted sparse features."""

  def input_fn():
    # Three examples; each sparse feature has one entry per example, stored
    # in column 0 of a [3, 5] sparse tensor.
    features = {
        'example_id': constant_op.constant(['1', '2', '3']),
        'price': sparse_tensor.SparseTensor(
            values=[2., 3., 1.],
            indices=[[0, 0], [1, 0], [2, 0]],
            dense_shape=[3, 5]),
        'country': sparse_tensor.SparseTensor(
            values=['IT', 'US', 'GB'],
            indices=[[0, 0], [1, 0], [2, 0]],
            dense_shape=[3, 5]),
    }
    labels = constant_op.constant([[1], [0], [1]])
    return features, labels

  # NOTE(review): the original text is collapsed onto one line, so the extent
  # of this `with` body is reconstructed; all remaining statements are kept
  # inside the session context. Confirm against the upstream file.
  with self._single_threaded_test_session():
    hashed_country = feature_column_lib.sparse_column_with_hash_bucket(
        'country', hash_bucket_size=5)
    # The 'price' feature supplies the weights for the hashed country ids.
    weighted_country = feature_column_lib.weighted_sparse_column(
        hashed_country, 'price')

    estimator = sdca_estimator.SDCALogisticClassifier(
        example_id_column='example_id',
        feature_columns=[weighted_country])
    estimator.fit(input_fn=input_fn, steps=50)

    eval_metrics = estimator.evaluate(input_fn=input_fn, steps=1)
    self.assertGreater(eval_metrics['accuracy'], 0.9)
def testWeightedSparseColumnDtypes(self):
  """Checks weight dtypes in the parsing config of a weighted sparse column.

  Covers the default weight dtype, an explicit int32 weight dtype, and the
  ValueError raised for a string weight dtype.
  """
  id_column = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])

  # Default weight dtype.
  default_weighted = fc.weighted_sparse_column(id_column, "weights")
  expected_default = {
      "ids": parsing_ops.VarLenFeature(dtypes.string),
      "weights": parsing_ops.VarLenFeature(dtypes.float32),
  }
  self.assertDictEqual(expected_default, default_weighted.config)

  # Explicit int32 weight dtype.
  int32_weighted = fc.weighted_sparse_column(
      id_column, "weights", dtype=dtypes.int32)
  expected_int32 = {
      "ids": parsing_ops.VarLenFeature(dtypes.string),
      "weights": parsing_ops.VarLenFeature(dtypes.int32),
  }
  self.assertDictEqual(expected_int32, int32_weighted.config)

  # A string dtype is rejected for the weight feature.
  with self.assertRaisesRegexp(ValueError,
                               "dtype is not convertible to float"):
    fc.weighted_sparse_column(id_column, "weights", dtype=dtypes.string)
def testWeightedSparseColumnWithVocabularyFile(self):
  """Checks a weighted column built on a vocabulary-file sparse column.

  The weighted column is expected to expose the same lookup configuration as
  the underlying id column.
  """
  vocab_column = fc.sparse_column_with_vocabulary_file(
      "ids", "a_file", num_oov_buckets=7, vocab_size=3)
  weighted_column = fc.weighted_sparse_column(vocab_column, "weights")

  self.assertEqual(weighted_column.name, "ids_weighted_by_weights")
  self.assertEqual(weighted_column.lookup_config, vocab_column.lookup_config)
  self.assertEqual(weighted_column.lookup_config.vocab_size, 3)
  self.assertEqual(weighted_column.lookup_config.num_oov_buckets, 7)
  self.assertEqual(weighted_column.lookup_config.vocabulary_file, "a_file")
def testSharedEmbeddingColumnWithWeightedSparseColumn(self):
  """Tests creation of shared embeddings containing weighted sparse columns."""
  plain_column = fc.sparse_column_with_keys(
      "a1", ["marlo", "omar", "stringer"])
  id_column = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
  weighted_column = fc.weighted_sparse_column(id_column, "weights")
  self.assertEqual(weighted_column.name, "ids_weighted_by_weights")

  shared = fc.shared_embedding_columns(
      [plain_column, weighted_column], dimension=4, combiner="mean")
  self.assertEqual(len(shared), 2)
  self.assertEqual(shared[0].shared_embedding_name,
                   "a1_ids_weighted_by_weights_shared_embedding")
  self.assertEqual(shared[1].shared_embedding_name,
                   "a1_ids_weighted_by_weights_shared_embedding")

  # Tries reversing order to check compatibility condition.
  shared = fc.shared_embedding_columns(
      [weighted_column, plain_column], dimension=4, combiner="mean")
  self.assertEqual(len(shared), 2)
  self.assertEqual(shared[0].shared_embedding_name,
                   "a1_ids_weighted_by_weights_shared_embedding")
  self.assertEqual(shared[1].shared_embedding_name,
                   "a1_ids_weighted_by_weights_shared_embedding")

  # Tries adding two weighted columns to check compatibility between them.
  second_weighted_column = fc.weighted_sparse_column(id_column, "weights_2")
  shared = fc.shared_embedding_columns(
      [weighted_column, second_weighted_column],
      dimension=4,
      combiner="mean")
  self.assertEqual(len(shared), 2)
  self.assertEqual(
      shared[0].shared_embedding_name,
      "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding")
  self.assertEqual(
      shared[1].shared_embedding_name,
      "ids_weighted_by_weights_ids_weighted_by_weights_2_shared_embedding")
def testCreateSequenceFeatureSpec(self):
  """Checks the sequence parsing spec generated for a mix of column types."""
  hashed_column = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_source = fc.sparse_column_with_hash_bucket(
      "sparse_column_for_embedding", hash_bucket_size=10)
  embedded_column = fc.embedding_column(embedding_source, dimension=4)
  keyed_column = fc.sparse_column_with_keys(
      "id_column", ["marlo", "omar", "stringer"])
  weighted_column = fc.weighted_sparse_column(
      keyed_column, "id_weights_column")
  fixed_real = fc.real_valued_column("real_valued_column", dimension=2)
  fixed_real_with_default = fc.real_valued_column(
      "real_valued_default_column", dimension=5, default_value=3.0)
  var_len_sparse_real = fc._real_valued_var_len_column(
      "real_valued_var_len_column", default_value=3.0, is_sparse=True)
  var_len_dense_real = fc._real_valued_var_len_column(
      "real_valued_var_len_dense_column", default_value=4.0, is_sparse=False)

  columns = {
      hashed_column,
      embedded_column,
      weighted_column,
      fixed_real,
      fixed_real_with_default,
      var_len_sparse_real,
      var_len_dense_real,
  }
  actual_spec = fc._create_sequence_feature_spec_for_parsing(columns)

  expected_spec = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column":
          parsing_ops.FixedLenSequenceFeature(
              shape=[2], dtype=dtypes.float32, allow_missing=False),
      "real_valued_default_column":
          parsing_ops.FixedLenSequenceFeature(
              shape=[5], dtype=dtypes.float32, allow_missing=True),
      "real_valued_var_len_column":
          parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_var_len_dense_column":
          parsing_ops.FixedLenSequenceFeature(
              shape=[],
              dtype=dtypes.float32,
              allow_missing=True,
              default_value=4.0),
  }
  self.assertDictEqual(expected_spec, actual_spec)
def testWeightedSparseColumnDtypes(self):
  """Checks weight dtypes in the parsing config of a weighted sparse column.

  Covers the default weight dtype, an explicit int32 weight dtype, and the
  ValueError raised for a string weight dtype.
  """
  id_column = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])

  # Default weight dtype.
  default_weighted = fc.weighted_sparse_column(id_column, "weights")
  expected_default = {
      "ids": parsing_ops.VarLenFeature(dtypes.string),
      "weights": parsing_ops.VarLenFeature(dtypes.float32),
  }
  self.assertDictEqual(expected_default, default_weighted.config)

  # Explicit int32 weight dtype.
  int32_weighted = fc.weighted_sparse_column(
      id_column, "weights", dtype=dtypes.int32)
  expected_int32 = {
      "ids": parsing_ops.VarLenFeature(dtypes.string),
      "weights": parsing_ops.VarLenFeature(dtypes.int32),
  }
  self.assertDictEqual(expected_int32, int32_weighted.config)

  # A string dtype is rejected for the weight feature.
  with self.assertRaisesRegexp(ValueError,
                               "dtype is not convertible to float"):
    fc.weighted_sparse_column(id_column, "weights", dtype=dtypes.string)
def testMissingValueInOneHotColumnForWeightedSparseColumn(self):
  """Checks weighted one-hot output when an input id is not a known key.

  Regression test for Github issue 12583.
  """
  id_column = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
  weighted_column = fc.weighted_sparse_column(id_column, "weights")
  one_hot_column = fc.one_hot_column(weighted_column)

  # 'unknown' is not among the keys; the expected output below carries only
  # the weights of 'marlo' (2.) and 'omar' (6.).
  input_features = {
      'ids': constant_op.constant([['marlo', 'unknown', 'omar']]),
      'weights': constant_op.constant([[2., 4., 6.]]),
  }
  dense_output = feature_column_ops.input_from_feature_columns(
      input_features, [one_hot_column])

  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(lookup_ops.tables_initializer())
    self.assertAllEqual([[2., 6., 0.]], dense_output.eval())
def testCreateFeatureSpec(self):
  """Checks the parsing spec generated for a mix of column types.

  The same spec is expected whether the columns are passed as a set or as the
  values of a dictionary.
  """
  hashed_column = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_source = fc.sparse_column_with_hash_bucket(
      "sparse_column_for_embedding", hash_bucket_size=10)
  embedded_column = fc.embedding_column(embedding_source, dimension=4)
  keyed_column = fc.sparse_column_with_keys(
      "id_column", ["marlo", "omar", "stringer"])
  weighted_column = fc.weighted_sparse_column(
      keyed_column, "id_weights_column")
  real_default_dim = fc.real_valued_column("real_valued_column1")
  real_dim_five = fc.real_valued_column("real_valued_column2", 5)
  real_no_dim = fc.real_valued_column("real_valued_column3", dimension=None)
  bucket_source_one = fc.real_valued_column(
      "real_valued_column_for_bucketization1")
  bucketized_one = fc.bucketized_column(bucket_source_one, [0, 4])
  bucket_source_two = fc.real_valued_column(
      "real_valued_column_for_bucketization2", 4)
  bucketized_two = fc.bucketized_column(bucket_source_two, [0, 4])
  cross_left = fc.sparse_column_with_hash_bucket(
      "cross_aaa", hash_bucket_size=100)
  cross_right = fc.sparse_column_with_hash_bucket(
      "cross_bbb", hash_bucket_size=100)
  crossed = fc.crossed_column(
      {cross_left, cross_right}, hash_bucket_size=10000)

  columns = {
      hashed_column,
      embedded_column,
      weighted_column,
      real_default_dim,
      real_dim_five,
      real_no_dim,
      bucketized_one,
      bucketized_two,
      crossed,
  }

  expected_spec = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column2":
          parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
      "real_valued_column3":
          parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_column_for_bucketization1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column_for_bucketization2":
          parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
      "cross_aaa":
          parsing_ops.VarLenFeature(dtypes.string),
      "cross_bbb":
          parsing_ops.VarLenFeature(dtypes.string),
  }

  spec_from_set = fc.create_feature_spec_for_parsing(columns)
  self.assertDictEqual(expected_spec, spec_from_set)

  # Test that the same config is parsed out if we pass a dictionary.
  columns_by_key = {
      str(position): column for position, column in enumerate(columns)
  }
  spec_from_dict = fc.create_feature_spec_for_parsing(columns_by_key)
  self.assertDictEqual(expected_spec, spec_from_dict)
def testWeightedSparseColumn(self):
  """Checks the name given to a weighted sparse column."""
  id_column = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
  weighted_column = fc.weighted_sparse_column(id_column, "weights")
  self.assertEqual(weighted_column.name, "ids_weighted_by_weights")
def testCreateFeatureSpec(self):
  """Checks the parsing spec generated for a mix of column types.

  The same spec is expected from the contrib helper (given a set or a
  dictionary of columns) and from the core `make_parse_example_spec`.
  """
  hashed_column = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_source = fc.sparse_column_with_hash_bucket(
      "sparse_column_for_embedding", hash_bucket_size=10)
  embedded_column = fc.embedding_column(embedding_source, dimension=4)
  str_keyed_column = fc.sparse_column_with_keys(
      "str_id_column", ["marlo", "omar", "stringer"])
  int32_keyed_column = fc.sparse_column_with_keys(
      "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
  int64_keyed_column = fc.sparse_column_with_keys(
      "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
  weighted_column = fc.weighted_sparse_column(
      str_keyed_column, "str_id_weights_column")
  real_default_dim = fc.real_valued_column("real_valued_column1")
  real_dim_five = fc.real_valued_column("real_valued_column2", 5)
  bucket_source_one = fc.real_valued_column(
      "real_valued_column_for_bucketization1")
  bucketized_one = fc.bucketized_column(bucket_source_one, [0, 4])
  bucket_source_two = fc.real_valued_column(
      "real_valued_column_for_bucketization2", 4)
  bucketized_two = fc.bucketized_column(bucket_source_two, [0, 4])
  cross_left = fc.sparse_column_with_hash_bucket(
      "cross_aaa", hash_bucket_size=100)
  cross_right = fc.sparse_column_with_hash_bucket(
      "cross_bbb", hash_bucket_size=100)
  crossed = fc.crossed_column(
      {cross_left, cross_right}, hash_bucket_size=10000)
  one_hot_source = fc.sparse_column_with_hash_bucket(
      "sparse_column_for_one_hot", hash_bucket_size=100)
  one_hot = fc.one_hot_column(one_hot_source)
  scattered = fc.scattered_embedding_column(
      "scattered_embedding_column", size=100, dimension=10, hash_key=1)

  columns = {
      hashed_column,
      embedded_column,
      weighted_column,
      int32_keyed_column,
      int64_keyed_column,
      real_default_dim,
      real_dim_five,
      bucketized_one,
      bucketized_two,
      crossed,
      one_hot,
      scattered,
  }

  expected_spec = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "str_id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "int32_id_column":
          parsing_ops.VarLenFeature(dtypes.int32),
      "int64_id_column":
          parsing_ops.VarLenFeature(dtypes.int64),
      "str_id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column2":
          parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
      "real_valued_column_for_bucketization1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column_for_bucketization2":
          parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
      "cross_aaa":
          parsing_ops.VarLenFeature(dtypes.string),
      "cross_bbb":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_one_hot":
          parsing_ops.VarLenFeature(dtypes.string),
      "scattered_embedding_column":
          parsing_ops.VarLenFeature(dtypes.string),
  }

  spec_from_set = fc.create_feature_spec_for_parsing(columns)
  self.assertDictEqual(expected_spec, spec_from_set)

  # Tests that contrib feature columns work with core library:
  spec_from_core = fc_core.make_parse_example_spec(columns)
  self.assertDictEqual(expected_spec, spec_from_core)

  # Test that the same config is parsed out if we pass a dictionary.
  columns_by_key = {
      str(position): column for position, column in enumerate(columns)
  }
  spec_from_dict = fc.create_feature_spec_for_parsing(columns_by_key)
  self.assertDictEqual(expected_spec, spec_from_dict)
def testCreateFeatureSpec(self):
  """Checks the parsing spec generated for a mix of column types.

  Includes sparse and dense variable-length real-valued columns. The same
  spec is expected whether the columns are passed as a set or as the values
  of a dictionary.
  """
  hashed_column = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_source = fc.sparse_column_with_hash_bucket(
      "sparse_column_for_embedding", hash_bucket_size=10)
  embedded_column = fc.embedding_column(embedding_source, dimension=4)
  str_keyed_column = fc.sparse_column_with_keys(
      "str_id_column", ["marlo", "omar", "stringer"])
  int32_keyed_column = fc.sparse_column_with_keys(
      "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
  int64_keyed_column = fc.sparse_column_with_keys(
      "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
  weighted_column = fc.weighted_sparse_column(
      str_keyed_column, "str_id_weights_column")
  real_default_dim = fc.real_valued_column("real_valued_column1")
  real_dim_five = fc.real_valued_column("real_valued_column2", 5)
  real_var_len_sparse = fc._real_valued_var_len_column(
      "real_valued_column3", is_sparse=True)
  real_var_len_dense = fc._real_valued_var_len_column(
      "real_valued_column4",
      dtype=dtypes.int64,
      default_value=0,
      is_sparse=False)
  bucket_source_one = fc.real_valued_column(
      "real_valued_column_for_bucketization1")
  bucketized_one = fc.bucketized_column(bucket_source_one, [0, 4])
  bucket_source_two = fc.real_valued_column(
      "real_valued_column_for_bucketization2", 4)
  bucketized_two = fc.bucketized_column(bucket_source_two, [0, 4])
  cross_left = fc.sparse_column_with_hash_bucket(
      "cross_aaa", hash_bucket_size=100)
  cross_right = fc.sparse_column_with_hash_bucket(
      "cross_bbb", hash_bucket_size=100)
  crossed = fc.crossed_column(
      {cross_left, cross_right}, hash_bucket_size=10000)
  one_hot_source = fc.sparse_column_with_hash_bucket(
      "sparse_column_for_one_hot", hash_bucket_size=100)
  one_hot = fc.one_hot_column(one_hot_source)
  scattered = fc.scattered_embedding_column(
      "scattered_embedding_column", size=100, dimension=10, hash_key=1)

  columns = {
      hashed_column,
      embedded_column,
      weighted_column,
      int32_keyed_column,
      int64_keyed_column,
      real_default_dim,
      real_dim_five,
      real_var_len_sparse,
      real_var_len_dense,
      bucketized_one,
      bucketized_two,
      crossed,
      one_hot,
      scattered,
  }

  expected_spec = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "str_id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "int32_id_column":
          parsing_ops.VarLenFeature(dtypes.int32),
      "int64_id_column":
          parsing_ops.VarLenFeature(dtypes.int64),
      "str_id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column2":
          parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
      "real_valued_column3":
          parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_column4":
          parsing_ops.FixedLenSequenceFeature(
              [], dtype=dtypes.int64, allow_missing=True, default_value=0),
      "real_valued_column_for_bucketization1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column_for_bucketization2":
          parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
      "cross_aaa":
          parsing_ops.VarLenFeature(dtypes.string),
      "cross_bbb":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_one_hot":
          parsing_ops.VarLenFeature(dtypes.string),
      "scattered_embedding_column":
          parsing_ops.VarLenFeature(dtypes.string),
  }

  spec_from_set = fc.create_feature_spec_for_parsing(columns)
  self.assertDictEqual(expected_spec, spec_from_set)

  # Test that the same config is parsed out if we pass a dictionary.
  columns_by_key = {
      str(position): column for position, column in enumerate(columns)
  }
  spec_from_dict = fc.create_feature_spec_for_parsing(columns_by_key)
  self.assertDictEqual(expected_spec, spec_from_dict)
def testCreateFeatureSpec(self):
  """Checks the parsing spec generated for a mix of column types.

  The same spec is expected whether the columns are passed as a set or as the
  values of a dictionary.
  """
  hashed_column = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_source = fc.sparse_column_with_hash_bucket(
      "sparse_column_for_embedding", hash_bucket_size=10)
  embedded_column = fc.embedding_column(embedding_source, dimension=4)
  keyed_column = fc.sparse_column_with_keys(
      "id_column", ["marlo", "omar", "stringer"])
  weighted_column = fc.weighted_sparse_column(
      keyed_column, "id_weights_column")
  real_default_dim = fc.real_valued_column("real_valued_column1")
  real_dim_five = fc.real_valued_column("real_valued_column2", 5)
  real_no_dim = fc.real_valued_column("real_valued_column3", dimension=None)
  bucket_source_one = fc.real_valued_column(
      "real_valued_column_for_bucketization1")
  bucketized_one = fc.bucketized_column(bucket_source_one, [0, 4])
  bucket_source_two = fc.real_valued_column(
      "real_valued_column_for_bucketization2", 4)
  bucketized_two = fc.bucketized_column(bucket_source_two, [0, 4])
  cross_left = fc.sparse_column_with_hash_bucket(
      "cross_aaa", hash_bucket_size=100)
  cross_right = fc.sparse_column_with_hash_bucket(
      "cross_bbb", hash_bucket_size=100)
  crossed = fc.crossed_column(
      {cross_left, cross_right}, hash_bucket_size=10000)

  columns = {
      hashed_column,
      embedded_column,
      weighted_column,
      real_default_dim,
      real_dim_five,
      real_no_dim,
      bucketized_one,
      bucketized_two,
      crossed,
  }

  expected_spec = {
      "sparse_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "sparse_column_for_embedding":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_column":
          parsing_ops.VarLenFeature(dtypes.string),
      "id_weights_column":
          parsing_ops.VarLenFeature(dtypes.float32),
      "real_valued_column1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column2":
          parsing_ops.FixedLenFeature([5], dtype=dtypes.float32),
      "real_valued_column3":
          parsing_ops.VarLenFeature(dtype=dtypes.float32),
      "real_valued_column_for_bucketization1":
          parsing_ops.FixedLenFeature([1], dtype=dtypes.float32),
      "real_valued_column_for_bucketization2":
          parsing_ops.FixedLenFeature([4], dtype=dtypes.float32),
      "cross_aaa":
          parsing_ops.VarLenFeature(dtypes.string),
      "cross_bbb":
          parsing_ops.VarLenFeature(dtypes.string),
  }

  spec_from_set = fc.create_feature_spec_for_parsing(columns)
  self.assertDictEqual(expected_spec, spec_from_set)

  # Test that the same config is parsed out if we pass a dictionary.
  columns_by_key = {
      str(position): column for position, column in enumerate(columns)
  }
  spec_from_dict = fc.create_feature_spec_for_parsing(columns_by_key)
  self.assertDictEqual(expected_spec, spec_from_dict)