def testOneHotColumn(self):
  """Checks the wrapped column name and length exposed by `one_hot_column`."""
  # Column backed by an explicit key list: expected length is 4 (four keys).
  keys_column = fc.sparse_column_with_keys("a", ["a", "b", "c", "d"])
  keys_one_hot = fc.one_hot_column(keys_column)
  self.assertEqual(keys_one_hot.sparse_id_column.name, "a")
  self.assertEqual(keys_one_hot.length, 4)

  # Column backed by hash buckets: expected length is the bucket size (100).
  hashed_column = fc.sparse_column_with_hash_bucket(
      "b", hash_bucket_size=100, combiner="sum")
  hashed_one_hot = fc.one_hot_column(hashed_column)
  self.assertEqual(hashed_one_hot.sparse_id_column.name, "b")
  self.assertEqual(hashed_one_hot.length, 100)
def testOneHotColumnDeepCopy(self):
  """Checks that `copy.deepcopy` of a one-hot column preserves its properties.

  The name and length are asserted on both the original column and the deep
  copy, so a copy that loses or corrupts state is detected.
  """
  a = fc.sparse_column_with_keys("a", ["a", "b", "c", "d"])
  column = fc.one_hot_column(a)
  column_copy = copy.deepcopy(column)

  # The original column must be unaffected by the copy.
  self.assertEqual(column.name, "a_one_hot")
  self.assertEqual(column.length, 4)

  # The copy must expose the same wrapped column name, name and length.
  self.assertEqual(column_copy.sparse_id_column.name, "a")
  self.assertEqual(column_copy.name, "a_one_hot")
  self.assertEqual(column_copy.length, 4)
def testOneHotColumnForWeightedSparseColumn(self):
  """Checks `one_hot_column` built on top of a weighted sparse column."""
  id_column = fc.sparse_column_with_keys(
      "ids", ["marlo", "omar", "stringer"])
  weighted_column = fc.weighted_sparse_column(id_column, "weights")
  one_hot = fc.one_hot_column(weighted_column)

  # The wrapped column is the weighted column, so its composite name is
  # expected here; the length is expected to be 3 (three keys).
  self.assertEqual(one_hot.sparse_id_column.name, "ids_weighted_by_weights")
  self.assertEqual(one_hot.length, 3)
def testMissingValueInOneHotColumnForSparseColumnWithKeys(self):
  """Checks one-hot output when an input value is not among the keys."""
  id_column = fc.sparse_column_with_keys(
      "ids", ["marlo", "omar", "stringer"])
  one_hot = fc.one_hot_column(id_column)

  # "unknown" is not one of the keys declared above.
  features = {"ids": constant_op.constant([["marlo", "unknown", "omar"]])}
  one_hot_tensor = feature_column_ops.input_from_feature_columns(
      features, [one_hot])

  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(lookup_ops.tables_initializer())
    actual = one_hot_tensor.eval()
    # Expected: "marlo" and "omar" are set, "stringer" is absent, and the
    # unknown value contributes nothing.
    self.assertAllEqual([[1., 1., 0.]], actual)
def testMissingValueInOneHotColumnForWeightedSparseColumn(self):
  """Checks weighted one-hot output when an input value is not among the keys.

  Regression test for Github issue 12583.
  """
  id_column = fc.sparse_column_with_keys(
      "ids", ["marlo", "omar", "stringer"])
  weighted_column = fc.weighted_sparse_column(id_column, "weights")
  one_hot = fc.one_hot_column(weighted_column)

  # "unknown" is not one of the keys; its weight (4.) must not show up in
  # the expected output below.
  id_values = constant_op.constant([['marlo', 'unknown', 'omar']])
  weight_values = constant_op.constant([[2., 4., 6.]])
  features = {'ids': id_values, 'weights': weight_values}

  one_hot_tensor = feature_column_ops.input_from_feature_columns(
      features, [one_hot])

  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    sess.run(lookup_ops.tables_initializer())
    actual = one_hot_tensor.eval()
    # Expected: "marlo" carries weight 2., "omar" carries weight 6., and
    # "stringer" is absent.
    self.assertAllEqual([[2., 6., 0.]], actual)
def setUp(self):
  """Creates the RNN cell, mock target column and feature columns.

  The results are stored on the test case as `rnn_cell`,
  `mock_target_column`, `context_feature_columns` and
  `sequence_feature_columns`.
  """
  super(DynamicRnnEstimatorTest, self).setUp()

  self.rnn_cell = rnn_cell.BasicRNNCell(self.NUM_RNN_CELL_UNITS)
  self.mock_target_column = MockTargetColumn(
      num_label_columns=self.NUM_LABEL_COLUMNS)

  # Context features: a single one-hot encoded categorical column.
  location_sparse = feature_column.sparse_column_with_keys(
      'location', keys=['west_side', 'east_side', 'nyc'])
  self.context_feature_columns = [
      feature_column.one_hot_column(location_sparse)
  ]

  # Sequence features: a real-valued column plus an embedded categorical
  # column.
  wire_cast_sparse = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  wire_cast_embedding = feature_column.embedding_column(
      wire_cast_sparse, dimension=8)
  measurements_column = feature_column.real_valued_column(
      'measurements', dimension=2)
  self.sequence_feature_columns = [measurements_column, wire_cast_embedding]
def setUp(self):
  """Creates the RNN cell, mock target column and feature columns.

  The results are stored on the test case as `rnn_cell`,
  `mock_target_column`, `context_feature_columns` and
  `sequence_feature_columns`.
  """
  super(DynamicRnnEstimatorTest, self).setUp()

  self.rnn_cell = core_rnn_cell_impl.BasicRNNCell(self.NUM_RNN_CELL_UNITS)
  self.mock_target_column = MockTargetColumn(
      num_label_columns=self.NUM_LABEL_COLUMNS)

  # Context features: a single one-hot encoded categorical column.
  location_sparse = feature_column.sparse_column_with_keys(
      'location', keys=['west_side', 'east_side', 'nyc'])
  self.context_feature_columns = [
      feature_column.one_hot_column(location_sparse)
  ]

  # Sequence features: a real-valued column plus an embedded categorical
  # column.
  wire_cast_sparse = feature_column.sparse_column_with_keys(
      'wire_cast', ['marlo', 'omar', 'stringer'])
  wire_cast_embedding = feature_column.embedding_column(
      wire_cast_sparse, dimension=8)
  measurements_column = feature_column.real_valued_column(
      'measurements', dimension=2)
  self.sequence_feature_columns = [measurements_column, wire_cast_embedding]
def testOneHotReshaping(self):
  """Tests reshaping behavior of `OneHotColumn`.

  For every output rank from 1 up to the rank of the id tensor, the one-hot
  output is expected to keep the first `output_rank - 1` dimensions of the
  id tensor shape followed by a final dimension of size `vocab_size`.
  """
  id_tensor_shape = [3, 2, 4, 5]

  sparse_column = fc.sparse_column_with_keys(
      "animals", ["squirrel", "moose", "dragon", "octopus"])
  one_hot = fc.one_hot_column(sparse_column)

  vocab_size = len(sparse_column.lookup_config.keys)
  id_tensor = _sparse_id_tensor(id_tensor_shape, vocab_size)

  for output_rank in range(1, len(id_tensor_shape) + 1):
    # Each rank builds its layer under its own variable scope name.
    with variable_scope.variable_scope("output_rank_{}".format(output_rank)):
      one_hot_output = one_hot._to_dnn_input_layer(
          id_tensor, output_rank=output_rank)
    with self.test_session() as sess:
      one_hot_value = sess.run(one_hot_output)
      expected_shape = (id_tensor_shape[:output_rank - 1] + [vocab_size])
      # `assertEqual` replaces the deprecated `assertEquals` alias.
      self.assertEqual(expected_shape, list(one_hot_value.shape))
def testRaisesNonEmbeddingColumn(self):
  """Checks that an lr multiplier on a one-hot column raises `ValueError`."""
  language_sparse = feature_column.sparse_column_with_hash_bucket(
      'language', 10)
  one_hot_language = feature_column.one_hot_column(language_sparse)

  # The lr multiplier is 0. (which would keep embeddings constant), but it
  # is keyed by a one-hot column rather than an embedding column.
  lr_multipliers = {one_hot_language: 0.0}
  params = {
      'feature_columns': [one_hot_language],
      'head': head_lib._multi_class_head(2),
      'hidden_units': [1],
      'embedding_lr_multipliers': lr_multipliers,
  }

  language_tensor = sparse_tensor.SparseTensor(
      values=['en', 'fr', 'zh'],
      indices=[[0, 0], [1, 0], [2, 0]],
      dense_shape=[3, 1])
  features = {'language': language_tensor}
  labels = constant_op.constant([[0], [0], [0]], dtype=dtypes.int32)

  with self.assertRaisesRegexp(ValueError,
                               'can only be defined for embedding columns'):
    dnn._dnn_model_fn(features, labels, model_fn.ModeKeys.TRAIN, params)
def testCreateFeatureSpec(self):
  """Checks the parsing spec produced by `create_feature_spec_for_parsing`.

  One column of each supported kind is built, then the generated spec is
  compared against the expected mapping, both when the columns are passed
  as a set and when they are passed as the values of a dictionary.
  """
  # Sparse and embedding columns.
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_source = fc.sparse_column_with_hash_bucket(
      "sparse_column_for_embedding", hash_bucket_size=10)
  embedding_col = fc.embedding_column(embedding_source, dimension=4)

  # Key-based sparse columns with string, int32 and int64 keys.
  str_sparse_id_col = fc.sparse_column_with_keys(
      "str_id_column", ["marlo", "omar", "stringer"])
  int32_sparse_id_col = fc.sparse_column_with_keys(
      "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
  int64_sparse_id_col = fc.sparse_column_with_keys(
      "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
  weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                              "str_id_weights_column")

  # Real-valued columns, fixed and variable length.
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)
  real_valued_col3 = fc._real_valued_var_len_column(
      "real_valued_column3", is_sparse=True)
  real_valued_col4 = fc._real_valued_var_len_column(
      "real_valued_column4", dtype=dtypes.int64, default_value=0,
      is_sparse=False)

  # Bucketized columns over real-valued sources.
  bucket_source1 = fc.real_valued_column(
      "real_valued_column_for_bucketization1")
  bucketized_col1 = fc.bucketized_column(bucket_source1, [0, 4])
  bucket_source2 = fc.real_valued_column(
      "real_valued_column_for_bucketization2", 4)
  bucketized_col2 = fc.bucketized_column(bucket_source2, [0, 4])

  # Crossed, one-hot and scattered-embedding columns.
  cross_a = fc.sparse_column_with_hash_bucket(
      "cross_aaa", hash_bucket_size=100)
  cross_b = fc.sparse_column_with_hash_bucket(
      "cross_bbb", hash_bucket_size=100)
  cross_col = fc.crossed_column({cross_a, cross_b}, hash_bucket_size=10000)
  one_hot_source = fc.sparse_column_with_hash_bucket(
      "sparse_column_for_one_hot", hash_bucket_size=100)
  one_hot_col = fc.one_hot_column(one_hot_source)
  scattered_embedding_col = fc.scattered_embedding_column(
      "scattered_embedding_column", size=100, dimension=10, hash_key=1)

  # Note: `str_sparse_id_col` is only included through `weighted_id_col`.
  feature_columns = {
      sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
      int64_sparse_id_col, real_valued_col1, real_valued_col2,
      real_valued_col3, real_valued_col4, bucketized_col1, bucketized_col2,
      cross_col, one_hot_col, scattered_embedding_col
  }

  var_len = parsing_ops.VarLenFeature
  fixed_len = parsing_ops.FixedLenFeature
  expected_config = {
      "sparse_column": var_len(dtypes.string),
      "sparse_column_for_embedding": var_len(dtypes.string),
      "str_id_column": var_len(dtypes.string),
      "int32_id_column": var_len(dtypes.int32),
      "int64_id_column": var_len(dtypes.int64),
      "str_id_weights_column": var_len(dtypes.float32),
      "real_valued_column1": fixed_len([1], dtype=dtypes.float32),
      "real_valued_column2": fixed_len([5], dtype=dtypes.float32),
      "real_valued_column3": var_len(dtype=dtypes.float32),
      "real_valued_column4": parsing_ops.FixedLenSequenceFeature(
          [], dtype=dtypes.int64, allow_missing=True, default_value=0),
      "real_valued_column_for_bucketization1": fixed_len(
          [1], dtype=dtypes.float32),
      "real_valued_column_for_bucketization2": fixed_len(
          [4], dtype=dtypes.float32),
      "cross_aaa": var_len(dtypes.string),
      "cross_bbb": var_len(dtypes.string),
      "sparse_column_for_one_hot": var_len(dtypes.string),
      "scattered_embedding_column": var_len(dtypes.string),
  }

  config = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertDictEqual(expected_config, config)

  # Test that the same config is parsed out if we pass a dictionary.
  feature_columns_dict = {
      str(index): column for index, column in enumerate(feature_columns)
  }
  config = fc.create_feature_spec_for_parsing(feature_columns_dict)
  self.assertDictEqual(expected_config, config)
def testCreateFeatureSpec(self):
  """Checks the parsing spec produced for a mix of contrib feature columns.

  The expected mapping is compared against the output of
  `fc.create_feature_spec_for_parsing` (given a set and given a dictionary
  of columns) and of `fc_core.make_parse_example_spec`.
  """
  # Sparse and embedding columns.
  sparse_col = fc.sparse_column_with_hash_bucket(
      "sparse_column", hash_bucket_size=100)
  embedding_source = fc.sparse_column_with_hash_bucket(
      "sparse_column_for_embedding", hash_bucket_size=10)
  embedding_col = fc.embedding_column(embedding_source, dimension=4)

  # Key-based sparse columns with string, int32 and int64 keys.
  str_sparse_id_col = fc.sparse_column_with_keys(
      "str_id_column", ["marlo", "omar", "stringer"])
  int32_sparse_id_col = fc.sparse_column_with_keys(
      "int32_id_column", [42, 1, -1000], dtype=dtypes.int32)
  int64_sparse_id_col = fc.sparse_column_with_keys(
      "int64_id_column", [42, 1, -1000], dtype=dtypes.int64)
  weighted_id_col = fc.weighted_sparse_column(str_sparse_id_col,
                                              "str_id_weights_column")

  # Real-valued columns.
  real_valued_col1 = fc.real_valued_column("real_valued_column1")
  real_valued_col2 = fc.real_valued_column("real_valued_column2", 5)

  # Bucketized columns over real-valued sources.
  bucket_source1 = fc.real_valued_column(
      "real_valued_column_for_bucketization1")
  bucketized_col1 = fc.bucketized_column(bucket_source1, [0, 4])
  bucket_source2 = fc.real_valued_column(
      "real_valued_column_for_bucketization2", 4)
  bucketized_col2 = fc.bucketized_column(bucket_source2, [0, 4])

  # Crossed, one-hot and scattered-embedding columns.
  cross_a = fc.sparse_column_with_hash_bucket(
      "cross_aaa", hash_bucket_size=100)
  cross_b = fc.sparse_column_with_hash_bucket(
      "cross_bbb", hash_bucket_size=100)
  cross_col = fc.crossed_column({cross_a, cross_b}, hash_bucket_size=10000)
  one_hot_source = fc.sparse_column_with_hash_bucket(
      "sparse_column_for_one_hot", hash_bucket_size=100)
  one_hot_col = fc.one_hot_column(one_hot_source)
  scattered_embedding_col = fc.scattered_embedding_column(
      "scattered_embedding_column", size=100, dimension=10, hash_key=1)

  # Note: `str_sparse_id_col` is only included through `weighted_id_col`.
  feature_columns = {
      sparse_col, embedding_col, weighted_id_col, int32_sparse_id_col,
      int64_sparse_id_col, real_valued_col1, real_valued_col2,
      bucketized_col1, bucketized_col2, cross_col, one_hot_col,
      scattered_embedding_col
  }

  var_len = parsing_ops.VarLenFeature
  fixed_len = parsing_ops.FixedLenFeature
  expected_config = {
      "sparse_column": var_len(dtypes.string),
      "sparse_column_for_embedding": var_len(dtypes.string),
      "str_id_column": var_len(dtypes.string),
      "int32_id_column": var_len(dtypes.int32),
      "int64_id_column": var_len(dtypes.int64),
      "str_id_weights_column": var_len(dtypes.float32),
      "real_valued_column1": fixed_len([1], dtype=dtypes.float32),
      "real_valued_column2": fixed_len([5], dtype=dtypes.float32),
      "real_valued_column_for_bucketization1": fixed_len(
          [1], dtype=dtypes.float32),
      "real_valued_column_for_bucketization2": fixed_len(
          [4], dtype=dtypes.float32),
      "cross_aaa": var_len(dtypes.string),
      "cross_bbb": var_len(dtypes.string),
      "sparse_column_for_one_hot": var_len(dtypes.string),
      "scattered_embedding_column": var_len(dtypes.string),
  }

  config = fc.create_feature_spec_for_parsing(feature_columns)
  self.assertDictEqual(expected_config, config)

  # Tests that contrib feature columns work with core library:
  config_core = fc_core.make_parse_example_spec(feature_columns)
  self.assertDictEqual(expected_config, config_core)

  # Test that the same config is parsed out if we pass a dictionary.
  feature_columns_dict = {
      str(index): column for index, column in enumerate(feature_columns)
  }
  config = fc.create_feature_spec_for_parsing(feature_columns_dict)
  self.assertDictEqual(expected_config, config)