def test_with_1d_sparse_tensor():
    tf.compat.v1.reset_default_graph()
    # Categorical column: vocabulary list with OOV hash buckets
    body_style = tf.feature_column.categorical_column_with_vocabulary_list(
        'name', vocabulary_list=['anna', 'gary', 'bob'], num_oov_buckets=2)  # yields sparse ids
    
    # Dense (plain list) input
    builder = _LazyBuilder({
        'name': ['anna', 'gary', 'alsa'],
      })
    
    # Sparse input
    builder2 = _LazyBuilder({
        'name': tf.SparseTensor(
            indices=((0,), (1,), (2,)),
            values=('anna', 'gary', 'alsa'),
            dense_shape=(3,)),
      })

    id_weight_pair = body_style._get_sparse_tensors(builder)
    id_weight_pair2 = body_style._get_sparse_tensors(builder2)


    with tf.compat.v1.Session() as sess:
        sess.run(lookup_ops.tables_initializer())

        id_tensor_eval = id_weight_pair.id_tensor.eval()
        print("稀疏矩阵:\n",id_tensor_eval)
        id_tensor_eval2 = id_weight_pair2.id_tensor.eval()
        print("稀疏矩阵2:\n",id_tensor_eval2)
          
        dense_decoded = tf.sparse.to_dense(id_tensor_eval, default_value=-1).eval(session=sess)
        print("dense ids:\n", dense_decoded)
  def test_sequence_length_with_empty_rows(self):
    """Tests _sequence_length when some examples do not have ids."""
    vocabulary_size = 3
    sparse_input_a = sparse_tensor.SparseTensorValue(
        # example 0, ids []
        # example 1, ids [2]
        # example 2, ids [0, 1]
        # example 3, ids []
        # example 4, ids [1]
        # example 5, ids []
        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(6, 2))
    expected_sequence_length_a = [0, 1, 2, 0, 1, 0]
    categorical_column_a = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)

    sparse_input_b = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids []
        # example 2, ids []
        # example 3, ids []
        # example 4, ids [1]
        # example 5, ids [0, 1]
        indices=((0, 0), (4, 0), (5, 0), (5, 1)),
        values=(2, 1, 0, 1),
        dense_shape=(6, 2))
    expected_sequence_length_b = [1, 0, 0, 0, 1, 2]
    categorical_column_b = sfc.sequence_categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)

    shared_embedding_columns = fc.shared_embedding_columns(
        [categorical_column_a, categorical_column_b], dimension=2)

    sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
        _LazyBuilder({
            'aaa': sparse_input_a
        }))[1]
    sequence_length_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
        _LazyBuilder({
            'bbb': sparse_input_b
        }))[1]

    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(
          expected_sequence_length_a, sequence_length_a.eval(session=sess))
      self.assertAllEqual(
          expected_sequence_length_b, sequence_length_b.eval(session=sess))
    def test_sequence_length_with_empty_rows(self):
        """Tests _sequence_length when some examples do not have ids."""
        vocabulary_size = 3
        sparse_input = sparse_tensor.SparseTensorValue(
            # example 0, ids []
            # example 1, ids [2]
            # example 2, ids [0, 1]
            # example 3, ids []
            # example 4, ids [1]
            # example 5, ids []
            indices=((1, 0), (2, 0), (2, 1), (4, 0)),
            values=(2, 0, 1, 1),
            dense_shape=(6, 2))
        expected_sequence_length = [0, 1, 2, 0, 1, 0]

        categorical_column = sfc.sequence_categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        indicator_column = sfc._sequence_indicator_column(categorical_column)

        _, sequence_length = indicator_column._get_sequence_dense_tensor(
            _LazyBuilder({'aaa': sparse_input}))

        with monitored_session.MonitoredSession() as sess:
            self.assertAllEqual(expected_sequence_length,
                                sequence_length.eval(session=sess))
    def test_transformations_called_once(self):
        class TransformCounter(fc._FeatureColumn):
            def __init__(self):
                self.num_transform = 0

            @property
            def name(self):
                return 'TransformCounter'

            def _transform_feature(self, cache):
                self.num_transform += 1  # Count transform calls.
                return cache.get('a')

            @property
            def _parse_example_config(self):
                pass

        builder = fc._LazyBuilder(
            features={'a': constant_op.constant([[2], [3.]])})
        column = TransformCounter()
        self.assertEqual(0, column.num_transform)
        builder.get(column)
        self.assertEqual(1, column.num_transform)
        builder.get(column)
        self.assertEqual(1, column.num_transform)
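# The test above depends on _LazyBuilder memoizing transformed features, so each
# column's _transform_feature runs once per builder. A minimal sketch of that
# caching pattern (a hypothetical simplification, not the real implementation):
class MiniLazyBuilder(object):
    """Hypothetical, simplified version of _LazyBuilder's memoization."""

    def __init__(self, features):
        # Raw feature tensors double as the cache; transformed columns are
        # stored under the column object itself as the key.
        self._feature_tensors = dict(features)

    def get(self, key):
        if key in self._feature_tensors:
            return self._feature_tensors[key]  # cache hit: no re-transform
        transformed = key._transform_feature(self)  # computed exactly once
        self._feature_tensors[key] = transformed
        return transformed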
def test_categorical_column_with_vocabulary_list():

    color_data = {
        'color': [['R', 'R'], ['G', 'R'], ['B', 'G'], ['A', 'A']]
    }  # 4 example rows

    builder = _LazyBuilder(color_data)

    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)

    color_column_tensor = color_column._get_sparse_tensors(builder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        session.run(tf.tables_initializer())

        print(session.run([color_column_tensor.id_tensor]))

    # Convert the sparse ids to a dense one-hot encoding (multi-hot here, since
    # each row can contain several tokens)
    color_column_identity = feature_column.indicator_column(color_column)

    color_dense_tensor = feature_column.input_layer(color_data,
                                                    [color_column_identity])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())

        session.run(tf.tables_initializer())

        print('use input_layer' + '_' * 40)
        print(session.run([color_dense_tensor]))
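# For reference, the vocabulary lookup above is deterministic: 'R' -> 0,
# 'G' -> 1, 'B' -> 2, and the out-of-vocabulary 'A' falls back to
# default_value=-1, so the printed id tensor holds the rows
# [0, 0], [1, 0], [2, 1], [-1, -1].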
def test_shared_embedding_column_with_hash_bucket():
    color_data = {
        'color': [[2, 2], [5, 5], [0, -1], [0, 0]],
        'color2': [[2], [5], [-1], [0]]
    }  # 4 example rows
    builder = _LazyBuilder(color_data)
    color_column = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    color_column2 = feature_column.categorical_column_with_hash_bucket(
        'color2', 7, dtype=tf.int32)
    color_column_tensor2 = color_column2._get_sparse_tensors(builder)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('not use input_layer' + '_' * 40)
        print(session.run([color_column_tensor.id_tensor]))
        print(session.run([color_column_tensor2.id_tensor]))

    # Convert the sparse ids to a dense one-hot (multi-hot) encoding
    color_column_embed = feature_column.shared_embedding_columns(
        [color_column2, color_column], 3, combiner='sum')
    print(type(color_column_embed))
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    color_column_embed)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run(color_dense_tensor))
  def test_get_sequence_dense_tensor_with_normalizer_fn(self):

    def _increment_two(input_sparse_tensor):
      return sparse_ops.sparse_add(
          input_sparse_tensor,
          sparse_tensor.SparseTensor(((0, 0), (1, 1)), (2.0, 2.0), (2, 2))
      )

    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, values [[0.], [1]]
        # example 1, [[10.]]
        indices=((0, 0), (0, 1), (1, 0)),
        values=(0., 1., 10.),
        dense_shape=(2, 2))

    # Before _increment_two:
    #   [[0.], [1.]],
    #   [[10.], [0.]],
    # After _increment_two:
    #   [[2.], [1.]],
    #   [[10.], [2.]],
    expected_dense_tensor = [
        [[2.], [1.]],
        [[10.], [2.]],
    ]
    numeric_column = sfc.sequence_numeric_column(
        'aaa', normalizer_fn=_increment_two)

    dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))

    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(
          expected_dense_tensor, dense_tensor.eval(session=sess))
def test_categorical_column_with_hash_bucket():
    # 1. Input features
    color_data = {'color': [[2], [5], [-1], [0]]}
    builder = _LazyBuilder(color_data)
    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # 2. Feature columns (Dense)
    # Convert the Categorical Column to Dense Column
    color_column_identity = feature_column.indicator_column(color_column)
    # 3. Feature tensor
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    [color_column_identity])

    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([color_dense_tensor]))
def test_weighted_categorical_column():
    # 1. Input features
    color_data = {
        'color': [['R'], ['G'], ['B'], ['A']],
        'weight': [[1.0], [2.0], [4.0], [8.0]]
    }
    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    # 2. Feature columns (Sparse)
    color_weight_categorical_column \
        = feature_column.weighted_categorical_column(color_column, 'weight')
    builder = _LazyBuilder(color_data)
    id_tensor, weight = color_weight_categorical_column._get_sparse_tensors(
        builder)

    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('weighted categorical' + '-' * 40)
        print(session.run([id_tensor]))
        print('-' * 40)
        print(session.run([weight]))

    # 2. Feature columns (Dense)
    weighted_column = feature_column.indicator_column(
        color_weight_categorical_column)
    # 3. Feature tensor
    weighted_column_dense_tensor = feature_column.input_layer(
        color_data, [weighted_column])
    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([weighted_column_dense_tensor]))
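# For reference (assuming the vocabulary order ['R', 'G', 'B'] above, and that
# indicator_column drops the OOV id -1): the weighted multi-hot rows should be
# [1, 0, 0], [0, 2, 0], [0, 0, 4], [0, 0, 0], i.e. each slot holds the summed
# weight of that id.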
    def test_get_sequence_dense_tensor(self):
        vocabulary_size = 3
        sparse_input = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            # example 2, ids []
            # example 3, ids [1]
            indices=((0, 0), (1, 0), (1, 1), (3, 0)),
            values=(2, 0, 1, 1),
            dense_shape=(4, 2))

        expected_lookups = [
            # example 0, ids [2]
            [[0., 0., 1.], [0., 0., 0.]],
            # example 1, ids [0, 1]
            [[1., 0., 0.], [0., 1., 0.]],
            # example 2, ids []
            [[0., 0., 0.], [0., 0., 0.]],
            # example 3, ids [1]
            [[0., 1., 0.], [0., 0., 0.]],
        ]

        categorical_column = sfc.sequence_categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        indicator_column = sfc._sequence_indicator_column(categorical_column)

        indicator_tensor, _ = indicator_column._get_sequence_dense_tensor(
            _LazyBuilder({'aaa': sparse_input}))

        with monitored_session.MonitoredSession() as sess:
            self.assertAllEqual(expected_lookups,
                                indicator_tensor.eval(session=sess))
def test_embedding():
    tf.set_random_seed(1)
    # 1. Input features
    color_data = {'color': [['R', 'G'], ['G', 'A'], ['B', 'B'], ['A', 'A']]}
    builder = _LazyBuilder(color_data)
    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # 2. Feature columns (Dense)
    color_embedding = feature_column.embedding_column(color_column,
                                                      4,
                                                      combiner='sum')
    # 3. Feature tensor
    color_embedding_dense_tensor = feature_column.input_layer(
        color_data, [color_embedding])

    with tf.Session() as session:
        # Embedding needs variables (weights) to do the embedding
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('embedding' + '_' * 40)
        print(session.run([color_embedding_dense_tensor]))
def test_shared_embedding_column_with_hash_bucket():
    # 1. Input features
    color_data = {
        'range': [[2, 2], [5, 5], [0, -1], [0, 0]],
        'id': [[2], [5], [-1], [0]]
    }
    builder = _LazyBuilder(color_data)
    # 2. Feature columns (Sparse)
    color_column = feature_column.categorical_column_with_hash_bucket(
        'range', 7, dtype=tf.int32)
    color_column_tensor = color_column._get_sparse_tensors(builder)
    # 2. Feature columns (Sparse)
    color_column2 = feature_column.categorical_column_with_hash_bucket(
        'id', 7, dtype=tf.int32)
    color_column_tensor2 = color_column2._get_sparse_tensors(builder)
    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('not use input_layer' + '_' * 40)
        print(session.run([color_column_tensor.id_tensor]))
        print(session.run([color_column_tensor2.id_tensor]))

    # 2. Feature columns (Dense)
    color_column_embed = feature_column.shared_embedding_columns(
        [color_column2, color_column], 3, combiner='sum')
    print(type(color_column_embed))
    # 3. Feature tensor
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    color_column_embed)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run(color_dense_tensor))
 def test_tensor_dtype_should_be_string_or_integer(self):
     string_fc = fc.categorical_column_with_hash_bucket('a_string',
                                                        10,
                                                        dtype=dtypes.string)
     int_fc = fc.categorical_column_with_hash_bucket('a_int',
                                                     10,
                                                     dtype=dtypes.int32)
     float_fc = fc.categorical_column_with_hash_bucket('a_float',
                                                       10,
                                                       dtype=dtypes.string)
     int_tensor = sparse_tensor.SparseTensor(values=constant_op.constant(
         [101]),
                                             indices=[[0, 0]],
                                             dense_shape=[1, 1])
     string_tensor = sparse_tensor.SparseTensor(values=constant_op.constant(
         ['101']),
                                                indices=[[0, 0]],
                                                dense_shape=[1, 1])
     float_tensor = sparse_tensor.SparseTensor(values=constant_op.constant(
         [101.]),
                                               indices=[[0, 0]],
                                               dense_shape=[1, 1])
     builder = fc._LazyBuilder({
         'a_int': int_tensor,
         'a_string': string_tensor,
         'a_float': float_tensor
     })
     builder.get(string_fc)
     builder.get(int_fc)
     with self.assertRaisesRegexp(ValueError,
                                  'dtype must be string or integer'):
         builder.get(float_fc)
  def test_get_sequence_dense_tensor(self):
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        # example 2, ids []
        # example 3, ids [1]
        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(4, 2))

    expected_lookups = [
        # example 0, ids [2]
        [[0., 0., 1.], [0., 0., 0.]],
        # example 1, ids [0, 1]
        [[1., 0., 0.], [0., 1., 0.]],
        # example 2, ids []
        [[0., 0., 0.], [0., 0., 0.]],
        # example 3, ids [1]
        [[0., 1., 0.], [0., 0., 0.]],
    ]

    categorical_column = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    indicator_column = fc.indicator_column(categorical_column)

    indicator_tensor, _ = indicator_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))

    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(expected_lookups, indicator_tensor.eval(session=sess))
def test_weighted_cate_column():
    # NOTE: id='' marks a missing value; its weight must be 0, otherwise the id
    # and weight columns end up with different lengths and an error is raised.
    # Weights must also be float; passing ints raises an error.
    x_values = {
        'id': [[b'a', b'z', b'a', b'c'], [b'b', b'', b'd', b'b']],
        'weight': [[1.0, 2.0, -3.0, 4.0], [5.0, 0.0, 7.0, -8.0]]
    }
    builder = _LazyBuilder(x_values)  # lazy representation of input

    # ================== define ops
    sparse_id_featcol = feature_column.categorical_column_with_vocabulary_list(
        'id', ['a', 'b', 'c'], dtype=tf.string, default_value=-1)
    sparse_featcol = feature_column.weighted_categorical_column(
        categorical_column=sparse_id_featcol, weight_feature_key='weight')
    x_sparse_tensor = sparse_featcol._get_sparse_tensors(builder)

    # indicator_column converts the sparse tensor into a dense multi-hot (MHE)
    # tensor, shape=[batch_size, #tokens]; each entry is the sum of all weights
    # with which that token appears.
    dense_featcol = feature_column.indicator_column(sparse_featcol)
    x_dense_tensor = feature_column.input_layer(x_values, [dense_featcol])

    # ================== run
    with tf.Session() as sess:
        # The lookup tables must be initialized, otherwise an error is raised
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())

        id_sparse_value, weight_sparse_value = sess.run(
            [x_sparse_tensor.id_tensor, x_sparse_tensor.weight_tensor])

        print("************************* sparse id tensor")
        # The id_tensor keeps the shape of the raw input:
        # [batch_size, max_tokens_per_example] = [2, 4]
        # SparseTensorValue(indices=array(
        #       [[0, 0],
        #        [0, 1],
        #        [0, 2],
        #        [0, 3],
        #        [1, 0],
        #        [1, 2],
        #        [1, 3]]), values=array([ 0, -1,  0,  2,  1, -1,  1]), dense_shape=array([2, 4]))
        print(id_sparse_value)

        print("************************* sparse weight tensor")
        # The weight_tensor likewise keeps the shape of the raw input:
        # [batch_size, max_tokens_per_example] = [2, 4]
        # SparseTensorValue(indices=array(
        #       [[0, 0],
        #        [0, 1],
        #        [0, 2],
        #        [0, 3],
        #        [1, 0],
        #        [1, 2],
        #        [1, 3]]), values=array([ 1.,  2., -3.,  4.,  5.,  7., -8.], dtype=float32), dense_shape=array([2, 4]))
        print(weight_sparse_value)

        print("************************* dense MHE tensor")
        # indicator_column turns the sparse tensor into a dense multi-hot tensor
        # of shape [batch_size, total_tokens_in_vocab]; each value is the sum of
        # the weights of all occurrences of that token.
        # [[-2.  0.  4.]
        #  [ 0. -3.  0.]]
        print(sess.run(x_dense_tensor))
def test_multi_value_embedding():
    color_data = {
        'color': [['G', 'G'], ['G', 'B'], ['B', 'B'], ['G', 'R'], ['R', 'R'],
                  ['B', 'R']]
    }

    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)

    color_embedding = feature_column.embedding_column(color_column, 7)
    color_embedding_dense_tensor = feature_column.input_layer(
        color_data, [color_embedding])
    builder = _LazyBuilder(color_data)
    color_column_tensor = color_column._get_sparse_tensors(builder)

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('embedding' + '-' * 40)
        print(session.run([color_embedding_dense_tensor]))
def test_weighted_categorical_column():
    color_data = {
        'color': [['R'], ['G'], ['B'], ['A']],
        'weight': [[1.0], [2.0], [4.0], [8.0]]
    }  # 4 example rows

    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)

    color_weight_categorical_column = feature_column.weighted_categorical_column(
        color_column, 'weight')

    builder = _LazyBuilder(color_data)

    with tf.Session() as session:
        id_tensor, weight = color_weight_categorical_column._get_sparse_tensors(
            builder)

        session.run(tf.global_variables_initializer())

        session.run(tf.tables_initializer())

        print('weighted categorical' + '-' * 40)

        print(session.run([id_tensor]))
        print('-' * 40)
        print(session.run([weight]))
  def test_sequence_length_with_empty_rows(self):
    """Tests _sequence_length when some examples do not have ids."""
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids []
        # example 1, ids [2]
        # example 2, ids [0, 1]
        # example 3, ids []
        # example 4, ids [1]
        # example 5, ids []
        indices=((1, 0), (2, 0), (2, 1), (4, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(6, 2))
    expected_sequence_length = [0, 1, 2, 0, 1, 0]

    categorical_column = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    indicator_column = fc.indicator_column(categorical_column)

    _, sequence_length = indicator_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))

    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(
          expected_sequence_length, sequence_length.eval(session=sess))
def test_categorical_column_with_hash_bucket():
    # Source data
    color_data = {'color': [[2], [5], [-1], [0]]}  # 4 example rows, shape=[4, 1]
    builder = _LazyBuilder(color_data)

    # categorical_column
    color_column = feature_column.categorical_column_with_hash_bucket(
        'color', 7, dtype=tf.int32)

    # Sparse tensor representation
    color_column_tensor = color_column._get_sparse_tensors(builder)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))

    # indicator_column converts the sparse ids to a dense one-hot (multi-hot) form
    color_column_identity = feature_column.indicator_column(color_column)

    # input_layer connects the data source with the declared columns and yields
    # a new tensor
    color_dense_tensor = feature_column.input_layer(color_data,
                                                    [color_column_identity])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([color_dense_tensor]))
def test_embedding():
    tf.set_random_seed(1)
    # Source data
    color_data = {
        'color': [['R', 'G'], ['G', 'A'], ['B', 'B'], ['A', 'A']]
    }  # 4 example rows
    builder = _LazyBuilder(color_data)

    # To embed a feature, first express the raw column as a categorical_column.
    # This only declares the column; no data is attached yet.
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    # Express the data source as a (sparse) tensor
    color_column_tensor = color_column._get_sparse_tensors(builder)

    # Build the embedding_column: the first argument is the categorical_column,
    # the second is the embedding dimension
    color_embedding_column = feature_column.embedding_column(color_column,
                                                             4,
                                                             combiner='sum')

    # input_layer(data, columns) connects the data source with the embedding_column
    color_embedding_dense_tensor = feature_column.input_layer(
        color_data, [color_embedding_column])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([color_column_tensor.id_tensor]))
        print('embedding' + '_' * 40)
        print(session.run([color_embedding_dense_tensor]))
  def test_get_dense_tensor(self):
    # Inputs.
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        # example 2, ids []
        # example 3, ids [1]
        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(4, 5))

    # Embedding variable.
    embedding_dimension = 2
    embedding_values = (
        (1., 2.),  # id 0
        (3., 5.),  # id 1
        (7., 11.)  # id 2
    )

    def _initializer(shape, dtype, partition_info):
      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
      self.assertEqual(dtypes.float32, dtype)
      self.assertIsNone(partition_info)
      return embedding_values

    # Expected lookup result, using combiner='mean'.
    expected_lookups = (
        # example 0, ids [2], embedding = [7, 11]
        (7., 11.),
        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
        (2., 3.5),
        # example 2, ids [], embedding = [0, 0]
        (0., 0.),
        # example 3, ids [1], embedding = [3, 5]
        (3., 5.),
    )

    # Build columns.
    categorical_column = fc_lib.categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    embedding_column = tpu_fc.embedding_column(
        categorical_column,
        dimension=embedding_dimension,
        initializer=_initializer)

    # Provide sparse input and get dense result.
    embedding_lookup = embedding_column._get_dense_tensor(
        fc._LazyBuilder({
            'aaa': sparse_input
        }))

    # Assert expected embedding variable and lookups.
    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    self.assertItemsEqual(('embedding_weights:0',),
                          tuple([v.name for v in global_vars]))
    with _initialized_session():
      self.assertAllEqual(embedding_values, global_vars[0])
      self.assertAllEqual(expected_lookups, embedding_lookup)
  def test_get_dense_tensor(self):
    # Inputs.
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        # example 2, ids []
        # example 3, ids [1]
        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(4, 5))

    # Embedding variable.
    embedding_dimension = 2
    embedding_values = (
        (1., 2.),  # id 0
        (3., 5.),  # id 1
        (7., 11.)  # id 2
    )

    def _initializer(shape, dtype, partition_info):
      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
      self.assertEqual(dtypes.float32, dtype)
      self.assertIsNone(partition_info)
      return embedding_values

    # Expected lookup result, using combiner='mean'.
    expected_lookups = (
        # example 0, ids [2], embedding = [7, 11]
        (7., 11.),
        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
        (2., 3.5),
        # example 2, ids [], embedding = [0, 0]
        (0., 0.),
        # example 3, ids [1], embedding = [3, 5]
        (3., 5.),
    )

    # Build columns.
    categorical_column = fc_lib.categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    embedding_column = tpu_fc.embedding_column(
        categorical_column,
        dimension=embedding_dimension,
        initializer=_initializer)

    # Provide sparse input and get dense result.
    embedding_lookup = embedding_column._get_dense_tensor(
        fc._LazyBuilder({
            'aaa': sparse_input
        }))

    # Assert expected embedding variable and lookups.
    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    self.assertItemsEqual(('embedding_weights:0',),
                          tuple([v.name for v in global_vars]))
    with _initialized_session():
      self.assertAllEqual(embedding_values, global_vars[0].eval())
      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
def show_column1(data: dict, feature_column):
    builder = _LazyBuilder(data)
    id_tensor, weight = feature_column._get_sparse_tensors(builder)

    if weight is None:
        return id_tensor.values
    else:
        return id_tensor.values, weight.values
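# A hypothetical usage sketch for the helper above (feature names are
# illustrative; the returned values are symbolic tensors that still need a
# session to evaluate):
data = {'color': [['R'], ['G'], ['B']]}
color_col = feature_column.categorical_column_with_vocabulary_list(
    'color', ['R', 'G', 'B'])
id_values = show_column1(data, color_col)  # no weight column -> id values only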
 def test_dtype_should_match_with_tensor(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket(
         'wire', 10, dtype=dtypes.int64)
     wire_tensor = sparse_tensor.SparseTensor(values=['omar'],
                                              indices=[[0, 0]],
                                              dense_shape=[1, 1])
     builder = fc._LazyBuilder({'wire': wire_tensor})
     with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
         builder.get(hashed_sparse)
 def test_get_sparse_tensors(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
     wire_tensor = sparse_tensor.SparseTensor(
         values=['omar', 'stringer', 'marlo'],
         indices=[[0, 0], [1, 0], [1, 1]],
         dense_shape=[2, 2])
     builder = fc._LazyBuilder({'wire': wire_tensor})
     self.assertEqual(builder.get(hashed_sparse),
                      hashed_sparse._get_sparse_tensors(builder).id_tensor)
 def test_sparse_tensor_not_supported(self):
     price = fc.numeric_column('price')
     builder = fc._LazyBuilder({
         'price':
         sparse_tensor.SparseTensor(indices=[[0, 0]],
                                    values=[0.3],
                                    dense_shape=[1, 1])
     })
     with self.assertRaisesRegexp(ValueError, 'must be a Tensor'):
         price._transform_feature(builder)
    def test_get_dense_tensor(self):
        def _increment_two(input_tensor):
            return input_tensor + 2.

        price = fc.numeric_column('price',
                                  shape=[2],
                                  normalizer_fn=_increment_two)
        builder = fc._LazyBuilder(
            {'price': constant_op.constant([[1., 2.], [5., 6.]])})
        self.assertEqual(builder.get(price), price._get_dense_tensor(builder))
def practise():
    fx = {'x': [['a', 'a'], ['b', 'c'], ['c', 'e'], ['d', ''], ['e', 'f']]}
    fc = feature_column.categorical_column_with_hash_bucket('x', 5)
    fic = feature_column.indicator_column(fc)
    t2 = fc._get_sparse_tensors(_LazyBuilder(fx)).id_tensor
    tsor = feature_column.input_layer(fx, fic)

    with tf.Session() as sess:
        print(sess.run(t2))
        print(sess.run(tsor))
 def calc_weight(self):
     with ops.name_scope(None, 'weights', values=self.features.values()):
         if (self.weight_column is None
                 or self.mode == model_fn.ModeKeys.PREDICT
                 or self.mode == model_fn.ModeKeys.EVAL):
             return 1.
         weight_column = feature_column_lib.numeric_column(
             key=self.weight_column)
         weights = weight_column._get_dense_tensor(
             feature_column_lib._LazyBuilder(self.features))
         weights = math_ops.to_float(weights, name='weights')
         return weights
    def test_key_should_be_string_or_feature_column(self):
        class NotAFeatureColumn(object):
            pass

        builder = fc._LazyBuilder(
            features={'a': constant_op.constant([[2], [3.]])})
        with self.assertRaisesRegexp(
                TypeError,
                '"key" must be either a "str" or "_FeatureColumn".'):
            builder.get(NotAFeatureColumn())
    def dnn_logit_fn(features, mode):
        with variable_scope.variable_scope(
                'input_from_feature_columns',
                values=tuple(six.itervalues(features)),
                partitioner=input_layer_partitioner):
            inputs = feature_column_lib.input_layer(
                features=features, feature_columns=feature_columns)
            dense = inputs
        for layer_id, num_hidden_units in enumerate(hidden_units):
            with variable_scope.variable_scope(
                    'dense_layer_%d' % layer_id,
                    values=(dense, )) as hidden_layer_scope:
                dense = core_layers.dense(
                    dense,
                    units=num_hidden_units,
                    activation=activation_fn,
                    kernel_initializer=init_ops.glorot_uniform_initializer(),
                    name=hidden_layer_scope)
                if dropout is not None and mode == model_fn.ModeKeys.TRAIN:
                    dense = core_layers.dropout(dense,
                                                rate=dropout,
                                                training=True)
            _add_hidden_layer_summary(dense, hidden_layer_scope.name)

        with variable_scope.variable_scope(
                'fm_layer', values=(inputs, )) as cross_layer_scope:
            builder = feature_column_lib._LazyBuilder(features)
            fm_outputs = []
            for col_pair in fm_feature_columns:
                column1, column2 = col_pair
                tensor1 = column1._get_dense_tensor(builder, trainable=True)
                num_elements = column1._variable_shape.num_elements()
                batch_size = array_ops.shape(tensor1)[0]
                tensor2 = column2._get_dense_tensor(builder, trainable=True)
                tensor1 = array_ops.reshape(tensor1,
                                            shape=(batch_size, num_elements))
                tensor2 = array_ops.reshape(tensor2,
                                            shape=(batch_size, num_elements))
                fm_outputs.append(matmul(tensor1, tensor2))
            fm_outputs = tf.convert_to_tensor(fm_outputs)
        _add_hidden_layer_summary(fm_outputs, cross_layer_scope.name)

        with variable_scope.variable_scope(
                'logits', values=(dense, fm_outputs)) as logits_scope:
            dense_cross = concat([dense, fm_outputs], axis=1)
            logits = core_layers.dense(
                dense_cross,
                units=1,
                activation=None,
                kernel_initializer=init_ops.glorot_uniform_initializer(),
                name=logits_scope)
        _add_hidden_layer_summary(logits, logits_scope.name)

        return logits
  def test_sequence_length(self):
    vocabulary_size = 3

    sparse_input_a = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        indices=((0, 0), (1, 0), (1, 1)),
        values=(2, 0, 1),
        dense_shape=(2, 2))
    expected_sequence_length_a = [1, 2]
    categorical_column_a = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)

    sparse_input_b = sparse_tensor.SparseTensorValue(
        # example 0, ids [0, 2]
        # example 1, ids [1]
        indices=((0, 0), (0, 1), (1, 0)),
        values=(0, 2, 1),
        dense_shape=(2, 2))
    expected_sequence_length_b = [2, 1]
    categorical_column_b = sfc.sequence_categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)
    shared_embedding_columns = fc.shared_embedding_columns(
        [categorical_column_a, categorical_column_b], dimension=2)

    sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
        _LazyBuilder({
            'aaa': sparse_input_a
        }))[1]
    sequence_length_b = shared_embedding_columns[1]._get_sequence_dense_tensor(
        _LazyBuilder({
            'bbb': sparse_input_b
        }))[1]

    with monitored_session.MonitoredSession() as sess:
      sequence_length_a = sess.run(sequence_length_a)
      self.assertAllEqual(expected_sequence_length_a, sequence_length_a)
      self.assertEqual(np.int64, sequence_length_a.dtype)
      sequence_length_b = sess.run(sequence_length_b)
      self.assertAllEqual(expected_sequence_length_b, sequence_length_b)
      self.assertEqual(np.int64, sequence_length_b.dtype)
    def test_get_sequence_dense_tensor(self):
        vocabulary_size = 3
        sparse_input = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            # example 2, ids []
            # example 3, ids [1]
            indices=((0, 0), (1, 0), (1, 1), (3, 0)),
            values=(2, 0, 1, 1),
            dense_shape=(4, 2))

        embedding_dimension = 2
        embedding_values = (
            (1., 2.),  # id 0
            (3., 5.),  # id 1
            (7., 11.)  # id 2
        )

        def _initializer(shape, dtype, partition_info):
            self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
            self.assertEqual(dtypes.float32, dtype)
            self.assertIsNone(partition_info)
            return embedding_values

        expected_lookups = [
            # example 0, ids [2]
            [[7., 11.], [0., 0.]],
            # example 1, ids [0, 1]
            [[1., 2.], [3., 5.]],
            # example 2, ids []
            [[0., 0.], [0., 0.]],
            # example 3, ids [1]
            [[3., 5.], [0., 0.]],
        ]

        categorical_column = sfc.sequence_categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        embedding_column = sfc._sequence_embedding_column(
            categorical_column,
            dimension=embedding_dimension,
            initializer=_initializer)

        embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
            _LazyBuilder({'aaa': sparse_input}))

        global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
        self.assertItemsEqual(('embedding_weights:0', ),
                              tuple([v.name for v in global_vars]))
        with monitored_session.MonitoredSession() as sess:
            self.assertAllEqual(embedding_values,
                                global_vars[0].eval(session=sess))
            self.assertAllEqual(expected_lookups,
                                embedding_lookup.eval(session=sess))
  def test_sequence_length(self):
    column = sfc.sequence_categorical_column_with_hash_bucket(
        'aaa', hash_bucket_size=10)
    inputs = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=('omar', 'stringer', 'marlo'),
        dense_shape=(2, 2))
    expected_sequence_length = [1, 2]

    sequence_length = column._sequence_length(_LazyBuilder({'aaa': inputs}))

    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(
          expected_sequence_length, sequence_length.eval(session=sess))
  def test_sequence_length_with_zeros(self):
    column = sfc.sequence_categorical_column_with_identity(
        'aaa', num_buckets=3)
    inputs = sparse_tensor.SparseTensorValue(
        indices=((1, 0), (3, 0), (3, 1)),
        values=(1, 2, 0),
        dense_shape=(5, 2))
    expected_sequence_length = [0, 1, 0, 2, 0]

    sequence_length = column._sequence_length(_LazyBuilder({'aaa': inputs}))

    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(
          expected_sequence_length, sequence_length.eval(session=sess))
  def test_get_sequence_dense_tensor(self):
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        # example 2, ids []
        # example 3, ids [1]
        indices=((0, 0), (1, 0), (1, 1), (3, 0)),
        values=(2, 0, 1, 1),
        dense_shape=(4, 2))

    embedding_dimension = 2
    embedding_values = (
        (1., 2.),  # id 0
        (3., 5.),  # id 1
        (7., 11.)  # id 2
    )
    def _initializer(shape, dtype, partition_info):
      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
      self.assertEqual(dtypes.float32, dtype)
      self.assertIsNone(partition_info)
      return embedding_values

    expected_lookups = [
        # example 0, ids [2]
        [[7., 11.], [0., 0.]],
        # example 1, ids [0, 1]
        [[1., 2.], [3., 5.]],
        # example 2, ids []
        [[0., 0.], [0., 0.]],
        # example 3, ids [1]
        [[3., 5.], [0., 0.]],
    ]

    categorical_column = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    embedding_column = fc.embedding_column(
        categorical_column, dimension=embedding_dimension,
        initializer=_initializer)

    embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))

    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    self.assertItemsEqual(
        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(embedding_values, global_vars[0].eval(session=sess))
      self.assertAllEqual(expected_lookups, embedding_lookup.eval(session=sess))
  def test_sequence_length(self):
    column = sfc.sequence_categorical_column_with_identity(
        'aaa', num_buckets=3)
    inputs = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=(1, 2, 0),
        dense_shape=(2, 2))
    expected_sequence_length = [1, 2]

    sequence_length = column._sequence_length(_LazyBuilder({'aaa': inputs}))

    with monitored_session.MonitoredSession() as sess:
      sequence_length = sess.run(sequence_length)
      self.assertAllEqual(expected_sequence_length, sequence_length)
      self.assertEqual(np.int64, sequence_length.dtype)
  def test_sequence_length(self):
    column = sfc.sequence_categorical_column_with_vocabulary_file(
        key='aaa',
        vocabulary_file=self._wire_vocabulary_file_name,
        vocabulary_size=self._wire_vocabulary_size)
    inputs = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=('marlo', 'skywalker', 'omar'),
        dense_shape=(2, 2))
    expected_sequence_length = [1, 2]

    sequence_length = column._sequence_length(_LazyBuilder({'aaa': inputs}))

    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(
          expected_sequence_length, sequence_length.eval(session=sess))
def _weights(features, weight_column):
  """Fetches weights from features."""
  if weight_column is None:
    return 1.
  if isinstance(weight_column, six.string_types):
    weight_column = feature_column_lib.numeric_column(key=weight_column)
  if not isinstance(weight_column, feature_column_lib._NumericColumn):  # pylint: disable=protected-access
    raise TypeError('Weight column must be either a string or _NumericColumn. '
                    'Given type: {}.'.format(type(weight_column)))
  weights = weight_column._get_dense_tensor(  # pylint: disable=protected-access
      feature_column_lib._LazyBuilder(features))  # pylint: disable=protected-access
  if not (weights.dtype.is_floating or weights.dtype.is_integer):
    raise ValueError('Weight column should be castable to float. '
                     'Given dtype: {}'.format(weights.dtype))
  weights = _maybe_expand_dim(math_ops.to_float(weights, name='weights'))
  return weights
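# A minimal usage sketch for _weights (hypothetical feature dict; TF 1.x graph
# mode, reusing the constant_op import shown with the earlier snippets and the
# _maybe_expand_dim helper from the same source file):
features = {'example_weight': constant_op.constant([[2.0], [3.0]])}
weights = _weights(features, 'example_weight')  # float Tensor of shape [2, 1]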
  def test_sequence_length_with_shape(self):
    """Tests _sequence_length with shape !=(1,)."""
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, values [[0.], [1]]
        # example 1, [[10.]]
        indices=((0, 0), (0, 1), (1, 0)),
        values=(0., 1., 10.),
        dense_shape=(2, 2))
    expected_sequence_length = [2, 1]
    numeric_column = sfc.sequence_numeric_column('aaa')

    _, sequence_length = numeric_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))

    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(
          expected_sequence_length, sequence_length.eval(session=sess))
  def test_get_sparse_tensors_inputs3d(self):
    """Tests _get_sparse_tensors when the input is already 3D Tensor."""
    column = sfc.sequence_categorical_column_with_identity(
        'aaa', num_buckets=3)
    inputs = sparse_tensor.SparseTensorValue(
        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
        values=(1, 2, 0),
        dense_shape=(2, 2, 1))

    with self.assertRaisesRegexp(
        errors.InvalidArgumentError,
        r'Column aaa expected ID tensor of rank 2\.\s*'
        r'id_tensor shape:\s*\[2 2 1\]'):
      id_weight_pair = column._get_sparse_tensors(
          _LazyBuilder({'aaa': inputs}))
      with monitored_session.MonitoredSession() as sess:
        id_weight_pair.id_tensor.eval(session=sess)
  def test_sequence_length(self):
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, values [[0., 1., 2.], [3., 4., 5.]]
        # example 1, [[10., 11., 12.]]
        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5),
                 (1, 0), (1, 1), (1, 2)),
        values=(0., 1., 2., 3., 4., 5., 10., 11., 12.),
        dense_shape=(2, 6))
    expected_sequence_length = [2, 1]
    numeric_column = sfc.sequence_numeric_column('aaa', shape=(3,))

    _, sequence_length = numeric_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))

    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(
          expected_sequence_length, sequence_length.eval(session=sess))
  def test_get_sequence_dense_tensor(self):
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, values [[0.], [1]]
        # example 1, [[10.]]
        indices=((0, 0), (0, 1), (1, 0)),
        values=(0., 1., 10.),
        dense_shape=(2, 2))
    expected_dense_tensor = [
        [[0.], [1.]],
        [[10.], [0.]],
    ]
    numeric_column = sfc.sequence_numeric_column('aaa')

    dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))

    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(
          expected_dense_tensor, dense_tensor.eval(session=sess))
  def test_get_sparse_tensors(self):
    column = sfc.sequence_categorical_column_with_identity(
        'aaa', num_buckets=3)
    inputs = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=(1, 2, 0),
        dense_shape=(2, 2))
    expected_sparse_ids = sparse_tensor.SparseTensorValue(
        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
        values=np.array((1, 2, 0), dtype=np.int64),
        dense_shape=(2, 2, 1))

    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))

    self.assertIsNone(id_weight_pair.weight_tensor)
    with monitored_session.MonitoredSession() as sess:
      _assert_sparse_tensor_value(
          self,
          expected_sparse_ids,
          id_weight_pair.id_tensor.eval(session=sess))
  def test_get_dense_tensor_multi_dim(self):
    """Tests get_sequence_dense_tensor for multi-dim numeric_column."""
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, values [[[0., 1.],  [2., 3.]], [[4., 5.],  [6., 7.]]]
        # example 1, [[[10., 11.],  [12., 13.]]]
        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7),
                 (1, 0), (1, 1), (1, 2), (1, 3)),
        values=(0., 1., 2., 3., 4., 5., 6., 7., 10., 11., 12., 13.),
        dense_shape=(2, 8))
    expected_dense_tensor = [
        [[[0., 1.], [2., 3.]], [[4., 5.], [6., 7.]]],
        [[[10., 11.], [12., 13.]], [[0., 0.], [0., 0.]]],
    ]
    numeric_column = sfc.sequence_numeric_column('aaa', shape=(2, 2))

    dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))

    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(
          expected_dense_tensor, dense_tensor.eval(session=sess))
  def test_get_sparse_tensors(self):
    column = sfc.sequence_categorical_column_with_vocabulary_list(
        key='aaa',
        vocabulary_list=('omar', 'stringer', 'marlo'))
    inputs = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=('marlo', 'skywalker', 'omar'),
        dense_shape=(2, 2))
    expected_sparse_ids = sparse_tensor.SparseTensorValue(
        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
        values=np.array((2, -1, 0), dtype=np.int64),
        dense_shape=(2, 2, 1))

    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))

    self.assertIsNone(id_weight_pair.weight_tensor)
    with monitored_session.MonitoredSession() as sess:
      _assert_sparse_tensor_value(
          self,
          expected_sparse_ids,
          id_weight_pair.id_tensor.eval(session=sess))
  def test_sequence_length(self):
    vocabulary_size = 3
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, ids [2]
        # example 1, ids [0, 1]
        indices=((0, 0), (1, 0), (1, 1)),
        values=(2, 0, 1),
        dense_shape=(2, 2))
    expected_sequence_length = [1, 2]

    categorical_column = sfc.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    indicator_column = fc.indicator_column(categorical_column)

    _, sequence_length = indicator_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))

    with monitored_session.MonitoredSession() as sess:
      sequence_length = sess.run(sequence_length)
      self.assertAllEqual(expected_sequence_length, sequence_length)
      self.assertEqual(np.int64, sequence_length.dtype)
  def test_sequence_length_with_empty_rows(self):
    """Tests _sequence_length when some examples do not have ids."""
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, values []
        # example 1, values [[0.], [1.]]
        # example 2, [[2.]]
        # example 3, values []
        # example 4, [[3.]]
        # example 5, values []
        indices=((1, 0), (1, 1), (2, 0), (4, 0)),
        values=(0., 1., 2., 3.),
        dense_shape=(6, 2))
    expected_sequence_length = [0, 2, 1, 0, 1, 0]
    numeric_column = sfc.sequence_numeric_column('aaa')

    _, sequence_length = numeric_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))

    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(
          expected_sequence_length, sequence_length.eval(session=sess))
  def test_get_sequence_dense_tensor_with_shape(self):
    """Tests get_sequence_dense_tensor with shape !=(1,)."""
    sparse_input = sparse_tensor.SparseTensorValue(
        # example 0, values [[0., 1., 2.], [3., 4., 5.]]
        # example 1, [[10., 11., 12.]]
        indices=((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5),
                 (1, 0), (1, 1), (1, 2)),
        values=(0., 1., 2., 3., 4., 5., 10., 11., 12.),
        dense_shape=(2, 6))
    expected_dense_tensor = [
        [[0., 1., 2.], [3., 4., 5.]],
        [[10., 11., 12.], [0., 0., 0.]],
    ]
    numeric_column = sfc.sequence_numeric_column('aaa', shape=(3,))

    dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
        _LazyBuilder({'aaa': sparse_input}))

    with monitored_session.MonitoredSession() as sess:
      self.assertAllEqual(
          expected_dense_tensor, dense_tensor.eval(session=sess))
  def test_get_sparse_tensors(self):
    column = sfc.sequence_categorical_column_with_hash_bucket(
        'aaa', hash_bucket_size=10)
    inputs = sparse_tensor.SparseTensorValue(
        indices=((0, 0), (1, 0), (1, 1)),
        values=('omar', 'stringer', 'marlo'),
        dense_shape=(2, 2))

    expected_sparse_ids = sparse_tensor.SparseTensorValue(
        indices=((0, 0, 0), (1, 0, 0), (1, 1, 0)),
        # Ignored to avoid hash dependence in test.
        values=np.array((0, 0, 0), dtype=np.int64),
        dense_shape=(2, 2, 1))

    id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))

    self.assertIsNone(id_weight_pair.weight_tensor)
    with monitored_session.MonitoredSession() as sess:
      _assert_sparse_tensor_indices_shape(
          self,
          expected_sparse_ids,
          id_weight_pair.id_tensor.eval(session=sess))
def sequence_input_layer(
    features,
    feature_columns,
    weight_collections=None,
    trainable=True):
  """"Builds input layer for sequence input.

  All `feature_columns` must be sequence dense columns with the same
  `sequence_length`. The output of this method can be fed into sequence
  networks, such as RNN.

  The output of this method is a 3D `Tensor` of shape `[batch_size, T, D]`.
  `T` is the maximum sequence length for this batch, which could differ from
  batch to batch.

  If multiple `feature_columns` are given with `Di` `num_elements` each, their
  outputs are concatenated. So, the final `Tensor` has shape
  `[batch_size, T, D0 + D1 + ... + Dn]`.

  Example:

  ```python
  rating = sequence_numeric_column('rating')
  watches = sequence_categorical_column_with_identity(
      'watches', num_buckets=1000)
  watches_embedding = embedding_column(watches, dimension=10)
  columns = [rating, watches_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    features: A dict mapping keys to tensors.
    feature_columns: An iterable of dense sequence columns. Valid columns are
      - `embedding_column` that wraps a `sequence_categorical_column_with_*`
      - `sequence_numeric_column`.
    weight_collections: A list of collection names to which the Variable will be
      added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES`.

  Returns:
    An `(input_layer, sequence_length)` tuple where:
    - input_layer: A float `Tensor` of shape `[batch_size, T, D]`.
        `T` is the maximum sequence length for this batch, which could differ
        from batch to batch. `D` is the sum of `num_elements` for all
        `feature_columns`.
    - sequence_length: An int `Tensor` of shape `[batch_size]`. The sequence
        length for each example.

  Raises:
    ValueError: If any of the `feature_columns` is the wrong type.
  """
  feature_columns = fc._normalize_feature_columns(feature_columns)
  for c in feature_columns:
    if not isinstance(c, fc._SequenceDenseColumn):
      raise ValueError(
          'All feature_columns must be of type _SequenceDenseColumn. '
          'You can wrap a sequence_categorical_column with an embedding_column '
          'or indicator_column. '
          'Given (type {}): {}'.format(type(c), c))

  with variable_scope.variable_scope(
      None, default_name='sequence_input_layer', values=features.values()):
    builder = fc._LazyBuilder(features)
    output_tensors = []
    sequence_lengths = []
    ordered_columns = []

    for column in sorted(feature_columns, key=lambda x: x.name):
      ordered_columns.append(column)
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name):
        dense_tensor, sequence_length = column._get_sequence_dense_tensor(
            builder,
            weight_collections=weight_collections,
            trainable=trainable)
        # Flattens the final dimension to produce a 3D Tensor.
        num_elements = column._variable_shape.num_elements()
        shape = array_ops.shape(dense_tensor)
        target_shape = [shape[0], shape[1], num_elements]
        output_tensors.append(
            array_ops.reshape(dense_tensor, shape=target_shape))
        sequence_lengths.append(sequence_length)

    fc._verify_static_batch_size_equality(output_tensors, ordered_columns)
    fc._verify_static_batch_size_equality(sequence_lengths, ordered_columns)
    sequence_length = _assert_all_equal_and_return(sequence_lengths)

    return array_ops.concat(output_tensors, -1), sequence_length
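The `_assert_all_equal_and_return` helper used above is not shown in this snippet. A minimal sketch consistent with its call site (an assumption, reusing the `ops`, `check_ops`, and `array_ops` modules already imported by the surrounding code) would be:

def _assert_all_equal_and_return(tensors, name=None):
  """Asserts all tensors are element-wise equal and returns the first one."""
  with ops.name_scope(name, 'assert_all_equal', values=tensors):
    if len(tensors) == 1:
      return tensors[0]
    # Each assertion compares one tensor against the first; the returned
    # identity op only runs once all assertions have passed.
    assert_equal_ops = [check_ops.assert_equal(tensors[0], t)
                        for t in tensors[1:]]
    with ops.control_dependencies(assert_equal_ops):
      return array_ops.identity(tensors[0])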
  def test_get_dense_tensor(self):
    # Inputs.
    vocabulary_size = 3
    # -1 values are ignored.
    input_a = np.array([
        [2, -1, -1],   # example 0, ids [2]
        [0, 1, -1],    # example 1, ids [0, 1]
    ])
    input_b = np.array([
        [0, -1, -1],   # example 0, ids [0]
        [-1, -1, -1],  # example 1, ids []
    ])
    input_features = {'aaa': input_a, 'bbb': input_b}

    # Embedding variable.
    embedding_dimension = 2
    embedding_values = (
        (1., 2.),  # id 0
        (3., 5.),  # id 1
        (7., 11.)  # id 2
    )

    def _initializer(shape, dtype, partition_info):
      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
      self.assertEqual(dtypes.float32, dtype)
      self.assertIsNone(partition_info)
      return embedding_values

    # Expected lookup result, using combiner='mean'.
    expected_lookups_a = (
        # example 0:
        (7., 11.),  # ids [2], embedding = [7, 11]
        # example 1:
        (2., 3.5),  # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
    )
    expected_lookups_b = (
        # example 0:
        (1., 2.),  # ids [0], embedding = [1, 2]
        # example 1:
        (0., 0.),  # ids [], embedding = [0, 0]
    )

    # Build columns.
    categorical_column_a = fc_lib.categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    categorical_column_b = fc_lib.categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)
    embedding_column_a, embedding_column_b = tpu_fc.shared_embedding_columns(
        [categorical_column_a, categorical_column_b],
        dimension=embedding_dimension,
        initializer=_initializer)

    # Provide sparse input and get dense result.
    embedding_lookup_a = embedding_column_a._get_dense_tensor(
        fc._LazyBuilder(input_features))
    embedding_lookup_b = embedding_column_b._get_dense_tensor(
        fc._LazyBuilder(input_features))

    # Assert expected embedding variable and lookups.
    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    self.assertItemsEqual(('embedding_weights:0',),
                          tuple([v.name for v in global_vars]))
    embedding_var = global_vars[0]
    with _initialized_session():
      self.assertAllEqual(embedding_values, embedding_var.eval())
      self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
      self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
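As a sanity check of the expected values above, the 'mean' combiner arithmetic can be reproduced directly with NumPy (a standalone sketch, not part of the test):

import numpy as np

embedding_values = np.array([[1., 2.], [3., 5.], [7., 11.]])
# example 1 of input_a has ids [0, 1]; the 'mean' combiner averages their rows:
assert np.allclose(embedding_values[[0, 1]].mean(axis=0), [2., 3.5])
# example 0 of input_a has the single id 2, so the lookup is just that row:
assert np.allclose(embedding_values[2], [7., 11.])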
def sequence_input_layer(
    features,
    feature_columns,
    weight_collections=None,
    trainable=True,
    scope=None):
  """"Builds input layer for sequence input.

  All `feature_columns` must be sequence dense columns with the same
  `sequence_length`. The output of this method can be fed into sequence
  networks, such as RNN.

  The output of this method is a 3D `Tensor` of shape `[batch_size, T, D]`.
  `T` is the maximum sequence length for this batch, which could differ from
  batch to batch.

  If multiple `feature_columns` are given, each with `Di` `num_elements`, their
  outputs are concatenated along the last dimension, so the final `Tensor` has
  shape `[batch_size, T, D0 + D1 + ... + Dn]`.

  Example:

  ```python
  rating = sequence_numeric_column('rating')
  watches = sequence_categorical_column_with_identity(
      'watches', num_buckets=1000)
  watches_embedding = embedding_column(watches, dimension=10)
  columns = [rating, watches_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    features: A dict mapping keys to tensors.
    feature_columns: An iterable of dense sequence columns. Valid columns are
      - `embedding_column` that wraps a `sequence_categorical_column_with_*`
      - `sequence_numeric_column`.
    weight_collections: A list of collection names to which the Variable will
      be added. Note that variables will also be added to the collections
      `GraphKeys.GLOBAL_VARIABLES` and `GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES`.
    scope: An optional name or variable scope for the layer's variables.

  Returns:
    An `(input_layer, sequence_length)` tuple where:
    - input_layer: A float `Tensor` of shape `[batch_size, T, D]`.
        `T` is the maximum sequence length for this batch, which could differ
        from batch to batch. `D` is the sum of `num_elements` for all
        `feature_columns`.
    - sequence_length: An int `Tensor` of shape `[batch_size]`. The sequence
        length for each example.

  Raises:
    ValueError: If any of the `feature_columns` is the wrong type.
  """
  feature_columns = fc._clean_feature_columns(feature_columns)
  for c in feature_columns:
    if not isinstance(c, _SequenceDenseColumn):
      raise ValueError(
          'All feature_columns must be of type _SequenceDenseColumn. '
          'Given (type {}): {}'.format(type(c), c))

  with variable_scope.variable_scope(
      scope, default_name='sequence_input_layer', values=features.values()):
    builder = fc._LazyBuilder(features)
    output_tensors = []
    sequence_lengths = []
    ordered_columns = []
    for column in sorted(feature_columns, key=lambda x: x.name):
      ordered_columns.append(column)
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name):
        dense_tensor, sequence_length = column._get_sequence_dense_tensor(
            builder,
            weight_collections=weight_collections,
            trainable=trainable)
        # Flattens the final dimension to produce a 3D Tensor.
        num_elements = column._variable_shape.num_elements()
        shape = array_ops.shape(dense_tensor)
        output_tensors.append(
            array_ops.reshape(
                dense_tensor,
                shape=array_ops.concat([shape[:2], [num_elements]], axis=0)))
        sequence_lengths.append(sequence_length)
    fc._verify_static_batch_size_equality(output_tensors, ordered_columns)
    # TODO(b/73160931): Verify sequence_length equality.
    return array_ops.concat(output_tensors, -1), sequence_lengths[0]
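The TODO above notes that, unlike the first variant, this version returns `sequence_lengths[0]` without checking that every column produced identical lengths. A hedged sketch of that check, reusing the `_assert_all_equal_and_return` pattern sketched after the first variant, would replace the final return:

    # Sketch only (assumes the helper sketched earlier): verify all columns
    # report the same per-example lengths before returning one of them.
    sequence_length = _assert_all_equal_and_return(sequence_lengths)
    return array_ops.concat(output_tensors, -1), sequence_length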
def _get_weights_and_check_match_logits(
    features, weight_column, logits, allow_per_logit_weights=False):
  """Fetches weights from features and checks that the shape matches logits.

  Consider logits of shape [D0, D1, ... DN, logits_dimension]. Weights shape
  can be either:
  * [D0, D1, ... DN, logits_dimension] if `allow_per_logit_weights=True`.
  * [D0, D1, ... DN, 1]
  * [D0, D1, ... DN]: In this case, weights is reshaped into
    [D0, D1, ... DN, 1] to work with weight broadcasting rules.

  Args:
    features: The features dict that contains weights.
    weight_column: The weight column. If not given, this method returns 1.
    logits: logits Tensor.
    allow_per_logit_weights: Boolean. Whether we allow weights along the logits
      dimension, namely shape `[D0, D1, ... DN, logits_dimension]`.
  Returns:
    Validated and reshaped weights Tensor.
  Raises:
    ValueError: If the weights `Tensor` cannot be cast into float.
  """
  if allow_per_logit_weights:
    err_msg = (
        'weights shape must be [D0, D1, ... DN], [D0, D1, ... DN, 1] or '
        '[D0, D1, ... DN, logits_dimension]')
  else:
    err_msg = (
        'weights shape must be [D0, D1, ... DN] or [D0, D1, ... DN, 1]')
  with ops.name_scope(
      None, 'weights',
      values=tuple(six.itervalues(features)) + (logits,)) as scope:
    # Fetch the weights.
    if weight_column is None:
      return 1.
    if isinstance(weight_column, six.string_types):
      weight_column = feature_column_lib.numeric_column(
          key=weight_column, shape=(1,))
    if not isinstance(weight_column, feature_column_lib._NumericColumn):  # pylint: disable=protected-access
      raise TypeError('Weight column must be either a string or _NumericColumn.'
                      ' Given type: {}.'.format(type(weight_column)))
    weights = weight_column._get_dense_tensor(  # pylint: disable=protected-access
        feature_column_lib._LazyBuilder(features))  # pylint: disable=protected-access
    if not (weights.dtype.is_floating or weights.dtype.is_integer):
      raise ValueError('Weight column should be castable to float. '
                       'Given dtype: {}'.format(weights.dtype))
    weights = math_ops.to_float(weights, name='weights')

    # Validate the weights shape.
    weights_shape = array_ops.shape(weights, name='weights_shape')
    logits_shape = array_ops.shape(logits, name='logits_shape')
    if (weights.shape.ndims is not None and logits.shape.ndims is not None and
        weights.shape.ndims == logits.shape.ndims - 1):
      assert_dimension = check_ops.assert_equal(
          logits_shape[:-1], weights_shape, message=err_msg,
          data=['logits_shape: ', logits_shape,
                'weights_shape: ', weights_shape])
      with ops.control_dependencies([assert_dimension]):
        return array_ops.expand_dims(weights, -1, name=scope)
    supported_weights_shape = array_ops.concat([logits_shape[:-1], [1]], axis=0)
    if allow_per_logit_weights:
      condition = math_ops.reduce_any(
          [math_ops.reduce_all(math_ops.equal(logits_shape, weights_shape)),
           math_ops.reduce_all(math_ops.equal(
               supported_weights_shape, weights_shape))])
      assert_dimension = control_flow_ops.Assert(
          condition=condition,
          data=[err_msg, 'logits_shape: ', logits_shape,
                'weights_shape: ', weights_shape])
    else:
      assert_dimension = check_ops.assert_equal(
          supported_weights_shape, weights_shape, message=err_msg,
          data=['logits_shape: ', logits_shape,
                'weights_shape: ', weights_shape])
    with ops.control_dependencies([assert_dimension]):
      return array_ops.identity(weights, name=scope)
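To make the accepted weight layouts concrete, a small NumPy sketch (the shapes are hypothetical) mirrors the broadcasting rules the assertions above enforce, for logits of shape `[D0, D1, logits_dimension] = [2, 3, 4]`:

import numpy as np

logits = np.zeros((2, 3, 4))        # [D0, D1, logits_dimension]
w_rank_n = np.ones((2, 3))          # [D0, D1]: reshaped to [D0, D1, 1]
w_per_example = np.ones((2, 3, 1))  # [D0, D1, 1]: broadcasts over logits dim
w_per_logit = np.ones((2, 3, 4))    # valid only if allow_per_logit_weights=True

# NumPy broadcasting reproduces the effect of the validated shapes:
assert (logits * w_rank_n[..., np.newaxis]).shape == (2, 3, 4)
assert (logits * w_per_example).shape == (2, 3, 4)
assert (logits * w_per_logit).shape == (2, 3, 4)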