Example #1
    def test_compute_output_shape(self):
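        """Tests compute_output_shape and the padded dense output of two sequence numeric columns."""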
        price1 = tf.feature_column.sequence_numeric_column('price1', shape=2)
        price2 = tf.feature_column.sequence_numeric_column('price2')
        features = {
            'price1':
            tf.SparseTensor(
                indices=[[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1],
                         [1, 0, 0], [1, 0, 1], [2, 0, 0], [2, 0, 1],
                         [3, 0, 0], [3, 0, 1]],
                values=[0., 1., 10., 11., 100., 101., 200., 201., 300., 301.],
                dense_shape=(4, 3, 2)),
            'price2':
            tf.SparseTensor(indices=[[0, 0], [0, 1], [1, 0], [2, 0], [3, 0]],
                            values=[10., 11., 20., 30., 40.],
                            dense_shape=(4, 3))
        }
        sequence_features = ksfc.SequenceFeatures([price1, price2])
        seq_input, seq_len = sequence_features(features)
        self.assertEqual(sequence_features.compute_output_shape((None, None)),
                         (None, None, 3))
        self.evaluate(tf.compat.v1.global_variables_initializer())
        self.evaluate(tf.compat.v1.tables_initializer())

        self.assertAllClose([[[0., 1., 10.], [10., 11., 11.], [0., 0., 0.]],
                             [[100., 101., 20.], [0., 0., 0.], [0., 0., 0.]],
                             [[200., 201., 30.], [0., 0., 0.], [0., 0., 0.]],
                             [[300., 301., 40.], [0., 0., 0.], [0., 0., 0.]]],
                            self.evaluate(seq_input))
        self.assertAllClose([2, 1, 1, 1], self.evaluate(seq_len))
Example #2
    def test_shared_embedding_column_with_non_sequence_categorical(self):
        """Tests that error is raised for non-sequence shared embedding column."""
        with tf.Graph().as_default():
            vocabulary_size = 3
            sparse_input_a = tf.compat.v1.SparseTensorValue(
                # example 0, ids [2]
                # example 1, ids [0, 1]
                indices=((0, 0), (1, 0), (1, 1)),
                values=(2, 0, 1),
                dense_shape=(2, 2))
            sparse_input_b = tf.compat.v1.SparseTensorValue(
                # example 0, ids [2]
                # example 1, ids [0, 1]
                indices=((0, 0), (1, 0), (1, 1)),
                values=(2, 0, 1),
                dense_shape=(2, 2))

            categorical_column_a = tf.feature_column.categorical_column_with_identity(
                key='aaa', num_buckets=vocabulary_size)
            categorical_column_b = tf.feature_column.categorical_column_with_identity(
                key='bbb', num_buckets=vocabulary_size)
            shared_embedding_columns = tf.feature_column.shared_embeddings(
                [categorical_column_a, categorical_column_b], dimension=2)

            sequence_input_layer = ksfc.SequenceFeatures(
                shared_embedding_columns)
            with self.assertRaisesRegex(
                    ValueError, r'In embedding_column: aaa_shared_embedding\. '
                    r'categorical_column must '
                    r'be of type SequenceCategoricalColumn to use SequenceFeatures\.'
            ):
                _, _ = sequence_input_layer({
                    'aaa': sparse_input_a,
                    'bbb': sparse_input_b
                })
Example #3
    def test_indicator_column(self, sparse_input_args_a, sparse_input_args_b,
                              expected_input_layer, expected_sequence_length):
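        """Tests SequenceFeatures with indicator columns over parameterized sparse inputs."""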
        sparse_input_a = tf.compat.v1.SparseTensorValue(**sparse_input_args_a)
        sparse_input_b = tf.compat.v1.SparseTensorValue(**sparse_input_args_b)

        vocabulary_size_a = 3
        vocabulary_size_b = 2

        categorical_column_a = tf.feature_column.sequence_categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size_a)
        indicator_column_a = tf.feature_column.indicator_column(
            categorical_column_a)
        categorical_column_b = tf.feature_column.sequence_categorical_column_with_identity(
            key='bbb', num_buckets=vocabulary_size_b)
        indicator_column_b = tf.feature_column.indicator_column(
            categorical_column_b)
        # Test that columns are reordered alphabetically.
        sequence_input_layer = ksfc.SequenceFeatures(
            [indicator_column_b, indicator_column_a])
        input_layer, sequence_length = sequence_input_layer({
            'aaa': sparse_input_a,
            'bbb': sparse_input_b
        })

        self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
        self.assertAllEqual(expected_sequence_length,
                            self.evaluate(sequence_length))
Example #4
    def test_shared_sequence_non_sequence_into_input_layer(self):
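        """Tests shared embeddings consumed by both SequenceFeatures and DenseFeatures."""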
        non_seq = tf.feature_column.categorical_column_with_identity(
            'non_seq', num_buckets=10)
        seq = tf.feature_column.sequence_categorical_column_with_identity(
            'seq', num_buckets=10)
        shared_non_seq, shared_seq = tf.feature_column.shared_embeddings(
            [non_seq, seq],
            dimension=4,
            combiner='sum',
            initializer=tf.ones_initializer(),
            shared_embedding_collection_name='shared')

        seq = tf.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]],
                              values=[0, 1, 2],
                              dense_shape=[2, 2])
        non_seq = tf.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]],
                                  values=[0, 1, 2],
                                  dense_shape=[2, 2])
        features = {'seq': seq, 'non_seq': non_seq}

        # Tile the context features across the sequence features
        seq_input, seq_length = ksfc.SequenceFeatures([shared_seq])(features)
        non_seq_input = dense_features.DenseFeatures([shared_non_seq])(features)

        with self.cached_session() as sess:
            sess.run(tf.compat.v1.global_variables_initializer())
            output_seq, output_seq_length, output_non_seq = sess.run(
                [seq_input, seq_length, non_seq_input])
            self.assertAllEqual(
                output_seq,
                [[[1, 1, 1, 1], [1, 1, 1, 1]], [[1, 1, 1, 1], [0, 0, 0, 0]]])
            self.assertAllEqual(output_seq_length, [2, 1])
            self.assertAllEqual(output_non_seq, [[2, 2, 2, 2], [1, 1, 1, 1]])
Example #5
    def test_sequence_length_not_equal(self):
        """Tests that an error is raised when sequence lengths are not equal."""
        # Input a with sequence_length = [2, 1]
        sparse_input_a = tf.compat.v1.SparseTensorValue(
            indices=((0, 0), (0, 1), (1, 0)),
            values=(0.0, 1.0, 10.0),
            dense_shape=(2, 2),
        )
        # Input b with sequence_length = [1, 1]
        sparse_input_b = tf.compat.v1.SparseTensorValue(
            indices=((0, 0), (1, 0)),
            values=(1.0, 10.0),
            dense_shape=(2, 2))
        numeric_column_a = tf.feature_column.sequence_numeric_column("aaa")
        numeric_column_b = tf.feature_column.sequence_numeric_column("bbb")

        sequence_input_layer = ksfc.SequenceFeatures(
            [numeric_column_a, numeric_column_b])

        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
                                    r"Condition x == y did not hold.*"):
            _, sequence_length = sequence_input_layer({
                "aaa": sparse_input_a,
                "bbb": sparse_input_b
            })
            self.evaluate(sequence_length)
Example #6
    def test_serialization_sequence_features(self):
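        """Tests that a SequenceFeatures layer survives a serialize/deserialize round trip."""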
        rating = tf.feature_column.sequence_numeric_column("rating")
        sequence_feature = ksfc.SequenceFeatures([rating])
        config = keras.layers.serialize(sequence_feature)

        revived = keras.layers.deserialize(config)
        self.assertIsInstance(revived, ksfc.SequenceFeatures)
Example #7
    def test_saving_with_sequence_features(self):
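        """Tests that a model containing SequenceFeatures round-trips through to_json and still predicts."""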
        cols = [
            tf.feature_column.sequence_numeric_column('a'),
            tf.feature_column.indicator_column(
                tf.feature_column.sequence_categorical_column_with_vocabulary_list(
                    'b', ['one', 'two']))
        ]
        input_layers = {
            'a': keras.layers.Input(shape=(None, 1), sparse=True, name='a'),
            'b': keras.layers.Input(shape=(None, 1), sparse=True, name='b',
                                    dtype='string')
        }

        fc_layer, _ = ksfc.SequenceFeatures(cols)(input_layers)
        # TODO(tibell): Figure out the right dtype and apply masking.
        # sequence_length_mask = array_ops.sequence_mask(sequence_length)
        # x = keras.layers.GRU(32)(fc_layer, mask=sequence_length_mask)
        x = keras.layers.GRU(32)(fc_layer)
        output = keras.layers.Dense(10)(x)

        model = keras.models.Model(input_layers, output)

        model.compile(loss=keras.losses.MSE,
                      optimizer='rmsprop',
                      metrics=[keras.metrics.categorical_accuracy])

        config = model.to_json()
        loaded_model = model_config.model_from_json(config)

        batch_size = 10
        timesteps = 1

        values_a = np.arange(10, dtype=np.float32)
        indices_a = np.zeros((10, 3), dtype=np.int64)
        indices_a[:, 0] = np.arange(10)
        inputs_a = tf.SparseTensor(indices_a, values_a,
                                   (batch_size, timesteps, 1))

        values_b = np.zeros(10, dtype=str)
        indices_b = np.zeros((10, 3), dtype=np.int64)
        indices_b[:, 0] = np.arange(10)
        inputs_b = tf.SparseTensor(indices_b, values_b,
                                   (batch_size, timesteps, 1))

        with self.cached_session():
            # Initialize tables for V1 lookup.
            if not tf.executing_eagerly():
                self.evaluate(tf.compat.v1.tables_initializer())

            self.assertLen(
                loaded_model.predict({
                    'a': inputs_a,
                    'b': inputs_b
                }, steps=1), batch_size)
Example #8
  def test_embedding_column(
      self, sparse_input_args_a, sparse_input_args_b, expected_input_layer,
      expected_sequence_length):
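    """Tests SequenceFeatures with embedding columns over parameterized sparse inputs."""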

    sparse_input_a = tf.compat.v1.SparseTensorValue(**sparse_input_args_a)
    sparse_input_b = tf.compat.v1.SparseTensorValue(**sparse_input_args_b)
    vocabulary_size = 3
    embedding_dimension_a = 2
    embedding_values_a = (
        (1., 2.),  # id 0
        (3., 4.),  # id 1
        (5., 6.)  # id 2
    )
    embedding_dimension_b = 3
    embedding_values_b = (
        (11., 12., 13.),  # id 0
        (14., 15., 16.),  # id 1
        (17., 18., 19.)  # id 2
    )
    def _get_initializer(embedding_dimension, embedding_values):

      def _initializer(shape, dtype, partition_info=None):
        self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
        self.assertEqual(tf.float32, dtype)
        self.assertIsNone(partition_info)
        return embedding_values
      return _initializer

    categorical_column_a = tf.feature_column.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=vocabulary_size)
    embedding_column_a = tf.feature_column.embedding_column(
        categorical_column_a,
        dimension=embedding_dimension_a,
        initializer=_get_initializer(embedding_dimension_a, embedding_values_a))
    categorical_column_b = tf.feature_column.sequence_categorical_column_with_identity(
        key='bbb', num_buckets=vocabulary_size)
    embedding_column_b = tf.feature_column.embedding_column(
        categorical_column_b,
        dimension=embedding_dimension_b,
        initializer=_get_initializer(embedding_dimension_b, embedding_values_b))

    # Test that columns are reordered alphabetically.
    sequence_input_layer = ksfc.SequenceFeatures(
        [embedding_column_b, embedding_column_a])
    input_layer, sequence_length = sequence_input_layer({
        'aaa': sparse_input_a,
        'bbb': sparse_input_b,
    })

    self.evaluate(tf.compat.v1.global_variables_initializer())
    weights = sequence_input_layer.weights
    self.assertCountEqual(
        ('sequence_features/aaa_embedding/embedding_weights:0',
         'sequence_features/bbb_embedding/embedding_weights:0'),
        tuple([v.name for v in weights]))
    self.assertAllEqual(embedding_values_a, self.evaluate(weights[0]))
    self.assertAllEqual(embedding_values_b, self.evaluate(weights[1]))
    self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
    self.assertAllEqual(
        expected_sequence_length, self.evaluate(sequence_length))
Example #9
  def test_static_shape_from_tensors_numeric(
      self, sparse_input_args, expected_shape):
    """Tests that we return a known static shape when we have one."""
    sparse_input = tf.compat.v1.SparseTensorValue(**sparse_input_args)
    numeric_column = tf.feature_column.sequence_numeric_column('aaa', shape=(2, 2))

    sequence_input_layer = ksfc.SequenceFeatures([numeric_column])
    input_layer, _ = sequence_input_layer({'aaa': sparse_input})
    shape = input_layer.get_shape()
    self.assertEqual(shape, expected_shape)
Example #10
  def test_from_config(self, trainable, name):
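    """Tests that SequenceFeatures can be rebuilt from its config via from_config."""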
    cols = [tf.feature_column.sequence_numeric_column('a')]
    orig_layer = ksfc.SequenceFeatures(cols, trainable=trainable, name=name)
    config = orig_layer.get_config()

    new_layer = ksfc.SequenceFeatures.from_config(config)

    self.assertEqual(new_layer.name, orig_layer.name)
    self.assertEqual(new_layer.trainable, trainable)
    self.assertLen(new_layer._feature_columns, 1)
    self.assertEqual(new_layer._feature_columns[0].name, 'a')
Example #11
  def test_get_config(self, trainable, name):
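    """Tests the contents of the config returned by get_config."""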
    cols = [tf.feature_column.sequence_numeric_column('a')]
    orig_layer = ksfc.SequenceFeatures(cols, trainable=trainable, name=name)
    config = orig_layer.get_config()

    self.assertEqual(config['name'], orig_layer.name)
    self.assertEqual(config['trainable'], trainable)
    self.assertLen(config['feature_columns'], 1)
    self.assertEqual(config['feature_columns'][0]['class_name'],
                     'SequenceNumericColumn')
    self.assertEqual(config['feature_columns'][0]['config']['shape'], (1,))
Example #12
  def test_static_shape_from_tensors_indicator(
      self, sparse_input_args, expected_shape):
    """Tests that we return a known static shape when we have one."""
    sparse_input = tf.compat.v1.SparseTensorValue(**sparse_input_args)
    categorical_column = tf.feature_column.sequence_categorical_column_with_identity(
        key='aaa', num_buckets=3)
    indicator_column = tf.feature_column.indicator_column(categorical_column)

    sequence_input_layer = ksfc.SequenceFeatures([indicator_column])
    input_layer, _ = sequence_input_layer({'aaa': sparse_input})
    shape = input_layer.get_shape()
    self.assertEqual(shape, expected_shape)
Example #13
  def test_numeric_column(
      self, sparse_input_args, expected_input_layer, expected_sequence_length):
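    """Tests SequenceFeatures with a one-dimensional sequence numeric column."""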
    sparse_input = tf.compat.v1.SparseTensorValue(**sparse_input_args)

    numeric_column = tf.feature_column.sequence_numeric_column('aaa')

    sequence_input_layer = ksfc.SequenceFeatures([numeric_column])
    input_layer, sequence_length = sequence_input_layer({'aaa': sparse_input})

    self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
    self.assertAllEqual(
        expected_sequence_length, self.evaluate(sequence_length))
Example #14
  def test_numeric_column_multi_dim(
      self, sparse_input_args, expected_input_layer, expected_sequence_length):
    """Tests SequenceFeatures for multi-dimensional numeric_column."""
    sparse_input = tf.compat.v1.SparseTensorValue(**sparse_input_args)

    numeric_column = tf.feature_column.sequence_numeric_column('aaa', shape=(2, 2))

    sequence_input_layer = ksfc.SequenceFeatures([numeric_column])
    input_layer, sequence_length = sequence_input_layer({'aaa': sparse_input})

    self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
    self.assertAllEqual(
        expected_sequence_length, self.evaluate(sequence_length))
Example #15
    def test_get_config(self, trainable, name):
        cols = [tf.feature_column.sequence_numeric_column("a")]
        orig_layer = ksfc.SequenceFeatures(cols,
                                           trainable=trainable,
                                           name=name)
        config = orig_layer.get_config()

        self.assertEqual(config["name"], orig_layer.name)
        self.assertEqual(config["trainable"], trainable)
        self.assertLen(config["feature_columns"], 1)
        self.assertEqual(config["feature_columns"][0]["class_name"],
                         "SequenceNumericColumn")
        self.assertEqual(config["feature_columns"][0]["config"]["shape"],
                         (1, ))
Example #16
    def test_embedding_column_with_non_sequence_categorical(self):
        """Tests that error is raised for non-sequence embedding column."""
        vocabulary_size = 3
        sparse_input = tf.compat.v1.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            indices=((0, 0), (1, 0), (1, 1)),
            values=(2, 0, 1),
            dense_shape=(2, 2))

        categorical_column_a = tf.feature_column.categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        embedding_column_a = tf.feature_column.embedding_column(
            categorical_column_a, dimension=2)
        sequence_input_layer = ksfc.SequenceFeatures([embedding_column_a])
        with self.assertRaisesRegex(
                ValueError,
                r'In embedding_column: aaa_embedding\. categorical_column must be of '
                r'type SequenceCategoricalColumn to use SequenceFeatures\.'):
            _, _ = sequence_input_layer({'aaa': sparse_input})
Example #17
    def test_sequence_example_into_input_layer(self):
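        """Tests parsing SequenceExamples and feeding them through SequenceFeatures into an RNN."""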
        examples = [_make_sequence_example().SerializeToString()] * 100
        ctx_cols, seq_cols = self._build_feature_columns()

        def _parse_example(example):
            ctx, seq = tf.io.parse_single_sequence_example(
                example,
                context_features=tf.feature_column.make_parse_example_spec(
                    ctx_cols),
                sequence_features=tf.feature_column.make_parse_example_spec(
                    seq_cols),
            )
            ctx.update(seq)
            return ctx

        ds = tf.data.Dataset.from_tensor_slices(examples)
        ds = ds.map(_parse_example)
        ds = ds.batch(20)

        # Test on a single batch
        features = tf.compat.v1.data.make_one_shot_iterator(ds).get_next()

        # Tile the context features across the sequence features
        sequence_input_layer = ksfc.SequenceFeatures(seq_cols)
        seq_input, _ = sequence_input_layer(features)
        dense_input_layer = dense_features.DenseFeatures(ctx_cols)
        ctx_input = dense_input_layer(features)
        ctx_input = backend.repeat(ctx_input, tf.shape(seq_input)[1])
        concatenated_input = merging.concatenate([seq_input, ctx_input])

        rnn_layer = base_rnn.RNN(simple_rnn.SimpleRNNCell(10))
        output = rnn_layer(concatenated_input)

        with self.cached_session() as sess:
            sess.run(tf.compat.v1.global_variables_initializer())
            features_r = sess.run(features)
            self.assertAllEqual(features_r["int_list"].dense_shape, [20, 3, 6])

            output_r = sess.run(output)
            self.assertAllEqual(output_r.shape, [20, 10])
Example #18
    def test_indicator_column_with_non_sequence_categorical(self):
        """Tests that error is raised for non-sequence categorical column."""
        vocabulary_size = 3
        sparse_input = tf.compat.v1.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            indices=((0, 0), (1, 0), (1, 1)),
            values=(2, 0, 1),
            dense_shape=(2, 2),
        )

        categorical_column_a = (
            tf.feature_column.categorical_column_with_identity(
                key="aaa", num_buckets=vocabulary_size))
        indicator_column_a = tf.feature_column.indicator_column(
            categorical_column_a)

        sequence_input_layer = ksfc.SequenceFeatures([indicator_column_a])
        with self.assertRaisesRegex(
                ValueError,
                r"In indicator_column: aaa_indicator\. categorical_column must be of "
                r"type SequenceCategoricalColumn to use SequenceFeatures\.",
        ):
            _, _ = sequence_input_layer({"aaa": sparse_input})
Example #19
    def test_feature_layer_cpu(self, use_safe_embedding_lookup):
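        """Tests TPU embedding_column_v2 lookups through DenseFeatures and SequenceFeatures on CPU."""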
        # Inputs.
        vocabulary_size = 3
        sparse_input = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            # example 2, ids []
            # example 3, ids [1]
            indices=((0, 0), (1, 0), (1, 1), (3, 0)),
            values=(2, 0, 1, 1),
            dense_shape=(4, 2))

        # Embedding variable.
        embedding_dimension = 2
        embedding_values = (
            (1., 2.),  # id 0
            (3., 5.),  # id 1
            (7., 11.)  # id 2
        )

        def _initializer(shape, dtype, partition_info=None):
            self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
            self.assertEqual(dtypes.float32, dtype)
            self.assertIsNone(partition_info)
            return embedding_values

        # Expected lookup result, using combiner='mean'.
        expected_lookups = (
            # example 0, ids [2], embedding = [7, 11]
            (7., 11.),
            # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
            (2., 3.5),
            # example 2, ids [], embedding = [0, 0]
            (0., 0.),
            # example 3, ids [1], embedding = [3, 5]
            (3., 5.),
        )
        expected_lookups_sequence = (
            # example 0, ids [2], embedding = [[7, 11], [0, 0]]
            (
                (7., 11.),
                (0., 0.),
            ),
            # example 1, ids [0, 1], embedding = [[1, 2], [3, 5]]
            (
                (1., 2.),
                (3., 5.),
            ),
            # example 2, ids [], embedding = [[0, 0], [0, 0]]
            (
                (0., 0.),
                (0., 0.),
            ),
            # example 3, ids [1], embedding = [[3, 5], [0, 0]]
            (
                (3., 5.),
                (0., 0.),
            ),
        )

        # Build columns.
        categorical_column = fc_lib.categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        sequence_categorical_column = (
            fc_lib.sequence_categorical_column_with_identity(
                key='bbb', num_buckets=vocabulary_size))
        embedding_column = tpu_fc.embedding_column_v2(
            categorical_column,
            dimension=embedding_dimension,
            initializer=_initializer,
            use_safe_embedding_lookup=use_safe_embedding_lookup)
        sequence_embedding_column = tpu_fc.embedding_column_v2(
            sequence_categorical_column,
            dimension=embedding_dimension,
            initializer=_initializer,
            max_sequence_length=2,
            use_safe_embedding_lookup=use_safe_embedding_lookup)

        # Provide sparse input and get dense result.
        features = {'aaa': sparse_input, 'bbb': sparse_input}
        dense_features = df_lib.DenseFeatures([embedding_column])
        sequence_features = sfc_lib.SequenceFeatures(
            [sequence_embedding_column])
        embedding_lookup = dense_features(features)
        sequence_embedding_lookup = sequence_features(features)

        # Assert expected embedding variable and lookups.
        global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
        self.assertItemsEqual((
            'dense_features/aaa_embedding/embedding_weights:0',
            'sequence_features/bbb_embedding/embedding_weights:0',
        ), tuple([v.name for v in global_vars]))
        with _initialized_session():
            self.assertAllEqual(embedding_values, global_vars[0])
            self.assertAllEqual(expected_lookups, embedding_lookup)
            self.assertAllEqual(expected_lookups_sequence,
                                sequence_embedding_lookup[0].eval())
            # The graph will still have SparseFillEmptyRows due to sequence being
            # a Rank3 embedding lookup.
            if use_safe_embedding_lookup:
                self.assertEqual(2, [
                    x.type for x in ops.get_default_graph().get_operations()
                ].count('SparseFillEmptyRows'))
            else:
                self.assertEqual(1, [
                    x.type for x in ops.get_default_graph().get_operations()
                ].count('SparseFillEmptyRows'))
Example #20
    def test_feature_layer_cpu(self, use_safe_embedding_lookup):
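        """Tests TPU shared_embedding_columns_v2 lookups through DenseFeatures and SequenceFeatures on CPU."""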
        # Inputs.
        vocabulary_size = 3
        input_a = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            indices=((0, 0), (1, 0), (1, 1)),
            values=(2, 0, 1),
            dense_shape=(2, 2))
        input_b = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            # example 2, ids []
            indices=((0, 0), (1, 0), (1, 1)),
            values=(2, 0, 1),
            dense_shape=(3, 2))
        input_features = {'aaa': input_a, 'bbb': input_b}

        # Embedding variable.
        embedding_dimension = 2
        embedding_values = (
            (1., 2.),  # id 0
            (3., 5.),  # id 1
            (7., 11.)  # id 2
        )

        def _initializer(shape, dtype, partition_info=None):
            self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
            self.assertEqual(dtypes.float32, dtype)
            self.assertIsNone(partition_info)
            return embedding_values

        # Expected lookup result, using combiner='mean'.
        expected_lookups_a = (
            # example 0: ids [2], embedding = [7, 11]
            (7., 11.),
            # example 1: ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
            (2., 3.5),
        )
        expected_lookups_b = (
            # example 0:
            (
                (7., 11.),
                (0., 0.),
            ),  # ids [2], embedding = [[7, 11], [0, 0]]
            # example 1:
            (
                (1., 2.),
                (3., 5.),
            ),  # ids [0, 1], embedding = [[1, 2], [3, 5]]
            # example 2:
            (
                (0., 0.),
                (0., 0.),
            ),  # ids [], embedding = [[0, 0], [0, 0]]
        )

        # Build columns.
        categorical_column_a = fc_lib.categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        categorical_column_b = fc_lib.sequence_categorical_column_with_identity(
            key='bbb', num_buckets=vocabulary_size)
        embedding_column_a, embedding_column_b = tpu_fc.shared_embedding_columns_v2(
            [categorical_column_a, categorical_column_b],
            dimension=embedding_dimension,
            initializer=_initializer,
            max_sequence_lengths=[0, 2],
            use_safe_embedding_lookup=use_safe_embedding_lookup)

        # Provide sparse input and get dense result.
        dense_features = df_lib.DenseFeatures([embedding_column_a])
        sequence_features = sfc_lib.SequenceFeatures([embedding_column_b])
        embedding_lookup_a = dense_features(input_features)
        embedding_lookup_b = sequence_features(input_features)

        # Assert expected embedding variable and lookups.
        global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
        self.assertItemsEqual(('aaa_bbb_shared_embedding:0', ),
                              tuple([v.name for v in global_vars]))
        embedding_var = global_vars[0]
        with _initialized_session():
            self.assertAllEqual(embedding_values, embedding_var)
            self.assertAllEqual(expected_lookups_a, embedding_lookup_a)
            self.assertAllEqual(expected_lookups_b,
                                embedding_lookup_b[0].eval())
            # The graph will still have SparseFillEmptyRows due to sequence being
            # a Rank3 embedding lookup.
            if use_safe_embedding_lookup:
                self.assertEqual(2, [
                    x.type for x in ops.get_default_graph().get_operations()
                ].count('SparseFillEmptyRows'))
            else:
                self.assertEqual(1, [
                    x.type for x in ops.get_default_graph().get_operations()
                ].count('SparseFillEmptyRows'))
Example #21
    def test_shared_embedding_column(self):
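        """Tests SequenceFeatures with shared embedding columns."""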
        with tf.Graph().as_default():
            vocabulary_size = 3
            sparse_input_a = tf.compat.v1.SparseTensorValue(
                # example 0, ids [2]
                # example 1, ids [0, 1]
                indices=((0, 0), (1, 0), (1, 1)),
                values=(2, 0, 1),
                dense_shape=(2, 2),
            )
            sparse_input_b = tf.compat.v1.SparseTensorValue(
                # example 0, ids [1]
                # example 1, ids [2, 0]
                indices=((0, 0), (1, 0), (1, 1)),
                values=(1, 2, 0),
                dense_shape=(2, 2),
            )

            embedding_dimension = 2
            embedding_values = (
                (1.0, 2.0),  # id 0
                (3.0, 4.0),  # id 1
                (5.0, 6.0),  # id 2
            )

            def _get_initializer(embedding_dimension, embedding_values):
                def _initializer(shape, dtype, partition_info=None):
                    self.assertAllEqual((vocabulary_size, embedding_dimension),
                                        shape)
                    self.assertEqual(tf.float32, dtype)
                    self.assertIsNone(partition_info)
                    return embedding_values

                return _initializer

            expected_input_layer = [
                # example 0, ids_a [2], ids_b [1]
                [[5.0, 6.0, 3.0, 4.0], [0.0, 0.0, 0.0, 0.0]],
                # example 1, ids_a [0, 1], ids_b [2, 0]
                [[1.0, 2.0, 5.0, 6.0], [3.0, 4.0, 1.0, 2.0]],
            ]
            expected_sequence_length = [1, 2]

            categorical_column_a = (
                tf.feature_column.sequence_categorical_column_with_identity(
                    key="aaa", num_buckets=vocabulary_size))
            categorical_column_b = (
                tf.feature_column.sequence_categorical_column_with_identity(
                    key="bbb", num_buckets=vocabulary_size))
            # Test that columns are reordered alphabetically.
            shared_embedding_columns = tf.feature_column.shared_embeddings(
                [categorical_column_b, categorical_column_a],
                dimension=embedding_dimension,
                initializer=_get_initializer(embedding_dimension,
                                             embedding_values),
            )

            sequence_input_layer = ksfc.SequenceFeatures(
                shared_embedding_columns)
            input_layer, sequence_length = sequence_input_layer({
                "aaa": sparse_input_a,
                "bbb": sparse_input_b
            })

            global_vars = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)
            self.assertCountEqual(
                ("aaa_bbb_shared_embedding:0", ),
                tuple([v.name for v in global_vars]),
            )
            with _initialized_session() as sess:
                self.assertAllEqual(embedding_values,
                                    global_vars[0].eval(session=sess))
                self.assertAllEqual(expected_input_layer,
                                    input_layer.eval(session=sess))
                self.assertAllEqual(expected_sequence_length,
                                    sequence_length.eval(session=sess))
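
For orientation, the following minimal sketch (not taken from any of the projects above; the 'price' column name and input values are illustrative) shows the call pattern these tests exercise: build sequence feature columns, call the layer on a dict of SparseTensors, and get back a padded dense tensor plus per-example sequence lengths. It assumes the public tf.keras.experimental.SequenceFeatures alias for the same layer the tests reach through their local ksfc import.

import tensorflow as tf

# One sequence-valued numeric feature; the default shape is (1,) per timestep.
price = tf.feature_column.sequence_numeric_column('price')

# Two examples: the first has two timesteps, the second has one.
features = {
    'price': tf.SparseTensor(indices=[[0, 0], [0, 1], [1, 0]],
                             values=[1., 2., 3.],
                             dense_shape=(2, 2))
}

sequence_layer = tf.keras.experimental.SequenceFeatures([price])
sequence_input, sequence_length = sequence_layer(features)
# sequence_input has shape (2, 2, 1): [[[1.], [2.]], [[3.], [0.]]]
# sequence_length is [2, 1]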