    def test_indicator_column(self, sparse_input_args_a, sparse_input_args_b,
                              expected_input_layer, expected_sequence_length):
        sparse_input_a = sparse_tensor.SparseTensorValue(**sparse_input_args_a)
        sparse_input_b = sparse_tensor.SparseTensorValue(**sparse_input_args_b)

        vocabulary_size_a = 3
        vocabulary_size_b = 2

        categorical_column_a = sfc.sequence_categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size_a)
        indicator_column_a = fc.indicator_column(categorical_column_a)
        categorical_column_b = sfc.sequence_categorical_column_with_identity(
            key='bbb', num_buckets=vocabulary_size_b)
        indicator_column_b = fc.indicator_column(categorical_column_b)
        # Test that columns are reordered alphabetically.
        sequence_input_layer = ksfc.SequenceFeatures(
            [indicator_column_b, indicator_column_a])
        input_layer, sequence_length = sequence_input_layer({
            'aaa':
            sparse_input_a,
            'bbb':
            sparse_input_b
        })

        self.assertAllEqual(expected_input_layer, self.evaluate(input_layer))
        self.assertAllEqual(expected_sequence_length,
                            self.evaluate(sequence_length))
Example #2
def create_feature_columns():
    age = vocabulary_column('age_level', list(range(1, 7)))
    gender = vocabulary_column('gender', [-1, 1])

    all_cat_cross = crossed_column([age, gender], hash_bucket_size=100)

    categorical_column = [indicator_column(age), indicator_column(gender)]

    crossed_columns = [indicator_column(all_cat_cross)]

    numerical_column = []

    range_0_20 = list(range(20))

    embedding_columns = [
        embedding_column(vocabulary_column("order_cnt", range_0_20),
                         dimension=1),
        embedding_column(age, dimension=1),
        embedding_column(gender, dimension=1),
        embedding_column(all_cat_cross, dimension=10)
    ]

    wide_columns = categorical_column + crossed_columns
    deep_columns = numerical_column + embedding_columns
    return wide_columns, deep_columns
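`vocabulary_column` is not a TensorFlow API and is presumably a project-local helper. A minimal sketch, assuming it is a thin wrapper over `tf.feature_column.categorical_column_with_vocabulary_list`:

import tensorflow as tf

# Hypothetical helper assumed by create_feature_columns() above.
def vocabulary_column(key, vocabulary_list):
    # Maps raw feature values to integer ids via a vocabulary lookup.
    return tf.feature_column.categorical_column_with_vocabulary_list(
        key, vocabulary_list=vocabulary_list)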
Example #3
    def test_from_config(self, trainable, name):
        cols = [
            fc.numeric_column('a'),
            fc.embedding_column(fc.categorical_column_with_vocabulary_list(
                'b', vocabulary_list=['1', '2', '3']),
                                dimension=2),
            fc.indicator_column(
                fc.categorical_column_with_hash_bucket(key='c',
                                                       hash_bucket_size=3))
        ]
        orig_layer = df.DenseFeatures(cols, trainable=trainable, name=name)
        config = orig_layer.get_config()

        new_layer = df.DenseFeatures.from_config(config)

        self.assertEqual(new_layer.name, orig_layer.name)
        self.assertEqual(new_layer.trainable, trainable)
        self.assertLen(new_layer._feature_columns, 3)
        self.assertEqual(new_layer._feature_columns[0].name, 'a')
        self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0)
        self.assertEqual(new_layer._feature_columns[1].categorical_column.name,
                         'b')
        self.assertIsInstance(new_layer._feature_columns[0], cols[0].__class__)
        self.assertIsInstance(new_layer._feature_columns[1], cols[1].__class__)
        self.assertIsInstance(new_layer._feature_columns[2], cols[2].__class__)
Example #4
    def test_with_1d_unknown_shape_sparse_tensor(self):
        embedding_values = (
            (1., 2.),  # id 0
            (6., 7.),  # id 1
            (11., 12.)  # id 2
        )

        def _initializer(shape, dtype, partition_info=None):
            del shape, dtype, partition_info
            return embedding_values

        # price has 1 dimension in dense_features
        price = fc.numeric_column('price')

        # one_hot_body_style has 3 dims in dense_features.
        body_style = fc.categorical_column_with_vocabulary_list(
            'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
        one_hot_body_style = fc.indicator_column(body_style)

        # embedded_country has 2 dims in dense_features.
        country = fc.categorical_column_with_vocabulary_list(
            'country', vocabulary_list=['US', 'JP', 'CA'])
        embedded_country = fc.embedding_column(country,
                                               dimension=2,
                                               initializer=_initializer)

        # Placeholders of unknown shape; 1-D data is fed at run time.
        features = {
            'price': array_ops.placeholder(dtypes.float32),
            'body-style': array_ops.sparse_placeholder(dtypes.string),
            # A dense tensor is fed for this categorical column.
            'country': array_ops.placeholder(dtypes.string),
        }
        self.assertIsNone(features['price'].shape.ndims)
        self.assertIsNone(features['body-style'].get_shape().ndims)
        self.assertIsNone(features['country'].shape.ndims)

        price_data = np.array([11., 12.])
        body_style_data = sparse_tensor.SparseTensorValue(indices=((0, ),
                                                                   (1, )),
                                                          values=('sedan',
                                                                  'hardtop'),
                                                          dense_shape=(2, ))
        country_data = np.array([['US'], ['CA']])

        net = df.DenseFeatures([price, one_hot_body_style,
                                embedded_country])(features)
        self.assertEqual(1 + 3 + 2, net.shape[1])
        with _initialized_session() as sess:

            # Each row concatenates `one_hot_body_style`, `embedded_country`,
            # and `price`, in that order.
            self.assertAllEqual(
                [[0., 0., 1., 1., 2., 11.], [1., 0., 0., 11., 12., 12.]],
                sess.run(net,
                         feed_dict={
                             features['price']: price_data,
                             features['body-style']: body_style_data,
                             features['country']: country_data
                         }))
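Note that every input above has unknown rank when the layer is applied, yet `DenseFeatures` still fixes the output width at `1 + 3 + 2` statically from the columns alone; only the batch dimension becomes concrete when the 1-D data is fed.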
Example #5
    def test_linear_model_with_feature_column(self):
        with context.eager_mode():
            vocab_list = ['alpha', 'beta', 'gamma']
            vocab_val = [0.4, 0.6, 0.9]
            data = np.random.choice(vocab_list, size=256)
            y = np.zeros_like(data, dtype=np.float32)
            for vocab, val in zip(vocab_list, vocab_val):
                indices = np.where(data == vocab)
                y[indices] = val + np.random.uniform(
                    low=-0.01, high=0.01, size=indices[0].shape)
            cat_column = fc.categorical_column_with_vocabulary_list(
                key='symbol', vocabulary_list=vocab_list)
            ind_column = fc.indicator_column(cat_column)
            dense_feature_layer = dense_features_v2.DenseFeatures([ind_column])
            linear_model = linear.LinearModel(use_bias=False,
                                              kernel_initializer='zeros')
            combined = sequential.Sequential(
                [dense_feature_layer, linear_model])
            opt = gradient_descent.SGD(learning_rate=0.1)
            combined.compile(opt, 'mse', [])
            combined.fit(x={'symbol': data}, y=y, batch_size=32, epochs=10)
            self.assertAllClose(
                [[0.4], [0.6], [0.9]],
                combined.layers[1].dense_layers[0].kernel.numpy(),
                atol=0.01)
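The closing check relies on a simple property of this setup: with a bias-free, zero-initialized linear model over one-hot inputs trained under MSE, each kernel row converges toward the mean target of its vocabulary entry, here roughly `[[0.4], [0.6], [0.9]]`.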
Example #6
    def test_wide_deep_model_with_two_feature_columns(self):
        vocab_list = ['alpha', 'beta', 'gamma']
        vocab_val = [0.4, 0.6, 0.9]
        data = np.random.choice(vocab_list, size=256)
        y = np.zeros_like(data, dtype=np.float32)
        for vocab, val in zip(vocab_list, vocab_val):
            indices = np.where(data == vocab)
            y[indices] = val + np.random.uniform(
                low=-0.01, high=0.01, size=indices[0].shape)
        cat_column = fc.categorical_column_with_vocabulary_list(
            key='symbol', vocabulary_list=vocab_list)
        ind_column = fc.indicator_column(cat_column)
        emb_column = fc.embedding_column(cat_column, dimension=5)
        linear_feature_layer = dense_features_v2.DenseFeatures([ind_column])
        linear_model = linear.LinearModel(use_bias=False,
                                          kernel_initializer='zeros')
        combined_linear = sequential.Sequential(
            [linear_feature_layer, linear_model])
        dnn_model = sequential.Sequential([core.Dense(units=1)])
        dnn_feature_layer = dense_features_v2.DenseFeatures([emb_column])
        combined_dnn = sequential.Sequential([dnn_feature_layer, dnn_model])
        wide_deep_model = wide_deep.WideDeepModel(combined_linear,
                                                  combined_dnn)
        opt = gradient_descent.SGD(learning_rate=0.1)
        wide_deep_model.compile(
            opt, 'mse', [],
            run_eagerly=testing_utils.should_run_eagerly(),
            experimental_run_tf_function=testing_utils.should_run_tf_function())
        wide_deep_model.fit(x={'symbol': data}, y=y, batch_size=32, epochs=10)
        self.assertEqual(3, linear_model.inputs[0].shape[1])
        self.assertEqual(5, dnn_model.inputs[0].shape[1])
Example #7
    def test_train_premade_widedeep_model_with_feature_layers(self):
        vocab_list = ['alpha', 'beta', 'gamma']
        vocab_val = [0.4, 0.6, 0.9]
        data = np.random.choice(vocab_list, size=256)
        y = np.zeros_like(data, dtype=np.float32)
        for vocab, val in zip(vocab_list, vocab_val):
            indices = np.where(data == vocab)
            y[indices] = val + np.random.uniform(
                low=-0.01, high=0.01, size=indices[0].shape)
        cat_column = feature_column.categorical_column_with_vocabulary_list(
            key='symbol', vocabulary_list=vocab_list)
        ind_column = feature_column.indicator_column(cat_column)
        # TODO(tanzheny): use emb column for dense part once b/139667019 is fixed.
        # emb_column = feature_column.embedding_column(cat_column, dimension=5)
        keras_input = keras.layers.Input(name='symbol',
                                         shape=3,
                                         dtype=dtypes.string)

        # build linear part with feature layer.
        linear_feature_layer = dense_features.DenseFeatures([ind_column])
        linear_model = linear.LinearModel(units=1,
                                          name='Linear',
                                          kernel_initializer='zeros')
        combined_linear = keras.Sequential(
            [linear_feature_layer, linear_model])

        # build dnn part with feature layer.
        dnn_feature_layer = dense_features.DenseFeatures([ind_column])
        dense_layer = keras.layers.Dense(units=1,
                                         name='DNNDense',
                                         kernel_initializer='zeros')
        combined_dnn = keras.Sequential([dnn_feature_layer, dense_layer])

        # build and compile wide deep.
        wide_deep_model = wide_deep.WideDeepModel(combined_linear,
                                                  combined_dnn)
        wide_deep_model._set_inputs({'symbol': keras_input})
        sgd_opt = gradient_descent.SGD(0.1)
        adam_opt = adam.Adam(0.1)
        wide_deep_model.compile([sgd_opt, adam_opt], 'mse', ['mse'])

        # build estimator.
        train_input_fn = numpy_io.numpy_input_fn(x={'symbol': data},
                                                 y=y,
                                                 num_epochs=20,
                                                 shuffle=False)
        eval_input_fn = numpy_io.numpy_input_fn(x={'symbol': data},
                                                y=y,
                                                num_epochs=20,
                                                shuffle=False)
        est = keras_lib.model_to_estimator(keras_model=wide_deep_model,
                                           config=self._config,
                                           checkpoint_format='saver')

        before_eval_results = est.evaluate(input_fn=eval_input_fn, steps=1)
        est.train(input_fn=train_input_fn, steps=20)
        after_eval_results = est.evaluate(input_fn=eval_input_fn, steps=1)
        self.assertLess(after_eval_results['loss'],
                        before_eval_results['loss'])
        self.assertLess(after_eval_results['loss'], 0.1)
Example #8
    def test_train_with_dense_features(self):
        feature_dict = {
            'sex': np.int64([1, 1, 1, 1, 0]),
            'cp': np.int64([0, 3, 3, 2, 1]),
            'slope': np.int64([3, 2, 0, 3, 1]),
        }
        label = np.int64([0, 1, 0, 0, 0])
        train_input_fn = numpy_io.numpy_input_fn(x=feature_dict,
                                                 y=label,
                                                 num_epochs=1,
                                                 shuffle=False)
        feature_columns = list()
        input_features = dict()
        for feature_name, data_array in feature_dict.items():
            feature_columns.append(
                feature_column.indicator_column(
                    feature_column.categorical_column_with_identity(
                        key=feature_name,
                        num_buckets=np.size(np.unique(data_array)))))
            input_features[feature_name] = keras.layers.Input(
                name=feature_name,
                shape=(np.size(np.unique(data_array)), ),
                dtype=dtypes.int64)

        x = feature_column.DenseFeatures(feature_columns)(input_features)
        x = keras.layers.Dense(16, activation='relu')(x)
        logits = keras.layers.Dense(1, activation='linear')(x)
        model = keras.Model(inputs=input_features, outputs=logits)

        model.compile(optimizer='rmsprop',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        estimator_model = keras_lib.model_to_estimator(keras_model=model)
        estimator_model.train(input_fn=train_input_fn, steps=5)
Example #9
    def test_serialization(self):
        """Tests that column can be serialized."""
        parent = sfc.sequence_categorical_column_with_identity('animal',
                                                               num_buckets=4)
        animal = fc.indicator_column(parent)

        config = animal.get_config()
        self.assertEqual(
            {
                'categorical_column': {
                    'class_name': 'SequenceCategoricalColumn',
                    'config': {
                        'categorical_column': {
                            'class_name': 'IdentityCategoricalColumn',
                            'config': {
                                'default_value': None,
                                'key': 'animal',
                                'number_buckets': 4
                            }
                        }
                    }
                }
            }, config)

        new_animal = fc.IndicatorColumn.from_config(config)
        self.assertEqual(animal, new_animal)
        self.assertIsNot(parent, new_animal.categorical_column)

        new_animal = fc.IndicatorColumn.from_config(
            config,
            columns_by_name={
                serialization._column_name_with_class_name(parent): parent
            })
        self.assertEqual(animal, new_animal)
        self.assertIs(parent, new_animal.categorical_column)
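The two `from_config` calls exercise both deserialization paths: without `columns_by_name` the nested categorical column is rebuilt from its config (hence `assertIsNot`), while supplying `columns_by_name` lets deserialization reuse the existing `parent` object (hence `assertIs`).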
Example #10
    def test_saving_with_dense_features(self):
        cols = [
            feature_column_v2.numeric_column('a'),
            feature_column_v2.indicator_column(
                feature_column_v2.categorical_column_with_vocabulary_list(
                    'b', ['one', 'two']))
        ]
        input_layers = {
            'a': keras.layers.Input(shape=(1, ), name='a'),
            'b': keras.layers.Input(shape=(1, ), name='b', dtype='string')
        }

        fc_layer = feature_column_v2.DenseFeatures(cols)(input_layers)
        output = keras.layers.Dense(10)(fc_layer)

        model = keras.models.Model(input_layers, output)

        model.compile(loss=keras.losses.MSE,
                      optimizer=keras.optimizers.RMSprop(lr=0.0001),
                      metrics=[keras.metrics.categorical_accuracy])

        config = model.to_json()
        loaded_model = model_config.model_from_json(config)

        inputs_a = np.arange(10).reshape(10, 1)
        inputs_b = np.arange(10).reshape(10, 1).astype('str')

        # Initialize tables for V1 lookup.
        if not context.executing_eagerly():
            self.evaluate(lookup_ops.tables_initializer())

        self.assertLen(loaded_model.predict({
            'a': inputs_a,
            'b': inputs_b
        }), 10)
Example #11
    def test_saving_with_sequence_features(self):
        cols = [
            sfc.sequence_numeric_column('a'),
            fc.indicator_column(
                sfc.sequence_categorical_column_with_vocabulary_list(
                    'b', ['one', 'two']))
        ]
        input_layers = {
            'a':
            keras.layers.Input(shape=(None, 1), sparse=True, name='a'),
            'b':
            keras.layers.Input(shape=(None, 1),
                               sparse=True,
                               name='b',
                               dtype='string')
        }

        fc_layer, _ = ksfc.SequenceFeatures(cols)(input_layers)
        # TODO(tibell): Figure out the right dtype and apply masking.
        # sequence_length_mask = array_ops.sequence_mask(sequence_length)
        # x = keras.layers.GRU(32)(fc_layer, mask=sequence_length_mask)
        x = keras.layers.GRU(32)(fc_layer)
        output = keras.layers.Dense(10)(x)

        model = keras.models.Model(input_layers, output)

        model.compile(loss=keras.losses.MSE,
                      optimizer='rmsprop',
                      metrics=[keras.metrics.categorical_accuracy])

        config = model.to_json()
        loaded_model = model_config.model_from_json(config)

        batch_size = 10
        timesteps = 1

        values_a = np.arange(10, dtype=np.float32)
        indices_a = np.zeros((10, 3), dtype=np.int64)
        indices_a[:, 0] = np.arange(10)
        inputs_a = sparse_tensor.SparseTensor(indices_a, values_a,
                                              (batch_size, timesteps, 1))

        values_b = np.zeros(10, dtype=np.str_)
        indices_b = np.zeros((10, 3), dtype=np.int64)
        indices_b[:, 0] = np.arange(10)
        inputs_b = sparse_tensor.SparseTensor(indices_b, values_b,
                                              (batch_size, timesteps, 1))

        with self.cached_session():
            # Initialize tables for V1 lookup.
            if not context.executing_eagerly():
                self.evaluate(lookup_ops.tables_initializer())

            self.assertLen(
                loaded_model.predict({
                    'a': inputs_a,
                    'b': inputs_b
                }, steps=1), batch_size)
Example #12
    def test_with_1d_sparse_tensor(self):
        embedding_values = (
            (1., 2., 3., 4., 5.),  # id 0
            (6., 7., 8., 9., 10.),  # id 1
            (11., 12., 13., 14., 15.)  # id 2
        )

        def _initializer(shape, dtype, partition_info=None):
            del shape, dtype, partition_info
            return embedding_values

        # price has 1 dimension in dense_features
        price = fc.numeric_column('price')

        # one_hot_body_style has 3 dims in dense_features.
        body_style = fc.categorical_column_with_vocabulary_list(
            'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
        one_hot_body_style = fc.indicator_column(body_style)

        # embedded_country has 5 dims in dense_features.
        country = fc.categorical_column_with_vocabulary_list(
            'country', vocabulary_list=['US', 'JP', 'CA'])
        embedded_country = fc.embedding_column(country,
                                               dimension=5,
                                               initializer=_initializer)

        with ops.Graph().as_default():
            # Provide 1-D tensors: dense for 'price' and 'country', sparse
            # for 'body-style'.
            features = {
                'price':
                constant_op.constant([
                    11.,
                    12.,
                ]),
                'body-style':
                sparse_tensor.SparseTensor(indices=((0, ), (1, )),
                                           values=('sedan', 'hardtop'),
                                           dense_shape=(2, )),
                # A dense tensor is fed for this categorical column.
                'country':
                constant_op.constant(['CA', 'US']),
            }
            self.assertEqual(1, features['price'].shape.ndims)
            self.assertEqual(1,
                             features['body-style'].dense_shape.get_shape()[0])
            self.assertEqual(1, features['country'].shape.ndims)

            net = df.DenseFeatures(
                [price, one_hot_body_style, embedded_country])(features)
            self.assertEqual(1 + 3 + 5, net.shape[1])
            with _initialized_session() as sess:

                # Each row concatenates `one_hot_body_style`,
                # `embedded_country`, and `price`, in that order.
                self.assertAllEqual(
                    [[0., 0., 1., 11., 12., 13., 14., 15., 11.],
                     [1., 0., 0., 1., 2., 3., 4., 5., 12.]], sess.run(net))
Example #13
    def test_get_sequence_dense_tensor(self, inputs_args, expected):
        inputs = sparse_tensor.SparseTensorValue(**inputs_args)
        vocabulary_size = 3

        categorical_column = sfc.sequence_categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        indicator_column = fc.indicator_column(categorical_column)

        indicator_tensor, _ = _get_sequence_dense_tensor(
            indicator_column, {'aaa': inputs})

        self.assertAllEqual(expected, self.evaluate(indicator_tensor))
Example #14
    def test_static_shape_from_tensors_indicator(self, sparse_input_args,
                                                 expected_shape):
        """Tests that we return a known static shape when we have one."""
        sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
        categorical_column = sfc.sequence_categorical_column_with_identity(
            key='aaa', num_buckets=3)
        indicator_column = fc.indicator_column(categorical_column)

        sequence_input_layer = ksfc.SequenceFeatures([indicator_column])
        input_layer, _ = sequence_input_layer({'aaa': sparse_input})
        shape = input_layer.get_shape()
        self.assertEqual(shape, expected_shape)
Example #15
    def test_sequence_length(self, inputs_args, expected_sequence_length):
        inputs = sparse_tensor.SparseTensorValue(**inputs_args)
        vocabulary_size = 3

        categorical_column = sfc.sequence_categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        indicator_column = fc.indicator_column(categorical_column)

        _, sequence_length = _get_sequence_dense_tensor(
            indicator_column, {'aaa': inputs})

        sequence_length = self.evaluate(sequence_length)
        self.assertAllEqual(expected_sequence_length, sequence_length)
        self.assertEqual(np.int64, sequence_length.dtype)
Example #16
    def test_crossed_column(self):
        a = fc.categorical_column_with_vocabulary_list(
            'a', vocabulary_list=['1', '2', '3'])
        b = fc.categorical_column_with_vocabulary_list(
            'b', vocabulary_list=['1', '2', '3'])
        ab = fc.crossed_column([a, b], hash_bucket_size=2)
        cols = [fc.indicator_column(ab)]

        orig_layer = df.DenseFeatures(cols)
        config = orig_layer.get_config()

        new_layer = df.DenseFeatures.from_config(config)

        self.assertLen(new_layer._feature_columns, 1)
        self.assertEqual(new_layer._feature_columns[0].name, 'a_X_b_indicator')
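The name assertion reflects how feature columns derive names from their inputs: crossing `a` and `b` yields `a_X_b`, and wrapping the cross in `indicator_column` appends the `_indicator` suffix.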
Example #17
    def _cate_indicator_column(self, params: dict):
        """Builds the one-hot (indicator) output for a categorical feature.

        Input: a category value.
        Output: the one-hot encoding of that category.
        :param params: feature configuration dict.
        :return: the DenseFeatures output tensor.
        """
        key, inputs = self._get_input_layer(params)

        feature = _get_categorical_column(params)
        feature_column = fc.indicator_column(feature)

        outputs = DenseFeatures(
            feature_column, name=params.get('name', None))({key: inputs})

        return outputs
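`_get_input_layer` and `_get_categorical_column` are project-local helpers not shown on this page. A minimal sketch, assuming `params` carries the feature key and its vocabulary (shown as free functions; in the source they may live on the enclosing class):

# Hypothetical stand-ins for the helpers used above.
def _get_categorical_column(params: dict):
    # Build a categorical column from the configured key and vocabulary.
    return fc.categorical_column_with_vocabulary_list(
        key=params['key'], vocabulary_list=params['vocab_list'])

def _get_input_layer(params: dict):
    # One scalar string input per feature, keyed by the feature name.
    key = params['key']
    return key, tf.keras.layers.Input(shape=(1,), name=key, dtype='string')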
Example #18
    def test_dense_features(self):
        animal = fc.indicator_column(
            fc.categorical_column_with_identity('animal', num_buckets=4))
        with ops.Graph().as_default():
            features = {
                'animal':
                sparse_tensor.SparseTensor(indices=[[0, 0], [0, 1]],
                                           values=[1, 2],
                                           dense_shape=[1, 2])
            }
            net = df.DenseFeatures([animal])(features)

            self.evaluate(variables_lib.global_variables_initializer())
            self.evaluate(lookup_ops.tables_initializer())

            self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net))
Example #19
def embedding_varlen(batch_size, max_length):
    """Benchmark a variable-length embedding."""
    # Data and constants.
    vocab_size = 32768
    vocab = fc_bm.create_vocabulary(vocab_size)
    data = fc_bm.create_string_data(max_length,
                                    batch_size * NUM_REPEATS,
                                    vocab,
                                    pct_oov=0.15)

    # Keras implementation
    model = keras.Sequential()
    model.add(keras.Input(shape=(max_length, ), name="data", dtype=dt.string))
    model.add(string_lookup.StringLookup(vocabulary=vocab, mask_token=None))
    model.add(
        category_encoding.CategoryEncoding(num_tokens=vocab_size + 1,
                                           output_mode="count"))

    # FC implementation
    fc = fcv2.indicator_column(
        fcv2.categorical_column_with_vocabulary_list(key="data",
                                                     vocabulary_list=vocab,
                                                     num_oov_buckets=1))

    # Wrap the FC implementation in a tf.function for a fair comparison
    @tf_function()
    def fc_fn(tensors):
        fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None)

    # Benchmark runs
    keras_data = {
        "data": data.to_tensor(default_value="",
                               shape=(batch_size, max_length))
    }
    k_avg_time = fc_bm.run_keras(keras_data, model, batch_size, NUM_REPEATS)

    fc_data = {
        "data": data.to_tensor(default_value="",
                               shape=(batch_size, max_length))
    }
    fc_avg_time = fc_bm.run_fc(fc_data, fc_fn, batch_size, NUM_REPEATS)

    return k_avg_time, fc_avg_time
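A hedged usage sketch (the sizes are illustrative; NUM_REPEATS and the fc_bm helpers come from the surrounding benchmark module):

# Hypothetical driver for the benchmark above.
k_time, fc_time = embedding_varlen(batch_size=32, max_length=64)
print("keras avg: %.4fs, feature_column avg: %.4fs" % (k_time, fc_time))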
Example #20
    def test_indicator_column(self):
        """Tests that error is raised for sequence indicator column."""
        vocabulary_size = 3
        sparse_input = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            indices=((0, 0), (1, 0), (1, 1)),
            values=(2, 0, 1),
            dense_shape=(2, 2))

        categorical_column_a = sfc.sequence_categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        indicator_column_a = fc.indicator_column(categorical_column_a)

        input_layer = dense_features.DenseFeatures([indicator_column_a])
        with self.assertRaisesRegex(
                ValueError,
                r'In indicator_column: aaa_indicator\. categorical_column must not be '
                r'of type SequenceCategoricalColumn\.'):
            _ = input_layer({'aaa': sparse_input})
Example #21
    def test_indicator_column_with_non_sequence_categorical(self):
        """Tests that error is raised for non-sequence categorical column."""
        vocabulary_size = 3
        sparse_input = sparse_tensor.SparseTensorValue(
            # example 0, ids [2]
            # example 1, ids [0, 1]
            indices=((0, 0), (1, 0), (1, 1)),
            values=(2, 0, 1),
            dense_shape=(2, 2))

        categorical_column_a = fc.categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        indicator_column_a = fc.indicator_column(categorical_column_a)

        sequence_input_layer = ksfc.SequenceFeatures([indicator_column_a])
        with self.assertRaisesRegex(
                ValueError,
                r'In indicator_column: aaa_indicator\. categorical_column must be of '
                r'type SequenceCategoricalColumn to use SequenceFeatures\.'):
            _, _ = sequence_input_layer({'aaa': sparse_input})
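Taken together, these two tests pin down the contract from both sides: `DenseFeatures` rejects sequence categorical columns, while `SequenceFeatures` accepts only sequence categorical columns.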
Example #22
    def testFeatureColumns(self):
        # TODO(b/120099662): Error with table initialization with Keras models in
        # graph mode.
        if context.executing_eagerly():
            numeric = fc.numeric_column('a')
            bucketized = fc.bucketized_column(numeric, boundaries=[5, 10, 15])
            cat_vocab = fc.categorical_column_with_vocabulary_list(
                'b', ['1', '2', '3'])
            one_hot = fc.indicator_column(cat_vocab)
            embedding = fc.embedding_column(cat_vocab, dimension=8)
            feature_layer = DenseFeatures([bucketized, one_hot, embedding])
            model = keras.models.Sequential(feature_layer)

            features = {'a': np.array([13, 15]), 'b': np.array(['1', '2'])}
            predictions = model.predict(features)

            saved_model_dir = self._save_model_dir()
            model.save(saved_model_dir, save_format='tf')
            loaded = keras_load.load(saved_model_dir)
            loaded_predictions = loaded.predict(features)
            self.assertAllClose(predictions, loaded_predictions)
Example #23
    def test_train_premade_linear_model_with_dense_features(self):
        vocab_list = ['alpha', 'beta', 'gamma']
        vocab_val = [0.4, 0.6, 0.9]
        data = np.random.choice(vocab_list, size=256)
        y = np.zeros_like(data, dtype=np.float32)
        for vocab, val in zip(vocab_list, vocab_val):
            indices = np.where(data == vocab)
            y[indices] = val + np.random.uniform(
                low=-0.01, high=0.01, size=indices[0].shape)
        cat_column = feature_column.categorical_column_with_vocabulary_list(
            key='symbol', vocabulary_list=vocab_list)
        ind_column = feature_column.indicator_column(cat_column)
        keras_input = keras.layers.Input(name='symbol',
                                         shape=3,
                                         dtype=dtypes.string)
        feature_layer = dense_features.DenseFeatures([ind_column])
        h = feature_layer({'symbol': keras_input})
        linear_model = linear.LinearModel(units=1)
        h = linear_model(h)

        model = keras.Model(inputs=keras_input, outputs=h)
        opt = gradient_descent.SGD(0.1)
        model.compile(opt, 'mse', ['mse'])
        train_input_fn = numpy_io.numpy_input_fn(x={'symbol': data},
                                                 y=y,
                                                 num_epochs=20,
                                                 shuffle=False)
        eval_input_fn = numpy_io.numpy_input_fn(x={'symbol': data},
                                                y=y,
                                                num_epochs=20,
                                                shuffle=False)
        est = keras_lib.model_to_estimator(keras_model=model,
                                           config=self._config,
                                           checkpoint_format='saver')
        before_eval_results = est.evaluate(input_fn=eval_input_fn, steps=1)
        est.train(input_fn=train_input_fn, steps=30)
        after_eval_results = est.evaluate(input_fn=eval_input_fn, steps=1)
        self.assertLess(after_eval_results['loss'],
                        before_eval_results['loss'])
        self.assertLess(after_eval_results['loss'], 0.05)
Example #24
    def test_wide_deep_model_with_single_feature_column(self):
        vocab_list = ['alpha', 'beta', 'gamma']
        vocab_val = [0.4, 0.6, 0.9]
        data = np.random.choice(vocab_list, size=256)
        y = np.zeros_like(data, dtype=np.float32)
        for vocab, val in zip(vocab_list, vocab_val):
            indices = np.where(data == vocab)
            y[indices] = val + np.random.uniform(
                low=-0.01, high=0.01, size=indices[0].shape)
        cat_column = feature_column_v2.categorical_column_with_vocabulary_list(
            key='symbol', vocabulary_list=vocab_list)
        ind_column = feature_column_v2.indicator_column(cat_column)
        dense_feature_layer = dense_features_v2.DenseFeatures([ind_column])
        linear_model = linear.LinearModel(use_bias=False,
                                          kernel_initializer='zeros')
        dnn_model = keras.Sequential([keras.layers.Dense(units=1)])
        wide_deep_model = wide_deep.WideDeepModel(linear_model, dnn_model)
        combined = keras.Sequential([dense_feature_layer, wide_deep_model])
        opt = gradient_descent.SGD(learning_rate=0.1)
        combined.compile(opt,
                         'mse', [],
                         run_eagerly=testing_utils.should_run_eagerly())
        combined.fit(x={'symbol': data}, y=y, batch_size=32, epochs=10)
Example #25
    def test_sequence_length_with_empty_rows(self):
        """Tests _sequence_length when some examples do not have ids."""
        vocabulary_size = 3
        sparse_input = sparse_tensor.SparseTensorValue(
            # example 0, ids []
            # example 1, ids [2]
            # example 2, ids [0, 1]
            # example 3, ids []
            # example 4, ids [1]
            # example 5, ids []
            indices=((1, 0), (2, 0), (2, 1), (4, 0)),
            values=(2, 0, 1, 1),
            dense_shape=(6, 2))
        expected_sequence_length = [0, 1, 2, 0, 1, 0]

        categorical_column = sfc.sequence_categorical_column_with_identity(
            key='aaa', num_buckets=vocabulary_size)
        indicator_column = fc.indicator_column(categorical_column)

        _, sequence_length = _get_sequence_dense_tensor(
            indicator_column, {'aaa': sparse_input})

        self.assertAllEqual(expected_sequence_length,
                            self.evaluate(sequence_length))
Example #26
def make_feature_config(num_players):
    return FeatureConfig(
        context_features=[
            fc.numeric_column(
                "public_context__starting_stack_sizes",
                shape=num_players,
                dtype=tf.int64,
            ),
            fc.embedding_column(
                tf.feature_column.categorical_column_with_vocabulary_list(
                    "private_context__hand_encoded", range(1326)),
                dimension=4,
            ),
        ],
        sequence_features=[
            fc.indicator_column(
                sfc.sequence_categorical_column_with_identity(
                    "last_action__action_encoded", 22)),
            fc.indicator_column(
                sfc.sequence_categorical_column_with_identity(
                    "last_action__move", 5)),
            sfc.sequence_numeric_column(
                "last_action__amount_added",
                dtype=tf.int64,
                default_value=-1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "last_action__amount_added_percent_of_remaining",
                dtype=tf.float32,
                default_value=-1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "last_action__amount_raised",
                dtype=tf.int64,
                default_value=-1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "last_action__amount_raised_percent_of_pot",
                dtype=tf.float32,
                default_value=-1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__all_in_player_mask",
                dtype=tf.int64,
                default_value=-1,
                shape=num_players,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__stack_sizes",
                dtype=tf.int64,
                default_value=-1,
                shape=num_players,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__amount_to_call",
                dtype=tf.int64,
                default_value=-1,
                shape=num_players,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__current_player_mask",
                dtype=tf.int64,
                default_value=-1,
                shape=num_players,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__min_raise_amount",
                dtype=tf.int64,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__pot_size",
                dtype=tf.int64,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "public_state__street",
                dtype=tf.int64,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__is_current_player",
                dtype=tf.int64,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__current_player_offset",
                dtype=tf.int64,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            fc.indicator_column(
                sfc.sequence_categorical_column_with_identity(
                    "player_state__current_hand_type", 9)),
            sfc.sequence_numeric_column(
                "player_state__win_odds",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__win_odds_vs_better",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__win_odds_vs_tied",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__win_odds_vs_worse",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__frac_better_hands",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__frac_tied_hands",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
            sfc.sequence_numeric_column(
                "player_state__frac_worse_hands",
                dtype=tf.float32,
                default_value=-1,
                shape=1,
                normalizer_fn=make_float,
            ),
        ],
        context_targets=[
            fc.numeric_column("public_context__num_players",
                              shape=1,
                              dtype=tf.int64),
        ],
        sequence_targets=[
            sfc.sequence_numeric_column("next_action__action_encoded",
                                        dtype=tf.int64,
                                        default_value=-1),
            sfc.sequence_numeric_column("reward__cumulative_reward",
                                        dtype=tf.int64,
                                        default_value=-1),
            sfc.sequence_numeric_column("public_state__pot_size",
                                        dtype=tf.int64,
                                        default_value=-1),
            sfc.sequence_numeric_column("player_state__is_current_player",
                                        dtype=tf.int64,
                                        default_value=-1),
            sfc.sequence_numeric_column("public_state__num_players_remaining",
                                        dtype=tf.int64,
                                        default_value=-1),
        ],
    )
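`make_float` is used as the `normalizer_fn` throughout make_feature_config() but defined elsewhere. A minimal sketch, assuming it simply casts integer features to float:

import tensorflow as tf

# Hypothetical normalizer assumed by make_feature_config() above: cast
# int64 sequence features to float32 so downstream layers can consume them.
def make_float(x):
    return tf.cast(x, tf.float32)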
Example #27
    def make_columns():
        """
        Builds the feature_columns required by the estimator to link the Dataset and the model_fn
        :return:
        """
        columns_dict = {}

        columns_dict['gci'] = fc.indicator_column(
            fc.sequence_categorical_column_with_vocabulary_file(
                'gci', vocab_file, default_value="0"))
        columns_dict['ta'] = seq_fc.sequence_numeric_column(
            'ta', normalizer_fn=lambda x: normalize(x, 'ta', stats_dict))
        columns_dict['rsrp'] = seq_fc.sequence_numeric_column(
            'rsrp', normalizer_fn=lambda x: normalize(x, 'rsrp', stats_dict))
        columns_dict['gci0'] = fc.indicator_column(
            fc.sequence_categorical_column_with_vocabulary_file(
                'gci0', vocab_file, default_value="0"))
        columns_dict['rsrp0'] = seq_fc.sequence_numeric_column(
            'rsrp0', normalizer_fn=lambda x: normalize(x, 'rsrp0', stats_dict))
        columns_dict['gci1'] = fc.indicator_column(
            fc.sequence_categorical_column_with_vocabulary_file(
                'gci1', vocab_file, default_value="0"))
        columns_dict['rsrp1'] = seq_fc.sequence_numeric_column(
            'rsrp1', normalizer_fn=lambda x: normalize(x, 'rsrp1', stats_dict))
        columns_dict['gci2'] = fc.indicator_column(
            fc.sequence_categorical_column_with_vocabulary_file(
                'gci2', vocab_file, default_value="0"))
        columns_dict['rsrp2'] = seq_fc.sequence_numeric_column(
            'rsrp2', normalizer_fn=lambda x: normalize(x, 'rsrp2', stats_dict))
        columns_dict['dt'] = seq_fc.sequence_numeric_column(
            'dt', normalizer_fn=lambda x: normalize(x, 'dt', stats_dict))
        return columns_dict
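`normalize`, `stats_dict`, and `vocab_file` are closed over from the enclosing scope and not shown on this page. A minimal z-score sketch of the assumed `normalize`:

import tensorflow as tf

# Hypothetical normalizer: standardize a feature with precomputed stats.
def normalize(x, name, stats_dict):
    mean = stats_dict[name]['mean']
    std = stats_dict[name]['std']
    return (tf.cast(x, tf.float32) - mean) / std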