Example 1
 def test_tensor_dtype_should_be_string_or_integer(self):
     string_fc = fc.categorical_column_with_hash_bucket('a_string',
                                                        10,
                                                        dtype=dtypes.string)
     int_fc = fc.categorical_column_with_hash_bucket('a_int',
                                                     10,
                                                     dtype=dtypes.int32)
     float_fc = fc.categorical_column_with_hash_bucket('a_float',
                                                       10,
                                                       dtype=dtypes.string)
     int_tensor = sparse_tensor.SparseTensor(values=constant_op.constant(
         [101]),
                                             indices=[[0, 0]],
                                             dense_shape=[1, 1])
     string_tensor = sparse_tensor.SparseTensor(values=constant_op.constant(
         ['101']),
                                                indices=[[0, 0]],
                                                dense_shape=[1, 1])
     float_tensor = sparse_tensor.SparseTensor(values=constant_op.constant(
         [101.]),
                                               indices=[[0, 0]],
                                               dense_shape=[1, 1])
     builder = fc._LazyBuilder({
         'a_int': int_tensor,
         'a_string': string_tensor,
         'a_float': float_tensor
     })
     builder.get(string_fc)
     builder.get(int_fc)
     with self.assertRaisesRegexp(ValueError,
                                  'dtype must be string or integer'):
         builder.get(float_fc)
def sequence_categorical_column_with_hash_bucket(
    key, hash_bucket_size, dtype=dtypes.string):
  """A sequence of categorical terms where ids are set by hashing.

  Example:

  ```python
  tokens = sequence_categorical_column_with_hash_bucket(
      'tokens', hash_bucket_size=1000)
  tokens_embedding = embedding_column(tokens, dimension=10)
  columns = [tokens_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    key: A unique string identifying the input feature.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_SequenceCategoricalColumn`.
  """
  return _SequenceCategoricalColumn(
      fc.categorical_column_with_hash_bucket(
          key=key,
          hash_bucket_size=hash_bucket_size,
          dtype=dtype))
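As a quick sanity check on the docstring example above, the parsing spec produced for a hashed categorical column is a variable-length string feature. A hedged sketch using the public non-sequence column (assumed to behave the same way for parsing):

```python
# Hedged sketch: make_parse_example_spec for a hashed categorical column
# wrapped in an embedding column yields a variable-length string feature.
import tensorflow as tf

tokens = tf.feature_column.categorical_column_with_hash_bucket(
    'tokens', hash_bucket_size=1000)
spec = tf.feature_column.make_parse_example_spec(
    [tf.feature_column.embedding_column(tokens, dimension=10)])
print(spec)  # expected: {'tokens': VarLenFeature(dtype=tf.string)}
```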
Example 3
    def testPartitioner(self):
        x_dim = 64
        partitions = 4

        def _partitioner(shape, dtype):
            del dtype  # unused; required by Fn signature.
            # Only partition the embedding tensor.
            return [partitions, 1] if shape[0] == x_dim else [1]

        regressor = self._linear_regressor_fn(feature_columns=(
            feature_column_lib.categorical_column_with_hash_bucket(
                'language', hash_bucket_size=x_dim), ),
                                              partitioner=_partitioner,
                                              model_dir=self._model_dir)

        def _input_fn():
            return {
                'language':
                sparse_tensor.SparseTensor(values=['english', 'spanish'],
                                           indices=[[0, 0], [0, 1]],
                                           dense_shape=[1, 2])
            }, [[10.]]

        hook = CheckPartitionerVarHook(self, LANGUAGE_WEIGHT_NAME, x_dim,
                                       partitions)
        regressor.train(input_fn=_input_fn, steps=1, hooks=[hook])
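The partition spec returned by `_partitioner` can be sanity-checked directly. This is plain illustrative Python, not part of the test (the Estimator actually passes a `TensorShape`, but indexing behaves the same for this check):

```python
# Illustrative check of the partitioner above: the hashed column's weight
# (shape [x_dim, 1]) is split into 4 row shards; every other variable
# (e.g. the bias, shape [1]) stays unsharded.
x_dim, partitions = 64, 4

def _partitioner(shape, dtype):
    del dtype  # unused
    return [partitions, 1] if shape[0] == x_dim else [1]

assert _partitioner([x_dim, 1], None) == [4, 1]
assert _partitioner([1], None) == [1]
```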
Example 4
    def testDefaultPartitionerWithMultiplePsReplicas(self):
        partitions = 2
        # This results in weights larger than the default partition size of 64M,
        # so partitioned weights are created (each weight uses 4 bytes).
        x_dim = 32 << 20

        class FakeRunConfig(run_config.RunConfig):
            @property
            def num_ps_replicas(self):
                return partitions

        # Mock the device setter as ps is not available on test machines.
        with test.mock.patch.object(estimator,
                                    '_get_replica_device_setter',
                                    return_value=lambda _: '/cpu:0'):
            linear_regressor = self._linear_regressor_fn(
                feature_columns=(
                    feature_column_lib.categorical_column_with_hash_bucket(
                        'language', hash_bucket_size=x_dim), ),
                config=FakeRunConfig(),
                model_dir=self._model_dir)

            def _input_fn():
                return {
                    'language':
                    sparse_tensor.SparseTensor(values=['english', 'spanish'],
                                               indices=[[0, 0], [0, 1]],
                                               dense_shape=[1, 2])
                }, [[10.]]

            hook = CheckPartitionerVarHook(self, LANGUAGE_WEIGHT_NAME, x_dim,
                                           partitions)
            linear_regressor.train(input_fn=_input_fn, steps=1, hooks=[hook])
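The behaviour this test relies on is the Estimator's default partitioner. A hedged sketch of roughly what that default amounts to (the exact call lives inside `tf.estimator.Estimator`; shown here only for orientation):

```python
# Hedged sketch: with num_ps_replicas parameter servers, the default is
# assumed to be a min/max variable partitioner capped at num_ps_replicas
# partitions with a 64MB minimum slice size (matching the "64M" comment above).
import tensorflow as tf

num_ps_replicas = 2
default_partitioner = tf.min_max_variable_partitioner(
    max_partitions=num_ps_replicas,
    min_slice_size=64 << 20)

# A float32 weight of shape [32 << 20, 1] occupies ~128MB, so it is split
# into 2 slices; small variables such as the bias remain unpartitioned.
```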
Example 5
  def testWarmStart_SparseColumnHashed(self):
    # Create feature column.
    sc_hash = fc.categorical_column_with_hash_bucket(
        "sc_hash", hash_bucket_size=15)

    # Save checkpoint from which to warm-start.
    _, prev_hash_val = self._create_prev_run_var(
        "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())

    partitioner = lambda shape, dtype: [1] * len(shape)
    # New graph, new session WITHOUT warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model([sc_hash], partitioner)
        sess.run(variables.global_variables_initializer())
        # Without warmstarting, the weights should be initialized using default
        # initializer (which is init_ops.zeros_initializer).
        self._assert_cols_to_vars(cols_to_vars, {sc_hash: [np.zeros([15, 1])]},
                                  sess)

    # New graph, new session with warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model([sc_hash], partitioner)
        ws_util._warmstart(ws_util._WarmStartSettings(
            self.get_temp_dir(),
            vars_to_warmstart=".*sc_hash.*"))
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warmstarted.
        self._assert_cols_to_vars(cols_to_vars, {sc_hash: [prev_hash_val]},
                                  sess)
  def testWarmStart_SparseColumnHashed(self):
    # Create feature column.
    sc_hash = fc.categorical_column_with_hash_bucket(
        "sc_hash", hash_bucket_size=15)

    # Save checkpoint from which to warm-start.
    _, prev_hash_val = self._create_prev_run_var(
        "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())

    partitioner = lambda shape, dtype: [1] * len(shape)
    # New graph, new session WITHOUT warm-starting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model([sc_hash], partitioner)
        sess.run(variables.global_variables_initializer())
        # Without warm-starting, the weights should be initialized using default
        # initializer (which is init_ops.zeros_initializer).
        self._assert_cols_to_vars(cols_to_vars, {sc_hash: [np.zeros([15, 1])]},
                                  sess)

    # New graph, new session with warm-starting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model([sc_hash], partitioner)
        ws_util._warm_start(
            ws_util.WarmStartSettings(
                self.get_temp_dir(), vars_to_warm_start=".*sc_hash.*"))
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warm-started.
        self._assert_cols_to_vars(cols_to_vars, {sc_hash: [prev_hash_val]},
                                  sess)
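For comparison, roughly the same warm start expressed through the public estimator API. This is a hedged sketch: it assumes a TF 1.x release where `tf.estimator.WarmStartSettings` and the `warm_start_from` argument on canned estimators are available, and the checkpoint path is a placeholder:

```python
# Hedged public-API sketch of warm-starting the hashed column's weights.
import tensorflow as tf

sc_hash = tf.feature_column.categorical_column_with_hash_bucket(
    "sc_hash", hash_bucket_size=15)
ws = tf.estimator.WarmStartSettings(
    ckpt_to_initialize_from="/tmp/prev_model",  # placeholder checkpoint dir
    vars_to_warm_start=".*sc_hash.*")
estimator = tf.estimator.LinearRegressor(
    feature_columns=[sc_hash],
    warm_start_from=ws)
```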
Example 7
  def testWarmStartMoreSettingsNoPartitioning(self):
    # Create old and new vocabs for sparse column "sc_vocab".
    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                        "old_vocab")
    new_vocab_path = self._write_vocab(
        ["orange", "guava", "banana", "apple", "raspberry",
         "blueberry"], "new_vocab")
    # Create feature columns.
    sc_hash = fc.categorical_column_with_hash_bucket(
        "sc_hash", hash_bucket_size=15)
    sc_keys = fc.categorical_column_with_vocabulary_list(
        "sc_keys", vocabulary_list=["a", "b", "c", "e"])
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
    all_linear_cols = [sc_hash, sc_keys, sc_vocab]

    # Save checkpoint from which to warm-start.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        variable_scope.get_variable(
            "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
        sc_keys_weights = variable_scope.get_variable(
            "some_other_name", shape=[4, 1], initializer=rand())
        variable_scope.get_variable(
            "linear_model/sc_vocab/weights",
            initializer=[[0.5], [1.], [2.], [3.]])
        self._write_checkpoint(sess)
        prev_keys_val = sess.run(sc_keys_weights)

    # New graph, new session with warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols,
                                                 partitioner=None)
        vocab_info = ws_util._VocabInfo(
            new_vocab=sc_vocab.vocabulary_file,
            new_vocab_size=sc_vocab.vocabulary_size,
            num_oov_buckets=sc_vocab.num_oov_buckets,
            old_vocab=prev_vocab_path
        )
        ws_settings = ws_util._WarmStartSettings(
            self.get_temp_dir(),
            vars_to_warmstart=".*(sc_keys|sc_vocab).*",
            var_name_to_vocab_info={
                ws_util._infer_var_name(cols_to_vars[sc_vocab]): vocab_info
            },
            var_name_to_prev_var_name={
                ws_util._infer_var_name(cols_to_vars[sc_keys]):
                    "some_other_name"
            })
        ws_util._warmstart(ws_settings)
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warm-started.  Var corresponding to
        # sc_hash should not be warm-started.  Var corresponding to sc_vocab
        # should be correctly warm-started after vocab remapping.
        self._assert_cols_to_vars(cols_to_vars, {
            sc_keys: [prev_keys_val],
            sc_hash: [np.zeros([15, 1])],
            sc_vocab: [np.array([[3.], [2.], [1.], [0.5], [0.], [0.]])]
        }, sess)
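The vocabulary remapping above has a public-API counterpart. A hedged sketch with placeholder paths, assuming the internal `_VocabInfo`/`_WarmStartSettings` used by the test mirror `tf.estimator.VocabInfo`/`WarmStartSettings`:

```python
# Hedged sketch of the equivalent public warm-start settings.
import tensorflow as tf

vocab_info = tf.estimator.VocabInfo(
    new_vocab="/tmp/new_vocab.txt",   # placeholder paths
    new_vocab_size=6,
    num_oov_buckets=0,
    old_vocab="/tmp/old_vocab.txt")
ws = tf.estimator.WarmStartSettings(
    ckpt_to_initialize_from="/tmp/prev_model",
    vars_to_warm_start=".*(sc_keys|sc_vocab).*",
    var_name_to_vocab_info={"linear_model/sc_vocab/weights": vocab_info},
    var_name_to_prev_var_name={
        "linear_model/sc_keys/weights": "some_other_name"})
```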
Example 8
 def test_deep_copy(self):
     """Tests deepcopy of categorical_column_with_hash_bucket."""
     column = fc.categorical_column_with_hash_bucket('aaa', 10)
     column_copy = copy.deepcopy(column)
     self.assertEqual('aaa', column_copy.name)
     self.assertEqual(10, column_copy.hash_bucket_size)
     self.assertEqual(dtypes.string, column_copy.dtype)
Example 9
  def testDefaultPartitionerWithMultiplePsReplicas(self):
    partitions = 2
    # This results in weights larger than the default partition size of 64M,
    # so partitioned weights are created (each weight uses 4 bytes).
    x_dim = 32 << 20

    class FakeRunConfig(run_config.RunConfig):

      @property
      def num_ps_replicas(self):
        return partitions

    # Mock the device setter as ps is not available on test machines.
    with test.mock.patch.object(estimator,
                                '_get_replica_device_setter',
                                return_value=lambda _: '/cpu:0'):
      linear_regressor = linear.LinearRegressor(
          feature_columns=(
              feature_column_lib.categorical_column_with_hash_bucket(
                  'language', hash_bucket_size=x_dim),),
          config=FakeRunConfig(),
          model_dir=self._model_dir)

      def _input_fn():
        return {
            'language': sparse_tensor.SparseTensor(
                values=['english', 'spanish'],
                indices=[[0, 0], [0, 1]],
                dense_shape=[1, 2])
        }, [[10.]]

      hook = _CheckPartitionerVarHook(
          self, _LANGUAGE_WEIGHT_NAME, x_dim, partitions)
      linear_regressor.train(
          input_fn=_input_fn, steps=1, hooks=[hook])
Example 10
  def testPartitioner(self):
    x_dim = 64
    partitions = 4

    def _partitioner(shape, dtype):
      del dtype  # unused; required by Fn signature.
      # Only partition the embedding tensor.
      return [partitions, 1] if shape[0] == x_dim else [1]

    regressor = linear.LinearRegressor(
        feature_columns=(
            feature_column_lib.categorical_column_with_hash_bucket(
                'language', hash_bucket_size=x_dim),),
        partitioner=_partitioner,
        model_dir=self._model_dir)

    def _input_fn():
      return {
          'language': sparse_tensor.SparseTensor(
              values=['english', 'spanish'],
              indices=[[0, 0], [0, 1]],
              dense_shape=[1, 2])
      }, [[10.]]

    hook = _CheckPartitionerVarHook(
        self, _LANGUAGE_WEIGHT_NAME, x_dim, partitions)
    regressor.train(
        input_fn=_input_fn, steps=1, hooks=[hook])
Example 11
    def test_column_order(self):
        price_a = fc.numeric_column('price_a')
        price_b = fc.numeric_column('price_b')
        wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
        with ops.Graph().as_default() as g:
            features = {
                'price_a': [[1.]],
                'price_b': [[3.]],
                'wire_cast':
                sparse_tensor.SparseTensor(values=['omar'],
                                           indices=[[0, 0]],
                                           dense_shape=[1, 1])
            }
            fc.make_linear_model(features, [price_a, wire_cast, price_b],
                                 weight_collections=['my-vars'])
            my_vars = g.get_collection('my-vars')
            self.assertIn('price_a', my_vars[0].name)
            self.assertIn('price_b', my_vars[1].name)
            self.assertIn('wire_cast', my_vars[2].name)

        with ops.Graph().as_default() as g:
            features = {
                'price_a': [[1.]],
                'price_b': [[3.]],
                'wire_cast':
                sparse_tensor.SparseTensor(values=['omar'],
                                           indices=[[0, 0]],
                                           dense_shape=[1, 1])
            }
            fc.make_linear_model(features, [wire_cast, price_b, price_a],
                                 weight_collections=['my-vars'])
            my_vars = g.get_collection('my-vars')
            self.assertIn('price_a', my_vars[0].name)
            self.assertIn('price_b', my_vars[1].name)
            self.assertIn('wire_cast', my_vars[2].name)
  def testWarmStartInputLayerMoreSettings(self):
    # Create old and new vocabs for sparse column "sc_vocab".
    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                        "old_vocab")
    new_vocab_path = self._write_vocab(
        ["orange", "guava", "banana", "apple", "raspberry",
         "blueberry"], "new_vocab")
    # Create feature columns.
    sc_hash = fc.categorical_column_with_hash_bucket(
        "sc_hash", hash_bucket_size=15)
    sc_keys = fc.categorical_column_with_vocabulary_list(
        "sc_keys", vocabulary_list=["a", "b", "c", "e"])
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
    all_linear_cols = [sc_hash, sc_keys, sc_vocab]

    # Save checkpoint from which to warm-start.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        _ = variable_scope.get_variable(
            "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
        sc_keys_weights = variable_scope.get_variable(
            "some_other_name", shape=[4, 1], initializer=rand())
        _ = variable_scope.get_variable(
            "linear_model/sc_vocab/weights",
            initializer=[[0.5], [1.], [2.], [3.]])
        self._write_checkpoint(sess)
        prev_keys_val = sess.run(sc_keys_weights)

    def _partitioner(shape, dtype):  # pylint:disable=unused-argument
      # Partition each var into 2 equal slices.
      partitions = [1] * len(shape)
      partitions[0] = min(2, shape[0].value)
      return partitions

    # New graph, new session with warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, _partitioner)
        ws_settings = ws_util._WarmStartSettings(
            self.get_temp_dir(),
            col_to_prev_vocab={sc_vocab: prev_vocab_path},
            col_to_prev_tensor={sc_keys: "some_other_name"},
            exclude_columns=[sc_hash])
        ws_util._warmstart_input_layer(cols_to_vars, ws_settings)
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warm-started.  Var corresponding to
        # sc_hash should not be warm-started.  Var corresponding to sc_vocab
        # should be correctly warm-started after vocab remapping.
        self._assert_cols_to_vars(cols_to_vars, {
            sc_keys:
                np.split(prev_keys_val, 2),
            sc_hash: [np.zeros([8, 1]), np.zeros([7, 1])],
            sc_vocab: [
                np.array([[3.], [2.], [1.]]),
                np.array([[0.5], [0.], [0.]])
            ]
        }, sess)
Example 13
 def test_get_sparse_tensors(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
     wire_tensor = sparse_tensor.SparseTensor(
         values=['omar', 'stringer', 'marlo'],
         indices=[[0, 0], [1, 0], [1, 1]],
         dense_shape=[2, 2])
     builder = fc._LazyBuilder({'wire': wire_tensor})
     self.assertEqual(builder.get(hashed_sparse),
                      hashed_sparse._get_sparse_tensors(builder).id_tensor)
Example 14
 def test_dtype_should_match_with_tensor(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket(
         'wire', 10, dtype=dtypes.int64)
     wire_tensor = sparse_tensor.SparseTensor(values=['omar'],
                                              indices=[[0, 0]],
                                              dense_shape=[1, 1])
     builder = fc._LazyBuilder({'wire': wire_tensor})
     with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
         builder.get(hashed_sparse)
Example 15
def make_feature_cols(train):
    input_labels = []
    for col in cat_cols:
        tc = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_hash_bucket(
                col, getBucketSize(train[col].size)))
        input_labels.append(tc)
    input_labels.append(tf.feature_column.numeric_column('totals.hits'))
    input_labels.append(tf.feature_column.numeric_column('totals.pageviews'))
    input_labels.append(tf.feature_column.numeric_column('totals.visits'))
    return input_labels
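A hedged sketch of how these columns would typically be consumed. `cat_cols`, `getBucketSize`, and `train_df` are placeholders from the snippet's context, and `DNNRegressor` is just one plausible consumer, not the original author's model:

```python
# Hedged usage sketch (not from the original snippet): the indicator and
# numeric columns returned by make_feature_cols can feed a canned estimator.
import tensorflow as tf

feature_columns = make_feature_cols(train_df)  # train_df: hypothetical DataFrame
model = tf.estimator.DNNRegressor(
    feature_columns=feature_columns,
    hidden_units=[64, 32])
# model.train(input_fn=train_input_fn, steps=1000)  # train_input_fn: hypothetical
```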
Example 16
 def test_sparse_trainable_false(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
         wire_tensor = sparse_tensor.SparseTensor(values=['omar'],
                                                  indices=[[0, 0]],
                                                  dense_shape=[1, 1])
         features = {'wire_cast': wire_tensor}
         fc.make_linear_model(features, [wire_cast], trainable=False)
         trainable_vars = g.get_collection(
             ops.GraphKeys.TRAINABLE_VARIABLES)
         self.assertEqual([], trainable_vars)
Example 17
 def test_int32_64_is_compatible(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket(
         'wire', 10, dtype=dtypes.int64)
     wire_tensor = sparse_tensor.SparseTensor(
         values=constant_op.constant([101, 201, 301], dtype=dtypes.int32),
         indices=[[0, 0], [1, 0], [1, 1]],
         dense_shape=[2, 2])
     builder = fc._LazyBuilder({'wire': wire_tensor})
     output = builder.get(hashed_sparse)
     # Check exact hashed output. If hashing changes, this test will break.
     expected_values = [3, 7, 5]
     with self.test_session():
         self.assertAllEqual(expected_values, output.values.eval())
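The expected ids can be reproduced outside the column, assuming (as the implementation appears to do) that integer values are stringified and fed through the fast string-to-hash-bucket op:

```python
# Hedged reproduction of the hashed ids above. Assumption: integer inputs are
# converted to strings and hashed with string_to_hash_bucket_fast modulo the
# bucket count; if that holds, this prints [3 7 5].
import tensorflow as tf

values = tf.constant([101, 201, 301], dtype=tf.int32)
hashed = tf.string_to_hash_bucket_fast(tf.as_string(values), num_buckets=10)
with tf.Session() as sess:
    print(sess.run(hashed))
```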
Example 18
 def test_dtype_should_be_string_or_integer(self):
     fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.string)
     fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
     with self.assertRaisesRegexp(ValueError,
                                  'dtype must be string or integer'):
         fc.categorical_column_with_hash_bucket('aaa',
                                                10,
                                                dtype=dtypes.float32)
Example 19
 def test_sparse_collection(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
         wire_tensor = sparse_tensor.SparseTensor(values=['omar'],
                                                  indices=[[0, 0]],
                                                  dense_shape=[1, 1])
         features = {'wire_cast': wire_tensor}
         fc.make_linear_model(features, [wire_cast],
                              weight_collections=['my-vars'])
         my_vars = g.get_collection('my-vars')
         bias = get_linear_model_bias()
         wire_cast_var = get_linear_model_column_var(wire_cast)
         self.assertIn(bias, my_vars)
         self.assertIn(wire_cast_var, my_vars)
Example 20
 def test_sparse_combiner(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
         wire_tensor = sparse_tensor.SparseTensor(
             values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
             indices=[[0, 0], [1, 0], [1, 1]],
             dense_shape=[2, 2])
         features = {'wire_cast': wire_tensor}
         predictions = fc.make_linear_model(features, [wire_cast],
                                            sparse_combiner='mean')
         bias = get_linear_model_bias()
         wire_cast_var = get_linear_model_column_var(wire_cast)
         with _initialized_session() as sess:
             sess.run(
                 wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
             sess.run(bias.assign([5.]))
             self.assertAllClose([[1005.], [5010.]], predictions.eval())
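The expected predictions follow directly from the assigned weights and the 'mean' combiner; a small numpy check (illustrative only):

```python
# Reproducing the expected predictions by hand (illustrative only).
# Hashed ids per the comment above: row 0 -> {2}, row 1 -> {0, 3}.
import numpy as np

weights = np.array([[10.], [100.], [1000.], [10000.]])
bias = 5.
row0 = weights[2].item() + bias                        # 1000 + 5 = 1005
row1 = (weights[0] + weights[3]).item() / 2 + bias     # (10 + 10000)/2 + 5 = 5010
print([[row0], [row1]])  # [[1005.0], [5010.0]]
```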
Example 21
 def test_strings_should_be_hashed(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
     wire_tensor = sparse_tensor.SparseTensor(
         values=['omar', 'stringer', 'marlo'],
         indices=[[0, 0], [1, 0], [1, 1]],
         dense_shape=[2, 2])
     builder = fc._LazyBuilder({'wire': wire_tensor})
     output = builder.get(hashed_sparse)
     # Check exact hashed output. If hashing changes, this test will break.
     expected_values = [6, 4, 1]
     with self.test_session():
         self.assertEqual(dtypes.int64, output.values.dtype)
         self.assertAllEqual(expected_values, output.values.eval())
         self.assertAllEqual(wire_tensor.indices.eval(),
                             output.indices.eval())
         self.assertAllEqual(wire_tensor.dense_shape.eval(),
                             output.dense_shape.eval())
def sequence_categorical_column_with_hash_bucket(
    key, hash_bucket_size, dtype=dtypes.string):
  """A sequence of categorical terms where ids are set by hashing.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into dense representation for input to sequence NN, such as
  RNN.

  Example:

  ```python
  tokens = sequence_categorical_column_with_hash_bucket(
      'tokens', hash_bucket_size=1000)
  tokens_embedding = embedding_column(tokens, dimension=10)
  columns = [tokens_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    key: A unique string identifying the input feature.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_SequenceCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not greater than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  return fc_old._SequenceCategoricalColumn(
      fc_old.categorical_column_with_hash_bucket(
          key=key,
          hash_bucket_size=hash_bucket_size,
          dtype=dtype))
Example 23
def sequence_categorical_column_with_hash_bucket(
    key, hash_bucket_size, dtype=dtypes.string):
  """A sequence of categorical terms where ids are set by hashing.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into dense representation for input to sequence NN, such as
  RNN.

  Example:

  ```python
  tokens = sequence_categorical_column_with_hash_bucket(
      'tokens', hash_bucket_size=1000)
  tokens_embedding = embedding_column(tokens, dimension=10)
  columns = [tokens_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    key: A unique string identifying the input feature.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_SequenceCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not greater than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  return fc._SequenceCategoricalColumn(
      fc.categorical_column_with_hash_bucket(
          key=key,
          hash_bucket_size=hash_bucket_size,
          dtype=dtype))
Example 24
 def test_sparse_multi_output(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
         wire_tensor = sparse_tensor.SparseTensor(
             values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
             indices=[[0, 0], [1, 0], [1, 1]],
             dense_shape=[2, 2])
         features = {'wire_cast': wire_tensor}
         predictions = fc.make_linear_model(features, [wire_cast], units=3)
         bias = get_linear_model_bias()
         wire_cast_var = get_linear_model_column_var(wire_cast)
         with _initialized_session() as sess:
             self.assertAllClose([0., 0., 0.], bias.eval())
             self.assertAllClose([[0.] * 3] * 4, wire_cast_var.eval())
             sess.run(
                 wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.],
                                       [1000., 1100., 1200.],
                                       [10000., 11000., 12000.]]))
             sess.run(bias.assign([5., 6., 7.]))
             self.assertAllClose(
                 [[1005., 1106., 1207.], [10015., 11017., 12019.]],
                 predictions.eval())
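The same hand check as in Example 20, here for the multi-output case (the default combiner is 'sum', with three output units per bucket):

```python
# Reproducing the expected multi-output predictions (illustrative only).
import numpy as np

w = np.array([[10., 11., 12.], [100., 110., 120.],
              [1000., 1100., 1200.], [10000., 11000., 12000.]])
b = np.array([5., 6., 7.])
print(w[2] + b)         # [1005. 1106. 1207.]    (row 0: id 2)
print(w[0] + w[3] + b)  # [10015. 11017. 12019.] (row 1: ids 0 and 3, summed)
```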
Example 25
    def test_one_shot_prediction_head_export(self, estimator_factory):
        def _new_temp_dir():
            return os.path.join(test.get_temp_dir(), str(ops.uid()))

        model_dir = _new_temp_dir()
        categorical_column = feature_column.categorical_column_with_hash_bucket(
            key="categorical_exogenous_feature", hash_bucket_size=16)
        exogenous_feature_columns = [
            feature_column.numeric_column("2d_exogenous_feature", shape=(2, )),
            feature_column.embedding_column(
                categorical_column=categorical_column, dimension=10)
        ]
        estimator = estimator_factory(
            model_dir=model_dir,
            exogenous_feature_columns=exogenous_feature_columns,
            head_type=ts_head_lib.OneShotPredictionHead)
        train_features = {
            feature_keys.TrainEvalFeatures.TIMES:
            numpy.arange(20, dtype=numpy.int64),
            feature_keys.TrainEvalFeatures.VALUES:
            numpy.tile(numpy.arange(20, dtype=numpy.float32)[:, None], [1, 5]),
            "2d_exogenous_feature":
            numpy.ones([20, 2]),
            "categorical_exogenous_feature":
            numpy.array(["strkey"] * 20)[:, None]
        }
        train_input_fn = input_pipeline.RandomWindowInputFn(
            input_pipeline.NumpyReader(train_features),
            shuffle_seed=2,
            num_threads=1,
            batch_size=16,
            window_size=16)
        estimator.train(input_fn=train_input_fn, steps=5)
        result = estimator.evaluate(input_fn=train_input_fn, steps=1)
        self.assertIn("average_loss", result)
        self.assertNotIn(feature_keys.State.STATE_TUPLE, result)
        input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
        export_location = estimator.export_savedmodel(_new_temp_dir(),
                                                      input_receiver_fn)
        graph = ops.Graph()
        with graph.as_default():
            with session_lib.Session() as session:
                signatures = loader.load(session, [tag_constants.SERVING],
                                         export_location)
                self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
                                 list(signatures.signature_def.keys()))
                predict_signature = signatures.signature_def[
                    feature_keys.SavedModelLabels.PREDICT]
                six.assertCountEqual(self, [
                    feature_keys.FilteringFeatures.TIMES,
                    feature_keys.FilteringFeatures.VALUES,
                    "2d_exogenous_feature", "categorical_exogenous_feature"
                ], predict_signature.inputs.keys())
                features = {
                    feature_keys.TrainEvalFeatures.TIMES:
                    numpy.tile(
                        numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1]),
                    feature_keys.TrainEvalFeatures.VALUES:
                    numpy.tile(
                        numpy.arange(20, dtype=numpy.float32)[None, :, None],
                        [2, 1, 5]),
                    "2d_exogenous_feature":
                    numpy.ones([2, 35, 2]),
                    "categorical_exogenous_feature":
                    numpy.tile(
                        numpy.array(["strkey"] * 35)[None, :, None], [2, 1, 1])
                }
                feeds = {
                    graph.as_graph_element(input_value.name):
                    features[input_key]
                    for input_key, input_value in
                    predict_signature.inputs.items()
                }
                fetches = {
                    output_key: graph.as_graph_element(output_value.name)
                    for output_key, output_value in
                    predict_signature.outputs.items()
                }
                output = session.run(fetches, feed_dict=feeds)
                self.assertEqual((2, 15, 5), output["mean"].shape)
        # Build a parsing input function, then make a tf.Example for it to parse.
        export_location = estimator.export_savedmodel(
            _new_temp_dir(),
            estimator.build_one_shot_parsing_serving_input_receiver_fn(
                filtering_length=20, prediction_length=15))
        graph = ops.Graph()
        with graph.as_default():
            with session_lib.Session() as session:
                example = example_pb2.Example()
                times = example.features.feature[
                    feature_keys.TrainEvalFeatures.TIMES]
                values = example.features.feature[
                    feature_keys.TrainEvalFeatures.VALUES]
                times.int64_list.value.extend(range(35))
                for i in range(20):
                    values.float_list.value.extend([
                        float(i) * 2. + feature_number
                        for feature_number in range(5)
                    ])
                real_feature = example.features.feature["2d_exogenous_feature"]
                categorical_feature = example.features.feature[
                    "categorical_exogenous_feature"]
                for i in range(35):
                    real_feature.float_list.value.extend([1, 1])
                    categorical_feature.bytes_list.value.append(b"strkey")
                # Serialize the tf.Example for feeding to the Session
                examples = [example.SerializeToString()] * 2
                signatures = loader.load(session, [tag_constants.SERVING],
                                         export_location)
                predict_signature = signatures.signature_def[
                    feature_keys.SavedModelLabels.PREDICT]
                ((_, input_value), ) = predict_signature.inputs.items()
                feeds = {graph.as_graph_element(input_value.name): examples}
                fetches = {
                    output_key: graph.as_graph_element(output_value.name)
                    for output_key, output_value in
                    predict_signature.outputs.items()
                }
                output = session.run(fetches, feed_dict=feeds)
                self.assertEqual((2, 15, 5), output["mean"].shape)
Example 26
 def test_bucket_size_should_be_positive(self):
     with self.assertRaisesRegexp(ValueError,
                                  'hash_bucket_size must be at least 1'):
         fc.categorical_column_with_hash_bucket('aaa', 0)
Example 27
    def _build_feature_columns(self):
        multi_hot_feature_columns = {}
        multi_hot_feature_columns_deep = {}
        multi_category_feature_columns = {}
        continuous_feature_columns = {}
        crossed_feature_columns = []
        bucketized_feature_columns = []
        embedding_feature_columns = []

        if self._data_conf.multi_hot_columns is not None:
            for column in self._data_conf.multi_hot_columns:
                multi_hot_feature_columns[
                    column] = categorical_column_with_vocabulary_list(
                        column,
                        self._data_conf.multi_hot_columns[column],
                        dtype=tf.string)
                multi_hot_feature_columns_deep[column] = indicator_column(
                    multi_hot_feature_columns[column])

        if self._data_conf.multi_category_columns is not None:
            multi_category_feature_columns = {
                column:
                categorical_column_with_hash_bucket(column,
                                                    hash_bucket_size=1000)
                for column in self._data_conf.multi_category_columns
            }

        if self._data_conf.continuous_columns is not None:
            continuous_feature_columns = {
                column: numeric_column(column)
                for column in self._data_conf.continuous_columns
            }

        if self._data_conf.crossed_columns is not None:
            crossed_feature_columns = [
                crossed_column(_, hash_bucket_size=100000)
                for _ in self._data_conf.crossed_columns
            ]

        if self._data_conf.bucketized_columns is not None:
            for column, boundary in self._data_conf.bucketized_columns.items():
                bucketized_feature_columns.append(
                    bucketized_column(continuous_feature_columns[column],
                                      boundaries=boundary))

        if len(multi_category_feature_columns) > 0:
            embedding_feature_columns = [
                embedding_column(
                    _, dimension=self._model_conf.embedding_dimension)
                for _ in multi_category_feature_columns.values()
            ]

        self._feature_mapping = {
            0: list(multi_hot_feature_columns.values()),
            1: list(multi_category_feature_columns.values()),
            2: list(continuous_feature_columns.values()),
            3: crossed_feature_columns,
            4: bucketized_feature_columns,
            5: embedding_feature_columns,
            6: list(multi_hot_feature_columns_deep.values())
        }

        self._build_feature_columns_for_model()
  def testWarmStartVarsToWarmstartIsNone(self):
    # Create old and new vocabs for sparse column "sc_vocab".
    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                        "old_vocab")
    new_vocab_path = self._write_vocab(
        ["orange", "guava", "banana", "apple", "raspberry",
         "blueberry"], "new_vocab")
    # Create feature columns.
    sc_hash = fc.categorical_column_with_hash_bucket(
        "sc_hash", hash_bucket_size=15)
    sc_keys = fc.categorical_column_with_vocabulary_list(
        "sc_keys", vocabulary_list=["a", "b", "c", "e"])
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
    all_linear_cols = [sc_hash, sc_keys, sc_vocab]

    # Save checkpoint from which to warm-start.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        variable_scope.get_variable(
            "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
        variable_scope.get_variable(
            "some_other_name", shape=[4, 1], initializer=rand())
        variable_scope.get_variable(
            "linear_model/sc_vocab/weights",
            initializer=[[0.5], [1.], [2.], [3.]])
        self._write_checkpoint(sess)

    def _partitioner(shape, dtype):  # pylint:disable=unused-argument
      # Partition each var into 2 equal slices.
      partitions = [1] * len(shape)
      partitions[0] = min(2, shape[0].value)
      return partitions

    # New graph, new session with warm-starting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, _partitioner)
        vocab_info = ws_util.VocabInfo(
            new_vocab=sc_vocab.vocabulary_file,
            new_vocab_size=sc_vocab.vocabulary_size,
            num_oov_buckets=sc_vocab.num_oov_buckets,
            old_vocab=prev_vocab_path)
        ws_settings = ws_util.WarmStartSettings(
            self.get_temp_dir(),
            # The special value of None here will ensure that only the variable
            # specified in var_name_to_vocab_info (sc_vocab embedding) is
            # warm-started.
            vars_to_warm_start=None,
            var_name_to_vocab_info={
                ws_util._infer_var_name(cols_to_vars[sc_vocab]): vocab_info
            },
            # Even though this is provided, the None value for
            # vars_to_warm_start overrides the logic, and this will not be
            # warm-started.
            var_name_to_prev_var_name={
                ws_util._infer_var_name(cols_to_vars[sc_keys]):
                    "some_other_name"
            })
        ws_util._warm_start(ws_settings)
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warm-started.  Var corresponding to
        # sc_vocab should be correctly warm-started after vocab remapping,
        # and neither of the other two should be warm-started.
        self._assert_cols_to_vars(cols_to_vars, {
            sc_keys: [np.zeros([2, 1]), np.zeros([2, 1])],
            sc_hash: [np.zeros([8, 1]), np.zeros([7, 1])],
            sc_vocab: [
                np.array([[3.], [2.], [1.]]),
                np.array([[0.5], [0.], [0.]])
            ]
        }, sess)
  def testWarmStart_MultipleCols(self):
    # Create vocab for sparse column "sc_vocab".
    vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                   "vocab")

    # Create feature columns.
    sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)
    sc_hash = fc.categorical_column_with_hash_bucket(
        "sc_hash", hash_bucket_size=15)
    sc_keys = fc.categorical_column_with_vocabulary_list(
        "sc_keys", vocabulary_list=["a", "b", "c", "e"])
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=vocab_path, vocabulary_size=4)
    real = fc.numeric_column("real")
    real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])
    cross = fc.crossed_column([sc_keys, sc_vocab], hash_bucket_size=20)
    all_linear_cols = [sc_int, sc_hash, sc_keys, sc_vocab, real_bucket, cross]

    # Save checkpoint from which to warm-start.  Also create a bias variable,
    # so we can check that it's also warm-started.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        sc_int_weights = variable_scope.get_variable(
            "linear_model/sc_int/weights", shape=[10, 1], initializer=ones())
        sc_hash_weights = variable_scope.get_variable(
            "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
        sc_keys_weights = variable_scope.get_variable(
            "linear_model/sc_keys/weights", shape=[4, 1], initializer=rand())
        sc_vocab_weights = variable_scope.get_variable(
            "linear_model/sc_vocab/weights", shape=[4, 1], initializer=ones())
        real_bucket_weights = variable_scope.get_variable(
            "linear_model/real_bucketized/weights",
            shape=[5, 1],
            initializer=norms())
        cross_weights = variable_scope.get_variable(
            "linear_model/sc_keys_X_sc_vocab/weights",
            shape=[20, 1],
            initializer=rand())
        bias = variable_scope.get_variable(
            "linear_model/bias_weights",
            shape=[1],
            initializer=rand())
        self._write_checkpoint(sess)
        (prev_int_val, prev_hash_val, prev_keys_val, prev_vocab_val,
         prev_bucket_val, prev_cross_val, prev_bias_val) = sess.run([
             sc_int_weights, sc_hash_weights, sc_keys_weights, sc_vocab_weights,
             real_bucket_weights, cross_weights, bias
         ])

    partitioner = lambda shape, dtype: [1] * len(shape)
    # New graph, new session WITHOUT warm-starting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
        sess.run(variables.global_variables_initializer())
        # Without warm-starting, all weights should be initialized using default
        # initializer (which is init_ops.zeros_initializer).
        self._assert_cols_to_vars(cols_to_vars, {
            sc_int: [np.zeros([10, 1])],
            sc_hash: [np.zeros([15, 1])],
            sc_keys: [np.zeros([4, 1])],
            sc_vocab: [np.zeros([4, 1])],
            real_bucket: [np.zeros([5, 1])],
            cross: [np.zeros([20, 1])],
        }, sess)

    # New graph, new session with warm-starting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
        vocab_info = ws_util.VocabInfo(
            new_vocab=sc_vocab.vocabulary_file,
            new_vocab_size=sc_vocab.vocabulary_size,
            num_oov_buckets=sc_vocab.num_oov_buckets,
            old_vocab=vocab_path)
        ws_util._warm_start(
            ws_util.WarmStartSettings(
                self.get_temp_dir(),
                var_name_to_vocab_info={
                    "linear_model/sc_vocab/weights": vocab_info
                }))
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warm-started.
        self._assert_cols_to_vars(cols_to_vars, {
            sc_int: [prev_int_val],
            sc_hash: [prev_hash_val],
            sc_keys: [prev_keys_val],
            sc_vocab: [prev_vocab_val],
            real_bucket: [prev_bucket_val],
            cross: [prev_cross_val],
            "bias": [prev_bias_val],
        }, sess)
Example 30
 def test_one_shot_prediction_head_export(self):
     model_dir = self.get_temp_dir()
     categorical_column = feature_column.categorical_column_with_hash_bucket(
         key="categorical_exogenous_feature", hash_bucket_size=16)
     exogenous_feature_columns = [
         feature_column.numeric_column("2d_exogenous_feature", shape=(2, )),
         feature_column.embedding_column(
             categorical_column=categorical_column, dimension=10)
     ]
     estimator = ts_estimators.TimeSeriesRegressor(
         model=lstm_example._LSTMModel(
             num_features=5,
             num_units=128,
             exogenous_feature_columns=exogenous_feature_columns),
         optimizer=adam.AdamOptimizer(0.001),
         config=estimator_lib.RunConfig(tf_random_seed=4),
         state_manager=state_management.ChainingStateManager(),
         head_type=ts_head_lib.OneShotPredictionHead,
         model_dir=model_dir)
     train_features = {
         feature_keys.TrainEvalFeatures.TIMES:
         numpy.arange(20, dtype=numpy.int64),
         feature_keys.TrainEvalFeatures.VALUES:
         numpy.tile(numpy.arange(20, dtype=numpy.float32)[:, None], [1, 5]),
         "2d_exogenous_feature":
         numpy.ones([20, 2]),
         "categorical_exogenous_feature":
         numpy.array(["strkey"] * 20)[:, None]
     }
     train_input_fn = input_pipeline.RandomWindowInputFn(
         input_pipeline.NumpyReader(train_features),
         shuffle_seed=2,
         num_threads=1,
         batch_size=16,
         window_size=16)
     estimator.train(input_fn=train_input_fn, steps=5)
     input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
     export_location = estimator.export_savedmodel(self.get_temp_dir(),
                                                   input_receiver_fn)
     graph = ops.Graph()
     with graph.as_default():
         with session_lib.Session() as session:
             signatures = loader.load(session, [tag_constants.SERVING],
                                      export_location)
             self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
                              list(signatures.signature_def.keys()))
             predict_signature = signatures.signature_def[
                 feature_keys.SavedModelLabels.PREDICT]
             six.assertCountEqual(self, [
                 feature_keys.FilteringFeatures.TIMES,
                 feature_keys.FilteringFeatures.VALUES,
                 "2d_exogenous_feature", "categorical_exogenous_feature"
             ], predict_signature.inputs.keys())
             features = {
                 feature_keys.TrainEvalFeatures.TIMES:
                 numpy.tile(
                     numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1]),
                 feature_keys.TrainEvalFeatures.VALUES:
                 numpy.tile(
                     numpy.arange(20, dtype=numpy.float32)[None, :, None],
                     [2, 1, 5]),
                 "2d_exogenous_feature":
                 numpy.ones([2, 35, 2]),
                 "categorical_exogenous_feature":
                 numpy.tile(
                     numpy.array(["strkey"] * 35)[None, :, None], [2, 1, 1])
             }
             feeds = {
                 graph.as_graph_element(input_value.name):
                 features[input_key]
                 for input_key, input_value in
                 predict_signature.inputs.items()
             }
             fetches = {
                 output_key: graph.as_graph_element(output_value.name)
                 for output_key, output_value in
                 predict_signature.outputs.items()
             }
             output = session.run(fetches, feed_dict=feeds)
             self.assertAllEqual((2, 15, 5), output["mean"].shape)
Example 31
  def testWarmStartVarsToWarmstartIsNone(self):
    # Create old and new vocabs for sparse column "sc_vocab".
    prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                        "old_vocab")
    new_vocab_path = self._write_vocab(
        ["orange", "guava", "banana", "apple", "raspberry",
         "blueberry"], "new_vocab")
    # Create feature columns.
    sc_hash = fc.categorical_column_with_hash_bucket(
        "sc_hash", hash_bucket_size=15)
    sc_keys = fc.categorical_column_with_vocabulary_list(
        "sc_keys", vocabulary_list=["a", "b", "c", "e"])
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
    all_linear_cols = [sc_hash, sc_keys, sc_vocab]

    # Save checkpoint from which to warm-start.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        variable_scope.get_variable(
            "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
        variable_scope.get_variable(
            "some_other_name", shape=[4, 1], initializer=rand())
        variable_scope.get_variable(
            "linear_model/sc_vocab/weights",
            initializer=[[0.5], [1.], [2.], [3.]])
        self._write_checkpoint(sess)

    def _partitioner(shape, dtype):  # pylint:disable=unused-argument
      # Partition each var into 2 equal slices.
      partitions = [1] * len(shape)
      partitions[0] = min(2, shape[0].value)
      return partitions

    # New graph, new session with warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, _partitioner)
        vocab_info = ws_util._VocabInfo(
            new_vocab=sc_vocab.vocabulary_file,
            new_vocab_size=sc_vocab.vocabulary_size,
            num_oov_buckets=sc_vocab.num_oov_buckets,
            old_vocab=prev_vocab_path
        )
        ws_settings = ws_util._WarmStartSettings(
            self.get_temp_dir(),
            # The special value of None here will ensure that only the variable
            # specified in var_name_to_vocab_info (sc_vocab embedding) is
            # warmstarted.
            vars_to_warmstart=None,
            var_name_to_vocab_info={
                ws_util._infer_var_name(cols_to_vars[sc_vocab]): vocab_info
            },
            # Even though this is provided, the None value for vars_to_warmstart
            # overrides the logic, and this will not be warmstarted.
            var_name_to_prev_var_name={
                ws_util._infer_var_name(cols_to_vars[sc_keys]):
                    "some_other_name"
            })
        ws_util._warmstart(ws_settings)
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warmstarted.  Var corresponding to
        # sc_vocab should be correctly warmstarted after vocab remapping,
        # and neither of the other two should be warmstarted.
        self._assert_cols_to_vars(cols_to_vars, {
            sc_keys: [np.zeros([2, 1]), np.zeros([2, 1])],
            sc_hash: [np.zeros([8, 1]), np.zeros([7, 1])],
            sc_vocab: [
                np.array([[3.], [2.], [1.]]),
                np.array([[0.5], [0.], [0.]])
            ]
        }, sess)
Example 32
    def testWarmStartInputLayerMoreSettings(self):
        # Create old and new vocabs for sparse column "sc_vocab".
        prev_vocab_path = self._write_vocab(
            ["apple", "banana", "guava", "orange"], "old_vocab")
        new_vocab_path = self._write_vocab(
            ["orange", "guava", "banana", "apple", "raspberry", "blueberry"],
            "new_vocab")
        # Create feature columns.
        sc_hash = fc.categorical_column_with_hash_bucket("sc_hash",
                                                         hash_bucket_size=15)
        sc_keys = fc.categorical_column_with_vocabulary_list(
            "sc_keys", vocabulary_list=["a", "b", "c", "e"])
        sc_vocab = fc.categorical_column_with_vocabulary_file(
            "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
        all_linear_cols = [sc_hash, sc_keys, sc_vocab]

        # Save checkpoint from which to warm-start.
        with ops.Graph().as_default() as g:
            with self.test_session(graph=g) as sess:
                _ = variable_scope.get_variable("linear_model/sc_hash/weights",
                                                shape=[15, 1],
                                                initializer=norms())
                sc_keys_weights = variable_scope.get_variable(
                    "some_other_name", shape=[4, 1], initializer=rand())
                _ = variable_scope.get_variable(
                    "linear_model/sc_vocab/weights",
                    initializer=[[0.5], [1.], [2.], [3.]])
                self._write_checkpoint(sess)
                prev_keys_val = sess.run(sc_keys_weights)

        def _partitioner(shape, dtype):  # pylint:disable=unused-argument
            # Partition each var into 2 equal slices.
            partitions = [1] * len(shape)
            partitions[0] = min(2, shape[0].value)
            return partitions

        # New graph, new session with warmstarting.
        with ops.Graph().as_default() as g:
            with self.test_session(graph=g) as sess:
                cols_to_vars = self._create_linear_model(
                    all_linear_cols, _partitioner)
                ws_settings = ws_util._WarmStartSettings(
                    self.get_temp_dir(),
                    col_to_prev_vocab={sc_vocab: prev_vocab_path},
                    col_to_prev_tensor={sc_keys: "some_other_name"},
                    exclude_columns=[sc_hash])
                ws_util._warmstart_input_layer(cols_to_vars, ws_settings)
                sess.run(variables.global_variables_initializer())
                # Verify weights were correctly warm-started.  Var corresponding to
                # sc_hash should not be warm-started.  Var corresponding to sc_vocab
                # should be correctly warm-started after vocab remapping.
                self._assert_cols_to_vars(
                    cols_to_vars, {
                        sc_keys:
                        np.split(prev_keys_val, 2),
                        sc_hash: [np.zeros([8, 1]),
                                  np.zeros([7, 1])],
                        sc_vocab: [
                            np.array([[3.], [2.], [1.]]),
                            np.array([[0.5], [0.], [0.]])
                        ]
                    }, sess)
Example 33
  def testWarmStart_MultipleCols(self):
    # Create vocab for sparse column "sc_vocab".
    vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                   "vocab")

    # Create feature columns.
    sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)
    sc_hash = fc.categorical_column_with_hash_bucket(
        "sc_hash", hash_bucket_size=15)
    sc_keys = fc.categorical_column_with_vocabulary_list(
        "sc_keys", vocabulary_list=["a", "b", "c", "e"])
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=vocab_path, vocabulary_size=4)
    real = fc.numeric_column("real")
    real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])
    cross = fc.crossed_column([sc_keys, sc_vocab], hash_bucket_size=20)
    all_linear_cols = [sc_int, sc_hash, sc_keys, sc_vocab, real_bucket, cross]

    # Save checkpoint from which to warm-start.  Also create a bias variable,
    # so we can check that it's also warmstarted.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        sc_int_weights = variable_scope.get_variable(
            "linear_model/sc_int/weights", shape=[10, 1], initializer=ones())
        sc_hash_weights = variable_scope.get_variable(
            "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
        sc_keys_weights = variable_scope.get_variable(
            "linear_model/sc_keys/weights", shape=[4, 1], initializer=rand())
        sc_vocab_weights = variable_scope.get_variable(
            "linear_model/sc_vocab/weights", shape=[4, 1], initializer=ones())
        real_bucket_weights = variable_scope.get_variable(
            "linear_model/real_bucketized/weights",
            shape=[5, 1],
            initializer=norms())
        cross_weights = variable_scope.get_variable(
            "linear_model/sc_keys_X_sc_vocab/weights",
            shape=[20, 1],
            initializer=rand())
        bias = variable_scope.get_variable(
            "linear_model/bias_weights",
            shape=[1],
            initializer=rand())
        self._write_checkpoint(sess)
        (prev_int_val, prev_hash_val, prev_keys_val, prev_vocab_val,
         prev_bucket_val, prev_cross_val, prev_bias_val) = sess.run([
             sc_int_weights, sc_hash_weights, sc_keys_weights, sc_vocab_weights,
             real_bucket_weights, cross_weights, bias
         ])

    partitioner = lambda shape, dtype: [1] * len(shape)
    # New graph, new session WITHOUT warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
        sess.run(variables.global_variables_initializer())
        # Without warmstarting, all weights should be initialized using default
        # initializer (which is init_ops.zeros_initializer).
        self._assert_cols_to_vars(cols_to_vars, {
            sc_int: [np.zeros([10, 1])],
            sc_hash: [np.zeros([15, 1])],
            sc_keys: [np.zeros([4, 1])],
            sc_vocab: [np.zeros([4, 1])],
            real_bucket: [np.zeros([5, 1])],
            cross: [np.zeros([20, 1])],
        }, sess)

    # New graph, new session with warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
        vocab_info = ws_util._VocabInfo(
            new_vocab=sc_vocab.vocabulary_file,
            new_vocab_size=sc_vocab.vocabulary_size,
            num_oov_buckets=sc_vocab.num_oov_buckets,
            old_vocab=vocab_path
        )
        ws_util._warmstart(
            ws_util._WarmStartSettings(
                self.get_temp_dir(),
                var_name_to_vocab_info={
                    "linear_model/sc_vocab/weights": vocab_info
                }))
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warmstarted.
        self._assert_cols_to_vars(cols_to_vars, {
            sc_int: [prev_int_val],
            sc_hash: [prev_hash_val],
            sc_keys: [prev_keys_val],
            sc_vocab: [prev_vocab_val],
            real_bucket: [prev_bucket_val],
            cross: [prev_cross_val],
            "bias": [prev_bias_val],
        }, sess)
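
The test above drives the contrib-internal helpers (`ws_util._WarmStartSettings`, `ws_util._warmstart`) directly. Purely as a hedged sketch, the same vocab-remapped warm start is normally expressed through the public TF 1.x Estimator API; the paths, sizes, and variable name below are placeholders, not values taken from the test:

import tensorflow as tf

# Hedged sketch: all paths, sizes, and the variable name are placeholders.
sc_vocab = tf.feature_column.categorical_column_with_vocabulary_file(
    "sc_vocab", vocabulary_file="/tmp/new_vocab.txt", vocabulary_size=4)

vocab_info = tf.train.VocabInfo(
    new_vocab="/tmp/new_vocab.txt",
    new_vocab_size=4,
    num_oov_buckets=0,
    old_vocab="/tmp/old_vocab.txt")

warm_start = tf.estimator.WarmStartSettings(
    ckpt_to_initialize_from="/tmp/prev_model",
    # Variable name is approximate; canned estimators prefix it with their own scope.
    var_name_to_vocab_info={"linear/linear_model/sc_vocab/weights": vocab_info})

estimator = tf.estimator.LinearRegressor(
    feature_columns=[sc_vocab], warm_start_from=warm_start)
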
Example No. 34
 def test_bucket_size_should_be_given(self):
     with self.assertRaisesRegexp(ValueError,
                                  'hash_bucket_size must be set.'):
         fc.categorical_column_with_hash_bucket('aaa', None)
Example No. 35
 def test_one_shot_prediction_head_export(self, estimator_factory):
   def _new_temp_dir():
     return os.path.join(test.get_temp_dir(), str(ops.uid()))
   model_dir = _new_temp_dir()
   categorical_column = feature_column.categorical_column_with_hash_bucket(
       key="categorical_exogenous_feature", hash_bucket_size=16)
   exogenous_feature_columns = [
       feature_column.numeric_column(
           "2d_exogenous_feature", shape=(2,)),
       feature_column.embedding_column(
           categorical_column=categorical_column, dimension=10)]
   estimator = estimator_factory(
       model_dir=model_dir,
       exogenous_feature_columns=exogenous_feature_columns,
       head_type=ts_head_lib.OneShotPredictionHead)
   train_features = {
       feature_keys.TrainEvalFeatures.TIMES: numpy.arange(
           20, dtype=numpy.int64),
       feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
           20, dtype=numpy.float32)[:, None], [1, 5]),
       "2d_exogenous_feature": numpy.ones([20, 2]),
       "categorical_exogenous_feature": numpy.array(
           ["strkey"] * 20)[:, None]
   }
   train_input_fn = input_pipeline.RandomWindowInputFn(
       input_pipeline.NumpyReader(train_features), shuffle_seed=2,
       num_threads=1, batch_size=16, window_size=16)
   estimator.train(input_fn=train_input_fn, steps=5)
   result = estimator.evaluate(input_fn=train_input_fn, steps=1)
   self.assertIn("average_loss", result)
   self.assertNotIn(feature_keys.State.STATE_TUPLE, result)
   input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
   export_location = estimator.export_saved_model(_new_temp_dir(),
                                                  input_receiver_fn)
   graph = ops.Graph()
   with graph.as_default():
     with session_lib.Session() as session:
       signatures = loader.load(
           session, [tag_constants.SERVING], export_location)
       self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
                        list(signatures.signature_def.keys()))
       predict_signature = signatures.signature_def[
           feature_keys.SavedModelLabels.PREDICT]
       six.assertCountEqual(
           self,
           [feature_keys.FilteringFeatures.TIMES,
            feature_keys.FilteringFeatures.VALUES,
            "2d_exogenous_feature",
            "categorical_exogenous_feature"],
           predict_signature.inputs.keys())
       features = {
           feature_keys.TrainEvalFeatures.TIMES: numpy.tile(
               numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1]),
           feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
               20, dtype=numpy.float32)[None, :, None], [2, 1, 5]),
           "2d_exogenous_feature": numpy.ones([2, 35, 2]),
           "categorical_exogenous_feature": numpy.tile(numpy.array(
               ["strkey"] * 35)[None, :, None], [2, 1, 1])
       }
       feeds = {
           graph.as_graph_element(input_value.name): features[input_key]
           for input_key, input_value in predict_signature.inputs.items()}
       fetches = {output_key: graph.as_graph_element(output_value.name)
                  for output_key, output_value
                  in predict_signature.outputs.items()}
       output = session.run(fetches, feed_dict=feeds)
       self.assertEqual((2, 15, 5), output["mean"].shape)
   # Build a parsing input function, then make a tf.Example for it to parse.
   export_location = estimator.export_saved_model(
       _new_temp_dir(),
       estimator.build_one_shot_parsing_serving_input_receiver_fn(
           filtering_length=20, prediction_length=15))
   graph = ops.Graph()
   with graph.as_default():
     with session_lib.Session() as session:
       example = example_pb2.Example()
       times = example.features.feature[feature_keys.TrainEvalFeatures.TIMES]
       values = example.features.feature[feature_keys.TrainEvalFeatures.VALUES]
       times.int64_list.value.extend(range(35))
       for i in range(20):
         values.float_list.value.extend(
             [float(i) * 2. + feature_number
              for feature_number in range(5)])
       real_feature = example.features.feature["2d_exogenous_feature"]
        categorical_feature = example.features.feature[
           "categorical_exogenous_feature"]
       for i in range(35):
         real_feature.float_list.value.extend([1, 1])
          categorical_feature.bytes_list.value.append(b"strkey")
       # Serialize the tf.Example for feeding to the Session
       examples = [example.SerializeToString()] * 2
       signatures = loader.load(
           session, [tag_constants.SERVING], export_location)
       predict_signature = signatures.signature_def[
           feature_keys.SavedModelLabels.PREDICT]
       ((_, input_value),) = predict_signature.inputs.items()
       feeds = {graph.as_graph_element(input_value.name): examples}
       fetches = {output_key: graph.as_graph_element(output_value.name)
                  for output_key, output_value
                  in predict_signature.outputs.items()}
       output = session.run(fetches, feed_dict=feeds)
       self.assertEqual((2, 15, 5), output["mean"].shape)
Example No. 36
 def test_one_shot_prediction_head_export(self, estimator_factory):
   model_dir = os.path.join(test.get_temp_dir(), str(ops.uid()))
   categorical_column = feature_column.categorical_column_with_hash_bucket(
       key="categorical_exogenous_feature", hash_bucket_size=16)
   exogenous_feature_columns = [
       feature_column.numeric_column(
           "2d_exogenous_feature", shape=(2,)),
       feature_column.embedding_column(
           categorical_column=categorical_column, dimension=10)]
   estimator = estimator_factory(
       model_dir=model_dir,
       exogenous_feature_columns=exogenous_feature_columns,
       head_type=ts_head_lib.OneShotPredictionHead)
   train_features = {
       feature_keys.TrainEvalFeatures.TIMES: numpy.arange(
           20, dtype=numpy.int64),
       feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
           20, dtype=numpy.float32)[:, None], [1, 5]),
       "2d_exogenous_feature": numpy.ones([20, 2]),
       "categorical_exogenous_feature": numpy.array(
           ["strkey"] * 20)[:, None]
   }
   train_input_fn = input_pipeline.RandomWindowInputFn(
       input_pipeline.NumpyReader(train_features), shuffle_seed=2,
       num_threads=1, batch_size=16, window_size=16)
   estimator.train(input_fn=train_input_fn, steps=5)
   input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
   export_location = estimator.export_savedmodel(test.get_temp_dir(),
                                                 input_receiver_fn)
   graph = ops.Graph()
   with graph.as_default():
     with session_lib.Session() as session:
       signatures = loader.load(
           session, [tag_constants.SERVING], export_location)
       self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
                        list(signatures.signature_def.keys()))
       predict_signature = signatures.signature_def[
           feature_keys.SavedModelLabels.PREDICT]
       six.assertCountEqual(
           self,
           [feature_keys.FilteringFeatures.TIMES,
            feature_keys.FilteringFeatures.VALUES,
            "2d_exogenous_feature",
            "categorical_exogenous_feature"],
           predict_signature.inputs.keys())
       features = {
           feature_keys.TrainEvalFeatures.TIMES: numpy.tile(
               numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1]),
           feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
               20, dtype=numpy.float32)[None, :, None], [2, 1, 5]),
           "2d_exogenous_feature": numpy.ones([2, 35, 2]),
           "categorical_exogenous_feature": numpy.tile(numpy.array(
               ["strkey"] * 35)[None, :, None], [2, 1, 1])
       }
       feeds = {
           graph.as_graph_element(input_value.name): features[input_key]
           for input_key, input_value in predict_signature.inputs.items()}
       fetches = {output_key: graph.as_graph_element(output_value.name)
                  for output_key, output_value
                  in predict_signature.outputs.items()}
       output = session.run(fetches, feed_dict=feeds)
       self.assertEqual((2, 15, 5), output["mean"].shape)
Example No. 37
from tensorflow.contrib.learn import LinearRegressor, pandas_input_fn, DNNRegressor, Experiment
from tensorflow.python.feature_column.feature_column import categorical_column_with_hash_bucket, numeric_column, \
    categorical_column_with_vocabulary_list, embedding_column, indicator_column

make = categorical_column_with_hash_bucket('make', 100)
horsepower = numeric_column('horsepower', shape=[])
cylinders = categorical_column_with_vocabulary_list(
    'num-of-cylinders', ['two', 'three', 'four', 'six', 'eight'])

###############
regressor = DNNRegressor(feature_columns=[
    embedding_column(make, 10), horsepower,
    indicator_column(cylinders)
],
                         hidden_units=[50, 30, 10])
################
regressor = LinearRegressor(feature_columns=[make, horsepower, cylinders])

# input_fn built from pandas data; input_data / input_label are assumed to be a
# DataFrame / Series defined elsewhere in the original script.
train_input_fn = pandas_input_fn(x=input_data,
                                 y=input_label,
                                 batch_size=64,
                                 shuffle=True,
                                 num_epochs=None)

regressor.train(train_input_fn, steps=10000)


def experiment_fn(run_config, hparams):
    regressor = DNNRegressor(...,
                             config=run_config,
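
# (The original snippet is cut off above. Below is a hedged, illustrative sketch of
# how an experiment_fn for tf.contrib.learn typically continues -- not the original
# code; the eval input_fn and the step counts are placeholders.)
def experiment_fn(run_config, hparams):
    regressor = DNNRegressor(
        feature_columns=[
            embedding_column(make, 10), horsepower, indicator_column(cylinders)
        ],
        hidden_units=[50, 30, 10],
        config=run_config)
    return Experiment(
        estimator=regressor,
        train_input_fn=train_input_fn,
        eval_input_fn=train_input_fn,  # placeholder; real code would use a separate eval set
        train_steps=10000,
        eval_steps=100)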
Example No. 38
 def test_defaults(self):
     a = fc.categorical_column_with_hash_bucket('aaa', 10)
     self.assertEqual('aaa', a.name)
     self.assertEqual('aaa', a.key)
     self.assertEqual(10, a.hash_bucket_size)
     self.assertEqual(dtypes.string, a.dtype)
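
The test above only checks the column's defaults (string dtype, the given bucket count). As a minimal, hedged usage sketch showing those defaults in action (the 'color' key and the sample strings are made up; TF 1.x graph mode assumed):

import tensorflow as tf

# Hedged sketch: 'color' and the input values are placeholders for illustration.
color = tf.feature_column.categorical_column_with_hash_bucket(
    'color', hash_bucket_size=5)  # dtype defaults to tf.string
dense = tf.feature_column.input_layer(
    {'color': tf.constant([['red'], ['blue']])},
    [tf.feature_column.indicator_column(color)])

with tf.Session() as sess:
    print(sess.run(dense))  # two one-hot rows of width 5; bucket index comes from hashing
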
Example No. 39
 def test_parse_config_int(self):
     a = fc.categorical_column_with_hash_bucket('aaa',
                                                10,
                                                dtype=dtypes.int32)
     self.assertEqual({'aaa': parsing_ops.VarLenFeature(dtypes.int32)},
                      a._parse_example_config)
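
The parsing config asserted above is what `make_parse_example_spec` produces for this column. A short, hedged sketch of consuming it (TF 1.x API; the serialized batch is left as a placeholder):

import tensorflow as tf

# Hedged sketch mirroring the test's column; the serialized examples are a placeholder.
aaa = tf.feature_column.categorical_column_with_hash_bucket(
    'aaa', 10, dtype=tf.int32)
spec = tf.feature_column.make_parse_example_spec([aaa])
# spec == {'aaa': tf.VarLenFeature(tf.int32)}, matching _parse_example_config above.
serialized = tf.placeholder(tf.string, shape=[None])
parsed = tf.parse_example(serialized, features=spec)  # {'aaa': SparseTensor of int32 ids}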