def test_tensor_dtype_should_be_string_or_integer(self):
  """Fetching a hashed column fails when the fed tensor holds floats."""

  def _single_value_sparse(value):
    # 1x1 sparse tensor holding `value` at position [0, 0].
    return sparse_tensor.SparseTensor(
        values=constant_op.constant([value]),
        indices=[[0, 0]],
        dense_shape=[1, 1])

  string_fc = fc.categorical_column_with_hash_bucket(
      'a_string', 10, dtype=dtypes.string)
  int_fc = fc.categorical_column_with_hash_bucket(
      'a_int', 10, dtype=dtypes.int32)
  # This column is declared with a string dtype; the error asserted below is
  # expected only once the float tensor is fetched through the builder.
  float_fc = fc.categorical_column_with_hash_bucket(
      'a_float', 10, dtype=dtypes.string)

  int_tensor = _single_value_sparse(101)
  string_tensor = _single_value_sparse('101')
  float_tensor = _single_value_sparse(101.)

  builder = fc._LazyBuilder({
      'a_int': int_tensor,
      'a_string': string_tensor,
      'a_float': float_tensor
  })

  # String and integer inputs are fetched without raising.
  builder.get(string_fc)
  builder.get(int_fc)
  with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
    builder.get(float_fc)
def sequence_categorical_column_with_hash_bucket(
    key, hash_bucket_size, dtype=dtypes.string):
  """Builds a sequence categorical column whose ids come from hashing.

  The column is a `_SequenceCategoricalColumn` wrapped around the result of
  `fc.categorical_column_with_hash_bucket`.

  Example:

  ```python
  tokens = sequence_categorical_column_with_hash_bucket(
      'tokens', hash_bucket_size=1000)
  tokens_embedding = embedding_column(tokens, dimension=10)
  columns = [tokens_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    key: A unique string identifying the input feature.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_SequenceCategoricalColumn`.
  """
  hashed_column = fc.categorical_column_with_hash_bucket(
      key=key, hash_bucket_size=hash_bucket_size, dtype=dtype)
  return _SequenceCategoricalColumn(hashed_column)
def testPartitioner(self):
  """Trains one step with a custom partitioner and checks it via a hook."""
  num_buckets = 64
  num_partitions = 4

  def _partitioner(shape, dtype):
    del dtype  # unused; required by Fn signature.
    # Only partition the embedding tensor.
    if shape[0] == num_buckets:
      return [num_partitions, 1]
    return [1]

  language_column = feature_column_lib.categorical_column_with_hash_bucket(
      'language', hash_bucket_size=num_buckets)
  regressor = self._linear_regressor_fn(
      feature_columns=(language_column,),
      partitioner=_partitioner,
      model_dir=self._model_dir)

  def _input_fn():
    features = {
        'language':
            sparse_tensor.SparseTensor(
                values=['english', 'spanish'],
                indices=[[0, 0], [0, 1]],
                dense_shape=[1, 2])
    }
    labels = [[10.]]
    return features, labels

  hook = CheckPartitionerVarHook(self, LANGUAGE_WEIGHT_NAME, num_buckets,
                                 num_partitions)
  regressor.train(input_fn=_input_fn, steps=1, hooks=[hook])
def testDefaultPartitionerWithMultiplePsReplicas(self):
  """Trains one step with a fake multi-PS config and checks partitioning."""
  num_partitions = 2
  # This results in weights larger than the default partition size of 64M,
  # so partitioned weights are created (each weight uses 4 bytes).
  num_buckets = 32 << 20

  class FakeRunConfig(run_config.RunConfig):
    """RunConfig that reports `num_partitions` parameter-server replicas."""

    @property
    def num_ps_replicas(self):
      return num_partitions

  # Mock the device setter as ps is not available on test machines.
  device_setter_patch = test.mock.patch.object(
      estimator,
      '_get_replica_device_setter',
      return_value=lambda _: '/cpu:0')
  with device_setter_patch:
    language_column = feature_column_lib.categorical_column_with_hash_bucket(
        'language', hash_bucket_size=num_buckets)
    linear_regressor = self._linear_regressor_fn(
        feature_columns=(language_column,),
        config=FakeRunConfig(),
        model_dir=self._model_dir)

    def _input_fn():
      features = {
          'language':
              sparse_tensor.SparseTensor(
                  values=['english', 'spanish'],
                  indices=[[0, 0], [0, 1]],
                  dense_shape=[1, 2])
      }
      labels = [[10.]]
      return features, labels

    hook = CheckPartitionerVarHook(self, LANGUAGE_WEIGHT_NAME, num_buckets,
                                   num_partitions)
    linear_regressor.train(input_fn=_input_fn, steps=1, hooks=[hook])
def testWarmStart_SparseColumnHashed(self):
  """Compares a hashed column's weights with and without warm-starting."""
  # Create feature column.
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)

  # Save checkpoint from which to warm-start.
  _, prev_hash_val = self._create_prev_run_var(
      "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())

  def partitioner(shape, dtype):  # pylint:disable=unused-argument
    # A single partition along every axis.
    return [1] * len(shape)

  # New graph, new session WITHOUT warmstarting.
  with ops.Graph().as_default() as cold_graph:
    with self.test_session(graph=cold_graph) as sess:
      cols_to_vars = self._create_linear_model([sc_hash], partitioner)
      sess.run(variables.global_variables_initializer())
      # Without warmstarting, the weights should be initialized using default
      # initializer (which is init_ops.zeros_initializer).
      expected_cold = {sc_hash: [np.zeros([15, 1])]}
      self._assert_cols_to_vars(cols_to_vars, expected_cold, sess)

  # New graph, new session with warmstarting.
  with ops.Graph().as_default() as warm_graph:
    with self.test_session(graph=warm_graph) as sess:
      cols_to_vars = self._create_linear_model([sc_hash], partitioner)
      ws_settings = ws_util._WarmStartSettings(
          self.get_temp_dir(), vars_to_warmstart=".*sc_hash.*")
      ws_util._warmstart(ws_settings)
      sess.run(variables.global_variables_initializer())
      # Verify weights were correctly warmstarted.
      expected_warm = {sc_hash: [prev_hash_val]}
      self._assert_cols_to_vars(cols_to_vars, expected_warm, sess)
def testWarmStart_SparseColumnHashed(self):
  """Compares a hashed column's weights with and without warm-starting."""
  # Create feature column.
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)

  # Save checkpoint from which to warm-start.
  _, prev_hash_val = self._create_prev_run_var(
      "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())

  def partitioner(shape, dtype):  # pylint:disable=unused-argument
    # A single partition along every axis.
    return [1] * len(shape)

  # New graph, new session WITHOUT warm-starting.
  with ops.Graph().as_default() as cold_graph:
    with self.test_session(graph=cold_graph) as sess:
      cols_to_vars = self._create_linear_model([sc_hash], partitioner)
      sess.run(variables.global_variables_initializer())
      # Without warm-starting, the weights should be initialized using default
      # initializer (which is init_ops.zeros_initializer).
      expected_cold = {sc_hash: [np.zeros([15, 1])]}
      self._assert_cols_to_vars(cols_to_vars, expected_cold, sess)

  # New graph, new session with warm-starting.
  with ops.Graph().as_default() as warm_graph:
    with self.test_session(graph=warm_graph) as sess:
      cols_to_vars = self._create_linear_model([sc_hash], partitioner)
      ws_settings = ws_util.WarmStartSettings(
          self.get_temp_dir(), vars_to_warm_start=".*sc_hash.*")
      ws_util._warm_start(ws_settings)
      sess.run(variables.global_variables_initializer())
      # Verify weights were correctly warm-started.
      expected_warm = {sc_hash: [prev_hash_val]}
      self._assert_cols_to_vars(cols_to_vars, expected_warm, sess)
def testWarmStartMoreSettingsNoPartitioning(self):
  """Warm-starts with vocab remapping and a renamed var, unpartitioned."""
  # Create old and new vocabs for sparse column "sc_vocab".
  old_tokens = ["apple", "banana", "guava", "orange"]
  new_tokens = ["orange", "guava", "banana", "apple", "raspberry", "blueberry"]
  prev_vocab_path = self._write_vocab(old_tokens, "old_vocab")
  new_vocab_path = self._write_vocab(new_tokens, "new_vocab")

  # Create feature columns.
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
  all_linear_cols = [sc_hash, sc_keys, sc_vocab]

  # Save checkpoint from which to warm-start.
  with ops.Graph().as_default() as save_graph:
    with self.test_session(graph=save_graph) as sess:
      variable_scope.get_variable(
          "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
      sc_keys_weights = variable_scope.get_variable(
          "some_other_name", shape=[4, 1], initializer=rand())
      variable_scope.get_variable(
          "linear_model/sc_vocab/weights",
          initializer=[[0.5], [1.], [2.], [3.]])
      self._write_checkpoint(sess)
      prev_keys_val = sess.run(sc_keys_weights)

  # New graph, new session with warmstarting.
  with ops.Graph().as_default() as warm_graph:
    with self.test_session(graph=warm_graph) as sess:
      cols_to_vars = self._create_linear_model(all_linear_cols,
                                               partitioner=None)
      vocab_info = ws_util._VocabInfo(
          new_vocab=sc_vocab.vocabulary_file,
          new_vocab_size=sc_vocab.vocabulary_size,
          num_oov_buckets=sc_vocab.num_oov_buckets,
          old_vocab=prev_vocab_path)
      checkpoint_dir = self.get_temp_dir()
      sc_vocab_var_name = ws_util._infer_var_name(cols_to_vars[sc_vocab])
      sc_keys_var_name = ws_util._infer_var_name(cols_to_vars[sc_keys])
      ws_settings = ws_util._WarmStartSettings(
          checkpoint_dir,
          vars_to_warmstart=".*(sc_keys|sc_vocab).*",
          var_name_to_vocab_info={sc_vocab_var_name: vocab_info},
          var_name_to_prev_var_name={sc_keys_var_name: "some_other_name"})
      ws_util._warmstart(ws_settings)
      sess.run(variables.global_variables_initializer())
      # Verify weights were correctly warmstarted.  Var corresponding to
      # sc_hash should not be warm-started.  Var corresponding to sc_vocab
      # should be correctly warmstarted after vocab remapping.
      expected = {
          sc_keys: [prev_keys_val],
          sc_hash: [np.zeros([15, 1])],
          sc_vocab: [np.array([[3.], [2.], [1.], [0.5], [0.], [0.]])]
      }
      self._assert_cols_to_vars(cols_to_vars, expected, sess)
def test_deep_copy(self):
  """Tests deepcopy of categorical_column_with_hash_bucket."""
  original = fc.categorical_column_with_hash_bucket('aaa', 10)
  duplicate = copy.deepcopy(original)
  # Each attribute of the copy is compared with the value used above; the
  # dtype was not passed explicitly and is expected to be string.
  expected_attributes = (
      ('name', 'aaa'),
      ('hash_bucket_size', 10),
      ('dtype', dtypes.string),
  )
  for attribute, expected in expected_attributes:
    self.assertEqual(expected, getattr(duplicate, attribute))
def testDefaultPartitionerWithMultiplePsReplicas(self):
  """Trains a LinearRegressor under a fake multi-PS config and checks vars."""
  num_partitions = 2
  # This results in weights larger than the default partition size of 64M,
  # so partitioned weights are created (each weight uses 4 bytes).
  num_buckets = 32 << 20

  class FakeRunConfig(run_config.RunConfig):
    """RunConfig that reports `num_partitions` parameter-server replicas."""

    @property
    def num_ps_replicas(self):
      return num_partitions

  # Mock the device setter as ps is not available on test machines.
  device_setter_patch = test.mock.patch.object(
      estimator,
      '_get_replica_device_setter',
      return_value=lambda _: '/cpu:0')
  with device_setter_patch:
    language_column = feature_column_lib.categorical_column_with_hash_bucket(
        'language', hash_bucket_size=num_buckets)
    linear_regressor = linear.LinearRegressor(
        feature_columns=(language_column,),
        config=FakeRunConfig(),
        model_dir=self._model_dir)

    def _input_fn():
      features = {
          'language':
              sparse_tensor.SparseTensor(
                  values=['english', 'spanish'],
                  indices=[[0, 0], [0, 1]],
                  dense_shape=[1, 2])
      }
      labels = [[10.]]
      return features, labels

    hook = _CheckPartitionerVarHook(self, _LANGUAGE_WEIGHT_NAME, num_buckets,
                                    num_partitions)
    linear_regressor.train(input_fn=_input_fn, steps=1, hooks=[hook])
def testPartitioner(self):
  """Trains a LinearRegressor with a custom partitioner and checks vars."""
  num_buckets = 64
  num_partitions = 4

  def _partitioner(shape, dtype):
    del dtype  # unused; required by Fn signature.
    # Only partition the embedding tensor.
    if shape[0] == num_buckets:
      return [num_partitions, 1]
    return [1]

  language_column = feature_column_lib.categorical_column_with_hash_bucket(
      'language', hash_bucket_size=num_buckets)
  regressor = linear.LinearRegressor(
      feature_columns=(language_column,),
      partitioner=_partitioner,
      model_dir=self._model_dir)

  def _input_fn():
    features = {
        'language':
            sparse_tensor.SparseTensor(
                values=['english', 'spanish'],
                indices=[[0, 0], [0, 1]],
                dense_shape=[1, 2])
    }
    labels = [[10.]]
    return features, labels

  hook = _CheckPartitionerVarHook(self, _LANGUAGE_WEIGHT_NAME, num_buckets,
                                  num_partitions)
  regressor.train(input_fn=_input_fn, steps=1, hooks=[hook])
def test_column_order(self):
  """Collected variables appear in the same order for both column orderings."""
  price_a = fc.numeric_column('price_a')
  price_b = fc.numeric_column('price_b')
  wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)

  # The same assertions are run for two different orderings of the columns.
  column_orderings = (
      [price_a, wire_cast, price_b],
      [wire_cast, price_b, price_a],
  )
  for columns in column_orderings:
    with ops.Graph().as_default() as g:
      features = {
          'price_a': [[1.]],
          'price_b': [[3.]],
          'wire_cast':
              sparse_tensor.SparseTensor(
                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
      }
      fc.make_linear_model(features, columns, weight_collections=['my-vars'])
      my_vars = g.get_collection('my-vars')
      self.assertIn('price_a', my_vars[0].name)
      self.assertIn('price_b', my_vars[1].name)
      self.assertIn('wire_cast', my_vars[2].name)
def testWarmStartInputLayerMoreSettings(self):
  """Warm-starts partitioned vars with vocab remap, rename and exclusion."""
  # Create old and new vocabs for sparse column "sc_vocab".
  old_tokens = ["apple", "banana", "guava", "orange"]
  new_tokens = ["orange", "guava", "banana", "apple", "raspberry", "blueberry"]
  prev_vocab_path = self._write_vocab(old_tokens, "old_vocab")
  new_vocab_path = self._write_vocab(new_tokens, "new_vocab")

  # Create feature columns.
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
  all_linear_cols = [sc_hash, sc_keys, sc_vocab]

  # Save checkpoint from which to warm-start.
  with ops.Graph().as_default() as save_graph:
    with self.test_session(graph=save_graph) as sess:
      _ = variable_scope.get_variable(
          "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
      sc_keys_weights = variable_scope.get_variable(
          "some_other_name", shape=[4, 1], initializer=rand())
      _ = variable_scope.get_variable(
          "linear_model/sc_vocab/weights",
          initializer=[[0.5], [1.], [2.], [3.]])
      self._write_checkpoint(sess)
      prev_keys_val = sess.run(sc_keys_weights)

  def _partitioner(shape, dtype):  # pylint:disable=unused-argument
    # Partition each var into 2 equal slices.
    return [min(2, shape[0].value)] + [1] * (len(shape) - 1)

  # New graph, new session with warmstarting.
  with ops.Graph().as_default() as warm_graph:
    with self.test_session(graph=warm_graph) as sess:
      cols_to_vars = self._create_linear_model(all_linear_cols, _partitioner)
      ws_settings = ws_util._WarmStartSettings(
          self.get_temp_dir(),
          col_to_prev_vocab={sc_vocab: prev_vocab_path},
          col_to_prev_tensor={sc_keys: "some_other_name"},
          exclude_columns=[sc_hash])
      ws_util._warmstart_input_layer(cols_to_vars, ws_settings)
      sess.run(variables.global_variables_initializer())
      # Verify weights were correctly warmstarted.  Var corresponding to
      # sc_hash should not be warm-started.  Var corresponding to sc_vocab
      # should be correctly warmstarted after vocab remapping.
      expected = {
          sc_keys: np.split(prev_keys_val, 2),
          sc_hash: [np.zeros([8, 1]), np.zeros([7, 1])],
          sc_vocab: [
              np.array([[3.], [2.], [1.]]),
              np.array([[0.5], [0.], [0.]])
          ]
      }
      self._assert_cols_to_vars(cols_to_vars, expected, sess)
def test_get_sparse_tensors(self):
  """`builder.get(column)` equals the id tensor from `_get_sparse_tensors`."""
  hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
  wire_tensor = sparse_tensor.SparseTensor(
      values=['omar', 'stringer', 'marlo'],
      indices=[[0, 0], [1, 0], [1, 1]],
      dense_shape=[2, 2])
  builder = fc._LazyBuilder({'wire': wire_tensor})

  fetched = builder.get(hashed_sparse)
  id_tensor = hashed_sparse._get_sparse_tensors(builder).id_tensor
  self.assertEqual(fetched, id_tensor)
def test_dtype_should_match_with_tensor(self):
  """An int64 column fed a string tensor raises a dtype-compatibility error."""
  int_column = fc.categorical_column_with_hash_bucket(
      'wire', 10, dtype=dtypes.int64)
  string_input = sparse_tensor.SparseTensor(
      values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
  builder = fc._LazyBuilder({'wire': string_input})
  with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
    builder.get(int_column)
def make_feature_cols(train):
    """Return the feature columns for the model inputs.

    One indicator column over a hashed categorical column is built for every
    name in the module-level `cat_cols`, with the bucket count computed by
    `getBucketSize` from `train[col].size`. Three numeric columns for the
    `totals.*` features are appended after them.

    Args:
        train: Object indexable by column name whose items expose `.size`
            (presumably a pandas DataFrame; confirm against the caller).

    Returns:
        A list of feature columns, categorical ones first.
    """
    input_labels = [
        tf.feature_column.indicator_column(
            categorical_column_with_hash_bucket(
                col, getBucketSize(train[col].size)))
        for col in cat_cols
    ]
    input_labels.extend([
        tf.feature_column.numeric_column('totals.hits'),
        tf.feature_column.numeric_column('totals.pageviews'),
        tf.feature_column.numeric_column('totals.visits'),
    ])
    return input_labels
def test_sparse_trainable_false(self):
  """`trainable=False` leaves the TRAINABLE_VARIABLES collection empty."""
  wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
  with ops.Graph().as_default() as graph:
    features = {
        'wire_cast':
            sparse_tensor.SparseTensor(
                values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
    }
    fc.make_linear_model(features, [wire_cast], trainable=False)
    collected = graph.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
    self.assertEqual([], collected)
def test_int32_64_is_compatible(self):
  """An int64 column accepts an int32 tensor and hashes its values."""
  int64_column = fc.categorical_column_with_hash_bucket(
      'wire', 10, dtype=dtypes.int64)
  int32_values = constant_op.constant([101, 201, 301], dtype=dtypes.int32)
  wire_tensor = sparse_tensor.SparseTensor(
      values=int32_values,
      indices=[[0, 0], [1, 0], [1, 1]],
      dense_shape=[2, 2])
  builder = fc._LazyBuilder({'wire': wire_tensor})
  hashed = builder.get(int64_column)
  # Check exact hashed output. If hashing changes this test will break.
  expected_values = [3, 7, 5]
  with self.test_session():
    self.assertAllEqual(expected_values, hashed.values.eval())
def test_dtype_should_be_string_or_integer(self):
  """Construction succeeds for string/int32 dtypes and fails for float32."""
  # These two constructions are expected not to raise.
  for supported_dtype in (dtypes.string, dtypes.int32):
    fc.categorical_column_with_hash_bucket('aaa', 10, dtype=supported_dtype)

  with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
    fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.float32)
def test_sparse_collection(self):
  """Bias and column weights are added to the requested weight collection."""
  wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
  with ops.Graph().as_default() as graph:
    features = {
        'wire_cast':
            sparse_tensor.SparseTensor(
                values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
    }
    fc.make_linear_model(
        features, [wire_cast], weight_collections=['my-vars'])
    collected = graph.get_collection('my-vars')
    bias = get_linear_model_bias()
    wire_cast_var = get_linear_model_column_var(wire_cast)
    for variable in (bias, wire_cast_var):
      self.assertIn(variable, collected)
def test_sparse_combiner(self):
  """Checks linear-model predictions when `sparse_combiner='mean'`."""
  wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
  with ops.Graph().as_default():
    wire_tensor = sparse_tensor.SparseTensor(
        values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
        indices=[[0, 0], [1, 0], [1, 1]],
        dense_shape=[2, 2])
    predictions = fc.make_linear_model(
        {'wire_cast': wire_tensor}, [wire_cast], sparse_combiner='mean')
    bias = get_linear_model_bias()
    wire_cast_var = get_linear_model_column_var(wire_cast)
    with _initialized_session() as sess:
      new_weights = [[10.], [100.], [1000.], [10000.]]
      sess.run(wire_cast_var.assign(new_weights))
      sess.run(bias.assign([5.]))
      # Row 0 has one id (2): 1000 + 5. Row 1 has ids 0 and 3, averaged:
      # (10 + 10000) / 2 + 5.
      self.assertAllClose([[1005.], [5010.]], predictions.eval())
def test_strings_should_be_hashed(self):
  """String inputs become int64 bucket ids with indices and shape unchanged."""
  hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
  wire_tensor = sparse_tensor.SparseTensor(
      values=['omar', 'stringer', 'marlo'],
      indices=[[0, 0], [1, 0], [1, 1]],
      dense_shape=[2, 2])
  builder = fc._LazyBuilder({'wire': wire_tensor})
  hashed = builder.get(hashed_sparse)
  # Check exact hashed output. If hashing changes this test will break.
  expected_values = [6, 4, 1]
  with self.test_session():
    self.assertEqual(dtypes.int64, hashed.values.dtype)
    self.assertAllEqual(expected_values, hashed.values.eval())
    # The sparse layout of the output is compared with that of the input.
    self.assertAllEqual(wire_tensor.indices.eval(), hashed.indices.eval())
    self.assertAllEqual(wire_tensor.dense_shape.eval(),
                        hashed.dense_shape.eval())
def sequence_categorical_column_with_hash_bucket(
    key, hash_bucket_size, dtype=dtypes.string):
  """Builds a sequence categorical column whose ids come from hashing.

  Pass the result to `embedding_column` or `indicator_column` to convert
  sequence categorical data into dense representation for input to sequence
  NN, such as RNN.

  Example:

  ```python
  tokens = sequence_categorical_column_with_hash_bucket(
      'tokens', hash_bucket_size=1000)
  tokens_embedding = embedding_column(tokens, dimension=10)
  columns = [tokens_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    key: A unique string identifying the input feature.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_SequenceCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not greater than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  hashed_column = fc_old.categorical_column_with_hash_bucket(
      key=key, hash_bucket_size=hash_bucket_size, dtype=dtype)
  return fc_old._SequenceCategoricalColumn(hashed_column)
def sequence_categorical_column_with_hash_bucket(
    key, hash_bucket_size, dtype=dtypes.string):
  """Builds a sequence categorical column whose ids come from hashing.

  Pass the result to `embedding_column` or `indicator_column` to convert
  sequence categorical data into dense representation for input to sequence
  NN, such as RNN.

  Example:

  ```python
  tokens = sequence_categorical_column_with_hash_bucket(
      'tokens', hash_bucket_size=1000)
  tokens_embedding = embedding_column(tokens, dimension=10)
  columns = [tokens_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    key: A unique string identifying the input feature.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_SequenceCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not greater than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  hashed_column = fc.categorical_column_with_hash_bucket(
      key=key, hash_bucket_size=hash_bucket_size, dtype=dtype)
  return fc._SequenceCategoricalColumn(hashed_column)
def test_sparse_multi_output(self):
  """Checks a linear model over a hashed column with `units=3`."""
  wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
  with ops.Graph().as_default():
    wire_tensor = sparse_tensor.SparseTensor(
        values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
        indices=[[0, 0], [1, 0], [1, 1]],
        dense_shape=[2, 2])
    predictions = fc.make_linear_model(
        {'wire_cast': wire_tensor}, [wire_cast], units=3)
    bias = get_linear_model_bias()
    wire_cast_var = get_linear_model_column_var(wire_cast)
    with _initialized_session() as sess:
      # Both variables are expected to be all zeros before assignment.
      self.assertAllClose([0., 0., 0.], bias.eval())
      self.assertAllClose([[0.] * 3] * 4, wire_cast_var.eval())

      new_weights = [
          [10., 11., 12.],
          [100., 110., 120.],
          [1000., 1100., 1200.],
          [10000., 11000., 12000.],
      ]
      sess.run(wire_cast_var.assign(new_weights))
      sess.run(bias.assign([5., 6., 7.]))
      expected_predictions = [
          [1005., 1106., 1207.],
          [10015., 11017., 12019.],
      ]
      self.assertAllClose(expected_predictions, predictions.eval())
def test_one_shot_prediction_head_export(self, estimator_factory):
  """Trains, evaluates and exports an estimator with a one-shot head.

  The exported model is loaded and run twice: once from the raw serving
  input receiver, and once from the one-shot parsing receiver fed with
  serialized `tf.Example` protos.

  Args:
    estimator_factory: Callable accepting `model_dir`,
      `exogenous_feature_columns` and `head_type` keyword arguments and
      returning the estimator under test.
  """

  def _new_temp_dir():
    return os.path.join(test.get_temp_dir(), str(ops.uid()))

  def _output_fetches(graph, predict_signature):
    # Maps every output key of the signature to its element in `graph`.
    return {
        output_key: graph.as_graph_element(output_value.name)
        for output_key, output_value in predict_signature.outputs.items()
    }

  model_dir = _new_temp_dir()
  categorical_column = feature_column.categorical_column_with_hash_bucket(
      key="categorical_exogenous_feature", hash_bucket_size=16)
  exogenous_feature_columns = [
      feature_column.numeric_column("2d_exogenous_feature", shape=(2,)),
      feature_column.embedding_column(
          categorical_column=categorical_column, dimension=10)
  ]
  estimator = estimator_factory(
      model_dir=model_dir,
      exogenous_feature_columns=exogenous_feature_columns,
      head_type=ts_head_lib.OneShotPredictionHead)

  train_features = {
      feature_keys.TrainEvalFeatures.TIMES:
          numpy.arange(20, dtype=numpy.int64),
      feature_keys.TrainEvalFeatures.VALUES:
          numpy.tile(numpy.arange(20, dtype=numpy.float32)[:, None], [1, 5]),
      "2d_exogenous_feature":
          numpy.ones([20, 2]),
      "categorical_exogenous_feature":
          numpy.array(["strkey"] * 20)[:, None]
  }
  train_input_fn = input_pipeline.RandomWindowInputFn(
      input_pipeline.NumpyReader(train_features),
      shuffle_seed=2,
      num_threads=1,
      batch_size=16,
      window_size=16)
  estimator.train(input_fn=train_input_fn, steps=5)
  result = estimator.evaluate(input_fn=train_input_fn, steps=1)
  self.assertIn("average_loss", result)
  self.assertNotIn(feature_keys.State.STATE_TUPLE, result)

  input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
  export_location = estimator.export_savedmodel(_new_temp_dir(),
                                                input_receiver_fn)
  graph = ops.Graph()
  with graph.as_default():
    with session_lib.Session() as session:
      signatures = loader.load(session, [tag_constants.SERVING],
                               export_location)
      self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
                       list(signatures.signature_def.keys()))
      predict_signature = signatures.signature_def[
          feature_keys.SavedModelLabels.PREDICT]
      expected_input_keys = [
          feature_keys.FilteringFeatures.TIMES,
          feature_keys.FilteringFeatures.VALUES,
          "2d_exogenous_feature",
          "categorical_exogenous_feature"
      ]
      six.assertCountEqual(self, expected_input_keys,
                           predict_signature.inputs.keys())
      features = {
          feature_keys.TrainEvalFeatures.TIMES:
              numpy.tile(
                  numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1]),
          feature_keys.TrainEvalFeatures.VALUES:
              numpy.tile(
                  numpy.arange(20, dtype=numpy.float32)[None, :, None],
                  [2, 1, 5]),
          "2d_exogenous_feature":
              numpy.ones([2, 35, 2]),
          "categorical_exogenous_feature":
              numpy.tile(
                  numpy.array(["strkey"] * 35)[None, :, None], [2, 1, 1])
      }
      feeds = {
          graph.as_graph_element(input_value.name): features[input_key]
          for input_key, input_value in predict_signature.inputs.items()
      }
      fetches = _output_fetches(graph, predict_signature)
      output = session.run(fetches, feed_dict=feeds)
      self.assertEqual((2, 15, 5), output["mean"].shape)

  # Build a parsing input function, then make a tf.Example for it to parse.
  parsing_receiver_fn = (
      estimator.build_one_shot_parsing_serving_input_receiver_fn(
          filtering_length=20, prediction_length=15))
  export_location = estimator.export_savedmodel(_new_temp_dir(),
                                                parsing_receiver_fn)
  graph = ops.Graph()
  with graph.as_default():
    with session_lib.Session() as session:
      example = example_pb2.Example()
      times = example.features.feature[feature_keys.TrainEvalFeatures.TIMES]
      values = example.features.feature[feature_keys.TrainEvalFeatures.VALUES]
      times.int64_list.value.extend(range(35))
      for step in range(20):
        values.float_list.value.extend(
            [float(step) * 2. + feature_number for feature_number in range(5)])
      real_feature = example.features.feature["2d_exogenous_feature"]
      categorical_feature = example.features.feature[
          "categorical_exogenous_feature"]
      for _ in range(35):
        real_feature.float_list.value.extend([1, 1])
        categorical_feature.bytes_list.value.append(b"strkey")
      # Serialize the tf.Example for feeding to the Session
      examples = [example.SerializeToString()] * 2
      signatures = loader.load(session, [tag_constants.SERVING],
                               export_location)
      predict_signature = signatures.signature_def[
          feature_keys.SavedModelLabels.PREDICT]
      # The unpacking below requires the signature to have exactly one input.
      ((_, input_value),) = predict_signature.inputs.items()
      feeds = {graph.as_graph_element(input_value.name): examples}
      fetches = _output_fetches(graph, predict_signature)
      output = session.run(fetches, feed_dict=feeds)
      self.assertEqual((2, 15, 5), output["mean"].shape)
def test_bucket_size_should_be_positive(self):
  """A `hash_bucket_size` of 0 is rejected with a ValueError."""
  expected_message = 'hash_bucket_size must be at least 1'
  with self.assertRaisesRegexp(ValueError, expected_message):
    fc.categorical_column_with_hash_bucket('aaa', 0)
def _build_feature_columns(self, ):
    """Build every feature-column group described by the data config.

    Reads the optional `multi_hot_columns`, `multi_category_columns`,
    `continuous_columns`, `crossed_columns` and `bucketized_columns`
    attributes of `self._data_conf`; a group whose attribute is `None` is
    left empty. The groups are stored in `self._feature_mapping` under the
    integer keys 0-6, after which
    `self._build_feature_columns_for_model()` is called.

    Mapping keys:
        0: multi-hot vocabulary-list categorical columns
        1: hashed multi-category columns
        2: numeric (continuous) columns
        3: crossed columns
        4: bucketized columns
        5: embedding columns built over the hashed multi-category columns
        6: indicator columns built over the multi-hot columns

    Raises:
        KeyError: A bucketized column name has no entry among the
            continuous columns.
    """
    data_conf = self._data_conf

    multi_hot_feature_columns = {}
    multi_hot_feature_columns_deep = {}
    multi_category_feature_columns = {}
    continuous_feature_columns = {}
    crossed_feature_columns = []
    bucketized_feature_columns = []
    embedding_feature_columns = []

    if data_conf.multi_hot_columns is not None:
        for column in data_conf.multi_hot_columns:
            categorical = categorical_column_with_vocabulary_list(
                column, data_conf.multi_hot_columns[column], dtype=tf.string)
            multi_hot_feature_columns[column] = categorical
            multi_hot_feature_columns_deep[column] = indicator_column(
                categorical)

    if data_conf.multi_category_columns is not None:
        multi_category_feature_columns = {
            column: categorical_column_with_hash_bucket(
                column, hash_bucket_size=1000)
            for column in data_conf.multi_category_columns
        }

    if data_conf.continuous_columns is not None:
        continuous_feature_columns = {
            column: numeric_column(column)
            for column in data_conf.continuous_columns
        }

    if data_conf.crossed_columns is not None:
        crossed_feature_columns = [
            crossed_column(keys, hash_bucket_size=100000)
            for keys in data_conf.crossed_columns
        ]

    if data_conf.bucketized_columns is not None:
        # The previous code iterated over `.items` without calling it, which
        # fails for a dict because a bound method is not iterable. Call it
        # when it is callable, and keep accepting an already-iterable
        # attribute so any non-dict config object keeps working.
        column_boundaries = data_conf.bucketized_columns.items
        if callable(column_boundaries):
            column_boundaries = column_boundaries()
        bucketized_feature_columns = [
            bucketized_column(
                continuous_feature_columns[column], boundaries=boundary)
            for column, boundary in column_boundaries
        ]

    # The embedding dimension is only read when there is something to embed.
    if multi_category_feature_columns:
        embedding_feature_columns = [
            embedding_column(
                categorical, dimension=self._model_conf.embedding_dimension)
            for categorical in multi_category_feature_columns.values()
        ]

    self._feature_mapping = {
        0: list(multi_hot_feature_columns.values()),
        1: list(multi_category_feature_columns.values()),
        2: list(continuous_feature_columns.values()),
        3: crossed_feature_columns,
        4: bucketized_feature_columns,
        5: embedding_feature_columns,
        6: list(multi_hot_feature_columns_deep.values())
    }

    self._build_feature_columns_for_model()
def testWarmStartVarsToWarmstartIsNone(self):
  """With `vars_to_warm_start=None`, only the vocab-mapped var is restored."""
  # Create old and new vocabs for sparse column "sc_vocab".
  old_tokens = ["apple", "banana", "guava", "orange"]
  new_tokens = ["orange", "guava", "banana", "apple", "raspberry", "blueberry"]
  prev_vocab_path = self._write_vocab(old_tokens, "old_vocab")
  new_vocab_path = self._write_vocab(new_tokens, "new_vocab")

  # Create feature columns.
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
  all_linear_cols = [sc_hash, sc_keys, sc_vocab]

  # Save checkpoint from which to warm-start.
  with ops.Graph().as_default() as save_graph:
    with self.test_session(graph=save_graph) as sess:
      variable_scope.get_variable(
          "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
      variable_scope.get_variable(
          "some_other_name", shape=[4, 1], initializer=rand())
      variable_scope.get_variable(
          "linear_model/sc_vocab/weights",
          initializer=[[0.5], [1.], [2.], [3.]])
      self._write_checkpoint(sess)

  def _partitioner(shape, dtype):  # pylint:disable=unused-argument
    # Partition each var into 2 equal slices.
    return [min(2, shape[0].value)] + [1] * (len(shape) - 1)

  # New graph, new session with warm-starting.
  with ops.Graph().as_default() as warm_graph:
    with self.test_session(graph=warm_graph) as sess:
      cols_to_vars = self._create_linear_model(all_linear_cols, _partitioner)
      vocab_info = ws_util.VocabInfo(
          new_vocab=sc_vocab.vocabulary_file,
          new_vocab_size=sc_vocab.vocabulary_size,
          num_oov_buckets=sc_vocab.num_oov_buckets,
          old_vocab=prev_vocab_path)
      checkpoint_dir = self.get_temp_dir()
      sc_vocab_var_name = ws_util._infer_var_name(cols_to_vars[sc_vocab])
      sc_keys_var_name = ws_util._infer_var_name(cols_to_vars[sc_keys])
      ws_settings = ws_util.WarmStartSettings(
          checkpoint_dir,
          # The special value of None here will ensure that only the variable
          # specified in var_name_to_vocab_info (sc_vocab embedding) is
          # warm-started.
          vars_to_warm_start=None,
          var_name_to_vocab_info={sc_vocab_var_name: vocab_info},
          # Even though this is provided, the None value for
          # vars_to_warm_start overrides the logic, and this will not be
          # warm-started.
          var_name_to_prev_var_name={sc_keys_var_name: "some_other_name"})
      ws_util._warm_start(ws_settings)
      sess.run(variables.global_variables_initializer())
      # Verify weights were correctly warm-started.  Var corresponding to
      # sc_vocab should be correctly warm-started after vocab remapping,
      # and neither of the other two should be warm-started.
      expected = {
          sc_keys: [np.zeros([2, 1]), np.zeros([2, 1])],
          sc_hash: [np.zeros([8, 1]), np.zeros([7, 1])],
          sc_vocab: [
              np.array([[3.], [2.], [1.]]),
              np.array([[0.5], [0.], [0.]])
          ]
      }
      self._assert_cols_to_vars(cols_to_vars, expected, sess)
def testWarmStart_MultipleCols(self):
  """Checks that default settings warm-start every linear-model variable.

  A checkpoint holding weights for six feature columns plus a bias is written.
  A model built without warm-starting is asserted to be all zeros, and a model
  built with warm-starting is asserted to reproduce the checkpointed values.
  """
  # Vocabulary file backing the sparse column "sc_vocab".
  vocab_file = self._write_vocab(
      ["apple", "banana", "guava", "orange"], "vocab")

  # Feature columns of the linear model under test.
  sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=vocab_file, vocabulary_size=4)
  real = fc.numeric_column("real")
  real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])
  cross = fc.crossed_column([sc_keys, sc_vocab], hash_bucket_size=20)
  linear_columns = [sc_int, sc_hash, sc_keys, sc_vocab, real_bucket, cross]

  # Populate the checkpoint to warm-start from. A bias variable is included
  # so that its warm-starting can be verified as well.
  with ops.Graph().as_default() as ckpt_graph:
    with self.test_session(graph=ckpt_graph) as sess:
      int_var = variable_scope.get_variable(
          "linear_model/sc_int/weights", shape=[10, 1], initializer=ones())
      hash_var = variable_scope.get_variable(
          "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
      keys_var = variable_scope.get_variable(
          "linear_model/sc_keys/weights", shape=[4, 1], initializer=rand())
      vocab_var = variable_scope.get_variable(
          "linear_model/sc_vocab/weights", shape=[4, 1], initializer=ones())
      bucket_var = variable_scope.get_variable(
          "linear_model/real_bucketized/weights",
          shape=[5, 1],
          initializer=norms())
      cross_var = variable_scope.get_variable(
          "linear_model/sc_keys_X_sc_vocab/weights",
          shape=[20, 1],
          initializer=rand())
      bias_var = variable_scope.get_variable(
          "linear_model/bias_weights", shape=[1], initializer=rand())
      self._write_checkpoint(sess)
      # Remember the saved values so they can be compared after warm-start.
      saved_values = sess.run([
          int_var, hash_var, keys_var, vocab_var, bucket_var, cross_var,
          bias_var
      ])
      (prev_int_val, prev_hash_val, prev_keys_val, prev_vocab_val,
       prev_bucket_val, prev_cross_val, prev_bias_val) = saved_values

  def _single_partition(shape, dtype):  # pylint:disable=unused-argument
    # One slice along every dimension, i.e. effectively unpartitioned.
    return [1] * len(shape)

  # Fresh graph and session WITHOUT warm-starting.
  with ops.Graph().as_default() as plain_graph:
    with self.test_session(graph=plain_graph) as sess:
      cols_to_vars = self._create_linear_model(linear_columns,
                                               _single_partition)
      sess.run(variables.global_variables_initializer())
      # With no warm-starting every weight comes from the default
      # initializer (init_ops.zeros_initializer).
      zero_weights = {
          sc_int: [np.zeros([10, 1])],
          sc_hash: [np.zeros([15, 1])],
          sc_keys: [np.zeros([4, 1])],
          sc_vocab: [np.zeros([4, 1])],
          real_bucket: [np.zeros([5, 1])],
          cross: [np.zeros([20, 1])],
      }
      self._assert_cols_to_vars(cols_to_vars, zero_weights, sess)

  # Fresh graph and session with warm-starting.
  with ops.Graph().as_default() as warm_graph:
    with self.test_session(graph=warm_graph) as sess:
      cols_to_vars = self._create_linear_model(linear_columns,
                                               _single_partition)
      vocab_info = ws_util.VocabInfo(
          new_vocab=sc_vocab.vocabulary_file,
          new_vocab_size=sc_vocab.vocabulary_size,
          num_oov_buckets=sc_vocab.num_oov_buckets,
          old_vocab=vocab_file)
      ws_settings = ws_util.WarmStartSettings(
          self.get_temp_dir(),
          var_name_to_vocab_info={
              "linear_model/sc_vocab/weights": vocab_info
          })
      ws_util._warm_start(ws_settings)
      sess.run(variables.global_variables_initializer())
      # Every column, and the bias, must now carry its checkpointed value.
      warm_started_weights = {
          sc_int: [prev_int_val],
          sc_hash: [prev_hash_val],
          sc_keys: [prev_keys_val],
          sc_vocab: [prev_vocab_val],
          real_bucket: [prev_bucket_val],
          cross: [prev_cross_val],
          "bias": [prev_bias_val],
      }
      self._assert_cols_to_vars(cols_to_vars, warm_started_weights, sess)
def test_one_shot_prediction_head_export(self):
  """Trains an LSTM time-series model and runs its one-shot SavedModel export.

  The exported model must expose exactly one PREDICT signature taking times,
  values and both exogenous features. It is fed a batch of 2 with 35 time
  steps but only 20 steps of values, and the "mean" output is asserted to
  have shape (2, 15, 5).
  """
  model_dir = self.get_temp_dir()
  hashed_column = feature_column.categorical_column_with_hash_bucket(
      key="categorical_exogenous_feature", hash_bucket_size=16)
  exogenous_feature_columns = [
      feature_column.numeric_column("2d_exogenous_feature", shape=(2,)),
      feature_column.embedding_column(
          categorical_column=hashed_column, dimension=10),
  ]
  lstm_model = lstm_example._LSTMModel(
      num_features=5,
      num_units=128,
      exogenous_feature_columns=exogenous_feature_columns)
  estimator = ts_estimators.TimeSeriesRegressor(
      model=lstm_model,
      optimizer=adam.AdamOptimizer(0.001),
      config=estimator_lib.RunConfig(tf_random_seed=4),
      state_manager=state_management.ChainingStateManager(),
      head_type=ts_head_lib.OneShotPredictionHead,
      model_dir=model_dir)

  # A single 20-step series with 5 value features and both exogenous inputs.
  train_times = numpy.arange(20, dtype=numpy.int64)
  train_values = numpy.tile(
      numpy.arange(20, dtype=numpy.float32)[:, None], [1, 5])
  train_features = {
      feature_keys.TrainEvalFeatures.TIMES: train_times,
      feature_keys.TrainEvalFeatures.VALUES: train_values,
      "2d_exogenous_feature": numpy.ones([20, 2]),
      "categorical_exogenous_feature": numpy.array(["strkey"] * 20)[:, None],
  }
  train_input_fn = input_pipeline.RandomWindowInputFn(
      input_pipeline.NumpyReader(train_features),
      shuffle_seed=2,
      num_threads=1,
      batch_size=16,
      window_size=16)
  estimator.train(input_fn=train_input_fn, steps=5)

  input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
  export_location = estimator.export_savedmodel(self.get_temp_dir(),
                                                input_receiver_fn)

  graph = ops.Graph()
  with graph.as_default():
    with session_lib.Session() as session:
      signatures = loader.load(session, [tag_constants.SERVING],
                               export_location)
      self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
                       list(signatures.signature_def.keys()))
      predict_signature = signatures.signature_def[
          feature_keys.SavedModelLabels.PREDICT]
      expected_input_keys = [
          feature_keys.FilteringFeatures.TIMES,
          feature_keys.FilteringFeatures.VALUES,
          "2d_exogenous_feature",
          "categorical_exogenous_feature",
      ]
      six.assertCountEqual(self, expected_input_keys,
                           predict_signature.inputs.keys())

      # Times and exogenous features span 35 steps; values only the first 20.
      batched_times = numpy.tile(
          numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1])
      batched_values = numpy.tile(
          numpy.arange(20, dtype=numpy.float32)[None, :, None], [2, 1, 5])
      batched_categories = numpy.tile(
          numpy.array(["strkey"] * 35)[None, :, None], [2, 1, 1])
      features = {
          feature_keys.TrainEvalFeatures.TIMES: batched_times,
          feature_keys.TrainEvalFeatures.VALUES: batched_values,
          "2d_exogenous_feature": numpy.ones([2, 35, 2]),
          "categorical_exogenous_feature": batched_categories,
      }

      # Resolve the signature's tensor names to graph elements.
      feeds = {}
      for input_key, input_value in predict_signature.inputs.items():
        feeds[graph.as_graph_element(input_value.name)] = features[input_key]
      fetches = {}
      for output_key, output_value in predict_signature.outputs.items():
        fetches[output_key] = graph.as_graph_element(output_value.name)

      output = session.run(fetches, feed_dict=feeds)
      self.assertAllEqual((2, 15, 5), output["mean"].shape)
def testWarmStartVarsToWarmstartIsNone(self):
  """Checks warmstarting when `vars_to_warmstart` is None.

  A checkpoint with weights for "sc_hash", "sc_vocab" and an unrelated
  variable is written first. A partitioned linear model is then built and
  warmstarted with `vars_to_warmstart=None`; the assertions expect only the
  variable listed in `var_name_to_vocab_info` to pick up checkpoint values,
  while the other two columns keep all-zero weights.
  """
  # Old and new vocabularies backing the sparse column "sc_vocab".
  old_vocab_file = self._write_vocab(
      ["apple", "banana", "guava", "orange"], "old_vocab")
  new_vocab_file = self._write_vocab(
      ["orange", "guava", "banana", "apple", "raspberry", "blueberry"],
      "new_vocab")

  # Feature columns of the linear model under test.
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=new_vocab_file, vocabulary_size=6)
  linear_columns = [sc_hash, sc_keys, sc_vocab]

  # Populate the checkpoint that the second graph warmstarts from.
  with ops.Graph().as_default() as ckpt_graph:
    with self.test_session(graph=ckpt_graph) as sess:
      variable_scope.get_variable(
          "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
      variable_scope.get_variable(
          "some_other_name", shape=[4, 1], initializer=rand())
      variable_scope.get_variable(
          "linear_model/sc_vocab/weights",
          initializer=[[0.5], [1.], [2.], [3.]])
      self._write_checkpoint(sess)

  def _split_in_two(shape, dtype):  # pylint:disable=unused-argument
    # Slice the first dimension into (at most) two pieces; leave the rest whole.
    slices_per_dim = [1] * len(shape)
    slices_per_dim[0] = min(2, shape[0].value)
    return slices_per_dim

  # Fresh graph and session, this time with warmstarting enabled.
  with ops.Graph().as_default() as model_graph:
    with self.test_session(graph=model_graph) as sess:
      cols_to_vars = self._create_linear_model(linear_columns, _split_in_two)
      vocab_info = ws_util._VocabInfo(
          new_vocab=sc_vocab.vocabulary_file,
          new_vocab_size=sc_vocab.vocabulary_size,
          num_oov_buckets=sc_vocab.num_oov_buckets,
          old_vocab=old_vocab_file)
      checkpoint_dir = self.get_temp_dir()
      vocab_var_name = ws_util._infer_var_name(cols_to_vars[sc_vocab])
      keys_var_name = ws_util._infer_var_name(cols_to_vars[sc_keys])
      ws_settings = ws_util._WarmStartSettings(
          checkpoint_dir,
          # None is a special value: only the variable named in
          # var_name_to_vocab_info (the sc_vocab weights) gets warmstarted.
          vars_to_warmstart=None,
          var_name_to_vocab_info={vocab_var_name: vocab_info},
          # Supplied on purpose: because vars_to_warmstart is None, this
          # mapping is overridden and sc_keys must NOT be warmstarted.
          var_name_to_prev_var_name={keys_var_name: "some_other_name"})
      ws_util._warmstart(ws_settings)
      sess.run(variables.global_variables_initializer())

      # sc_vocab holds the checkpoint weights reordered to the new vocabulary
      # (new entries are zero); sc_keys and sc_hash stay at zero. Each
      # variable is listed as its two partitions.
      expected_weights = {
          sc_keys: [np.zeros([2, 1]), np.zeros([2, 1])],
          sc_hash: [np.zeros([8, 1]), np.zeros([7, 1])],
          sc_vocab: [
              np.array([[3.], [2.], [1.]]),
              np.array([[0.5], [0.], [0.]]),
          ],
      }
      self._assert_cols_to_vars(cols_to_vars, expected_weights, sess)
def testWarmStartInputLayerMoreSettings(self):
  """Checks input-layer warmstarting with vocab, rename and exclusion settings.

  The settings remap "sc_vocab" from an old vocabulary, load "sc_keys" from
  the checkpoint tensor "some_other_name", and exclude "sc_hash". The final
  assertions expect sc_hash to stay at zero, sc_keys to equal the saved
  tensor, and sc_vocab to hold the old weights in the new vocabulary order.
  """
  # Old and new vocabularies backing the sparse column "sc_vocab".
  old_vocab_file = self._write_vocab(
      ["apple", "banana", "guava", "orange"], "old_vocab")
  new_vocab_file = self._write_vocab(
      ["orange", "guava", "banana", "apple", "raspberry", "blueberry"],
      "new_vocab")

  # Feature columns of the linear model under test.
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=new_vocab_file, vocabulary_size=6)
  linear_columns = [sc_hash, sc_keys, sc_vocab]

  # Populate the checkpoint to warm-start from. Only the "some_other_name"
  # variable is kept, because its value is compared against later.
  with ops.Graph().as_default() as ckpt_graph:
    with self.test_session(graph=ckpt_graph) as sess:
      variable_scope.get_variable(
          "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
      renamed_keys_var = variable_scope.get_variable(
          "some_other_name", shape=[4, 1], initializer=rand())
      variable_scope.get_variable(
          "linear_model/sc_vocab/weights",
          initializer=[[0.5], [1.], [2.], [3.]])
      self._write_checkpoint(sess)
      prev_keys_val = sess.run(renamed_keys_var)

  def _split_in_two(shape, dtype):  # pylint:disable=unused-argument
    # Slice the first dimension into (at most) two pieces; leave the rest whole.
    slices_per_dim = [1] * len(shape)
    slices_per_dim[0] = min(2, shape[0].value)
    return slices_per_dim

  # Fresh graph and session with warmstarting.
  with ops.Graph().as_default() as model_graph:
    with self.test_session(graph=model_graph) as sess:
      cols_to_vars = self._create_linear_model(linear_columns, _split_in_two)
      ws_settings = ws_util._WarmStartSettings(
          self.get_temp_dir(),
          col_to_prev_vocab={sc_vocab: old_vocab_file},
          col_to_prev_tensor={sc_keys: "some_other_name"},
          exclude_columns=[sc_hash])
      ws_util._warmstart_input_layer(cols_to_vars, ws_settings)
      sess.run(variables.global_variables_initializer())

      # sc_hash was excluded, so it stays at zero. sc_keys equals the saved
      # "some_other_name" tensor split across its two partitions. sc_vocab
      # holds the old weights reordered to the new vocabulary.
      expected_weights = {
          sc_keys: np.split(prev_keys_val, 2),
          sc_hash: [np.zeros([8, 1]), np.zeros([7, 1])],
          sc_vocab: [
              np.array([[3.], [2.], [1.]]),
              np.array([[0.5], [0.], [0.]]),
          ],
      }
      self._assert_cols_to_vars(cols_to_vars, expected_weights, sess)
def testWarmStart_MultipleCols(self):
  """Checks that default settings warmstart every linear-model variable.

  A checkpoint holding weights for six feature columns plus a bias is written.
  A model built without warmstarting is asserted to be all zeros, and a model
  built with warmstarting is asserted to reproduce the checkpointed values.
  """
  # Vocabulary file backing the sparse column "sc_vocab".
  vocab_file = self._write_vocab(
      ["apple", "banana", "guava", "orange"], "vocab")

  # Feature columns of the linear model under test.
  sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=vocab_file, vocabulary_size=4)
  real = fc.numeric_column("real")
  real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])
  cross = fc.crossed_column([sc_keys, sc_vocab], hash_bucket_size=20)
  linear_columns = [sc_int, sc_hash, sc_keys, sc_vocab, real_bucket, cross]

  # Populate the checkpoint to warm-start from. A bias variable is included
  # so that its warmstarting can be verified as well.
  with ops.Graph().as_default() as ckpt_graph:
    with self.test_session(graph=ckpt_graph) as sess:
      int_var = variable_scope.get_variable(
          "linear_model/sc_int/weights", shape=[10, 1], initializer=ones())
      hash_var = variable_scope.get_variable(
          "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
      keys_var = variable_scope.get_variable(
          "linear_model/sc_keys/weights", shape=[4, 1], initializer=rand())
      vocab_var = variable_scope.get_variable(
          "linear_model/sc_vocab/weights", shape=[4, 1], initializer=ones())
      bucket_var = variable_scope.get_variable(
          "linear_model/real_bucketized/weights",
          shape=[5, 1],
          initializer=norms())
      cross_var = variable_scope.get_variable(
          "linear_model/sc_keys_X_sc_vocab/weights",
          shape=[20, 1],
          initializer=rand())
      bias_var = variable_scope.get_variable(
          "linear_model/bias_weights", shape=[1], initializer=rand())
      self._write_checkpoint(sess)
      # Remember the saved values so they can be compared after warmstart.
      saved_values = sess.run([
          int_var, hash_var, keys_var, vocab_var, bucket_var, cross_var,
          bias_var
      ])
      (prev_int_val, prev_hash_val, prev_keys_val, prev_vocab_val,
       prev_bucket_val, prev_cross_val, prev_bias_val) = saved_values

  def _single_partition(shape, dtype):  # pylint:disable=unused-argument
    # One slice along every dimension, i.e. effectively unpartitioned.
    return [1] * len(shape)

  # Fresh graph and session WITHOUT warmstarting.
  with ops.Graph().as_default() as plain_graph:
    with self.test_session(graph=plain_graph) as sess:
      cols_to_vars = self._create_linear_model(linear_columns,
                                               _single_partition)
      sess.run(variables.global_variables_initializer())
      # With no warmstarting every weight comes from the default
      # initializer (init_ops.zeros_initializer).
      zero_weights = {
          sc_int: [np.zeros([10, 1])],
          sc_hash: [np.zeros([15, 1])],
          sc_keys: [np.zeros([4, 1])],
          sc_vocab: [np.zeros([4, 1])],
          real_bucket: [np.zeros([5, 1])],
          cross: [np.zeros([20, 1])],
      }
      self._assert_cols_to_vars(cols_to_vars, zero_weights, sess)

  # Fresh graph and session with warmstarting.
  with ops.Graph().as_default() as warm_graph:
    with self.test_session(graph=warm_graph) as sess:
      cols_to_vars = self._create_linear_model(linear_columns,
                                               _single_partition)
      vocab_info = ws_util._VocabInfo(
          new_vocab=sc_vocab.vocabulary_file,
          new_vocab_size=sc_vocab.vocabulary_size,
          num_oov_buckets=sc_vocab.num_oov_buckets,
          old_vocab=vocab_file)
      ws_settings = ws_util._WarmStartSettings(
          self.get_temp_dir(),
          var_name_to_vocab_info={
              "linear_model/sc_vocab/weights": vocab_info
          })
      ws_util._warmstart(ws_settings)
      sess.run(variables.global_variables_initializer())
      # Every column, and the bias, must now carry its checkpointed value.
      warmstarted_weights = {
          sc_int: [prev_int_val],
          sc_hash: [prev_hash_val],
          sc_keys: [prev_keys_val],
          sc_vocab: [prev_vocab_val],
          real_bucket: [prev_bucket_val],
          cross: [prev_cross_val],
          "bias": [prev_bias_val],
      }
      self._assert_cols_to_vars(cols_to_vars, warmstarted_weights, sess)
def test_bucket_size_should_be_given(self):
  """Passing None as `hash_bucket_size` must raise a ValueError."""
  expected_error = 'hash_bucket_size must be set.'
  with self.assertRaisesRegexp(ValueError, expected_error):
    fc.categorical_column_with_hash_bucket('aaa', None)
def test_one_shot_prediction_head_export(self, estimator_factory):
  """Trains, evaluates and exports a one-shot prediction model two ways.

  The estimator from `estimator_factory` is exported first with a raw serving
  input receiver (fed numpy arrays) and then with a one-shot parsing receiver
  (fed serialized tf.Examples). In both cases the "mean" output for a batch of
  2 is asserted to have shape (2, 15, 5).

  Args:
    estimator_factory: Callable accepting `model_dir`,
      `exogenous_feature_columns` and `head_type` keyword arguments and
      returning the estimator under test.
  """

  def _unique_temp_dir():
    # A distinct directory per call, so the model and each export are apart.
    return os.path.join(test.get_temp_dir(), str(ops.uid()))

  model_dir = _unique_temp_dir()
  hashed_column = feature_column.categorical_column_with_hash_bucket(
      key="categorical_exogenous_feature", hash_bucket_size=16)
  exogenous_feature_columns = [
      feature_column.numeric_column("2d_exogenous_feature", shape=(2,)),
      feature_column.embedding_column(
          categorical_column=hashed_column, dimension=10),
  ]
  estimator = estimator_factory(
      model_dir=model_dir,
      exogenous_feature_columns=exogenous_feature_columns,
      head_type=ts_head_lib.OneShotPredictionHead)

  # A single 20-step series with 5 value features and both exogenous inputs.
  train_times = numpy.arange(20, dtype=numpy.int64)
  train_values = numpy.tile(
      numpy.arange(20, dtype=numpy.float32)[:, None], [1, 5])
  train_features = {
      feature_keys.TrainEvalFeatures.TIMES: train_times,
      feature_keys.TrainEvalFeatures.VALUES: train_values,
      "2d_exogenous_feature": numpy.ones([20, 2]),
      "categorical_exogenous_feature": numpy.array(["strkey"] * 20)[:, None],
  }
  train_input_fn = input_pipeline.RandomWindowInputFn(
      input_pipeline.NumpyReader(train_features),
      shuffle_seed=2,
      num_threads=1,
      batch_size=16,
      window_size=16)
  estimator.train(input_fn=train_input_fn, steps=5)

  # Evaluation must report a loss but must not leak model state.
  result = estimator.evaluate(input_fn=train_input_fn, steps=1)
  self.assertIn("average_loss", result)
  self.assertNotIn(feature_keys.State.STATE_TUPLE, result)

  # --- Export 1: raw serving input receiver, fed numpy arrays. ---
  input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
  export_location = estimator.export_saved_model(_unique_temp_dir(),
                                                 input_receiver_fn)
  graph = ops.Graph()
  with graph.as_default():
    with session_lib.Session() as session:
      signatures = loader.load(session, [tag_constants.SERVING],
                               export_location)
      self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
                       list(signatures.signature_def.keys()))
      predict_signature = signatures.signature_def[
          feature_keys.SavedModelLabels.PREDICT]
      expected_input_keys = [
          feature_keys.FilteringFeatures.TIMES,
          feature_keys.FilteringFeatures.VALUES,
          "2d_exogenous_feature",
          "categorical_exogenous_feature",
      ]
      six.assertCountEqual(self, expected_input_keys,
                           predict_signature.inputs.keys())

      # Times and exogenous features span 35 steps; values only the first 20.
      batched_times = numpy.tile(
          numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1])
      batched_values = numpy.tile(
          numpy.arange(20, dtype=numpy.float32)[None, :, None], [2, 1, 5])
      batched_categories = numpy.tile(
          numpy.array(["strkey"] * 35)[None, :, None], [2, 1, 1])
      features = {
          feature_keys.TrainEvalFeatures.TIMES: batched_times,
          feature_keys.TrainEvalFeatures.VALUES: batched_values,
          "2d_exogenous_feature": numpy.ones([2, 35, 2]),
          "categorical_exogenous_feature": batched_categories,
      }

      # Resolve the signature's tensor names to graph elements.
      feeds = {}
      for input_key, input_value in predict_signature.inputs.items():
        feeds[graph.as_graph_element(input_value.name)] = features[input_key]
      fetches = {}
      for output_key, output_value in predict_signature.outputs.items():
        fetches[output_key] = graph.as_graph_element(output_value.name)

      output = session.run(fetches, feed_dict=feeds)
      self.assertEqual((2, 15, 5), output["mean"].shape)

  # --- Export 2: parsing input receiver, fed serialized tf.Examples. ---
  parsing_receiver_fn = (
      estimator.build_one_shot_parsing_serving_input_receiver_fn(
          filtering_length=20, prediction_length=15))
  export_location = estimator.export_saved_model(_unique_temp_dir(),
                                                 parsing_receiver_fn)
  graph = ops.Graph()
  with graph.as_default():
    with session_lib.Session() as session:
      # Build one tf.Example holding the same kind of data as above.
      example = example_pb2.Example()
      times = example.features.feature[feature_keys.TrainEvalFeatures.TIMES]
      values = example.features.feature[feature_keys.TrainEvalFeatures.VALUES]
      times.int64_list.value.extend(range(35))
      for step in range(20):
        values.float_list.value.extend(
            [float(step) * 2. + feature_number
             for feature_number in range(5)])
      real_feature = example.features.feature["2d_exogenous_feature"]
      categorical_feature = example.features.feature[
          "categorical_exogenous_feature"]
      for _ in range(35):
        real_feature.float_list.value.extend([1, 1])
        categorical_feature.bytes_list.value.append(b"strkey")

      # The same serialized example is fed twice to form a batch of 2.
      examples = [example.SerializeToString()] * 2
      signatures = loader.load(session, [tag_constants.SERVING],
                               export_location)
      predict_signature = signatures.signature_def[
          feature_keys.SavedModelLabels.PREDICT]
      # The unpacking requires the signature to have exactly one input.
      [(_, input_value)] = predict_signature.inputs.items()
      feeds = {graph.as_graph_element(input_value.name): examples}
      fetches = {}
      for output_key, output_value in predict_signature.outputs.items():
        fetches[output_key] = graph.as_graph_element(output_value.name)
      output = session.run(fetches, feed_dict=feeds)
      self.assertEqual((2, 15, 5), output["mean"].shape)
def test_one_shot_prediction_head_export(self, estimator_factory):
  """Trains a one-shot prediction model and runs its raw SavedModel export.

  The exported model must expose exactly one PREDICT signature taking times,
  values and both exogenous features. It is fed a batch of 2 with 35 time
  steps but only 20 steps of values, and the "mean" output is asserted to
  have shape (2, 15, 5).

  Args:
    estimator_factory: Callable accepting `model_dir`,
      `exogenous_feature_columns` and `head_type` keyword arguments and
      returning the estimator under test.
  """
  model_dir = os.path.join(test.get_temp_dir(), str(ops.uid()))
  hashed_column = feature_column.categorical_column_with_hash_bucket(
      key="categorical_exogenous_feature", hash_bucket_size=16)
  exogenous_feature_columns = [
      feature_column.numeric_column("2d_exogenous_feature", shape=(2,)),
      feature_column.embedding_column(
          categorical_column=hashed_column, dimension=10),
  ]
  estimator = estimator_factory(
      model_dir=model_dir,
      exogenous_feature_columns=exogenous_feature_columns,
      head_type=ts_head_lib.OneShotPredictionHead)

  # A single 20-step series with 5 value features and both exogenous inputs.
  train_times = numpy.arange(20, dtype=numpy.int64)
  train_values = numpy.tile(
      numpy.arange(20, dtype=numpy.float32)[:, None], [1, 5])
  train_features = {
      feature_keys.TrainEvalFeatures.TIMES: train_times,
      feature_keys.TrainEvalFeatures.VALUES: train_values,
      "2d_exogenous_feature": numpy.ones([20, 2]),
      "categorical_exogenous_feature": numpy.array(["strkey"] * 20)[:, None],
  }
  train_input_fn = input_pipeline.RandomWindowInputFn(
      input_pipeline.NumpyReader(train_features),
      shuffle_seed=2,
      num_threads=1,
      batch_size=16,
      window_size=16)
  estimator.train(input_fn=train_input_fn, steps=5)

  input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
  export_location = estimator.export_savedmodel(test.get_temp_dir(),
                                                input_receiver_fn)

  graph = ops.Graph()
  with graph.as_default():
    with session_lib.Session() as session:
      signatures = loader.load(session, [tag_constants.SERVING],
                               export_location)
      self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
                       list(signatures.signature_def.keys()))
      predict_signature = signatures.signature_def[
          feature_keys.SavedModelLabels.PREDICT]
      expected_input_keys = [
          feature_keys.FilteringFeatures.TIMES,
          feature_keys.FilteringFeatures.VALUES,
          "2d_exogenous_feature",
          "categorical_exogenous_feature",
      ]
      six.assertCountEqual(self, expected_input_keys,
                           predict_signature.inputs.keys())

      # Times and exogenous features span 35 steps; values only the first 20.
      batched_times = numpy.tile(
          numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1])
      batched_values = numpy.tile(
          numpy.arange(20, dtype=numpy.float32)[None, :, None], [2, 1, 5])
      batched_categories = numpy.tile(
          numpy.array(["strkey"] * 35)[None, :, None], [2, 1, 1])
      features = {
          feature_keys.TrainEvalFeatures.TIMES: batched_times,
          feature_keys.TrainEvalFeatures.VALUES: batched_values,
          "2d_exogenous_feature": numpy.ones([2, 35, 2]),
          "categorical_exogenous_feature": batched_categories,
      }

      # Resolve the signature's tensor names to graph elements.
      feeds = {}
      for input_key, input_value in predict_signature.inputs.items():
        feeds[graph.as_graph_element(input_value.name)] = features[input_key]
      fetches = {}
      for output_key, output_value in predict_signature.outputs.items():
        fetches[output_key] = graph.as_graph_element(output_value.name)

      output = session.run(fetches, feed_dict=feeds)
      self.assertEqual((2, 15, 5), output["mean"].shape)
from tensorflow.contrib.learn import LinearRegressor, pandas_input_fn, DNNRegressor, Experiment from tensorflow.python.feature_column.feature_column import categorical_column_with_hash_bucket, numeric_column, \ categorical_column_with_vocabulary_list, embedding_column, indicator_column make = categorical_column_with_hash_bucket('make', 100) horsepower = numeric_column('horsepower', shape=[]) cylinders = categorical_column_with_vocabulary_list( 'num-of-cylinders', ['two', 'three', 'four', 'six', 'eight']) ############### regressor = DNNRegressor(feature_columns=[ embedding_column(make, 10), horsepower, indicator_column(cylinders, 3) ], hidden_units=[50, 30, 10]) ################ regressor = LinearRegressor(feature_columns=[make, horsepower, cylinders]) # any python generator train_input_fn = pandas_input_fn(x=input_data, y=input_label, batch_size=64, shuffle=True, num_epochs=None) regressor.train(train_input_fn, steps=10000) def expirement_fn(run_config, hparams): regressor = DNNRegressor(..., config=run_config,
def test_defaults(self):
  """A hash-bucket column built with only key and size has string dtype."""
  column = fc.categorical_column_with_hash_bucket('aaa', 10)
  # Both name and key are expected to equal the key passed in.
  self.assertEqual('aaa', column.name)
  self.assertEqual('aaa', column.key)
  self.assertEqual(10, column.hash_bucket_size)
  self.assertEqual(dtypes.string, column.dtype)
def test_parse_config_int(self):
  """An int32 hash-bucket column parses as a VarLenFeature of int32."""
  column = fc.categorical_column_with_hash_bucket(
      'aaa', 10, dtype=dtypes.int32)
  expected_spec = {'aaa': parsing_ops.VarLenFeature(dtypes.int32)}
  self.assertEqual(expected_spec, column._parse_example_config)