def sequence_categorical_column_with_vocabulary_list(
    key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
  """A sequence of categorical terms where ids use an in-memory list.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into dense representation for input to sequence NN, such
  as RNN.

  Example:

  ```python
  colors = sequence_categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
      num_oov_buckets=2)
  colors_embedding = embedding_column(colors, dimension=3)
  columns = [colors_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    key: A unique string identifying the input feature.
    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
      is mapped to the index of its value (if present) in `vocabulary_list`.
      Must be castable to `dtype`.
    dtype: The type of features. Only string and integer types are supported.
      If `None`, it will be inferred from `vocabulary_list`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This can not be specified with a positive
      `num_oov_buckets`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on
      a hash of the input value. A positive `num_oov_buckets` can not be
      specified with `default_value`.

  Returns:
    A `_SequenceCategoricalColumn`.

  Raises:
    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: if `dtype` is not integer or string.
  """
  return fc._SequenceCategoricalColumn(
      fc.categorical_column_with_vocabulary_list(
          key=key,
          vocabulary_list=vocabulary_list,
          dtype=dtype,
          default_value=default_value,
          num_oov_buckets=num_oov_buckets))
def test_dnn_classifier(self):
  embedding = feature_column_lib.embedding_column(
      feature_column_lib.categorical_column_with_vocabulary_list(
          'wire_cast', ['kima', 'omar', 'stringer']), 8)
  dnn = estimator_lib.DNNClassifier(
      feature_columns=[embedding], hidden_units=[3, 1])

  def train_input_fn():
    return dataset_ops.Dataset.from_tensors(({
        'wire_cast': [['omar'], ['kima']]
    }, [[0], [1]])).repeat(3)

  def eval_input_fn():
    return dataset_ops.Dataset.from_tensors(({
        'wire_cast': [['stringer'], ['kima']]
    }, [[0], [1]])).repeat(2)

  evaluator = hooks_lib.InMemoryEvaluatorHook(
      dnn, eval_input_fn, name='in-memory')
  dnn.train(train_input_fn, hooks=[evaluator])
  self.assertTrue(os.path.isdir(dnn.eval_dir('in-memory')))
  step_keyword_to_value = summary_step_keyword_to_value_mapping(
      dnn.eval_dir('in-memory'))

  final_metrics = dnn.evaluate(eval_input_fn)
  step = final_metrics[ops.GraphKeys.GLOBAL_STEP]
  for summary_tag in final_metrics:
    if summary_tag == ops.GraphKeys.GLOBAL_STEP:
      continue
    self.assertEqual(final_metrics[summary_tag],
                     step_keyword_to_value[step][summary_tag])
def testWarmStartMoreSettingsNoPartitioning(self):
  # Create old and new vocabs for sparse column "sc_vocab".
  prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                      "old_vocab")
  new_vocab_path = self._write_vocab(
      ["orange", "guava", "banana", "apple", "raspberry", "blueberry"],
      "new_vocab")
  # Create feature columns.
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
  all_linear_cols = [sc_hash, sc_keys, sc_vocab]

  # Save checkpoint from which to warm-start.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      variable_scope.get_variable(
          "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
      sc_keys_weights = variable_scope.get_variable(
          "some_other_name", shape=[4, 1], initializer=rand())
      variable_scope.get_variable(
          "linear_model/sc_vocab/weights",
          initializer=[[0.5], [1.], [2.], [3.]])
      self._write_checkpoint(sess)
      prev_keys_val = sess.run(sc_keys_weights)

  # New graph, new session with warmstarting.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      cols_to_vars = self._create_linear_model(all_linear_cols,
                                               partitioner=None)
      vocab_info = ws_util._VocabInfo(
          new_vocab=sc_vocab.vocabulary_file,
          new_vocab_size=sc_vocab.vocabulary_size,
          num_oov_buckets=sc_vocab.num_oov_buckets,
          old_vocab=prev_vocab_path)
      ws_settings = ws_util._WarmStartSettings(
          self.get_temp_dir(),
          vars_to_warmstart=".*(sc_keys|sc_vocab).*",
          var_name_to_vocab_info={
              ws_util._infer_var_name(cols_to_vars[sc_vocab]): vocab_info
          },
          var_name_to_prev_var_name={
              ws_util._infer_var_name(cols_to_vars[sc_keys]):
                  "some_other_name"
          })
      ws_util._warmstart(ws_settings)
      sess.run(variables.global_variables_initializer())
      # Verify weights were correctly warmstarted. Var corresponding to
      # sc_hash should not be warm-started. Var corresponding to sc_vocab
      # should be correctly warmstarted after vocab remapping.
      self._assert_cols_to_vars(cols_to_vars, {
          sc_keys: [prev_keys_val],
          sc_hash: [np.zeros([15, 1])],
          sc_vocab: [np.array([[3.], [2.], [1.], [0.5], [0.], [0.]])]
      }, sess)
def testWarmStartInputLayerMoreSettings(self):
  # Create old and new vocabs for sparse column "sc_vocab".
  prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                      "old_vocab")
  new_vocab_path = self._write_vocab(
      ["orange", "guava", "banana", "apple", "raspberry", "blueberry"],
      "new_vocab")
  # Create feature columns.
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
  all_linear_cols = [sc_hash, sc_keys, sc_vocab]

  # Save checkpoint from which to warm-start.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      _ = variable_scope.get_variable(
          "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
      sc_keys_weights = variable_scope.get_variable(
          "some_other_name", shape=[4, 1], initializer=rand())
      _ = variable_scope.get_variable(
          "linear_model/sc_vocab/weights",
          initializer=[[0.5], [1.], [2.], [3.]])
      self._write_checkpoint(sess)
      prev_keys_val = sess.run(sc_keys_weights)

  def _partitioner(shape, dtype):  # pylint:disable=unused-argument
    # Partition each var into 2 equal slices.
    partitions = [1] * len(shape)
    partitions[0] = min(2, shape[0].value)
    return partitions

  # New graph, new session with warmstarting.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      cols_to_vars = self._create_linear_model(all_linear_cols, _partitioner)
      ws_settings = ws_util._WarmStartSettings(
          self.get_temp_dir(),
          col_to_prev_vocab={sc_vocab: prev_vocab_path},
          col_to_prev_tensor={sc_keys: "some_other_name"},
          exclude_columns=[sc_hash])
      ws_util._warmstart_input_layer(cols_to_vars, ws_settings)
      sess.run(variables.global_variables_initializer())
      # Verify weights were correctly warmstarted. Var corresponding to
      # sc_hash should not be warm-started. Var corresponding to sc_vocab
      # should be correctly warmstarted after vocab remapping.
      self._assert_cols_to_vars(cols_to_vars, {
          sc_keys: np.split(prev_keys_val, 2),
          sc_hash: [np.zeros([8, 1]), np.zeros([7, 1])],
          sc_vocab: [
              np.array([[3.], [2.], [1.]]),
              np.array([[0.5], [0.], [0.]])
          ]
      }, sess)
def sequence_categorical_column_with_vocabulary_list(
    key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
  """A sequence of categorical terms where ids use an in-memory list.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into dense representation for input to sequence NN, such
  as RNN.

  Example:

  ```python
  colors = sequence_categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
      num_oov_buckets=2)
  colors_embedding = embedding_column(colors, dimension=3)
  columns = [colors_embedding]

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  input_layer, sequence_length = sequence_input_layer(features, columns)

  rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
  outputs, state = tf.nn.dynamic_rnn(
      rnn_cell, inputs=input_layer, sequence_length=sequence_length)
  ```

  Args:
    key: A unique string identifying the input feature.
    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
      is mapped to the index of its value (if present) in `vocabulary_list`.
      Must be castable to `dtype`.
    dtype: The type of features. Only string and integer types are supported.
      If `None`, it will be inferred from `vocabulary_list`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This can not be specified with a positive
      `num_oov_buckets`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on
      a hash of the input value. A positive `num_oov_buckets` can not be
      specified with `default_value`.

  Returns:
    A `_SequenceCategoricalColumn`.

  Raises:
    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: if `dtype` is not integer or string.
  """
  return fc_old._SequenceCategoricalColumn(
      fc_old.categorical_column_with_vocabulary_list(
          key=key,
          vocabulary_list=vocabulary_list,
          dtype=dtype,
          default_value=default_value,
          num_oov_buckets=num_oov_buckets))
def testBaseLinearRegressorTraining3D(self):
  # Tests also a categorical feature with vocabulary list.
  feature_columns = [
      feature_column_lib.numeric_column('x0'),
      feature_column_lib.numeric_column('x1'),
      feature_column_lib.categorical_column_with_vocabulary_list(
          'x2', ['Y', 'N'])
  ]
  self._TestRegressor(feature_columns,
                      self._test_data.threed_input_fn(False, 1))
def test_warm_starting_selective_variables(self):
  """Tests selecting variables to warm-start."""
  age = feature_column.numeric_column('age')
  city = feature_column.embedding_column(
      feature_column.categorical_column_with_vocabulary_list(
          'city', vocabulary_list=['Mountain View', 'Palo Alto']),
      dimension=5)

  # Create a DNNLinearCombinedClassifier and train to save a checkpoint.
  dnn_lc_classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
      linear_feature_columns=[age],
      dnn_feature_columns=[city],
      dnn_hidden_units=[256, 128],
      model_dir=self._ckpt_and_vocab_dir,
      n_classes=4,
      linear_optimizer='SGD',
      dnn_optimizer='SGD')
  dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)

  # Create a second DNNLinearCombinedClassifier, warm-started from the first.
  # Use a learning_rate = 0.0 optimizer to check values (use SGD so we don't
  # have accumulator values that change).
  warm_started_dnn_lc_classifier = (
      dnn_linear_combined.DNNLinearCombinedClassifier(
          linear_feature_columns=[age],
          dnn_feature_columns=[city],
          dnn_hidden_units=[256, 128],
          n_classes=4,
          linear_optimizer=gradient_descent.GradientDescentOptimizer(
              learning_rate=0.0),
          dnn_optimizer=gradient_descent.GradientDescentOptimizer(
              learning_rate=0.0),
          # The provided regular expression will only warm-start the deep
          # portion of the model.
          warm_start_from=estimator.WarmStartSettings(
              ckpt_to_initialize_from=dnn_lc_classifier.model_dir,
              vars_to_warm_start='.*(dnn).*')))
  warm_started_dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)

  for variable_name in warm_started_dnn_lc_classifier.get_variable_names():
    if 'dnn' in variable_name:
      self.assertAllClose(
          dnn_lc_classifier.get_variable_value(variable_name),
          warm_started_dnn_lc_classifier.get_variable_value(variable_name))
    elif 'linear' in variable_name:
      linear_values = warm_started_dnn_lc_classifier.get_variable_value(
          variable_name)
      # Since they're not warm-started, the linear weights will be
      # zero-initialized.
      self.assertAllClose(np.zeros_like(linear_values), linear_values)
def build_model_columns():
  week_list = fc.categorical_column_with_vocabulary_list(
      "week_list",
      vocabulary_list=['mon', 'tue', 'wed', 'thur', 'fri', 'sat', 'sun'])
  week = fc.weighted_categorical_column(week_list, 'week_weight')
  week = fc.embedding_column(week, 3)
  wide = []
  deep = [week]
  return wide, deep
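# Hedged usage sketch (not from the original source): one way the wide/deep
# columns returned by build_model_columns above could feed a wide-and-deep
# estimator. The estimator settings, model_dir, and hidden unit sizes are
# assumptions for illustration only; the input pipeline would need to supply
# a string 'week_list' feature plus a numeric 'week_weight' feature.
import tensorflow as tf


def build_estimator(model_dir):
  wide, deep = build_model_columns()
  return tf.estimator.DNNLinearCombinedClassifier(
      model_dir=model_dir,
      linear_feature_columns=wide,  # empty here, so the linear side is unused
      dnn_feature_columns=deep,     # embedded weighted 'week' column
      dnn_hidden_units=[32, 16])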
def test_reuse():
  data = {
      'gender': [['M'], ['G'], ['M'], ['M']],
      'user': [['A'], ['B'], ['C'], ['C']],
      'pos': [['a'], ['d'], ['f'], ['c']],
      'neg': [['c'], ['e'], ['d'], ['a']]
  }
  user_v_list = ['A', 'B', 'C', 'D']
  item_v_list = ['a', 'b', 'c', 'd', 'e', 'f']
  gender_col = feature_column.categorical_column_with_vocabulary_list(
      'gender', ['M', 'G'], dtype=tf.string)
  user_col = feature_column.categorical_column_with_vocabulary_list(
      'user', user_v_list, dtype=tf.string)
  pos_item_col = feature_column.categorical_column_with_vocabulary_list(
      'pos', item_v_list, dtype=tf.string)
  neg_item_col = feature_column.categorical_column_with_vocabulary_list(
      'neg', item_v_list, dtype=tf.string)
  gender_embedding = feature_column.embedding_column(gender_col, 2)
  user_embedding = feature_column.embedding_column(user_col, 2)
  pos_embedding, neg_embedding = feature_column.shared_embedding_columns(
      [pos_item_col, neg_item_col], 3)
  columns = [gender_embedding, user_embedding, pos_embedding, neg_embedding]

  with tf.variable_scope("a") as scope:
    aa = scope.name
    ret = tf.feature_column.input_layer(data, columns)
    print(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=aa))
  with tf.variable_scope("b") as scope:
    bb = scope.name
    ret1 = tf.feature_column.input_layer(data, columns)
    print(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=bb))

  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    print(sess.run(ret))
    print('------------------')
    print(sess.run(ret1))
def testTrainEvaluateAndPredictWithIndicatorColumn(self):
  categorical = feature_column.categorical_column_with_vocabulary_list(
      key='categorical', vocabulary_list=('bad', 'good', 'ok'))
  feature_indicator = feature_column.indicator_column(categorical)
  bucketized_col = feature_column.bucketized_column(
      feature_column.numeric_column(
          'an_uninformative_feature', dtype=dtypes.float32),
      BUCKET_BOUNDARIES)

  labels = np.array([[0.], [5.7], [5.7], [0.], [0.]], dtype=np.float32)
  # Our categorical feature defines the labels perfectly
  input_fn = numpy_io.numpy_input_fn(
      x={
          'an_uninformative_feature': np.array([1, 1, 1, 1, 1]),
          'categorical': np.array(['bad', 'good', 'good', 'ok', 'bad']),
      },
      y=labels,
      batch_size=5,
      shuffle=False)

  # Train depth 1 tree.
  est = boosted_trees.BoostedTreesRegressor(
      feature_columns=[bucketized_col, feature_indicator],
      n_batches_per_layer=1,
      n_trees=1,
      learning_rate=1.0,
      max_depth=1)

  num_steps = 1
  est.train(input_fn, steps=num_steps)
  ensemble = self._assert_checkpoint_and_return_model(
      est.model_dir, global_step=1, finalized_trees=1, attempted_layers=1)

  # We learnt perfectly.
  eval_res = est.evaluate(input_fn=input_fn, steps=1)
  self.assertAllClose(eval_res['loss'], 0)
  predictions = list(est.predict(input_fn))
  self.assertAllClose(labels, [pred['predictions'] for pred in predictions])

  self.assertEqual(3, len(ensemble.trees[0].nodes))
  # Check that the split happened on 'good' value, which will be encoded as
  # feature with index 2 (0-numeric, 1 - 'bad')
  self.assertEqual(2, ensemble.trees[0].nodes[0].bucketized_split.feature_id)
  self.assertEqual(0, ensemble.trees[0].nodes[0].bucketized_split.threshold)
def test_forward_in_exported_sparse(self):
  features_columns = [
      fc.indicator_column(
          fc.categorical_column_with_vocabulary_list('x', range(10)))
  ]
  classifier = linear.LinearClassifier(feature_columns=features_columns)

  def train_input_fn():
    dataset = dataset_ops.Dataset.from_tensors({
        'x':
            sparse_tensor.SparseTensor(
                values=[1, 2, 3],
                indices=[[0, 0], [1, 0], [1, 1]],
                dense_shape=[2, 2]),
        'labels': [[0], [1]]
    })

    def _split(x):
      labels = x.pop('labels')
      return x, labels

    dataset = dataset.map(_split)
    return dataset

  classifier.train(train_input_fn, max_steps=1)

  classifier = extenders.forward_features(
      classifier, keys=['x'], sparse_default_values={'x': 0})

  def serving_input_fn():
    features_ph = array_ops.placeholder(
        dtype=dtypes.int32, name='x', shape=[None])
    features = {'x': layers.dense_to_sparse(features_ph)}
    return estimator_lib.export.ServingInputReceiver(features,
                                                     {'x': features_ph})

  export_dir, tmpdir = self._export_estimator(classifier, serving_input_fn)
  prediction_fn = from_saved_model(export_dir, signature_def_key='predict')

  features = (0, 2)
  prediction = prediction_fn({'x': features})

  self.assertIn('x', prediction)
  self.assertEqual(features, tuple(prediction['x']))
  gfile.DeleteRecursively(tmpdir)
def testCalibratedLinearRegressorTraining3D(self):
  # Tests also categorical features that have a limited number of valid
  # values.
  feature_columns = [
      feature_column_lib.numeric_column('x0'),
      feature_column_lib.numeric_column('x1'),
      feature_column_lib.categorical_column_with_vocabulary_list(
          'x2', ['Y', 'N'])
  ]
  with ops.Graph().as_default():
    estimator = self._CalibratedLinearRegressorWithQuantiles(
        ['x0', 'x1', 'x2'], feature_columns)
    estimator.train(input_fn=self._test_data.threed_input_fn(False, 4))
    results = estimator.evaluate(
        input_fn=self._test_data.threed_input_fn(False, 1))
  # For the record:
  #   average_loss(CalibratedLinear, 4 epochs) = ~1e-5
  #   average_loss(LinearRegressor, 100 epochs) = ~0.159
  self.assertLess(results['average_loss'], 1e-4)
def test_classifier_basic_warm_starting(self):
  """Tests correctness of DNNLinearCombinedClassifier default warm-start."""
  age = feature_column.numeric_column('age')
  city = feature_column.embedding_column(
      feature_column.categorical_column_with_vocabulary_list(
          'city', vocabulary_list=['Mountain View', 'Palo Alto']),
      dimension=5)

  # Create a DNNLinearCombinedClassifier and train to save a checkpoint.
  dnn_lc_classifier = dnn_linear_combined.DNNLinearCombinedClassifier(
      linear_feature_columns=[age],
      dnn_feature_columns=[city],
      dnn_hidden_units=[256, 128],
      model_dir=self._ckpt_and_vocab_dir,
      n_classes=4,
      linear_optimizer='SGD',
      dnn_optimizer='SGD')
  dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)

  # Create a second DNNLinearCombinedClassifier, warm-started from the first.
  # Use a learning_rate = 0.0 optimizer to check values (use SGD so we don't
  # have accumulator values that change).
  warm_started_dnn_lc_classifier = (
      dnn_linear_combined.DNNLinearCombinedClassifier(
          linear_feature_columns=[age],
          dnn_feature_columns=[city],
          dnn_hidden_units=[256, 128],
          n_classes=4,
          linear_optimizer=gradient_descent.GradientDescentOptimizer(
              learning_rate=0.0),
          dnn_optimizer=gradient_descent.GradientDescentOptimizer(
              learning_rate=0.0),
          warm_start_from=dnn_lc_classifier.model_dir))
  warm_started_dnn_lc_classifier.train(input_fn=self._input_fn, max_steps=1)

  for variable_name in warm_started_dnn_lc_classifier.get_variable_names():
    self.assertAllClose(
        dnn_lc_classifier.get_variable_value(variable_name),
        warm_started_dnn_lc_classifier.get_variable_value(variable_name))
def _testAnnotationsPresentForEstimator(self, estimator_class):
  feature_columns = [
      feature_column.numeric_column('x', shape=(1,)),
      feature_column.embedding_column(
          feature_column.categorical_column_with_vocabulary_list(
              'y', vocabulary_list=['a', 'b', 'c']),
          dimension=3)
  ]
  estimator = estimator_class(
      hidden_units=(2, 2),
      feature_columns=feature_columns,
      model_dir=self._model_dir)
  model_fn = estimator.model_fn

  graph = ops.Graph()
  with graph.as_default():
    model_fn({
        'x': array_ops.constant([1.0]),
        'y': array_ops.constant(['a'])
    }, {}, model_fn_lib.ModeKeys.PREDICT, config=None)

    unprocessed_features = self._getLayerAnnotationCollection(
        graph, dnn_with_layer_annotations.LayerAnnotationsCollectionNames
        .UNPROCESSED_FEATURES)
    processed_features = self._getLayerAnnotationCollection(
        graph, dnn_with_layer_annotations.LayerAnnotationsCollectionNames
        .PROCESSED_FEATURES)
    feature_columns = graph.get_collection(
        dnn_with_layer_annotations.LayerAnnotationsCollectionNames
        .FEATURE_COLUMNS)

    self.assertItemsEqual(unprocessed_features.keys(), ['x', 'y'])
    self.assertEqual(2, len(processed_features.keys()))
    self.assertEqual(2, len(feature_columns))
def test_complete_flow(self):
  n_classes = 3
  input_dimension = 2
  batch_size = 12

  data = np.linspace(
      0., n_classes - 1., batch_size * input_dimension, dtype=np.float32)
  x_data = data.reshape(batch_size, input_dimension)
  categorical_data = np.random.random_integers(
      0, len(x_data), size=len(x_data))
  y_data = np.reshape(self._as_label(data[:batch_size]), (batch_size, 1))
  train_input_fn = numpy_io.numpy_input_fn(
      x={'x': x_data, 'categories': categorical_data},
      y=y_data,
      batch_size=batch_size,
      num_epochs=None,
      shuffle=True)
  eval_input_fn = numpy_io.numpy_input_fn(
      x={'x': x_data, 'categories': categorical_data},
      y=y_data,
      batch_size=batch_size,
      shuffle=False)
  predict_input_fn = numpy_io.numpy_input_fn(
      x={'x': x_data, 'categories': categorical_data},
      batch_size=batch_size,
      shuffle=False)

  feature_columns = [
      feature_column.numeric_column('x', shape=(input_dimension,)),
      feature_column.embedding_column(
          feature_column.categorical_column_with_vocabulary_list(
              'categories',
              vocabulary_list=np.linspace(
                  0., len(x_data), len(x_data), dtype=np.int64)), 1)
  ]

  estimator = dnn.DNNClassifier(
      hidden_units=(2, 2),
      feature_columns=feature_columns,
      n_classes=n_classes,
      model_dir=self._model_dir)

  def optimizer_fn():
    return optimizers.get_optimizer_instance('Adagrad', learning_rate=0.05)

  estimator = estimator_lib.Estimator(
      model_fn=replicate_model_fn.replicate_model_fn(
          estimator.model_fn,
          optimizer_fn,
          devices=['/gpu:0', '/gpu:1', '/gpu:2']),
      model_dir=estimator.model_dir,
      config=estimator.config,
      params=estimator.params)

  num_steps = 10
  estimator.train(train_input_fn, steps=num_steps)

  scores = estimator.evaluate(eval_input_fn)
  self.assertEqual(num_steps, scores[ops_lib.GraphKeys.GLOBAL_STEP])
  self.assertIn('loss', six.iterkeys(scores))

  predicted_proba = np.array([
      x[prediction_keys.PredictionKeys.PROBABILITIES]
      for x in estimator.predict(predict_input_fn)
  ])
  self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)

  feature_spec = feature_column.make_parse_example_spec(feature_columns)
  serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
      feature_spec)
  export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
                                           serving_input_receiver_fn)
  self.assertTrue(gfile.Exists(export_dir))
from tensorflow.contrib.learn import LinearRegressor, pandas_input_fn, DNNRegressor, Experiment
from tensorflow.python.feature_column.feature_column import categorical_column_with_hash_bucket, numeric_column, \
    categorical_column_with_vocabulary_list, embedding_column, indicator_column

make = categorical_column_with_hash_bucket('make', 100)
horsepower = numeric_column('horsepower', shape=[])
cylinders = categorical_column_with_vocabulary_list(
    'num-of-cylinders', ['two', 'three', 'four', 'six', 'eight'])

###############
regressor = DNNRegressor(
    feature_columns=[
        embedding_column(make, 10),
        horsepower,
        # indicator_column takes only the categorical column; the output
        # size is inferred from its vocabulary.
        indicator_column(cylinders)
    ],
    hidden_units=[50, 30, 10])

################
regressor = LinearRegressor(feature_columns=[make, horsepower, cylinders])

# any python generator
train_input_fn = pandas_input_fn(
    x=input_data, y=input_label, batch_size=64, shuffle=True, num_epochs=None)

regressor.train(train_input_fn, steps=10000)


def experiment_fn(run_config, hparams):
  regressor = DNNRegressor(..., config=run_config,
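# Hedged sketch (not part of the original snippet): one plausible completion
# of the truncated experiment_fn fragment above, given that Experiment is
# imported. It builds the DNNRegressor from the run_config it receives and
# wraps it in a tf.contrib.learn Experiment. The eval input function and the
# step counts are placeholder assumptions; reusing train_input_fn for
# evaluation is for illustration only.
def experiment_fn(run_config, hparams):  # hparams unused in this sketch
  regressor = DNNRegressor(
      feature_columns=[
          embedding_column(make, 10), horsepower, indicator_column(cylinders)
      ],
      hidden_units=[50, 30, 10],
      config=run_config)
  return Experiment(
      estimator=regressor,
      train_input_fn=train_input_fn,
      eval_input_fn=train_input_fn,  # placeholder; use a held-out eval set
      train_steps=10000)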
def _complete_flow_with_mode(self, mode):
  n_classes = 3
  input_dimension = 2
  batch_size = 12

  data = np.linspace(
      0., n_classes - 1., batch_size * input_dimension, dtype=np.float32)
  x_data = data.reshape(batch_size, input_dimension)
  categorical_data = np.random.random_integers(
      0, len(x_data), size=len(x_data))
  y_data = np.reshape(self._as_label(data[:batch_size]), (batch_size, 1))
  train_input_fn = numpy_io.numpy_input_fn(
      x={'x': x_data, 'categories': categorical_data},
      y=y_data,
      batch_size=batch_size,
      num_epochs=None,
      shuffle=True)
  eval_input_fn = numpy_io.numpy_input_fn(
      x={'x': x_data, 'categories': categorical_data},
      y=y_data,
      batch_size=batch_size,
      shuffle=False)
  predict_input_fn = numpy_io.numpy_input_fn(
      x={'x': x_data, 'categories': categorical_data},
      batch_size=batch_size,
      shuffle=False)

  feature_columns = [
      feature_column.numeric_column('x', shape=(input_dimension,)),
      feature_column.embedding_column(
          feature_column.categorical_column_with_vocabulary_list(
              'categories',
              vocabulary_list=np.linspace(
                  0., len(x_data), len(x_data), dtype=np.int64)), 1)
  ]

  estimator = dnn.DNNClassifier(
      hidden_units=(2, 2),
      feature_columns=feature_columns,
      n_classes=n_classes,
      model_dir=self._model_dir)

  def optimizer_fn():
    return optimizers.get_optimizer_instance('Adagrad', learning_rate=0.05)

  if not mode:
    # Use the public `replicate_model_fn`.
    model_fn = replicate_model_fn.replicate_model_fn(
        estimator.model_fn,
        optimizer_fn,
        devices=['/gpu:0', '/gpu:1', '/gpu:2'])
  else:
    model_fn = replicate_model_fn._replicate_model_fn_with_mode(
        estimator.model_fn,
        optimizer_fn,
        devices=['/gpu:0', '/gpu:1', '/gpu:2'],
        mode=mode)

  estimator = estimator_lib.Estimator(
      model_fn=model_fn,
      model_dir=estimator.model_dir,
      config=estimator.config,
      params=estimator.params)

  num_steps = 10
  estimator.train(train_input_fn, steps=num_steps)

  scores = estimator.evaluate(eval_input_fn)
  self.assertEqual(num_steps, scores[ops_lib.GraphKeys.GLOBAL_STEP])
  self.assertIn('loss', six.iterkeys(scores))

  predicted_proba = np.array([
      x[prediction_keys.PredictionKeys.PROBABILITIES]
      for x in estimator.predict(predict_input_fn)
  ])
  self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)

  feature_spec = feature_column.make_parse_example_spec(feature_columns)
  serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
      feature_spec)
  export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
                                           serving_input_receiver_fn)
  self.assertTrue(gfile.Exists(export_dir))
def _build_feature_columns(self):
  multi_hot_feature_columns = {}
  multi_hot_feature_columns_deep = {}
  multi_category_feature_columns = {}
  continuous_feature_columns = {}
  crossed_feature_columns = []
  bucketized_feature_columns = []
  embedding_feature_columns = []

  if self._data_conf.multi_hot_columns is not None:
    for column in self._data_conf.multi_hot_columns:
      multi_hot_feature_columns[column] = (
          categorical_column_with_vocabulary_list(
              column,
              self._data_conf.multi_hot_columns[column],
              dtype=tf.string))
      multi_hot_feature_columns_deep[column] = indicator_column(
          multi_hot_feature_columns[column])

  if self._data_conf.multi_category_columns is not None:
    multi_category_feature_columns = {
        column: categorical_column_with_hash_bucket(
            column, hash_bucket_size=1000)
        for column in self._data_conf.multi_category_columns
    }

  if self._data_conf.continuous_columns is not None:
    continuous_feature_columns = {
        column: numeric_column(column)
        for column in self._data_conf.continuous_columns
    }

  if self._data_conf.crossed_columns is not None:
    crossed_feature_columns = [
        crossed_column(keys, hash_bucket_size=100000)
        for keys in self._data_conf.crossed_columns
    ]

  if self._data_conf.bucketized_columns is not None:
    # Bucketize each configured continuous column with its boundaries.
    for column, boundary in self._data_conf.bucketized_columns.items():
      bucketized_feature_columns.append(
          bucketized_column(
              continuous_feature_columns[column], boundaries=boundary))

  if len(multi_category_feature_columns) > 0:
    embedding_feature_columns = [
        embedding_column(col, dimension=self._model_conf.embedding_dimension)
        for col in multi_category_feature_columns.values()
    ]

  self._feature_mapping = {
      0: list(multi_hot_feature_columns.values()),
      1: list(multi_category_feature_columns.values()),
      2: list(continuous_feature_columns.values()),
      3: crossed_feature_columns,
      4: bucketized_feature_columns,
      5: embedding_feature_columns,
      6: list(multi_hot_feature_columns_deep.values())
  }
  self._build_feature_columns_for_model()
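# A hypothetical sketch (not from the original source) of the configuration
# shape that _build_feature_columns above appears to expect. The attribute
# names come from the method body; every value below is illustrative only.
from types import SimpleNamespace

data_conf_sketch = SimpleNamespace(
    # column name -> vocabulary list for the multi-hot string columns
    multi_hot_columns={
        'week': ['mon', 'tue', 'wed', 'thur', 'fri', 'sat', 'sun']
    },
    # high-cardinality columns hashed into 1000 buckets
    multi_category_columns=['user_id', 'item_id'],
    # plain numeric columns
    continuous_columns=['age', 'price'],
    # each entry is an iterable of keys to cross
    crossed_columns=[['user_id', 'item_id']],
    # continuous column name -> bucket boundaries (must also appear in
    # continuous_columns, since the builder looks the column up there)
    bucketized_columns={'age': [18, 25, 35, 50, 65]},
)
model_conf_sketch = SimpleNamespace(embedding_dimension=8)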
def testWarmStartVarsToWarmstartIsNone(self):
  # Create old and new vocabs for sparse column "sc_vocab".
  prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                      "old_vocab")
  new_vocab_path = self._write_vocab(
      ["orange", "guava", "banana", "apple", "raspberry", "blueberry"],
      "new_vocab")
  # Create feature columns.
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
  all_linear_cols = [sc_hash, sc_keys, sc_vocab]

  # Save checkpoint from which to warm-start.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      variable_scope.get_variable(
          "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
      variable_scope.get_variable(
          "some_other_name", shape=[4, 1], initializer=rand())
      variable_scope.get_variable(
          "linear_model/sc_vocab/weights",
          initializer=[[0.5], [1.], [2.], [3.]])
      self._write_checkpoint(sess)

  def _partitioner(shape, dtype):  # pylint:disable=unused-argument
    # Partition each var into 2 equal slices.
    partitions = [1] * len(shape)
    partitions[0] = min(2, shape[0].value)
    return partitions

  # New graph, new session with warm-starting.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      cols_to_vars = self._create_linear_model(all_linear_cols, _partitioner)
      vocab_info = ws_util.VocabInfo(
          new_vocab=sc_vocab.vocabulary_file,
          new_vocab_size=sc_vocab.vocabulary_size,
          num_oov_buckets=sc_vocab.num_oov_buckets,
          old_vocab=prev_vocab_path)
      ws_settings = ws_util.WarmStartSettings(
          self.get_temp_dir(),
          # The special value of None here will ensure that only the variable
          # specified in var_name_to_vocab_info (sc_vocab embedding) is
          # warm-started.
          vars_to_warm_start=None,
          var_name_to_vocab_info={
              ws_util._infer_var_name(cols_to_vars[sc_vocab]): vocab_info
          },
          # Even though this is provided, the None value for
          # vars_to_warm_start overrides the logic, and this will not be
          # warm-started.
          var_name_to_prev_var_name={
              ws_util._infer_var_name(cols_to_vars[sc_keys]):
                  "some_other_name"
          })
      ws_util._warm_start(ws_settings)
      sess.run(variables.global_variables_initializer())
      # Verify weights were correctly warm-started. Var corresponding to
      # sc_vocab should be correctly warm-started after vocab remapping,
      # and neither of the other two should be warm-started.
      self._assert_cols_to_vars(cols_to_vars, {
          sc_keys: [np.zeros([2, 1]), np.zeros([2, 1])],
          sc_hash: [np.zeros([8, 1]), np.zeros([7, 1])],
          sc_vocab: [
              np.array([[3.], [2.], [1.]]),
              np.array([[0.5], [0.], [0.]])
          ]
      }, sess)
def testWarmStart_MultipleCols(self):
  # Create vocab for sparse column "sc_vocab".
  vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                 "vocab")
  # Create feature columns.
  sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=vocab_path, vocabulary_size=4)
  real = fc.numeric_column("real")
  real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])
  cross = fc.crossed_column([sc_keys, sc_vocab], hash_bucket_size=20)
  all_linear_cols = [sc_int, sc_hash, sc_keys, sc_vocab, real_bucket, cross]

  # Save checkpoint from which to warm-start. Also create a bias variable,
  # so we can check that it's also warm-started.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      sc_int_weights = variable_scope.get_variable(
          "linear_model/sc_int/weights", shape=[10, 1], initializer=ones())
      sc_hash_weights = variable_scope.get_variable(
          "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
      sc_keys_weights = variable_scope.get_variable(
          "linear_model/sc_keys/weights", shape=[4, 1], initializer=rand())
      sc_vocab_weights = variable_scope.get_variable(
          "linear_model/sc_vocab/weights", shape=[4, 1], initializer=ones())
      real_bucket_weights = variable_scope.get_variable(
          "linear_model/real_bucketized/weights",
          shape=[5, 1],
          initializer=norms())
      cross_weights = variable_scope.get_variable(
          "linear_model/sc_keys_X_sc_vocab/weights",
          shape=[20, 1],
          initializer=rand())
      bias = variable_scope.get_variable(
          "linear_model/bias_weights", shape=[1], initializer=rand())
      self._write_checkpoint(sess)
      (prev_int_val, prev_hash_val, prev_keys_val, prev_vocab_val,
       prev_bucket_val, prev_cross_val, prev_bias_val) = sess.run([
           sc_int_weights, sc_hash_weights, sc_keys_weights,
           sc_vocab_weights, real_bucket_weights, cross_weights, bias
       ])

  partitioner = lambda shape, dtype: [1] * len(shape)
  # New graph, new session WITHOUT warm-starting.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
      sess.run(variables.global_variables_initializer())
      # Without warm-starting, all weights should be initialized using default
      # initializer (which is init_ops.zeros_initializer).
      self._assert_cols_to_vars(cols_to_vars, {
          sc_int: [np.zeros([10, 1])],
          sc_hash: [np.zeros([15, 1])],
          sc_keys: [np.zeros([4, 1])],
          sc_vocab: [np.zeros([4, 1])],
          real_bucket: [np.zeros([5, 1])],
          cross: [np.zeros([20, 1])],
      }, sess)

  # New graph, new session with warm-starting.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
      vocab_info = ws_util.VocabInfo(
          new_vocab=sc_vocab.vocabulary_file,
          new_vocab_size=sc_vocab.vocabulary_size,
          num_oov_buckets=sc_vocab.num_oov_buckets,
          old_vocab=vocab_path)
      ws_util._warm_start(
          ws_util.WarmStartSettings(
              self.get_temp_dir(),
              var_name_to_vocab_info={
                  "linear_model/sc_vocab/weights": vocab_info
              }))
      sess.run(variables.global_variables_initializer())
      # Verify weights were correctly warm-started.
      self._assert_cols_to_vars(cols_to_vars, {
          sc_int: [prev_int_val],
          sc_hash: [prev_hash_val],
          sc_keys: [prev_keys_val],
          sc_vocab: [prev_vocab_val],
          real_bucket: [prev_bucket_val],
          cross: [prev_cross_val],
          "bias": [prev_bias_val],
      }, sess)
def testWarmStartVarsToWarmstartIsNone(self):
  # Create old and new vocabs for sparse column "sc_vocab".
  prev_vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                      "old_vocab")
  new_vocab_path = self._write_vocab(
      ["orange", "guava", "banana", "apple", "raspberry", "blueberry"],
      "new_vocab")
  # Create feature columns.
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=new_vocab_path, vocabulary_size=6)
  all_linear_cols = [sc_hash, sc_keys, sc_vocab]

  # Save checkpoint from which to warm-start.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      variable_scope.get_variable(
          "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
      variable_scope.get_variable(
          "some_other_name", shape=[4, 1], initializer=rand())
      variable_scope.get_variable(
          "linear_model/sc_vocab/weights",
          initializer=[[0.5], [1.], [2.], [3.]])
      self._write_checkpoint(sess)

  def _partitioner(shape, dtype):  # pylint:disable=unused-argument
    # Partition each var into 2 equal slices.
    partitions = [1] * len(shape)
    partitions[0] = min(2, shape[0].value)
    return partitions

  # New graph, new session with warmstarting.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      cols_to_vars = self._create_linear_model(all_linear_cols, _partitioner)
      vocab_info = ws_util._VocabInfo(
          new_vocab=sc_vocab.vocabulary_file,
          new_vocab_size=sc_vocab.vocabulary_size,
          num_oov_buckets=sc_vocab.num_oov_buckets,
          old_vocab=prev_vocab_path)
      ws_settings = ws_util._WarmStartSettings(
          self.get_temp_dir(),
          # The special value of None here will ensure that only the variable
          # specified in var_name_to_vocab_info (sc_vocab embedding) is
          # warmstarted.
          vars_to_warmstart=None,
          var_name_to_vocab_info={
              ws_util._infer_var_name(cols_to_vars[sc_vocab]): vocab_info
          },
          # Even though this is provided, the None value for
          # vars_to_warmstart overrides the logic, and this will not be
          # warmstarted.
          var_name_to_prev_var_name={
              ws_util._infer_var_name(cols_to_vars[sc_keys]):
                  "some_other_name"
          })
      ws_util._warmstart(ws_settings)
      sess.run(variables.global_variables_initializer())
      # Verify weights were correctly warmstarted. Var corresponding to
      # sc_vocab should be correctly warmstarted after vocab remapping,
      # and neither of the other two should be warmstarted.
      self._assert_cols_to_vars(cols_to_vars, {
          sc_keys: [np.zeros([2, 1]), np.zeros([2, 1])],
          sc_hash: [np.zeros([8, 1]), np.zeros([7, 1])],
          sc_vocab: [
              np.array([[3.], [2.], [1.]]),
              np.array([[0.5], [0.], [0.]])
          ]
      }, sess)
def testWarmStart_MultipleCols(self):
  # Create vocab for sparse column "sc_vocab".
  vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                 "vocab")
  # Create feature columns.
  sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=vocab_path, vocabulary_size=4)
  real = fc.numeric_column("real")
  real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])
  cross = fc.crossed_column([sc_keys, sc_vocab], hash_bucket_size=20)
  all_linear_cols = [sc_int, sc_hash, sc_keys, sc_vocab, real_bucket, cross]

  # Save checkpoint from which to warm-start. Also create a bias variable,
  # so we can check that it's also warmstarted.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      sc_int_weights = variable_scope.get_variable(
          "linear_model/sc_int/weights", shape=[10, 1], initializer=ones())
      sc_hash_weights = variable_scope.get_variable(
          "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
      sc_keys_weights = variable_scope.get_variable(
          "linear_model/sc_keys/weights", shape=[4, 1], initializer=rand())
      sc_vocab_weights = variable_scope.get_variable(
          "linear_model/sc_vocab/weights", shape=[4, 1], initializer=ones())
      real_bucket_weights = variable_scope.get_variable(
          "linear_model/real_bucketized/weights",
          shape=[5, 1],
          initializer=norms())
      cross_weights = variable_scope.get_variable(
          "linear_model/sc_keys_X_sc_vocab/weights",
          shape=[20, 1],
          initializer=rand())
      bias = variable_scope.get_variable(
          "linear_model/bias_weights", shape=[1], initializer=rand())
      self._write_checkpoint(sess)
      (prev_int_val, prev_hash_val, prev_keys_val, prev_vocab_val,
       prev_bucket_val, prev_cross_val, prev_bias_val) = sess.run([
           sc_int_weights, sc_hash_weights, sc_keys_weights,
           sc_vocab_weights, real_bucket_weights, cross_weights, bias
       ])

  partitioner = lambda shape, dtype: [1] * len(shape)
  # New graph, new session WITHOUT warmstarting.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
      sess.run(variables.global_variables_initializer())
      # Without warmstarting, all weights should be initialized using default
      # initializer (which is init_ops.zeros_initializer).
      self._assert_cols_to_vars(cols_to_vars, {
          sc_int: [np.zeros([10, 1])],
          sc_hash: [np.zeros([15, 1])],
          sc_keys: [np.zeros([4, 1])],
          sc_vocab: [np.zeros([4, 1])],
          real_bucket: [np.zeros([5, 1])],
          cross: [np.zeros([20, 1])],
      }, sess)

  # New graph, new session with warmstarting.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
      vocab_info = ws_util._VocabInfo(
          new_vocab=sc_vocab.vocabulary_file,
          new_vocab_size=sc_vocab.vocabulary_size,
          num_oov_buckets=sc_vocab.num_oov_buckets,
          old_vocab=vocab_path)
      ws_util._warmstart(
          ws_util._WarmStartSettings(
              self.get_temp_dir(),
              var_name_to_vocab_info={
                  "linear_model/sc_vocab/weights": vocab_info
              }))
      sess.run(variables.global_variables_initializer())
      # Verify weights were correctly warmstarted.
      self._assert_cols_to_vars(cols_to_vars, {
          sc_int: [prev_int_val],
          sc_hash: [prev_hash_val],
          sc_keys: [prev_keys_val],
          sc_vocab: [prev_vocab_val],
          real_bucket: [prev_bucket_val],
          cross: [prev_cross_val],
          "bias": [prev_bias_val],
      }, sess)