def setUp(self):
  """Builds one bucketized feature column per synthetic numeric feature."""
  columns = set()
  for feature_idx in range(NUM_FEATURES):
    numeric = feature_column.numeric_column(
        'f_%d' % feature_idx, dtype=dtypes.float32)
    columns.add(
        feature_column.bucketized_column(numeric, BUCKET_BOUNDARIES))
  self._feature_columns = columns
def testWarmStart_BucketizedColumn(self):
  """Checks that a bucketized column's weights are warm-started."""
  # Build the feature column under test.
  numeric = fc.numeric_column("real")
  bucketized = fc.bucketized_column(numeric, boundaries=[0., 1., 2., 3.])
  # Save checkpoint from which to warm-start.
  _, prev_bucket_val = self._create_prev_run_var(
      "linear_model/real_bucketized/weights",
      shape=[5, 1],
      initializer=norms())

  def partitioner(shape, dtype):
    # One partition per dimension, i.e. effectively unpartitioned.
    return [1] * len(shape)

  # New graph, new session WITHOUT warm-starting: the weights stay at the
  # default initializer's value (init_ops.zeros_initializer).
  with ops.Graph().as_default() as graph:
    with self.test_session(graph=graph) as sess:
      cols_to_vars = self._create_linear_model([bucketized], partitioner)
      sess.run(variables.global_variables_initializer())
      self._assert_cols_to_vars(cols_to_vars,
                                {bucketized: [np.zeros([5, 1])]}, sess)

  # New graph, new session WITH warm-starting: the weights are restored
  # from the checkpoint written above.
  with ops.Graph().as_default() as graph:
    with self.test_session(graph=graph) as sess:
      cols_to_vars = self._create_linear_model([bucketized], partitioner)
      ws_util._warm_start(
          ws_util.WarmStartSettings(
              self.get_temp_dir(),
              vars_to_warm_start=".*real_bucketized.*"))
      sess.run(variables.global_variables_initializer())
      # Verify weights were correctly warm-started.
      self._assert_cols_to_vars(cols_to_vars,
                                {bucketized: [prev_bucket_val]}, sess)
def testWarmStart_BucketizedColumn(self):
  """Checks that a bucketized column's weights are warm-started.

  Uses the private ``ws_util._warmstart`` / ``_WarmStartSettings`` API.
  """
  # Create feature column.
  real = fc.numeric_column("real")
  real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])
  # Save checkpoint from which to warm-start.
  _, prev_bucket_val = self._create_prev_run_var(
      "linear_model/real_bucketized/weights", shape=[5, 1],
      initializer=norms())
  # One partition per dimension, i.e. effectively unpartitioned.
  partitioner = lambda shape, dtype: [1] * len(shape)
  # New graph, new session WITHOUT warmstarting.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      cols_to_vars = self._create_linear_model([real_bucket], partitioner)
      sess.run(variables.global_variables_initializer())
      # Without warmstarting, the weights should be initialized using default
      # initializer (which is init_ops.zeros_initializer).
      self._assert_cols_to_vars(cols_to_vars,
                                {real_bucket: [np.zeros([5, 1])]}, sess)
  # New graph, new session with warmstarting.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      cols_to_vars = self._create_linear_model([real_bucket], partitioner)
      ws_util._warmstart(ws_util._WarmStartSettings(
          self.get_temp_dir(),
          vars_to_warmstart=".*real_bucketized.*"))
      sess.run(variables.global_variables_initializer())
      # Verify weights were correctly warmstarted.
      self._assert_cols_to_vars(cols_to_vars,
                                {real_bucket: [prev_bucket_val]}, sess)
def setUp(self):
  """Creates a scalar regression head and bucketized input columns."""
  self._head = canned_boosted_trees._create_regression_head(label_dimension=1)
  bucketized = []
  for feature_idx in range(NUM_FEATURES):
    source = feature_column.numeric_column(
        'f_%d' % feature_idx, dtype=dtypes.float32)
    bucketized.append(
        feature_column.bucketized_column(source, BUCKET_BOUNDARIES))
  self._feature_columns = set(bucketized)
def testTrainEvaluateAndPredictWithIndicatorColumn(self):
  """Trains a depth-1 tree where an indicator column perfectly predicts labels.

  The numeric feature is constant (uninformative), so the single split
  must land on the categorical 'good' value; loss should reach zero.
  """
  categorical = feature_column.categorical_column_with_vocabulary_list(
      key='categorical', vocabulary_list=('bad', 'good', 'ok'))
  feature_indicator = feature_column.indicator_column(categorical)
  bucketized_col = feature_column.bucketized_column(
      feature_column.numeric_column('an_uninformative_feature',
                                    dtype=dtypes.float32),
      BUCKET_BOUNDARIES)
  labels = np.array([[0.], [5.7], [5.7], [0.], [0.]], dtype=np.float32)

  # Our categorical feature defines the labels perfectly
  input_fn = numpy_io.numpy_input_fn(
      x={
          'an_uninformative_feature': np.array([1, 1, 1, 1, 1]),
          'categorical': np.array(['bad', 'good', 'good', 'ok', 'bad']),
      },
      y=labels,
      batch_size=5,
      shuffle=False)

  # Train depth 1 tree.
  est = boosted_trees.BoostedTreesRegressor(
      feature_columns=[bucketized_col, feature_indicator],
      n_batches_per_layer=1,
      n_trees=1,
      learning_rate=1.0,
      max_depth=1)

  num_steps = 1
  est.train(input_fn, steps=num_steps)
  ensemble = self._assert_checkpoint_and_return_model(
      est.model_dir, global_step=1, finalized_trees=1, attempted_layers=1)

  # We learnt perfectly.
  eval_res = est.evaluate(input_fn=input_fn, steps=1)
  self.assertAllClose(eval_res['loss'], 0)

  predictions = list(est.predict(input_fn))
  self.assertAllClose(labels, [pred['predictions'] for pred in predictions])

  self.assertEqual(3, len(ensemble.trees[0].nodes))

  # Check that the split happened on 'good' value, which will be encoded as
  # feature with index 2 (0-numeric, 1 - 'bad')
  self.assertEqual(2, ensemble.trees[0].nodes[0].bucketized_split.feature_id)
  self.assertEqual(0, ensemble.trees[0].nodes[0].bucketized_split.threshold)
def setUp(self):
  """Prepares bucketized feature columns and tree hyper-parameters."""
  numeric_cols = (
      feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32)
      for i in range(NUM_FEATURES))
  self._feature_columns = {
      feature_column.bucketized_column(col, BUCKET_BOUNDARIES)
      for col in numeric_cols
  }
  self._tree_hparams = boosted_trees.TreeHParams(
      n_trees=2,
      max_depth=2,
      learning_rate=0.1,
      l1=0.,
      l2=0.01,
      tree_complexity=0.)
def setUp(self):
  """Prepares bucketized feature columns and tree hyper-parameters."""
  self._feature_columns = {
      feature_column.bucketized_column(
          feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
          BUCKET_BOUNDARIES) for i in range(NUM_FEATURES)
  }
  self._tree_hparams = boosted_trees._TreeHParams(  # pylint:disable=protected-access
      n_trees=2, max_depth=2, learning_rate=0.1, l1=0., l2=0.01,
      tree_complexity=0.)
def setUp(self):
  """Builds bucketized input columns and default tree hyper-parameters."""
  columns = set()
  for feature_id in range(NUM_FEATURES):
    base = feature_column.numeric_column(
        'f_%d' % feature_id, dtype=dtypes.float32)
    columns.add(feature_column.bucketized_column(base, BUCKET_BOUNDARIES))
  self._feature_columns = columns
  # pylint:disable=protected-access
  self._tree_hparams = boosted_trees._TreeHParams(
      n_trees=2,
      max_depth=2,
      learning_rate=0.1,
      l1=0.,
      l2=0.01,
      tree_complexity=0.,
      min_node_weight=0.)
  # pylint:enable=protected-access
def testTrainEvaluateAndPredictWithIndicatorColumn(self):
  """Trains a depth-1 tree where an indicator column perfectly predicts labels.

  The numeric feature is constant (uninformative), so the single split
  must land on the categorical 'good' value; loss should reach zero.
  """
  categorical = feature_column.categorical_column_with_vocabulary_list(
      key='categorical', vocabulary_list=('bad', 'good', 'ok'))
  feature_indicator = feature_column.indicator_column(categorical)
  bucketized_col = feature_column.bucketized_column(
      feature_column.numeric_column(
          'an_uninformative_feature', dtype=dtypes.float32),
      BUCKET_BOUNDARIES)
  labels = np.array([[0.], [5.7], [5.7], [0.], [0.]], dtype=np.float32)

  # Our categorical feature defines the labels perfectly
  input_fn = numpy_io.numpy_input_fn(
      x={
          'an_uninformative_feature': np.array([1, 1, 1, 1, 1]),
          'categorical': np.array(['bad', 'good', 'good', 'ok', 'bad']),
      },
      y=labels,
      batch_size=5,
      shuffle=False)

  # Train depth 1 tree.
  est = boosted_trees.BoostedTreesRegressor(
      feature_columns=[bucketized_col, feature_indicator],
      n_batches_per_layer=1,
      n_trees=1,
      learning_rate=1.0,
      max_depth=1)

  num_steps = 1
  est.train(input_fn, steps=num_steps)
  ensemble = self._assert_checkpoint_and_return_model(
      est.model_dir, global_step=1, finalized_trees=1, attempted_layers=1)

  # We learnt perfectly.
  eval_res = est.evaluate(input_fn=input_fn, steps=1)
  self.assertAllClose(eval_res['loss'], 0)

  predictions = list(est.predict(input_fn))
  self.assertAllClose(
      labels, [pred['predictions'] for pred in predictions])

  self.assertEqual(3, len(ensemble.trees[0].nodes))

  # Check that the split happened on 'good' value, which will be encoded as
  # feature with index 2 (0-numeric, 1 - 'bad')
  self.assertEqual(2, ensemble.trees[0].nodes[0].bucketized_split.feature_id)
  self.assertEqual(0, ensemble.trees[0].nodes[0].bucketized_split.threshold)
def testWarmStart_MultipleCols(self):
  """Warm-starts a linear model built from several feature-column types.

  Covers identity, hash-bucket, vocabulary-list, vocabulary-file,
  bucketized, and crossed columns, plus the bias variable.
  """
  # Create vocab for sparse column "sc_vocab".
  vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                 "vocab")
  # Create feature columns.
  sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=vocab_path, vocabulary_size=4)
  real = fc.numeric_column("real")
  real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])
  cross = fc.crossed_column([sc_keys, sc_vocab], hash_bucket_size=20)
  all_linear_cols = [sc_int, sc_hash, sc_keys, sc_vocab, real_bucket, cross]

  # Save checkpoint from which to warm-start. Also create a bias variable,
  # so we can check that it's also warm-started.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      sc_int_weights = variable_scope.get_variable(
          "linear_model/sc_int/weights", shape=[10, 1], initializer=ones())
      sc_hash_weights = variable_scope.get_variable(
          "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
      sc_keys_weights = variable_scope.get_variable(
          "linear_model/sc_keys/weights", shape=[4, 1], initializer=rand())
      sc_vocab_weights = variable_scope.get_variable(
          "linear_model/sc_vocab/weights", shape=[4, 1], initializer=ones())
      real_bucket_weights = variable_scope.get_variable(
          "linear_model/real_bucketized/weights", shape=[5, 1],
          initializer=norms())
      cross_weights = variable_scope.get_variable(
          "linear_model/sc_keys_X_sc_vocab/weights", shape=[20, 1],
          initializer=rand())
      bias = variable_scope.get_variable(
          "linear_model/bias_weights", shape=[1], initializer=rand())
      self._write_checkpoint(sess)
      # Capture the checkpointed values to compare against after restore.
      (prev_int_val, prev_hash_val, prev_keys_val, prev_vocab_val,
       prev_bucket_val, prev_cross_val, prev_bias_val) = sess.run([
           sc_int_weights, sc_hash_weights, sc_keys_weights, sc_vocab_weights,
           real_bucket_weights, cross_weights, bias
       ])

  # One partition per dimension, i.e. effectively unpartitioned.
  partitioner = lambda shape, dtype: [1] * len(shape)
  # New graph, new session WITHOUT warm-starting.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
      sess.run(variables.global_variables_initializer())
      # Without warm-starting, all weights should be initialized using default
      # initializer (which is init_ops.zeros_initializer).
      self._assert_cols_to_vars(cols_to_vars, {
          sc_int: [np.zeros([10, 1])],
          sc_hash: [np.zeros([15, 1])],
          sc_keys: [np.zeros([4, 1])],
          sc_vocab: [np.zeros([4, 1])],
          real_bucket: [np.zeros([5, 1])],
          cross: [np.zeros([20, 1])],
      }, sess)

  # New graph, new session with warm-starting.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
      vocab_info = ws_util.VocabInfo(
          new_vocab=sc_vocab.vocabulary_file,
          new_vocab_size=sc_vocab.vocabulary_size,
          num_oov_buckets=sc_vocab.num_oov_buckets,
          old_vocab=vocab_path)
      ws_util._warm_start(
          ws_util.WarmStartSettings(
              self.get_temp_dir(),
              var_name_to_vocab_info={
                  "linear_model/sc_vocab/weights": vocab_info
              }))
      sess.run(variables.global_variables_initializer())
      # Verify weights were correctly warm-started.
      self._assert_cols_to_vars(cols_to_vars, {
          sc_int: [prev_int_val],
          sc_hash: [prev_hash_val],
          sc_keys: [prev_keys_val],
          sc_vocab: [prev_vocab_val],
          real_bucket: [prev_bucket_val],
          cross: [prev_cross_val],
          "bias": [prev_bias_val],
      }, sess)
def testWarmStart_MultipleCols(self):
  """Warm-starts a linear model built from several feature-column types.

  Uses the private ``ws_util._warmstart`` / ``_WarmStartSettings`` /
  ``_VocabInfo`` API. Covers identity, hash-bucket, vocabulary-list,
  vocabulary-file, bucketized, and crossed columns, plus the bias variable.
  """
  # Create vocab for sparse column "sc_vocab".
  vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                 "vocab")
  # Create feature columns.
  sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)
  sc_hash = fc.categorical_column_with_hash_bucket(
      "sc_hash", hash_bucket_size=15)
  sc_keys = fc.categorical_column_with_vocabulary_list(
      "sc_keys", vocabulary_list=["a", "b", "c", "e"])
  sc_vocab = fc.categorical_column_with_vocabulary_file(
      "sc_vocab", vocabulary_file=vocab_path, vocabulary_size=4)
  real = fc.numeric_column("real")
  real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])
  cross = fc.crossed_column([sc_keys, sc_vocab], hash_bucket_size=20)
  all_linear_cols = [sc_int, sc_hash, sc_keys, sc_vocab, real_bucket, cross]

  # Save checkpoint from which to warm-start. Also create a bias variable,
  # so we can check that it's also warmstarted.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      sc_int_weights = variable_scope.get_variable(
          "linear_model/sc_int/weights", shape=[10, 1], initializer=ones())
      sc_hash_weights = variable_scope.get_variable(
          "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
      sc_keys_weights = variable_scope.get_variable(
          "linear_model/sc_keys/weights", shape=[4, 1], initializer=rand())
      sc_vocab_weights = variable_scope.get_variable(
          "linear_model/sc_vocab/weights", shape=[4, 1], initializer=ones())
      real_bucket_weights = variable_scope.get_variable(
          "linear_model/real_bucketized/weights", shape=[5, 1],
          initializer=norms())
      cross_weights = variable_scope.get_variable(
          "linear_model/sc_keys_X_sc_vocab/weights", shape=[20, 1],
          initializer=rand())
      bias = variable_scope.get_variable(
          "linear_model/bias_weights", shape=[1], initializer=rand())
      self._write_checkpoint(sess)
      # Capture the checkpointed values to compare against after restore.
      (prev_int_val, prev_hash_val, prev_keys_val, prev_vocab_val,
       prev_bucket_val, prev_cross_val, prev_bias_val) = sess.run([
           sc_int_weights, sc_hash_weights, sc_keys_weights, sc_vocab_weights,
           real_bucket_weights, cross_weights, bias
       ])

  # One partition per dimension, i.e. effectively unpartitioned.
  partitioner = lambda shape, dtype: [1] * len(shape)
  # New graph, new session WITHOUT warmstarting.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
      sess.run(variables.global_variables_initializer())
      # Without warmstarting, all weights should be initialized using default
      # initializer (which is init_ops.zeros_initializer).
      self._assert_cols_to_vars(cols_to_vars, {
          sc_int: [np.zeros([10, 1])],
          sc_hash: [np.zeros([15, 1])],
          sc_keys: [np.zeros([4, 1])],
          sc_vocab: [np.zeros([4, 1])],
          real_bucket: [np.zeros([5, 1])],
          cross: [np.zeros([20, 1])],
      }, sess)

  # New graph, new session with warmstarting.
  with ops.Graph().as_default() as g:
    with self.test_session(graph=g) as sess:
      cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
      vocab_info = ws_util._VocabInfo(
          new_vocab=sc_vocab.vocabulary_file,
          new_vocab_size=sc_vocab.vocabulary_size,
          num_oov_buckets=sc_vocab.num_oov_buckets,
          old_vocab=vocab_path
      )
      ws_util._warmstart(
          ws_util._WarmStartSettings(
              self.get_temp_dir(),
              var_name_to_vocab_info={
                  "linear_model/sc_vocab/weights": vocab_info
              }))
      sess.run(variables.global_variables_initializer())
      # Verify weights were correctly warmstarted.
      self._assert_cols_to_vars(cols_to_vars, {
          sc_int: [prev_int_val],
          sc_hash: [prev_hash_val],
          sc_keys: [prev_keys_val],
          sc_vocab: [prev_vocab_val],
          real_bucket: [prev_bucket_val],
          cross: [prev_cross_val],
          "bias": [prev_bias_val],
      }, sess)
def _build_feature_columns(self):
  """Builds all feature-column groups from the data/model configuration.

  Populates ``self._feature_mapping`` (group index -> list of feature
  columns) and then delegates to ``self._build_feature_columns_for_model()``.
  Bucketized columns are derived from the continuous columns, so the
  continuous group must be configured for every bucketized entry.
  """
  multi_hot_feature_columns = {}
  multi_hot_feature_columns_deep = {}
  multi_category_feature_columns = {}
  continuous_feature_columns = {}
  crossed_feature_columns = []
  bucketized_feature_columns = []
  embedding_feature_columns = []

  # Multi-hot categoricals: vocabulary-list columns plus indicator
  # (multi-hot) wrappers used by the deep part of the model.
  if self._data_conf.multi_hot_columns is not None:
    for column in self._data_conf.multi_hot_columns:
      multi_hot_feature_columns[
          column] = categorical_column_with_vocabulary_list(
              column,
              self._data_conf.multi_hot_columns[column],
              dtype=tf.string)
      multi_hot_feature_columns_deep[column] = indicator_column(
          multi_hot_feature_columns[column])

  # High-cardinality categoricals hashed into a fixed bucket space.
  if self._data_conf.multi_category_columns is not None:
    multi_category_feature_columns = {
        column: categorical_column_with_hash_bucket(
            column, hash_bucket_size=1000)
        for column in self._data_conf.multi_category_columns
    }

  if self._data_conf.continuous_columns is not None:
    continuous_feature_columns = {
        column: numeric_column(column)
        for column in self._data_conf.continuous_columns
    }

  if self._data_conf.crossed_columns is not None:
    crossed_feature_columns = [
        crossed_column(keys, hash_bucket_size=100000)
        for keys in self._data_conf.crossed_columns
    ]

  # BUG FIX: the original iterated over the unbound ``.items`` method
  # (missing call parentheses), which raised a TypeError at runtime; it
  # also abused a list comprehension for its side effects. Use a plain
  # loop over ``.items()`` instead.
  if self._data_conf.bucketized_columns is not None:
    for column, boundary in self._data_conf.bucketized_columns.items():
      bucketized_feature_columns.append(
          bucketized_column(continuous_feature_columns[column],
                            boundaries=boundary))

  # Hashed categorical columns additionally get dense embeddings.
  if multi_category_feature_columns:
    embedding_feature_columns = [
        embedding_column(col, dimension=self._model_conf.embedding_dimension)
        for col in multi_category_feature_columns.values()
    ]

  self._feature_mapping = {
      0: list(multi_hot_feature_columns.values()),
      1: list(multi_category_feature_columns.values()),
      2: list(continuous_feature_columns.values()),
      3: crossed_feature_columns,
      4: bucketized_feature_columns,
      5: embedding_feature_columns,
      6: list(multi_hot_feature_columns_deep.values())
  }
  self._build_feature_columns_for_model()