  def setUp(self):
    self._feature_columns = {
        feature_column.bucketized_column(
            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
            BUCKET_BOUNDARIES)
        for i in range(NUM_FEATURES)
    }
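For context, BUCKET_BOUNDARIES and NUM_FEATURES are module-level constants of the test file. The values below are only an illustrative sketch, not the originals:

# Hypothetical module-level constants assumed by the setUp above.
NUM_FEATURES = 3
BUCKET_BOUNDARIES = [-2., .5, 12.]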
  def testWarmStart_BucketizedColumn(self):
    # Create feature column.
    real = fc.numeric_column("real")
    real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])

    # Save checkpoint from which to warm-start.
    _, prev_bucket_val = self._create_prev_run_var(
        "linear_model/real_bucketized/weights",
        shape=[5, 1],
        initializer=norms())

    partitioner = lambda shape, dtype: [1] * len(shape)
    # New graph, new session WITHOUT warm-starting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model([real_bucket], partitioner)
        sess.run(variables.global_variables_initializer())
        # Without warm-starting, the weights should be initialized using default
        # initializer (which is init_ops.zeros_initializer).
        self._assert_cols_to_vars(cols_to_vars,
                                  {real_bucket: [np.zeros([5, 1])]}, sess)

    # New graph, new session with warm-starting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model([real_bucket], partitioner)
        ws_util._warm_start(
            ws_util.WarmStartSettings(
                self.get_temp_dir(), vars_to_warm_start=".*real_bucketized.*"))
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warm-started.
        self._assert_cols_to_vars(cols_to_vars,
                                  {real_bucket: [prev_bucket_val]}, sess)
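A side note on the [5, 1] weight shape used above: four boundaries produce five buckets. A minimal sketch of that encoding, assuming TF 1.x graph mode and the public tf.feature_column API:

import tensorflow as tf

# Four boundaries -> five one-hot buckets: (-inf, 0), [0, 1), [1, 2), [2, 3), [3, inf).
real = tf.feature_column.numeric_column('real')
real_bucket = tf.feature_column.bucketized_column(real, boundaries=[0., 1., 2., 3.])
dense = tf.feature_column.input_layer(
    {'real': tf.constant([[-1.0], [0.5], [3.5]])}, [real_bucket])
with tf.Session() as sess:
  print(sess.run(dense))  # one-hot rows of width 5: buckets 0, 1, and 4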
Example #3
  def testWarmStart_BucketizedColumn(self):
    # Create feature column.
    real = fc.numeric_column("real")
    real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])

    # Save checkpoint from which to warm-start.
    _, prev_bucket_val = self._create_prev_run_var(
        "linear_model/real_bucketized/weights",
        shape=[5, 1],
        initializer=norms())

    partitioner = lambda shape, dtype: [1] * len(shape)
    # New graph, new session WITHOUT warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model([real_bucket], partitioner)
        sess.run(variables.global_variables_initializer())
        # Without warmstarting, the weights should be initialized using default
        # initializer (which is init_ops.zeros_initializer).
        self._assert_cols_to_vars(cols_to_vars,
                                  {real_bucket: [np.zeros([5, 1])]}, sess)

    # New graph, new session with warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model([real_bucket], partitioner)
        ws_util._warmstart(ws_util._WarmStartSettings(
            self.get_temp_dir(),
            vars_to_warmstart=".*real_bucketized.*"))
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warmstarted.
        self._assert_cols_to_vars(cols_to_vars,
                                  {real_bucket: [prev_bucket_val]}, sess)
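The two warm-start snippets above use private helpers whose names changed across TensorFlow versions (_warmstart/_WarmStartSettings vs. _warm_start/WarmStartSettings). A hedged sketch of the equivalent public path, assuming the TF 1.x estimator API and a hypothetical checkpoint directory:

import tensorflow as tf

real_bucket = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('real'), boundaries=[0., 1., 2., 3.])
ws = tf.estimator.WarmStartSettings(
    ckpt_to_initialize_from='/tmp/prev_model',  # hypothetical checkpoint directory
    vars_to_warm_start='.*real_bucketized.*')
est = tf.estimator.LinearRegressor(
    feature_columns=[real_bucket], warm_start_from=ws)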
 def setUp(self):
   self._head = canned_boosted_trees._create_regression_head(label_dimension=1)
   self._feature_columns = {
       feature_column.bucketized_column(
           feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
           BUCKET_BOUNDARIES) for i in range(NUM_FEATURES)
   }
Example #5
 def setUp(self):
   self._feature_columns = {
       feature_column.bucketized_column(
           feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
           BUCKET_BOUNDARIES)
       for i in range(NUM_FEATURES)
   }
Example #6
 def setUp(self):
   self._head = canned_boosted_trees._create_regression_head(label_dimension=1)
   self._feature_columns = {
       feature_column.bucketized_column(
           feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
           BUCKET_BOUNDARIES) for i in range(NUM_FEATURES)
   }
    def testTrainEvaluateAndPredictWithIndicatorColumn(self):
        categorical = feature_column.categorical_column_with_vocabulary_list(
            key='categorical', vocabulary_list=('bad', 'good', 'ok'))
        feature_indicator = feature_column.indicator_column(categorical)
        bucketized_col = feature_column.bucketized_column(
            feature_column.numeric_column('an_uninformative_feature',
                                          dtype=dtypes.float32),
            BUCKET_BOUNDARIES)

        labels = np.array([[0.], [5.7], [5.7], [0.], [0.]], dtype=np.float32)
        # Our categorical feature defines the labels perfectly
        input_fn = numpy_io.numpy_input_fn(x={
            'an_uninformative_feature':
            np.array([1, 1, 1, 1, 1]),
            'categorical':
            np.array(['bad', 'good', 'good', 'ok', 'bad']),
        },
                                           y=labels,
                                           batch_size=5,
                                           shuffle=False)

        # Train depth 1 tree.
        est = boosted_trees.BoostedTreesRegressor(
            feature_columns=[bucketized_col, feature_indicator],
            n_batches_per_layer=1,
            n_trees=1,
            learning_rate=1.0,
            max_depth=1)

        num_steps = 1
        est.train(input_fn, steps=num_steps)
        ensemble = self._assert_checkpoint_and_return_model(est.model_dir,
                                                            global_step=1,
                                                            finalized_trees=1,
                                                            attempted_layers=1)

        # We learnt perfectly.
        eval_res = est.evaluate(input_fn=input_fn, steps=1)
        self.assertAllClose(eval_res['loss'], 0)

        predictions = list(est.predict(input_fn))
        self.assertAllClose(labels,
                            [pred['predictions'] for pred in predictions])

        self.assertEqual(3, len(ensemble.trees[0].nodes))

        # Check that the split happened on the 'good' value, which is encoded as
        # the feature with index 2 (index 0 is the numeric feature, index 1 is 'bad').
        self.assertEqual(
            2, ensemble.trees[0].nodes[0].bucketized_split.feature_id)
        self.assertEqual(0,
                         ensemble.trees[0].nodes[0].bucketized_split.threshold)
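To see why the assertion expects feature index 2, here is a minimal sketch of the indicator encoding (TF 1.x graph mode; the estimator flattens the single bucketized feature first, then the vocabulary entries in order):

import tensorflow as tf

categorical = tf.feature_column.categorical_column_with_vocabulary_list(
    key='categorical', vocabulary_list=('bad', 'good', 'ok'))
indicator = tf.feature_column.indicator_column(categorical)
dense = tf.feature_column.input_layer(
    {'categorical': tf.constant([['bad'], ['good'], ['ok']])}, [indicator])
with tf.Session() as sess:
  sess.run(tf.tables_initializer())
  print(sess.run(dense))
# One-hot rows in vocabulary order: [1, 0, 0], [0, 1, 0], [0, 0, 1]. With the
# numeric feature at index 0 in the flattened view, 'good' lands at index 2.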
Example #8
 def setUp(self):
   self._feature_columns = {
       feature_column.bucketized_column(
           feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
           BUCKET_BOUNDARIES) for i in range(NUM_FEATURES)
   }
   self._tree_hparams = boosted_trees.TreeHParams(
       n_trees=2,
       max_depth=2,
       learning_rate=0.1,
       l1=0.,
       l2=0.01,
       tree_complexity=0.)
 def setUp(self):
   self._feature_columns = {
       feature_column.bucketized_column(
           feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
           BUCKET_BOUNDARIES) for i in range(NUM_FEATURES)
   }
   self._tree_hparams = boosted_trees._TreeHParams(  # pylint:disable=protected-access
       n_trees=2,
       max_depth=2,
       learning_rate=0.1,
       l1=0.,
       l2=0.01,
       tree_complexity=0.)
Example #10
 def setUp(self):
   self._feature_columns = {
       feature_column.bucketized_column(
           feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
           BUCKET_BOUNDARIES) for i in range(NUM_FEATURES)
   }
   self._tree_hparams = boosted_trees._TreeHParams(  # pylint:disable=protected-access
       n_trees=2,
       max_depth=2,
       learning_rate=0.1,
       l1=0.,
       l2=0.01,
       tree_complexity=0.,
       min_node_weight=0.)
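These hparams tuples feed internal boosted-trees plumbing. For reference, a hedged sketch of the same settings through the public constructor; keyword names follow tf.estimator.BoostedTreesRegressor in TF 1.x, and feature_columns is assumed to be the set built in setUp:

import tensorflow as tf

est = tf.estimator.BoostedTreesRegressor(
    feature_columns=feature_columns,  # e.g. the bucketized columns from setUp
    n_batches_per_layer=1,
    n_trees=2,
    max_depth=2,
    learning_rate=0.1,
    l1_regularization=0.,
    l2_regularization=0.01,
    tree_complexity=0.,
    min_node_weight=0.)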
Example #11
  def testTrainEvaluateAndPredictWithIndicatorColumn(self):
    categorical = feature_column.categorical_column_with_vocabulary_list(
        key='categorical', vocabulary_list=('bad', 'good', 'ok'))
    feature_indicator = feature_column.indicator_column(categorical)
    bucketized_col = feature_column.bucketized_column(
        feature_column.numeric_column(
            'an_uninformative_feature', dtype=dtypes.float32),
        BUCKET_BOUNDARIES)

    labels = np.array([[0.], [5.7], [5.7], [0.], [0.]], dtype=np.float32)
    # Our categorical feature defines the labels perfectly
    input_fn = numpy_io.numpy_input_fn(
        x={
            'an_uninformative_feature': np.array([1, 1, 1, 1, 1]),
            'categorical': np.array(['bad', 'good', 'good', 'ok', 'bad']),
        },
        y=labels,
        batch_size=5,
        shuffle=False)

    # Train depth 1 tree.
    est = boosted_trees.BoostedTreesRegressor(
        feature_columns=[bucketized_col, feature_indicator],
        n_batches_per_layer=1,
        n_trees=1,
        learning_rate=1.0,
        max_depth=1)

    num_steps = 1
    est.train(input_fn, steps=num_steps)
    ensemble = self._assert_checkpoint_and_return_model(
        est.model_dir, global_step=1, finalized_trees=1, attempted_layers=1)

    # We learnt perfectly.
    eval_res = est.evaluate(input_fn=input_fn, steps=1)
    self.assertAllClose(eval_res['loss'], 0)

    predictions = list(est.predict(input_fn))
    self.assertAllClose(
        labels,
        [pred['predictions'] for pred in predictions])

    self.assertEqual(3, len(ensemble.trees[0].nodes))

    # Check that the split happened on the 'good' value, which is encoded as
    # the feature with index 2 (index 0 is the numeric feature, index 1 is 'bad').
    self.assertEqual(2, ensemble.trees[0].nodes[0].bucketized_split.feature_id)
    self.assertEqual(0, ensemble.trees[0].nodes[0].bucketized_split.threshold)
  def testWarmStart_MultipleCols(self):
    # Create vocab for sparse column "sc_vocab".
    vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                   "vocab")

    # Create feature columns.
    sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)
    sc_hash = fc.categorical_column_with_hash_bucket(
        "sc_hash", hash_bucket_size=15)
    sc_keys = fc.categorical_column_with_vocabulary_list(
        "sc_keys", vocabulary_list=["a", "b", "c", "e"])
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=vocab_path, vocabulary_size=4)
    real = fc.numeric_column("real")
    real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])
    cross = fc.crossed_column([sc_keys, sc_vocab], hash_bucket_size=20)
    all_linear_cols = [sc_int, sc_hash, sc_keys, sc_vocab, real_bucket, cross]

    # Save checkpoint from which to warm-start.  Also create a bias variable,
    # so we can check that it's also warm-started.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        sc_int_weights = variable_scope.get_variable(
            "linear_model/sc_int/weights", shape=[10, 1], initializer=ones())
        sc_hash_weights = variable_scope.get_variable(
            "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
        sc_keys_weights = variable_scope.get_variable(
            "linear_model/sc_keys/weights", shape=[4, 1], initializer=rand())
        sc_vocab_weights = variable_scope.get_variable(
            "linear_model/sc_vocab/weights", shape=[4, 1], initializer=ones())
        real_bucket_weights = variable_scope.get_variable(
            "linear_model/real_bucketized/weights",
            shape=[5, 1],
            initializer=norms())
        cross_weights = variable_scope.get_variable(
            "linear_model/sc_keys_X_sc_vocab/weights",
            shape=[20, 1],
            initializer=rand())
        bias = variable_scope.get_variable(
            "linear_model/bias_weights",
            shape=[1],
            initializer=rand())
        self._write_checkpoint(sess)
        (prev_int_val, prev_hash_val, prev_keys_val, prev_vocab_val,
         prev_bucket_val, prev_cross_val, prev_bias_val) = sess.run([
             sc_int_weights, sc_hash_weights, sc_keys_weights, sc_vocab_weights,
             real_bucket_weights, cross_weights, bias
         ])

    partitioner = lambda shape, dtype: [1] * len(shape)
    # New graph, new session WITHOUT warm-starting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
        sess.run(variables.global_variables_initializer())
        # Without warm-starting, all weights should be initialized using default
        # initializer (which is init_ops.zeros_initializer).
        self._assert_cols_to_vars(cols_to_vars, {
            sc_int: [np.zeros([10, 1])],
            sc_hash: [np.zeros([15, 1])],
            sc_keys: [np.zeros([4, 1])],
            sc_vocab: [np.zeros([4, 1])],
            real_bucket: [np.zeros([5, 1])],
            cross: [np.zeros([20, 1])],
        }, sess)

    # New graph, new session with warm-starting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
        vocab_info = ws_util.VocabInfo(
            new_vocab=sc_vocab.vocabulary_file,
            new_vocab_size=sc_vocab.vocabulary_size,
            num_oov_buckets=sc_vocab.num_oov_buckets,
            old_vocab=vocab_path)
        ws_util._warm_start(
            ws_util.WarmStartSettings(
                self.get_temp_dir(),
                var_name_to_vocab_info={
                    "linear_model/sc_vocab/weights": vocab_info
                }))
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warm-started.
        self._assert_cols_to_vars(cols_to_vars, {
            sc_int: [prev_int_val],
            sc_hash: [prev_hash_val],
            sc_keys: [prev_keys_val],
            sc_vocab: [prev_vocab_val],
            real_bucket: [prev_bucket_val],
            cross: [prev_cross_val],
            "bias": [prev_bias_val],
        }, sess)
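The vocabulary remapping above also has a public counterpart. A hedged sketch using tf.estimator (TF 1.x names; the vocab and checkpoint paths are hypothetical):

import tensorflow as tf

vocab_info = tf.estimator.VocabInfo(
    new_vocab='/tmp/new_vocab.txt',   # hypothetical vocabulary files
    new_vocab_size=4,
    num_oov_buckets=0,
    old_vocab='/tmp/old_vocab.txt')
ws = tf.estimator.WarmStartSettings(
    ckpt_to_initialize_from='/tmp/prev_model',
    var_name_to_vocab_info={'linear_model/sc_vocab/weights': vocab_info})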
Example #13
  def testWarmStart_MultipleCols(self):
    # Create vocab for sparse column "sc_vocab".
    vocab_path = self._write_vocab(["apple", "banana", "guava", "orange"],
                                   "vocab")

    # Create feature columns.
    sc_int = fc.categorical_column_with_identity("sc_int", num_buckets=10)
    sc_hash = fc.categorical_column_with_hash_bucket(
        "sc_hash", hash_bucket_size=15)
    sc_keys = fc.categorical_column_with_vocabulary_list(
        "sc_keys", vocabulary_list=["a", "b", "c", "e"])
    sc_vocab = fc.categorical_column_with_vocabulary_file(
        "sc_vocab", vocabulary_file=vocab_path, vocabulary_size=4)
    real = fc.numeric_column("real")
    real_bucket = fc.bucketized_column(real, boundaries=[0., 1., 2., 3.])
    cross = fc.crossed_column([sc_keys, sc_vocab], hash_bucket_size=20)
    all_linear_cols = [sc_int, sc_hash, sc_keys, sc_vocab, real_bucket, cross]

    # Save checkpoint from which to warm-start.  Also create a bias variable,
    # so we can check that it's also warmstarted.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        sc_int_weights = variable_scope.get_variable(
            "linear_model/sc_int/weights", shape=[10, 1], initializer=ones())
        sc_hash_weights = variable_scope.get_variable(
            "linear_model/sc_hash/weights", shape=[15, 1], initializer=norms())
        sc_keys_weights = variable_scope.get_variable(
            "linear_model/sc_keys/weights", shape=[4, 1], initializer=rand())
        sc_vocab_weights = variable_scope.get_variable(
            "linear_model/sc_vocab/weights", shape=[4, 1], initializer=ones())
        real_bucket_weights = variable_scope.get_variable(
            "linear_model/real_bucketized/weights",
            shape=[5, 1],
            initializer=norms())
        cross_weights = variable_scope.get_variable(
            "linear_model/sc_keys_X_sc_vocab/weights",
            shape=[20, 1],
            initializer=rand())
        bias = variable_scope.get_variable(
            "linear_model/bias_weights",
            shape=[1],
            initializer=rand())
        self._write_checkpoint(sess)
        (prev_int_val, prev_hash_val, prev_keys_val, prev_vocab_val,
         prev_bucket_val, prev_cross_val, prev_bias_val) = sess.run([
             sc_int_weights, sc_hash_weights, sc_keys_weights, sc_vocab_weights,
             real_bucket_weights, cross_weights, bias
         ])

    partitioner = lambda shape, dtype: [1] * len(shape)
    # New graph, new session WITHOUT warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
        sess.run(variables.global_variables_initializer())
        # Without warmstarting, all weights should be initialized using default
        # initializer (which is init_ops.zeros_initializer).
        self._assert_cols_to_vars(cols_to_vars, {
            sc_int: [np.zeros([10, 1])],
            sc_hash: [np.zeros([15, 1])],
            sc_keys: [np.zeros([4, 1])],
            sc_vocab: [np.zeros([4, 1])],
            real_bucket: [np.zeros([5, 1])],
            cross: [np.zeros([20, 1])],
        }, sess)

    # New graph, new session with warmstarting.
    with ops.Graph().as_default() as g:
      with self.test_session(graph=g) as sess:
        cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
        vocab_info = ws_util._VocabInfo(
            new_vocab=sc_vocab.vocabulary_file,
            new_vocab_size=sc_vocab.vocabulary_size,
            num_oov_buckets=sc_vocab.num_oov_buckets,
            old_vocab=vocab_path
        )
        ws_util._warmstart(
            ws_util._WarmStartSettings(
                self.get_temp_dir(),
                var_name_to_vocab_info={
                    "linear_model/sc_vocab/weights": vocab_info
                }))
        sess.run(variables.global_variables_initializer())
        # Verify weights were correctly warmstarted.
        self._assert_cols_to_vars(cols_to_vars, {
            sc_int: [prev_int_val],
            sc_hash: [prev_hash_val],
            sc_keys: [prev_keys_val],
            sc_vocab: [prev_vocab_val],
            real_bucket: [prev_bucket_val],
            cross: [prev_cross_val],
            "bias": [prev_bias_val],
        }, sess)
Example #14
    def _build_feature_columns(self):
        """Builds and groups the feature columns described by the data config."""
        multi_hot_feature_columns = {}
        multi_hot_feature_columns_deep = {}
        multi_category_feature_columns = {}
        continuous_feature_columns = {}
        crossed_feature_columns = []
        bucketized_feature_columns = []
        embedding_feature_columns = []

        if self._data_conf.multi_hot_columns is not None:
            for column in self._data_conf.multi_hot_columns:
                multi_hot_feature_columns[
                    column] = categorical_column_with_vocabulary_list(
                        column,
                        self._data_conf.multi_hot_columns[column],
                        dtype=tf.string)
                multi_hot_feature_columns_deep[column] = indicator_column(
                    multi_hot_feature_columns[column])

        if self._data_conf.multi_category_columns is not None:
            multi_category_feature_columns = {
                column:
                categorical_column_with_hash_bucket(column,
                                                    hash_bucket_size=1000)
                for column in self._data_conf.multi_category_columns
            }

        if self._data_conf.continuous_columns is not None:
            continuous_feature_columns = {
                column: numeric_column(column)
                for column in self._data_conf.continuous_columns
            }

        if self._data_conf.crossed_columns is not None:
            crossed_feature_columns = [
                crossed_column(keys, hash_bucket_size=100000)
                for keys in self._data_conf.crossed_columns
            ]

        if self._data_conf.bucketized_columns is not None:
            # Bucketize each configured continuous column with its boundaries.
            for column, boundary in self._data_conf.bucketized_columns.items():
                bucketized_feature_columns.append(
                    bucketized_column(continuous_feature_columns[column],
                                      boundaries=boundary))

        if multi_category_feature_columns:
            embedding_feature_columns = [
                embedding_column(
                    column, dimension=self._model_conf.embedding_dimension)
                for column in multi_category_feature_columns.values()
            ]

        self._feature_mapping = {
            0: list(multi_hot_feature_columns.values()),
            1: list(multi_category_feature_columns.values()),
            2: list(continuous_feature_columns.values()),
            3: crossed_feature_columns,
            4: bucketized_feature_columns,
            5: embedding_feature_columns,
            6: list(multi_hot_feature_columns_deep.values())
        }

        self._build_feature_columns_for_model()
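A hedged sketch of the kind of configuration object _build_feature_columns expects; the attribute names mirror the ones read above, but the values are illustrative and not taken from the original project:

class _ExampleDataConf(object):
    # Hypothetical column configuration consumed by _build_feature_columns.
    multi_hot_columns = {'genres': ['action', 'comedy', 'drama']}
    multi_category_columns = ['device_id', 'city']
    continuous_columns = ['age', 'price']
    crossed_columns = [('city', 'device_id')]
    bucketized_columns = {'age': [18., 25., 35., 50., 65.]}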