def testBiasEnsembleMultiClass(self):
    """Checks predictions of a one-tree ensemble whose leaf holds two values."""
    with self.test_session():
      # One finalized tree whose only node is a leaf.
      ensemble_proto = tree_config_pb2.DecisionTreeEnsembleConfig()
      bias_tree = ensemble_proto.trees.add()
      ensemble_proto.tree_metadata.add().is_finalized = True
      bias_leaf = bias_tree.nodes.add().leaf
      for index, value in ((0, -0.4), (1, 0.9)):
        _append_to_leaf(bias_leaf, index, value)

      ensemble_proto.tree_weights.append(1.0)

      ensemble_handle = model_ops.tree_ensemble_variable(
          stamp_token=0,
          tree_ensemble_config=ensemble_proto.SerializeToString(),
          name="multiclass")
      resources.initialize_resources(resources.shared_resources()).run()

      # Learner config with num_classes set to 3.
      config_proto = learner_pb2.LearnerConfig()
      config_proto.num_classes = 3

      predictions, dropout = self._get_predictions(
          ensemble_handle,
          learner_config=config_proto.SerializeToString(),
          reduce_dim=True)
      expected_row = [-0.4, 0.9]
      self.assertAllClose([expected_row, expected_row], predictions.eval())

      # The dropout info is expected to be empty.
      self.assertAllEqual([[], []], dropout.eval())
  def testTreeFinalized(self):
    """Partitions examples over an ensemble whose single tree is finalized."""
    with self.test_session():
      ensemble_proto = tree_config_pb2.DecisionTreeEnsembleConfig()
      # Depth 3 tree: seven nodes appended in order (three splits, four leaves).
      finalized_tree = ensemble_proto.trees.add()
      dense_split_node = finalized_tree.nodes.add()
      _set_float_split(dense_split_node.dense_float_binary_split, 0, 9.0, 1, 2)
      sparse_split_node = finalized_tree.nodes.add()
      _set_float_split(
          sparse_split_node.sparse_float_binary_split_default_left.split,
          0, -20.0, 3, 4)
      _append_to_leaf(finalized_tree.nodes.add().leaf, 0, 0.2)
      _append_to_leaf(finalized_tree.nodes.add().leaf, 0, 0.3)
      categorical_split_node = finalized_tree.nodes.add()
      _set_categorical_id_split(
          categorical_split_node.categorical_id_binary_split, 0, 9, 5, 6)
      _append_to_leaf(finalized_tree.nodes.add().leaf, 0, 0.5)
      _append_to_leaf(finalized_tree.nodes.add().leaf, 0, 0.6)

      ensemble_proto.tree_weights.append(1.0)
      ensemble_proto.tree_metadata.add().is_finalized = True

      ensemble_handle = model_ops.tree_ensemble_variable(
          stamp_token=0,
          tree_ensemble_config=ensemble_proto.SerializeToString(),
          name="full_ensemble")
      resources.initialize_resources(resources.shared_resources()).run()

      # Group the sparse float inputs before handing them to the op.
      sparse_float_indices = [
          self._sparse_float_indices1, self._sparse_float_indices2
      ]
      sparse_float_values = [
          self._sparse_float_values1, self._sparse_float_values2
      ]
      sparse_float_shapes = [
          self._sparse_float_shape1, self._sparse_float_shape2
      ]
      partition_ids = prediction_ops.gradient_trees_partition_examples(
          ensemble_handle,
          [self._dense_float_tensor],
          sparse_float_indices,
          sparse_float_values,
          sparse_float_shapes,
          [self._sparse_int_indices1],
          [self._sparse_int_values1],
          [self._sparse_int_shape1])

      # Both examples are expected to get partition id 0.
      self.assertAllEqual([0, 0], partition_ids.eval())
Beispiel #3
0
    def __init__(self, num_quantiles, epsilon, serialized_tf_config=None):
        """Stores the settings and builds a private session and accumulator.

        Args:
          num_quantiles: forwarded to the QuantileAccumulator.
          epsilon: forwarded to the QuantileAccumulator.
          serialized_tf_config: (optional) value passed to
            _maybe_deserialize_tf_config to obtain the session config.
        """
        self._num_quantiles = num_quantiles
        self._epsilon = epsilon
        self._serialized_tf_config = serialized_tf_config

        # The stamp token normally commits qaccumulator state. Here the state
        # is returned in full and kept as part of quantile_state/summary in
        # the combiner fn (i.e. outside the qaccumulator), so the stamp
        # mechanism is not used to signal progress and the token stays at 0.
        self._stamp_token = 0
        # Stands for an empty summary. Could become a tf.constant provided by
        # the quantile ops library.
        self._empty_summary = None

        # The quantile ops get a dedicated graph and session.
        quantile_graph = tf.Graph()
        session_config = _maybe_deserialize_tf_config(serialized_tf_config)
        self._session = tf.Session(graph=quantile_graph, config=session_config)
        with self._session.graph.as_default(), self._session.as_default():
            self._qaccumulator = quantile_ops.QuantileAccumulator(
                init_stamp_token=self._stamp_token,
                num_quantiles=self._num_quantiles,
                epsilon=self._epsilon,
                name='qaccumulator')
            init_op = resources.initialize_resources(
                resources.shared_resources())
            init_op.run()
  def testAverageMoreThanNumTreesExist(self):
    """Averaging over more trees than exist matches a hand-weighted ensemble."""
    with self.test_session():
      ensemble_proto = tree_config_pb2.DecisionTreeEnsembleConfig()
      reference_proto = tree_config_pb2.DecisionTreeEnsembleConfig()
      # Asking to average over more trees than are present means the
      # averaging spans every tree.
      num_trees = 100
      for tree_index in range(num_trees):
        tested_tree = ensemble_proto.trees.add()
        _append_to_leaf(tested_tree.nodes.add().leaf, 0, -0.4)
        ensemble_proto.tree_metadata.add().is_finalized = True
        ensemble_proto.tree_weights.append(1.0)

        # The reference ensemble carries the weight each tree should end up
        # with after averaging.
        reference_tree = reference_proto.trees.add()
        _append_to_leaf(reference_tree.nodes.add().leaf, 0, -0.4)
        reference_proto.tree_metadata.add().is_finalized = True
        averaged_weight = 1.0 * (num_trees - tree_index) / num_trees
        reference_proto.tree_weights.append(averaged_weight)

      # Learner config WITH averaging: only 100 trees exist but averaging over
      # the last 250 is requested.
      averaging_config = learner_pb2.LearnerConfig()
      averaging_config.num_classes = 2
      averaging_config.averaging_config.average_last_n_trees = 250

      # Learner config without any averaging.
      plain_config = learner_pb2.LearnerConfig()
      plain_config.num_classes = 2

      ensemble_handle = model_ops.tree_ensemble_variable(
          stamp_token=0,
          tree_ensemble_config=ensemble_proto.SerializeToString(),
          name="existing")

      # How the ensemble is expected to "look" while averaging is applied.
      reference_handle = model_ops.tree_ensemble_variable(
          stamp_token=0,
          tree_ensemble_config=reference_proto.SerializeToString(),
          name="adjusted")

      resources.initialize_resources(resources.shared_resources()).run()

      averaged_result, averaged_dropout = self._get_predictions(
          ensemble_handle,
          averaging_config.SerializeToString(),
          apply_averaging=True,
          reduce_dim=True)

      reference_result, reference_dropout = self._get_predictions(
          reference_handle,
          plain_config.SerializeToString(),
          apply_averaging=False,
          reduce_dim=True)

      self.assertAllEqual(averaged_result.eval(), reference_result.eval())
      self.assertAllEqual(averaged_dropout.eval(), reference_dropout.eval())
  def testCachedPredictionOnEmptyEnsemble(self):
    """Runs training_predict on an ensemble built from an empty proto."""
    with self.cached_session() as session:
      # Dummy ensemble: the serialized proto is an empty string.
      dummy_ensemble = boosted_trees_ops.TreeEnsemble(
          'ensemble', serialized_proto='')
      ensemble_handle = dummy_ensemble.resource_handle
      resources.initialize_resources(resources.shared_resources()).run()

      # There are no previously cached values.
      cached_tree_ids = [0, 0]
      cached_node_ids = [0, 0]

      # Two features (0 and 1); their values are irrelevant for a dummy
      # ensemble.
      bucketized_features = [[67, 5], [9, 17]]

      # Build the prediction op against the dummy ensemble.
      predict_op = boosted_trees_ops.training_predict(
          ensemble_handle,
          cached_tree_ids=cached_tree_ids,
          cached_node_ids=cached_node_ids,
          bucketized_features=bucketized_features,
          logits_dimension=1)

      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)

      # The cached ids come back unchanged and the logits updates are zero.
      self.assertAllClose(cached_tree_ids, new_tree_ids)
      self.assertAllClose(cached_node_ids, new_node_ids)
      self.assertAllClose([[0], [0]], logits_updates)
  def testBasicQuantileBucketsMultipleResources(self):
    """Bucketizes two features using one quantile resource per feature."""
    with self.test_session() as sess:
      # One accumulator resource per feature, created in feature order.
      handles = [
          self.create_resource(name, self.eps, self.max_elements)
          for name in ("float_0", "float_1")
      ]
      resources.initialize_resources(resources.shared_resources()).run()

      features = [self._feature_0, self._feature_1]
      summaries = boosted_trees_ops.make_quantile_summaries(
          features, self._example_weights, epsilon=self.eps)
      # Each resource only receives the summary of its own feature.
      add_ops = [
          boosted_trees_ops.quantile_add_summaries(handle, [summaries[i]])
          for i, handle in enumerate(handles)
      ]
      flush_ops = [
          boosted_trees_ops.quantile_flush(handle, self.num_quantiles)
          for handle in handles
      ]
      boundaries = [
          boosted_trees_ops.get_bucket_boundaries(handle, num_features=1)
          for handle in handles
      ]
      quantiles = boosted_trees_ops.boosted_trees_bucketize(
          features, boundaries[0] + boundaries[1])

      # Add the summaries first, then flush both resources.
      sess.run(add_ops)
      sess.run(flush_ops)

      self.assertAllClose(self._feature_0_boundaries, boundaries[0][0].eval())
      self.assertAllClose(self._feature_1_boundaries, boundaries[1][0].eval())

      self.assertAllClose(self._feature_0_quantiles, quantiles[0].eval())
      self.assertAllClose(self._feature_1_quantiles, quantiles[1].eval())
Beispiel #7
0
  def testBasicQuantileBucketsMultipleResources(self):
    """Checks boundaries and quantiles with a separate resource per feature."""
    with self.cached_session() as sess:
      handle_a = self.create_resource(
          "float_0", self.eps, self.max_elements)
      handle_b = self.create_resource(
          "float_1", self.eps, self.max_elements)
      resources.initialize_resources(resources.shared_resources()).run()

      feature_columns = [self._feature_0, self._feature_1]
      summaries = boosted_trees_ops.make_quantile_summaries(
          feature_columns, self._example_weights, epsilon=self.eps)

      # Summary i is added to resource i only.
      add_a = boosted_trees_ops.quantile_add_summaries(
          handle_a, [summaries[0]])
      add_b = boosted_trees_ops.quantile_add_summaries(
          handle_b, [summaries[1]])
      flush_a = boosted_trees_ops.quantile_flush(handle_a, self.num_quantiles)
      flush_b = boosted_trees_ops.quantile_flush(handle_b, self.num_quantiles)
      boundaries_a = boosted_trees_ops.get_bucket_boundaries(
          handle_a, num_features=1)
      boundaries_b = boosted_trees_ops.get_bucket_boundaries(
          handle_b, num_features=1)
      quantiles = boosted_trees_ops.boosted_trees_bucketize(
          feature_columns, boundaries_a + boundaries_b)

      sess.run([add_a, add_b])
      sess.run([flush_a, flush_b])

      # Evaluated in the same order: boundaries first, then quantiles.
      checks = (
          (self._feature_0_boundaries, boundaries_a[0]),
          (self._feature_1_boundaries, boundaries_b[0]),
          (self._feature_0_quantiles, quantiles[0]),
          (self._feature_1_quantiles, quantiles[1]),
      )
      for expected, actual in checks:
        self.assertAllClose(expected, actual.eval())
  def testCreate(self):
    """Creates an ensemble variable, then checks its prediction and stamp."""
    with self.cached_session():
      # Ensemble with a single tree that consists of one leaf.
      ensemble_proto = tree_config_pb2.DecisionTreeEnsembleConfig()
      only_tree = ensemble_proto.trees.add()
      _append_to_leaf(only_tree.nodes.add().leaf, 0, -0.4)
      ensemble_proto.tree_weights.append(1.0)

      # Learner config with num_classes set to 2.
      config_proto = learner_pb2.LearnerConfig()
      config_proto.num_classes = 2

      ensemble_handle = model_ops.tree_ensemble_variable(
          stamp_token=3,
          tree_ensemble_config=ensemble_proto.SerializeToString(),
          name="create_tree")
      resources.initialize_resources(resources.shared_resources()).run()

      sparse_float_indices = [
          self._sparse_float_indices1, self._sparse_float_indices2
      ]
      sparse_float_values = [
          self._sparse_float_values1, self._sparse_float_values2
      ]
      sparse_float_shapes = [
          self._sparse_float_shape1, self._sparse_float_shape2
      ]
      predictions, _ = prediction_ops.gradient_trees_prediction(
          ensemble_handle,
          self._seed,
          [self._dense_float_tensor],
          sparse_float_indices,
          sparse_float_values,
          sparse_float_shapes,
          [self._sparse_int_indices1],
          [self._sparse_int_values1],
          [self._sparse_int_shape1],
          learner_config=config_proto.SerializeToString(),
          apply_dropout=False,
          apply_averaging=False,
          center_bias=False,
          reduce_dim=True)
      self.assertAllClose(predictions.eval(), [[-0.4], [-0.4]])

      # The stamp token passed at creation time must be readable back.
      stamp_token = model_ops.tree_ensemble_stamp_token(ensemble_handle)
      self.assertEqual(stamp_token.eval(), 3)
 def testBasicCallableParams(self):
   """Applies one SGD step with a callable learning rate for each dtype."""
   for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
     with self.cached_session():

       def learning_rate():
         return 3.0

       var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
       var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
       grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
       grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
       optimizer = gradient_descent.GradientDescentOptimizer(learning_rate)
       sgd_op = optimizer.apply_gradients([(grads0, var0), (grads1, var1)])
       # TODO(apassos) calling initialize_resources on all resources here
       # doesn't work because the sessions and graph are reused across unit
       # tests and this would mean trying to reinitialize variables. Figure out
       # a long-term solution for this.
       resources.initialize_resources([var0, var1]).run()
       # The variables start at their initial values.
       self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
       self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
       # A single SGD step.
       sgd_op.run()
       # Each parameter moved by learning rate (3.0) times its gradient.
       expected_var0 = [1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1]
       expected_var1 = [3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01]
       self.assertAllCloseAccordingToType(expected_var0, var0.eval())
       self.assertAllCloseAccordingToType(expected_var1, var1.eval())
Beispiel #10
0
 def testBasicCallableParams(self):
     """Runs a single SGD update whose learning rate is given as a callable."""
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
         with self.cached_session():
             var0 = resource_variable_ops.ResourceVariable(
                 [1.0, 2.0], dtype=dtype)
             var1 = resource_variable_ops.ResourceVariable(
                 [3.0, 4.0], dtype=dtype)
             grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
             grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)

             def constant_lr():
                 return 3.0

             grads_and_vars = [(grads0, var0), (grads1, var1)]
             sgd_op = gradient_descent.SGD(constant_lr).apply_gradients(
                 grads_and_vars)
             # TODO(apassos) calling initialize_resources on all resources here
             # doesn't work because the sessions and graph are reused across unit
             # tests and this would mean trying to reinitialize variables. Figure out
             # a long-term solution for this.
             resources.initialize_resources([var0, var1]).run()
             # Initial values must be intact before the update.
             self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
             self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
             # Exactly one SGD step.
             sgd_op.run()
             # value - 3.0 * gradient for every element.
             updated_var0 = [1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1]
             updated_var1 = [3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01]
             self.assertAllCloseAccordingToType(updated_var0, var0.eval())
             self.assertAllCloseAccordingToType(updated_var1, var1.eval())
  def _testStreamingQuantileBucketsHelper(
      self, inputs, num_quantiles=3, expected_buckets=None):
    """Feeds `inputs` to a quantile accumulator and checks the flushed buckets.

    Args:
      inputs: values fed as the input column; each gets an example weight of 1.
      num_quantiles: passed to the accumulator; num_quantiles + 1 buckets are
        expected back.
      expected_buckets: when truthy, the exact buckets expected after flushing.
    """
    # generate_quantiles is switched on because the test would otherwise
    # generate fewer boundaries.
    with self.test_session():
      quantile_accumulator = quantile_ops.QuantileAccumulator(
          init_stamp_token=0,
          num_quantiles=num_quantiles,
          epsilon=0.001,
          name="q1",
          generate_quantiles=True)
      resources.initialize_resources(resources.shared_resources()).run()
    column_ph = array_ops.placeholder(dtypes.float32)
    weights_ph = array_ops.placeholder(dtypes.float32)
    add_summary_op = quantile_accumulator.add_summary(
        stamp_token=0, column=column_ph, example_weights=weights_ph)

    with self.test_session() as sess:
      feed = {column_ph: inputs, weights_ph: [1] * len(inputs)}
      sess.run(add_summary_op, feed)

    with self.test_session() as sess:
      # Flush moves the stamp from 0 to 1; buckets are then read at stamp 1.
      sess.run(quantile_accumulator.flush(stamp_token=0, next_stamp_token=1))
      ready_op, buckets_op = quantile_accumulator.get_buckets(stamp_token=1)
      buckets, are_ready_flush = sess.run([buckets_op, ready_op])
      self.assertEqual(True, are_ready_flush)
      # With the default of 3 quantiles this means 4 boundaries.
      self.assertEqual(num_quantiles + 1, len(buckets))
      if expected_buckets:
        self.assertAllEqual(buckets, expected_buckets)
  def _get_train_op_and_ensemble(self, head, config, is_classification,
                                 train_in_memory):
    """Builds the TRAIN estimator spec via _bt_model_fn.

    Returns:
      A (train_op, ensemble_serialized) pair; the serialization op is created
      under a control dependency on train_op.
    """
    input_fn = _make_train_input_fn(is_classification)
    features, labels = input_fn()
    model_fn_kwargs = dict(
        features=features,
        labels=labels,
        mode=model_fn.ModeKeys.TRAIN,
        head=head,
        feature_columns=self._feature_columns,
        tree_hparams=self._tree_hparams,
        example_id_column_name=EXAMPLE_ID_COLUMN,
        n_batches_per_layer=1,
        config=config,
        train_in_memory=train_in_memory)
    # pylint: disable=protected-access
    estimator_spec = boosted_trees._bt_model_fn(**model_fn_kwargs)
    # pylint: enable=protected-access

    # Initialize resources first, then global and local variables.
    resources.initialize_resources(resources.shared_resources()).run()
    variables.global_variables_initializer().run()
    variables.local_variables_initializer().run()

    # Exactly one shared resource is expected; its handle is serialized.
    ensemble_resources = resources.shared_resources()
    self.assertEqual(1, len(ensemble_resources))
    train_op = estimator_spec.train_op
    ensemble_resource_handle = ensemble_resources[0].handle
    with ops.control_dependencies([train_op]):
      _, ensemble_serialized = (
          gen_boosted_trees_ops.boosted_trees_serialize_ensemble(
              ensemble_resource_handle))
    return train_op, ensemble_serialized
  def testStreamingQuantileBuckets(self):
    """Streams 100 batches through the accumulator and checks the buckets.

    Batch i holds the 100 evenly spaced values from i * 100 to
    (i + 1) * 100 - 1:
    [0 1 .. 99]
    [100 101 .. 199]
    ...
    [9900 9901 .. 9999]
    Every example weight is 1.
    """
    num_batches = 100
    with self.test_session():
      quantile_accumulator = quantile_ops.QuantileAccumulator(
          init_stamp_token=0, num_quantiles=3, epsilon=0.01, name="q1")
      resources.initialize_resources(resources.shared_resources()).run()
    weights_ph = array_ops.placeholder(dtypes.float32)
    column_ph = array_ops.placeholder(dtypes.float32)
    add_summary_op = quantile_accumulator.add_summary(
        stamp_token=0, column=column_ph, example_weights=weights_ph)
    with self.test_session() as sess:
      for batch_index in range(num_batches):
        first_value = batch_index * 100
        last_value = (batch_index + 1) * 100 - 1
        batch_column = np.linspace(
            first_value, last_value, num=100).reshape(-1, 1)
        batch_weights = np.ones(shape=(100, 1), dtype=np.float32)
        sess.run(add_summary_op, {
            column_ph: batch_column,
            weights_ph: batch_weights,
        })

    with self.test_session() as sess:
      # Flush moves the stamp from 0 to 1; buckets are then read at stamp 1.
      sess.run(quantile_accumulator.flush(stamp_token=0, next_stamp_token=1))
      ready_op, buckets_op = quantile_accumulator.get_buckets(stamp_token=1)
      buckets, are_ready_flush = sess.run([buckets_op, ready_op])
      self.assertEqual(True, are_ready_flush)
      self.assertAllEqual([0, 3335., 6671., 9999.], buckets)
Beispiel #14
0
    def initialize_local_state(self, tf_config=None):
        """Builds the non-pickleable local state: a session and an accumulator.

        Called by the CombineFnWrapper's __init__ method. It is used in
        conjunction with overriding __reduce__ so this state is not pickled,
        and it must be called prior to any other method.

        Args:
          tf_config: (optional) A tf.ConfigProto
        """
        # The stamp token normally commits qaccumulator state. Here the state
        # is returned in full and kept as part of quantile_state/summary in
        # the combiner fn (i.e. outside the qaccumulator), so the stamp
        # mechanism is not used to signal progress and the token stays at 0.
        self._stamp_token = 0
        # Stands for an empty summary. Could become a tf.constant provided by
        # the quantile ops library.
        self._empty_summary = None

        # The quantile ops get a dedicated graph and session.
        quantile_graph = tf.Graph()
        self._session = tf.Session(graph=quantile_graph, config=tf_config)
        with self._session.graph.as_default(), self._session.as_default():
            self._qaccumulator = quantile_ops.QuantileAccumulator(
                init_stamp_token=self._stamp_token,
                num_quantiles=self._num_quantiles,
                epsilon=self._epsilon,
                name='qaccumulator')
            init_op = resources.initialize_resources(
                resources.shared_resources())
            init_op.run()
Beispiel #15
0
    def testContribsForOnlyABiasNode(self):
        """Tests the debug outputs when the ensemble is a single bias node.

        For example, this could happen if the final ensemble contains one
        tree that got pruned up to the root.
        """
        ensemble_text = """
        trees {
          nodes {
            leaf {
              scalar: 1.72
            }
          }
        }
        tree_weights: 0.1
        tree_metadata: {
          num_layers_grown: 0
        }
      """
        with self.test_session() as session:
            tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
            text_format.Merge(ensemble_text, tree_ensemble_config)

            ensemble = boosted_trees_ops.TreeEnsemble(
                'ensemble',
                serialized_proto=tree_ensemble_config.SerializeToString())
            ensemble_handle = ensemble.resource_handle
            resources.initialize_resources(resources.shared_resources()).run()

            # None of the three features is used by the ensemble.
            bucketized_features = [
                [36, 32],
                [13, -29],
                [11, 27],
            ]

            # Expected logits are computed by traversing the logit path and
            # subtracting child logits from parent logits.
            bias = 1.72 * 0.1  # Root node of tree_0.
            expected_feature_ids = ((), ())
            expected_logits_paths = ((bias, ), (bias, ))

            debug_op = boosted_trees_ops.example_debug_outputs(
                ensemble_handle,
                bucketized_features=bucketized_features,
                logits_dimension=1)

            # Parse every serialized DebugOutput, then split out the fields.
            parsed_outputs = []
            for serialized_output in session.run(debug_op):
                debug_output = boosted_trees_pb2.DebugOutput()
                debug_output.ParseFromString(serialized_output)
                parsed_outputs.append(debug_output)
            feature_ids = [out.feature_ids for out in parsed_outputs]
            logits_paths = [out.logits_path for out in parsed_outputs]

            self.assertAllClose(feature_ids, expected_feature_ids)
            self.assertAllClose(logits_paths, expected_logits_paths)
Beispiel #16
0
 def testMinimizeResourceVariable(self):
     """Minimizes a squared prediction over resource variables for one step."""
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
         with self.cached_session():
             var0 = resource_variable_ops.ResourceVariable(
                 [[1.0, 2.0]], dtype=dtype)
             var1 = resource_variable_ops.ResourceVariable(
                 [3.0], dtype=dtype)
             x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
             # loss = (var0 @ x + var1) ** 2
             pred = math_ops.matmul(var0, x) + var1
             loss = pred * pred
             optimizer = gradient_descent.SGD(1.0)
             sgd_op = optimizer.minimize(loss)
             # TODO(apassos) calling initialize_resources on all resources here
             # doesn't work because the sessions and graph are reused across unit
             # tests and this would mean trying to reinitialize variables. Figure out
             # a long-term solution for this.
             resources.initialize_resources([var0, var1]).run()
             # Initial values must be intact before the update.
             self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
             self.assertAllCloseAccordingToType([3.0], var1.eval())
             # Exactly one SGD step.
             sgd_op.run()
             # Reference values computed by hand from the same numbers.
             np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
             np_grad = 2 * np_pred
             expected_var0 = [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]]
             expected_var1 = [3.0 - np_grad]
             self.assertAllCloseAccordingToType(expected_var0, var0.eval())
             self.assertAllCloseAccordingToType(expected_var1, var1.eval())
    def testWithExistingEnsembleAndShrinkage(self):
        """Adds a tree with shrinkage to an ensemble that already holds trees.

        Checks that the weights of the existing trees are untouched, that the
        new tree is appended with the learning rate as its weight, and that
        the feature usage counts and gains are aggregated.
        """
        with self.test_session():
            # Add shrinkage config.
            learning_rate = 0.0001
            tree_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
            # Add 5 trees with weights 1..5, each recorded as updated once.
            for i in range(0, 5):
                tree = tree_ensemble.trees.add()
                _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
                tree_ensemble.tree_weights.append(i + 1)
                meta = tree_ensemble.tree_metadata.add()
                meta.num_tree_weight_updates = 1
            tree_ensemble_handle = model_ops.tree_ensemble_variable(
                stamp_token=0,
                tree_ensemble_config=tree_ensemble.SerializeToString(),
                name="existing")

            # Create non-zero feature importance.
            feature_usage_counts = variables.Variable(
                initial_value=np.array([4, 7], np.int64),
                name="feature_usage_counts",
                trainable=False)
            feature_gains = variables.Variable(
                initial_value=np.array([0.2, 0.8], np.float32),
                name="feature_gains",
                trainable=False)

            resources.initialize_resources(resources.shared_resources()).run()
            # global_variables_initializer is the non-deprecated replacement
            # for initialize_all_variables.
            variables.global_variables_initializer().run()

            output_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
            # Serialize only after the add-trees op has run.
            with ops.control_dependencies([
                    ensemble_optimizer_ops.add_trees_to_ensemble(
                        tree_ensemble_handle,
                        self._ensemble_to_add.SerializeToString(),
                        feature_usage_counts, [1, 2],
                        feature_gains, [0.5, 0.3], [[], []],
                        learning_rate=learning_rate)
            ]):
                output_ensemble.ParseFromString(
                    model_ops.tree_ensemble_serialize(tree_ensemble_handle)
                    [1].eval())

            # The weights of previous trees stayed the same, new tree (LAST) is
            # added with shrinkage weight.
            self.assertAllClose([1.0, 2.0, 3.0, 4.0, 5.0, learning_rate],
                                output_ensemble.tree_weights)

            # Check that every update count equals 1 (i.e. no old tree weight
            # got adjusted).
            for i in range(0, 6):
                self.assertEqual(
                    1,
                    output_ensemble.tree_metadata[i].num_tree_weight_updates)

            # Ensure feature importance was aggregated correctly.
            self.assertAllEqual([5, 9], feature_usage_counts.eval())
            self.assertArrayNear(
                [0.2 + 0.5 * learning_rate, 0.8 + 0.3 * learning_rate],
                feature_gains.eval(), 1e-6)
Beispiel #18
0
    def testTrainFnNonChiefWithCentering(self):
        """Tests the train function running on worker with bias centering.

        The model is built with is_chief=False, so repeated runs of the train
        op are expected to leave the tree ensemble and its stamp untouched.
        """
        with self.test_session():
            ensemble_handle = model_ops.tree_ensemble_variable(
                stamp_token=0, tree_ensemble_config="", name="tree_ensemble")
            learner_config = learner_pb2.LearnerConfig()
            learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
            learner_config.num_classes = 2
            learner_config.regularization.l1 = 0
            learner_config.regularization.l2 = 0
            learner_config.constraints.max_tree_depth = 1
            learner_config.constraints.min_node_weight = 0
            features = {}
            features["dense_float"] = array_ops.ones([4, 1], dtypes.float32)

            gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
                is_chief=False,
                num_ps_replicas=0,
                center_bias=True,
                ensemble_handle=ensemble_handle,
                examples_per_layer=1,
                learner_config=learner_config,
                features=features)

            predictions = array_ops.constant([[0.0], [1.0], [0.0], [2.0]],
                                             dtype=dtypes.float32)
            partition_ids = array_ops.zeros([4], dtypes.int32)
            ensemble_stamp = variables.Variable(initial_value=0,
                                                name="ensemble_stamp",
                                                trainable=False,
                                                dtype=dtypes.int64)

            predictions_dict = {
                "predictions": predictions,
                "predictions_no_dropout": predictions,
                "partition_ids": partition_ids,
                "ensemble_stamp": ensemble_stamp
            }

            labels = array_ops.ones([4, 1], dtypes.float32)
            weights = array_ops.ones([4, 1], dtypes.float32)
            # Create train op.
            train_op = gbdt_model.train(
                loss=math_ops.reduce_mean(
                    _squared_loss(labels, weights, predictions)),
                predictions_dict=predictions_dict,
                labels=labels)
            variables.global_variables_initializer().run()
            resources.initialize_resources(resources.shared_resources()).run()

            # Regardless of how many times the train op is run, a non-chief
            # worker can only accumulate stats so the tree ensemble never
            # changes.
            for _ in range(5):
                train_op.run()
            stamp_token, serialized = model_ops.tree_ensemble_serialize(
                ensemble_handle)
            output = tree_config_pb2.DecisionTreeEnsembleConfig()
            output.ParseFromString(serialized.eval())
            # assertEqual is used because the assertEquals alias is deprecated
            # and no longer exists in Python 3.12.
            self.assertEqual(len(output.trees), 0)
            self.assertEqual(len(output.tree_weights), 0)
            self.assertEqual(stamp_token.eval(), 0)
    def _get_train_op_and_ensemble(self, head, config, is_classification,
                                   train_in_memory):
        """Builds the TRAIN-mode model fn and exposes its train op and ensemble.

        Returns:
          A `(train_op, ensemble_serialized)` pair. `ensemble_serialized` is the
          second output of the serialize-ensemble op, which is created under a
          control dependency on `train_op`.
        """
        input_fn = _make_train_input_fn(is_classification)
        features, labels = input_fn()
        spec = boosted_trees._bt_model_fn(  # pylint:disable=protected-access
            features=features,
            labels=labels,
            mode=model_fn.ModeKeys.TRAIN,
            head=head,
            feature_columns=self._feature_columns,
            tree_hparams=self._tree_hparams,
            example_id_column_name=EXAMPLE_ID_COLUMN,
            n_batches_per_layer=1,
            config=config,
            train_in_memory=train_in_memory)

        # Initialize shared resources first, then global and local variables.
        resources.initialize_resources(resources.shared_resources()).run()
        variables.global_variables_initializer().run()
        variables.local_variables_initializer().run()

        # Exactly one shared resource is expected; its handle is what gets
        # passed to the serialize op below.
        ensemble_resources = resources.shared_resources()
        self.assertEqual(1, len(ensemble_resources))
        ensemble_handle = ensemble_resources[0].handle

        train_op = spec.train_op
        with ops.control_dependencies([train_op]):
            _, ensemble_serialized = (
                gen_boosted_trees_ops.boosted_trees_serialize_ensemble(
                    ensemble_handle))
        return train_op, ensemble_serialized
Beispiel #20
0
    def testBasicQuantileBucketsSingleResourcesAddFlushed(self):
        """Feeds flushed summaries back in and checks boundaries and buckets."""
        with self.cached_session():
            accumulator = self.create_resource(
                "floats_0", self.eps, self.max_elements, 2)
            resources.initialize_resources(resources.shared_resources()).run()

            first_summaries = boosted_trees_ops.make_quantile_summaries(
                [self._feature_0, self._feature_1],
                self._example_weights,
                epsilon=self.eps)
            add_first = boosted_trees_ops.quantile_add_summaries(
                accumulator, first_summaries)
            flushed = flush_quantile_summaries(accumulator, num_features=2)

            # The flushed summaries from the previous step are added again;
            # the expected boundaries and quantiles must be unchanged.
            add_flushed = boosted_trees_ops.quantile_add_summaries(
                accumulator, flushed)

            flush_op = boosted_trees_ops.quantile_flush(
                accumulator, self.num_quantiles)
            buckets = boosted_trees_ops.get_bucket_boundaries(
                accumulator, num_features=2)
            quantiles = boosted_trees_ops.boosted_trees_bucketize(
                [self._feature_0, self._feature_1], buckets)

            # Run the updates in order: add, re-add the flushed output, flush.
            for update in (add_first, add_flushed, flush_op):
                self.evaluate(update)

            expected_and_actual = (
                (self._feature_0_boundaries, buckets[0]),
                (self._feature_1_boundaries, buckets[1]),
                (self._feature_0_quantiles, quantiles[0]),
                (self._feature_1_quantiles, quantiles[1]),
            )
            for expected, actual in expected_and_actual:
                self.assertAllClose(expected, actual.eval())
Beispiel #21
0
  def testSaveRestoreBeforeFlush(self):
    """Saves before flushing and checks the restored boundaries are empty."""
    checkpoint_root = os.path.join(self.get_temp_dir(), "save_restore")
    checkpoint_path = os.path.join(
        tempfile.mkdtemp(prefix=checkpoint_root), "hash")

    with self.cached_session() as sess:
      accumulator = boosted_trees_ops.QuantileAccumulator(
          num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")

      checkpointer = saver.Saver()
      resources.initialize_resources(resources.shared_resources()).run()

      add_op = accumulator.add_summaries(
          [self._feature_0, self._feature_1], self._example_weights)
      self.evaluate(add_op)
      buckets = accumulator.get_bucket_boundaries()
      # Nothing has been flushed yet, so both boundary lists are empty.
      for stream in (0, 1):
        self.assertAllClose([], buckets[stream].eval())

      # Checkpoint the pre-flush state, then flush in this session only.
      checkpointer.save(sess, checkpoint_path)
      self.evaluate(accumulator.flush())
      self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
      self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())

    with self.session(graph=ops.Graph()) as sess:
      accumulator = boosted_trees_ops.QuantileAccumulator(
          num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
      checkpointer = saver.Saver()
      checkpointer.restore(sess, checkpoint_path)
      buckets = accumulator.get_bucket_boundaries()
      # The checkpoint was written before the flush: boundaries are empty.
      for stream in (0, 1):
        self.assertAllClose([], buckets[stream].eval())
  def testSaveRestoreBeforeFlush(self):
    """Saves before flushing and checks the restored boundaries are empty."""
    checkpoint_root = os.path.join(self.get_temp_dir(), "save_restore")
    checkpoint_path = os.path.join(
        tempfile.mkdtemp(prefix=checkpoint_root), "hash")

    with self.cached_session() as sess:
      accumulator = boosted_trees_ops.QuantileAccumulator(
          num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")

      checkpointer = saver.Saver()
      resources.initialize_resources(resources.shared_resources()).run()

      add_op = accumulator.add_summaries(
          [self._feature_0, self._feature_1], self._example_weights)
      self.evaluate(add_op)
      buckets = accumulator.get_bucket_boundaries()
      # Nothing has been flushed yet, so both boundary lists are empty.
      for stream in (0, 1):
        self.assertAllClose([], buckets[stream].eval())

      # Checkpoint the pre-flush state, then flush in this session only.
      checkpointer.save(sess, checkpoint_path)
      self.evaluate(accumulator.flush())
      self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
      self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())

    with self.session(graph=ops.Graph()) as sess:
      accumulator = boosted_trees_ops.QuantileAccumulator(
          num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
      checkpointer = saver.Saver()
      checkpointer.restore(sess, checkpoint_path)
      buckets = accumulator.get_bucket_boundaries()
      # The checkpoint was written before the flush: boundaries are empty.
      for stream in (0, 1):
        self.assertAllClose([], buckets[stream].eval())
Beispiel #23
0
 def testBasicResourceVariable(self):
     """Applies one SGD step to two resource variables for each float dtype."""
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
         # train.GradientDescentOptimizer is V1 only API.
         with ops.Graph().as_default(), self.cached_session():
             var0 = resource_variable_ops.ResourceVariable(
                 [1.0, 2.0], dtype=dtype)
             var1 = resource_variable_ops.ResourceVariable(
                 [3.0, 4.0], dtype=dtype)
             grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
             grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
             optimizer = gradient_descent.GradientDescentOptimizer(3.0)
             grads_and_vars = zip([grads0, grads1], [var0, var1])
             sgd_op = optimizer.apply_gradients(grads_and_vars)
             # TODO(apassos) calling initialize_resources on all resources here
             # doesn't work because the sessions and graph are reused across unit
             # tests and this would mean trying to reinitialize variables. Figure out
             # a long-term solution for this.
             resources.initialize_resources([var0, var1]).run()

             # Before the update both variables hold their initial values.
             self.assertAllCloseAccordingToType(
                 [1.0, 2.0], self.evaluate(var0))
             self.assertAllCloseAccordingToType(
                 [3.0, 4.0], self.evaluate(var1))

             # A single SGD step.
             sgd_op.run()

             # Each entry moved by learning rate (3.0) times its gradient.
             self.assertAllCloseAccordingToType(
                 [1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1], self.evaluate(var0))
             self.assertAllCloseAccordingToType(
                 [3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01], self.evaluate(var1))
Beispiel #24
0
    def testCachedPredictionOnEmptyEnsemble(self):
        """Checks that training prediction on an empty ensemble is a no-op."""
        with self.test_session() as session:
            # A dummy ensemble built from an empty serialized proto.
            ensemble = boosted_trees_ops.TreeEnsemble('ensemble',
                                                      serialized_proto='')
            ensemble_handle = ensemble.resource_handle
            resources.initialize_resources(resources.shared_resources()).run()

            # No previous cached values.
            cached_tree_ids = [0, 0]
            cached_node_ids = [0, 0]

            # Two features (0 and 1); the values don't matter on a dummy
            # ensemble.
            bucketized_features = [[67, 5], [9, 17]]

            # Build the prediction op against the dummy ensemble and run it.
            outputs = session.run(
                boosted_trees_ops.training_predict(
                    ensemble_handle,
                    cached_tree_ids=cached_tree_ids,
                    cached_node_ids=cached_node_ids,
                    bucketized_features=bucketized_features,
                    logits_dimension=1))
            logits_updates, new_tree_ids, new_node_ids = outputs

            # Cached ids are returned unchanged and the logit updates are zero.
            self.assertAllClose(cached_tree_ids, new_tree_ids)
            self.assertAllClose(cached_node_ids, new_node_ids)
            self.assertAllClose([[0], [0]], logits_updates)
Beispiel #25
0
    def testCreate(self):
        """Creates a one-leaf ensemble and checks its prediction and stamp."""
        with self.test_session():
            ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
            single_tree = ensemble_config.trees.add()
            _append_to_leaf(single_tree.nodes.add().leaf, 0, -0.4)
            ensemble_config.tree_weights.append(1.0)

            # Prepare learner config.
            learner_config = learner_pb2.LearnerConfig()
            learner_config.num_classes = 2

            ensemble_handle = model_ops.tree_ensemble_variable(
                stamp_token=3,
                tree_ensemble_config=ensemble_config.SerializeToString(),
                name="create_tree")
            resources.initialize_resources(resources.shared_resources()).run()

            prediction_outputs = prediction_ops.gradient_trees_prediction(
                ensemble_handle,
                self._seed, [self._dense_float_tensor],
                [self._sparse_float_indices1, self._sparse_float_indices2],
                [self._sparse_float_values1, self._sparse_float_values2],
                [self._sparse_float_shape1, self._sparse_float_shape2],
                [self._sparse_int_indices1], [self._sparse_int_values1],
                [self._sparse_int_shape1],
                learner_config=learner_config.SerializeToString(),
                apply_dropout=False,
                apply_averaging=False,
                center_bias=False,
                reduce_dim=True)
            result, _, _ = prediction_outputs
            # The only leaf holds -0.4, so every example predicts -0.4.
            self.assertAllClose(result.eval(), [[-0.4], [-0.4]])

            # The stamp token given at creation is read back unchanged.
            stamp_token = model_ops.tree_ensemble_stamp_token(ensemble_handle)
            self.assertEqual(stamp_token.eval(), 3)
Beispiel #26
0
 def _eval(self, var, accum, linear, grad, lr, l1, l2, l2_shrinkage=0,
           lr_power=1, multiply_linear_by_lr=False):
   """Runs one resource FTRL update and returns the values read back.

   The v2 op is used when `l2_shrinkage` is truthy; otherwise the original
   op is used. Combining the v2 op with `multiply_linear_by_lr` is rejected
   by an assertion.

   Returns:
     A `(var, accum, linear)` tuple read back after the update, each reshaped
     to the shape of the corresponding float32 input array.
   """
   dtype = np.float32
   var, accum, linear, grad = [
       np.array(values, dtype=dtype) for values in (var, accum, linear, grad)
   ]
   # Decide on the op variant before l2_shrinkage is rebound to a tensor.
   use_v2 = bool(l2_shrinkage)
   with self.session() as session:
     lr, l1, l2, l2_shrinkage, lr_power = [
         constant_op.constant(scalar, dtype=dtype)
         for scalar in (lr, l1, l2, l2_shrinkage, lr_power)
     ]
     v_var = resource_variable_ops.ResourceVariable(var, dtype=dtype)
     v_accum = resource_variable_ops.ResourceVariable(accum, dtype=dtype)
     v_linear = resource_variable_ops.ResourceVariable(linear, dtype=dtype)
     resources.initialize_resources([v_var, v_accum, v_linear]).run()
     assert not (use_v2 and multiply_linear_by_lr)
     if use_v2:
       update_op = training_ops.resource_apply_ftrl_v2(
           v_var.handle, v_accum.handle, v_linear.handle,
           grad, lr, l1, l2, l2_shrinkage, lr_power,
           multiply_linear_by_lr=multiply_linear_by_lr)
     else:
       update_op = training_ops.resource_apply_ftrl(
           v_var.handle, v_accum.handle, v_linear.handle,
           grad, lr, l1, l2, lr_power,
           multiply_linear_by_lr=multiply_linear_by_lr)
     session.run(update_op)
     return tuple(
         variable.read_value().eval().reshape(reference.shape)
         for variable, reference in (
             (v_var, var), (v_accum, accum), (v_linear, linear)))
 def testMinimizeResourceVariable(self):
   """Minimizes a squared linear prediction for one SGD step per dtype."""
   for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
     with self.cached_session():
       var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
       var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
       x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
       pred = math_ops.matmul(var0, x) + var1
       loss = pred * pred
       optimizer = gradient_descent.GradientDescentOptimizer(1.0)
       sgd_op = optimizer.minimize(loss)
       # TODO(apassos) calling initialize_resources on all resources here
       # doesn't work because the sessions and graph are reused across unit
       # tests and this would mean trying to reinitialize variables. Figure out
       # a long-term solution for this.
       resources.initialize_resources([var0, var1]).run()

       # Before the update both variables hold their initial values.
       self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
       self.assertAllCloseAccordingToType([3.0], var1.eval())

       # A single SGD step.
       sgd_op.run()

       # Hand-computed reference: d(pred^2)/d(pred) = 2 * pred, learning rate 1.
       np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
       np_grad = 2 * np_pred
       expected_var0 = [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]]
       expected_var1 = [3.0 - np_grad]
       self.assertAllCloseAccordingToType(expected_var0, var0.eval())
       self.assertAllCloseAccordingToType(expected_var1, var1.eval())
  def testDropout(self):
    """Checks dropout drops about half the trees and alters predictions."""
    with self.test_session():
      # Empty tree ensemble.
      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
      # Add 999 trees; the tree at index i gets weight i + 1.
      for i in range(0, 999):
        tree = tree_ensemble_config.trees.add()
        tree_ensemble_config.tree_metadata.add().is_finalized = True
        _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
        tree_ensemble_config.tree_weights.append(i + 1)

      # Prepare learner/dropout config.
      learner_config = learner_pb2.LearnerConfig()
      learner_config.learning_rate_tuner.dropout.dropout_probability = 0.5
      learner_config.learning_rate_tuner.dropout.learning_rate = 1.0
      learner_config.num_classes = 2

      # Apply dropout.
      tree_ensemble_handle = model_ops.tree_ensemble_variable(
          stamp_token=0,
          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
          name="existing")
      resources.initialize_resources(resources.shared_resources()).run()

      result, dropout_info = self._get_predictions(
          tree_ensemble_handle,
          learner_config=learner_config.SerializeToString(),
          apply_dropout=True,
          apply_averaging=False,
          center_bias=False,
          reduce_dim=True)

      # We expect approx 500 trees were dropped.
      dropout_info = dropout_info.eval()
      dropped_indices, dropped_weights = dropout_info[0], dropout_info[1]
      self.assertIn(dropped_indices.size, range(400, 601))
      self.assertEqual(dropped_indices.size, dropped_weights.size)

      # We constructed the trees so tree number + 1 is the tree weight, so
      # we can check here the weights for dropped trees.
      for dropped_index, dropped_weight in zip(dropped_indices,
                                               dropped_weights):
        self.assertEqual(dropped_index + 1, dropped_weight)

      # Don't apply dropout.
      result_no_dropout, no_dropout_info = self._get_predictions(
          tree_ensemble_handle,
          learner_config=learner_config.SerializeToString(),
          apply_dropout=False,
          apply_averaging=False,
          center_bias=False,
          reduce_dim=True)

      # Evaluate each prediction tensor once instead of re-running the session
      # on every loop iteration.
      dropout_values = result.eval()
      no_dropout_values = result_no_dropout.eval()
      self.assertEqual(dropout_values.size, no_dropout_values.size)
      for i in range(dropout_values.size):
        self.assertNotEqual(dropout_values[i], no_dropout_values[i])

      # We expect none of the trees were dropped.
      self.assertAllEqual([[], []], no_dropout_info.eval())
  def testContribsForOnlyABiasNode(self):
    """Checks debug outputs when the ensemble is a single bias leaf.

    Such an ensemble can arise, for example, when the only tree left after
    training was pruned all the way up to its root.
    """
    with self.cached_session() as session:
      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
      text_format.Merge(
          """
        trees {
          nodes {
            leaf {
              scalar: 1.72
            }
          }
        }
        tree_weights: 0.1
        tree_metadata: {
          num_layers_grown: 0
        }
      """, ensemble_proto)

      ensemble = boosted_trees_ops.TreeEnsemble(
          'ensemble', serialized_proto=ensemble_proto.SerializeToString())
      ensemble_handle = ensemble.resource_handle
      resources.initialize_resources(resources.shared_resources()).run()

      # Three features, none of which is used by the ensemble.
      bucketized_features = [[36, 32], [13, -29], [11, 27]]

      # The logits path holds only the weighted root leaf of tree_0, and no
      # feature ids are reported for either example.
      bias = 1.72 * 0.1
      expected_feature_ids = ((), ())
      expected_logits_paths = ((bias,), (bias,))

      debug_op = boosted_trees_ops.example_debug_outputs(
          ensemble_handle,
          bucketized_features=bucketized_features,
          logits_dimension=1)

      # Parse one DebugOutput proto per example.
      parsed_outputs = []
      for serialized in session.run(debug_op):
        parsed = boosted_trees_pb2.DebugOutput()
        parsed.ParseFromString(serialized)
        parsed_outputs.append(parsed)
      feature_ids = [parsed.feature_ids for parsed in parsed_outputs]
      logits_paths = [parsed.logits_path for parsed in parsed_outputs]

      self.assertAllClose(feature_ids, expected_feature_ids)
      self.assertAllClose(logits_paths, expected_logits_paths)
Beispiel #30
0
	def test_simple(self):
		"""Creates a ZMQ connection handle and checks its dtype is `resource`."""
		with self.session():
			taddr_valid = 'zrpull://127.0.0.1:5555'
			output = zmq_conn_handle(taddr_valid, ZMQ_HWM, 0)
			resources.initialize_resources(resources.local_resources()).run()
			# assertDTypeEqual cannot be used for the resource type: it converts
			# the tf dtype to a numpy dtype, and resource has no numpy equivalent.
			# Compare the dtypes directly; comparing type(...) of both sides
			# would pass for any dtype because both are DType instances.
			self.assertEqual(output.dtype, dtypes.resource)
Beispiel #31
0
    def testSaveRestoreBeforeFlush(self):
        """Checkpoints an accumulator before its flush and flushes after restore.

        The first session adds a summary at stamp 0, saves a checkpoint, then
        flushes to stamp 1 and checks the buckets. The second session restores
        the checkpoint, checks that buckets are not ready at stamp 0, then
        flushes and checks the same buckets.
        """
        save_dir = os.path.join(self.get_temp_dir(), "save_restore")
        save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")

        with self.test_session(graph=ops.Graph()) as sess:
            accumulator = quantile_ops.QuantileAccumulator(init_stamp_token=0,
                                                           num_quantiles=3,
                                                           epsilon=0.33,
                                                           name="q0")

            save = saver.Saver()
            resources.initialize_resources(resources.shared_resources()).run()

            # A sparse column with five values and one weight per example row.
            sparse_indices_0 = constant_op.constant(
                [[1, 0], [2, 1], [3, 0], [4, 2], [5, 0]], dtype=dtypes.int64)
            sparse_values_0 = constant_op.constant([2.0, 3.0, 4.0, 5.0, 6.0],
                                                   dtype=dtypes.float32)
            sparse_shape_0 = constant_op.constant([6, 3], dtype=dtypes.int64)
            example_weights = constant_op.constant([10, 1, 1, 1, 1, 1],
                                                   dtype=dtypes.float32,
                                                   shape=[6, 1])
            update = accumulator.add_summary(stamp_token=0,
                                             column=sparse_tensor.SparseTensor(
                                                 sparse_indices_0,
                                                 sparse_values_0,
                                                 sparse_shape_0),
                                             example_weights=example_weights)
            update.run()
            # The checkpoint is written after the summary is added but before
            # the flush below.
            save.save(sess, save_path)
            reset = accumulator.flush(stamp_token=0, next_stamp_token=1)
            # Read the buckets only after the flush has run.
            with ops.control_dependencies([reset]):
                are_ready_flush, buckets = (accumulator.get_buckets(
                    stamp_token=1))
            buckets, are_ready_flush = (sess.run([buckets, are_ready_flush]))
            self.assertEqual(True, are_ready_flush)
            self.assertAllEqual([2, 4, 6.], buckets)

        with self.test_session(graph=ops.Graph()) as sess:
            accumulator = quantile_ops.QuantileAccumulator(init_stamp_token=0,
                                                           num_quantiles=3,
                                                           epsilon=0.33,
                                                           name="q0")
            save = saver.Saver()

            # Restore the saved values in the parameter nodes.
            save.restore(sess, save_path)
            # Readiness at stamp 0 is read before the flush is allowed to run.
            are_ready_noflush = accumulator.get_buckets(stamp_token=0)[0]
            with ops.control_dependencies([are_ready_noflush]):
                reset = accumulator.flush(stamp_token=0, next_stamp_token=1)

            with ops.control_dependencies([reset]):
                are_ready_flush, buckets = accumulator.get_buckets(
                    stamp_token=1)
            buckets, are_ready_flush, are_ready_noflush = (sess.run(
                [buckets, are_ready_flush, are_ready_noflush]))
            # Not ready before the flush; ready with the same buckets after it.
            self.assertFalse(are_ready_noflush)
            self.assertTrue(are_ready_flush)
            self.assertAllEqual([2, 4, 6.], buckets)
  def testWithExistingEnsembleAndShrinkage(self):
    """Adds a tree with shrinkage to an existing five-tree ensemble.

    Checks that the existing tree weights and update counts are untouched,
    that the new tree's weight equals the learning rate, and that the feature
    usage counts and gains are aggregated.
    """
    with self.test_session():
      # Add shrinkage config.
      learning_rate = 0.0001
      tree_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
      # Add 5 trees with weights 1 through 5.
      for i in range(0, 5):
        tree = tree_ensemble.trees.add()
        _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
        tree_ensemble.tree_weights.append(i + 1)
        meta = tree_ensemble.tree_metadata.add()
        meta.num_tree_weight_updates = 1
      tree_ensemble_handle = model_ops.tree_ensemble_variable(
          stamp_token=0,
          tree_ensemble_config=tree_ensemble.SerializeToString(),
          name="existing")

      # Create non-zero feature importance.
      feature_usage_counts = variables.Variable(
          initial_value=np.array([4, 7], np.int64),
          name="feature_usage_counts",
          trainable=False)
      feature_gains = variables.Variable(
          initial_value=np.array([0.2, 0.8], np.float32),
          name="feature_gains",
          trainable=False)

      resources.initialize_resources(resources.shared_resources()).run()
      # initialize_all_variables() is deprecated in favor of
      # global_variables_initializer().
      variables.global_variables_initializer().run()

      output_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
      # Serialize the ensemble only after the new tree has been added.
      with ops.control_dependencies([
          ensemble_optimizer_ops.add_trees_to_ensemble(
              tree_ensemble_handle,
              self._ensemble_to_add.SerializeToString(),
              feature_usage_counts, [1, 2],
              feature_gains, [0.5, 0.3], [[], []],
              learning_rate=learning_rate)
      ]):
        output_ensemble.ParseFromString(
            model_ops.tree_ensemble_serialize(tree_ensemble_handle)[1].eval())

      # The weights of previous trees stayed the same, new tree (LAST) is added
      # with shrinkage weight.
      self.assertAllClose([1.0, 2.0, 3.0, 4.0, 5.0, learning_rate],
                          output_ensemble.tree_weights)

      # Check that every update count equals 1 (i.e. no old tree weight got
      # adjusted).
      for i in range(0, 6):
        self.assertEqual(
            1, output_ensemble.tree_metadata[i].num_tree_weight_updates)

      # Ensure feature importance was aggregated correctly.
      self.assertAllEqual([5, 9], feature_usage_counts.eval())
      self.assertArrayNear(
          [0.2 + 0.5 * learning_rate, 0.8 + 0.3 * learning_rate],
          feature_gains.eval(), 1e-6)
Beispiel #33
0
 def testCreate(self):
   """Checks the initial stamp token and state counters of a new ensemble."""
   with self.test_session():
     ensemble = boosted_trees_ops.TreeEnsemble('ensemble')
     resources.initialize_resources(resources.shared_resources()).run()
     stamp_token = ensemble.get_stamp_token()
     self.assertEqual(0, stamp_token.eval())
     states = ensemble.get_states()
     _, num_trees, num_finalized_trees, num_attempted_layers = states
     # Every counter of a fresh ensemble is zero.
     for counter in (num_trees, num_finalized_trees, num_attempted_layers):
       self.assertEqual(0, counter.eval())
Beispiel #34
0
 def testCreate(self):
     """Checks the initial stamp token and state values of a new ensemble."""
     with self.cached_session():
         ensemble = boosted_trees_ops.TreeEnsemble('ensemble')
         resources.initialize_resources(resources.shared_resources()).run()
         stamp_token = ensemble.get_stamp_token()
         self.assertEqual(0, self.evaluate(stamp_token))
         states = ensemble.get_states()
         (_, num_trees, num_finalized_trees, num_attempted_layers,
          nodes_range) = states
         # Every counter of a fresh ensemble is zero.
         for counter in (num_trees, num_finalized_trees, num_attempted_layers):
             self.assertEqual(0, self.evaluate(counter))
         self.assertAllEqual([0, 1], self.evaluate(nodes_range))
Beispiel #35
0
    def testPredictFn(self):
        """Tests the predict function on an ensemble with one bias node.

        The ensemble is created with stamp token 3 and a single leaf of value
        0.25; EVAL-mode predictions for four examples must report that stamp,
        0.25 for every example, and partition id 0 throughout.
        """
        with self.test_session() as sess:
            # Create ensemble with one bias node.
            ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
            text_format.Merge(
                """
          trees {
            nodes {
              leaf {
                vector {
                  value: 0.25
                }
              }
            }
          }
          tree_weights: 1.0
          tree_metadata {
            num_tree_weight_updates: 1
            num_layers_grown: 1
            is_finalized: true
          }""", ensemble_config)
            ensemble_handle = model_ops.tree_ensemble_variable(
                stamp_token=3,
                tree_ensemble_config=ensemble_config.SerializeToString(),
                name="tree_ensemble")
            resources.initialize_resources(resources.shared_resources()).run()
            learner_config = learner_pb2.LearnerConfig()
            learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
            learner_config.num_classes = 2
            learner_config.regularization.l1 = 0
            learner_config.regularization.l2 = 0
            learner_config.constraints.max_tree_depth = 1
            learner_config.constraints.min_node_weight = 0
            features = {}
            features["dense_float"] = array_ops.ones([4, 1], dtypes.float32)
            gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
                is_chief=False,
                num_ps_replicas=0,
                center_bias=True,
                ensemble_handle=ensemble_handle,
                examples_per_layer=1,
                learner_config=learner_config,
                features=features)

            # Create predict op.
            mode = model_fn.ModeKeys.EVAL
            predictions_dict = sess.run(gbdt_model.predict(mode))
            # assertEqual, not the deprecated assertEquals alias (removed in
            # Python 3.12).
            self.assertEqual(predictions_dict["ensemble_stamp"], 3)
            self.assertAllClose(predictions_dict["predictions"],
                                [[0.25], [0.25], [0.25], [0.25]])
            self.assertAllClose(predictions_dict["partition_ids"],
                                [0, 0, 0, 0])
Beispiel #36
0
 def testCreate(self):
   """Checks the initial stamp token and state values of a new ensemble."""
   with self.cached_session():
     ensemble = boosted_trees_ops.TreeEnsemble('ensemble')
     resources.initialize_resources(resources.shared_resources()).run()
     stamp_token = ensemble.get_stamp_token()
     self.assertEqual(0, self.evaluate(stamp_token))
     states = ensemble.get_states()
     (_, num_trees, num_finalized_trees, num_attempted_layers,
      nodes_range) = states
     # Every counter of a fresh ensemble is zero.
     for counter in (num_trees, num_finalized_trees, num_attempted_layers):
       self.assertEqual(0, self.evaluate(counter))
     self.assertAllEqual([0, 1], self.evaluate(nodes_range))
Beispiel #37
0
  def testPredictFn(self):
    """Tests the predict function on an ensemble with one bias node.

    The ensemble is created with stamp token 3 and a single leaf of value 0.25;
    EVAL-mode predictions for four examples must report that stamp, 0.25 for
    every example, and partition id 0 throughout.
    """
    with self.test_session() as sess:
      # Create ensemble with one bias node.
      ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
      text_format.Merge("""
          trees {
            nodes {
              leaf {
                vector {
                  value: 0.25
                }
              }
            }
          }
          tree_weights: 1.0
          tree_metadata {
            num_tree_weight_updates: 1
            num_layers_grown: 1
            is_finalized: true
          }""", ensemble_config)
      ensemble_handle = model_ops.tree_ensemble_variable(
          stamp_token=3,
          tree_ensemble_config=ensemble_config.SerializeToString(),
          name="tree_ensemble")
      resources.initialize_resources(resources.shared_resources()).run()
      learner_config = learner_pb2.LearnerConfig()
      learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
      learner_config.num_classes = 2
      learner_config.regularization.l1 = 0
      learner_config.regularization.l2 = 0
      learner_config.constraints.max_tree_depth = 1
      learner_config.constraints.min_node_weight = 0
      features = {}
      features["dense_float"] = array_ops.ones([4, 1], dtypes.float32)
      gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
          is_chief=False,
          num_ps_replicas=0,
          center_bias=True,
          ensemble_handle=ensemble_handle,
          examples_per_layer=1,
          learner_config=learner_config,
          features=features)

      # Create predict op.
      mode = model_fn.ModeKeys.EVAL
      predictions_dict = sess.run(gbdt_model.predict(mode))
      # assertEqual, not the deprecated assertEquals alias (removed in
      # Python 3.12).
      self.assertEqual(predictions_dict["ensemble_stamp"], 3)
      self.assertAllClose(predictions_dict["predictions"], [[0.25], [0.25],
                                                            [0.25], [0.25]])
      self.assertAllClose(predictions_dict["partition_ids"], [0, 0, 0, 0])
  def testSaveRestoreBeforeFlush(self):
    """Checkpoints an accumulator before its flush and flushes after restore.

    The first session adds a summary at stamp 0, saves a checkpoint, then
    flushes to stamp 1 and checks the buckets. The second session restores the
    checkpoint, checks that buckets are not ready at stamp 0, then flushes and
    checks the same buckets.
    """
    save_dir = os.path.join(self.get_temp_dir(), "save_restore")
    save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")

    with self.test_session(graph=ops.Graph()) as sess:
      accumulator = quantile_ops.QuantileAccumulator(
          init_stamp_token=0, num_quantiles=3, epsilon=0.33, name="q0")

      save = saver.Saver()
      resources.initialize_resources(resources.shared_resources()).run()

      # A sparse column with five values and one weight per example row.
      sparse_indices_0 = constant_op.constant(
          [[1, 0], [2, 1], [3, 0], [4, 2], [5, 0]], dtype=dtypes.int64)
      sparse_values_0 = constant_op.constant(
          [2.0, 3.0, 4.0, 5.0, 6.0], dtype=dtypes.float32)
      sparse_shape_0 = constant_op.constant([6, 3], dtype=dtypes.int64)
      example_weights = constant_op.constant(
          [10, 1, 1, 1, 1, 1], dtype=dtypes.float32, shape=[6, 1])
      update = accumulator.add_summary(
          stamp_token=0,
          column=sparse_tensor.SparseTensor(sparse_indices_0, sparse_values_0,
                                            sparse_shape_0),
          example_weights=example_weights)
      update.run()
      # The checkpoint is written after the summary is added but before the
      # flush below.
      save.save(sess, save_path)
      reset = accumulator.flush(stamp_token=0, next_stamp_token=1)
      # Read the buckets only after the flush has run.
      with ops.control_dependencies([reset]):
        are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
      buckets, are_ready_flush = (sess.run([buckets, are_ready_flush]))
      self.assertEqual(True, are_ready_flush)
      self.assertAllEqual([2, 4, 6.], buckets)

    with self.test_session(graph=ops.Graph()) as sess:
      accumulator = quantile_ops.QuantileAccumulator(
          init_stamp_token=0, num_quantiles=3, epsilon=0.33, name="q0")
      save = saver.Saver()

      # Restore the saved values in the parameter nodes.
      save.restore(sess, save_path)
      # Readiness at stamp 0 is read before the flush is allowed to run.
      are_ready_noflush = accumulator.get_buckets(stamp_token=0)[0]
      with ops.control_dependencies([are_ready_noflush]):
        reset = accumulator.flush(stamp_token=0, next_stamp_token=1)

      with ops.control_dependencies([reset]):
        are_ready_flush, buckets = accumulator.get_buckets(stamp_token=1)
      buckets, are_ready_flush, are_ready_noflush = (sess.run(
          [buckets, are_ready_flush, are_ready_noflush]))
      # Not ready before the flush; ready with the same buckets after it.
      self.assertFalse(are_ready_noflush)
      self.assertTrue(are_ready_flush)
      self.assertAllEqual([2, 4, 6.], buckets)
Beispiel #39
0
    def testEmpty(self):
        """Runs the equality split handler on a sparse column with no entries.

        Stats are accumulated for four examples whose categorical column is
        empty; the handler must report ready but emit no partitions, gains or
        splits.
        """
        with self.cached_session() as sess:
            gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
            hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
            partition_ids = [0, 0, 0, 1]
            # A sparse int column without a single entry.
            no_indices = constant_op.constant_v1(
                [], dtype=dtypes.int64, shape=[0, 2])
            no_values = constant_op.constant_v1([], dtype=dtypes.int64)

            gradient_shape = tensor_shape.TensorShape([])
            hessian_shape = tensor_shape.TensorShape([])
            class_id = -1

            empty_column = sparse_tensor.SparseTensor(
                no_indices, no_values, [4, 1])
            split_handler = categorical_split_handler.EqualitySplitHandler(
                l1_regularization=0.1,
                l2_regularization=1,
                tree_complexity_regularization=0,
                min_node_weight=0,
                sparse_int_column=empty_column,
                feature_column_group_id=0,
                gradient_shape=gradient_shape,
                hessian_shape=hessian_shape,
                multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS,
                init_stamp_token=0)
            resources.initialize_resources(resources.shared_resources()).run()

            empty_gradients, empty_hessians = get_empty_tensors(
                gradient_shape, hessian_shape)
            example_weights = array_ops.ones([4, 1], dtypes.float32)

            active_flags = array_ops.constant([True, True])
            stats_update = split_handler.update_stats_sync(
                0,
                partition_ids,
                gradients,
                hessians,
                empty_gradients,
                empty_hessians,
                example_weights,
                is_active=active_flags)
            # Splits are only generated after the stats update has run.
            with ops.control_dependencies([stats_update]):
                split_tensors = split_handler.make_splits(0, 1, class_id)
                are_splits_ready, partitions, gains, splits = sess.run(
                    list(split_tensors))
        self.assertTrue(are_splits_ready)
        self.assertEqual(len(partitions), 0)
        self.assertEqual(len(gains), 0)
        self.assertEqual(len(splits), 0)
  def testMetadataMissing(self):
    """Predicts with an ensemble whose trees carry no tree_metadata entries."""
    # Sometimes we want to do prediction on trees that are not added to the
    # ensemble, so prediction has to work when tree_metadata is missing.
    with self.test_session():
      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
      # Bias tree.
      tree1 = tree_ensemble_config.trees.add()
      _append_to_leaf(tree1.nodes.add().leaf, 0, -0.4)

      # Depth 3 tree.
      tree2 = tree_ensemble_config.trees.add()
      # We are not setting the tree_ensemble_config.tree_metadata in this test.
      # NOTE(review): the trailing numeric arguments of the split helpers look
      # like (feature column, threshold/id, left child, right child) — confirm
      # against the helper definitions.
      _set_float_split(tree2.nodes.add().dense_float_binary_split, 0, 9.0, 1, 2)
      _set_float_split(tree2.nodes.add()
                       .sparse_float_binary_split_default_left.split, 0, -20.0,
                       3, 4)
      _append_to_leaf(tree2.nodes.add().leaf, 0, 0.5)
      _append_to_leaf(tree2.nodes.add().leaf, 0, 1.2)
      _set_categorical_id_split(tree2.nodes.add().categorical_id_binary_split,
                                0, 9, 5, 6)
      _append_to_leaf(tree2.nodes.add().leaf, 0, -0.9)
      _append_to_leaf(tree2.nodes.add().leaf, 0, 0.7)

      # One weight per tree.
      tree_ensemble_config.tree_weights.append(1.0)
      tree_ensemble_config.tree_weights.append(1.0)

      tree_ensemble_handle = model_ops.tree_ensemble_variable(
          stamp_token=0,
          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
          name="full_ensemble")
      resources.initialize_resources(resources.shared_resources()).run()

      # Prepare learner config.
      learner_config = learner_pb2.LearnerConfig()
      learner_config.num_classes = 2

      result, dropout_info = self._get_predictions(
          tree_ensemble_handle,
          learner_config=learner_config.SerializeToString(),
          reduce_dim=True)

      # The first example will get bias -0.4 from the first tree and the
      # leaf 4 payload of -0.9, hence -1.3; the second example will get the
      # same bias -0.4 and the leaf 3 payload (sparse feature missing) of 1.2,
      # hence 0.8.
      self.assertAllClose([[-1.3], [0.8]], result.eval())

      # Empty dropout.
      self.assertAllEqual([[], []], dropout_info.eval())
Beispiel #41
0
 def testUsedHandlers(self):
   """Checks the used-handlers mask and count reported for an ensemble.

   Handler ids 1 and 5 are recorded in the growing metadata, so with six
   handlers in total only positions 1 and 5 of the mask are set.
   """
   with self.cached_session():
     ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
     for handler_id in (1, 5):
       ensemble_config.growing_metadata.used_handler_ids.append(handler_id)
     stamp = 3
     ensemble_handle = model_ops.tree_ensemble_variable(
         stamp_token=stamp,
         tree_ensemble_config=ensemble_config.SerializeToString(),
         name="create_tree")
     resources.initialize_resources(resources.shared_resources()).run()
     used = model_ops.tree_ensemble_used_handlers(
         ensemble_handle, stamp, num_all_handlers=6)
     expected_mask = [0, 1, 0, 0, 0, 1]
     self.assertAllEqual(expected_mask, used.used_handlers_mask.eval())
     self.assertEqual(2, used.num_used_handlers.eval())
Beispiel #42
0
    def testStreamingQuantileBucketsLowPrecisionInput(self):
        """Tests inputs that simulate low precision float16 values."""

        num_quantiles = 3
        # generate_quantiles is set to True since the test would generate
        # fewer boundaries otherwise.
        with self.test_session():
            accumulator = quantile_ops.QuantileAccumulator(
                init_stamp_token=0,
                num_quantiles=num_quantiles,
                epsilon=0.001,
                name="q1",
                generate_quantiles=True)
            resources.initialize_resources(resources.shared_resources()).run()
        column_ph = array_ops.placeholder(dtypes.float32)
        weights_ph = array_ops.placeholder(dtypes.float32)
        add_summary_op = accumulator.add_summary(
            stamp_token=0, column=column_ph, example_weights=weights_ph)

        # These inputs are the integers in the range [2030, 2060] as they
        # appear with float16 precision. Integers <= 2048 are represented
        # exactly, whereas numbers > 2048 are rounded and hence repeated.
        # For precision loss / rounding, see:
        # https://en.wikipedia.org/wiki/Half-precision_floating-point_format.
        #
        # The intent of the test is not the handling of float16 values, but to
        # validate that the expected number of buckets is returned when the
        # input may contain repeated values.
        inputs = [
            2030.0, 2031.0, 2032.0, 2033.0, 2034.0, 2035.0, 2036.0, 2037.0,
            2038.0, 2039.0, 2040.0, 2041.0, 2042.0, 2043.0, 2044.0, 2045.0,
            2046.0, 2047.0, 2048.0, 2048.0, 2050.0, 2052.0, 2052.0, 2052.0,
            2054.0, 2056.0, 2056.0, 2056.0, 2058.0, 2060.0
        ]
        with self.test_session() as sess:
            feed = {column_ph: inputs, weights_ph: [1] * len(inputs)}
            sess.run(add_summary_op, feed)

        with self.test_session() as sess:
            sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
            ready_t, buckets_t = accumulator.get_buckets(stamp_token=1)
            bucket_values, is_ready = sess.run([buckets_t, ready_t])
            self.assertEqual(True, is_ready)
            self.assertEqual(num_quantiles + 1, len(bucket_values))
            self.assertAllEqual([2030, 2040, 2050, 2060], bucket_values)
def tree_models(X, y, num_feat, num_class):
    """Train a tensor_forest random forest and print train/test accuracy.

    The data is split 75/25 with ``random_state=1``. A forest of 10 trees
    (at most 1000 nodes each) is trained for 400 steps on the full training
    split; loss and training accuracy are printed at step 1 and every 50th
    step, and the accuracy on the held-out split is printed at the end.

    Args:
        X: feature rows; fed to a float32 placeholder of width ``num_feat``.
        y: class ids, one per row of ``X``; fed to an int64 placeholder.
        num_feat: number of feature columns.
        num_class: number of classes.

    Returns:
        None. Results are only printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=1)
    num_steps = 400
    num_trees = 10
    max_nodes = 1000

    # Dedicated names so the ``X`` argument is not overwritten by a tensor.
    features_ph = tf.placeholder(tf.float32, shape=[None, num_feat])
    labels_ph = tf.placeholder(tf.int64, shape=[None])

    hparams = tensor_forest.ForestHParams(
        num_classes=num_class,
        num_features=num_feat,
        num_trees=num_trees,
        max_nodes=max_nodes).fill()

    forest_graph = tensor_forest.RandomForestGraphs(hparams)

    train_op = forest_graph.training_graph(features_ph, labels_ph)
    loss_op = forest_graph.training_loss(features_ph, labels_ph)

    infer_op, _, _ = forest_graph.inference_graph(features_ph)
    correct_prediction = tf.equal(
        tf.argmax(infer_op, 1), tf.cast(labels_ph, tf.int64))
    accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Shared resources are initialized together with the global variables.
    init_vars = tf.group(
        tf.global_variables_initializer(),
        resources.initialize_resources(resources.shared_resources()))

    train_feed = {features_ph: X_train, labels_ph: y_train}
    test_feed = {features_ph: X_test, labels_ph: y_test}

    # The context manager closes the session (and frees its resources) even
    # if training raises; the original never closed it.
    with tf.Session() as rf_sess:
        rf_sess.run(init_vars)

        for i in range(1, num_steps + 1):
            _, l = rf_sess.run([train_op, loss_op], feed_dict=train_feed)
            if i % 50 == 0 or i == 1:
                acc = rf_sess.run(accuracy_op, feed_dict=train_feed)
                print("Step %i, Loss: %f, Acc: %f" % (i, l, acc))

        print("Test Accuracy:", rf_sess.run(accuracy_op, feed_dict=test_feed))
Beispiel #44
0
def save_model(sess, model):
    """Export the TF model as a SavedModel under ``data/model-tf``.

    Args:
        sess: session handed to ``add_meta_graph_and_variables``.
        model: object exposing ``input`` and ``predictions``; they become the
            "inputs" and "prob" entries of the default serving signature.
    """
    signature_inputs = {"inputs": model.input}  # input: image as a String
    signature_outputs = {"prob": model.predictions}  # output

    signature = tf.saved_model.signature_def_utils.predict_signature_def(
        signature_inputs, signature_outputs)
    signatures = {
        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature,
    }

    # Init op stored with the export: local variables, shared resources
    # and lookup tables.
    init_op = control_flow_ops.group(
        tf.local_variables_initializer(),
        resources.initialize_resources(resources.shared_resources()),
        tf.tables_initializer())

    export_dir = "data/model-tf"
    print('[Info] 模型存储路径: {}'.format(export_dir))
    exporter = saved_model_builder.SavedModelBuilder(export_dir)
    exporter.add_meta_graph_and_variables(
        sess,
        [tag_constants.SERVING],
        signature_def_map=signatures,
        legacy_init_op=init_op)
    exporter.save()
Beispiel #45
0
def random_forest(num_classes=2,
                  num_features=46,
                  num_trees=100,
                  max_nodes=10000):
    """Build a tensor_forest random-forest graph in the default graph.

    Args:
        num_classes: number of target classes.
        num_features: width of the float32 feature placeholder.
        num_trees: number of trees in the forest.
        max_nodes: maximum number of nodes per tree.

    Returns:
        Tuple ``(infer_op, accuracy_op, train_op, loss_op, X, Y)`` where ``X``
        and ``Y`` are the feature and label placeholders.
    """
    features = tf.placeholder(tf.float32, shape=[None, num_features])
    # Random forest labels must be integers (the class id).
    labels = tf.placeholder(tf.int32, shape=[None])

    # Random forest parameters.
    forest_params = tensor_forest.ForestHParams(
        num_classes=num_classes,
        num_features=num_features,
        num_trees=num_trees,
        max_nodes=max_nodes,
    ).fill()
    forest = tensor_forest.RandomForestGraphs(forest_params)

    train_op = forest.training_graph(features, labels)
    loss_op = forest.training_loss(features, labels)

    # Accuracy: fraction of rows whose arg-max prediction equals the label.
    infer_op = forest.inference_graph(features)[0]
    hits = tf.equal(tf.argmax(infer_op, 1), tf.cast(labels, tf.int64))
    accuracy_op = tf.reduce_mean(tf.cast(hits, tf.float32))

    # The init op is added to the graph but not returned; no session is
    # created or run here.
    tf.group(
        tf.global_variables_initializer(),
        resources.initialize_resources(resources.shared_resources()),
    )

    return infer_op, accuracy_op, train_op, loss_op, features, labels
Beispiel #46
0
    def __init__(self, num_features, num_classes, num_trees, max_nodes):
        """Build the random-forest graph; no session is created here.

        Resets the default graph and then stores the placeholders, forest
        hyper-parameters, training/loss/inference/accuracy ops and the
        initializer op on the instance.

        Args:
            num_features: width of the float32 feature placeholder ``self.X``.
            num_classes: number of target classes.
            num_trees: number of trees in the forest.
            max_nodes: maximum number of nodes per tree.
        """
        # Start from a clean default graph; anything built earlier is dropped.
        tf.reset_default_graph()

        self.X = tf.placeholder(tf.float32, shape=[None, num_features])
        self.Y = tf.placeholder(tf.int32, shape=[None])

        self.hparams = tensor_forest.ForestHParams(num_classes=num_classes,
                                                   num_features=num_features,
                                                   num_trees=num_trees,
                                                   max_nodes=max_nodes).fill()
        # Build the graph. (Leftover "test 1"/"test 2" debug prints removed so
        # that constructing the model no longer writes to stdout.)
        self.forest_graph = tensor_forest.RandomForestGraphs(self.hparams)

        self.train_op = self.forest_graph.training_graph(self.X, self.Y)
        self.loss_op = self.forest_graph.training_loss(self.X, self.Y)

        infer_op, _, _ = self.forest_graph.inference_graph(self.X)
        self.infer_op = infer_op

        # A prediction is correct when its arg-max equals the label.
        self.correct_pred = tf.equal(tf.argmax(self.infer_op, 1),
                                     tf.cast(self.Y, tf.int64))

        self.accuracy_op = tf.reduce_mean(
            tf.cast(self.correct_pred, tf.float32))

        # Initializes global variables together with shared resources.
        self.init = tf.group(
            tf.global_variables_initializer(),
            resources.initialize_resources(resources.shared_resources()))

        # No session yet; left as None for later assignment.
        self.sess = None
Beispiel #47
0
def train_and_test_model(train_features, train_labels, train_targets,
                         test_features, test_targets, test_labels):
    """Trains and tests random forest model.

    Builds a ``RandomForest`` from a default ``Config``, initializes global
    variables and shared resources in a fresh session, trains, prints the
    final training loss and then runs the test pass.

    Args:
        train_features: training features
        train_labels: training labels, integer values
        train_targets: one-hot row vector of labels
        test_features: testing features
        test_targets: one-hot row vector of test labels
        test_labels: labels for testing
    Returns:
        The final training loss returned by ``do_train``.
    """
    # initialize model and build it
    config = Config()
    forest_model = RandomForest(config)
    forest_model.build()

    # initialize tensorflow variables
    init_vars = tf.group(
        tf.global_variables_initializer(),
        resources.initialize_resources(resources.shared_resources()))

    with tf.Session() as session:
        session.run(init_vars)

        # train
        loss = forest_model.do_train(session, train_features, train_labels,
                                     train_targets)
        # Parenthesised single-argument form: identical output on Python 2,
        # and no longer a SyntaxError on Python 3.
        print("Final train loss: {}".format(loss))

        # test
        forest_model.do_test(session, test_features, test_targets, test_labels)
    return loss
  def testInactive(self):
    """Checks that an inactive equality split handler yields no splits.

    Stats are pushed with both ``is_active`` flags set to False; the handler
    must still report ready while returning empty partitions, gains and
    splits.
    """
    with self.test_session() as sess:
      gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
      hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
      partition_ids = [0, 0, 0, 1]
      column_indices = [[0, 0], [0, 1], [2, 0], [3, 0]]
      column_values = array_ops.constant([1, 2, 2, 1], dtype=dtypes.int64)

      gradient_shape = tensor_shape.scalar()
      hessian_shape = tensor_shape.scalar()
      class_id = -1

      int_column = sparse_tensor.SparseTensor(
          column_indices, column_values, [4, 1])
      split_handler = categorical_split_handler.EqualitySplitHandler(
          l1_regularization=0.1,
          l2_regularization=1,
          tree_complexity_regularization=0,
          min_node_weight=0,
          sparse_int_column=int_column,
          feature_column_group_id=0,
          gradient_shape=gradient_shape,
          hessian_shape=hessian_shape,
          multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS,
          init_stamp_token=0)
      resources.initialize_resources(resources.shared_resources()).run()

      empty_gradients, empty_hessians = get_empty_tensors(
          gradient_shape, hessian_shape)
      example_weights = array_ops.ones([4, 1], dtypes.float32)

      # Both activity flags are off for this update.
      inactive_flags = array_ops.constant([False, False])
      stats_update = split_handler.update_stats_sync(
          0,
          partition_ids,
          gradients,
          hessians,
          empty_gradients,
          empty_hessians,
          example_weights,
          is_active=inactive_flags)
      # Splits are only generated after the stats update has run.
      with ops.control_dependencies([stats_update]):
        split_tensors = split_handler.make_splits(0, 1, class_id)
        are_splits_ready, partitions, gains, splits = sess.run(
            list(split_tensors))
    self.assertTrue(are_splits_ready)
    self.assertEqual(len(partitions), 0)
    self.assertEqual(len(gains), 0)
    self.assertEqual(len(splits), 0)
  def testExcludeNonFinalTree(self):
    """Checks that a non-finalized tree does not contribute to predictions.

    The ensemble holds a finalized bias tree and a second tree marked as not
    finalized; with WHOLE_TREE growing mode every example is expected to get
    the bias value only.
    """
    with self.test_session():
      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
      # Bias tree.
      tree1 = tree_ensemble_config.trees.add()
      tree_ensemble_config.tree_metadata.add().is_finalized = True
      _append_to_leaf(tree1.nodes.add().leaf, 0, -0.4)

      # Depth 3 tree, explicitly marked as not finalized.
      tree2 = tree_ensemble_config.trees.add()
      tree_ensemble_config.tree_metadata.add().is_finalized = False
      _set_float_split(tree2.nodes.add().dense_float_binary_split, 0, 9.0, 1, 2)
      _set_float_split(tree2.nodes.add()
                       .sparse_float_binary_split_default_left.split, 0, -20.0,
                       3, 4)
      _append_to_leaf(tree2.nodes.add().leaf, 0, 0.5)
      _append_to_leaf(tree2.nodes.add().leaf, 0, 1.2)
      _set_categorical_id_split(tree2.nodes.add().categorical_id_binary_split,
                                0, 9, 5, 6)
      _append_to_leaf(tree2.nodes.add().leaf, 0, -0.9)
      _append_to_leaf(tree2.nodes.add().leaf, 0, 0.7)

      # One weight per tree.
      tree_ensemble_config.tree_weights.append(1.0)
      tree_ensemble_config.tree_weights.append(1.0)

      tree_ensemble_handle = model_ops.tree_ensemble_variable(
          stamp_token=0,
          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
          name="full_ensemble")
      resources.initialize_resources(resources.shared_resources()).run()

      # Prepare learner config.
      learner_config = learner_pb2.LearnerConfig()
      learner_config.num_classes = 2
      learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE

      result, dropout_info = self._get_predictions(
          tree_ensemble_handle,
          learner_config=learner_config.SerializeToString(),
          reduce_dim=True)

      # All the examples should get only the bias since the second tree is
      # non-finalized.
      self.assertAllClose([[-0.4], [-0.4]], result.eval())

      # Empty dropout.
      self.assertAllEqual([[], []], dropout_info.eval())
  def testFullEnsembleMultiNotClassTreePerClassStrategyDenseVector(self):
    """Predicts three-class logits from trees with dense vector leaves.

    The learner config uses the FULL_HESSIAN multi-class strategy and the
    predictions are requested with reduce_dim=False, so every example gets
    one value per class.
    """
    with self.test_session():
      tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
      # Bias tree: leaf vector [0, -0.2, -2], i.e. no bias for class 0.
      tree1 = tree_ensemble_config.trees.add()
      tree_ensemble_config.tree_metadata.add().is_finalized = True
      _append_multi_values_to_dense_leaf(tree1.nodes.add().leaf, [0, -0.2, -2])

      # Depth 2 tree.
      tree2 = tree_ensemble_config.trees.add()
      tree_ensemble_config.tree_metadata.add().is_finalized = True
      _set_float_split(tree2.nodes.add()
                       .sparse_float_binary_split_default_right.split, 1, 4.0,
                       1, 2)
      _set_float_split(tree2.nodes.add().dense_float_binary_split, 0, 9.0, 3, 4)
      _append_multi_values_to_dense_leaf(tree2.nodes.add().leaf, [0.5, 0, 0])
      _append_multi_values_to_dense_leaf(tree2.nodes.add().leaf, [0, 1.2, -0.7])
      _append_multi_values_to_dense_leaf(tree2.nodes.add().leaf, [-0.9, 0, 0])

      # One weight per tree.
      tree_ensemble_config.tree_weights.append(1.0)
      tree_ensemble_config.tree_weights.append(1.0)

      tree_ensemble_handle = model_ops.tree_ensemble_variable(
          stamp_token=0,
          tree_ensemble_config=tree_ensemble_config.SerializeToString(),
          name="ensemble_multi_class")
      resources.initialize_resources(resources.shared_resources()).run()

      # Prepare learner config.
      learner_config = learner_pb2.LearnerConfig()
      learner_config.num_classes = 3
      learner_config.multi_class_strategy = (
          learner_pb2.LearnerConfig.FULL_HESSIAN)

      result, dropout_info = self._get_predictions(
          tree_ensemble_handle,
          learner_config=learner_config.SerializeToString(),
          reduce_dim=False)
      # The first example will get the bias (-0.2 for class 1 and -2 for
      # class 2) from the first tree plus the leaf 2 payload (sparse feature
      # missing) of 0.5 for class 0, hence [0.5, -0.2, -2.0]. The second
      # example will get the same bias plus the leaf 3 payload of 1.2 for
      # class 1 and -0.7 for class 2, hence [0.0, 1.0, -2.7].
      self.assertAllClose([[0.5, -0.2, -2.0], [0, 1.0, -2.7]], result.eval())

      # Empty dropout.
      self.assertAllEqual([[], []], dropout_info.eval())
  def testStreamingQuantileBucketsLowPrecisionInput(self):
    """Tests inputs that simulate low precision float16 values."""

    num_quantiles = 3
    # generate_quantiles is set to True since the test would generate fewer
    # boundaries otherwise.
    with self.test_session():
      accumulator = quantile_ops.QuantileAccumulator(
          init_stamp_token=0,
          num_quantiles=num_quantiles,
          epsilon=0.001,
          name="q1",
          generate_quantiles=True)
      resources.initialize_resources(resources.shared_resources()).run()
    column_ph = array_ops.placeholder(dtypes.float32)
    weights_ph = array_ops.placeholder(dtypes.float32)
    add_summary_op = accumulator.add_summary(
        stamp_token=0, column=column_ph, example_weights=weights_ph)

    # These inputs are the integers in the range [2030, 2060] as they appear
    # with float16 precision. Integers <= 2048 are represented exactly,
    # whereas numbers > 2048 are rounded and hence repeated. For precision
    # loss / rounding, see:
    # https://en.wikipedia.org/wiki/Half-precision_floating-point_format.
    #
    # The intent of the test is not the handling of float16 values, but to
    # validate that the expected number of buckets is returned when the input
    # may contain repeated values.
    inputs = [
        2030.0, 2031.0, 2032.0, 2033.0, 2034.0, 2035.0, 2036.0, 2037.0,
        2038.0, 2039.0, 2040.0, 2041.0, 2042.0, 2043.0, 2044.0, 2045.0,
        2046.0, 2047.0, 2048.0, 2048.0, 2050.0, 2052.0, 2052.0, 2052.0,
        2054.0, 2056.0, 2056.0, 2056.0, 2058.0, 2060.0
    ]
    with self.test_session() as sess:
      feed = {column_ph: inputs, weights_ph: [1] * len(inputs)}
      sess.run(add_summary_op, feed)

    with self.test_session() as sess:
      sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
      ready_t, buckets_t = accumulator.get_buckets(stamp_token=1)
      bucket_values, is_ready = sess.run([buckets_t, ready_t])
      self.assertEqual(True, is_ready)
      self.assertEqual(num_quantiles + 1, len(bucket_values))
      self.assertAllEqual([2030, 2040, 2050, 2060], bucket_values)
  def testWithExistingEnsemble(self):
    """Adds a tree to a non-empty ensemble and checks the merged state.

    After the add, the ensemble must contain three trees with the new one
    last, all tree weights at 1.0, and the feature usage counts and gains
    increased by the values passed to the op.
    """
    with self.test_session():
      # Create existing tree ensemble.
      tree_ensemble_handle = model_ops.tree_ensemble_variable(
          stamp_token=0,
          tree_ensemble_config=self._tree_ensemble.SerializeToString(),
          name="existing")
      # Create non-zero feature importance.
      feature_usage_counts = variables.Variable(
          initial_value=np.array([0, 4, 1], np.int64),
          name="feature_usage_counts",
          trainable=False)
      feature_gains = variables.Variable(
          initial_value=np.array([0.0, 0.3, 0.05], np.float32),
          name="feature_gains",
          trainable=False)

      resources.initialize_resources(resources.shared_resources()).run()
      # global_variables_initializer() replaces the deprecated
      # initialize_all_variables().
      variables.global_variables_initializer().run()
      output_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
      # Serialize only after the add op has run.
      with ops.control_dependencies([
          ensemble_optimizer_ops.add_trees_to_ensemble(
              tree_ensemble_handle,
              self._ensemble_to_add.SerializeToString(),
              feature_usage_counts, [1, 2, 0],
              feature_gains, [0.02, 0.1, 0.0], [[], []],
              learning_rate=1)
      ]):
        output_ensemble.ParseFromString(
            model_ops.tree_ensemble_serialize(tree_ensemble_handle)[1].eval())

      # Output.
      self.assertEqual(3, len(output_ensemble.trees))
      self.assertProtoEquals(self._tree_to_add, output_ensemble.trees[2])

      self.assertAllEqual([1.0, 1.0, 1.0], output_ensemble.tree_weights)

      self.assertEqual(2,
                       output_ensemble.tree_metadata[0].num_tree_weight_updates)
      self.assertEqual(3,
                       output_ensemble.tree_metadata[1].num_tree_weight_updates)
      self.assertEqual(1,
                       output_ensemble.tree_metadata[2].num_tree_weight_updates)
      # Counts: [0, 4, 1] + [1, 2, 0]; gains: [0.0, 0.3, 0.05] + [0.02, 0.1, 0.0].
      self.assertAllEqual([1, 6, 1], feature_usage_counts.eval())
      self.assertArrayNear([0.02, 0.4, 0.05], feature_gains.eval(), 1e-6)
  def testStreamingQuantileBucketsWithVaryingBatch(self):
    """Sets up the quantile summary op test as follows.

    Creates batches of examples with a different number of inputs in each
    batch. The input values are dense in the range [1 ... N].
    The data looks like this:
    | Batch | Start | InputList
    |   1   |   1   |  [1]
    |   2   |   2   |  [2, 3]
    |   3   |   4   |  [4, 5, 6]
    |   4   |   7   |  [7, 8, 9, 10]
    |   5   |  11   |  [11, 12, 13, 14, 15]
    |   6   |  16   |  [16, 17, 18, 19, 20, 21]
    """

    num_quantiles = 3
    with self.test_session():
      accumulator = quantile_ops.QuantileAccumulator(
          init_stamp_token=0,
          num_quantiles=num_quantiles,
          epsilon=0.001,
          name="q1")
      resources.initialize_resources(resources.shared_resources()).run()
    column_ph = array_ops.placeholder(dtypes.float32)
    weights_ph = array_ops.placeholder(dtypes.float32)
    add_summary_op = accumulator.add_summary(
        stamp_token=0, column=column_ph, example_weights=weights_ph)

    with self.test_session() as sess:
      for batch_size in range(1, 23):
        # first_value = 1, 2, 4, 7, 11, 16 ... (see the table above).
        first_value = int((batch_size * (batch_size - 1) / 2) + 1)
        feed = {
            column_ph: range(first_value, first_value + batch_size),
            weights_ph: [1] * batch_size,
        }
        sess.run(add_summary_op, feed)

    with self.test_session() as sess:
      sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
      ready_t, buckets_t = accumulator.get_buckets(stamp_token=1)
      bucket_values, is_ready = sess.run([buckets_t, ready_t])
      self.assertEqual(True, is_ready)
      self.assertEqual(num_quantiles + 1, len(bucket_values))
      self.assertAllEqual([1, 86., 170., 253.], bucket_values)
  def testEnsembleEmpty(self):
    """Partitions examples against an ensemble that contains no trees.

    The op is expected to return partition id 0 for both entries.
    """
    with self.test_session():
      empty_config = tree_config_pb2.DecisionTreeEnsembleConfig()

      ensemble_handle = model_ops.tree_ensemble_variable(
          stamp_token=0,
          tree_ensemble_config=empty_config.SerializeToString(),
          name="full_ensemble")
      resources.initialize_resources(resources.shared_resources()).run()

      # Feature inputs, grouped the way the op takes them positionally.
      dense_floats = [self._dense_float_tensor]
      sparse_float_indices = [
          self._sparse_float_indices1, self._sparse_float_indices2
      ]
      sparse_float_values = [
          self._sparse_float_values1, self._sparse_float_values2
      ]
      sparse_float_shapes = [
          self._sparse_float_shape1, self._sparse_float_shape2
      ]
      sparse_int_indices = [self._sparse_int_indices1]
      sparse_int_values = [self._sparse_int_values1]
      sparse_int_shapes = [self._sparse_int_shape1]

      partition_ids = prediction_ops.gradient_trees_partition_examples(
          ensemble_handle, dense_floats, sparse_float_indices,
          sparse_float_values, sparse_float_shapes, sparse_int_indices,
          sparse_int_values, sparse_int_shapes)

      self.assertAllEqual([0, 0], partition_ids.eval())
  def testWithEmptyEnsembleAndShrinkage(self):
    """Adds a tree to an empty ensemble with a learning rate of 0.0001.

    The new tree's weight must equal the learning rate, the usage counts must
    equal the passed counts, and the feature gains must be the passed gains
    scaled by the learning rate.
    """
    with self.test_session():
      # Add shrinkage config.
      learning_rate = 0.0001
      tree_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
      tree_ensemble_handle = model_ops.tree_ensemble_variable(
          stamp_token=0,
          tree_ensemble_config=tree_ensemble.SerializeToString(),
          name="existing")

      # Create zero feature importance.
      feature_usage_counts = variables.Variable(
          initial_value=np.array([0, 0], np.int64),
          name="feature_usage_counts",
          trainable=False)
      feature_gains = variables.Variable(
          initial_value=np.array([0.0, 0.0], np.float32),
          name="feature_gains",
          trainable=False)

      resources.initialize_resources(resources.shared_resources()).run()
      # global_variables_initializer() replaces the deprecated
      # initialize_all_variables().
      variables.global_variables_initializer().run()

      output_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
      # Serialize only after the add op has run.
      with ops.control_dependencies([
          ensemble_optimizer_ops.add_trees_to_ensemble(
              tree_ensemble_handle,
              self._ensemble_to_add.SerializeToString(),
              feature_usage_counts, [1, 2],
              feature_gains, [0.5, 0.3], [[], []],
              learning_rate=learning_rate)
      ]):
        output_ensemble.ParseFromString(
            model_ops.tree_ensemble_serialize(tree_ensemble_handle)[1].eval())

      # New tree is added with shrinkage weight.
      self.assertAllClose([learning_rate], output_ensemble.tree_weights)
      self.assertEqual(1,
                       output_ensemble.tree_metadata[0].num_tree_weight_updates)
      self.assertAllEqual([1, 2], feature_usage_counts.eval())
      # Gains are expected to be scaled by the learning rate.
      self.assertArrayNear([0.5 * learning_rate, 0.3 * learning_rate],
                           feature_gains.eval(), 1e-6)
  def testWithEmptyEnsemble(self):
    """Adds a tree to an empty ensemble with a learning rate of 1.0.

    The resulting ensemble must hold exactly the added tree with weight 1.0,
    and the feature usage counts and gains must equal the values passed to
    the op.
    """
    with self.test_session():
      # Create an empty ensemble.
      tree_ensemble_handle = model_ops.tree_ensemble_variable(
          stamp_token=0, tree_ensemble_config="", name="empty")

      # Create zero feature importance.
      feature_usage_counts = variables.Variable(
          initial_value=array_ops.zeros([1], dtypes.int64),
          name="feature_usage_counts",
          trainable=False)
      feature_gains = variables.Variable(
          initial_value=array_ops.zeros([1], dtypes.float32),
          name="feature_gains",
          trainable=False)

      resources.initialize_resources(resources.shared_resources()).run()
      # global_variables_initializer() replaces the deprecated
      # initialize_all_variables().
      variables.global_variables_initializer().run()

      # Serialize only after the add op has run.
      with ops.control_dependencies([
          ensemble_optimizer_ops.add_trees_to_ensemble(
              tree_ensemble_handle,
              self._ensemble_to_add.SerializeToString(),
              feature_usage_counts, [2],
              feature_gains, [0.4], [[]],
              learning_rate=1.0)
      ]):
        result = model_ops.tree_ensemble_serialize(tree_ensemble_handle)[1]

      # Output.
      output_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
      output_ensemble.ParseFromString(result.eval())
      self.assertProtoEquals(self._tree_to_add, output_ensemble.trees[0])
      self.assertEqual(1, len(output_ensemble.trees))

      self.assertAllEqual([1.0], output_ensemble.tree_weights)

      self.assertEqual(1,
                       output_ensemble.tree_metadata[0].num_tree_weight_updates)

      self.assertAllEqual([2], feature_usage_counts.eval())
      self.assertArrayNear([0.4], feature_gains.eval(), 1e-6)