def testBiasEnsembleMultiClass(self):
  with self.test_session():
    tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
    tree = tree_ensemble_config.trees.add()
    tree_ensemble_config.tree_metadata.add().is_finalized = True
    leaf = tree.nodes.add().leaf
    _append_to_leaf(leaf, 0, -0.4)
    _append_to_leaf(leaf, 1, 0.9)

    tree_ensemble_config.tree_weights.append(1.0)

    tree_ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=0,
        tree_ensemble_config=tree_ensemble_config.SerializeToString(),
        name="multiclass")
    resources.initialize_resources(resources.shared_resources()).run()

    # Prepare learner config.
    learner_config = learner_pb2.LearnerConfig()
    learner_config.num_classes = 3

    result, dropout_info = self._get_predictions(
        tree_ensemble_handle,
        learner_config=learner_config.SerializeToString(),
        reduce_dim=True)
    self.assertAllClose([[-0.4, 0.9], [-0.4, 0.9]], result.eval())

    # Empty dropout.
    self.assertAllEqual([[], []], dropout_info.eval())
def testTreeFinalized(self):
  with self.test_session():
    tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
    # Depth 3 tree.
    tree1 = tree_ensemble_config.trees.add()
    _set_float_split(tree1.nodes.add().dense_float_binary_split, 0, 9.0, 1, 2)
    _set_float_split(
        tree1.nodes.add().sparse_float_binary_split_default_left.split, 0,
        -20.0, 3, 4)
    _append_to_leaf(tree1.nodes.add().leaf, 0, 0.2)
    _append_to_leaf(tree1.nodes.add().leaf, 0, 0.3)
    _set_categorical_id_split(tree1.nodes.add().categorical_id_binary_split,
                              0, 9, 5, 6)
    _append_to_leaf(tree1.nodes.add().leaf, 0, 0.5)
    _append_to_leaf(tree1.nodes.add().leaf, 0, 0.6)

    tree_ensemble_config.tree_weights.append(1.0)
    tree_ensemble_config.tree_metadata.add().is_finalized = True

    tree_ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=0,
        tree_ensemble_config=tree_ensemble_config.SerializeToString(),
        name="full_ensemble")
    resources.initialize_resources(resources.shared_resources()).run()

    result = prediction_ops.gradient_trees_partition_examples(
        tree_ensemble_handle, [self._dense_float_tensor],
        [self._sparse_float_indices1, self._sparse_float_indices2],
        [self._sparse_float_values1, self._sparse_float_values2],
        [self._sparse_float_shape1, self._sparse_float_shape2],
        [self._sparse_int_indices1], [self._sparse_int_values1],
        [self._sparse_int_shape1])

    self.assertAllEqual([0, 0], result.eval())
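# The two tests above (and several below) build DecisionTreeEnsembleConfig
# protos through small module-level helpers that are not shown in these
# excerpts. The following is a minimal sketch of those helpers, inferred
# from how they are called here; field names follow the tf.contrib
# boosted_trees tree_config_pb2 schema and should be treated as assumptions.
def _append_to_leaf(leaf, class_id, weight):
  """Appends a (class_id, weight) entry to the leaf's sparse vector."""
  leaf.sparse_vector.index.append(class_id)
  leaf.sparse_vector.value.append(weight)


def _set_float_split(split, feature_col, threshold, left_id, right_id):
  """Fills in a float binary split node."""
  split.feature_column = feature_col
  split.threshold = threshold
  split.left_id = left_id
  split.right_id = right_id


def _set_categorical_id_split(split, feature_col, feature_id, left_id,
                              right_id):
  """Fills in a categorical-id binary split node."""
  split.feature_column = feature_col
  split.feature_id = feature_id
  split.left_id = left_id
  split.right_id = right_id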
def __init__(self, num_quantiles, epsilon, serialized_tf_config=None):
  self._num_quantiles = num_quantiles
  self._epsilon = epsilon
  self._serialized_tf_config = serialized_tf_config

  # _stamp_token is used to commit the state of the qaccumulator. In
  # this case, the qaccumulator state is completely returned and stored
  # as part of quantile_state/summary in the combiner fn (i.e. the summary
  # is extracted and stored outside the qaccumulator). So we don't use
  # the timestamp mechanism to signify progress in the qaccumulator state.
  self._stamp_token = 0
  # Represents an empty summary. This could be changed to a tf.constant
  # implemented by the quantile ops library.
  self._empty_summary = None

  # Create a new session with a new graph for quantile ops.
  self._session = tf.Session(
      graph=tf.Graph(),
      config=_maybe_deserialize_tf_config(serialized_tf_config))
  with self._session.graph.as_default():
    with self._session.as_default():
      self._qaccumulator = quantile_ops.QuantileAccumulator(
          init_stamp_token=self._stamp_token,
          num_quantiles=self._num_quantiles,
          epsilon=self._epsilon,
          name='qaccumulator')
      resources.initialize_resources(resources.shared_resources()).run()
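# _maybe_deserialize_tf_config is referenced above but not defined in this
# excerpt. A plausible minimal sketch, assuming the serialized form is a
# tf.ConfigProto byte string (a hypothetical helper, not confirmed by the
# source):
def _maybe_deserialize_tf_config(serialized_tf_config):
  if serialized_tf_config is None:
    return None
  result = tf.ConfigProto()
  result.ParseFromString(serialized_tf_config)
  return result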
def testAverageMoreThanNumTreesExist(self):
  with self.test_session():
    tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
    adjusted_tree_ensemble_config = (
        tree_config_pb2.DecisionTreeEnsembleConfig())
    # When we ask to average over more trees than exist, it averages across
    # all trees.
    total_num = 100
    for i in range(0, total_num):
      tree = tree_ensemble_config.trees.add()
      _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)

      tree_ensemble_config.tree_metadata.add().is_finalized = True
      tree_ensemble_config.tree_weights.append(1.0)

      # This is how the weights will look after averaging.
      copy_tree = adjusted_tree_ensemble_config.trees.add()
      _append_to_leaf(copy_tree.nodes.add().leaf, 0, -0.4)

      adjusted_tree_ensemble_config.tree_metadata.add().is_finalized = True
      adjusted_tree_ensemble_config.tree_weights.append(
          1.0 * (total_num - i) / total_num)

    # Prepare learner config WITH AVERAGING.
    learner_config = learner_pb2.LearnerConfig()
    learner_config.num_classes = 2
    # We have only 100 trees but we ask to average over 250.
    learner_config.averaging_config.average_last_n_trees = 250

    # No averaging config.
    learner_config_no_averaging = learner_pb2.LearnerConfig()
    learner_config_no_averaging.num_classes = 2

    tree_ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=0,
        tree_ensemble_config=tree_ensemble_config.SerializeToString(),
        name="existing")

    # This is how our ensemble will "look" during averaging.
    adjusted_tree_ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=0,
        tree_ensemble_config=adjusted_tree_ensemble_config.SerializeToString(),
        name="adjusted")

    resources.initialize_resources(resources.shared_resources()).run()

    result, dropout_info = self._get_predictions(
        tree_ensemble_handle,
        learner_config.SerializeToString(),
        apply_averaging=True,
        reduce_dim=True)

    pattern_result, pattern_dropout_info = self._get_predictions(
        adjusted_tree_ensemble_handle,
        learner_config_no_averaging.SerializeToString(),
        apply_averaging=False,
        reduce_dim=True)

    self.assertAllEqual(result.eval(), pattern_result.eval())
    self.assertAllEqual(dropout_info.eval(), pattern_dropout_info.eval())
def testCachedPredictionOnEmptyEnsemble(self):
  """Tests that prediction on a dummy ensemble does not fail."""
  with self.cached_session() as session:
    # Create a dummy ensemble.
    tree_ensemble = boosted_trees_ops.TreeEnsemble(
        'ensemble', serialized_proto='')
    tree_ensemble_handle = tree_ensemble.resource_handle
    resources.initialize_resources(resources.shared_resources()).run()

    # No previous cached values.
    cached_tree_ids = [0, 0]
    cached_node_ids = [0, 0]

    # We have two features: 0 and 1. Values don't matter here on a dummy
    # ensemble.
    feature_0_values = [67, 5]
    feature_1_values = [9, 17]

    # Grow tree ensemble.
    predict_op = boosted_trees_ops.training_predict(
        tree_ensemble_handle,
        cached_tree_ids=cached_tree_ids,
        cached_node_ids=cached_node_ids,
        bucketized_features=[feature_0_values, feature_1_values],
        logits_dimension=1)

    logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)

    # Nothing changed.
    self.assertAllClose(cached_tree_ids, new_tree_ids)
    self.assertAllClose(cached_node_ids, new_node_ids)
    self.assertAllClose([[0], [0]], logits_updates)
def testBasicQuantileBucketsMultipleResources(self):
  with self.test_session() as sess:
    quantile_accumulator_handle_0 = self.create_resource(
        "float_0", self.eps, self.max_elements)
    quantile_accumulator_handle_1 = self.create_resource(
        "float_1", self.eps, self.max_elements)
    resources.initialize_resources(resources.shared_resources()).run()
    summaries = boosted_trees_ops.make_quantile_summaries(
        [self._feature_0, self._feature_1],
        self._example_weights,
        epsilon=self.eps)
    summary_op_0 = boosted_trees_ops.quantile_add_summaries(
        quantile_accumulator_handle_0, [summaries[0]])
    summary_op_1 = boosted_trees_ops.quantile_add_summaries(
        quantile_accumulator_handle_1, [summaries[1]])
    flush_op_0 = boosted_trees_ops.quantile_flush(
        quantile_accumulator_handle_0, self.num_quantiles)
    flush_op_1 = boosted_trees_ops.quantile_flush(
        quantile_accumulator_handle_1, self.num_quantiles)
    bucket_0 = boosted_trees_ops.get_bucket_boundaries(
        quantile_accumulator_handle_0, num_features=1)
    bucket_1 = boosted_trees_ops.get_bucket_boundaries(
        quantile_accumulator_handle_1, num_features=1)
    quantiles = boosted_trees_ops.boosted_trees_bucketize(
        [self._feature_0, self._feature_1], bucket_0 + bucket_1)
    sess.run([summary_op_0, summary_op_1])
    sess.run([flush_op_0, flush_op_1])
    self.assertAllClose(self._feature_0_boundaries, bucket_0[0].eval())
    self.assertAllClose(self._feature_1_boundaries, bucket_1[0].eval())

    self.assertAllClose(self._feature_0_quantiles, quantiles[0].eval())
    self.assertAllClose(self._feature_1_quantiles, quantiles[1].eval())
def testCreate(self):
  with self.cached_session():
    tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
    tree = tree_ensemble_config.trees.add()
    _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
    tree_ensemble_config.tree_weights.append(1.0)

    # Prepare learner config.
    learner_config = learner_pb2.LearnerConfig()
    learner_config.num_classes = 2

    tree_ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=3,
        tree_ensemble_config=tree_ensemble_config.SerializeToString(),
        name="create_tree")
    resources.initialize_resources(resources.shared_resources()).run()

    result, _ = prediction_ops.gradient_trees_prediction(
        tree_ensemble_handle,
        self._seed, [self._dense_float_tensor],
        [self._sparse_float_indices1, self._sparse_float_indices2],
        [self._sparse_float_values1, self._sparse_float_values2],
        [self._sparse_float_shape1, self._sparse_float_shape2],
        [self._sparse_int_indices1], [self._sparse_int_values1],
        [self._sparse_int_shape1],
        learner_config=learner_config.SerializeToString(),
        apply_dropout=False,
        apply_averaging=False,
        center_bias=False,
        reduce_dim=True)
    self.assertAllClose(result.eval(), [[-0.4], [-0.4]])

    stamp_token = model_ops.tree_ensemble_stamp_token(tree_ensemble_handle)
    self.assertEqual(stamp_token.eval(), 3)
def testBasicCallableParams(self):
  for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
    with self.cached_session():
      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
      lr = lambda: 3.0
      sgd_op = gradient_descent.GradientDescentOptimizer(lr).apply_gradients(
          zip([grads0, grads1], [var0, var1]))
      # TODO(apassos) calling initialize_resources on all resources here
      # doesn't work because the sessions and graph are reused across unit
      # tests and this would mean trying to reinitialize variables. Figure out
      # a long-term solution for this.
      resources.initialize_resources([var0, var1]).run()
      # Fetch params to validate initial values
      self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
      self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
      # Run 1 step of sgd
      sgd_op.run()
      # Validate updated params
      self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
                                         var0.eval())
      self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
                                         var1.eval())
def testBasicCallableParams(self):
  for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
    with self.cached_session():
      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
      lr = lambda: 3.0
      sgd_op = gradient_descent.SGD(lr).apply_gradients(
          zip([grads0, grads1], [var0, var1]))
      # TODO(apassos) calling initialize_resources on all resources here
      # doesn't work because the sessions and graph are reused across unit
      # tests and this would mean trying to reinitialize variables. Figure out
      # a long-term solution for this.
      resources.initialize_resources([var0, var1]).run()
      # Fetch params to validate initial values
      self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
      self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
      # Run 1 step of sgd
      sgd_op.run()
      # Validate updated params
      self.assertAllCloseAccordingToType(
          [1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1], var0.eval())
      self.assertAllCloseAccordingToType(
          [3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01], var1.eval())
def _testStreamingQuantileBucketsHelper(
    self, inputs, num_quantiles=3, expected_buckets=None):
  """Helper to test quantile buckets on different inputs."""

  # Set generate_quantiles to True since the test will generate fewer
  # boundaries otherwise.
  with self.test_session() as sess:
    accumulator = quantile_ops.QuantileAccumulator(
        init_stamp_token=0,
        num_quantiles=num_quantiles,
        epsilon=0.001,
        name="q1",
        generate_quantiles=True)
    resources.initialize_resources(resources.shared_resources()).run()
    input_column = array_ops.placeholder(dtypes.float32)
    weights = array_ops.placeholder(dtypes.float32)
    update = accumulator.add_summary(
        stamp_token=0, column=input_column, example_weights=weights)

  with self.test_session() as sess:
    sess.run(update, {input_column: inputs, weights: [1] * len(inputs)})

  with self.test_session() as sess:
    sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
    are_ready_flush, buckets = accumulator.get_buckets(stamp_token=1)
    buckets, are_ready_flush = sess.run([buckets, are_ready_flush])
    self.assertEqual(True, are_ready_flush)
    # By default, use 3 quantiles, 4 boundaries for simplicity.
    self.assertEqual(num_quantiles + 1, len(buckets))
    if expected_buckets:
      self.assertAllEqual(buckets, expected_buckets)
def _get_train_op_and_ensemble(self, head, config, is_classification,
                               train_in_memory):
  """Calls bt_model_fn() and returns the train_op and ensemble_serialized."""
  features, labels = _make_train_input_fn(is_classification)()
  estimator_spec = boosted_trees._bt_model_fn(  # pylint:disable=protected-access
      features=features,
      labels=labels,
      mode=model_fn.ModeKeys.TRAIN,
      head=head,
      feature_columns=self._feature_columns,
      tree_hparams=self._tree_hparams,
      example_id_column_name=EXAMPLE_ID_COLUMN,
      n_batches_per_layer=1,
      config=config,
      train_in_memory=train_in_memory)
  resources.initialize_resources(resources.shared_resources()).run()
  variables.global_variables_initializer().run()
  variables.local_variables_initializer().run()

  # Gets the train_op and serialized proto of the ensemble.
  shared_resources = resources.shared_resources()
  self.assertEqual(1, len(shared_resources))
  train_op = estimator_spec.train_op
  with ops.control_dependencies([train_op]):
    _, ensemble_serialized = (
        gen_boosted_trees_ops.boosted_trees_serialize_ensemble(
            shared_resources[0].handle))
  return train_op, ensemble_serialized
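# _make_train_input_fn above is defined elsewhere in the test module. A
# hypothetical minimal sketch of its shape, inferred from how it is called
# (the module-level constants named here are assumptions, not from the
# source): it returns an input_fn that yields in-memory (features, labels).
def _make_train_input_fn(is_classification):
  """Makes a train input_fn that returns (features, labels) directly."""
  def _input_fn():
    features = dict(FEATURES_DICT)  # assumed module-level feature tensors
    if is_classification:
      labels = CLASSIFICATION_LABELS  # assumed module-level label tensor
    else:
      labels = REGRESSION_LABELS  # assumed module-level label tensor
    return features, labels
  return _input_fn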
def testStreamingQuantileBuckets(self):
  """Sets up the quantile summary op test as follows.

  100 batches of data are added to the accumulator. The batches are of the
  form:
  [0 1 .. 99]
  [100 101 .. 199]
  ...
  [9900 9901 .. 9999]
  All the batches have 1 for all the example weights.
  """
  with self.test_session() as sess:
    accumulator = quantile_ops.QuantileAccumulator(
        init_stamp_token=0, num_quantiles=3, epsilon=0.01, name="q1")
    resources.initialize_resources(resources.shared_resources()).run()
    weight_placeholder = array_ops.placeholder(dtypes.float32)
    dense_placeholder = array_ops.placeholder(dtypes.float32)
    update = accumulator.add_summary(
        stamp_token=0,
        column=dense_placeholder,
        example_weights=weight_placeholder)

  with self.test_session() as sess:
    for i in range(100):
      dense_float = np.linspace(
          i * 100, (i + 1) * 100 - 1, num=100).reshape(-1, 1)
      sess.run(update, {
          dense_placeholder: dense_float,
          weight_placeholder: np.ones(shape=(100, 1), dtype=np.float32)
      })

  with self.test_session() as sess:
    sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
    are_ready_flush, buckets = accumulator.get_buckets(stamp_token=1)
    buckets, are_ready_flush = sess.run([buckets, are_ready_flush])
    self.assertEqual(True, are_ready_flush)
    self.assertAllEqual([0, 3335., 6671., 9999.], buckets)
def initialize_local_state(self, tf_config=None):
  """Called by the CombineFnWrapper's __init__ method.

  This can be used to set non-pickleable local state. It is used in
  conjunction with overriding __reduce__ so this state is not pickled.
  This method must be called prior to any other method.

  Args:
    tf_config: (optional) A tf.ConfigProto
  """
  # _stamp_token is used to commit the state of the qaccumulator. In
  # this case, the qaccumulator state is completely returned and stored
  # as part of quantile_state/summary in the combiner fn (i.e. the summary
  # is extracted and stored outside the qaccumulator). So we don't use
  # the timestamp mechanism to signify progress in the qaccumulator state.
  self._stamp_token = 0
  # Represents an empty summary. This could be changed to a tf.constant
  # implemented by the quantile ops library.
  self._empty_summary = None

  # Create a new session with a new graph for quantile ops.
  self._session = tf.Session(graph=tf.Graph(), config=tf_config)
  with self._session.graph.as_default():
    with self._session.as_default():
      self._qaccumulator = quantile_ops.QuantileAccumulator(
          init_stamp_token=self._stamp_token,
          num_quantiles=self._num_quantiles,
          epsilon=self._epsilon,
          name='qaccumulator')
      resources.initialize_resources(resources.shared_resources()).run()
def testContribsForOnlyABiasNode(self):
  """Tests the case where, after training, only a bias node is left.

  For example, this could happen if the final ensemble contains one tree
  that got pruned up to the root.
  """
  with self.test_session() as session:
    tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
    text_format.Merge(
        """
        trees {
          nodes {
            leaf {
              scalar: 1.72
            }
          }
        }
        tree_weights: 0.1
        tree_metadata: {
          num_layers_grown: 0
        }
        """, tree_ensemble_config)
    tree_ensemble = boosted_trees_ops.TreeEnsemble(
        'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
    tree_ensemble_handle = tree_ensemble.resource_handle
    resources.initialize_resources(resources.shared_resources()).run()

    # All features are unused.
    feature_0_values = [36, 32]
    feature_1_values = [13, -29]
    feature_2_values = [11, 27]

    # Expected logits are computed by traversing the logit path and
    # subtracting child logits from parent logits.
    bias = 1.72 * 0.1  # Root node of tree_0.
    expected_feature_ids = ((), ())
    expected_logits_paths = ((bias,), (bias,))

    bucketized_features = [
        feature_0_values, feature_1_values, feature_2_values
    ]

    debug_op = boosted_trees_ops.example_debug_outputs(
        tree_ensemble_handle,
        bucketized_features=bucketized_features,
        logits_dimension=1)

    serialized_examples_debug_outputs = session.run(debug_op)
    feature_ids = []
    logits_paths = []
    for example in serialized_examples_debug_outputs:
      example_debug_outputs = boosted_trees_pb2.DebugOutput()
      example_debug_outputs.ParseFromString(example)
      feature_ids.append(example_debug_outputs.feature_ids)
      logits_paths.append(example_debug_outputs.logits_path)

    self.assertAllClose(feature_ids, expected_feature_ids)
    self.assertAllClose(logits_paths, expected_logits_paths)
def testMinimizeResourceVariable(self):
  for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
    with self.cached_session():
      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
      var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
      x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
      pred = math_ops.matmul(var0, x) + var1
      loss = pred * pred
      sgd_op = gradient_descent.SGD(1.0).minimize(loss)
      # TODO(apassos) calling initialize_resources on all resources here
      # doesn't work because the sessions and graph are reused across unit
      # tests and this would mean trying to reinitialize variables. Figure out
      # a long-term solution for this.
      resources.initialize_resources([var0, var1]).run()
      # Fetch params to validate initial values
      self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
      self.assertAllCloseAccordingToType([3.0], var1.eval())
      # Run 1 step of sgd
      sgd_op.run()
      # Validate updated params
      np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
      np_grad = 2 * np_pred
      self.assertAllCloseAccordingToType(
          [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
      self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
def testWithExistingEnsembleAndShrinkage(self):
  with self.test_session():
    # Add shrinkage config.
    learning_rate = 0.0001
    tree_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
    # Add 5 trees with some weights.
    for i in range(0, 5):
      tree = tree_ensemble.trees.add()
      _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
      tree_ensemble.tree_weights.append(i + 1)
      meta = tree_ensemble.tree_metadata.add()
      meta.num_tree_weight_updates = 1
    tree_ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=0,
        tree_ensemble_config=tree_ensemble.SerializeToString(),
        name="existing")

    # Create non-zero feature importance.
    feature_usage_counts = variables.Variable(
        initial_value=np.array([4, 7], np.int64),
        name="feature_usage_counts",
        trainable=False)
    feature_gains = variables.Variable(
        initial_value=np.array([0.2, 0.8], np.float32),
        name="feature_gains",
        trainable=False)

    resources.initialize_resources(resources.shared_resources()).run()
    variables.initialize_all_variables().run()

    output_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
    with ops.control_dependencies([
        ensemble_optimizer_ops.add_trees_to_ensemble(
            tree_ensemble_handle,
            self._ensemble_to_add.SerializeToString(),
            feature_usage_counts, [1, 2],
            feature_gains, [0.5, 0.3], [[], []],
            learning_rate=learning_rate)
    ]):
      output_ensemble.ParseFromString(
          model_ops.tree_ensemble_serialize(tree_ensemble_handle)[1].eval())

    # The weights of previous trees stayed the same; the new tree (LAST) is
    # added with the shrinkage weight.
    self.assertAllClose([1.0, 2.0, 3.0, 4.0, 5.0, learning_rate],
                        output_ensemble.tree_weights)

    # Check that all numbers of updates are equal to 1 (i.e. no old tree
    # weight got adjusted).
    for i in range(0, 6):
      self.assertEqual(
          1, output_ensemble.tree_metadata[i].num_tree_weight_updates)

    # Ensure feature importance was aggregated correctly.
    self.assertAllEqual([5, 9], feature_usage_counts.eval())
    self.assertArrayNear(
        [0.2 + 0.5 * learning_rate, 0.8 + 0.3 * learning_rate],
        feature_gains.eval(), 1e-6)
def testTrainFnNonChiefWithCentering(self):
  """Tests the train function running on a worker with bias centering."""
  with self.test_session():
    ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=0, tree_ensemble_config="", name="tree_ensemble")
    learner_config = learner_pb2.LearnerConfig()
    learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
    learner_config.num_classes = 2
    learner_config.regularization.l1 = 0
    learner_config.regularization.l2 = 0
    learner_config.constraints.max_tree_depth = 1
    learner_config.constraints.min_node_weight = 0
    features = {}
    features["dense_float"] = array_ops.ones([4, 1], dtypes.float32)

    gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
        is_chief=False,
        num_ps_replicas=0,
        center_bias=True,
        ensemble_handle=ensemble_handle,
        examples_per_layer=1,
        learner_config=learner_config,
        features=features)

    predictions = array_ops.constant(
        [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32)
    partition_ids = array_ops.zeros([4], dtypes.int32)
    ensemble_stamp = variables.Variable(
        initial_value=0,
        name="ensemble_stamp",
        trainable=False,
        dtype=dtypes.int64)

    predictions_dict = {
        "predictions": predictions,
        "predictions_no_dropout": predictions,
        "partition_ids": partition_ids,
        "ensemble_stamp": ensemble_stamp
    }

    labels = array_ops.ones([4, 1], dtypes.float32)
    weights = array_ops.ones([4, 1], dtypes.float32)

    # Create train op.
    train_op = gbdt_model.train(
        loss=math_ops.reduce_mean(
            _squared_loss(labels, weights, predictions)),
        predictions_dict=predictions_dict,
        labels=labels)
    variables.global_variables_initializer().run()
    resources.initialize_resources(resources.shared_resources()).run()

    # Regardless of how many times the train op is run, a non-chief worker
    # can only accumulate stats, so the tree ensemble never changes.
    for _ in range(5):
      train_op.run()
    stamp_token, serialized = model_ops.tree_ensemble_serialize(
        ensemble_handle)
    output = tree_config_pb2.DecisionTreeEnsembleConfig()
    output.ParseFromString(serialized.eval())
    self.assertEquals(len(output.trees), 0)
    self.assertEquals(len(output.tree_weights), 0)
    self.assertEquals(stamp_token.eval(), 0)
def testBasicQuantileBucketsSingleResourcesAddFlushed(self):
  with self.cached_session():
    quantile_accumulator_handle = self.create_resource(
        "floats_0", self.eps, self.max_elements, 2)
    resources.initialize_resources(resources.shared_resources()).run()
    summaries = boosted_trees_ops.make_quantile_summaries(
        [self._feature_0, self._feature_1],
        self._example_weights,
        epsilon=self.eps)
    summary_op = boosted_trees_ops.quantile_add_summaries(
        quantile_accumulator_handle, summaries)
    # flush_quantile_summaries is assumed to be imported elsewhere in the
    # test module; it flushes the accumulator's pending summaries and
    # returns them.
    flushed_summaries = flush_quantile_summaries(
        quantile_accumulator_handle, num_features=2)

    # We are testing whether the flushed summaries output at the previous
    # step give the same expected results when fed back into add_summaries.
    summary_op_2 = boosted_trees_ops.quantile_add_summaries(
        quantile_accumulator_handle, flushed_summaries)

    flush_op = boosted_trees_ops.quantile_flush(
        quantile_accumulator_handle, self.num_quantiles)
    buckets = boosted_trees_ops.get_bucket_boundaries(
        quantile_accumulator_handle, num_features=2)
    quantiles = boosted_trees_ops.boosted_trees_bucketize(
        [self._feature_0, self._feature_1], buckets)
    self.evaluate(summary_op)
    self.evaluate(summary_op_2)
    self.evaluate(flush_op)

    self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
    self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())

    self.assertAllClose(self._feature_0_quantiles, quantiles[0].eval())
    self.assertAllClose(self._feature_1_quantiles, quantiles[1].eval())
def testSaveRestoreBeforeFlush(self):
  save_dir = os.path.join(self.get_temp_dir(), "save_restore")
  save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")

  with self.cached_session() as sess:
    accumulator = boosted_trees_ops.QuantileAccumulator(
        num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")

    save = saver.Saver()
    resources.initialize_resources(resources.shared_resources()).run()

    summaries = accumulator.add_summaries(
        [self._feature_0, self._feature_1], self._example_weights)
    self.evaluate(summaries)
    buckets = accumulator.get_bucket_boundaries()
    self.assertAllClose([], buckets[0].eval())
    self.assertAllClose([], buckets[1].eval())
    save.save(sess, save_path)
    self.evaluate(accumulator.flush())
    self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
    self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())

  with self.session(graph=ops.Graph()) as sess:
    accumulator = boosted_trees_ops.QuantileAccumulator(
        num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0")
    save = saver.Saver()
    save.restore(sess, save_path)
    buckets = accumulator.get_bucket_boundaries()
    self.assertAllClose([], buckets[0].eval())
    self.assertAllClose([], buckets[1].eval())
def testBasicResourceVariable(self):
  for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
    # train.GradientDescentOptimizer is a V1-only API.
    with ops.Graph().as_default(), self.cached_session():
      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
      sgd_op = gradient_descent.GradientDescentOptimizer(3.0).apply_gradients(
          zip([grads0, grads1], [var0, var1]))
      # TODO(apassos) calling initialize_resources on all resources here
      # doesn't work because the sessions and graph are reused across unit
      # tests and this would mean trying to reinitialize variables. Figure out
      # a long-term solution for this.
      resources.initialize_resources([var0, var1]).run()
      # Fetch params to validate initial values
      self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
      self.assertAllCloseAccordingToType([3.0, 4.0], self.evaluate(var1))
      # Run 1 step of sgd
      sgd_op.run()
      # Validate updated params
      self.assertAllCloseAccordingToType(
          [1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1], self.evaluate(var0))
      self.assertAllCloseAccordingToType(
          [3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01], self.evaluate(var1))
def testCreate(self):
  with self.test_session():
    tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
    tree = tree_ensemble_config.trees.add()
    _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
    tree_ensemble_config.tree_weights.append(1.0)

    # Prepare learner config.
    learner_config = learner_pb2.LearnerConfig()
    learner_config.num_classes = 2

    tree_ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=3,
        tree_ensemble_config=tree_ensemble_config.SerializeToString(),
        name="create_tree")
    resources.initialize_resources(resources.shared_resources()).run()

    result, _, _ = prediction_ops.gradient_trees_prediction(
        tree_ensemble_handle,
        self._seed, [self._dense_float_tensor],
        [self._sparse_float_indices1, self._sparse_float_indices2],
        [self._sparse_float_values1, self._sparse_float_values2],
        [self._sparse_float_shape1, self._sparse_float_shape2],
        [self._sparse_int_indices1], [self._sparse_int_values1],
        [self._sparse_int_shape1],
        learner_config=learner_config.SerializeToString(),
        apply_dropout=False,
        apply_averaging=False,
        center_bias=False,
        reduce_dim=True)
    self.assertAllClose(result.eval(), [[-0.4], [-0.4]])

    stamp_token = model_ops.tree_ensemble_stamp_token(tree_ensemble_handle)
    self.assertEqual(stamp_token.eval(), 3)
def _eval(self,
          var,
          accum,
          linear,
          grad,
          lr,
          l1,
          l2,
          l2_shrinkage=0,
          lr_power=1,
          multiply_linear_by_lr=False):
  dtype = np.float32
  var = np.array(var, dtype=dtype)
  accum = np.array(accum, dtype=dtype)
  linear = np.array(linear, dtype=dtype)
  grad = np.array(grad, dtype=dtype)
  use_v2 = bool(l2_shrinkage)
  with self.session() as session:
    lr = constant_op.constant(lr, dtype=dtype)
    l1 = constant_op.constant(l1, dtype=dtype)
    l2 = constant_op.constant(l2, dtype=dtype)
    l2_shrinkage = constant_op.constant(l2_shrinkage, dtype=dtype)
    lr_power = constant_op.constant(lr_power, dtype=dtype)
    v_var = resource_variable_ops.ResourceVariable(var, dtype=dtype)
    v_accum = resource_variable_ops.ResourceVariable(accum, dtype=dtype)
    v_linear = resource_variable_ops.ResourceVariable(linear, dtype=dtype)
    resources.initialize_resources([v_var, v_accum, v_linear]).run()
    assert not (use_v2 and multiply_linear_by_lr)
    if use_v2:
      session.run(
          training_ops.resource_apply_ftrl_v2(
              v_var.handle,
              v_accum.handle,
              v_linear.handle,
              grad,
              lr,
              l1,
              l2,
              l2_shrinkage,
              lr_power,
              multiply_linear_by_lr=multiply_linear_by_lr))
    else:
      session.run(
          training_ops.resource_apply_ftrl(
              v_var.handle,
              v_accum.handle,
              v_linear.handle,
              grad,
              lr,
              l1,
              l2,
              lr_power,
              multiply_linear_by_lr=multiply_linear_by_lr))
    return (v_var.read_value().eval().reshape(var.shape),
            v_accum.read_value().eval().reshape(accum.shape),
            v_linear.read_value().eval().reshape(linear.shape))
def testMinimizeResourceVariable(self):
  for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
    with self.cached_session():
      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
      var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
      x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
      pred = math_ops.matmul(var0, x) + var1
      loss = pred * pred
      sgd_op = gradient_descent.GradientDescentOptimizer(1.0).minimize(loss)
      # TODO(apassos) calling initialize_resources on all resources here
      # doesn't work because the sessions and graph are reused across unit
      # tests and this would mean trying to reinitialize variables. Figure out
      # a long-term solution for this.
      resources.initialize_resources([var0, var1]).run()
      # Fetch params to validate initial values
      self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
      self.assertAllCloseAccordingToType([3.0], var1.eval())
      # Run 1 step of sgd
      sgd_op.run()
      # Validate updated params
      np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
      np_grad = 2 * np_pred
      self.assertAllCloseAccordingToType(
          [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
      self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
def testDropout(self):
  with self.test_session():
    # Empty tree ensemble.
    tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
    # Add 999 trees with some weights.
    for i in range(0, 999):
      tree = tree_ensemble_config.trees.add()
      tree_ensemble_config.tree_metadata.add().is_finalized = True
      _append_to_leaf(tree.nodes.add().leaf, 0, -0.4)
      tree_ensemble_config.tree_weights.append(i + 1)

    # Prepare learner/dropout config.
    learner_config = learner_pb2.LearnerConfig()
    learner_config.learning_rate_tuner.dropout.dropout_probability = 0.5
    learner_config.learning_rate_tuner.dropout.learning_rate = 1.0
    learner_config.num_classes = 2

    # Apply dropout.
    tree_ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=0,
        tree_ensemble_config=tree_ensemble_config.SerializeToString(),
        name="existing")
    resources.initialize_resources(resources.shared_resources()).run()

    result, dropout_info = self._get_predictions(
        tree_ensemble_handle,
        learner_config=learner_config.SerializeToString(),
        apply_dropout=True,
        apply_averaging=False,
        center_bias=False,
        reduce_dim=True)

    # We expect approx 500 trees were dropped.
    dropout_info = dropout_info.eval()
    self.assertIn(dropout_info[0].size, range(400, 601))
    self.assertEqual(dropout_info[0].size, dropout_info[1].size)

    for i in range(dropout_info[0].size):
      dropped_index = dropout_info[0][i]
      dropped_weight = dropout_info[1][i]
      # We constructed the trees so that tree number + 1 is the tree weight,
      # so we can check the weights for dropped trees here.
      self.assertEqual(dropped_index + 1, dropped_weight)

    # Don't apply dropout.
    result_no_dropout, no_dropout_info = self._get_predictions(
        tree_ensemble_handle,
        learner_config=learner_config.SerializeToString(),
        apply_dropout=False,
        apply_averaging=False,
        center_bias=False,
        reduce_dim=True)

    self.assertEqual(result.eval().size, result_no_dropout.eval().size)
    for i in range(result.eval().size):
      self.assertNotEqual(result.eval()[i], result_no_dropout.eval()[i])

    # We expect none of the trees were dropped.
    self.assertAllEqual([[], []], no_dropout_info.eval())
def test_simple(self):
  with self.session():
    TADDR_VALID = 'zrpull://127.0.0.1:5555'
    output = zmq_conn_handle(TADDR_VALID, ZMQ_HWM, 0)
    resources.initialize_resources(resources.local_resources()).run()
    # assertDTypeEqual does not work for the resource type: it converts
    # tf.DType to np.dtype, and resource is incompatible with numpy.
    # self.assertDtypeEqual(output, dtypes.resource.as_numpy_type)
    self.assertEqual(type(output.dtype), type(dtypes.resource))
def testSaveRestoreBeforeFlush(self):
  save_dir = os.path.join(self.get_temp_dir(), "save_restore")
  save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")

  with self.test_session(graph=ops.Graph()) as sess:
    accumulator = quantile_ops.QuantileAccumulator(
        init_stamp_token=0, num_quantiles=3, epsilon=0.33, name="q0")

    save = saver.Saver()
    resources.initialize_resources(resources.shared_resources()).run()

    sparse_indices_0 = constant_op.constant(
        [[1, 0], [2, 1], [3, 0], [4, 2], [5, 0]], dtype=dtypes.int64)
    sparse_values_0 = constant_op.constant(
        [2.0, 3.0, 4.0, 5.0, 6.0], dtype=dtypes.float32)
    sparse_shape_0 = constant_op.constant([6, 3], dtype=dtypes.int64)
    example_weights = constant_op.constant(
        [10, 1, 1, 1, 1, 1], dtype=dtypes.float32, shape=[6, 1])
    update = accumulator.add_summary(
        stamp_token=0,
        column=sparse_tensor.SparseTensor(sparse_indices_0, sparse_values_0,
                                          sparse_shape_0),
        example_weights=example_weights)
    update.run()
    save.save(sess, save_path)
    reset = accumulator.flush(stamp_token=0, next_stamp_token=1)
    with ops.control_dependencies([reset]):
      are_ready_flush, buckets = accumulator.get_buckets(stamp_token=1)
    buckets, are_ready_flush = sess.run([buckets, are_ready_flush])
    self.assertEqual(True, are_ready_flush)
    self.assertAllEqual([2, 4, 6.], buckets)

  with self.test_session(graph=ops.Graph()) as sess:
    accumulator = quantile_ops.QuantileAccumulator(
        init_stamp_token=0, num_quantiles=3, epsilon=0.33, name="q0")
    save = saver.Saver()

    # Restore the saved values in the parameter nodes.
    save.restore(sess, save_path)
    are_ready_noflush = accumulator.get_buckets(stamp_token=0)[0]
    with ops.control_dependencies([are_ready_noflush]):
      reset = accumulator.flush(stamp_token=0, next_stamp_token=1)
    with ops.control_dependencies([reset]):
      are_ready_flush, buckets = accumulator.get_buckets(stamp_token=1)
    buckets, are_ready_flush, are_ready_noflush = sess.run(
        [buckets, are_ready_flush, are_ready_noflush])
    self.assertFalse(are_ready_noflush)
    self.assertTrue(are_ready_flush)
    self.assertAllEqual([2, 4, 6.], buckets)
def testCreate(self):
  with self.test_session():
    ensemble = boosted_trees_ops.TreeEnsemble('ensemble')
    resources.initialize_resources(resources.shared_resources()).run()
    stamp_token = ensemble.get_stamp_token()
    self.assertEqual(0, stamp_token.eval())
    (_, num_trees, num_finalized_trees,
     num_attempted_layers) = ensemble.get_states()
    self.assertEqual(0, num_trees.eval())
    self.assertEqual(0, num_finalized_trees.eval())
    self.assertEqual(0, num_attempted_layers.eval())
def testCreate(self):
  with self.cached_session():
    ensemble = boosted_trees_ops.TreeEnsemble('ensemble')
    resources.initialize_resources(resources.shared_resources()).run()
    stamp_token = ensemble.get_stamp_token()
    self.assertEqual(0, self.evaluate(stamp_token))
    (_, num_trees, num_finalized_trees, num_attempted_layers,
     nodes_range) = ensemble.get_states()
    self.assertEqual(0, self.evaluate(num_trees))
    self.assertEqual(0, self.evaluate(num_finalized_trees))
    self.assertEqual(0, self.evaluate(num_attempted_layers))
    self.assertAllEqual([0, 1], self.evaluate(nodes_range))
def testPredictFn(self):
  """Tests the predict function."""
  with self.test_session() as sess:
    # Create ensemble with one bias node.
    ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
    text_format.Merge(
        """
        trees {
          nodes {
            leaf {
              vector {
                value: 0.25
              }
            }
          }
        }
        tree_weights: 1.0
        tree_metadata {
          num_tree_weight_updates: 1
          num_layers_grown: 1
          is_finalized: true
        }""", ensemble_config)
    ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=3,
        tree_ensemble_config=ensemble_config.SerializeToString(),
        name="tree_ensemble")
    resources.initialize_resources(resources.shared_resources()).run()
    learner_config = learner_pb2.LearnerConfig()
    learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
    learner_config.num_classes = 2
    learner_config.regularization.l1 = 0
    learner_config.regularization.l2 = 0
    learner_config.constraints.max_tree_depth = 1
    learner_config.constraints.min_node_weight = 0
    features = {}
    features["dense_float"] = array_ops.ones([4, 1], dtypes.float32)

    gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
        is_chief=False,
        num_ps_replicas=0,
        center_bias=True,
        ensemble_handle=ensemble_handle,
        examples_per_layer=1,
        learner_config=learner_config,
        features=features)

    # Create predict op.
    mode = model_fn.ModeKeys.EVAL
    predictions_dict = sess.run(gbdt_model.predict(mode))
    self.assertEquals(predictions_dict["ensemble_stamp"], 3)
    self.assertAllClose(predictions_dict["predictions"],
                        [[0.25], [0.25], [0.25], [0.25]])
    self.assertAllClose(predictions_dict["partition_ids"], [0, 0, 0, 0])
def testEmpty(self):
  with self.cached_session() as sess:
    gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
    hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
    partition_ids = [0, 0, 0, 1]
    indices = constant_op.constant_v1([], dtype=dtypes.int64, shape=[0, 2])
    values = constant_op.constant_v1([], dtype=dtypes.int64)
    gradient_shape = tensor_shape.TensorShape([])
    hessian_shape = tensor_shape.TensorShape([])
    class_id = -1

    split_handler = categorical_split_handler.EqualitySplitHandler(
        l1_regularization=0.1,
        l2_regularization=1,
        tree_complexity_regularization=0,
        min_node_weight=0,
        sparse_int_column=sparse_tensor.SparseTensor(indices, values, [4, 1]),
        feature_column_group_id=0,
        gradient_shape=gradient_shape,
        hessian_shape=hessian_shape,
        multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS,
        init_stamp_token=0)
    resources.initialize_resources(resources.shared_resources()).run()
    empty_gradients, empty_hessians = get_empty_tensors(
        gradient_shape, hessian_shape)
    example_weights = array_ops.ones([4, 1], dtypes.float32)

    update_1 = split_handler.update_stats_sync(
        0,
        partition_ids,
        gradients,
        hessians,
        empty_gradients,
        empty_hessians,
        example_weights,
        is_active=array_ops.constant([True, True]))
    with ops.control_dependencies([update_1]):
      are_splits_ready, partitions, gains, splits = (
          split_handler.make_splits(0, 1, class_id))
      are_splits_ready, partitions, gains, splits = (
          sess.run([are_splits_ready, partitions, gains, splits]))
    self.assertTrue(are_splits_ready)
    self.assertEqual(len(partitions), 0)
    self.assertEqual(len(gains), 0)
    self.assertEqual(len(splits), 0)
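# get_empty_tensors is not defined in this excerpt. A minimal sketch
# consistent with how it is used here and in testInactive below (an
# assumption, not confirmed by the source): it builds empty per-class
# gradient/hessian tensors with a leading batch dimension of 1.
def get_empty_tensors(gradient_shape, hessian_shape):
  empty_grad_shape = [1] + gradient_shape.as_list()
  empty_hess_shape = [1] + hessian_shape.as_list()
  empty_gradients = constant_op.constant_v1(
      [], dtype=dtypes.float32, shape=empty_grad_shape)
  empty_hessians = constant_op.constant_v1(
      [], dtype=dtypes.float32, shape=empty_hess_shape)
  return empty_gradients, empty_hessians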
def testMetadataMissing(self):
  # Sometimes we want to do prediction on trees that are not added to the
  # ensemble (for example, in batch mode).
  with self.test_session():
    tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
    # Bias tree.
    tree1 = tree_ensemble_config.trees.add()
    _append_to_leaf(tree1.nodes.add().leaf, 0, -0.4)

    # Depth 3 tree.
    tree2 = tree_ensemble_config.trees.add()
    # We are not setting the tree_ensemble_config.tree_metadata in this test.
    _set_float_split(tree2.nodes.add().dense_float_binary_split, 0, 9.0, 1, 2)
    _set_float_split(
        tree2.nodes.add().sparse_float_binary_split_default_left.split, 0,
        -20.0, 3, 4)
    _append_to_leaf(tree2.nodes.add().leaf, 0, 0.5)
    _append_to_leaf(tree2.nodes.add().leaf, 0, 1.2)
    _set_categorical_id_split(tree2.nodes.add().categorical_id_binary_split,
                              0, 9, 5, 6)
    _append_to_leaf(tree2.nodes.add().leaf, 0, -0.9)
    _append_to_leaf(tree2.nodes.add().leaf, 0, 0.7)

    tree_ensemble_config.tree_weights.append(1.0)
    tree_ensemble_config.tree_weights.append(1.0)

    tree_ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=0,
        tree_ensemble_config=tree_ensemble_config.SerializeToString(),
        name="full_ensemble")
    resources.initialize_resources(resources.shared_resources()).run()

    # Prepare learner config.
    learner_config = learner_pb2.LearnerConfig()
    learner_config.num_classes = 2

    result, dropout_info = self._get_predictions(
        tree_ensemble_handle,
        learner_config=learner_config.SerializeToString(),
        reduce_dim=True)

    # The first example will get bias -0.4 from the first tree and
    # leaf 4 payload of -0.9, hence -1.3; the second example will
    # get the same bias -0.4 and leaf 3 payload (sparse feature missing)
    # of 1.2, hence 0.8.
    self.assertAllClose([[-1.3], [0.8]], result.eval())

    # Empty dropout.
    self.assertAllEqual([[], []], dropout_info.eval())
def testUsedHandlers(self):
  with self.cached_session():
    tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
    tree_ensemble_config.growing_metadata.used_handler_ids.append(1)
    tree_ensemble_config.growing_metadata.used_handler_ids.append(5)
    stamp_token = 3
    tree_ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=stamp_token,
        tree_ensemble_config=tree_ensemble_config.SerializeToString(),
        name="create_tree")
    resources.initialize_resources(resources.shared_resources()).run()
    result = model_ops.tree_ensemble_used_handlers(
        tree_ensemble_handle, stamp_token, num_all_handlers=6)
    self.assertAllEqual([0, 1, 0, 0, 0, 1], result.used_handlers_mask.eval())
    self.assertEqual(2, result.num_used_handlers.eval())
def testStreamingQuantileBucketsLowPrecisionInput(self):
  """Tests inputs that simulate low-precision float16 values."""

  num_quantiles = 3
  # Set generate_quantiles to True since the test will generate fewer
  # boundaries otherwise.
  with self.test_session() as sess:
    accumulator = quantile_ops.QuantileAccumulator(
        init_stamp_token=0,
        num_quantiles=num_quantiles,
        epsilon=0.001,
        name="q1",
        generate_quantiles=True)
    resources.initialize_resources(resources.shared_resources()).run()
    input_column = array_ops.placeholder(dtypes.float32)
    weights = array_ops.placeholder(dtypes.float32)
    update = accumulator.add_summary(
        stamp_token=0, column=input_column, example_weights=weights)

  with self.test_session() as sess:
    # This input is generated from integers in the range [2030, 2060]
    # but represented with float16 precision. Integers <= 2048 are
    # exactly represented, whereas numbers > 2048 are rounded; hence
    # numbers > 2048 are repeated. For precision loss / rounding, see:
    # https://en.wikipedia.org/wiki/Half-precision_floating-point_format.
    #
    # The intent of the test is not the handling of float16 values, but to
    # validate that the right number of buckets is returned in cases where
    # the input may contain repeated values.
    inputs = [
        2030.0, 2031.0, 2032.0, 2033.0, 2034.0, 2035.0, 2036.0, 2037.0,
        2038.0, 2039.0, 2040.0, 2041.0, 2042.0, 2043.0, 2044.0, 2045.0,
        2046.0, 2047.0, 2048.0, 2048.0, 2050.0, 2052.0, 2052.0, 2052.0,
        2054.0, 2056.0, 2056.0, 2056.0, 2058.0, 2060.0
    ]
    sess.run(update, {input_column: inputs, weights: [1] * len(inputs)})

  with self.test_session() as sess:
    sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
    are_ready_flush, buckets = accumulator.get_buckets(stamp_token=1)
    buckets, are_ready_flush = sess.run([buckets, are_ready_flush])
    self.assertEqual(True, are_ready_flush)
    self.assertEqual(num_quantiles + 1, len(buckets))
    self.assertAllEqual([2030, 2040, 2050, 2060], buckets)
def tree_models(X, y, num_feat, num_class):
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.25, random_state=1)

  num_features = num_feat
  num_steps = 400
  num_classes = num_class
  num_trees = 10
  max_nodes = 1000

  X = tf.placeholder(tf.float32, shape=[None, num_features])
  Y = tf.placeholder(tf.int64, shape=[None])

  hparams = tensor_forest.ForestHParams(
      num_classes=num_classes,
      num_features=num_features,
      num_trees=num_trees,
      max_nodes=max_nodes).fill()

  forest_graph = tensor_forest.RandomForestGraphs(hparams)
  train_op = forest_graph.training_graph(X, Y)
  loss_op = forest_graph.training_loss(X, Y)

  infer_op, _, _ = forest_graph.inference_graph(X)
  correct_prediction = tf.equal(tf.argmax(infer_op, 1), tf.cast(Y, tf.int64))
  accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

  init_vars = tf.group(
      tf.global_variables_initializer(),
      resources.initialize_resources(resources.shared_resources()))

  rf_sess = tf.Session()
  rf_sess.run(init_vars)

  for i in range(1, num_steps + 1):
    _, l = rf_sess.run([train_op, loss_op],
                       feed_dict={X: X_train, Y: y_train})
    if i % 50 == 0 or i == 1:
      acc = rf_sess.run(accuracy_op, feed_dict={X: X_train, Y: y_train})
      print("Step %i, Loss: %f, Acc: %f" % (i, l, acc))

  print("Test Accuracy:",
        rf_sess.run(accuracy_op, feed_dict={X: X_test, Y: y_test}))
def save_model(sess, model):
  """Saves the TF model."""
  inputs = {"inputs": model.input}  # Input: image as a string.
  outputs = {"prob": model.predictions}  # Output.
  prediction_signature = tf.saved_model.signature_def_utils.predict_signature_def(
      inputs, outputs)
  signature_map = {
      signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
          prediction_signature
  }

  legacy_op = control_flow_ops.group(
      tf.local_variables_initializer(),
      resources.initialize_resources(resources.shared_resources()),
      tf.tables_initializer())

  res_dir = "data/model-tf"
  print('[Info] Model save path: {}'.format(res_dir))

  builder = saved_model_builder.SavedModelBuilder(res_dir)
  builder.add_meta_graph_and_variables(
      sess, [tag_constants.SERVING],
      signature_def_map=signature_map,
      legacy_init_op=legacy_op)
  builder.save()
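# A minimal sketch of loading the model saved above (assumes the same TF 1.x
# imports; "data/model-tf" matches res_dir in save_model):
with tf.Session(graph=tf.Graph()) as sess:
  tf.saved_model.loader.load(sess, [tag_constants.SERVING], "data/model-tf")
  # The legacy_init_op (which includes initialize_resources for shared
  # resources) runs as part of loading, so the graph is ready for inference.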
def random_forest(num_classes=2, num_features=46, num_trees=100,
                  max_nodes=10000):
  X = tf.placeholder(tf.float32, shape=[None, num_features])
  # For random forest, labels must be integers (the class id).
  Y = tf.placeholder(tf.int32, shape=[None])

  # Random forest parameters.
  hparams = tensor_forest.ForestHParams(
      num_classes=num_classes,
      num_features=num_features,
      num_trees=num_trees,
      max_nodes=max_nodes,
  ).fill()

  forest_graph = tensor_forest.RandomForestGraphs(hparams)
  train_op = forest_graph.training_graph(X, Y)
  loss_op = forest_graph.training_loss(X, Y)

  # Measure the accuracy.
  infer_op, _, _ = forest_graph.inference_graph(X)
  correct_prediction = tf.equal(tf.argmax(infer_op, 1), tf.cast(Y, tf.int64))
  accuracy_op = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

  init_vars = tf.group(
      tf.global_variables_initializer(),
      resources.initialize_resources(resources.shared_resources()),
  )
  # sess = tf.Session()
  # sess.run(init_vars)
  return infer_op, accuracy_op, train_op, loss_op, X, Y
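# A minimal usage sketch for random_forest() above. X_train / y_train are
# assumed to be numpy arrays with num_features columns (illustrative names,
# not from the source). Since the function does not return its init op, the
# caller can rebuild it from the default graph:
infer_op, accuracy_op, train_op, loss_op, X, Y = random_forest()
init_vars = tf.group(
    tf.global_variables_initializer(),
    resources.initialize_resources(resources.shared_resources()))
with tf.Session() as sess:
  sess.run(init_vars)
  for step in range(100):
    _, loss = sess.run([train_op, loss_op],
                       feed_dict={X: X_train, Y: y_train})
  print("Train accuracy:",
        sess.run(accuracy_op, feed_dict={X: X_train, Y: y_train}))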
def __init__(self, num_features, num_classes, num_trees, max_nodes):
  tf.reset_default_graph()
  self.X = tf.placeholder(tf.float32, shape=[None, num_features])
  self.Y = tf.placeholder(tf.int32, shape=[None])
  self.hparams = tensor_forest.ForestHParams(
      num_classes=num_classes,
      num_features=num_features,
      num_trees=num_trees,
      max_nodes=max_nodes).fill()

  # Build the forest graph together with its training and inference ops.
  self.forest_graph = tensor_forest.RandomForestGraphs(self.hparams)
  self.train_op = self.forest_graph.training_graph(self.X, self.Y)
  self.loss_op = self.forest_graph.training_loss(self.X, self.Y)
  infer_op, _, _ = self.forest_graph.inference_graph(self.X)
  self.infer_op = infer_op

  self.correct_pred = tf.equal(
      tf.argmax(self.infer_op, 1), tf.cast(self.Y, tf.int64))
  self.accuracy_op = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))

  self.init = tf.group(
      tf.global_variables_initializer(),
      resources.initialize_resources(resources.shared_resources()))
  # The session is created lazily on first use.
  self.sess = None
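# Hypothetical companion method, assumed rather than part of the original
# class: a minimal fit() that lazily creates the session and runs the init
# group stored in self.init by the constructor above.
def fit(self, X_train, y_train, num_steps=100):
  if self.sess is None:
    self.sess = tf.Session()
    self.sess.run(self.init)
  for _ in range(num_steps):
    _, loss = self.sess.run(
        [self.train_op, self.loss_op],
        feed_dict={self.X: X_train, self.Y: y_train})
  return loss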
def train_and_test_model(train_features, train_labels, train_targets,
                         test_features, test_targets, test_labels):
  """Trains and tests the random forest model.

  Args:
    train_features: training features
    train_labels: training labels, integer values
    train_targets: one-hot row vectors of the training labels
    test_features: testing features
    test_targets: one-hot row vectors of the test labels
    test_labels: labels for testing

  Returns:
    The final training loss of the model.
  """
  # Initialize the model and build it.
  config = Config()
  forest_model = RandomForest(config)
  forest_model.build()

  # Initialize TensorFlow variables and shared resources.
  init_vars = tf.group(
      tf.global_variables_initializer(),
      resources.initialize_resources(resources.shared_resources()))

  with tf.Session() as session:
    session.run(init_vars)
    # Train.
    loss = forest_model.do_train(session, train_features, train_labels,
                                 train_targets)
    print("Final train loss: {}".format(loss))
    # Test.
    forest_model.do_test(session, test_features, test_targets, test_labels)
  return loss
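# Illustrative helper, assumed and not from the original: one way to build
# the one-hot target rows the docstring above refers to from an integer
# label vector, assuming numpy is available as np.
import numpy as np

def to_one_hot(labels, num_classes):
  # Row i is the one-hot encoding of labels[i].
  return np.eye(num_classes, dtype=np.float32)[labels]

# e.g. to_one_hot([0, 2, 1], 3) ->
# [[1. 0. 0.], [0. 0. 1.], [0. 1. 0.]]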
def testInactive(self):
  with self.test_session() as sess:
    gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
    hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
    partition_ids = [0, 0, 0, 1]
    indices = [[0, 0], [0, 1], [2, 0], [3, 0]]
    values = array_ops.constant([1, 2, 2, 1], dtype=dtypes.int64)

    gradient_shape = tensor_shape.scalar()
    hessian_shape = tensor_shape.scalar()
    class_id = -1

    split_handler = categorical_split_handler.EqualitySplitHandler(
        l1_regularization=0.1,
        l2_regularization=1,
        tree_complexity_regularization=0,
        min_node_weight=0,
        sparse_int_column=sparse_tensor.SparseTensor(indices, values, [4, 1]),
        feature_column_group_id=0,
        gradient_shape=gradient_shape,
        hessian_shape=hessian_shape,
        multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS,
        init_stamp_token=0)
    resources.initialize_resources(resources.shared_resources()).run()

    empty_gradients, empty_hessians = get_empty_tensors(
        gradient_shape, hessian_shape)
    example_weights = array_ops.ones([4, 1], dtypes.float32)

    update_1 = split_handler.update_stats_sync(
        0,
        partition_ids,
        gradients,
        hessians,
        empty_gradients,
        empty_hessians,
        example_weights,
        is_active=array_ops.constant([False, False]))
    with ops.control_dependencies([update_1]):
      are_splits_ready, partitions, gains, splits = (
          split_handler.make_splits(0, 1, class_id))
      are_splits_ready, partitions, gains, splits = (
          sess.run([are_splits_ready, partitions, gains, splits]))
    self.assertTrue(are_splits_ready)
    self.assertEqual(len(partitions), 0)
    self.assertEqual(len(gains), 0)
    self.assertEqual(len(splits), 0)
def testExcludeNonFinalTree(self):
  with self.test_session():
    tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
    # Bias tree.
    tree1 = tree_ensemble_config.trees.add()
    tree_ensemble_config.tree_metadata.add().is_finalized = True
    _append_to_leaf(tree1.nodes.add().leaf, 0, -0.4)

    # Depth 3 tree.
    tree2 = tree_ensemble_config.trees.add()
    tree_ensemble_config.tree_metadata.add().is_finalized = False
    _set_float_split(tree2.nodes.add().dense_float_binary_split, 0, 9.0, 1, 2)
    _set_float_split(
        tree2.nodes.add().sparse_float_binary_split_default_left.split, 0,
        -20.0, 3, 4)
    _append_to_leaf(tree2.nodes.add().leaf, 0, 0.5)
    _append_to_leaf(tree2.nodes.add().leaf, 0, 1.2)
    _set_categorical_id_split(tree2.nodes.add().categorical_id_binary_split,
                              0, 9, 5, 6)
    _append_to_leaf(tree2.nodes.add().leaf, 0, -0.9)
    _append_to_leaf(tree2.nodes.add().leaf, 0, 0.7)

    tree_ensemble_config.tree_weights.append(1.0)
    tree_ensemble_config.tree_weights.append(1.0)

    tree_ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=0,
        tree_ensemble_config=tree_ensemble_config.SerializeToString(),
        name="full_ensemble")
    resources.initialize_resources(resources.shared_resources()).run()

    # Prepare learner config.
    learner_config = learner_pb2.LearnerConfig()
    learner_config.num_classes = 2
    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE

    result, dropout_info = self._get_predictions(
        tree_ensemble_handle,
        learner_config=learner_config.SerializeToString(),
        reduce_dim=True)

    # All the examples should get only the bias since the second tree is
    # non-finalized.
    self.assertAllClose([[-0.4], [-0.4]], result.eval())
    # Empty dropout.
    self.assertAllEqual([[], []], dropout_info.eval())
def testFullEnsembleMultiNotClassTreePerClassStrategyDenseVector(self):
  with self.test_session():
    tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
    # Bias tree only for second class.
    tree1 = tree_ensemble_config.trees.add()
    tree_ensemble_config.tree_metadata.add().is_finalized = True
    _append_multi_values_to_dense_leaf(tree1.nodes.add().leaf, [0, -0.2, -2])

    # Depth 2 tree.
    tree2 = tree_ensemble_config.trees.add()
    tree_ensemble_config.tree_metadata.add().is_finalized = True
    _set_float_split(
        tree2.nodes.add().sparse_float_binary_split_default_right.split, 1,
        4.0, 1, 2)
    _set_float_split(tree2.nodes.add().dense_float_binary_split, 0, 9.0, 3, 4)
    _append_multi_values_to_dense_leaf(tree2.nodes.add().leaf, [0.5, 0, 0])
    _append_multi_values_to_dense_leaf(tree2.nodes.add().leaf, [0, 1.2, -0.7])
    _append_multi_values_to_dense_leaf(tree2.nodes.add().leaf, [-0.9, 0, 0])

    tree_ensemble_config.tree_weights.append(1.0)
    tree_ensemble_config.tree_weights.append(1.0)

    tree_ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=0,
        tree_ensemble_config=tree_ensemble_config.SerializeToString(),
        name="ensemble_multi_class")
    resources.initialize_resources(resources.shared_resources()).run()

    # Prepare learner config.
    learner_config = learner_pb2.LearnerConfig()
    learner_config.num_classes = 3
    learner_config.multi_class_strategy = (
        learner_pb2.LearnerConfig.FULL_HESSIAN)

    result, dropout_info = self._get_predictions(
        tree_ensemble_handle,
        learner_config=learner_config.SerializeToString(),
        reduce_dim=False)

    # The first example gets the bias [0, -0.2, -2] from the first tree plus
    # leaf 2's payload of 0.5 for class 0 (its sparse feature is missing),
    # hence [0.5, -0.2, -2.0]. The second example gets the same bias plus
    # leaf 3's payload of 1.2 for class 1 and -0.7 for class 2, hence
    # [0.0, 1.0, -2.7].
    self.assertAllClose([[0.5, -0.2, -2.0], [0, 1.0, -2.7]], result.eval())
    # Empty dropout.
    self.assertAllEqual([[], []], dropout_info.eval())
def testWithExistingEnsemble(self):
  with self.test_session():
    # Create the existing tree ensemble.
    tree_ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=0,
        tree_ensemble_config=self._tree_ensemble.SerializeToString(),
        name="existing")
    # Create non-zero feature importance.
    feature_usage_counts = variables.Variable(
        initial_value=np.array([0, 4, 1], np.int64),
        name="feature_usage_counts",
        trainable=False)
    feature_gains = variables.Variable(
        initial_value=np.array([0.0, 0.3, 0.05], np.float32),
        name="feature_gains",
        trainable=False)

    resources.initialize_resources(resources.shared_resources()).run()
    variables.initialize_all_variables().run()

    output_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
    with ops.control_dependencies([
        ensemble_optimizer_ops.add_trees_to_ensemble(
            tree_ensemble_handle,
            self._ensemble_to_add.SerializeToString(),
            feature_usage_counts, [1, 2, 0],
            feature_gains, [0.02, 0.1, 0.0], [[], []],
            learning_rate=1)
    ]):
      output_ensemble.ParseFromString(
          model_ops.tree_ensemble_serialize(tree_ensemble_handle)[1].eval())

    # Output.
    self.assertEqual(3, len(output_ensemble.trees))
    self.assertProtoEquals(self._tree_to_add, output_ensemble.trees[2])
    self.assertAllEqual([1.0, 1.0, 1.0], output_ensemble.tree_weights)
    self.assertEqual(
        2, output_ensemble.tree_metadata[0].num_tree_weight_updates)
    self.assertEqual(
        3, output_ensemble.tree_metadata[1].num_tree_weight_updates)
    self.assertEqual(
        1, output_ensemble.tree_metadata[2].num_tree_weight_updates)
    self.assertAllEqual([1, 6, 1], feature_usage_counts.eval())
    self.assertArrayNear([0.02, 0.4, 0.05], feature_gains.eval(), 1e-6)
def testStreamingQuantileBucketsWithVaryingBatch(self):
  """Sets up the quantile summary op test as follows.

  Creates batches of examples with a different number of inputs in each
  batch. The input values are dense in the range [1 ... N]. The data looks
  like this:

  | Batch | Start | InputList                |
  |   1   |   1   | [1]                      |
  |   2   |   2   | [2, 3]                   |
  |   3   |   4   | [4, 5, 6]                |
  |   4   |   7   | [7, 8, 9, 10]            |
  |   5   |  11   | [11, 12, 13, 14, 15]     |
  |   6   |  16   | [16, 17, 18, 19, 20, 21] |
  """
  num_quantiles = 3
  with self.test_session() as sess:
    accumulator = quantile_ops.QuantileAccumulator(
        init_stamp_token=0, num_quantiles=num_quantiles,
        epsilon=0.001, name="q1")
    resources.initialize_resources(resources.shared_resources()).run()
    input_column = array_ops.placeholder(dtypes.float32)
    weights = array_ops.placeholder(dtypes.float32)
    update = accumulator.add_summary(
        stamp_token=0, column=input_column, example_weights=weights)

  with self.test_session() as sess:
    for i in range(1, 23):
      # start = 1, 2, 4, 7, 11, 16 ... (see the docstring above).
      start = int((i * (i - 1) / 2) + 1)
      sess.run(update,
               {input_column: range(start, start + i),
                weights: [1] * i})

  with self.test_session() as sess:
    sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
    are_ready_flush, buckets = accumulator.get_buckets(stamp_token=1)
    buckets, are_ready_flush = sess.run([buckets, are_ready_flush])
    self.assertEqual(True, are_ready_flush)
    self.assertEqual(num_quantiles + 1, len(buckets))
    self.assertAllEqual([1, 86., 170., 253.], buckets)
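# Illustrative sanity check, not part of the original test: the 22 batches
# above cover the integers 1..253, so the num_quantiles + 1 = 4 boundaries
# should sit near the 0th, 33rd, 67th and 100th percentiles, matching
# [1, 86, 170, 253] up to the accumulator's epsilon. Assumes numpy as np.
import numpy as np

inputs = np.arange(1, 254)  # 1 .. 253, as generated by the batches
print(np.percentile(inputs, [0, 100.0 / 3, 200.0 / 3, 100]))
# -> approximately [  1.  85. 169. 253.]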
def testEnsembleEmpty(self):
  with self.test_session():
    tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()

    tree_ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=0,
        tree_ensemble_config=tree_ensemble_config.SerializeToString(),
        name="full_ensemble")
    resources.initialize_resources(resources.shared_resources()).run()

    result = prediction_ops.gradient_trees_partition_examples(
        tree_ensemble_handle, [self._dense_float_tensor],
        [self._sparse_float_indices1, self._sparse_float_indices2],
        [self._sparse_float_values1, self._sparse_float_values2],
        [self._sparse_float_shape1, self._sparse_float_shape2],
        [self._sparse_int_indices1], [self._sparse_int_values1],
        [self._sparse_int_shape1])

    self.assertAllEqual([0, 0], result.eval())
def testWithEmptyEnsembleAndShrinkage(self):
  with self.test_session():
    # Add shrinkage config.
    learning_rate = 0.0001
    tree_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
    tree_ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=0,
        tree_ensemble_config=tree_ensemble.SerializeToString(),
        name="existing")

    # Create zero feature importance.
    feature_usage_counts = variables.Variable(
        initial_value=np.array([0, 0], np.int64),
        name="feature_usage_counts",
        trainable=False)
    feature_gains = variables.Variable(
        initial_value=np.array([0.0, 0.0], np.float32),
        name="feature_gains",
        trainable=False)

    resources.initialize_resources(resources.shared_resources()).run()
    variables.initialize_all_variables().run()

    output_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
    with ops.control_dependencies([
        ensemble_optimizer_ops.add_trees_to_ensemble(
            tree_ensemble_handle,
            self._ensemble_to_add.SerializeToString(),
            feature_usage_counts, [1, 2],
            feature_gains, [0.5, 0.3], [[], []],
            learning_rate=learning_rate)
    ]):
      output_ensemble.ParseFromString(
          model_ops.tree_ensemble_serialize(tree_ensemble_handle)[1].eval())

    # The new tree is added with the shrinkage weight.
    self.assertAllClose([learning_rate], output_ensemble.tree_weights)
    self.assertEqual(
        1, output_ensemble.tree_metadata[0].num_tree_weight_updates)
    self.assertAllEqual([1, 2], feature_usage_counts.eval())
    self.assertArrayNear([0.5 * learning_rate, 0.3 * learning_rate],
                         feature_gains.eval(), 1e-6)
def testWithEmptyEnsemble(self):
  with self.test_session():
    # Create an empty ensemble.
    tree_ensemble_handle = model_ops.tree_ensemble_variable(
        stamp_token=0, tree_ensemble_config="", name="empty")

    # Create zero feature importance.
    feature_usage_counts = variables.Variable(
        initial_value=array_ops.zeros([1], dtypes.int64),
        name="feature_usage_counts",
        trainable=False)
    feature_gains = variables.Variable(
        initial_value=array_ops.zeros([1], dtypes.float32),
        name="feature_gains",
        trainable=False)

    resources.initialize_resources(resources.shared_resources()).run()
    variables.initialize_all_variables().run()

    with ops.control_dependencies([
        ensemble_optimizer_ops.add_trees_to_ensemble(
            tree_ensemble_handle,
            self._ensemble_to_add.SerializeToString(),
            feature_usage_counts, [2],
            feature_gains, [0.4], [[]],
            learning_rate=1.0)
    ]):
      result = model_ops.tree_ensemble_serialize(tree_ensemble_handle)[1]

    # Output.
    output_ensemble = tree_config_pb2.DecisionTreeEnsembleConfig()
    output_ensemble.ParseFromString(result.eval())
    self.assertProtoEquals(self._tree_to_add, output_ensemble.trees[0])
    self.assertEqual(1, len(output_ensemble.trees))
    self.assertAllEqual([1.0], output_ensemble.tree_weights)
    self.assertEqual(
        1, output_ensemble.tree_metadata[0].num_tree_weight_updates)
    self.assertAllEqual([2], feature_usage_counts.eval())
    self.assertArrayNear([0.4], feature_gains.eval(), 1e-6)