def test_core_value_to_value_classifier(self):
  """A core classifier node converts to the expected ProbabilityValue."""
  src_node = decision_tree_pb2.Node()
  distribution = src_node.classifier.distribution
  distribution.counts[:] = [0.0, 8.0, 2.0]
  distribution.sum = 10.0
  expected = value_lib.ProbabilityValue(
      probability=[0.8, 0.2], num_examples=10)
  self.assertEqual(value_lib.core_value_to_value(src_node), expected)
def test_core_value_to_value_regressor(self):
  """A core regressor node converts to the expected RegressionValue."""
  src_node = decision_tree_pb2.Node()
  regressor = src_node.regressor
  regressor.top_value = 1.0
  regressor.distribution.sum = 10.0
  regressor.distribution.sum_squares = 20.0
  regressor.distribution.count = 10.0
  expected = value_lib.RegressionValue(
      value=1.0, num_examples=10, standard_deviation=1.0)
  self.assertEqual(value_lib.core_value_to_value(src_node), expected)
def test_leaf(self):
  """A python leaf node converts to the matching core proto node."""
  leaf = node_lib.LeafNode(
      value=value_lib.RegressionValue(
          value=5.0, num_examples=10, standard_deviation=1.0))
  expected = decision_tree_pb2.Node(
      regressor=decision_tree_pb2.NodeRegressorOutput(top_value=5.0))
  # count=10, sum=0, sum_squares=10 encodes std-dev 1.0 over 10 examples.
  expected_dist = expected.regressor.distribution
  expected_dist.count = 10.0
  expected_dist.sum = 0
  expected_dist.sum_squares = 10.0
  converted = node_lib.node_to_core_node(
      leaf, data_spec_pb2.DataSpecification())
  self.assertEqual(converted, expected)
  logging.info("node:\n%s", leaf)
def node_to_core_node(
    node: AbstractNode,
    dataspec: data_spec_pb2.DataSpecification) -> decision_tree_pb2.Node:
  """Converts a python node into a core node (proto format).

  Args:
    node: A LeafNode or NonLeafNode to convert.
    dataspec: Dataspec used to encode the node's condition.

  Returns:
    The equivalent core decision_tree_pb2.Node proto.

  Raises:
    ValueError: If `node` is neither a LeafNode nor a NonLeafNode.
  """
  core_node = decision_tree_pb2.Node()
  if isinstance(node, NonLeafNode):
    # Encode the split condition; the (optional) value is only set when
    # the python node carries one.
    condition_lib.set_core_node(node.condition, dataspec, core_node)
    if node.value is not None:
      value_lib.set_core_node(node.value, core_node)
    return core_node
  if isinstance(node, LeafNode):
    value_lib.set_core_node(node.value, core_node)
    return core_node
  raise ValueError(
      f"Expecting a LeafNode or a NonLeafNode. Got {node} instead")
def build_toy_gbdt(path, num_classes):
  """Creates a toy GBDT model compatible with _build_toy_data_spec.

  Writes to `path` every file that makes up a serialized Gradient Boosted
  Trees model: a "done" marker, the dataspec, the abstract model header,
  the GBT-specific header, and one blob-sequence shard with all the nodes.

  Args:
    path: Output directory for the model files (created if missing).
    num_classes: Number of label classes. 2 selects the binomial
      log-likelihood loss (one tree per iteration); any other value selects
      the multinomial loss (one tree per class per iteration).
  """
  logging.info("Create toy model in %s", path)
  tf.io.gfile.makedirs(path)
  # "done" marker file; its content is irrelevant.
  with tf.io.gfile.GFile(os.path.join(path, "done"), "w") as f:
    f.write("Something")
  data_spec = build_toy_data_spec()
  with tf.io.gfile.GFile(os.path.join(path, "data_spec.pb"), "w") as f:
    f.write(data_spec.SerializeToString())
  # Engine-agnostic model header. The label column index differs between
  # the binary and multi-class dataspec layouts.
  header = abstract_model_pb2.AbstractModel(
      name="GRADIENT_BOOSTED_TREES",
      task=abstract_model_pb2.CLASSIFICATION,
      label_col_idx=4 if num_classes == 2 else 3,
      input_features=[0, 1, 2])
  with tf.io.gfile.GFile(os.path.join(path, "header.pb"), "w") as f:
    f.write(header.SerializeToString())
  num_iters = 2
  # Binary classification grows one tree per boosting iteration;
  # multi-class grows one tree per class per iteration.
  num_trees_per_iter = 1 if num_classes == 2 else num_classes
  rf_header = gradient_boosted_trees_pb2.Header(
      num_node_shards=1,
      num_trees=num_iters * num_trees_per_iter,
      loss=gradient_boosted_trees_pb2.BINOMIAL_LOG_LIKELIHOOD
      if num_classes == 2 else
      gradient_boosted_trees_pb2.MULTINOMIAL_LOG_LIKELIHOOD,
      initial_predictions=[1.0] if num_classes == 2 else [0.0] * num_classes,
      num_trees_per_iter=num_trees_per_iter,
      node_format="BLOB_SEQUENCE")
  with tf.io.gfile.GFile(
      os.path.join(path, "gradient_boosted_trees_header.pb"), "w") as f:
    f.write(rf_header.SerializeToString())
  # All trees go into a single blob-sequence node shard.
  with blob_sequence.Writer(os.path.join(
      path, "nodes-00000-of-00001")) as output_file:
    for _ in range(num_iters):
      for tree_in_iter_idx in range(num_trees_per_iter):
        # Each tree has the shape:
        # [a > 1 ]                               // Node 0
        # |-- [label = 1.0 + tree_in_iter_idx]   // Node 1
        # L-- [label = 5.0 + tree_in_iter_idx^2] // Node 2
        #
        # Two classes
        # Case a<=1:
        #   logit = 1.0 + 1.0 * 2 = 3.0
        #   proba = [0.0474259, 0.9525741]
        # Case a>1:
        #   logit = 1.0 + 5.0 * 2 = 11.0
        #   proba = [1.67e-05, 0.9999833]
        #
        # Three classes
        # Case a<=1:
        #   logit = [1.0 * 2, 2.0 * 2, 3.0 * 2] = [2.0, 4.0, 6.0]
        #   proba = [0.01587624 0.11731043 0.86681333]
        # Case a>1:
        #   logit = [5.0 * 2, 6.0 * 2, 9.0 * 2] = [10.0, 12.0, 18.0]
        #   proba = [0.00033452 0.00247179 0.99719369]
        #   (NOTE(review): the old comment repeated the a<=1 probabilities
        #   here, which cannot match softmax([10, 12, 18]).)

        # Node 0
        node = decision_tree_pb2.Node(
            condition=decision_tree_pb2.NodeCondition(
                na_value=False,
                attribute=0,
                condition=decision_tree_pb2.Condition(
                    higher_condition=decision_tree_pb2.Condition.Higher(
                        threshold=1.0)),
            ))
        output_file.write(node.SerializeToString())
        # Node 1
        node = decision_tree_pb2.Node(
            regressor=decision_tree_pb2.NodeRegressorOutput(
                top_value=1.0 + tree_in_iter_idx))
        output_file.write(node.SerializeToString())
        # Node 2
        node = decision_tree_pb2.Node(
            regressor=decision_tree_pb2.NodeRegressorOutput(
                top_value=5.0 + tree_in_iter_idx * tree_in_iter_idx))
        output_file.write(node.SerializeToString())
def build_toy_random_forest(path,
                            winner_take_all_inference,
                            add_boolean_features=False,
                            has_catset=False,
                            num_trees=2):
  """Creates a toy Random Forest model compatible with _build_toy_data_spec.

  Writes to `path` every file that makes up a serialized Random Forest
  model: a "done" marker, the dataspec, the abstract model header, the
  RF-specific header, and one blob-sequence shard containing `num_trees`
  identical trees.

  Args:
    path: Output directory for the model files (created if missing).
    winner_take_all_inference: Forwarded to the Random Forest header; selects
      voting vs. probability-averaging at inference time.
    add_boolean_features: If True, the dataspec gains a boolean feature and
      Node 6 becomes a condition on it (see the tree diagram below).
    has_catset: If True, Node 4's condition uses a categorical-set feature.
    num_trees: Number of (identical) trees written to the model.
  """
  logging.info("Create toy model in %s", path)
  tf.io.gfile.makedirs(path)
  # "done" marker file; its content is irrelevant.
  with tf.io.gfile.GFile(os.path.join(path, "done"), "w") as f:
    f.write("Something")
  data_spec = build_toy_data_spec(add_boolean_features=add_boolean_features,
                                  has_catset=has_catset)
  with tf.io.gfile.GFile(os.path.join(path, "data_spec.pb"), "w") as f:
    f.write(data_spec.SerializeToString())
  # NOTE(review): column 5 is listed both for the boolean feature and the
  # catset feature — if both flags are True, index 5 appears twice in
  # input_features. Presumably the flags are mutually exclusive in callers;
  # TODO confirm.
  header = abstract_model_pb2.AbstractModel(
      name="RANDOM_FOREST",
      task=abstract_model_pb2.CLASSIFICATION,
      label_col_idx=3,
      input_features=[0, 1, 2] + ([5] if add_boolean_features else []) +
      ([5, 6] if has_catset else []))
  with tf.io.gfile.GFile(os.path.join(path, "header.pb"), "w") as f:
    f.write(header.SerializeToString())
  rf_header = random_forest_pb2.Header(
      num_node_shards=1,
      num_trees=num_trees,
      winner_take_all_inference=winner_take_all_inference,
      node_format="BLOB_SEQUENCE")
  with tf.io.gfile.GFile(os.path.join(path, "random_forest_header.pb"),
                         "w") as f:
    f.write(rf_header.SerializeToString())
  # All trees go into a single blob-sequence node shard.
  with blob_sequence.Writer(os.path.join(
      path, "nodes-00000-of-00001")) as output_file:
    for _ in range(rf_header.num_trees):
      # Each tree has the shape:
      # [a > 1 ]                        // Node 0
      # |-- [b in ["x","y"] ]           // Node 1
      # |   |-- [label = 80%;10%;10%]   // Node 2
      # |   L-- [label = 10%;80%;10%]   // Node 3
      # L-- [c in [1, 3] ]              // Node 4
      #     |-- [label = 50%;50%;0%]    // Node 5
      #     L-- [label = 0%;50%;50%]    // Node 6
      #
      # If add_boolean_features is True, Node 6 is repurposed as follows:
      #
      # ['bool' is True]                // Node 6
      # | -- [label = 0%;20%;80%]       // Node 7
      # L -- [label = 0%;80%;20%]       // Node 8
      #
      # If has_catset is True, Node 4's condition is replaced by:
      # [ d \intersect [1,3] != \emptyset ]

      # Node 0
      node = decision_tree_pb2.Node(
          condition=decision_tree_pb2.NodeCondition(
              na_value=False,
              attribute=0,
              condition=decision_tree_pb2.Condition(
                  higher_condition=decision_tree_pb2.Condition.Higher(
                      threshold=1.0)),
          ))
      output_file.write(node.SerializeToString())
      # Node 1
      node = decision_tree_pb2.Node(
          condition=decision_tree_pb2.NodeCondition(
              na_value=False,
              attribute=1,
              condition=decision_tree_pb2.Condition(
                  contains_bitmap_condition=decision_tree_pb2.Condition.
                  ContainsBitmap(elements_bitmap=b"\x06")),  # [1,2]
          ))
      output_file.write(node.SerializeToString())
      # Node 2
      node = decision_tree_pb2.Node(
          classifier=decision_tree_pb2.NodeClassifierOutput(
              top_value=1,
              distribution=distribution_pb2.IntegerDistributionDouble(
                  counts=[0, 0.8, 0.1, 0.1], sum=1)))
      output_file.write(node.SerializeToString())
      # Node 3
      node = decision_tree_pb2.Node(
          classifier=decision_tree_pb2.NodeClassifierOutput(
              top_value=2,
              distribution=distribution_pb2.IntegerDistributionDouble(
                  counts=[0, 0.1, 0.8, 0.1], sum=1)))
      output_file.write(node.SerializeToString())
      # Node 4 — attribute 5 is the catset feature when has_catset is set,
      # otherwise the plain categorical feature at index 2 is used.
      node = decision_tree_pb2.Node(
          condition=decision_tree_pb2.NodeCondition(
              na_value=False,
              attribute=5 if has_catset else 2,
              condition=decision_tree_pb2.Condition(
                  contains_condition=decision_tree_pb2.Condition.
                  ContainsVector(elements=[1, 3]))))
      output_file.write(node.SerializeToString())
      # Node 5
      node = decision_tree_pb2.Node(
          classifier=decision_tree_pb2.NodeClassifierOutput(
              top_value=1,
              distribution=distribution_pb2.IntegerDistributionDouble(
                  counts=[0, 1, 1, 0], sum=2)))
      output_file.write(node.SerializeToString())
      if not add_boolean_features:
        # Node 6 (leaf variant)
        node = decision_tree_pb2.Node(
            classifier=decision_tree_pb2.NodeClassifierOutput(
                top_value=2,
                distribution=distribution_pb2.
                IntegerDistributionDouble(counts=[0, 0, 1, 1], sum=2)))
        output_file.write(node.SerializeToString())
      else:
        # Node 6 (boolean-condition variant)
        node = decision_tree_pb2.Node(
            condition=decision_tree_pb2.NodeCondition(
                na_value=False,
                attribute=5,
                condition=decision_tree_pb2.Condition(
                    true_value_condition=decision_tree_pb2.Condition.
                    TrueValue())))
        output_file.write(node.SerializeToString())
        # Node 7
        node = decision_tree_pb2.Node(
            classifier=decision_tree_pb2.NodeClassifierOutput(
                top_value=3,
                distribution=distribution_pb2.
                IntegerDistributionDouble(counts=[0, 0, 0.2, 0.8], sum=1)))
        output_file.write(node.SerializeToString())
        # Node 8
        node = decision_tree_pb2.Node(
            classifier=decision_tree_pb2.NodeClassifierOutput(
                top_value=2,
                distribution=distribution_pb2.
                IntegerDistributionDouble(counts=[0, 0, 0.8, 0.2], sum=1)))
        output_file.write(node.SerializeToString())