コード例 #1
0
 def test_core_value_to_value_classifier(self):
     """Checks the conversion of a classifier core node into a python value.

     The core node carries the counts [0, 8, 2] with a sum of 10; the
     expected python value is a `ProbabilityValue` with probabilities
     [0.8, 0.2] and 10 examples.
     """
     proto_node = decision_tree_pb2.Node()
     label_distribution = proto_node.classifier.distribution
     # NOTE(review): the expected probabilities have one entry fewer than
     # the counts; presumably index 0 is a reserved slot that the
     # conversion drops -- confirm in value_lib.
     label_distribution.counts[:] = [0.0, 8.0, 2.0]
     label_distribution.sum = 10.0

     actual = value_lib.core_value_to_value(proto_node)
     expected = value_lib.ProbabilityValue(probability=[0.8, 0.2],
                                           num_examples=10)
     self.assertEqual(actual, expected)
コード例 #2
0
 def test_core_value_to_value_regressor(self):
     """Checks the conversion of a regressor core node into a python value.

     The core node has a top value of 1 and a distribution with sum=10,
     sum_squares=20 and count=10; the expected python value is a
     `RegressionValue` with value 1, 10 examples and a standard deviation
     of 1.
     """
     proto_node = decision_tree_pb2.Node()
     regressor = proto_node.regressor
     regressor.top_value = 1.0

     # These statistics are consistent with a standard deviation of
     # sqrt(sum_squares / count - (sum / count)^2) = sqrt(2 - 1) = 1.
     stats = regressor.distribution
     stats.sum = 10.0
     stats.sum_squares = 20.0
     stats.count = 10.0

     actual = value_lib.core_value_to_value(proto_node)
     expected = value_lib.RegressionValue(value=1.0,
                                          num_examples=10,
                                          standard_deviation=1.0)
     self.assertEqual(actual, expected)
コード例 #3
0
 def test_leaf(self):
     """Checks that a regression leaf node converts to the expected proto.

     A `LeafNode` holding a `RegressionValue` (value=5, 10 examples,
     standard deviation 1) is converted with `node_to_core_node` and an
     empty dataspec, then compared against a hand-built core node.
     """
     leaf_value = value_lib.RegressionValue(value=5.0,
                                            num_examples=10,
                                            standard_deviation=1.0)
     node = node_lib.LeafNode(value=leaf_value)

     # Expected proto: top value 5 plus the distribution statistics.
     expected = decision_tree_pb2.Node(
         regressor=decision_tree_pb2.NodeRegressorOutput(top_value=5.0))
     stats = expected.regressor.distribution
     stats.count = 10.0
     stats.sum = 0
     stats.sum_squares = 10.0

     actual = node_lib.node_to_core_node(
         node, data_spec_pb2.DataSpecification())
     self.assertEqual(actual, expected)
     logging.info("node:\n%s", node)
コード例 #4
0
def node_to_core_node(
        node: AbstractNode,
        dataspec: data_spec_pb2.DataSpecification) -> decision_tree_pb2.Node:
    """Builds the core (proto) representation of a python node.

    Args:
      node: The python node to convert; a `NonLeafNode` or a `LeafNode`.
      dataspec: Dataspec forwarded to `condition_lib.set_core_node` when
        the condition of a non-leaf node is exported.

    Returns:
      A new `decision_tree_pb2.Node` populated from `node`.

    Raises:
      ValueError: If `node` is neither a `NonLeafNode` nor a `LeafNode`.
    """

    proto_node = decision_tree_pb2.Node()

    if isinstance(node, NonLeafNode):
        condition_lib.set_core_node(node.condition, dataspec, proto_node)
        # The value of a non-leaf node is optional.
        if node.value is not None:
            value_lib.set_core_node(node.value, proto_node)
        return proto_node

    if isinstance(node, LeafNode):
        value_lib.set_core_node(node.value, proto_node)
        return proto_node

    raise ValueError(
        f"Expecting a LeafNode or a NonLeafNode. Got {node} instead")
コード例 #5
0
def build_toy_gbdt(path, num_classes):
    """Writes a toy Gradient Boosted Trees model into the directory `path`.

    The model uses the dataspec returned by `build_toy_data_spec`. The
    following files are written: "done", "data_spec.pb", "header.pb",
    "gradient_boosted_trees_header.pb" and the node shard
    "nodes-00000-of-00001".

    Args:
      path: Output directory; it is created with `tf.io.gfile.makedirs`.
      num_classes: Number of label classes. With 2 classes, one tree per
        iteration and the binomial log-likelihood loss are used; otherwise
        `num_classes` trees per iteration and the multinomial
        log-likelihood loss are used.
    """

    logging.info("Create toy model in %s", path)

    tf.io.gfile.makedirs(path)

    def _dump(filename, payload):
        # Writes `payload` into the file `filename` of the model directory.
        with tf.io.gfile.GFile(os.path.join(path, filename), "w") as stream:
            stream.write(payload)

    def _regression_leaf(top_value):
        # Leaf node that only carries a regression output value.
        return decision_tree_pb2.Node(
            regressor=decision_tree_pb2.NodeRegressorOutput(
                top_value=top_value))

    is_binary = num_classes == 2

    _dump("done", "Something")
    _dump("data_spec.pb", build_toy_data_spec().SerializeToString())

    model_header = abstract_model_pb2.AbstractModel(
        name="GRADIENT_BOOSTED_TREES",
        task=abstract_model_pb2.CLASSIFICATION,
        label_col_idx=4 if is_binary else 3,
        input_features=[0, 1, 2])
    _dump("header.pb", model_header.SerializeToString())

    num_iterations = 2
    if is_binary:
        trees_per_iteration = 1
        loss = gradient_boosted_trees_pb2.BINOMIAL_LOG_LIKELIHOOD
        initial_predictions = [1.0]
    else:
        trees_per_iteration = num_classes
        loss = gradient_boosted_trees_pb2.MULTINOMIAL_LOG_LIKELIHOOD
        initial_predictions = [0.0] * num_classes

    gbt_header = gradient_boosted_trees_pb2.Header(
        num_node_shards=1,
        num_trees=num_iterations * trees_per_iteration,
        loss=loss,
        initial_predictions=initial_predictions,
        num_trees_per_iter=trees_per_iteration,
        node_format="BLOB_SEQUENCE")
    _dump("gradient_boosted_trees_header.pb", gbt_header.SerializeToString())

    node_shard_path = os.path.join(path, "nodes-00000-of-00001")
    with blob_sequence.Writer(node_shard_path) as output_file:

        for _ in range(num_iterations):
            for tree_in_iter_idx in range(trees_per_iteration):

                # Every tree has the same structure; nodes are written in
                # the order Node 0, Node 1, Node 2:
                #
                # [a > 1] // Node 0
                #   |-- [label = 1.0 + tree_in_iter_idx] // Node 1
                #   L-- [label = 5.0 + tree_in_iter_idx^2] // Node 2
                #
                # Expected predictions over the two iterations.
                #
                # Two classes (initial prediction 1.0)
                #   Case a<=1:
                #     logit = 1.0 + 1.0 * 2 = 3.0
                #     proba = [0.0474259, 0.9525741]
                #   Case a>1:
                #     logit = 1.0 + 5.0 * 2 = 11.0
                #     proba = [1.67e-05, 0.9999833]
                #
                # Three classes (initial predictions 0.0)
                #   Case a<=1:
                #     logit = [1.0 * 2, 2.0 * 2, 3.0 * 2] = [2.0, 4.0, 6.0]
                #     proba = [0.01587624 0.11731043 0.86681333]
                #   Case a>1:
                #     logit = [5.0 * 2, 6.0 * 2, 9.0 * 2] = [10.0, 12.0, 18.0]
                #     proba = [0.00033452 0.00247180 0.99719368]
                #     (softmax of the logits, as in the a<=1 case)

                # Node 0: split on attribute 0 with threshold 1.
                root = decision_tree_pb2.Node(
                    condition=decision_tree_pb2.NodeCondition(
                        na_value=False,
                        attribute=0,
                        condition=decision_tree_pb2.Condition(
                            higher_condition=decision_tree_pb2.Condition.
                            Higher(threshold=1.0)),
                    ))
                output_file.write(root.SerializeToString())

                # Node 1
                first_leaf = _regression_leaf(1.0 + tree_in_iter_idx)
                output_file.write(first_leaf.SerializeToString())

                # Node 2
                second_leaf = _regression_leaf(
                    5.0 + tree_in_iter_idx * tree_in_iter_idx)
                output_file.write(second_leaf.SerializeToString())
コード例 #6
0
def build_toy_random_forest(path,
                            winner_take_all_inference,
                            add_boolean_features=False,
                            has_catset=False,
                            num_trees=2):
    """Writes a toy Random Forest model into the directory `path`.

    The model uses the dataspec returned by `build_toy_data_spec`. The
    following files are written: "done", "data_spec.pb", "header.pb",
    "random_forest_header.pb" and the node shard "nodes-00000-of-00001".
    All the trees of the forest are identical.

    Args:
      path: Output directory; it is created with `tf.io.gfile.makedirs`.
      winner_take_all_inference: Value stored in the field of the same name
        of the Random Forest header.
      add_boolean_features: If true, attribute 5 is added to the input
        features and Node 6 becomes a boolean split with two extra leaves.
      has_catset: If true, attributes 5 and 6 are added to the input
        features and the condition of Node 4 tests attribute 5 instead of
        attribute 2.
      num_trees: Number of trees in the forest.
    """
    # NOTE(review): both `add_boolean_features` and `has_catset` refer to
    # attribute index 5; enabling both at once looks unsupported -- confirm
    # against build_toy_data_spec.

    logging.info("Create toy model in %s", path)

    tf.io.gfile.makedirs(path)

    def _dump(filename, payload):
        # Writes `payload` into the file `filename` of the model directory.
        with tf.io.gfile.GFile(os.path.join(path, filename), "w") as stream:
            stream.write(payload)

    def _split(attribute, condition):
        # Non-leaf node testing `condition` on the column `attribute`.
        return decision_tree_pb2.Node(
            condition=decision_tree_pb2.NodeCondition(
                na_value=False, attribute=attribute, condition=condition))

    def _leaf(top_value, counts, total):
        # Classification leaf with the given label distribution.
        return decision_tree_pb2.Node(
            classifier=decision_tree_pb2.NodeClassifierOutput(
                top_value=top_value,
                distribution=distribution_pb2.IntegerDistributionDouble(
                    counts=counts, sum=total)))

    _dump("done", "Something")

    data_spec = build_toy_data_spec(add_boolean_features=add_boolean_features,
                                    has_catset=has_catset)
    _dump("data_spec.pb", data_spec.SerializeToString())

    feature_indices = [0, 1, 2]
    if add_boolean_features:
        feature_indices.append(5)
    if has_catset:
        feature_indices.extend([5, 6])

    model_header = abstract_model_pb2.AbstractModel(
        name="RANDOM_FOREST",
        task=abstract_model_pb2.CLASSIFICATION,
        label_col_idx=3,
        input_features=feature_indices)
    _dump("header.pb", model_header.SerializeToString())

    forest_header = random_forest_pb2.Header(
        num_node_shards=1,
        num_trees=num_trees,
        winner_take_all_inference=winner_take_all_inference,
        node_format="BLOB_SEQUENCE")
    _dump("random_forest_header.pb", forest_header.SerializeToString())

    condition_cls = decision_tree_pb2.Condition

    node_shard_path = os.path.join(path, "nodes-00000-of-00001")
    with blob_sequence.Writer(node_shard_path) as output_file:

        for _ in range(forest_header.num_trees):
            # Tree layout; the nodes are written in increasing node index:
            #
            # [a > 1] // Node 0
            #   |-- [b in ["x", "y"]] // Node 1
            #   |     |-- [label = 80%;10%;10%] // Node 2
            #   |     L-- [label = 10%;80%;10%] // Node 3
            #   L-- [c in [1, 3]] // Node 4
            #         |-- [label = 50%;50%;0%] // Node 5
            #         L-- [label = 0%;50%;50%] // Node 6
            #
            # If add_boolean_features is True, Node 6 is repurposed as
            # follows:
            #
            # ['bool' is True] // Node 6
            #   |-- [label = 0%;20%;80%] // Node 7
            #   L-- [label = 0%;80%;20%] // Node 8
            #
            # If has_catset is True, the condition of Node 4 is replaced by:
            #   [d \intersect [1, 3] != \emptyset]
            tree_nodes = [
                # Node 0
                _split(
                    0,
                    condition_cls(higher_condition=condition_cls.Higher(
                        threshold=1.0))),
                # Node 1; the bitmap 0x06 has bits 1 and 2 set, i.e. the
                # items [1, 2].
                _split(
                    1,
                    condition_cls(
                        contains_bitmap_condition=condition_cls.
                        ContainsBitmap(elements_bitmap=b"\x06"))),
                # Node 2
                _leaf(1, [0, 0.8, 0.1, 0.1], 1),
                # Node 3
                _leaf(2, [0, 0.1, 0.8, 0.1], 1),
                # Node 4
                _split(
                    5 if has_catset else 2,
                    condition_cls(
                        contains_condition=condition_cls.ContainsVector(
                            elements=[1, 3]))),
                # Node 5
                _leaf(1, [0, 1, 1, 0], 2),
            ]

            if add_boolean_features:
                tree_nodes += [
                    # Node 6
                    _split(
                        5,
                        condition_cls(
                            true_value_condition=condition_cls.TrueValue())),
                    # Node 7
                    _leaf(3, [0, 0, 0.2, 0.8], 1),
                    # Node 8
                    _leaf(2, [0, 0, 0.8, 0.2], 1),
                ]
            else:
                # Node 6
                tree_nodes.append(_leaf(2, [0, 0, 1, 1], 2))

            for tree_node in tree_nodes:
                output_file.write(tree_node.SerializeToString())