Example #1
def test_sum_node_create_and_eval():
    # create child nodes
    child1 = Node()
    val1 = 1.
    child1.set_val(val1)

    child2 = Node()
    val2 = 1.
    child2.set_val(val2)

    # create a sum node and add children to it
    sum_node = SumNode()
    weight1 = 0.8
    weight2 = 0.2
    sum_node.add_child(child1, weight1)
    sum_node.add_child(child2, weight2)
    assert len(sum_node.children) == 2
    assert len(sum_node.weights) == 2
    assert len(sum_node.log_weights) == 2
    log_weights = [log(weight1), log(weight2)]
    assert log_weights == sum_node.log_weights

    print(sum_node)

    # evaluating
    sum_node.eval()
    print(sum_node.log_val)
    assert_almost_equal(sum_node.log_val,
                        log(val1 * weight1 + val2 * weight2),
                        places=15)

    # changing values 1,0
    val1 = 1.
    child1.set_val(val1)
    val2 = 0.
    child2.set_val(val2)

    # evaluating
    sum_node.eval()
    print(sum_node.log_val)
    assert_almost_equal(sum_node.log_val,
                        log(val1 * weight1 + val2 * weight2),
                        places=15)

    # changing values 0,0 -> LOG_ZERO
    val1 = 0.
    child1.set_val(val1)
    val2 = 0.
    child2.set_val(val2)

    # evaluating
    sum_node.eval()
    print(sum_node.log_val)
    assert_almost_equal(sum_node.log_val,
                        LOG_ZERO,
                        places=15)
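For reference, the quantity these assertions check is log(val1 * weight1 + val2 * weight2): a sum node's log value is the log of the weighted sum of its children's values, with a LOG_ZERO sentinel when that sum is zero. Below is a minimal, self-contained sketch of that computation using only the standard library; sum_node_log_val and the LOG_ZERO placeholder value are illustrative names, not part of the library.

from math import log

LOG_ZERO = -1e3  # illustrative sentinel; the library defines its own constant

def sum_node_log_val(child_vals, weights):
    # log(sum_i w_i * v_i), or LOG_ZERO when the weighted sum is zero
    total = sum(w * v for w, v in zip(weights, child_vals))
    return log(total) if total > 0 else LOG_ZERO

assert abs(sum_node_log_val([1., 1.], [0.8, 0.2]) - log(1.0)) < 1e-15
assert abs(sum_node_log_val([1., 0.], [0.8, 0.2]) - log(0.8)) < 1e-15
assert sum_node_log_val([0., 0.], [0.8, 0.2]) == LOG_ZERO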
Example #2
def test_sum_node_create_and_eval_keras():

    n_trials = 100
    for i in range(n_trials):

        n_children = numpy.random.randint(1, 100)
        print('n children', n_children)
        children = [Node() for c in range(n_children)]
        weights = numpy.random.rand(n_children)
        weights = weights / weights.sum()

        #
        # create a sum node and add children to it
        sum_node = SumNode()

        for child, w in zip(children, weights):
            sum_node.add_child(child, w)
            child.log_vals = K.placeholder(ndim=2)

        assert len(sum_node.children) == n_children
        assert len(sum_node.weights) == n_children
        assert len(sum_node.log_weights) == n_children

        print(sum_node)

        #
        # evaluating for fake probabilities
        n_instances = numpy.random.randint(1, 100)
        print('n instances', n_instances)
        probs = numpy.random.rand(n_instances, n_children)  # .astype(theano.config.floatX)
        log_probs = numpy.log(probs)

        log_vals = []
        for d in range(n_instances):
            for c, child in enumerate(children):
                child.set_val(probs[d, c])

            sum_node.eval()
            print('sum node eval')
            print(sum_node.log_val)
            log_vals.append(sum_node.log_val)

        #
        # now the same evaluation through the keras backend
        sum_node.build_k()
        eval_sum_node_f = K.function(inputs=[c.log_vals for c in children],
                                     outputs=[sum_node.log_vals])
        keras_log_vals = eval_sum_node_f([log_probs[:, c].reshape(log_probs.shape[0], 1)
                                          for c in range(n_children)])[0]
        print(keras_log_vals)

        assert_array_almost_equal(numpy.array(log_vals).reshape(log_probs.shape[0], 1),
                                  keras_log_vals,
                                  decimal=4)
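What the compiled backend function is asked to reproduce is the same weighted sum, evaluated for a whole batch of instances at once. A small numpy-only sketch of that equivalence (shapes mirror the test, one column of probabilities per child); it is illustrative and does not touch the backend API.

import numpy

rng = numpy.random.RandomState(0)
n_instances, n_children = 4, 3
weights = rng.rand(n_children)
weights = weights / weights.sum()
probs = rng.rand(n_instances, n_children)

# per-instance loop, as in the first half of the test
loop_vals = numpy.array([numpy.log(numpy.dot(weights, probs[d]))
                         for d in range(n_instances)])

# single batched evaluation, as the compiled backend function computes it
batch_vals = numpy.log(probs @ weights)

assert numpy.allclose(loop_vals, batch_vals)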
Example #3
def estimate_kernel_density_spn(data_slice, feature_sizes, data, alpha,
                                node_id_assoc, building_stack,
                                slices_to_process):
    """
    A mixture with one component for each instance
    """

    instance_ids = data_slice.instance_ids
    feature_ids = data_slice.feature_ids
    current_id = data_slice.id

    n_instances = len(instance_ids)
    n_features = len(feature_ids)

    logging.info('Adding a kernel density estimation ' +
                 'over a slice {0} X {1}'.format(n_instances, n_features))

    #
    # create sum node
    root_sum_node = SumNode(var_scope=frozenset(feature_ids))

    data_slice.type = SumNode
    building_stack.append(data_slice)

    root_sum_node.id = current_id
    node_id_assoc[current_id] = root_sum_node

    #
    # for each instance
    for i in instance_ids:
        #
        # create a slice
        instance_slice = DataSlice(numpy.array([i]), feature_ids)
        slices_to_process.append(instance_slice)
        #
        # linking with appropriate weight
        data_slice.add_child(instance_slice, 1.0 / n_instances)

    return root_sum_node, node_id_assoc, building_stack, slices_to_process
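The kernel density estimate is thus just a sum node with one child slice per training instance, each weighted uniformly by 1.0 / n_instances. A tiny illustrative check of that weighting (the helper name is made up for this sketch, not part of the library):

def uniform_mixture_weights(n_instances):
    # one mixture component per instance, all weighted equally
    return [1.0 / n_instances] * n_instances

weights = uniform_mixture_weights(4)
assert weights == [0.25, 0.25, 0.25, 0.25]
assert sum(weights) == 1.0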
Example #4
def test_product_layer_is_decomposable():
    # creating scopes and nodes
    scope1 = frozenset({0, 2, 3})
    scope2 = frozenset({10, 9})
    prod_node_1 = ProductNode(var_scope=scope1)
    prod_node_2 = ProductNode(var_scope=scope2)

    # creating children manually (argh!)
    for var in scope1:
        prod_node_1.add_child(SumNode(var_scope=frozenset({var})))
    for var in scope2:
        prod_node_2.add_child(CategoricalSmoothedNode(var=var,
                                                      var_values=2))

    # creating layer
    prod_layer = ProductLayer(nodes=[prod_node_1, prod_node_2])

    assert prod_layer.is_decomposable()

    # making it not decomposable anymore
    scope3 = frozenset({2})
    prod_node_1.add_child(SumNode(var_scope=scope3))

    assert not prod_layer.is_decomposable()
Example #5
def test_sum_node_normalize():
    # create child nodes
    child1 = Node()
    val1 = 1.
    child1.set_val(val1)

    child2 = Node()
    val2 = 1.
    child2.set_val(val2)

    # create a sum node and add children to it
    sum_node = SumNode()
    weight1 = 1.
    weight2 = 0.2
    weights = [weight1, weight2]
    sum_node.add_child(child1, weight1)
    sum_node.add_child(child2, weight2)
    un_sum = sum(weights)

    # normalizing
    sum_node.normalize()
    assert len(sum_node.children) == 2
    assert len(sum_node.weights) == 2
    assert len(sum_node.log_weights) == 2

    # checking weight sum
    w_sum = sum(sum_node.weights)
    assert w_sum == 1.

    # and check the correct values
    normal_sum = [weight / un_sum for weight in weights]
    print(normal_sum)
    assert normal_sum == sum_node.weights

    # checking log_weights
    log_weights = [log(weight) for weight in normal_sum]
    print(log_weights)
    assert log_weights == sum_node.log_weights
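Normalization divides each weight by the sum of all weights so that they add up to one, and the log weights are recomputed from the normalized values, which is exactly what the assertions above verify. A minimal stand-alone sketch of that computation (normalize_weights is an illustrative name, not the library API):

from math import log

def normalize_weights(weights):
    total = sum(weights)
    normalized = [w / total for w in weights]
    return normalized, [log(w) for w in normalized]

weights, log_weights = normalize_weights([1., 0.2])
# weights is [1/1.2, 0.2/1.2], i.e. roughly [0.833, 0.167], summing to 1.0
assert abs(sum(weights) - 1.0) < 1e-12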
Example #6
def test_categorical_to_indicator_input_layer():
    #
    # creating all the data slices
    # the slicing is a fake stub
    # rows = 5
    # cols = 5
    var_1 = 0
    values_1 = 2
    var_2 = 1
    values_2 = 3
    var_3 = 2
    values_3 = 4

    node_1 = SumNode()
    node_1.id = 1

    node_2 = ProductNode()
    node_2.id = 2

    node_3 = SumNode()
    node_3.id = 3

    # adding first level
    weight_12 = 0.4
    weight_13 = 0.6
    node_1.add_child(node_2, weight_12)
    node_1.add_child(node_3, weight_13)

    node_4 = ProductNode()
    node_4.id = 4

    leaf_5 = CategoricalSmoothedNode(var_1, values_1)
    leaf_5.id = 5

    # not adding the slice to the stack

    node_2.add_child(node_4)
    node_2.add_child(leaf_5)

    node_6 = SumNode()
    node_6.id = 6

    node_7 = SumNode()
    node_7.id = 7

    weight_36 = 0.1
    weight_37 = 0.9
    node_3.add_child(node_6, weight_36)
    node_3.add_child(node_7, weight_37)

    node_8 = ProductNode()
    node_8.id = 8

    leaf_15 = CategoricalSmoothedNode(var_2, values_2)
    leaf_15.id = 15

    node_4.add_child(node_8)
    node_4.add_child(leaf_15)

    leaf_13 = CategoricalSmoothedNode(var_3, values_3)
    leaf_13.id = 13

    leaf_14 = CategoricalSmoothedNode(var_1, values_1)
    leaf_14.id = 14

    node_8.add_child(leaf_13)
    node_8.add_child(leaf_14)

    node_9 = ProductNode()
    node_9.id = 9

    leaf_16 = CategoricalSmoothedNode(var_2, values_2)
    leaf_16.id = 16

    leaf_17 = CategoricalSmoothedNode(var_3, values_3)
    leaf_17.id = 17

    node_9.add_child(leaf_16)
    node_9.add_child(leaf_17)

    node_10 = ProductNode()
    node_10.id = 10

    leaf_18 = CategoricalSmoothedNode(var_2, values_2)
    leaf_18.id = 18

    leaf_19 = CategoricalSmoothedNode(var_2, values_2)
    leaf_19.id = 19

    node_10.add_child(leaf_18)
    node_10.add_child(leaf_19)

    weight_69 = 0.3
    weight_610 = 0.7
    node_6.add_child(node_9, weight_69)
    node_6.add_child(node_10, weight_610)

    node_11 = ProductNode()
    node_11.id = 11

    leaf_20 = CategoricalSmoothedNode(var_1, values_1)
    leaf_20.id = 20

    leaf_21 = CategoricalSmoothedNode(var_3, values_3)
    leaf_21.id = 21

    node_11.add_child(leaf_20)
    node_11.add_child(leaf_21)

    node_12 = ProductNode()
    node_12.id = 12

    leaf_22 = CategoricalSmoothedNode(var_1, values_1)
    leaf_22.id = 22

    leaf_23 = CategoricalSmoothedNode(var_3, values_3)
    leaf_23.id = 23

    node_12.add_child(leaf_22)
    node_12.add_child(leaf_23)

    weight_711 = 0.5
    weight_712 = 0.5
    node_7.add_child(node_11, weight_711)
    node_7.add_child(node_12, weight_712)

    root_node = SpnFactory.layered_pruned_linked_spn(node_1)

    print('ROOT NODE', root_node)

    spn = SpnFactory.layered_linked_spn(root_node)

    print('SPN', spn)

    assert spn.n_layers() == 3

    for i, layer in enumerate(spn.top_down_layers()):
        if i == 0:
            assert layer.n_nodes() == 1
        elif i == 1:
            assert layer.n_nodes() == 5
        elif i == 2:
            assert layer.n_nodes() == 12

    #
    # changing input layer
    spn = linked_categorical_input_to_indicators(spn)

    print('Changed input layer to indicator variables')
    print(spn)
Example #7
def test_product_node_is_decomposable():
    # create a prod node with a scope
    scope = frozenset({0, 2, 7, 13})

    # creating sub scopes
    sub_scope_1 = frozenset({0})
    sub_scope_2 = frozenset({0, 2})
    sub_scope_3 = frozenset({7})
    sub_scope_4 = frozenset({17})
    sub_scope_5 = frozenset({7, 13})

    # now with decomposable children
    child1 = SumNode(var_scope=sub_scope_2)
    child2 = SumNode(var_scope=sub_scope_5)
    child3 = SumNode(var_scope=sub_scope_2)
    child4 = SumNode(var_scope=sub_scope_1)

    prod_node = ProductNode(var_scope=scope)
    prod_node.add_child(child1)
    prod_node.add_child(child2)

    assert prod_node.is_decomposable()

    prod_node = ProductNode(var_scope=scope)
    prod_node.add_child(child4)
    prod_node.add_child(child1)
    prod_node.add_child(child2)

    assert not prod_node.is_decomposable()

    prod_node = ProductNode(var_scope=scope)
    prod_node.add_child(child4)
    prod_node.add_child(child2)

    assert not prod_node.is_decomposable()

    # now with input nodes
    child5 = CategoricalSmoothedNode(var=0, var_values=2)
    child6 = CategoricalSmoothedNode(var=2, var_values=2)
    child7 = CategoricalSmoothedNode(var=7, var_values=2)
    child8 = CategoricalSmoothedNode(var=13, var_values=2)
    child9 = CategoricalSmoothedNode(var=17, var_values=2)

    prod_node = ProductNode(var_scope=scope)
    prod_node.add_child(child5)
    prod_node.add_child(child6)
    prod_node.add_child(child7)
    prod_node.add_child(child8)

    assert prod_node.is_decomposable()

    prod_node = ProductNode(var_scope=scope)
    prod_node.add_child(child5)
    prod_node.add_child(child6)
    prod_node.add_child(child7)
    prod_node.add_child(child9)

    assert not prod_node.is_decomposable()

    prod_node = ProductNode(var_scope=scope)
    prod_node.add_child(child5)
    prod_node.add_child(child6)
    prod_node.add_child(child8)

    assert not prod_node.is_decomposable()
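The decomposability property exercised above requires that the children of a product node have pairwise disjoint scopes whose union is exactly the node's own scope. Below is a hedged, self-contained version of that check over plain frozensets; it mirrors the test cases but is not the library's implementation.

def is_decomposable(node_scope, child_scopes):
    union = frozenset()
    for scope in child_scopes:
        if union & scope:        # overlapping child scopes break decomposability
            return False
        union = union | scope
    return union == node_scope   # children must cover the whole scope

scope = frozenset({0, 2, 7, 13})
assert is_decomposable(scope, [frozenset({0, 2}), frozenset({7, 13})])
assert not is_decomposable(scope, [frozenset({0}), frozenset({0, 2}), frozenset({7, 13})])
assert not is_decomposable(scope, [frozenset({0, 2}), frozenset({7})])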
Example #8
def test_sum_layer_is_complete():
    # creating two scopes and two sum nodes
    scope1 = frozenset({0, 2, 3})
    scope2 = frozenset({10})
    sum_node_1 = SumNode(var_scope=scope1)
    sum_node_2 = SumNode(var_scope=scope2)

    # adding product nodes as children to the first, indicator nodes to the second
    for i in range(4):
        sum_node_1.add_child(ProductNode(var_scope=scope1), 1.0)
        sum_node_2.add_child(CategoricalIndicatorNode(var=10, var_val=i), 1.0)

    # creating sum layer
    sum_layer = SumLayer(nodes=[sum_node_1, sum_node_2])

    assert sum_layer.is_complete()

    # now with errors in scope
    scope3 = frozenset({6})
    sum_node_1 = SumNode(var_scope=scope1)
    sum_node_2 = SumNode(var_scope=scope3)

    # adding product nodes as children to the first, indicator nodes to the second
    for i in range(4):
        sum_node_1.add_child(ProductNode(var_scope=scope1), 1.0)
        sum_node_2.add_child(CategoricalIndicatorNode(var=10, var_val=i), 1.0)

    # creating sum layer
    sum_layer = SumLayer(nodes=[sum_node_1, sum_node_2])

    assert not sum_layer.is_complete()

    sum_node_2.var_scope = scope2

    assert sum_layer.is_complete()

    sum_node_2.children[3].var_scope = scope3

    assert not sum_layer.is_complete()
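Completeness, checked above for a whole layer, means that every child of a sum node ranges over exactly the same scope as the node itself (for indicator children, the scope is the single variable they test). A short illustrative version of the per-node check over frozensets, separate from the library's own code:

def is_complete(node_scope, child_scopes):
    # every child must have exactly the same scope as the sum node
    return all(scope == node_scope for scope in child_scopes)

scope1 = frozenset({0, 2, 3})
assert is_complete(scope1, [scope1, scope1, scope1])
assert not is_complete(scope1, [scope1, frozenset({6})])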
Example #9
def linked_categorical_input_to_indicators(spn, input_layer=None):
    """
    Converts a linked SPN categorical input layer into an indicator one
    """

    #
    # get child, parent relations for node relinking
    child_assoc = retrieve_children_parent_assoc(spn)

    #
    # get input layer
    cat_input_layer = spn.input_layer()
    assert isinstance(cat_input_layer, CategoricalSmoothedLayerLinked)

    #
    # one indicator node for each var value
    vars = cat_input_layer.vars()
    if not vars:
        vars = list(sorted({node.var for node in cat_input_layer.nodes()}))

    feature_values = cat_input_layer.feature_vals()
    # print('vars', vars)
    # print('feature values', feature_values)

    indicator_nodes = [
        CategoricalIndicatorNode(var, val) for i, var in enumerate(vars)
        for val in range(feature_values[i])
    ]
    # for node in indicator_nodes:
    #     print(node)

    indicator_map = defaultdict(set)
    for ind_node in indicator_nodes:
        indicator_map[ind_node.var].add(ind_node)

    sum_nodes = []
    #
    # as many sum nodes as cat nodes
    for node in cat_input_layer.nodes():

        sum_node = SumNode(var_scope=frozenset([node.var]))
        sum_nodes.append(sum_node)

        for ind_node in sorted(indicator_map[node.var],
                               key=lambda x: x.var_val):
            sum_node.add_child(ind_node,
                               numpy.exp(node._var_probs[ind_node.var_val]))

        #
        # removing links to parents
        parents = child_assoc[node]
        for p_node in parents:
            #
            # assume it to be a product node
            # TODO: generalize
            assert isinstance(p_node, ProductNode)
            p_node.children.remove(node)
            p_node.add_child(sum_node)

    #
    # creating layer
    sum_layer = SumLayerLinked(sum_nodes)

    indicator_layer = CategoricalIndicatorLayerLinked(indicator_nodes)

    cat_input_layer.disconnect_layer()
    spn.set_input_layer(indicator_layer)
    spn.insert_layer(sum_layer, 0)

    return spn
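Each categorical leaf is replaced by a small sum node whose children are the indicator nodes of that variable and whose weights are the leaf's probabilities, recovered with numpy.exp from the stored log probabilities. The numeric sketch below illustrates why the two representations agree on complete evidence; the arrays are made-up example values.

import numpy

# assumed log-probabilities of a categorical leaf over a 3-valued variable
log_probs = numpy.log(numpy.array([0.2, 0.5, 0.3]))

# observing value 1, the categorical leaf returns log P(X = 1) ...
cat_val = log_probs[1]

# ... while the replacement sum node returns log(sum_v P(X = v) * [X = v]),
# since only the matching indicator child evaluates to 1
indicators = numpy.array([0.0, 1.0, 0.0])
sum_val = numpy.log(numpy.sum(numpy.exp(log_probs) * indicators))

assert numpy.isclose(cat_val, sum_val)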
Example #10
def build_spn_layers(input_layer):

    # this is ugly... TODO: try to beautify this process
    ind1 = input_layer._nodes[0]
    ind2 = input_layer._nodes[1]
    ind3 = input_layer._nodes[2]
    ind4 = input_layer._nodes[3]
    ind5 = input_layer._nodes[4]
    ind6 = input_layer._nodes[5]
    ind7 = input_layer._nodes[6]
    ind8 = input_layer._nodes[7]

    # creating sum nodes
    sum_node1 = SumNode()
    sum_node2 = SumNode()
    sum_node3 = SumNode()
    sum_node4 = SumNode()

    # linking them with nodes
    sum_node1.add_child(ind1, 0.5)
    sum_node1.add_child(ind2, 0.5)
    sum_node2.add_child(ind3, 0.1)
    sum_node2.add_child(ind4, 0.9)
    sum_node3.add_child(ind5, 0.3)
    sum_node3.add_child(ind6, 0.7)
    sum_node4.add_child(ind7, 0.6)
    sum_node4.add_child(ind8, 0.4)

    # creating sumlayer
    sum_layer = SumLayer([sum_node1,
                          sum_node2,
                          sum_node3,
                          sum_node4])

    # creating product nodes
    prod_node1 = ProductNode()
    prod_node2 = ProductNode()
    prod_node3 = ProductNode()

    # linking them to sum nodes
    prod_node1.add_child(sum_node1)
    prod_node1.add_child(sum_node2)
    prod_node2.add_child(sum_node2)
    prod_node2.add_child(sum_node3)
    prod_node3.add_child(sum_node3)
    prod_node3.add_child(sum_node4)

    # creating a product layer
    prod_layer = ProductLayer([prod_node1,
                               prod_node2,
                               prod_node3])

    return sum_layer, prod_layer
Example #11
def test_sum_layer_create_and_eval():
    # creating generic nodes
    node1 = Node()
    node2 = Node()
    node3 = Node()

    # whose values are
    val1 = 1.
    val2 = 1.
    val3 = 0.
    node1.set_val(val1)
    node2.set_val(val2)
    node3.set_val(val3)

    # setting weights
    weight11 = 0.2
    weight12 = 0.3
    weight13 = 0.5

    weight21 = 0.3
    weight22 = 0.7

    weight32 = 0.4
    weight33 = 0.6

    # creating sum nodes
    sum1 = SumNode()
    sum2 = SumNode()
    sum3 = SumNode()

    # adding children
    sum1.add_child(node1, weight11)
    sum1.add_child(node2, weight12)
    sum1.add_child(node3, weight13)

    sum2.add_child(node1, weight21)
    sum2.add_child(node2, weight22)

    sum3.add_child(node2, weight32)
    sum3.add_child(node3, weight33)

    # adding to layer
    sum_layer = SumLayer([sum1, sum2, sum3])

    # evaluation
    sum_layer.eval()

    # computing log values 'by hand'
    layer_evals = sum_layer.node_values()
    print('Layer eval nodes')
    print(layer_evals)

    logval1 = log(weight11 * val1 +
                  weight12 * val2 +
                  weight13 * val3)
    logval2 = log(weight21 * val1 +
                  weight22 * val2)
    logval3 = log(weight32 * val2 +
                  weight33 * val3)
    logvals = [logval1, logval2, logval3]

    print('log vals')
    print(logvals)
    # checking for correctness
    for logval, eval in zip(logvals, layer_evals):
        assert_almost_equal(logval, eval, PRECISION)
Example #12
def test_layered_linked_spn():
    # creating single nodes
    # this code is replicated TODO: make a function
    root = SumNode()

    prod1 = ProductNode()
    prod2 = ProductNode()
    prod3 = ProductNode()

    sum1 = SumNode()
    sum2 = SumNode()
    sum3 = SumNode()
    sum4 = SumNode()

    ind1 = CategoricalIndicatorNode(var=0, var_val=0)
    ind2 = CategoricalIndicatorNode(var=0, var_val=1)
    ind3 = CategoricalIndicatorNode(var=1, var_val=0)
    ind4 = CategoricalIndicatorNode(var=1, var_val=1)
    ind5 = CategoricalIndicatorNode(var=2, var_val=0)
    ind6 = CategoricalIndicatorNode(var=2, var_val=1)
    ind7 = CategoricalIndicatorNode(var=2, var_val=2)
    ind8 = CategoricalIndicatorNode(var=3, var_val=0)
    ind9 = CategoricalIndicatorNode(var=3, var_val=1)
    ind10 = CategoricalIndicatorNode(var=3, var_val=2)
    ind11 = CategoricalIndicatorNode(var=3, var_val=3)

    prod4 = ProductNode()
    prod5 = ProductNode()
    prod6 = ProductNode()
    prod7 = ProductNode()

    # linking nodes
    root.add_child(prod1, 0.3)
    root.add_child(prod2, 0.3)
    root.add_child(prod3, 0.4)

    prod1.add_child(sum1)
    prod1.add_child(sum2)
    prod2.add_child(ind7)
    prod2.add_child(ind8)
    prod2.add_child(ind11)
    prod3.add_child(sum3)
    prod3.add_child(sum4)

    sum1.add_child(ind1, 0.3)
    sum1.add_child(ind2, 0.3)
    sum1.add_child(prod4, 0.4)

    sum2.add_child(ind2, 0.5)
    sum2.add_child(prod4, 0.2)
    sum2.add_child(prod5, 0.3)

    sum3.add_child(prod6, 0.5)
    sum3.add_child(prod7, 0.5)
    sum4.add_child(prod6, 0.5)
    sum4.add_child(prod7, 0.5)

    prod4.add_child(ind3)
    prod4.add_child(ind4)
    prod5.add_child(ind5)
    prod5.add_child(ind6)
    prod6.add_child(ind9)
    prod6.add_child(ind10)
    prod7.add_child(ind9)
    prod7.add_child(ind10)

    spn = SpnFactory.layered_linked_spn(root)

    print(spn)
    print(spn.stats())
Example #13
def test_spn_mpe_eval_and_traversal():
    # create initial layer
    node1 = Node()
    node2 = Node()
    node3 = Node()
    node4 = Node()
    node5 = Node()

    input_layer = CategoricalInputLayer([node1, node2,
                                         node3, node4,
                                         node5])

    # top layer made by 3 sum nodes
    sum1 = SumNode()
    sum2 = SumNode()
    sum3 = SumNode()

    # linking to input nodes
    weight11 = 0.3
    sum1.add_child(node1, weight11)
    weight12 = 0.3
    sum1.add_child(node2, weight12)
    weight13 = 0.4
    sum1.add_child(node3, weight13)

    weight22 = 0.15
    sum2.add_child(node2, weight22)
    weight23 = 0.15
    sum2.add_child(node3, weight23)
    weight24 = 0.7
    sum2.add_child(node4, weight24)

    weight33 = 0.4
    sum3.add_child(node3, weight33)
    weight34 = 0.25
    sum3.add_child(node4, weight34)
    weight35 = 0.35
    sum3.add_child(node5, weight35)

    sum_layer = SumLayer([sum1, sum2, sum3])

    # another layer with two product nodes
    prod1 = ProductNode()
    prod2 = ProductNode()

    prod1.add_child(sum1)
    prod1.add_child(sum2)
    prod2.add_child(sum2)
    prod2.add_child(sum3)

    prod_layer = ProductLayer([prod1, prod2])

    # root layer, double sum
    root1 = SumNode()
    root2 = SumNode()

    weightr11 = 0.5
    root1.add_child(prod1, weightr11)
    weightr12 = 0.5
    root1.add_child(prod2, weightr12)

    weightr21 = 0.9
    root2.add_child(prod1, weightr21)
    weightr22 = 0.1
    root2.add_child(prod2, weightr22)

    root_layer = SumLayer([root1, root2])

    # create the spn
    spn = Spn(input_layer=input_layer,
              layers=[sum_layer, prod_layer, root_layer])

    print('===================')
    print(spn)
    print('===================')

    # setting the input values
    val1 = 0.0
    node1.set_val(val1)
    val2 = 0.5
    node2.set_val(val2)
    val3 = 0.3
    node3.set_val(val3)
    val4 = 1.0
    node4.set_val(val4)
    val5 = 0.0
    node5.set_val(val5)

    # evaluating the spn with MPE inference
    res = spn.test_mpe_eval()
    print('spn eval\'d', res)

    # testing it
    #
    # testing the max layer
    max1 = max(val1 * weight11,
               val2 * weight12,
               val3 * weight13)
    max2 = max(val2 * weight22,
               val3 * weight23,
               val4 * weight24)
    max3 = max(val3 * weight33,
               val4 * weight34,
               val5 * weight35)
    log_max1 = log(max1) if not numpy.isclose(max1, 0) else LOG_ZERO
    log_max2 = log(max2) if not numpy.isclose(max2, 0) else LOG_ZERO
    log_max3 = log(max3) if not numpy.isclose(max3, 0) else LOG_ZERO

    print('expected max vals {0}, {1}, {2}'.format(log_max1,
                                                   log_max2,
                                                   log_max3))
    print('found    max vals {0}, {1}, {2}'.format(sum1.log_val,
                                                   sum2.log_val,
                                                   sum3.log_val))
    if IS_LOG_ZERO(log_max1):
        assert IS_LOG_ZERO(sum1.log_val)
    else:
        assert_almost_equal(log_max1, sum1.log_val)
    if IS_LOG_ZERO(log_max2):
        assert IS_LOG_ZERO(sum2.log_val)
    else:
        assert_almost_equal(log_max2, sum2.log_val)
    if IS_LOG_ZERO(log_max3):
        assert IS_LOG_ZERO(sum3.log_val)
    else:
        assert_almost_equal(log_max3, sum3.log_val)

    # the product layer is assumed to be fine, but let's check
    # it anyway
    prod_val1 = max1 * max2
    prod_val2 = max2 * max3
    prod_log_val1 = log_max1 + log_max2
    prod_log_val2 = log_max2 + log_max3

    print('exp prod vals {0}, {1}'.format(prod_log_val1,
                                          prod_log_val2))
    print('rea prod vals {0}, {1}'.format(prod1.log_val,
                                          prod2.log_val))
    if IS_LOG_ZERO(prod_log_val1):
        assert IS_LOG_ZERO(prod1.log_val)
    else:
        assert_almost_equal(prod_log_val1, prod1.log_val)

    if IS_LOG_ZERO(prod_log_val2):
        assert IS_LOG_ZERO(prod2.log_val)
    else:
        assert_almost_equal(prod_log_val2, prod2.log_val)

    # root layer, again a sum layer
    root_val1 = max(prod_val1 * weightr11,
                    prod_val2 * weightr12)
    root_val2 = max(prod_val1 * weightr21,
                    prod_val2 * weightr22)
    root_log_val1 = log(root_val1) if not numpy.isclose(
        root_val1, 0) else LOG_ZERO
    root_log_val2 = log(root_val2) if not numpy.isclose(
        root_val2, 0) else LOG_ZERO

    print('exp root vals {0}, {1}'.format(root_log_val1,
                                          root_log_val2))
    print('found ro vals {0}, {1}'.format(root1.log_val,
                                          root2.log_val))

    if IS_LOG_ZERO(root_log_val1):
        assert IS_LOG_ZERO(root1.log_val)
    else:
        assert_almost_equal(root_log_val1, root1.log_val)
    if IS_LOG_ZERO(root_log_val2):
        assert IS_LOG_ZERO(root2.log_val)
    else:
        assert_almost_equal(root_log_val2, root2.log_val)

    # now we are traversing top down the net
    print('mpe traversing')
    for i, j, k in spn.mpe_traversal():
        print(i, j, k)
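The hand-computed expectations above follow the MPE rule: under MPE inference a sum node takes the maximum of its weighted child values rather than their sum (product nodes are unchanged), and the traversal then descends along the maximizing branches. A tiny self-contained sketch of the sum-node rule (the function name is illustrative, not the library API):

from math import log

def mpe_sum_log_val(child_vals, weights):
    # max of weighted child values, assuming at least one is positive
    return log(max(w * v for w, v in zip(weights, child_vals)))

# mirrors max1 above: children (0.0, 0.5, 0.3) with weights (0.3, 0.3, 0.4)
assert abs(mpe_sum_log_val([0.0, 0.5, 0.3], [0.3, 0.3, 0.4]) - log(0.15)) < 1e-12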
Example #14
def test_linked_to_theano_categorical():
    vars = [2, 2, 3, 4]
    freqs = [{'var': 0, 'freqs': [1, 2]},
             {'var': 1, 'freqs': [2, 2]},
             {'var': 0, 'freqs': [3, 2]},
             {'var': 1, 'freqs': [0, 3]},
             {'var': 2, 'freqs': [1, 0, 2]},
             {'var': 3, 'freqs': [1, 2, 1, 2]},
             {'var': 3, 'freqs': [3, 4, 0, 1]}]

    # create input layer first
    input_layer = CategoricalSmoothedLayer(vars=vars,
                                           node_dicts=freqs)
    # get nodes
    ind_nodes = [node for node in input_layer.nodes()]

    root_node = ProductNode()

    sum1 = SumNode()
    sum2 = SumNode()

    prod1 = ProductNode()
    prod2 = ProductNode()

    sum3 = SumNode()
    sum4 = SumNode()

    # linking
    root_node.add_child(sum1)
    root_node.add_child(sum2)
    root_node.add_child(ind_nodes[0])
    root_node.add_child(ind_nodes[1])

    sum1.add_child(ind_nodes[2], 0.4)
    sum1.add_child(ind_nodes[3], 0.6)
    sum2.add_child(ind_nodes[3], 0.2)
    sum2.add_child(prod1, 0.5)
    sum2.add_child(prod2, 0.3)

    prod1.add_child(ind_nodes[4])
    prod1.add_child(sum3)
    prod1.add_child(sum4)
    prod2.add_child(sum3)
    prod2.add_child(sum4)

    sum3.add_child(ind_nodes[5], 0.5)
    sum3.add_child(ind_nodes[6], 0.5)
    sum4.add_child(ind_nodes[5], 0.4)
    sum4.add_child(ind_nodes[6], 0.6)

    # creating layers
    root_layer = ProductLayerLinked([root_node])
    sum_layer = SumLayerLinked([sum1, sum2])
    prod_layer = ProductLayerLinked([prod1, prod2])
    sum_layer2 = SumLayerLinked([sum3, sum4])

    # create the linked spn
    spn_linked = SpnLinked(input_layer=input_layer,
                           layers=[sum_layer2, prod_layer,
                                   sum_layer, root_layer])

    print(spn_linked)

    # converting to theano repr
    spn_theano = SpnFactory.linked_to_theano(spn_linked)
    print(spn_theano)

    # time for some inference comparison
    for instance in I:
        print('linked')
        res_l = spn_linked.eval(instance)
        print(res_l)
        print('theano')
        res_t = spn_theano.eval(instance)
        print(res_t)
        assert_array_almost_equal(res_l, res_t)
Example #15
def test_linked_to_theano_indicator():
    # creating single nodes
    root = SumNode()

    prod1 = ProductNode()
    prod2 = ProductNode()
    prod3 = ProductNode()

    sum1 = SumNode()
    sum2 = SumNode()
    sum3 = SumNode()
    sum4 = SumNode()

    ind1 = CategoricalIndicatorNode(var=0, var_val=0)
    ind2 = CategoricalIndicatorNode(var=0, var_val=1)
    ind3 = CategoricalIndicatorNode(var=1, var_val=0)
    ind4 = CategoricalIndicatorNode(var=1, var_val=1)
    ind5 = CategoricalIndicatorNode(var=2, var_val=0)
    ind6 = CategoricalIndicatorNode(var=2, var_val=1)
    ind7 = CategoricalIndicatorNode(var=2, var_val=2)
    ind8 = CategoricalIndicatorNode(var=3, var_val=0)
    ind9 = CategoricalIndicatorNode(var=3, var_val=1)
    ind10 = CategoricalIndicatorNode(var=3, var_val=2)
    ind11 = CategoricalIndicatorNode(var=3, var_val=3)

    prod4 = ProductNode()
    prod5 = ProductNode()
    prod6 = ProductNode()
    prod7 = ProductNode()

    # linking nodes
    root.add_child(prod1, 0.3)
    root.add_child(prod2, 0.3)
    root.add_child(prod3, 0.4)

    prod1.add_child(sum1)
    prod1.add_child(sum2)
    prod2.add_child(ind7)
    prod2.add_child(ind8)
    prod2.add_child(ind11)
    prod3.add_child(sum3)
    prod3.add_child(sum4)

    sum1.add_child(ind1, 0.3)
    sum1.add_child(ind2, 0.3)
    sum1.add_child(prod4, 0.4)

    sum2.add_child(ind2, 0.5)
    sum2.add_child(prod4, 0.2)
    sum2.add_child(prod5, 0.3)

    sum3.add_child(prod6, 0.5)
    sum3.add_child(prod7, 0.5)
    sum4.add_child(prod6, 0.5)
    sum4.add_child(prod7, 0.5)

    prod4.add_child(ind3)
    prod4.add_child(ind4)
    prod5.add_child(ind5)
    prod5.add_child(ind6)
    prod6.add_child(ind9)
    prod6.add_child(ind10)
    prod7.add_child(ind9)
    prod7.add_child(ind10)

    # building layers from nodes
    root_layer = SumLayerLinked([root])
    prod_layer = ProductLayerLinked([prod1, prod2, prod3])
    sum_layer = SumLayerLinked([sum1, sum2, sum3, sum4])
    aprod_layer = ProductLayerLinked([prod4, prod5, prod6, prod7])
    ind_layer = CategoricalIndicatorLayer(nodes=[ind1, ind2,
                                                 ind3, ind4,
                                                 ind5, ind6,
                                                 ind7, ind8,
                                                 ind9, ind10,
                                                 ind11])

    # creating the linked spn
    spn_linked = SpnLinked(input_layer=ind_layer,
                           layers=[aprod_layer,
                                   sum_layer,
                                   prod_layer,
                                   root_layer])

    print(spn_linked)

    # converting to theano repr
    spn_theano = SpnFactory.linked_to_theano(spn_linked)
    print(spn_theano)

    # time for some inference comparison
    for instance in I:
        print('linked')
        res_l = spn_linked.eval(instance)
        print(res_l)
        print('theano')
        res_t = spn_theano.eval(instance)
        print(res_t)
        assert_array_almost_equal(res_l, res_t)
Example #16
    def fit_structure(self, data, feature_sizes):
        """
        data is a numpy array of size {n_instances X n_features}
        feature_sizes is an array of integers representing feature ranges
        """

        #
        # resetting the data slice ids (just in case)
        DataSlice.reset_id_counter()

        tot_n_instances = data.shape[0]
        tot_n_features = data.shape[1]

        logging.info('Learning SPN structure on a (%d X %d) dataset',
                     tot_n_instances, tot_n_features)
        learn_start_t = perf_counter()

        #
        # a queue containing the data slices to process
        slices_to_process = deque()

        # a stack for building nodes
        building_stack = deque()

        # a dict to keep track of id->nodes
        node_id_assoc = {}

        # creating the first slice
        whole_slice = DataSlice.whole_slice(tot_n_instances, tot_n_features)
        slices_to_process.append(whole_slice)

        first_run = True

        #
        # iteratively process & split slices
        #
        while slices_to_process:

            # process a slice
            current_slice = slices_to_process.popleft()

            # pointers to the current data slice
            current_instances = current_slice.instance_ids
            current_features = current_slice.feature_ids
            current_id = current_slice.id

            n_instances = len(current_instances)
            n_features = len(current_features)

            logging.info('\n*** Processing slice %d (%d X %d)', current_id,
                         n_instances, n_features)
            logging.debug('\tinstances:%s\n\tfeatures:%s', current_instances,
                          current_features)

            #
            # is this a leaf node or can we split?
            if n_features == 1:
                logging.info('---> Adding a leaf (just one feature)')

                (feature_id, ) = current_features
                feature_size = feature_sizes[feature_id]

                # slicing from the original dataset
                slice_data_rows = data[current_instances, :]
                current_slice_data = slice_data_rows[:, current_features]

                # create the node
                leaf_node = CategoricalSmoothedNode(
                    var=feature_id,
                    var_values=feature_size,
                    data=current_slice_data,
                    instances=current_instances,
                    alpha=self._alpha)
                # print('lnvf', leaf_node._var_freqs)
                # storing links
                # input_nodes.append(leaf_node)
                leaf_node.id = current_id
                node_id_assoc[current_id] = leaf_node

                logging.debug('\tCreated Smooth Node %s', leaf_node)

            elif (n_instances <= self._min_instances_slice and n_features > 1):
                #
                # splitting the slice on each feature
                logging.info('---> Few instances (%d), decompose all features',
                             n_instances)
                #
                # shall we put a cltree here or
                if self._cltree_leaves:
                    logging.info('into a Chow-Liu tree')
                    #
                    # slicing data
                    slice_data_rows = data[current_instances, :]
                    current_slice_data = slice_data_rows[:, current_features]

                    current_feature_sizes = [
                        feature_sizes[i] for i in current_features
                    ]
                    #
                    # creating a Chow-Liu tree as leaf
                    leaf_node = CLTreeNode(vars=current_features,
                                           var_values=current_feature_sizes,
                                           data=current_slice_data,
                                           alpha=self._alpha)
                    #
                    # storing links
                    leaf_node.id = current_id
                    node_id_assoc[current_id] = leaf_node

                    logging.debug('\tCreated Chow-Liu Tree Node %s', leaf_node)

                elif self._kde and n_instances > 1:
                    estimate_kernel_density_spn(current_slice, feature_sizes,
                                                data, self._alpha,
                                                node_id_assoc, building_stack,
                                                slices_to_process)

                # elif n_instances == 1:  # FIXME: there is a bug here
                else:
                    current_slice, slices_to_process, building_stack, node_id_assoc = \
                        self.make_naive_factorization(current_slice,
                                                      slices_to_process,
                                                      building_stack,
                                                      node_id_assoc)
            else:

                #
                # slicing from the original dataset
                slice_data_rows = data[current_instances, :]
                current_slice_data = slice_data_rows[:, current_features]

                split_on_features = False
                #
                # first run is a split on rows
                if first_run:
                    logging.info('-- FIRST RUN --')
                    first_run = False
                else:
                    #
                    # try clustering on cols
                    # logging.debug('...trying to split on columns')
                    split_start_t = perf_counter()
                    print(data.shape)
                    dependent_features, other_features = greedy_feature_split(
                        data, current_slice, feature_sizes, self._g_factor,
                        self._rand_gen)
                    split_end_t = perf_counter()
                    logging.info('...tried to split on columns in {}'.format(
                        split_end_t - split_start_t))
                    if len(other_features) > 0:
                        split_on_features = True
                #
                # have dependent components been found?
                if split_on_features:
                    #
                    # splitting on columns
                    logging.info(
                        '---> Splitting on features' +
                        ' {} -> ({}, {})'.format(len(current_features),
                                                 len(dependent_features),
                                                 len(other_features)))

                    #
                    # creating two new data slices and putting them on queue
                    first_slice = DataSlice(current_instances,
                                            dependent_features)
                    second_slice = DataSlice(current_instances, other_features)
                    slices_to_process.append(first_slice)
                    slices_to_process.append(second_slice)

                    children_ids = [first_slice.id, second_slice.id]

                    #
                    # storing link parent children
                    current_slice.type = ProductNode
                    building_stack.append(current_slice)
                    current_slice.add_child(first_slice)
                    current_slice.add_child(second_slice)

                    #
                    # creating product node
                    prod_node = ProductNode(
                        var_scope=frozenset(current_features))
                    prod_node.id = current_id
                    node_id_assoc[current_id] = prod_node
                    logging.debug('\tCreated Prod Node %s (with children %s)',
                                  prod_node, children_ids)

                else:
                    #
                    # clustering on rows
                    logging.info('---> Splitting on rows')

                    #
                    # at most n_rows clusters, for sklearn
                    k_row_clusters = min(self._n_cluster_splits,
                                         n_instances - 1)

                    clustering = cluster_rows(
                        data,
                        current_slice,
                        n_clusters=k_row_clusters,
                        cluster_method=self._row_cluster_method,
                        n_iters=self._n_iters,
                        n_restarts=self._n_restarts,
                        cluster_penalty=self._cluster_penalty,
                        rand_gen=self._rand_gen,
                        sklearn_args=self._sklearn_args)

                    if len(clustering) < 2:
                        logging.info('\n\n\nLess than 2 clusters\n\n (%d)',
                                     len(clustering))

                        logging.info('forcing a naive factorization')
                        current_slice, slices_to_process, building_stack, node_id_assoc = \
                            self.make_naive_factorization(current_slice,
                                                          slices_to_process,
                                                          building_stack,
                                                          node_id_assoc)

                    else:
                        # logging.debug('obtained clustering %s', clustering)
                        logging.info('clustered into %d parts (min %d)',
                                     len(clustering), k_row_clusters)
                        # splitting
                        cluster_slices = [
                            DataSlice(cluster, current_features)
                            for cluster in clustering
                        ]
                        cluster_slices_ids = [
                            slice.id for slice in cluster_slices
                        ]

                        # cluster_prior = 5.0
                        # cluster_weights = [(slice.n_instances() + cluster_prior) /
                        #                    (n_instances + cluster_prior * len(cluster_slices))
                        #                    for slice in cluster_slices]
                        cluster_weights = [
                            slice.n_instances() / n_instances
                            for slice in cluster_slices
                        ]

                        #
                        # appending for processing
                        slices_to_process.extend(cluster_slices)

                        #
                        # storing links
                        # current_slice.children = cluster_slices_ids
                        # current_slice.weights = cluster_weights
                        current_slice.type = SumNode
                        building_stack.append(current_slice)
                        for child_slice, child_weight in zip(
                                cluster_slices, cluster_weights):
                            current_slice.add_child(child_slice, child_weight)

                        #
                        # building a sum node
                        SCOPES_DICT[frozenset(current_features)] += 1
                        sum_node = SumNode(
                            var_scope=frozenset(current_features))
                        sum_node.id = current_id
                        node_id_assoc[current_id] = sum_node
                        logging.debug(
                            '\tCreated Sum Node %s (with children %s)',
                            sum_node, cluster_slices_ids)

        learn_end_t = perf_counter()

        logging.info('\n\n\tStructure learned in %f secs',
                     (learn_end_t - learn_start_t))

        #
        # linking the spn graph (parent -> children)
        #
        logging.info('===> Building tree')

        link_start_t = perf_counter()
        root_build_node = building_stack[0]
        root_node = node_id_assoc[root_build_node.id]
        logging.debug('root node: %s', root_node)

        root_node = SpnFactory.pruned_spn_from_slices(node_id_assoc,
                                                      building_stack)
        link_end_t = perf_counter()
        logging.info('\tLinked the spn in %f secs (root_node %s)',
                     (link_end_t - link_start_t), root_node)

        #
        # building layers
        #
        logging.info('===> Layering spn')
        layer_start_t = perf_counter()
        spn = SpnFactory.layered_linked_spn(root_node)
        layer_end_t = perf_counter()
        logging.info('\tLayered the spn in %f secs',
                     (layer_end_t - layer_start_t))

        logging.info('\nLearned SPN\n\n%s', spn.stats())
        #logging.info('%s', SCOPES_DICT.most_common(30))

        return spn
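At a high level the loop above follows the LearnSPN recipe: single-feature slices become smoothed categorical leaves, slices with few instances become a Chow-Liu tree, a kernel density mixture or a naive factorization, and any other slice is split either on features (producing a product node) or on instances via row clustering (producing a sum node). The sketch below is only a compressed, illustrative restatement of those branches; the function and its parameters are hypothetical, not the library API.

def choose_split(n_instances, n_features, min_instances_slice,
                 cltree_leaves=False, kde=False, first_run=False,
                 feature_split_found=False):
    # which branch of fit_structure would handle a slice of this shape
    if n_features == 1:
        return 'categorical smoothed leaf'
    if n_instances <= min_instances_slice:
        if cltree_leaves:
            return 'Chow-Liu tree leaf'
        if kde and n_instances > 1:
            return 'kernel density mixture'
        return 'naive factorization'
    if not first_run and feature_split_found:
        return 'product node (split on features)'
    return 'sum node (split on rows by clustering)'

assert choose_split(1000, 1, 50) == 'categorical smoothed leaf'
assert choose_split(10, 4, 50, cltree_leaves=True) == 'Chow-Liu tree leaf'
assert choose_split(1000, 4, 50, feature_split_found=True) == 'product node (split on features)'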
Example #17
def test_pruned_spn_from_slices():
    #
    # creating all the data slices
    # the slicing is a fake stub
    rows = 5
    cols = 5
    var = 1
    values = 2

    node_assoc = {}
    building_stack = deque()

    slice_1 = DataSlice.whole_slice(rows, cols)
    slice_1.type = SumNode
    node_1 = SumNode()
    node_1.id = slice_1.id
    node_assoc[node_1.id] = node_1
    building_stack.append(slice_1)

    slice_2 = DataSlice.whole_slice(rows, cols)
    slice_2.type = ProductNode
    node_2 = ProductNode()
    node_2.id = slice_2.id
    node_assoc[node_2.id] = node_2
    building_stack.append(slice_2)

    slice_3 = DataSlice.whole_slice(rows, cols)
    slice_3.type = SumNode
    node_3 = SumNode()
    node_3.id = slice_3.id
    node_assoc[node_3.id] = node_3
    building_stack.append(slice_3)

    # adding first level
    slice_1.add_child(slice_2, 0.8)
    slice_1.add_child(slice_3, 0.2)

    slice_4 = DataSlice.whole_slice(rows, cols)
    slice_4.type = ProductNode
    node_4 = ProductNode()
    node_4.id = slice_4.id
    node_assoc[node_4.id] = node_4
    building_stack.append(slice_4)

    leaf_5 = CategoricalSmoothedNode(var, values)
    slice_5 = DataSlice.whole_slice(rows, cols)
    leaf_5.id = slice_5.id
    node_assoc[leaf_5.id] = leaf_5
    # not adding the slice to the stack

    slice_2.add_child(slice_4)
    slice_2.add_child(slice_5)

    slice_6 = DataSlice.whole_slice(rows, cols)
    slice_6.type = SumNode
    node_6 = SumNode()
    node_6.id = slice_6.id
    node_assoc[node_6.id] = node_6
    building_stack.append(slice_6)

    slice_7 = DataSlice.whole_slice(rows, cols)
    slice_7.type = SumNode
    node_7 = SumNode()
    node_7.id = slice_7.id
    node_assoc[node_7.id] = node_7
    building_stack.append(slice_7)

    slice_3.add_child(slice_6, 0.4)
    slice_3.add_child(slice_7, 0.6)

    slice_8 = DataSlice.whole_slice(rows, cols)
    slice_8.type = ProductNode
    node_8 = ProductNode()
    node_8.id = slice_8.id
    node_assoc[node_8.id] = node_8
    building_stack.append(slice_8)

    leaf_15 = CategoricalSmoothedNode(var, values)
    slice_15 = DataSlice.whole_slice(rows, cols)
    leaf_15.id = slice_15.id
    node_assoc[leaf_15.id] = leaf_15

    slice_4.add_child(slice_8)
    slice_4.add_child(slice_15)

    leaf_13 = CategoricalSmoothedNode(var, values)
    slice_13 = DataSlice.whole_slice(rows, cols)
    leaf_13.id = slice_13.id
    node_assoc[leaf_13.id] = leaf_13

    leaf_14 = CategoricalSmoothedNode(var, values)
    slice_14 = DataSlice.whole_slice(rows, cols)
    leaf_14.id = slice_14.id
    node_assoc[leaf_14.id] = leaf_14

    slice_8.add_child(slice_13)
    slice_8.add_child(slice_14)

    slice_9 = DataSlice.whole_slice(rows, cols)
    slice_9.type = ProductNode
    node_9 = ProductNode()
    node_9.id = slice_9.id
    node_assoc[node_9.id] = node_9
    building_stack.append(slice_9)

    leaf_16 = CategoricalSmoothedNode(var, values)
    slice_16 = DataSlice.whole_slice(rows, cols)
    leaf_16.id = slice_16.id
    node_assoc[leaf_16.id] = leaf_16

    leaf_17 = CategoricalSmoothedNode(var, values)
    slice_17 = DataSlice.whole_slice(rows, cols)
    leaf_17.id = slice_17.id
    node_assoc[leaf_17.id] = leaf_17

    slice_9.add_child(slice_16)
    slice_9.add_child(slice_17)

    slice_10 = DataSlice.whole_slice(rows, cols)
    slice_10.type = ProductNode
    node_10 = ProductNode()
    node_10.id = slice_10.id
    node_assoc[node_10.id] = node_10
    building_stack.append(slice_10)

    leaf_18 = CategoricalSmoothedNode(var, values)
    slice_18 = DataSlice.whole_slice(rows, cols)
    leaf_18.id = slice_18.id
    node_assoc[leaf_18.id] = leaf_18

    leaf_19 = CategoricalSmoothedNode(var, values)
    slice_19 = DataSlice.whole_slice(rows, cols)
    leaf_19.id = slice_19.id
    node_assoc[leaf_19.id] = leaf_19

    slice_10.add_child(slice_18)
    slice_10.add_child(slice_19)

    slice_6.add_child(slice_9, 0.1)
    slice_6.add_child(slice_10, 0.9)

    slice_11 = DataSlice.whole_slice(rows, cols)
    slice_11.type = ProductNode
    node_11 = ProductNode()
    node_11.id = slice_11.id
    node_assoc[node_11.id] = node_11
    building_stack.append(slice_11)

    leaf_20 = CategoricalSmoothedNode(var, values)
    slice_20 = DataSlice.whole_slice(rows, cols)
    leaf_20.id = slice_20.id
    node_assoc[leaf_20.id] = leaf_20

    leaf_21 = CategoricalSmoothedNode(var, values)
    slice_21 = DataSlice.whole_slice(rows, cols)
    leaf_21.id = slice_21.id
    node_assoc[leaf_21.id] = leaf_21

    slice_11.add_child(slice_20)
    slice_11.add_child(slice_21)

    slice_12 = DataSlice.whole_slice(rows, cols)
    slice_12.type = ProductNode
    node_12 = ProductNode()
    node_12.id = slice_12.id
    node_assoc[node_12.id] = node_12
    building_stack.append(slice_12)

    leaf_22 = CategoricalSmoothedNode(var, values)
    slice_22 = DataSlice.whole_slice(rows, cols)
    leaf_22.id = slice_22.id
    node_assoc[leaf_22.id] = leaf_22

    leaf_23 = CategoricalSmoothedNode(var, values)
    slice_23 = DataSlice.whole_slice(rows, cols)
    leaf_23.id = slice_23.id
    node_assoc[leaf_23.id] = leaf_23

    slice_12.add_child(slice_22)
    slice_12.add_child(slice_23)

    slice_7.add_child(slice_11, 0.2)
    slice_7.add_child(slice_12, 0.7)

    root_node = SpnFactory.pruned_spn_from_slices(node_assoc, building_stack)

    print('ROOT NODE', root_node)

    spn = SpnFactory.layered_linked_spn(root_node)

    print('SPN', spn)

    assert spn.n_layers() == 3

    for i, layer in enumerate(spn.top_down_layers()):
        if i == 0:
            assert layer.n_nodes() == 1
        elif i == 1:
            assert layer.n_nodes() == 5
        elif i == 2:
            assert layer.n_nodes() == 12
Example #18
def build_linked_layered_spn(print_spn=True):
    #
    # building an indicator layer
    ind_x_00 = CategoricalIndicatorNode(0, 0)
    ind_x_01 = CategoricalIndicatorNode(0, 1)
    ind_x_10 = CategoricalIndicatorNode(1, 0)
    ind_x_11 = CategoricalIndicatorNode(1, 1)
    ind_x_20 = CategoricalIndicatorNode(2, 0)
    ind_x_21 = CategoricalIndicatorNode(2, 1)

    input_layer = CategoricalIndicatorLayer(
        [ind_x_00, ind_x_01, ind_x_10, ind_x_11, ind_x_20, ind_x_21])

    #
    # sum layer
    #
    sum_node_1 = SumNode(frozenset([0]))
    sum_node_1.add_child(ind_x_00, 0.1)
    sum_node_1.add_child(ind_x_01, 0.9)

    sum_node_2 = SumNode(frozenset([0]))
    sum_node_2.add_child(ind_x_00, 0.4)
    sum_node_2.add_child(ind_x_01, 0.6)

    sum_node_3 = SumNode(frozenset([1]))
    sum_node_3.add_child(ind_x_10, 0.3)
    sum_node_3.add_child(ind_x_11, 0.7)

    sum_node_4 = SumNode(frozenset([1]))
    sum_node_4.add_child(ind_x_10, 0.6)
    sum_node_4.add_child(ind_x_11, 0.4)

    sum_node_5 = SumNode(frozenset([2]))
    sum_node_5.add_child(ind_x_20, 0.5)
    sum_node_5.add_child(ind_x_21, 0.5)

    sum_node_6 = SumNode(frozenset([2]))
    sum_node_6.add_child(ind_x_20, 0.2)
    sum_node_6.add_child(ind_x_21, 0.8)

    sum_layer_1 = SumLayerLinked([
        sum_node_1, sum_node_2, sum_node_3, sum_node_4, sum_node_5, sum_node_6
    ])

    #
    # product nodes

    #
    # xy
    prod_node_7 = ProductNode(frozenset([0, 1]))
    prod_node_7.add_child(sum_node_1)
    prod_node_7.add_child(sum_node_3)

    prod_node_8 = ProductNode(frozenset([0, 1]))
    prod_node_8.add_child(sum_node_2)
    prod_node_8.add_child(sum_node_4)

    prod_node_9 = ProductNode(frozenset([0, 1]))
    prod_node_9.add_child(sum_node_1)
    prod_node_9.add_child(sum_node_3)

    #
    # yz
    prod_node_10 = ProductNode(frozenset([1, 2]))
    prod_node_10.add_child(sum_node_4)
    prod_node_10.add_child(sum_node_5)

    prod_node_11 = ProductNode(frozenset([1, 2]))
    prod_node_11.add_child(sum_node_4)
    prod_node_11.add_child(sum_node_6)

    prod_layer_2 = ProductLayerLinked(
        [prod_node_7, prod_node_8, prod_node_9, prod_node_10, prod_node_11])

    #
    # sum nodes
    #
    # xy
    sum_node_12 = SumNode(frozenset([0, 1]))
    sum_node_12.add_child(prod_node_7, 0.1)
    sum_node_12.add_child(prod_node_8, 0.9)

    sum_node_13 = SumNode(frozenset([0, 1]))
    sum_node_13.add_child(prod_node_8, 0.7)
    sum_node_13.add_child(prod_node_9, 0.3)

    #
    # yz
    sum_node_14 = SumNode(frozenset([1, 2]))
    sum_node_14.add_child(prod_node_10, 0.6)
    sum_node_14.add_child(prod_node_11, 0.4)

    sum_layer_3 = SumLayerLinked([sum_node_12, sum_node_13, sum_node_14])

    #
    # product nodes
    prod_node_15 = ProductNode(frozenset([0, 1, 2]))
    prod_node_15.add_child(sum_node_12)
    prod_node_15.add_child(sum_node_6)

    prod_node_16 = ProductNode(frozenset([0, 1, 2]))
    prod_node_16.add_child(sum_node_13)
    prod_node_16.add_child(sum_node_5)

    prod_node_17 = ProductNode(frozenset([0, 1, 2]))
    prod_node_17.add_child(sum_node_2)
    prod_node_17.add_child(sum_node_14)

    prod_layer_4 = ProductLayerLinked(
        [prod_node_15, prod_node_16, prod_node_17])

    #
    # root
    sum_node_18 = SumNode(frozenset([0, 1, 2]))
    sum_node_18.add_child(prod_node_15, 0.2)
    sum_node_18.add_child(prod_node_16, 0.2)
    sum_node_18.add_child(prod_node_17, 0.6)

    sum_layer_5 = SumLayerLinked([sum_node_18])

    #
    # creating the spn
    layers = [
        sum_layer_1, prod_layer_2, sum_layer_3, prod_layer_4, sum_layer_5
    ]
    nodes = [node for layer in layers for node in layer.nodes()]

    spn = SpnLinked(input_layer=input_layer, layers=layers)

    if print_spn:
        print(spn)

    return spn, layers, nodes
Example #19
def test_layered_linked_spn():
    # creating single nodes
    # this code is replicated TODO: make a function
    root = SumNode()

    prod1 = ProductNode()
    prod2 = ProductNode()
    prod3 = ProductNode()

    sum1 = SumNode()
    sum2 = SumNode()
    sum3 = SumNode()
    sum4 = SumNode()

    ind1 = CategoricalIndicatorNode(var=0, var_val=0)
    ind2 = CategoricalIndicatorNode(var=0, var_val=1)
    ind3 = CategoricalIndicatorNode(var=1, var_val=0)
    ind4 = CategoricalIndicatorNode(var=1, var_val=1)
    ind5 = CategoricalIndicatorNode(var=2, var_val=0)
    ind6 = CategoricalIndicatorNode(var=2, var_val=1)
    ind7 = CategoricalIndicatorNode(var=2, var_val=2)
    ind8 = CategoricalIndicatorNode(var=3, var_val=0)
    ind9 = CategoricalIndicatorNode(var=3, var_val=1)
    ind10 = CategoricalIndicatorNode(var=3, var_val=2)
    ind11 = CategoricalIndicatorNode(var=3, var_val=3)

    prod4 = ProductNode()
    prod5 = ProductNode()
    prod6 = ProductNode()
    prod7 = ProductNode()

    # linking nodes
    root.add_child(prod1, 0.3)
    root.add_child(prod2, 0.3)
    root.add_child(prod3, 0.4)

    prod1.add_child(sum1)
    prod1.add_child(sum2)
    prod2.add_child(ind7)
    prod2.add_child(ind8)
    prod2.add_child(ind11)
    prod3.add_child(sum3)
    prod3.add_child(sum4)

    sum1.add_child(ind1, 0.3)
    sum1.add_child(ind2, 0.3)
    sum1.add_child(prod4, 0.4)

    sum2.add_child(ind2, 0.5)
    sum2.add_child(prod4, 0.2)
    sum2.add_child(prod5, 0.3)

    sum3.add_child(prod6, 0.5)
    sum3.add_child(prod7, 0.5)
    sum4.add_child(prod6, 0.5)
    sum4.add_child(prod7, 0.5)

    prod4.add_child(ind3)
    prod4.add_child(ind4)
    prod5.add_child(ind5)
    prod5.add_child(ind6)
    prod6.add_child(ind9)
    prod6.add_child(ind10)
    prod7.add_child(ind9)
    prod7.add_child(ind10)

    spn = SpnFactory.layered_linked_spn(root)

    print(spn)
    print(spn.stats())
Example #20
def test_pruned_spn_from_slices():
    #
    # creating all the data slices
    # the slicing is a fake stub
    rows = 5
    cols = 5
    var = 1
    values = 2

    node_assoc = {}
    building_stack = deque()

    slice_1 = DataSlice.whole_slice(rows, cols)
    slice_1.type = SumNode
    node_1 = SumNode()
    node_1.id = slice_1.id
    node_assoc[node_1.id] = node_1
    building_stack.append(slice_1)

    slice_2 = DataSlice.whole_slice(rows, cols)
    slice_2.type = ProductNode
    node_2 = ProductNode()
    node_2.id = slice_2.id
    node_assoc[node_2.id] = node_2
    building_stack.append(slice_2)

    slice_3 = DataSlice.whole_slice(rows, cols)
    slice_3.type = SumNode
    node_3 = SumNode()
    node_3.id = slice_3.id
    node_assoc[node_3.id] = node_3
    building_stack.append(slice_3)

    # adding first level
    slice_1.add_child(slice_2, 0.8)
    slice_1.add_child(slice_3, 0.2)

    slice_4 = DataSlice.whole_slice(rows, cols)
    slice_4.type = ProductNode
    node_4 = ProductNode()
    node_4.id = slice_4.id
    node_assoc[node_4.id] = node_4
    building_stack.append(slice_4)

    leaf_5 = CategoricalSmoothedNode(var,
                                     values)
    slice_5 = DataSlice.whole_slice(rows, cols)
    leaf_5.id = slice_5.id
    node_assoc[leaf_5.id] = leaf_5
    # not adding the slice to the stack

    slice_2.add_child(slice_4)
    slice_2.add_child(slice_5)

    slice_6 = DataSlice.whole_slice(rows, cols)
    slice_6.type = SumNode
    node_6 = SumNode()
    node_6.id = slice_6.id
    node_assoc[node_6.id] = node_6
    building_stack.append(slice_6)

    slice_7 = DataSlice.whole_slice(rows, cols)
    slice_7.type = SumNode
    node_7 = SumNode()
    node_7.id = slice_7.id
    node_assoc[node_7.id] = node_7
    building_stack.append(slice_7)

    slice_3.add_child(slice_6, 0.4)
    slice_3.add_child(slice_7, 0.6)

    slice_8 = DataSlice.whole_slice(rows, cols)
    slice_8.type = ProductNode
    node_8 = ProductNode()
    node_8.id = slice_8.id
    node_assoc[node_8.id] = node_8
    building_stack.append(slice_8)

    leaf_15 = CategoricalSmoothedNode(var,
                                      values)
    slice_15 = DataSlice.whole_slice(rows, cols)
    leaf_15.id = slice_15.id
    node_assoc[leaf_15.id] = leaf_15

    slice_4.add_child(slice_8)
    slice_4.add_child(slice_15)

    leaf_13 = CategoricalSmoothedNode(var,
                                      values)
    slice_13 = DataSlice.whole_slice(rows, cols)
    leaf_13.id = slice_13.id
    node_assoc[leaf_13.id] = leaf_13

    leaf_14 = CategoricalSmoothedNode(var,
                                      values)
    slice_14 = DataSlice.whole_slice(rows, cols)
    leaf_14.id = slice_14.id
    node_assoc[leaf_14.id] = leaf_14

    slice_8.add_child(slice_13)
    slice_8.add_child(slice_14)

    slice_9 = DataSlice.whole_slice(rows, cols)
    slice_9.type = ProductNode
    node_9 = ProductNode()
    node_9.id = slice_9.id
    node_assoc[node_9.id] = node_9
    building_stack.append(slice_9)

    leaf_16 = CategoricalSmoothedNode(var,
                                      values)
    slice_16 = DataSlice.whole_slice(rows, cols)
    leaf_16.id = slice_16.id
    node_assoc[leaf_16.id] = leaf_16

    leaf_17 = CategoricalSmoothedNode(var,
                                      values)
    slice_17 = DataSlice.whole_slice(rows, cols)
    leaf_17.id = slice_17.id
    node_assoc[leaf_17.id] = leaf_17

    slice_9.add_child(slice_16)
    slice_9.add_child(slice_17)

    slice_10 = DataSlice.whole_slice(rows, cols)
    slice_10.type = ProductNode
    node_10 = ProductNode()
    node_10.id = slice_10.id
    node_assoc[node_10.id] = node_10
    building_stack.append(slice_10)

    leaf_18 = CategoricalSmoothedNode(var,
                                      values)
    slice_18 = DataSlice.whole_slice(rows, cols)
    leaf_18.id = slice_18.id
    node_assoc[leaf_18.id] = leaf_18

    leaf_19 = CategoricalSmoothedNode(var,
                                      values)
    slice_19 = DataSlice.whole_slice(rows, cols)
    leaf_19.id = slice_19.id
    node_assoc[leaf_19.id] = leaf_19

    slice_10.add_child(slice_18)
    slice_10.add_child(slice_19)

    slice_6.add_child(slice_9, 0.1)
    slice_6.add_child(slice_10, 0.9)

    slice_11 = DataSlice.whole_slice(rows, cols)
    slice_11.type = ProductNode
    node_11 = ProductNode()
    node_11.id = slice_11.id
    node_assoc[node_11.id] = node_11
    building_stack.append(slice_11)

    leaf_20 = CategoricalSmoothedNode(var,
                                      values)
    slice_20 = DataSlice.whole_slice(rows, cols)
    leaf_20.id = slice_20.id
    node_assoc[leaf_20.id] = leaf_20

    leaf_21 = CategoricalSmoothedNode(var,
                                      values)
    slice_21 = DataSlice.whole_slice(rows, cols)
    leaf_21.id = slice_21.id
    node_assoc[leaf_21.id] = leaf_21

    slice_11.add_child(slice_20)
    slice_11.add_child(slice_21)

    slice_12 = DataSlice.whole_slice(rows, cols)
    slice_12.type = ProductNode
    node_12 = ProductNode()
    node_12.id = slice_12.id
    node_assoc[node_12.id] = node_12
    building_stack.append(slice_12)

    leaf_22 = CategoricalSmoothedNode(var,
                                      values)
    slice_22 = DataSlice.whole_slice(rows, cols)
    leaf_22.id = slice_22.id
    node_assoc[leaf_22.id] = leaf_22

    leaf_23 = CategoricalSmoothedNode(var,
                                      values)
    slice_23 = DataSlice.whole_slice(rows, cols)
    leaf_23.id = slice_23.id
    node_assoc[leaf_23.id] = leaf_23

    slice_12.add_child(slice_22)
    slice_12.add_child(slice_23)

    slice_7.add_child(slice_11, 0.2)
    slice_7.add_child(slice_12, 0.7)

    root_node = SpnFactory.pruned_spn_from_slices(node_assoc,
                                                  building_stack)

    print('ROOT NODE', root_node)

    spn = SpnFactory.layered_linked_spn(root_node)

    print('SPN', spn)

    assert spn.n_layers() == 3

    for i, layer in enumerate(spn.top_down_layers()):
        if i == 0:
            assert layer.n_nodes() == 1
        elif i == 1:
            assert layer.n_nodes() == 5
        elif i == 2:
            assert layer.n_nodes() == 12
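
# Note (sketch on toy nodes, not the library's classes): the expected layer
# counts above (1 root, 5 inner nodes, 12 leaves) come from pruning away
# chains of same-type nodes: whenever a child has the same type as its
# parent, its children are lifted into the parent, so no two adjacent levels
# share a node type (weight folding for sum-into-sum is omitted here).
class ToyNode:
    def __init__(self, kind, children=None):
        self.kind = kind                         # 'sum', 'prod' or 'leaf'
        self.children = children or []

def prune(node):
    pruned_children = []
    for child in node.children:
        child = prune(child)
        if child.kind == node.kind:
            # same-type child: absorb it, lifting its children one level up
            pruned_children.extend(child.children)
        else:
            pruned_children.append(child)
    node.children = pruned_children
    return node

# sum -> sum -> leaf collapses into sum -> leaf
root = ToyNode('sum', [ToyNode('sum', [ToyNode('leaf')]), ToyNode('leaf')])
prune(root)
print([child.kind for child in root.children])   # ['leaf', 'leaf']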
Ejemplo n.º 21
0
def test_spn_backprop():
    # create initial layer
    node1 = Node()
    node2 = Node()
    node3 = Node()
    node4 = Node()
    node5 = Node()

    input_layer = CategoricalInputLayer([node1, node2,
                                         node3, node4,
                                         node5])

    # top layer made by 3 sum nodes
    sum1 = SumNode()
    sum2 = SumNode()
    sum3 = SumNode()

    # linking to input nodes
    weight11 = 0.3
    sum1.add_child(node1, weight11)
    weight12 = 0.3
    sum1.add_child(node2, weight12)
    weight13 = 0.4
    sum1.add_child(node3, weight13)

    weight22 = 0.15
    sum2.add_child(node2, weight22)
    weight23 = 0.15
    sum2.add_child(node3, weight23)
    weight24 = 0.7
    sum2.add_child(node4, weight24)

    weight33 = 0.4
    sum3.add_child(node3, weight33)
    weight34 = 0.25
    sum3.add_child(node4, weight34)
    weight35 = 0.35
    sum3.add_child(node5, weight35)

    sum_layer = SumLayer([sum1, sum2, sum3])

    # another layer with two product nodes
    prod1 = ProductNode()
    prod2 = ProductNode()

    prod1.add_child(sum1)
    prod1.add_child(sum2)
    prod2.add_child(sum2)
    prod2.add_child(sum3)

    prod_layer = ProductLayer([prod1, prod2])

    # root layer, double sum
    root1 = SumNode()
    root2 = SumNode()

    weightr11 = 0.5
    root1.add_child(prod1, weightr11)
    weightr12 = 0.5
    root1.add_child(prod2, weightr12)

    weightr21 = 0.9
    root2.add_child(prod1, weightr21)
    weightr22 = 0.1
    root2.add_child(prod2, weightr22)

    root_layer = SumLayer([root1, root2])
    # root_layer = SumLayer([root1])

    # create the spn
    spn = Spn(input_layer=input_layer,
              layers=[sum_layer, prod_layer, root_layer])

    # setting the input values
    val1 = 0.0
    node1.set_val(val1)
    val2 = 0.5
    node2.set_val(val2)
    val3 = 0.3
    node3.set_val(val3)
    val4 = 1.0
    node4.set_val(val4)
    val5 = 0.0
    node5.set_val(val5)

    # evaluating the spn
    res = spn.test_eval()
    print('spn eval\'d', res)

    # backprop
    spn.backprop()

    # computing derivatives by hand
    # topdown: root layer
    root_der = 1.0
    log_root_der = log(root_der)

    # print('root ders', root1.log_der, root2.log_der)
    print('root ders', root1.log_der)
    assert_almost_equal(log_root_der, root1.log_der)
    assert_almost_equal(log_root_der, root2.log_der)

    # product layer
    prod_der1 = (root_der * weightr11 +
                 root_der * weightr21)

    prod_der2 = (root_der * weightr12 +
                 root_der * weightr22)

    # prod_der1 = (root_der * weightr11)
    # prod_der2 = (root_der * weightr12)

    log_prod_der1 = log(prod_der1) if prod_der1 > 0.0 else LOG_ZERO
    log_prod_der2 = log(prod_der2) if prod_der2 > 0.0 else LOG_ZERO

    print('found  prod ders', prod1.log_der, prod2.log_der)
    print('expect prod ders', log_prod_der1, log_prod_der2)

    if IS_LOG_ZERO(log_prod_der1):
        assert IS_LOG_ZERO(prod1.log_der)
    else:
        assert_almost_equal(log_prod_der1, prod1.log_der)
    if IS_LOG_ZERO(log_prod_der2):
        assert IS_LOG_ZERO(prod2.log_der)
    else:
        assert_almost_equal(log_prod_der2, prod2.log_der)

    # sum layer
    sum_der1 = (
        prod_der1 * (weight22 * val2 +
                     weight23 * val3 +
                     weight24 * val4))

    log_sum_der1 = log(sum_der1) if sum_der1 > 0.0 else LOG_ZERO

    sum_der2 = (prod_der1 * (weight11 * val1 +
                             weight12 * val2 +
                             weight13 * val3) +
                prod_der2 * (weight33 * val3 +
                             weight34 * val4 +
                             weight35 * val5))

    log_sum_der2 = log(sum_der2) if sum_der2 > 0.0 else LOG_ZERO

    sum_der3 = (prod_der2 * (weight22 * val2 +
                             weight23 * val3 +
                             weight24 * val4))

    log_sum_der3 = log(sum_der3) if sum_der3 > 0.0 else LOG_ZERO

    print('expected sum ders', log_sum_der1,
          log_sum_der2,
          log_sum_der3)
    print('found    sum ders', sum1.log_der,
          sum2.log_der,
          sum3.log_der)

    if IS_LOG_ZERO(log_sum_der1):
        assert IS_LOG_ZERO(sum1.log_der)
    else:
        assert_almost_equal(log_sum_der1, sum1.log_der)
    if IS_LOG_ZERO(log_sum_der2):
        assert IS_LOG_ZERO(sum2.log_der)
    else:
        assert_almost_equal(log_sum_der2, sum2.log_der)
    if IS_LOG_ZERO(log_sum_der3):
        assert IS_LOG_ZERO(sum3.log_der)
    else:
        assert_almost_equal(log_sum_der3, sum3.log_der)

    # final level, the first one
    try:
        log_der1 = log(sum_der1 * weight11)
    except:
        log_der1 = LOG_ZERO

    try:
        log_der2 = log(sum_der1 * weight12 +
                       sum_der2 * weight22)
    except:
        log_der2 = LOG_ZERO

    try:
        log_der3 = log(sum_der1 * weight13 +
                       sum_der2 * weight23 +
                       sum_der3 * weight33)
    except:
        log_der3 = LOG_ZERO

    try:
        log_der4 = log(sum_der2 * weight24 +
                       sum_der3 * weight34)
    except:
        log_der4 = LOG_ZERO

    try:
        log_der5 = log(sum_der3 * weight35)
    except:
        log_der5 = LOG_ZERO

    # printing, just in case
    print('child log der', node1.log_der, node2.log_der,
          node3.log_der, node4.log_der, node5.log_der)
    print('exact log der', log_der1, log_der2, log_der3,
          log_der4, log_der5)

    if IS_LOG_ZERO(log_der1):
        assert IS_LOG_ZERO(node1.log_der)
    else:
        assert_almost_equal(log_der1, node1.log_der, 15)
    if IS_LOG_ZERO(log_der2):
        assert IS_LOG_ZERO(node2.log_der)
    else:
        assert_almost_equal(log_der2, node2.log_der, 15)
    if IS_LOG_ZERO(log_der3):
        assert IS_LOG_ZERO(node3.log_der)
    else:
        assert_almost_equal(log_der3, node3.log_der, 15)
    if IS_LOG_ZERO(log_der4):
        assert IS_LOG_ZERO(node4.log_der)
    else:
        assert_almost_equal(log_der4, node4.log_der, 15)
    if IS_LOG_ZERO(log_der5):
        assert IS_LOG_ZERO(node5.log_der)
    else:
        assert_almost_equal(log_der5, node5.log_der, 15)
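
# Note (sketch with toy classes in the linear domain, not the library's):
# the hand-computed derivatives above follow the usual SPN backprop rule:
# a child of a sum node receives parent_der * weight, a child of a product
# node receives parent_der times the product of its siblings' values, and
# contributions from several parents add up before taking the log.
class Leaf:
    def __init__(self, val):
        self.val, self.der = val, 0.0

class Prod:
    def __init__(self, children):
        self.children, self.der = children, 0.0

    def backprop(self):
        # each child gets the parent derivative times its siblings' product
        for child in self.children:
            sibling_prod = 1.0
            for other in self.children:
                if other is not child:
                    sibling_prod *= other.val
            child.der += self.der * sibling_prod

class Sum:
    def __init__(self, children, weights):
        self.children, self.weights, self.der = children, weights, 0.0

    def backprop(self):
        # each child gets the parent derivative times its weight
        for child, weight in zip(self.children, self.weights):
            child.der += self.der * weight

leaf1, leaf2, leaf3 = Leaf(0.5), Leaf(0.3), Leaf(1.0)
prod = Prod([leaf1, leaf2])
root = Sum([prod, leaf3], [0.6, 0.4])
root.der = 1.0                        # seed the root derivative
root.backprop()
prod.backprop()
print(prod.der, leaf1.der, leaf2.der, leaf3.der)
# -> 0.6, 0.6 * 0.3, 0.6 * 0.5, 0.4 (up to float rounding)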
Ejemplo n.º 22
0
def test_layered_pruned_linked_spn_cltree():
    #
    # creating all the data slices
    # the slicing is a fake stub
    rows = 5
    cols = 5
    var = 1
    values = 2

    vars = [2, 3]
    var_values = [2, 2]
    s_data = numpy.array([[0, 1], [1, 1], [1, 0], [0, 0]])

    node_1 = SumNode()
    node_1.id = 1

    node_2 = ProductNode()
    node_2.id = 2

    node_3 = SumNode()
    node_3.id = 3

    # adding first level
    weight_12 = 0.4
    weight_13 = 0.6
    node_1.add_child(node_2, weight_12)
    node_1.add_child(node_3, weight_13)

    node_4 = ProductNode()
    node_4.id = 4

    leaf_5 = CategoricalSmoothedNode(var,
                                     values)
    leaf_5.id = 5

    # not adding the slice to the stack

    node_2.add_child(node_4)
    node_2.add_child(leaf_5)

    node_6 = SumNode()
    node_6.id = 6

    node_7 = SumNode()
    node_7.id = 7

    weight_36 = 0.1
    weight_37 = 0.9
    node_3.add_child(node_6, weight_36)
    node_3.add_child(node_7, weight_37)

    node_8 = ProductNode()
    node_8.id = 8

    #
    # this is a cltree
    leaf_15 = CLTreeNode(vars=vars,
                         var_values=var_values,
                         data=s_data)
    leaf_15.id = 15

    node_4.add_child(node_8)
    node_4.add_child(leaf_15)

    leaf_13 = CategoricalSmoothedNode(var,
                                      values)
    leaf_13.id = 13

    leaf_14 = CLTreeNode(vars=vars,
                         var_values=var_values,
                         data=s_data)
    leaf_14.id = 14

    node_8.add_child(leaf_13)
    node_8.add_child(leaf_14)

    leaf_9 = CLTreeNode(vars=vars,
                        var_values=var_values,
                        data=s_data)
    leaf_9.id = 9

    node_10 = ProductNode()
    node_10.id = 10

    leaf_18 = CategoricalSmoothedNode(var,
                                      values)
    leaf_18.id = 18

    leaf_19 = CategoricalSmoothedNode(var,
                                      values)
    leaf_19.id = 19

    node_10.add_child(leaf_18)
    node_10.add_child(leaf_19)

    weight_69 = 0.3
    weight_610 = 0.7
    node_6.add_child(leaf_9, weight_69)
    node_6.add_child(node_10, weight_610)

    node_11 = ProductNode()
    node_11.id = 11

    leaf_20 = CategoricalSmoothedNode(var,
                                      values)
    leaf_20.id = 20

    leaf_21 = CategoricalSmoothedNode(var,
                                      values)
    leaf_21.id = 21

    node_11.add_child(leaf_20)
    node_11.add_child(leaf_21)

    node_12 = ProductNode()
    node_12.id = 12

    leaf_22 = CLTreeNode(vars=vars,
                         var_values=var_values,
                         data=s_data)
    leaf_22.id = 22

    leaf_23 = CategoricalSmoothedNode(var,
                                      values)
    leaf_23.id = 23

    node_12.add_child(leaf_22)
    node_12.add_child(leaf_23)

    weight_711 = 0.5
    weight_712 = 0.5
    node_7.add_child(node_11, weight_711)
    node_7.add_child(node_12, weight_712)

    print('Added nodes')

    root_node = SpnFactory.layered_pruned_linked_spn(node_1)

    print('ROOT NODE', root_node)

    spn = SpnFactory.layered_linked_spn(root_node)

    print('SPN', spn)

    assert spn.n_layers() == 3

    for i, layer in enumerate(spn.top_down_layers()):
        if i == 0:
            assert layer.n_nodes() == 1
        elif i == 1:
            assert layer.n_nodes() == 4
        elif i == 2:
            assert layer.n_nodes() == 10
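
# Note (sketch, not the library's code): the CLTreeNode leaves above are
# multivariate leaves fit on the s_data slice; going by the name they
# presumably encode a Chow-Liu tree over vars, which is driven by the
# pairwise mutual information of the slice. A minimal standalone computation
# of that quantity for the two binary columns, with an assumed Laplace-style
# smoothing:
import numpy

s_data = numpy.array([[0, 1], [1, 1], [1, 0], [0, 0]])
alpha = 0.1                              # hypothetical smoothing value

joint = numpy.full((2, 2), alpha)        # smoothed joint of the two columns
for x, y in s_data:
    joint[x, y] += 1
joint /= joint.sum()
p0 = joint.sum(axis=1)                   # marginal of the first column
p1 = joint.sum(axis=0)                   # marginal of the second column

mi = sum(joint[x, y] * numpy.log(joint[x, y] / (p0[x] * p1[y]))
         for x in range(2) for y in range(2))
print('pairwise MI of the slice:', mi)   # ~0.0: this toy slice is independent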
Ejemplo n.º 23
0
def create_valid_toy_spn():
    # root layer
    whole_scope = frozenset({0, 1, 2, 3})
    root_node = SumNode(var_scope=whole_scope)
    root_layer = SumLayer([root_node])

    # prod layer
    prod_node_1 = ProductNode(var_scope=whole_scope)
    prod_node_2 = ProductNode(var_scope=whole_scope)
    prod_layer_1 = ProductLayer([prod_node_1, prod_node_2])

    root_node.add_child(prod_node_1, 0.5)
    root_node.add_child(prod_node_2, 0.5)

    # sum layer
    scope_1 = frozenset({0, 1})
    scope_2 = frozenset({2})
    scope_3 = frozenset({3})
    scope_4 = frozenset({2, 3})

    sum_node_1 = SumNode(var_scope=scope_1)
    sum_node_2 = SumNode(var_scope=scope_2)
    sum_node_3 = SumNode(var_scope=scope_3)
    sum_node_4 = SumNode(var_scope=scope_4)

    prod_node_1.add_child(sum_node_1)
    prod_node_1.add_child(sum_node_2)
    prod_node_1.add_child(sum_node_3)

    prod_node_2.add_child(sum_node_1)
    prod_node_2.add_child(sum_node_4)

    sum_layer_1 = SumLayer([sum_node_1, sum_node_2,
                            sum_node_3, sum_node_4])

    # another product layer
    prod_node_3 = ProductNode(var_scope=scope_1)
    prod_node_4 = ProductNode(var_scope=scope_1)

    prod_node_5 = ProductNode(var_scope=scope_4)
    prod_node_6 = ProductNode(var_scope=scope_4)

    sum_node_1.add_child(prod_node_3, 0.5)
    sum_node_1.add_child(prod_node_4, 0.5)

    sum_node_4.add_child(prod_node_5, 0.5)
    sum_node_4.add_child(prod_node_6, 0.5)

    prod_layer_2 = ProductLayer([prod_node_3, prod_node_4,
                                 prod_node_5, prod_node_6])

    # last sum one
    scope_5 = frozenset({0})
    scope_6 = frozenset({1})

    sum_node_5 = SumNode(var_scope=scope_5)
    sum_node_6 = SumNode(var_scope=scope_6)
    sum_node_7 = SumNode(var_scope=scope_5)
    sum_node_8 = SumNode(var_scope=scope_6)

    sum_node_9 = SumNode(var_scope=scope_2)
    sum_node_10 = SumNode(var_scope=scope_3)
    sum_node_11 = SumNode(var_scope=scope_2)
    sum_node_12 = SumNode(var_scope=scope_3)

    prod_node_3.add_child(sum_node_5)
    prod_node_3.add_child(sum_node_6)
    prod_node_4.add_child(sum_node_7)
    prod_node_4.add_child(sum_node_8)

    prod_node_5.add_child(sum_node_9)
    prod_node_5.add_child(sum_node_10)
    prod_node_6.add_child(sum_node_11)
    prod_node_6.add_child(sum_node_12)

    sum_layer_2 = SumLayer([sum_node_5, sum_node_6,
                            sum_node_7, sum_node_8,
                            sum_node_9, sum_node_10,
                            sum_node_11, sum_node_12])

    # input layer
    vars = [2, 3, 2, 2]
    input_layer = CategoricalIndicatorLayer(vars=vars)
    last_sum_nodes = [sum_node_2, sum_node_3,
                      sum_node_5, sum_node_6,
                      sum_node_7, sum_node_8,
                      sum_node_9, sum_node_10,
                      sum_node_11, sum_node_12]
    for sum_node in last_sum_nodes:
        (var_scope,) = sum_node.var_scope
        for input_node in input_layer.nodes():
            if input_node.var == var_scope:
                sum_node.add_child(input_node, 1.0)

    spn = Spn(input_layer=input_layer,
              layers=[sum_layer_2, prod_layer_2,
                      sum_layer_1, prod_layer_1,
                      root_layer])

    # print(spn)
    return spn
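
# Note (sketch on plain frozensets, not the library's classes): the scopes in
# create_valid_toy_spn are chosen so that every sum node is complete (all
# children carry the parent's scope) and every product node is decomposable
# (children's scopes are disjoint and cover the parent's scope). A minimal
# standalone checker of those two conditions:
def is_complete(sum_scope, child_scopes):
    # children of a sum node must all share the parent's scope
    return all(scope == sum_scope for scope in child_scopes)

def is_decomposable(prod_scope, child_scopes):
    # children of a product node must have pairwise disjoint scopes
    # whose union is exactly the parent's scope
    union = frozenset().union(*child_scopes)
    disjoint = sum(len(scope) for scope in child_scopes) == len(union)
    return disjoint and union == prod_scope

whole_scope = frozenset({0, 1, 2, 3})
print(is_complete(whole_scope, [whole_scope, whole_scope]))            # True
print(is_decomposable(whole_scope,
                      [frozenset({0, 1}), frozenset({2}), frozenset({3})]))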
Ejemplo n.º 24
0
def test_sum_node_backprop():
    # create child nodes
    child1 = Node()
    val1 = 1.
    child1.set_val(val1)

    child2 = Node()
    val2 = 1.
    child2.set_val(val2)

    # create sum node and adding children to it
    sum_node1 = SumNode()
    weight11 = 0.8
    weight12 = 0.2
    sum_node1.add_child(child1, weight11)
    sum_node1.add_child(child2, weight12)

    # adding a coparent
    sum_node2 = SumNode()
    weight21 = 0.6
    weight22 = 0.4
    sum_node2.add_child(child1, weight21)
    sum_node2.add_child(child2, weight22)

    # evaluating
    sum_node1.eval()
    sum_node2.eval()

    # setting the log derivatives to the parents
    sum_node_der1 = 1.0
    sum_node1.log_der = log(sum_node_der1)
    sum_node1.backprop()

    sum_node_der2 = 1.0
    sum_node2.log_der = log(sum_node_der2)
    sum_node2.backprop()

    # checking for correctness
    log_der1 = log(weight11 * sum_node_der1 +
                   weight21 * sum_node_der2)

    log_der2 = log(weight12 * sum_node_der1 +
                   weight22 * sum_node_der2)

    print('log ders 1:{lgd1} 2:{lgd2}'.format(lgd1=log_der1,
                                              lgd2=log_der2))
    assert_almost_equal(log_der1, child1.log_der, 15)
    assert_almost_equal(log_der2, child2.log_der, 15)

    # resetting
    child1.log_der = LOG_ZERO
    child2.log_der = LOG_ZERO

    # now changing the initial der values
    sum_node_der1 = 0.5
    sum_node1.log_der = log(sum_node_der1)
    sum_node1.backprop()

    sum_node_der2 = 0.0
    sum_node2.log_der = LOG_ZERO
    sum_node2.backprop()

    # checking for correctness
    log_der1 = log(weight11 * sum_node_der1 +
                   weight21 * sum_node_der2)

    log_der2 = log(weight12 * sum_node_der1 +
                   weight22 * sum_node_der2)

    print('log ders 1:{lgd1} 2:{lgd2}'.format(lgd1=log_der1,
                                              lgd2=log_der2))
    assert_almost_equal(log_der1, child1.log_der, 15)
    assert_almost_equal(log_der2, child2.log_der, 15)
Ejemplo n.º 25
0
def test_spn_set_get_weights():
    # create a simple spn
    root_node = SumNode()
    root_layer = SumLayer([root_node])

    prod_node_1 = ProductNode()
    prod_node_2 = ProductNode()
    root_node.add_child(prod_node_1, 0.5)
    root_node.add_child(prod_node_2, 0.5)
    prod_layer = ProductLayer([prod_node_1,
                               prod_node_2])

    sum_node_1 = SumNode()
    sum_node_2 = SumNode()
    sum_node_3 = SumNode()
    prod_node_1.add_child(sum_node_1)
    prod_node_1.add_child(sum_node_2)
    prod_node_2.add_child(sum_node_2)
    prod_node_2.add_child(sum_node_3)
    sum_layer = SumLayer([sum_node_1, sum_node_2,
                          sum_node_3])

    ind_node_1 = CategoricalIndicatorNode(var=0, var_val=1)
    ind_node_2 = CategoricalIndicatorNode(var=0, var_val=1)
    ind_node_3 = CategoricalIndicatorNode(var=0, var_val=1)
    ind_node_4 = CategoricalIndicatorNode(var=0, var_val=1)
    ind_node_5 = CategoricalIndicatorNode(var=0, var_val=1)
    input_layer = CategoricalInputLayer(nodes=[ind_node_1,
                                               ind_node_2,
                                               ind_node_3,
                                               ind_node_4,
                                               ind_node_5])
    sum_node_1.add_child(ind_node_1, 0.2)
    sum_node_1.add_child(ind_node_2, 0.2)
    sum_node_2.add_child(ind_node_2, 0.2)
    sum_node_2.add_child(ind_node_3, 0.2)
    sum_node_2.add_child(ind_node_4, 0.2)
    sum_node_3.add_child(ind_node_4, 0.2)
    sum_node_3.add_child(ind_node_5, 0.2)

    spn = Spn(input_layer=input_layer,
              layers=[sum_layer, prod_layer, root_layer])

    print(spn)

    # storing these weights
    curr_weights = spn.get_weights()

    # setting the new weights
    spn.set_weights(weights_ds)

    # getting them again
    new_weights = spn.get_weights()

    # comparing them
    assert new_weights == weights_ds

    # now setting back the previous one
    spn.set_weights(curr_weights)

    # getting them back again
    old_weights = spn.get_weights()

    # and checking
    assert old_weights == curr_weights
Ejemplo n.º 26
0
    def linked_random_spn_top_down(cls,
                                   vars,
                                   n_layers,
                                   n_max_children,
                                   n_scope_children,
                                   max_scope_split,
                                   merge_prob=0.5,
                                   rand_gen=None):
        """
        WRITEME
        """
        def cluster_scopes(scope_list):
            cluster_dict = {}

            for i, var in enumerate(scope_list):
                # group the node positions sharing the same scope
                cluster_dict.setdefault(var, set()).add(i)
            return cluster_dict

        def cluster_set_scope(scope_list):
            return {scope for scope in scope_list}

        def link_leaf_to_input_layer(sum_leaf, scope_var, input_layer,
                                     rand_gen):
            for indicator_node in input_layer.nodes():
                if indicator_node.var == scope_var:
                    rand_weight = rand_gen.random()
                    sum_leaf.add_child(indicator_node, rand_weight)
                    # print(sum_leaf, indicator_node, rand_weight)
            # normalizing
            sum_leaf.normalize()

        #
        # creating a product layer
        #

        def build_product_layer(parent_layer, parent_scope_list,
                                n_max_children, n_scope_children, input_layer,
                                rand_gen):

            # grouping the scopes of the parents
            scope_clusters = cluster_set_scope(parent_scope_list)
            # for each scope add a fixed number of children
            children_lists = {
                scope: [
                    ProductNode(var_scope=scope)
                    for i in range(n_scope_children)
                ]
                for scope in scope_clusters
            }
            # counting which node is used
            children_counts = {
                scope: [0 for i in range(n_scope_children)]
                for scope in scope_clusters
            }
            # now link those randomly to their parent
            for parent, scope in zip(parent_layer.nodes(), parent_scope_list):
                # only for nodes not becoming leaves
                if len(scope) > 1:
                    # sampling at most n_max_children from those in the same
                    # scope
                    children_scope_list = children_lists[scope]
                    sample_length = min(len(children_scope_list),
                                        n_max_children)
                    sampled_ids = rand_gen.sample(range(n_scope_children),
                                                  sample_length)
                    sampled_children = [None for i in range(sample_length)]
                    for i, id in enumerate(sampled_ids):
                        # getting the sampled child
                        sampled_children[i] = children_scope_list[id]
                        # updating its counter
                        children_counts[scope][id] += 1

                    for child in sampled_children:
                        # parent is a sum layer, we must set a random weight
                        rand_weight = rand_gen.random()
                        parent.add_child(child, rand_weight)

                    # we can now normalize it
                    parent.normalize()
                else:
                    # binding the node to the input layer
                    (scope_var, ) = scope
                    link_leaf_to_input_layer(parent, scope_var, input_layer,
                                             rand_gen)

            # pruning those children never used
            for scope in children_lists.keys():
                children_scope_list = children_lists[scope]
                scope_counts = children_counts[scope]
                used_children = [
                    child
                    for count, child in zip(scope_counts, children_scope_list)
                    if count > 0
                ]
                children_lists[scope] = used_children

            # creating the layer and new scopelist
            # print('children list val', children_lists.values())
            children_list = [
                child for child in itertools.chain.from_iterable(
                    children_lists.values())
            ]
            scope_list = [
                key for key, child_list in children_lists.items()
                for elem in child_list
            ]
            # print('children list', children_list)
            # print('scope list', scope_list)
            prod_layer = ProductLayerLinked(children_list)

            return prod_layer, scope_list

        def build_sum_layer(parent_layer,
                            parent_scope_list,
                            rand_gen,
                            max_scope_split=-1,
                            merge_prob=0.5):

            # keeping track of leaves
            # leaf_props = []
            scope_clusters = cluster_set_scope(parent_scope_list)

            # looping through all the parent nodes and their scopes
            # in order to decompose their scope
            dec_scope_list = []
            for scope in parent_scope_list:
                # decomposing their scope into k random pieces
                k = len(scope)
                if 1 < max_scope_split <= len(scope):
                    k = rand_gen.randint(2, max_scope_split)
                shuffled_scope = list(scope)
                rand_gen.shuffle(shuffled_scope)
                dec_scopes = [
                    frozenset(shuffled_scope[i::k]) for i in range(k)
                ]
                dec_scope_list.append(dec_scopes)
                # if a decomposed scope consists of only one var, generate a
                # leaf
                # leaves = [(parent, (dec_scope,))
                #           for dec_scope in dec_scopes if len(dec_scope) == 1]
                # leaf_props.extend(leaves)

            # generating a unique decomposition
            used_decs = {}
            children_list = []
            scope_list = []
            for parent, decs in zip(parent_layer.nodes(), dec_scope_list):
                merge_count = 0
                for scope in decs:
                    sum_node = None
                    try:
                        rand_perc = rand_gen.random()
                        if (merge_count < len(decs) - 1
                                and rand_perc > merge_prob):
                            sum_node = used_decs[scope]
                            merge_count += 1

                        else:
                            raise Exception()
                    except:
                        # create a node for it
                        sum_node = SumNode(var_scope=scope)
                        children_list.append(sum_node)
                        scope_list.append(scope)
                        used_decs[scope] = sum_node

                    parent.add_child(sum_node)

            # unique_dec = {frozenset(dec) for dec in
            #               itertools.chain.from_iterable(dec_scope_list)}
            # print('unique dec', unique_dec)
            # building a dict scope->child
            # children_dict = {scope: SumNode() for scope in unique_dec}
            # now linking parents to their children
            # for parent, scope in zip(parent_layer.nodes(),
            #                          parent_scope_list):
            #     dec_scopes = dec_scope_list[scope]
            #     for dec in dec_scopes:
            # retrieving children
            # adding it
            #         parent.add_child(children_dict[dec])

            # we already have the nodes and their scopes
            # children_list = [child for child in children_dict.values()]
            # scope_list = [scope for scope in children_dict.keys()]

            sum_layer = SumLayerLinked(nodes=children_list)

            return sum_layer, scope_list

        # if no generator is provided, create a new one
        if rand_gen is None:
            rand_gen = random.Random()

        # create input layer
        # _vars = [2, 3, 2, 2, 4]
        input_layer = CategoricalIndicatorLayerLinked(vars=vars)

        # create root layer
        full_scope = frozenset({i for i in range(len(vars))})
        root = SumNode(var_scope=full_scope)
        root_layer = SumLayerLinked(nodes=[root])
        last_layer = root_layer

        # create top scope list
        last_scope_list = [full_scope]

        layers = [root_layer]
        layer_count = 0
        stop_building = False
        while not stop_building:
            # checking for early termination
            # this one leads to split product nodes into leaves
            if layer_count >= n_layers:
                print('Max level reached, trying to stop')
                max_scope_split = -1

            # build a new layer alternating types
            if isinstance(last_layer, SumLayerLinked):
                print('Building product layer')
                last_layer, last_scope_list = \
                    build_product_layer(last_layer,
                                        last_scope_list,
                                        n_max_children,
                                        n_scope_children,
                                        input_layer,
                                        rand_gen)
            elif isinstance(last_layer, ProductLayerLinked):
                print('Building sum layer')
                last_layer, last_scope_list = \
                    build_sum_layer(last_layer,
                                    last_scope_list,
                                    rand_gen,
                                    max_scope_split,
                                    merge_prob)

            # testing for more nodes to expand
            if last_layer.n_nodes() == 0:
                print('Stop building')
                stop_building = True
            else:
                layers.append(last_layer)
                layer_count += 1

        # checking for early termination
        # if not stop_building:
        #     if isinstance(last_layer, ProductLayerLinked):
        # building a sum layer splitting everything into one
        # length scopes
        #         last_sum_layer, last_scope_list = \
        #             build_sum_layer(last_layer,
        #                             last_scope_list,
        #                             rand_gen,
        #                             max_scope_split=-1)
        # then linking each node to the input layer
        #         for sum_leaf, scope in zip(last_sum_layer.nodes(),
        #                                    last_scope_list):
        #             (scope_var,) = scope
        #             link_leaf_to_input_layer(sum_leaf,
        #                                      scope_var,
        #                                      input_layer,
        #                                      rand_gen)
        #     elif isinstance(last_layer, SumLayerLinked):
        #         pass

        # print('LAYERS ', len(layers), '\n')
        # for i, layer in enumerate(layers):
        #     print('LAYER ', i)
        #     print(layer)
        # print('\n')
        spn = SpnLinked(input_layer=input_layer, layers=layers[::-1])
        # testing
        # scope_list = [
        #     frozenset({1, 3, 4}), frozenset({2, 0}), frozenset({1, 3, 4})]
        # sum_layer = SumLayerLinked(nodes=[SumNode(), SumNode(), SumNode()])

        # prod_layer, scope_list = build_product_layer(
        #     sum_layer, scope_list, 2, 3, input_layer, rand_gen)

        # sum_layer1, scope_list_2 = build_sum_layer(prod_layer,
        #                                            scope_list,
        #                                            rand_gen,
        #                                            max_scope_split=2
        #                                            )
        # prod_layer_2, scope_list_3 = build_product_layer(sum_layer1,
        #                                                  scope_list_2,
        #                                                  2,
        #                                                  3,
        #                                                  input_layer,
        #                                                  rand_gen)
        # create spn from layers
        # spn = SpnLinked(input_layer=input_layer,
        #                 layers=[prod_layer_2, sum_layer1,
        #                         prod_layer, sum_layer, root_layer])
        return spn
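
# Note (standalone check of the scope split used in build_sum_layer above):
# shuffling a scope and dealing it into k round-robin pieces yields a random
# partition into k disjoint, non-empty parts as long as k <= len(scope).
import random

def decompose_scope(scope, k, rand_gen):
    shuffled_scope = list(scope)
    rand_gen.shuffle(shuffled_scope)
    return [frozenset(shuffled_scope[i::k]) for i in range(k)]

rand_gen = random.Random(1337)
parts = decompose_scope(frozenset({0, 1, 2, 3, 4}), k=2, rand_gen=rand_gen)
print(parts)                                  # two disjoint, non-empty parts
assert frozenset().union(*parts) == frozenset({0, 1, 2, 3, 4})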
Ejemplo n.º 27
0
def test_sum_layer_backprop():
    # input layer made of 5 generic nodes
    node1 = Node()
    node2 = Node()
    node3 = Node()
    node4 = Node()
    node5 = Node()

    # top layer made by 3 sum nodes
    sum1 = SumNode()
    sum2 = SumNode()
    sum3 = SumNode()

    # linking to input nodes
    weight11 = 0.3
    sum1.add_child(node1, weight11)
    weight12 = 0.3
    sum1.add_child(node2, weight12)
    weight13 = 0.4
    sum1.add_child(node3, weight13)

    weight22 = 0.15
    sum2.add_child(node2, weight22)
    weight23 = 0.15
    sum2.add_child(node3, weight23)
    weight24 = 0.7
    sum2.add_child(node4, weight24)

    weight33 = 0.4
    sum3.add_child(node3, weight33)
    weight34 = 0.25
    sum3.add_child(node4, weight34)
    weight35 = 0.35
    sum3.add_child(node5, weight35)

    sum_layer = SumLayer([sum1, sum2, sum3])

    # setting input values
    val1 = 0.0
    node1.set_val(val1)
    val2 = 0.5
    node2.set_val(val2)
    val3 = 0.3
    node3.set_val(val3)
    val4 = 1.0
    node4.set_val(val4)
    val5 = 0.0
    node5.set_val(val5)

    # evaluating
    sum_layer.eval()
    print('eval\'d layer:', sum_layer.node_values())

    # set the parent derivatives
    sum_der1 = 1.0
    sum1.log_der = log(sum_der1)

    sum_der2 = 1.0
    sum2.log_der = log(sum_der2)

    sum_der3 = 0.0
    sum3.log_der = LOG_ZERO

    # back prop layer wise
    sum_layer.backprop()

    # check for correctness
    try:
        log_der1 = log(sum_der1 * weight11)
    except:
        log_der1 = LOG_ZERO

    try:
        log_der2 = log(sum_der1 * weight12 +
                       sum_der2 * weight22)
    except:
        log_der2 = LOG_ZERO

    try:
        log_der3 = log(sum_der1 * weight13 +
                       sum_der2 * weight23 +
                       sum_der3 * weight33)
    except:
        log_der3 = LOG_ZERO

    try:
        log_der4 = log(sum_der2 * weight24 +
                       sum_der3 * weight34)
    except:
        log_der4 = LOG_ZERO

    try:
        log_der5 = log(sum_der3 * weight35)
    except:
        log_der5 = LOG_ZERO

    # printing, just in case
    print('child log der', node1.log_der, node2.log_der,
          node3.log_der, node4.log_der, node5.log_der)
    print('exact log der', log_der1, log_der2, log_der3,
          log_der4, log_der5)

    if IS_LOG_ZERO(log_der1):
        assert IS_LOG_ZERO(node1.log_der)
    else:
        assert_almost_equal(log_der1, node1.log_der, 15)
    if IS_LOG_ZERO(log_der2):
        assert IS_LOG_ZERO(node2.log_der)
    else:
        assert_almost_equal(log_der2, node2.log_der, 15)
    if IS_LOG_ZERO(log_der3):
        assert IS_LOG_ZERO(node3.log_der)
    else:
        assert_almost_equal(log_der3, node3.log_der, 15)
    if IS_LOG_ZERO(log_der4):
        assert IS_LOG_ZERO(node4.log_der)
    else:
        assert_almost_equal(log_der4, node4.log_der, 15)
    if IS_LOG_ZERO(log_der5):
        assert IS_LOG_ZERO(node5.log_der)
    else:
        assert_almost_equal(log_der5, node5.log_der, 15)

    # updating weights
    eta = 0.1
    sum_layer.update_weights(Spn.test_weight_update, 0)
    # checking for correctness
    weight_u11 = sum_der1 * val1 * eta + weight11
    weight_u12 = sum_der1 * val2 * eta + weight12
    weight_u13 = sum_der1 * val3 * eta + weight13

    weight_u22 = sum_der2 * val2 * eta + weight22
    weight_u23 = sum_der2 * val3 * eta + weight23
    weight_u24 = sum_der2 * val4 * eta + weight24

    weight_u33 = sum_der3 * val3 * eta + weight33
    weight_u34 = sum_der3 * val4 * eta + weight34
    weight_u35 = sum_der3 * val5 * eta + weight35

    # normalizing
    weight_sum1 = weight_u11 + weight_u12 + weight_u13
    weight_sum2 = weight_u22 + weight_u23 + weight_u24
    weight_sum3 = weight_u33 + weight_u34 + weight_u35

    weight_u11 = weight_u11 / weight_sum1
    weight_u12 = weight_u12 / weight_sum1
    weight_u13 = weight_u13 / weight_sum1

    weight_u22 = weight_u22 / weight_sum2
    weight_u23 = weight_u23 / weight_sum2
    weight_u24 = weight_u24 / weight_sum2

    weight_u33 = weight_u33 / weight_sum3
    weight_u34 = weight_u34 / weight_sum3
    weight_u35 = weight_u35 / weight_sum3

    print('expected weights', weight_u11, weight_u12, weight_u13,
          weight_u22, weight_u23, weight_u24,
          weight_u33, weight_u34, weight_u35)
    print('found weights', sum1.weights[0], sum1.weights[1], sum1.weights[2],
          sum2.weights[0], sum2.weights[1], sum2.weights[2],
          sum3.weights[0], sum3.weights[1], sum3.weights[2])
    assert_almost_equal(weight_u11, sum1.weights[0], 10)
    assert_almost_equal(weight_u12, sum1.weights[1], 10)
    assert_almost_equal(weight_u13, sum1.weights[2], 10)

    assert_almost_equal(weight_u22, sum2.weights[0], 10)
    assert_almost_equal(weight_u23, sum2.weights[1], 10)
    assert_almost_equal(weight_u24, sum2.weights[2], 10)

    assert_almost_equal(weight_u33, sum3.weights[0], 10)
    assert_almost_equal(weight_u34, sum3.weights[1], 10)
    assert_almost_equal(weight_u35, sum3.weights[2], 10)

    #
    # resetting derivatives
    #
    node1.log_der = LOG_ZERO
    node2.log_der = LOG_ZERO
    node3.log_der = LOG_ZERO
    node4.log_der = LOG_ZERO
    node5.log_der = LOG_ZERO

    # setting new values as inputs
    val1 = 0.0
    node1.set_val(val1)
    val2 = 0.0
    node2.set_val(val2)
    val3 = 0.3
    node3.set_val(val3)
    val4 = 1.0
    node4.set_val(val4)
    val5 = 1.0
    node5.set_val(val5)

    # evaluating again
    sum_layer.eval()
    print('eval\'d layer:', sum_layer.node_values())

    # set the parent derivatives
    sum_der1 = 1.0
    sum1.log_der = log(sum_der1)

    sum_der2 = 1.0
    sum2.log_der = log(sum_der2)

    sum_der3 = 0.0
    sum3.log_der = LOG_ZERO

    # back prop layer wise
    sum_layer.backprop()

    # check for correctness
    try:
        log_der1 = log(sum_der1 * weight_u11)
    except:
        log_der1 = LOG_ZERO

    try:
        log_der2 = log(sum_der1 * weight_u12 +
                       sum_der2 * weight_u22)
    except:
        log_der2 = LOG_ZERO

    try:
        log_der3 = log(sum_der1 * weight_u13 +
                       sum_der2 * weight_u23 +
                       sum_der3 * weight_u33)
    except:
        log_der3 = LOG_ZERO

    try:
        log_der4 = log(sum_der2 * weight_u24 +
                       sum_der3 * weight_u34)
    except:
        log_der4 = LOG_ZERO

    try:
        log_der5 = log(sum_der3 * weight_u35)
    except:
        log_der5 = LOG_ZERO

    # printing, just in case
    print('child log der', node1.log_der, node2.log_der,
          node3.log_der, node4.log_der, node5.log_der)
    print('exact log der', log_der1, log_der2, log_der3,
          log_der4, log_der5)

    if IS_LOG_ZERO(log_der1):
        assert IS_LOG_ZERO(node1.log_der)
    else:
        assert_almost_equal(log_der1, node1.log_der, 15)
    if IS_LOG_ZERO(log_der2):
        assert IS_LOG_ZERO(node2.log_der)
    else:
        assert_almost_equal(log_der2, node2.log_der, 15)
    if IS_LOG_ZERO(log_der3):
        assert IS_LOG_ZERO(node3.log_der)
    else:
        assert_almost_equal(log_der3, node3.log_der, 15)
    if IS_LOG_ZERO(log_der4):
        assert IS_LOG_ZERO(node4.log_der)
    else:
        assert_almost_equal(log_der4, node4.log_der, 15)
    if IS_LOG_ZERO(log_der5):
        assert IS_LOG_ZERO(node5.log_der)
    else:
        assert_almost_equal(log_der5, node5.log_der, 15)
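
# Note (standalone sketch of the update encoded by the expected weights
# above): each weight grows by eta * parent_der * child_val and the sum
# node's weights are then renormalized to sum to one.
def update_sum_weights(weights, child_vals, parent_der, eta=0.1):
    updated = [w + eta * parent_der * v for w, v in zip(weights, child_vals)]
    total = sum(updated)
    return [w / total for w in updated]

# sum1 in the first pass: weights (0.3, 0.3, 0.4), child values (0.0, 0.5, 0.3)
print(update_sum_weights([0.3, 0.3, 0.4], [0.0, 0.5, 0.3], parent_der=1.0))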
Ejemplo n.º 28
0
    def linked_kernel_density_estimation(cls,
                                         n_instances,
                                         features,
                                         node_dict=None,
                                         alpha=0.1
                                         # ,batch_size=1,
                                         # sparse=False
                                         ):
        """
        WRITEME
        """

        n_features = len(features)

        # the top one is a sum layer with a single node
        root_node = SumNode()
        root_layer = SumLayerLinked([root_node])

        # second one is a product layer with n_instances nodes
        product_nodes = [ProductNode() for i in range(n_instances)]
        product_layer = ProductLayerLinked(product_nodes)
        # linking them to the root node
        for prod_node in product_nodes:
            root_node.add_child(prod_node, 1. / n_instances)

        # last layer can be a categorical smoothed input
        # or sum_layer + categorical indicator input

        input_layer = None
        layers = None
        n_leaf_nodes = n_features * n_instances

        if node_dict is None:
            # creating a sum_layer with n_leaf_nodes
            sum_nodes = [SumNode() for i in range(n_leaf_nodes)]
            # store them into a layer
            sum_layer = SumLayerLinked(sum_nodes)
            # linking them to the products above
            for i, prod_node in enumerate(product_nodes):
                for j in range(n_features):
                    # getting the next n_features nodes
                    prod_node.add_child(sum_nodes[i * n_features + j])
            # now creating the indicator nodes
            input_layer = \
                CategoricalIndicatorLayerLinked(vars=features)
            # linking the sum nodes to the indicator vars
            for i, sum_node in enumerate(sum_nodes):
                # getting the feature id
                j = i % n_features
                # and thus its number of values
                n_values = features[j]
                # getting the indices of indicators
                start_index = sum(features[:j])
                end_index = start_index + n_values
                indicators = [node for node in input_layer.nodes()
                              ][start_index:end_index]
                for ind_node in indicators:
                    sum_node.add_child(ind_node, 1. / n_values)

            # storing levels
            layers = [sum_layer, product_layer, root_layer]
        else:
            # create a categorical smoothed layer
            input_layer = \
                CategoricalSmoothedLayerLinked(vars=features,
                                               node_dicts=node_dict,
                                               alpha=alpha)
            # it shall contain n_leaf_nodes nodes
            smooth_nodes = list(input_layer.nodes())
            assert len(smooth_nodes) == n_leaf_nodes

            # linking it
            for i, prod_node in enumerate(product_nodes):
                for j in range(n_features):
                    # getting the next n_features nodes
                    prod_node.add_child(smooth_nodes[i * n_features + j])
            # setting the used levels
            layers = [product_layer, root_layer]

        # create the spn from levels
        kern_spn = SpnLinked(input_layer, layers)
        return kern_spn
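
# Note (standalone check of the indicator index arithmetic used above):
# the indicator layer lays out one node per (feature, value) pair, feature
# after feature, so the indicators of feature j occupy a contiguous block of
# length features[j] starting at sum(features[:j]).
features = [2, 3, 2]                    # hypothetical arities
n_features = len(features)

for i in range(2 * n_features):         # e.g. the leaves of two instances
    j = i % n_features
    start_index = sum(features[:j])
    end_index = start_index + features[j]
    print('leaf', i, '-> feature', j,
          'indicator indices', list(range(start_index, end_index)))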
Ejemplo n.º 29
0
def test_mini_spn_fit_em():
    vars = numpy.array([2, 2, 2, 2])
    input_layer = CategoricalIndicatorLayer(vars=vars)

    print(input_layer)
    ind1 = input_layer._nodes[0]
    ind2 = input_layer._nodes[1]
    ind3 = input_layer._nodes[2]
    ind4 = input_layer._nodes[3]
    ind5 = input_layer._nodes[4]
    ind6 = input_layer._nodes[5]
    ind7 = input_layer._nodes[6]
    ind8 = input_layer._nodes[7]

    # creating a sum layer of 4 nodes
    sum1 = SumNode()
    sum2 = SumNode()
    sum3 = SumNode()
    sum4 = SumNode()

    sum1.add_child(ind1, 0.6)
    sum1.add_child(ind2, 0.4)
    sum2.add_child(ind3, 0.5)
    sum2.add_child(ind4, 0.5)
    sum3.add_child(ind5, 0.7)
    sum3.add_child(ind6, 0.3)
    sum4.add_child(ind7, 0.4)
    sum4.add_child(ind8, 0.6)

    sum_layer = SumLayer(nodes=[sum1, sum2, sum3, sum4])

    # and a top layer of 3 products
    prod1 = ProductNode()
    prod2 = ProductNode()
    prod3 = ProductNode()

    prod1.add_child(sum1)
    prod1.add_child(sum2)
    prod2.add_child(sum2)
    prod2.add_child(sum3)
    prod3.add_child(sum3)
    prod3.add_child(sum4)

    prod_layer = ProductLayer(nodes=[prod1, prod2, prod3])

    # root layer
    root = SumNode()

    root.add_child(prod1, 0.4)
    root.add_child(prod2, 0.25)
    root.add_child(prod3, 0.35)

    root_layer = SumLayer(nodes=[root])

    spn = Spn(input_layer=input_layer,
              layers=[sum_layer, prod_layer, root_layer])

    print(spn)

    # training on obs
    spn.fit_em(train=syn_train_data, valid=syn_val_data, test=None, hard=True)
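
# Note (rough illustration only, an assumption and not the library's fit_em):
# a hard-EM style step for a single sum node could credit, per instance, the
# child with the largest weighted contribution and renormalize the counts.
def hard_em_sum_update(weights, child_vals_per_instance, prior=1.0):
    counts = [prior] * len(weights)
    for child_vals in child_vals_per_instance:
        winner = max(range(len(weights)),
                     key=lambda i: weights[i] * child_vals[i])
        counts[winner] += 1
    total = sum(counts)
    return [count / total for count in counts]

print(hard_em_sum_update([0.6, 0.4],
                         [[1.0, 0.0], [0.0, 1.0], [1.0, 0.0]]))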
Ejemplo n.º 30
0
def test_linked_to_theano_indicator():
    # creating single nodes
    root = SumNode()

    prod1 = ProductNode()
    prod2 = ProductNode()
    prod3 = ProductNode()

    sum1 = SumNode()
    sum2 = SumNode()
    sum3 = SumNode()
    sum4 = SumNode()

    ind1 = CategoricalIndicatorNode(var=0, var_val=0)
    ind2 = CategoricalIndicatorNode(var=0, var_val=1)
    ind3 = CategoricalIndicatorNode(var=1, var_val=0)
    ind4 = CategoricalIndicatorNode(var=1, var_val=1)
    ind5 = CategoricalIndicatorNode(var=2, var_val=0)
    ind6 = CategoricalIndicatorNode(var=2, var_val=1)
    ind7 = CategoricalIndicatorNode(var=2, var_val=2)
    ind8 = CategoricalIndicatorNode(var=3, var_val=0)
    ind9 = CategoricalIndicatorNode(var=3, var_val=1)
    ind10 = CategoricalIndicatorNode(var=3, var_val=2)
    ind11 = CategoricalIndicatorNode(var=3, var_val=3)

    prod4 = ProductNode()
    prod5 = ProductNode()
    prod6 = ProductNode()
    prod7 = ProductNode()

    # linking nodes
    root.add_child(prod1, 0.3)
    root.add_child(prod2, 0.3)
    root.add_child(prod3, 0.4)

    prod1.add_child(sum1)
    prod1.add_child(sum2)
    prod2.add_child(ind7)
    prod2.add_child(ind8)
    prod2.add_child(ind11)
    prod3.add_child(sum3)
    prod3.add_child(sum4)

    sum1.add_child(ind1, 0.3)
    sum1.add_child(ind2, 0.3)
    sum1.add_child(prod4, 0.4)

    sum2.add_child(ind2, 0.5)
    sum2.add_child(prod4, 0.2)
    sum2.add_child(prod5, 0.3)

    sum3.add_child(prod6, 0.5)
    sum3.add_child(prod7, 0.5)
    sum4.add_child(prod6, 0.5)
    sum4.add_child(prod7, 0.5)

    prod4.add_child(ind3)
    prod4.add_child(ind4)
    prod5.add_child(ind5)
    prod5.add_child(ind6)
    prod6.add_child(ind9)
    prod6.add_child(ind10)
    prod7.add_child(ind9)
    prod7.add_child(ind10)

    # building layers from nodes
    root_layer = SumLayerLinked([root])
    prod_layer = ProductLayerLinked([prod1, prod2, prod3])
    sum_layer = SumLayerLinked([sum1, sum2, sum3, sum4])
    aprod_layer = ProductLayerLinked([prod4, prod5, prod6, prod7])
    ind_layer = CategoricalIndicatorLayer(nodes=[
        ind1, ind2, ind3, ind4, ind5, ind6, ind7, ind8, ind9, ind10, ind11
    ])

    # creating the linked spn
    spn_linked = SpnLinked(
        input_layer=ind_layer,
        layers=[aprod_layer, sum_layer, prod_layer, root_layer])

    print(spn_linked)

    # converting to theano repr
    spn_theano = SpnFactory.linked_to_theano(spn_linked)
    print(spn_theano)

    # time for some inference comparison
    for instance in I:
        print('linked')
        res_l = spn_linked.eval(instance)
        print(res_l)
        print('theano')
        res_t = spn_theano.eval(instance)
        print(res_t)
        assert_array_almost_equal(res_l, res_t)
Ejemplo n.º 31
0
    def linked_naive_factorization(cls, features, node_dict=None, alpha=0.1):
        """
        WRITEME
        """
        n_features = len(features)

        # create an input layer
        input_layer = None
        layers = None

        # first layer is a product layer with n_feature children
        root_node = ProductNode()
        root_layer = ProductLayerLinked([root_node])

        # second is a sum node on an indicator layer
        if node_dict is None:
            # creating sum nodes
            sum_nodes = [SumNode() for i in range(n_features)]
            # linking to the root
            for node in sum_nodes:
                root_node.add_child(node)
            # store into a level
            sum_layer = SumLayerLinked(sum_nodes)
            # now create an indicator layer
            input_layer = CategoricalIndicatorLayerLinked(vars=features)
            # and linking it
            # TODO make this a function
            for i, sum_node in enumerate(sum_nodes):
                # getting the feature id
                j = i % n_features
                # and thus its number of values
                n_values = features[j]
                # getting the indices of indicators
                start_index = sum(features[:j])
                end_index = start_index + n_values
                indicators = [node for node in input_layer.nodes()
                              ][start_index:end_index]
                for ind_node in indicators:
                    sum_node.add_child(ind_node, 1. / n_values)

            # collecting layers
            layers = [sum_layer, root_layer]

        # or a categorical smoothed layer
        else:
            input_layer = CategoricalSmoothedLayerLinked(vars=features,
                                                         node_dicts=node_dict,
                                                         alpha=alpha)
            # it shall contain n_features nodes
            smooth_nodes = list(input_layer.nodes())
            assert len(smooth_nodes) == n_features
            for node in smooth_nodes:
                root_node.add_child(node)

            # set layers accordingly
            layers = [root_layer]

        # build the spn
        naive_fact_spn = SpnLinked(input_layer, layers)

        return naive_fact_spn
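
# Note (standalone sketch of the density a naive factorization encodes, not
# the library's evaluation code): the joint is a product of independent,
# smoothed per-feature marginals, so the log-likelihood is the sum of the
# per-feature log-probabilities.
import numpy

def naive_factorization_log_likelihood(data, features, alpha=0.1):
    log_lik = numpy.zeros(data.shape[0])
    for j, n_values in enumerate(features):
        counts = numpy.bincount(data[:, j], minlength=n_values) + alpha
        probs = counts / counts.sum()
        log_lik += numpy.log(probs[data[:, j]])
    return log_lik

data = numpy.array([[0, 1, 0], [1, 1, 1], [1, 0, 0]])
print(naive_factorization_log_likelihood(data, features=[2, 2, 2]))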
Ejemplo n.º 32
0
def test_layered_pruned_linked_spn_cltree():
    #
    # creating all the data slices
    # the slicing is a fake stub
    rows = 5
    cols = 5
    var = 1
    values = 2

    vars = [2, 3]
    var_values = [2, 2]
    s_data = numpy.array([[0, 1], [1, 1], [1, 0], [0, 0]])

    node_1 = SumNode()
    node_1.id = 1

    node_2 = ProductNode()
    node_2.id = 2

    node_3 = SumNode()
    node_3.id = 3

    # adding first level
    weight_12 = 0.4
    weight_13 = 0.6
    node_1.add_child(node_2, weight_12)
    node_1.add_child(node_3, weight_13)

    node_4 = ProductNode()
    node_4.id = 4

    leaf_5 = CategoricalSmoothedNode(var, values)
    leaf_5.id = 5

    # not adding the slice to the stack

    node_2.add_child(node_4)
    node_2.add_child(leaf_5)

    node_6 = SumNode()
    node_6.id = 6

    node_7 = SumNode()
    node_7.id = 7

    weight_36 = 0.1
    weight_37 = 0.9
    node_3.add_child(node_6, weight_36)
    node_3.add_child(node_7, weight_37)

    node_8 = ProductNode()
    node_8.id = 8

    #
    # this is a cltree
    leaf_15 = CLTreeNode(vars=vars, var_values=var_values, data=s_data)
    leaf_15.id = 15

    node_4.add_child(node_8)
    node_4.add_child(leaf_15)

    leaf_13 = CategoricalSmoothedNode(var, values)
    leaf_13.id = 13

    leaf_14 = CLTreeNode(vars=vars, var_values=var_values, data=s_data)
    leaf_14.id = 14

    node_8.add_child(leaf_13)
    node_8.add_child(leaf_14)

    leaf_9 = CLTreeNode(vars=vars, var_values=var_values, data=s_data)
    leaf_9.id = 9

    node_10 = ProductNode()
    node_10.id = 10

    leaf_18 = CategoricalSmoothedNode(var, values)
    leaf_18.id = 18

    leaf_19 = CategoricalSmoothedNode(var, values)
    leaf_19.id = 19

    node_10.add_child(leaf_18)
    node_10.add_child(leaf_19)

    weight_69 = 0.3
    weight_610 = 0.7
    node_6.add_child(leaf_9, weight_69)
    node_6.add_child(node_10, weight_610)

    node_11 = ProductNode()
    node_11.id = 11

    leaf_20 = CategoricalSmoothedNode(var, values)
    leaf_20.id = 20

    leaf_21 = CategoricalSmoothedNode(var, values)
    leaf_21.id = 21

    node_11.add_child(leaf_20)
    node_11.add_child(leaf_21)

    node_12 = ProductNode()
    node_12.id = 12

    leaf_22 = CLTreeNode(vars=vars, var_values=var_values, data=s_data)
    leaf_22.id = 22

    leaf_23 = CategoricalSmoothedNode(var, values)
    leaf_23.id = 23

    node_12.add_child(leaf_22)
    node_12.add_child(leaf_23)

    weight_711 = 0.5
    weight_712 = 0.5
    node_7.add_child(node_11, weight_711)
    node_7.add_child(node_12, weight_712)

    print('Added nodes')

    root_node = SpnFactory.layered_pruned_linked_spn(node_1)

    print('ROOT NODE', root_node)

    spn = SpnFactory.layered_linked_spn(root_node)

    print('SPN', spn)

    assert spn.n_layers() == 3

    for i, layer in enumerate(spn.top_down_layers()):
        if i == 0:
            assert layer.n_nodes() == 1
        elif i == 1:
            assert layer.n_nodes() == 4
        elif i == 2:
            assert layer.n_nodes() == 10
Ejemplo n.º 33
0
        def build_sum_layer(parent_layer,
                            parent_scope_list,
                            rand_gen,
                            max_scope_split=-1,
                            merge_prob=0.5):

            # keeping track of leaves
            # leaf_props = []
            scope_clusters = cluster_set_scope(parent_scope_list)

            # looping through all the parent nodes and their scopes
            # in order to decompose their scope
            dec_scope_list = []
            for scope in parent_scope_list:
                # decomposing their scope into k random pieces
                k = len(scope)
                if 1 < max_scope_split <= len(scope):
                    k = rand_gen.randint(2, max_scope_split)
                shuffled_scope = list(scope)
                rand_gen.shuffle(shuffled_scope)
                dec_scopes = [
                    frozenset(shuffled_scope[i::k]) for i in range(k)
                ]
                dec_scope_list.append(dec_scopes)
                # if a decomposed scope consists of only one var, generate a
                # leaf
                # leaves = [(parent, (dec_scope,))
                #           for dec_scope in dec_scopes if len(dec_scope) == 1]
                # leaf_props.extend(leaves)

            # generating a unique decomposition
            used_decs = {}
            children_list = []
            scope_list = []
            for parent, decs in zip(parent_layer.nodes(), dec_scope_list):
                merge_count = 0
                for scope in decs:
                    rand_perc = rand_gen.random()
                    # with probability 1 - merge_prob, reuse (merge into) an
                    # already created sum node covering the same scope
                    if (merge_count < len(decs) - 1
                            and rand_perc > merge_prob
                            and scope in used_decs):
                        sum_node = used_decs[scope]
                        merge_count += 1
                    else:
                        # otherwise create a new node for this scope
                        sum_node = SumNode(var_scope=scope)
                        children_list.append(sum_node)
                        scope_list.append(scope)
                        used_decs[scope] = sum_node

                    parent.add_child(sum_node)

            # unique_dec = {frozenset(dec) for dec in
            #               itertools.chain.from_iterable(dec_scope_list)}
            # print('unique dec', unique_dec)
            # building a dict scope->child
            # children_dict = {scope: SumNode() for scope in unique_dec}
            # now linking parents to their children
            # for parent, scope in zip(parent_layer.nodes(),
            #                          parent_scope_list):
            #     dec_scopes = dec_scope_list[scope]
            #     for dec in dec_scopes:
            # retrieving children
            # adding it
            #         parent.add_child(children_dict[dec])

            # we already have the nodes and their scopes
            # children_list = [child for child in children_dict.values()]
            # scope_list = [scope for scope in children_dict.keys()]

            sum_layer = SumLayerLinked(nodes=children_list)

            return sum_layer, scope_list
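
The heart of build_sum_layer is the scope decomposition: each parent scope is shuffled and split into k disjoint pieces via strided slicing. A self-contained sketch of just that step (hypothetical helper name, plain Python):

import random

def decompose_scope(scope, k, rand_gen=random.Random(1337)):
    # shuffle the variables, then take every k-th one starting from offset i
    shuffled = list(scope)
    rand_gen.shuffle(shuffled)
    return [frozenset(shuffled[i::k]) for i in range(k)]

# e.g. decompose_scope(frozenset({0, 1, 2, 3, 4}), 2) returns two disjoint
# frozensets whose union is the original scope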
Ejemplo n.º 34
0
def test_sum_node_is_complete():
    # create a sum node with a scope
    scope = frozenset({0, 2, 7, 13})
    sum_node = SumNode(var_scope=scope)

    # creating children with same scope
    children = [ProductNode(var_scope=scope) for i in range(4)]
    for prod_node in children:
        sum_node.add_child(prod_node, 1.0)

    assert sum_node.is_complete()

    # now altering one child's scope with one less var
    children[0].var_scope = frozenset({0, 7, 13})

    assert sum_node.is_complete() is False

    # now adding one more
    children[0].var_scope = scope
    children[3].var_scope = frozenset({0, 2, 7, 13, 3})

    assert not sum_node.is_complete()

    # now checking with indicator input nodes
    var = 4
    sum_node = SumNode(var_scope=frozenset({var}))
    children = [CategoricalIndicatorNode(var=var, var_val=i)
                for i in range(4)]
    for input_node in children:
        sum_node.add_child(input_node, 1.0)

    assert sum_node.is_complete()
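
For reference, completeness of a sum node simply means that every child has exactly the same scope as the node itself. A minimal stand-alone check, shown here only as a sketch and not as the library's is_complete() implementation:

def scopes_are_complete(node_scope, children_scopes):
    # a sum node is complete iff all of its children share its scope
    return all(child_scope == node_scope for child_scope in children_scopes)

# mirrors the assertions above:
# scopes_are_complete(frozenset({0, 2, 7, 13}),
#                     [frozenset({0, 2, 7, 13})] * 4)   -> True
# scopes_are_complete(frozenset({0, 2, 7, 13}),
#                     [frozenset({0, 7, 13})])          -> False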
Ejemplo n.º 35
0
    def fit_structure(self, data):

        #
        # a queue containing the data slices to process
        slices_to_process = deque()

        # a stack for building nodes
        building_stack = deque()

        # a dict to keep track of id->nodes
        node_id_assoc = {}

        # creating the first slice
        whole_slice = DataSlice.whole_slice(data.shape[0], data.shape[1])
        slices_to_process.append(whole_slice)

        cluster_first = self._cluster_first

        #
        # iteratively process & split slices
        #
        while slices_to_process:

            # process a slice
            current_slice = slices_to_process.popleft()

            # pointers to the current data slice
            current_instances = current_slice.instance_ids
            current_features = current_slice.feature_ids
            current_id = current_slice.id

            n_features = len(current_features)

            #             if n_features > 1:
            # #                 # print("removing Zeros")
            #                 datarowsIdx = numpy.sum(data[current_instances, :][:, current_features], 1) > 0
            #                 if not any(datarowsIdx):
            #                     datarowsIdx[0] = True
            #                 current_instances = current_slice.instance_ids[datarowsIdx]

            n_instances = len(current_instances)

            #             if n_instances == 0:
            #                 #too strong cutting the zeroes
            #                 current_instances = [current_slice.instance_ids[0]]
            #                 n_instances = len(current_instances)

            slice_data_rows = data[current_instances, :]
            current_slice_data = slice_data_rows[:, current_features]

            # is this a leaf node or can we split further?
            if n_features == 1 and (current_slice.doNotCluster or
                                    n_instances <= self._min_instances_slice):

                (feature_id, ) = current_features

                if self.family == "poisson":
                    leaf_node = PoissonNode(data, current_instances,
                                            current_features)
                elif self.family == "gaussian":
                    leaf_node = GaussianNode(data, current_instances,
                                             current_features)
                else:
                    raise ValueError(
                        'Unknown distribution family: {}'.format(self.family))

                # storing links
                # input_nodes.append(leaf_node)
                leaf_node.id = current_id
                node_id_assoc[current_id] = leaf_node

            # elif (current_slice_data.shape[0] < self._min_instances_slice):
            # elif ( (n_instances <= self._min_instances_slice and n_features > 1) and current_slice_data.shape[0]  < self._min_instances_slice):
            # elif ((n_instances <= self._min_instances_slice and n_features > 1)):
            elif n_features > 1 and (current_slice.doNotCluster or
                                     n_instances <= self._min_instances_slice):

                # print('into naive factorization')
                child_slices = [
                    DataSlice(current_instances, [feature_id])
                    for feature_id in current_features
                ]
                slices_to_process.extend(child_slices)

                #children_ids = [child.id for child in child_slices]

                for child_slice in child_slices:
                    child_slice.doNotCluster = current_slice.doNotCluster
                    current_slice.add_child(child_slice)
                current_slice.type = ProductNode
                building_stack.append(current_slice)

                prod_node = ProductNode(data, current_instances,
                                        current_features)
                prod_node.id = current_id

                node_id_assoc[current_id] = prod_node

            else:

                split_on_features = False

                # first_run = False
                #
                # first run is a split on rows
                if n_features == 1 or cluster_first:
                    cluster_first = False
                else:

                    if self._ind_test_method in ("pairwise_treeglm", "subsample"):

                        fcdata = current_slice_data

                        if self._ind_test_method == "subsample":
                            #sampled_rows = 2000
                            #sampled_rows = math.floor(current_slice_data.shape[0]*10/100)
                            sampled_rows = self._sub_sample_rows
                            if sampled_rows < current_slice_data.shape[0]:
                                fcdata = current_slice_data[
                                    numpy.random.choice(
                                        current_slice_data.shape[0],
                                        sampled_rows,
                                        replace=False)]
                            else:
                                fcdata = current_slice_data

                        #Using R
                        #from pdn.independenceptest import getIndependentGroups
                        #feature_clusters = retrieve_clustering(getIndependentGroups(fcdata, alpha=self._alpha, family=self.family), current_features)
                        feature_clusters = retrieve_clustering(
                            getIndependentGroupsStabilityTest(
                                fcdata, alpha=self._alpha), current_features)
                    elif self._ind_test_method == "KMeans":

                        feature_clusters = retrieve_clustering(
                            cluster_rows(
                                (data[current_instances, :][:,
                                                            current_features]
                                 ).T,
                                n_clusters=2,
                                cluster_method=self._row_cluster_method,
                                n_iters=self._n_iters,
                                n_restarts=self._n_restarts,
                                cluster_prep_method="sqrt",
                                cluster_penalty=self._cluster_penalty,
                                rand_gen=self._rand_gen,
                                sklearn_args=self._sklearn_args),
                            current_instances)
                    else:
                        raise ValueError(
                            'Unknown independence test method: {}'.format(
                                self._ind_test_method))

                    split_on_features = len(feature_clusters) > 1

                #
                # have dependent components been found?
                if split_on_features:
                    #
                    # splitting on columns
                    # print('---> Splitting on features')
                    # print(feature_clusters)

                    slices = [
                        DataSlice(current_instances, cluster)
                        for cluster in feature_clusters
                    ]

                    slices_to_process.extend(slices)

                    current_slice.type = ProductNode
                    building_stack.append(current_slice)
                    for child_slice in slices:
                        current_slice.add_child(child_slice)

                    prod_node = ProductNode(data, current_instances,
                                            current_features)
                    prod_node.id = current_id
                    node_id_assoc[current_id] = prod_node

                else:
                    # print('---> Splitting on rows')

                    k_row_clusters = min(self._n_cluster_splits,
                                         n_instances - 1)

                    if n_features == 1:
                        # do one kmeans run with K large enough to split into N min instances
                        k_row_clusters = math.floor(
                            n_instances / self._min_instances_slice) + 1
                        k_row_clusters = min(k_row_clusters, n_instances - 1)

                    clustering = retrieve_clustering(
                        cluster_rows(
                            data[current_instances, :][:, current_features],
                            n_clusters=k_row_clusters,
                            cluster_method=self._row_cluster_method,
                            n_iters=self._n_iters,
                            n_restarts=self._n_restarts,
                            cluster_prep_method=self._cluster_prep_method,
                            cluster_penalty=self._cluster_penalty,
                            rand_gen=self._rand_gen,
                            sklearn_args=self._sklearn_args),
                        current_instances)

                    cluster_slices = [
                        DataSlice(cluster, current_features)
                        for cluster in clustering
                    ]

                    if len(clustering) < k_row_clusters:
                        for cluster_slice in cluster_slices:
                            cluster_slice.doNotCluster = True

                    n_instances_clusters = sum(
                        [len(cluster) for cluster in clustering])
                    cluster_weights = [
                        len(cluster) / n_instances_clusters
                        for cluster in clustering
                    ]

                    slices_to_process.extend(cluster_slices)

                    current_slice.type = SumNode
                    building_stack.append(current_slice)
                    for child_slice, child_weight in zip(
                            cluster_slices, cluster_weights):
                        current_slice.add_child(child_slice, child_weight)

                    sum_node = SumNode(data, current_instances,
                                       current_features)
                    sum_node.id = current_id
                    node_id_assoc[current_id] = sum_node

        root_node = SpnFactory.pruned_spn_from_slices(node_id_assoc,
                                                      building_stack, True)

        spn = SpnFactory.layered_linked_spn(root_node, data, self.config)

        return spn
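
When fit_structure splits on rows, the children of the resulting sum node are weighted by the relative size of their clusters (the cluster_weights computation above). Isolated as a tiny helper (hypothetical name), that step is just:

def proportional_weights(clusters):
    # weight each row cluster by its share of the clustered instances
    n_instances = sum(len(cluster) for cluster in clusters)
    return [len(cluster) / n_instances for cluster in clusters]

# e.g. proportional_weights([[0, 1, 2], [3]]) == [0.75, 0.25]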
Ejemplo n.º 36
0
    def linked_kernel_density_estimation(cls,
                                         n_instances,
                                         features,
                                         node_dict=None,
                                         alpha=0.1
                                         # ,batch_size=1,
                                         # sparse=False
                                         ):
        """
        WRITEME
        """

        n_features = len(features)

        # the top one is a sum layer with a single node
        root_node = SumNode()
        root_layer = SumLayerLinked([root_node])

        # second one is a product layer with n_instances nodes
        product_nodes = [ProductNode() for i in range(n_instances)]
        product_layer = ProductLayerLinked(product_nodes)
        # linking them to the root node
        for prod_node in product_nodes:
            root_node.add_child(prod_node, 1. / n_instances)

        # last layer can be a categorical smoothed input
        # or sum_layer + categorical indicator input

        input_layer = None
        layers = None
        n_leaf_nodes = n_features * n_instances

        if node_dict is None:
            # creating a sum_layer with n_leaf_nodes
            sum_nodes = [SumNode() for i in range(n_leaf_nodes)]
            # store them into a layer
            sum_layer = SumLayerLinked(sum_nodes)
            # linking them to the products above
            for i, prod_node in enumerate(product_nodes):
                for j in range(n_features):
                    # getting the next n_features nodes
                    prod_node.add_child(sum_nodes[i * n_features + j])
            # now creating the indicator nodes
            input_layer = \
                CategoricalIndicatorLayerLinked(vars=features)
            # linking the sum nodes to the indicator vars
            for i, sum_node in enumerate(sum_nodes):
                # getting the feature id
                j = i % n_features
                # and thus its number of values
                n_values = features[j]
                # getting the indices of indicators
                start_index = sum(features[:j])
                end_index = start_index + n_values
                indicators = [node for node
                              in input_layer.nodes()][start_index:end_index]
                for ind_node in indicators:
                    sum_node.add_child(ind_node, 1. / n_values)

            # storing levels
            layers = [sum_layer, product_layer,
                      root_layer]
        else:
            # create a categorical smoothed layer
            input_layer = \
                CategoricalSmoothedLayerLinked(vars=features,
                                               node_dicts=node_dict,
                                               alpha=alpha)
            # it shall contain n_leaf_nodes nodes
            smooth_nodes = list(input_layer.nodes())
            assert len(smooth_nodes) == n_leaf_nodes

            # linking it
            for i, prod_node in enumerate(product_nodes):
                for j in range(n_features):
                    # getting the next n_features nodes
                    prod_node.add_child(smooth_nodes[i * n_features + j])
            # setting the used levels
            layers = [product_layer, root_layer]

        # create the spn from levels
        kern_spn = SpnLinked(input_layer, layers)
        return kern_spn
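
The SPN built by linked_kernel_density_estimation encodes a uniform mixture over the training instances, each instance contributing a product of per-feature Laplace-smoothed distributions. The sketch below computes that density directly in numpy; it illustrates the intended semantics under those assumptions and is not the SpnFactory code itself.

import numpy

def kde_log_density(query, train, feature_sizes, alpha=0.1):
    # log-density of `query` under a uniform mixture over training rows,
    # each row factorized into per-feature smoothed indicator distributions
    n_instances = train.shape[0]
    log_probs = numpy.zeros(n_instances)
    for i, row in enumerate(train):
        for j, n_values in enumerate(feature_sizes):
            match = 1.0 if row[j] == query[j] else 0.0
            log_probs[i] += numpy.log((match + alpha) /
                                      (1.0 + n_values * alpha))
    # uniform mixture weights 1 / n_instances
    return numpy.logaddexp.reduce(log_probs) - numpy.log(n_instances)

# e.g. with two binary features:
# kde_log_density(numpy.array([0, 1]),
#                 numpy.array([[0, 1], [1, 1]]),
#                 feature_sizes=[2, 2])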