Example #1
def valid():
    from spn.algorithms.Validity import is_valid

    # create_SPN (Example #19) and marginalize (Example #26) are defined elsewhere on this page
    spn = create_SPN()
    spn_marg = marginalize(spn, {1, 2})  # keep only the variables with scope 1 and 2

    print(is_valid(spn))  # a valid SPN yields (True, None)
    print(is_valid(spn_marg))
Example #2
def SPN_Reshape(node, max_children=2):
    v, err = is_valid(node)
    assert v, err
    nodes = get_nodes_by_type(node, (Product, Sum))

    while len(nodes) > 0:
        n = nodes.pop()

        if len(n.children) <= max_children:
            continue

        # node has more than max_children children: split them into a hierarchy of at most max_children per node
        new_children = []
        new_weights = []
        for i in range(0, len(n.children), max_children):
            children = n.children[i:i + max_children]

            if len(children) > 1:
                if isinstance(n, Product):
                    newChild = Product()
                    for c in children:
                        newChild.scope.extend(c.scope)
                    newChild.children.extend(children)
                    new_children.append(newChild)
                else:  # Sum
                    weights = n.weights[i:i + max_children]
                    branch_weight = sum(weights)
                    new_weights.append(branch_weight)

                    newChild = Sum()
                    newChild.scope.extend(children[0].scope)
                    newChild.children.extend(children)
                    newChild.weights.extend(
                        [w / branch_weight for w in weights])
                    newChild.weights[0] = 1.0 - sum(newChild.weights[1:])
                    new_children.append(newChild)
            else:
                new_children.extend(children)

                if isinstance(n, Sum):
                    new_weights.append(1.0 - sum(new_weights))

        n.children = new_children
        if isinstance(n, Sum):
            n.weights = new_weights
        nodes.append(n)

    assign_ids(node)
    v, err = is_valid(node)
    assert v, err
    return node
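A quick usage sketch (hypothetical, not from the original source), assuming the helpers come from spn.structure.Base: a 4-ary Sum is reshaped into a binary hierarchy.

from spn.structure.Base import Sum, assign_ids, rebuild_scopes_bottom_up
from spn.structure.leaves.parametric.Parametric import Categorical

# four leaves over the same scope under a single Sum node
leaves = [Categorical(p=[0.5, 0.5], scope=0) for _ in range(4)]
root = Sum(weights=[0.1, 0.2, 0.3, 0.4], children=leaves)
assign_ids(root)
rebuild_scopes_bottom_up(root)

root = SPN_Reshape(root, max_children=2)
print(len(root.children))  # 2: each new child groups two of the original leaves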
Example #3
def meu(node, input_data, node_top_down_meu=_node_top_down_meu, node_bottom_up_meu=_node_bottom_up_meu, in_place=False):
    valid, err = is_valid(node)
    assert valid, err
    if in_place:
        data = input_data
    else:
        data = np.array(input_data)

    nodes = get_nodes_by_type(node)

    lls_per_node = np.zeros((data.shape[0], len(nodes)))

    # one pass bottom up evaluating the likelihoods
    # log_likelihood(node, data, dtype=data.dtype, node_log_likelihood=node_bottom_up_meu, lls_matrix=lls_per_node)
    likelihood(node, data, dtype=data.dtype, node_likelihood=node_bottom_up_meu, lls_matrix=lls_per_node)

    meu_val = lls_per_node[:, 0]

    instance_ids = np.arange(data.shape[0])

    # one top-down pass picks the max branch at each max node until a leaf is reached; returns all_result and the decisions taken at each max node for each instance
    all_result, all_decisions = eval_spn_top_down_meu(node, node_top_down_meu, parent_result=instance_ids, data=data,
                                                      lls_per_node=lls_per_node)

    decisions = merge_rows_for_decisions(all_decisions)

    return meu_val, decisions
Example #4
    def test_piecewise_linear_simple(self):
        piecewise_spn = 0.5 * PiecewiseLinear([0, 1, 2], [0, 1, 0], [], scope=[0]) + \
                        0.5 * PiecewiseLinear([-2, -1, 0], [0, 1, 0], [], scope=[0])
        self.assertTrue(*is_valid(piecewise_spn))

        mean = get_means(piecewise_spn)
        self.assertTrue(np.all(mean == np.array([[0]])))
Example #5
    def test_piecewise_leaf(self):
        piecewise1 = PiecewiseLinear([0, 1, 2], [0, 1, 0], [], scope=[0])
        piecewise2 = PiecewiseLinear([-2, -1, 0], [0, 1, 0], [], scope=[0])
        self.assertTrue(*is_valid(piecewise1))
        self.assertTrue(*is_valid(piecewise2))

        self.assertTrue(
            np.array_equal(mpe(piecewise1, np.array([[np.nan]])),
                           np.array([[1]])), "mpe should be 1")

        self.assertTrue(
            np.array_equal(mpe(piecewise2, np.array([[np.nan]])),
                           np.array([[-1]])), "mpe should be -1")

        with self.assertRaises(AssertionError) as error:
            mpe(piecewise1, np.array([[1]]))
Example #6
def sample_instances(node, input_data, rand_gen, node_sampling=_node_sampling, in_place=False):
    """
    Implementing hierarchical sampling

    """

    # first, we do a bottom-up pass to compute the likelihood taking into account marginals.
    # then we do a top-down pass, to sample taking into account the likelihoods.

    if in_place:
        data = input_data
    else:
        data = np.array(input_data)

    valid, err = is_valid(node)
    assert valid, err

    assert np.all(
        np.any(np.isnan(data), axis=1)), "each row must have at least one NaN value where the samples will be substituted"

    nodes = get_nodes_by_type(node)

    lls_per_node = np.zeros((data.shape[0], len(nodes)))

    log_likelihood(node, data, dtype=data.dtype, lls_matrix=lls_per_node)

    instance_ids = np.arange(data.shape[0])

    eval_spn_top_down(node, node_sampling, input_vals=instance_ids, data=data, lls_per_node=lls_per_node,
                      rand_gen=rand_gen)

    return data
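A usage sketch along the lines of the SPFlow documentation (create_SPN is assumed to be the toy SPN builder from Example #19):

import numpy as np
from numpy.random.mtrand import RandomState

spn = create_SPN()
# every row needs at least one NaN; exactly those cells get sampled
placeholders = np.array([np.nan, 0, 0] * 5).reshape(-1, 3)
samples = sample_instances(spn, placeholders, RandomState(123))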
Example #7
def mpe(
    node,
    input_data,
    node_top_down_mpe=_node_top_down_mpe,
    node_bottom_up_mpe_log=_node_bottom_up_mpe_log,
    in_place=False,
):
    valid, err = is_valid(node)
    assert valid, err

    assert np.all(
        np.any(np.isnan(input_data), axis=1)
    ), "each row must have at least one NaN value where the samples will be substituted"

    if in_place:
        data = input_data
    else:
        data = np.array(input_data)

    nodes = get_nodes_by_type(node)

    lls_per_node = np.zeros((data.shape[0], len(nodes)))

    # one pass bottom up evaluating the likelihoods
    log_likelihood(node, data, dtype=data.dtype, node_log_likelihood=node_bottom_up_mpe_log, lls_matrix=lls_per_node)

    instance_ids = np.arange(data.shape[0])

    # one pass top down to decide on the max branch until it reaches a leaf, then it fills the nan slot with the mode
    eval_spn_top_down(node, node_top_down_mpe, parent_result=instance_ids, data=data, lls_per_node=lls_per_node)

    return data
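A usage sketch following the SPFlow documentation (again assuming the toy SPN from Example #19): NaN cells are filled with their most probable values given the observed columns.

import numpy as np

spn = create_SPN()
query = np.array([[np.nan, 0, 0], [np.nan, 0, 1]])
print(mpe(spn, query))  # the NaN column is replaced by its MPE assignment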
Example #8
def marginalize(node, scope):
    assert isinstance(scope, set), "scope must be a set"

    def marg_recursive(node):
        node_scope = set(node.scope)

        if node_scope.issubset(scope):
            return None

        if isinstance(node, Leaf):
            if len(node.scope) > 1:
                raise Exception('Leaf Node with |scope| > 1')

            return node

        newNode = node.__class__()

        # a sum node gets copied with all its children, or gets removed completely
        if isinstance(node, Sum):
            newNode.weights.extend(node.weights)

        for i, c in enumerate(node.children):
            newChildren = marg_recursive(c)
            if newChildren is None:
                continue

            newNode.children.append(newChildren)
        return newNode

    newNode = marg_recursive(node)
    rebuild_scopes_bottom_up(newNode)
    newNode = prune(newNode)
    assign_ids(newNode)
    valid, err = is_valid(newNode)
    assert valid, err
    return newNode
Example #9
def EM_optimization(spn, data, iterations=5, node_updates=_node_updates, skip_validation=False, **kwargs):
    if not skip_validation:
        valid, err = is_valid(spn)
        assert valid, "invalid spn: " + err

    lls_per_node = np.zeros((data.shape[0], get_number_of_nodes(spn)))

    for _ in range(iterations):
        # one pass bottom up evaluating the likelihoods
        log_likelihood(spn, data, dtype=data.dtype, lls_matrix=lls_per_node)

        gradients = gradient_backward(spn, lls_per_node)

        R = lls_per_node[:, 0]

        for node_type, func in node_updates.items():
            for node in get_nodes_by_type(spn, node_type):
                func(
                    node,
                    node_lls=lls_per_node[:, node.id],
                    node_gradients=gradients[:, node.id],
                    root_lls=R,
                    all_lls=lls_per_node,
                    all_gradients=gradients,
                    data=data,
                    **kwargs
                )
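A minimal usage sketch (hypothetical data), assuming an already-built SPN over three binary variables and a NumPy matrix with one column per variable:

import numpy as np

data = np.array([[0, 1, 0], [1, 0, 1], [1, 1, 1]], dtype=np.float64)
EM_optimization(spn, data, iterations=10)  # updates the node parameters in place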
Example #10
def create_SPN2():
    from spn.structure.Base import assign_ids
    from spn.structure.Base import rebuild_scopes_bottom_up

    from spn.algorithms.Validity import is_valid
    from spn.structure.leaves.parametric.Parametric import Categorical

    from spn.structure.Base import Sum, Product

    p0 = Product(children=[
        Categorical(p=[0.3, 0.7], scope=1),
        Categorical(p=[0.4, 0.6], scope=2)
    ])
    p1 = Product(children=[
        Categorical(p=[0.5, 0.5], scope=1),
        Categorical(p=[0.6, 0.4], scope=2)
    ])
    s1 = Sum(weights=[0.3, 0.7], children=[p0, p1])
    p2 = Product(children=[Categorical(p=[0.2, 0.8], scope=0), s1])
    p3 = Product(children=[
        Categorical(p=[0.2, 0.8], scope=0),
        Categorical(p=[0.3, 0.7], scope=1)
    ])
    p4 = Product(children=[p3, Categorical(p=[0.4, 0.6], scope=2)])
    spn = Sum(weights=[0.4, 0.6], children=[p2, p4])

    assign_ids(spn)
    rebuild_scopes_bottom_up(spn)

    val, msg = is_valid(spn)
    assert val, msg

    return spn
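The returned SPN can then be queried; a short sketch following the SPFlow documentation:

import numpy as np
from spn.algorithms.Inference import log_likelihood

spn = create_SPN2()
test_data = np.array([1.0, 0.0, 1.0]).reshape(-1, 3)
print(log_likelihood(spn, test_data))  # log P(x0=1, x1=0, x2=1)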
Example #11
    def test_piecewise_linear_simple(self):
        piecewise_spn = 0.5 * PiecewiseLinear(
            [0, 1, 2], [0, 1, 0], [], scope=[0]) + 0.5 * PiecewiseLinear(
                [-2, -1, 0], [0, 1, 0], [], scope=[0])
        self.assertTrue(*is_valid(piecewise_spn))

        mean = get_mean(piecewise_spn)
        self.assertAlmostEqual(0.0, mean[0, 0], 5)
Example #12
def Prune(node):
    v, err = is_valid(node)
    assert v, err
    nodes = get_nodes_by_type(node, (Product, Sum, Max))

    while len(nodes) > 0:
        n = nodes.pop()

        n_type = type(n)
        is_sum = n_type == Sum

        i = 0
        while i < len(n.children):
            c = n.children[i]

            # if a child has only one child of its own, we can remove it
            # and link directly to that grandchild
            if not (isinstance(c, Leaf) or isinstance(c, Max)) and \
                    len(c.children) == 1:

                n.children[i] = c.children[0]
                continue

            if n_type == type(c):
                del n.children[i]
                n.children.extend(c.children)

                if is_sum:
                    w = n.weights[i]
                    del n.weights[i]

                    n.weights.extend([cw * w for cw in c.weights])
                continue

            i += 1
        if is_sum and i > 0:
            n.weights[0] = 1.0 - sum(n.weights[1:])

    if isinstance(node, (Product, Sum)) and len(node.children) == 1:
        node = node.children[0]

    assign_ids(node)
    v, err = is_valid(node)
    assert v, err
    return node
Example #13
def Prune(node, contract_single_parents=True, ds_context=None):
    v, err = is_valid(node)
    assert v, err
    nodes = get_nodes_by_type(node, (Product, Sum))

    while len(nodes) > 0:
        n = nodes.pop()

        n_type = type(n)
        is_sum = n_type == Sum

        i = 0
        while i < len(n.children):
            c = n.children[i]

            # if a child has only one child of its own, we can remove it and link directly to that grandchild
            if contract_single_parents and not isinstance(c, Leaf) and len(
                    c.children) == 1:
                n.children[i] = c.children[0]
                continue

            if n_type == type(c):
                del n.children[i]
                n.children.extend(c.children)

                if is_sum:
                    w = n.weights[i]
                    del n.weights[i]
                    # #merge rules
                    # n.rule = n.rule.merge(c.rule, ds_context)
                    n.weights.extend([cw * w for cw in c.weights])
                continue

            i += 1
        if is_sum and i > 0:
            n.weights[0] = 1.0 - sum(n.weights[1:])

    if contract_single_parents and isinstance(node, (Product, Sum)) and len(
            node.children) == 1:
        node = node.children[0]

    assign_ids(node)
    v, err = is_valid(node)
    assert v, err
    return node
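A small sketch of what Prune contracts (hypothetical nodes): the single-child inner Sum disappears and the root links directly to its grandchild.

from spn.structure.Base import Sum, assign_ids, rebuild_scopes_bottom_up
from spn.structure.leaves.parametric.Parametric import Categorical

inner = Sum(weights=[1.0], children=[Categorical(p=[0.5, 0.5], scope=0)])
root = Sum(weights=[0.3, 0.7], children=[inner, Categorical(p=[0.2, 0.8], scope=0)])
assign_ids(root)
rebuild_scopes_bottom_up(root)

root = Prune(root)  # inner is gone; root now holds the two Categorical leaves directly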
Example #14
    def test_histogram_combined(self):
        piecewise_spn = ((0.5 * PiecewiseLinear([0, 1, 2], [0, 1, 0], [], scope=[0]) +
                          0.5 * PiecewiseLinear([-2, -1, 0], [0, 1, 0], [], scope=[0])) *
                         (0.5 * PiecewiseLinear([0, 1, 2], [0, 1, 0], [], scope=[1]) +
                          0.5 * PiecewiseLinear([-1, 0, 1], [0, 1, 0], [], scope=[1])))

        self.assertTrue(*is_valid(piecewise_spn))

        mean = get_means(piecewise_spn)
        self.assertTrue(np.all(mean == np.array([[0., 0.5]])))
Example #15
    def test_compression_leaves(self):
        C1 = Gaussian(mean=1, stdev=0, scope=0)
        C2 = Gaussian(mean=1, stdev=0, scope=0)

        A = 0.7 * C1 + 0.3 * C2

        Compress(A)

        self.assertTrue(*is_valid(A))
        self.assertEqual(id(A.children[0]), id(A.children[1]))

        C1 = Gaussian(mean=1, stdev=0, scope=0)
        C2 = Gaussian(mean=1, stdev=0, scope=1)

        B = C1 * C2

        Compress(B)
        self.assertTrue(*is_valid(B))

        self.assertNotEqual(id(B.children[0]), id(B.children[1]))
Example #16
    def test_histogram_combined(self):
        piecewise_spn = (
            0.5 * PiecewiseLinear([0, 1, 2], [0, 1, 0], [], scope=[0]) +
            0.5 * PiecewiseLinear([-2, -1, 0], [0, 1, 0], [], scope=[0])) * (
                0.5 * PiecewiseLinear([0, 1, 2], [0, 1, 0], [], scope=[1]) +
                0.5 * PiecewiseLinear([-1, 0, 1], [0, 1, 0], [], scope=[1]))

        self.assertTrue(*is_valid(piecewise_spn))

        mean = get_mean(piecewise_spn)
        self.assertAlmostEqual(0.0, mean[0, 0], 5)
        self.assertAlmostEqual(0.5, mean[0, 1], 5)
Example #17
def learn_classifier(data, ds_context, spn_learn_wrapper, label_idx, **kwargs):
    spn = Sum()
    for label, count in zip(*np.unique(data[:, label_idx], return_counts=True)):
        branch = spn_learn_wrapper(data[data[:, label_idx] == label, :], ds_context, **kwargs)
        spn.children.append(branch)
        spn.weights.append(count / data.shape[0])

    spn.scope.extend(branch.scope)
    assign_ids(spn)

    valid, err = is_valid(spn)
    assert valid, "invalid spn: " + err

    return spn
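A usage sketch following the SPFlow documentation: two Gaussian features plus a class label in column 2, learned with learn_parametric as the wrapper:

import numpy as np
from spn.algorithms.LearningWrappers import learn_classifier, learn_parametric
from spn.structure.Base import Context
from spn.structure.leaves.parametric.Parametric import Categorical, Gaussian

np.random.seed(123)
# two Gaussian features; the 0/1 class label sits in column 2
train_data = np.c_[np.r_[np.random.normal(5, 1, (500, 2)),
                         np.random.normal(10, 1, (500, 2))],
                   np.r_[np.zeros((500, 1)), np.ones((500, 1))]]
ds_context = Context(parametric_types=[Gaussian, Gaussian, Categorical]).add_domains(train_data)
spn_classification = learn_classifier(train_data, ds_context, learn_parametric, 2)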
Example #18
    def test_compression_internal_nodes(self):
        C1 = Gaussian(mean=1, stdev=0, scope=0)
        C2 = Gaussian(mean=1, stdev=1, scope=1)
        C3 = Gaussian(mean=1, stdev=0, scope=0)
        C4 = Gaussian(mean=1, stdev=1, scope=1)

        R = 0.4 * (C1 * C2) + 0.6 * (C3 * C4)

        Compress(R)
        self.assertTrue(*is_valid(R))

        self.assertEqual(id(R.children[0]), id(R.children[1]))
        self.assertEqual(id(R.children[0].children[0]), id(C1))
        self.assertEqual(id(R.children[0].children[1]), id(C2))
Example #19
def create_SPN():
    from spn.algorithms.Validity import is_valid

    from spn.structure.leaves.parametric.Parametric import Categorical

    spn = 0.4 * (Categorical(p=[0.2, 0.8], scope=0) * \
                 (0.3 * (Categorical(p=[0.3, 0.7], scope=1) * Categorical(p=[0.4, 0.6], scope=2)) + \
                  0.7 * (Categorical(p=[0.5, 0.5], scope=1) * Categorical(p=[0.6, 0.4], scope=2)))) \
          + 0.6 * (Categorical(p=[0.2, 0.8], scope=0) * \
                   Categorical(p=[0.3, 0.7], scope=1) * \
                   Categorical(p=[0.4, 0.6], scope=2))

    val, msg = is_valid(spn)
    assert val, msg

    return spn
Example #20
    def _deserialize_model(self, model):
        rootID = model.rootNode
        featureType = model.featureType
        name = model.name
        if name == "":
            name = None
        rootNodes = self._binary_deserialize_graph(model.nodes)
        for root in rootNodes:
            rebuild_scopes_bottom_up(root)
            valid, err = is_valid(root)
            assert valid, f"SPN invalid after deserialization: {err}"

        rootNode = next((root for root in rootNodes if root.id == rootID), None)
        if rootNode is None:
            logger.error(f"Did not find serialized root node {rootID}")
        return SPNModel(rootNode, featureType, name)
Example #21
    def test_piecewise_linear_simple(self):
        piecewise_spn = 0.5 * PiecewiseLinear([0, 1, 2], [0, 1, 0], [], scope=[0]) + \
                        0.5 * PiecewiseLinear([-2, -1, 0], [0, 1, 0], [], scope=[0])
        self.assertTrue(*is_valid(piecewise_spn))

        evidence = np.array([[0.5], [1.5], [-0.5], [-1.5]])

        results = gradient_forward(piecewise_spn, evidence)
        expected_results = np.array([[0.5], [-0.5], [-0.5], [0.5]])

        for i, _ in enumerate(evidence):
            self.assertTrue(
                results[i] == expected_results[i],
                'Expected result was {}, but computed result was {}'.format(
                    expected_results[i], results[i]))
Example #22
    def test_compression_leaves_deeper(self):
        C1 = Gaussian(mean=1, stdev=0, scope=0)
        C2 = Gaussian(mean=1, stdev=1, scope=1)
        C3 = Gaussian(mean=1, stdev=0, scope=0)
        C4 = Gaussian(mean=2, stdev=0, scope=1)

        R = 0.4 * (C1 * C2) + 0.6 * (C3 * C4)

        Compress(R)
        self.assertTrue(*is_valid(R))

        self.assertNotEqual(id(R.children[0]), id(R.children[1]))
        self.assertEqual(id(R.children[0].children[0]), id(C1))
        self.assertEqual(id(R.children[0].children[1]), id(C2))
        self.assertEqual(id(R.children[1].children[0]), id(C1))
        self.assertEqual(id(R.children[1].children[1]), id(C4))
Example #23
    def test_piecewise_linear_combined(self):
        piecewise_spn = (
            0.5 * PiecewiseLinear([0, 1, 2], [0, 1, 0], [], scope=[0]) +
            0.5 * PiecewiseLinear([-2, -1, 0], [0, 1, 0], [], scope=[0])) * (
                0.5 * PiecewiseLinear([0, 1, 2], [0, 1, 0], [], scope=[1]) +
                0.5 * PiecewiseLinear([-1, 0, 1], [0, 1, 0], [], scope=[1]))

        self.assertTrue(*is_valid(piecewise_spn))

        evidence = np.array([[0.5, 0], [-0.5, -0.5], [-1.5, 0.5]])
        results = feature_gradient(piecewise_spn, evidence)
        expected_results = np.array([[0.25, 0.125], [-0.125, 0.125], [0.25,
                                                                      0]])

        self.assertTrue(
            np.all(np.isclose(results, expected_results, atol=0.000001)),
            "Expected result was {}, but computed result was {}".format(
                expected_results, results),
        )
Example #24
    def test_piecewise_linear_combined(self):
        piecewise_spn = (
            (0.5 * PiecewiseLinear([0, 1, 2], [0, 1, 0], [], scope=[0]) +
             0.5 * PiecewiseLinear([-2, -1, 0], [0, 1, 0], [], scope=[0])) *
            (0.5 * PiecewiseLinear([0, 1, 2], [0, 1, 0], [], scope=[1]) +
             0.5 * PiecewiseLinear([-1, 0, 1], [0, 1, 0], [], scope=[1])))

        self.assertTrue(*is_valid(piecewise_spn))

        evidence = np.array([[0.5, 0], [100, 36], [-0.5, -0.5], [-1.5, 0.5]])
        results = gradient_forward(piecewise_spn, evidence)
        expected_results = np.array([[0.25, 0.125], [0, 0], [-0.125, 0.125],
                                     [0.25, 0]])

        for i, _ in enumerate(evidence):
            self.assertTrue(
                np.all(np.equal(results[i], expected_results[i])),
                'Expected result was {}, but computed result was {}'.format(
                    expected_results[i], results[i]))
Example #25
def Compress(node):
    all_parents = get_parents(node)

    cache = {}

    for n in get_topological_order(node):

        params = (n.parameters, tuple(sorted(n.scope)))

        cached_node = cache.get(params, None)
        if cached_node is None:
            cache[params] = n
        else:
            for parent, pos in all_parents[n]:
                parent.children[pos] = cached_node

    assign_ids(node)
    val, msg = is_valid(node)
    assert val, msg
    return node
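A short sketch mirroring test_compression_leaves from Example #15: two parameter-identical leaves collapse into one shared node.

from spn.structure.leaves.parametric.Parametric import Gaussian

C1 = Gaussian(mean=1, stdev=0, scope=0)
C2 = Gaussian(mean=1, stdev=0, scope=0)
spn = 0.7 * C1 + 0.3 * C2  # Sum node built via SPFlow's operator overloading

Compress(spn)
assert id(spn.children[0]) == id(spn.children[1])  # both slots share one leaf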
Example #26
def marginalize(node, keep):
    # keep must be the set of features (scope indices) that you want to keep
    keep = set(keep)

    def marg_recursive(node):
        new_node_scope = keep.intersection(set(node.scope))

        if len(new_node_scope) == 0:
            # we are summing out this node
            return None

        if isinstance(node, Leaf):
            if len(node.scope) > 1:
                raise Exception('Leaf Node with |scope| > 1')

            return deepcopy(node)

        newNode = node.__class__()

        if isinstance(node, Sum):
            newNode.weights.extend(node.weights)

        for c in node.children:
            new_c = marg_recursive(c)
            if new_c is None:
                continue
            newNode.children.append(new_c)

        newNode.scope.extend(new_node_scope)
        return newNode

    newNode = marg_recursive(node)
    assign_ids(newNode)
    newNode = Prune(newNode)
    valid, err = is_valid(newNode)
    assert valid, err

    return newNode
Example #27
def EM_optimization(spn,
                    data,
                    iterations=5,
                    node_updates=_node_updates,
                    skip_validation=False,
                    **kwargs):
    if not skip_validation:
        valid, err = is_valid(spn)
        assert valid, "invalid spn: " + err

    lls_per_node = np.zeros((data.shape[0], get_number_of_nodes(spn)))

    # node_updates = {Sum_sharedWeights: sum_em_update_shared}
    for _ in range(iterations):
        # one pass bottom up evaluating the likelihoods
        log_likelihood(spn, data, lls_matrix=lls_per_node)  # dtype=data.dtype

        gradients = gradient_backward(spn, lls_per_node)

        weights = [
            node.weights if isinstance(node, Sum_sharedWeights) else None
            for node in get_nodes_by_type(spn)
        ]

        R = lls_per_node[:, 0]
        for node_type, func in node_updates.items():
            for node in get_nodes_by_type(spn, node_type):
                func(node,
                     node_lls=lls_per_node[:, node.id],
                     node_gradients=gradients[:, node.id],
                     root_lls=R,
                     all_lls=lls_per_node,
                     all_gradients=gradients,
                     data=data,
                     spn=spn,
                     weights=weights,
                     **kwargs)
Example #28
    def _serialize_model(self, model):
        msg = spflow_capnp.Model.new_message()
        valid, err = is_valid(model.root)
        assert valid, f"SPN invalid before serialization: {err}"
        # Assign (new) IDs to the nodes.
        # Keep track of already assigned IDs, so the IDs are
        # unique for the whole file.
        assign_ids(model.root, self.assignedIDs)
        # Rebuild scopes bottom-up.
        rebuild_scopes_bottom_up(model.root)
        msg.rootNode = model.root.id
        msg.numFeatures = len(model.root.scope)
        msg.featureType = model.featureType
        scope = msg.init("scope", len(model.root.scope))
        for i, v in enumerate(model.root.scope):
            scope[i] = self._unwrap_value(v)
        name = ""
        if model.name is not None:
            name = model.name
        msg.name = name
        numNodes = get_number_of_nodes(model.root)
        nodes = msg.init("nodes", numNodes)
        nodeList = ListHandler(nodes)
        self._serialize_graph([model.root], nodeList)
        return msg
Example #29
def learn_structure(
    dataset,
    ds_context,
    split_rows,
    split_cols,
    create_leaf,
    next_operation=get_next_operation(),
    initial_scope=None,
    data_slicer=default_slicer,
):
    assert dataset is not None
    assert ds_context is not None
    assert split_rows is not None
    assert split_cols is not None
    assert create_leaf is not None
    assert next_operation is not None

    root = Product()
    root.children.append(None)

    if initial_scope is None:
        initial_scope = list(range(dataset.shape[1]))
        num_conditional_cols = None
    elif len(initial_scope) < dataset.shape[1]:
        num_conditional_cols = dataset.shape[1] - len(initial_scope)
    else:
        num_conditional_cols = None
        assert len(initial_scope) > dataset.shape[
            1], "check initial scope: %s" % initial_scope

    tasks = deque()
    tasks.append((dataset, root, 0, initial_scope, False, False))

    while tasks:

        local_data, parent, children_pos, scope, no_clusters, no_independencies = tasks.popleft()

        operation, op_params = next_operation(
            local_data,
            scope,
            create_leaf,
            no_clusters=no_clusters,
            no_independencies=no_independencies,
            is_first=(parent is root),
        )

        logging.debug("OP: {} on slice {} (remaining tasks {})".format(
            operation, local_data.shape, len(tasks)))

        if operation == Operation.REMOVE_UNINFORMATIVE_FEATURES:
            node = Product()
            node.scope.extend(scope)
            parent.children[children_pos] = node

            rest_scope = set(range(len(scope)))
            for col in op_params:
                rest_scope.remove(col)
                node.children.append(None)
                tasks.append((
                    data_slicer(local_data, [col], num_conditional_cols),
                    node,
                    len(node.children) - 1,
                    [scope[col]],
                    True,
                    True,
                ))

            next_final = False

            if len(rest_scope) == 0:
                continue
            elif len(rest_scope) == 1:
                next_final = True

            node.children.append(None)
            c_pos = len(node.children) - 1

            rest_cols = list(rest_scope)
            rest_scope = [scope[col] for col in rest_scope]

            tasks.append((
                data_slicer(local_data, rest_cols, num_conditional_cols),
                node,
                c_pos,
                rest_scope,
                next_final,
                next_final,
            ))

            continue

        elif operation == Operation.SPLIT_ROWS:

            split_start_t = perf_counter()
            data_slices = split_rows(local_data, ds_context, scope)
            split_end_t = perf_counter()
            logging.debug("\t\tfound {} row clusters (in {:.5f} secs)".format(
                len(data_slices), split_end_t - split_start_t))

            if len(data_slices) == 1:
                tasks.append(
                    (local_data, parent, children_pos, scope, True, False))
                continue

            node = Sum()
            node.scope.extend(scope)
            parent.children[children_pos] = node
            # assert parent.scope == node.scope

            for data_slice, scope_slice, proportion in data_slices:
                assert isinstance(scope_slice, list), "slice must be a list"

                node.children.append(None)
                node.weights.append(proportion)
                tasks.append((data_slice, node, len(node.children) - 1, scope,
                              False, False))

            continue

        elif operation == Operation.SPLIT_COLUMNS:
            split_start_t = perf_counter()
            data_slices = split_cols(local_data, ds_context, scope)
            split_end_t = perf_counter()
            logging.debug("\t\tfound {} col clusters (in {:.5f} secs)".format(
                len(data_slices), split_end_t - split_start_t))

            if len(data_slices) == 1:
                tasks.append(
                    (local_data, parent, children_pos, scope, False, True))
                assert np.shape(data_slices[0][0]) == np.shape(local_data)
                assert data_slices[0][1] == scope
                continue

            node = Product()
            node.scope.extend(scope)
            parent.children[children_pos] = node

            for data_slice, scope_slice, _ in data_slices:
                assert isinstance(scope_slice, list), "slice must be a list"

                node.children.append(None)
                tasks.append((data_slice, node, len(node.children) - 1,
                              scope_slice, False, False))

            continue

        elif operation == Operation.NAIVE_FACTORIZATION:
            node = Product()
            node.scope.extend(scope)
            parent.children[children_pos] = node

            local_tasks = []
            local_children_params = []
            split_start_t = perf_counter()
            for col in range(len(scope)):
                node.children.append(None)
                # tasks.append((data_slicer(local_data, [col], num_conditional_cols), node, len(node.children) - 1, [scope[col]], True, True))
                local_tasks.append(len(node.children) - 1)
                child_data_slice = data_slicer(local_data, [col],
                                               num_conditional_cols)
                local_children_params.append(
                    (child_data_slice, ds_context, [scope[col]]))

            # `pool` is a module-level multiprocessing.Pool in the original source
            result_nodes = pool.starmap(create_leaf, local_children_params)
            # result_nodes = []
            # for l in tqdm(local_children_params):
            #    result_nodes.append(create_leaf(*l))
            # result_nodes = [create_leaf(*l) for l in local_children_params]
            for child_pos, child in zip(local_tasks, result_nodes):
                node.children[child_pos] = child

            split_end_t = perf_counter()

            logging.debug(
                "\t\tnaive factorization {} columns (in {:.5f} secs)".format(
                    len(scope), split_end_t - split_start_t))

            continue

        elif operation == Operation.CREATE_LEAF:
            leaf_start_t = perf_counter()
            node = create_leaf(local_data, ds_context, scope)
            parent.children[children_pos] = node
            leaf_end_t = perf_counter()

            logging.debug(
                "\t\t created leaf {} for scope={} (in {:.5f} secs)".format(
                    node.__class__.__name__, scope, leaf_end_t - leaf_start_t))

        else:
            raise Exception("Invalid operation: " + str(operation))

    node = root.children[0]
    assign_ids(node)
    valid, err = is_valid(node)
    assert valid, "invalid spn: " + err
    node = Prune(node)
    valid, err = is_valid(node)
    assert valid, "invalid spn: " + err

    return node
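learn_structure is usually invoked through a wrapper that supplies split_rows, split_cols and create_leaf; a sketch with SPFlow's learn_parametric, following the documentation:

import numpy as np
from spn.algorithms.LearningWrappers import learn_parametric
from spn.structure.Base import Context
from spn.structure.leaves.parametric.Parametric import Categorical, Gaussian

np.random.seed(123)
a = np.random.randint(2, size=1000).reshape(-1, 1)
b = np.random.randint(3, size=1000).reshape(-1, 1)
c = np.r_[np.random.normal(10, 5, (300, 1)), np.random.normal(20, 10, (700, 1))]
train_data = np.c_[a, b, c]
ds_context = Context(parametric_types=[Categorical, Categorical, Gaussian]).add_domains(train_data)
spn = learn_parametric(train_data, ds_context, min_instances_slice=20)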
Example #30
def spn_for_evidence(spn,
                     evidence_ranges,
                     node_likelihood=None,
                     distribution_update_ranges=None):
    from spn.structure.Base import Sum, Product, Leaf, assign_ids
    from spn.algorithms.TransformStructure import Prune
    from spn.algorithms.Validity import is_valid
    from copy import deepcopy
    import numpy as np

    def spn_for_evidence_recursive(node):

        if isinstance(node, Leaf):
            if len(node.scope) > 1:
                raise Exception("Leaf Node with |scope| > 1")

            if evidence_ranges[node.scope[0]] is not None:
                t_node = type(node)
                if t_node in node_likelihood:
                    ranges = np.array([evidence_ranges])
                    prob = node_likelihood[t_node](
                        node, ranges, node_likelihood=node_likelihood)[0][0]
                    if prob == 0:
                        newNode = deepcopy(node)
                    else:
                        newNode = deepcopy(node)
                        distribution_update_ranges[t_node](
                            newNode, evidence_ranges[node.scope[0]])
                else:
                    raise Exception(
                        'No log-likelihood method specified for node type: ' +
                        str(type(node)))
            else:
                prob = 1
                newNode = deepcopy(node)

            return prob, newNode

        newNode = node.__class__()
        newNode.scope = node.scope

        if isinstance(node, Sum):
            new_weights = []
            new_childs = []

            for i, c in enumerate(node.children):
                prob, new_child = spn_for_evidence_recursive(c)
                new_prob = prob * node.weights[i]
                if new_prob > 0:
                    new_weights.append(new_prob)
                    new_childs.append(new_child)

            new_weights = np.array(new_weights)
            newNode.weights = new_weights / np.sum(new_weights)
            newNode.children = new_childs
            return np.sum(new_weights), newNode

        elif isinstance(node, Product):
            new_childs = []

            new_prob = 1.
            for i, c in enumerate(node.children):
                prob, new_child = spn_for_evidence_recursive(c)
                new_prob *= prob
                new_childs.append(new_child)

            newNode.children = new_childs
            return new_prob, newNode

    prob, newNode = spn_for_evidence_recursive(spn)
    assign_ids(newNode)
    newNode = Prune(newNode)
    valid, err = is_valid(newNode)
    assert valid, err

    return prob, newNode