Example #1
0
    def test_optimization(self):
        """Check that EM does not lower the summed log-likelihood of a perturbed SPN.

        The learned SPN gets its ``weights`` and one leaf ``mean`` overwritten,
        EM runs for 10 iterations, and the resulting weights and mean are
        compared against fixed reference values.
        """
        np.random.seed(17)
        # two normal samples (centres 10 and 30) laid out as rows of 10 features
        low_cluster = np.random.normal(10, 5, size=2000).tolist()
        high_cluster = np.random.normal(30, 5, size=2000).tolist()
        data = np.array(low_cluster + high_cluster).reshape((-1, 10)).astype(np.float32)

        n_features = data.shape[1]
        ds_context = Context(
            meta_types=[MetaType.REAL] * n_features,
            parametric_types=[Gaussian] * n_features,
        )

        spn = learn_parametric(data, ds_context)

        # deliberately move the parameters away from the learned values
        spn.weights = [0.8, 0.2]
        spn.children[0].children[0].mean = 3.0

        ll_before = np.sum(log_likelihood(spn, data))
        print(spn.weights, spn.children[0].children[0].mean)

        EM_optimization(spn, data, iterations=10)
        print(spn.weights, spn.children[0].children[0].mean)

        ll_after = np.sum(log_likelihood(spn, data))

        self.assertLessEqual(ll_before, ll_after)
        for idx in (0, 1):
            self.assertAlmostEqual(spn.weights[idx], 0.5, 6)
        self.assertAlmostEqual(spn.children[0].children[0].mean, 10.50531, 4)
Example #2
0
    def test_sum_one_dimension(self):
        """Exercise sum nodes whose two children share scope 0."""
        add_node_likelihood(Leaf, identity_ll)

        def skewed_mixture():
            # fresh 0.1 / 0.9 mixture of two leaves over scope 0
            return 0.1 * Leaf(scope=0) + 0.9 * Leaf(scope=0)

        # basic computations: with a single column the expected result is the data itself
        for w_left, w_right in ((0.5, 0.5), (0.1, 0.9)):
            spn = w_left * Leaf(scope=0) + w_right * Leaf(scope=0)
            data = np.random.rand(10, 1)
            self.assert_correct(spn, data, data)

        # any dataset width may be passed; only the scoped column must be used
        # (this is important for inner nodes)
        spn = skewed_mixture()
        data = np.random.rand(10, 3)
        expected = (0.1 * data[:, 0] + 0.9 * data[:, 0]).reshape(-1, 1)
        self.assert_correct(spn, data, expected)

        # weights that are not normalized must be rejected
        spn = skewed_mixture()
        spn.weights[1] = 0.2
        data = np.random.rand(10, 3)
        with self.assertRaises(AssertionError):
            likelihood(spn, data)
        with self.assertRaises(AssertionError):
            log_likelihood(spn, data)

        # the log space
        spn = skewed_mixture()
        data = np.random.rand(10, 3)
        expected = (0.1 * data[:, 0] + 0.9 * data[:, 0]).reshape(-1, 1)
        self.assert_correct(spn, data, expected)
Example #3
0
def EM_optimization(spn, data, iterations=5, node_updates=_node_updates, skip_validation=False, **kwargs):
    """Run ``iterations`` rounds of per-node updates on ``spn`` using ``data``.

    Each round evaluates the log-likelihoods bottom-up into a shared matrix,
    computes gradients with ``gradient_backward`` and then calls the update
    function registered in ``node_updates`` for every node of each listed type.

    :param spn: root of the SPN; validated with ``is_valid`` unless skipped
    :param data: array with one row per instance
    :param iterations: number of rounds to run
    :param node_updates: mapping from node type to its update function
    :param skip_validation: when true, the ``is_valid`` check is not performed
    :param kwargs: forwarded unchanged to every update function
    """
    if not skip_validation:
        valid, err = is_valid(spn)
        assert valid, "invalid spn: " + err

    # allocated once and overwritten by every bottom-up pass
    lls_per_node = np.zeros((data.shape[0], get_number_of_nodes(spn)))

    for _ in range(iterations):
        # bottom-up pass: fills lls_per_node in place
        log_likelihood(spn, data, dtype=data.dtype, lls_matrix=lls_per_node)

        gradients = gradient_backward(spn, lls_per_node)

        # column 0 is used as the root's log-likelihood
        shared = dict(
            root_lls=lls_per_node[:, 0],
            all_lls=lls_per_node,
            all_gradients=gradients,
            data=data,
        )

        for node_type, update in node_updates.items():
            for node in get_nodes_by_type(spn, node_type):
                update(
                    node,
                    node_lls=lls_per_node[:, node.id],
                    node_gradients=gradients[:, node.id],
                    **shared,
                    **kwargs
                )
Example #4
0
def mpe(
    node,
    input_data,
    node_top_down_mpe=_node_top_down_mpe,
    node_bottom_up_mpe_log=_node_bottom_up_mpe_log,
    in_place=False,
):
    """Complete the NaN entries of ``input_data`` with a bottom-up / top-down pass.

    :param node: root of a valid SPN (checked with ``is_valid``)
    :param input_data: 2D array; every row must contain at least one NaN
    :param node_top_down_mpe: per-node functions used in the top-down pass
    :param node_bottom_up_mpe_log: per-node functions used in the bottom-up pass
    :param in_place: when true, ``input_data`` itself is passed to the passes and
        returned; otherwise a copy is used
    :return: the array that was handed to the top-down pass
    """
    valid, err = is_valid(node)
    assert valid, err

    assert np.all(
        np.any(np.isnan(input_data), axis=1)
    ), "each row must have at least a nan value where the samples will be substituted"

    data = input_data if in_place else np.array(input_data)
    n_rows = data.shape[0]

    # one column per node for the intermediate log-likelihoods
    lls_per_node = np.zeros((n_rows, len(get_nodes_by_type(node))))

    # bottom-up pass evaluating the likelihoods
    log_likelihood(node, data, dtype=data.dtype, node_log_likelihood=node_bottom_up_mpe_log, lls_matrix=lls_per_node)

    # top-down pass: follows the max branch down to the leaves, which fill the nan slots
    eval_spn_top_down(
        node,
        node_top_down_mpe,
        parent_result=np.arange(n_rows),
        data=data,
        lls_per_node=lls_per_node,
    )

    return data
Example #5
0
def EM_optimization(spn,
                    data,
                    iterations=5,
                    node_updates=_node_updates,
                    **kwargs):
    """Run ``iterations`` rounds of per-node updates on ``spn`` using ``data``.

    Every round allocates a fresh log-likelihood matrix, fills it bottom-up,
    computes gradients with ``gradient_backward`` and calls the update function
    registered in ``node_updates`` for each node of the listed types.
    Extra keyword arguments are forwarded to every update function.
    """
    n_rows = data.shape[0]

    for _ in range(iterations):
        lls_per_node = np.zeros((n_rows, get_number_of_nodes(spn)))

        # bottom-up pass: fills lls_per_node in place
        log_likelihood(spn, data, dtype=data.dtype, lls_matrix=lls_per_node)

        gradients = gradient_backward(spn, lls_per_node)

        # column 0 is used as the root's log-likelihood
        root_lls = lls_per_node[:, 0]

        # TODO: do in parallel
        for node_type, update in node_updates.items():
            for node in get_nodes_by_type(spn, node_type):
                node_id = node.id
                update(node,
                       node_lls=lls_per_node[:, node_id],
                       node_gradients=gradients[:, node_id],
                       root_lls=root_lls,
                       all_lls=lls_per_node,
                       all_gradients=gradients,
                       data=data,
                       **kwargs)
Example #6
0
def calc_ll(wspn, data_train, data_pos, data_neg):
    """Compute and report log-likelihoods of three datasets under ``wspn``.

    The median and the mean of each result are printed and also sent to
    ``logger.info``.

    :return: tuple ``(ll_train, ll_pos, ll_neg)`` as returned by ``log_likelihood``
    """

    def _report(message):
        # every message goes to stdout and to the logger
        print(message)
        logger.info(message)

    _report('Log-likelihood calculating...')

    ll_train = log_likelihood(wspn, data_train)
    ll_pos = log_likelihood(wspn, data_pos)
    ll_neg = log_likelihood(wspn, data_neg)

    labelled = (('LL_train=', ll_train), ('LL_test=', ll_pos), ('LL_ood=', ll_neg))
    summaries = (
        ('---------median-----------', np.median),
        ('--------- mean -----------', np.mean),
    )
    for banner, reduce_fn in summaries:
        _report(banner)
        for label, values in labelled:
            _report(label + str(reduce_fn(values)))

    return ll_train, ll_pos, ll_neg
Example #7
0
def sample_instances(node, input_data, rand_gen, node_sampling=_node_sampling, in_place=False):
    """
    Hierarchical sampling.

    A bottom-up pass computes the likelihoods (taking marginals into account),
    then a top-down pass samples using those likelihoods.

    :param node: root of a valid SPN (checked with ``is_valid``)
    :param input_data: 2D array; every row must contain at least one NaN
    :param rand_gen: random generator forwarded to the top-down pass
    :param node_sampling: per-node functions used in the top-down pass
    :param in_place: when true, ``input_data`` itself is used and returned;
        otherwise a copy is made first
    :return: the array that was handed to the top-down pass
    """
    data = input_data if in_place else np.array(input_data)

    valid, err = is_valid(node)
    assert valid, err

    assert np.all(
        np.any(np.isnan(data), axis=1)), "each row must have at least a nan value where the samples will be substituted"

    n_rows = data.shape[0]

    # one column per node for the intermediate log-likelihoods
    lls_per_node = np.zeros((n_rows, len(get_nodes_by_type(node))))
    log_likelihood(node, data, dtype=data.dtype, lls_matrix=lls_per_node)

    eval_spn_top_down(
        node,
        node_sampling,
        input_vals=np.arange(n_rows),
        data=data,
        lls_per_node=lls_per_node,
        rand_gen=rand_gen,
    )

    return data
Example #8
0
def feature_gradient(node,
                     data,
                     node_gradient_functions=_node_feature_gradients,
                     lls_per_node=None):
    r"""
    Feature gradients are computed for the input query and each feature using
    backwards automatic differentiation. In mathematical terms, it computes the
    partial derivatives \partial P(X) / \partial X_i

    :param node: Node for the gradient calculation
    :param data: data for the computation. NaN values are implicitly marginalized out
    :param node_gradient_functions: mapping from leaf type to the function that
        computes that leaf's gradient with respect to the data
    :param lls_per_node: optional matrix for storing the intermediate results;
        a NaN-filled matrix is allocated when it is None
    :return: NaN-ignoring sum over all leaves of the per-leaf gradients
    """

    all_leaves = get_nodes_by_type(node, Leaf)

    # Compare against None explicitly: `not array` raises ValueError for any
    # array with more than one element, so a caller-supplied matrix would crash.
    if lls_per_node is None:
        lls_per_node = np.full((data.shape[0], get_number_of_nodes(node)),
                               np.nan)
    log_likelihood(node, data, lls_matrix=lls_per_node)

    # gradient_backward's result is exponentiated before being used as a weight
    gradients = np.exp(gradient_backward(node, lls_per_node))

    # each leaf's own gradient is scaled by the leaf's column in `gradients`
    node_gradients = np.array([
        node_gradient_functions[type(leaf)](leaf, data) * gradients[:, leaf.id].reshape(-1, 1)
        for leaf in all_leaves
    ])

    return np.nansum(node_gradients, axis=0)
Example #9
0
    def test_Histogram_discrete_inference(self):
        """Compare histogram-leaf probabilities on a small discrete sample with fixed values."""
        # (value passed as `alpha`, expected probability for each of the six rows)
        cases = (
            (False, (2 / 6, 2 / 6, 1 / 6, 3 / 6, 3 / 6, 3 / 6)),
            (True, (3 / 9, 3 / 9, 2 / 9, 4 / 9, 4 / 9, 4 / 9)),
        )

        for alpha, expected in cases:
            data = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
            ds_context = Context([MetaType.DISCRETE])
            ds_context.add_domains(data)
            hist = create_histogram_leaf(data, ds_context, [0], alpha=alpha)
            prob = np.exp(log_likelihood(hist, data))

            for row, wanted in enumerate(expected):
                self.assertAlmostEqual(float(prob[row]), wanted)
Example #10
0
    def test_ll_matrix(self):
        """Check that the per-node likelihood matrix matches hand-computed values.

        A small SPN is built from test leaves, its expected value at every node
        is recomputed directly from the data, and both the likelihood and
        log-likelihood matrices are compared against those expectations.
        """
        add_node_likelihood(Leaf, sum_and_multiplier_ll)

        node_1_1_1_1 = leaf(2, 1)
        node_1_1_1_2 = leaf(2, 2)
        node_1_1_1 = 0.7 * node_1_1_1_1 + 0.3 * node_1_1_1_2
        node_1_1_2 = leaf([0, 1], 3)
        node_1_1 = node_1_1_1 * node_1_1_2
        node_1_2_1_1_1 = leaf(0, 5)
        node_1_2_1_1_2 = leaf(1, 4)
        node_1_2_1_1 = node_1_2_1_1_1 * node_1_2_1_1_2
        node_1_2_1_2 = leaf([0, 1], 6)
        node_1_2_1 = 0.1 * node_1_2_1_1 + 0.9 * node_1_2_1_2
        node_1_2_2 = leaf(2, 3)
        node_1_2 = node_1_2_1 * node_1_2_2
        spn = 0.4 * node_1_1 + 0.6 * node_1_2

        assign_ids(spn)

        max_id = max(n.id for n in get_nodes_by_type(spn))

        data = np.random.rand(10, 10)

        # expected values, mirroring the structure built above node by node
        node_1_1_1_1_r = data[:, 2] * 1
        node_1_1_1_2_r = data[:, 2] * 2
        node_1_1_1_r = 0.7 * node_1_1_1_1_r + 0.3 * node_1_1_1_2_r
        node_1_1_2_r = 3 * (data[:, 0] + data[:, 1])
        node_1_1_r = node_1_1_1_r * node_1_1_2_r
        node_1_2_1_1_1_r = data[:, 0] * 5
        node_1_2_1_1_2_r = data[:, 1] * 4
        node_1_2_1_1_r = node_1_2_1_1_1_r * node_1_2_1_1_2_r
        node_1_2_1_2_r = 6 * (data[:, 0] + data[:, 1])
        node_1_2_1_r = 0.1 * node_1_2_1_1_r + 0.9 * node_1_2_1_2_r
        node_1_2_2_r = data[:, 2] * 3
        node_1_2_r = node_1_2_1_r * node_1_2_2_r
        spn_r = 0.4 * node_1_1_r + 0.6 * node_1_2_r

        self.assert_correct(spn, data, spn_r)

        lls = np.zeros((data.shape[0], max_id + 1))
        likelihood(spn, data, lls_matrix=lls)
        llls = np.zeros((data.shape[0], max_id + 1))
        log_likelihood(spn, data, lls_matrix=llls)

        # np.all replaces np.alltrue, which was removed in NumPy 2.0
        self.assertTrue(np.all(np.isclose(lls, np.exp(llls))))

        expected_by_node = (
            (spn_r, spn),
            (node_1_2_r, node_1_2),
            (node_1_2_2_r, node_1_2_2),
            (node_1_2_1_r, node_1_2_1),
            (node_1_2_1_2_r, node_1_2_1_2),
            (node_1_2_1_1_r, node_1_2_1_1),
            (node_1_2_1_1_2_r, node_1_2_1_1_2),
            (node_1_2_1_1_1_r, node_1_2_1_1_1),
            (node_1_1_r, node_1_1),
            (node_1_1_2_r, node_1_1_2),
            (node_1_1_1_r, node_1_1_1),
            (node_1_1_1_2_r, node_1_1_1_2),
            (node_1_1_1_1_r, node_1_1_1_1),
        )
        for expected, node in expected_by_node:
            # each node's column in the matrix must hold that node's value
            self.assertTrue(np.all(np.isclose(expected, lls[:, node.id])))
Example #11
0
 def assert_correct(self, spn, data, result):
     """Assert that ``likelihood`` / ``log_likelihood`` of ``spn`` on ``data`` match ``result``.

     Checks the output shape (one row per data row, one column), the values
     against ``result``, agreement between the linear and the log versions,
     and that ``debug=True`` gives the same answers.
     """
     l = likelihood(spn, data)
     self.assertEqual(l.shape[0], data.shape[0])
     self.assertEqual(l.shape[1], 1)
     # np.all replaces np.alltrue, which was removed in NumPy 2.0
     self.assertTrue(np.all(np.isclose(result.reshape(-1, 1), l)))
     self.assertTrue(np.all(np.isclose(np.log(l), log_likelihood(spn, data))))
     self.assertTrue(np.all(np.isclose(np.log(l), log_likelihood(spn, data, debug=True))))
     self.assertTrue(np.all(np.isclose(l, likelihood(spn, data, debug=True))))
Example #12
0
def learn_CNET():
    """Example script: learn two CNets on random binary data and print statistics.

    Prints the column means of the data, then for each learned model its
    structure statistics, equation string and mean log-likelihood, and finally
    an MPE completion of the first column together with the mean
    log-likelihood of the data whose first column is NaN.
    """
    import numpy as np

    np.random.seed(123)

    # 1000 binary rows, one success probability per column
    success_probs = [0.1, 0.2, 0.3, 0.4]
    train_data = np.random.binomial(1, success_probs, size=(1000, 4))
    print(np.mean(train_data, axis=0))

    from spn.structure.leaves.cltree.CLTree import create_cltree_leaf
    from spn.structure.Base import Context
    from spn.structure.leaves.parametric.Parametric import Bernoulli

    ds_context = Context(parametric_types=[Bernoulli] * 4).add_domains(train_data)

    from spn.algorithms.LearningWrappers import learn_parametric, learn_cnet

    slice_limits = dict(min_instances_slice=20, min_features_slice=1)
    cnet_naive_mle = learn_cnet(train_data, ds_context, cond="naive_mle", **slice_limits)
    cnet_random = learn_cnet(train_data, ds_context, cond="random", **slice_limits)

    from spn.algorithms.Statistics import get_structure_stats
    from spn.io.Text import spn_to_str_equation
    from spn.algorithms.Inference import log_likelihood

    for cnet in (cnet_naive_mle, cnet_random):
        print(get_structure_stats(cnet))
        print(spn_to_str_equation(cnet))
        print(np.mean(log_likelihood(cnet, train_data)))

    from spn.algorithms.MPE import mpe

    # blank out the first column so that mpe has something to fill in
    train_data_mpe = train_data.astype(float)
    train_data_mpe[:, 0] = np.nan
    print(mpe(cnet_random, train_data_mpe)[:30])

    print(np.mean(log_likelihood(cnet_random, train_data_mpe)))
Example #13
0
def sample_induced_trees(node, data, rand_gen):
    """Sample, for every data row, a path through the SPN rooted at ``node``.

    At each sum node one child per row is drawn by adding Gumbel noise to the
    weighted child log-likelihoods and taking the argmax; product nodes forward
    their rows to all children. Visited nodes get a ``row_ids`` attribute and
    sum nodes get their ``edge_counts`` updated.

    :param node: root of the SPN; ids must be set and ordered from 0 to N
    :param data: 2D array with one row per instance
    :param rand_gen: random generator providing ``gumbel``
    :return: tuple ``(map_rows_cols_to_node_id, lls)`` - for each (row, column)
        the id of the leaf reached (or -1), and the per-node log-likelihood matrix
    """
    # this requires ids to be set, and to be ordered from 0 to N
    validate_ids(node)

    max_id = reset_node_counters(node)

    lls = np.zeros((data.shape[0], max_id + 1))
    # The keyword must be `lls_matrix` (it was misspelled `llls_matrix`), as in
    # every other call; otherwise `lls` is never filled and stays all zeros.
    log_likelihood(node, data, lls_matrix=lls)

    # -1 marks cells for which no leaf has been recorded
    map_rows_cols_to_node_id = np.zeros(data.shape, dtype=np.int64) - 1

    # We do not collect all the Zs from all sum nodes as before, but only to those
    # traversed during the top-down descent
    def _sample_induced_trees(node, row_ids):
        if len(row_ids) == 0:
            return
        node.row_ids = row_ids

        if isinstance(node, Product):
            for c in node.children:
                _sample_induced_trees(c, row_ids)
            return

        if isinstance(node, Sum):
            w_children_log_probs = np.zeros((len(row_ids), len(node.weights)))
            for i, c in enumerate(node.children):
                w_children_log_probs[:, i] = lls[row_ids, c.id] + np.log(
                    node.weights[i])

            # one Gumbel draw per (row, child); argmax over children picks a branch
            z_gumbels = rand_gen.gumbel(
                loc=0,
                scale=1,
                size=(w_children_log_probs.shape[0],
                      w_children_log_probs.shape[1]))
            g_children_log_probs = w_children_log_probs + z_gumbels
            rand_child_branches = np.argmax(g_children_log_probs, axis=1)

            for i, c in enumerate(node.children):
                new_row_ids = row_ids[rand_child_branches == i]
                node.edge_counts[i] = len(new_row_ids)
                _sample_induced_trees(c, new_row_ids)

        if isinstance(node, Leaf):
            map_rows_cols_to_node_id[row_ids, node.scope] = node.id
            return

    _sample_induced_trees(node, np.arange(data.shape[0]))

    return map_rows_cols_to_node_id, lls
Example #14
0
def run_oSLRAU(dataset, update_after_no_min_batches, prune_after):
    """Learn an initial SPN on the first mini-batch and update it with oSLRAU.

    Prints the number of variables, the mean test log-likelihood before and
    after the updates, and writes plots of the initial and final SPN.

    :param dataset: identifier passed to ``get_data``
    :param update_after_no_min_batches: controls on which mini-batches the
        structure update flag is set (see the note in the loop)
    :param prune_after: index of the mini-batch after which the SPN is pruned
    """
    data = get_data(dataset)
    # replace NaNs by the column mean computed over the non-NaN entries
    nan_mask = np.isnan(data)
    data = np.where(nan_mask, np.ma.array(data, mask=np.isnan(data)).mean(axis=0), data)

    from sklearn.model_selection import train_test_split
    train_data, test_data = train_test_split(data, test_size=0.33, random_state=42)

    # make first mini_batch from data
    mini_batch_size = 50
    first_mini_batch = data[:mini_batch_size]

    n = first_mini_batch.shape[1]  # num of variables
    print(n)
    ds_context = Context(parametric_types=[Gaussian] * n).add_domains(first_mini_batch)

    # Learn initial spn
    spn = learn_parametric(first_mini_batch, ds_context)
    plot_spn(spn, 'intitial_spn.pdf')
    print(np.mean(log_likelihood(spn, test_data)))

    oSLRAU_params = oSLRAUParams(
        mergebatch_threshold=128,
        corrthresh=0.1,
        mvmaxscope=1,
        equalweight=True,
        currVals=True,
    )
    no_of_minibatches = int(data.shape[0] / mini_batch_size)

    # update using oSLRAU
    for i in range(1, no_of_minibatches):
        start = i * mini_batch_size
        mini_batch = data[start:start + mini_batch_size]

        # NOTE(review): floor division is zero for every i larger than the
        # threshold; confirm whether a modulo ("every k-th batch") was intended.
        update_structure = (update_after_no_min_batches // i == 0)
        if update_structure:
            print(i)
        spn = oSLRAU(spn, mini_batch, oSLRAU_params, update_structure)

        if i == prune_after:
            spn = Prune_oSLRAU(spn)

    print(np.mean(log_likelihood(spn, test_data)))
    plot_spn(spn, 'final_spn.pdf')
Example #15
0
def entropy(spn, ds_context, RVset, debug=False):
    """
    Compute the entropy of a set of random variables under an SPN.

    :param spn:        input SPN
    :param ds_context: context used to validate the RVs and build their permutation
    :param RVset:      set of scope integers representing RVs
    :param debug:      not used by this function
    :return:           entropy of RVs
    """
    from spn.algorithms.Inference import log_likelihood

    # input must be a set, and every RV in it must be DISCRETE
    check_set(RVset)
    check_discrete(ds_context, RVset)

    # all value combinations of the RVs, evaluated under the SPN
    log_p = log_likelihood(spn, get_permutation(ds_context, RVset))

    # infinite log-probabilities are zeroed so that their term contributes 0
    log_p[np.isinf(log_p)] = 0
    weighted = np.exp(log_p) * log_p
    # any NaN term is likewise counted as 0
    weighted[np.isnan(weighted)] = 0
    H = -(weighted.sum())

    logger.debug("H(%s)=%s" % (RVset, H))

    return H
def test_log_vector_histogram():
    """Compare CPUCompiler log-likelihoods of a small histogram SPN with SPFlow's.

    Returns 0 without checking anything when vectorization is not supported.
    """
    # Minimal SPN: a two-component mixture of products of histograms.
    bin_edges = [0., 1., 2.]
    h1 = Histogram(bin_edges, [0.25, 0.75], [1, 1], scope=0)
    h2 = Histogram(bin_edges, [0.45, 0.55], [1, 1], scope=1)
    h3 = Histogram(bin_edges, [0.33, 0.67], [1, 1], scope=0)
    h4 = Histogram(bin_edges, [0.875, 0.125], [1, 1], scope=1)

    left = Product(children=[h1, h2])
    right = Product(children=[h3, h4])
    spn = Sum([0.3, 0.7], [left, right])

    # 30 rows, two binary features
    columns = [np.random.randint(2, size=30) for _ in range(2)]
    inputs = np.column_stack(columns).astype("float64")

    if not CPUCompiler.isVectorizationSupported():
        print("Test not supported by the compiler installation")
        return 0

    # Execute the compiled Kernel.
    results = CPUCompiler(maxTaskSize=5).log_likelihood(spn, inputs, supportMarginal=False)

    # Reference results from SPFlow's own inference.
    reference = log_likelihood(spn, inputs).reshape(30)

    # Accept closeness in log space, or failing that in normal space.
    close_in_log_space = np.all(np.isclose(results, reference))
    assert close_in_log_space or np.all(np.isclose(np.exp(results), np.exp(reference)))
Example #17
0
def extend():
    """Example script: register a custom Pareto leaf and print a log-likelihood."""
    import numpy as np
    from spn.structure.leaves.parametric.Parametric import Leaf

    class Pareto(Leaf):
        """Leaf holding a single parameter ``a``."""

        def __init__(self, a, scope=None):
            Leaf.__init__(self, scope=scope)
            self.a = a

    def pareto_likelihood(node, data=None, dtype=np.float64):
        # column vector with the scipy Pareto pdf of the node's scoped column(s)
        from scipy.stats import pareto

        n_rows = data.shape[0]
        probs = np.ones((n_rows, 1), dtype=dtype)
        probs[:] = pareto.pdf(data[:, node.scope], node.a)
        return probs

    from spn.algorithms.Inference import add_node_likelihood

    add_node_likelihood(Pareto, pareto_likelihood)

    spn = 0.3 * Pareto(2.0, scope=0) + 0.7 * Pareto(3.0, scope=0)

    from spn.algorithms.Inference import log_likelihood

    query = np.array([1.5]).reshape(-1, 1)
    print("pareto", log_likelihood(spn, query))
Example #18
0
    def test_bcpp(self):
        """Compare the generated C++ evaluation function with the Python log-likelihood."""
        leaf_d = Gaussian(mean=1.0, stdev=1.0, scope=[0])
        leaf_e = Gaussian(mean=2.0, stdev=2.0, scope=[1])
        leaf_f = Gaussian(mean=3.0, stdev=3.0, scope=[0])
        leaf_g = Gaussian(mean=4.0, stdev=4.0, scope=[1])

        prod_b = leaf_d * leaf_e
        prod_c = leaf_f * leaf_g

        root = 0.3 * prod_b + 0.7 * prod_c

        spn_cc_eval_func = get_cpp_function(root)

        np.random.seed(17)
        narrow = np.random.normal(10, 0.01, size=200000).tolist()
        wide = np.random.normal(30, 10, size=200000).tolist()
        data = np.array(narrow + wide).reshape((-1, 2))

        py_ll = log_likelihood(root, data)

        c_ll = spn_cc_eval_func(data)

        # row-by-row comparison of the first (only checked) output column
        for row, py_value in enumerate(py_ll[:, 0]):
            self.assertAlmostEqual(py_value, c_ll[row, 0])
def test_cpu_histogram():
    """Compare CUDACompiler log-likelihoods of a histogram SPN with SPFlow's.

    The inputs contain a few NaN entries. Returns 0 without checking anything
    when the CUDA compiler is not available.
    """
    # Minimal SPN: a two-component mixture of products of histograms.
    h1 = Histogram([0., 1., 2.], [0.25, 0.75], [1, 1], scope=0)
    h2 = Histogram([0., 3., 6., 8.], [0.35, 0.1, 0.55], [1, 1], scope=1)
    h3 = Histogram([0., 1., 2.], [0.33, 0.67], [1, 1], scope=0)
    h4 = Histogram([0., 5., 8.], [0.875, 0.125], [1, 1], scope=1)

    left = Product(children=[h1, h2])
    right = Product(children=[h3, h4])
    spn = Sum([0.3, 0.7], [left, right])

    first_feature = np.random.randint(2, size=30)
    second_feature = np.random.randint(8, size=30)
    inputs = np.column_stack((first_feature, second_feature)).astype("float64")

    # Overwrite five distinct, randomly chosen cells with NaN.
    nan_positions = np.random.choice(inputs.size, 5, replace=False)
    inputs.ravel()[nan_positions] = np.nan

    if not CUDACompiler.isAvailable():
        print("Test not supported by the compiler installation")
        return 0

    # Execute the compiled Kernel.
    results = CUDACompiler().log_likelihood(spn, inputs)

    # Reference results from SPFlow's own inference.
    reference = log_likelihood(spn, inputs).reshape(30)

    # Accept closeness in log space, or failing that in normal space.
    close_in_log_space = np.all(np.isclose(results, reference))
    assert close_in_log_space or np.all(
        np.isclose(np.exp(results), np.exp(reference)))
Example #20
0
    def test_eval_parametric(self):
        """Check TF evaluation of a product of parametric leaves and the TF round trip.

        The TF log-likelihood must match the Python one, and a copy of the SPN
        pushed through the TF graph and back must print the same equation.
        """
        # a single row of seven ones, one feature per leaf
        data = np.ones((1, 7), dtype=np.float32)

        spn = (
            Gaussian(mean=1.0, stdev=1.0, scope=[0])
            * Exponential(l=1.0, scope=[1])
            * Gamma(alpha=1.0, beta=1.0, scope=[2])
            * LogNormal(mean=1.0, stdev=1.0, scope=[3])
            * Poisson(mean=1.0, scope=[4])
            * Bernoulli(p=0.6, scope=[5])
            * Categorical(p=[0.1, 0.2, 0.7], scope=[6])
        )

        py_ll = log_likelihood(spn, data)
        tf_ll = eval_tf(spn, data)
        self.assertTrue(np.all(np.isclose(py_ll, tf_ll)))

        spn_copy = Copy(spn)

        tf_graph, data_placeholder, variable_dict = spn_to_tf_graph(spn_copy, data, 1)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            tf_graph_to_spn(variable_dict)

        original_equation = spn_to_str_equation(spn)
        round_trip_equation = spn_to_str_equation(spn_copy)

        self.assertEqual(original_equation, round_trip_equation)
Example #21
0
def test_cuda_categorical():
    """Compare CUDACompiler log-likelihoods of a product of categoricals with SPFlow's.

    Returns 0 without checking anything when the CUDA compiler is not available.
    """
    # Minimal SPN: one categorical leaf per feature, joined by a product.
    leaf_probabilities = [
        [0.35, 0.55, 0.1],
        [0.25, 0.625, 0.125],
        [0.5, 0.2, 0.3],
        [0.6, 0.15, 0.25],
        [0.7, 0.11, 0.19],
        [0.8, 0.14, 0.06],
    ]
    leaves = [Categorical(p=probs, scope=scope) for scope, probs in enumerate(leaf_probabilities)]
    p = Product(children=leaves)

    # Randomly sample input values: 30 rows, each feature in {0, 1, 2}.
    columns = [np.random.randint(3, size=30) for _ in range(6)]
    inputs = np.column_stack(columns).astype("float64")

    if not CUDACompiler.isAvailable():
        print("Test not supported by the compiler installation")
        return 0

    # Execute the compiled Kernel.
    results = CUDACompiler().log_likelihood(p, inputs, supportMarginal=False)

    # Reference results from SPFlow's own inference.
    reference = log_likelihood(p, inputs).reshape(30)

    # Accept closeness in log space, or failing that in normal space.
    close_in_log_space = np.all(np.isclose(results, reference))
    assert close_in_log_space or np.all(
        np.isclose(np.exp(results), np.exp(reference)))
Example #22
0
 def predict(spn):
     """Return, per cell of ``Y``, the class index with the highest log-likelihood.

     For every candidate class one column of the windowed data is overwritten
     with that class index before the SPN is evaluated.
     """
     data = get_data_in_window(features=values)
     n_cells = Y.shape[0] * Y.shape[1]
     scores = np.ndarray((n_cells, number_of_classes))

     # column that receives the candidate class label
     label_column = -int((window_size**2) / 2)

     for class_idx in range(number_of_classes):
         candidate = data.copy()
         candidate[:, label_column] = class_idx
         scores[:, class_idx] = log_likelihood(spn, candidate)[:, 0]

     best_class = np.argmax(scores, axis=1)
     return best_class.reshape((Y.shape[0], Y.shape[1]))
Example #23
0
    def test_gaussian_spn_ll(self):
        """Check the sympy translation of a Gaussian mixture against ``log_likelihood``.

        Both the linear and the log sympy expressions are evaluated at
        x0 = x1 = 0 and compared with the SPN's log-likelihood at that point.
        """
        root = 0.3 * (Gaussian(mean=0, stdev=1, scope=0) * Gaussian(
            mean=1, stdev=1, scope=1)) + 0.7 * (Gaussian(
                mean=2, stdev=1, scope=0) * Gaussian(mean=3, stdev=1, scope=1))

        sympyecc = spn_to_sympy(root)
        logsympyecc = spn_to_sympy(root, log=True)

        sym_l = float(sympyecc.evalf(subs={"x0": 0, "x1": 0}))
        sym_ll = float(logsympyecc.evalf(subs={"x0": 0, "x1": 0}))

        # np.float64 replaces the np.float alias, which was removed in NumPy 1.24
        data = np.array([0, 0], dtype=np.float64).reshape(-1, 2)

        # np.all replaces np.alltrue, which was removed in NumPy 2.0
        self.assertTrue(
            np.all(np.isclose(np.log(sym_l), log_likelihood(root, data))))
        self.assertTrue(
            np.all(np.isclose(sym_ll, log_likelihood(root, data))))
Example #24
0
    def test_PWL_no_variance(self):
        """Evaluate a piecewise leaf learned from two identical values."""
        data = np.array([1.0, 1.0]).reshape(-1, 1)
        ds_context = Context([MetaType.REAL])
        ds_context.add_domains(data)

        leaf = create_piecewise_leaf(data, ds_context, scope=[0], hist_source="kde")
        prob = np.exp(log_likelihood(leaf, data))

        # both (identical) rows are compared with the same reference value
        for row in (0, 1):
            self.assertAlmostEqual(float(prob[row]), 2 / 6)
Example #25
0
def run_spflow(spflow_spn, n_feats, batch_size, repetitions):
    """Return the average time of one ``log_likelihood`` call on random input.

    Ten untimed warm-up calls are made first; afterwards each repetition draws
    a new random float32 batch and only the ``log_likelihood`` call is timed.
    """
    print("Running SPFlow with: nfeat=%s, batch=%s" % (n_feats, batch_size))
    x = np.random.rand(batch_size, n_feats).astype(np.float32)

    # warmup
    for _ in range(10):
        log_likelihood(spflow_spn, x)

    # Run SPFlow spn
    elapsed = 0.0
    for _ in tqdm(range(repetitions), desc="Repetition loop"):
        x = np.random.rand(batch_size, n_feats).astype(np.float32)
        started = time()
        log_likelihood(spflow_spn, x)
        elapsed += time() - started

    return elapsed / repetitions
Example #26
0
    def assert_correct(self, node, x, result):
        """Assert that ``node`` assigns likelihood ``result`` to the value ``x``.

        The node is first evaluated on a single-cell dataset with scope 0, then
        on a 10x10 random dataset whose column 5 is set to ``x`` with scope 5.
        The node's ``scope`` is overwritten and its type is recorded in
        ``self.tested``.
        """
        self.tested.add(type(node))

        # np.float64 replaces the np.float alias, which was removed in NumPy 1.24
        data = np.array([x], dtype=np.float64).reshape(-1, 1)
        node.scope = [0]
        l = likelihood(node, data)
        self.assertAlmostEqual(result, l[0, 0], 5)
        # np.all replaces np.alltrue, which was removed in NumPy 2.0
        self.assertTrue(np.all(np.isclose(np.log(l), log_likelihood(node, data))))

        data = np.random.rand(10, 10)
        data[:, 5] = x
        node.scope = [5]
        l = likelihood(node, data)
        self.assertEqual(l.shape[0], data.shape[0])
        self.assertEqual(l.shape[1], 1)
        # every row holds the same x in the scoped column, so all outputs must agree
        self.assertTrue(np.isclose(np.var(l), 0))
        self.assertTrue(np.all(np.isclose(result, l[0, 0])))
        self.assertTrue(np.all(np.isclose(np.log(l), log_likelihood(node, data))))
Example #27
0
    def test_optimization(self):
        """Optimize a perturbed SPN with TensorFlow and compare with the Python result.

        After 50 Adam steps the parameters are written back into the SPN; the
        Python log-likelihood must then match the TF output, and the summed TF
        log-likelihood must exceed the one from before the optimization.
        """
        np.random.seed(17)
        narrow = np.random.normal(10, 0.01, size=2000).tolist()
        wide = np.random.normal(30, 10, size=2000).tolist()
        data = np.array(narrow + wide).reshape((-1, 10)).astype(np.float32)

        n_features = data.shape[1]
        ds_context = Context(
            meta_types=[MetaType.REAL] * n_features,
            parametric_types=[Gaussian] * n_features,
        )

        spn = learn_parametric(data, ds_context)

        # overwrite the learned weights before optimizing
        spn.weights = [0.8, 0.2]

        py_ll = log_likelihood(spn, data)

        tf_graph, data_placeholder, variable_dict = spn_to_tf_graph(spn, data)

        loss = likelihood_loss(tf_graph)
        train_step = tf.train.AdamOptimizer(0.001).minimize(loss)

        feed = {data_placeholder: data}
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            for _ in range(50):
                session.run(train_step, feed_dict=feed)

            tf_ll_opt = session.run(tf_graph, feed_dict=feed).reshape(-1, 1)

            # copy the optimized values back into the SPN
            tf_graph_to_spn(variable_dict)

        py_ll_opt = log_likelihood(spn, data)

        self.assertTrue(np.all(np.isclose(tf_ll_opt, py_ll_opt)))

        self.assertLess(py_ll.sum(), tf_ll_opt.sum())
Example #28
0
def inference():
    """Example script: print log-likelihoods of the example SPN and its marginal.

    Evaluates one complete row on both SPNs and one row with a NaN in the
    first position on the full SPN, printing each log-likelihood together
    with its exponential.
    """
    import numpy as np

    spn = create_SPN()
    spn_marg = marginalize()

    from spn.algorithms.Inference import log_likelihood

    complete_row = np.array([1.0, 0.0, 1.0]).reshape(-1, 3)

    for label, model in (("python ll", spn), ("python ll spn_marg", spn_marg)):
        value = log_likelihood(model, complete_row)
        print(label, value, np.exp(value))

    row_with_nan = np.array([np.nan, 0.0, 1.0]).reshape(-1, 3)
    llom = log_likelihood(spn, row_with_nan)
    print("python ll spn with nan", llom, np.exp(llom))
Example #29
0
def apply(train_datasets, ds_contexts, test_datasets, n_folds, result_path, filename, foldLog):
    """Learn an MSPN per fold, evaluate it, and store the results as JSON.

    :param train_datasets: training data, one entry per fold
    :param ds_contexts: contexts, one per fold; domains are added from the training data
    :param test_datasets: test data, one entry per fold
    :param n_folds: number of folds to run
    :param result_path: forwarded to ``store_json``
    :param filename: forwarded to ``store_json``
    :param foldLog: when truthy, per-fold results are printed
    """
    # Runtime and user warnings are silenced here; remove these two filters
    # to see the warnings raised while the MSPN is learned.
    for category in (RuntimeWarning, UserWarning):
        warnings.filterwarnings("ignore", category=category)

    print("\n========================")
    print("MSPN")
    print("========================")

    folds = {}
    total_learning_time = 0
    total_test_ll = 0
    for index in range(n_folds):
        fold_number = index + 1
        context = ds_contexts[index]
        train = train_datasets[index]

        # Only for MSPN:
        context.add_domains(train)

        # learning time is measured in milliseconds
        started_ms = time.time()*1000
        model = learn_mspn(train, context, min_instances_slice=20)
        finished_ms = time.time()*1000
        learning_time = finished_ms - started_ms

        test_ll = np.sum(log_likelihood(model, test_datasets[index]))

        folds["fold_" + str(fold_number)] = {"test_ll": test_ll, "learning_time": learning_time}
        total_learning_time = total_learning_time + learning_time
        total_test_ll = total_test_ll + test_ll

        if foldLog:
            print("----------------------------------------")
            print("Fold (" + str(fold_number) + "): ")
            print("Test LL: " + str(test_ll))
            print("Learning time: " + str(learning_time))

    # Average over folds and store everything in a JSON file
    avg_test_ll = total_test_ll / n_folds
    avg_learning_time = total_learning_time / n_folds / 1000  # in seconds
    results = {}
    results["average_test_ll"] = avg_test_ll
    results["average_learning_time"] = avg_learning_time
    results["folds"] = folds
    store_json(results, result_path, filename)

    print("----------------------------------------")
    print("----------------------------------------")
    print("Average Test LL: " + str(avg_test_ll))
    print("Average learning time: " + str(avg_learning_time))
Example #30
0
    def assert_correct(self, node, x, result):
        """Assert that ``node`` assigns likelihood ``result`` to the row ``x``.

        The node is first evaluated on a single-row dataset built from ``x``
        with scope ``0..k-1``, then on a random 10-row dataset in which the
        columns of the scope shifted by 5 are set to ``x``. The node's
        ``scope`` is overwritten and its type is recorded in ``self.tested``.
        """
        self.tested.add(type(node))

        # np.float64 replaces the np.float alias, which was removed in NumPy 1.24
        data = np.array([x], dtype=np.float64).reshape(1, -1)
        node.scope = list(range(data.shape[1]))
        l = likelihood(node, data)
        self.assertAlmostEqual(result, l[0, 0], 5)
        # np.all replaces np.alltrue, which was removed in NumPy 2.0
        self.assertTrue(
            np.all(np.isclose(np.log(l), log_likelihood(node, data))))

        new_scope = (np.array(node.scope) + 5).tolist()
        data = np.random.rand(10, max(new_scope) + 2)
        data[:, new_scope] = x
        node.scope = new_scope
        l = likelihood(node, data)
        self.assertEqual(l.shape[0], data.shape[0])
        self.assertEqual(l.shape[1], 1)
        # every row holds the same values in the scoped columns, so all outputs must agree
        self.assertTrue(np.isclose(np.var(l), 0))
        self.assertTrue(np.all(np.isclose(result, l[0, 0])))
        self.assertTrue(
            np.all(np.isclose(np.log(l), log_likelihood(node, data))))