Example #1
import numpy
from numpy.testing import assert_array_almost_equal
from tfspn.piecewise import estimate_domains, estimate_bins, piecewise_linear_approximation
from tfspn.tfspn import SumNode, PiecewiseLinearPDFNode

# NOTE: this snippet is truncated -- the leaf nodes n1, n4 and the `data` array
# are defined earlier in the original test.
s1 = SumNode("s1", [0.3, 0.7], n1, n4)

assert_array_almost_equal(
    s1.eval(data),
    [-1.848479922904004, -1.767794565136819, -1.767794565136819])

domains = estimate_domains(data, ["continuous"])

print("domains:", domains)

bins = estimate_bins(data[:, 0], "continuous", domains[0])

print("bins: ", bins)

x_range, y_range = piecewise_linear_approximation(data[:, 0],
                                                  bins,
                                                  family="continuous")

print(x_range, y_range)

n5 = PiecewiseLinearPDFNode("pwl1", 0, "X0", domains[0], x_range, y_range)

print(numpy.exp(n5.eval(data)))

print(n5.eval(data))

family = "discrete"
domains = estimate_domains(data, [family])

print("domains:", domains)
Example #2
def test_piecewise_marginals():

    from mlutils.datasets import loadMLC
    from tfspn.piecewise import piecewise_linear_approximation, estimate_bins, estimate_domains
    from tfspn.tfspn import SumNode, ProductNode, PoissonNode, GaussianNode, BernoulliNode, \
        PiecewiseLinearPDFNode

    #
    # loading australian
    (train, test, valid), fnames, ftypes, domains = loadMLC('australian')

    print(train.shape)
    print(test.shape)
    print(valid.shape)

    for fn, ft, fd in zip(fnames, ftypes, domains):
        print(fn, ft, fd[:2], fd[-2:])

    #
    # some continuous features
    # A2, A3, A7, A10
    # c_feature_ids = [1, 2, 6, 9]
    c_feature_ids = [1]

    #
    #
    n_bins = 100
    for i in c_feature_ids:

        train_data = train[:, i]
        valid_data = valid[:, i]
        test_data = test[:, i]

        # pyplot.hist(train_data, bins=n_bins, alpha=0.4, label='train', normed=True)
        # pyplot.hist(valid_data, bins=n_bins, alpha=0.4, label='valid', normed=True)
        # pyplot.hist(test_data, bins=n_bins, alpha=0.4, label='test', normed=True)
        # pyplot.legend(loc='upper right')
        # pyplot.show()

        #
        # creating a piecewise node
        print('looking at feature', i, ftypes[i], domains[i])
        # bins = estimate_bins(train_data, ftypes[i], domains[i])
        print(train_data.min(), train_data.max())
        # print('computed bins', bins)
        smoothing = 1
        print('domains', domains[i])
        bins = estimate_bins(train_data, ftypes[i], [domains[i]])
        print('bins from domains', bins)
        x_range, y_range = piecewise_linear_approximation(
            train_data,
            bins=bins,
            family=ftypes[i],
            alpha=smoothing,
            # isotonic=True,
            # n_bootstraps=nb,
            # average_bootstraps=False
        )
        # print("PiecewiseLinearPDFNode")
        node = PiecewiseLinearPDFNode("PiecewiseLinearNode_{}".format(i), i,
                                      fnames[i], domains[i], x_range, y_range)
        print(node)

        #
        # compute likelihoods
        train_lls = node.eval(train)
        valid_lls = node.eval(valid)
        test_lls = node.eval(test)

        print('TRAIN LL:', train_lls.mean())
        print('VALID LL:', valid_lls.mean())
        print('TEST LL:', test_lls.mean())

        v_ids = valid_data > 76.75
        print(sum(v_ids), valid_lls[v_ids])

        t_ids = test_data > 76.75
        print(sum(t_ids), test_lls[t_ids])
        print(test_lls)
        print(test_lls[~t_ids].mean())
Example #3
def test_sum_node_mpe_eval():

    from tfspn.tfspn import ProductNode
    from tfspn.tfspn import SumNode
    from tfspn.tfspn import PiecewiseLinearPDFNode

    from tfspn.piecewise import domain_to_bins
    from tfspn.piecewise import piecewise_linear_approximation

    rand_gen = numpy.random.RandomState(RAND_STATE)

    n_features = 4
    n_params = 1
    thetas = rand_gen.rand(n_params, 2)
    thetas = thetas / thetas.sum(axis=1, keepdims=True)
    print("Random generated Bernoulli parameters:\n{}".format(thetas))

    p = 0.666
    n_samples = 10
    input_data = rand_gen.binomial(n=1, p=p, size=(n_samples, n_features))
    print(input_data)

    rand_x = rand_gen.choice(n_samples, size=(n_samples // 2))
    rand_y = rand_gen.choice(n_features, size=(n_samples // 2))
    input_data = input_data.astype(numpy.float64)
    input_data[rand_x, rand_y] = numpy.nan
    print(input_data)

    domain = numpy.array([0, 1])
    center = True
    step = 1
    bins = [domain_to_bins(domain, step=step, center=center)]
    laplace_smoothing = 1
    print('\tfrom domain to bins {} -> {}'.format(domain, bins[0]))

    for t in thetas:

        #
        # creating sum node over two product nodes
        prod_node_1 = ProductNode(name='prod-node-1')
        prod_node_2 = ProductNode(name='prod-node-2')

        data_11 = rand_gen.binomial(n=1, p=t[1], size=n_samples).astype(int)
        x_11, y_11 = piecewise_linear_approximation(data_11,
                                                    bins,
                                                    family='categorical',
                                                    bin_width=step,
                                                    alpha=laplace_smoothing,
                                                    n_bootstraps=None,
                                                    average_bootstraps=True,
                                                    remove_duplicates=True,
                                                    isotonic=True,
                                                    rand_gen=None)

        print(x_11, y_11)
        assert numpy.allclose(x_11[1:-1], domain)

        fid = 0
        c_11 = PiecewiseLinearPDFNode('11',
                                      featureIdx=fid,
                                      featureName='f{}'.format(fid),
                                      domain=domain, x_range=x_11, y_range=y_11)
        print(c_11)

        data_12 = rand_gen.binomial(n=1, p=t[0], size=n_samples).astype(int)
        x_12, y_12 = piecewise_linear_approximation(data_12,
                                                    bins,
                                                    family='categorical',
                                                    bin_width=step,
                                                    alpha=laplace_smoothing,
                                                    n_bootstraps=None,
                                                    average_bootstraps=True,
                                                    remove_duplicates=True,
                                                    isotonic=True,
                                                    rand_gen=None)

        print(x_12, y_12)
        assert numpy.allclose(x_12[1:-1], domain)

        fid = 1
        c_12 = PiecewiseLinearPDFNode('12',
                                      featureIdx=fid,
                                      featureName='f{}'.format(fid),
                                      domain=domain, x_range=x_12, y_range=y_12)
        print(c_12)

        prod_node_1.addChild(c_11)
        prod_node_1.addChild(c_12)

        data_21 = rand_gen.binomial(n=1, p=t[0], size=n_samples).astype(int)
        x_21, y_21 = piecewise_linear_approximation(data_21,
                                                    bins,
                                                    family='categorical',
                                                    bin_width=step,
                                                    alpha=laplace_smoothing,
                                                    n_bootstraps=None,
                                                    average_bootstraps=True,
                                                    remove_duplicates=True,
                                                    isotonic=True,
                                                    rand_gen=None)

        print(x_21, y_21)
        assert numpy.allclose(x_21[1:-1], domain)

        fid = 0
        c_21 = PiecewiseLinearPDFNode('21',
                                      featureIdx=fid,
                                      featureName='f{}'.format(fid),
                                      domain=domain, x_range=x_21, y_range=y_21)
        print(c_21)

        data_22 = rand_gen.binomial(n=1, p=t[1], size=n_samples).astype(int)
        x_22, y_22 = piecewise_linear_approximation(data_22,
                                                    bins,
                                                    family='categorical',
                                                    bin_width=step,
                                                    alpha=laplace_smoothing,
                                                    n_bootstraps=None,
                                                    average_bootstraps=True,
                                                    remove_duplicates=True,
                                                    isotonic=True,
                                                    rand_gen=None)

        print(x_22, y_22)
        assert numpy.allclose(x_22[1:-1], domain)

        fid = 1
        c_22 = PiecewiseLinearPDFNode('22',
                                      featureIdx=fid,
                                      featureName='f{}'.format(fid),
                                      domain=domain, x_range=x_22, y_range=y_22)
        print(c_22)

        prod_node_2.addChild(c_21)
        prod_node_2.addChild(c_22)

        prob, res = c_11.mpe_eval(input_data)
        print('c11 mpe log probs {}'.format(prob))
        print('c11 mpe assignment {}'.format(res))
        prob, res = c_12.mpe_eval(input_data)
        print('c12 mpe log probs {}'.format(prob))
        print('c12 mpe assignment {}'.format(res))

        prob, res = c_21.mpe_eval(input_data)
        print('c21 mpe log probs {}'.format(prob))
        print('c21 mpe assignment {}'.format(res))
        prob, res = c_22.mpe_eval(input_data)
        print('c22 mpe log probs {}'.format(prob))
        print('c22 mpe assignment {}'.format(res))

        prob, res = prod_node_1.mpe_eval(input_data)
        print('prod 1 mpe log probs {}'.format(prob))
        print('prod 1 mpe assignment {}'.format(res))

        prob, res = prod_node_2.mpe_eval(input_data)
        print('prod 2 mpe log probs {}'.format(prob))
        print('prod 2 mpe assignment {}'.format(res))

        sum_node = SumNode(name='sum-node')
        sum_node.addChild(0.5, prod_node_1)
        sum_node.addChild(0.5, prod_node_2)

        #
        prob, res = sum_node.mpe_eval(input_data)
        print('mpe log probs {}'.format(prob))
        print('mpe assignment {}'.format(res))
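
For reference, the sum node's MPE evaluation exercised above follows the standard max-product rule for SPNs: per sample, pick the child that maximizes log(weight) + child MPE log-probability and take that child's completion. A pure-numpy illustration of that rule with hypothetical numbers (a conceptual sketch, not tfspn's internals):

# Conceptual max-product rule for a sum node's MPE evaluation (hypothetical values).
import numpy

log_weights = numpy.log(numpy.array([0.5, 0.5]))       # sum-node child weights
child_mpe_lls = numpy.array([[-1.2, -0.9],             # rows: samples,
                             [-0.4, -2.0]])            # cols: children

weighted = child_mpe_lls + log_weights                 # log weight + child MPE log prob
best_child = numpy.argmax(weighted, axis=1)            # winning child per sample
sum_mpe_ll = weighted[numpy.arange(weighted.shape[0]), best_child]
print(best_child, sum_mpe_ll)
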
Example #4
def test_product_node_mpe_eval():

    from tfspn.tfspn import ProductNode
    from tfspn.tfspn import PiecewiseLinearPDFNode

    from tfspn.piecewise import domain_to_bins
    from tfspn.piecewise import piecewise_linear_approximation

    from numpy.testing import assert_almost_equal

    rand_gen = numpy.random.RandomState(RAND_STATE)

    n_features = 4
    n_children = n_features
    node_names = ['pwnode-leaf-{}'.format(i) for i in range(n_children)]
    feature_id = 0
    feature_name = 'f5'
    n_params = 1

    thetas = rand_gen.rand(n_params, 2)
    thetas = thetas / thetas.sum(axis=1, keepdims=True)
    print("Random generated Bernoulli parameters:\n{}".format(thetas))

    p = 0.666
    n_samples = 10
    input_data = rand_gen.binomial(n=1, p=p, size=(n_samples, n_features))
    print(input_data)

    rand_x = rand_gen.choice(n_samples, size=(n_samples // 2))
    rand_y = rand_gen.choice(n_features, size=(n_samples // 2))
    input_data = input_data.astype(numpy.float64)
    input_data[rand_x, rand_y] = numpy.nan
    print(input_data)

    domain = numpy.array([0, 1])
    center = True
    step = 1
    bins = [domain_to_bins(domain, step=step, center=center)]
    laplace_smoothing = 1
    print('\tfrom domain to bins {} -> {}'.format(domain, bins[0]))

    for t in thetas:

        #
        # creating a product node, testing add_child
        prod_node = ProductNode(name='prod-node')

        for k in range(n_children):
            # use the seeded generator, consistent with the rest of the test
            data = rand_gen.binomial(n=1, p=t[1], size=n_samples).astype(int)
            x, y = piecewise_linear_approximation(data,
                                                  bins,
                                                  family='categorical',
                                                  bin_width=step,
                                                  alpha=laplace_smoothing,
                                                  n_bootstraps=None,
                                                  average_bootstraps=True,
                                                  remove_duplicates=True,
                                                  isotonic=True,
                                                  rand_gen=None)

            print(x, y)
            assert numpy.allclose(x[1:-1], domain)

            fid = rand_gen.choice(n_features)
            c = PiecewiseLinearPDFNode(node_names[k],
                                       featureIdx=k,
                                       featureName='f{}'.format(fid),
                                       domain=domain, x_range=x, y_range=y)
            print(c)

            prod_node.addChild(c)

        #
        prob, res = prod_node.mpe_eval(input_data)
        print('mpe log probs {}'.format(prob))
        print('mpe assignment {}'.format(res))

        for lp, s in zip(prob, res):
            assert_almost_equal(lp, prod_node.eval(s.reshape(-1, s.shape[0]))[0])
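
The final assertion checks that each MPE log-probability matches re-evaluating the product node at the completed assignment. Conceptually, a product node's MPE log-probability is the sum of its children's MPE log-probabilities over their disjoint scopes; a pure-numpy illustration with hypothetical numbers (a sketch of the rule, not tfspn's internals):

# Hypothetical per-child MPE log-probabilities: rows are samples, columns are the
# product node's children (disjoint scopes). The product's MPE log-probability is
# their row-wise sum, and its MPE assignment merges the children's completions.
import numpy

child_mpe_lls = numpy.array([[-0.2, -1.1, -0.7, -0.5],
                             [-0.9, -0.3, -1.4, -0.1]])
prod_mpe_ll = child_mpe_lls.sum(axis=1)
print(prod_mpe_ll)   # one MPE log-probability per sample
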
Example #5
def test_bernoulli_as_piecewise_linear():
    from tfspn.tfspn import PiecewiseLinearPDFNode
    from tfspn.piecewise import domain_to_bins, piecewise_linear_approximation

    perc = 0.31
    n_obs = 200
    data = numpy.random.binomial(n=1, p=perc, size=n_obs).astype(int)

    #
    # MLE estimate bernoulli's theta
    theta = numpy.sum(data) / n_obs
    print('MLE theta {}'.format(theta))

    #
    # test for smoothing
    laplace_smoothing = 0.0

    #
    # determining domain and bins
    domain = numpy.array([0, 1])
    center = True
    step = 1
    bins = [domain_to_bins(domain, step=step, center=center)]
    print('\tfrom domain to bins {} -> {}'.format(domain, bins[0]))

    x, y = piecewise_linear_approximation(data,
                                          bins,
                                          center=center,
                                          bin_width=step,
                                          alpha=laplace_smoothing,
                                          n_bootstraps=None,
                                          average_bootstraps=True,
                                          remove_duplicates=True,
                                          isotonic=True,
                                          rand_gen=None)

    print(x, y)
    assert numpy.allclose(x[1:-1], domain)

    bernoulli_node = PiecewiseLinearPDFNode(0, domain=domain, x_range=x, y_range=y)

    #
    # check for the one
    input_obs = numpy.array([1.])
    bernoulli_node.eval(input_obs)
    log_theta = bernoulli_node.log_val
    exp_theta = numpy.exp(log_theta)
    print('\t bernoulli node log/theta for 1: {} {}'.format(log_theta, exp_theta))
    assert numpy.allclose(exp_theta, theta)

    #
    # check for the zero
    input_obs = numpy.array([0.])
    bernoulli_node.eval(input_obs)
    log_theta = bernoulli_node.log_val
    exp_theta = numpy.exp(log_theta)
    print('\t bernoulli node log/theta for 0: {} {}'.format(log_theta, exp_theta))
    assert numpy.allclose(exp_theta, 1. - theta)

    #
    # check for a value out of the domain (zero probability expected)
    input_obs = numpy.array([-0.5])
    bernoulli_node.eval(input_obs)
    log_theta = bernoulli_node.log_val
    exp_theta = numpy.exp(log_theta)
    print('\t bernoulli node log/theta for out of domain: {} {}'.format(log_theta, exp_theta))
    assert numpy.allclose(exp_theta, 0)

    #
    # check for a value out of the domain (zero probability expected)
    input_obs = numpy.array([1.01])
    bernoulli_node.eval(input_obs)
    log_theta = bernoulli_node.log_val
    exp_theta = numpy.exp(log_theta)
    print('\t bernoulli node log/theta for out of domain: {} {}'.format(log_theta, exp_theta))
    assert numpy.allclose(exp_theta, 0)
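
The assertions above say the fitted piecewise node reproduces the Bernoulli pmf at its MLE theta (alpha = 0.0, so no smoothing). A quick cross-check of the expected targets, using scipy only as an independent reference (it is not used by tfspn):

# Independent reference values for the checks above: with no smoothing the node
# should give log(theta) at 1, log(1 - theta) at 0, and probability 0 outside {0, 1}.
import numpy
from scipy.stats import bernoulli

ref_data = numpy.random.binomial(n=1, p=0.31, size=200)
theta = ref_data.mean()

print(bernoulli(theta).logpmf([1, 0]))     # [log(theta), log(1 - theta)]
print(numpy.log([theta, 1.0 - theta]))     # same values, computed directly
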
Example #6
def test_piecewise_mpe_eval():

    from tfspn.tfspn import PiecewiseLinearPDFNode
    from tfspn.piecewise import domain_to_bins, piecewise_linear_approximation

    perc = 0.31
    n_obs = 200
    data = numpy.random.binomial(n=1, p=perc, size=n_obs).astype(int)

    #
    # MLE estimate bernoulli's theta
    theta = numpy.sum(data) / n_obs
    print('MLE theta {}'.format(theta))

    #
    # test for smoothing
    laplace_smoothing = 0.0

    #
    # determining domain and bins
    domain = numpy.array([0, 1])
    center = True
    step = 1
    bins = [domain_to_bins(domain, step=step, center=center)]
    print('\tfrom domain to bins {} -> {}'.format(domain, bins[0]))

    x, y = piecewise_linear_approximation(data,
                                          bins,
                                          family='categorical',
                                          bin_width=step,
                                          alpha=laplace_smoothing,
                                          n_bootstraps=None,
                                          average_bootstraps=True,
                                          remove_duplicates=True,
                                          isotonic=True,
                                          rand_gen=None)

    print(x, y)
    assert numpy.allclose(x[1:-1], domain)

    bernoulli_node = PiecewiseLinearPDFNode("node0",
                                            0,
                                            'f0',
                                            domain=domain, x_range=x, y_range=y)

    #
    # check for the one
    input_obs = numpy.array([[1., numpy.nan, 3.], [numpy.nan, 0., 1.], [numpy.nan, 1.0, 0.]])
    probs, res = bernoulli_node.mpe_eval(input_obs)

    f_id = bernoulli_node.featureIdx
    n_features = input_obs.shape[1]
    other_feature_ids = numpy.array([i for i in range(f_id)] +
                                    [i for i in range(f_id + 1, n_features)])
    assert numpy.all(numpy.isnan(res[:, other_feature_ids]))

    print('\t bernoulli node log prob and mpe assignment: {} {}'.format(probs, res))

    bernoulli_node_2 = PiecewiseLinearPDFNode("node1",
                                              1,
                                              'f1',
                                              domain=domain, x_range=x, y_range=y)
    probs_2, res_2 = bernoulli_node_2.mpe_eval(input_obs)

    f_id_2 = bernoulli_node_2.featureIdx
    other_feature_ids = numpy.array([i for i in range(f_id_2)] +
                                    [i for i in range(f_id_2 + 1, n_features)])
    assert numpy.all(numpy.isnan(res_2[:, other_feature_ids]))

    print('\t bernoulli node 2 log prob and mpe assignment: {} {}'.format(probs_2, res_2))