Example #1
0
def test1():
    numpy.random.seed(42)
    data = numpy.random.poisson(5, 1000).reshape(1000, 1)

    for i in numpy.unique(data):
        print(i, numpy.sum(data == i))

    featureTypes = ["discrete"]
    featureTypes = ["categorical"]

    spn = SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.IndependenceTest(),
        # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
        # domains, families=families, row_split_method=Splitting.KmeansRows(),
        # col_split_method=Splitting.RDCTest(),
        min_instances_slice=100)

    print(spn)
    print(numpy.unique(data))
    evdata = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    print(evdata)

    ll = (spn.root.eval(numpy.asarray(evdata).reshape(len(evdata), 1)))

    print("Probs", numpy.exp(ll))
    print("Sum LL", numpy.sum(ll))
    print(numpy.histogram(data, bins="auto", density=True))
Example #2
0
def test2():
    numpy.random.seed(42)
    dsname, data, labels, classes, families = getDiabetes()

    labels = [l for l in labels]

    print(data.shape)

    print(data)
    featureTypes = [
        'discrete', 'continuous', 'continuous', 'continuous', 'continuous',
        'continuous', 'continuous', 'continuous', 'continuous'
    ]
    featureTypes = [
        'continuous', 'categorical', 'continuous', 'continuous', 'continuous',
        'continuous', 'continuous', 'continuous', 'continuous'
    ]
    # families[0] = 'bernoulli'

    # spn = SPN.LearnStructure(data, featureNames=labels, domains = domains,
    # families=families, row_split_method=Splitting.KmeansRows(),
    # col_split_method=Splitting.IndependenceTest(alpha=0.00001),
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.3),
                             min_instances_slice=50,
                             cluster_first=False)

    print(spn)
    # print(numpy.unique(data))

    ll = spn.root.eval(data)

    print("Sum LL", numpy.sum(ll))
Example #3
0
def test1(data, features):

    data = data[:, 1:20]
    features = features[0:data.shape[1]]

    arcs, mixt = getArchetypes(data, 3)

    nrfolds = 10

    stats = Stats(name=dsname)

    for train, test, i in kfolded(mixt, nrfolds):
        c = Chrono().start()
        spn = SPN.LearnStructure(
            train,
            featureTypes=["continuous"] * train.shape[1],
            row_split_method=Splitting.KmeansRows(),
            col_split_method=Splitting.RDCTest(threshold=0.3),
            # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
            # domains, families=families, row_split_method=Splitting.KmeansRows(),
            # col_split_method=Splitting.RDCTest(),
            min_instances_slice=100)
        c.end()

        spn.root.validate()
        ll = numpy.mean(spn.root.eval(test))

        print(ll)

        stats.add("HSPN", Stats.LOG_LIKELIHOOD, ll)
        stats.add("HSPN", Stats.TIME, c.elapsed())

        stats.save("stats_" + dsname + ".json")

    print(arcs)
Example #4
0
 def learn(data, min_instances_slice, feature_names, domains, featureTypes):
     spn = SPN.LearnStructure(data, featureTypes=featureTypes, row_split_method=Splitting.KmeansRows(), col_split_method=Splitting.RDCTest(threshold=0.1, linear=True),
                             featureNames=feature_names,
                             domains=domains,
                              # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
                              # domains, families=families, row_split_method=Splitting.KmeansRows(),
                              # col_split_method=Splitting.RDCTest(),
                              min_instances_slice=min_instances_slice)
     return spn
Example #5
0
def test2(data, features):
    arc, mixt = getArchetypes(data, 3)

    print(mixt)

    0 / 0

    spn = SPN.LearnStructure(
        mixt,
        featureTypes=["continuous"] * mixt.shape[1],
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
        # domains, families=families, row_split_method=Splitting.KmeansRows(),
        # col_split_method=Splitting.RDCTest(),
        min_instances_slice=100)
Example #6
0
def test8():
    from tensorflow.examples.tutorials.mnist import input_data
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=False)
    data, target = mnist.train.images, mnist.train.labels

    featureTypes = ["continuous"] * data.shape[1] + ["categorical"]

    featureNames = ["pixel"] * data.shape[1] + ["label"]

    data = numpy.hstack((data, target.reshape(data.shape[0], 1)))
    print(featureTypes)
    print(data.shape)

    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.4),
                             min_instances_slice=500,
                             cluster_first=True)
    # RDCTestOHEpy

    print("learned")

    spn.root.validate()

    data, target = mnist.test.images, mnist.test.labels

    data = numpy.hstack((data, target.reshape(data.shape[0], 1)))

    classes = numpy.unique(target)
    results = numpy.zeros((data.shape[0], len(classes)))

    print("testing")
    # print(spn)
    for c in classes:
        data[:, -1] = c
        results[:, c] = spn.root.eval(data)

    print("done")

    predictions = numpy.argmax(results, axis=1)

    print('MAP accuracy : ', accuracy_score(target, predictions))
Example #7
0
'''
import numpy

from tfspn.SPN import SPN, Splitting
import tensorflow as tf

if __name__ == '__main__':

    gen = numpy.random.poisson(5, 1000)

    data = numpy.transpose(numpy.vstack((gen, gen)))

    spn = SPN.LearnStructure(data,
                             min_instances_slice=200,
                             families=["poisson", "poisson"],
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.IndependenceTest())

    #    THIS PRODUCES THE FOLLOWING SPN:
    #
    #     SumNode_0 SumNode(0.154*ProductNode_3, 0.188*ProductNode_6, 0.158*ProductNode_10, 0.076*ProductNode_13, 0.13999999999999999*ProductNode_18, 0.176*ProductNode_21, 0.108*ProductNode_24){
    #     ProductNode_3 ProductNode(PoissonNode_4, PoissonNode_5){
    #         PoissonNode_4 P(X_0_|λ=6.0)
    #         PoissonNode_5 P(X_1_|λ=6.0)
    #         }
    #     ProductNode_6 ProductNode(PoissonNode_7, PoissonNode_8){
    #         PoissonNode_7 P(X_0_|λ=5.0)
    #         PoissonNode_8 P(X_1_|λ=5.0)
    #         }
    #     ProductNode_10 ProductNode(PoissonNode_11, PoissonNode_12){
    #         PoissonNode_11 P(X_0_|λ=7.360759493670886)
Example #8
0
def test3():
    numpy.random.seed(42)
    dsname, data, featureNames, featureTypes, doms = getAdult()

    doctorateVal = numpy.where(doms[2] == "Doctorate")[0][0]
    stategovVal = numpy.where(doms[1] == "State-gov")[0][0]

    print(featureNames)

    print(len(featureNames))

    print(data[0, :])
    print(data.shape)
    print(doctorateVal, stategovVal)

    pD = numpy.sum(data[:, 2] == doctorateVal) / data.shape[0]
    pSD = numpy.sum(
        numpy.logical_and(data[:, 2] == doctorateVal, data[:, 1]
                          == stategovVal)) / data.shape[0]
    pS = numpy.sum(data[:, 1] == stategovVal) / data.shape[0]

    print("pD", pD)
    print("pSD", pSD)
    pS_D = pSD / pD
    print("pS_D", pS_D)

    # spn = SPN.LearnStructure(data, featureTypes=featureTypes, featureNames=featureNames, row_split_method=Splitting.KmeansRows(), col_split_method=Splitting.IndependenceTest(alpha=0.01),
    # spn = SPN.LearnStructure(data, featureTypes=featureTypes,
    # featureNames=featureNames, row_split_method=Splitting.KmeansRows(),
    # col_split_method=Splitting.RDCTest(),
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.3),
                             min_instances_slice=3,
                             cluster_first=True)

    spn.root.validate()

    print("SPN Learned")
    margSPN_SD = spn.root.marginalizeOut(
        [0, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13])
    margSPN_SD.Prune()

    print(margSPN_SD)

    dataSD = numpy.zeros_like(data[0, :]).reshape(1, data.shape[1])
    dataSD[0, 1] = stategovVal
    dataSD[0, 2] = doctorateVal
    print(dataSD)

    spnSD = (numpy.exp(margSPN_SD.eval(dataSD)))

    margSPN_D = spn.root.marginalizeOut(
        [0, 1, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13])
    margSPN_D.Prune()

    print(margSPN_D)

    dataD = numpy.zeros_like(data[0, :]).reshape(1, data.shape[1])
    dataD[0, 2] = doctorateVal
    print(dataD)

    spnD = (numpy.exp(margSPN_D.eval(dataD)))

    print("pD", pD)
    print("pS", pS)
    print("pSD", pSD)
    pS_D = pSD / pD
    print("pS_D", pS_D)

    print("spn pD", spnD)
    print("spn pSD", spnSD)
    spnS_D = spnSD / spnD
    print("spn pS_D", spnS_D)

    print("doctorateVal", doctorateVal)
    print("stategovVal", stategovVal)

    ll = spn.root.eval(data)

    # print("Probs", numpy.exp(ll))
    print("Sum LL", numpy.sum(ll))
Example #9
0
def test7():
    numpy.random.seed(42)

    D = numpy.loadtxt("bank.csv", delimiter=";", skiprows=0, dtype="S")
    D = numpy.char.strip(D)

    featureNames = [str(f) for f in D[0, :]]
    D = D[1:, :]
    featureTypes = [
        "discrete",
        "categorical",
        "categorical",
        "categorical",
        "continuous",
        "continuous",
        "categorical",
        "categorical",
        "categorical",
        "discrete",
        "categorical",
        "discrete",
        "categorical",
        "continuous",
        "discrete",
        "categorical",
        "categorical",
    ]
    print(len(featureTypes))
    print(len(featureNames))

    def isinteger(x):
        return numpy.all(numpy.equal(numpy.mod(x, 1), 0))

    cols = []
    types = []
    domains = []

    index = [0, 5]

    D = D[:, index]

    for col in range(D.shape[1]):
        b, c = numpy.unique(D[:, col], return_inverse=True)

        try:
            # could convert to float
            if isinteger(b.astype(float)):
                # was integer
                cols.append(D[:, col].astype(int))
                types.append("discrete")
                domains.append(b.astype(int))
                continue

            # was float
            cols.append(D[:, col].astype(float))
            types.append("continuous")
            domains.append(b.astype(float))

            continue
        except:
            # was discrete
            cols.append(c)
            types.append("categorical")
            domains.append(b.astype(str))

    data = numpy.column_stack(cols)
    print(featureNames)

    print(domains)
    featureNames = [featureNames[i] for i in index]
    print(featureNames)
    print(types)

    data[:, 1] = numpy.sign(data[:, 1]) * numpy.log(numpy.abs(data[:, 1]) + 1)

    # spn = SPN.LearnStructure(data, featureTypes=featureTypes, featureNames=featureNames, row_split_method=Splitting.KmeansRows(), col_split_method=Splitting.IndependenceTest(alpha=0.01),
    # spn = SPN.LearnStructure(data, featureTypes=featureTypes,
    # featureNames=featureNames, row_split_method=Splitting.KmeansRows(),
    # col_split_method=Splitting.RDCTest(),
    spn = SPN.LearnStructure(
        data,
        featureTypes=types,
        featureNames=featureNames,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.RDCTest(threshold=0.000001),
        min_instances_slice=1000,
        cluster_first=False)
    # RDCTestOHEpy

    spn.root.validate()

    print(spn)

    spn.save_pdf_graph("bank.pdf")

    ll = spn.root.eval(data)

    from matplotlib.collections import PolyCollection
    from matplotlib.colors import colorConverter
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec

    for i in [0, 1]:

        x = numpy.sort(data[:, i]).reshape(data.shape[0], 1)

        fig = plt.figure(figsize=(8, 8))
        x1 = numpy.zeros_like(data)
        x1[:, i] = x[:, 0]

        color_idx = numpy.linspace(0, 1, len(spn.root.children))

        for cidx, c in enumerate(spn.root.children):

            y = numpy.exp(c.children[i].eval(x1))

            plt.plot(x, y, '--', color=plt.cm.cool(color_idx[cidx]))

    plt.show()

    # print("Probs", numpy.exp(ll))
    print("Sum LL", numpy.sum(ll))
Example #10
0
def test6():
    numpy.random.seed(42)

    datablocks = []

    yd = [0, 1, 2, 3]
    xd = [0, 1]

    for x in xd:
        for y in yd:
            block = numpy.zeros((2000, 3))
            block[:, 1] = x
            block[:, 2] = y
            if (x == 1 and y == 0) or (x == 0 and y == 1) or (
                    x == 1 and y == 2) or (x == 0 and y == 3):
                block[:, 0] = numpy.random.normal(200, 30, block.shape[0])
            else:
                block[:, 0] = numpy.random.normal(100, 30, block.shape[0])

            datablocks.append(block)

    data = numpy.vstack(datablocks)

    print(data.shape)

    featureNames = ["Gaussian", "Categorical", "Discrete"]
    featureTypes = ["continuous", "categorical", "discrete"]

    # spn = SPN.LearnStructure(data, featureTypes=featureTypes, featureNames=featureNames, row_split_method=Splitting.KmeansRows(), col_split_method=Splitting.IndependenceTest(alpha=0.01),
    # spn = SPN.LearnStructure(data, featureTypes=featureTypes,
    # featureNames=featureNames, row_split_method=Splitting.KmeansRows(),
    # col_split_method=Splitting.RDCTest(),
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTestOHEpy(),
                             min_instances_slice=50,
                             cluster_first=True)

    spn.root.validate()

    from matplotlib.collections import PolyCollection
    from matplotlib.colors import colorConverter
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec

    gs = gridspec.GridSpec(len(xd), len(yd))

    fig = plt.figure(figsize=(8, 8))

    xall = numpy.arange(0, 300, 0.5)
    i = 0
    for x in xd:
        for y in yd:
            testdata = numpy.zeros((len(xall), 3))
            testdata[:, 0] = xall
            testdata[:, 1] = x
            testdata[:, 2] = y

            pbs = numpy.zeros_like(xall)

            pbs[:] = numpy.exp(spn.root.eval(testdata))

            ax = plt.Subplot(fig, gs[i])
            i += 1

            ax.set_title('%s %s' % (x, y))
            ax.plot(xall, pbs, 'r--')

            fig.add_subplot(ax)

    plt.show()

    ll = spn.root.eval(data)

    print("Sum LL", numpy.sum(ll))
Example #11
0
def test5():
    numpy.random.seed(42)

    data = numpy.zeros((5000, 2))

    idx = numpy.random.choice(data.shape[0],
                              int(data.shape[0] / 2),
                              replace=False)

    data[idx, 1] = 1

    idx0 = data[:, 1] == 0
    idx1 = data[:, 1] == 1

    data[idx0, 0] = numpy.random.normal(100, 30, numpy.sum(idx0))

    data[idx1, 0] = numpy.random.normal(200, 30, numpy.sum(idx1))

    print(data)

    featureNames = ["Gaussian", "Categorical"]
    featureTypes = ["continuous", "discrete"]

    # spn = SPN.LearnStructure(data, featureTypes=featureTypes, featureNames=featureNames, row_split_method=Splitting.KmeansRows(), col_split_method=Splitting.IndependenceTest(alpha=0.01),
    # spn = SPN.LearnStructure(data, featureTypes=featureTypes,
    # featureNames=featureNames, row_split_method=Splitting.KmeansRows(),
    # col_split_method=Splitting.RDCTest(),
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTestOHEpy(),
                             min_instances_slice=500,
                             cluster_first=True)

    spn.root.validate()

    from mpl_toolkits.mplot3d import Axes3D
    from matplotlib.collections import PolyCollection
    from matplotlib.colors import colorConverter
    import matplotlib.pyplot as plt
    import numpy as np

    fig = plt.figure()
    ax = fig.gca(projection='3d')

    cc = lambda arg: colorConverter.to_rgba(arg, alpha=0.6)

    xs = np.arange(0, 300, 0.5)
    verts = []
    zs = [0, 1]

    maxys = 0
    for z in zs:
        testdata = numpy.zeros((len(xs), len(zs)))
        testdata[:, 0] = xs
        testdata[:, 1] = z

        ys = numpy.zeros_like(xs)

        ys[:] = numpy.exp(spn.root.eval(testdata))

        maxys = max(maxys, numpy.max(ys))

        ys[0], ys[-1] = 0, 0
        verts.append(list(zip(xs, ys)))

    poly = PolyCollection(verts, facecolors=[cc('r'), cc('g')])
    poly.set_alpha(0.7)
    ax.add_collection3d(poly, zs=zs, zdir='y')

    ax.set_xlabel('X')
    ax.set_xlim3d(0, 300)
    ax.set_ylabel('Y')
    ax.set_ylim3d(-1, 1)
    ax.set_zlabel('Z')
    ax.set_zlim3d(0, maxys)

    plt.show()

    ll = spn.root.eval(data)

    print("Sum LL", numpy.sum(ll))