def test5():
    """Learn an SPN over a two-column data set (Gaussian value, binary label)
    and plot the two label-conditional densities as filled 3-D polygons.

    Data: 5000 rows; exactly half are labeled 1. Label-0 rows draw from
    N(100, 30), label-1 rows from N(200, 30). Prints the data, shows the
    plot, and prints the total log-likelihood of the training data.
    """
    numpy.random.seed(42)  # reproducible sampling and structure learning
    data = numpy.zeros((5000, 2))
    # Pick exactly half of the row indices (without replacement) for label 1.
    idx = numpy.random.choice(data.shape[0], int(data.shape[0] / 2),
                              replace=False)
    data[idx, 1] = 1
    idx0 = data[:, 1] == 0
    idx1 = data[:, 1] == 1
    data[idx0, 0] = numpy.random.normal(100, 30, numpy.sum(idx0))
    data[idx1, 0] = numpy.random.normal(200, 30, numpy.sum(idx1))
    print(data)

    featureNames = ["Gaussian", "Categorical"]
    featureTypes = ["continuous", "discrete"]
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTestOHEpy(),
                             min_instances_slice=500,
                             cluster_first=True)
    spn.root.validate()

    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 — registers the '3d' projection
    from matplotlib.collections import PolyCollection
    from matplotlib.colors import to_rgba
    import matplotlib.pyplot as plt
    import numpy as np

    fig = plt.figure()
    # fig.gca(projection='3d') was removed in Matplotlib 3.6;
    # add_subplot is the supported way to create 3-D axes.
    ax = fig.add_subplot(projection='3d')

    def cc(arg):
        """Color name -> RGBA tuple with fixed alpha for the polygons."""
        return to_rgba(arg, alpha=0.6)

    xs = np.arange(0, 300, 0.5)
    verts = []
    zs = [0, 1]  # the two label values; one polygon each
    maxys = 0
    for z in zs:
        # Evaluate the SPN density along xs with the label fixed to z.
        testdata = numpy.zeros((len(xs), len(zs)))
        testdata[:, 0] = xs
        testdata[:, 1] = z
        ys = numpy.zeros_like(xs)
        ys[:] = numpy.exp(spn.root.eval(testdata))
        maxys = max(maxys, numpy.max(ys))
        ys[0], ys[-1] = 0, 0  # pin both ends to the baseline so the polygon closes
        verts.append(list(zip(xs, ys)))

    poly = PolyCollection(verts, facecolors=[cc('r'), cc('g')])
    poly.set_alpha(0.7)
    ax.add_collection3d(poly, zs=zs, zdir='y')
    ax.set_xlabel('X')
    ax.set_xlim3d(0, 300)
    ax.set_ylabel('Y')
    ax.set_ylim3d(-1, 1)
    ax.set_zlabel('Z')
    ax.set_zlim3d(0, maxys)
    plt.show()

    ll = spn.root.eval(data)
    print("Sum LL", numpy.sum(ll))
def test6():
    """Learn an SPN over (Gaussian, binary, 4-valued) features where the
    Gaussian mean depends XOR-like on the two discrete features, then plot
    the density of each (x, y) cell on a grid of subplots.

    Prints the stacked data shape and the total training log-likelihood.
    """
    numpy.random.seed(42)  # reproducible sampling and structure learning

    yd = [0, 1, 2, 3]
    xd = [0, 1]
    # Cells whose Gaussian is centered at 200 instead of 100.
    shifted = {(1, 0), (0, 1), (1, 2), (0, 3)}

    datablocks = []
    for x in xd:
        for y in yd:
            block = numpy.zeros((2000, 3))
            block[:, 1] = x
            block[:, 2] = y
            mean = 200 if (x, y) in shifted else 100
            block[:, 0] = numpy.random.normal(mean, 30, block.shape[0])
            datablocks.append(block)
    data = numpy.vstack(datablocks)
    print(data.shape)

    featureNames = ["Gaussian", "Categorical", "Discrete"]
    featureTypes = ["continuous", "categorical", "discrete"]
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTestOHEpy(),
                             min_instances_slice=50,
                             cluster_first=True)
    spn.root.validate()

    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec

    # One subplot per (x, y) cell, laid out len(xd) rows x len(yd) cols.
    gs = gridspec.GridSpec(len(xd), len(yd))
    fig = plt.figure(figsize=(8, 8))
    xall = numpy.arange(0, 300, 0.5)
    i = 0
    for x in xd:
        for y in yd:
            # Evaluate the joint density along xall with both discrete
            # features clamped to this cell's values.
            testdata = numpy.zeros((len(xall), 3))
            testdata[:, 0] = xall
            testdata[:, 1] = x
            testdata[:, 2] = y
            pbs = numpy.zeros_like(xall)
            pbs[:] = numpy.exp(spn.root.eval(testdata))
            ax = plt.Subplot(fig, gs[i])
            i += 1
            ax.set_title('%s %s' % (x, y))
            ax.plot(xall, pbs, 'r--')
            fig.add_subplot(ax)
    plt.show()

    ll = spn.root.eval(data)
    print("Sum LL", numpy.sum(ll))
def test4():
    """Sanity-check SPN marginalization on four columns of the Adult data.

    Compares the empirical probabilities P(D), P(S, D) and P(S | D)
    (D = education "Doctorate", S = workclass "State-gov") against the same
    quantities obtained from a learned SPN by marginalizing out every other
    feature. All results are printed; nothing is returned.
    """
    numpy.random.seed(42)  # reproducible structure learning
    dsname, data, featureNames, featureTypes, doms = getAdult()

    # Keep only columns 1-4 of the Adult data set (and their metadata).
    keep = [1, 2, 3, 4]
    data = data[:, keep]
    featureTypes = [featureTypes[i] for i in keep]
    featureNames = [featureNames[i] for i in keep]
    doms = [doms[i] for i in keep]

    # Encoded values of the two categories of interest.
    doctorateVal = numpy.where(doms[1] == "Doctorate")[0][0]
    stategovVal = numpy.where(doms[0] == "State-gov")[0][0]
    print(featureNames)
    print(data[0, :])
    print(doctorateVal, stategovVal)

    # Empirical (ground-truth) probabilities straight from the data.
    pD = numpy.sum(data[:, 1] == doctorateVal) / data.shape[0]
    pSD = numpy.sum(
        numpy.logical_and(data[:, 1] == doctorateVal,
                          data[:, 0] == stategovVal)) / data.shape[0]
    pS = numpy.sum(data[:, 0] == stategovVal) / data.shape[0]
    print("pD", pD)
    print("pSD", pSD)
    pS_D = pSD / pD
    print("pS|D", pS_D)

    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTestOHEpy(),
                             min_instances_slice=100,
                             cluster_first=True)
    spn.root.validate()
    print("SPN Learned")

    # P(S, D): marginalize out everything except features 0 and 1.
    # The original lists contained a duplicated index 9 (typo) — removed.
    # NOTE(review): indices run up to 13 while the data has only 4 columns;
    # presumably they address the SPN's internal (OHE-expanded) scope — confirm.
    margSPN_SD = spn.root.marginalizeOut(list(range(2, 14)))
    margSPN_SD.Prune()
    print(margSPN_SD)
    dataSD = numpy.zeros_like(data[0, :]).reshape(1, data.shape[1])
    dataSD[0, 0] = stategovVal
    dataSD[0, 1] = doctorateVal
    print(dataSD)
    spnSD = numpy.exp(margSPN_SD.eval(dataSD))

    # P(D): additionally marginalize out feature 0.
    margSPN_D = spn.root.marginalizeOut([0] + list(range(2, 14)))
    margSPN_D.Prune()
    print(margSPN_D)
    dataD = numpy.zeros_like(data[0, :]).reshape(1, data.shape[1])
    dataD[0, 1] = doctorateVal
    print(dataD)
    spnD = numpy.exp(margSPN_D.eval(dataD))

    # Side-by-side report: empirical vs. SPN-derived probabilities.
    print("pD", pD)
    print("pS", pS)
    print("pSD", pSD)
    pS_D = pSD / pD
    print("pS_D", pS_D)
    print("spn pD", spnD)
    print("spn pSD", spnSD)
    spnS_D = spnSD / spnD
    print("spn pS_D", spnS_D)
    print("doctorateVal", doctorateVal)
    print("stategovVal", stategovVal)

    ll = spn.root.eval(data)
    print("Sum LL", numpy.sum(ll))