def test1(data, features):
    """Learn SPNs on archetype mixtures of `data` with 10-fold CV and log stats.

    NOTE(review): `dsname` is a free variable here (not a parameter or local) —
    presumably a module-level global; confirm it is defined before running.
    """
    # Keep only columns 1..19 and the matching feature names.
    data = data[:, 1:20]
    features = features[0:data.shape[1]]
    # Decompose into 3 archetypes; `mixt` holds the per-row mixture coefficients.
    arcs, mixt = getArchetypes(data, 3)
    nrfolds = 10
    stats = Stats(name=dsname)
    for train, test, i in kfolded(mixt, nrfolds):
        c = Chrono().start()
        spn = SPN.LearnStructure(
            train,
            featureTypes=["continuous"] * train.shape[1],
            row_split_method=Splitting.KmeansRows(),
            col_split_method=Splitting.RDCTest(threshold=0.3),
            # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
            # domains, families=families, row_split_method=Splitting.KmeansRows(),
            # col_split_method=Splitting.RDCTest(),
            min_instances_slice=100)
        c.end()
        spn.root.validate()
        # Mean per-instance test log-likelihood for this fold.
        ll = numpy.mean(spn.root.eval(test))
        print(ll)
        stats.add("HSPN", Stats.LOG_LIKELIHOOD, ll)
        stats.add("HSPN", Stats.TIME, c.elapsed())
    # Persist accumulated fold statistics once all folds are done.
    stats.save("stats_" + dsname + ".json")
    print(arcs)
def test2():
    """Learn an SPN on the diabetes dataset and print its summed log-likelihood.

    Side effects only (prints); relies on `getDiabetes`, `SPN` and `Splitting`
    from the enclosing module's imports.
    """
    numpy.random.seed(42)
    dsname, data, labels, classes, families = getDiabetes()
    labels = [l for l in labels]
    print(data.shape)
    print(data)
    # NOTE(review): an all-discrete/continuous type list was assigned here and
    # immediately overwritten; the dead store was removed. The second feature
    # is modeled as categorical, the rest as continuous.
    featureTypes = [
        'continuous', 'categorical', 'continuous', 'continuous', 'continuous',
        'continuous', 'continuous', 'continuous', 'continuous'
    ]
    # families[0] = 'bernoulli'
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.3),
                             min_instances_slice=50,
                             cluster_first=False)
    print(spn)
    # print(numpy.unique(data))
    # Per-instance log-likelihood of the training data under the learned SPN.
    ll = spn.root.eval(data)
    print("Sum LL", numpy.sum(ll))
def test1():
    """Learn a categorical SPN on 1-D Poisson(5) samples and inspect the fit.

    Prints the model, its probabilities on values 0..12, and a histogram of
    the raw data for visual comparison.
    """
    numpy.random.seed(42)
    data = numpy.random.poisson(5, 1000).reshape(1000, 1)
    # Empirical frequency of every observed value.
    for i in numpy.unique(data):
        print(i, numpy.sum(data == i))
    # NOTE(review): an immediately-overwritten featureTypes = ["discrete"]
    # dead store was removed; the variable is modeled as categorical.
    featureTypes = ["categorical"]
    spn = SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.IndependenceTest(),
        min_instances_slice=100)
    print(spn)
    print(numpy.unique(data))
    # Evaluate the learned density on the integers 0..12.
    evdata = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    print(evdata)
    ll = (spn.root.eval(numpy.asarray(evdata).reshape(len(evdata), 1)))
    print("Probs", numpy.exp(ll))
    print("Sum LL", numpy.sum(ll))
    print(numpy.histogram(data, bins="auto", density=True))
def learn():
    """Learn an SPN with Gower row splits and a linear RDC column test.

    NOTE(review): `data` is a free variable (not a parameter) — it must be
    defined in the enclosing scope; confirm before reuse.
    """
    spn = SPN.LearnStructure(
        data,
        featureTypes=["discrete"] * data.shape[1],
        row_split_method=Splitting.Gower(),
        col_split_method=Splitting.RDCTest(threshold=0.3, linear=True),
        # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
        # domains, families=families, row_split_method=Splitting.KmeansRows(),
        # col_split_method=Splitting.RDCTest(),
        min_instances_slice=200)
    return spn
def learn(data):
    """Learn an SPN over `data` with Gower row splits and RDC column test.

    NOTE(review): `featureTypes` and `domains` are free variables — they must
    be defined in the enclosing scope; confirm before reuse.
    """
    spn = SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        row_split_method=Splitting.Gower(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        domains=domains,
        alpha=1,
        # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
        # domains, families=families, row_split_method=Splitting.KmeansRows(),
        # col_split_method=Splitting.RDCTest(),
        min_instances_slice=50)
    return spn
def learn():
    """Learn an SPN over topic data with Gower rows and a linear RDC test.

    NOTE(review): `traintopics`, `featureTypes`, `featureNames` and `domains`
    are all free variables — they must be defined in the enclosing scope.
    """
    spn = SPN.LearnStructure(
        traintopics,
        featureTypes=featureTypes,
        row_split_method=Splitting.Gower(),
        col_split_method=Splitting.RDCTest(threshold=0.1, linear=True),
        featureNames=featureNames,
        domains=domains,
        # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
        # domains, families=families, row_split_method=Splitting.KmeansRows(),
        # col_split_method=Splitting.RDCTest(),
        min_instances_slice=100)
    return spn
def learn(data):
    """Learn a histogram-leaf SPN over `data` with k-means/RDC row splits.

    NOTE(review): `featureTypes`, `domains` and `min_instances_slice` are free
    variables — they must be defined in the enclosing scope.
    """
    spn = SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        row_split_method=Splitting.KmeansRDCRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        domains=domains,
        alpha=0.1,
        # Every feature gets a histogram leaf family.
        families=['histogram'] * data.shape[1],
        # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
        # domains, families=families, row_split_method=Splitting.KmeansRows(),
        # col_split_method=Splitting.RDCTest(),
        min_instances_slice=min_instances_slice)
    return spn
def learn(data, featureTypes, families, domains, min_instances_slice,
          alpha=0.1):
    """Learn an SPN structure over `data`.

    Rows are clustered with the k-means/RDC method; columns are split by an
    RDC independence test with threshold 0.3. `alpha` is the smoothing
    parameter forwarded to structure learning.
    """
    row_split = Splitting.KmeansRDCRows()
    col_split = Splitting.RDCTest(threshold=0.3)
    return SPN.LearnStructure(data,
                              alpha=alpha,
                              featureTypes=featureTypes,
                              row_split_method=row_split,
                              col_split_method=col_split,
                              domains=domains,
                              families=families,
                              min_instances_slice=min_instances_slice)
def estimate_density(self, training_data, validation_data=None):
    """Fit a MSPN on the training data. The variable validation_data is never used.

    Feature types/names are taken from `training_data.features`; splitting
    methods and remaining hyper-parameters come from `self.learner_args`.
    The learned model is stored on `self.spn` (no return value).
    """
    feature_types = []
    feature_names = []
    families = []
    for feat, str_type in training_data.features:
        feature_types.append(str_type)
        feature_names.append(feat.symbol_name())
        # Leaf family: explicit 'leaf' arg wins, otherwise map from the
        # feature's symbol type via the class-level table.
        if 'leaf' in self.learner_args:
            families.append(self.learner_args['leaf'])
        else:
            families.append(MSPNLearner.SPN_feat_fams[feat.symbol_type()])
    # Row-split strategy: 'gower' or 'rdc-kmeans'; default is rdc-kmeans.
    if 'row_split' in self.learner_args:
        if self.learner_args['row_split'] == 'gower':
            row_split_method = Splitting.Gower(n_clusters=2)
        elif self.learner_args['row_split'] == 'rdc-kmeans':
            row_split_method = Splitting.KmeansRDCRows(n_clusters=2, k=20, OHE=1)
        else:
            raise NotImplementedError()
    else:
        row_split_method = Splitting.KmeansRDCRows(n_clusters=2, k=20, OHE=1)
    col_split_method = Splitting.RDCTest(threshold=0.1, OHE=1, linear=1)
    rand_seed = self.learner_args['seed']
    # Forward everything except the keys consumed above.
    mspnargs = {
        k: v
        for k, v in self.learner_args.items()
        if k not in ['seed', 'leaf', 'row_split']
    }
    # let MSPNs sort this out
    # NOTE(review): the `families` list built above is deliberately discarded
    # here — family selection is delegated to the MSPN learner itself.
    families = None
    self.spn = SPN.LearnStructure(asarray(training_data.data),
                                  feature_types,
                                  families=families,
                                  featureNames=feature_names,
                                  rand_seed=rand_seed,
                                  row_split_method=row_split_method,
                                  col_split_method=col_split_method,
                                  **mspnargs)
def test2(data, features):
    """Compute archetypes of `data`, print the mixture, then deliberately abort.

    The bare `0 / 0` raises ZeroDivisionError on purpose (debug stop);
    everything after it is unreachable. `features` is unused.
    """
    arc, mixt = getArchetypes(data, 3)
    print(mixt)
    # Deliberate crash — halts execution before structure learning.
    0 / 0
    spn = SPN.LearnStructure(
        mixt,
        featureTypes=["continuous"] * mixt.shape[1],
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        # spn = SPN.LearnStructure(data, featureNames=["X1"], domains =
        # domains, families=families, row_split_method=Splitting.KmeansRows(),
        # col_split_method=Splitting.RDCTest(),
        min_instances_slice=100)
def learn(data, featureTypes, families, domains, feature_names,
          min_instances_slice, prior_weight=0.0):
    """Learn and return an SPN structure over `data`.

    Uses k-means/RDC row clustering and an RDC column independence test
    (threshold 0.3); all remaining arguments are forwarded unchanged.
    """
    row_split = Splitting.KmeansRDCRows()
    col_split = Splitting.RDCTest(threshold=0.3)
    spn = SPN.LearnStructure(data,
                             prior_weight=prior_weight,
                             featureTypes=featureTypes,
                             row_split_method=row_split,
                             col_split_method=col_split,
                             domains=domains,
                             families=families,
                             featureNames=feature_names,
                             min_instances_slice=min_instances_slice)
    return spn
def test8():
    """Train an SPN on MNIST (pixels + label column) and report MAP accuracy.

    The class label is appended as the last column; classification is done by
    evaluating the joint for every candidate label and taking the argmax.
    NOTE(review): `accuracy_score` (sklearn) must be imported at module level.
    """
    from tensorflow.examples.tutorials.mnist import input_data
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=False)
    data, target = mnist.train.images, mnist.train.labels
    featureTypes = ["continuous"] * data.shape[1] + ["categorical"]
    featureNames = ["pixel"] * data.shape[1] + ["label"]
    # Append the label as an extra (categorical) column.
    data = numpy.hstack((data, target.reshape(data.shape[0], 1)))
    print(featureTypes)
    print(data.shape)
    spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.4),
                             min_instances_slice=500,
                             cluster_first=True)
    # RDCTestOHEpy
    print("learned")
    spn.root.validate()
    data, target = mnist.test.images, mnist.test.labels
    data = numpy.hstack((data, target.reshape(data.shape[0], 1)))
    classes = numpy.unique(target)
    results = numpy.zeros((data.shape[0], len(classes)))
    print("testing")
    # print(spn)
    # Score every test row under each possible class label.
    for c in classes:
        data[:, -1] = c
        results[:, c] = spn.root.eval(data)
    print("done")
    # MAP prediction: class with the highest joint log-likelihood.
    predictions = numpy.argmax(results, axis=1)
    print('MAP accuracy : ', accuracy_score(target, predictions))
def test6():
    """Learn an SPN on synthetic 3-feature data and plot per-cell densities.

    Builds 8 blocks (x in {0,1} crossed with y in {0..3}); the Gaussian column
    has mean 200 on a checkerboard subset of (x, y) cells and mean 100
    elsewhere. Then plots the learned density over each cell.
    """
    numpy.random.seed(42)
    datablocks = []
    yd = [0, 1, 2, 3]
    xd = [0, 1]
    for x in xd:
        for y in yd:
            block = numpy.zeros((2000, 3))
            block[:, 1] = x
            block[:, 2] = y
            # Checkerboard pattern: these (x, y) cells get the high-mean Gaussian.
            if (x == 1 and y == 0) or (x == 0 and y == 1) or (
                    x == 1 and y == 2) or (x == 0 and y == 3):
                block[:, 0] = numpy.random.normal(200, 30, block.shape[0])
            else:
                block[:, 0] = numpy.random.normal(100, 30, block.shape[0])
            datablocks.append(block)
    data = numpy.vstack(datablocks)
    print(data.shape)
    featureNames = ["Gaussian", "Categorical", "Discrete"]
    featureTypes = ["continuous", "categorical", "discrete"]
    # spn = SPN.LearnStructure(data, featureTypes=featureTypes, featureNames=featureNames, row_split_method=Splitting.KmeansRows(), col_split_method=Splitting.IndependenceTest(alpha=0.01),
    # spn = SPN.LearnStructure(data, featureTypes=featureTypes,
    # featureNames=featureNames, row_split_method=Splitting.KmeansRows(),
    # col_split_method=Splitting.RDCTest(),
    spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTestOHEpy(),
                             min_instances_slice=50,
                             cluster_first=True)
    spn.root.validate()
    from matplotlib.collections import PolyCollection
    from matplotlib.colors import colorConverter
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec
    # One subplot per (x, y) cell, showing p(X0 | x, y) over a fixed grid.
    gs = gridspec.GridSpec(len(xd), len(yd))
    fig = plt.figure(figsize=(8, 8))
    xall = numpy.arange(0, 300, 0.5)
    i = 0
    for x in xd:
        for y in yd:
            testdata = numpy.zeros((len(xall), 3))
            testdata[:, 0] = xall
            testdata[:, 1] = x
            testdata[:, 2] = y
            pbs = numpy.zeros_like(xall)
            pbs[:] = numpy.exp(spn.root.eval(testdata))
            ax = plt.Subplot(fig, gs[i])
            i += 1
            ax.set_title('%s %s' % (x, y))
            ax.plot(xall, pbs, 'r--')
            fig.add_subplot(ax)
    plt.show()
    ll = spn.root.eval(data)
    print("Sum LL", numpy.sum(ll))
''' import numpy from tfspn.SPN import SPN, Splitting import tensorflow as tf if __name__ == '__main__': gen = numpy.random.poisson(5, 1000) data = numpy.transpose(numpy.vstack((gen, gen))) spn = SPN.LearnStructure(data, min_instances_slice=200, families=["poisson", "poisson"], row_split_method=Splitting.KmeansRows(), col_split_method=Splitting.IndependenceTest()) # THIS PRODUCES THE FOLLOWING SPN: # # SumNode_0 SumNode(0.154*ProductNode_3, 0.188*ProductNode_6, 0.158*ProductNode_10, 0.076*ProductNode_13, 0.13999999999999999*ProductNode_18, 0.176*ProductNode_21, 0.108*ProductNode_24){ # ProductNode_3 ProductNode(PoissonNode_4, PoissonNode_5){ # PoissonNode_4 P(X_0_|λ=6.0) # PoissonNode_5 P(X_1_|λ=6.0) # } # ProductNode_6 ProductNode(PoissonNode_7, PoissonNode_8){ # PoissonNode_7 P(X_0_|λ=5.0) # PoissonNode_8 P(X_1_|λ=5.0) # } # ProductNode_10 ProductNode(PoissonNode_11, PoissonNode_12){ # PoissonNode_11 P(X_0_|λ=7.360759493670886)
def test_random_binary_conditioning_split():
    """Exercise the 'Random Binary Conditioning Split' row-clustering function.

    Builds a mixed-type random dataset, slices a row/column subset, applies the
    split, and checks that it yields exactly two clusters that are balanced,
    partition the selected rows, and preserve the selected columns.
    """
    import logging
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    rand_gen = numpy.random.RandomState(1337)
    n_instances = 100
    n_features = 6
    feature_types = [
        "continuous", "categorical", "discrete", "categorical", "continuous",
        "discrete"
    ]
    # One row per feature, then transpose to (n_instances, n_features).
    data = numpy.array([
        rand_gen.randn(n_instances),
        rand_gen.choice(4, size=n_instances),
        rand_gen.choice(numpy.arange(-10, 10), size=n_instances),
        rand_gen.choice(6, size=n_instances),
        rand_gen.randn(n_instances),
        rand_gen.choice(numpy.arange(-20, 20), size=n_instances)
    ]).T
    print('data', data)
    domains = estimate_domains(data, feature_types)
    print('domains', domains)
    families = ['piecewise'] * len(domains)
    names = ['f{}'.format(f) for f in range(len(domains))]
    #
    # subset of features
    rows = rand_gen.choice(n_instances, size=10)
    cols = numpy.array([3, 4, 5])
    data_slice = DataSlice(data,
                           families=families,
                           domains=domains,
                           featureNames=names,
                           featureTypes=feature_types,
                           rows=rows,
                           cols=cols)
    print(data_slice)
    print('filtered data', data_slice.getData())
    config = {'rand_gen': rand_gen}
    clustering_func = Splitting.GetFunction('Random Binary Conditioning Split',
                                            config)
    data_slice_clusters, n_clusters = clustering_func(data_slice)
    print(data_slice_clusters)
    assert n_clusters == 2
    assert len(data_slice_clusters) == 2
    # FIX(review): this balance check was missing its `assert` — the bare
    # comparison was computed and silently discarded.
    assert abs(
        len(data_slice_clusters[0].rows) -
        len(data_slice_clusters[1].rows)) <= 1
    # The two clusters together must cover exactly the selected rows ...
    assert_array_equal(
        numpy.sort(
            numpy.union1d(data_slice_clusters[0].rows,
                          data_slice_clusters[1].rows)), numpy.sort(rows))
    # ... and each cluster keeps the selected columns unchanged.
    assert_array_equal(numpy.sort(cols),
                       numpy.sort(data_slice_clusters[0].cols))
    assert_array_equal(numpy.sort(cols),
                       numpy.sort(data_slice_clusters[1].cols))
# plt.hist(test[:,0], bins=100, histtype='stepfilled', normed=True, color='r', alpha=0.5, label='Uniform') # # plt.show() # print(domains) print(feature_names) print(feature_types) print(train.shape) # spn = SPN.LearnStructure(train, featureNames=feature_names, domains=domains, featureTypes=feature_types, row_split_method=Splitting.RandomPartitionConditioningRows(), col_split_method=Splitting.RDCTestOHEpy(threshold=0.75), # spn = SPN.LearnStructure(train, featureNames=feature_names, domains=domains, featureTypes=feature_types, row_split_method=Splitting.DBScanOHE(eps=1.0, min_samples=2), col_split_method=Splitting.RDCTestOHEpy(threshold=0.75), # spn = SPN.LearnStructure(train, featureNames=feature_names, # domains=domains, featureTypes=feature_types, # row_split_method=Splitting.KmeansOHERows(), # col_split_method=Splitting.RDCTest(threshold=0.75), spn = SPN.LearnStructure(train, featureNames=feature_names, domains=domains, featureTypes=feature_types, row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.05), min_instances_slice=20, cluster_first=True) print(spn) result.append([dsname, numpy.mean(spn.root.eval(train)), numpy.mean( spn.root.eval(valid)), numpy.mean(spn.root.eval(test))]) print("train", numpy.mean(spn.root.eval(train))) print("valid", numpy.mean(spn.root.eval(valid))) print("test", numpy.mean(spn.root.eval(test))) print("train", numpy.min(spn.root.eval(train))) print("valid", numpy.min(spn.root.eval(valid))) print("test", numpy.min(spn.root.eval(test))) print(result)
# NOTE(review): fragment — the opening of the call these keyword arguments
# belong to (and the enclosing definition) is outside this view.
                          row_split_method=row_split_method,
                          col_split_method=col_split_method,
                          domains=domains,
                          families=families,
                          featureNames=feature_names,
                          min_instances_slice=min_instances_slice)
# print("learning")
# Learn a Poisson-leaf SPN on the word-count data.
pspn = learn(train,
             featureTypes=["discrete"] * data.shape[1],
             families=["poisson"] * data.shape[1],
             domains=domains,
             feature_names=words,
             min_instances_slice=200,
             row_split_method=Splitting.KmeansRows(),
             col_split_method=Splitting.IndependenceTest(0.001))
marg = pspn.marginalize([0, 1, 2, 3])
print(marg.toEquation())
print(marg)
# NOTE(review): deliberate debug stop — raises ZeroDivisionError; the
# isotonic-leaf run below never executes.
0 / 0
mspn = learn(train,
             featureTypes=["discrete"] * data.shape[1],
             families=["isotonic"] * data.shape[1],
             domains=domains,
             feature_names=words,
             min_instances_slice=200,
def test3():
    """Sanity-check SPN conditional probabilities on the Adult dataset.

    Compares the empirical P(workclass=State-gov | education=Doctorate)
    against the same quantity obtained from marginalized SPNs.
    """
    numpy.random.seed(42)
    dsname, data, featureNames, featureTypes, doms = getAdult()
    doctorateVal = numpy.where(doms[2] == "Doctorate")[0][0]
    stategovVal = numpy.where(doms[1] == "State-gov")[0][0]
    print(featureNames)
    print(len(featureNames))
    print(data[0, :])
    print(data.shape)
    print(doctorateVal, stategovVal)
    # Empirical probabilities from raw counts.
    pD = numpy.sum(data[:, 2] == doctorateVal) / data.shape[0]
    pSD = numpy.sum(
        numpy.logical_and(data[:, 2] == doctorateVal,
                          data[:, 1] == stategovVal)) / data.shape[0]
    pS = numpy.sum(data[:, 1] == stategovVal) / data.shape[0]
    print("pD", pD)
    print("pSD", pSD)
    pS_D = pSD / pD
    print("pS_D", pS_D)
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.3),
                             min_instances_slice=3,
                             cluster_first=True)
    spn.root.validate()
    print("SPN Learned")
    # Marginalize out everything except columns 1 and 2.
    # FIX(review): index 9 was duplicated in both lists below.
    margSPN_SD = spn.root.marginalizeOut(
        [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
    margSPN_SD.Prune()
    print(margSPN_SD)
    dataSD = numpy.zeros_like(data[0, :]).reshape(1, data.shape[1])
    dataSD[0, 1] = stategovVal
    dataSD[0, 2] = doctorateVal
    print(dataSD)
    spnSD = (numpy.exp(margSPN_SD.eval(dataSD)))
    margSPN_D = spn.root.marginalizeOut(
        [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
    margSPN_D.Prune()
    print(margSPN_D)
    dataD = numpy.zeros_like(data[0, :]).reshape(1, data.shape[1])
    dataD[0, 2] = doctorateVal
    print(dataD)
    spnD = (numpy.exp(margSPN_D.eval(dataD)))
    print("pD", pD)
    print("pS", pS)
    print("pSD", pSD)
    pS_D = pSD / pD
    print("pS_D", pS_D)
    print("spn pD", spnD)
    print("spn pSD", spnSD)
    # Conditional from the SPN marginals: P(S|D) = P(S,D) / P(D).
    spnS_D = spnSD / spnD
    print("spn pS_D", spnS_D)
    print("doctorateVal", doctorateVal)
    print("stategovVal", stategovVal)
    ll = spn.root.eval(data)
    # print("Probs", numpy.exp(ll))
    print("Sum LL", numpy.sum(ll))
def test7():
    """Learn an SPN on two columns of bank.csv and plot per-child leaf densities.

    Columns are auto-typed by attempting numeric conversion: integer-valued ->
    discrete, float -> continuous, non-numeric -> categorical (label-encoded).
    """
    numpy.random.seed(42)
    D = numpy.loadtxt("bank.csv", delimiter=";", skiprows=0, dtype="S")
    D = numpy.char.strip(D)
    featureNames = [str(f) for f in D[0, :]]
    # Drop the header row.
    D = D[1:, :]
    featureTypes = [
        "discrete", "categorical", "categorical", "categorical", "continuous",
        "continuous", "categorical", "categorical", "categorical", "discrete",
        "categorical", "discrete", "categorical", "continuous", "discrete",
        "categorical", "categorical",
    ]
    print(len(featureTypes))
    print(len(featureNames))

    def isinteger(x):
        # True when every value in x is a whole number.
        return numpy.all(numpy.equal(numpy.mod(x, 1), 0))

    cols = []
    types = []
    domains = []
    # Keep only columns 0 (age) and 5 per the original experiment.
    index = [0, 5]
    D = D[:, index]
    for col in range(D.shape[1]):
        b, c = numpy.unique(D[:, col], return_inverse=True)
        # FIX(review): the bare `except:` below was narrowed to ValueError,
        # which is what a failed string->float conversion raises.
        try:
            # could convert to float
            if isinteger(b.astype(float)):
                # was integer
                cols.append(D[:, col].astype(int))
                types.append("discrete")
                domains.append(b.astype(int))
                continue
            # was float
            cols.append(D[:, col].astype(float))
            types.append("continuous")
            domains.append(b.astype(float))
            continue
        except ValueError:
            # was discrete (non-numeric) -> use the label-encoded values
            cols.append(c)
            types.append("categorical")
            domains.append(b.astype(str))
    data = numpy.column_stack(cols)
    print(featureNames)
    print(domains)
    featureNames = [featureNames[i] for i in index]
    print(featureNames)
    print(types)
    # Signed log transform to compress the heavy-tailed second column.
    data[:, 1] = numpy.sign(data[:, 1]) * numpy.log(numpy.abs(data[:, 1]) + 1)
    spn = SPN.LearnStructure(
        data,
        featureTypes=types,
        featureNames=featureNames,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.RDCTest(threshold=0.000001),
        min_instances_slice=1000,
        cluster_first=False)
    # RDCTestOHEpy
    spn.root.validate()
    print(spn)
    spn.save_pdf_graph("bank.pdf")
    ll = spn.root.eval(data)
    from matplotlib.collections import PolyCollection
    from matplotlib.colors import colorConverter
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec
    # For each feature, plot every root child's leaf density over the
    # sorted observed values.
    for i in [0, 1]:
        x = numpy.sort(data[:, i]).reshape(data.shape[0], 1)
        fig = plt.figure(figsize=(8, 8))
        x1 = numpy.zeros_like(data)
        x1[:, i] = x[:, 0]
        color_idx = numpy.linspace(0, 1, len(spn.root.children))
        for cidx, c in enumerate(spn.root.children):
            y = numpy.exp(c.children[i].eval(x1))
            plt.plot(x, y, '--', color=plt.cm.cool(color_idx[cidx]))
        plt.show()
    # print("Probs", numpy.exp(ll))
    print("Sum LL", numpy.sum(ll))
def learn_spn(dataset="data/iris", precision=25, independence=0.1, header=0,
              date=None, isotonic=False, histogram=True, types=False):
    """Load a CSV dataset, infer feature types/families, and learn an SPN.

    When `types` is True, the second CSV row declares per-column types
    ('cat'/'con'/'dis') and is skipped by pandas; otherwise types are derived
    from the pandas dtypes. Categorical columns are label-encoded, datetime
    columns converted to day offsets. Returns (spn, data_dictionary).
    """
    skiprows = [1] if types else []
    df = pd.read_csv(dataset, delimiter=",", header=header, parse_dates=date,
                     skiprows=skiprows)
    df = df.dropna(axis=0, how='any')
    featureNames = df.columns.values.tolist() if header == 0 else [
        "X_{}".format(i) for i in range(len(df.columns))
    ]
    dtypes = df.dtypes
    if types:
        # Types come from the dataset's own declaration row.
        featureTypes = []
        families = []
        with open(dataset, 'r') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=',', quotechar='|')
            csvreader.__next__()
            _types = csvreader.__next__()
            for featureType in _types:
                print(featureType)
                if featureType == 'cat':
                    featureTypes.append('categorical')
                    if histogram:
                        families.append('histogram')
                    elif isotonic:
                        families.append('isotonic')
                    else:
                        families.append('piecewise')
                elif featureType == 'con':
                    featureTypes.append('continuous')
                    families.append('piecewise' if not isotonic else 'isotonic')
                elif featureType == 'dis':
                    featureTypes.append('discrete')
                    families.append('piecewise' if not isotonic else 'isotonic')
                else:
                    featureTypes.append('unknown')
                    families.append('piecewise' if not isotonic else 'isotonic')

    def to_featureTypes(types):
        # Map pandas dtypes to SPN feature types and leaf families.
        featureTypes = []
        families = []
        for featureType in types:
            if featureType.kind == 'O':
                featureTypes.append('categorical')
                if histogram:
                    families.append('histogram')
                elif isotonic:
                    families.append('isotonic')
                else:
                    families.append('piecewise')
            elif featureType.kind == 'f':
                featureTypes.append('continuous')
                families.append('piecewise' if not isotonic else 'isotonic')
            elif featureType.kind in ('i', 'u'):
                # FIX(review): the original compared the str kind code against
                # np.dtype('i') (works only via numpy's reflected equality);
                # use the kind character directly and also accept unsigned
                # integer columns ('u').
                featureTypes.append('discrete')
                families.append('piecewise' if not isotonic else 'isotonic')
            else:
                featureTypes.append('unknown')
                families.append('piecewise' if not isotonic else 'isotonic')
        return featureTypes, families

    if not types:
        featureTypes, families = to_featureTypes(dtypes)
    data_dictionary = {
        'features': [{
            "name": name,
            "family": family,
            "type": typ,
            'pandas_type': dtypes[i]
        } for i, (name, family, typ) in enumerate(
            zip(featureNames, families, featureTypes))],
        'num_entries': len(df)
    }
    # print(df.info())
    idx = df.columns
    for id, name in enumerate(idx):
        if featureTypes[id] == 'categorical':
            # Label-encode categoricals and remember the encoder for decoding.
            lb = LabelEncoder()
            data_dictionary['features'][id]["encoder"] = lb
            df[name] = df[name].astype('category')
            df[name] = lb.fit_transform(df[name])
            data_dictionary['features'][id]["values"] = lb.transform(
                lb.classes_)
        if dtypes[id].kind == 'M':
            # Datetime column -> days since the column's minimum.
            df[name] = (df[name] - df[name].min()) / np.timedelta64(1, 'D')
    # print(df.head())
    data = np.array(df)
    # print(featureTypes)
    spn = SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        featureNames=featureNames,
        min_instances_slice=precision,
        families=families,
        row_split_method=Splitting.KmeansRDCRows(),
        col_split_method=Splitting.RDCTest(threshold=independence))
    spn.name = dataset
    return spn, data_dictionary
def test5():
    """Learn an SPN on a 2-component Gaussian mixture and plot both densities in 3D.

    Column 1 is a binary indicator; column 0 is N(100, 30) when the indicator
    is 0 and N(200, 30) when it is 1.
    """
    numpy.random.seed(42)
    data = numpy.zeros((5000, 2))
    # Randomly assign exactly half the rows to component 1.
    idx = numpy.random.choice(data.shape[0], int(data.shape[0] / 2),
                              replace=False)
    data[idx, 1] = 1
    idx0 = data[:, 1] == 0
    idx1 = data[:, 1] == 1
    data[idx0, 0] = numpy.random.normal(100, 30, numpy.sum(idx0))
    data[idx1, 0] = numpy.random.normal(200, 30, numpy.sum(idx1))
    print(data)
    featureNames = ["Gaussian", "Categorical"]
    featureTypes = ["continuous", "discrete"]
    # spn = SPN.LearnStructure(data, featureTypes=featureTypes, featureNames=featureNames, row_split_method=Splitting.KmeansRows(), col_split_method=Splitting.IndependenceTest(alpha=0.01),
    # spn = SPN.LearnStructure(data, featureTypes=featureTypes,
    # featureNames=featureNames, row_split_method=Splitting.KmeansRows(),
    # col_split_method=Splitting.RDCTest(),
    spn = SPN.LearnStructure(data, featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTestOHEpy(),
                             min_instances_slice=500,
                             cluster_first=True)
    spn.root.validate()
    from mpl_toolkits.mplot3d import Axes3D
    from matplotlib.collections import PolyCollection
    from matplotlib.colors import colorConverter
    import matplotlib.pyplot as plt
    import numpy as np
    fig = plt.figure()
    # NOTE(review): gca(projection='3d') is deprecated in newer matplotlib;
    # works with the versions this script targets.
    ax = fig.gca(projection='3d')
    cc = lambda arg: colorConverter.to_rgba(arg, alpha=0.6)
    xs = np.arange(0, 300, 0.5)
    verts = []
    zs = [0, 1]
    maxys = 0
    # One density polygon per indicator value.
    for z in zs:
        testdata = numpy.zeros((len(xs), len(zs)))
        testdata[:, 0] = xs
        testdata[:, 1] = z
        ys = numpy.zeros_like(xs)
        ys[:] = numpy.exp(spn.root.eval(testdata))
        maxys = max(maxys, numpy.max(ys))
        # Close the polygon at the baseline.
        ys[0], ys[-1] = 0, 0
        verts.append(list(zip(xs, ys)))
    poly = PolyCollection(verts, facecolors=[cc('r'), cc('g')])
    poly.set_alpha(0.7)
    ax.add_collection3d(poly, zs=zs, zdir='y')
    ax.set_xlabel('X')
    ax.set_xlim3d(0, 300)
    ax.set_ylabel('Y')
    ax.set_ylim3d(-1, 1)
    ax.set_zlabel('Z')
    ax.set_zlim3d(0, maxys)
    plt.show()
    ll = spn.root.eval(data)
    print("Sum LL", numpy.sum(ll))
from mlutils.datasets import getCIFAR10
from tfspn.SPN import SPN, Splitting

# NOTE(review): this fragment uses `numpy` without a visible import —
# presumably imported earlier in the file; confirm.
dsname, train, test, labels_train, labels_test = getCIFAR10(grayscale=True)
data = numpy.vstack((train, test))
# Append the training labels as the last (discrete) column.
ds = numpy.hstack((train, labels_train))
domains = [numpy.unique(ds[:, i]) for i in range(ds.shape[1])]
spn = SPN.LearnStructure(
    ds,
    prior_weight=0.0,
    featureTypes=["gaussian"] * train.shape[1] + ["discrete"],
    row_split_method=Splitting.RandomPartitionRows(),
    col_split_method=Splitting.RDCTest(threshold=0.3, OHE=True),
    domains=domains,
    families=["gaussian"] * ds.shape[1],
    min_instances_slice=5000000)
print("learned")
# FIX(review): numpy.hstack takes a single sequence of arrays; the original
# passed two positional arguments, which raises a TypeError. The 0-division
# fills the label column with NaN so mpe_eval treats it as a query variable.
ts = numpy.hstack((test, numpy.zeros_like(labels_test) / 0))
ts = ts[0:10, :]
print(ts[0, :])
predicted_labels = spn.root.mpe_eval(ts)