Beispiel #1
0
def test1(data, features):
    """Cross-validate an SPN learned on archetype mixtures of ``data``.

    Columns 1 to 19 of ``data`` are kept, ``getArchetypes`` is run with 3
    archetypes, and an SPN is learned on every training fold of the mixture
    matrix. The mean test log-likelihood and the learning time of each fold
    are added to a ``Stats`` object, which is saved after every fold.

    NOTE(review): ``dsname`` is not defined inside this function; presumably
    it is a module-level name -- confirm.
    """
    data = data[:, 1:20]
    # Trimmed to the number of retained columns; not read again below.
    features = features[:data.shape[1]]

    arcs, mixt = getArchetypes(data, 3)

    nrfolds = 10
    stats = Stats(name=dsname)

    for train, test, _fold in kfolded(mixt, nrfolds):
        timer = Chrono().start()
        spn = SPN.LearnStructure(
            train,
            featureTypes=["continuous"] * train.shape[1],
            row_split_method=Splitting.KmeansRows(),
            col_split_method=Splitting.RDCTest(threshold=0.3),
            min_instances_slice=100)
        timer.end()

        spn.root.validate()
        ll = numpy.mean(spn.root.eval(test))
        print(ll)

        stats.add("HSPN", Stats.LOG_LIKELIHOOD, ll)
        stats.add("HSPN", Stats.TIME, timer.elapsed())
        # Saved inside the loop, so the file on disk is refreshed every fold.
        stats.save("stats_" + dsname + ".json")

    print(arcs)
Beispiel #2
0
def test2():
    """Learn an SPN on the diabetes dataset and print its summed log-likelihood.

    The dataset is loaded with ``getDiabetes``; the second of its nine columns
    is treated as categorical and all others as continuous. The learned SPN
    is printed and then evaluated on the training data itself.
    """
    numpy.random.seed(42)
    dsname, data, labels, classes, families = getDiabetes()

    print(data.shape)
    print(data)

    # Nine columns: only column 1 is categorical. (An earlier, immediately
    # overwritten type list and an unused copy of ``labels`` were removed.)
    featureTypes = ['continuous', 'categorical'] + ['continuous'] * 7

    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.3),
                             min_instances_slice=50,
                             cluster_first=False)

    print(spn)

    ll = spn.root.eval(data)

    print("Sum LL", numpy.sum(ll))
Beispiel #3
0
def test1():
    """Learn a single-variable SPN on Poisson samples and print its estimates.

    1000 samples are drawn from Poisson(5) with a fixed seed, the frequency of
    every distinct value is printed, and an SPN with one categorical feature
    is learned. The SPN is then evaluated on the integers 0..12 and the
    resulting probabilities are printed next to a density histogram of the
    data for comparison.
    """
    numpy.random.seed(42)
    data = numpy.random.poisson(5, 1000).reshape(1000, 1)

    for value in numpy.unique(data):
        print(value, numpy.sum(data == value))

    # The feature is modelled as categorical (a dead "discrete" assignment
    # that was overwritten on the next line has been dropped).
    featureTypes = ["categorical"]

    spn = SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.IndependenceTest(),
        min_instances_slice=100)

    print(spn)
    print(numpy.unique(data))

    # Evaluation grid: the integers 0 through 12.
    evdata = list(range(13))
    print(evdata)

    ll = spn.root.eval(numpy.asarray(evdata).reshape(len(evdata), 1))

    print("Probs", numpy.exp(ll))
    print("Sum LL", numpy.sum(ll))
    print(numpy.histogram(data, bins="auto", density=True))
Beispiel #4
0
    def learn():
        """Learn an SPN on ``data`` from the enclosing scope.

        Every column is declared discrete; rows are split with Gower
        clustering and columns with a linear RDC test (threshold 0.3).
        """
        row_splitter = Splitting.Gower()
        col_splitter = Splitting.RDCTest(threshold=0.3, linear=True)
        return SPN.LearnStructure(
            data,
            featureTypes=["discrete"] * data.shape[1],
            row_split_method=row_splitter,
            col_split_method=col_splitter,
            min_instances_slice=200)
Beispiel #5
0
 def learn(data):
     """Learn an SPN on ``data`` with Gower row splits and RDC column splits.

     ``featureTypes`` and ``domains`` are read from the enclosing scope.
     """
     row_splitter = Splitting.Gower()
     col_splitter = Splitting.RDCTest(threshold=0.3)
     return SPN.LearnStructure(
         data,
         featureTypes=featureTypes,
         row_split_method=row_splitter,
         col_split_method=col_splitter,
         domains=domains,
         alpha=1,
         min_instances_slice=50)
Beispiel #6
0
 def learn():
     """Learn an SPN on ``traintopics`` from the enclosing scope.

     ``featureTypes``, ``featureNames`` and ``domains`` are also taken from
     the enclosing scope. Rows are split with Gower clustering and columns
     with a linear RDC test (threshold 0.1).
     """
     row_splitter = Splitting.Gower()
     col_splitter = Splitting.RDCTest(threshold=0.1, linear=True)
     return SPN.LearnStructure(
         traintopics,
         featureTypes=featureTypes,
         row_split_method=row_splitter,
         col_split_method=col_splitter,
         featureNames=featureNames,
         domains=domains,
         min_instances_slice=100)
Beispiel #7
0
 def learn(data):
     """Learn an SPN on ``data`` using histogram leaves for every column.

     ``featureTypes``, ``domains`` and ``min_instances_slice`` are read from
     the enclosing scope.
     """
     row_splitter = Splitting.KmeansRDCRows()
     col_splitter = Splitting.RDCTest(threshold=0.3)
     leaf_families = ['histogram'] * data.shape[1]
     return SPN.LearnStructure(
         data,
         featureTypes=featureTypes,
         row_split_method=row_splitter,
         col_split_method=col_splitter,
         domains=domains,
         alpha=0.1,
         families=leaf_families,
         min_instances_slice=min_instances_slice)
Beispiel #8
0
def learn(data,
          featureTypes,
          families,
          domains,
          min_instances_slice,
          alpha=0.1):
    """Learn an SPN with KmeansRDCRows row splits and RDC column splits.

    All arguments are forwarded unchanged to ``SPN.LearnStructure``; the
    column split uses an RDC threshold of 0.3.

    Returns:
        The object returned by ``SPN.LearnStructure``.
    """
    row_splitter = Splitting.KmeansRDCRows()
    col_splitter = Splitting.RDCTest(threshold=0.3)
    return SPN.LearnStructure(
        data,
        alpha=alpha,
        featureTypes=featureTypes,
        row_split_method=row_splitter,
        col_split_method=col_splitter,
        domains=domains,
        families=families,
        min_instances_slice=min_instances_slice)
    def estimate_density(self, training_data, validation_data=None):
        """Fit an MSPN on the training data and store it in ``self.spn``.

        ``validation_data`` is accepted but never used. Options are read from
        ``self.learner_args``: ``'seed'`` (required), and optionally
        ``'row_split'`` (``'gower'`` or ``'rdc-kmeans'``) and ``'leaf'``; all
        remaining entries are forwarded to ``SPN.LearnStructure``.

        Raises:
            NotImplementedError: if ``'row_split'`` is given with any other
                value.
        """
        feature_types, feature_names, families = [], [], []
        for feat, str_type in training_data.features:
            feature_types.append(str_type)
            feature_names.append(feat.symbol_name())
            if 'leaf' in self.learner_args:
                leaf_family = self.learner_args['leaf']
            else:
                leaf_family = MSPNLearner.SPN_feat_fams[feat.symbol_type()]
            families.append(leaf_family)

        if 'row_split' not in self.learner_args:
            # Default row clustering when the caller did not choose one.
            row_split_method = Splitting.KmeansRDCRows(n_clusters=2,
                                                       k=20,
                                                       OHE=1)
        else:
            requested = self.learner_args['row_split']
            if requested == 'gower':
                row_split_method = Splitting.Gower(n_clusters=2)
            elif requested == 'rdc-kmeans':
                row_split_method = Splitting.KmeansRDCRows(n_clusters=2,
                                                           k=20,
                                                           OHE=1)
            else:
                raise NotImplementedError()

        col_split_method = Splitting.RDCTest(threshold=0.1, OHE=1, linear=1)

        rand_seed = self.learner_args['seed']
        # Keys consumed above must not be passed on a second time.
        consumed = ['seed', 'leaf', 'row_split']
        mspnargs = {
            key: value
            for key, value in self.learner_args.items()
            if key not in consumed
        }

        # The per-feature families collected above are discarded on purpose:
        # the learner is left to pick them ("let MSPNs sort this out").
        families = None
        self.spn = SPN.LearnStructure(asarray(training_data.data),
                                      feature_types,
                                      families=families,
                                      featureNames=feature_names,
                                      rand_seed=rand_seed,
                                      row_split_method=row_split_method,
                                      col_split_method=col_split_method,
                                      **mspnargs)
Beispiel #10
0
def test2(data, features):
    """Print the archetype mixture matrix of ``data`` and then abort.

    ``features`` is not used. The ``0 / 0`` statement raises
    ``ZeroDivisionError`` right after the print, so the SPN learning code
    below it is never reached as written.
    """
    _archetypes, mixt = getArchetypes(data, 3)

    print(mixt)

    # Hard stop: always raises ZeroDivisionError.
    0 / 0

    n_columns = mixt.shape[1]
    spn = SPN.LearnStructure(
        mixt,
        featureTypes=["continuous"] * n_columns,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        min_instances_slice=100)
Beispiel #11
0
def learn(data,
          featureTypes,
          families,
          domains,
          feature_names,
          min_instances_slice,
          prior_weight=0.0):
    """Learn an SPN with KmeansRDCRows row splits and RDC column splits.

    All arguments are forwarded to ``SPN.LearnStructure`` (``feature_names``
    as ``featureNames``); the column split uses an RDC threshold of 0.3.

    Returns:
        The object returned by ``SPN.LearnStructure``.
    """
    row_splitter = Splitting.KmeansRDCRows()
    col_splitter = Splitting.RDCTest(threshold=0.3)
    spn = SPN.LearnStructure(
        data,
        prior_weight=prior_weight,
        featureTypes=featureTypes,
        row_split_method=row_splitter,
        col_split_method=col_splitter,
        domains=domains,
        families=families,
        featureNames=feature_names,
        min_instances_slice=min_instances_slice)
    return spn
Beispiel #12
0
def test8():
    """Learn an SPN on MNIST pixels plus label and report MAP accuracy.

    The label is appended to the pixel columns as a final categorical
    feature. For prediction, each test row is evaluated once per candidate
    class written into the label column, and the class with the highest
    value returned by ``spn.root.eval`` is chosen.
    """
    from tensorflow.examples.tutorials.mnist import input_data
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=False)

    train_images, train_labels = mnist.train.images, mnist.train.labels
    n_pixels = train_images.shape[1]

    featureTypes = ["continuous"] * n_pixels + ["categorical"]
    featureNames = ["pixel"] * n_pixels + ["label"]

    label_column = train_labels.reshape(train_images.shape[0], 1)
    train = numpy.hstack((train_images, label_column))
    print(featureTypes)
    print(train.shape)

    spn = SPN.LearnStructure(train,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.4),
                             min_instances_slice=500,
                             cluster_first=True)

    print("learned")

    spn.root.validate()

    test_images, target = mnist.test.images, mnist.test.labels
    data = numpy.hstack((test_images,
                         target.reshape(test_images.shape[0], 1)))

    classes = numpy.unique(target)
    results = numpy.zeros((data.shape[0], len(classes)))

    print("testing")
    for c in classes:
        # Overwrite the label column with the candidate class and score it.
        data[:, -1] = c
        results[:, c] = spn.root.eval(data)

    print("done")

    predictions = numpy.argmax(results, axis=1)

    print('MAP accuracy : ', accuracy_score(target, predictions))
Beispiel #13
0
def test6():
    """Learn an SPN on synthetic mixed-type data and plot conditional densities.

    The data has three columns: a Gaussian value (column 0), a binary
    variable ``x`` (column 1) and a four-valued variable ``y`` (column 2).
    For each (x, y) pair 2000 rows are generated; column 0 is drawn from
    N(200, 30) for the pairs listed in the condition below and from
    N(100, 30) otherwise. After learning, the SPN is evaluated along a grid
    of column-0 values for every (x, y) pair and each curve is drawn in its
    own subplot. Finally the summed log-likelihood on the data is printed.
    """
    # Fixed seed: the draws below are consumed in a fixed order.
    numpy.random.seed(42)

    datablocks = []

    yd = [0, 1, 2, 3]
    xd = [0, 1]

    for x in xd:
        for y in yd:
            block = numpy.zeros((2000, 3))
            block[:, 1] = x
            block[:, 2] = y
            # These four (x, y) combinations get the higher mean (200).
            if (x == 1 and y == 0) or (x == 0 and y == 1) or (
                    x == 1 and y == 2) or (x == 0 and y == 3):
                block[:, 0] = numpy.random.normal(200, 30, block.shape[0])
            else:
                block[:, 0] = numpy.random.normal(100, 30, block.shape[0])

            datablocks.append(block)

    data = numpy.vstack(datablocks)

    print(data.shape)

    featureNames = ["Gaussian", "Categorical", "Discrete"]
    featureTypes = ["continuous", "categorical", "discrete"]

    # Alternative configurations that were tried (kept for reference):
    # spn = SPN.LearnStructure(data, featureTypes=featureTypes, featureNames=featureNames, row_split_method=Splitting.KmeansRows(), col_split_method=Splitting.IndependenceTest(alpha=0.01),
    # spn = SPN.LearnStructure(data, featureTypes=featureTypes,
    # featureNames=featureNames, row_split_method=Splitting.KmeansRows(),
    # col_split_method=Splitting.RDCTest(),
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTestOHEpy(),
                             min_instances_slice=50,
                             cluster_first=True)

    spn.root.validate()

    # NOTE(review): PolyCollection and colorConverter are imported but not
    # used in this function.
    from matplotlib.collections import PolyCollection
    from matplotlib.colors import colorConverter
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec

    # One grid cell per (x, y) combination.
    gs = gridspec.GridSpec(len(xd), len(yd))

    fig = plt.figure(figsize=(8, 8))

    # Grid of column-0 values at which the SPN is evaluated.
    xall = numpy.arange(0, 300, 0.5)
    i = 0
    for x in xd:
        for y in yd:
            testdata = numpy.zeros((len(xall), 3))
            testdata[:, 0] = xall
            testdata[:, 1] = x
            testdata[:, 2] = y

            pbs = numpy.zeros_like(xall)

            # eval() output is exponentiated before plotting.
            pbs[:] = numpy.exp(spn.root.eval(testdata))

            ax = plt.Subplot(fig, gs[i])
            i += 1

            ax.set_title('%s %s' % (x, y))
            ax.plot(xall, pbs, 'r--')

            fig.add_subplot(ax)

    plt.show()

    ll = spn.root.eval(data)

    print("Sum LL", numpy.sum(ll))
Beispiel #14
0
'''
import numpy

from tfspn.SPN import SPN, Splitting
import tensorflow as tf

if __name__ == '__main__':

    gen = numpy.random.poisson(5, 1000)

    data = numpy.transpose(numpy.vstack((gen, gen)))

    spn = SPN.LearnStructure(data,
                             min_instances_slice=200,
                             families=["poisson", "poisson"],
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.IndependenceTest())

    #    THIS PRODUCES THE FOLLOWING SPN:
    #
    #     SumNode_0 SumNode(0.154*ProductNode_3, 0.188*ProductNode_6, 0.158*ProductNode_10, 0.076*ProductNode_13, 0.13999999999999999*ProductNode_18, 0.176*ProductNode_21, 0.108*ProductNode_24){
    #     ProductNode_3 ProductNode(PoissonNode_4, PoissonNode_5){
    #         PoissonNode_4 P(X_0_|λ=6.0)
    #         PoissonNode_5 P(X_1_|λ=6.0)
    #         }
    #     ProductNode_6 ProductNode(PoissonNode_7, PoissonNode_8){
    #         PoissonNode_7 P(X_0_|λ=5.0)
    #         PoissonNode_8 P(X_1_|λ=5.0)
    #         }
    #     ProductNode_10 ProductNode(PoissonNode_11, PoissonNode_12){
    #         PoissonNode_11 P(X_0_|λ=7.360759493670886)
Beispiel #15
0
def test_random_binary_conditioning_split():
    """Check the 'Random Binary Conditioning Split' row-clustering function.

    A 100x6 mixed-type data set is generated from a seeded ``RandomState``,
    a ``DataSlice`` over 10 sampled rows and columns 3-5 is built, and the
    split function obtained from ``Splitting.GetFunction`` is applied to it.
    The test asserts that two clusters are returned, that the union of their
    rows matches the sampled rows, and that both keep the slice's columns.
    """

    import logging
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    # All randomness comes from this generator, in the order drawn below.
    rand_gen = numpy.random.RandomState(1337)

    n_instances = 100
    n_features = 6
    feature_types = [
        "continuous", "categorical", "discrete", "categorical", "continuous",
        "discrete"
    ]
    # One generated row per feature, transposed to (n_instances, 6).
    data = numpy.array([
        rand_gen.randn(n_instances),
        rand_gen.choice(4, size=n_instances),
        rand_gen.choice(numpy.arange(-10, 10), size=n_instances),
        rand_gen.choice(6, size=n_instances),
        rand_gen.randn(n_instances),
        rand_gen.choice(numpy.arange(-20, 20), size=n_instances)
    ]).T

    print('data', data)

    domains = estimate_domains(data, feature_types)
    print('domains', domains)

    families = ['piecewise'] * len(domains)
    names = ['f{}'.format(f) for f in range(len(domains))]

    #
    # subset of features
    rows = rand_gen.choice(n_instances, size=10)
    cols = numpy.array([3, 4, 5])
    data_slice = DataSlice(data,
                           families=families,
                           domains=domains,
                           featureNames=names,
                           featureTypes=feature_types,
                           rows=rows,
                           cols=cols)
    print(data_slice)
    print('filtered data', data_slice.getData())

    # The split function draws from the same generator via its config.
    config = {'rand_gen': rand_gen}
    clustering_func = Splitting.GetFunction('Random Binary Conditioning Split',
                                            config)

    data_slice_clusters, n_clusters = clustering_func(data_slice)
    print(data_slice_clusters)

    assert n_clusters == 2
    assert len(data_slice_clusters) == 2
    # NOTE(review): this comparison is evaluated but its result is discarded
    # (there is no `assert`), so cluster balance is not actually checked --
    # confirm whether an assertion was intended.
    abs(len(data_slice_clusters[0].rows) -
        len(data_slice_clusters[1].rows)) <= 1
    # Together the two clusters must cover the sampled rows.
    assert_array_equal(
        numpy.sort(
            numpy.union1d(data_slice_clusters[0].rows,
                          data_slice_clusters[1].rows)), numpy.sort(rows))
    # Row splitting must leave the column set of each cluster untouched.
    assert_array_equal(numpy.sort(cols),
                       numpy.sort(data_slice_clusters[0].cols))
    assert_array_equal(numpy.sort(cols),
                       numpy.sort(data_slice_clusters[1].cols))
Beispiel #16
0
    # NOTE(review): fragment -- the enclosing function/loop header and the
    # definitions of train/valid/test, feature_names, feature_types, domains,
    # dsname and result are outside this view.
    # plt.hist(test[:,0], bins=100, histtype='stepfilled', normed=True, color='r', alpha=0.5, label='Uniform')
    #
    # plt.show()

    # print(domains)
    print(feature_names)
    print(feature_types)
    print(train.shape)

    # Alternative split configurations that were tried (kept for reference):
    # spn = SPN.LearnStructure(train, featureNames=feature_names, domains=domains,  featureTypes=feature_types, row_split_method=Splitting.RandomPartitionConditioningRows(), col_split_method=Splitting.RDCTestOHEpy(threshold=0.75),
    # spn = SPN.LearnStructure(train, featureNames=feature_names, domains=domains,  featureTypes=feature_types, row_split_method=Splitting.DBScanOHE(eps=1.0, min_samples=2), col_split_method=Splitting.RDCTestOHEpy(threshold=0.75),
    # spn = SPN.LearnStructure(train, featureNames=feature_names,
    # domains=domains,  featureTypes=feature_types,
    # row_split_method=Splitting.KmeansOHERows(),
    # col_split_method=Splitting.RDCTest(threshold=0.75),
    spn = SPN.LearnStructure(train, featureNames=feature_names, domains=domains,  featureTypes=feature_types, row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.05),
                             min_instances_slice=20,  cluster_first=True)

    print(spn)

    # Record the mean of spn.root.eval on each split for this dataset.
    result.append([dsname, numpy.mean(spn.root.eval(train)), numpy.mean(
        spn.root.eval(valid)), numpy.mean(spn.root.eval(test))])
    print("train", numpy.mean(spn.root.eval(train)))
    print("valid", numpy.mean(spn.root.eval(valid)))
    print("test", numpy.mean(spn.root.eval(test)))

    # Minimum of spn.root.eval on each split.
    print("train", numpy.min(spn.root.eval(train)))
    print("valid", numpy.min(spn.root.eval(valid)))
    print("test", numpy.min(spn.root.eval(test)))

# Print everything accumulated in `result`.
print(result)
                              row_split_method=row_split_method,
                              col_split_method=col_split_method,
                              domains=domains,
                              families=families,
                              featureNames=feature_names,
                              min_instances_slice=min_instances_slice)


# print("learning")
# Learn an SPN with Poisson leaves, treating every column as discrete.
# NOTE(review): `learn`, `train`, `data`, `domains` and `words` are defined
# outside this view.
pspn = learn(train,
             featureTypes=["discrete"] * data.shape[1],
             families=["poisson"] * data.shape[1],
             domains=domains,
             feature_names=words,
             min_instances_slice=200,
             row_split_method=Splitting.KmeansRows(),
             col_split_method=Splitting.IndependenceTest(0.001))

# Marginalize with the index list [0, 1, 2, 3] and print the result both as
# an equation and in its default string form.
marg = pspn.marginalize([0, 1, 2, 3])

print(marg.toEquation())
print(marg)

# Hard stop: raises ZeroDivisionError, so nothing below this line runs.
0 / 0

mspn = learn(train,
             featureTypes=["discrete"] * data.shape[1],
             families=["isotonic"] * data.shape[1],
             domains=domains,
             feature_names=words,
             min_instances_slice=200,
Beispiel #18
0
def test3():
    """Compare empirical and SPN-estimated P(State-gov | Doctorate) on Adult.

    Empirical frequencies for "Doctorate" (column 2), "State-gov" (column 1)
    and their conjunction are computed from the data. An SPN is then learned
    and two marginalized copies of it are evaluated to obtain the
    corresponding SPN estimates, and both sets of numbers are printed.
    """
    numpy.random.seed(42)
    dsname, data, featureNames, featureTypes, doms = getAdult()

    doctorateVal = numpy.where(doms[2] == "Doctorate")[0][0]
    stategovVal = numpy.where(doms[1] == "State-gov")[0][0]

    print(featureNames)
    print(len(featureNames))
    print(data[0, :])
    print(data.shape)
    print(doctorateVal, stategovVal)

    # Empirical frequencies.
    n_rows = data.shape[0]
    is_doctorate = data[:, 2] == doctorateVal
    is_stategov = data[:, 1] == stategovVal
    pD = numpy.sum(is_doctorate) / n_rows
    pSD = numpy.sum(numpy.logical_and(is_doctorate, is_stategov)) / n_rows
    pS = numpy.sum(is_stategov) / n_rows

    print("pD", pD)
    print("pSD", pSD)
    pS_D = pSD / pD
    print("pS_D", pS_D)

    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.3),
                             min_instances_slice=3,
                             cluster_first=True)

    spn.root.validate()

    print("SPN Learned")

    def marginal_probability(marginalized, evidence):
        """Marginalize, prune, print, and evaluate the SPN on one query row."""
        marg_spn = spn.root.marginalizeOut(marginalized)
        marg_spn.Prune()

        print(marg_spn)

        query = numpy.zeros_like(data[0, :]).reshape(1, data.shape[1])
        for column, value in evidence:
            query[0, column] = value
        print(query)

        return numpy.exp(marg_spn.eval(query))

    spnSD = marginal_probability(
        [0, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13],
        [(1, stategovVal), (2, doctorateVal)])

    spnD = marginal_probability(
        [0, 1, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13],
        [(2, doctorateVal)])

    print("pD", pD)
    print("pS", pS)
    print("pSD", pSD)
    print("pS_D", pS_D)

    print("spn pD", spnD)
    print("spn pSD", spnSD)
    spnS_D = spnSD / spnD
    print("spn pS_D", spnS_D)

    print("doctorateVal", doctorateVal)
    print("stategovVal", stategovVal)

    ll = spn.root.eval(data)

    print("Sum LL", numpy.sum(ll))
Beispiel #19
0
def test7():
    """Learn an SPN on two columns of ``bank.csv`` and plot its leaf densities.

    The file is read as byte strings, the first row is taken as the header,
    and columns 0 and 5 are kept. Each kept column is typed by trying a
    numeric conversion: integral values become "discrete", other numeric
    values "continuous", and anything that cannot be parsed as a number is
    label-encoded as "categorical". Column 1 is then log-transformed, an SPN
    is learned and saved as ``bank.pdf``, and for each of the two columns the
    densities of the root's children are plotted. Finally the summed
    log-likelihood on the data is printed.
    """
    numpy.random.seed(42)

    D = numpy.loadtxt("bank.csv", delimiter=";", skiprows=0, dtype="S")
    D = numpy.char.strip(D)

    # NOTE(review): D holds byte strings, so on Python 3 str(f) yields the
    # repr form ("b'...'") -- confirm whether decoding was intended.
    featureNames = [str(f) for f in D[0, :]]
    D = D[1:, :]
    # Declared types for all 17 columns; only the length is printed below,
    # the types actually used are inferred per column further down.
    featureTypes = [
        "discrete",
        "categorical",
        "categorical",
        "categorical",
        "continuous",
        "continuous",
        "categorical",
        "categorical",
        "categorical",
        "discrete",
        "categorical",
        "discrete",
        "categorical",
        "continuous",
        "discrete",
        "categorical",
        "categorical",
    ]
    print(len(featureTypes))
    print(len(featureNames))

    def isinteger(x):
        return numpy.all(numpy.equal(numpy.mod(x, 1), 0))

    cols = []
    types = []
    domains = []

    # Only these two source columns are modelled.
    index = [0, 5]

    D = D[:, index]

    for col in range(D.shape[1]):
        column = D[:, col]
        b, c = numpy.unique(column, return_inverse=True)

        # Resolve the whole (values, type, domain) triple before appending,
        # so a conversion that fails midway cannot leave the three lists
        # misaligned. Only ValueError (non-numeric text) selects the
        # categorical fallback; other errors propagate instead of being
        # swallowed by a bare except.
        try:
            as_float = b.astype(float)
            if isinteger(as_float):
                values, kind, domain = (column.astype(int), "discrete",
                                        b.astype(int))
            else:
                values, kind, domain = (column.astype(float), "continuous",
                                        as_float)
        except ValueError:
            values, kind, domain = c, "categorical", b.astype(str)

        cols.append(values)
        types.append(kind)
        domains.append(domain)

    data = numpy.column_stack(cols)
    print(featureNames)

    print(domains)
    featureNames = [featureNames[i] for i in index]
    print(featureNames)
    print(types)

    # Signed log transform of the second kept column.
    # NOTE(review): if both columns were parsed as integers, `data` has an
    # integer dtype and this assignment truncates the result -- confirm.
    data[:, 1] = numpy.sign(data[:, 1]) * numpy.log(numpy.abs(data[:, 1]) + 1)

    spn = SPN.LearnStructure(
        data,
        featureTypes=types,
        featureNames=featureNames,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.RDCTest(threshold=0.000001),
        min_instances_slice=1000,
        cluster_first=False)

    spn.root.validate()

    print(spn)

    spn.save_pdf_graph("bank.pdf")

    ll = spn.root.eval(data)

    import matplotlib.pyplot as plt

    for i in [0, 1]:

        x = numpy.sort(data[:, i]).reshape(data.shape[0], 1)

        # A new figure per column; plt.plot below draws on the current one.
        plt.figure(figsize=(8, 8))
        x1 = numpy.zeros_like(data)
        x1[:, i] = x[:, 0]

        color_idx = numpy.linspace(0, 1, len(spn.root.children))

        for cidx, c in enumerate(spn.root.children):

            # Evaluate the i-th child of each root child on the sorted
            # values; assumes every root child has a child per column --
            # TODO confirm.
            y = numpy.exp(c.children[i].eval(x1))

            plt.plot(x, y, '--', color=plt.cm.cool(color_idx[cidx]))

    plt.show()

    print("Sum LL", numpy.sum(ll))
Beispiel #20
0
def learn_spn(dataset="data/iris",
              precision=25,
              independence=0.1,
              header=0,
              date=None,
              isotonic=False,
              histogram=True,
              types=False):
    """Learn an SPN from a CSV file and describe its features.

    Args:
        dataset: Path of the CSV file; also stored as the SPN's ``name``.
        precision: Passed to the learner as ``min_instances_slice``.
        independence: Threshold of the RDC column-split test.
        header: Passed to ``pandas.read_csv``; when it is 0 the column names
            are used as feature names, otherwise names ``X_0``, ``X_1``, ...
            are generated.
        date: Passed to ``pandas.read_csv`` as ``parse_dates``.
        isotonic: Use ``'isotonic'`` leaves instead of ``'piecewise'``.
        histogram: Use ``'histogram'`` leaves for categorical features (takes
            precedence over ``isotonic`` for those features).
        types: When true, the second line of the file holds one type tag per
            column (``cat``, ``con`` or ``dis``); that line is skipped when
            loading the data. When false, feature types are derived from the
            pandas dtypes.

    Returns:
        A tuple ``(spn, data_dictionary)``. ``data_dictionary`` has the keys
        ``'features'`` (one dict per feature with name, family, type, pandas
        dtype and, for categorical features, the fitted encoder and its
        encoded values) and ``'num_entries'`` (number of rows after dropping
        rows with missing values).
    """
    skiprows = [1] if types else []
    df = pd.read_csv(dataset,
                     delimiter=",",
                     header=header,
                     parse_dates=date,
                     skiprows=skiprows)
    df = df.dropna(axis=0, how='any')
    featureNames = df.columns.values.tolist() if header == 0 else [
        "X_{}".format(i) for i in range(len(df.columns))
    ]

    # Snapshot of the dtypes before any column is re-encoded below.
    dtypes = df.dtypes

    def family_for(feature_type):
        """Return the leaf family used for one feature type."""
        if feature_type == 'categorical' and histogram:
            return 'histogram'
        return 'isotonic' if isotonic else 'piecewise'

    if types:
        tag_to_type = {
            'cat': 'categorical',
            'con': 'continuous',
            'dis': 'discrete',
        }
        with open(dataset, 'r', newline='') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=',', quotechar='|')
            next(csvreader)  # first line is skipped
            declared_tags = next(csvreader)  # second line: the type tags
        featureTypes = []
        for tag in declared_tags:
            print(tag)
            featureTypes.append(tag_to_type.get(tag, 'unknown'))
    else:
        # Compare the dtype kind character directly. The previous comparison
        # against np.dtype('i') only matched integer columns because NumPy
        # coerces the kind string back into a dtype during the comparison.
        kind_to_type = {
            'O': 'categorical',
            'f': 'continuous',
            'i': 'discrete',
        }
        featureTypes = [
            kind_to_type.get(dtype.kind, 'unknown') for dtype in dtypes
        ]
    families = [family_for(feature_type) for feature_type in featureTypes]

    data_dictionary = {
        'features': [{
            "name": name,
            "family": family,
            "type": typ,
            # .iloc: dtypes is indexed by column label, so positional access
            # must be explicit.
            'pandas_type': dtypes.iloc[i]
        } for i, (name, family,
                  typ) in enumerate(zip(featureNames, families, featureTypes))],
        'num_entries': len(df)
    }

    for col_idx, name in enumerate(df.columns):
        if featureTypes[col_idx] == 'categorical':
            # Label-encode the column and keep the encoder for later decoding.
            lb = LabelEncoder()
            feature_info = data_dictionary['features'][col_idx]
            feature_info["encoder"] = lb
            df[name] = df[name].astype('category')
            df[name] = lb.fit_transform(df[name])
            feature_info["values"] = lb.transform(lb.classes_)
        if dtypes.iloc[col_idx].kind == 'M':
            # Datetime columns become day offsets from the column minimum.
            df[name] = (df[name] - df[name].min()) / np.timedelta64(1, 'D')

    data = np.array(df)

    spn = SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        featureNames=featureNames,
        min_instances_slice=precision,
        families=families,
        row_split_method=Splitting.KmeansRDCRows(),
        col_split_method=Splitting.RDCTest(threshold=independence))

    spn.name = dataset
    return spn, data_dictionary
Beispiel #21
0
def test5():
    """Learn an SPN on a two-component synthetic data set and plot it in 3D.

    Column 1 is a binary indicator set to 1 for a random half of the 5000
    rows; column 0 is drawn from N(100, 30) where the indicator is 0 and from
    N(200, 30) where it is 1. After learning, the SPN is evaluated along a
    grid of column-0 values for each indicator value and the two curves are
    drawn as filled polygons on 3D axes. Finally the summed log-likelihood on
    the data is printed.
    """
    # Fixed seed: the draws below are consumed in a fixed order.
    numpy.random.seed(42)

    data = numpy.zeros((5000, 2))

    idx = numpy.random.choice(data.shape[0],
                              int(data.shape[0] / 2),
                              replace=False)

    data[idx, 1] = 1

    idx0 = data[:, 1] == 0
    idx1 = data[:, 1] == 1

    data[idx0, 0] = numpy.random.normal(100, 30, numpy.sum(idx0))

    data[idx1, 0] = numpy.random.normal(200, 30, numpy.sum(idx1))

    print(data)

    featureNames = ["Gaussian", "Categorical"]
    featureTypes = ["continuous", "discrete"]

    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTestOHEpy(),
                             min_instances_slice=500,
                             cluster_first=True)

    spn.root.validate()

    # Imported for its side effect: older Matplotlib releases only register
    # the '3d' projection once this module has been imported.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
    from matplotlib.collections import PolyCollection
    from matplotlib.colors import colorConverter
    import matplotlib.pyplot as plt

    fig = plt.figure()
    # Figure.gca() no longer accepts keyword arguments in current Matplotlib,
    # so the 3D axes are created explicitly.
    ax = fig.add_subplot(111, projection='3d')

    def cc(arg):
        """Return ``arg`` as an RGBA colour with alpha 0.6."""
        return colorConverter.to_rgba(arg, alpha=0.6)

    xs = numpy.arange(0, 300, 0.5)
    verts = []
    zs = [0, 1]

    maxys = 0
    for z in zs:
        testdata = numpy.zeros((len(xs), len(zs)))
        testdata[:, 0] = xs
        testdata[:, 1] = z

        ys = numpy.zeros_like(xs)

        ys[:] = numpy.exp(spn.root.eval(testdata))

        maxys = max(maxys, numpy.max(ys))

        # Pin both ends to zero so each polygon closes on the baseline.
        ys[0], ys[-1] = 0, 0
        verts.append(list(zip(xs, ys)))

    poly = PolyCollection(verts, facecolors=[cc('r'), cc('g')])
    poly.set_alpha(0.7)
    ax.add_collection3d(poly, zs=zs, zdir='y')

    ax.set_xlabel('X')
    ax.set_xlim3d(0, 300)
    ax.set_ylabel('Y')
    ax.set_ylim3d(-1, 1)
    ax.set_zlabel('Z')
    ax.set_zlim3d(0, maxys)

    plt.show()

    ll = spn.root.eval(data)

    print("Sum LL", numpy.sum(ll))
Beispiel #22
0
from mlutils.datasets import getCIFAR10
from tfspn.SPN import SPN, Splitting

# Learn a Gaussian-leaf SPN on grayscale CIFAR-10 (pixels plus label column)
# and run MPE evaluation on the first ten test rows.
# NOTE(review): `numpy` is used below but no `import numpy` is visible in
# this snippet -- confirm it is imported elsewhere.
dsname, train, test, labels_train, labels_test = getCIFAR10(grayscale=True)

data = numpy.vstack((train, test))

# Training matrix: pixel columns followed by the label column.
# Assumes labels_train is 2-D with one column -- TODO confirm.
ds = numpy.hstack((train, labels_train))

domains = [numpy.unique(ds[:, i]) for i in range(ds.shape[1])]

spn = SPN.LearnStructure(ds,
                         prior_weight=0.0,
                         featureTypes=["gaussian"] * train.shape[1] +
                         ["discrete"],
                         row_split_method=Splitting.RandomPartitionRows(),
                         col_split_method=Splitting.RDCTest(threshold=0.3,
                                                            OHE=True),
                         domains=domains,
                         families=["gaussian"] * ds.shape[1],
                         min_instances_slice=5000000)

print("learned")

# The test labels are replaced by NaN (presumably so that mpe_eval fills them
# in -- confirm). numpy.hstack takes ONE sequence of arrays; the previous
# call passed the two arrays as separate positional arguments. The NaN column
# is built directly instead of dividing zeros by zero.
ts = numpy.hstack((test, numpy.full(labels_test.shape, numpy.nan)))

ts = ts[0:10, :]

print(ts[0, :])

predicted_labels = spn.root.mpe_eval(ts)