Example #1
0
def test1(data, features):
    """Cross-validate an SPN learned on the archetype mixture of ``data``.

    Columns 1..19 of ``data`` are kept and ``getArchetypes(data, 3)`` is
    called; the second element of its result is handed to
    ``kfolded(..., 10)``. For every fold an SPN is learned on the training
    part, the mean of ``spn.root.eval`` on the test part is printed, and that
    value plus the learning time are stored under the "HSPN" key. The stats
    file is rewritten after every fold.

    Relies on a ``dsname`` name that is defined outside this function.
    """
    data = data[:, 1:20]
    # Trim the feature list to the number of columns that are left.
    features = features[:data.shape[1]]

    archetypes, mixture = getArchetypes(data, 3)

    fold_count = 10
    stats = Stats(name=dsname)

    for train_part, test_part, _fold in kfolded(mixture, fold_count):
        timer = Chrono().start()
        spn = SPN.LearnStructure(
            train_part,
            featureTypes=["continuous"] * train_part.shape[1],
            row_split_method=Splitting.KmeansRows(),
            col_split_method=Splitting.RDCTest(threshold=0.3),
            min_instances_slice=100)
        timer.end()

        spn.root.validate()
        mean_ll = numpy.mean(spn.root.eval(test_part))
        print(mean_ll)

        stats.add("HSPN", Stats.LOG_LIKELIHOOD, mean_ll)
        stats.add("HSPN", Stats.TIME, timer.elapsed())

        # Saved inside the loop so each finished fold is persisted.
        stats.save("stats_" + dsname + ".json")

    print(archetypes)
Example #2
0
def test2():
    """Learn an SPN on the data returned by ``getDiabetes`` and print results.

    The NumPy random seed is fixed to 42. The data shape, the data itself,
    the learned SPN and the sum of ``spn.root.eval(data)`` (printed under the
    label "Sum LL") are written to stdout. Nothing is returned.
    """
    numpy.random.seed(42)
    dsname, data, labels, classes, families = getDiabetes()

    labels = list(labels)

    print(data.shape)

    print(data)
    # Nine entries; only the second one is categorical.
    featureTypes = [
        'continuous', 'categorical', 'continuous', 'continuous', 'continuous',
        'continuous', 'continuous', 'continuous', 'continuous'
    ]

    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.3),
                             min_instances_slice=50,
                             cluster_first=False)

    print(spn)

    ll = spn.root.eval(data)

    print("Sum LL", numpy.sum(ll))
Example #3
0
    def learn():
        """Learn and return an SPN on ``data`` with every column "discrete".

        ``data`` is not a parameter; it is resolved from an outer scope.
        ``row_split_method`` is ``Splitting.Gower()``, ``col_split_method``
        is ``Splitting.RDCTest(threshold=0.3, linear=True)`` and
        ``min_instances_slice`` is 200.
        """
        return SPN.LearnStructure(
            data,
            featureTypes=["discrete"] * data.shape[1],
            row_split_method=Splitting.Gower(),
            col_split_method=Splitting.RDCTest(threshold=0.3, linear=True),
            min_instances_slice=200)
Example #4
0
 def learn():
     """Learn and return an SPN on ``traintopics``.

     ``traintopics``, ``featureTypes``, ``featureNames`` and ``domains`` are
     not parameters; they are resolved from an outer scope.
     """
     settings = dict(
         featureTypes=featureTypes,
         row_split_method=Splitting.Gower(),
         col_split_method=Splitting.RDCTest(threshold=0.1, linear=True),
         featureNames=featureNames,
         domains=domains,
         min_instances_slice=100,
     )
     return SPN.LearnStructure(traintopics, **settings)
Example #5
0
 def learn(data):
     """Learn and return an SPN on ``data`` with ``alpha=1``.

     ``featureTypes`` and ``domains`` are not parameters; they are resolved
     from an outer scope. ``min_instances_slice`` is fixed to 50.
     """
     return SPN.LearnStructure(
         data,
         featureTypes=featureTypes,
         row_split_method=Splitting.Gower(),
         col_split_method=Splitting.RDCTest(threshold=0.3),
         domains=domains,
         alpha=1,
         min_instances_slice=50)
Example #6
0
 def learn(data):
     """Learn and return an SPN on ``data`` with a 'histogram' family per column.

     ``featureTypes``, ``domains`` and ``min_instances_slice`` are not
     parameters; they are resolved from an outer scope. ``alpha`` is 0.1.
     """
     settings = dict(
         featureTypes=featureTypes,
         row_split_method=Splitting.KmeansRDCRows(),
         col_split_method=Splitting.RDCTest(threshold=0.3),
         domains=domains,
         alpha=0.1,
         families=['histogram'] * data.shape[1],
         min_instances_slice=min_instances_slice,
     )
     return SPN.LearnStructure(data, **settings)
Example #7
0
def learn(data,
          featureTypes,
          families,
          domains,
          min_instances_slice,
          alpha=0.1):
    """Learn and return an SPN on ``data``.

    All arguments are forwarded to ``SPN.LearnStructure`` under the same
    names. The row splitter is ``Splitting.KmeansRDCRows()`` and the column
    splitter is ``Splitting.RDCTest(threshold=0.3)``.
    """
    return SPN.LearnStructure(
        data,
        alpha=alpha,
        featureTypes=featureTypes,
        row_split_method=Splitting.KmeansRDCRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        domains=domains,
        families=families,
        min_instances_slice=min_instances_slice)
    def estimate_density(self, training_data, validation_data=None):
        """Learn an MSPN from ``training_data`` and store it in ``self.spn``.

        ``validation_data`` is accepted but never used.

        Keys read from ``self.learner_args``:
          * ``'seed'`` (required) -- forwarded as ``rand_seed``.
          * ``'row_split'`` (optional) -- ``'gower'`` or ``'rdc-kmeans'``;
            any other value raises ``NotImplementedError``. When the key is
            absent the ``KmeansRDCRows`` splitter is used.
          * ``'leaf'`` (optional) -- only feeds the per-feature family list,
            which is replaced by ``None`` before learning.
        Every other key is forwarded unchanged to ``SPN.LearnStructure``.
        """
        learner_args = self.learner_args

        feature_types, feature_names, families = [], [], []
        for feat, str_type in training_data.features:
            feature_types.append(str_type)
            feature_names.append(feat.symbol_name())
            if 'leaf' in learner_args:
                family = learner_args['leaf']
            else:
                family = MSPNLearner.SPN_feat_fams[feat.symbol_type()]
            families.append(family)

        if 'row_split' not in learner_args:
            row_split_method = Splitting.KmeansRDCRows(n_clusters=2,
                                                       k=20,
                                                       OHE=1)
        elif learner_args['row_split'] == 'gower':
            row_split_method = Splitting.Gower(n_clusters=2)
        elif learner_args['row_split'] == 'rdc-kmeans':
            row_split_method = Splitting.KmeansRDCRows(n_clusters=2,
                                                       k=20,
                                                       OHE=1)
        else:
            raise NotImplementedError()

        col_split_method = Splitting.RDCTest(threshold=0.1, OHE=1, linear=1)

        rand_seed = learner_args['seed']
        # Keys consumed here must not be forwarded a second time.
        consumed = ('seed', 'leaf', 'row_split')
        mspnargs = {
            key: value
            for key, value in learner_args.items()
            if key not in consumed
        }

        # let MSPNs sort this out
        families = None
        self.spn = SPN.LearnStructure(asarray(training_data.data),
                                      feature_types,
                                      families=families,
                                      featureNames=feature_names,
                                      rand_seed=rand_seed,
                                      row_split_method=row_split_method,
                                      col_split_method=col_split_method,
                                      **mspnargs)
Example #9
0
def test2(data, features):
    """Print the archetype mixture of ``data`` and then halt.

    ``getArchetypes(data, 3)`` is called and the second element of its result
    is printed. The bare ``0 / 0`` that follows raises ``ZeroDivisionError``,
    so the SPN learning call below it is never reached.
    """
    _archetypes, mixture = getArchetypes(data, 3)

    print(mixture)

    # Deliberate hard stop: this expression raises ZeroDivisionError.
    0 / 0

    column_types = ["continuous"] * mixture.shape[1]
    spn = SPN.LearnStructure(
        mixture,
        featureTypes=column_types,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.RDCTest(threshold=0.3),
        min_instances_slice=100)
Example #10
0
def learn(data,
          featureTypes,
          families,
          domains,
          feature_names,
          min_instances_slice,
          prior_weight=0.0):
    """Learn and return an SPN on ``data``.

    ``feature_names`` is forwarded as ``featureNames``; every other argument
    is forwarded to ``SPN.LearnStructure`` under its own name. The row
    splitter is ``Splitting.KmeansRDCRows()`` and the column splitter is
    ``Splitting.RDCTest(threshold=0.3)``.
    """
    row_splitter = Splitting.KmeansRDCRows()
    col_splitter = Splitting.RDCTest(threshold=0.3)
    spn = SPN.LearnStructure(
        data,
        prior_weight=prior_weight,
        featureTypes=featureTypes,
        row_split_method=row_splitter,
        col_split_method=col_splitter,
        domains=domains,
        families=families,
        featureNames=feature_names,
        min_instances_slice=min_instances_slice)
    return spn
Example #11
0
def test8():
    """Learn an SPN on MNIST pixels plus label and print its MAP accuracy.

    The label is appended as the last column of the training matrix. At test
    time the last column is overwritten with each class value in turn,
    ``spn.root.eval`` is stored per class, and the class with the highest
    value per row is taken as the prediction.
    """
    from tensorflow.examples.tutorials.mnist import input_data
    mnist = input_data.read_data_sets("MNIST_data/", one_hot=False)
    images, labels = mnist.train.images, mnist.train.labels

    pixel_count = images.shape[1]
    featureTypes = ["continuous"] * pixel_count + ["categorical"]
    featureNames = ["pixel"] * pixel_count + ["label"]

    samples = numpy.hstack((images, labels.reshape(images.shape[0], 1)))
    print(featureTypes)
    print(samples.shape)

    spn = SPN.LearnStructure(
        samples,
        featureTypes=featureTypes,
        featureNames=featureNames,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.RDCTest(threshold=0.4),
        min_instances_slice=500,
        cluster_first=True)

    print("learned")

    spn.root.validate()

    images, labels = mnist.test.images, mnist.test.labels
    samples = numpy.hstack((images, labels.reshape(images.shape[0], 1)))

    classes = numpy.unique(labels)
    results = numpy.zeros((samples.shape[0], len(classes)))

    print("testing")
    for label in classes:
        # Score every test row as if it carried this label.
        samples[:, -1] = label
        results[:, label] = spn.root.eval(samples)

    print("done")

    predictions = numpy.argmax(results, axis=1)

    print('MAP accuracy : ', accuracy_score(labels, predictions))
Example #12
0
    # plt.hist(test[:,0], bins=100, histtype='stepfilled', normed=True, color='r', alpha=0.5, label='Uniform')
    #
    # plt.show()

    # print(domains)
    print(feature_names)
    print(feature_types)
    print(train.shape)

    # spn = SPN.LearnStructure(train, featureNames=feature_names, domains=domains,  featureTypes=feature_types, row_split_method=Splitting.RandomPartitionConditioningRows(), col_split_method=Splitting.RDCTestOHEpy(threshold=0.75),
    # spn = SPN.LearnStructure(train, featureNames=feature_names, domains=domains,  featureTypes=feature_types, row_split_method=Splitting.DBScanOHE(eps=1.0, min_samples=2), col_split_method=Splitting.RDCTestOHEpy(threshold=0.75),
    # spn = SPN.LearnStructure(train, featureNames=feature_names,
    # domains=domains,  featureTypes=feature_types,
    # row_split_method=Splitting.KmeansOHERows(),
    # col_split_method=Splitting.RDCTest(threshold=0.75),
    spn = SPN.LearnStructure(train, featureNames=feature_names, domains=domains,  featureTypes=feature_types, row_split_method=Splitting.Gower(), col_split_method=Splitting.RDCTest(threshold=0.05),
                             min_instances_slice=20,  cluster_first=True)

    print(spn)

    result.append([dsname, numpy.mean(spn.root.eval(train)), numpy.mean(
        spn.root.eval(valid)), numpy.mean(spn.root.eval(test))])
    print("train", numpy.mean(spn.root.eval(train)))
    print("valid", numpy.mean(spn.root.eval(valid)))
    print("test", numpy.mean(spn.root.eval(test)))

    print("train", numpy.min(spn.root.eval(train)))
    print("valid", numpy.min(spn.root.eval(valid)))
    print("test", numpy.min(spn.root.eval(test)))

print(result)
# Marginalize `pspn` with the index list [0, 1, 2, 3] and print both
# representations of the result.
marg = pspn.marginalize([0, 1, 2, 3])

print(marg.toEquation())
print(marg)

# Deliberate hard stop: this expression raises ZeroDivisionError, so nothing
# below runs while it is in place.
0 / 0

# NOTE(review): `learn` is defined outside this view -- confirm that it
# accepts the row_split_method / col_split_method keywords passed here.
mspn = learn(train,
             featureTypes=["discrete"] * data.shape[1],
             families=["isotonic"] * data.shape[1],
             domains=domains,
             feature_names=words,
             min_instances_slice=200,
             row_split_method=Splitting.KmeansRDCRows(),
             col_split_method=Splitting.RDCTest(threshold=0.1, OHE=False))

#print(pspn)
# print(mspn)

# Compare both models on `test`: sum and mean of root.eval.
print("sum LL pspn", numpy.sum(pspn.root.eval(test)))
print("sum LL mspn", numpy.sum(mspn.root.eval(test)))
print("mean LL pspn", numpy.mean(pspn.root.eval(test)))
print("mean LL mspn", numpy.mean(mspn.root.eval(test)))

# Second deliberate hard stop (ZeroDivisionError).
0 / 0


def getmiforfeature(input):
    spn, i, j = input
    # return i+j
Example #14
0
def test3():
    """Compare empirical frequencies with SPN marginals on the Adult data.

    The values "Doctorate" (looked up in ``doms[2]``) and "State-gov"
    (looked up in ``doms[1]``) are located, their empirical frequencies in
    columns 2 and 1 of ``data`` are computed, and the same quantities are
    then obtained from two marginalized copies of a learned SPN. All results
    are printed; nothing is returned.
    """
    numpy.random.seed(42)
    dsname, data, featureNames, featureTypes, doms = getAdult()

    # Positions of the two values of interest inside their domain arrays.
    doctorateVal = numpy.where(doms[2] == "Doctorate")[0][0]
    stategovVal = numpy.where(doms[1] == "State-gov")[0][0]

    print(featureNames)

    print(len(featureNames))

    print(data[0, :])
    print(data.shape)
    print(doctorateVal, stategovVal)

    # Empirical frequencies: pD (column 2 matches), pSD (both columns match)
    # and pS (column 1 matches), each divided by the number of rows.
    pD = numpy.sum(data[:, 2] == doctorateVal) / data.shape[0]
    pSD = numpy.sum(
        numpy.logical_and(data[:, 2] == doctorateVal, data[:, 1]
                          == stategovVal)) / data.shape[0]
    pS = numpy.sum(data[:, 1] == stategovVal) / data.shape[0]

    print("pD", pD)
    print("pSD", pSD)
    pS_D = pSD / pD
    print("pS_D", pS_D)

    # spn = SPN.LearnStructure(data, featureTypes=featureTypes, featureNames=featureNames, row_split_method=Splitting.KmeansRows(), col_split_method=Splitting.IndependenceTest(alpha=0.01),
    # spn = SPN.LearnStructure(data, featureTypes=featureTypes,
    # featureNames=featureNames, row_split_method=Splitting.KmeansRows(),
    # col_split_method=Splitting.RDCTest(),
    spn = SPN.LearnStructure(data,
                             featureTypes=featureTypes,
                             featureNames=featureNames,
                             row_split_method=Splitting.KmeansRows(),
                             col_split_method=Splitting.RDCTest(threshold=0.3),
                             min_instances_slice=3,
                             cluster_first=True)

    spn.root.validate()

    print("SPN Learned")
    # The index list leaves out 1 and 2, the two columns queried below.
    # NOTE(review): index 9 appears twice in this list -- looks like a typo;
    # confirm that marginalizeOut tolerates duplicates.
    margSPN_SD = spn.root.marginalizeOut(
        [0, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13])
    margSPN_SD.Prune()

    print(margSPN_SD)

    # Single query row: zeros everywhere except columns 1 and 2.
    dataSD = numpy.zeros_like(data[0, :]).reshape(1, data.shape[1])
    dataSD[0, 1] = stategovVal
    dataSD[0, 2] = doctorateVal
    print(dataSD)

    # eval output is exponentiated before use.
    spnSD = (numpy.exp(margSPN_SD.eval(dataSD)))

    # Same procedure, this time leaving out only index 2.
    # NOTE(review): index 9 is duplicated here as well.
    margSPN_D = spn.root.marginalizeOut(
        [0, 1, 3, 4, 5, 6, 7, 8, 9, 9, 10, 11, 12, 13])
    margSPN_D.Prune()

    print(margSPN_D)

    dataD = numpy.zeros_like(data[0, :]).reshape(1, data.shape[1])
    dataD[0, 2] = doctorateVal
    print(dataD)

    spnD = (numpy.exp(margSPN_D.eval(dataD)))

    # Empirical values are printed again next to the SPN-based ones.
    print("pD", pD)
    print("pS", pS)
    print("pSD", pSD)
    pS_D = pSD / pD
    print("pS_D", pS_D)

    print("spn pD", spnD)
    print("spn pSD", spnSD)
    spnS_D = spnSD / spnD
    print("spn pS_D", spnS_D)

    print("doctorateVal", doctorateVal)
    print("stategovVal", stategovVal)

    ll = spn.root.eval(data)

    # print("Probs", numpy.exp(ll))
    print("Sum LL", numpy.sum(ll))
Example #15
0
def test7():
    """Learn an SPN on two columns of ``bank.csv`` and plot its leaves.

    The file is read from the working directory as byte strings with ``;``
    as delimiter. Only columns 0 and 5 are kept. Each kept column is typed
    by attempting numeric conversion: "discrete" when all unique values are
    whole numbers, "continuous" when they are floats, and "categorical"
    (encoded by the inverse index from ``numpy.unique``) when conversion
    raises ``ValueError``. Column 1 of the resulting matrix is replaced by
    ``sign(x) * log(|x| + 1)`` before learning.

    Side effects: prints diagnostics, writes ``bank.pdf`` through
    ``spn.save_pdf_graph`` and opens matplotlib figures. Returns nothing.
    """
    numpy.random.seed(42)

    D = numpy.loadtxt("bank.csv", delimiter=";", skiprows=0, dtype="S")
    D = numpy.char.strip(D)

    # NOTE(review): D holds byte strings, so on Python 3 str() keeps the
    # b'...' wrapper in these names -- confirm whether decoding is wanted.
    featureNames = [str(f) for f in D[0, :]]
    D = D[1:, :]
    # Only the length of this list is used (printed below).
    featureTypes = [
        "discrete",
        "categorical",
        "categorical",
        "categorical",
        "continuous",
        "continuous",
        "categorical",
        "categorical",
        "categorical",
        "discrete",
        "categorical",
        "discrete",
        "categorical",
        "continuous",
        "discrete",
        "categorical",
        "categorical",
    ]
    print(len(featureTypes))
    print(len(featureNames))

    def isinteger(x):
        return numpy.all(numpy.equal(numpy.mod(x, 1), 0))

    cols = []
    types = []
    domains = []

    index = [0, 5]

    D = D[:, index]

    for col in range(D.shape[1]):
        uniques, inverse = numpy.unique(D[:, col], return_inverse=True)

        # Convert first and append afterwards, so that a failed conversion
        # can never leave a partially appended column behind.
        try:
            if isinteger(uniques.astype(float)):
                column = D[:, col].astype(int)
                kind = "discrete"
                domain = uniques.astype(int)
            else:
                column = D[:, col].astype(float)
                kind = "continuous"
                domain = uniques.astype(float)
        except ValueError:
            # Not numeric: fall back to the integer codes from numpy.unique.
            column = inverse
            kind = "categorical"
            domain = uniques.astype(str)

        cols.append(column)
        types.append(kind)
        domains.append(domain)

    data = numpy.column_stack(cols)
    print(featureNames)

    print(domains)
    featureNames = [featureNames[i] for i in index]
    print(featureNames)
    print(types)

    # Signed log transform of the second kept column.
    data[:, 1] = numpy.sign(data[:, 1]) * numpy.log(numpy.abs(data[:, 1]) + 1)

    spn = SPN.LearnStructure(
        data,
        featureTypes=types,
        featureNames=featureNames,
        row_split_method=Splitting.KmeansRows(),
        col_split_method=Splitting.RDCTest(threshold=0.000001),
        min_instances_slice=1000,
        cluster_first=False)

    spn.root.validate()

    print(spn)

    spn.save_pdf_graph("bank.pdf")

    ll = spn.root.eval(data)

    import matplotlib.pyplot as plt

    for i in [0, 1]:

        x = numpy.sort(data[:, i]).reshape(data.shape[0], 1)

        plt.figure(figsize=(8, 8))
        # Query matrix: sorted values in column i, zeros elsewhere.
        x1 = numpy.zeros_like(data)
        x1[:, i] = x[:, 0]

        color_idx = numpy.linspace(0, 1, len(spn.root.children))

        for cidx, child in enumerate(spn.root.children):

            y = numpy.exp(child.children[i].eval(x1))

            plt.plot(x, y, '--', color=plt.cm.cool(color_idx[cidx]))

    plt.show()

    print("Sum LL", numpy.sum(ll))
Example #16
0
def learn_spn(dataset="data/iris",
              precision=25,
              independence=0.1,
              header=0,
              date=None,
              isotonic=False,
              histogram=True,
              types=False):
    """Load a CSV file, derive feature metadata and learn an SPN on it.

    Args:
        dataset: Path of the comma-separated file; also stored as
            ``spn.name``.
        precision: Forwarded as ``min_instances_slice``.
        independence: Threshold given to ``Splitting.RDCTest``.
        header: Forwarded to ``pandas.read_csv``. When it is 0 the column
            names become the feature names, otherwise ``X_0, X_1, ...``.
        date: Forwarded to ``pandas.read_csv`` as ``parse_dates``.
        isotonic: Use the 'isotonic' family instead of 'piecewise'.
        histogram: Use the 'histogram' family for categorical features
            (takes precedence over ``isotonic`` for those features).
        types: When truthy, the second line of the file is read as
            per-column type tags ('cat', 'con', 'dis'; anything else becomes
            'unknown') and that line is skipped when loading the data.
            Otherwise the types are derived from the pandas dtypes
            (object -> categorical, float -> continuous, signed int ->
            discrete, anything else -> 'unknown').

    Returns:
        A ``(spn, data_dictionary)`` tuple. ``data_dictionary`` has the keys
        'features' (one dict per column with name, family, type,
        pandas_type and, for categorical columns, encoder and values) and
        'num_entries'.

    Rows containing any missing value are dropped. Categorical columns are
    label-encoded and datetime columns are converted to days since the
    column minimum before learning.
    """
    skiprows = [1] if types else []
    df = pd.read_csv(dataset,
                     delimiter=",",
                     header=header,
                     parse_dates=date,
                     skiprows=skiprows)
    df = df.dropna(axis=0, how='any')
    if header == 0:
        featureNames = df.columns.values.tolist()
    else:
        featureNames = ["X_{}".format(i) for i in range(len(df.columns))]

    dtypes = df.dtypes

    def family_for(feature_type):
        # Only categorical features may use the histogram family.
        if feature_type == 'categorical' and histogram:
            return 'histogram'
        return 'isotonic' if isotonic else 'piecewise'

    if types:
        tag_to_type = {
            'cat': 'categorical',
            'con': 'continuous',
            'dis': 'discrete',
        }
        with open(dataset, 'r') as csvfile:
            csvreader = csv.reader(csvfile, delimiter=',', quotechar='|')
            next(csvreader)  # first line: column names
            declared_tags = next(csvreader)  # second line: type tags
        featureTypes = []
        for tag in declared_tags:
            print(tag)
            featureTypes.append(tag_to_type.get(tag, 'unknown'))
    else:
        # dtype.kind is a one-character string, so compare against strings.
        kind_to_type = {
            'O': 'categorical',
            'f': 'continuous',
            'i': 'discrete',
        }
        featureTypes = [
            kind_to_type.get(dtype.kind, 'unknown') for dtype in dtypes
        ]

    families = [family_for(feature_type) for feature_type in featureTypes]

    data_dictionary = {
        'features': [
            {
                "name": name,
                "family": family,
                "type": typ,
                # .iloc: positional lookup; the dtypes Series is indexed by
                # column label, not by position.
                'pandas_type': dtypes.iloc[i]
            }
            for i, (name, family, typ) in enumerate(
                zip(featureNames, families, featureTypes))
        ],
        'num_entries': len(df)
    }

    for position, name in enumerate(df.columns):
        if featureTypes[position] == 'categorical':
            lb = LabelEncoder()
            data_dictionary['features'][position]["encoder"] = lb
            df[name] = df[name].astype('category')
            df[name] = lb.fit_transform(df[name])
            data_dictionary['features'][position]["values"] = lb.transform(
                lb.classes_)
        if dtypes.iloc[position].kind == 'M':
            # Datetime column -> number of days since its earliest value.
            df[name] = (df[name] - df[name].min()) / np.timedelta64(1, 'D')

    data = np.array(df)

    spn = SPN.LearnStructure(
        data,
        featureTypes=featureTypes,
        featureNames=featureNames,
        min_instances_slice=precision,
        families=families,
        row_split_method=Splitting.KmeansRDCRows(),
        col_split_method=Splitting.RDCTest(threshold=independence))

    spn.name = dataset
    return spn, data_dictionary
Example #17
0
from tfspn.SPN import SPN, Splitting

# Learn an SPN on the grayscale CIFAR-10 training images with the label
# appended as the last column, then run MPE evaluation on a few test rows
# whose label column is NaN.
dsname, train, test, labels_train, labels_test = getCIFAR10(grayscale=True)

data = numpy.vstack((train, test))

ds = numpy.hstack((train, labels_train))

# Per-column set of observed values.
domains = [numpy.unique(ds[:, i]) for i in range(ds.shape[1])]

spn = SPN.LearnStructure(ds,
                         prior_weight=0.0,
                         featureTypes=["gaussian"] * train.shape[1] +
                         ["discrete"],
                         row_split_method=Splitting.RandomPartitionRows(),
                         col_split_method=Splitting.RDCTest(threshold=0.3,
                                                            OHE=True),
                         domains=domains,
                         families=["gaussian"] * ds.shape[1],
                         min_instances_slice=5000000)

print("learned")

# numpy.hstack takes ONE sequence of arrays. The label column is filled with
# NaN explicitly instead of relying on a 0 / 0 division.
# NOTE(review): presumably NaN marks the entries mpe_eval should infer --
# confirm against the SPN implementation.
missing_labels = numpy.full_like(labels_test, numpy.nan, dtype=float)
ts = numpy.hstack((test, missing_labels))

ts = ts[0:10, :]

print(ts[0, :])

predicted_labels = spn.root.mpe_eval(ts)

print(predicted_labels[0, :])