Example #1
    def test_Histogram_discrete_inference(self):
        data = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
        ds_context = Context([MetaType.DISCRETE])
        ds_context.add_domains(data)
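        # alpha=False: no Laplace smoothing, so each probability is count / N (e.g. P(X=1) = 2/6)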
        hist = create_histogram_leaf(data, ds_context, [0], alpha=False)
        prob = np.exp(log_likelihood(hist, data))

        self.assertAlmostEqual(float(prob[0]), 2 / 6)
        self.assertAlmostEqual(float(prob[1]), 2 / 6)
        self.assertAlmostEqual(float(prob[2]), 1 / 6)
        self.assertAlmostEqual(float(prob[3]), 3 / 6)
        self.assertAlmostEqual(float(prob[4]), 3 / 6)
        self.assertAlmostEqual(float(prob[5]), 3 / 6)

        data = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
        ds_context = Context([MetaType.DISCRETE])
        ds_context.add_domains(data)
        hist = create_histogram_leaf(data, ds_context, [0], alpha=True)
        # alpha=True applies Laplace smoothing: (count + 1) / (N + number of bins), here N=6 with 3 bins
        prob = np.exp(log_likelihood(hist, data))
        self.assertAlmostEqual(float(prob[0]), 3 / 9)
        self.assertAlmostEqual(float(prob[1]), 3 / 9)
        self.assertAlmostEqual(float(prob[2]), 2 / 9)
        self.assertAlmostEqual(float(prob[3]), 4 / 9)
        self.assertAlmostEqual(float(prob[4]), 4 / 9)
        self.assertAlmostEqual(float(prob[5]), 4 / 9)
Example #2
def get_ds_context_sum(curr_train_data, scope, index, scope_index, params):
    """
    returns the Context object of spflow to use with split_rows method while creating sum node for spmn

    """
    n = curr_train_data.shape[1]
    curr_var_set_sum = params.partial_order[index:len(params.partial_order) +
                                            1]
    curr_var_set_sum1 = [
        var for curr_var_set in curr_var_set_sum for var in curr_var_set
    ]

    if params.util_to_bin:
        context = [Categorical] * n
        ds_context = Context(
            parametric_types=context,
            scope=scope,
            feature_names=curr_var_set_sum1).add_domains(curr_train_data)

    # utilty is meta type -- real
    else:

        if params.utility_node[0] in curr_var_set_sum1:
            context = [MetaType.DISCRETE] * (n - 1)
            context.append(MetaType.REAL)
        else:
            context = [MetaType.DISCRETE] * (n)
        scope = scope
        ds_context = Context(
            meta_types=context, scope=scope,
            feature_names=curr_var_set_sum1).add_domains(curr_train_data)

    return ds_context
Example #3
def get_ds_context_prod(curr_train_data, scope, index, scope_index, params):
    """
    returns the Context object of spflow to use with split_cols, learn_mspn or learn_parametric methods of spflow while creating product node for spmn
    """
    n = curr_train_data.shape[1]
    scope_var = params.feature_names[scope_index:scope_index + n]
    context = []

    # if parametric, all variables are parametric type -- categorical
    if params.util_to_bin:
        context = [Categorical] * n
        ds_context = Context(
            parametric_types=context, scope=scope,
            feature_names=scope_var).add_domains(curr_train_data)

    # if mixed, the utility variable is meta type -- real
    else:
        if params.utility_node[0] in scope_var:
            context = [MetaType.DISCRETE] * (n - 1)
            context.append(MetaType.REAL)
        else:
            context = [MetaType.DISCRETE] * n

        ds_context = Context(
            meta_types=context, scope=scope,
            feature_names=scope_var).add_domains(curr_train_data)
    return ds_context
Example #4
    def run(self, run: int, n_folds: int, fold_log: bool):
        base_path = "../../../data/continuous/" + self.data_name + "/10_folds/"
        train_datasets = []
        test_datasets = []
        ds_contexts = []

        # Prepare folds' data
        for i in range(1, 11):
            train_data_path = base_path + self.data_name + "_" + str(i) + "_train.arff"
            test_data_path = base_path + self.data_name + "_" + str(i) + "_test.arff"

            # Load data
            train_data = arff.loadarff(train_data_path)
            train_data = pd.DataFrame(train_data[0])
            train_data = train_data.values
            train_datasets.append(train_data)

            test_data = arff.loadarff(test_data_path)
            test_data = pd.DataFrame(test_data[0])
            test_data = test_data.values
            test_datasets.append(test_data)

            # Create context for MSPN algorithm
            ds_context = Context(self.meta_types)
            ds_contexts.append(ds_context)

        # Apply KDE
        results_path = "../../../results/run_" + str(run) + "/continuous/" + self.data_name + "/" + str(n_folds) + "_folds/KDE/"
        KDE.apply(train_datasets, self.var_types_string, test_datasets, n_folds, results_path, self.data_name, fold_log)
Example #5
def train(args):
    print('Training...')
    for i in range(len(args.spk_list)):
        spn_path = args.MODEL_DIR + '/' + args.spk_list[i]['spk_id'] + '.p'
        if not os.path.isfile(spn_path):
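            # dump an empty placeholder now so the isfile() check above skips this speaker on reruns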
            with open(spn_path, 'wb') as f:
                pickle.dump([], f)
            print(chr(27) + "[2J")  # ANSI escape sequence: clear the terminal
            print(
                "Learn structure, spk: %i (%s)... (min_instances_slice: %i, threshold: %1.3f)."
                % (i, args.spk_list[i]['spk_id'], args.min_instances_slice,
                   args.threshold))
            train_batch = featpy.lsse(
                args.spk_list[i]['train_clean_speech'],
                args.spk_list[i]['train_clean_speech_len'], args.Nw, args.Ns,
                args.NFFT, args.fs, args.H)
            print("Features extracted.")
            ds_context = Context(parametric_types=[Gaussian] *
                                 args.M).add_domains(train_batch)
            with silence():
                spn_spk = learn_parametric(
                    train_batch,
                    ds_context,
                    min_instances_slice=args.min_instances_slice,
                    threshold=args.threshold,
                    cpus=args.ncores)
            with open(spn_path, 'wb') as f:
                pickle.dump(spn_spk, f)
Example #6
def learn_parametric_spn(data, parametric_types):
    
    from spn.algorithms.LearningWrappers import learn_parametric
    ds_context = Context(parametric_types=parametric_types).add_domains(data)
    spn = learn_parametric(data, ds_context, min_instances_slice=100, threshold=0.01)
    return spn
Example #7
    def test_Histogram_expectations(self):
        data = np.random.randn(20000).reshape(-1, 1)
        ds_context = Context(meta_types=[MetaType.REAL])
        ds_context.add_domains(data)
        hl = create_histogram_leaf(data, ds_context, scope=[0])
        expectation = Expectation(hl, set([0]))

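        # the histogram's expectation should match the empirical mean (to 3 decimal places)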
        self.assertAlmostEqual(np.mean(data[:, 0]), expectation[0, 0], 3)

        data = np.random.randint(0, high=100, size=20000).reshape(-1, 1)
        ds_context = Context(meta_types=[MetaType.DISCRETE])
        ds_context.add_domains(data)
        hl = create_histogram_leaf(data, ds_context, scope=[0])
        expectation = Expectation(hl, set([0]))

        self.assertAlmostEqual(np.mean(data[:, 0]), expectation[0, 0], 3)
Example #8
    def fit(self, X, y=None):
        y = y.reshape(y.shape[0], -1)
        self.num_labels = y.shape[1]
        self.context = Context(parametric_types=[Bernoulli] *
                               self.num_labels).add_domains(y)
        self.context.feature_size = X.shape[1]
        self.scope = list(range(y.shape[1]))
        data = concatenate_yx(y, X)

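        # cspn_type 0: a single conditional leaf; 1: a sum over KMeans row clusters with conditional leaves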
        cspn_type = 1
        if cspn_type == 0:
            self.cspn = create_conditional_leaf(data, self.context, self.scope)
        elif cspn_type == 1:
            split_rows = get_split_conditional_rows_KMeans()
            self.cspn, subtasks = create_sum(data=data,
                                             node_id=0,
                                             parent_id=0,
                                             pos=0,
                                             context=self.context,
                                             scope=self.scope,
                                             split_rows=split_rows)
            for i, subtask in enumerate(subtasks):
                self.cspn.children[i] = create_conditional_leaf(
                    subtask[1]['data'], self.context, subtask[1]['scope'])
            print(self.cspn)
Example #9
    def test_leaf_mpe_bernoulli(self):
        np.random.seed(17)
        x = np.concatenate(
            (
                np.random.multivariate_normal([10, 10], np.eye(2), 5000),
                np.random.multivariate_normal([1, 1], np.eye(2), 5000),
            ),
            axis=0,
        )
        y = np.array([0] * 5000 + [1] * 5000).reshape(-1, 1)

        # associates y=0 with X=[10,10]
        # associates y=1 with X=[1,1]

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Bernoulli])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])

        res = mpe(leaf, np.array([np.nan, 10, 10]).reshape(-1, 3))
        self.assertAlmostEqual(res[0, 0], 0)

        res = mpe(leaf, np.array([np.nan, 1, 1]).reshape(-1, 3))
        self.assertAlmostEqual(res[0, 0], 1)

        res = mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10]).reshape(-1, 3))
        self.assertAlmostEqual(res[0, 0], 1)
        self.assertAlmostEqual(res[1, 0], 0)

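        # the third row's label is 5 rather than nan, so there is no query variable and mpe must raise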
        with self.assertRaises(AssertionError):
            mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10, 10]).reshape(-1, 3))
Example #10
    def test_leaf_mpe_gaussian(self):
        np.random.seed(17)
        x = np.concatenate(
            (
                np.random.multivariate_normal([10, 10], np.eye(2), 5000),
                np.random.multivariate_normal([1, 1], np.eye(2), 5000),
            ),
            axis=0,
        )
        y = np.array(np.random.normal(20, 2, 5000).tolist() + np.random.normal(60, 2, 5000).tolist()).reshape(-1, 1)

        # associates y=20 with X=[10,10]
        # associates y=60 with X=[1,1]

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Gaussian])
        ds_context.feature_size = 2

        # uses a parametric Gaussian leaf here instead of create_conditional_leaf
        leaf = create_parametric_leaf(data, ds_context, [0])

        res = mpe(leaf, np.array([np.nan, 10, 10]).reshape(-1, 3))
        self.assertAlmostEqual(res[0, 0], 20.435226001909466)

        res = mpe(leaf, np.array([np.nan, 1, 1]).reshape(-1, 3))
        self.assertAlmostEqual(res[0, 0], 59.4752193542575)

        res = mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10]).reshape(-1, 3))
        self.assertAlmostEqual(res[0, 0], 59.4752193542575)
        self.assertAlmostEqual(res[1, 0], 20.435226001909466)

        with self.assertRaises(AssertionError):
            mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10, 10]).reshape(-1, 3))
Example #11
    def fit(self, X, y=None):
        self.context = Context(
            parametric_types=self.parametric_types).add_domains(y)
        self.context.feature_size = X.shape[1]
        self.num_labels = y.shape[1]

        def label_conditional(y, x):
            from sklearn.cluster import KMeans

            # note: precompute_distances was removed in scikit-learn 1.0
            clusters = KMeans(n_clusters=2,
                              random_state=17).fit_predict(x)
            return clusters

        self.cspn = learn_cspn_structure(
            concatenate_yx(y, X),
            self.context,
            split_rows=get_split_rows_conditional_Gower(),
            # split_rows=get_split_rows_KMeans(),
            # split_cols=get_split_cols_RDC_py(),
            split_cols=getCIGroup(alpha=self.alpha),
            create_leaf=create_conditional_leaf,
            label_conditional=label_conditional,
            **self.kwargs)

        return self
Example #12
    def test_sample_range(self):

        np.random.seed(10)
        data = np.random.normal(20, scale=5, size=1000).reshape((1000, 1))
        numpy_data = np.array(data, np.float64)
        meta_types = [MetaType.REAL]
        domains = [[np.min(numpy_data[:, 0]), np.max(numpy_data[:, 0])]]
        ds_context = Context(meta_types=meta_types, domains=domains)
        rand_gen = np.random.RandomState(100)
        pwl = create_piecewise_leaf(data,
                                    ds_context,
                                    scope=[0],
                                    prior_weight=None)

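        # a degenerate range [[20]] restricts sampling to the single point 20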
        rang = [NumericRange([[20]])]
        ranges = np.array(rang)
        samples = SamplingRange.sample_piecewise_node(pwl, 10, rand_gen,
                                                      ranges)
        self.assertEqual(len(samples), 10)
        self.assertAlmostEqual(np.average(samples), 20)

        rang = [NumericRange([[20, 100]])]
        ranges = np.array(rang)
        samples = SamplingRange.sample_piecewise_node(pwl, 10, rand_gen,
                                                      ranges)
        self.assertTrue(np.all(samples > 20))
        self.assertTrue(np.all(samples < 100))

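        # a union of two disjoint intervals: samples must avoid the gap (13, 20) and anything below 10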
        rang = [NumericRange([[10, 13], [20, 100]])]
        ranges = np.array(rang)
        samples = SamplingRange.sample_piecewise_node(pwl, 10, rand_gen,
                                                      ranges)
        self.assertFalse(np.any((samples > 13) & (samples < 20)))
        self.assertFalse(np.any(samples < 10))
Example #13
    def test_conditional_probability(self):
        # test if conditional probability is correct
        # same spn as in the entropy test
        # train_data is only used to generate the ds_context
        train_data = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [2.0, 0.0, 1.0]])
        # spn
        ds_context = Context(meta_types=[MetaType.DISCRETE] * 3)
        ds_context.add_domains(train_data)
        ds_context.parametric_types = [Categorical] * 3
        spn = 0.64 * (
            (
                Categorical(p=[0.25, 0.75, 0.0], scope=0)
                * (
                    0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                    + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
                )
            )
        ) + 0.36 * (
            (
                Categorical(p=[0.0, 0.0, 1.0], scope=0)
                * (
                    0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                    + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
                )
            )
        )
        # tests
        x_instance = np.array([1, 1, 0], dtype=float).reshape(1, -1)
        self.assertAlmostEqual(conditional_probability(spn, 2, x_instance)[0][0], 0.9)
        self.assertAlmostEqual(conditional_probability(spn, 0, x_instance)[0][0], 0.48)
        x_instance = np.array([2, 1, 0], dtype=float).reshape(1, -1)
        self.assertAlmostEqual(conditional_probability(spn, 0, x_instance)[0][0], 0.36)
Example #14
    def test_histogram_samples(self):
        import numpy as np
        from numpy.random.mtrand import RandomState
        from spn.algorithms.Sampling import sample_instances
        from spn.structure.Base import Context
        from spn.structure.StatisticalTypes import MetaType
        from spn.algorithms.LearningWrappers import learn_mspn

        np.random.seed(123)
        a = np.random.randint(2, size=10000).reshape(-1, 1)
        b = np.random.randint(3, size=10000).reshape(-1, 1)
        c = np.r_[np.random.normal(10, 5, (3000, 1)),
                  np.random.normal(20, 10, (7000, 1))]
        d = 5 * a + 3 * b + c
        train_data = np.c_[a, b, c, d]

        ds_context = Context(meta_types=[
            MetaType.DISCRETE, MetaType.DISCRETE, MetaType.REAL, MetaType.REAL
        ]).add_domains(train_data)
        mspn = learn_mspn(train_data, ds_context, min_instances_slice=200)

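        # every entry is nan, so all four variables are sampled unconditionally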
        samples = sample_instances(
            mspn,
            np.array([np.nan, np.nan, np.nan, np.nan] * 100).reshape(-1, 4),
            RandomState(123))
        print(np.max(samples, axis=0), np.min(samples, axis=0))
        print(ds_context.domains)
Example #15
    def test_leaf_bernoulli_bootstrap(self):
        np.random.seed(17)
        x = np.concatenate(
            (
                np.random.multivariate_normal([10, 10], np.eye(2), 100),
                np.random.multivariate_normal([1, 1], np.eye(2), 100),
            ),
            axis=0,
        )
        y = np.array([1] * 100 + [0] * 100).reshape(-1, 1)

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Bernoulli])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])

        l = likelihood(leaf, data)
        neg_data = np.concatenate([1 - y, x], axis=1)
        lneg = likelihood(leaf, neg_data)

        np.testing.assert_array_almost_equal(l + lneg, 1.0)

        self.assertTrue(np.all(l >= 0.5))
        self.assertTrue(np.all(lneg < 0.5))
Example #16
    def test_leaf_categorical(self):
        np.random.seed(17)
        x = np.concatenate(
            (
                np.random.multivariate_normal([20, 20], np.eye(2), 500),
                np.random.multivariate_normal([10, 10], np.eye(2), 500),
                np.random.multivariate_normal([1, 1], np.eye(2), 500),
            ),
            axis=0,
        )
        y = np.array([2] * 500 + [1] * 500 + [0] * 500).reshape(-1, 1)
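        # rows 0:500 pair class 2 with x near [20, 20], 500:1000 class 1 with [10, 10], 1000:1500 class 0 with [1, 1]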

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Categorical])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])

        l0 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 0, x))
        l1 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 1, x))
        l2 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 2, x))

        np.testing.assert_array_almost_equal(l0 + l1 + l2, 1.0)

        self.assertTrue(np.all(l0[1000:1500] > 0.85))
        self.assertTrue(np.all(l0[0:1000] < 0.15))

        self.assertTrue(np.all(l1[500:1000] > 0.85))
        self.assertTrue(np.all(l1[0:500] < 0.15))
        self.assertTrue(np.all(l1[1000:1500] < 0.15))

        self.assertTrue(np.all(l2[0:500] > 0.85))
        self.assertTrue(np.all(l2[500:1500] < 0.15))
Example #17
    def test_leaf_gaussian(self):
        np.random.seed(17)
        x = np.concatenate(
            (
                np.random.multivariate_normal([10, 10], np.eye(2), 5000),
                np.random.multivariate_normal([1, 1], np.eye(2), 5000),
            ),
            axis=0,
        )
        y = np.array(
            np.random.normal(20, 2, 5000).tolist() +
            np.random.normal(60, 2, 5000).tolist()).reshape(-1, 1)

        # associates y=20 with X=[10,10]
        # associates y=60 with X=[1,1]

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Gaussian])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])

        self.assertFalse(np.any(np.isnan(likelihood(leaf, data))))

        self.assertGreater(get_ll(leaf, [20, 10, 10]),
                           get_ll(leaf, [20, 1, 1]))
        self.assertGreater(get_ll(leaf, [60, 1, 1]),
                           get_ll(leaf, [60, 10, 10]))
        self.assertAlmostEqual(get_ll(leaf, [60, 1, 1]), 0.3476232862652)
        self.assertAlmostEqual(get_ll(leaf, [20, 10, 10]), 0.3628922322773634)
Example #18
    def test_leaf_no_variance_gaussian(self):
        np.random.seed(17)
        x = np.concatenate(
            (
                np.random.multivariate_normal([10, 10], np.eye(2), 500),
                np.random.multivariate_normal([1, 1], np.eye(2), 500),
            ),
            axis=0,
        )
        y = np.array([1] * 1000).reshape(-1, 1)

        data = concatenate_yx(y, x)

        ds_context = Context(parametric_types=[Gaussian])
        ds_context.feature_size = 2

        leaf = create_conditional_leaf(data, ds_context, [0])
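        # y is constant; the asserted density 0.398942 = 1/sqrt(2*pi) suggests a unit-variance fallback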
        l = likelihood(leaf, data)
        self.assertEqual(np.var(l[:, 0]), 0)
        self.assertAlmostEqual(l[0, 0], 0.398942280401432)

        data[:, 0] = 2
        leaf = create_conditional_leaf(data, ds_context, [0])
        l = likelihood(leaf, data)
        self.assertEqual(np.var(l[:, 0]), 0)
        self.assertAlmostEqual(l[0, 0], 0.398942280401432)

        data3 = np.array(data)
        data3[:, 0] = 3
        leaf = create_conditional_leaf(data3, ds_context, [0])
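        # note: evaluated on the original data (y=2), one unit below the fitted mean of 3, hence the smaller density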
        l = likelihood(leaf, data)
        self.assertAlmostEqual(np.var(l[:, 0]), 0)
        self.assertAlmostEqual(l[0, 0], 0.241970724519143)
Example #19
def learn_MSPN():
    import numpy as np

    np.random.seed(123)

    a = np.random.randint(2, size=1000).reshape(-1, 1)
    b = np.random.randint(3, size=1000).reshape(-1, 1)
    c = np.r_[np.random.normal(10, 5, (300, 1)),
              np.random.normal(20, 10, (700, 1))]
    d = 5 * a + 3 * b + c
    train_data = np.c_[a, b, c, d]

    from spn.structure.Base import Context
    from spn.structure.StatisticalTypes import MetaType

    ds_context = Context(meta_types=[
        MetaType.DISCRETE, MetaType.DISCRETE, MetaType.REAL, MetaType.REAL
    ]).add_domains(train_data)

    from spn.algorithms.LearningWrappers import learn_mspn

    mspn = learn_mspn(train_data, ds_context, min_instances_slice=20)

    from spn.algorithms.Statistics import get_structure_stats

    print(get_structure_stats(mspn))
Example #20
def learn_PSPN():
    import numpy as np

    np.random.seed(123)

    a = np.random.randint(2, size=1000).reshape(-1, 1)
    b = np.random.randint(3, size=1000).reshape(-1, 1)
    c = np.r_[np.random.normal(10, 5, (300, 1)),
              np.random.normal(20, 10, (700, 1))]
    d = 5 * a + 3 * b + c
    train_data = np.c_[a, b, c, d]

    from spn.structure.Base import Context
    from spn.structure.leaves.parametric.Parametric import Categorical, Gaussian

    ds_context = Context(
        parametric_types=[Categorical, Categorical, Gaussian, Gaussian
                          ]).add_domains(train_data)

    from spn.algorithms.LearningWrappers import learn_parametric

    spn = learn_parametric(train_data, ds_context, min_instances_slice=20)

    from spn.algorithms.Statistics import get_structure_stats

    print(get_structure_stats(spn))
Example #21
def classification():
    import numpy as np
    np.random.seed(123)
    train_data = np.c_[np.r_[np.random.normal(5, 1, (500, 2)), np.random.normal(10, 1, (500, 2))],
                       np.r_[np.zeros((500, 1)), np.ones((500, 1))]]

    centers = [[5, 5], [10, 10]]

    import matplotlib.pyplot as plt
    colors = ['#bda36b', '#7aaab4']
    plt.figure()
    for k, col in zip(range(2), colors):
        my_members = train_data[:, 2] == k
        plt.plot(train_data[my_members, 0], train_data[my_members, 1], 'w', markerfacecolor=col, marker='.')
        plt.plot(centers[k][0], centers[k][1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=6)
    plt.title('Training Data')
    plt.grid(True)
    plt.savefig("classification_training_data.png", bbox_inches='tight', pad_inches=0)

    from spn.algorithms.LearningWrappers import learn_parametric, learn_classifier
    from spn.structure.leaves.parametric.Parametric import Categorical, Gaussian
    from spn.structure.Base import Context
    spn_classification = learn_classifier(train_data,
                                          Context(parametric_types=[Gaussian, Gaussian, Categorical]).add_domains(
                                              train_data),
                                          learn_parametric, 2)

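    # np.nan marks the class column; mpe fills it with the most likely label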
    test_classification = np.array([3.0, 4.0, np.nan, 12.0, 18.0, np.nan]).reshape(-1, 3)
    print(test_classification)
    from spn.algorithms.MPE import mpe
    print(mpe(spn_classification, test_classification))
Example #22
def learn_whittle_spn_2d(train_data, n_RV, n_min_slice, init_scope=None):
    from spn.structure.leaves.parametric.Parametric import MultivariateGaussian

    # learn spn
    ds_context = Context(parametric_types=[MultivariateGaussian] *
                         n_RV).add_domains(train_data)

    print('learning WSPN')
    # need to pair RVs
    # need flag for 2d?
    l_rfft = get_l_rfft(args)
    # l_rfft!=None --> 2d/pair gaussian node, is_2d=True --> pairwise gaussian, full covariance matrix
    wspn = learn_parametric(train_data,
                            ds_context,
                            min_instances_slice=n_min_slice,
                            threshold=args.threshold,
                            initial_scope=init_scope,
                            cpus=1,
                            l_rfft=l_rfft,
                            is_2d=True)
    save_path = get_save_path(args)
    check_path(save_path)
    with open(save_path + 'wspn_2d.pkl', 'wb') as f:
        pickle.dump(wspn, f)

    return wspn
Example #23
    def test_learn(self):
        from sklearn.datasets import load_iris

        iris = load_iris()
        X = iris.data
        y = iris.target.reshape(-1, 1)

        train_data = np.hstack((X, y))

        from spn.algorithms.LearningWrappers import learn_parametric, learn_classifier
        from spn.structure.leaves.parametric.Parametric import Categorical, MultivariateGaussian
        from spn.structure.Base import Context

        spn_classification = learn_parametric(
            train_data,
            Context(
                parametric_types=[
                    MultivariateGaussian,
                    MultivariateGaussian,
                    MultivariateGaussian,
                    MultivariateGaussian,
                    Categorical,
                ]
            ).add_domains(train_data),
            multivariate_leaf=True,
        )
Example #24
    def test_conditional(self):
        labels = np.c_[np.zeros((500, 1)), np.ones((500, 1))]
        features = np.c_[
            np.r_[np.random.normal(5, 1, (500, 2)), np.random.normal(10, 1, (500, 2))]
        ]

        train_data = concatenate_yx(labels, features)

        ds_context = Context(
            parametric_types=[Bernoulli] * labels.shape[1]
        ).add_domains(labels)
        ds_context.feature_size = 2

        def label_conditional(y, x):
            from sklearn.cluster import KMeans

            # note: precompute_distances was removed in scikit-learn 1.0
            clusters = KMeans(
                n_clusters=2, random_state=17
            ).fit_predict(y)
            return clusters

        spn = learn_cspn_structure(
            train_data,
            ds_context,
            split_rows=get_split_conditional_rows_KMeans(),
            split_cols=getCIGroup(),
            create_leaf=create_conditional_leaf,
            label_conditional=label_conditional,
            cluster_univariate=True,
        )
Example #25
def build_spn(features):
    spn_classification = learn_classifier(
        features,
        Context(
            parametric_types=[Gaussian, Categorical, Categorical, Gaussian]
        ).add_domains(features),
        learn_parametric, 2)
    return spn_classification
Example #26
    def test_optimization(self):
        np.random.seed(17)
        d1 = np.random.normal(10, 5, size=2000).tolist()
        d2 = np.random.normal(30, 5, size=2000).tolist()
        data = d1 + d2
        data = np.array(data).reshape((-1, 10))
        data = data.astype(np.float32)

        ds_context = Context(meta_types=[MetaType.REAL] * data.shape[1],
                             parametric_types=[Gaussian] * data.shape[1])

        spn = learn_parametric(data, ds_context)

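        # perturb the learned parameters; EM should restore near-equal weights and the component mean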
        spn.weights = [0.8, 0.2]
        spn.children[0].children[0].mean = 3.0

        py_ll = np.sum(log_likelihood(spn, data))

        print(spn.weights, spn.children[0].children[0].mean)

        EM_optimization(spn, data, iterations=10)

        print(spn.weights, spn.children[0].children[0].mean)

        py_ll_opt = np.sum(log_likelihood(spn, data))

        self.assertLessEqual(py_ll, py_ll_opt)
        self.assertAlmostEqual(spn.weights[0], 0.5, 6)
        self.assertAlmostEqual(spn.weights[1], 0.5, 6)
        self.assertAlmostEqual(spn.children[0].children[0].mean, 10.50531, 4)
Example #27
    def test_naive_factorization(self):
        np.random.seed(17)
        data = np.arange(0, 1000).reshape(-1, 8)

        parent = Sum()
        parent.children.append(None)

        ctx = Context()
        ctx.feature_size = 4

        scope = [1, 3, 4, 6]
        data2 = np.array(data)
        result = naive_factorization(data=data2,
                                     parent=parent,
                                     pos=0,
                                     context=ctx,
                                     scope=list(scope))

        self.assertListEqual(data.tolist(), data2.tolist())

        self.assertEqual(parent.children[0], result[0][1]['parent'])

        y, x = get_YX(data, 4)

        self.assertEqual(len(result), len(scope))
        for i, s in enumerate(scope):
            r = result[i]
            self.assertEqual(len(r), 2)
            self.assertEqual(r[0], SplittingOperations.CREATE_LEAF_NODE)
            self.assertEqual(type(r[1]['parent']), Product)
            self.assertEqual(r[1]['pos'], i)
            self.assertListEqual(r[1]['scope'], [s])
            self.assertListEqual(r[1]['data'].tolist(),
                                 concatenate_yx(y[:, i], x).tolist())
Example #28
    def test_histogram_leaf(self):
        data = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
        ds_context = Context([MetaType.DISCRETE])
        ds_context.add_domains(data)
        hist = create_histogram_leaf(data, ds_context, [0], alpha=False)
        self.assertTrue(
            np.array_equal(mpe(hist, np.array([[np.nan]])), np.array([[3]])),
            "mpe should be 3")
Example #29
    def test_histogram_to_str_and_back(self):

        data = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
        ds_context = Context([MetaType.DISCRETE])
        ds_context.add_domains(data)
        hist = create_histogram_leaf(data, ds_context, [0], alpha=False)

        self.check_obj_and_reconstruction(hist)
Example #30
    def _fit(self, var_types=None, **kwargs):
        df = self.data.copy()
        # Exchange all object columns for their codes
        for key, value in self._categorical_variables.items():
            df[key] = value['categorical'].codes

        self._nameToVarType = var_types

        # check that variable types were given
        if self._nameToVarType is None:
            raise ValueError("missing argument 'var_types'")

        self._initial_names = self.names.copy()
        self._initial_names_count = len(self._initial_names)
        self._initial_names_to_index = {self._initial_names[i]: i for i in range(self._initial_names_count)}

        # Initialize _density_mask and _condition as a 1 x n row of np.nan each
        self._density_mask = np.full(
            (1, self._initial_names_count), np.nan, dtype=float)

        self._condition = np.full(
            (1, self._initial_names_count), np.nan, dtype=float)

        self._marginalized = set()
        self._conditioned = set()

        try:
            var_types = [self._nameToVarType[name] for name in self.names]
        except KeyError as err:
            raise ValueError("missing variable type information for dimension {}".format(err.args[0]))

        if self._spn_type == 'spn':
            context = Context(parametric_types=var_types).add_domains(df.values)
            self._spn = learn_parametric(df.values, context)

        elif self._spn_type == 'mspn':
            context = Context(meta_types=var_types).add_domains(df.values)
            self._spn = learn_mspn(df.values, context)
        else:
            raise ValueError("unknown SPN type: " + self._spn_type)
        return self._unbound_updater,