Exemple #1
0
    def test_optimization(self):
        """EM optimization recovers the true mixture weights and component means."""
        np.random.seed(17)
        # Two well-separated Gaussian clusters, folded into 4 columns.
        samples = np.random.normal(10, 1, size=4000).tolist() + \
                  np.random.normal(30, 1, size=4000).tolist()
        data = np.array(samples).reshape((-1, 4)).astype(np.float32)

        ctx = Context(meta_types=[MetaType.REAL] * data.shape[1],
                      parametric_types=[Gaussian] * data.shape[1])
        ctx.add_domains(data)

        spn = learn_parametric(data, ctx)

        # Deliberately perturb the learned parameters away from the optimum.
        spn.weights = [0.8, 0.2]
        spn.children[0].children[0].mean = 3.0

        ll_before = np.sum(log_likelihood(spn, data))
        print(spn.weights, spn.children[0].children[0].mean)

        EM_optimization(spn, data, iterations=1000)

        print(spn.weights, spn.children[0].children[0].mean)
        ll_after = np.sum(log_likelihood(spn, data))

        # EM must never decrease the training log-likelihood.
        self.assertLessEqual(ll_before, ll_after)
        # Both components carry equal mass by construction.
        self.assertAlmostEqual(spn.weights[0], 0.5, 4)
        self.assertAlmostEqual(spn.weights[1], 0.5, 4)

        means = sorted([spn.children[0].children[0].mean,
                        spn.children[1].children[0].mean])
        self.assertEqual(round(means[0]), 10)
        self.assertEqual(round(means[1]), 30)
Exemple #2
0
 def test_conditional_probability(self):
     """Conditional probabilities on a hand-built 3-variable categorical SPN.

     Uses the same SPN structure as the entropy test; ``train_data`` exists
     only so Context can derive the variable domains.
     """
     # test if conditional probability is correct
     # same spn as in entropy test
     # only for generating the ds_context
     train_data = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [2.0, 0.0, 1.0]])
     # spn
     ds_context = Context(meta_types=[MetaType.DISCRETE] * 3)
     ds_context.add_domains(train_data)
     # NOTE(review): other snippets in this file set ``parametric_types``
     # (plural); confirm this singular attribute is actually read by SPFlow.
     ds_context.parametric_type = [Categorical] * 3
     spn = 0.64 * (
         (
             Categorical(p=[0.25, 0.75, 0.0], scope=0)
             * (
                 0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                 + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
             )
         )
     ) + 0.36 * (
         (
             Categorical(p=[0.0, 0.0, 1.0], scope=0)
             * (
                 0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                 + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
             )
         )
     )
     # tests
     # Query the probability of one variable given a full instance.
     x_instance = np.array([1, 1, 0], dtype=float).reshape(1, -1)
     self.assertAlmostEqual(conditional_probability(spn, 2, x_instance)[0][0], 0.9)
     self.assertAlmostEqual(conditional_probability(spn, 0, x_instance)[0][0], 0.48)
     x_instance = np.array([2, 1, 0], dtype=float).reshape(1, -1)
     self.assertAlmostEqual(conditional_probability(spn, 0, x_instance)[0][0], 0.36)
Exemple #3
0
def learn_parametric_spn(data, parametric_types):
    """Learn a parametric SPN from ``data``.

    :param data: numpy array; rows are instances, columns are variables
    :param parametric_types: leaf distribution class per column
    :return: the learned SPN root node
    """
    from spn.algorithms.LearningWrappers import learn_parametric

    # Compute the per-column domains once — the original both chained
    # .add_domains(data) onto the constructor and then called it again.
    ds_context = Context(parametric_types=parametric_types)
    ds_context.add_domains(data)
    return learn_parametric(data, ds_context, min_instances_slice=100, threshold=0.01)
Exemple #4
0
    def test_Histogram_discrete_inference(self):
        """Histogram-leaf likelihoods match empirical frequencies, with and without smoothing."""
        for use_alpha, expected in (
            (False, [2 / 6, 2 / 6, 1 / 6, 3 / 6, 3 / 6, 3 / 6]),   # raw relative frequencies
            (True, [3 / 9, 3 / 9, 2 / 9, 4 / 9, 4 / 9, 4 / 9]),    # additively smoothed
        ):
            data = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
            ds_context = Context([MetaType.DISCRETE])
            ds_context.add_domains(data)
            hist = create_histogram_leaf(data, ds_context, [0], alpha=use_alpha)
            probs = np.exp(log_likelihood(hist, data))
            for row, expected_p in enumerate(expected):
                self.assertAlmostEqual(float(probs[row]), expected_p)
Exemple #5
0
    def test_histogram_to_str_and_back(self):
        """A histogram leaf survives a round-trip through its string form."""
        samples = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
        ctx = Context([MetaType.DISCRETE])
        ctx.add_domains(samples)
        leaf = create_histogram_leaf(samples, ctx, [0], alpha=False)
        self.check_obj_and_reconstruction(leaf)
Exemple #6
0
 def test_histogram_leaf(self):
     """MPE of a histogram leaf is the mode of its training data."""
     samples = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
     ctx = Context([MetaType.DISCRETE])
     ctx.add_domains(samples)
     leaf = create_histogram_leaf(samples, ctx, [0], alpha=False)
     result = mpe(leaf, np.array([[np.nan]]))
     self.assertTrue(np.array_equal(result, np.array([[3]])), "mpe should be 3")
Exemple #7
0
 def test_valid_histogram(self):
     """A KDE-sourced histogram over spiky data still yields multiple bins."""
     np.random.seed(17)
     raw = [1] + [5] * 20 + [7] + [10] * 50 + [20] + [30] * 10
     samples = np.array(raw).reshape((-1, 1))
     ctx = Context([MetaType.REAL])
     ctx.add_domains(samples)
     leaf = create_histogram_leaf(samples, ctx, [0], alpha=False, hist_source="kde")
     self.assertGreater(len(leaf.bin_repr_points), 1)
Exemple #8
0
 def test_PWL_no_variance(self):
     """Building a piecewise-linear leaf from zero-variance data must fail."""
     samples = np.array([1.0, 1.0]).reshape(-1, 1)
     ctx = Context([MetaType.REAL])
     ctx.add_domains(samples)
     with self.assertRaises(AssertionError):
         create_piecewise_leaf(samples, ctx, scope=[0], hist_source="kde")
Exemple #9
0
    def test_PWL_no_variance(self):
        """Likelihood of a piecewise-linear leaf built from constant data.

        NOTE(review): the data has only two rows, yet 2/6 is expected —
        presumably copied from the 6-sample histogram tests. Confirm the
        expected values against the current create_piecewise_leaf output
        (another snippet in this file expects this construction to raise).
        """
        data = np.array([1.0, 1.0]).reshape(-1, 1)
        ds_context = Context([MetaType.REAL])
        ds_context.add_domains(data)
        leaf = create_piecewise_leaf(data, ds_context, scope=[0], hist_source="kde")
        prob = np.exp(log_likelihood(leaf, data))

        self.assertAlmostEqual(float(prob[0]), 2 / 6)
        self.assertAlmostEqual(float(prob[1]), 2 / 6)
Exemple #10
0
 def test_we_score(self):
     """Weight-of-evidence on a hand-built categorical SPN.

     The triple-quoted block below documents how the training data and the
     SPN were originally generated; it is kept as reference only.
     """
     # test if we_score is correct
     """
     # explain how training data and the spn comes
     # number of RVs
     M = 3
     # table of probabilities
     p1 = 0.6
     p2 = 0.3
     p31 = 0.1
     p32 = 0.9
     # generate x1 and x2
     x1 = np.random.binomial(1, p1, size=N) + np.random.binomial(1, p1, size=N)
     x2 = np.random.binomial(1, p2, size=N)
     x3 = np.zeros(N)
     # generate x3
     for i in range(N):
         if x2[i] == 1:
             x3[i] = np.random.binomial(1, p31, size=1)
         else:
             x3[i] = np.random.binomial(1, p32, size=1)
     # form a matrix, rows are instances and columns are RVs
     train_data = np.concatenate((x1, x2, x3)).reshape((M, N)).transpose()
     """
     # only for generating the ds_context
     train_data = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [2.0, 0.0, 1.0]])
     # spn
     ds_context = Context(meta_types=[MetaType.DISCRETE] * 3)
     ds_context.add_domains(train_data)
     # NOTE(review): other snippets use ``parametric_types`` (plural) —
     # confirm this singular attribute is actually read by SPFlow.
     ds_context.parametric_type = [Categorical] * 3
     spn = 0.64 * (
         (
             Categorical(p=[0.25, 0.75, 0.0], scope=0)
             * (
                 0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                 + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
             )
         )
     ) + 0.36 * (
         (
             Categorical(p=[0.0, 0.0, 1.0], scope=0)
             * (
                 0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                 + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
             )
         )
     )
     # test
     n = 40000
     x_instance = np.array([1, 1, 0], dtype=float).reshape(1, -1)
     y_index = 0
     we = weight_of_evidence(spn, 0, x_instance, n, ds_context.domains[y_index].shape[0])
     we_true = np.array([[np.nan, 0, 0]])
     # Drop the NaN slots before comparing (the instantiated value itself
     # gets no weight-of-evidence score).
     we = we[~np.isnan(we)]
     we_true = we_true[~np.isnan(we_true)]
     self.assertTrue((we == we_true).all())
Exemple #11
0
    def test_PWL(self):
        """Build a piecewise-linear leaf from a bimodal Gaussian sample.

        NOTE(review): this test has no assertion — ``prob`` is computed and
        discarded — and the RNG is not seeded, so it only verifies that leaf
        creation and likelihood evaluation run without raising.
        """
        #data = np.array([1.0, 1.0, 2.0, 3.0]*100).reshape(-1, 1)

        data = np.r_[np.random.normal(10, 5, (300, 1)),
                     np.random.normal(20, 10, (700, 1))]

        ds_context = Context([MetaType.REAL])
        ds_context.add_domains(data)
        leaf = create_piecewise_leaf(data,
                                     ds_context,
                                     scope=[0],
                                     prior_weight=None,
                                     hist_source="kde")
        prob = np.exp(log_likelihood(leaf, data))
Exemple #12
0
 def test_mixture_gaussians(self):
     """A KDE histogram should approximate a two-component Gaussian mixture density."""
     np.random.seed(17)
     samples = np.random.normal(10, 1, size=200).tolist() + np.random.normal(30, 1, size=200).tolist()
     samples = np.array(samples).reshape((-1, 1))
     ctx = Context([MetaType.REAL])
     ctx.add_domains(samples)
     leaf = create_histogram_leaf(samples, ctx, [0], alpha=False, hist_source="kde")
     # Evaluate on a dense grid plus the training points themselves.
     grid = np.sort(np.linspace(0, 60, 1000).tolist() + samples[:, 0].tolist())
     from scipy.stats import norm
     truth = 0.5 * norm.pdf(grid, 10, 1) + 0.5 * norm.pdf(grid, 30, 1)
     approx = likelihood(leaf, grid.reshape((-1, 1)))
     # Total absolute deviation from the true density stays bounded.
     self.assertLessEqual(np.sum(np.abs(approx[:, 0] - truth)), 7)
Exemple #13
0
def get_ds_context(data, scope, params):
    """Build an SPFlow Context for ``data`` restricted to ``scope``.

    :param data: numpy array the variable domains are derived from
    :param scope: indices of the variables that ``data``'s columns cover
    :param params: SPMN parameter object providing ``meta_types`` and
        ``feature_names`` for the full variable set
    :return: Context with domains added
    """
    # Select the names/meta-types belonging to this scope.
    scope_names = np.array(params.feature_names)[scope].tolist()
    scope_meta_types = [params.meta_types[i] for i in scope]
    ds_context = Context(meta_types=scope_meta_types,
                         scope=scope,
                         feature_names=scope_names)
    ds_context.add_domains(data)
    return ds_context
    def test_Histogram_expectations(self):
        """Histogram-leaf expectation matches the sample mean for real and discrete data."""
        # Samplers are lazy so the RNG draw order matches the original
        # sequence (randn first, randint second).
        cases = (
            (MetaType.REAL, lambda: np.random.randn(20000)),
            (MetaType.DISCRETE, lambda: np.random.randint(0, high=100, size=20000)),
        )
        for meta_type, sampler in cases:
            data = sampler().reshape(-1, 1)
            ds_context = Context(meta_types=[meta_type])
            ds_context.add_domains(data)
            leaf = create_histogram_leaf(data, ds_context, scope=[0])
            expectation = Expectation(leaf, set([0]))
            self.assertAlmostEqual(np.mean(data[:, 0]), expectation[0, 0], 3)
Exemple #15
0
    def test_singular_domain(self):
        """learn_mspn handles a BINARY column alongside a DISCRETE one."""
        import numpy as np
        from spn.structure.Base import Context
        from spn.structure.StatisticalTypes import MetaType
        from spn.algorithms.LearningWrappers import learn_mspn

        np.random.seed(123)
        discrete_col = np.random.randint(3, size=1000).reshape(-1, 1)
        binary_col = np.random.randint(2, size=1000).reshape(-1, 1)
        train_data = np.c_[discrete_col, binary_col]

        ctx = Context(meta_types=[MetaType.DISCRETE, MetaType.BINARY])
        ctx.add_domains(train_data)

        # Success criterion is simply that structure learning completes.
        learn_mspn(train_data, ctx, min_instances_slice=20)
Exemple #16
0
 def test_conditional_mutual_info(self):
     """Conditional mutual information on a hand-built categorical SPN.

     Reference CMI values are computed analytically from the marginal and
     joint entropies of the generating distribution, then compared against
     conditional_mutual_information evaluated on the SPN.
     """
     # test if conditional mutual info is correct
     # same spn as in entropy test
     # only for generating the ds_context
     train_data = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [2.0, 0.0, 1.0]])
     # spn
     ds_context = Context(meta_types=[MetaType.DISCRETE] * 3)
     ds_context.add_domains(train_data)
     # NOTE(review): other snippets use ``parametric_types`` (plural) —
     # confirm this singular attribute is actually read by SPFlow.
     ds_context.parametric_type = [Categorical] * 3
     spn = 0.64 * (
         (
             Categorical(p=[0.25, 0.75, 0.0], scope=0)
             * (
                 0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                 + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
             )
         )
     ) + 0.36 * (
         (
             Categorical(p=[0.0, 0.0, 1.0], scope=0)
             * (
                 0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                 + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
             )
         )
     )
     # real mutual info
     # Analytic entropies of the generating distribution.
     p2 = 0.3
     p3 = 0.66
     h_x1 = -(0.16 * np.log(0.16) + 0.36 * np.log(0.36) + 0.48 * np.log(0.48))
     h_x2x1 = -(0.7 * np.log(0.7) + 0.3 * np.log(0.3)) + h_x1
     h_x3x1 = -(0.66 * np.log(0.66) + 0.34 * np.log(0.34)) + h_x1
     h_x2x3 = -(p2 * np.log(p2) + (1 - p2) * np.log(1 - p2) + 0.9 * np.log(0.9) + 0.1 * np.log(0.1))
     h_x2x3x1 = h_x1 + h_x2x3
     # I(X1;X2|X0) = H(X1,X0) + H(X2,X0) - H(X1,X2,X0) - H(X0)
     cmi_x2x3_x1 = h_x2x1 + h_x3x1 - h_x2x3x1 - h_x1
     self.assertAlmostEqual(cmi_x2x3_x1, conditional_mutual_information(spn, ds_context, {1}, {2}, {0}))
     h_x1x3 = h_x3x1
     h_x1x2x3 = h_x2x3x1
     h_x3 = -p3 * np.log(p3) - (1 - p3) * np.log(1 - p3)
     cmi_x1x2_x3 = h_x1x3 + h_x2x3 - h_x1x2x3 - h_x3
     self.assertAlmostEqual(cmi_x1x2_x3, conditional_mutual_information(spn, ds_context, {1}, {0}, {2}))
     h_x1x2x3 = h_x2x3x1
     h_x2 = -p2 * np.log(p2) - (1 - p2) * np.log(1 - p2)
     cmi_x1x3_x2 = h_x2x1 + h_x2x3 - h_x1x2x3 - h_x2
     self.assertAlmostEqual(cmi_x1x3_x2, conditional_mutual_information(spn, ds_context, {2}, {0}, {1}))
Exemple #17
0
 def test_mutual_info(self):
     """Mutual information on a hand-built categorical SPN.

     Reference MI values are computed analytically from the entropies of
     the generating distribution; also checks symmetry of the estimator
     and independence (MI == 0) where the SPN factorizes.
     """
     # test if mutual info is correct
     # same spn as in entropy test
     # only for generating the ds_context
     train_data = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [2.0, 0.0, 1.0]])
     # spn
     ds_context = Context(meta_types=[MetaType.DISCRETE] * 3)
     ds_context.add_domains(train_data)
     # NOTE(review): other snippets use ``parametric_types`` (plural) —
     # confirm this singular attribute is actually read by SPFlow.
     ds_context.parametric_type = [Categorical] * 3
     spn = 0.64 * (
         (
             Categorical(p=[0.25, 0.75, 0.0], scope=0)
             * (
                 0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                 + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
             )
         )
     ) + 0.36 * (
         (
             Categorical(p=[0.0, 0.0, 1.0], scope=0)
             * (
                 0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                 + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
             )
         )
     )
     # real mutual info
     # Analytic entropies of the generating distribution.
     p2 = 0.3
     p3 = 0.66
     h_x2 = -p2 * np.log(p2) - (1 - p2) * np.log(1 - p2)
     h_x3 = -p3 * np.log(p3) - (1 - p3) * np.log(1 - p3)
     h_x2x3 = -(p2 * np.log(p2) + (1 - p2) * np.log(1 - p2) + 0.9 * np.log(0.9) + 0.1 * np.log(0.1))
     # I(X1;X2) = H(X1) + H(X2) - H(X1,X2)
     mi_x2x3 = h_x2 + h_x3 - h_x2x3
     self.assertAlmostEqual(mi_x2x3, mutual_information(spn, ds_context, {1}, {2}))
     mi_x1x2 = 0
     self.assertAlmostEqual(mi_x1x2, mutual_information(spn, ds_context, {1}, {0}))
     # test symmetry
     self.assertAlmostEqual(
         mutual_information(spn, ds_context, {2}, {1}), mutual_information(spn, ds_context, {1}, {2})
     )
     self.assertAlmostEqual(
         mutual_information(spn, ds_context, {0, 2}, {1}), mutual_information(spn, ds_context, {1}, {0, 2})
     )
     # rest 0
     self.assertAlmostEqual(0, mutual_information(spn, ds_context, {2, 1}, {0}))
Exemple #18
0
    def compute_kmeans_rdc(data):
        """Cluster the rows into two groups by k-means on RDC-transformed features."""
        n_cols = data.shape[1]
        ctx = Context(meta_types=[MetaType.REAL] * n_cols)
        ctx.add_domains(data)
        full_scope = list(range(n_cols))

        # Project the data through the randomized dependence coefficient
        # feature map before clustering.
        transformed = rdc_transformer(
            data,
            ctx.get_meta_types_by_scope(full_scope),
            ctx.get_domains_by_scope(full_scope),
            k=10,
            s=1 / 6,
            non_linearity=np.sin,
            return_matrix=True,
            rand_gen=RandomState(17),
        )

        clusterer = KMeans(n_clusters=2, random_state=RandomState(17))
        return clusterer.fit_predict(transformed)
Exemple #19
0
def learn_parametric_spn(data,
                         parametric_types,
                         rdc_threshold=0.3,
                         min_instances_slice=0.05,
                         clustering='kmeans'):
    """Learn a parametric SPN and report its wall-clock construction time.

    :param data: numpy array; rows are instances
    :param parametric_types: leaf distribution class per column
    :param rdc_threshold: RDC independence threshold for column splits
    :param min_instances_slice: minimum slice size as a fraction of len(data)
    :param clustering: row-splitting method forwarded to learn_parametric
    :return: tuple ``(spn, construction_time_seconds)``
    """
    # Domains only need to be computed once — add_domains was previously
    # invoked twice (chained on the constructor and then again).
    ds_context = Context(parametric_types=parametric_types)
    ds_context.add_domains(data)
    mis = int(len(data) * min_instances_slice)

    t0 = time.time()
    spn = learn_parametric(data,
                           ds_context,
                           threshold=rdc_threshold,
                           min_instances_slice=mis,
                           rows=clustering)
    const_time = time.time() - t0

    return spn, const_time
Exemple #20
0
    def test_eval_histogram(self):
        """TensorFlow evaluation of an MSPN matches the python log-likelihood."""
        np.random.seed(17)
        raw = (np.random.normal(10, 0.01, size=2000).tolist()
               + np.random.normal(30, 10, size=2000).tolist())
        data = np.array(raw).reshape((-1, 10))
        # Clamp negatives and discretize so the columns are valid DISCRETE data.
        data[data < 0] = 0
        data = data.astype(int)

        ctx = Context(meta_types=[MetaType.DISCRETE] * data.shape[1])
        ctx.add_domains(data)
        spn = learn_mspn(data, ctx)

        py_ll = log_likelihood(spn, data)
        tf_ll = eval_tf(spn, data)
        self.assertTrue(np.all(np.isclose(py_ll, tf_ll)))
    def test_Piecewise_expectations_with_evidence(self):
        """Conditional expectation with discrete evidence selects the right mixture branch.

        NOTE(review): the RNG is not seeded here, so the test relies on the
        2-decimal tolerance absorbing sampling noise.
        """
        # Component A: indicator column 1 == 0, continuous column 0 ~ N(100, 5).
        adata = np.zeros((20000, 2))
        adata[:, 1] = 0
        adata[:, 0] = np.random.normal(loc=100.0,
                                       scale=5.00,
                                       size=adata.shape[0])

        # Component B: indicator column 1 == 1, continuous column 0 ~ N(50, 5).
        bdata = np.zeros_like(adata)
        bdata[:, 1] = 1
        bdata[:, 0] = np.random.normal(loc=50.0,
                                       scale=5.00,
                                       size=bdata.shape[0])

        data = np.vstack((adata, bdata))

        ds_context = Context(meta_types=[MetaType.REAL, MetaType.DISCRETE])
        ds_context.parametric_types = [None, Categorical]
        ds_context.add_domains(data)
        # Each mixture branch: piecewise-linear leaf over column 0 times a
        # categorical leaf over the indicator column.
        L = create_piecewise_leaf(
            adata[:, 0].reshape(-1, 1),
            ds_context,
            scope=[0],
            prior_weight=None,
            hist_source="numpy") * create_parametric_leaf(
                adata[:, 1].reshape(-1, 1), ds_context, scope=[1])
        R = create_piecewise_leaf(
            bdata[:, 0].reshape(-1, 1),
            ds_context,
            scope=[0],
            prior_weight=None,
            hist_source="numpy") * create_parametric_leaf(
                bdata[:, 1].reshape(-1, 1), ds_context, scope=[1])

        spn = 0.5 * L + 0.5 * R

        # Row 0: evidence indicator 0 (component A); row 1: indicator 1 (B).
        evidence = np.zeros((2, 2))
        evidence[1, 1] = 1
        evidence[:, 0] = np.nan
        expectation = Expectation(spn, set([0]), evidence)

        self.assertAlmostEqual(np.mean(adata[:, 0]), expectation[0, 0], 2)
        self.assertAlmostEqual(np.mean(bdata[:, 0]), expectation[1, 0], 2)
    def test_Piecewise_expectations(self):
        """Piecewise-linear leaf expectation matches the sample mean."""
        # Samplers are lazy so RNG draws happen in the original order
        # (normal first, then randint). The decimal tolerance differs per case.
        cases = (
            (MetaType.REAL,
             lambda: np.random.normal(loc=100.0, scale=5.00, size=20000), 2),
            (MetaType.DISCRETE,
             lambda: np.random.randint(0, high=100, size=2000), 3),
        )
        for meta_type, sampler, places in cases:
            data = sampler().reshape(-1, 1)
            ds_context = Context(meta_types=[meta_type])
            ds_context.add_domains(data)
            leaf = create_piecewise_leaf(data,
                                         ds_context,
                                         scope=[0],
                                         prior_weight=None)
            expectation = Expectation(leaf, set([0]))
            self.assertAlmostEqual(np.mean(data[:, 0]), expectation[0, 0], places)
Exemple #23
0
import numpy as np
from spn.structure.Base import Context
from spn.structure.StatisticalTypes import MetaType
from spn.algorithms.LearningWrappers import learn_mspn
# NOTE(review): plot_spn is imported but unused in this snippet — possibly
# used later in the full script.
from spn.io.Graphics import plot_spn

# Fixed seed for reproducible data and structure learning.
np.random.seed(123)

# Synthetic mixed dataset: two discrete columns, a bimodal continuous
# column, and a continuous column that depends linearly on the others.
a = np.random.randint(2, size=1000).reshape(-1, 1)
b = np.random.randint(3, size=1000).reshape(-1, 1)
c = np.r_[np.random.normal(10, 5, (300, 1)),
          np.random.normal(20, 10, (700, 1))]
d = 5 * a + 3 * b + c
train_data = np.c_[a, b, c, d]

ds_context = Context(meta_types=[
    MetaType.DISCRETE, MetaType.DISCRETE, MetaType.REAL, MetaType.REAL
])
ds_context.add_domains(train_data)

# Learn a mixed SPN over the four columns.
mspn = learn_mspn(train_data, ds_context, min_instances_slice=20)
Exemple #24
0
        data[:, horizontal_middle:, vertical_middle:].reshape(len(data), -1),
    )

    # # spn
    # ds_context = Context(meta_types=[MetaType.REAL] * blocked_images[0].shape[1])
    # ds_context.add_domains(blocked_images[0])
    # ds_context.parametric_type = [Poisson] * blocked_images[0].shape[1]
    #
    # print("data ready", data.shape)
    # #the following two options should be working now.
    # # spn = learn_structure(upperimage, ds_context, get_split_rows_random_partition(np.random.RandomState(17)), get_split_cols_random_partition(np.random.RandomState(17)), create_parametric_leaf)
    # spn = learn_parametric(blocked_images[0], ds_context, min_instances_slice=0.1*len(data), ohe=False)

    # spn
    ds_context = Context(meta_types=[MetaType.DISCRETE] * 10)
    ds_context.add_domains(data_labels)
    ds_context.parametric_types = [Bernoulli] * blocked_images[0].shape[1]
    spn = learn_parametric(data_labels,
                           ds_context,
                           min_instances_slice=0.3 * len(data_labels))

    # first cspn
    dataIn = data_labels
    dataOut = blocked_images[0]
    ds_context = Context(meta_types=[MetaType.DISCRETE] * dataOut.shape[1])
    ds_context.add_domains(dataOut)
    ds_context.parametric_types = [Conditional_Poisson] * dataOut.shape[1]

    scope = list(range(dataOut.shape[1]))
    print(np.shape(dataIn), np.shape(dataOut))
    print(dataIn[0], dataOut[0])
Exemple #25
0
add_parametric_inference_support()
# NOTE(review): joblib renamed the ``cachedir`` argument to ``location`` in
# newer releases — confirm the pinned joblib version before changing it.
memory = Memory(cachedir="cache", verbose=0, compress=9)

# Synthetic dataset: three discrete inputs plus a derived class label.
data = []
for x in range(10):
    for y in range(10):
        for z in range(10):
            data.append([x, y, z, int(((x + y + z) / 5))])
# BUG FIX: ``np.float`` was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin ``float`` is the documented drop-in replacement (float64).
data = np.array(data).astype(float)
types = [
    MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE
]

ds_context = Context(meta_types=types)
ds_context.parametric_types = [Gaussian, Gaussian, Gaussian, Categorical]
ds_context.add_domains(data)

num_classes = len(np.unique(data[:, 3]))

# Class-conditional mixture: one MSPN branch per label value, weighted by
# that label's empirical frequency.
spn = Sum()
for label, count in zip(*np.unique(data[:, 3], return_counts=True)):
    branch = learn_mspn(data[data[:, 3] == label, :],
                        ds_context,
                        min_instances_slice=10,
                        leaves=create_leaf,
                        threshold=0.1)
    spn.children.append(branch)
    spn.weights.append(count / data.shape[0])
        block_ids = np.arange(i, -1, -1)
        datasets.append((get_blocks(images, num_blocks=num_blocks, blocks=block_ids.tolist()), 1))

    num_mpes = 1
    num_samples = 10

    cspns = []
    mpe_query_blocks = None
    sample_query_blocks = None
    for i, ((tr_block, block_idx), conditional_blocks) in enumerate(datasets):
        print("learning", i)
        conditional_features_count = (tr_block.shape[1] // len(block_idx)) * conditional_blocks
        if i == 0:
            # spn
            ds_context = Context(meta_types=[MetaType.REAL] * tr_block.shape[1])
            ds_context.add_domains(tr_block)
            ds_context.parametric_types = [Gaussian] * tr_block.shape[1]

            cspn = learn_parametric(tr_block, ds_context, min_instances_slice=20, ohe=False, memory=memory)
        else:
            cspn = learn_conditional(
                tr_block,
                Context(
                    meta_types=[MetaType.REAL] * tr_block.shape[1],
                    parametric_types=[Conditional_Gaussian] * tr_block.shape[1],
                ).add_domains(tr_block),
                scope=list(range(conditional_features_count)),
                min_instances_slice=30,
                memory=memory,
            )
        cspns.append(cspn)
def train_spn(window_size=3,
              min_instances_slice=10000,
              features=None,
              number_of_classes=3):
    """Train (or load a cached) class-conditional SPN over pixel windows and
    report its accuracy on a held-out split.

    :param window_size: side length of the square pixel window
    :param min_instances_slice: minimum slice size for structure learning
    :param features: feature-band indices to extract; defaults to [20, 120]
    :param number_of_classes: number of target classes
    :return: tuple ``(spn, accuracy)``
    """
    if features is None:
        features = [20, 120]

    add_parametric_inference_support()
    add_parametric_text_support()

    data = get_data_in_window(window_size=window_size,
                              features=features,
                              three_classes=number_of_classes == 3)

    # Stratified 80/20 split keyed on the window's centre label pixel.
    sss = sk.model_selection.StratifiedShuffleSplit(test_size=0.2,
                                                    train_size=0.8,
                                                    random_state=42)
    for train_index, test_index in sss.split(
            data[:, 0:window_size * window_size * len(features)],
            data[:, (window_size * window_size * len(features)) +
                 (int(window_size * window_size / 2))]):
        X_train, X_test = data[train_index], data[test_index]

    # One REAL/Gaussian column per feature pixel...
    context_list = list()
    parametric_list = list()
    number_of_features = len(features)
    for _ in range(number_of_features * window_size * window_size):
        context_list.append(MetaType.REAL)
        parametric_list.append(Gaussian)

    # ...plus one DISCRETE/Categorical column per label pixel.
    for _ in range(window_size * window_size):
        context_list.append(MetaType.DISCRETE)
        parametric_list.append(Categorical)

    ds_context = Context(meta_types=context_list)
    ds_context.add_domains(data)
    ds_context.parametric_types = parametric_list

    # Reuse a previously trained model when one is cached on disk.
    spn = load_spn(window_size, features, min_instances_slice,
                   number_of_classes)
    if spn is None:
        # Build a mixture with one branch per (label pixel, label value),
        # weighted by the branch's training-set size.
        spn = Sum()
        for class_pixel in tqdm(range(-window_size * window_size, 0)):
            for label, count in zip(
                    *np.unique(data[:, class_pixel], return_counts=True)):
                train_data = X_train[X_train[:, class_pixel] == label, :]
                branch = learn_parametric(
                    train_data,
                    ds_context,
                    min_instances_slice=min_instances_slice)
                spn.children.append(branch)
                spn.weights.append(train_data.shape[0])

        # NOTE(review): ``branch`` is whatever the last loop iteration
        # produced — extending the root scope from it assumes every branch
        # shares the same scope; confirm.
        spn.scope.extend(branch.scope)
        spn.weights = (np.array(spn.weights) / sum(spn.weights)).tolist()

        assign_ids(spn)
        save_spn(spn, window_size, features, min_instances_slice,
                 number_of_classes)

    res = np.ndarray((X_test.shape[0], number_of_classes))

    # Score every candidate class for the centre pixel, pick the argmax.
    for i in tqdm(range(number_of_classes)):
        tmp = X_test.copy()
        tmp[:, -int((window_size**2) / 2)] = i
        res[:, i] = log_likelihood(spn, tmp)[:, 0]

    predicted_classes = np.argmax(res, axis=1).reshape((X_test.shape[0], 1))

    correct_predicted = 0
    # NOTE(review): ground truth is read from column -5, but the column
    # overwritten above is -int(w**2/2) (== -4 for window_size=3), while the
    # split above stratified on offset +int(w*w/2) (== column -5 for w=3).
    # These three indices do not all agree — confirm which column holds the
    # centre label.
    for x, y in zip(X_test[:, -5], predicted_classes):
        if x == y[0]:
            correct_predicted += 1
    accuracy = correct_predicted / X_test.shape[0]
    return spn, accuracy
    add_conditional_inference_support()

    np.random.seed(42)
    dataIn = np.random.randint(low=0, high=3, size=600).reshape(-1, 2)
    dataOut = np.random.randint(low=0, high=3, size=1200).reshape(-1, 4)
    data = np.concatenate((dataOut, dataIn), axis=1)
    assert data.shape[
        1] == dataIn.shape[1] + dataOut.shape[1], 'invalid column size'
    assert data.shape[0] == dataIn.shape[0] == dataOut.shape[
        0], 'invalid row size'

    ds_context = Context(meta_types=[
        MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE,
        MetaType.DISCRETE
    ])
    ds_context.add_domains(dataOut)
    ds_context.parametric_types = [
        Conditional_Poisson, Conditional_Poisson, Conditional_Poisson,
        Conditional_Poisson
    ]

    scope = list(range(dataOut.shape[1]))

    spn = Sum()

    for label, count in zip(*np.unique(data[:, 2], return_counts=True)):
        branch = learn_conditional(data,
                                   ds_context,
                                   scope,
                                   min_instances_slice=10000)
        spn.children.append(branch)
Exemple #29
0
    dataOut = data[:, :horizontal_middle, vertical_middle:].reshape(len(data), -1)
    print(data[0])
    print("_______")
    print(dataIn[0])
    print("_______")
    print(dataOut[0])
    print("_______")
    zeros[:, :horizontal_middle, :vertical_middle] = dataIn.reshape(len(data), 4, 4)  #data[:, :horizontal_middle, :vertical_middle]
    zeros[:, :horizontal_middle, vertical_middle:] = dataOut.reshape(len(data), 4, 4) #data[:, :horizontal_middle, vertical_middle:]
    print(zeros[0], np.shape(zeros))    #print(np.concatenate((dataIn, dataOut), axis=1).reshape(len(dataIn), 4, 8)[0])
    """

    # spn
    ds_context = Context(meta_types=[MetaType.REAL] *
                         blocked_images[0].shape[1])
    ds_context.add_domains(blocked_images[0])
    ds_context.parametric_types = [Poisson] * blocked_images[0].shape[1]

    print("data ready", data.shape)
    # the following two options should be working now.
    spn = learn_parametric(blocked_images[0],
                           ds_context,
                           min_instances_slice=0.1 * len(data),
                           ohe=False)

    # cspn
    dataIn = blocked_images[
        0]  # data[:, :horizontal_middle, :vertical_middle].reshape(len(data), -1)
    dataOut = blocked_images[
        1]  # data[:, :horizontal_middle, vertical_middle:].reshape(len(data), -1)
Exemple #30
0
def max_rdc(schema,
            left_table,
            right_table,
            df_samples,
            meta_types,
            rdc_attribute_dict,
            max_sampling_threshold_rows=10000,
            k=10,
            s=1 / 6,
            non_linearity=np.sin,
            n_jobs=-2,
            debug=True):
    """Compute the maximum pairwise RDC between columns of two joined tables.

    Side effect: fills ``rdc_attribute_dict`` with the RDC value for every
    (left column, right column) pair, symmetrically. Also mutates
    ``df_samples`` in place by dropping irrelevant columns.

    :param schema: schema object providing table and relationship metadata
    :param left_table: name of the left table
    :param right_table: name of the right table
    :param df_samples: DataFrame of joined samples (mutated in place)
    :param meta_types: meta type per column of ``df_samples``
    :param rdc_attribute_dict: output dict of pairwise RDC values
    :param max_sampling_threshold_rows: subsample above this many rows
    :return: maximum RDC over all left/right column pairs
    """
    # only keep columns of left or right table
    irrelevant_cols = []
    relevant_meta_types = []
    for i, column in enumerate(df_samples.columns):
        not_of_left_or_right = not (column.startswith(left_table + '.')
                                    or column.startswith(right_table + '.'))
        is_nn_attribute = (column == left_table + '.' + schema.table_dictionary[left_table].table_nn_attribute) or \
                          (column == right_table + '.' + schema.table_dictionary[right_table].table_nn_attribute)
        is_multiplier = False
        is_fk_field = False
        for relationship_obj in schema.relationships:
            if relationship_obj.end + '.' + relationship_obj.end_attr == column or \
                    relationship_obj.start + '.' + relationship_obj.start_attr == column:
                is_fk_field = True
                break

            if relationship_obj.end + '.' + relationship_obj.multiplier_attribute_name_nn == column or \
                    relationship_obj.end + '.' + relationship_obj.multiplier_attribute_name == column:
                is_multiplier = True
                break
        # Placeholder flag: currently never set; kept for interface parity.
        is_uninformative = False

        if not_of_left_or_right or is_nn_attribute or is_multiplier or is_fk_field or is_uninformative:
            irrelevant_cols.append(column)
        else:
            relevant_meta_types.append(meta_types[i])

    df_samples.drop(columns=irrelevant_cols, inplace=True)

    left_column_names = [(i, column)
                         for i, column in enumerate(df_samples.columns)
                         if column.startswith(left_table + '.')]
    right_column_names = [(i, column)
                          for i, column in enumerate(df_samples.columns)
                          if column.startswith(right_table + '.')]
    left_columns = [i for i, column in left_column_names]
    right_columns = [i for i, column in right_column_names]

    data = df_samples.values
    # sample if necessary
    if data.shape[0] > max_sampling_threshold_rows:
        data = data[np.random.randint(data.shape[0],
                                      size=max_sampling_threshold_rows), :]

    n_features = data.shape[1]
    assert n_features == len(relevant_meta_types)

    ds_context = Context(meta_types=relevant_meta_types)
    ds_context.add_domains(data)

    rdc_features = rdc_transformer(data,
                                   relevant_meta_types,
                                   ds_context.domains,
                                   k=k,
                                   s=s,
                                   non_linearity=non_linearity,
                                   return_matrix=False)
    pairwise_comparisons = [(i, j) for i in left_columns
                            for j in right_columns]

    from joblib import Parallel, delayed

    rdc_vals = Parallel(n_jobs=n_jobs, max_nbytes=1024,
                        backend="threading")(delayed(rdc_cca)((i, j,
                                                               rdc_features))
                                             for i, j in pairwise_comparisons)

    # BUG FIX: replace NaN correlations with 0 in the list itself. The
    # previous code only rebound the loop variable inside the debug loop,
    # so NaNs leaked into rdc_attribute_dict and into max(rdc_vals) below.
    rdc_vals = [0 if np.isnan(rdc) else rdc for rdc in rdc_vals]

    if debug:
        for (i, j), rdc in zip(pairwise_comparisons, rdc_vals):
            logger.debug(
                f"{df_samples.columns[i]}, {df_samples.columns[j]}: {rdc}")

    pairwise_comparisons = [(column_left, column_right)
                            for i, column_left in left_column_names
                            for j, column_right in right_column_names]
    for (column_left, column_right), rdc in zip(pairwise_comparisons,
                                                rdc_vals):
        rdc_attribute_dict[(column_left, column_right)] = rdc
        rdc_attribute_dict[(column_right, column_left)] = rdc

    return max(rdc_vals)