def test_optimization(self):
    """EM on a perturbed two-component Gaussian-mixture SPN recovers weights and means."""
    np.random.seed(17)
    low_cluster = np.random.normal(10, 1, size=4000).tolist()
    high_cluster = np.random.normal(30, 1, size=4000).tolist()
    data = np.array(low_cluster + high_cluster).reshape((-1, 4)).astype(np.float32)

    n_cols = data.shape[1]
    ds_context = Context(meta_types=[MetaType.REAL] * n_cols, parametric_types=[Gaussian] * n_cols)
    ds_context.add_domains(data)

    spn = learn_parametric(data, ds_context)

    # Deliberately perturb the learned model so EM has something to correct.
    spn.weights = [0.8, 0.2]
    spn.children[0].children[0].mean = 3.0

    py_ll = np.sum(log_likelihood(spn, data))
    print(spn.weights, spn.children[0].children[0].mean)

    EM_optimization(spn, data, iterations=1000)

    print(spn.weights, spn.children[0].children[0].mean)
    py_ll_opt = np.sum(log_likelihood(spn, data))

    # EM must never decrease the training log-likelihood.
    self.assertLessEqual(py_ll, py_ll_opt)
    self.assertAlmostEqual(spn.weights[0], 0.5, 4)
    self.assertAlmostEqual(spn.weights[1], 0.5, 4)

    c1_mean = spn.children[0].children[0].mean
    c2_mean = spn.children[1].children[0].mean
    # Component order is arbitrary, so compare sorted means.
    self.assertEqual(round(min(c1_mean, c2_mean)), 10)
    self.assertEqual(round(max(c1_mean, c2_mean)), 30)
def test_conditional_probability(self):
    """Conditional probabilities on a hand-built SPN match hand-derived values.

    Uses the same SPN as the entropy test.
    """
    # The training data is only used to derive domains for the Context.
    train_data = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [2.0, 0.0, 1.0]])

    ds_context = Context(meta_types=[MetaType.DISCRETE] * 3)
    ds_context.add_domains(train_data)
    # Fixed: the Context attribute is `parametric_types` (plural); assigning
    # `parametric_type` just created an unused attribute and left the context
    # without parametric-type information.
    ds_context.parametric_types = [Categorical] * 3

    spn = 0.64 * (
        (
            Categorical(p=[0.25, 0.75, 0.0], scope=0)
            * (
                0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
            )
        )
    ) + 0.36 * (
        (
            Categorical(p=[0.0, 0.0, 1.0], scope=0)
            * (
                0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
            )
        )
    )

    # tests
    x_instance = np.array([1, 1, 0], dtype=float).reshape(1, -1)
    self.assertAlmostEqual(conditional_probability(spn, 2, x_instance)[0][0], 0.9)
    self.assertAlmostEqual(conditional_probability(spn, 0, x_instance)[0][0], 0.48)
    x_instance = np.array([2, 1, 0], dtype=float).reshape(1, -1)
    self.assertAlmostEqual(conditional_probability(spn, 0, x_instance)[0][0], 0.36)
def learn_parametric_spn(data, parametric_types):
    """Learn a parametric SPN from `data`.

    :param data: 2D numpy array (rows = instances, columns = variables)
    :param parametric_types: list of parametric leaf types, one per column
    :return: root node of the learned SPN
    """
    from spn.algorithms.LearningWrappers import learn_parametric

    # Fixed: add_domains was called twice (once chained on the constructor,
    # once again on the result); a single call is sufficient.
    ds_context = Context(parametric_types=parametric_types)
    ds_context.add_domains(data)
    return learn_parametric(data, ds_context, min_instances_slice=100, threshold=0.01)
def test_Histogram_discrete_inference(self):
    """Histogram-leaf likelihoods on discrete data, without and with smoothing."""
    data = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
    ds_context = Context([MetaType.DISCRETE])
    ds_context.add_domains(data)

    # Without smoothing: probabilities are the raw empirical frequencies.
    hist = create_histogram_leaf(data, ds_context, [0], alpha=False)
    prob = np.exp(log_likelihood(hist, data))
    for idx, expected in enumerate([2 / 6, 2 / 6, 1 / 6, 3 / 6, 3 / 6, 3 / 6]):
        self.assertAlmostEqual(float(prob[idx]), expected)

    data = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
    ds_context = Context([MetaType.DISCRETE])
    ds_context.add_domains(data)

    # With smoothing (alpha=True): one pseudo-count per domain value.
    hist = create_histogram_leaf(data, ds_context, [0], alpha=True)
    prob = np.exp(log_likelihood(hist, data))
    for idx, expected in enumerate([3 / 9, 3 / 9, 2 / 9, 4 / 9, 4 / 9, 4 / 9]):
        self.assertAlmostEqual(float(prob[idx]), expected)
def test_histogram_to_str_and_back(self):
    """Serializing a histogram leaf to text and parsing it back is lossless."""
    samples = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
    ctx = Context([MetaType.DISCRETE])
    ctx.add_domains(samples)
    leaf = create_histogram_leaf(samples, ctx, [0], alpha=False)
    self.check_obj_and_reconstruction(leaf)
def test_histogram_leaf(self):
    """MPE of a histogram leaf is the mode of its training data."""
    samples = np.array([1, 1, 2, 3, 3, 3]).reshape(-1, 1)
    ctx = Context([MetaType.DISCRETE])
    ctx.add_domains(samples)
    leaf = create_histogram_leaf(samples, ctx, [0], alpha=False)
    result = mpe(leaf, np.array([[np.nan]]))
    self.assertTrue(np.array_equal(result, np.array([[3]])), "mpe should be 3")
def test_valid_histogram(self):
    """KDE-sourced histogram over spiky real data still produces multiple bins."""
    np.random.seed(17)
    raw = [1] + [5] * 20 + [7] + [10] * 50 + [20] + [30] * 10
    samples = np.array(raw).reshape((-1, 1))
    ctx = Context([MetaType.REAL])
    ctx.add_domains(samples)
    leaf = create_histogram_leaf(samples, ctx, [0], alpha=False, hist_source="kde")
    self.assertGreater(len(leaf.bin_repr_points), 1)
def test_PWL_no_variance(self):
    """Building a piecewise-linear leaf from zero-variance data must raise."""
    # Two identical observations -> zero variance in the column.
    data = np.array([1.0, 1.0]).reshape(-1, 1)
    ds_context = Context([MetaType.REAL])
    ds_context.add_domains(data)
    # NOTE(review): another method with this exact name is defined later in
    # the file and shadows this one, so this test never runs; the two versions
    # also expect contradictory behavior — confirm which one is intended.
    with self.assertRaises(AssertionError):
        create_piecewise_leaf(data, ds_context, scope=[0], hist_source="kde")
def test_PWL_no_variance(self):
    """Piecewise-linear leaf built from zero-variance data evaluates to 2/6."""
    # NOTE(review): duplicates the name of an earlier test method (which it
    # shadows); the earlier version expects create_piecewise_leaf to raise on
    # this input, while this one expects it to succeed — confirm which is
    # intended before un-shadowing.
    data = np.array([1.0, 1.0]).reshape(-1, 1)
    ds_context = Context([MetaType.REAL])
    ds_context.add_domains(data)
    leaf = create_piecewise_leaf(data, ds_context, scope=[0], hist_source="kde")
    prob = np.exp(log_likelihood(leaf, data))
    # presumably 2/6 is the KDE-derived bin mass at x=1.0 — TODO confirm
    self.assertAlmostEqual(float(prob[0]), 2 / 6)
    self.assertAlmostEqual(float(prob[1]), 2 / 6)
def test_we_score(self):
    """Weight-of-evidence values on a hand-built SPN match the expected result.

    How the training data and the SPN were originally generated::

        # number of RVs
        M = 3
        # table of probabilities
        p1 = 0.6
        p2 = 0.3
        p31 = 0.1
        p32 = 0.9
        # generate x1 and x2
        x1 = np.random.binomial(1, p1, size=N) + np.random.binomial(1, p1, size=N)
        x2 = np.random.binomial(1, p2, size=N)
        x3 = np.zeros(N)
        # generate x3
        for i in range(N):
            if x2[i] == 1:
                x3[i] = np.random.binomial(1, p31, size=1)
            else:
                x3[i] = np.random.binomial(1, p32, size=1)
        # form a matrix, rows are instances and columns are RVs
        train_data = np.concatenate((x1, x2, x3)).reshape((M, N)).transpose()
    """
    # The training data is only used to derive domains for the Context.
    train_data = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [2.0, 0.0, 1.0]])

    ds_context = Context(meta_types=[MetaType.DISCRETE] * 3)
    ds_context.add_domains(train_data)
    # Fixed: the Context attribute is `parametric_types` (plural); assigning
    # `parametric_type` just created an unused attribute.
    ds_context.parametric_types = [Categorical] * 3

    spn = 0.64 * (
        (
            Categorical(p=[0.25, 0.75, 0.0], scope=0)
            * (
                0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
            )
        )
    ) + 0.36 * (
        (
            Categorical(p=[0.0, 0.0, 1.0], scope=0)
            * (
                0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
            )
        )
    )

    # test
    n = 40000
    x_instance = np.array([1, 1, 0], dtype=float).reshape(1, -1)
    y_index = 0
    we = weight_of_evidence(spn, 0, x_instance, n, ds_context.domains[y_index].shape[0])
    we_true = np.array([[np.nan, 0, 0]])
    # Compare only the non-NaN entries (the query variable itself is NaN).
    we = we[~np.isnan(we)]
    we_true = we_true[~np.isnan(we_true)]
    self.assertTrue((we == we_true).all())
def test_PWL(self):
    """Smoke test: a piecewise-linear leaf can be fit to a bimodal Gaussian sample."""
    samples = np.r_[np.random.normal(10, 5, (300, 1)), np.random.normal(20, 10, (700, 1))]
    ctx = Context([MetaType.REAL])
    ctx.add_domains(samples)
    leaf = create_piecewise_leaf(samples, ctx, scope=[0], prior_weight=None, hist_source="kde")
    prob = np.exp(log_likelihood(leaf, samples))
def test_mixture_gaussians(self):
    """KDE histogram approximates a 50/50 mixture of two unit-variance Gaussians."""
    np.random.seed(17)
    samples = np.random.normal(10, 1, size=200).tolist() + np.random.normal(30, 1, size=200).tolist()
    samples = np.array(samples).reshape((-1, 1))
    ctx = Context([MetaType.REAL])
    ctx.add_domains(samples)
    hist = create_histogram_leaf(samples, ctx, [0], alpha=False, hist_source="kde")

    # Evaluation grid: a uniform sweep plus the training points themselves.
    grid = np.sort(np.linspace(0, 60, 1000).tolist() + samples[:, 0].tolist())
    from scipy.stats import norm
    truth = 0.5 * norm.pdf(grid, 10, 1) + 0.5 * norm.pdf(grid, 30, 1)
    estimate = likelihood(hist, grid.reshape((-1, 1)))

    total_error = np.sum(np.abs(estimate[:, 0] - truth))
    self.assertLessEqual(total_error, 7)
def get_ds_context(data, scope, params):
    """Build an SPFlow Context for a slice of an SPMN.

    :param data: numpy array of data for the Context object
    :param scope: scope (column indices) of data
    :param params: params of SPMN (provides meta_types and feature_names)
    :return: Context object of SPFlow with domains added
    """
    # Fixed: removed the unused local `num_of_variables`.
    scope_var = np.array(params.feature_names)[scope].tolist()
    ds_context = Context(meta_types=[params.meta_types[i] for i in scope], scope=scope, feature_names=scope_var)
    ds_context.add_domains(data)
    return ds_context
def test_Histogram_expectations(self):
    """Expectation of a histogram leaf matches the empirical mean of its data."""
    cases = [
        (np.random.randn(20000).reshape(-1, 1), MetaType.REAL),
        (np.random.randint(0, high=100, size=20000).reshape(-1, 1), MetaType.DISCRETE),
    ]
    for samples, meta_type in cases:
        ctx = Context(meta_types=[meta_type])
        ctx.add_domains(samples)
        leaf = create_histogram_leaf(samples, ctx, scope=[0])
        expectation = Expectation(leaf, set([0]))
        self.assertAlmostEqual(np.mean(samples[:, 0]), expectation[0, 0], 3)
def test_singular_domain(self):
    """Smoke test: learn_mspn handles a mix of DISCRETE and BINARY columns."""
    import numpy as np

    np.random.seed(123)
    discrete_col = np.random.randint(3, size=1000).reshape(-1, 1)
    binary_col = np.random.randint(2, size=1000).reshape(-1, 1)
    train_data = np.c_[discrete_col, binary_col]

    from spn.structure.Base import Context
    from spn.structure.StatisticalTypes import MetaType

    ctx = Context(meta_types=[MetaType.DISCRETE, MetaType.BINARY])
    ctx.add_domains(train_data)

    from spn.algorithms.LearningWrappers import learn_mspn

    mspn = learn_mspn(train_data, ctx, min_instances_slice=20)
def test_conditional_mutual_info(self):
    """Conditional mutual information on a hand-built SPN matches closed-form values.

    Uses the same SPN as the entropy test.
    """
    # The training data is only used to derive domains for the Context.
    train_data = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [2.0, 0.0, 1.0]])

    ds_context = Context(meta_types=[MetaType.DISCRETE] * 3)
    ds_context.add_domains(train_data)
    # Fixed: the Context attribute is `parametric_types` (plural); assigning
    # `parametric_type` just created an unused attribute.
    ds_context.parametric_types = [Categorical] * 3

    spn = 0.64 * (
        (
            Categorical(p=[0.25, 0.75, 0.0], scope=0)
            * (
                0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
            )
        )
    ) + 0.36 * (
        (
            Categorical(p=[0.0, 0.0, 1.0], scope=0)
            * (
                0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
            )
        )
    )

    # Closed-form entropies of the underlying joint distribution.
    p2 = 0.3
    p3 = 0.66
    h_x1 = -(0.16 * np.log(0.16) + 0.36 * np.log(0.36) + 0.48 * np.log(0.48))
    h_x2x1 = -(0.7 * np.log(0.7) + 0.3 * np.log(0.3)) + h_x1
    h_x3x1 = -(0.66 * np.log(0.66) + 0.34 * np.log(0.34)) + h_x1
    h_x2x3 = -(p2 * np.log(p2) + (1 - p2) * np.log(1 - p2) + 0.9 * np.log(0.9) + 0.1 * np.log(0.1))
    h_x2x3x1 = h_x1 + h_x2x3

    # CMI via the chain rule: I(X2;X3|X1) = H(X2,X1)+H(X3,X1)-H(X2,X3,X1)-H(X1).
    cmi_x2x3_x1 = h_x2x1 + h_x3x1 - h_x2x3x1 - h_x1
    self.assertAlmostEqual(cmi_x2x3_x1, conditional_mutual_information(spn, ds_context, {1}, {2}, {0}))

    h_x1x3 = h_x3x1
    h_x1x2x3 = h_x2x3x1
    h_x3 = -p3 * np.log(p3) - (1 - p3) * np.log(1 - p3)
    cmi_x1x2_x3 = h_x1x3 + h_x2x3 - h_x1x2x3 - h_x3
    self.assertAlmostEqual(cmi_x1x2_x3, conditional_mutual_information(spn, ds_context, {1}, {0}, {2}))

    h_x1x2x3 = h_x2x3x1
    h_x2 = -p2 * np.log(p2) - (1 - p2) * np.log(1 - p2)
    cmi_x1x3_x2 = h_x2x1 + h_x2x3 - h_x1x2x3 - h_x2
    self.assertAlmostEqual(cmi_x1x3_x2, conditional_mutual_information(spn, ds_context, {2}, {0}, {1}))
def test_mutual_info(self):
    """Mutual information on a hand-built SPN matches closed-form values.

    Uses the same SPN as the entropy test.
    """
    # The training data is only used to derive domains for the Context.
    train_data = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [2.0, 0.0, 1.0]])

    ds_context = Context(meta_types=[MetaType.DISCRETE] * 3)
    ds_context.add_domains(train_data)
    # Fixed: the Context attribute is `parametric_types` (plural); assigning
    # `parametric_type` just created an unused attribute.
    ds_context.parametric_types = [Categorical] * 3

    spn = 0.64 * (
        (
            Categorical(p=[0.25, 0.75, 0.0], scope=0)
            * (
                0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
            )
        )
    ) + 0.36 * (
        (
            Categorical(p=[0.0, 0.0, 1.0], scope=0)
            * (
                0.34 * ((Categorical(p=[7 / 34, 27 / 34], scope=1) * Categorical(p=[1.0, 0.0], scope=2)))
                + 0.66 * ((Categorical(p=[21 / 22, 1 / 22], scope=1) * Categorical(p=[0.0, 1.0], scope=2)))
            )
        )
    )

    # Closed-form mutual information I(X2;X3) = H(X2)+H(X3)-H(X2,X3).
    p2 = 0.3
    p3 = 0.66
    h_x2 = -p2 * np.log(p2) - (1 - p2) * np.log(1 - p2)
    h_x3 = -p3 * np.log(p3) - (1 - p3) * np.log(1 - p3)
    h_x2x3 = -(p2 * np.log(p2) + (1 - p2) * np.log(1 - p2) + 0.9 * np.log(0.9) + 0.1 * np.log(0.1))
    mi_x2x3 = h_x2 + h_x3 - h_x2x3
    self.assertAlmostEqual(mi_x2x3, mutual_information(spn, ds_context, {1}, {2}))

    # X1 and X2 are independent in this model.
    mi_x1x2 = 0
    self.assertAlmostEqual(mi_x1x2, mutual_information(spn, ds_context, {1}, {0}))

    # test symmetry
    self.assertAlmostEqual(
        mutual_information(spn, ds_context, {2}, {1}), mutual_information(spn, ds_context, {1}, {2})
    )
    self.assertAlmostEqual(
        mutual_information(spn, ds_context, {0, 2}, {1}), mutual_information(spn, ds_context, {1}, {0, 2})
    )

    # rest 0
    self.assertAlmostEqual(0, mutual_information(spn, ds_context, {2, 1}, {0}))
def compute_kmeans_rdc(data):
    """Cluster the rows of `data` into two groups with KMeans in RDC feature space."""
    n_cols = data.shape[1]
    ctx = Context(meta_types=[MetaType.REAL] * n_cols)
    ctx.add_domains(data)

    scope = list(range(n_cols))
    meta_types = ctx.get_meta_types_by_scope(scope)
    domains = ctx.get_domains_by_scope(scope)

    # Non-linear random projections (RDC transform) before clustering.
    rdc_data = rdc_transformer(
        data,
        meta_types,
        domains,
        k=10,
        s=1 / 6,
        non_linearity=np.sin,
        return_matrix=True,
        rand_gen=RandomState(17),
    )
    return KMeans(n_clusters=2, random_state=RandomState(17)).fit_predict(rdc_data)
def learn_parametric_spn(data, parametric_types, rdc_threshold=0.3, min_instances_slice=0.05, clustering='kmeans'):
    """Learn a parametric SPN and measure how long construction takes.

    :param data: 2D numpy array (rows = instances, columns = variables)
    :param parametric_types: list of parametric leaf types, one per column
    :param rdc_threshold: RDC independence threshold for column splits
    :param min_instances_slice: fraction of rows below which rows are no longer split
    :param clustering: row-clustering method passed to learn_parametric
    :return: (learned SPN, construction time in seconds)
    """
    # Fixed: add_domains was called twice (once chained on the constructor,
    # once again on the result); a single call is sufficient.
    ds_context = Context(parametric_types=parametric_types)
    ds_context.add_domains(data)
    mis = int(len(data) * min_instances_slice)
    t0 = time.time()
    spn = learn_parametric(data, ds_context, threshold=rdc_threshold, min_instances_slice=mis, rows=clustering)
    const_time = time.time() - t0
    return spn, const_time
def test_eval_histogram(self):
    """TensorFlow evaluation of a learned MSPN matches the numpy log-likelihood."""
    np.random.seed(17)
    raw = np.random.normal(10, 0.01, size=2000).tolist() + np.random.normal(30, 10, size=2000).tolist()
    samples = np.array(raw).reshape((-1, 10))
    samples[samples < 0] = 0
    samples = samples.astype(int)

    ctx = Context(meta_types=[MetaType.DISCRETE] * samples.shape[1])
    ctx.add_domains(samples)
    spn = learn_mspn(samples, ctx)

    ll = log_likelihood(spn, samples)
    tf_ll = eval_tf(spn, samples)
    self.assertTrue(np.all(np.isclose(ll, tf_ll)))
def test_Piecewise_expectations_with_evidence(self):
    """Conditional expectation: evidence on the discrete label column selects
    the matching mixture component's mean for the continuous column."""
    # Component A: feature ~ N(100, 5), label = 0.
    adata = np.zeros((20000, 2))
    adata[:, 1] = 0
    adata[:, 0] = np.random.normal(loc=100.0, scale=5.00, size=adata.shape[0])
    # Component B: feature ~ N(50, 5), label = 1.
    bdata = np.zeros_like(adata)
    bdata[:, 1] = 1
    bdata[:, 0] = np.random.normal(loc=50.0, scale=5.00, size=bdata.shape[0])
    data = np.vstack((adata, bdata))

    ds_context = Context(meta_types=[MetaType.REAL, MetaType.DISCRETE])
    ds_context.parametric_types = [None, Categorical]
    ds_context.add_domains(data)

    # One product node per component: a piecewise leaf for the continuous
    # feature times a parametric (Categorical) leaf for the label.
    L = create_piecewise_leaf(
        adata[:, 0].reshape(-1, 1), ds_context, scope=[0], prior_weight=None, hist_source="numpy"
    ) * create_parametric_leaf(adata[:, 1].reshape(-1, 1), ds_context, scope=[1])
    R = create_piecewise_leaf(
        bdata[:, 0].reshape(-1, 1), ds_context, scope=[0], prior_weight=None, hist_source="numpy"
    ) * create_parametric_leaf(bdata[:, 1].reshape(-1, 1), ds_context, scope=[1])
    spn = 0.5 * L + 0.5 * R

    # Row 0 has label evidence 0 (component A); row 1 has label evidence 1
    # (component B). The continuous column is unobserved (NaN) in both rows.
    evidence = np.zeros((2, 2))
    evidence[1, 1] = 1
    evidence[:, 0] = np.nan
    expectation = Expectation(spn, set([0]), evidence)

    self.assertAlmostEqual(np.mean(adata[:, 0]), expectation[0, 0], 2)
    self.assertAlmostEqual(np.mean(bdata[:, 0]), expectation[1, 0], 2)
def test_Piecewise_expectations(self):
    """Expectation of a piecewise-linear leaf matches the sample mean."""
    # Continuous case.
    samples = np.random.normal(loc=100.0, scale=5.00, size=20000).reshape(-1, 1)
    ctx = Context(meta_types=[MetaType.REAL])
    ctx.add_domains(samples)
    leaf = create_piecewise_leaf(samples, ctx, scope=[0], prior_weight=None)
    expectation = Expectation(leaf, set([0]))
    self.assertAlmostEqual(np.mean(samples[:, 0]), expectation[0, 0], 2)

    # Discrete case.
    samples = np.random.randint(0, high=100, size=2000).reshape(-1, 1)
    ctx = Context(meta_types=[MetaType.DISCRETE])
    ctx.add_domains(samples)
    leaf = create_piecewise_leaf(samples, ctx, scope=[0], prior_weight=None)
    expectation = Expectation(leaf, set([0]))
    self.assertAlmostEqual(np.mean(samples[:, 0]), expectation[0, 0], 3)
import numpy as np
from spn.structure.Base import Context
from spn.structure.StatisticalTypes import MetaType
from spn.algorithms.LearningWrappers import learn_mspn
from spn.io.Graphics import plot_spn

np.random.seed(123)

# Synthetic dataset: two discrete columns, one bimodal real column, and a
# fourth real column that depends linearly on the first three.
a = np.random.randint(2, size=1000).reshape(-1, 1)
b = np.random.randint(3, size=1000).reshape(-1, 1)
c = np.r_[np.random.normal(10, 5, (300, 1)), np.random.normal(20, 10, (700, 1))]
d = 5 * a + 3 * b + c
train_data = np.c_[a, b, c, d]

ds_context = Context(meta_types=[
    MetaType.DISCRETE, MetaType.DISCRETE, MetaType.REAL, MetaType.REAL
])
ds_context.add_domains(train_data)

mspn = learn_mspn(train_data, ds_context, min_instances_slice=20)
data[:, horizontal_middle:, vertical_middle:].reshape(len(data), -1),
)
# NOTE(review): this chunk begins mid-statement; the call closed above is
# opened before this excerpt.
#
# # spn
# ds_context = Context(meta_types=[MetaType.REAL] * blocked_images[0].shape[1])
# ds_context.add_domains(blocked_images[0])
# ds_context.parametric_type = [Poisson] * blocked_images[0].shape[1]
#
# # print("data ready", data.shape)
# #the following two options should be working now.
# # spn = learn_structure(upperimage, ds_context, get_split_rows_random_partition(np.random.RandomState(17)), get_split_cols_random_partition(np.random.RandomState(17)), create_parametric_leaf)
# spn = learn_parametric(blocked_images[0], ds_context, min_instances_slice=0.1*len(data), ohe=False)

# spn over the label variables (presumably 10 binary digit labels — TODO confirm)
ds_context = Context(meta_types=[MetaType.DISCRETE] * 10)
ds_context.add_domains(data_labels)
ds_context.parametric_types = [Bernoulli] * blocked_images[0].shape[1]
spn = learn_parametric(data_labels, ds_context, min_instances_slice=0.3 * len(data_labels))

# first cspn: image block 0 conditioned on the labels
dataIn = data_labels
dataOut = blocked_images[0]
ds_context = Context(meta_types=[MetaType.DISCRETE] * dataOut.shape[1])
ds_context.add_domains(dataOut)
ds_context.parametric_types = [Conditional_Poisson] * dataOut.shape[1]
scope = list(range(dataOut.shape[1]))
print(np.shape(dataIn), np.shape(dataOut))
print(dataIn[0], dataOut[0])
add_parametric_inference_support()

memory = Memory(cachedir="cache", verbose=0, compress=9)

# Exhaustive 10x10x10 grid; the fourth column is a class label derived from
# the sum of the coordinates.
data = []
for x in range(10):
    for y in range(10):
        for z in range(10):
            data.append([x, y, z, int(((x + y + z) / 5))])
# Fixed: `np.float` was deprecated in NumPy 1.20 and removed in 1.24;
# `np.float64` is the same dtype that alias resolved to.
data = np.array(data).astype(np.float64)

types = [
    MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE
]
ds_context = Context(meta_types=types)
ds_context.parametric_types = [Gaussian, Gaussian, Gaussian, Categorical]
ds_context.add_domains(data)

num_classes = len(np.unique(data[:, 3]))

#spn = learn_mspn(data, ds_context, min_instances_slice=10, leaves=create_leaf, threshold=0.3)
# Class-conditional mixture: one sub-SPN per label, weighted by label frequency.
spn = Sum()
for label, count in zip(*np.unique(data[:, 3], return_counts=True)):
    branch = learn_mspn(data[data[:, 3] == label, :], ds_context, min_instances_slice=10, leaves=create_leaf, threshold=0.1)
    spn.children.append(branch)
    spn.weights.append(count / data.shape[0])
# NOTE(review): this chunk starts mid-script; `i`, `images`, `num_blocks`,
# `datasets`, `memory` and the learning functions are defined before this
# excerpt.
block_ids = np.arange(i, -1, -1)
datasets.append((get_blocks(images, num_blocks=num_blocks, blocks=block_ids.tolist()), 1))

num_mpes = 1
num_samples = 10

cspns = []
mpe_query_blocks = None
sample_query_blocks = None
for i, ((tr_block, block_idx), conditional_blocks) in enumerate(datasets):
    print("learning", i)
    # Number of leading columns that act as conditioning features for this block.
    conditional_features_count = (tr_block.shape[1] // len(block_idx)) * conditional_blocks
    if i == 0:
        # First block: unconditional SPN over all of its columns.
        ds_context = Context(meta_types=[MetaType.REAL] * tr_block.shape[1])
        ds_context.add_domains(tr_block)
        ds_context.parametric_types = [Gaussian] * tr_block.shape[1]
        cspn = learn_parametric(tr_block, ds_context, min_instances_slice=20, ohe=False, memory=memory)
    else:
        # Later blocks: conditional SPN whose scope is the first
        # `conditional_features_count` columns.
        cspn = learn_conditional(
            tr_block,
            Context(
                meta_types=[MetaType.REAL] * tr_block.shape[1],
                parametric_types=[Conditional_Gaussian] * tr_block.shape[1],
            ).add_domains(tr_block),
            scope=list(range(conditional_features_count)),
            min_instances_slice=30,
            memory=memory,
        )
    cspns.append(cspn)
def train_spn(window_size=3, min_instances_slice=10000, features=None, number_of_classes=3):
    """Train (or load) a class-conditional SPN over pixel windows and report test accuracy.

    :param window_size: side length of the square pixel window
    :param min_instances_slice: learning hyperparameter for learn_parametric
    :param features: list of feature band indices; defaults to [20, 120]
    :param number_of_classes: number of class labels
    :return: (spn, accuracy on the held-out 20% split)
    """
    if features is None:
        features = [20, 120]

    add_parametric_inference_support()
    add_parametric_text_support()

    data = get_data_in_window(window_size=window_size, features=features, three_classes=number_of_classes == 3)

    # Column layout: window_size^2 * len(features) feature columns, followed by
    # window_size^2 class-label columns (one per window pixel).
    num_feature_cols = window_size * window_size * len(features)
    # Index (relative to the row end) of the window's CENTER class pixel — the
    # same column the stratified split below stratifies on.
    center_class_col = -(window_size * window_size) + int(window_size * window_size / 2)

    sss = sk.model_selection.StratifiedShuffleSplit(test_size=0.2, train_size=0.8, random_state=42)
    for train_index, test_index in sss.split(
        data[:, 0:num_feature_cols],
        data[:, num_feature_cols + (int(window_size * window_size / 2))],
    ):
        X_train, X_test = data[train_index], data[test_index]

    context_list = list()
    parametric_list = list()
    number_of_features = len(features)
    for _ in range(number_of_features * window_size * window_size):
        context_list.append(MetaType.REAL)
        parametric_list.append(Gaussian)
    for _ in range(window_size * window_size):
        context_list.append(MetaType.DISCRETE)
        parametric_list.append(Categorical)

    ds_context = Context(meta_types=context_list)
    ds_context.add_domains(data)
    ds_context.parametric_types = parametric_list

    spn = load_spn(window_size, features, min_instances_slice, number_of_classes)
    if spn is None:
        # Mixture with one branch per (class pixel, label) pair, weighted by
        # the number of training rows in that branch.
        spn = Sum()
        for class_pixel in tqdm(range(-window_size * window_size, 0)):
            for label, count in zip(*np.unique(data[:, class_pixel], return_counts=True)):
                train_data = X_train[X_train[:, class_pixel] == label, :]
                branch = learn_parametric(train_data, ds_context, min_instances_slice=min_instances_slice)
                spn.children.append(branch)
                spn.weights.append(train_data.shape[0])
                spn.scope.extend(branch.scope)
        spn.weights = (np.array(spn.weights) / sum(spn.weights)).tolist()
        assign_ids(spn)
        save_spn(spn, window_size, features, min_instances_slice, number_of_classes)

    # Classify each test row by plugging every candidate label into the center
    # class pixel and picking the most likely completion.
    res = np.ndarray((X_test.shape[0], number_of_classes))
    for i in tqdm(range(number_of_classes)):
        tmp = X_test.copy()
        # Fixed: perturb the CENTER class pixel. The previous index
        # -int((window_size**2) / 2) (= -4 for window_size 3) pointed one
        # column right of the center pixel (-5) that the accuracy check used.
        tmp[:, center_class_col] = i
        res[:, i] = log_likelihood(spn, tmp)[:, 0]
    predicted_classes = np.argmax(res, axis=1).reshape((X_test.shape[0], 1))

    correct_predicted = 0
    # Fixed: compare against the computed center column instead of the
    # hard-coded -5, which was only valid for window_size == 3.
    for x, y in zip(X_test[:, center_class_col], predicted_classes):
        if x == y[0]:
            correct_predicted += 1
    accuracy = correct_predicted / X_test.shape[0]
    return spn, accuracy
add_conditional_inference_support()

np.random.seed(42)

# Synthetic data: 4 output columns followed by 2 input (conditioning) columns.
dataIn = np.random.randint(low=0, high=3, size=600).reshape(-1, 2)
dataOut = np.random.randint(low=0, high=3, size=1200).reshape(-1, 4)
data = np.concatenate((dataOut, dataIn), axis=1)
assert data.shape[1] == dataIn.shape[1] + dataOut.shape[1], 'invalid column size'
assert data.shape[0] == dataIn.shape[0] == dataOut.shape[0], 'invalid row size'

# The context (meta types and domains) covers only the 4 output columns.
ds_context = Context(meta_types=[
    MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE
])
ds_context.add_domains(dataOut)
ds_context.parametric_types = [
    Conditional_Poisson, Conditional_Poisson, Conditional_Poisson, Conditional_Poisson
]
scope = list(range(dataOut.shape[1]))

spn = Sum()
for label, count in zip(*np.unique(data[:, 2], return_counts=True)):
    # NOTE(review): every branch is trained on the full `data`, not on the
    # rows where column 2 equals `label` — confirm this is intended.
    branch = learn_conditional(data, ds_context, scope, min_instances_slice=10000)
    spn.children.append(branch)
dataOut = data[:, :horizontal_middle, vertical_middle:].reshape(len(data), -1) print(data[0]) print("_______") print(dataIn[0]) print("_______") print(dataOut[0]) print("_______") zeros[:, :horizontal_middle, :vertical_middle] = dataIn.reshape(len(data), 4, 4) #data[:, :horizontal_middle, :vertical_middle] zeros[:, :horizontal_middle, vertical_middle:] = dataOut.reshape(len(data), 4, 4) #data[:, :horizontal_middle, vertical_middle:] print(zeros[0], np.shape(zeros)) #print(np.concatenate((dataIn, dataOut), axis=1).reshape(len(dataIn), 4, 8)[0]) """ # spn ds_context = Context(meta_types=[MetaType.REAL] * blocked_images[0].shape[1]) ds_context.add_domains(blocked_images[0]) ds_context.parametric_types = [Poisson] * blocked_images[0].shape[1] print("data ready", data.shape) # the following two options should be working now. spn = learn_parametric(blocked_images[0], ds_context, min_instances_slice=0.1 * len(data), ohe=False) # cspn dataIn = blocked_images[ 0] # data[:, :horizontal_middle, :vertical_middle].reshape(len(data), -1) dataOut = blocked_images[ 1] # data[:, :horizontal_middle, vertical_middle:].reshape(len(data), -1)
def max_rdc(schema, left_table, right_table, df_samples, meta_types, rdc_attribute_dict,
            max_sampling_threshold_rows=10000, k=10, s=1 / 6, non_linearity=np.sin, n_jobs=-2, debug=True):
    """Maximum pairwise RDC dependency between columns of two tables.

    Filters `df_samples` down to informative columns of `left_table` and
    `right_table`, computes RDC for every (left column, right column) pair in
    parallel, records all pairwise values in `rdc_attribute_dict` (both key
    orders), and returns the maximum value.

    :param schema: schema graph providing table and relationship metadata
    :param left_table: name of the left table
    :param right_table: name of the right table
    :param df_samples: DataFrame of joined samples; mutated in place (columns dropped)
    :param meta_types: meta type per column of `df_samples`, in column order
    :param rdc_attribute_dict: output dict, filled with pairwise RDC values
    :param max_sampling_threshold_rows: row cap; larger inputs are subsampled
    :param k, s, non_linearity: RDC transformer hyperparameters
    :param n_jobs: joblib parallelism
    :param debug: log each pairwise RDC value
    :return: maximum pairwise RDC value
    """
    # Keep only informative columns belonging to either table: drop columns of
    # other tables, NULL-indicator attributes, multiplier attributes and
    # foreign-key fields.
    irrelevant_cols = []
    relevant_meta_types = []
    for i, column in enumerate(df_samples.columns):
        not_of_left_or_right = not (column.startswith(left_table + '.') or column.startswith(right_table + '.'))
        is_nn_attribute = (column == left_table + '.' + schema.table_dictionary[left_table].table_nn_attribute) or \
                          (column == right_table + '.' + schema.table_dictionary[right_table].table_nn_attribute)
        is_multiplier = False
        is_fk_field = False
        for relationship_obj in schema.relationships:
            # [relationship_obj_list[0], relationship_obj_list[-1]]
            if relationship_obj.end + '.' + relationship_obj.end_attr == column or \
                    relationship_obj.start + '.' + relationship_obj.start_attr == column:
                is_fk_field = True
                break
            if relationship_obj.end + '.' + relationship_obj.multiplier_attribute_name_nn == column or \
                    relationship_obj.end + '.' + relationship_obj.multiplier_attribute_name == column:
                is_multiplier = True
                break
        is_uninformative = False
        if not_of_left_or_right or is_nn_attribute or is_multiplier or is_fk_field or is_uninformative:
            irrelevant_cols.append(column)
        else:
            relevant_meta_types.append(meta_types[i])

    df_samples.drop(columns=irrelevant_cols, inplace=True)

    left_column_names = [(i, column) for i, column in enumerate(df_samples.columns)
                         if column.startswith(left_table + '.')]
    right_column_names = [(i, column) for i, column in enumerate(df_samples.columns)
                          if column.startswith(right_table + '.')]
    left_columns = [i for i, column in left_column_names]
    right_columns = [i for i, column in right_column_names]

    data = df_samples.values
    # Subsample rows for tractability.
    if data.shape[0] > max_sampling_threshold_rows:
        data = data[np.random.randint(data.shape[0], size=max_sampling_threshold_rows), :]

    n_features = data.shape[1]
    assert n_features == len(relevant_meta_types)

    ds_context = Context(meta_types=relevant_meta_types)
    ds_context.add_domains(data)

    rdc_features = rdc_transformer(data, relevant_meta_types, ds_context.domains, k=k, s=s,
                                   non_linearity=non_linearity, return_matrix=False)

    pairwise_comparisons = [(i, j) for i in left_columns for j in right_columns]

    from joblib import Parallel, delayed
    rdc_vals = Parallel(n_jobs=n_jobs, max_nbytes=1024, backend="threading")(
        delayed(rdc_cca)((i, j, rdc_features)) for i, j in pairwise_comparisons
    )
    # Fixed: sanitize the list itself. Previously `rdc = 0` only rebound the
    # loop variable used for debug logging, so NaNs still flowed into
    # rdc_attribute_dict and could make max(rdc_vals) return NaN.
    rdc_vals = [0 if np.isnan(rdc) else rdc for rdc in rdc_vals]

    if debug:
        for (i, j), rdc in zip(pairwise_comparisons, rdc_vals):
            logger.debug(
                f"{df_samples.columns[i]}, {df_samples.columns[j]}: {rdc}")

    named_pairs = [(column_left, column_right)
                   for i, column_left in left_column_names
                   for j, column_right in right_column_names]
    for (column_left, column_right), rdc in zip(named_pairs, rdc_vals):
        # Record both key orders so lookups are symmetric.
        rdc_attribute_dict[(column_left, column_right)] = rdc
        rdc_attribute_dict[(column_right, column_left)] = rdc

    return max(rdc_vals)