def test_leaf_categorical(self):
    """Conditional Categorical leaf: P(y | x) should favour the class whose
    training cluster generated x, and the per-class likelihoods must sum
    to one."""
    np.random.seed(17)
    # Three well-separated Gaussian clusters in x, labelled y = 2, 1, 0.
    x = np.concatenate(
        (
            np.random.multivariate_normal([20, 20], np.eye(2), 500),
            np.random.multivariate_normal([10, 10], np.eye(2), 500),
            np.random.multivariate_normal([1, 1], np.eye(2), 500),
        ),
        axis=0,
    )
    y = np.array([2] * 500 + [1] * 500 + [0] * 500).reshape(-1, 1)
    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Categorical])
    ds_context.feature_size = 2

    leaf = create_conditional_leaf(data, ds_context, [0])

    # Evaluate P(y = k | x) for every instance and each class k.
    l0 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 0, x))
    l1 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 1, x))
    l2 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 2, x))

    # The class probabilities must form a valid distribution.
    np.testing.assert_array_almost_equal(l0 + l1 + l2, 1.0)

    # y = 0 was paired with the [1, 1] cluster (rows 1000:1500).
    self.assertTrue(np.all(l0[1000:1500] > 0.85))
    self.assertTrue(np.all(l0[0:1000] < 0.15))

    # y = 1 was paired with the [10, 10] cluster (rows 500:1000).
    self.assertTrue(np.all(l1[500:1000] > 0.85))
    self.assertTrue(np.all(l1[0:500] < 0.15))
    self.assertTrue(np.all(l1[1000:1500] < 0.15))

    # y = 2 was paired with the [20, 20] cluster (rows 0:500).
    self.assertTrue(np.all(l2[0:500] > 0.85))
    # BUGFIX: the upper bound was 15000 — out of range and silently
    # clipped by numpy slicing; the intended slice is the remaining
    # rows, 500:1500 (same elements, correct intent).
    self.assertTrue(np.all(l2[500:1500] < 0.15))
def test_leaf_bernoulli_bootstrap(self):
    """A conditional Bernoulli leaf learned from two separable clusters
    must give the observed label probability >= 0.5 on every instance."""
    np.random.seed(17)
    features = np.concatenate(
        (
            np.random.multivariate_normal([10, 10], np.eye(2), 100),
            np.random.multivariate_normal([1, 1], np.eye(2), 100),
        ),
        axis=0,
    )
    labels = np.array([1] * 100 + [0] * 100).reshape(-1, 1)
    train = concatenate_yx(labels, features)

    ctx = Context(parametric_types=[Bernoulli])
    ctx.feature_size = 2
    leaf = create_conditional_leaf(train, ctx, [0])

    # Likelihood of the observed labels vs. the flipped labels.
    l_pos = likelihood(leaf, train)
    flipped = np.concatenate([1 - labels, features], axis=1)
    l_neg = likelihood(leaf, flipped)

    # Bernoulli: P(y) + P(1 - y) must be exactly one.
    np.testing.assert_array_almost_equal(l_pos + l_neg, 1.0)
    self.assertTrue(np.all(l_pos >= 0.5))
    self.assertTrue(np.all(l_neg < 0.5))
def test_leaf_gaussian(self):
    """Conditional Gaussian leaf: the predicted density of y must depend
    on x, favouring the y-mode that co-occurred with x in training."""
    np.random.seed(17)
    features = np.concatenate(
        (
            np.random.multivariate_normal([10, 10], np.eye(2), 5000),
            np.random.multivariate_normal([1, 1], np.eye(2), 5000),
        ),
        axis=0,
    )
    # y ~ N(20, 2) alongside x ~ [10, 10]; y ~ N(60, 2) alongside x ~ [1, 1].
    targets = np.array(
        np.random.normal(20, 2, 5000).tolist()
        + np.random.normal(60, 2, 5000).tolist()
    ).reshape(-1, 1)
    train = concatenate_yx(targets, features)

    ds_context = Context(parametric_types=[Gaussian])
    ds_context.feature_size = 2
    leaf = create_conditional_leaf(train, ds_context, [0])

    # Densities on the training data must be well defined.
    self.assertFalse(np.any(np.isnan(likelihood(leaf, train))))

    # Each y-mode is more likely under the x-cluster it was paired with.
    self.assertGreater(get_ll(leaf, [20, 10, 10]), get_ll(leaf, [20, 1, 1]))
    self.assertGreater(get_ll(leaf, [60, 1, 1]), get_ll(leaf, [60, 10, 10]))

    # Regression values pinned from a reference run.
    self.assertAlmostEqual(get_ll(leaf, [60, 1, 1]), 0.3476232862652)
    self.assertAlmostEqual(get_ll(leaf, [20, 10, 10]), 0.3628922322773634)
def test_leaf_no_variance_gaussian(self):
    """A Gaussian leaf trained on a constant y must assign every
    instance the same density, regardless of the constant's value."""
    np.random.seed(17)
    features = np.concatenate(
        (
            np.random.multivariate_normal([10, 10], np.eye(2), 500),
            np.random.multivariate_normal([1, 1], np.eye(2), 500),
        ),
        axis=0,
    )
    targets = np.array([1] * 1000).reshape(-1, 1)
    data = concatenate_yx(targets, features)

    ctx = Context(parametric_types=[Gaussian])
    ctx.feature_size = 2

    # Constant y = 1: identical density everywhere, pinned to the
    # standard-normal peak value.
    leaf = create_conditional_leaf(data, ctx, [0])
    l = likelihood(leaf, data)
    self.assertEqual(np.var(l[:, 0]), 0)
    self.assertAlmostEqual(l[0, 0], 0.398942280401432)

    # Constant y = 2 (train and evaluate on the shifted data, mutated
    # in place): same peak value.
    data[:, 0] = 2
    leaf = create_conditional_leaf(data, ctx, [0])
    l = likelihood(leaf, data)
    self.assertEqual(np.var(l[:, 0]), 0)
    self.assertAlmostEqual(l[0, 0], 0.398942280401432)

    # Train on a copy with y = 3, but evaluate on `data` (still y = 2):
    # one unit away from the mode yields the pinned lower density.
    data3 = np.array(data)
    data3[:, 0] = 3
    leaf = create_conditional_leaf(data3, ctx, [0])
    l = likelihood(leaf, data)
    self.assertAlmostEqual(np.var(l[:, 0]), 0)
    self.assertAlmostEqual(l[0, 0], 0.241970724519143)
def test_naive_factorization(self):
    """naive_factorization must emit one CREATE_LEAF_NODE task per scope
    variable under a shared Product node, without mutating its input."""
    np.random.seed(17)
    data = np.arange(0, 1000).reshape(-1, 8)
    parent = Sum()
    parent.children.append(None)

    ctx = Context()
    ctx.feature_size = 4
    scope = [1, 3, 4, 6]

    # Operate on a copy so we can verify the input stayed untouched.
    data2 = np.array(data)
    result = naive_factorization(
        data=data2, parent=parent, pos=0, context=ctx, scope=list(scope)
    )

    self.assertListEqual(data.tolist(), data2.tolist())
    # The Product inserted at the parent's slot is shared by all tasks.
    self.assertEqual(parent.children[0], result[0][1]['parent'])

    y, x = get_YX(data, 4)
    self.assertEqual(len(result), len(scope))
    for i, s in enumerate(scope):
        r = result[i]
        self.assertEqual(len(r), 2)
        self.assertEqual(r[0], SplittingOperations.CREATE_LEAF_NODE)
        self.assertEqual(type(r[1]['parent']), Product)
        self.assertEqual(r[1]['pos'], i)
        self.assertListEqual(r[1]['scope'], [s])
        # Each task sees its own y column joined with the full x block.
        self.assertListEqual(
            r[1]['data'].tolist(), concatenate_yx(y[:, i], x).tolist()
        )
def test_conditional(self):
    """Smoke test: learn a CSPN structure end-to-end with a KMeans-based
    label conditional; only checks that structure learning runs."""
    labels = np.c_[np.zeros((500, 1)), np.ones((500, 1))]
    features = np.c_[
        np.r_[np.random.normal(5, 1, (500, 2)), np.random.normal(10, 1, (500, 2))]
    ]
    train_data = concatenate_yx(labels, features)

    ds_context = Context(
        parametric_types=[Bernoulli] * labels.shape[1]
    ).add_domains(labels)
    ds_context.feature_size = 2

    def label_conditional(y, x):
        # Cluster the labels into two groups; x is ignored here.
        from sklearn.cluster import KMeans

        # BUGFIX: dropped precompute_distances=True — the parameter was
        # deprecated in scikit-learn 0.23 and removed in 1.0, and it did
        # not change the clustering result.
        return KMeans(n_clusters=2, random_state=17).fit_predict(y)

    spn = learn_cspn_structure(
        train_data,
        ds_context,
        split_rows=get_split_conditional_rows_KMeans(),
        split_cols=getCIGroup(),
        create_leaf=create_conditional_leaf,
        label_conditional=label_conditional,
        cluster_univariate=True,
    )
def test_leaf_mpe_bernoulli(self):
    """MPE on a conditional Bernoulli leaf must fill the nan label with
    the class associated with the given x, and reject rows whose label
    is already observed."""
    np.random.seed(17)
    features = np.concatenate(
        (
            np.random.multivariate_normal([10, 10], np.eye(2), 5000),
            np.random.multivariate_normal([1, 1], np.eye(2), 5000),
        ),
        axis=0,
    )
    # y = 0 for the [10, 10] cluster, y = 1 for the [1, 1] cluster.
    labels = np.array([0] * 5000 + [1] * 5000).reshape(-1, 1)
    train = concatenate_yx(labels, features)

    ctx = Context(parametric_types=[Bernoulli])
    ctx.feature_size = 2
    leaf = create_conditional_leaf(train, ctx, [0])

    res = mpe(leaf, np.array([np.nan, 10, 10]).reshape(-1, 3))
    self.assertAlmostEqual(res[0, 0], 0)

    res = mpe(leaf, np.array([np.nan, 1, 1]).reshape(-1, 3))
    self.assertAlmostEqual(res[0, 0], 1)

    # Two queries at once keep their row order.
    res = mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10]).reshape(-1, 3))
    self.assertAlmostEqual(res[0, 0], 1)
    self.assertAlmostEqual(res[1, 0], 0)

    # A row with an observed label (5) is not a valid MPE query.
    with self.assertRaises(AssertionError):
        mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10, 10]).reshape(-1, 3))
def test_leaf_mpe_gaussian(self):
    """MPE on a parametric Gaussian leaf must reconstruct the y-mode
    paired with the queried x, and reject rows with an observed label."""
    np.random.seed(17)
    features = np.concatenate(
        (
            np.random.multivariate_normal([10, 10], np.eye(2), 5000),
            np.random.multivariate_normal([1, 1], np.eye(2), 5000),
        ),
        axis=0,
    )
    # y ~ N(20, 2) with x ~ [10, 10]; y ~ N(60, 2) with x ~ [1, 1].
    targets = np.array(
        np.random.normal(20, 2, 5000).tolist()
        + np.random.normal(60, 2, 5000).tolist()
    ).reshape(-1, 1)
    train = concatenate_yx(targets, features)

    ds_context = Context(parametric_types=[Gaussian])
    ds_context.feature_size = 2

    # Uses the parametric leaf factory (not the conditional one).
    leaf = create_parametric_leaf(train, ds_context, [0])

    res = mpe(leaf, np.array([np.nan, 10, 10]).reshape(-1, 3))
    self.assertAlmostEqual(res[0, 0], 20.435226001909466)

    res = mpe(leaf, np.array([np.nan, 1, 1]).reshape(-1, 3))
    self.assertAlmostEqual(res[0, 0], 59.4752193542575)

    # Two queries at once keep their row order.
    res = mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10]).reshape(-1, 3))
    self.assertAlmostEqual(res[0, 0], 59.4752193542575)
    self.assertAlmostEqual(res[1, 0], 20.435226001909466)

    # A row with an observed label (5) is not a valid MPE query.
    with self.assertRaises(AssertionError):
        mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10, 10]).reshape(-1, 3))
def test_create_conditional(self):
    """create_conditional must split the rows according to the label
    conditional, leave the input untouched, and route each row subset
    to the matching child of the new conditional node."""
    np.random.seed(17)
    data = np.arange(0, 1000).reshape(-1, 8)
    parent = Sum()
    parent.children.append(None)

    ctx = Context()
    ctx.feature_size = 4
    scope = [1, 3, 4, 6]
    data2 = np.array(data)

    # Fixed random 25/75 partition of the rows.
    K = int(data.shape[0] * 0.25)
    split_idx = np.array([0] * K + [1] * (data.shape[0] - K))
    np.random.shuffle(split_idx)

    y, x = get_YX(data, 4)

    def label_conditional(local_y, local_x):
        # The conditional must receive the y/x decomposition of the
        # full data set.
        self.assertListEqual(local_y.tolist(), y.tolist())
        self.assertListEqual(local_x.tolist(), x.tolist())
        return split_idx

    result = create_conditional(
        data=data2,
        parent=parent,
        pos=0,
        context=ctx,
        scope=list(scope),
        label_conditional=label_conditional,
    )

    self.assertListEqual(data.tolist(), data2.tolist())
    self.assertEqual(len(result), 2)
    for i, r in enumerate(result):
        self.assertEqual(r[0], SplittingOperations.GET_NEXT_OP)
        self.assertIn('data', r[1])
        self.assertEqual(parent.children[0], r[1]['parent'])
        self.assertEqual(r[1]['pos'], i)
        self.assertListEqual(scope, r[1]['scope'])
        self.assertEqual(r[1]['data'].shape[1], data.shape[1])

    # Rows must be routed to the child selected by the conditional.
    conditional_node = result[0][1]['parent']
    child_idx = conditional_supervised_likelihood(
        conditional_node,
        [np.zeros((data.shape[0], 1)), np.ones((data.shape[0], 1))],
        data,
    )
    self.assertListEqual(
        result[0][1]['data'].tolist(), data[child_idx[:, 0] == 0, :].tolist()
    )
    self.assertListEqual(
        result[1][1]['data'].tolist(), data[child_idx[:, 0] == 1, :].tolist()
    )
def test_create_sum_with_split(self):
    """create_sum with split_on_sum must create one child task per row
    cluster, preserve the input data, and normalise the sum weights."""
    np.random.seed(17)
    data = np.arange(0, 1000).reshape(-1, 8)
    parent = Sum()
    parent.children.append(None)

    ctx = Context()
    ctx.feature_size = 4
    scope = [1, 3, 4, 6]
    data2 = np.array(data)

    # Fixed random 25/75 row partition used by the stubbed splitter.
    K = int(data.shape[0] * 0.25)
    split_idx = np.array([0] * K + [1] * (data.shape[0] - K))
    np.random.shuffle(split_idx)

    def split_rows(data, context, scope):
        # Stub splitter returning the fixed partition with weights.
        return [
            (data[split_idx == 0, :], scope, 0.25),
            (data[split_idx == 1, :], scope, 0.75),
        ]

    result = create_sum(
        data=data2,
        parent=parent,
        pos=0,
        context=ctx,
        scope=list(scope),
        split_rows=split_rows,
        split_on_sum=True,
    )

    self.assertListEqual(data.tolist(), data2.tolist())
    self.assertEqual(len(result), 2)
    for i, r in enumerate(result):
        self.assertEqual(r[0], SplittingOperations.GET_NEXT_OP)
        self.assertIn('data', r[1])
        self.assertEqual(parent.children[0], r[1]['parent'])
        self.assertEqual(r[1]['pos'], i)
        self.assertListEqual(scope, r[1]['scope'])
        self.assertEqual(r[1]['data'].shape[1], data.shape[1])
        self.assertEqual(r[1]['data'].shape[0], int(np.sum(split_idx == i)))

    self.assertListEqual(
        result[0][1]['data'].tolist(), data[split_idx == 0, :].tolist()
    )
    self.assertListEqual(
        result[1][1]['data'].tolist(), data[split_idx == 1, :].tolist()
    )
    self.assertAlmostEqual(np.sum(parent.children[0].weights), 1.0)
def test_leaf_sampling(self):
    """Sampling y | x from a conditional Gaussian leaf must reproduce
    the y-mode paired with x during training, and refuse rows whose
    label is already observed."""
    np.random.seed(17)
    x = np.concatenate(
        (
            np.random.multivariate_normal([10, 10], np.eye(2), 5000),
            np.random.multivariate_normal([1, 1], np.eye(2), 5000),
        ),
        axis=0,
    )
    # y ~ N(20, 2) with x ~ [10, 10]; y ~ N(60, 2) with x ~ [1, 1].
    y = np.array(
        np.random.normal(20, 2, 5000).tolist()
        + np.random.normal(60, 2, 5000).tolist()
    ).reshape(-1, 1)
    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Gaussian])
    ds_context.feature_size = 2
    leaf = create_conditional_leaf(data, ds_context, [0])

    # BUGFIX: pass RandomState(17) instead of the bare int 17, matching
    # test_leaf_sampling_categorical; a RandomState seeded with 17
    # yields the same stream as seed 17, so the pinned means hold.
    res = sample_instances(
        leaf, np.array([np.nan, 10, 10] * 1000).reshape(-1, 3), RandomState(17)
    )
    self.assertAlmostEqual(np.mean(res[:, 0]), 20.456669723751173)

    res = sample_instances(
        leaf, np.array([np.nan, 1, 1] * 1000).reshape(-1, 3), RandomState(17)
    )
    self.assertAlmostEqual(np.mean(res[:, 0]), 59.496663076099196)

    # Interleaved queries keep their row order.
    res = sample_instances(
        leaf,
        np.array([np.nan, 1, 1, np.nan, 10, 10] * 1000).reshape(-1, 3),
        RandomState(17),
    )
    self.assertAlmostEqual(np.mean(res[::2, 0]), 59.546359637084564)
    self.assertAlmostEqual(np.mean(res[1::2, 0]), 20.452118792501008)

    # A row with an observed label (5) is not a valid sampling query.
    with self.assertRaises(AssertionError):
        sample_instances(
            leaf,
            np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10, 10]).reshape(-1, 3),
            RandomState(17),
        )
def test_leaf_no_variance_bernoulli(self):
    """A Bernoulli leaf trained on all-ones labels must assign every
    instance at least probability 0.5 for y = 1."""
    np.random.seed(17)
    features = np.concatenate(
        (
            np.random.multivariate_normal([10, 10], np.eye(2), 500),
            np.random.multivariate_normal([1, 1], np.eye(2), 500),
        ),
        axis=0,
    )
    labels = np.array([1] * 1000).reshape(-1, 1)
    train = concatenate_yx(labels, features)

    ctx = Context(parametric_types=[Bernoulli])
    ctx.feature_size = 2
    leaf = create_conditional_leaf(train, ctx, [0])

    self.assertTrue(np.all(likelihood(leaf, train) >= 0.5))
def test_remove_non_informative_features(self):
    """Zero-variance y columns must be split off as singleton-scope
    tasks under a Product node, with the informative columns grouped."""
    np.random.seed(17)
    data = np.arange(0, 1000).reshape(-1, 8)
    # Make y columns 1 and 3 constant (these carry scopes 3 and 6).
    data[:, 1] = 1
    data[:, 3] = 3

    parent = Sum()
    parent.children.append(None)
    ctx = Context()
    ctx.feature_size = 4
    scope = [1, 3, 4, 6]
    data2 = np.array(data)

    y, x = get_YX(data, 4)
    uninformative_features_idx = np.var(y, 0) == 0

    result = remove_non_informative_features(
        data=data2,
        parent=parent,
        pos=0,
        context=ctx,
        scope=list(scope),
        uninformative_features_idx=uninformative_features_idx,
    )

    self.assertListEqual(data.tolist(), data2.tolist())
    self.assertEqual(len(parent.children[0].children), len(result))

    # Constant columns become singleton scopes; the rest stay together.
    expected_scopes = [[3], [6], [1, 4]]
    expected_y = [y[:, 1], y[:, 3], y[:, [0, 2]]]
    for i, r in enumerate(result):
        self.assertEqual(len(r), 2)
        self.assertEqual(type(r[1]['parent']), Product)
        self.assertEqual(parent.children[0], r[1]['parent'])
        self.assertListEqual(r[1]['scope'], expected_scopes[i])
        self.assertEqual(r[1]['pos'], i)
        self.assertListEqual(
            r[1]['data'].tolist(), concatenate_yx(expected_y[i], x).tolist()
        )
def test_leaf_sampling_categorical(self):
    """Sampling y | x from a conditional Categorical leaf must draw the
    class paired with x during training (mean checked to one decimal)."""
    np.random.seed(17)
    features = np.concatenate(
        (
            np.random.multivariate_normal([20, 20], np.eye(2), 500),
            np.random.multivariate_normal([10, 10], np.eye(2), 500),
            np.random.multivariate_normal([1, 1], np.eye(2), 500),
        ),
        axis=0,
    )
    labels = np.array([2] * 500 + [1] * 500 + [0] * 500).reshape(-1, 1)
    train = concatenate_yx(labels, features)

    ds_context = Context(parametric_types=[Categorical])
    ds_context.feature_size = 2
    leaf = create_conditional_leaf(train, ds_context, [0])

    res = sample_instances(
        leaf, np.array([np.nan, 10, 10] * 1000).reshape(-1, 3), RandomState(17)
    )
    self.assertAlmostEqual(np.mean(res[:, 0]), 1, 1)

    res = sample_instances(
        leaf, np.array([np.nan, 1, 1] * 1000).reshape(-1, 3), RandomState(17)
    )
    self.assertAlmostEqual(np.mean(res[:, 0]), 0, 1)

    # Interleaved queries keep their row order.
    res = sample_instances(
        leaf,
        np.array([np.nan, 1, 1, np.nan, 10, 10] * 1000).reshape(-1, 3),
        RandomState(17),
    )
    self.assertAlmostEqual(np.mean(res[::2, 0]), 0, 1)
    self.assertAlmostEqual(np.mean(res[1::2, 0]), 1, 1)

    # A row with an observed label (5) is not a valid sampling query.
    with self.assertRaises(AssertionError):
        sample_instances(
            leaf,
            np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10, 10]).reshape(-1, 3),
            RandomState(17),
        )