def test_leaf_categorical(self):
    """Conditional categorical leaf: P(y|x) should concentrate on each cluster's label."""
    np.random.seed(17)
    # Three Gaussian clusters in X, each associated with one categorical label.
    x = np.concatenate(
        (
            np.random.multivariate_normal([20, 20], np.eye(2), 500),
            np.random.multivariate_normal([10, 10], np.eye(2), 500),
            np.random.multivariate_normal([1, 1], np.eye(2), 500),
        ),
        axis=0,
    )
    y = np.array([2] * 500 + [1] * 500 + [0] * 500).reshape(-1, 1)
    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Categorical])
    ds_context.feature_size = 2

    leaf = create_conditional_leaf(data, ds_context, [0])

    l0 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 0, x))
    l1 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 1, x))
    l2 = likelihood(leaf, concatenate_yx(np.ones_like(y) * 2, x))

    # Probabilities over all categories must sum to one.
    np.testing.assert_array_almost_equal(l0 + l1 + l2, 1.0)

    self.assertTrue(np.all(l0[1000:1500] > 0.85))
    self.assertTrue(np.all(l0[0:1000] < 0.15))

    self.assertTrue(np.all(l1[500:1000] > 0.85))
    self.assertTrue(np.all(l1[0:500] < 0.15))
    self.assertTrue(np.all(l1[1000:1500] < 0.15))

    self.assertTrue(np.all(l2[0:500] > 0.85))
    # BUG FIX: upper bound was 15000 (typo); the dataset only has 1500 rows.
    self.assertTrue(np.all(l2[500:1500] < 0.15))
def remove_non_informative_features(data=None, node_id=0, scope=None, context=None,
                                    uninformative_features_idx=None, **kwargs):
    """Create a Product node that factors out uninformative (zero-variance) label columns.

    Each uninformative column becomes its own CREATE_LEAF_NODE subtask; all remaining
    informative columns are grouped into a single GET_NEXT_OP subtask.

    Returns:
        (prod_node, result): the Product node and a list of (operation, params) subtasks.
    """
    assert uninformative_features_idx is not None, "parameter uninformative_features_idx can't be None"
    # BUG FIX: the default for `context` was 0, which would crash on
    # `context.feature_size` with an AttributeError; fail fast and consistently
    # with the naive_factorization signature instead.
    assert context is not None, "parameter context can't be None"

    prod_node = Product()
    prod_node.scope = scope
    prod_node.id = node_id

    y, x = get_YX(data, context.feature_size)

    non_zero_variance_rvs = []
    non_zero_variance_idx = []
    result = []
    for idx, zero_var in enumerate(uninformative_features_idx):
        rv = scope[idx]
        if not zero_var:
            non_zero_variance_rvs.append(rv)
            non_zero_variance_idx.append(idx)
            continue
        # One dedicated leaf per uninformative label column.
        prod_node.children.append(None)
        data_slice = concatenate_yx(y[:, idx].reshape(-1, 1), x)
        result.append((
            SplittingOperations.CREATE_LEAF_NODE,
            {
                "data": data_slice,
                "parent_id": prod_node.id,
                "pos": len(prod_node.children) - 1,
                "scope": [rv],
            },
        ))
    assert len(result) > 0

    if len(non_zero_variance_idx) > 0:
        # All informative columns continue through the regular pipeline together.
        # NOTE(review): `data[:, non_zero_variance_idx]` indexes the combined
        # [y|x] array with y-column indices; valid only because y columns
        # precede x in `data` — confirm against concatenate_yx's layout.
        prod_node.children.append(None)
        result.append((
            SplittingOperations.GET_NEXT_OP,
            {
                "data": concatenate_yx(data[:, non_zero_variance_idx], x),
                "parent_id": prod_node.id,
                "pos": len(prod_node.children) - 1,
                "scope": non_zero_variance_rvs,
            },
        ))
    return prod_node, result
def naive_factorization(data=None, node_id=0, context=None, scope=None, **kwargs):
    """Fully factorize the scope: emit one CREATE_LEAF_NODE subtask per label column."""
    assert scope is not None, "No scope"

    product = Product()
    product.scope = scope
    product.id = node_id

    y, x = get_YX(data, context.feature_size)

    tasks = []
    for pos, rv in enumerate(scope):
        product.children.append(None)
        column = y[:, pos].reshape(-1, 1)
        params = {
            "data": concatenate_yx(column, x),
            "parent_id": product.id,
            "pos": len(product.children) - 1,
            "scope": [rv],
        }
        tasks.append((SplittingOperations.CREATE_LEAF_NODE, params))
    return product, tasks
def test_leaf_mpe_conditional(self):
    """MPE through a fitted CSPN classifier should recover each cluster's label."""
    np.random.seed(17)
    x = np.concatenate(
        (
            np.random.multivariate_normal([10, 10], np.eye(2), 5000),
            np.random.multivariate_normal([1, 1], np.eye(2), 5000),
        ),
        axis=0,
    )
    # associates y=0 with X=[10,10]; y=1 with X=[1,1]
    y = np.array([0] * 5000 + [1] * 5000).reshape(-1, 1)

    # CLEANUP: removed the unused `data = concatenate_yx(y, x)` local;
    # fit() takes x and y directly.
    cspn = CSPNClassifier([Bernoulli] * y.shape[1], min_instances_slice=4990, cluster_univariate=True)
    cspn.fit(x, y)

    res = mpe(cspn.cspn, np.array([np.nan, 10, 10]).reshape(-1, 3))
    self.assertAlmostEqual(res[0, 0], 0)

    res = mpe(cspn.cspn, np.array([np.nan, 1, 1]).reshape(-1, 3))
    self.assertAlmostEqual(res[0, 0], 1)

    res = mpe(cspn.cspn, np.array([np.nan, 1, 1, np.nan, 10, 10]).reshape(-1, 3))
    self.assertAlmostEqual(res[0, 0], 1)
    self.assertAlmostEqual(res[1, 0], 0)

    # Fully observed rows (no nan) are invalid MPE queries.
    with self.assertRaises(AssertionError):
        mpe(cspn.cspn, np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10, 10]).reshape(-1, 3))
def test_leaf_no_variance_gaussian(self):
    """Gaussian conditional leaves built from constant labels yield constant densities."""
    np.random.seed(17)
    x = np.concatenate(
        (
            np.random.multivariate_normal([10, 10], np.eye(2), 500),
            np.random.multivariate_normal([1, 1], np.eye(2), 500),
        ),
        axis=0,
    )
    y = np.array([1] * 1000).reshape(-1, 1)
    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Gaussian])
    ds_context.feature_size = 2

    # Label constant at 1: density is the standard-normal peak value everywhere.
    leaf = create_conditional_leaf(data, ds_context, [0])
    l = likelihood(leaf, data)
    self.assertEqual(np.var(l[:, 0]), 0)
    self.assertAlmostEqual(l[0, 0], 0.398942280401432)

    # Same result when the constant label is 2 instead of 1.
    data[:, 0] = 2
    leaf = create_conditional_leaf(data, ds_context, [0])
    l = likelihood(leaf, data)
    self.assertEqual(np.var(l[:, 0]), 0)
    self.assertAlmostEqual(l[0, 0], 0.398942280401432)

    # Train on constant label 3 but evaluate at label 2: one unit off the mean.
    data3 = np.array(data)
    data3[:, 0] = 3
    leaf = create_conditional_leaf(data3, ds_context, [0])
    l = likelihood(leaf, data)
    self.assertAlmostEqual(np.var(l[:, 0]), 0)
    self.assertAlmostEqual(l[0, 0], 0.241970724519143)
def test_naive_factorization(self):
    """Exercise naive_factorization: one CREATE_LEAF_NODE task per scope variable.

    NOTE(review): this test calls naive_factorization(parent=..., pos=...) and
    indexes `result[i][1]['parent']`, while the naive_factorization definition
    visible in this file takes `node_id`, emits 'parent_id' keys, and returns a
    (prod_node, result) tuple — confirm which API version this test targets.
    """
    np.random.seed(17)
    data = np.arange(0, 1000).reshape(-1, 8)
    parent = Sum()
    parent.children.append(None)
    ctx = Context()
    # first 4 of the 8 columns are labels per get_YX(data, 4) below — TODO confirm
    ctx.feature_size = 4
    scope = [1, 3, 4, 6]
    data2 = np.array(data)
    result = naive_factorization(data=data2, parent=parent, pos=0, context=ctx, scope=list(scope))
    # the input data must not be mutated
    self.assertListEqual(data.tolist(), data2.tolist())
    self.assertEqual(parent.children[0], result[0][1]['parent'])
    y, x = get_YX(data, 4)
    # one subtask per scope variable, each a (operation, params) pair
    self.assertEqual(len(result), len(scope))
    for i, s in enumerate(scope):
        r = result[i]
        self.assertEqual(len(r), 2)
        self.assertEqual(r[0], SplittingOperations.CREATE_LEAF_NODE)
        self.assertEqual(type(r[1]['parent']), Product)
        self.assertEqual(r[1]['pos'], i)
        self.assertListEqual(r[1]['scope'], [s])
        # each slice carries its single y column plus all features
        self.assertListEqual(r[1]['data'].tolist(), concatenate_yx(y[:, i], x).tolist())
def test_leaf_bernoulli_bootstrap(self):
    """A Bernoulli conditional leaf should prefer the observed label in every cluster."""
    np.random.seed(17)
    x = np.concatenate(
        (
            np.random.multivariate_normal([10, 10], np.eye(2), 100),
            np.random.multivariate_normal([1, 1], np.eye(2), 100),
        ),
        axis=0,
    )
    y = np.array([1] * 100 + [0] * 100).reshape(-1, 1)
    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Bernoulli])
    ds_context.feature_size = 2

    leaf = create_conditional_leaf(data, ds_context, [0])
    l = likelihood(leaf, data)

    # Evaluate the complementary labels against the same features.
    neg_data = np.concatenate([1 - y, x], axis=1)
    lneg = likelihood(leaf, neg_data)

    # P(y|x) + P(1-y|x) must sum to one, with the true label dominating.
    np.testing.assert_array_almost_equal(l + lneg, 1.0)
    self.assertTrue(np.all(l >= 0.5))
    self.assertTrue(np.all(lneg < 0.5))
def fit(self, X, y=None):
    """Learn a CSPN over labels `y` conditioned on features `X`; returns self."""
    self.context = Context(
        parametric_types=self.parametric_types).add_domains(y)
    self.context.feature_size = X.shape[1]
    self.num_labels = y.shape[1]

    def label_conditional(y, x):
        # Cluster instances by their features only; labels are ignored here.
        from sklearn.cluster import KMeans
        # BUG FIX: `precompute_distances` was deprecated in scikit-learn 0.23
        # and removed in 1.0; passing it raises TypeError on current versions.
        clusters = KMeans(n_clusters=2, random_state=17).fit_predict(x)
        return clusters

    self.cspn = learn_cspn_structure(
        concatenate_yx(y, X),
        self.context,
        split_rows=get_split_rows_conditional_Gower(),
        split_cols=getCIGroup(alpha=self.alpha),
        create_leaf=create_conditional_leaf,
        label_conditional=label_conditional,
        **self.kwargs)
    return self
def test_leaf_mpe_bernoulli(self):
    """MPE on a Bernoulli conditional leaf recovers the cluster-specific label."""
    np.random.seed(17)
    x = np.concatenate(
        (
            np.random.multivariate_normal([10, 10], np.eye(2), 5000),
            np.random.multivariate_normal([1, 1], np.eye(2), 5000),
        ),
        axis=0,
    )
    # y=0 goes with X around [10,10]; y=1 with X around [1,1].
    y = np.array([0] * 5000 + [1] * 5000).reshape(-1, 1)
    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Bernoulli])
    ds_context.feature_size = 2
    leaf = create_conditional_leaf(data, ds_context, [0])

    # Single-row queries, one per cluster.
    for features, expected in (([10, 10], 0), ([1, 1], 1)):
        query = np.array([np.nan] + features).reshape(-1, 3)
        self.assertAlmostEqual(mpe(leaf, query)[0, 0], expected)

    # Two-row query mixing both clusters.
    res = mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10]).reshape(-1, 3))
    self.assertAlmostEqual(res[0, 0], 1)
    self.assertAlmostEqual(res[1, 0], 0)

    # Rows without any nan are not valid MPE queries.
    with self.assertRaises(AssertionError):
        mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10, 10]).reshape(-1, 3))
def test_leaf_mpe_gaussian(self):
    """MPE of a Gaussian parametric leaf should track the cluster-conditional mean."""
    np.random.seed(17)
    x = np.concatenate(
        (
            np.random.multivariate_normal([10, 10], np.eye(2), 5000),
            np.random.multivariate_normal([1, 1], np.eye(2), 5000),
        ),
        axis=0,
    )
    # y ~ N(20, 2) near X=[10,10]; y ~ N(60, 2) near X=[1,1].
    y = np.array(np.random.normal(20, 2, 5000).tolist() +
                 np.random.normal(60, 2, 5000).tolist()).reshape(-1, 1)
    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Gaussian])
    ds_context.feature_size = 2
    leaf = create_parametric_leaf(data, ds_context, [0])

    res = mpe(leaf, np.array([np.nan, 10, 10]).reshape(-1, 3))
    self.assertAlmostEqual(res[0, 0], 20.435226001909466)

    res = mpe(leaf, np.array([np.nan, 1, 1]).reshape(-1, 3))
    self.assertAlmostEqual(res[0, 0], 59.4752193542575)

    # Two rows, one per cluster, answered independently.
    res = mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10]).reshape(-1, 3))
    self.assertAlmostEqual(res[0, 0], 59.4752193542575)
    self.assertAlmostEqual(res[1, 0], 20.435226001909466)

    # Fully observed rows must be rejected.
    with self.assertRaises(AssertionError):
        mpe(leaf, np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10, 10]).reshape(-1, 3))
def test_datasets(self):
    """Smoke test: fit a CSPNClassifier on the jester dataset with an 80% evidence mask."""
    ds = "jester"
    ev = "ev80"
    name, features, validation, train, test, n_discrete, n_bernoulli = get_binary_data(
        ds)
    _, features_msk, validation_msk, train_msk, test_msk, n_discrete_msk, n_bernoulli_msk = get_binary_mask(
        ds, ev)

    # nan columns in the mask mark label columns; the rest are evidence features.
    col_msk = np.isnan(train_msk)[0]
    train_x, valid_x, test_x = train[:, ~col_msk], validation[:, ~col_msk], test[:, ~col_msk]
    train_y, valid_y, test_y = train[:, col_msk], validation[:, col_msk], test[:, col_msk]

    cspn = CSPNClassifier(parametric_types=[Bernoulli] * train_y.shape[1],
                          alpha=0.0001,
                          min_splitting_instances=3000,
                          min_clustering_instances=2000)
    cspn.fit(train_x, y=train_y)

    ll = cspn.score_samples(concatenate_yx(test_y, test_x))
    print(ll.mean())
def test_leaf_gaussian(self):
    """Conditional Gaussian leaf: the density should follow the cluster-specific y mean."""
    np.random.seed(17)
    x = np.concatenate(
        (
            np.random.multivariate_normal([10, 10], np.eye(2), 5000),
            np.random.multivariate_normal([1, 1], np.eye(2), 5000),
        ),
        axis=0,
    )
    # y ~ N(20, 2) near X=[10,10]; y ~ N(60, 2) near X=[1,1].
    y = np.array(np.random.normal(20, 2, 5000).tolist() +
                 np.random.normal(60, 2, 5000).tolist()).reshape(-1, 1)
    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Gaussian])
    ds_context.feature_size = 2
    leaf = create_conditional_leaf(data, ds_context, [0])

    self.assertFalse(np.any(np.isnan(likelihood(leaf, data))))

    # Each y value is more likely under its own cluster's features.
    self.assertGreater(get_ll(leaf, [20, 10, 10]), get_ll(leaf, [20, 1, 1]))
    self.assertGreater(get_ll(leaf, [60, 1, 1]), get_ll(leaf, [60, 10, 10]))

    self.assertAlmostEqual(get_ll(leaf, [60, 1, 1]), 0.3476232862652)
    self.assertAlmostEqual(get_ll(leaf, [20, 10, 10]), 0.3628922322773634)
def test_conditional(self):
    """End-to-end structure learning with a label-based conditional clustering."""
    labels = np.c_[np.zeros((500, 1)), np.ones((500, 1))]
    features = np.c_[
        np.r_[np.random.normal(5, 1, (500, 2)), np.random.normal(10, 1, (500, 2))]
    ]
    train_data = concatenate_yx(labels, features)

    ds_context = Context(
        parametric_types=[Bernoulli] * labels.shape[1]
    ).add_domains(labels)
    ds_context.feature_size = 2

    def label_conditional(y, x):
        # Cluster instances by their labels (not features) here.
        from sklearn.cluster import KMeans
        # BUG FIX: `precompute_distances` was deprecated in scikit-learn 0.23
        # and removed in 1.0; passing it raises TypeError on current versions.
        clusters = KMeans(n_clusters=2, random_state=17).fit_predict(y)
        return clusters

    spn = learn_cspn_structure(
        train_data,
        ds_context,
        split_rows=get_split_conditional_rows_KMeans(),
        split_cols=getCIGroup(),
        create_leaf=create_conditional_leaf,
        label_conditional=label_conditional,
        cluster_univariate=True,
    )
def fit(self, X, y=None):
    """Fit a (sum-rooted) CSPN over Bernoulli labels `y` given features `X`; returns self."""
    y = y.reshape(y.shape[0], -1)
    self.num_labels = y.shape[1]
    self.context = Context(parametric_types=[Bernoulli] * self.num_labels).add_domains(y)
    self.context.feature_size = X.shape[1]
    self.scope = list(range(y.shape[1]))
    data = concatenate_yx(y, X)

    cspn_type = 1
    if cspn_type == 0:
        # Variant 0: a single conditional leaf over the whole scope.
        self.cspn = create_conditional_leaf(data, self.context, self.scope)
    elif cspn_type == 1:
        # Variant 1: sum node over KMeans row clusters, one conditional leaf per cluster.
        split_rows = get_split_conditional_rows_KMeans()
        self.cspn, subtasks = create_sum(data=data, node_id=0, parent_id=0, pos=0,
                                         context=self.context, scope=self.scope,
                                         split_rows=split_rows)
        for i, subtask in enumerate(subtasks):
            self.cspn.children[i] = create_conditional_leaf(
                subtask[1]['data'], self.context, subtask[1]['scope'])
    print(self.cspn)
    # BUG FIX: fit() must return self per the scikit-learn estimator convention
    # (the sibling fit() implementation in this file already does).
    return self
def test_leaf_sampling_multilabel(self):
    """Sampling a two-label CSPN should reproduce each cluster's label pattern."""
    np.random.seed(17)
    x = np.concatenate(
        (
            np.random.multivariate_normal([10, 10], np.eye(2), 5000),
            np.random.multivariate_normal([1, 1], np.eye(2), 5000),
        ),
        axis=0,
    )
    # X near [10,10] -> (y0, y1) = (0, 1); X near [1,1] -> (y0, y1) = (1, 0).
    y = np.concatenate(
        (
            np.array([0] * 5000 + [1] * 5000).reshape(-1, 1),
            np.array([1] * 5000 + [0] * 5000).reshape(-1, 1),
        ),
        axis=1,
    )
    data = concatenate_yx(y, x)

    cspn = CSPNClassifier([Bernoulli] * y.shape[1], min_instances_slice=4990, cluster_univariate=True)
    cspn.fit(x, y)

    samples = sample_instances(
        cspn.cspn,
        np.array([np.nan, np.nan, 10, 10] * 1000).reshape(-1, 4), 17)
    self.assertAlmostEqual(np.unique(samples[:, 0]), 0)
    self.assertAlmostEqual(np.unique(samples[:, 1]), 1)

    samples = sample_instances(
        cspn.cspn,
        np.array([np.nan, np.nan, 1, 1] * 1000).reshape(-1, 4), 17)
    self.assertAlmostEqual(np.unique(samples[:, 0]), 1)
    self.assertAlmostEqual(np.unique(samples[:, 1]), 0)

    # Alternating rows with one label observed; the nan label must be completed.
    samples = sample_instances(
        cspn.cspn,
        np.array([np.nan, 0, 1, 1, np.nan, 1, 10, 10] * 1000).reshape(-1, 4), 17)
    self.assertAlmostEqual(np.unique(samples[::2, 0]), 1)
    self.assertAlmostEqual(np.unique(samples[1::2, 0]), 0)
    self.assertAlmostEqual(np.unique(samples[::2, 1]), 0)
    self.assertAlmostEqual(np.unique(samples[1::2, 1]), 1)

    # Fully observed rows are rejected.
    with self.assertRaises(AssertionError):
        sample_instances(
            cspn.cspn,
            np.array([np.nan, 1, 1, 1, np.nan, 0, 10, 10, 1, 1, 10, 10]).reshape(-1, 4), 17)
def predict(self, X, check_input=True):
    """Predict labels for X via exact MPE; raises RuntimeError when not fitted."""
    if self.cspn is None:
        raise RuntimeError("Classifier not fitted")
    num_labels = len(self.cspn.scope)
    # All-nan label block: every label is a query variable.
    y = np.full((X.shape[0], num_labels), np.nan)
    test_data = concatenate_yx(y, X)
    mpe_y = ExactMPE(self.cspn, test_data, self.context)
    return mpe_y
def split_conditional_data_by_clusters(y, x, clusters, scope, rows=True):
    """Partition the label columns of (y, x) by cluster id.

    Only column splits are supported; each returned slice keeps the full x.
    Returns a list of (local_data, local_scope, proportion) triples.
    """
    assert not rows, "split conditional only for columns"
    scope_arr = np.asarray(scope)
    slices = []
    for cluster_id in np.unique(clusters):
        mask = clusters == cluster_id
        y_part = y[:, mask].reshape((x.shape[0], -1))
        # proportion is always 1: a column split does not change row weight
        slices.append((concatenate_yx(y_part, x), scope_arr[mask].tolist(), 1))
    return slices
def predict_proba(self, X):
    """Per-label probability P(y_n = 1 | X) for every label in the CSPN scope."""
    # All-nan label block marginalizes the labels in the base query.
    y = np.full((X.shape[0], self.num_labels), np.nan)
    test_data = concatenate_yx(y, X)
    results = np.ones_like(y)
    for n in self.cspn.scope:
        # Clamp label n to 1 and evaluate its likelihood.
        query = np.array(test_data)
        query[:, n] = 1
        results[:, n] = likelihood(self.cspn, query)[:, 0]
    return results
def predict_proba(self, X):
    """Binary class probabilities [P(y=0|X), P(y=1|X)] from the single label column."""
    y = np.ones((X.shape[0], self.num_labels))
    y[:] = np.nan
    test_data = concatenate_yx(y, X)

    # BUG FIX: the previous loop over range(2) also computed a likelihood with
    # column 1 of the query set to 1 — wasted work, and when num_labels < 2
    # that write lands on a *feature* column (and results[:, 1] raises
    # IndexError). Only P(y0 = 1 | X) is ever used below.
    query = np.array(test_data)
    query[:, 0] = 1
    p_one = likelihood(self.cspn, query)[:, 0]

    rbinc = np.zeros((X.shape[0], 2))
    rbinc[:, 0] = 1 - p_one
    rbinc[:, 1] = p_one
    return rbinc
def supervised_leaf_likelihood(node, data=None, dtype=np.float64):
    """Likelihood of a supervised (discriminative) leaf over its single scope variable.

    Rows whose label is nan are treated as marginalized and get probability 1;
    observed rows get the probability supplied by the leaf's fitted predictor.

    NOTE(review): the assignment `probs[~marg_ids] = ...predict_proba(...)`
    requires the predictor to return one column per row here — confirm against
    the concrete predictor type, since sklearn's predict_proba returns one
    column per class.
    """
    assert len(node.scope) == 1, node.scope
    y, x = get_YX(data, node.feature_size)
    y = y[:, node.scope]
    probs = np.ones((y.shape[0], 1), dtype=dtype)
    # nan labels mean "marginalize this row": keep probability 1
    marg_ids = np.isnan(y[:, 0])
    if np.sum(~marg_ids) > 0:
        observations_data = concatenate_yx(y[~marg_ids], x[~marg_ids])
        probs[~marg_ids] = node.predictor.predict_proba(observations_data)
    # clamp exact zeros to avoid -inf log-likelihoods downstream
    probs[np.isclose(probs, 0)] = 0.000000001
    return probs
def test_leaf_sampling(self):
    """Samples from a Gaussian conditional leaf follow the cluster-conditional mean."""
    np.random.seed(17)
    x = np.concatenate(
        (
            np.random.multivariate_normal([10, 10], np.eye(2), 5000),
            np.random.multivariate_normal([1, 1], np.eye(2), 5000),
        ),
        axis=0,
    )
    # y ~ N(20, 2) near X=[10,10]; y ~ N(60, 2) near X=[1,1].
    y = np.array(np.random.normal(20, 2, 5000).tolist() +
                 np.random.normal(60, 2, 5000).tolist()).reshape(-1, 1)
    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Gaussian])
    ds_context.feature_size = 2
    leaf = create_conditional_leaf(data, ds_context, [0])

    samples = sample_instances(
        leaf, np.array([np.nan, 10, 10] * 1000).reshape(-1, 3), 17)
    self.assertAlmostEqual(np.mean(samples[:, 0]), 20.456669723751173)

    samples = sample_instances(
        leaf, np.array([np.nan, 1, 1] * 1000).reshape(-1, 3), 17)
    self.assertAlmostEqual(np.mean(samples[:, 0]), 59.496663076099196)

    # Alternating rows: every other row comes from a different cluster.
    samples = sample_instances(
        leaf, np.array([np.nan, 1, 1, np.nan, 10, 10] * 1000).reshape(-1, 3), 17)
    self.assertAlmostEqual(np.mean(samples[::2, 0]), 59.546359637084564)
    self.assertAlmostEqual(np.mean(samples[1::2, 0]), 20.452118792501008)

    # Fully observed rows are rejected.
    with self.assertRaises(AssertionError):
        sample_instances(
            leaf, np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10, 10]).reshape(-1, 3), 17)
def test_remove_non_informative_features(self):
    """Zero-variance label columns get individual leaves; informative ones stay grouped.

    NOTE(review): this test calls remove_non_informative_features(parent=...,
    pos=...) and reads r[1]['parent'] / parent.children[0].children, while the
    definition visible in this file takes `node_id`, emits 'parent_id' keys and
    returns a (prod_node, result) tuple — confirm which API version this test
    targets.
    """
    np.random.seed(17)
    data = np.arange(0, 1000).reshape(-1, 8)
    # force zero variance on label columns 1 and 3 (scope rvs 3 and 6)
    data[:, 1] = 1
    data[:, 3] = 3
    parent = Sum()
    parent.children.append(None)
    ctx = Context()
    ctx.feature_size = 4
    scope = [1, 3, 4, 6]
    data2 = np.array(data)
    y, x = get_YX(data, 4)
    uninformative_features_idx = np.var(y, 0) == 0
    result = remove_non_informative_features(
        data=data2, parent=parent, pos=0, context=ctx, scope=list(scope),
        uninformative_features_idx=uninformative_features_idx)
    # the input data must not be mutated
    self.assertListEqual(data.tolist(), data2.tolist())
    self.assertEqual(len(parent.children[0].children), len(result))
    # constant y columns 1 and 3 -> leaves for rvs 3 and 6;
    # informative y columns 0 and 2 -> one grouped task for rvs [1, 4]
    resulting_scopes = [[3], [6], [1, 4]]
    resulting_data_y = [y[:, 1], y[:, 3], y[:, [0, 2]]]
    for i, r in enumerate(result):
        self.assertEqual(len(r), 2)
        self.assertEqual(type(r[1]['parent']), Product)
        self.assertEqual(parent.children[0], r[1]['parent'])
        self.assertListEqual(r[1]['scope'], resulting_scopes[i])
        self.assertEqual(r[1]['pos'], i)
        self.assertListEqual(
            r[1]['data'].tolist(),
            concatenate_yx(resulting_data_y[i], x).tolist())
def test_leaf_no_variance_bernoulli(self):
    """A Bernoulli leaf trained on all-ones labels always favors y=1."""
    np.random.seed(17)
    x = np.concatenate(
        (
            np.random.multivariate_normal([10, 10], np.eye(2), 500),
            np.random.multivariate_normal([1, 1], np.eye(2), 500),
        ),
        axis=0,
    )
    y = np.array([1] * 1000).reshape(-1, 1)
    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Bernoulli])
    ds_context.feature_size = 2
    leaf = create_conditional_leaf(data, ds_context, [0])

    probs = likelihood(leaf, data)
    self.assertTrue(np.all(probs >= 0.5))
def test_leaf_sampling_categorical(self):
    """Categorical leaf sampling tracks the cluster-specific label distribution."""
    np.random.seed(17)
    x = np.concatenate(
        (
            np.random.multivariate_normal([20, 20], np.eye(2), 500),
            np.random.multivariate_normal([10, 10], np.eye(2), 500),
            np.random.multivariate_normal([1, 1], np.eye(2), 500),
        ),
        axis=0,
    )
    y = np.array([2] * 500 + [1] * 500 + [0] * 500).reshape(-1, 1)
    data = concatenate_yx(y, x)

    ds_context = Context(parametric_types=[Categorical])
    ds_context.feature_size = 2
    leaf = create_conditional_leaf(data, ds_context, [0])

    samples = sample_instances(
        leaf, np.array([np.nan, 10, 10] * 1000).reshape(-1, 3), RandomState(17))
    self.assertAlmostEqual(np.mean(samples[:, 0]), 1, 1)

    samples = sample_instances(
        leaf, np.array([np.nan, 1, 1] * 1000).reshape(-1, 3), RandomState(17))
    self.assertAlmostEqual(np.mean(samples[:, 0]), 0, 1)

    # Alternating rows from two different clusters.
    samples = sample_instances(
        leaf, np.array([np.nan, 1, 1, np.nan, 10, 10] * 1000).reshape(-1, 3),
        RandomState(17))
    self.assertAlmostEqual(np.mean(samples[::2, 0]), 0, 1)
    self.assertAlmostEqual(np.mean(samples[1::2, 0]), 1, 1)

    # Fully observed rows are rejected.
    with self.assertRaises(AssertionError):
        sample_instances(
            leaf, np.array([np.nan, 1, 1, np.nan, 10, 10, 5, 10, 10]).reshape(-1, 3),
            RandomState(17))
y[:] = 0 data = np.zeros_like(to_ohe(y[:, 0].astype(int), n_people)) data = np.eye(n_people) # data[:, 9] = 1 # data[:, 11] = 1 # data[:] = 1 sample_images.insert(0, data) else: y = np.zeros((num_images, block_size)) y[:] = np.nan X = np.concatenate(sample_images, axis=1) tr_block = sample_instances(spn, concatenate_yx(y, X), rng, in_place=False) y = tr_block[:, 0:block_size] sample_images.insert(0, y) all_sample_images = np.concatenate(sample_images, axis=1) samples_person_id = np.argmax(all_sample_images[:, -n_people:], axis=1) all_sample_images = all_sample_images[:, 0:-n_people] # remove person id block_ids = tuple(list(reversed(range((num_blocks[0] * num_blocks[1]))))) sample_img_blocks = stitch_imgs(all_sample_images.shape[0], img_size=images[0].shape, num_blocks=num_blocks, blocks={block_ids: all_sample_images}) result_scaled = [] for i in range(num_images):