def test_utils(self):
    data = np.array([0, 1, 2, 3, 4, 5]).reshape(1, -1).repeat(10, axis=0)

    y, x = get_YX(data, 2)
    self.assertEqual(y.shape[0], 10)
    self.assertEqual(x.shape[0], 10)
    self.assertEqual(y.shape[1], 4)
    self.assertEqual(x.shape[1], 2)
    self.assertTrue(np.all(y[0, :] == [0, 1, 2, 3]))
    self.assertTrue(np.all(x[0, :] == [4, 5]))

    y, x = get_YX(data, 1)
    self.assertEqual(y.shape[0], 10)
    self.assertEqual(x.shape[0], 10)
    self.assertEqual(y.shape[1], 5)
    self.assertEqual(x.shape[1], 1)
    self.assertTrue(np.all(y[0, :] == [0, 1, 2, 3, 4]))
    self.assertTrue(np.all(x[0, :] == [5]))

    y, x = get_YX(data, 5)
    self.assertEqual(y.shape[0], 10)
    self.assertEqual(x.shape[0], 10)
    self.assertEqual(y.shape[1], 1)
    self.assertEqual(x.shape[1], 5)
    self.assertTrue(np.all(y[0, :] == [0]))
    self.assertTrue(np.all(x[0, :] == [1, 2, 3, 4, 5]))

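
# A minimal reference sketch of the helpers the test above exercises, assuming
# the contract the assertions imply: y holds the leading target columns and x
# the trailing feature_size conditioning columns. The real helpers are imported
# from the package's utils module; these hypothetical _reference_* versions
# exist only to document the expected behavior.
def _reference_get_YX(data, feature_size):
    # split off the last feature_size columns as conditioning features x
    return data[:, :-feature_size], data[:, -feature_size:]


def _reference_concatenate_yx(y, x):
    # promote 1-D target slices such as y[:, i] to column vectors before joining
    if y.ndim == 1:
        y = y.reshape(-1, 1)
    return np.concatenate([y, x], axis=1)
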
def test_naive_factorization(self):
    np.random.seed(17)
    data = np.arange(0, 1000).reshape(-1, 8)
    parent = Sum()
    parent.children.append(None)
    ctx = Context()
    ctx.feature_size = 4
    scope = [1, 3, 4, 6]
    data2 = np.array(data)

    result = naive_factorization(data=data2, parent=parent, pos=0, context=ctx, scope=list(scope))

    self.assertListEqual(data.tolist(), data2.tolist())
    self.assertEqual(parent.children[0], result[0][1]['parent'])

    y, x = get_YX(data, 4)
    self.assertEqual(len(result), len(scope))
    for i, s in enumerate(scope):
        r = result[i]
        self.assertEqual(len(r), 2)
        self.assertEqual(r[0], SplittingOperations.CREATE_LEAF_NODE)
        self.assertEqual(type(r[1]['parent']), Product)
        self.assertEqual(r[1]['pos'], i)
        self.assertListEqual(r[1]['scope'], [s])
        self.assertListEqual(r[1]['data'].tolist(), concatenate_yx(y[:, i], x).tolist())

def getCIGroups(local_data, ds_context=None, scope=None, families=None):
    """
    Takes an (output, conditional) data matrix and splits the output variables
    into independent groups.

    :param local_data: np array with output columns followed by conditional columns
    :param ds_context: Context carrying feature_size
    :param scope: list of indices of the output variables
    :param families: obsolete
    :return: data slices, one per independent group of output variables

    alpha, the cutoff parameter for the connected components, and epsilon come
    from the enclosing scope.
    BE CAREFUL WITH SPARSE DATA!
    """
    # data = preproc(local_data, ds_context, None, ohe)

    y, x = get_YX(local_data, ds_context.feature_size)

    pvals = testRcoT(y, x) + epsilon
    # drop edges whose p-value exceeds alpha: those pairs are treated as independent
    pvals[pvals > alpha] = 0

    clusters = np.zeros(y.shape[1])
    for i, c in enumerate(connected_components(from_numpy_matrix(pvals))):
        clusters[list(c)] = i + 1

    return split_conditional_data_by_clusters(y, x, clusters, scope, rows=False)

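
# A toy illustration (hypothetical, not part of the library) of the grouping
# step above: after thresholding, the surviving nonzero p-values act as graph
# edges, and each connected component becomes one independent group of output
# variables. Uses the same networkx calls as getCIGroups (from_numpy_matrix is
# the pre-3.0 networkx name).
def _example_ci_grouping():
    toy_pvals = np.array([
        [0.0, 0.01, 0.0],   # small surviving p-value: variables 0 and 1 stay connected
        [0.01, 0.0, 0.0],
        [0.0, 0.0, 0.0],    # no surviving edge: variable 2 forms its own group
    ])
    groups = list(connected_components(from_numpy_matrix(toy_pvals)))
    assert groups == [{0, 1}, {2}]
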
def ExactMPE(spn, data, ds_context, node_top_down_mpe=_node_top_down_mpe, node_bottom_up_mpe_log=_node_bottom_up_mpe_log):
    y, x = get_YX(data, ds_context.feature_size)
    result = np.array(y)

    nodes = get_nodes_by_type(spn)
    lls_per_node = np.zeros((data.shape[0], len(nodes)))

    # one pass bottom up evaluating the likelihoods
    # for i in range(data.shape[0]):
    #     result[i, :] = exact_mpe_row(spn, y[i, :], x[i, :])
    # log_likelihood(spn, data, dtype=data.dtype, lls_matrix=lls_per_node)
    a = likelihood(spn, data, dtype=data.dtype, lls_matrix=lls_per_node)
    # a = concatenate_yx(result, x)

    # instance_ids = np.arange(data.shape[0])
    #
    # # one pass top down to decide on the max branch until it reaches a leaf,
    # # then it fills the nan slot with the mode
    # eval_spn_top_down(spn, eval_functions=node_top_down_mpe, parent_result=instance_ids,
    #                   data=data, lls_per_node=lls_per_node)
    # return data
    return a

def naive_factorization(data=None, node_id=0, context=None, scope=None, **kwargs):
    assert scope is not None, "No scope"

    prod_node = Product()
    prod_node.scope = scope
    prod_node.id = node_id

    y, x = get_YX(data, context.feature_size)

    result = []
    for i, rv in enumerate(scope):
        prod_node.children.append(None)
        data_slice = concatenate_yx(y[:, i].reshape(-1, 1), x)
        result.append((
            SplittingOperations.CREATE_LEAF_NODE,
            {
                "data": data_slice,
                "parent_id": prod_node.id,
                "pos": len(prod_node.children) - 1,
                "scope": [rv],
            },
        ))
    return prod_node, result

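
# A hypothetical usage sketch (not part of the library): with two output
# variables and feature_size conditioning columns, naive_factorization yields
# one CREATE_LEAF_NODE request per variable, each carrying that variable's
# column plus all of x. Assumes Context() can be built bare with feature_size
# assigned afterwards, as the tests above do.
def _example_naive_factorization():
    ctx = Context()
    ctx.feature_size = 2
    data = np.arange(20, dtype=float).reshape(5, 4)  # 2 target + 2 feature columns
    prod_node, ops = naive_factorization(data=data, node_id=0, context=ctx, scope=[0, 1])
    assert len(ops) == 2
    assert all(op == SplittingOperations.CREATE_LEAF_NODE for op, _ in ops)
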
def predict(self, data, y=None):
    # params = self.predict_params(data, y)
    if self.parametric_type == Gaussian:
        y, x = get_YX(data, feature_size=self.feature_size)
        # return params
        label = np.ones((len(y), 1))  # unused
        return self.coefficients.forward(x, smudge=0)[0]
    # note: non-Gaussian parametric types are not handled and fall through to None

def split_conditional_rows_KMeans(local_data, ds_context, scope):
    y, x = get_YX(local_data, ds_context.feature_size)
    data = preproc(y, ds_context, pre_proc, ohe)

    clusters = KMeans(n_clusters=n_clusters, random_state=seed, precompute_distances=True).fit_predict(data)

    return split_data_by_clusters(local_data, clusters, scope, rows=True)

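
# A minimal sketch, assuming split_data_by_clusters follows the convention seen
# in create_conditional_slice below: partition the rows by cluster id and return
# (data_slice, scope, weight) triples, where weight is the slice's share of the
# rows. The real function lives elsewhere in the package; this hypothetical
# _reference_ version only documents the assumed contract for row splits.
def _reference_split_data_by_clusters(local_data, clusters, scope, rows=True):
    assert rows, "this sketch only covers row splits"
    slices = []
    for label in np.unique(clusters):
        idx = clusters == label
        slices.append((local_data[idx, :], scope, np.sum(idx) / len(clusters)))
    return slices
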
def test_create_conditional(self):
    np.random.seed(17)
    data = np.arange(0, 1000).reshape(-1, 8)
    parent = Sum()
    parent.children.append(None)
    ctx = Context()
    ctx.feature_size = 4
    scope = [1, 3, 4, 6]
    data2 = np.array(data)

    K = int(data.shape[0] * 0.25)
    split_idx = np.array([0] * K + [1] * (data.shape[0] - K))
    np.random.shuffle(split_idx)

    y, x = get_YX(data, 4)

    def label_conditional(local_y, local_x):
        self.assertListEqual(local_y.tolist(), y.tolist())
        self.assertListEqual(local_x.tolist(), x.tolist())
        return split_idx

    result = create_conditional(data=data2, parent=parent, pos=0, context=ctx,
                                scope=list(scope), label_conditional=label_conditional)

    self.assertListEqual(data.tolist(), data2.tolist())
    self.assertEqual(len(result), 2)
    for i, r in enumerate(result):
        self.assertEqual(r[0], SplittingOperations.GET_NEXT_OP)
        self.assertIn('data', r[1])
        self.assertEqual(parent.children[0], r[1]['parent'])
        self.assertEqual(r[1]['pos'], i)
        self.assertListEqual(scope, r[1]['scope'])
        self.assertEqual(r[1]['data'].shape[1], data.shape[1])

    conditional_node = result[0][1]['parent']
    child_idx = conditional_supervised_likelihood(
        conditional_node,
        [np.zeros((data.shape[0], 1)), np.ones((data.shape[0], 1))],
        data)

    self.assertListEqual(result[0][1]['data'].tolist(), data[child_idx[:, 0] == 0, :].tolist())
    self.assertListEqual(result[1][1]['data'].tolist(), data[child_idx[:, 0] == 1, :].tolist())

def predict_proba(self, data):
    y, X = get_YX(data, self.feature_size)
    params = self.predict_params(X, y)

    if self.parametric_type == Categorical:
        result = np.zeros((data.shape[0], 1))
        for j, c in enumerate(self.classes_):
            idx = y == c
            result[idx] = params[idx[:, 0], j]
        return result

    return self.proba_func(y, params)

def remove_non_informative_features(data=None, node_id=0, scope=None, context=None,
                                    uninformative_features_idx=None, **kwargs):
    assert uninformative_features_idx is not None, "parameter uninformative_features_idx can't be None"

    prod_node = Product()
    prod_node.scope = scope
    prod_node.id = node_id

    y, x = get_YX(data, context.feature_size)

    non_zero_variance_rvs = []
    non_zero_variance_idx = []
    result = []
    for idx, zero_var in enumerate(uninformative_features_idx):
        rv = scope[idx]
        if not zero_var:
            non_zero_variance_rvs.append(rv)
            non_zero_variance_idx.append(idx)
            continue
        # zero-variance variable: attach it directly as a leaf child
        prod_node.children.append(None)
        data_slice = concatenate_yx(y[:, idx].reshape(-1, 1), x)
        result.append((
            SplittingOperations.CREATE_LEAF_NODE,
            {
                "data": data_slice,
                "parent_id": prod_node.id,
                "pos": len(prod_node.children) - 1,
                "scope": [rv],
            },
        ))
    assert len(result) > 0

    if len(non_zero_variance_idx) > 0:
        # continue structure learning on the remaining informative variables
        prod_node.children.append(None)
        result.append((
            SplittingOperations.GET_NEXT_OP,
            {
                "data": concatenate_yx(data[:, non_zero_variance_idx], x),
                "parent_id": prod_node.id,
                "pos": len(prod_node.children) - 1,
                "scope": non_zero_variance_rvs,
            },
        ))
    return prod_node, result

def create_conditional_slice(local_data, feature_size, scope, label_conditional):
    cluster_labels = label_conditional(*get_YX(local_data, feature_size))

    unique_labels = np.unique(cluster_labels)
    if len(unique_labels) == 1:
        return None, None
    assert len(unique_labels) == 2

    features = get_X(local_data, feature_size)
    assert features.shape[0] == cluster_labels.shape[0]

    node = SupervisedOr(feature_size=feature_size)
    node.scope.extend(scope)

    # from sklearn.linear_model import LogisticRegression
    #
    # node.classifier = LogisticRegression(
    #     C=1,
    #     max_iter=10000,
    #     fit_intercept=True,
    #     tol=1e-15,
    #     class_weight="balanced",
    #     solver="lbfgs",
    # )
    #
    # # if local_data.shape[0] < 1000:
    # #     idx = np.random.randint(low=0, high=local_data.shape[0], size=1000)
    # #     node.classifier.fit(features[idx], cluster_labels[idx])
    # # else:
    # node.classifier.fit(features, cluster_labels)
    #
    # slice_idx = node.classifier.predict(features)
    #
    # if len(np.unique(slice_idx)) == 1:
    #     return None, None
    #
    # data_slices = []
    # idx = slice_idx == 0
    # data_slices.append((local_data[idx, :], scope, np.sum(idx) / len(slice_idx)))
    #
    # idx = slice_idx == 1
    # data_slices.append((local_data[idx, :], scope, np.sum(idx) / len(slice_idx)))

    # data_slices was previously only built in the commented-out classifier path
    # above, leaving it undefined here; rebuild it directly from the conditional
    # labels, mirroring the commented-out construction
    data_slices = []
    for label in unique_labels:
        idx = cluster_labels == label
        data_slices.append((local_data[idx, :], scope, np.sum(idx) / len(cluster_labels)))

    return node, data_slices

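
# A minimal sketch of a label_conditional callable as used above and in
# test_create_conditional: it receives (y, x) from get_YX and must return one
# cluster label per row. This hypothetical example clusters on the targets with
# KMeans; the repository's actual labelers may differ.
def _kmeans_label_conditional(y, x):
    from sklearn.cluster import KMeans
    # two-way split on the target columns only; ignores the conditional features
    return KMeans(n_clusters=2, random_state=17).fit_predict(y)
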
def learn_cspn_structure(train_data, ds_context, op_lambdas=_conditional_op_lambdas,
                         split_rows=None, split_cols=None, create_leaf=None, **kwargs):
    y, x = get_YX(train_data, ds_context.feature_size)
    return learn_structure(train_data, ds_context, compress=False, scope=list(range(y.shape[1])),
                           op_lambdas=op_lambdas, split_rows=split_rows, split_cols=split_cols,
                           create_leaf=create_leaf, **kwargs)

def supervised_leaf_likelihood(node, data=None, dtype=np.float64):
    assert len(node.scope) == 1, node.scope

    y, x = get_YX(data, node.feature_size)
    y = y[:, node.scope]

    probs = np.ones((y.shape[0], 1), dtype=dtype)

    # rows with a NaN target are marginalized and keep probability 1
    marg_ids = np.isnan(y[:, 0])
    if np.sum(~marg_ids) > 0:
        observations_data = concatenate_yx(y[~marg_ids], x[~marg_ids])
        probs[~marg_ids] = node.predictor.predict_proba(observations_data)

    # avoid exact zeros so downstream log-likelihoods stay finite
    probs[np.isclose(probs, 0)] = 1e-9

    return probs

def test_remove_non_informative_features(self):
    np.random.seed(17)
    data = np.arange(0, 1000).reshape(-1, 8)
    data[:, 1] = 1
    data[:, 3] = 3
    parent = Sum()
    parent.children.append(None)
    ctx = Context()
    ctx.feature_size = 4
    scope = [1, 3, 4, 6]
    data2 = np.array(data)

    y, x = get_YX(data, 4)
    uninformative_features_idx = np.var(y, 0) == 0
    result = remove_non_informative_features(
        data=data2, parent=parent, pos=0, context=ctx, scope=list(scope),
        uninformative_features_idx=uninformative_features_idx)

    self.assertListEqual(data.tolist(), data2.tolist())
    self.assertEqual(len(parent.children[0].children), len(result))

    resulting_scopes = [[3], [6], [1, 4]]
    resulting_data_y = [y[:, 1], y[:, 3], y[:, [0, 2]]]

    for i, r in enumerate(result):
        self.assertEqual(len(r), 2)
        self.assertEqual(type(r[1]['parent']), Product)
        self.assertEqual(parent.children[0], r[1]['parent'])
        self.assertListEqual(r[1]['scope'], resulting_scopes[i])
        self.assertEqual(r[1]['pos'], i)
        self.assertListEqual(
            r[1]['data'].tolist(),
            concatenate_yx(resulting_data_y[i], x).tolist())

def create_conditional_leaf(data, context, scope):
    assert len(scope) == 1, "scope of univariate parametric for more than one variable?"

    feature_size = context.feature_size
    parametric_type = context.parametric_types[scope[0]]

    predictor = CSPNLinearModel(parametric_type=parametric_type, feature_size=feature_size)
    node = SupervisedLeaf(scope=scope, predictor=predictor, parametric_type=parametric_type,
                          feature_size=feature_size)
    # node = SupervisedLeaf(data=data, scope=scope, predictor=None,
    #                       parametric_type=parametric_type, feature_size=feature_size)

    y, X = get_YX(data, node.feature_size)
    node.predictor.fit(X, y)
    return node

def split_rows_Gower(local_data, ds_context, scope):
    y, x = get_YX(local_data, ds_context.feature_size)
    data = preproc(y, ds_context, pre_proc, False)

    feature_types = []
    for s in scope:
        mt = ds_context.meta_types[s]
        if mt == MetaType.BINARY:
            feature_types.append("categorical")
        elif mt == MetaType.DISCRETE:
            feature_types.append("discrete")
        else:
            feature_types.append("continuous")

    try:
        df = robjects.r["as.data.frame"](data)
        clusters = robjects.r["mixedclustering"](df, feature_types, n_clusters, seed)
        clusters = np.asarray(clusters)
    except Exception as e:
        np.savetxt("/tmp/errordata.txt", local_data)
        raise e

    return split_data_by_clusters(local_data, clusters, scope, rows=True)

def next_operation(
        data=None,
        parent_id=0,
        parent_type=None,
        pos=0,
        scope=None,
        no_clusters=False,
        no_splitting=False,
        no_independencies=True,
        # is_first=False,
        is_first=True,
        cluster_first=True,
        cluster_univariate=False,
        # cluster_univariate=True,
        min_features_slice=1,
        min_splitting_instances=500,
        min_clustering_instances=500,
        context=None,
        allow_sum_nodes=True,
        allow_conditioning_nodes=True,
        remove_uninformative_features=False,
        **kwargs):
    y, x = get_YX(data, context.feature_size)

    isMinimalFeatures = y.shape[1] <= min_features_slice
    isMinimalClusteringInstances = y.shape[0] <= min_clustering_instances
    isMinimalSplittingInstances = y.shape[0] <= min_splitting_instances

    result_params = {
        "data": data,
        "parent_id": parent_id,
        "pos": pos,
        "scope": scope,
        "no_clusters": no_clusters,
        "no_independencies": no_independencies,
    }

    uninformative_features = np.var(y, 0) == 0
    if remove_uninformative_features and np.any(uninformative_features):
        result_op = SplittingOperations.REMOVE_UNINFORMATIVE_FEATURES
        result_params["uninformative_features_idx"] = uninformative_features
        return None, [(result_op, result_params)]

    # if not isMinimalSplittingInstances and not no_splitting:
    #     # split as much as you can
    #     result_op = SplittingOperations.CREATE_CONDITIONAL_NODE
    #     # result_op = SplittingOperations.CREATE_SUM_NODE
    #     return None, [(result_op, result_params)]

    if isMinimalFeatures:
        if isMinimalClusteringInstances or no_clusters:
            return None, [(SplittingOperations.CREATE_LEAF_NODE, result_params)]
        else:
            if cluster_univariate:
                return None, [(SplittingOperations.CREATE_SUM_NODE, result_params)]
            else:
                return None, [(SplittingOperations.CREATE_LEAF_NODE, result_params)]

    if isMinimalClusteringInstances or (no_clusters and no_independencies):
        return None, [(SplittingOperations.NAIVE_FACTORIZATION, result_params)]
    if no_independencies:
        return None, [(SplittingOperations.CREATE_SUM_NODE, result_params)]
    if no_clusters:
        return None, [(SplittingOperations.CREATE_PRODUCT_NODE, result_params)]

    if is_first:
        if cluster_first:
            return None, [(SplittingOperations.CREATE_SUM_NODE, result_params)]
        else:
            return None, [(SplittingOperations.CREATE_PRODUCT_NODE, result_params)]

    return None, [(SplittingOperations.CREATE_PRODUCT_NODE, result_params)]

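
# A hypothetical walkthrough (not part of the library) of next_operation's
# routing, assuming a bare Context with feature_size assigned as in the tests:
# with only 10 rows the minimal-clustering-instances guard fires and the slice
# is routed to naive factorization instead of clustering.
def _example_next_operation_routing():
    ctx = Context()
    ctx.feature_size = 2
    small = np.arange(40, dtype=float).reshape(10, 4)  # 2 target + 2 feature columns
    _, ops = next_operation(data=small, scope=[0, 1], context=ctx)
    assert ops[0][0] == SplittingOperations.NAIVE_FACTORIZATION
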