def create_SPN2():
    from spn.structure.Base import assign_ids
    from spn.structure.Base import rebuild_scopes_bottom_up
    from spn.algorithms.Validity import is_valid
    from spn.structure.leaves.parametric.Parametric import Categorical
    from spn.structure.Base import Sum, Product

    p0 = Product(children=[
        Categorical(p=[0.3, 0.7], scope=1),
        Categorical(p=[0.4, 0.6], scope=2)
    ])
    p1 = Product(children=[
        Categorical(p=[0.5, 0.5], scope=1),
        Categorical(p=[0.6, 0.4], scope=2)
    ])
    s1 = Sum(weights=[0.3, 0.7], children=[p0, p1])

    p2 = Product(children=[Categorical(p=[0.2, 0.8], scope=0), s1])
    p3 = Product(children=[
        Categorical(p=[0.2, 0.8], scope=0),
        Categorical(p=[0.3, 0.7], scope=1)
    ])
    p4 = Product(children=[p3, Categorical(p=[0.4, 0.6], scope=2)])

    spn = Sum(weights=[0.4, 0.6], children=[p2, p4])

    assign_ids(spn)
    rebuild_scopes_bottom_up(spn)

    val, msg = is_valid(spn)
    assert val, msg

    return spn
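# Usage sketch (not part of the original snippet): evaluating the SPN returned by
# create_SPN2() with SPFlow's inference API. The query values below are illustrative
# assumptions; np.nan entries would marginalize the corresponding variable out.
import numpy as np
from spn.algorithms.Inference import log_likelihood

spn = create_SPN2()
test_data = np.array([[0.0, 1.0, 0.0]])  # one joint query over scopes 0, 1, 2
print(log_likelihood(spn, test_data))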
def _deserialize_sum(self, node, node_map):
    child_ids = node.sum.children
    # Resolve references to child nodes by ID.
    children = [node_map.get(child_id) for child_id in child_ids]
    # Check that all children have been resolved.
    assert None not in children, "Child node ID could not be resolved"
    sum_node = Sum(children=children, weights=node.sum.weights)
    sum_node.id = node.id
    return sum_node
def create_sum(data=None, node_id=0, parent_id=0, pos=0, context=None, scope=None,
               split_rows=None, split_on_sum=True, **kwargs):
    assert split_rows is not None, "No split_rows lambda"
    assert scope is not None, "No scope"

    result = []

    data_slices = split_rows(data, context, scope)

    if len(data_slices) == 1:
        result.append((
            SplittingOperations.GET_NEXT_OP,
            {
                "data": data,
                "parent_id": parent_id,
                "pos": pos,
                "no_clusters": True,
                "scope": scope,
            },
        ))
        return result

    node = Sum()
    node.scope.extend(scope)
    node.id = node_id
    # assert parent.scope == node.scope

    for data_slice, scope_slice, proportion in data_slices:
        assert isinstance(scope_slice, list), "slice must be a list"

        child_data = data
        if split_on_sum:
            child_data = data_slice

        node.children.append(None)
        node.weights.append(proportion)
        result.append((
            SplittingOperations.GET_NEXT_OP,
            {
                "data": child_data,
                "parent_id": node.id,
                "pos": len(node.children) - 1,
                "scope": scope,
            },
        ))

    return node, result
def tree_to_spn(tree, features):
    tnode = tree.data

    if tnode == "sumnode":
        node = Sum()
        for i in range(int(len(tree.children) / 2)):
            j = 2 * i
            w, c = tree.children[j], tree.children[j + 1]
            node.weights.append(float(w))
            node.children.append(tree_to_spn(c, features))
        return node

    if tnode == "prodnode":
        if len(tree.children) == 1:
            return tree_to_spn(tree.children[0], features)
        node = Product()
        for c in tree.children:
            node.children.append(tree_to_spn(c, features))
        return node

    if tnode in str_to_spn_lambdas:
        return str_to_spn_lambdas[tnode][0](tree, features,
                                            str_to_spn_lambdas[tnode][2],
                                            tree_to_spn)

    raise Exception("Node type not registered: " + tnode)
def build_recursive(dep_tree, table_keys, scopes, attribute_owners,
                    path_constraints=None, cache=None):
    if path_constraints is None:
        path_constraints = []

    new_node = Sum()

    for (table_names_keys, dep_node) in get_dependncy_keys(dep_tree, table_keys,
                                                           attribute_owners,
                                                           path_constraints):
        for constraint_configuration, cached_node_count in get_constraint_values(
                table_names_keys, path_constraints, cache):
            p_node = Product()
            new_node.children.append(p_node)

            count_value = 1
            for cached_node, node_count in cached_node_count:
                p_node.children.append(cached_node)
                count_value *= node_count

            for dep_children_node in dep_node.children:
                if dep_children_node.name[0] == '@':
                    continue
                node, count = build_recursive(dep_children_node, table_keys, scopes,
                                              attribute_owners,
                                              path_constraints=constraint_configuration,
                                              cache=cache)
                p_node.children.append(node)
                count_value *= count

            new_node.weights.append(count_value)

    wsum = np.sum(new_node.weights)
    # new_node.weights = [w / wsum for w in new_node.weights]

    return new_node, wsum
def create_spflow_spn(n_feats):
    gaussians1 = []
    gaussians2 = []
    for i in range(n_feats):
        g1 = Gaussian(np.random.randn(), np.random.rand(), scope=i)
        g2 = Gaussian(np.random.randn(), np.random.rand(), scope=i)
        gaussians1.append(g1)
        gaussians2.append(g2)

    prods1 = []
    prods2 = []
    for i in range(0, n_feats, 2):
        p1 = Product([gaussians1[i], gaussians1[i + 1]])
        p2 = Product([gaussians2[i], gaussians2[i + 1]])
        prods1.append(p1)
        prods2.append(p2)

    sums = []
    for i in range(n_feats // 2):
        s = Sum(weights=[0.5, 0.5], children=[prods1[i], prods2[i]])
        sums.append(s)

    spflow_spn = Product(sums)
    assign_ids(spflow_spn)
    rebuild_scopes_bottom_up(spflow_spn)
    return spflow_spn
def test_spn_to_str_and_back(self):
    self.check_obj_and_reconstruction(
        Categorical(p=[0.1, 0.2, 0.7], scope=0))

    self.check_obj_and_reconstruction(Gaussian(mean=0, stdev=10, scope=0))
    self.check_obj_and_reconstruction(
        Gaussian(mean=1.2, stdev=1.5, scope=0))
    self.check_obj_and_reconstruction(Gaussian(mean=-1.2, stdev=1, scope=0))

    gamma = Gamma(alpha=1, beta=2, scope=0)
    lnorm = LogNormal(mean=1, stdev=2, scope=0)

    self.check_obj_and_reconstruction(gamma)
    self.check_obj_and_reconstruction(lnorm)

    root = Sum(children=[gamma, lnorm], weights=[0.2, 0.8])
    assign_ids(root)
    rebuild_scopes_bottom_up(root)
    self.check_obj_and_reconstruction(root)

    root = 0.3 * (Gaussian(mean=0, stdev=1, scope=0) *
                  Gaussian(mean=1, stdev=1, scope=1)) + \
           0.7 * (Gaussian(mean=2, stdev=1, scope=0) *
                  Gaussian(mean=3, stdev=1, scope=1))
    self.check_obj_and_reconstruction(root)
def create_spflow_spn(n_feats, ctype=Gaussian):
    children1 = []
    children2 = []
    for i in range(n_feats):
        if ctype == Gaussian:
            c1 = Gaussian(np.random.randn(), np.random.rand(), scope=i)
            c2 = Gaussian(np.random.randn(), np.random.rand(), scope=i)
        else:
            # c1 = Bernoulli(p=1.0, scope=i)
            # c2 = Bernoulli(p=1.0, scope=i)
            c1 = Bernoulli(p=np.random.rand(), scope=i)
            c2 = Bernoulli(p=np.random.rand(), scope=i)
        children1.append(c1)
        children2.append(c2)

    prods1 = []
    prods2 = []
    for i in range(0, n_feats, 2):
        p1 = Product([children1[i], children1[i + 1]])
        p2 = Product([children2[i], children2[i + 1]])
        prods1.append(p1)
        prods2.append(p2)

    sums = []
    for i in range(n_feats // 2):
        s = Sum(weights=[0.5, 0.5], children=[prods1[i], prods2[i]])
        sums.append(s)

    spflow_spn = Product(sums)
    assign_ids(spflow_spn)
    rebuild_scopes_bottom_up(spflow_spn)
    return spflow_spn
def test_cpu_histogram():
    # Construct a minimal SPN.
    h1 = Histogram([0., 1., 2.], [0.25, 0.75], [1, 1], scope=0)
    h2 = Histogram([0., 3., 6., 8.], [0.35, 0.1, 0.55], [1, 1], scope=1)
    h3 = Histogram([0., 1., 2.], [0.33, 0.67], [1, 1], scope=0)
    h4 = Histogram([0., 5., 8.], [0.875, 0.125], [1, 1], scope=1)
    p0 = Product(children=[h1, h2])
    p1 = Product(children=[h3, h4])
    spn = Sum([0.3, 0.7], [p0, p1])

    inputs = np.column_stack((
        np.random.randint(2, size=30),
        np.random.randint(8, size=30),
    )).astype("float64")

    # Insert some NaN in random places into the input data.
    inputs.ravel()[np.random.choice(inputs.size, 5, replace=False)] = np.nan

    if not CUDACompiler.isAvailable():
        print("Test not supported by the compiler installation")
        return 0

    # Execute the compiled kernel.
    results = CUDACompiler().log_likelihood(spn, inputs)

    # Compute the reference results using the inference from SPFlow.
    reference = log_likelihood(spn, inputs)
    reference = reference.reshape(30)

    # Check the computation results against the reference.
    # Check in normal space if the log-results are not very close to each other.
    assert np.all(np.isclose(results, reference)) or np.all(
        np.isclose(np.exp(results), np.exp(reference)))
def test_naive_factorization(self):
    np.random.seed(17)
    data = np.arange(0, 1000).reshape(-1, 8)

    parent = Sum()
    parent.children.append(None)
    ctx = Context()
    ctx.feature_size = 4
    scope = [1, 3, 4, 6]
    data2 = np.array(data)

    result = naive_factorization(data=data2, parent=parent, pos=0, context=ctx,
                                 scope=list(scope))
    self.assertListEqual(data.tolist(), data2.tolist())
    self.assertEqual(parent.children[0], result[0][1]['parent'])

    y, x = get_YX(data, 4)

    self.assertEqual(len(result), len(scope))
    for i, s in enumerate(scope):
        r = result[i]
        self.assertEqual(len(r), 2)
        self.assertEqual(r[0], SplittingOperations.CREATE_LEAF_NODE)
        self.assertEqual(type(r[1]['parent']), Product)
        self.assertEqual(r[1]['pos'], i)
        self.assertListEqual(r[1]['scope'], [s])
        self.assertListEqual(r[1]['data'].tolist(),
                             concatenate_yx(y[:, i], x).tolist())
def create_disj(data, scope, assignments, alpha):
    unq_data, counts = np.unique(data, axis=0, return_counts=True)
    probs = np.zeros(assignments.shape[0])
    for i in range(assignments.shape[0]):
        index = np.where(np.all(assignments[i] == unq_data, axis=1))[0]
        if len(index):
            probs[i] = counts[index[0]]
    probs = (probs + alpha) / (probs + alpha).sum()

    indicators = {
        var: [Bernoulli(scope=[var], p=0), Bernoulli(scope=[var], p=1)]
        for var in scope
    }

    prods = []
    for i in range(assignments.shape[0]):
        children = []
        for j in range(assignments.shape[1]):
            children.append(indicators[scope[j]][assignments[i, j]])
            # children.append(Bernoulli(scope=[scope[j]], p=assignments[i, j]))
        prods.append(Product(children=children))

    if len(prods) > 1:
        disj = Sum(children=prods, weights=probs)
    else:
        disj = prods[0]

    assign_ids(disj)
    rebuild_scopes_bottom_up(disj)
    return disj
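# Usage sketch (assumption, not from the original snippet): calling create_disj
# with the full truth table over two binary variables. `alpha` acts as an
# additive-smoothing pseudo-count; the data values below are made up for illustration.
import numpy as np

scope = [0, 1]
assignments = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
data = np.array([[0, 1], [1, 1], [1, 1], [1, 0]])
disj = create_disj(data, scope, assignments, alpha=0.1)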
def test_log_vector_histogram():
    # Construct a minimal SPN.
    h1 = Histogram([0., 1., 2.], [0.25, 0.75], [1, 1], scope=0)
    h2 = Histogram([0., 1., 2.], [0.45, 0.55], [1, 1], scope=1)
    h3 = Histogram([0., 1., 2.], [0.33, 0.67], [1, 1], scope=0)
    h4 = Histogram([0., 1., 2.], [0.875, 0.125], [1, 1], scope=1)
    p0 = Product(children=[h1, h2])
    p1 = Product(children=[h3, h4])
    spn = Sum([0.3, 0.7], [p0, p1])

    inputs = np.column_stack((
        np.random.randint(2, size=30),
        np.random.randint(2, size=30),
    )).astype("float64")

    if not CPUCompiler.isVectorizationSupported():
        print("Test not supported by the compiler installation")
        return 0

    # Execute the compiled kernel.
    results = CPUCompiler(maxTaskSize=5).log_likelihood(spn, inputs,
                                                        supportMarginal=False)

    # Compute the reference results using the inference from SPFlow.
    reference = log_likelihood(spn, inputs)
    reference = reference.reshape(30)

    # Check the computation results against the reference.
    # Check in normal space if the log-results are not very close to each other.
    assert np.all(np.isclose(results, reference)) or np.all(
        np.isclose(np.exp(results), np.exp(reference)))
def sum_condition(node, children, input_vals=None, scope=None):
    if not scope.intersection(node.scope):
        return Copy(node), 0

    new_node = Sum()
    new_node.scope = list(set(node.scope) - scope)

    new_weights = []
    probs = []
    for i, c in enumerate(children):
        if c[0]:
            new_node.children.append(c[0])
            new_weights.append(node.weights[i] * np.exp(c[1]))
        else:
            probs.append(node.weights[i] * np.exp(c[1]))

    new_node.weights = [w / sum(new_weights) for w in new_weights]
    assert np.all(np.logical_not(np.isnan(new_node.weights))), 'Found nan weights'

    if not new_node.scope:
        return None, np.log(sum(probs))
    return new_node, np.log(sum(new_weights))
def test_create_conditional(self):
    np.random.seed(17)
    data = np.arange(0, 1000).reshape(-1, 8)

    parent = Sum()
    parent.children.append(None)
    ctx = Context()
    ctx.feature_size = 4
    scope = [1, 3, 4, 6]
    data2 = np.array(data)

    K = int(data.shape[0] * 0.25)
    split_idx = np.array([0] * K + [1] * (data.shape[0] - K))
    np.random.shuffle(split_idx)

    y, x = get_YX(data, 4)

    def label_conditional(local_y, local_x):
        self.assertListEqual(local_y.tolist(), y.tolist())
        self.assertListEqual(local_x.tolist(), x.tolist())
        return split_idx

    result = create_conditional(data=data2, parent=parent, pos=0, context=ctx,
                                scope=list(scope),
                                label_conditional=label_conditional)
    self.assertListEqual(data.tolist(), data2.tolist())

    self.assertEqual(len(result), 2)
    for i, r in enumerate(result):
        self.assertEqual(r[0], SplittingOperations.GET_NEXT_OP)
        self.assertIn('data', r[1])
        self.assertEqual(parent.children[0], r[1]['parent'])
        self.assertEqual(r[1]['pos'], i)
        self.assertListEqual(scope, r[1]['scope'])
        self.assertEqual(r[1]['data'].shape[1], data.shape[1])

    conditional_node = result[0][1]['parent']
    child_idx = conditional_supervised_likelihood(
        conditional_node,
        [np.zeros((data.shape[0], 1)), np.ones((data.shape[0], 1))],
        data)
    self.assertListEqual(result[0][1]['data'].tolist(),
                         data[child_idx[:, 0] == 0, :].tolist())
    self.assertListEqual(result[1][1]['data'].tolist(),
                         data[child_idx[:, 0] == 1, :].tolist())
def __init__(self):
    p0 = Product(children=[
        Categorical(p=[0.3, 0.7], scope=1),
        Categorical(p=[0.4, 0.6], scope=2)
    ])
    p1 = Product(children=[
        Categorical(p=[0.5, 0.5], scope=1),
        Categorical(p=[0.6, 0.4], scope=2)
    ])
    s1 = Sum(weights=[0.3, 0.7], children=[p0, p1])

    p2 = Product(children=[Categorical(p=[0.2, 0.8], scope=0), s1])
    p3 = Product(children=[
        Categorical(p=[0.2, 0.8], scope=0),
        Categorical(p=[0.3, 0.7], scope=1)
    ])
    p4 = Product(children=[p3, Categorical(p=[0.4, 0.6], scope=2)])

    self.spn = Sum(weights=[0.4, 0.6], children=[p2, p4])

    assign_ids(self.spn)
    rebuild_scopes_bottom_up(self.spn)
def SPN_Reshape(node, max_children=2):
    v, err = is_valid(node)
    assert v, err

    nodes = get_nodes_by_type(node, (Product, Sum))

    while len(nodes) > 0:
        n = nodes.pop()

        if len(n.children) <= max_children:
            continue

        # Node has more than max_children children: group them into
        # intermediate nodes to build a max_children-ary hierarchy.
        new_children = []
        new_weights = []
        for i in range(0, len(n.children), max_children):
            children = n.children[i:i + max_children]

            if len(children) > 1:
                if isinstance(n, Product):
                    newChild = Product()
                    for c in children:
                        newChild.scope.extend(c.scope)
                    newChild.children.extend(children)
                    new_children.append(newChild)
                else:  # Sum
                    weights = n.weights[i:i + max_children]
                    branch_weight = sum(weights)
                    new_weights.append(branch_weight)

                    newChild = Sum()
                    newChild.scope.extend(children[0].scope)
                    newChild.children.extend(children)
                    newChild.weights.extend(
                        [w / branch_weight for w in weights])
                    newChild.weights[0] = 1.0 - sum(newChild.weights[1:])
                    new_children.append(newChild)
            else:
                new_children.extend(children)
                if isinstance(n, Sum):
                    new_weights.append(1.0 - sum(new_weights))

        n.children = new_children
        if isinstance(n, Sum):
            n.weights = new_weights

        # Re-enqueue the node; it may still have more than max_children children.
        nodes.append(n)

    assign_ids(node)
    v, err = is_valid(node)
    assert v, err
    return node
def learn_classifier(data, ds_context, spn_learn_wrapper, label_idx, **kwargs):
    spn = Sum()
    for label, count in zip(*np.unique(data[:, label_idx], return_counts=True)):
        branch = spn_learn_wrapper(data[data[:, label_idx] == label, :],
                                   ds_context, **kwargs)
        spn.children.append(branch)
        spn.weights.append(count / data.shape[0])
        spn.scope.extend(branch.scope)

    assign_ids(spn)

    valid, err = is_valid(spn)
    assert valid, "invalid spn: " + err

    return spn
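# Usage sketch (assumption, not from the original code): classifying with the SPN
# returned by learn_classifier by comparing per-class log-likelihoods, mirroring the
# argmax-over-classes pattern used in the training scripts below. The helper name
# predict_classes and its arguments are hypothetical.
import numpy as np
from spn.algorithms.Inference import log_likelihood

def predict_classes(spn, data, label_idx, num_classes):
    # Evaluate the joint log-likelihood once per candidate label and keep,
    # for every row, the label with the highest score.
    scores = np.empty((data.shape[0], num_classes))
    for label in range(num_classes):
        tmp = np.array(data, dtype=np.float64)
        tmp[:, label_idx] = label
        scores[:, label] = log_likelihood(spn, tmp)[:, 0]
    return np.argmax(scores, axis=1)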
def test_create_sum_with_split(self):
    np.random.seed(17)
    data = np.arange(0, 1000).reshape(-1, 8)

    parent = Sum()
    parent.children.append(None)
    ctx = Context()
    ctx.feature_size = 4
    scope = [1, 3, 4, 6]
    data2 = np.array(data)

    K = int(data.shape[0] * 0.25)
    split_idx = np.array([0] * K + [1] * (data.shape[0] - K))
    np.random.shuffle(split_idx)

    def split_rows(data, context, scope):
        result = []
        result.append((data[split_idx == 0, :], scope, 0.25))
        result.append((data[split_idx == 1, :], scope, 0.75))
        return result

    result = create_sum(data=data2, parent=parent, pos=0, context=ctx,
                        scope=list(scope), split_rows=split_rows,
                        split_on_sum=True)
    self.assertListEqual(data.tolist(), data2.tolist())

    self.assertEqual(len(result), 2)
    for i, r in enumerate(result):
        self.assertEqual(r[0], SplittingOperations.GET_NEXT_OP)
        self.assertIn('data', r[1])
        self.assertEqual(parent.children[0], r[1]['parent'])
        self.assertEqual(r[1]['pos'], i)
        self.assertListEqual(scope, r[1]['scope'])
        self.assertEqual(r[1]['data'].shape[1], data.shape[1])
        self.assertEqual(r[1]['data'].shape[0], int(np.sum(split_idx == i)))

    self.assertListEqual(result[0][1]['data'].tolist(),
                         data[split_idx == 0, :].tolist())
    self.assertListEqual(result[1][1]['data'].tolist(),
                         data[split_idx == 1, :].tolist())
    self.assertAlmostEqual(np.sum(parent.children[0].weights), 1.0)
def test_torch_vs_tf_time(self):
    # Create sample data
    from sklearn.datasets.samples_generator import make_blobs
    import tensorflow as tf
    from time import time

    X, y = make_blobs(n_samples=10, centers=3, n_features=2, random_state=0)
    X = X.astype(np.float32)

    # SPFlow implementation
    g00 = Gaussian(mean=0.0, stdev=1.0, scope=0)
    g10 = Gaussian(mean=1.0, stdev=2.0, scope=1)
    g01 = Gaussian(mean=3.0, stdev=2.0, scope=0)
    g11 = Gaussian(mean=5.0, stdev=1.0, scope=1)
    p0 = Product(children=[g00, g10])
    p1 = Product(children=[g01, g11])
    s = Sum(weights=[0.2, 0.8], children=[p0, p1])
    assign_ids(s)
    rebuild_scopes_bottom_up(s)

    # Convert
    tf_spn, data_placeholder, variable_dict = spn_to_tf_graph(s, data=X)
    torch_spn = SumNode.from_spn(s)

    # Optimizer
    lr = 0.001
    tf_optim = tf.train.AdamOptimizer(lr)
    torch_optim = optim.Adam(torch_spn.parameters(), lr)

    t0 = time()
    epochs = 10
    optimize_tf_graph(tf_spn, variable_dict, data_placeholder, X,
                      epochs=epochs, optimizer=tf_optim)
    t1 = time()
    optimize_torch(torch_spn, X, epochs=epochs, optimizer=torch_optim)
    t2 = time()

    print("Tensorflow took: ", t1 - t0)
    print("PyTorch took: ", t2 - t1)
def test_remove_non_informative_features(self):
    np.random.seed(17)
    data = np.arange(0, 1000).reshape(-1, 8)
    data[:, 1] = 1
    data[:, 3] = 3

    parent = Sum()
    parent.children.append(None)
    ctx = Context()
    ctx.feature_size = 4
    scope = [1, 3, 4, 6]
    data2 = np.array(data)

    y, x = get_YX(data, 4)
    uninformative_features_idx = np.var(y, 0) == 0

    result = remove_non_informative_features(
        data=data2, parent=parent, pos=0, context=ctx, scope=list(scope),
        uninformative_features_idx=uninformative_features_idx)
    self.assertListEqual(data.tolist(), data2.tolist())

    self.assertEqual(len(parent.children[0].children), len(result))

    resulting_scopes = [[3], [6], [1, 4]]
    resulting_data_y = [y[:, 1], y[:, 3], y[:, [0, 2]]]

    for i, r in enumerate(result):
        self.assertEqual(len(r), 2)
        self.assertEqual(type(r[1]['parent']), Product)
        self.assertEqual(parent.children[0], r[1]['parent'])
        self.assertListEqual(r[1]['scope'], resulting_scopes[i])
        self.assertEqual(r[1]['pos'], i)
        self.assertListEqual(
            r[1]['data'].tolist(),
            concatenate_yx(resulting_data_y[i], x).tolist())
def test_vector_slp_mini():
    g0 = Gaussian(mean=0.13, stdev=0.5, scope=0)
    g1 = Gaussian(mean=0.14, stdev=0.25, scope=2)
    g2 = Gaussian(mean=0.11, stdev=1.0, scope=3)
    g3 = Gaussian(mean=0.12, stdev=0.75, scope=1)

    spn = Sum(children=[g0, g1, g2, g3], weights=[0.2, 0.4, 0.1, 0.3])

    # Randomly sample input values from Gaussian (normal) distributions.
    num_samples = 100
    inputs = np.column_stack(
        (np.random.normal(loc=0.5, scale=1, size=num_samples),
         np.random.normal(loc=0.125, scale=0.25, size=num_samples),
         np.random.normal(loc=0.345, scale=0.24, size=num_samples),
         np.random.normal(loc=0.456, scale=0.1, size=num_samples))).astype("float64")

    # Compute the reference results using the inference from SPFlow.
    reference = log_likelihood(spn, inputs)
    reference = reference.reshape(num_samples)

    # Compile the kernel with batch size 1 to enable SLP vectorization.
    compiler = CPUCompiler(vectorize=True, computeInLogSpace=True,
                           vectorLibrary="LIBMVEC")
    kernel = compiler.compile_ll(spn=spn, batchSize=1, supportMarginal=False)

    # Execute the compiled kernel.
    time_sum = 0
    for i in range(len(reference)):
        # Check the computation results against the reference.
        start = time.time()
        result = compiler.execute(kernel, inputs=np.array([inputs[i]]))
        time_sum = time_sum + time.time() - start
        print(f"evaluation #{i}: result: {result[0]:16.8f}, reference: {reference[i]:16.8f}",
              end='\r')
        if not np.isclose(result, reference[i]):
            print(f"\nevaluation #{i} failed: result: {result[0]:16.8f}, "
                  f"reference: {reference[i]:16.8f}")
            raise AssertionError()

    print(f"\nExecution of {len(reference)} samples took {time_sum} seconds.")
def test_spn_to_torch(self):
    # SPFlow implementation
    n0 = Gaussian(mean=0.0, stdev=1.0, scope=0)
    n1 = Categorical(p=[0.1, 0.3, 0.6])
    n2 = Sum(weights=[0.1, 0.2, 0.3, 0.4], children=[n0, n1])
    n3 = Product(children=[n0, n1])

    torch_n0 = GaussianNode.from_spn(n0)
    torch_n1 = CategoricalNode.from_spn(n1)
    torch_n2 = SumNode.from_spn(n2)
    torch_n3 = ProductNode.from_spn(n3)

    self.assertEqual(torch_n0.mean, n0.mean)
    self.assertEqual(torch_n0.std, n0.stdev)
    self.assertTrue(
        np.isclose(torch_n1.p.detach().numpy(), n1.p, atol=DELTA).all())
    self.assertTrue(
        np.isclose(torch_n2.weights.detach().numpy(), n2.weights,
                   atol=DELTA).all())
def complete_layers(layer_nodes, current_node_type=Sum, depth=None):
    # All leaves should be at the same depth.
    root_layer = False
    if depth is None:
        root_layer = True
        depth = get_depth(layer_nodes[0])

    if depth == 2:
        return

    children_layer = []
    if current_node_type == Sum:
        for i in range(len(layer_nodes)):
            n = layer_nodes[i]
            assert isinstance(n, Sum)
            for j in range(len(n.children)):
                c = n.children[j]
                if not isinstance(c, Product):
                    n.children[j] = Product([c])
            children_layer.extend(n.children)
        children_layer_type = Product
    elif current_node_type == Product:
        for i in range(len(layer_nodes)):
            n = layer_nodes[i]
            assert isinstance(n, Product)
            for j in range(len(n.children)):
                c = n.children[j]
                if not isinstance(c, Sum):
                    n.children[j] = Sum([1.0], [c])
            children_layer.extend(n.children)
        children_layer_type = Sum
    else:
        raise Exception("Invalid node type: " + str(current_node_type))

    complete_layers(children_layer,
                    current_node_type=children_layer_type,
                    depth=depth - 1)

    if root_layer:
        rebuild_scopes_bottom_up(layer_nodes[0])
        assign_ids(layer_nodes[0])
def test_binary_serialization_roundtrip(tmpdir):
    """Tests the binary serialization for SPFlow SPNs by round-tripping a simple
    SPN through serialization and de-serialization and comparing the graph
    structure before and after serialization & de-serialization."""
    h1 = Histogram([0., 1., 2.], [0.25, 0.75], [1, 1], scope=1)
    h2 = Histogram([0., 1., 2.], [0.45, 0.55], [1, 1], scope=2)
    h3 = Histogram([0., 1., 2.], [0.33, 0.67], [1, 1], scope=1)
    h4 = Histogram([0., 1., 2.], [0.875, 0.125], [1, 1], scope=2)
    p0 = Product(children=[h1, h2])
    p1 = Product(children=[h3, h4])
    spn = Sum([0.3, 0.7], [p0, p1])

    model = SPNModel(spn, featureValueType="uint32")
    query = JointProbability(model)

    binary_file = os.path.join(tmpdir, "test.bin")
    print(f"Test binary file: {binary_file}")

    BinarySerializer(binary_file).serialize_to_file(query)
    deserialized = BinaryDeserializer(binary_file).deserialize_from_file()

    assert isinstance(deserialized, JointProbability)
    assert deserialized.batchSize == query.batchSize
    assert deserialized.errorModel.error == query.errorModel.error
    assert deserialized.errorModel.kind == query.errorModel.kind
    assert deserialized.graph.featureType == model.featureType
    assert deserialized.graph.name == model.name

    deserialized = deserialized.graph.root

    assert get_number_of_nodes(spn) == get_number_of_nodes(deserialized)
    assert get_number_of_nodes(spn, Sum) == get_number_of_nodes(deserialized, Sum)
    assert get_number_of_nodes(spn, Product) == get_number_of_nodes(deserialized, Product)
    assert get_number_of_nodes(spn, Histogram) == get_number_of_nodes(deserialized, Histogram)
    assert get_number_of_edges(spn) == get_number_of_edges(deserialized)
def test_equal_to_tf(self):
    # SPFlow implementation
    g00 = Gaussian(mean=0.0, stdev=1.0, scope=0)
    g10 = Gaussian(mean=1.0, stdev=2.0, scope=1)
    g01 = Gaussian(mean=3.0, stdev=2.0, scope=0)
    g11 = Gaussian(mean=5.0, stdev=1.0, scope=1)
    p0 = Product(children=[g00, g10])
    p1 = Product(children=[g01, g11])
    s = Sum(weights=[0.2, 0.8], children=[p0, p1])
    assign_ids(s)
    rebuild_scopes_bottom_up(s)

    # Test for 100 random samples
    data = np.random.randn(100, 2)

    # LL from SPN
    ll = log_likelihood(s, data)

    # PyTorch implementation
    g00 = GaussianNode(mean=0.0, std=1.0, scope=0)
    g10 = GaussianNode(mean=1.0, std=2.0, scope=1)
    g01 = GaussianNode(mean=3.0, std=2.0, scope=0)
    g11 = GaussianNode(mean=5.0, std=1.0, scope=1)
    p0 = ProductNode(children=[g00, g10])
    p1 = ProductNode(children=[g01, g11])
    rootnode = SumNode(weights=[0.2, 0.8], children=[p0, p1])

    datatensor = torch.Tensor(data)
    # LL from PyTorch
    ll_torch = rootnode(datatensor)

    # Assert equality
    self.assertTrue(
        np.isclose(np.array(ll).squeeze(), ll_torch.detach().numpy(),
                   atol=DELTA).all())
def learn_structure(
    dataset,
    ds_context,
    split_rows,
    split_cols,
    create_leaf,
    next_operation=get_next_operation(),
    initial_scope=None,
    data_slicer=default_slicer,
):
    assert dataset is not None
    assert ds_context is not None
    assert split_rows is not None
    assert split_cols is not None
    assert create_leaf is not None
    assert next_operation is not None

    root = Product()
    root.children.append(None)

    if initial_scope is None:
        initial_scope = list(range(dataset.shape[1]))
        num_conditional_cols = None
    elif len(initial_scope) < dataset.shape[1]:
        num_conditional_cols = dataset.shape[1] - len(initial_scope)
    else:
        num_conditional_cols = None
        # The scope cannot contain more variables than the data has columns.
        assert len(initial_scope) <= dataset.shape[1], "check initial scope: %s" % initial_scope

    tasks = deque()
    tasks.append((dataset, root, 0, initial_scope, False, False))

    while tasks:
        local_data, parent, children_pos, scope, no_clusters, no_independencies = tasks.popleft()

        operation, op_params = next_operation(
            local_data,
            scope,
            create_leaf,
            no_clusters=no_clusters,
            no_independencies=no_independencies,
            is_first=(parent is root),
        )

        logging.debug("OP: {} on slice {} (remaining tasks {})".format(
            operation, local_data.shape, len(tasks)))

        if operation == Operation.REMOVE_UNINFORMATIVE_FEATURES:
            node = Product()
            node.scope.extend(scope)
            parent.children[children_pos] = node

            rest_scope = set(range(len(scope)))
            for col in op_params:
                rest_scope.remove(col)
                node.children.append(None)
                tasks.append((
                    data_slicer(local_data, [col], num_conditional_cols),
                    node,
                    len(node.children) - 1,
                    [scope[col]],
                    True,
                    True,
                ))

            next_final = False
            if len(rest_scope) == 0:
                continue
            elif len(rest_scope) == 1:
                next_final = True

            node.children.append(None)
            c_pos = len(node.children) - 1

            rest_cols = list(rest_scope)
            rest_scope = [scope[col] for col in rest_scope]

            tasks.append((
                data_slicer(local_data, rest_cols, num_conditional_cols),
                node,
                c_pos,
                rest_scope,
                next_final,
                next_final,
            ))
            continue

        elif operation == Operation.SPLIT_ROWS:
            split_start_t = perf_counter()
            data_slices = split_rows(local_data, ds_context, scope)
            split_end_t = perf_counter()
            logging.debug("\t\tfound {} row clusters (in {:.5f} secs)".format(
                len(data_slices), split_end_t - split_start_t))

            if len(data_slices) == 1:
                tasks.append((local_data, parent, children_pos, scope, True, False))
                continue

            node = Sum()
            node.scope.extend(scope)
            parent.children[children_pos] = node
            # assert parent.scope == node.scope

            for data_slice, scope_slice, proportion in data_slices:
                assert isinstance(scope_slice, list), "slice must be a list"

                node.children.append(None)
                node.weights.append(proportion)
                tasks.append((data_slice, node, len(node.children) - 1, scope, False, False))
            continue

        elif operation == Operation.SPLIT_COLUMNS:
            split_start_t = perf_counter()
            data_slices = split_cols(local_data, ds_context, scope)
            split_end_t = perf_counter()
            logging.debug("\t\tfound {} col clusters (in {:.5f} secs)".format(
                len(data_slices), split_end_t - split_start_t))

            if len(data_slices) == 1:
                tasks.append((local_data, parent, children_pos, scope, False, True))
                assert np.shape(data_slices[0][0]) == np.shape(local_data)
                assert data_slices[0][1] == scope
                continue

            node = Product()
            node.scope.extend(scope)
            parent.children[children_pos] = node

            for data_slice, scope_slice, _ in data_slices:
                assert isinstance(scope_slice, list), "slice must be a list"

                node.children.append(None)
                tasks.append((data_slice, node, len(node.children) - 1, scope_slice, False, False))
            continue

        elif operation == Operation.NAIVE_FACTORIZATION:
            node = Product()
            node.scope.extend(scope)
            parent.children[children_pos] = node

            local_tasks = []
            local_children_params = []
            split_start_t = perf_counter()
            for col in range(len(scope)):
                node.children.append(None)
                # tasks.append((data_slicer(local_data, [col], num_conditional_cols), node, len(node.children) - 1, [scope[col]], True, True))
                local_tasks.append(len(node.children) - 1)
                child_data_slice = data_slicer(local_data, [col], num_conditional_cols)
                local_children_params.append((child_data_slice, ds_context, [scope[col]]))

            # Create the leaf nodes in parallel; `pool` is a module-level multiprocessing pool.
            result_nodes = pool.starmap(create_leaf, local_children_params)
            # result_nodes = []
            # for l in tqdm(local_children_params):
            #     result_nodes.append(create_leaf(*l))
            # result_nodes = [create_leaf(*l) for l in local_children_params]

            for child_pos, child in zip(local_tasks, result_nodes):
                node.children[child_pos] = child

            split_end_t = perf_counter()
            logging.debug("\t\tnaive factorization {} columns (in {:.5f} secs)".format(
                len(scope), split_end_t - split_start_t))

            continue

        elif operation == Operation.CREATE_LEAF:
            leaf_start_t = perf_counter()
            node = create_leaf(local_data, ds_context, scope)
            parent.children[children_pos] = node
            leaf_end_t = perf_counter()
            logging.debug("\t\t created leaf {} for scope={} (in {:.5f} secs)".format(
                node.__class__.__name__, scope, leaf_end_t - leaf_start_t))

        else:
            raise Exception("Invalid operation: " + operation)

    node = root.children[0]
    assign_ids(node)

    valid, err = is_valid(node)
    assert valid, "invalid spn: " + err

    node = Prune(node)

    valid, err = is_valid(node)
    assert valid, "invalid spn: " + err

    return node
for z in range(10):
    data.append([x, y, z, int(((x + y + z) / 5))])

data = np.array(data).astype(float)

types = [
    MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE, MetaType.DISCRETE
]
ds_context = Context(meta_types=types)
ds_context.parametric_types = [Gaussian, Gaussian, Gaussian, Categorical]
ds_context.add_domains(data)

num_classes = len(np.unique(data[:, 3]))

# spn = learn_mspn(data, ds_context, min_instances_slice=10, leaves=create_leaf, threshold=0.3)

spn = Sum()
for label, count in zip(*np.unique(data[:, 3], return_counts=True)):
    branch = learn_mspn(data[data[:, 3] == label, :], ds_context,
                        min_instances_slice=10, leaves=create_leaf, threshold=0.1)
    spn.children.append(branch)
    spn.weights.append(count / data.shape[0])
    spn.scope.extend(branch.scope)

print("learned")

prediction = []
def train_spn(window_size=3, min_instances_slice=10000, features=None,
              number_of_classes=3):
    if features is None:
        features = [20, 120]

    add_parametric_inference_support()
    add_parametric_text_support()

    data = get_data_in_window(window_size=window_size, features=features,
                              three_classes=number_of_classes == 3)

    sss = sk.model_selection.StratifiedShuffleSplit(test_size=0.2, train_size=0.8,
                                                    random_state=42)
    for train_index, test_index in sss.split(
            data[:, 0:window_size * window_size * len(features)],
            data[:, (window_size * window_size * len(features)) +
                 (int(window_size * window_size / 2))]):
        X_train, X_test = data[train_index], data[test_index]

    context_list = list()
    parametric_list = list()
    number_of_features = len(features)
    for _ in range(number_of_features * window_size * window_size):
        context_list.append(MetaType.REAL)
        parametric_list.append(Gaussian)

    for _ in range(window_size * window_size):
        context_list.append(MetaType.DISCRETE)
        parametric_list.append(Categorical)

    ds_context = Context(meta_types=context_list)
    ds_context.add_domains(data)
    ds_context.parametric_types = parametric_list

    spn = load_spn(window_size, features, min_instances_slice, number_of_classes)
    if spn is None:
        spn = Sum()
        for class_pixel in tqdm(range(-window_size * window_size, 0)):
            for label, count in zip(
                    *np.unique(data[:, class_pixel], return_counts=True)):
                train_data = X_train[X_train[:, class_pixel] == label, :]
                branch = learn_parametric(
                    train_data, ds_context,
                    min_instances_slice=min_instances_slice)
                spn.children.append(branch)
                spn.weights.append(train_data.shape[0])
                spn.scope.extend(branch.scope)
        spn.weights = (np.array(spn.weights) / sum(spn.weights)).tolist()

        assign_ids(spn)
        save_spn(spn, window_size, features, min_instances_slice, number_of_classes)

    res = np.ndarray((X_test.shape[0], number_of_classes))

    for i in tqdm(range(number_of_classes)):
        tmp = X_test.copy()
        tmp[:, -int((window_size ** 2) / 2)] = i
        res[:, i] = log_likelihood(spn, tmp)[:, 0]

    predicted_classes = np.argmax(res, axis=1).reshape((X_test.shape[0], 1))

    correct_predicted = 0
    for x, y in zip(X_test[:, -5], predicted_classes):
        if x == y[0]:
            correct_predicted += 1
    accuracy = correct_predicted / X_test.shape[0]

    return spn, accuracy
import numpy as np
from collections import OrderedDict

from spn.structure.Base import Sum, Product
from spn.structure.leaves.parametric.Parametric import *
from spn.structure.StatisticalTypes import MetaType, Type
from spn.structure.leaves.parametric.Text import add_parametric_text_support
from spn.io.Text import to_JSON, spn_to_str_equation
from spn.structure.leaves.parametric.Inference import add_parametric_inference_support

#
# create an SPN over three random variables X_1, X_2, X_3
from spn.structure.leaves.typedleaves.Text import add_typed_leaves_text_support
from spn.structure.leaves.typedleaves.TypedLeaves import type_mixture_leaf_factory

add_typed_leaves_text_support()
add_parametric_inference_support()

#
# root is a sum
root = Sum()

#
# two product nodes
l_prod = Product()
r_prod = Product()
root.children = [l_prod, r_prod]
root.weights = np.array([0.75, 0.25])

#
# priors, but useless
pm_continuous_param_map = OrderedDict({
    Type.REAL: OrderedDict({Gaussian: {
        'params': {
            'mean': 5,
def learn_structure_cnet(
    dataset,
    ds_context,
    conditioning,
    create_leaf,
    next_operation_cnet=get_next_operation_cnet(),
    initial_scope=None,
    data_slicer=default_slicer,
):
    assert dataset is not None
    assert ds_context is not None
    assert create_leaf is not None
    assert next_operation_cnet is not None

    root = Product()
    root.children.append(None)

    if initial_scope is None:
        initial_scope = list(range(dataset.shape[1]))

    tasks = deque()
    tasks.append((dataset, root, 0, initial_scope))

    while tasks:
        local_data, parent, children_pos, scope = tasks.popleft()

        operation, op_params = next_operation_cnet(local_data, scope)

        logging.debug("OP: {} on slice {} (remaining tasks {})".format(
            operation, local_data.shape, len(tasks)))

        if operation == Operation.CONDITIONING:
            from spn.algorithms.splitting.Base import split_data_by_clusters

            conditioning_start_t = perf_counter()

            col_conditioning, found_conditioning = conditioning(local_data)

            if not found_conditioning:
                node = create_leaf(local_data, ds_context, scope)
                parent.children[children_pos] = node
                continue

            clusters = (local_data[:, col_conditioning] == 1).astype(int)
            data_slices = split_data_by_clusters(local_data, clusters, scope, rows=True)

            node = Sum()
            node.scope.extend(scope)
            parent.children[children_pos] = node

            for data_slice, scope_slice, proportion in data_slices:
                assert isinstance(scope_slice, list), "slice must be a list"

                node.weights.append(proportion)

                product_node = Product()
                node.children.append(product_node)
                node.children[-1].scope.extend(scope)

                right_data_slice = np.hstack(
                    (data_slice[:, :col_conditioning],
                     data_slice[:, (col_conditioning + 1):])).reshape(
                         data_slice.shape[0], data_slice.shape[1] - 1)
                product_node.children.append(None)
                tasks.append((
                    right_data_slice,
                    product_node,
                    len(product_node.children) - 1,
                    scope_slice[:col_conditioning] + scope_slice[col_conditioning + 1:],
                ))

                left_data_slice = data_slice[:, col_conditioning].reshape(
                    data_slice.shape[0], 1)
                product_node.children.append(None)
                tasks.append((left_data_slice, product_node,
                              len(product_node.children) - 1,
                              [scope_slice[col_conditioning]]))

            conditioning_end_t = perf_counter()
            logging.debug("\t\tconditioning (in {:.5f} secs)".format(
                conditioning_end_t - conditioning_start_t))

            continue

        elif operation == Operation.CREATE_LEAF:
            cltree_start_t = perf_counter()
            node = create_leaf(local_data, ds_context, scope)
            parent.children[children_pos] = node
            cltree_end_t = perf_counter()

        else:
            raise Exception("Invalid operation: " + operation)

    node = root.children[0]
    assign_ids(node)

    valid, err = is_valid(node)
    assert valid, "invalid spn: " + err

    node = Prune(node)

    valid, err = is_valid(node)
    assert valid, "invalid spn: " + err

    return node