def update_cltree_parameters_mle(node, data, alpha=0.01):
    """Learn the structure and parameters of a CLTree and store them on ``node``.

    Writes ``node.tree`` (parent index per feature, ``-1`` for the root),
    ``node.df_order``, ``node.post_order`` and ``node.log_factors`` (nested
    lists indexed as ``[feature][feature_value][parent_value]``).
    ``alpha`` is the smoothing constant.
    """
    n_feats = node.n_features
    factors = np.zeros((n_feats, 2, 2))

    if n_feats == 1:
        # A single variable has no parent: store its smoothed marginal for
        # both values of the (dummy) parent slot.
        prob_one = (data.sum() + 2 * alpha) / (len(data) + 4 * alpha)
        factors[0, 0, :] = np.log(1 - prob_one)
        factors[0, 1, :] = np.log(prob_one)
        node.tree = [-1]
        node.df_order = [0]
        node.post_order = [0]
    else:
        node.tree = [0] * n_feats
        node.tree[0] = -1

        log_probs, log_j_probs = compute_log_probs(node, data, alpha)

        # Pairwise mutual information, filled symmetrically.
        mutual_info = np.zeros((n_feats, n_feats))
        for a in range(n_feats):
            for b in range(a + 1, n_feats):
                total = mutual_info[a, b]
                for va in range(2):
                    for vb in range(2):
                        log_joint = log_j_probs[a, b, va, vb]
                        total = total + np.exp(log_joint) * (
                            log_joint - log_probs[a, va] - log_probs[b, vb])
                mutual_info[a, b] = total
                mutual_info[b, a] = total

        # Negated weights turn the minimum spanning tree into a maximum
        # mutual-information tree; feature 0 is the root of the traversal.
        spanning = minimum_spanning_tree(-(mutual_info))
        visit_order, predecessors = depth_first_order(
            spanning, directed=False, i_start=0)
        node.df_order = visit_order.tolist()
        node.post_order = visit_order[::-1].tolist()
        node.tree[1:] = [predecessors[k] for k in range(1, n_feats)]

        # Factored representation: marginal for the root, conditional on the
        # parent for every other feature.
        for feat in range(n_feats):
            if node.tree[feat] == -1:
                for feat_val in range(2):
                    factors[feat, feat_val, :] = log_probs[feat, feat_val]
            else:
                parent = int(node.tree[feat])
                for feat_val in range(2):
                    for par_val in range(2):
                        factors[feat, feat_val, par_val] = (
                            log_j_probs[feat, parent, feat_val, par_val]
                            - log_probs[parent, par_val])

    node.log_factors = factors.tolist()
def build_dependency_tree_from_mi(mut_info, scope, root_var=None):
    """Build a dependency tree from a mutual-information matrix.

    ``scope`` lists the variable ids in the same order as the rows of
    ``mut_info``.  When ``root_var`` is None a root is drawn at random from
    ``scope``; a ``root_var`` that is not in ``scope`` raises ``RootVarError``.
    Returns the ``DTreeNode`` of the root variable.
    """
    if root_var is None:
        root_var = np.random.choice(scope)
    elif root_var not in scope:
        raise RootVarError()

    root_pos = scope.index(root_var)

    # The +1 shift keeps every weight non-zero so no pair is dropped as a
    # missing edge; negating makes the minimum spanning tree maximise MI.
    spanning = minimum_spanning_tree(-(mut_info + 1))
    _, predecessors = depth_first_order(
        spanning, directed=False, i_start=root_pos)
    parent_of = predecessors.tolist()
    parent_of[root_pos] = ROOT

    nodes_by_var = {var: DTreeNode(var) for var in scope}

    # Link every non-root position to the node of its predecessor.
    for pos in range(mut_info.shape[0]):
        if pos == root_pos:
            continue
        child = nodes_by_var[scope[pos]]
        child.set_parent(nodes_by_var[scope[parent_of[pos]]])

    root_node = nodes_by_var[scope[root_pos]]
    root_node.set_tree(scope, parent_of)
    return root_node
def __getSequence(self, idxA, idxB):
    """Return the list of nodes leading from ``idxA`` to ``idxB``.

    The spanning tree is traversed depth-first from ``idxA`` and the
    predecessor chain of ``idxB`` is followed back to the start.  Raises
    ``ValueError`` when that chain does not begin at ``idxA``.
    """
    from scipy.sparse.csgraph import depth_first_order

    # scipy marks "no predecessor" with -9999
    no_predecessor = -9999

    # Convert the spanning tree to CSR once and cache it.
    if self.__CSRspanTree is None:
        self.__CSRspanTree = csr_matrix(self.__spanningTree)

    _, pred = depth_first_order(self.__CSRspanTree,
                                i_start=idxA,
                                directed=False,
                                return_predecessors=True)

    # Walk from the target back to the root, then flip into forward order.
    path = [idxB]
    node = pred[idxB]
    while node != no_predecessor:
        path.append(node)
        node = pred[node]
    path.reverse()

    if path[0] != idxA:
        raise ValueError("Traversal Incorrect")
    return path
def dfs(adjMat):
    """
    Find the depth first search order of a graph defined by adjacency matrix.

    The walk starts at the row that holds the largest entry of `adjMat`.
    Whenever the traversal jumps to a vertex that is not a child of the
    current one, the vertices passed while backtracking are emitted too, so
    consecutive entries of the result are always joined by an edge.

    Parameters
    ----------
    `adjMat` : scipy sparse matrix
        adjacency matrix of a graph

    Returns
    -------
    `target_path_index` : list of int
        a path of indices of points that walk through the graph in depth first
        order
    """
    # argmax is a flat row-major index; dividing by the column count gives
    # the row it belongs to.
    i_start = int(np.argmax(adjMat)) // adjMat.shape[1]
    path_index, predecessors = depth_first_order(adjMat, i_start,
                                                 directed=False)

    target_path_index = []
    for curVertex, nextVertex in zip(path_index[:-1], path_index[1:]):
        target_path_index.append(curVertex)
        # Backtrack until we reach the parent of the next visited vertex.
        while predecessors[nextVertex] != curVertex:
            curVertex = predecessors[curVertex]
            target_path_index.append(curVertex)
    target_path_index.append(path_index[-1])
    return target_path_index
def update_exact(self, dataset_, weights=np.array([]), structure_update_flag=False):
    """Update the pairwise/single counts and probabilities from ``dataset_``.

    When ``weights`` has one entry per example and at least one entry is
    positive, the counts are replaced by smoothed weighted counts; otherwise
    the unweighted counts of ``dataset_`` are added to the existing ones.
    With ``structure_update_flag`` set, the tree structure is re-learned from
    the updated counts.
    """
    n_examples = dataset_.shape[0]
    # Weighted path needs one weight per example and a non-empty positive set.
    if weights.shape[0] == n_examples and np.sum(weights > 0):
        smooth = max(np.sum(weights), 1.0) / n_examples
        weighted_xy = Util.compute_weighted_xycounts(dataset_, weights)
        self.xycounts = weighted_xy + smooth
        weighted_x = Util.compute_weighted_xcounts(dataset_, weights)
        self.xcounts = weighted_x + 2.0 * smooth
    else:
        print("Not using weight to update")
        self.xycounts += Util.compute_xycounts(dataset_)
        self.xcounts += Util.compute_xcounts(dataset_)

    self.xyprob = Util.normalize2d(self.xycounts)
    self.xprob = Util.normalize1d(self.xcounts)

    if structure_update_flag == True:
        # Negate the edge weights so the minimum spanning tree maximises them.
        neg_weights = Util.compute_edge_weights(self.xycounts, self.xcounts) * (-1.0)
        spanning = minimum_spanning_tree(csr_matrix(neg_weights))
        self.topo_order, self.parents = depth_first_order(
            spanning, 0, directed=False)
def order_to_stitch(self, parents):
    """Return the reversed depth-first visiting order of the tree in ``parents``.

    The tree is rebuilt from the predecessor array ``parents`` over
    ``self.edge_matrix`` and traversed starting at ``self.center``.
    """
    tree = csgraph.reconstruct_path(self.edge_matrix, parents, directed=False)
    visit_order = csgraph.depth_first_order(
        tree, self.center, return_predecessors=False)
    return visit_order[::-1]
def __GRASP(self, forest_approach, vdata):
    """Run a GRASP search for the forest with the best validation score.

    ``forest_approach[1]`` (optional) is the number of restarts and
    ``forest_approach[2]`` (optional) the ``k`` passed to the randomised
    spanning-tree construction; both default to 3.  Each restart builds a
    tree and then greedily removes edges while the score returned by
    ``score_samples_log_proba_v`` on ``vdata`` improves.  The best forest
    seen updates ``self.tree``, ``self.num_trees``,
    ``self.current_best_validationll`` and ``self.log_factors``.
    """
    times = 3
    k = 3  # Best k edges
    if len(forest_approach) > 1:
        times = int(forest_approach[1])
    if len(forest_approach) > 2:
        k = int(forest_approach[2])

    t = 0
    while t < times:
        # --- Construct: spanning tree from the modified Kruskal algorithm.
        mst = minimum_spanning_tree_K(-(self.MI), k)
        traversal = depth_first_order(mst, directed=False, i_start=0)
        candidate = self.create_tree(traversal)

        # --- Local search: repeatedly cut the single best edge.
        candidate_ll = self.score_samples_log_proba_v(vdata, candidate)
        candidate_num_tree = 1
        improved = True
        while improved:
            improved = False
            best_ll = -np.inf
            best_edge = None
            removable = np.where(candidate != -1)
            if np.size(removable) > 0:
                for edge in np.nditer(removable):
                    trial = np.copy(candidate)
                    trial[edge] = -1
                    trial_ll = self.score_samples_log_proba_v(vdata, trial)
                    if trial_ll > best_ll:
                        best_edge = edge
                        best_ll = trial_ll
                if best_ll > candidate_ll:
                    candidate_ll = best_ll
                    candidate_num_tree += 1
                    candidate[best_edge] = -1
                    improved = True

        # --- Keep this restart if it beats the best one so far.
        if candidate_ll > self.current_best_validationll:
            self.current_best_validationll = candidate_ll
            self.num_trees = candidate_num_tree
            self.tree = candidate
            # Now the log factors can be computed.
            self.log_factors = np.zeros((self.n_features, 2, 2))
            self.log_factors = compute_log_factors(self.tree,
                                                   self.n_features,
                                                   self.log_probs,
                                                   self.log_c_probs,
                                                   self.log_factors)
        t += 1
def __GRASP(self, forest_approach, vdata):
    """GRASP: randomised construction followed by greedy edge removal.

    The number of restarts is read from ``forest_approach[1]`` and the number
    of best edges considered by the construction from ``forest_approach[2]``
    when present (both default to 3).  After every restart the resulting
    forest replaces the stored one if its score on ``vdata`` is higher than
    ``self.current_best_validationll``.
    """
    times = int(forest_approach[1]) if len(forest_approach) > 1 else 3
    k = int(forest_approach[2]) if len(forest_approach) > 2 else 3  # Best k edges

    for _ in range(times):
        # Construct a starting tree with the modified version of Kruskal.
        spanning = minimum_spanning_tree_K(-(self.MI), k)
        current_tree = self.create_tree(
            depth_first_order(spanning, directed=False, i_start=0))

        # Local search: drop one edge per round for as long as it helps.
        current_ll = self.score_samples_log_proba_v(vdata, current_tree)
        current_num_tree = 1
        keep_searching = True
        while keep_searching:
            keep_searching = False
            round_best_ll = -np.inf
            round_best_edge = None
            edge_positions = np.where(current_tree != -1)
            if np.size(edge_positions) > 0:
                for pos in np.nditer(edge_positions):
                    pruned = np.copy(current_tree)
                    pruned[pos] = -1
                    pruned_ll = self.score_samples_log_proba_v(vdata, pruned)
                    if pruned_ll > round_best_ll:
                        round_best_edge = pos
                        round_best_ll = pruned_ll
                if round_best_ll > current_ll:
                    current_ll = round_best_ll
                    current_num_tree += 1
                    current_tree[round_best_edge] = -1
                    keep_searching = True

        if not (current_ll > self.current_best_validationll):
            continue

        # New best forest: store it and derive its log factors.
        self.current_best_validationll = current_ll
        self.num_trees = current_num_tree
        self.tree = current_tree
        self.log_factors = np.zeros((self.n_features, 2, 2))
        self.log_factors = compute_log_factors(self.tree,
                                               self.n_features,
                                               self.log_probs,
                                               self.log_c_probs,
                                               self.log_factors)
def _Minimum_SPTree_log_probs(self, log_probs, log_c_probs):
    """Learn the tree from ``self.MI`` and compute its log factors.

    The tree is represented as a sequence of parents (``self.tree``);
    ``self.df_order`` holds the depth-first visiting order from feature 0.
    """
    # Negated mutual information: minimum spanning tree == maximum MI tree.
    spanning = minimum_spanning_tree(-(self.MI))
    traversal = depth_first_order(spanning, directed=False, i_start=0)

    self.df_order = traversal[0]
    self.tree = self.create_tree(traversal)

    # Factored representation, written into a zero-initialised buffer.
    self.log_factors = np.zeros((self.n_features, 2, 2))
    self.log_factors = compute_log_factors(
        self.tree, self.n_features, log_probs, log_c_probs, self.log_factors)
def learnStructure(self, n_variable, ids):
    """Build a random tree structure over ``n_variable`` variables.

    Stores ``n_variable`` and ``ids`` on the instance, draws uniform random
    edge weights, and keeps the depth-first order (``self.topo_order``) and
    predecessor array (``self.parents``) of the resulting minimum spanning
    tree rooted at variable 0.
    """
    self.nvariables = n_variable
    self.ids = ids

    size = self.nvariables
    random_weights = np.random.rand(size, size)

    # Minimum spanning tree of the random graph ...
    spanning = minimum_spanning_tree(csr_matrix(random_weights))
    # ... converted to a Bayesian network by rooting it at variable 0.
    order, parents = depth_first_order(spanning, 0, directed=False)
    self.topo_order, self.parents = order, parents
def calculate_draw_order(self, parents):
    """Return the reversed depth-first order of the tree in ``parents``.

    The tree is rebuilt from ``parents`` over ``self._edge_matrix`` and
    traversed from ``self.center``.  The image names are printed in the
    returned order.
    """
    tree = csgraph.reconstruct_path(self._edge_matrix, parents, directed=False)
    visit_order = csgraph.depth_first_order(
        tree, self.center, return_predecessors=False)
    order = visit_order[::-1]

    print('Order to Draw:')
    # Every name is followed by ', ', including the last one.
    print(''.join(str(self._images[i].name) + ', ' for i in order))
    return order
def _calculate_draw_order(self, parents):
    """Return the reversed depth-first order of the tree in ``parents``.

    The tree is rebuilt from ``parents`` over ``self._edge_matrix`` and
    traversed from ``self.center``; the image names are logged in the
    returned order.
    """
    tree = csgraph.reconstruct_path(self._edge_matrix, parents, directed=False)
    forward = csgraph.depth_first_order(
        tree, self.center, return_predecessors=False)
    order = forward[::-1]

    names = [self._images[i].name for i in order]
    log.info('Draw order: %s', ', '.join(names))
    return order
def learnStructure_MI(self, mi):
    """Learn the tree structure from a mutual-information matrix ``mi``.

    Sets ``self.nvariables`` and stores the depth-first order
    (``self.topo_order``) and predecessor array (``self.parents``) of the
    maximum mutual-information spanning tree rooted at variable 0.
    """
    self.nvariables = mi.shape[0]

    # Weights are multiplied by -1.0 because a *minimum* spanning tree is
    # computed below.
    neg_mi = mi * (-1.0)
    # Exact zeros would be dropped by the sparse conversion and could leave
    # the tree disconnected, so they are replaced by a tiny weight.
    zero_mask = neg_mi == 0.0
    neg_mi[zero_mask] = 1e-20

    spanning = minimum_spanning_tree(csr_matrix(neg_mi))

    # Convert the spanning tree to a Bayesian network rooted at variable 0.
    order, parents = depth_first_order(spanning, 0, directed=False)
    self.topo_order, self.parents = order, parents
def make_face_normals_consistent(mesh):
    """Return a new mesh whose face windings were adjusted for consistency.

    Neighbouring faces are compared through their normals, faces whose normal
    opposes an already visited neighbour are reversed, and finally all faces
    are reversed when the summed product of centroid directions and normals
    is negative.  The result is built with ``pymesh.form_mesh``.
    """
    from graph import get_face_neighbors
    vertices = np.array(mesh.vertices)
    faces = np.array(mesh.faces)
    print('Making face normals consistent, nv = %d, nf = %d' % (len(vertices), len(faces)))
    print('Calculating normals...')
    normals = get_normals(vertices, faces, normalize=False)
    normalized_normals = guarded_normalized(normals)
    print('Calculating neighbors...')
    neighbors = get_face_neighbors(faces)
    print('Creating adjacency matrix...')
    # COO triplets: one entry (i, j, weight) per face/neighbour pair.
    ii = []
    jj = []
    vv = []
    eps = 1e-4
    for i, n in enumerate(neighbors):
        ni = normalized_normals[i]
        for j in n:
            ii.append(i)
            jj.append(j)
            nj = normalized_normals[j]
            # Smaller weight the more (anti-)parallel the two normals are;
            # eps keeps the weight strictly positive.
            weight = 1 + eps - abs(np.dot(ni, nj))
            vv.append(weight)
    nf = len(faces)
    m = coo_matrix((vv, (ii, jj)), shape=(nf, nf))
    print('Calculating minimum spanning tree...')
    # NOTE(review): the return value is discarded, so the traversal below
    # runs on `m` itself -- confirm whether `m` was meant to be replaced by
    # the spanning tree.
    minimum_spanning_tree(m, overwrite=True)
    print('Flipping inconsistent faces...')
    to_flip = []
    # Depth-first visiting order starting at face 0.
    n = depth_first_order(m, 0)[0]
    for i in n:
        row = m.getrow(i)
        norm_i = normalized_normals[i]
        js = row.nonzero()[1]
        norm_js = normalized_normals[js]
        dots = np.dot(norm_js, norm_i)
        for j, dot in zip(js, dots):
            if dot < 0:
                # Neighbour points the other way: flip its normal and
                # reverse its vertex order.
                normalized_normals[j] *= -1
                to_flip.append(j)
                faces[j] = faces[j, -1::-1]
    # NOTE(review): as read here, faces recorded in `to_flip` are reversed
    # inside the loop and again by this bulk assignment -- confirm the
    # intended nesting and whether the double reversal is wanted.
    faces[to_flip] = faces[to_flip, -1::-1]
    print('Evaluating flux...')
    # Centre of the axis-aligned bounding box.
    c = (np.max(vertices, axis=0) + np.min(vertices, axis=0)) / 2
    centroids = get_centroids(vertices, faces) - c
    guarded_normalize(centroids)
    flux = np.sum(centroids * normals)
    if flux < 0:
        print('Flipping all faces')
        faces = faces[:, -1::-1]
    return pymesh.form_mesh(vertices, faces)
def _make_normals_consistent(m, normals, start_index):
    """Flip normals in place so neighbours along the traversal agree.

    Nodes of ``m`` are visited depth-first from ``start_index``; every
    neighbour (non-zero column of the node's row) whose normal has a negative
    dot product with the node's normal is negated in ``normals``.
    """
    visit_order, _ = depth_first_order(m, start_index)
    for src in visit_order:
        neighbours = m.getrow(src).nonzero()[1]
        for dst in neighbours:
            if np.dot(normals[src], normals[dst]) < 0:
                normals[dst] = -normals[dst]
def dfs_order_vertices(edges):
    """Return the vertex ids of ``edges`` in depth-first visiting order.

    ``edges`` is an iterable of ``(u, v)`` pairs of hashable vertex ids and is
    treated as an undirected graph.  The traversal starts at the first id of
    the internal id list (built from a set, so the start vertex is arbitrary)
    and only vertices reachable from it are returned.  An empty edge list
    yields an empty result.
    """
    edges = list(edges)
    if not edges:
        # zip(*[]) cannot be unpacked into rows/cols; nothing to traverse.
        return []

    rows, cols = zip(*edges)

    # Map arbitrary vertex ids to consecutive matrix indices.
    ids = list(set(rows + cols))
    idmap = {v: i for i, v in enumerate(ids)}
    row_idx = [idmap[v] for v in rows]
    col_idx = [idmap[v] for v in cols]

    vals = np.ones((len(row_idx), ), dtype=np.uint8)
    # Every id occurs in at least one edge, so the matrix side is len(ids).
    num_ids = len(ids)
    g = sp.coo_matrix((vals, (row_idx, col_idx)),
                      shape=(num_ids, num_ids)).tocsr()

    order = csgraph.depth_first_order(g, 0, directed=False)[0]
    return [ids[i] for i in order]
def learnStructure_prob(self, p_xy, p_x):
    """Learn the tree structure from pairwise (``p_xy``) and single (``p_x``)
    probabilities.

    Stores both inputs on the instance, then keeps the depth-first order
    (``self.topo_order``) and predecessor array (``self.parents``) of the
    spanning tree rooted at variable 0.
    """
    self.nvariables = p_x.shape[0]
    self.xyprob = p_xy
    self.xprob = p_x

    # Mutual information for all pairs of variables, multiplied by -1.0
    # because a *minimum* spanning tree is computed.
    neg_mi = Util.compute_MI_prob(self.xyprob, self.xprob) * (-1.0)
    # Replace exact zeros by a tiny weight to avoid a disconnected tree.
    zero_mask = neg_mi == 0.0
    neg_mi[zero_mask] = 1e-20

    spanning = minimum_spanning_tree(csr_matrix(neg_mi))

    # Convert the spanning tree to a Bayesian network rooted at variable 0.
    order, parents = depth_first_order(spanning, 0, directed=False)
    self.topo_order, self.parents = order, parents
def train_weighted(self, weights, data):
    """Fit probabilities and tree structure from weighted data.

    ``weights`` is a vector assigning a weight to every data vector (row) of
    ``data``.  Sets ``self.prob_pair``, ``self.prob_sing``,
    ``self.node_order`` and ``self.parent``.
    """
    n_rows = data.shape[0]
    # Smoothing constant: total weight (at least 1) spread over the rows.
    alpha = max(np.sum(weights), 1) / n_rows

    pair_counts = utils.compute_pairwise_counts_weighted(data, weights) + alpha
    self.prob_pair = utils.normalize2D(pair_counts)

    single_counts = utils.compute_single_counts_weighted(data, weights) + 2 * alpha
    self.prob_sing = utils.normalize1D(single_counts)

    adjmat = utils.compute_adjmatrix(self.prob_pair, self.prob_sing)
    adjmat *= -1.0  # negative weights for the minimum-spanning-tree step
    zero_mask = adjmat == 0.0
    adjmat[zero_mask] = 1e-10

    spanning = minimum_spanning_tree(csr_matrix(adjmat))
    order, parents = depth_first_order(spanning, 0, directed=False)
    self.node_order, self.parent = order, parents
def dfs_traversals(edges):
    '''
    Given an edge list, generate a sequence of ordered depth
    first search traversals, using scipy.csgraph routines.

    A traversal of three or more nodes whose first and last node are joined
    by an edge is closed by appending its first node to the end.

    Parameters
    ------------
    edges: (n,2) int, undirected edges of a graph

    Returns
    -----------
    traversals: (m,) sequence of (p,) int,
                ordered DFS traversals of the graph.

    Raises
    -----------
    ValueError: if edges are not (n,2)
    '''
    edges = np.asanyarray(edges, dtype=np.int64)
    if not util.is_shape(edges, (-1, 2)):
        raise ValueError('edges are not (n,2)!')

    # sort each edge so an ordered pair can be queried later; np.sort
    # returns a copy, so the caller's array is not reordered in place
    edges = np.sort(edges, axis=1)
    # set of nodes to make sure we get every node
    nodes = set(edges.reshape(-1))
    # coo_matrix for csgraph routines
    graph = edges_to_coo(edges)

    # we're going to make a sequence of traversals
    traversals = []
    while len(nodes) > 0:
        # starting at any node
        start = nodes.pop()
        # get an (n,) ordered traversal
        ordered = csgraph.depth_first_order(graph,
                                            i_start=start,
                                            return_predecessors=False,
                                            directed=False)
        # even if the traversal is closed there won't be an indication
        # from the DFS, so add the first node to the end of the path.
        # A loop needs at least three nodes, and the (sorted) end pair
        # has to match a whole edge row: `pair in edges` would be true
        # as soon as any single element matched.
        if len(ordered) > 2:
            end_pair = np.sort(ordered[[0, -1]])
            if (edges == end_pair).all(axis=1).any():
                ordered = np.append(ordered, ordered[0])
        # add the traversal to our result
        traversals.append(ordered)
        # remove the nodes we've consumed
        nodes.difference_update(ordered)

    return traversals
def learnStructure(self, dataset):
    """Learn counts, probabilities and tree structure from ``dataset``.

    Sets the smoothed counts and probabilities, stores the depth-first order
    (``self.topo_order``) and predecessor array (``self.parents``) of the
    spanning tree rooted at variable 0, then calls ``get_log_cond_cpt``.
    """
    self.nvariables = dataset.shape[1]

    # Laplace correction: +1 on pairwise counts, +2 on single counts.
    self.xycounts = Util.compute_xycounts(dataset) + 1
    self.xcounts = Util.compute_xcounts(dataset) + 2
    self.xyprob = Util.normalize2d(self.xycounts)
    self.xprob = Util.normalize1d(self.xcounts)

    # Edge weights for all pairs of variables, multiplied by -1.0 because a
    # *minimum* spanning tree is computed.
    neg_weights = Util.compute_edge_weights(self.xycounts, self.xcounts) * (-1.0)
    # Replace exact zeros by a tiny weight to avoid a disconnected tree.
    zero_mask = neg_weights == 0.0
    neg_weights[zero_mask] = 1e-20

    spanning = minimum_spanning_tree(csr_matrix(neg_weights))

    # Convert the spanning tree to a Bayesian network rooted at variable 0.
    order, parents = depth_first_order(spanning, 0, directed=False)
    self.topo_order, self.parents = order, parents

    self.get_log_cond_cpt()
def update(self, dataset_, weights=np.array([])):
    """Add the counts of (optionally resampled) data and re-learn the tree.

    When ``weights`` has one entry per example, sampling importance
    resampling is performed: as many examples as ``dataset_`` has rows are
    drawn with replacement according to the normalised weights.  Otherwise
    ``dataset_`` is used unchanged.  The counts of the resulting data are
    added to ``self.xycounts`` / ``self.xcounts``, the probabilities are
    re-normalised and the tree structure is rebuilt.
    """
    n_examples = dataset_.shape[0]
    if weights.shape[0] == n_examples:
        norm_weights = Util.normalize(weights)
        # counts[i] is how many times example i was drawn.  Repeating each
        # index by its count keeps that multiplicity; taking only the
        # non-zero positions would silently drop the duplicates.
        counts = np.random.multinomial(n_examples, norm_weights)
        indices = np.repeat(np.arange(n_examples), counts)
        dataset = dataset_[indices, :]
    else:
        dataset = dataset_
        print ("Not using weight to update")

    self.xycounts += Util.compute_xycounts(dataset)
    self.xcounts += Util.compute_xcounts(dataset)
    self.xyprob = Util.normalize2d(self.xycounts)
    self.xprob = Util.normalize1d(self.xcounts)

    # Negated edge weights: minimum spanning tree == maximum-weight tree.
    edgemat = Util.compute_edge_weights(self.xycounts, self.xcounts) * (-1.0)
    Tree = minimum_spanning_tree(csr_matrix(edgemat))
    self.topo_order, self.parents = depth_first_order(Tree, 0, directed=False)
def clique_to_tree(self):
    """Arrange the cliques into a tree rooted at clique 0.

    Builds a clique adjacency matrix from ``self.var_in_clique`` (variable ->
    indices of the cliques containing it), extracts a spanning tree, stores
    its depth-first order and predecessors in ``self.jt_order`` /
    ``self.jt_parents`` and wires every clique to its children and parent via
    ``set_child_list`` / ``set_parent``.
    """
    neighbors = np.zeros((self.n_cliques, self.n_cliques))

    for k in self.var_in_clique.keys():
        nb_val = self.var_in_clique[k]
        nb_num = len(nb_val)  # how many cliques contain this variable

        # Cliques containing variable 0 are connected to the root clique.
        if k == 0:
            for clique_idx in nb_val:
                neighbors[0, clique_idx] = 1
                neighbors[clique_idx, 0] = 1
            continue

        if nb_num > 1:
            # `range` instead of the Python-2-only `xrange`.
            for i in range(nb_num):
                for j in range(i + 1, nb_num):
                    first = self.clique_list[nb_val[i]]
                    second = self.clique_list[nb_val[j]]
                    # connect only parent and child, for tree only
                    if first.var[0] == second.var[1] \
                            or first.var[1] == second.var[0]:
                        neighbors[nb_val[i], nb_val[j]] = 1
                        neighbors[nb_val[j], nb_val[i]] = 1

    # compute the minimum spanning tree
    Tree = minimum_spanning_tree(csr_matrix(neighbors * (-1)))
    # Convert the spanning tree to a Bayesian network
    self.jt_order, self.jt_parents = depth_first_order(Tree, 0,
                                                       directed=False)

    for i in range(self.n_cliques):
        child_index = np.where(self.jt_parents == i)[0]
        if child_index.shape[0] > 0:
            child_list = [self.clique_list[c] for c in child_index]
            self.clique_list[i].set_child_list(child_list)

        # -9999 is scipy's marker for "no predecessor".
        if self.jt_parents[i] != -9999:
            self.clique_list[i].set_parent(
                self.clique_list[self.jt_parents[i]])
def ordered_hull_idx_2d(hull):
    """Return the vertex indices of a 2-D convex hull in walking order.

    ``hull`` provides ``simplices`` (the hull edges) and ``neighbors`` (for
    each edge, the neighbouring edge opposite each of its vertices).  The
    edges are visited depth-first from edge 0; for every edge the vertex
    opposite the next edge of the walk is emitted, so consecutive entries of
    the result (including last -> first) are joined by a hull edge.
    """
    n = hull.simplices.shape[0]

    # determine order of edges in the convex hull
    v = coo_matrix((np.ones(2 * n),
                    (np.repeat(np.arange(n), 2), hull.neighbors.ravel())))
    facet_order = csgraph.depth_first_order(v, 0, return_predecessors=False)
    facet_vidx = hull.simplices[facet_order]

    # pick one vertex for each edge, based on which direction the walk went.
    # The hull is a closed loop, so the edge following the last one is the
    # first one; rolling the order handles that case instead of always
    # taking column 0 for the last edge.
    next_facet = np.roll(facet_order, -1)
    toward_next = hull.neighbors[facet_order] == next_facet[:, None]
    j = np.argmax(toward_next, axis=1)
    ordered_vertex_idx = facet_vidx[np.arange(n), j]

    # sanity check
    assert np.array_equal(np.unique(ordered_vertex_idx),
                          np.unique(hull.simplices.ravel()))
    return ordered_vertex_idx
def __init__(self, data, features_name=None, alpha=1.0):
    """Learn a tree and its factors from ``data`` at construction time.

    ``features_name`` optionally names the columns (defaults to their
    indices); ``alpha`` is stored as the smoothing constant.  After the
    factors are computed the intermediate statistics (and ``data``) are
    released by setting the attributes to None.
    """
    self.data = data
    self.alpha = alpha
    self.n_features = data.shape[1]
    self.features = (list(range(self.n_features))
                     if features_name is None else features_name)
    self.num_instances = data.shape[0]

    self.log_probs, self.log_j_probs = self.log_p_jp(
        self.data, self.n_features, self.num_instances)
    self.log_c_probs = self.log_cp(
        self.n_features, self.log_probs, self.log_j_probs)
    self.MI = self.cMI(self.n_features, self.log_probs, self.log_j_probs)

    # the tree is represented as a sequence of parents
    spanning = minimum_spanning_tree(-(self.MI + 1))
    _, predecessors = depth_first_order(spanning, directed=False, i_start=0)
    self.tree = np.zeros(self.n_features)
    self.tree[0] = -1
    self.tree[1:] = predecessors[1:]

    # computing the factored representation
    self.factors = np.zeros((self.n_features, 2, 2))
    self.factors = self.log_factors()

    # release everything that is no longer needed
    self.data = None
    self.MI = None
    self.log_j_probs = None
    self.log_probs = None
    self.log_c_probs = None
def compute(self):
    """Estimate per-point normals and queue them as ``nx``/``ny``/``nz``.

    The normal of a point is the last left-singular vector of the covariance
    of its neighbourhood (``self.k_neighbors``).  Only the normal of the
    point with the largest z coordinate is oriented (made to point towards
    +z).  The three components are stored in ``self.to_be_added`` under
    ``"nx(k)"``, ``"ny(k)"`` and ``"nz(k)"``, with ``k`` the neighbourhood
    size.
    """
    cov = cov3D(self.k_neighbors)
    u, _, _ = np.linalg.svd(cov)
    normals = u[:, :, -1]

    # Orient normals as in "Surface Reconstruction from Unorganized Points":
    # seed the orientation at the highest point.
    max_z = self.pyntcloud.xyz.argmax(0)[-1]
    if normals[max_z, 2] < 0:
        normals[max_z] = -normals[max_z]

    # NOTE: propagating the seed orientation to the other points is not
    # implemented.  The dense (n, n) neighbour graph, its minimum spanning
    # tree and the depth-first order that used to be built here were never
    # read, so they are no longer computed.

    k = self.k_neighbors.shape[1]
    self.to_be_added["nx({})".format(k)] = normals[:, 0]
    self.to_be_added["ny({})".format(k)] = normals[:, 1]
    self.to_be_added["nz({})".format(k)] = normals[:, 2]
def __init__(self, data, features_name=None, alpha=1.0):
    """Build the tree and its factored representation from ``data``.

    Column names come from ``features_name`` or default to the column
    indices.  Statistics are computed through ``log_p_jp``, ``log_cp`` and
    ``cMI``; once ``self.factors`` is available the intermediate attributes
    and ``data`` are reset to None.
    """
    self.data = data
    self.alpha = alpha
    self.n_features = data.shape[1]
    if features_name is not None:
        self.features = features_name
    else:
        self.features = list(range(self.n_features))
    self.num_instances = data.shape[0]

    marginals_and_joints = self.log_p_jp(
        self.data, self.n_features, self.num_instances)
    (self.log_probs, self.log_j_probs) = marginals_and_joints
    self.log_c_probs = self.log_cp(
        self.n_features, self.log_probs, self.log_j_probs)
    self.MI = self.cMI(self.n_features, self.log_probs, self.log_j_probs)

    # the tree is represented as a sequence of parents
    spanning_tree = minimum_spanning_tree(-(self.MI + 1))
    traversal = depth_first_order(spanning_tree, directed=False, i_start=0)
    parent_of = traversal[1]
    self.tree = np.zeros(self.n_features)
    self.tree[0] = -1
    for child in range(1, self.n_features):
        self.tree[child] = parent_of[child]

    # computing the factored representation
    self.factors = np.zeros((self.n_features, 2, 2))
    self.factors = self.log_factors()

    # drop references that are not needed once the factors exist
    for attr in ("data", "MI", "log_j_probs", "log_probs", "log_c_probs"):
        setattr(self, attr, None)
def compute(self):
    """Compute per-point normals and register them for addition.

    Normals are taken as the last left-singular vector of each
    neighbourhood covariance (``cov3D(self.k_neighbors)``).  The normal at
    the point with the highest z value is flipped, if needed, to point
    upwards.  Results go to ``self.to_be_added`` as ``"nx(k)"``, ``"ny(k)"``
    and ``"nz(k)"`` where ``k`` is the number of neighbours.
    """
    cov = cov3D(self.k_neighbors)
    left_vectors = np.linalg.svd(cov)[0]
    normals = left_vectors[:, :, -1]

    # Orient normals as in "Surface Reconstruction from Unorganized Points"
    max_z = self.pyntcloud.xyz.argmax(0)[-1]
    if normals[max_z, 2] < 0:
        normals[max_z] = -normals[max_z]

    # NOTE: only the seed normal is oriented.  The previous code also built
    # a dense (n, n) float32 graph of neighbour-normal dot products, its
    # minimum spanning tree and a depth-first order, but nothing consumed
    # them (the propagation step was disabled), so that quadratic-memory
    # work has been dropped.

    k = self.k_neighbors.shape[1]
    for axis, label in enumerate(("nx", "ny", "nz")):
        self.to_be_added["{}({})".format(label, k)] = normals[:, axis]
def group_branches(self, graph, root, directed=False):
    """
    Group the nodes reached from ``root`` by the child of ``root`` they hang
    under, each group listed in depth-first visiting order.

    :param graph: graph to traverse
    :type graph: csr_matrix
    :param root: index of the start node
    :type root: int
    :param directed: forwarded to the depth-first traversal
    :return: one list of nodes per direct child of ``root``
    :rtype: list(list(int))
    """
    visit_order, parent_of = sp.depth_first_order(
        graph, root, directed=directed, return_predecessors=True)

    branches = []
    for node in visit_order[1:]:
        # A direct child of the root opens a new branch; every other node
        # belongs to the branch opened most recently.
        if parent_of[node] == root:
            branches.append([])
        branches[-1].append(node)
    return branches
def train(self, data, approx=0, samp_k=None):
    """Learn the tree structure (``self.node_order`` / ``self.parent``).

    approx (default = 0) means no approx.
    approx = 1 means use the AST method (``compute_approx_spantree``).
    approx = 2 means use the AGH method (``compute_approx_graph``).
    For either of the approximations, ``samp_k`` nodes are sampled; when it
    is not given it defaults to ``ceil(log2(number of features))``.

    NOTE: ``data`` is shuffled in place.

    Raises ``ValueError`` for any other value of ``approx``.
    """
    # Reject unknown modes up front, before the caller's data is touched;
    # otherwise the code below would fail on an unassigned local.
    if approx not in (0, 1, 2):
        raise ValueError("approx must be 0, 1 or 2, got %r" % (approx,))

    np.random.shuffle(data)
    num_feats = data.shape[1]
    if not samp_k:
        samp_k = int(np.ceil(np.log2(num_feats)))

    if approx == 1:
        # Use the AST method (approx_spantree)
        self.node_order, self.parent = self.compute_approx_spantree(data, samp_k)
        return

    if approx == 0:
        adjmat = self.compute_exact_graph(data)
    else:
        # Use the AGH method (approx_graph)
        adjmat = self.compute_approx_graph(data, samp_k)

    adjmat *= -1.0  # making negative for MST calc
    mstree = self.compute_exact_mst(adjmat)
    self.node_order, self.parent = depth_first_order(mstree, i_start=0,
                                                     directed=False)
def fit(self, X, m_priors, j_priors, alpha=1.0, sample_weight=None, scope=None,
        and_leaves=False, multilabel=False, n_labels=0, ml_tree_structure=0):
    """Fit the model to the data.

    Parameters
    ----------
    X : ndarray, shape=(n, m)
        The data array.

    m_priors:
        the marginal priors for each feature

    j_priors:
        the joint priors for each couple of features

    alpha: float, default=1.0
        the constant for the smoothing

    sample_weight: ndarray, shape=(n,)
        The weight of each sample.

    scope:
        unique identifiers for the features

    and_leaves: boolean, default=False
        when True, edges whose mutual information is below the penalization
        threshold are cut, turning the tree into a forest

    multilabel: boolean, default=False
        its value indicates whether the cltree are used for multilabel
        classification problems when imported by mlcsn.py

    n_labels: integer, default=0
        in case of multilabel classification problem indicates the number of
        labels, taken to be the last n_labels features (columns) of X

    ml_tree_structure: integer, default=0
        in case of multilabel classification problem indicates the structure
        of the tree to be learned. The set of features F corresponds to the
        union of A (the attributes) and Y (the labels):
        - 0, no constraint on the resulting tree
        - 1, the parent of each variable in Y must have the parent in Y,
          while the parent of each variable in A can have the parent in A or
          in Y. A label variable depends on a label variable; an attribute
          variable can depend on a label variable or on an attribute variable
        - 2, the parent of each variable in Y must have the parent in Y, and
          the parent of each variable in A can have the parent in Y. A label
          variable depends on a label variable; an attribute variable depends
          on a label variable
        - 3, the mutual information between pairs of attributes is set to
          zero; label pairs are left unchanged
    """
    self.alpha = alpha
    self.and_leaves = and_leaves
    self.n_features = X.shape[1]

    rootTree = False
    if scope is None:
        self.scope = np.array([i for i in range(self.n_features)])
        rootTree = True
    else:
        self.scope = scope

    if sample_weight is None:
        self.n_samples = X.shape[0]
    else:
        self.n_samples = np.sum(sample_weight)

    (log_probs, log_j_probs) = self.compute_log_probs(X, sample_weight,
                                                      m_priors, j_priors)

    MI = self.cMI(log_probs, log_j_probs)

    if multilabel == True:
        if ml_tree_structure == 1:
            MI[-n_labels:, -n_labels:] += np.max(MI)
        elif ml_tree_structure == 2:
            MI[-n_labels:, -n_labels:] += np.max(MI)
            MI[:-n_labels, :-n_labels] = 0
        elif ml_tree_structure == 3:
            MI[:-n_labels, :-n_labels] = 0

    # the tree is represented as a sequence of parents
    mst = minimum_spanning_tree(-(MI))
    dfs_tree = depth_first_order(mst, directed=False, i_start=0)

    self.df_order = dfs_tree[0]
    self.post_order = dfs_tree[0][::-1]
    # `np.int` was only an alias of the builtin and no longer exists in
    # current NumPy releases; the builtin gives the same dtype.
    self.tree = np.zeros(self.n_features, dtype=int)
    self.tree[0] = -1
    for p in range(1, self.n_features):
        self.tree[p] = dfs_tree[1][p]

    penalization = logr(X.shape[0]) / (2 * X.shape[0])

    if self.and_leaves == True:
        # Cut weak edges: every cut turns one tree into two.
        for p in range(1, self.n_features):
            if MI[self.tree[p], p] < penalization:
                self.tree[p] = -1
                self.num_trees = self.num_trees + 1
        if self.num_trees > 1:
            self._forest = True

    if multilabel == True and rootTree:
        first_label = self.n_features - n_labels
        # Count attributes (pX) and labels (pY) whose parent is a label.
        pX = sum(1 for i in range(first_label)
                 if self.tree[i] >= first_label)
        pY = sum(1 for i in range(first_label, self.n_features)
                 if self.tree[i] >= first_label)
        print("Xs with Y parent: ", pX)
        print("Ys with Y parent: ", pY)

    self.num_edges = self.n_features - self.num_trees

    # computing the factored representation
    self.log_factors = np.zeros((self.n_features, 2, 2))
    self.log_factors = compute_log_factors(self.tree, self.n_features,
                                           log_probs, log_j_probs,
                                           self.log_factors)
def fit(self, X, alpha=1.0, scope=None, multilabel=False, n_labels=0,
        ml_tree_structure=0):
    """Fit the model to the data.

    Parameters
    ----------
    X : ndarray, shape=(n, m)
        The data array.

    alpha: float, default=1.0
        the constant for the smoothing

    scope:
        unique identifiers for the features

    multilabel: boolean, default=False
        its value indicates whether the cltree are used for multilabel
        classification problems when imported by mlcsn.py

    n_labels: integer, default=0
        in case of multilabel classification problem indicates the number of
        labels, taken to be the last n_labels features (columns) of X

    ml_tree_structure: integer, default=0
        in case of multilabel classification problem indicates the structure
        of the tree to be learned. The set of features F corresponds to the
        union of A (the attributes) and Y (the labels):
        - 0, no constraint on the resulting tree
        - 1, the parent of each variable in Y must have the parent in Y,
          while the parent of each variable in A can have the parent in A or
          in Y. A label variable depends on a label variable; an attribute
          variable can depend on a label variable or on an attribute variable
        - 2, the parent of each variable in Y must have the parent in Y, and
          the parent of each variable in A can have the parent in Y. A label
          variable depends on a label variable; an attribute variable depends
          on a label variable
        - 3, the mutual information between pairs of attributes is set to
          zero; label pairs are left unchanged
    """
    self.alpha = alpha
    self.n_features = X.shape[1]

    rootTree = False
    if scope is None:
        self.scope = np.array([i for i in range(self.n_features)])
        rootTree = True
    else:
        self.scope = scope

    self.n_samples = X.shape[0]

    (log_probs, log_j_probs) = self.compute_log_probs(X)

    MI = self.cMI(log_probs, log_j_probs)

    if multilabel == True:
        if ml_tree_structure == 1:
            MI[-n_labels:, -n_labels:] += np.max(MI)
        elif ml_tree_structure == 2:
            MI[-n_labels:, -n_labels:] += np.max(MI)
            MI[:-n_labels, :-n_labels] = 0
        elif ml_tree_structure == 3:
            MI[:-n_labels, :-n_labels] = 0

    # the tree is represented as a sequence of parents

    # Shift every entry by a small constant before building the tree.
    MI = MI + 0.01
    mst = minimum_spanning_tree(-(MI))
    dfs_tree = depth_first_order(mst, directed=False, i_start=0)

    self.df_order = dfs_tree[0]
    self.post_order = dfs_tree[0][::-1]
    # `np.int` was only an alias of the builtin and no longer exists in
    # current NumPy releases; the builtin gives the same dtype.
    self.tree = np.zeros(self.n_features, dtype=int)
    self.tree[0] = -1
    for p in range(1, self.n_features):
        self.tree[p] = dfs_tree[1][p]

    if multilabel == True and rootTree:
        first_label = self.n_features - n_labels
        # Count attributes (pX) and labels (pY) whose parent is a label.
        pX = sum(1 for i in range(first_label)
                 if self.tree[i] >= first_label)
        pY = sum(1 for i in range(first_label, self.n_features)
                 if self.tree[i] >= first_label)
        print("Xs with Y parent: ", pX)
        print("Ys with Y parent: ", pY)

    self.num_edges = self.n_features - self.num_trees

    # computing the factored representation
    self.log_factors = np.zeros((self.n_features, 2, 2))
    self.log_factors = compute_log_factors(self.tree, self.n_features,
                                           log_probs, log_j_probs,
                                           self.log_factors)
def _learn_from_data(self, data, features=None, n_feature_vals=2,
                     feature_vals=None, alpha=0.1, sparse=True,
                     mem_free=True):
    """
    Chow and Liu learning algorithm

    Estimates marginal, joint and conditional log-probabilities from the
    co-occurrence counts of ``data``, derives the pairwise mutual
    information, extracts a spanning tree from it and stores the tree
    (``self._tree``, a vector of parent ids) together with its factors
    (``self._factors``).

    data: 2-d array (instances x features); presumably binary, since the
        frequencies come from ``data.T.dot(data)`` -- TODO confirm
    features: optional feature ids, defaults to ``0 .. n_features-1``
    n_feature_vals: number of values of each feature (homogeneous case)
    feature_vals: optional per-feature number of values
    alpha: smoothing constant, stored on ``self._alpha``
    sparse: if True, ``data`` is wrapped in a scipy CSR matrix first
    mem_free: if True, the intermediate statistics are dropped at the end
    """
    n_instances, n_feats = data.shape[0], data.shape[1]

    #
    # this trick helps for sparse matrices
    # TODO: check if this cond is needed or the sparse dot is equal to
    # the dense one performance-wise
    self._data = scipy.sparse.csr_matrix(data) if sparse else data

    self._alpha = alpha
    self._n_features = n_feats
    self._n_instances = n_instances

    self.features = features

    #
    # assuming homogeneous features this could be restrictive
    # TODO: extend the whole code to categorical non homogeneous features
    self._feature_vals = feature_vals
    if self._feature_vals is None:
        self._feature_vals = numpy.array([n_feature_vals] * n_feats)

    #
    # getting the max to pre-allocate the memory
    self._n_feature_vals = n_feature_vals
    if self._n_feature_vals is None:
        self._n_feature_vals = max(self._feature_vals)

    if self.features is None:
        self.features = numpy.array(list(range(n_feats)))

    #
    # pre-allocating arrays for freqs and probs
    #
    n_vals = self._n_feature_vals
    pair_shape = (n_feats, n_feats, n_vals, n_vals)
    self._marg_freqs = numpy.zeros(n_feats)
    self._joint_freqs = numpy.zeros(pair_shape)
    self._log_marg_probs = numpy.zeros((n_feats, n_vals))
    self._log_joint_probs = numpy.zeros(pair_shape)
    self._log_cond_probs = numpy.zeros(pair_shape)
    self._mutual_info = numpy.zeros((n_feats, n_feats))

    #
    # computing freqs and probs (and smoothing)
    co_occurrences = self._data.T.dot(self._data)
    if sparse:
        co_occurrences = numpy.array(co_occurrences.todense())

    #
    # marginal frequencies: the diagonal of the co-occurrence matrix
    self._marg_freqs = co_occurrences.diagonal()
    self._log_marg_probs = self.log_marg_probs(self._marg_freqs,
                                               self._log_marg_probs)

    #
    # joint estimation
    self._joint_freqs = self.joint_freqs(self._joint_freqs, co_occurrences)
    self._log_joint_probs = self.log_joint_probs(self._joint_freqs,
                                                 self._log_joint_probs)

    #
    # conditional estimation
    self._log_cond_probs = self.log_cond_probs(self._log_marg_probs,
                                               self._log_joint_probs,
                                               self._log_cond_probs)

    self._mutual_info = self.mutual_information(self._log_marg_probs,
                                                self._log_joint_probs,
                                                self._mutual_info)

    #
    # computing the MST on a fresh array, so that mutual_info itself is
    # not overwritten (useful for testing but not for efficiency)
    neg_weights = -(self._mutual_info + 1)
    mst = minimum_spanning_tree(neg_weights)
    dfs_tree = depth_first_order(mst, directed=False, i_start=0)
    predecessors = dfs_tree[1]

    #
    # representing the CLTree as a sequence of parents ids;
    # the root is its own parent
    self._tree = numpy.zeros(n_feats, dtype=int)
    self._tree[0] = 0
    self._tree[1:] = predecessors[1:]

    #
    # computing the factored representation
    self._factors = numpy.zeros((n_feats, n_vals, n_vals))
    self._factors = self.log_factors(self._log_marg_probs,
                                     self._log_joint_probs,
                                     self._factors)

    #
    # removing references, this is optional for test purposes
    if mem_free:
        self._mutual_info = None
        self._joint_freqs = None
        self._log_marg_probs = None
        self._log_joint_probs = None
        self._log_cond_probs = None
        self._marg_freqs = None
        self._data = None
# This function takes following arguments: # # the graph. # the starting element to traverse graph from. from scipy.sparse.csgraph import depth_first_order from scipy.sparse import csr_matrix arr = np.array([ [1, 2, 43, 1], [23, 23, 4, 1], [32, 23, 4, 1], [0, 1, 30, 1] ]) newarr = csr_matrix(arr) print(depth_first_order(newarr, 1)) # Breadth First Order # The breadth_first_order() method returns a breadth first traversal from a node. # # This function takes following arguments: # # the graph. # the starting element to traverse graph from. from scipy.sparse.csgraph import breadth_first_order from scipy.sparse import csr_matrix arr = np.array([ [1, 2, 4, 5], [2, 3, 0, 7],
import timeit

import numpy as np
import scipy.sparse.csgraph as csg
from scipy import sparse

import depth_first

# Small undirected test graph on 8 vertices: i holds the edge sources and
# j the edge targets.
i, j = zip((0, 1), (0, 2), (1, 2), (2, 3), (2, 5),
           (3, 4), (3, 6), (4, 6), (5, 7))
g = sparse.csr_matrix((np.ones(len(i)), (i, j)), shape=(8, 8))
g = g + g.T  # symmetrise: every edge is stored in both directions

# Depth-first visit from vertex 0 with the local implementation ...
o, p = depth_first.depth_first_order(g, 0)
print("order: {}".format(o))
print("pred : {}".format(p))

# ... and with the scipy reference implementation, for comparison.
o, p = csg.depth_first_order(g, 0)
print("order: {}".format(o))
print("pred : {}".format(p))

# timing
# The setup imports the needed names explicitly: timeit compiles the setup
# inside a function, where ``from __main__ import *`` is a syntax error on
# Python 3.
print("my: {}".format(
    timeit.timeit('o, p = depth_first.depth_first_order(g, 0)',
                  setup='from __main__ import depth_first, g',
                  number=100000)))
print("scipy: {}".format(
    timeit.timeit('o, p = csg.depth_first_order(g, 0)',
                  setup='from __main__ import csg, g',
                  number=100000)))
def fit(self, X, m_priors, j_priors, alpha=1.0, sample_weight=None,
        scope=None, and_leaves=False, multilabel=False, n_labels=0,
        ml_tree_structure=0):
    """Fit the model to the data.

    The tree structure is obtained by computing the pairwise mutual
    information (MI) of the features, taking the maximum spanning tree of
    the MI graph and orienting it with a depth-first visit rooted at
    feature 0.  The result is stored as a vector of parent indices in
    ``self.tree`` (``-1`` marks a root).

    Parameters
    ----------
    X : ndarray, shape=(n, m)
        The data array.

    m_priors :
        The marginal priors for each feature.

    j_priors :
        The joint priors for each couple of features.

    alpha : float, default=1.0
        The constant for the smoothing (stored on ``self.alpha``).

    sample_weight : ndarray, shape=(n,), optional
        The weight of each sample.  When given, ``self.n_samples`` is the
        sum of the weights instead of the number of rows of X.

    scope : array-like, optional
        Unique identifiers for the features.  When omitted, the scope is
        ``0 .. m-1`` and, in the multilabel case, a short report on the
        learned parents is printed.

    and_leaves : boolean, default=False
        When True, every edge whose MI is below ``logr(n) / (2 * n)`` is
        cut: the child becomes a root and ``self.num_trees`` is
        incremented, so the result may be a forest.

    multilabel : boolean, default=False
        Indicates whether the cltree is used for multilabel
        classification problems (when imported by mlcsn.py).

    n_labels : integer, default=0
        In case of a multilabel classification problem, the number of
        labels.  The labels are taken to be the last ``n_labels``
        features (columns) of X.

    ml_tree_structure : integer, default=0
        In case of a multilabel classification problem, selects how the
        MI matrix is altered before the tree is extracted.  The set of
        features F is the union of A (the attributes) and Y (the labels):

        - 0, no constraint on the resulting tree;
        - 1, the MI between pairs of labels is raised by the maximum MI,
          so that a label variable depends on a label variable, while an
          attribute variable can depend on a label variable or on an
          attribute variable;
        - 2, as 1, and the MI between pairs of attributes is set to zero,
          so that an attribute variable depends on a label variable;
        - 3, only the MI between pairs of attributes is set to zero.
    """
    self.alpha = alpha
    self.and_leaves = and_leaves
    self.n_features = X.shape[1]

    # Without an explicit scope the features are simply numbered 0..m-1.
    is_root_tree = scope is None
    self.scope = np.arange(self.n_features) if is_root_tree else scope

    if sample_weight is None:
        self.n_samples = X.shape[0]
    else:
        self.n_samples = np.sum(sample_weight)

    log_probs, log_j_probs = self.compute_log_probs(X, sample_weight,
                                                    m_priors, j_priors)
    MI = self.cMI(log_probs, log_j_probs)

    if multilabel:
        if ml_tree_structure in (1, 2):
            # Favour label-label edges in the spanning tree.
            MI[-n_labels:, -n_labels:] += np.max(MI)
        if ml_tree_structure in (2, 3):
            # Discard attribute-attribute dependencies.
            MI[:-n_labels, :-n_labels] = 0

    # Maximum spanning tree on MI == minimum spanning tree on -MI.
    mst = minimum_spanning_tree(-MI)
    order, predecessors = depth_first_order(mst, directed=False, i_start=0)
    self.df_order = order
    self.post_order = order[::-1]

    # The tree is represented as a sequence of parents.  The builtin
    # ``int`` replaces the ``np.int`` alias, which NumPy >= 1.24 removed.
    self.tree = np.zeros(self.n_features, dtype=int)
    self.tree[0] = -1
    self.tree[1:] = predecessors[1:]

    if self.and_leaves:
        # Cut the edges whose MI does not reach the penalization threshold.
        # logr is defined elsewhere in the file (presumably a logarithm).
        penalization = logr(X.shape[0]) / (2 * X.shape[0])
        for p in range(1, self.n_features):
            if MI[self.tree[p], p] < penalization:
                self.tree[p] = -1
                self.num_trees = self.num_trees + 1
        if self.num_trees > 1:
            self._forest = True

    if multilabel and is_root_tree:
        n_attributes = self.n_features - n_labels
        pX = sum(1 for i in range(n_attributes)
                 if self.tree[i] >= n_attributes)
        pY = sum(1 for i in range(n_attributes, self.n_features)
                 if self.tree[i] >= n_attributes)
        print("Xs with Y parent: ", pX)
        print("Ys with Y parent: ", pY)

    # self.num_trees is only incremented here; its initial value is
    # expected to be set elsewhere on the instance.
    self.num_edges = self.n_features - self.num_trees

    # computing the factored representation
    self.log_factors = np.zeros((self.n_features, 2, 2))
    self.log_factors = compute_log_factors(self.tree, self.n_features,
                                           log_probs, log_j_probs,
                                           self.log_factors)
import numpy as np
from scipy import constants
from scipy.sparse.csgraph import depth_first_order
from scipy.sparse.csgraph import dijkstra
from scipy.sparse import csr_matrix

# Unit conversions taken from scipy.constants (values are in SI units).
print("1 year =", constants.year, "s")  # returned in seconds
print("1 centimeter =", constants.centi, "m")  # returned in meters
print("1 KB =", constants.kibi, "Bytes")
print("1 Bar =", constants.bar, "Pascal")
# constants.kmh is one kilometer per hour expressed in m/s, so the label
# must read km/h (it used to say km/s).
print("1 km/h =", constants.kmh, "m/s")
print("1 eV =", constants.eV, "Joules")
print("1 HP =", constants.hp, "Watt")
print("1 Dyne =", constants.dyn, "Newton")

# Compressed sparse row view of a small dense array; built once and reused
# for the three reports below.
arr = np.array([[0, 0, 0, 2], [0, 0, 1, 0], [1, 0, 2, 0]])
arr_csr = csr_matrix(arr)
print(arr_csr)
print()
print("Non - Zero Data:", arr_csr.data)
print()
print("Total Non - Zero Data:", arr_csr.count_nonzero())

# Adjacency matrices used as graphs for the csgraph routines.
arr2 = np.array([[0, 1, 2], [1, 0, 0], [2, 0, 0]])
gp = np.array([[0, 1, 0, 1], [1, 1, 1, 1], [2, 1, 1, 0], [0, 1, 0, 1]])
newarr = csr_matrix(arr2)
gpcsr = csr_matrix(gp)
print()
# Shortest paths (and predecessors) from node 0.
print(dijkstra(newarr, return_predecessors=True, indices=0))
print()
# Depth-first traversal starting from node 3.
print(depth_first_order(gpcsr, 3))