def marginal_inference(self, evidence={}):
    messages = np.zeros((self.n_features, 2))
    logprob = 0.0
    for i in self.post_order:
        if i != 0:
            state_evidence = evidence.get(self.scope[i])
            if state_evidence is not None:
                messages[self.tree[i], 0] += self.log_factors[i, state_evidence, 0] + messages[i, state_evidence]
                messages[self.tree[i], 1] += self.log_factors[i, state_evidence, 1] + messages[i, state_evidence]
            else:
                # marginalization
                messages[self.tree[i], 0] += logr(
                    np.exp(self.log_factors[i, 0, 0] + messages[i, 0]) +
                    np.exp(self.log_factors[i, 1, 0] + messages[i, 1]))
                messages[self.tree[i], 1] += logr(
                    np.exp(self.log_factors[i, 0, 1] + messages[i, 0]) +
                    np.exp(self.log_factors[i, 1, 1] + messages[i, 1]))
        else:
            state_evidence = evidence.get(self.scope[i])
            if state_evidence is not None:
                logprob = self.log_factors[i, state_evidence, 0] + messages[0, state_evidence]
            else:
                # marginalization
                logprob = logr(
                    np.exp(self.log_factors[i, 0, 0] + messages[0, 0]) +
                    np.exp(self.log_factors[i, 1, 0] + messages[0, 1]))
    return logprob
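The logr(np.exp(a) + np.exp(b)) pattern in the marginalization branches can overflow or underflow once messages accumulate over deep trees. A minimal sketch of a numerically stable update using np.logaddexp; the helper name is illustrative and not part of the original class:

def _marginalize_message(self, i, messages):
    # stable form of logr(np.exp(a) + np.exp(b)) for one message update
    parent = self.tree[i]
    messages[parent, 0] += np.logaddexp(
        self.log_factors[i, 0, 0] + messages[i, 0],
        self.log_factors[i, 1, 0] + messages[i, 1])
    messages[parent, 1] += np.logaddexp(
        self.log_factors[i, 0, 1] + messages[i, 0],
        self.log_factors[i, 1, 1] + messages[i, 1])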
def score_sample_log_proba(self, x):
    """Compute the log likelihood of a single sample x.

    The OR node conditions on its split feature: the sample is routed to
    the left child when the feature is 0 and to the right child otherwise,
    adding the corresponding branch weight in log space.
    """
    x1 = np.concatenate((x[0:self.or_feature], x[self.or_feature + 1:]))
    if x[self.or_feature] == 0:
        prob = logr(self.left_weight) + self.left_child.score_sample_log_proba(x1)
    else:
        prob = logr(self.right_weight) + self.right_child.score_sample_log_proba(x1)
    return prob
def score_samples(self, data, n_c, out_filename):
    """Average log likelihood of data under the mixture of n_c components,
    writing each per-sample log likelihood to out_filename."""
    with open(out_filename, 'w') as out_log:
        self.compute_weights(n_c)
        mean = 0.0
        for x in data:
            prob = 0.0
            for k in range(n_c):
                prob = prob + np.exp(self.csns[k].score_sample_log_proba(x)) * self.weights[k]
            mean = mean + logr(prob)
            out_log.write('%.10f\n' % logr(prob))
    # the context manager closes the file
    return mean / data.shape[0]
def score_sample_log_proba(self, x):
    """Compute the log likelihood of a single sample x at an AND node.

    Each subtree of the forest contributes independently; subtrees without
    an OR cut are scored directly by the Chow-Liu tree on their scope.
    """
    prob = 0.0
    for i in range(len(self.tree_forest)):
        if self.or_features[i] is None:
            prob = prob + self.cltree.score_sample_scope_log_proba(x, self.tree_forest[i])
        else:
            x0 = x[self.tree_forest[i]]
            x1 = np.concatenate((x0[0:self.or_features[i]], x0[self.or_features[i] + 1:]))
            if x0[self.or_features[i]] == 0:
                prob = prob + logr(self.left_weights[i]) + self.children_left[i].score_sample_log_proba(x1)
            else:
                prob = prob + logr(self.right_weights[i]) + self.children_right[i].score_sample_log_proba(x1)
    return prob
def marginal_inference(self, evidence={}):
    state_evidence = evidence.get(self.or_feature_scope)
    if state_evidence is not None:
        if state_evidence == 0:
            log_proba = self.left_child.marginal_inference(evidence) + logr(self.left_weight)
        else:
            log_proba = self.right_child.marginal_inference(evidence) + logr(self.right_weight)
    else:
        # the split feature is unobserved: sum over both branches
        left_log_proba = self.left_child.marginal_inference(evidence)
        right_log_proba = self.right_child.marginal_inference(evidence)
        log_proba = logr(np.exp(left_log_proba) * self.left_weight +
                         np.exp(right_log_proba) * self.right_weight)
    return log_proba
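When the split feature is unobserved, the weighted sum of the two branch probabilities can also be kept entirely in log space. A sketch of an equivalent, more stable combination under the same attribute names:

log_proba = np.logaddexp(left_log_proba + logr(self.left_weight),
                         right_log_proba + logr(self.right_weight))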
def score_sample_log_proba(self, x):
    """Log likelihood of x as the weighted average of the children's likelihoods."""
    prob = 0.0
    for s in range(len(self.children)):
        prob = prob + self.weights[s] * np.exp(self.children[s].score_sample_log_proba(x))
    return logr(prob)
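Averaging the children's likelihoods in probability space underflows when every child returns a very negative log likelihood. A sketch of an equivalent computation with scipy.special.logsumexp, assuming SciPy is available (it is not imported in this listing):

from scipy.special import logsumexp

def score_sample_log_proba_stable(self, x):
    # log(sum_s w_s * exp(ll_s)) computed without leaving log space
    child_lls = np.array([c.score_sample_log_proba(x) for c in self.children])
    return logsumexp(child_lls, b=np.asarray(self.weights))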
def log_probs_numba(n_features, scope, n_samples, alpha, mpriors, priors,
                    log_probs, log_j_probs, cond, p):
    # smoothed marginal log probabilities
    for i in range(n_features):
        id_i = scope[i]
        prob = (p[i] + alpha * mpriors[id_i, 1]) / (n_samples + alpha)
        log_probs[i, 0] = logr(1 - prob)
        log_probs[i, 1] = logr(prob)
    # smoothed pairwise joint log probabilities from the co-occurrence counts
    for i in range(n_features):
        for j in range(n_features):
            id_i = scope[i]
            id_j = scope[j]
            log_j_probs[i, j, 1, 1] = logr((cond[i, j] + alpha * priors[id_i, id_j, 1, 1]) / (n_samples + alpha))
            log_j_probs[i, j, 0, 1] = logr((cond[j, j] - cond[i, j] + alpha * priors[id_i, id_j, 0, 1]) / (n_samples + alpha))
            log_j_probs[i, j, 1, 0] = logr((cond[i, i] - cond[i, j] + alpha * priors[id_i, id_j, 1, 0]) / (n_samples + alpha))
            log_j_probs[i, j, 0, 0] = logr((n_samples - cond[j, j] - cond[i, i] + cond[i, j] + alpha * priors[id_i, id_j, 0, 0]) / (n_samples + alpha))
            log_j_probs[j, i, 1, 1] = log_j_probs[i, j, 1, 1]
            log_j_probs[j, i, 1, 0] = log_j_probs[i, j, 0, 1]
            log_j_probs[j, i, 0, 1] = log_j_probs[i, j, 1, 0]
            log_j_probs[j, i, 0, 0] = log_j_probs[i, j, 0, 0]
    return (log_probs, log_j_probs)
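This kernel consumes precomputed count arrays rather than raw data: p[i] is the number of samples with feature i equal to 1 and cond[i, j] the co-occurrence count of ones, with the marginal counts on the diagonal (so cond[j, j] - cond[i, j] counts x_i = 0, x_j = 1). A sketch, under those assumptions, of how a caller could build them from a binary matrix X:

def cooccurrence_counts(X):
    # X: (n_samples, n_features) binary ndarray
    X = X.astype(np.float64)
    cond = X.T @ X             # cond[i, j] = #{x_i = 1 and x_j = 1}
    p = np.diag(cond).copy()   # p[i] = #{x_i = 1}
    return p, cond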
def mpe(self, evidence={}):
    """Most probable explanation at an OR node given partial evidence."""
    state_evidence = evidence.get(self.or_feature_scope)
    if state_evidence is not None:
        if state_evidence == 0:
            (mpe_state, mpe_log_proba) = self.left_child.mpe(evidence)
            mpe_state[self.or_feature_scope] = 0
            mpe_log_proba += logr(self.left_weight)
        else:
            (mpe_state, mpe_log_proba) = self.right_child.mpe(evidence)
            mpe_state[self.or_feature_scope] = 1
            mpe_log_proba += logr(self.right_weight)
    else:
        # the split feature is free: keep the better of the two branches
        (left_mpe_state, left_mpe_log_proba) = self.left_child.mpe(evidence)
        (right_mpe_state, right_mpe_log_proba) = self.right_child.mpe(evidence)
        if left_mpe_log_proba + logr(self.left_weight) > right_mpe_log_proba + logr(self.right_weight):
            mpe_state = left_mpe_state
            mpe_state[self.or_feature_scope] = 0
            mpe_log_proba = left_mpe_log_proba + logr(self.left_weight)
        else:
            mpe_state = right_mpe_state
            mpe_state[self.or_feature_scope] = 1
            mpe_log_proba = right_mpe_log_proba + logr(self.right_weight)
    return (mpe_state, mpe_log_proba)
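A usage sketch for MPE queries; csn stands for the root node of a trained network and the variable id is illustrative. Evidence maps a variable's scope id to its observed state:

evidence = {3: 1}                   # scope id -> observed state
state, log_p = csn.mpe(evidence)    # most probable completion of the evidence
print(state, np.exp(log_p))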
def naive_marginal(self, evidence={}):
    """Brute-force marginal: enumerate every completion of the evidence.

    Exponential in the number of unobserved variables; useful only as a
    correctness check on small models.
    """
    probm = 0.0
    M = {}
    for i in range(self.n_features):
        if evidence.get(i) is None:
            M[i] = [0, 1]
    A = [dict(zip(M, prod))
         for prod in itertools.product(*(M[param] for param in M))]
    for D in A:
        D.update(evidence)
        prob = self.log_factors[0, D[0], 0]
        for i in range(1, self.n_features):
            prob = prob + self.log_factors[i, D[i], D[self.tree[i]]]
        probm += np.exp(prob)
    return logr(probm)
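Because it enumerates every completion, naive_marginal can validate the message-passing marginal_inference on a small fitted tree. Note that naive_marginal keys evidence by feature position while marginal_inference keys it by scope id, so this check assumes a root tree, where scope[i] == i (clt is an illustrative name for a fitted Cltree):

evidence = {0: 1, 2: 0}
assert np.isclose(clt.marginal_inference(evidence),
                  clt.naive_marginal(evidence))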
def or_cut(self):
    """Try to replace the current leaf either with an OR node (a cut on a
    single feature) or with a sum node over two clusters, keeping the
    alternative with the largest likelihood gain."""
    # print(" > trying to cut ... ")
    sys.stdout.flush()
    found = False
    bestlik = self.orig_ll
    best_clt_l = None
    best_clt_r = None
    best_feature_cut = None
    best_left_weight = 0.0
    best_right_weight = 0.0
    best_right_data = None
    best_left_data = None
    best_v_ll = 0.0
    best_gain = -np.inf
    best_left_sample_weight = None
    best_right_sample_weight = None
    best_left_vdata = None
    best_right_vdata = None

    if self.sum_nodes:
        # check for clustering
        n_clusters = 2
        cov_type = 'tied'
        rand_gen = None
        n_iters = 1000
        n_restarts = 1
        # sklearn.mixture.GMM is the pre-0.18 scikit-learn API
        # (see the GaussianMixture sketch after this function)
        gmm_c = sklearn.mixture.GMM(n_components=n_clusters,
                                    covariance_type=cov_type,
                                    random_state=rand_gen,
                                    n_iter=n_iters,
                                    n_init=n_restarts)
        gmm_c.fit(self.data)
        clustering = gmm_c.predict(self.data)
        # prevent a cluster with zero instances
        cardinality = np.sum(clustering)
        # print(" - Clustering instances:", self.data.shape[0], "-",
        #       cardinality, self.data.shape[0] - cardinality, end=" ")
        if cardinality > 0 and (self.data.shape[0] - cardinality) > 0:
            cluster_0 = (clustering == 0)
            cluster_0_data = self.data[cluster_0]
            cluster_1_data = self.data[~cluster_0]
            cluster_0_tree = Cltree()
            cluster_1_tree = Cltree()
            cluster_0_weight = cluster_0_data.shape[0] / self.data.shape[0]
            cluster_1_weight = cluster_1_data.shape[0] / self.data.shape[0]
            cluster_0_tree.fit(cluster_0_data,
                               vdata=self.vdata,
                               m_priors=self.m_priors,
                               j_priors=self.j_priors,
                               scope=self.scope,
                               alpha=self.alpha * cluster_0_weight,
                               and_leaves=self.and_leaves,
                               sample_weight=None)
            cluster_1_tree.fit(cluster_1_data,
                               vdata=self.vdata,
                               m_priors=self.m_priors,
                               j_priors=self.j_priors,
                               scope=self.scope,
                               alpha=self.alpha * cluster_1_weight,
                               and_leaves=self.and_leaves,
                               sample_weight=None)
            cluster_0_ll = cluster_0_tree.score_samples_log_proba(
                cluster_0_data, sample_weight=None)
            cluster_1_ll = cluster_1_tree.score_samples_log_proba(
                cluster_1_data, sample_weight=None)
            # log sum exp
            clustering_ll = 0.0
            for d in self.data:
                clustering_ll = clustering_ll + logr(
                    cluster_0_weight * np.exp(cluster_0_tree.score_sample_log_proba(d)) +
                    cluster_1_weight * np.exp(cluster_1_tree.score_sample_log_proba(d)))
            clustering_ll = clustering_ll / self.data.shape[0]
            # print("ll:", clustering_ll)
        else:
            clustering_ll = -np.inf
    else:
        clustering_ll = -np.inf

    if self.random_forest:
        if self.d > self.node.cltree.n_features:
            selected = range(self.node.cltree.n_features)
        else:
            selected = sorted(
                random.sample(range(self.node.cltree.n_features), self.d))
    else:
        selected = range(self.node.cltree.n_features)

    for feature in selected:
        condition = self.data[:, feature] == 0
        new_features = np.ones(self.data.shape[1], dtype=bool)
        new_features[feature] = False
        left_data = self.data[condition, :][:, new_features]
        right_data = self.data[~condition, :][:, new_features]
        vdata_condition = self.vdata[:, feature] == 0
        left_vdata = self.vdata[vdata_condition, :][:, new_features]
        right_vdata = self.vdata[~vdata_condition, :][:, new_features]
        if self.sample_weight is not None:
            left_sample_weight = self.sample_weight[condition]
            right_sample_weight = self.sample_weight[~condition]
            left_weight = np.sum(left_sample_weight) / np.sum(self.sample_weight)
            right_weight = np.sum(right_sample_weight) / np.sum(self.sample_weight)
        else:
            left_sample_weight = None
            right_sample_weight = None
            left_weight = left_data.shape[0] / self.data.shape[0]
            right_weight = right_data.shape[0] / self.data.shape[0]
        if (left_data.shape[0] > 0 and right_data.shape[0] > 0
                and left_vdata.shape[0] > 0 and right_vdata.shape[0] > 0):
            left_scope = np.concatenate((self.node.cltree.scope[0:feature],
                                         self.node.cltree.scope[feature + 1:]))
            right_scope = np.concatenate((self.node.cltree.scope[0:feature],
                                          self.node.cltree.scope[feature + 1:]))
            CL_l = Cltree()
            CL_r = Cltree()
            CL_l.fit(left_data, self.m_priors, self.j_priors,
                     scope=left_scope,
                     alpha=self.alpha * left_weight,
                     and_leaves=self.and_leaves,
                     sample_weight=left_sample_weight,
                     noise=self.noise)
            CL_r.fit(right_data, self.m_priors, self.j_priors,
                     scope=right_scope,
                     alpha=self.alpha * right_weight,
                     and_leaves=self.and_leaves,
                     sample_weight=right_sample_weight,
                     noise=self.noise)
            l_ll = CL_l.score_samples_log_proba(left_data, sample_weight=left_sample_weight)
            r_ll = CL_r.score_samples_log_proba(right_data, sample_weight=right_sample_weight)
            if self.sample_weight is not None:
                ll = ((l_ll + logr(left_weight)) * np.sum(left_sample_weight) +
                      (r_ll + logr(right_weight)) * np.sum(right_sample_weight)) / np.sum(self.sample_weight)
            else:
                ll = ((l_ll + logr(left_weight)) * left_data.shape[0] +
                      (r_ll + logr(right_weight)) * right_data.shape[0]) / self.data.shape[0]
        else:
            ll = -np.inf
        if ll > bestlik:
            bestlik = ll
            best_clt_l = CL_l
            best_clt_r = CL_r
            best_feature_cut = feature
            best_left_weight = left_weight
            best_right_weight = right_weight
            best_right_data = right_data
            best_left_data = left_data
            best_right_vdata = right_vdata
            best_left_vdata = left_vdata
            best_l_ll = l_ll
            best_r_ll = r_ll
            best_left_sample_weight = left_sample_weight
            best_right_sample_weight = right_sample_weight
            found = True

    """
    if (self.depth+1) % 2 == 0:
        bestlik = self.orig_ll
    else:
        clustering_ll = self.orig_ll
    """
    gain = bestlik - self.orig_ll
    # print(" - gain cut:", gain, end="")
    gain_c = clustering_ll - self.orig_ll
    # print(" gain clustering:", gain_c)
    if (found and gain > self.min_gain) or (gain_c > gain and gain_c > self.min_gain):
        if gain > gain_c:
            self.node = OrNode()
            Csn._or_nodes = Csn._or_nodes + 1
            Csn._or_edges = Csn._or_edges + 2
            self.node.or_feature = best_feature_cut
            # print(" - cutting on feature ", self.node.or_feature,
            #       "[#l:", best_left_data.shape[0], ", #r:", best_right_data.shape[0],
            #       "], gain:", bestlik - self.orig_ll)
            instances = self.data.shape[0]
            self.node.left_weight = best_left_weight
            self.node.right_weight = best_right_weight
            # free memory before recursing
            self.free_memory()
            self.node.left_child = Csn(
                data=best_left_data, vdata=best_left_vdata, clt=best_clt_l,
                ll=best_l_ll, min_instances=self.min_instances,
                min_features=self.min_features,
                alpha=self.alpha * best_left_weight, d=self.d,
                random_forest=self.random_forest,
                m_priors=self.m_priors, j_priors=self.j_priors,
                n_original_samples=self.n_original_samples,
                and_leaves=self.and_leaves, and_inners=self.and_inners,
                min_gain=self.min_gain, depth=self.depth + 1,
                sample_weight=best_left_sample_weight,
                forest_approach=self.forest_approach, noise=self.noise)
            self.node.right_child = Csn(
                data=best_right_data, vdata=best_right_vdata, clt=best_clt_r,
                ll=best_r_ll, min_instances=self.min_instances,
                min_features=self.min_features,
                alpha=self.alpha * best_right_weight, d=self.d,
                random_forest=self.random_forest,
                m_priors=self.m_priors, j_priors=self.j_priors,
                n_original_samples=self.n_original_samples,
                and_leaves=self.and_leaves, and_inners=self.and_inners,
                min_gain=self.min_gain, depth=self.depth + 1,
                sample_weight=best_right_sample_weight,
                forest_approach=self.forest_approach, noise=self.noise)
        else:
            self.node = SumNode()
            # print(" - Adding a sum node")
            Csn._sum_nodes = Csn._sum_nodes + 1
            instances = self.data.shape[0]
            self.node.weights.append(cluster_0_weight)
            self.node.weights.append(cluster_1_weight)
            # free memory before recursing
            self.free_memory()
            self.node.children.append(Csn(
                data=cluster_0_data, vdata=None, clt=cluster_0_tree,
                ll=cluster_0_ll, min_instances=self.min_instances,
                min_features=self.min_features,
                alpha=self.alpha * cluster_0_weight, d=self.d,
                random_forest=self.random_forest,
                m_priors=self.m_priors, j_priors=self.j_priors,
                n_original_samples=self.n_original_samples,
                and_leaves=self.and_leaves, and_inners=self.and_inners,
                min_gain=self.min_gain, depth=self.depth + 1,
                sample_weight=None,
                forest_approach=self.forest_approach, noise=self.noise))
            self.node.children.append(Csn(
                data=cluster_1_data, vdata=None, clt=cluster_1_tree,
                ll=cluster_1_ll, min_instances=self.min_instances,
                min_features=self.min_features,
                alpha=self.alpha * cluster_1_weight, d=self.d,
                random_forest=self.random_forest,
                m_priors=self.m_priors, j_priors=self.j_priors,
                n_original_samples=self.n_original_samples,
                and_leaves=self.and_leaves, and_inners=self.and_inners,
                min_gain=self.min_gain, depth=self.depth + 1,
                sample_weight=None,
                forest_approach=self.forest_approach, noise=self.noise))
    else:
        # Make a forest
        if self.and_leaves:
            self.node.cltree.makeForest(
                vdata=self.vdata, forest_approach=self.forest_approach)
        # print(" no cutting")
        # if self.node.cltree.is_forest():
        #     print(" -> Forest with", self.node.cltree.num_trees, "trees")
        # else:
        #     print(" -> Tree")
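sklearn.mixture.GMM is the pre-0.18 scikit-learn API and was removed in 0.20; on current versions the clustering step above needs sklearn.mixture.GaussianMixture, where n_iter is renamed max_iter. A sketch of the equivalent call (data stands for the node's data matrix):

from sklearn.mixture import GaussianMixture

gmm_c = GaussianMixture(n_components=2, covariance_type='tied',
                        random_state=None, max_iter=1000, n_init=1)
gmm_c.fit(data)
clustering = gmm_c.predict(data)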
def and_cut(self):
    """Try to split each tree of the current forest with an OR cut,
    turning the node into an AND node when at least one cut improves
    the log likelihood."""
    n_features = self.data.shape[1]
    self.forest = np.zeros(n_features, dtype=int)
    self.roots = []
    # naive approach to build the tree_forest
    for i in range(n_features):
        if self.node.cltree.tree[i] == -1:
            self.roots.append(i)
    for i in range(n_features):
        if self.node.cltree.tree[i] != -1:
            parent = self.node.cltree.tree[i]
            while self.node.cltree.tree[parent] != -1:
                parent = self.node.cltree.tree[parent]
            self.forest[i] = parent
        else:
            self.forest[i] = i
    self.tree_forest = []
    for r in self.roots:
        t_forest = []
        for i in range(n_features):
            if self.forest[i] == r:
                t_forest.append(i)
        self.tree_forest.append(t_forest)
    # print("AND node")
    # print(self.tree_forest)

    for i in range(self.node.cltree.num_trees):
        # print(" tree", self.tree_forest[i])
        sys.stdout.flush()
        tree_n_features = len(self.tree_forest[i])
        if self.data.shape[0] > self.min_instances:
            if tree_n_features >= self.min_features:
                tree_data = self.data[:, self.tree_forest[i]]
                found = False
                orig_ll = self.node.cltree.score_samples_scope_log_proba(
                    self.data, self.tree_forest[i])
                bestlik = orig_ll
                best_clt_l = None
                best_clt_r = None
                best_feature_cut = None
                best_left_weight = 0.0
                best_right_weight = 0.0
                best_right_data = None
                best_left_data = None
                best_v_ll = 0.0
                best_gain = -np.inf
                if self.random_forest:
                    if self.d > tree_n_features:
                        selected = range(tree_n_features)
                    else:
                        selected = sorted(
                            random.sample(range(tree_n_features), self.d))
                else:
                    selected = range(tree_n_features)
                for feature in selected:
                    condition = tree_data[:, feature] == 0
                    new_features = np.ones(tree_data.shape[1], dtype=bool)
                    new_features[feature] = False
                    left_data = tree_data[condition, :][:, new_features]
                    right_data = tree_data[~condition, :][:, new_features]
                    left_weight = left_data.shape[0] / tree_data.shape[0]
                    right_weight = right_data.shape[0] / tree_data.shape[0]
                    if self.sample_weight is not None:
                        left_sample_weight = self.sample_weight[condition]
                        right_sample_weight = self.sample_weight[~condition]
                    else:
                        left_sample_weight = None
                        right_sample_weight = None
                    if left_data.shape[0] > 1 and right_data.shape[0] > 1:
                        # compute the tree features id
                        tree_scope = np.zeros(tree_n_features, dtype=int)
                        for f in range(tree_n_features):
                            tree_scope[f] = self.node.cltree.scope[self.tree_forest[i][f]]
                        left_scope = np.concatenate((tree_scope[0:feature],
                                                     tree_scope[feature + 1:]))
                        right_scope = np.concatenate((tree_scope[0:feature],
                                                      tree_scope[feature + 1:]))
                        CL_l = Cltree()
                        CL_r = Cltree()
                        CL_l.fit(left_data,
                                 vdata=self.vdata,
                                 m_priors=self.m_priors,
                                 j_priors=self.j_priors,
                                 scope=left_scope,
                                 alpha=self.alpha * left_weight,
                                 and_leaves=self.and_leaves,
                                 sample_weight=left_sample_weight)
                        CL_r.fit(right_data,
                                 vdata=self.vdata,
                                 m_priors=self.m_priors,
                                 j_priors=self.j_priors,
                                 scope=right_scope,
                                 alpha=self.alpha * right_weight,
                                 and_leaves=self.and_leaves,
                                 sample_weight=right_sample_weight)
                        l_ll = CL_l.score_samples_log_proba(left_data)
                        r_ll = CL_r.score_samples_log_proba(right_data)
                        ll = ((l_ll + logr(left_weight)) * left_data.shape[0] +
                              (r_ll + logr(right_weight)) * right_data.shape[0]) / self.data.shape[0]
                    else:
                        ll = -np.inf
                    if ll > bestlik:
                        bestlik = ll
                        best_clt_l = CL_l
                        best_clt_r = CL_r
                        best_feature_cut = feature
                        best_left_weight = left_weight
                        best_right_weight = right_weight
                        best_right_data = right_data
                        best_left_data = left_data
                        best_l_ll = l_ll
                        best_r_ll = r_ll
                        best_left_sample_weight = left_sample_weight
                        best_right_sample_weight = right_sample_weight
                        found = True
                gain = bestlik - orig_ll
                # print(" gain:", gain, end=" ")
                # if gain <= self.min_gain: print("no improvement")
                if found and gain > self.min_gain:
                    if not is_and_node(self.node):
                        clt = self.node.cltree
                        self.node = AndNode()
                        self.node.cltree = clt
                        self.node.children_left = [None] * self.node.cltree.num_trees
                        self.node.children_right = [None] * self.node.cltree.num_trees
                        self.node.or_features = [None] * self.node.cltree.num_trees
                        self.node.left_weights = [None] * self.node.cltree.num_trees
                        self.node.right_weights = [None] * self.node.cltree.num_trees
                        self.node.tree_forest = self.tree_forest
                    Csn._or_nodes = Csn._or_nodes + 1
                    Csn._or_edges = Csn._or_edges + 2
                    self.node.or_features[i] = best_feature_cut
                    # print(" cutting on feature ", self.node.or_features[i])
                    instances = self.data.shape[0]
                    self.node.left_weights[i] = best_left_weight
                    self.node.right_weights[i] = best_right_weight
                    self.node.children_left[i] = Csn(
                        data=best_left_data, vdata=self.vdata, clt=best_clt_l,
                        ll=best_l_ll, min_instances=self.min_instances,
                        min_features=self.min_features,
                        alpha=self.alpha * best_left_weight, d=self.d,
                        random_forest=self.random_forest,
                        m_priors=self.m_priors, j_priors=self.j_priors,
                        n_original_samples=self.n_original_samples,
                        and_leaves=self.and_leaves, and_inners=self.and_inners,
                        min_gain=self.min_gain, depth=self.depth + 1,
                        sample_weight=best_left_sample_weight)
                    self.node.children_right[i] = Csn(
                        data=best_right_data, vdata=self.vdata, clt=best_clt_r,
                        ll=best_r_ll, min_instances=self.min_instances,
                        min_features=self.min_features,
                        alpha=self.alpha * best_right_weight, d=self.d,
                        random_forest=self.random_forest,
                        m_priors=self.m_priors, j_priors=self.j_priors,
                        n_original_samples=self.n_original_samples,
                        and_leaves=self.and_leaves, and_inners=self.and_inners,
                        min_gain=self.min_gain, depth=self.depth + 1,
                        sample_weight=best_right_sample_weight)
            # else: print(" > no cutting due to few features")
        # else: print(" > no cutting due to few instances")

    if is_and_node(self.node):
        Csn._and_nodes += 1
    # free memory before recursing
    self.free_memory()
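The roots-times-features grouping loops at the top of and_cut can be collapsed into a single pass over the forest array; a behaviorally equivalent sketch with collections.defaultdict, under the same self.forest and self.roots as above:

from collections import defaultdict

groups = defaultdict(list)
for i, root in enumerate(self.forest):
    groups[root].append(i)  # feature indices stay sorted within each tree
self.tree_forest = [groups[r] for r in self.roots]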
def or_cut(self):
    """Try to cut on a feature (OR node) or to cluster the instances
    (sum node), keeping the alternative with the best gain."""
    print(" > trying to cut ... ")
    sys.stdout.flush()
    found = False
    bestlik = self.orig_ll
    best_clt_l = None
    best_clt_r = None
    best_feature_cut = None
    best_left_weight = 0.0
    best_right_weight = 0.0
    best_right_data = None
    best_left_data = None
    best_v_ll = 0.0
    best_gain = -np.inf
    best_left_sample_weight = None
    best_right_sample_weight = None

    if self.sum_nodes:
        # check for clustering
        n_clusters = 2
        cov_type = 'tied'
        rand_gen = None
        n_iters = 1000
        n_restarts = 1
        gmm_c = sklearn.mixture.GMM(n_components=n_clusters,
                                    covariance_type=cov_type,
                                    random_state=rand_gen,
                                    n_iter=n_iters,
                                    n_init=n_restarts)
        gmm_c.fit(self.data)
        clustering = gmm_c.predict(self.data)
        # prevent a cluster with zero instances
        cardinality = np.sum(clustering)
        print(" - Clustering instances:", self.data.shape[0], "-",
              cardinality, self.data.shape[0] - cardinality, end=" ")
        if cardinality > 0 and (self.data.shape[0] - cardinality) > 0:
            cluster_0 = (clustering == 0)
            cluster_0_data = self.data[cluster_0]
            cluster_1_data = self.data[~cluster_0]
            cluster_0_tree = Cltree()
            cluster_1_tree = Cltree()
            cluster_0_weight = cluster_0_data.shape[0] / self.data.shape[0]
            cluster_1_weight = cluster_1_data.shape[0] / self.data.shape[0]
            cluster_0_tree.fit(cluster_0_data, self.m_priors, self.j_priors,
                               scope=self.scope,
                               alpha=self.alpha * cluster_0_weight,
                               and_leaves=self.and_leaves,
                               sample_weight=None)
            cluster_1_tree.fit(cluster_1_data, self.m_priors, self.j_priors,
                               scope=self.scope,
                               alpha=self.alpha * cluster_1_weight,
                               and_leaves=self.and_leaves,
                               sample_weight=None)
            cluster_0_ll = cluster_0_tree.score_samples_log_proba(
                cluster_0_data, sample_weight=None)
            cluster_1_ll = cluster_1_tree.score_samples_log_proba(
                cluster_1_data, sample_weight=None)
            # log sum exp
            clustering_ll = 0.0
            for d in self.data:
                clustering_ll = clustering_ll + logr(
                    cluster_0_weight * np.exp(cluster_0_tree.score_sample_log_proba(d)) +
                    cluster_1_weight * np.exp(cluster_1_tree.score_sample_log_proba(d)))
            clustering_ll = clustering_ll / self.data.shape[0]
            print("ll:", clustering_ll)
        else:
            clustering_ll = -np.inf
    else:
        clustering_ll = -np.inf

    # features whose variable is not constrained to stay in a leaf
    cutting_features = []
    for f in range(self.node.cltree.n_features):
        if self.scope[f] not in self.leaf_vars:
            cutting_features.append(f)

    if self.random_forest:
        if self.d > len(cutting_features):
            selected = cutting_features
        else:
            selected = sorted(random.sample(cutting_features, self.d))
    else:
        selected = cutting_features

    PQ = []
    ll = 0.0
    CL_l = None
    CL_r = None
    feature = None
    left_weight = 0.0
    right_weight = 0.0
    left_data = None
    right_data = None
    l_ll = 0.0
    r_ll = 0.0
    left_sample_weight = None
    right_sample_weight = None
    for feature in selected:
        condition = self.data[:, feature] == 0
        new_features = np.ones(self.data.shape[1], dtype=bool)
        new_features[feature] = False
        left_data = self.data[condition, :][:, new_features]
        right_data = self.data[~condition, :][:, new_features]
        if self.sample_weight is not None:
            left_sample_weight = self.sample_weight[condition]
            right_sample_weight = self.sample_weight[~condition]
            left_weight = np.sum(left_sample_weight) / np.sum(self.sample_weight)
            right_weight = np.sum(right_sample_weight) / np.sum(self.sample_weight)
        else:
            left_sample_weight = None
            right_sample_weight = None
            left_weight = left_data.shape[0] / self.data.shape[0]
            right_weight = right_data.shape[0] / self.data.shape[0]
        if left_data.shape[0] > 0 and right_data.shape[0] > 0:
            left_scope = np.concatenate((self.node.cltree.scope[0:feature],
                                         self.node.cltree.scope[feature + 1:]))
            right_scope = np.concatenate((self.node.cltree.scope[0:feature],
                                          self.node.cltree.scope[feature + 1:]))
            CL_l = Cltree()
            CL_r = Cltree()
            CL_l.fit(left_data, self.m_priors, self.j_priors,
                     scope=left_scope,
                     alpha=self.alpha * left_weight,
                     and_leaves=self.and_leaves,
                     sample_weight=left_sample_weight,
                     multilabel=self.multilabel,
                     n_labels=self.n_labels,
                     ml_tree_structure=self.ml_tree_structure)
            CL_r.fit(right_data, self.m_priors, self.j_priors,
                     scope=right_scope,
                     alpha=self.alpha * right_weight,
                     and_leaves=self.and_leaves,
                     sample_weight=right_sample_weight,
                     multilabel=self.multilabel,
                     n_labels=self.n_labels,
                     ml_tree_structure=self.ml_tree_structure)
            l_ll = CL_l.score_samples_log_proba(left_data, sample_weight=left_sample_weight)
            r_ll = CL_r.score_samples_log_proba(right_data, sample_weight=right_sample_weight)
            if self.sample_weight is not None:
                ll = ((l_ll + logr(left_weight)) * np.sum(left_sample_weight) +
                      (r_ll + logr(right_weight)) * np.sum(right_sample_weight)) / np.sum(self.sample_weight)
            else:
                ll = ((l_ll + logr(left_weight)) * left_data.shape[0] +
                      (r_ll + logr(right_weight)) * right_data.shape[0]) / self.data.shape[0]
        else:
            ll = -np.inf
        # keep (at most) the three best cuts, ordered by log likelihood
        if len(PQ) == 0:
            PQ.append((ll, CL_l, CL_r, feature, left_weight, right_weight,
                       left_data, right_data, l_ll, r_ll,
                       left_sample_weight, right_sample_weight))
        else:
            for e in range(len(PQ)):
                if PQ[e][0] < ll:
                    PQ.insert(e, (ll, CL_l, CL_r, feature, left_weight,
                                  right_weight, left_data, right_data,
                                  l_ll, r_ll, left_sample_weight,
                                  right_sample_weight))
                    break
            if len(PQ) > 3:
                PQ.pop()
        if ll > bestlik:
            bestlik = ll
            best_clt_l = CL_l
            best_clt_r = CL_r
            best_feature_cut = feature
            best_left_weight = left_weight
            best_right_weight = right_weight
            best_right_data = right_data
            best_left_data = left_data
            best_l_ll = l_ll
            best_r_ll = r_ll
            best_left_sample_weight = left_sample_weight
            best_right_sample_weight = right_sample_weight
            found = True

    gain = bestlik - self.orig_ll
    print(" - gain cut:", gain, end="")
    gain_c = clustering_ll - self.orig_ll
    print(" gain clustering:", gain_c)
    if (found and gain > self.min_gain) or (gain_c > gain and gain_c > self.min_gain):
        PQ = []
        # note: PQ has just been cleared, so this branch is currently disabled
        if self.depth < 4 and len(PQ) > 1:
            # Csn._sum_nodes = Csn._sum_nodes + 1
            instances = self.data.shape[0]
            sum_w = 0.0
            for i in range(len(PQ)):
                sum_w += np.exp(PQ[i][0])
                # sum_w += PQ[i][0]
            for i in range(len(PQ)):
                self.node.weights.append(np.exp(PQ[i][0]) / sum_w)
                # self.node.weights.append(PQ[i][0]/sum_w)
                # self.node.weights.append(1/len(PQ))
                self.node.children.append(OrNode())
                (pq_ll, pq_CL_l, pq_CL_r, pq_feature, pq_left_weight,
                 pq_right_weight, pq_left_data, pq_right_data, pq_l_ll,
                 pq_r_ll, pq_left_sample_weight,
                 pq_right_sample_weight) = PQ[i]
                self.node.children[i].or_feature_scope = self.scope[pq_feature]
                self.node.children[i].or_feature = pq_feature
                instances = self.data.shape[0]
                self.node.children[i].left_weight = pq_left_weight
                self.node.children[i].right_weight = pq_right_weight
                self.node.children[i].left_child = Csn(
                    data=pq_left_data, clt=pq_CL_l, ll=pq_l_ll,
                    min_instances=self.min_instances,
                    min_features=self.min_features,
                    alpha=self.alpha * pq_left_weight, d=self.d,
                    random_forest=self.random_forest,
                    leaf_vars=self.leaf_vars,
                    m_priors=self.m_priors, j_priors=self.j_priors,
                    n_original_samples=self.n_original_samples,
                    and_leaves=self.and_leaves, and_inners=self.and_inners,
                    min_gain=self.min_gain, depth=self.depth + 1,
                    sample_weight=pq_left_sample_weight,
                    multilabel=self.multilabel, n_labels=self.n_labels,
                    ml_tree_structure=self.ml_tree_structure)
                self.node.children[i].right_child = Csn(
                    data=pq_right_data, clt=pq_CL_r, ll=pq_r_ll,
                    min_instances=self.min_instances,
                    min_features=self.min_features,
                    alpha=self.alpha * pq_right_weight, d=self.d,
                    random_forest=self.random_forest,
                    leaf_vars=self.leaf_vars,
                    m_priors=self.m_priors, j_priors=self.j_priors,
                    n_original_samples=self.n_original_samples,
                    and_leaves=self.and_leaves, and_inners=self.and_inners,
                    min_gain=self.min_gain, depth=self.depth + 1,
                    sample_weight=pq_right_sample_weight,
                    multilabel=self.multilabel, n_labels=self.n_labels,
                    ml_tree_structure=self.ml_tree_structure)
        elif gain > gain_c:
            self.node = OrNode()
            self.node.or_feature_scope = self.scope[best_feature_cut]
            Csn._or_nodes = Csn._or_nodes + 1
            Csn._or_edges = Csn._or_edges + 2
            self.node.or_feature = best_feature_cut
            print(" - cutting on feature ", self.node.or_feature,
                  "[#l:", best_left_data.shape[0],
                  ", #r:", best_right_data.shape[0],
                  "], gain:", bestlik - self.orig_ll)
            instances = self.data.shape[0]
            self.node.left_weight = best_left_weight
            self.node.right_weight = best_right_weight
            # free memory before recursing
            self.free_memory()
            self.node.left_child = Csn(
                data=best_left_data, clt=best_clt_l, ll=best_l_ll,
                min_instances=self.min_instances,
                min_features=self.min_features,
                alpha=self.alpha * best_left_weight, d=self.d,
                random_forest=self.random_forest,
                leaf_vars=self.leaf_vars,
                m_priors=self.m_priors, j_priors=self.j_priors,
                n_original_samples=self.n_original_samples,
                and_leaves=self.and_leaves, and_inners=self.and_inners,
                min_gain=self.min_gain, depth=self.depth + 1,
                sample_weight=best_left_sample_weight,
                multilabel=self.multilabel, n_labels=self.n_labels,
                ml_tree_structure=self.ml_tree_structure)
            self.node.right_child = Csn(
                data=best_right_data, clt=best_clt_r, ll=best_r_ll,
                min_instances=self.min_instances,
                min_features=self.min_features,
                alpha=self.alpha * best_right_weight, d=self.d,
                random_forest=self.random_forest,
                leaf_vars=self.leaf_vars,
                m_priors=self.m_priors, j_priors=self.j_priors,
                n_original_samples=self.n_original_samples,
                and_leaves=self.and_leaves, and_inners=self.and_inners,
                min_gain=self.min_gain, depth=self.depth + 1,
                sample_weight=best_right_sample_weight,
                multilabel=self.multilabel, n_labels=self.n_labels,
                ml_tree_structure=self.ml_tree_structure)
        else:
            self.node = SumNode()
            print(" - Adding a sum node")
            Csn._sum_nodes = Csn._sum_nodes + 1
            instances = self.data.shape[0]
            self.node.weights.append(cluster_0_weight)
            self.node.weights.append(cluster_1_weight)
            # free memory before recursing
            self.free_memory()
            self.node.children.append(Csn(
                data=cluster_0_data, clt=cluster_0_tree, ll=cluster_0_ll,
                min_instances=self.min_instances,
                min_features=self.min_features,
                alpha=self.alpha * cluster_0_weight, d=self.d,
                random_forest=self.random_forest,
                m_priors=self.m_priors, j_priors=self.j_priors,
                n_original_samples=self.n_original_samples,
                and_leaves=self.and_leaves, and_inners=self.and_inners,
                min_gain=self.min_gain, depth=self.depth + 1,
                sample_weight=None))
            self.node.children.append(Csn(
                data=cluster_1_data, clt=cluster_1_tree, ll=cluster_1_ll,
                min_instances=self.min_instances,
                min_features=self.min_features,
                alpha=self.alpha * cluster_1_weight, d=self.d,
                random_forest=self.random_forest,
                m_priors=self.m_priors, j_priors=self.j_priors,
                n_original_samples=self.n_original_samples,
                and_leaves=self.and_leaves, and_inners=self.and_inners,
                min_gain=self.min_gain, depth=self.depth + 1,
                sample_weight=None))
    else:
        print(" no cutting")
        if self.node.cltree.is_forest():
            print(" -> Forest with", self.node.cltree.num_trees, "trees")
        else:
            print(" -> Tree")
def fit(self, X, m_priors, j_priors, alpha=1.0, sample_weight=None,
        scope=None, and_leaves=False, multilabel=False, n_labels=0,
        ml_tree_structure=0):
    """Fit the model to the data.

    Parameters
    ----------
    X : ndarray, shape=(n, m)
        The data array.
    m_priors : ndarray
        The marginal priors for each feature.
    j_priors : ndarray
        The joint priors for each pair of features.
    alpha : float, default=1.0
        The smoothing constant.
    sample_weight : ndarray, shape=(n,)
        The weight of each sample.
    scope : ndarray
        Unique identifiers for the features.
    and_leaves : boolean, default=False
    multilabel : boolean, default=False
        Whether the Cltree is used for multilabel classification problems
        when imported by mlcsn.py.
    n_labels : integer, default=0
        For multilabel classification problems, the number of labels,
        assumed to occupy the last n_labels columns of X.
    ml_tree_structure : integer, default=0
        For multilabel classification problems, the structure of the tree
        to be learned. The set of features F is the union of A (the
        attributes) and Y (the labels):
        - 0, no constraint on the resulting tree;
        - 1, each variable in Y must have its parent in Y, while each
          variable in A can have its parent in A or in Y: a label depends
          on a label, an attribute on a label or on an attribute;
        - 2, each variable in Y must have its parent in Y, and each
          variable in A must have its parent in Y: a label depends on a
          label, an attribute on a label.
    """
    self.alpha = alpha
    self.and_leaves = and_leaves
    self.n_features = X.shape[1]

    rootTree = False
    if scope is None:
        self.scope = np.array([i for i in range(self.n_features)])
        rootTree = True
    else:
        self.scope = scope

    if sample_weight is None:
        self.n_samples = X.shape[0]
    else:
        self.n_samples = np.sum(sample_weight)

    (log_probs, log_j_probs) = self.compute_log_probs(X, sample_weight,
                                                      m_priors, j_priors)

    MI = self.cMI(log_probs, log_j_probs)
    if multilabel:
        if ml_tree_structure == 1:
            MI[-n_labels:, -n_labels:] += np.max(MI)
        elif ml_tree_structure == 2:
            MI[-n_labels:, -n_labels:] += np.max(MI)
            MI[:-n_labels, :-n_labels] = 0
        elif ml_tree_structure == 3:
            MI[:-n_labels, :-n_labels] = 0

    # the tree is represented as a sequence of parents
    mst = minimum_spanning_tree(-(MI))
    dfs_tree = depth_first_order(mst, directed=False, i_start=0)
    self.df_order = dfs_tree[0]
    self.post_order = dfs_tree[0][::-1]
    self.tree = np.zeros(self.n_features, dtype=int)
    self.tree[0] = -1
    for p in range(1, self.n_features):
        self.tree[p] = dfs_tree[1][p]

    penalization = logr(X.shape[0]) / (2 * X.shape[0])
    if self.and_leaves:
        # cut edges whose mutual information does not beat the MDL
        # penalty, turning the tree into a forest
        for p in range(1, self.n_features):
            if MI[self.tree[p], p] < penalization:
                self.tree[p] = -1
                self.num_trees = self.num_trees + 1
    if self.num_trees > 1:
        self._forest = True

    """
    selected_MI = []
    for p in range(1, self.n_features):
        selected_MI.append((p, MI[self.tree[p], p]))
    selected_MI.sort(key=lambda mi: mi[1], reverse=True)
    for p in range(10, self.n_features - 1):
        self.tree[selected_MI[p][0]] = -1
    """

    if multilabel and rootTree:
        pX = 0
        for i in range(self.n_features - n_labels):
            if self.tree[i] >= (self.n_features - n_labels):
                pX += 1
        pY = 0
        for i in range(self.n_features - n_labels, self.n_features):
            if self.tree[i] >= (self.n_features - n_labels):
                pY += 1
        print("Xs with Y parent: ", pX)
        print("Ys with Y parent: ", pY)

    self.num_edges = self.n_features - self.num_trees

    # computing the factored representation
    self.log_factors = np.zeros((self.n_features, 2, 2))
    self.log_factors = compute_log_factors(self.tree, self.n_features,
                                           log_probs, log_j_probs,
                                           self.log_factors)
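A usage sketch for fitting a single Chow-Liu tree on toy binary data; the uniform prior arrays are illustrative and only match the shapes the method indexes (m_priors[id, state] and j_priors[id_i, id_j, s_i, s_j]):

X = np.random.randint(0, 2, size=(500, 8))   # toy binary dataset
n = X.shape[1]
m_priors = np.full((n, 2), 0.5)              # uniform marginal priors
j_priors = np.full((n, n, 2, 2), 0.25)       # uniform joint priors

clt = Cltree()
clt.fit(X, m_priors, j_priors, alpha=1.0)
print(clt.score_samples_log_proba(X))        # average train log likelihood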