def get_best_split(self, features, target, root, node, father, side):
    """Propose at most one further split for the rows that fall into ``node``.

    The rows are looked up with ``get_selector_for_node``. No split is
    proposed when fewer than 10 rows are selected, or when a single target
    column carries (within 1e-8) the entire target mass of those rows.

    Args:
        features: Feature data, forwarded to ``get_selector_for_node`` and
            ``find_best_split``.
        target: Target array; it is indexed with the selector and summed
            along axis 0 (presumably one-hot rows -- confirm with callers).
        root: Tree root, forwarded to ``get_selector_for_node`` and stored in
            the returned ``SplitLocation``.
        node: Node whose rows are examined.
        father: Stored in the returned ``SplitLocation``.
        side: Stored in the returned ``SplitLocation``.

    Returns:
        An ``mc.List`` that is either empty or holds one ``SplitLocation``.
    """
    selector = get_selector_for_node(root, node, features)

    # Check the size first: with an empty selector the ratio below would be
    # 0 / 0 (NumPy RuntimeWarning, NaN result) even though such a node is
    # rejected anyway.
    if len(selector) < 10:
        return mc.List([])

    node_target = target[selector]
    positive_target_ratio = np.sum(node_target, axis=0) / np.sum(node_target)

    # One column holding (almost) all of the mass: nothing left to separate.
    if (positive_target_ratio > 1 - 1e-8).any():
        return mc.List([])

    split = find_best_split(features, target, selector, kind=self.kind)
    return mc.List([SplitLocation(split, side, father, root)])
def get_best_split(self, features, target, root, node, father, side):
    """Propose at most one beta-scored split for the rows that fall into ``node``.

    A candidate is only searched for when ``get_selector_for_node`` selects
    at least 10 rows, and it is only returned when its ``score`` is strictly
    positive.

    Returns:
        An ``mc.List`` that is either empty or holds one
        ``BetaSplitLocation`` built from the candidate, ``side``, ``father``
        and ``root``.
    """
    node_rows = get_selector_for_node(root, node, features)

    if len(node_rows) >= 10:
        candidate = find_best_split_beta(features, target, node_rows, self.cfg)
        if candidate.score > 0:
            return mc.List(
                [BetaSplitLocation(candidate, side, father, root)])

    # Too few rows, or no candidate with a positive score.
    return mc.List([])
def get_path_model(model, columns=None):
    """Return the reduced path leading to the last leaf when ordered by ``proba[0]``.

    ``model.tree_initial`` is converted with ``jsonify`` and annotated by
    ``set_fathers``. Leaves whose ``proba[0]`` is NaN are discarded, the
    rest are ordered by ``proba[0]`` and the last one is selected
    (presumably the highest value, assuming ``sorted`` orders ascending).
    Its path from ``node_to_path`` is mapped through ``drop_fr_dic`` and
    then handed to ``path_redundancy_removal``.

    Args:
        model: Object exposing a ``tree_initial`` attribute.
        columns: Forwarded unchanged to ``jsonify``.

    Returns:
        Whatever ``path_redundancy_removal`` returns for the selected path.
    """
    tree = jsonify(model.tree_initial, columns)
    set_fathers(tree)

    usable_leaves = mc.List(get_all_leaves(tree)).filter(
        lambda x: not np.isnan(x["proba"][0]))
    ranked_leaves = usable_leaves.sorted(key=lambda x: x["proba"][0])

    # Indexing fails here if every leaf was filtered out; that is left as is.
    chosen_leaf = ranked_leaves[-1]

    raw_path = mc.List(node_to_path(chosen_leaf)).map(drop_fr_dic)
    return path_redundancy_removal(raw_path)
def fit(self, features, target, sample_weight=None):
    """Grow the tree on ``features`` / ``target`` using beta-scored splits.

    The first split is searched over all rows and becomes
    ``self.tree_initial``. Candidate splits for further nodes are kept in a
    ``PriorityQueue``; up to ``self.max_depth`` candidates are popped, and
    each one that passes ``emptiness_check`` is applied and its new node's
    candidates are pushed back onto the queue.

    Args:
        features: Feature data; ``features.shape[0]`` is used as row count.
        target: 2-D target with exactly two columns (asserted below).
        sample_weight: Forwarded to ``split.apply`` only.

    Side effects:
        Sets ``self.target_dim``, ``self.target_dtype`` and
        ``self.tree_initial``. Prints progress when ``self.verbosity > 0``.
    """
    self.target_dim = target.shape[1]
    assert self.target_dim == 2, "only target dim 2 supported, one hot encoded"
    self.target_dtype = target.dtype

    split = find_best_split_beta(features, target,
                                 np.arange(target.shape[0]), self.cfg)
    if self.verbosity > 0:
        print(split)

    self.tree_initial = get_basic_thump(split.feature_num, split.value, 1,
                                        target, features,
                                        np.arange(features.shape[0]))
    pq = PriorityQueue(
        self.get_splits_for_node(self.tree_initial, self.tree_initial,
                                 features, target))

    # This method never assigns ``self.pvalue``; only print it when the
    # attribute exists so that verbose mode cannot raise AttributeError.
    if self.verbosity > 0 and hasattr(self, "pvalue"):
        print(self.pvalue)

    for _ in range(self.max_depth):
        if pq.empty():
            break
        split = pq.pop()
        if self.verbosity > 0:
            print("\nn split is", str(split.split), end="\n")
            print(
                "\nqueue is ",
                mc.List(pq.container[0:3]).map(lambda x: x.split).map(
                    str).mk_string())
        if split.emptiness_check(features):
            new_node = split.apply(features, target, sample_weight)
            pq.push_list(
                self.get_splits_for_node(self.tree_initial, new_node,
                                         features, target))
def get_split_approx(k, num_st, n_k, num_2nd, prior, pval):
    """Pick the best of four ``get_approx_max`` candidates.

    Two objective functions are used, both built from the ``ppf`` of
    ``SST.beta(x, y)``:

    * ``ppf(pval) - prior``      (candidates 0 and 2)
    * ``prior - ppf(1 - pval)``  (candidates 1 and 3)

    Each ``get_approx_max`` result gets its candidate index (0-3) appended
    as a tag, is reshaped to ``(r[0], (r[1], r[2]))``, and the entry that
    ends up last after sorting on the first element is returned.

    Args:
        k, num_st: Define the beta parameters ``k + 1`` and
            ``num_st - k + 1`` of candidates 0 and 1.
        n_k, num_2nd: Used for the beta parameters of candidates 2 and 3.
        prior: Reference value the quantiles are compared against.
        pval: Probability passed to ``ppf`` (``pval`` or ``1 - pval``).

    Returns:
        A tuple ``(first, (second, tag))`` for the selected candidate.
    """

    def lower_gap(x, y):
        return SST.beta(x, y).ppf(pval) - prior

    def upper_gap(x, y):
        return prior - SST.beta(x, y).ppf(1 - pval)

    # The tags must be 1-tuples: ``+ (0)`` is ``+ 0``, which raises TypeError
    # on a tuple result (or shifts the score of an array result) instead of
    # appending the candidate index that the reshaping below expects.
    m1 = get_approx_max(k + 1, num_st - k + 1, 10000, lower_gap) + (0,)
    m2 = get_approx_max(k + 1, num_st - k + 1, 10000, upper_gap) + (1,)
    # NOTE(review): m3 uses ``n_k + 1`` with ``num_2nd - k + 1`` while m4 uses
    # ``k + 1``; the asymmetry looks unintended but is kept -- please confirm.
    m3 = get_approx_max(n_k + 1, num_2nd - k + 1, 10000, lower_gap) + (2,)
    m4 = get_approx_max(k + 1, num_2nd - k + 1, 10000, upper_gap) + (3,)

    # Presumes ``mc.List.sort`` orders ascending by the given key so that the
    # last element is the maximum -- TODO confirm against the mc API.
    return mc.List(
        [m1, m2, m3, m4]).map(lambda x: (x[0], (x[1], x[2]))).sort(
            lambda x: x[0])[-1]
def node_to_path(node):
    """Collect the chain of nodes from the topmost ancestor down to ``node``.

    Starting at ``node``, the ``"father"`` entry is followed for as long as
    one is present. Every ancestor is shallow-copied before it is annotated,
    so the dictionaries passed in are not modified. The copy of a parent
    gets a ``"sign"`` entry describing on which side its child hangs:
    ``">"`` for ``"right"`` and ``"<"`` for ``"left"``.

    Args:
        node: Mapping with optional ``"father"`` and ``"side"`` keys.

    Returns:
        The collected nodes in reverse collection order, i.e. the node
        without a ``"father"`` first and (a copy of) ``node`` last.
    """
    import copy

    current = copy.copy(node)
    chain = mc.List([current])

    while "father" in current:
        parent = copy.copy(current["father"])
        branch = current["side"]
        if branch == "right":
            parent["sign"] = ">"
        if branch == "left":
            parent["sign"] = "<"
        current = parent
        chain.append(mc.Dict(current))

    # Collected leaf-to-root; callers get root-to-leaf.
    return chain[::-1]
def fit(self, features, target, sample_weight=None):
    """Grow the tree on ``features`` / ``target`` under a p-value budget.

    The first split is searched over all rows and becomes
    ``self.tree_initial``. ``self.pvalue`` starts at ``1 - pvalue`` of that
    split and is multiplied by ``1 - pvalue`` of every accepted split.
    Growth stops when ``self.max_depth`` candidates were processed, when the
    queue of candidates runs dry, or when accepting the next candidate would
    push ``self.pvalue`` below ``self.pvalue_limit``.

    Args:
        features: Feature data; ``features.shape[0]`` is used as row count.
        target: 2-D target with exactly two columns (asserted below).
        sample_weight: Forwarded to ``split.apply`` only.

    Side effects:
        Sets ``self.target_dim``, ``self.target_dtype``,
        ``self.tree_initial`` and ``self.pvalue``. Prints progress when
        ``self.verbosity > 0``.
    """
    self.target_dim = target.shape[1]
    assert self.target_dim == 2, "only target dim 2 supported, one hot encoded"
    self.target_dtype = target.dtype

    split = find_best_split(features, target, np.arange(target.shape[0]),
                            kind=self.kind)
    if self.verbosity > 0:
        print(split)

    self.tree_initial = get_basic_thump(split.feature_num, split.value, 1,
                                        target, features,
                                        np.arange(features.shape[0]))
    pq = PriorityQueue(
        self.get_splits_for_node(self.tree_initial, self.tree_initial,
                                 features, target))
    self.pvalue = 1.0 - split.pvalue()
    if self.verbosity > 0:
        print(self.pvalue)

    for i in range(self.max_depth):
        # ``get_best_split`` may return no candidates at all, so the queue
        # can be exhausted before ``max_depth`` is reached.
        if pq.empty():
            break
        split = pq.pop()

        # Compute the would-be value first and commit only on acceptance;
        # multiplying and then dividing back is inexact and divides by zero
        # when the split's p-value is 1.
        next_pvalue = self.pvalue * (1 - split.split.pvalue())
        if next_pvalue < self.pvalue_limit:
            if self.verbosity > 0:
                print(
                    f"Next pval would be {next_pvalue}, final complexity {i-1}"
                )
            break
        self.pvalue = next_pvalue

        if self.verbosity > 0:
            print(str(split.split), end=" ")
            print(self.pvalue, end=" ")
            print(
                "que",
                mc.List(pq.container[0:3]).map(lambda x: x.split).map(
                    str).mk_string())
        new_node = split.apply(features, target, sample_weight)
        pq.push_list(
            self.get_splits_for_node(self.tree_initial, new_node, features,
                                     target))