def dtree(tbl, rows=None, lvl=-1, asIs=10**32, up=None, klass=-1, branch=None,
          f=None, val=None, opt=None):
    """Recursively build a decision tree over a continuous attribute.

    Parameters
    ----------
    tbl : pandas.DataFrame
        Data table; the class column index is taken from ``opt.klass``.
    rows : optional
        Row bookkeeping stored on the node (not otherwise used here).
    lvl : int
        Current tree depth; -1 for the root call.
    asIs : float
        Entropy of the class column in the current rows; 0 means the node
        is pure and recursion stops.
    up : Thing
        Parent node (None at the root).
    klass : int
        Kept for interface compatibility; the class column actually comes
        from ``opt.klass`` (this parameter is rebound locally).
    branch : list, optional
        (feature, span) pairs on the path from the root to this node.
    f : str
        Feature name that produced this node.
    val : tuple
        (lo, hi) span of ``f`` covered by this node.
    opt : Thing, optional
        Options container; defaults mirror the original hard-coded values.

    Returns
    -------
    Thing
        Root node of the constructed (sub)tree.
    """
    if not opt:
        opt = Thing(min=1, maxLvL=10, infoPrune=0.5, klass=-1, prune=True,
                    debug=True, verbose=True)
    if branch is None:
        # Avoid the mutable-default-argument pitfall; behavior is unchanged
        # because branch was never mutated in place.
        branch = []
    here = Thing(t=tbl, kids=[], f=f, val=val, up=up, lvl=lvl, rows=rows,
                 modes={}, branch=branch)
    features = fWeight(tbl)
    if opt.prune and lvl < 0:
        # Slice the ranking we already computed instead of re-running fWeight.
        features = features[:int(len(features) * opt.infoPrune)]
    name = features.pop(0)
    remaining = tbl[features + [tbl.columns[opt.klass]]]
    feature = tbl[name].values
    klass = tbl[tbl.columns[opt.klass]].values
    N = len(klass)
    here.score = np.mean(klass)
    splits = discretize(feature, klass)
    LO, HI = min(feature), max(feature)

    def pairs(lst):
        # Yield consecutive (lo, hi) pairs; consumes lst via pop(0).
        while len(lst) > 1:
            yield (lst.pop(0), lst[0])

    cutoffs = list(pairs(sorted(set(splits + [LO, HI]))))

    # Stopping criteria: depth limit, pure node, or no features left.
    if lvl > (opt.maxLvL if opt.prune else int(len(features) * opt.infoPrune)):
        return here
    if asIs == 0:
        return here
    if len(features) < 1:
        return here

    def _children():
        # Partition rows into [lo, hi) spans; the topmost span includes HI.
        # Renamed from the original inner `rows()`, which shadowed the
        # `rows` parameter.
        for span in cutoffs:
            new = []
            for fval, row in zip(feature, remaining.values.tolist()):
                if span[0] <= fval < span[1]:
                    new.append(row)
                elif fval == span[1] == HI:
                    new.append(row)
            yield pd.DataFrame(new, columns=remaining.columns), span

    def ent(x):
        # Shannon entropy (natural log) of the values in x.
        C = Counter(x)
        n = len(x)
        return sum(-C[k] / n * np.log(C[k] / n) for k in C)

    for child, span in _children():
        n = child.shape[0]
        toBe = ent(child[child.columns[opt.klass]])
        if opt.min <= n < N:
            here.kids += [dtree(child, lvl=lvl + 1, asIs=toBe, up=here,
                                branch=branch + [(name, span)], f=name,
                                val=span, opt=opt)]
    return here
def dtree2(tbl, rows=None, lvl=-1, asIs=10**32, up=None, klass=-1,
           branch=None, f=None, val=None, opt=None):
    """Recursively build a decision tree over discrete independent variables.

    Unlike :func:`dtree`, children are formed by exact value match against
    each cutoff rather than by half-open ranges.

    Parameters
    ----------
    tbl : pandas.DataFrame
        Data table; the class column index is taken from ``opt.klass``.
    rows : optional
        Row bookkeeping stored on the node (not otherwise used here).
    lvl : int
        Current tree depth; -1 for the root call.
    asIs : float
        Entropy of the class column in the current rows; 0 stops recursion.
    up : Thing
        Parent node (None at the root).
    klass : int
        Kept for interface compatibility; rebound locally from ``opt.klass``.
    branch : list, optional
        (feature, value) pairs on the path from the root to this node.
    f : str
        Feature name that produced this node.
    val : tuple
        (value, value) pair of ``f`` covered by this node.
    opt : Thing, optional
        Options container; defaults mirror the original hard-coded values.

    Returns
    -------
    Thing
        Root node of the constructed (sub)tree.
    """
    if not opt:
        opt = Thing(min=1, maxLvL=10, infoPrune=1, klass=-1, prune=True,
                    debug=True, verbose=True)
    if branch is None:
        # Avoid the mutable-default-argument pitfall.
        branch = []
    here = Thing(t=tbl, kids=[], f=f, val=val, up=up, lvl=lvl, rows=rows,
                 modes={}, branch=branch)
    features = fWeight(tbl)
    if opt.prune and lvl < 0:
        # Slice the ranking we already computed instead of re-running fWeight.
        features = features[:int(len(features) * opt.infoPrune)]
    name = features.pop(0)
    remaining = tbl[features + [tbl.columns[opt.klass]]]
    feature = tbl[name].values
    klass = tbl[tbl.columns[opt.klass]].values
    N = len(klass)
    here.score = np.mean(klass)
    # discretize() is called as in the original, though its result is unused
    # here; kept in case it has side effects relied on elsewhere.
    splits = discretize(feature, klass, discrete=True)
    LO, HI = min(feature), max(feature)
    # NOTE(review): only the min and max feature values become cutoffs, so
    # intermediate discrete values never form their own child — confirm this
    # is intended.
    cutoffs = [LO, HI]

    # Stopping criteria: depth limit, pure node, or no features left.
    if lvl > (opt.maxLvL if opt.prune else int(len(features) * opt.infoPrune)):
        return here
    if asIs == 0:
        return here
    if len(features) < 1:
        return here

    def _children():
        # Group rows whose feature value exactly equals each cutoff.
        # Renamed from the original inner `rows()`, which shadowed the
        # `rows` parameter.
        for span in cutoffs:
            new = []
            for fval, row in zip(feature, remaining.values.tolist()):
                if fval == span:
                    new.append(row)
            yield pd.DataFrame(new, columns=remaining.columns), span

    def ent(x):
        # Shannon entropy (natural log) of the values in x.
        C = Counter(x)
        n = len(x)
        return sum(-C[k] / n * np.log(C[k] / n) for k in C)

    for child, span in _children():
        n = child.shape[0]
        toBe = ent(child[child.columns[opt.klass]])
        if opt.min <= n < N:
            here.kids += [dtree2(child, lvl=lvl + 1, asIs=toBe, up=here,
                                 branch=branch + [(name, span)], f=name,
                                 val=(span, span), opt=opt)]
    return here
def _tree_builder(self, dframe, lvl=-1, as_is=float("inf"), parent=None,
                  branch=None, f=None, val=None):
    """ Construct decision tree

    Parameters
    ----------
    dframe: <pandas.core.Frame.DataFrame>
        Raw data as a dataframe
    lvl: int (default -1)
        Level of the tree
    as_is: float (default "inf")
        Entropy of the class variable in the current rows
    parent: Thing (default None)
        Parent Node
    branch: List[Thing] (default None -> [])
        Parent nodes visited to reach current node
    f: str (default None)
        Name of the attribute represented by the current node
    val: Tuple(low, high)
        The minimum and maximum range of the attribute in the current node

    Returns
    -------
    Thing:
        The root node of the tree

    Notes
    -----
    + Thing is a generic container, in this case it's a node in the tree.
    + You'll find it in <src.tools.containers>
    """
    if branch is None:
        # Avoid the mutable-default-argument pitfall; branch was never
        # mutated in place, so behavior is unchanged.
        branch = []
    current = Thing(t=dframe, kids=[], f=f, val=val, parent=parent, lvl=lvl,
                    branch=branch)
    features = fWeight(dframe)
    if self.prune and lvl < 0:
        # Slice the ranking we already computed instead of re-running fWeight.
        features = features[:int(len(features) * self.info_prune)]
    name = features.pop(0)
    remaining = dframe[features + [dframe.columns[self.klass]]]
    feature = dframe[name].values
    dependent_var = dframe[dframe.columns[self.klass]].values
    N = len(dependent_var)
    current.score = np.mean(dependent_var)
    splits = discretize(feature, dependent_var)
    low = min(feature)
    high = max(feature)
    cutoffs = list(self.pairs(sorted(set(splits + [low, high]))))

    # Stopping criteria: depth limit, pure node, or no features left.
    if lvl > (self.max_levels if self.prune else int(
            len(features) * self.info_prune)):
        return current
    if as_is == 0:
        return current
    if len(features) < 1:
        return current

    def _rows():
        # Partition rows into [lo, hi) spans; the topmost span includes high.
        for span in cutoffs:
            new = []
            for fval, row in zip(feature, remaining.values.tolist()):
                if span[0] <= fval < span[1]:
                    new.append(row)
                elif fval == span[1] == high:
                    new.append(row)
            yield pd.DataFrame(new, columns=remaining.columns), span

    for child, span in _rows():
        n = child.shape[0]
        to_be = self._entropy(child[child.columns[self.klass]])
        if self.min_levels <= n < N:
            current.kids += [
                self._tree_builder(child, lvl=lvl + 1, as_is=to_be,
                                   parent=current,
                                   branch=branch + [(name, span)],
                                   f=name, val=span)]
    return current
def _tree_builder(self, tbl, rows=None, lvl=-1, asIs=10**32, up=None,
                  klass=-1, branch=None, f=None, val=None, opt=None):
    """Recursively construct a decision tree over a continuous attribute.

    Parameters
    ----------
    tbl : pandas.DataFrame
        Data table; the class column index is taken from ``self.klass``.
    rows : optional
        Row bookkeeping stored on the node (not otherwise used here).
    lvl : int
        Current tree depth; -1 for the root call.
    asIs : float
        Entropy of the class column in the current rows; 0 stops recursion.
    up : Thing
        Parent node (None at the root).
    klass : int
        Kept for interface compatibility; rebound locally from ``self.klass``.
    branch : list, optional
        (feature, span) pairs on the path from the root to this node.
    f : str
        Feature name that produced this node.
    val : tuple
        (lo, hi) span of ``f`` covered by this node.
    opt : optional
        Passed through to recursive calls unchanged; not read here.

    Returns
    -------
    Thing
        The root node of the constructed (sub)tree.
    """
    if branch is None:
        # Avoid the mutable-default-argument pitfall.
        branch = []
    here = Thing(t=tbl, kids=[], f=f, val=val, up=up, lvl=lvl, rows=rows,
                 modes={}, branch=branch)
    features = fWeight(tbl)
    if self.prune and lvl < 0:
        # Slice the ranking we already computed instead of re-running fWeight.
        features = features[:int(len(features) * self.infoPrune)]
    name = features.pop(0)
    remaining = tbl[features + [tbl.columns[self.klass]]]
    feature = tbl[name].values
    klass = tbl[tbl.columns[self.klass]].values
    N = len(klass)
    here.score = np.mean(klass)
    splits = discretize(feature, klass)
    lo, hi = min(feature), max(feature)

    def _pairs(lst):
        # Yield consecutive (lo, hi) pairs; consumes lst via pop(0).
        while len(lst) > 1:
            yield (lst.pop(0), lst[0])

    cutoffs = list(_pairs(sorted(set(splits + [lo, hi]))))

    # Stopping criteria: depth limit, pure node, or no features left.
    if lvl > (self.max_level if self.prune else int(
            len(features) * self.infoPrune)):
        return here
    if asIs == 0:
        return here
    if len(features) < 1:
        return here

    def _rows():
        # Partition rows into [lo, hi) spans; the topmost span includes hi.
        for span in cutoffs:
            new = []
            for fval, row in zip(feature, remaining.values.tolist()):
                if span[0] <= fval < span[1]:
                    new.append(row)
                elif fval == span[1] == hi:
                    new.append(row)
            yield pd.DataFrame(new, columns=remaining.columns), span

    def _entropy(x):
        # Shannon entropy (natural log) of the values in x.
        C = Counter(x)
        n = len(x)
        return sum(-C[k] / n * np.log(C[k] / n) for k in C)

    for child, span in _rows():
        n = child.shape[0]
        toBe = _entropy(child[child.columns[self.klass]])
        if self.min <= n < N:
            here.kids += [
                self._tree_builder(child, lvl=lvl + 1, asIs=toBe, up=here,
                                   branch=branch + [(name, span)],
                                   f=name, val=span, opt=opt)]
    return here