from collections import Counter

import numpy as np
import pandas as pd

# Thing, fWeight and discretize are project-local helpers (a generic record
# type, a feature-ranking routine, and a numeric discretizer); their
# definitions live elsewhere in this codebase.


def dtree(tbl, rows=None, lvl=-1, asIs=10**32, up=None, klass=-1, branch=[],
          f=None, val=None, opt=None):
    """Recursively build a decision tree for continuous independent variables."""
    if not opt:
        opt = Thing(min=1, maxLvL=10, infoPrune=0.5, klass=-1, prune=True,
                    debug=True, verbose=True)
    here = Thing(t=tbl, kids=[], f=f, val=val, up=up, lvl=lvl, rows=rows,
                 modes={}, branch=branch)

    # Rank the features; when pruning, keep only the top infoPrune fraction.
    features = fWeight(tbl)
    if opt.prune and lvl < 0:
        features = fWeight(tbl)[:int(len(features) * opt.infoPrune)]
    name = features.pop(0)

    remaining = tbl[features + [tbl.columns[opt.klass]]]
    feature = tbl[name].values
    klass = tbl[tbl.columns[opt.klass]].values
    N = len(klass)
    here.score = np.mean(klass)

    # Candidate cutoffs: consecutive pairs over the discretized split points,
    # padded with the feature's min and max.
    splits = discretize(feature, klass)
    LO, HI = min(feature), max(feature)

    def pairs(lst):
        while len(lst) > 1:
            yield (lst.pop(0), lst[0])

    cutoffs = [t for t in pairs(sorted(list(set(splits + [LO, HI]))))]

    # Stopping criteria: maximum depth reached, node already pure, or no
    # features left to split on.
    if lvl > (opt.maxLvL if opt.prune else int(len(features) * opt.infoPrune)):
        return here
    if asIs == 0:
        return here
    if len(features) < 1:
        return here

    def rows():
        # Yield one child frame per cutoff span [lo, hi); the upper bound is
        # inclusive only for the final span.
        for span in cutoffs:
            new = []
            for f, row in zip(feature, remaining.values.tolist()):
                if span[0] <= f < span[1]:
                    new.append(row)
                elif f == span[1] == HI:
                    new.append(row)
            yield pd.DataFrame(new, columns=remaining.columns), span

    def ent(x):
        # Shannon entropy of the class column.
        C = Counter(x)
        N = len(x)
        return sum([-C[n] / N * np.log(C[n] / N) for n in C.keys()])

    for child, span in rows():
        n = child.shape[0]
        toBe = ent(child[child.columns[opt.klass]])
        if opt.min <= n < N:
            here.kids += [dtree(child, lvl=lvl + 1, asIs=toBe, up=here,
                                branch=branch + [(name, span)],
                                f=name, val=span, opt=opt)]
    return here
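
# --- Added usage sketch (not part of the original module) -------------------
# A minimal, hypothetical smoke test for dtree(). It assumes the class label
# sits in the last column of the frame (matching the default opt.klass = -1)
# and that the project-local fWeight()/discretize() helpers accept such a
# frame; treat it as an illustration of the call pattern rather than a
# guaranteed-to-run test against this codebase.
def _demo_dtree():
    frame = pd.DataFrame({
        "f1":    [0.10, 0.40, 0.35, 0.80, 0.90, 0.20, 0.70, 0.15],
        "f2":    [1.00, 2.00, 1.50, 3.00, 2.50, 1.20, 2.80, 1.10],
        "klass": [0, 0, 0, 1, 1, 0, 1, 0],
    })
    root = dtree(frame)
    # Each child records the (feature, span) decision that produced it plus
    # the mean class value of the rows that fell into that span.
    for kid in root.kids:
        print(kid.branch, kid.score)
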
def dtree2(tbl, rows=None, lvl=-1, asIs=10**32, up=None, klass=-1, branch=[],
           f=None, val=None, opt=None):
    """ Discrete independent variables """
    if not opt:
        opt = Thing(min=1, maxLvL=10, infoPrune=1, klass=-1, prune=True,
                    debug=True, verbose=True)
    here = Thing(t=tbl, kids=[], f=f, val=val, up=up, lvl=lvl, rows=rows,
                 modes={}, branch=branch)
    features = fWeight(tbl)
    if opt.prune and lvl < 0:
        features = fWeight(tbl)[:int(len(features) * opt.infoPrune)]
    name = features.pop(0)
    remaining = tbl[features + [tbl.columns[opt.klass]]]
    feature = tbl[name].values
    klass = tbl[tbl.columns[opt.klass]].values
    N = len(klass)
    here.score = np.mean(klass)
    splits = discretize(feature, klass, discrete=True)
    LO, HI = min(feature), max(feature)

    def pairs(lst):
        # Retained from the continuous variant; unused here because discrete
        # features split on exact values rather than spans.
        while len(lst) > 1:
            yield (lst.pop(0), lst[0])

    cutoffs = [LO, HI]

    if lvl > (opt.maxLvL if opt.prune else int(len(features) * opt.infoPrune)):
        return here
    if asIs == 0:
        return here
    if len(features) < 1:
        return here

    def rows():
        # Yield one child frame per cutoff value: the rows whose feature
        # equals that value exactly.
        for span in cutoffs:
            new = []
            for f, row in zip(feature, remaining.values.tolist()):
                if f == span:
                    new.append(row)
            yield pd.DataFrame(new, columns=remaining.columns), span

    def ent(x):
        # Shannon entropy of the class column.
        C = Counter(x)
        N = len(x)
        return sum([-C[n] / N * np.log(C[n] / N) for n in C.keys()])

    for child, span in rows():
        n = child.shape[0]
        toBe = ent(child[child.columns[opt.klass]])
        if opt.min <= n < N:
            here.kids += [dtree2(child, lvl=lvl + 1, asIs=toBe, up=here,
                                 branch=branch + [(name, span)],
                                 f=name, val=(span, span), opt=opt)]
    return here
def dtree2(tbl, rows=None, lvl=-1, asIs=10**32, up=None, klass=-5, branch=[],
           f=None, val=None, opt=None, encode=True):
    """ Discrete independent variables """
    if not opt:
        opt = Thing(min=1, maxLvL=10, infoPrune=1, klass=-1, prune=True,
                    debug=True, verbose=True)
    features = fWeight(tbl)
    # if encode == True:
    #     encode(tbl, features, opt=opt)
    here = Thing(t=tbl, kids=[], f=f, val=val, up=up, lvl=lvl, rows=rows,
                 modes={}, branch=branch)
    if opt.prune and lvl < 0:
        features = fWeight(tbl)[:int(len(features) * opt.infoPrune)]
    name = features.pop(0)
    remaining = tbl[features + [tbl.columns[opt.klass]]]
    feature = tbl[name].values
    klass = tbl[tbl.columns[opt.klass:]].values
    N = len(klass)
    here.score = np.mean(klass, axis=0)
    # splits = discretize(feature, klass, discrete=True)
    LO, HI = min(feature), max(feature)

    def pairs(lst):
        while len(lst) > 1:
            yield (lst.pop(0), lst[0])

    cutoffs = [LO, HI]

    if lvl > (opt.maxLvL if opt.prune else int(len(features) * opt.infoPrune)):
        return here
    if asIs < 0.1:
        return here
    if len(features) < 1:
        return here

    def rows():
        for span in cutoffs:
            new = []
            for f, row in zip(feature, remaining.values.tolist()):
                if f == span:
                    new.append(row)
            yield pd.DataFrame(new, columns=remaining.columns), span

    def ent(x):
        # Shannon entropy (unused in this variant, which scores children by
        # variance instead).
        C = Counter(x)
        N = len(x)
        return sum([-C[n] / N * np.log(C[n] / N) for n in C.keys()])

    def sdv(x):
        # Mean of the per-column variances of the objective values.
        return np.mean(np.var(x, axis=0))

    for child, span in rows():
        n = child.shape[0]
        toBe = sdv(child[child.columns[opt.klass]])
        if opt.min <= n < N:
            here.kids += [dtree2(child, lvl=lvl + 1, asIs=toBe, up=here,
                                 branch=branch + [(name, span)],
                                 f=name, val=(span, span), opt=opt,
                                 encode=False)]
    return here
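
# --- Added illustration (not part of the original module) -------------------
# The multi-objective dtree2 above scores a candidate child with sdv(): the
# mean of the per-column variances of its objective values, instead of the
# class entropy used by the other builders. This standalone sketch shows that
# measure on a tiny hand-made array; the numbers are invented for illustration.
def _sdv_example():
    objectives = np.array([[1.0, 10.0],
                           [2.0, 12.0],
                           [3.0, 14.0]])
    # np.var(..., axis=0) -> variance of each objective column; their mean is
    # the "asIs"/"toBe" value threaded through the recursion.
    return float(np.mean(np.var(objectives, axis=0)))
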
def _tree_builder(self, tbl, rows=None, lvl=-1, asIs=10**32, up=None, klass=-1,
                  branch=[], f=None, val=None, opt=None):
    """Recursive tree builder; pruning and split options are read from self."""
    here = Thing(t=tbl, kids=[], f=f, val=val, up=up, lvl=lvl, rows=rows,
                 modes={}, branch=branch)
    features = fWeight(tbl)
    if self.prune and lvl < 0:
        features = fWeight(tbl)[:int(len(features) * self.infoPrune)]
    name = features.pop(0)
    remaining = tbl[features + [tbl.columns[self.klass]]]
    feature = tbl[name].values
    klass = tbl[tbl.columns[self.klass]].values
    N = len(klass)
    here.score = np.mean(klass)
    splits = discretize(feature, klass)
    lo, hi = min(feature), max(feature)

    def _pairs(lst):
        while len(lst) > 1:
            yield (lst.pop(0), lst[0])

    cutoffs = [t for t in _pairs(sorted(list(set(splits + [lo, hi]))))]

    if lvl > (self.max_level if self.prune
              else int(len(features) * self.infoPrune)):
        return here
    if asIs == 0:
        return here
    if len(features) < 1:
        return here

    def _rows():
        for span in cutoffs:
            new = []
            for f, row in zip(feature, remaining.values.tolist()):
                if span[0] <= f < span[1]:
                    new.append(row)
                elif f == span[1] == hi:
                    new.append(row)
            yield pd.DataFrame(new, columns=remaining.columns), span

    def _entropy(x):
        C = Counter(x)
        N = len(x)
        return sum([-C[n] / N * np.log(C[n] / N) for n in C.keys()])

    for child, span in _rows():
        n = child.shape[0]
        toBe = _entropy(child[child.columns[self.klass]])
        if self.min <= n < N:
            here.kids += [self._tree_builder(child, lvl=lvl + 1, asIs=toBe,
                                             up=here,
                                             branch=branch + [(name, span)],
                                             f=name, val=span, opt=opt)]
    return here