def divide(self, lo, hi, rank):
    """Look for the best cut inside ``self.lst[lo:hi]`` and recurse on it.

    Args:
        lo: first index (inclusive) of the segment of ``self.lst`` to examine.
        hi: end index (exclusive) of that segment.
        rank: rank to give the next range that is emitted.

    Returns:
        The rank given to the last range emitted beneath this call.

    Side effects:
        When no cut is accepted, appends ``(x summary, y summary)`` for the
        whole segment to ``self.ranges`` and adds ``n * variety`` of the x
        summary to ``self.gain``.
    """
    xleft = Num(key=self.x)
    yleft = self.ctype(key=self.y)
    xright = Num(self.lst[lo:hi], key=self.x)
    yright = self.ctype(self.lst[lo:hi], key=self.y)
    # Snapshots of the undivided segment; these are what gets recorded
    # if no acceptable cut is found.
    xb4 = deepcopy(xright)
    yb4 = deepcopy(yright)
    best = yb4.variety()
    cut = None
    for j in range(lo, hi):
        item = self.lst[j]
        # The results of these expressions are discarded on purpose: the
        # code relies on `+` / `-` updating the summaries in place
        # (presumably operator overloads on Num / ctype).
        # NOTE(review): the left summary already contains lst[j] when cut j
        # is scored, while the recursion below puts lst[j] in the right
        # half -- confirm this is intended.
        xleft + item
        yleft + item
        xright - item
        yright - item
        # Both sides need at least `step` items.
        if xleft.n < self.step or xright.n < self.step:
            continue
        now = self.x(self.lst[j - 1])
        after = self.x(item)
        # Never cut between two identical x values.
        if now == after:
            continue
        # The two sides, and the cut relative to the overall start/stop,
        # must each differ by at least epsilon.
        far_enough = (
            abs(xright.mu - xleft.mu) >= self.epsilon
            and after - self.start >= self.epsilon
            and self.stop - now >= self.epsilon
        )
        if not far_enough:
            continue
        xpect = yleft.xpect(yright)
        if xpect * THE.div.trivial < best:
            best, cut = xpect, j
    if not cut:
        # Leaf: record the whole segment as one range.
        self.gain += xb4.n * xb4.variety()
        xb4.rank = rank
        self.ranges += [(xb4, yb4)]
        return rank
    # Ranges to the right of the cut are ranked after those to the left.
    rank = self.divide(lo, cut, rank) + 1
    return self.divide(cut, hi, rank)
def createXYList(i, lst, yis, y_index):
    """Summarise the columns of `lst`, one Num per x column plus a y summary.

    Rows are sorted by their value in column `y_index`. Every column except
    the last one and `y_index` itself is summarised in its own ``Num``; the
    `y_index` column is summarised in a ``Num`` when ``yis == "Num"`` and in
    a ``Sym`` otherwise.

    Args:
        i: receiver (unused here).
        lst: rows, each exposing an indexable ``cells`` attribute.
        yis: ``"Num"`` for a numeric y summary; anything else gives ``Sym``.
        y_index: position in ``cells`` of the y column.

    Returns:
        ``(x_lst, y_lst)``: the list of per-column ``Num`` summaries and the
        y summary. With no rows, ``x_lst`` is empty and ``y_lst`` is an
        empty summary.
    """
    y_lst = Num() if yis == "Num" else Sym()
    rows = sorted(lst, key=lambda row: row.cells[y_index])
    if not rows:
        # Without rows there is no rows[0] to read the column count from;
        # previously this raised IndexError.
        return [], y_lst
    x_lst = []
    # last column excluded for goal
    for column in range(len(rows[0].cells) - 1):
        if column == y_index:
            continue
        summary = Num()
        for row in rows:
            summary.add(row.cells[column])
        x_lst.append(summary)
    for row in rows:
        y_lst.add(row.cells[y_index])
    return x_lst, y_lst
def createXYList(i, lst, yis):
    """Summarise a list of (x, y) pairs into an x summary and a y summary.

    The pairs are visited in ascending order of their second element. The
    first elements always go into a ``Num``; the second elements go into a
    ``Num`` when ``yis == "Num"`` and into a ``Sym`` otherwise.

    Args:
        i: receiver (unused here).
        lst: indexable items; element 0 is x and element 1 is y.
        yis: ``"Num"`` for a numeric y summary; anything else gives ``Sym``.

    Returns:
        ``(x_lst, y_lst)``: the two summaries.
    """
    pairs = sorted(lst, key=lambda pair: pair[1])
    x_lst = Num()
    y_lst = Num() if yis == "Num" else Sym()
    for pair in pairs:
        x_lst.add(pair[0])
        y_lst.add(pair[1])
    return x_lst, y_lst
def tree(i, lst, y, yis, lvl=0):
    """Recursively build a tree over `lst` by splitting on the best column.

    For every column in ``i.cols.indep`` a ``Div2`` is built and asked for
    its ``finalcutlow()``; the column with the lowest returned score whose
    cut is truthy is used to split the rows, and each split is expanded
    recursively.

    Args:
        lst: rows, each exposing an indexable ``cells`` attribute.
        y: key function handed to ``Div2`` and to the leaf summaries.
        yis: ``"Num"`` for numeric leaves; anything else gives ``Sym``.
        lvl: depth of this call; only passed on (plus one) to the children.

    Returns:
        A list of ``o(lo, hi, n, txt, kids)`` nodes, one per split, when a
        cut is found; otherwise a ``Num`` or ``Sym`` summary of `lst`
        keyed by `y`.
    """
    if len(lst) >= THE.tree.minObs * 2:
        # Find the best column, i.e. the one whose cut scores lowest.
        # Start from +infinity so the first candidate always wins: the old
        # sentinel of -10**32 could never satisfy `lo1 < lo`, so no cut was
        # ever selected and every call fell through to a leaf.
        lo, cut, col = float("inf"), None, None
        for col1 in i.cols.indep:
            x = lambda row: row.cells[col1.pos]
            d = Div2(lst, x=x, y=y, yis=yis)
            # presumably (cut point, score of that cut) -- TODO confirm
            cut1, lo1 = d.finalcutlow()
            if cut1 and lo1 < lo:
                cut, lo, col = cut1, lo1, col1
                print("updated: ", cut, lo, col)
        # if a cut exists
        if cut:
            # split data on best col, call i.tree on each split
            x = lambda row: row.cells[col.pos]
            return [
                o(lo=lo, hi=hi, n=len(kids), txt=col.txt,
                  kids=i.tree(kids, y, yis, lvl + 1))
                for lo, hi, kids in col.split(lst, x, cut)
            ]
    # Too few rows, or no usable cut: summarise the rows as a leaf.
    if yis == "Num":
        return Num(lst, key=y)
    return Sym(lst, key=y)
class Div2_3(Pretty):
    """
    Recursively divide a list of items into ranges by finding splits that
    minimize the expected variety of the y values after the split (the
    standard deviation, in the original description).

    After construction, ``ranges`` holds one ``(x summary, y summary)``
    pair per range and ``gain`` holds the size-weighted variety of the x
    summaries divided by the number of items.
    """

    def __init__(self, lst, x=first, y=last, yis=Num):
        """Sort `lst` by `x`, set the split thresholds, then run the division.

        Args:
            lst: items to divide.
            x: function extracting the x value from an item.
            y: function extracting the y value from an item.
            yis: class used to summarise y values.
        """
        self.ctype = yis
        self.x, self.y = x, y
        self.lst = ordered(lst, key=x)
        size = len(self.lst)
        self.xtype = Num(self.lst, key=x)
        self.ytype = self.ctype(self.lst, key=y)
        # where we will be, once done
        self.gain = 0
        # each split needs >= 'step' items
        self.step = int(size ** THE.div.min)
        # top and bottom list values
        self.stop = x(last(self.lst))
        self.start = x(first(self.lst))
        # the generated ranges
        self.ranges = []
        # bins must be separated by >= epsilon
        self.epsilon = self.xtype.sd() * THE.div.cohen
        self.divide(1, size, 1)
        self.gain /= size

    # TODO: check the argument passing thing
    def divide(self, lo, hi, rank):
        """Find the best split of ``self.lst[lo:hi]``, then recurse on each side.

        Returns the rank given to the last range emitted under this call.
        When no split is accepted, the whole segment is recorded as a single
        range in ``self.ranges`` and its weighted variety is added to
        ``self.gain``.
        """
        xleft = Num(key=self.x)
        yleft = self.ctype(key=self.y)
        xright = Num(self.lst[lo:hi], key=self.x)
        yright = self.ctype(self.lst[lo:hi], key=self.y)
        # Keep untouched copies of the whole-segment summaries.
        xb4 = deepcopy(xright)
        yb4 = deepcopy(yright)
        best = yb4.variety()
        cut = None
        eps = self.epsilon
        for j in range(lo, hi):
            current = self.lst[j]
            # Results are discarded: this relies on `+` / `-` moving the
            # item between the summaries in place (presumably overloaded).
            xleft + current
            yleft + current
            xright - current
            yright - current
            # Each side must hold at least `step` items.
            if xleft.n >= self.step and xright.n >= self.step:
                now = self.x(self.lst[j - 1])
                after = self.x(current)
                # Do not cut between equal x values.
                if now == after:
                    continue
                # Sides, and distance from the overall start/stop, must
                # each differ by at least epsilon.
                if (abs(xright.mu - xleft.mu) >= eps
                        and after - self.start >= eps
                        and self.stop - now >= eps):
                    xpect = yleft.xpect(yright)
                    if xpect * THE.div.trivial < best:
                        best, cut = xpect, j
        if cut:
            rank = self.divide(lo, cut, rank) + 1
            rank = self.divide(cut, hi, rank)
        else:
            self.gain += xb4.n * xb4.variety()
            xb4.rank = rank
            self.ranges.append((xb4, yb4))
        return rank
def __init__(self, lst, x=first, y=last, yis=Num):
    """Sort `lst` by `x`, set the split thresholds, then run the division.

    Args:
        lst: items to divide.
        x: function extracting the x value from an item.
        y: function extracting the y value from an item.
        yis: class used to summarise y values.
    """
    self.ctype = yis
    self.x, self.y = x, y
    self.lst = ordered(lst, key=x)
    size = len(self.lst)
    self.xtype = Num(self.lst, key=x)
    self.ytype = self.ctype(self.lst, key=y)
    # where we will be, once done
    self.gain = 0
    # each split needs >= 'step' items
    self.step = int(size ** THE.div.min)
    # top list value
    self.stop = x(last(self.lst))
    # bottom list value
    self.start = x(first(self.lst))
    # the generated ranges
    self.ranges = []
    # bins must be separated by >= epsilon
    self.epsilon = self.xtype.sd() * THE.div.cohen
    # NOTE(review): the division starts at index 1, not 0 -- confirm that
    # leaving self.lst[0] out of the divided segment is intended.
    self.divide(1, size, 1)
    # Turn the accumulated total into a per-item figure.
    self.gain /= size
def numSplit(i, lst):
    """Return a fresh ``Num`` holding every value in `lst`.

    A falsy `lst` (``None`` or empty) yields an empty ``Num``.

    Args:
        i: receiver (unused here).
        lst: iterable of values to add, or a falsy value for none.
    """
    summary = Num()
    for value in lst or ():
        summary.add(value)
    return summary