def kFoldCrossVal(self, data, fSel, ext, _prune, _info, method, k=5):
    """Cross-validate the planner over `k` folds of the rows in `data`.

    ``data._rows`` is shuffled in place and cut into `k` folds.  Every fold
    is used exactly once as the test set while the remaining folds are
    merged (through ``self.flatten``) into the training set.  For each fold,
    ``predictor(...).rforest()`` is run on the untouched test table (the
    "before" values) and ``self.planner`` supplies the "after" values.

    Args:
        data: table object exposing ``_rows`` (each row exposing ``cells``)
            and accepted by ``clone``.
        fSel, ext, _prune, _info, method: forwarded unchanged, in this
            order, to ``self.planner`` after the train and test tables.
        k: number of folds.

    Returns:
        Tuple ``(acc, auc, md, bef, aft)`` where
        ``acc`` holds, per test row, ``(1 - |before - actual| / actual) * 100``
        (``actual`` is the second-to-last column of the formatted test frame);
        ``auc`` holds, per fold, the percentage drop from ``sum(before)`` to
        ``sum(after)``;
        ``md`` holds, per fold, the percentage drop from ``median(before)``
        to ``median(after)``;
        ``bef`` / ``aft`` are all before / after values concatenated.

    Raises:
        ValueError: if `data` has fewer than `k` rows, so that no non-empty
            fold can be formed.
    """
    from random import shuffle

    acc, md, auc = [], [], []
    bef, aft = [], []

    rows = data._rows
    shuffle(rows)  # NOTE: reorders data._rows itself, not a copy.

    size = int(len(rows) / k)
    if size < 1:
        raise ValueError(
            "cannot split %d rows into %d folds" % (len(rows), k))
    sqe = [rows[i:i + size] for i in range(0, len(rows), size)]
    if len(sqe) > k:
        # The remainder can spill into more than one extra chunk; merge all
        # of them into the last fold so that every row is tested once.
        sqe = sqe[:k - 1] + [[r for chunk in sqe[k - 1:] for r in chunk]]

    for indx in xrange(k):
        # Select folds by position instead of pop()/insert(): re-inserting
        # the popped fold at the end shifted the remaining folds, so some
        # folds were tested twice and others never.
        testRows = sqe[indx]
        trainRows = self.flatten(
            [fold for pos, fold in enumerate(sqe) if pos != indx])

        train = clone(data, rows=[i.cells for i in trainRows])
        test = clone(data, rows=[i.cells for i in testRows])
        train_df = formatData(train)
        test_df = formatData(test)
        actual = test_df[test_df.columns[-2]].astype('float32').tolist()

        before = predictor(train=train_df, test=test_df).rforest()
        # self.planner is expected to return a 3-tuple whose last item is
        # the sequence of "after" values.
        _, __, after = self.planner(
            train, test, fSel, ext, _prune, _info, method)

        bef.extend(before)
        aft.extend(after)
        md.append((median(before) - median(after)) * 100 / median(before))
        auc.append((sum(before) - sum(after)) * 100 / sum(before))
        acc.extend(
            [(1 - abs(b - a) / a) * 100 for b, a in zip(before, actual)])

    return acc, auc, md, bef, aft
def mainraw(self, name='Apache', reps=10, fSel=True, ext=0.5, _prune=False,
            _info=0.25, method='best'):
    """Collect raw before/after predictions for the dataset called `name`.

    ``self.explorer(name)`` is queried once.  Then, `reps` times, every entry
    ``d`` whose ``d[0]`` path ends in `name` is processed: ``d[1][1]`` is
    loaded as the training file and ``d[1][0]`` as the test file, a
    ``predictor(...).rforest()`` result on the test table is recorded, the
    ``WHAT`` planner produces a new table, and a second ``rforest()`` result
    on that new table is recorded.

    Args:
        name: last path component of the dataset directory to process.
        reps: number of repetitions.
        fSel, ext, _prune, _info, method: passed to ``WHAT`` as ``fSelect``,
            ``extent``, ``Prune``, ``infoPrune`` and ``method``.

    Returns:
        Tuple ``(before, after)``; each is a list holding one ``rforest()``
        result per processed entry per repetition.
    """
    datasets = self.explorer(name)
    before, after = [], []

    for _ in xrange(reps):
        for d in datasets:
            if name != d[0].strip().split('/')[-1]:
                continue

            train_path = d[0] + '/' + d[1][1]
            train = createTbl([train_path], isBin=False)
            test_path = d[0] + '/' + d[1][0]
            test = createTbl([test_path], isBin=False)

            train_df = formatData(train)
            test_df = formatData(test)
            # Computed as in the original; the value is not used afterwards.
            actual = test_df[test_df.columns[-2]].astype('float32').tolist()

            baseline = predictor(train=train_df, test=test_df).rforest()
            before.append(baseline)

            what = WHAT(
                train=[train_path],
                test=[test_path],
                train_df=train,
                bin=True,
                test_df=test,
                extent=ext,
                fSelect=fSel,
                far=False,
                infoPrune=_info,
                method=method,
                Prune=_prune)
            newTab = what.main()
            newTab_df = formatData(newTab)

            planned = predictor(train=train_df, test=newTab_df).rforest()
            after.append(planned)

    return before, after
def kFoldCrossVal(self, data, fSel, ext, _prune, _info, method, k=5):
    """Run `k`-fold cross validation of ``self.planner`` on `data`.

    The rows in ``data._rows`` are shuffled in place and partitioned into
    `k` folds.  Each fold is the test set exactly once; the other folds are
    combined with ``self.flatten`` to form the training set.  Per fold, the
    "before" values come from ``predictor(...).rforest()`` on the test table
    and the "after" values from ``self.planner``.

    Args:
        data: table object exposing ``_rows`` (each row exposing ``cells``)
            and accepted by ``clone``.
        fSel, ext, _prune, _info, method: forwarded unchanged, in this
            order, to ``self.planner`` after the train and test tables.
        k: number of folds.

    Returns:
        Tuple ``(acc, auc, md, bef, aft)``:
        ``acc`` -- per test row, ``(1 - |before - actual| / actual) * 100``,
        with ``actual`` read from the second-to-last column of the formatted
        test frame;
        ``auc`` -- per fold, percentage drop from ``sum(before)`` to
        ``sum(after)``;
        ``md`` -- per fold, percentage drop from ``median(before)`` to
        ``median(after)``;
        ``bef``, ``aft`` -- every before / after value, concatenated.

    Raises:
        ValueError: if `data` has fewer than `k` rows.
    """
    from random import shuffle

    acc, md, auc = [], [], []
    bef, aft = [], []

    rows = data._rows
    shuffle(rows)  # NOTE: this shuffles data._rows in place.

    fold_size = int(len(rows) / k)
    if fold_size < 1:
        raise ValueError(
            "cannot split %d rows into %d folds" % (len(rows), k))
    sqe = [rows[start:start + fold_size]
           for start in range(0, len(rows), fold_size)]
    if len(sqe) > k:
        # Merging only the last two chunks is not enough when the remainder
        # spans several chunks; gather every surplus chunk into fold k-1.
        leftovers = [row for chunk in sqe[k - 1:] for row in chunk]
        sqe = sqe[:k - 1] + [leftovers]

    for indx in xrange(k):
        # Folds are addressed by index.  The former pop(indx)/insert(k, ...)
        # pair moved the tested fold to the end of the list, which made
        # later indices skip one fold and repeat another.
        testRows = sqe[indx]
        trainRows = self.flatten(sqe[:indx] + sqe[indx + 1:])

        train = clone(data, rows=[i.cells for i in trainRows])
        test = clone(data, rows=[i.cells for i in testRows])
        train_df = formatData(train)
        test_df = formatData(test)
        actual = test_df[test_df.columns[-2]].astype('float32').tolist()

        before = predictor(train=train_df, test=test_df).rforest()
        # The planner call is unpacked as a 3-tuple; only its last item
        # (the "after" values) is used here.
        _, __, after = self.planner(
            train, test, fSel, ext, _prune, _info, method)

        bef.extend(before)
        aft.extend(after)
        md.append((median(before) - median(after)) * 100 / median(before))
        auc.append((sum(before) - sum(after)) * 100 / sum(before))
        acc.extend(
            [(1 - abs(b - a) / a) * 100 for b, a in zip(before, actual)])

    return acc, auc, md, bef, aft
def planner(self, train, test, fSel, ext, _prune, _info, name, method='best',
            justDeltas=False):
    """Run the ``WHAT`` planner on a train/test table pair.

    Args:
        train, test: table objects accepted by ``formatData`` and by
            ``WHAT`` (as ``train_df`` / ``test_df``).
        fSel, ext, _prune, _info, method: passed to ``WHAT`` as ``fSelect``,
            ``extent``, ``Prune``, ``infoPrune`` and ``method``.
        name: passed to ``WHAT`` as ``name``.
        justDeltas: forwarded to ``WHAT(...).main``.

    Returns:
        Tuple ``(actual, before, after, newTab)``: the second-to-last column
        of the formatted test frame as a list of float32 values, the
        ``rforest()`` result obtained before planning, the ``rforest()``
        result obtained after planning, and whatever ``WHAT(...).main``
        returned.
    """
    train_df = formatData(train)
    test_df = formatData(test)

    target_column = test_df.columns[-2]
    actual = test_df[target_column].astype('float32').tolist()

    before = predictor(train=train_df, test=test_df).rforest()

    what = WHAT(
        name=name,
        train=None,
        test=None,
        train_df=train,
        bin=True,
        test_df=test,
        extent=ext,
        fSelect=fSel,
        far=False,
        infoPrune=_info,
        method=method,
        Prune=_prune)
    newTab = what.main(justDeltas=justDeltas)

    # NOTE(review): `after` is predicted on the same, unmodified test frame
    # as `before`; newTab is not formatted or fed to the predictor here.
    # Presumably intentional (newTab may not be a table when justDeltas is
    # set) -- confirm with the caller.
    after = predictor(train=train_df, test=test_df).rforest()

    return actual, before, after, newTab
def planner(self, train, test):
    """Apply the ``HOW`` planner to `train` / `test` and return its table.

    Both inputs are loaded with ``createTbl(..., _smote=False, isBin=False)``
    and formatted.  ``rforest()`` predictions are made on the original test
    frame and on the planned table, but neither result is returned.

    Args:
        train, test: inputs accepted by ``createTbl`` and by ``HOW``.

    Returns:
        The object returned by ``HOW(train=train, test=test, bin=False).main()``.
    """
    train_tbl = createTbl(train, _smote=False, isBin=False)
    train_df = formatData(train_tbl)
    test_tbl = createTbl(test, _smote=False, isBin=False)
    test_df = formatData(test_tbl)

    # The next two values are computed as in the original but never used.
    actual = test_df[test_df.columns[-2]].astype('float32').tolist()
    before = predictor(train=train_df, test=test_df).rforest()

    how = HOW(train=train, test=test, bin=False)
    newTab = how.main()

    # Also unused: prediction on the planned table.
    newTab_df = formatData(newTab)
    after = predictor(train=train_df, test=newTab_df).rforest()

    return newTab
def main(self, name='Apache', reps=20):
    """Compare several planners on the dataset called `name`.

    For each of `reps` repetitions, ``self.explorer(name)`` is queried and
    every entry ``d`` whose ``d[0]`` path has `name` as its second-to-last
    '/'-separated component is processed.  Each planner's output table is
    scored as ``sum(after) / sum(before)``, where both are ``rforest()``
    results; the output of ``HOW(name)`` is appended as-is.

    Args:
        name: dataset name to match against the explored paths.
        reps: number of repetitions.

    Returns:
        ``[out_xtrees, out_cart, out_HOW, out_basln, out_baslnFss]``; each
        list begins with its label ('xtrees', 'CART', 'HOW', 'Base',
        'Base+FSS') followed by the collected values.
    """
    out_xtrees = ['xtrees']
    out_HOW = ['HOW']
    out_cart = ['CART']
    out_basln = ['Base']
    out_baslnFss = ['Base+FSS']

    for _ in xrange(reps):
        for d in self.explorer(name):
            if name != d[0].strip().split('/')[-2]:
                continue

            train = [d[0] + d[1][1]]
            test = [d[0] + d[1][0]]

            train_df = formatData(createTbl(train, _smote=False, isBin=False))
            test_df = formatData(createTbl(test, _smote=False, isBin=False))
            # Computed as in the original; not used afterwards.
            actual = test_df[test_df.columns[-2]].astype('float32').tolist()
            before = predictor(train=train_df, test=test_df).rforest()

            # Apply Different Planners
            xTrees = xtrees(
                train=train, test=test, bin=False, majority=True).main()
            cart = xtrees(
                train=train, test=test, bin=False, majority=False).main()
            how = HOW(name)
            baseln = strawman(train=train, test=test).main(config=True)
            baselnFss = strawman(
                train=train, test=test, prune=True).main(config=True)

            def ratio(newTab):
                # Score a planned table relative to the `before` result.
                after = predictor(
                    train=train_df, test=formatData(newTab)).rforest()
                return sum(after) / sum(before)

            out_xtrees.append(ratio(xTrees))
            out_cart.append(ratio(cart))
            out_HOW.extend(how)
            out_basln.append(ratio(baseln))
            out_baslnFss.append(ratio(baselnFss))

    return [out_xtrees, out_cart, out_HOW, out_basln, out_baslnFss]
def planner(self, train, test):
    """Return the table produced by the ``HOW`` planner for `train` / `test`.

    `train` and `test` are each turned into a table with
    ``createTbl(..., _smote=False, isBin=False)`` and then formatted.
    Predictions before and after planning are computed with ``rforest()``
    but are not part of the return value.

    Args:
        train, test: inputs accepted by ``createTbl`` and by ``HOW``.

    Returns:
        The result of ``HOW(train=train, test=test, bin=False).main()``.
    """
    frames = []
    for source in (train, test):
        table = createTbl(source, _smote=False, isBin=False)
        frames.append(formatData(table))
    train_df, test_df = frames

    # `actual`, `before` and `after` are evaluated but not used afterwards.
    last_but_one = test_df.columns[-2]
    actual = test_df[last_but_one].astype('float32').tolist()
    before = predictor(train=train_df, test=test_df).rforest()

    newTab = HOW(train=train, test=test, bin=False).main()

    newTab_df = formatData(newTab)
    after = predictor(train=train_df, test=newTab_df).rforest()

    return newTab
def main(self, name='Apache', reps=20):
    """Score several planners against the dataset called `name`.

    The explorer is queried once per repetition.  For every entry ``d``
    whose ``d[0]`` path carries `name` as its second-to-last '/'-separated
    component, the xtrees (majority on/off) and strawman (prune off/on)
    planners are run and each resulting table is scored as
    ``sum(after) / sum(before)`` using ``rforest()`` results.  The value of
    ``HOW(name)`` is extended into the 'HOW' list unchanged.

    Args:
        name: dataset name to match against the explored paths.
        reps: number of repetitions.

    Returns:
        A list of five lists in the order xtrees, CART, HOW, Base, Base+FSS;
        each starts with its label string followed by the collected values.
    """
    out_xtrees, out_HOW, out_cart = ['xtrees'], ['HOW'], ['CART']
    out_basln, out_baslnFss = ['Base'], ['Base+FSS']

    for _ in xrange(reps):
        data = self.explorer(name)
        for d in data:
            dataset = d[0].strip().split('/')[-2]
            if name != dataset:
                continue

            train = [d[0] + d[1][1]]
            test = [d[0] + d[1][0]]

            train_tbl = createTbl(train, _smote=False, isBin=False)
            train_df = formatData(train_tbl)
            test_tbl = createTbl(test, _smote=False, isBin=False)
            test_df = formatData(test_tbl)
            # Evaluated as in the original; the value is not used below.
            actual = test_df[test_df.columns[-2]].astype('float32').tolist()
            before = predictor(train=train_df, test=test_df).rforest()

            # Apply Different Planners
            xTrees = xtrees(
                train=train, test=test, bin=False, majority=True).main()
            cart = xtrees(
                train=train, test=test, bin=False, majority=False).main()
            how = HOW(name)
            baseln = strawman(train=train, test=test).main(config=True)
            baselnFss = strawman(
                train=train, test=test, prune=True).main(config=True)

            def score(newTab):
                # Ratio of the planned table's result to the `before` result.
                newTab_df = formatData(newTab)
                aft = predictor(train=train_df, test=newTab_df).rforest()
                return sum(aft) / sum(before)

            out_xtrees.append(score(xTrees))
            out_cart.append(score(cart))
            out_HOW.extend(how)
            out_basln.append(score(baseln))
            out_baslnFss.append(score(baselnFss))

    return [out_xtrees, out_cart, out_HOW, out_basln, out_baslnFss]
def planner(self, train, test, fSel, ext, _prune, _info, method='best'):
    """Run the ``WHAT`` planner and predict before and after planning.

    Args:
        train, test: table objects accepted by ``formatData`` and by
            ``WHAT`` (as ``train_df`` / ``test_df``).
        fSel, ext, _prune, _info, method: passed to ``WHAT`` as ``fSelect``,
            ``extent``, ``Prune``, ``infoPrune`` and ``method``.

    Returns:
        Tuple ``(actual, before, after)``: the second-to-last column of the
        formatted test frame as a list of float32 values, the ``rforest()``
        result on the original test frame, and the ``rforest()`` result on
        the formatted table returned by ``WHAT(...).main()``.
    """
    train_df = formatData(train)
    test_df = formatData(test)

    target_column = test_df.columns[-2]
    actual = test_df[target_column].astype('float32').tolist()

    before = predictor(train=train_df, test=test_df).rforest()

    what = WHAT(
        train=None,
        test=None,
        train_df=train,
        bin=True,
        test_df=test,
        extent=ext,
        fSelect=fSel,
        far=False,
        infoPrune=_info,
        method=method,
        Prune=_prune)
    newTab_df = formatData(what.main())

    after = predictor(train=train_df, test=newTab_df).rforest()

    return actual, before, after
def mainraw(self, name='Apache', reps=10, fSel=True, ext=0.5, _prune=False,
            _info=0.25, method='best'):
    """Gather raw ``rforest()`` results before and after ``WHAT`` planning.

    The explorer is queried once.  In each of `reps` repetitions, every
    entry ``d`` whose ``d[0]`` path ends in `name` is loaded (``d[1][1]`` as
    the training file, ``d[1][0]`` as the test file), predicted on, planned
    with ``WHAT`` and predicted on again.

    Args:
        name: last path component of the dataset directory to process.
        reps: number of repetitions.
        fSel, ext, _prune, _info, method: passed to ``WHAT`` as ``fSelect``,
            ``extent``, ``Prune``, ``infoPrune`` and ``method``.

    Returns:
        Tuple ``(before, after)`` of lists; one ``rforest()`` result is
        appended to each per processed entry per repetition.
    """
    data = self.explorer(name)
    before = []
    after = []

    for _ in xrange(reps):
        for d in data:
            folder = d[0].strip().split('/')[-1]
            if name != folder:
                continue

            train_file = d[0] + '/' + d[1][1]
            train = createTbl([train_file], isBin=False)
            test_file = d[0] + '/' + d[1][0]
            test = createTbl([test_file], isBin=False)

            train_df = formatData(train)
            test_df = formatData(test)
            # Evaluated as in the original; the value is not used below.
            actual = test_df[test_df.columns[-2]].astype('float32').tolist()

            before.append(predictor(train=train_df, test=test_df).rforest())

            newTab = WHAT(
                train=[train_file],
                test=[test_file],
                train_df=train,
                bin=True,
                test_df=test,
                extent=ext,
                fSelect=fSel,
                far=False,
                infoPrune=_info,
                method=method,
                Prune=_prune).main()
            newTab_df = formatData(newTab)

            after.append(predictor(train=train_df, test=newTab_df).rforest())

    return before, after
def main(self, name='Apache', reps=20):
    """Yield one result list per planner for the dataset called `name`.

    This is a generator.  ``rseed(1)`` is called once up front.  For each
    planner label in ``['DTREE', 'CD+FS', 'CD', 'BIC']`` the explorer is
    queried, and for every entry ``d`` whose ``d[0]`` path has `name` as its
    second-to-last '/'-separated component the planner is run `reps` times.
    Each run is scored as ``1 - sum(after) / sum(before)``, where both are
    ``rforest()`` results (on the planned table and on the original test
    frame respectively).

    Args:
        name: dataset name to match against the explored paths; also passed
            to the planners and to ``isValid``.
        reps: number of runs per planner per matching entry.

    Yields:
        ``[label, score, score, ...]`` for each planner label, in the order
        listed above.  A run whose scoring raises contributes no score: the
        debugger is entered via ``set_trace()`` and the loop then continues.
    """
    rseed(1)
    for planner in ['DTREE', 'CD+FS', 'CD', 'BIC']:
        out = [planner]
        data = self.explorer(name)
        for d in data:
            if name != d[0].strip().split('/')[-2]:
                continue

            train = [d[0] + d[1][1]]
            test = [d[0] + d[1][0]]
            train_df = formatData(createTbl(train, _smote=False, isBin=False))
            test_df = formatData(createTbl(test, _smote=False, isBin=False))
            # `valid` and `actual` are evaluated as in the original but are
            # not read anywhere in this method.
            valid = [isValid(new.cells, name=name)
                     for new in createTbl(
                         test, _smote=False, isBin=False)._rows]
            actual = test_df[test_df.columns[-2]].astype('float32').tolist()
            before = predictor(train=train_df, test=test_df).rforest()

            def after(newTab):
                # rforest() result on a planned table.
                return predictor(
                    train=train_df, test=formatData(newTab)).rforest()

            def frac(aft):
                # Score relative to the `before` result of this entry.
                return 1 - sum(aft) / sum(before)

            for _ in xrange(reps):
                newTab = None  # Just so I am sure, there isn't any residue.

                # Apply Different Planners
                if planner == 'xtrees':
                    # Not reachable with the planner labels listed above.
                    newTab = xtrees(train=train, test=test, bin=False,
                                    majority=True, name=name).main()
                elif planner == 'DTREE':
                    newTab = xtrees(train=train, test=test, bin=False,
                                    majority=False, name=name).main()
                    valid = [isValid(new.cells, name=name)
                             for new in newTab._rows]
                elif planner == 'BIC':
                    newTab = HOW(name)
                    valid = [isValid(new.cells, name=name)
                             for new in newTab._rows]
                elif planner == 'CD':
                    newTab = strawman(
                        name=name, train=train, test=test).main(mode="config")
                    valid = [isValid(new.cells, name=name)
                             for new in newTab._rows]
                elif planner == 'CD+FS':
                    newTab = strawman(
                        name=name, train=train, test=test,
                        prune=True).main(mode="config")
                    valid = [isValid(new.cells, name=name)
                             for new in newTab._rows]

                try:
                    out.append(frac(after(newTab)))
                except Exception:
                    # Narrowed from a bare `except:` so that Ctrl-C and
                    # SystemExit still stop the run instead of landing in
                    # the debugger.
                    set_trace()
        yield out