def find_best_label_regression(df_old, col, y):
    """Return the value of ``col`` whose split gives the smallest summed error on ``y``.

    The rows of ``df_old`` are ordered by ``col``.  For every cut position
    ``i`` (the first ``i`` ordered rows versus the remaining rows) the score

        mystats.least_squares(first i y-values) + mystats.least_squares(remaining y-values)

    is stored under the ``col`` value of the last row of the first part, and
    the key with the smallest score is returned.  When several rows share the
    same ``col`` value, the score of the last such cut overwrites the earlier
    ones.

    Only cut positions ``2 .. len(df_old) - 1`` are scored: a first part
    holding a single row and an empty second part are both skipped.

    NOTE(review): ``mystats.least_squares`` is defined outside this block; it
    is presumably a squared-error measure of a list of numbers -- confirm.

    Args:
        df_old: pandas DataFrame containing the columns ``col`` and ``y``.
            It is not modified.
        col: Name of the feature column to split on (a string; it is
            concatenated into a progress message).
        y: Name of the target column.

    Returns:
        The ``col`` value (as yielded by ``DataFrame.iterrows``) with the
        lowest combined score.

    Raises:
        ValueError: If ``df_old`` has fewer than three rows, so that no cut
            position can be scored.
    """
    # DataFrame.sort was removed in pandas 0.20; use sort_values when the
    # installed pandas provides it and fall back to the legacy method
    # otherwise.  Neither call sorts in place, so no defensive copy is needed.
    if hasattr(df_old, "sort_values"):
        ordered = df_old.sort_values(by=[col])
    else:
        ordered = df_old.sort([col])
    ordered = ordered.reset_index(drop=True)

    print("Print reset index")
    print(df_old[y][0:10])
    print(ordered[y][0:10])
    print("Finding label for " + col)

    # Materialise the target column once instead of on every iteration.
    targets = list(ordered[y])
    n_rows = len(ordered)

    least_sq = {}
    for position, (_, row) in enumerate(ordered.iterrows(), 1):
        # Skip the cut that leaves one row on the left and the cut that
        # leaves nothing on the right.
        if position == 1 or position > n_rows - 1:
            continue
        least_sq[row[col]] = (
            mystats.least_squares(targets[:position])
            + mystats.least_squares(targets[position:])
        )

    if not least_sq:
        raise ValueError(
            "need at least 3 rows to evaluate a split on %r, got %d" % (col, n_rows)
        )
    return min(least_sq, key=least_sq.get)
def compute_info_gain_regression(df, feature, split, y):
    """Compute and print the gain obtained by splitting ``df`` at ``split``.

    Only the ``feature`` and ``y`` columns are used.  Rows are divided into
    those whose ``feature`` value is ``<= split`` and all remaining rows, and
    the gain is

        least_squares(all rows) - least_squares(<= split) - least_squares(rest)

    where each term is ``mystats.least_squares(frame, y)``.

    NOTE(review): ``mystats.least_squares`` is defined outside this block;
    presumably it returns a squared-error measure of column ``y`` -- confirm.

    Args:
        df: pandas DataFrame containing the columns ``feature`` and ``y``.
        feature: Name of the column the threshold is applied to.
        split: Threshold value compared against ``feature`` with ``<=``.
        y: Name of the target column.

    Returns:
        The computed gain, which is also printed to standard output.
    """
    subset = df[[feature, y]]
    at_or_below = subset[feature] <= split
    lower_part = subset[at_or_below]
    upper_part = subset[~at_or_below]

    # Evaluate parent, lower and upper terms in that order.
    parent_error = mystats.least_squares(subset, y)
    lower_error = mystats.least_squares(lower_part, y)
    upper_error = mystats.least_squares(upper_part, y)

    info_gain = parent_error - lower_error - upper_error
    print("Information Gain: %s" % info_gain)
    return info_gain