def find_best_label_regression(df_old, col, y):
    """Pick the value of ``col`` that gives the best two-way split of ``y``.

    The rows are ordered by ``col``.  For every candidate row the ``y`` values
    are cut into a left part (all rows up to and including the candidate) and
    a right part (all remaining rows).  The candidate's score is
    ``mystats.least_squares(left) + mystats.least_squares(right)`` and the
    ``col`` value with the lowest score is returned.

    The first and the last row of the ordered frame are never candidates.
    When several rows share the same ``col`` value, the score of the last
    such candidate row is the one that is kept.

    Args:
        df_old: pandas DataFrame holding at least the columns ``col`` and
            ``y``.  It is not modified.
        col: name of the feature column to split on.
        y: name of the target column.

    Returns:
        The ``col`` value (as read from the candidate row) with the smallest
        combined score.

    Raises:
        ValueError: if the frame has fewer than three rows, so that no
            candidate split exists.
    """
    # DataFrame.sort() was removed in pandas 0.20; prefer sort_values() and
    # only fall back to the legacy method on very old pandas releases.
    if hasattr(df_old, "sort_values"):
        ordered = df_old.sort_values(by=[col])
    else:
        ordered = df_old.sort([col])
    ordered = ordered.reset_index(drop=True)

    # Diagnostic output; single-argument print() emits the same text on
    # Python 2 and Python 3.
    print("Print reset index")
    print(df_old[y][0:10])
    print(ordered[y][0:10])
    print("Finding label for " + col)

    # Materialise the target column once instead of twice per iteration.
    y_values = list(ordered[y])
    n_rows = len(y_values)

    scores = {}
    for i, (_, row) in enumerate(ordered.iterrows(), start=1):
        # i is the size of the left part; skip the first row and the
        # position that would leave the right part empty.
        if i == 1 or i >= n_rows:
            continue
        # NOTE(review): mystats.least_squares is called with a plain list
        # here -- presumably it returns a numeric error score; confirm
        # against the helper's signature.
        scores[row[col]] = (
            mystats.least_squares(y_values[:i])
            + mystats.least_squares(y_values[i:])
        )

    if not scores:
        raise ValueError(
            "need at least three rows to evaluate a split on {!r}".format(col)
        )
    return min(scores, key=scores.get)
def compute_info_gain_regression(df, feature, split, y):
    """Compute and print the gain from splitting ``df`` at ``feature <= split``.

    The gain is ``mystats.least_squares`` of the two-column frame
    (``feature`` and ``y``) minus the same measure on the rows where
    ``feature <= split`` and minus the measure on all remaining rows.

    Args:
        df: pandas DataFrame holding at least the columns ``feature`` and
            ``y``.
        feature: name of the column the split is applied to.
        split: threshold compared against ``feature`` with ``<=``.
        y: name of the target column, forwarded to
            ``mystats.least_squares``.

    Returns:
        The computed gain, exactly as produced by the subtraction.
    """
    subset = df[[feature, y]]
    at_or_below = subset[feature] <= split
    left = subset[at_or_below]
    # Everything for which the comparison is false ends up on the right.
    right = subset[~at_or_below]

    # NOTE(review): presumably least_squares(frame, column) returns a numeric
    # error score for that column -- confirm against mystats.
    info_gain = (
        mystats.least_squares(subset, y)
        - mystats.least_squares(left, y)
        - mystats.least_squares(right, y)
    )
    # Single-argument print() emits the same text on Python 2 and Python 3.
    print("Information Gain: %s" % info_gain)
    return info_gain