def split_y(data, col):
    """Binarise column ``col`` around its median and append rows to a CSV.

    After ``replace_nan(data, col)`` is applied (presumably an in-place NaN
    fill -- confirm against its definition), every row gets a label of 1 when
    its value in ``col`` is strictly greater than the column median and 0
    otherwise.  For each row, the values of columns 3, 7 and 8 followed by
    that label are appended as one comma-separated line to
    ``data/matrix_data/logit/wifi_features_<label>.csv`` where ``<label>`` is
    ``LABELS[col - 16]``.

    The file is opened in append mode, so repeated calls for the same column
    accumulate rows rather than overwrite them.

    Args:
        data: 2-D NumPy array; must have at least 9 columns since columns
            3, 7 and 8 are read.
        col: index of the column to binarise.
            NOTE(review): ``col - 16`` becomes a negative index into LABELS
            when ``col < 16`` and would silently pick a label from the end.

    Returns:
        None.  Debug information is printed to stdout and rows are written
        to the output file.
    """
    replace_nan(data, col)
    print(data[:, col])
    median = np.median(data[:, col])
    print(median)

    # Decide the binary label once per row; it is used both for the debug
    # dump below and for the file output.
    row_count = data.shape[0]
    labels = [1 if data[i][col] > median else 0 for i in range(row_count)]

    # Debug dump: features, raw target value and resulting label per row.
    for i, y in enumerate(labels):
        print('=========')
        print(data[i, [3, 7, 8]])
        print(data[i, col])
        print(y)

    label = LABELS[col - 16]
    output_fp = os.path.join('data', 'matrix_data', 'logit',
                             "wifi_features_" + label + '.csv')

    # 'with' guarantees the handle is closed even if a write fails midway.
    with open(output_fp, 'a') as fw:
        for i, y in enumerate(labels):
            line = [str(x) for x in data[i, [3, 7, 8]].tolist()]
            line.append(str(y))
            fw.write(','.join(line) + '\n')
if __name__ == '__main__':
    # Script entry point: for every target column, run a k-fold cross
    # validation (fit with get_beta, score with get_mse) and print the mean
    # of the per-fold MSE values.

    # Built with os.path.join so the path also resolves on non-Windows hosts.
    fp = os.path.join("data", "matrix_data", "all_wifi_features.csv")
    data = np.genfromtxt(fp, delimiter=",", dtype=float, skip_header=1)

    # NOTE(review): x_cols, n and m are not read below; they are kept as
    # module-level names in case helpers defined elsewhere rely on them.
    x_cols = [1, 2, 3]
    n = data.shape[0]
    m = data.shape[1]
    fold = 5

    # Fold membership of each row: row i belongs to fold (i % fold).
    fold_of_row = np.arange(n) % fold

    y_cols = range(4, 19)
    for y_col in y_cols:
        print('================================================================')
        print(LABELS[y_col - 4])
        replace_nan(data, y_col)

        avg_mse = 0.0
        for j in range(fold):
            # Select rows with a boolean mask instead of copying them into
            # preallocated buffers by hand-computed index.  The mask keeps
            # train and test disjoint for every fold and also works when the
            # row count is not a multiple of `fold`.
            test_mask = fold_of_row == j
            test_data = data[test_mask]
            train_data = data[~test_mask]

            beta = get_beta(train_data, y_col)
            mse = get_mse(test_data, beta, y_col)
            avg_mse += mse
        avg_mse /= fold
        print("average mse: " + str(avg_mse))