# Rescale the test features with z_norm_by_feature; mean_X and std_X are
# defined elsewhere in the script (presumably the train-set statistics --
# TODO confirm).
music_test.X = z_norm_by_feature(music_test.X, mean_X, std_X)

# Balancing the train data is currently disabled:
# print "Balacing train data."
# music_train.balance_data_oversampling_smote_regular()

# Set train parameters.
# lambdav = 0.00001
lambdav = 0
# alpha = 0.0000001
# iterations = 1000000
# NOTE: alpha and iterations are only referenced by the disabled gradient
# descent calls below; they are kept in case later code reads them.
alpha = 0.1
iterations = 1200

# theta comes from the normal equation, so the progress message now says so
# (it used to announce gradient descent while calling solve_normal_equation).
# print(...) with a single argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print("Solving normal equation.")
theta = solve_normal_equation(music_train.X, music_train.y, lambdav)

# Gradient descent alternatives, kept for experimentation:
# print "Solving using gradient descent."
# theta = gradient_descent(music_train.X, music_train.y, None, alpha, lambdav, iterations)
# theta, J_history = gradient_descent_with_J_history(music_train.X, music_train.y, None, alpha, lambdav, iterations)
# plot_history(J_history)

# Print the cost of theta on the train, validation and test sets, in that
# order.
print("Computing cost.")
print(compute_cost(music_train.X, music_train.y, theta, lambdav))
print(compute_cost(music_validation.X, music_validation.y, theta, lambdav))
print(compute_cost(music_test.X, music_test.y, theta, lambdav))

for delta_year in range(10):
    print(delta_year)
    print("Computing train accuracy.")
for year2 in range(2000, 2010):
    # year1 is not assigned in this loop; it comes from the enclosing scope.
    # Formatting both values into one string prints the same text as the
    # Python 2 statement `print year1, year2`.
    print("%s %s" % (year1, year2))
    delta_year = 5

    # Flat boolean masks that split the train targets at year1.
    less_year = (music_train.y <= year1).reshape(len(music_train.y))
    greater_year = (music_train.y > year1).reshape(len(music_train.y))

    # Copy of the targets where every entry is replaced by year1 (target
    # <= year1) or year2 (target > year1), stored as a single column.
    music_train_y_year_yes_or_not = np.array(music_train.y)
    music_train_y_year_yes_or_not[less_year] = year1
    music_train_y_year_yes_or_not[greater_year] = year2
    music_train_y_year_yes_or_not = music_train_y_year_yes_or_not.reshape(
        len(music_train_y_year_yes_or_not), 1)

    # < year or > year classifier.
    theta_year_yes_or_not = solve_normal_equation(
        music_train.X, music_train_y_year_yes_or_not, 0)

    # < year classifier: fit and score on the rows with target <= year1.
    X = music_train.X[np.where(less_year)]
    y = np.array(music_train.y[less_year])
    y = y.reshape(len(y), 1)
    theta_year_less = solve_normal_equation(X, y, 0)
    print(compute_accuracy(X, y, theta_year_less, delta_year))

    # > year classifier: fit and score on the rows with target > year1.
    # X already holds exactly the rows selected by greater_year, so it is
    # passed to the solver directly.
    X = music_train.X[np.where(greater_year)]
    y = np.array(music_train.y[greater_year])
    y = y.reshape(len(y), 1)
    theta_year_more = solve_normal_equation(X, y, 0)
    print(compute_accuracy(X, y, theta_year_more, delta_year))
# Alternative balancing strategies, kept for experimentation:
# music_train.balance_data_undersampling_cluster_centroids()
# music_train.balance_data_undersampling_tomek_links()
music_train.balance_data_ensemblesampling_balance_cascade()

# before_balacing_size is set elsewhere in the script.
after_balacing_size = len(music_train.X)
print("Before balacing size: " + str(before_balacing_size))
print("After balacing size: " + str(after_balacing_size))

# Set train parameters.
lambdav = 0.0000000001
n = len(music_train.X[0])

print("Solving normal equation.")

# Get thetas to reduce data: ordered_theta holds the indices of theta sorted
# by decreasing absolute value.
theta = solve_normal_equation(music_train.X, music_train.y, lambdav)
ordered_theta = np.argsort(np.abs(theta).reshape(len(theta)))
ordered_theta = ordered_theta[::-1]

# Initialize costs.
J_history_train = np.zeros(n)
J_history_validation = np.zeros(n)

for iteration in range(n):
    # Keep the first (n - iteration) columns of the ordering. The column
    # selection copies the matrix, so it is done once per data set and the
    # result is reused for both the fit and the cost evaluation.
    kept_columns = ordered_theta[:(n - iteration)]
    X_train_kept = music_train.X[:, kept_columns]
    X_validation_kept = music_validation.X[:, kept_columns]

    # The fit uses lambdav; the costs are evaluated with a lambda of 0.
    theta = solve_normal_equation(X_train_kept, music_train.y, lambdav)
    J_history_train[iteration] = compute_cost(
        X_train_kept, music_train.y, theta, 0)
    J_history_validation[iteration] = compute_cost(
        X_validation_kept, music_validation.y, theta, 0)

    print("Theta size: " + str(n - iteration))
    print("J_train: %f" % J_history_train[iteration])
    print("J_validation: %f" % J_history_validation[iteration])