def run():
    file_path = FileUtils.get_abs_path(__file__, "./data/emailSample1.txt")
    vocab_path = FileUtils.get_abs_path(__file__, "./data/vocab.txt")
    with open(file_path, "r") as f:
        file_contents = f.read()
    with open(vocab_path, "r") as f:
        vocabList = f.read().split("\n")[:-1]

    # Build a word -> index dictionary from the "index\tword" vocab file.
    vocabList_d = {}
    for ea in vocabList:
        value, key = ea.split("\t")
        vocabList_d[key] = value

    print(file_contents)
    word_indices = process_email(file_contents, vocabList_d)
    features = email_features(word_indices, vocabList_d)
    print("Length of feature vector: ", len(features))
    print("Number of non-zero entries: ", np.sum(features))

    # Train a linear SVM on the preprocessed spam training set.
    spam_mat_path = FileUtils.get_abs_path(__file__, "./data/spamTrain.mat")
    spam_mat = loadmat(spam_mat_path)
    X_train = spam_mat["X"]
    y_train = spam_mat["y"]
    C = 0.1
    spam_svc = SVC(C=C, kernel="linear")
    spam_svc.fit(X_train, y_train.ravel())
    print("Training Accuracy:", spam_svc.score(X_train, y_train.ravel()) * 100, "%")

    spam_mat_test_path = FileUtils.get_abs_path(__file__, "./data/spamTest.mat")
    spam_mat_test = loadmat(spam_mat_test_path)
    X_test = spam_mat_test["Xtest"]
    y_test = spam_mat_test["ytest"]
    print("Test Accuracy:", spam_svc.score(X_test, y_test.ravel()) * 100, "%")

    # Classify a new email sample.
    file_path = FileUtils.get_abs_path(__file__, "./data/spamSample1.txt")
    with open(file_path, "r") as f:
        file_contents = f.read()
    word_indices = process_email(file_contents, vocabList_d)
    features = email_features(word_indices, vocabList_d)
    features = features.reshape([1, 1899])
    print(spam_svc.predict(features))
    print('1 is spam, 0 is not spam')
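# `email_features` is defined elsewhere in this repo. A minimal sketch of what
# it is assumed to do (map an email's word indices onto a binary vector over
# the 1899-word vocabulary); `email_features_sketch` is a hypothetical name:
def email_features_sketch(word_indices, vocab_dict):
    n = len(vocab_dict)  # 1899 for the course vocabulary
    features = np.zeros((n, 1))
    for idx in word_indices:
        features[int(idx) - 1] = 1  # indices in vocab.txt are assumed 1-based
    return features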
def run():
    data_path = FileUtils.get_abs_path(__file__, "./data/ex3weights.mat")
    mat2 = loadmat(data_path)
    Theta1 = mat2['Theta1']
    Theta2 = mat2['Theta2']
    np.set_printoptions(suppress=True)

    data_path = FileUtils.get_abs_path(__file__, "./data/ex3data1.mat")
    mat = loadmat(data_path)
    X = mat["X"]
    y = mat["y"]

    res = predict_nn(Theta1, Theta2, X)
    print("Accuracy on training set with Neural Network:", np.mean(res == y) * 100)
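# `predict_nn` lives elsewhere in the repo. A minimal sketch of the forward
# pass it is assumed to perform (sigmoid activations, 1-based class labels to
# match the MATLAB-style data); `predict_nn_sketch` is a hypothetical name:
def predict_nn_sketch(Theta1, Theta2, X):
    m = X.shape[0]
    a1 = np.hstack([np.ones((m, 1)), X])              # add bias unit
    a2 = 1 / (1 + np.exp(-(a1 @ Theta1.T)))           # hidden layer
    a2 = np.hstack([np.ones((m, 1)), a2])             # add bias unit
    a3 = 1 / (1 + np.exp(-(a2 @ Theta2.T)))           # output layer
    return (np.argmax(a3, axis=1) + 1).reshape(m, 1)  # labels are 1..10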
def run():
    np.set_printoptions(suppress=True)
    data_path = FileUtils.get_abs_path(__file__, "./data/ex3data1.mat")
    mat = loadmat(data_path)
    X = mat["X"]
    y = mat["y"]

    # Show 100 random digits; reshape each row back to a 20x20 pixel image.
    # randint's upper bound is exclusive, so use 5000 (X has 5000 rows).
    fig, axis = plt.subplots(10, 10, figsize=(12, 12))
    for i in range(10):
        for j in range(10):
            axis[i, j].imshow(X[np.random.randint(0, 5000), :].reshape(20, 20, order="F"),
                              cmap="hot")
            axis[i, j].axis("off")
    plt.show()

    # Sanity-check the regularized cost function against known values.
    theta_t = str2arr('[-2; -1; 1; 2]')
    X_t = np.array([np.linspace(0.1, 1.5, 15)]).reshape(3, 5).T
    X_t = np.hstack((np.ones((5, 1)), X_t))
    y_t = str2arr('[1;0;1;0;1]')
    lambda_t = 3
    cost, grad = cost_function_regularized(theta_t, X_t, y_t, lambda_t)
    print("Cost:", cost, "Expected cost: 2.534819")
    print("Gradients:\n", grad,
          "\nExpected gradients:\n 0.146561\n -0.548558\n 0.724722\n 1.398003")

    lambda_value = 0.1
    num_labels = 10
    all_theta = one_vs_all(X, y, num_labels, lambda_value)
    res = predict_one_vs_all(all_theta, X)
    print("Accuracy on training set with OneVsAll:", np.mean(res == y) * 100)
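# `cost_function_regularized` is defined elsewhere. Under the usual
# conventions (sigmoid hypothesis, bias term not regularized) a minimal
# sketch consistent with the expected values above would be:
def cost_function_regularized_sketch(theta, X, y, lambda_value):
    m = len(y)
    theta = theta.reshape(-1, 1)
    h = 1 / (1 + np.exp(-(X @ theta)))
    reg = (lambda_value / (2 * m)) * np.sum(theta[1:] ** 2)
    J = (-1 / m) * (y.T @ np.log(h) + (1 - y).T @ np.log(1 - h)) + reg
    grad = (1 / m) * (X.T @ (h - y))
    grad[1:] += (lambda_value / m) * theta[1:]  # skip the bias term
    return J.item(), grad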
def run():
    data_path = FileUtils.get_abs_path(__file__, "./data/ex2data1.txt")
    data = np.loadtxt(data_path, delimiter=',')
    n = np.size(data, 1)
    x = data[:, range(n - 1)]
    y = data[:, n - 1]
    m = np.size(y, 0)
    x = np.reshape(x, [m, n - 1])
    y = np.reshape(y, [m, 1])

    # Add the intercept column of ones.
    ones = np.ones([m, 1])
    x = np.hstack([ones, x])
    theta = np.zeros([n, 1])

    cost, grad = cost_function(theta, x, y)
    print("Cost with theta [0;0;0]: ", cost)
    print('Gradient with theta [0;0;0]:\n', grad)

    test_theta = str2arr('[-24; 0.2; 0.2]')
    cost, grad = cost_function(test_theta, x, y)
    print("Cost with theta [-24; 0.2; 0.2]: ", cost)
    print('Gradient with theta [-24; 0.2; 0.2]:\n', grad)

    # Minimize the cost; cost_function returns (cost, gradient), hence jac=True.
    Result = op.minimize(fun=cost_function, x0=theta, args=(x, y),
                         method='TNC', jac=True)
    optimal_theta = Result.x
    print('Optimal theta: ', optimal_theta)

    res = predict(optimal_theta, x)
    print("Accuracy:", np.mean((res == y).flatten()) * 100)
    plot_decision_boundary(optimal_theta, x, y)
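# `predict` is defined elsewhere in the repo; presumably it thresholds the
# sigmoid hypothesis at 0.5. A minimal sketch under that assumption:
def predict_sketch(theta, X):
    h = 1 / (1 + np.exp(-(X @ theta.reshape(-1, 1))))
    return (h >= 0.5).astype(int)  # 1 = admitted/positive class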
def plot_data():
    data_path = FileUtils.get_abs_path(__file__, "./data/ex1data1.txt")
    data = np.loadtxt(data_path, delimiter=',')
    x = data[:, 0]
    y = data[:, 1]
    plt.scatter(x, y, marker='x', color='red')  # 'cmap' is not valid for a single color
    plt.xlabel("Population of City in 10,000s")
    plt.ylabel('Profit in $10,000s')
def run():
    data_path = FileUtils.get_abs_path(__file__, "./data/ex6data1.mat")
    mat = loadmat(data_path)
    X = mat["X"]
    y = mat["y"]
    plot_data(X, y)

    # Linear kernel on the first (linearly separable) dataset.
    classifier = SVC(C=1, kernel="linear")
    classifier.fit(X, np.ravel(y))
    plot_svc(classifier, X)

    x1 = np.array([1, 2, 1])
    x2 = np.array([0, 4, -1])
    sigma = 2
    print(gaussian_kernel(x1, x2, sigma))

    # RBF kernel on the second (non-linear) dataset.
    data_path = FileUtils.get_abs_path(__file__, "./data/ex6data2.mat")
    data2 = loadmat(data_path)
    X2 = data2['X']
    y2 = data2['y']
    plot_data(X2, y2)
    clf2 = SVC(kernel='rbf', gamma=30)
    clf2.fit(X2, y2.ravel())
    plot_svc(clf2, X2)

    # Pick C and gamma on a validation set for the third dataset.
    data_path = FileUtils.get_abs_path(__file__, "./data/ex6data3.mat")
    data3 = loadmat(data_path)
    X3 = data3["X"]
    y3 = data3["y"]
    Xval = data3["Xval"]
    yval = data3["yval"]
    plot_data(X3, y3)
    C, gamma = dataset_3_params(X3, y3, Xval, yval)
    clf3 = SVC(C=C, kernel='rbf', gamma=gamma)
    clf3.fit(X3, y3.ravel())
    plot_svc(clf3, X3)
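# `gaussian_kernel` is implemented elsewhere in the repo; the standard
# definition it presumably follows is exp(-||x1 - x2||^2 / (2 * sigma^2)),
# which gives about 0.324652 for the example above. A minimal sketch:
def gaussian_kernel_sketch(x1, x2, sigma):
    diff = x1 - x2
    return np.exp(-(diff @ diff) / (2 * sigma ** 2))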
def run():
    data_path = FileUtils.get_abs_path(__file__, "./data/ex7data2.mat")
    mat = loadmat(data_path)
    X = mat["X"]
    K = 3
    initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])
    idx = find_closest_centroids(X, initial_centroids)
    print("Closest centroids for the first 3 examples:\n", idx[0:3])
    centroids = compute_centroids(X, idx, K)
    print("Centroids computed after initial finding of closest centroids:\n", centroids)

    # Run K-means from a random initialization and visualize the iterations.
    initial_centroids = init_random_centroid(X, K)
    idx = find_closest_centroids(X, initial_centroids)
    plot_kmeans(X, initial_centroids, idx, K, 10)
    plt.show()

    # Image compression: cluster the pixel colors into K2 = 16 centroids.
    data_path = FileUtils.get_abs_path(__file__, "./data/bird_small.png")
    A = plt.imread(data_path)  # PNG pixels are already floats in [0, 1], so no /255
    img_size1, img_size2, rgb = A.shape
    X2 = A.reshape(img_size1 * img_size2, 3)
    K2 = 16
    num_iters = 10
    initial_centroids2 = init_random_centroid(X2, K2)
    centroids2, idx2 = run_kmeans(X2, initial_centroids2, num_iters, K2)
    X2_recovered = centroids2[idx2, :].reshape(A.shape)

    fig, ax = plt.subplots(1, 2, figsize=(8, 4))
    ax[0].imshow(A)
    ax[0].set_title('Original')
    ax[0].grid(False)
    # Display the compressed image rebuilt from the 16 centroid colors.
    ax[1].imshow(X2_recovered)
    ax[1].set_title('Compressed, with %d colors' % K2)
    ax[1].grid(False)
    plt.show()
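# `find_closest_centroids` is defined elsewhere; a minimal vectorized sketch
# of the assignment step it is assumed to implement:
def find_closest_centroids_sketch(X, centroids):
    # Squared distance from every point to every centroid: shape (m, K).
    dists = ((X[:, np.newaxis, :] - centroids[np.newaxis, :, :]) ** 2).sum(axis=2)
    return np.argmin(dists, axis=1)  # index of the nearest centroid per point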
def run():
    data_path = FileUtils.get_abs_path(__file__, "./data/ex1data1.txt")
    data = np.loadtxt(data_path, delimiter=',')
    n = np.size(data, 1)
    x = data[:, range(n - 1)]
    y = data[:, n - 1]
    m = np.size(y, 0)
    x = np.reshape(x, [m, n - 1])
    y = np.reshape(y, [m, 1])
    ones = np.ones([m, 1])
    x = np.hstack([ones, x])

    theta = np.zeros([n, 1])
    alpha = 0.01
    iterations = 1500

    cost = cost_function_j(x, y, theta)
    print('Cost', cost)
    thetaRes, j_hist = gradient_descent(x, y, theta, alpha, iterations)
    print(thetaRes)
    cost = cost_function_j(x, y, str2arr('[-1;2]'))
    print(cost)

    # Evaluate the cost over a grid of (theta0, theta1) for visualization.
    theta0_vals = np.linspace(-10, 10, 100)
    theta1_vals = np.linspace(-1, 4, 100)
    J_vals = np.zeros([len(theta0_vals), len(theta1_vals)])
    for i in range(len(theta0_vals)):
        for j in range(len(theta1_vals)):
            t = np.vstack([theta0_vals[i], theta1_vals[j]])
            J_vals[i, j] = cost_function_j(x, y, t)

    pltData.plot_data()
    plt.plot(x[:, 1], x @ thetaRes, '-', color='red')

    # contour expects Z indexed as [y, x], so transpose J_vals.
    fig1 = plt.figure()
    ax = fig1.add_subplot(111)
    ax.contour(theta0_vals, theta1_vals, np.transpose(J_vals), np.logspace(-2, 3, 20))

    fig2 = plt.figure()
    ax2 = fig2.add_subplot(111, projection='3d')
    theta0_vals, theta1_vals = np.meshgrid(theta0_vals, theta1_vals)
    ax2.plot_surface(theta0_vals, theta1_vals, np.transpose(J_vals))
    plt.show()
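# `cost_function_j` and `gradient_descent` live elsewhere in the repo; a
# minimal sketch of the batch update they are assumed to perform, with the
# squared-error cost recorded after every step:
def gradient_descent_sketch(x, y, theta, alpha, iterations):
    m = len(y)
    j_hist = []
    for _ in range(iterations):
        theta = theta - (alpha / m) * (x.T @ (x @ theta - y))  # batch update
        j_hist.append(((x @ theta - y) ** 2).sum() / (2 * m))  # J(theta)
    return theta, np.array(j_hist)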
def run():
    data_path = FileUtils.get_abs_path(__file__, "./data/ex8data1.mat")
    mat = loadmat(data_path)
    X = mat["X"]
    Xval = mat["Xval"]
    yval = mat["yval"]

    plt.scatter(X[:, 0], X[:, 1], marker="x")
    plt.xlim(0, 30)
    plt.ylim(0, 30)
    plt.xlabel("Latency (ms)")
    plt.ylabel("Throughput (mb/s)")
    plt.show()

    # Fit a Gaussian to each feature and score every example.
    mu, sigma2 = estimate_gaussian(X)
    p = multivariate_gaussian(X, mu, sigma2)
    visualize_fit(X, mu, sigma2)

    # Pick the threshold that maximizes F1 on the cross-validation set.
    pval = multivariate_gaussian(Xval, mu, sigma2)
    epsilon, F1 = select_threshold(yval, pval)
    print("Best epsilon found using cross-validation:", epsilon)
    print("Best F1 on Cross Validation Set:", F1)

    # Circle the examples whose probability falls below the threshold.
    outliers = np.nonzero(p < epsilon)[0]
    plt.scatter(X[outliers, 0], X[outliers, 1], marker="o",
                facecolor="none", edgecolor="r", s=70)
    plt.xlim(0, 35)
    plt.ylim(0, 35)
    plt.xlabel("Latency (ms)")
    plt.ylabel("Throughput (mb/s)")
    plt.show()
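# `estimate_gaussian` is defined elsewhere; it presumably returns the
# per-feature mean and maximum-likelihood variance that
# `multivariate_gaussian` consumes. A minimal sketch:
def estimate_gaussian_sketch(X):
    mu = X.mean(axis=0)
    sigma2 = X.var(axis=0)  # ML variance (divides by m, not m - 1)
    return mu, sigma2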
def run():
    data_path = FileUtils.get_abs_path(__file__, "./data/ex1data2.txt")
    data = np.loadtxt(data_path, delimiter=',')
    n = np.size(data, 1)
    x = data[:, range(n - 1)]
    y = data[:, n - 1]
    m = np.size(y, 0)
    x = np.reshape(x, [m, n - 1])
    y = np.reshape(y, [m, 1])

    ones = np.ones([m, 1])
    X, mu, sigma = feature_normalize(x)
    x = np.hstack([ones, x])  # raw features, for the normal equation
    X = np.hstack([ones, X])  # normalized features, for gradient descent

    theta = np.zeros([n, 1])
    alpha = 0.01
    iterations = 400

    cost = cost_function_j(X, y, theta)
    print('Cost', cost)
    thetaRes, j_hist = gradient_descent(X, y, theta, alpha, iterations)
    print('Theta using gradient descent:\n', thetaRes)
    print('Price of 1650 sq ft and 3 bedroom house: ',
          predict([[1650, 3]], thetaRes, mu, sigma))

    plt.plot(range(iterations), j_hist)
    plt.xlabel("No of iterations")
    plt.ylabel("Cost")
    plt.title("Gradient Descent")
    plt.show()

    # The closed-form normal equation needs no feature scaling.
    thetaRes = normal_equation(x, y)
    print('Theta using normal equation: \n', thetaRes)
    price = thetaRes.T @ np.array([[1], [1650], [3]])
    print('Price of 1650 sq ft and 3 bedroom house: ', price[0][0])
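# `normal_equation` is implemented elsewhere; the standard closed form it is
# assumed to compute, with pinv for numerical robustness:
def normal_equation_sketch(x, y):
    return np.linalg.pinv(x.T @ x) @ x.T @ y  # theta = (X^T X)^-1 X^T y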
def run():
    data_path = FileUtils.get_abs_path(__file__, "./data/ex8_movies.mat")
    mat3 = loadmat(data_path)
    data_path = FileUtils.get_abs_path(__file__, "./data/ex8_movieParams.mat")
    mat4 = loadmat(data_path)

    Y = mat3["Y"]          # 1682 x 943 matrix of ratings (1-5) of 1682 movies by 943 users
    R = mat3["R"]          # 1682 x 943 matrix, where R(i,j) = 1 iff user j rated movie i
    X = mat4["X"]          # 1682 x 10, num_movies x num_features matrix of movie features
    Theta = mat4["Theta"]  # 943 x 10, num_users x num_features matrix of user features

    # Compute average rating
    print("Average rating for movie 1 (Toy Story):",
          np.sum(Y[0, :] * R[0, :]) / np.sum(R[0, :]), "/5")

    # Reduce the data set size so the cost check runs faster
    num_users, num_movies, num_features = 4, 5, 3
    X_test = X[:num_movies, :num_features]
    Theta_test = Theta[:num_users, :num_features]
    Y_test = Y[:num_movies, :num_users]
    R_test = R[:num_movies, :num_users]
    params = np.append(X_test.flatten(), Theta_test.flatten())

    # Evaluate the cost function, without and with regularization
    J, grad = cofi_cost_function(params, Y_test, R_test, num_users,
                                 num_movies, num_features, 0)
    print("Cost at loaded parameters:", J)
    J2, grad2 = cofi_cost_function(params, Y_test, R_test, num_users,
                                   num_movies, num_features, 1.5)
    print("Cost at loaded parameters (lambda = 1.5):", J2)

    # Load the movie list
    data_path = FileUtils.get_abs_path(__file__, "./data/movie_ids.txt")
    movieList = open(data_path, "r").read().split("\n")[:-1]

    # Create ratings for a new user
    my_ratings = np.zeros((1682, 1))
    my_ratings[0] = 4
    my_ratings[97] = 2
    my_ratings[6] = 3
    my_ratings[11] = 5
    my_ratings[53] = 4
    my_ratings[63] = 5
    my_ratings[65] = 3
    my_ratings[68] = 5
    my_ratings[82] = 4
    my_ratings[225] = 5
    my_ratings[354] = 5

    print("New user ratings:\n")
    for i in range(len(my_ratings)):
        if my_ratings[i] > 0:
            print("Rated", int(my_ratings[i, 0]), "for movie", movieList[i])

    # Add the new ratings as an extra column of Y and R
    Y = np.hstack((my_ratings, Y))
    R = np.hstack((my_ratings != 0, R))

    # Normalize ratings
    Ynorm, Ymean = normalize_ratings(Y, R)
    num_movies, num_users = Y.shape
    num_features = 10

    # Set initial parameters (Theta, X)
    X = np.random.randn(num_movies, num_features)
    Theta = np.random.randn(num_users, num_features)
    initial_parameters = np.append(X.flatten(), Theta.flatten())
    Lambda = 10

    options = {'maxiter': 100}
    result = op.minimize(fun=cofi_cost_function, x0=initial_parameters,
                         args=(Ynorm, R, num_users, num_movies, num_features, Lambda),
                         method='TNC', jac=True, options=options)
    paramsFinal = result.x
    X = paramsFinal[0:num_movies * num_features].reshape(num_movies, num_features)
    Theta = paramsFinal[num_movies * num_features:].reshape(num_users, num_features)

    # Predict ratings and list the ten best movies for the new user
    p = X @ Theta.T
    my_predictions = p[:, 0][:, np.newaxis] + Ymean
    df = pd.DataFrame(np.hstack((my_predictions, np.array(movieList)[:, np.newaxis])))
    df.sort_values(by=[0], ascending=False, inplace=True)
    df.reset_index(drop=True, inplace=True)
    print("Top recommendations for you:\n")
    for i in range(10):
        print("Predicting rating", round(float(df[0][i]), 1), "for movie", df[1][i])
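# `normalize_ratings` lives elsewhere in the repo; it presumably subtracts
# each movie's mean rating, computed only over rated entries. A minimal sketch:
def normalize_ratings_sketch(Y, R):
    Ymean = (np.sum(Y * R, axis=1) / np.sum(R, axis=1)).reshape(-1, 1)
    Ynorm = (Y - Ymean) * R  # keep unrated entries at zero
    return Ynorm, Ymean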
def run():
    np.set_printoptions(suppress=True)
    data_path = FileUtils.get_abs_path(__file__, "./data/ex4data1.mat")
    mat = loadmat(data_path)
    X = mat["X"]
    y = mat["y"]

    data_path = FileUtils.get_abs_path(__file__, "./data/ex4weights.mat")
    mat2 = loadmat(data_path)
    Theta1 = mat2['Theta1']
    Theta2 = mat2['Theta2']
    nn_params = np.append(Theta1.flatten(), Theta2.flatten())

    input_layer_size = 400
    hidden_layer_size = 25
    num_labels = 10

    # Cost at the loaded weights, without and with regularization.
    lambda_value = 0
    cost = nn_cost_function(nn_params, input_layer_size, hidden_layer_size,
                            num_labels, X, y, lambda_value)[0]
    print(cost)
    lambda_value = 1
    cost = nn_cost_function(nn_params, input_layer_size, hidden_layer_size,
                            num_labels, X, y, lambda_value)[0]
    print(cost)

    # Train from random initial weights to break symmetry,
    # once with lambda = 1 and once with lambda = 2.
    initial_Theta1 = random_init_weights(input_layer_size, hidden_layer_size)
    initial_Theta2 = random_init_weights(hidden_layer_size, num_labels)
    nn_params_rand = np.append(initial_Theta1.flatten(), initial_Theta2.flatten())

    for lambda_value in (1, 2):
        options = {'maxiter': 100}
        result = op.minimize(fun=nn_cost_function, x0=nn_params_rand,
                             args=(input_layer_size, hidden_layer_size,
                                   num_labels, X, y, lambda_value),
                             method='TNC', jac=True, options=options)
        optimal_theta = result.x
        Theta1 = optimal_theta[0:hidden_layer_size * (input_layer_size + 1)]
        Theta1 = np.reshape(Theta1, [hidden_layer_size, input_layer_size + 1])
        Theta2 = optimal_theta[hidden_layer_size * (input_layer_size + 1):]
        Theta2 = np.reshape(Theta2, [num_labels, hidden_layer_size + 1])
        print(Theta1.shape)
        print(Theta2.shape)
        res = predict_nn(Theta1, Theta2, X)
        print("Accuracy on training set with Neural Network:",
              np.mean(res == y) * 100)
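# `random_init_weights` is defined elsewhere; it presumably draws uniform
# values in [-epsilon, epsilon] (including the bias column) to break symmetry.
# The epsilon = 0.12 default is an assumption taken from the course handout:
def random_init_weights_sketch(L_in, L_out, epsilon=0.12):
    return np.random.rand(L_out, L_in + 1) * 2 * epsilon - epsilon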
def run():
    data_path = FileUtils.get_abs_path(__file__, "./data/ex5data1.mat")
    mat = loadmat(data_path)
    X = mat["X"]
    y = mat["y"]
    Xval = mat["Xval"]
    yval = mat["yval"]
    Xtest = mat["Xtest"]
    ytest = mat["ytest"]
    m, n = X.shape

    plt.scatter(X, y)
    plt.xlabel("Change in water level (x)")
    plt.ylabel("Water flowing out of dam (y)")
    plt.show()

    ones = np.ones([m, 1])
    x = np.hstack([ones, X])
    onesVal = np.ones([np.size(Xval, 0), 1])
    Xval_ones = np.hstack([onesVal, Xval])

    theta = np.array([[1], [1]])
    J, grad = linear_reg_cost(theta, x, y, 1)
    print('Cost at theta = [1 ; 1]:', J,
          '\n(this value should be about 303.993192)\n')
    print('Gradient at theta = [1 ; 1]:', grad,
          '\n(this value should be about [-15.303016; 598.250744])\n')

    # Fit a straight line (lambda = 0) and plot it over the data.
    lambda_value = 0
    theta = train_linear_reg(x, y, lambda_value)
    plt.scatter(X, y)
    plt.xlabel("Change in water level (x)")
    plt.ylabel("Water flowing out of dam (y)")
    plt.plot(x[:, 1], x @ theta, '-', color='red')
    plt.show()

    # Learning curve for the linear fit.
    error_train, error_val = learning_curve(x, y, Xval_ones, yval, lambda_value)
    plt.plot(error_val, '-', color='red')
    plt.plot(error_train, '-', color='blue')
    plt.title("Learning Curve")
    plt.legend(['error_val', 'error_train'])
    plt.ylabel("Error")
    plt.xlabel("No of samples")
    plt.show()

    # Map to degree-8 polynomial features and normalize.
    p = 8
    x_poly = poly_features(X, p)
    x_poly, mu, sigma = feature_normalize(x_poly)
    ones = np.ones([m, 1])
    x_poly = np.hstack([ones, x_poly])

    X_poly_test = poly_features(Xtest, p)
    X_poly_test = (X_poly_test - mu) / sigma

    X_poly_val = poly_features(Xval, p)
    X_poly_val = (X_poly_val - mu) / sigma
    m_poly_val = np.size(X_poly_val, 0)
    ones = np.ones([m_poly_val, 1])
    X_poly_val = np.hstack([ones, X_poly_val])

    print('Normalized Training Example 1:\n')
    print(x_poly[0, :])

    # Fit the polynomial model and plot the fitted curve.
    lambda_value = 0
    theta = train_linear_reg(x_poly, y, lambda_value)
    plt.scatter(X, y, color='red')
    plot_fit(min(x[:, 1]), max(x[:, 1]), mu, sigma, theta, p)
    plt.title("Polynomial Features Fitting")
    plt.show()

    # Learning curve for the polynomial fit.
    error_train, error_val = learning_curve(x_poly, y, X_poly_val, yval, lambda_value)
    plt.plot(range(1, m + 1), error_val, '-', color='red')
    plt.plot(range(1, m + 1), error_train, '-', color='blue')
    plt.title("Learning Curve for Polynomial Features")
    plt.legend(['error_val', 'error_train'])
    plt.ylabel("Error")
    plt.xlabel("No of samples")
    plt.show()

    # Validation curve: training/validation error as a function of lambda.
    lambda_vec, error_train, error_val = validation_curve(x_poly, y, X_poly_val, yval)
    plt.plot(lambda_vec, error_val, '-', color='red')
    plt.plot(lambda_vec, error_train, '-', color='blue')
    plt.title("Lambda vs Error for Polynomial Features")
    plt.legend(['error_val', 'error_train'])
    plt.ylabel("Error")
    plt.xlabel("Lambda")
    plt.show()
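# `linear_reg_cost` is defined elsewhere; a minimal sketch consistent with the
# expected values above (regularized squared error, bias not regularized):
def linear_reg_cost_sketch(theta, X, y, lambda_value):
    m = len(y)
    theta = theta.reshape(-1, 1)
    h = X @ theta
    J = ((h - y) ** 2).sum() / (2 * m) + (lambda_value / (2 * m)) * (theta[1:] ** 2).sum()
    grad = (X.T @ (h - y)) / m
    grad[1:] += (lambda_value / m) * theta[1:]  # skip the bias term
    return J, grad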
def run():
    data_path = FileUtils.get_abs_path(__file__, "./data/ex7data1.mat")
    mat3 = loadmat(data_path)
    X3 = mat3["X"]
    plt.scatter(X3[:, 0], X3[:, 1], marker="o", facecolors="none", edgecolors="b")

    # Normalize, run PCA, and draw the principal components over the data.
    X_norm, mu, std = feature_normalize(X3)
    U, S = pca(X_norm)[:2]
    plt.scatter(X3[:, 0], X3[:, 1], marker="o", facecolors="none", edgecolors="b")
    plt.plot([mu[0], (mu + 1.5 * S[0] * U[:, 0].T)[0]],
             [mu[1], (mu + 1.5 * S[0] * U[:, 0].T)[1]],
             color="black", linewidth=3)
    plt.plot([mu[0], (mu + 1.5 * S[1] * U[:, 1].T)[0]],
             [mu[1], (mu + 1.5 * S[1] * U[:, 1].T)[1]],
             color="black", linewidth=3)
    plt.xlim(-1, 7)
    plt.ylim(2, 8)
    plt.show()
    print("Top eigenvector U(:,1) =", U[:, 0])

    # Project onto one dimension, then recover the approximation.
    K = 1
    Z = project_data(X_norm, U, K)
    print("Projection of the first example:", Z[0][0])
    X_rec = recover_data(Z, U, K)
    print("Approximation of the first example:", X_rec[0, :])

    plt.scatter(X_norm[:, 0], X_norm[:, 1], marker="o", label="Original",
                facecolors="none", edgecolors="b", s=15)
    plt.scatter(X_rec[:, 0], X_rec[:, 1], marker="o", label="Approximation",
                facecolors="none", edgecolors="r", s=15)
    plt.title("The Normalized and Projected Data after PCA")
    plt.legend()
    plt.show()

    # Face dataset: show the first 100 faces (32x32 grayscale images).
    data_path = FileUtils.get_abs_path(__file__, "./data/ex7faces.mat")
    mat4 = loadmat(data_path)
    X4 = mat4["X"]
    m, n = X4.shape
    fig, ax = plt.subplots(nrows=10, ncols=10, figsize=(30, 30))
    for i in range(0, 100, 10):
        for j in range(10):
            ax[i // 10, j].imshow(X4[i + j, :].reshape(32, 32, order="F"), cmap="gray")
            ax[i // 10, j].axis("off")
    plt.show()

    # Run PCA on the faces and show the top 36 eigenfaces.
    X_norm2 = feature_normalize(X4)[0]
    U2, S2 = pca(X_norm2)[:2]
    U_reduced = U2[:, :36].T
    fig2, ax2 = plt.subplots(6, 6, figsize=(12, 12))
    for i in range(0, 36, 6):
        for j in range(6):
            ax2[i // 6, j].imshow(U_reduced[i + j, :].reshape(32, 32, order="F"), cmap="gray")
            ax2[i // 6, j].axis("off")
    plt.show()

    # Project with an automatically chosen K, then reconstruct the faces.
    Z2, K2 = project_data_optimal_K(X_norm2, U2, S2)
    print("The projected data Z has a size of:", Z2.shape)
    X_rec2 = recover_data(Z2, U2, K2)
    fig3, ax3 = plt.subplots(10, 10, figsize=(20, 20))
    for i in range(0, 100, 10):
        for j in range(10):
            ax3[i // 10, j].imshow(X_rec2[i + j, :].reshape(32, 32, order="F"), cmap="gray")
            ax3[i // 10, j].axis("off")
    plt.show()
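# `pca` is implemented elsewhere in the repo; presumably it takes the SVD of
# the covariance matrix of the already-normalized data, which is why the
# callers above unpack its first two return values. A minimal sketch:
def pca_sketch(X_norm):
    m = X_norm.shape[0]
    Sigma = (X_norm.T @ X_norm) / m  # covariance matrix of normalized data
    U, S, V = np.linalg.svd(Sigma)   # columns of U are the principal axes
    return U, S, V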