def main():
    """Sweep the latent dimension K and plot training/test error against it.

    Reads the rating splits from ``data/train.txt`` and ``data/test.txt``,
    trains one model per K with no regularization, and writes both error
    curves to ``2d.png``.
    """
    train_set = np.loadtxt('data/train.txt').astype(int)
    test_set = np.loadtxt('data/test.txt').astype(int)

    # Largest id present in either split: column 0 is users, column 1 movies.
    M = max(train_set[:, 0].max(), test_set[:, 0].max()).astype(int)
    N = max(train_set[:, 1].max(), test_set[:, 1].max()).astype(int)
    print("Factorizing with ", M, " users, ", N, " movies.")

    latent_dims = [10, 20, 30, 50, 100]
    reg = 0.0
    eta = 0.03  # learning rate

    train_errors, test_errors = [], []
    for K in latent_dims:
        U, V, err = train_model(M, N, K, eta, reg, train_set)
        train_errors.append(err)
        test_errors.append(get_err(U, V, test_set))

    for series, tag in ((train_errors, '$E_{in}$'), (test_errors, '$E_{out}$')):
        plt.plot(latent_dims, series, label=tag)
    plt.title('Error vs. K')
    plt.xlabel('K')
    plt.ylabel('Error')
    plt.legend()
    plt.savefig('2d.png')
def testreg():
    """Train one model per regularization strength and collect the errors.

    Uses the module-level names ``M``, ``N``, ``K`` and ``data`` together with
    ``prob2utils.train_model``; the learning rate is fixed at 0.1.

    Returns:
        list: the error reported by ``train_model`` for each strength, in the
        order the strengths are tried.
    """
    strengths = [0.0001, 0.001, 0.01, 0.1, 1, 10]
    err_list = []
    for r in strengths:
        # Pass the swept value as the regularization argument; a hard-coded
        # 0.1 here would make every iteration train the same model.
        U, V, err = prob2utils.train_model(M, N, K, 0.1, r, data)
        err_list.append(err)
        # Parenthesized single-argument form runs on Python 2 and Python 3.
        print(err)
    return err_list
def getUtVt():
    """Train a model and project U and V onto V's first two left singular vectors.

    Uses the module-level names ``M``, ``N``, ``K`` and ``data`` together with
    ``prob2utils.train_model`` (learning rate 0.1, regularization 0.1).

    Returns:
        tuple: ``(Ut, Vt)`` where ``Vt = A12.T @ V`` and ``Ut = A12.T @ U.T``,
        ``A12`` being the first two columns of the left factor of ``svd(V)``.
    """
    U, V, err = prob2utils.train_model(M, N, K, 0.1, 0.1, data)
    # Only the left factor is needed. The full U @ V product is not formed
    # because nothing in this function reads it.
    A, _, _ = linalg.svd(V)
    A12 = A[:, :2]
    Vt = np.matmul(A12.T, V)
    Ut = np.matmul(A12.T, U.T)
    return Ut, Vt
def main():
    """Sweep regularization and stopping threshold, then plot both error surfaces.

    Reads ``data/train.txt`` and ``data/test.txt``, trains a k=20 model for
    every (reg, epsilon) pair, and saves one curve per regularization value to
    ``E_in.png`` (training error) and ``E_out.png`` (test error), both with a
    logarithmic epsilon axis.
    """
    Y_train = np.loadtxt('data/train.txt').astype(int)
    Y_test = np.loadtxt('data/test.txt').astype(int)

    M = max(max(Y_train[:, 0]), max(Y_test[:, 0])).astype(int)  # users
    N = max(max(Y_train[:, 1]), max(Y_test[:, 1])).astype(int)  # movies

    k = 20
    # regularization constants
    regs = [10**-4, 10**-3, 10**-2, 10**-1, 1]
    # learning rate
    eta = 0.01
    # stopping thresholds handed to train_model; 0.00005 was noted as best
    epsilons = [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.002]

    E_ins = []
    E_outs = []

    # One row of errors per regularization value, one column per epsilon.
    for reg in regs:
        E_ins_for_lambda = []
        E_outs_for_lambda = []
        for ep in epsilons:
            print(
                "Training model with M = %s, N = %s, k = %s, eta = %s, reg = %s, ep = %s"
                % (M, N, k, eta, reg, ep))
            U, V, e_in = train_model(M, N, k, eta, reg, Y_train, ep)
            E_ins_for_lambda.append(e_in)
            E_outs_for_lambda.append(get_err(U, V, Y_test))
        E_ins.append(E_ins_for_lambda)
        E_outs.append(E_outs_for_lambda)

    # Raw strings keep the TeX backslash literal without relying on Python
    # passing the unknown escape "\l" through unchanged.
    for reg, curve in zip(regs, E_ins):
        plt.plot(epsilons, curve, label=r'$E_{in}, \lambda=$' + str(reg))
    plt.title('$E_{in}$ vs. Epsilon')
    plt.xlabel('Epsilon')
    plt.ylabel('Error')
    plt.xscale('log')
    plt.legend()
    plt.savefig('E_in.png')
    plt.clf()

    for reg, curve in zip(regs, E_outs):
        plt.plot(epsilons, curve, label=r'$E_{out}, \lambda=$' + str(reg))
    plt.title('$E_{out}$ vs. Epsilon')
    plt.xlabel('Epsilon')
    plt.ylabel('Error')
    plt.xscale('log')
    plt.legend()
    plt.savefig('E_out.png')
def get_internal_matrices(Y, K):
    '''Returns U and V from factorizing Y.

    Y is iterated as rows whose first two entries are the user index and the
    movie index; K is passed straight through to ``svd.train_model``. The
    training error and V are printed before returning.
    '''
    M = max(d[0] for d in Y) + 1  # number of users
    N = max(d[1] for d in Y) + 1  # number of movies
    reg = .1  # used data set from last week
    eta = 0.01  # initially .03
    U, V, err = svd.train_model(M, N, K, eta, reg, Y)
    # Parenthesized single-argument prints run on Python 2 and Python 3.
    print(err)
    print(V)
    return U, V
def main():
    """Train the 'advanced' model and return the projected U and V.

    Reads the rating splits and the movie metadata from ``./data``, trains
    with ``mode='advanced'`` for each (reg, k) pair, records training and test
    error, and passes the last trained U and V through ``projectUV``.

    Returns:
        tuple: ``(newU, newV)`` as returned by ``projectUV``.
    """
    Y_train = np.loadtxt('./data/traintest.txt').astype(int)
    Y_test = np.loadtxt('./data/test.txt').astype(int)
    data = np.loadtxt('./data/data.txt').astype(int)  # loaded but not read below

    movie_names = {}
    genres = {}
    # The context manager closes the handle even if a line fails to parse.
    with codecs.open('./data/movies.txt', mode='r', encoding='windows-1252') as movie_file:
        for line in movie_file:
            movie_info = line.split()
            if not movie_info:
                continue  # tolerate blank lines, e.g. a trailing newline
            movie_id = int(movie_info[0])
            # Last 19 fields are integer flags; everything between the id and
            # those flags is the (whitespace-split) title.
            movie_names[movie_id] = " ".join(movie_info[1:-19])
            genres[movie_id] = list(map(int, movie_info[-19:]))

    M = max(max(Y_train[:, 0]), max(Y_test[:, 0])).astype(int)  # users
    N = max(max(Y_train[:, 1]), max(Y_test[:, 1])).astype(int)  # movies
    Ks = [20]

    # Ein and Eout for different regs have been recorded
    # 2*10**-1 gives us an eout of 0.5 but it has meaningful graphs?
    regs = [2*10**-1]
    eta = 0.03  # learning rate
    E_ins = []
    E_outs = []

    # Use to compute Ein and Eout
    for reg in regs:
        E_ins_for_lambda = []
        E_outs_for_lambda = []

        for k in Ks:
            print("Training model with M = %s, N = %s, k = %s, eta = %s, reg = %s"%(M, N, k, eta, reg))
            U, V, e_in, aVec, bVec, mu = train_model(M, N, k, eta, reg, Y_train, mode='advanced')
            E_ins_for_lambda.append(e_in)
            eout = get_err_advanced(U, V, Y_test, mu, aVec, bVec, reg)
            E_outs_for_lambda.append(eout)

        E_ins.append(E_ins_for_lambda)
        E_outs.append(E_outs_for_lambda)

    # basic gives Ein = 0.3002 and Eout = 0.4495
    # advanced gives Ein = 0.4254 and Eout = 0.5097
    newU, newV = projectUV(U, V)
    return newU, newV
def main():
    """Sweep regularization and latent dimension, then plot both error surfaces.

    Reads ``train.txt`` and ``test.txt``, trains a model for every (reg, K)
    pair, and saves one curve per regularization value to ``2e_ein.png``
    (training error) and ``2e_eout.png`` (test error).
    """
    Y_train = np.loadtxt('train.txt').astype(int)
    Y_test = np.loadtxt('test.txt').astype(int)

    M = max(max(Y_train[:, 0]), max(Y_test[:, 0])).astype(int)  # users
    N = max(max(Y_train[:, 1]), max(Y_test[:, 1])).astype(int)  # movies
    Ks = [10, 20, 30, 50, 100]

    regs = [10**-4, 10**-3, 10**-2, 10**-1, 1]
    eta = 0.03  # learning rate
    E_ins = []
    E_outs = []

    # One row of errors per regularization value, one column per K.
    for reg in regs:
        E_ins_for_lambda = []
        E_outs_for_lambda = []

        for k in Ks:
            print(
                "Training model with M = %s, N = %s, k = %s, eta = %s, reg = %s"
                % (M, N, k, eta, reg))
            U, V, e_in = train_model(M, N, k, eta, reg, Y_train)
            E_ins_for_lambda.append(e_in)
            E_outs_for_lambda.append(get_err(U, V, Y_test))

        E_ins.append(E_ins_for_lambda)
        E_outs.append(E_outs_for_lambda)

    # Plot values of E_in across k for each value of lambda. Raw strings keep
    # the TeX backslash literal without relying on Python passing the unknown
    # escape "\l" through unchanged.
    for reg, curve in zip(regs, E_ins):
        plt.plot(Ks, curve, label=r'$E_{in}, \lambda=$' + str(reg))
    plt.title('$E_{in}$ vs. K')
    plt.xlabel('K')
    plt.ylabel('Error')
    plt.legend()
    plt.savefig('2e_ein.png')
    plt.clf()

    # Plot values of E_out across k for each value of lambda
    for reg, curve in zip(regs, E_outs):
        plt.plot(Ks, curve, label=r'$E_{out}, \lambda=$' + str(reg))
    plt.title('$E_{out}$ vs. K')
    plt.xlabel('K')
    plt.ylabel('Error')
    plt.legend()
    plt.savefig('2e_eout.png')
def create_u_v_matrices():
    """Factorize the rating data and write U and V out as comma-separated text.

    The ratings come from ``ut.get_rating_data([])``; the trained matrices are
    saved to ``U_matrix.txt`` and ``V_matrix.txt``.
    """
    ratings, _, num_ratings, _ = ut.get_rating_data([])
    Y = np.array(ratings)

    # NOTE(review): N is taken from max(num_ratings), not from a column of Y;
    # confirm that this is the intended movie count.
    M = int(max(row[0] for row in Y))
    N = int(max(num_ratings))

    hyper = dict(K=20, eta=0.001, reg=0.01, eps=0.01, max_epochs=100)
    U, V, error = HW5utils.train_model(
        M, N, hyper['K'], hyper['eta'], hyper['reg'], Y,
        hyper['eps'], hyper['max_epochs'])

    for path, matrix in (('U_matrix.txt', U), ('V_matrix.txt', V)):
        np.savetxt(path, matrix, delimiter=',')
def RunModel1(M, N, k, eta, reg, Y_train, Y_test, GraphFlag=True):
    """Train and score model 1, optionally plotting its 2-D projection.

    Returns the pair ``(e_in, e_out)``. Plotting is skipped only when
    ``GraphFlag`` is the literal ``False``; otherwise one figure is drawn per
    ``(ids, category)`` entry of the module-level ``to_plot``.
    """
    print("Training model 1 with M = %s, N = %s, k = %s, eta = %s, reg = %s" % (M, N, k, eta, reg))
    U_1, V_1, e_in_1 = model_1.train_model(M, N, k, eta, reg, Y_train)
    e_out_1 = model_1.get_err(U_1, V_1, Y_test)
    print("model 1 results: e_in = %.3f, e_out = %.3f" % (e_in_1, e_out_1))

    scores = (e_in_1, e_out_1)
    if GraphFlag is not False:
        # Transform model 1 to 2D; only the movie projection is plotted.
        _, V_proj_1 = project_to_2D(U_1, V_1)
        for ids, category in to_plot:
            visualize(V_proj_1, ids, 'Model 1: ' + category)
    return scores
def train(Y, reg, eta, Y_test=None, zero_mean=True, save=True):
    ''' Learn U, V, a, b (optionally centring V) and optionally save them.

    Training uses the module-level M, N and K with eps=0.003. When
    ``zero_mean`` is set, V has its axis-0 mean subtracted. When ``save`` is
    set, U, V, a, b and the first two columns of the left SVD factor of V are
    written under ``models/`` with names built from err, reg and eta.
    '''
    U, V, a, b, err = train_model(M, N, K, eta, reg, Y, Y_test=Y_test, eps=0.003)

    if zero_mean:
        V = V - V.mean(axis=0)
    A, S, B = np.linalg.svd(V, full_matrices=False)

    if not save:
        return U, V, a, b, err

    artifacts = (
        ('models/{:6.5f}-U-{:.5f}-{:.4f}', U),
        ('models/{:6.5f}-V-{:.5f}-{:.4f}', V),
        ('models/{:6.5f}-a-bias-{:.5f}-{:.4f}', a),
        ('models/{:6.5f}-b-bias-{:.5f}-{:.4f}', b),
        ('models/{:6.5f}-A-{:.5f}-{:.4f}', A[:, :2]),
    )
    for pattern, array in artifacts:
        np.save(pattern.format(err, reg, eta), array)
    return U, V, a, b, err
def main():
    """Train the bias-augmented model at K=20 and print its test error.

    Reads ``data/train.txt`` and ``data/test.txt``; for each K in the sweep
    list (currently just 20) it trains with the fixed hyper-parameters below
    and prints ``get_err`` on the test split.
    """
    train_set = np.loadtxt('data/train.txt').astype(int)
    test_set = np.loadtxt('data/test.txt').astype(int)

    # Largest id present in either split: column 0 is users, column 1 movies.
    M = max(train_set[:, 0].max(), test_set[:, 0].max()).astype(int)
    N = max(train_set[:, 1].max(), test_set[:, 1].max()).astype(int)
    print("Factorizing with ", M, " users, ", N, " movies.")

    latent_dims = [20]
    reg = 0.1
    eta = 0.01  # learning rate
    epsilon = 0.00005

    for K in latent_dims:
        U, V, a, b, err = train_model(M, N, K, eta, reg, train_set, epsilon)
        # Only the test error is reported; the training error is not recorded.
        print(get_err(U, V, a, b, test_set))
def train(Y, reg, eta, Y_test=None, save=True):
    ''' Learn U, V, a, b and optionally save them under ``models/``.

    Training uses the module-level M, N and K with eps=0.005 and at most 20
    epochs. Saved file names are built from err, reg and eta.
    '''
    U, V, a, b, err = train_model(M, N, K, eta, reg, Y, Y_test=Y_test, eps=0.005, max_epochs=20)

    if not save:
        return U, V, a, b, err

    artifacts = (
        ('models/{:6.5f}-U-{:.5f}-{:.4f}', U),
        ('models/{:6.5f}-V-{:.5f}-{:.4f}', V),
        ('models/{:6.5f}-a-bias-{:.5f}-{:.4f}', a),
        ('models/{:6.5f}-b-bias-{:.5f}-{:.4f}', b),
    )
    for pattern, array in artifacts:
        np.save(pattern.format(err, reg, eta), array)
    return U, V, a, b, err
def main():
    """Factorize the ratings, project the movies to 2-D and save four plots.

    Reads movie metadata and rating files from ``../data``, trains a k=20
    model without regularization, projects V onto its first two left singular
    vectors, and writes:

    * ``Standard-choice.png``  - ten hand-picked movies (rows 2..11),
    * ``Standard-popular.png`` - the ten most-rated movies,
    * ``Standard-best.png``    - the ten movies with the highest mean rating,
    * ``Standard-genres.png``  - ten movies from each of three genre columns.
    """
    movie_info = np.loadtxt('../data/movies.txt', dtype="str", delimiter="\t",
                            usecols=(0, 1, 3, 7, 16))
    data = np.loadtxt('../data/data.txt').astype(int)
    Y_train = np.loadtxt('../data/train.txt').astype(int)
    Y_test = np.loadtxt('../data/test.txt').astype(int)
    print(movie_info)

    M = max(max(Y_train[:, 0]), max(Y_test[:, 0])).astype(int)  # users
    N = max(max(Y_train[:, 1]), max(Y_test[:, 1])).astype(int)  # movies
    print("Factorizing with ", M, " users, ", N, " movies.")

    reg = 0.0
    eta = 0.03  # learning rate
    k = 20

    U, V, err = train_model(M, N, k, eta, reg, Y_train)
    e_out = get_err(U, V, Y_test)
    print("e_in", err)
    print("e_out", e_out)

    a, sigma, b = np.linalg.svd(V)
    print(V.shape, a.shape)
    # The singular vectors are the COLUMNS of `a`, so the 2-D projection is
    # a[:, :2].T @ V; slicing rows (a[:2]) would mix all directions.
    # Assumes V is (k, N) with column j holding movie id j + 1 - TODO confirm
    # against train_model.
    v_proj = np.transpose(np.dot(np.transpose(a[:, :2]), V))
    x = v_proj[:, 0]
    y = v_proj[:, 1]
    print(v_proj.shape)

    # Group every rating in the full data set by movie id.
    ratings = {}
    for _user, movie_id, rating in data:
        ratings.setdefault(movie_id, []).append(rating)

    # Setup
    ids = movie_info[:, 0].astype(int)
    movie_names = movie_info[:, 1]
    # Look titles up by movie id rather than by array position: ids start at
    # 1 while array positions start at 0.
    name_by_id = dict(zip(ids, movie_names))

    # 1. 10 movies of our choice from the MovieLens dataset
    plt.scatter(x[2:12], y[2:12])
    texts = [plt.text(px, py, txt)
             for px, py, txt in zip(x[2:12], y[2:12], movie_names[2:12])]
    adjust_text(texts)
    plt.xlabel('Feature 0')
    plt.ylabel('Feature 1')
    plt.title('10 Movies of Our Choice')
    plt.savefig('Standard-choice.png')
    plt.clf()

    # 2. All ratings of the ten most popular movies
    max_10 = dict(
        sorted(ratings.items(), key=lambda r: len(r[1]), reverse=True)[:10])
    print(max_10.keys())
    x_pop = []
    y_pop = []
    movie_title = []
    # Row i of v_proj is taken to be movie id i + 1, hence start=1.
    for movie_id, (px, py) in enumerate(v_proj, start=1):
        if movie_id in max_10:
            x_pop.append(px)
            y_pop.append(py)
            movie_title.append(name_by_id.get(movie_id, str(movie_id)))
    print(movie_title)
    plt.scatter(x_pop, y_pop)
    texts = [plt.text(px, py, txt)
             for px, py, txt in zip(x_pop, y_pop, movie_title)]
    adjust_text(texts)
    plt.xlabel('Feature 0')
    plt.ylabel('Feature 1')
    plt.title('10 Most Popular Movies')
    plt.savefig('Standard-popular.png')
    plt.clf()

    # 3. All ratings of the ten best movies
    best_10 = dict(
        sorted(ratings.items(), key=lambda r: sum(r[1]) / len(r[1]),
               reverse=True)[:10])
    print(best_10.keys())
    x_best = []
    y_best = []
    best_title = []
    for movie_id, (px, py) in enumerate(v_proj, start=1):
        if movie_id in best_10:
            x_best.append(px)
            y_best.append(py)
            best_title.append(name_by_id.get(movie_id, str(movie_id)))
    # Label each point with its own title, not the titles gathered for the
    # popular-movies figure.
    for px, py, txt in zip(x_best, y_best, best_title):
        plt.annotate(txt, (px, py))
    plt.scatter(x_best, y_best)
    plt.xlabel('Feature 0')
    plt.ylabel('Feature 1')
    plt.title('10 Best Movies')
    plt.savefig('Standard-best.png')
    plt.clf()

    # 4. 10 ratings of movies from three genres of your choice
    # Columns 2, 3 and 4 of movie_info are the three 0/1 genre flags selected
    # by usecols above. color=None lets matplotlib pick its default colour.
    for label, column, color in (("Action", 2, None),
                                 ("Comedy", 3, 'orange'),
                                 ("Romance", 4, 'green')):
        flags = movie_info[:, column].astype(int)
        # Movies without any rating in data are skipped rather than raising.
        members = {mid for mid, flag in zip(ids, flags)
                   if flag == 1 and mid in ratings}
        points = [row for movie_id, row in enumerate(v_proj, start=1)
                  if movie_id in members]
        chosen = points[2:12]
        plt.scatter([p[0] for p in chosen], [p[1] for p in chosen],
                    color=color, label=label)
    plt.legend()
    plt.title("Three Genres")
    plt.savefig('Standard-genres.png')
    plt.clf()
transposed_data = np.transpose(data) user_ids_vector = list(transposed_data[0]) movie_ids_vector = list(transposed_data[1]) rating_vector = list(transposed_data[2]) max_user_id = max(user_ids_vector) max_movie_id = max(movie_ids_vector) eta = 3e-2 reg = 1e-3 K = 20 U, V, err = prob2utils.train_model(max_user_id, max_movie_id, K, eta, reg, data) a, s, b = np.linalg.svd(V) # print a # print s # print b a_tilde = a[:, :2] V_tilde = np.dot(np.transpose(a_tilde), V) #U_tilde = np.dot(np.transpose(a_tilde), U) def make_plot(indices, title, file):
genre6_movies, genre_6_ratings = Rating.rating_genres(6) genre1_movies, genre_1_ratings = Rating.rating_genres(1) genre7_movies, genre_2_ratings = Rating.rating_genres(7) do_histogram(genre_6_ratings, "Ratings for Movies of Genre 6", "Rating") #do_histogram(genre_1_ratings, "Ratings for Movies of Genre 1", "Rating") #do_histogram(genre_2_ratings, "Ratings for Movies of Genre 7", "Rating") # Do matrix factorization M = max(Rating.user_id) N = max(Rating.movie_id) K = 20 learning_rate = 0.02 lam = 0.0 Y = np.column_stack((Rating.user_id, Rating.movie_id, Rating.ratings)) U, V, err = p2util.train_model(M, N, K, learning_rate, lam, Y) V_2D = SVD(V) # V should have 2 x N shape, i.e. 2 features for N movies. random_movies = np.random.choice(Rating.movie_dict.keys(), 10) print random_movies, len(random_movies) rand_movie_labels = [Rating.movie_dict[x] for x in random_movies] printable = set(string.printable) rand_movie_labels = convert_to_ASCII(rand_movie_labels) print rand_movie_labels, len(rand_movie_labels) do_2D_plot(V_2D[0, random_movies], V_2D[1, random_movies], "10 Random Movies in 2D Factorization",
import numpy as np
import prob2utils

# Factorize the full rating file and dump U, V and the training error as CSV.
data = np.genfromtxt('../project3data/data.txt')

# Largest value in column 0 / column 1 of the rating table.
M = data[:, 0].max().astype(int)
N = data[:, 1].max().astype(int)

K = 20
eta = 2e-3
reg = 0.01

results = prob2utils.train_model(
    M, N, K, eta, reg, data, eps=0.0001, max_epochs=500)

# results[0], results[1] and results[2] are written as U, V and E_in; the
# scalar is wrapped in a list so savetxt receives a 1-D sequence.
for path, payload in (('U_matrix.csv', results[0]),
                      ('V_matrix.csv', results[1]),
                      ('E_in.csv', [results[2]])):
    np.savetxt(path, payload, delimiter=',')