def train(reg):
    """Train a matrix-factorization model on Netflix data with K-fold CV.

    If ``reg`` is truthy, cross-validates at that single regularization value
    and reports the mean validation loss. Otherwise sweeps a small grid of
    lambda values and reports the one with the lowest mean validation loss.

    NOTE(review): ``reg`` / the grid value is never actually passed to the
    ``MF`` constructor below, so the penalty currently has no effect on
    training — confirm MF's parameter name and wire it through.

    Args:
        reg: regularization penalty to cross-validate, or falsy to grid-search.
    """
    logdir = 'logs/mf/numpy'
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    print("Loading data...")
    movie_titles, ratings, rating_indices, n_users, n_items = get_netflix_data(n_samples=1000000)
    print("number of users with ratings: {}".format(len(np.unique(rating_indices[:,0]))))
    print("number of movies with ratings: {}".format(len(np.unique(rating_indices[:,1]))))

    method = 'als'
    if reg:
        print("Performing cross validation with reg: {}.".format(reg))
    else:
        print("Finding optimal regularization penalty.")

    reg_vals = [0.01, 0.1, 1, 10]
    best_reg = 0
    mean_loss = 0.0
    n_splits = 5
    n_features = 15
    # loss_path[i, k] = validation loss for reg_vals[i] on fold k.
    loss_path = np.zeros((len(reg_vals), n_splits))

    kf = KFold(n_splits=n_splits, shuffle=True)
    kf.get_n_splits(rating_indices)
    for k, (train_index, test_index) in enumerate(kf.split(rating_indices)):
        print("Fold {}".format(k))
        train_indices, test_indices = rating_indices[train_index], rating_indices[test_index]
        # Unpack (user, item, rating) columns into the tuple form MF expects.
        train_indices = (train_indices[:,0], train_indices[:,1], train_indices[:,2])
        test_indices = (test_indices[:,0], test_indices[:,1], test_indices[:,2])
        if reg:
            model = MF(n_users, n_items, n_features, method=method)
            model.fit(train_indices, verbose=1)
            acc, loss = model.predict(test_indices)
            print("val_loss: {:.4f} - val_acc: {:.4f}".format(loss, acc))
            # Incremental running mean of the per-fold validation loss.
            mean_loss = (mean_loss*k + loss) / (k+1)
        else:
            # BUG FIX: the original wrote `for i, reg in enumerate(reg_vals)`,
            # rebinding the outer `reg`; after fold 0 it held 10 (truthy), so
            # every later fold took the `if reg:` branch and the grid search
            # silently ran on a single fold. A distinct loop variable fixes it.
            for i, reg_val in enumerate(reg_vals):
                print("lambda: {}".format(reg_val))
                model = MF(n_users, n_items, n_features, method=method)
                model.fit(train_indices, verbose=1)
                acc, loss = model.predict(test_indices)
                print("val_loss: {:.4f} - val_acc: {:.4f}".format(loss, acc))
                loss_path[i, k] = loss

    if reg:
        print("mean loss: {:.4f}".format(mean_loss))
    else:
        loss_means = np.mean(loss_path, axis=1)
        print(loss_means)
        best_reg = reg_vals[np.argmin(loss_means)]
        best_loss = np.amin(loss_means)
        print("best lambda: {} - loss: {}".format(best_reg, best_loss))
    print("Successfully finished training MF. See logs directory.")
# --- Script section: train a user-based MF model and pickle it to disk ---

# Keep the first 4 user columns; encode gender 'M'/'F' as 1/0 in place.
# NOTE(review): assumes column 1 of user_train is the gender field — confirm
# against the loader that produces user_train.
all_users = user_train[:, :4]
all_users[all_users[:, 1] == 'M', 1] = 1
all_users[all_users[:, 1] == 'F', 1] = 0
print(all_users)

rs = MF(rate_train, K=100, lam=.1, print_every=10,
        learning_rate=0.75, max_iter=100, user_based=1)
rs.fit()

# Persist the fitted model. Using a context manager guarantees the file
# handle is closed even if pickling raises (the original open/close pair
# leaked the handle on error).
with open('MF_1m.obj', 'wb') as file_mf:
    pickle.dump(rs, file_mf)

print(type(rs))
# Reconstructed utility (rating) matrix: low-rank product plus mean offset.
print("utility:\n", rs.X.dot(rs.W) + rs.mu)

# Evaluate on held-out test ratings.
RMSE = rs.evaluate_RMSE(rate_test)
print ('\nUser-based MF, RMSE =', RMSE)