Esempio n. 1
0
def train(reg):
    """Train a matrix-factorization model on Netflix ratings with k-fold CV.

    Args:
        reg: regularization penalty (lambda). When truthy, run 5-fold cross
            validation at this fixed penalty and report the mean validation
            loss. When falsy (0/None), sweep a grid of candidate penalties
            and report the one with the lowest mean validation loss.

    Side effects: creates ``logs/mf/numpy`` if missing and prints progress.
    """
    logdir = 'logs/mf/numpy'
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    print("Loading data...")
    movie_titles, ratings, rating_indices, n_users, n_items = get_netflix_data(n_samples=1000000)
    print("number of users with ratings: {}".format(len(np.unique(rating_indices[:,0]))))
    print("number of movies with ratings: {}".format(len(np.unique(rating_indices[:,1]))))

    method = 'als'
    if reg:
        print("Performing cross validation with reg: {}.".format(reg))
    else:
        print("Finding optimal regularization penalty.")
    reg_vals = [0.01, 0.1, 1, 10]
    best_reg = 0
    mean_loss = 0.0
    n_splits = 5
    n_features = 15
    # loss_path[i, k] = validation loss for penalty reg_vals[i] on fold k.
    loss_path = np.zeros((len(reg_vals), n_splits))
    kf = KFold(n_splits=n_splits, shuffle=True)
    kf.get_n_splits(rating_indices)
    for k, (train_index, test_index) in enumerate(kf.split(rating_indices)):
        print("Fold {}".format(k))
        train_indices, test_indices = rating_indices[train_index], rating_indices[test_index]
        # Unpack (user, item, rating) columns into the tuple form MF expects.
        train_indices = (train_indices[:,0], train_indices[:,1], train_indices[:,2])
        test_indices = (test_indices[:,0], test_indices[:,1], test_indices[:,2])
        if reg:
            model = MF(n_users, n_items, n_features, method=method)
            model.fit(train_indices, verbose=1)
            acc, loss = model.predict(test_indices)
            print("val_loss: {:.4f} - val_acc: {:.4f}".format(loss, acc))
            # Incremental running mean of the per-fold validation loss.
            mean_loss = (mean_loss*k + loss) / (k+1)
        else:
            # BUG FIX: the original loop wrote `for i, reg in enumerate(...)`,
            # clobbering the parameter `reg`; after fold 0 it was truthy and
            # every later fold wrongly took the fixed-penalty branch above.
            # NOTE(review): the penalty value is never passed to MF here —
            # confirm whether MF picks up the regularization some other way.
            for i, reg_val in enumerate(reg_vals):
                print("lambda: {}".format(reg_val))
                model = MF(n_users, n_items, n_features, method=method)
                model.fit(train_indices, verbose=1)
                acc, loss = model.predict(test_indices)
                print("val_loss: {:.4f} - val_acc: {:.4f}".format(loss, acc))
                loss_path[i, k] = loss
    if reg:
        print("mean loss: {:.4f}".format(mean_loss))
    else:
        # Average each penalty's loss across folds and pick the minimizer.
        loss_means = np.mean(loss_path, axis=1)
        print(loss_means)
        best_reg = reg_vals[np.argmin(loss_means)]
        best_loss = np.amin(loss_means)
        print("best lambda: {} - loss: {}".format(best_reg, best_loss))
    print("Successfully finished training MF. See logs directory.")
Esempio n. 2
0
# Keep the first four user columns and binarize the gender field in place
# ('M' -> 1, 'F' -> 0). Note `all_users` is a view into `user_train`, so
# these writes also modify the original array.
all_users = user_train[:, :4]
all_users[all_users[:,1]=='M', 1] = 1
all_users[all_users[:,1]=='F', 1] = 0
print(all_users)


# Fit a user-based matrix-factorization model on the training ratings.
rs = MF(rate_train, K=100, lam=.1, print_every=10, learning_rate=0.75, max_iter=100, user_based=1)
rs.fit()

# Persist the trained model. Using a context manager guarantees the file
# handle is closed even if pickle.dump raises (the original leaked the
# handle on error).
with open('MF_1m.obj', 'wb') as file_mf:
    pickle.dump(rs, file_mf)
print(type(rs))
# Reconstructed rating matrix: low-rank product plus the per-user/item mean.
print("utility:\n", rs.X.dot(rs.W) + rs.mu)
# evaluate on test data
RMSE = rs.evaluate_RMSE(rate_test)
print('\nUser-based MF, RMSE =', RMSE)

# # Read the Movies File
# movies = pd.read_csv(os.path.join(MOVIELENS_DIR, MOVIE_DATA_FILE), 
#                     sep='::', 
#                     engine='python',