def main():
    """Sweep the latent dimension K and plot training/test error against it.

    Reads the rating tables from data/train.txt and data/test.txt, trains one
    model per K with ``train_model`` (eta=0.03, reg=0.0), evaluates it on the
    test table with ``get_err``, and saves both error curves to 2d.png.
    """
    train_set = np.loadtxt('data/train.txt').astype(int)
    test_set = np.loadtxt('data/test.txt').astype(int)

    # Largest id seen in column 0 (users) and column 1 (movies) of either table.
    n_users = max(train_set[:, 0].max(), test_set[:, 0].max()).astype(int)
    n_movies = max(train_set[:, 1].max(), test_set[:, 1].max()).astype(int)
    print("Factorizing with ", n_users, " users, ", n_movies, " movies.")

    factor_counts = [10, 20, 30, 50, 100]
    regularization = 0.0
    learning_rate = 0.03

    train_errors = []
    test_errors = []
    for n_factors in factor_counts:
        user_f, movie_f, train_err = train_model(
            n_users, n_movies, n_factors, learning_rate, regularization,
            train_set)
        train_errors.append(train_err)
        test_errors.append(get_err(user_f, movie_f, test_set))

    for curve, legend_label in ((train_errors, '$E_{in}$'),
                                (test_errors, '$E_{out}$')):
        plt.plot(factor_counts, curve, label=legend_label)
    plt.title('Error vs. K')
    plt.xlabel('K')
    plt.ylabel('Error')
    plt.legend()
    plt.savefig('2d.png')
Ejemplo n.º 2
0
def testreg():
    """Train one model per regularization strength and collect the errors.

    Uses the module-level ``M``, ``N``, ``K`` and ``data`` with a fixed
    learning rate of 0.1. Each training error is printed as it is produced.

    Returns:
        list: the error reported by ``prob2utils.train_model`` for each
        regularization value, in sweep order.
    """
    reg = [0.0001, 0.001, 0.01, 0.1, 1, 10]
    err_list = []
    for r in reg:
        # The swept value must be passed as the regularization argument
        # (fifth positional, after eta); previously a constant 0.1 was passed
        # so every iteration trained the same model.
        _, _, err = prob2utils.train_model(M, N, K, 0.1, r, data)
        err_list.append(err)
        print(err)
    return err_list
Ejemplo n.º 3
0
def getUtVt():
    """Train a factorization and project U and V onto two directions.

    Trains with the module-level ``M``, ``N``, ``K`` and ``data`` (eta=0.1,
    reg=0.1), takes the SVD of V, and multiplies both factor matrices by the
    transpose of the first two columns of the left singular vectors.

    Returns:
        tuple: ``(Ut, Vt)`` where ``Ut = A12.T @ U.T`` and ``Vt = A12.T @ V``.
    """
    U, V, _ = prob2utils.train_model(M, N, K, 0.1, 0.1, data)

    # Only the left singular vectors are needed; the full U @ V rating
    # reconstruction that used to be computed here was never read.
    A, _, _ = linalg.svd(V)
    A12 = A[:, :2]
    Vt = np.matmul(A12.T, V)
    Ut = np.matmul(A12.T, U.T)
    return Ut, Vt
Ejemplo n.º 4
0
def main():
    """Grid-search regularization and stopping epsilon, then plot both errors.

    For every (reg, epsilon) pair a k=20 model is trained on data/train.txt
    and evaluated on data/test.txt. The training errors are saved to E_in.png
    and the test errors to E_out.png, one curve per regularization value, on a
    logarithmic epsilon axis.
    """
    Y_train = np.loadtxt('data/train.txt').astype(int)
    Y_test = np.loadtxt('data/test.txt').astype(int)

    M = max(max(Y_train[:, 0]), max(Y_test[:, 0])).astype(int)  # users
    N = max(max(Y_train[:, 1]), max(Y_test[:, 1])).astype(int)  # movies

    k = 20
    # regularization constants
    regs = [10**-4, 10**-3, 10**-2, 10**-1, 1]
    # learning rate
    eta = 0.01
    # stopping thresholds (author's note: 0.00005 performed best)
    epsilons = [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.002]
    E_ins = []
    E_outs = []

    # One row of errors per regularization value, one column per epsilon.
    for reg in regs:
        E_ins_for_lambda = []
        E_outs_for_lambda = []

        for ep in epsilons:
            print(
                "Training model with M = %s, N = %s, k = %s, eta = %s, reg = %s, ep = %s"
                % (M, N, k, eta, reg, ep))
            U, V, e_in = train_model(M, N, k, eta, reg, Y_train, ep)
            E_ins_for_lambda.append(e_in)
            E_outs_for_lambda.append(get_err(U, V, Y_test))

        E_ins.append(E_ins_for_lambda)
        E_outs.append(E_outs_for_lambda)

    def plot_curves(curves, label_prefix, title, filename):
        """Draw one error-vs-epsilon curve per regularization value and save."""
        for reg_value, errors in zip(regs, curves):
            plt.plot(epsilons, errors, label=label_prefix + str(reg_value))
        plt.title(title)
        plt.xlabel('Epsilon')
        plt.ylabel('Error')
        plt.xscale('log')
        plt.legend()
        plt.savefig(filename)

    # Raw strings: "\l" is not a valid escape sequence, so the non-raw form
    # triggers an invalid-escape warning on current Python versions.
    plot_curves(E_ins, r'$E_{in}, \lambda=$', '$E_{in}$ vs. Epsilon',
                'E_in.png')
    plt.clf()
    plot_curves(E_outs, r'$E_{out}, \lambda=$', '$E_{out}$ vs. Epsilon',
                'E_out.png')
Ejemplo n.º 5
0
def get_internal_matrices(Y, K):
    """Return U and V from factorizing Y.

    Args:
        Y: iterable of rating rows; element 0 of each row is read as the user
            index and element 1 as the movie index.
        K: latent dimension passed to ``svd.train_model``.

    Returns:
        tuple: ``(U, V)`` as produced by ``svd.train_model``. The training
        error and V are printed as a side effect.
    """
    # The +1 sizes the factors so that the largest index is still in range
    # (i.e. indices are treated as 0-based here).
    M = max(d[0] for d in Y) + 1  # number of users
    N = max(d[1] for d in Y) + 1  # number of movies

    reg = .1  # used data set from last week
    eta = 0.01  # initially .03
    U, V, err = svd.train_model(M, N, K, eta, reg, Y)
    # print() with a single argument behaves the same on Python 2 and 3,
    # unlike the former print statement, which is a SyntaxError on Python 3.
    print(err)
    print(V)
    return U, V
Ejemplo n.º 6
0
def main():
    """Train the 'advanced' factorization and return its projected factors.

    Loads the rating tables and the movie metadata, trains one model per
    (reg, k) combination with ``train_model(..., mode='advanced')``, records
    the training and test errors, and finally projects the factors of the
    last trained model with ``projectUV``.

    Returns:
        tuple: ``(newU, newV)`` as returned by ``projectUV(U, V)``.
    """
    Y_train = np.loadtxt('./data/traintest.txt').astype(int)
    Y_test = np.loadtxt('./data/test.txt').astype(int)
    # Loaded for parity with the original script; not read again below.
    data = np.loadtxt('./data/data.txt').astype(int)

    movie_names = {}
    genres = {}
    # Context manager guarantees the file handle is released even if a line
    # fails to parse; previously the handle was never closed.
    with codecs.open('./data/movies.txt', mode='r',
                     encoding='windows-1252') as movie_file:
        for line in movie_file:
            movie_info = line.split()
            movie_id = int(movie_info[0])
            # The last 19 whitespace-separated fields are integer flags;
            # everything between the id and those flags is the title.
            movie_names[movie_id] = " ".join(movie_info[1:-19])
            genres[movie_id] = [int(flag) for flag in movie_info[-19:]]

    M = max(max(Y_train[:,0]), max(Y_test[:,0])).astype(int) # users
    N = max(max(Y_train[:,1]), max(Y_test[:,1])).astype(int) # movies
    Ks = [20]

    # Ein and Eout for different regs have been recorded
    # 2*10**-1 gives us an eout of 0.5 but it has meaningful graphs?
    regs = [2*10**-1]
    eta = 0.03 # learning rate
    E_ins = []
    E_outs = []

    # Use to compute Ein and Eout
    for reg in regs:
        E_ins_for_lambda = []
        E_outs_for_lambda = []

        for k in Ks:
            print("Training model with M = %s, N = %s, k = %s, eta = %s, reg = %s"%(M, N, k, eta, reg))

            U,V, e_in, aVec, bVec, mu = train_model(M, N, k, eta, reg, Y_train, mode='advanced')
            E_ins_for_lambda.append(e_in)
            eout = get_err_advanced(U, V, Y_test, mu, aVec, bVec, reg)

            E_outs_for_lambda.append(eout)

        E_ins.append(E_ins_for_lambda)
        E_outs.append(E_outs_for_lambda)

    # Recorded results from earlier runs:
    # basic gives Ein = 0.3002 and Eout = 0.4495
    # advanced gives Ein = 0.4254 and Eout = 0.5097

    # U and V here are those of the last (reg, k) combination trained above.
    newU, newV = projectUV(U, V)
    return newU, newV
Ejemplo n.º 7
0
def main():
    """Grid-search regularization and latent dimension, then plot both errors.

    For every (reg, k) pair a model is trained on train.txt and evaluated on
    test.txt. Training errors are saved to 2e_ein.png and test errors to
    2e_eout.png, one curve per regularization value.
    """
    Y_train = np.loadtxt('train.txt').astype(int)
    Y_test = np.loadtxt('test.txt').astype(int)

    M = max(max(Y_train[:, 0]), max(Y_test[:, 0])).astype(int)  # users
    N = max(max(Y_train[:, 1]), max(Y_test[:, 1])).astype(int)  # movies
    Ks = [10, 20, 30, 50, 100]

    regs = [10**-4, 10**-3, 10**-2, 10**-1, 1]
    eta = 0.03  # learning rate
    E_ins = []
    E_outs = []

    # One row of errors per regularization value, one column per k.
    for reg in regs:
        E_ins_for_lambda = []
        E_outs_for_lambda = []

        for k in Ks:
            print(
                "Training model with M = %s, N = %s, k = %s, eta = %s, reg = %s"
                % (M, N, k, eta, reg))
            U, V, e_in = train_model(M, N, k, eta, reg, Y_train)
            E_ins_for_lambda.append(e_in)
            E_outs_for_lambda.append(get_err(U, V, Y_test))

        E_ins.append(E_ins_for_lambda)
        E_outs.append(E_outs_for_lambda)

    def plot_curves(curves, label_prefix, title, filename):
        """Draw one error-vs-K curve per regularization value and save it."""
        for reg_value, errors in zip(regs, curves):
            plt.plot(Ks, errors, label=label_prefix + str(reg_value))
        plt.title(title)
        plt.xlabel('K')
        plt.ylabel('Error')
        plt.legend()
        plt.savefig(filename)

    # Raw strings: "\l" is not a valid escape sequence, so the non-raw form
    # triggers an invalid-escape warning on current Python versions.
    plot_curves(E_ins, r'$E_{in}, \lambda=$', '$E_{in}$ vs. K', '2e_ein.png')
    plt.clf()
    plot_curves(E_outs, r'$E_{out}, \lambda=$', '$E_{out}$ vs. K',
                '2e_eout.png')
Ejemplo n.º 8
0
def create_u_v_matrices():
    """Factorize the rating data and write U and V as comma-delimited text.

    Fetches the ratings through ``ut.get_rating_data([])``, trains with
    ``HW5utils.train_model`` and saves the factors to U_matrix.txt and
    V_matrix.txt in the working directory. Returns nothing.
    """
    raw_ratings, _, num_ratings, _ = ut.get_rating_data([])
    Y = np.array(raw_ratings)

    # Largest value found in the first column of the rating rows.
    M = int(max(row[0] for row in Y))
    # NOTE(review): N is taken from max(num_ratings) rather than from a movie
    # id column; confirm that this is really the number of movies.
    N = int(max(num_ratings))

    # Hyperparameters, in the positional order train_model is called with.
    K, eta, reg, eps, max_epochs = 20, 0.001, 0.01, 0.01, 100

    U, V, _ = HW5utils.train_model(M, N, K, eta, reg, Y, eps, max_epochs)

    for filename, factor in (('U_matrix.txt', U), ('V_matrix.txt', V)):
        np.savetxt(filename, factor, delimiter=',')
Ejemplo n.º 9
0
def RunModel1(M, N, k, eta, reg, Y_train, Y_test, GraphFlag=True):
    """Train and score model 1, optionally plotting its 2-D projection.

    Args:
        M, N, k, eta, reg: forwarded unchanged to ``model_1.train_model``.
        Y_train: training ratings passed to ``model_1.train_model``.
        Y_test: test ratings passed to ``model_1.get_err``.
        GraphFlag: plotting is skipped only when this is exactly ``False``.

    Returns:
        tuple: ``(e_in_1, e_out_1)``, the training and test errors.
    """
    print("Training model 1 with M = %s, N = %s, k = %s, eta = %s, reg = %s" %
          (M, N, k, eta, reg))
    user_factors, movie_factors, train_error = model_1.train_model(
        M, N, k, eta, reg, Y_train)
    errors = (train_error,
              model_1.get_err(user_factors, movie_factors, Y_test))
    print("model 1 results: e_in = %.3f, e_out = %.3f" % errors)

    if GraphFlag is not False:
        # Transform model 1 to 2D; only the projected V is plotted.
        _, projected_movies = project_to_2D(user_factors, movie_factors)

        # One figure per (ids, category) entry of the module-level to_plot.
        for ids, category in to_plot:
            visualize(projected_movies, ids, 'Model 1: ' + category)

    return errors
Ejemplo n.º 10
0
def train(Y, reg, eta, Y_test=None, zero_mean=True, save=True):
    '''
    Learn U, V, a, b with train_model and optionally save them under models/.

    Uses the module-level M, N and K and a fixed eps of 0.003. When
    zero_mean is true, the mean over axis 0 is subtracted from V before the
    SVD, before saving and before returning. When save is true, U, V, a, b
    and the first two columns of V's left singular vectors are written with
    np.save, using file names built from err, reg and eta.

    Returns (U, V, a, b, err).
    '''
    U, V, a, b, err = train_model(M, N, K, eta, reg, Y,
                                  Y_test=Y_test, eps=0.003)

    V = V - V.mean(axis=0) if zero_mean else V

    # Only the left singular vectors are kept.
    left_vectors = np.linalg.svd(V, full_matrices=False)[0]

    if not save:
        return U, V, a, b, err

    artifacts = (
        ('models/{:6.5f}-U-{:.5f}-{:.4f}', U),
        ('models/{:6.5f}-V-{:.5f}-{:.4f}', V),
        ('models/{:6.5f}-a-bias-{:.5f}-{:.4f}', a),
        ('models/{:6.5f}-b-bias-{:.5f}-{:.4f}', b),
        ('models/{:6.5f}-A-{:.5f}-{:.4f}', left_vectors[:, :2]),
    )
    for name_template, array in artifacts:
        np.save(name_template.format(err, reg, eta), array)

    return U, V, a, b, err
Ejemplo n.º 11
0
def main():
    """Train the biased model for each K and print its test error.

    Reads data/train.txt and data/test.txt, trains with eta=0.01, reg=0.1 and
    a stopping epsilon of 0.00005, and prints ``get_err`` on the test table
    for every K in the sweep (currently only K=20).
    """
    train_set = np.loadtxt('data/train.txt').astype(int)
    test_set = np.loadtxt('data/test.txt').astype(int)

    # Largest id seen in column 0 (users) and column 1 (movies) of either table.
    n_users = max(train_set[:, 0].max(), test_set[:, 0].max()).astype(int)
    n_movies = max(train_set[:, 1].max(), test_set[:, 1].max()).astype(int)
    print("Factorizing with ", n_users, " users, ", n_movies, " movies.")

    regularization = 0.1
    learning_rate = 0.01
    stop_threshold = 0.00005

    for n_factors in [20]:
        U, V, a, b, _ = train_model(n_users, n_movies, n_factors,
                                    learning_rate, regularization, train_set,
                                    stop_threshold)
        test_error = get_err(U, V, a, b, test_set)
        print(test_error)
Ejemplo n.º 12
0
def train(Y, reg, eta, Y_test=None, save=True):
    '''
    Learn U, V, a, b with train_model and optionally save them under models/.

    Uses the module-level M, N and K, a fixed eps of 0.005 and at most 20
    epochs. When save is true, each of U, V, a and b is written with np.save,
    using file names built from err, reg and eta.

    Returns (U, V, a, b, err).
    '''
    fitted = train_model(M, N, K, eta, reg, Y,
                         Y_test=Y_test, eps=0.005, max_epochs=20)
    U, V, a, b, err = fitted

    if not save:
        return U, V, a, b, err

    artifacts = (
        ('models/{:6.5f}-U-{:.5f}-{:.4f}', U),
        ('models/{:6.5f}-V-{:.5f}-{:.4f}', V),
        ('models/{:6.5f}-a-bias-{:.5f}-{:.4f}', a),
        ('models/{:6.5f}-b-bias-{:.5f}-{:.4f}', b),
    )
    for name_template, array in artifacts:
        np.save(name_template.format(err, reg, eta), array)

    return U, V, a, b, err
Ejemplo n.º 13
0
def main():
    """Train a k=20 factorization and save four 2-D scatter plots of movies.

    Reads the movie metadata and rating tables from ../data, trains with
    ``train_model`` (eta=0.03, reg=0.0), prints the training and test errors,
    projects V to two dimensions via its SVD and writes:

    * Standard-choice.png  - the movies in rows 2..11 of the projection
    * Standard-popular.png - the ten movies with the most ratings
    * Standard-best.png    - the ten movies with the highest mean rating
    * Standard-genres.png  - ten Action, ten Comedy and ten Romance movies

    Row ``i`` of the projection is treated as movie id ``i + 1`` and is
    labelled with ``movie_names[i]``.
    """
    movie_info = np.loadtxt('../data/movies.txt',
                            dtype="str",
                            delimiter="\t",
                            usecols=(0, 1, 3, 7, 16))
    data = np.loadtxt('../data/data.txt').astype(int)
    Y_train = np.loadtxt('../data/train.txt').astype(int)
    Y_test = np.loadtxt('../data/test.txt').astype(int)
    print(movie_info)

    M = max(max(Y_train[:, 0]), max(Y_test[:, 0])).astype(int)  # users
    N = max(max(Y_train[:, 1]), max(Y_test[:, 1])).astype(int)  # movies
    print("Factorizing with ", M, " users, ", N, " movies.")

    reg = 0.0
    eta = 0.03  # learning rate
    k = 20

    U, V, err = train_model(M, N, k, eta, reg, Y_train)
    e_out = get_err(U, V, Y_test)
    print("e_in", err)
    print("e_out", e_out)

    a, _, _ = np.linalg.svd(V)
    print(V.shape, a.shape)

    # NOTE(review): a[:2] selects the first two ROWS of the left singular
    # vectors. The usual 2-D projection uses the first two COLUMNS
    # (a[:, :2].T), and the original had a transpose commented out here.
    # Behavior is kept as-is; confirm which projection is intended.
    v_proj = np.transpose(np.dot(a[:2], V))
    print(v_proj.shape)
    x = v_proj[:, 0]
    y = v_proj[:, 1]

    # Group every rating in data.txt by movie id.
    ratings = {}
    for _user, movie_id, rating in data:
        ratings.setdefault(movie_id, []).append(rating)

    ids = movie_info[:, 0].astype(int)
    movie_names = movie_info[:, 1]

    def rows_for(movie_ids):
        """Return, in ascending order, the 0-based rows whose id is wanted."""
        wanted = set(movie_ids)
        # movie ID starts at 1, but matrix starts at 0
        return [row for row in range(len(v_proj)) if row + 1 in wanted]

    def genre_rows(column):
        """Return the rows of rated movies whose flag in `column` equals 1."""
        flags = movie_info[:, column].astype(int)
        # Movies without any rating are skipped instead of raising KeyError.
        return rows_for(movie_id for movie_id, flag in zip(ids, flags)
                        if flag == 1 and movie_id in ratings)

    # 1. 10 movies of our choice from the MovieLens dataset

    choice = slice(2, 12)
    plt.scatter(x[choice], y[choice])
    texts = [plt.text(px, py, name)
             for px, py, name in zip(x[choice], y[choice],
                                     movie_names[choice])]
    adjust_text(texts)
    plt.xlabel('Feature 0')
    plt.ylabel('Feature 1')
    plt.title('10 Movies of Our Choice')
    plt.savefig('Standard-choice.png')
    plt.clf()

    # 2. All ratings of the ten most popular movies

    max_10 = dict(
        sorted(ratings.items(), key=lambda r: len(r[1]), reverse=True)[:10])
    print(max_10.keys())
    pop_rows = rows_for(max_10)
    x_pop = x[pop_rows]
    y_pop = y[pop_rows]
    # Index by row, not by the 1-based id: indexing movie_names with the id
    # labelled every point with the title of the following movie.
    movie_title = [movie_names[row] for row in pop_rows]
    print(movie_title)

    plt.scatter(x_pop, y_pop)
    texts = [plt.text(px, py, name)
             for px, py, name in zip(x_pop, y_pop, movie_title)]
    adjust_text(texts)
    plt.xlabel('Feature 0')
    plt.ylabel('Feature 1')
    plt.title('10 Most Popular Movies')
    plt.savefig('Standard-popular.png')
    plt.clf()

    # 3. All ratings of the ten best movies

    best_10 = dict(
        sorted(ratings.items(),
               key=lambda r: sum(r[1]) / len(r[1]),
               reverse=True)[:10])
    print(best_10.keys())
    best_rows = rows_for(best_10)
    x_best = x[best_rows]
    y_best = y[best_rows]
    # Label with the best movies' own titles; the popular movies' titles were
    # previously reused for this plot.
    best_titles = [movie_names[row] for row in best_rows]

    for name, px, py in zip(best_titles, x_best, y_best):
        plt.annotate(name, (px, py))
    plt.scatter(x_best, y_best)
    plt.xlabel('Feature 0')
    plt.ylabel('Feature 1')
    plt.title('10 Best Movies')
    plt.savefig('Standard-best.png')
    plt.clf()

    # 4. 10 ratings of movies from three genres of your choice
    # movie_info columns 2, 3 and 4 hold movies.txt columns 3, 7 and 16, which
    # this script plots as Action, Comedy and Romance.

    action_rows = genre_rows(2)[2:12]
    comedy_rows = genre_rows(3)[2:12]
    romance_rows = genre_rows(4)[2:12]

    plt.scatter(x[action_rows], y[action_rows], label="Action")
    plt.scatter(x[comedy_rows], y[comedy_rows], color='orange',
                label="Comedy")
    plt.scatter(x[romance_rows], y[romance_rows], color='green',
                label="Romance")
    plt.legend()
    plt.title("Three Genres")
    plt.savefig('Standard-genres.png')
    plt.clf()
Ejemplo n.º 14
0
# Script section: factorize the ratings in `data` and project V onto two
# directions taken from its SVD.
# NOTE(review): `data` and `prob2utils` are defined outside this section;
# rows 0, 1 and 2 of the transpose are read as user ids, movie ids and
# ratings - confirm against where `data` is loaded.
transposed_data = np.transpose(data)

user_ids_vector = list(transposed_data[0])
movie_ids_vector = list(transposed_data[1])
rating_vector = list(transposed_data[2])

# The largest ids are passed to train_model as its first two arguments.
max_user_id = max(user_ids_vector)
max_movie_id = max(movie_ids_vector)

# Training hyperparameters passed to train_model below.
eta = 3e-2
reg = 1e-3

K = 20

U, V, err = prob2utils.train_model(max_user_id, max_movie_id, K, eta, reg,
                                   data)

a, s, b = np.linalg.svd(V)

# Debug output, disabled:
# print a
# print s
# print b

# First two columns of the left singular vectors of V.
a_tilde = a[:, :2]

# Project V onto those two directions.
V_tilde = np.dot(np.transpose(a_tilde), V)
# The matching projection of U is disabled:
#U_tilde = np.dot(np.transpose(a_tilde), U)


def make_plot(indices, title, file):
Ejemplo n.º 15
0
    genre6_movies, genre_6_ratings = Rating.rating_genres(6)
    genre1_movies, genre_1_ratings = Rating.rating_genres(1)
    genre7_movies, genre_2_ratings = Rating.rating_genres(7)

    do_histogram(genre_6_ratings, "Ratings for Movies of Genre 6", "Rating")
    #do_histogram(genre_1_ratings, "Ratings for Movies of Genre 1", "Rating")
    #do_histogram(genre_2_ratings, "Ratings for Movies of Genre 7", "Rating")
    
    # Do matrix factorization
    M = max(Rating.user_id)
    N = max(Rating.movie_id)
    K = 20
    learning_rate = 0.02
    lam = 0.0
    Y = np.column_stack((Rating.user_id, Rating.movie_id, Rating.ratings))
    U, V, err = p2util.train_model(M, N, K, learning_rate, lam, Y)

    V_2D = SVD(V)

    # V should have 2 x N shape, i.e. 2 features for N movies.
    random_movies = np.random.choice(Rating.movie_dict.keys(), 10)
    print random_movies, len(random_movies)
    rand_movie_labels = [Rating.movie_dict[x] for x in random_movies]
    printable = set(string.printable)

    rand_movie_labels = convert_to_ASCII(rand_movie_labels)

    print rand_movie_labels, len(rand_movie_labels)
    do_2D_plot(V_2D[0, random_movies], 
        V_2D[1, random_movies], 
        "10 Random Movies in 2D Factorization",
Ejemplo n.º 16
0
import numpy as np
import prob2utils

# Load the rating table; columns 0 and 1 supply the two size arguments below.
data = np.genfromtxt('../project3data/data.txt')

# Largest value in column 0 and in column 1, converted to integers.
M = data[:, 0].max().astype(int)
N = data[:, 1].max().astype(int)

# Training hyperparameters.
K, eta, reg = 20, 2e-3, 0.01

results = prob2utils.train_model(
    M, N, K, eta, reg, data, eps=0.0001, max_epochs=500)

# Persist the first two results as matrices and the third as a one-element
# row, all comma-delimited.
for _filename, _index in (('U_matrix.csv', 0), ('V_matrix.csv', 1)):
    np.savetxt(_filename, results[_index], delimiter=',')
np.savetxt('E_in.csv', [results[2]], delimiter=',')