def main():
    """Train one factorization per latent dimension K and plot the errors.

    Reads the train/test tables from ``data/``, trains a model with
    ``reg = 0.0`` for every K in a fixed list, and saves the resulting
    in-sample / out-of-sample error curves to ``2d.png``.
    """
    train_ratings = np.loadtxt('data/train.txt').astype(int)
    test_ratings = np.loadtxt('data/test.txt').astype(int)

    # The largest value seen in either split sizes each dimension
    # (column 0 -> users, column 1 -> movies).
    M = max(train_ratings[:, 0].max(), test_ratings[:, 0].max()).astype(int)
    N = max(train_ratings[:, 1].max(), test_ratings[:, 1].max()).astype(int)
    print("Factorizing with ", M, " users, ", N, " movies.")

    latent_dims = [10, 20, 30, 50, 100]
    reg = 0.0
    eta = 0.03  # learning rate

    # One training run per K; record the error train_model reports and the
    # error get_err computes on the test table.
    in_sample = []
    out_of_sample = []
    for K in latent_dims:
        U, V, train_err = train_model(M, N, K, eta, reg, train_ratings)
        in_sample.append(train_err)
        out_of_sample.append(get_err(U, V, test_ratings))

    curves = ((in_sample, '$E_{in}$'), (out_of_sample, '$E_{out}$'))
    for errors, legend_label in curves:
        plt.plot(latent_dims, errors, label=legend_label)
    plt.title('Error vs. K')
    plt.xlabel('K')
    plt.ylabel('Error')
    plt.legend()
    plt.savefig('2d.png')
Ejemplo n.º 2
0
def cross_validate(Y_train, Y_test, regs, etas):
    """Train once per (reg, eta) pair and print the resulting errors.

    Args:
        Y_train: training data, forwarded to ``train`` and ``get_err``.
        Y_test: held-out data, forwarded to ``train`` and ``get_err``.
        regs: regularization strengths to try (outer loop).
        etas: step sizes to try (inner loop).

    Returns:
        None. One summary line is printed per combination, after a short
        header with the two dataset sizes.
    """
    print('training size =', len(Y_train))
    print('testing size  =', len(Y_test))
    print()

    for reg in regs:
        for eta in etas:
            U, V, a, b, _ = train(Y_train, reg, eta, Y_test=Y_test,
                                  zero_mean=False, save=False)
            # Errors are evaluated with reg=0, independent of the strength
            # used for training.
            err_in = get_err(U, V, a, b, Y_train, reg=0)
            err_out = get_err(U, V, a, b, Y_test, reg=0)
            fields = (
                'errOut = {:.6f}'.format(err_out),
                'reg = {:.5f}'.format(reg),
                'eta = {:.4f}'.format(eta),
                'errIn = {:.6f}'.format(err_in),
            )
            print(', '.join(fields))
Ejemplo n.º 3
0
def main():
    """Sweep regularization and epsilon, then plot E_in and E_out.

    For every regularization constant and every epsilon value a model with
    ``k = 20`` is trained on ``data/train.txt`` and evaluated on
    ``data/test.txt``. Two figures are written: ``E_in.png`` and
    ``E_out.png``, each with one curve per regularization constant over a
    log-scaled epsilon axis.
    """
    Y_train = np.loadtxt('data/train.txt').astype(int)
    Y_test = np.loadtxt('data/test.txt').astype(int)

    # The largest value in either split sizes each dimension.
    M = max(Y_train[:, 0].max(), Y_test[:, 0].max()).astype(int)  # users
    N = max(Y_train[:, 1].max(), Y_test[:, 1].max()).astype(int)  # movies

    k = 20
    # regularization constants
    regs = [10**-4, 10**-3, 10**-2, 10**-1, 1]
    # learning rate
    eta = 0.01
    # Values passed as the last argument of train_model (presumably a
    # stopping threshold; confirm against train_model). The original author
    # noted 0.00005 as the best value.
    epsilons = [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.002]

    # E_ins[i][j] / E_outs[i][j] hold the errors for regs[i], epsilons[j].
    E_ins = []
    E_outs = []
    for reg in regs:
        E_ins_for_lambda = []
        E_outs_for_lambda = []

        for ep in epsilons:
            print(
                "Training model with M = %s, N = %s, k = %s, eta = %s, reg = %s, ep = %s"
                % (M, N, k, eta, reg, ep))
            U, V, e_in = train_model(M, N, k, eta, reg, Y_train, ep)
            E_ins_for_lambda.append(e_in)
            E_outs_for_lambda.append(get_err(U, V, Y_test))

        E_ins.append(E_ins_for_lambda)
        E_outs.append(E_outs_for_lambda)

    # The labels are raw strings: "\l" is not a valid escape sequence, so a
    # plain literal containing "\lambda" triggers a syntax warning.
    for reg, errors in zip(regs, E_ins):
        plt.plot(epsilons, errors, label=r'$E_{in}, \lambda=$' + str(reg))
    plt.title('$E_{in}$ vs. Epsilon')
    plt.xlabel('Epsilon')
    plt.ylabel('Error')
    plt.xscale('log')
    plt.legend()
    plt.savefig('E_in.png')
    plt.clf()

    for reg, errors in zip(regs, E_outs):
        plt.plot(epsilons, errors, label=r'$E_{out}, \lambda=$' + str(reg))
    plt.title('$E_{out}$ vs. Epsilon')
    plt.xlabel('Epsilon')
    plt.ylabel('Error')
    plt.xscale('log')
    plt.legend()
    plt.savefig('E_out.png')
Ejemplo n.º 4
0
def main():
    """Sweep regularization and latent dimension K, then plot the errors.

    For every regularization constant and every K a model is trained on
    ``train.txt`` and evaluated on ``test.txt``. Two figures are written:
    ``2e_ein.png`` and ``2e_eout.png``, each with one curve per
    regularization constant plotted against K.
    """
    Y_train = np.loadtxt('train.txt').astype(int)
    Y_test = np.loadtxt('test.txt').astype(int)

    # The largest value in either split sizes each dimension.
    M = max(Y_train[:, 0].max(), Y_test[:, 0].max()).astype(int)  # users
    N = max(Y_train[:, 1].max(), Y_test[:, 1].max()).astype(int)  # movies
    Ks = [10, 20, 30, 50, 100]

    regs = [10**-4, 10**-3, 10**-2, 10**-1, 1]
    eta = 0.03  # learning rate

    # E_ins[i][j] / E_outs[i][j] hold the errors for regs[i], Ks[j].
    E_ins = []
    E_outs = []
    for reg in regs:
        E_ins_for_lambda = []
        E_outs_for_lambda = []

        for k in Ks:
            print(
                "Training model with M = %s, N = %s, k = %s, eta = %s, reg = %s"
                % (M, N, k, eta, reg))
            U, V, e_in = train_model(M, N, k, eta, reg, Y_train)
            E_ins_for_lambda.append(e_in)
            E_outs_for_lambda.append(get_err(U, V, Y_test))

        E_ins.append(E_ins_for_lambda)
        E_outs.append(E_outs_for_lambda)

    # Plot values of E_in across k for each value of lambda.
    # The labels are raw strings: "\l" is not a valid escape sequence, so a
    # plain literal containing "\lambda" triggers a syntax warning.
    for reg, errors in zip(regs, E_ins):
        plt.plot(Ks, errors, label=r'$E_{in}, \lambda=$' + str(reg))
    plt.title('$E_{in}$ vs. K')
    plt.xlabel('K')
    plt.ylabel('Error')
    plt.legend()
    plt.savefig('2e_ein.png')
    plt.clf()

    # Plot values of E_out across k for each value of lambda.
    for reg, errors in zip(regs, E_outs):
        plt.plot(Ks, errors, label=r'$E_{out}, \lambda=$' + str(reg))
    plt.title('$E_{out}$ vs. K')
    plt.xlabel('K')
    plt.ylabel('Error')
    plt.legend()
    plt.savefig('2e_eout.png')
Ejemplo n.º 5
0
def RunModel1(M, N, k, eta, reg, Y_train, Y_test, GraphFlag=True):
    """Train and evaluate model 1, optionally visualising the result.

    Args:
        M, N, k, eta, reg: forwarded unchanged to ``model_1.train_model``.
        Y_train: training data passed to ``model_1.train_model``.
        Y_test: data passed to ``model_1.get_err`` for the reported e_out.
        GraphFlag: plotting is skipped only when this is exactly ``False``.

    Returns:
        Tuple ``(e_in, e_out)``: the error returned by training and the
        error computed on ``Y_test``.
    """
    print("Training model 1 with M = %s, N = %s, k = %s, eta = %s, reg = %s" %
          (M, N, k, eta, reg))
    user_factors, movie_factors, train_err = model_1.train_model(
        M, N, k, eta, reg, Y_train)
    test_err = model_1.get_err(user_factors, movie_factors, Y_test)
    print("model 1 results: e_in = %.3f, e_out = %.3f" % (train_err, test_err))

    if GraphFlag is not False:
        # Transform model 1 to 2D; only the second projection is plotted.
        _, movie_coords = project_to_2D(user_factors, movie_factors)

        # One figure per (ids, category) entry of the module-level to_plot.
        for ids, category in to_plot:
            visualize(movie_coords, ids, 'Model 1: ' + category)

    return train_err, test_err
def main():
    """Train a single K = 20 model and print its error on the test table.

    Reads the train/test tables from ``data/``, trains with ``reg = 0.1``,
    ``eta = 0.01`` and ``epsilon = 0.00005``, and prints what ``get_err``
    returns for the test table.
    """
    train_ratings = np.loadtxt('data/train.txt').astype(int)
    test_ratings = np.loadtxt('data/test.txt').astype(int)

    # The largest value seen in either split sizes each dimension
    # (column 0 -> users, column 1 -> movies).
    M = max(train_ratings[:, 0].max(), test_ratings[:, 0].max()).astype(int)
    N = max(train_ratings[:, 1].max(), test_ratings[:, 1].max()).astype(int)
    print("Factorizing with ", M, " users, ", N, " movies.")

    reg = 0.1
    eta = 0.01  # learning rate
    epsilon = 0.00005

    # A one-element sweep keeps the shape of the multi-K experiments.
    for K in (20,):
        U, V, a, b, _ = train_model(M, N, K, eta, reg, train_ratings, epsilon)
        print(get_err(U, V, a, b, test_ratings))
Ejemplo n.º 7
0
def _points_for_movie_ids(v_proj, movie_ids):
    """Collect the 2D points of the requested movies in ascending ID order.

    Row ``r`` of ``v_proj`` is treated as movie ID ``r + 1`` (IDs start at 1,
    rows at 0). Returns ``(xs, ys, found_ids)`` as three parallel lists; IDs
    that have no row in ``v_proj`` are silently absent.
    """
    wanted = set(movie_ids)
    xs, ys, found_ids = [], [], []
    for movie_id, point in enumerate(v_proj, start=1):
        if movie_id in wanted:
            xs.append(point[0])
            ys.append(point[1])
            found_ids.append(movie_id)
    return xs, ys, found_ids


def _save_labelled_scatter(xs, ys, labels, title, filename):
    """Scatter the points, label each one, save the figure and clear it."""
    plt.scatter(xs, ys)
    texts = [plt.text(x, y, label) for x, y, label in zip(xs, ys, labels)]
    adjust_text(texts)
    plt.xlabel('Feature 0')
    plt.ylabel('Feature 1')
    plt.title(title)
    plt.savefig(filename)
    plt.clf()


def main():
    """Train a k = 20 factorization and save four 2D movie visualisations.

    Steps:
      1. Load movie metadata, the full rating table and the train/test
         splits from ``../data/``.
      2. Train with ``reg = 0.0`` and ``eta = 0.03``; print e_in and e_out.
      3. Project ``V`` to two dimensions using its SVD.
      4. Save ``Standard-choice.png`` (rows 2..11), ``Standard-popular.png``
         (ten most-rated movies), ``Standard-best.png`` (ten highest mean
         ratings) and ``Standard-genres.png`` (three genre flags).
    """
    # Columns kept: 0 = movie ID, 1 = title, then three 0/1 flag columns
    # that are plotted below as Action, Comedy and Romance.
    movie_info = np.loadtxt('../data/movies.txt',
                            dtype="str",
                            delimiter="\t",
                            usecols=(0, 1, 3, 7, 16))
    data = np.loadtxt('../data/data.txt').astype(int)
    Y_train = np.loadtxt('../data/train.txt').astype(int)
    Y_test = np.loadtxt('../data/test.txt').astype(int)
    print(movie_info)

    M = max(Y_train[:, 0].max(), Y_test[:, 0].max()).astype(int)  # users
    N = max(Y_train[:, 1].max(), Y_test[:, 1].max()).astype(int)  # movies
    print("Factorizing with ", M, " users, ", N, " movies.")

    reg = 0.0
    eta = 0.03  # learning rate
    k = 20

    U, V, err = train_model(M, N, k, eta, reg, Y_train)
    e_out = get_err(U, V, Y_test)
    print("e_in", err)
    print("e_out", e_out)

    a, _, _ = np.linalg.svd(V)
    print(V.shape, a.shape)

    # movie ID starts at 1, but matrix starts at 0
    # NOTE(review): this projects onto the first two ROWS of `a`. The usual
    # SVD projection uses the first two columns (a[:, :2].T). The transpose
    # was disabled in the original code, so behaviour is kept - confirm.
    v_proj = np.transpose(np.dot(a[:2], V))
    print(v_proj.shape)

    # Group every rating in the full data set by movie ID.
    ratings = {}
    for _, movie_id, rating in data:
        ratings.setdefault(int(movie_id), []).append(rating)

    ids = movie_info[:, 0].astype(int)
    movie_names = movie_info[:, 1]
    # Look titles up by ID instead of by position, so a 1-based movie ID is
    # never used as a 0-based index into movie_names.
    title_by_id = {int(movie_id): name
                   for movie_id, name in zip(ids, movie_names)}

    # 1. 10 movies of our choice from the MovieLens dataset
    _save_labelled_scatter(v_proj[2:12, 0], v_proj[2:12, 1],
                           movie_names[2:12],
                           '10 Movies of Our Choice', 'Standard-choice.png')

    # 2. All ratings of the ten most popular movies
    most_rated = sorted(ratings,
                        key=lambda movie_id: len(ratings[movie_id]),
                        reverse=True)[:10]
    print(most_rated)
    x_pop, y_pop, pop_ids = _points_for_movie_ids(v_proj, most_rated)
    pop_titles = [title_by_id.get(movie_id, str(movie_id))
                  for movie_id in pop_ids]
    print(pop_titles)
    _save_labelled_scatter(x_pop, y_pop, pop_titles,
                           '10 Most Popular Movies', 'Standard-popular.png')

    # 3. All ratings of the ten best movies
    best_rated = sorted(
        ratings,
        key=lambda movie_id: sum(ratings[movie_id]) / len(ratings[movie_id]),
        reverse=True)[:10]
    print(best_rated)
    x_best, y_best, best_ids = _points_for_movie_ids(v_proj, best_rated)
    # Label with the best movies' own titles (the popular-movie titles were
    # previously reused here by mistake).
    best_titles = [title_by_id.get(movie_id, str(movie_id))
                   for movie_id in best_ids]
    _save_labelled_scatter(x_best, y_best, best_titles,
                           '10 Best Movies', 'Standard-best.png')

    # 4. 10 ratings of movies from three genres of your choice
    genres = (
        ("Action", 2, {}),
        ("Comedy", 3, {'color': 'orange'}),
        ("Romance", 4, {'color': 'green'}),
    )
    for label, column, style in genres:
        flags = movie_info[:, column].astype(int)
        # Movies without any rating are skipped rather than raising KeyError.
        genre_ids = [int(movie_id) for movie_id, flag in zip(ids, flags)
                     if flag == 1 and int(movie_id) in ratings]
        xs, ys, _ = _points_for_movie_ids(v_proj, genre_ids)
        plt.scatter(xs[2:12], ys[2:12], label=label, **style)
    plt.legend()
    plt.xlabel('Feature 0')
    plt.ylabel('Feature 1')
    plt.title("Three Genres")
    plt.savefig('Standard-genres.png')
    plt.clf()