Ejemplo n.º 1
0
def bipartite(user_k, item_k):
    """Alternate k-means clustering over the two axes of the rating matrix.

    Starting from the transpose of ``rating_matrix.matrix_transfer(2)``, five
    rounds are run.  Each round clusters the current matrix into ``user_k``
    groups, averages the original columns belonging to every group, clusters
    that summary into ``item_k`` groups, averages the original rows belonging
    to every group, and feeds the transposed row summary into the next round.

    ``k_means`` is used as a mapping from cluster index to the member indices
    of that cluster (it is iterated and read like a dict).

    Args:
        user_k: cluster count passed to the first ``k_means`` call of a round.
        item_k: cluster count passed to the second ``k_means`` call of a round.

    Returns:
        Tuple ``(user_dict, train_mtx_p, item_dict, train_mtx_pp)`` taken from
        the last round: the two cluster mappings and the ``(rows, user_k)`` /
        ``(item_k, cols)`` matrices of per-cluster means.
    """
    original = rating_matrix.matrix_transfer(2)
    n_rows, n_cols = original.shape
    current = np.transpose(original)
    col_clusters, row_clusters = {}, {}
    col_summary, row_summary = [], []

    for _ in range(5):
        # step 1: cluster the current (transposed) matrix
        col_clusters = k_means(current, user_k)

        # step 2: one column per cluster, holding the mean of its member columns
        col_summary = np.zeros((n_rows, user_k))
        for cluster_id, members in col_clusters.items():
            member_mean = original[:, members].mean(axis=1)
            col_summary[:, cluster_id] = np.asarray(member_mean).reshape(n_rows)

        # step 3: cluster the column summary
        row_clusters = k_means(col_summary, item_k)

        # step 4: one row per cluster, holding the mean of its member rows
        row_summary = np.zeros((item_k, n_cols))
        for cluster_id, members in row_clusters.items():
            member_mean = original[members, :].mean(axis=0)
            row_summary[cluster_id, :] = np.asarray(member_mean).reshape(n_cols)

        # step 5: the transposed row summary is the input of the next round
        current = np.transpose(row_summary)

    print('bipartite finished.')
    return (col_clusters, col_summary, row_clusters, row_summary)
def bipartite(user_k, item_k):
    """Alternate k-means clustering over the two axes of the rating matrix.

    Starting from the transpose of ``rating_matrix.matrix_transfer(2)``, five
    rounds are run.  Each round clusters the current matrix into ``user_k``
    groups, averages the original columns belonging to every group, clusters
    that summary into ``item_k`` groups, averages the original rows belonging
    to every group, and feeds the transposed row summary into the next round.

    ``k_means`` is used as a mapping from cluster index to the member indices
    of that cluster (it is iterated and read like a dict).

    Args:
        user_k: cluster count passed to the first ``k_means`` call of a round.
        item_k: cluster count passed to the second ``k_means`` call of a round.

    Returns:
        Tuple ``(user_dict, train_mtx_p, item_dict, train_mtx_pp)`` taken from
        the last round: the two cluster mappings and the ``(rows, user_k)`` /
        ``(item_k, cols)`` matrices of per-cluster means.
    """
    original = rating_matrix.matrix_transfer(2)
    n_rows, n_cols = original.shape
    current = np.transpose(original)
    col_clusters, row_clusters = {}, {}
    col_summary, row_summary = [], []

    for _ in range(5):
        # step 1: cluster the current (transposed) matrix
        col_clusters = k_means(current, user_k)

        # step 2: one column per cluster, holding the mean of its member columns
        col_summary = np.zeros((n_rows, user_k))
        for cluster_id, members in col_clusters.items():
            member_mean = original[:, members].mean(axis=1)
            col_summary[:, cluster_id] = np.asarray(member_mean).reshape(n_rows)

        # step 3: cluster the column summary
        row_clusters = k_means(col_summary, item_k)

        # step 4: one row per cluster, holding the mean of its member rows
        row_summary = np.zeros((item_k, n_cols))
        for cluster_id, members in row_clusters.items():
            member_mean = original[members, :].mean(axis=0)
            row_summary[cluster_id, :] = np.asarray(member_mean).reshape(n_cols)

        # step 5: the transposed row summary is the input of the next round
        current = np.transpose(row_summary)

    print('bipartite finished.')
    return (col_clusters, col_summary, row_clusters, row_summary)
def pcc_item_rating_pred(pair_path, k, option):
    """Predict ratings from the k most similar items on standardized ratings.

    The matrix from ``rating_matrix.matrix_transfer(2)`` is transposed,
    mean-centred and scaled to unit norm along axis 0, transposed back, and
    handed to ``movie_sim`` to build the item similarity matrix.  The
    predictions themselves are read from the (biased) unstandardized matrix,
    shifted by +3, and also passed to ``pred_result.file_writer``.

    Args:
        pair_path: forwarded to ``pred_set.pred_pair``; every resulting entry
            is read as ``(movie_id, user_id)``.
        k: divisor of the plain mean; k + 1 candidates are ranked and one is
            dropped (the movie itself, or else the least similar one).
        option: 1 = dot similarity, plain mean; 2 = dot similarity, weighted
            mean; 3 = cosine similarity, plain mean; 4 = cosine similarity,
            weighted mean.

    Returns:
        List with one predicted rating per pair, in input order.
    """
    pairs = pred_set.pred_pair(pair_path)
    ratings = rating_matrix.matrix_transfer(2)

    # Give every all-zero column a small bias.
    # NOTE(review): the test runs per column (axis=0) and columns are indexed
    # by user_id below -- confirm that biasing columns, not item rows, is
    # what is wanted here.
    empty_cols = np.where(~ratings.any(axis=0))[0]
    ratings[:, [empty_cols]] = 0.001

    # Standardize: centre and unit-normalize along axis 0 of the transpose.
    standardized = np.transpose(ratings)
    standardized = standardized - np.sum(standardized, axis=0) / len(standardized)
    standardized /= np.linalg.norm(standardized, axis=0)
    standardized = np.transpose(standardized)

    sim_mtx = []
    if option in (1, 2):
        sim_mtx = movie_sim.item_dot_sim(standardized)
    if option in (3, 4):
        # repeats the bias assignment made above
        ratings[:, [empty_cols]] = 0.001
        sim_mtx = movie_sim.item_cos_sim(standardized)

    predictions = []
    for pair_row in pairs:
        estimate = 0
        movie_id, user_id = pair_row[0], pair_row[1]
        sims = sim_mtx[movie_id]

        # k + 1 best candidates, most similar first
        neighbours = np.argsort(sims)[::-1][:k + 1]
        if movie_id in neighbours:
            # the movie is not its own neighbour
            neighbours = neighbours[neighbours != movie_id]
        else:
            # otherwise drop the weakest candidate
            neighbours = np.delete(neighbours, -1)

        neighbour_ratings = np.take(ratings[:, user_id], neighbours.tolist())
        if option in (1, 3):
            estimate = np.sum(neighbour_ratings) / float(k) + 3
        elif option in (2, 4):
            neighbour_sims = sims[neighbours]
            sim_total = np.sum(neighbour_sims)
            if sim_total == 0:
                # no usable weights: fall back to the neutral rating
                estimate = 3.0
            else:
                weights = neighbour_sims / sim_total
                estimate = np.sum(np.multiply(neighbour_ratings, weights)) + 3
        predictions.append(estimate)

    # output the result
    pred_result.file_writer(predictions)
    return predictions
def user_rating_pred(pair_path, k, option):
    """Predict ratings for (movie, user) pairs from the k most similar users.

    The similarity matrix comes from ``user_sim`` (dot product for options 1
    and 2, cosine for options 3 and 4).  Each prediction is the plain
    (options 1, 3) or similarity-weighted (options 2, 4) mean of the
    neighbours' entries in the movie's row, shifted by +3.  The result is also
    passed to ``pred_result.file_writer``.

    Args:
        pair_path: forwarded to ``pred_set.pred_pair``; every resulting entry
            is read as ``(movie_id, user_id)``.
        k: divisor of the plain mean; k + 1 candidates are ranked and one is
            dropped (the user itself, or else the least similar one).
        option: selects similarity measure and averaging as described above.

    Returns:
        List with one predicted rating per pair, in input order.
    """
    pairs = pred_set.pred_pair(pair_path)
    ratings = rating_matrix.matrix_transfer(2)
    sim_mtx = []
    empty_cols = np.where(~ratings.any(axis=0))[0]
    if option in (1, 2):
        sim_mtx = user_sim.user_dot_sim(ratings)
    if option in (3, 4):
        # bias the first entry of every all-zero column before the cosine
        ratings[0, [empty_cols]] = 0.001
        sim_mtx = user_sim.user_cos_sim(ratings)

    # TODO: weighted mean need refine
    predictions = []
    for pair_row in pairs:
        estimate = 0
        movie_id, user_id = pair_row[0], pair_row[1]
        sims = sim_mtx[user_id]

        # k + 1 best candidates, most similar first
        # TODO: if two sim equals, small user_id comes first
        neighbours = np.argsort(sims)[::-1][:k + 1]
        if user_id in neighbours:
            # the user is not its own neighbour
            neighbours = neighbours[neighbours != user_id]
        else:
            # otherwise drop the weakest candidate
            neighbours = np.delete(neighbours, -1)

        movie_row = ratings[movie_id, :]
        neighbour_ratings = np.take(movie_row, neighbours.tolist())
        if option in (1, 3):
            estimate = np.sum(neighbour_ratings) / float(k) + 3
        # TODO: problem exists, what if weighted sum is zero
        elif option in (2, 4):
            neighbour_sims = sims[neighbours]
            sim_total = np.sum(neighbour_sims)
            if sim_total == 0:
                # no usable weights: mean of the movie's non-zero entries
                estimate = np.sum(movie_row) / np.size(
                    np.nonzero(movie_row)) + 3
            else:
                weights = neighbour_sims / sim_total
                estimate = np.sum(np.multiply(neighbour_ratings, weights)) + 3

        predictions.append(estimate)

    # output the result
    pred_result.file_writer(predictions)
    return predictions
def pcc_item_rating_pred(pair_path, k, option):
    """Predict ratings from the k most similar items on standardized ratings.

    The matrix from ``rating_matrix.matrix_transfer(2)`` is transposed,
    mean-centred and scaled to unit norm along axis 0, transposed back, and
    handed to ``movie_sim`` to build the item similarity matrix.  The
    predictions themselves are read from the (biased) unstandardized matrix,
    shifted by +3, and also passed to ``pred_result.file_writer``.

    Args:
        pair_path: forwarded to ``pred_set.pred_pair``; every resulting entry
            is read as ``(movie_id, user_id)``.
        k: divisor of the plain mean; k + 1 candidates are ranked and one is
            dropped (the movie itself, or else the least similar one).
        option: 1 = dot similarity, plain mean; 2 = dot similarity, weighted
            mean; 3 = cosine similarity, plain mean; 4 = cosine similarity,
            weighted mean.

    Returns:
        List with one predicted rating per pair, in input order.
    """
    pairs = pred_set.pred_pair(pair_path)
    ratings = rating_matrix.matrix_transfer(2)

    # Give every all-zero column a small bias.
    # NOTE(review): the test runs per column (axis=0) and columns are indexed
    # by user_id below -- confirm that biasing columns, not item rows, is
    # what is wanted here.
    empty_cols = np.where(~ratings.any(axis=0))[0]
    ratings[:, [empty_cols]] = 0.001

    # Standardize: centre and unit-normalize along axis 0 of the transpose.
    standardized = np.transpose(ratings)
    standardized = standardized - np.sum(standardized, axis=0) / len(standardized)
    standardized /= np.linalg.norm(standardized, axis=0)
    standardized = np.transpose(standardized)

    sim_mtx = []
    if option in (1, 2):
        sim_mtx = movie_sim.item_dot_sim(standardized)
    if option in (3, 4):
        # repeats the bias assignment made above
        ratings[:, [empty_cols]] = 0.001
        sim_mtx = movie_sim.item_cos_sim(standardized)

    predictions = []
    for pair_row in pairs:
        estimate = 0
        movie_id, user_id = pair_row[0], pair_row[1]
        sims = sim_mtx[movie_id]

        # k + 1 best candidates, most similar first
        neighbours = np.argsort(sims)[::-1][:k + 1]
        if movie_id in neighbours:
            # the movie is not its own neighbour
            neighbours = neighbours[neighbours != movie_id]
        else:
            # otherwise drop the weakest candidate
            neighbours = np.delete(neighbours, -1)

        neighbour_ratings = np.take(ratings[:, user_id], neighbours.tolist())
        if option in (1, 3):
            estimate = np.sum(neighbour_ratings) / float(k) + 3
        elif option in (2, 4):
            neighbour_sims = sims[neighbours]
            sim_total = np.sum(neighbour_sims)
            if sim_total == 0:
                # no usable weights: fall back to the neutral rating
                estimate = 3.0
            else:
                weights = neighbour_sims / sim_total
                estimate = np.sum(np.multiply(neighbour_ratings, weights)) + 3
        predictions.append(estimate)

    # output the result
    pred_result.file_writer(predictions)
    return predictions
Ejemplo n.º 6
0
def item_rating_pred(pair_path, k, option):
    """Predict ratings for (movie, user) pairs from the k most similar items.

    The similarity matrix comes from ``movie_sim`` (dot product for options 1
    and 2, cosine for options 3 and 4).  Each prediction is the plain
    (options 1, 3) or similarity-weighted (options 2, 4) mean of the
    neighbours' entries in the user's column, shifted by +3.  The result is
    also passed to ``pred_result.file_writer``.

    Args:
        pair_path: forwarded to ``pred_set.pred_pair``; every resulting entry
            is read as ``(movie_id, user_id)``.
        k: divisor of the plain mean; k + 1 candidates are ranked and one is
            dropped (the movie itself, or else the least similar one).
        option: selects similarity measure and averaging as described above.

    Returns:
        List with one predicted rating per pair, in input order.
    """
    pairs = pred_set.pred_pair(pair_path)
    ratings = rating_matrix.matrix_transfer(2)
    empty_cols = np.where(~ratings.any(axis=0))[0]
    sim_mtx = []
    if option in (1, 2):
        sim_mtx = movie_sim.item_dot_sim(ratings)
    if option in (3, 4):
        # Give every all-zero column a small bias before the cosine.
        # NOTE(review): the test runs per column (axis=0) and columns are
        # indexed by user_id below -- confirm that biasing columns, not item
        # rows, is what is wanted here.
        ratings[:, [empty_cols]] = 0.001
        sim_mtx = movie_sim.item_cos_sim(ratings)

    predictions = []
    for pair_row in pairs:
        estimate = 0
        movie_id, user_id = pair_row[0], pair_row[1]
        sims = sim_mtx[movie_id]

        # k + 1 best candidates, most similar first
        neighbours = np.argsort(sims)[::-1][:k + 1]
        if movie_id in neighbours:
            # the movie is not its own neighbour
            neighbours = neighbours[neighbours != movie_id]
        else:
            # otherwise drop the weakest candidate
            neighbours = np.delete(neighbours, -1)

        neighbour_ratings = np.take(ratings[:, user_id], neighbours.tolist())
        if option in (1, 3):
            estimate = np.sum(neighbour_ratings) / float(k) + 3
        elif option in (2, 4):
            neighbour_sims = sims[neighbours]
            sim_total = np.sum(neighbour_sims)
            if sim_total == 0:
                # no usable weights: mean of the movie row's non-zero entries
                movie_row = ratings[movie_id, :]
                estimate = np.sum(movie_row) / np.size(
                    np.nonzero(movie_row)) + 3
            else:
                weights = neighbour_sims / sim_total
                estimate = np.sum(np.multiply(neighbour_ratings, weights)) + 3
        predictions.append(estimate)

    # output the result
    pred_result.file_writer(predictions)
    return predictions
def user_rating_pred(pair_path, k, option):
    """Predict ratings for (movie, user) pairs from the k most similar users.

    The similarity matrix comes from ``user_sim`` (dot product for options 1
    and 2, cosine for options 3 and 4).  Each prediction is the plain
    (options 1, 3) or similarity-weighted (options 2, 4) mean of the
    neighbours' entries in the movie's row, shifted by +3.  The result is also
    passed to ``pred_result.file_writer``.

    Args:
        pair_path: forwarded to ``pred_set.pred_pair``; every resulting entry
            is read as ``(movie_id, user_id)``.
        k: divisor of the plain mean; k + 1 candidates are ranked and one is
            dropped (the user itself, or else the least similar one).
        option: selects similarity measure and averaging as described above.

    Returns:
        List with one predicted rating per pair, in input order.
    """
    pairs = pred_set.pred_pair(pair_path)
    ratings = rating_matrix.matrix_transfer(2)
    sim_mtx = []
    empty_cols = np.where(~ratings.any(axis=0))[0]
    if option in (1, 2):
        sim_mtx = user_sim.user_dot_sim(ratings)
    if option in (3, 4):
        # bias the first entry of every all-zero column before the cosine
        ratings[0, [empty_cols]] = 0.001
        sim_mtx = user_sim.user_cos_sim(ratings)

    # TODO: weighted mean need refine
    predictions = []
    for pair_row in pairs:
        estimate = 0
        movie_id, user_id = pair_row[0], pair_row[1]
        sims = sim_mtx[user_id]

        # k + 1 best candidates, most similar first
        # TODO: if two sim equals, small user_id comes first
        neighbours = np.argsort(sims)[::-1][:k + 1]
        if user_id in neighbours:
            # the user is not its own neighbour
            neighbours = neighbours[neighbours != user_id]
        else:
            # otherwise drop the weakest candidate
            neighbours = np.delete(neighbours, -1)

        movie_row = ratings[movie_id, :]
        neighbour_ratings = np.take(movie_row, neighbours.tolist())
        if option in (1, 3):
            estimate = np.sum(neighbour_ratings) / float(k) + 3
        # TODO: problem exists, what if weighted sum is zero
        elif option in (2, 4):
            neighbour_sims = sims[neighbours]
            sim_total = np.sum(neighbour_sims)
            if sim_total == 0:
                # no usable weights: mean of the movie's non-zero entries
                estimate = np.sum(movie_row) / np.size(
                    np.nonzero(movie_row)) + 3
            else:
                weights = neighbour_sims / sim_total
                estimate = np.sum(np.multiply(neighbour_ratings, weights)) + 3

        predictions.append(estimate)

    # output the result
    pred_result.file_writer(predictions)
    return predictions
def item_rating_pred(pair_path, k, option):
    """Predict ratings for (movie, user) pairs from the k most similar items.

    The similarity matrix comes from ``movie_sim`` (dot product for options 1
    and 2, cosine for options 3 and 4).  Each prediction is the plain
    (options 1, 3) or similarity-weighted (options 2, 4) mean of the
    neighbours' entries in the user's column, shifted by +3.  The result is
    also passed to ``pred_result.file_writer``.

    Args:
        pair_path: forwarded to ``pred_set.pred_pair``; every resulting entry
            is read as ``(movie_id, user_id)``.
        k: divisor of the plain mean; k + 1 candidates are ranked and one is
            dropped (the movie itself, or else the least similar one).
        option: selects similarity measure and averaging as described above.

    Returns:
        List with one predicted rating per pair, in input order.
    """
    pairs = pred_set.pred_pair(pair_path)
    ratings = rating_matrix.matrix_transfer(2)
    empty_cols = np.where(~ratings.any(axis=0))[0]
    sim_mtx = []
    if option in (1, 2):
        sim_mtx = movie_sim.item_dot_sim(ratings)
    if option in (3, 4):
        # Give every all-zero column a small bias before the cosine.
        # NOTE(review): the test runs per column (axis=0) and columns are
        # indexed by user_id below -- confirm that biasing columns, not item
        # rows, is what is wanted here.
        ratings[:, [empty_cols]] = 0.001
        sim_mtx = movie_sim.item_cos_sim(ratings)

    predictions = []
    for pair_row in pairs:
        estimate = 0
        movie_id, user_id = pair_row[0], pair_row[1]
        sims = sim_mtx[movie_id]

        # k + 1 best candidates, most similar first
        neighbours = np.argsort(sims)[::-1][:k + 1]
        if movie_id in neighbours:
            # the movie is not its own neighbour
            neighbours = neighbours[neighbours != movie_id]
        else:
            # otherwise drop the weakest candidate
            neighbours = np.delete(neighbours, -1)

        neighbour_ratings = np.take(ratings[:, user_id], neighbours.tolist())
        if option in (1, 3):
            estimate = np.sum(neighbour_ratings) / float(k) + 3
        elif option in (2, 4):
            neighbour_sims = sims[neighbours]
            sim_total = np.sum(neighbour_sims)
            if sim_total == 0:
                # no usable weights: mean of the movie row's non-zero entries
                movie_row = ratings[movie_id, :]
                estimate = np.sum(movie_row) / np.size(
                    np.nonzero(movie_row)) + 3
            else:
                weights = neighbour_sims / sim_total
                estimate = np.sum(np.multiply(neighbour_ratings, weights)) + 3
        predictions.append(estimate)

    # output the result
    pred_result.file_writer(predictions)
    return predictions
def pcc_user_rating_pred(pair_path, k, option):
    """Predict ratings from the k most similar users on standardized ratings.

    Every column of ``rating_matrix.matrix_transfer(2)`` is mean-centred and
    scaled to unit norm, and ``user_sim`` builds the similarity matrix from
    that standardized matrix.  Each prediction is the plain mean of the
    neighbours' entries in the movie's row of the (biased) unstandardized
    matrix, shifted by +3; ``option`` only selects the similarity measure.
    The result is also passed to ``pred_result.file_writer``.

    Args:
        pair_path: forwarded to ``pred_set.pred_pair``; every resulting entry
            is read as ``(movie_id, user_id)``.
        k: divisor of the mean; k + 1 candidates are ranked and one is
            dropped (the user itself, or else the least similar one).
        option: 1 or 2 = ``user_sim.user_dot_sim``, 3 or 4 =
            ``user_sim.user_cos_sim``.

    Returns:
        List with one predicted rating per pair, in input order.
    """
    pair = pred_set.pred_pair(pair_path)
    train_mtx = rating_matrix.matrix_transfer(2)
    user_zero_vec = np.where(~train_mtx.any(axis=0))[0]
    # add a bias to the all zero column vectors
    train_mtx[:, [user_zero_vec]] = 0.001

    # Column standardization: centre, then divide by the norm of the CENTRED
    # column.  Dividing by the norm of the raw column (as was done before)
    # does not give unit-length vectors, so the dot product was not a
    # correlation; pcc_item_rating_pred already normalizes the centred data.
    pcc_mtx = train_mtx - np.sum(train_mtx, axis=0) / float(len(train_mtx))
    # The biased columns are constant, so their centred values are zero up to
    # rounding; pin them to exactly zero instead of normalizing the noise.
    pcc_mtx[:, user_zero_vec] = 0.0
    col_norms = np.linalg.norm(pcc_mtx, axis=0)
    # zero-variance columns stay all-zero rather than becoming NaN
    col_norms[col_norms == 0] = 1.0
    pcc_mtx /= col_norms

    user_sim_mtx = []
    pred_list = []
    if option == 1 or option == 2:
        user_sim_mtx = user_sim.user_dot_sim(pcc_mtx)
    if option == 3 or option == 4:
        user_sim_mtx = user_sim.user_cos_sim(pcc_mtx)

    # TODO: weighted mean need refine
    for row in pair:
        movie_id = row[0]
        user_id = row[1]
        user_sim_list = user_sim_mtx[user_id]
        # top k+1 nearest neighbors
        user_knn_list = np.argsort(user_sim_list)[::-1][0:k + 1]
        # TODO: if two sim equals, small user_id comes first
        if user_id in user_knn_list:
            # the user is not its own neighbour
            position = np.where(user_knn_list == user_id)
            user_knn_list = np.delete(user_knn_list, position)
        else:
            # otherwise drop the weakest candidate
            user_knn_list = np.delete(user_knn_list, len(user_knn_list) - 1)

        pred_rating = np.sum(
            np.take(train_mtx[movie_id, :],
                    user_knn_list.tolist())) / float(k) + 3
        pred_list.append(pred_rating)
    # output the result
    pred_result.file_writer(pred_list)
    return pred_list
def _print_rating_stats(ratings):
    """Print how many entries equal 1, 3 and 5, then the mean non-zero entry."""
    for score in (1, 3, 5):
        hits = np.where(ratings == score)[0].size
        print('movie with rating %d:  %s \n' % (score, hits))
    # float() keeps the division from flooring under Python 2 should the
    # rating data be integral.
    average = np.sum(ratings) / float(np.count_nonzero(ratings))
    print('movie rating average:  %s \n' % average)


def explorer(user_id=4321, movie_id=3):
    """Print rating statistics for the whole matrix, one user and one movie.

    Reads ``rating_matrix.matrix_transfer(0)`` and prints, for the full
    matrix, for column ``user_id`` and for row ``movie_id``: the number of
    entries equal to 1, 3 and 5 and the average over the non-zero entries.
    The user and movie sections are preceded by their non-zero entry count.

    Args:
        user_id: column index to report on (defaults to the previously
            hard-coded 4321).
        movie_id: row index to report on (defaults to the previously
            hard-coded 3).
    """
    train_mtx = rating_matrix.matrix_transfer(0)

    # ***** part 1.1.1: statistics *****
    _print_rating_stats(train_mtx)

    # ***** part 1.1.2: one user (a column of the matrix) *****
    curuser = train_mtx[:, user_id]
    print('number of movie rated:  %s \n' % np.count_nonzero(curuser))
    _print_rating_stats(curuser)

    # ***** part 1.1: one movie (a row of the matrix) *****
    curmovie = train_mtx[movie_id, :]
    print('number of user rated:  %s \n' % np.count_nonzero(curmovie))
    _print_rating_stats(curmovie)
def pcc_user_rating_pred(pair_path, k, option):
    """Predict ratings from the k most similar users on standardized ratings.

    Every column of ``rating_matrix.matrix_transfer(2)`` is mean-centred and
    scaled to unit norm, and ``user_sim`` builds the similarity matrix from
    that standardized matrix.  Each prediction is the plain mean of the
    neighbours' entries in the movie's row of the (biased) unstandardized
    matrix, shifted by +3; ``option`` only selects the similarity measure.
    The result is also passed to ``pred_result.file_writer``.

    Args:
        pair_path: forwarded to ``pred_set.pred_pair``; every resulting entry
            is read as ``(movie_id, user_id)``.
        k: divisor of the mean; k + 1 candidates are ranked and one is
            dropped (the user itself, or else the least similar one).
        option: 1 or 2 = ``user_sim.user_dot_sim``, 3 or 4 =
            ``user_sim.user_cos_sim``.

    Returns:
        List with one predicted rating per pair, in input order.
    """
    pair = pred_set.pred_pair(pair_path)
    train_mtx = rating_matrix.matrix_transfer(2)
    user_zero_vec = np.where(~train_mtx.any(axis=0))[0]
    # add a bias to the all zero column vectors
    train_mtx[:, [user_zero_vec]] = 0.001

    # Column standardization: centre, then divide by the norm of the CENTRED
    # column.  Dividing by the norm of the raw column (as was done before)
    # does not give unit-length vectors, so the dot product was not a
    # correlation; pcc_item_rating_pred already normalizes the centred data.
    pcc_mtx = train_mtx - np.sum(train_mtx, axis=0) / float(len(train_mtx))
    # The biased columns are constant, so their centred values are zero up to
    # rounding; pin them to exactly zero instead of normalizing the noise.
    pcc_mtx[:, user_zero_vec] = 0.0
    col_norms = np.linalg.norm(pcc_mtx, axis=0)
    # zero-variance columns stay all-zero rather than becoming NaN
    col_norms[col_norms == 0] = 1.0
    pcc_mtx /= col_norms

    user_sim_mtx = []
    pred_list = []
    if option == 1 or option == 2:
        user_sim_mtx = user_sim.user_dot_sim(pcc_mtx)
    if option == 3 or option == 4:
        user_sim_mtx = user_sim.user_cos_sim(pcc_mtx)

    # TODO: weighted mean need refine
    for row in pair:
        movie_id = row[0]
        user_id = row[1]
        user_sim_list = user_sim_mtx[user_id]
        # top k+1 nearest neighbors
        user_knn_list = np.argsort(user_sim_list)[::-1][0:k + 1]
        # TODO: if two sim equals, small user_id comes first
        if user_id in user_knn_list:
            # the user is not its own neighbour
            position = np.where(user_knn_list == user_id)
            user_knn_list = np.delete(user_knn_list, position)
        else:
            # otherwise drop the weakest candidate
            user_knn_list = np.delete(user_knn_list, len(user_knn_list) - 1)

        pred_rating = np.sum(
            np.take(train_mtx[movie_id, :],
                    user_knn_list.tolist())) / float(k) + 3
        pred_list.append(pred_rating)
    # output the result
    pred_result.file_writer(pred_list)
    return pred_list