def construct_full_features(predicted_user_features, predicted_item_features,
                            valid_users_idx, valid_items_idx,
                            min_num_ratings, train_full,
                            lambda_user, lambda_item):
    """Expand the predicted feature matrices to the full user/item dimensions.

    Users and items that were filtered out by ``min_num_ratings`` are
    re-inserted, and their feature columns are initialised via the
    ``fill_added_*`` helpers so the prediction beats the plain average.

    :param predicted_user_features: predicted user features matrix that has to be filled
    :param predicted_item_features: predicted item features matrix that has to be filled
    :param valid_users_idx: indices of valid users
    :param valid_items_idx: indices of valid items
    :param min_num_ratings: minimum number of ratings
    :param train_full: full training data set
    :param lambda_user: weight of the regularizer for user_features
    :param lambda_item: weight of the regularizer for item_features
    :return: full (user_features, item_features) of shapes
             (num_features, num_users) and (num_features, num_items)
    """
    # Base case: nothing was filtered, the predicted matrices are already full-sized.
    if min_num_ratings == 0:
        return predicted_user_features, predicted_item_features

    total_num_items, total_num_users = train_full.shape

    # Re-insert columns for the users/items removed by the rating filter.
    full_user_features = add_removed_elements(predicted_user_features, valid_users_idx, total_num_users)
    full_item_features = add_removed_elements(predicted_item_features, valid_items_idx, total_num_items)

    # Indices of the columns that were just re-added.
    added_users = unvalid_indexes(total_num_users, valid_users_idx)
    added_items = unvalid_indexes(total_num_items, valid_items_idx)

    # Non-zero rating counts per user (axis=0) and per item (axis=1).
    nnz_items_per_user = train_full.getnnz(axis=0)
    nnz_users_per_item = train_full.getnnz(axis=1)

    # Group non-zero entries: per item (row) the user indices,
    # per user (column) the item indices.
    _, nz_row_colindices, nz_col_rowindices = build_index_groups(train_full)
    nz_item_userindices = [colindices for _, colindices in nz_row_colindices]
    nz_user_itemindices = [rowindices for _, rowindices in nz_col_rowindices]

    # Refine the re-added feature columns for a better prediction than the average.
    full_item_features = fill_added_item_features(full_item_features, full_user_features, added_items, train_full,
                                                  lambda_item, nnz_users_per_item, nz_item_userindices)
    full_user_features = fill_added_user_features(full_item_features, full_user_features, added_users, train_full,
                                                  lambda_user, nnz_items_per_user, nz_user_itemindices)

    return full_user_features, full_item_features
# Beispiel #2
# 0
def ALS(train, test, n_f, l_u, l_i):
    """Run ALS until the train RMSE change drops below the stop criterion.

    :param train: sparse training ratings matrix
    :param test: sparse test ratings matrix, or None to skip test evaluation
    :param n_f: number of latent features (K in the lecture notes)
    :param l_u: regularization weight for the user features
    :param l_i: regularization weight for the item features
    :return: (prediction matrix item_features.dot(user_features.T), test RMSE or 0)
    """
    print("Running ALS with {} features, lambda user = {}, lambda item = {}".format(n_f, l_u, l_i))
    # define parameters
    num_features = n_f
    lambda_user = l_u
    lambda_item = l_i
    stop_criterion = 1e-4

    # Fixed seed for reproducible initialization.
    np.random.seed(988)

    # Initialize the factorization matrices.
    user_features, item_features = init_MF(train, num_features)

    # Group the non-zero rating indices once, up front.
    nz_train, nz_item_userindices, nz_user_itemindices = build_index_groups(train)
    if test is not None:
        nz_test = list(zip(*test.nonzero()))

    rmse = compute_error(train, user_features, item_features, nz_train)
    prev_rmse = np.inf  # RMSE of the previous iteration
    it = 0
    # Alternate the two least-squares updates until the RMSE stops moving.
    while np.abs(prev_rmse - rmse) > stop_criterion:
        user_features = update_user_feature(train, item_features, lambda_user, train.nnz, nz_user_itemindices)
        item_features = update_item_feature(train, user_features, lambda_item, train.nnz, nz_item_userindices)
        prev_rmse, rmse = rmse, compute_error(train, user_features, item_features, nz_train)
        it += 1
        if test is not None:
            print("iter: {}, RMSE on training set: {}.".format(it, rmse))
        else:
            print("iter: {}, RMSE: {}.".format(it, rmse))

    rmse_test = 0
    if test is not None:
        rmse_test = compute_error(test, user_features, item_features, nz_test)
        # Uncomment if logging needed for multiple runs during a long period of time
        # with open('logs/overnight_logging', 'a') as f:
        #   f.write("RMSE on testing set: {}, with k: {}, l_u: {}, l_i {}\n".format(rmse_test, num_features, lambda_user, lambda_item))
        print("RMSE on testing set: {}, with k: {}, l_u: {}, l_i {}".format(rmse_test, num_features, lambda_user, lambda_item))

    return item_features.dot(user_features.T), rmse_test
# Beispiel #3
# 0
def ALS(train, test, n_features, lambda_user, lambda_item, verbose=1):
    """Alternating Least Squares (ALS) algorithm.

    Factorizes ``train`` into item and user feature matrices, caching the
    result on disk under ``ALSdump/`` so a rerun with the same
    hyper-parameters loads the cached factors instead of recomputing.

    :param train: sparse training ratings matrix
    :param test: sparse test ratings matrix, used to report the test RMSE
    :param n_features: number of latent features
    :param lambda_user: regularization weight for the user features
    :param lambda_item: regularization weight for the item features
    :param verbose: 1 to print the RMSE every epoch, 0 to print only the last one
    :return: (user_features, item_features)
    """
    print(
        '\nStarting ALS with n_features = %d, lambda_user = %f, lambda_item = %f'
        % (n_features, lambda_user, lambda_item))

    n_epochs = 20

    user_features_file_path = 'ALSdump/user_features_%s_%s_%s_%s.npy' \
        % (n_epochs, n_features, lambda_user, lambda_item)

    item_features_file_path = 'ALSdump/item_features_%s_%s_%s_%s.npy' \
        % (n_epochs, n_features, lambda_user, lambda_item)

    # Hoist loop-invariant work: train/test never change, so the non-zero
    # index tuples and the dense value vectors are computed exactly once
    # instead of on every epoch.
    train_nz = train.nonzero()
    train_vals = train[train_nz].toarray()[0]
    test_nz = test.nonzero()
    test_vals = test[test_nz].toarray()[0]

    if (os.path.exists(user_features_file_path)
            and os.path.exists(item_features_file_path)):
        # Cached factorization found: load it and just report its errors.
        user_features = np.load(user_features_file_path)
        item_features = np.load(item_features_file_path)

        train_rmse = helpers.calculate_rmse(
            np.dot(item_features, user_features)[train_nz], train_vals)

        test_rmse = helpers.calculate_rmse(
            np.dot(item_features, user_features)[test_nz], test_vals)

        print("Train error: %f, test error: %f" % (train_rmse, test_rmse))

        return user_features, item_features

    user_features, item_features = init_MF(train, n_features)

    # Group the non-zero entries per user and per item once.
    _, nz_row_colindices, nz_col_rowindices = helpers.build_index_groups(
        train)
    _, nz_user_itemindices = map(list, zip(*nz_col_rowindices))
    nnz_items_per_user = [len(i) for i in nz_user_itemindices]
    _, nz_item_userindices = map(list, zip(*nz_row_colindices))
    nnz_users_per_item = [len(i) for i in nz_item_userindices]

    prev_train_rmse = 100
    for it in range(n_epochs):
        # Alternate: fix item features, solve for user features, then swap.
        user_features = update_user_feature(train, item_features, lambda_user,
                                            nnz_items_per_user,
                                            nz_user_itemindices)

        item_features = update_item_feature(train, user_features, lambda_item,
                                            nnz_users_per_item,
                                            nz_item_userindices)

        train_rmse = helpers.calculate_rmse(
            np.dot(item_features, user_features)[train_nz], train_vals)

        test_rmse = helpers.calculate_rmse(
            np.dot(item_features, user_features)[test_nz], test_vals)

        if verbose == 1:
            print("[Epoch %d / %d] train error: %f, test error: %f" %
                  (it + 1, n_epochs, train_rmse, test_rmse))

        # Stop when the train RMSE increases or the improvement stalls.
        if (train_rmse > prev_train_rmse
                or abs(train_rmse - prev_train_rmse) < 1e-5):
            if verbose == 1:
                print('Algorithm has converged!')
            break
        prev_train_rmse = train_rmse

    if verbose == 0:
        print("[Epoch %d / %d] train error: %f, test error: %f" %
              (it + 1, n_epochs, train_rmse, test_rmse))

    # np.save does not create intermediate directories and would raise
    # FileNotFoundError if ALSdump/ is missing — create it first.
    os.makedirs(os.path.dirname(user_features_file_path), exist_ok=True)
    np.save(user_features_file_path, user_features)
    np.save(item_features_file_path, item_features)

    return user_features, item_features
def ALS(train, test, lambda_user, lambda_item, num_features):
    """ Matrix factorization using Alternating Least Squares (ALS).

    :param train: train data matrix of size (num_items, num_users)
    :param test: test data matrix of size (num_items, num_users)
    :param lambda_user: weight of the regularizer for user_features
    :param lambda_item: weight of the regularizer for item_features
    :param num_features: number of features for the factorization, also called k
    :return: user_features, item_features of size (num_features, num_users) and (num_features, num_items) respectively.
             error_table containing the RMSEs after every iteration until it converges to the stopping criterion.
             rmse_test that is -1 if there is no test set.
    """
    # Stop once two consecutive train RMSEs differ by less than this.
    stop_criterion = 1e-4
    # Two-slot buffer holding the previous and the current RMSE; `change`
    # indexes the slot to overwrite next. Slot 0 starts at 1000 so the
    # first convergence check always passes.
    change = 1
    error_list = [1000, 0]
    error_table = []

    # Fixed seed for reproducible initialization.
    np.random.seed(988)

    # Initialize the factorization matrices
    user_features, item_features = init_MF(train, num_features)

    # Calculate arguments for the update of Z and W
    nnz_items_per_user = train.getnnz(axis=0)
    nnz_users_per_item = train.getnnz(axis=1)
    # nz_train_indices is loop-invariant (train never changes), so it is
    # computed once here instead of being rebuilt on every iteration.
    nz_train_indices, nz_row_colindices, nz_col_rowindices = build_index_groups(
        train)

    while abs(error_list[0] - error_list[1]) > stop_criterion:

        # Fix W (item), estimate Z (user)
        for i, nz_user_itemindices in nz_col_rowindices:
            user_features[:, i] = update_user_feature(train[:, i],
                                                      item_features,
                                                      lambda_user,
                                                      nnz_items_per_user[i],
                                                      nz_user_itemindices)

        # Fix Z, estimate W
        for j, nz_item_userindices in nz_row_colindices:
            item_features[:, j] = update_item_feature(train[j], user_features,
                                                      lambda_item,
                                                      nnz_users_per_item[j],
                                                      nz_item_userindices)

        # Store the RMSE
        error_list[change] = compute_error(train, user_features, item_features,
                                           nz_train_indices)
        error_table.append(error_list[change])

        print("RMSE on train data: {}".format(error_list[change]))

        # Flip the slot so the previous RMSE is not overwritten.
        change = 1 - change

    print("Converged\n")

    # Create a list of non zero indices of the test set
    nz_row_te, nz_col_te = test.nonzero()
    nz_test = list(zip(nz_row_te, nz_col_te))

    # Check if the test is non null, otherwise we set its RMSE to -1
    if len(nz_test) == 0:
        rmse_test = -1
    else:
        rmse_test = compute_error(test, user_features, item_features, nz_test)
        print("RMSE on test data: {}.".format(rmse_test))

    return user_features, item_features, error_table, rmse_test