def calc_user_user_cf2(training_data, num_partitions=20):
    """
    A very simple user-user collaborative-filtering algorithm in PySpark.
    Method is more stable than calc_user_user_cf.

    Method derived from the Coursera course "Recommender Systems" taught by
    Prof Joseph Konstan (University of Minnesota) and Prof Michael Ekstrand
    (Texas State University).

    Args:
        training_data: RDD of (userId, itemId, actualRating) used to train
            the RecSys algorithm.
        num_partitions: partition count to coalesce down to after the
            cartesian product (which otherwise multiplies partitions).

    Returns:
        RDD of (userId, itemId, predictedRating), squished into the rating
        range observed in the training data.
    """
    # Group each user's rows; the cartesian product yields every user pair
    # so pairwise similarity can be scored.  NOTE: this is O(users^2).
    user_groups = training_data.groupBy(lambda row: row[0])
    user_groups_sim = user_groups.cartesian(user_groups).map(
        lambda pair: (pair[0][0], pair[1][0],
                      similarity(pair[0][1], pair[1][1], 1))
    ).coalesce(num_partitions)

    # Per-user mean rating, used to mean-center each rating below.
    user_averages = training_data.map(lambda row: (row[0], row[2])) \
        .groupByKey() \
        .map(lambda kv: (kv[0], np.mean(list(kv[1]))))

    # Residual = rating minus the user's average rating.
    # kv = (user, ((item, rating), avg_rating))
    user_resids = training_data.map(lambda row: (row[0], (row[1], row[2]))) \
        .join(user_averages) \
        .map(lambda kv: (kv[0], (kv[1][0][0], kv[1][0][1] - kv[1][1])))

    # Weight each neighbor's residual by the pair similarity, then aggregate
    # per (user, item) into a single adjustment to apply to the user's mean.
    # After the join: kv = (u1, ((item, resid), (u2, sim))).
    item_adjustments = user_resids.join(
        user_groups_sim.map(lambda row: (row[0], (row[1], row[2])))
    ).map(
        lambda kv: ((kv[1][1][0], kv[1][0][0]),
                    (kv[1][0][1] * kv[1][1][1], kv[1][1][1]))
    ).groupByKey().map(
        lambda kv: (kv[0][0], kv[0][1], calc_item_adjust(kv[1]))
    )

    # Prediction = user's average rating + similarity-weighted adjustment.
    # kv = (user, ((item, item_adj), avg_rating))
    predictions = item_adjustments.map(lambda row: (row[0], (row[1], row[2]))) \
        .join(user_averages) \
        .map(lambda kv: (kv[0], kv[1][0][0], kv[1][1] + kv[1][0][1]))

    # Predicted values can land anywhere - because we are normalizing the
    # content based algorithms we should likely normalize here too.
    max_rating = training_data.map(lambda row: row[2]).max()
    min_rating = training_data.map(lambda row: row[2]).min()
    if max_rating == min_rating:
        min_rating = 0  # avoid a zero-width range (e.g. all-identical ratings)

    norm_predictions = predictions.map(
        lambda row: (row[0], row[1],
                     rechelp.squish_preds(row[2], min_rating, max_rating))
    )
    return norm_predictions
# NOTE(review): this is a verbatim duplicate of the calc_user_user_cf2 defined
# earlier in this file; Python keeps only the last binding, so one copy should
# be deleted once callers are confirmed.
def calc_user_user_cf2(training_data, num_partitions=20):
    """
    A very simple user-user collaborative-filtering algorithm in PySpark.
    Method is more stable than calc_user_user_cf.

    Method derived from the Coursera course "Recommender Systems" taught by
    Prof Joseph Konstan (University of Minnesota) and Prof Michael Ekstrand
    (Texas State University).

    Args:
        training_data: RDD of (userId, itemId, actualRating) used to train
            the RecSys algorithm.
        num_partitions: partition count to coalesce down to after the
            cartesian product.

    Returns:
        RDD of (userId, itemId, predictedRating), squished into the rating
        range observed in the training data.
    """
    # All user pairs (O(users^2)) scored by the project's similarity helper.
    user_groups = training_data.groupBy(lambda row: row[0])
    user_groups_sim = user_groups.cartesian(user_groups).map(
        lambda pair: (pair[0][0], pair[1][0],
                      similarity(pair[0][1], pair[1][1], 1))
    ).coalesce(num_partitions)

    # Per-user mean rating for mean-centering.
    user_averages = training_data.map(lambda row: (row[0], row[2])) \
        .groupByKey() \
        .map(lambda kv: (kv[0], np.mean(list(kv[1]))))

    # Residual = rating minus the user's average rating.
    # kv = (user, ((item, rating), avg_rating))
    user_resids = training_data.map(lambda row: (row[0], (row[1], row[2]))) \
        .join(user_averages) \
        .map(lambda kv: (kv[0], (kv[1][0][0], kv[1][0][1] - kv[1][1])))

    # Similarity-weighted residuals aggregated per (user, item).
    # After the join: kv = (u1, ((item, resid), (u2, sim))).
    item_adjustments = user_resids.join(
        user_groups_sim.map(lambda row: (row[0], (row[1], row[2])))
    ).map(
        lambda kv: ((kv[1][1][0], kv[1][0][0]),
                    (kv[1][0][1] * kv[1][1][1], kv[1][1][1]))
    ).groupByKey().map(
        lambda kv: (kv[0][0], kv[0][1], calc_item_adjust(kv[1]))
    )

    # Prediction = user's average rating + aggregated adjustment.
    # kv = (user, ((item, item_adj), avg_rating))
    predictions = item_adjustments.map(lambda row: (row[0], (row[1], row[2]))) \
        .join(user_averages) \
        .map(lambda kv: (kv[0], kv[1][0][0], kv[1][1] + kv[1][0][1]))

    # Predicted values can land anywhere - normalize into the observed range
    # for comparability with the content-based algorithms.
    max_rating = training_data.map(lambda row: row[2]).max()
    min_rating = training_data.map(lambda row: row[2]).min()
    if max_rating == min_rating:
        min_rating = 0  # avoid a zero-width range

    norm_predictions = predictions.map(
        lambda row: (row[0], row[1],
                     rechelp.squish_preds(row[2], min_rating, max_rating))
    )
    return norm_predictions
def calc_item_item_cf(training_data, num_partitions):
    """
    A very simple item-item collaborative-filtering algorithm in PySpark.

    Method derived from the Coursera course "Recommender Systems" taught by
    Prof Joseph Konstan (University of Minnesota) and Prof Michael Ekstrand
    (Texas State University).

    Args:
        training_data: RDD of (userId, itemId, actualRating) used to train
            the RecSys algorithm.
        num_partitions: partition count to coalesce down to after the
            cartesian product (which otherwise multiplies partitions).

    Returns:
        RDD of (userId, itemId, predictedRating), squished into the rating
        range observed in the training data.
    """
    # Group each item's rows; the cartesian product yields every item pair
    # so pairwise similarity can be scored.  NOTE: this is O(items^2).
    item_groups = training_data.groupBy(lambda row: row[1])
    item_similarity = item_groups.cartesian(item_groups).map(
        lambda pair: (pair[0][0], pair[1][0],
                      similarity(pair[0][1], pair[1][1], 0))
    ).coalesce(num_partitions)

    # For each rating a user gave, fan out to every similar item.
    # After the join: kv = (item_id, ((user, item, rating), (item1, item2, sim))).
    # Keyed output: ((user, item2), (item, rating, sim)); drop self-pairs.
    user_item_sim = training_data.keyBy(lambda row: row[1]) \
        .join(item_similarity.keyBy(lambda row: row[0])) \
        .map(lambda kv: ((kv[1][0][0], kv[1][1][1]),
                         (kv[1][0][1], kv[1][0][2], kv[1][1][2]))) \
        .filter(lambda kv: kv[0][1] != kv[1][0])

    # Collapse all (rated item, rating, similarity) rows per (user, item)
    # into a single predicted rating via the project helper.
    predictions = user_item_sim.groupByKey() \
        .map(lambda kv: (kv[0][0], kv[0][1], get_item_prob(kv[1])))

    # Predicted values can land anywhere - because we are normalizing the
    # content based algorithms we should likely normalize here too.
    max_rating = training_data.map(lambda row: row[2]).max()
    min_rating = training_data.map(lambda row: row[2]).min()
    if max_rating == min_rating:
        min_rating = 0  # avoid a zero-width range

    norm_predictions = predictions.map(
        lambda row: (row[0], row[1],
                     rechelp.squish_preds(row[2], min_rating, max_rating))
    )
    return norm_predictions
# NOTE(review): this is a verbatim duplicate of the calc_item_item_cf defined
# earlier in this file; Python keeps only the last binding, so one copy should
# be deleted once callers are confirmed.
def calc_item_item_cf(training_data, num_partitions):
    """
    A very simple item-item collaborative-filtering algorithm in PySpark.

    Method derived from the Coursera course "Recommender Systems" taught by
    Prof Joseph Konstan (University of Minnesota) and Prof Michael Ekstrand
    (Texas State University).

    Args:
        training_data: RDD of (userId, itemId, actualRating) used to train
            the RecSys algorithm.
        num_partitions: partition count to coalesce down to after the
            cartesian product.

    Returns:
        RDD of (userId, itemId, predictedRating), squished into the rating
        range observed in the training data.
    """
    # All item pairs (O(items^2)) scored by the project's similarity helper.
    item_groups = training_data.groupBy(lambda row: row[1])
    item_similarity = item_groups.cartesian(item_groups).map(
        lambda pair: (pair[0][0], pair[1][0],
                      similarity(pair[0][1], pair[1][1], 0))
    ).coalesce(num_partitions)

    # After the join: kv = (item_id, ((user, item, rating), (item1, item2, sim))).
    # Re-key as ((user, item2), (item, rating, sim)); drop self-pairs.
    user_item_sim = training_data.keyBy(lambda row: row[1]) \
        .join(item_similarity.keyBy(lambda row: row[0])) \
        .map(lambda kv: ((kv[1][0][0], kv[1][1][1]),
                         (kv[1][0][1], kv[1][0][2], kv[1][1][2]))) \
        .filter(lambda kv: kv[0][1] != kv[1][0])

    # One predicted rating per (user, item) via the project helper.
    predictions = user_item_sim.groupByKey() \
        .map(lambda kv: (kv[0][0], kv[0][1], get_item_prob(kv[1])))

    # Predicted values can land anywhere - normalize into the observed range
    # for comparability with the content-based algorithms.
    max_rating = training_data.map(lambda row: row[2]).max()
    min_rating = training_data.map(lambda row: row[2]).min()
    if max_rating == min_rating:
        min_rating = 0  # avoid a zero-width range

    norm_predictions = predictions.map(
        lambda row: (row[0], row[1],
                     rechelp.squish_preds(row[2], min_rating, max_rating))
    )
    return norm_predictions
def calc_cf_mllib(y_training_data, num_partitions=20):
    """
    Utilizes the ALS collaborative filtering algorithm in MLlib to determine
    the predicted ratings.

    Args:
        y_training_data: RDD of (userId, itemId, actualRating) used to train
            the RecSys algorithm.
        num_partitions: partition count to coalesce down to after the
            user x item cartesian product.

    Returns:
        RDD of (userId, itemId, predictedRating), squished into the rating
        range observed in the training data.
    """
    # Predicted values can land anywhere - because we are normalizing the
    # content based algorithms we should likely normalize here too.
    max_rating = y_training_data.map(lambda row: row[2]).max()
    min_rating = y_training_data.map(lambda row: row[2]).min()
    if max_rating == min_rating:
        min_rating = 0  # avoid a zero-width range

    # MLlib has two methods, train() and trainImplicit().  Implicit
    # (binary 0/1) data goes through trainImplicit().
    if min_rating == 0 and max_rating == 1:
        model = ALS.trainImplicit(y_training_data, rank=10, iterations=5)
    else:
        model = ALS.train(y_training_data, rank=10, iterations=5)

    # Predict every (user, item) pair, not just those seen in training.
    item_ids = y_training_data.map(lambda row: row[1]).distinct()
    user_ids = y_training_data.map(lambda row: row[0]).distinct()
    user_item_combo = user_ids.cartesian(item_ids).coalesce(num_partitions)

    predicted = model.predictAll(user_item_combo.map(lambda x: (x[0], x[1])))

    # predictAll yields Rating(user, product, rating) rows; index positionally.
    norm_predictions = predicted.map(
        lambda row: (row[0], row[1],
                     rechelp.squish_preds(row[2], min_rating, max_rating))
    )
    return norm_predictions
# NOTE(review): this is a verbatim duplicate of the calc_cf_mllib defined
# earlier in this file; Python keeps only the last binding, so one copy should
# be deleted once callers are confirmed.
def calc_cf_mllib(y_training_data, num_partitions=20):
    """
    Utilizes the ALS collaborative filtering algorithm in MLlib to determine
    the predicted ratings.

    Args:
        y_training_data: RDD of (userId, itemId, actualRating) used to train
            the RecSys algorithm.
        num_partitions: partition count to coalesce down to after the
            user x item cartesian product.

    Returns:
        RDD of (userId, itemId, predictedRating), squished into the rating
        range observed in the training data.
    """
    # Normalize into the observed range for comparability with the
    # content-based algorithms.
    max_rating = y_training_data.map(lambda row: row[2]).max()
    min_rating = y_training_data.map(lambda row: row[2]).min()
    if max_rating == min_rating:
        min_rating = 0  # avoid a zero-width range

    # Implicit (binary 0/1) data goes through trainImplicit(); everything
    # else uses explicit-feedback ALS.train().
    if min_rating == 0 and max_rating == 1:
        model = ALS.trainImplicit(y_training_data, rank=10, iterations=5)
    else:
        model = ALS.train(y_training_data, rank=10, iterations=5)

    # Predict every (user, item) pair, not just those seen in training.
    item_ids = y_training_data.map(lambda row: row[1]).distinct()
    user_ids = y_training_data.map(lambda row: row[0]).distinct()
    user_item_combo = user_ids.cartesian(item_ids).coalesce(num_partitions)

    predicted = model.predictAll(user_item_combo.map(lambda x: (x[0], x[1])))

    # predictAll yields Rating(user, product, rating) rows; index positionally.
    norm_predictions = predicted.map(
        lambda row: (row[0], row[1],
                     rechelp.squish_preds(row[2], min_rating, max_rating))
    )
    return norm_predictions