def Tester(spark, model, df_test, rank, regParam, alpha, K=500):
    # df_test = spark.read.parquet(formatted_test_address)
    targetUsers = df_test.select("user_id_numeric").distinct()
    userRecs = model.recommendForUserSubset(targetUsers, K)
    userRecs = userRecs.select("user_id_numeric",
                               "recommendations.track_id_numeric",
                               "recommendations.rating")

    # Get the ordered list of track_ids per user, ranked by count.
    # reference: https://stackoverflow.com/questions/46580253/collect-list-by-preserving-order-based-on-another-variable
    w = Window.partitionBy("user_id_numeric").orderBy(df_test['count'].desc())
    labels = df_test.withColumn('ActualRanking',
                                F.collect_list("track_id_numeric").over(w))
    labels = labels.select(['user_id_numeric',
                            'ActualRanking']).dropDuplicates(['user_id_numeric'])

    # Get the metrics.
    # predictionsAndlabels should be an RDD of (predicted ranking, ground truth set) pairs.
    # reference: https://spark.apache.org/docs/2.2.0/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics
    predictionsAndlabels = userRecs.join(
        labels, [labels.user_id_numeric == userRecs.user_id_numeric],
        'left').select('track_id_numeric', 'ActualRanking')
    metricsRank = RankingMetrics(predictionsAndlabels.rdd)

    print("------------------------------------------")
    print("Params: Rank %f | regParam %f | alpha = %f" % (rank, regParam, alpha))
    print("p(15) %.8f" % metricsRank.precisionAt(15))
    print("p(500) %.8f" % metricsRank.precisionAt(500))
    print("MAP %.8f" % metricsRank.meanAveragePrecision)
    print("nDCG %.8f" % metricsRank.ndcgAt(K))
    return

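# For reference, RankingMetrics consumes an RDD of (predicted ranking,
# ground-truth set) pairs, one per user -- the shape every snippet in this
# file builds. A minimal self-contained sketch of that contract; the IDs and
# the function name are illustrative, not part of any snippet here.
def _ranking_metrics_contract_demo(spark):
    from pyspark.mllib.evaluation import RankingMetrics

    # one (predicted list, ground-truth list) tuple per user
    pairs = spark.sparkContext.parallelize([
        ([1, 2, 3, 4], [1, 3]),  # user A: 2 of the top 4 are relevant
        ([5, 6, 7, 8], [9]),     # user B: no hits
    ])
    metrics = RankingMetrics(pairs)
    print(metrics.precisionAt(2))        # 0.25 = mean of 1/2 and 0/2
    print(metrics.meanAveragePrecision)  # a property, not a method
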
def annoy(als_model, user_truth, test_user, sc, n_trees=10, search_k=-1):
    print('creating annoy baseline with n_trees: ' + str(n_trees),
          'search_k: ' + str(search_k))
    sc = SparkContext.getOrCreate()
    # SparkSession (assumed imported) is needed for createDataFrame below
    spark = SparkSession.builder.getOrCreate()

    # Build an Annoy index over the ALS user factors.
    factors = als_model.userFactors
    size = factors.limit(1).select(
        F.size('features').alias('calculation')).collect()[0].calculation
    time_start = time()
    annoy_list = AnnoyIndex(size)
    for row in factors.collect():
        annoy_list.add_item(row.id, row.features)
    annoy_list.build(n_trees)
    annoy_list.save('./home/hj1325/final-project-final-project/annoy_list' +
                    str(n_trees) + '_k_' + str(search_k) + '.ann')

    # Query the index for the 500 nearest neighbors of each test user.
    recommend_list = [(user.user_label,
                       annoy_list.get_nns_by_item(int(user.user_label), 500))
                      for user in test_user.collect()]
    temp = sc.parallelize(recommend_list)
    print('recommendations have been created')
    recommend = spark.createDataFrame(temp, ['user_label', 'recommendation'])

    predictions = recommend.join(user_truth,
                                 recommend.user_label == user_truth.user_label,
                                 'inner')
    score = predictions.select('recommendation', 'truth').rdd.map(tuple)
    metrics = RankingMetrics(score)
    precision = metrics.precisionAt(500)
    mean_average_precision = metrics.meanAveragePrecision
    print('time taken: ' + str(time() - time_start))
    print('precision at 500: ' + str(precision))
    print('mean average precision: ' + str(mean_average_precision))
    annoy_list.unload()

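# The Annoy pattern used above, in isolation: build an index over dense
# vectors, then query nearest neighbors. A minimal self-contained sketch
# (the vectors and function name are made up; requires the `annoy` package):
def _annoy_pattern_demo():
    from annoy import AnnoyIndex

    dim = 3
    index = AnnoyIndex(dim, 'angular')  # recent annoy versions require a metric
    index.add_item(0, [1.0, 0.0, 0.0])
    index.add_item(1, [0.9, 0.1, 0.0])
    index.add_item(2, [0.0, 0.0, 1.0])
    index.build(10)  # 10 trees
    # the 2 nearest neighbors of item 0 (including itself)
    print(index.get_nns_by_item(0, 2))  # -> [0, 1]
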
def main(spark, model_file):
    ### train = spark.read.parquet('./train.parquet')
    # validation = spark.read.parquet('./validation.parquet')
    test = spark.read.parquet('./test.parquet')

    train_model = ALSModel.load(model_file)
    users = test.select('convert_user_id').distinct()
    user_recs = train_model.recommendForUserSubset(users, 500)
    prediction_df = user_recs.select('convert_user_id',
                                     'recommendations.convert_track_id')
    true_df = test.groupBy('convert_user_id').agg(
        expr('collect_list(convert_track_id) as true_items'))

    prediction_df.write.parquet('./recommendation_count.parquet')
    true_df.write.parquet('./true_count.parquet')

    prediction_rdd = prediction_df.join(true_df, 'convert_user_id') \
        .rdd \
        .map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(prediction_rdd)
    print(rankingMetrics.meanAveragePrecision)
    print(rankingMetrics.precisionAt(500))

def main(spark, test_file, index_file, model_file):
    # Load the dataframe
    test = spark.read.parquet(test_file)

    # transform user and track ids
    indexer = PipelineModel.load(index_file)
    test = indexer.transform(test)

    # select distinct users for recommendations
    # testUsers = test.select("userNew").distinct().alias("userCol")

    # establish "ground truth"
    groundTruth = test.groupby("userNew").agg(
        F.collect_list("trackNew").alias("truth"))
    print("created ground truth df")

    alsmodel = ALSModel.load(model_file)
    rec = alsmodel.recommendForAllUsers(500)
    print("created recs")

    predictions = rec.join(groundTruth, rec.userNew == groundTruth.userNew,
                           'inner')
    scoreAndLabels = predictions.select('recommendations.trackNew',
                                        'truth').rdd.map(tuple)
    metrics = RankingMetrics(scoreAndLabels)
    precision = metrics.precisionAt(500)
    map_out = metrics.meanAveragePrecision
    print(f"precision at 500: {precision}")
    print(f"map : {map_out}")

def main(spark, model_file, data_file):
    '''Main routine for supervised evaluation

    Parameters
    ----------
    spark : SparkSession object

    model_file : string, path to load the serialized model file from

    data_file : string, path to the parquet file to load
    '''
    df = spark.read.parquet(data_file)
    model = PipelineModel.load(model_file)
    predictions = model.transform(df)
    # predictions_sorted = predictions.orderBy(desc('count')).limit(500).collect()

    scoreAndLabels = predictions.select('prediction', 'count')
    scoreAndLabels.show(5)
    # NOTE: RankingMetrics expects one (predicted list, ground-truth list)
    # pair per user; flat (prediction, count) rows would need to be grouped
    # per user first (see the reshaping sketch below).
    scoreAndLabels = scoreAndLabels.rdd.map(tuple)
    metrics = RankingMetrics(scoreAndLabels)
    precision = metrics.precisionAt(500)
    print(precision)

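# The reshaping that main() above would need: RankingMetrics scores ranked
# lists per user, so flat per-row predictions must be grouped into one list
# per user first. A sketch assuming columns named 'user', 'item', and
# 'prediction' (the column and function names are assumptions):
def _per_user_prediction_lists(predictions, k=500):
    import pyspark.sql.functions as F
    from pyspark.sql import Window

    # rank items within each user by predicted score, keep the top k,
    # and collect them into a list per user
    w = Window.partitionBy('user').orderBy(F.col('prediction').desc())
    return (predictions
            .withColumn('rank', F.row_number().over(w))
            .where(F.col('rank') <= k)
            .groupBy('user')
            .agg(F.collect_list('item').alias('pred_items')))
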
def evaluateTopk(model, data, top_k=500):
    '''
    Input: data: RDD - user, product (book_id), rating
    '''
    truth = spark.createDataFrame(data).groupby("user").agg(
        F.collect_set("product"))
    print("Getting Predictions...")
    tmp1 = model.recommendProductsForUsers(top_k).map(
        lambda r: [r[0], [k.product for k in r[1]]])
    predictions = spark.createDataFrame(tmp1, ["user", "predictions"])

    print("Predictions and Labels...")
    k = predictions.join(truth, truth.user == predictions.user)
    final = k.rdd.map(lambda r: [r[1], r[3]])
    metrics = RankingMetrics(final)

    print("\nCalculate NDCG at {}...".format(top_k))
    res1 = metrics.ndcgAt(top_k)
    print("NDCG at {}: {}".format(top_k, res1))

    print("\nCalculate MAP...")
    res2 = metrics.meanAveragePrecision
    print("MAP: {}".format(res2))

    print("\nCalculate Precision at {}...".format(top_k))
    res3 = metrics.precisionAt(top_k)
    print("Precision at {}: {}".format(top_k, res3))
    return res1, res2, res3

def main(spark, txt):
    model = ALSModel.load('hdfs:/user/jm7955/' + args.model)

    distinct_users = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' % args.distinct)
    print("distinct_users")
    print('elapsed: %d seconds' % int(timer() - start))
    # distinct_users.show()

    labels = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' % args.labels)
    print("labels")
    # labels.show()
    print('elapsed: %d seconds' % int(timer() - start))

    predictions = model.recommendForUserSubset(distinct_users, 500)\
        .select('user', F.col('recommendations.item').alias('item'))
    print("predictions")
    # predictions.show()
    print('elapsed: %d seconds' % int(timer() - start))

    predictionsAndLabels = predictions.join(
        labels, ["user"], "inner").rdd.map(lambda tup: (tup[1], tup[2]))
    print("predictionsAndLabels")
    print('elapsed: %d seconds' % int(timer() - start))

    metrics = RankingMetrics(predictionsAndLabels)
    print('elapsed: %d seconds' % int(timer() - start))

    with open(txt, 'w') as file:
        file.write('metrics.meanAveragePrecision: %s\n' % metrics.meanAveragePrecision)
        file.write('metrics.precisionAt(500) %s\n' % metrics.precisionAt(500))
        file.write('metrics.ndcgAt(500) %s\n' % metrics.ndcgAt(500))

def main(spark, model_file, test_file):
    '''
    Parameters
    ----------
    spark : SparkSession object

    model_file : string, path to the serialized model file to load

    test_file : string, path to the parquet file to load
    '''
    # Load the parquet file
    test = spark.read.parquet(test_file)
    test = test.sort('user', ascending=False)
    test.createOrReplaceTempView('test_table')

    model = ALSModel.load(model_file)
    user_subset = test.select("user").distinct()
    user_subset = model.recommendForUserSubset(user_subset, 500)
    user_subset = user_subset.select("user",
                                     col("recommendations.item").alias("item"))
    user_subset = user_subset.sort('user', ascending=False)
    print("sorted by user")

    # NOTE: RankingMetrics expects the ground-truth side to be a list per
    # user; if `test` holds one item per row, collect_list per user first.
    predictionAndLabels = user_subset.join(
        test, ["user"], "inner").rdd.map(lambda tup: (tup[1], tup[2]))
    print("joined predictions and counts")

    metrics = RankingMetrics(predictionAndLabels)
    print("made metrics")
    MAP = metrics.meanAveragePrecision
    precision = metrics.precisionAt(500)
    ndcg = metrics.ndcgAt(500)
    print('MAP: %f' % MAP)
    print('Precision: %f' % precision)
    print('NDCG: %f' % ndcg)

def annoy(alsmodel, groundTruth, testUsers, sc, n_trees=10, search_k=-1):
    print(f"annoy index version with n_trees: {n_trees}, search_k: {search_k}")
    sc = SparkContext.getOrCreate()
    # SparkSession (assumed imported) is needed for createDataFrame below
    spark = SparkSession.builder.getOrCreate()

    # Build an Annoy index over the ALS user factors.
    userfactors = alsmodel.userFactors
    size = userfactors.limit(1).select(
        F.size("features").alias("calc_size")).collect()[0].calc_size
    start_time = time()
    a = AnnoyIndex(size)
    for row in userfactors.collect():
        a.add_item(row.id, row.features)
    a.build(n_trees)
    a.save("./anns/annoy_t" + str(n_trees) + "_k_" + str(search_k) + ".ann")

    # Query the index for the 500 nearest neighbors of each test user.
    rec_list = [(u.userNew, a.get_nns_by_item(int(u.userNew), 500))
                for u in testUsers.collect()]
    temp = sc.parallelize(rec_list)
    print("created recs")
    rec = spark.createDataFrame(temp, ["userNew", "recs"])

    predictions = rec.join(groundTruth, rec.userNew == groundTruth.userNew,
                           'inner')
    scoreAndLabels = predictions.select('recs', 'truth').rdd.map(tuple)
    metrics = RankingMetrics(scoreAndLabels)
    precision = metrics.precisionAt(500)
    MAP = metrics.meanAveragePrecision
    print(f"time elapsed: {time()-start_time}s")
    print(f"precision at 500: {precision}")
    print(f"MAP: {MAP}")
    a.unload()

def mAPandprecisionatK(spark, model, k, labels, user_ids):
    '''
    Print meanAveragePrecision and precision at k.

    Parameters
    ----------
    spark : spark session object
    model : MLlib model, the trained recommendation model
    k : int, top-k predictions for every user
    labels : RDD, actual labels per user
    user_ids : user_ids to recommend products for

    Returns
    -------
    None
    '''
    recs = []
    for uid in user_ids:
        # recommend k products for each user
        temp_recs = model.recommendProducts(uid.user_id, k)
        # collect only the book_ids from the recommendations
        recs.append([temp_rec.product for temp_rec in temp_recs])
    l = labels.map(lambda tup: float(tup[1])).collect()
    # NOTE: RankingMetrics expects one (predicted list, ground-truth list)
    # pair per user; a single (recs, l) pair scores all users as one query
    # (see the pairing sketch below).
    rdd = spark.sparkContext.parallelize([(recs, l)])
    m = RankingMetrics(rdd)
    print("meanAveragePrecision {}".format(m.meanAveragePrecision))
    print("Precision at K for K = {} is {}".format(k, m.precisionAt(k)))

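# A corrected pairing sketch for mAPandprecisionatK() above: one
# (predictions, labels) tuple per user. The exact labels schema is not shown
# above, so `truth_by_user` (a dict of user_id -> list of relevant product
# ids) and the function name are assumptions:
def _per_user_pairs(spark, model, k, user_ids, truth_by_user):
    pairs = []
    for uid in user_ids:
        recs = model.recommendProducts(uid.user_id, k)
        pred_items = [r.product for r in recs]
        # pair each user's ranked predictions with that user's ground truth
        pairs.append((pred_items, truth_by_user.get(uid.user_id, [])))
    return spark.sparkContext.parallelize(pairs)
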
def evaluation(df, model, ks):
    '''
    Evaluate the model.
    ks: a list of parameter k used in precision at k and NDCG at k.
    '''
    sc = SparkContext.getOrCreate()

    print('  Make predictions...')
    predictions = model.recommendForUserSubset(df, 500)

    print('  Prepare ground truth set and predicted set...')
    labels = df.groupBy('user').agg(F.collect_set('item')).collect()
    user_pred = predictions.select('user', 'recommendations.item').collect()
    # Align the two lists by sorting both on user id
    # (assumes the same set of users appears in both).
    labels = sorted(labels, key=lambda x: x.user)
    user_pred = sorted(user_pred, key=lambda x: x.user)

    print('  Combine ground truth set and predicted set...')
    predictionAndLabels = []
    for i in range(len(user_pred)):
        predictionAndLabels.append((user_pred[i].item, labels[i][1]))

    print('  Parallelize...')
    predictionAndLabels = sc.parallelize(predictionAndLabels, numSlices=2000)

    print('  Calculate metrics...')
    metrics = RankingMetrics(predictionAndLabels)
    eval_results = [metrics.meanAveragePrecision]
    for k in ks:
        eval_results.append(metrics.precisionAt(k))
        eval_results.append(metrics.ndcgAt(k))
    return eval_results

def get_rankMetrics(spark, df, trained_model, approx=False, k=500):
    """
    Evaluate the performance of a given model on a given dataset using
    ranking metrics, and return the final performance metrics.

    Parameters
    ----------
    df: DataFrame to evaluate on
    trained_model: trained model to evaluate
    approx: boolean; use ANN (approximate nearest neighbors) when True
    k: number of recommendations
    ----------
    """
    import datetime
    import nmslib_recommend2
    import pyspark.sql.functions as F
    from pyspark.mllib.evaluation import RankingMetrics

    # change column names
    df = df.select(['user_id', 'book_id', 'rating']).toDF('user', 'item', 'rating')
    # an item is relevant if its rating >= 3
    df = df.withColumn('rating', F.when(df.rating >= 3, 1.0).otherwise(0.0))
    relevant = df[df.rating == 1.0].groupBy('user').agg(F.collect_list('item'))

    # recommend k items for each user
    print("recommendation time comparison start: ",
          datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    if approx:
        recommend = nmslib_recommend2.nmslib_recommend(spark, df, trained_model, k)
        recommend = spark.createDataFrame(recommend, ["user", "recommend"])
        joined = recommend.join(relevant, on='user')
        rec_and_rel = []
        for user, rec, rel in joined.collect():
            rec_and_rel.append((rec, rel))
    else:
        userSubset = relevant.select('user')
        recommend = trained_model.recommendForUserSubset(userSubset, k)
        joined = recommend.join(relevant, on='user')
        rec_and_rel = []
        for user, rec, rel in joined.collect():
            predict_items = [i.item for i in rec]
            rec_and_rel.append((predict_items, rel))
    print("recommendation time comparison end: ",
          datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    # Compute metrics
    rec_and_rel_rdd = spark.sparkContext.parallelize(rec_and_rel)
    metric_class = RankingMetrics(rec_and_rel_rdd)
    ndcg = metric_class.ndcgAt(k)
    map_ = metric_class.meanAveragePrecision
    pk = metric_class.precisionAt(k)
    print("NDCG:", ndcg, "\nMAP:", map_, "\nPrecision:", pk)
    return ndcg, map_, pk

def precision_at_k(self, k):
    """
    Calculate precision at k for the predicted rankings

    :param k: int, calculate precision at k
    :return : float, precision at k
    """
    rank = self.pred_rankings.rdd.map(lambda tup: (tup[2], tup[1]))
    metrics = RankingMetrics(rank)
    return metrics.precisionAt(k)

def basic_rec_val(spark, dirname, rank, regParam, k, random_seed):
    val_set = spark.read.parquet(f'{dirname}/val.parquet')
    print(f'Validating on model with rank = {rank} and regParam = {regParam} '
          f'trained using {dirname} data ...')

    # load corresponding trained model
    model = ALSModel.load(f'{dirname}/{rank}_{regParam}_model')

    # computing RMSE on validation set
    predictions = model.transform(val_set)
    evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                                    predictionCol='prediction')
    rmse = evaluator.evaluate(predictions)
    print(f'rmse: {rmse}')

    print(f'Constructing top {k} books recommended per user ...')
    val_users = val_set.select('user_id').distinct()
    start_time = time.time()
    perUserPredictedTopKItemsDF = model.recommendForUserSubset(val_users, k)
    myudf = udf(extract_item, ArrayType(IntegerType()))
    perUserPredictedTopKItemsDF = perUserPredictedTopKItemsDF.withColumn(
        'predictions',
        myudf(perUserPredictedTopKItemsDF['recommendations'])).drop(
            'recommendations')

    print('Constructing actual books per user ...')
    perUserActualItemsDF = val_set.filter(
        column('rating') >= 3.0).groupBy('user_id').agg(
            expr('collect_list(book_id) as book_ids'))

    print('Constructing Ranking Metrics ...')
    perUserItemsRDD = perUserPredictedTopKItemsDF.join(
        perUserActualItemsDF, 'user_id').rdd.map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(perUserItemsRDD)
    precisionAtK = rankingMetrics.precisionAt(k)
    mAP = rankingMetrics.meanAveragePrecision
    end_time = time.time()
    time_delta = str(datetime.timedelta(seconds=end_time - start_time))
    print(f'p@{k}: {precisionAtK}')
    print(f'mAP: {mAP}')
    print(f'run time: {time_delta}')

def top_k_rankingmetrics(dataset=None, k=10, ranking_metrics="precisionAt",
                         user="******", item="book_id", rating="rating",
                         prediction="prediction"):
    '''
    Compute ranking metrics from predictions.

    Input:
    1. k: only evaluate the performance of the top k items
    2. ranking_metrics: precisionAt, meanAveragePrecision, ndcgAt
    3. user, item, prediction: column names; string type

    refer to https://vinta.ws/code/spark-ml-cookbook-pyspark.html
    '''
    if dataset is None:
        print("Error! Please specify a dataset.")
        return

    # prediction table
    windowSpec = Window.partitionBy(user).orderBy(col(prediction).desc())
    perUserPredictedItemsDF = dataset \
        .select(user, item, prediction, F.rank().over(windowSpec).alias('rank')) \
        .where('rank <= {}'.format(k)) \
        .groupBy(user) \
        .agg(expr('collect_list({}) as items'.format(item)))

    # actual target table
    windowSpec = Window.partitionBy(user).orderBy(col(rating).desc())
    perUserActualItemsDF = dataset \
        .select(user, item, rating, F.rank().over(windowSpec).alias('rank')) \
        .where('rank <= {}'.format(k)) \
        .groupBy(user) \
        .agg(expr('collect_list({}) as items'.format(item)))

    # join
    perUserItemsRDD = perUserPredictedItemsDF \
        .join(F.broadcast(perUserActualItemsDF), user, 'inner') \
        .rdd \
        .map(lambda row: (row[1], row[2]))
    ranking_metrics_evaluator = RankingMetrics(perUserItemsRDD)

    # get the result of the metric
    if ranking_metrics == "precisionAt":
        precision_at_k = ranking_metrics_evaluator.precisionAt(k)
        # print("precisionAt: {}".format(round(precision_at_k, 4)))
        return precision_at_k
    elif ranking_metrics == "meanAveragePrecision":
        # meanAveragePrecision is a property, not a method
        mean_avg_precision = ranking_metrics_evaluator.meanAveragePrecision
        # print("meanAveragePrecision: {}".format(round(mean_avg_precision, 4)))
        return mean_avg_precision
    elif ranking_metrics == "ndcgAt":
        ndcg_at_k = ranking_metrics_evaluator.ndcgAt(k)
        # print("ndcgAt: {}".format(round(ndcg_at_k, 4)))
        return ndcg_at_k

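# A hedged usage example for top_k_rankingmetrics() above; `preds_df` and the
# column names passed here are assumptions for a DataFrame of per-user scored
# items:
#
# p10 = top_k_rankingmetrics(dataset=preds_df, k=10,
#                            ranking_metrics="precisionAt",
#                            user="user_id", item="book_id",
#                            rating="rating", prediction="prediction")
# print("precision@10:", p10)
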
def main(spark, test_file, train_file, model_path):
    # Read data from parquet
    print('Reading parquet file ...')
    test = spark.read.parquet(test_file)
    test.createOrReplaceTempView('test')
    train = spark.read.parquet(train_file)
    train.createOrReplaceTempView('train')

    # Load the best model from training
    print('Loading model ...')
    best_model = ALSModel.load(model_path)

    # get recommendations for users in test set
    print('Evaluating model on test set ...')
    test_users = test.select("user_id").distinct()
    rec_test = best_model.recommendForUserSubset(test_users, 700)
    pred_test_700 = rec_test.select(
        rec_test.user_id,
        rec_test.recommendations.book_id.alias('rec_book_id'))

    # books each test user already saw in training, to filter out of the recs
    sub_train_test = spark.sql('SELECT user_id, book_id \
                                FROM train \
                                WHERE user_id IN (SELECT DISTINCT user_id FROM test)')
    df_train_book_test = sub_train_test.groupby('user_id').agg(
        F.collect_set('book_id').alias('train_book_id'))
    df_join_test = pred_test_700.join(df_train_book_test, 'user_id')
    # book_diff (assumed defined elsewhere) removes the training books
    # from the recommendation list
    diff = F.udf(book_diff, ArrayType(IntegerType()))
    df_join_pred_test = df_join_test.withColumn(
        'predictions',
        diff(df_join_test.rec_book_id, df_join_test.train_book_id))
    pred_test = df_join_pred_test.select(df_join_pred_test.user_id,
                                         df_join_pred_test.predictions)

    # get true preferences of users in test set
    label_test = test.filter(test.rating >= 3).groupby("user_id").agg(
        F.collect_list("book_id"))
    predAndLabel_test = pred_test.join(
        label_test, 'user_id').rdd.map(lambda row: (row[1], row[2]))

    # Use Mean Average Precision as evaluation metric
    metrics_test = RankingMetrics(predAndLabel_test)
    MAP_test = metrics_test.meanAveragePrecision
    pak_100_test = metrics_test.precisionAt(100)
    pak_500_test = metrics_test.precisionAt(500)
    print('\n')
    print('Ranking scores of the best model on test data: '
          'MAP = {}, Precision@100 = {}, Precision@500 = {}'
          .format(MAP_test, pak_100_test, pak_500_test))

def dummy_run(spark):
    from pyspark.ml.recommendation import ALS
    from pyspark.mllib.evaluation import RankingMetrics
    import pyspark.sql.functions as F
    from pyspark.sql.functions import expr

    train = spark.createDataFrame(
        [
            (82, 124, 5.0),
            (64, 123, 4.0),
            (27, 122, 3.0),
            (25, 122, 1.0),
            (12, 124, 2.0)
        ],
        ['user_id', 'book_id', 'rating']
    )
    val = spark.createDataFrame(
        [
            (82, 123, 5.0),
            (64, 122, 4.0),
            (27, 124, 3.0),
            (64, 123, 2.0),
            (12, 122, 4.0)
        ],
        ['user_id', 'book_id', 'rating']
    )

    user_id = val.select('user_id').distinct()
    true_label = val.select('user_id', 'book_id')\
        .groupBy('user_id')\
        .agg(expr('collect_list(book_id) as true_item'))

    als = ALS(rank=3, regParam=0.1, userCol="user_id", itemCol="book_id",
              ratingCol='rating', implicitPrefs=False,
              coldStartStrategy="drop")
    model = als.fit(train)
    recs = model.recommendForUserSubset(user_id, 2)
    pred_labels = recs.select('user_id', 'recommendations.book_id')
    pred_true_rdd = pred_labels.join(F.broadcast(true_label), 'user_id', 'inner') \
        .rdd \
        .map(lambda row: (row[1], row[2]))

    metrics = RankingMetrics(pred_true_rdd)
    mean_ap = metrics.meanAveragePrecision
    ndcg_at_k = metrics.ndcgAt(2)
    p_at_k = metrics.precisionAt(2)
    print('MAP: ', mean_ap, 'NDCG: ', ndcg_at_k, 'Precision at k: ', p_at_k)
    return

def hyperparameter_tuning(spark, train_file, val_file):
    # load in the data
    train = spark.read.parquet(train_file)
    train.createOrReplaceTempView('train')
    val = spark.read.parquet(val_file)
    val.createOrReplaceTempView('val')

    # index string ids to integer ids; fit on train, apply to both splits
    user_idxer = StringIndexer(inputCol='user_id', outputCol='user',
                               handleInvalid="skip")
    item_idxer = StringIndexer(inputCol='book_id', outputCol='item',
                               handleInvalid="skip")
    pipeline = Pipeline(stages=[user_idxer, item_idxer])
    indexers = pipeline.fit(train)
    train = indexers.transform(train)
    val = indexers.transform(val)
    val = val.withColumn('item', val['item'].cast('int'))
    val = val.withColumn('user', val['user'].cast('int'))
    val_users = val.select('user').distinct()
    val_groundtruth = val.groupby('user').agg(
        F.collect_list('item').alias('truth')).cache()

    # ranks to test
    # ranks = [1, 2, 5, 10, 50]
    ranks = [10]
    # regParams to test
    # lambdas = [0.01, 0.1, 1, 2, 10]
    lambdas = [0.01]

    # Set up list for results
    p = []
    iters = len(ranks) * len(lambdas)
    count = 0
    for r in ranks:
        for lam in lambdas:
            print('regParam: {}, Rank: {}'.format(lam, r))
            als = ALS(regParam=lam, rank=r, userCol='user', itemCol='item',
                      seed=2020, ratingCol='rating', nonnegative=True,
                      coldStartStrategy='drop',
                      intermediateStorageLevel='MEMORY_AND_DISK',
                      finalStorageLevel='MEMORY_AND_DISK')
            model = als.fit(train)
            rec = model.recommendForAllUsers(500)
            predictions = rec.join(val_groundtruth,
                                   rec.user == val_groundtruth.user, 'inner')
            predictions = predictions.select('recommendations.item', 'truth')
            predictionAndLabels = predictions.rdd.map(tuple).repartition(1000)
            metrics = RankingMetrics(predictionAndLabels)
            precision = metrics.precisionAt(500)
            MAP = metrics.meanAveragePrecision
            p.append([lam, r, MAP, precision, model, als])
            count += 1
            print('precision: {}, MAP: {}'.format(precision, MAP))
            print('done with iter {} out of {}'.format(count, iters))

def baseline(als_model, user_truth, test_user):
    print('creating baseline model')
    time_start = time()
    recommend = als_model.recommendForUserSubset(test_user, 500)
    print('recommendation has been created.')
    predictions = recommend.join(user_truth,
                                 recommend.user_label == user_truth.user_label,
                                 'inner')
    score = predictions.select('recommendations.book_label',
                               'truth').rdd.map(tuple)
    metrics = RankingMetrics(score)
    precision = metrics.precisionAt(500)
    mean_average_precision = metrics.meanAveragePrecision
    print('time taken: ' + str(time() - time_start))
    print('precision at 500: ' + str(precision))
    print('mean average precision: ' + str(mean_average_precision))

def main(spark, train_file, test_file, output_file):
    # redirect stdout to the output file
    sys.stdout = open(output_file, 'w')

    # Read data from parquet
    print('Reading parquet files ...')
    train = spark.read.parquet(train_file)
    train.createOrReplaceTempView('train')
    test = spark.read.parquet(test_file)
    test.createOrReplaceTempView('test')

    print('Recommending the 500 most popular books by average rating '
          'in the training set ...')
    # get recommendations for users in test set
    test_users = test.select("user_id").distinct()
    book_top500_train = spark.sql('SELECT book_id, AVG(rating) \
                                   FROM train \
                                   WHERE book_id IN (SELECT DISTINCT book_id \
                                                     FROM train \
                                                     GROUP BY book_id \
                                                     HAVING COUNT(*) >= 20) \
                                   GROUP BY book_id \
                                   ORDER BY AVG(rating) DESC \
                                   LIMIT 500')
    rec_list = book_top500_train.select(book_top500_train.book_id).agg(
        F.collect_list('book_id'))
    # pair every test user with the same top-500 list
    rec = test_users.rdd.cartesian(
        rec_list.rdd).map(lambda row: (row[0][0], row[1][0])).toDF()
    pred = rec.select(rec._1.alias('user_id'), rec._2.alias('pred'))

    print('Collecting true labels for each test user')
    # get true preferences of users in test set: books rated at least 3
    sub_test = spark.sql('SELECT user_id, book_id FROM test WHERE rating >= 3')
    label = sub_test.groupby('user_id').agg(
        F.collect_set('book_id').alias('label'))
    predAndLabel = pred.join(label,
                             'user_id').rdd.map(lambda row: (row[1], row[2]))

    # Use Mean Average Precision as evaluation metric
    metrics = RankingMetrics(predAndLabel)
    MAP = metrics.meanAveragePrecision
    pat500 = metrics.precisionAt(500)
    print('Scores on test set: MAP = {} and Precision at 500 = {}'.format(
        MAP, pat500))

def annoy_model(als_model, sc, groundTruth_test, test_users, n_trees=10, search_k=-1):
    print(f"annoy model with n_trees: {n_trees}, search_k: {search_k}")
    sc = SparkContext.getOrCreate()
    # SparkSession (assumed imported) is needed for createDataFrame below
    spark = SparkSession.builder.getOrCreate()

    # Build an Annoy index over the ALS user factors.
    user_factors = als_model.userFactors
    size = user_factors.limit(1).select(
        F.size("features").alias("calc_size")).collect()[0].calc_size
    start_time = time()
    index_size = AnnoyIndex(size)
    for row in user_factors.collect():
        index_size.add_item(row.id, row.features)
    index_size.build(n_trees)
    index_size.save("./annoy_result/annoy_t" + str(n_trees) + "_k_" +
                    str(search_k) + ".ann")

    # Query the index for the 500 nearest neighbors of each test user.
    rec_list = [(user.user_id,
                 index_size.get_nns_by_item(int(user.user_id), 500))
                for user in test_users.collect()]
    temp = sc.parallelize(rec_list)
    print("Annoy recommendations (500) created for test users")
    rec = spark.createDataFrame(temp, ["user_id", "recommendations"])

    pred_test = rec.join(groundTruth_test,
                         rec.user_id == groundTruth_test.user_id, 'inner')
    predAndLabels_test_annoy = pred_test.select(
        'recommendations', 'test_truth').rdd.map(tuple)
    metrics_test_annoy = RankingMetrics(predAndLabels_test_annoy)
    precision_test_annoy = metrics_test_annoy.precisionAt(500)
    map_test_annoy = metrics_test_annoy.meanAveragePrecision
    print(f"Time taken: {time() - start_time}s")
    print(f"Precision at 500: {precision_test_annoy}")
    print(f"Mean Average Precision: {map_test_annoy}")
    index_size.unload()

def baseline(alsmodel, groundTruth, testUsers):
    print("baseline version")
    start_time = time()
    rec = alsmodel.recommendForUserSubset(testUsers, 500)
    print("created recs")
    predictions = rec.join(groundTruth, rec.userNew == groundTruth.userNew,
                           'inner')
    scoreAndLabels = predictions.select('recommendations.trackNew',
                                        'truth').rdd.map(tuple)
    metrics = RankingMetrics(scoreAndLabels)
    precision = metrics.precisionAt(500)
    MAP = metrics.meanAveragePrecision
    print(f"time elapsed: {time()-start_time}s")
    print(f"precision at 500: {precision}")
    print(f"MAP: {MAP}")

def __evaluate_ranking(self, rnk_inf: SparkDF):
    test_ground_truth = self.__test.groupBy("user_id").agg(
        collect_list("business_id").alias("business_gt"))
    pred_with_labels = rnk_inf.join(test_ground_truth,
                                    on="user_id").drop("user_id")
    # convert Rows to (predictions, labels) tuples for RankingMetrics
    metrics = RankingMetrics(pred_with_labels.rdd.map(tuple))
    results = {}
    for m in self.ranking_metrics:
        metric_name = "{}@{}".format(m, self.top_k)
        if "ndcg" in m:
            results[metric_name] = metrics.ndcgAt(self.top_k)
        elif m == "precision":
            results[metric_name] = metrics.precisionAt(self.top_k)
    return results

def recsys(spark):
    # Load data from parquet
    val = spark.read.parquet("val_set.parquet")
    test = spark.read.parquet("test_set.parquet")
    cols_to_drop = ['is_read', 'is_reviewed']
    test = test.drop(*cols_to_drop)
    val = val.drop(*cols_to_drop)

    # Load model from path
    model_path = "hdfs:/user/ago265/best_model"
    best_model = ALSModel.load(model_path)

    # Compile a list of all the books each user read
    val_users = val.select("user_id").distinct()
    val_books = val.select("user_id", "book_id")\
        .groupBy("user_id")\
        .agg(expr('collect_list(book_id) as books'))
    test_users = test.select("user_id").distinct()
    test_books = test.select("user_id", "book_id").groupBy("user_id").agg(
        expr('collect_list(book_id) as books'))

    # # Recommender System for all users at k=500
    # k = 500
    # print('Making top 500 recommendations for all users')
    # rec = best_model.recommendForAllUsers(k)

    # Recommender System for subset of users at k=10
    k = 10
    print('Making top {} recommendations for a subset of users'.format(k))
    rec = best_model.recommendForUserSubset(test_users, k)
    pred_label = rec.select('user_id', 'recommendations.book_id')

    # Create an RDD to evaluate with Ranking Metrics
    final_df = pred_label.join(test_books, ['user_id'],
                               'inner').select('book_id', 'books')
    final_rdd = final_df.rdd.map(lambda x: (x.book_id, x.books))
    metrics = RankingMetrics(final_rdd)
    result1 = metrics.meanAveragePrecision
    result2 = metrics.precisionAt(k)
    result3 = metrics.ndcgAt(k)
    print("MAP = ", result1)
    print("Precision at k = ", result2)
    print("NDCG at k = ", result3)

def brute_force(als_model, groundTruth_test, test_users):
    print("Normal recommender system (brute force)")
    start_time = time()
    rec = als_model.recommendForUserSubset(test_users, 500)
    print("Normal: 500 recommendations for test users generated")
    predictions_test = rec.join(groundTruth_test,
                                rec.user_id == groundTruth_test.user_id,
                                'inner')
    predAndLabels_test = predictions_test.select('recommendations.book_id',
                                                 'test_truth').rdd.map(tuple)
    metrics_test = RankingMetrics(predAndLabels_test)
    precision_test = metrics_test.precisionAt(500)
    map_test = metrics_test.meanAveragePrecision
    print(f"Time taken: {time() - start_time}s")
    print(f"Precision at 500: {precision_test}")
    print(f"Mean Average Precision: {map_test}")

def get_val_metrics(model, val):
    preds = model.transform(val)
    recs = model.recommendForUserSubset(val, 500)
    top_items = recs.selectExpr('user as user',
                                'recommendations.item as top_items')
    true_items = val.where(val.rating >= 3).groupby('user').agg(
        collect_list('item').alias('true_item_list'))
    # RankingMetrics expects (predictions, labels) order
    predictions_and_labels_rankings = top_items.join(true_items, how='inner',
                                                     on='user')\
        .select('top_items', 'true_item_list')
    predictions_and_labels_rankings.write.json('val_recs.json')

    ranking_metrics = RankingMetrics(predictions_and_labels_rankings.cache().rdd)
    prec_at = ranking_metrics.precisionAt(500)
    mean_avg_prec = ranking_metrics.meanAveragePrecision
    ndcg = ranking_metrics.ndcgAt(500)

    evaluator = RegressionEvaluator(predictionCol='prediction',
                                    labelCol='rating', metricName='rmse')
    rmse = evaluator.evaluate(preds)
    return rmse, prec_at, mean_avg_prec, ndcg

def main(spark):
    val_df = spark.read.parquet(
        'hdfs:/user/jm7955/test_full_indexed.parquet').drop('count')
    labels = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' % args.labels)

    # popularity baseline: the 500 most frequent items overall
    predictions = val_df.groupBy("item").count().orderBy(
        "count", ascending=False).limit(500).collect()
    predictions = [row.item for row in predictions]
    print("predictions")
    print('elapsed: %d seconds' % int(timer() - start))

    # every user gets the same popularity-ranked list
    predictionsAndLabels = labels.rdd.map(lambda tup: (predictions, tup[1]))
    print("predictionsAndLabels")
    print('elapsed: %d seconds' % int(timer() - start))

    metrics = RankingMetrics(predictionsAndLabels)
    print('elapsed: %d seconds' % int(timer() - start))
    print('metrics.meanAveragePrecision: %s\n' % metrics.meanAveragePrecision)
    print('metrics.precisionAt(500) %s\n' % metrics.precisionAt(500))
    print('metrics.ndcgAt(500) %s\n' % metrics.ndcgAt(500))

def main(spark, model_file, data_file, K):
    '''Main routine for Collaborative Filtering Model testing

    Parameters
    ----------
    spark: SparkSession object

    model_file: string, path to load the model from

    data_file: string, path to the parquet file to load

    K: int, evaluations are based on predictions of the top K items for each user
    '''
    testIdx = spark.read.parquet(data_file)
    model = ALSModel.load(model_file)

    users_val = testIdx.select("user_idx").distinct()
    perUserPredictedItemsDF = model.recommendForUserSubset(users_val, K)
    perUserPredictedItemsDF = perUserPredictedItemsDF.select(
        "user_idx", "recommendations.track_idx").withColumnRenamed(
            'user_idx', 'user').withColumnRenamed('track_idx', 'items')

    w2 = Window.partitionBy('user_idx').orderBy(col('count').desc())
    perUserActualItemsDF = testIdx.select(
        'user_idx', 'track_idx', 'count',
        F.rank().over(w2).alias('rank')).where(
            'rank <= {0}'.format(K)).groupBy('user_idx').agg(
                expr('collect_list(track_idx) as items')).withColumnRenamed(
                    'user_idx', 'user')

    perUserItemsRDD = perUserPredictedItemsDF.join(
        perUserActualItemsDF, 'user').rdd.map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(perUserItemsRDD)
    print("============================================")
    print("meanAveragePrecision = %.8f" % rankingMetrics.meanAveragePrecision)
    print("precisionAt(K) = %.8f" % rankingMetrics.precisionAt(K))
    print("ndcgAt(K) = %.8f" % rankingMetrics.ndcgAt(K))

def main(spark, data_file, val_file, model_file):
    # Load the dataframes (sampled down for quick iteration)
    df = spark.read.parquet(data_file)
    df = df.sample(True, 0.0001)
    val_df = spark.read.parquet(val_file)
    val_df = val_df.sample(True, 0.01)

    user_indexer = StringIndexer(inputCol="user_id", outputCol="userNew",
                                 handleInvalid="skip")
    track_indexer = StringIndexer(inputCol="track_id", outputCol="trackNew",
                                  handleInvalid="skip")

    RegParam = [0.001, 0.01]  # 0.1, 1, 10]
    Alpha = [0.1, 1]  # 5, 10, 100]
    Rank = [5, 10]  # 50, 100, 1000]
    sc = spark.sparkContext
    PRECISIONS = {}
    count = 0
    for i in RegParam:
        for j in Alpha:
            for k in Rank:
                print(f"i: {i}, j: {j}, k: {k}")
                als = ALS(maxIter=5, regParam=i, alpha=j, rank=k,
                          userCol="userNew", itemCol="trackNew",
                          ratingCol="count", coldStartStrategy="drop")
                pipeline = Pipeline(stages=[user_indexer, track_indexer, als])
                model = pipeline.fit(df)
                # val_predictions = model.transform(val_df)
                alsmodel = model.stages[-1]
                rec = alsmodel.recommendForAllUsers(500)
                rec.show(10)
                # Build per-user (predicted list, actual list) pairs; the
                # validation set is indexed with the fitted indexers (first
                # two pipeline stages). This wiring is a reconstruction of
                # the commented-out evaluation step above.
                val_indexed = model.stages[1].transform(
                    model.stages[0].transform(val_df))
                truth = val_indexed.groupBy('userNew').agg(
                    F.collect_list('trackNew').alias('truth'))
                scoreAndLabels = rec.select('userNew', 'recommendations.trackNew') \
                    .join(truth, 'userNew') \
                    .rdd.map(lambda row: (row[1], row[2]))
                metrics = RankingMetrics(scoreAndLabels)
                precision = metrics.precisionAt(500)
                PRECISIONS[precision] = model
                count += 1
                print(count)
                print(precision)

def main(spark, model_file, test_file):
    test_data = spark.read.parquet(test_file)
    als_model_tuned = ALSModel.load(model_file)
    print("Imported trained model and test data sets")

    # generating true values of book_id for each user_id
    groundTruth_test = test_data.groupby("user_id").agg(
        F.collect_list("book_id").alias("test_truth"))
    print("Created ground truth df for test set")

    # user_test_list = spark.sql('select distinct user_id from groundTruth_val where user_id=14')
    # rec = als_model_normal.recommendForUserSubset(user_test_list, 500)

    # generating recs
    rec = als_model_tuned.recommendForAllUsers(500)
    print("500 recommendations for all users generated")

    # creating dataframe to have both true values and predicted values
    predictions_test = rec.join(groundTruth_test,
                                rec.user_id == groundTruth_test.user_id,
                                'inner')

    # converting to rdd for RankingMetrics()
    predAndLabels_test = predictions_test.select('recommendations.book_id',
                                                 'test_truth').rdd.map(tuple)
    print("starting ranking metrics for test data")
    metrics_test = RankingMetrics(predAndLabels_test)

    # calculating metrics
    precision_test = metrics_test.precisionAt(500)
    map_test = metrics_test.meanAveragePrecision
    ndcg_test = metrics_test.ndcgAt(500)
    print('Test set, Precision at 500: {}'.format(precision_test))
    print('Test set, Mean Average Precision: {}'.format(map_test))
    print('Test set, ndcgAt500: {}'.format(ndcg_test))

ratings_train = train.map(lambda r: parseLine(r))
ratings_test = test.map(lambda r: parseLine(r))

sample_test = ratings_test.sample(False, 0.1)  # take a sample of the test users
sample_test.count()  # number of test users used in this example
test_users = sample_test.map(lambda x: x.user).collect()

model = ALS.trainImplicit(ratings_train, 10, 10)

# recommend 10 products for each sampled test user
recs = {}
for u in test_users:
    rec = model.recommendProducts(u, 10)
    recs[u] = [r[1] for r in rec]  # list() needed under Python 3, where map() is lazy

# ground truth: the products each test user actually interacted with
groundTruth = {}
userItemTestRDD = sample_test.map(lambda x: (x.user, x.product))
trueRec = userItemTestRDD.groupByKey().collect()
for x in trueRec:
    groundTruth[x[0]] = list(x[1])

# pair each user's predicted ranking with their ground-truth set
predictionsAndLabels = []
for u in test_users:
    predictionsAndLabels.append((recs[u], groundTruth[u]))

predictionsAndLabelsRDD = sc.parallelize(predictionsAndLabels)
metrics = RankingMetrics(predictionsAndLabelsRDD)
metrics.precisionAt(5)