def main(spark, data_file, val_file, model_file):
    # Load the dataframes (sampled down heavily for quick iteration)
    df = spark.read.parquet(data_file)
    df = df.sample(True, 0.0001)
    val_df = spark.read.parquet(val_file)
    val_df = val_df.sample(True, 0.01)  # sample the validation set (the original resampled df here)

    user_indexer = StringIndexer(inputCol="user_id", outputCol="userNew",
                                 handleInvalid="skip")
    track_indexer = StringIndexer(inputCol="track_id", outputCol="trackNew",
                                  handleInvalid="skip")

    # Hyperparameter grid (larger values left commented out to keep runs short)
    RegParam = [0.001, 0.01]  # 0.1, 1, 10
    Alpha = [0.1, 1]  # 5, 10, 100
    Rank = [5, 10]  # 50, 100, 1000

    sc = spark.sparkContext
    PRECISIONS = {}
    count = 0
    for i in RegParam:
        for j in Alpha:
            for k in Rank:
                print(f"i: {i}, j: {j}, k: {k}")
                als = ALS(maxIter=5, regParam=i, alpha=j, rank=k,
                          userCol="userNew", itemCol="trackNew",
                          ratingCol="count", coldStartStrategy="drop")
                pipeline = Pipeline(stages=[user_indexer, track_indexer, als])
                model = pipeline.fit(df)
                alsmodel = model.stages[-1]
                rec = alsmodel.recommendForAllUsers(500)
                rec.show(10)  # show() prints and returns None, so don't wrap it in print()
                # The original left scoreAndLabels commented out and undefined;
                # rebuilt here as (recommended tracks, listened tracks) pairs per
                # user (assumes pyspark.sql.functions imported as F)
                truth = model.transform(val_df).groupBy('userNew') \
                    .agg(F.collect_list('trackNew').alias('truth'))
                scoreAndLabels = rec.join(truth, 'userNew') \
                    .select('recommendations.trackNew', 'truth') \
                    .rdd.map(tuple)
                metrics = RankingMetrics(scoreAndLabels)
                precision = metrics.precisionAt(500)
                PRECISIONS[precision] = model
                count += 1
                print(count)
                print(precision)
def _calculate_metrics(self):
    """Calculate ranking metrics."""
    self._items_for_user_pred = self.rating_pred
    self._items_for_user_true = (
        self.rating_true
        .groupBy(self.col_user)
        .agg(expr("collect_list(" + self.col_item + ") as ground_truth"))
        .select(self.col_user, "ground_truth")
    )
    self._items_for_user_all = self._items_for_user_pred.join(
        self._items_for_user_true, on=self.col_user).drop(self.col_user)
    return RankingMetrics(self._items_for_user_all.rdd)
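# A minimal, self-contained sketch of the input shape RankingMetrics expects in
# every snippet of this collection: an RDD of (predicted item list, ground-truth
# item list) tuples, one per user. All names here are illustrative only.
from pyspark.sql import SparkSession
from pyspark.mllib.evaluation import RankingMetrics

spark = SparkSession.builder.getOrCreate()
pairs = spark.sparkContext.parallelize([
    ([1, 2, 3, 4], [1, 3]),   # user A: items 1 and 3 were actually relevant
    ([5, 6, 7], [8]),         # user B: nothing recommended was relevant
])
metrics = RankingMetrics(pairs)
print(metrics.meanAveragePrecision)  # a property, not a method
print(metrics.precisionAt(2))
print(metrics.ndcgAt(2))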
def main(spark, data_file_train, data_file_val):
    start = time.time()
    # reading training and validation files
    df_train = spark.read.parquet(data_file_train)
    df_val = spark.read.parquet(data_file_val)

    window_user_ordered = Window.partitionBy('user_id').orderBy('rating')
    window_user = Window.partitionBy('user_id')
    actual_df_val = df_val.withColumn(
        'actual_books',
        F.collect_list('book_id').over(window_user_ordered)).groupBy(
            'user_id').agg(F.max('actual_books').alias('actual_books'))
    print("Datasets loaded | Time taken: {}".format(time.time() - start))

    start = time.time()
    als = ALS(maxIter=10, regParam=0.001, userCol="user_id",
              itemCol="book_id", ratingCol="rating", rank=100)
    model = als.fit(df_train)
    print("Done with model fitting | Time taken: {}".format(time.time() - start))

    start = time.time()
    # predictions = model.transform(df_val)
    # evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    # rmse = evaluator.evaluate(predictions)
    # print("RMSE: {}".format(rmse))
    recommendations = model.recommendForUserSubset(df_val, 500)
    userPredictions = recommendations.select(
        'user_id', F.explode('recommendations.book_id')).withColumn(
            'pred_books',
            F.collect_list('col').over(window_user)).groupBy('user_id').agg(
                F.max('pred_books').alias('pred_books'))
    predAndLabels = userPredictions.join(actual_df_val, on='user_id').select(
        'pred_books', 'actual_books')
    metrics = RankingMetrics(predAndLabels.rdd)
    score = metrics.meanAveragePrecision
    print('MAP for test data: {}'.format(score))
    print('Time taken: {}'.format(time.time() - start))
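# Why the window + groupBy/max idiom above: collect_list under a plain groupBy
# gives no ordering guarantee, while collect_list over an ordered window grows
# row by row, so max() of the per-row lists (arrays compare lexicographically,
# and a proper prefix sorts first) recovers the full ordered list. A hedged toy
# demonstration; assumes a SparkSession named spark, names are illustrative:
import pyspark.sql.functions as F
from pyspark.sql import Window

toy = spark.createDataFrame(
    [(1, 10, 0.9), (1, 20, 0.5), (1, 30, 0.7)],
    ['user_id', 'book_id', 'rating'])
w = Window.partitionBy('user_id').orderBy(F.desc('rating'))
ordered = (toy.withColumn('books', F.collect_list('book_id').over(w))
              .groupBy('user_id')
              .agg(F.max('books').alias('books')))
ordered.show(truncate=False)  # [10, 30, 20], ordered by descending rating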
def main(spark, rank, regParam, path, fraction):
    TEMP_PATH = "/models/ALS_{}_{}_{}".format(rank, regParam, fraction)
    ALS_PATH = TEMP_PATH + "/als"
    MODEL_PATH = TEMP_PATH + "/als_model"
    print("Loading model...")
    als = ALS.load(path + ALS_PATH)
    model = ALSModel.load(path + MODEL_PATH)

    print("Loading data...")
    testing = spark.read.parquet("{}/data/processed/testing_{}.parquet".format(
        path, fraction))
    testing.createOrReplaceTempView("testing")

    # RMSE
    predictions = model.transform(testing)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("RMSE:", rmse)

    predictions = model.recommendForAllUsers(500)
    predictions.createOrReplaceTempView("predictions")
    groundtruth = testing.groupby("user_id").agg(
        F.collect_set("book_id").alias('groundtruth'))
    groundtruth.createOrReplaceTempView("groundtruth")
    total = spark.sql(
        "SELECT g.user_id, g.groundtruth AS groundtruth, "
        "p.recommendations AS predictions "
        "FROM groundtruth g JOIN predictions p ON g.user_id = p.user_id")
    total.createOrReplaceTempView("total")
    data = total.selectExpr("predictions.book_id", "groundtruth")
    print("df to rdd...")
    rdd = data.rdd.map(tuple)
    print("creating metrics...")
    metrics = RankingMetrics(rdd)
    print("meanAveragePrecision:", metrics.meanAveragePrecision)
    print("precision at 500:", metrics.precisionAt(500))
    print("ndcgAt 500:", metrics.ndcgAt(500))
def Ranking_evaluator(spark, model, val, metric_type):
    val.createOrReplaceTempView('val')
    val_user = spark.sql('SELECT DISTINCT user_id FROM val')
    # val_user = val.select('user_id').distinct()
    val_rec = model.recommendForUserSubset(val_user, 500)
    # val_rec.printSchema()
    val_rec = val_rec.select('user_id', 'recommendations',
                             f.posexplode('recommendations')) \
                     .drop('pos').drop('recommendations')
    val_rec = val_rec.select('user_id', f.expr('col.book_id'),
                             f.expr('col.rating'))
    w = Window.partitionBy('user_id')
    val_recrank = val_rec.select(
        'user_id',
        f.collect_list('book_id').over(w).alias('rec_rank')) \
        .sort('user_id').distinct()
    val = val.sort(f.desc('rating'))
    val_truerank = val.select(
        'user_id',
        f.collect_list('book_id').over(w).alias('true_rank')) \
        .sort('user_id').distinct()
    scoreAndLabels = val_recrank.join(val_truerank, on=['user_id'], how='inner')
    # collect() + parallelize() round-trips through the driver; see the sketch below
    rankLists = scoreAndLabels.select("rec_rank", "true_rank").rdd \
        .map(lambda x: tuple([x[0], x[1]])).collect()
    ranks = spark.sparkContext.parallelize(rankLists)
    metrics = RankingMetrics(ranks)
    MAP = metrics.meanAveragePrecision
    Precision = metrics.precisionAt(500)
    NDCG = metrics.ndcgAt(500)
    if metric_type == 'Precision':
        return Precision, {'MAP': MAP, 'NDCG': NDCG}
    elif metric_type == 'MAP':
        return MAP, {'Precision': Precision, 'NDCG': NDCG}
    elif metric_type == 'NDCG':
        return NDCG, {'MAP': MAP, 'Precision': Precision}
    else:
        return None
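# The collect()/parallelize round-trip in Ranking_evaluator pulls every row to
# the driver only to ship it straight back to the cluster. A sketch of the more
# direct route, which stays distributed (scoreAndLabels mirrors the variable of
# the same name inside the function above):
ranks_rdd = scoreAndLabels.select('rec_rank', 'true_rank') \
                          .rdd.map(lambda row: (row[0], row[1]))
metrics = RankingMetrics(ranks_rdd)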
def main(spark, test_file, index_file, model_file):
    # load test data and create dataframe
    test_df = spark.read.parquet(test_file)
    model_indexer = PipelineModel.load(index_file)
    # transform user and track ids for test data
    test_df = model_indexer.transform(test_df)
    # store ground truth per user
    user_truth = test_df.groupby('user_label').agg(
        F.collect_list('book_label').alias('truth'))
    print('created ground truth df')

    als_model = ALSModel.load(model_file)
    # predict based on the top 500 items for each user
    recommend = als_model.recommendForAllUsers(500)
    print('recommendation has been created.')

    # RMSE
    predict = als_model.transform(test_df)
    evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                                    predictionCol='prediction')
    rmse = evaluator.evaluate(predict)
    print('Root mean square error is ' + str(rmse))

    # prediction = spark.sql('SELECT * FROM recommend INNER JOIN user_truth WHERE recommend.user_label=user_truth.user_label')
    # after testing, this join runs faster than the Spark SQL version above
    prediction = recommend.join(user_truth,
                                recommend.user_label == user_truth.user_label,
                                'inner')
    score = prediction.select('recommendations.book_label',
                              'truth').rdd.map(tuple)
    rank_metric = RankingMetrics(score)
    precision = rank_metric.precisionAt(500)
    mean_precision = rank_metric.meanAveragePrecision
    print('precision at 500 is ' + str(precision) +
          ', mean average precision is ' + str(mean_precision))
def main(spark):
    '''
    Parameters
    ----------
    spark : SparkSession object
    '''
    # Reading train and transforming with StringIndexer
    train_file = 'hdfs:/user/dev241/train_sample.parquet'
    val_file = 'hdfs:/user/bm106/pub/project/cf_validation.parquet'
    train_sample = spark.read.parquet(train_file)
    val = spark.read.parquet(val_file)
    idx_pipe = PipelineModel.load('hdfs:/user/dev241/DieterStringIndexer')
    train_idx = idx_pipe.transform(train_sample)
    val_idx = idx_pipe.transform(val)
    val_idx = val_idx.select('user_idx', 'track_idx', 'count')
    val_users = val_idx.select('user_idx').distinct()
    val_comb = val_idx.groupBy('user_idx').agg(
        F.collect_set('track_idx').alias('val_labels'))

    # Random search over the hyperparameter values - low alpha
    results = []
    for i in range(50):
        rank = np.random.randint(100)
        alpha = np.random.uniform(0.1, 15)
        reg = np.random.uniform(0.1, 1)
        als = ALS(rank=rank, alpha=alpha, regParam=reg, userCol="user_idx",
                  itemCol="track_idx", ratingCol="count",
                  coldStartStrategy="drop", implicitPrefs=True)
        model = als.fit(train_idx)
        model.save('model_random_search' + str(rank) + '_' + str(alpha) +
                   '_' + str(reg))
        track_number = 500
        rec_val = model.recommendForUserSubset(val_users, track_number)
        join = val_comb.join(rec_val, val_comb.user_idx == rec_val.user_idx)
        predictionAndLabels = join.rdd.map(
            lambda r: ([track.track_idx for track in r.recommendations],
                       r.val_labels))
        metrics = RankingMetrics(predictionAndLabels)
        mavgp = metrics.meanAveragePrecision
        results.append((rank, alpha, reg, mavgp))
        print("Rank : ", rank, "Alpha : ", alpha, "Reg : ", reg,
              "MAP : ", mavgp)
    print('First Validation completed.')
    spark.sparkContext.parallelize(results).saveAsTextFile(
        "MAP_random_search_high.txt")
    print('MAP_random_search_high.txt saved')
def main(spark):
    '''
    Parameters
    ----------
    spark : SparkSession object
    '''
    # File names
    test_file = 'hdfs:/user/bm106/pub/project/cf_test.parquet'
    train_sample_file = 'hdfs:/user/ah3243/extension1_count_greater_1.parquet'
    # Reading the parquet files
    test = spark.read.parquet(test_file)
    train_sample = spark.read.parquet(train_sample_file)

    # StringIndexer (renamed so the loaded pipeline doesn't shadow the class)
    print("String Indexer entered")
    indexer = PipelineModel.load('hdfs:/user/dev241/DieterStringIndexer')
    test_idx = indexer.transform(test)
    test_idx = test_idx.sample(.3)
    train_idx = indexer.transform(train_sample)
    print("String Indexer done")

    # best hyperparameters found by the random search
    rank = 78
    alpha = 14.287069059772636
    reg = 0.41772043857578584

    # model
    als = ALS(rank=rank, alpha=alpha, regParam=reg, userCol="user_idx",
              itemCol="track_idx", ratingCol="count",
              coldStartStrategy="drop", implicitPrefs=True)
    model = als.fit(train_idx)
    print("Model fit for test done")

    # test ranking metrics
    test_idx = test_idx.select('user_idx', 'track_idx', 'count')
    test_users = test_idx.select('user_idx').distinct()
    test_comb = test_idx.groupBy('user_idx').agg(
        F.collect_set('track_idx').alias('test_labels'))
    track_number = 500
    rec_test = model.recommendForUserSubset(test_users, track_number)
    join = test_comb.join(rec_test, test_comb.user_idx == rec_test.user_idx)
    predictionAndLabels = join.rdd.map(
        lambda r: ([track.track_idx for track in r.recommendations],
                   r.test_labels))
    metrics = RankingMetrics(predictionAndLabels)
    mavgp = metrics.meanAveragePrecision
    print("Test mean Average Precision : ", mavgp)
def main(spark, model_file, test_file):
    # mllib's MatrixFactorizationModel.load takes the SparkContext
    model = MatrixFactorizationModel.load(sc, model_file)
    test_df = spark.read.parquet(test_file)
    test_df = test_df.select('user_label', 'track_label', 'count')

    # predictions = model.recommendProductsForUsers(500)
    predictions = model.recommendProductsForUsers(2)
    prediction_flat = predictions.flatMap(lambda p: p[1])
    prediction_df = prediction_flat.toDF()
    intersections = prediction_df.join(
        test_df,
        (prediction_df.product == test_df.track_label) &
        (prediction_df.user == test_df.user_label),
        how='inner')
    predLabel = intersections.select('rating', 'count')
    predLabel_rdd = predLabel.rdd.map(lambda x: Row(x[0], x[1]))
    metrics = RankingMetrics(predLabel_rdd)  # fixed: was mistyped as predLabl_rdd
    print(metrics.meanAveragePrecision)
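# Note on the snippet above: RankingMetrics scores lists of item ids, so pairing
# a predicted rating with a play count per row does not measure ranking quality.
# A hedged sketch of the usual shape, built from recommendProductsForUsers
# output (variable names reuse the snippet's; the ground-truth assembly is an
# assumption, not the original author's code):
pred_lists = predictions.map(
    lambda p: (p[0], [r.product for r in p[1]]))  # (user, [recommended ids])
truth_lists = test_df.rdd \
    .map(lambda row: (row['user_label'], row['track_label'])) \
    .groupByKey().mapValues(list)                 # (user, [true ids])
pairs = pred_lists.join(truth_lists).values()     # (pred list, truth list)
metrics = RankingMetrics(pairs)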
def bestmodel(traindata, validatedata):
    bestValidationRmse = float("inf")
    # map validation data to (userId, movieId)
    validation = validatedata.map(lambda r: (r[0], r[1]))
    # get actual ratings keyed by (userId, movieId)
    ratingTuple = validatedata.map(
        lambda r: ((int(r[0]), int(r[1])), float(r[2])))
    for rank in ranks:
        # create model from train data
        model = ALS.train(traindata, rank, numIterations, lambda_=regulz_para)
        # predict ratings for validation data
        predictions = model.predictAll(validation).map(
            lambda r: ((r[0], r[1]), r[2]))
        # pair predicted and actual ratings
        scoreAndLabels = predictions.join(ratingTuple).map(lambda tup: tup[1])
        regMetrics = RegressionMetrics(scoreAndLabels)
        RMSE = regMetrics.rootMeanSquaredError
        MSE = regMetrics.meanSquaredError
        print("For rank %s:" % rank)
        print("RMSE = %s" % RMSE)
        print("MSE = %s" % MSE)
        if RMSE < bestValidationRmse:
            bestValidationRmse = RMSE
            best_rank = rank
    print('The best model was trained with rank %s' % best_rank)

    # MAP: actual top-10 movie sequence per user, ordered by rating
    model = ALS.train(traindata, best_rank, numIterations, lambda_=regulz_para)
    actual_user_movie = validatedata.map(
        lambda x: (x[0], (x[1], x[2]))).groupByKey()
    actual_user_movie1 = actual_user_movie.map(order_movies)
    predict_user_movie = model.predictAll(validation).map(
        lambda r: (r[0], (r[1], r[2]))).groupByKey()
    predict_user_movie1 = predict_user_movie.map(order_movies)
    movie_seq = predict_user_movie1.join(actual_user_movie1).map(
        lambda x: x[1])
    movie_seq = movie_seq.map(movie_index)
    rankMetrics = RankingMetrics(movie_seq)
    MAP = rankMetrics.meanAveragePrecision
    print("MAP = %s" % MAP)
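# order_movies and movie_index are referenced above but not defined in this
# snippet; plausible (hypothetical) implementations consistent with how they
# are called -- order_movies sorts a user's (movieId, rating) pairs by rating
# and keeps the ids, movie_index normalises the joined pair into plain lists:
def order_movies(user_and_pairs):
    # user_and_pairs: (user, iterable of (movieId, rating))
    user, pairs = user_and_pairs
    ordered = sorted(pairs, key=lambda p: p[1], reverse=True)
    return (user, [movie for movie, _ in ordered])

def movie_index(pred_and_actual):
    # pred_and_actual: (predicted movie list, actual movie list)
    predicted, actual = pred_and_actual
    return (list(predicted), list(actual))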
def main(sc):
    ratings_info = sc.textFile("input/ratings.csv")
    ratings_data = ratings_info.map(split).map(parse).filter(
        lambda line: line != None)
    fold1, fold2, fold3, fold4, fold5 = ratings_data.randomSplit(
        [0.2, 0.2, 0.2, 0.2, 0.2])
    folds = [fold1, fold2, fold3, fold4, fold5]
    rank = 12
    itr = 25
    mse = 0
    rmse = 0
    map_score = 0  # renamed from `map` to avoid shadowing the builtin
    for i in range(5):
        test_data = folds[i]
        train_data = sc.emptyRDD()
        for j in range(5):
            if i == j:
                continue
            train_data = train_data.union(folds[j])
        model = ALS.train(train_data, rank, iterations=itr, lambda_=0.1)
        testdata = test_data.map(lambda p: (p[0], p[1]))
        predictions = model.predictAll(testdata).map(
            lambda r: ((r[0], r[1]), r[2]))
        rates = test_data.map(lambda r: ((r[0], r[1]), r[2]))
        predsAndlabels = predictions.join(rates).map(lambda tup: tup[1])
        actual_rating = predsAndlabels.map(lambda r: r[1]).collect()
        predicted_rating = predsAndlabels.map(lambda r: r[0]).collect()
        predAndReal = sc.parallelize([(predicted_rating, actual_rating)])
        metrics = RegressionMetrics(predsAndlabels)
        metric = RankingMetrics(predAndReal)
        mse += metrics.meanSquaredError
        rmse += metrics.rootMeanSquaredError
        map_score += metric.meanAveragePrecision
    k_mse = mse / 5.0
    k_rmse = rmse / 5.0
    k_map = map_score / 5.0
    print("MSE = %s" % k_mse)
    print("RMSE = %s" % k_rmse)
    print("MAP = %s" % k_map)
def compute_MAP(model, users, df):
    predictions = model.recommendForUserSubset(users, 500)
    print("Generated predictions")
    userRec = (predictions.select(
        "userIndex",
        F.explode("recommendations").alias("recommendation")).select(
            "userIndex", "recommendation.*"))
    rankings = userRec.groupby('userIndex').agg(
        F.collect_list('trackIndex').alias('ranked_tracks'))
    print("Generated rankings")
    truth = df.groupby('userIndex').agg(
        F.collect_list('trackIndex').alias('ground_truth'))
    print("Generated ground truth")
    final = rankings.join(truth, rankings.userIndex == truth.userIndex).select(
        'ranked_tracks', 'ground_truth')
    metrics = RankingMetrics(final.rdd)
    map_val = metrics.meanAveragePrecision
    return map_val
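# Hedged usage sketch for compute_MAP: `als_model` is a fitted ALSModel and
# `val_df` a held-out interactions DataFrame with userIndex/trackIndex columns
# (both names are assumptions, matching the column names used above):
val_users = val_df.select('userIndex').distinct()
map_val = compute_MAP(als_model, val_users, val_df)
print('Validation MAP: {}'.format(map_val))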
def main(spark, train_file, val_file, model_file):
    train_df = spark.read.parquet(train_file)
    val_df = spark.read.parquet(val_file)
    print('Finish reading data')

    # log-compression: count -> log(1 + count)
    train_df = train_df.withColumn('inc_count', train_df['count'] + 1)
    train_df = train_df.withColumn('log_count', F.log(train_df['inc_count']))
    print('finish transforming data')
    print(train_df.first())
    print(val_df.first())

    train_df = train_df.select('user_label', 'track_label', 'log_count')
    val_df = val_df.select('user_label', 'track_label', 'count')
    val_grouped = val_df.groupBy('user_label').agg(
        F.collect_list(F.col('track_label')).alias('track_label'))
    print('finish preparing data')
    val_grouped.cache()
    train_df.cache()

    print('start fitting')
    # ALS for implicit feedback
    als = ALS(maxIter=5, regParam=0.01, alpha=0.1, rank=10,
              implicitPrefs=True, userCol='user_label',
              itemCol='track_label', ratingCol='log_count')
    als_model = als.fit(train_df)
    print('Model fitted')
    als_model.save(model_file)
    print('Model Saved')

    predictions = als_model.recommendForAllUsers(100)
    prediction_df = predictions.rdd.map(
        lambda r: (r.user_label, [i[0] for i in r.recommendations])).toDF()
    prediction_df = prediction_df.selectExpr('_1 as user_label',
                                             '_2 as recommendations')
    # Join table
    val_pred = val_grouped.join(prediction_df, 'user_label', 'inner')
    rdd = val_pred.select('recommendations', 'track_label').rdd
    ranking_metrics = RankingMetrics(rdd)
    print('Log: Current log job alpha is : 0.1, current rank is 10, reg is 0.01')
    print('Single model, MAP = %s' % ranking_metrics.meanAveragePrecision)
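# The two withColumn calls in the snippet above implement log(1 + count);
# F.log1p collapses them into a single, numerically equivalent column
# (a sketch only, same DataFrame name as above):
train_df = train_df.withColumn('log_count', F.log1p('count'))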
def main(spark, model_file, data_file, count):
    df = spark.read.parquet(data_file).repartition(2000, ['userIndex'])
    # optional count transformations selected by the `count` argument
    if count == 'log':
        df = df.withColumn("count", log(col("count") + 1))
    elif count == 'drop1':
        df = df.filter('count>1')
    elif count == 'drop2':
        df = df.filter('count>2')
    model = ALSModel.load(model_file)
    test_user = df.select('userIndex').distinct()
    predictions = model.transform(df)
    actual = predictions.groupBy("userIndex").agg(
        expr("collect_set(trackIndex) as tracks"))
    rec = model.recommendForUserSubset(test_user, 500)
    a = rec.select('userIndex', 'recommendations.trackIndex')
    b = a.join(actual, ['userIndex']).select('trackIndex', 'tracks').rdd
    metrics = RankingMetrics(b)
    result = metrics.meanAveragePrecision
    print(result)
    np.savetxt('drop2.txt', np.array([result]))  # note: output file name is hard-coded
def getMAP(top_predictions, truth):
    true = truth.select('user_id', 'book_id', 'true_row')
    w = Window.partitionBy('user_id').orderBy('true_row')
    true = true.withColumn(
        'true',
        F.collect_list('book_id').over(w)).groupBy('user_id').agg(
            F.max('true').alias('true'))
    pred = top_predictions.select('user_id', 'book_id', 'row_num')
    w = Window.partitionBy('user_id').orderBy('row_num')
    pred = pred.withColumn(
        'pred',
        F.collect_list('book_id').over(w)).groupBy('user_id').agg(
            F.max('pred').alias('pred'))
    pred_true = pred.join(true, 'user_id').select('pred', 'true').rdd
    metrics = RankingMetrics(pred_true)
    score = metrics.meanAveragePrecision
    return score
def main(spark, train_file, val_file, model_file):
    df_train = spark.read.parquet(train_file)
    df_val = spark.read.parquet(val_file)
    print(df_train.count())
    print(df_val.count())
    als = ALS(implicitPrefs=True, userCol="userIndex", itemCol="trackIndex",
              ratingCol="count", coldStartStrategy="drop")
    ranks = [10, 20, 40]
    reg_params = [0.001, 0.01, 0.1]
    alphas = [1, 20, 40]
    max_result = 0.0
    best_rank = 0
    best_alpha = 0
    best_regparam = 0
    k = 500
    val_user = df_val.select('userIndex').distinct()
    for rank, reg_param, alpha in itertools.product(ranks, reg_params, alphas):
        als.setRank(rank).setRegParam(reg_param).setAlpha(alpha)
        model = als.fit(df_train)
        rec = model.recommendForUserSubset(val_user, k)
        predictions = model.transform(df_val)
        actual = df_val.groupBy("userIndex").agg(
            expr("collect_set(trackIndex) as tracks"))
        pred = rec.select('userIndex', 'recommendations.trackIndex')
        a = pred.join(actual, ['userIndex']).select('trackIndex', 'tracks')
        metrics = RankingMetrics(a.rdd)
        result = metrics.meanAveragePrecision
        print('For rank %s, for alpha %s, for reg_param %s, the MAP is %s' %
              (rank, alpha, reg_param, result))
        if result > max_result:
            max_result = result
            best_rank = rank
            best_alpha = alpha
            best_regparam = reg_param
    best = als.setRank(best_rank).setAlpha(best_alpha).setRegParam(
        best_regparam)
    best_model_ = best.fit(df_train)
    best_model_.save(model_file)
def main(spark, val_file, model_file):
    model = ALSModel.load(model_file)
    print('finish loading models')
    val_df = spark.read.parquet(val_file)
    val_df = val_df.select('user_label', 'track_label')
    val_grouped = val_df.groupBy('user_label').agg(
        F.collect_list(F.col('track_label')).alias('track_label'))
    print('Finish preparing test data')
    val_grouped.cache()
    predictions = model.recommendForAllUsers(500)
    print('finish making predictions')
    prediction_df = predictions.rdd.map(
        lambda r: (r.user_label, [i[0] for i in r.recommendations])).toDF()
    prediction_df = prediction_df.selectExpr("_1 as user_label",
                                             "_2 as recommendations")
    # Join table
    val_pred = val_grouped.join(prediction_df, "user_label", "inner")
    print('finish joining data')
    # Instantiate ranking metrics to compare predicted and actual tracks
    rdd = val_pred.select('recommendations', 'track_label').rdd
    print('final steps')
    ranking_metrics = RankingMetrics(rdd)
    # MAP
    print("MAP = %s" % ranking_metrics.meanAveragePrecision)
def main(spark, data_file, model_file, truth_file):
    '''Main routine for evaluating approximate nearest-neighbour recommendation

    Parameters
    ----------
    spark : SparkSession object
    data_file : string, path to the parquet file to load: test set user id.
    model_file : string, path to the serialized model file
    truth_file : ground truth interaction list for each user in the test set.
    '''
    topk = 500
    # nmslib indexing parameters
    Mlist = [50]
    efclist = [3000]
    efslist = [800]

    # Prepare data
    test_users = spark.read.parquet(data_file)  # distinct_test_users.parquet
    truth = spark.read.parquet(truth_file)
    model = ALSModel.load(model_file)  # recsys_model_search
    usefactor = model.userFactors
    queryuser = test_users.join(usefactor,
                                test_users.user_num_id == usefactor.id,
                                how='left')
    userid = [row.id for row in queryuser.select('id').collect()]
    user = np.array([row.features for row in
                     queryuser.select('features').collect()])
    itmfactor = model.itemFactors
    item = np.array([row.features for row in
                     itmfactor.select('features').collect()])
    itemidx = np.array([row.id for row in itmfactor.select('id').collect()])

    # Augment the factors so maximum inner product search becomes cosine
    # similarity search: users get a 0, items get an equalising component
    trs_user = np.append(user, np.zeros((user.shape[0], 1)), axis=1)
    norms = np.linalg.norm(item, axis=1)
    maxnorm = norms.max()
    extra_item_dim = np.sqrt(maxnorm ** 2 - norms ** 2)
    trs_item = np.append(item, extra_item_dim.reshape(norms.shape[0], 1), axis=1)
    print('Finish Preparing the data')

    print('Start brute force search')
    # only try brute force once.
    # time2 = time.time()
    # brutea, bruteb = bruteforce(user, item, itemidx, topk)
    # brute_time = time.time() - time2
    # print('Time to brute force search top{} items is {}, {} seconds per query'.format(topk, brute_time, brute_time / len(user)))
    # Get MAP
    R = Row('id', 'recs')
    # rec_brute = spark.createDataFrame([R(x, y) for i, (x, y) in enumerate(zip(userid, brutea))])
    # pred_brute = truth.join(rec_brute, truth.user_id == rec_brute.id, how='left').select('recs', 'label')
    # predictionAndLabels_b = pred_brute.rdd.map(lambda lp: (lp.recs, lp.label)).repartition(100)
    # metrics_b = RankingMetrics(predictionAndLabels_b)
    # meanAP_b = metrics_b.meanAveragePrecision

    # Multiple accelerated searches with different parameter settings
    for M, efc, efs in itertools.product(Mlist, efclist, efslist):
        indexParams = {'M': M, 'indexThreadQty': 4,
                       'efConstruction': efc, 'post': 0}
        print("__________________Start a new indexer______________________")
        index = nmslib.init(method='hnsw', space='cosinesimil')
        index.addDataPointBatch(trs_item, ids=itemidx)
        time1 = time.time()
        index.createIndex(indexParams)
        nmslib_buildtime = time.time() - time1
        print('indexParams for nmslib is {}'.format(indexParams),
              'queryParams for nmslib is efs = {}'.format(efs))
        print('Time to build index for nmslib is {}'.format(nmslib_buildtime))
        time3 = time.time()
        nms_a, nms_b = nmslib_search(index, trs_user, trs_item, itemidx,
                                     topk, efs)
        nms_time = time.time() - time3
        print('Time to nmslib search top{} items is {}, {} seconds per query'.format(
            topk, nms_time, nms_time / len(user)))
        # Get MAP
        rec_nms = spark.createDataFrame(
            [R(x, y) for i, (x, y) in enumerate(zip(userid, nms_a))])
        pred_nms = truth.join(rec_nms, truth.user_id == rec_nms.id,
                              how='left').select('recs', 'label')
        predictionAndLabels_n = pred_nms.rdd.map(
            lambda lp: (lp.recs, lp.label)).repartition(100)
        metrics_n = RankingMetrics(predictionAndLabels_n)
        meanAP_n = metrics_n.meanAveragePrecision
        print('MAP for nmslib is {}, MAP for bruteforce is 0.04126000613271917'.format(meanAP_n))
        print('____________Finish an indexer__________________')
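# The extra dimension appended above is the standard reduction from maximum
# inner product search to cosine/Euclidean search: each item gets a component
# sqrt(maxnorm^2 - ||v||^2) so every transformed item has norm maxnorm, and
# each user gets a 0 so inner products are preserved. A small numpy sanity
# check (all names local to this sketch):
import numpy as np

item = np.random.randn(100, 32)
user = np.random.randn(5, 32)
norms = np.linalg.norm(item, axis=1)
maxnorm = norms.max()
trs_item = np.append(item, np.sqrt(maxnorm**2 - norms**2).reshape(-1, 1), axis=1)
trs_user = np.append(user, np.zeros((user.shape[0], 1)), axis=1)
assert np.allclose(trs_user @ trs_item.T, user @ item.T)       # dot products unchanged
assert np.allclose(np.linalg.norm(trs_item, axis=1), maxnorm)  # items now equi-norm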
predictionCol="prediction") rmse = evaluator.evaluate(predictions_test) print("Root-mean-square error = " + str(rmse)) test.createOrReplaceTempView('test') test_true = spark.sql( 'select user, book from test where rating > 2 sort by rating desc') labels = test_true.groupby('user').agg(collect_list('book')) test_recommendations = model.recommendForUserSubset(labels.select('user'), 500) preds = test_recommendations.withColumn( 'recommendations', explode('recommendations')).select( 'user', 'recommendations.item').groupBy('user').agg(collect_list('item')) preds_and_labels = preds.join(labels, on='user') metrics = RankingMetrics( preds_and_labels.select('collect_list(item)', 'collect_list(book)').rdd) map_metric = metrics.meanAveragePrecision pA = metrics.precisionAt(500) ndcgA = metrics.ndcgAt(500) results.append((rank, reg, rmse, map_metric, pA, ndcgA)) print('MAP = ', map_metric, ' pA = ', pA, ' ndcgA = ', ndcgA, '\n') res_rdd = spark.sparkContext.parallelize(results) res_df = spark.createDataFrame(res_rdd).repartition(1) res_df.write.csv('test_results.csv')
prediction_val = best_model.transform(df_validation)
print(" Predictions for validation dataset: ------------------------------")
prediction_val.show()
prediction_val.write.csv('hdfs:/user/pg1910/pub/goodreads/prediction_val.csv')

prediction_test = best_model.transform(df_test)
print(" Predictions for test dataset: ------------------------------")
prediction_test.show()
prediction_test.write.csv(
    'hdfs:/user/pg1910/pub/goodreads/prediction_test.csv')

actual_val = df_validation.groupBy("user_id").agg(
    expr("collect_set(book_id) as books"))
pred_val = user_recs.select('user_id', 'recommendations.book_id')
output_val = pred_val.join(actual_val, ['user_id']).select('book_id', 'books')
metrics_val = RankingMetrics(output_val.rdd)
result_val = metrics_val.meanAveragePrecision
print("Mean average precision for validation dataset: " + str(result_val))
rmse_val = evaluator.evaluate(prediction_val)
print("RMSE for validation dataset=" + str(rmse_val))

actual_test = df_test.groupBy("user_id").agg(
    expr("collect_set(book_id) as books"))
pred_test = user_recs.select('user_id', 'recommendations.book_id')
output_test = pred_test.join(actual_test, ['user_id']).select('book_id', 'books')
metrics_test = RankingMetrics(output_test.rdd)
result_test = metrics_test.meanAveragePrecision
ratings_train = train.map(lambda r: parseLine(r))
ratings_test = test.map(lambda r: parseLine(r))
sample_test = ratings_test.sample(False, 0.1)  # taking a sample of the test users
sample_test.count()  # number of test users used in this example
test_users = sample_test.map(lambda x: x.user).collect()

model = ALS.trainImplicit(ratings_train, 10, 10)
recs = {}
for u in test_users:
    rec = model.recommendProducts(u, 10)
    recs[u] = [r[1] for r in rec]  # list comprehension: map() is lazy on Python 3

groundTruth = {}
userItemTestRDD = sample_test.map(lambda x: (x.user, x.product))
trueRec = userItemTestRDD.groupByKey().collect()
for x in trueRec:
    groundTruth[x[0]] = list(x[1])

predictionsAndLabels = []
for u in test_users:
    predictionsAndLabels.append((recs[u], groundTruth[u]))
predictionsAndLabelsRDD = sc.parallelize(predictionsAndLabels)
metrics = RankingMetrics(predictionsAndLabelsRDD)
metrics.precisionAt(5)
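# The per-user loop above calls model.recommendProducts once per test user on
# the driver; for more than a handful of users, the batched mllib call is the
# usual route. A hedged sketch reusing the snippet's names:
test_user_set = set(test_users)
top10 = model.recommendProductsForUsers(10) \
             .filter(lambda p: p[0] in test_user_set) \
             .mapValues(lambda ratings: [r.product for r in ratings])
recs = dict(top10.collect())  # same {user: [recommended ids]} shape as above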
userCol="u_id", itemCol="t_id", ratingCol="count", coldStartStrategy="drop") model = als.fit(new_data) labels = new_data.groupby('u_id').agg( F.collect_set('t_id').alias('ranked_labels')) testusers = new_data.select('u_id').distinct() userSubsetRecs = model.recommendForUserSubset(testusers, 10) recommendationsDF = (userSubsetRecs.select( "u_id", explode("recommendations").alias("recommendation")).select( "u_id", "recommendation.*")) preds = recommendationsDF.groupby('u_id').agg( F.collect_set('t_id').alias('ranked_preds')) joined_table = labels.join(preds, labels.u_id == preds.u_id) reqdPredsLabels = joined_table.select(labels.ranked_labels, preds.ranked_preds) metrics = RankingMetrics(reqdPredsLabels.rdd) print('Precision at 500 : {0}'.format(metrics.precisionAt(500))) print('Mean Average Precision: {0}'.format(metrics.meanAveragePrecision)) print(datetime.now()) testdata = new_data.map(lambda p: (p[0], p[1])) predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2])) ratesAndPreds = new_data.map(lambda r: ((r[0], r[1]), r[2])).join(predictions) MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean() print("Mean Squared Error = " + str(MSE)) print(datetime.now())
# (fragment: body of the cross-validation fold loop; the ALS.train call head,
# including the training RDD name, is an assumption reconstructed from context)
model = ALS.train(CVTrainData, rank, numIterations,
                  lambda_=lamda, seed=seed)
# predicting on the held-out fold
preds = model.predictAll(CVTestData.map(lambda p: (p[0], p[1]))).map(
    lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = CVTestData.map(lambda r: ((r[0], r[1]), r[2])).join(preds)
# evaluating predictions against actual ratings/rankings
metrics = RegressionMetrics(ratesAndPreds.map(lambda r: r[1]))
ratingPairs = ratesAndPreds.map(
    lambda r: (r[0][0], (r[1][0], r[1][1]))) \
    .reduceByKey(lambda x, y: x + y).map(lambda x: list(x[1]))
rankAndPreds = ratingPairs.map(map1)
# rankAndPreds is a PipelinedRDD and has to be converted into an RDD
rmetrics = RankingMetrics(sc.parallelize(rankAndPreds.collect()))
MSE = metrics.meanSquaredError
total_mse += MSE
RMSE = metrics.rootMeanSquaredError
total_rmse += RMSE
MAP = rmetrics.meanAveragePrecision
total_map += MAP

print("Average Mean Squared Error = " + str(total_mse / folds))
print("Average Root Mean Squared Error = " + str(total_rmse / folds))
print("Average Mean Average Precision = " + str(total_map / folds))

# This code is executed only after getting best parameters from Cross Validation
def main(spark, train_file, val_file, model_file):
    train_df = spark.read.parquet(train_file)
    val_df = spark.read.parquet(val_file)
    train_df = train_df.select('user_label', 'track_label', 'count')
    val_df = val_df.select('user_label', 'track_label', 'count')
    val_grouped = val_df.groupBy('user_label').agg(
        F.collect_list(F.col('track_label')).alias('track_label'))

    # ALS for implicit feedback
    als = ALS(maxIter=5, regParam=0, implicitPrefs=True, alpha=0.4, rank=20,
              userCol='user_label', itemCol='track_label', ratingCol='count')
    als_model = als.fit(train_df)
    predictions = als_model.recommendForAllUsers(100)
    prediction_df = predictions.rdd.map(
        lambda r: (r.user_label, [i[0] for i in r.recommendations])).toDF()
    prediction_df = prediction_df.selectExpr('_1 as user_label',
                                             '_2 as recommendations')
    # Join table
    val_pred = val_grouped.join(prediction_df, 'user_label', 'inner')
    rdd = val_pred.select('recommendations', 'track_label').rdd
    ranking_metrics = RankingMetrics(rdd)
    print('Before tuning, MAP = %s' % ranking_metrics.meanAveragePrecision)

    # hyperparameter tuning
    ranks = [10, 20, 40, 60]
    reg_params = [0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
    alphas = [0.10, 0.20, 0.40]
    best_rank = None
    best_reg_param = None
    best_alpha = None
    best_model = None
    best_map = 0
    for rank_i, alpha_i, reg_param_i in itertools.product(
            ranks, alphas, reg_params):
        print('Running on rank:', rank_i)
        print('Running on alpha:', alpha_i)
        print('Running on reg:', reg_param_i)
        als = ALS(maxIter=5, regParam=reg_param_i, implicitPrefs=True,
                  alpha=alpha_i, rank=rank_i, userCol='user_label',
                  itemCol='track_label', ratingCol='count')
        als_model = als.fit(train_df)
        predictions = als_model.recommendForAllUsers(100)
        prediction_df = predictions.rdd.map(lambda r: (
            r.user_label, [i[0] for i in r.recommendations])).toDF()
        prediction_df = prediction_df.selectExpr('_1 as user_label',
                                                 '_2 as recommendations')
        # Join table
        val_pred = val_grouped.join(prediction_df, 'user_label', 'inner')
        rdd = val_pred.select('recommendations', 'track_label').rdd
        ranking_metrics = RankingMetrics(rdd)
        map_ = ranking_metrics.meanAveragePrecision
        print('MAP:', map_)
        if map_ > best_map:
            best_rank = rank_i
            best_reg_param = reg_param_i
            best_alpha = alpha_i
            best_model = als_model
            best_map = map_
    print('Best rank:', best_rank)
    print('Best regParam:', best_reg_param)
    print('Best alpha:', best_alpha)
    print('Best map:', best_map)
    # save the best model
    best_model.save(model_file)
# Especially in the case of an unbalanced dataset
train = sqlContext.read.parquet('train_transformed')
train = train.withColumn("label", train["label"].cast(DoubleType()))
test = sqlContext.read.parquet('test_transformed')
test = test.withColumn("label", test["label"].cast(DoubleType()))  # fixed: was casting train's column

lr = LogisticRegression()

# Building the grid. Note pyspark.ml's LogisticRegression has no regType param
# (that is the old mllib API); the regularisation mix is controlled by
# elasticNetParam, which must lie in [0, 1].
grid = ParamGridBuilder() \
    .addGrid(lr.maxIter, [0, 1, 5, 10, 15, 20, 25, 30]) \
    .addGrid(lr.regParam, [0.001, 0.01, 0.1, 1, 10, 100, 1000]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0]) \
    .build()

# RankingMetrics cannot serve as a CrossValidator evaluator (it is built from
# an RDD of already-made predictions and has no evaluate method); a pyspark.ml
# Evaluator is needed here, so a BinaryClassificationEvaluator is swapped in
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=5)
cvModel = cv.fit(train)

# Testing overfitting/underfitting
train_score = evaluator.evaluate(cvModel.transform(train))
test_score = evaluator.evaluate(cvModel.transform(test))

# Saving the model
bestModel = cvModel.bestModel
coefficients = DenseVector(bestModel.coefficients.toArray())
os.system('mkdir lr')
def main(spark, train_data, val_data, downsample=True, extension=None):
    '''Main routine for model fitting and ranking evaluation

    Parameters
    ----------
    spark : SparkSession object
    train_data : string, path to the training parquet file to load
    val_data : string, path to the validation parquet file to load
    downsample : True or False, whether to down-sample the training data
    extension : None, "log" (log-compression) or "drop" (drop low counts)
    '''
    ### read in the files
    train = spark.read.parquet(train_data)
    val = spark.read.parquet(val_data)

    ### if down-sample: keep a small random fraction (0.001%) of the train rows
    if downsample:
        train = train.sample(False, 0.00001, seed=0)
        # val = val.sample(False, 0.00001, seed=0)

    if extension is not None:
        if extension == "log":
            # apply log-compression
            train = train.withColumn("log_count", log("count"))
        elif extension == "drop":
            # treat the 0.1 quantile of the count data as the lower bound
            # and filter out count rows below it
            lower_bound = train.approxQuantile("count", [0.1], 0.25)
            train = train.filter(train["count"] > int(lower_bound[0]))

    ### transform user_id, track_id from string to float via a pipeline
    user_indexer = StringIndexer(inputCol="user_id",
                                 outputCol="user_id_indexed",
                                 handleInvalid='skip')
    track_indexer = StringIndexer(inputCol="track_id",
                                  outputCol="track_id_indexed",
                                  handleInvalid='skip')
    pipeline = Pipeline(stages=[user_indexer, track_indexer])
    indexing_model = pipeline.fit(train)  # learn (return: pipeline model)

    ### transform the datasets and create the views
    train = indexing_model.transform(train)  # returns a dataframe with new columns
    train.createOrReplaceTempView("train")
    val = indexing_model.transform(val)  # returns a dataframe with new columns
    val.createOrReplaceTempView("val")

    # group by user_id, aggregate track_id_indexed for train and val
    val_groupby = spark.sql(
        "select user_id_indexed, "
        "collect_list(track_id_indexed) track_id_indexed_collections "
        "from val group by user_id_indexed")
    val_groupby.createOrReplaceTempView("val_groupby")

    # Build the recommendation model using ALS on the training data
    rank = np.arange(4, 10, 2)
    regParam = np.linspace(0.01, 0.2, 3)
    alpha = np.linspace(0.5, 2, 3)
    paramGrid = list(itertools.product(rank, regParam, alpha))
    MAP_lst = []  # store MAP results
    precision_at_500_lst = []  # store precision at 500 results
    for combo in paramGrid:
        rank_, regParam_, alpha_ = combo
        # Note: cold start strategy 'drop' avoids NaN evaluation metrics
        if extension == "log":
            ratingCol = "log_count"
        else:
            ratingCol = "count"
        als = ALS(rank=rank_, regParam=regParam_, alpha=alpha_,
                  implicitPrefs=True, userCol="user_id_indexed",
                  itemCol="track_id_indexed", ratingCol=ratingCol,
                  coldStartStrategy="drop")
        model = als.fit(train)  # fit onto training data

        # get top 500 recommendations; returned dataframe has columns
        # user_id_indexed, recommendations: array<struct<track_id_indexed:int,rating:float>>
        userRecs = model.recommendForAllUsers(500)
        userRecs = userRecs.select(
            userRecs.user_id_indexed,
            userRecs.recommendations.track_id_indexed.alias("pred_list"))
        userRecs.createOrReplaceTempView("userRecs")  # create temporary view
        combined_df = spark.sql(
            '''select val_groupby.user_id_indexed user_id_indexed,
                      userRecs.pred_list pred_list,
                      val_groupby.track_id_indexed_collections track_id_indexed_collections
               from userRecs inner join val_groupby on
               val_groupby.user_id_indexed = userRecs.user_id_indexed'''
        )  # combine dfs with respect to user_id_indexed

        # use ranking metrics for evaluation
        predLabelsTuple = combined_df.rdd.map(
            lambda r: (r.pred_list, r.track_id_indexed_collections))
        metrics = RankingMetrics(predLabelsTuple)
        MAP = metrics.meanAveragePrecision
        precision_at_500 = metrics.precisionAt(500)
        MAP_lst.append(MAP)  # store MAP for each config
        precision_at_500_lst.append(
            precision_at_500)  # store precision at 500 for each config

        # print out validation evaluation result
        print("---------------------------------------")
        print("configs: \n")
        print("rank = " + str(rank_) + " , regParam = " + str(regParam_) +
              " , alpha = " + str(alpha_))
        print("\n")
        print("MAP = " + str(MAP))
        print("Precision at 500 = " + str(precision_at_500))

    best_index = MAP_lst.index(np.max(MAP_lst))  # index of the highest-MAP config
    rank_opt, regParam_opt, alpha_opt = paramGrid[best_index]
    print("---------------------------------------")
    print("optimal configs: \n")
    print("rank = " + str(rank_opt) + " , regParam = " + str(regParam_opt) +
          " , alpha = " + str(alpha_opt))
    print("\n")
    print("MAP = " + str(np.max(MAP_lst)))
    print("Precision at 500 = " + str(precision_at_500_lst[best_index]))
def tune_ALS_map(train_read, val_read, val_true_list, iteration, regParams,
                 ranks):
    """
    Grid-search function that selects the best model by RMSE and by MAP on
    the validation data.

    Parameters
    ----------
    train_read : spark DF with columns ['user_id', 'book_id', 'rating']
    val_read : spark DF with columns ['user_id', 'book_id', 'rating']
    val_true_list : spark DF of (user_id, list of truly interacted books)
    iteration : int, max number of learning iterations
    regParams : list of float, one dimension of the hyper-param tuning grid
    ranks : list of int, one dimension of the hyper-param tuning grid

    Return
    ------
    The best fitted ALS models: one with the lowest RMSE and one with the
    highest MAP on the validation data.
    """
    # initial
    min_error = float('inf')
    best_iter1 = -1
    best_rank1 = -1
    best_regularization1 = 0
    best_model_rmse = None
    max_map = 0.0
    best_iter2 = -1
    best_rank2 = -1
    best_regularization2 = 0
    best_model_map = None
    for current_rank in ranks:
        for reg in regParams:
            # get ALS model
            # als = ALS().setMaxIter(iteration).setRank(rank).setRegParam(reg)
            als = ALS(maxIter=iteration, regParam=reg, rank=current_rank,
                      userCol='user_id', itemCol='book_id',
                      ratingCol='rating', coldStartStrategy="drop",
                      nonnegative=True)
            # train ALS model
            train_read.checkpoint()
            model_read = als.fit(train_read)
            # evaluate the model by computing the RMSE on the validation read data
            predictions_read = model_read.transform(val_read)
            # combine predictions on read and unread data
            # (predictions_unread is assumed to come from the enclosing scope)
            predictions_all = predictions_read.union(predictions_unread)
            # select top 500 books for each user to evaluate
            window = Window.partitionBy(predictions_all['user_id']).orderBy(
                predictions_all['prediction'].desc())
            val_pred_order = predictions_all.select(
                '*',
                rank().over(window).alias('rank')).filter(col('rank') <= 500)
            evaluator = RegressionEvaluator(metricName="rmse",
                                            labelCol="rating",
                                            predictionCol="prediction")
            rmse = evaluator.evaluate(val_pred_order)
            if rmse < min_error:
                min_error = rmse
                best_rank1 = current_rank
                best_regularization1 = reg
                best_iter1 = iteration
                best_model_rmse = model_read
            # evaluate the model by computing the MAP on the validation data
            val_pred_list = val_pred_order.select(
                'user_id', 'book_id').groupBy('user_id').agg(
                    expr('collect_list(book_id) as books'))
            val_RDD = val_pred_list.join(
                val_true_list,
                'user_id').rdd.map(lambda row: (row[1], row[2]))
            val_RDD.checkpoint()
            rankingMetrics = RankingMetrics(val_RDD)
            current_map = rankingMetrics.meanAveragePrecision
            if current_map > max_map:
                max_map = current_map
                best_rank2 = current_rank
                best_regularization2 = reg
                best_iter2 = iteration
                best_model_map = model_read
            print('{} latent factors and regularization = {} with maxIter {}: '
                  'validation RMSE is {} '
                  'validation MAP is {}'.format(current_rank, reg, iteration,
                                                rmse, current_map))
            with open('train01_read_eval.csv', 'ab') as f:
                np.savetxt(f,
                           [np.array([iteration, current_rank, reg, rmse,
                                      current_map])],
                           delimiter=",")
    print('\nThe best model selected by RMSE has {} latent factors and '
          'regularization = {} '
          'with maxIter = {}'.format(best_rank1, best_regularization1,
                                     best_iter1))
    print('\nThe best model selected by MAP has {} latent factors and '
          'regularization = {} '
          'with maxIter = {}'.format(best_rank2, best_regularization2,
                                     best_iter2))
    return best_model_rmse, best_model_map
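# Hedged sketch of the val_true_list argument tune_ALS_map expects: one row
# per user with the collected list of truly interacted books, so that the
# join on 'user_id' above yields (user_id, predicted books, true books) rows
# (construction assumed, not from the original):
val_true_list = val_read.groupBy('user_id') \
                        .agg(expr('collect_list(book_id) as true_books'))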
test_pred_order = predictions.select(
    '*', rank().over(window).alias('rank')).filter(col('rank') <= 500)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(test_pred_order)

# evaluate the model by computing the MAP on the test data
test_pred_list = test_pred_order.select(
    'user_id',
    'book_id').groupBy('user_id').agg(expr('collect_list(book_id) as books'))
test_RDD = test_pred_list.join(
    test_true_list, 'user_id').rdd.map(lambda row: (row[1], row[2]))
rankingMetrics = RankingMetrics(test_RDD)
current_map = rankingMetrics.meanAveragePrecision
print(
    '\nThe best baseline model selected by RMSE = {} has {} latent factors and '
    'regularization = {} with maxIter = {} MAP = {}'.format(
        rmse, current_rank, reg, iteration, current_map))
"""
# evaluate read model
train_new = train.withColumn('rating', when(train.is_read == 0, float('nan')).otherwise(train.rating))
train_read = train_new.na.drop()
train_unread = train.subtract(train_read)
test_new = test.withColumn('rating', when(test.is_read == 0, float('nan')).otherwise(test.rating))
test_read = test_new.na.drop()
prediction_val = best_model.transform(df_validation)
print(" Predictions for validation dataset: ------------------------------")
prediction_val.show()

prediction_test = best_model.transform(df_test)
print(" Predictions for test dataset: ------------------------------")
prediction_test.show()

actual_val = df_validation.groupBy("user_id").agg(
    expr("collect_set(book_id) as books"))
pred_val = user_recs.select('user_id', 'recommendations.book_id')
output_val = pred_val.join(actual_val, ['user_id']).select('book_id', 'books')
metrics_val = RankingMetrics(output_val.rdd)
result_val = metrics_val.meanAveragePrecision
result_val2 = metrics_val.precisionAt(20)
print("Mean average precision for validation dataset: " + str(result_val))
print("Precision @ 20 for validation dataset: " + str(result_val2))
rmse_val = evaluator.evaluate(prediction_val)
print("RMSE for validation dataset=" + str(rmse_val))

actual_test = df_test.groupBy("user_id").agg(
    expr("collect_set(book_id) as books"))
pred_test = user_recs.select('user_id', 'recommendations.book_id')
output_test = pred_test.join(actual_test, ['user_id']).select('book_id', 'books')
metrics_test = RankingMetrics(output_test.rdd)
result_test = metrics_test.meanAveragePrecision
result_test2 = metrics_test.precisionAt(20)
        predictedList.append(rankingDict[predictedRankings[i]])
    return (predictedList, rankingList)

# In[8]:

# joins the rdds on the user id:
# (user id, (list of actual rankings, list of predicted rankings))
test = groundTruthRankedRatings.join(predictedRankedRatings)
rankingsRDD = test.map(convertToRankings)
x = rankingsRDD.map(lambda t: (t[1][0], t[1][1]))  # combines the two rankings
metrics = RankingMetrics(x)  # fixed: was passing the still-keyed rankingsRDD
metrics.meanAveragePrecision

# In[6]:

num_folds = 5
fold1, fold2, fold3, fold4, fold5 = ratings.randomSplit(
    [.2, .2, .2, .2, .2], seed=9999)
dataList = [fold1, fold2, fold3, fold4, fold5]
for rank in [5, 10, 15, 20]:
    for numIterations in [5, 10, 15, 20]:
        print('rank is ' + str(rank) + ' numIterations is ' +
              str(numIterations))
        total_RMSE = 0
def main(spark, log_comp=False, drop_low=False, drop_thr=0):
    '''
    Parameters
    ----------
    spark : SparkSession object
    log_comp : bool, whether to apply log-compression to the counts
    drop_low : bool, whether to drop interactions with low counts
    drop_thr : int, count threshold below which interactions are dropped
    '''
    ## Load in datasets
    train_path = 'hdfs:/user/bm106/pub/project/cf_train.parquet'
    val_path = 'hdfs:/user/bm106/pub/project/cf_validation.parquet'
    test_path = 'hdfs:/user/bm106/pub/project/cf_test.parquet'
    train = spark.read.parquet(train_path)
    val = spark.read.parquet(val_path)
    test = spark.read.parquet(test_path)

    ## Downsample the data
    # Pick out user list in training set
    user_train = set(row['user_id'] for row in
                     train.select('user_id').distinct().collect())
    # Pick out user list in validation set
    user_val = set(row['user_id'] for row in
                   val.select('user_id').distinct().collect())
    # Get the users that appear only in training
    user_prev = list(user_train - user_val)
    # Random sampling to get 20%
    k = int(0.2 * len(user_prev))
    user_prev_filtered = random.sample(user_prev, k)
    train = train.where(train.user_id.isin(user_prev_filtered + list(user_val)))

    ## Create StringIndexers
    indexer_user = StringIndexer(inputCol="user_id",
                                 outputCol="user_id_indexed",
                                 handleInvalid='skip')
    indexer_user_model = indexer_user.fit(train)
    indexer_track = StringIndexer(inputCol="track_id",
                                  outputCol="track_id_indexed",
                                  handleInvalid='skip')
    indexer_track_model = indexer_track.fit(train)
    train = indexer_user_model.transform(train)
    train = indexer_track_model.transform(train)
    val = indexer_user_model.transform(val)
    val = indexer_track_model.transform(val)
    test = indexer_user_model.transform(test)
    test = indexer_track_model.transform(test)

    ## ALS model hyperparameter grid
    rank_ = [5, 10, 20]
    regParam_ = [0.1, 1, 10]
    alpha_ = [1, 5, 10]
    param_grid = it.product(rank_, regParam_, alpha_)

    ## Pick out users from validation set
    user_id = val.select('user_id_indexed').distinct()
    true_label = val.select('user_id_indexed', 'track_id_indexed') \
        .groupBy('user_id_indexed') \
        .agg(expr('collect_list(track_id_indexed) as true_item'))

    ## Log-compression: count -> log(1 + count)
    if log_comp:
        train = train.select('*', F.log1p('count').alias('count_log1p'))
        val = val.select('*', F.log1p('count').alias('count_log1p'))
        rateCol = "count_log1p"
    else:
        rateCol = "count"

    ## Drop interactions that have counts lower than the specified threshold
    if drop_low:
        train = train.filter(train['count'] > drop_thr)
        val = val.filter(val['count'] > drop_thr)

    for i in param_grid:
        print('Start Training for {}'.format(i))
        als = ALS(rank=i[0], maxIter=10, regParam=i[1],
                  userCol="user_id_indexed", itemCol="track_id_indexed",
                  ratingCol=rateCol, implicitPrefs=True, alpha=i[2],
                  nonnegative=True, coldStartStrategy="drop")
        model = als.fit(train)
        print('Finish Training for {}'.format(i))
        # Make top 500 recommendations for users in the validation set
        res = model.recommendForUserSubset(user_id, 500)
        pred_label = res.select('user_id_indexed',
                                'recommendations.track_id_indexed')
        pred_true_rdd = pred_label.join(F.broadcast(true_label),
                                        'user_id_indexed', 'inner') \
            .rdd \
            .map(lambda row: (row[1], row[2]))
        print('Start Evaluating for {}'.format(i))
        metrics = RankingMetrics(pred_true_rdd)
        map_ = metrics.meanAveragePrecision
        ndcg = metrics.ndcgAt(500)
        mpa = metrics.precisionAt(500)
        print(i, 'map score: ', map_, 'ndcg score: ', ndcg,
              'precision at 500: ', mpa)