def predict_ratings(self, df, model_type):
    # `model_type` (avoiding the built-in name `type`) selects which
    # pre-trained ALS model to apply.
    if model_type == 'Restaurants':
        model = ALSModel.load(modelRest_input_dir)
    else:
        model = ALSModel.load(modelAll_input_dir)
    predictions = model.transform(df)
    return predictions
def parameter_tuning(file_path, percent, ranks, regParams, maxIter=10, ks=[10, 200, 500]):
    '''Tune ALS hyperparameters over a grid of ranks and regularization values.'''
    print('Load train parquet...')
    df_train = spark.read.parquet(file_path + 'interactions_train_' + str(percent) + '.parquet')
    print('Load val parquet...')
    # df_val is loaded for completeness but not used below; evaluation runs on the test split
    df_val = spark.read.parquet(file_path + 'interactions_val_' + str(percent) + '.parquet')
    print('Load test parquet...')
    df_test = spark.read.parquet(file_path + 'interactions_test_' + str(percent) + '.parquet')
    tuning_dict = {}
    for rank in ranks:
        for regParam in regParams:
            print('Tune parameters: rank={} and reg={}...'.format(rank, regParam))
            model_name = 'rank_' + str(rank) + '_regParam_' + str(regParam) + '_downsample_' + str(percent)
            print('Train...')
            try:
                model = ALSModel.load(model_name)
            except Exception:
                # Train from scratch if no saved model exists for this configuration
                model = train(df=df_train, name=model_name, rank=rank, maxIter=maxIter, regParam=regParam)
            print('Evaluate...')
            eval_results = evaluation(df_test, model, ks)
            tuning_dict[model_name] = eval_results
    metrics_name = 'metrics_test' + str(round(time.time())) + '_downsample_' + str(percent) + '.pkl'
    with open(metrics_name, 'wb') as f:
        pickle.dump(tuning_dict, f)
    return tuning_dict
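# Hedged usage sketch for parameter_tuning above. The directory, downsample
# percentage, and grid values are illustrative assumptions, not values from the
# original code; `spark`, `train`, and `evaluation` must already be defined in
# the surrounding module.
if __name__ == '__main__':
    results = parameter_tuning(
        file_path='hdfs:/user/example/data/',  # hypothetical parquet directory
        percent=1,                             # hypothetical downsample percentage
        ranks=[10, 50, 100],
        regParams=[0.01, 0.1, 1.0],
    )
    for model_name, metrics in results.items():
        print(model_name, metrics)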
def get_val_metrics_outdated(spark, model_file, train_file, val_file, output_log_filepath):
    '''Gets val metrics for a given model, training, and validation data
    (outdated: the log-file output is not implemented).'''
    train = spark.read.parquet(train_file)
    train.createOrReplaceTempView('train')
    val = spark.read.parquet(val_file)
    val.createOrReplaceTempView('val')
    # use StringIndexer and a Pipeline to index user_id and book_id;
    # handleInvalid='skip' drops validation users/items not seen in training
    user_idxer = StringIndexer(inputCol='user_id', outputCol='user', handleInvalid='skip')
    item_idxer = StringIndexer(inputCol='book_id', outputCol='item', handleInvalid='skip')
    # index data
    pipeline = Pipeline(stages=[user_idxer, item_idxer])
    val = pipeline.fit(train).transform(val)
    alsmodel = ALSModel.load(model_file)
    preds = alsmodel.transform(val)
    recs = alsmodel.recommendForUserSubset(val, 400)
    top_items = recs.selectExpr('user as user', 'recommendations.item as top_items')
    good_preds = preds.where(preds.rating >= 3.0)
    recs_tall = recs.select(recs.user, explode(recs.recommendations))
    recs_taller = (recs_tall
                   .withColumn('item', recs_tall.col.item)
                   .withColumn('pred_rating', recs_tall.col.rating)
                   .select('user', 'item', 'pred_rating'))
    joined = recs_taller.join(good_preds, how='inner', on=['item', 'user'])
    return preds, recs, val
def main(spark, genre_file, model_file, save_file):
    # Read fuzzy book-genre information and map each book to the genre with the highest count
    print('Loading genre information and mapping genres ...')
    with open(genre_file, 'r') as f:
        genre_data = [json.loads(line) for line in f]
    book_genre_map = [(int(x['book_id']), sorted(x['genres'].items(), key=lambda y: y[1])[-1][0])
                      for x in genre_data if x['genres']]
    map_df = spark.createDataFrame(book_genre_map, ['id', 'genre'])

    # Load model and get the vector representation for the remaining books
    print('Loading model and getting item representations ...')
    model = ALSModel.load(model_file)
    item_vecs = model.itemFactors

    # Remove items whose vector representation is all zeros
    print('Removing items without representations ...')
    helper = F.udf(lambda x: all(v == 0 for v in x), BooleanType())
    item_vecs = item_vecs.withColumn('check', helper(item_vecs.features))
    item_vecs = item_vecs.filter(item_vecs.check == False).select(['id', 'features'])
    map_df = map_df.join(item_vecs, 'id')
    print('There are {} items left.'.format(map_df.count()))

    # Save data
    print('Saving to csv ...')
    map_df.toPandas().to_csv(save_file)
def __init__(self):
    self.model = None
    try:
        self.model = ALSModel.load(modelPath)
    except Exception as e:
        print(e)
        self.train()
def main(spark, model_file, data_file):
    '''
    Parameters
    ----------
    spark : SparkSession object
    model_file : string, path to the best model file
    data_file : string, path to the test parquet file to load
    '''
    # Loads test data
    data = spark.read.parquet(data_file).repartition(5000, "user_num_id")
    data.createOrReplaceTempView('data')
    # Loads trained ALS model
    model = ALSModel.load(model_file)
    users = data.select('user_num_id').distinct()
    truth = spark.sql(
        'SELECT user_num_id AS user_id, collect_list(track_num_id) AS label FROM data GROUP BY user_num_id'
    )
    # get recommendations
    userSubsetRecs = model.recommendForUserSubset(users, 500)
    recs = userSubsetRecs.select("recommendations.track_num_id", "user_num_id")
    # get input for ranking metrics
    pred = truth.join(recs, truth.user_id == recs.user_num_id, how='left').select('track_num_id', 'label')
    predictionAndLabels = pred.rdd.map(lambda lp: (lp.track_num_id, lp.label))
    print('--------Start Computing ... ...--------')
    metrics = RankingMetrics(predictionAndLabels)
    meanAP = metrics.meanAveragePrecision
    print('Mean Average Precision on test set = {}'.format(meanAP))
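# Minimal, self-contained sketch of the (predictions, labels) pairs that
# RankingMetrics consumes in main() above, handy for sanity-checking the join
# logic. The ids are made up for illustration; only a live SparkSession is
# assumed.
def _ranking_metrics_demo(spark):
    from pyspark.mllib.evaluation import RankingMetrics
    pairs = spark.sparkContext.parallelize([
        ([1, 2, 3, 4, 5], [1, 3, 7]),  # (recommended ids, relevant ids) for one user
        ([4, 1, 9], [9]),
    ])
    print(RankingMetrics(pairs).meanAveragePrecision)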
def evaluate_model(spark, model_file, train_file, val_file):
    # load in the data
    train = spark.read.parquet(train_file)
    train.createOrReplaceTempView('train')
    val = spark.read.parquet(val_file)
    val.createOrReplaceTempView('val')
    # use StringIndexer and a Pipeline to index on user_id and book_id
    user_idxer = StringIndexer(inputCol='user_id', outputCol='user', handleInvalid='skip')
    item_idxer = StringIndexer(inputCol='book_id', outputCol='item', handleInvalid='skip')
    # index data
    pipeline = Pipeline(stages=[user_idxer, item_idxer])
    indexer = pipeline.fit(train)
    train = indexer.transform(train)
    val = indexer.transform(val)
    # load the model
    model = ALSModel.load(model_file)
    rmse, prec_at, mean_avg_prec, ndcg = get_val_metrics(model, val)
    # print('Rank = %d, lambda = %.2f' % (model.getRank(), model.getRegParam()))
    print('RMSE: %f, precision at 500: %f, MAP %f, ndcg at 500 %f' % (rmse, prec_at, mean_avg_prec, ndcg))
def _compute_rmse(model: ALSModel, data: DataFrame) -> float:
    """
    Computes the RMSE error for a given model.

    Args:
        model: the model instance
        data: a spark DataFrame on which to run the model and compare the
            predicted vs actual ratings.

    Returns:
        rmse: the root-mean-squared error value
    """
    predictions = model.transform(data)
    # remove all NaN values
    predictions = predictions.na.drop(subset=["prediction"])
    try:
        evaluator = RegressionEvaluator(metricName="rmse",
                                        labelCol=config.RATINGS_COL,
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(predictions)
        return rmse
    except Exception as e:
        logger.warning('Error in computing rmse. Error description: {}'.format(e))
        return math.nan
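# Hedged usage sketch for _compute_rmse above; the paths are illustrative
# assumptions and `spark` is assumed to be an active SparkSession.
def _rmse_demo(spark):
    model = ALSModel.load('path/to/saved/als_model')      # hypothetical path
    test_df = spark.read.parquet('path/to/test.parquet')  # hypothetical split
    print('Test RMSE: {}'.format(_compute_rmse(model, test_df)))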
def main(spark, test_file, index_file, model_file):
    # Load the dataframe
    test = spark.read.parquet(test_file)
    indexer = PipelineModel.load(index_file)
    # transform user and track ids
    test = indexer.transform(test)
    # select distinct users for recommendations
    # testUsers = test.select("userNew").distinct().alias("userCol")
    # establish "ground truth"
    groundTruth = test.groupby("userNew").agg(F.collect_list("trackNew").alias("truth"))
    print("created ground truth df")
    alsmodel = ALSModel.load(model_file)
    rec = alsmodel.recommendForAllUsers(500)
    print("created recs")
    predictions = rec.join(groundTruth, rec.userNew == groundTruth.userNew, 'inner')
    scoreAndLabels = predictions.select('recommendations.trackNew', 'truth').rdd.map(tuple)
    metrics = RankingMetrics(scoreAndLabels)
    precision = metrics.precisionAt(500)
    map_out = metrics.meanAveragePrecision
    print(f"precision at 500: {precision}")
    print(f"map : {map_out}")
def __init__(self):
    self.sc = pyspark.SparkContext()
    self.spark = SparkSession \
        .builder \
        .appName("ALSMoviePrediction") \
        .getOrCreate()
    self.model = ALSModel.load('ALSModel')
def main(spark, sc, test_file, index_file, model_file, limit=0.01):
    # Load the dataframe
    test = spark.read.parquet(test_file)
    indexer = PipelineModel.load(index_file)
    # transform user and track ids
    test = indexer.transform(test)
    # select distinct users for recommendations, limit if needed
    testUsers = test.select("userNew").distinct().alias("userCol").sample(limit)
    # establish "ground truth"
    groundTruth = test.groupby("userNew").agg(F.collect_list("trackNew").alias("truth"))
    print("created ground truth df")
    alsmodel = ALSModel.load(model_file)
    # default version
    baseline(alsmodel, groundTruth, testUsers)
    annoy(alsmodel, groundTruth, testUsers, sc)
    trees = [10, 20, 30, 40, 50]
    ks = [-1, 10, 50, 100]
    for t in trees:
        for k in ks:
            annoy(alsmodel, groundTruth, testUsers, sc, n_trees=t, search_k=k)
    print("finished!")
def main(spark, model_file, data_file, user_file, track_file):
    # load ALS model
    als_model = ALSModel.load(model_file)
    user_indexer = StringIndexer.load(user_file)
    track_indexer = StringIndexer.load(track_file)
    # read in test data as parquet
    df_test = spark.read.parquet(data_file)
    pipeline = Pipeline(stages=[user_indexer, track_indexer])
    mapping = pipeline.fit(df_test)
    df_test = mapping.transform(df_test)

    ########### PERFORM RANKING METRICS ###########
    # create user actual-items dataframe
    actual_recs = df_test.groupBy('user_idx').agg(F.collect_list('track_idx').alias('track_idx'))
    # create user predicted-items dataframe
    user_subset = df_test.select('user_idx').distinct()
    pred_recs = als_model.recommendForUserSubset(user_subset, 500)
    pred_recs = pred_recs.select('user_idx', F.col('recommendations.track_idx').alias('track_idx'))
    # create user-item RDD & join on users
    perUserItemsRDD = pred_recs \
        .join(actual_recs, on='user_idx').rdd \
        .map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(perUserItemsRDD)
    # print results to the console
    print("Ranking Metrics MAP: ", rankingMetrics.meanAveragePrecision)
def main(spark, sc, test_file, index_file, model_file, limit=0.01):
    # Load the dataframe
    test_df = spark.read.parquet(test_file)
    model_indexer = PipelineModel.load(index_file)
    # transform test_df using the fitted indexer
    test_df = model_indexer.transform(test_df)
    # select distinct users for recommendation, sampled to save run time
    test_user = test_df.select('user_label').distinct().alias('userCol').sample(limit)
    # establish user_truth
    user_truth = test_df.groupby('user_label').agg(F.collect_list('book_label').alias('truth'))
    print('test data and user_truth have been preprocessed')
    # load als model
    als_model = ALSModel.load(model_file)
    # default settings
    baseline(als_model, user_truth, test_user)
    annoy(als_model, user_truth, test_user, sc)
    # hyper-parameter tuning:
    trees = [10, 15, 20]
    k_list = [-1, 5, 10]
    for i in trees:
        for j in k_list:
            annoy(als_model, user_truth, test_user, sc, n_trees=i, search_k=j)
    print('fast search feature has been established')
def main(spark, model_file, test_file):
    '''
    Parameters
    ----------
    spark : SparkSession object
    model_file : string, path to the serialized model file
    test_file : string, path to the test parquet file to load
    '''
    # Load the parquet file
    test = spark.read.parquet(test_file)
    test = test.sort('user', ascending=False)
    test.createOrReplaceTempView('test_table')
    model = ALSModel.load(model_file)
    user_subset = test.select("user").distinct()
    user_subset = model.recommendForUserSubset(user_subset, 500)
    user_subset = user_subset.select("user", col("recommendations.item").alias("item"))
    user_subset = user_subset.sort('user', ascending=False)
    print("sort user")
    predictionAndLabels = user_subset.join(test, ["user"], "inner").rdd.map(lambda tup: (tup[1], tup[2]))
    print("joined predictions and counts")
    metrics = RankingMetrics(predictionAndLabels)
    print("made metrics")
    MAP = metrics.meanAveragePrecision
    precision = metrics.precisionAt(500)
    ndcg = metrics.ndcgAt(500)
    print('MAP: %f' % MAP)
    print('Precision: %f' % precision)
    print('NDCG: %f' % ndcg)
def recForAllItem(df: DataFrame, model: ALSModel):
    print("recommend for all item")
    # Generate top 10 user recommendations for each item
    itemRecs = model.recommendForAllItems(param.get("top", 10))
    # Build the mapping from userCol_new back to the original userCol
    df_dup = df.dropDuplicates([param["userCol_new"]])
    arrs = df_dup.select(param["userCol_new"], param["userCol"]).toJSON().collect()
    maps = {}
    for arr in arrs:
        arr = json.loads(arr)
        maps[arr[param["userCol_new"]]] = arr[param["userCol"]]

    def map_fun(row: Row):
        # row[1] is the list of recommendation Rows
        list_rows = row[1]
        result = []
        for r in list_rows:
            result.append(Row(maps[r[param["userCol_new"]]], r["rating"]))
        return Row(row[0], result)

    rdd0 = itemRecs.rdd.map(map_fun)
    # Define the struct type of the remapped recommendations
    schema = StructType([
        StructField(param["itemCol"], IntegerType(), True),
        StructField("recommendations", ArrayType(StructType([
            StructField(param["userCol"], LongType(), True),
            StructField(param["ratingCol"], DoubleType(), True)]), True), True)
    ])
    spark = SparkSession.builder.appName("als").master("local[3]").getOrCreate()
    df_pd = spark.createDataFrame(rdd0, schema).toPandas()
    # df_pd.to_json(os.path.join(param["result_dir"], "itemRecs.json"), index=False, orient="split")
    return df_pd
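# Alternative sketch for the remapping in recForAllItem: join back to the
# original user ids on the executors instead of collecting a Python dict onto
# the driver. It returns a flat (item, user, rating) table rather than the
# nested recommendations array; column names follow the same `param` dict, and
# the F alias import is an assumption about the surrounding module.
from pyspark.sql import functions as F

def recForAllItemJoin(df: DataFrame, model: ALSModel):
    item_recs = model.recommendForAllItems(param.get("top", 10))
    item_col = item_recs.columns[0]  # the item id column produced by ALS
    exploded = item_recs.select(item_col, F.explode("recommendations").alias("rec"))
    mapping = df.dropDuplicates([param["userCol_new"]]) \
                .select(param["userCol_new"], param["userCol"])
    return (exploded
            .join(mapping, F.col("rec." + param["userCol_new"]) == F.col(param["userCol_new"]))
            .select(item_col, param["userCol"], F.col("rec.rating").alias(param["ratingCol"]))
            .toPandas())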
def main(spark, model_file):
    train = spark.read.parquet('./train.parquet')
    # validation = spark.read.parquet('./validation.parquet')
    test = spark.read.parquet('./test.parquet')
    train_model = ALSModel.load(model_file)
    users = test.select('convert_user_id').distinct()
    user_recs = train_model.recommendForUserSubset(users, 500)
    prediction_df = user_recs.select('convert_user_id', 'recommendations.convert_track_id')
    true_df = test.groupBy('convert_user_id').agg(expr('collect_list(convert_track_id) as true_items'))
    prediction_df.write.parquet('./recommendation_count.parquet')
    true_df.write.parquet('./true_count.parquet')
    prediction_rdd = prediction_df.join(true_df, 'convert_user_id') \
        .rdd \
        .map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(prediction_rdd)
    print(rankingMetrics.meanAveragePrecision)
    print(rankingMetrics.precisionAt(500))
def main(spark, test_file, model_file):
    test = spark.read.parquet(test_file)
    test_df = test.select('user_label', 'track_label', 'count')
    test_grouped = test_df.groupBy('user_label').agg(
        F.collect_list(F.col('track_label')).alias('track_label'))
    # model = MatrixFactorizationModel.load(sc, model_file)
    model = ALSModel.load(model_file)
    # Get the predictions
    # Generate top 10 track recommendations for each user
    predictions = model.recommendForAllUsers(10)
    prediction_df = predictions.rdd.map(
        lambda r: (r.user_label, [i[0] for i in r.recommendations])).toDF()
    prediction_df = prediction_df.selectExpr("_1 as user_label", "_2 as recommendations")
    # Join tables
    test_pred = test_grouped.join(prediction_df, "user_label", "inner")
    # Instantiate ranking metrics to compare predicted and actual tracks
    rdd = test_pred.select('recommendations', 'track_label').rdd
    ranking_metrics = RankingMetrics(rdd)
    # MAP
    print("MAP = %s" % ranking_metrics.meanAveragePrecision)
def main(spark, txt):
    model = ALSModel.load('hdfs:/user/jm7955/' + args.model)
    distinct_users = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' % args.distinct)
    print("distinct_users")
    print('%d seconds elapsed' % int(timer() - start))
    # distinct_users.show()
    labels = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' % args.labels)
    print("labels")
    # labels.show()
    print('%d seconds elapsed' % int(timer() - start))
    predictions = model.recommendForUserSubset(distinct_users, 500) \
        .select('user', F.col('recommendations.item').alias('item'))
    print("predictions")
    # predictions.show()
    print('%d seconds elapsed' % int(timer() - start))
    predictionsAndLabels = predictions.join(labels, ["user"], "inner").rdd.map(lambda tup: (tup[1], tup[2]))
    print("predictionsAndLabels")
    print('%d seconds elapsed' % int(timer() - start))
    metrics = RankingMetrics(predictionsAndLabels)
    print('%d seconds elapsed' % int(timer() - start))
    with open(txt, 'w') as file:
        file.write('metrics.meanAveragePrecision: %s\n' % metrics.meanAveragePrecision)
        file.write('metrics.precisionAt(500) %s\n' % metrics.precisionAt(500))
        file.write('metrics.ndcgAt(500) %s\n' % metrics.ndcgAt(500))
def recommend(self, num, model1, df2, user=None):
    model = ALSModel.load("nest_recom_trained_model")
    # model = trained_model.fit(data)
    userRecs = model.recommendForAllUsers(num)
    aa = userRecs.withColumn("recommendations", F.explode("recommendations"))
    aa = aa.select("new_userId", "recommendations.new_propertyId", "recommendations.rating")
    # aa_joined = aa.join(df2, ['new_userId', 'new_propertyId'], 'inner')
    aa_joined = aa.join(df2, ['new_userId', 'new_propertyId'])
    # df = df1.join(df2, (df1.x1 == df2.x1) & (df1.x2 == df2.x2))
    final_df1 = (aa_joined
                 .select("userId", "propertyId", "rating")
                 .withColumn("Recommendations", F.struct(F.col("propertyId"), F.col("rating")))
                 ).select("userId", "Recommendations")
    # final_df2 = final_df1.filter(final_df1.new_userId == userId)
    ## final_json = final_df1.groupby("userId").agg(F.collect_list("Recommendations").alias("Recommendations"))
    ## final_json.coalesce(1).write.format('json').save('cc.json')
    return final_df1
def main(spark, sc, test_file, model_file):
    test_data = spark.read.parquet(test_file)
    test_data.createOrReplaceTempView('test_data')
    test_users = spark.sql("select distinct user_id from test_data limit 800")
    # test_users = test_data.select("user_id").distinct().alias("user_id")
    groundTruth_test = test_data.groupby("user_id").agg(
        F.collect_list("book_id").alias("test_truth"))
    als_model = ALSModel.load(model_file)
    brute_force(als_model, groundTruth_test, test_users)
    trees = [10, 20, 40, 50]
    ks = [-1, 10, 50, 100]
    # annoy(alsmodel, groundTruth, testUsers, sc)
    for t in trees:
        for k in ks:
            annoy_model(als_model, sc, groundTruth_test, test_users, n_trees=t, search_k=k)
    print("finished!")
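# brute_force and annoy_model are defined elsewhere in this project; the sketch
# below shows one plausible shape for the Annoy variant, assuming the annoy
# library and the standard ALSModel userFactors/itemFactors layout. It is an
# illustration, not the original implementation, and collects all factors to
# the driver, so it is only viable for modest-sized models.
from annoy import AnnoyIndex

def annoy_model_sketch(als_model, n_trees=10, search_k=-1, top_k=500):
    rank = als_model.rank
    index = AnnoyIndex(rank, 'dot')  # inner-product similarity, as in ALS scoring
    for row in als_model.itemFactors.collect():
        index.add_item(row['id'], row['features'])
    index.build(n_trees)
    # Query the index with each user's latent factors
    return {row['id']: index.get_nns_by_vector(row['features'], top_k, search_k=search_k)
            for row in als_model.userFactors.collect()}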
def _load_model(self):
    try:
        # path = Path(self.model_folder + '/')
        self.best_model = ALSModel.load(self.model_folder)
        print('Model load succeeded.')
    except Exception:
        print('Model load failed. You can re-train the model.')
        raise
def test(self, test):
    '''Evaluate the model by computing the RMSE on the test data.

    :param test: test dataset
    '''
    model = ALSModel.load(self.args.model_storage_path)
    evaluator = RegressionEvaluator(labelCol='stars', metricName='rmse')
    predictions = model.transform(test)
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = " + str(rmse))
def main(spark):
    '''
    Parameters
    ----------
    spark : SparkSession object
    '''
    test_file = 'hdfs:/user/bm106/pub/project/cf_test.parquet'
    test = spark.read.parquet(test_file)
    test.createOrReplaceTempView('test')
    w = Window.partitionBy("user_id")

    def ratio_count(c, w):
        return col(c) / count(c).over(w)

    test = test.select("user_id", "track_id", ratio_count("count", w).alias("count"))
    test.createOrReplaceTempView('test')
    print("Ratio scores done")
    train_sample = spark.read.parquet('hdfs:/user/dev241/extension4_ratio.parquet')
    train_sample.createOrReplaceTempView('train_sample')
    print("Training sample ext4 loaded")
    # fitted indexing pipeline; lower-case name avoids shadowing pyspark's StringIndexer
    string_indexer = PipelineModel.load('hdfs:/user/dev241/DieterStringIndexer')
    test_idx = string_indexer.transform(test)
    train_idx = string_indexer.transform(train_sample)
    # best hyperparameters: rank = 78, alpha = 14.287069059772636, reg = 0.41772043857578584
    model = ALSModel.load("Extension4_ratio")
    print('Model loaded')
    # test ranking metrics
    test_idx = test_idx.select('user_idx', 'track_idx', 'count')
    test_users = test_idx.select('user_idx').distinct()
    test_comb = test_idx.groupBy('user_idx').agg(F.collect_set('track_idx').alias('test_labels'))
    track_number = 500
    rec_test = spark.read.parquet('hdfs:/user/dev241/rec_test4.parquet')
    print('Rec test loaded.')
    join = test_comb.join(rec_test, test_comb.user_idx == rec_test.user_idx)
    print('Join done.')
    j4 = join.toDF('user_idx', 'test_labels', 'user_idx2', 'recommendations')
    j4.write.parquet("ext4join")
    print('j4 parquet written')
    predictionAndLabels = join.rdd.map(
        lambda r: ([track.track_idx for track in r.recommendations], r.test_labels))
    print('Map done.')
    metrics = RankingMetrics(predictionAndLabels)
    print('RM done.')
    mavgp = metrics.meanAveragePrecision
    print("Test mean Average Precision : ", mavgp)
def recForAllUser(df: DataFrame, model: ALSModel):
    print("recommend for all user")
    # Generate top 10 recommendations for each user
    userRecs: DataFrame = model.recommendForAllUsers(param.get("top", 10))
    userRecs_pd = userRecs.join(
        df.select(param["userCol"], param["userCol_new"]).dropDuplicates([param["userCol_new"]]),
        param["userCol_new"]) \
        .select(param["userCol"], "recommendations") \
        .toPandas()
    # userRecs_pd.to_json(os.path.join(param["result_dir"], "userRecs.json"), index=False, orient="split")
    return userRecs_pd
def __import_model(self, mlInstance):
    """
    Reads an ML instance from the input path

    :mlInstance: Path to the saved model
    """
    self.model = ALSModel.load(mlInstance)
    predictions = self.model.transform(self.test)
    evaluator = RegressionEvaluator(metricName='rmse',
                                    labelCol='product_rating',
                                    predictionCol='prediction')
    self.rmse = evaluator.evaluate(predictions)
def main(spark, model_file, train_data_file, test_data_file):
    time_a = time.time()
    start = time_a
    training_data = spark.read.parquet(train_data_file)
    indexer_id = StringIndexer(inputCol="user_id", outputCol="userindex").setHandleInvalid("skip")
    indexer_id_model = indexer_id.fit(training_data)
    indexer_item = StringIndexer(inputCol="track_id", outputCol="itemindex").setHandleInvalid("skip")
    indexer_item_model = indexer_item.fit(training_data)
    testing_data = spark.read.parquet(test_data_file)
    testing_data = indexer_id_model.transform(testing_data)
    testing_data = indexer_item_model.transform(testing_data)
    testing_data = testing_data.select('userindex', 'itemindex', 'count')
    print('Finished Indexing!')
    time_b = time.time()
    print(time_b - time_a)
    time_a = time_b
    model = ALSModel.load(model_file)
    prediction = model.recommendForAllUsers(500).select('userindex', 'recommendations.itemindex')
    print('Finished Prediction DF!')
    testing_df = testing_data.groupBy('userindex').agg(expr('collect_list(itemindex) as item_list'))
    print('Finished Label DF!')
    predictionAndLabels = prediction.join(testing_df, 'userindex')
    print('Joined Prediction and Labels!')
    time_b = time.time()
    print(time_b - time_a)
    time_a = time_b
    pred_df = predictionAndLabels.select(['itemindex', 'item_list']).rdd.map(list)
    metrics = RankingMetrics(pred_df)
    print('Ranking Metrics Calculated!')
    time_b = time.time()
    print(time_b - time_a)
    time_a = time_b
    eva = metrics.meanAveragePrecision
    print("Model on Testing Data gives MAP= ", eva)
    print('Process Finished!')
    print(time.time() - start)
def main():
    spark = SparkSession.builder.appName('test').getOrCreate()
    als_model = ALSModel.load('anshul_project/als_sampling')
    test_data = spark.read.parquet('anshul_project/test_index.parquet')
    als_predictions = als_model.transform(test_data)
    reg_evaluator = RegressionEvaluator(metricName="rmse",
                                        labelCol="count",
                                        predictionCol="prediction")
    rmse = reg_evaluator.evaluate(als_predictions)
    print("Test rmse " + str(rmse))
def basic_rec_val(spark, dirname, rank, regParam, k, random_seed):
    val_set = spark.read.parquet(f'{dirname}/val.parquet')
    print(f'Validating on model with rank = {rank} and regParam = {regParam} trained using {dirname} data ...')
    # load the corresponding trained model
    model = ALSModel.load(f'{dirname}/{rank}_{regParam}_model')
    # compute RMSE on the validation set
    predictions = model.transform(val_set)
    evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
    rmse = evaluator.evaluate(predictions)
    print(f'rmse: {rmse}')
    print(f'Constructing top {k} books recommended per user ...')
    val_users = val_set.select('user_id').distinct()
    start_time = time.time()
    perUserPredictedTopKItemsDF = model.recommendForUserSubset(val_users, k)
    myudf = udf(extract_item, ArrayType(IntegerType()))
    perUserPredictedTopKItemsDF = perUserPredictedTopKItemsDF.withColumn(
        'predictions', myudf(perUserPredictedTopKItemsDF['recommendations'])).drop('recommendations')
    print('Constructing actual books per user ...')
    perUserActualItemsDF = val_set.filter(column('rating') >= 3.0).groupBy('user_id').agg(
        expr('collect_list(book_id) as book_ids'))
    print('Constructing Ranking Metrics ...')
    perUserItemsRDD = perUserPredictedTopKItemsDF.join(
        perUserActualItemsDF, 'user_id').rdd.map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(perUserItemsRDD)
    precisionAtK = rankingMetrics.precisionAt(k)
    mAP = rankingMetrics.meanAveragePrecision
    end_time = time.time()
    time_delta = str(datetime.timedelta(seconds=end_time - start_time))
    print(f'p@{k}: {precisionAtK}')
    print(f'mAP: {mAP}')
    print(f'run time: {time_delta}')
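# extract_item is not shown in this snippet; given how the UDF is used above, a
# plausible definition pulls the book ids out of the recommendation structs:
def extract_item(recommendations):
    # each element is a Row(book_id=..., rating=...); keep only the ids
    return [rec[0] for rec in recommendations]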
def main(spark, test_file, train_file, model_path):
    # Read data from parquet
    print('Reading parquet file ...')
    test = spark.read.parquet(test_file)
    test.createOrReplaceTempView('test')
    train = spark.read.parquet(train_file)
    train.createOrReplaceTempView('train')
    # Load the best model from training
    print('Loading model ...')
    best_model = ALSModel.load(model_path)
    # get recommendations for users in the test set
    print('Evaluating model on test set ...')
    test_users = test.select("user_id").distinct()
    rec_test = best_model.recommendForUserSubset(test_users, 700)
    pred_test_700 = rec_test.select(
        rec_test.user_id, rec_test.recommendations.book_id.alias('rec_book_id'))
    sub_train_test = spark.sql('SELECT user_id, book_id \
                                FROM train \
                                WHERE user_id IN (SELECT DISTINCT user_id FROM test)')
    df_train_book_test = sub_train_test.groupby('user_id').agg(
        F.collect_set('book_id').alias('train_book_id'))
    df_join_test = pred_test_700.join(df_train_book_test, 'user_id')
    diff = F.udf(book_diff, ArrayType(IntegerType()))
    df_join_pred_test = df_join_test.withColumn(
        'predictions', diff(df_join_test.rec_book_id, df_join_test.train_book_id))
    pred_test = df_join_pred_test.select(df_join_pred_test.user_id, df_join_pred_test.predictions)
    # get true preferences of users in the test set
    label_test = test.filter(test.rating >= 3).groupby("user_id").agg(F.collect_list("book_id"))
    predAndLabel_test = pred_test.join(label_test, 'user_id').rdd.map(lambda row: (row[1], row[2]))
    # Use Mean Average Precision as the evaluation metric
    metrics_test = RankingMetrics(predAndLabel_test)
    MAP_test = metrics_test.meanAveragePrecision
    pak_100_test = metrics_test.precisionAt(100)
    pak_500_test = metrics_test.precisionAt(500)
    print('\n')
    print('Ranking scores of the best model on test data: MAP = {}, Precision@100 = {}, Precision@500 = {}'
          .format(MAP_test, pak_100_test, pak_500_test))
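# book_diff is not defined in this snippet; a plausible implementation removes
# books the user already saw in training from the recommended list, preserving
# the ranking order:
def book_diff(rec_book_id, train_book_id):
    seen = set(train_book_id)
    return [b for b in rec_book_id if b not in seen]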
def whiskey_als(df):
    model = ALSModel.load("hdfs://master/ALSModel/")
    predict = model.recommendForItemSubset(df, 1)
    df_user = predict.select(
        predict.whiskeyId,
        predict.recommendations[0].userId.alias("userId"),
    )
    df_whiskey = model.recommendForUserSubset(df_user, 5)
    result_df = df_user.join(df_whiskey, on=['userId'], how='left')
    result_df = result_df.join(df, on=['whiskeyId'], how='left')
    result_df = result_df.select("user_name", "whiskeyId", "recommendations")
    return result_df