def main():
    """Train an ALS recommender on the MovieLens ratings file, report RMSE,
    and print sample recommendations.

    Relies on module-level names: SparkSession, ALS, RegressionEvaluator and
    extract() (parses one CSV row; returns None for rows that should be
    dropped, e.g. the header).
    """
    spark = SparkSession \
        .builder \
        .appName("ALSExample") \
        .getOrCreate()

    # $example on$
    lines = spark.read.text(
        "/Users/asapehrsson/dev/learn/hadoop_spark_jupyter/data/ml-latest-small/ratings.csv"
    ).rdd
    parts = lines.map(lambda row: row.value.split(","))
    # extract() returns None for unparseable rows; drop them before building the DataFrame.
    ratingsRDD = parts.map(extract).filter(lambda x: x is not None)
    ratings = spark.createDataFrame(ratingsRDD)
    (training, test) = ratings.randomSplit([0.8, 0.2])

    # Build the recommendation model using ALS on the training data.
    # coldStartStrategy='drop' ensures we don't get NaN evaluation metrics
    # for users/items unseen during training.
    als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId",
              ratingCol="rating", coldStartStrategy="drop")
    model = als.fit(training)

    # Evaluate the model by computing the RMSE on the test data.
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = " + str(rmse))

    # Generate top 10 movie recommendations for each user.
    userRecs = model.recommendForAllUsers(10)
    # Generate top 10 user recommendations for each movie.
    movieRecs = model.recommendForAllItems(10)
    # Generate top 10 movie recommendations for a specified set of users.
    users = ratings.select(als.getUserCol()).distinct().limit(3)
    userSubsetRecs = model.recommendForUserSubset(users, 10)
    # Generate top 10 user recommendations for a specified set of movies.
    movies = ratings.select(als.getItemCol()).distinct().limit(3)
    movieSubSetRecs = model.recommendForItemSubset(movies, 10)
    # $example off$

    userRecs.show()
    movieRecs.show()
    userSubsetRecs.show()
    movieSubSetRecs.show()

    spark.stop()
def run_spark_als(file_path):
    """Train and evaluate ALS models on the event table.

    Implicit events are mapped to explicit ratings (view=1, addtocart=5,
    transaction=10), then a small sweep over maxIter/regParam is run and
    the RMSE of each combination is printed.

    Relies on module-level names: read_data, sql_context, ALS,
    RegressionEvaluator.
    NOTE(review): file_path is currently unused — read_data() decides what
    is loaded; confirm intent.
    """
    read_data(sql_context)
    als_data_frame = sql_context.sql("""
        select visitorid, itemid,
               case when event = 'view' then 1
                    when event = 'addtocart' then 5
                    when event = 'transaction' then 10
                    else 0
               end as rate
        from event_table
        """)
    print(als_data_frame.count())
    als_data_frame.show()
    (training, test) = als_data_frame.randomSplit([0.7, 0.3])

    # Build the recommendation model using ALS on the training data.
    # coldStartStrategy='drop' avoids NaN evaluation metrics.
    base_reg = 0.01
    for iterNum in range(1):
        for regParm in range(1):
            # BUG FIX: regParam was hard-coded to 0.3 while the log line
            # below reported base_reg, so the printed hyper-parameter never
            # matched the fitted model. Use base_reg for both.
            als = ALS(maxIter=iterNum + 1, regParam=base_reg,
                      implicitPrefs=False,
                      userCol="visitorid", itemCol="itemid", ratingCol="rate",
                      coldStartStrategy="drop")
            model = als.fit(training)

            # Evaluate the model by computing the RMSE on the test data.
            predictions = model.transform(test)
            evaluator = RegressionEvaluator(metricName="rmse", labelCol="rate",
                                            predictionCol="prediction")
            rmse = evaluator.evaluate(predictions)
            print("iterNum: %s, regParam: %s, Root-mean-square error = %s"
                  % (iterNum, base_reg, str(rmse)))
            base_reg += 0.1

    model.itemFactors.show()
    model.userFactors.show()

    # Generate top 10 movie recommendations for each user.
    userRecs = model.recommendForAllUsers(10)
    # Generate top 10 user recommendations for each movie.
    movieRecs = model.recommendForAllItems(10)
    # Generate top 10 movie recommendations for a specified set of users.
    users = als_data_frame.select(als.getUserCol()).distinct().limit(3)
    userSubsetRecs = model.recommendForUserSubset(users, 10)
    # Generate top 10 user recommendations for a specified set of movies.
    movies = als_data_frame.select(als.getItemCol()).distinct().limit(3)
    movieSubSetRecs = model.recommendForItemSubset(movies, 10)
    # $example off$

    userRecs.show(20, False)
    movieRecs.show(20, False)
    userSubsetRecs.show(20, False)
    movieSubSetRecs.show(20, False)
class Recommendation:
    """ALS-based movie recommender trained at construction time.

    Relies on module-level names: ALS, RegressionEvaluator, logger.
    """

    def __init__(self, spark, filename):
        # TODO: load the ratings data from `filename` via `spark` into
        # self.ratings (columns: userId, movieId, rating).
        #
        # self.ratings = ...
        # NOTE(review): until the TODO above is implemented, `ratings`
        # below is an unresolved name — confirm where it should come from.
        (training, test) = ratings.randomSplit([0.8, 0.2])

        # Build the recommendation model using ALS on the training data.
        # coldStartStrategy='drop' avoids NaN evaluation metrics.
        self.als = ALS(maxIter=5, regParam=0.01, userCol="userId",
                       itemCol="movieId", ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(training)

        # Evaluate the model by computing the RMSE on the test data.
        # BUG FIX: previously called bare `model.transform(test)`, but no
        # local `model` exists here — the fitted model is self.model.
        predictions = self.model.transform(test)
        evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(predictions)
        logger.info("Root-mean-square error = " + str(rmse))

    def recommend_for_users(self, num_movies):
        """Top num_movies movie recommendations for each user."""
        return self.model.recommendForAllUsers(num_movies)

    def recommend_for_movies(self, num_recommendations):
        """Top num_recommendations user recommendations for each movie."""
        return self.model.recommendForAllItems(num_recommendations)

    def recommend_for_setusers(self, num_users):
        """Top num_users movie recommendations for 3 sample users.

        NOTE(review): the subset size is hard-coded to 3 and `num_users`
        is the per-user recommendation count — confirm naming intent.
        """
        users = self.ratings.select(self.als.getUserCol()).distinct().limit(3)
        return self.model.recommendForUserSubset(users, num_users)

    def recommend_for_setmovies(self, num_movies):
        """Top num_movies user recommendations for 3 sample movies."""
        movies = self.ratings.select(self.als.getItemCol()).distinct().limit(3)
        return self.model.recommendForItemSubset(movies, num_movies)
checkpointInterval=1000, intermediateStorageLevel='MEMORY_AND_DISK', finalStorageLevel='MEMORY_AND_DISK', coldStartStrategy='drop') model = als.fit(training) # Evaluate the model by computing the RMSE on the test data predictions = model.transform(test) evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") rmse = evaluator.evaluate(predictions) # print("Root-mean-square error = " + str(rmse)) item_rec = parts.select(als.getItemCol()).distinct() item_rec = item_rec.where(item_rec.itemId == baoming) item_rec = model.recommendForItemSubset(item_rec, num_user) data = item_rec.select("recommendations") data_list = data.rdd.map(lambda x: x[0]).take(1) data = [data_list[0][i] for i in range(len(data_list[0]))] data1 = sqlContext.createDataFrame(data) p_user = p_user.withColumnRenamed("userId", "userId_1") data2 = data1.join(p_user, data1["userId"] == p_user["userId_1"]) data3 = data2.select("user1", "rating") data3 = data3.sort("rating", ascending=False) data3 = data3.withColumn("rn", F.row_number().over(Window.orderBy("user1")))
class RecommendationEngine:
    """A movie recommendation engine backed by a single Spark ALS model.

    Relies on module-level names: ALS, explode, func (pyspark.sql.functions),
    logger, os.
    """

    def __train_model(self):
        """Train the ALS model with the current dataset."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5, regParam=0.01, userCol="userId",
                       itemCol="movieId", ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(self.ratingsdf)
        logger.info("ALS model built!")

    def __format_recs(self, recs, id_col, other_col):
        """Flatten a recommendFor*Subset result into a pandas JSON string.

        Explodes the `recommendations` array column, keeps `id_col` plus the
        recommended `other_col` (the predicted Rating is dropped), joins movie
        titles on movieId and serializes via pandas.
        """
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col(id_col),
            func.col('recommendations')[other_col].alias(other_col),
            func.col('recommendations')['Rating'].alias('Rating')).\
            drop('recommendations')
        recs = recs.drop('Rating')
        recs = recs.join(self.moviesdf, ("movieId"), 'inner')
        return recs.toPandas().to_json()

    def get_top_ratings(self, user_id, movies_count):
        """Recommends up to movies_count top unrated movies to user_id."""
        users = self.ratingsdf.select(self.als.getUserCol())
        users = users.filter(users.userId == user_id)
        recs = self.model.recommendForUserSubset(users, movies_count)
        return self.__format_recs(recs, 'userId', 'movieId')

    def get_top_movie_recommend(self, movie_id, user_count):
        """Recommends up to user_count top users for movie_id.

        (Docstring fixed: was a copy of the user-facing method's text.)
        """
        movies = self.ratingsdf.select(self.als.getItemCol())
        movies = movies.filter(movies.movieId == movie_id)
        recs = self.model.recommendForItemSubset(movies, user_count)
        return self.__format_recs(recs, 'movieId', 'userId')

    def get_ratings_for_movie_ids(self, user_id, movie_id):
        """Given a user_id and a movie_id, predict the rating for the pair."""
        request = self.spark_session.createDataFrame([(user_id, movie_id)],
                                                     ["userId", "movieId"])
        ratings = self.model.transform(request).collect()
        return ratings

    def add_ratings(self, user_id, movie_id, ratings_given):
        """Add one movie rating (user_id, movie_id, rating) and retrain.

        Returns the added rating as a pandas JSON string.
        """
        new_ratings = self.spark_session.createDataFrame(
            [(user_id, movie_id, ratings_given)],
            ["userId", "movieId", "rating"])
        # Add new ratings to the existing ones, then re-train the ALS model.
        self.ratingsdf = self.ratingsdf.union(new_ratings)
        self.__train_model()
        return new_ratings.toPandas().to_json()

    def get_history(self, user_id):
        """Get rating history for a user as a pandas JSON string."""
        self.ratingsdf.createOrReplaceTempView("ratingsdata")
        user_history = self.spark_session.sql(
            'SELECT userId, movieId, rating from ratingsdata where userId = "%s"'
            % user_id)
        user_history = user_history.join(self.moviesdf, ("movieId"), 'inner')
        return user_history.toPandas().to_json()

    def __init__(self, spark_session, dataset_path):
        """Init the recommendation engine given a Spark session and a dataset path."""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session

        # Load ratings data for later use; drop null rows and the unused timestamp.
        logger.info("Loading Ratings data...")
        ratings_file_path = os.path.join(dataset_path,
                                         '../../datasets/ratings.csv')
        self.ratingsdf = spark_session.read.csv(ratings_file_path,
                                                header=True,
                                                inferSchema=True).na.drop()
        self.ratingsdf = self.ratingsdf.drop("timestamp")

        # Load movies data for later use; drop null rows and the unused genres.
        logger.info("Loading Movies data...")
        movies_file_path = os.path.join(dataset_path,
                                        '../../datasets/movies.csv')
        self.moviesdf = spark_session.read.csv(movies_file_path,
                                               header=True,
                                               inferSchema=True).na.drop()
        self.moviesdf = self.moviesdf.drop("genres")

        # Train the model.
        self.__train_model()
class RecommendationEngine:
    """A movie recommendation engine over three nested dataset slices.

    The loaded dataset is split into three nested slices (first third, first
    two thirds, full set) and one ALS model is trained per slice; every public
    method takes a `model` index (0, 1 or 2) selecting the slice/model pair.

    Relies on module-level names: ALS, explode, func (pyspark.sql.functions),
    logger, os.
    """

    # Attribute names for the slices and their fitted models, indexed by
    # the public `model` argument (0, 1, 2). Kept as the original public
    # attribute names (df0..df2, model1..model3) for compatibility.
    _DF_ATTRS = ('df0', 'df1', 'df2')
    _MODEL_ATTRS = ('model1', 'model2', 'model3')

    def __get_df(self, model):
        """Return the dataset slice for a model index."""
        return getattr(self, self._DF_ATTRS[model])

    def __set_df(self, model, df):
        """Replace the dataset slice for a model index."""
        setattr(self, self._DF_ATTRS[model], df)

    def __get_model(self, model):
        """Return the fitted ALS model for a model index."""
        return getattr(self, self._MODEL_ATTRS[model])

    def __train_all_model(self):
        """Split the dataset into three nested slices and train one model each."""
        logger.info("Splitting dataset")
        self.df0 = self.df.limit(int(self.dataset_count / 3))
        self.df1 = self.df.limit(int(self.dataset_count * 2 / 3))
        self.df2 = self.df
        print('df 0 count = ' + str(self.df0.count()))
        print('df 1 count = ' + str(self.df1.count()))
        print('df 2 count = ' + str(self.df2.count()))
        logger.info("Dataset Splitted !")
        # Previously the three fits were written out three times; delegate to
        # __train_model, which holds the single copy of the ALS configuration.
        for i in range(3):
            logger.info("Training the ALS model %s" % (i + 1))
            self.__train_model(i)
            logger.info("ALS model %s built!" % (i + 1))

    def __train_model(self, model):
        """(Re)train the ALS model for slice `model` (0, 1 or 2)."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5, regParam=0.01, userCol="userId",
                       itemCol="movieId", ratingCol="rating",
                       coldStartStrategy="drop")
        setattr(self, self._MODEL_ATTRS[model],
                self.als.fit(self.__get_df(model)))
        logger.info("ALS model built!")

    def __format_recs(self, recs, id_col, other_col):
        """Flatten a recommendFor*Subset result into a pandas JSON string.

        Explodes the `recommendations` array column, keeps `id_col` plus the
        recommended `other_col` (the predicted Rating is dropped), joins movie
        titles on movieId and serializes via pandas.
        """
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col(id_col),
            func.col('recommendations')[other_col].alias(other_col),
            func.col('recommendations')['Rating'].alias('Rating')).\
            drop('recommendations')
        recs = recs.drop('Rating')
        recs = recs.join(self.moviesdf, ("movieId"), 'inner')
        return recs.toPandas().to_json()

    def get_top_ratings(self, model, user_id, movies_count):
        """Recommends up to movies_count top unrated movies to user_id."""
        users = self.__get_df(model).select(self.als.getUserCol())
        users = users.filter(users.userId == user_id)
        recs = self.__get_model(model).recommendForUserSubset(users,
                                                              movies_count)
        return self.__format_recs(recs, 'userId', 'movieId')

    def get_top_movie_recommend(self, model, movie_id, user_count):
        """Recommends up to user_count top users for movie_id."""
        movies = self.__get_df(model).select(self.als.getItemCol())
        movies = movies.filter(movies.movieId == movie_id)
        recs = self.__get_model(model).recommendForItemSubset(movies,
                                                              user_count)
        return self.__format_recs(recs, 'movieId', 'userId')

    def get_ratings_for_movie_ids(self, model, user_id, movie_id):
        """Given a user_id and a movie_id, predict the rating for the pair."""
        request = self.spark_session.createDataFrame([(user_id, movie_id)],
                                                     ["userId", "movieId"])
        return self.__get_model(model).transform(request).collect()

    def add_ratings(self, model, user_id, movie_id, ratings_given):
        """Add one rating to slice `model` and retrain that slice's model.

        Returns the added rating as a pandas JSON string. Column order
        (userId, rating, movieId) matches the dataset schema built in __init__.
        """
        new_ratings = self.spark_session.createDataFrame(
            [(user_id, ratings_given, movie_id)],
            ["userId", "rating", "movieId"])
        # Add the new rating to the slice, then re-train its model.
        self.__set_df(model, self.__get_df(model).union(new_ratings))
        self.__train_model(model)
        return new_ratings.toPandas().to_json()

    def get_history(self, model, user_id):
        """Get rating history for a user from slice `model` as pandas JSON."""
        self.__get_df(model).createOrReplaceTempView("ratingsdata")
        user_history = self.spark_session.sql(
            'SELECT userId, movieId, rating from ratingsdata where userId = "%s"'
            % user_id)
        user_history = user_history.join(self.moviesdf, ("movieId"), 'inner')
        return user_history.toPandas().to_json()

    def __init__(self, spark_session, dataset_path):
        """Init the recommendation engine given a Spark session and a dataset path.

        Loads every data_part_<i>.txt file found under dataset_path, unions
        them into one DataFrame, loads movie titles and trains all models.
        """
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session

        # Load ratings data: consume numbered parts until one is missing.
        logger.info("Loading Ratings data...")
        file_counter = 0
        while True:
            file_name = 'data_part_' + str(file_counter) + '.txt'
            dataset_file_path = os.path.join(dataset_path, file_name)
            if not os.path.isfile(dataset_file_path):
                break
            df_new = spark_session.read.csv(dataset_file_path, header=None,
                                            inferSchema=True)
            if file_counter == 0:
                self.df = df_new
            else:
                self.df = self.df.union(df_new)
            self.dataset_count = self.df.count()
            print('Data loaded = ' + str(self.dataset_count))
            print(file_name + 'Loaded !')
            file_counter += 1

        self.df = self.df.selectExpr("_c0 as userId", "_c1 as rating",
                                     "_c2 as movieId")
        self.df.show()

        # Load movie data for later use.
        logger.info("Loading Movie data...")
        movies_file_path = os.path.join(dataset_path, 'movie_titles.csv')
        self.moviesdf = spark_session.read.csv(movies_file_path, header=None,
                                               inferSchema=True)
        self.moviesdf = self.moviesdf.selectExpr("_c0 as movieId",
                                                 "_c1 as Year",
                                                 "_c2 as movie_title")

        # Train the models.
        self.__train_all_model()
# Report the winning configuration from the rank sweep.
# FIX: converted the Python 2 print statement to a print() call for
# consistency with the rest of the file.
print('The best model was trained with rank %s' % ranks[best_rank])
my_model = models[best_rank]

# COMMAND ----------

# TEST
Test.assertEquals(
    round(min_error, 2), 0.81,
    "Unexpected value for best RMSE. Expected rounded value to be 0.81. Got {0}"
    .format(round(min_error, 2)))
Test.assertEquals(
    ranks[best_rank], 12,
    "Unexpected value for best rank. Expected 12. Got {0}".format(
        ranks[best_rank]))
Test.assertEqualsHashed(
    als.getItemCol(), "18f0e2357f8829fe809b2d95bc1753000dd925a6",
    "Incorrect choice of {0} for ALS item column.".format(als.getItemCol()))
Test.assertEqualsHashed(
    als.getUserCol(), "db36668fa9a19fde5c9676518f9e86c17cabf65a",
    "Incorrect choice of {0} for ALS user column.".format(als.getUserCol()))
Test.assertEqualsHashed(
    als.getRatingCol(), "3c2d687ef032e625aa4a2b1cfca9751d2080322c",
    "Incorrect choice of {0} for ALS rating column.".format(
        als.getRatingCol()))

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2c) Testing Your Model
# MAGIC
# MAGIC So far, we used the `training_df` and `validation_df` datasets to select the best model. Since we used these two datasets to determine what model is best, we cannot use them to test how good the model is; otherwise, we would be very vulnerable to [overfitting](https://en.wikipedia.org/wiki/Overfitting). To decide how good our model is, we need to use the `test_df` dataset. We will use the `best_rank` you determined in part (2b) to create a model for predicting the ratings for the test dataset and then we will compute the RMSE.
(training, test) = ratings.randomSplit([0.8, 0.2]) # Build the recommendation model using ALS on the training data # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics # Create ALS model als = ALS(maxIter=5, regParam=0.01, userCol="invoice_num", itemCol="product_code", ratingCol="rating", coldStartStrategy="drop") model = als.fit(training) model.write().overwrite().save("models/recommendModel") invoiceRecs = model.recommendForAllUsers(10) invoiceRecs.show(truncate=False) print(als.getItemCol()) # Evaluate the model by computing the RMSE on the test data predictions = model.transform(test) evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") rmse = evaluator.evaluate(predictions) print("Root-mean-square error = " + str(rmse)) """ # Generate top 10 movie recommendations for each user invoiceRecs = model.recommendForAllUsers(10) # Generate top 10 user recommendations for each movie productRecs = model.recommendForAllItems(10) # Generate top 10 movie recommendations for 3 users
class RecommendationEngine:
    """A movie recommendation engine"""

    def __train_model(self):
        """Fit the ALS model on the currently loaded ratings DataFrame."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5, regParam=0.01, userCol="userId",
                       itemCol="movieId", ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(self.ratingsdf)
        logger.info("ALS model built!")

    def get_top_ratings(self, user_id, movies_count):
        """Return up to movies_count recommended movies for user_id as JSON."""
        subset = self.ratingsdf.select(self.als.getUserCol())
        subset = subset.filter(subset.userId == user_id)
        recs = self.model.recommendForUserSubset(subset, movies_count)
        # Flatten the per-user recommendations array into one row per movie.
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col('userId'),
            func.col('recommendations')['movieId'].alias('movieId'),
            func.col('recommendations')['Rating'].alias('Rating')).\
            drop('recommendations')
        recs = recs.drop('Rating')
        # Attach movie titles before serializing.
        recs = recs.join(self.moviesdf, ("movieId"), 'inner')
        return recs.toPandas().to_json()

    def get_top_movie_recommend(self, movie_id, user_count):
        """Return up to user_count recommended users for movie_id as JSON."""
        subset = self.ratingsdf.select(self.als.getItemCol())
        subset = subset.filter(subset.movieId == movie_id)
        recs = self.model.recommendForItemSubset(subset, user_count)
        # Flatten the per-movie recommendations array into one row per user.
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col('movieId'),
            func.col('recommendations')['userId'].alias('userId'),
            func.col('recommendations')['Rating'].alias('Rating')).\
            drop('recommendations')
        recs = recs.drop('Rating')
        recs = recs.join(self.moviesdf, ("movieId"), 'inner')
        return recs.toPandas().to_json()

    def get_movie_rating(self, user_id, movie_id):
        """Predict the rating user_id would give movie_id."""
        request = self.spark_session.createDataFrame(
            [(user_id, movie_id)], ["userId", "movieId"])
        return self.model.transform(request).collect()

    def __init__(self, spark_session, dataset_path):
        """Init the recommendation engine given a Spark context and a dataset path"""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session

        # Ratings: drop null rows and the unused timestamp column.
        logger.info("Loading Ratings data...")
        ratings_path = os.path.join(dataset_path, 'ratings.csv')
        self.ratingsdf = spark_session.read.csv(
            ratings_path, header=True, inferSchema=True).na.drop()
        self.ratingsdf = self.ratingsdf.drop("timestamp")

        # Movies: drop null rows and the unused genres column.
        logger.info("Loading Movies data...")
        movies_path = os.path.join(dataset_path, 'movies.csv')
        self.moviesdf = spark_session.read.csv(
            movies_path, header=True, inferSchema=True).na.drop()
        self.moviesdf = self.moviesdf.drop("genres")

        # Train the model.
        self.__train_model()
print 'For rank %s the RMSE is %s' % (rank, error) if error < min_error: min_error = error best_rank = err err += 1 als.setRank(ranks[best_rank]) print 'The best model was trained with rank %s' % ranks[best_rank] my_model = models[best_rank] # COMMAND ---------- # TEST Test.assertEquals(round(min_error, 2), 0.81, "Unexpected value for best RMSE. Expected rounded value to be 0.81. Got {0}".format(round(min_error, 2))) Test.assertEquals(ranks[best_rank], 12, "Unexpected value for best rank. Expected 12. Got {0}".format(ranks[best_rank])) Test.assertEqualsHashed(als.getItemCol(), "18f0e2357f8829fe809b2d95bc1753000dd925a6", "Incorrect choice of {0} for ALS item column.".format(als.getItemCol())) Test.assertEqualsHashed(als.getUserCol(), "db36668fa9a19fde5c9676518f9e86c17cabf65a", "Incorrect choice of {0} for ALS user column.".format(als.getUserCol())) Test.assertEqualsHashed(als.getRatingCol(), "3c2d687ef032e625aa4a2b1cfca9751d2080322c", "Incorrect choice of {0} for ALS rating column.".format(als.getRatingCol())) # COMMAND ---------- # MAGIC %md # MAGIC ### (2c) Testing Your Model # MAGIC # MAGIC So far, we used the `training_df` and `validation_df` datasets to select the best model. Since we used these two datasets to determine what model is best, we cannot use them to test how good the model is; otherwise, we would be very vulnerable to [overfitting](https://en.wikipedia.org/wiki/Overfitting). To decide how good our model is, we need to use the `test_df` dataset. We will use the `best_rank` you determined in part (2b) to create a model for predicting the ratings for the test dataset and then we will compute the RMSE. # MAGIC # MAGIC The steps you should perform are: # MAGIC * Run a prediction, using `my_model` as created above, on the test dataset (`test_df`), producing a new `predict_df` DataFrame. # MAGIC * Filter out unwanted NaN values (necessary because of [a bug in Spark](https://issues.apache.org/jira/browse/SPARK-14489)). 
We've supplied this piece of code for you. # MAGIC * Use the previously created RMSE evaluator, `reg_eval` to evaluate the filtered DataFrame.
# Score the held-out split with the trained ALS model and inspect the
# learned latent factors.
predictions = model.transform(test)
model.itemFactors.show(10, truncate=False)
model.userFactors.show(10, truncate=False)
# RMSE against the float-typed rating column.
evaluator = RegressionEvaluator(predictionCol="prediction",
                                labelCol='ratingFloat',
                                metricName='rmse')
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = {}".format(rmse))
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)
# Generate top 10 movie recommendations for a specified set of users
users = ratingSamples.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
# Generate top 10 user recommendations for a specified set of movies
movies = ratingSamples.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)
userRecs.show(5, False)
movieRecs.show(5, False)
userSubsetRecs.show(5, False)
movieSubSetRecs.show(5, False)
# Single-point grid over regParam, evaluated with 10-fold cross-validation.
paramGrid = ParamGridBuilder().addGrid(als.regParam, [0.01]).build()
cv = CrossValidator(estimator=als, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=10)
# NOTE(review): cross-validation is fitted on `test`, not on the training
# split — this looks unintentional (CV on held-out data defeats its purpose);
# confirm against the full script before changing.
cvModel = cv.fit(test)
avgMetrics = cvModel.avgMetrics
spark.stop()
class RecommendationEngine:
    """Music recommendation engine backed by Spark ALS.

    One ALS model is trained per cumulative batch of listening counts:
    self.listening_count_df[i] is the union of batches 0..i, and
    self.model[i] is fit on it.
    """

    def __init__(self, spark_session, dataset_path):
        self.spark_session = spark_session
        logger.info("Starting up the Spark Session: {}".format(
            self.spark_session))

        # Load the listening-count batches; each list entry is the union of
        # every batch read so far (cumulative datasets).
        logger.info("Loading listening count dataset...")
        self.listening_count_df = []
        for idx in range(0, 3):
            lc_file_path = os.path.join(dataset_path,
                                        'batch/batch' + str(idx) + '.txt')
            batch_df = spark_session.read.csv(lc_file_path,
                                              header=None,
                                              inferSchema=True).na.drop()
            batch_df = batch_df.selectExpr("_c0 as user_id",
                                           "_c1 as artist_id",
                                           "_c2 as weight")
            if self.listening_count_df:
                # Extend the previous cumulative frame with this batch.
                self.listening_count_df.append(
                    self.listening_count_df[idx - 1].union(batch_df))
            else:
                self.listening_count_df.append(batch_df)
            logger.info("{} loaded".format('batch' + str(idx) + '.txt'))
        logger.info("Loading listening count dataset done!")

        # Load artist metadata and expose it through a temp view so the
        # selected columns can be joined onto recommendation output.
        logger.info("Loading artist dataset...")
        artist_file_path = os.path.join(dataset_path, 'csv/artists.csv')
        self.artist_df = spark_session.read.csv(artist_file_path,
                                                header="true",
                                                inferSchema="true").na.drop()
        self.artist_df.createOrReplaceTempView("artists")
        self.artist_df_selected = self.spark_session.sql(
            "SELECT `id` as artist_id, `name`, `url` \
            FROM artists")
        logger.info("Loading artist dataset done...")

        # Train all models up front.
        self.__train_model()

    def __train_model(self):
        """Fit one ALS model per cumulative listening-count DataFrame."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5, regParam=0.01, userCol="user_id",
                       itemCol="artist_id", ratingCol="weight",
                       coldStartStrategy="drop")
        self.model = []
        for idx in range(0, 3):
            self.model.append(self.als.fit(self.listening_count_df[idx]))
            logger.info("Model {} done : {}".format(
                idx, self.listening_count_df[idx].count()))
        logger.info("ALS model built!")

    def get_top_ratings(self, model_id, user_id, num_of_books):
        """Recommend up to num_of_books artists to user_id using model model_id."""
        user_rows = self.listening_count_df[model_id].select(self.als.getUserCol())
        user_rows = user_rows.filter(user_rows.user_id == user_id)
        recs = self.model[model_id].recommendForUserSubset(user_rows,
                                                           num_of_books)
        # Flatten the recommendations array into one row per artist.
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col('recommendations')['artist_id'].alias('artist_id'),
            func.col('recommendations')['Rating'].alias('Rating')).drop('recommendations')
        recs = recs.drop('Rating')
        recs = recs.join(self.artist_df_selected, ('artist_id'), 'inner')
        # Serialise each row to a plain dict for the API response.
        payload = {}
        payload['result'] = [json.loads(row) for row in recs.toJSON().collect()]
        return payload

    def get_top_music_recommend(self, model_id, artist_id, num_of_users):
        """Recommend up to num_of_users users for artist_id using model model_id."""
        artist_rows = self.listening_count_df[model_id].select(
            self.als.getItemCol())
        artist_rows = artist_rows.filter(artist_rows.artist_id == artist_id)
        recs = self.model[model_id].recommendForItemSubset(artist_rows,
                                                           num_of_users)
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col('recommendations')['user_id'].alias('user_id'),
            func.col('recommendations')['Rating'].alias('Rating')).drop('recommendations')
        recs = recs.drop('Rating')
        payload = {}
        payload['result'] = [json.loads(row) for row in recs.toJSON().collect()]
        return payload

    def get_listening_count_for_artist_ids(self, model_id, user_id, artist_id):
        """Predict the listening count for one (user_id, artist_id) pair."""
        request = self.spark_session.createDataFrame([(user_id, artist_id)],
                                                     ['user_id', 'artist_id'])
        weight = self.model[model_id].transform(request).collect()
        # Column 2 of the single prediction row holds the predicted weight.
        payload = {}
        payload['result'] = weight[0][2]
        return payload

    def get_listening_count(self, model_id, user_id):
        """Return user_id's listening history joined with artist metadata."""
        self.listening_count_df[model_id].createOrReplaceTempView(
            "listeningcount")
        user_history = self.spark_session.sql(
            'SELECT `artist_id`, `weight` from listeningcount \
            WHERE `user_id` = "%s"' % user_id)
        user_history = user_history.join(self.artist_df_selected,
                                         ('artist_id'), 'inner')
        payload = {}
        payload['result'] = [json.loads(row)
                             for row in user_history.toJSON().collect()]
        return payload
class RecommendationEngine:
    """A book recommendation engine built on Spark ALS."""

    def __init__(self, spark_session, dataset_path):
        """Initialise the engine from a Spark session and a dataset directory."""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session

        # Ratings drive model training and history lookups.
        logger.info("Loading Ratings dataset...")
        ratings_file_path = os.path.join(dataset_path, 'ratings.csv')
        self.ratings_df = spark_session.read.csv(ratings_file_path,
                                                 header="true",
                                                 inferSchema="true").na.drop()

        # Book metadata decorates recommendation output with titles.
        logger.info("Loading Books dataset...")
        books_file_path = os.path.join(dataset_path, 'books.csv')
        self.books_df = spark_session.read.csv(books_file_path,
                                               header="true",
                                               inferSchema="true").na.drop()
        self.books_df.createOrReplaceTempView("books")
        self.books_df_selected = self.spark_session.sql("SELECT `book_id`, `title` \
            FROM books")

        # Train the model up front.
        self.__train_model()

    def __train_model(self):
        """Fit the ALS model on the current ratings DataFrame."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5, regParam=0.01, userCol="user_id",
                       itemCol="book_id", ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(self.ratings_df)
        logger.info("ALS model built!")

    def get_top_ratings(self, user_id, book_count):
        """Recommend up to book_count top unrated books to user_id, as JSON."""
        user_rows = self.ratings_df.select(self.als.getUserCol())
        user_rows = user_rows.filter(user_rows.user_id == user_id)
        recs = self.model.recommendForUserSubset(user_rows, book_count)
        # Flatten the recommendations array into one row per book.
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(func.col('user_id'),
                           func.col('recommendations')['book_id'].alias('book_id'),
                           func.col('recommendations')['Rating'].alias('Rating')).drop('recommendations')
        recs = recs.drop('Rating')
        recs = recs.join(self.books_df_selected, ("book_id"), 'inner')
        return recs.toPandas().to_json()

    def get_top_book_recommend(self, book_id, user_count):
        """Recommend up to user_count users for book_id, as JSON."""
        book_rows = self.ratings_df.select(self.als.getItemCol())
        book_rows = book_rows.filter(book_rows.book_id == book_id)
        recs = self.model.recommendForItemSubset(book_rows, user_count)
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(func.col('book_id'),
                           func.col('recommendations')['user_id'].alias('user_id'),
                           func.col('recommendations')['Rating'].alias('Rating')).drop('recommendations')
        recs = recs.drop('Rating')
        recs = recs.join(self.books_df_selected, ("book_id"), 'inner')
        return recs.toPandas().to_json()

    def get_ratings_for_book_ids(self, user_id, book_id):
        """Predict the rating for one (user_id, book_id) pair."""
        request = self.spark_session.createDataFrame([(user_id, book_id)],
                                                     ["user_id", "book_id"])
        return self.model.transform(request).collect()

    def add_ratings(self, user_id, book_id, ratings_given):
        """Append one (user_id, book_id, rating) row and retrain the model."""
        new_ratings = self.spark_session.createDataFrame(
            [(user_id, book_id, ratings_given)],
            ["user_id", "book_id", "rating"])
        self.ratings_df = self.ratings_df.union(new_ratings)
        # Retrain so the new rating influences future recommendations.
        self.__train_model()
        return new_ratings.toPandas().to_json()

    def get_history(self, user_id):
        """Return user_id's rating history (joined with titles) as JSON."""
        self.ratings_df.createOrReplaceTempView("ratingsdata")
        user_history = self.spark_session.sql('SELECT `user_id`, `book_id`, `rating` from ratingsdata \
            WHERE `user_id` = "%s"' % user_id)
        user_history = user_history.join(self.books_df_selected,
                                         ("book_id"), 'inner')
        return user_history.toPandas().to_json()
class RecommendationEngine:
    """A product recommendation engine.

    Maintains three independent ALS models (model1..model3), one per input
    file (model-1.txt..model-3.txt loaded into df0..df2).  The original
    triplicated every method body per model index; the logic is now shared
    via the private helpers below, with identical behavior per index.
    """

    def _new_als(self):
        """Build a freshly configured ALS estimator and store it as self.als."""
        self.als = ALS(maxIter=5, regParam=0.01, userCol="UserId",
                       itemCol="ProductId", ratingCol="Rating",
                       coldStartStrategy="drop")
        return self.als

    def _select(self, model):
        """Map a model index (0-2) to its (DataFrame, fitted model) pair.

        Returns None for any other index, mirroring the original
        fall-through behavior of the if/elif chains.
        """
        if model == 0:
            return self.df0, self.model1
        if model == 1:
            return self.df1, self.model2
        if model == 2:
            return self.df2, self.model3
        return None

    def __train_all_model(self):
        """Train all three ALS models, one per dataset."""
        for idx, df in enumerate((self.df0, self.df1, self.df2)):
            logger.info("Training the ALS model %d", idx + 1)
            setattr(self, 'model%d' % (idx + 1), self._new_als().fit(df))
            logger.info("ALS model %d built!", idx + 1)

    def __train_model(self, model):
        """Re-train the single ALS model identified by ``model`` (0-2)."""
        logger.info("Training the ALS model...")
        als = self._new_als()
        if model == 0:
            self.model1 = als.fit(self.df0)
        elif model == 1:
            self.model2 = als.fit(self.df1)
        elif model == 2:
            self.model3 = als.fit(self.df2)
        logger.info("ALS model built!")

    def get_top_ratings(self, model, user_id, products_count):
        """Recommend up to products_count products for user_id, as JSON.

        Returns None for an unknown model index (as before).
        """
        selected = self._select(model)
        if selected is None:
            return None
        df, fitted = selected
        users = df.select(self.als.getUserCol())
        users = users.filter(users.UserId == user_id)
        recs = fitted.recommendForUserSubset(users, products_count)
        # Explode the per-user recommendations array into one row per product.
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(func.col('UserId'),
                           func.col('recommendations')['ProductId'].alias('ProductId'),
                           func.col('recommendations')['Rating'].alias('Rating')).\
            drop('recommendations')
        recs = recs.drop('Rating')
        return recs.toPandas().to_json()

    def get_top_product_recommend(self, model, product_id, user_count):
        """Recommend up to user_count users for product_id, as JSON.

        Returns None for an unknown model index (as before).
        """
        selected = self._select(model)
        if selected is None:
            return None
        df, fitted = selected
        products = df.select(self.als.getItemCol())
        products = products.filter(products.ProductId == product_id)
        recs = fitted.recommendForItemSubset(products, user_count)
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(func.col('ProductId'),
                           func.col('recommendations')['UserId'].alias('UserId'),
                           func.col('recommendations')['Rating'].alias('Rating')).\
            drop('recommendations')
        recs = recs.drop('Rating')
        return recs.toPandas().to_json()

    def get_ratings_for_product_ids(self, model, user_id, product_id):
        """Predict the rating for one (user_id, product_id) pair with model ``model``."""
        selected = self._select(model)
        if selected is None:
            return None
        _, fitted = selected
        request = self.spark_session.createDataFrame([(user_id, product_id)],
                                                     ["UserId", "ProductId"])
        return fitted.transform(request).collect()

    def __init__(self, spark_session, dataset_path):
        """Init the engine: load up to three rating files and train all models."""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session

        logger.info("Loading Amazon data...")
        for idx, file_name in enumerate(('model-1.txt', 'model-2.txt',
                                         'model-3.txt')):
            file_path = os.path.join(dataset_path, file_name)
            if os.path.isfile(file_path):
                df = spark_session.read.csv(file_path, header=None,
                                            inferSchema=True)
                df = df.selectExpr("_c0 as UserId", "_c1 as ProductId",
                                   "_c2 as Rating")
                # As before, the dfN attribute stays unset for a missing file.
                setattr(self, 'df%d' % idx, df)

        # Train the models
        self.__train_all_model()
class RecommendationEngine:
    """An anime recommendation engine backed by Spark ALS."""

    def __init__(self, spark, dataset_path):
        """Init the recommendation engine from a Spark session and a dataset path."""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark = spark

        # Ratings are the only dataset currently loaded.
        logger.info("Loading Ratings data...")
        ratings_file_path = os.path.join(dataset_path, 'rating.csv')
        self.ratings = spark.read.csv(ratings_file_path, header=True,
                                      inferSchema=True)

        # Anime metadata loading is intentionally disabled for now.
        # logger.info("Loading Anime data...")
        # ratings_file_path = os.path.join(dataset_path, 'anime.csv')
        # self.animes = spark.read.csv(ratings_file_path, header=True, inferSchema=True)

        self.__train_model()

    def __train_model(self):
        """Fit the ALS model on the current ratings DataFrame."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5, regParam=0.01, userCol="user_id",
                       itemCol="anime_id", ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(self.ratings)
        logger.info("ALS model built!")

    def add_ratings(self, user_id, anime_id, ratings):
        """Append one (user_id, anime_id, rating) row, retrain, return it as JSON."""
        new_rows = self.spark.createDataFrame(
            [(user_id, anime_id, ratings)],
            ["user_id", "anime_id", "rating"])
        self.ratings = self.ratings.union(new_rows)
        # Retrain so the new rating influences future recommendations.
        self.__train_model()
        return new_rows.toPandas().to_json()

    def get_ratings_for_anime_ids(self, user_id, anime_id):
        """Predict the rating for one (user_id, anime_id) pair, as JSON."""
        frame = self.spark.createDataFrame([(user_id, anime_id)],
                                           ["user_id", "anime_id"])
        predictions = self.model.transform(frame)
        return predictions.toPandas().to_json()

    def get_top_ratings(self, user_id, animes_count):
        """Recommend up to animes_count top unrated animes to user_id, as JSON."""
        user_rows = self.ratings.select(self.als.getUserCol()).distinct()
        user_rows = user_rows.filter(user_rows.user_id == user_id)
        top = self.model.recommendForUserSubset(user_rows, animes_count)
        self.json_top = top.toPandas().to_json()
        return self.json_top

    def get_anime_top_ratings(self, anime_id, users_count):
        """Recommend up to users_count users for anime_id, as JSON."""
        anime_rows = self.ratings.select(self.als.getItemCol()).distinct()
        anime_rows = anime_rows.filter(anime_rows.anime_id == anime_id)
        top = self.model.recommendForItemSubset(anime_rows, users_count)
        self.json_top = top.toPandas().to_json()
        return self.json_top
# Fit ALS on the indexed ratings and materialise recommendation tables.
mymodel = als.fit(indexedDf)
userRecs = mymodel.recommendForAllUsers(10)
ProductRecs = mymodel.recommendForAllItems(10)
userRecs.show()
ProductRecs.show()

# Generate top 5 movie recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = mymodel.recommendForUserSubset(users, 5)

# Generate top 5 user recommendations for a specified set of products
products = ratings.select(als.getItemCol()).distinct().limit(3)
productSubSetRecs = mymodel.recommendForItemSubset(products, 5)

userSubsetRecs.show()
# BUG FIX: the original called ProductSubSetRecs.show() (capital P), which
# raised NameError - the variable assigned above is productSubSetRecs.
productSubSetRecs.show()

# Persist the per-user recommendations as a single text file.
# (The unused `rdd2 = ProductRecs.rdd` binding was removed; it was never
# saved or referenced.)
rdd1 = userRecs.rdd
rdd1.repartition(1).saveAsTextFile("userRecs")
class RecommendationEngine:
    """A Yelp business recommendation engine.

    Maintains three independent ALS models (model1..model3), one per data
    file (data_part_1.txt..data_part_3.txt loaded into df0..df2).  The
    per-index branches were deduplicated via the helpers below.
    """

    def _new_als(self):
        """Build a freshly configured ALS estimator and store it as self.als."""
        self.als = ALS(maxIter=5, regParam=0.01, userCol="userId",
                       itemCol="businessId", ratingCol="Stars",
                       coldStartStrategy="drop")
        return self.als

    def _select(self, model):
        """Map a model index (0-2) to its (DataFrame, fitted model) pair.

        Returns None for any other index, mirroring the original
        fall-through behavior of the if/elif chains.
        """
        if model == 0:
            return self.df0, self.model1
        if model == 1:
            return self.df1, self.model2
        if model == 2:
            return self.df2, self.model3
        return None

    def __train_all_model(self):
        """Train all three ALS models, one per dataset."""
        for idx, df in enumerate((self.df0, self.df1, self.df2)):
            logger.info("Training the ALS model %d", idx + 1)
            setattr(self, 'model%d' % (idx + 1), self._new_als().fit(df))
            logger.info("ALS model %d built!", idx + 1)

    def __train_model(self, model):
        """Re-train the single ALS model identified by ``model`` (0-2)."""
        logger.info("Training the ALS model...")
        als = self._new_als()
        if model == 0:
            self.model1 = als.fit(self.df0)
        elif model == 1:
            self.model2 = als.fit(self.df1)
        elif model == 2:
            self.model3 = als.fit(self.df2)
        logger.info("ALS model built!")

    def get_top_stars(self, model, userId, business_count):
        """Recommends up to business_count top unrated business to userId, as JSON.

        Returns None for an unknown model index (as before).
        """
        selected = self._select(model)
        if selected is None:
            return None
        df, fitted = selected
        users = df.select(self.als.getUserCol())
        users = users.filter(users.userId == userId)
        recs = fitted.recommendForUserSubset(users, business_count)
        # Explode the per-user recommendations array into one row per business.
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(func.col('userId'),
                           func.col('recommendations')['businessId'].alias('businessId'),
                           func.col('recommendations')['rating'].alias('rating')).\
            drop('recommendations')
        return recs.toPandas().to_json()

    def get_top_business_recommend(self, model, businessId, user_count):
        """Recommends up to user_count users for businessId, as JSON.

        BUG FIXES vs. the original:
        * the model==2 branch filtered ``businesss`` but then passed the
          undefined name ``business`` to recommendForItemSubset (NameError);
        * the model==1 branch joined ``self.businesssdf``, an attribute that
          is never created (AttributeError) - the join is dropped to match
          the other branches, where it was commented out;
        * the model==0 branch aliased the score column 'stars' while the
          other branches used 'rating'; all branches now emit 'rating'.
        """
        selected = self._select(model)
        if selected is None:
            return None
        df, fitted = selected
        business = df.select(self.als.getItemCol())
        business = business.filter(business.businessId == businessId)
        recs = fitted.recommendForItemSubset(business, user_count)
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(func.col('businessId'),
                           func.col('recommendations')['userId'].alias('userId'),
                           func.col('recommendations')['rating'].alias('rating')).\
            drop('recommendations')
        return recs.toPandas().to_json()

    def get_stars_for_business_ids(self, model, userId, businessId):
        """Given a user id and a business id, predict Stars for the pair."""
        selected = self._select(model)
        if selected is None:
            return None
        _, fitted = selected
        request = self.spark_session.createDataFrame([(userId, businessId)],
                                                     ["userId", "businessId"])
        return fitted.transform(request).collect()

    def __init__(self, spark_session, dataset_path):
        """Init the engine: load up to three star files and train all models."""
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session

        logger.info("Loading Stars data...")
        for idx, file_name in enumerate(('data_part_1.txt', 'data_part_2.txt',
                                         'data_part_3.txt')):
            file_path = os.path.join(dataset_path, file_name)
            if os.path.isfile(file_path):
                df = spark_session.read.csv(file_path, header=None,
                                            inferSchema=True)
                df = df.selectExpr("_c0 as userId", "_c1 as businessId",
                                   "_c2 as Stars")
                # As before, the dfN attribute stays unset for a missing file.
                setattr(self, 'df%d' % idx, df)

        # Train the model
        self.__train_all_model()
# Cold-start strategy 'drop' keeps unseen users/items out of the metrics,
# so the RMSE below can never be NaN.
als_estimator = ALS(maxIter=5, regParam=0.01, userCol="userId",
                    itemCol="movieId", ratingCol="rating",
                    coldStartStrategy="drop")
als_model = als_estimator.fit(training)

# Score the held-out split and report RMSE.
scored = als_model.transform(test)
reg_eval = RegressionEvaluator(metricName="rmse", labelCol="rating",
                               predictionCol="prediction")
rmse = reg_eval.evaluate(scored)
print("Root-mean-square error = " + str(rmse))

# Top-10 recommendations in both directions: movies per user, users per movie.
per_user_recs = als_model.recommendForAllUsers(10)
per_movie_recs = als_model.recommendForAllItems(10)

# The same, restricted to a three-row sample of users / movies.
user_sample = ratings.select(als_estimator.getUserCol()).distinct().limit(3)
user_sample_recs = als_model.recommendForUserSubset(user_sample, 10)
movie_sample = ratings.select(als_estimator.getItemCol()).distinct().limit(3)
movie_sample_recs = als_model.recommendForItemSubset(movie_sample, 10)
# $example off$

per_user_recs.show()
per_movie_recs.show()
user_sample_recs.show()
movie_sample_recs.show()
spark.stop()
def CollaborativeFiltering(spark, sampleDataPath):
    """Train and evaluate an ALS collaborative-filtering model.

    Reads the rating CSV at sampleDataPath, splits it 80/20, fits ALS on
    the training part, reports RMSE on the test part, prints several
    recommendation tables, and finally cross-validates a one-point
    hyper-parameter grid.
    """
    samples = spark.read.format('csv').option('header', 'true').load(sampleDataPath) \
        .withColumn("userIdInt", F.col("userId").cast(IntegerType())) \
        .withColumn("movieIdInt", F.col("movieId").cast(IntegerType())) \
        .withColumn("ratingFloat", F.col("rating").cast(FloatType()))

    # Random 80/20 train/test split.
    train_part, test_part = samples.randomSplit((0.8, 0.2))

    # Matrix-factorisation model on the training set.
    #   regParam: L2 regularisation coefficient (lambda)
    #   maxIter:  number of alternating user/item factor iterations
    #   userCol/itemCol/ratingCol: column names in the DataFrame
    #   coldStartStrategy='drop': unseen users/items are silently skipped at
    #   prediction time instead of producing NaN.
    als = ALS(regParam=0.01, maxIter=5, userCol='userIdInt',
              itemCol='movieIdInt', ratingCol='ratingFloat',
              coldStartStrategy='drop')
    model = als.fit(train_part)

    # Evaluate the model with RMSE on the held-out split.
    predictions = model.transform(test_part)

    # The item/user latent factors can be used as item/user embeddings.
    model.itemFactors.show(10, truncate=False)
    model.userFactors.show(10, truncate=False)

    evaluator = RegressionEvaluator(predictionCol="prediction",
                                    labelCol="ratingFloat",
                                    metricName='rmse')
    rmse = evaluator.evaluate(predictions)
    print("RMSE = {}".format(rmse))

    # Top-10 movie list per user and top-10 user list per movie.
    recListForUser = model.recommendForAllUsers(10)
    recListForMovie = model.recommendForAllItems(10)

    # The same, restricted to a three-row sample of users / movies.
    userSubset = samples.select(als.getUserCol()).distinct().limit(3)
    recListForUserSubset = model.recommendForUserSubset(userSubset, 10)
    movieSubset = samples.select(als.getItemCol()).distinct().limit(3)
    recListForMovieSubset = model.recommendForItemSubset(movieSubset, 10)

    recListForUser.show(5, truncate=False)
    recListForMovie.show(5, truncate=False)
    recListForUserSubset.show(5, truncate=False)
    recListForMovieSubset.show(5, truncate=False)

    # k-fold cross-validation (k=10) over a single-point regParam grid:
    # the data is split into k equal folds, each fold serves once as the
    # validation set, and the k metric values are averaged.
    # NOTE(review): fit on the test split, as in the original - confirm intent.
    paramGrid = ParamGridBuilder().addGrid(als.regParam, [0.01]).build()
    cv = CrossValidator(estimator=als, estimatorParamMaps=paramGrid,
                        evaluator=evaluator, numFolds=10)
    cvModel = cv.fit(test_part)
    avgMetrics = cvModel.avgMetrics
# Evaluate the predictions against the observed play counts.
reg_eval = RegressionEvaluator(metricName="rmse", labelCol="count",
                               predictionCol="prediction")
rmse = reg_eval.evaluate(prediction)
print("Root-mean-square error = " + str(rmse))

# Top-10 recommendations: artists per user and users per artist.
per_user_recs = model.recommendForAllUsers(10)
per_artist_recs = model.recommendForAllItems(10)

# Top 10 artist recommendations for a three-user sample.
user_sample = user_artist_data_df.select(als.getUserCol()).distinct().limit(3)
user_sample_recs = model.recommendForUserSubset(user_sample, 10)

# Top 10 user recommendations for a three-artist sample.
artist_sample = user_artist_data_df.select(als.getItemCol()).distinct().limit(3)
artist_sample_recs = model.recommendForItemSubset(artist_sample, 10)

# NOTE(review): "Recomendation" typo kept - it is runtime output.
print("Top 10 Recomendation : ")
per_user_recs.show(10)
print("============================")
per_artist_recs.show(10)
print("============================")
user_sample_recs.show(10)
print("============================")
artist_sample_recs.show(10)
spark.stop()