def benchmark_spark(ratings, factors, iterations=5):
    """Time Spark ALS training for each requested number of factors.

    A local Spark session is started, ``ratings`` is converted with
    ``convert_sparse_to_dataframe`` and one implicit-feedback ALS model is
    fitted per entry of ``factors``.

    Returns a dict mapping each rank to the measured fit time divided by
    ``iterations``. The Spark session is stopped even if a fit fails.
    """
    spark_conf = SparkConf()
    spark_conf = spark_conf.setAppName("implicit_benchmark")
    spark_conf = spark_conf.setMaster('local[*]')
    spark_conf = spark_conf.set('spark.driver.memory', '16G')

    context = SparkContext(conf=spark_conf)
    spark = SparkSession(context)

    times = {}
    try:
        ratings = convert_sparse_to_dataframe(spark, context, ratings)
        for rank in factors:
            estimator = ALS(rank=rank, maxIter=iterations, alpha=1,
                            implicitPrefs=True, userCol="row",
                            itemCol="col", ratingCol="data")
            started_at = time.time()
            estimator.fit(ratings)
            per_iteration = (time.time() - started_at) / iterations
            times[rank] = per_iteration
            print("spark. factors=%i took %.3f" % (rank, per_iteration))
    finally:
        # Always release the local Spark resources.
        spark.stop()
    return times
# Start (or reuse) the Spark session for this script.
spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()

df = pd.read_csv("query_results.csv")

# ALS needs numeric ids: replace every distinct cluster name by its position
# in the list of unique names.
clusterNames = df["clusterName"].unique().tolist()
clusterToIdMapper = {name: idx for idx, name in enumerate(clusterNames)}
df["clusterName"] = df["clusterName"].apply(clusterToIdMapper.__getitem__)

# Same encoding for the property values.
propertyValues = df["propertyValue"].unique().tolist()
propertyValueToIdMapper = {
    value: idx for idx, value in enumerate(propertyValues)
}
df["propertyValue"] = df["propertyValue"].apply(
    propertyValueToIdMapper.__getitem__)

print("len(clusterToIdMapper)", len(clusterToIdMapper))
print("len(propertyValueToIdMapper)", len(propertyValueToIdMapper))

# Property values act as "users", clusters as "items".
als = ALS(
    maxIter=5,
    regParam=0.1,
    userCol="propertyValue",
    itemCol="clusterName",
    ratingCol="dcount_targetId",
    coldStartStrategy="drop",
)

sparkDF = spark.createDataFrame(df)
sparkDF.show(10)

model = als.fit(sparkDF)
model.itemFactors.show(10, truncate=False)
# Movie ID -> movie name lookup
movieNames = loadMovieNames()

# Raw ratings file, one text row per rating
lines = spark.read.text("hdfs:///user/maria_dev/ml-100k/u.data.1").rdd

# Parse every line with parseInput, then build a cached DataFrame from it
ratingsRDD = lines.map(parseInput)
ratings = spark.createDataFrame(ratingsRDD)
ratings.cache()

# Train the ALS collaborative-filtering model on the complete dataset
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userID",
    itemCol="movieID",
    ratingCol="rating",
)
model = als.fit(ratings)

# Show what user 0 has rated
print("\nRatings for user ID 0:")
userRatings = ratings.filter("userID = 0")
for rating in userRatings.collect():
    print(movieNames[rating['movieID']], rating['rating'])

print("\nTop 20 recommendations:")
# Keep only movies that were rated more than 100 times
ratingCounts = ratings.groupBy("movieID").count().filter("count > 100")
# Construct a "test" dataframe for user 0 with every movie rated more than 100 times
# Grid search over rank / alpha / regParam for the implicit-feedback ALS
# model, scored by AUC on the held-out `test` set.
try_rank = [30, 35, 40]
try_alpha = [2, 5, 12]
try_reg = [2, 3, 3.5]

auc_res = []  # one (rank, alpha, reg, auc) tuple per evaluated combination
for rank in try_rank:
    for alpha in try_alpha:
        for reg in try_reg:
            # Fit a model with the hyperparameters of this iteration.
            loop_model = ALS(implicitPrefs=True, userCol="userId",
                             itemCol="artistId", ratingCol="song_count",
                             rank=rank, alpha=alpha,
                             regParam=reg).fit(training)

            # AUC on the test data using this iteration's predictions.
            loop_auc = areaUnderCurve(test, bTopItemIDs,
                                      loop_model.transform)

            auc_res_content = (rank, alpha, reg, loop_auc)
            print(auc_res_content)
            auc_res.append(auc_res_content)

# Pick the best row once (highest AUC) instead of scanning the results three
# times; ties resolve to the first maximum, exactly as before.
final_rank, final_alpha, final_reg = max(
    auc_res, key=lambda item: item[3])[:3]
open(input_dir + 'business_avg.json')) #%% spark = SparkSession.builder.getOrCreate() sc = spark.sparkContext train = sc.textFile(input_dir + 'train_review.json').map( json.loads).map(lambda x: (x['user_id'], x['business_id'], x['stars'])) userInt = sc.broadcast(train.keys().distinct().zipWithIndex().collectAsMap()) bizInt = sc.broadcast( train.map(lambda x: x[1]).distinct().zipWithIndex().collectAsMap()) train = train.map( lambda x: (userInt.value.get(x[0]), bizInt.value.get(x[1]), x[2])).toDF( ['user_id', 'business_id', 'stars']) # Model 1 als_model = ALS(maxIter=20, regParam=0.4, userCol='user_id', itemCol='business_id', ratingCol='stars', coldStartStrategy="nan") als_model = als_model.fit(train) del train #%% weekdays = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' ] def modifyHours(x): if x is not None:
def test_storage_levels(self):
    """Check that ALS storage-level params agree on the Python and JVM side.

    The check runs once with the defaults and once after both storage
    levels were overridden; each time the estimator is fitted first.
    """
    rows = [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0),
            (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)]
    df = self.spark.createDataFrame(rows, ["user", "item", "rating"])
    als = ALS().setMaxIter(1).setRank(1)

    def assert_levels(intermediate, final):
        # Python-side getter first, then the wrapped Java object.
        self.assertEqual(als.getIntermediateStorageLevel(), intermediate)
        self.assertEqual(als._java_obj.getIntermediateStorageLevel(),
                         intermediate)
        self.assertEqual(als.getFinalStorageLevel(), final)
        self.assertEqual(als._java_obj.getFinalStorageLevel(), final)

    # default params
    als.fit(df)
    assert_levels("MEMORY_AND_DISK", "MEMORY_AND_DISK")

    # non-default params
    als.setIntermediateStorageLevel("MEMORY_ONLY_2")
    als.setFinalStorageLevel("DISK_ONLY")
    als.fit(df)
    assert_levels("MEMORY_ONLY_2", "DISK_ONLY")
# Load the ratings and the user table.
lines = spark.read.text('ratings.dat').rdd
ratingsRDD = lines.map(parse_rating)

lines = spark.read.text('gender.dat').rdd
users = dict(lines.map(parse_user).collect())

ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

num_training = training.count()
num_validation = test.count()
print('Training: %d' % num_training)
print('Validation: %d' % num_validation)

# setup ALS
rank = 8
num_iterations = 8
lambda_ = 0.1

# Fixes: the original passed the misspelled `num_interations` (NameError)
# and never handed the configured `rank` to the estimator.
als = ALS(rank=rank, maxIter=num_iterations, regParam=lambda_,
          userCol="userID", itemCol="profileID", ratingCol="rating")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data.
# NOTE(review): no coldStartStrategy is set, so users/items that only occur
# in `test` may yield NaN predictions and a NaN RMSE - confirm whether
# coldStartStrategy="drop" is wanted here.
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

spark.stop()
def train_model(training_df, rank):
    """Fit and return an implicit-feedback ALS model.

    The model is trained on ``training_df`` with ``rank`` latent factors
    and a fixed budget of 10 iterations.
    """
    estimator = ALS(rank=rank, maxIter=10, implicitPrefs=True)
    fitted = estimator.fit(training_df)
    return fitted
def main(spark, df_train, df_val, model_file):
    """Grid-search an implicit ALS pipeline and save the best model.

    Args:
        spark: active SparkSession.
        df_train: path of the parquet file with the training interactions.
        df_val: path of the parquet file with the validation interactions.
        model_file: destination path for the best fitted pipeline model
            (an existing model at that path is overwritten).

    For every (regParam, rank, alpha) combination a StringIndexer + ALS
    pipeline is fitted on the training data, the top 500 tracks are
    recommended to every validation user and the mean average precision
    (MAP) against the validation tracks is computed. The pipeline with the
    highest MAP is saved.

    Raises:
        RuntimeError: if no candidate produced a usable MAP.
    """
    # import train data
    train = spark.read.parquet(df_train)
    print("imported train data")

    # import validation data
    val = spark.read.parquet(df_val)
    print("imported validation data")

    # index users; rows with invalid user ids are skipped
    indexer1 = StringIndexer(inputCol="user_id", outputCol="user_index",
                             handleInvalid="skip")

    # grid search state
    bestModel = None
    bestValidationMAP = -1
    best_rank, best_regparam, best_alpha = None, None, None
    list_regParam = [0.05]
    list_rank = [10, 20, 50]
    list_alpha = [1, 15]

    # select records with count > 1
    train = train.filter(train["count"] > 1)
    val = val.filter(val["count"] > 1)
    print("kept records with count > 1")

    for reg, rank, alpha in itertools.product(list_regParam, list_rank,
                                              list_alpha):
        als = ALS(seed=1, rank=rank, regParam=reg, alpha=alpha,
                  userCol="user_index", itemCol="track_index",
                  ratingCol="count", implicitPrefs=True)

        pipeline = Pipeline(stages=[indexer1, als])
        model = pipeline.fit(train)
        print("trained model with reg = %s, rank = %s, alpha = %s"
              % (reg, rank, alpha))

        # index the validation users with the fitted pipeline
        val_indexed = model.transform(val)
        val_indexed = val_indexed.select(
            [c for c in val_indexed.columns
             if c in ["user_index", "count", "track_index"]])
        print("indexed users")

        # ground truth: the tracks of every validation user
        val_indexed.createOrReplaceTempView('val_indexed')
        Labels = spark.sql(
            'SELECT user_index, collect_list(track_index) AS label '
            'FROM val_indexed GROUP BY user_index')
        Labels.createOrReplaceTempView('Labels')
        print("created ground truth labels")

        # top 500 track recommendations for each user in the validation set
        user_subset = val_indexed.select("user_index").distinct()
        userRecs = model.stages[-1].recommendForUserSubset(user_subset, 500)
        userRecs.createOrReplaceTempView("userRecs")
        print("made user recommendations")

        # Take the track ids straight out of the recommendation array.
        # The previous explode + collect_list round trip did not guarantee
        # that the ranked order survives the shuffle, and MAP depends on
        # that order.
        Preds = userRecs.selectExpr(
            "user_index", "recommendations.track_index AS prediction")
        Preds.createOrReplaceTempView("Preds")

        # pair every prediction list with its label list
        Preds_labels = spark.sql(
            'SELECT Preds.prediction AS prediction, Labels.label as label '
            'FROM Preds INNER JOIN Labels '
            'ON Preds.user_index = Labels.user_index')
        print("inner join preds & labels")

        # calculate MAP
        MAPrecommendationsAndTruth = Preds_labels.select("prediction",
                                                         "label")
        metrics = RankingMetrics(MAPrecommendationsAndTruth.rdd)
        MAP = metrics.meanAveragePrecision
        print("MAP = %s" % MAP)

        # keep the best model seen so far
        if MAP > bestValidationMAP:
            bestModel = model
            bestValidationMAP = MAP
            best_rank, best_regparam, best_alpha = rank, reg, alpha

    if bestModel is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError("grid search produced no model to save")

    # save best model and params
    pip_model = bestModel
    pip_model.write().overwrite().save(model_file)
    print("Best model saved with reg = %s, rank = %s, alpha = %s, MAP = %s"
          % (best_regparam, best_rank, best_alpha, bestValidationMAP))
# $example on$
lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
# `int` replaces the Python-2-only `long`: it raises NameError on Python 3,
# while on Python 2 `int` promotes to long automatically.
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId",
          ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
trainingData.cache()
validationData.cache()

reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="plays",
                               metricName="rmse")

regParams = [0.25]
ranks = [16]
tolerance = 0.03

# One independent row per regParam. The former `[[0] * n] * m` repeated the
# SAME inner list m times, so writing one cell changed every row.
errors = [[0] * len(ranks) for _ in regParams]
models = [[0] * len(ranks) for _ in regParams]
err = 0
min_error = float('inf')
best_rank = -1

# enumerate() keeps i/j in step with the grid position; the hand-managed
# counters were reset but never advanced.
for i, regParam in enumerate(regParams):
    for j, rank in enumerate(ranks):
        als = ALS(maxIter=5, regParam=regParam, rank=rank, alpha=80,
                  seed=8427, userCol="new_user_id", itemCol="new_song_id",
                  ratingCol="plays", implicitPrefs=True)
        model = als.fit(trainingData)

        # Predict on the validation data
        predictions = model.transform(validationData)

        # Remove NaN values from prediction (due to SPARK-14489)
        predicted_plays_df = predictions.filter(
            predictions.prediction != float('nan'))

        # Run the previously created RMSE evaluator, reg_eval, on the
        # filtered predictions
        error = reg_eval.evaluate(predicted_plays_df)
        errors[i][j] = error
        models[i][j] = model
        # A space was missing before "the RMSE is" in the message.
        print("For rank " + str(rank) + ", regularization parameter "
              + str(regParam) + " the RMSE is " + str(error))
        if error < min_error:
            min_error = error
def train_als(params, data):
    """Fit an ALS estimator built from ``params`` on ``data``.

    ``params`` is unpacked as keyword arguments for ``ALS``. The fit runs
    inside a ``Timer`` context; the fitted model and that timer object are
    returned as a pair.
    """
    estimator = ALS(**params)
    with Timer() as timer:
        fitted = estimator.fit(data)
    return fitted, timer
class RecommendationEngine:
    """An anime recommendation engine backed by a Spark ALS model.

    The ratings are read from ``rating.csv`` in the dataset directory and a
    model is trained immediately on construction. Query methods return the
    result serialized as a JSON string (pandas ``to_json`` layout).
    """

    def __init__(self, spark, dataset_path):
        """Load the ratings from ``dataset_path`` and train the first model.

        ``spark`` is the session used for all reads and DataFrame creation.
        """
        logger.info("Starting up the Recommendation Engine: ")
        self.spark = spark

        # Load ratings data for later use
        logger.info("Loading Ratings data...")
        ratings_file_path = os.path.join(dataset_path, 'rating.csv')
        self.ratings = spark.read.csv(ratings_file_path, header=True,
                                      inferSchema=True)
        self.__train_model()

    def __train_model(self):
        """Train the ALS model on the current ``self.ratings``."""
        logger.info("Training the ALS model...")
        self.als = ALS(maxIter=5, regParam=0.01, userCol="user_id",
                       itemCol="anime_id", ratingCol="rating",
                       coldStartStrategy="drop")
        self.model = self.als.fit(self.ratings)
        logger.info("ALS model built!")

    def add_ratings(self, user_id, anime_id, ratings):
        """Add one rating ``(user_id, anime_id, ratings)`` and retrain.

        Returns the added row as a JSON string.
        """
        # Wrap the new rating in a single-row DataFrame
        new_ratings = self.spark.createDataFrame(
            [(user_id, anime_id, ratings)],
            ["user_id", "anime_id", "rating"])
        # Add the new rating to the existing ones
        self.ratings = self.ratings.union(new_ratings)
        # Re-train the ALS model with the new ratings
        self.__train_model()
        return new_ratings.toPandas().to_json()

    def get_ratings_for_anime_ids(self, user_id, anime_id):
        """Predict the ratings of ``user_id`` for one or several animes.

        ``anime_id`` may be a single id or a list/tuple of ids; each id
        becomes one row that is scored by the model. Because the model
        drops cold-start rows, ids it cannot score are missing from the
        result. Returns the predictions as a JSON string.
        """
        if isinstance(anime_id, (list, tuple)):
            anime_ids = list(anime_id)
        else:
            anime_ids = [anime_id]
        dataframe = self.spark.createDataFrame(
            [(user_id, one_id) for one_id in anime_ids],
            ["user_id", "anime_id"])
        predictions = self.model.transform(dataframe)
        return predictions.toPandas().to_json()

    def get_top_ratings(self, user_id, animes_count):
        """Recommend up to ``animes_count`` animes to ``user_id``.

        Returns the recommendations as a JSON string, also kept in
        ``self.json_top``.
        """
        users = self.ratings.select(self.als.getUserCol()).distinct()
        users = users.filter(users.user_id == user_id)
        top_ratings = self.model.recommendForUserSubset(users, animes_count)
        self.json_top = top_ratings.toPandas().to_json()
        return self.json_top

    def get_anime_top_ratings(self, anime_id, users_count):
        """Recommend up to ``users_count`` users for ``anime_id``.

        Returns the recommendations as a JSON string, also kept in
        ``self.json_top``.
        """
        animes = self.ratings.select(self.als.getItemCol()).distinct()
        animes = animes.filter(animes.anime_id == anime_id)
        anime_top = self.model.recommendForItemSubset(animes, users_count)
        self.json_top = anime_top.toPandas().to_json()
        return self.json_top
"/Users/grey/Documents/Big Data/project/files/ratings_small.csv") linesRdd = lines.mapPartitions(lambda x: csv.reader(x)) ratingheader = linesRdd.first() linesRdd = linesRdd.filter(lambda x: x != ratingheader) # parts = lines.map(lambda row: row.value.split("::")) ratingsRDD = linesRdd.map( lambda p: Row(userId=int(p[0]), movieId=int(p[1]), rating=float(p[2]))) ratings = spark.createDataFrame(ratingsRDD) (training, test) = ratings.randomSplit([0.8, 0.2], int(sys.argv[1])) # Build the recommendation model using ALS on the training data # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop", rank=70) als.setSeed(int(sys.argv[1])) # Fits a model to the input dataset with optional parameters. # Returns: fitted model(s) model = als.fit(training) # # Evaluate the model by computing the RMSE on the test data # # transform() Transforms the input dataset with optional parameters. # predictions = model.transform(test) # # Evaluator for Regression, which expects two input columns: prediction and label. # evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", # predictionCol="prediction") # # evaluate() Returns: metric
def _evaluate(self, dataset): error=self.rmse(dataset,self.predictionCol,self.targetCol) print ("Error: {}".format(error)) return error def isLargerBetter(self): return False @staticmethod def rmse(dataset,predictionCol,targetCol): return sqrt(dataset.dropna().map(lambda x: (x[targetCol] - x[predictionCol]) ** 2).reduce(add) / float(dataset.count())) lr1 = ALS() grid1 = ParamGridBuilder().addGrid(lr1.regParam, [1.0,0.5,2.0]).build() evaluator1 = MiEvaluador(predictionCol=lr1.getPredictionCol(),targetCol=lr1.getRatingCol()) cv1 = CrossValidator(estimator=lr1, estimatorParamMaps=grid1, evaluator=evaluator1, numFolds=2) cvModel1 = cv1.fit(dfRatings) a=cvModel1.transform(dfRatings) error_cross_validation=MiEvaluador.rmse(a,lr1.getPredictionCol(),lr1.getRatingCol()) print ('ERROR de validacion: {}'.format(error_cross_validation)) error_models=[] for reg_param in (1.0,0.5,2.0): lr = ALS(regParam=reg_param) model = lr.fit(dfRatings) error=MiEvaluador.rmse(model.transform(dfRatings),lr.getPredictionCol(),lr.getRatingCol()) error_models.append(error) print ('reg_param: {}, rmse: {}'.format(reg_param,error))
# the ML libraries require integers, so we need to create keys for the users & videos temporarily user_ids = ratings.select("userid").distinct().rdd.zipWithUniqueId() user_map = user_ids.map(lambda (x, y): Row(userid=x.userid, userid_int=y)).toDF().cache() # same as above - this is a UUID/int mapping video_ids = ratings.select("videoid").distinct().rdd.zipWithUniqueId().cache() video_map = video_ids.map(lambda (x, y): Row(videoid=x.videoid, videoid_int=y)).toDF().cache() print "Recommending based on {0} users and {1} videos.".format(user_map.count(), video_map.count()) training_data = ratings.join(user_map, ratings.userid == user_map.userid).\ join(video_map, ratings.videoid == video_map.videoid).\ select(user_map.userid, user_map.userid_int, video_map.videoid, video_map.videoid_int, "rating") # Create ALS transformer and train with the ratings from our C* table als = ALS(rank=10, maxIter=10).setUserCol("userid_int").setItemCol("videoid_int").setRatingCol("rating") model = als.fit(training_data) users = user_map.collect() user_map.unpersist() count = 0 length = len(users) for user in users: videos_and_user = video_map.withColumn("userid", lit(user.userid)).\ withColumn("userid_int", lit(user.userid_int)) model.transform(videos_and_user).\ sort("prediction", ascending=False).limit(30).\ select("videoid", "userid", col("prediction").alias("rating")).\ write.format("org.apache.spark.sql.cassandra").\ options(keyspace="killrvideo", table="video_recommendations_by_video").\
def als_model(userid, df):
    """Train, tune and apply an ALS recommender for one user.

    Args:
        userid: id of the user to recommend for.
        df: movie DataFrame providing at least the ``id`` and ``title``
            columns.

    The ratings are read from the ``movie_rating`` table, split 80/20, and
    an ALS model is fitted and evaluated (RMSE). ``regParam`` is then tuned
    with 2-fold cross-validation and the tuned model scores every movie
    the user has not rated yet.

    Returns:
        DataFrame with the 10 highest predicted movies for ``userid``.
    """
    # NOTE(review): the result is used like a pandas DataFrame below
    # (`.columns`, `.dtypes`) - presumably the session is configured with
    # a pandas row factory; confirm.
    als_df_pd = session.execute('SELECT * FROM movie_rating')

    movie_list_df = df.select('id', 'title')
    movie_list_df = movie_list_df.withColumn('userId', lit(userid))

    # Object columns are cast to str before the frame is handed to Spark.
    for column in als_df_pd.columns:
        if als_df_pd[column].dtypes == 'object':
            als_df_pd[column] = als_df_pd[column].astype('str')
    ratings = sqlContext.createDataFrame(als_df_pd)

    # Movies the user has NOT rated. The earlier right outer join kept
    # every movie, including the ones already rated; an anti join removes
    # them.
    watched_df = ratings.filter(ratings.userId == userid).select('movieId')
    mv_notwatched_df = movie_list_df\
        .join(watched_df, movie_list_df.id == watched_df.movieId,
              'left_anti')\
        .withColumnRenamed("id", "movieId")

    (training, test) = ratings.randomSplit([0.8, 0.2])

    # Build the recommendation model using ALS on the training data
    # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
    als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId",
              ratingCol="rating", coldStartStrategy="drop")
    model = als.fit(training)

    # Evaluate the model by computing the RMSE on the test data
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("ALS- Model Root-mean-square error before Tuning= " + str(rmse))

    # Generate top 10 movie recommendations for each user
    print("Top 10 movies recommended for each user")
    userRecs = model.recommendForAllUsers(10)
    userRecs.show(10)

    # Generate top 10 user recommendations for each movie
    print("Top 10 users recommended for each movie")
    movieRecs = model.recommendForAllItems(10)
    movieRecs.show(10)

    # Tune the model
    pipeline = Pipeline(stages=[als])
    paramGrid = ParamGridBuilder() \
        .addGrid(als.regParam, [0.1, 0.01]) \
        .build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=2)  # use 3+ folds in practice

    # Run cross-validation, and choose the best set of parameters.
    cvModel_fitted = crossval.fit(training)

    # Re-evaluate BEFORE reporting: the "after Tuning" line used to print
    # the stale pre-tuning RMSE.
    predictions = cvModel_fitted.transform(test)
    rmse = evaluator.evaluate(predictions)
    print("ALS model - Root-mean-square error after Tuning= " + str(rmse))
    print("Best Prediction Model")
    predictions.show(10)
    print("Root-mean-square error after cross validation = " + str(rmse))

    print("List of Movies not watched by the User")
    mv_notwatched_df.show(10)

    print("Top 10 movies Recommended")
    top_10 = cvModel_fitted.transform(mv_notwatched_df).orderBy(
        desc('prediction')).limit(10)
    top_10.show(10)
    return top_10
spark = SparkSession\
    .builder\
    .appName("ALSExample")\
    .getOrCreate()

# $example on$
lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
# `int` replaces the Python-2-only `long`: it raises NameError on Python 3,
# while on Python 2 `int` promotes to long automatically.
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId",
          ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

# Generate top 10 movie recommendations for a specified set of users
def run_train(spark, train, test_input, param_dict):
    """Fit the enabled recommender models and return them keyed by name.

    ``param_dict`` must provide ``"als_nn"`` (with ``"rank"``) and
    ``"als_nn_ibcf"`` (with ``"als_rank"`` and ``"ibcf_rank"``).

    The returned dict holds, in this order, the entries ``"ibcf"``,
    ``"ubcf"``, ``"als_nn"``, ``"als_nn_ibcf"`` and ``"baseline"``. The
    other variants (plain ALS, ALS+IBCF and the mean ensembles) are
    disabled and therefore neither trained nor returned.
    """
    # Both parameter sets are looked up before anything is trained.
    als_nn_param = param_dict["als_nn"]
    als_nn_ibcf_param = param_dict["als_nn_ibcf"]

    user_col = "MASV1"
    item_col = "F_MAMH"
    item_index_col = "F_MAMH_index"
    grade_col = "TKET"
    prediction_col = "prediction"

    def fit_nonnegative_als(rank, data):
        # Shared non-negative ALS configuration; only the rank differs.
        estimator = ALS(rank=rank, maxIter=15, regParam=0.01,
                        userCol="MASV1", itemCol="F_MAMH_index",
                        ratingCol="TKET", coldStartStrategy="drop",
                        nonnegative=True)
        return estimator.fit(data)

    # Item-based CF, fitted on the training data only
    ibcf_model = IBCFEstimator(spark, user_col, item_col, item_index_col,
                               grade_col, prediction_col).fit(train)

    # User-based CF, fitted on train + test_input without the item index
    nbcf_input = train.unionAll(test_input).drop(item_index_col)
    nbcf_model = NBCFEstimator(spark, user_col, item_col, grade_col,
                               prediction_col).fit(nbcf_input)

    # Both ALS models are trained on train + test_input
    als_input = train.unionAll(test_input)
    als_nn_model = fit_nonnegative_als(als_nn_param["rank"], als_input)

    # Non-negative ALS whose item factors feed an item-based CF model
    als_nn_ibcf_als_model = fit_nonnegative_als(
        als_nn_ibcf_param["als_rank"], als_input)
    item_index = IBCFWithItemFactor.create_item_index(
        als_input, "F_MAMH", "F_MAMH_index")
    als_nn_ibcf_model = IBCFWithItemFactor(
        spark, als_nn_ibcf_als_model.itemFactors, item_index)
    als_nn_ibcf_model = als_nn_ibcf_model.setUserCol("MASV1")
    als_nn_ibcf_model = als_nn_ibcf_model.setItemCol("F_MAMH")
    als_nn_ibcf_model = als_nn_ibcf_model.setValueCol("TKET")
    als_nn_ibcf_model = als_nn_ibcf_model.setRank(
        als_nn_ibcf_param["ibcf_rank"])

    # Mean-based baseline
    baseline_model = MeanTransformer(spark)
    baseline_model = baseline_model.setUserCol("MASV1")
    baseline_model = baseline_model.setItemCol("F_MAMH_index")
    baseline_model = baseline_model.setValueCol("TKET")
    baseline_model = baseline_model.setOutputCol("prediction")

    models = {
        "ibcf": ibcf_model,
        "ubcf": nbcf_model,
        "als_nn": als_nn_model,
        "als_nn_ibcf": als_nn_ibcf_model,
        "baseline": baseline_model,
    }
    return models
import os
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Column layout of the tab-separated ml-100k ratings file
names = ['user_id', 'item_id', 'rating', 'timestamp']

# Only the first 10000 ratings are used
df = pd.read_csv(
    "/Users/luokui/laji/ml-100k/u.data",
    sep="\t",
    header=None,
    names=names,
).head(10000)

builder = SparkSession.builder.master(
    "spark://luokuideMacBook-Pro.local:7077")
spark = builder.appName("test.als").getOrCreate()

data = spark.createDataFrame(df)

# 80/20 split into train+valid / testing, then 80/20 again into
# training / valid
(trainsss, testing) = data.randomSplit([0.8, 0.2])
(training, valid) = trainsss.randomSplit([0.8, 0.2])

# The pandas frame is not needed once the Spark DataFrame exists
del df

als = ALS(
    maxIter=10,
    regParam=0.01,
    userCol="user_id",
    itemCol="item_id",
    ratingCol="rating",
    coldStartStrategy="drop",
    numBlocks=6,
)
model = als.fit(training)
def get_best_param(spark, train, test_input, test_output, rank_list, ibcf_ranks):
    """Grid-search the IBCF, UBCF, non-negative ALS and ALS-factor IBCF models.

    Each candidate is scored through ``evaluate`` with three regression
    evaluators (rmse, mse, mae, in that order). The first element of the
    returned errors is what gets stored as the candidate's error in its
    ``Model_Error_Wrapper``; ``put_best_model`` decides whether the candidate
    replaces the current best of its family.

    Args:
        spark: Spark session handed to the project estimators/transformers.
        train: training ratings DataFrame.
        test_input: ratings of the test users that the models may see; it is
            unioned with ``train`` for UBCF and ALS fitting.
        test_output: held-out ratings the predictions are compared against.
        rank_list: ALS ranks to try.
        ibcf_ranks: rank values tried for IBCF, UBCF and the ALS-factor IBCF.

    Returns:
        ``(error_models, best_models)``: ``error_models`` maps the family
        names ``"als_nn"``, ``"als_nn_ibcf"``, ``"ibcf"`` and ``"ubcf"`` to a
        dict of errors per tried parameter; ``best_models`` holds the
        wrappers kept by ``put_best_model`` plus a ``"baseline"`` entry.
    """
    user_col = "MASV1"
    item_col = "F_MAMH"
    item_index_col = "F_MAMH_index"
    grade_col = "TKET"
    prediction_col = "prediction"

    # Order matters: index 0 of every evaluate() result is used as the error.
    evaluators = [
        RegressionEvaluator(metricName=metric, labelCol=grade_col,
                            predictionCol=prediction_col)
        for metric in ("rmse", "mse", "mae")
    ]

    error_list_als_nn = {}
    error_list_als_nn_ibcf = {}
    error_list_ibcf = {}
    error_list_nbcf = {}
    best_models = {}

    baseline_model = MeanTransformer(spark)\
        .setUserCol(user_col)\
        .setItemCol(item_index_col)\
        .setValueCol(grade_col)\
        .setOutputCol(prediction_col)

    # --- IBCF -------------------------------------------------------------
    ibcf_estimator = IBCFEstimator(spark, user_col, item_col, item_index_col,
                                   grade_col, prediction_col)
    train_part_df = ibcf_estimator.remove_unknown_item(train, test_input)
    validate_part_df = ibcf_estimator.remove_unknown_item(train, test_output)
    ibcf_model = ibcf_estimator.fit(train)
    for rank in ibcf_ranks:
        result_df = ibcf_model.transform(train_part_df.drop(item_index_col),
                                         validate_part_df.drop(item_index_col),
                                         rank)
        error_ibcf = evaluate(result_df, evaluators)
        error_list_ibcf[rank] = error_ibcf
        best_models = put_best_model(
            best_models, "ibcf",
            Model_Error_Wrapper("ibcf_{}".format(rank), ibcf_model,
                                error_ibcf[0], {"rank": rank}))

    # Training ratings plus the visible part of the test users. Built once:
    # both the UBCF and the ALS stages fit on exactly this union.
    train_with_input = train.unionAll(test_input)

    # --- UBCF (stored under the "ubcf" key) ---------------------------------
    nbcf_model = NBCFEstimator(spark, user_col, item_col, grade_col,
                               prediction_col)
    nbcf_model = nbcf_model.fit(train_with_input.drop(item_index_col))
    for rank in ibcf_ranks:
        # Predict without the label, then join the ground truth back on.
        result_df = nbcf_model\
            .transform(test_output.drop(item_index_col).drop(grade_col), rank)\
            .join(test_output.drop(item_index_col), [user_col, item_col])
        error_nbcf = evaluate(result_df, evaluators)
        error_list_nbcf[rank] = error_nbcf
        best_models = put_best_model(
            best_models, "ubcf",
            Model_Error_Wrapper("ubcf_{}".format(rank), nbcf_model,
                                error_nbcf[0], {"rank": rank}))

    # --- Non-negative ALS and IBCF on the ALS item factors ------------------
    for als_rank in rank_list:
        als_nn = ALS(rank=als_rank, maxIter=15, regParam=0.01,
                     userCol=user_col, itemCol=item_index_col,
                     ratingCol=grade_col, coldStartStrategy="drop",
                     nonnegative=True)
        als_nn_model = als_nn.fit(train_with_input)
        error_als_nn = evaluate(als_nn_model.transform(test_output), evaluators)
        error_list_als_nn[als_rank] = error_als_nn
        best_models = put_best_model(
            best_models, "als_nn",
            Model_Error_Wrapper("als_nn_{}".format(als_rank), als_nn_model,
                                error_als_nn[0], {"rank": als_rank}))

        for ibcf_rank in ibcf_ranks:
            # NOTE(review): the item index is rebuilt from the same input on
            # every pass; hoist it if create_item_index is side-effect free.
            als_nn_ibcf_model = IBCFWithItemFactor(
                spark, als_nn_model.itemFactors,
                IBCFWithItemFactor.create_item_index(
                    train_with_input, item_col, item_index_col)) \
                .setUserCol(user_col) \
                .setItemCol(item_col) \
                .setValueCol(grade_col) \
                .setRank(ibcf_rank)
            predict_als_nn_ibcf = als_nn_ibcf_model.transform(
                test_input, test_output.drop(grade_col))
            predict_with_gt = predict_als_nn_ibcf.join(
                test_output, [user_col, item_col])
            error_als_nn_ibcf = evaluate(predict_with_gt, evaluators)
            error_list_als_nn_ibcf["{}_{}".format(als_rank, ibcf_rank)] = \
                error_als_nn_ibcf
            best_models = put_best_model(
                best_models, "als_nn_ibcf",
                Model_Error_Wrapper(
                    "als_nn_ibcf_{}_{}".format(als_rank, ibcf_rank),
                    als_nn_ibcf_model, error_als_nn_ibcf[0],
                    {"als_rank": als_rank, "ibcf_rank": ibcf_rank}))

    # The baseline is registered unconditionally with an error of 0.
    best_models["baseline"] = Model_Error_Wrapper("baseline", baseline_model,
                                                  0, {})

    error_models = {}
    error_models["als_nn"] = error_list_als_nn
    error_models["als_nn_ibcf"] = error_list_als_nn_ibcf
    error_models["ibcf"] = error_list_ibcf
    error_models["ubcf"] = error_list_nbcf
    return error_models, best_models
def main(spark, train_file, test_file, rank, reg, alpha):
    '''Train an implicit-feedback ALS model and print ranking metrics.

    The model is fitted on the training parquet file; the top 500
    recommendations per test user are then compared with the items each
    user has in the test file, and mean average precision and precision
    at 500 are printed.

    Parameters
    ----------
    spark : SparkSession object
    train_file : string, path to the training parquet file
    test_file : string, path to the test parquet file
    rank : ALS rank; a number or numeric string holding an integral value
    reg : ALS regParam; converted with float()
    alpha : ALS alpha; converted with float()

    Raises
    ------
    TypeError : if rank is numeric but not an integral value
    ValueError : if rank, reg or alpha cannot be parsed as a number
    '''
    # Validate the hyper-parameters first so that bad arguments fail before
    # any Spark I/O is started. ALS's rank is an integer parameter, so it is
    # kept as an int (a non-integral rank was rejected by Spark with a
    # TypeError; the same exception type is raised here).
    rank_value = float(rank)
    if not rank_value.is_integer():
        raise TypeError("Could not convert %s to int" % rank)
    rank = int(rank_value)
    reg = float(reg)
    alpha = float(alpha)

    # Load the dataframes
    train = spark.read.parquet(train_file)
    test = spark.read.parquet(test_file)

    # Give the dataframes temporary views so SQL queries can be run on them
    train.createOrReplaceTempView('train')
    test.createOrReplaceTempView('test')

    # Build the model for the input parameters
    als = ALS(implicitPrefs=True, userCol="user_idx", itemCol="item_idx",
              ratingCol="count")\
        .setParams(rank=rank, regParam=reg, alpha=alpha)
    model = als.fit(train)
    print("model fitted")

    # Create prediction and truth lists
    k = 500
    recommendations = model.recommendForUserSubset(test, k)
    perUserRecom = recommendations.selectExpr(
        "user_idx", "recommendations.item_idx as prediction")

    # Per user: the test items, collected after sorting by descending count.
    label_list = test.orderBy(F.col("user_idx"), F.expr("count DESC"))\
        .groupby("user_idx")\
        .agg(F.expr("collect_list(item_idx) as label"))
    perUserItem = label_list.select("user_idx", "label")
    print("predition and label")

    # Inner join keeps only users present on both sides.
    predictionAndLabel = perUserItem.join(perUserRecom, "user_idx")\
        .rdd.map(lambda row: (row.prediction, row.label))
    print("inner join")

    # Use Ranking Metrics for evaluation
    metrics = RankingMetrics(predictionAndLabel)
    mean_precision = metrics.meanAveragePrecision
    print(
        "At rank={0}, regParam={1}, alpha = {2}, mean average precision is {3}"
        .format(rank, reg, alpha, mean_precision))

    # Use only for final indexed_test.parquet
    k_precision = metrics.precisionAt(k)
    print(
        "At rank={0}, regParam={1}, alpha = {2}, precision at top 500 words is {3}"
        .format(rank, reg, alpha, k_precision))
# vytvaram jednu tabulku v ktorej su data potrebne na ucenie full_data = scores_data.join(users_data, "username").join(anime_data, "anime_id") recommend_data = full_data.select("user_id", "anime_id", "my_score") # niektore ciselne polia mi inferschema dalo ako string preto ich musim precastovat recommend_data = recommend_data.withColumn( "anime_id", recommend_data["anime_id"].cast(IntegerType())) recommend_data = recommend_data.withColumn( "my_score", recommend_data["my_score"].cast(FloatType())) # samotne ucenie training, test = recommend_data.randomSplit([0.8, 0.2], seed=42) als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="anime_id", ratingCol="my_score", coldStartStrategy="drop") model = als.fit(training) # chyba predikcie predictions = model.transform(test) evaluator = RegressionEvaluator(metricName="rmse", labelCol="my_score", predictionCol="prediction") rmse = evaluator.evaluate(predictions) print("Root-mean-square error = " + str(rmse)) # pre kazdeho pouzivatela urobim predikciu pre 3 anime ktore by sa mu mali najviac pacit aj s predikovanym skore userRecs = model.recommendForAllUsers(3) userRecs = userRecs.withColumn("predicted_score",
class MovieRecommendation:
    """ALS movie recommender backed by the ``movies``/``ratings`` MySQL tables.

    Constructing an instance loads both tables over JDBC, computes per-movie
    rating statistics, splits the ratings 60/20/20, picks the ALS rank with
    the lowest validation RMSE and evaluates that model on the test split.
    """

    # NOTE(review): connection settings and credentials are hard-coded.
    _JDBC_URL = "jdbc:mysql://127.0.0.1:3306/music"
    _JDBC_DRIVER = "com.mysql.jdbc.Driver"

    def __init__(self, sc, sqlcontext):
        """Build the data frames and train the model.

        ``sqlcontext`` is accepted for interface compatibility but is not
        used: a new ``SQLContext`` is created from ``sc``.
        """
        self.sqlContext = SQLContext(sc)
        self.createDf()
        self.topRatedMovies()
        self.splitDataset()
        self.alternatingLeastSquare()
        # Keep the result instead of discarding it in an unused local.
        self.test_RMSE = self.testModel()

    def _read_table(self, table):
        """Load one table of the ``music`` database through JDBC."""
        return (self.sqlContext.read.format("jdbc")
                .option("url", self._JDBC_URL)
                .option("driver", self._JDBC_DRIVER)
                .option("dbtable", table)
                .option("user", "root")
                .option("password", "root")
                .load())

    @staticmethod
    def _drop_nan_predictions(df):
        """Remove rows whose ``prediction`` is NaN (due to SPARK-14489).

        Spark SQL treats NaN as equal to NaN, so ``!= NaN`` filters them out.
        """
        return df.filter(df['prediction'] != float('nan'))

    def createDf(self):
        """Load and cache ``movies_df`` and ``ratings_df``.

        The movies key column ``movieId`` is renamed to ``ID``; the ratings
        ``timestamp`` column is dropped.
        """
        movies = self._read_table("movies")
        self.movies_df = movies.withColumnRenamed("movieId", "ID").cache()

        ratings = self._read_table("ratings")
        self.ratings_df = ratings.drop('timestamp').cache()

    def topRatedMovies(self):
        """Compute per-movie average rating and rating count.

        Sets ``movie_names_with_avg_ratings_df`` (movieId, average, count),
        ``moviesRatingsJoined_df`` (movies joined with those statistics,
        sorted by descending average) and
        ``moviesWithHighestRatingWithCountMoreThan500``.
        """
        self.movie_names_with_avg_ratings_df = (
            self.ratings_df.groupBy('movieId')
            .agg({'rating': 'avg', 'userId': 'count'})
            .withColumnRenamed('avg(rating)', 'average')
            .withColumnRenamed('count(userId)', 'count'))

        stats = self.movie_names_with_avg_ratings_df
        self.moviesRatingsJoined_df = self.movies_df.join(
            stats, self.movies_df.ID == stats.movieId, 'inner')
        self.moviesRatingsJoined_df = self.moviesRatingsJoined_df.sort(
            self.moviesRatingsJoined_df.average.desc()).drop('ID')

        self.moviesWithHighestRatingWithCountMoreThan500 = \
            self.moviesRatingsJoined_df.filter('count >= 500')

    def splitDataset(self):
        """Split the ratings 60/20/20 (seed 123) into cached train/validation/test."""
        train_part, validation_part, test_part = \
            self.ratings_df.randomSplit([0.6, 0.2, 0.2], 123)
        self.training_df = train_part.cache()
        self.validation_df = validation_part.cache()
        self.test_df = test_part.cache()

    def alternatingLeastSquare(self):
        """Fit one ALS model per rank in ``self.ranks`` and keep the best.

        The model with the lowest validation RMSE is stored in
        ``self.my_model``; its rank in ``self.best_rank`` and its error in
        ``self.min_error``. All models and errors are kept in
        ``self.models`` / ``self.errors``.
        """
        self.als = ALS(maxIter=5, regParam=0.01, userCol='userId',
                       itemCol='movieId', ratingCol='rating')

        # Create an RMSE evaluator using the label and predicted columns
        self.reg_eval = RegressionEvaluator(predictionCol='prediction',
                                            labelCol='rating',
                                            metricName='rmse')
        self.tolerance = 0.03
        self.ranks = [4, 8, 12]
        self.errors = []
        self.models = []
        self.min_error = float('inf')
        self.best_rank = -1

        # The previous extra fit before this loop produced a model that was
        # overwritten immediately; every model that is used is fitted here.
        for rank in self.ranks:
            self.als.setRank(rank)
            model = self.als.fit(self.training_df)

            # Predict against the validation split and drop NaN predictions.
            self.predict_df = model.transform(self.validation_df)
            self.predicted_ratings_df = \
                self._drop_nan_predictions(self.predict_df)

            error = self.reg_eval.evaluate(self.predicted_ratings_df)
            self.errors.append(error)
            self.models.append(model)
            if error < self.min_error:
                self.min_error = error
                self.best_rank = rank
                self.my_model = model

        self.als.setRank(self.best_rank)

    def testModel(self):
        """Return the RMSE of ``self.my_model`` on the test split."""
        self.predict_df = self.my_model.transform(self.test_df)
        predicted_test_df = self._drop_nan_predictions(self.predict_df)
        self.reg_eval = RegressionEvaluator(predictionCol='prediction',
                                            labelCol='rating',
                                            metricName='rmse')
        return self.reg_eval.evaluate(predicted_test_df)

    def get_top_ratings(self, user_id, movies_count):
        """Predict ratings for the movies ``user_id`` has not rated.

        Returns a list of dicts (one per movie, sorted by descending
        prediction) restricted to movies with at least 50 ratings and a
        predicted rating of at least 3.0.

        NOTE(review): ``movies_count`` is not applied; the full list is
        returned. Confirm with callers before adding a limit.
        """
        # Column comparison instead of a string-built SQL filter.
        rated_by_user_df = self.ratings_df.filter(
            self.ratings_df['userId'] == user_id)
        rated_by_user_df.show(5)
        rated_movie_ids = [
            row.movieId
            for row in rated_by_user_df.select('movieId').collect()
        ]
        not_rated_df = self.movies_df.filter(
            ~self.movies_df['ID'].isin(rated_movie_ids))
        print(rated_movie_ids)

        # Rename "ID" back to "movieId" and attach the user id as "userId".
        unrated_movies_df = not_rated_df.selectExpr('ID as movieId') \
            .withColumn('userId', F.lit(user_id))
        unrated_movies_df.show(5)

        # Predict ratings for the movies the user did not rate.
        raw_predicted_ratings_df = self.my_model.transform(unrated_movies_df)
        predicted_ratings_df = \
            self._drop_nan_predictions(raw_predicted_ratings_df) \
            .withColumnRenamed("movieId", "ID")
        predicted_ratings_df.show(5)

        # Join with the per-movie statistics to obtain the rating counts.
        predicted_with_counts_df = predicted_ratings_df.join(
            self.moviesRatingsJoined_df,
            self.moviesRatingsJoined_df['movieId'] ==
            predicted_ratings_df['ID']).drop('ID')
        predicted_with_counts_df = predicted_with_counts_df.sort(
            predicted_with_counts_df.prediction.desc()) \
            .filter('count >= 50').filter('prediction >= 3.0')

        # A real list on both Python 2 and Python 3.
        return [row.asDict() for row in predicted_with_counts_df.collect()]

    def add_ratings(self, ratings):
        """Reload the ratings table and retrain the model on all of it.

        ``ratings`` is printed and returned unchanged; it is not written
        anywhere by this method. The model is refitted with the best rank
        found by ``alternatingLeastSquare``.

        NOTE(review): this refit uses regParam=0.1 and seed 123, while the
        rank search uses regParam=0.01 and no explicit seed.
        """
        print(ratings)
        self.ratings_df = self._read_table("ratings")
        self.topRatedMovies()

        # One constructor call with the values that were effective after the
        # former constructor-plus-setter chain.
        self.als = ALS(maxIter=5, regParam=0.1, seed=123,
                       rank=self.best_rank, userCol='userId',
                       itemCol='movieId', ratingCol='rating',
                       predictionCol='prediction')
        self.my_model = self.als.fit(self.ratings_df)
        return ratings
#!/usr/bin/env python from pyspark.ml.recommendation import ALS als = ALS(maxIter=10, regParam=0.01, userCol="user_id", itemCol="book_id", ratingCol="rating", coldStartStrategy="drop", implicitPrefs=False, seed=42) model = als.fit(train) def evaluation(model, val, metric): from pyspark.mllib.evaluation import RegressionMetrics import pyspark.sql.functions as f from pyspark.sql.functions import * #all users in val user_val = val.select('user_id').distinct() #recommend top 500 books for each user in val val_rec = model.recommendForUserSubset(user_val, 500) #print(val_rec.first()) #DataFrame[user_id: int, recommendations: array<struct<book_id:int,rating:float>>] #####Reshape the dataframe######
def tune_ALS_NLP(spark, train_data, validation_data, val_true_list, maxIter,
                 regParams, ranks, review_val_predictions):
    """Grid-search ALS and pick the best models by RMSE and by MAP.

    For every (maxIter, rank, regParam) combination an ALS model is fitted
    and its validation predictions are outer-joined with the review-based
    predictions. The review prediction is used where present, the ALS
    prediction otherwise. The 500 best-ranked rows per user are scored with
    RMSE and with ``MAP.getMAP``; each result is appended as a CSV row to
    ``train05_review_eval.csv``.

    Args:
        spark: unused; kept for interface compatibility.
        train_data: DataFrame ALS is fitted on.
        validation_data: DataFrame the fitted model predicts for.
        val_true_list: ground truth passed through to ``MAP.getMAP``.
        maxIter: iterable of ALS ``maxIter`` values.
        regParams: iterable of ALS ``regParam`` values.
        ranks: iterable of ALS ranks.
        review_val_predictions: DataFrame with user_id, book_id, rating and
            a ``prediction`` column (renamed to ``review_prediction`` here).

    Returns:
        ``(best_model_rmse, best_model_map)``. ``best_model_map`` stays
        ``None`` when no combination reaches a MAP above 0.0.
    """
    min_error = float('inf')
    best_iter1 = -1
    best_rank1 = -1
    best_regularization1 = 0
    best_model_rmse = None

    max_map = 0.0
    best_iter2 = -1
    best_rank2 = -1
    best_regularization2 = 0
    best_model_map = None

    # Loop-invariant pieces, built once instead of once per grid point.
    review_predictions = review_val_predictions.withColumnRenamed(
        'prediction', 'review_prediction')
    evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                                    predictionCol='total_prediction')

    for iteration in maxIter:
        for current_rank in ranks:
            for reg in regParams:
                als = ALS(maxIter=iteration, regParam=reg, rank=current_rank,
                          userCol='user_id', itemCol='book_id',
                          ratingCol='rating', coldStartStrategy="drop",
                          nonnegative=True)
                als_model = als.fit(train_data)
                als_predictions = als_model.transform(validation_data) \
                    .withColumnRenamed('prediction', 'als_prediction')

                # Outer join keeps rows predicted by only one of the sources.
                total_predictions = als_predictions.join(
                    review_predictions, ['user_id', 'book_id', 'rating'],
                    'outer')
                # Prefer the review prediction, fall back to ALS.
                total_predictions = total_predictions.withColumn(
                    'total_prediction',
                    when(total_predictions['review_prediction'].isNotNull(),
                         total_predictions['review_prediction'])
                    .otherwise(total_predictions['als_prediction']))

                # Keep rows ranked 500 or better per user.
                window = Window.partitionBy(total_predictions['user_id']) \
                    .orderBy(total_predictions['total_prediction'].desc())
                top_predictions = total_predictions \
                    .select('*', rank().over(window).alias('row_num')) \
                    .filter(col('row_num') <= 500)

                # rmse
                rmse = evaluator.evaluate(top_predictions)
                if rmse < min_error:
                    min_error = rmse
                    best_rank1 = current_rank
                    best_regularization1 = reg
                    best_iter1 = iteration
                    best_model_rmse = als_model

                # MAP
                current_map = MAP.getMAP(top_predictions, val_true_list)
                if current_map > max_map:
                    max_map = current_map
                    best_rank2 = current_rank
                    best_regularization2 = reg
                    best_iter2 = iteration
                    best_model_map = als_model

                # The two metrics used to be printed with no separator
                # between them ("...RMSE is 0.9validation MAP is 0.1").
                print('{} latent factors and regularization = {} with maxIter {}: '
                      'validation RMSE is {}, validation MAP is {}'
                      .format(current_rank, reg, iteration, rmse, current_map))

                # Append this grid point to the running CSV log.
                with open('train05_review_eval.csv', 'ab') as f:
                    np.savetxt(
                        f,
                        [np.array([iteration, current_rank, reg, rmse,
                                   current_map])],
                        delimiter=",")

    print('\nThe best model select by RMSE has {} latent factors and '
          'regularization = {}'' with maxIter = {}: RMSE = {}'.format(
              best_rank1, best_regularization1, best_iter1, min_error))
    print('\nThe best model select by MAP has {} latent factors and '
          'regularization = {}'' with maxIter = {}: MAP = {}'.format(
              best_rank2, best_regularization2, best_iter2, max_map))
    return best_model_rmse, best_model_map
class RecommendationEngine:
    """A product recommendation engine

    Holds up to three rating datasets (``df0``, ``df1``, ``df2``) and one
    ALS model per dataset (``model1``, ``model2``, ``model3``). The public
    methods take a ``model`` index of 0, 1 or 2 selecting the dataset/model
    pair; any other value makes them return ``None``.
    """

    # (dataset attribute, fitted-model attribute) for model index 0, 1, 2.
    _SLOTS = (("df0", "model1"), ("df1", "model2"), ("df2", "model3"))

    def __init__(self, spark_session, dataset_path):
        """Init the recommendation engine given a Spark context and a dataset path

        Reads ``model-1.txt``, ``model-2.txt`` and ``model-3.txt`` from
        ``dataset_path`` (header-less CSV: user, product, rating) and trains
        one model per dataset.

        NOTE(review): a missing file leaves its ``dfN`` attribute unset, so
        the training step below fails with AttributeError.
        """
        logger.info("Starting up the Recommendation Engine: ")
        self.spark_session = spark_session

        # Load Amazon data for later use
        logger.info("Loading Amazon data...")
        for number, (dataset_attr, _) in enumerate(self._SLOTS, 1):
            file_path = os.path.join(dataset_path, 'model-%d.txt' % number)
            if os.path.isfile(file_path):
                frame = spark_session.read.csv(file_path, header=None,
                                               inferSchema=True)
                setattr(self, dataset_attr,
                        frame.selectExpr("_c0 as UserId", "_c1 as ProductId",
                                         "_c2 as Rating"))

        # Train the model
        self.__train_all_model()

    def _slot(self, model):
        """Return the (dataset, model) attribute names for ``model``, or None."""
        for index, names in enumerate(self._SLOTS):
            if model == index:
                return names
        return None

    def _new_als(self):
        """Create the ALS estimator used for every model and store it in ``self.als``."""
        self.als = ALS(maxIter=5, regParam=0.01, userCol="UserId",
                       itemCol="ProductId", ratingCol="Rating",
                       coldStartStrategy="drop")
        return self.als

    def _fit_slot(self, names, als):
        """Fit ``als`` on the slot's dataset and store the resulting model."""
        dataset_attr, model_attr = names
        setattr(self, model_attr, als.fit(getattr(self, dataset_attr)))

    def _recommend_json(self, model, count, key_value, for_users):
        """Shared body of the two recommendation methods.

        Looks up ``key_value`` in the selected dataset, asks the selected
        model for ``count`` recommendations and returns them as the JSON
        string of a two-column pandas frame (key column, recommended id).
        Returns None when ``model`` is not 0, 1 or 2.
        """
        names = self._slot(model)
        if names is None:
            return None
        dataset = getattr(self, names[0])
        fitted = getattr(self, names[1])

        if for_users:
            key_col, other_col = self.als.getUserCol(), self.als.getItemCol()
            recommend = fitted.recommendForUserSubset
        else:
            key_col, other_col = self.als.getItemCol(), self.als.getUserCol()
            recommend = fitted.recommendForItemSubset

        subset = dataset.select(key_col)
        subset = subset.filter(subset[key_col] == key_value)
        recs = recommend(subset, count)

        # One row per recommendation, then keep only the two id columns
        # (the predicted rating is not part of the result).
        recs = recs.withColumn("recommendations", explode("recommendations"))
        recs = recs.select(
            func.col(key_col),
            func.col('recommendations')[other_col].alias(other_col))
        return recs.toPandas().to_json()

    def __train_all_model(self):
        """Train the ALS model with the current dataset
        (one model per dataset, in order)."""
        for number, names in enumerate(self._SLOTS, 1):
            logger.info("Training the ALS model %d", number)
            self._fit_slot(names, self._new_als())
            logger.info("ALS model %d built!", number)

    def __train_model(self, model):
        """Train the ALS model with the current dataset
        for the single model index ``model``.

        ``self.als`` is replaced even when ``model`` is not 0, 1 or 2; in
        that case nothing is fitted.
        """
        logger.info("Training the ALS model...")
        als = self._new_als()
        names = self._slot(model)
        if names is not None:
            self._fit_slot(names, als)
        logger.info("ALS model built!")

    def get_top_ratings(self, model, user_id, products_count):
        """Recommend ``products_count`` products for ``user_id``.

        Returns the JSON string of a pandas frame with the columns
        ``UserId`` and ``ProductId``, or None for an unknown ``model``.
        """
        return self._recommend_json(model, products_count, user_id, True)

    def get_top_product_recommend(self, model, product_id, user_count):
        """Recommend ``user_count`` users for ``product_id``.

        Returns the JSON string of a pandas frame with the columns
        ``ProductId`` and ``UserId``, or None for an unknown ``model``.
        """
        return self._recommend_json(model, user_count, product_id, False)

    def get_ratings_for_product_ids(self, model, user_id, product_id):
        """Predict the rating of ``product_id`` by ``user_id``.

        Returns the collected rows of the model's ``transform`` output for
        that single pair, or None for an unknown ``model``.
        """
        names = self._slot(model)
        if names is None:
            return None
        request = self.spark_session.createDataFrame(
            [(user_id, product_id)], ["UserId", "ProductId"])
        return getattr(self, names[1]).transform(request).collect()
Test.assertEquals(training_df.filter((ratings_df.userId == 1) & (ratings_df.movieId == 1196) & (ratings_df.rating == 4.5)).count(), 1) Test.assertEquals(validation_df.filter((ratings_df.userId == 1) & (ratings_df.movieId == 296) & (ratings_df.rating == 4.0)).count(), 1) Test.assertEquals(validation_df.filter((ratings_df.userId == 1) & (ratings_df.movieId == 32) & (ratings_df.rating == 3.5)).count(), 1) Test.assertEquals(validation_df.filter((ratings_df.userId == 1) & (ratings_df.movieId == 6888) & (ratings_df.rating == 3.0)).count(), 1) Test.assertEquals(test_df.filter((ratings_df.userId == 1) & (ratings_df.movieId == 4993) & (ratings_df.rating == 5.0)).count(), 1) Test.assertEquals(test_df.filter((ratings_df.userId == 1) & (ratings_df.movieId == 4128) & (ratings_df.rating == 4.0)).count(), 1) Test.assertEquals(test_df.filter((ratings_df.userId == 1) & (ratings_df.movieId == 4915) & (ratings_df.rating == 3.0)).count(), 1) # TODO: Replace <FILL IN> with appropriate code # This step is broken in ML Pipelines: https://issues.apache.org/jira/browse/SPARK-14489 from pyspark.ml.recommendation import ALS # Let's initialize our ALS learner als = ALS() # Now we set the parameters for the method als.setMaxIter(5)\ .setSeed(seed)\ .setRegParam(0.1)\ .<FILL_IN> # Now let's compute an evaluation metric for our test dataset from pyspark.ml.evaluation import RegressionEvaluator # Create an RMSE evaluator using the label and predicted columns reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName="rmse") tolerance = 0.03 ranks = [4, 8, 12]
import math

spark = SparkSession.builder.appName("hw1").getOrCreate()

# --- Training data: tab-separated userId, movieId, rating, timestamp -------
all_lines = spark.read.text("train.dat").rdd
divs = all_lines.map(lambda line: line.value.split("\t"))
row_rdd = divs.map(
    lambda fields: Row(
        userId=int(fields[0]),
        movieId=int(fields[1]),
        rating=float(fields[2]),
        timestamp=int(fields[3]),
    )
)
df = spark.createDataFrame(row_rdd)
df.show(10)

# coldStartStrategy="nan": pairs the model cannot score keep a NaN prediction
# instead of being dropped.
als = ALS(
    maxIter=8,
    regParam=0.085,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="nan",
)
model = als.fit(df)

# --- Test data: tab-separated userId, movieId ------------------------------
all_lines_test = spark.read.text("test.dat").rdd
divs_test = all_lines_test.map(lambda line: line.value.split("\t"))
row_rdd_test = divs_test.map(
    lambda fields: Row(userId=int(fields[0]), movieId=int(fields[1]))
)
df_test = spark.createDataFrame(row_rdd_test)
df_test.show(10)

# Tag every test row with an increasing id before predicting.
res = df_test.withColumn("col_id", monotonically_increasing_id())
res.show(10)

predictions = model.transform(res)
predictions.show(20)
# See some statistics about the train, validation and test data print('Statistics for Training Data: ') train.describe().show() print('Statistics for Validation Data: ') val.describe().show() print('Statistics for Test Data: ') test.describe().show() # After doing the hyperparameter tuning, ideal values for rank and regParam are: rank = 50 and regParam = 0.09 r = 50 l = 0.09 als = ALS(rank=r, regParam=l, userCol='user_id', itemCol='book_id', ratingCol='rating', coldStartStrategy='drop', nonnegative=True) # Train the model model = als.fit(train) # RMSE value evalutation (Regression Metric) evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction') # Prediction of rating for validation set predictions = model.transform(val) predictions = predictions.withColumn("prediction",
# MAGIC Using the ML Pipeline's [CrossValidator](http://spark.apache.org/docs/1.6.2/api/python/pyspark.ml.html#pyspark.ml.tuning.CrossValidator) with ALS is thus problematic, because cross validation involves dividing the training data into a set of folds (e.g., three sets) and then using those folds for testing and evaluating the parameters during the parameter grid search process. It is likely that some of the folds will contain users that are not in the other folds, and, as a result, ALS produces NaN values for those new users. When the CrossValidator uses the Evaluator (RMSE) to compute an error metric, the RMSE algorithm will return NaN. This will make *all* of the parameters in the parameter grid appear to be equally good (or bad). # MAGIC # MAGIC You can read the discussion on [Spark JIRA 14489](https://issues.apache.org/jira/browse/SPARK-14489) about this issue. There are proposed workarounds of having ALS provide default values or having RMSE drop NaN values. Both introduce potential issues. We have chosen to have RMSE drop NaN values. While this does not solve the underlying issue of ALS not predicting a value for a new user, it does provide some evaluation value. We manually implement the parameter grid search process using a for loop (below) and remove the NaN values before using RMSE. # MAGIC # MAGIC For a production application, you would want to consider the tradeoffs in how to handle new users. # MAGIC # MAGIC **Note**: This cell will likely take a couple of minutes to run. 
# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# This step is broken in ML Pipelines: https://issues.apache.org/jira/browse/SPARK-14489
from pyspark.ml.recommendation import ALS

# Create the ALS learner with default settings
als = ALS()

# Configure it: iterations, seed, regularization and the three column names
(als.setMaxIter(5)
    .setSeed(seed)
    .setRegParam(0.1)
    .setUserCol("userId")
    .setItemCol("movieId")
    .setRatingCol("rating"))

# Evaluation metric for the predictions
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE between the "rating" label and the "prediction" column
reg_eval = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)

tolerance = 0.03
ranks = [4, 8, 12]
# One placeholder slot per rank for the errors and the fitted models
errors = [0] * len(ranks)
models = [0] * len(ranks)
err = 0
def main(spark, train_data_file, test_data_file, model_file):
    """Index the data, fit ALS over a parameter grid, and save top-500 lists.

    For every (rank, regParam, alpha) combination an ALS model is fitted on
    the log-compressed play counts, the top 500 items per user are joined with
    the items each user has in the test set, and the joined frame is written
    to a parquet file named after the parameters.

    Args:
        spark: Active SparkSession.
        train_data_file: Parquet path of the training interactions.
        test_data_file: Parquet path of the test interactions.
        model_file: Not used by the active code; only the commented-out
            "save best model" step refers to it.
    """
    start = time.time()
    clock = {"last": start}

    def report_lap():
        # Print the seconds elapsed since the previous checkpoint and reset it.
        now = time.time()
        print(now - clock["last"])
        clock["last"] = now

    # Use Validation and Test user_id to filter Train data, to get the 110k mandatory users
    # Stored here hdfs:/user/dz584/cf_train_sample.parquet
    """
    training_data = spark.read.parquet('hdfs:/user/bm106/pub/project/cf_train.parquet')
    validation_data = spark.read.parquet('hdfs:/user/bm106/pub/project/cf_validation.parquet')
    testing_data = spark.read.parquet('hdfs:/user/bm106/pub/project/cf_test.parquet')
    validandtest_userid = validation_data.union(testing_data).select('user_id').distinct()
    validandtest_userid.createOrReplaceTempView('validandtest_userid')
    training_data.createOrReplaceTempView('training_data')
    training_data = spark.sql("SELECT * FROM training_data WHERE user_id IN (SELECT user_id FROM validandtest_userid GROUP BY user_id)")
    training_data.write.parquet("cf_train_sample.parquet")
    """

    raw_training = spark.read.parquet(train_data_file)

    # Both indexers are fitted on the training data only; ids unseen at fit
    # time are skipped when transforming.
    user_indexer = StringIndexer(
        inputCol="user_id", outputCol="userindex").setHandleInvalid("skip")
    user_indexer_model = user_indexer.fit(raw_training)
    item_indexer = StringIndexer(
        inputCol="track_id", outputCol="itemindex").setHandleInvalid("skip")
    item_indexer_model = item_indexer.fit(raw_training)

    wanted_columns = ('userindex', 'itemindex', 'count')
    training_data = item_indexer_model.transform(
        user_indexer_model.transform(raw_training)).select(*wanted_columns)
    testing_data = item_indexer_model.transform(
        user_indexer_model.transform(
            spark.read.parquet(test_data_file))).select(*wanted_columns)

    # Add Log Compression
    training_data.createOrReplaceTempView('training_data')
    training_data = spark.sql(
        "SELECT *, count+1 as plus_count FROM training_data")
    training_data = training_data.withColumn("log_count", F.log("plus_count"))
    print('Finished Indexing!')
    report_lap()

    result_dict = {}
    rank_list = [500, 600, 700]  #[10,20,30,50]
    reg_param_list = [0.7]  #[0.1,0.5]
    alpha_list = [1]  #[1,1.5]
    param_grid = [(rank, reg_param, alpha)
                  for rank in rank_list
                  for reg_param in reg_param_list
                  for alpha in alpha_list]

    for current_key in param_grid:
        rank, reg_param, alpha = current_key
        # NOTE(review): implicitPrefs is left at its default here; per the
        # Spark docs alpha only applies to the implicit-feedback formulation,
        # so the alpha grid presumably has no effect -- confirm the intent.
        als = ALS(maxIter=5, userCol="userindex", itemCol="itemindex",
                  ratingCol="log_count", rank=rank, regParam=reg_param,
                  alpha=alpha)
        model = als.fit(training_data)
        print('Finished Modeling with Param:', current_key)
        report_lap()

        prediction = model.recommendForAllUsers(500).select(
            'userindex', 'recommendations.itemindex')
        print('Finished Prediction DF!')
        testing_df = testing_data.groupBy('userindex').agg(
            expr('collect_list(itemindex) as item_list'))
        print('Finished Label DF!')
        predictionAndLabels = prediction.join(testing_df, 'userindex')

        # Output name encodes the parameters; dots are stripped so float
        # values do not look like file extensions.
        name_stem = ''.join(
            ['logplus_rk', str(rank), 'reg', str(reg_param), 'a', str(alpha)])
        predandlabel_name = name_stem.replace(".", "") + '.parquet'
        predictionAndLabels.write.parquet(predandlabel_name)
        print('Joined Prediction and Labels!')
        report_lap()

        # pred_df = predictionAndLabels.select(['itemindex','item_list']).rdd.map(list)
        # metrics = RankingMetrics(pred_df)
        # print('Ranking Metrics Calculated!')
        # time_b = time.time()
        # print(time_b - time_a)
        # time_a = time_b
        # eva = metrics.meanAveragePrecision
        # result_dict[current_key] = eva
        # print(current_key,"parameter combination has been trained! MAP= ", eva)
        # time_b = time.time()
        # print(time_b - time_a)
        # time_a = time_b

    # best_model_param = max(result_dict, key=result_dict.get)
    # als = ALS(maxIter=5, userCol="userindex", itemCol="itemindex", ratingCol="count", rank=best_model_param[0], regParam=best_model_param[1], alpha=best_model_param[2])
    # als.fit(training_data).write().overwrite().save(model_file)
    print('Process Finished!')
    print(time.time() - start)
def generate_predictions(training_df, prediction_df, rank, model=None):
    """Score ``prediction_df`` with an ALS model and drop rows containing nulls.

    Args:
        training_df: DataFrame used to fit a new implicit-feedback ALS model
            when ``model`` is not supplied; ignored otherwise.
        prediction_df: DataFrame to score with ``model.transform``.
        rank: Number of latent factors for a newly fitted model.
        model: Optional already-fitted model to reuse instead of fitting.

    Returns:
        The transformed DataFrame after ``dropna()``.
    """
    # Identity check against None; the estimator is only built when a model
    # really has to be fitted.
    if model is None:
        iterations = 10
        als = ALS(rank=rank, maxIter=iterations, implicitPrefs=True)
        model = als.fit(training_df)
    return model.transform(prediction_df).dropna()
ratings_with_user_and_item_Idx = ratings_with_userIdx.join(
    item_index,
    on=['itemId'],
    how='left',
)

## persisting this dataframe is the key:
# https://medium.com/@meltem.tutar/pyspark-under-the-hood-randomsplit-and-sample-inconsistencies-examined-7c6ec62644bc
ratings_with_user_and_item_Idx.persist()
training, test = ratings_with_user_and_item_Idx.randomSplit([0.99, 0.01])

# Build the recommendation model using ALS on the training data.
# The cold start strategy is 'drop' so that evaluation metrics are not NaN.
als = ALS(
    rank=16,
    maxIter=10,
    regParam=0.2,
    userCol="userIdx",
    itemCol="itemIdx",
    ratingCol="rating",
    coldStartStrategy="drop",
)
model = als.fit(training)

# start_time is set earlier in the script, outside this snippet.
end_time = time.time()
print("Time elapsed %f" % (end_time - start_time))

# Score the held-out split; the RMSE evaluation below is currently disabled.
predictions = model.transform(test)
# evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
#                                 predictionCol="prediction")
# rmse = evaluator.evaluate(predictions)
# MAGIC For a production application, you would want to consider the tradeoffs in how to handle new users.
# MAGIC
# MAGIC **Note**: This cell will likely take a couple of minutes to run.

# COMMAND ----------

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
# This step is broken in ML Pipelines: https://issues.apache.org/jira/browse/SPARK-14489
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Build the ALS learner and configure it one parameter at a time
# (each setter mutates the estimator in place).
als = ALS()
als.setMaxIter(5)
als.setSeed(seed)
als.setRegParam(0.1)
als.setUserCol("userId")
als.setItemCol("movieId")
als.setRatingCol("rating")

# RMSE evaluator comparing the "rating" label with the "prediction" column.
reg_eval = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

tolerance = 0.03
ranks = [4, 8, 12]
class Predictor(object):
    """Rating predictor built on Spark ALS plus item-item cosine similarity.

    Intended call order: ``fit_item_index`` (builds the item index), then
    ``fit`` (trains ALS and the item-similarity table), then either
    ``predict_using_cosine_similarity`` or ``transform``.
    """

    def __init__(self,
                 spark,
                 user_col_name,
                 item_col_name,
                 rating_col_name,
                 rank=15,
                 maxIter=15,
                 regParam=0.01):
        """Configure the ALS estimator and the item StringIndexer.

        Args:
            spark: SparkSession used for the item-similarity SQL query.
            user_col_name: Name of the user column.
            item_col_name: Name of the raw item column.
            rating_col_name: Name of the rating column.
            rank: ALS rank.
            maxIter: ALS maximum number of iterations.
            regParam: ALS regularisation parameter.
        """
        self.user_col_name = user_col_name
        self.item_col_name = item_col_name
        # Indexed item column produced by the StringIndexer.
        self.item_col_name_index = "INDEX_" + item_col_name
        self.rating_col_name = rating_col_name
        self.als = ALS(rank=rank,
                       maxIter=maxIter,
                       regParam=regParam,
                       userCol=user_col_name,
                       itemCol=self.item_col_name_index,
                       ratingCol=rating_col_name,
                       coldStartStrategy="drop",
                       nonnegative=True)
        self.item_indexer = StringIndexer().setInputCol(
            self.item_col_name).setOutputCol(self.item_col_name_index)
        self.item_index_df = None
        self.indexer_model = None
        self.model = None
        self.item_similarity = None
        self.spark = spark

    def _normalize_ratings(self, df):
        """Return ``df`` with every rating greater than 10 replaced by 0.0."""
        # NOTE(review): presumably ratings are on a 0-10 scale and larger
        # values are sentinels -- confirm against the data source.
        normalize_rating_udf = udf(lambda p: 0.0 if p > 10 else p,
                                   DoubleType())
        return df.withColumn(self.rating_col_name,
                             normalize_rating_udf(df[self.rating_col_name]))

    # fit all the course index
    def fit_item_index(self, item_df):
        """Fit the item indexer on ``item_df`` and cache the distinct item index."""
        self.indexer_model = self.item_indexer.fit(item_df)
        self.item_index_df = self.indexer_model.transform(
            item_df.select(self.item_col_name).distinct())

    # fit training data (call this after fit_item_index)
    def fit(self, training_df):
        """Train ALS on ``training_df`` and build the item-item similarity table.

        Sets ``self.model`` and ``self.item_similarity`` (columns ``id1``,
        ``id2``, ``features1``, ``features2``, ``similarity``).
        """
        encoded_df = self._normalize_ratings(
            self.indexer_model.transform(training_df))
        self.model = self.als.fit(encoded_df)
        self.model.itemFactors.createOrReplaceTempView("ItemFactor")

        # Cosine similarity between two factor vectors; 0.0 when either
        # vector has zero norm.
        def cosine_similarity(item1, item2):
            norm_product = np.linalg.norm(item1) * np.linalg.norm(item2)
            if norm_product == 0:
                return 0.0
            return float(np.dot(item1, item2) / norm_product)

        cosine_similarity_udf = udf(cosine_similarity, DoubleType())
        # Every ordered pair of distinct items with both factor vectors.
        item_similarity = self.spark.sql(
            "SELECT I1.id as id1, I2.id as id2, I1.features as features1, I2.features "
            "as features2 FROM ItemFactor I1, ItemFactor I2 WHERE I1.id != I2.id"
        )
        self.item_similarity = item_similarity.withColumn(
            "similarity",
            cosine_similarity_udf(item_similarity["features1"],
                                  item_similarity["features2"]))
        # The two feature columns and the "ItemFactor" temp view are no
        # longer needed after this point and could be dropped.

    def predict_using_cosine_similarity(self,
                                        input_df,
                                        predict_course_df=None):
        """Predict ratings as a similarity-weighted average of known ratings.

        Args:
            input_df: Items already rated by the user(s), with ratings.
            predict_course_df: Optional (user, item) pairs to predict. When
                None, every indexed item the user has not rated is predicted.

        Returns:
            DataFrame with one row per (user, indexed item) holding
            ``list_score``, ``list_similarity`` and ``prediction``; joined
            back onto ``predict_course_df`` when that frame carries ratings.
        """
        encoded_df = self._normalize_ratings(
            self.indexer_model.transform(input_df))
        pair_cols = [self.user_col_name, self.item_col_name_index]

        if predict_course_df is None:
            # Candidates are all (user, item) pairs minus the rated ones.
            # A `!=` join would keep both sides' index column (ambiguous when
            # selected by name) and would not exclude rated items.
            rated_pairs = encoded_df.select(*pair_cols)
            all_pairs = rated_pairs.select(
                self.user_col_name).distinct().crossJoin(
                    self.item_index_df.select(self.item_col_name_index))
            predict_course_df_predict = all_pairs.join(
                rated_pairs, pair_cols, "left_anti")
        else:
            predict_course_df = self.indexer_model.transform(predict_course_df)
            predict_course_df_predict = predict_course_df.drop(
                self.rating_col_name)

        # For each rated item (id1): its similarity to every other item (id2).
        similarity_score_df = encoded_df.join(
            self.item_similarity,
            encoded_df[self.item_col_name_index] == self.item_similarity['id1']) \
            .select(self.user_col_name, self.rating_col_name,
                    'id1', 'id2', 'similarity')

        # Similarity-weighted average of scores; 0.0 when weights sum to zero.
        def predict(list_score, list_similarity):
            sum_simi = sum(list_similarity)
            if sum_simi == 0:
                return 0.0
            weighted = sum(score * simi
                           for score, simi in zip(list_score, list_similarity))
            return weighted / sum_simi

        predict_udf = udf(predict, DoubleType())
        window = Window.partitionBy([
            spark_func.col(self.user_col_name),
            spark_func.col(self.item_col_name_index)
        ]).orderBy(spark_func.col('similarity').desc())

        # Keep the most similar rated items per (user, target item).
        # NOTE(review): the original notes mention the "first 5" relevant
        # courses while the cut-off used is rank <= 7 -- confirm which is meant.
        predict_course_df_predict = predict_course_df_predict.join(
            similarity_score_df.withColumnRenamed("id2",
                                                  self.item_col_name_index),
            [self.item_col_name_index, self.user_col_name]) \
            .select("*", spark_func.rank().over(window).alias("rank")) \
            .filter(spark_func.col("rank") <= 7) \
            .groupby(self.user_col_name, self.item_col_name_index) \
            .agg(spark_func.collect_list(self.rating_col_name).alias("list_score"),
                 spark_func.collect_list("similarity").alias("list_similarity"))
        predict_course_df_predict = predict_course_df_predict.withColumn(
            "prediction",
            predict_udf(spark_func.col("list_score"),
                        spark_func.col("list_similarity")))

        # Re-attach the caller's columns (including true ratings) when given.
        if predict_course_df is not None and self.rating_col_name in predict_course_df.columns:
            predict_course_df_predict = predict_course_df_predict.join(
                predict_course_df, pair_cols)
        return predict_course_df_predict

    def transform(self, df):
        """Index and normalise ``df``, then score it with the fitted ALS model."""
        encoded_df = self._normalize_ratings(self.indexer_model.transform(df))
        return self.model.transform(encoded_df)
def train_als(data, input_user, input_video, input_rating):
    """Train an explicit-feedback ALS model with 3-fold cross validation.

    Args:
        data: DataFrame used for training.
        input_user: Name of the user column.
        input_video: Name of the video column.
        input_rating: Name of the rating column.

    Returns:
        best_model: Best ALS model found by the cross validator.
        model1: Fitted StringIndexer model for the user column.
        model2: Fitted StringIndexer model for the video column.
    """
    print(proc_date)

    user_index_col = input_user + "_index"
    video_index_col = input_video + "_index"

    # Index users first, then videos on the user-indexed frame.
    user_indexer = StringIndexer(inputCol=input_user, outputCol=user_index_col)
    model1 = user_indexer.fit(data)
    index1_data = model1.transform(data)
    video_indexer = StringIndexer(inputCol=input_video,
                                  outputCol=video_index_col)
    model2 = video_indexer.fit(index1_data)
    index2_data = model2.transform(index1_data)
    newdata = index2_data.select(
        col(user_index_col).cast(IntegerType()),
        col(video_index_col).cast(IntegerType()), input_rating)

    # Split data
    train_data, test_data = split_data(newdata)

    # "drop" rather than "nan": a validation fold can contain users or videos
    # missing from the training folds, and NaN predictions would make every
    # fold RMSE NaN, so no parameter set could be ranked (SPARK-14489).
    als = ALS(userCol=user_index_col,
              itemCol=video_index_col,
              ratingCol=input_rating,
              coldStartStrategy="drop",
              implicitPrefs=False)

    # Cross validation over maxIter x regParam x rank (8 combinations).
    paramGrid = ParamGridBuilder() \
        .addGrid(als.maxIter, [5, 10]) \
        .addGrid(als.regParam, [0.01, 0.1]) \
        .addGrid(als.rank, [10, 20]) \
        .build()
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol=input_rating,
                                    predictionCol="prediction")
    crossval = CrossValidator(estimator=als,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=3)
    model = crossval.fit(train_data)
    best_model = model.bestModel

    # RMSE on the held-out split; na.drop() is kept as a safeguard against
    # null rows.
    predictions = best_model.transform(test_data).na.drop()
    rmse = evaluator.evaluate(predictions)
    print("RMSE: ", rmse)
    # The winning hyper-parameters are read from the parent estimator.
    best_estimator = best_model._java_obj.parent()
    print("MAXIter: ", best_estimator.getMaxIter())
    print("RegParam: ", best_estimator.getRegParam())
    print("Rank: ", best_estimator.getRank())
    return best_model, model1, model2
# Final layers, compilation and training of the keras model built above.
model_activation.add(keras.layers.Dense(units=1))
model_activation.add(keras.layers.Dropout(rate=0.5))
model_activation.compile(loss='mean_squared_error',
                         optimizer='sgd',
                         metrics=['Precision'])
model_activation.fit(X_train, y_train, epochs=10, batch_size=1, verbose=0)

# RECOMMENDERS
'''Spark ALS Collaborative Filtering'''
spark = SparkSession.builder.getOrCreate()
# NOTE: the column names are still empty placeholders and must be filled in.
als_model = ALS(itemCol='',
                userCol='',
                ratingCol='',
                nonnegative=True,
                maxIter=20,
                regParam=0.05,
                rank=20)

# fit
sdf = spark.createDataFrame(  # DF
)
recommender = als_model.fit(sdf)

# predict
prediction = recommender.transform(sdf)  # .toPandas().prediction

# evaluate
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")
# Evaluate the frame produced above; this snippet binds it as `prediction`
# and never defines `predictions`.
als_rmse = evaluator.evaluate(prediction)
# https://jaceklaskowski.gitbooks.io/mastering-apache-spark/spark-mllib/spark-mllib-RegressionEvaluator.html
import findspark

# Raw string: "\S" is not a valid escape sequence in a normal string literal.
# findspark.init runs before the pyspark imports below.
findspark.init(r"D:\Spark")

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Ilk Ornek').getOrCreate()

# Load the ratings and print a preview plus summary statistics.
lnes = spark.read.csv('ratings.csv', inferSchema=True, header=True)
lnes.show()
lnes.describe().show()

training, test = lnes.randomSplit([0.7, 0.3])

# coldStartStrategy='drop': users or movies that appear only in the test
# split would otherwise get NaN predictions and turn the RMSE into NaN.
als = ALS(maxIter=5,
          regParam=0.01,
          userCol='userId',
          itemCol='movieId',
          ratingCol='rating',
          coldStartStrategy='drop')
model = als.fit(training)

predictions = model.transform(test)
predictions.show()

# Predictions for user 12 on that user's test movies, best first.
single_user = test.filter(test['userId'] == 12).select(['movieId', 'userId'])
single_user.show()
rec = model.transform(single_user)
rec.orderBy('prediction', ascending=False).show()

evalate = RegressionEvaluator(metricName="rmse",
                              labelCol="rating",
                              predictionCol="prediction")
rmse = evalate.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))