from pyspark.mllib.recommendation import ALS


class Recommender(DataHandler):
    def __init__(self, spark_session, sqlContext, jdbcUrl, table_name):
        super().__init__(spark_session, sqlContext, jdbcUrl, table_name)
        self.best_rank = 24
        self.best_iteration = 10
        self.regularization_parameter = 0.1
        self.model = None  # trained ALS matrix factorization model, set by train_model().

    def train_model(self):
        # The MovieLens dataset contains explicit ratings, so the explicit-feedback ALS trainer would be the
        # natural choice, but it does not produce reasonable recommendations here
        # (see: https://stackoverflow.com/questions/26213573/apache-spark-als-collaborative-filtering-results-they-dont-make-sense),
        # so the implicit-feedback variant is used instead.
        self.model = ALS.trainImplicit(self.data.rdd.persist(),
                                       rank=self.best_rank,
                                       iterations=self.best_iteration,
                                       lambda_=self.regularization_parameter)

    def generate_recommendation(self, existing_user_id=None, external_user_id=None):
        data_rdd = self.data.rdd
        if existing_user_id:
            # Generate recommendations for a particular user that already exists in the ratings dataset.
            # Assumes the base class has populated self.rated_movies for this user id.
            rated_movies_by_user = self.rated_movies[existing_user_id]  # ids of movies already rated by this user.
            unrated_movs = data_rdd.filter(lambda x: x[1] not in rated_movies_by_user) \
                                   .map(lambda s: (existing_user_id, s[1]))  # (user id, movie id) pairs for unrated movies.
            raw_recommendations = self.model.predictAll(unrated_movs)
        else:
            # Generate recommendations for the most recently added user.
            new_id = self.NEW_USERID
            rated_movies_by_user = self.rated_movies[new_id]
            unrated_movs = data_rdd.filter(lambda x: x[1] not in rated_movies_by_user) \
                                   .map(lambda s: (new_id, s[1]))
            if external_user_id:
                # Re-key the predictions with the caller-supplied external user id.
                raw_recommendations = self.model.predictAll(unrated_movs) \
                                                .map(lambda x: (external_user_id, x[1], x[2]))
            else:
                raw_recommendations = self.model.predictAll(unrated_movs)
        return raw_recommendations
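# The DataHandler base class used above is not shown in this section. The sketch below is only an
# assumption about the interface this Recommender relies on, inferred from the standalone version
# that follows; the real base class may name or implement these members differently.
#
# class DataHandler:
#     def __init__(self, spark_session, sqlContext, jdbcUrl, table_name):
#         self.data = self._process_data()    # ratings dataframe loaded over JDBC, cast and repartitioned
#         self.columns = self.data.columns
#         self.NEW_USERID = 610               # id of the last user in the ratings dataframe
#         self.rated_movies = dict()          # {user id: list of movie ids rated by that user}
#
#     def _process_data(self): ...            # load the table, cast column types, repartition and persist
#     def add_new_user(self, movie_ids): ...  # append a new user's ratings to self.data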
from pyspark.mllib.recommendation import ALS
from pyspark.sql.types import IntegerType, FloatType


class Recommender:
    def __init__(self, spark_session, sqlContext, jdbcUrl, table_name):
        self.spark_session = spark_session
        self.jdbcUrl = jdbcUrl
        self.sqlContext = sqlContext
        self.table_name = table_name
        self.data = self._process_data()
        self.columns = self.data.columns
        self.NEW_USERID = 610  # id of the last user in the ratings dataframe.
        self.rated_movies = dict()  # {new user id: list of movie ids rated by that user}
        self.best_rank = 24
        self.best_iteration = 10
        self.regularization_parameter = 0.1
        self.model = None  # trained ALS matrix factorization model, set by train_model().

    def _process_data(self):
        raw_data = self.sqlContext.read.jdbc(url=self.jdbcUrl, table=self.table_name)
        cols = raw_data.columns
        if 'rating' in cols:  # raw_data is the ratings dataset
            data = raw_data.withColumn(cols[0], raw_data[cols[0]].cast(IntegerType()))  # cast userId to integer
            data = data.withColumn(cols[1], data[cols[1]].cast(IntegerType()))          # cast movieId to integer
            data = data.withColumn(cols[2], data[cols[2]].cast(FloatType()))            # cast rating to float
            # Shuffle the rows into 18 partitions so the data is distributed evenly across 3 machines, then persist.
            data = data.repartition(18).persist()
        else:
            data = raw_data.repartition(18).persist()
        return data

    def add_new_user(self, movie_ids):
        """
        movie_ids: list of user-provided movie ids (strings). The new user's id is generated internally.
        """
        self.NEW_USERID += 1
        rated_movies = [(self.NEW_USERID, int(i), 5) for i in movie_ids]  # [(user id, movie id, rating), ...]
        # Convert the "rated_movies" list to an RDD.
        new_user_ratings_rdd = self.spark_session.sparkContext.parallelize(rated_movies)
        # Convert new_user_ratings_rdd to a Spark dataframe.
        new_user_ratings_df = new_user_ratings_rdd.toDF(schema=self.columns)
        # Store the ids as integers so they compare correctly against the integer movieId column.
        self.rated_movies[self.NEW_USERID] = [int(i) for i in movie_ids]
        # Concatenate the existing ratings data with new_user_ratings_df.
        self.data = self.data.union(new_user_ratings_df)

    def train_model(self):
        # The MovieLens dataset contains explicit ratings, so the explicit-feedback ALS trainer would be the
        # natural choice, but it does not produce reasonable recommendations here
        # (see: https://stackoverflow.com/questions/26213573/apache-spark-als-collaborative-filtering-results-they-dont-make-sense),
        # so the implicit-feedback variant is used instead.
        self.model = ALS.trainImplicit(self.data.rdd.persist(),
                                       rank=self.best_rank,
                                       iterations=self.best_iteration,
                                       lambda_=self.regularization_parameter)

    def generate_recommendation(self, existing_user_id=None):
        data_rdd = self.data.rdd
        if existing_user_id:
            # Generate recommendations for a particular user that already exists in the ratings dataset.
            # Collect this user's rated movie ids from the data itself, since self.rated_movies only
            # tracks users added through add_new_user.
            rated_movies_by_user = data_rdd.filter(lambda x: x[0] == existing_user_id) \
                                           .map(lambda x: x[1]).collect()
            unrated_movs = data_rdd.filter(lambda x: x[1] not in rated_movies_by_user) \
                                   .map(lambda s: (existing_user_id, s[1]))  # (user id, movie id) pairs for unrated movies.
            raw_recommendations = self.model.predictAll(unrated_movs)
        else:
            # Generate recommendations for the most recently added user.
            new_id = self.NEW_USERID
            rated_movies_by_user = self.rated_movies[new_id]
            unrated_movs = data_rdd.filter(lambda x: x[1] not in rated_movies_by_user) \
                                   .map(lambda s: (new_id, s[1]))
            raw_recommendations = self.model.predictAll(unrated_movs)
        return raw_recommendations
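# A minimal usage sketch, assuming a local SparkSession and a JDBC-reachable "ratings" table with
# (userId, movieId, rating) columns. The connection URL, table name and movie ids below are
# placeholders for illustration, not values taken from this project.
if __name__ == "__main__":
    from pyspark.sql import SparkSession, SQLContext

    spark = SparkSession.builder.appName("movie-recommender").getOrCreate()
    sqlContext = SQLContext(spark.sparkContext)
    jdbc_url = "jdbc:postgresql://localhost:5432/movielens?user=user&password=password"  # placeholder

    recommender = Recommender(spark, sqlContext, jdbc_url, "ratings")
    recommender.add_new_user(["1", "50", "296"])  # hypothetical movie ids the new user rates 5
    recommender.train_model()                     # train after adding the new user so their ratings are included

    # Top 10 recommendations (highest predicted scores) for the newly added user.
    top_10 = recommender.generate_recommendation().takeOrdered(10, key=lambda r: -r[2])
    for rec in top_10:
        print(rec)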