Example #1
 def train_model(self):
     # The MovieLens dataset has explicit ratings, so an explicit-feedback ALS could be trained here,
     # but for some reason (see: https://stackoverflow.com/questions/26213573/apache-spark-als-collaborative-filtering-results-they-dont-make-sense)
     # that algorithm doesn't give reasonable recommendations, so implicit training is used instead.
     model = ALS().trainImplicit(self.data.rdd.persist(), rank=self.best_rank, iterations=self.best_iteration,
                                 lambda_=self.regularization_parameter)
     self.model = model
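For contrast with the comment above, here is a minimal sketch (not from the original class) of explicit versus implicit training with the same pyspark.mllib API; ratings_rdd and the hyper-parameter values are placeholders standing in for the attributes used above.

# Hedged sketch: explicit vs. implicit MLlib ALS.
# ratings_rdd is assumed to be an RDD of Rating(user, product, rating) rows, e.g. self.data.rdd here.
from pyspark.mllib.recommendation import ALS

explicit_model = ALS.train(ratings_rdd, rank=24, iterations=10, lambda_=0.1)
implicit_model = ALS.trainImplicit(ratings_rdd, rank=24, iterations=10, lambda_=0.1, alpha=0.01)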
Example #2
 def __init__(self, spark_session, sqlContext, jdbcUrl, table_name):
     self.spark_session = spark_session
     self.jdbcUrl = jdbcUrl
     self.sqlContext = sqlContext
     self.table_name = table_name
     self.data = self._process_data()
     self.columns = self.data.columns
     self.NEW_USERID = 610  # last user's id in ratings dataframe.
     self.rated_movies = dict()  # keys: ids of new users, values: corresponding movie ids. ==> {user_id: movie_ids}
     self.best_rank = 24
     self.best_iteration = 10
     self.regularization_parameter = 0.1
     self.model = ALS()  # ALS Matrix factorization model.
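The _process_data helper referenced above is not part of this snippet; here is a minimal sketch of what it might do, assuming the ratings table can be read straight over JDBC with the session stored on the instance (the selected column names are guesses).

 def _process_data(self):
     # Hypothetical sketch: load the ratings table over JDBC and keep only the columns ALS needs.
     df = self.spark_session.read.jdbc(url=self.jdbcUrl, table=self.table_name)
     return df.select("user_id", "movie_id", "rating").dropna()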
    "MovieRecommendationSystemWithSpark")
sc = SparkContext(conf=conf)

# load the raw MovieLens ratings: each line of u.data holds user id, movie id, rating, timestamp
data = sc.textFile("file:///SparkCourse/ml-100k/u.data")

rdd = data.map(lambda x: x.split()).map(lambda x: (int(x[0]), int(x[1]),
                                                   float(x[2])))
features = ['user_id', 'movie_id', 'rating']
df = rdd.toDF(features)

(train, test) = df.randomSplit([0.7, 0.3])

als = ALS(maxIter=5,
          regParam=0.01,
          userCol="user_id",
          itemCol="movie_id",
          ratingCol="rating",
          coldStartStrategy="drop")

model = als.fit(train)

prediction = model.transform(test)

evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(prediction)
print("Root-mean-square error = " + str(rmse))

userRecs = model.recommendForAllUsers(15)
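recommendForAllUsers returns one row per user with an array of (movie_id, rating) structs; a hedged sketch for flattening that into one row per recommendation (the output field names follow the userCol/itemCol settings above):

# Flatten the recommendations array into (user, movie, score) rows.
from pyspark.sql.functions import col, explode

flat_recs = userRecs.select("user_id", explode("recommendations").alias("rec")) \
                    .select("user_id",
                            col("rec.movie_id").alias("movie_id"),
                            col("rec.rating").alias("score"))
flat_recs.show(5)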
Example #4
+-------+-------+------+
|   user|product|rating|
+-------+-------+------+
|1229304|   1660|   1.0|
| 672853|    342|   1.0|
|1056465|    921|   1.0|
+-------+-------+------+

# Split the input data into training and test datasets

(training, test) = df.randomSplit([0.8,0.2])



# Build an ALS model configured for implicit feedback

from pyspark.ml.recommendation import ALS

als = ALS(implicitPrefs=True, userCol="user", itemCol="product", ratingCol="rating", coldStartStrategy="drop")

als
ALS_4fdeaff285b75ff8d702

als.explainParams()
"alpha: alpha for implicit preference (default: 1.0)" \
\ncheckpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1).
E.g. 10 means that the cache will get checkpointed every 10 iterations.
Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
\ncoldStartStrategy: strategy for dealing with unknown or new users/items at prediction time.
This may be useful in cross-validation or production scenarios,
for handling user/item ids the model has not seen in the training data.
Supported values: 'nan', 'drop'. (default: nan, current: drop)\nfinalStorageLevel:
StorageLevel for ALS model factors. (default: MEMORY_AND_DISK)\nimplicitPrefs:
whether to use implicit preference (default: False, current: True)\nintermediateStorageLevel:
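A hedged usage sketch tying some of the parameters listed above back to this example; the alpha value is purely illustrative and not taken from the original notebook.

# Assumed follow-up: raise the implicit-feedback confidence weight and fit on the split from above.
als = ALS(implicitPrefs=True, alpha=10.0,
          userCol="user", itemCol="product", ratingCol="rating",
          coldStartStrategy="drop")  # "drop" discards unseen users/items at prediction time
model = als.fit(training)
predictions = model.transform(test)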
Example #5
    def hotmodel(self, sc, sets, movieRDD):
        '''
        training a super hot model
        '''
        als = ALS(coldStartStrategy="drop")
        param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [6, 8]) \
            .addGrid(als.maxIter, [8, 10, 12]) \
            .build()

        evaluator = RegressionEvaluator(
            metricName="mse",
            labelCol="rating",
            predictionCol="prediction")

        tvs = TrainValidationSplit(
            estimator=als,
            estimatorParamMaps=param_grid,
            evaluator=evaluator,
        )

        model = tvs.fit(sets['training']) ## should we save the model?
        best_rank = model.bestModel.rank
        best_iterations = model.bestModel._java_obj.parent().getMaxIter()
        print('hotmodel part 1')

        prediction = model.transform(sets['test'])
        prediction.alias('p')\
            .join(movieRDD.alias('m'), col('p.item') == col('m.item'))\
            .select([col('p.user'), col('m.title'), col('p.prediction'), col('p.rating')])\
            .show()

        mse = evaluator.evaluate(prediction)
        print("MSE = {}".format(mse))

        '''
        hot model's tinder date
        '''
        rating59169 = [
                (118661, 9), # Avengers
                (371746, 9),  # Iron Man 2008
                (94625, 9),  # Akira
                (1563738, 2), # One day 2011
                (800369, 8),  # Thor
                (1981115, 9), # Thor: The Dark World
                (3501632, 9), # Thor: Ragnarok
                (120338, 3), # Titanic
                (98635, 2), # When Harry Met Sally
                (125439, 3), # Notting Hill
                (332280, 1) # The Notebook
            ]
        
        user59169 = ratingRDD.groupBy().max('user').first()['max(user)'] + 1
        user59169DF = spark.createDataFrame(
            [Row(user=user59169, item=r[0], rating=r[1]) for r in rating59169])
        user59169DF = user59169DF.select('user', 'item', 'rating')
        # user59169DF = sc.parallelize(user59169DF)
        # ratingRDD2 is assumed to be the original ratings unioned with the new user's ratings
        new_model = ALS(rank=best_rank, maxIter=best_iterations, coldStartStrategy="drop")\
            .fit(ratingRDD2)

        unseen_movies = movieRDD.alias('m')\
            .join(user59169DF.alias('r'), col('m.item') == col('r.item'), how='left_anti')\
                .select('item')
        unseen_movies_user = unseen_movies.withColumn("user", lit(user59169))

        print('hot model part 2')

        spark.conf.set("spark.sql.crossJoin.enabled", "true")
        unseen_ratings = new_model.transform(unseen_movies_user)

        unseen_ratings_titles = unseen_ratings.alias('r')\
                        .join(movieRDD.alias('m'), col('r.item') == col('m.item'))\
                        .select(['user', 'title', 'prediction'])

        # movies with fewer than 500 ratings; these get excluded from the final list below
        ratings_per_movie = ratingRDD.groupBy('item').count()
        rarely_rated = ratings_per_movie.filter(col('count') < 500)
        rarely_rated.show()

        
        top_recommendations = unseen_ratings.alias('r')\
            .join(rarely_rated.alias('e'), col('r.item') == col('e.item'), how='left_anti')\
            .select(['item', 'user', 'prediction']).orderBy(col('prediction').desc())

        top_recommendations.alias('t').join(movieRDD.alias('m'), col('t.item') == col('m.item'))\
            .select(['user', 'title', 'prediction'])\
            .orderBy(col('prediction').desc()).show(10, truncate=False)
    
    # spark.stop()
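A hedged aside: on Spark 2.3+ the manual scoring of every unseen movie above could be replaced by asking the fitted model for a top-N list for that single user. The sketch below reuses the variable names from hotmodel and is not part of the original code.

# Alternative sketch (assumes Spark >= 2.3): top-10 recommendations for user59169 only.
single_user = user59169DF.select('user').distinct()
top10 = new_model.recommendForUserSubset(single_user, 10)
top10.show(truncate=False)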
Example #6
# invert the index dicts so integer ids can be mapped back to names
board_games = dict((y, x) for x, y in board_game_index.items())
user_index = np.load(
    '/Users/ericyatskowitz/galvanize_work/MeepleFinder/Erics_Web_App/wa_user_dict.npy'
).item()
users = dict((y, x) for x, y in user_index.items())
als_data = pd.read_csv(
    '/Users/ericyatskowitz/galvanize_work/MeepleFinder/als_ready_wa_ratings_data.csv'
)
als_data.drop('Unnamed: 0', axis=1, inplace=True)

als_spark_df = spark.createDataFrame(als_data)
als_spark_df.cache()
als_model = ALS(itemCol='board_game',
                userCol='user',
                ratingCol='rating',
                nonnegative=True,
                regParam=0.1,
                rank=100,
                maxIter=10)
als_fit_model = als_model.fit(als_spark_df)

just_ranking_info = pd.read_csv(
    '/Users/ericyatskowitz/galvanize_work/MeepleFinder/data/just_ranking_info.csv'
)
just_ranking_info = just_ranking_info.set_index('Title')
predictions_array = list(
    product(als_data.loc[:, 'user'].unique(), just_ranking_info.index))
predictions_df = pd.DataFrame(predictions_array,
                              columns=['user', 'board_game'])
spark_pre_predictions_df = spark.createDataFrame(predictions_df)
spark_predictions_df = als_fit_model.transform(spark_pre_predictions_df)
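A hedged follow-up sketch for pulling each user's highest-scoring predictions out of spark_predictions_df; the window ranking and the cut-off of 10 are illustrative choices, not from the original script.

# Rank predictions per user and keep each user's 10 top-scoring board games.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

w = Window.partitionBy('user').orderBy(F.col('prediction').desc())
top_predictions = (spark_predictions_df
                   .withColumn('rank', F.row_number().over(w))
                   .filter(F.col('rank') <= 10))
top_predictions.show(20)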
Example #7
 def __train_model(self):
     """Train the ALS model with the current dataset."""
     logger.info("Training the ALS model...")
     als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="item_id",
               ratingCol="rating", coldStartStrategy="drop")
     model = als.fit(self.column_trained)
     self.model = model
     logger.info("ALS model built!")
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
print(predictions.take(5))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
print(ratesAndPreds.take(5))

MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
from math import sqrt
rmse = sqrt(MSE)
print("Root-mean-square error = " + str(rmse))

# understanding the DataFrame-based (pyspark.ml) workflow
(training, test) = df.randomSplit([0.8, 0.2])
print(training.count(),test.count())
from pyspark.ml.recommendation import ALS
als = ALS(userCol="UserID", itemCol="product", ratingCol="Rating")
# create a Pipeline and set the ALS estimator as its only stage
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[als])
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
paramMapExplicit = ParamGridBuilder() \
                    .addGrid(als.rank, [8, 12]) \
                    .addGrid(als.maxIter, [10, 15]) \
                    .addGrid(als.regParam, [1.0, 10.0]) \
                    .build()
from pyspark.ml.evaluation import RegressionEvaluator
# RegressionEvaluator configured for RMSE against the Rating column
evaluatorR = RegressionEvaluator(metricName="rmse", labelCol="Rating")
cvExplicit = CrossValidator(estimator=pipeline, estimatorParamMaps=paramMapExplicit, evaluator=evaluatorR, numFolds=2)
cvModel = cvExplicit.fit(training)
preds = cvModel.transform(test)
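A hedged next step that is not in the original snippet: evaluate the held-out predictions and inspect the winning hyper-parameters. Because coldStartStrategy is left at its default here, NaN predictions for unseen users/items are dropped before computing RMSE.

# Drop NaN predictions (default coldStartStrategy is 'nan'), then compute RMSE.
preds_clean = preds.na.drop(subset=["prediction"])
rmse = evaluatorR.evaluate(preds_clean)
print("RMSE on the test split = {}".format(rmse))

# The best model is a PipelineModel; its last stage is the fitted ALSModel.
best_als_model = cvModel.bestModel.stages[-1]
print("best rank = {}".format(best_als_model.rank))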
Example #9
# model
als = ALS(userCol="userid", itemCol="item", ratingCol="rating",
          coldStartStrategy="drop", nonnegative=False)
     
# evaluator
rmseevaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

# parameter grid
paramGrid = ParamGridBuilder() \
    .addGrid(als.rank, [1, 5, 10, 50, 70]) \
    .addGrid(als.maxIter, [15]) \
    .addGrid(als.regParam, [0.05, 0.1, 0.5, 5]) \
    .build()

# train validation split
tvs = TrainValidationSplit(estimator=als,
                           estimatorParamMaps=paramGrid,
                           evaluator=rmseevaluator,
                           trainRatio=0.8)
# fit model and time

tvsmodel = tvs.fit(data_train)
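A hedged follow-up sketch: read the chosen hyper-parameters and the per-grid-point validation scores out of the fitted TrainValidationSplitModel; data_test is an assumed held-out DataFrame with the same columns as data_train.

# Inspect the chosen model and the validation metric for every grid point.
best_als_model = tvsmodel.bestModel  # fitted ALSModel
print("best rank:", best_als_model.rank)
for params, metric in zip(paramGrid, tvsmodel.validationMetrics):
    print({p.name: v for p, v in params.items()}, "-> rmse", metric)

# Score an assumed held-out set with the best model.
print("test rmse:", rmseevaluator.evaluate(best_als_model.transform(data_test)))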
Example #10
header = tvViewingData.first()
lines = tvViewingData.filter(lambda row: row != header).map(
    lambda x: x.split(','))
showUser = lines.map(lambda p: (p[0], int(p[1]), int(p[2])))
showUserCount = showUser.map(lambda p: p[1]).countByValue()

showUserRDD = lines.map(lambda p: Row(show=int(p[1]), user=int(p[2])))
showCount = showUserRDD.map(lambda p: p[0]).countByValue()
userCount = showUserRDD.map(lambda p: p[1]).countByValue()

showUser = spark.createDataFrame(showUserRDD)
# df = spark.createDataFrame([(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)],["user", "item", "rating"])

(training, test) = showUser.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data.
# ratingCol is left empty: with implicit preferences and no rating column,
# every observed (user, show) pair is treated as an implicit rating of 1.0.
als = ALS(maxIter=5,
          regParam=0.01,
          implicitPrefs=True,
          userCol="user",
          itemCol="show",
          ratingCol="")
model = als.fit(training)
predictions = sorted(model.transform(test).collect(), key=lambda r: r[0])

# Save and load the fitted model (DataFrame-based API)
from pyspark.ml.recommendation import ALSModel
model.save("target/tmp/myCollaborativeFilter")
sameModel = ALSModel.load("target/tmp/myCollaborativeFilter")
 def __init__(self, spark_session, sqlContext, jdbcUrl, table_name):
     super().__init__(spark_session, sqlContext, jdbcUrl, table_name)
     self.best_rank = 24
     self.best_iteration = 10
     self.regularization_parameter = 0.1
     self.model = ALS()  # ALS Matrix factorization model.