# Train an ALS recommender on the MovieLens 100k ratings file, report the RMSE
# on a held-out split, and produce top-15 recommendations.
# Only the first three whitespace-separated fields of each line are used:
# user id, movie id and rating.
data = sc.textFile("file:///SparkCourse/ml-100k/u.data")

# The second map() must receive ONE function that returns a tuple per record;
# the three conversions are therefore wrapped in a single tuple expression.
rdd = data.map(lambda x: x.split()).map(
    lambda x: (int(x[0]), int(x[1]), float(x[2])))
# Column names must match userCol/itemCol/ratingCol passed to ALS and the
# labelCol passed to the evaluator below.
features = ['user_id', 'movie_id', 'rating']
df = rdd.toDF(features)

# 70% of the rows for training, 30% for evaluation.
(train, test) = df.randomSplit([0.7, 0.3])

als = ALS(maxIter=5,
          regParam=0.01,
          userCol="user_id",
          itemCol="movie_id",
          ratingCol="rating",
          coldStartStrategy="drop")

model = als.fit(train)

# Adds a "prediction" column to the test rows.
prediction = model.transform(test)

value = RegressionEvaluator(metricName="rmse",
                            labelCol="rating",
                            predictionCol="prediction")
rmse = value.evaluate(prediction)
print("Root-mean-square error = " + str(rmse))

# Top 15 movie recommendations for every user.
userRecs = model.recommendForAllUsers(15)

# Top 15 user recommendations for every movie.
movieRecs = model.recommendForAllItems(15)
# Example n. 2
# 0
 def __train_model(self):
     """Train the ALS model on ``self.column_trained`` and keep the result.

     The fitted model is stored on ``self.model`` and also returned, so the
     training work is not lost when the method exits.
     NOTE(review): ``self.model`` is the attribute name chosen here; confirm
     it matches what the rest of the class reads.
     """
     logger.info("Training the ALS model...")
     als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="item_id",
               ratingCol="rating", coldStartStrategy="drop")
     # Keep a reference on the instance; a bare local would be discarded.
     self.model = als.fit(self.column_trained)
     logger.info("ALS model built!")
     return self.model
# Example n. 3
# 0
# Invert user_index so that each value maps back to its key.
# .items() is used because .iteritems() does not exist on Python 3 dicts and
# was removed from pandas objects in pandas 2.0.
users = {y: x for x, y in user_index.items()}

# Load the prepared ratings and drop the leftover index column written by a
# previous to_csv() call.
als_data = pd.read_csv(
    '/Users/ericyatskowitz/galvanize_work/MeepleFinder/als_ready_wa_ratings_data.csv'
)
als_data.drop('Unnamed: 0', axis=1, inplace=True)

# Fit ALS on the full ratings table.
als_spark_df = spark.createDataFrame(als_data)
als_spark_df.cache()
als_model = ALS(itemCol='board_game',
                userCol='user',
                ratingCol='rating',
                nonnegative=True,
                regParam=0.1,
                rank=100,
                maxIter=10)
als_fit_model = als_model.fit(als_spark_df)

just_ranking_info = pd.read_csv(
    '/Users/ericyatskowitz/galvanize_work/MeepleFinder/data/just_ranking_info.csv'
)
just_ranking_info = just_ranking_info.set_index('Title')

# Build every (user, index entry) combination to score; the index of
# just_ranking_info is its 'Title' column.
# NOTE(review): ALS expects numeric item ids -- confirm the index values are
# compatible with the 'board_game' column used for training.
predictions_array = list(
    product(als_data.loc[:, 'user'].unique(), just_ranking_info.index))
predictions_df = pd.DataFrame(predictions_array,
                              columns=['user', 'board_game'])
spark_pre_predictions_df = spark.createDataFrame(predictions_df)
spark_predictions_df = als_fit_model.transform(spark_pre_predictions_df)

# Bring the scored pairs back to pandas, replace missing values (for example
# NaN predictions) with 0, and write the result to disk.
pred_ratings_df = spark_predictions_df.toPandas()
pred_ratings_df.fillna(0, inplace=True)
pred_ratings_df.to_csv('pred_ratings_df.csv')
# Example n. 4
# 0
# Parse the CSV input: drop the header row, then split every remaining line
# on commas.
header = tvViewingData.first()
lines = tvViewingData.filter(lambda row: row != header).map(
    lambda x: x.split(','))

# Occurrences of each value in column 1 (used below as the show id).
# Computed straight from `lines` because the name `showUser` is not bound
# until the DataFrame is created further down.
showUserCount = lines.map(lambda p: int(p[1])).countByValue()

# Column 1 becomes the show id, column 2 the user id.
showUserRDD = lines.map(lambda p: Row(show=int(p[1]), user=int(p[2])))
# Field access by name, so the counts do not depend on Row field ordering.
showCount = showUserRDD.map(lambda p: p.show).countByValue()
userCount = showUserRDD.map(lambda p: p.user).countByValue()

showUser = spark.createDataFrame(showUserRDD)
# df = spark.createDataFrame([(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)],["user", "item", "rating"])

(training, test) = showUser.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data.
# The data has no rating column, so implicit feedback is used and ratingCol
# is left as the empty string.
als = ALS(maxIter=5,
          regParam=0.01,
          implicitPrefs=True,
          userCol="user",
          itemCol="show",
          ratingCol="")
model = als.fit(training)
# Collect the scored test rows to the driver, ordered by their first column.
predictions = sorted(model.transform(test).collect(), key=lambda r: r[0])

# Save and load model.
# `model` comes from the DataFrame-based ALS estimator, so it is persisted
# with save(path) and restored with ALSModel.load(path); the RDD-based
# MatrixFactorizationModel API (which takes `sc`) does not apply to it.
from pyspark.ml.recommendation import ALSModel

model.save("target/tmp/myCollaborativeFilter")
sameModel = ALSModel.load("target/tmp/myCollaborativeFilter")
# $example off$