# Getting the rating of the movie with user id and movie id.
# Only the first three whitespace-separated fields of each record are used.
data = sc.textFile("file:///SparkCourse/ml-100k/u.data")
# The lambda must return ONE tuple per record. Without the parentheses it
# returned only int(x[0]) and handed the other two values to map() as extra
# positional arguments.
rdd = data.map(lambda x: x.split()).map(
    lambda x: (int(x[0]), int(x[1]), float(x[2])))
# Column names must match userCol / itemCol / ratingCol (and the evaluator's
# labelCol) below, hence 'rating' rather than 'ratings'.
features = ['user_id', 'movie_id', 'rating']
df = rdd.toDF(features)
(train, test) = df.randomSplit([0.7, 0.3])
# coldStartStrategy="drop" is passed so that rows the model cannot score do
# not end up in the evaluated predictions.
als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="movie_id",
          ratingCol="rating", coldStartStrategy="drop")
model = als.fit(train)
prediction = model.transform(test)
value = RegressionEvaluator(metricName="rmse", labelCol="rating",
                            predictionCol="prediction")
rmse = value.evaluate(prediction)
print("Root-mean-square error = " + str(rmse))
# Top-15 recommendations for every user and for every movie.
userRecs = model.recommendForAllUsers(15)
movieRecs = model.recommendForAllItems(15)
def __train_model(self):
    """Train the ALS model with the current dataset.

    Fits an ALS estimator on ``self.column_trained`` using the ``user_id``,
    ``item_id`` and ``rating`` columns, keeps the fitted model on
    ``self.model`` and returns it.

    Returns:
        The fitted model produced by ``ALS.fit``.
    """
    logger.info("Training the ALS model...")
    als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="item_id",
              ratingCol="rating", coldStartStrategy="drop")
    # The fitted model used to be bound to a local and dropped on return, so
    # training had no observable result. Keep it on the instance instead.
    # NOTE(review): `self.model` is an assumed attribute name -- confirm it
    # matches what the rest of the class reads.
    self.model = als.fit(self.column_trained)
    logger.info("ALS model built!")
    return self.model
# Invert the user_index mapping (value -> key).
# `.items()` replaces `.iteritems()`, which only exists on Python 2 dicts and
# on pandas Series before 2.0; `.items()` works on all of them.
users = {y: x for x, y in user_index.items()}

# Load the ratings table and drop the 'Unnamed: 0' column
# (presumably an index column written by an earlier to_csv -- confirm).
als_data = pd.read_csv(
    '/Users/ericyatskowitz/galvanize_work/MeepleFinder/als_ready_wa_ratings_data.csv'
)
als_data.drop('Unnamed: 0', axis=1, inplace=True)

# Fit ALS on the full ratings table.
als_spark_df = spark.createDataFrame(als_data)
als_spark_df.cache()
als_model = ALS(itemCol='board_game', userCol='user', ratingCol='rating',
                nonnegative=True, regParam=0.1, rank=100, maxIter=10)
als_fit_model = als_model.fit(als_spark_df)

# Build every (user, board_game) pair: each distinct user in the ratings data
# crossed with each index value ('Title') of the ranking table.
just_ranking_info = pd.read_csv(
    '/Users/ericyatskowitz/galvanize_work/MeepleFinder/data/just_ranking_info.csv'
)
just_ranking_info = just_ranking_info.set_index('Title')
predictions_array = list(
    product(als_data.loc[:, 'user'].unique(), just_ranking_info.index))
predictions_df = pd.DataFrame(predictions_array,
                              columns=['user', 'board_game'])

# Score every pair with the fitted model and bring the result back to pandas.
spark_pre_predictions_df = spark.createDataFrame(predictions_df)
spark_predictions_df = als_fit_model.transform(spark_pre_predictions_df)
pred_ratings_df = spark_predictions_df.toPandas()
# Missing values (e.g. pairs the model could not score) become 0.
pred_ratings_df.fillna(0, inplace=True)
pred_ratings_df.to_csv('pred_ratings_df.csv')
# Strip the header row and split each remaining CSV record into fields.
header = tvViewingData.first()
lines = tvViewingData.filter(lambda row: row != header).map(
    lambda x: x.split(','))

# Occurrences per show id (field 1). This used to read `showUser` before it
# was assigned (its RDD definition is commented out), which raised NameError;
# it is now computed straight from `lines` with the same field.
showUserCount = lines.map(lambda p: int(p[1])).countByValue()

# One Row per record: field 1 is the show id, field 2 the user id.
showUserRDD = lines.map(lambda p: Row(show=int(p[1]), user=int(p[2])))
# Access Row fields by name so the counts do not depend on field ordering.
showCount = showUserRDD.map(lambda p: p.show).countByValue()
userCount = showUserRDD.map(lambda p: p.user).countByValue()
showUser = spark.createDataFrame(showUserRDD)

# df = spark.createDataFrame([(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)],["user", "item", "rating"])
(training, test) = showUser.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data.
# NOTE(review): ratingCol="" with implicitPrefs=True -- presumably Spark then
# treats every (user, show) row as a rating of 1.0; confirm against the ALS
# documentation.
als = ALS(maxIter=5, regParam=0.01, implicitPrefs=True, userCol="user",
          itemCol="show", ratingCol="")
model = als.fit(training)
predictions = sorted(model.transform(test).collect(), key=lambda r: r[0])

# Save and load model.
# `model` comes from the DataFrame-based ALS, so it is persisted with
# save(path) and reloaded with ALSModel.load(path). The previous
# save(sc, path) / MatrixFactorizationModel.load(sc, path) calls belong to
# the RDD-based pyspark.mllib API and do not work with this model.
from pyspark.ml.recommendation import ALSModel
model.save("target/tmp/myCollaborativeFilter")
sameModel = ALSModel.load("target/tmp/myCollaborativeFilter")
# $example off$