# Inside the rank-selection loop from part (2b): record the best RMSE and the index of
# the corresponding model, then advance the loop index.
min_error = error
best_rank = err
err += 1

# After the loop: configure ALS with the winning rank and keep the matching model.
als.setRank(ranks[best_rank])
print('The best model was trained with rank %s' % ranks[best_rank])
my_model = models[best_rank]

# COMMAND ----------

# TEST
Test.assertEquals(round(min_error, 2), 0.81,
                  "Unexpected value for best RMSE. Expected rounded value to be 0.81. Got {0}".format(round(min_error, 2)))
Test.assertEquals(ranks[best_rank], 12,
                  "Unexpected value for best rank. Expected 12. Got {0}".format(ranks[best_rank]))
Test.assertEqualsHashed(als.getItemCol(), "18f0e2357f8829fe809b2d95bc1753000dd925a6",
                        "Incorrect choice of {0} for ALS item column.".format(als.getItemCol()))
Test.assertEqualsHashed(als.getUserCol(), "db36668fa9a19fde5c9676518f9e86c17cabf65a",
                        "Incorrect choice of {0} for ALS user column.".format(als.getUserCol()))
Test.assertEqualsHashed(als.getRatingCol(), "3c2d687ef032e625aa4a2b1cfca9751d2080322c",
                        "Incorrect choice of {0} for ALS rating column.".format(als.getRatingCol()))

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2c) Testing Your Model
# MAGIC
# MAGIC So far, we used the `training_df` and `validation_df` datasets to select the best model. Since we used these two datasets to determine which model is best, we cannot use them to test how good the model is; otherwise, we would be very vulnerable to [overfitting](https://en.wikipedia.org/wiki/Overfitting). To decide how good our model is, we need to use the `test_df` dataset. We will use the `best_rank` you determined in part (2b) to create a model for predicting the ratings for the test dataset, and then we will compute the RMSE.
# MAGIC
# MAGIC The steps you should perform are (a minimal sketch appears in the next cell):
# MAGIC * Run a prediction, using `my_model` as created above, on the test dataset (`test_df`), producing a new `predict_df` DataFrame.
# MAGIC * Filter out unwanted NaN values (necessary because of [a bug in Spark](https://issues.apache.org/jira/browse/SPARK-14489)). We've supplied this piece of code for you.
# MAGIC * Use the previously created RMSE evaluator, `reg_eval`, to evaluate the filtered DataFrame.

# COMMAND ----------
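# A minimal sketch of the (2c) steps above, assuming `my_model`, `test_df`, and `reg_eval`
# are defined as in the earlier parts of this lab. `predict_df` follows the naming in the
# instructions; `predicted_test_df` and `test_RMSE` are illustrative names.

# Run the best model on the held-out test set.
predict_df = my_model.transform(test_df)

# Filter out NaN predictions (workaround for SPARK-14489).
predicted_test_df = predict_df.filter(predict_df.prediction != float('nan'))

# Evaluate with the same RMSE evaluator used during validation.
test_RMSE = reg_eval.evaluate(predicted_test_df)
print('The model had a RMSE on the test set of {0}'.format(test_RMSE))

# COMMAND ----------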
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Keep only reviews from users present in review_count_per_user, then split train/test.
reviews_dataframe = reviews_data.join(review_count_per_user, 'user_index', 'leftsemi')
(training, test) = reviews_dataframe.randomSplit([0.8, 0.2])

print('########################## Training ###########################')
als = ALS(userCol="user_index", itemCol="business_index", ratingCol="stars",
          coldStartStrategy="drop")
als.setSeed(123)

# Parameter grid for the cross-validated search over maxIter, rank, and regParam.
grid = (ParamGridBuilder()
        .addGrid(als.maxIter, [20])
        .addGrid(als.rank, [20, 30, 40, 50, 60, 70])
        .addGrid(als.regParam, [0.45, 0.5, 0.55])
        .build())

evaluator = RegressionEvaluator(predictionCol=als.getPredictionCol(),
                                labelCol=als.getRatingCol(),
                                metricName='rmse')
cv = CrossValidator(estimator=als, estimatorParamMaps=grid, evaluator=evaluator, numFolds=5)
cvModel = cv.fit(training)
cvModel.save(r'E:\Big_data_project\model\collab_montreal_model\bestModel')

predictions = cvModel.transform(test)
predictions.cache()

print('########################## Computing RMSE ###########################')
rmse_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='stars',
                                     metricName='rmse')
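# An illustrative continuation (not from the original script): compute the test-set RMSE
# with the evaluator built above, inspect which grid point won the cross-validation, and
# reload the persisted best model. Only objects defined above are used; `restored_model`
# and the loop variable names are illustrative.
from pyspark.ml.tuning import CrossValidatorModel

print('Test RMSE: {}'.format(rmse_evaluator.evaluate(predictions)))

# avgMetrics is aligned with the parameter grid, so the smallest RMSE marks the winning
# (maxIter, rank, regParam) combination.
for param_map, metric in zip(grid, cvModel.avgMetrics):
    print({p.name: v for p, v in param_map.items()}, metric)

# The best underlying ALSModel exposes the rank it was trained with.
print('Best rank: {}'.format(cvModel.bestModel.rank))

# The saved model can be restored later without re-running the grid search.
restored_model = CrossValidatorModel.load(r'E:\Big_data_project\model\collab_montreal_model\bestModel')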
from math import sqrt
from operator import add

from pyspark.ml.evaluation import Evaluator


class MiEvaluador(Evaluator):
    """Custom RMSE evaluator that can be plugged into CrossValidator."""

    # Constructor signature inferred from the MiEvaluador(...) call below.
    def __init__(self, predictionCol, targetCol):
        super(MiEvaluador, self).__init__()
        self.predictionCol = predictionCol
        self.targetCol = targetCol

    def _evaluate(self, dataset):
        error = self.rmse(dataset, self.predictionCol, self.targetCol)
        print("Error: {}".format(error))
        return error

    def isLargerBetter(self):
        # Lower RMSE is better, so CrossValidator should minimize this metric.
        return False

    @staticmethod
    def rmse(dataset, predictionCol, targetCol):
        # Root-mean-squared error over the rows for which a prediction exists.
        return sqrt(dataset.dropna().rdd
                    .map(lambda x: (x[targetCol] - x[predictionCol]) ** 2)
                    .reduce(add) / float(dataset.count()))


lr1 = ALS()
grid1 = ParamGridBuilder().addGrid(lr1.regParam, [1.0, 0.5, 2.0]).build()
evaluator1 = MiEvaluador(predictionCol=lr1.getPredictionCol(), targetCol=lr1.getRatingCol())
cv1 = CrossValidator(estimator=lr1, estimatorParamMaps=grid1, evaluator=evaluator1, numFolds=2)
cvModel1 = cv1.fit(dfRatings)

a = cvModel1.transform(dfRatings)
error_cross_validation = MiEvaluador.rmse(a, lr1.getPredictionCol(), lr1.getRatingCol())
print('Cross-validation error: {}'.format(error_cross_validation))

error_models = []
for reg_param in (1.0, 0.5, 2.0):
    lr = ALS(regParam=reg_param)
    model = lr.fit(dfRatings)
    error = MiEvaluador.rmse(model.transform(dfRatings), lr.getPredictionCol(), lr.getRatingCol())
    error_models.append(error)
    print('reg_param: {}, rmse: {}'.format(reg_param, error))

import numpy as np
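# One possible use of the numpy import above (purely illustrative; `reg_params` and
# `best_idx` are hypothetical helpers mirroring the tuple iterated over in the loop):
# pick the regParam with the smallest training-set RMSE and compare it with the
# cross-validated error.
reg_params = (1.0, 0.5, 2.0)
best_idx = int(np.argmin(error_models))
print('Lowest training RMSE: regParam={} (rmse={}), vs. cross-validated error {}'.format(
    reg_params[best_idx], error_models[best_idx], error_cross_validation))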