Test.assertEquals(
    round(min_error, 2), 0.81,
    "Unexpected value for best RMSE. Expected rounded value to be 0.81. Got {0}"
    .format(round(min_error, 2)))
Test.assertEquals(
    ranks[best_rank], 12,
    "Unexpected value for best rank. Expected 12. Got {0}".format(
        ranks[best_rank]))
Test.assertEqualsHashed(
    als.getItemCol(), "18f0e2357f8829fe809b2d95bc1753000dd925a6",
    "Incorrect choice of {0} for ALS item column.".format(als.getItemCol()))
Test.assertEqualsHashed(
    als.getUserCol(), "db36668fa9a19fde5c9676518f9e86c17cabf65a",
    "Incorrect choice of {0} for ALS user column.".format(als.getUserCol()))
Test.assertEqualsHashed(
    als.getRatingCol(), "3c2d687ef032e625aa4a2b1cfca9751d2080322c",
    "Incorrect choice of {0} for ALS rating column.".format(
        als.getRatingCol()))

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2c) Testing Your Model
# MAGIC
# MAGIC So far, we used the `training_df` and `validation_df` datasets to select the best model.  Since we used these two datasets to determine what model is best, we cannot use them to test how good the model is; otherwise, we would be very vulnerable to [overfitting](https://en.wikipedia.org/wiki/Overfitting).  To decide how good our model is, we need to use the `test_df` dataset.  We will use the `best_rank` you determined in part (2b) to create a model for predicting the ratings for the test dataset and then we will compute the RMSE.
# MAGIC
# MAGIC The steps you should perform are:
# MAGIC * Run a prediction, using `my_model` as created above, on the test dataset (`test_df`), producing a new `predict_df` DataFrame.
# MAGIC * Filter out unwanted NaN values (necessary because of [a bug in Spark](https://issues.apache.org/jira/browse/SPARK-14489)). We've supplied this piece of code for you.
# MAGIC * Use the previously created RMSE evaluator, `reg_eval`, to evaluate the filtered DataFrame.
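# MAGIC
# MAGIC A hedged sketch of these steps is shown in the next cell. It assumes `my_model`, `test_df`, and `reg_eval` exist exactly as described above; the names `predicted_test_df` and `test_RMSE` are illustrative, not the official lab solution.

# COMMAND ----------

# Sketch only: apply the selected model to the held-out test set.
# Assumes `my_model`, `test_df`, and `reg_eval` from the cells above.
predict_df = my_model.transform(test_df)

# Drop rows where ALS produced NaN predictions (workaround for SPARK-14489)
predicted_test_df = predict_df.filter(predict_df.prediction != float('nan'))

# RMSE of the best model on the test set
test_RMSE = reg_eval.evaluate(predicted_test_df)
print('The model had a RMSE on the test set of {0}'.format(test_RMSE))
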
Code example #2
# NOTE: this listing begins mid-loop. The enclosing loop (not shown) iterates
# over the candidate `ranks`, fits an ALS model for each, and computes its
# validation RMSE as `error`; the restored `if` header below tracks the best rank.
  if error < min_error:
    min_error = error
    best_rank = err
  err += 1

als.setRank(ranks[best_rank])
print('The best model was trained with rank %s' % ranks[best_rank])
my_model = models[best_rank]

Code example #3
# NOTE: imports added here for completeness; `reviews_data` and
# `review_count_per_user` are assumed to be created earlier in the original program.
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Keep only reviews from users present in review_count_per_user
reviews_dataframe = reviews_data.join(review_count_per_user, 'user_index',
                                      'leftsemi')

# Hold out 20% of the filtered reviews as a test set
(training, test) = reviews_dataframe.randomSplit([0.8, 0.2])
print('########################## Training ###########################')
als = ALS(userCol="user_index",
          itemCol="business_index",
          ratingCol="stars",
          coldStartStrategy="drop")
als.setSeed(123)
# Build the hyperparameter grid: one maxIter, six ranks, three regParam values
grid = (ParamGridBuilder()
        .addGrid(als.maxIter, [20])
        .addGrid(als.rank, [20, 30, 40, 50, 60, 70])
        .addGrid(als.regParam, [0.45, 0.5, 0.55])
        .build())
evaluator = RegressionEvaluator(predictionCol=als.getPredictionCol(),
                                labelCol=als.getRatingCol(),
                                metricName='rmse')
cv = CrossValidator(estimator=als,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=5)
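# 1 maxIter x 6 ranks x 3 regParam values = 18 combinations; with 5 folds this
# trains 90 ALS models before the best setting is refit on all of `training`.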
cvModel = cv.fit(training)

cvModel.save(r'E:\Big_data_project\model\collab_montreal_model\bestModel')
predictions = cvModel.transform(test)
predictions.cache()

print('########################## Computing RMSE ###########################')

# The end of this listing was truncated in the original; the metricName
# argument and the evaluation below are the natural completion.
rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                     labelCol='stars',
                                     metricName='rmse')
rmse = rmse_evaluator.evaluate(predictions)
print('RMSE on the test split: {}'.format(rmse))


# The code below is a separate snippet: a custom Evaluator (MiEvaluador) that
# scores CrossValidator folds by RMSE. Its class header was lost in extraction;
# the imports, constructor, and _evaluate signature are reconstructed from how
# the class is used further down, and are an assumption rather than the original code.
from math import sqrt
from operator import add

from pyspark.ml.evaluation import Evaluator


class MiEvaluador(Evaluator):

    def __init__(self, predictionCol='prediction', targetCol='rating'):
        super(MiEvaluador, self).__init__()
        self.predictionCol = predictionCol
        self.targetCol = targetCol

    def _evaluate(self, dataset):
        error = self.rmse(dataset, self.predictionCol, self.targetCol)
        print("Error: {}".format(error))
        return error
    
    def isLargerBetter(self):
        return False
    
    @staticmethod
    def rmse(dataset, predictionCol, targetCol):
        # Root-mean-square error over non-null rows.
        # NOTE: DataFrame.map() is Spark 1.x; on Spark 2+ use dataset.dropna().rdd.map(...).
        return sqrt(dataset.dropna().map(lambda x: (x[targetCol] - x[predictionCol]) ** 2).reduce(add) / float(dataset.count()))


    
# Cross-validate ALS (default user/item/rating column names) over three regParam
# values with 2 folds, scoring folds with the custom evaluator above.
# `dfRatings` is assumed to be defined earlier in the original program.
lr1 = ALS()
grid1 = ParamGridBuilder().addGrid(lr1.regParam, [1.0, 0.5, 2.0]).build()
evaluator1 = MiEvaluador(predictionCol=lr1.getPredictionCol(), targetCol=lr1.getRatingCol())
cv1 = CrossValidator(estimator=lr1, estimatorParamMaps=grid1, evaluator=evaluator1, numFolds=2)
cvModel1 = cv1.fit(dfRatings)
a = cvModel1.transform(dfRatings)
error_cross_validation = MiEvaluador.rmse(a, lr1.getPredictionCol(), lr1.getRatingCol())
print('Validation error: {}'.format(error_cross_validation))

error_models=[]
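# Fit ALS once per regParam and compute RMSE on the same DataFrame used for
# fitting (training error; there is no held-out split in this comparison).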
for reg_param in (1.0,0.5,2.0):
    lr = ALS(regParam=reg_param)
    model = lr.fit(dfRatings)
    error=MiEvaluador.rmse(model.transform(dfRatings),lr.getPredictionCol(),lr.getRatingCol())
    error_models.append(error)
    print ('reg_param: {}, rmse: {}'.format(reg_param,error))
    
import numpy as np