def evaluateModel(model, validationData):
    """Return the root mean squared error of ``model`` on ``validationData``.

    Each record of ``validationData`` is read through its ``features`` and
    ``label`` attributes (presumably ``LabeledPoint`` records -- confirm
    against the caller).
    """
    features = validationData.map(lambda p: p.features)
    labels = validationData.map(lambda p: p.label)
    # Pair every prediction with the label of the same record.
    predictions_and_labels = model.predict(features).zip(labels)
    return RegressionMetrics(predictions_and_labels).rootMeanSquaredError
def train_model_spark(
    test_set: spark.DataFrame,
    training_set: spark.DataFrame,
    alpha: float = 1.0,
    l1_ratio: float = 0.5,
    saved_model=parameter.output.folder_data.with_flag(None)[PathStr],
) -> str:
    """Fit an assembler + logistic-regression pipeline, log metrics, save it.

    Args:
        test_set: Frame scored after fitting; it is read through the feature
            columns "0", "1", "2" and the "target" column.
        training_set: Frame the pipeline is fitted on (label column "target").
        alpha: Forwarded to ``LogisticRegression`` as ``elasticNetParam``.
        l1_ratio: Forwarded to ``LogisticRegression`` as ``regParam``.
        saved_model: Output location; ``str(saved_model)`` is the path the
            fitted pipeline is written to.

    Returns:
        The literal string ``"ok"``.
    """
    transform = VectorAssembler(inputCols=["0", "1", "2"], outputCol="features")
    lr = LogisticRegression(
        featuresCol="features",
        labelCol="target",
        regParam=l1_ratio,
        elasticNetParam=alpha,
        family="multinomial",
        maxIter=1,
    )
    ppl = Pipeline(stages=[transform, lr])

    # Fit the pipeline to training documents.
    model = ppl.fit(training_set)

    prediction = model.transform(test_set)
    evaluation = prediction.withColumn(
        "label", prediction["target"].cast(DoubleType())
    ).select(["label", "prediction"])
    evaluation.show()

    # RegressionMetrics reads each record as (prediction, observation).
    # Feeding (label, prediction) silently yields a wrong r2, so the columns
    # are put in the documented order before converting to an RDD.
    metrics = RegressionMetrics(evaluation.select("prediction", "label").rdd)

    log_metric("r2", metrics.r2)
    log_metric("alpha", alpha)

    model.write().save(str(saved_model))
    return "ok"
def test_model(train_RDD, validate_RDD, validate_for_predict_RDD):
    """Train ALS for each candidate rank and print the validation RMSE.

    Args:
        train_RDD: Ratings RDD handed to ``ALS.train``.
        validate_RDD: Records indexed as ``(user, product, rating)``; they are
            cast to ``((int, int), float)`` and joined with the predictions.
        validate_for_predict_RDD: Pairs handed to ``model.predictAll``.

    Prints one line per rank (hand-computed RMSE and the RegressionMetrics
    RMSE) and finally the rank with the lowest error. Returns ``None``.
    """
    seed = 5  # was the Python 2-only literal ``5L``
    iterations = 20
    regularization_parameter = 0.1
    ranks = [14]  # other candidates that were tried: 10, 20

    min_error = float('inf')
    best_rank = -1
    for rank in ranks:
        model = ALS.train(train_RDD, rank=rank, seed=seed,
                          iterations=iterations,
                          lambda_=regularization_parameter, nonnegative=True)
        predictions = model.predictAll(validate_for_predict_RDD).map(
            lambda r: ((r[0], r[1]), r[2]))
        # Join result: ((user, product), (actual rating, predicted rating)).
        rates_and_preds = validate_RDD.map(
            lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
        # RegressionMetrics expects (prediction, observation) pairs.
        preds_and_rates = rates_and_preds.map(lambda p: (p[1][1], p[1][0]))
        metrics = RegressionMetrics(preds_and_rates)
        error = math.sqrt(
            rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())
        print('For rank %s the RMSE is %s the reg_met is %s'
              % (rank, error, metrics.rootMeanSquaredError))
        if error < min_error:
            min_error = error
            best_rank = rank

    print('The best model was trained with rank %s' % best_rank)
def get_movie_rate():
    """Train ALS on the training ratings and return the RMSE as a string.

    A local SparkContext is created for the run and is always stopped, even
    when loading, training or evaluation raises.

    Returns:
        ``str(metrics.rootMeanSquaredError)`` for the predictions made on the
        user/movie pairs read from the ``random_pairs_5`` file.
    """
    conf = SparkConf().setMaster("local[*]").setAppName(
        "Movies Recommended Rates with ALS")
    sc = SparkContext(conf=conf)
    try:
        data = sc.textFile(
            "/Users/arz/Desktop/bigdata-project/ml-1m/ratings_training_5.dat")
        ratings = data.map(lambda l: l.split("::")).map(
            lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

        random_user_pairs_data = sc.textFile(
            "/Users/arz/Desktop/bigdata-project/ml-1m/random_pairs_5").map(
                lambda x: x.split("::"))
        random_user_pairs = random_user_pairs_data.map(
            lambda x: (x[0], x[1])).cache()

        rank = 10
        num_iterations = 20
        alpha = 0.01  # handed to ALS.train as its fourth positional argument
        model = ALS.train(ratings, rank, num_iterations, alpha)

        predictions = model.predictAll(random_user_pairs).map(
            lambda r: ((r[0], r[1]), r[2])).cache()
        rating_tuples = random_user_pairs_data.map(
            lambda x: ((int(x[0]), int(x[1])), float(x[2])))

        # Join result: ((user, movie), (predicted rating, actual rating)).
        scores = predictions.join(rating_tuples)
        print(scores.collect())

        score_labels = scores.map(lambda x: x[1])
        metrics = RegressionMetrics(score_labels)
        return str(metrics.rootMeanSquaredError)
    finally:
        # Release the context on every exit path, not only on success.
        sc.stop()
def printMetrics(predictions_and_labels):
    """Write five regression metrics for the given pairs to the file ``f``.

    ``f`` is not a parameter; it is resolved from the enclosing scope and must
    expose ``write``. One line is written per metric, in a fixed order.
    """
    metrics = RegressionMetrics(predictions_and_labels)
    # (line template, RegressionMetrics attribute), in output order.
    report = (
        ('Explained Variance:{0}\n', 'explainedVariance'),
        ('Mean Absolute Error:{0}\n', 'meanAbsoluteError'),
        ('Mean Squared Error:{0}\n', 'meanSquaredError'),
        ('Root Mean Squared Error:{0}\n', 'rootMeanSquaredError'),
        ('R^2 :{0}\n', 'r2'),
    )
    for template, attribute in report:
        f.write(template.format(getattr(metrics, attribute)))
def getPredictionsLabels(model, test_data):
    """Score ``test_data`` and build regression metrics against ``duration``.

    Returns:
        A ``(predictions, metrics)`` tuple: the frame returned by
        ``model.transform`` and a ``RegressionMetrics`` built from its
        ``(prediction, duration)`` row values.
    """
    scored = model.transform(test_data)

    def to_pair(row):
        # The observed value lives in the "duration" column.
        return (row.prediction, row.duration)

    summary = RegressionMetrics(scored.rdd.map(to_pair))
    return scored, summary
def train_model_spark(
    test_set: parameter(log_histograms=True)[spark.DataFrame],
    training_set: spark.DataFrame,
    alpha: float = 1.0,
    l1_ratio: float = 0.5,
    saved_model=model_output_parameter,
) -> str:
    """Fit an assembler + logistic-regression pipeline, log metrics, save it.

    Args:
        test_set: Frame scored after fitting; its "score_label" column is
            cast to double and used as the observed value.
        training_set: Frame the pipeline is fitted on.
        alpha: Forwarded to ``LogisticRegression`` as ``elasticNetParam``.
        l1_ratio: Forwarded to ``LogisticRegression`` as ``regParam``.
        saved_model: Output location; ``str(saved_model)`` is the save path.

    Returns:
        The path the fitted pipeline model was written to.
    """
    transform = VectorAssembler(inputCols=SELECTED_FEATURES, outputCol="features")
    lr = LogisticRegression(
        featuresCol="features",
        labelCol=LABEL_COLUMN,
        regParam=l1_ratio,
        elasticNetParam=alpha,
        family="multinomial",
        maxIter=1,
    )
    ppl = Pipeline(stages=[transform, lr])

    # Fit the pipeline to training documents.
    model = ppl.fit(training_set)

    prediction = model.transform(test_set)
    evaluation = prediction.withColumn(
        "label", prediction["score_label"].cast(DoubleType())
    ).select(["label", "prediction"])
    evaluation.show()

    # RegressionMetrics reads each record as (prediction, observation).
    # Feeding (label, prediction) silently yields a wrong r2, so the columns
    # are put in the documented order before converting to an RDD.
    metrics = RegressionMetrics(evaluation.select("prediction", "label").rdd)

    log_metric("r2", metrics.r2)
    log_metric("alpha", alpha)

    path = str(saved_model)
    model.write().save(path)
    return path
def __evaluate_rating(self, rat_inf: SparkDF):
    """Compute the regression metrics named in ``self.regression_metrics``.

    Args:
        rat_inf: Frame with a "prediction" column and, optionally, a "stars"
            column holding the observed rating.

    Returns:
        A dict keyed by the requested metric names ("rmse", "mae",
        "rsquared"); any other requested name is skipped.
    """
    # Implicit-feedback frames carry no "stars" column: use a constant 1.0.
    if "stars" not in rat_inf.columns:
        rat_inf = rat_inf.withColumn("stars", lit(1.0))

    label_column = col("stars").cast("double").alias("label")
    prediction_column = col("prediction").cast("double")
    pred_with_labels = rat_inf.na.drop().select(label_column, prediction_column)
    metrics = RegressionMetrics(
        pred_with_labels.rdd.map(lambda x: (x.prediction, x.label)))

    # Requested metric name -> RegressionMetrics attribute.
    attribute_for = {
        "rmse": "rootMeanSquaredError",
        "mae": "meanAbsoluteError",
        "rsquared": "r2",
    }
    results = {}
    for m in self.regression_metrics:
        attribute = attribute_for.get(m)
        if attribute is not None:
            results[m] = getattr(metrics, attribute)
    return results
def EvaluateModel(model, validationData):
    """Return the RMSE of ``model`` on ``validationData``.

    Predictions are converted to plain ``float`` before being paired with the
    ``label`` attribute of each validation record.
    """
    features = validationData.map(lambda p: p.features)
    labels = validationData.map(lambda p: p.label)
    predictions = model.predict(features).map(lambda x: float(x))
    score_and_labels = predictions.zip(labels)
    return RegressionMetrics(score_and_labels).rootMeanSquaredError
def test_regression_model(spark_context, regression_model, boston_housing_dataset):
    """Fit an ElephasEstimator pipeline on the housing data and print r2.

    Args:
        spark_context: Context handed to ``to_data_frame``.
        regression_model: Model whose ``to_yaml()`` output configures the
            estimator.
        boston_housing_dataset: ``(x_train, y_train, x_test, y_test)`` tuple.
    """
    batch_size = 64
    epochs = 10

    x_train, y_train, x_test, y_test = boston_housing_dataset
    df = to_data_frame(spark_context, x_train, y_train)
    test_df = to_data_frame(spark_context, x_test, y_test)

    sgd = optimizers.SGD(lr=0.00001)
    sgd_conf = optimizers.serialize(sgd)

    estimator = ElephasEstimator()
    estimator.set_keras_model_config(regression_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("mae")
    estimator.set_metrics(['mae'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.01)
    estimator.set_categorical_labels(False)

    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)

    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl.show(100)

    # RegressionMetrics expects (prediction, observation); the reverse order
    # would make the printed r2 describe the wrong quantity.
    prediction_and_observations = pnl.rdd.map(
        lambda row: (row.prediction, row.label))
    metrics = RegressionMetrics(prediction_and_observations)
    print(metrics.r2)
def printMetrics(model):
    """Score the ``test`` records with ``model`` and write metrics to ``f``.

    Neither ``test`` nor ``f`` is a parameter; both are resolved from the
    enclosing scope. Each record of ``test`` is read through its ``features``
    and ``label`` attributes.
    """
    def score(record):
        return (float(model.predict(record.features)), record.label)

    metrics = RegressionMetrics(test.map(score))
    # (line template, RegressionMetrics attribute), in output order.
    report = (
        ('Explained Variance:{0}\n', 'explainedVariance'),
        ('Mean Absolute Error:{0}\n', 'meanAbsoluteError'),
        ('Mean Squared Error:{0}\n', 'meanSquaredError'),
        ('Root Mean Squared Error:{0}\n', 'rootMeanSquaredError'),
        ('R^2 :{0}\n', 'r2'),
    )
    for template, attribute in report:
        f.write(template.format(getattr(metrics, attribute)))
def evaluateModel(model, validationData):
    """Return the RMSE of ``model`` on ``validationData``, with debug output.

    Prints the prediction RDD object and the first five
    (prediction, label) pairs before computing the metric.
    """
    features = validationData.map(lambda x: x.features)
    predictions = model.predict(features)
    print(predictions)

    labels = validationData.map(lambda x: x.label)
    predictions_and_labels = predictions.zip(labels)
    print("scoreAndLabels的前5项", predictions_and_labels.take(5))

    return RegressionMetrics(predictions_and_labels).rootMeanSquaredError
def main(sc):
    """Pick the best ALS rank on a validation split and report test RMSE.

    Reads "input/ratings.csv", parses it with the module-level ``split`` and
    ``parse`` helpers, splits it 6/2/2 into training, validation and test
    sets, and prints a report with the RMSE per rank, the best rank, and the
    test RMSE of a model retrained with that rank.

    Args:
        sc: SparkContext used to read the input file.
    """
    ratings_info = sc.textFile("input/ratings.csv")
    ratings_data = ratings_info.map(split).map(parse).filter(
        lambda line: line is not None)
    training, validation, test = ratings_data.randomSplit([6, 2, 2])

    ranks = [6, 8, 10, 12, 14]
    iteration = 10

    def rmse_on(model, rated):
        """Return the RMSE of ``model`` on records indexed (user, item, rating)."""
        pairs = rated.map(lambda x: (x[0], x[1]))
        predictions = model.predictAll(pairs).map(
            lambda r: ((r[0], r[1]), r[2]))
        ratings = rated.map(lambda r: ((r[0], r[1]), r[2]))
        # Join result: (key, (prediction, rating)), the order
        # RegressionMetrics expects.
        preds_and_rates = predictions.join(ratings).map(lambda tup: tup[1])
        return RegressionMetrics(preds_and_rates).rootMeanSquaredError

    min_error = float('inf')
    best_rank = -1
    report = []
    for rank in ranks:
        model = ALS.train(training, rank, iterations=iteration, lambda_=0.1)
        error = rmse_on(model, validation)
        # The original message ran the rank and "the" together ("6the").
        report.append("For rank " + str(rank) + " the RMSE is " + str(error) + "\n")
        if error < min_error:
            min_error = error
            best_rank = rank
    report.append("The best model was trained with rank " + str(best_rank) + "\n")

    model = ALS.train(training, best_rank, iterations=iteration, lambda_=0.1)
    error = rmse_on(model, test)
    report.append("The RMSE for Test data is " + str(error) + "\n")

    print("".join(report))
def evaluate_model():
    """Train the ensemble model and log validation and test RMSE.

    The raw train and test parquet files are located through
    ``setting.data_dir_interim`` (the original note says they hold Jan and
    Feb 2017 data). The raw training frame is split 75/25 into training and
    validation parts; the model is fitted on the featurized training part.

    :return: None
    """
    ml_model = train_model.EnsembleModel()

    df_raw_train_filepath = os.path.join(setting.data_dir_interim,
                                         setting.raw_train_filename)
    df_raw_test_filepath = os.path.join(setting.data_dir_interim,
                                        setting.raw_test_filename)
    logger.info("using data from {} for training and validation".format(
        df_raw_train_filepath))
    logger.info("using data from {} for testing".format(df_raw_test_filepath))

    df_raw_train = spark.read.parquet(df_raw_train_filepath)
    df_raw_test = spark.read.parquet(df_raw_test_filepath)

    train_frac = 0.75
    test_frac = (1 - train_frac)
    df_raw_train, df_raw_val = df_raw_train.randomSplit(
        [train_frac, test_frac])

    df_train = build_features.featurize(df_raw_train)
    ml_model = ml_model.fit(df_train)

    # transform() returns a pair; only its second element is used here.
    _, val_predictions = ml_model.transform(
        build_features.featurize(df_raw_val))
    _, test_predictions = ml_model.transform(
        build_features.featurize(df_raw_test))

    # RegressionMetrics expects (prediction, observation). RMSE is symmetric,
    # but the documented order keeps any later r2/explained-variance use
    # correct.
    val_prediction_labels = val_predictions.select(
        "prediction", "tip_amount").rdd
    val_test_metrics = RegressionMetrics(val_prediction_labels)

    test_prediction_labels = test_predictions.select(
        "prediction", "tip_amount").rdd
    test_test_metrics = RegressionMetrics(test_prediction_labels)

    logger.info("Validation set RMSE = {}".format(
        val_test_metrics.rootMeanSquaredError))
    logger.info("Test set RMSE = {}".format(
        test_test_metrics.rootMeanSquaredError))
def report_accuracy(result_rdd):
    """Print regression metrics for ``result_rdd``; do nothing when it is empty.

    Each record is indexed as ``t[0]`` / ``t[1]``; the metric pairs are built
    as ``(float(t[1]), float(t[0]))``, i.e. ``t[1]`` is used as the prediction
    and ``t[0]`` as the observation.
    """
    from pyspark.mllib.evaluation import RegressionMetrics

    if result_rdd.isEmpty():
        return

    def swap_as_floats(t):
        return (float(t[1]), float(t[0]))

    metrics = RegressionMetrics(result_rdd.map(swap_as_floats))
    # (line template, RegressionMetrics attribute), in output order.
    report = (
        ("MSE = %s", "meanSquaredError"),
        ("RMSE = %s", "rootMeanSquaredError"),
        ("R-squared = %s", "r2"),
        ("MAE = %s", "meanAbsoluteError"),
        ("Explained variance = %s", "explainedVariance"),
    )
    for template, attribute in report:
        print(template % getattr(metrics, attribute))
def taxi_regression(sc, filename):
    '''
    Fit a linear regression on a sample of the trip records in ``filename``
    and print test RMSE, test explained variance and training RMSE.

    Args:
        sc: The Spark Context
        filename: CSV file to load; it is read with a header row and schema
                  inference, and must provide the pickup/dropoff
                  longitude/latitude columns used by the filters below
    '''
    sqlContext = SQLContext(sc)
    loaded = sqlContext.read.load(filename,
                                  format='com.databricks.spark.csv',
                                  header='true',
                                  inferSchema='true')
    df = loaded.sample(False, 0.001)

    # Keep trips whose pickup and dropoff both lie inside the box
    # -74.05 < longitude < -73.75 and 40.6 < latitude < 40.9.
    inside_longitude = ((df.pickup_longitude < -73.75) & (df.pickup_longitude > -74.05)
                        & (df.dropoff_longitude < -73.75) & (df.dropoff_longitude > -74.05))
    df = df.filter(inside_longitude)
    inside_latitude = ((df.pickup_latitude < 40.9) & (df.pickup_latitude > 40.6)
                       & (df.dropoff_latitude < 40.9) & (df.dropoff_latitude > 40.6))
    df = df.filter(inside_latitude)

    # get_labeled_point yields records indexed as (label, features).
    candidates = df.rdd.map(get_labeled_point)
    plausible = candidates.filter(
        lambda row: 0.0 < row[0] < 60 and 0.0 < row[1][0] < 0.21)
    labeled_rdd = plausible.map(lambda row: LabeledPoint(row[0], row[1]))

    # Build model
    training_data, test_data = labeled_rdd.randomSplit([0.8, 0.2])
    model = LinearRegressionWithSGD.train(training_data, intercept=True)

    def predict_and_label(p):
        return (float(model.predict(p.features)), p.label)

    trainingMetrics = RegressionMetrics(training_data.map(predict_and_label))
    metrics = RegressionMetrics(test_data.map(predict_and_label))

    print("RMSE = ", metrics.rootMeanSquaredError," Explained Variance = ", metrics.explainedVariance, " RMSE Training = ", trainingMetrics.rootMeanSquaredError)
def amazon_regression(sc, filename):
    '''
    Fit a TF-IDF linear regression on a sample of the reviews and compute
    RMSE / explained variance on the training and test halves.

    Args:
        sc: The Spark Context
        filename: Filename of the Amazon reviews file to use, where each line
                  represents a review
    '''
    reviews = sc.textFile(filename).sample(False, 0.0001)
    reviews = reviews.map(lambda x: loadcsv(x))
    reviews = reviews.filter(lambda x: x is not None)

    # Records are indexed as (label, review text).
    labels = reviews.map(lambda x: x[0])
    reviews = (reviews.map(lambda x: (float(x[0]), x[1])).mapValues(
        lambda x: x.split()))

    # Feed HashingTF the array of words
    tf = HashingTF().transform(reviews.map(lambda x: x[1]))

    # Pipe term frequencies into the IDF
    idf = IDF(minDocFreq=5).fit(tf)

    # Transform the IDF into a TF-IDF
    tfidf = idf.transform(tf)

    parsedData = (labels.zip(tfidf).map(lambda x: LabeledPoint((x[0]), x[1])))
    training, test = parsedData.randomSplit([0.5, 0.5])

    # Build the model
    model = LinearRegressionWithSGD.train(training, iterations=10000,
                                          step=0.000000001)

    def prediction_and_label(split_rdd):
        """Return (prediction, label) pairs for one split."""
        labelled = split_rdd.map(lambda x: x.label).zip(
            model.predict(split_rdd.map(lambda x: x.features)))
        # The zip yields (label, prediction); RegressionMetrics expects
        # (prediction, observation), otherwise explainedVariance is wrong.
        return labelled.map(lambda lp: (float(lp[1]), lp[0]))

    trained_metrics = RegressionMetrics(prediction_and_label(training))
    train_rootMeanSquaredError = trained_metrics.rootMeanSquaredError
    train_explainedVariance = trained_metrics.explainedVariance

    test_metrics = RegressionMetrics(prediction_and_label(test))
    test_rootMeanSquaredError = test_metrics.rootMeanSquaredError
    test_explainedVariance = test_metrics.explainedVariance
    # NOTE(review): the four metric values above are computed but neither
    # printed nor returned in the visible code -- confirm how they are meant
    # to be reported.
def evaluateModel(model, validationData):
    '''
    Evaluate the model with the root mean squared error (RMSE).

    :param model: model exposing ``predict`` for an RDD of feature vectors
    :param validationData: records exposing ``features`` and ``label``
    :return: the RMSE of the predictions against the labels
    '''
    labels = validationData.map(lambda p: p.label)
    features = validationData.map(lambda p: p.features)
    predictions_and_labels = model.predict(features).zip(labels)
    rmse = RegressionMetrics(predictions_and_labels).rootMeanSquaredError
    return rmse
def evaluateRegression(self, scoreAndLabels):
    """Return MAE, MSE, RMSE and R-squared for ``scoreAndLabels`` as a dict.

    The keys are exactly 'MAE', 'MSE', 'RMSE' and 'R-squared'.
    """
    metrics = RegressionMetrics(scoreAndLabels)
    return {
        'MAE': metrics.meanAbsoluteError,
        'MSE': metrics.meanSquaredError,
        'RMSE': metrics.rootMeanSquaredError,
        'R-squared': metrics.r2,
    }
def tql_regression(sc, filename):
    """Fit an L2-regularised linear regression and print train/test metrics.

    Args:
        sc: SparkContext used to read ``filename``.
        filename: Text file whose lines are expanded by the module-level
            ``process_record`` into records indexed as (price, attributes).

    Returns:
        Always 0.
    """
    # load in csv
    records = sc.textFile(filename).sample(False, 0.1, 16)
    p_records = records.flatMap(lambda l: process_record(l))

    # Element 0 is the label and element 1 the features, which is what
    # zipping keys() with values() produced.
    labeled_points = p_records.map(lambda kv: LabeledPoint(kv[0], kv[1]))
    print(labeled_points.take(10))

    training, test = labeled_points.randomSplit([0.8, 0.2])
    model = LinearRegressionWithSGD.train(training, iterations=200,
                                          regType="l2")

    # Use our model to predict
    train_predicts = model.predict(training.map(lambda x: x.features))
    train_preds = training.map(lambda x: x.label).zip(train_predicts)
    test_preds = test.map(lambda x: x.label).zip(
        model.predict(test.map(lambda x: x.features)))
    print(train_preds.take(10))
    print(test_preds.take(10))

    # The zipped pairs are (label, prediction); RegressionMetrics expects
    # (prediction, observation), otherwise explainedVariance is wrong.
    trained_metrics = RegressionMetrics(
        train_preds.map(lambda x: (float(x[1]), x[0])))
    test_metrics = RegressionMetrics(
        test_preds.map(lambda x: (float(x[1]), x[0])))

    print("___________________trained RMSE",
          trained_metrics.rootMeanSquaredError)
    print("___________________trained EV", trained_metrics.explainedVariance)
    print("___________________test RMSE", test_metrics.rootMeanSquaredError)
    print("___________________test EV", test_metrics.explainedVariance)
    return 0
def testRegression(train, test):
    """Fit a gradient-boosted-trees regressor and print RMSE, r2 and MAE.

    Args:
        train: Frame the regressor is fitted on (label column
            "indexedLabel").
        test: Frame scored to produce the printed metrics.
    """
    # Train a GradientBoostedTrees model.
    gbt = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel")
    model = gbt.fit(train)

    scored = model.transform(test).select("prediction", "indexedLabel")
    # DataFrame has no ``map`` method since PySpark 2.0; go through ``.rdd``,
    # which is available on every version.
    predictionAndLabels = scored.rdd.map(
        lambda x: (x.prediction, x.indexedLabel))

    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)
def alsModelEvaluate(model, testing_rdd):
    """Return the RMSE of an ALS ``model`` on ``testing_rdd``.

    Args:
        model: Model exposing ``predictAll``.
        testing_rdd: Records indexed as (user, product, rating).

    The actual ratings are taken from ``testing_rdd`` itself. The previous
    code joined against a module-level ``testing_ratings`` instead, so the
    result did not depend on the argument's ratings.
    """
    # Predict a rating for every (user, product) pair of the test set.
    predict_rdd = model.predictAll(testing_rdd.map(lambda r: (r[0], r[1])))
    actual_rdd = testing_rdd.map(lambda r: ((r[0], r[1]), r[2]))
    # Join result: ((user, product), (predicted rating, actual rating)).
    predict_actual_rdd = predict_rdd.map(
        lambda r: ((r[0], r[1]), r[2])).join(actual_rdd)

    # Build the metrics object from the (prediction, observation) pairs.
    metrics = RegressionMetrics(predict_actual_rdd.map(lambda pr: pr[1]))

    # Return the root mean squared error.
    return metrics.rootMeanSquaredError
def testRegression(train, test):
    """Fit a random-forest regressor and print RMSE, r2 and MAE.

    Args:
        train: Frame the regressor is fitted on (label column
            "indexedLabel").
        test: Frame scored to produce the printed metrics.
    """
    # Train a RandomForest model.
    # Note: Use larger numTrees in practice.
    rf = RandomForestRegressor(labelCol="indexedLabel", numTrees=3, maxDepth=4)
    model = rf.fit(train)

    scored = model.transform(test).select("prediction", "indexedLabel")
    # DataFrame has no ``map`` method since PySpark 2.0; go through ``.rdd``,
    # which is available on every version.
    predictionAndLabels = scored.rdd.map(
        lambda x: (x.prediction, x.indexedLabel))

    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)
def Print_regression_info(xy_predict):
    '''
    Print regression metrics (MSE, RMSE, R-squared, MAE).

    xy_predict: the data set predicted by the model; its rows are read
                through the ``prediction`` and ``label`` fields
    '''
    predict_and_target_rdd = xy_predict.rdd.map(
        lambda row: (float(row.prediction), float(row.label)))
    metrics = RegressionMetrics(predict_and_target_rdd)

    # print() with one pre-formatted argument emits the same text on
    # Python 2 and Python 3; the former print statements were Python 2-only.
    print('----------------------------------------------')
    print("MSE: %s" % metrics.meanSquaredError)
    print("RMSE: %s" % metrics.rootMeanSquaredError)
    print("R-squared: %s" % metrics.r2)
    print("MAE: %s" % metrics.meanAbsoluteError)
    print('----------------------------------------------')
def performRegression(data, params):
    """Train lasso, linear and ridge models and name the one with lowest RMSE.

    Args:
        data: RDD of records exposing ``label`` and ``features``; it is split
            80/20 into training and test parts.
        params: Not used by this function.

    Returns:
        "lasso", "linear" or "ridge". When several models tie for the lowest
        test RMSE, "ridge" is preferred, then "lasso". The previous strict
        comparisons returned "ridge" even when lasso and linear tied for
        best and ridge was the worst.
    """
    training, test = data.randomSplit([.8, .2])

    lasso = performLasso(training)
    linReg = performLinearRegression(training)
    ridgeReg = performRidgeRegression(training)

    def test_rmse(model):
        """Return the RMSE of ``model`` on the held-out test part."""
        labelled = test.map(lambda x: x.label).zip(
            model.predict(test.map(lambda x: x.features)))
        # The zip yields (label, prediction); RegressionMetrics expects
        # (prediction, observation).
        pairs = labelled.map(lambda x: (float(x[1]), x[0]))
        return RegressionMetrics(pairs).rootMeanSquaredError

    lassoValue = test_rmse(lasso)
    linRegValue = test_rmse(linReg)
    ridgeRegValue = test_rmse(ridgeReg)

    # min() keeps the first of equal candidates, so list order is the
    # tie-break order.
    candidates = [
        ("ridge", ridgeRegValue),
        ("lasso", lassoValue),
        ("linear", linRegValue),
    ]
    return min(candidates, key=lambda candidate: candidate[1])[0]
def evaluateRddRegressionModel(self):
    """Print MSE, RMSE, R-squared, MAE and explained variance.

    The pairs come from ``self.getRddPredictionsLabels`` applied to the
    object's RDD model and RDD test set.
    """
    # Get predictions
    predictions_and_labels = self.getRddPredictionsLabels(
        self._get_rddModel(), self._get_rddTest())

    # Instantiate metrics object
    metrics = RegressionMetrics(predictions_and_labels)

    # (line template, RegressionMetrics attribute), in output order.
    report = (
        ("MSE = %s", "meanSquaredError"),
        ("RMSE = %s", "rootMeanSquaredError"),
        ("R-squared = %s", "r2"),
        ("MAE = %s", "meanAbsoluteError"),
        ("Explained variance = %s", "explainedVariance"),
    )
    for template, attribute in report:
        print(template % getattr(metrics, attribute))
def evaluate_prediction(self, test_data, test_target):
    """
    Log a sample of the predictions and the test-set MSE.

    @test_data: The test data
    @test_target: The labels of the test data.
    @return: Each true value paired with its predicted value, i.e. records
             of the form (label, prediction)
    """
    predicted = self.predict(test_data).map(float)
    rf_predicted_values = test_target.zip(predicted)

    # NOTE(review): the pairs are (label, prediction); MSE is symmetric so
    # the logged value is unaffected, but order-sensitive metrics such as r2
    # would need (prediction, label).
    metrics_rf = RegressionMetrics(rf_predicted_values)

    sample = str(rf_predicted_values.take(5))
    self._log.info('Random Forest predictions: {}'.format(sample))
    self._log.info('TestSet MSE = {}'.format(metrics_rf.meanSquaredError))
    return rf_predicted_values
def bestmodel(traindata, validatedata):
    """Select the ALS rank with the lowest validation RMSE, then print MAP.

    Reads the module-level ``ranks``, ``numIterations`` and ``regulz_para``
    settings and the ``order_movies`` / ``movie_index`` helpers.

    Args:
        traindata: Ratings RDD handed to ``ALS.train``.
        validatedata: Records indexed as (userId, movieId, rating).

    Raises:
        ValueError: If no candidate rank yields an RMSE below infinity
            (for example when ``ranks`` is empty).
    """
    bestValidationRmse = float("inf")
    best_rank = None

    # map validate data to userId, movieId
    validation = validatedata.map(lambda r: (r[0], r[1]))
    # get actual rating data for pairs of userId, movieId
    ratingTuple = validatedata.map(
        lambda r: ((int(r[0]), int(r[1])), float(r[2])))

    for rank in ranks:
        # create model by train data
        model = ALS.train(traindata, rank, numIterations, lambda_=regulz_para)
        # predict ratings for validation data
        predictions = model.predictAll(validation).map(
            lambda r: ((r[0], r[1]), r[2]))
        # Join result: (key, (prediction, actual rating)).
        scoreAndLabels = predictions.join(ratingTuple).map(lambda tup: tup[1])
        regMetrics = RegressionMetrics(scoreAndLabels)
        RMSE = regMetrics.rootMeanSquaredError
        MSE = regMetrics.meanSquaredError
        print("For rank %s:" % rank)
        print("RMSE = %s" % RMSE)
        print("MSE = %s" % MSE)
        if RMSE < bestValidationRmse:
            bestValidationRmse = RMSE
            best_rank = rank

    if best_rank is None:
        # Previously this surfaced as a NameError on the print below.
        raise ValueError("no candidate rank produced a usable validation RMSE")
    print('The best model was trained with rank %s' % best_rank)

    # MAP: compare the per-user movie sequences ordered by predicted and by
    # actual rating.
    model = ALS.train(traindata, best_rank, numIterations, lambda_=regulz_para)
    actual_user_movie = validatedata.map(
        lambda x: (x[0], (x[1], x[2]))).groupByKey()
    actual_user_movie1 = actual_user_movie.map(order_movies)
    predict_user_movie = model.predictAll(validation).map(
        lambda r: (r[0], (r[1], r[2]))).groupByKey()
    predict_user_movie1 = predict_user_movie.map(order_movies)

    # Join result: (user, (predicted sequence, actual sequence)).
    movie_seq = predict_user_movie1.join(actual_user_movie1).map(
        lambda x: x[1])
    movie_seq = movie_seq.map(movie_index)

    rankMetrics = RankingMetrics(movie_seq)
    MAP = rankMetrics.meanAveragePrecision
    print("MAP = %s" % MAP)
def main(sc):
    """Run 5-fold cross-validation of ALS and print mean MSE, RMSE and MAP.

    Reads "input/ratings.csv", parses it with the module-level ``split`` and
    ``parse`` helpers, and splits it into five equally weighted folds. Each
    fold serves once as the test set while the others form the training set.

    Args:
        sc: SparkContext used to read the input and build RDDs.
    """
    ratings_info = sc.textFile("input/ratings.csv")
    ratings_data = ratings_info.map(split).map(parse).filter(
        lambda line: line is not None)

    folds = ratings_data.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2])
    num_folds = len(folds)

    rank = 12
    itr = 25
    mse_total = 0
    rmse_total = 0
    map_total = 0  # named so the builtin ``map`` is not shadowed

    for i, test_data in enumerate(folds):
        # Training set: every fold except the held-out one.
        train_data = sc.emptyRDD()
        for j, fold in enumerate(folds):
            if j != i:
                train_data = train_data.union(fold)

        model = ALS.train(train_data, rank, iterations=itr, lambda_=0.1)

        testdata = test_data.map(lambda p: (p[0], p[1]))
        predictions = model.predictAll(testdata).map(
            lambda r: ((r[0], r[1]), r[2]))
        rates = test_data.map(lambda r: ((r[0], r[1]), r[2]))
        # Join result: (key, (prediction, actual rating)).
        predsAndlabels = predictions.join(rates).map(lambda tup: tup[1])

        actual_rating = predsAndlabels.map(lambda r: r[1]).collect()
        predicted_rating = predsAndlabels.map(lambda r: r[0]).collect()
        # NOTE(review): RankingMetrics receives one record holding two lists
        # of rating values, not ranked item ids -- confirm this is the
        # intended definition of MAP.
        predAndReal = sc.parallelize([(predicted_rating, actual_rating)])

        metrics = RegressionMetrics(predsAndlabels)
        metric = RankingMetrics(predAndReal)

        mse_total += metrics.meanSquaredError
        rmse_total += metrics.rootMeanSquaredError
        map_total += metric.meanAveragePrecision

    k_mse = mse_total / float(num_folds)
    k_rmse = rmse_total / float(num_folds)
    k_map = map_total / float(num_folds)

    print("MSE = %s" % k_mse)
    print("RMSE = %s" % k_rmse)
    print("MAP = %s" % k_map)
def get_val_metrics(model, val):
    """Compute RMSE and top-500 ranking metrics on the validation frame.

    Args:
        model: Fitted model exposing ``transform`` and
            ``recommendForUserSubset``.
        val: Validation frame with "user", "item" and "rating" columns;
            items rated 3 or higher form each user's ground-truth list.

    Side effects:
        Writes the joined (true_item_list, top_items) frame as JSON to
        'val_recs.json'.

    Returns:
        ``(rmse, precision_at_500, mean_average_precision, ndcg_at_500)``.
    """
    preds = model.transform(val)
    recs = model.recommendForUserSubset(val, 500)

    top_items = recs.selectExpr('user as user',
                                'recommendations.item as top_items')
    true_items = val.where(val.rating >= 3).groupby('user').agg(
        collect_list('item').alias('true_item_list'))

    predictions_and_labels_rankings = top_items.join(
        true_items, how='inner', on='user').select('true_item_list',
                                                   'top_items')
    predictions_and_labels_rankings.write.json('val_recs.json')

    # RankingMetrics expects (predicted ranking, ground truth) pairs. The
    # frame above is laid out as (truth, predictions), so reorder the
    # columns before building the metrics.
    ranking_pairs = predictions_and_labels_rankings.select(
        'top_items', 'true_item_list').cache().rdd
    ranking_metrics = RankingMetrics(ranking_pairs)
    prec_at = ranking_metrics.precisionAt(500)
    mean_avg_prec = ranking_metrics.meanAveragePrecision
    ndcg = ranking_metrics.ndcgAt(500)

    # The RegressionMetrics-based RMSE that used to be computed here was
    # overwritten immediately by the evaluator result, so only the evaluator
    # is kept.
    evaluator = RegressionEvaluator(predictionCol='prediction',
                                    labelCol='rating',
                                    metricName='rmse')
    rmse = evaluator.evaluate(preds)

    return rmse, prec_at, mean_avg_prec, ndcg