Example 1
def evaluateModel(model, validationData):
    score = model.predict(validationData.map(lambda p: p.features))
    scoreAndLabels = score.zip(validationData.map(lambda p: p.label))
    metrics = RegressionMetrics(scoreAndLabels)
    # Root mean squared error
    RMSE = metrics.rootMeanSquaredError
    return RMSE
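A recurring subtlety in these examples: RegressionMetrics takes an RDD of (prediction, observation) pairs of floats. RMSE, MSE, and MAE are symmetric in the two positions, but r2 and explainedVariance are not, so the order matters. A minimal self-contained sketch, with toy values invented purely for illustration:

from pyspark import SparkContext
from pyspark.mllib.evaluation import RegressionMetrics

sc = SparkContext.getOrCreate()

# RDD of (prediction, observation) pairs; toy values for illustration only.
pairs = sc.parallelize([(2.5, 3.0), (0.0, -0.5), (2.1, 2.0), (7.8, 7.0)])
metrics = RegressionMetrics(pairs)

print("RMSE = %s" % metrics.rootMeanSquaredError)
print("R-squared = %s" % metrics.r2)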
Example 2
def train_model_spark(
    test_set: spark.DataFrame,
    training_set: spark.DataFrame,
    alpha: float = 1.0,
    l1_ratio: float = 0.5,
    saved_model=parameter.output.folder_data.with_flag(None)[PathStr],
) -> str:

    transform = VectorAssembler(inputCols=["0", "1", "2"],
                                outputCol="features")
    lr = LogisticRegression(
        featuresCol="features",
        labelCol="target",
        # regParam is the overall strength, elasticNetParam the L1 ratio.
        regParam=alpha,
        elasticNetParam=l1_ratio,
        family="multinomial",
        maxIter=1,
    )
    ppl = Pipeline(stages=[transform, lr])

    # Fit the pipeline to training documents.
    model = ppl.fit(training_set)

    prediction = model.transform(test_set)
    evaluation = prediction.withColumn("label", prediction["target"].cast(
        DoubleType())).select(["label", "prediction"])
    evaluation.show()
    metrics = RegressionMetrics(evaluation.rdd)

    log_metric("r2", metrics.r2)
    log_metric("alpha", alpha)

    model.write().save(str(saved_model))
    return "ok"
Example 3
def test_model(train_RDD, validate_RDD, validate_for_predict_RDD):
    seed = 5
    iterations = 20
    regularization_parameter = 0.1
    ranks = [14]  # e.g. [10, 20, 14]
    errors = [0, 0, 0]
    reg_met = [0, 0, 0]
    err = 0

    min_error = float('inf')
    best_rank = -1
    best_iteration = -1
    for rank in ranks:
        model = ALS.train(train_RDD, rank=rank, seed=seed, iterations=iterations, lambda_=regularization_parameter, nonnegative=True)
        # model = ALS.trainImplicit(train_RDD, rank=rank, seed=seed, iterations=iterations, lambda_=regularization_parameter)
        predictions = model.predictAll(validate_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))

        rates_and_preds = validate_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
        valuesAndPreds = rates_and_preds.map(lambda p: p[1])
        metrics = RegressionMetrics(valuesAndPreds)
        error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
        errors[err] = error
        reg_met[err] = metrics.rootMeanSquaredError
        err += 1
        print('For rank %s the RMSE is %s the reg_met is %s' % (
            rank, error, metrics.rootMeanSquaredError))
        if error < min_error:
            min_error = error
            best_rank = rank

    print('The best model was trained with rank %s' % best_rank)
Example 4
def get_movie_rate():
    conf = SparkConf().setMaster("local[*]").setAppName(
        "Movies Recommended Rates with ALS")
    sc = SparkContext(conf=conf)

    data = sc.textFile(
        "/Users/arz/Desktop/bigdata-project/ml-1m/ratings_training_5.dat")

    ratings = data.map(lambda l: l.split("::")).map(
        lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

    random_user_pairs_data = sc.textFile(
        "/Users/arz/Desktop/bigdata-project/ml-1m/random_pairs_5").map(
            lambda x: x.split("::"))
    # cast ids to int up front, consistent with rating_tuples below
    random_user_pairs = random_user_pairs_data.map(
        lambda x: (int(x[0]), int(x[1]))).cache()
    # print(random_user_pairs.collect())
    rank = 10
    num_iterations = 20
    # The fourth positional argument of ALS.train is lambda_, the
    # regularization parameter, so name it accordingly.
    reg_param = 0.01
    model = ALS.train(ratings, rank, num_iterations, reg_param)
    predictions = model.predictAll(random_user_pairs).map(
        lambda r: ((r[0], r[1]), r[2])).cache()
    rating_tuples = random_user_pairs_data.map(
        lambda x: ((int(x[0]), int(x[1])), float(x[2])))
    scores = predictions.join(rating_tuples)
    print(scores.collect())
    score_labels = scores.map(lambda x: x[1])
    metrics = RegressionMetrics(score_labels)
    root_mean_square_error = str(metrics.rootMeanSquaredError)
    sc.stop()
    return root_mean_square_error
Example 5
def printMetrics(predictions_and_labels):
    metrics = RegressionMetrics(predictions_and_labels)
    f.write('Explained Variance:{0}\n'.format(metrics.explainedVariance))
    f.write('Mean Absolute Error:{0}\n'.format(metrics.meanAbsoluteError))
    f.write('Mean Squared Error:{0}\n'.format(metrics.meanSquaredError))
    f.write('Root Mean Squared Error:{0}\n'.format(metrics.rootMeanSquaredError))
    f.write('R^2 :{0}\n'.format(metrics.r2))
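printMetrics above writes through a file handle f captured from the enclosing scope rather than receiving it as an argument. A minimal usage sketch, assuming a module-level handle and an existing RDD of (prediction, label) float pairs (both names are assumptions, not part of the original):

# predictions_and_labels is an RDD of (prediction, label) float pairs,
# as in the other examples; f must exist before printMetrics runs.
f = open("regression_metrics.txt", "w")
printMetrics(predictions_and_labels)
f.close()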
Example 6
def getPredictionsLabels(model, test_data):
    predictions = model.transform(test_data)

    trainingSummary = RegressionMetrics(
        predictions.rdd.map(lambda row: (row.prediction, row.duration)))

    return (predictions, trainingSummary)
Example 7
def train_model_spark(
    test_set: parameter(log_histograms=True)[spark.DataFrame],
    training_set: spark.DataFrame,
    alpha: float = 1.0,
    l1_ratio: float = 0.5,
    saved_model=model_output_parameter,
) -> str:
    transform = VectorAssembler(inputCols=SELECTED_FEATURES,
                                outputCol="features")
    lr = LogisticRegression(
        featuresCol="features",
        labelCol=LABEL_COLUMN,
        # regParam is the overall strength, elasticNetParam the L1 ratio.
        regParam=alpha,
        elasticNetParam=l1_ratio,
        family="multinomial",
        maxIter=1,
    )
    ppl = Pipeline(stages=[transform, lr])

    # Fit the pipeline to training documents.
    model = ppl.fit(training_set)

    prediction = model.transform(test_set)
    # RegressionMetrics expects (prediction, observation) pairs.
    evaluation = prediction.withColumn(
        "label", prediction["score_label"].cast(DoubleType())).select(
            ["prediction", "label"])
    evaluation.show()
    metrics = RegressionMetrics(evaluation.rdd)

    log_metric("r2", metrics.r2)
    log_metric("alpha", alpha)

    path = str(saved_model)
    model.write().save(path)
    return path
Example 8
    def __evaluate_rating(self, rat_inf: SparkDF):
        
        # use a literal 1.0 rating when rat_inf holds implicit feedback
        if "stars" not in rat_inf.columns:
            rat_inf = rat_inf.withColumn("stars", lit(1.0))

        # RegressionMetrics
        pred_with_labels = (rat_inf
                            .na.drop()
                            .select(col("stars").cast("double").alias("label"),
                                    col("prediction").cast("double")))

        metrics = RegressionMetrics(pred_with_labels.rdd.map(lambda x: (x.prediction, x.label)))

        results = {}

        for m in self.regression_metrics:
            if m == "rmse":
                results[m] = metrics.rootMeanSquaredError
            elif m == "mae":
                results[m] = metrics.meanAbsoluteError
            elif m == "rsquared":
                results[m] = metrics.r2

        return results
Example 9
def EvaluateModel(model, validationData):
    score = model.predict(validationData.map(lambda p: p.features))
    score = score.map(lambda x: float(x))
    scoreAndLabels = score.zip(validationData.map(lambda p: p.label))
    metric = RegressionMetrics(scoreAndLabels)
    RMSE = metric.rootMeanSquaredError
    return RMSE
Example 10
def test_regression_model(spark_context, regression_model,
                          boston_housing_dataset):
    batch_size = 64
    epochs = 10

    x_train, y_train, x_test, y_test = boston_housing_dataset
    df = to_data_frame(spark_context, x_train, y_train)
    test_df = to_data_frame(spark_context, x_test, y_test)

    sgd = optimizers.SGD(lr=0.00001)
    sgd_conf = optimizers.serialize(sgd)
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(regression_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("mae")
    estimator.set_metrics(['mae'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.01)
    estimator.set_categorical_labels(False)

    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)
    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl.show(100)

    prediction_and_observations = pnl.rdd.map(lambda row:
                                              (row.label, row.prediction))
    metrics = RegressionMetrics(prediction_and_observations)
    print(metrics.r2)
Example 11
def printMetrics(model):
    predictions_and_labels = test.map(lambda lr: (float(model.predict(lr.features)), lr.label))
    metrics = RegressionMetrics(predictions_and_labels)
    f.write('Explained Variance:{0}\n'.format(metrics.explainedVariance))
    f.write('Mean Absolute Error:{0}\n'.format(metrics.meanAbsoluteError))
    f.write('Mean Squared Error:{0}\n'.format(metrics.meanSquaredError))
    f.write('Root Mean Squared Error:{0}\n'.format(metrics.rootMeanSquaredError))
    f.write('R^2 :{0}\n'.format(metrics.r2))
Example 12
def evaluateModel(model, validationData):
    # Compute the RMSE (root mean squared error)
    score = model.predict(validationData.map(lambda x: x.features))
    print(score)
    scoreAndLabels = score.zip(validationData.map(lambda x: x.label))
    print("First 5 items of scoreAndLabels:", scoreAndLabels.take(5))
    metrics = RegressionMetrics(scoreAndLabels)
    RMSE = metrics.rootMeanSquaredError
    return RMSE
Example 13
def main(sc):
    ratings_info = sc.textFile("input/ratings.csv")
    ratings_data = ratings_info.map(split).map(parse).filter(
        lambda line: line is not None)

    training, validation, test = ratings_data.randomSplit([6, 2, 2])
    validation_data = validation.map(lambda x: (x[0], x[1]))
    test_data = test.map(lambda x: (x[0], x[1]))

    ranks = [6, 8, 10, 12, 14]
    iteration = 10

    min_error = float('inf')
    best_rank = -1
    string = ""
    for rank in ranks:
        model = ALS.train(training, rank, iterations=iteration, lambda_=0.1)
        predictions = model.predictAll(validation_data).map(
            lambda r: ((r[0], r[1]), r[2]))
        ratings = validation.map(lambda r: ((r[0], r[1]), r[2]))
        preds_and_rates = predictions.join(ratings)
        predsAndratess = preds_and_rates.map(lambda tup: tup[1])
        metrics = RegressionMetrics(predsAndratess)
        error = metrics.rootMeanSquaredError

        string += "For rank " + str(rank) + "the RMSE is " + str(error) + "\n"
        if error < min_error:
            min_error = error
            best_rank = rank

    string += "The best model was trained with rank " + str(best_rank) + "\n"

    model = ALS.train(training, best_rank, iterations=iteration, lambda_=0.1)
    predictions = model.predictAll(test_data).map(lambda r:
                                                  ((r[0], r[1]), r[2]))
    ratings = test.map(lambda r: ((r[0], r[1]), r[2]))
    preds_and_rates = predictions.join(ratings)
    predsAndratess = preds_and_rates.map(lambda tup: tup[1])
    metrics = RegressionMetrics(predsAndratess)
    error = metrics.rootMeanSquaredError

    string += "The RMSE for Test data is " + str(error) + "\n"
    print(string)
Example 14
def evaluate_model():
    """
    will read train and test files from jan and Feb 2017 to evaluate model
    prints validation and test set error metrics to logs
    :return: None
    """
    ml_model = train_model.EnsembleModel()
    df_raw_train_filepath = os.path.join(setting.data_dir_interim,
                                         setting.raw_train_filename)
    df_raw_test_filepath = os.path.join(setting.data_dir_interim,
                                        setting.raw_test_filename)
    logger.info("using data from {} for training and validation".format(
        df_raw_train_filepath))
    logger.info("using data from {} for testing".format(df_raw_test_filepath))
    df_raw_train = spark.read.parquet(df_raw_train_filepath)
    df_raw_test = spark.read.parquet(df_raw_test_filepath)

    train_frac = 0.75
    test_frac = (1 - train_frac)

    df_raw_train, df_raw_val = df_raw_train.randomSplit(
        [train_frac, test_frac])

    df_train = build_features.featurize(df_raw_train)

    ml_model = ml_model.fit(df_train)

    _, val_predictions = ml_model.transform(
        build_features.featurize(df_raw_val))
    _, test_predictions = ml_model.transform(
        build_features.featurize(df_raw_test))

    val_prediction_labels = val_predictions.select("tip_amount",
                                                   "prediction").rdd
    val_metrics = RegressionMetrics(val_prediction_labels)
    test_prediction_labels = test_predictions.select("tip_amount",
                                                     "prediction").rdd
    test_metrics = RegressionMetrics(test_prediction_labels)

    logger.info("Validation set RMSE = {}".format(
        val_metrics.rootMeanSquaredError))
    logger.info("Test set RMSE = {}".format(
        test_metrics.rootMeanSquaredError))
Example 15
    def report_accuracy(result_rdd):
        from pyspark.mllib.evaluation import RegressionMetrics

        if not result_rdd.isEmpty():
            metrics = RegressionMetrics(
                result_rdd.map(lambda t: (float(t[1]), float(t[0]))))
            print("MSE = %s" % metrics.meanSquaredError)
            print("RMSE = %s" % metrics.rootMeanSquaredError)
            print("R-squared = %s" % metrics.r2)
            print("MAE = %s" % metrics.meanAbsoluteError)
            print("Explained variance = %s" % metrics.explainedVariance)
Example 16
def taxi_regression(sc, filename):
    '''
    Args:
        sc: The Spark Context
        filename: Filename of the taxi trips CSV file to use, where each line represents a trip
    '''

    sqlContext = SQLContext(sc)
    df = sqlContext.read.load(filename, 
                          format='com.databricks.spark.csv', 
                          header='true', 
                          inferSchema='true').sample(False, 0.001)

    # Limit data: longitude between -74.05 and -73.75, latitude between 40.6 and 40.9
    df = df.filter((df.pickup_longitude < -73.75) & (df.pickup_longitude > -74.05) & (df.dropoff_longitude < -73.75) & (df.dropoff_longitude > -74.05))
    df = df.filter((df.pickup_latitude < 40.9) & (df.pickup_latitude > 40.6) &(df.dropoff_latitude < 40.9) & (df.dropoff_latitude > 40.6))

    labeled_rdd = df.rdd.map(lambda x: get_labeled_point(x))
    labeled_rdd = labeled_rdd.filter(lambda row: row[0] > 0.0 and row[0] < 60 and row[1][0] < 0.21 and row[1][0] > 0.0)

    # For Graphing:
    # df = labeled_rdd.map(lambda row: (row[0], row[1][0])).toDF()
    # df.write.format("com.databricks.spark.csv").option("header", "true").save("distance_fare.csv")



    labeled_rdd = labeled_rdd.map(lambda row: LabeledPoint(row[0], row[1]))

    # Build model
    training_data, test_data = labeled_rdd.randomSplit([0.8, 0.2])
    model = LinearRegressionWithSGD.train(training_data, intercept=True)

    valuesAndPredsTraining = training_data.map(lambda p: (float(model.predict(p.features)), p.label))
    valuesAndPreds = test_data.map(lambda p: (float(model.predict(p.features)), p.label))

    trainingMetrics = RegressionMetrics(valuesAndPredsTraining)
    metrics = RegressionMetrics(valuesAndPreds)

    # for row in labeled_rdd.collect():
    #     print("distance: " + str(row.features) + " actual fare: " + str(row.label) + " predicted fare: " + str(model.predict(row.features)))
    print("RMSE = ", metrics.rootMeanSquaredError,
          " Explained Variance = ", metrics.explainedVariance,
          " RMSE Training = ", trainingMetrics.rootMeanSquaredError)
Example 17
def amazon_regression(sc, filename):
    '''
    Args:
        sc: The Spark Context
        filename: Filename of the Amazon reviews file to use, where each line represents a review    
    '''

    reviews = sc.textFile(filename).sample(False, 0.0001)
    reviews = reviews.map(lambda x: loadcsv(x))
    reviews = reviews.filter(lambda x: x is not None)
    labels = reviews.map(lambda x: x[0])
    reviews = (reviews.map(lambda x: (float(x[0]), x[1])).mapValues(
        lambda x: x.split()))
    # Feed HashingTF the array of words
    tf = HashingTF().transform(reviews.map(lambda x: x[1]))
    # Pipe term frequencies into the IDF
    idf = IDF(minDocFreq=5).fit(tf)
    # Transform the IDF into a TF-IDF
    tfidf = idf.transform(tf)
    parsedData = (labels.zip(tfidf).map(lambda x: LabeledPoint((x[0]), x[1])))
    training, test = parsedData.randomSplit([0.5, 0.5])
    # Build the model
    model = LinearRegressionWithSGD.train(training,
                                          iterations=10000,
                                          step=0.000000001)

    # Evaluate the model on training data
    valuesAndPreds = (training.map(lambda x: x.label).zip(
        model.predict(training.map(lambda x: x.features))))

    # Evaluate the model on test data
    vap = (test.map(lambda x: x.label).zip(
        model.predict(test.map(lambda x: x.features))))
    trained_metrics = RegressionMetrics(valuesAndPreds.mapValues(float))
    train_rootMeanSquaredError = trained_metrics.rootMeanSquaredError
    train_explainedVariance = trained_metrics.explainedVariance
    test_metrics = RegressionMetrics(vap.mapValues(float))
    test_rootMeanSquaredError = test_metrics.rootMeanSquaredError
    test_explainedVariance = test_metrics.explainedVariance
Example 18
def evaluateModel(model, validationData):
    '''
    Evaluate the model's accuracy using RMSE (root mean squared error).
    :param model:
    :param validationData:
    :return: RMSE
    '''
    score = model.predict(validationData.map(lambda p: p.features))
    scoreAndLabels = score.zip(validationData.map(lambda p: p.label))
    metrics = RegressionMetrics(scoreAndLabels)
    RMSE = metrics.rootMeanSquaredError
    return RMSE
Example 19
    def evaluateRegression(self, scoreAndLabels):

        metrics = RegressionMetrics(scoreAndLabels)

        result = {}

        result['MAE'] = metrics.meanAbsoluteError
        result['MSE'] = metrics.meanSquaredError
        result['RMSE'] = metrics.rootMeanSquaredError
        result['R-squared'] = metrics.r2

        return result
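A hedged usage sketch for evaluateRegression: the enclosing class is not shown, so evaluator stands in for an instance, sc for an active SparkContext, and the (prediction, label) pairs are invented toy values:

# Hypothetical call site for the method above.
score_and_labels = sc.parallelize([(2.5, 3.0), (0.0, -0.5), (2.1, 2.0)])
result = evaluator.evaluateRegression(score_and_labels)
print(result['RMSE'], result['R-squared'])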
Example 20
def tql_regression(sc, filename):
    # load in csv
    records = sc.textFile(filename).sample(False, 0.1, 16)
    p_records = records.flatMap(lambda l: process_record(l))

    prices = p_records.keys()
    attrs = p_records.values()

    labeled_points = prices.zip(attrs).map(lambda x: LabeledPoint(x[0], x[1]))
    print(labeled_points.take(10))

    training, test = labeled_points.randomSplit([0.8, 0.2])
    #model = RidgeRegressionWithSGD.train(training, iterations=200)
    model = LinearRegressionWithSGD.train(training,
                                          iterations=200,
                                          regType="l2")

    # Use our model to predict
    train_predicts = model.predict(training.map(lambda x: x.features))
    train_preds = training.map(lambda x: x.label).zip(train_predicts)
    test_preds = test.map(lambda x: x.label).zip(
        model.predict(test.map(lambda x: x.features)))

    print(train_preds.take(10))
    print(test_preds.take(10))

    # Ask PySpark for some metrics on how our model predictions performed
    trained_metrics = RegressionMetrics(
        train_preds.map(lambda x: (x[0], float(x[1]))))
    test_metrics = RegressionMetrics(
        test_preds.map(lambda x: (x[0], float(x[1]))))

    print("___________________trained RMSE",
          trained_metrics.rootMeanSquaredError)
    print("___________________trained EV", trained_metrics.explainedVariance)

    print("___________________test RMSE", test_metrics.rootMeanSquaredError)
    print("___________________test EV", test_metrics.explainedVariance)

    return 0
Example 21
def testRegression(train, test):
    # Train a GradientBoostedTrees model.

    gbt = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel")

    model = gbt.fit(train)
    # DataFrames have no .map in Spark 2+; convert to an RDD first.
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .rdd.map(lambda x: (x.prediction, x.indexedLabel))

    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)
Example 22
def alsModelEvaluate(model, testing_rdd):
    # Predict ratings for the test data set
    predict_rdd = model.predictAll(testing_rdd.map(lambda r: (r[0], r[1])))
    predict_actual_rdd = predict_rdd.map(lambda r: ((r[0], r[1]), r[2])) \
        .join(testing_rdd.map(lambda r: ((r[0], r[1]), r[2])))

    # Create the evaluation metrics instance
    metrics = RegressionMetrics(predict_actual_rdd.map(lambda pr: pr[1]))

    #print("MSE = %s" % metrics.meanSquaredError)
    #print("RMSE = %s" % metrics.rootMeanSquaredError)

    # Return the root mean squared error
    return metrics.rootMeanSquaredError
Example 23
def testRegression(train, test):
    # Train a RandomForest model.
    # Note: Use larger numTrees in practice.

    rf = RandomForestRegressor(labelCol="indexedLabel", numTrees=3, maxDepth=4)

    model = rf.fit(train)
    # DataFrames have no .map in Spark 2+; convert to an RDD first.
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .rdd.map(lambda x: (x.prediction, x.indexedLabel))

    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)
Example 24
def Print_regression_info(xy_predict):
    '''
    Print regression-related metrics.
    xy_predict: the dataset containing model predictions
    '''
    predict_and_target_rdd = xy_predict.rdd.map(
        lambda row: (float(row.prediction), float(row.label)))
    metrics = RegressionMetrics(predict_and_target_rdd)

    print('----------------------------------------------')
    print("MSE: %s" % metrics.meanSquaredError)
    print("RMSE: %s" % metrics.rootMeanSquaredError)
    print("R-squared: %s" % metrics.r2)
    print("MAE: %s" % metrics.meanAbsoluteError)
    print('----------------------------------------------')
Example 25
def performRegression(data, params):
	training, test = data.randomSplit([.8, .2])
	
	lasso = performLasso(training)
	linReg = performLinearRegression(training)
	ridgeReg = performRidgeRegression(training)
	
	lassoTest = (test.map(lambda x: x.label).zip(lasso.predict(test.map(lambda x: x.features))))
	linTest = (test.map(lambda x: x.label).zip(linReg.predict(test.map(lambda x: x.features))))
	ridgeTest = (test.map(lambda x: x.label).zip(ridgeReg.predict(test.map(lambda x: x.features))))
	
	lassoMetrics = RegressionMetrics(lassoTest.map(lambda x: (x[0], float(x[1]))))
	linMetrics = RegressionMetrics(linTest.map(lambda x: (x[0], float(x[1]))))
	ridgeMetrics = RegressionMetrics(ridgeTest.map(lambda x: (x[0], float(x[1]))))
	
	lassoValue = lassoMetrics.rootMeanSquaredError
	linRegValue = linMetrics.rootMeanSquaredError
	ridgeRegValue = ridgeMetrics.rootMeanSquaredError
	
	if(lassoValue < linRegValue and lassoValue < ridgeRegValue):
		return "lasso"
	if(linRegValue < lassoValue and linRegValue < ridgeRegValue):
		return "linear"
	return "ridge"
Example 26
    def evaluateRddRegressionModel(self):
        # Get predictions
        valuesAndPreds = self.getRddPredictionsLabels(self._get_rddModel(),
                                                      self._get_rddTest())
        # Instantiate metrics object
        metrics = RegressionMetrics(valuesAndPreds)
        # Squared Error
        print("MSE = %s" % metrics.meanSquaredError)
        print("RMSE = %s" % metrics.rootMeanSquaredError)
        # R-squared
        print("R-squared = %s" % metrics.r2)
        # Mean absolute error
        print("MAE = %s" % metrics.meanAbsoluteError)
        # Explained variance
        print("Explained variance = %s" % metrics.explainedVariance)
Example 27
    def evaluate_prediction(self, test_data, test_target):
        """
        Evaluates the performance of the prediction model by printing out MSE
        and other model related data.
        @test_data: The test data
        @test_target: The labels of the test data.
        @return: Each predicted value in pair with the true value
        """
        predictions = self.predict(test_data)
        rf_predicted_values = test_target.zip(
            predictions.map(lambda x: float(x)))
        metrics_rf = RegressionMetrics(rf_predicted_values)
        self._log.info('Random Forest predictions: {}'.format(
            str(rf_predicted_values.take(5))))
        self._log.info('TestSet MSE = {}'.format(metrics_rf.meanSquaredError))
        return rf_predicted_values
Example 28
def bestmodel(traindata, validatedata):
    bestValidationRmse = float("inf")
    #map validate data to userId, movieId
    validation = validatedata.map(lambda r: (r[0], r[1]))
    #get actual rating data for pairs of userId, movieId
    ratingTuple = validatedata.map(lambda r:
                                   ((int(r[0]), int(r[1])), float(r[2])))
    for rank in ranks:
        #create model by train data
        model = ALS.train(traindata, rank, numIterations, lambda_=regulz_para)
        #predict ratings for validation data
        predictions = model.predictAll(validation).map(lambda r:
                                                       ((r[0], r[1]), r[2]))
        #create predict and actual ratings
        scoreAndLabels = predictions.join(ratingTuple).map(lambda tup: tup[1])

        regMetrics = RegressionMetrics(scoreAndLabels)
        RMSE = regMetrics.rootMeanSquaredError
        MSE = regMetrics.meanSquaredError

        print("For rank %s:" % rank)
        print("RMSE = %s" % RMSE)
        print("MSE = %s" % MSE)

        if RMSE < bestValidationRmse:
            bestValidationRmse = RMSE
            best_rank = rank

    print('The best model was trained with rank %s' % best_rank)

    #MAP:
    #actual top 10 movie sequence for users by rating
    model = ALS.train(traindata, best_rank, numIterations, lambda_=regulz_para)
    actual_user_movie = validatedata.map(lambda x:
                                         (x[0], (x[1], x[2]))).groupByKey()
    actual_user_movie1 = actual_user_movie.map(order_movies)
    predict_user_movie = model.predictAll(validation).map(
        lambda r: (r[0], (r[1], r[2]))).groupByKey()
    predict_user_movie1 = predict_user_movie.map(order_movies)
    movie_seq = predict_user_movie1.join(actual_user_movie1).map(
        lambda x: x[1])
    movie_seq = movie_seq.map(movie_index)
    rankMetrics = RankingMetrics(movie_seq)
    MAP = rankMetrics.meanAveragePrecision
    print("MAP = %s" % MAP)
Example 29
def main(sc):
    ratings_info = sc.textFile("input/ratings.csv")
    ratings_data = ratings_info.map(split).map(parse).filter(
        lambda line: line != None)

    fold1, fold2, fold3, fold4, fold5 = ratings_data.randomSplit(
        [0.2, 0.2, 0.2, 0.2, 0.2])
    folds = [fold1, fold2, fold3, fold4, fold5]

    rank = 12
    itr = 25
    mse = 0
    rmse = 0
    map_sum = 0  # avoid shadowing the builtin map()
    for i in range(5):
        test_data = folds[i]
        train_data = sc.emptyRDD()
        for j in range(5):
            if i == j:
                continue
            else:
                train_data = train_data.union(folds[j])

        model = ALS.train(train_data, rank, iterations=itr, lambda_=0.1)
        testdata = test_data.map(lambda p: (p[0], p[1]))
        predictions = model.predictAll(testdata).map(lambda r:
                                                     ((r[0], r[1]), r[2]))
        rates = test_data.map(lambda r: ((r[0], r[1]), r[2]))
        predsAndlabels = predictions.join(rates).map(lambda tup: tup[1])
        actual_rating = predsAndlabels.map(lambda r: r[1]).collect()
        predicted_rating = predsAndlabels.map(lambda r: r[0]).collect()
        predAndReal = sc.parallelize([(predicted_rating, actual_rating)])

        metrics = RegressionMetrics(predsAndlabels)
        metric = RankingMetrics(predAndReal)
        mse += metrics.meanSquaredError
        rmse += metrics.rootMeanSquaredError
        map_sum += metric.meanAveragePrecision

    k_mse = mse / 5.0
    k_rmse = rmse / 5.0
    k_map = map_sum / 5.0
    print("MSE = %s" % k_mse)
    print("RMSE = %s" % k_rmse)
    print("MAP = %s" % k_map)
Example 30
def get_val_metrics(model, val):
    preds = model.transform(val)
    recs = model.recommendForUserSubset(val, 500)
    
    top_items = recs.selectExpr('user as user', 'recommendations.item as top_items')
    true_items = val.where(val.rating >= 3).groupby('user').agg(collect_list('item').alias('true_item_list'))
    predictions_and_labels_rankings = top_items.join(true_items, how = 'inner', on = 'user')\
        .select('true_item_list', 'top_items')
    
    predictions_and_labels_rankings.write.json('val_recs.json')
    
    ranking_metrics = RankingMetrics(predictions_and_labels_rankings.cache().rdd)
    prec_at = ranking_metrics.precisionAt(500)
    mean_avg_prec = ranking_metrics.meanAveragePrecision
    ndcg = ranking_metrics.ndcgAt(500)
    
    # Two equivalent ways to get the RMSE; the evaluator result is the one
    # returned. RegressionMetrics expects (prediction, observation) pairs.
    rmse = RegressionMetrics(preds.select('prediction', 'rating').cache().rdd).rootMeanSquaredError
    evaluator = RegressionEvaluator(predictionCol = 'prediction', labelCol = 'rating', metricName = 'rmse')
    rmse = evaluator.evaluate(preds)
    return rmse, prec_at, mean_avg_prec, ndcg
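Example 30 computes the same RMSE twice, once with the RDD-based RegressionMetrics and once with the DataFrame-based RegressionEvaluator; for pyspark.ml pipelines the evaluator alone suffices. A minimal sketch, assuming an active SparkSession named spark, with toy data invented for illustration:

from pyspark.ml.evaluation import RegressionEvaluator

# Toy (label, prediction) rows for illustration only.
preds = spark.createDataFrame(
    [(3.0, 2.5), (-0.5, 0.0), (2.0, 2.1), (7.0, 7.8)],
    ["rating", "prediction"])

evaluator = RegressionEvaluator(predictionCol="prediction",
                                labelCol="rating", metricName="rmse")
print("RMSE = %s" % evaluator.evaluate(preds))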