Python RegressionEvaluator.setPredictionCol Examples

Programming Language: Python

Namespace/Package Name: pyspark.ml.evaluation

Method/Function: setPredictionCol

Examples at hotexamples.com: 2

Python RegressionEvaluator.setPredictionCol - 2 examples found. These are the top-rated real-world Python examples of pyspark.ml.evaluation.RegressionEvaluator.setPredictionCol, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

RegressionEvaluator(30)

setMetricName(7)

setParams(4)

explainParams(2)

setLabelCol(2)

setPredictionCol(2)

copy(1)

evaluator(1)

getLabelCol(1)

getPredictionCol(1)

Example #1

Show file

def fitVar(p_lag, dataFrame, model_path=None):
    """Fit (or load) one lagged linear-regression model per column and score it.

    For every column of ``dataFrame`` other than ``'TimeStamp'``, lag columns
    named ``"<column>t-<i>"`` are appended for ``i = 1 .. p_lag``. Those lag
    columns are assembled into a ``features`` vector, and each non-timestamp
    column is then regressed on that vector.

    Args:
        p_lag: Number of lags to generate per column.
        dataFrame: Spark DataFrame. It must contain a ``'TimeStamp'`` column
            (used for ordering and for the final join) and the columns
            ``'RT_Temp'`` and ``'Nu_Temp'`` (looked up by name at the end).
        model_path: Optional path prefix. When ``None`` a new pipeline is
            fitted for each target column; otherwise a ``PipelineModel`` is
            loaded from ``model_path + <column name>``.

    Returns:
        A DataFrame with the columns ``TS``, ``RT_Temp``, ``RT_Temp_Predict``,
        ``Nu_Temp``, ``NU_Temp_Predict``, ``MAE_Score``, ``MSE_Score`` and
        ``RMSE_Score``.

    Note:
        The three score columns carry the metrics of the *last* target column
        processed in the loop, not one score per target. This mirrors the
        original behaviour.
    """
    # Column names of the input, captured before any lag columns are appended.
    x_list = dataFrame.columns

    # One unpartitioned window ordered by time. The column is spelled
    # 'TimeStamp' like every other reference in this function, so that the
    # lookup does not depend on case-insensitive column resolution.
    w = Window().partitionBy().orderBy(col("TimeStamp"))

    # Append the lag columns. dataFrame[j] addresses the j-th column by
    # position; new columns are appended at the end, so positions 0..len-1
    # keep pointing at the original columns. Missing lags default to 0.
    for i in range(1, p_lag + 1):
        for j, x_name in enumerate(x_list):
            # The timestamp column is an index, not a feature.
            if x_name != 'TimeStamp':
                dataFrame = dataFrame.withColumn("%st-%s" % (x_name, str(i)), lag(dataFrame[j], i, 0).over(w))

    # Drop the first p_lag rows, whose lag values are the 0 default.
    # NOTE(review): this relies on monotonically_increasing_id() producing
    # 0, 1, 2, ... in time order, which presumably holds because the
    # unpartitioned window gathers all rows into one partition - confirm.
    dataFrame = dataFrame.withColumn("rid", monotonically_increasing_id())
    dataFrame = dataFrame.filter(dataFrame.rid >= p_lag)
    dataFrame = dataFrame.drop('rid')

    # Everything that is not an original column is a lag feature.
    original_cols = set(x_list)
    input_feature_name = [name for name in dataFrame.schema.names if name not in original_cols]

    # Assemble the lag columns into the vector expected by MLlib regression.
    assembler_for_lag = VectorAssembler(
        inputCols=input_feature_name,
        outputCol="features")

    evaluator = RegressionEvaluator()
    # The prediction column is the same for every target, so set it once.
    evaluator.setPredictionCol("prediction")

    predictions = {}
    for select_y in x_list:
        if select_y == 'TimeStamp':
            continue

        # Train a model for this batch unless an existing model path is given.
        if model_path is None:
            lr = LinearRegression(featuresCol='features', labelCol=select_y, maxIter=1000,
                                  fitIntercept=True)
            pipeline = Pipeline(stages=[assembler_for_lag, lr])
            model_val = pipeline.fit(dataFrame)
        else:
            model_val = PipelineModel.load(model_path + select_y)

        # Build the scored DataFrame once and reuse it for all metrics and
        # for the returned predictions.
        scored = model_val.transform(dataFrame)

        evaluator.setLabelCol(select_y)
        # Root Mean Square Error
        evaluator.setMetricName('rmse')
        rmse = evaluator.evaluate(scored)
        # Mean Square Error
        evaluator.setMetricName('mse')
        mse = evaluator.evaluate(scored)
        # Mean Absolute Error
        evaluator.setMetricName('mae')
        mae = evaluator.evaluate(scored)

        predictions[select_y] = scored

    df_RT_Temp = predictions['RT_Temp']
    df_Nu_Temp = predictions['Nu_Temp']
    # Both sides derive from the same DataFrame, so the join condition goes
    # through the 'dr'/'dn' aliases to keep the two TimeStamp references
    # unambiguous.
    df_final = (
        df_RT_Temp.alias('dr').join(df_Nu_Temp.alias('dn'), on=col('dr.TimeStamp') == col('dn.TimeStamp'),
                                    how='inner').selectExpr('dr.TimeStamp as TS',
                                                            'dr.RT_Temp',
                                                            'dr.prediction as RT_Temp_Predict',
                                                            'dn.Nu_Temp as Nu_Temp',
                                                            'dn.prediction as NU_Temp_Predict')
    )
    # Scores of the last target processed above (see the docstring note).
    df_final = df_final.withColumn("MAE_Score", lit(mae))
    df_final = df_final.withColumn("MSE_Score", lit(mse))
    df_final = df_final.withColumn("RMSE_Score", lit(rmse))
    return df_final

Example #2

Show file

File: Logistic Regression.py Project: SerkanPolat/ApacheSparkMLlib

import findspark

# Raw string: "\S" is not a valid escape sequence, so the plain literal only
# worked by accident and triggers a warning on newer Python versions. The
# resulting path value is unchanged.
findspark.init(r"D:\Spark")

# The pyspark imports stay after findspark.init(), as in the original script.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import (VectorAssembler, OneHotEncoder, StringIndexer)
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('myproj').getOrCreate()

# Load the Titanic data set and inspect its structure.
data = spark.read.csv('titanic.csv', inferSchema=True, header=True)
data.printSchema()
print(data.columns)

# Keep only the label and the columns used as features, then drop rows with
# missing values.
my_cols = data.select(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
my_final_data = my_cols.na.drop()

# Index and one-hot encode the two string-valued columns.
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex', outputCol='EmbarkVec')

assembler = VectorAssembler(inputCols=['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkVec'],
                            outputCol='features')

log_reg_titanic = LogisticRegression(featuresCol='features', labelCol='Survived')
pipeline = Pipeline(stages=[gender_indexer, embark_indexer, gender_encoder, embark_encoder,
                            assembler, log_reg_titanic])

# Random 60/40 train/test split (unseeded, so results vary between runs).
train_titanic_data, test_titanic_data = my_final_data.randomSplit([0.6, 0.4])
fit_model = pipeline.fit(train_titanic_data)
results = fit_model.transform(test_titanic_data)

# RegressionEvaluator reports regression metrics; its default metric is RMSE,
# not AUC. The metric is set explicitly and the result is named for what it
# is. For an actual area-under-ROC score, a BinaryClassificationEvaluator
# would be the appropriate evaluator.
my_eval = RegressionEvaluator(labelCol='Survived', metricName='rmse')
my_eval.setPredictionCol("prediction")
results.select('Survived', 'prediction').show()
rmse = my_eval.evaluate(results)
print(rmse)