def fitVar(p_lag, dataFrame, model_path=None):
    """Fit (or load) one lagged linear regression per series and score it.

    Every column of ``dataFrame`` except ``TimeStamp`` is treated as one
    series of a VAR-style model: each series is regressed, with an intercept,
    on the first ``p_lag`` lags of all series, with rows ordered by
    ``TimeStamp``.

    Args:
        p_lag: Number of lags used as regressors. The first ``p_lag`` rows
            (in ``TimeStamp`` order) are dropped because their lagged values
            are only the fill value ``0``.
        dataFrame: Spark DataFrame holding a ``TimeStamp`` column plus the
            series columns. ``RT_Temp`` and ``Nu_Temp`` must be present
            because the result is assembled from their predictions.
        model_path: ``None`` to fit a new ``Pipeline`` for every series;
            otherwise a path prefix, in which case the model for series
            ``name`` is loaded from ``model_path + name``.

    Returns:
        A DataFrame with the columns ``TS``, ``RT_Temp``, ``RT_Temp_Predict``,
        ``Nu_Temp``, ``NU_Temp_Predict``, ``MAE_Score``, ``MSE_Score`` and
        ``RMSE_Score``.

    Raises:
        KeyError: If ``RT_Temp`` or ``Nu_Temp`` is not a column of
            ``dataFrame``.
    """
    # Imported locally so the function does not depend on the file-level
    # import list also providing row_number.
    from pyspark.sql.functions import row_number

    time_col = 'TimeStamp'
    x_list = dataFrame.columns
    # The time column only orders the rows; it is never a regressor or target.
    series_names = [name for name in x_list if name != time_col]

    # Fail before any model is fitted: the final join needs both series.
    missing = [name for name in ('RT_Temp', 'Nu_Temp')
               if name not in series_names]
    if missing:
        raise KeyError(
            "fitVar requires column(s) {} in dataFrame".format(missing))

    # Single un-partitioned window over the whole frame, ordered by time.
    w = Window.orderBy(col(time_col))

    # One lag column per (lag, series) pair, lag-major, named "<series>t-<lag>".
    # This order is also the order of the assembled feature vector.
    lag_names = []
    lag_columns = []
    for step in range(1, p_lag + 1):
        for name in series_names:
            lag_name = "%st-%s" % (name, step)
            lag_names.append(lag_name)
            lag_columns.append(
                lag(dataFrame[name], step, 0).over(w).alias(lag_name))

    # row_number() is consecutive (1, 2, 3, ...) within the window, unlike
    # monotonically_increasing_id(), so it can safely identify the first
    # p_lag rows. It shares the window with the lags so both see one ordering.
    row_index = row_number().over(w).alias("rid")
    dataFrame = (
        dataFrame.select(["*"] + lag_columns + [row_index])
        .filter(col("rid") > p_lag)
        .drop("rid")
    )

    # Assemble the lagged columns into the vector MLlib regression expects.
    assembler_for_lag = VectorAssembler(
        inputCols=lag_names, outputCol="features")

    evaluator = RegressionEvaluator()
    evaluator.setPredictionCol("prediction")

    predictions = {}
    for select_y in series_names:
        if model_path is None:
            # No stored model: train one for this series on the current data.
            lr = LinearRegression(featuresCol='features', labelCol=select_y,
                                  maxIter=1000, fitIntercept=True)
            pipeline = Pipeline(stages=[assembler_for_lag, lr])
            model_val = pipeline.fit(dataFrame)
        else:
            model_val = PipelineModel.load(model_path + select_y)

        # Build the scored frame once and reuse it for every metric.
        scored = model_val.transform(dataFrame)

        # NOTE(review): rmse/mse/mae are overwritten on every iteration, so
        # the scores attached to the result describe only the last series in
        # column order -- confirm whether per-series scores were intended.
        evaluator.setLabelCol(select_y)
        evaluator.setMetricName('rmse')
        rmse = evaluator.evaluate(scored)
        evaluator.setMetricName('mse')
        mse = evaluator.evaluate(scored)
        evaluator.setMetricName('mae')
        mae = evaluator.evaluate(scored)

        predictions[select_y] = scored

    df_RT_Temp = predictions['RT_Temp']
    df_Nu_Temp = predictions['Nu_Temp']

    # Line up both series' predictions on their shared timestamp.
    df_final = (
        df_RT_Temp.alias('dr')
        .join(df_Nu_Temp.alias('dn'),
              on=df_RT_Temp['TimeStamp'] == df_Nu_Temp['TimeStamp'],
              how='inner')
        .selectExpr('dr.TimeStamp as TS',
                    'dr.RT_Temp',
                    'dr.prediction as RT_Temp_Predict',
                    'dn.Nu_Temp as Nu_Temp',
                    'dn.prediction as NU_Temp_Predict')
    )

    df_final = df_final.withColumn("MAE_Score", lit(mae))
    df_final = df_final.withColumn("MSE_Score", lit(mse))
    df_final = df_final.withColumn("RMSE_Score", lit(rmse))
    return df_final
# Titanic survival example: index and one-hot encode the categorical columns,
# fit a logistic regression in a Pipeline, and report the area under the ROC
# curve on a held-out split.

# findspark.init() stays ahead of the pyspark imports, as in the original
# ordering; it is what makes the Spark installation importable.
import findspark

# Raw string: "\S" is not a valid escape sequence, so the plain literal only
# worked by accident and triggers a warning on recent Python versions.
findspark.init(r"D:\Spark")

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Not used by this script any more, but fitVar in this file relies on it.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import (VectorAssembler, OneHotEncoder, StringIndexer)
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('myproj').getOrCreate()

data = spark.read.csv('titanic.csv', inferSchema=True, header=True)
data.printSchema()
print(data.columns)

# Keep only the label and the columns used as features, then drop every row
# that has a null in any of them.
my_cols = data.select(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
                       'Fare', 'Embarked'])
my_final_data = my_cols.na.drop()

# Categorical string columns: string -> index -> one-hot vector.
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_encoder = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkIndex')
embark_encoder = OneHotEncoder(inputCol='EmbarkIndex', outputCol='EmbarkVec')

assembler = VectorAssembler(
    inputCols=['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare',
               'EmbarkVec'],
    outputCol='features')

log_reg_titanic = LogisticRegression(featuresCol='features',
                                     labelCol='Survived')

pipeline = Pipeline(stages=[gender_indexer, embark_indexer,
                            gender_encoder, embark_encoder,
                            assembler, log_reg_titanic])

# 60/40 random split; no seed is set, so the split differs between runs.
train_titanic_data, test_titanic_data = my_final_data.randomSplit([0.6, 0.4])

fit_model = pipeline.fit(train_titanic_data)
results = fit_model.transform(test_titanic_data)

# A regression evaluator would report RMSE here, not AUC. Use the binary
# classification evaluator on the model's raw scores so that the value
# stored in AUC really is the area under the ROC curve.
my_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                        labelCol='Survived',
                                        metricName='areaUnderROC')

results.select('Survived', 'prediction').show()

AUC = my_eval.evaluate(results)
print(AUC)