# Train a 100-tree random-forest regressor and report its test metrics.
# NOTE(review): assumes train_data / test_data carry the default
# "features" / "label" columns — confirm upstream.
model = RandomForestRegressor(numTrees=100)
model = model.fit(train_data)

# model evaluation

# Feature importances of the fitted model.  BUG FIX: this (and the
# evaluator results below) were bare expressions whose values are silently
# discarded outside a notebook — print them explicitly.
print(model.featureImportances)

from pyspark.ml.evaluation import RegressionEvaluator

test_results = model.transform(test_data)

# RegressionEvaluator defaults to metricName="rmse".
evaluator = RegressionEvaluator()

print('RMSE')
print(evaluator.evaluate(test_results))

print('R_sqr')
print(evaluator.evaluate(test_results, {evaluator.metricName: "r2"}))

print('MAE')
# Random-forest regression: fit on train_LP, score evaluation_LP, report RMSE.
model_regresion = RandomForestRegressor(
    labelCol="label",
    featuresCol="features",
    maxDepth=11,
    maxBins=64,
    numTrees=10,
)

# Fit the model on the labeled-point training frame.
model_regresion = model_regresion.fit(train_LP)

# Persisting the fitted model is optional:
# model_multiclase.save("dbfs:/dataset/modelo_multiclase_RF")

# Score the evaluation split and measure the prediction error.
predictions = model_regresion.transform(evaluation_LP)
evaluator = RegressionEvaluator(labelCol="label",
                                predictionCol="prediction",
                                metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# COMMAND ----------

# Assemble every column of `test` except the label into one 'features' vector.
ignore = ["label"]
feature_columns = [column for column in test.columns if column not in ignore]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
# Drop rows with missing values before scoring.
df_testing = df_testing.dropna()

# Feature assembly + random-forest regressor, chained in a single Pipeline.
assembler = VectorAssembler(
    inputCols=['Latitude', 'Longitude', 'Depth'],
    outputCol='features',
)
model = RandomForestRegressor(featuresCol='features', labelCol='Magnitude')
pipeline = Pipeline(stages=[assembler, model])

# Fit on the training split, then score the held-out split.
model = pipeline.fit(df_training)
pred = model.transform(df_testing)

# RMSE of the predicted magnitudes.
evaluator = RegressionEvaluator(labelCol='Magnitude',
                                predictionCol='prediction',
                                metricName='rmse')
rmse = evaluator.evaluate(pred)

# Prediction dataset: keep the coordinates plus the renamed prediction column.
df_pred_results = pred.select('Latitude', 'Longitude', 'prediction') \
                      .withColumnRenamed('prediction', 'Pred_Magnitude')
# add more column to df_pred_results
from pyspark.ml.regression import LinearRegression

# --- Linear-regression baseline on the 'total' target ------------------------
lin_reg = LinearRegression(labelCol='total')
lr_model = lin_reg.fit(df_train)
print(lr_model.coefficients, '\n', lr_model.intercept)

# .evaluate returns a summary object exposing r2 / meanAbsoluteError.
train_prediction = lr_model.evaluate(df_train)
print(train_prediction.r2, train_prediction.meanAbsoluteError)
test_prediction = lr_model.evaluate(df_test)
print(test_prediction.r2, test_prediction.meanAbsoluteError)
test_prediction.predictions.show(3)

# --- Random-forest regressor on the same target -------------------------------
from pyspark.ml.regression import RandomForestRegressor

rf_model = RandomForestRegressor(featuresCol='features', labelCol='total',
                                 numTrees=100).fit(df_train)
predictions = rf_model.transform(df_test)
predictions.show()

# BUG FIX: featureImportances was a bare expression (value discarded outside
# a notebook) — print it so the script reports it.
print(rf_model.featureImportances)

from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.ml.evaluation import RegressionEvaluator

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="total", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# BUG FIX: rf_model is a RandomForestRegressionModel, not a PipelineModel,
# so the original `rf_model.stages[1]` raised AttributeError — removed.
print(rf_model)  # summary only