def build_gradient_boosted_tree_regression(observation_df, feature_columns,
                                           seed=None, max_iter=15):
    """Fit a gradient-boosted tree regressor for ``duration_sec`` and print test metrics.

    The observations are split 70/30 into train and test sets, a
    ``GBTRegressor`` is fitted on the train set, and the first five test
    predictions plus the RMSE and R2 on the test set are printed.

    Args:
        observation_df: Spark DataFrame holding the observations; it must
            contain the ``duration_sec`` label column.
        feature_columns: Names of the columns to use as model features.
        seed: Optional seed for the train/test split. With the default
            ``None`` the split is random on every call (the original
            behaviour); pass an integer to make the split, and therefore
            the reported metrics, reproducible.
        max_iter: Number of boosting iterations passed to ``GBTRegressor``
            as ``maxIter``. Defaults to 15, the previously hard-coded value.

    Returns:
        The fitted model returned by ``GBTRegressor.fit``.
    """
    # create_feature_column is defined elsewhere in the project; presumably it
    # assembles feature_columns into a 'features' vector column and keeps only
    # the listed columns -- verify against the helper.
    vector_observation_df = create_feature_column(
        observation_df, feature_columns, ['features', 'duration_sec'])

    train_df, test_df = vector_observation_df.randomSplit([0.7, 0.3], seed=seed)

    # Keep the unfitted estimator and the fitted model under separate names.
    estimator = GBTRegressor(featuresCol="features", labelCol="duration_sec",
                             maxIter=max_iter)
    model = estimator.fit(train_df)

    test_predictions = model.transform(test_df)
    test_predictions.select("prediction", "duration_sec", "features").show(5)

    # One evaluator per metric; the printed messages are unchanged.
    metric_messages = (
        ("rmse", "RMSE on test data = %g"),
        ("r2", "R2 on test data = %g"),
    )
    for metric_name, message in metric_messages:
        evaluator = RegressionEvaluator(predictionCol='prediction',
                                        labelCol="duration_sec",
                                        metricName=metric_name)
        print(message % evaluator.evaluate(test_predictions))

    return model
outputCol='features')  # NOTE(review): closes a call that is opened before this excerpt

# Run the assembler over the train and evaluation frames and keep only the
# label column and the assembled 'features' column.
train_LP = assembler.transform(train).select(['label', 'features'])
evaluation_LP = assembler.transform(evaluation).select(['label', 'features'])

# Define the model algorithm (gradient boosted tree)
model_regresion = GBTRegressor(labelCol="label", featuresCol="features", maxDepth=13, maxBins=64, maxIter=10)

# Fit the model
# (model_regresion is rebound from the unfitted estimator to the result of fit)
model_regresion = model_regresion.fit(train_LP)

# Make predictions.
predictions = model_regresion.transform(evaluation_LP)

# Score the predictions with RMSE. NOTE(review): the message says "test data",
# but the frame being scored is evaluation_LP.
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# NOTE(review): the next line looks like a Databricks notebook cell separator; left verbatim.
# COMMAND ----------

# Build a vector column from the feature columns: every column of `test`
# except the ones listed in `ignore` is fed to the assembler.
ignore = ["label"]
assembler = VectorAssembler(
    inputCols=[x for x in test.columns if x not in ignore],
    outputCol='features')