Ejemplo n.º 1
0
def build_gradient_boosted_tree_regression(observation_df, feature_columns,
                                           max_iter=15,
                                           split_weights=(0.7, 0.3),
                                           seed=None):
    """Fit a GBTRegressor on ``duration_sec`` and print hold-out metrics.

    The observations are split into a training and a test set, a
    gradient-boosted tree regressor is fitted on the training part, and the
    test part is used to print five sample predictions followed by the RMSE
    and R2 scores.

    Args:
        observation_df: DataFrame holding the feature columns and the
            ``duration_sec`` label column.
        feature_columns: Names of the columns handed to
            ``create_feature_column`` as model features.
        max_iter: Value passed to ``GBTRegressor`` as ``maxIter``.
        split_weights: Two weights ``(train, test)`` passed to
            ``DataFrame.randomSplit``.
        seed: Optional integer. When given, it seeds both the random split
            and the regressor so a run can be repeated; when ``None`` the
            split is unseeded, exactly as before.

    Returns:
        The fitted model returned by ``GBTRegressor.fit``.
    """
    label_col = "duration_sec"

    # NOTE(review): create_feature_column is defined elsewhere; presumably it
    # assembles `feature_columns` into a 'features' vector column and keeps
    # only the columns listed in the last argument -- confirm.
    vector_observation_df = create_feature_column(observation_df,
                                                  feature_columns,
                                                  ['features', label_col])

    train_df, test_df = vector_observation_df.randomSplit(list(split_weights),
                                                          seed=seed)

    regressor = GBTRegressor(featuresCol="features",
                             labelCol=label_col,
                             maxIter=max_iter)
    if seed is not None:
        # Only override the estimator's own default seed on request.
        regressor.setSeed(seed)

    model = regressor.fit(train_df)

    test_predictions = model.transform(test_df)
    test_predictions.select("prediction", label_col, "features").show(5)

    # One evaluator per metric; the messages are kept verbatim so the printed
    # output is unchanged.
    reports = (("RMSE on test data = %g", "rmse"),
               ("R2 on test data = %g", "r2"))
    for message, metric_name in reports:
        evaluator = RegressionEvaluator(predictionCol='prediction',
                                        labelCol=label_col,
                                        metricName=metric_name)
        print(message % evaluator.evaluate(test_predictions))

    return model
Ejemplo n.º 2
0
    outputCol='features')
# Run the assembler over the training and evaluation sets and keep only the
# label and the assembled features column.
train_LP = assembler.transform(train).select('label', 'features')
evaluation_LP = assembler.transform(evaluation).select('label', 'features')

# Define the model algorithm (gradient boosted tree) and fit it on the
# training set.
model_regresion = GBTRegressor(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    maxDepth=13,
    maxBins=64,
).fit(train_LP)

# Score the evaluation set with the fitted model.
predictions = model_regresion.transform(evaluation_LP)

# Report the root mean squared error of those predictions.
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="label",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % (rmse,))

# COMMAND ----------

# Build the assembler that combines every column of `test`, except the ones
# listed in `ignore`, into a single 'features' vector column.
ignore = ["label"]
feature_names = [column for column in test.columns if column not in ignore]
assembler = VectorAssembler(inputCols=feature_names, outputCol='features')