Example 1
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator


def gbtRegression(df, conf):
    """
        input  : df [spark.dataframe], conf [configuration params]
        output : gradient-boosted tree regression model [model]
    """
    featuresCol = conf["params"].get("featuresCol")
    labelCol = conf["params"].get("labelCol")
    predictionCol = conf["params"].get("predictionCol")
    impurity = conf["params"].get("impurity", "variance")

    maxDepth = conf["params"].get("maxDepth", 5)
    maxIter = conf["params"].get("maxIter", 20)
    maxBins = conf["params"].get("maxBins", 32)
    minInstancesPerNode = conf["params"].get("minInstancesPerNode", 1)
    minInfoGain = conf["params"].get("minInfoGain", 0.0)
    maxMemoryInMB = conf["params"].get("maxMemoryInMB", 256)
    cacheNodeIds = conf["params"].get("cacheNodeIds", False)
    subsamplingRate = conf["params"].get("subsamplingRate", 1.0)
    checkpointInterval = conf["params"].get("checkpointInterval", 10)
    lossType = conf["params"].get("lossType", "squared")
    seed = conf["params"].get("seed", None)
    
    # featureIndexer (e.g. a VectorIndexer producing 'indexedFeatures') is assumed to be defined elsewhere;
    # only maxIter and maxDepth are forwarded here, the remaining params can be passed the same way
    gbt = GBTRegressor(maxIter=maxIter, maxDepth=maxDepth, featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[featureIndexer, gbt])
    
    print ("maxDepth : " , gbt.getMaxDepth())
    print ("maxIter : ", gbt.getMaxIter())
    
    # if ml-tuning is used
    if conf["tuning"]:

        # ml-tuning with cross validation
        if conf["tuning"].get("method").lower() == "crossval":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            # keys are expected to be Param objects (e.g. gbt.maxDepth)
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])

            grid = pg.build()
            folds = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                                evaluator=evaluator, numFolds=folds)
            model = cv.fit(df)

        # ml-tuning with train validation split
        elif conf["tuning"].get("method").lower() == "trainvalsplit":
            paramGrids = conf["tuning"].get("paramGrids")
            pg = ParamGridBuilder()
            for key in paramGrids:
                pg.addGrid(key, paramGrids[key])

            grid = pg.build()
            tr = conf["tuning"].get("methodParam")
            evaluator = RegressionEvaluator()
            tvs = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=grid,
                                       evaluator=evaluator, trainRatio=tr)
            model = tvs.fit(df)

    # no ml-tuning: fit the pipeline directly
    else:
        model = pipeline.fit(df)
          
    return model
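
A minimal usage sketch for the function above (the file path, column names, and conf layout here are assumptions, not part of the original example):

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorIndexer

spark = SparkSession.builder.getOrCreate()
# hypothetical input with a 'features' vector column and a 'label' column
df = spark.read.parquet("input.parquet")

# gbtRegression references featureIndexer from the enclosing module, so it must exist before the call
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)

conf = {
    "params": {"featuresCol": "features", "labelCol": "label",
               "predictionCol": "prediction", "maxDepth": 5, "maxIter": 20},
    "tuning": None,  # or e.g. {"method": "crossval", "paramGrids": {...}, "methodParam": 3}
}

model = gbtRegression(df, conf)
model.transform(df).select("prediction").show(5)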
Example 2
import mlflow
import mlflow.spark
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

estimator = GBTRegressor(featuresCol='features',
                         labelCol='Sales',
                         predictionCol='prediction')

# 'assembler' (typically a VectorAssembler producing the 'features' column),
# 'train_df' and 'test_df' are assumed to be defined earlier
pipeline_obj = Pipeline(stages=[assembler, estimator])

mlflow.set_tracking_uri('http://localhost:5000/')
print("mlflow tracking_uri: " + mlflow.tracking.get_tracking_uri())

mlflow.set_experiment("Advertising Regression Online Lesson")

with mlflow.start_run(run_name="spark-advertising-gbt-regressor") as run:
    # Log params:
    mlflow.log_param("min_info_gain", estimator.getMinInfoGain())
    mlflow.log_param("max_depth", estimator.getMaxDepth())
    mlflow.log_param("max_bins", estimator.getMaxBins())
    mlflow.log_param("step_size", estimator.getStepSize())

    # Log the model while training
    pipelineModel = pipeline_obj.fit(train_df)
    mlflow.spark.log_model(pipelineModel, "model")

    # Log metrics: RMSE and R2
    predDF = pipelineModel.transform(test_df)
    regressionEvaluator = RegressionEvaluator(predictionCol="prediction",
                                              labelCol="Sales")
    rmse = regressionEvaluator.setMetricName("rmse").evaluate(predDF)
    r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
    mlflow.log_metrics({"rmse": rmse, "r2": r2})
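
The pipeline logged above can be reloaded later for scoring; a short sketch, assuming the run object from the block above is still in scope:

# Reload the logged Spark pipeline from the MLflow run and score the test set
logged_model_uri = "runs:/{}/model".format(run.info.run_id)
loaded_model = mlflow.spark.load_model(logged_model_uri)
loaded_model.transform(test_df).select("Sales", "prediction").show(5)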