def gbtRegression(df, conf):
    """
    Train a gradient-boosted-tree regression model, optionally with ml-tuning.

    input  : df   [spark.dataframe]  training data
             conf [dict]             {"params": {...}, "tuning": {...} or None}
    output : gbt regression model [model] (a fitted PipelineModel, or a
             CrossValidatorModel / TrainValidationSplitModel when tuning is on)

    Raises ValueError for an unrecognized tuning method (the original left
    `model` unbound and crashed with UnboundLocalError instead).
    """
    # Defaults below match Spark's own GBTRegressor defaults, so an absent key
    # behaves exactly like an unconfigured estimator.
    labelCol = conf["params"].get("labelCol", "label")
    predictionCol = conf["params"].get("predictionCol", "prediction")
    impurity = conf["params"].get("impurity", "variance")
    maxDepth = conf["params"].get("maxDepth", 5)
    maxIter = conf["params"].get("maxIter", 20)
    maxBins = conf["params"].get("maxBins", 32)
    minInstancesPerNode = conf["params"].get("minInstancesPerNode", 1)
    minInfoGain = conf["params"].get("minInfoGain", 0.0)
    maxMemoryInMB = conf["params"].get("maxMemoryInMB", 256)
    cacheNodeIds = conf["params"].get("cacheNodeIds", False)
    subsamplingRate = conf["params"].get("subsamplingRate", 1.0)
    checkpointInterval = conf["params"].get("checkpointInterval", 10)
    lossType = conf["params"].get("lossType", "squared")
    seed = conf["params"].get("seed", None)

    # BUG FIX: the original built the estimator from maxIter/maxDepth only and
    # silently ignored every other configured parameter — pass them all through.
    # featuresCol stays "indexedFeatures" because featureIndexer (defined
    # elsewhere in this module) writes its output to that column.
    gbt = GBTRegressor(
        featuresCol="indexedFeatures",
        labelCol=labelCol,
        predictionCol=predictionCol,
        impurity=impurity,
        maxDepth=maxDepth,
        maxIter=maxIter,
        maxBins=maxBins,
        minInstancesPerNode=minInstancesPerNode,
        minInfoGain=minInfoGain,
        maxMemoryInMB=maxMemoryInMB,
        cacheNodeIds=cacheNodeIds,
        subsamplingRate=subsamplingRate,
        checkpointInterval=checkpointInterval,
        lossType=lossType,
        seed=seed,
    )
    pipeline = Pipeline(stages=[featureIndexer, gbt])
    print("maxDepth : ", gbt.getMaxDepth())
    print("maxIter : ", gbt.getMaxIter())

    # ml-tuning branch
    if conf["tuning"]:
        method = conf["tuning"].get("method").lower()
        paramGrids = conf["tuning"].get("paramGrids")

        # BUG FIX: both original loops iterated over the misspelled name
        # 'paramgGrids', raising NameError whenever tuning was requested.
        pg = ParamGridBuilder()
        for key in paramGrids:
            pg.addGrid(key, paramGrids[key])
        grid = pg.build()

        evaluator = RegressionEvaluator()
        methodParam = conf["tuning"].get("methodParam")

        if method == "crossval":
            # methodParam is the number of folds here.
            cv = CrossValidator(estimator=pipeline,
                                estimatorParamMaps=grid,
                                evaluator=evaluator,
                                numFolds=methodParam)
            model = cv.fit(df)
        elif method == "trainvalsplit":
            # methodParam is the train ratio here.
            tvs = TrainValidationSplit(estimator=pipeline,
                                       estimatorParamMaps=grid,
                                       evaluator=evaluator,
                                       trainRatio=methodParam)
            model = tvs.fit(df)
        else:
            # BUG FIX: the original fell through with `model` unbound.
            raise ValueError("unknown tuning method: %s" % method)
    else:
        # BUG FIX: was `elif conf["tuning"] == None`, which left `model`
        # unbound for falsy-but-not-None values such as {}. Also dropped the
        # leftover debug print("test").
        model = pipeline.fit(df)
    return model
# Build a GBT regression pipeline over the pre-assembled 'features' column and
# track the whole training run (params, model artifact, metrics) in MLflow.
gbt_estimator = GBTRegressor(featuresCol='features', labelCol='Sales', predictionCol='prediction')
advert_pipeline = Pipeline(stages=[assembler, gbt_estimator])

mlflow.set_tracking_uri('http://localhost:5000/')
print("mlflow tracking_uri: " + mlflow.tracking.get_tracking_uri())
mlflow.set_experiment("Advertising Regression Online Lesson")

with mlflow.start_run(run_name="spark-advertising-gbt-regressor") as run:
    # Record the estimator hyper-parameters for this run.
    mlflow.log_param("min_info_gain", gbt_estimator.getMinInfoGain())
    mlflow.log_param("max_depth", gbt_estimator.getMaxDepth())
    mlflow.log_param("max_bins", gbt_estimator.getMaxBins())
    mlflow.log_param("step_size", gbt_estimator.getStepSize())

    # Fit on the training split and persist the fitted pipeline as an artifact.
    fitted_pipeline = advert_pipeline.fit(train_df)
    mlflow.spark.log_model(fitted_pipeline, "model")

    # Score the hold-out split and record RMSE and R2.
    scored_df = fitted_pipeline.transform(test_df)
    run_evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="Sales")
    rmse_value = run_evaluator.setMetricName("rmse").evaluate(scored_df)
    r2_value = run_evaluator.setMetricName("r2").evaluate(scored_df)
    mlflow.log_metrics({"rmse": rmse_value, "r2": r2_value})