def run_ML_gbt_crossValidation(model, train, test, df, name):
    # Same idea as run_ML_regression_crossValidation()
    evaluator = BinaryClassificationEvaluator()

    grid = tune.ParamGridBuilder()
    grid = grid.addGrid(model.maxDepth, [1, 10, 20, 30])
    grid = grid.addGrid(model.maxIter, [7, 10, 14, 18])
    # grid = grid.addGrid(model.minInstancesPerNode, np.linspace(1, 32, 32, endpoint=True))
    # grid = grid.addGrid(model.subsamplingRate, np.linspace(1, 10, 10, endpoint=True))
    # grid = grid.addGrid(model.maxBins, np.linspace(20, 44, 32, endpoint=True))
    # grid = grid.addGrid(model.minInfoGain, [0, 1, 2])
    # grid = grid.addGrid(model.min_samples_leaf, [40, 50, 60])
    grid = grid.build()

    cv = tune.CrossValidator(estimator=model,
                             estimatorParamMaps=grid,
                             evaluator=evaluator,
                             numFolds=5)

    models = cv.fit(train)
    bestModel = models.bestModel

    # Record the best params for this model in the results dataframe
    result = df.copy()
    for index, rows in result.iterrows():
        if rows['Name'] == name:
            result.at[index, 'Best_Param'] = ("maxDepth: " + str(bestModel._java_obj.getMaxDepth())
                                              + " - maxIter: " + str(bestModel._java_obj.getMaxIter()))

    finalPredictions = bestModel.transform(train)
    return finalPredictions, bestModel, result
def GBDTclf(trainingData, testData):
    max_depth = [1, 5, 10]
    grid = tune.ParamGridBuilder() \
        .addGrid(GBDT.maxDepth, max_depth) \
        .build()
    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol='probability',
        labelCol='label')
    # 3-fold cross validation
    cv = tune.CrossValidator(estimator=GBDT,
                             estimatorParamMaps=grid,
                             evaluator=evaluator,
                             numFolds=3)
    # pipelineDtCV = Pipeline(stages=[cv])
    cvModel = cv.fit(trainingData)
    results = cvModel.transform(testData)
    label = results.select("label").toPandas().values
    predict = results.select("prediction").toPandas().values
    np.savetxt('res/predictedGBDT_spark.txt', predict, fmt='%01d')
    print("[accuracy,precision,recall,f1]")
    # print(evaluate(label, predict))
    return evaluate(label, predict)
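# evaluate() is not defined in this snippet; the sketch below is a minimal
# stand-in (an assumption, not the original helper), using scikit-learn and
# matching the [accuracy, precision, recall, f1] order printed above:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate(label, predict):
    # toPandas().values yields (n, 1) column vectors; flatten before scoring
    y_true, y_pred = label.ravel(), predict.ravel()
    return [accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            f1_score(y_true, y_pred)]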
def train_evaluate(train_data, test_data):
    # Convert the textual categorical feature into a numeric index
    stringIndexer = ft.StringIndexer(inputCol='alchemy_category',
                                     outputCol="alchemy_category_Index")
    encoder = ft.OneHotEncoder(dropLast=False,
                               inputCol='alchemy_category_Index',
                               outputCol="alchemy_category_IndexVec")
    assemblerInputs = ['alchemy_category_IndexVec'] + train_data.columns[4:-1]
    assembler = ft.VectorAssembler(inputCols=assemblerInputs,
                                   outputCol="features")
    # dt = cl.DecisionTreeClassifier(labelCol="label",
    #                                featuresCol="features")
    rf = cl.RandomForestClassifier(labelCol="label",
                                   featuresCol="features")
    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol="probability",
        labelCol='label',
        metricName='areaUnderROC')
    grid_search = tune.ParamGridBuilder()\
        .addGrid(rf.impurity, ["gini", "entropy"])\
        .addGrid(rf.maxDepth, [5, 10, 15])\
        .addGrid(rf.maxBins, [10, 15, 20])\
        .addGrid(rf.numTrees, [10, 20, 30])\
        .build()
    rf_cv = tune.CrossValidator(estimator=rf,
                                estimatorParamMaps=grid_search,
                                evaluator=evaluator,
                                numFolds=5)
    # rf_tvs = tune.TrainValidationSplit(
    #     estimator=rf,
    #     estimatorParamMaps=grid_search,
    #     evaluator=evaluator,
    #     trainRatio=0.7
    # )
    pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, rf_cv])
    cv_pipeline_model = pipeline.fit(train_data)
    best_model = cv_pipeline_model.stages[-1]
    best_parm = get_best_param(best_model)
    AUC, AP = evaluate_model(cv_pipeline_model, test_data)
    return AUC, AP, best_parm, cv_pipeline_model
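# get_best_param() and evaluate_model() are not defined in this snippet; the
# sketches below are minimal stand-ins (assumptions, not the original
# helpers), reusing the snippet's `ev` alias. `cv_model` is the fitted
# CrossValidatorModel stage; AP is taken to mean area under the PR curve.
def get_best_param(cv_model):
    # Pair each parameter map with its mean CV metric and keep the best one
    params_with_metric = list(zip(cv_model.getEstimatorParamMaps(),
                                  cv_model.avgMetrics))
    best_params, _ = max(params_with_metric, key=lambda el: el[1])
    return {param.name: value for param, value in best_params.items()}

def evaluate_model(pipeline_model, test_data):
    predictions = pipeline_model.transform(test_data)
    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol='probability', labelCol='label')
    auc = evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderROC'})
    ap = evaluator.evaluate(predictions, {evaluator.metricName: 'areaUnderPR'})
    return auc, ap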
def run_ML_regression_crossValidation(model, train, test, df, name):
    # We use cross validation to choose the hyperparameters: we build a grid
    # of candidate values for the three hyperparameters (elasticNetParam,
    # regParam and maxIter) and use the cross-validation error to compare
    # the resulting models and pick the best one.
    # We create a 5-fold CrossValidator.
    # The first thing we need when doing cross validation for model
    # selection is a way to compare different models.
    evaluator = BinaryClassificationEvaluator()

    # Next, we need a grid of values to search over when looking for the
    # optimal hyperparameters.
    # Create the parameter grid
    grid = tune.ParamGridBuilder()

    # Add the hyperparameters
    grid = grid.addGrid(model.regParam, np.arange(0, .1, .01))
    grid = grid.addGrid(model.elasticNetParam, [0, 1])
    grid = grid.addGrid(model.maxIter, [1, 5, 10])

    # Build the grid
    grid = grid.build()

    # Create the CrossValidator
    cv = tune.CrossValidator(estimator=model,
                             estimatorParamMaps=grid,
                             evaluator=evaluator,
                             numFolds=5,
                             collectSubModels=True)

    # Fit cross validation models
    models = cv.fit(train)

    # Extract the best model
    bestModel = models.bestModel

    # Record the best params in the results dataframe
    result = df.copy()
    for index, rows in result.iterrows():
        if rows['Name'] == name:
            result.at[index, 'Best_Param'] = ("regParam: " + str(bestModel._java_obj.getRegParam())
                                              + " - MaxIter: " + str(bestModel._java_obj.getMaxIter())
                                              + " - elasticNetParam: " + str(bestModel._java_obj.getElasticNetParam()))

    finalPredictions = bestModel.transform(train)
    return finalPredictions, bestModel, result
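# Because collectSubModels=True is set, the fitted CrossValidatorModel also
# keeps the per-fold sub-models. A sketch of inspecting the grid results,
# using the local names from run_ML_regression_crossValidation() above:
for params, metric in zip(models.getEstimatorParamMaps(), models.avgMetrics):
    # Print each parameter map alongside its mean cross-validation metric
    print({p.name: v for p, v in params.items()}, metric)

# One fitted model per fold and per parameter map:
# models.subModels[fold][paramMapIndex]
sub_models = models.subModels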
def run_ML_mpc_crossValidation(model, train, test, df, name):
    # Same idea as run_ML_regression_crossValidation()
    evaluator = MulticlassClassificationEvaluator()

    # No hyperparameters are added here, so the grid contains a single
    # (default) parameter map and cross validation only estimates the
    # model's performance.
    grid = tune.ParamGridBuilder()
    grid = grid.build()

    cv = tune.CrossValidator(estimator=model,
                             estimatorParamMaps=grid,
                             evaluator=evaluator,
                             numFolds=5)

    models = cv.fit(train)
    bestModel = models.bestModel

    # No grid was searched, so there are no best params to record
    result = df.copy()
    for index, rows in result.iterrows():
        if rows['Name'] == name:
            result.at[index, 'Best_Param'] = "unspecified"

    finalPredictions = bestModel.transform(train)
    return finalPredictions, bestModel, result
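# If the estimator passed in is, say, a MultilayerPerceptronClassifier, the
# empty grid above could be populated the same way as in the other helpers.
# A sketch only; the parameter values are illustrative assumptions:
grid = tune.ParamGridBuilder()
grid = grid.addGrid(model.maxIter, [50, 100, 200])
grid = grid.addGrid(model.blockSize, [64, 128])
grid = grid.build()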
def run_ML_dt_crossValidation(model, train, test, df, name):
    # Same idea as run_ML_regression_crossValidation()
    evaluator = MulticlassClassificationEvaluator()

    grid = tune.ParamGridBuilder()
    grid = grid.addGrid(model.maxDepth, [4, 8])
    grid = grid.addGrid(model.maxBins, [2, 4, 6])
    grid = grid.build()

    cv = tune.CrossValidator(estimator=model,
                             estimatorParamMaps=grid,
                             evaluator=evaluator,
                             numFolds=5)

    models = cv.fit(train)
    bestModel = models.bestModel

    # Record the best params in the results dataframe
    result = df.copy()
    for index, rows in result.iterrows():
        if rows['Name'] == name:
            result.at[index, 'Best_Param'] = ("maxDepth: " + str(bestModel._java_obj.getMaxDepth())
                                              + " - maxBins: " + str(bestModel._java_obj.getMaxBins()))

    finalPredictions = bestModel.transform(train)
    return finalPredictions, bestModel, result
# Import the tuning submodule
import pyspark.ml.tuning as tune

# Create the parameter grid
grid = tune.ParamGridBuilder()

# Add the hyperparameters
grid = grid.addGrid(lr.regParam, np.arange(0, .1, .01))
grid = grid.addGrid(lr.elasticNetParam, [0, 1])

# Build the grid
grid = grid.build()

# Create the CrossValidator
cv = tune.CrossValidator(estimator=lr,
                         estimatorParamMaps=grid,
                         evaluator=evaluator)

# Call lr.fit() to train a baseline model with the default parameters
best_lr = lr.fit(training)

# Print best_lr
print(best_lr)

# Fit cross validation models
models = cv.fit(training)

# Extract the best model
best_lr = models.bestModel

# Use the model to predict the test set
test_results = best_lr.transform(test)
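# The snippet above assumes `lr`, `evaluator`, `training` and `test` already
# exist; a minimal sketch of that setup (the `data` DataFrame and its column
# names are assumptions, not from the original):
import numpy as np
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

lr = LogisticRegression(labelCol='label', featuresCol='features')
evaluator = BinaryClassificationEvaluator()
training, test = data.randomSplit([0.8, 0.2], seed=42)  # `data`: DataFrame with label/features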
# lr_model: binomial logistic regression on label column 'y'
lr_model = LogisticRegression(
    elasticNetParam=0, family='binomial', threshold=0.5,
    weightCol='weight', labelCol='y')

grid = tune.ParamGridBuilder()\
    .addGrid(lr_model.maxIter, [200, 300, 500, 800])\
    .addGrid(lr_model.regParam, [0.001, 0.002])\
    .build()

evaluator = ev.BinaryClassificationEvaluator(
    rawPredictionCol='probability',
    labelCol='y')

cv = tune.CrossValidator(estimator=lr_model,
                         estimatorParamMaps=grid,
                         evaluator=evaluator,
                         numFolds=3)

ppline = Pipeline(stages=[featuerCreator])
train_transformer = ppline.fit(train)
cv_model = cv.fit(train_transformer.transform(train))

test = train_transformer.transform(test)
results = cv_model.transform(test)
print('predict_results_type:', type(results))
print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))

# Pair each parameter map with its mean cross-validation metric
best_param = [
    ([{key.name: paramValue} for key, paramValue in zip(params.keys(), params.values())], metric)
    for params, metric in zip(cv_model.getEstimatorParamMaps(), cv_model.avgMetrics)
]
# Follow the Sample_ML example
from time import time

# Create the parameter grid
grid = tune.ParamGridBuilder()

# Add the hyperparameter: 50 evenly spaced regParam values between 1e-5 and 1
grid = grid.addGrid(lr.regParam, np.linspace(0.00001, 1, 50))

# This does both l1 and l2 - list of 0 and 1
# NOTE - 1 = LASSO, 0 = Ridge regression
grid = grid.addGrid(lr.elasticNetParam, [0, 1])

# Build the grid
grid = grid.build()

# Create the CrossValidator
cv = tune.CrossValidator(estimator=pipeline,
                         estimatorParamMaps=grid,
                         evaluator=evaluator,
                         numFolds=5)

# Partition the data into training, validation and testing sets
train = df_dev
test = df_test
valid = df_val

# Build the model, timing the fit
t0 = time()
logit_models = cv.fit(train)
tt = time() - t0
tt  # elapsed fit time in seconds
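# The estimator above is a pipeline, so `logit_models.bestModel` is a
# PipelineModel; a sketch of pulling out the winning hyperparameters,
# assuming the LogisticRegression `lr` is the pipeline's last stage:
best_pipeline = logit_models.bestModel
best_lr_model = best_pipeline.stages[-1]
print("regParam:", best_lr_model.getOrDefault('regParam'))
print("elasticNetParam:", best_lr_model.getOrDefault('elasticNetParam'))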
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[featuresCreator, indexer])
data_transformer = pipeline.fit(data_train)

## Fit the model & evaluate its performance
# Evaluate the model with BinaryClassificationEvaluator
import pyspark.ml.evaluation as ev

evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability',
                                             labelCol='label')

# Run 5-fold cross validation
cv = tune.CrossValidator(estimator=logistic,
                         estimatorParamMaps=grid,
                         evaluator=evaluator,
                         numFolds=5)

# Fit the model, then predict on the test set
cvModel = cv.fit(
    data_transformer.transform(data_train)
)
prediction = cvModel.transform(
    data_transformer.transform(data_test)
)
results = prediction.select("id", "prediction", "probability", "label")
# Inspect the predictions (first 10 rows)
results.show(10)
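# `logistic` and `grid` are assumed to be defined before the snippet above; a
# minimal sketch of what they might look like (the grid values are
# illustrative assumptions):
import pyspark.ml.classification as cl
import pyspark.ml.tuning as tune

logistic = cl.LogisticRegression(labelCol='label', featuresCol='features')
grid = tune.ParamGridBuilder() \
    .addGrid(logistic.maxIter, [10, 50]) \
    .addGrid(logistic.regParam, [0.01, 0.1]) \
    .build()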
#     metricName='areaUnderROC'
# )
eval_f1 = sme.MulticlassClassificationEvaluator(labelCol='label',
                                                predictionCol='predictedLabel',
                                                metricName='f1')

# Set up a parameter grid for cross validation
param_grid = smt.ParamGridBuilder() \
    .addGrid(reducer.k, [10, 20, 50, 75]) \
    .addGrid(classifier.maxDepth, [2, 5, 10]) \
    .addGrid(classifier.subsamplingRate, [0.1, 0.2, 0.3]) \
    .build()

# Bring everything together
validator = smt.CrossValidator(estimator=pipeline,
                               estimatorParamMaps=param_grid,
                               evaluator=eval_f1,
                               numFolds=3)

# Fit the model to the data
#######################################################################
model = validator.fit(train)
train_predictions = model.transform(train)
val_predictions = model.transform(val)

# Evaluate model performance
eval_roc = sme.BinaryClassificationEvaluator(labelCol='label',
                                             rawPredictionCol='predictedLabel',
                                             metricName='areaUnderROC')
eval_accuracy = sme.MulticlassClassificationEvaluator(labelCol='label',
                                                      predictionCol='predictedLabel',
                                                      metricName='accuracy')
estimator = DistKeras(trainers.ADAG,
                      {'batch_size': 256,
                       'communication_window': 3,
                       'num_epoch': 10,
                       'num_workers': 50},
                      **param_grid[0])
evaluator = evaluation.RegressionEvaluator(metricName='r2')
cv_estimator = tuning.CrossValidator(estimator=estimator,
                                     estimatorParamMaps=param_grid,
                                     evaluator=evaluator,
                                     numFolds=5)
cv_model = cv_estimator.fit(df_train)
df_pred_train = cv_model.transform(df_train)
df_pred_test = cv_model.transform(df_test)
def hyper_parameter_optimization_ml():
    spark = SparkSession.builder.appName('hyper-parameter-optimization-ml').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    labels = [
        ('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
        ('BIRTH_PLACE', types.StringType()),
        ('MOTHER_AGE_YEARS', types.IntegerType()),
        ('FATHER_COMBINED_AGE', types.IntegerType()),
        ('CIG_BEFORE', types.IntegerType()),
        ('CIG_1_TRI', types.IntegerType()),
        ('CIG_2_TRI', types.IntegerType()),
        ('CIG_3_TRI', types.IntegerType()),
        ('MOTHER_HEIGHT_IN', types.IntegerType()),
        ('MOTHER_PRE_WEIGHT', types.IntegerType()),
        ('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
        ('MOTHER_WEIGHT_GAIN', types.IntegerType()),
        ('DIABETES_PRE', types.IntegerType()),
        ('DIABETES_GEST', types.IntegerType()),
        ('HYP_TENS_PRE', types.IntegerType()),
        ('HYP_TENS_GEST', types.IntegerType()),
        ('PREV_BIRTH_PRETERM', types.IntegerType())
    ]
    schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
    births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

    # Create transformers.
    births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))

    # Encode the BIRTH_PLACE column using the OneHotEncoder method.
    encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')
    featuresCreator = ml_feature.VectorAssembler(
        inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()],
        outputCol='features')

    # Split the dataset into training and testing datasets.
    births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

    # Create a purely transforming Pipeline.
    pipeline = Pipeline(stages=[encoder, featuresCreator])
    data_transformer = pipeline.fit(births_train)

    # Specify our model and the list of parameters we want to loop through.
    logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT')
    grid = tune.ParamGridBuilder() \
        .addGrid(logistic.maxIter, [2, 10, 50]) \
        .addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \
        .build()

    # Define a way of comparing the models.
    evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability',
                                                      labelCol='INFANT_ALIVE_AT_REPORT')

    # Create the logic that will do the validation work.
    cv = tune.CrossValidator(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(data_transformer.transform(births_train))

    # See if cvModel performed better than our previous model.
    data_test = data_transformer.transform(births_test)
    results = cvModel.transform(data_test)
    print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
    print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))

    # Parameters of the best model.
    results = [
        ([{key.name: paramValue} for key, paramValue in zip(params.keys(), params.values())], metric)
        for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
    ]
    print(sorted(results, key=lambda el: el[1], reverse=True)[0])
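# Imports assumed by hyper_parameter_optimization_ml(), inferred from the
# aliases used above, plus a minimal entry point:
from pyspark.sql import SparkSession
import pyspark.sql.types as types
import pyspark.ml.feature as ml_feature
import pyspark.ml.classification as ml_classification
import pyspark.ml.evaluation as ml_eval
import pyspark.ml.tuning as tune
from pyspark.ml import Pipeline

if __name__ == '__main__':
    hyper_parameter_optimization_ml()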
########################
#---Machine Learning---#
########################

# Create machine learning pipeline
piped = ml_pipeline(churn)

# Split the standardized dataset
training, testing = piped.randomSplit([.75, .25])

# Create evaluator
evaluator = evals.BinaryClassificationEvaluator(metricName="areaUnderROC")

# Create logistic regression and decision tree models
lr = LogisticRegression()
dt = DecisionTreeClassifier()

# Create cross validation object (defined here, but the models below are fit directly)
cross_validation = tune.CrossValidator(estimator=lr, evaluator=evaluator)

# Train logistic regression and decision tree models
fitted_model_lr = lr.fit(training)
fitted_model_dt = dt.fit(training)

# Plot ROC for logistic regression
roc_plot(fitted_model_lr)

# Compute test AUC for logistic regression and decision tree models
lr_accuracy = test_roc_performance(fitted_model_lr, testing, evaluator)
dt_accuracy = test_roc_performance(fitted_model_dt, testing, evaluator)
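# `ml_pipeline`, `roc_plot` and `test_roc_performance` are not shown here; a
# minimal sketch of `test_roc_performance` (an assumption: it scores a fitted
# model on the held-out split with the given evaluator):
def test_roc_performance(fitted_model, test_data, evaluator):
    predictions = fitted_model.transform(test_data)
    return evaluator.evaluate(predictions)  # areaUnderROC, per the evaluator above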