def run_ML_gbt_crossValidation(model, train, test, df, name):
    # Same idea as run_ML_regression_crossValidation()
    evaluator = BinaryClassificationEvaluator()
    grid = tune.ParamGridBuilder()
    grid = grid.addGrid(model.maxDepth, [1, 10, 20, 30])
    grid = grid.addGrid(model.maxIter, [7, 10, 14, 18])
    # grid = grid.addGrid(model.minInstancesPerNode, np.linspace(1, 32, 32, endpoint=True))
    # grid = grid.addGrid(model.subsamplingRate, np.linspace(1, 10, 10, endpoint=True))
    # grid = grid.addGrid(model.maxBins, np.linspace(20, 44, 32, endpoint=True))
    # grid = grid.addGrid(model.minInfoGain, [0,1,2])

    # grid = grid.addGrid(model.min_samples_leaf, [40,50,60])
    grid = grid.build()
    cv = tune.CrossValidator(estimator=model,
               estimatorParamMaps=grid,
               evaluator=evaluator,
               numFolds=5
               )
    models = cv.fit(train)
    bestModel = models.bestModel
    
    # obtain the best params
    result = df.copy()
    for index, rows in result.iterrows():
        if rows['Name'] == name:
            result.at[index, 'Best_Param'] = "maxDepth: " + str(bestModel._java_obj.getMaxDepth()) + " - maxIter: " + str(bestModel._java_obj.getMaxIter())
            
    finalPredictions = bestModel.transform(train)
    return finalPredictions, bestModel, result
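A minimal usage sketch of the helper above (hedged: the `train`/`test` split with 'features' and 'label' columns and the pandas results table with 'Name' and 'Best_Param' columns are assumptions, not part of the original snippet):

# Hypothetical call of run_ML_gbt_crossValidation; GBTClassifier choice and the
# results table layout are assumptions.
from pyspark.ml.classification import GBTClassifier
import pandas as pd

gbt = GBTClassifier(labelCol='label', featuresCol='features')
results_df = pd.DataFrame({'Name': ['GBT'], 'Best_Param': ['']})
predictions, best_gbt, results_df = run_ML_gbt_crossValidation(gbt, train, test, results_df, 'GBT')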
Example #2
def GBDTclf(trainingData, testData):

    max_depth = [1, 5, 10]
    grid = tune.ParamGridBuilder() \
        .addGrid(GBDT.maxDepth, max_depth) \
        .build()

    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol='probability', labelCol='label')

    # 3-fold cross-validation
    cv = tune.CrossValidator(estimator=GBDT,
                             estimatorParamMaps=grid,
                             evaluator=evaluator,
                             numFolds=3)

    # pipelineDtCV = Pipeline(stages=[cv])
    cvModel = cv.fit(trainingData)
    results = cvModel.transform(testData)

    label = results.select("label").toPandas().values
    predict = results.select("prediction").toPandas().values
    np.savetxt('res/predictedGBDT_spark.txt', predict, fmt='%01d')
    print("[accuracy,precision,recall,f1]")
    # print(evaluate(label,predict))
    return evaluate(label, predict)
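GBDT, evaluate and the imports are not shown in this snippet; a plausible module-level setup it assumes might look like this (names, columns and the sklearn-based metrics are assumptions):

# Hypothetical setup assumed by GBDTclf above (not part of the original snippet).
import numpy as np
import pyspark.ml.classification as cl
import pyspark.ml.evaluation as ev
import pyspark.ml.tuning as tune
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

GBDT = cl.GBTClassifier(labelCol='label', featuresCol='features')

def evaluate(label, predict):
    # Returns [accuracy, precision, recall, f1], matching the header printed by GBDTclf.
    y_true, y_pred = label.ravel(), predict.ravel()
    return [accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred),
            recall_score(y_true, y_pred),
            f1_score(y_true, y_pred)]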
    def create_grid(self, dict_features):
        # Build a parameter grid from {parameter_name: value-or-list-of-values}.
        # Assumes the wrapped estimator is available as self.model.
        param_grid = tuning.ParamGridBuilder()
        for model_parameter, grid_values in dict_features.items():
            param = getattr(self.model, model_parameter)
            if isinstance(grid_values, (int, float)):
                # A single value is held fixed across the grid.
                param_grid = param_grid.baseOn({param: grid_values})
            else:
                # A list of values is searched over.
                param_grid = param_grid.addGrid(param, grid_values)
        return param_grid.build()
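A hedged usage sketch of create_grid (the `builder` instance and its `model` attribute are assumptions; scalars are pinned with baseOn, lists are searched with addGrid):

# Hypothetical usage; `builder` is an instance of the class defining create_grid,
# with, e.g., a LogisticRegression stored on builder.model.
grid = builder.create_grid({
    'maxIter': 50,                  # scalar -> fixed for every grid point
    'regParam': [0.01, 0.05, 0.3],  # list -> searched over
})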
Example #4
def train_evaluate(train_data, test_data):
    # Convert the text categorical feature into numeric indices
    stringIndexer = ft.StringIndexer(inputCol='alchemy_category',
                                     outputCol="alchemy_category_Index")

    encoder = ft.OneHotEncoder(dropLast=False,
                               inputCol='alchemy_category_Index',
                               outputCol="alchemy_category_IndexVec")

    assemblerInputs = ['alchemy_category_IndexVec'] + train_data.columns[4:-1]
    assembler = ft.VectorAssembler(inputCols=assemblerInputs,
                                   outputCol="features")

    # dt = cl.DecisionTreeClassifier(labelCol="label",
    #                             featuresCol="features")
    rf = cl.RandomForestClassifier(labelCol="label", featuresCol="features")

    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol="probability",
        labelCol='label',
        metricName='areaUnderROC')

    grid_search = tune.ParamGridBuilder() \
        .addGrid(rf.impurity, ["gini", "entropy"]) \
        .addGrid(rf.maxDepth, [5, 10, 15]) \
        .addGrid(rf.maxBins, [10, 15, 20]) \
        .addGrid(rf.numTrees, [10, 20, 30]) \
        .build()

    rf_cv = tune.CrossValidator(estimator=rf,
                                estimatorParamMaps=grid_search,
                                evaluator=evaluator,
                                numFolds=5)

    # rf_tvs = tune.TrainValidationSplit(
    #     estimator=rf,
    #     estimatorParamMaps=grid_search,
    #     evaluator=evaluator,
    #     trainRatio=0.7
    # )
    pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, rf_cv])
    cv_pipeline_model = pipeline.fit(train_data)

    best_model = cv_pipeline_model.stages[-1]
    best_parm = get_best_param(best_model)

    AUC, AP = evaluate_model(cv_pipeline_model, test_data)

    return AUC, AP, best_parm, cv_pipeline_model
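get_best_param and evaluate_model are defined elsewhere; a plausible sketch of the former, assuming it reads the best ParamMap out of the CrossValidatorModel produced by the rf_cv stage:

# Hypothetical implementation of get_best_param (an assumption, not the original code).
def get_best_param(cv_model):
    # Pair each ParamMap with its mean cross-validated AUC and keep the best one.
    candidates = list(zip(cv_model.getEstimatorParamMaps(), cv_model.avgMetrics))
    best_map, _ = max(candidates, key=lambda pair: pair[1])
    return {param.name: value for param, value in best_map.items()}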
def run_ML_regression_crossValidation(model, train, test, df, name):
    # We'll use cross-validation to choose the hyperparameters: build a grid of
    # candidate value combinations for the three hyperparameters
    # (elasticNetParam, regParam and maxIter), then compare the resulting models
    # by their cross-validation error and keep the best one.

    # We will create a 5-fold CrossValidator.

    # The first thing we need when doing cross-validation for model selection
    # is a way to compare the different models.
    evaluator = BinaryClassificationEvaluator()

    # Next, we need to create a grid of values to search over when looking for the optimal hyperparameters

    # Create the parameter grid
    grid = tune.ParamGridBuilder()
    # Add the hyperparameter
    grid = grid.addGrid(model.regParam, np.arange(0, .1, .01))
    grid = grid.addGrid(model.elasticNetParam, [0, 1])
    grid = grid.addGrid(model.maxIter, [1, 5, 10])
    # Build the grid
    grid = grid.build()

    # Create the CrossValidator
    cv = tune.CrossValidator(estimator=model,
               estimatorParamMaps=grid,
               evaluator=evaluator,
               numFolds=5,
               collectSubModels=True
               )

    # Fit cross validation models
    models = cv.fit(train)
    # Extract the best model
    bestModel = models.bestModel
    
    # obtain the best params
    result = df.copy()
    for index, rows in result.iterrows():
        if rows['Name'] == name:
            result.at[index, 'Best_Param'] = "regParam: " + str(bestModel._java_obj.getRegParam()) + " - MaxIter: " + str(bestModel._java_obj.getMaxIter()) + " - elasticNetParam: " + str(bestModel._java_obj.getElasticNetParam())
    
    finalPredictions = bestModel.transform(train)
    return finalPredictions, bestModel, result
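Because collectSubModels=True is set, the fitted CrossValidatorModel also keeps every per-fold model; a short inspection sketch (assuming `models` is the CrossValidatorModel returned by cv.fit above):

# Sketch: inspect the grid-averaged metrics and the per-fold sub-models.
for params, metric in zip(models.getEstimatorParamMaps(), models.avgMetrics):
    print({p.name: v for p, v in params.items()}, metric)

# subModels[fold][i] is the model trained on that fold for the i-th parameter combination.
first_fold_first_grid_point = models.subModels[0][0]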
def run_ML_mpc_crossValidation(model, train, test, df, name):
    # Same idea as run_ML_regression_crossValidation()
    evaluator = MulticlassClassificationEvaluator()
    grid = tune.ParamGridBuilder()
    grid = grid.build()
    cv = tune.CrossValidator(estimator=model,
               estimatorParamMaps=grid,
               evaluator=evaluator,
               numFolds=5
               )
    models = cv.fit(train)
    bestModel = models.bestModel
    
    # obtain the best params
    result = df.copy()
    for index, rows in result.iterrows():
        if rows['Name'] == name:
            result.at[index, 'Best_Param'] = "unspecified"
            
    finalPredictions = bestModel.transform(train)
    return finalPredictions, bestModel, result
def run_ML_dt_crossValidation(model, train, test, df, name):
    # Same idea as run_ML_regression_crossValidation()
    evaluator = MulticlassClassificationEvaluator()
    grid = tune.ParamGridBuilder()
    grid = grid.addGrid(model.maxDepth, [4, 8])
    grid = grid.addGrid(model.maxBins, [2, 4, 6])
    grid = grid.build()
    cv = tune.CrossValidator(estimator=model,
               estimatorParamMaps=grid,
               evaluator=evaluator,
               numFolds=5
               )
    models = cv.fit(train)
    bestModel = models.bestModel
    
    # obtain the best params
    result = df.copy()
    for index, rows in result.iterrows():
        if rows['Name'] == name:
            result.at[index, 'Best_Param'] = "maxDepth: " + str(bestModel._java_obj.getMaxDepth()) + " - maxBins: " + str(bestModel._java_obj.getMaxBins())
            
    finalPredictions = bestModel.transform(train)
    return finalPredictions, bestModel, result
Example #8
from pyspark.ml.classification import LogisticRegression

# Create a LogisticRegression Estimator
lr = LogisticRegression()

# Import the evaluation submodule
import pyspark.ml.evaluation as evals

# Create a BinaryClassificationEvaluator
evaluator = evals.BinaryClassificationEvaluator(metricName="areaUnderROC")

# Import the tuning submodule
import pyspark.ml.tuning as tune

# Create the parameter grid
grid = tune.ParamGridBuilder()

# Add the hyperparameter
grid = grid.addGrid(lr.regParam, np.arange(0, .1, .01))
grid = grid.addGrid(lr.elasticNetParam, [0, 1])

# Build the grid
grid = grid.build()

# Create the CrossValidator
cv = tune.CrossValidator(estimator=lr,
               estimatorParamMaps=grid,
               evaluator=evaluator
               )
# Fit cross-validation models and keep the best one
models = cv.fit(training)
best_lr = models.bestModel
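To score the tuned model on held-out data, a short sketch (assuming a `test` DataFrame prepared the same way as `training`):

# Evaluate the best cross-validated model on the held-out set (areaUnderROC).
test_results = best_lr.transform(test)
print(evaluator.evaluate(test_results))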
# (truncated tail of a bar-plot call: color='blue', ax=ax, width=width, position=1, legend=True)
display()

# COMMAND ----------

# MAGIC %md
# MAGIC Applying regularization to further improve our RMSE.
# MAGIC First step: build a grid over the two regularization hyperparameters, elasticNetParam and regParam.

# COMMAND ----------

grid = tune.ParamGridBuilder()

# COMMAND ----------

grid = grid.addGrid(regScaled.elasticNetParam, [0, 0.2, 0.4, 0.6, 0.8, 1])

# COMMAND ----------

grid = grid.addGrid(regScaled.regParam, np.arange(0, .1, .01))

# COMMAND ----------

grid = grid.build()

# COMMAND ----------
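# MAGIC %md
# MAGIC The notebook stops after building the grid. A sketch of the typical next cells, assuming `regScaled` is the regression estimator being tuned, `evaluator` is a RegressionEvaluator (e.g. metricName='rmse'), and `train`/`test` are the usual splits:

# COMMAND ----------

cv = tune.CrossValidator(estimator=regScaled,
                         estimatorParamMaps=grid,
                         evaluator=evaluator,
                         numFolds=5)

# COMMAND ----------

cvModel = cv.fit(train)
print(evaluator.evaluate(cvModel.transform(test)))

# COMMAND ----------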
Example #10
    # weightCol: down-weight the negative class (y == 0) to handle class imbalance
    data = data.withColumn('weight',
                           fn.when(data['y'] == 1, 1.0).otherwise(0.02))

    train, test = data.randomSplit([0.7, 0.3], seed=1234)  #42
    lr_model = cl.LogisticRegression(
        # maxIter=10,
        # regParam=0.01,
        elasticNetParam=0,
        family='binomial',
        threshold=0.5,
        weightCol='weight',
        labelCol='y')

    grid = tune.ParamGridBuilder()\
        .addGrid(lr_model.maxIter,[200,300,500,800])\
        .addGrid(lr_model.regParam,[0.001,0.002])\
        .build()

    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol='probability', labelCol='y')

    cv = tune.CrossValidator(estimator=lr_model,
                             estimatorParamMaps=grid,
                             evaluator=evaluator,
                             numFolds=3)

    ppline = Pipeline(stages=[featuerCreator])
    train_transfomer = ppline.fit(train)

    cv_model = cv.fit(train_transfomer.transform(train))
    test = train_transfomer.transform(test)
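    # Sketch of scoring the tuned model (hedged: reuses the evaluator and the
    # transformed `test` DataFrame defined above).
    results = cv_model.transform(test)
    print('AUC:', evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
    print('AP:', evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))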
Example #11
loadedPipelineModel = PipelineModel.load(modelPath)
test_loadedModel = loadedPipelineModel.transform(births_test)
print ('test_loadedModel:', test_loadedModel)


# Hyperparameter tuning
import pyspark.ml.tuning as tune

# Grid search
logistic = cl.LogisticRegression(
    labelCol='INFANT_ALIVE_AT_REPORT')

grid = tune.ParamGridBuilder() \
    .addGrid(logistic.maxIter,  
             [2, 10, 50]) \
    .addGrid(logistic.regParam, 
             [0.01, 0.05, 0.3]) \
    .build()


evaluator = ev.BinaryClassificationEvaluator(
    rawPredictionCol='probability', 
    labelCol='INFANT_ALIVE_AT_REPORT')



cv = tune.CrossValidator(
    estimator=logistic, 
    estimatorParamMaps=grid, 
    evaluator=evaluator
)
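The snippet ends before the CrossValidator is fitted; following the pattern of the other examples in this collection, the continuation would look roughly like this (data_transformer, births_train and births_test are assumed from the surrounding pipeline):

# Sketch of the continuation: fit the CrossValidator and evaluate on the test split.
cvModel = cv.fit(data_transformer.transform(births_train))
results = cvModel.transform(data_transformer.transform(births_test))
print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))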
Example #12
## Hyperparameter tuning

### Create an estimator

import pyspark.ml.classification as cl

logistic = cl.LogisticRegression(
    labelCol='label')  # hyperparameters are tuned via grid search below, so none are set here

### Grid search

import pyspark.ml.tuning as tune

grid = tune.ParamGridBuilder() \
    .addGrid(logistic.maxIter,
             [10, 50, 80]) \
    .addGrid(logistic.regParam,
             [0.01, 0.001]) \
    .build()

### Create a pipeline

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[featuresCreator, indexer])
data_transformer = pipeline.fit(data_train)

## Fit the model & evaluate performance

# Use BinaryClassificationEvaluator to assess model performance
import pyspark.ml.evaluation as ev

evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability',
                                             labelCol='label')
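The example breaks off here; a sketch of the usual continuation in this collection (the CrossValidator settings and the data_test split are assumptions):

# Sketch: cross-validated grid search, then evaluation on a held-out split.
cv = tune.CrossValidator(estimator=logistic,
                         estimatorParamMaps=grid,
                         evaluator=evaluator,
                         numFolds=3)
cvModel = cv.fit(data_transformer.transform(data_train))
results = cvModel.transform(data_transformer.transform(data_test))  # data_test: hypothetical held-out split
print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))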
Example #13
pipeline = sm.Pipeline(stages=[scaler, reducer, classifier])

# Create an evaluator which will quantify model performance
# evaluator = sme.BinaryClassificationEvaluator(
#     labelCol='label',
#     rawPredictionCol='predictedLabel',
#     metricName='areaUnderROC'
# )
eval_f1 = sme.MulticlassClassificationEvaluator(labelCol='label',
                                                predictionCol='predictedLabel',
                                                metricName='f1')

# Set up a parameter grid for cross validation
param_grid = smt.ParamGridBuilder().addGrid(
    reducer.k,
    [10, 20, 50, 75]).addGrid(classifier.maxDepth,
                              [2, 5, 10]).addGrid(classifier.subsamplingRate,
                                                  [0.1, 0.2, 0.3]).build()

# Bring everything together
validator = smt.CrossValidator(estimator=pipeline,
                               estimatorParamMaps=param_grid,
                               evaluator=eval_f1,
                               numFolds=3)

# Fit the model to the data #######################################################################'
model = validator.fit(train)

train_predictions = model.transform(train)
val_predictions = model.transform(val)
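A short sketch of scoring those predictions with the F1 evaluator defined above:

# Compare train vs. validation F1 for the best cross-validated pipeline.
print('train F1:', eval_f1.evaluate(train_predictions))
print('val F1:', eval_f1.evaluate(val_predictions))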
Example #14
    def _transform(self, *args, **kwargs):
        data_frame = args[0]
        pred_col = self._predictor.output_column
        preds = self._predictor.predict(data_frame)
        return preds.withColumn(pred_col, cast_to_double(preds[pred_col]))

cast_to_double = functions.udf(lambda row: float(row[0]), types.DoubleType())


param_grid = tuning.ParamGridBuilder() \
    .baseOn(['regularizer', regularizers.l1_l2]) \
    .addGrid('activations', [['tanh', 'relu']]) \
    .addGrid('initializers', [['glorot_normal', 'glorot_uniform']]) \
    .addGrid('layer_dims', [[input_dim, 2000, 300, 1]]) \
    .addGrid('metrics', [['mae']]) \
    .baseOn(['learning_rate', 1e-2]) \
    .baseOn(['reg_strength', 1e-2]) \
    .baseOn(['reg_decay', 0.25]) \
    .baseOn(['lr_decay', 0.90]) \
    .addGrid('dropout_rate', [0.20, 0.35, 0.50, 0.65, 0.80]) \
    .addGrid('loss', ['mse', 'msle']) \
    .build()


estimator = DistKeras(trainers.ADAG,
                      {'batch_size': 256,
                       'communication_window': 3,
                       'num_epoch': 10,
                       'num_workers': 50},
                      **param_grid[0])

Example #15
def hyper_parameter_optimization_ml():
	spark = SparkSession.builder.appName('hyper-parameter-optimization-ml').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	labels = [
		('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
		('BIRTH_PLACE', types.StringType()),
		('MOTHER_AGE_YEARS', types.IntegerType()),
		('FATHER_COMBINED_AGE', types.IntegerType()),
		('CIG_BEFORE', types.IntegerType()),
		('CIG_1_TRI', types.IntegerType()),
		('CIG_2_TRI', types.IntegerType()),
		('CIG_3_TRI', types.IntegerType()),
		('MOTHER_HEIGHT_IN', types.IntegerType()),
		('MOTHER_PRE_WEIGHT', types.IntegerType()),
		('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
		('MOTHER_WEIGHT_GAIN', types.IntegerType()),
		('DIABETES_PRE', types.IntegerType()),
		('DIABETES_GEST', types.IntegerType()),
		('HYP_TENS_PRE', types.IntegerType()),
		('HYP_TENS_GEST', types.IntegerType()),
		('PREV_BIRTH_PRETERM', types.IntegerType())
	]
	schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
	births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

	# Create transformers.
	births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
	# Encode the BIRTH_PLACE column using the OneHotEncoder method.
	encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')

	# Split the dataset into training and testing datasets.
	births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

	# Create a purely transforming Pipeline.
	pipeline = Pipeline(stages=[encoder, featuresCreator])
	data_transformer = pipeline.fit(births_train)

	# Specify our model and the list of parameters we want to loop through.
	logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT')
	grid = tune.ParamGridBuilder() \
		.addGrid(logistic.maxIter, [2, 10, 50]) \
		.addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \
		.build()
	# Define a way of comparing the models.
	evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')

	# Create a logic that will do the validation work.
	cv = tune.CrossValidator(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)

	cvModel = cv.fit(data_transformer.transform(births_train))

	# See if cvModel performed better than our previous model
	data_test = data_transformer.transform(births_test)
	results = cvModel.transform(data_test)

	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))

	# Parameters which the best model has.
	results = [
		([{key.name: paramValue} for key, paramValue in zip(params.keys(), params.values())], metric)
		for params, metric in zip(cvModel.getEstimatorParamMaps(), cvModel.avgMetrics)
	]
	print(sorted(results, key=lambda el: el[1], reverse=True)[0])
Example #16
def train_validation_splitting_ml():
	spark = SparkSession.builder.appName('train-validation-splitting-ml').getOrCreate()
	spark.sparkContext.setLogLevel('WARN')

	labels = [
		('INFANT_ALIVE_AT_REPORT', types.IntegerType()),
		('BIRTH_PLACE', types.StringType()),
		('MOTHER_AGE_YEARS', types.IntegerType()),
		('FATHER_COMBINED_AGE', types.IntegerType()),
		('CIG_BEFORE', types.IntegerType()),
		('CIG_1_TRI', types.IntegerType()),
		('CIG_2_TRI', types.IntegerType()),
		('CIG_3_TRI', types.IntegerType()),
		('MOTHER_HEIGHT_IN', types.IntegerType()),
		('MOTHER_PRE_WEIGHT', types.IntegerType()),
		('MOTHER_DELIVERY_WEIGHT', types.IntegerType()),
		('MOTHER_WEIGHT_GAIN', types.IntegerType()),
		('DIABETES_PRE', types.IntegerType()),
		('DIABETES_GEST', types.IntegerType()),
		('HYP_TENS_PRE', types.IntegerType()),
		('HYP_TENS_GEST', types.IntegerType()),
		('PREV_BIRTH_PRETERM', types.IntegerType())
	]
	schema = types.StructType([types.StructField(e[0], e[1], False) for e in labels])
	births = spark.read.csv('dataset/births_transformed.csv.gz', header=True, schema=schema)

	# Create transformers.
	births = births.withColumn('BIRTH_PLACE_INT', births['BIRTH_PLACE'].cast(types.IntegerType()))
	# Encode the BIRTH_PLACE column using the OneHotEncoder method.
	encoder = ml_feature.OneHotEncoder(inputCol='BIRTH_PLACE_INT', outputCol='BIRTH_PLACE_VEC')

	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[2:]] + [encoder.getOutputCol()], outputCol='features')

	# Split the dataset into training and testing datasets.
	births_train, births_test = births.randomSplit([0.7, 0.3], seed=666)

	# Select only the top five features.
	selector = ml_feature.ChiSqSelector(
		numTopFeatures=5,
		featuresCol=featuresCreator.getOutputCol(),
		outputCol='selectedFeatures',
		labelCol='INFANT_ALIVE_AT_REPORT'
	)

	# Create a purely transforming Pipeline.
	pipeline = Pipeline(stages=[encoder, featuresCreator, selector])
	data_transformer = pipeline.fit(births_train)

	# Create LogisticRegression and Pipeline.
	logistic = ml_classification.LogisticRegression(labelCol='INFANT_ALIVE_AT_REPORT', featuresCol='selectedFeatures')
	grid = tune.ParamGridBuilder() \
		.addGrid(logistic.maxIter, [2, 10, 50]) \
		.addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \
		.build()
	# Define a way of comparing the models.
	evaluator = ml_eval.BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='INFANT_ALIVE_AT_REPORT')

	# Create a TrainValidationSplit object.
	tvs = tune.TrainValidationSplit(estimator=logistic, estimatorParamMaps=grid, evaluator=evaluator)

	# Fit our data to the model.
	tvsModel = tvs.fit(data_transformer.transform(births_train))
	data_test = data_transformer.transform(births_test)

	# Calculate results.
	results = tvsModel.transform(data_test)
	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderROC'}))
	print(evaluator.evaluate(results, {evaluator.metricName: 'areaUnderPR'}))