Code example #1
 def _build_param_grid(self):
     param_grid_builder = ParamGridBuilder()
     param_grid_builder.addGrid(self.tokenizer.tokenizer, self.tokenizer_map)
     param_grid_builder.addGrid(self.ngram.n, self.ngram_map)
     param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
     param_grid_builder.addGrid(self.clf.regParam, self.clf_map)
     return param_grid_builder.build()
Code example #2
 def _get_param_grid(self):
     return ParamGridBuilder() \
            .addGrid(self.classifier.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]) \
            .build()
Code example #3
# make predictions on test set
make_predictions(GBT_model, test_GBT)

# COMMAND ----------

# MAGIC %md #### Train with cross validation

# COMMAND ----------

train_cv = train_GBT.union(val_GBT)

# COMMAND ----------

# set parameter search grid
paramGrid = ParamGridBuilder()\
  .addGrid(gbt.maxDepth, [1, 10])\
  .build()

# options for classification evaluator
evaluator = BinaryClassificationEvaluator(labelCol="label")

# Cross validation
cv = CrossValidator(estimator=gbt,
                    evaluator=evaluator,
                    estimatorParamMaps=paramGrid,
                    numFolds=3)

# Train GBT model with cross validation
cv_model = cv.fit(train_cv)

# COMMAND ----------
Code example #4
print('*' * 100)
print('Cross-validated model w/ grid search')
print('*' * 100)
print('*' * 60)
print('Cross-validated model - learning')
print('*' * 60)
print()

##### Cross validation - random forest
# instantiate random forest classifier
rf = RandomForestClassifier(featureSubsetStrategy = 'auto', impurity = 'gini')

# parameter grid
param_grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 15, 20, 25, 30, 40, 50]) \
    .addGrid(rf.maxDepth, [2, 3, 4, 5, 6, 7, 8]) \
    .addGrid(rf.maxBins, [16, 32, 48]) \
    .build()

# create pipeline that includes preprocessing steps and model
stages = [assembler, minMaxScaler, rf]
pipeline = Pipeline(stages = stages)

# cross validator
cv = CrossValidator(estimator = pipeline
                    ,estimatorParamMaps = param_grid
                    ,evaluator = BinaryClassificationEvaluator()
                    ,numFolds = 8
    )

# fit model using training data
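# (Sketch of the truncated fit step; the name of the training DataFrame is an assumption.)
cv_model = cv.fit(train_df)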
Code example #5
indexed = indexed.join(meta, "product_id")

# Split data into train and test data set
(training, test) = indexed. \
  select("user_id_index", "product_id_index", "score", "reviewed_at", "title"). \
  randomSplit([0.6, 0.4], seed=0)

# Train and evaluate with ALS

from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
als = ALS(maxIter=5,
          userCol="user_id_index",
          itemCol="product_id_index",
          ratingCol="score")

param_grid = ParamGridBuilder().addGrid(als.regParam, [0.01, 0.1, 1.0]).build()

evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="score",
                                predictionCol="prediction")

tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
)

model = tvs.fit(training)

predictions = model.transform(test)
predictions = predictions.fillna(0, subset=['prediction'])
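# Sketch: score the predictions with the RMSE evaluator defined above
# (whether the original script reports the metric here is an assumption).
rmse = evaluator.evaluate(predictions)
print("Test RMSE: {:.4f}".format(rmse))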
Code example #6
                                  (10, "spark compile", 1.0),
                                  (11, "hadoop software", 0.0)],
                                 ["id", "text", "label"])

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
# HashingTF hashes each token and turns the token sequence into term-frequency vectors
# hashingTF.transform(tokenizer.transform(training)).select('text','features').collect() shows that 'features' maps hash values to their frequencies
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# These are discrete candidate values to search over, not a continuous interval
# numFeatures is the number of features, i.e. the size of the hashing space
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(training)

# Prepare test documents, which are unlabeled.
test = spark.createDataFrame([(4, "spark i j k"), (5, "l m n"),
                              (6, "mapreduce spark"), (7, "apache hadoop")],
                             ["id", "text"])
Code example #7
print("Running Cross-Validation. Please wait.")
start = time.time()

pipeline = Pipeline(
    stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

# Create ParamGrid for Cross Validation to test various parameters
paramGrid = (
    ParamGridBuilder().addGrid(lr.regParam,
                               [0.1, 0.3, 0.5])  # regularization parameter
    .addGrid(lr.elasticNetParam,
             [0.0, 0.1, 0.2])  # Elastic Net Parameter (Ridge = 0)
    #            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
    #            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
    .build())

# Create 5-fold CrossValidator (the evaluator must be defined before it is passed in)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)
cvModel = cv.fit(trainingData)

predictions = cvModel.transform(testData)
# Evaluate the best model with the evaluator defined above
end = time.time()

Code example #8
lrPredictions = dtModel.transform(testingData)
lrPredictions.select("prediction", "label", "std_features").show(5)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(lrPredictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

treeModel = dtModel.stages[2]
# summary only
print(treeModel)

paramGrid = ParamGridBuilder()\
    .addGrid(dt.maxDepth, [2,3,4,5,6,7]) \
    .build()
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(predictionCol='prediction', labelCol='label', metricName="r2"),
                          numFolds=3)

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(trainingData)

print(cvModel.avgMetrics)
#print( cvModel.bestModel.stages[2].summary.r2)

for param in paramGrid:
    print(param)
Code example #9
##--------run on pyspark shell--------##

##load data to pyspark
trainData = spark.read.format('libsvm').load('trian_svm.txt')
cvData = spark.read.format('libsvm').load('cv_svm.txt')
testData = spark.read.format('libsvm').load('test_svm.txt')

## GBDT in pyspark
# design a GBDT model
gbdt = GBTClassifier(labelCol='label', featuresCol='features')

# build param grid
paramGrid = ParamGridBuilder()\
    .addGrid(gbdt.maxDepth, [6, 7, 8])\
    .addGrid(gbdt.minInstancesPerNode, [200, 500, 800])\
    .addGrid(gbdt.maxIter, [100, 120, 140])\
    .addGrid(gbdt.stepSize, [0.04, 0.08])\
    .addGrid(gbdt.subsamplingRate, [0.6, 0.8])\
    .build()

# build train validation split
tvs = TrainValidationSplit(estimator=gbdt, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator(), trainRatio=0.8)

# train the model
model_sp = tvs.fit(trainData)

# predict
train_pred = model_sp.transform(trainData).select('label', 'prediction')
cv_pred = model_sp.transform(cvData).select('label', 'prediction')
test_pred = model_sp.transform(testData).select('label', 'prediction')
Code example #10
plt.legend()

plt.show()

# ### Using ParamGrid for hyperparameter tuning
# The parameters we wish to tweak are:
# * maxIter
# * regParam
# * elasticNetParam - whether a lasso or ridge model will be best

# In[30]:

from pyspark.ml.tuning import ParamGridBuilder

paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [10, 50, 100]).addGrid(
    lr.regParam, [0.1, 0.3, 1.0]).addGrid(lr.elasticNetParam,
                                          [0.0, 1.0]).build()

# #### Define the RegressionEvaluator used to evaluate the models
# We wish to minimize RMSE

# In[31]:

evaluator = RegressionEvaluator(labelCol='price',
                                predictionCol='prediction',
                                metricName='rmse')

# ### Define the CrossValidator
# This is used to put all the pieces together
# * <b>estimator: </b>Can be a standalone estimator or a pipeline with an estimator at the end. We use our pipeline
# * <b>estimatorParamMaps: </b>We add our paramGrid in order to build models with different combinations of the parameters
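# A sketch of the CrossValidator described above; the pipeline and training-set names and
# the fold count are assumptions, since the notebook is truncated here.
from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3)
cvModel = cv.fit(train_df)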
Code example #11
File: power_plant_rf.py Project: pycckuu/HPC
def main():
    spark = (SparkSession
             .builder
             .appName("PowerPlant")
             .getOrCreate()
             )

    powerPlantDF = spark.read.csv("../data/CCPP/sheet*.csv",header=True,inferSchema=True)

    vectorizer = VectorAssembler(inputCols = ["AT","V","AP","RH"],outputCol="features")

    split20DF,split80DF = powerPlantDF.randomSplit([0.20,0.80],seed=100)
    testSetDF = split20DF.cache()
    trainingSetDF = split80DF.cache()

    lr = LinearRegression(predictionCol="Predicted_PE",
                          labelCol="PE",
                          regParam=0.1,
                          maxIter=100,)

    lrPipeline = Pipeline(stages=[vectorizer,lr])
    lrModel = lrPipeline.fit(trainingSetDF)

    intercept = lrModel.stages[1].intercept
    weights = lrModel.stages[1].coefficients
    print("The y intercept: {}".format(intercept))
    print("The coefficients: {}".format(weights))
    print("Columns:{}".format(trainingSetDF.columns))

    predictionsAndLabelsDF = lrModel.transform(testSetDF).select("AT","V","AP","RH","PE","Predicted_PE")

    regEval = RegressionEvaluator(predictionCol="Predicted_PE",labelCol="PE",metricName="rmse")
    rmse = regEval.evaluate(predictionsAndLabelsDF)
    print("Root Mean Squared Error: %.2f" % rmse)

    r2 = regEval.evaluate(predictionsAndLabelsDF,{regEval.metricName:"r2"})
    print("r2: {0:.2f}".format(r2))

    print("========== LR Cross Validation==========")

    crossval = CrossValidator(estimator=lrPipeline,evaluator=regEval,numFolds=3)
    regParam = [x/100.0 for x in range(1,11)]
    paramGrid = (ParamGridBuilder()
                 .addGrid(lr.regParam,regParam)
                 .addGrid(lr.maxIter,[50,100,150])
                 .addGrid(lr.elasticNetParam,[0,1])
                 .build()
                )
    crossval.setEstimatorParamMaps(paramGrid)

    cvModel = crossval.fit(trainingSetDF).bestModel

    predictionsAndLabelsDF = cvModel.transform(testSetDF).select("AT","V","AP","RH","PE","Predicted_PE")

    rmseNew = regEval.evaluate(predictionsAndLabelsDF)

    r2New = regEval.evaluate(predictionsAndLabelsDF,{regEval.metricName:"r2"})

    print("Old RMSE: {0:.2f}".format(rmse))
    print("New RMSE: {0:.2f}".format(rmseNew))
    print("Old r2: {0:.2f}".format(r2))
    print("New r2: {0:.2f}".format(r2New))

    print("Best RegParam: {0}".format(cvModel.stages[-1]._java_obj.parent().getRegParam()))
    print("Best maxIter: {0}".format(cvModel.stages[-1]._java_obj.parent().getMaxIter()))
    print("Best elasticNetParam: {0}".format(cvModel.stages[-1]._java_obj.parent().getElasticNetParam()))


    print("========Random Forest=========")

    rf = (RandomForestRegressor()
          .setLabelCol("PE")
          .setPredictionCol("Predicted_PE")
          .setFeaturesCol("features")
          .setSeed(100)
          .setMaxDepth(8)
          .setNumTrees(30)
         )

    rfPipeline = (Pipeline()
                  .setStages([vectorizer,rf])
                 )

    crossval.setEstimator(rfPipeline)
    paramGrid = (ParamGridBuilder()
                 .addGrid(rf.maxBins,[50,100])
                 .addGrid(rf.maxDepth,[4,8,12])
                 .addGrid(rf.numTrees,[20,30,40])
                 .build()
                )
    crossval.setEstimatorParamMaps(paramGrid)

    rfModel = crossval.fit(trainingSetDF).bestModel

    predictionsAndLabelsDF = (rfModel
                              .transform(testSetDF)
                              .select("AT","V","AP","RH","PE","Predicted_PE")
                             )
    rmseRF = regEval.evaluate(predictionsAndLabelsDF)
    r2RF = regEval.evaluate(predictionsAndLabelsDF,{regEval.metricName:"r2"})

    print("LR RMSE: {0:.2f}".format(rmseNew))
    print("RF RMSE: {0:.2f}".format(rmseRF))
    print("LR R2: {0:.2f}".format(r2New))
    print("RF RMSE: {0:.2f}".format(r2RF))

    print("The maxDepth is: {}".format(rfModel.stages[-1]._java_obj.parent().getMaxDepth()))
    print("The numTrees is: {}".format(rfModel.stages[-1]._java_obj.parent().getNumTrees()))
    print("The maxBins is: {}".format(rfModel.stages[-1]._java_obj.parent().getMaxBins()))

    spark.stop()
Code example #12
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import col, expr
from pyspark.mllib.evaluation import RankingMetrics

spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()


df_training = spark.read.parquet('hdfs:/user/tb2517/pub/goodreads/training_sample_10p.parquet')
df_validation = spark.read.parquet('hdfs:/user/tb2517/pub/goodreads/validation_sample_10p.parquet')
df_test = spark.read.parquet('hdfs:/user/tb2517/pub/goodreads/testing_sample_10p.parquet')



als = ALS(userCol="user_id", itemCol="book_id", ratingCol="rating",
          coldStartStrategy="drop", nonnegative=True)

param_grid = ParamGridBuilder() \
    .addGrid(als.rank, [15, 25, 35]) \
    .addGrid(als.maxIter, [5, 8, 10]) \
    .addGrid(als.regParam, [0.08, 0.09, 0.10]) \
    .build()
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3)
model = cv.fit(df_training)

best_model = model.bestModel

print("Tuned Hyperparameters:-------------")
print("Rank: ", best_model._java_obj.parent().getRank())
print("MaxIter: ", best_model._java_obj.parent().getMaxIter())
print("RegParam: ", best_model._java_obj.parent().getRegParam())

print("Recommendations: ------------------------------")
user_recs = best_model.recommendForAllUsers(500)
print(user_recs.count())
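# Sketch: the held-out sample df_test loaded above is otherwise unused in this snippet;
# scoring the tuned model with the same RMSE evaluator is an assumption.
test_predictions = best_model.transform(df_test)
print("Test RMSE: ", evaluator.evaluate(test_predictions))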
Code example #13
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Create a Spark DataFrame from a pandas DataFrame using Arrow
ratings = spark.createDataFrame(dev)
(training, test) = ratings.randomSplit([0.8,0.2])



# ALS
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)

# Grid search
paramGrid = ParamGridBuilder()\
    .addGrid(als.rank, [4,8,12]) \
    .addGrid(als.regParam, [0.1,1,10])\
    .addGrid(als.maxIter, [5,10,15])\
    .addGrid(als.alpha, [1,2,3])\
    .build()
    
# Define the RMSE evaluator first so it can be passed to the validator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

# Tune hyperparameters
tvs = TrainValidationSplit(estimator=als,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator,
                           trainRatio=0.8)


# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
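# Sketch: the TrainValidationSplit defined above is never fitted in this snippet; fitting
# it and scoring its best model on the held-out split would look like this (assumption).
tvs_model = tvs.fit(training)
print("Tuned RMSE: ", evaluator.evaluate(tvs_model.transform(test)))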
Code example #14
 def _build_param_grid(self):
     param_grid_builder = ParamGridBuilder()
     param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
     param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
     return param_grid_builder.build()
Code example #15
training_df.head(5)

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
idf = IDF(minDocFreq=3, inputCol="features", outputCol="idf")
nb = NaiveBayes()
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nb])

paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 1.0]).build()

cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=MulticlassClassificationEvaluator(),
                    numFolds=4)

cvModel = cv.fit(training_df)

result = cvModel.transform(test_df)
prediction_df = result.select("text", "label", "prediction")

datasci_df = prediction_df.filter(prediction_df['label'] == 0.0)
datasci_df.show(truncate=False)

ao_df = prediction_df.filter(prediction_df['label'] == 1.0)
Code example #16
 def _build_param_grid(self):
     param_grid_builder = ParamGridBuilder()
     param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
     return param_grid_builder.build()
Code example #17
# split the data into training and test sets
(training, test) = df.randomSplit(weights=[0.8, 0.2])

# train model
model = pipeline.fit(training)

# prediction with training data
prediction_training = model.transform(training)
# prediction with test data
prediction_test = model.transform(test)

## cross validation
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
paramGrid = ParamGridBuilder().\
    addGrid(dTree.minInfoGain, [0,1,2]).\
    addGrid(dTree.maxDepth, [2,5,10]).\
    build()

# evaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol='indexedLabel',
                                              predictionCol='prediction')

# 5-fold cross validation
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

# train model through cross validation
dTree_cv_model = cv.fit(training)
Code example #18
File: SparkML.py Project: Guang-yi/Cookbook
    print('Average RMSE for {0} windows: {1}'.format(num_windows, np.mean(total_RMSE)))
        
        
feature_columns = ['previous_hour_price', 'previous_hour_high_low_range', 'previous_hour_volume']
sliding_window_evaluation(dataframe=test, feature_columns=feature_columns, num_windows=3, test_size=0.2)

#################################################################################################################
##### Hyperparameter Tuning
# Grid Search - Spark ML
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Defining our parameter grid
paramGrid = (ParamGridBuilder()
  .addGrid(randomForest.numTrees, [10, 30, 100, 300])
  .addGrid(randomForest.maxDepth, [3, 10])  # Spark ML requires an integer maxDepth; None (scikit-learn style) is not valid
  .build()
)

# Cross validation with the parameter grid
crossval = CrossValidator(estimator=randomForest,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=3)

# Reporting the number of nodes on the cluster
print('Number of nodes on the cluster:', sc._jsc.sc().getExecutorMemoryStatus().size())

# Performing the grid search
cvModel = crossval.fit(trainingDataset)
Code example #19
File: mlflow_spark_mlops.py Project: xk97/repo
# Split dataset into "train" and "test" sets
(train, test) = df.randomSplit([trainSplit, testSplit], 42) 

# Setup evaluator -- default is F1 score
classEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")

with mlflow.start_run(): 
  # Gradient-boosted tree regression
  gbt = GBTRegressor(maxIter=maxIter)

  # Setup pipeline
  pipeline = Pipeline(stages=[gbt])

  # Setup hyperparams grid
  paramGrid = ParamGridBuilder().build()

  # Setup model evaluators
  rmseevaluator = RegressionEvaluator() #Note: By default, it will show how many units off in the same scale as the target -- RMSE
  r2evaluator = RegressionEvaluator(metricName="r2") #Select R2 as our main scoring metric

  # Setup cross validator
  cv = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=r2evaluator, numFolds=numberOfCVFolds) 

  # Fit model on "train" set
  cvModel = cv.fit(train)

  # Get the best model based on CrossValidator
  model = cvModel.bestModel

  # Run inference on "test" set
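  # Sketch of the truncated inference step, using the evaluators defined above
  # (logging the metrics to MLflow here is an assumption about the original code).
  predictions = model.transform(test)
  mlflow.log_metric("rmse", rmseevaluator.evaluate(predictions))
  mlflow.log_metric("r2", r2evaluator.evaluate(predictions))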
Code example #20
features = ["reviewed", "vehicle_year", "vehicle_color_encoded", "CloudCover"]
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Specify the estimator (i.e., classification algorithm):
from pyspark.ml.classification import RandomForestClassifier
classifier = RandomForestClassifier(featuresCol="features", labelCol="star_rating")
print(classifier.explainParams())

# Specify the hyperparameter grid:
from pyspark.ml.tuning import ParamGridBuilder
maxDepthList = [5, 10, 20]
numTreesList = [20, 50, 100]
subsamplingRateList = [0.5, 1.0]
paramGrid = ParamGridBuilder() \
  .addGrid(classifier.maxDepth, maxDepthList) \
  .addGrid(classifier.numTrees, numTreesList) \
  .addGrid(classifier.subsamplingRate, subsamplingRateList) \
  .build()

# Specify the evaluator:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="star_rating", metricName="accuracy")

# **Note:** We are treating `star_rating` as a multiclass label.

# Specify the validator:
from pyspark.ml.tuning import TrainValidationSplit
validator = TrainValidationSplit(estimator=classifier, estimatorParamMaps=paramGrid, evaluator=evaluator)


# ## Specify the pipeline 
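# A sketch of that pipeline (the choice and order of stages are assumptions; the original
# file is truncated here): chain the feature assembler and the validator.
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[assembler, validator])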
Code example #21
# Split the data into training and test sets (30% held out for testing)
(training_data, test_data) = assembled_df.randomSplit([0.7, 0.3], seed=1234)

num_folds = 5

evaluator = MulticlassClassificationEvaluator(labelCol="success",
                                              predictionCol="prediction",
                                              metricName="accuracy")

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="success",
                            featuresCol="features",
                            numTrees=500)

paramGrid = (ParamGridBuilder().addGrid(param=rf.numTrees,
                                        values=[100, 300, 500]).build())

crossval = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=num_folds,
    seed=1234,
)

model = crossval.fit(training_data)

predictions_train = model.transform(training_data)
predictions_test = model.transform(test_data)

predictions_train.select("movie_name", "imdb_id", "prediction",
Code example #22
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

# 12.4
gbt.explainParams()


############# GG. Gradient Boosting Cross-validation ##################

# 12.5 Cross validation using parameter grid
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# 12.6
paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxDepth, [2, 4, 6])
             .addGrid(gbt.maxBins, [20, 60])
             .addGrid(gbt.maxIter, [10, 20])
             .build())

# 12.7
cv = CrossValidator(estimator=gbt,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

# 12.8 Run cross validations. 
#           Takes about 6 minutes as it is training over 20 trees!
cvModel = cv.fit(train)
predictions = cvModel.transform(test)
evaluator.evaluate(predictions)
Code example #23
        .appName('ImageFeatureSelector') \
        .config('spark.executor.memory', '2G') \
        .config('spark.executor.cores', '2') \
        .config('spark.driver.memory', '3G') \
        .config('spark.driver.cores', '1') \
        .getOrCreate()

    train_df = spark.createDataFrame(load_train_data(imagenet_path))

    pre_trained_model = InceptionV3(weights="imagenet")
    pre_trained_model.save('/tmp/model-full.h5')

    estimator = KerasImageFileEstimator(inputCol="uri",
                                        outputCol="prediction",
                                        labelCol="one_hot_label",
                                        imageLoader=load_image_from_uri,
                                        kerasOptimizer='adam',
                                        kerasLoss='categorical_crossentropy',
                                        modelFile='/tmp/model-full-tmp.h5'  # local file path for model
                                        )

    param_grid = (ParamGridBuilder().addGrid(estimator.kerasFitParams, [{"batch_size": 32, "verbose": 0},
                                                                        {"batch_size": 64, "verbose": 0}]).build())

    binary_evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="label")
    cv = CrossValidator(estimator=estimator, estimatorParamMaps=param_grid, evaluator=binary_evaluator, numFolds=2)

    cv_model = cv.fit(train_df)

    print(cv_model)
Code example #24
rForm = RFormula()
lr = LogisticRegression().setLabelCol("label").setFeaturesCol("features")

# COMMAND ----------

from pyspark.ml import Pipeline
stages = [rForm, lr]
pipeline = Pipeline().setStages(stages)

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder
params = ParamGridBuilder()\
  .addGrid(rForm.formula, [
    "lab ~ . + color:value1",
    "lab ~ . + color:value1 + color:value2"])\
  .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
  .addGrid(lr.regParam, [0.1, 2.0])\
  .build()

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()\
  .setMetricName("areaUnderROC")\
  .setRawPredictionCol("prediction")\
  .setLabelCol("label")

# COMMAND ----------

from pyspark.ml.tuning import TrainValidationSplit
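# COMMAND ----------

# A sketch of the next cell; the name of the training DataFrame is an assumption.
tvs = TrainValidationSplit()\
  .setTrainRatio(0.75)\
  .setEstimatorParamMaps(params)\
  .setEstimator(pipeline)\
  .setEvaluator(evaluator)

tvsFitted = tvs.fit(train)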
Code example #25
def feature_selector(df: DataFrame,
                     ranked_features: list,
                     output_column: str,
                     estimator_obj=RandomForestRegressor,
                     feature_inclusion_increments: int = 1,
                     train_test_split_ratio: list = None,
                     cv: int = -1,
                     evaluation_metric: str = 'r2'):
    """
    Trains the estimator at multiple steps, with features progressively added to the input list based on their ranks

    :param df: the input dataset with features and output as columns
    :param ranked_features: the output of the feature ranking algorithm or a manually selected ranking scheme
    :param output_column: the name of the output column in the dataset
    :param estimator_obj: the training model object
    :param train_test_split_ratio: the default for train_test_split_ratio is [0.66, 0.33]
    :param cv: if left as default (cv = -1), changes nothing. If set to a value > 1, it enforces cross validation
                and overrides the train-test splitting
    :param feature_inclusion_increments: the number of features added to the input feature list at each step
    :param evaluation_metric: evaluation metric to return for predictions on test set - "rmse": root mean
            squared error - "mse": mean squared error - "r2" (default): coefficient of determination -
            "mae": mean absolute error
    """

    if train_test_split_ratio is None:
        train_test_split_ratio = [0.66, 0.33]

    feature_count_list = list(
        range(1, len(ranked_features),
              feature_inclusion_increments)) + [len(ranked_features)]

    estimator_features_col = 'features'
    while estimator_features_col in df.columns:
        estimator_features_col += '_'
    estimator_prediction_col = 'prediction'
    while estimator_prediction_col in df.columns:
        estimator_prediction_col += '_'
    estimator_obj.setFeaturesCol(estimator_features_col)
    estimator_obj.setPredictionCol(estimator_prediction_col)
    estimator_obj.setLabelCol(output_column)

    evaluator = RegressionEvaluator(labelCol=output_column,
                                    predictionCol=estimator_prediction_col,
                                    metricName=evaluation_metric)

    scores = []
    if cv <= 1:
        df_train, df_test = df.randomSplit(train_test_split_ratio)
        for feature_count in feature_count_list:
            input_features = ranked_features[0:feature_count]
            assembler = VectorAssembler(inputCols=input_features,
                                        outputCol=estimator_features_col)
            df_train = assembler.transform(df_train)
            fit_model = estimator_obj.fit(df_train)
            df_test = assembler.transform(df_test)
            df_test = fit_model.transform(df_test)
            score = evaluator.evaluate(df_test)
            scores.append((feature_count, score))
            df_train = df_train.drop(estimator_features_col)
            df_test = df_test.drop(estimator_features_col,
                                   estimator_prediction_col)
    else:
        for feature_count in feature_count_list:
            input_features = ranked_features[0:feature_count]
            assembler = VectorAssembler(inputCols=input_features,
                                        outputCol=estimator_features_col)
            df = assembler.transform(df)
            grid = ParamGridBuilder().addGrid(
                estimator_obj.featuresCol,
                [estimator_obj.getFeaturesCol()]).build()
            crossval = CrossValidator(estimator=estimator_obj,
                                      evaluator=evaluator,
                                      numFolds=cv,
                                      estimatorParamMaps=grid)
            fit_crossval = crossval.fit(df)
            scores.append((feature_count, fit_crossval.avgMetrics[0]))
            df = df.drop(estimator_features_col)

    return scores
Code example #26
corr = meas.select(sensorNameArray).toPandas().corr()
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

cmap = sb.diverging_palette(220, 10, as_cmap=True)
sb.heatmap(corr, mask=mask, xticklabels=sensorNameArray, yticklabels=sensorNameArray,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

# #### Model Tuning
# Spark has advanced model tuning capabilities as well. Let's improve our Random Forest
# Classifier using the ML tuning api
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# ParamGrids are grids of model tuning parameter values
paramGrid = ParamGridBuilder()\
  .addGrid(rf.maxDepth, [5,10,15])\
  .addGrid(rf.numTrees, [20,25,30])\
  .build()

# A TrainValidationSplit is used for hyper-parameter tuning. It takes a model estimator,
# parameter grid, and evaluator as input and runs the model multiple times to identify
# the most optimal model parameters
tvs = TrainValidationSplit(estimator=rf,
                           estimatorParamMaps=paramGrid,
                           evaluator=MulticlassClassificationEvaluator(),
                           trainRatio=0.8)

(trainingData, testData) = li.transform(va).randomSplit([0.7, 0.3])

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(trainingData)
Code example #27
# COMMAND ----------

# MAGIC %md Third, we wrap the model training stage within a `CrossValidator` stage.  `CrossValidator` knows how to call the GBT algorithm with different hyperparameter settings.  It will train multiple models and choose the best one, based on minimizing some metric.  In this example, our metric is [Root Mean Squared Error (RMSE)](https://en.wikipedia.org/wiki/Root-mean-square_deviation).
# MAGIC
# MAGIC ![Image of CV](http://training.databricks.com/databricks_guide/4-cv.png)

# COMMAND ----------

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
# Define a grid of hyperparameters to test:
#  - maxDepth: max depth of each decision tree in the GBT ensemble
#  - maxIter: iterations, i.e., number of trees in each GBT ensemble
# In this example notebook, we keep these values small.  In practice, to get the highest accuracy, you would likely want to try deeper trees (10 or higher) and more trees in the ensemble (>100).
paramGrid = ParamGridBuilder()\
  .addGrid(gbt.maxDepth, [2, 5])\
  .addGrid(gbt.maxIter, [10, 100])\
  .build()
# We define an evaluation metric.  This tells CrossValidator how well we are doing by comparing the true labels with predictions.
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol=gbt.getLabelCol(),
                                predictionCol=gbt.getPredictionCol())
# Declare the CrossValidator, which runs model tuning for us.
cv = CrossValidator(estimator=gbt,
                    evaluator=evaluator,
                    estimatorParamMaps=paramGrid)

# COMMAND ----------

# MAGIC %md Finally, we can tie our feature processing and model training stages together into a single `Pipeline`.
# MAGIC
# MAGIC ![Image of Pipeline](http://training.databricks.com/databricks_guide/5-pipeline.png)
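# COMMAND ----------

# A sketch of that Pipeline; the names of the feature-processing stages are assumptions,
# but the CrossValidator defined above would be its final stage.
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[vectorAssembler, vectorIndexer, cv])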
Code example #28
 def _get_param_grid(self):
     return ParamGridBuilder() \
         .addGrid(self.classifier.getClassifier().regParam, [0.1, 0.2, 0.4, 0.6, 0.8, 1]) \
         .build()
Code example #29
# metrics1 = BinaryClassificationMetrics(PredictionandLabels)
# (train score/train accuracy   --- )
# (train error = 1-train score ?)
metrics2 = MulticlassMetrics(PredictionandLabels)
metrics2.accuracy
# areaUnderPR is only available on BinaryClassificationMetrics, not on MulticlassMetrics
print(metrics2.confusionMatrix())

# ----------------------------------------------------------------------------
# CV / Parameter Tuning approach ---------------------------------------------

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

paramGrid = ParamGridBuilder().addGrid(dt1.impurity,
                                       ['entropy', 'gini']).addGrid(
                                           dt1.maxDepth,
                                           [2, 3, 4, 5, 6]).build()

evaluator1 = MulticlassClassificationEvaluator(predictionCol='prediction',
                                               labelCol='Survived',
                                               metricName='accuracy')

crossVal4 = CrossValidator(estimator=dt1,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator1,
                           numFolds=10)

model23 = crossVal4.fit(df3)
model23.avgMetrics

# --------------------------------------------------------
Code example #30
 def _get_param_grid(self):
     return ParamGridBuilder() \
         .addGrid(self.classifier.maxDepth, [5, 7, 11, 13]) \
         .addGrid(self.classifier.numTrees, [5, 7, 11, 13, 17]) \
         .build()
Code example #31
    print("MLflow:")
    print("  run_id:",run_id)
    print("  experiment_id:",experiment_id)
    
    # Log MLflow parameters
    print("Parameters:")
    print("  maxDepthParams:",maxDepthParams)
    print("  maxBinsParams:",maxBinsParams)

    # Create pipeline
    dt = DecisionTreeRegressor(labelCol=colLabel, featuresCol=colFeatures)
    assembler = VectorAssembler(inputCols=data.columns[:-1], outputCol=colFeatures)
    pipeline = Pipeline(stages=[assembler, dt])
    
    paramGrid = ParamGridBuilder() \
        .addGrid(dt.maxDepth, maxDepthParams) \
        .addGrid(dt.maxBins, maxBinsParams) \
        .build()
        
    evaluator = RegressionEvaluator(
        labelCol=colLabel, predictionCol=colPrediction, metricName=metricName)

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=numFolds)  

    # Train model.  This also runs the indexers.
    cvModel = crossval.fit(trainingData)
    model = cvModel.bestModel

    # Make predictions.
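    # Sketch of the truncated prediction step (the name of the test split is an assumption).
    predictions = model.transform(testData)
    print("  {}: {}".format(metricName, evaluator.evaluate(predictions)))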
Code example #32
    xs = crashes.flatMap(lambda x:x.split('\n')) \
                .map(json.loads) \
                .map(group_crashes) \
                .map(improve_times)

    # convert to DataFrame
    df = spark.createDataFrame(xs)

    feature_labels = df.columns
    feature_labels.pop(feature_labels.index('Number of Vehicles Involved'))
    df = reduce(string_to_index, feature_labels, df)
    indexes = ["i-"+f for f in feature_labels]

    df = VectorAssembler(inputCols=indexes,
                         outputCol="features").transform(df)

    df = StringIndexer(inputCol='Number of Vehicles Involved',
                       outputCol='label').fit(df).transform(df)

    grid = ParamGridBuilder().addGrid(nb.smoothing, [1.0, 1.5]) \
                             .build()

    cv = CrossValidator(estimator=nb, estimatorParamMaps=grid,
                            evaluator=mce,numFolds=5,
                            parallelism=4)
    cv_model = cv.fit(df)
    transformed = cv_model.transform(df)
    f1 = mce.evaluate(transformed)
    print("NB F1: {:0.4f}".format(f1))
    cv_model.bestModel.save(sys.argv[2])
Code example #33
                            outputCol="features")

dtc = DecisionTreeClassifier(featuresCol="features", labelCol="Survived")

# COMMAND ----------

# MAGIC %md-sandbox
# MAGIC `ParamGridBuilder()` allows us to string together all of the different possible hyperparameters we would like to test.  In this case, we test the decision tree's maximum depth and the maximum number of bins used to discretize continuous features.
# MAGIC
# MAGIC <img alt="Caution" title="Caution" style="vertical-align: text-bottom; position: relative; height:1.3em; top:0.0em" src="https://files.training.databricks.com/static/images/icon-warning.svg"/> Since grid search works through exhaustively building a model for each combination of parameters, it quickly becomes a lot of different unique combinations of parameters.

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder

paramGrid = (ParamGridBuilder().addGrid(dtc.maxDepth, [2, 3, 4, 5, 6]).addGrid(
    dtc.maxBins, [16, 32, 48, 64]).build())

# COMMAND ----------

# MAGIC %md-sandbox
# MAGIC ### Cross-Validation
# MAGIC
# MAGIC There are a number of different ways of conducting cross-validation, allowing us to trade off between computational expense and model performance.  An exhaustive approach to cross-validation would include every possible split of the training set.  More commonly, _k_-fold cross-validation is used where the training dataset is divided into _k_ smaller sets, or folds.  A model is then trained on _k_-1 folds of the training data and the last fold is used to evaluate its performance.
# MAGIC
# MAGIC <img alt="Side Note" title="Side Note" style="vertical-align: text-bottom; position: relative; height:1.75em; top:0.05em; transform:rotate(15deg)" src="https://files.training.databricks.com/static/images/icon-note.webp"/> See <a href="https://en.wikipedia.org/wiki/Cross-validation_(statistics)" target="_blank">the Wikipedia article on Cross-Validation</a> for more information.

# COMMAND ----------

# MAGIC %md
# MAGIC Create a `MulticlassClassificationEvaluator()` to evaluate our grid search experiments and a `CrossValidator()` to build our models.
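# COMMAND ----------

# A sketch of that cell; the evaluator settings and fold count are assumptions.
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction")

cv = CrossValidator(estimator=dtc,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3)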