# COMMAND ----------

# MAGIC %md `TrainClassifier` can be used to initialize and fit a model; it wraps SparkML classifiers.
# MAGIC You can use `help(mmlspark.TrainClassifier)` to view the different parameters.
# MAGIC
# MAGIC Note that it implicitly converts the data into the format expected by the algorithm: it tokenizes
# MAGIC and hashes strings, one-hot encodes categorical variables, assembles the features into a vector,
# MAGIC and so on. The parameter `numFeatures` controls the number of hashed features.

# COMMAND ----------

from mmlspark.train import TrainClassifier
from pyspark.ml.classification import LogisticRegression

model = TrainClassifier(model=LogisticRegression(), labelCol="income", numFeatures=256).fit(train)

# COMMAND ----------

# MAGIC %md After the model is trained, we score it against the test dataset and view metrics.

# COMMAND ----------

from mmlspark.train import ComputeModelStatistics, TrainedClassifierModel

prediction = model.transform(test)
metrics = ComputeModelStatistics().transform(prediction)
metrics.limit(10).toPandas()

# COMMAND ----------
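
# MAGIC %md To see roughly what `TrainClassifier` automates, the cell below builds an equivalent
# MAGIC featurization pipeline by hand with plain SparkML stages. This is an illustrative sketch only:
# MAGIC the column names (`occupation` as a string feature, `age` as a numeric feature) are assumptions
# MAGIC for the example and do not come from the code above.

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression

# Tokenize and hash one string column into a fixed-width vector (this is what `numFeatures` controls)
tokenizer = Tokenizer(inputCol="occupation", outputCol="occupation_tokens")
hasher = HashingTF(inputCol="occupation_tokens", outputCol="occupation_hashed", numFeatures=256)

# Index the string label and assemble all feature columns into a single vector
labelIndexer = StringIndexer(inputCol="income", outputCol="label")
assembler = VectorAssembler(inputCols=["occupation_hashed", "age"], outputCol="features")

lr = LogisticRegression(featuresCol="features", labelCol="label")
manualPipeline = Pipeline(stages=[tokenizer, hasher, labelIndexer, assembler, lr])
# manualModel = manualPipeline.fit(train)   # roughly what the TrainClassifier call above does in one step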
# COMMAND ----------

# MAGIC %md `TrainClassifier` can be used to initialize and fit a model; it wraps SparkML classifiers.
# MAGIC You can use `help(mmlspark.TrainClassifier)` to view the different parameters.
# MAGIC
# MAGIC Note that it implicitly converts the data into the format expected by the algorithm. More specifically,
# MAGIC it tokenizes and hashes strings, one-hot encodes categorical variables, assembles the features into a
# MAGIC vector, etc. The parameter `numFeatures` controls the number of hashed features.

# COMMAND ----------

from mmlspark.train import TrainClassifier
from pyspark.ml.classification import LogisticRegression

model = TrainClassifier(model=LogisticRegression(), labelCol="income", numFeatures=256).fit(train)

# COMMAND ----------

# MAGIC %md After the model is trained, we score it against the test dataset and view metrics.

# COMMAND ----------

from mmlspark.train import ComputeModelStatistics, TrainedClassifierModel

prediction = model.transform(test)
prediction.printSchema()

# COMMAND ----------

metrics = ComputeModelStatistics().transform(prediction)
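
# COMMAND ----------

# MAGIC %md The `train` and `test` DataFrames used above are assumed to come from a random split of the
# MAGIC raw dataset; a minimal sketch is shown below (the `dataset` name and the 75/25 ratio are
# MAGIC illustrative assumptions, not taken from the snippet above).

# COMMAND ----------

train, test = dataset.randomSplit([0.75, 0.25], seed=123)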
# COMMAND ----------

# MAGIC %md Next, define the models that will be tuned:

# COMMAND ----------

from mmlspark.automl import TuneHyperparameters
from mmlspark.train import TrainClassifier
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier

logReg = LogisticRegression()
randForest = RandomForestClassifier()
gbt = GBTClassifier()
smlmodels = [logReg, randForest, gbt]
mmlmodels = [TrainClassifier(model=model, labelCol="Label") for model in smlmodels]

# COMMAND ----------

# MAGIC %md We can specify the hyperparameters using the `HyperparamBuilder`.
# MAGIC We can add either `DiscreteHyperParam` or `RangeHyperParam` hyperparameters.
# MAGIC `TuneHyperparameters` will randomly choose values from a uniform distribution.

# COMMAND ----------

from mmlspark.automl import *

paramBuilder = \
    HyperparamBuilder() \
        .addHyperparam(logReg, logReg.regParam, RangeHyperParam(0.1, 0.3)) \
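        .addHyperparam(randForest, randForest.numTrees, DiscreteHyperParam([5, 10])) \
        .addHyperparam(randForest, randForest.maxDepth, DiscreteHyperParam([3, 5])) \
        .addHyperparam(gbt, gbt.maxBins, RangeHyperParam(8, 16)) \
        .addHyperparam(gbt, gbt.maxDepth, DiscreteHyperParam([3, 5]))
# NOTE: the builder chain is cut off in the original cell after the first .addHyperparam line; the
# four lines directly above are an assumed continuation that mirrors the equivalent builder in the
# tuning snippet later in this section, not code from the original.

# Build the search space and sample it uniformly at random, as the markdown above describes
searchSpace = paramBuilder.build()
randomSpace = RandomSpace(searchSpace)

# COMMAND ----------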
# MAGIC trained models by finding the model which performs best on the `test`
# MAGIC dataset given the specified metric
# MAGIC
# MAGIC 3. The **`ComputeModelStatistics`** Transformer computes the different
# MAGIC metrics on a scored dataset (in our case, the `validation` dataset)
# MAGIC at the same time

# COMMAND ----------

from mmlspark.train import TrainClassifier, ComputeModelStatistics
from mmlspark.automl import FindBestModel
from pyspark.ml.classification import LogisticRegression

# Prepare data for learning
train, test, validation = data.randomSplit([0.60, 0.20, 0.20], seed=123)

# Train the models on the 'train' data
lrHyperParams = [0.05, 0.1, 0.2, 0.4]
logisticRegressions = [LogisticRegression(regParam=hyperParam)
                       for hyperParam in lrHyperParams]
lrmodels = [TrainClassifier(model=lrm, labelCol="label", numFeatures=10000).fit(train)
            for lrm in logisticRegressions]

# Select the best model
bestModel = FindBestModel(evaluationMetric="AUC", models=lrmodels).fit(test)

# Get AUC on the validation dataset
predictions = bestModel.transform(validation)
metrics = ComputeModelStatistics().transform(predictions)
print("Best model's AUC on validation set = "
      + "{0:.2f}%".format(metrics.first()["AUC"] * 100))
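
# COMMAND ----------

# MAGIC %md AUC is only one of the statistics that `ComputeModelStatistics` produces; a small sketch
# MAGIC that prints every column of the metrics row, whatever it happens to be named:

# COMMAND ----------

bestMetrics = metrics.first()
for columnName in metrics.columns:
    print(columnName, "=", bestMetrics[columnName])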
# COMMAND ----------

# MAGIC %md Generate several models with different parameters from the training data.

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from mmlspark.train import TrainClassifier
import itertools

lrHyperParams = [0.05, 0.2]
logisticRegressions = [LogisticRegression(regParam=hyperParam)
                       for hyperParam in lrHyperParams]
lrmodels = [TrainClassifier(model=lrm, labelCol="label").fit(ptrain)
            for lrm in logisticRegressions]

rfHyperParams = itertools.product([5, 10], [2, 3])
randomForests = [RandomForestClassifier(numTrees=hyperParam[0], maxDepth=hyperParam[1])
                 for hyperParam in rfHyperParams]
rfmodels = [TrainClassifier(model=rfm, labelCol="label").fit(ptrain)
            for rfm in randomForests]

gbtHyperParams = itertools.product([8, 16], [2, 3])
gbtclassifiers = [
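    # Assumed continuation (the original cell is cut off above): GBT classifiers varied over
    # maxBins and maxDepth, mirroring the LogisticRegression and RandomForestClassifier blocks.
    # The parameter choice is an assumption, not code from the original.
    GBTClassifier(maxBins=hyperParam[0], maxDepth=hyperParam[1])
    for hyperParam in gbtHyperParams
]
gbtmodels = [TrainClassifier(model=gbtc, labelCol="label").fit(ptrain)
             for gbtc in gbtclassifiers]

# COMMAND ----------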
processedData.limit(5).toPandas()

# COMMAND ----------

# MAGIC %md Train several Logistic Regression models with different regularizations.

# COMMAND ----------

train, test, validation = processedData.randomSplit([0.60, 0.20, 0.20])

from pyspark.ml.classification import LogisticRegression
lrHyperParams = [0.05, 0.1, 0.2, 0.4]
logisticRegressions = [LogisticRegression(regParam=hyperParam)
                       for hyperParam in lrHyperParams]

from mmlspark.train import TrainClassifier
lrmodels = [TrainClassifier(model=lrm, labelCol="label").fit(train)
            for lrm in logisticRegressions]

# COMMAND ----------

# MAGIC %md Find the model with the best AUC on the test set.

# COMMAND ----------

from mmlspark.automl import FindBestModel, BestModel

bestModel = FindBestModel(evaluationMetric="AUC", models=lrmodels).fit(test)
bestModel.getEvaluationResults().show()
bestModel.getBestModelMetrics().show()
bestModel.getAllModelMetrics().show()

# COMMAND ----------
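
# MAGIC %md The `validation` split created above is not used in this excerpt; a short sketch of the
# MAGIC natural next step, mirroring the pattern used earlier in this section, is to score it with the
# MAGIC selected model and compute its statistics:

# COMMAND ----------

from mmlspark.train import ComputeModelStatistics

validationPredictions = bestModel.transform(validation)
validationMetrics = ComputeModelStatistics().transform(validationPredictions)
validationMetrics.limit(5).toPandas()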
# COMMAND ----------

# MAGIC %md Featurize images

# COMMAND ----------

featurizedImages = cntkModel.transform(imagesWithLabels).select(["features", "labels"])

# COMMAND ----------

# MAGIC %md Use featurized images to train a classifier

# COMMAND ----------

from mmlspark.train import TrainClassifier
from pyspark.ml.classification import RandomForestClassifier

train, test = featurizedImages.randomSplit([0.75, 0.25])

model = TrainClassifier(model=RandomForestClassifier(), labelCol="labels").fit(train)

# COMMAND ----------

# MAGIC %md Evaluate the accuracy of the model

# COMMAND ----------

from mmlspark.train import ComputeModelStatistics

predictions = model.transform(test)
metrics = ComputeModelStatistics(evaluationMetric="accuracy").transform(predictions)
metrics.show()
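
# COMMAND ----------

# MAGIC %md A small follow-on sketch that pulls the accuracy out as a plain number; the `accuracy`
# MAGIC column name is an assumption here (taken to match the `evaluationMetric` above), not something
# MAGIC shown in the original cell.

# COMMAND ----------

accuracy = metrics.first()["accuracy"]
print("Model accuracy: {0:.2f}%".format(accuracy * 100))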
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from mmlspark.train import TrainClassifier
from mmlspark.automl import (TuneHyperparameters, HyperparamBuilder,
                             DiscreteHyperParam, RangeHyperParam, RandomSpace)

# Assemble the numeric predictor columns into a single feature vector
f1 = VectorAssembler(inputCols=[
    'clump_thickness', 'unif_cell_size', 'unif_cell_shape', 'marg_adhesion',
    'single_epith_cell_size', 'bare_nuclei', 'bland_chrom', 'norm_nucleoli', 'mitoses'
], outputCol='features')
p = Pipeline(stages=[f1]).fit(data)
data = p.transform(data)

train_data, test_data = data.randomSplit([0.8, 0.2], seed=0)

# Wrap the SparkML classifiers in TrainClassifier so they can be tuned together
lg = LogisticRegression()
rf = RandomForestClassifier()
gbt = GBTClassifier()
models = [lg, rf, gbt]
mml_models = [TrainClassifier(model=model, labelCol="label") for model in models]

# Define the hyperparameter search space
param_builder = HyperparamBuilder() \
    .addHyperparam(lg, lg.regParam, RangeHyperParam(0.1, 0.3)) \
    .addHyperparam(rf, rf.numTrees, DiscreteHyperParam([5, 10])) \
    .addHyperparam(rf, rf.maxDepth, DiscreteHyperParam([3, 5])) \
    .addHyperparam(gbt, gbt.maxBins, RangeHyperParam(8, 16)) \
    .addHyperparam(gbt, gbt.maxDepth, DiscreteHyperParam([3, 5]))

search_space = param_builder.build()
print(search_space)
random_space = RandomSpace(search_space)

best_model = TuneHyperparameters(evaluationMetric="accuracy", models=mml_models, numFolds=2,
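                                 # The call above is cut off in the original snippet; the remaining
                                 # arguments below are an assumed completion based on the library's
                                 # hyperparameter-tuning example (numRuns, parallelism, paramSpace,
                                 # seed), not code from the original.
                                 numRuns=len(mml_models) * 2, parallelism=1,
                                 paramSpace=random_space.space(), seed=0).fit(train_data)

# Inspect the selected model (method name also taken from that example, so treat it as an assumption)
print(best_model.getBestModelInfo())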
import pyspark
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression

spark = pyspark.sql.SparkSession.builder.appName("MyApp")\
    .config("spark.jars.packages", "com.microsoft.ml.spark:mmlspark_2.11:0.18.1")\
    .getOrCreate()

import mmlspark
from mmlspark.lightgbm import LightGBMClassifier
from mmlspark.train import TrainClassifier, ComputeModelStatistics

# Load the data and split it into training and test sets
df = spark.read.csv(r"C:\Users\yanrujing\Desktop\breast_cancer.csv", header=True, inferSchema=True)
train_data, test_data = df.randomSplit([0.8, 0.2], seed=0)
print(df.limit(10).toPandas())

# Train a logistic regression model and compute its metrics on the test set
model = TrainClassifier(model=LogisticRegression(), labelCol="class", numFeatures=256).fit(train_data)
prediction = model.transform(test_data)
metrics = ComputeModelStatistics().transform(prediction)
print(metrics.limit(10).toPandas())

# Assemble the predictor columns into a feature vector and index the label
f1 = VectorAssembler(inputCols=[
    'clump_thickness', 'unif_cell_size', 'unif_cell_shape', 'marg_adhesion',
    'single_epith_cell_size', 'bare_nuclei', 'bland_chrom', 'norm_nucleoli', 'mitoses'
], outputCol='features')
f2 = StringIndexer(inputCol='class', outputCol='label')
p = Pipeline(stages=[f1, f2]).fit(df)
data = p.transform(df)
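
# The LightGBMClassifier imported above is otherwise unused in this script; the lines below are an
# illustrative sketch (not part of the original) showing it dropped into the same
# TrainClassifier / ComputeModelStatistics pattern used for LogisticRegression earlier.
lgbm_model = TrainClassifier(model=LightGBMClassifier(), labelCol="class", numFeatures=256).fit(train_data)
lgbm_prediction = lgbm_model.transform(test_data)
lgbm_metrics = ComputeModelStatistics().transform(lgbm_prediction)
print(lgbm_metrics.limit(10).toPandas())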