Exemple #1
0
# COMMAND ----------

# MAGIC %md `TrainClassifier` wraps SparkML classifiers and can be used to initialize and fit a model.
# MAGIC You can use `help(mmlspark.train.TrainClassifier)` to view the different parameters.
# MAGIC
# MAGIC Note that it implicitly converts the data into the format expected by the algorithm: it tokenizes
# MAGIC and hashes strings, one-hot encodes categorical variables, assembles the features into a vector,
# MAGIC and so on.  The parameter `numFeatures` controls the number of hashed features.

# COMMAND ----------

from mmlspark.train import TrainClassifier
from pyspark.ml.classification import LogisticRegression
# Fit a logistic regression through TrainClassifier on `train`, using the
# "income" column as the label and 256 hashed features.
model = TrainClassifier(
    model=LogisticRegression(),
    labelCol="income",
    numFeatures=256,
).fit(train)

# COMMAND ----------

# MAGIC %md After the model is trained, we score it against the test dataset and view metrics.

# COMMAND ----------

from mmlspark.train import ComputeModelStatistics, TrainedClassifierModel
# Score the `test` DataFrame with the fitted model.
prediction = model.transform(test)
# Compute evaluation metrics over the scored rows.
metrics = ComputeModelStatistics().transform(prediction)
# Bring at most 10 rows of the metrics into pandas; as the last expression of the
# cell this is presumably what the notebook renders.
metrics.limit(10).toPandas()

# COMMAND ----------
# COMMAND ----------

# MAGIC %md `TrainClassifier` wraps SparkML classifiers and can be used to initialize and fit a model.
# MAGIC You can use `help(mmlspark.train.TrainClassifier)` to view the different parameters.
# MAGIC
# MAGIC Note that it implicitly converts the data into the format expected by the algorithm. More specifically, it
# MAGIC tokenizes and hashes strings, one-hot encodes categorical variables, assembles the features into a vector,
# MAGIC etc.  The parameter `numFeatures` controls the number of hashed features.

# COMMAND ----------

from mmlspark.train import TrainClassifier
from pyspark.ml.classification import LogisticRegression
# Train the classifier: TrainClassifier wraps a default LogisticRegression,
# reads the label from "income", and hashes into 256 features.
model = TrainClassifier(
    model=LogisticRegression(),
    labelCol="income",
    numFeatures=256,
).fit(train)

# COMMAND ----------

# MAGIC %md After the model is trained, we score it against the test dataset and view metrics.

# COMMAND ----------

from mmlspark.train import ComputeModelStatistics, TrainedClassifierModel
# Score `test` with the fitted model, then print the schema of the scored DataFrame.
prediction = model.transform(test)
prediction.printSchema()

# COMMAND ----------

# Compute evaluation statistics over the scored rows in `prediction`.
metrics = ComputeModelStatistics().transform(prediction)
# COMMAND ----------

# MAGIC %md Next, define the models that will be tuned:

# COMMAND ----------

from mmlspark.automl import TuneHyperparameters
from mmlspark.train import TrainClassifier
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
# Candidate SparkML estimators, all created with their default settings.
logReg = LogisticRegression()
randForest = RandomForestClassifier()
gbt = GBTClassifier()
smlmodels = [logReg, randForest, gbt]

# Wrap every estimator in a TrainClassifier that reads its label from "Label".
mmlmodels = [TrainClassifier(model=estimator, labelCol="Label") for estimator in smlmodels]

# COMMAND ----------

# MAGIC %md We can specify the hyperparameters using the HyperparamBuilder.
# MAGIC We can add either DiscreteHyperParam or RangeHyperParam hyperparameters.
# MAGIC TuneHyperparameters will randomly choose values from a uniform distribution.

# COMMAND ----------

from mmlspark.automl import *

paramBuilder = \
  HyperparamBuilder() \
    .addHyperparam(logReg, logReg.regParam, RangeHyperParam(0.1, 0.3)) \
Exemple #4
0
# MAGIC    trained models by finding the model which performs best on the `test`
# MAGIC    dataset given the specified metric
# MAGIC 
# MAGIC 3. The **`ComputeModelStatistics`** Transformer computes the different
# MAGIC    metrics on a scored dataset (in our case, the `validation` dataset)
# MAGIC    at the same time

# COMMAND ----------

from mmlspark.train import TrainClassifier, ComputeModelStatistics
from mmlspark.automl import FindBestModel

# Prepare data for learning
# Seeded 60/20/20 split: `train` fits the candidates, `test` picks the winner,
# and `validation` is kept back for the final score.
train, test, validation = data.randomSplit([0.60, 0.20, 0.20], seed=123)

# One logistic regression per regularization strength, each fitted on `train`.
lrHyperParams = [0.05, 0.1, 0.2, 0.4]
logisticRegressions = [
    LogisticRegression(regParam=reg_param) for reg_param in lrHyperParams
]
lrmodels = [
    TrainClassifier(model=estimator, labelCol="label", numFeatures=10000).fit(train)
    for estimator in logisticRegressions
]

# Keep whichever fitted model has the best AUC on `test`.
bestModel = FindBestModel(evaluationMetric="AUC", models=lrmodels).fit(test)

# Score the `validation` split and report its AUC as a percentage.
predictions = bestModel.transform(validation)
metrics = ComputeModelStatistics().transform(predictions)
print(
    "Best model's AUC on validation set = "
    + "{0:.2f}%".format(metrics.first()["AUC"] * 100)
)
Exemple #5
0
# COMMAND ----------

# MAGIC %md Generate several models with different parameters from the training data.

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from mmlspark.train import TrainClassifier
import itertools

# Logistic regression candidates: one per regularization strength, fitted on `ptrain`.
lrHyperParams = [0.05, 0.2]
logisticRegressions = [LogisticRegression(regParam=reg_param) for reg_param in lrHyperParams]
lrmodels = [
    TrainClassifier(model=estimator, labelCol="label").fit(ptrain)
    for estimator in logisticRegressions
]

# Random forest candidates: every (numTrees, maxDepth) pair from the two lists.
# itertools.product yields a one-shot iterator, which the comprehension consumes.
rfHyperParams = itertools.product([5, 10], [2, 3])
randomForests = [
    RandomForestClassifier(numTrees=num_trees, maxDepth=max_depth)
    for num_trees, max_depth in rfHyperParams
]
rfmodels = [
    TrainClassifier(model=estimator, labelCol="label").fit(ptrain)
    for estimator in randomForests
]

gbtHyperParams = itertools.product([8, 16], [2, 3])
gbtclassifiers = [
# Preview the first five rows of processedData as a pandas DataFrame.
processedData.limit(5).toPandas()

# COMMAND ----------

# MAGIC %md Train several Logistic Regression models with different regularizations.

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
from mmlspark.train import TrainClassifier

# Split 60/20/20 into train/test/validation. A fixed seed keeps the split, and
# therefore any comparison between the fitted models, repeatable across runs.
train, test, validation = processedData.randomSplit([0.60, 0.20, 0.20], seed=123)

# One untrained LogisticRegression per regularization strength to compare.
lrHyperParams = [0.05, 0.1, 0.2, 0.4]
logisticRegressions = [LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams]

# Fit every candidate on `train`, reading the label from the "label" column.
lrmodels = [TrainClassifier(model=lrm, labelCol="label").fit(train) for lrm in logisticRegressions]

# COMMAND ----------

# MAGIC %md Find the model with the best AUC on the test set.

# COMMAND ----------

from mmlspark.automl import FindBestModel, BestModel
# Evaluate the fitted candidates on `test` and keep the one with the best AUC.
bestModel = FindBestModel(evaluationMetric="AUC", models=lrmodels).fit(test)
# Print the tables exposed by the fitted result: evaluation results, the best
# model's metrics, and the metrics of all candidates (as the getter names suggest).
bestModel.getEvaluationResults().show()
bestModel.getBestModelMetrics().show()
bestModel.getAllModelMetrics().show()


# COMMAND ----------
# COMMAND ----------

# MAGIC %md Featurize images

# COMMAND ----------

# Run cntkModel over the labeled images and keep only the "features" and "labels" columns.
featurizedImages = cntkModel.transform(imagesWithLabels).select(["features","labels"])

# COMMAND ----------

# MAGIC %md Use featurized images to train a classifier

# COMMAND ----------

from mmlspark.train import TrainClassifier
from pyspark.ml.classification import RandomForestClassifier

# 75/25 train/test split; no seed is passed to randomSplit here.
train, test = featurizedImages.randomSplit([0.75, 0.25])

# Fit a default random forest on the training split, with "labels" as the label column.
model = TrainClassifier(
    model=RandomForestClassifier(),
    labelCol="labels",
).fit(train)

# COMMAND ----------

# MAGIC %md Evaluate the accuracy of the model

# COMMAND ----------

from mmlspark.train import ComputeModelStatistics
# Score the `test` split with the trained classifier.
predictions = model.transform(test)
# Request the "accuracy" evaluation metric for the scored rows and print the result.
metrics = ComputeModelStatistics(evaluationMetric="accuracy").transform(predictions)
metrics.show()
Exemple #8
0
    'clump_thickness', 'unif_cell_size', 'unif_cell_shape', 'marg_adhesion',
    'single_epith_cell_size', 'bare_nuclei', 'bland_chrom', 'norm_nucleoli',
    'mitoses'
],
                     outputCol='features')
# Fit a single-stage pipeline around `f1` and apply it, then make a seeded 80/20 split.
p = Pipeline(stages=[f1]).fit(data)
data = p.transform(data)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=0)

# Candidate estimators, each wrapped in a TrainClassifier that reads the "label" column.
lg = LogisticRegression()
rf = RandomForestClassifier()
gbt = GBTClassifier()

models = [lg, rf, gbt]
mml_models = [TrainClassifier(model=estimator, labelCol="label") for estimator in models]

# Hyperparameter search space: ranges for regParam and maxBins, discrete choices
# for the tree counts and depths.
param_builder = (
    HyperparamBuilder()
    .addHyperparam(lg, lg.regParam, RangeHyperParam(0.1, 0.3))
    .addHyperparam(rf, rf.numTrees, DiscreteHyperParam([5, 10]))
    .addHyperparam(rf, rf.maxDepth, DiscreteHyperParam([3, 5]))
    .addHyperparam(gbt, gbt.maxBins, RangeHyperParam(8, 16))
    .addHyperparam(gbt, gbt.maxDepth, DiscreteHyperParam([3, 5]))
)

search_space = param_builder.build()
print(search_space)
random_space = RandomSpace(search_space)

best_model = TuneHyperparameters(evaluationMetric="accuracy",
                                 models=mml_models,
                                 numFolds=2,
Exemple #9
0
# Create (or reuse) a Spark session named "MyApp" that pulls in the mmlspark
# 0.18.1 package through spark.jars.packages.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.jars.packages", "com.microsoft.ml.spark:mmlspark_2.11:0.18.1")
    .getOrCreate()
)

import mmlspark
from mmlspark.lightgbm import LightGBMClassifier
from mmlspark.train import TrainClassifier, ComputeModelStatistics

# Load the breast-cancer CSV with a header row and inferred column types.
# NOTE(review): the path is absolute and machine-specific; adjust it before running elsewhere.
df = spark.read.csv(
    r"C:\Users\yanrujing\Desktop\breast_cancer.csv",
    header=True,
    inferSchema=True,
)

# Seeded 80/20 split, then print the first ten rows of the full dataset.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=0)
print(df.limit(10).toPandas())

# Fit a logistic regression through TrainClassifier on the training split,
# using "class" as the label column and 256 hashed features.
model = TrainClassifier(
    model=LogisticRegression(),
    labelCol="class",
    numFeatures=256,
).fit(train_data)

# Score the test split, compute statistics, and print up to ten rows of them.
prediction = model.transform(test_data)
metrics = ComputeModelStatistics().transform(prediction)
print(metrics.limit(10).toPandas())

# Stage 1: assemble the nine measurement columns into a single 'features' vector.
f1 = VectorAssembler(
    inputCols=[
        'clump_thickness', 'unif_cell_size', 'unif_cell_shape', 'marg_adhesion',
        'single_epith_cell_size', 'bare_nuclei', 'bland_chrom', 'norm_nucleoli',
        'mitoses',
    ],
    outputCol='features',
)
# Stage 2: index the 'class' column into a 'label' column.
f2 = StringIndexer(inputCol='class', outputCol='label')

# Fit both stages on the full DataFrame and apply them to it.
p = Pipeline(stages=[f1, f2]).fit(df)
data = p.transform(df)