# --- Featurize free text, train a decision-tree classifier, evaluate, persist ---
print("Generating Features")
# Tokenize "itemdesc", drop stop words, hash into 1024 (1 << 10) feature slots,
# and apply IDF weighting; terms must appear in at least 5 documents.
textFeaturizer = (
    TextFeaturizer()
    .setInputCol("itemdesc")
    .setOutputCol("features")
    .setUseStopWordsRemover(True)
    .setUseIDF(True)
    .setMinDocFreq(5)
    .setNumFeatures(1 << 10)
    .fit(df)
)
# NOTE: the original re-assigned the existing "label" column to itself via
# withColumn, which is a no-op; a plain select is equivalent and clearer.
processedData = textFeaturizer.transform(df).select(["features", "label"])

print("Splitting the data into train, test sets")
train, test = processedData.randomSplit([0.70, 0.30])

print("Fitting the model Starts")
model = TrainClassifier(model=DecisionTreeClassifier(), labelCol="label").fit(train)

print("Generating model scores with the test data")
prediction = model.transform(test)
metrics = ComputeModelStatistics().transform(prediction)

# Collect the metrics row once: each .first() call would otherwise launch a
# separate Spark job over the same DataFrame.
metrics_row = metrics.first()
print("Best model's accuracy on validation set = "
      + "{0:.2f}%".format(metrics_row["accuracy"] * 100))
run_logger.log('Accuracy', metrics_row["accuracy"])

# Save the trained model for scoring later
model.write().overwrite().save(
    "wasbs://srramhdispark-2018-03-28t20-34-23-500z@srramstorage.blob.core.windows.net/HdiNotebooks/PySpark/POClassificationmmlspark.mml"
)
# --- Train a regularized logistic-regression model on the income dataset ---
# Split data into train and test.
train, test = data.randomSplit([0.75, 0.25], seed=123)
print("********* TRAINING DATA ***********")
print(train.limit(10).toPandas())

# Load Regularization Rate from the first command-line argument, if given.
reg = 0.1
if len(sys.argv) > 1:
    reg = float(sys.argv[1])
print("Regularization Rate is {}.".format(reg))

# Use TrainClassifier in mmlspark to train a logistic regression model.
# Notice that we don't have to do any one-hot encoding or vectorization,
# nor convert the string label column to binary — mmlspark handles all of that.
# NOTE(review): the leading space in " income" looks intentional (the raw
# census CSV has space-prefixed headers) — confirm against the data source.
model = TrainClassifier(
    model=LogisticRegression(regParam=reg),
    labelCol=" income",
    numFeatures=256,
).fit(train)
run_logger.log("Regularization Rate", reg)

# Predict on the test dataset.
prediction = model.transform(test)

# Compute model metrics. Collect once: the original called .collect() four
# times, which launched four identical Spark jobs over the same DataFrame.
metrics = ComputeModelStatistics().transform(prediction)
metrics_row = metrics.collect()[0]
print("******** MODEL METRICS ************")
print("Accuracy is {}.".format(metrics_row['accuracy']))
print("Precision is {}.".format(metrics_row['precision']))
print("Recall is {}.".format(metrics_row['recall']))
print("AUC is {}.".format(metrics_row['AUC']))
# --- Load a CSV into Spark, train logistic regression, evaluate, persist ---
spark = pyspark.sql.SparkSession.builder.appName("MyApp").getOrCreate()

# Create a Spark dataframe out of the csv file.
# BUG FIX: the original passed the undefined name `string` as a dtype, which
# raises NameError at runtime; pandas expects the builtin `str` here.
data = spark.createDataFrame(
    pd.read_csv(dataFile, dtype={
        "feature1": np.float64,
        "feature2": str,
        "label": str,
    }))

# Split data into train and test.
train, test = data.randomSplit([0.75, 0.25], seed=123)

# Train a model.
model = TrainClassifier(model=LogisticRegression(), labelCol="label").fit(train)

# BUG FIX: `prediction` was used below but never defined — score the test set
# before computing statistics on it.
prediction = model.transform(test)

# Evaluate the model. Collect the metrics row once instead of triggering a
# separate Spark job for each of the six .collect() calls.
metrics = ComputeModelStatistics().transform(prediction)
metrics_row = metrics.collect()[0]
print("******** MODEL METRICS ************")
print("Accuracy is {}.".format(metrics_row['accuracy']))
print("Precision is {}.".format(metrics_row['precision']))
print("Recall is {}.".format(metrics_row['recall']))
print("AUC is {}.".format(metrics_row['AUC']))

# log accuracy and AUC
run_logger.log("Accuracy", metrics_row['accuracy'])
run_logger.log("AUC", metrics_row['AUC'])

######## Persist the Model ######
model.write().overwrite().save("mySparkMLModel.mml")