## Featurization of the text starts here
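# Assumed imports for this snippet (hedged: the names below follow the
# pre-1.0 `mmlspark` package and the Azure ML Workbench logging API):
from mmlspark import TextFeaturizer, TrainClassifier, ComputeModelStatistics
from pyspark.ml.classification import DecisionTreeClassifier
from azureml.logging import get_azureml_logger

run_logger = get_azureml_logger()

# `df` is assumed to be a Spark DataFrame with a text column "itemdesc"
# and a "label" column.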
print("Generating Features")
textFeaturizer = TextFeaturizer() \
    .setInputCol("itemdesc") \
    .setOutputCol("features") \
    .setUseStopWordsRemover(True) \
    .setUseIDF(True) \
    .setMinDocFreq(5) \
    .setNumFeatures(1 << 10) \
    .fit(df)

processedData = textFeaturizer.transform(df).select("features", "label")

print("Splitting the data into train, test sets")
train, test = processedData.randomSplit([0.70, 0.30])

print("Fitting the model Starts")
model = TrainClassifier(model=DecisionTreeClassifier(),
                        labelCol="label").fit(train)

print("Generating model scores with the test data")
prediction = model.transform(test)
metrics = ComputeModelStatistics().transform(prediction)
print("Best model's accuracy on validation set = " +
      "{0:.2f}%".format(metrics.first()["accuracy"] * 100))

run_logger.log('Accuracy', metrics.first()["accuracy"])
# Save the trained model for scoring later
model.write().overwrite().save(
    "wasbs://srramhdispark-2018-03-28t20-34-23-500z@srramstorage.blob.core.windows.net/HdiNotebooks/PySpark/POClassificationmmlspark.mml"
)
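# Hedged scoring sketch for reusing the persisted model later (assumption:
# `TrainedClassifierModel` is the load-side class in the same mmlspark API;
# `newData` is a placeholder DataFrame already run through the TextFeaturizer):
# from mmlspark import TrainedClassifierModel
# scorer = TrainedClassifierModel.load("<path to the saved .mml model>")
# scored = scorer.transform(newData)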
Example #2
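# Assumed imports for this snippet (hedged: same pre-1.0 mmlspark and Azure ML
# Workbench APIs as in the example above; `data` is assumed to be a DataFrame
# whose string label column is named " income", leading space included):
import sys

from mmlspark import TrainClassifier, ComputeModelStatistics
from pyspark.ml.classification import LogisticRegression
from azureml.logging import get_azureml_logger

run_logger = get_azureml_logger()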
# Split data into train and test.
train, test = data.randomSplit([0.75, 0.25], seed=123)

print("********* TRAINING DATA ***********")
print(train.limit(10).toPandas())

# Load the regularization rate from the first command-line argument, if given.
reg = 0.1
if len(sys.argv) > 1:
    reg = float(sys.argv[1])
print("Regularization Rate is {}.".format(reg))

# Use TrainClassifier from mmlspark to train a logistic regression model.
# Notice that we don't have to do any one-hot encoding or vectorization, and
# we don't need to convert the label column from string to binary: mmlspark
# handles all of these tasks for us.
model = TrainClassifier(model=LogisticRegression(regParam=reg),
                        labelCol=" income",  # leading space matches the raw column name
                        numFeatures=256).fit(train)
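# For contrast, a hedged sketch of roughly what plain pyspark.ml would need
# instead (column names `cat_col` and `num_col` are hypothetical, not taken
# from this dataset):
# from pyspark.ml import Pipeline
# from pyspark.ml.feature import StringIndexer, VectorAssembler
# stages = [StringIndexer(inputCol=" income", outputCol="label"),
#           StringIndexer(inputCol="cat_col", outputCol="cat_idx"),
#           VectorAssembler(inputCols=["cat_idx", "num_col"],
#                           outputCol="features"),
#           LogisticRegression(regParam=reg)]
# manual_model = Pipeline(stages=stages).fit(train)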
run_logger.log("Regularization Rate", reg)

# predict on the test dataset
prediction = model.transform(test)

# compute model metrics
metrics = ComputeModelStatistics().transform(prediction)

print("******** MODEL METRICS ************")
print("Accuracy is {}.".format(metrics.collect()[0]['accuracy']))
print("Precision is {}.".format(metrics.collect()[0]['precision']))
print("Recall is {}.".format(metrics.collect()[0]['recall']))
print("AUC is {}.".format(metrics.collect()[0]['AUC']))
Example #3
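# Assumed imports for this snippet (hedged: same mmlspark API as above;
# `dataFile` is assumed to hold the path to a CSV with "feature1",
# "feature2", and "label" columns):
import numpy as np
import pandas as pd
import pyspark.sql

from mmlspark import TrainClassifier, ComputeModelStatistics
from pyspark.ml.classification import LogisticRegression
from azureml.logging import get_azureml_logger

run_logger = get_azureml_logger()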
spark = pyspark.sql.SparkSession.builder.appName("MyApp").getOrCreate()

# Create a Spark dataframe out of the csv file.
data = spark.createDataFrame(
    pd.read_csv(dataFile,
                dtype={
                    "feature1": np.float64,
                    "feature2": string,
                    "label": string
                }))

# Split data into train and test.
train, test = data.randomSplit([0.75, 0.25], seed=123)

# Train a model
model = TrainClassifier(model=LogisticRegression(),
                        labelCol="label").fit(train)

# Predict on the test dataset, then evaluate the model.
prediction = model.transform(test)
metrics = ComputeModelStatistics().transform(prediction)
print("******** MODEL METRICS ************")
print("Accuracy is {}.".format(metrics.collect()[0]['accuracy']))
print("Precision is {}.".format(metrics.collect()[0]['precision']))
print("Recall is {}.".format(metrics.collect()[0]['recall']))
print("AUC is {}.".format(metrics.collect()[0]['AUC']))

# log accuracy and AUC
run_logger.log("Accuracy", metrics.collect()[0]['accuracy'])
run_logger.log("AUC", metrics.collect()[0]['AUC'])

######## Persist the Model ########
model.write().overwrite().save("mySparkMLModel.mml")