# --- Featurize free text, train a decision-tree classifier, evaluate, persist ---
print("Generating Features")
# Tokenize "itemdesc", drop stop words, hash into 1024 (1 << 10) feature slots,
# and apply IDF weighting; terms must appear in at least 5 documents.
textFeaturizer = (
    TextFeaturizer()
    .setInputCol("itemdesc")
    .setOutputCol("features")
    .setUseStopWordsRemover(True)
    .setUseIDF(True)
    .setMinDocFreq(5)
    .setNumFeatures(1 << 10)
    .fit(df)
)
# NOTE: the original re-assigned the existing "label" column to itself via
# withColumn, which is a no-op; a plain select is equivalent and clearer.
processedData = textFeaturizer.transform(df).select(["features", "label"])

print("Splitting the data into train, test sets")
train, test = processedData.randomSplit([0.70, 0.30])

print("Fitting the model Starts")
model = TrainClassifier(model=DecisionTreeClassifier(), labelCol="label").fit(train)

print("Generating model scores with the test data")
prediction = model.transform(test)
metrics = ComputeModelStatistics().transform(prediction)

# Collect the metrics row once: each .first() call would otherwise launch a
# separate Spark job over the same DataFrame.
metrics_row = metrics.first()
print("Best model's accuracy on validation set = "
      + "{0:.2f}%".format(metrics_row["accuracy"] * 100))
run_logger.log('Accuracy', metrics_row["accuracy"])

# Save the trained model for scoring later
model.write().overwrite().save(
    "wasbs://srramhdispark-2018-03-28t20-34-23-500z@srramstorage.blob.core.windows.net/HdiNotebooks/PySpark/POClassificationmmlspark.mml"
)
# --- Train a regularized logistic-regression model on the income dataset ---
# Split data into train and test.
train, test = data.randomSplit([0.75, 0.25], seed=123)
print("********* TRAINING DATA ***********")
print(train.limit(10).toPandas())

# Load Regularization Rate from the first command-line argument, if given.
reg = 0.1
if len(sys.argv) > 1:
    reg = float(sys.argv[1])
print("Regularization Rate is {}.".format(reg))

# Use TrainClassifier in mmlspark to train a logistic regression model.
# Notice that we don't have to do any one-hot encoding or vectorization,
# nor convert the string label column to binary — mmlspark handles all of that.
# NOTE(review): the leading space in " income" looks intentional (the raw
# census CSV has space-prefixed headers) — confirm against the data source.
model = TrainClassifier(
    model=LogisticRegression(regParam=reg),
    labelCol=" income",
    numFeatures=256,
).fit(train)
run_logger.log("Regularization Rate", reg)

# Predict on the test dataset.
prediction = model.transform(test)

# Compute model metrics. Collect once: the original called .collect() four
# times, which launched four identical Spark jobs over the same DataFrame.
metrics = ComputeModelStatistics().transform(prediction)
metrics_row = metrics.collect()[0]
print("******** MODEL METRICS ************")
print("Accuracy is {}.".format(metrics_row['accuracy']))
print("Precision is {}.".format(metrics_row['precision']))
print("Recall is {}.".format(metrics_row['recall']))
print("AUC is {}.".format(metrics_row['AUC']))
# --- Load a CSV into Spark, train logistic regression, evaluate, persist ---
spark = pyspark.sql.SparkSession.builder.appName("MyApp").getOrCreate()

# Create a Spark dataframe out of the csv file.
# BUG FIX: the original passed the undefined name `string` as a dtype, which
# raises NameError at runtime; pandas expects the builtin `str` here.
data = spark.createDataFrame(
    pd.read_csv(dataFile, dtype={
        "feature1": np.float64,
        "feature2": str,
        "label": str,
    }))

# Split data into train and test.
train, test = data.randomSplit([0.75, 0.25], seed=123)

# Train a model.
model = TrainClassifier(model=LogisticRegression(), labelCol="label").fit(train)

# BUG FIX: `prediction` was used below but never defined — score the test set
# before computing statistics on it.
prediction = model.transform(test)

# Evaluate the model. Collect the metrics row once instead of triggering a
# separate Spark job for each of the six .collect() calls.
metrics = ComputeModelStatistics().transform(prediction)
metrics_row = metrics.collect()[0]
print("******** MODEL METRICS ************")
print("Accuracy is {}.".format(metrics_row['accuracy']))
print("Precision is {}.".format(metrics_row['precision']))
print("Recall is {}.".format(metrics_row['recall']))
print("AUC is {}.".format(metrics_row['AUC']))

# log accuracy and AUC
run_logger.log("Accuracy", metrics_row['accuracy'])
run_logger.log("AUC", metrics_row['AUC'])

######## Persist the Model ######
model.write().overwrite().save("mySparkMLModel.mml")