def predict_class(df: DataFrame, vector_assembler: VectorAssembler,
                  lrModel: LogisticRegression):
    """Predict a class name for every row of ``df`` and write it out as CSV.

    Args:
        df (DataFrame): Spark dataframe holding the rows to classify.
        vector_assembler (VectorAssembler): assembler whose ``transform``
            produces the ``features`` column.
        lrModel (LogisticRegression): model whose ``transform`` adds the
            ``prediction`` column.

    The function returns nothing; its only output is the single-column
    (``class``) CSV with a header saved to ``out/out_3_2.txt``.
    """
    # Only the assembled feature vector is handed to the model.
    features_df = vector_assembler.transform(df).select('features')
    scored = lrModel.transform(features_df)

    # Predictions 0 and 1 get their own names; every other value is
    # labelled Iris-virginica.
    prediction = col('prediction')
    class_name = (
        when(prediction == 0, 'Iris-setosa')
        .when(prediction == 1, 'Iris-versicolor')
        .otherwise('Iris-virginica')
    )
    labelled = scored.withColumn('class', class_name)

    # coalesce(1) so the output is written from a single partition.
    writer = labelled.select('class').coalesce(1).write
    writer.option("header", "true").format('csv').save('out/out_3_2.txt')
# In[13]: from pyspark.ml.classification import LogisticRegression # Evaluate model based on auc ROC from pyspark.ml.evaluation import BinaryClassificationEvaluator # Evaluate model based on F1 socre from pyspark.ml.evaluation import MulticlassClassificationEvaluator # Evaluate model based on confusion matrix from pyspark.mllib.evaluation import MulticlassMetrics # model on training data regPara: lasso regularisation parameter (L1) lrModel = LogisticRegression(regParam=0.2).fit(trainData) # make prediction on test data pred = lrModel.transform(testData) pred.select('label', 'prediction').show() evaluator1 = BinaryClassificationEvaluator(labelCol='label', metricName="areaUnderROC") evaluator2 = MulticlassClassificationEvaluator(labelCol='label', metricName="f1") metrics = MulticlassMetrics(pred.select('label', 'prediction').rdd.map(tuple)) print('AUC ROC of Logistic Regression model is %f' % evaluator1.evaluate(pred)) print('F1 score of Logistic Regression model is %f' % evaluator2.evaluate(pred)) metrics.confusionMatrix().toArray().transpose() # <a id="context322"></a>
def log_reg_train(train_data, test_data):
    """Fit a logistic regression and return its test-set predictions.

    The model reads the ``features`` and ``label`` columns of ``train_data``.
    The result is ``test_data`` after the fitted model's ``transform``,
    reduced to the ``label`` and ``prediction`` columns.
    """
    estimator = LogisticRegression(featuresCol='features', labelCol='label')
    return (
        estimator.fit(train_data)
        .transform(test_data)
        .select("label", "prediction")
    )
# Rows predicted as 1 whose label disagrees with the prediction (false positives)
FP = prediction.filter('prediction = 1 AND label != prediction').count()

# Accuracy measures the proportion of correct predictions.
# NOTE(review): TN, TP and FN are not defined in this excerpt -- presumably
# counted just above; confirm.
accuracy = (TN + TP) / (TN + TP + FN + FP)
print(accuracy)

# --------------------------------------------------
# Exercise_8
# Import the logistic regression class
from pyspark.ml.classification import LogisticRegression

# Create a classifier object and train on training data
logistic = LogisticRegression().fit(flights_train)

# Create predictions for the testing data and show confusion matrix
prediction = logistic.transform(flights_test)
prediction.groupBy('label', 'prediction').count().show()

# --------------------------------------------------
# Exercise_9
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Calculate precision and recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('precision = {:.2f}\nrecall = {:.2f}'.format(precision, recall))

# Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision = multi_evaluator.evaluate(
    prediction, {multi_evaluator.metricName: "weightedPrecision"})
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the fitted model on test_data; the evaluator below reads the
# result's `.predictions`.
predictions = model.evaluate(test_data)

# COMMAND ----------

# Score the `prediction` column against `label`.
# NOTE(review): no metricName is passed, so the evaluator's default metric is
# used, and it is fed the hard prediction rather than a raw score -- confirm
# this is the intended measurement.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label')
evaluator.evaluate(predictions.predictions)

# COMMAND ----------

# Achieved 96.3% accuracy on test data, let's predict on the unseen test data for kaggle submission.
# NOTE(review): the figure comes from the BinaryClassificationEvaluator cell
# above, which may not be reporting accuracy -- confirm which metric it is.

# COMMAND ----------

# Predict with the fitted model and show features next to predictions
results = model.transform(test_data)
results.select('features', 'prediction').show()

# COMMAND ----------

results.count()

# COMMAND ----------

# NOTE(review): same steps as the two cells above, but on `testData` instead
# of `test_data`; `results` is overwritten. Presumably this is the unseen
# submission set -- verify.
results = model.transform(testData)
results.select('features', 'prediction').show()

# COMMAND ----------

results.count()
# Three roughly equal, reproducible slices of the training data
train_1, train_2, train_3 = train_data.randomSplit([0.33, 0.33, 0.34], seed=1234)

# 1) Training 1 individual model with all data
print('**********')
print('For 1 Model')
print('**********')

# Training the Logistic Regression Model
classifier = LogisticRegression(featuresCol='features', labelCol='label')
classifier = classifier.fit(train_data)

# Making predictions
pred = classifier.transform(test_data)
print('Predictions:')
pred.show(10)

# Model Accuracy: share of test rows whose prediction equals the label
cm = pred.select("label", "prediction")
cm.show()
acc = cm.filter(cm.label == cm.prediction).count() / cm.count()
# acc is a fraction in [0, 1]; format it as a percentage instead of
# appending a literal '%' to the raw fraction.
print(f'Accuracy for one model: {acc:.2%}')

# 2) Training 3 individual models parallelly with 1/3 data each
print('**********')
print('For 3 models trained separately used as an Ensemble')
# Re-weight the hashed symbols ("hash" column) into TF-IDF "features"
idf = IDF(inputCol="hash", outputCol="features")
sms = idf.fit(hashed).transform(hashed)

# Show the first four records without truncating the columns
sms.show(4, truncate=False)

# 80/20 train/test split with a fixed seed
sms_train, sms_test = sms.randomSplit([0.8, 0.2], seed=13)

# Train a regularised Logistic Regression model on the training split
logistic = LogisticRegression(regParam=0.2).fit(sms_train)

# Predict on the testing split
prediction = logistic.transform(sms_test)

# Confusion matrix: row counts per (label, prediction) pair
prediction.groupBy("label", 'prediction').count().show()

# Accuracy, weighted precision and weighted recall from one evaluator,
# overriding its metric name per call
multi_evaluator = MulticlassClassificationEvaluator()
metric_key = multi_evaluator.metricName
accuracy = multi_evaluator.evaluate(prediction, {metric_key: "accuracy"})
weighted_precision = multi_evaluator.evaluate(prediction, {metric_key: "weightedPrecision"})
weighted_recall = multi_evaluator.evaluate(prediction, {metric_key: "weightedRecall"})

# Evaluator used for AUC
binary_evaluator = BinaryClassificationEvaluator()
# Columns kept for modelling and for inspecting the predictions
selectedcols = ["label", "features", "hours_per_week", "income"]
dataset = preppedDataDF.select(selectedcols)

# COMMAND ----------

# Seeded 70/30 split, then fit a default logistic regression
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=122423)
lrModel = LogisticRegression().fit(trainingData)
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# COMMAND ----------

# Look for the "hours_per_week" value where the model switches from
# predicting income "<= 50k" to income ">50k"
predictions = lrModel.transform(testData)
inspection_cols = predictions.select(
    "income", "label", "prediction", "probability", "hours_per_week")
selected = inspection_cols.filter("hours_per_week > 65 and hours_per_week < 69")
display(selected)

# COMMAND ----------

# Evaluate. Note only 2 metrics are supported out of the box by Spark ML.
bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
bce.setMetricName('areaUnderROC')
au_roc = bce.evaluate(predictions)
bce.setMetricName('areaUnderPR')
au_prc = bce.evaluate(predictions)

# Confusion-matrix cells counted straight from the prediction rows
truePositive = predictions.filter("label = 1 and prediction = 1").count()
falsePositive = predictions.filter("label = 0 and prediction = 1").count()
trueNegative = predictions.filter("label = 0 and prediction = 0").count()
falseNegative = predictions.filter("label = 1 and prediction = 0").count()
# Fit the preprocessing pipeline, then run the same dataframe through it
pipeline = pipeline.fit(df)
df = pipeline.transform(df)

# Model: logistic regression with no regularisation, 0.5 decision threshold
lr = LogisticRegression(
    featuresCol="features",
    labelCol=TARGET,
    predictionCol="predictions",
    maxIter=10,
    regParam=0.0,
    elasticNetParam=0.0,
    threshold=0.5,
).fit(df)
df = lr.transform(df)

# Print each training-summary metric under its heading; attributes are read
# one at a time, in the order they are printed.
summary = lr.summary
for heading, attribute in (
        ("Labels", "labels"),
        ("Accuracy", "accuracy"),
        ("Precision by Label", "precisionByLabel"),
        ("Recall by Label", "recallByLabel"),
):
    print(heading)
    print(getattr(summary, attribute))
# Reference https://chrisalbon.com/machine_learning/trees_and_forests/random_forest_classifier_example/
import time

from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, MultilayerPerceptronClassifier, LinearSVC, OneVsRest, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark import SparkConf, SparkContext, SQLContext

# Local Spark context using all cores; `spark` is the SQLContext used for reading
conf = SparkConf().setMaster("local[*]")
sc = SparkContext(conf=conf)
spark = SQLContext(sc)

# Raw string: the backslashes in the Windows path are literal characters,
# not (invalid) escape sequences.
data = spark.read.format("libsvm").load(
    r"D:\Outils\Spark\data\mllib\iris_libsvm.txt")
(train, test) = data.randomSplit([0.8, 0.2])

# Pick one classifier; the commented lines are drop-in alternatives
model = LogisticRegression()
# model = DecisionTreeClassifier()
# model = RandomForestClassifier()
# model = MultilayerPerceptronClassifier(layers=[4, 3])
# model = OneVsRest(classifier=LinearSVC())
# model = NaiveBayes()

model = model.fit(train)
predictions = model.transform(test)

# Accuracy of the fitted model on the held-out split
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
score = evaluator.evaluate(predictions)
print('Accuracy: ', score)
# Estimator: in: DataFrame => out: Transformer
sIndexer_02 = StringIndexer(inputCol="label", outputCol="indexed02")
si_model_02 = sIndexer_02.fit(train_data)
(trainingData02, testData02) = train_data.randomSplit([0.7, 0.3])
td_02 = si_model_02.transform(trainingData02)

# NB cannot take negative values
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")

# LR
from pyspark.ml.classification import LogisticRegression, GBTClassifier, RandomForestClassifier
model_LR = LogisticRegression(maxIter=5, regParam=0.01)
model_LR = model_LR.fit(train_data)
# NOTE(review): `testData` is not defined in this excerpt (only `testData02`
# is) -- confirm it exists upstream.
predict_lr_testData = model_LR.transform(testData)


# Compute accuracy
def computeAcc(data):
    """Return the accuracy of ``data`` (needs ``label`` and ``prediction`` columns).

    Counts the rows whose prediction differs from the label, prints the
    mismatch count, the row count and the accuracy, and returns the accuracy
    as ``1 - mismatches / total``. The original returned ``mismatches /
    total``, i.e. the error rate, under the name "acc".

    Raises:
        ZeroDivisionError: if ``data`` has no rows.
    """
    err = data.filter(data['label'] != data['prediction']).count()
    total = data.count()
    acc = 1.0 - float(err) / total
    # %-formatting gives the same space-separated output on Python 2 and 3.
    print("%s %s %s" % (err, total, acc))
    return acc


# GBT, trained on the string-indexed label column
gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed02")
model_gbt = gbt.fit(td_02)
predict_gbt_testData = model_gbt.transform(testData02)
test_df.show(10)

print("Training Dataset Count: " + str(train_df.count()))
print("Test Dataset Count: " + str(test_df.count()))

##################################################
# MODELING
##################################################

##################################################
# Logistic Regression
##################################################

# Fit on the training split, then score the test split
log_reg = LogisticRegression(featuresCol='features', labelCol='label')
log_model = log_reg.fit(train_df)
y_pred = log_model.transform(test_df)
y_pred.show()

y_pred.select("label", "prediction").show()

# Share of rows whose prediction equals the label (notebook-style bare
# expression: the value is computed but not stored)
y_pred.filter(y_pred.label == y_pred.prediction).count() / y_pred.count()

evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="prediction",
    metricName='areaUnderROC',
)
evaluatorMulti = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
)

# One evaluator, with the metric name overridden per call
metric_name = evaluatorMulti.metricName
acc = evaluatorMulti.evaluate(y_pred, {metric_name: "accuracy"})
precision = evaluatorMulti.evaluate(y_pred, {metric_name: "precisionByLabel"})
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .getOrCreate()
)

# The CSV location is the first command-line argument
fileStore = sys.argv[1]
df = (
    spark.read.format("csv")
    .options(inferSchema=True, header=True)
    .load(fileStore)
)

# Keep every column that is not listed in `remove`
valids = [column for column in df.columns if column not in remove]
df = df.select(valids)
#printc("%s" % df.dtypes)

inputs, df = vectorizeData(df=df, labelsCol=LEAVE)
train, test = df.randomSplit([0.7, 0.3], seed=12345)

# Train Logistic Regression and predict on the held-out split
lr = LogisticRegression(regParam=0.01).fit(train)
predictions = lr.transform(test)
evaluator = Evaluator()

# Select example rows to display.
#predictions.select("prediction", "label", "features").show()

# Evaluate the learned model
metric_name = evaluator.getMetricName()
metric_value = evaluator.evaluate(predictions)
print("Pensiones random deads Test %s: %f" % (metric_name, metric_value))

# Print important features
get_feature_importances(model=lr, featureNames=inputs, out_csv=out_csv)
# MAGIC %md # MAGIC #### Step 4: Fit our model by using the training data # MAGIC When we call the `.fit()` function, the pipeline stages are executed on the data in that dataset. # COMMAND ---------- lrModel = LogisticRegression().fit(trainingData) # COMMAND ---------- # MAGIC %md # MAGIC #### Step 5: Run our test data through the fit model, and view the predicted results for model evaluation # COMMAND ---------- predictionsDF = (lrModel.transform(testData)).select("income", "label", "prediction", "probability") # COMMAND ---------- predictionsDF.registerTempTable("incomePredictionsOutputDF") # COMMAND ---------- # MAGIC %sql # MAGIC # MAGIC SELECT # MAGIC * # MAGIC FROM incomePredictionsOutputDF
stages_lr = stages.copy() inputCols = ['norm_cols'] + [ cname + "classVec" for cname in categorical_cols if cname != 'native_country' ] final_assembler = VectorAssembler(inputCols=inputCols, outputCol='features') stages_lr += [final_assembler] pipeline = Pipeline(stages=stages_lr) train_lr = pipeline.fit(train).transform(train) test_lr = pipeline.fit(test).transform(test) lr = LogisticRegression(featuresCol='features', labelCol='label').fit(train_lr) res_lr = lr.transform(test_lr) #----------------- Decision and Random Forest ----------------- # Final assembly inputCols = ['norm_cols' ] + [cname + "classVec" for cname in categorical_cols] final_assembler = VectorAssembler(inputCols=inputCols, outputCol='features') stages += [final_assembler] pipeline = Pipeline(stages=stages) train_final = pipeline.fit(train).transform(train) test_final = pipeline.fit(test).transform(test) dt = DecisionTreeClassifier(featuresCol='features',