def binaryClassificationEvaluator(predictions):
    evaluator = BinaryClassificationEvaluator()
    print("Test Area Under ROC: " + str(
        evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))
    evaluator.getMetricName()
    return evaluator
def predictor(model, testdata):
    print("Running Predict Function")
    print("Predicting Model with RF Model:")
    predictions = model.transform(testdata)
    evaluator = BinaryClassificationEvaluator(labelCol="Label")
    print("Evaluation Metric", evaluator.evaluate(predictions) * 100)
    print("Metric Used", evaluator.getMetricName())
def auc(df, model):
    prediction = model.transform(df)
    evaluator = BinaryClassificationEvaluator()
    metric = evaluator.evaluate(prediction)
    metric_name = evaluator.getMetricName()
    return metric_name, metric
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol='Status')
evaluator.evaluate(results)
results.show(15)
evaluator.evaluate(results)

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='Status')
evaluator.evaluate(results)
evaluator.getMetricName()
print(log_reg.explainParams())
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(adulttrain)
lrmodel = lr.setParams(regParam=0.01, maxIter=500, fitIntercept=True).fit(adulttrain)
lrmodel.weights
lrmodel.intercept

#section 8.2.3
validpredicts = lrmodel.transform(adultvalid)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
bceval = BinaryClassificationEvaluator()
bceval.evaluate(validpredicts)
bceval.getMetricName()
bceval.setMetricName("areaUnderPR")
bceval.evaluate(validpredicts)

#section 8.2.5
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(5)
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [1000]).addGrid(
    lr.regParam, [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]).build()
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(adulttrain)
cvmodel.bestModel.weights
BinaryClassificationEvaluator().evaluate(
    cvmodel.bestModel.transform(adultvalid))
train_data, test_data = df_model2.randomSplit([.8, .2], seed=1234)
train_data.groupby('label').agg({'label': 'count'}).show()
test_data.groupby('label').agg({'label': 'count'}).show()

LR = LogisticRegression(labelCol="label", featuresCol="featureX", maxIter=10, regParam=0.3)
linearModel = LR.fit(train_data)
predictions = linearModel.transform(test_data)
predictions.printSchema()

selectedPredictions = predictions.select("label", "prediction", "probability")
selectedPredictions.show(8)
print('xxxxxxxxxx PREDICTIONS xxxxxxxxxx 10')

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print("AREA UNDER ROC:")
print(evaluator.evaluate(predictions))
print(evaluator.getMetricName())
print("TYPE OF TEST CONDUCTED: " + evaluator.getMetricName())
print('xxxxxxxxxx ROC METRICS xxxxxxxxxx 11')
print("")
print('xxxxxxxxxx DATA SCIENCE TWO #2 SUCCESSFUL xxxxxxxxxx DONE!')
# Let's check the first 20 results
predictions.show()

####################################################################################################
# 4. Evaluation of the Model
####################################################################################################
# Now that you are done with the prediction process, let's evaluate the model
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# First, you need to build an evaluator
# "rawPredictionCol" can be either rawPrediction or probability (either way will yield the same result)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# AUC and AUC(PR) are the metrics provided by the library
print(evaluator.getMetricName(), "The AUC of the Model is {}".format(
    evaluator.evaluate(predictions)))
print("The AUC under PR curve is {}".format(
    evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})))

# If you want to check the default metric
evaluator.getMetricName()

# Some other model attributes can be inspected as follows
print('Model Intercept: ', lrModel.interceptVector)
print('Model coefficientMatrix: ', lrModel.coefficientMatrix)

####################################################################################################
# 5. Param Setting
####################################################################################################
# To extract the best params from the model, let's use ParamGridBuilder along with CV,
# as sketched below
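# --- A minimal sketch (not from the original script) of the ParamGridBuilder + CrossValidator
# step referenced above. The estimator `lr`, the training DataFrame `train_df`, and the grid
# values are illustrative assumptions; the original code's names may differ.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

lr = LogisticRegression(featuresCol="features", labelCol="label")
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# Grid over regularization strength and elastic-net mixing; each fold is scored by areaUnderROC
grid = (ParamGridBuilder()
        .addGrid(lr.regParam, [0.01, 0.1, 0.5])
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
        .build())

cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=3)
cv_model = cv.fit(train_df)

# Average cross-validation metric for each grid point, and the best fitted model
print(evaluator.getMetricName(), "average CV metric per grid point:", cv_model.avgMetrics)
best_lr = cv_model.bestModel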
bestmodels(RFT_crossval, vectorized_CV_data, 2)
RFT_model = RFT_crossval.fit(vectorized_CV_data)
RFT_tree_model = RFT_model.bestModel.stages[2]

##### Prediction and model evaluation #####
vectorized_test_data = vectorizeData(final_test_data)
tf_data = CV_model.transform(vectorized_test_data)
GBT_tf_data = GBT_model.transform(vectorized_test_data)
RFT_tf_data = RFT_model.transform(vectorized_test_data)

################ Decision tree evaluation ###################################################
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print(f1_evaluator.getMetricName(), 'score:',
      f1_evaluator.evaluate(tf_data, {f1_evaluator.metricName: "f1"}))
print(ROC_evaluator.getMetricName(), 'AUC:', ROC_evaluator.evaluate(tf_data))
predictions = tf_data.select('indexedLabel', 'prediction', 'probability')
resultdf = predictions.toPandas()
print(accuracy_score(resultdf.indexedLabel, resultdf.prediction))
print(confusion_matrix(resultdf.indexedLabel, resultdf.prediction))
print(classification_report(resultdf.indexedLabel, resultdf.prediction))

################ GBDT evaluation ###################################################
print(f1_evaluator.getMetricName(), 'score:', f1_evaluator.evaluate(GBT_tf_data))
print(ROC_evaluator.getMetricName(), 'AUC:', ROC_evaluator.evaluate(GBT_tf_data))
predictions = GBT_tf_data.select('indexedLabel', 'prediction',
assemblerInputs = indexedCategoricalCols + numericColList
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
df = assembler.transform(df)

# Indexing binary labels
labeller = StringIndexer(inputCol=label, outputCol="label").fit(df)
df = labeller.transform(df).select(["features", "label"])

### Randomly split data into training and test sets; set seed for reproducibility
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=100)

#dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dt = LogisticRegression(regParam=0.01)
model = dt.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)
# Evaluator is presumably BinaryClassificationEvaluator (or an alias imported elsewhere in the script)
evaluator = Evaluator()

# Select example rows to display.
predictions.select("prediction", "label", "features").show()

# Evaluate the learned model
print("LogRegression Test %s: %f" % (evaluator.getMetricName(), evaluator.evaluate(predictions)))

nb = NaiveBayes(thresholds=[0.1, 1.0])
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.select("prediction", "label", "features").show()
print("Bayes Test %s: %f" % (evaluator.getMetricName(), evaluator.evaluate(predictions)))
binaryEvaluator = BinaryClassificationEvaluator(
    rawPredictionCol='prediction', metricName='areaUnderROC')

# In[71]:
# generate splits for cross validation
splits = indexedData.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2])

# In[72]:
TotalAccuracy = 0
for i in range(5):
    testIndex = splits[i].select('id').collect()  # get test ids for this fold
    rdd = sc.parallelize(testIndex)
    test_rdd = rdd.flatMap(lambda x: x).collect()
    test_Data = indexedData.filter(
        indexedData.id.isin(test_rdd))  # get test data for this fold
    train_Data = indexedData.filter(
        ~indexedData.id.isin(test_rdd))  # get train data for this fold
    model = nb.fit(train_Data)  # fit train data to the model
    transformed_data = model.transform(test_Data)  # score the test fold
    accuracy = binaryEvaluator.evaluate(
        transformed_data)  # areaUnderROC for this fold
    print(binaryEvaluator.getMetricName(), 'score:', accuracy)
    TotalAccuracy = TotalAccuracy + accuracy

averageAccuracy = TotalAccuracy / 5  # average over the 5 folds
print(averageAccuracy)
#################
# Predict With Model
#################
logistic_regression_predictions = logistic_regression_pipeline_model.transform(test_data)

#################
# Evaluate Model
#################
logistic_regression_predictions_selected = logistic_regression_predictions.select(
    CAT_COLS + CONT_COLS + ["income", "income_str_idx", "prediction", "probability"])
logistic_regression_predictions_selected.show(30)
logistic_regression_predictions_selected.groupby('income').agg({'income': 'count'}).show()

lr_pred = logistic_regression_predictions.select("income_str_idx", "prediction")
lr_accuracy_rate = lr_pred.filter(lr_pred.income_str_idx == lr_pred.prediction).count() / (lr_pred.count() * 1.0)
print('MODEL RESULTS:')
print("Overall Accuracy: {}".format(lr_accuracy_rate))

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='income_str_idx')
print('{}: {}'.format(evaluator.getMetricName(), evaluator.evaluate(logistic_regression_predictions)))

#################
# Save and Load Model
#################
logistic_regression_pipeline_model.write().overwrite().save('my_logistic_regression_model_2.model')
loaded_lr_model = PipelineModel.load("my_logistic_regression_model_2.model")
more_predictions = loaded_lr_model.transform(test_data)

print('\nLOADED MODEL RESULTS:')
print("Coefficients: " + str(loaded_lr_model.stages[-1].coefficients))
print("Intercept: " + str(loaded_lr_model.stages[-1].intercept))
lr_pred = more_predictions.select("income_str_idx", "prediction")
loaded_accuracy = lr_pred.filter(lr_pred.income_str_idx == lr_pred.prediction).count() / (lr_pred.count() * 1.0)
print("Overall Accuracy Loaded: {}".format(loaded_accuracy))
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", featuresCol="features")

from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

pipeline_rf = Pipeline(stages=[rf])
paramGrid = ParamGridBuilder().addGrid(rf.maxDepth, [2, 3, 4, 5, 6, 7]).addGrid(
    rf.numTrees, [100, 300]).build()
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")
crossval = CrossValidator(estimator=pipeline_rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

## Fitting the CV
CV_model = crossval.fit(trainingData)

## Printing best model
print(CV_model.bestModel.stages[0])

test_pred = CV_model.transform(testData)
print(evaluator.getMetricName(), evaluator.evaluate(test_pred))
# accuracy
def accuracy_m(model):
    predictions = model.transform(test_data)
    cm = predictions.select('label', 'prediction')
    acc = cm.filter(cm.label == cm.prediction).count() / cm.count()
    print('model accuracy : %.3f%%' % (acc * 100))

accuracy_m(model=linearModel)  # model accuracy : 82.161%

# For binary classification, use the ROC curve (True Positive Rate / recall vs. False Positive Rate)
# TODO: verify
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
print(evaluator.evaluate(predictions))  # 0.8952698333157076
print(evaluator.getMetricName())        # areaUnderROC

# step 6) tune the hyperparameters
'''
To reduce the computation time, tune only the regularization parameter,
with only two candidate values.
'''
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

param_grid = (ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.5]).build())

# time the cross-validation (k = 5 folds)
from time import *
start_time = time()
cv = CrossValidator(estimator=lr, estimatorParamMaps=param_grid,
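# --- A hedged sketch (not the original author's code) of how the truncated CrossValidator call
# above is typically completed: reuse the ROC evaluator, run 5 folds, and report the elapsed time
# from the start_time captured above. `train_data` is an assumed name for the training split.
cv = CrossValidator(estimator=lr, estimatorParamMaps=param_grid,
                    evaluator=evaluator, numFolds=5)
cv_model = cv.fit(train_data)
end_time = time()
print("cross-validation over %d candidate models took %.2f s"
      % (len(param_grid), end_time - start_time))
print(evaluator.getMetricName(), ":", evaluator.evaluate(cv_model.transform(test_data)))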
args = parser.parse_args()
in_file = args.csv
CV_data = spark.read.csv(in_file, header=True, inferSchema=True)

to_vectorize = [
    'step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest',
    'newbalanceDest', 'isFlaggedFraud'
]
training_data, testing_data = CV_data.randomSplit([0.8, 0.2])

#xytrain = labelData(training_data)
xytrain = vectorizeData(training_data, validCols=to_vectorize, labelsCol='isFraud')
lr = LogisticRegression(regParam=0.01)
model = lr.fit(xytrain)

xytest = vectorizeData(testing_data, validCols=to_vectorize, labelsCol='isFraud')
predicted_train = model.transform(xytrain)
predicted_test = model.transform(xytest)

# Evaluator is presumably BinaryClassificationEvaluator (or an alias imported elsewhere in the script)
evaluator = Evaluator()
print("Train %s: %f" % (evaluator.getMetricName(), evaluator.evaluate(predicted_train)))
print("Test %s: %f" % (evaluator.getMetricName(), evaluator.evaluate(predicted_test)))
cm = predictions.select("label", "prediction")
cm.groupby('label').agg({'label': 'count'}).show()
cm.groupby('prediction').agg({'prediction': 'count'}).show()
cm.filter(cm.label == cm.prediction).count() / cm.count()

accuracy_m(model=linearModel)

### Use ROC
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print(evaluator.evaluate(predictions))
print(evaluator.getMetricName())
print(evaluator.evaluate(predictions))

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5])
             .build())

from time import *
start_time = time()

# Create 5-fold CrossValidator
# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

# COMMAND ----------

# MAGIC %md Note that the default metric for the BinaryClassificationEvaluator is areaUnderROC.

# COMMAND ----------

evaluator.getMetricName()

# COMMAND ----------

# MAGIC %md The evaluator currently accepts 2 kinds of metrics - areaUnderROC and areaUnderPR.
# MAGIC We can set it to areaUnderPR by using evaluator.setMetricName("areaUnderPR").

# COMMAND ----------

# MAGIC %md
# MAGIC Now we will try tuning the model with the ParamGridBuilder and the CrossValidator.
# MAGIC
# MAGIC If you are unsure what params are available for tuning, you can use explainParams() to print a list of all params.

# COMMAND ----------
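# MAGIC %md A minimal sketch (not part of the original notebook) of the two points above: switching the
# MAGIC metric to areaUnderPR and listing the tunable params. `lr` is an assumed name for the
# MAGIC LogisticRegression estimator used earlier; `predictions` is the DataFrame already produced above.

# COMMAND ----------

# Switch the evaluator to area under the precision-recall curve and re-evaluate
evaluator.setMetricName("areaUnderPR")
print(evaluator.getMetricName(), evaluator.evaluate(predictions))

# Restore the default metric
evaluator.setMetricName("areaUnderROC")

# List every tunable param of the estimator (assumed name: lr) with its doc and current value
print(lr.explainParams())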
    VectorAssembler(
        inputCols=["{0}_counts".format(i) for i in range(1, n + 1)],
        outputCol="features")
    ]
    nb = [NaiveBayes(smoothing=1.0, modelType="multinomial")]
    return Pipeline(stages=tokenizer + ngrams + vectorizers + assembler + nb)

model = build_ngrams(n=2).fit(train_data)
preds_valid = model.transform(valid_data)

# Evaluate the model. Default metric: Area Under ROC..... areaUnderROC: 0.609
# with text_clean: 0.607
# with text_clean + build_ngrams(n=2): 0.612
bceval = BinaryClassificationEvaluator()
print(bceval.getMetricName() + ":" + str(round(bceval.evaluate(preds_valid), 3)))

# Evaluate the model. Metric: Area Under PR...... areaUnderPR: 0.732
# with text_clean: 0.728
# with text_clean + build_ngrams(n=2): 0.729
bceval.setMetricName("areaUnderPR")
print(bceval.getMetricName() + ":" + str(round(bceval.evaluate(preds_valid), 3)))

# Evaluate the model. Metric: F1 score...... f1: 0.865
# with text_clean: 0.858
# with text_clean + build_ngrams(n=2): 0.882
mceval = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                           metricName="f1")
print(mceval.getMetricName() + ":" +