Example #1
# Imports assumed by this snippet (they are not shown in the original source)
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import (MulticlassClassificationEvaluator,
                                   BinaryClassificationEvaluator)


def svm(df, trainingData, testData, maxIterValue, regParamValue, depth,
        thresholdValue):

    print("\n")
    print("svm")

    svm = LinearSVC(labelCol="G3",
                    featuresCol="features",
                    maxIter=maxIterValue,
                    regParam=regParamValue,
                    aggregationDepth=depth,
                    threshold=thresholdValue)

    # Fit the model
    model = svm.fit(trainingData)

    # make predictions using our trained model

    predictions = model.transform(testData)

    # estimate the accuracy of the prediction
    # Evaluation metrics
    multi_evaluator = MulticlassClassificationEvaluator(
        labelCol="G3", predictionCol="prediction", metricName="accuracy")
    accuracy = multi_evaluator.evaluate(predictions)

    multi_evaluator = multi_evaluator.setMetricName('precisionByLabel')
    precision = multi_evaluator.evaluate(predictions)

    multi_evaluator = multi_evaluator.setMetricName('f1')
    f1_score = multi_evaluator.evaluate(predictions)

    multi_evaluator = multi_evaluator.setMetricName('recallByLabel')
    recall = multi_evaluator.evaluate(predictions)

    # Note: areaUnderROC is computed here from the hard 0/1 predictions;
    # evaluating the "rawPrediction" column instead would give a proper
    # ranking-based ROC.
    bin_evaluator = BinaryClassificationEvaluator(
        labelCol="G3",
        rawPredictionCol="prediction",
        metricName="areaUnderROC")
    area = bin_evaluator.evaluate(predictions)

    #results = [["Accuracy",accuracy],["Precision",precision],
    #["Recall",recall],["F1 Score",f1_score],["Area under ROC curve",area]]
    print("Accuracy = {}".format(accuracy))
    print("Precision = {}".format(precision))
    print("Recall = {}".format(recall))
    print("F1 score = {}".format(f1_score))
    print("Area under ROC curve = {}".format(area))
    return (model)
Example #2
def predict(row):
    # Load the fitted model with LinearSVCModel (from pyspark.ml.classification);
    # LinearSVC.load would return the unfitted estimator, which cannot transform.
    svm = LinearSVCModel.load("Modelo1")
    predictions = svm.transform(row)

    multi_evaluator = MulticlassClassificationEvaluator(
        labelCol="G3", predictionCol="prediction", metricName="accuracy")
    accuracy = multi_evaluator.evaluate(predictions)

    multi_evaluator = multi_evaluator.setMetricName('precisionByLabel')
    precision = multi_evaluator.evaluate(predictions)

    multi_evaluator = multi_evaluator.setMetricName('f1')
    f1_score = multi_evaluator.evaluate(predictions)

    multi_evaluator = multi_evaluator.setMetricName('recallByLabel')
    recall = multi_evaluator.evaluate(predictions)

    bin_evaluator = BinaryClassificationEvaluator(
        labelCol="G3",
        rawPredictionCol="prediction",
        metricName="areaUnderROC")
    area = bin_evaluator.evaluate(predictions)
    return (accuracy, precision, f1_score, recall, area)
    def train_model(self, data):
        # Create features vector from multiple columns
        assembler = VectorAssembler(inputCols=self.FEATURE_COLUMNS,
                                    outputCol='features',
                                    handleInvalid='skip')
        data_with_features_column = assembler.transform(data)

        feature_indexer = VectorIndexer(
            inputCol='features',
            outputCol='indexed_features').fit(data_with_features_column)
        pipeline = Pipeline(stages=[feature_indexer, self.model])

        train_set, test_set = data_with_features_column.randomSplit([0.8, 0.2])

        # Train the model
        trained_model = pipeline.fit(train_set)

        # Make predictions
        predictions = trained_model.transform(test_set)

        # Output metrics
        evaluator = MulticlassClassificationEvaluator(
            labelCol='end_cluster', predictionCol='prediction')
        evaluator.setMetricName('accuracy')
        accuracy = evaluator.evaluate(predictions)

        evaluator.setMetricName('weightedPrecision')
        precision = evaluator.evaluate(predictions)

        evaluator.setMetricName('weightedRecall')
        recall = evaluator.evaluate(predictions)

        evaluator.setMetricName('f1')
        f1 = evaluator.evaluate(predictions)

        print(f'Accuracy: {accuracy}')
        print(f'Precision: {precision}')
        print(f'Recall: {recall}')
        print(f'F1-Score: {f1}')
def evaluate_classification(predictions):
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction",
                                                  metricName="f1")
    # print(evaluator.explainParams())
    f1 = evaluator.evaluate(predictions)
    evaluator.setMetricName('weightedPrecision')
    weighted_precision = evaluator.evaluate(predictions)
    evaluator.setMetricName('weightedRecall')
    weighted_recall = evaluator.evaluate(predictions)
    evaluator.setMetricName('accuracy')
    accuracy = evaluator.evaluate(predictions)
    print()
    print("Test set accuracy = " + str(accuracy))
    print("Test set weightedPrecision = " + str(weighted_precision))
    print("Test set weightedRecall = " + str(weighted_recall))
    print("Test set f1 = " + str(f1))
Example #5
display(predDF)

# COMMAND ----------

# MAGIC %md
# MAGIC # Step 4) Collecting metrics on the test dataset

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(metricName="f1",
                                              labelCol="label_index")
metricsDF = spark.createDataFrame(
    [("f1", evaluator.evaluate(predDF)),
     ("accuracy", evaluator.setMetricName("accuracy").evaluate(predDF))],
    ["Metric", "Value"])
display(metricsDF)

# COMMAND ----------

from datetime import date

today = date.today()
# dd/mm/YY
d1 = today.strftime("%d-%m-%Y")

# COMMAND ----------

import mlflow
import mlflow.tracking
Example #6
# with text_clean: 0.607
# with text_clean + build_ngrams(n=2): 0.612
bceval = BinaryClassificationEvaluator()
print(bceval.getMetricName() + ":" +
      str(round(bceval.evaluate(preds_valid), 3)))

#Evaluate the model. metric : Area Under PR...... areaUnderPR:0.732
# with text_clean: 0.728
# with text_clean + build_ngrams(n=2): 0.729
bceval.setMetricName("areaUnderPR")
print(bceval.getMetricName() + ":" +
      str(round(bceval.evaluate(preds_valid), 3)))

#Evaluate the model. metric : F1 score...... f1:0.865
# with text_clean: 0.858
# with text_clean + build_ngrams(n=2): 0.882
mceval = MulticlassClassificationEvaluator(labelCol="label",
                                           predictionCol="prediction",
                                           metricName="f1")
print(mceval.getMetricName() + ":" +
      str(round(mceval.evaluate(preds_valid), 3)))

#Evaluate the model. metric : accuracy......  accuracy:0.866
# with text_clean: 0.859
# with text_clean + build_ngrams(n=2): 0.883
mceval.setMetricName("accuracy")
print(mceval.getMetricName() + ":" +
      str(round(mceval.evaluate(preds_valid), 3)))

#########
sc.stop()
# Multiclass classification model: evaluation metrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator()
evaluator.setMetricName("accuracy")  # one of: f1 | weightedPrecision | weightedRecall | accuracy
evaluator.evaluate(...)

# Regression model: evaluation metrics
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator()
evaluator.setMetricName("rmse")  # one of: rmse | mse | mae | r2
evaluator.evaluate(...)

# Clustering model: evaluation metrics (computes the silhouette score)
from pyspark.ml.evaluation import ClusteringEvaluator
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(...)
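
For reference, a minimal runnable sketch of the multiclass case (the toy DataFrame below is made up for illustration):

from pyspark.sql import SparkSession
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder.getOrCreate()
# Toy (prediction, label) pairs
preds = spark.createDataFrame(
    [(0.0, 0.0), (1.0, 1.0), (2.0, 1.0), (0.0, 0.0)],
    ["prediction", "label"])
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction")
for metric in ("accuracy", "f1", "weightedPrecision", "weightedRecall"):
    print(metric, evaluator.setMetricName(metric).evaluate(preds))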
# Imports assumed by this snippet (they are not shown in the original source);
# model_dir is expected to be defined elsewhere at module level.
import os
import yaml
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import (RandomForestClassifier,
                                       RandomForestClassificationModel)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


def randomForest(df,
                 feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'],
                 maxDepth=5,
                 numTrees=20,
                 seed=None,
                 overwrite_model=False):
    # Checks if there is a SparkContext running if so grab that if not start a new one
    # sc = SparkContext.getOrCreate()
    # sqlContext = SQLContext(sc)
    # sqlContext.setLogLevel('INFO')
    feature_list.sort()
    feature_name = '_'.join(feature_list)
    param_name = '_'.join([str(maxDepth), str(numTrees)])
    model_path_name = model_dir + 'RandomForest/' + feature_name + '_' + param_name
    model = None

    vector_assembler = VectorAssembler(inputCols=feature_list,
                                       outputCol="features")
    df_temp = vector_assembler.transform(df)

    df = df_temp.select(['label', 'features'])

    trainingData, testData = df.randomSplit([0.7, 0.3])

    if os.path.isdir(model_path_name) and not overwrite_model:
        print('Loading model from ' + model_path_name)
        model = RandomForestClassificationModel.load(model_path_name)

    else:
        rf = RandomForestClassifier(labelCol="label",
                                    featuresCol="features",
                                    numTrees=numTrees,
                                    maxDepth=maxDepth,
                                    seed=seed)
        model = rf.fit(trainingData)

    print('Making predictions on validation data')
    predictions = model.transform(testData)

    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction")
    # f1|weightedPrecision|weightedRecall|accuracy
    evaluator.setMetricName('accuracy')
    print('Evaluating accuracy')
    accuracy = evaluator.evaluate(predictions)

    evaluator.setMetricName('f1')
    print('Evaluating f1')
    f1 = evaluator.evaluate(predictions)

    evaluator.setMetricName('weightedPrecision')
    print('Evaluating weightedPrecision')
    weightedPrecision = evaluator.evaluate(predictions)

    evaluator.setMetricName('weightedRecall')
    print('Evaluating weightedRecall')
    weightedRecall = evaluator.evaluate(predictions)

    print('accuracy {}'.format(accuracy))
    print('f1 {}'.format(f1))
    print('weightedPrecision {}'.format(weightedPrecision))
    print('weightedRecall {}'.format(weightedRecall))

    # test distribution of outputs
    total = df.select('label').count()
    tape = df.filter(df.label == 0).count()
    disk = df.filter(df.label == 1).count()
    cloud = df.filter(df.label == 2).count()

    # print outputs
    print('Random Forests')
    print(feature_list)
    print('Data distribution')
    print('Total Observations {}'.format(total))
    print(' Cloud %{}'.format((cloud / total) * 100))
    print(' Disk %{}'.format((disk / total) * 100))
    print(' Tape %{}\n'.format((tape / total) * 100))

    print(" Test Error = {}".format((1.0 - accuracy) * 100))
    print(" Test Accuracy = {}\n".format(accuracy * 100))

    print('Error distribution')
    misses = predictions.filter(predictions.label != predictions.prediction)
    # now get percentage of error
    tape_misses = misses.filter(misses.label == 0).count()
    disk_misses = misses.filter(misses.label == 1).count()
    cloud_misses = misses.filter(misses.label == 2).count()

    tape_pred = predictions.filter(predictions.label == 0).count()
    disk_pred = predictions.filter(predictions.label == 1).count()
    cloud_pred = predictions.filter(predictions.label == 2).count()

    print(' Cloud Misses %{}'.format((cloud_misses / cloud_pred) * 100))
    print(' Disk Misses %{}'.format((disk_misses / disk_pred) * 100))
    print(' Tape Misses %{}'.format((tape_misses / tape_pred) * 100))

    if accuracy > 0.80:
        if os.path.isdir(model_path_name):
            if overwrite_model:
                print('Saving model to ' + model_path_name)
                model.write().overwrite().save(model_path_name)
            else:
                pass
        else:
            print('Saving model to ' + model_path_name)
            model.save(model_path_name)

    metrics = {
        'data': {
            'Total': total,
            'Cloud': (cloud / total) * 100,
            'Disk': (disk / total) * 100,
            'Tape': (tape / total) * 100
        },
        'metrics': {
            'Accuracy': accuracy * 100,
            'f1': f1 * 100,
            'Weighted Precision': weightedPrecision * 100,
            'Weighted Recall': weightedRecall * 100
        },
        'error_percentage': {
            'Cloud': cloud_misses / cloud_pred * 100,
            'Disk': disk_misses / disk_pred * 100,
            'Tape': tape_misses / tape_pred * 100
        },
        'params': {
            'Number of Trees': model.getNumTrees,
            'Maximum Depth': maxDepth
        },
        'model_debug': model.toDebugString,
        'name': 'Random Forest Model',
        'features': feature_list
    }

    with open('tmp/temp.yml', 'w') as outfile:
        yaml.dump(metrics, outfile)

    return metrics, model
Example #9
display(testPredDF.orderBy("probability"))

# COMMAND ----------

display(testPredDF.filter("label != prediction"))

# COMMAND ----------

# DBTITLE 1,Confusion Matrix (more False Negatives)
display(testPredDF.groupBy("label", "prediction").count())

# COMMAND ----------

# DBTITLE 1,Evaluate
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='label',
                                              metricName='f1')

metricsDF = spark.createDataFrame(
    [("f1", evaluator.evaluate(testPredDF)),
     ("accuracy", evaluator.setMetricName("accuracy").evaluate(testPredDF))],
    ["Metric", "Value"])
display(metricsDF)

# COMMAND ----------

# COMMAND ----------
def process(time, rdd):
    print(
        "=========*********************************************** %s ***********************************************============"
        % str(time))
    print("\n")
    if not (rdd.isEmpty()):
        df = spark.createDataFrame(rdd, ["label", "text"])
        print(
            "=========***********************************************$ Raw Data From Stream $***********************************************========="
        )
        df.show()
        pipeline_data = loded_pipeline.transform(df)
        print("\n")
        print(
            "=========***********************************************$ Transformed Data After Running Pre Loded Pipeline $***********************************************========="
        )
        pipeline_data.show()
        print("\n\n")

        print(
            "=========***********************************************$ Classification Using Pre Trained Logistic Classification Model $***********************************************========="
        )
        predictions = saved_logistic_model.transform(pipeline_data)

        evaluator = MulticlassClassificationEvaluator(
            labelCol="label", predictionCol="prediction")
        f1 = evaluator.setMetricName("f1").evaluate(predictions)
        weightedPrecision = evaluator.setMetricName(
            "weightedPrecision").evaluate(predictions)
        weightedRecall = evaluator.setMetricName("weightedRecall").evaluate(
            predictions)
        accuracyNaiveBayes = evaluator.setMetricName("accuracy").evaluate(
            predictions)

        predictions = predictions.select("label", "prediction")

        predictions = predictions.withColumn(
            "Current Stream Accuracy %",
            lit(str((accuracyNaiveBayes) * 100) + "%"))
        predictions = predictions.withColumn(
            "Current Stream Error %",
            lit(str((1.0 - accuracyNaiveBayes) * 100) + "%"))
        predictions = predictions.withColumn("Current Stream F1 Score",
                                             lit(str(f1)))
        predictions = predictions.withColumn("Current Stream weightedRecall",
                                             lit(str(weightedRecall)))
        predictions = predictions.withColumn(
            "Current Stream weightedPrecision", lit(str(weightedPrecision)))

        # To Print Data Frame Schema for Debugging
        #predictions.printSchema()

        label = mapSpeciesTypeWithNumericLabel(
            predictions.select("prediction").first())
        labelInitial = mapSpeciesTypeWithNumericLabel(
            predictions.select("label").first())
        global total_count_logistic_classification
        global correct_count_logistic_classification
        total_count_logistic_classification = total_count_logistic_classification + 1
        if (labelInitial == label):
            correct_count_logistic_classification = correct_count_logistic_classification + 1

        overall_accuracy_percent = (
            float(correct_count_logistic_classification) /
            float(total_count_logistic_classification)) * 100
        predictions = predictions.withColumn("News_Category_Predicted",
                                             lit(str(label)))
        predictions = predictions.withColumn("News_Category_InitalLabel",
                                             lit(str(labelInitial)))

        predictions.show()

        # Overall Stats
        total_predictions = predictions.select("label")
        total_predictions = total_predictions.withColumn(
            "Overall Correct Count",
            lit(str(correct_count_logistic_classification)))
        total_predictions = total_predictions.select("Overall Correct Count")
        total_predictions = total_predictions.withColumn(
            "Total Count", lit(str(total_count_logistic_classification)))
        total_predictions = total_predictions.withColumn(
            "Overall Accuracy Percent(%)",
            lit(str(overall_accuracy_percent) + "%"))
        total_predictions = total_predictions.withColumn(
            "Overall Error Percent(%)",
            lit(str(100 - overall_accuracy_percent) + "%"))

        print("\n")
        print(
            "=========***********************************************$ Overall Classification Metrics Logistic Classification Model $***********************************************========="
        )
        total_predictions.show()

        # print("Test Error for Naive Bayes :" + str((1.0 - accuracyNaiveBayes) * 100) + "%")
        # print("Test Accuracy for Naive Bayes :" + str((accuracyNaiveBayes) * 100) + "%")
        # print("Test weightedRecall for Naive Bayes :" + str(weightedRecall))
        # print("Test weightedPrecision for Naive Bayes :" + str(weightedPrecision))
        # print("Test f1 score for Naive Bayes :" + str(f1))

        # Naive bayes Model Classification

        print("\n\n")
        print(
            "=========***********************************************$ Classification Using Pre Trained Naive Bayes Classification Model $***********************************************========="
        )
        print("\n")
        naive_bayes_predictions = saved_naive_bayes_model.transform(
            pipeline_data)

        evaluator = MulticlassClassificationEvaluator(
            labelCol="label", predictionCol="prediction")
        f1 = evaluator.setMetricName("f1").evaluate(naive_bayes_predictions)
        weightedPrecision = evaluator.setMetricName(
            "weightedPrecision").evaluate(naive_bayes_predictions)
        weightedRecall = evaluator.setMetricName("weightedRecall").evaluate(
            naive_bayes_predictions)
        accuracyNaiveBayes = evaluator.setMetricName("accuracy").evaluate(
            naive_bayes_predictions)

        naive_bayes_predictions = naive_bayes_predictions.select(
            "label", "prediction")

        naive_bayes_predictions = naive_bayes_predictions.withColumn(
            "Current Stream Accuracy %",
            lit(str((accuracyNaiveBayes) * 100) + "%"))
        naive_bayes_predictions = naive_bayes_predictions.withColumn(
            "Current Stream Error %",
            lit(str((1.0 - accuracyNaiveBayes) * 100) + "%"))
        naive_bayes_predictions = naive_bayes_predictions.withColumn(
            "Current Stream F1 Score", lit(str(f1)))
        naive_bayes_predictions = naive_bayes_predictions.withColumn(
            "Current Stream weightedRecall", lit(str(weightedRecall)))
        naive_bayes_predictions = naive_bayes_predictions.withColumn(
            "Current Stream weightedPrecision", lit(str(weightedPrecision)))

        # To Print Data Frame Schema for Debugging
        # predictions.printSchema()

        label_naive_bayes = mapSpeciesTypeWithNumericLabel(
            naive_bayes_predictions.select("prediction").first())
        labelInitial_naive_bayes = mapSpeciesTypeWithNumericLabel(
            naive_bayes_predictions.select("label").first())

        # Loading Global Variables
        global total_count_naive_bayes_classification
        global correct_count_naive_bayes_classification

        total_count_naive_bayes_classification = total_count_naive_bayes_classification + 1
        if (label_naive_bayes == labelInitial_naive_bayes):
            correct_count_naive_bayes_classification = correct_count_naive_bayes_classification + 1

        overall_accuracy_naive_bayes_percent = (
            float(correct_count_naive_bayes_classification) /
            float(total_count_naive_bayes_classification)) * 100
        naive_bayes_predictions = naive_bayes_predictions.withColumn(
            "News_Category_Predicted", lit(str(label_naive_bayes)))
        naive_bayes_predictions = naive_bayes_predictions.withColumn(
            "News_Category_InitalLabel", lit(str(labelInitial_naive_bayes)))

        naive_bayes_predictions.show()
        print("\n")

        # Overall Stats
        total_naive_bayes_predictions = naive_bayes_predictions.select("label")
        total_naive_bayes_predictions = total_naive_bayes_predictions.withColumn(
            "Overall Correct Count",
            lit(str(correct_count_naive_bayes_classification)))
        total_naive_bayes_predictions = total_naive_bayes_predictions.select(
            "Overall Correct Count")
        total_naive_bayes_predictions = total_naive_bayes_predictions.withColumn(
            "Total Count", lit(str(total_count_naive_bayes_classification)))
        total_naive_bayes_predictions = total_naive_bayes_predictions.withColumn(
            "Overall Accuracy Percent(%)",
            lit(str(overall_accuracy_naive_bayes_percent) + "%"))
        total_naive_bayes_predictions = total_naive_bayes_predictions.withColumn(
            "Overall Error Percent(%)",
            lit(str(100 - overall_accuracy_naive_bayes_percent) + "%"))

        print(
            "=========***********************************************$ Overall Classification Metrics Naive Bayes Classification Model $***********************************************========="
        )

        total_naive_bayes_predictions.show()

        print("\n")
        print(
            "=========*********************************************** End of Single Stream ***********************************************========="
        )
Example #11
#Logistic Regression Classification
lr = LogisticRegression(maxIter=25, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0).select("text","index","probability","label","prediction").orderBy("probability", ascending=False).show(n = 10, truncate = 30)



evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Test Error for Logistic Regression :" + str((1.0 - accuracy)*100)+ "%")
print("Test Accuracy for Logistic Regression :" + str((accuracy)*100)+ "%")

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName='f1')
f1 = evaluator.setMetricName("f1").evaluate(predictions)
weightedPrecision = evaluator.setMetricName("weightedPrecision").evaluate(predictions)
weightedRecall = evaluator.setMetricName("weightedRecall").evaluate(predictions)
accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)

print("Test weightedRecall for Logistic Regression :" + str(weightedRecall))
print("Test weightedPrecision for Logistic Regression :" + str(weightedPrecision))
print("Test f1 score for Logistic Regression :" + str(f1))


# Save model
save_model_path = output_folder_path + "LogisticClassificationModel"
lrModel.write().overwrite().save(save_model_path)

print("Logistic Classification Model Successfully trained and saved in project Output directory")
Example #12
# %%
pred_test = crossvalidation_mode.transform(testing)
pred_test.show(5)

# %% [markdown]
# ## Best model from cross validation

# %%
print("The parameter smoothing has best value:",
      crossvalidation_mode.bestModel._java_obj.getSmoothing())

# %% [markdown]
# ### Prediction accuracy on train data

# %%
print('training data (f1):', evaluator.setMetricName('f1').evaluate(pred_train), "\n",
     'training data (weightedPrecision): ', evaluator.setMetricName('weightedPrecision').evaluate(pred_train),"\n",
     'training data (weightedRecall): ', evaluator.setMetricName('weightedRecall').evaluate(pred_train),"\n",
     'training data (accuracy): ', evaluator.setMetricName('accuracy').evaluate(pred_train))

# %% [markdown]
# ### Prediction accuracy on test data

# %%
print('test data (f1):', evaluator.setMetricName('f1').evaluate(pred_test), "\n",
     'test data (weightedPrecision): ', evaluator.setMetricName('weightedPrecision').evaluate(pred_test),"\n",
     'test data (weightedRecall): ', evaluator.setMetricName('weightedRecall').evaluate(pred_test),"\n",
     'test data (accuracy): ', evaluator.setMetricName('accuracy').evaluate(pred_test))

# %% [markdown]
# ## Confusion matrix
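
The snippet ends before the confusion-matrix cell; a minimal sketch of one way to build it, assuming pred_test carries "label" and "prediction" columns:

# %%
# Cross-tabulate true labels (rows) against predicted labels (columns)
pred_test.groupBy("label").pivot("prediction").count().orderBy("label").show()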
Example #13
print("\nModels Evaluation:")
print("{:-<24}".format(""))
for idx, c in enumerate(classifiers):
	print(c)
	# fit the model
	model = classifiers[c].fit(train_set)
	
	# make predictions
	predictions = model.transform(test_set)
	predictions.cache()
	
	# evaluate performance
	evaluator = MulticlassClassificationEvaluator(labelCol="Label_Idx", predictionCol="prediction")
	
	for m in metrics:
		evaluator.setMetricName(m)
		metric = evaluator.evaluate(predictions)
		print("{name} = {value:.2f}".format(name=m, value=metric))
	
	# Build confusion matrix using Scikit-learn (sklearn)
	target_list = predictions.select("Label_Idx").rdd.flatMap(lambda x: x).collect()
	pred_list = predictions.select("prediction").rdd.flatMap(lambda x: x).collect()
	label_num_list = predictions.select("Label_Idx").distinct().orderBy("Label_Idx").rdd.flatMap(lambda x: x).collect()
	# print("\nClassification report using Sklearn:")
	# print(classification_report(target_list, pred_list, target_names=label_list))
	conf_matrix = confusion_matrix(target_list, pred_list, labels=label_num_list)
	plt.figure(idx)
	plt.title("Confusion matrix - {model}".format(model=c))
	sns.heatmap(conf_matrix.T, square=True, annot=True, fmt='d', cbar=False,
	            annot_kws={"size": 7.5}, xticklabels=label_list, yticklabels=label_list)
	plt.xlabel('true label')
Example #14
ax0, ax1 = axList
ax0.set_title('First Model', color='#999999')
ax1.set_title('Second Model', color='#999999')
generateROC(axList[0], labelsAndScores)
generateROC(axList[1], labelsAndScores2)
display(fig)

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Note: the standalone 'precision' metric is no longer supported by
# MulticlassClassificationEvaluator in recent Spark releases;
# 'weightedPrecision' is the closest equivalent.
metric = 'weightedPrecision'

multiclassEval = MulticlassClassificationEvaluator()

multiclassEval.setMetricName(metric)
print('Model one {0}: {1:.3f}'.format(metric, multiclassEval.evaluate(irisTestPredictions)))
print('Model two {0}: {1:.3f}\n'.format(metric, multiclassEval.evaluate(irisTestPredictions2)))

# COMMAND ----------

import inspect
print(inspect.getsource(MulticlassClassificationEvaluator))

# COMMAND ----------

# MAGIC %md
# MAGIC #### Using MLlib instead of ML
# MAGIC  
# MAGIC We've been using `ml` transformers, estimators, pipelines, and evaluators.  How can we accomplish the same things with MLlib?
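
A rough sketch (an assumption, not part of the original notebook): the RDD-based MLlib API exposes the same metrics through MulticlassMetrics, using the irisTestPredictions DataFrame from above:

# COMMAND ----------

from pyspark.mllib.evaluation import MulticlassMetrics

# Build an RDD of (prediction, label) pairs from the DataFrame
predictionAndLabels = irisTestPredictions.select("prediction", "label") \
    .rdd.map(lambda row: (float(row[0]), float(row[1])))
mllibMetrics = MulticlassMetrics(predictionAndLabels)
print('accuracy: {0:.3f}'.format(mllibMetrics.accuracy))
print('weighted precision: {0:.3f}'.format(mllibMetrics.weightedPrecision))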
def randomForest(df,
                 feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'],
                 maxDepth=5,
                 numTrees=20,
                 seed=None):
    # Checks if there is a SparkContext running if so grab that if not start a new one
    # sc = SparkContext.getOrCreate()
    # sqlContext = SQLContext(sc)
    # sqlContext.setLogLevel('INFO')
    print(
        "Sanity check that I'm not doing something dumb like using the label in the feature_list: {}"
        .format(feature_list))
    vector_assembler = VectorAssembler(inputCols=feature_list,
                                       outputCol="features")
    df_temp = vector_assembler.transform(df)

    df = df_temp.select(['label', 'features'])

    (trainingData, testData) = df.randomSplit([0.7, 0.3])
    rf = RandomForestClassifier(labelCol="label",
                                featuresCol="features",
                                numTrees=numTrees,
                                maxDepth=maxDepth,
                                seed=seed)

    model = rf.fit(trainingData)
    predictions = model.transform(testData)
    # predictions.select("prediction", "label").show(100)
    # df.select('label').distinct().show()
    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction")
    # f1|weightedPrecision|weightedRecall|accuracy
    evaluator.setMetricName('accuracy')
    accuracy = evaluator.evaluate(predictions)
    evaluator.setMetricName('f1')
    f1 = evaluator.evaluate(predictions)
    evaluator.setMetricName('weightedPrecision')
    weightedPrecision = evaluator.evaluate(predictions)
    evaluator.setMetricName('weightedRecall')
    weightedRecall = evaluator.evaluate(predictions)

    print('accuracy {}'.format(accuracy))
    print('f1 {}'.format(f1))
    print('weightedPrecision {}'.format(weightedPrecision))
    print('weightedRecall {}'.format(weightedRecall))

    # test distribution of outputs
    total = df.select('label').count()
    tape = df.filter(df.label == 0).count()
    disk = df.filter(df.label == 1).count()
    cloud = df.filter(df.label == 2).count()
    # print outputs
    print('Random Forests')
    print(feature_list)
    print('Total Observations {}'.format(total))
    print(' Cloud %{}'.format((cloud / total) * 100))
    print(' Disk %{}'.format((disk / total) * 100))
    print(' Tape %{}\n'.format((tape / total) * 100))

    print(" Test Error = {}".format((1.0 - accuracy) * 100))
    print(" Test Accuracy = {}\n".format(accuracy * 100))

    misses = predictions.filter(predictions.label != predictions.prediction)
    # now get percentage of error
    tape_misses = misses.filter(misses.label == 0).count()
    disk_misses = misses.filter(misses.label == 1).count()
    cloud_misses = misses.filter(misses.label == 2).count()

    print(' Cloud Misses %{}'.format((cloud_misses / cloud) * 100))
    print(' Disk Misses %{}'.format((disk_misses / disk) * 100))
    print(' Tape Misses %{}'.format((tape_misses / tape) * 100))

    # plt.xlabel("FPR", fontsize=14)
    # plt.ylabel("TPR", fontsize=14)
    # plt.title("ROC Curve", fontsize=14)
    # plt.plot(fp[0:250], tp, linewidth=2)
    # buf = io.BytesIO()
    # plt.savefig(buf, format='png')
    # buf.seek(0)
    # image = tf.image.decode_png(buf.getvalue(), channels=4)
    # image = tf.expand_dims(image, 0)
    # summary_op = tf.summary.image("ROC Curve", image)
    return accuracy, 'Random Forests: {}'.format(accuracy), model
Example #16
# Imports assumed by this snippet (they are not shown in the original source);
# model_dir is expected to be defined elsewhere at module level.
import os
import yaml
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


def multinomialRegression(df,
                          feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'],
                          maxIter=100,
                          regParam=0.0,
                          elasticNetParam=0.0,
                          threshold=0.5,
                          overwrite_model=False):
    # Checks if there is a SparkContext running if so grab that if not start a new one
    # sc = SparkContext.getOrCreate()
    # sqlContext = SQLContext(sc)
    # sqlContext.setLogLevel('INFO')
    feature_list.sort()
    feature_name = '_'.join(feature_list)
    param_name = '_'.join(
        [str(regParam),
         str(elasticNetParam),
         str(maxIter),
         str(threshold)])
    model_path_name = model_dir + 'MultinomialRegression/' + feature_name + '_' + param_name
    model = None

    vector_assembler = VectorAssembler(inputCols=feature_list,
                                       outputCol="features")
    df_temp = vector_assembler.transform(df)

    df = df_temp.select(['label', 'features'])

    trainingData, testData = df.randomSplit([0.7, 0.3])

    if os.path.isdir(model_path_name) and not overwrite_model:
        print('Loading model from ' + model_path_name)
        model = LogisticRegressionModel.load(model_path_name)

    else:
        lr = LogisticRegression(labelCol="label",
                                maxIter=maxIter,
                                regParam=regParam,
                                elasticNetParam=elasticNetParam)
        model = lr.fit(trainingData)

    print('Making predictions on validation data')
    predictions = model.transform(testData)

    evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                  predictionCol="prediction")

    evaluator.setMetricName('accuracy')
    print('Evaluating accuracy')
    accuracy = evaluator.evaluate(predictions)

    evaluator.setMetricName('f1')
    print('Evaluating f1')
    f1 = evaluator.evaluate(predictions)

    evaluator.setMetricName('weightedPrecision')
    print('Evaluating weightedPrecision')
    weightedPrecision = evaluator.evaluate(predictions)

    evaluator.setMetricName('weightedRecall')
    print('Evaluating weightedRecall')
    weightedRecall = evaluator.evaluate(predictions)

    print('accuracy {}'.format(accuracy))
    print('f1 {}'.format(f1))
    print('weightedPrecision {}'.format(weightedPrecision))
    print('weightedRecall {}'.format(weightedRecall))

    # test distribution of outputs
    total = df.select('label').count()
    tape = df.filter(df.label == 0).count()
    disk = df.filter(df.label == 1).count()
    cloud = df.filter(df.label == 2).count()

    # print outputs
    print('Multinomial Regression Classification')
    print(feature_list)
    print('Data distribution')
    print('Total Observations {}'.format(total))
    print(' Cloud %{}'.format((cloud / total) * 100))
    print(' Disk %{}'.format((disk / total) * 100))
    print(' Tape %{}\n'.format((tape / total) * 100))

    print(" Test Error = {}".format((1.0 - accuracy) * 100))
    print(" Test Accuracy = {}\n".format(accuracy * 100))

    print('Error distribution')
    misses = predictions.filter(predictions.label != predictions.prediction)
    # now get percentage of error
    tape_misses = misses.filter(misses.label == 0).count()
    disk_misses = misses.filter(misses.label == 1).count()
    cloud_misses = misses.filter(misses.label == 2).count()

    tape_pred = predictions.filter(predictions.label == 0).count()
    disk_pred = predictions.filter(predictions.label == 1).count()
    cloud_pred = predictions.filter(predictions.label == 2).count()

    print(' Cloud Misses %{}'.format((cloud_misses / cloud_pred) * 100))
    print(' Disk Misses %{}'.format((disk_misses / disk_pred) * 100))
    print(' Tape Misses %{}'.format((tape_misses / tape_pred) * 100))

    if accuracy > 0.80:
        if os.path.isdir(model_path_name):
            if overwrite_model:
                print('Saving model to ' + model_path_name)
                model.write().overwrite().save(model_path_name)
            else:
                pass
        else:
            print('Saving model to ' + model_path_name)
            model.save(model_path_name)

    metrics = {
        'data': {
            'Total': total,
            'Cloud': (cloud / total) * 100,
            'Disk': (disk / total) * 100,
            'Tape': (tape / total) * 100
        },
        'metrics': {
            'Accuracy': accuracy * 100,
            'f1': f1 * 100,
            'Weighted Precision': weightedPrecision * 100,
            'Weighted Recall': weightedRecall * 100
        },
        'error_percentage': {
            'Cloud': cloud_misses / cloud_pred * 100,
            'Disk': disk_misses / disk_pred * 100,
            'Tape': tape_misses / tape_pred * 100
        },
        'params': {
            'Regularization Parameter': regParam,
            'Maximum Iteration': maxIter,
            'ElasticNet Mixing Parameter': elasticNetParam,
            'Threshold': threshold
        },
        'name': 'Multinomial Regression Classification',
        'features': feature_list
    }

    with open('tmp/temp2.yml', 'w') as outfile:
        yaml.dump(metrics, outfile)

    return metrics, model