def validateClassification(self, df, model, modelName=None):
        """Score df with the fitted model and return its ROC and PR areas."""
        predictions0 = model.transform(df)

        evaluator = BinaryClassificationEvaluator(
            metricName=ValidateModels.ROC)
        roc = evaluator.evaluate(predictions0)
        evaluator.setMetricName(ValidateModels.PR)
        pr = evaluator.evaluate(predictions0)

        if modelName is not None:
            self.metrics[modelName][ValidateModels.ROC] = roc
            self.metrics[modelName][ValidateModels.PR] = pr

        return {ValidateModels.ROC: roc, ValidateModels.PR: pr}
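# A minimal usage sketch (assumed, not part of the original class): ValidateModels is
# taken to expose ROC/PR metric-name constants and a per-model `metrics` dict, and to
# have a no-argument constructor; `test_df` and `fitted_lr_model` are hypothetical
# placeholders for a labeled test DataFrame and a fitted classifier.
validator = ValidateModels()
validator.metrics["lr"] = {}
scores = validator.validateClassification(test_df, fitted_lr_model, modelName="lr")
print(scores[ValidateModels.ROC], scores[ValidateModels.PR])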
Example no. 2
def build_model(source_df, config_df):
    """

    Args:
        source_df:
        config_df:

    Returns:

    """
    config_dict = config_df.asDict()
    pipeline_builder = PipelineBuilder(source_df, config_dict)
    target_df = pipeline_builder.transform()
    (training_data,
     test_data) = target_df.randomSplit(config_dict['randomSplit'],
                                        seed=config_dict['seed'])
    # Create initial LogisticRegression model
    lr = LogisticRegression(labelCol="label", featuresCol="features")
    # Create ParamGrid for Cross Validation
    param_grid = (ParamGridBuilder().addGrid(
        lr.regParam, config_dict['regParam']).addGrid(
            lr.elasticNetParam, config_dict['elasticNetParam']).addGrid(
                lr.maxIter, config_dict['maxIter']).build())
    # Evaluate model
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    evaluator.setMetricName(config_dict['metricName'])
    # Create K-fold CrossValidator
    cv = CrossValidator(estimator=lr,
                        estimatorParamMaps=param_grid,
                        evaluator=evaluator,
                        numFolds=config_dict["numFolds"])
    # Run cross validations
    cv_model = cv.fit(training_data)
    # Use test set here so we can measure the accuracy of our model on new data
    test_predictions = cv_model.transform(test_data)
    evaluator.evaluate(test_predictions)
    # Extract weights
    coefficients = cv_model.bestModel.coefficients
    weights = []
    for index, feature in enumerate(pipeline_builder.features()):
        weights.append(
            Row(feature=feature,
                weight=float(coefficients[index]),
                intercept=cv_model.bestModel.intercept))
    return cv_model, weights, test_predictions
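# A minimal usage sketch (assumed; values are illustrative). The Row keys mirror the
# entries read from config_dict above, and source_df stands for a DataFrame loaded
# elsewhere that PipelineBuilder can turn into 'features'/'label' columns.
from pyspark.sql import Row

example_config = Row(randomSplit=[0.8, 0.2],
                     seed=42,
                     regParam=[0.01, 0.1],
                     elasticNetParam=[0.0, 0.5],
                     maxIter=[10, 50],
                     metricName="areaUnderROC",
                     numFolds=3)
cv_model, weights, test_predictions = build_model(source_df, example_config)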
Example no. 3
def validateModel(model, filename):
    """Score the merged article test set with model and append the metrics to filename."""
    test = spark.read.load('../dataset/merged/article/')
    test = test.withColumn('label', test._hyperpartisan.cast('integer'))
    test = model.transform(test)
    ev = BinaryClassificationEvaluator()
    with open(filename, "a") as file:
        file.write(f"{ev.getMetricName()}: {ev.evaluate(test)}\n")
        ev = MulticlassClassificationEvaluator()
        file.write(f"{ev.getMetricName()}: {ev.evaluate(test)}\n")
        ev.setMetricName("weightedPrecision")
        file.write(f"{ev.getMetricName()}: {ev.evaluate(test)}\n")
        ev.setMetricName("weightedRecall")
        file.write(f"{ev.getMetricName()}: {ev.evaluate(test)}\n")
        ev.setMetricName("accuracy")
        file.write(f"{ev.getMetricName()}: {ev.evaluate(test)}\n")
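# A minimal usage sketch (assumed): `fitted_pipeline_model` is a hypothetical placeholder
# for any fitted model whose transform() adds rawPrediction/prediction columns.
validateModel(fitted_pipeline_model, "article_metrics.txt")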
Example no. 4
# test_with_prediction.show(5)
test_with_prediction.select("Class","rawPrediction","probability","prediction").show(5)

# **Note:** The resulting DataFrame includes three types of predictions.
# `rawPrediction` is a vector of log-odds, `probability` is a vector of
# probabilities, and `prediction` is the predicted class based on the
# probability vector.

# Create an instance of `BinaryClassificationEvaluator` class:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="Class", metricName="areaUnderROC")
print(evaluator.explainParams())
evaluator.evaluate(test_with_prediction)

# Evaluate using another metric:
evaluator.setMetricName("areaUnderPR").evaluate(test_with_prediction)


# ## Score out a new dataset

# There are two ways to score out a new dataset.

# **Method 1:** The `evaluate` method

# The more expensive way is to use the `evaluate` method of the
# `LogisticRegressionModel` class.  The `predictions` attribute of the
# resulting `BinaryLogisticRegressionSummary` instance contains the scored
# DataFrame:
test_with_evaluation = log_reg_model.evaluate(df_test)
test_with_evaluation.predictions.printSchema()
test_with_evaluation.predictions.head(5)
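# **Method 2 (a sketch, assuming the lighter-weight alternative is the `transform`
# method):** calling `transform` on the fitted model scores the DataFrame directly,
# without computing the extra summary statistics:
test_with_scores = log_reg_model.transform(df_test)
test_with_scores.printSchema()
test_with_scores.head(5)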
Example no. 5
for reg in regs:
    print("Regularization rate: {}".format(reg))
    with main_run.child_run("reg-" + str(reg)) as run:
        lr = LogisticRegression(featuresCol="features",
                                labelCol='label',
                                regParam=reg)
        pipe = Pipeline(stages=[
            stringIndexer, tokenizer, stopwordsRemover, hashingTF, idf, lr
        ])
        model_p = pipe.fit(training_data)

        # make prediction on test_data
        pred = model_p.transform(test_data)

        bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
        au_roc = bce.setMetricName('areaUnderROC').evaluate(pred)
        au_prc = bce.setMetricName('areaUnderPR').evaluate(pred)
        totalCount = pred.count()
        tp = pred.where("prediction == 1 and label == 1").count()
        tn = pred.where("prediction == 0 and label == 0").count()
        fp = pred.where("prediction == 1 and label == 0").count()
        fn = pred.where("prediction == 0 and label == 1").count()
        acc = (tp + tn) / totalCount
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * precision * recall / (precision + recall)

        run.log("reg", reg)
        run.log("au_roc", au_roc)
        run.log("au_prc", au_prc)
        run.log("TN", tn)
Example no. 6
lrmodel = lr.fit(adulttrain)
lrmodel = lr.setParams(regParam=0.01, maxIter=500,
                       fitIntercept=True).fit(adulttrain)

lrmodel.weights
lrmodel.intercept

#section 8.2.3
validpredicts = lrmodel.transform(adultvalid)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
bceval = BinaryClassificationEvaluator()
bceval.evaluate(validpredicts)
bceval.getMetricName()

bceval.setMetricName("areaUnderPR")
bceval.evaluate(validpredicts)

#section 8.2.5
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(5)
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [1000]).addGrid(
    lr.regParam, [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]).build()
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(adulttrain)
cvmodel.bestModel.weights
BinaryClassificationEvaluator().evaluate(
    cvmodel.bestModel.transform(adultvalid))

#section 8.2.6
Example no. 7
# %% [markdown]
# ## Prediction on training data

# %%
pred_training_logr = logr_model.transform(training)
show_columns = ['features', 'label', 'prediction', 'rawPrediction', 'probability']
pred_training_logr.select(show_columns).show(5, truncate=True)

# %% [markdown]
# ## Evaluator

# %%
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print('Area under ROC on training data: ', evaluator.setMetricName('areaUnderROC').evaluate(pred_training_logr))

# %% [markdown]
# ## Prediction on test data

# %%
pred_testing_logr = logr_model.transform(testing)
pred_testing_logr.select(show_columns).show(5, truncate=True)


# %%
print('Area under ROC on testing data: ', evaluator.setMetricName('areaUnderROC').evaluate(pred_testing_logr))

# %% [markdown]
# ## Confusion Matrix
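# %%
# A minimal sketch (assumed), cross-tabulating labels against predictions on the test set:
pred_testing_logr.groupBy('label', 'prediction').count().show()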
Example no. 8
    # fitting the model
    lrModel = lr.fit(vtrain_df)

    # printing the coefficients and intercept for logistic regression
    lr_coeff = lrModel.coefficients
    print('\nCoefficients: ')
    print([round(i, 3) for i in lr_coeff])
    print('\nIntercept: ', lrModel.intercept, '\n')

    # getting train predictions and accuracy rate / area under ROC / area under PR
    evaluator = BinaryClassificationEvaluator()
    train_pred = lrModel.transform(vtrain_df)
    train_pred.show(5, False)
    print('\nEntire Train Predictions DataFrame\n')
    train_roc = evaluator.setMetricName('areaUnderROC').evaluate(train_pred)
    train_pr = evaluator.setMetricName('areaUnderPR').evaluate(train_pred)
    condensed_train_pred = train_pred.select(['label', 'prediction'])
    train_acc = round(get_accuracy_rate(condensed_train_pred), 2)
    condensed_train_pred.show(10, False)
    print('\nCondensed Train Predictions DataFrame\n')
    print('Train Accuracy Rate :', train_acc)
    print('Train Area Under ROC :', round(train_roc, 4))
    print('Train Area Under PR :', round(train_pr, 4), '\n')

    # getting test predictions and accuracy rate
    test_pred = lrModel.transform(vtest_df)
    test_pred.show(5, False)
    print('\nEntire Test Predictions DataFrame\n')
    test_roc = evaluator.setMetricName('areaUnderROC').evaluate(test_pred)
    test_pr = evaluator.setMetricName('areaUnderPR').evaluate(test_pred)
Example no. 9
pipeline_model = pipeline.fit(taxi)

final_columns = feature_columns + ['features', 'label']
taxi_df = pipeline_model.transform(taxi).select(final_columns)
#taxi_df.show(5)
train, test = taxi_df.randomSplit([0.8, 0.2], seed=1234)

random_forest = RandomForestClassifier(featuresCol='features',
                                       labelCol='label')

param_grid = ParamGridBuilder().\
    addGrid(random_forest.maxDepth, [2, 3, 4]).\
    addGrid(random_forest.minInfoGain, [0.0, 0.1, 0.2, 0.3]).\
    build()
evaluator = BinaryClassificationEvaluator()
crossvalidation = CrossValidator(estimator=random_forest,
                                 estimatorParamMaps=param_grid,
                                 evaluator=evaluator)
crossvalidation_mod = crossvalidation.fit(train)  # fit on the training split so pred_test below stays held out

pred_test = crossvalidation_mod.transform(test)
pred_test.show(5)
label_pred_test = pred_test.select('label', 'prediction')
label_pred_test.rdd.zipWithIndex().countByKey()  # counts rows per distinct (label, prediction) pair

print('Area under ROC : ',
      evaluator.setMetricName('areaUnderROC').evaluate(pred_test))
print('Area under PR : ',
      evaluator.setMetricName('areaUnderPR').evaluate(pred_test))
#print('Precision : ', evaluator.setMetricName('precision').evaluate(pred_test))
Example no. 10
#    inputCols=["LIMIT_BAL", "SEX", "EDUCATION","MARRIAGE","AGE"],
#    outputCol="features")

assembler = VectorAssembler(
    inputCols=["LIMIT_BAL", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4",
               "BILL_AMT5", "BILL_AMT6", "PAY_AMT1", "PAY_AMT2", "PAY_AMT3",
               "PAY_AMT4", "PAY_AMT5", "PAY_AMT6", "SEX_Vec", "MARRIAGE_Vec",
               "AGE_Vec", "EDUCATION_Vec", "PAY_0_Vec", "PAY_2_Vec", "PAY_3_Vec",
               "PAY_4_Vec", "PAY_5_Vec", "PAY_6_Vec"],
    outputCol="features")

output = assembler.transform(trans_df3)


#Split the training & test data 
(trainingData, testData) = output.randomSplit([0.7, 0.3])

from pyspark.ml.evaluation import BinaryClassificationEvaluator
binaryEvaluator=BinaryClassificationEvaluator(labelCol="Y",rawPredictionCol="rawPrediction")
binaryEvaluator.setMetricName("areaUnderROC")

from pyspark.ml.evaluation import RegressionEvaluator
evaluatorRegression=RegressionEvaluator(labelCol="Y",predictionCol="prediction")
evaluatorRegression.setMetricName("rmse")

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol='Y',maxIter=10, regParam=0.03, elasticNetParam=0.8)
model = lr.fit(trainingData)

print(model.summary.areaUnderROC)

prediction=model.transform(trainingData)
areaTraining=binaryEvaluator.evaluate(prediction)
print("Area Under ROC using Logistics Regression on training data =" + str(areaTraining))
Example no. 11
    print('\n\tFinal Test Dataframe for Logistic Regression\n')

    # Start Logistic Regression Model
    logisticReg = LogisticRegression(featuresCol = 'features', labelCol = 'label', maxIter = 10)
    logisticRegModel = logisticReg.fit(vTrainDataFrame)
    logisticRegCoeff = logisticRegModel.coefficients
    print('\nCoefficients: ')
    print([round(i, 3) for i in logisticRegCoeff])
    print('\nIntercept: ', logisticRegModel.intercept, '\n')

    # Calculate Train Predictions, Accuracy Rate, Area under ROC and Area under PR
    evaluator = BinaryClassificationEvaluator()
    trainPredict = logisticRegModel.transform(vTrainDataFrame)
    trainPredict.show(5, False)
    print('\nTraining Predictions DataFrame\n')
    trainROC = evaluator.setMetricName('areaUnderROC').evaluate(trainPredict)
    trainPR = evaluator.setMetricName('areaUnderPR').evaluate(trainPredict)
    condenseTrainPredict = trainPredict.select(['label', 'prediction'])
    trainACC = round(getAccuracyRate(condenseTrainPredict), 2)
    condenseTrainPredict.show(5, False)
    print('\nTraining Predictions DataFrame\n')
    print('Training Accuracy Rate:', trainACC)
    print('Training Area under ROC:', round(trainROC, 4))
    print('Training Area under PR:', round(trainPR, 4), '\n')

    # Calculate Test Predictions and Accuracy Rate
    testPredict = logisticRegModel.transform(vTestDataFrame)
    testPredict.show(5, False)
    print('\nTest Predictions DataFrame\n')
    testROC = evaluator.setMetricName('areaUnderROC').evaluate(testPredict)
    testPR = evaluator.setMetricName('areaUnderPR').evaluate(testPredict)
Example no. 12
# %%
pred_training_rf = rf_model.transform(training)
show_columns = [
    'features', 'label', 'prediction', 'rawPrediction', 'probability'
]
pred_training_rf.select(show_columns).show(5, truncate=True)

# %% [markdown]
# ## Evaluator
#

# %%
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print('Area under ROC on training data: ',
      evaluator.setMetricName('areaUnderROC').evaluate(pred_training_rf))

# %% [markdown]
# ## Prediction on test data

# %%
pred_testing_rf = rf_model.transform(testing)
pred_testing_rf.select(show_columns).show(5, truncate=True)

# %%
print('Area under ROC on testing data: ',
      evaluator.setMetricName('areaUnderROC').evaluate(pred_testing_rf))

# %% [markdown]
# ## Confusion Matrix
Example no. 13
# step 10
lr = LogisticRegression(regParam=0.01, maxIter=100, fitIntercept=True)

bceval = BinaryClassificationEvaluator()
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(n_fold)

paramGrid = ParamGridBuilder().addGrid(lr.maxIter, max_iter)\
    .addGrid(lr.regParam, reg_params).build()

cv.setEstimatorParamMaps(paramGrid)

cvmodel = cv.fit(train)

print(cvmodel.bestModel.coefficients)
print('')
print(cvmodel.bestModel.intercept)
print('')
print(cvmodel.bestModel.getMaxIter())
print('')
print(cvmodel.bestModel.getRegParam())
print('')


# step 11
result11 = bceval.setMetricName('areaUnderROC').evaluate(
    cvmodel.bestModel.transform(valid))
print(result11)

ss.stop()
Example no. 14
lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(adulttrain)
lrmodel = lr.setParams(regParam=0.01, maxIter=500, fitIntercept=True).fit(adulttrain)

lrmodel.weights
lrmodel.intercept

#section 8.2.3
validpredicts = lrmodel.transform(adultvalid)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
bceval = BinaryClassificationEvaluator()
bceval.evaluate(validpredicts)
bceval.getMetricName()

bceval.setMetricName("areaUnderPR")
bceval.evaluate(validpredicts)

#section 8.2.5
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(5)
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [1000]).addGrid(lr.regParam, [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]).build()
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(adulttrain)
cvmodel.bestModel.weights
BinaryClassificationEvaluator().evaluate(cvmodel.bestModel.transform(adultvalid))

#section 8.2.6
import os

import yaml
from pyspark.ml.classification import (GBTClassificationModel, GBTClassifier,
                                       LinearSVC, LinearSVCModel)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

# model_dir is assumed to be defined elsewhere in the original module.


def gradientBoosting(df,
                     feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'],
                     maxIter=20,
                     stepSize=0.1,
                     maxDepth=5,
                     overwrite_model=False):
    # Checks if there is a SparkContext running if so grab that if not start a new one
    # sc = SparkContext.getOrCreate()
    # sqlContext = SQLContext(sc)
    # sqlContext.setLogLevel('INFO')
    feature_list.sort()
    feature_name = '_'.join(feature_list)
    param_name = '_'.join([str(maxDepth), str(stepSize), str(maxIter)])
    model_path_name = model_dir + 'GradientBoosting/' + feature_name + '_' + param_name
    model = None

    vector_assembler = VectorAssembler(inputCols=feature_list,
                                       outputCol="features")
    df_temp = vector_assembler.transform(df)

    df = df_temp.select(['label', 'features'])

    trainingData, testData = df.randomSplit([0.7, 0.3])

    if os.path.isdir(model_path_name) and not overwrite_model:
        print('Loading model from ' + model_path_name)
        model = GBTClassificationModel.load(model_path_name)

    else:
        gbt = GBTClassifier(labelCol="label",
                            featuresCol="features",
                            maxIter=maxIter,
                            stepSize=stepSize,
                            maxDepth=maxDepth)
        model = gbt.fit(trainingData)

    print('Making predictions on the held-out test data')
    predictions = model.transform(testData)
    evaluator = BinaryClassificationEvaluator()

    evaluator.setMetricName('areaUnderROC')
    print('Evaluating areaUnderROC')
    auc = evaluator.evaluate(predictions)

    evaluator.setMetricName('areaUnderPR')
    print('Evaluating areaUnderPR')
    areaUnderPR = evaluator.evaluate(predictions)

    # test distribution of outputs
    total = df.select('label').count()
    disk = df.filter(df.label == 0).count()
    cloud = df.filter(df.label == 1).count()

    # print outputs
    print('Gradient-Boosted Tree')
    print(feature_list)
    print('Data distribution')
    print('Total Observations {}'.format(total))
    print(' Cloud %{}'.format((cloud / total) * 100))
    print(' Disk %{}'.format((disk / total) * 100))

    print(" Test AUC = {}\n".format(auc * 100))

    print('Error distribution')
    misses = predictions.filter(predictions.label != predictions.prediction)
    # now get percentage of error
    disk_misses = misses.filter(misses.label == 0).count()
    cloud_misses = misses.filter(misses.label == 1).count()

    disk_pred = predictions.filter(predictions.label == 0).count()
    cloud_pred = predictions.filter(predictions.label == 1).count()

    print(' Cloud Misses %{}'.format((cloud_misses / cloud_pred) * 100))
    print(' Disk Misses %{}'.format((disk_misses / disk_pred) * 100))

    if auc > 0.80:
        if os.path.isdir(model_path_name):
            if overwrite_model:
                print('Saving model to ' + model_path_name)
                model.write().overwrite().save(model_path_name)
            else:
                pass
        else:
            print('Saving model to ' + model_path_name)
            model.save(model_path_name)

    metrics = {
        'data': {
            'total': total,
            'cloud': (cloud / total) * 100,
            'disk': (disk / total) * 100
        },
        'metrics': {
            'Area Under ROC curve': auc * 100,
            'Area Under PR curve': areaUnderPR * 100
        },
        'error_percentage': {
            'cloud': cloud_misses / cloud_pred * 100,
            'disk': disk_misses / disk_pred * 100
        },
        'params': {
            'Number of Trees': model.getNumTrees,
            'Maximum Depth': maxDepth,
            'Maximum Number of Iterations': maxIter,
            'Step Size': stepSize
        },
        'model_debug': model.toDebugString,
        'name': 'Gradient Boosted Model',
        'features': feature_list
    }

    with open('tmp/temp1.yml', 'w') as outfile:
        yaml.dump(metrics, outfile)

    return metrics, model
def linearSVC(df,
              feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'],
              maxIter=100,
              regParam=0.0,
              threshold=0.0,
              overwrite_model=False):
    # Checks if there is a SparkContext running if so grab that if not start a new one
    # sc = SparkContext.getOrCreate()
    # sqlContext = SQLContext(sc)
    # sqlContext.setLogLevel('INFO')
    feature_list.sort()
    feature_name = '_'.join(feature_list)
    param_name = '_'.join([str(regParam), str(threshold), str(maxIter)])
    model_path_name = model_dir + 'LinearSVC/' + feature_name + '_' + param_name
    model = None

    vector_assembler = VectorAssembler(inputCols=feature_list,
                                       outputCol="features")
    df_temp = vector_assembler.transform(df)

    df = df_temp.select(['label', 'features'])

    trainingData, testData = df.randomSplit([0.7, 0.3])

    if os.path.isdir(model_path_name) and not overwrite_model:
        print('Loading model from ' + model_path_name)
        model = LinearSVCModel.load(model_path_name)

    else:
        lsvc = LinearSVC(maxIter=maxIter,
                         regParam=regParam,
                         threshold=threshold)
        model = lsvc.fit(trainingData)

    print('Making predictions on the held-out test data')
    predictions = model.transform(testData)
    evaluator = BinaryClassificationEvaluator()

    evaluator.setMetricName('areaUnderROC')
    print('Evaluating areaUnderROC')
    auc = evaluator.evaluate(predictions)

    evaluator.setMetricName('areaUnderPR')
    print('Evaluating areaUnderPR')
    areaUnderPR = evaluator.evaluate(predictions)

    # test distribution of outputs
    total = df.select('label').count()
    disk = df.filter(df.label == 0).count()
    cloud = df.filter(df.label == 1).count()

    # print outputs
    print('Linear SVC')
    print(feature_list)
    print('Data distribution')
    print('Total Observations {}'.format(total))
    print(' Cloud %{}'.format((cloud / total) * 100))
    print(' Disk %{}'.format((disk / total) * 100))

    print(" Test AUC = {}\n".format(auc * 100))

    print('Error distribution')
    misses = predictions.filter(predictions.label != predictions.prediction)
    # now get percentage of error
    disk_misses = misses.filter(misses.label == 0).count()
    cloud_misses = misses.filter(misses.label == 1).count()

    disk_pred = predictions.filter(predictions.label == 0).count()
    cloud_pred = predictions.filter(predictions.label == 1).count()

    print(' Cloud Misses %{}'.format((cloud_misses / cloud_pred) * 100))
    print(' Disk Misses %{}'.format((disk_misses / disk_pred) * 100))

    if auc > 0.70:
        if os.path.isdir(model_path_name):
            if overwrite_model:
                print('Saving model to ' + model_path_name)
                model.write().overwrite().save(model_path_name)
            else:
                pass
        else:
            print('Saving model to ' + model_path_name)
            model.save(model_path_name)

    metrics = {
        'data': {
            'total': total,
            'cloud': (cloud / total) * 100,
            'disk': (disk / total) * 100
        },
        'metrics': {
            'Area Under ROC curve': auc * 100,
            'Area Under PR curve': areaUnderPR * 100
        },
        'error_percentage': {
            'cloud': cloud_misses / cloud_pred * 100,
            'disk': disk_misses / disk_pred * 100
        },
        'params': {
            'Regularization Parameter': regParam,
            'Maximum Iteration': maxIter,
            'Threshold': threshold
        },
        'name': 'Linear SVC',
        'features': feature_list
    }

    with open('tmp/temp3.yml', 'w') as outfile:
        yaml.dump(metrics, outfile)

    return metrics, model
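# A minimal usage sketch (assumed): `df` is a labeled DataFrame containing the listed
# feature columns, and `model_dir` must be defined in the enclosing module.
gbt_metrics, gbt_model = gradientBoosting(df,
                                          feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'],
                                          maxIter=20, stepSize=0.1, maxDepth=5)
svc_metrics, svc_model = linearSVC(df,
                                   feature_list=['BFSIZE', 'HDRSIZE', 'NODETYPE'],
                                   maxIter=100, regParam=0.0, threshold=0.0)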