Example #1
from pyspark.ml.evaluation import BinaryClassificationEvaluator

def binaryClassificationEvaluator(predictions):
    evaluator = BinaryClassificationEvaluator()
    print("Test Area Under ROC: " + str(
        evaluator.evaluate(predictions,
                           {evaluator.metricName: "areaUnderROC"})))
    evaluator.getMetricName()
    return evaluator
def predictor(model, testdata):
    print("Running Predict Function")
    print("Predicting Model with RF Model:")
    predictions = model.transform(testdata)
    evaluator = BinaryClassificationEvaluator(labelCol="Label")

    print("Evaluation Metric", evaluator.evaluate(predictions) * 100)
    print("Metric Used", evaluator.getMetricName())
Example #3
from pyspark.ml.evaluation import BinaryClassificationEvaluator

def auc(df, model):
    prediction = model.transform(df)
    evaluator = BinaryClassificationEvaluator()

    metric = evaluator.evaluate(prediction)
    metric_name = evaluator.getMetricName()

    return metric_name, metric
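# Hypothetical usage of auc(); `test_df` and `fitted_model` are assumed names
# for a held-out DataFrame and a fitted classifier whose output matches the
# evaluator defaults ("rawPrediction", "label").
metric_name, metric = auc(test_df, fitted_model)
print("{}: {:.3f}".format(metric_name, metric))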
Example #4
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the model on the hard 0/1 "prediction" column first
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol='Status')
evaluator.evaluate(results)
results.show(15)
evaluator.evaluate(results)

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol='Status')
evaluator.evaluate(results)
evaluator.getMetricName()
print(log_reg.explainParams())
Example #5
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(adulttrain)
lrmodel = lr.setParams(regParam=0.01, maxIter=500,
                       fitIntercept=True).fit(adulttrain)

lrmodel.weights
lrmodel.intercept

#section 8.2.3
validpredicts = lrmodel.transform(adultvalid)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
bceval = BinaryClassificationEvaluator()
bceval.evaluate(validpredicts)
bceval.getMetricName()

bceval.setMetricName("areaUnderPR")
bceval.evaluate(validpredicts)

#section 8.2.5
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.tuning import ParamGridBuilder
cv = CrossValidator().setEstimator(lr).setEvaluator(bceval).setNumFolds(5)
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [1000]).addGrid(
    lr.regParam, [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]).build()
cv.setEstimatorParamMaps(paramGrid)
cvmodel = cv.fit(adulttrain)
cvmodel.bestModel.weights
BinaryClassificationEvaluator().evaluate(
    cvmodel.bestModel.transform(adultvalid))
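# A small follow-up sketch: avgMetrics on the fitted CrossValidatorModel lines
# up with the entries of paramGrid, so the best regParam can be read off
# directly (only names defined in this example are used).
for params, metric in zip(paramGrid, cvmodel.avgMetrics):
    print({p.name: v for p, v in params.items()}, "->", metric)
print("best", bceval.getMetricName(), ":", max(cvmodel.avgMetrics))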
Example #6
train_data, test_data = df_model2.randomSplit([.8, .2], seed=1234)
train_data.groupby('label').agg({'label': 'count'}).show()
test_data.groupby('label').agg({'label': 'count'}).show()
LR = LogisticRegression(labelCol="label",
                        featuresCol="featureX",
                        maxIter=10, regParam=0.3)
linearModel = LR.fit(train_data)
predictions = linearModel.transform(test_data)
predictions.printSchema()
selectedPredictions = predictions.select("label", "prediction", "probability")
selectedPredictions.show(8)

print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx PREDICTIONS xxxxxxxxxxxxx 10')
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print("PENCENTAGE OF GUESS ERROR :")
print(evaluator.evaluate(predictions))
print(evaluator.getMetricName())
print("TYPE OF TESTS CONDUCTED :"+evaluator.getMetricName())
print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx ROC METRIX xxxxxxxxxxxxx 11')
print("")
print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx DATA SCIENCE TWO #2 SUCCESSFUL xxxxxxxxxx DONE!')

Example #7
# Let's check the first 20 results
predictions.show()

####################################################################################################
# 4. Evaluation of the Model
####################################################################################################

# Now that you are done with the prediction process, let's evaluate the model
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# First, you need to build an evaluator
# "rawPredictionCol" can be either rawPrediction or probability (Either way will yield the same result)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# areaUnderROC and areaUnderPR are the metrics provided by this evaluator
print(evaluator.getMetricName(), "The AUC of the Model is {}".format(
    evaluator.evaluate(predictions)))
print("The AUC under PR curve is {}".format(
    evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})))

# If you want to check the default metric
evaluator.getMetricName()

# Some other model attributes can be inspected as follows
print('Model Intercept: ', lrModel.interceptVector)
print('Model coefficientMatrix: ', lrModel.coefficientMatrix)

####################################################################################################
# 5. Param Setting
####################################################################################################

# To extract the best params from the model, let's use ParamGridBuilder along with CV
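# A sketch of that grid search (hypothetical; assumes `lr` is the
# LogisticRegression estimator that produced lrModel, `evaluator` is the one
# defined above, and `train` is the training DataFrame).
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

grid = (ParamGridBuilder()
        .addGrid(lr.regParam, [0.01, 0.1, 0.5])
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
        .build())
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=3)
cvModel = cv.fit(train)
print("best {}: {}".format(evaluator.getMetricName(), max(cvModel.avgMetrics)))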
Example #8
    bestmodels(RFT_crossval, vectorized_CV_data, 2)

    RFT_model = RFT_crossval.fit(vectorized_CV_data)
    RFT_tree_model = RFT_model.bestModel.stages[2]

    ##### Prediction and model evaluation #####
    vectorized_test_data = vectorizeData(final_test_data)
    tf_data = CV_model.transform(vectorized_test_data)
    GBT_tf_data = GBT_model.transform(vectorized_test_data)
    RFT_tf_data = RFT_model.transform(vectorized_test_data)

    ################ Decision tree evaluation ###################################################
    print(f1_evaluator.getMetricName(), 'score:',
          f1_evaluator.evaluate(tf_data, {f1_evaluator.metricName: "f1"}))
    print(ROC_evaluator.getMetricName(), 'AUC:',
          ROC_evaluator.evaluate(tf_data))

    predictions = tf_data.select('indexedLabel', 'prediction', 'probability')
    resultdf = predictions.toPandas()
    print(accuracy_score(resultdf.indexedLabel, resultdf.prediction))
    print(confusion_matrix(resultdf.indexedLabel, resultdf.prediction))
    print(classification_report(resultdf.indexedLabel, resultdf.prediction))

    ################ GBDT evaluation ###################################################
    print(f1_evaluator.getMetricName(), 'score:',
          f1_evaluator.evaluate(GBT_tf_data))
    print(ROC_evaluator.getMetricName(), 'AUC:',
          ROC_evaluator.evaluate(GBT_tf_data))

    predictions = GBT_tf_data.select('indexedLabel', 'prediction',
                                     'probability')
Example #10
assemblerInputs = indexedCategoricalCols + numericColList
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
df = assembler.transform(df)

# Indexing binary labels
labeller = StringIndexer(inputCol=label, outputCol="label").fit(df)
df = labeller.transform(df).select(["features", "label"])

### Randomly split data into training and test sets. Set seed for reproducibility
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=100)

#dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dt = LogisticRegression(regParam=0.01)
model = dt.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)
evaluator = Evaluator()  # "Evaluator" presumably aliases BinaryClassificationEvaluator imported elsewhere in the source file
# Select example rows to display.
predictions.select("prediction", "label", "features").show()
# Evaluate the learned model
print("LogRegression Test %s: %f" %
      (evaluator.getMetricName(), evaluator.evaluate(predictions)))

nb = NaiveBayes(thresholds=[0.1, 1.0])
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.select("prediction", "label", "features").show()

print("Bayes Test %s: %f" %
      (evaluator.getMetricName(), evaluator.evaluate(predictions)))
binaryEvaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                                metricName='areaUnderROC')

# In[71]:

#generate splits for cross validation
splits = indexedData.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2])

# In[72]:

TotalAccuracy = 0

for i in range(5):

    testIndex = splits[i].select('id').collect()  #get test index for each fold
    rdd = sc.parallelize(testIndex)
    test_rdd = rdd.flatMap(lambda x: x).collect()
    test_Data = indexedData.filter(
        indexedData.id.isin(test_rdd))  #get test data for each fold
    train_Data = indexedData.filter(
        ~indexedData.id.isin(test_rdd))  #get train data for each model
    model = nb.fit(train_Data)  #fit train data to model
    transformed_data = model.transform(test_Data)  # evaluate test data
    accuracy = binaryEvaluator.evaluate(
        transformed_data)  # get accuracy for test data
    print(binaryEvaluator.getMetricName(), 'accuracy:', accuracy)
    TotalAccuracy = TotalAccuracy + accuracy

averageAccuracy = TotalAccuracy / 5  # get average accuracy
print(averageAccuracy)
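# The manual 5-fold loop above can also be expressed with the built-in
# CrossValidator; a sketch, assuming the `nb`, `binaryEvaluator`, and
# `indexedData` names from this snippet (the empty grid simply reuses nb's
# current parameters).
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

cv = CrossValidator(estimator=nb,
                    estimatorParamMaps=ParamGridBuilder().build(),
                    evaluator=binaryEvaluator,
                    numFolds=5)
cv_model = cv.fit(indexedData)
print(binaryEvaluator.getMetricName(), 'average:', cv_model.avgMetrics[0])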
Example #12
# Predict With Model
#################
logistic_regression_predictions = logistic_regression_pipeline_model.transform(test_data)

#################
# Evaluate Model
#################
logistic_regression_predictions_selected = logistic_regression_predictions.select(CAT_COLS + CONT_COLS + ["income", "income_str_idx", "prediction", "probability"])
logistic_regression_predictions_selected.show(30)
logistic_regression_predictions_selected.groupby('income').agg({'income': 'count'}).show()
lr_pred = logistic_regression_predictions.select("income_str_idx", "prediction")
lr_accuracy_rate = lr_pred.filter(lr_pred.income_str_idx == lr_pred.prediction).count() / (lr_pred.count() * 1.0)
print('MODEL RESULTS:')
print("Overall Accuracy: {}".format(lr_accuracy_rate))

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='income_str_idx')
print('{}: {}'.format(evaluator.getMetricName(), evaluator.evaluate(logistic_regression_predictions)))


#################
# Save and Load Model
#################
logistic_regression_pipeline_model.write().overwrite().save('my_logistic_regression_model_2.model')
loaded_lr_model = PipelineModel.load("my_logistic_regression_model_2.model")
more_predictions = loaded_lr_model.transform(test_data)
print('\nLOADED MODEL RESULTS:')
print("Coefficients: " + str(loaded_lr_model.stages[-1].coefficients))
print("Intercept: " + str(loaded_lr_model.stages[-1].intercept))
lr_pred = more_predictions.select("income_str_idx", "prediction")
loaded_accuracy = lr_pred.filter(lr_pred.income_str_idx == lr_pred.prediction).count() / (lr_pred.count() * 1.0)
print("Overall Accuracy Loaded: {}".format(loaded_accuracy))
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
)

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

pipeline_rf = Pipeline(stages=[rf])
paramGrid = ParamGridBuilder().addGrid(rf.maxDepth,
                                       [2, 3, 4, 5, 6, 7]).addGrid(
                                           rf.numTrees, [100, 300]).build()

evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="rawPrediction")
crossval = CrossValidator(estimator=pipeline_rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

## Fitting the CV
CV_model = crossval.fit(trainingData)

## Printing best model
print(CV_model.bestModel.stages[0])

test_pred = CV_model.transform(testData)
print(evaluator.getMetricName(), evaluator.evaluate(test_pred))
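# A sketch of inspecting the fitted CrossValidatorModel a bit further
# (uses only names defined above: CV_model, paramGrid, evaluator).
best_rf_model = CV_model.bestModel.stages[0]
print("trees in best model:", best_rf_model.getNumTrees)
for params, metric in zip(paramGrid, CV_model.avgMetrics):
    print({p.name: v for p, v in params.items()}, "->", metric)
print("best", evaluator.getMetricName(), ":", max(CV_model.avgMetrics))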
Example #14
# accuracy
def accuracy_m(model):
    predictions = model.transform(test_data)
    cm = predictions.select('label', 'prediction')
    acc = cm.filter(cm.label == cm.prediction).count() / cm.count()
    print('model accuracy : %.3f%%' % (acc * 100))


accuracy_m(model=linearModel)  #   model accuracy : 82.161%

# use ROC for binary classification: the y-axis of the ROC curve is the True Positive Rate (recall)  # TODO: verify
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
print(evaluator.evaluate(predictions))  #   0.8952698333157076
print(evaluator.getMetricName())  # areaUnderROC

# step 6) tune the hyperparameter
'''
To reduce computation time, we tune only the regularization parameter,
using just two values.
'''
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
param_grid = (ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.5]).build())

# time check and kfold=5
from time import *

start_time = time()
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=5)
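# A sketch of how this timing block typically finishes (assumes the
# `train_data` split and the names defined above).
cv_model = cv.fit(train_data)
end_time = time()
print("cross-validation took %.1f seconds" % (end_time - start_time))
print(evaluator.getMetricName(), max(cv_model.avgMetrics))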
Example #15
    args = parser.parse_args()

    in_file = args.csv
    CV_data = spark.read.csv(in_file, header=True, inferSchema=True)
    to_vectorize = [
        'step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest',
        'newbalanceDest', 'isFlaggedFraud'
    ]

    training_data, testing_data = CV_data.randomSplit([0.8, 0.2])

    #xytrain = labelData(training_data)

    xytrain = vectorizeData(training_data,
                            validCols=to_vectorize,
                            labelsCol='isFraud')
    lr = LogisticRegression(regParam=0.01)
    model = lr.fit(xytrain)

    xytest = vectorizeData(testing_data,
                           validCols=to_vectorize,
                           labelsCol='isFraud')
    predicted_train = model.transform(xytrain)
    predicted_test = model.transform(xytest)
    evaluator = Evaluator()  # "Evaluator" presumably aliases BinaryClassificationEvaluator imported elsewhere

    print("Train %s: %f" %
          (evaluator.getMetricName(), evaluator.evaluate(predicted_train)))
    print("Test %s: %f" %
          (evaluator.getMetricName(), evaluator.evaluate(predicted_test)))
Example #16
cm = predictions.select("label", "prediction")
cm.groupby('label').agg({'label': 'count'}).show()

cm.groupby('prediction').agg({'prediction': 'count'}).show()

cm.filter(cm.label == cm.prediction).count() / cm.count()

accuracy_m(model=linearModel)

### Use ROC 
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print(evaluator.evaluate(predictions))
print(evaluator.getMetricName())

print(evaluator.evaluate(predictions))


from pyspark.ml.tuning import ParamGridBuilder,CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5])
             .build())

from time import *
start_time = time()

# Create 5-fold CrossValidator
# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

# COMMAND ----------

# MAGIC %md Note that the default metric for the BinaryClassificationEvaluator is areaUnderROC

# COMMAND ----------

evaluator.getMetricName()

# COMMAND ----------

# MAGIC %md The evaluator currently accepts 2 kinds of metrics - areaUnderROC and areaUnderPR.
# MAGIC We can set it to areaUnderPR by using evaluator.setMetricName("areaUnderPR").

# COMMAND ----------

# MAGIC %md
# MAGIC Now we will try tuning the model with the ParamGridBuilder and the CrossValidator.
# MAGIC 
# MAGIC If you are unsure what params are available for tuning, you can use explainParams() to print a list of all params.

# COMMAND ----------
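# A sketch of the tuning cell described above (hypothetical; assumes the `lr`
# estimator behind `predictions`, the `evaluator` defined earlier in this
# notebook, and trainingData/testData splits).
print(lr.explainParams())  # lists every tunable param, as the note above suggests

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.1, 0.5])
             .addGrid(lr.maxIter, [10, 100])
             .build())
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=3)
cvModel = cv.fit(trainingData)
print(evaluator.getMetricName(), evaluator.evaluate(cvModel.transform(testData)))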
Example #18
        VectorAssembler(
            inputCols=["{0}_counts".format(i) for i in range(1, n + 1)],
            outputCol="features")
    ]
    nb = [NaiveBayes(smoothing=1.0, modelType="multinomial")]
    return Pipeline(stages=tokenizer + ngrams + vectorizers + assembler + nb)


model = build_ngrams(n=2).fit(train_data)
preds_valid = model.transform(valid_data)

#Evaluate the model. default metric : Area Under ROC..... areaUnderROC:0.609
# with text_clean: 0.607
# with text_clean + build_ngrams(n=2): 0.612
bceval = BinaryClassificationEvaluator()
print(bceval.getMetricName() + ":" + str(round(bceval.evaluate(preds_valid),
                                               3)))

#Evaluate the model. metric : Area Under PR...... areaUnderPR:0.732
# with text_clean: 0.728
# with text_clean + build_ngrams(n=2): 0.729
bceval.setMetricName("areaUnderPR")
print(bceval.getMetricName() + ":" + str(round(bceval.evaluate(preds_valid),
                                               3)))

#Evaluate the model. metric : F1 score...... f1:0.865
# with text_clean: 0.858
# with text_clean + build_ngrams(n=2): 0.882
mceval = MulticlassClassificationEvaluator(labelCol="label",
                                           predictionCol="prediction",
                                           metricName="f1")
print(mceval.getMetricName() + ":" +
      str(round(mceval.evaluate(preds_valid), 3)))