Example #1
                             "rawPrediction", "probability")
selected.show(1)
selected.printSchema()
#for row in selected.collect():
#    rid, actual, prob, prediction = row
#    print((rid, actual, prob, prediction))

from pyspark.ml.evaluation import BinaryClassificationEvaluator
import pyspark.sql.functions as F
import pyspark.sql.types as T

prob_extract = F.udf(lambda x: float(x[1]), T.FloatType())
#print(prediction.withColumn("prob1",prob_extract("probability")).select("prob1","prediction").show())
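
# A hedged sketch (not in the original): the extracted probability of class 1 can itself be
# fed to the evaluator, since rawPredictionCol accepts a plain numeric column as well as a
# vector; the cast to double matches the column type the evaluator expects.
scored = prediction.withColumn("prob1", prob_extract("probability").cast("double"))
prob_evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="prob1",
    labelCol="default_payment_next_month",
    metricName="areaUnderROC")
print("areaUnderROC from prob1:", prob_evaluator.evaluate(scored))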

evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    metricName="areaUnderROC",
    labelCol='default_payment_next_month')
print('Evaluator areaUnderROC: ' +
      str(evaluator.evaluate(prediction)))  # 0.7294563666075892

evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    metricName="areaUnderPR",
    labelCol='default_payment_next_month')
print('Evaluator areaUnderPR : ' +
      str(evaluator.evaluate(prediction)))

prediction.groupBy('default_payment_next_month', 'prediction').count().show()
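
# A hedged follow-up (not in the original): the same counts viewed as a crosstab,
# which lays the confusion matrix out as one row per actual label.
prediction.crosstab('default_payment_next_month', 'prediction').show()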

# Metrics
predictionRDD = prediction.select(['label', 'prediction']).rdd
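
# A hedged sketch (not from the original snippet) of how such an RDD is typically fed to
# the RDD-based metrics API; it assumes the rows carry numeric 'label' and 'prediction'
# fields, as selected above.
from pyspark.mllib.evaluation import MulticlassMetrics

pairRDD = predictionRDD.map(lambda row: (float(row['prediction']), float(row['label'])))
metrics = MulticlassMetrics(pairRDD)
print('Confusion matrix:\n', metrics.confusionMatrix().toArray())
print('Accuracy:', metrics.accuracy)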

# --- tail of a separate example: a pipeline builder for an n-gram text classifier ---
# (the stages referenced below -- tokenizer, remover, ngrams, cv, idf, assembler -- are
#  defined earlier in that original function)
    label_stringIdx = [StringIndexer(inputCol="label", outputCol="labels")]
    selector = [
        ChiSqSelector(numTopFeatures=2**14,
                      featuresCol='rawFeatures',
                      outputCol="features")
    ]
    lr = [LogisticRegression(maxIter=1000)]
    return Pipeline(stages=tokenizer + remover + ngrams + cv + idf +
                    assembler + label_stringIdx + selector + lr)


# load the previously saved PipelineModel and use it to score new data
pipeline_load = PipelineModel.load("pipeLineModel")
predictions = pipeline_load.transform(
    test_set)  #put dataframe for testing here
int(predictions.collect()[-1]['prediction'])  #prediction

#finding the accuracy of the model.
accuracy = predictions.filter(
    predictions.label == predictions.prediction).count() / float(
        test_set.count())
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
roc_auc = evaluator.evaluate(predictions)
print("Accuracy Score: ", accuracy)
print("ROC-AUC: {0:.4f}", roc_auc)

# use the loaded pipeline to make predictions on new data.
predictions = pipeline_load.transform(ddf)
int(predictions.collect()[-1]['prediction'])

# --- fragment from a separate example: the standard Spark ML cross-validation walkthrough ---
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
    # This will allow us to jointly choose parameters for all Pipeline stages.
    # A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    # We use a ParamGridBuilder to construct a grid of parameters to search over.
    # With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
    # this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()

    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=2)  # use 3+ folds in practice

    # Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(training)

    # Prepare test documents, which are unlabeled.
    Document = Row("id", "text")
    test = sc.parallelize([(4, "spark i j k"),
                           (5, "l m n"),
                           (6, "mapreduce spark"),
                           (7, "apache hadoop")]) \
        .map(lambda x: Document(*x)).toDF()

    # Make predictions on test documents. cvModel uses the best model found (lrModel).
    prediction = cvModel.transform(test)
Example #4
# MAGIC Since this is a binary classification problem, we define a `BinaryClassificationEvaluator`.
# MAGIC 
# MAGIC The default metrics are 
# MAGIC * Area under the precision-recall curve and 
# MAGIC * Area under the receiver operating characteristic (ROC) curve
# MAGIC 
# MAGIC For more information see:
# MAGIC * Scala: <a href="https://spark.apache.org/docs/latest/api/scala/#org.apache.spark.ml.evaluation.BinaryClassificationEvaluator" target="_blank">BinaryClassificationEvaluator</a>
# MAGIC * Python: <a href="https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.evaluation.BinaryClassificationEvaluator" target="_blank">BinaryClassificationEvaluator</a>

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = ( BinaryClassificationEvaluator()
    .setLabelCol("label")
    .setRawPredictionCol("prediction"))
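
# COMMAND ----------

# A hedged sketch (not in the original notebook): the metric can also be overridden per
# call without redefining the evaluator; `somePredictions` is a placeholder for a
# DataFrame produced by a fitted model's transform().
# aucROC = evaluator.evaluate(somePredictions, {evaluator.metricName: "areaUnderROC"})
# aucPR = evaluator.evaluate(somePredictions, {evaluator.metricName: "areaUnderPR"})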

# COMMAND ----------

# MAGIC %md ## Define CrossValidator
# MAGIC 
# MAGIC for selecting the best model and guarding against overfitting.

# COMMAND ----------

cv = ( CrossValidator()
    .setEstimator(pipeline)
    .setEvaluator(evaluator)
    .setEstimatorParamMaps(paramGrid)
    .setNumFolds(numFolds))
Example #5
def testModel(model, validate=validate):
    pred = model.transform(validate)
    evaluator = BinaryClassificationEvaluator(labelCol='index')
    return evaluator.evaluate(pred)
def main(sqlContext):
    """Main function takes a Spark SQL context."""
    spark = SparkSession(SparkContext.getOrCreate())
    # Task 1
    # if parquet exists, read
    try:
        # These are the two data frames we're working with
        comments = sqlContext.read.parquet("comments.parquet")
        submissions = sqlContext.read.parquet("submissions.parquet")
    except Exception:
        # Otherwise do the following
        comments = sqlContext.read.json("comments-minimal.json.bz2")
        submissions = sqlContext.read.json("submissions.json.bz2")
        labeled_data = sqlContext.read.format("csv").option(
            "header", "true").load("labeled_data.csv")
        comments.write.parquet("comments.parquet")
        submissions.write.parquet("submissions.parquet")

    # Task 2
    # join on labeled_data.Input_id and comments.id
    labeled_data = sqlContext.read.format("csv").option(
        "header", "true").load("labeled_data.csv")
    labeled_data.createOrReplaceTempView("labeled_data")
    comments.createOrReplaceTempView("comments")
    sqlDF = spark.sql(
        "SELECT l.id as id, l.body, r.labeldem as Dem,r.labelgop as GOP,r.labeldjt as Trump FROM labeled_data as r INNER JOIN comments as l ON r.Input_id = l.id "
    )
    # Task 3

    # Task 4 & Task 5
    sqlDF.createOrReplaceTempView("sqlDF")

    def parse(z):
        res1 = []
        res2 = []
        wordList = sanitize(z)
        for val in wordList[1:]:
            res1.append(val)
        for value in wordList[1:]:
            for val in value.split(" "):
                res2.append(val)
        return res1 + res2

    sqlContext.registerFunction("parser", lambda z: parse(z),
                                ArrayType(StringType()))

    parsedTable = spark.sql(
        "SELECT id, body, Trump, parser(body) as parsed FROM sqlDF")
    parsedTable.createOrReplaceTempView("parsedTable")

    # Task 6a
    # parsedTableID = spark.sql("SELECT id, parsed FROM parsedTable")
    cv = CountVectorizer(inputCol="parsed", outputCol="vectors", minDF=10.0)
    model = cv.fit(parsedTable)
    parsedVectorTable = model.transform(parsedTable)
    parsedVectorTable.createOrReplaceTempView("parsedVectorTable")

    # Task 6b
    resTable = spark.sql(
        "SELECT id, body, Trump, vectors, CASE WHEN Trump=1 THEN 1 ELSE 0 END AS positive, CASE WHEN Trump=-1 THEN 1 ELSE 0 END AS negative FROM parsedVectorTable"
    )

    # TASK 7
    # Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    poslr = LogisticRegression(labelCol="label",
                               featuresCol="vectors",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="label",
                               featuresCol="vectors",
                               maxIter=10)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
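    # For example (a hedged sketch, not in the original), a fuller grid could sweep several
    # values instead of assuming regParam = 1.0; note that cross-validation then refits the
    # model once per grid entry per fold, so runtime grows with the grid size.
    # posParamGrid = ParamGridBuilder() \
    #     .addGrid(poslr.regParam, [0.01, 0.1, 1.0]) \
    #     .addGrid(poslr.elasticNetParam, [0.0, 0.5]) \
    #     .build()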
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    pos = resTable.select("positive", "vectors")
    pos = pos.withColumnRenamed("positive", "label")
    neg = resTable.select("negative", "vectors")
    neg = neg.withColumnRenamed("negative", "label")

    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Once we train the models, we don't want to do it again. We can save the models and load them again later.
    posModel.save("project2/pos.model")
    negModel.save("project2/neg.model")

    # Task 8
    print('task 8 started')

    def strip_t3(s):
        return s[3:]

    # Sample data if needed
    #comment out later
    # comments = comments.sample(False, 0.0002)

    sqlContext.registerFunction("strip_t3", lambda z: strip_t3(z),
                                StringType())
    comments.createOrReplaceTempView("comments")
    submissions.createOrReplaceTempView("submissions")
    joined_data = spark.sql(
        "SELECT c.created_utc as created_time, s.title as post_title, c.author_flair_text as com_state, c.body as body, c.id as comment_id, s.id as submission_id, s.score as s_score, c.score as c_score FROM comments as c INNER JOIN submissions as s ON strip_t3(c.link_id) = s.id"
    )
    joined_data.createOrReplaceTempView("joined_data")
    # joined_data.show()

    # Task 9
    # dataframe_task9 = spark.sql("SELECT * FROM joined_data WHERE body NOT LIKE '&gt;%' AND body NOT LIKE '%/s%'")
    print('task 9 started')
    dataframe_task9 = spark.sql(
        "SELECT created_time, post_title, com_state, parser(body) as parsed, comment_id, submission_id, c_score, s_score FROM joined_data WHERE body NOT LIKE '&gt;%' AND body NOT LIKE '%/s%'"
    )
    dataframe_task9.createOrReplaceTempView("dataframe_task9")
    # dataframe_task9.show()

    cv_result = model.transform(dataframe_task9)
    pos_model = CrossValidatorModel.load('project2/pos.model')
    neg_model = CrossValidatorModel.load('project2/neg.model')

    pos = pos_model.transform(cv_result)
    pos.createOrReplaceTempView('pos')

    def posProbUDF(z):
        if z[1] > 0.2:
            return 1
        else:
            return 0

    def negProbUDF(z):
        if z[1] > 0.25:
            return 1
        else:
            return 0

    posProb = udf(posProbUDF, IntegerType())
    negProb = udf(negProbUDF, IntegerType())

    # sqlContext.registerFunction("posProbUDF", lambda z: parse(z), IntegerType())
    # sqlContext.registerFunction("negProbUDF", lambda z: parse(z), IntegerType())
    # pos = spark.sql("SELECT com_state, vectors, submission_id, created_time, s_score, c_score, rawPrediction as pos_rawPrediction, posProbUDF(probability) as pos_probability, prediction as pos_prediction FROM pos")
    # pos.createOrReplaceTempView('pos')
    pos = pos.select(col('com_state'), col('vectors'), col('submission_id'),
                     col('created_time'), col('s_score'), col('c_score'),
                     posProb("probability").alias("pos_probability"),
                     col("prediction").alias('pos_pred'))
    all_results = neg_model.transform(pos)
    total_result = all_results.select(
        col('com_state'), col('vectors'), col('submission_id'),
        col('created_time'), col('s_score'), col('pos_probability'),
        col('c_score'),
        negProb("probability").alias("neg_probability"), col("prediction"))

    # Task 10
    def getState(input_flair):
        states = [
            'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
            'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
            'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
            'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
            'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
            'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
            'New Jersey', 'New Mexico', 'New York', 'North Carolina',
            'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
            'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
            'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
            'West Virginia', 'Wisconsin', 'Wyoming'
        ]
        if str(input_flair) in states:
            return str(input_flair)
        return 'not_state'

    get_state = udf(getState, StringType())
    total_result = total_result.select('*',
                                       get_state('com_state').alias('state'))
    total_result.createOrReplaceTempView('final_results')

    # parsedTable = spark.sql("SELECT id, body, Trump, parser(body) as parsed FROM sqlDF")

    query_1 = spark.sql(
        "SELECT submission_id, AVG(pos_probability) as pos_prob, AVG(neg_probability) as neg_prob FROM final_results GROUP BY submission_id"
    )
    print('q1')
    query_2 = spark.sql(
        "SELECT date(from_unixtime(created_time)) as date, AVG(pos_probability) as pos_prob, AVG(neg_probability) as neg_prob FROM final_results GROUP BY date"
    )
    print('q2')
    query_3 = spark.sql(
        "SELECT state, AVG(pos_probability) as pos_prob, AVG(neg_probability) as neg_prob FROM final_results WHERE state !='not_state' GROUP BY state"
    )
    # print('q3')
    query_4c = spark.sql(
        "SELECT c_score, AVG(pos_probability) as pos_prob, AVG(neg_probability) as neg_prob  FROM final_results GROUP BY c_score"
    )
    query_4s = spark.sql(
        "SELECT s_score, AVG(pos_probability) as pos_prob, AVG(neg_probability) as neg_prob FROM final_results GROUP BY s_score"
    )

    query_1.toPandas().to_csv("query_1.csv")
    query_2.toPandas().to_csv("query_2.csv")
    query_3.toPandas().to_csv("query_3.csv")
    query_4c.toPandas().to_csv("query_4c.csv")
    query_4s.toPandas().to_csv("query_4s.csv")
Example #7
# ### using Grid Search and cross validation
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
RFclassifier = RandomForestClassifier(labelCol='label',
                                      featuresCol='features',
                                      impurity=param_impurity)

pipeline = Pipeline(stages=[labelIndexer, featureIndexer, RFclassifier])

# ### Define test configurations (to be evaluated in the grid)
paramGrid = ParamGridBuilder()\
   .addGrid(RFclassifier.maxDepth, param_maxDepth )\
   .addGrid(RFclassifier.numTrees, param_numTrees )\
   .build()

# ### Define the metric by which the model will be evaluated
evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')

crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    parallelism=3,  # number of models evaluated in parallel
    numFolds=2)

# ### fit model (note : returns the best model)
cvModel = crossval.fit(trainingData)

# ### show performance of the runs
print(cvModel.avgMetrics)
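
# A hedged follow-up (not in the original): pairing each grid entry with its mean metric
# makes avgMetrics easier to read; paramGrid is the list built above.
for params, metric in zip(paramGrid, cvModel.avgMetrics):
    print({p.name: v for p, v in params.items()}, '->', metric)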

# # Evaluation of model performance on validation dataset
hasher.transform(df_train).select("features").show()

from pyspark.ml.classification import LogisticRegression

classifier = LogisticRegression(maxIter=20,
                                regParam=0.000,
                                elasticNetParam=0.000)

stages = [hasher, classifier]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)

model = pipeline.fit(df_train)

predictions = model.transform(df_test)

predictions.cache()

from pyspark.ml.evaluation import BinaryClassificationEvaluator

ev = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                   metricName="areaUnderROC")
print(ev.evaluate(predictions))

spark.stop()

# In[ ]:
testData.show()

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

#Create the model
Classifier = LogisticRegression(regParam=0.0, labelCol="label",
                                featuresCol="features")
Model = Classifier.fit(trainingData)

#Predict on the test data
predictions = Model.transform(testData)
predictions.select("prediction","label").show()

#Evaluate the model (area under ROC by default)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label")
evaluator.evaluate(predictions)
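
# A hedged note (not in the original): pointing the evaluator at the hard "prediction"
# column collapses the ROC curve to a single threshold; the model's "rawPrediction"
# (or "probability") column preserves the full ranking and usually gives a more
# informative AUC.
evaluator_raw = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                              labelCol="label")
evaluator_raw.evaluate(predictions)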

# check for overfitting by comparing against predictions on the training data
predictions_train = Model.transform(trainingData)
predictions_train.select("prediction", "label").show()


#Draw a confusion matrix
predictions.groupBy("label","prediction").count().show()

###################################### INSULT as the output
#Split into training and testing data
(trainingData, testData) = INSULTDf.randomSplit([0.75, 0.25])
trainingData.count()
testData.count()
Example #10
# streamline all above steps into a pipeline
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])

# train model and predict results

# perform grid search looking for the best parameters and the best models
paramGrid = ParamGridBuilder()\
    .addGrid(hashingTF.numFeatures,[1000,5000,10000])\
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.3, 0.6])\
    .build()
tvs = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator().setMetricName('areaUnderPR'),
    trainRatio=0.8)
# set area under precision-recall curve as the evaluation metric - 80% of data will be used for training, 20% for validation

# run TrainValidationSplit and choose the best set of parameters
model = tvs.fit(train_set)

# make predictions
train_prediction = model.transform(train_set)
test_prediction = model.transform(test_set)

# report accuracy

# calculate the accuracy score for the best model
correct = test_prediction.filter(
    test_prediction.label == test_prediction.prediction).count()
accuracy = correct / float(test_prediction.count())


# --- tail of a separate example: the end of a (not shown) model-training helper, followed by a test helper ---
    return cvModel

def model_test(cvModel, df_test):
    """
    Evaluate the best model found by the cross-validator against a test dataset
    Parameters
    -----------
        cvModel: fitted CrossValidatorModel
        df_test: DataFrame
    returns
    -------
        results: dataframe
    """
    best_model = cvModel.bestModel
    results = best_model.transform(df_test)
    evaluatorb = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="prediction")
    evaluatorm = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
    print('Evaluation on the test set:')
    print('area under ROC: %f' % evaluatorb.evaluate(results, {evaluatorb.metricName: "areaUnderROC"}))
    print('area under PR: %f' % evaluatorb.evaluate(results, {evaluatorb.metricName: "areaUnderPR"}))
    print('Accuracy: %f' % evaluatorm.evaluate(results, {evaluatorm.metricName: "accuracy"}))
    print('F-1 Score: %f' % evaluatorm.evaluate(results, {evaluatorm.metricName: "f1"}))
    print('wPrecision: %f' % evaluatorm.evaluate(results, {evaluatorm.metricName: "weightedPrecision"}))
    print('wRecall: %f' % evaluatorm.evaluate(results, {evaluatorm.metricName: "weightedRecall"}))

    return results


def ExtractFeature(featureImp, dataset, featuresCol):
    """
    Return a DataFrame of feature weights (importances) from the trained model, matched to
    the feature names in the dataset, together with the results from the test set
Example #12
def SparkML(train_df,
            test_df=None,
            featuresCol='features',
            labelCol='label',
            binaryclass=False,
            multiclass=False,
            n_cluster=2,
            userCol='user',
            itemCol='item',
            ratingCol='rating',
            rank=10,
            userid=3,
            itemid=3,
            itemsCol='items',
            minSupport=0.3,
            minConfidence=0.8,
            stringIndexer=False,
            inputColStringIndexer=None,
            outputColStringIndexer=None,
            oneHotEncoder=False,
            inputColOneHotEncoder=None,
            outputColOneHotEncoder=None,
            vectorAssembler=False,
            inputColsVectorAssembler=None,
            outputColsVectorAssembler=None,
            vectorIndexer=False,
            inputColsVectorIndexer=None,
            outputColsVectorIndexer=None,
            maxCategories=None,
            classification=False,
            logisticregression=False,
            decisiontreeclassifier=False,
            linearsvc=False,
            naivebayes=False,
            randomforestclassifier=False,
            gbtclassifier=False,
            regression=False,
            linearregression=True,
            decisiontreeregressor=False,
            randomforestregressor=False,
            gbtregressor=False,
            clustering=False,
            kmeans=False,
            gaussianmixture=False,
            lda=False,
            recommendation=False,
            als=False,
            association=False,
            fpgrowth=False):
    if classification:
        if logisticregression:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            LRClassifier = LogisticRegression(featuresCol=featuresCol,
                                              labelCol=labelCol,
                                              predictionCol='Prediction',
                                              probabilityCol='Probability',
                                              rawPredictionCol='RawPrediction',
                                              standardization=True,
                                              maxIter=100,
                                              regParam=0.0,
                                              elasticNetParam=0.0,
                                              tol=1e-06,
                                              fitIntercept=True,
                                              threshold=0.5)
            paramGrid = ParamGridBuilder().addGrid(
                LRClassifier.maxIter,
                [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid(
                    LRClassifier.regParam,
                    [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0
                     ]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            LRCV = CrossValidator(estimator=LRClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(LRCV)
            LRC_Pipeline = Pipeline(stages=stagesList)
            LRC_PipelineModel = LRC_Pipeline.fit(train_df)
            LRC_Predicted = LRC_PipelineModel.transform(test_df)
            LRC_BestModel = LRC_PipelineModel.stages[-1].bestModel
            LRC_Probability = LRC_Predicted.select("Probability").toPandas()
            LRC_Prediction = LRC_Predicted.select("Prediction").toPandas()
            LRC_Score = evaluator.evaluate(LRC_Predicted)
            return LRC_BestModel, LRC_Predicted, LRC_Probability, LRC_Prediction, LRC_Score
        if decisiontreeclassifier:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            DTClassifier = DecisionTreeClassifier(
                featuresCol=featuresCol,
                labelCol=labelCol,
                predictionCol='Prediction',
                probabilityCol='Probability',
                rawPredictionCol='RawPrediction',
                maxDepth=5,
                maxBins=32,
                minInstancesPerNode=1,
                minInfoGain=0.0,
                impurity='gini',
                seed=None)
            paramGrid = ParamGridBuilder().addGrid(
                DTClassifier.impurity, ["gini", "entropy"]).addGrid(
                    DTClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                        DTClassifier.maxBins,
                        [3, 5, 10, 50, 100, 200]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            DTCV = CrossValidator(estimator=DTClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(DTCV)
            DTC_Pipeline = Pipeline(stages=stagesList)
            DTC_PipelineModel = DTC_Pipeline.fit(train_df)
            DTC_Predicted = DTC_PipelineModel.transform(test_df)
            DTC_BestModel = DTC_PipelineModel.stages[-1].bestModel
            DTC_Probability = DTC_Predicted.select("Probability").toPandas()
            DTC_Prediction = DTC_Predicted.select("Prediction").toPandas()
            DTC_Score = evaluator.evaluate(DTC_Predicted)
            return DTC_BestModel, DTC_Predicted, DTC_Probability, DTC_Prediction, DTC_Score
        if linearsvc:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            SVClassifier = LinearSVC(featuresCol=featuresCol,
                                     labelCol=labelCol,
                                     predictionCol='Prediction',
                                     rawPredictionCol='RawPrediction',
                                     maxIter=100,
                                     regParam=0.0,
                                     tol=1e-06,
                                     fitIntercept=True,
                                     standardization=True,
                                     threshold=0.0)
            paramGrid = ParamGridBuilder().addGrid(
                SVClassifier.maxIter,
                [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid(
                    SVClassifier.regParam,
                    [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0
                     ]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            SVCV = CrossValidator(estimator=SVClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(SVCV)
            SVC_Pipeline = Pipeline(stages=stagesList)
            SVC_PipelineModel = SVC_Pipeline.fit(train_df)
            SVC_Predicted = SVC_PipelineModel.transform(test_df)
            SVC_BestModel = SVC_PipelineModel.stages[-1].bestModel
            SVC_Prediction = SVC_Predicted.select("Prediction").toPandas()
            SVC_Score = evaluator.evaluate(SVC_Predicted)
            return SVC_BestModel, SVC_Predicted, SVC_Prediction, SVC_Score
        if naivebayes:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            NBClassifier = NaiveBayes(featuresCol=featuresCol,
                                      labelCol=labelCol,
                                      predictionCol='Prediction',
                                      probabilityCol='Probability',
                                      rawPredictionCol='RawPrediction',
                                      smoothing=1.0,
                                      modelType='multinomial',
                                      thresholds=None)
            paramGrid = ParamGridBuilder().addGrid(
                NBClassifier.smoothing,
                [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            NBCV = CrossValidator(estimator=NBClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(NBCV)
            NBC_Pipeline = Pipeline(stages=stagesList)
            NBC_PipelineModel = NBC_Pipeline.fit(train_df)
            NBC_Predicted = NBC_PipelineModel.transform(test_df)
            NBC_BestModel = NBC_PipelineModel.stages[-1].bestModel
            NBC_Probability = NBC_Predicted.select("Probability").toPandas()
            NBC_Prediction = NBC_Predicted.select("Prediction").toPandas()
            NBC_Score = evaluator.evaluate(NBC_Predicted)
            return NBC_BestModel, NBC_Predicted, NBC_Probability, NBC_Prediction, NBC_Score
        if randomforestclassifier:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            RFClassifier = RandomForestClassifier(
                featuresCol=featuresCol,
                labelCol=labelCol,
                predictionCol='Prediction',
                probabilityCol='Probability',
                rawPredictionCol='RawPrediction',
                maxDepth=5,
                maxBins=32,
                minInstancesPerNode=1,
                minInfoGain=0.0,
                impurity='gini',
                numTrees=20,
                featureSubsetStrategy='auto',
                seed=None,
                subsamplingRate=1.0)
            paramGrid = ParamGridBuilder().addGrid(
                RFClassifier.impurity, ["gini", "entropy"]).addGrid(
                    RFClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                        RFClassifier.maxBins,
                        [3, 5, 10, 50, 100, 200]).addGrid(
                            RFClassifier.numTrees,
                            [5, 10, 20, 50, 100, 200]).addGrid(
                                RFClassifier.subsamplingRate,
                                [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            if binaryclass:
                evaluator = BinaryClassificationEvaluator(
                    rawPredictionCol="RawPrediction",
                    labelCol=labelCol,
                    metricName="areaUnderROC")
            if multiclass:
                evaluator = MulticlassClassificationEvaluator(
                    labelCol=labelCol,
                    predictionCol="Prediction",
                    metricName="accuracy")
            RFCV = CrossValidator(estimator=RFClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(RFCV)
            RFC_Pipeline = Pipeline(stages=stagesList)
            RFC_PipelineModel = RFC_Pipeline.fit(train_df)
            RFC_Predicted = RFC_PipelineModel.transform(test_df)
            RFC_BestModel = RFC_PipelineModel.stages[-1].bestModel
            RFC_Probability = RFC_Predicted.select("Probability").toPandas()
            RFC_Prediction = RFC_Predicted.select("Prediction").toPandas()
            RFC_Score = evaluator.evaluate(RFC_Predicted)
            return RFC_BestModel, RFC_Predicted, RFC_Probability, RFC_Prediction, RFC_Score
        if gbtclassifier:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            GBClassifier = GBTClassifier(featuresCol=featuresCol,
                                         labelCol=labelCol,
                                         predictionCol='Prediction',
                                         maxDepth=5,
                                         maxBins=32,
                                         minInstancesPerNode=1,
                                         minInfoGain=0.0,
                                         lossType='logistic',
                                         maxIter=20,
                                         stepSize=0.1,
                                         seed=None,
                                         subsamplingRate=1.0)
            paramGrid = ParamGridBuilder().addGrid(
                GBClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    GBClassifier.maxBins, [3, 5, 10, 50, 100, 200]).addGrid(
                        GBClassifier.maxIter,
                        [5, 10, 20, 50, 100, 200]).addGrid(
                            GBClassifier.stepSize,
                            [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).addGrid(
                                GBClassifier.subsamplingRate,
                                [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            evaluator = MulticlassClassificationEvaluator(
                labelCol=labelCol,
                predictionCol="Prediction",
                metricName="accuracy")
            GBCV = CrossValidator(estimator=GBClassifier,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(GBCV)
            GBC_Pipeline = Pipeline(stages=stagesList)
            GBC_PipelineModel = GBC_Pipeline.fit(train_df)
            GBC_Predicted = GBC_PipelineModel.transform(test_df)
            GBC_BestModel = GBC_PipelineModel.stages[-1].bestModel
            GBC_Prediction = GBC_Predicted.select("Prediction").toPandas()
            GBC_Score = evaluator.evaluate(GBC_Predicted)
            return GBC_BestModel, GBC_Predicted, GBC_Prediction, GBC_Score
    if regression:
        if linearregression:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            LRegressor = LinearRegression(featuresCol=featuresCol,
                                          labelCol=labelCol,
                                          predictionCol='Prediction',
                                          standardization=True,
                                          fitIntercept=True,
                                          loss='squaredError',
                                          maxIter=100,
                                          regParam=0.0,
                                          elasticNetParam=0.0,
                                          tol=1e-06,
                                          epsilon=1.35)
            paramGrid = ParamGridBuilder().addGrid(
                LRegressor.maxIter,
                [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid(
                    LRegressor.regParam,
                    [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0
                     ]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            LRCV = CrossValidator(estimator=LRegressor,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(LRCV)
            LR_Pipeline = Pipeline(stages=stagesList)
            LR_PipelineModel = LR_Pipeline.fit(train_df)
            LR_Predicted = LR_PipelineModel.transform(test_df)
            LR_BestModel = LR_PipelineModel.stages[-1].bestModel
            LR_Prediction = LR_Predicted.select("Prediction").toPandas()
            LR_Score = evaluator.evaluate(LR_Predicted)
            return LR_BestModel, LR_Predicted, LR_Prediction, LR_Score
        if decisiontreeregressor:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            DTRegressor = DecisionTreeRegressor(featuresCol=featuresCol,
                                                labelCol=labelCol,
                                                predictionCol='Prediction',
                                                maxDepth=5,
                                                maxBins=32,
                                                minInstancesPerNode=1,
                                                minInfoGain=0.0,
                                                impurity='variance',
                                                seed=None,
                                                varianceCol=None)
            paramGrid = ParamGridBuilder().addGrid(
                DTRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    DTRegressor.maxBins, [3, 5, 10, 50, 100, 200]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            DTRCV = CrossValidator(estimator=DTRegressor,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(DTRCV)
            DTR_Pipeline = Pipeline(stages=stagesList)
            DTR_PipelineModel = DTR_Pipeline.fit(train_df)
            DTR_Predicted = DTR_PipelineModel.transform(test_df)
            DTR_BestModel = DTR_PipelineModel.stages[-1].bestModel
            DTR_Prediction = DTR_Predicted.select("Prediction").toPandas()
            DTR_Score = evaluator.evaluate(DTR_Predicted)
            return DTR_BestModel, DTR_Predicted, DTR_Prediction, DTR_Score
        if randomforestregressor:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            RFRegressor = RandomForestRegressor(featuresCol=featuresCol,
                                                labelCol=labelCol,
                                                predictionCol='Prediction',
                                                maxDepth=5,
                                                maxBins=32,
                                                minInstancesPerNode=1,
                                                minInfoGain=0.0,
                                                impurity='variance',
                                                subsamplingRate=1.0,
                                                seed=None,
                                                numTrees=20)
            paramGrid = ParamGridBuilder().addGrid(
                RFRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    RFRegressor.maxBins, [3, 5, 10, 50, 100, 200]).addGrid(
                        RFRegressor.numTrees,
                        [5, 10, 20, 50, 100, 200]).addGrid(
                            RFRegressor.subsamplingRate,
                            [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            RFRCV = CrossValidator(estimator=RFRegressor,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(RFRCV)
            RFR_Pipeline = Pipeline(stages=stagesList)
            RFR_PipelineModel = RFR_Pipeline.fit(train_df)
            RFR_Predicted = RFR_PipelineModel.transform(test_df)
            RFR_BestModel = RFR_PipelineModel.stages[-1].bestModel
            RFR_Prediction = RFR_Predicted.select("Prediction").toPandas()
            RFR_Score = evaluator.evaluate(RFR_Predicted)
            return RFR_BestModel, RFR_Predicted, RFR_Prediction, RFR_Score
        if gbtregressor:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            GBRegressor = GBTRegressor(featuresCol=featuresCol,
                                       labelCol=labelCol,
                                       predictionCol='Prediction',
                                       maxDepth=5,
                                       maxBins=32,
                                       minInstancesPerNode=1,
                                       minInfoGain=0.0,
                                       subsamplingRate=1.0,
                                       lossType='squared',
                                       maxIter=20,
                                       stepSize=0.1,
                                       seed=None,
                                       impurity='variance')
            paramGrid = ParamGridBuilder().addGrid(
                GBRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid(
                    GBRegressor.maxBins, [3, 5, 10, 50, 100, 200]).addGrid(
                        GBRegressor.maxIter,
                        [5, 10, 20, 50, 100, 200]).addGrid(
                            GBRegressor.stepSize,
                            [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).addGrid(
                                GBRegressor.subsamplingRate,
                                [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build()
            evaluator = RegressionEvaluator(labelCol=labelCol,
                                            predictionCol="Prediction",
                                            metricName="rmse")
            GBRCV = CrossValidator(estimator=GBRegressor,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(GBRCV)
            GBR_Pipeline = Pipeline(stages=stagesList)
            GBR_PipelineModel = GBR_Pipeline.fit(train_df)
            GBR_Predicted = GBR_PipelineModel.transform(test_df)
            GBR_BestModel = GBR_PipelineModel.stages[-1].bestModel
            GBR_Prediction = GBR_Predicted.select("Prediction").toPandas()
            GBR_Score = evaluator.evaluate(GBR_Predicted)
            return GBR_BestModel, GBR_Predicted, GBR_Prediction, GBR_Score
    if clustering:
        if kmeans:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            KCluster = KMeans(featuresCol=featuresCol,
                              predictionCol='Prediction',
                              k=n_cluster,
                              initMode='k-means||',
                              initSteps=2,
                              tol=0.0001,
                              maxIter=20,
                              seed=None)
            paramGrid = ParamGridBuilder().addGrid(
                KCluster.initSteps, [1, 2, 5, 10, 20, 50, 100]).addGrid(
                    KCluster.maxIter,
                    [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid(
                        KCluster.seed, [i for i in range(1001)]).build()
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            KMCV = CrossValidator(estimator=KCluster,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(KMCV)
            KMC_Pipeline = Pipeline(stages=stagesList)
            KMC_PipelineModel = KMC_Pipeline.fit(train_df)
            KMC_Predicted = KMC_PipelineModel.transform(train_df)
            KMC_BestModel = KMC_PipelineModel.stages[-1].bestModel
            KMC_Prediction = KMC_Predicted.select("Prediction").toPandas()
            KMC_Score = evaluator.evaluate(KMC_Predicted)
            return KMC_BestModel, KMC_Predicted, KMC_Prediction, KMC_Score
        if gaussianmixture:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            GMCluster = GaussianMixture(featuresCol=featuresCol,
                                        predictionCol='Prediction',
                                        probabilityCol='Probability',
                                        k=n_cluster,
                                        tol=0.01,
                                        maxIter=100,
                                        seed=None)
            paramGrid = ParamGridBuilder().addGrid(
                GMCluster.maxIter,
                [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid(
                    GMCluster.seed, [i for i in range(1001)]).build()
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
            GMCV = CrossValidator(estimator=GMCluster,
                                  evaluator=evaluator,
                                  estimatorParamMaps=paramGrid,
                                  numFolds=10)
            stagesList.append(GMCV)
            GMC_Pipeline = Pipeline(stages=stagesList)
            GMC_PipelineModel = GMC_Pipeline.fit(train_df)
            GMC_Predicted = GMC_PipelineModel.transform(train_df)
            GMC_BestModel = GMC_PipelineModel.stages[-1].bestModel
            GMC_Probability = GMC_Predicted.select("Probability").toPandas()
            GMC_Prediction = GMC_Predicted.select("Prediction").toPandas()
            GMC_Score = evaluator.evaluate(GMC_Predicted)
            return GMC_BestModel, GMC_Predicted, GMC_Probability, GMC_Prediction, GMC_Score
        if lda:
            stagesList = FeaturesTransform(
                stringIndexer=stringIndexer,
                inputColStringIndexer=inputColStringIndexer,
                outputColStringIndexer=outputColStringIndexer,
                oneHotEncoder=oneHotEncoder,
                inputColOneHotEncoder=inputColOneHotEncoder,
                outputColOneHotEncoder=outputColOneHotEncoder,
                vectorAssembler=vectorAssembler,
                inputColsVectorAssembler=inputColsVectorAssembler,
                outputColsVectorAssembler=outputColsVectorAssembler,
                vectorIndexer=vectorIndexer,
                inputColsVectorIndexer=inputColsVectorIndexer,
                outputColsVectorIndexer=outputColsVectorIndexer,
                maxCategories=maxCategories)
            LDACluster = LDA(featuresCol=featuresCol,
                             maxIter=20,
                             seed=None,
                             k=n_cluster,
                             learningOffset=1024.0,
                             learningDecay=0.51,
                             subsamplingRate=0.05)
            paramGrid = ParamGridBuilder().addGrid(
                LDACluster.maxIter,
                [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid(
                    LDACluster.seed, [i for i in range(1001)]).addGrid(
                        LDACluster.subsamplingRate,
                        [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).build()
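            # Note: this grid has 8 x 1001 x 6 = 48,048 parameter settings, each fit
            # 10 times by the cross validator below; in practice a far smaller grid
            # (a handful of seeds and subsampling rates) is used.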
            evaluator = ClusteringEvaluator(predictionCol='Prediction',
                                            featuresCol=featuresCol,
                                            metricName='silhouette')
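            # Caveat: LDA.transform() adds a 'topicDistribution' column rather than a
            # cluster prediction column, so a ClusteringEvaluator configured with
            # predictionCol='Prediction' will not find its input; LDA models are more
            # commonly scored with logLikelihood()/logPerplexity().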
            LDACV = CrossValidator(estimator=LDACluster,
                                   evaluator=evaluator,
                                   estimatorParamMaps=paramGrid,
                                   numFolds=10)
            stagesList.append(LDACV)
            LDA_Pipeline = Pipeline(stages=stagesList)
            LDA_PipelineModel = LDA_Pipeline.fit(train_df)
            LDA_Predicted = LDA_PipelineModel.transform(train_df)
            LDA_BestModel = LDA_PipelineModel.stages[-1].bestModel
            LDA_Topics = LDA_BestModel.describeTopics().toPandas()
            LDA_Score = evaluator.evaluate(LDA_Predicted)
            return LDA_BestModel, LDA_Topics, LDA_Score
    if recommendation:
        if als:
            ALSR = ALS(userCol=userCol,
                       itemCol=itemCol,
                       ratingCol=ratingCol,
                       rank=rank,
                       maxIter=10,
                       regParam=0.1,
                       numUserBlocks=10,
                       numItemBlocks=10,
                       alpha=1.0,
                       seed=1)
            ALSR_Model = ALSR.fit(train_df)
            # recommendForAllUsers / recommendForAllItems take the number of
            # recommendations to return per user / per item
            ALSR_ForUsers = ALSR_Model.recommendForAllUsers(numItems=userid)
            ALSR_ForItems = ALSR_Model.recommendForAllItems(numUsers=itemid)
            return ALSR_Model, ALSR_ForUsers, ALSR_ForItems
    if association:
        if fpgrowth:
            fpg = FPGrowth(minSupport=minSupport,
                           minConfidence=minConfidence,
                           itemsCol=itemsCol,
                           predictionCol='Prediction')
            fpg_model = fpg.fit(train_df)
            fpg_freqItemsets = fpg_model.freqItemsets.toPandas()
            fpg_associationRules = fpg_model.associationRules.toPandas()
            return fpg_model, fpg_freqItemsets, fpg_associationRules
Beispiel #13
0
#accuracy = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='affairs',metricName='accuracy',metricLabel=1).evaluate(lrPredictions)
#weightedPrecision = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='affairs',metricName='weightedPrecision',metricLabel=1).evaluate(lrPredictions)
#weightedRecall = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='affairs',metricName='weightedRecall',metricLabel=1).evaluate(lrPredictions)

# Classification report
report = Predictions.select("prediction", "labels", "features",
                            "probability").toPandas()
print(
    classification_report(y_true=report['labels'],
                          y_pred=report['prediction']))
# Evaluate model performance with a confusion matrix [[TP, FN], [TN, FP]]
TP = Predictions.filter(Predictions['prediction'] == 1).filter(
    Predictions['labels'] == 1).count()
FN = Predictions.filter(Predictions['prediction'] == 0).filter(
    Predictions['labels'] == 1).count()
TN = Predictions.filter(Predictions['prediction'] == 0).filter(
    Predictions['labels'] == 0).count()
FP = Predictions.filter(Predictions['prediction'] == 1).filter(
    Predictions['labels'] == 0).count()
# Precision: TP / (TP + FP)
precision = TP / (TP + FP)
# Recall: TP / (TP + FN)
recall = TP / (TP + FN)
# F1 score: 2 * precision * recall / (precision + recall)
F1 = (2 * precision * recall) / (precision + recall)
# Accuracy: (TP + TN) / (TP + TN + FP + FN)
accuracy = (TP + TN) / (TP + TN + FP + FN)
auc = BinaryClassificationEvaluator(labelCol='labels').evaluate(Predictions)
print(
    " f1:%1.2f\n accuracy%1.2f\n Precision:%1.2f\n Recall:%1.2f\n auc:%1.2f " %
    (F1, accuracy, precision, recall, auc))
Beispiel #14
0
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embark_encoder = OneHotEncoder(
    inputCol='EmbarkedIndex', outputCol='EmbarkedVec'
)  # ---> each entry will be converted to a vector A = [1, 0] B = [0, 1]

new_cols = ['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkedVec']
assembler = VectorAssembler(inputCols=new_cols, outputCol='features')

logreg_titanic = LogisticRegression(featuresCol='features',
                                    labelCol='Survived')

pipeline = Pipeline(stages=[
    gender_indexer, embark_indexer, gender_encoder, embark_encoder, assembler,
    logreg_titanic
])

train_data, test_data = final_data.randomSplit([0.7, 0.3])

fit_model = pipeline.fit(train_data)

results = fit_model.transform(test_data)
evaluate = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                         labelCol='Survived')

results.select(['Survived', 'prediction']).show()

# The default metric is areaUnderROC, so this value is an AUC, not an accuracy
auc = evaluate.evaluate(results)
print(auc)
'''
'''
def dfEvaluation(predictions):
    evaluator = BinaryClassificationEvaluator()
    # auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
    # print("ROC curve value: ", auroc)
    # The default metric is areaUnderROC, so this value is an AUC, not an accuracy
    auroc = evaluator.evaluate(predictions)
    print("ROC AUC : ", auroc * 100)
Beispiel #16
0
def train(params):
  with mlflow.start_run():
    
    impurity = params['impurity']
    max_depth = int(params['max_depth'])
    max_bins = int(params['max_bins'])
    mlflow.log_param('impurity', impurity)
    mlflow.log_param('max_depth', max_depth)
    mlflow.log_param('max_bins', max_bins)
      
    parameters = ['condition', 'num_conditions', 'days']
    for parameter in parameters:
      mlflow.log_param(parameter, dbutils.widgets.get(parameter))
  
    dt = DecisionTreeClassifier(impurity=impurity, maxDepth=max_depth, maxBins=max_bins)
  
    model = dt.fit(training_encounters)
    mlflow.spark.log_model(model, 'patient-trajectory+PtAge')
  
    (testing_encounters, _) = featurize_encounters(test_patients, string_indicers=string_indicers)
  
    bce = BinaryClassificationEvaluator()
    test_transformed = model.transform(testing_encounters)
    aroc = bce.evaluate(test_transformed, {bce.metricName: "areaUnderROC"})
    aPR = bce.evaluate(test_transformed, {bce.metricName: "areaUnderPR"})
    
    # use sklearn to calculate evaluation metrics
    from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score
    y_test = test_transformed.select('label').toPandas()
    y_pred = test_transformed.select('prediction').toPandas()
    
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test,y_pred)
    
    # get classification metrics as a dictionary
    class_report = classification_report(y_test,y_pred, output_dict=True)
    recall_0 = class_report['0']['recall']
    f1_score_0 = class_report['0']['f1-score']
    
    # log metrics
    mlflow.log_metric("accuracy_score", acc)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall_0", recall_0)
    mlflow.log_metric("f1_score_0", f1_score_0)
    mlflow.log_metric("area_under_ROC", aroc)
    mlflow.log_metric("area_under_PR", aPR)
    
  return {'loss': -aroc, 'status': STATUS_OK}

# COMMAND ----------

    result = (test_transformed.cube('prediction', 'label').count()
            .where(col('prediction').isNotNull() & col('label').isNotNull())
            .withColumn('param', when((col('prediction')==1) & (col('label')==1), 'TP') 
                        .when((col('prediction')==0) & (col('label')==0), 'TN')
                        .when((col('prediction')==1) & (col('label')==0), 'FP')
                        .otherwise('FN'))
            .select('param', 'count').toPandas())
    
    # get confusion matrix values
    true_positive = result.iloc[0, 1]
    true_negative = result.iloc[3, 1]
    false_positive = result.iloc[1, 1]
    false_negative = result.iloc[2, 1]
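    # Note: the iloc[] indexing above assumes a fixed row order in the cube output,
    # which Spark does not guarantee; filtering result by its 'param' value is more robust.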
Beispiel #17
0
    # Step - 2: Transform dataframe to vectorized dataframe
    output = assembler.transform(animals).select("features", "eatable",
                                                 "cyr_name")

    output.cache()

    # Step - 3: Set up the LinearSVC Classifier
    trainer = LinearSVC(labelCol="eatable", featuresCol="features")

    # Step - 4: Train the model
    model = trainer.fit(output)

    print("Coefficients: " + str(model.coefficients) + " Intercept: " +
          str(model.intercept))

    rawPredictions = model.transform(output)

    predictions = enrichPredictions(rawPredictions)

    predictions.show(100)

    # Step - 5: Evaluate prediction
    evaluator = BinaryClassificationEvaluator(labelCol="eatable",
                                              rawPredictionCol="prediction")

    # Step - 6: Calculate ROC AUC
    rocAuc = evaluator.evaluate(rawPredictions)
    print("ROC_AUC = %g " % rocAuc)

    spark.stop()
Beispiel #18
0
# LogisticRegression.transform() will only use the 'features' column.

predictions = lrModel.transform(testData)
predictions.show()

# You can see how many it predicted incorrectly
predictions.groupBy('label', 'prediction').count().show()

# ----------------------------------------------------------------MODEL EVALUATION----------------------------------------------------------
# We can use BinaryClassificationEvaluator to evaluate our model.
# We can set the required column names in rawPredictionCol and labelCol Param and the metric in metricName Param.

# Evaluate model
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol='label',
                                          rawPredictionCol="rawPrediction",
                                          metricName='areaUnderROC')
evaluator.evaluate(predictions)
# Note that the default metric for the BinaryClassificationEvaluator is areaUnderROC
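# A minimal sketch of requesting a different metric from the same evaluator
# (assuming the `evaluator` and `predictions` objects defined above):
evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})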
print(lr.explainParams())

# Model summary
trainingSummary = lrModel.summary
trainingSummary.accuracy
trainingSummary.areaUnderROC

# Plots
roc = trainingSummary.roc.toPandas()
plt.figure()
plt.plot(roc['FPR'], roc['TPR'])
Beispiel #19
0
def randomForest(trainingData,
                 testData,
                 impurity,
                 maxDepth,
                 maxBins,
                 numTrees,
                 enableCrossValidator=False,
                 featuresCol='features',
                 labelCol='label',
                 predictionCol='prediction',
                 probabilityCol='probability',
                 rawPredictionCol='rawPrediction',
                 minInstancesPerNode=1,
                 minInfoGain=0.0,
                 maxMemoryInMB=256,
                 cacheNodeIds=False,
                 checkpointInterval=10,
                 featureSubsetStrategy='auto',
                 seed=None,
                 subsamplingRate=1.0):

    print("\nInizio classificazione con RandomForestClassifier")

    # Initialize the classifier model with the input parameters (and the defaults)
    rfc = RandomForestClassifier(featuresCol=featuresCol,
                                 labelCol=labelCol,
                                 predictionCol=predictionCol,
                                 probabilityCol=probabilityCol,
                                 rawPredictionCol=rawPredictionCol,
                                 maxDepth=maxDepth,
                                 maxBins=maxBins,
                                 minInstancesPerNode=minInstancesPerNode,
                                 minInfoGain=minInfoGain,
                                 maxMemoryInMB=maxMemoryInMB,
                                 cacheNodeIds=cacheNodeIds,
                                 checkpointInterval=checkpointInterval,
                                 impurity=impurity,
                                 numTrees=numTrees,
                                 featureSubsetStrategy=featureSubsetStrategy,
                                 seed=seed,
                                 subsamplingRate=subsamplingRate)

    print("    -modello creato")

    validator = None
    # If cross validation is enabled
    if enableCrossValidator:
        # Build the parameter grid
        paramGrid = ParamGridBuilder().build()

        # Initialize the evaluator
        evaluator = BinaryClassificationEvaluator()

        # Set up k-fold cross validation, where estimator is the classifier to evaluate and numFolds is the K
        crossVal = CrossValidator(estimator=rfc,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=evaluator,
                                  numFolds=5)  # use 3+ folds in practice
        validator = crossVal
    else:
        validator = rfc

    print("    -validator creato")

    training = trainingData.map(lambda x: (x[31], Vectors.dense(x[1:29]), x[
        30])).toDF(schema=['index', 'features', 'label']).orderBy('index')

    # Configure an ML pipeline; here it consists of a single stage, the validator
    # tokenizer = Tokenizer(inputCol="features", outputCol="transactions")
    # hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures", numFeatures=29)

    pipeline = Pipeline(stages=[validator])

    model = pipeline.fit(training)

    print("    -modello addestrato con la pipeline (" + str(training.count()) +
          " elementi utilizzati come training)")

    test = testData.map(lambda x: (x[30], Vectors.dense(x[1:29]), x[31])).toDF(
        schema=['label', 'features', 'index']).orderBy('index')

    # prediction = predictions, label, index
    predictionsAndLabels = model.transform(test).rdd.map(lambda x:
                                                         (x[5], x[0], x[2]))

    print("    -" + str(predictionsAndLabels.count()) +
          " elementi predetti (" + str(test.count()) +
          " elementi usati come test)")

    return predictionsAndLabels
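
# A hypothetical call to the function above (parameter values are illustrative only):
# predictionsAndLabels = randomForest(trainingData, testData, impurity='gini',
#                                     maxDepth=5, maxBins=32, numTrees=20,
#                                     enableCrossValidator=True)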
Beispiel #20
0
# Step 9
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# Vectorize
vecAssembler = VectorAssembler(inputCols=[
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'loan',
    'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous',
    'poutcome'
], outputCol="features")
# Transform data
df_train = vecAssembler.transform(train_data)
pd.DataFrame(df_train.take(5), columns=df_train.columns).transpose()
dt = DecisionTreeClassifier(labelCol="deposit", featuresCol="features")
pipeline = Pipeline(stages=[vecAssembler, dt])
model = pipeline.fit(train_data)
predictions = model.transform(test_data)
#Select prediction information
predictions.select("prediction", "Classification", "features").toPandas().head(10)
evaluator = BinaryClassificationEvaluator(labelCol="deposit", rawPredictionCol="prediction")
evaluator.evaluate(predictions)
paramGrid = ParamGridBuilder().addGrid(dt.maxDepth,[2,3,4,5,6,7,8,9,10,11,12]).build()
# Set up 3-fold cross validation
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator, 	
                          numFolds=3)
CV_model = crossval.fit(train_data)
tree_model = CV_model.bestModel.stages[1]
print(tree_model)
predictions_improved = CV_model.bestModel.transform(test_data)
predictions_improved.select("prediction", "deposit", "features").toPandas().head(10)
evaluator.evaluate(predictions_improved)
    # ****************************************************************************** #
    # Run Logistic Regression Classification.                                        #
    # ****************************************************************************** #

    lr = LogisticRegression(family='binomial',
                            featuresCol='features',
                            labelCol='label',
                            predictionCol='pred',
                            rawPredictionCol='pred_raw',
                            maxIter=10)
    lr_model = lr.fit(trainDF)
    lr_result = lr_model.transform(testDF)

    # Create an evaluator to measure classification performance.
    evaluator1 = BinaryClassificationEvaluator(rawPredictionCol='pred_raw',
                                               labelCol='label',
                                               metricName='areaUnderPR')
    area_under_pr = evaluator1.evaluate(lr_result)
    evaluator2 = MulticlassClassificationEvaluator(predictionCol="pred",
                                                   labelCol="label",
                                                   metricName="f1")
    f1_score = evaluator2.evaluate(lr_result)
    evaluator3 = MulticlassClassificationEvaluator(predictionCol="pred",
                                                   labelCol="label",
                                                   metricName="accuracy")
    accuracy = evaluator3.evaluate(lr_result)

    print("")
    print(
        "########################################################################"
    )
Beispiel #22
0
# Generate predictions on the test DataFrame:
test_with_prediction = log_reg_model.transform(test)
test_with_prediction.show(5)

test_summary_pred = log_reg_model.evaluate(test_with_prediction)
plot_roc_curve(test_summary_pred)

# **Note:** The resulting DataFrame includes three kinds of predictions:
# `rawPrediction` is a vector of log-odds, `probability` is a vector of
# probabilities, and `prediction` is the predicted class based on the
# probability vector.
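
# For instance, the three columns can be inspected side by side (a sketch, assuming
# the `test_with_prediction` DataFrame created above):
test_with_prediction.select("rawPrediction", "probability", "prediction").show(5, truncate=False)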

# Create an instance of the `BinaryClassificationEvaluator` class:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol="high_rating",
                                          metricName="areaUnderROC")
print(evaluator.explainParams())
evaluator.evaluate(test_with_prediction)

# Evaluate using the areaUnderPR metric:
evaluator.setMetricName("areaUnderPR").evaluate(test_with_prediction)

# ## References

# [Spark Documentation - Classification and regression](https://spark.apache.org/docs/latest/ml-classification-regression.html)

# [Spark Python API - pyspark.ml.feature module](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#module-pyspark.ml.feature)

# [Spark Python API - pyspark.ml.classification module](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#module-pyspark.ml.classification)
Beispiel #23
0
# In[65]:

final_model = LogisticRegression()
fit_final_model = final_model.fit(lr_train)

# <font size=4,font style=arial>
# train setine uyguladığımız modeli test edelim
# </font>

# In[66]:

predictions_and_labels = fit_final_model.evaluate(lr_test)

# <font size=4,font style=arial>
# label gerçekleşen ve prediction da tahmin olmak üzere aşağıda ki şekildedir
# </font>

# In[67]:

predictions_and_labels.predictions.show(100, truncate=False)

# <font size=4,font style=arial>
# Roc eğrisinin altında ki alanı rakam olarak görelim. 1'e yakın bir değer iyi bir değerdir. Veri seti manual oluşturulduğundan aşağıda ki şekilde bir değer çıkmıştır.
# </font>

# In[68]:

my_eval = BinaryClassificationEvaluator()
my_final_roc = my_eval.evaluate(predictions_and_labels.predictions)
my_final_roc
Beispiel #24
0
def evaluate_ROC(predictions):
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    evaluator = BinaryClassificationEvaluator()
    return evaluator.evaluate(predictions)
def main(context):
    """Main function takes a Spark SQL context."""
    # TASK 1
    # the read is from the parquet file
    comments = sqlContext.read.parquet("comments-minimal.parquet")
    submissions = sqlContext.read.parquet("submissions.parquet")

    # only look at columns that are useful
    comments = comments.select("id","created_utc","body","author_flair_text", "link_id", "score").\
        withColumnRenamed("score", "commentscore")
    submissions = submissions.select("id", "title", "score").\
        withColumnRenamed("score", "storyscore")

    #comments.write.parquet("comments-minimal.parquet")
    #submissions.write.parquet("submissions.parquet")

    # TASK 2
    labeled_data = sqlContext.read.format("csv").options(
        header='true', inferSchema='true').load('labeled_data.csv')

    #here we do the join on comment id
    joined = comments.join(labeled_data, comments.id == labeled_data.Input_id)
    #comments.join(labeled_data, comments.id == labeled_data.Input_id).explain()

    # TASK 4
    #sanitize_new ignores processed string given by sanitize
    from cleantext import sanitize

    def sanitize_new(text):
        r = sanitize(text)[1:]
        return r[0].split(" ") + r[1].split(" ") + r[2].split(" ")

    # TASK 5
    #create the udf, generate new column of n-grams
    sanitize_udf = udf(sanitize_new, ArrayType(StringType()))
    joined = joined.withColumn("ngrams", sanitize_udf(joined.body))

    # TASK 6A
    # construct feature vector based on "ngrams"
    #store the transformed column in "features"
    #CountVectroizer produces sparse vector by default so no need to change
    cv = CountVectorizer(inputCol="ngrams",
                         outputCol="features",
                         minDF=5.0,
                         binary=True)
    cv_model = cv.fit(joined)
    joined = cv_model.transform(joined)

    # TASK 6B
    # construct pos column and neg column
    #for this project, only look at label on Trump
    pos_udf = udf(lambda label: 1 if label == 1 else 0, IntegerType())
    neg_udf = udf(lambda label: 1 if label == -1 else 0, IntegerType())
    joined = joined.withColumn("poslabel", pos_udf(joined.labeldjt))
    joined = joined.withColumn("neglabel", neg_udf(joined.labeldjt))

    # TASK 7
    #train logistic regression model
    #code adopted from project spec
    #Initialize two logistic regression models.
    poslr = LogisticRegression(labelCol="poslabel",
                               featuresCol="features",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="neglabel",
                               featuresCol="features",
                               maxIter=10)
    poslr.setThreshold(0.2)
    neglr.setThreshold(0.25)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator(labelCol="poslabel")
    negEvaluator = BinaryClassificationEvaluator(labelCol="neglabel")
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
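    # A fuller search could expand these grids, e.g. (illustrative values only):
    # posParamGrid = ParamGridBuilder() \
    #     .addGrid(poslr.regParam, [0.01, 0.1, 1.0]) \
    #     .addGrid(poslr.elasticNetParam, [0.0, 0.5, 1.0]) \
    #     .build()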
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = joined.randomSplit([0.5, 0.5])
    negTrain, negTest = joined.randomSplit([0.5, 0.5])

    # Train the models
    posModel = posCrossval.fit(posTrain)
    negModel = negCrossval.fit(negTrain)

    # TASK: Extra Credit's Curve:
    # evaluate the model
    #     posTestRes = posModel.transform(posTest).toPandas()['probability']
    #     posTestRes = np.array([i[1] for i in posTestRes])
    #     negTestRes = negModel.transform(negTest).toPandas()['probability']
    #     negTestRes = np.array([i[1] for i in negTestRes])
    #     print(negTestRes, posTestRes)
    #     print('ok')
    #     pfpr, ptpr, _ = metrics.roc_curve(posTest.select('poslabel').toPandas(), posTestRes)
    #     nfpr, ntpr, _ = metrics.roc_curve(negTest.select('neglabel').toPandas(), negTestRes)
    #     print(pfpr[:5], ptpr[:5], nfpr[:5],ntpr[:5])
    #     plt.plot(pfpr, ptpr, label = 'posModel')
    #     plt.plot(nfpr, ntpr, label = 'negModel')
    #     plt.legend()
    #     plt.savefig('ROC.png')
    #     plt.close()
    #     # save the models
    #     posModel.save("www/pos.model")
    #     negModel.save("www/neg.model")

    #load instead
    #     posModel = CrossValidatorModel.load("www/pos.model")
    #     negModel = CrossValidatorModel.load("www/neg.model")
    #print("finished loading model")

    # TASK 8.1
    # selected column 'created_utc' and transformed in 10.2 using from_unixtime

    # TASK 8.2
    # title of submission of the comment
    comments = comments.withColumn("clean_id",
                                   regexp_replace("link_id", r'^t3_', ''))
    comments = comments.join(
        submissions, comments.clean_id == submissions.id).drop(submissions.id)

    # TASK 8.3
    # Please see TASK 10.3 (by state) line 166

    # TASK 9
    #filter out comments with "\s" and starts with "&gt"
    comments = comments.filter(~comments.body.rlike(r'^&gt')).\
        filter(~comments.body.rlike(r'\\s'))
    #sample
    comments = comments.sample(
        False, sampleRate,
        None)  # pass an integer seed (e.g. 1) instead of None to make the sample reproducible
    #redo 4,5,6a
    comments = comments.withColumn("ngrams", sanitize_udf(comments.body))
    comments = cv_model.transform(comments)
    #print("done with transforming the sampled comments")

    #make predictions
    comments = posModel.transform(comments).\
        drop("body", "link_id", "clean_id", "ngrams","rawPrediction", "probability").\
        withColumnRenamed("prediction", "poslabel")
    comments = negModel.transform(comments).drop("features", "rawPrediction", "probability").\
        withColumnRenamed("prediction", "neglabel")

    # TASK 10.1
    # compute the percentage of positive, negative comments
    #print("Percentage of positive comments")
    result = comments.select('poslabel').groupBy().avg()
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("pos-perc.csv")
    #print("Percenetage of negative comments")
    result = comments.select('neglabel').groupBy().avg()
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("neg-perc.csv")

    # TASK 10.2
    #2. by date
    comments = comments.withColumn(
        "date", from_unixtime(comments.created_utc, "YYYY-MM-dd"))
    result = comments.groupBy("date").agg({
        "poslabel": "mean",
        "neglabel": "mean"
    })
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("time_data.csv")

    # TASK 10.3
    #3. by state
    val_state_udf = udf(lambda state: state if state in states else None,
                        StringType())
    comments = comments.withColumn(
        "state", val_state_udf(lower(comments.author_flair_text)))
    comments = comments.filter(comments.state.isNotNull())
    result = comments.groupBy("state").agg({
        "poslabel": "mean",
        "neglabel": "mean"
    })
    result.show(truncate=False)
    #print(result.count())
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("state_data.csv")

    # TASK 10.4
    #4a. by comment score
    result = comments.groupBy("commentscore").agg({
        "poslabel": "mean",
        "neglabel": "mean"
    })
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("comment_score.csv")
    #4b. by story score
    result = comments.groupBy("storyscore").agg({
        "poslabel": "mean",
        "neglabel": "mean"
    })
    result.repartition(1).write.format("com.databricks.spark.csv").\
        option("header","true").save("story_score.csv")

    # DELIVERABLE 4.
    story = result.orderBy('avg(poslabel)', ascending=False).limit(10)
    # join is too expensive, subquery is also expensive
    score_list = set(story.select('storyscore').toPandas()['storyscore'])
    comments[comments.storyscore.isin(score_list)].select(
        'storyscore', 'title').limit(20).show(truncate=False)

    story = result.orderBy('avg(neglabel)', ascending=False).limit(10)
    score_list = set(story.select('storyscore').toPandas()['storyscore'])
    comments[comments.storyscore.isin(score_list)].select(
        'storyscore', 'title').limit(20).show(truncate=False)
    print("===========VectorAssembler====================")
    feature = df.columns[1:len(df.columns)-1]
    lable_name = df.columns[-1]
    print(lable_name)
    assembler = VectorAssembler(inputCols=feature, outputCol="features")
    
    print("=============pipeline==================")
    model = LogisticRegression(regParam=0.1, labelCol=lable_name, featuresCol="features" , family ='binomial')
    pipeline = Pipeline(stages=[assembler,model])
    pipeline.getStages()
    
    print("===========TaintingAndTesting====================")
    pipelineModel = pipeline.fit(train_df)
    predicted=pipelineModel.transform(test_df)
    
    print("===========PredictedAUC====================")
    evaluator = BinaryClassificationEvaluator(
                              rawPredictionCol="rawPrediction",
                              labelCol= lable_name,  
                              metricName="areaUnderROC"  )
    auc= evaluator.evaluate(predicted)
    print(auc)
    
    print("===========PredictedScore====================")
    Multi_evaluator = MulticlassClassificationEvaluator(labelCol=lable_name)
    Accuracy = Multi_evaluator.evaluate(predicted, {Multi_evaluator.metricName: "accuracy"})
    Precision = Multi_evaluator.evaluate(predicted, {Multi_evaluator.metricName: "weightedPrecision"})
    Recall = Multi_evaluator.evaluate(predicted, {Multi_evaluator.metricName: "weightedRecall"})
    F1 = Multi_evaluator.evaluate(predicted, {Multi_evaluator.metricName: "f1"})

    print("Accuracy",Accuracy,"Precision",Precision,"Recall",Recall,"F1",F1)
Beispiel #27
0
# Evaluate model based on auc ROC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Evaluate model based on F1 score
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Evaluate model based on confusion matrix
from pyspark.mllib.evaluation import MulticlassMetrics

# Fit the model on training data; regParam: lasso (L1) regularisation parameter
lrModel = LogisticRegression().fit(trainData)

# make prediction on test data
pred = lrModel.transform(testData)

pred.select('catLabel', 'label', 'prediction').show()

evaluator1 = BinaryClassificationEvaluator(labelCol='label',
                                           metricName="areaUnderROC")
evaluator2 = MulticlassClassificationEvaluator(labelCol='label',
                                               metricName="f1")
metrics = MulticlassMetrics(pred.select('label', 'prediction').rdd.map(tuple))

print('AUC ROC of Logistic Regression model is %f' % evaluator1.evaluate(pred))
print('F1 score of Logistic Regression model is %f' %
      evaluator2.evaluate(pred))
metrics.confusionMatrix().toArray().transpose()

# <a id="context322"></a>
# #### 3.2.2. Decision Tree

# In[20]:

from pyspark.ml.classification import DecisionTreeClassifier
Beispiel #28
0
    return crossval


from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

evaluatorB = BinaryClassificationEvaluator(labelCol="labels",
                                           rawPredictionCol="prediction",
                                           metricName="areaUnderROC")
evaluatorM = MulticlassClassificationEvaluator(labelCol="labels",
                                               predictionCol="prediction",
                                               metricName="accuracy")

dt = DecisionTreeClassifier(labelCol="labels", featuresCol="features")
pipeline = Pipeline(stages=[dt])

paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10]) \
    .addGrid(dt.impurity, ['gini', 'entropy'])\
    .build()

print("Decision Tree Classifier, Metric: Area Under ROC")
crossval = metrics(pipeline, paramGrid, evaluatorB)
    print("Regularization rate: {}".format(reg))
    # create a bunch of child runs
    with root_run.child_run("reg-" + str(reg)) as run:
        # create a new Logistic Regression model.
        
        lr = LogisticRegression(regParam=reg)
        
        # put together the pipeline
        pipe = Pipeline(stages=[lr])

        # train the model
        model_pipeline = pipe.fit(trainingData)
        predictions = model_pipeline.transform(testData)

        # evaluate. note only 2 metrics are supported out of the box by Spark ML.
        bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
        au_roc = bce.setMetricName('areaUnderROC').evaluate(predictions)
        au_prc = bce.setMetricName('areaUnderPR').evaluate(predictions)
        truePositive = predictions.select("label").filter("label = 1 and prediction = 1").count()
        falsePositive = predictions.select("label").filter("label = 0 and prediction = 1").count()
        trueNegative = predictions.select("label").filter("label = 0 and prediction = 0").count()
        falseNegative = predictions.select("label").filter("label = 1 and prediction = 0").count()

        # log reg, au_roc, au_prc and feature names in run history
        run.log("reg", reg)
        run.log("au_roc", au_roc)
        run.log("au_prc", au_prc)
        
        print("Area under ROC: {}".format(au_roc))
        print("Area Under PR: {}".format(au_prc))
       
Beispiel #30
0
tree_model = tree_classifier.fit(training_data)
predictions = tree_model.transform(test_data)
#print(tree_model.toDebugString)
test_error = predictions.filter(
    predictions["prediction"] != predictions["Accident_Severity"]).count(
    ) / float(test_data.count())
print "Testing error: {0:.4f}".format(test_error)
# Select example rows to display.
predictions.select("prediction", "Accident_Severity", "features").show(5)
# Decision tree model
print(tree_model.toDebugString)
# Decision tree evaluation
evaluatorMulti = MulticlassClassificationEvaluator(
    labelCol="Accident_Severity", predictionCol="prediction")
evaluator = BinaryClassificationEvaluator(labelCol="Accident_Severity",
                                          rawPredictionCol="prediction",
                                          metricName='areaUnderROC')
acc = evaluatorMulti.evaluate(predictions,
                              {evaluatorMulti.metricName: "accuracy"})
f1 = evaluatorMulti.evaluate(predictions, {evaluatorMulti.metricName: "f1"})
Precision = evaluatorMulti.evaluate(
    predictions, {evaluatorMulti.metricName: "weightedPrecision"})
Recall = evaluatorMulti.evaluate(predictions,
                                 {evaluatorMulti.metricName: "weightedRecall"})
auc = evaluator.evaluate(predictions)
print('Accuracy score: ', acc)
print('f1: ', f1)
print('Precision: ', Precision)
print('Recall: ', Recall)
print('Auc: ', auc)
# Contingency table
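# A sketch, assuming DataFrame.crosstab on the prediction and label columns:
predictions.crosstab('prediction', 'Accident_Severity').show()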