"rawPrediction", "probability") selected.show(1) selected.printSchema #for row in selected.collect(): # rid, actual, prob, prediction = row # print((rid, actual, prob, prediction)) from pyspark.ml.evaluation import BinaryClassificationEvaluator import pyspark.sql.functions as F import pyspark.sql.types as T prob_extract = F.udf(lambda x: float(x[1]), T.FloatType()) #print(prediction.withColumn("prob1",prob_extract("probability")).select("prob1","prediction").show()) evaluator = BinaryClassificationEvaluator( rawPredictionCol='rawPrediction', metricName="areaUnderROC", labelCol='default_payment_next_month') print('Evaluator areaUnderROC: ' + str(evaluator.evaluate(prediction))) # 0.7294563666075892 evaluator = BinaryClassificationEvaluator( rawPredictionCol='rawPrediction', metricName="areaUnderPR", labelCol='default_payment_next_month') print('Evaluator areaUnderPR : ' + str(evaluator.evaluate(prediction))) # 0.7294563666075892 prediction.groupBy('default_payment_next_month', 'prediction').count().show() # Metrics predictionRDD = prediction.select(['label', 'prediction']) \
] label_stringIdx = [StringIndexer(inputCol="label", outputCol="labels")] selector = [ ChiSqSelector(numTopFeatures=2**14, featuresCol='rawFeatures', outputCol="features") ] lr = [LogisticRegression(maxIter=1000)] return Pipeline(stages=tokenizer + remover + ngrams + cv + idf + assembler + label_stringIdx + selector + lr) #saving pipeline steps of execute pipeline_load = PipelineModel.load("pipeLineModel") predictions = pipeline_load.transform( test_set) #put dataframe for testing here int(predictions.collect()[-1]['prediction']) #prediction #finding the accuracy of the model. accuracy = predictions.filter( predictions.label == predictions.prediction).count() / float( test_set.count()) evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction") roc_auc = evaluator.evaluate(predictions) print("Accuracy Score: ", accuracy) print("ROC-AUC: {0:.4f}", roc_auc) #loading pipeline and predicting the accuracy of new data. predictions = pipeline_load.transform(ddf) int(predictions.collect()[-1]['prediction'])
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)  # use 3+ folds in practice

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(training)

# Prepare test documents, which are unlabeled.
# The ids are plain ints: the Python 2 only `L` suffix is a SyntaxError on
# Python 3 and is not needed on Python 2 either.
Document = Row("id", "text")
test = sc.parallelize([(4, "spark i j k"),
                       (5, "l m n"),
                       (6, "mapreduce spark"),
                       (7, "apache hadoop")]) \
    .map(lambda x: Document(*x)).toDF()

# Make predictions on test documents. cvModel uses the best model found (lrModel).
prediction = cvModel.transform(test)
# MAGIC since this is a binary classification problem, we define a `BinaryClassificationEvaluator` evaluator.
# MAGIC
# MAGIC The default metrics are
# MAGIC * Area under the precision-recall curve and
# MAGIC * Area under the receiver operating characteristic (ROC) curve
# MAGIC
# MAGIC For more information see:
# MAGIC * Scala: <a href="https://spark.apache.org/docs/latest/api/scala/#org.apache.spark.ml.evaluation.BinaryClassificationEvaluator" target="_blank">BinaryClassificationEvaluator</a>
# MAGIC * Python: <a href="https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.evaluation.BinaryClassificationEvaluator" target="_blank">BinaryClassificationEvaluator</a>

# COMMAND ----------

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# The evaluator compares the "label" column against the "prediction" column.
# NOTE(review): "prediction" is passed as the raw-prediction column; confirm
# that this is intended rather than the classifier's raw scores.
evaluator = BinaryClassificationEvaluator(
    labelCol="label",
    rawPredictionCol="prediction",
)

# COMMAND ----------

# MAGIC %md ## Define CrossValidator
# MAGIC
# MAGIC for best model selection and makes sure that there's no overfitting.

# COMMAND ----------

# Cross-validate `pipeline` over `paramGrid`, scoring each candidate with the
# evaluator defined above; `numFolds` is defined elsewhere in the notebook.
cv = CrossValidator(
    estimator=pipeline,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    numFolds=numFolds,
)
def testModel(model, validate = validate):
    """Score a fitted model on a validation DataFrame.

    Args:
        model: Fitted transformer whose ``transform`` output is evaluated.
        validate: DataFrame to score. Defaults to the module-level
            ``validate`` object as it was bound when this function was defined.

    Returns:
        The metric computed by a ``BinaryClassificationEvaluator`` that uses
        'index' as its label column and its default metric.
    """
    scored = model.transform(validate)
    return BinaryClassificationEvaluator(labelCol = 'index').evaluate(scored)
def main(sqlContext):
    """Main function takes a Spark SQL context.

    Runs the whole pipeline in order:

    1. Load comments and submissions (parquet cache first, JSON fallback).
    2. Join the hand-labeled rows with their comment bodies.
    3. Tokenize bodies with ``sanitize`` and vectorize them with a
       ``CountVectorizer``.
    4. Train, save and reload one cross-validated logistic regression for the
       positive label and one for the negative label.
    5. Score every comment joined to its submission and write five aggregate
       CSV files (``query_1.csv`` ... ``query_4s.csv``) to the working
       directory.

    Args:
        sqlContext: Spark SQL context used for reading the inputs and for
            registering the SQL UDFs.

    Returns:
        None. The results are the saved models under ``project2/`` and the
        CSV files.
    """
    spark = SparkSession(SparkContext.getOrCreate())

    # Task 1
    # Prefer the parquet cache; fall back to the raw JSON dumps and (re)build
    # the cache. `except Exception` keeps KeyboardInterrupt/SystemExit
    # propagating, which the previous bare `except:` swallowed.
    try:
        comments = sqlContext.read.parquet("comments.parquet")
        submissions = sqlContext.read.parquet("submissions.parquet")
    except Exception:
        comments = sqlContext.read.json("comments-minimal.json.bz2")
        submissions = sqlContext.read.json("submissions.json.bz2")
        # Overwrite so that a half-written cache (only one of the two parquet
        # directories present) does not abort the run with "path exists".
        comments.write.mode("overwrite").parquet("comments.parquet")
        submissions.write.mode("overwrite").parquet("submissions.parquet")

    # Task 2
    # join on labeled_data.Input_id and comments.id
    labeled_data = sqlContext.read.format("csv").option(
        "header", "true").load("labeled_data.csv")
    labeled_data.createOrReplaceTempView("labeled_data")
    comments.createOrReplaceTempView("comments")
    sqlDF = spark.sql(
        "SELECT l.id as id, l.body, r.labeldem as Dem,r.labelgop as GOP,"
        "r.labeldjt as Trump FROM labeled_data as r INNER JOIN comments as l "
        "ON r.Input_id = l.id ")

    # Task 3
    # Task 4 & Task 5
    sqlDF.createOrReplaceTempView("sqlDF")

    def parse(z):
        """Return the sanitized strings followed by their individual tokens.

        The first element returned by ``sanitize`` is skipped. The remaining
        strings are emitted as-is and then again split on single spaces.
        """
        grams = sanitize(z)[1:]
        tokens = [tok for gram in grams for tok in gram.split(" ")]
        return list(grams) + tokens

    sqlContext.registerFunction("parser", parse, ArrayType(StringType()))
    parsedTable = spark.sql(
        "SELECT id, body, Trump, parser(body) as parsed FROM sqlDF")
    parsedTable.createOrReplaceTempView("parsedTable")

    # Task 6a
    # The fitted vocabulary model is reused in Task 9 so that unseen comments
    # are vectorized the same way as the training data.
    vectorizer = CountVectorizer(inputCol="parsed",
                                 outputCol="vectors",
                                 minDF=10.0)
    model = vectorizer.fit(parsedTable)
    parsedVectorTable = model.transform(parsedTable)
    parsedVectorTable.createOrReplaceTempView("parsedVectorTable")

    # Task 6b
    # Turn the Trump label (1 / -1 / other) into two binary targets.
    resTable = spark.sql(
        "SELECT id, body, Trump, vectors, CASE WHEN Trump=1 THEN 1 "
        "ELSE 0 END AS positive, CASE WHEN Trump=-1 THEN 1 ELSE 0 END AS "
        "negative FROM parsedVectorTable")

    # TASK 7
    # Two independent logistic regressions: one per binary target.
    poslr = LogisticRegression(labelCol="label",
                               featuresCol="vectors",
                               maxIter=10)
    neglr = LogisticRegression(labelCol="label",
                               featuresCol="vectors",
                               maxIter=10)
    # This is a binary classifier so we need an evaluator that knows how to
    # deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # Grid search is slow, so only regParam=1.0 is tried; extend the list to
    # search over more values.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)

    # Although crossvalidation creates its own train/test sets for tuning, a
    # 50/50 split is still made; only the training halves are used below.
    pos_labeled = resTable.select("positive", "vectors") \
        .withColumnRenamed("positive", "label")
    neg_labeled = resTable.select("negative", "vectors") \
        .withColumnRenamed("negative", "label")
    posTrain, _ = pos_labeled.randomSplit([0.5, 0.5])
    negTrain, _ = neg_labeled.randomSplit([0.5, 0.5])

    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Once we train the models, we don't want to do it again. We can save the
    # models and load them again later.
    posModel.save("project2/pos.model")
    negModel.save("project2/neg.model")

    # Task 8
    print('task 8 started')

    def strip_t3(s):
        """Drop the first three characters of a link id."""
        return s[3:]

    sqlContext.registerFunction("strip_t3", strip_t3, StringType())
    comments.createOrReplaceTempView("comments")
    submissions.createOrReplaceTempView("submissions")
    joined_data = spark.sql(
        "SELECT c.created_utc as created_time, s.title as post_title, "
        "c.author_flair_text as com_state, c.body as body, "
        "c.id as comment_id, s.id as submission_id, s.score as s_score, "
        "c.score as c_score FROM comments as c INNER JOIN submissions as s "
        "ON strip_t3(c.link_id) = s.id")
    joined_data.createOrReplaceTempView("joined_data")

    # Task 9
    # Skip comments whose body starts with '>' or contains '/s'.
    print('task 9 started')
    dataframe_task9 = spark.sql(
        "SELECT created_time, post_title, com_state, parser(body) as parsed, "
        "comment_id, submission_id, c_score, s_score FROM joined_data "
        "WHERE body NOT LIKE '>%' AND body NOT LIKE '%/s%'")
    dataframe_task9.createOrReplaceTempView("dataframe_task9")

    cv_result = model.transform(dataframe_task9)
    pos_model = CrossValidatorModel.load('project2/pos.model')
    neg_model = CrossValidatorModel.load('project2/neg.model')
    pos_scored = pos_model.transform(cv_result)
    pos_scored.createOrReplaceTempView('pos')

    def posProbUDF(z):
        """Flag a row as positive when element 1 of the vector exceeds 0.2."""
        return 1 if z[1] > 0.2 else 0

    def negProbUDF(z):
        """Flag a row as negative when element 1 of the vector exceeds 0.25."""
        return 1 if z[1] > 0.25 else 0

    posProb = udf(posProbUDF, IntegerType())
    negProb = udf(negProbUDF, IntegerType())

    # Keep only the needed columns and rename the positive model's outputs so
    # the negative model can add its own probability/prediction columns.
    pos_scored = pos_scored.select(
        col('com_state'), col('vectors'), col('submission_id'),
        col('created_time'), col('s_score'), col('c_score'),
        posProb("probability").alias("pos_probability"),
        col("prediction").alias('pos_pred'))
    all_results = neg_model.transform(pos_scored)
    total_result = all_results.select(
        col('com_state'), col('vectors'), col('submission_id'),
        col('created_time'), col('s_score'), col('pos_probability'),
        col('c_score'),
        negProb("probability").alias("neg_probability"), col("prediction"))

    # Task 10
    # Built once and captured by the UDF closure instead of being rebuilt on
    # every call; a frozenset makes the membership test O(1).
    states = frozenset([
        'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
        'Connecticut', 'Delaware', 'District of Columbia', 'Florida',
        'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
        'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',
        'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
        'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
        'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
        'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina',
        'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
        'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'
    ])

    def getState(input_flair):
        """Return the flair if it is a known state name, else 'not_state'."""
        flair = str(input_flair)
        return flair if flair in states else 'not_state'

    get_state = udf(getState, StringType())
    total_result = total_result.select('*',
                                       get_state('com_state').alias('state'))
    total_result.createOrReplaceTempView('final_results')

    # Average the positive/negative flags per submission, day, state and score.
    query_1 = spark.sql(
        "SELECT submission_id, AVG(pos_probability) as pos_prob, "
        "AVG(neg_probability) as neg_prob FROM final_results "
        "GROUP BY submission_id")
    print('q1')
    query_2 = spark.sql(
        "SELECT date(from_unixtime(created_time)) as date, "
        "AVG(pos_probability) as pos_prob, AVG(neg_probability) as neg_prob "
        "FROM final_results GROUP BY date")
    print('q2')
    query_3 = spark.sql(
        "SELECT state, AVG(pos_probability) as pos_prob, "
        "AVG(neg_probability) as neg_prob FROM final_results "
        "WHERE state !='not_state' GROUP BY state")
    query_4c = spark.sql(
        "SELECT c_score, AVG(pos_probability) as pos_prob, "
        "AVG(neg_probability) as neg_prob FROM final_results "
        "GROUP BY c_score")
    query_4s = spark.sql(
        "SELECT s_score, AVG(pos_probability) as pos_prob, "
        "AVG(neg_probability) as neg_prob FROM final_results "
        "GROUP BY s_score")

    exports = (
        ("query_1.csv", query_1),
        ("query_2.csv", query_2),
        ("query_3.csv", query_3),
        ("query_4c.csv", query_4c),
        ("query_4s.csv", query_4s),
    )
    for csv_path, frame in exports:
        frame.toPandas().to_csv(csv_path)
# ### Grid search with cross-validation
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

RFclassifier = RandomForestClassifier(
    labelCol='label',
    featuresCol='features',
    impurity=param_impurity,
)
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, RFclassifier])

# ### Metric by which each candidate model is evaluated
evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')

# ### Configurations to evaluate: every maxDepth x numTrees combination
paramGrid = (
    ParamGridBuilder()
    .addGrid(RFclassifier.maxDepth, param_maxDepth)
    .addGrid(RFclassifier.numTrees, param_numTrees)
    .build()
)

crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=2,
    parallelism=3,  # number of models run in parallel
)

# ### Fit the model (note: the result wraps the best model found)
cvModel = crossval.fit(trainingData)

# ### Show the average metric of each configuration
print(cvModel.avgMetrics)

# # Evaluation of model performance on validation dataset
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Preview the feature vectors the hasher produces for the training set.
hasher.transform(df_train).select("features").show()

# Logistic regression with no regularization (both penalties set to zero).
classifier = LogisticRegression(
    maxIter=20,
    regParam=0.000,
    elasticNetParam=0.000,
)
stages = [hasher, classifier]
pipeline = Pipeline(stages=stages)

# Fit on the training data, then score the held-out data.
model = pipeline.fit(df_train)
predictions = model.transform(df_test)
predictions.cache()

# Report the area under the ROC curve on the test predictions.
ev = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
print(ev.evaluate(predictions))

spark.stop()

# In[ ]:
testData.show()

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

#Create the model
Classifer = LogisticRegression(regParam=0.0,labelCol="label",\
                               featuresCol="features")
Model = Classifer.fit(trainingData)

#Predict on the test data
predictions = Model.transform(testData)
predictions.select("prediction","label").show()

#Evaluate the test predictions. No metricName is set, so the evaluator's
#default metric is used; this is not plain accuracy.
# NOTE(review): "prediction" is passed as the raw-prediction column; confirm
# that scoring the hard predictions rather than "rawPrediction" is intended.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", \
                                          labelCol="label")
evaluator.evaluate(predictions)

#to check with the overfitting problem: score the training data and show
#those predictions (previously the test predictions were shown again)
predictions_train = Model.transform(trainingData)
predictions_train.select("prediction","label").show()

#Draw a confusion matrix
predictions.groupBy("label","prediction").count().show()

###################################### INSULT as the output
#Split into training and testing data
(trainingData, testData) = INSULTDf.randomSplit([0.75, 0.25])
trainingData.count()
testData.count()
# Chain every preprocessing step and the classifier into a single pipeline.
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])

# Grid search: 3 feature sizes x 2 regularization strengths x 3 elastic-net
# mixing values.
paramGrid = (
    ParamGridBuilder()
    .addGrid(hashingTF.numFeatures, [1000, 5000, 10000])
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.elasticNetParam, [0.0, 0.3, 0.6])
    .build()
)

# Candidates are ranked by area under the precision-recall curve;
# 80% of the data is used for training and 20% for validation.
pr_evaluator = BinaryClassificationEvaluator().setMetricName('areaUnderPR')
tvs = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=pr_evaluator,
    trainRatio=0.8,
)

# Run TrainValidationSplit and keep the best parameter set.
model = tvs.fit(train_set)

# Predictions for both splits.
train_prediction = model.transform(train_set)
test_prediction = model.transform(test_set)

# Number of test rows whose prediction equals the label (the numerator of
# the accuracy score for the best model).
is_correct = test_prediction.label == test_prediction.prediction
correct = test_prediction.filter(is_correct).count()
return cvModel def model_test(cvModel, df_test) """ Returning sparkify trained datasets and test it against test datasets with best parameters Parameters ----------- cvModel: Cross validator model df_test: DataFrame returns ------- results: dataframe """ best_model = cv_model.bestModel results = best_model.transform(df_test) evaluatorb = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="prediction") evaluatorm = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction") print(title) print('area under ROC: %f' % evaluatorb.evaluate(results, {evaluatorb.metricName: "areaUnderROC"})) print('area under PR: %f' % evaluatorb.evaluate(results, {evaluatorb.metricName: "areaUnderPR"})) print('Accuracy: %f' % evaluatorm.evaluate(results, {evaluatorm.metricName: "accuracy"})) print('F-1 Score: %f' % evaluatorm.evaluate(results, {evaluatorm.metricName: "f1"})) print('wPrecision: %f' % evaluatorm.evaluate(results, {evaluatorm.metricName: "weightedPrecision"})) print('wRecall: %f' % evaluatorm.evaluate(results, {evaluatorm.metricName: "weightedRecall"})) return results def ExtractFeature(featureImp, dataset, featuresCol): """ Returning a dataframe that consists of features weight according to the trained sets and results from test set
def SparkML(train_df, test_df=None, featuresCol='features', labelCol='label', binaryclass=False, multiclass=False, n_cluster=2, userCol='user', itemCol='item', ratingCol='rating', rank=10, userid=3, itemid=3, itemsCol='items', minSupport=0.3, minConfidence=0.8, stringIndexer=False, inputColStringIndexer=None, outputColStringIndexer=None, oneHotEncoder=False, inputColOneHotEncoder=None, outputColOneHotEncoder=None, vectorAssembler=False, inputColsVectorAssembler=None, outputColsVectorAssembler=None, vectorIndexer=False, inputColsVectorIndexer=None, outputColsVectorIndexer=None, maxCategories=None, classification=False, logisticregression=False, decisiontreeclassifier=False, linearsvc=False, naivebayes=False, randomforestclassifier=False, gbtclassifier=False, regression=False, linearregression=True, decisiontreeregressor=False, randomforestregressor=False, gbtregressor=False, clustering=False, kmeans=False, gaussianmixture=False, lda=False, recommendation=False, als=False, association=False, fpgrowth=False): if classification: if logisticregression: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) LRClassifier = LogisticRegression(featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', probabilityCol='Probability', rawPredictionCol='RawPrediction', standardization=True, maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-06, fitIntercept=True, threshold=0.5) paramGrid = ParamGridBuilder().addGrid( LRClassifier.maxIter, [10, 20, 50, 100, 200, 
300, 500, 1000, 2000, 5000]).addGrid( LRClassifier.regParam, [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0 ]).build() if binaryclass: evaluator = BinaryClassificationEvaluator( rawPredictionCol="RawPrediction", labelCol=labelCol, metricName="areaUnderROC") if multiclass: evaluator = MulticlassClassificationEvaluator( labelCol=labelCol, predictionCol="Prediction", metricName="accuracy") LRCV = CrossValidator(estimator=LRClassifier, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(LRCV) LRC_Pipeline = Pipeline(stages=stagesList) LRC_PipelineModel = LRC_Pipeline.fit(train_df) LRC_Predicted = LRC_PipelineModel.transform(test_df) LRC_BestModel = LRC_PipelineModel.stages[-1].bestModel LRC_Probability = LRC_Predicted.select("Probability").toPandas() LRC_Prediction = LRC_Predicted.select("Prediction").toPandas() LRC_Score = evaluator.evaluate(LRC_Predicted) return LRC_BestModel, LRC_Predicted, LRC_Probability, LRC_Prediction, LRC_Score if decisiontreeclassifier: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) DTClassifier = DecisionTreeClassifier( featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', probabilityCol='Probability', rawPredictionCol='RawPrediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, impurity='gini', seed=None) paramGrid = ParamGridBuilder().addGrid( DTClassifier.impurity, ["gini", "entropy"]).addGrid( DTClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid( 
DTClassifier.maxBins, [3, 5, 10, 50, 100, 200]).build() if binaryclass: evaluator = BinaryClassificationEvaluator( rawPredictionCol="RawPrediction", labelCol=labelCol, metricName="areaUnderROC") if multiclass: evaluator = MulticlassClassificationEvaluator( labelCol=labelCol, predictionCol="Prediction", metricName="accuracy") DTCV = CrossValidator(estimator=DTClassifier, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(DTCV) DTC_Pipeline = Pipeline(stages=stagesList) DTC_PipelineModel = DTC_Pipeline.fit(train_df) DTC_Predicted = DTC_PipelineModel.transform(test_df) DTC_BestModel = DTC_PipelineModel.stages[-1].bestModel DTC_Probability = DTC_Predicted.select("Probability").toPandas() DTC_Prediction = DTC_Predicted.select("Prediction").toPandas() DTC_Score = evaluator.evaluate(DTC_Predicted) return DTC_BestModel, DTC_Predicted, DTC_Probability, DTC_Prediction, DTC_Score if linearsvc: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) SVClassifier = LinearSVC(featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', rawPredictionCol='RawPrediction', maxIter=100, regParam=0.0, tol=1e-06, fitIntercept=True, standardization=True, threshold=0.0) paramGrid = ParamGridBuilder().addGrid( SVClassifier.maxIter, [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid( SVClassifier.regParam, [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0 ]).build() if binaryclass: evaluator = BinaryClassificationEvaluator( 
rawPredictionCol="RawPrediction", labelCol=labelCol, metricName="areaUnderROC") if multiclass: evaluator = MulticlassClassificationEvaluator( labelCol=labelCol, predictionCol="Prediction", metricName="accuracy") SVCV = CrossValidator(estimator=SVClassifier, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(SVCV) SVC_Pipeline = Pipeline(stages=stagesList) SVC_PipelineModel = SVC_Pipeline.fit(train_df) SVC_Predicted = SVC_PipelineModel.transform(test_df) SVC_BestModel = SVC_PipelineModel.stages[-1].bestModel SVC_Prediction = SVC_Predicted.select("Prediction").toPandas() SVC_Score = evaluator.evaluate(SVC_Predicted) return SVC_BestModel, SVC_Predicted, SVC_Prediction, SVC_Score if naivebayes: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) NBClassifier = NaiveBayes(featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', probabilityCol='Probability', rawPredictionCol='RawPrediction', smoothing=1.0, modelType='multinomial', thresholds=None) paramGrid = ParamGridBuilder().addGrid( NBClassifier.smoothing, [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]).build() if binaryclass: evaluator = BinaryClassificationEvaluator( rawPredictionCol="RawPrediction", labelCol=labelCol, metricName="areaUnderROC") if multiclass: evaluator = MulticlassClassificationEvaluator( labelCol=labelCol, predictionCol="Prediction", metricName="accuracy") NBCV = CrossValidator(estimator=NBClassifier, evaluator=evaluator, estimatorParamMaps=paramGrid, 
numFolds=10) stagesList.append(NBCV) NBC_Pipeline = Pipeline(stages=stagesList) NBC_PipelineModel = NBC_Pipeline.fit(train_df) NBC_Predicted = NBC_PipelineModel.transform(test_df) NBC_BestModel = NBC_PipelineModel.stages[-1].bestModel NBC_Probability = NBC_Predicted.select("Probability").toPandas() NBC_Prediction = NBC_Predicted.select("Prediction").toPandas() NBC_Score = evaluator.evaluate(NBC_Predicted) return NBC_BestModel, NBC_Predicted, NBC_Probability, NBC_Prediction, NBC_Score if randomforestclassifier: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) RFClassifier = RandomForestClassifier( featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', probabilityCol='Probability', rawPredictionCol='RawPrediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, impurity='gini', numTrees=20, featureSubsetStrategy='auto', seed=None, subsamplingRate=1.0) paramGrid = ParamGridBuilder().addGrid( RFClassifier.impurity, ["gini", "entropy"]).addGrid( RFClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid( RFClassifier.maxBins, [3, 5, 10, 50, 100, 200]).addGrid( RFClassifier.numTrees, [5, 10, 20, 50, 100, 200]).addGrid( RFClassifier.subsamplingRate, [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build() if binaryclass: evaluator = BinaryClassificationEvaluator( rawPredictionCol="RawPrediction", labelCol=labelCol, metricName="areaUnderROC") if multiclass: evaluator = MulticlassClassificationEvaluator( labelCol=labelCol, predictionCol="Prediction", 
metricName="accuracy") RFCV = CrossValidator(estimator=RFClassifier, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(RFCV) RFC_Pipeline = Pipeline(stages=stagesList) RFC_PipelineModel = RFC_Pipeline.fit(train_df) RFC_Predicted = RFC_PipelineModel.transform(test_df) RFC_BestModel = RFC_PipelineModel.stages[-1].bestModel RFC_Probability = RFC_Predicted.select("Probability").toPandas() RFC_Prediction = RFC_Predicted.select("Prediction").toPandas() RFC_Score = evaluator.evaluate(RFC_Predicted) return RFC_BestModel, RFC_Predicted, RFC_Probability, RFC_Prediction, RFC_Score if gbtclassifier: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) GBClassifier = GBTClassifier(featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, lossType='logistic', maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0) paramGrid = ParamGridBuilder().addGrid( GBClassifier.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid( GBClassifier.maxBins, [3, 5, 10, 50, 100, 200]).addGrid( GBClassifier.maxIter, [5, 10, 20, 50, 100, 200]).addGrid( GBClassifier.stepSize, [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).addGrid( GBClassifier.subsamplingRate, [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build() evaluator = MulticlassClassificationEvaluator( labelCol=labelCol, predictionCol="Prediction", metricName="accuracy") GBCV = CrossValidator(estimator=GBClassifier, evaluator=evaluator, estimatorParamMaps=paramGrid, 
numFolds=10) stagesList.append(GBCV) GBC_Pipeline = Pipeline(stages=stagesList) GBC_PipelineModel = GBC_Pipeline.fit(train_df) GBC_Predicted = GBC_PipelineModel.transform(test_df) GBC_BestModel = GBC_PipelineModel.stages[-1].bestModel GBC_Prediction = GBC_Predicted.select("Prediction").toPandas() GBC_Score = evaluator.evaluate(GBC_Predicted) return GBC_BestModel, GBC_Predicted, GBC_Prediction, GBC_Score if regression: if linearregression: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) LRegressor = LinearRegression(featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', standardization=True, fitIntercept=True, loss='squaredError', maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-06, epsilon=1.35) paramGrid = ParamGridBuilder().addGrid( LRegressor.maxIter, [10, 20, 50, 100, 200, 300, 500, 1000, 2000, 5000]).addGrid( LRegressor.regParam, [0.0, 0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0 ]).build() evaluator = RegressionEvaluator(labelCol=labelCol, predictionCol="Prediction", metricName="rmse") LRCV = CrossValidator(estimator=LRegressor, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(LRCV) LR_Pipeline = Pipeline(stages=stagesList) LR_PipelineModel = LR_Pipeline.fit(train_df) LR_Predicted = LR_PipelineModel.transform(test_df) LR_BestModel = LR_PipelineModel.stages[-1].bestModel LR_Prediction = LR_Predicted.select("Prediction").toPandas() LR_Score = evaluator.evaluate(LR_Predicted) return LR_BestModel, 
LR_Predicted, LR_Prediction, LR_Score if decisiontreeregressor: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) DTRegressor = DecisionTreeRegressor(featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, impurity='variance', seed=None, varianceCol=None) paramGrid = ParamGridBuilder().addGrid( DTRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid( DTRegressor.maxBins, [3, 5, 10, 50, 100, 200]).build() evaluator = RegressionEvaluator(labelCol=labelCol, predictionCol="Prediction", metricName="rmse") DTRCV = CrossValidator(estimator=DTRegressor, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(DTRCV) DTR_Pipeline = Pipeline(stages=stagesList) DTR_PipelineModel = DTR_Pipeline.fit(train_df) DTR_Predicted = DTR_PipelineModel.transform(test_df) DTR_BestModel = DTR_PipelineModel.stages[-1].bestModel DTR_Prediction = DTR_Predicted.select("Prediction").toPandas() DTR_Score = evaluator.evaluate(DTR_Predicted) return DTR_BestModel, DTR_Predicted, DTR_Prediction, DTR_Score if randomforestregressor: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, 
outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) RFRegressor = RandomForestRegressor(featuresCol=featuresCol, labelCol=labelCol, predictionCol='Prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, impurity='variance', subsamplingRate=1.0, seed=None, numTrees=20) paramGrid = ParamGridBuilder().addGrid( RFRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid( RFRegressor.maxBins, [3, 5, 10, 50, 100, 200]).addGrid( RFRegressor.numTrees, [5, 10, 20, 50, 100, 200]).addGrid( RFRegressor.subsamplingRate, [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build() evaluator = RegressionEvaluator(labelCol=labelCol, predictionCol="Prediction", metricName="rmse") RFRCV = CrossValidator(estimator=RFRegressor, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(RFRCV) RFR_Pipeline = Pipeline(stages=stagesList) RFR_PipelineModel = RFR_Pipeline.fit(train_df) RFR_Predicted = RFR_PipelineModel.transform(test_df) RFR_BestModel = RFR_PipelineModel.stages[-1].bestModel RFR_Prediction = RFR_Predicted.select("Prediction").toPandas() RFR_Score = evaluator.evaluate(RFR_Predicted) return RFR_BestModel, RFR_Predicted, RFR_Prediction, RFR_Score if gbtregressor: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) GBRegressor = GBTRegressor(featuresCol=featuresCol, labelCol=labelCol, 
predictionCol='Prediction', maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, subsamplingRate=1.0, lossType='squared', maxIter=20, stepSize=0.1, seed=None, impurity='variance') paramGrid = ParamGridBuilder().addGrid( GBRegressor.maxDepth, [3, 5, 10, 15, 20, 25]).addGrid( GBRegressor.maxBins, [3, 5, 10, 50, 100, 200]).addGrid( GBRegressor.maxIter, [5, 10, 20, 50, 100, 200]).addGrid( GBRegressor.stepSize, [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).addGrid( GBRegressor.subsamplingRate, [0.1, 0.2, 0.5, 0.8, 0.9, 1.0]).build() evaluator = RegressionEvaluator(labelCol=labelCol, predictionCol="Prediction", metricName="rmse") GBRCV = CrossValidator(estimator=GBRegressor, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(GBRCV) GBR_Pipeline = Pipeline(stages=stagesList) GBR_PipelineModel = GBR_Pipeline.fit(train_df) GBR_Predicted = GBR_PipelineModel.transform(test_df) GBR_BestModel = GBR_PipelineModel.stages[-1].bestModel GBR_Prediction = GBR_Predicted.select("Prediction").toPandas() GBR_Score = evaluator.evaluate(GBR_Predicted) return GBR_BestModel, GBR_Predicted, GBR_Prediction, GBR_Score if clustering: if kmeans: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) KCluster = KMeans(featuresCol=featuresCol, predictionCol='Prediction', k=n_cluster, initMode='k-means||', initSteps=2, tol=0.0001, maxIter=20, seed=None) paramGrid = ParamGridBuilder().addGrid( KCluster.initSteps, [1, 2, 5, 10, 20, 50, 100]).addGrid( KCluster.maxIter, 
[10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid( KCluster.seed, [i for i in range(1001)]).build() evaluator = ClusteringEvaluator(predictionCol='Prediction', featuresCol=featuresCol, metricName='silhouette') KMCV = CrossValidator(estimator=KCluster, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(KMCV) KMC_Pipeline = Pipeline(stages=stagesList) KMC_PipelineModel = KMC_Pipeline.fit(train_df) KMC_Predicted = KMC_PipelineModel.transform(train_df) KMC_BestModel = KMC_PipelineModel.stages[-1].bestModel KMC_Prediction = KMC_Predicted.select("Prediction").toPandas() KMC_Score = evaluator.evaluate(KMC_Predicted) return KMC_BestModel, KMC_Predicted, KMC_Prediction, KMC_Score if gaussianmixture: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) GMCluster = GaussianMixture(featuresCol=featuresCol, predictionCol='Prediction', probabilityCol='Probability', k=n_cluster, tol=0.01, maxIter=100, seed=None) paramGrid = ParamGridBuilder().addGrid( GMCluster.maxIter, [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid( GMCluster.seed, [i for i in range(1001)]).build() evaluator = ClusteringEvaluator(predictionCol='Prediction', featuresCol=featuresCol, metricName='silhouette') GMCV = CrossValidator(estimator=GMCluster, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(GMCV) GMC_Pipeline = Pipeline(stages=stagesList) GMC_PipelineModel = GMC_Pipeline.fit(train_df) GMC_Predicted = GMC_PipelineModel.transform(train_df) 
GMC_BestModel = GMC_PipelineModel.stages[-1].bestModel GMC_Probability = GMC_Predicted.select("Probability").toPandas() GMC_Prediction = GMC_Predicted.select("Prediction").toPandas() GMC_Score = evaluator.evaluate(GMC_Predicted) return GMC_BestModel, GMC_Predicted, GMC_Probability, GMC_Prediction, GMC_Score if lda: stagesList = FeaturesTransform( stringIndexer=stringIndexer, inputColStringIndexer=inputColStringIndexer, outputColStringIndexer=outputColStringIndexer, oneHotEncoder=oneHotEncoder, inputColOneHotEncoder=inputColOneHotEncoder, outputColOneHotEncoder=outputColOneHotEncoder, vectorAssembler=vectorAssembler, inputColsVectorAssembler=inputColsVectorAssembler, outputColsVectorAssembler=outputColsVectorAssembler, vectorIndexer=vectorIndexer, inputColsVectorIndexer=inputColsVectorIndexer, outputColsVectorIndexer=outputColsVectorIndexer, maxCategories=maxCategories) LDACluster = LDA(featuresCol=featuresCol, maxIter=20, seed=None, k=n_cluster, learningOffset=1024.0, learningDecay=0.51, subsamplingRate=0.05) paramGrid = ParamGridBuilder().addGrid( LDACluster.maxIter, [10, 20, 50, 100, 200, 500, 1000, 2000]).addGrid( LDACluster.seed, [i for i in range(1001)]).addGrid( LDACluster.subsamplingRate, [0.01, 0.05, 0.1, 0.2, 0.5, 1.0]).build() evaluator = ClusteringEvaluator(predictionCol='Prediction', featuresCol=featuresCol, metricName='silhouette') LDACV = CrossValidator(estimator=LDACluster, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=10) stagesList.append(LDACV) LDA_Pipeline = Pipeline(stages=stagesList) LDA_PipelineModel = LDA_Pipeline.fit(train_df) LDA_Predicted = LDA_PipelineModel.transform(train_df) LDA_BestModel = LDA_PipelineModel.stages[-1].bestModel LDA_Topics = LDA_BestModel.describeTopics().toPandas() LDA_Score = evaluator.evaluate(LDA_Predicted) return LDA_BestModel, LDA_Topics, LDA_Score if recommendation: if als: ALSR = ALS(userCol=userCol, itemCol=itemCol, ratingCol=ratingCol, rank=rank, maxIter=10, regParam=0.1, numUserBlocks=10, 
numItemBlocks=10, alpha=1.0, seed=1) ALSR_Model = ALSR.fit(train_df) ALSR_ForUsers = ALSR_Model.recommendForAllUsers(userid=userid) ALSR_ForItems = ALSR_Model.recommendForAllItems(itemid=itemid) return ALSR_Model, ALSR_ForUsers, ALSR_ForItems if association: if fpgrowth: fpg = FPGrowth(minSupport=minSupport, minConfidence=minConfidence, itemsCol=itemsCol, predictionCol='Prediction') fpg_model = fpg.fit(train_df) fpg_freqItemsets = fpg_model.freqItemsets.toPandas() fpg_associationRules = fpg_model.associationRules.toPandas() return fpg_model, fpg_freqItemsets, fpg_associationRules
#accuracy = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='affairs',metricName='accuracy',metricLabel=1).evaluate(lrPredictions)
#weightedPrecision = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='affairs',metricName='weightedPrecision',metricLabel=1).evaluate(lrPredictions)
#weightedRecall = MulticlassClassificationEvaluator(predictionCol='prediction',labelCol='affairs',metricName='weightedRecall',metricLabel=1).evaluate(lrPredictions)

# Classification report: collect the prediction columns to pandas and let
# scikit-learn print the per-class summary.
report = Predictions.select("prediction", "labels", "features",
                            "probability").toPandas()
print(
    classification_report(y_true=report['labels'],
                          y_pred=report['prediction']))

# Confusion-matrix cells, each counted with its own Spark filter.
TP = Predictions.filter(Predictions['prediction'] == 1).filter(
    Predictions['labels'] == 1).count()
FN = Predictions.filter(Predictions['prediction'] == 0).filter(
    Predictions['labels'] == 1).count()
TN = Predictions.filter(Predictions['prediction'] == 0).filter(
    Predictions['labels'] == 0).count()
FP = Predictions.filter(Predictions['prediction'] == 1).filter(
    Predictions['labels'] == 0).count()

# The ratios below use float() so they are true divisions on Python 2 as
# well, and fall back to 0.0 when a denominator is empty instead of raising
# ZeroDivisionError (e.g. when the model never predicts the positive class).

# Precision = TP / (TP + FP)
precision = float(TP) / (TP + FP) if (TP + FP) else 0.0
# Recall = TP / (TP + FN)
recall = float(TP) / (TP + FN) if (TP + FN) else 0.0
# F1 = 2 * precision * recall / (precision + recall)
F1 = ((2 * precision * recall) / (precision + recall)
      if (precision + recall) else 0.0)
# Accuracy = (TP + TN) / (TP + TN + FP + FN)
accuracy = (float(TP + TN) / (TP + TN + FP + FN)
            if (TP + TN + FP + FN) else 0.0)

# BinaryClassificationEvaluator is used with its default metric and default
# raw-prediction column; only the label column is overridden.
auc = BinaryClassificationEvaluator(labelCol='labels').evaluate(Predictions)
print(
    " f1:%1.2f\n accuracy%1.2f\n Precision:%1.2f\n Recall:%1.2f\n auc:%1.2f "
    % (F1, accuracy, precision, recall, auc))
# Index the 'Embarked' string column, then one-hot encode the index.
embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embark_encoder = OneHotEncoder(
    inputCol='EmbarkedIndex', outputCol='EmbarkedVec'
)  # ---> each entry will be converted to a vector A = [1, 0] B = [0, 1]

# Columns assembled into the single 'features' vector used by the model.
new_cols = ['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkedVec']
assembler = VectorAssembler(inputCols=new_cols, outputCol='features')
logreg_titanic = LogisticRegression(featuresCol='features',
                                    labelCol='Survived')

# Indexers must run before their encoders, and the assembler before the model.
pipeline = Pipeline(stages=[
    gender_indexer, embark_indexer, gender_encoder, embark_encoder, assembler,
    logreg_titanic
])

train_data, test_data = final_data.randomSplit([0.7, 0.3])
fit_model = pipeline.fit(train_data)
results = fit_model.transform(test_data)

# Score on the model's continuous 'rawPrediction' output rather than the
# thresholded 0/1 'prediction' column: an ROC built from hard predictions
# has a single operating point and understates the AUC.
evaluate = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                         labelCol='Survived')
results.select(['Survived', 'prediction']).show()
# Despite the name, `acc` holds the evaluator's metric (no metricName is set,
# so it is the evaluator default), not classification accuracy.
acc = evaluate.evaluate(results)
print(acc)
''' '''
def dfEvaluation(predictions):
    """Evaluate *predictions* with a default BinaryClassificationEvaluator.

    The evaluator is built with no arguments, so it uses its default label
    and raw-prediction columns and its default metric. The resulting value,
    multiplied by 100, is printed under the (historical) label "Accuracy".

    NOTE(review): Spark documents the evaluator's default metric as
    areaUnderROC, so the printed number is presumably an AUC percentage, not
    classification accuracy -- confirm before relying on the label.

    Args:
        predictions: DataFrame accepted by
            ``BinaryClassificationEvaluator.evaluate``.
    """
    evaluator = BinaryClassificationEvaluator()
    accuracy = evaluator.evaluate(predictions)
    # Single pre-formatted argument: prints the same text on Python 2 and 3
    # (the former print statement was a SyntaxError on Python 3). The double
    # space reproduces the separator the print statement inserted.
    print("Accuracy :  %s" % (accuracy * 100))
def train(params):
    """Train one decision tree for a hyper-parameter trial and log it to MLflow.

    Args:
        params: mapping with the keys 'impurity', 'max_depth' and 'max_bins';
            the last two are cast to int before use.

    Returns:
        dict with 'loss' set to the negated area under ROC on the test set
        and 'status' set to STATUS_OK.
    """
    with mlflow.start_run():
        impurity = params['impurity']
        max_depth = int(params['max_depth'])
        max_bins = int(params['max_bins'])

        mlflow.log_param('impurity', impurity)
        mlflow.log_param('max_depth', max_depth)
        mlflow.log_param('max_bins', max_bins)

        # Also record the notebook widget values with the run.
        parameters = ['condition', 'num_conditions', 'days']
        for parameter in parameters:
            mlflow.log_param(parameter, dbutils.widgets.get(parameter))

        dt = DecisionTreeClassifier(impurity=impurity,
                                    maxDepth=max_depth,
                                    maxBins=max_bins)
        model = dt.fit(training_encounters)
        mlflow.spark.log_model(model, 'patient-trajectory+PtAge')

        (testing_encounters, _) = featurize_encounters(
            test_patients, string_indicers=string_indicers)
        bce = BinaryClassificationEvaluator()
        test_transformed = model.transform(testing_encounters)
        aroc = bce.evaluate(test_transformed,
                            {bce.metricName: "areaUnderROC"})
        aPR = bce.evaluate(test_transformed, {bce.metricName: "areaUnderPR"})

        # use sklearn to calculate evaluation metrics
        from sklearn.metrics import (accuracy_score, classification_report,
                                     precision_score)

        # Collect labels and predictions in ONE pass so the two columns are
        # guaranteed to be row-aligned; two independent toPandas() calls on
        # an unordered DataFrame give no such guarantee.
        labels_and_preds = test_transformed.select('label',
                                                   'prediction').toPandas()
        y_test = labels_and_preds['label']
        y_pred = labels_and_preds['prediction']

        acc = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)

        # get classification metrics as a dictionary
        class_report = classification_report(y_test, y_pred, output_dict=True)
        # NOTE(review): the report is keyed by the string form of each label;
        # '0' assumes the label column holds integers -- confirm.
        recall_0 = class_report['0']['recall']
        f1_score_0 = class_report['0']['f1-score']

        # log metrics
        mlflow.log_metric("accuracy_score", acc)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall_0", recall_0)
        mlflow.log_metric("f1_score_0", f1_score_0)
        mlflow.log_metric("area_under_ROC", aroc)
        mlflow.log_metric("area_under_PR", aPR)

        return {'loss': -aroc, 'status': STATUS_OK}


# COMMAND ----------

# Count every (prediction, label) combination and tag it with its
# confusion-matrix cell name. cube() also emits sub-total rows with nulls;
# the where() clause removes them.
result = (test_transformed.cube('prediction', 'label').count()
          .where(col('prediction').isNotNull() & col('label').isNotNull())
          .withColumn('param',
                      when((col('prediction') == 1) & (col('label') == 1), 'TP')
                      .when((col('prediction') == 0) & (col('label') == 0), 'TN')
                      .when((col('prediction') == 1) & (col('label') == 0), 'FP')
                      .otherwise('FN'))
          .select('param', 'count').toPandas())

# get confusion matrix values
# Look the cells up by name: Spark does not guarantee the row order of an
# aggregate, and a cell with no rows is simply absent, so positional
# indexing could silently swap values or raise IndexError.
_counts_by_cell = result.groupby('param')['count'].sum()
true_positive = _counts_by_cell.get('TP', 0)
true_negative = _counts_by_cell.get('TN', 0)
false_positive = _counts_by_cell.get('FP', 0)
false_negative = _counts_by_cell.get('FN', 0)
# Step - 2: Transform dataframe to vectorized dataframe
output = assembler.transform(animals).select("features", "eatable", "cyr_name")
output.cache()  # reused below for both fitting and scoring

# Step - 3: Set up the LinearSVC Classifier
trainer = LinearSVC(labelCol="eatable", featuresCol="features")

# Step - 4: Train the model
model = trainer.fit(output)
print("Coefficients: " + str(model.coefficients) + " Intercept: " +
      str(model.intercept))

# The model is applied to the same DataFrame it was fitted on, so the score
# below is a training-set score.
rawPredictions = model.transform(output)
predictions = enrichPredictions(rawPredictions)
predictions.show(100)

# Step - 5: Evaluate prediction
# Use the continuous margin in 'rawPrediction' (LinearSVC's default output
# column) instead of the thresholded 0/1 'prediction' column: an ROC built
# from hard predictions has a single operating point and understates AUC.
evaluator = BinaryClassificationEvaluator(labelCol="eatable",
                                          rawPredictionCol="rawPrediction")

# Step - 6: Calculate ROC AUC
rocAuc = evaluator.evaluate(rawPredictions)
print("ROC_AUC = %g " % rocAuc)

spark.stop()
# LogisticRegression.transform() will only use the 'features' column. predictions = lrModel.transform(testData) predictions.show() # Puedes ver cuantos predijo mal predictions.groupBy('label', 'prediction').count().show() # ----------------------------------------------------------------EVALUACION DEL MODELO---------------------------------------------------------- # We can use BinaryClassificationEvaluator to evaluate our model. # We can set the required column names in rawPredictionCol and labelCol Param and the metric in metricName Param. # Evaluate model from pyspark.ml.evaluation import BinaryClassificationEvaluator evaluator = BinaryClassificationEvaluator(labelCol='label', rawPredictionCol="rawPrediction", metricName='areaUnderROC') evaluator.evaluate(predictions) # Note that the default metric for the BinaryClassificationEvaluator is areaUnderROC print(lr.explainParams()) # Summary del modelo trainingSummary = lrModel.summary trainingSummary.accuracy trainingSummary.areaUnderROC # Graficas roc = trainingSummary.roc.toPandas() plt.figure() plt.plot(roc['FPR'], roc['TPR'],
def randomForest(trainingData,
                 testData,
                 impurity,
                 maxDepth,
                 maxBins,
                 numTrees,
                 enableCrossValidator=False,
                 featuresCol='features',
                 labelCol='label',
                 predictionCol='prediction',
                 probabilityCol='probability',
                 rawPredictionCol='rawPrediction',
                 minInstancesPerNode=1,
                 minInfoGain=0.0,
                 maxMemoryInMB=256,
                 cacheNodeIds=False,
                 checkpointInterval=10,
                 featureSubsetStrategy='auto',
                 seed=None,
                 subsamplingRate=1.0):
    """Fit a RandomForestClassifier and predict on the test records.

    Each record of ``trainingData`` / ``testData`` is indexed positionally:
    element 31 becomes the 'index' column, elements 1..28 the feature vector
    and element 30 the label.

    Args:
        trainingData, testData: objects exposing ``map`` whose result
            supports ``toDF`` (used here like RDDs of indexable records).
        enableCrossValidator: when true the classifier is wrapped in a
            5-fold CrossValidator with an empty parameter grid.
        featuresCol, labelCol, predictionCol, rawPredictionCol: column names;
            they are used consistently for the classifier, the DataFrames
            built here, the evaluator and the returned tuples.
        Remaining arguments are forwarded unchanged to
        RandomForestClassifier.

    Returns:
        RDD of ``(prediction, label, index)`` tuples for the test records.
    """
    print("\nInizio classificazione con RandomForestClassifier")

    # Build the classifier from the supplied (and default) parameters.
    rfc = RandomForestClassifier(featuresCol=featuresCol,
                                 labelCol=labelCol,
                                 predictionCol=predictionCol,
                                 probabilityCol=probabilityCol,
                                 rawPredictionCol=rawPredictionCol,
                                 maxDepth=maxDepth,
                                 maxBins=maxBins,
                                 minInstancesPerNode=minInstancesPerNode,
                                 minInfoGain=minInfoGain,
                                 maxMemoryInMB=maxMemoryInMB,
                                 cacheNodeIds=cacheNodeIds,
                                 checkpointInterval=checkpointInterval,
                                 impurity=impurity,
                                 numTrees=numTrees,
                                 featureSubsetStrategy=featureSubsetStrategy,
                                 seed=seed,
                                 subsamplingRate=subsamplingRate)
    print(" -modello creato")

    if enableCrossValidator:
        # Empty grid: cross-validation only scores the single configuration.
        paramGrid = ParamGridBuilder().build()
        # The evaluator must read the same columns the classifier writes,
        # otherwise non-default column names fail at evaluation time.
        evaluator = BinaryClassificationEvaluator(
            labelCol=labelCol, rawPredictionCol=rawPredictionCol)
        # k-fold cross validation: `estimator` is the classifier being
        # assessed and `numFolds` is k.
        validator = CrossValidator(estimator=rfc,
                                   estimatorParamMaps=paramGrid,
                                   evaluator=evaluator,
                                   numFolds=5)  # use 3+ folds in practice
    else:
        validator = rfc
    print(" -validator creato")

    # Name the columns after the configured parameters so the classifier can
    # find them even when the defaults are overridden.
    training = trainingData.map(
        lambda x: (x[31], Vectors.dense(x[1:29]), x[30])).toDF(
            schema=['index', featuresCol, labelCol]).orderBy('index')

    # Single-stage pipeline: the bare classifier or its CrossValidator.
    pipeline = Pipeline(stages=[validator])
    model = pipeline.fit(training)
    print(" -modello addestrato con la pipeline (" + str(training.count()) +
          " elementi utilizzati come training)")

    test = testData.map(
        lambda x: (x[30], Vectors.dense(x[1:29]), x[31])).toDF(
            schema=[labelCol, featuresCol, 'index']).orderBy('index')

    # (prediction, label, index) -- looked up by column name so the result
    # does not depend on how many output columns the model appends.
    predictionsAndLabels = model.transform(test).rdd.map(
        lambda row: (row[predictionCol], row[labelCol], row['index']))
    print(" -" + str(predictionsAndLabels.count()) + " elementi predetti (" +
          str(test.count()) + " elementi usati come test)")
    return predictionsAndLabels
# Step 9
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Vectorize: assemble the listed columns into one 'features' vector.
# ('education' previously opened with a typographic quote, which is a
# syntax error.)
vecAssembler = VectorAssembler(inputCols=[
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'loan',
    'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous',
    'poutcome'
],
                               outputCol="features")

# Transform data (preview only; the pipeline below re-applies the assembler)
df_train = vecAssembler.transform(train_data)
pd.DataFrame(df_train.take(5), columns=df_train.columns).transpose()

dt = DecisionTreeClassifier(labelCol="deposit", featuresCol="features")
pipeline = Pipeline(stages=[vecAssembler, dt])
model = pipeline.fit(train_data)
predictions = model.transform(test_data)

# Select prediction information
# NOTE(review): the label column is "deposit" everywhere else in this step;
# confirm that a "Classification" column really exists in test_data.
predictions.select("prediction", "Classification",
                   "features").toPandas().head(10)

# Score on the continuous 'rawPrediction' output instead of the thresholded
# 'prediction' column, so the metric (also used below for cross-validated
# model selection) reflects ranking quality rather than one operating point.
evaluator = BinaryClassificationEvaluator(labelCol="deposit",
                                          rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

paramGrid = ParamGridBuilder().addGrid(
    dt.maxDepth, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]).build()

# Set up 3-fold cross validation
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)
CV_model = crossval.fit(train_data)

# Stage 0 is the assembler, stage 1 the fitted decision tree.
tree_model = CV_model.bestModel.stages[1]
print(tree_model)

predictions_improved = CV_model.bestModel.transform(test_data)
predictions_improved.select("prediction", "deposit",
                            "features").toPandas().head(10)
evaluator.evaluate(predictions_improved)
# ------------------------------------------------------------------------------
# Run Logistic Regression Classification.
# ------------------------------------------------------------------------------

# Binomial logistic regression; custom names are given to the prediction
# and raw-prediction output columns.
lr = LogisticRegression(
    family='binomial',
    featuresCol='features',
    labelCol='label',
    predictionCol='pred',
    rawPredictionCol='pred_raw',
    maxIter=10,
)
lr_model = lr.fit(trainDF)
lr_result = lr_model.transform(testDF)

# Evaluators measuring classification performance: area under the PR curve
# from the raw scores, then F1 and accuracy from the predicted labels.
evaluator1 = BinaryClassificationEvaluator(
    rawPredictionCol='pred_raw',
    labelCol='label',
    metricName='areaUnderPR',
)
evaluator2 = MulticlassClassificationEvaluator(
    predictionCol="pred",
    labelCol="label",
    metricName="f1",
)
evaluator3 = MulticlassClassificationEvaluator(
    predictionCol="pred",
    labelCol="label",
    metricName="accuracy",
)

# Each evaluator scores the same transformed test DataFrame.
area_under_pr = evaluator1.evaluate(lr_result)
f1_score = evaluator2.evaluate(lr_result)
accuracy = evaluator3.evaluate(lr_result)

print("")
print(
    "########################################################################"
)
# Generate predictions on the test DataFrame:
test_with_prediction = log_reg_model.transform(test)
test_with_prediction.show(5)

# evaluate() applies the model itself, so it is given the untransformed
# `test` DataFrame: passing `test_with_prediction` fails because the model's
# output columns already exist in it.
test_summary_pred = log_reg_model.evaluate(test)
# NOTE(review): this plots `test_summary`, which is not defined in this
# section, rather than `test_summary_pred` computed above -- confirm which
# summary is intended.
plot_roc_curve(test_summary)

# **Note:** The resulting DataFrame includes three types of predictions.
# `rawPrediction` is a vector of log-odds, `probability` is a vector of
# probabilities, and `prediction` is the predicted class based on the
# probability vector.

# Create an instance of the `BinaryClassificationEvaluator` class:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                          labelCol="high_rating",
                                          metricName="areaUnderROC")
print(evaluator.explainParams())
evaluator.evaluate(test_with_prediction)

# Evaluate using the area-under-PR metric instead:
evaluator.setMetricName("areaUnderPR").evaluate(test_with_prediction)

# ## References

# [Spark Documentation - Classification and regression](https://spark.apache.org/docs/latest/ml-classification-regression.html)

# [Spark Python API - pyspark.ml.feature module](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#module-pyspark.ml.feature)

# [Spark Python API - pyspark.ml.classification module](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#module-pyspark.ml.classification)
# In[65]: final_model = LogisticRegression() fit_final_model = final_model.fit(lr_train) # <font size=4,font style=arial> # train setine uyguladığımız modeli test edelim # </font> # In[66]: predictions_and_labels = fit_final_model.evaluate(lr_test) # <font size=4,font style=arial> # label gerçekleşen ve prediction da tahmin olmak üzere aşağıda ki şekildedir # </font> # In[67]: predictions_and_labels.predictions.show(100, truncate=False) # <font size=4,font style=arial> # Roc eğrisinin altında ki alanı rakam olarak görelim. 1'e yakın bir değer iyi bir değerdir. Veri seti manual oluşturulduğundan aşağıda ki şekilde bir değer çıkmıştır. # </font> # In[68]: my_eval = BinaryClassificationEvaluator() my_final_roc = my_eval.evaluate(predictions_and_labels.predictions) my_final_roc
def evaluate_ROC(predictions):
    """Score *predictions* with a default BinaryClassificationEvaluator.

    The evaluator is created with no arguments, so its default label column,
    raw-prediction column and metric apply.

    Args:
        predictions: DataFrame accepted by
            ``BinaryClassificationEvaluator.evaluate``.

    Returns:
        The value returned by the evaluator's ``evaluate`` call.
    """
    # Imported lazily so pyspark is only required when the function runs.
    from pyspark.ml.evaluation import BinaryClassificationEvaluator

    return BinaryClassificationEvaluator().evaluate(predictions)
def _write_single_csv(frame, path):
    """Write ``frame`` to ``path`` as headered CSV from a single partition."""
    frame.repartition(1).write.format("com.databricks.spark.csv").\
        option("header", "true").save(path)


def main(context):
    """Main function takes a Spark SQL context.

    Pipeline, in order:
      * load comments / submissions (parquet) and the labeled CSV,
      * build n-gram features with ``cleantext.sanitize`` + CountVectorizer,
      * train one positive and one negative logistic-regression model
        (5-fold cross-validation, 50/50 random split),
      * classify a sample of all comments and write the aggregated results
        (overall, by date, by state, by comment score, by story score) as CSV.

    Besides ``context`` this function reads the module-level names
    ``sampleRate`` (sampling fraction) and ``states`` (accepted state names);
    neither is defined inside this function.

    :param context: Spark SQL context used for every read.
    """
    # TASK 1
    # the read is from the parquet file
    # Use the context that was passed in rather than a global ``sqlContext``,
    # so the function works with whatever context the caller supplies.
    comments = context.read.parquet("comments-minimal.parquet")
    submissions = context.read.parquet("submissions.parquet")
    # only look at columns that are useful
    comments = comments.select("id", "created_utc", "body", "author_flair_text", "link_id", "score").\
        withColumnRenamed("score", "commentscore")
    submissions = submissions.select("id", "title", "score").\
        withColumnRenamed("score", "storyscore")
    #comments.write.parquet("comments-minimal.parquet")
    #submissions.write.parquet("submissions.parquet")

    # TASK 2
    labeled_data = context.read.format("csv").options(
        header='true', inferSchema='true').load('labeled_data.csv')
    #here we do the join on comment id
    joined = comments.join(labeled_data, comments.id == labeled_data.Input_id)
    #comments.join(labeled_data, comments.id == labeled_data.Input_id).explain()

    # TASK 4
    #sanitize_new ignores processed string given by sanitize
    from cleantext import sanitize

    def sanitize_new(text):
        # Drop element 0 of sanitize()'s result and merge the next three
        # space-separated strings into one flat token list.
        r = sanitize(text)[1:]
        return r[0].split(" ") + r[1].split(" ") + r[2].split(" ")

    # TASK 5
    #create the udf, generate new column of n-grams
    sanitize_udf = udf(sanitize_new, ArrayType(StringType()))
    joined = joined.withColumn("ngrams", sanitize_udf(joined.body))

    # TASK 6A
    # construct feature vector based on "ngrams"
    #store the transformed column in "features"
    #CountVectorizer produces sparse vector by default so no need to change
    cv = CountVectorizer(inputCol="ngrams", outputCol="features", minDF=5.0, binary=True)
    cv_model = cv.fit(joined)
    joined = cv_model.transform(joined)

    # TASK 6B
    # construct pos column and neg column
    #for this project, only look at label on Trump
    pos_udf = udf(lambda label: 1 if label == 1 else 0, IntegerType())
    neg_udf = udf(lambda label: 1 if label == -1 else 0, IntegerType())
    joined = joined.withColumn("poslabel", pos_udf(joined.labeldjt))
    joined = joined.withColumn("neglabel", neg_udf(joined.labeldjt))

    # TASK 7
    #train logistic regression model
    #code adopted from project spec
    #Initialize two logistic regression models.
    poslr = LogisticRegression(labelCol="poslabel", featuresCol="features", maxIter=10)
    neglr = LogisticRegression(labelCol="neglabel", featuresCol="features", maxIter=10)
    poslr.setThreshold(0.2)
    neglr.setThreshold(0.25)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator(labelCol="poslabel")
    negEvaluator = BinaryClassificationEvaluator(labelCol="neglabel")
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 1.0. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr, evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid, numFolds=5)
    negCrossval = CrossValidator(estimator=neglr, evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid, numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    # (the test halves are only consumed by the commented-out ROC code below)
    posTrain, posTest = joined.randomSplit([0.5, 0.5])
    negTrain, negTest = joined.randomSplit([0.5, 0.5])
    # Train the models
    posModel = posCrossval.fit(posTrain)
    negModel = negCrossval.fit(negTrain)

    # TASK: Extra Credit's Curve:
    # evaluate the model
    # posTestRes = posModel.transform(posTest).toPandas()['probability']
    # posTestRes = np.array([i[1] for i in posTestRes])
    # negTestRes = negModel.transform(negTest).toPandas()['probability']
    # negTestRes = np.array([i[1] for i in negTestRes])
    # print(negTestRes, posTestRes)
    # print('ok')
    # pfpr, ptpr, _ = metrics.roc_curve(posTest.select('poslabel').toPandas(), posTestRes)
    # nfpr, ntpr, _ = metrics.roc_curve(negTest.select('neglabel').toPandas(), negTestRes)
    # print(pfpr[:5], ptpr[:5], nfpr[:5],ntpr[:5])
    # plt.plot(pfpr, ptpr, label = 'posModel')
    # plt.plot(nfpr, ntpr, label = 'negModel')
    # plt.legend()
    # plt.savefig('ROC.png')
    # plt.close()

    # # save the models
    # posModel.save("www/pos.model")
    # negModel.save("www/neg.model")
    #load instead
    # posModel = CrossValidatorModel.load("www/pos.model")
    # negModel = CrossValidatorModel.load("www/neg.model")
    #print("finished loading model")

    # TASK 8.1
    # selected column 'created_utc' and transformed in 10.2 using from_unixtime
    # TASK 8.2
    # title of submission of the comment
    # link_id carries a "t3_" prefix; strip it so it matches submissions.id
    comments = comments.withColumn("clean_id", regexp_replace("link_id", r'^t3_', ''))
    comments = comments.join(
        submissions, comments.clean_id == submissions.id).drop(submissions.id)
    # TASK 8.3
    # Please see TASK 10.3 (by state) below

    # TASK 9
    #filter out comments with "\s" and starts with ">"
    # NOTE(review): r'\\s' matches a literal backslash followed by "s";
    # confirm this is the intended marker.
    comments = comments.filter(~comments.body.rlike(r'^>')).\
        filter(~comments.body.rlike(r'\\s'))
    #sample
    # NOTE: the seed argument is None, so the sample differs between runs;
    # pass an integer seed here if the results must be reproducible.
    comments = comments.sample(False, sampleRate, None)
    #redo 4,5,6a
    comments = comments.withColumn("ngrams", sanitize_udf(comments.body))
    comments = cv_model.transform(comments)
    #print("done with transforming the sampled comments")
    #make predictions
    # Each model's "prediction" column is renamed right away so the second
    # transform does not collide with the first one's output columns.
    comments = posModel.transform(comments).\
        drop("body", "link_id", "clean_id", "ngrams", "rawPrediction", "probability").\
        withColumnRenamed("prediction", "poslabel")
    comments = negModel.transform(comments).drop("features", "rawPrediction", "probability").\
        withColumnRenamed("prediction", "neglabel")

    # TASK 10.1
    # compute the percentage of positive, negative comments
    #print("Percentage of positive comments")
    result = comments.select('poslabel').groupBy().avg()
    _write_single_csv(result, "pos-perc.csv")
    #print("Percentage of negative comments")
    result = comments.select('neglabel').groupBy().avg()
    _write_single_csv(result, "neg-perc.csv")

    # TASK 10.2
    #2. by date
    # "yyyy" is the calendar year. The previous "YYYY" is the week-based
    # year, which labels days around New Year with the wrong year.
    comments = comments.withColumn(
        "date", from_unixtime(comments.created_utc, "yyyy-MM-dd"))
    result = comments.groupBy("date").agg({
        "poslabel": "mean",
        "neglabel": "mean"
    })
    _write_single_csv(result, "time_data.csv")

    # TASK 10.3
    #3. by state
    # Keep the (lower-cased) flair only when it is one of ``states``.
    val_state_udf = udf(lambda state: state if state in states else None,
                        StringType())
    comments = comments.withColumn(
        "state", val_state_udf(lower(comments.author_flair_text)))
    comments = comments.filter(comments.state.isNotNull())
    result = comments.groupBy("state").agg({
        "poslabel": "mean",
        "neglabel": "mean"
    })
    result.show(truncate=False)
    #print(result.count())
    _write_single_csv(result, "state_data.csv")

    # TASK 10.4
    #4a. by comment score
    result = comments.groupBy("commentscore").agg({
        "poslabel": "mean",
        "neglabel": "mean"
    })
    _write_single_csv(result, "comment_score.csv")
    #4b. by story score
    result = comments.groupBy("storyscore").agg({
        "poslabel": "mean",
        "neglabel": "mean"
    })
    _write_single_csv(result, "story_score.csv")

    # DELIVERABLE 4.
    # For the ten story scores with the highest mean positive (then negative)
    # label, show up to 20 (storyscore, title) rows.
    # join is too expensive, subquery is also expensive
    for ranking_col in ('avg(poslabel)', 'avg(neglabel)'):
        story = result.orderBy(ranking_col, ascending=False).limit(10)
        score_list = set(story.select('storyscore').toPandas()['storyscore'])
        comments[comments.storyscore.isin(score_list)].select(
            'storyscore', 'title').limit(20).show(truncate=False)
print("===========VectorAssembler====================")
# Every column except the first and the last is a feature;
# the last column is the label.
feature = df.columns[1:len(df.columns)-1]
lable_name = df.columns[-1]
print(lable_name)
assembler = VectorAssembler(inputCols=feature, outputCol="features")

print("=============pipeline==================")
model = LogisticRegression(regParam=0.1, labelCol=lable_name,
                           featuresCol="features", family='binomial')
pipeline = Pipeline(stages=[assembler, model])
pipeline.getStages()

print("===========TaintingAndTesting====================")
pipelineModel = pipeline.fit(train_df)
predicted = pipelineModel.transform(test_df)

print("===========PredictedAUC====================")
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol=lable_name,
    metricName="areaUnderROC"
)
auc = evaluator.evaluate(predicted)
print(auc)

print("===========PredictedScore====================")
Multi_evaluator = MulticlassClassificationEvaluator(labelCol=lable_name)
# The override map must be keyed by Multi_evaluator's OWN metricName param.
# A Param is tied to the instance that owns it, so keys taken from the
# binary `evaluator` do not override anything on Multi_evaluator and every
# call would report the same default metric.
Accuracy = Multi_evaluator.evaluate(predicted, {Multi_evaluator.metricName: "accuracy"})
Precision = Multi_evaluator.evaluate(predicted, {Multi_evaluator.metricName: "weightedPrecision"})
Recall = Multi_evaluator.evaluate(predicted, {Multi_evaluator.metricName: "weightedRecall"})
F1 = Multi_evaluator.evaluate(predicted, {Multi_evaluator.metricName: "f1"})
print("Accuracy", Accuracy, "Precision", Precision, "Recall", Recall, "F1", F1)
# Evaluate model based on AUC ROC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Evaluate model based on F1 score
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Evaluate model based on confusion matrix
from pyspark.mllib.evaluation import MulticlassMetrics

# Fit a logistic regression on the training data; no hyper-parameters are
# passed, so the estimator's defaults apply.
lrModel = LogisticRegression().fit(trainData)

# make prediction on test data
pred = lrModel.transform(testData)
pred.select('catLabel', 'label', 'prediction').show()

evaluator1 = BinaryClassificationEvaluator(
    labelCol='label',
    metricName="areaUnderROC",
)
evaluator2 = MulticlassClassificationEvaluator(
    labelCol='label',
    metricName="f1",
)

# The pairs are fed in as (label, prediction); the matrix is transposed
# when it is displayed at the end of this cell.
labelPredictionPairs = pred.select('label', 'prediction').rdd.map(tuple)
metrics = MulticlassMetrics(labelPredictionPairs)

aucRoc = evaluator1.evaluate(pred)
print('AUC ROC of Logistic Regression model is %f' % aucRoc)
f1Score = evaluator2.evaluate(pred)
print('F1 score of Logistic Regression model is %f' % f1Score)

confusion = metrics.confusionMatrix().toArray()
confusion.transpose()


# <a id="context322"></a>
# #### 3.2.2. Decision Tree

# In[20]:


from pyspark.ml.classification import DecisionTreeClassifier
return crossval  # tail of a function whose header lies outside this excerpt


from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# Two evaluators over the "labels" column: area under ROC and accuracy.
# NOTE(review): evaluatorB reads the hard "prediction" column as its raw
# prediction, so the ROC is built from 0/1 outputs rather than scores;
# confirm this is intended instead of the "rawPrediction" column.
evaluatorB = BinaryClassificationEvaluator(labelCol="labels", rawPredictionCol="prediction", metricName="areaUnderROC")
evaluatorM = MulticlassClassificationEvaluator(labelCol="labels", predictionCol="prediction", metricName="accuracy")

# Single-stage pipeline wrapping a decision-tree classifier.
dt = DecisionTreeClassifier(labelCol="labels", featuresCol="features")
pipeline = Pipeline(stages=[dt])

# Grid of 2 depths x 2 impurity measures = 4 candidate settings.
paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10]) \
    .addGrid(dt.impurity, ['gini', 'entropy'])\
    .build()

print("Decision Tree Classifier, Metric: Area Under ROC")
# `metrics` is defined elsewhere in the file; presumably the function whose
# `return crossval` appears at the top of this excerpt (TODO confirm).
crossval = metrics(pipeline, paramGrid, evaluatorB)
# `reg`, `root_run`, `trainingData` and `testData` come from code outside
# this excerpt.
print("Regularization rate: {}".format(reg))
# create a bunch of child runs
with root_run.child_run("reg-" + str(reg)) as run:
    # Train a one-stage pipeline holding a logistic regression configured
    # with the current regularization rate.
    lr = LogisticRegression(regParam=reg)
    pipe = Pipeline(stages=[lr])
    model_pipeline = pipe.fit(trainingData)
    predictions = model_pipeline.transform(testData)

    # evaluate. note only 2 metrics are supported out of the box by Spark ML.
    bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
    bce.setMetricName('areaUnderROC')
    au_roc = bce.evaluate(predictions)
    bce.setMetricName('areaUnderPR')
    au_prc = bce.evaluate(predictions)

    # Confusion-matrix cells, counted in the order TP, FP, TN, FN.
    labelOnly = predictions.select("label")
    truePositive, falsePositive, trueNegative, falseNegative = [
        labelOnly.filter(condition).count()
        for condition in (
            "label = 1 and prediction = 1",
            "label = 0 and prediction = 1",
            "label = 0 and prediction = 0",
            "label = 1 and prediction = 0",
        )
    ]

    # log reg, au_roc, au_prc and feature names in run history
    for metric_name, metric_value in (("reg", reg),
                                      ("au_roc", au_roc),
                                      ("au_prc", au_prc)):
        run.log(metric_name, metric_value)

    print("Area under ROC: {}".format(au_roc))
    print("Area Under PR: {}".format(au_prc))
# Fit the decision tree and score the held-out data.
tree_model = tree_classifier.fit(training_data)
predictions = tree_model.transform(test_data)
#print(tree_model.toDebugString)

# Share of test rows whose predicted class differs from the true label.
test_error = predictions.filter(
    predictions["prediction"] != predictions["Accident_Severity"]).count(
    ) / float(test_data.count())
# Call form with a single argument: prints the same text on Python 2 and is
# valid syntax on Python 3 (the bare print statement is not).
print("Testing error: {0:.4f}".format(test_error))

# Select example rows to display.
predictions.select("prediction", "Accident_Severity", "features").show(5)

# Decision tree model
print(tree_model.toDebugString)

# Evaluation of the decision tree
evaluatorMulti = MulticlassClassificationEvaluator(
    labelCol="Accident_Severity", predictionCol="prediction")
# NOTE(review): the binary evaluator reads the hard "prediction" column as
# its raw prediction; confirm that "Accident_Severity" is binary and that
# scoring on 0/1 outputs (not "rawPrediction") is intended.
evaluator = BinaryClassificationEvaluator(labelCol="Accident_Severity",
                                          rawPredictionCol="prediction",
                                          metricName='areaUnderROC')

# One evaluator, four metrics: the metric is overridden per call.
acc = evaluatorMulti.evaluate(predictions,
                              {evaluatorMulti.metricName: "accuracy"})
f1 = evaluatorMulti.evaluate(predictions, {evaluatorMulti.metricName: "f1"})
Precision = evaluatorMulti.evaluate(
    predictions, {evaluatorMulti.metricName: "weightedPrecision"})
Recall = evaluatorMulti.evaluate(predictions,
                                 {evaluatorMulti.metricName: "weightedRecall"})
auc = evaluator.evaluate(predictions)

print('Accuracy score: ', acc)
print('f1: ', f1)
print('Precision: ', Precision)
print('Recall: ', Recall)
print('Auc: ', auc)

# contingency table