# In[75]:

# Evaluate the decision tree predictions on the test set
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="class_attack",
                                              predictionCol="prediction",
                                              metricName="accuracy")
evaluator.evaluate(conn_test_pred)

# - Conclusion: the decision tree handles this wide range of features well, reaching roughly 99% prediction accuracy on the test set.

# In[76]:

print(dt.explainParams())

# ## Training a random forest classifier

# In[77]:

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# In[78]:

rf = RandomForestClassifier(labelCol="class_attack",
                            featuresCol="features")  # featuresCol assumed to be the assembled "features" column used for the decision tree
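# In[ ]:

# A minimal sketch (not part of the original notebook) of fitting and scoring the
# random forest. The DataFrame names conn_train and conn_test are assumptions for
# the train/test split used with the decision tree above; the accuracy evaluator
# defined earlier is simply reused.
rf_model = rf.fit(conn_train)                  # conn_train: assumed training DataFrame
rf_test_pred = rf_model.transform(conn_test)   # conn_test: assumed test DataFrame
print(evaluator.evaluate(rf_test_pred))        # random forest accuracy on the test set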
# COMMAND ----------

# Inspect the logistic regression training summary: area under ROC plus the ROC and precision-recall curves
summary = lrModel.summary
print(summary.areaUnderROC)
summary.roc.show()
summary.pr.show()

# COMMAND ----------

# Objective (loss) value at each training iteration
summary.objectiveHistory

# COMMAND ----------

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier()
print(dt.explainParams())
dtModel = dt.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier
rfClassifier = RandomForestClassifier()
print(rfClassifier.explainParams())
trainedModel = rfClassifier.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import GBTClassifier
gbtClassifier = GBTClassifier()
print(gbtClassifier.explainParams())
trainedModel = gbtClassifier.fit(bInput)
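# COMMAND ----------

# A minimal sketch (not part of the original notebook) of scoring one of the fitted
# tree models and measuring area under ROC. It assumes bInput uses the spark.ml
# default column names "label" and "features".
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = dtModel.transform(bInput)
binEval = BinaryClassificationEvaluator(labelCol="label",
                                        rawPredictionCol="rawPrediction",
                                        metricName="areaUnderROC")
print(binEval.evaluate(predictions))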
"pg2", "pg3", "pg4", "pg5", "pg6", "pg7", "pg8", "pg9", "pg10", "pg11", "pg12", "pg13", "pg14", "pg15", "pg16", "pg17", "pg18", "ps1", "ps2", "ps3", "ps4", "ps5", "ps6", "ps7", "ps8", "ps9", "drugCount_max", "drugCount_min", "drugCount_ave", "drugcount_months", "labCount_max", "labCount_min", "labCount_ave", "labcount_months" ]) vecAssembler.setOutputCol("features") print vecAssembler.explainParams() from pyspark.ml.classification import DecisionTreeClassifier aft = DecisionTreeClassifier() aft.setLabelCol("Readmitlabel") aft.setMaxDepth(30) print aft.explainParams() # COMMAND ---------- from pyspark.ml import Pipeline # We will use the new spark.ml pipeline API. If you have worked with scikit-learn this will be very familiar. lrPipeline = Pipeline() # Now we'll tell the pipeline to first create the feature vector, and then do the linear regression lrPipeline.setStages([vecAssembler, aft]) # Pipelines are themselves Estimators -- so to use them we call fit: lrPipelineModel = lrPipeline.fit(finaldf) # COMMAND ----------