# In[75]:

# Evaluate
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="class_attack",
                                              metricName="accuracy",
                                              predictionCol="prediction")
evaluator.evaluate(conn_test_pred)

# - Conclusion: the decision tree performs well on this feature set, reaching roughly 99% prediction accuracy on the test data.
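
# In[ ]:

# Accuracy alone can be misleading when classes are imbalanced; the same
# evaluator also reports F1 and weighted precision/recall. A minimal sketch,
# reusing `evaluator` and `conn_test_pred` from above:
for metric in ["f1", "weightedPrecision", "weightedRecall"]:
    evaluator.setMetricName(metric)
    print(metric, evaluator.evaluate(conn_test_pred))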

# In[76]:

print(dt.explainParams())

# ## Training a random forest classifier

# In[77]:

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# In[78]:

# featuresCol/predictionCol are assumed to match the columns used above.
rf = RandomForestClassifier(labelCol="class_attack",
                            featuresCol="features",
                            predictionCol="prediction")
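
# A hypothetical continuation to actually train and score the forest;
# `conn_train` and `conn_test` are assumed names for the splits used earlier.
rf_model = rf.fit(conn_train)
rf_test_pred = rf_model.transform(conn_test)
evaluator.setMetricName("accuracy")
print(evaluator.evaluate(rf_test_pred))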
# ## Example #2
# COMMAND ----------

# Inspect the training summary of the fitted logistic regression model
# (`lrModel` is assumed to come from an earlier cell).
summary = lrModel.summary
print(summary.areaUnderROC)
summary.roc.show()
summary.pr.show()

# COMMAND ----------

summary.objectiveHistory
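
# COMMAND ----------

# `objectiveHistory` holds the objective value at each training iteration;
# printing it per iteration makes convergence easy to inspect:
for i, loss in enumerate(summary.objectiveHistory):
    print(i, loss)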

# COMMAND ----------

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier()
print(dt.explainParams())
dtModel = dt.fit(bInput)
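
# COMMAND ----------

# A minimal evaluation sketch: score `bInput` (assumed to carry the default
# `label` column) and compute area under ROC. In practice you would evaluate
# on a held-out split rather than the training data.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
dtPreds = dtModel.transform(bInput)
print(BinaryClassificationEvaluator().evaluate(dtPreds))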

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier
rfClassifier = RandomForestClassifier()
print(rfClassifier.explainParams())
trainedModel = rfClassifier.fit(bInput)

# COMMAND ----------

from pyspark.ml.classification import GBTClassifier
gbtClassifier = GBTClassifier()
print(gbtClassifier.explainParams())
trainedModel = gbtClassifier.fit(bInput)
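
# COMMAND ----------

# Fitted tree ensembles expose per-feature importances; a quick look at
# which inputs drive the predictions:
print(trainedModel.featureImportances)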
    "pg2", "pg3", "pg4", "pg5", "pg6", "pg7", "pg8", "pg9", "pg10", "pg11",
    "pg12", "pg13", "pg14", "pg15", "pg16", "pg17", "pg18", "ps1", "ps2",
    "ps3", "ps4", "ps5", "ps6", "ps7", "ps8", "ps9", "drugCount_max",
    "drugCount_min", "drugCount_ave", "drugcount_months", "labCount_max",
    "labCount_min", "labCount_ave", "labcount_months"
])
vecAssembler.setOutputCol("features")
print vecAssembler.explainParams()

from pyspark.ml.classification import DecisionTreeClassifier

# Despite its name, `aft` is a DecisionTreeClassifier; maxDepth=30 is the
# largest depth Spark ML trees allow, which risks overfitting.
aft = DecisionTreeClassifier()
aft.setLabelCol("Readmitlabel")
aft.setMaxDepth(30)

print(aft.explainParams())

# COMMAND ----------

from pyspark.ml import Pipeline

# We will use the spark.ml Pipeline API. If you have worked with
# scikit-learn this will be very familiar.
lrPipeline = Pipeline()

# Now we'll tell the pipeline to first create the feature vector, and then
# fit the decision tree classifier defined above.
lrPipeline.setStages([vecAssembler, aft])

# Pipelines are themselves Estimators -- so to use them we call fit:
lrPipelineModel = lrPipeline.fit(finaldf)
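
# COMMAND ----------

# Hypothetical follow-up: score the same DataFrame and compute AUC. A real
# workflow would hold out a test split (e.g. via randomSplit) instead of
# evaluating on the training data.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = lrPipelineModel.transform(finaldf)
pipelineEval = BinaryClassificationEvaluator(labelCol="Readmitlabel")
print(pipelineEval.evaluate(predictions))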

# COMMAND ----------