train = splits[0]
test = splits[1]

# Create the Decision Tree model, train it, save it, and make predictions
import datetime

from pyspark.ml.classification import DecisionTreeClassifier

now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)

dt = DecisionTreeClassifier(labelCol='attack_cat_index', featuresCol='features', impurity='entropy', seed=1234, maxBins=136, maxDepth=25,
                            predictionCol='prediction')
dt = dt.fit(train)
model_output_path = "{}/data/DecisionTree_extended.bin".format(base_path)
dt.write().overwrite().save(model_output_path)
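# A minimal sketch (not in the original): the saved model can be reloaded
# later with the matching model class from pyspark.ml.classification.
from pyspark.ml.classification import DecisionTreeClassificationModel
dt_loaded = DecisionTreeClassificationModel.load(model_output_path)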

now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)
result = dt.transform(test)

prediction_df = result.select("attack_cat_index", "prediction").toPandas()
prediction_list = prediction_df.values.tolist()

# Function for the per-class true positive rate (TPR)
def truePositiveRate(pairs, label):
    tot_count = 0
    true_count = 0
    for actual, predicted in pairs:
        if actual == label:
            tot_count += 1
            if predicted == label:
                true_count += 1
    # Guard against classes absent from the test set
    return true_count / tot_count if tot_count else 0.0
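# A minimal usage sketch (not in the original): report the TPR for every class
# present in the test labels, taken from prediction_list above.
for class_label in sorted({pair[0] for pair in prediction_list}):
    print('TPR for class %s: %f' % (class_label, truePositiveRate(prediction_list, class_label)))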
print('F1 score of Logistic Regression model is %f' %
      evaluator2.evaluate(pred))
metrics.confusionMatrix().toArray().transpose()

# <a id="context322"></a>
# #### 3.2.2. Decision Tree

# In[14]:

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

# Fit the model on the training data; maxDepth is the main hyperparameter
dtModel = DecisionTreeClassifier(maxDepth=3).fit(trainData)

# make prediction on test data
pred = dtModel.transform(testData)

pred.select('label', 'prediction').show()

evaluator1 = BinaryClassificationEvaluator(labelCol='label',
                                           metricName="areaUnderROC")
evaluator2 = MulticlassClassificationEvaluator(labelCol='label',
                                               metricName="f1")
# MulticlassMetrics expects (prediction, label) pairs; feeding (label, prediction)
# here is why the confusion matrix below is transposed before reading
metrics = MulticlassMetrics(pred.select('label', 'prediction').rdd.map(tuple))

print('AUC ROC of Decision Tree model is %f' % evaluator1.evaluate(pred))
print('F1 score of Decision Tree model is %f' % evaluator2.evaluate(pred))
metrics.confusionMatrix().toArray().transpose()
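# A minimal follow-up sketch (not in the original): per-class precision and
# recall from the same MulticlassMetrics object, with the class labels read
# from the predictions DataFrame.
for class_label in sorted(row[0] for row in pred.select('label').distinct().collect()):
    print('class %s: precision=%f, recall=%f'
          % (class_label, metrics.precision(class_label), metrics.recall(class_label)))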

# <a id="context323"></a>
# #### 3.2.3. Random Forest
Example #3
# Build parameter grid
params = params.build()

--------------------------------------------------
# Exercise_9 
# Import the classes required
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create model objects and train on training data
tree = DecisionTreeClassifier().fit(flights_train)
gbt = GBTClassifier().fit(flights_train)

# Compare AUC on testing data
evaluator = BinaryClassificationEvaluator()
print('Decision Tree AUC: %f' % evaluator.evaluate(tree.transform(flights_test)))
print('GBT AUC: %f' % evaluator.evaluate(gbt.transform(flights_test)))

# Find the number of trees and the relative importance of features
print(gbt.getNumTrees)
print(gbt.featureImportances)
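# A hedged sketch (not part of the exercise): featureImportances is a Spark ML
# Vector; pairing each index with its weight makes the ranking easier to read.
importances = gbt.featureImportances
ranked = sorted(enumerate(importances.toArray()), key=lambda kv: -kv[1])
print(ranked[:10])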

--------------------------------------------------
# Exercise_10 
# Create a random forest classifier
forest = RandomForestClassifier()

# Create a parameter grid
params = ParamGridBuilder() \
            .addGrid(forest.featureSubsetStrategy, ['all', 'onethird', 'sqrt', 'log2']) \
            .addGrid(forest.maxDepth, [2, 5, 10]) \
            .build()
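# A hedged sketch (not part of the exercise): the finished grid would typically
# feed a CrossValidator together with an evaluator; the training data name is
# assumed from Exercise_9.
from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(estimator=forest,
                    estimatorParamMaps=params,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=5)
# cv_model = cv.fit(flights_train)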
Example #4
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

kars = cars_assembled.select('features', 'origin_idx')
# kars.show(9)

# Split data into training and testing sets
kars_train, kars_test = kars.randomSplit([0.8, 0.2], seed=23)

print(kars_train.count(), kars_test.count())

# Create a Decision Tree classifier
tree = DecisionTreeClassifier(labelCol="origin_idx")

# Learn from training data
tree = tree.fit(kars_train)

# Make predictions on testing data
prediction = tree.transform(kars_test)

prediction.show(9)

# Confusion matrix
confusion_matrix = prediction.groupBy("origin_idx", "prediction").count()
confusion_matrix.show()
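# A small optional sketch (not in the original): pivot the counts into the
# usual confusion-matrix layout, true classes as rows, predictions as columns.
confusion_matrix.groupBy("origin_idx").pivot("prediction").sum("count").show()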

# Accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="origin_idx", metricName="accuracy")
accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))

spark.stop()

Example #5
    print("truePositive: " + str(truePositive))
    print("falsePositive: " + str(falsePositive))
    print("trueNegative: " + str(trueNegative))
    print("falseNegative: " + str(falseNegative))
    print("-----")

# COMMAND ----------

# MAGIC %md #6. Decision tree - different algorithm

# COMMAND ----------

dtModel = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=3).fit(trainingData)

predictions = dtModel.transform(testData)

truePositive = predictions.select("label").filter("label = 1 and prediction = 1").count()
falsePositive = predictions.select("label").filter("label = 0 and prediction = 1").count()
trueNegative = predictions.select("label").filter("label = 0 and prediction = 0").count()
falseNegative = predictions.select("label").filter("label = 1 and prediction = 0").count()

print("truePositive: " + str(truePositive))
print("falsePositive: " + str(falsePositive))
print("trueNegative: " + str(trueNegative))
print("falseNegative: " + str(falseNegative))

# COMMAND ----------

maxDepth = [1, 3, 5, 10]
for maxd in maxDepth:
    # ----------------- Decision Tree and Random Forest -----------------

    # Final assembly (copy the shared stages so the assembler is not
    # appended again on every iteration)
    inputCols = ['norm_cols'] + [cname + "classVec" for cname in categorical_cols]
    final_assembler = VectorAssembler(inputCols=inputCols,
                                      outputCol='features')
    full_stages = stages + [final_assembler]

    pipeline = Pipeline(stages=full_stages)
    fitted_pipeline = pipeline.fit(train)  # fit once on train to avoid leaking test data
    train_final = fitted_pipeline.transform(train)
    test_final = fitted_pipeline.transform(test)

    dt = DecisionTreeClassifier(featuresCol='features',
                                labelCol='label',
                                maxDepth=maxd).fit(train_final)
    res_dt = dt.transform(test_final)

    rf = RandomForestClassifier(featuresCol='features',
                                labelCol='label',
                                numTrees=20,
                                maxDepth=maxd).fit(train_final)
    res_rf = rf.transform(test_final)

    # res_lr is assumed to come from a logistic regression fitted earlier;
    # maxd is appended to each path so iterations do not overwrite each other
    res_lr.select('prediction', 'label').write.csv(sys.argv[2] + "lr_" + str(maxd),
                                                   header=True)
    res_dt.select('prediction', 'label').write.csv(sys.argv[2] + "dt_" + str(maxd),
                                                   header=True)
    res_rf.select('prediction', 'label').write.csv(sys.argv[2] + "rf_" + str(maxd),
                                                   header=True)

spark.stop()