# Train/test split produced by randomSplit upstream.
train = splits[0]
test = splits[1]

# Build and train the Decision Tree model, persist it, and predict on the
# held-out set. Timestamps are printed before and after training to eyeball
# the wall-clock cost of the fit.
now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)

dt = DecisionTreeClassifier(
    labelCol='attack_cat_index',
    featuresCol='features',
    impurity='entropy',
    seed=1234,
    maxBins=136,   # must be >= the largest categorical cardinality in 'features' -- TODO confirm
    maxDepth=25,
    predictionCol='prediction',
)
dt = dt.fit(train)

# Persist the fitted model, overwriting any previous run.
model_output_path = "{}/data/DecisionTree_extended.bin".format(base_path)
dt.write().overwrite().save(model_output_path)

now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)

# Score the test set and pull (actual, predicted) pairs into the driver.
result = dt.transform(test)
prediction_df = result.select("attack_cat_index", "prediction").toPandas()
prediction_list = prediction_df[["attack_cat_index", "prediction"]].values.tolist()


def truePositiveRate(list, label):
    """Return the per-class true-positive rate (recall) for *label*.

    Parameters
    ----------
    list : iterable of (actual, predicted) pairs
        Parameter keeps its original builtin-shadowing name for backward
        compatibility with keyword callers; it is aliased internally.
    label : class value to score.

    Returns
    -------
    float
        TP / (TP + FN). Returns 0.0 when *label* never occurs as an actual
        class (the original raised ZeroDivisionError in that case).
    """
    pairs = list  # avoid relying on the shadowed builtin name below
    tot_count = 0
    true_count = 0
    for actual, predicted in pairs:
        if actual == label:
            tot_count += 1
            # A true positive is a correct prediction on an actual
            # occurrence of the class.
            if predicted == label:
                true_count += 1
    return true_count / tot_count if tot_count else 0.0
# F1 score and confusion matrix for the logistic-regression predictions from
# the previous section (`pred`, `evaluator2`, `metrics` are still bound to the
# LR results at this point).
print('F1 score of Logistic Regression model is %f' % evaluator2.evaluate(pred))
metrics.confusionMatrix().toArray().transpose()

# <a id="context322"></a>
# #### 3.2.2. Decision Tree

# In[14]:

from pyspark.ml.classification import DecisionTreeClassifier

# model on training data maxDepth is the hyperparameter
dtModel = DecisionTreeClassifier(maxDepth=3).fit(trainData)

# make prediction on test data
pred = dtModel.transform(testData)
pred.select('label', 'prediction').show()

# Evaluators: ROC AUC (binary) and F1 (multiclass evaluator applied to the
# same label column); MulticlassMetrics wants an RDD of plain tuples.
evaluator1 = BinaryClassificationEvaluator(labelCol='label', metricName="areaUnderROC")
evaluator2 = MulticlassClassificationEvaluator(labelCol='label', metricName="f1")
metrics = MulticlassMetrics(pred.select('label', 'prediction').rdd.map(tuple))

print('AUC ROC of Decision Tree model is %f' % evaluator1.evaluate(pred))
print('F1 score of Decision Tree model is %f' % evaluator2.evaluate(pred))
metrics.confusionMatrix().toArray().transpose()

# <a id="context323"></a>
# #### 3.2.3. Random Forest
# Build parameter grid params = params.build() -------------------------------------------------- # Exercise_9 # Import the classes required from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier from pyspark.ml.evaluation import BinaryClassificationEvaluator # Create model objects and train on training data tree = DecisionTreeClassifier().fit(flights_train) gbt = GBTClassifier().fit(flights_train) # Compare AUC on testing data evaluator = BinaryClassificationEvaluator() evaluator.evaluate(tree.transform(flights_test)) evaluator.evaluate(gbt.transform(flights_test)) # Find the number of trees and the relative importance of features print(gbt.getNumTrees) print(gbt.featureImportances) -------------------------------------------------- # Exercise_10 # Create a random forest classifier forest = RandomForestClassifier() # Create a parameter grid params = ParamGridBuilder() \ .addGrid(forest.featureSubsetStrategy, ['all', 'onethird', 'sqrt', 'log2']) \ .addGrid(forest.maxDepth, [2, 5, 10]) \
kars = cars_assembled.select('features', 'origin_idx') #kars.show(9) # Split data into training and testing sets kars_train, kars_test = kars.randomSplit([0.8, 0.2], seed=23) print(kars_train.count(), kars_test.count()) # Create a Decision Tree classifier tree = DecisionTreeClassifier(labelCol="origin_idx") # Learn from training data tree = tree.fit(kars_train) # Make predictions on testing data prediction = tree.transform(kars_test) prediction.show(9) # Confusion matrix confusion_matrix = prediction.groupBy("origin_idx", "prediction").count() confusion_matrix.show() # Accuracy evaluator = MulticlassClassificationEvaluator(labelCol="origin_idx", metricName="accuracy") accuracy = evaluator.evaluate(prediction) print("Test set accuracy = " + str(accuracy)) spark.stop() '''
# Print the confusion-matrix counts computed in the previous cell.
print("truePositive: " + str(truePositive))
print("falsePositive: " + str(falsePositive))
print("trueNegative: " + str(trueNegative))
print("falseNegative: " + str(falseNegative))
print("-----")

# COMMAND ----------

# MAGIC %md #6. Decision tree - different algorithm

# COMMAND ----------

# Fit a depth-3 decision tree and score the held-out split.
dtModel = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxDepth=3).fit(trainingData)
predictions = dtModel.transform(testData)

# Confusion-matrix cells via filtered counts (assumes binary 0/1 labels).
truePositive = predictions.select("label").filter("label = 1 and prediction = 1").count()
falsePositive = predictions.select("label").filter("label = 0 and prediction = 1").count()
trueNegative = predictions.select("label").filter("label = 0 and prediction = 0").count()
falseNegative = predictions.select("label").filter("label = 1 and prediction = 0").count()

print("truePositive: " + str(truePositive))
print("falsePositive: " + str(falsePositive))
print("trueNegative: " + str(trueNegative))
print("falseNegative: " + str(falseNegative))

# COMMAND ----------

# Sweep over tree depths (the loop body continues past this chunk of the file).
maxDepth = [1, 3, 5, 10]
for maxd in maxDepth:
#----------------- Decision and Random Forest -----------------
# Final assembly: normalized numeric columns plus the one-hot vectors for
# every categorical column.
inputCols = ['norm_cols'] + [cname + "classVec" for cname in categorical_cols]
final_assembler = VectorAssembler(inputCols=inputCols, outputCol='features')
stages += [final_assembler]

pipeline = Pipeline(stages=stages)

# BUG FIX: the original called pipeline.fit(train) AND pipeline.fit(test).
# Re-fitting on the test set re-learns the StringIndexer/encoder mappings from
# test data, giving encodings inconsistent with training and leaking test
# information. Fit once on train; reuse the fitted PipelineModel for both.
pipeline_model = pipeline.fit(train)
train_final = pipeline_model.transform(train)
test_final = pipeline_model.transform(test)

# Train both tree models on the assembled features and score the test set.
dt = DecisionTreeClassifier(featuresCol='features', labelCol='label').fit(train_final)
res_dt = dt.transform(test_final)

rf = RandomForestClassifier(featuresCol='features', labelCol='label', numTrees=20).fit(train_final)
res_rf = rf.transform(test_final)

# Write (prediction, label) pairs per model under the output prefix in argv[2].
res_lr.select('prediction', 'label').write.csv(sys.argv[2] + "lr", header=True)
res_dt.select('prediction', 'label').write.csv(sys.argv[2] + "dt", header=True)
res_rf.select('prediction', 'label').write.csv(sys.argv[2] + "rf", header=True)

spark.stop()