Example #1
0
def random_forest_classifier():
    """Train a tiny random forest with Spark ML and persist it to disk.

    Steps performed:
      1. Get (or create) a SparkSession.
      2. Build a two-row DataFrame with "label" and "features" columns.
      3. Index the label column into "indexed" with a StringIndexer.
      4. Fit a RandomForestClassifier (3 trees, depth 2, seed 42).
      5. Score a single one-feature row with the fitted model.
      6. Save the estimator to "./rfc" and the fitted model to
         "./rfc_model" (overwriting existing output), then load each back.

    Takes no arguments and returns None; the observable effects are the
    Spark session and the two saved artifacts under the current directory.
    """
    builder = SparkSession.builder.appName("Python Spark SQL basic example")
    builder = builder.config("spark.some.config.option", "some-value")
    spark = builder.getOrCreate()

    # Toy training set: one dense and one (empty) sparse feature vector.
    rows = [
        (1.0, Vectors.dense(1.0)),
        (0.0, Vectors.sparse(1, [], [])),
    ]
    training = spark.createDataFrame(rows, ["label", "features"])

    # Map raw labels to indices; the classifier is trained on "indexed".
    label_indexer = StringIndexer(inputCol="label", outputCol="indexed")
    indexed_training = label_indexer.fit(training).transform(training)

    forest = RandomForestClassifier(
        numTrees=3,
        maxDepth=2,
        labelCol="indexed",
        seed=42,
    )
    forest_model = forest.fit(indexed_training)
    # Optional checks noted for the fitted model:
    #   forest_model.featureImportances          -> SparseVector(1, {0: 1.0})
    #   allclose(forest_model.treeWeights, [1.0, 1.0, 1.0]) -> True
    #   forest_model.trees                       -> list of DecisionTreeClassificationModel

    # Score a single row; the scored row is not used further.
    probe = spark.createDataFrame([(Vectors.dense(-1.0), )], ["features"])
    scored_row = forest_model.transform(probe).head()
    # Optional checks noted for the scored row:
    #   scored_row.prediction                    -> 0.0
    #   numpy.argmax(scored_row.probability)     -> 0
    #   numpy.argmax(scored_row.rawPrediction)   -> 0
    # A second probe, Vectors.sparse(1, [0], [1.0]), is noted to predict 1.0.

    output_dir = "."
    estimator_path = output_dir + "/rfc"
    fitted_model_path = output_dir + "/rfc_model"

    # Round-trip the unfitted estimator (noted: reloaded getNumTrees() -> 3).
    forest.write().overwrite().save(estimator_path)
    reloaded_forest = RandomForestClassifier.load(estimator_path)

    # Round-trip the fitted model.
    forest_model.write().overwrite().save(fitted_model_path)
    reloaded_model = RandomForestClassificationModel.load(fitted_model_path)
# Print a timestamp right before training. `now` must be assigned here:
# printing it without a prior assignment raises NameError, and taking a
# fresh reading makes the two printed timestamps bracket the fit below.
now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)

# Random forest trained on the 'features' column against the
# 'attack_cat_index' label, writing predictions to 'prediction'.
rf = RandomForestClassifier(labelCol='attack_cat_index',
                            featuresCol='features',
                            impurity='entropy',
                            seed=1234,
                            maxBins=136,
                            maxDepth=25,
                            featureSubsetStrategy='all',
                            predictionCol='prediction')
# NOTE: `rf` is rebound from the estimator to the fitted model here.
rf = rf.fit(train)

# Second timestamp, printed once fitting has returned.
now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)

# Persist the fitted model (overwriting any previous output), then score
# the held-out data.
model_output_path = "{}/data/RandomForest_extended.bin".format(base_path)
rf.write().overwrite().save(model_output_path)
result = rf.transform(test)

# Pull (label, prediction) pairs onto the driver as a list of 2-element lists.
prediction_df = result.select("attack_cat_index", "prediction").toPandas()
prediction_list = prediction_df[["attack_cat_index",
                                 "prediction"]].values.tolist()


# Create a function for the TPR (true positive rate)
def truePositiveRate(list, label):
    tot_count = 0
    true_count = 0
    for a in list:
        if a[0] == label:
            tot_count = tot_count + 1
            if a[1] == label: