from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, TrainValidationSplit

# RandomUnderSampler and ClassWeighter are project-specific pipeline stages
# (they are not part of pyspark.ml); import them from the project's own module.


def balanced_random_forest_tuning(train_samples):
    rf = RandomForestClassifier(labelCol="label",
                                featuresCol="features",
                                cacheNodeIds=True,
                                weightCol="weight")
    ru = RandomUnderSampler().setIndexCol('id')
    cw = ClassWeighter()
    pipeline = Pipeline().setStages([ru, cw, rf])
    paramGrid = \
        (ParamGridBuilder()
         .addGrid(rf.numTrees, [50, 75, 100])
         .addGrid(rf.featureSubsetStrategy, ['sqrt'])
         .addGrid(rf.impurity, ['gini', 'entropy'])
         .addGrid(rf.maxDepth, [5, 15, 30])
         .addGrid(rf.minInstancesPerNode, [1])
         .addGrid(rf.subsamplingRate, [1.0, 0.66, 0.4])
         .addGrid(cw.classWeight, [[1/36, 1.0], [1/9.0, 1.0]])
         .addGrid(ru.targetImbalanceRatio, [9.0, 36.0])
         .build())
    pr_evaluator = \
        BinaryClassificationEvaluator(labelCol="label",
                                      rawPredictionCol="rawPrediction",
                                      metricName="areaUnderPR")
    # 4-fold cross-validation over the full grid, selecting the configuration
    # with the highest average area under the precision-recall curve.
    cv = CrossValidator(estimator=pipeline,
                        estimatorParamMaps=paramGrid,
                        evaluator=pr_evaluator,
                        numFolds=4,
                        collectSubModels=True)

    model = cv.fit(train_samples)

    return model
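
ClassWeighter (and RandomUnderSampler) are custom stages defined by the project, not part of pyspark.ml. A minimal, hypothetical sketch of what such a weighting stage could look like, assuming a binary "label" column and the Param-based pattern that lets ParamGridBuilder tune cw.classWeight above:

from pyspark.ml import Transformer
from pyspark.ml.param import Param, Params
from pyspark.sql import functions as F


class SimpleClassWeighter(Transformer):
    """Hypothetical stand-in for the project's ClassWeighter: adds a 'weight'
    column mapping label 0/1 to [weight_for_0, weight_for_1]."""

    classWeight = Param(Params._dummy(), "classWeight",
                        "per-class weights as [weight_for_0, weight_for_1]")

    def __init__(self, classWeight=(1.0, 1.0)):
        super().__init__()
        self._setDefault(classWeight=list(classWeight))

    def setClassWeight(self, value):
        return self._set(classWeight=list(value))

    def _transform(self, df):
        w0, w1 = self.getOrDefault(self.classWeight)
        return df.withColumn(
            "weight",
            F.when(F.col("label") == 1, F.lit(float(w1)))
             .otherwise(F.lit(float(w0))))

The project's RandomUnderSampler presumably follows the same Param pattern (targetImbalanceRatio, indexCol) but resamples rows instead of adding a column.
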
def random_forest_tuning(train_samples):
    rf = RandomForestClassifier(
        labelCol="label", featuresCol="features", cacheNodeIds=True
    )
    ru = RandomUnderSampler().setIndexCol("id")
    pipeline = Pipeline().setStages([ru, rf])
    paramGrid = (
        ParamGridBuilder()
        .addGrid(rf.numTrees, [50, 75, 100])
        .addGrid(rf.featureSubsetStrategy, ["sqrt"])
        .addGrid(rf.impurity, ["gini", "entropy"])
        .addGrid(rf.maxDepth, [5, 15, 30])
        .addGrid(rf.minInstancesPerNode, [1])
        .addGrid(rf.subsamplingRate, [1.0, 0.6, 0.4])
        .addGrid(ru.targetImbalanceRatio, [1.0, 1.5, 2.0])
        .build()
    )
    pr_evaluator = BinaryClassificationEvaluator(
        labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderPR"
    )
    tvs = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=pr_evaluator,
        trainRatio=0.8,
        collectSubModels=True,
    )

    model = tvs.fit(train_samples)

    return model
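
Both functions return a fitted tuning model. A minimal usage sketch, assuming train_df is a DataFrame with 'id', 'features' and 'label' columns:

cv_model = balanced_random_forest_tuning(train_df)    # CrossValidatorModel
tvs_model = random_forest_tuning(train_df)            # TrainValidationSplitModel

# One mean cross-validated areaUnderPR per grid point ...
print(cv_model.avgMetrics)
# ... versus one hold-out areaUnderPR per grid point.
print(tvs_model.validationMetrics)

# In both cases bestModel is the winning PipelineModel; its last stage is the
# fitted random forest.
best_rf = cv_model.bestModel.stages[-1]
print(best_rf.extractParamMap())
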
Example #3
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

# pos_samples / neg_samples are assumed to be existing DataFrames of positive
# and negative examples, spark is the active SparkSession, result_dir is an
# output directory, and get_dataset_df, write_params, write_results,
# get_feature_importances and RandomUnderSampler are project-specific helpers.
imbalance_ratio = (neg_samples.count() / pos_samples.count())

train_set, test_set = get_dataset_df(spark, pos_samples, neg_samples)
train_set, test_set = train_set.persist(), test_set.persist()

rf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            cacheNodeIds=True,
                            maxDepth=17,
                            impurity='entropy',
                            featureSubsetStrategy='sqrt',
                            minInstancesPerNode=10,
                            numTrees=100,
                            subsamplingRate=1.0,
                            maxMemoryInMB=768)
ru = (RandomUnderSampler()
      .setIndexCol('sample_id')
      .setTargetImbalanceRatio(1.0))
pipeline = Pipeline().setStages([ru, rf])
model = pipeline.fit(train_set)
predictions = model.transform(test_set).persist()
train_predictions = model.transform(train_set).persist()
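
Before writing the results out, a quick sanity check of the fitted pipeline on both sets, using the same metric as during tuning, can be useful; a small sketch using the predictions computed above:

from pyspark.ml.evaluation import BinaryClassificationEvaluator

pr_evaluator = BinaryClassificationEvaluator(labelCol="label",
                                             rawPredictionCol="rawPrediction",
                                             metricName="areaUnderPR")
print("test  areaUnderPR:", pr_evaluator.evaluate(predictions))
print("train areaUnderPR:", pr_evaluator.evaluate(train_predictions))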

write_params(model, result_dir)
write_results(predictions, train_predictions, result_dir)

# Write feature importances
feature_importances = get_feature_importances(model.stages[1])
feature_importances.to_csv(result_dir + '/feature_importances.csv')
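
get_feature_importances is another project helper; a hypothetical equivalent that turns the fitted forest's featureImportances vector into a sorted pandas DataFrame (feature_names is an assumed, optional argument) might look like this:

import pandas as pd


def get_feature_importances_sketch(rf_model, feature_names=None):
    """Hypothetical equivalent of get_feature_importances: returns the
    forest's featureImportances as a pandas DataFrame sorted by importance."""
    importances = rf_model.featureImportances.toArray()
    if feature_names is None:
        feature_names = [f"f{i}" for i in range(len(importances))]
    return (pd.DataFrame({"feature": feature_names, "importance": importances})
            .sort_values("importance", ascending=False)
            .reset_index(drop=True))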