Example #1
import numpy as np
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from sparkxgb import XGBoostEstimator  # assumed XGBoost-on-Spark wrapper providing XGBoostEstimator
from accident_prediction_montreal.preprocess import (
    get_positive_samples,
    get_negative_samples,
    get_dataset_df,
)
from accident_prediction_montreal.evaluate import evaluate_binary_classifier
from accident_prediction_montreal.workdir import workdir


def grid_search(spark):
    sample_ratio = 0.05
    neg_samples = get_negative_samples(spark).sample(sample_ratio).na.fill(0)
    pos_samples = get_positive_samples(spark).sample(sample_ratio).na.fill(0)
    df = get_dataset_df(spark, pos_samples, neg_samples).na.fill(0)
    trainDF, testDF = df.randomSplit([0.8, 0.2], seed=0)

    xgboost = XGBoostEstimator(featuresCol="features",
                               labelCol="label",
                               predictionCol="prediction")

    pipeline = Pipeline().setStages([xgboost])
    paramGrid = (ParamGridBuilder()
                 .addGrid(xgboost.max_depth, list(range(3, 20, 6)))
                 .addGrid(xgboost.eta, [float(x) for x in np.linspace(0.2, 0.6, 4)])
                 .addGrid(xgboost.scale_pos_weight,
                          [float(x) for x in np.linspace(0.03, 1.0, 3)])
                 .build())
    evaluator = BinaryClassificationEvaluator(labelCol="label",
                                              rawPredictionCol="probabilities",
                                              metricName="areaUnderPR")
    cv = (CrossValidator().setEstimator(pipeline).setEvaluator(evaluator)
          .setEstimatorParamMaps(paramGrid).setNumFolds(3)
          .setCollectSubModels(True))  # keep per-fold models for the report below

    cvModel = cv.fit(trainDF)

    # cvModel.bestModel is a PipelineModel; its last (and only) stage is the
    # fitted XGBoost classification model.
    bestModel = cvModel.bestModel.stages[-1]

    with open(workdir + 'data/xgboost_tuning_results_1.txt', 'w') as file:
        for sub_model, result in zip(cvModel.subModels[0], cvModel.avgMetrics):
            file.write('==================================\n')
            for stage in sub_model.stages:
                params = stage.extractParamMap()
                for k in params:
                    file.write(f'{k.name}: {params[k]}\n')
            file.write(f"Area under PR: {result}\n")

    prediction = bestModel.transform(testDF)
    prediction = prediction.withColumn("rawPrediction",
                                       prediction['probabilities'])
    area_under_PR, f1_score = evaluate_binary_classifier(prediction)

    with open(workdir + 'data/xgboost_tuning_perf_1.txt', 'w') as file:
        file.write(f"Area Under PR = {area_under_PR}\nF1 score = {f1_score}")

    return
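
A minimal driver for the grid_search routine above might look as follows; this is a sketch, assuming init_spark() from accident_prediction_montreal.utils (used in the later examples) returns a configured SparkSession:

# Hypothetical driver for grid_search; init_spark is the project's
# session factory, as imported in Examples #2 and #3.
from accident_prediction_montreal.utils import init_spark

if __name__ == '__main__':
    spark = init_spark()
    grid_search(spark)  # writes its tuning and test-set reports under workdir + 'data/'
    spark.stop()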
Example #2

#!/usr/bin/env python
from accident_prediction_montreal.preprocess import get_negative_samples, get_positive_samples
from accident_prediction_montreal.utils import init_spark
from accident_prediction_montreal.preprocess import get_dataset_df
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, \
                              CrossValidator
from pyspark.ml import Pipeline
from random_forest import get_feature_importances
from export_results import *

result_dir = create_result_dir('rf')
spark = init_spark()
neg_samples = get_negative_samples(spark).sample(0.5)
pos_samples = get_positive_samples(spark)

imbalance_ratio = (neg_samples.count()/pos_samples.count())

train_set, test_set = get_dataset_df(spark, pos_samples, neg_samples)
train_set, test_set = train_set.persist(), test_set.persist()

rf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            cacheNodeIds=True,
                            maxDepth=17,
                            impurity='entropy',
                            featureSubsetStrategy='sqrt',
                            minInstancesPerNode=10,
                            numTrees=100,
                            subsamplingRate=1.0,
                            maxMemoryInMB=768)
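
The snippet ends after constructing the estimator. A plausible continuation, sketched here with the evaluate_binary_classifier helper that Examples #1 and #4 import (its use here is an assumption, not part of the original script):

# Sketch only: fit the configured forest and score the held-out set.
from accident_prediction_montreal.evaluate import evaluate_binary_classifier  # assumed available

pipeline = Pipeline().setStages([rf])
model = pipeline.fit(train_set)
prediction = model.transform(test_set)
area_under_PR, f1_score = evaluate_binary_classifier(prediction)
print(f"Area Under PR = {area_under_PR}, F1 score = {f1_score}")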
Example #3

#!/usr/bin/env python
from accident_prediction_montreal.preprocess import get_negative_samples, get_positive_samples
from accident_prediction_montreal.utils import init_spark
from accident_prediction_montreal.workdir import workdir
spark = init_spark()
neg_samples = \
    get_negative_samples(spark,
                         save_to='data/negative-sample-new.parquet',
                         sample_ratio=1e-3)
print(neg_samples.count())
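
Because get_negative_samples is asked to persist the sample, a later run could presumably skip regeneration and reload the parquet file directly; a sketch, assuming save_to is resolved relative to workdir:

# Sketch: reload the persisted sample (path resolution is an assumption).
neg_samples = spark.read.parquet(workdir + 'data/negative-sample-new.parquet')
print(neg_samples.count())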
Example #4
# The original listing truncated this import; the module path is assumed,
# but the function name follows from the call below.
from accident_prediction_montreal.random_forest import (
    balanced_random_forest_tuning,
)
from accident_prediction_montreal.preprocess import (
    get_positive_samples,
    get_negative_samples,
    get_dataset_df,
)
from accident_prediction_montreal.evaluate import evaluate_binary_classifier
from accident_prediction_montreal.utils import init_spark
from accident_prediction_montreal.workdir import workdir

spark = init_spark()

i = 1
sampleFraction = 0.01

neg_samples = get_negative_samples(spark).sample(sampleFraction)
pos_samples = get_positive_samples(spark).sample(sampleFraction)
df = get_dataset_df(spark, pos_samples, neg_samples)
(train_set, test_set) = df.randomSplit([0.8, 0.2])
(train_set, test_set) = (train_set.persist(), test_set.persist())

model = balanced_random_forest_tuning(train_set)

with open(workdir + f'data/brf_tuning_results_{i}.txt', 'w') as file:
    for sub_model, result in zip(model.subModels[0], model.avgMetrics):
        file.write('==================================\n')
        for stage in sub_model.stages:
            params = stage.extractParamMap()
            for k in params:
                file.write(f'{k.name}: {params[k]}\n')
        file.write(f"Area under PR: {result}\n")