import numpy as np
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from sparkxgb import XGBoostEstimator  # assumed import path for the xgboost4j-spark Python wrapper
from preprocess import get_negative_samples, get_positive_samples, get_dataset_df
from evaluate import evaluate_binary_classifier
from workdir import workdir


def grid_search(spark):
    sample_ratio = 0.05
    neg_samples = get_negative_samples(spark).sample(sample_ratio).na.fill(0)
    pos_samples = get_positive_samples(spark).sample(sample_ratio).na.fill(0)
    df = get_dataset_df(spark, pos_samples, neg_samples).na.fill(0)
    trainDF, testDF = df.randomSplit([0.8, 0.2], seed=0)

    xgboost = XGBoostEstimator(featuresCol="features",
                               labelCol="label",
                               predictionCol="prediction")

    pipeline = Pipeline().setStages([xgboost])
    paramGrid = (ParamGridBuilder()
                 .addGrid(xgboost.max_depth, list(range(3, 20, 6)))
                 .addGrid(xgboost.eta, list(np.linspace(0.2, 0.6, 4)))
                 .addGrid(xgboost.scale_pos_weight, list(np.linspace(0.03, 1.0, 3)))
                 .build())
    evaluator = BinaryClassificationEvaluator(labelCol="label",
                                              rawPredictionCol="probabilities",
                                              metricName="areaUnderPR")
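    # areaUnderPR rather than areaUnderROC: precision-recall is the more
    # informative metric on a heavily imbalanced accident dataset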
    cv = (CrossValidator()
          .setEstimator(pipeline)
          .setEvaluator(evaluator)
          .setEstimatorParamMaps(paramGrid)
          .setNumFolds(3)
          .setCollectSubModels(True))  # collect per-fold models so cvModel.subModels is populated below

    cvModel = cv.fit(trainDF)

    # cvModel.bestModel is a PipelineModel; the pipeline has a single stage,
    # so the fitted XGBoost model is stages[0]
    bestModel = cvModel.bestModel.stages[0]

    with open(workdir + 'data/xgboost_tuning_results_1.txt', 'w') as file:
        for sub_model, result in zip(cvModel.subModels[0], cvModel.avgMetrics):
            file.write('==================================\n')
            for stage in sub_model.stages:
                params = stage.extractParamMap()
                for k in params:
                    file.write(f'{k.name}: {params[k]}\n')
            file.write(f"Area under PR: {result}\n")

    prediction = bestModel.transform(testDF)
    # presumably evaluate_binary_classifier reads a rawPrediction column, so
    # alias the XGBoost probabilities output under that name
    prediction = prediction.withColumn("rawPrediction",
                                       prediction['probabilities'])
    area_under_PR, f1_score = evaluate_binary_classifier(prediction)

    with open(workdir + 'data/xgboost_tuning_perf_1.txt', 'w') as file:
        file.write(f"Area Under PR = {area_under_PR}\nF1 score = {f1_score}")

    return
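A minimal driver sketch for grid_search, assuming the project's init_spark helper from utils (used by the other examples below); this is illustrative, not part of the original listing:

if __name__ == '__main__':
    from utils import init_spark  # project helper, seen in the other examples

    spark = init_spark()
    grid_search(spark)
    spark.stop()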
Example #2
#!/usr/bin/env python
import accident_prediction_montreal
from pyspark.sql.functions import udf, min, max, col
from pyspark.sql.types import FloatType
from pyspark.ml.feature import VectorAssembler
from preprocess import get_negative_samples, get_positive_samples
from utils import init_spark
from preprocess import get_dataset_df
from export_results import *

result_dir = create_result_dir('base')
spark = init_spark()
neg_samples = get_negative_samples(spark).sample(0.5)
pos_samples = get_positive_samples(spark)

imbalance_ratio = (neg_samples.count() / pos_samples.count())
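# Note: the negative/positive ratio computed here is the usual value fed to a
# class weight such as scale_pos_weight when correcting for class imbalance.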

train_set, test_set = get_dataset_df(spark, pos_samples, neg_samples)
train_set, test_set = train_set.persist(), test_set.persist()

get_accidents_count = udf(lambda v: float(v[7]), FloatType())
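# Assumption: index 7 of the assembled 'features' vector holds the historical
# accident count for the sample; the udf above extracts it as a plain float.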


def fit(train_set):
    accidents_count = train_set.select(
        get_accidents_count('features').alias('accidents_count'), 'label')
    accidents_count_to_proba = []
    for i in range(377):
        accidents_count_higher = \
                accidents_count.filter(col('accidents_count') >= i)
        # completion inferred from context (the source truncates this example
        # here): P(label = 1 | accidents_count >= i)
        proba = (accidents_count_higher.filter(col('label') == 1.0).count() /
                 accidents_count_higher.count())
        accidents_count_to_proba.append(proba)

Example #3
from random_forest import balanced_random_forest_tuning, \
                          compute_precision_recall, \
                          compute_precision_recall_graph
from preprocess import get_positive_samples, \
                       get_negative_samples, \
                       get_dataset_df
from evaluate import evaluate_binary_classifier
from utils import init_spark
from workdir import workdir

spark = init_spark()

i = 1
sampleFraction = 0.01

neg_samples = get_negative_samples(spark).sample(sampleFraction)
pos_samples = get_positive_samples(spark).sample(sampleFraction)
df = get_dataset_df(spark, pos_samples, neg_samples)
(train_set, test_set) = df.randomSplit([0.8, 0.2])
(train_set, test_set) = (train_set.persist(), test_set.persist())

model = balanced_random_forest_tuning(train_set)

with open(workdir + f'data/brf_tuning_results_{i}.txt', 'w') as file:
    for sub_model, result in zip(model.subModels[0], model.avgMetrics):
        file.write('==================================\n')
        for stage in sub_model.stages:
            params = stage.extractParamMap()
            for k in params:
                file.write(f'{k.name}: {params[k]}\n')
        file.write(f"Area under PR: {result}\n")
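
# Hedged follow-up (an assumption, not shown in the source listing): the
# unused imports above suggest the best tuned model is then scored on the
# held-out split.
best_model = model.bestModel
prediction = best_model.transform(test_set)
area_under_PR, f1_score = evaluate_binary_classifier(prediction)
print(f'Area Under PR = {area_under_PR}\nF1 score = {f1_score}')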
Example #4
#!/usr/bin/env python
import accident_prediction_montreal
from preprocess import get_negative_samples, get_positive_samples
from utils import init_spark
from workdir import workdir
spark = init_spark()
neg_samples = \
    get_negative_samples(spark,
                         save_to='data/negative-sample-new.parquet',
                         sample_ratio=1e-3)
print(neg_samples.count())
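# Sanity check: with sample_ratio=1e-3 the printed count should be roughly
# 0.1% of the full negative-sample set; save_to presumably caches the sampled
# set as parquet for later runs.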