def generate_match_accident_road_of_one_month(year, month):
    filepath = workdir + f'data/match_accident-road_{year}-{month}.parquet'
    if isdir(filepath):  # Skip if already done
        return
    print(f'Generating {year}-{month}')
    spark = init_spark()
    road_df = get_road_df(spark, use_cache=True)
    accident_df = preprocess_accidents(get_accident_df(spark))

    start_day_str = f'{year}-{month:02}-01'
    if month == 12:
        end_year, end_month = year + 1, 1
    else:
        end_year, end_month = year, month + 1
    end_day_str = f'{end_year}-{end_month:02}-01'

    start_day = datetime.datetime.fromisoformat(start_day_str)
    end_day = datetime.datetime.fromisoformat(end_day_str)
    accident_df = (accident_df.filter((col('date') >= start_day)
                                      & (col('date') < end_day)))

    match_accident_road = \
        match_accidents_with_roads(spark, road_df, accident_df,
                                   use_cache=False)

    match_accident_road.write.parquet(filepath)
    spark.stop()  # Force garbage collection and empty temp dir
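The December rollover above is the only fiddly part of the date range; the sketch below reproduces the same half-open month interval with the standard library alone (month_range is a hypothetical helper, not part of the project):

import datetime


def month_range(year, month):
    # Half-open interval [start, end) covering one calendar month,
    # mirroring the boundary computation in the function above.
    start = datetime.datetime(year, month, 1)
    if month == 12:
        end = datetime.datetime(year + 1, 1, 1)
    else:
        end = datetime.datetime(year, month + 1, 1)
    return start, end


print(month_range(2017, 12))
# (datetime.datetime(2017, 12, 1, 0, 0), datetime.datetime(2018, 1, 1, 0, 0))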
Example #2
def get_negative_samples(spark,
                         use_cache=True,
                         save_to=None,
                         road_limit=None,
                         year_limit=None,
                         year_ratio=None,
                         weather_df=None,
                         sample_ratio=None,
                         accident_df=None):
    """
    Note to self: 539 293 road, 43 848 generated dates,
    nb dates for 1 year : 8760

    year_limit: int or tuple of int
    """
    cache_path = workdir + 'data/negative-samples.parquet'
    if isdir(cache_path) and use_cache and save_to is None:
        return spark.read.parquet(cache_path)
    if save_to is not None:
        cache_path = workdir + save_to
        if isdir(cache_path):
            raise ValueError(f"Directory {save_to} already exists")

    road_df = get_road_df(spark, use_cache)
    road_features_df = \
        get_road_features_df(spark, road_df=road_df, use_cache=use_cache)
    road_df = road_features_df.select('street_id')
    dates_df = generate_dates_df(spark, year_limit, year_ratio)

    if road_limit is not None:
        road_df = road_df.limit(road_limit)

    negative_samples = (dates_df.crossJoin(road_df))

    if sample_ratio is not None:
        negative_samples = negative_samples.sample(sample_ratio)

    negative_samples = \
        negative_samples.withColumn('sample_id',
                                    monotonically_increasing_id())
    accident_df = preprocess_accidents(accident_df or get_accident_df(spark))
    if year_limit is not None:
        accident_df = accident_df.filter(year('date').isin(year_limit))

    weather_df = weather_df or get_weather_df(spark, accident_df)
    negative_samples = negative_samples.join(road_features_df, 'street_id')
    negative_sample_weather = \
        get_weather_information(negative_samples, weather_df)
    negative_samples = \
        negative_samples.join(negative_sample_weather, 'sample_id')
    negative_samples = add_date_features(negative_samples)
    negative_samples = add_solar_features(negative_samples)

    negative_samples = negative_samples.persist()

    if use_cache:
        negative_samples.write.parquet(cache_path)
    return negative_samples
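A minimal usage sketch for get_negative_samples, assuming only init_spark from the project's utils module (the same import the driver script below uses); the year and sampling ratio are illustrative values, not taken from the project:

from utils import init_spark

spark = init_spark()
# Restrict the road x date cross join to one year and keep roughly 1% of the
# generated (road, date) pairs; use_cache=False bypasses the parquet cache.
negatives = get_negative_samples(spark,
                                 use_cache=False,
                                 year_limit=[2017],
                                 sample_ratio=0.01)
negatives.select('sample_id', 'street_id').show(5)
spark.stop()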
Example #3
def get_positive_samples(spark,
                         road_df=None,
                         weather_df=None,
                         year_limit=None,
                         use_cache=True,
                         limit=None):
    if isinstance(year_limit, int):
        year_limit = [year_limit]
    elif isinstance(year_limit, tuple):
        year_limit = list(year_limit)
    elif not ((year_limit is None) or isinstance(year_limit, list)):
        raise ValueError('Type of year_limit not authorized.')

    cache_path = workdir + 'data/positive-samples.parquet'
    if isdir(cache_path) and use_cache:
        return spark.read.parquet(cache_path)

    road_df = road_df or get_road_df(spark, use_cache)
    accident_df = get_accident_df(spark, use_cache)
    accident_df = preprocess_accidents(accident_df)

    if year_limit is not None:
        accident_df = accident_df.filter(year('date').isin(year_limit))
    if limit is not None:
        accident_df = accident_df.limit(limit)

    weather_df = weather_df or get_weather_df(spark, accident_df)
    road_features_df = \
        (get_road_features_df(spark, road_df=road_df, use_cache=use_cache)
         .drop('loc_lat', 'loc_long'))

    match_acc_road = match_accidents_with_roads(spark, road_df, accident_df)
    # Debug output: preview the accident-road matches before the joins below
    print(match_acc_road.head(10))
    accident_df = accident_df.withColumnRenamed('accident_id', 'sample_id')
    accident_weather = get_weather_information(accident_df, weather_df)
    positive_samples = (accident_df
                        .join(accident_weather, 'sample_id')
                        .withColumnRenamed('sample_id', 'accident_id')
                        .join(match_acc_road, 'accident_id')
                        .join(road_features_df, 'street_id')
                        .withColumnRenamed('accident_id', 'sample_id'))

    positive_samples = add_date_features(positive_samples)
    positive_samples = add_solar_features(positive_samples)

    #    positive_samples = positive_samples.persist(pyspark.StorageLevel.DISK_ONLY)

    if use_cache:
        positive_samples.write.parquet(cache_path)
    return positive_samples
Example #4
def get_positive_samples(spark,
                         road_df=None,
                         weather_df=None,
                         year_limit=None,
                         use_cache=True,
                         limit=None):
    if isinstance(year_limit, int):
        year_limit = [year_limit]
    elif isinstance(year_limit, tuple):
        year_limit = list(year_limit)
    elif not ((year_limit is None) or isinstance(year_limit, list)):
        raise ValueError("Type of year_limit not authorized.")

    cache_path = workdir + "data/positive-samples.parquet"
    if isdir(cache_path) and use_cache:
        return spark.read.parquet(cache_path)

    road_df = road_df or get_road_df(spark, use_cache)
    accident_df = get_accident_df(spark, use_cache)
    accident_df = preprocess_accidents(accident_df)

    if year_limit is not None:
        accident_df = accident_df.filter(year("date").isin(year_limit))
    if limit is not None:
        accident_df = accident_df.limit(limit)

    weather_df = weather_df or get_weather_df(spark, accident_df)
    road_features_df = \
        (get_road_features_df(spark, road_df=road_df, use_cache=use_cache)
         .drop("loc_lat", "loc_long"))
    match_acc_road = match_accidents_with_roads(spark, road_df, accident_df)
    accident_df = accident_df.withColumnRenamed("accident_id", "sample_id")
    accident_weather = get_weather_information(accident_df, weather_df)
    positive_samples = (accident_df
                        .join(accident_weather, "sample_id")
                        .withColumnRenamed("sample_id", "accident_id")
                        .join(match_acc_road, "accident_id")
                        .join(road_features_df, "street_id")
                        .withColumnRenamed("accident_id", "sample_id"))

    positive_samples = add_date_features(positive_samples)
    positive_samples = add_solar_features(positive_samples)
    positive_samples = positive_samples.persist()

    if use_cache:
        positive_samples.write.parquet(cache_path)
    return positive_samples
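A matching usage sketch for get_positive_samples; again only init_spark from utils is assumed and the year is illustrative. Note that year_limit only takes effect when the parquet cache is bypassed, because the cached result is read back before the filter runs:

from utils import init_spark

spark = init_spark()
# Positive samples for a single year; use_cache=False skips
# data/positive-samples.parquet so the year_limit filter is applied.
positives = get_positive_samples(spark, year_limit=2017, use_cache=False)
print(positives.count())
spark.stop()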
#!/usr/bin/env python
import datetime
from os.path import isdir

from pyspark.sql.functions import col

import accident_prediction_montreal
from accidents_montreal import get_accident_df
from weather import get_weather_df
from utils import init_spark
from preprocess import preprocess_accidents

spark = init_spark()

accident_df = preprocess_accidents(get_accident_df(spark))
df = get_weather_df(spark, accident_df)

def generate_match_accident_road_of_one_month(year, month):
    filepath = workdir + f'data/match_accident-road_{year}-{month}.parquet'
    if isdir(filepath):  # Skip if already done
        return
    print(f'Generating {year}-{month}')
    spark = init_spark()
    road_df = get_road_df(spark, use_cache=True)
    accident_df = preprocess_accidents(get_accident_df(spark))

    start_day_str = f'{year}-{month:02}-01'
    if month == 12:
        end_year, end_month = year + 1, 1
    else:
        end_year, end_month = year, month + 1
    end_day_str = f'{end_year}-{end_month:02}-01'

    start_day = datetime.datetime.fromisoformat(start_day_str)
    end_day = datetime.datetime.fromisoformat(end_day_str)
    accident_df = (accident_df.filter((col('date') >= start_day)
                                      & (col('date') < end_day)))

    match_accident_road = \
        match_accidents_with_roads(spark, road_df, accident_df,
                                   use_cache=False)

    match_accident_road.write.parquet(filepath)
    spark.stop()  # Force garbage collection and empty temp dir


# Make sure to generate accidents DF
spark = init_spark()
get_accident_df(spark)
spark.stop()

for year in range(2012, 2018):
    for month in range(1, 13):
        generate_match_accident_road_of_one_month(year, month)

spark = init_spark()


def match_accident_road_samples_reader():
    for year in range(2012, 2018):
        for month in range(1, 13):
            filepath = workdir \
                + f'data/match_accident-road_{year}-{month}' \
                + '.parquet'