Example 1
def get_negative_samples(spark,
                         use_cache=True,
                         save_to=None,
                         road_limit=None,
                         year_limit=None,
                         year_ratio=None,
                         weather_df=None,
                         sample_ratio=None,
                         accident_df=None):
    """
    Note to self: 539 293 road, 43 848 generated dates,
    nb dates for 1 year : 8760

    year_limit: int or tuple of int
    """
    cache_path = workdir + 'data/negative-samples.parquet'
    if isdir(cache_path) and use_cache and save_to is None:
        return spark.read.parquet(cache_path)
    if save_to is not None:
        cache_path = workdir + save_to
        if isdir(cache_path):
            raise ValueError(f"Directory {save_to} already exists")

    road_df = get_road_df(spark, use_cache)
    road_features_df = \
        get_road_features_df(spark, road_df=road_df, use_cache=use_cache)
    road_df = road_features_df.select('street_id')
    dates_df = generate_dates_df(spark, year_limit, year_ratio)

    if road_limit is not None:
        road_df = road_df.limit(road_limit)

    # Every (generated date, road) pair is a candidate negative sample
    negative_samples = dates_df.crossJoin(road_df)

    if sample_ratio is not None:
        negative_samples = negative_samples.sample(sample_ratio)

    negative_samples = \
        negative_samples.withColumn('sample_id',
                                    monotonically_increasing_id())
    accident_df = preprocess_accidents(accident_df or get_accident_df(spark))
    if year_limit is not None:
        accident_df = accident_df.filter(year('date').isin(year_limit))

    weather_df = weather_df or get_weather_df(spark, accident_df)
    negative_samples = negative_samples.join(road_features_df, 'street_id')
    negative_sample_weather = \
        get_weather_information(negative_samples, weather_df)
    negative_samples = \
        negative_samples.join(negative_sample_weather, 'sample_id')
    negative_samples = add_date_features(negative_samples)
    negative_samples = add_solar_features(negative_samples)

    negative_samples = negative_samples.persist()

    if use_cache:
        negative_samples.write.parquet(cache_path)
    return negative_samples
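A minimal usage sketch for the function above, assuming an active SparkSession named spark and that workdir and the helper functions it calls are available in scope (the argument values are illustrative only):

negative_samples = get_negative_samples(
    spark,
    use_cache=False,          # bypass the data/negative-samples.parquet cache
    year_limit=[2016, 2017],  # restrict date generation and accident filtering
    sample_ratio=0.01,        # thin out the dates x roads cross join
)
negative_samples.printSchema()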
Example 2
def get_positive_samples(spark,
                         road_df=None,
                         weather_df=None,
                         year_limit=None,
                         use_cache=True,
                         limit=None):
    if isinstance(year_limit, int):
        year_limit = [year_limit]
    elif isinstance(year_limit, tuple):
        year_limit = list(year_limit)
    elif not ((year_limit is None) or isinstance(year_limit, list)):
        raise ValueError('Type of year_limit not authorized.')

    cache_path = workdir + 'data/positive-samples.parquet'
    if isdir(cache_path) and use_cache:
        return spark.read.parquet(cache_path)

    road_df = road_df or get_road_df(spark, use_cache)
    accident_df = get_accident_df(spark, use_cache)
    accident_df = preprocess_accidents(accident_df)

    if year_limit is not None:
        accident_df = accident_df.filter(year('date').isin(year_limit))
    if limit is not None:
        accident_df = accident_df.limit(limit)

    weather_df = weather_df or get_weather_df(spark, accident_df)
    road_features_df = \
        (get_road_features_df(spark, road_df=road_df, use_cache=use_cache)
         .drop('loc_lat', 'loc_long'))

    match_acc_road = match_accidents_with_roads(spark, road_df, accident_df)
    # Print the first 10 accident/road matches for inspection
    print(match_acc_road.head(10))
    accident_df = accident_df.withColumnRenamed('accident_id', 'sample_id')
    accident_weather = get_weather_information(accident_df, weather_df)
    positive_samples = (accident_df
                        .join(accident_weather, 'sample_id')
                        .withColumnRenamed('sample_id', 'accident_id')
                        .join(match_acc_road, 'accident_id')
                        .join(road_features_df, 'street_id')
                        .withColumnRenamed('accident_id', 'sample_id'))

    positive_samples = add_date_features(positive_samples)
    positive_samples = add_solar_features(positive_samples)

    #    positive_samples = positive_samples.persist(pyspark.StorageLevel.DISK_ONLY)

    if use_cache:
        positive_samples.write.parquet(cache_path)
    return positive_samples
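A hypothetical call, for illustration: year_limit may be an int, tuple, or list and is normalized to a list before the accident dates are filtered; use_cache=False skips the parquet cache so the arguments take effect:

positive_samples = get_positive_samples(spark,
                                        year_limit=(2016, 2017),
                                        limit=1000,
                                        use_cache=False)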
Example 3
def get_positive_samples(spark,
                         road_df=None,
                         weather_df=None,
                         year_limit=None,
                         use_cache=True,
                         limit=None):
    if isinstance(year_limit, int):
        year_limit = [year_limit]
    elif isinstance(year_limit, tuple):
        year_limit = list(year_limit)
    elif not ((year_limit is None) or isinstance(year_limit, list)):
        raise ValueError("Type of year_limit not authorized.")

    cache_path = workdir + "data/positive-samples.parquet"
    if isdir(cache_path) and use_cache:
        return spark.read.parquet(cache_path)

    road_df = road_df or get_road_df(spark, use_cache)
    accident_df = get_accident_df(spark, use_cache)
    accident_df = preprocess_accidents(accident_df)

    if year_limit is not None:
        accident_df = accident_df.filter(year("date").isin(year_limit))
    if limit is not None:
        accident_df = accident_df.limit(limit)

    weather_df = weather_df or get_weather_df(spark, accident_df)
    road_features_df = (get_road_features_df(spark,
                                              road_df=road_df,
                                              use_cache=use_cache)
                        .drop("loc_lat", "loc_long"))
    match_acc_road = match_accidents_with_roads(spark, road_df, accident_df)
    accident_df = accident_df.withColumnRenamed("accident_id", "sample_id")
    accident_weather = get_weather_information(accident_df, weather_df)
    positive_samples = (accident_df
                        .join(accident_weather, "sample_id")
                        .withColumnRenamed("sample_id", "accident_id")
                        .join(match_acc_road, "accident_id")
                        .join(road_features_df, "street_id")
                        .withColumnRenamed("accident_id", "sample_id"))

    positive_samples = add_date_features(positive_samples)
    positive_samples = add_solar_features(positive_samples)
    positive_samples = positive_samples.persist()

    if use_cache:
        positive_samples.write.parquet(cache_path)
    return positive_samples
Example 4
def get_dataset_df(spark, pos_samples, neg_samples):
    neg_samples = remove_positive_samples_from_negative_samples(
        neg_samples, pos_samples)
    pos_samples = pos_samples.withColumn("label", lit(1.0))
    neg_samples = neg_samples.withColumn("label", lit(0.0))

    # Make sure sample_id is unique
    neg_samples = neg_samples.withColumn("sample_id", -1 * col("sample_id"))

    pos_samples = pos_samples.select(*neg_samples.columns)

    train_pos, train_neg, test_pos, test_neg = train_test_split(
        pos_samples, neg_samples)

    roads = get_road_features_df(spark)
    accident_count = get_accident_count(spark, train_pos, roads)
    street_level_index = get_street_categories_index(spark, "street_level",
                                                     train_pos, roads)
    street_type_index = get_street_categories_index(spark, "street_type",
                                                    train_pos, roads)

    def prepare_dataset(va, pos_samples, neg_samples):
        dataset = pos_samples.union(neg_samples)
        dataset = (dataset
                   .join(accident_count, "street_id")
                   .join(street_level_index, "street_level")
                   .join(street_type_index, "street_type")
                   .drop("street_level", "street_type"))
        return va.transform(dataset).select("sample_id", "street_id", "date",
                                            "hour", "features", "label")

    va = VectorAssembler(outputCol="features",
                         inputCols=features_col,
                         handleInvalid="keep")

    train_set = prepare_dataset(va, train_pos, train_neg)
    test_set = prepare_dataset(va, test_pos, test_neg)

    return train_set, test_set
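A hedged end-to-end sketch of how these functions might be combined, assuming the sample-generation functions shown above and a SparkSession named spark (the argument values are illustrative):

pos_samples = get_positive_samples(spark, year_limit=[2016, 2017])
neg_samples = get_negative_samples(spark, year_limit=[2016, 2017], sample_ratio=0.01)
train_set, test_set = get_dataset_df(spark, pos_samples, neg_samples)
train_set.select('sample_id', 'features', 'label').show(5)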