def get_negative_samples(spark,
                         use_cache=True,
                         save_to=None,
                         road_limit=None,
                         year_limit=None,
                         year_ratio=None,
                         weather_df=None,
                         sample_ratio=None,
                         accident_df=None):
    """Build, or load from the parquet cache, the negative-sample DataFrame.

    Negative samples are the cross join of generated dates with street ids,
    enriched with road features, weather information, date features and
    solar features.

    Sizing note kept from the original author: 539 293 roads, 43 848
    generated dates, 8 760 dates for one year.

    Args:
        spark: Spark session used for reading and for the helper calls.
        use_cache: When true and ``save_to`` is None, return the cached
            parquet if it exists; also forwarded to the road helpers and
            used to decide whether the result is written back.
        save_to: Optional path (relative to ``workdir``) where the result
            is written instead of the default cache location. Giving it
            disables reading from the default cache.
        road_limit: Optional maximum number of street ids to keep.
        year_limit: int or tuple of int; forwarded unchanged to
            ``generate_dates_df`` and used to filter accidents by year.
        year_ratio: Forwarded to ``generate_dates_df``.
        weather_df: Optional pre-loaded weather DataFrame; fetched with
            ``get_weather_df`` when None.
        sample_ratio: Optional fraction passed to ``DataFrame.sample`` on
            the cross-joined samples.
        accident_df: Optional raw accident DataFrame; fetched with
            ``get_accident_df`` when None.

    Returns:
        The persisted negative-sample DataFrame (or the cached one).

    Raises:
        ValueError: If ``save_to`` points to an existing directory.
    """
    cache_path = workdir + 'data/negative-samples.parquet'
    if isdir(cache_path) and use_cache and save_to is None:
        return spark.read.parquet(cache_path)
    if save_to is not None:
        cache_path = workdir + save_to
        if isdir(cache_path):
            raise ValueError(f"Directory {save_to} already exists")

    road_df = get_road_df(spark, use_cache)
    road_features_df = get_road_features_df(spark,
                                            road_df=road_df,
                                            use_cache=use_cache)
    # Only the street ids are needed for the cross join; the full feature
    # set is joined back afterwards.
    road_df = road_features_df.select('street_id')
    dates_df = generate_dates_df(spark, year_limit, year_ratio)

    if road_limit is not None:
        road_df = road_df.limit(road_limit)

    negative_samples = dates_df.crossJoin(road_df)
    if sample_ratio is not None:
        negative_samples = negative_samples.sample(sample_ratio)
    negative_samples = negative_samples.withColumn(
        'sample_id', monotonically_increasing_id())

    # Explicit None checks: do not rely on the truth value of a DataFrame.
    if accident_df is None:
        accident_df = get_accident_df(spark)
    accident_df = preprocess_accidents(accident_df)
    if year_limit is not None:
        # Column.isin only unpacks a single list/set argument, so a bare
        # int or a tuple is turned into a list first. Any other type is
        # passed through untouched.
        if isinstance(year_limit, int):
            years = [year_limit]
        elif isinstance(year_limit, tuple):
            years = list(year_limit)
        else:
            years = year_limit
        accident_df = accident_df.filter(year('date').isin(years))

    if weather_df is None:
        weather_df = get_weather_df(spark, accident_df)

    negative_samples = negative_samples.join(road_features_df, 'street_id')
    negative_sample_weather = get_weather_information(negative_samples,
                                                      weather_df)
    negative_samples = negative_samples.join(negative_sample_weather,
                                             'sample_id')
    negative_samples = add_date_features(negative_samples)
    negative_samples = add_solar_features(negative_samples)
    negative_samples = negative_samples.persist()

    # An explicit save_to is a request to write the result, even when the
    # caller disabled caching for the intermediate helpers.
    if use_cache or save_to is not None:
        negative_samples.write.parquet(cache_path)
    return negative_samples
def get_positive_samples(spark,
                         road_df=None,
                         weather_df=None,
                         year_limit=None,
                         use_cache=True,
                         limit=None):
    """Build, or load from the parquet cache, the positive-sample DataFrame.

    Positive samples are the preprocessed accidents joined with their
    weather information, their matched road and that road's features, then
    enriched with date and solar features.

    NOTE: this module defines ``get_positive_samples`` a second time further
    down; at import time that later definition replaces this one.

    Args:
        spark: Spark session used for reading and for the helper calls.
        road_df: Optional road DataFrame; fetched with ``get_road_df``
            when None.
        weather_df: Optional weather DataFrame; fetched with
            ``get_weather_df`` when None.
        year_limit: None, an int, or a tuple/list of ints; accidents are
            filtered to these years.
        use_cache: When true, return the cached parquet if it exists and
            write the result to the cache otherwise; also forwarded to the
            road and accident helpers.
        limit: Optional maximum number of accidents to keep.

    Returns:
        The positive-sample DataFrame (not persisted by this definition).

    Raises:
        ValueError: If ``year_limit`` is not None, an int, a tuple or a list.
    """
    if isinstance(year_limit, int):
        year_limit = [year_limit]
    elif isinstance(year_limit, tuple):
        year_limit = list(year_limit)
    elif not ((year_limit is None) or isinstance(year_limit, list)):
        raise ValueError('Type of year_limit not authorized.')

    # The cache is consulted before any filter is applied, so a cached
    # result is returned regardless of year_limit / limit.
    cache_path = workdir + 'data/positive-samples.parquet'
    if isdir(cache_path) and use_cache:
        return spark.read.parquet(cache_path)

    # Explicit None check: do not rely on the truth value of a DataFrame.
    if road_df is None:
        road_df = get_road_df(spark, use_cache)

    accident_df = preprocess_accidents(get_accident_df(spark, use_cache))
    if year_limit is not None:
        accident_df = accident_df.filter(year('date').isin(year_limit))
    if limit is not None:
        accident_df = accident_df.limit(limit)

    if weather_df is None:
        weather_df = get_weather_df(spark, accident_df)

    road_features_df = get_road_features_df(
        spark, road_df=road_df, use_cache=use_cache
    ).drop('loc_lat', 'loc_long')
    match_acc_road = match_accidents_with_roads(spark, road_df, accident_df)

    # get_weather_information is called with a 'sample_id' column, so the
    # accident id is renamed before the call.
    accident_df = accident_df.withColumnRenamed('accident_id', 'sample_id')
    accident_weather = get_weather_information(accident_df, weather_df)

    with_weather = accident_df.join(accident_weather, 'sample_id')
    # Temporarily restore 'accident_id' to join against the road matching,
    # then switch back to 'sample_id' for the final schema.
    positive_samples = (
        with_weather
        .withColumnRenamed('sample_id', 'accident_id')
        .join(match_acc_road, 'accident_id')
        .join(road_features_df, 'street_id')
        .withColumnRenamed('accident_id', 'sample_id')
    )

    positive_samples = add_date_features(positive_samples)
    positive_samples = add_solar_features(positive_samples)

    if use_cache:
        positive_samples.write.parquet(cache_path)
    return positive_samples
def get_positive_samples(spark,
                         road_df=None,
                         weather_df=None,
                         year_limit=None,
                         use_cache=True,
                         limit=None):
    """Build, or load from the parquet cache, the positive-sample DataFrame.

    Each positive sample is a preprocessed accident joined with its weather
    information, its matched road and that road's features, then enriched
    with date and solar features.

    Caveat: the cache lookup happens before ``year_limit`` and ``limit``
    are applied, and the result is written to the same cache path whatever
    their values. With ``use_cache=True`` a filtered run can therefore
    populate the cache that later unfiltered calls read back.

    Args:
        spark: Spark session used for reading and for the helper calls.
        road_df: Optional road DataFrame; fetched with ``get_road_df``
            when None.
        weather_df: Optional weather DataFrame; fetched with
            ``get_weather_df`` when None.
        year_limit: None, an int, or a tuple/list of ints; accidents are
            filtered to these years.
        use_cache: When true, return the cached parquet if it exists and
            write the result to the cache otherwise; also forwarded to the
            road and accident helpers.
        limit: Optional maximum number of accidents to keep.

    Returns:
        The persisted positive-sample DataFrame (or the cached one).

    Raises:
        ValueError: If ``year_limit`` is not None, an int, a tuple or a list.
    """
    # Normalise year_limit to a list (or None) so that Column.isin receives
    # a form it unpacks.
    if isinstance(year_limit, int):
        year_limit = [year_limit]
    elif isinstance(year_limit, tuple):
        year_limit = list(year_limit)
    elif not ((year_limit is None) or isinstance(year_limit, list)):
        raise ValueError("Type of year_limit not authorized.")

    cache_path = workdir + "data/positive-samples.parquet"
    if isdir(cache_path) and use_cache:
        return spark.read.parquet(cache_path)

    # Explicit None checks: the truth value of a DataFrame-like object is
    # not a reliable "was it provided" test.
    if road_df is None:
        road_df = get_road_df(spark, use_cache)

    accident_df = get_accident_df(spark, use_cache)
    accident_df = preprocess_accidents(accident_df)
    if year_limit is not None:
        accident_df = accident_df.filter(year("date").isin(year_limit))
    if limit is not None:
        accident_df = accident_df.limit(limit)

    if weather_df is None:
        weather_df = get_weather_df(spark, accident_df)

    road_features_df = get_road_features_df(
        spark, road_df=road_df, use_cache=use_cache
    ).drop("loc_lat", "loc_long")
    match_acc_road = match_accidents_with_roads(spark, road_df, accident_df)

    # get_weather_information is called with a 'sample_id' column, so the
    # accident id is renamed before the call.
    accident_df = accident_df.withColumnRenamed("accident_id", "sample_id")
    accident_weather = get_weather_information(accident_df, weather_df)

    # Temporarily restore 'accident_id' to join against the road matching,
    # then switch back to 'sample_id' for the final schema.
    positive_samples = (
        accident_df
        .join(accident_weather, "sample_id")
        .withColumnRenamed("sample_id", "accident_id")
        .join(match_acc_road, "accident_id")
        .join(road_features_df, "street_id")
        .withColumnRenamed("accident_id", "sample_id")
    )

    positive_samples = add_date_features(positive_samples)
    positive_samples = add_solar_features(positive_samples)
    positive_samples = positive_samples.persist()

    if use_cache:
        positive_samples.write.parquet(cache_path)
    return positive_samples