def generate_match_accident_road_of_one_month(year, month):
    """Match one calendar month of accidents with roads and save as parquet.

    The result is written to
    ``workdir + 'data/match_accident-road_{year}-{month}.parquet'``. If that
    path already exists as a directory, the month is treated as already
    generated and the function returns without starting Spark.

    Args:
        year: Calendar year of the month to process (int).
        month: Calendar month to process, 1-12 (int).

    Returns:
        None. The output is the parquet dataset written to ``filepath``.

    Raises:
        ValueError: If ``year``/``month`` do not form a valid date. This is
            raised before a Spark session is created.
    """
    filepath = workdir + f'data/match_accident-road_{year}-{month}.parquet'
    if isdir(filepath):  # Skip if already done
        return
    print(f'Generating {year}-{month}')

    # Half-open interval [start_day, end_day) covering the requested month.
    # December rolls over to January of the following year.
    start_day = datetime.datetime(year, month, 1)
    if month == 12:
        end_day = datetime.datetime(year + 1, 1, 1)
    else:
        end_day = datetime.datetime(year, month + 1, 1)

    spark = init_spark()
    try:
        road_df = get_road_df(spark, use_cache=True)
        accident_df = preprocess_accidents(get_accident_df(spark))
        accident_df = accident_df.filter(
            (col('date') >= start_day) & (col('date') < end_day)
        )
        match_accident_road = match_accidents_with_roads(
            spark, road_df, accident_df, use_cache=False
        )
        match_accident_road.write.parquet(filepath)
    finally:
        # Always stop the session, even when a step above fails, so the
        # session is not leaked. Stopping forces garbage collection and
        # empties the temp dir.
        spark.stop()
#!/usr/bin/env python
# Script: load the accident data, preprocess it, and hand the result to
# get_weather_station_id_df.
from accident_prediction_montreal.accidents_montreal import get_accident_df
from accident_prediction_montreal.weather import get_weather_station_id_df
from accident_prediction_montreal.utils import init_spark
from accident_prediction_montreal.preprocess import preprocess_accidents

spark = init_spark()

# Load the raw accidents first, then run the preprocessing step on them.
raw_accident_df = get_accident_df(spark)
accident_df = preprocess_accidents(raw_accident_df)

# The return value is intentionally discarded here.
# NOTE(review): presumably this call is made for its side effects
# (e.g. computing/caching the station ids) -- confirm against
# accident_prediction_montreal.weather.
get_weather_station_id_df(spark, accident_df)
def spark():
    """Return the object produced by ``init_spark()``.

    NOTE(review): presumably a Spark session -- confirm against
    ``init_spark``.
    """
    session = init_spark()
    return session