def transform_into_traffic_dataset(_, trip_dataset: TripDataFrame) -> TrafficDataFrame:
    """Reduce per-trip records to one peak hourly traffic figure per date.

    Trips are grouped by ``interval_date``. Each group is sampled over 24
    consecutive one-hour windows and the largest per-window trip count is
    kept as that date's peak traffic load.

    Args:
        _: First positional argument; not used by this function.
        trip_dataset: Trip records providing the ``interval_date``,
            ``start_time`` and ``end_time`` columns.

    Returns:
        A ``TrafficDataFrame`` built from a frame whose columns are
        ``interval_date`` and ``peak_traffic_load``.
    """

    def peak_hourly_load(day_trips):
        """Return the largest number of trips touching any one-hour window."""
        # ``day_trips.name`` is presumably the group key attached by
        # ``groupby(...).apply`` (the interval date) -- verify against the
        # pandas version in use.
        window_starts = date_range(day_trips.name, periods=24, freq='h')
        started = day_trips['start_time']
        ended = day_trips['end_time']

        hourly_counts = []
        for window_start in window_starts:
            window_end = window_start + timedelta(hours=1)

            # Trip started within the sample window.
            begins_inside = (window_start <= started) & (started < window_end)
            # Trip ended within the sample window.
            finishes_inside = (window_start <= ended) & (ended < window_end)
            # Trip started before the window and ended at or after its end.
            covers_window = (started < window_start) & (ended >= window_end)

            # A trip counts as in transit when any of the three cases holds.
            in_transit = day_trips[begins_inside | finishes_inside | covers_window]
            hourly_counts.append(len(in_transit))

        return max(hourly_counts)

    peak_by_date = trip_dataset.groupby(['interval_date']).apply(peak_hourly_load)
    traffic_frame = DataFrame(peak_by_date).reset_index()
    traffic_frame.columns = ['interval_date', 'peak_traffic_load']
    return TrafficDataFrame(traffic_frame)
def preprocess_trip_dataset(_, dataframe: DataFrame) -> TripDataFrame:
    """Select and type-normalise the trip columns of a raw frame.

    Keeps ``bike_id``, ``start_time`` and ``end_time``, drops rows in which
    all three are missing, casts ``bike_id`` to ``int64``, parses both time
    columns with ``to_datetime`` and adds an ``interval_date`` column holding
    the calendar date of ``start_time``.

    Args:
        _: First positional argument; not used by this function.
        dataframe: Raw trip data containing at least the three columns above.

    Returns:
        A ``TripDataFrame`` built from the cleaned frame.
    """
    trip_columns = ['bike_id', 'start_time', 'end_time']

    # NOTE(review): ``reindex()`` is called without arguments, so it does not
    # look like it renumbers the rows -- confirm whether ``reset_index`` was
    # intended.
    trips = dataframe[trip_columns].dropna(how='all').reindex()

    # NOTE(review): only rows where every selected column is missing are
    # dropped, so a row with a missing ``bike_id`` alone would presumably make
    # this cast raise -- confirm the input guarantees.
    trips['bike_id'] = trips['bike_id'].astype('int64')

    for time_column in ('start_time', 'end_time'):
        trips[time_column] = to_datetime(trips[time_column])

    trips['interval_date'] = trips['start_time'].apply(
        lambda started_at: started_at.date()
    )
    return TripDataFrame(trips)