def extract_feature(self):
    """Compute the average speed per road type.

    Loads the train and test speed readings (which sources are used depends
    on ``self.mode``), joins them with the sensor registry on (KEY, KM) and
    averages SPEED_AVG grouped by ROAD_TYPE.

    Returns:
        pd.DataFrame with columns ROAD_TYPE and avg_speed_roadtype.

    Raises:
        ValueError: if ``self.mode`` is neither 'local' nor 'full'.
    """
    print('Loading datasets')
    if self.mode == 'local':
        tr = data.speeds_original('train')
        te = data.speed_test_masked()
    elif self.mode == 'full':
        tr = data.speeds(mode='full')
        te = data.speeds_original('test2')
    else:
        # Fail fast: previously an unknown mode left speeds=None and the
        # function crashed later with an opaque AttributeError on .dropna().
        raise ValueError(
            "Unknown mode: {!r} (expected 'local' or 'full')".format(self.mode))
    speeds = pd.concat([tr, te])
    del tr, te  # free the per-split frames early; the concat can be large
    sensors = data.sensors()
    print('Done')
    # Inner join on sensor identity; rows with any NaN speed stats are dropped
    # before joining.
    df = pd.merge(speeds.dropna(), sensors,
                  left_on=[KEY, KM], right_on=[KEY, KM])
    df[DATETIME] = pd.to_datetime(df.DATETIME_UTC)
    return df[['ROAD_TYPE', 'SPEED_AVG']].groupby('ROAD_TYPE').mean() \
        .reset_index() \
        .rename(columns={'SPEED_AVG': 'avg_speed_roadtype'})
def extract_feature(self):
    """For every sensor and 15-minute timestamp, attach the hourly speed
    aggregates measured one day earlier ("D-1" features).

    Returns:
        pd.DataFrame with KEY, KM, DATETIME_UTC plus the previous-day
        columns SPEED_AVG_D-1, SPEED_MAX_D-1, SPEED_MIN_D-1, SPEED_SD_D-1
        and N_VEHICLES_D-1.
    """
    # Full observed timeline: train speeds + (presumably masked) test speeds
    # — TODO confirm what speed_test_masked() returns.
    tr = data.speeds_original('train')
    te = data.speed_test_masked()
    speeds = pd.concat([tr, te])
    del tr
    del te
    print('Extracting min and max timestamps...')
    min_datetime = speeds.DATETIME_UTC.min()
    max_datetime = speeds.DATETIME_UTC.max()
    sensors = data.sensors().drop_duplicates([KEY, KM])
    print('Done')
    # Regular 15-minute grid covering the whole observed period.
    datetimes_df = pd.DataFrame(
        pd.date_range(min_datetime, max_datetime, freq='15min').to_series()).reset_index()
    datetimes_df[DATETIME] = pd.to_datetime(datetimes_df['index'])
    datetimes_df = datetimes_df[[DATETIME]]
    print('Shifting hours')
    # Map each 15-min slot to the hour bucket of the PREVIOUS day:
    # floor to the hour, then subtract one day (DateOffset(1) == 1 day).
    datetimes_df['DATETIME_HOUR'] = pd.to_datetime(
        datetimes_df[DATETIME]).apply(lambda x: x.floor('1H'))
    datetimes_df[
        'DATETIME_HOUR'] = datetimes_df['DATETIME_HOUR'] - pd.DateOffset(1)
    print('Done')
    print('Creating skeleton')
    # Cross join (constant MERGE key) of every sensor with every timestamp.
    datetimes_df['MERGE'] = 0
    sensors['MERGE'] = 0
    skeleton = pd.merge(sensors[[KEY, KM, 'MERGE']], datetimes_df, on='MERGE')
    skeleton[DATETIME] = pd.to_datetime(skeleton[DATETIME])
    skeleton.set_index(DATETIME, inplace=True)
    print('Done')
    print('Merging with speeds..')
    # Hourly mean of the speed statistics, computed per sensor (KEY, KM).
    resampled_speeds = speeds\
        .groupby([KEY, KM])\
        .apply(lambda x: x.set_index(DATETIME)\
        .resample('H').mean()[[SPEED_AVG, SPEED_MAX, SPEED_MIN, SPEED_SD, N_CARS]]).reset_index()
    skeleton_merge = skeleton.reset_index()
    # Inner join: skeleton rows whose previous-day hour has no measurement
    # are dropped here.
    df = pd.merge(skeleton_merge,
                  resampled_speeds,
                  left_on=[KEY, KM, 'DATETIME_HOUR'],
                  right_on=[KEY, KM, DATETIME])
    # DATETIME_UTC_x is the skeleton timestamp; the right-hand (D-1) hour
    # keeps its suffixed name and is not returned to callers explicitly.
    df = df.rename(
        columns={
            'DATETIME_UTC_x': 'DATETIME_UTC',
            SPEED_AVG: 'SPEED_AVG_D-1',
            SPEED_MAX: 'SPEED_MAX_D-1',
            SPEED_MIN: 'SPEED_MIN_D-1',
            SPEED_SD: 'SPEED_SD_D-1',
            N_CARS: 'N_VEHICLES_D-1'
        })
    print('Done')
    return df
def extract_feature(self):
    """Attach to each base-dataset row the SPEED_AVG measured at the two
    physically adjacent sensors (previous/next KM on the same road KEY) for
    each of the 4 timesteps before the event, plus the KM distances to them.

    Returns:
        pd.DataFrame with KEY, KM, DATETIME_UTC_-1..-4,
        SPEED_AVG_BEFORE_-1..-4, SPEED_AVG_AFTER_-1..-4,
        DELTA_BEFORE and DELTA_AFTER.
    """
    print('Reading data...')
    df = data.base_dataset()
    sensors = data.sensors().drop_duplicates().sort_values([KEY, KM])
    speeds = pd.concat([data.speeds_original('train'),
                        data.speeds_original('test'),
                        data.speeds_original('test2')]).drop_duplicates()
    # Neighbor KMs via shift on the (KEY, KM)-sorted sensor list; a shift
    # crossing a road boundary (KEY changes) is invalidated with NaN below.
    sensors['KM_BEFORE'] = sensors['KM'].shift(1)
    sensors['KEY_BEFORE'] = sensors['KEY'].shift(1)
    sensors['KM_AFTER'] = sensors['KM'].shift(-1)
    sensors['KEY_AFTER'] = sensors['KEY'].shift(-1)
    sensors.loc[sensors.KEY_AFTER != sensors.KEY, 'KM_AFTER'] = np.nan
    sensors.loc[sensors.KEY_BEFORE != sensors.KEY, 'KM_BEFORE'] = np.nan
    sensors.drop(['KEY_BEFORE', 'KEY_AFTER'], axis=1, inplace=True)
    sensors = sensors[[KEY, KM, 'KM_BEFORE', 'KM_AFTER']]
    merged = pd.merge(df, sensors, left_on=[KEY, KM], right_on=[KEY, KM])
    print('creating features...')
    for i in range(1, 5):
        # Column names for timestep -i (i steps before the event).
        speed_avg_before = 'SPEED_AVG_BEFORE_-' + str(i)
        speed_avg_after = 'SPEED_AVG_AFTER_-' + str(i)
        datetime = 'DATETIME_UTC_-' + str(i)
        # Aliased copies of SPEED_AVG so each merge brings in a uniquely
        # named column.
        speeds[speed_avg_before] = speeds[SPEED_AVG]
        speeds[speed_avg_after] = speeds[SPEED_AVG]
        # Look up the neighbor sensors' speed at timestep -i; inner joins,
        # so rows without a neighbor reading are dropped.
        merged = pd.merge(merged,
                          speeds[[KEY, KM, DATETIME, speed_avg_before]],
                          left_on=[KEY, 'KM_BEFORE', datetime],
                          right_on=[KEY, KM, DATETIME],
                          suffixes=('_x_-' + str(i), '_y_-' + str(i)))
        merged = pd.merge(merged,
                          speeds[[KEY, KM, DATETIME, speed_avg_after]],
                          left_on=[KEY, 'KM_AFTER', datetime],
                          right_on=[KEY, KM, DATETIME],
                          suffixes=('_x_-' + str(i), '_y_-' + str(i)))
    # Drop the duplicated merge-key columns accumulated by the 8 joins.
    # NOTE(review): 'DATETIME_UTC_y_-4' appears twice in this list — harmless
    # (pandas drops it once) but one occurrence could be removed.
    merged.drop(columns=['KM', 'DATETIME_UTC_y_-3', 'KM_y_-3',
                         'DATETIME_UTC_y_-4', 'DATETIME_UTC_y_-2', 'KM_y_-2',
                         'DATETIME_UTC_y_-1', 'KM_x_-2', 'KM_y_-1', 'KM_x_-3',
                         'KM_x_-4', 'KM_y_-4', 'DATETIME_UTC_y_-4'],
                inplace=True)
    merged.rename(columns={'KM_x_-1': 'KM',
                           'DATETIME_UTC_x_-4': 'DATETIME_UTC_-4',
                           'DATETIME_UTC_x_-3': 'DATETIME_UTC_-3',
                           'DATETIME_UTC_x_-2': 'DATETIME_UTC_-2',
                           'DATETIME_UTC_x_-1': 'DATETIME_UTC_-1'},
                  inplace=True)
    # Signed KM distance from this sensor to its neighbors.
    merged['DELTA_BEFORE'] = merged[KM] - merged['KM_BEFORE']
    merged['DELTA_AFTER'] = merged['KM_AFTER'] - merged[KM]
    to_keep_1 = ['DATETIME_UTC_-' + str(k) for k in range(1, 5)]
    to_keep_2 = ['SPEED_AVG_BEFORE_-' + str(k) for k in range(1, 5)]
    to_keep_3 = ['SPEED_AVG_AFTER_-' + str(k) for k in range(1, 5)]
    to_keep_4 = ['DELTA_BEFORE', 'DELTA_AFTER']
    to_keep = [KEY, KM, *to_keep_1, *to_keep_2, *to_keep_3, *to_keep_4]
    for i in range(1, 5):
        merged['DATETIME_UTC_-' + str(i)] = pd.to_datetime(
            merged['DATETIME_UTC_-' + str(i)])
    return merged[to_keep]
def avg_speed_for_roadtype() -> pd.DataFrame:
    """Return the mean SPEED_AVG per ROAD_TYPE (ROAD_TYPE as index)."""
    print('Loading datasets')
    speeds_df = data.speeds()
    sensors_df = data.sensors()
    print('Done')
    # Join speeds (NaN rows removed) with the sensor registry on (KEY, KM).
    joined = speeds_df.dropna().merge(sensors_df,
                                      left_on=[KEY, KM],
                                      right_on=[KEY, KM])
    joined[DATETIME] = pd.to_datetime(joined.DATETIME_UTC)
    per_road_type = joined[['ROAD_TYPE', 'SPEED_AVG']].groupby('ROAD_TYPE')
    return per_road_type.mean()
def avg_speed_for_roadtype_event() -> pd.DataFrame:
    """Mean and standard deviation of SPEED_AVG for every
    (EVENT_TYPE, ROAD_TYPE) pair.

    Returns a flat DataFrame with columns EVENT_TYPE, ROAD_TYPE,
    AVG_SPEED_EVENT and STD_SPEED_EVENT.
    """
    speeds = data.speeds_original()
    events = data.events()
    sensors = data.sensors()
    joined = utility.merge_speed_events(speeds, events)
    joined = pd.merge(joined, sensors, on=[KEY, KM])
    # Aggregate speed per event/road type; agg yields MultiIndex columns
    # (SPEED_AVG, mean) and (SPEED_AVG, std).
    stats = (joined[[EVENT_TYPE, SPEED_AVG, ROAD_TYPE]]
             .dropna()
             .groupby([EVENT_TYPE, ROAD_TYPE])
             .agg(['mean', 'std']))
    # Copy the two statistics into flat, explicitly named columns, then
    # discard the MultiIndex level and the original SPEED_AVG columns.
    stats['AVG_SPEED_EVENT'] = stats[SPEED_AVG]['mean']
    stats['STD_SPEED_EVENT'] = stats[SPEED_AVG]['std']
    stats.columns = stats.columns.droplevel(level=1)
    stats.drop([SPEED_AVG], axis=1, inplace=True)
    return stats.reset_index()
def create_base_dataset(mode, steps_behind_event, steps_after_event=3, validation_split=0.2):
    """
    Create the dataframe containing the road measurements for every timestamp
    and related additional information about sensors, events and weather, and
    save it to base_dataset.csv.gz for both the 'train' and 'test' splits.

    Args:
        mode: 'local' or 'full' — selects which speed dataframes are loaded.
        steps_behind_event: number of 15-minute steps kept before each
            event start (these become the observed/input columns, suffix _-i).
        steps_after_event: number of 15-minute steps kept after each event
            start (these become the target columns, suffix _Y_i).
        validation_split: currently unused — see the commented-out block
            near the end.

    Side effects: writes one compressed CSV per split and prints progress.
    """
    print(
        f'Creating base dataset for {mode.upper()} with timewindows ({steps_behind_event}, {steps_after_event})'
    )
    # load dataframes to be joined
    # - sensors
    sensors = data.sensors()
    weather = data.weather()
    for t in ['train', 'test']:
        print()
        print('Creating dataset', t.upper())
        # - speeds
        # if speed_imputed:
        #     s = data.speeds(mode).merge(sensors, how='left')
        # else:
        print('Merging speeds and events...')
        e = data.events(mode, t)
        if mode == 'local':
            speeds = data.speeds_original(t)
        elif mode == 'full':
            speeds = data.speeds(mode=mode, t=t)
        # NOTE(review): any other mode leaves `speeds` unbound and raises
        # NameError further down — confirm callers only pass local/full.
        print('Done')
        print_memory_usage()

        # create the time windows for each event
        print('Creating time windows for events...')
        # find the starting time of each event: one aggregated row per event
        # index, keeping static event attributes and the earliest start time
        ev_agg = e.astype({
            'KEY': 'int'
        }).groupby('index').agg({
            'step_duration': 'first',
            'EVENT_DETAIL': 'first',
            'EVENT_TYPE': 'first',
            'KM_END': 'first',
            'KM_START': 'first',
            'KEY': 'first',
            'KEY_2': 'first',
            'KM_EVENT': 'first',
            'START_DATETIME_UTC': 'min',
        }).rename(columns={'step_duration': 'event_duration'})
        # window boundaries: each step is 15 minutes
        ev_agg['timewind_start'] = ev_agg.START_DATETIME_UTC - pd.to_timedelta(
            15 * steps_behind_event, unit='m')
        ev_agg['timewind_end'] = ev_agg.START_DATETIME_UTC + pd.to_timedelta(
            15 * steps_after_event, unit='m')

        # add speeds info (attaches a 'sensors' column: list of involved KMs)
        ev_agg = merge_speed_events(speeds, ev_agg)

        # expand different sensors: one row per (event, sensor KM)
        base_df = pd.DataFrame({col:np.repeat(ev_agg[col], ev_agg['sensors'].str.len()) \
            for col in ev_agg.columns.drop('sensors')} \
            ).assign(**{'KM': np.concatenate(ev_agg['sensors'].values)})

        # expand timestamps: one row per (event, sensor, 15-min timestamp)
        base_df = utility.expand_timestamps(base_df, col_ts_start='timewind_start', col_ts_end='timewind_end')\
            .drop(['timewind_start','timewind_end','step_duration'], axis=1) \
            .rename(columns={'index':'event_index'}) \
            .sort_values('event_index')
        # expand_timestamps emits epoch seconds — convert back to datetime
        base_df['DATETIME_UTC'] = pd.to_datetime(base_df['DATETIME_UTC'], unit='s')

        # left-join the actual measurements; missing readings stay NaN
        joined_df = base_df.drop('KEY_2', axis=1).merge(
            speeds.astype({'KEY': 'int'}),
            how='left',
            on=['KEY', 'KM', 'DATETIME_UTC'])
        # add other dataframes
        # - weather
        joined_df = joined_df.merge(weather, how='left')
        # - sensors
        joined_df = joined_df.merge(sensors, how='left')

        print('Aggregating events in samples...')
        # collapse the per-timestamp rows into one sample per
        # (event, sensor); time-varying fields become ordered lists
        joined_df = joined_df.sort_values(['KEY','KM','DATETIME_UTC']) \
            .groupby(['event_index','KEY','KM'], as_index=False).agg({
            'KM_START':'first',
            'KM_END':'first',
            'DATETIME_UTC':list,
            'event_duration':'first',
            'SPEED_AVG':list, #[list, lambda x: x[0:event_beginning_step].dropna().mean()],
            'SPEED_SD':list,
            'SPEED_MAX':list,
            'SPEED_MIN':list,
            'N_VEHICLES':list,
            'EMERGENCY_LANE':'first',
            'LANES':'first',
            'ROAD_TYPE':'first',
            # the value at index steps_behind_event is the event-start row
            'EVENT_DETAIL':lambda x: x.values[steps_behind_event],
            'EVENT_TYPE':lambda x: x.values[steps_behind_event],
            'WEATHER': list,
            'DISTANCE': list,
            'TEMPERATURE': list,
            'MIN_TEMPERATURE': list,
            'MAX_TEMPERATURE': list
        })

        # set sensor distance from event start and end
        joined_df['distance_start'] = joined_df['KM'] - joined_df['KM_START']
        joined_df['distance_end'] = joined_df['KM'] - joined_df['KM_END']
        joined_df.drop(['KM_END', 'KM_START'], axis=1, inplace=True)

        # split the last m measures in different columns:
        # pre-event slice [:step] for features, post-event slice [step:]
        # only for DATETIME_UTC and SPEED_AVG (the prediction targets)
        def split_prediction_fields(row, event_beginning_step):
            return pd.Series((
                row.DATETIME_UTC[:event_beginning_step],
                row.DATETIME_UTC[event_beginning_step:],
                row.SPEED_AVG[:event_beginning_step],
                row.SPEED_AVG[event_beginning_step:],
                row.SPEED_SD[:event_beginning_step],
                row.SPEED_MAX[:event_beginning_step],
                row.SPEED_MIN[:event_beginning_step],
                row.N_VEHICLES[:event_beginning_step],
                row.WEATHER[:event_beginning_step],
                row.DISTANCE[:event_beginning_step],
                row.TEMPERATURE[:event_beginning_step],
                row.MIN_TEMPERATURE[:event_beginning_step],
                row.MAX_TEMPERATURE[:event_beginning_step],
            ))

        print('Splitting time steps into separate columns...')
        # order must match the Series returned by split_prediction_fields
        columns_to_split = [
            'DATETIME_UTC', 'DATETIME_UTC_y', 'SPEED_AVG', 'SPEED_AVG_Y',
            'SPEED_SD', 'SPEED_MAX', 'SPEED_MIN', 'N_VEHICLES', 'WEATHER',
            'DISTANCE', 'TEMPERATURE', 'MIN_TEMPERATURE', 'MAX_TEMPERATURE'
        ]
        joined_df[columns_to_split] = joined_df.apply(
            split_prediction_fields,
            axis=1,
            event_beginning_step=steps_behind_event)
        for col_name in columns_to_split:
            # target columns get indices 0..steps_after_event,
            # feature columns get indices -steps_behind_event..-1
            if col_name.upper().endswith('_Y'):
                new_cols = [
                    '{}_{}'.format(col_name, i)
                    for i in range(0, steps_after_event + 1)
                ]
            else:
                new_cols = [
                    '{}_{}'.format(col_name, i)
                    for i in range(-steps_behind_event, 0)
                ]
            joined_df[new_cols] = pd.DataFrame(
                joined_df[col_name].values.tolist(), index=joined_df.index)

        # removed the residual columns of lists
        joined_df = joined_df.drop(columns_to_split, axis=1)

        # drop the rows for which all speeds are NaNs
        print('Dataset shape:', joined_df.shape)
        #print('Dropping not available speeds...')
        #joined_df.dropna(how='all', subset=[f'SPEED_AVG_{i}' for i in range(-steps_behind_event, 0)], inplace=True)
        #print('Dataset shape reduced to:', joined_df.shape)

        # set to NaN some of the target speeds if the events is shorter than 4 time steps
        joined_df.loc[joined_df['event_duration'] == 3, 'SPEED_AVG_Y_3'] = np.nan
        joined_df.loc[joined_df['event_duration'] == 2,
                      ['SPEED_AVG_Y_2', 'SPEED_AVG_Y_3']] = np.nan
        joined_df.loc[
            joined_df['event_duration'] == 1,
            ['SPEED_AVG_Y_1', 'SPEED_AVG_Y_2', 'SPEED_AVG_Y_3']] = np.nan
        joined_df.drop('event_duration', axis=1, inplace=True)

        # cast to int some columns
        joined_df = joined_df.astype({
            'EMERGENCY_LANE': 'int',
            'LANES': 'int',
            'ROAD_TYPE': 'int',
            'EVENT_DETAIL': 'int',
            'KEY': 'int',
            'KM': 'int',
            'event_index': 'int'
        })
        """ if mode == 'train':
        # take random validation rows
        # random_indices = random.shuffle(joined_df.index)
        # validation_indices = random_indices[0: int(len(random_indices) * validation_split)]
        # train_df = joined_df.drop(validation_indices)
        # valid_df = joined_df.loc[validation_indices]
        """
        # save the base dataset
        filepath = data.get_path_preprocessed(mode, t, 'base_dataset.csv.gz')
        print('Saving base dataframe to {}'.format(filepath))
        joined_df.to_csv(filepath, index=False, compression='gzip')
        del joined_df
        print('Done')
def add_possible_sensors(events_df):
    """Left-join onto each event the full list of sensor KMs (ROAD_SENSORS)
    available on its road, keyed by KEY."""
    sensor_km = data.sensors()[['KEY', 'KM']].drop_duplicates()
    # one row per road: KM values collected into a sorted list
    road_sensors = (sensor_km.sort_values(['KEY', 'KM'])
                    .groupby('KEY')
                    .agg(list)
                    .rename(columns={'KM': 'ROAD_SENSORS'}))
    return events_df.merge(road_sensors, on='KEY', how='left')