def create_base_structure_hours():
    """
    Create and save the hourly base structure.

    The structure is a pd.DataFrame composed as follows:
        KEY | DATETIME_UTC | KM
    It is useful to join with other dataframes: it contains the cartesian
    product of every sensor (KEY, KM) appearing in the train/test speeds
    files with every hourly timestamp between the minimum train datetime
    and the maximum test datetime.

    Side effects: writes `base_structure_hours.csv` under
    resources/dataset/preprocessed (folder created if missing).
    """
    start = time()
    # define the base path where to save the base_structure
    _BASE_PATH = 'resources/dataset/preprocessed'
    # check if the folder exists, if not create it
    utils.check_folder(_BASE_PATH)
    speeds_train = data.speeds('train')
    speeds_test = data.speeds('test')
    # min/max timestamps converted from datetime64[ns] to epoch seconds
    min_train_datetime = sorted(
        pd.to_datetime(
            speeds_train['DATETIME_UTC']).unique())[0].astype('int') // 10**9
    max_test_datetime = sorted(
        pd.to_datetime(
            speeds_test['DATETIME_UTC']).unique())[-1].astype('int') // 10**9
    # hourly grid; NOTE(review): np.arange excludes the right endpoint, so the
    # very last test hour is not part of the grid — confirm this is intended
    range_datetimes = np.arange(min_train_datetime, max_test_datetime, 60 * 60)
    datetime_df = pd.DataFrame(pd.to_datetime(range_datetimes, unit='s'),
                               columns=['DATETIME_UTC'])
    key_2_train = speeds_train.KEY_2.unique()
    key_2_test = speeds_test.KEY_2.unique()
    # get all the unique key_2 in train and test
    key_2_full = sorted(set(key_2_test) | set(key_2_train))
    # KEY_2 is formatted '<KEY>_<KM>': split it into its two components
    temp = pd.DataFrame(list(map(lambda x: x.split('_'), key_2_full)),
                        columns=['KEY', 'KM'])
    # add dummy column to let a merge do a cartesian product
    temp['dummy'] = 0
    datetime_df['dummy'] = 0
    print('Doing cartesian product... it will take a while!')
    base_structure = pd.merge(datetime_df, temp).drop(['dummy'], axis=1)
    print('Done\n')
    print('sorting values...')
    base_structure = base_structure.sort_values(
        ['DATETIME_UTC', 'KEY', 'KM']).reset_index(drop=True)
    print('Done\n')
    # save the base structure
    # fix: the printed path now matches the file actually written below
    # (it previously said base_structure.csv instead of base_structure_hours.csv)
    print('Saving base structure to {}/base_structure_hours.csv'.format(_BASE_PATH))
    base_structure.to_csv(f'{_BASE_PATH}/base_structure_hours.csv', index=False)
    print('Done\n')
    print(f'PROCEDURE ENDED SUCCESSFULLY IN: {round(time() - start, 4)} s')
def extract_feature(self):
    """Return the mean SPEED_AVG per ROAD_TYPE over concatenated train/test speeds.

    The resulting column is renamed to 'avg_speed_roadtype'.
    """
    print('Loading datasets')
    speeds = None
    if self.mode == 'local':
        speeds = pd.concat([data.speeds_original('train'),
                            data.speed_test_masked()])
    elif self.mode == 'full':
        speeds = pd.concat([data.speeds(mode='full'),
                            data.speeds_original('test2')])
    sensors = data.sensors()
    print('Done')
    # join measurements with sensor metadata to obtain ROAD_TYPE
    merged = pd.merge(speeds.dropna(), sensors,
                      left_on=[KEY, KM], right_on=[KEY, KM])
    merged[DATETIME] = pd.to_datetime(merged.DATETIME_UTC)
    per_road_type = (merged[['ROAD_TYPE', 'SPEED_AVG']]
                     .groupby('ROAD_TYPE')
                     .mean()
                     .reset_index())
    return per_road_type.rename(columns={'SPEED_AVG': 'avg_speed_roadtype'})
def extract_feature(self):
    """Build a 15-minute timestamp grid spanning the speeds data.

    Returns one row per timestamp with its weekday (0=Mon) and a 0/1
    weekend flag; the timestamp column is renamed to 'DATETIME_UTC_y_0'.
    """
    speeds = None
    if self.mode == 'local':
        speeds = pd.concat([data.speeds_original('train'),
                            data.speed_test_masked()])
    elif self.mode == 'full':
        speeds = pd.concat([data.speeds(mode='full'),
                            data.speeds_original('test2')])
    print('Extracting min and max timestamps...')
    lo = speeds.DATETIME_UTC.min()
    hi = speeds.DATETIME_UTC.max()
    print('Done')
    # regular 15-minute grid between the extremes
    grid = pd.DataFrame(
        pd.date_range(lo, hi, freq='15min').to_series()).reset_index()
    grid[DATETIME] = pd.to_datetime(grid['index'])
    grid = grid[[DATETIME]]
    grid['WEEK_DAY'] = pd.to_datetime(grid[DATETIME]).dt.weekday
    grid['IS_WEEKEND'] = grid.WEEK_DAY.map(lambda d: 1 if d in (5, 6) else 0)
    return grid.rename(columns={'DATETIME_UTC': 'DATETIME_UTC_y_0'})
def extract_feature(self):
    """Average speed statistics per sensor (KEY, KM) and time of day.

    Dates are collapsed to 'HH:MM:SS' so measurements taken at the same
    time on different days aggregate together.
    """
    df = None
    if self.mode == 'local':
        df = pd.concat([data.speeds_original('train'),
                        data.speed_test_masked()])
    elif self.mode == 'full':
        df = pd.concat([data.speeds(mode='full'),
                        data.speeds_original('test2')])
    # keep only the time-of-day component of the timestamp
    df.DATETIME_UTC = df.DATETIME_UTC.dt.strftime('%H:%M:%S')
    stat_cols = ['KEY', 'KM', 'DATETIME_UTC', 'SPEED_AVG', 'SPEED_SD',
                 'SPEED_MIN', 'SPEED_MAX', 'N_VEHICLES']
    aggregated = (df[stat_cols]
                  .groupby(['KEY', 'KM', 'DATETIME_UTC'])
                  .mean()
                  .reset_index())
    return aggregated.rename(columns={
        'DATETIME_UTC': 'DATETIME_UTC_SPEED_SENSOR_HOUR',
        'SPEED_AVG': 'avg_speed_sensor_hour',
        'SPEED_SD': 'avg_speed_sd_sensor_hour',
        'SPEED_MIN': 'avg_speed_min_sensor_hour',
        'SPEED_MAX': 'avg_speed_max_sensor_hour',
        'N_VEHICLES': 'avg_n_vehicles_sensor_hour'})
def extract_feature(self):
    """Average number of vehicles per sensor (KEY, KM) and weekday."""
    speeds = None
    if self.mode == 'local':
        speeds = pd.concat([data.speeds_original('train'),
                            data.speed_test_masked()])
    elif self.mode == 'full':
        speeds = pd.concat([data.speeds(mode='full'),
                            data.speeds_original('test2')])
    speeds = speeds.loc[:, ["DATETIME_UTC", "KEY", "KM", "N_VEHICLES"]]
    # missing counts are treated as zero vehicles
    speeds["N_VEHICLES"] = speeds.N_VEHICLES.fillna(0).astype(int)
    # 0 = Monday ... 6 = Sunday
    speeds["day"] = speeds.DATETIME_UTC.dt.weekday
    per_day = (speeds[['KEY', 'KM', 'N_VEHICLES', 'day']]
               .groupby(['KEY', 'KM', 'day'])
               .mean()
               .reset_index())
    return per_day.rename(columns={'N_VEHICLES': 'avg_n_vehicles_sensor_per_day'})
def extract_feature(self):
    """Attach the speed measurements taken 1..n_days_before days earlier.

    For each (KEY, KM, DATETIME_UTC) row, left-merges the measurements
    recorded exactly i days before and suffixes them with '_i_DAY_BEFORE'.
    The base timestamp column is renamed to 'DATETIME_UTC_y_0'.
    """
    s = None
    if self.mode == 'local':
        s = pd.concat([data.speeds_original('train').drop(['KEY_2'], axis=1),
                       data.speed_test_masked().drop(['KEY_2'], axis=1)])
    elif self.mode == 'full':
        s = pd.concat([data.speeds(mode='full').drop(['KEY_2'], axis=1),
                       data.speeds_original('test2').drop(['KEY_2'], axis=1)])
    f = s[['KEY', 'DATETIME_UTC', 'KM']].copy()
    # rename to avoid a column clash when s is merged back onto f
    s = s.rename(columns={'DATETIME_UTC': 'DATETIME_UTC_drop'})
    measure_cols = ['SPEED_AVG', 'SPEED_SD', 'SPEED_MIN',
                    'SPEED_MAX', 'N_VEHICLES']
    for day in tqdm(range(1, self.n_days_before + 1)):
        shifted_col = 'DATETIME_UTC_{}_D'.format(day)
        f[shifted_col] = f.DATETIME_UTC - pd.Timedelta(days=day)
        f = pd.merge(f, s, how='left',
                     left_on=['KEY', 'KM', shifted_col],
                     right_on=['KEY', 'KM', 'DATETIME_UTC_drop'])
        f = f.drop([shifted_col, 'DATETIME_UTC_drop'], axis=1)
        f = f.rename(columns={c: '{}_{}_DAY_BEFORE'.format(c, day)
                              for c in measure_cols})
    return f.rename(columns={'DATETIME_UTC': 'DATETIME_UTC_y_0'})
def avg_speed_for_roadtype() -> pd.DataFrame:
    """Return the mean SPEED_AVG per ROAD_TYPE (ROAD_TYPE as the index).

    Joins the speeds with the sensor metadata to obtain the road type.
    """
    print('Loading datasets')
    speeds = data.speeds()
    sensors = data.sensors()
    print('Done')
    joined = pd.merge(speeds.dropna(), sensors,
                      left_on=[KEY, KM], right_on=[KEY, KM])
    joined[DATETIME] = pd.to_datetime(joined.DATETIME_UTC)
    return joined[['ROAD_TYPE', 'SPEED_AVG']].groupby('ROAD_TYPE').mean()
def extract_feature(self):
    """Join events with speed measurements and compute the sensor's
    distance from the event start.

    Merges the concatenated train/test events with the speeds on
    (KEY, DATETIME_UTC), keeps only the sensors whose KM falls inside
    the event span [KM_START, KM_END], and adds a
    'start_event_distance' column.
    """
    if self.mode == 'local':
        tr = data.speeds_original('train')
        te = data.speed_test_masked()
        f = pd.concat([tr, te])
    elif self.mode == 'full':
        tr = data.speeds(mode='full')
        te = data.speeds_original('test2')
        f = pd.concat([tr, te])
    del tr
    del te
    etr = data.events(self.mode, 'train')
    ete = data.events(self.mode, 'test')
    ef = pd.concat([etr, ete])
    del etr
    del ete
    m = pd.merge(ef, f,
                 left_on=['KEY', 'DATETIME_UTC'],
                 right_on=['KEY', 'DATETIME_UTC'])
    # keep only sensors located inside the event's km span
    m = m[(m.KM >= m.KM_START) & (m.KM <= m.KM_END)]
    # fix: the original line was `df['start_event_distance'] = df[]`,
    # a syntax error referencing an undefined `df`. The merged frame is
    # `m`; the distance of the sensor from the event start is presumably
    # KM - KM_START — TODO(review): confirm against the intended feature.
    m['start_event_distance'] = m.KM - m.KM_START
    return m
def extract_feature(self):
    """Average speed statistics per street (KEY), columns suffixed '_street'."""
    df = None
    if self.mode == 'local':
        df = pd.concat([data.speeds_original('train'),
                        data.speed_test_masked()])
    elif self.mode == 'full':
        df = pd.concat([data.speeds(mode='full'),
                        data.speeds_original('test2')])
    renames = {'SPEED_AVG': 'avg_speed_street',
               'SPEED_SD': 'avg_speed_sd_street',
               'SPEED_MIN': 'avg_speed_min_street',
               'SPEED_MAX': 'avg_speed_max_street',
               'N_VEHICLES': 'avg_n_vehicles_street'}
    per_street = (df[['KEY', 'SPEED_AVG', 'SPEED_SD', 'SPEED_MIN',
                      'SPEED_MAX', 'N_VEHICLES']]
                  .groupby(['KEY'])
                  .mean()
                  .reset_index())
    return per_street.rename(columns=renames)
def extract_feature(self):
    """Mean speed deltas around each event start, grouped by EVENT_TYPE.

    For every event, takes the speed 15 minutes before its first step
    ('speed_avg-1') and the speeds 1..4 quarters after that reference
    point, then averages the per-event differences
    ('diff-1-step'..'diff-4-step') by EVENT_TYPE.
    """
    speeds = None
    if self.mode == 'local':
        tr = data.speeds_original('train')
        te = data.speed_test_masked()
        speeds = pd.concat([tr, te])
        del tr
        del te
    elif self.mode == 'full':
        tr = data.speeds(mode='full')
        te = data.speeds_original('test2')
        speeds = pd.concat([tr, te])
        del tr
        del te
    etr = data.events(self.mode, 'train')
    ete = data.events(self.mode, 'test')
    ef = pd.concat([etr, ete])
    del etr
    del ete
    t = ef[['START_DATETIME_UTC', 'END_DATETIME_UTC', 'KEY', 'KM_START',
            'KM_END', 'DATETIME_UTC', 'EVENT_TYPE', 'EVENT_DETAIL']]
    # keep only the first time step of each event
    t = t.loc[t.groupby(['START_DATETIME_UTC', 'END_DATETIME_UTC', 'KEY'],
                        as_index=False).DATETIME_UTC.idxmin()]
    # reference timestamp: one quarter before the event's first step
    t['DATETIME_UTC-1'] = t.DATETIME_UTC - pd.Timedelta(minutes=15)
    t = t.drop(['START_DATETIME_UTC', 'END_DATETIME_UTC', 'DATETIME_UTC'],
               axis=1)
    speeds = speeds[['KEY', 'KM', 'DATETIME_UTC', 'SPEED_AVG']]
    # speed at the reference timestamp, restricted to sensors inside the event span
    final = pd.merge(t, speeds,
                     left_on=['KEY', 'DATETIME_UTC-1'],
                     right_on=['KEY', 'DATETIME_UTC'])
    final = final.rename(columns={'SPEED_AVG': 'speed_avg-1', 'KM': 'KM-1'})
    final = final.drop(['DATETIME_UTC'], axis=1)
    final = final[(final['KM-1'] >= final.KM_START)
                  & (final['KM-1'] <= final.KM_END)]
    # speeds 1..4 quarters after the reference timestamp
    ds = []
    for ts in range(4):
        m_ = t.copy()
        print(len(m_))
        quarters_delta = ts + 1
        m_['DATETIME_UTC_{}'.format(
            quarters_delta)] = m_['DATETIME_UTC-1'] + pd.Timedelta(
                minutes=15 * quarters_delta)
        m_ = pd.merge(m_, speeds,
                      left_on=['KEY', 'DATETIME_UTC_{}'.format(quarters_delta)],
                      right_on=['KEY', 'DATETIME_UTC'], how='left')
        m_ = m_.rename(columns={'SPEED_AVG': 'speed_avg_{}'.format(quarters_delta),
                                'KM': 'KM_{}'.format(quarters_delta)})
        m_ = m_.drop(['DATETIME_UTC'], axis=1)
        m_ = m_[(m_['KM_{}'.format(quarters_delta)] >= m_.KM_START)
                & (m_['KM_{}'.format(quarters_delta)] <= m_.KM_END)]
        # normalize the km column name so all frames merge on the same keys
        m_ = m_.rename(columns={'KM_{}'.format(quarters_delta): 'KM-1'})
        m_ = m_.drop(['DATETIME_UTC_{}'.format(quarters_delta)], axis=1)
        print(len(m_))
        ds.append(m_)
    final = final.drop(['KM-1'], axis=1)
    # fix: iterate the frames directly (the original used an index loop
    # and computed an unused local `j = i + 1`)
    for df in ds:
        print('shape before {}'.format(len(final)))
        final = pd.merge(final, df)
        print('shape after {}'.format(len(final)))
    final = final[[
        'EVENT_TYPE', 'speed_avg-1', 'speed_avg_1', 'speed_avg_2',
        'speed_avg_3', 'speed_avg_4'
    ]]
    final['diff-1-step'] = final['speed_avg_1'] - final['speed_avg-1']
    final['diff-2-step'] = final['speed_avg_2'] - final['speed_avg-1']
    final['diff-3-step'] = final['speed_avg_3'] - final['speed_avg-1']
    final['diff-4-step'] = final['speed_avg_4'] - final['speed_avg-1']
    final = final.drop([
        'speed_avg_1', 'speed_avg-1', 'speed_avg_2', 'speed_avg_3',
        'speed_avg_4'
    ], axis=1)
    return final.groupby(['EVENT_TYPE'], as_index=False).mean()
#if you want to know current working dir sys.path.append(os.getcwd()) from src.utils import * from src.utility import merge_speed_events import src.data as data import src.utility as utils from src.utils import resources_path from src.preprocessing.other_features import avg_speed_for_roadtype_event from tqdm import tqdm if __name__ == '__main__': for t in ['train', 'test', '2019']: print('Reading datasets...') X_df = data.base_dataset(mode=t) speeds = data.speeds(mode=t) print('Done') speeds[DATETIME] = pd.to_datetime(speeds[DATETIME]) print('Inferring...') window_len = sum(X_df.columns.str.match('^SPEED_AVG_-.*$') * 1) for i in tqdm(range(1, window_len + 1)): time = 'DATETIME_UTC_-' + str(i) speed_avg = 'SPEED_AVG_-' + str(i) speed_max = 'SPEED_MAX_-' + str(i) speed_min = 'SPEED_MIN_-' + str(i) speed_std = 'SPEED_SD_-' + str(i) n_cars = 'N_VEHICLES_-' + str(i) X_df[time] = pd.to_datetime(X_df[time]) X_df.drop(
def create_base_dataset(mode, steps_behind_event, steps_after_event=3, validation_split=0.2):
    """ Create the dataframe containing the road measurements for every timestamp and related
    additional information about sensors, events and weather.

    One sample is built per (event, sensor): a time window of 15-minute steps from
    `steps_behind_event` steps before the event start to `steps_after_event` steps after,
    with each step's measurements spread into separate columns. The result is written to
    base_dataset.csv.gz for both 'train' and 'test'.

    Args:
        mode: 'local' or 'full' — selects which speeds files are loaded.
        steps_behind_event: number of 15-minute steps before the event start.
        steps_after_event: number of 15-minute steps after the event start (default 3).
        validation_split: currently unused (see the commented-out block below).
    """
    print(
        f'Creating base dataset for {mode.upper()} with timewindows ({steps_behind_event}, {steps_after_event})'
    )
    # load dataframes to be joined
    # - sensors
    sensors = data.sensors()
    weather = data.weather()
    for t in ['train', 'test']:
        print()
        print('Creating dataset', t.upper())
        # - speeds
        # if speed_imputed:
        #     s = data.speeds(mode).merge(sensors, how='left')
        # else:
        print('Merging speeds and events...')
        e = data.events(mode, t)
        if mode == 'local':
            speeds = data.speeds_original(t)
        elif mode == 'full':
            speeds = data.speeds(mode=mode, t=t)
        print('Done')
        print_memory_usage()

        # create the time windows for each event
        print('Creating time windows for events...')
        # find the starting time of each event: collapse the per-step event rows
        # into one row per event ('index'), keeping the earliest start time
        ev_agg = e.astype({
            'KEY': 'int'
        }).groupby('index').agg({
            'step_duration': 'first',
            'EVENT_DETAIL': 'first',
            'EVENT_TYPE': 'first',
            'KM_END': 'first',
            'KM_START': 'first',
            'KEY': 'first',
            'KEY_2': 'first',
            'KM_EVENT': 'first',
            'START_DATETIME_UTC': 'min',
        }).rename(columns={'step_duration': 'event_duration'})
        # window boundaries: steps_behind_event quarters before the start,
        # steps_after_event quarters after
        ev_agg['timewind_start'] = ev_agg.START_DATETIME_UTC - pd.to_timedelta(
            15 * steps_behind_event, unit='m')
        ev_agg['timewind_end'] = ev_agg.START_DATETIME_UTC + pd.to_timedelta(
            15 * steps_after_event, unit='m')

        # add speeds info (attaches the 'sensors' list-column of involved KMs)
        ev_agg = merge_speed_events(speeds, ev_agg)

        # expand different sensors: one row per (event, sensor KM)
        base_df = pd.DataFrame({col:np.repeat(ev_agg[col], ev_agg['sensors'].str.len()) \
            for col in ev_agg.columns.drop('sensors')} \
            ).assign(**{'KM': np.concatenate(ev_agg['sensors'].values)})

        # expand timestamps: one row per (event, sensor, 15-min step)
        base_df = utility.expand_timestamps(base_df, col_ts_start='timewind_start', col_ts_end='timewind_end')\
            .drop(['timewind_start','timewind_end','step_duration'], axis=1) \
            .rename(columns={'index':'event_index'}) \
            .sort_values('event_index')

        # timestamps come back as epoch seconds from expand_timestamps
        base_df['DATETIME_UTC'] = pd.to_datetime(base_df['DATETIME_UTC'],
                                                 unit='s')
        joined_df = base_df.drop('KEY_2', axis=1).merge(
            speeds.astype({'KEY': 'int'}),
            how='left',
            on=['KEY', 'KM', 'DATETIME_UTC'])

        # add other dataframes
        # - weather
        joined_df = joined_df.merge(weather, how='left')
        # - sensors
        joined_df = joined_df.merge(sensors, how='left')

        print('Aggregating events in samples...')
        # one row per (event, sensor): per-step measurements become lists,
        # constants are taken with 'first'; EVENT_DETAIL/EVENT_TYPE are read
        # at the step where the event actually begins
        joined_df = joined_df.sort_values(['KEY','KM','DATETIME_UTC']) \
            .groupby(['event_index','KEY','KM'], as_index=False).agg({
            'KM_START':'first',
            'KM_END':'first',
            'DATETIME_UTC':list,
            'event_duration':'first',
            'SPEED_AVG':list,   #[list, lambda x: x[0:event_beginning_step].dropna().mean()],
            'SPEED_SD':list,
            'SPEED_MAX':list,
            'SPEED_MIN':list,
            'N_VEHICLES':list,
            'EMERGENCY_LANE':'first',
            'LANES':'first',
            'ROAD_TYPE':'first',
            'EVENT_DETAIL':lambda x: x.values[steps_behind_event],
            'EVENT_TYPE':lambda x: x.values[steps_behind_event],
            'WEATHER': list,
            'DISTANCE': list,
            'TEMPERATURE': list,
            'MIN_TEMPERATURE': list,
            'MAX_TEMPERATURE': list
        })

        # set sensor distance from event start and end
        joined_df['distance_start'] = joined_df['KM'] - joined_df['KM_START']
        joined_df['distance_end'] = joined_df['KM'] - joined_df['KM_END']
        joined_df.drop(['KM_END', 'KM_START'], axis=1, inplace=True)

        # split the last m measures in different columns:
        # the first event_beginning_step elements are the pre-event steps,
        # the rest (only for DATETIME_UTC / SPEED_AVG) are the targets (_y / _Y)
        def split_prediction_fields(row, event_beginning_step):
            return pd.Series((
                row.DATETIME_UTC[:event_beginning_step],
                row.DATETIME_UTC[event_beginning_step:],
                row.SPEED_AVG[:event_beginning_step],
                row.SPEED_AVG[event_beginning_step:],
                row.SPEED_SD[:event_beginning_step],
                row.SPEED_MAX[:event_beginning_step],
                row.SPEED_MIN[:event_beginning_step],
                row.N_VEHICLES[:event_beginning_step],
                row.WEATHER[:event_beginning_step],
                row.DISTANCE[:event_beginning_step],
                row.TEMPERATURE[:event_beginning_step],
                row.MIN_TEMPERATURE[:event_beginning_step],
                row.MAX_TEMPERATURE[:event_beginning_step],
            ))

        print('Splitting time steps into separate columns...')
        columns_to_split = [
            'DATETIME_UTC', 'DATETIME_UTC_y', 'SPEED_AVG', 'SPEED_AVG_Y',
            'SPEED_SD', 'SPEED_MAX', 'SPEED_MIN', 'N_VEHICLES', 'WEATHER',
            'DISTANCE', 'TEMPERATURE', 'MIN_TEMPERATURE', 'MAX_TEMPERATURE'
        ]
        joined_df[columns_to_split] = joined_df.apply(
            split_prediction_fields,
            axis=1,
            event_beginning_step=steps_behind_event)

        # expand each list-column into one column per time step:
        # target columns (_y/_Y) get indices 0..steps_after_event,
        # pre-event columns get indices -steps_behind_event..-1
        for col_name in columns_to_split:
            if col_name.upper().endswith('_Y'):
                new_cols = [
                    '{}_{}'.format(col_name, i)
                    for i in range(0, steps_after_event + 1)
                ]
            else:
                new_cols = [
                    '{}_{}'.format(col_name, i)
                    for i in range(-steps_behind_event, 0)
                ]
            joined_df[new_cols] = pd.DataFrame(
                joined_df[col_name].values.tolist(), index=joined_df.index)

        # removed the residual columns of lists
        joined_df = joined_df.drop(columns_to_split, axis=1)

        # drop the rows for which all speeds are NaNs
        print('Dataset shape:', joined_df.shape)
        #print('Dropping not available speeds...')
        #joined_df.dropna(how='all', subset=[f'SPEED_AVG_{i}' for i in range(-steps_behind_event, 0)], inplace=True)
        #print('Dataset shape reduced to:', joined_df.shape)

        # set to NaN some of the target speeds if the events is shorter than 4 time steps
        joined_df.loc[joined_df['event_duration'] == 3,
                      'SPEED_AVG_Y_3'] = np.nan
        joined_df.loc[joined_df['event_duration'] == 2,
                      ['SPEED_AVG_Y_2', 'SPEED_AVG_Y_3']] = np.nan
        joined_df.loc[
            joined_df['event_duration'] == 1,
            ['SPEED_AVG_Y_1', 'SPEED_AVG_Y_2', 'SPEED_AVG_Y_3']] = np.nan
        joined_df.drop('event_duration', axis=1, inplace=True)

        # cast to int some columns
        joined_df = joined_df.astype({
            'EMERGENCY_LANE': 'int',
            'LANES': 'int',
            'ROAD_TYPE': 'int',
            'EVENT_DETAIL': 'int',
            'KEY': 'int',
            'KM': 'int',
            'event_index': 'int'
        })

        """ if mode == 'train':
        # take random validation rows
        # random_indices = random.shuffle(joined_df.index)
        # validation_indices = random_indices[0: int(len(random_indices) * validation_split)]
        # train_df = joined_df.drop(validation_indices)
        # valid_df = joined_df.loc[validation_indices]
        """

        # save the base dataset
        filepath = data.get_path_preprocessed(mode, t, 'base_dataset.csv.gz')
        print('Saving base dataframe to {}'.format(filepath))
        joined_df.to_csv(filepath, index=False, compression='gzip')
        del joined_df
        print('Done')