Example #1
    def extract_feature(self):
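        # Average speed per road type: join speeds with sensor metadata and
        # return the mean SPEED_AVG grouped by ROAD_TYPE.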
        print('Loading datasets')
        speeds = None

        if self.mode == 'local':
            tr = data.speeds_original('train')
            te = data.speed_test_masked()
            speeds = pd.concat([tr, te])
            del tr
            del te

        elif self.mode == 'full':
            tr = data.speeds(mode='full')
            te = data.speeds_original('test2')
            speeds = pd.concat([tr, te])
            del tr
            del te

        sensors = data.sensors()
        print('Done')

        df = pd.merge(speeds.dropna(), sensors, on=[KEY, KM])
        df[DATETIME] = pd.to_datetime(df.DATETIME_UTC)

        return df[['ROAD_TYPE', 'SPEED_AVG']].groupby('ROAD_TYPE').mean().reset_index()\
            .rename(columns={'SPEED_AVG': 'avg_speed_roadtype'})
Example #2
    def extract_feature(self):
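        # Previous-day (D-1) features: for every sensor and 15-minute timestamp,
        # attach the hourly speed aggregates observed one day earlier.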
        tr = data.speeds_original('train')
        te = data.speed_test_masked()
        speeds = pd.concat([tr, te])
        del tr
        del te

        print('Extracting min and max timestamps...')
        min_datetime = speeds.DATETIME_UTC.min()
        max_datetime = speeds.DATETIME_UTC.max()
        sensors = data.sensors().drop_duplicates([KEY, KM])
        print('Done')

        datetimes_df = pd.DataFrame(
            {DATETIME: pd.date_range(min_datetime, max_datetime, freq='15min')})
        print('Shifting hours')
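        # Floor each timestamp to the hour, then step back one day: the row at
        # time T will be matched with the hourly aggregate at T minus 24 hours.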
        datetimes_df['DATETIME_HOUR'] = datetimes_df[DATETIME].dt.floor('1H') - pd.DateOffset(1)
        print('Done')

        print('Creating skeleton')
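        # Cross join sensors x timestamps by merging on a constant dummy key
        # (equivalent to a how='cross' merge in pandas >= 1.2).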
        datetimes_df['MERGE'] = 0
        sensors['MERGE'] = 0
        skeleton = pd.merge(sensors[[KEY, KM, 'MERGE']],
                            datetimes_df,
                            on='MERGE')
        skeleton[DATETIME] = pd.to_datetime(skeleton[DATETIME])
        skeleton.set_index(DATETIME, inplace=True)
        print('Done')

        print('Merging with speeds...')
        resampled_speeds = speeds \
            .groupby([KEY, KM]) \
            .apply(lambda x: x.set_index(DATETIME)
                             .resample('H')
                             .mean()[[SPEED_AVG, SPEED_MAX, SPEED_MIN, SPEED_SD, N_CARS]]) \
            .reset_index()
        skeleton_merge = skeleton.reset_index()
        df = pd.merge(skeleton_merge,
                      resampled_speeds,
                      left_on=[KEY, KM, 'DATETIME_HOUR'],
                      right_on=[KEY, KM, DATETIME])
        df = df.rename(
            columns={
                'DATETIME_UTC_x': 'DATETIME_UTC',
                SPEED_AVG: 'SPEED_AVG_D-1',
                SPEED_MAX: 'SPEED_MAX_D-1',
                SPEED_MIN: 'SPEED_MIN_D-1',
                SPEED_SD: 'SPEED_SD_D-1',
                N_CARS: 'N_VEHICLES_D-1'
            })
        print('Done')
        return df
Example #3
    def extract_feature(self):
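        # Neighbor features: for each sensor, fetch the SPEED_AVG of the previous
        # and next sensors on the same road at the 4 time steps before the event.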
        print('Reading data...')
        df = data.base_dataset()
        sensors = data.sensors().drop_duplicates().sort_values([KEY, KM])
        speeds = pd.concat([data.speeds_original('train'),
                            data.speeds_original('test'),
                            data.speeds_original('test2')]).drop_duplicates()
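
        # sensors is sorted by (KEY, KM), so shifting by one row yields the
        # adjacent sensor on the same road; cross-road matches are cleared below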
        
        sensors['KM_BEFORE'] = sensors['KM'].shift(1)
        sensors['KEY_BEFORE'] = sensors['KEY'].shift(1)
        sensors['KM_AFTER'] = sensors['KM'].shift(-1)
        sensors['KEY_AFTER'] = sensors['KEY'].shift(-1)

        sensors.loc[sensors.KEY_AFTER != sensors.KEY, 'KM_AFTER'] = np.nan
        sensors.loc[sensors.KEY_BEFORE != sensors.KEY, 'KM_BEFORE'] = np.nan

        sensors.drop(['KEY_BEFORE', 'KEY_AFTER'], axis=1, inplace=True)
        sensors = sensors[[KEY, KM, 'KM_BEFORE', 'KM_AFTER']]
        
        merged = pd.merge(df, sensors, left_on=[KEY, KM], right_on=[KEY, KM])
        
        print('Creating features...')
        for i in range(1, 5):
            speed_avg_before = 'SPEED_AVG_BEFORE_-' + str(i)
            speed_avg_after = 'SPEED_AVG_AFTER_-' + str(i)
            datetime = 'DATETIME_UTC_-' + str(i)

            speeds[speed_avg_before] = speeds[SPEED_AVG]
            speeds[speed_avg_after] = speeds[SPEED_AVG]
            merged = pd.merge(merged, speeds[[KEY, KM, DATETIME, speed_avg_before]], left_on=[KEY, 'KM_BEFORE', datetime], right_on=[KEY, KM, DATETIME], suffixes=('_x_-' + str(i), '_y_-' + str(i)))

            merged = pd.merge(merged, speeds[[KEY, KM, DATETIME, speed_avg_after]], left_on=[KEY, 'KM_AFTER', datetime], right_on=[KEY, KM, DATETIME], suffixes=('_x_-' + str(i), '_y_-' + str(i)))

        merged.drop(columns=['KM',
                             'DATETIME_UTC_y_-1', 'DATETIME_UTC_y_-2',
                             'DATETIME_UTC_y_-3', 'DATETIME_UTC_y_-4',
                             'KM_y_-1', 'KM_y_-2', 'KM_y_-3', 'KM_y_-4',
                             'KM_x_-2', 'KM_x_-3', 'KM_x_-4'], inplace=True)
        merged.rename(columns={'KM_x_-1': 'KM',
                               'DATETIME_UTC_x_-4': 'DATETIME_UTC_-4',
                               'DATETIME_UTC_x_-3': 'DATETIME_UTC_-3',
                               'DATETIME_UTC_x_-2': 'DATETIME_UTC_-2',
                               'DATETIME_UTC_x_-1': 'DATETIME_UTC_-1'}, inplace=True)
        merged['DELTA_BEFORE'] = merged[KM] - merged['KM_BEFORE']
        merged['DELTA_AFTER'] = merged['KM_AFTER'] - merged[KM]
        
        to_keep_1 = ['DATETIME_UTC_-' + str(k) for k in range(1, 5)]
        to_keep_2 = ['SPEED_AVG_BEFORE_-' + str(k) for k in range(1, 5)]
        to_keep_3 = ['SPEED_AVG_AFTER_-' + str(k) for k in range(1, 5)]
        to_keep_4 = ['DELTA_BEFORE', 'DELTA_AFTER']
        to_keep = [KEY, KM, *to_keep_1, *to_keep_2, *to_keep_3, *to_keep_4]
        for i in range(1, 5):
            merged['DATETIME_UTC_-' + str(i)] = pd.to_datetime(merged['DATETIME_UTC_-' + str(i)])
        return merged[to_keep]
Example #4
def avg_speed_for_roadtype() -> pd.DataFrame:
    print('Loading datasets')
    speeds = data.speeds()
    sensors = data.sensors()
    print('Done')

    df = pd.merge(speeds.dropna(), sensors, on=[KEY, KM])
    df[DATETIME] = pd.to_datetime(df.DATETIME_UTC)

    return df[['ROAD_TYPE', 'SPEED_AVG']].groupby('ROAD_TYPE').mean()
Example #5
def avg_speed_for_roadtype_event() -> pd.DataFrame:
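    # Mean and standard deviation of SPEED_AVG per (EVENT_TYPE, ROAD_TYPE) pair,
    # computed over speed observations matched to events.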
    speeds = data.speeds_original()
    events = data.events()
    sensors = data.sensors()
    merged = utility.merge_speed_events(speeds, events)

    merged = pd.merge(merged, sensors, on=[KEY, KM])
    merged = merged[[EVENT_TYPE, SPEED_AVG, ROAD_TYPE]].dropna() \
            .groupby([EVENT_TYPE, ROAD_TYPE]).agg(['mean', 'std'])

    merged['AVG_SPEED_EVENT'] = merged[SPEED_AVG]['mean']
    merged['STD_SPEED_EVENT'] = merged[SPEED_AVG]['std']
    merged.columns = merged.columns.droplevel(level=1)

    merged.drop([SPEED_AVG], axis=1, inplace=True)
    merged.reset_index(inplace=True)
    return merged
Example #6
def create_base_dataset(mode,
                        steps_behind_event,
                        steps_after_event=3,
                        validation_split=0.2):
    """
    Create the dataframe containing the road measurements for every timestamp and related
    additional information about sensors, events and weather
    """
    print(
        f'Creating base dataset for {mode.upper()} with timewindows ({steps_behind_event}, {steps_after_event})'
    )

    # load dataframes to be joined
    # - sensors
    sensors = data.sensors()
    weather = data.weather()

    for t in ['train', 'test']:
        print()
        print('Creating dataset', t.upper())
        # - speeds
        # if speed_imputed:
        #     s = data.speeds(mode).merge(sensors, how='left')
        # else:
        print('Loading speeds and events...')
        e = data.events(mode, t)

        if mode == 'local':
            speeds = data.speeds_original(t)
        elif mode == 'full':
            speeds = data.speeds(mode=mode, t=t)

        print('Done')
        print_memory_usage()

        # create the time windows for each event
        print('Creating time windows for events...')

        # find the starting time of each event
        ev_agg = e.astype({
            'KEY': 'int'
        }).groupby('index').agg({
            'step_duration': 'first',
            'EVENT_DETAIL': 'first',
            'EVENT_TYPE': 'first',
            'KM_END': 'first',
            'KM_START': 'first',
            'KEY': 'first',
            'KEY_2': 'first',
            'KM_EVENT': 'first',
            'START_DATETIME_UTC': 'min',
        }).rename(columns={'step_duration': 'event_duration'})

        ev_agg['timewind_start'] = ev_agg.START_DATETIME_UTC - pd.to_timedelta(
            15 * steps_behind_event, unit='m')
        ev_agg['timewind_end'] = ev_agg.START_DATETIME_UTC + pd.to_timedelta(
            15 * steps_after_event, unit='m')

        # add speeds info
        ev_agg = merge_speed_events(speeds, ev_agg)

        # expand different sensors
        base_df = pd.DataFrame({col:np.repeat(ev_agg[col], ev_agg['sensors'].str.len()) \
                           for col in ev_agg.columns.drop('sensors')} \
            ).assign(**{'KM': np.concatenate(ev_agg['sensors'].values)})
        # expand timestamps
        base_df = utility.expand_timestamps(base_df, col_ts_start='timewind_start', col_ts_end='timewind_end')\
                    .drop(['timewind_start','timewind_end','step_duration'], axis=1) \
                    .rename(columns={'index':'event_index'}) \
                    .sort_values('event_index')
        base_df['DATETIME_UTC'] = pd.to_datetime(base_df['DATETIME_UTC'],
                                                 unit='s')

        joined_df = base_df.drop('KEY_2', axis=1).merge(
            speeds.astype({'KEY': 'int'}),
            how='left',
            on=['KEY', 'KM', 'DATETIME_UTC'])

        # add other dataframes
        # - weather
        joined_df = joined_df.merge(weather, how='left')
        # - sensors
        joined_df = joined_df.merge(sensors, how='left')

        print('Aggregating events into samples...')
        joined_df = joined_df.sort_values(['KEY','KM','DATETIME_UTC']) \
            .groupby(['event_index','KEY','KM'], as_index=False).agg({
            'KM_START':'first',
            'KM_END':'first',
            'DATETIME_UTC':list,
            'event_duration':'first',
            'SPEED_AVG':list, #[list, lambda x: x[0:event_beginning_step].dropna().mean()],
            'SPEED_SD':list,
            'SPEED_MAX':list,
            'SPEED_MIN':list,
            'N_VEHICLES':list,
            'EMERGENCY_LANE':'first',
            'LANES':'first',
            'ROAD_TYPE':'first',
            'EVENT_DETAIL':lambda x: x.values[steps_behind_event],
            'EVENT_TYPE':lambda x: x.values[steps_behind_event],
            'WEATHER': list,
            'DISTANCE': list,
            'TEMPERATURE': list,
            'MIN_TEMPERATURE': list,
            'MAX_TEMPERATURE': list
        })
        # set sensor distance from event start and end
        joined_df['distance_start'] = joined_df['KM'] - joined_df['KM_START']
        joined_df['distance_end'] = joined_df['KM'] - joined_df['KM_END']
        joined_df.drop(['KM_END', 'KM_START'], axis=1, inplace=True)

        # split the last m measurements into separate columns
        def split_prediction_fields(row, event_beginning_step):
            return pd.Series((
                row.DATETIME_UTC[:event_beginning_step],
                row.DATETIME_UTC[event_beginning_step:],
                row.SPEED_AVG[:event_beginning_step],
                row.SPEED_AVG[event_beginning_step:],
                row.SPEED_SD[:event_beginning_step],
                row.SPEED_MAX[:event_beginning_step],
                row.SPEED_MIN[:event_beginning_step],
                row.N_VEHICLES[:event_beginning_step],
                row.WEATHER[:event_beginning_step],
                row.DISTANCE[:event_beginning_step],
                row.TEMPERATURE[:event_beginning_step],
                row.MIN_TEMPERATURE[:event_beginning_step],
                row.MAX_TEMPERATURE[:event_beginning_step],
            ))

        print('Splitting time steps into separate columns...')

        columns_to_split = [
            'DATETIME_UTC', 'DATETIME_UTC_y', 'SPEED_AVG', 'SPEED_AVG_Y',
            'SPEED_SD', 'SPEED_MAX', 'SPEED_MIN', 'N_VEHICLES', 'WEATHER',
            'DISTANCE', 'TEMPERATURE', 'MIN_TEMPERATURE', 'MAX_TEMPERATURE'
        ]
        joined_df[columns_to_split] = joined_df.apply(
            split_prediction_fields,
            axis=1,
            event_beginning_step=steps_behind_event)

        for col_name in columns_to_split:
            if col_name.upper().endswith('_Y'):
                new_cols = [
                    '{}_{}'.format(col_name, i)
                    for i in range(0, steps_after_event + 1)
                ]
            else:
                new_cols = [
                    '{}_{}'.format(col_name, i)
                    for i in range(-steps_behind_event, 0)
                ]

            joined_df[new_cols] = pd.DataFrame(
                joined_df[col_name].values.tolist(), index=joined_df.index)

        # remove the residual list columns
        joined_df = joined_df.drop(columns_to_split, axis=1)

        # drop the rows for which all speeds are NaNs
        print('Dataset shape:', joined_df.shape)
        #print('Dropping not available speeds...')
        #joined_df.dropna(how='all', subset=[f'SPEED_AVG_{i}' for i in range(-steps_behind_event, 0)], inplace=True)
        #print('Dataset shape reduced to:', joined_df.shape)

        # set some of the target speeds to NaN if the event is shorter than 4 time steps
        joined_df.loc[joined_df['event_duration'] == 3,
                      'SPEED_AVG_Y_3'] = np.nan
        joined_df.loc[joined_df['event_duration'] == 2,
                      ['SPEED_AVG_Y_2', 'SPEED_AVG_Y_3']] = np.nan
        joined_df.loc[
            joined_df['event_duration'] == 1,
            ['SPEED_AVG_Y_1', 'SPEED_AVG_Y_2', 'SPEED_AVG_Y_3']] = np.nan
        joined_df.drop('event_duration', axis=1, inplace=True)

        # cast some columns to int
        joined_df = joined_df.astype({
            'EMERGENCY_LANE': 'int',
            'LANES': 'int',
            'ROAD_TYPE': 'int',
            'EVENT_DETAIL': 'int',
            'KEY': 'int',
            'KM': 'int',
            'event_index': 'int'
        })
        """
        if mode == 'train':
            # take random validation rows

            # random_indices = random.shuffle(joined_df.index)
            # validation_indices = random_indices[0: int(len(random_indices) * validation_split)]
            # train_df = joined_df.drop(validation_indices)
            # valid_df = joined_df.loc[validation_indices]
        """

        # save the base dataset
        filepath = data.get_path_preprocessed(mode, t, 'base_dataset.csv.gz')

        print('Saving base dataframe to {}'.format(filepath))
        joined_df.to_csv(filepath, index=False, compression='gzip')
        del joined_df
        print('Done')
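
A minimal usage sketch (hypothetical call site; the mode strings, paths and module layout depend on the surrounding project):

    # build and cache the base datasets for local validation, with 10 time steps
    # (2.5 hours) of speed history before each event and 3 steps after it
    create_base_dataset(mode='local', steps_behind_event=10, steps_after_event=3)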
Example #7
def add_possible_sensors(events_df):
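    # Attach to each event the full list of sensor KMs installed on its road.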
    sensors = data.sensors()
    res_df = sensors[['KEY', 'KM']].drop_duplicates().sort_values(
        ['KEY', 'KM']).groupby('KEY').agg(list)
    res_df = res_df.rename(columns={'KM': 'ROAD_SENSORS'})
    return events_df.merge(res_df, on='KEY', how='left')
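
These snippets share a common preamble that the source page omits. A minimal sketch of the assumed imports and column-name constants (the `data` and `utility` modules are project-specific; the constant values and module paths are assumptions inferred from how they are used above):

    import numpy as np
    import pandas as pd

    from src import data, utility  # hypothetical project modules
    from src.utility import merge_speed_events, print_memory_usage  # hypothetical

    # column-name constants, inferred from usage in the snippets
    KEY = 'KEY'
    KM = 'KM'
    DATETIME = 'DATETIME_UTC'
    SPEED_AVG = 'SPEED_AVG'
    SPEED_MAX = 'SPEED_MAX'
    SPEED_MIN = 'SPEED_MIN'
    SPEED_SD = 'SPEED_SD'
    N_CARS = 'N_VEHICLES'
    EVENT_TYPE = 'EVENT_TYPE'
    ROAD_TYPE = 'ROAD_TYPE'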