def extract_feature(self):
        """Compute the mean ``SPEED_AVG`` for every ``ROAD_TYPE``.

        Speed readings are loaded according to ``self.mode``, rows containing
        any missing value are discarded, and the remainder is inner-joined
        with the sensor table on ``(KEY, KM)``.

        Returns:
            DataFrame with one row per road type and the columns
            ``ROAD_TYPE`` and ``avg_speed_roadtype``.

        Raises:
            ValueError: if ``self.mode`` is neither ``'local'`` nor
                ``'full'``.
        """
        print('Loading datasets')

        if self.mode == 'local':
            tr = data.speeds_original('train')
            te = data.speed_test_masked()
        elif self.mode == 'full':
            tr = data.speeds(mode='full')
            te = data.speeds_original('test2')
        else:
            # Fail early with a clear message instead of crashing further
            # down on an unset placeholder.
            raise ValueError(
                "unsupported mode {!r}: expected 'local' or 'full'".format(
                    self.mode))

        speeds = pd.concat([tr, te])
        # Drop the partial frames so only the concatenation stays referenced.
        del tr
        del te

        sensors = data.sensors()
        print('Done')

        # Both frames share the key column names, so a plain ``on=`` join is
        # enough. Only ROAD_TYPE and SPEED_AVG are aggregated below, so no
        # timestamp parsing is required on the merged frame.
        df = pd.merge(speeds.dropna(), sensors, on=[KEY, KM])

        road_type_means = df[['ROAD_TYPE', 'SPEED_AVG']] \
            .groupby('ROAD_TYPE').mean().reset_index()
        return road_type_means.rename(
            columns={'SPEED_AVG': 'avg_speed_roadtype'})
    def extract_feature(self):
        """Build weekday and weekend flags on a 15-minute timestamp grid.

        The grid runs from the smallest to the largest ``DATETIME_UTC`` found
        in the speed data selected by ``self.mode``.

        Returns:
            DataFrame with the timestamp column (named by ``DATETIME``),
            ``WEEK_DAY`` (value of ``Series.dt.weekday``) and ``IS_WEEKEND``
            (1 when ``WEEK_DAY`` is 5 or 6, otherwise 0). A column called
            ``DATETIME_UTC`` is returned as ``DATETIME_UTC_y_0``.

        Raises:
            ValueError: if ``self.mode`` is neither ``'local'`` nor
                ``'full'``.
        """
        if self.mode == 'local':
            tr = data.speeds_original('train')
            te = data.speed_test_masked()
        elif self.mode == 'full':
            tr = data.speeds(mode='full')
            te = data.speeds_original('test2')
        else:
            # Fail early with a clear message instead of crashing further
            # down on an unset placeholder.
            raise ValueError(
                "unsupported mode {!r}: expected 'local' or 'full'".format(
                    self.mode))

        speeds = pd.concat([tr, te])
        del tr
        del te

        print('Extracting min and max timestamps...')
        min_datetime = speeds.DATETIME_UTC.min()
        max_datetime = speeds.DATETIME_UTC.max()
        print('Done')

        # Build the grid directly as a column: this yields a fresh frame with
        # a default integer index, so the assignments below never target a
        # slice of another frame.
        df = pd.DataFrame({
            DATETIME: pd.date_range(min_datetime, max_datetime, freq='15min'),
        })
        df['WEEK_DAY'] = df[DATETIME].dt.weekday
        # Vectorised membership test; int64 matches the 0/1 integers the
        # flag is expected to hold.
        df['IS_WEEKEND'] = df['WEEK_DAY'].isin([5, 6]).astype('int64')
        return df.rename(columns={'DATETIME_UTC': 'DATETIME_UTC_y_0'})
# Example no. 3
# 0
    def extract_feature(self):
        """Average each sensor's speed statistics by time of day.

        ``DATETIME_UTC`` is replaced by its ``'%H:%M:%S'`` string and the
        readings are averaged per ``(KEY, KM, DATETIME_UTC)``.

        Returns:
            DataFrame with ``KEY``, ``KM``,
            ``DATETIME_UTC_SPEED_SENSOR_HOUR`` and the five
            ``avg_*_sensor_hour`` columns.
        """
        readings = None

        if self.mode == 'local':
            readings = pd.concat([data.speeds_original('train'),
                                  data.speed_test_masked()])
        elif self.mode == 'full':
            readings = pd.concat([data.speeds(mode='full'),
                                  data.speeds_original('test2')])

        # Keep only the clock time so equal times on different dates
        # fall into the same group.
        time_of_day = readings.DATETIME_UTC.dt.strftime('%H:%M:%S')
        readings['DATETIME_UTC'] = time_of_day

        group_keys = ['KEY', 'KM', 'DATETIME_UTC']
        stat_cols = ['SPEED_AVG', 'SPEED_SD', 'SPEED_MIN', 'SPEED_MAX',
                     'N_VEHICLES']
        averaged = readings[group_keys + stat_cols].groupby(group_keys).mean()
        averaged = averaged.reset_index()

        output_names = {
            'DATETIME_UTC': 'DATETIME_UTC_SPEED_SENSOR_HOUR',
            'SPEED_AVG': 'avg_speed_sensor_hour',
            'SPEED_SD': 'avg_speed_sd_sensor_hour',
            'SPEED_MIN': 'avg_speed_min_sensor_hour',
            'SPEED_MAX': 'avg_speed_max_sensor_hour',
            'N_VEHICLES': 'avg_n_vehicles_sensor_hour',
        }
        return averaged.rename(columns=output_names)
# Example no. 4
# 0
    def extract_feature(self):
        """Average the vehicle count of each sensor per day of the week.

        Missing ``N_VEHICLES`` values are counted as 0 before averaging.

        Returns:
            DataFrame with ``KEY``, ``KM``, ``day`` (value of
            ``Series.dt.weekday``) and ``avg_n_vehicles_sensor_per_day``.
        """
        speeds = None

        if self.mode == 'local':
            speeds = pd.concat([data.speeds_original('train'),
                                data.speed_test_masked()])
        elif self.mode == 'full':
            speeds = pd.concat([data.speeds(mode='full'),
                                data.speeds_original('test2')])

        counts = speeds.loc[:, ["DATETIME_UTC", "KEY", "KM", "N_VEHICLES"]]
        counts = counts.assign(
            N_VEHICLES=counts.N_VEHICLES.fillna(0).astype(int),
            # weekday number of the reading's timestamp
            day=counts.DATETIME_UTC.dt.weekday,
        )

        sensor_day = ['KEY', 'KM', 'day']
        per_day = counts.groupby(sensor_day)[['N_VEHICLES']].mean()
        per_day = per_day.reset_index()

        return per_day.rename(
            columns={'N_VEHICLES': 'avg_n_vehicles_sensor_per_day'})
    def extract_feature(self):
        """Attach the readings taken exactly 1..n days earlier to each row.

        For every ``(KEY, KM, DATETIME_UTC)`` row and every ``i`` in
        ``1..self.n_days_before`` the speed data is left-joined on the same
        sensor at ``DATETIME_UTC - i days``; the joined measurements are
        suffixed with ``_{i}_DAY_BEFORE``.

        Returns:
            DataFrame with ``KEY``, ``DATETIME_UTC_y_0``, ``KM`` and the
            lagged measurement columns.
        """
        history = None
        if self.mode == 'local':
            history = pd.concat([
                data.speeds_original('train').drop(['KEY_2'], axis=1),
                data.speed_test_masked().drop(['KEY_2'], axis=1),
            ])
        elif self.mode == 'full':
            history = pd.concat([
                data.speeds(mode='full').drop(['KEY_2'], axis=1),
                data.speeds_original('test2').drop(['KEY_2'], axis=1),
            ])

        features = history[['KEY', 'DATETIME_UTC', 'KM']].copy()
        # Rename the timestamp on the lookup side so it cannot collide with
        # the timestamp kept in ``features``.
        history = history.rename(columns={'DATETIME_UTC': 'DATETIME_UTC_drop'})

        measured = ('SPEED_AVG', 'SPEED_SD', 'SPEED_MIN', 'SPEED_MAX',
                    'N_VEHICLES')
        for days_back in tqdm(range(1, self.n_days_before + 1)):
            lookup_col = 'DATETIME_UTC_{}_D'.format(days_back)
            features[lookup_col] = \
                features.DATETIME_UTC - pd.Timedelta(days=days_back)

            joined = pd.merge(features, history, how='left',
                              left_on=['KEY', 'KM', lookup_col],
                              right_on=['KEY', 'KM', 'DATETIME_UTC_drop'])
            # The helper timestamps are only needed for the join itself.
            joined = joined.drop([lookup_col, 'DATETIME_UTC_drop'], axis=1)

            lagged_names = {
                name: '{}_{}_DAY_BEFORE'.format(name, days_back)
                for name in measured
            }
            features = joined.rename(columns=lagged_names)

        return features.rename(columns={'DATETIME_UTC': 'DATETIME_UTC_y_0'})
    def extract_feature(self):
        """Join every sensor/timestamp pair with hourly means of a past hour.

        A 15-minute timestamp grid is crossed with the distinct ``(KEY, KM)``
        sensors. Each timestamp is floored to the hour and shifted back by
        ``pd.DateOffset(1)``, then matched against the hourly-resampled mean
        of the speed readings of the same sensor. Always reads the
        train + masked-test speeds, whatever ``self.mode`` is.

        Returns:
            The merged DataFrame, with the matched statistics renamed to
            ``SPEED_AVG_D-1``, ``SPEED_MAX_D-1``, ``SPEED_MIN_D-1``,
            ``SPEED_SD_D-1`` and ``N_VEHICLES_D-1``.
        """
        speeds = pd.concat([data.speeds_original('train'),
                            data.speed_test_masked()])

        print('Extracting min and max timestamps...')
        first_ts = speeds.DATETIME_UTC.min()
        last_ts = speeds.DATETIME_UTC.max()
        sensors = data.sensors().drop_duplicates([KEY, KM])
        print('Done')

        timeline = pd.DataFrame({
            DATETIME: pd.date_range(first_ts, last_ts, freq='15min'),
        })
        print('Shifting hours')
        # Hour bucket of the timestamp, moved back by one DateOffset unit.
        timeline['DATETIME_HOUR'] = \
            timeline[DATETIME].dt.floor('1H') - pd.DateOffset(1)
        print('Done')

        print('Creating skeleton')
        # A constant helper key on both sides turns the merge into a
        # cross join of sensors and timestamps.
        skeleton = pd.merge(sensors[[KEY, KM]].assign(MERGE=0),
                            timeline.assign(MERGE=0),
                            on='MERGE')
        skeleton[DATETIME] = pd.to_datetime(skeleton[DATETIME])
        skeleton = skeleton.set_index(DATETIME)
        print('Done')

        print('Merging with speeds..')
        stat_cols = [SPEED_AVG, SPEED_MAX, SPEED_MIN, SPEED_SD, N_CARS]

        def _hourly_mean(sensor_rows):
            # Hourly mean of one sensor's readings, restricted to the stats.
            return sensor_rows.set_index(DATETIME).resample('H').mean()[stat_cols]

        hourly = speeds.groupby([KEY, KM]).apply(_hourly_mean).reset_index()

        merged = pd.merge(skeleton.reset_index(),
                          hourly,
                          left_on=[KEY, KM, 'DATETIME_HOUR'],
                          right_on=[KEY, KM, DATETIME])
        output_names = {
            'DATETIME_UTC_x': 'DATETIME_UTC',
            SPEED_AVG: 'SPEED_AVG_D-1',
            SPEED_MAX: 'SPEED_MAX_D-1',
            SPEED_MIN: 'SPEED_MIN_D-1',
            SPEED_SD: 'SPEED_SD_D-1',
            N_CARS: 'N_VEHICLES_D-1',
        }
        merged = merged.rename(columns=output_names)
        print('Done')
        return merged
# Example no. 7
# 0
    def extract_feature(self):
        """Distance of each speed reading from the start of a covering event.

        Events (train + test) are inner-joined with the speed readings on
        ``(KEY, DATETIME_UTC)``; only readings whose ``KM`` lies inside the
        event's ``[KM_START, KM_END]`` interval are kept.

        Returns:
            The filtered merge with an extra ``start_event_distance`` column.

        Raises:
            ValueError: if ``self.mode`` is neither ``'local'`` nor
                ``'full'``.
        """
        if self.mode == 'local':
            tr = data.speeds_original('train')
            te = data.speed_test_masked()
        elif self.mode == 'full':
            tr = data.speeds(mode='full')
            te = data.speeds_original('test2')
        else:
            # Without this guard the cleanup below would hit unbound names.
            raise ValueError(
                "unsupported mode {!r}: expected 'local' or 'full'".format(
                    self.mode))
        f = pd.concat([tr, te])
        del tr
        del te

        etr = data.events(self.mode, 'train')
        ete = data.events(self.mode, 'test')
        ef = pd.concat([etr, ete])
        del etr
        del ete

        m = pd.merge(ef, f, on=['KEY', 'DATETIME_UTC'])
        # Copy so the new column is written to an independent frame rather
        # than to a filtered view of the merge result.
        m = m[(m.KM >= m.KM_START) & (m.KM <= m.KM_END)].copy()

        # NOTE(review): the distance definition is an assumption -- taken as
        # the offset of the sensor KM from the event's KM_START (non-negative
        # after the filter above). Confirm against the intended feature.
        m['start_event_distance'] = m['KM'] - m['KM_START']
        return m
# Example no. 8
# 0
    def extract_feature(self):
        """Average the speed statistics over all readings sharing a ``KEY``.

        Returns:
            DataFrame with ``KEY`` and the columns ``avg_speed_street``,
            ``avg_speed_sd_street``, ``avg_speed_min_street``,
            ``avg_speed_max_street`` and ``avg_n_vehicles_street``.
        """
        readings = None

        if self.mode == 'local':
            readings = pd.concat([data.speeds_original('train'),
                                  data.speed_test_masked()])
        elif self.mode == 'full':
            readings = pd.concat([data.speeds(mode='full'),
                                  data.speeds_original('test2')])

        stat_cols = ['SPEED_AVG', 'SPEED_SD', 'SPEED_MIN', 'SPEED_MAX',
                     'N_VEHICLES']
        per_street = readings[['KEY'] + stat_cols].groupby(['KEY']).mean()
        per_street = per_street.reset_index()

        street_names = {
            'SPEED_AVG': 'avg_speed_street',
            'SPEED_SD': 'avg_speed_sd_street',
            'SPEED_MIN': 'avg_speed_min_street',
            'SPEED_MAX': 'avg_speed_max_street',
            'N_VEHICLES': 'avg_n_vehicles_street',
        }
        return per_street.rename(columns=street_names)
    def extract_feature(self):
        """Mean change in ``SPEED_AVG`` after an event's earliest row, per ``EVENT_TYPE``.

        For each ``(START_DATETIME_UTC, END_DATETIME_UTC, KEY)`` event group
        the row with the smallest ``DATETIME_UTC`` is kept. The reference
        speed ``speed_avg-1`` is the ``SPEED_AVG`` read 15 minutes before that
        timestamp; ``speed_avg_1`` .. ``speed_avg_4`` are read 15, 30, 45 and
        60 minutes after the reference time. Only readings whose ``KM`` lies
        inside ``[KM_START, KM_END]`` are used.

        Returns:
            DataFrame with ``EVENT_TYPE`` and the mean of ``diff-1-step`` ..
            ``diff-4-step`` (``speed_avg_k - speed_avg-1``) for each type.
        """
        speeds = None

        if self.mode == 'local':
            tr = data.speeds_original('train')
            te = data.speed_test_masked()
            speeds = pd.concat([tr, te])
            del tr
            del te

        elif self.mode == 'full':
            tr = data.speeds(mode='full')
            te = data.speeds_original('test2')
            speeds = pd.concat([tr, te])
            del tr
            del te

        # NOTE(review): any other mode leaves ``speeds`` as None and fails at
        # the column selection further down.

        etr = data.events(self.mode, 'train')
        ete = data.events(self.mode, 'test')
        ef = pd.concat([etr, ete])
        del etr
        del ete

        t = ef[['START_DATETIME_UTC', 'END_DATETIME_UTC', 'KEY', 'KM_START', \
        'KM_END', 'DATETIME_UTC', 'EVENT_TYPE', 'EVENT_DETAIL']]
        # Keep, per event group, the row holding the smallest DATETIME_UTC.
        # NOTE(review): ``ef`` is concatenated without resetting the index;
        # if labels repeat across train/test, ``.loc`` may return more than
        # one row per group -- confirm.
        t = t.loc[t.groupby(['START_DATETIME_UTC', 'END_DATETIME_UTC', 'KEY'],
                            as_index=False).DATETIME_UTC.idxmin()]
        # Reference time: one 15-minute step before the kept timestamp.
        t['DATETIME_UTC-1'] = t.DATETIME_UTC - pd.Timedelta(minutes=15)
        t = t.drop(['START_DATETIME_UTC', 'END_DATETIME_UTC', 'DATETIME_UTC'],
                   axis=1)
        speeds = speeds[['KEY', 'KM', 'DATETIME_UTC', 'SPEED_AVG']]

        # Speeds at the reference time, restricted to sensors whose KM falls
        # within the event's KM range.
        final = pd.merge(t,
                         speeds,
                         left_on=['KEY', 'DATETIME_UTC-1'],
                         right_on=['KEY', 'DATETIME_UTC'])
        final = final.rename(columns={
            'SPEED_AVG': 'speed_avg-1',
            'KM': 'KM-1'
        })
        final = final.drop(['DATETIME_UTC'], axis=1)
        final = final[(final['KM-1'] >= final.KM_START)
                      & (final['KM-1'] <= final.KM_END)]

        # One frame per later step (1..4 quarter-hours after the reference
        # time), each carrying ``speed_avg_<k>`` for in-range sensors.
        ds = []
        for ts in range(4):
            m_ = t.copy()
            print(len(m_))
            quarters_delta = ts + 1
            m_['DATETIME_UTC_{}'.format(
                quarters_delta)] = m_['DATETIME_UTC-1'] + pd.Timedelta(
                    minutes=15 * quarters_delta)
            m_ = pd.merge(m_, speeds, \
                        left_on=['KEY', 'DATETIME_UTC_{}'.format(quarters_delta)], \
                        right_on=['KEY', 'DATETIME_UTC'], how='left')
            m_ = m_.rename(columns={'SPEED_AVG': 'speed_avg_{}'.format(quarters_delta), \
                                    'KM': 'KM_{}'.format(quarters_delta)})
            m_ = m_.drop(['DATETIME_UTC'], axis=1)
            m_ = m_[(m_['KM_{}'.format(quarters_delta)] >= m_.KM_START)
                    & (m_['KM_{}'.format(quarters_delta)] <= m_.KM_END)]
            # The sensor KM is stored under the shared name 'KM-1' in every
            # step frame, so it is among the common columns of later merges.
            m_ = m_.rename(columns={'KM_{}'.format(quarters_delta): 'KM-1'})
            m_ = m_.drop(['DATETIME_UTC_{}'.format(quarters_delta)], axis=1)
            print(len(m_))
            ds.append(m_)

        # 'KM-1' is dropped from the reference frame here; it re-enters
        # ``final`` through the first step frame merged below.
        final = final.drop(['KM-1'], axis=1)
        for i in range(len(ds)):
            df = ds[i]
            # j is not used below.
            j = i + 1
            print('shape before {}'.format(len(final)))
            # No keys given: pandas joins on all columns the two frames share.
            final = pd.merge(final, df)
            print('shape after {}'.format(len(final)))

        final = final[[
            'EVENT_TYPE', 'speed_avg-1', 'speed_avg_1', 'speed_avg_2',
            'speed_avg_3', 'speed_avg_4'
        ]]
        # Speed change of each later step relative to the reference speed.
        final['diff-1-step'] = final['speed_avg_1'] - final['speed_avg-1']
        final['diff-2-step'] = final['speed_avg_2'] - final['speed_avg-1']
        final['diff-3-step'] = final['speed_avg_3'] - final['speed_avg-1']
        final['diff-4-step'] = final['speed_avg_4'] - final['speed_avg-1']
        final = final.drop([
            'speed_avg_1', 'speed_avg-1', 'speed_avg_2', 'speed_avg_3',
            'speed_avg_4'
        ],
                           axis=1)
        return final.groupby(['EVENT_TYPE'], as_index=False).mean()