Ejemplo n.º 1
0
    def __get_primitive_aggregation_df(df: pd.DataFrame):
        """Aggregate every column of ``df`` per ``BOOKING_ID``.

        For each non-key column the statistics min, max, mean, sum, mad
        (mean absolute deviation), skew, median and std are computed per
        booking.  The resulting two-level column labels are flattened to
        ``<column>_<statistic>`` and ``BOOKING_ID`` is restored as a regular
        column.

        :param df: frame containing a ``BOOKING_ID`` column plus the columns
            to aggregate.
        :return: one row per booking with the flattened statistic columns.
        """
        logger.info('starting group by primitive')

        def mad(values: pd.Series):
            # Mean absolute deviation around the mean.  The 'mad' string
            # alias is not available on pandas >= 2.0, so the statistic is
            # computed explicitly.  The function is deliberately named
            # ``mad`` because pandas labels the output column after the
            # callable's name, which keeps the ``<column>_mad`` labels.
            return (values - values.mean()).abs().mean()

        statistics = ['min', 'max', 'mean', 'sum', mad, 'skew', 'median',
                      'std']
        aggregated = df.groupby(BOOKING_ID).agg(statistics)
        # Flatten the (column, statistic) tuples into single string labels.
        aggregated.columns = [
            '_'.join(tup) for tup in aggregated.columns.values
        ]
        aggregated = aggregated.reset_index()

        logger.info('finish group by primitive')
        return aggregated
Ejemplo n.º 2
0
    def __get_total_acceleration_and_gyro_df(df: pd.DataFrame):
        """Add the accelerometer and gyroscope vector magnitudes to ``df``.

        Two columns are written onto the frame that was passed in:
        ``TOTAL_ACC`` and ``TOTAL_GYRO``.  Each is the square root of the sum
        of the squares of its three axis columns.  The same frame object is
        returned.
        """
        logger.info('starting count total acceleration and gyro')

        magnitude_plan = (
            (TOTAL_ACC, (ACC_X, ACC_Y, ACC_Z)),
            (TOTAL_GYRO, (GYRO_X, GYRO_Y, GYRO_Z)),
        )
        for target, (axis_x, axis_y, axis_z) in magnitude_plan:
            squared_sum = (df[axis_x]**2) + (df[axis_y]**2) + (df[axis_z]**2)
            df[target] = np.sqrt(squared_sum)

        logger.info('finish count total acceleration and gyro')
        return df
Ejemplo n.º 3
0
    def __get_mean_absolute_value_df(df: pd.DataFrame):
        """Return the per-booking mean of the absolute value of each column.

        Output columns are named ``<column>_abs_mean`` and ``BOOKING_ID`` is
        restored as a regular column.
        """
        logger.info('starting get mean absolute')

        # NOTE(review): abs() is applied to every column, BOOKING_ID included,
        # before grouping -- presumably ids are never negative; confirm.
        magnitudes = df.apply(abs)
        per_booking = magnitudes.groupby(BOOKING_ID).mean()
        per_booking.columns = [
            '{}_abs_mean'.format(name) for name in per_booking.columns
        ]
        result = per_booking.reset_index()

        logger.info('finish get mean absolute')
        return result
Ejemplo n.º 4
0
    def __get_kurtosis_aggregation_df(df: pd.DataFrame):
        """Return the per-booking kurtosis of each column of ``df``.

        ``pd.DataFrame.kurt`` is applied to every ``BOOKING_ID`` group, the
        ``BOOKING_ID`` column of that result is dropped, the remaining columns
        are renamed ``<column>_kurt`` and ``BOOKING_ID`` is restored as a
        regular column from the group index.
        """
        logger.info('starting group by kurtosis')

        kurt_by_booking = df.groupby(BOOKING_ID).apply(pd.DataFrame.kurt)
        # Remove the key column from the applied result before renaming.
        kurt_by_booking = kurt_by_booking.drop(BOOKING_ID, axis=1)
        kurt_by_booking.columns = [
            '{}_kurt'.format(name) for name in kurt_by_booking.columns
        ]
        result = kurt_by_booking.reset_index()

        logger.info('finish group by kurtosis')
        return result
Ejemplo n.º 5
0
    def __get_hilbert_mean_and_hann_window_mean_df(self, df: pd.DataFrame):
        """Build per-booking Hilbert-mean and Hann-window-mean features.

        For each of ``TOTAL_ACC``, ``TOTAL_GYRO`` and ``SPEED`` two scalars
        are computed per ``BOOKING_ID``:

        * the mean of ``np.abs(hilbert(values))``;
        * the mean of the values convolved (``mode='same'``) with
          ``hann(10)`` and divided by the sum of that window.

        The six single-feature frames are combined with ``self.__merge_df``
        in the order acc-hilbert, acc-hann, gyro-hilbert, gyro-hann,
        speed-hilbert, speed-hann.

        :param df: frame with ``BOOKING_ID``, ``TOTAL_ACC``, ``TOTAL_GYRO``
            and ``SPEED`` columns.
        :return: whatever ``self.__merge_df`` returns for the six frames.
        """
        logger.info('starting hilbert mean and hann window mean')

        # The smoothing kernel does not depend on the data, so build it (and
        # its normalising sum) once instead of twice for every group.
        window = hann(10)
        window_total = sum(window)

        def hilbert_mean(group: pd.DataFrame, col):
            return np.abs(hilbert(group[col].values)).mean()

        def hann_window_mean(group: pd.DataFrame, col):
            smoothed = convolve(group[col].values, window,
                                mode='same') / window_total
            return smoothed.mean()

        # (source column, per-group reducer, output feature name); the order
        # here is the order the frames are handed to the merge helper.
        feature_plan = [
            (TOTAL_ACC, hilbert_mean, TOTAL_ACC_HILBERT_MEAN),
            (TOTAL_ACC, hann_window_mean, TOTAL_ACC_HANN_MEAN),
            (TOTAL_GYRO, hilbert_mean, TOTAL_GYRO_HILBERT_MEAN),
            (TOTAL_GYRO, hann_window_mean, TOTAL_GYRO_HANN_MEAN),
            (SPEED, hilbert_mean, SPEED_HILBERT_MEAN),
            (SPEED, hann_window_mean, SPEED_HANN_MEAN),
        ]

        list_of_df = []
        for col, reducer, feature_name in feature_plan:
            feature_df = df[[BOOKING_ID, col]].groupby(BOOKING_ID).apply(
                reducer, col).reset_index()
            feature_df.columns = [BOOKING_ID, feature_name]
            list_of_df.append(feature_df)

        df = self.__merge_df(list_of_df)

        logger.info('finish hilbert mean and hann window mean')
        return df
Ejemplo n.º 6
0
    def __get_agg_last_n_sec(self, df, n):
        """Aggregate the ``n`` rows with the largest ``SECOND`` per booking.

        Rows are sorted by ``BOOKING_ID`` ascending and ``SECOND`` descending,
        the first ``n`` rows of every booking are kept, and for each remaining
        column the statistics min, max, mean, std, mad (mean absolute
        deviation) and skew are computed.  Columns are named
        ``<column>_<statistic>_last_<n>_sec``.  The mean-change features for
        the same rows (suffix ``_last_<n>_sec``) are left-merged onto the
        result on ``BOOKING_ID``.

        :param df: frame with ``BOOKING_ID`` and ``SECOND`` columns plus the
            columns to aggregate.
        :param n: number of rows to keep per booking.
        :return: one row per booking with the aggregated features.
        """
        logger.info('start agg_for_last_{}_sec'.format(n))

        def mad(values):
            # Mean absolute deviation around the mean.  The 'mad' string
            # alias is not available on pandas >= 2.0; naming the callable
            # ``mad`` keeps the ``<column>_mad_...`` output labels.
            return (values - values.mean()).abs().mean()

        suffix = '_last_{}_sec'.format(n)

        newest_first = df.sort_values([BOOKING_ID, SECOND],
                                      ascending=[True, False])
        tail = newest_first.groupby(BOOKING_ID).head(n).reset_index(drop=True)

        # NOTE(review): ``tail`` is in descending SECOND order when handed to
        # the mean-change helper -- confirm that is the intended direction.
        df_mean_change_rate = self.__get_mean_change_and_mean_change_rate_df(
            tail, suffix)

        aggregated = tail.groupby(BOOKING_ID).agg(
            ['min', 'max', 'mean', 'std', mad, 'skew'])
        # Flatten (column, statistic) tuples and tag them with the window.
        aggregated.columns = [
            '_'.join(tup) + suffix for tup in aggregated.columns.values
        ]
        aggregated = aggregated.reset_index()

        result = pd.merge(aggregated,
                          df_mean_change_rate,
                          on=BOOKING_ID,
                          how='left')

        logger.info('finish agg_for_last_{}_sec'.format(n))
        return result
Ejemplo n.º 7
0
    def __get_moving_average_std_df(self, df: pd.DataFrame):
        """Build per-booking features from rolling standard deviations.

        For each window size in 10, 30, 60 and 90 the rolling standard
        deviation of ``TOTAL_ACC``, ``TOTAL_GYRO`` and ``SPEED`` is computed
        within every ``BOOKING_ID``, and the standard deviation of those
        rolling values is then taken per booking.  Columns are named
        ``MA<window>_<column>_std``.  The per-window frames are combined with
        ``self.__merge_df``.

        :param df: frame with ``BOOKING_ID``, ``TOTAL_ACC``, ``TOTAL_GYRO``
            and ``SPEED`` columns.
        :return: whatever ``self.__merge_df`` returns for the four frames.
        """
        logger.info('starting moving average std')

        col_list = [BOOKING_ID, TOTAL_ACC, TOTAL_GYRO, SPEED]
        df = df[col_list]
        list_of_df = []

        rolling_windows = [10, 30, 60, 90]
        for window in rolling_windows:
            rolled = df.groupby(BOOKING_ID).rolling(window).std()
            # Older pandas keeps the grouping column among the rolled values,
            # newer releases omit it.  Tolerate both so the key can be
            # rebuilt from the index below without a KeyError.
            rolled = rolled.drop(BOOKING_ID, axis=1, errors='ignore')
            rolled = rolled.reset_index()[col_list]

            spread = rolled.groupby(BOOKING_ID).std().reset_index()
            spread.columns = [BOOKING_ID] + [
                'MA{}_{}_std'.format(window, x) for x in col_list[1:]
            ]
            list_of_df.append(spread)

        df = self.__merge_df(list_of_df)

        logger.info('finish moving average std')
        return df
Ejemplo n.º 8
0
    def __get_quantile_df(self, df: pd.DataFrame):
        """Build per-booking quantile features.

        For each quantile in 0.01, 0.05, 0.95 and 0.99 the per-``BOOKING_ID``
        quantile of ``TOTAL_ACC``, ``TOTAL_GYRO`` and ``SPEED`` is computed.
        Columns are named ``q<quantile>_<column>`` with every ``.`` removed
        from the label (e.g. the 0.01 quantile yields a ``q001_`` prefix).
        The per-quantile frames are combined with ``self.__merge_df``.

        :param df: frame with ``BOOKING_ID``, ``TOTAL_ACC``, ``TOTAL_GYRO``
            and ``SPEED`` columns.
        :return: whatever ``self.__merge_df`` returns for the four frames.
        """
        logger.info('starting quantile df')

        col_list = [BOOKING_ID, TOTAL_ACC, TOTAL_GYRO, SPEED]
        df = df[col_list]
        list_of_df = []

        quantile_list = [.01, .05, .95, .99]
        for quantile_ in quantile_list:
            temp = df.groupby(BOOKING_ID).quantile(quantile_)
            # Clear any label on the column axis by assignment; ``del`` on
            # ``Index.name`` raises AttributeError on pandas versions that
            # expose ``name`` as a property without a deleter.
            temp.columns.name = None
            temp = temp.reset_index()
            temp.columns = [BOOKING_ID] + [
                'q{}_{}'.format(quantile_, x).replace('.', '')
                for x in col_list[1:]
            ]
            list_of_df.append(temp)

        df = self.__merge_df(list_of_df)

        logger.info('finish quantile df')
        return df
Ejemplo n.º 9
0
    def __get_iqr_df(df: pd.DataFrame):
        """Return the per-booking interquartile range of three signals.

        The 0.25 and 0.75 quantiles of ``TOTAL_ACC``, ``TOTAL_GYRO`` and
        ``SPEED`` are computed per ``BOOKING_ID``; their difference is stored
        in ``TOTAL_ACC_IQR``, ``TOTAL_GYRO_IQR`` and ``SPEED_IQR``.  Only the
        key column and those three columns are returned.
        """
        logger.info('starting iqr df')

        signal_cols = [TOTAL_ACC, TOTAL_GYRO, SPEED]
        signals = df[[BOOKING_ID] + signal_cols]

        lower = signals.groupby(BOOKING_ID).quantile(.25).reset_index()
        upper = signals.groupby(BOOKING_ID).quantile(.75).reset_index()
        lower.columns = [BOOKING_ID] + [
            '{}_25'.format(name) for name in signal_cols
        ]
        upper.columns = [BOOKING_ID] + [
            '{}_75'.format(name) for name in signal_cols
        ]

        quartiles = pd.merge(lower, upper, on=BOOKING_ID, how='left')

        iqr_cols = [TOTAL_ACC_IQR, TOTAL_GYRO_IQR, SPEED_IQR]
        for signal, iqr_name in zip(signal_cols, iqr_cols):
            quartiles[iqr_name] = (quartiles[signal + '_75'] -
                                   quartiles[signal + '_25'])

        result = quartiles[[BOOKING_ID] + iqr_cols]

        logger.info('finish iqr df')
        return result
Ejemplo n.º 10
0
    def __get_mean_change_and_mean_change_rate_df(self,
                                                  df: pd.DataFrame,
                                                  additional_col_name=''):
        """Build per-booking mean-change features.

        For each of ``TOTAL_ACC``, ``TOTAL_GYRO`` and ``SPEED`` the mean of
        the differences between consecutive rows (in the row order of ``df``)
        is computed per ``BOOKING_ID``.  A booking with fewer than two rows
        has no consecutive difference and yields NaN.

        The previous implementation applied ``np.diff`` to the whole group
        frame.  ``np.diff`` works along the last axis by default, so on a
        two-column (key, signal) frame it differenced the columns against
        each other instead of differencing consecutive rows of the signal.
        The signal column is now differenced explicitly.

        Mean change *rate* is intentionally not produced: the three signals
        contain many zeros, so the rate can become inf and carries no
        meaning.

        :param df: frame with ``BOOKING_ID``, ``TOTAL_ACC``, ``TOTAL_GYRO``
            and ``SPEED`` columns.
        :param additional_col_name: suffix appended to each output feature
            name.
        :return: whatever ``self.__merge_df`` returns for the acc, gyro and
            speed frames (in that order).
        """
        def mean_step(group: pd.DataFrame, col):
            steps = np.diff(group[col].values)
            # Avoid the "mean of empty slice" warning for one-row groups.
            return steps.mean() if steps.size else np.nan

        # (log label, source column, output feature name)
        targets = [
            ('acc', TOTAL_ACC, TOTAL_ACC_MEAN_CHANGE),
            ('gyro', TOTAL_GYRO, TOTAL_GYRO_MEAN_CHANGE),
            ('speed', SPEED, SPEED_MEAN_CHANGE),
        ]

        list_of_df = []
        for label, col, feature_name in targets:
            logger.info('start mean change {}'.format(label))
            mean_change = df[[BOOKING_ID, col]].groupby(BOOKING_ID).apply(
                mean_step, col).reset_index()
            mean_change.columns = [
                BOOKING_ID, feature_name + additional_col_name
            ]
            logger.info('finish mean change {}'.format(label))
            list_of_df.append(mean_change)

        df = self.__merge_df(list_of_df)
        return df
Ejemplo n.º 11
0
    def __get_trend_features_df(self, df: pd.DataFrame):
        """Build per-booking linear-trend features.

        For each of ``TOTAL_ACC``, ``TOTAL_GYRO`` and ``SPEED`` a
        ``LinearRegression`` is fitted per ``BOOKING_ID`` with ``SECOND`` as
        the single regressor, and the first coefficient is kept as the trend
        feature.  The three frames are combined with ``self.__merge_df`` in
        the order acc, gyro, speed.
        """
        def slope_over_time(group: pd.DataFrame, col):
            # The regressor must be 2-D: one row per sample, one column.
            elapsed = group[SECOND].values.reshape(-1, 1)
            model = LinearRegression()
            model.fit(elapsed, group[col].values)
            return model.coef_[0]

        def trend_frame(signal, feature_name):
            slopes = df.groupby(BOOKING_ID).apply(slope_over_time,
                                                  signal).reset_index()
            slopes.columns = [BOOKING_ID, feature_name]
            return slopes

        logger.info('starting trend feature total acc')
        acc_trend = trend_frame(TOTAL_ACC, TOTAL_ACC_TREND)
        logger.info('finish trend feature total acc')

        logger.info('starting trend feature total gyro')
        gyro_trend = trend_frame(TOTAL_GYRO, TOTAL_GYRO_TREND)
        logger.info('finish trend feature total gyro')

        logger.info('starting trend feature speed')
        speed_trend = trend_frame(SPEED, SPEED_TREND)
        logger.info('finish trend feature speed')

        return self.__merge_df([acc_trend, gyro_trend, speed_trend])
Ejemplo n.º 12
0
                        help='Input folder of your label csv data.',
                        required=False)
    parser.add_argument('-o',
                        '--output_path',
                        type=str,
                        help='Output path for prediction result.',
                        required=False)

    args = parser.parse_args()
    mode = args.mode
    input_features_folder = args.input_features_folder
    input_label_folder = args.input_label_folder
    output_path = args.output_path

    if mode == 'training':
        logger.info('start load dataframe')

        df_features = read_multiple_csv_pandas(input_features_folder)
        df_label = read_multiple_csv_pandas(input_label_folder)
        df_label = df_label.groupby(BOOKING_ID).agg({
            LABEL: 'last'
        }).reset_index()

        logger.info('finished load dataframe')
        logger.info('start feature engineering')

        fe = FeatureEngineering()
        df_features = fe.transform(df_features)

        logger.info('finished feature engineering')
        logger.info('start modeling')