def train_model():
        """Interactively train, evaluate and save a multi-output CatBoost model.

        Prompts for the run mode and for the multi-output strategy, loads the
        dataset via ``data.dataset('local', 'train', onehot=False)``, replaces
        missing weather values with ``'Unknown'``, fits the wrapped estimator,
        prints the two values returned by ``inout.evaluate`` and saves the
        fitted model inside the ``saved_models`` folder.

        Any answer other than ``'m'`` at the strategy prompt selects
        ``RegressorChain``. The filename label is derived from that same
        decision, so the saved file always names the wrapper that was
        actually trained.
        """
        print()
        mode = menu.mode_selection()
        chain_mode = input(
            'Choose the chain mode (m: multioutput / c: regressorchain): '
        ).strip().lower()

        # Single source of truth for both the wrapper class and the filename
        # label; previously an unrecognised answer trained a RegressorChain
        # but was saved under the 'multiout' label.
        use_multioutput = chain_mode == 'm'
        M = MultiOutputRegressor if use_multioutput else RegressorChain
        chain_label = 'multiout' if use_multioutput else 'chain'

        X, Y = data.dataset('local', 'train', onehot=False)
        print(X.shape, Y.shape)

        # Missing weather observations become an explicit 'Unknown' category,
        # both for the raw weather columns and for their cluster columns.
        weather_lag_cols = [
            'WEATHER_-4', 'WEATHER_-3', 'WEATHER_-2', 'WEATHER_-1'
        ]
        X[weather_lag_cols] = X[weather_lag_cols].fillna('Unknown')

        weather_clusters_cols = [
            'WEATHER_-4_CL', 'WEATHER_-3_CL', 'WEATHER_-2_CL', 'WEATHER_-1_CL'
        ]
        X[weather_clusters_cols] = X[weather_clusters_cols].fillna('Unknown')

        # Every column whose name starts with 'WEATHER_' is treated as
        # categorical (this prefix also matches the '*_CL' cluster columns).
        weather_cols = [col for col in X.columns if col.startswith('WEATHER_')]
        categorical_cols = [
            'EMERGENCY_LANE', 'ROAD_TYPE', 'EVENT_DETAIL', 'EVENT_TYPE'
        ] + weather_cols + ['WEEK_DAY', 'IS_WEEKEND']

        catboost = CatBoost({
            'X': X,
            'mode': mode,
            'loss_function': 'MAE',
            'eval_metric': 'MAE',
            'n_estimators': 5000,
            'depth': 6,
            'learning_rate': 0.1,
            'early_stopping_rounds': 100,
            'cat_features': categorical_cols
        })

        model = M(catboost)
        model.fit(X, Y)

        # NOTE(review): the model above was fitted on ALL rows, so the
        # hold-out slice below has already been seen during training and the
        # reported MAE is an in-sample figure. Confirm whether that is
        # intended before relying on it for model selection.
        _, X_test, _, y_test = train_test_split(X,
                                                Y,
                                                test_size=0.2,
                                                shuffle=False)
        mae, mae_4 = inout.evaluate(model, X_test, y_test, intermediate=True)
        print()
        print(mae)
        print(mae_4)

        # save the model; the rounded MAE becomes part of the filename
        mae = round(mae, 5)

        suffix = input('Insert model name suffix: ')
        model_folder = 'saved_models'
        folder.create_if_does_not_exist(model_folder)

        filename = f'catboost_{chain_label}_{mae}_{suffix}.jl'
        inout.save(model, os.path.join(model_folder, filename))
        f.append(pd.DataFrame([[np.nan, np.nan]],
                              columns=['WEATHER', 'WEATHER_CL']),
                 ignore_index=True)
        matching = [
            i for i in range(len(df.columns.values))
            if "WEATHER_-" in df.columns.values[i]
        ]
        for i in matching:
            f_ = f.rename(
                columns={
                    'WEATHER': df.columns.values[i],
                    'WEATHER_CL': '{}_CL'.format(df.columns.values[i])
                })
            df = pd.merge(df,
                          f_,
                          left_on=[df.columns.values[i]],
                          right_on=df.columns.values[i],
                          how='left')
        return df


if __name__ == '__main__':
    # Stand-alone entry point: build a Weather_clusters feature for the mode
    # chosen at the prompt, call save_feature() on it, then print whatever
    # read_feature() returns.
    from src.utils.menu import mode_selection

    mode = mode_selection()
    weather_feature = Weather_clusters(mode)

    print(f'Creating {weather_feature.name}')
    weather_feature.save_feature()

    stored = weather_feature.read_feature()
    print(stored)
# Example no. 3
# 0
        })
        """
        if mode == 'train':
            # take random validation rows

            # random_indices = random.shuffle(joined_df.index)
            # validation_indices = random_indices[0: int(len(random_indices) * validation_split)]
            # train_df = joined_df.drop(validation_indices)
            # valid_df = joined_df.loc[validation_indices]
        """

        # save the base dataset
        filepath = data.get_path_preprocessed(mode, t, 'base_dataset.csv.gz')

        print('Saving base dataframe to {}'.format(filepath))
        joined_df.to_csv(filepath, index=False, compression='gzip')
        del joined_df
        print('Done')


def print_memory_usage():
    """Print the current process's resident set size, truncated to whole MB.

    The value is the ``rss`` field of ``psutil``'s memory info for this
    process, divided by 2 ** 20 and truncated to an integer.
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    megabytes = int(rss_bytes / float(2 ** 20))
    print(f'Current memory usage: {megabytes}MB')


if __name__ == '__main__':
    # Script entry point: ask for the run mode, then build the base dataset
    # with the fixed step counts passed below.
    mode = menu.mode_selection()
    step_window = {'steps_behind_event': 4, 'steps_after_event': 3}
    create_base_dataset(mode, **step_window)