Beispiel #1
0
def test_run_train_test():
    """Check that train and test frames expose the same columns after encoding.

    Both fixtures go through ``initial_processing`` in ``'train'`` mode, a
    ``CatTransformer`` is fitted on the processed training frame only, and the
    transformed outputs are compared by column name.
    """
    train_frame, fitted_params = initial_processing(train_data, mode='train')
    test_frame, _unused = initial_processing(test_data, mode='train')

    encoder = CatTransformer(fitted_params['cat_cols'])
    encoder.fit(train_frame)

    train_columns = set(encoder.transform(train_frame).columns.values)
    test_columns = set(encoder.transform(test_frame).columns.values)

    assert train_columns == test_columns
Beispiel #2
0
def run_train_test(ds_name, metric, params, obj, num_boost_round=600):
    """Select features, tune, train and evaluate a LightGBM model on a dataset.

    The dataset is read from ``_DATA_PATH + ds_name`` and is expected to
    provide ``train.csv``, ``test.csv`` and ``test-target.csv``.

    Args:
        ds_name: Dataset name, appended to ``_DATA_PATH`` to build the path.
        metric: Callable invoked as ``metric(y_true, y_pred)``; its results
            are returned unchanged.
        params: Passed through to ``hyperopt_lgb``.
        obj: Passed through to ``ols_selection`` and ``hyperopt_lgb``.
        num_boost_round: Number of boosting rounds handed to ``lgb.train``.
            Defaults to 600, the value that was previously hard-coded.

    Returns:
        Tuple ``(train_err, test_err)`` with the metric evaluated on the
        training and the test predictions.
    """
    path = _DATA_PATH + ds_name

    # Stage 1: feature selection and hyper-parameter search on a sample of
    # the training data (sample size controlled by _SAMPLE).
    with Profiler('initial feature selection'):
        x_initial_raw, y_initial, _ = load_data(f'{path}/train.csv',
                                                mode='train',
                                                sample=_SAMPLE)
        x_initial, ini_params = initial_processing(x_initial_raw, mode='train')

        tf = CatTransformer(ini_params['cat_cols'])
        x_initial_tf = tf.fit_transform(x_initial)
        selected_features, feat_list = ols_selection(x_initial_tf, y_initial,
                                                     obj)
        hp_params = hyperopt_lgb(x_initial_tf[feat_list], y_initial, params,
                                 obj)

    print('selected features=', len(selected_features))

    # Stage 2: reload the training data restricted to the selected columns.
    x_train_raw, y_train, _ = load_data(f'{path}/train.csv',
                                        mode='train',
                                        sample=_SAMPLE,
                                        used_cols=selected_features)

    x_test_raw, _, _ = load_data(f'{path}/test.csv', mode='test')
    y_test = load_test_label(f'{path}/test-target.csv')

    x_train, train_params = initial_processing(x_train_raw, mode='train')
    # The processing parameters of the test frame are not needed here.
    x_test, _ = initial_processing(x_test_raw, mode='test')

    with Profiler('fit transform cat columns'):
        # Align the test columns with the ones used for training before the
        # transformer (fitted on the training frame only) is applied.
        x_test_rein = x_test.reindex(columns=train_params['used_cols'])
        tf = CatTransformer(train_params['cat_cols'])
        tf.fit(x_train)
        x_train_tf = tf.transform(x_train)
        x_test_tf = tf.transform(x_test_rein)

    with Profiler('run train'):
        model = lgb.train(hp_params, lgb.Dataset(x_train_tf, label=y_train),
                          num_boost_round=num_boost_round)

    with Profiler('predict'):
        y_train_out = model.predict(x_train_tf)
        y_test_out = model.predict(x_test_tf)

    train_err = metric(y_train, y_train_out)
    test_err = metric(y_test, y_test_out)

    return train_err, test_err
Beispiel #3
0
def test_cat_fit_test():
    """Check the encoded values produced for test data by a train-fitted transformer.

    The transformer is fitted on the categorical columns of the raw training
    fixture and applied to the test fixture; the first three rows of
    ``string_0`` must match fixed fractions, the remaining checked rows and
    all of ``string_1`` must be NaN.
    """
    _, processing_params = initial_processing(train_data, mode='train')
    categorical = processing_params['cat_cols']

    encoder = CatTransformer(categorical)
    encoder.fit(train_data[categorical])
    encoded = encoder.transform(test_data)

    string_0 = encoded['string_0']
    for row, expected in ((0, 2 / 5), (1, 1 / 5), (2, 2 / 5)):
        assert string_0[row] == expected
    for row in (3, 4):
        assert np.isnan(string_0[row])

    assert np.isnan(encoded['string_1']).all()
def test_date_col_processing():
    """Check the month, weekday and day features derived from ``datetime_0``."""
    processed, _ = initial_processing(train_data, mode='train')

    # column name -> {row label: expected value}
    expected_values = {
        'date_month_datetime_0': {0: 1, 2: 2, 3: 11},
        'date_weekday_datetime_0': {1: 2, 3: 0, 4: 6},
        'date_day_datetime_0': {1: 3, 2: 14, 4: 30},
    }

    for column, by_row in expected_values.items():
        for row, value in by_row.items():
            assert processed[column][row] == value
Beispiel #5
0
        "num_leaves": 200,
        "feature_fraction": 0.70,
        "bagging_fraction": 0.70,
        'bagging_freq': 4,
        "max_depth": -1,
        "verbosity": -1,
        "reg_alpha": 0.3,
        "reg_lambda": 0.1,
        "min_child_weight": 10,
        'zero_as_missing': True,
        'num_threads': 4,
        'seed': 1
    }
    with Profiler('load data and perform feature selection'):
        x_ini_raw, y_ini, _ = load_data(args.train_csv, sample=_SAMPLE)
        x_initial, ini_params = initial_processing(x_ini_raw, mode='train')
        tf = CatTransformer(ini_params['cat_cols'])
        x_initial_tf = tf.fit_transform(x_initial)
        selected_features, feat_list = ols_selection(x_initial_tf, y_ini, obj)
        hp_params = hyperopt_lgb(x_initial_tf[feat_list], y_ini, params, obj)
    print(f'{ len(selected_features)} features selected')

    df_X_raw, df_y, _ = load_data(args.train_csv, used_cols=selected_features)
    x_train, train_params = initial_processing(df_X_raw, mode='train')

    with Profiler('fit transform cat columns'):
        tf = CatTransformer(train_params['cat_cols'])
        tf.fit(x_train)
        x_train_tf = tf.transform(x_train)

    with Profiler('run train'):
def test_cat_cols_frequency():
    """Check which columns ``initial_processing`` reports as categorical."""
    _, processing_params = initial_processing(train_data, mode='train')

    detected = set(processing_params['cat_cols'])
    assert detected == {'id_0', 'string_0', 'string_1'}
Beispiel #7
0
    # Command-line interface: the test CSV to score, the output file for the
    # predictions (opened for writing by argparse), and the directory that
    # holds the pickled model configuration.
    parser = argparse.ArgumentParser()
    parser.add_argument('--test-csv', required=True)
    parser.add_argument('--prediction-csv',
                        type=argparse.FileType('w'),
                        required=True)
    parser.add_argument('--model-dir', required=True)
    args = parser.parse_args()

    # Timestamp used for the elapsed-time report printed at the end.
    start_time = time.time()

    # Load the pickled model configuration; it is read as a mapping with the
    # keys 'model', 'cat_tf' and 'train_params'.
    # NOTE(review): pickle.load can execute arbitrary code while
    # deserializing - only point --model-dir at trusted artifacts.
    model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
    with open(model_config_filename, 'rb') as fin:
        model_config = pickle.load(fin)
    model = model_config['model']
    tf = model_config['cat_tf']
    train_params = model_config['train_params']

    # load_data returns three values; the third (df) later receives the
    # 'prediction' column and is read for 'line_id' when writing the output.
    x_test_raw, _, df = load_data(args.test_csv, mode='test')
    # NOTE(review): test_params is not used in the visible code.
    x_test, test_params = initial_processing(x_test_raw, mode='test')

    with Profiler('transform cat columns'):
        # Align the test columns with the ones recorded at training time
        # before applying the stored transformer.
        # presumably x_test is a pandas DataFrame, so columns missing from
        # the test data would be added as NaN - confirm
        x_test_rein = x_test.reindex(columns=train_params['used_cols'])
        x_test_tf = tf.transform(x_test_rein)

    df['prediction'] = model.predict(x_test_tf)

    # Only the id and prediction columns are written, without the index.
    df[['line_id', 'prediction']].to_csv(args.prediction_csv, index=False)

    print('Prediction time: {:0.2f}'.format(time.time() - start_time))