def main():
    """Load the CSV inputs, derive features, and start training/prediction.

    Reads ``input/train.csv`` and ``input/test.csv``, prints both shapes,
    passes the two frames through the feature helpers in a fixed order,
    prints the shapes again, and finally calls ``train_predict`` with the
    hyper-parameter dictionary built below.
    """
    train = pd.read_csv('input/train.csv')
    test = pd.read_csv('input/test.csv')

    # Report (rows, columns) of the raw inputs.
    for template, frame in (
        ('original train shape: %s', train),
        ('original test shape: %s', test),
    ):
        print(template % str(frame.shape))

    # Every helper receives and returns the (train, test) pair, so each step
    # works on the output of the previous one; the order is significant.
    feature_steps = (
        (fillna, {}),
        (count_encoding, {'replace': False}),
        (target_encoding, {'replace': False}),
        (high_diff_corr_pca, {'n_features': 5}),
        (sum_of_na, {}),
        (drop_calc, {}),
    )
    for step, options in feature_steps:
        train, test = step(train, test, **options)

    # Report (rows, columns) again after feature engineering.
    for template, frame in (
        ('Train shape: %s', train),
        ('Test shape: %s', test),
    ):
        print(template % str(frame.shape))

    # NOTE(review): how these keys are interpreted is decided inside
    # train_predict, which is not visible here.
    hyper_params = {
        'lambda': 1e-5,
        'factor': 6,
        'iteration': 100,
        'patience': 10,
        'eta': 0.2,
        'nr_threads': 1,
        'seed': 41
    }

    train_predict(train, test, hyper_params)
Ejemplo n.º 2
0
def main():
    """Prepare the train/test frames and run ``train_predict`` on them.

    The two CSV files under ``input/`` are loaded, their shapes are printed,
    the feature helpers are applied one after another (each one replacing
    both frames with its return value), the resulting shapes are printed,
    and ``train_predict`` is called with the settings dictionary below.
    """

    def report(template, frame):
        # Prints the frame's (rows, columns) tuple using the given template.
        print(template % str(frame.shape))

    # Load the raw data.
    train_set = pd.read_csv('input/train.csv')
    test_set = pd.read_csv('input/test.csv')

    report('original train shape: %s', train_set)
    report('original test shape: %s', test_set)

    # Missing-value handling.
    train_set, test_set = fillna(train_set, test_set)

    # Count encoding (called with replace=False).
    train_set, test_set = count_encoding(train_set, test_set, replace=False)

    # Target encoding (called with replace=False).
    train_set, test_set = target_encoding(train_set, test_set, replace=False)

    # Dummy encoding.
    train_set, test_set = dummy_encoding(train_set, test_set)

    # high_diff_corr_pca with n_features=5.
    train_set, test_set = high_diff_corr_pca(
        train_set, test_set, n_features=5)

    # Sum of NA.
    train_set, test_set = sum_of_na(train_set, test_set)

    # Drop calc.
    train_set, test_set = drop_calc(train_set, test_set)

    report('Train shape: %s', train_set)
    report('Test shape: %s', test_set)

    # NOTE(review): 'algorithm' is 'RGF'; the remaining keys are presumably
    # forwarded to that model by train_predict -- confirm against its code.
    settings = {
        'max_leaf': 1000,
        'algorithm': 'RGF',
        'loss': 'Log',
        'l2': 0.01,
        'sl2': 0.01,
        'normalize': False,
        'min_samples_leaf': 10,
        'n_iter': None,
        'opt_interval': 100,
        'learning_rate': 0.5,
        'calc_prob': 'sigmoid',
        'n_jobs': -1,
        'memory_policy': 'generous',
        'verbose': True
    }

    train_predict(train_set, test_set, settings)
def main():
    """Read the input CSVs, run the feature pipeline, then train and predict.

    Steps, in order: load ``input/train.csv`` and ``input/test.csv``; print
    their shapes; apply the feature helpers sequentially to the pair of
    frames; print the new shapes; call ``train_predict`` with the frames and
    the configuration dictionary defined at the end of the function.
    """
    train_data = pd.read_csv('input/train.csv')
    test_data = pd.read_csv('input/test.csv')

    # (rows, columns) of each frame as loaded.
    print('original train shape: %s' % str(train_data.shape))
    print('original test shape: %s' % str(test_data.shape))

    # The stages are wrapped in lambdas so each helper name is resolved only
    # when its stage actually runs, exactly as with consecutive plain calls.
    stages = [
        lambda tr, te: fillna(tr, te),
        lambda tr, te: count_encoding(tr, te, replace=False),
        lambda tr, te: target_encoding(tr, te, replace=False),
        lambda tr, te: dummy_encoding(tr, te),
        lambda tr, te: high_diff_corr_pca(tr, te, n_features=5),
        lambda tr, te: sum_of_na(tr, te),
        lambda tr, te: drop_calc(tr, te),
    ]
    for stage in stages:
        train_data, test_data = stage(train_data, test_data)

    # (rows, columns) of each frame after all stages.
    print('Train shape: %s' % str(train_data.shape))
    print('Test shape: %s' % str(test_data.shape))

    # NOTE(review): key names resemble CatBoost options -- confirm against
    # the train_predict implementation, which is not visible here.
    config = {
        'bagging_temperature': 1,
        'name': 'experiment',
        'random_strength': 1,
        'has_time': False,
        'store_all_simple_ctr': False,
        'verbose': True,
        'use_best_model': True,
        'random_seed': 41,
        'thread_count': 2,
        'od_type': 'IncToDec',
        'od_wait': 20,
        'od_pval': 0.01,
        'feature_border_type': 'MinEntropy',
        'loss_function': 'Logloss',
        'rsm': 1,
        'l2_leaf_reg': 23,
        'depth': 6,
        'learning_rate': 0.057,
        'iterations': 10000,
        'leaf_estimation_method': 'Newton'
    }

    train_predict(train_data, test_data, config)
Ejemplo n.º 4
0
def main():
    """Build features from the input CSVs and call ``train_predict``.

    Loads ``input/train.csv`` and ``input/test.csv``, prints their shapes,
    applies the feature helpers in sequence, prints the shapes again, and
    calls ``train_predict`` with the frames, a list of three ``l1_*.csv.gz``
    file names, and the parameter dictionary.
    """
    train = pd.read_csv('input/train.csv')
    test = pd.read_csv('input/test.csv')

    # Shapes are (rows, columns) of the frames as read from disk.
    print('original train shape: %s' % str(train.shape))
    print('original test shape: %s' % str(test.shape))

    # (helper, keyword arguments) pairs, applied strictly in this order;
    # each helper returns the replacement (train, test) pair.
    pipeline = (
        (fillna, {}),
        (count_encoding, {'replace': False}),
        (target_encoding, {'replace': False}),
        (dummy_encoding, {}),
        (high_diff_corr_pca, {'n_features': 5}),
        (sum_of_na, {}),
        (drop_calc, {}),
    )
    for helper, kwargs in pipeline:
        train, test = helper(train, test, **kwargs)

    # Shapes after the feature helpers have run.
    print('Train shape: %s' % str(train.shape))
    print('Test shape: %s' % str(test.shape))

    # NOTE(review): presumably outputs of first-level models consumed by
    # train_predict -- confirm how it uses these names.
    l1_models = ['l1_rgf.csv.gz', 'l1_ffm.csv.gz', 'l1_catb.csv.gz']

    # NOTE(review): key names resemble LightGBM options -- confirm against
    # the train_predict implementation.
    model_params = {
        'application': 'binary',
        'num_threads': 2,
        'boosting': 'gbdt',
        'max_bin': 16,
        'learning_rate': 0.025,
        'num_leaves': 52,
        'feature_fraction': 0.45,
        'bagging_fraction': 0.75,
        'bagging_freq': 16,
        'min_data_in_leaf': 740,
        'min_child_weight': 2.0
    }

    train_predict(train, test, l1_models, model_params)
Ejemplo n.º 5
0
def main():
    """Run the full flow: load data, engineer features, train and predict.

    The function reads ``input/train.csv`` and ``input/test.csv``, prints
    the shapes before and after the feature helpers are applied, and then
    passes both frames plus the parameter dictionary to ``train_predict``.
    """

    def show_shapes(train_template, test_template, train_frame, test_frame):
        # Print the (rows, columns) tuple of each frame, train first.
        print(train_template % str(train_frame.shape))
        print(test_template % str(test_frame.shape))

    train_frame = pd.read_csv('input/train.csv')
    test_frame = pd.read_csv('input/test.csv')

    show_shapes('original train shape: %s', 'original test shape: %s',
                train_frame, test_frame)

    # Feature helpers; each call replaces both frames with what it returns.
    train_frame, test_frame = fillna(train_frame, test_frame)
    train_frame, test_frame = count_encoding(
        train_frame, test_frame, replace=False)
    train_frame, test_frame = target_encoding(
        train_frame, test_frame, replace=False)
    train_frame, test_frame = dummy_encoding(train_frame, test_frame)
    train_frame, test_frame = high_diff_corr_pca(
        train_frame, test_frame, n_features=5)
    train_frame, test_frame = sum_of_na(train_frame, test_frame)
    train_frame, test_frame = drop_calc(train_frame, test_frame)

    show_shapes('Train shape: %s', 'Test shape: %s',
                train_frame, test_frame)

    # NOTE(review): key names resemble XGBoost options -- confirm against
    # the train_predict implementation, which is not visible here.
    booster_params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'seed': 41,
        'nthread': 32,
        'silent': True,
        'eta': 0.025,
        'max_depth': 5,
        'min_child_weight': 9.15,
        'gamma': 0.59,
        'subsample': 0.8,
        'colsample_bytree': 0.8
    }

    train_predict(train_frame, test_frame, booster_params)