Exemple #1
0
def main(c_runtime, c_transformer, c_model, c_trainer, c_log):
    """Preprocess, tune, train and predict, then write the submission CSV.

    Args:
        c_runtime: runtime config; ``out_sub_path`` is the submission target.
        c_transformer: config object whose ``__dict__`` is forwarded to
            ``Transformer.run`` as keyword arguments.
        c_model: model config handed to ``ModelFactory.create``.
        c_trainer: trainer config; ``n_splits`` is passed to
            ``tune_gbdt_params``.
        c_log: not referenced inside this function.
    """
    with blocktimer('Preprocess'):
        transformer_kwargs = c_transformer.__dict__
        train, test = Transformer.run(**transformer_kwargs)
        X_train, y_train, X_test = split_X_y(train, test)

    with blocktimer('Tune & Train'):
        factory = ModelFactory()

        # First pass: let the tuner derive a model config from the initial one.
        candidate = factory.create(c_model)
        tuned_config = tune_gbdt_params(
            candidate, X_train, y_train, c_trainer.n_splits)

        # Second pass: build a fresh model from the tuned config and fit it
        # on the whole training set.
        model = factory.create(tuned_config).train(X_train, y_train)

    with blocktimer('Predict'):
        submission = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        submission['TransactionID'] = test['TransactionID']
        submission['isFraud'] = model.predict(X_test)

        out_path = c_runtime.out_sub_path
        submission.to_csv(out_path, index=False)
        logger.info(f'Saved {c_runtime.out_sub_path}')
Exemple #2
0
def main(c):
    """Preprocess, tune, train, persist the model/importances and predict.

    Args:
        c: config object. Reads ``c.transformer`` (its ``__dict__`` is
            forwarded to ``Transformer.run``), ``c.model`` (``dir``, ``TYPE``),
            ``c.trainer.n_splits`` and ``c.runtime`` (``VERSION``,
            ``ROOTDIR``, ``out_sub_path``).

    Side effects:
        Saves the trained model, a feature-importance CSV and the submission
        CSV to the paths derived from ``c``.
    """
    with blocktimer('Preprocess'):
        train, test = Transformer.run(**c.transformer.__dict__)
        # Sort BEFORE deriving X_test. The original sorted `test` after the
        # split, so the TransactionID column (taken from the sorted frame)
        # and the predictions (computed on the unsorted X_test) could end up
        # in different row orders in the submission.
        # NOTE(review): this assumes model.predict returns a positional
        # array in X_test row order -- confirm against the model wrapper.
        test = test.sort_values('TransactionDT')
        X_train, y_train, X_test = split_X_y(train, test)

    with blocktimer('Tune & Train'):
        modelfactory = ModelFactory()

        # tune the model params
        model = modelfactory.create(c.model)
        optimal_c_model = tune_gbdt_params(model, X_train, y_train,
                                           c.trainer.n_splits)

        # train with best params, full data
        model = modelfactory.create(optimal_c_model)
        model = model.train(X_train, y_train)

        # save results
        model.save(c.model.dir /
                   f'model_{c.runtime.VERSION}_{c.model.TYPE}.pkl')

        importance = pd.DataFrame(model.feature_importance,
                                  index=X_train.columns,
                                  columns=['importance'])

        importance_path = (c.runtime.ROOTDIR / 'feature/importance' /
                           f'importance_{c.runtime.VERSION}.csv')
        importance.to_csv(importance_path)
        logger.info(f'Saved {str(importance_path)}')

    with blocktimer('Predict'):
        sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        sub['TransactionID'] = test['TransactionID']
        sub['isFraud'] = model.predict(X_test)

        sub.to_csv(c.runtime.out_sub_path, index=False)
        logger.debug(f'Saved {c.runtime.out_sub_path}')
Exemple #3
0
def _suggest_params(trial, model_type):
    """Return the optuna-suggested hyper-parameters for ``model_type``.

    Raises:
        ValueError: if ``model_type`` is not one of 'lightgbm', 'xgboost'
            or 'catboost'.
    """
    if model_type == 'lightgbm':
        max_depth = trial.suggest_int('max_depth', 3, 12)
        return {
            # num_leaves should be smaller than approximately 2^max_depth*0.75
            'num_leaves': 2**max_depth * 3 // 4,
            'max_depth': max_depth,
            'min_child_weight':
            trial.suggest_loguniform('min_child_weight', 1e-3, 1e0),
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-2, 1e0),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-2, 1e0),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 200),
            'feature_fraction': trial.suggest_uniform('feature_fraction', 0, 1),
            'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0, 1),
        }
    if model_type == 'xgboost':
        return {
            'min_split_loss':
            trial.suggest_loguniform('min_split_loss', 1e-3, 1e0),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_child_weight':
            trial.suggest_loguniform('min_child_weight', 1e-3, 1e0),
            'subsample': trial.suggest_uniform('subsample', 0, 1),
            'colsample_bytree':
            trial.suggest_uniform('colsample_bytree', 0.0, 1.0),
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 1e0),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 1e0),
        }
    if model_type == 'catboost':
        return {
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-2, 1e0),
        }
    raise ValueError(
        f'Unsupported model type for tuning: {model_type!r} '
        "(expected 'lightgbm', 'xgboost' or 'catboost')")


def objective(trial, X_train, y_train, X_test, cols, c):
    '''
    Define objectives for optuna

    Suggests hyper-parameters for ``c.model.type``, trains one model per
    GroupKFold fold (6 folds, grouped by the ``DT_M`` column of ``X_train``)
    and returns the ROC-AUC of the out-of-fold predictions.

    Args:
        trial: optuna trial used to suggest parameters; ``trial.number`` is
            used as the mlflow step.
        X_train: training frame; must contain ``DT_M`` and every name in
            ``cols``.
        y_train: training target, indexed positionally with ``.iloc``.
        X_test: accepted for interface compatibility only. The original
            accumulated test predictions that were never used, so that
            per-fold inference was dropped.
        cols: feature columns fed to the model.
        c: config; reads ``c.model`` (``type``, ``params``), ``c.train``
            (``num_boost_round``, ``early_stopping_rounds``) and
            ``c.runtime`` (``version``, ``dsize``).

    Returns:
        The out-of-fold ROC-AUC score.

    Raises:
        ValueError: if ``c.model.type`` is not a supported model type
            (previously this surfaced later as an UnboundLocalError).
    '''
    # Validate/suggest first so a bad model type fails before any training.
    params_to_tune = _suggest_params(trial, c.model.type)
    modelfactory = ModelFactory()

    # apply suggested params on top of a copy of the configured ones
    params = c.model.params.copy()
    params.update(params_to_tune)

    # Train by 6-fold CV
    oof = np.zeros(len(X_train))
    skf = GroupKFold(n_splits=6)
    splits = skf.split(X_train, y_train, groups=X_train['DT_M'])
    for fold, (idxT, idxV) in enumerate(splits, start=1):
        month = X_train.iloc[idxV]['DT_M'].iloc[0]
        model_fold_path = f'data/model/model_{c.runtime.version}_opt_fold{fold}{c.runtime.dsize}.pkl'
        model = modelfactory.create(c.model)
        logger.info(f'Fold {fold} withholding month {month}')
        logger.info(
            f'rows of train= {len(idxT)}, rows of holdout= {len(idxV)}')

        model = model.train(
            X_train[cols].iloc[idxT],
            y_train.iloc[idxT],
            X_train[cols].iloc[idxV],
            y_train.iloc[idxV],
            params=params,
            num_boost_round=c.train.num_boost_round,
            early_stopping_rounds=c.train.early_stopping_rounds,
            fold=fold)

        oof[idxV] = model.predict(X_train[cols].iloc[idxV])

        # NOTE(review): `r` is neither a parameter nor a local; it must be a
        # module-level object with a dict-like `paths` -- confirm it exists,
        # otherwise this line raises NameError.
        r.paths.update({f'model_fold_{fold}_path': model_fold_path})
        model.save(model_fold_path)
        del model

    score = roc_auc_score(y_train, oof)
    # `fold` here is the last fold number; the score covers all folds.
    logger.info(f'Fold {fold} OOF cv= {score}')
    mlflow.log_metric('oof_cv_score', score, step=trial.number)
    return score
Exemple #4
0
def main(c):
    """Train a 6-fold GroupKFold model on a pickled feature set and predict.

    Stages (each wrapped in ``blocktimer``):
      1. Preprocess: load ``data/feature/{c.features[0]}_{train,test}.pkl``,
         optionally down-sample both frames, split off ``isFraud``.
      2. Optimize: optionally call ``optimize_num_boost_round``; otherwise
         use ``c.train.num_boost_round``.
      3. Train: one model per fold, grouped by the ``DT_M`` column; test
         predictions are averaged over the folds.
      4. Predict: write the averaged predictions to a submission CSV.

    Args:
        c: config; reads ``c.runtime`` (``use_small_data``, ``version``),
            ``c.features``, ``c.cols``, ``c.model`` and ``c.train``.

    Returns:
        An ``EasyDict`` holding the config plus ``scores`` and ``paths``.
    """
    # Function-scope imports, as in the original block.
    from sklearn.model_selection import GroupKFold
    from sklearn.metrics import roc_auc_score

    # One flag drives both the sampling and the file-name suffix. The
    # original used `is True` for the suffix but truthiness for sampling, so
    # a truthy non-True value sampled the data yet wrote to full-data paths.
    use_small_data = bool(c.runtime.use_small_data)
    dsize = '.small' if use_small_data else ''
    paths = EasyDict()
    scores = EasyDict()
    modelfactory = ModelFactory()

    with blocktimer('Preprocess', level=INFO):
        paths.in_train_path = f'data/feature/{c.features[0]}_train.pkl'
        paths.in_test_path = f'data/feature/{c.features[0]}_test.pkl'
        train = pd.read_pickle(paths.in_train_path)
        test = pd.read_pickle(paths.in_test_path)
        logger.debug(f'Loaded feature {c.features[0]}')

        if use_small_data:
            frac = 0.001
            train = train.sample(frac=frac, random_state=42)
            test = test.sample(frac=frac, random_state=42)
        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')

        # Split into X, y
        X_train = train.drop('isFraud', axis=1)
        X_test = test
        y_train = train['isFraud'].copy(deep=True)
        del train, test

    with blocktimer('Optimize', level=INFO):
        if c.train.optimize_num_boost_round is True:
            # tune the number of boosting rounds
            # NOTE(review): `paths` and `scores` are passed in, presumably so
            # the helper can record into them -- confirm against the helper.
            model = modelfactory.create(c.model)
            best_iteration = optimize_num_boost_round(model, X_train[c.cols],
                                                      y_train,
                                                      c.train.n_splits, dsize,
                                                      paths, scores)
        else:
            logger.debug('Skip optimization')
            best_iteration = c.train.num_boost_round

    with blocktimer('Train', level=INFO):
        logger.debug(f'Now using the following {len(c.cols)} features.')
        logger.debug(f'{np.array(c.cols)}')

        oof = np.zeros(len(X_train))
        preds = np.zeros(len(X_test))

        skf = GroupKFold(n_splits=6)
        splits = skf.split(X_train, y_train, groups=X_train['DT_M'])
        for fold, (idxT, idxV) in enumerate(splits, start=1):
            month = X_train.iloc[idxV]['DT_M'].iloc[0]
            logger.info(f'Fold {fold} withholding month {month}')
            logger.info(
                f'rows of train ={len(idxT)}, rows of holdout ={len(idxV)}')

            model = modelfactory.create(c.model)
            model = model.train(
                X_train[c.cols].iloc[idxT],
                y_train.iloc[idxT],
                X_train[c.cols].iloc[idxV],
                y_train.iloc[idxV],
                num_boost_round=best_iteration,
                early_stopping_rounds=c.train.early_stopping_rounds,
                fold=fold)

            # Each row is held out in exactly one fold, so plain assignment
            # fills the out-of-fold vector.
            oof[idxV] = model.predict(X_train[c.cols].iloc[idxV])
            # Average the test predictions over the folds.
            preds += model.predict(X_test[c.cols]) / skf.n_splits
            del model
        logger.info(f'OOF cv= {roc_auc_score(y_train, oof)}')
        # The path is recorded in the result, but no importance file is
        # written by this function.
        paths.importance_path = f'feature/importance/importance_{c.runtime.version}{dsize}.csv'

    with blocktimer('Predict', level=INFO):
        sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        sub['TransactionID'] = X_test.reset_index()['TransactionID']
        sub['isFraud'] = preds

        paths.out_sub_path = f'data/submission/submission_{c.runtime.version}{dsize}.csv'
        sub.to_csv(paths.out_sub_path, index=False)

    result = EasyDict()
    result.update(c)
    result.scores = scores
    result.paths = paths
    return result
Exemple #5
0
def main(c):
    """Optionally tune, train with 6-fold GroupKFold and write a submission.

    Stages (each wrapped in ``blocktimer``):
      1. Preprocess: load ``data/feature/{c.features[0]}_{train,test}.pkl``,
         optionally down-sample both frames, split off ``isFraud``.
      2. Optimize num_boost_round: optional, else ``c.train.num_boost_round``.
      3. Optimize model params: optional 20-trial optuna study maximizing
         the out-of-fold ROC-AUC; per-fold models are saved for each trial.
      4. Train: if ``c.train.train_model``, one model per fold grouped by
         ``DT_M``; test predictions are averaged over the folds.
      5. Predict: if ``c.train.predict``, write the submission CSV.

    Args:
        c: config; reads ``c.runtime`` (``use_small_data``, ``version``),
            ``c.features``, ``c.cols``, ``c.model`` and ``c.train``.

    Returns:
        An ``EasyDict`` holding the config plus ``scores``, ``paths`` and,
        when the parameter study ran, ``optimize.best_params``.

    Raises:
        RuntimeError: if ``c.train.predict`` is set but ``c.train.train_model``
            is not, because no test predictions exist (previously a
            NameError on ``preds``).
    """
    # One flag drives both the sampling and the file-name suffix. The
    # original used `is True` for the suffix but truthiness for sampling, so
    # a truthy non-True value sampled the data yet wrote to full-data paths.
    use_small_data = bool(c.runtime.use_small_data)
    dsize = '.small' if use_small_data else ''
    paths = EasyDict()
    scores = EasyDict()
    result = EasyDict()
    result.update(c)
    modelfactory = ModelFactory()
    preds = None  # filled by the Train stage; checked by the Predict stage

    with blocktimer('Preprocess', level=INFO):
        paths.in_train_path = f'data/feature/{c.features[0]}_train.pkl'
        paths.in_test_path = f'data/feature/{c.features[0]}_test.pkl'
        train = pd.read_pickle(paths.in_train_path)
        test = pd.read_pickle(paths.in_test_path)
        logger.debug(f'Loaded feature {c.features[0]}')

        if use_small_data:
            frac = 0.001
            train = train.sample(frac=frac, random_state=42)
            test = test.sample(frac=frac, random_state=42)
        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')

        # Split into X, y
        X_train = train.drop('isFraud', axis=1)
        X_test = test
        y_train = train['isFraud'].copy(deep=True)
        del train, test

    with blocktimer('Optimize num_boost_round', level=INFO):
        if c.train.optimize_num_boost_round is True:
            # tune the number of boosting rounds
            model = modelfactory.create(c.model)
            best_iteration = optimize_num_boost_round(model, X_train[c.cols],
                                                      y_train,
                                                      c.train.n_splits, dsize,
                                                      paths, scores)
        else:
            logger.debug('Skip optimization')
            best_iteration = c.train.num_boost_round

    with blocktimer('Optimize model params', level=INFO):
        if c.train.optimize_model_params is True:
            # define objective for optuna
            def objectives(trial):
                """Return the out-of-fold ROC-AUC for one set of suggested params."""
                max_depth = trial.suggest_int('max_depth', 3, 12)
                params = {
                    'boosting_type': 'gbdt',
                    # num_leaves should be smaller than approximately 2^max_depth*0.75
                    'num_leaves': 2**max_depth * 3 // 4,
                    'max_depth': max_depth,
                    'learning_rate': 0.05,
                    'objective': 'binary',
                    'min_child_weight':
                    trial.suggest_loguniform('min_child_weight', 1e-3, 1e0),
                    'reg_alpha':
                    trial.suggest_loguniform('reg_alpha', 1e-2, 1e0),
                    'reg_lambda':
                    trial.suggest_loguniform('reg_lambda', 1e-2, 1e0),
                    'random_state': 42,
                    'min_data_in_leaf':
                    trial.suggest_int('min_data_in_leaf', 50, 200),
                    'metric': 'auc',
                    'max_bin': 255,
                }
                # NOTE(review): this overwrites the shared config, so after
                # the study the Train stage below uses the params of the
                # LAST trial, not the best one nor the configured ones --
                # confirm which is intended.
                c.model.params = params

                # Train by 6-fold CV. Only out-of-fold predictions are
                # needed for the score; the original also predicted the test
                # set on every fold and discarded the result.
                oof = np.zeros(len(X_train))

                skf = GroupKFold(n_splits=6)
                splits = skf.split(X_train, y_train, groups=X_train['DT_M'])
                for fold, (idxT, idxV) in enumerate(splits, start=1):
                    month = X_train.iloc[idxV]['DT_M'].iloc[0]
                    model_fold_path = f'data/model/model_{c.runtime.version}_{c.model.type}_opt_fold{fold}{dsize}.pkl'
                    model = modelfactory.create(c.model)
                    logger.info(f'Fold {fold} withholding month {month}')
                    logger.info(
                        f'rows of train= {len(idxT)}, rows of holdout= {len(idxV)}'
                    )

                    model = model.train(
                        X_train[c.cols].iloc[idxT],
                        y_train.iloc[idxT],
                        X_train[c.cols].iloc[idxV],
                        y_train.iloc[idxV],
                        num_boost_round=best_iteration,
                        early_stopping_rounds=c.train.early_stopping_rounds,
                        fold=fold)

                    oof[idxV] = model.predict(X_train[c.cols].iloc[idxV])

                    # Every trial writes to the same per-fold path, so only
                    # the latest trial's models remain on disk.
                    paths.update({f'model_fold_{fold}_path': model_fold_path})
                    model.save(model_fold_path)
                    del model
                score = roc_auc_score(y_train, oof)
                # `fold` here is the last fold number; the score covers all folds.
                logger.info(f'Fold {fold} OOF cv= {score}')
                return score

            # run optimization; the study is persisted in sqlite and resumed
            # if it already exists
            opt = optuna.create_study(
                direction='maximize',
                study_name=f'parameter_study_0016{dsize}',
                storage=
                f'sqlite:///data/optimization/parameter_study_0016{dsize}.db',
                load_if_exists=True)
            opt.optimize(objectives, n_trials=20)
            trial = opt.best_trial
            logger.debug(f'Best trial: {trial.value}')
            logger.debug(f'Best params: {trial.params}')
            scores.best_trial = trial.value
            result.optimize = {}
            result.optimize.best_params = trial.params
        else:
            logger.debug('Skip optimization')

    with blocktimer('Train', level=INFO):
        if c.train.train_model:
            logger.debug(f'Now using the following {len(c.cols)} features.')
            logger.debug(f'{np.array(c.cols)}')

            oof = np.zeros(len(X_train))
            preds = np.zeros(len(X_test))

            skf = GroupKFold(n_splits=6)
            splits = skf.split(X_train, y_train, groups=X_train['DT_M'])
            for fold, (idxT, idxV) in enumerate(splits, start=1):
                month = X_train.iloc[idxV]['DT_M'].iloc[0]
                logger.info(f'Fold {fold} withholding month {month}')
                logger.info(
                    f'rows of train ={len(idxT)}, rows of holdout ={len(idxV)}'
                )

                model = modelfactory.create(c.model)
                model = model.train(
                    X_train[c.cols].iloc[idxT],
                    y_train.iloc[idxT],
                    X_train[c.cols].iloc[idxV],
                    y_train.iloc[idxV],
                    num_boost_round=best_iteration,
                    early_stopping_rounds=c.train.early_stopping_rounds,
                    fold=fold)

                oof[idxV] = model.predict(X_train[c.cols].iloc[idxV])
                # Average the test predictions over the folds.
                preds += model.predict(X_test[c.cols]) / skf.n_splits
                del model
            logger.info(f'OOF cv= {roc_auc_score(y_train, oof)}')
            # The path is recorded in the result, but no importance file is
            # written by this function.
            paths.importance_path = f'feature/importance/importance_{c.runtime.version}{dsize}.csv'

    with blocktimer('Predict', level=INFO):
        if c.train.predict:
            if preds is None:
                raise RuntimeError(
                    'c.train.predict is set but c.train.train_model is not: '
                    'there are no test predictions to write')
            sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
            sub['TransactionID'] = X_test.reset_index()['TransactionID']
            sub['isFraud'] = preds

            paths.out_sub_path = f'data/submission/submission_{c.runtime.version}{dsize}.csv'
            sub.to_csv(paths.out_sub_path, index=False)

    result.scores = scores
    result.paths = paths
    return result