Exemple #1
0
    if model.best_iteration is not None:
        optimal_c_model.params['num_boost_round'] = model.best_iteration
    else:
        logger.warn(
            'Did not meet early stopping. Try larger num_boost_rounds.')
    # no need after optimized num_boost_round
    del optimal_c_model.params['early_stopping_rounds']
    return optimal_c_model


if __name__ == "__main__":
    gc.enable()
    warnings.filterwarnings('ignore')

    # read config & apply option
    c = EasyDict(config)
    opt = parse_option()
    c.transformer.USE_SMALL_DATA = opt.small
    c.log.slackauth.NO_SEND_MESSAGE = opt.nomsg

    seed_everything(c.runtime.RANDOM_SEED)

    create_logger('main', **c.log)
    logger = getLogger('main')
    logger.info(
        f':thinking_face: Starting experiment {c.runtime.VERSION}_{c.runtime.DESCRIPTION}'
    )

    try:
        main(c)
        logger.info(
Exemple #2
0
        mlflow.log_metric('best_trial', trial.number)
        mlflow.log_metric('best_score', trial.value)
        mlflow.log_params(trial.params)

        return r


if __name__ == '__main__':
    # Script entry point: build the runtime configuration from the CLI
    # options and the versioned JSON config, prepare the result container
    # and seed the random number generators.
    gc.enable()
    warnings.filterwarnings('ignore')

    # get option
    opt = parse_option()
    # c is for config; the context manager closes the file handle
    # deterministically instead of leaving it to the garbage collector.
    with open(f'config/config_{opt.version}.json') as config_file:
        c = EasyDict(json.load(config_file))
    # Runtime-only settings come from the command line, not from the JSON.
    c.runtime = {}
    c.runtime.version = opt.version
    c.runtime.use_small_data = opt.small
    c.runtime.no_send_message = opt.nomsg
    c.runtime.random_seed = opt.seed
    # File-name suffix that keeps small-data artefacts apart from full runs.
    c.runtime.dsize = '.small' if c.runtime.use_small_data is True else ''

    # dict to save results (starts out as a copy of the config entries)
    r = EasyDict()
    r.update(c)
    r.paths = {}
    r.paths.result = f'config/result_{c.runtime.version}{c.runtime.dsize}.json'

    seed_everything(c.runtime.random_seed)
Exemple #3
0
            f'Early stopping. Best iteration is: {model.best_iteration}')
        scores.best_iteration = model.best_iteration
        return model.best_iteration
    else:
        logger.warn(
            'Did not meet early stopping. Try larger num_boost_rounds.')
        scores.best_iteration = None
        return c.train.num_boost_round


if __name__ == "__main__":
    # Script entry point: load the Slack credentials and the versioned JSON
    # config, overlay the CLI options and seed the random number generators.
    gc.enable()
    warnings.filterwarnings('ignore')

    # slack config; the context manager closes the file handle
    # deterministically instead of leaving it to the garbage collector.
    with open('./slackauth.json', 'r') as slackauth_file:
        slackauth = EasyDict(json.load(slackauth_file))
    # The token file name is resolved relative to the user's home directory.
    slackauth.token_path = Path.home() / slackauth.token_file

    # get option
    opt = parse_option()
    with open(f'config/config_{opt.version}.json') as config_file:
        c = EasyDict(json.load(config_file))
    # Runtime-only settings come from the command line, not from the JSON.
    c.runtime = {}
    c.runtime.version = opt.version
    c.runtime.use_small_data = opt.small
    c.runtime.no_send_message = opt.nomsg
    c.runtime.random_seed = opt.seed

    seed_everything(c.runtime.random_seed)

    # File-name suffix that keeps small-data artefacts apart from full runs.
    dsize = '.small' if c.runtime.use_small_data is True else ''
Exemple #4
0
def main(c):
    """Run preprocessing, optional round tuning, grouped CV training and prediction.

    Args:
        c: Experiment configuration with attribute access. This function
            reads ``c.runtime.use_small_data``, ``c.runtime.version``,
            ``c.features[0]``, ``c.cols``, ``c.model``,
            ``c.train.optimize_num_boost_round``, ``c.train.n_splits``,
            ``c.train.num_boost_round`` and ``c.train.early_stopping_rounds``.

    Returns:
        EasyDict: the entries of ``c`` plus ``scores`` and ``paths`` (the
        input/output file paths used during the run).

    Side effects:
        Reads the train/test feature pickles and writes the submission CSV
        to ``data/submission/``.
    """
    dsize = '.small' if c.runtime.use_small_data is True else ''
    paths = EasyDict()
    scores = EasyDict()
    modelfactory = ModelFactory()

    with blocktimer('Preprocess', level=INFO):
        paths.in_train_path = f'data/feature/{c.features[0]}_train.pkl'
        paths.in_test_path = f'data/feature/{c.features[0]}_test.pkl'
        train = pd.read_pickle(paths.in_train_path)
        test = pd.read_pickle(paths.in_test_path)
        logger.debug(f'Loaded feature {c.features[0]}')

        if c.runtime.use_small_data:
            # Fixed-seed 0.1% sample so smoke-test runs are reproducible.
            frac = 0.001
            train = train.sample(frac=frac, random_state=42)
            test = test.sample(frac=frac, random_state=42)
        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')

        # Split into X, y
        X_train = train.drop('isFraud', axis=1)
        X_test = test
        y_train = train['isFraud'].copy(deep=True)
        del train, test

    with blocktimer('Optimize', level=INFO):
        if c.train.optimize_num_boost_round is True:
            # tune the model params
            model = modelfactory.create(c.model)
            best_iteration = optimize_num_boost_round(model, X_train[c.cols],
                                                      y_train,
                                                      c.train.n_splits, dsize,
                                                      paths, scores)
        else:
            logger.debug('Skip optimization')
            best_iteration = c.train.num_boost_round

    with blocktimer('Train', level=INFO):
        logger.debug(f'Now using the following {len(c.cols)} features.')
        logger.debug(f'{np.array(c.cols)}')

        from sklearn.model_selection import GroupKFold
        from sklearn.metrics import roc_auc_score
        # Out-of-fold predictions for train rows, fold-averaged ones for test.
        oof = np.zeros(len(X_train))
        preds = np.zeros(len(X_test))

        # Folds are grouped by the 'DT_M' column, so every fold withholds
        # complete groups of that column.
        skf = GroupKFold(n_splits=6)
        for i, (idxT, idxV) in enumerate(
                skf.split(X_train, y_train, groups=X_train['DT_M'])):
            fold = i + 1
            month = X_train.iloc[idxV]['DT_M'].iloc[0]
            logger.info(f'Fold {fold} withholding month {month}')
            logger.info(
                f'rows of train ={len(idxT)}, rows of holdout ={len(idxV)}')

            model = modelfactory.create(c.model)
            model = model.train(
                X_train[c.cols].iloc[idxT],
                y_train.iloc[idxT],
                X_train[c.cols].iloc[idxV],
                y_train.iloc[idxV],
                num_boost_round=best_iteration,
                early_stopping_rounds=c.train.early_stopping_rounds,
                fold=fold)

            # Each row index is a validation row in exactly one fold, so a
            # plain assignment fills the zero-initialised array.
            oof[idxV] = model.predict(X_train[c.cols].iloc[idxV])
            preds += model.predict(X_test[c.cols]) / skf.n_splits
            del model
        logger.info(f'OOF cv= {roc_auc_score(y_train, oof)}')
        # NOTE(review): this path is recorded in the result although no
        # importance file is written by this function - confirm whether
        # downstream consumers expect the file to exist.
        paths.importance_path = f'feature/importance/importance_{c.runtime.version}{dsize}.csv'

    with blocktimer('Predict', level=INFO):
        sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        sub['TransactionID'] = X_test.reset_index()['TransactionID']
        sub['isFraud'] = preds

        paths.out_sub_path = f'data/submission/submission_{c.runtime.version}{dsize}.csv'
        sub.to_csv(paths.out_sub_path, index=False)

    result = EasyDict()
    result.update(c)
    result.scores = scores
    result.paths = paths
    return result
Exemple #5
0
def main(c):
    """Run preprocessing, optional tuning, grouped CV training and prediction.

    Args:
        c: Experiment configuration with attribute access. This function
            reads ``c.runtime.use_small_data``, ``c.runtime.version``,
            ``c.features[0]``, ``c.cols``, ``c.model`` (including
            ``c.model.type``), and from ``c.train``:
            ``optimize_num_boost_round``, ``n_splits``, ``num_boost_round``,
            ``optimize_model_params``, ``early_stopping_rounds``,
            ``train_model`` and ``predict``. When model-parameter
            optimization runs, ``c.model.params`` is overwritten in place.

    Returns:
        EasyDict: the entries of ``c`` plus ``scores``, ``paths`` and, when
        model-parameter optimization ran, ``optimize.best_params``.

    Side effects:
        Reads the train/test feature pickles; may write per-fold model files
        and an Optuna SQLite study under ``data/``; writes the submission
        CSV when both training and prediction are enabled.
    """
    dsize = '.small' if c.runtime.use_small_data is True else ''
    paths = EasyDict()
    scores = EasyDict()
    result = EasyDict()
    result.update(c)
    modelfactory = ModelFactory()
    # Test predictions only exist once the Train stage has run; keeping an
    # explicit None lets the Predict stage detect that instead of failing
    # with a NameError.
    preds = None

    with blocktimer('Preprocess', level=INFO):
        paths.in_train_path = f'data/feature/{c.features[0]}_train.pkl'
        paths.in_test_path = f'data/feature/{c.features[0]}_test.pkl'
        train = pd.read_pickle(paths.in_train_path)
        test = pd.read_pickle(paths.in_test_path)
        logger.debug(f'Loaded feature {c.features[0]}')

        if c.runtime.use_small_data:
            # Fixed-seed 0.1% sample so smoke-test runs are reproducible.
            frac = 0.001
            train = train.sample(frac=frac, random_state=42)
            test = test.sample(frac=frac, random_state=42)
        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')

        # Split into X, y
        X_train = train.drop('isFraud', axis=1)
        X_test = test
        y_train = train['isFraud'].copy(deep=True)
        del train, test

    with blocktimer('Optimize num_boost_round', level=INFO):
        if c.train.optimize_num_boost_round is True:
            # tune the model params
            model = modelfactory.create(c.model)
            best_iteration = optimize_num_boost_round(model, X_train[c.cols],
                                                      y_train,
                                                      c.train.n_splits, dsize,
                                                      paths, scores)
        else:
            logger.debug('Skip optimization')
            best_iteration = c.train.num_boost_round

    with blocktimer('Optimize model params', level=INFO):
        if c.train.optimize_model_params is True:
            # define objective for optuna
            def objectives(trial):
                """Return the out-of-fold AUC of one sampled parameter set."""
                max_depth = trial.suggest_int('max_depth', 3, 12)
                params = {
                    'boosting_type':
                    'gbdt',
                    # num_leaves should be smaller than approximately 2^max_depth*0.75
                    'num_leaves':
                    2**max_depth * 3 // 4,
                    'max_depth':
                    max_depth,
                    'learning_rate':
                    0.05,
                    'objective':
                    'binary',
                    'min_child_weight':
                    trial.suggest_loguniform('min_child_weight', 1e-3,
                                             1e0),  # 0.03454472573214212,
                    'reg_alpha':
                    trial.suggest_loguniform('reg_alpha', 1e-2,
                                             1e0),  # 0.3899927210061127,
                    'reg_lambda':
                    trial.suggest_loguniform('reg_lambda', 1e-2,
                                             1e0),  # 0.6485237330340494,
                    'random_state':
                    42,
                    'min_data_in_leaf':
                    trial.suggest_int('min_data_in_leaf', 50, 200),  # 106,
                    'metric':
                    'auc',
                    'max_bin':
                    255
                }
                # NOTE(review): this overwrites the shared config, so after
                # the study c.model.params holds the params of the last
                # executed trial (not necessarily the best one) when the
                # Train stage below creates its models - confirm intent.
                c.model.params = params

                # Train by 6-fold CV. Only the out-of-fold predictions feed
                # the score, so no test-set predictions are computed here.
                oof = np.zeros(len(X_train))

                skf = GroupKFold(n_splits=6)
                for i, (idxT, idxV) in enumerate(
                        skf.split(X_train, y_train, groups=X_train['DT_M'])):
                    fold = i + 1
                    month = X_train.iloc[idxV]['DT_M'].iloc[0]
                    model_fold_path = f'data/model/model_{c.runtime.version}_{c.model.type}_opt_fold{fold}{dsize}.pkl'
                    model = modelfactory.create(c.model)
                    logger.info(f'Fold {fold} withholding month {month}')
                    logger.info(
                        f'rows of train= {len(idxT)}, rows of holdout= {len(idxV)}'
                    )

                    model = model.train(
                        X_train[c.cols].iloc[idxT],
                        y_train.iloc[idxT],
                        X_train[c.cols].iloc[idxV],
                        y_train.iloc[idxV],
                        num_boost_round=best_iteration,
                        early_stopping_rounds=c.train.early_stopping_rounds,
                        fold=fold)

                    oof[idxV] = model.predict(X_train[c.cols].iloc[idxV])

                    # The path does not include the trial, so each trial
                    # overwrites the fold models saved by the previous one.
                    paths.update({f'model_fold_{fold}_path': model_fold_path})
                    model.save(paths[f'model_fold_{fold}_path'])
                    del model
                score = roc_auc_score(y_train, oof)
                # The score covers all folds of this trial, so label it with
                # the trial rather than with the last fold number.
                logger.info(f'Trial {trial.number} OOF cv= {score}')
                return score

            # run optimization; the study is persisted in SQLite and resumed
            # if a study with the same name already exists.
            opt = optuna.create_study(
                direction='maximize',
                study_name=f'parameter_study_0016{dsize}',
                storage=
                f'sqlite:///data/optimization/parameter_study_0016{dsize}.db',
                load_if_exists=True)
            opt.optimize(objectives, n_trials=20)
            trial = opt.best_trial
            logger.debug(f'Best trial: {trial.value}')
            logger.debug(f'Best params: {trial.params}')
            scores.best_trial = trial.value
            result.optimize = {}
            result.optimize.best_params = trial.params
        else:
            logger.debug('Skip optimization')

    with blocktimer('Train', level=INFO):
        if c.train.train_model:
            logger.debug(f'Now using the following {len(c.cols)} features.')
            logger.debug(f'{np.array(c.cols)}')

            # Out-of-fold predictions for train rows, fold-averaged ones
            # for test rows.
            oof = np.zeros(len(X_train))
            preds = np.zeros(len(X_test))

            skf = GroupKFold(n_splits=6)
            for i, (idxT, idxV) in enumerate(
                    skf.split(X_train, y_train, groups=X_train['DT_M'])):
                fold = i + 1
                month = X_train.iloc[idxV]['DT_M'].iloc[0]
                logger.info(f'Fold {fold} withholding month {month}')
                logger.info(
                    f'rows of train ={len(idxT)}, rows of holdout ={len(idxV)}'
                )

                model = modelfactory.create(c.model)
                model = model.train(
                    X_train[c.cols].iloc[idxT],
                    y_train.iloc[idxT],
                    X_train[c.cols].iloc[idxV],
                    y_train.iloc[idxV],
                    num_boost_round=best_iteration,
                    early_stopping_rounds=c.train.early_stopping_rounds,
                    fold=fold)

                oof[idxV] = model.predict(X_train[c.cols].iloc[idxV])
                preds += model.predict(X_test[c.cols]) / skf.n_splits
                del model
            logger.info(f'OOF cv= {roc_auc_score(y_train, oof)}')
            # NOTE(review): this path is recorded in the result although no
            # importance file is written by this function - confirm whether
            # downstream consumers expect the file to exist.
            paths.importance_path = f'feature/importance/importance_{c.runtime.version}{dsize}.csv'

    with blocktimer('Predict', level=INFO):
        if c.train.predict:
            if preds is None:
                # Prediction was requested without training: there is
                # nothing to submit, but the results gathered so far are
                # still returned to the caller.
                logger.warning(
                    'Skip prediction: no test predictions are available '
                    'because c.train.train_model is disabled.')
            else:
                sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
                sub['TransactionID'] = X_test.reset_index()['TransactionID']
                sub['isFraud'] = preds

                paths.out_sub_path = f'data/submission/submission_{c.runtime.version}{dsize}.csv'
                sub.to_csv(paths.out_sub_path, index=False)

    result.scores = scores
    result.paths = paths
    return result