Example #1
def main(c_runtime, c_transformer, c_model, c_trainer, c_log):

    with blocktimer('Preprocess'):
        train, test = Transformer.run(**c_transformer.__dict__)
        X_train, y_train, X_test = split_X_y(train, test)

    with blocktimer('Tune & Train'):
        modelfactory = ModelFactory()

        # tune the model params
        model = modelfactory.create(c_model)
        optimal_c_model = tune_gbdt_params(model, X_train, y_train,
                                           c_trainer.n_splits)

        # train with best params, full data
        model = modelfactory.create(optimal_c_model)
        model = model.train(X_train, y_train)

    with blocktimer('Predict'):
        sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        sub['TransactionID'] = test['TransactionID']

        y_test = model.predict(X_test)

        sub['isFraud'] = y_test
        sub.to_csv(c_runtime.out_sub_path, index=False)
        logger.info(f'Saved {c_runtime.out_sub_path}')
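The blocktimer and split_X_y helpers come from the project's own modules and are not shown in the source. A minimal sketch of what they plausibly look like, with signatures inferred from the call sites above (the internals are assumptions, not the author's code):

import time
from contextlib import contextmanager
from logging import getLogger, DEBUG

logger = getLogger(__name__)

@contextmanager
def blocktimer(name, level=DEBUG):
    # Log entry and exit of a named block along with its wall-clock time.
    start = time.time()
    logger.log(level, f'{name} started')
    yield
    logger.log(level, f'{name} done in {time.time() - start:.1f}s')

def split_X_y(train, test, target='isFraud'):
    # Separate the target from the training features; the test set has no
    # target column, so it passes through unchanged.
    X_train = train.drop(target, axis=1)
    y_train = train[target].copy(deep=True)
    return X_train, y_train, test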
Example #2
def main(c):

    with blocktimer('Preprocess'):
        train, test = Transformer.run(**c.transformer.__dict__)
        # sort before splitting, so the TransactionID column written to the
        # submission stays aligned with the predictions made on X_test
        test = test.sort_values('TransactionDT')
        X_train, y_train, X_test = split_X_y(train, test)

    with blocktimer('Tune & Train'):
        modelfactory = ModelFactory()

        # tune the model params
        model = modelfactory.create(c.model)
        optimal_c_model = tune_gbdt_params(model, X_train, y_train,
                                           c.trainer.n_splits)

        # train with best params, full data
        model = modelfactory.create(optimal_c_model)
        model = model.train(X_train, y_train)

        # save results
        model.save(c.model.dir /
                   f'model_{c.runtime.VERSION}_{c.model.TYPE}.pkl')

        importance = pd.DataFrame(model.feature_importance,
                                  index=X_train.columns,
                                  columns=['importance'])

        importance_path = (c.runtime.ROOTDIR / 'feature/importance' /
                           f'importance_{c.runtime.VERSION}.csv')
        importance.to_csv(importance_path)
        logger.info(f'Saved {importance_path}')

    with blocktimer('Predict'):
        sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        sub['TransactionID'] = test['TransactionID']

        y_test = model.predict(X_test)

        sub['isFraud'] = y_test
        sub.to_csv(c.runtime.out_sub_path, index=False)
        logger.debug(f'Saved {c.runtime.out_sub_path}')
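Example 2 folds the five separate config arguments of Example 1 into one object with nested sections (c.runtime, c.transformer, c.model, c.trainer). A hypothetical sketch of such a config, using types.SimpleNamespace for the sections and pathlib.Path for the directories; the transformer keys are made up for illustration, and only c.transformer needs a real __dict__ for the **-unpacking above:

from pathlib import Path
from types import SimpleNamespace

c = SimpleNamespace(
    runtime=SimpleNamespace(VERSION='0001',
                            ROOTDIR=Path('.'),
                            out_sub_path=Path('data/submission/sub.csv')),
    transformer=SimpleNamespace(train_path='data/train.csv',
                                test_path='data/test.csv'),
    model=SimpleNamespace(TYPE='lgb', dir=Path('data/model'), params={}),
    trainer=SimpleNamespace(n_splits=5),
)
# Transformer.run(**c.transformer.__dict__) then receives
# train_path=... and test_path=... as keyword arguments.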
Example #3
def tune_gbdt_params(model, X, y, n_splits) -> dict:
    '''
    Tune num_boost_round by training with early stopping on
    time-series CV folds and return a model config carrying the result
    '''

    # start a log dedicated to tuning progress
    create_logger('train', **c.log)
    logger_train = getLogger('train')
    logger_train.debug('{}\t{}\t{}\t{}'.format('fold', 'iteration',
                                               'train_auc', 'val_auc'))

    aucs = list()
    feature_importances = pd.DataFrame()
    feature_importances['feature'] = X.columns

    # split data into train, validation
    folds = TimeSeriesSplit(n_splits=n_splits)
    for i, (idx_train, idx_val) in enumerate(folds.split(X, y)):
        fold = i + 1
        with blocktimer(f'Fold {fold}'):
            # prepare
            logger.info(f'Training on fold {fold}')
            X_train = X.iloc[idx_train]
            y_train = y.iloc[idx_train]
            X_val = X.iloc[idx_val]
            y_val = y.iloc[idx_val]

            # train
            model = model.train_and_validate(X_train, y_train, X_val, y_val,
                                             logger_train, fold)

            # record result
            feature_importances[f'fold_{fold}'] = model.feature_importance
            aucs.append(model.validation_auc)
            # TODO: save models at each step
            logger.debug(f'Fold {fold} finished')

    logger.info('Training has finished.')
    logger.debug(f'Mean AUC: {np.mean(aucs)}')
    # TODO: save feature importance and other artifacts

    # make optimal config from result
    optimal_c_model = model.config
    if model.best_iteration is not None:
        # new param
        optimal_c_model.params['num_boost_round'] = model.best_iteration
    else:
        logger.warning(
            'Did not meet early stopping. Try a larger num_boost_round.')
    # early_stopping_rounds is not needed once num_boost_round is fixed
    del optimal_c_model.params['early_stopping_rounds']
    return optimal_c_model
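tune_gbdt_params relies on sklearn's TimeSeriesSplit, which yields expanding-window folds: each fold trains on a prefix of the rows and validates on the chunk that follows, so later transactions never leak into earlier predictions. A quick runnable illustration:

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

X = np.arange(10).reshape(-1, 1)
for i, (idx_train, idx_val) in enumerate(TimeSeriesSplit(n_splits=3).split(X)):
    print(f'fold {i + 1}: train={idx_train} val={idx_val}')
# fold 1: train=[0 1 2 3] val=[4 5]
# fold 2: train=[0 1 2 3 4 5] val=[6 7]
# fold 3: train=[0 1 2 3 4 5 6 7] val=[8 9]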
Example #4
def tune_gbdt_params(model, X, y, n_splits) -> dict:
    '''
    Tune parameter num_boost_round
    '''
    # start a log dedicated to tuning progress
    create_logger('train', **c.log)
    logger_train = getLogger('train')
    logger_train.debug('{}\t{}\t{}\t{}'.format('fold', 'iteration',
                                               'train_auc', 'val_auc'))

    # aucs = list()

    # split data into train, validation
    folds = TimeSeriesSplit(n_splits=n_splits)
    for i, (idx_train, idx_val) in enumerate(folds.split(X, y)):
        fold = i + 1
        with blocktimer(f'Training on Fold {fold}'):
            X_train = X.iloc[idx_train]
            y_train = y.iloc[idx_train]
            X_val = X.iloc[idx_val]
            y_val = y.iloc[idx_val]

            # train
            model = model.train_and_validate(X_train, y_train, X_val, y_val,
                                             logger_train, fold)
            model.save(
                c.model.dir /
                f'model_{c.runtime.VERSION}_{c.model.TYPE}_fold{fold}.pkl')

            # record result
            # aucs.append(model.val_auc)
            # logger.info(f'train_auc: {model.train_auc} val_auc: {model.val_auc}')

    # logger.info(f'Mean AUC: {np.mean(aucs)}')

    # make optimal config from result
    optimal_c_model = model.config
    if model.best_iteration is not None:
        optimal_c_model.params['num_boost_round'] = model.best_iteration
    else:
        logger.warning(
            'Did not meet early stopping. Try a larger num_boost_round.')
    # early_stopping_rounds is not needed once num_boost_round is fixed
    del optimal_c_model.params['early_stopping_rounds']
    return optimal_c_model
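create_logger (called as create_logger('train', **c.log)) is another project helper whose definition is not shown, and the keys of c.log are unknown. A plausible sketch with hypothetical log_dir and version parameters, giving the 'train' logger its own file so the tab-separated fold/iteration/AUC rows land in one place:

from logging import getLogger, FileHandler, Formatter, DEBUG

def create_logger(name, log_dir='log', version='0000', **kwargs):
    logger_ = getLogger(name)
    logger_.setLevel(DEBUG)
    handler = FileHandler(f'{log_dir}/{name}_{version}.tsv')
    handler.setFormatter(Formatter('%(message)s'))  # raw rows, no prefix
    logger_.addHandler(handler)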
Example #5
def optimize_num_boost_round(model, X, y, n_splits, dsize, paths,
                             scores) -> int:
    '''
    Estimate best num_boost_round by early stopping
    '''
    # split data into train, validation
    folds = TimeSeriesSplit(n_splits=n_splits)
    for i, (idx_train, idx_val) in enumerate(folds.split(X, y)):
        fold = i + 1
        with blocktimer(f'Training on Fold {fold}', level=INFO):
            X_train = X.iloc[idx_train]
            y_train = y.iloc[idx_train]
            X_val = X.iloc[idx_val]
            y_val = y.iloc[idx_val]

            # train
            model = model.train(
                X_train,
                y_train,
                X_val,
                y_val,
                num_boost_round=c.train.num_boost_round,
                early_stopping_rounds=c.train.early_stopping_rounds,
                fold=fold)
            model_fold_path = f'data/model/model_{c.runtime.version}_{c.model.type}_fold{fold}{dsize}.pkl'
            paths.update({f'model_fold_{fold}_path': model_fold_path})
            model.save(paths[f'model_fold_{fold}_path'])

    # make optimal config from result
    if model.best_iteration is not None and model.best_iteration > 0:
        logger.info(
            f'Early stopping. Best iteration is: {model.best_iteration}')
        scores.best_iteration = model.best_iteration
        return model.best_iteration
    else:
        logger.warning(
            'Did not meet early stopping. Try a larger num_boost_round.')
        scores.best_iteration = None
        return c.train.num_boost_round
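optimize_num_boost_round depends on the model wrapper exposing best_iteration after early stopping. For a LightGBM model the wrapped train call presumably looks something like this minimal sketch (lightgbm >= 3.3 callback API; the wrapper's real internals are not shown in the source):

import lightgbm as lgb

def train_with_early_stopping(params, X_tr, y_tr, X_val, y_val,
                              num_boost_round, early_stopping_rounds):
    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)
    booster = lgb.train(
        params, dtrain,
        num_boost_round=num_boost_round,
        valid_sets=[dtrain, dval],
        callbacks=[lgb.early_stopping(early_stopping_rounds)])
    # best_iteration holds the best validation round when early stopping fired
    return booster, booster.best_iteration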
Example #6
def main(c, r):
    r.scores = {}

    with blocktimer('Preprocess', level=INFO):
        # unpack the feature set mapping set[name] = cols; note that with
        # more than one entry only the last loaded train/test/cols are kept
        for name, col_list in c.feature.set.items():
            in_train_path = f'data/feature/{name}_train.pkl'
            in_test_path = f'data/feature/{name}_test.pkl'
            cols = col_list
            train = pd.read_pickle(in_train_path)
            test = pd.read_pickle(in_test_path)
            logger.debug(f'Loaded feature {name}')

        if c.runtime.use_small_data:
            frac = 0.001
            train = train.sample(frac=frac, random_state=42)
            test = test.sample(frac=frac, random_state=42)

        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')

        # Split into X, y
        X_train = train.drop(c.feature.target, axis=1)
        y_train = train[c.feature.target].copy(deep=True)
        X_test = test
        del train, test

    with blocktimer('Tune hyper params', level=INFO):
        '''
        Run optimization
        '''
        mlflow.log_param('type', c.model.type)
        mlflow.log_param('num_boost_round', c.train.num_boost_round)
        mlflow.log_param('early_stopping_rounds',
                         c.train.early_stopping_rounds)

        f = partial(objective,
                    X_train=X_train,
                    y_train=y_train,
                    X_test=X_test,
                    cols=cols,
                    c=c)
        opt = optuna.create_study(
            direction='maximize',
            study_name=f'{experiment_type}_{c.runtime.version}{c.runtime.dsize}',
            storage=(f'sqlite:///data/optimization/'
                     f'{experiment_type}_{c.runtime.version}{c.runtime.dsize}.db'),
            load_if_exists=True)
        opt.optimize(f, n_trials=c.optimize.n_trials)
        trial = opt.best_trial

        r.optimize = {}
        r.scores.best_trial = trial.number
        r.scores.best_score = trial.value
        r.optimize.best_params = trial.params
        tuned_params = c.model.params.copy()
        tuned_params.update(trial.params)
        r.model.tuned_params = tuned_params

        logger.debug(f'Best trial: {trial.number}')
        logger.debug(f'Best score: {trial.value}')
        logger.debug(f'Best params: {trial.params}')

        mlflow.log_metric('best_trial', trial.number)
        mlflow.log_metric('best_score', trial.value)
        mlflow.log_params(trial.params)

        return r
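The create_study call with a sqlite:// storage persists every trial to disk, and load_if_exists=True makes a rerun resume the study instead of starting over. A self-contained toy run of the same pattern:

import optuna

def objective(trial):
    x = trial.suggest_float('x', -10, 10)
    return -(x - 2) ** 2  # maximized at x = 2

study = optuna.create_study(
    direction='maximize',
    study_name='demo',
    storage='sqlite:///demo.db',  # trials persist on disk
    load_if_exists=True)          # rerunning appends to the same study
study.optimize(objective, n_trials=20)
print(study.best_trial.number, study.best_trial.value, study.best_trial.params)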
Example #7
def main(c):
    dsize = '.small' if c.runtime.use_small_data else ''
    paths = EasyDict()
    scores = EasyDict()
    modelfactory = ModelFactory()

    with blocktimer('Preprocess', level=INFO):
        paths.in_train_path = f'data/feature/{c.features[0]}_train.pkl'
        paths.in_test_path = f'data/feature/{c.features[0]}_test.pkl'
        train = pd.read_pickle(paths.in_train_path)
        test = pd.read_pickle(paths.in_test_path)
        logger.debug(f'Loaded feature {c.features[0]}')

        if c.runtime.use_small_data:
            frac = 0.001
            train = train.sample(frac=frac, random_state=42)
            test = test.sample(frac=frac, random_state=42)
        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')

        # Split into X, y
        X_train = train.drop('isFraud', axis=1)
        X_test = test
        y_train = train['isFraud'].copy(deep=True)
        del train, test

    with blocktimer('Optimize', level=INFO):
        if c.train.optimize_num_boost_round is True:
            # tune the model params
            model = modelfactory.create(c.model)
            best_iteration = optimize_num_boost_round(model, X_train[c.cols],
                                                      y_train,
                                                      c.train.n_splits, dsize,
                                                      paths, scores)
        else:
            logger.debug('Skip optimization')
            best_iteration = c.train.num_boost_round

    with blocktimer('Train', level=INFO):
        logger.debug(f'Now using the following {len(c.cols)} features.')
        logger.debug(f'{np.array(c.cols)}')

        # CHRIS - TRAIN 75% PREDICT 25%
        idxT = X_train.index[:3 * len(X_train) // 4]
        idxV = X_train.index[3 * len(X_train) // 4:]
        # earlier single-split training scheme, kept disabled for reference:
        '''
        model = modelfactory.create(c.model)
        model = model.train(X_train.loc[idxT, :], y_train[idxT],
                            X_train.loc[idxV, :], y_train[idxV],
                            num_boost_round=best_iteration)
        importance = pd.DataFrame(model.feature_importance,
                                  index=X_train.columns,
                                  columns=['importance'])

        # save results
        paths.out_model_dir = f'data/model/model_{c.runtime.version}_{c.model.type}{dsize}.pkl'
        paths.importance_path = f'feature/importance/importance_{c.runtime.version}{dsize}.csv'
        model.save(paths.out_model_dir)
        importance.to_csv(paths.importance_path)
        '''

        from sklearn.model_selection import GroupKFold
        from sklearn.metrics import roc_auc_score
        oof = np.zeros(len(X_train))
        preds = np.zeros(len(X_test))

        skf = GroupKFold(n_splits=6)
        for i, (idxT, idxV) in enumerate(
                skf.split(X_train, y_train, groups=X_train['DT_M'])):
            month = X_train.iloc[idxV]['DT_M'].iloc[0]
            logger.info(f'Fold {i+1} withholding month {month}')
            logger.info(
                f'rows of train = {len(idxT)}, rows of holdout = {len(idxV)}')

            categorical_features = [
                'ProductCD',
                'M4',
                'card1',
                'card2',
                'card3',
                'card5',
                'card6',
                'addr1',
                'addr2',
                'dist1',
                'dist2',
                'P_emaildomain',
                'R_emaildomain',
            ]

            model = modelfactory.create(c.model)
            model = model.train(
                X_train[c.cols].iloc[idxT],
                y_train.iloc[idxT],
                X_train[c.cols].iloc[idxV],
                y_train.iloc[idxV],
                num_boost_round=best_iteration,
                early_stopping_rounds=c.train.early_stopping_rounds,
                # categorical_features=categorical_features,
                fold=i + 1)

            oof[idxV] += model.predict(X_train[c.cols].iloc[idxV])
            preds += model.predict(X_test[c.cols]) / skf.n_splits
            del model
        logger.info(f'OOF cv= {roc_auc_score(y_train, oof)}')
        paths.importance_path = f'feature/importance/importance_{c.runtime.version}{dsize}.csv'
        # model.save(paths.out_model_dir)
        '''
        importance = pd.DataFrame(model.feature_importance,
                                  index=X_train.columns,
                                  columns=['importance'])
        importance.to_csv(paths.importance_path)
        '''

    with blocktimer('Predict', level=INFO):
        # y_test = model.predict(X_test)
        sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        sub['TransactionID'] = X_test.reset_index()['TransactionID']
        # sub['isFraud'] = y_test
        sub['isFraud'] = preds

        paths.out_sub_path = f'data/submission/submission_{c.runtime.version}{dsize}.csv'
        sub.to_csv(paths.out_sub_path, index=False)

    result = EasyDict()
    result.update(c)
    result.scores = scores
    result.paths = paths
    return result
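The GroupKFold loop (the scheme credited to Chris in the comment above) groups rows by DT_M so that each fold withholds an entire month, making every validation score an estimate of predicting an unseen month. A tiny runnable illustration of the grouping behaviour:

import pandas as pd
from sklearn.model_selection import GroupKFold

df = pd.DataFrame({'DT_M': [1, 1, 2, 2, 3, 3], 'x': range(6)})
for i, (idxT, idxV) in enumerate(
        GroupKFold(n_splits=3).split(df, groups=df['DT_M'])):
    print(f'fold {i + 1}: holdout month(s) {df.iloc[idxV]["DT_M"].unique()}')
# every row of a given DT_M lands in exactly one validation fold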
Example #8
def main(c):
    dsize = '.small' if c.runtime.use_small_data else ''
    paths = EasyDict()
    scores = EasyDict()
    result = EasyDict()
    result.update(c)
    modelfactory = ModelFactory()

    with blocktimer('Preprocess', level=INFO):
        paths.in_train_path = f'data/feature/{c.features[0]}_train.pkl'
        paths.in_test_path = f'data/feature/{c.features[0]}_test.pkl'
        train = pd.read_pickle(paths.in_train_path)
        test = pd.read_pickle(paths.in_test_path)
        logger.debug(f'Loaded feature {c.features[0]}')

        if c.runtime.use_small_data:
            frac = 0.001
            train = train.sample(frac=frac, random_state=42)
            test = test.sample(frac=frac, random_state=42)
        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')

        # Split into X, y
        X_train = train.drop('isFraud', axis=1)
        X_test = test
        y_train = train['isFraud'].copy(deep=True)
        del train, test

    with blocktimer('Optimize num_boost_round', level=INFO):
        if c.train.optimize_num_boost_round is True:
            # tune the model params
            model = modelfactory.create(c.model)
            best_iteration = optimize_num_boost_round(model, X_train[c.cols],
                                                      y_train,
                                                      c.train.n_splits, dsize,
                                                      paths, scores)
        else:
            logger.debug('Skip optimization')
            best_iteration = c.train.num_boost_round

    with blocktimer('Optimize model params', level=INFO):
        if c.train.optimize_model_params is True:
            # define objective for optuna
            def objectives(trial):
                max_depth = trial.suggest_int('max_depth', 3, 12)
                params = {
                    'boosting_type': 'gbdt',
                    # num_leaves should be smaller than approx. 2^max_depth * 0.75
                    'num_leaves': 2 ** max_depth * 3 // 4,
                    'max_depth': max_depth,
                    'learning_rate': 0.05,
                    'objective': 'binary',
                    'min_child_weight': trial.suggest_loguniform(
                        'min_child_weight', 1e-3, 1e0),  # 0.03454472573214212
                    'reg_alpha': trial.suggest_loguniform(
                        'reg_alpha', 1e-2, 1e0),  # 0.3899927210061127
                    'reg_lambda': trial.suggest_loguniform(
                        'reg_lambda', 1e-2, 1e0),  # 0.6485237330340494
                    'random_state': 42,
                    'min_data_in_leaf': trial.suggest_int(
                        'min_data_in_leaf', 50, 200),  # 106
                    'metric': 'auc',
                    'max_bin': 255,
                }
                c.model.params = params

                # Train by 6-fold CV
                oof = np.zeros(len(X_train))
                preds = np.zeros(len(X_test))

                skf = GroupKFold(n_splits=6)
                for i, (idxT, idxV) in enumerate(
                        skf.split(X_train, y_train, groups=X_train['DT_M'])):
                    fold = i + 1
                    month = X_train.iloc[idxV]['DT_M'].iloc[0]
                    model_fold_path = f'data/model/model_{c.runtime.version}_{c.model.type}_opt_fold{fold}{dsize}.pkl'
                    model = modelfactory.create(c.model)
                    logger.info(f'Fold {fold} withholding month {month}')
                    logger.info(f'rows of train = {len(idxT)}, '
                                f'rows of holdout = {len(idxV)}')

                    model = model.train(
                        X_train[c.cols].iloc[idxT],
                        y_train.iloc[idxT],
                        X_train[c.cols].iloc[idxV],
                        y_train.iloc[idxV],
                        num_boost_round=best_iteration,
                        early_stopping_rounds=c.train.early_stopping_rounds,
                        # categorical_features=categorical_features,
                        fold=i + 1)

                    oof[idxV] = model.predict(X_train[c.cols].iloc[idxV])
                    preds += model.predict(X_test[c.cols]) / skf.n_splits

                    paths.update({f'model_fold_{fold}_path': model_fold_path})
                    model.save(paths[f'model_fold_{fold}_path'])
                    del model
                score = roc_auc_score(y_train, oof)
                logger.info(f'OOF cv= {score}')
                return score

            # run optimization
            opt = optuna.create_study(
                direction='maximize',
                study_name=f'parameter_study_0016{dsize}',
                storage=(f'sqlite:///data/optimization/'
                         f'parameter_study_0016{dsize}.db'),
                load_if_exists=True)
            opt.optimize(objectives, n_trials=20)
            trial = opt.best_trial
            logger.debug(f'Best trial: {trial.number}')
            logger.debug(f'Best score: {trial.value}')
            logger.debug(f'Best params: {trial.params}')
            scores.best_trial = trial.number
            scores.best_score = trial.value
            result.optimize = {}
            result.optimize.best_params = trial.params
        else:
            logger.debug('Skip optimization')

    with blocktimer('Train', level=INFO):
        if c.train.train_model:
            logger.debug(f'Now using the following {len(c.cols)} features.')
            logger.debug(f'{np.array(c.cols)}')

            oof = np.zeros(len(X_train))
            preds = np.zeros(len(X_test))

            skf = GroupKFold(n_splits=6)
            for i, (idxT, idxV) in enumerate(
                    skf.split(X_train, y_train, groups=X_train['DT_M'])):
                month = X_train.iloc[idxV]['DT_M'].iloc[0]
                logger.info(f'Fold {i+1} withholding month {month}')
                logger.info(f'rows of train = {len(idxT)}, '
                            f'rows of holdout = {len(idxV)}')
                '''
                categorical_features = ['ProductCD', 'M4',
                                        'card1', 'card2', 'card3', 'card5', 'card6',
                                        'addr1', 'addr2', 'dist1', 'dist2',
                                        'P_emaildomain', 'R_emaildomain',
                                        ]
                '''

                model = modelfactory.create(c.model)
                model = model.train(
                    X_train[c.cols].iloc[idxT],
                    y_train.iloc[idxT],
                    X_train[c.cols].iloc[idxV],
                    y_train.iloc[idxV],
                    num_boost_round=best_iteration,
                    early_stopping_rounds=c.train.early_stopping_rounds,
                    # categorical_features=categorical_features,
                    fold=i + 1)

                oof[idxV] = model.predict(X_train[c.cols].iloc[idxV])
                preds += model.predict(X_test[c.cols]) / skf.n_splits
                del model
            logger.info(f'OOF cv= {roc_auc_score(y_train, oof)}')
            paths.importance_path = f'feature/importance/importance_{c.runtime.version}{dsize}.csv'
            # model.save(paths.out_model_dir)
            '''
            importance = pd.DataFrame(model.feature_importance,
                                      index=X_train.columns,
                                      columns=['importance'])
            importance.to_csv(paths.importance_path)
            '''

    with blocktimer('Predict', level=INFO):
        if c.train.predict:
            # assumes the Train block above ran and populated preds
            sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
            sub['TransactionID'] = X_test.reset_index()['TransactionID']
            sub['isFraud'] = preds

            paths.out_sub_path = f'data/submission/submission_{c.runtime.version}{dsize}.csv'
            sub.to_csv(paths.out_sub_path, index=False)

    result.scores = scores
    result.paths = paths
    return result
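paths, scores and result are EasyDicts, which is what makes chains like result.optimize = {} followed by result.optimize.best_params = ... work: assigned plain dicts are converted recursively, and every entry is reachable both as an attribute and as a key. A quick demonstration, assuming the easydict package:

from easydict import EasyDict

r = EasyDict()
r.scores = {}                  # plain dict is converted to a nested EasyDict
r.scores.best_score = 0.93     # so attribute assignment works
r.update({'runtime': {'version': '0016'}})
print(r.runtime.version, r['scores']['best_score'])  # 0016 0.93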