Ejemplo n.º 1
0
def run_cv(model,
           X,
           y,
           folds=3,
           cv_type=StratifiedKFold,
           success_metric=roc_auc_score) -> Tuple:
    """
    Cross-validate `model` on X and y and summarise the fold scores.

    The splitter is created as `cv_type(n_splits=folds, shuffle=True)`. For
    every fold the model is refitted on the training rows and `success_metric`
    is evaluated on the second column of `predict_proba` for the held-out rows.
    X and y are indexed directly with the index arrays yielded by the splitter.

    Returns a tuple where:
     - the first item is the mean of the fold scores
     - the second item is the standard deviation of the fold scores
    Any exception is caught and reported through `log_and_stop`.
    """
    try:
        splitter = cv_type(n_splits=folds, shuffle=True)

        fold_scores = []
        for fit_rows, eval_rows in splitter.split(X, y):
            X_fit, X_eval = X[fit_rows], X[eval_rows]
            y_fit, y_eval = y[fit_rows], y[eval_rows]

            model.fit(X_fit, y_fit)
            # Only the second probability column is passed to the metric.
            scored_proba = model.predict_proba(X_eval)[:, 1]
            fold_scores.append(success_metric(y_eval, scored_proba))

        summary = np.mean(fold_scores), np.std(fold_scores)

        LOAN_LOGGER.info('CV on model completed')
        return summary
    except:
        log_and_stop(LOAN_LOGGER, 'CV on model NOT completed')
Ejemplo n.º 2
0
def _run_boosting_hyperopt(selected_model, X_train, y_train, X_test, y_test,
                           max_evals):
    """
    Search hyperparameters for a boosting model (LGBM or XGB) with Hyperopt.

    Every trial builds `selected_model` from the sampled values, fits it on
    X_train / y_train and scores it on X_test / y_test. The loss handed back to
    Hyperopt is the negated ROC AUC computed from the second column of
    `predict_proba`.
    tpe.suggest is used with n_startup_jobs=3, so that only the first 3 tries
    are random, instead of the default 20.
    The function returns a tuple where:
        - the first item is the dictionary returned by the fmin function
        - the second item is the Trials object used during the search
    Any exception raised while the search runs is caught and passed to
    `log_and_stop`.
    """
    # Sampled values forwarded to the estimator unchanged.
    passthrough_keys = ('colsample_bytree', 'learning_rate', 'reg_alpha',
                        'reg_lambda', 'subsample')
    # Sampled values that are cast to int before use.
    integer_keys = ('max_depth', 'min_child_weight', 'n_estimators')

    def objective(space):
        # Start from the settings that are identical for every trial.
        estimator_kwargs = {
            'num_leaves': 20,
            'random_state': 2020,
            'importance_type': 'gain',
            'n_jobs': -1
        }
        for key in passthrough_keys:
            estimator_kwargs[key] = space[key]
        for key in integer_keys:
            estimator_kwargs[key] = int(space[key])

        estimator = selected_model(**estimator_kwargs)
        estimator.fit(X_train, y_train)
        probabilities = estimator.predict_proba(X_test)
        auc = roc_auc_score(y_test, probabilities[:, 1])

        # fmin minimises the loss, so the AUC is negated.
        return {'loss': -auc, 'status': STATUS_OK}

    try:
        search_space = {
            'max_depth': hp.quniform('ho_max_depth', 5, 20, 1),
            'colsample_bytree': hp.uniform('ho_colsample_bytree', 0.8, 1.),
            'learning_rate': hp.uniform('ho_learning_rate', 0.05, 0.2),
            'subsample': hp.uniform('ho_subsample', 0.7, 1.),
            'min_child_weight': hp.quniform('ho_min_child_weight', 1, 10, 1),
            'reg_alpha': hp.loguniform('ho_reg_alpha', 0., 1.),
            'reg_lambda': hp.uniform('ho_reg_lambda', 0.7, 1.),
            'n_estimators': hp.quniform('ho_n_estimators', 50, 500, 5)
        }

        trial_log = Trials()
        suggest = partial(tpe.suggest, n_startup_jobs=3)
        best = fmin(fn=objective,
                    space=search_space,
                    algo=suggest,
                    max_evals=max_evals,
                    trials=trial_log)

        LOAN_LOGGER.info('Boosting Hyperopt finished successfully')
        return best, trial_log
    except:
        log_and_stop(LOAN_LOGGER, 'Boosting Hyperopt NOT finished successfully')
Ejemplo n.º 3
0
def drop_rows_with_nans(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Remove every row that has a NaN in `annual_inc` or `earliest_cr_line`.
    The input is copied first; the filtered DataFrame is returned.

    NOTE(review): earlier documentation of this function also listed
    `pub_rec_bankruptcies`, but the code does not check that column - confirm
    which one is intended.
    Any exception is caught and passed to `log_and_stop`.
    """
    required_cols = ['annual_inc', 'earliest_cr_line']
    try:
        cleaned = dataset.copy()
        cleaned = cleaned.dropna(subset=required_cols)

        LOAN_LOGGER.info('Rows containing NaNs removed')
        return cleaned
    except:
        log_and_stop(LOAN_LOGGER, 'Rows containing NaNs NOT removed')
Ejemplo n.º 4
0
def create_log_features(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Add the column `annual_inc_log`, holding log(1 + annual_inc).
    The input is copied first; the copy with the new feature is returned.
    Any exception is caught and passed to `log_and_stop`.
    """
    try:
        result = dataset.copy()
        income = result['annual_inc'].values
        result['annual_inc_log'] = np.log1p(income)

        LOAN_LOGGER.info('Logarithm features created')
        return result
    except:
        log_and_stop(LOAN_LOGGER, 'Logarithm features NOT created')
Ejemplo n.º 5
0
def extract_number_from_text(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Derive numeric features from the text columns `term` and `sub_grade`.
     - `term_month`: the first two characters of the stripped `term` value,
       converted to int
     - `sub_grade_digit`: the second character of `sub_grade`, converted to int
    The input is copied first; the copy with the new features is returned.
    Any exception is caught and passed to `log_and_stop`.
    """
    def _term_to_months(term_text):
        # Only the first two characters of the stripped text are read.
        return int(term_text.strip()[:2])

    def _sub_grade_to_digit(sub_grade_text):
        return int(sub_grade_text[1])

    try:
        result = dataset.copy()
        result['term_month'] = result['term'].map(_term_to_months)
        result['sub_grade_digit'] = result['sub_grade'].map(
            _sub_grade_to_digit)

        LOAN_LOGGER.info('Number values extracted from text columns')
        return result
    except:
        log_and_stop(LOAN_LOGGER,
                     'Number values NOT extracted from text columns')
Ejemplo n.º 6
0
def factorize_categorical_features(dataset: pd.DataFrame,
                                   factorized_dict: Dict) -> pd.DataFrame:
    """
    Encode categorical columns as numbers using the given factorized dictionary.
    For every key of `factorized_dict`, each value of the column with that name
    is replaced by the result of `get_loc` on the matching dictionary entry, and
    the outcome is stored in a new column named `<key>_cat`.
    The input is copied first; the copy with the new columns is returned.
    Any exception is caught and passed to `log_and_stop`.
    """
    try:
        encoded = dataset.copy()
        for cat_feat, labels in factorized_dict.items():
            target_col = '{}_cat'.format(cat_feat)
            encoded[target_col] = encoded[cat_feat].map(labels.get_loc)

        LOAN_LOGGER.info('Categorical features factorized')
        return encoded
    except:
        log_and_stop(LOAN_LOGGER, 'Categorical features NOT factorized')
Ejemplo n.º 7
0
def create_factorizing_dict(dataset: pd.DataFrame,
                            cat_feats: List[str]) -> Dict:
    """
    Build the label lookup for the given categorical columns.
    Each name in `cat_feats` is mapped to the second element returned by
    `pd.factorize` for that column (the unique values).
    Function returns the created dictionary.
    Any exception is caught and passed to `log_and_stop`.
    """
    try:
        source = dataset.copy()
        label_lookup = {
            feat: pd.factorize(source[feat])[1]
            for feat in cat_feats
        }

        LOAN_LOGGER.info('Factorized dictionary created')
        return label_lookup
    except:
        log_and_stop(LOAN_LOGGER, 'Factorized dictionary NOT created')
Ejemplo n.º 8
0
def optimize_dtypes(dataset: pd.DataFrame, dtype_cols) -> pd.DataFrame:
    """
    Optimize the dtype for the given set of columns in order to use less memory.
    `dtype_cols` maps a column name to the dtype that column is cast to.
    Additionally garbage collection is run.
    Function returns a new dataset with new dtypes; the input is not modified.
    Any exception (for example a missing column or an impossible cast) is
    caught and passed to `log_and_stop`.
    """
    try:
        df = dataset.copy()
        for column, new_dtype in dtype_cols.items():
            # Replace the whole column. Writing through `df.loc[:, column]`
            # sets the values in place on pandas 2.0 and later, which keeps the
            # old dtype whenever the values fit and silently discards the cast.
            df[column] = df[column].astype(new_dtype)

        gc.collect()

        LOAN_LOGGER.info('dtypes optimized for given columns')
        return df
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a dtype failure.
        message = 'dtypes NOT optimized for given columns'
        log_and_stop(LOAN_LOGGER, message)
Ejemplo n.º 9
0
def _run_tree_hyperopt(selected_model, X_train, y_train, X_test, y_test,
                       max_evals):
    """
    Search hyperparameters for the DecisionTree model with Hyperopt.

    Every trial builds `selected_model` from the sampled values (each cast to
    int) with random_state=2020, fits it on X_train / y_train and scores it on
    X_test / y_test. The loss handed back to Hyperopt is the negated ROC AUC
    computed from the second column of `predict_proba`.
    tpe.suggest is used with n_startup_jobs=3, so that only the first 3 tries
    are random, instead of the default 20.
    The function returns a tuple where:
        - the first item is the dictionary returned by the fmin function
        - the second item is the Trials object used during the search
    Any exception raised while the search runs is caught and passed to
    `log_and_stop`.
    """
    # Every sampled value is cast to int before it reaches the estimator.
    integer_keys = ('max_depth', 'min_samples_split', 'min_samples_leaf')

    def objective(space):
        estimator_kwargs = {key: int(space[key]) for key in integer_keys}
        estimator_kwargs['random_state'] = 2020

        estimator = selected_model(**estimator_kwargs)
        estimator.fit(X_train, y_train)
        probabilities = estimator.predict_proba(X_test)
        auc = roc_auc_score(y_test, probabilities[:, 1])

        # fmin minimises the loss, so the AUC is negated.
        return {'loss': -auc, 'status': STATUS_OK}

    try:
        search_space = {
            'max_depth': hp.quniform('ho_max_depth', 5, 20, 1),
            'min_samples_split': hp.quniform('ho_min_samples_split', 2, 10, 1),
            'min_samples_leaf': hp.quniform('ho_min_samples_leaf', 1, 10, 1),
        }

        trial_log = Trials()
        suggest = partial(tpe.suggest, n_startup_jobs=3)
        best = fmin(fn=objective,
                    space=search_space,
                    algo=suggest,
                    max_evals=max_evals,
                    trials=trial_log)

        LOAN_LOGGER.info('Tree Hyperopt finished successfully')
        return best, trial_log
    except:
        log_and_stop(LOAN_LOGGER, 'Tree Hyperopt NOT finished successfully')
Ejemplo n.º 10
0
def create_date_features(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Derive calendar features from the datetime columns `issue_d` and
    `earliest_cr_line`: the month and year of each, plus the number of days
    between `earliest_cr_line` and `issue_d`.
    The input is copied first; the copy with the new features is returned.
    Any exception is caught and passed to `log_and_stop`.
    """
    try:
        result = dataset.copy()
        issued = result['issue_d']
        earliest = result['earliest_cr_line']

        result['issue_d_month'] = issued.dt.month
        result['issue_d_year'] = issued.dt.year
        result['earliest_cr_line_month'] = earliest.dt.month
        result['earliest_cr_line_year'] = earliest.dt.year

        elapsed = issued - earliest
        result['days_between_earliest_cr_and_issue'] = elapsed.dt.days

        LOAN_LOGGER.info('Date features created')
        return result
    except:
        log_and_stop(LOAN_LOGGER, 'Date features NOT created')
Ejemplo n.º 11
0
def create_target_variable(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Add the boolean column `bad_loan`, which is True when `loan_status` is one
    of: `Charged Off`, `Late (31-120 days)`, `Late (16-30 days)`,
    `Does not meet the credit policy. Status:Charged Off`, `Default`.
    The input is copied first; the copy with the target column is returned.
    Any exception is caught and passed to `log_and_stop`.
    """
    # Statuses that mark a loan as bad.
    bad_status = [
        'Charged Off',
        'Late (31-120 days)',
        'Late (16-30 days)',
        'Does not meet the credit policy. Status:Charged Off',
        'Default',
    ]
    try:
        labelled = dataset.copy()
        is_bad = labelled['loan_status'].isin(bad_status)
        labelled['bad_loan'] = is_bad

        LOAN_LOGGER.info('Target Variable created')
        return labelled
    except:
        log_and_stop(LOAN_LOGGER, 'Target Variable NOT created')
Ejemplo n.º 12
0
def prepare_train_test_sets(features, target) -> Tuple:
    """
    Change the DataFrame and Series into float numpy arrays and divide the data
    into a training and test set.
    Test set will consist of 20% of observations; the split is stratified on
    the target and uses random_state=2020.
    Function returns X_train, X_test, y_train, y_test.
    Any exception is caught and passed to `log_and_stop`.
    """
    try:
        # `np.float` was only an alias of the builtin float; it was deprecated
        # in NumPy 1.20 and removed in 1.24, so the explicit 64-bit float
        # dtype is used instead (same resulting dtype).
        X = np.array(features, dtype=np.float64)
        y = np.array(target, dtype=np.float64)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=2020,
                                                            stratify=y)

        LOAN_LOGGER.info('Train and test sets prepared')
        return X_train, X_test, y_train, y_test
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a preparation failure.
        message = 'Train and test sets NOT prepared'
        log_and_stop(LOAN_LOGGER, message)
Ejemplo n.º 13
0
def train_and_test_model(model,
                         X_train,
                         y_train,
                         X_test,
                         y_test,
                         success_metric=roc_auc_score):
    """
    Fit the given model on the training set and evaluate it on the test set.
    `success_metric` is called with y_test and the second column of the
    model's `predict_proba` output for X_test.
    Returns a tuple where:
     - the first item is the score achieved on the test set
     - the second item is the full array of predicted probabilities
    Any exception is caught and passed to `log_and_stop`.
    """
    try:
        model.fit(X_train, y_train)
        probabilities = model.predict_proba(X_test)
        test_score = success_metric(y_test, probabilities[:, 1])

        LOAN_LOGGER.info('Model trained on train set and tested on test set')
        return test_score, probabilities
    except:
        log_and_stop(LOAN_LOGGER,
                     'Model NOT trained on train set and tested on test set')
Ejemplo n.º 14
0
def read_csv_file(file: str, usecols: List[str]) -> pd.DataFrame:
    """
    Load the loan data file `file` from the `data` directory located next to
    this module, reading only the columns listed in `usecols`.
    Columns issue_d and earliest_cr_line are parsed as datetime.
    int_rate, installment and annual_inc are read with explicitly specified
    float dtypes for memory optimization.
    Function returns the loaded DataFrame.
    Any exception is caught and passed to `log_and_stop`.
    """
    # Reduced-precision dtypes for the numerical columns.
    column_dtypes = {
        'int_rate': np.float16,
        'installment': np.float32,
        'annual_inc': np.float32
    }
    date_columns = ['issue_d', 'earliest_cr_line']

    try:
        module_dir = Path(__file__).parent.absolute()
        csv_path = module_dir.joinpath('data', file)

        loans = pd.read_csv(csv_path,
                            usecols=usecols,
                            parse_dates=date_columns,
                            dtype=column_dtypes)

        LOAN_LOGGER.info('File read')
        return loans
    except:
        log_and_stop(LOAN_LOGGER, 'File NOT read')
Ejemplo n.º 15
0
def get_model_params(selected_model,
                     X_train,
                     y_train,
                     X_test,
                     y_test,
                     hyperopt=False,
                     max_evals=10):
    """
    Retrieve parameters for the selected model, either by using the hyperopt algorithm or loading a given set of parameters.
    `selected_model` must be one of the keys of AVAILABLE_MODELS. `LGBM` and `XGB` are handled as boosting models,
    every other allowed key is handled as a DecisionTree.
    When `hyperopt` is True the search runs for `max_evals` evaluations on the given train and test data;
    otherwise the data arguments are not used and a stored set of parameters is returned.
    The given set of parameters was found using hyperopt.
    The function returns a tuple where:
        - the first item is a dictionary of model parameters ready to be used in a model
        - the second item is either the trials variable if hyperopt was done, or the string `No hyperopt done` otherwise
    If the model name is not allowed, the failure is passed to `log_and_stop`.
    """
    try:
        # Explicit check instead of `assert`: assertions are stripped under
        # `python -O`, which would let an unknown model fall through and be
        # treated as a DecisionTree.
        if str(selected_model) not in AVAILABLE_MODELS.keys():
            raise ValueError('Allowed models are {}'.format(
                AVAILABLE_MODELS.keys()))

        if selected_model in ('LGBM', 'XGB'):
            model_type = 'Boosting'
        else:
            model_type = 'DecisionTree'

        LOAN_LOGGER.info('Correct model chosen for parameter retrieval')
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a wrong model choice.
        message = 'Correct model NOT chosen for parameter retrieval'
        log_and_stop(LOAN_LOGGER, message)
    else:
        if hyperopt:
            if model_type == 'Boosting':
                best_params, trials = _run_boosting_hyperopt(
                    AVAILABLE_MODELS[selected_model], X_train, y_train, X_test,
                    y_test, max_evals)
                # fmin reports values under the `ho_`-prefixed labels of the
                # search space; map them back to estimator argument names.
                params_dict = {
                    'colsample_bytree': best_params['ho_colsample_bytree'],
                    'learning_rate': best_params['ho_learning_rate'],
                    'max_depth': int(best_params['ho_max_depth']),
                    'min_child_weight':
                    int(best_params['ho_min_child_weight']),
                    'n_estimators': int(best_params['ho_n_estimators']),
                    'reg_alpha': best_params['ho_reg_alpha'],
                    'reg_lambda': best_params['ho_reg_lambda'],
                    'subsample': best_params['ho_subsample'],
                    'num_leaves': 20,
                    'random_state': 2020,
                    'importance_type': 'gain',
                    'n_jobs': -1
                }
            else:
                best_params, trials = _run_tree_hyperopt(
                    AVAILABLE_MODELS[selected_model], X_train, y_train, X_test,
                    y_test, max_evals)
                params_dict = {
                    'max_depth': int(best_params['ho_max_depth']),
                    'min_samples_leaf':
                    int(best_params['ho_min_samples_leaf']),
                    'min_samples_split':
                    int(best_params['ho_min_samples_split']),
                    'random_state': 2020
                }

            return params_dict, trials
        else:
            # Stored parameter sets, one per model type.
            params_dict = {
                'Boosting': {
                    'colsample_bytree': 0.8899759555042142,
                    'learning_rate': 0.09532621848124778,
                    'max_depth': 11,
                    'min_child_weight': 4,
                    'n_estimators': 215,
                    'reg_alpha': 2.016992556501955,
                    'reg_lambda': 0.7643883757438669,
                    'subsample': 0.7651869713043127,
                    'num_leaves': 20,
                    'random_state': 2020,
                    'importance_type': 'gain',
                    'n_jobs': -1
                },
                'DecisionTree': {
                    'max_depth': 11,
                    'min_samples_leaf': 8,
                    'min_samples_split': 10,
                    'random_state': 2020
                },
            }

            return params_dict[model_type], 'No hyperopt done'