Code Example #1
def test_transition_features():
    expl = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation('class1',
                              feature_weights=FeatureWeights(
                                  pos=[FeatureWeight('pos', 13, value=1)],
                                  neg=[],
                              )),
            TargetExplanation('class2',
                              feature_weights=FeatureWeights(
                                  pos=[FeatureWeight('pos', 13, value=1)],
                                  neg=[],
                              )),
        ],
        transition_features=TransitionFeatureWeights(
            class_names=['class2', 'class1'],  # reverse on purpose
            coef=np.array([[1.5, 2.5], [3.5, 4.5]]),
        ))
    df_dict = format_as_dataframes(expl)
    assert isinstance(df_dict, dict)
    assert set(df_dict) == {'targets', 'transition_features'}
    assert df_dict['targets'].equals(format_as_dataframe(expl.targets))
    df = df_dict['transition_features']
    print(df)
    print(format_as_text(expl))
    assert str(df) == ('to      class2  class1\n'
                       'from                  \n'
                       'class2     1.5     2.5\n'
                       'class1     3.5     4.5')

    with pytest.warns(UserWarning):
        single_df = format_as_dataframe(expl)
    assert single_df.equals(df)
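The test-style snippets in this collection are shown without their module-level imports. A minimal sketch of the imports they appear to rely on, assuming eli5's usual public module layout (the scikit-learn estimators, the boston_train fixture and the check_prediction_df helper used further below come from elsewhere in the test suite):

import numpy as np
import pandas as pd
import pytest

from eli5 import explain_weights, explain_prediction
from eli5.base import (Explanation, TargetExplanation, FeatureWeight,
                       FeatureWeights, FeatureImportances,
                       TransitionFeatureWeights)
from eli5.formatters import format_as_text
from eli5.formatters.as_dataframe import (
    format_as_dataframe, format_as_dataframes,
    explain_prediction_df, explain_prediction_dfs)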
Code Example #2
def test_transition_features():
    expl = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation('class1',
                              feature_weights=FeatureWeights(
                                  pos=[FeatureWeight('pos', 13, value=1)],
                                  neg=[],
                              )),
            TargetExplanation('class2',
                              feature_weights=FeatureWeights(
                                  pos=[FeatureWeight('pos', 13, value=1)],
                                  neg=[],
                              )),
        ],
        transition_features=TransitionFeatureWeights(
            class_names=['class2', 'class1'],  # reverse on purpose
            coef=np.array([[1.5, 2.5], [3.5, 4.5]]),
        ))
    df_dict = format_as_dataframes(expl)
    assert isinstance(df_dict, dict)
    assert set(df_dict) == {'targets', 'transition_features'}
    assert df_dict['targets'].equals(format_as_dataframe(expl.targets))
    df = df_dict['transition_features']
    print(df)
    print(format_as_text(expl))
    expected = pd.DataFrame([
        {
            'from': 'class2',
            'to': 'class2',
            'coef': 1.5
        },
        {
            'from': 'class2',
            'to': 'class1',
            'coef': 2.5
        },
        {
            'from': 'class1',
            'to': 'class2',
            'coef': 3.5
        },
        {
            'from': 'class1',
            'to': 'class1',
            'coef': 4.5
        },
    ],
                            columns=['from', 'to', 'coef'])
    assert df.equals(expected)
    with pytest.warns(UserWarning):
        single_df = format_as_dataframe(expl)
    assert single_df.equals(df)
Code Example #3
File: model.py Project: sai-krishna-msk/KickAssist
    def pred(self):
        self._prepData()
        self.pred_oh_intr = eli5.format_as_dataframe(
            eli5.explain_prediction_xgboost(self.model_oh.get_booster(),
                                            self.df_pred_oh.iloc[0],
                                            feature_filter=self._filter_func))
        self.pred_hel_intr = eli5.format_as_dataframe(
            eli5.explain_prediction_xgboost(self.model_hel.get_booster(),
                                            self.df_pred_hel.iloc[0],
                                            feature_filter=self._filter_func))

        self.pred_oh = self.model_oh.predict_proba(self.df_pred_oh)
        self.pred_hel = self.model_hel.predict_proba(self.df_pred_hel)
Code Example #4
def test_feature_importances(with_std, with_value):
    expl = Explanation(estimator='some estimator',
                       feature_importances=FeatureImportances(
                           importances=[
                               FeatureWeight('a',
                                             1,
                                             std=0.1 if with_std else None,
                                             value=1 if with_value else None),
                               FeatureWeight('b',
                                             2,
                                             std=0.2 if with_std else None,
                                             value=3 if with_value else None),
                           ],
                           remaining=10,
                       ))
    df_dict = format_as_dataframes(expl)
    assert isinstance(df_dict, dict)
    assert list(df_dict) == ['feature_importances']
    df = df_dict['feature_importances']
    expected_df = pd.DataFrame({'weight': [1, 2]}, index=['a', 'b'])
    if with_std:
        expected_df['std'] = [0.1, 0.2]
    if with_value:
        expected_df['value'] = [1, 3]
    print(df, expected_df, sep='\n')
    assert expected_df.equals(df)

    single_df = format_as_dataframe(expl)
    assert expected_df.equals(single_df)
Code Example #5
File: example.py Project: ilgrad/tasks
def main():
    df = pd.read_excel('data/mr_vs_fr_30.xlsx')
    df = df.sample(frac=1, random_state=seed)

    df['text_lemmatized'] = df['text'].apply(morphText)

    X_train, X_test, y_train, y_test = train_test_split(
        df['text_lemmatized'], df['label'], test_size=0.3, random_state=42, stratify=df['label'])

    flag_test = True
    get_pipe(X_train, y_train, flag_test, X_test, y_test)

    flag_test = False
    pipe = get_pipe(df['text_lemmatized'], df['label'], flag_test)

    k = 0
    words = []
    for index, row in df.iterrows():
        te5 = TextExplainer(clf=DecisionTreeClassifier(max_depth=5), random_state=seed)
        te5.fit(row['text_lemmatized'], pipe.predict_proba)
        df_eli5_w = eli5.format_as_dataframe(te5.explain_weights())
        print('class {}'.format('male' if row['label'] == 0 else 'woman'))
        print('predict:')
        print(df_eli5_w)
        print(100*'*')
        temp_m = ', '.join(df_eli5_w[df_eli5_w['weight'] > 0]['feature'].tolist())
        if temp_m:
            words.append(temp_m)
        else:
            words.append('')
        k += 1

    df['words'] = words
    df.to_excel('mr_vs_fr_words_30.xlsx', index=False)
Code Example #6
def test_targets_with_value():
    expl = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation('y',
                              feature_weights=FeatureWeights(
                                  pos=[
                                      FeatureWeight('a', 13, value=1),
                                      FeatureWeight('b', 5, value=2)
                                  ],
                                  neg=[
                                      FeatureWeight('neg1', -10, value=3),
                                      FeatureWeight('neg2', -1, value=4)
                                  ],
                              )),
            TargetExplanation('y2',
                              feature_weights=FeatureWeights(
                                  pos=[FeatureWeight('f', 1, value=5)],
                                  neg=[],
                              )),
        ],
    )
    df = format_as_dataframe(expl)
    expected_df = pd.DataFrame(
        {
            'weight': [13, 5, -1, -10, 1],
            'value': [1, 2, 4, 3, 5]
        },
        columns=['weight', 'value'],
        index=pd.MultiIndex.from_tuples([('y', 'a'), ('y', 'b'), ('y', 'neg2'),
                                         ('y', 'neg1'), ('y2', 'f')],
                                        names=['target', 'feature']))
    print(df, expected_df, sep='\n')
    assert expected_df.equals(df)
Code Example #7
def print_eli5(click_data, category):
    pred = pd.read_csv(category + '_xy.csv')
    model = joblib.load(category + ".h5")
    pred = pred.loc[(pred['grid_x'] == click_data['points'][0]['lon']) &
                    (pred['grid_y'] == click_data['points'][0]['lat']), :]
    pred_sqr = pred['eurogrid_0250_1'].values[0]
    dane_model = df.loc[df['eurogrid_0250_1'] == pred_sqr, :]
    dict_ = eli5.format_as_dataframe(eli5.explain_weights(model))
    cols = dict_['feature'].values
    maping = {}
    for i in range(len(cols)):
        maping['x' + str(i)] = cols[i]


#    print(dane_model.columns)
    expl = dane_model.loc[:, cols]
    #    print(expl.head())
    all_cols = itertools.permutations(cols)
    for cols in all_cols:
        try:
            expl = expl.loc[:, list(cols)]
            expl = eli5.formatters.format_as_dataframe(
                eli5.explain_prediction(model, expl))
            break
        except:
            continue
    expl['feature'] = expl['feature'].apply(lambda x: map_x(x, maping))
    return generate_table(expl)
Code Example #8
def test_targets(with_std, with_value):
    expl = Explanation(
        estimator='some estimator',
        targets=[
            TargetExplanation(
                'y',
                feature_weights=FeatureWeights(
                    pos=[
                        FeatureWeight('a',
                                      13,
                                      std=0.13 if with_std else None,
                                      value=2 if with_value else None),
                        FeatureWeight('b',
                                      5,
                                      std=0.5 if with_std else None,
                                      value=1 if with_value else None)
                    ],
                    neg=[
                        FeatureWeight('neg1',
                                      -10,
                                      std=0.2 if with_std else None,
                                      value=5 if with_value else None),
                        FeatureWeight('neg2',
                                      -1,
                                      std=0.3 if with_std else None,
                                      value=4 if with_value else None)
                    ],
                )),
            TargetExplanation('y2',
                              feature_weights=FeatureWeights(
                                  pos=[FeatureWeight('f', 1)],
                                  neg=[],
                              )),
        ],
    )
    df_dict = format_as_dataframes(expl)
    assert isinstance(df_dict, dict)
    assert list(df_dict) == ['targets']
    df = df_dict['targets']
    expected_df = pd.DataFrame(
        {
            'target': ['y', 'y', 'y', 'y', 'y2'],
            'feature': ['a', 'b', 'neg2', 'neg1', 'f'],
            'weight': [13, 5, -1, -10, 1]
        },
        columns=['target', 'feature', 'weight'])
    if with_std:
        expected_df['std'] = [0.13, 0.5, 0.3, 0.2, None]
    if with_value:
        expected_df['value'] = [2, 1, 4, 5, None]
    print(df, expected_df, sep='\n')
    assert expected_df.equals(df)

    single_df = format_as_dataframe(expl)
    assert expected_df.equals(single_df)
Code Example #9
def test_explain_prediction(boston_train):
    X, y, feature_names = boston_train
    reg = LinearRegression()
    reg.fit(X, y)
    expl = explain_prediction(reg, X[0])
    df = format_as_dataframe(expl)
    check_prediction_df(df, expl)
    check_prediction_df(explain_prediction_df(reg, X[0]), expl)
    df_dict = explain_prediction_dfs(reg, X[0])
    assert set(df_dict.keys()) == {'targets'}
    check_prediction_df(df_dict['targets'], expl)
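As a point of reference, here is a hedged standalone sketch of the same explain-then-format pattern outside of pytest; the make_regression data is illustrative and merely stands in for the boston_train fixture used above:

import eli5
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

# Illustrative data (the original test uses a boston_train fixture instead).
X, y = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=0)
reg = LinearRegression().fit(X, y)

# Explain a single prediction, then convert the Explanation object to a DataFrame.
expl = eli5.explain_prediction(reg, X[0])
df = eli5.format_as_dataframe(expl)
print(df)  # one row per contributing feature (including the bias term)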
Code Example #10
def test_explain_weights_fi(boston_train):
    X, y, feature_names = boston_train
    reg = ExtraTreesRegressor()
    reg.fit(X, y)
    expl = explain_weights(reg)
    df = format_as_dataframe(expl)
    assert list(df.columns) == ['weight', 'std']
    for fw in expl.feature_importances.importances:
        df_fw = df.loc[fw.feature]
        assert np.isclose(df_fw['weight'], fw.weight)
        assert np.isclose(df_fw['std'], fw.std)
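eli5 also ships one-step convenience wrappers that combine explain_weights / explain_prediction with the DataFrame formatting; a hedged one-liner, assuming the fitted ExtraTreesRegressor and feature_names from the test above:

from eli5 import explain_weights_df

df = explain_weights_df(reg, feature_names=feature_names)  # explain_weights + format_as_dataframe in one call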
Code Example #11
File: pycode_modelling.py Project: dncholls/capstone
def prediction(model, test_feat, test_lab, cross_valid, name, pred_prob):
    if pred_prob == False:
        pred = model.predict(test_feat)

        if cross_valid == False:
            return results(test_lab, pred, cross_valid, name)

        else:
            performance = results(test_lab, pred, cross_valid, name)
            return performance

    else:
        pred = model.predict(test_feat)
        results(test_lab, pred, cross_valid, name)

        if name == "Decision Tree" or name == "Random Forest":
            expl = explain_prediction_tree_classifier(model, test_feat.iloc[0])
            expl_df = format_as_dataframe(expl)

            if name == "Decision Tree":
                expl_df.to_csv('/.../expl_dt.csv')

            else:
                expl_df.to_csv('/.../expl_rf.csv')
                expfi = explain_rf_feature_importance(
                    model, feature_names=list(test_feat))
                expfi_df = format_as_dataframe(expfi)
                expfi_df.to_csv('/.../expfi_rf.csv')

        elif name == "XGBoost":
            expl = explain_prediction_xgboost(model, test_feat.iloc[0])
            expl_df = format_as_dataframe(expl)
            expl_df.to_csv('/.../expl_xgb.csv')

            expfi = explain_weights_xgboost(model,
                                            feature_names=list(test_feat))
            expfi_df = format_as_dataframe(expfi)
            expfi_df.to_csv('/.../expfi_xgb.csv')

        else:
            print("//")
Code Example #12
File: app.py Project: tu-artem/diabetes
def explain():
    data = request.get_json(force=True)
    # app.logger.info(data)
    df = pd.DataFrame(data, index=[0])
    data_array = transformer.transform(df)

    exp = explain_prediction(model,
                             data_array[0],
                             feature_names=all_feature_names,
                             top=(5, 5),
                             targets=[True])

    output = format_as_dataframe(exp).to_dict()
    return jsonify(output)
Code Example #13
File: explain.py Project: mabu-dev/gobbli
def st_lime_explanation(
    text: str,
    predict_func: Callable[[List[str]], np.ndarray],
    unique_labels: List[str],
    n_samples: int,
    position_dependent: bool = True,
):
    # TODO just use ELI5's built-in visualization when streamlit supports it:
    # https://github.com/streamlit/streamlit/issues/779
    with st.spinner("Generating LIME explanations..."):
        te = TextExplainer(
            random_state=1, n_samples=n_samples, position_dependent=position_dependent
        )
        te.fit(text, predict_func)
    st.json(te.metrics_)
    explanation = te.explain_prediction()
    explanation_df = eli5.format_as_dataframe(explanation)
    for target_ndx, target in enumerate(
        sorted(explanation.targets, key=lambda t: -t.proba)
    ):
        target_explanation_df = explanation_df[
            explanation_df["target"] == target_ndx
        ].copy()

        target_explanation_df["contribution"] = (
            target_explanation_df["weight"] * target_explanation_df["value"]
        )
        target_explanation_df["abs_contribution"] = abs(
            target_explanation_df["contribution"]
        )
        target_explanation_df = (
            target_explanation_df.drop("target", axis=1)
            .sort_values(by="abs_contribution", ascending=False)
            .reset_index(drop=True)
        )
        st.subheader(
            f"Target: {unique_labels[target_ndx]} (probability {target.proba:.4f}, score {target.score:.4f})"
        )
        st.dataframe(target_explanation_df)
Code Example #14
def main():
    training_labels, testing_labels = bring_in_labels()

    # bring in actual datasets, 18 data is given training labels
    # Team name removed to avoid error in ML fitting
    url18 = 'https://www.basketball-reference.com/leagues/NBA_2019.html'
    url19 = 'https://www.basketball-reference.com/leagues/NBA_2020.html'

    rstats_18 = data_prep.scrape_regular(url18)
    rstats_18 = rstats_18.merge(training_labels, how='left', on='Team')
    rstats_18 = rstats_18.loc[:, rstats_18.columns != 'Team']

    astats_18 = data_prep.scrape_advanced(url18)
    astats_18 = astats_18.merge(training_labels, how='left',  on='Team')
    astats_18 = astats_18.loc[:, astats_18.columns != 'Team']

    # 19 data is given testing labels
    rstats_19 = data_prep.scrape_regular(url19)
    rstats_19 = rstats_19.merge(testing_labels, how='left',  on='Team')
    rstats_19 = rstats_19.loc[:, rstats_19.columns != 'Team']

    astats_19 = data_prep.scrape_advanced(url19)
    astats_19 = astats_19.merge(testing_labels, how='left',  on='Team')
    astats_19 = astats_19.loc[:, astats_19.columns != 'Team']

    # Train both models on their 2018 data
    r_model = train_model(rstats_18)
    a_model = train_model(astats_18)

    # Test the r_models ability to predict
    rtest_labels = rstats_19['W/L%']
    rtest_features = rstats_19.loc[:, rstats_18.columns != 'W/L%']

    r_model_predictions = r_model.predict(rtest_features)
    r_model_mse = mean_squared_error(rtest_labels, r_model_predictions)

    # test the a_models ability to predict
    atest_labels = astats_19['W/L%']
    atest_features = astats_19.loc[:, astats_18.columns != 'W/L%']

    a_model_predictions = a_model.predict(atest_features)
    a_model_mse = mean_squared_error(atest_labels, a_model_predictions)

    # Stands for regular feature names
    rf_names = rstats_18.columns
    rf_names = list(rf_names[:len(rf_names) - 1])
    r_imp_features = eli5.format_as_dataframe(eli5.explain_weights(
                                              r_model,
                                              top=10,
                                              feature_names=rf_names))

    # Stands for advanced feature names
    af_names = astats_18.columns
    af_names = list(af_names[:len(af_names) - 1])
    a_imp_features = eli5.format_as_dataframe(eli5.explain_weights(
                                              a_model,
                                              top=10,
                                              feature_names=af_names))

    plot_MSE_diffs(r_model_mse, a_model_mse)
    plot_rif(r_imp_features)
    plot_aif(a_imp_features)
Code Example #15
File: train_model.py Project: mycarta/untapped-nrj
def main(input_file_path, output_file_path, tgt="Oil_norm", n_splits=5):
    input_file_name = os.path.join(input_file_path, "Train_final.pck")
    input_file_name_test = os.path.join(input_file_path, "Test_final.pck")
    input_file_name_val = os.path.join(input_file_path, "Validation_final.pck")

    output_file_name = os.path.join(output_file_path, f"models_lgbm_{tgt}.pck")

    df = pd.read_pickle(input_file_name).drop(exclude_cols, axis=1)
    df_test = pd.read_pickle(input_file_name_test)
    df_val = pd.read_pickle(input_file_name_val).drop(exclude_cols, axis=1)

    ids = df_test["EPAssetsId"]

    ids_uwi = df_test["UWI"]

    df_test = df_test.drop(exclude_cols, axis=1)

    cv = KFold(n_splits=n_splits, shuffle=False)
    models = []
    scores = []
    scores_dm = []
    y = df.loc[~df[tgt].isna(), tgt]
    X = df.loc[~df[tgt].isna(), :].drop(
        [
            "Oil_norm", "Gas_norm", "Water_norm", "EPAssetsId",
            "_Normalized`IP`BOE/d"
        ],
        axis=1,
    )
    X_test = df_test.copy().drop("EPAssetsId", axis=1)

    X_holdout, y_holdout = (
        df_val.loc[~df_val[tgt].isna(), :].drop(
            [
                "Oil_norm",
                "Gas_norm",
                "Water_norm",
                "EPAssetsId",
                "_Normalized`IP`BOE/d",
            ],
            axis=1,
        ),
        df_val.loc[~df_val[tgt].isna(), tgt],
    )

    preds_test = np.zeros((n_splits, df_test.shape[0]))
    preds_holdout = np.zeros((n_splits, X_holdout.shape[0]))

    for k, (train_index, test_index) in enumerate(cv.split(X, y)):
        X_train, X_val = X.iloc[train_index, :], X.iloc[test_index, :]

        # model = LGBMRegressor(num_leaves=16, learning_rate=0.1, n_estimators=300, reg_lambda=30, reg_alpha=30,
        # objective='mae',random_state=123)

        model = LogLGBM(
            num_leaves=16,
            learning_rate=0.05,
            n_estimators=900,
            reg_lambda=0,
            reg_alpha=0,
            objective="mae",
            random_state=123,
            feature_fraction=0.7,
        )
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        geom_mean = gmean(y_train)
        dm = DummyRegressor(strategy="constant", constant=geom_mean)

        model.fit(X_train, y_train, categorical_feature=CAT_COLUMNS)
        # model.fit(X_train, y_train)
        dm.fit(X_train, y_train)

        score = mean_absolute_error(y_holdout, model.predict(X_holdout))
        score_dm = mean_absolute_error(y_val, dm.predict(X_val))

        # logging.info(f' Score = {score}')
        models.append(model)
        scores.append(score)
        scores_dm.append((score_dm))
        logger.warning(f"Holdout score = {score}")
        preds_test[k, :] = model.predict(X_test).reshape(1, -1)
        preds_holdout[k, :] = model.predict(X_holdout).reshape(1, -1)

    with open(output_file_name, "wb") as f:
        pickle.dump(models, f)
    logger.info(scores)
    logger.info(f"Mean scores LGBM = {np.mean(scores)}")
    logger.info(f"Mean scores Dummy = {np.mean(scores_dm)}")

    preds_df = pd.DataFrame({
        "EPAssetsID": ids,
        "UWI": ids_uwi,
        tgt: preds_test.mean(axis=0)
    })
    preds_df_val = pd.DataFrame({
        tgt: preds_holdout.mean(axis=0),
        "gt": y_holdout
    })
    score_holdout = mean_absolute_error(preds_df_val["gt"], preds_df_val[tgt])
    logger.warning(f"Final score on holdout: {score_holdout}")
    print(eli5.format_as_dataframe(eli5.explain_weights(model)))

    return preds_df, score_holdout, preds_df_val
Code Example #16
def main(
    input_file_path,
    output_file_path,
    tgt="Oil_norm",
    interim_file_path=None,
    n_splits=7,
):
    input_file_name = os.path.join(input_file_path, "Train_final.pck")
    input_file_name_test = os.path.join(input_file_path, "Test_final.pck")
    input_file_name_val = os.path.join(input_file_path, "Validation_final.pck")
    exclude_cols = exclude_cols_dict.get(tgt)
    output_file_name = os.path.join(output_file_path, f"models_lgbm_{tgt}.pck")

    df = pd.read_pickle(input_file_name).drop(exclude_cols, axis=1)
    df_test = pd.read_pickle(input_file_name_test)
    df_val = pd.read_pickle(input_file_name_val).drop(exclude_cols, axis=1)
    df_all = pd.concat([df, df_val], axis=0)

    df_all[tgt] = df_all[tgt].fillna(value=0)

    ids = df_test["EPAssetsId"]

    ids_uwi = df_test["UWI"]

    df_test = df_test.drop(exclude_cols, axis=1)

    cv = KFold(n_splits=n_splits, shuffle=False)
    models = []
    scores = []
    scores_dm = []

    y = df_all.loc[~df_all[tgt].isna(), tgt]
    id_X = df_all.loc[~df_all[tgt].isna(), ["EPAssetsId"]]

    X = df_all.loc[~df_all[tgt].isna(), :].drop(
        ["Oil_norm", "Gas_norm", "Water_norm", "EPAssetsId", "_Normalized`IP`BOE/d"],
        axis=1,
    )
    X_test = df_test.copy().drop("EPAssetsId", axis=1)

    preds_test = np.zeros((n_splits, df_test.shape[0]))
    preds_holdout = []
    y_true = []
    id_list = []

    np.random.seed(123)

    best_params = pd.read_csv(
        os.path.join(output_file_path, f"LGBM_{tgt}_feats_final_Trials.csv")
    ).head(20)
    datasets = {}
    for k, (train_index, test_index) in enumerate(cv.split(X, y)):
        X_train, X_holdout = X.iloc[train_index, :], X.iloc[test_index, :]
        id_X_holdout = id_X.iloc[test_index]

        # model = LGBMRegressor(num_leaves=16, learning_rate=0.1, n_estimators=300, reg_lambda=30, reg_alpha=30,
        # objective='mae',random_state=123)

        params = best_params.iloc[0, :].to_dict()
        model = LogLGBM(
            learning_rate=0.05,
            n_estimators=3500,
            objective="mse",
            num_leaves=np.int(params["num_leaves"]),
            feature_fraction=params["feature_fraction"],
            min_data_in_leaf=np.int(params["min_data_in_leaf"]),
            bagging_fraction=params["bagging_fraction"],
            lambda_l1=params["lambda_l1"],
            lambda_l2=params["lambda_l2"],
            random_state=k,
        )
        y_train, y_holdout = y.iloc[train_index], y.iloc[test_index]
        geom_mean = gmean(y_train)
        dm = DummyRegressor(strategy="constant", constant=geom_mean)

        model.fit(
            X_train,
            y_train,
            categorical_feature=set(CAT_COLUMNS) - set(exclude_cols),
            eval_set=(X_holdout, y_holdout),
            early_stopping_rounds=150,
            verbose=200,
        )
        # model.fit(X_train, y_train)
        dm.fit(X_train, y_train)

        score = mean_absolute_error(y_holdout, model.predict(X_holdout))
        score_dm = mean_absolute_error(y_holdout, dm.predict(X_holdout))

        # logging.info(f' Score = {score}')
        models.append(model)
        scores.append(score)

        scores_dm.append((score_dm))
        logger.warning(f"Holdout score = {score}")
        preds_test[k, :] = model.predict(X_test)
        preds_holdout.append(model.predict(X_holdout).reshape(1, -1))
        y_true.append(y_holdout.values.reshape(1, -1))

        print(
            mean_absolute_error(
                y_holdout.values.reshape(1, -1), model.predict(X_holdout).reshape(1, -1)
            )
        )

    with open(output_file_name, "wb") as f:
        pickle.dump(models, f)
    logger.info(scores)
    logger.info(f"Mean scores LGBM = {np.mean(scores)}")
    logger.info(f"Mean scores Dummy = {np.mean(scores_dm)}")

    preds_df = pd.DataFrame(
        {"EPAssetsID": ids, "UWI": ids_uwi, tgt: preds_test.mean(axis=0)}
    )
    n_points = np.hstack(y_true).shape[0]

    preds_df_val = pd.DataFrame(
        {tgt: np.hstack(preds_holdout)[0, :], f"gt_{tgt}": np.hstack(y_true)[0, :]}
    )
    logger.warning(f"Final scores on holdout: {np.mean(scores)} +- {np.std(scores)}")
    logger.warning(
        f"Final scores on full holdout: {mean_absolute_error(preds_df_val[f'gt_{tgt}'], preds_df_val[tgt])}"
    )

    print(eli5.format_as_dataframe(eli5.explain_weights(model, top=60)))

    return preds_df, preds_df_val, np.mean(scores)
Code Example #17
def test_bad_list():
    with pytest.raises(ValueError):
        format_as_dataframe([1])
Code Example #18
import eli5
from eli5.sklearn import PermutationImportance
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#fetch best performing model
best_model = RF_gscv.best_estimator_
best_model2 = MLP_gscv.best_estimator_

#fit permutation importance on test data
perm = PermutationImportance(best_model).fit(test_img, test_lab)
perm2 = PermutationImportance(best_model2).fit(test_img, test_lab)

#show weights
wghts = eli5.format_as_dataframe(eli5.explain_weights(perm))
wghts2 = eli5.format_as_dataframe(eli5.explain_weights(perm2))

#write dataframes to csv
wghts.to_csv(
    'D:/studies/phd/WV3_Data_July2019/010039360030_01/L_Sabie_subset/rf_permImportance.csv',
    encoding='utf-8',
    index=False)
wghts2.to_csv(
    'D:/studies/phd/WV3_Data_July2019/010039360030_01/L_Sabie_subset/mlp_permImportance.csv',
    encoding='utf-8',
    index=False)

gLawn = mlp_map_prob[:, 3]
w = x_img_arr[:, -9]
plt.scatter(w, gLawn)
Code Example #19
def train(data, *regs,
          save_to=None, concat_features=False, explain=False):
    coords = utils.load_coords()
    concated_xs = np.concatenate(data['xs'], axis=1)
    all_rmse, all_patch_rmse, all_baselines = [], [], []
    regs_name = ', '.join(type(reg).__name__ for reg in regs)
    fitted_regs = []
    expl_by_cls = defaultdict(list)
    for cls in range(utils.N_CLASSES):
        ids = data['ids'][cls]
        scales = data['scales'][cls]
        ys = data['ys'][cls]
        xs = input_features(concated_xs if concat_features else data['xs'][cls])
        # indices = np.array(sorted(range(len(ids)), key=lambda i: (scales[i], ids[i])))
        # ids, xs, ys = ids[indices], xs[indices], ys[indices]
        pred, fitted = train_predict(regs, xs, ys, ids)
        ys_by_id, pred_by_id = [], []
        unique_ids = sorted(set(ids))
        pred_by_id = get_pred_by_id(ids, pred, unique_ids)
        for img_id in unique_ids:
            try:
                ys_by_id.append((coords.loc[[img_id]].cls == cls).sum())
            except KeyError:
                ys_by_id.append(0)
        pred_by_id = round_prediction(pred_by_id)
        patch_rmse = np.sqrt(metrics.mean_squared_error(ys, pred))
        rmse = np.sqrt(metrics.mean_squared_error(ys_by_id, pred_by_id))
        baseline_rmse = np.sqrt(metrics.mean_squared_error(
            cross_val_predict(DummyRegressor(), [[0]] * len(ys_by_id), ys_by_id, cv=5),
            ys_by_id))
        print('cls {}, patch mean {:.3f}, patch RMSE {:.3f}, '
              'image mean {:.2f}, image RMSE {:.2f}, baseline RMSE {:.2f}'
              .format(cls, np.mean(ys), patch_rmse,
                      np.mean(ys_by_id), rmse, baseline_rmse))
        all_rmse.append(rmse)
        all_patch_rmse.append(patch_rmse)
        all_baselines.append(baseline_rmse)
        if save_to:
            fitted_regs.append(fitted)
        if explain:
            for reg in fitted:
                expl = eli5.explain_weights(reg, feature_names=FEATURE_NAMES)
                expl_by_cls[cls].append(expl)
                print(type(reg).__name__, format_as_text(
                    expl, show=('method', 'targets', 'feature_importances')))
    print('{} with {} features: mean patch RMSE {:.3f}, mean image RMSE {:.2f}, '
          'mean baseline RMSE {:.2f}'
          .format(regs_name, ', '.join(FEATURE_NAMES),
                  np.mean(all_patch_rmse), np.mean(all_rmse),
                  np.mean(all_baselines)))
    if save_to:
        joblib.dump(fitted_regs, save_to)
        print('Saved to', save_to)

    if explain:
        dfs = []
        for cls, expls in expl_by_cls.items():
            for expl in expls:
                df = eli5.format_as_dataframe(expl)
                df['cls'] = cls
                df['estimator'] = expl.estimator.split('(')[0]
                dfs.append(df)
        df = pd.concat(dfs)
        df.reset_index(inplace=True)
        df['feature'] = df['index']
        del df['index']
        df = df[['feature', 'cls', 'estimator', 'std', 'weight']]
        df.to_csv('feature_importances.csv', index=None)
Code Example #20
def main_run_linear_models(train_ds,
                           val_ds,
                           test_ds,
                           data_props,
                           max_backlooking=None,
                           layer_type='dense',
                           activation_funcs=['sigmoid', 'relu', 'tanh'],
                           max_serach_iterations=200,
                           NN_max_depth=3,
                           MAX_EPOCHS=800,
                           patience=25,
                           model_name='linear',
                           examples=None,
                           return_permutation_importances=True,
                           redo_serach_best_model=False):
    mlflow.set_experiment(model_name)
    experiment_date_time = int(
        datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

    flatten_input = True if layer_type == 'dense' else False

    def _extract_just_important_data_props(data_props):
        kwargs = {}
        kwargs['dataset_cols_X_just_these'] = data_props['third_filter'][
            'cols_just_these']
        kwargs['dataset_cols_X_exclude'] = data_props['third_filter'][
            'cols_drop']
        kwargs['dataset_cols_y'] = data_props['third_filter'][
            'y_cols_just_these']
        kwargs['dataset_hash_input'] = int(data_props['first_step']['dataset'])
        kwargs['dataset_hash_first'] = data_props['first_step_data_hash']
        kwargs['dataset_hash_second'] = data_props['second_step_data_hash']
        kwargs['dataset_split_method'] = data_props['second_step'][
            'split_method']
        kwargs['dataset_split_steps_train'] = data_props['second_step'][
            'split_props']['train_time_steps']
        kwargs['dataset_split_steps_val'] = data_props['second_step'][
            'split_props']['val_time_steps']
        kwargs['dataset_split_steps_test'] = data_props['second_step'][
            'split_props']['test_time_steps']
        kwargs['dataset_iter_step'] = data_props['iter_step']
        kwargs['dataset_normalization'] = data_props['second_step'][
            'normalize_method']
        kwargs['dataset_window_backlooking'] = data_props['first_step'][
            'window_input_width']
        kwargs['dataset_window_prediction'] = data_props['first_step'][
            'window_pred_width']
        kwargs['dataset_window_shift'] = data_props['first_step'][
            'window_shift']
        return kwargs

    def _hp_tranform_param_dict(param_dict):
        new_param_dict = {}
        for key, value in param_dict.items():
            if type(value) == list:
                new_param_dict[key] = hp.choice(key, value)
            elif type(value) == set:
                new_param_dict[key] = hp.uniform(key, *value)
            else:
                new_param_dict[key] = value
        return new_param_dict

    max_backlooking = data_props['first_step'][
        'window_input_width'] if max_backlooking is None else max_backlooking

    param_grid = dict(
        n_layers=list(range(1, NN_max_depth + 1)),
        first_layer_nodes=[0] if NN_max_depth == 1 else [128, 64, 32, 16, 8],
        last_layer_nodes=[0] if NN_max_depth == 1 else [64, 32, 16, 8, 4],
        activation_func=activation_funcs,
        backlooking_window=list(range(1, max_backlooking + 1)))
    hp_param_dict = _hp_tranform_param_dict(param_dict=param_grid)
    hp_param_dict['model_name'] = model_name
    hp_param_dict['data_props'] = data_props
    hp_param_dict['layer_type'] = layer_type

    def _optimize_objective(*args, **kwargs):
        if args != ():
            kwargs = args[
                0]  # if positional arguments expect first to be dictionary with all kwargs
        if type(kwargs) != dict:
            raise Exception(
                f'kwargs is not a dict - it is {type(kwargs)} with values: {kwargs}'
            )

        backlooking_window = kwargs.pop('backlooking_window')
        n_layers = kwargs.pop('n_layers')
        first_layer_nodes = kwargs.pop('first_layer_nodes')
        last_layer_nodes = kwargs.pop('last_layer_nodes')
        activation_func = kwargs.pop('activation_func')
        return_everything = kwargs.pop('return_everything', False)
        verbose = kwargs.pop('verbose', 0)
        model_name = kwargs.pop('model_name', 'linear')
        data_props = kwargs.pop('data_props')
        layer_type = kwargs.pop('layer_type', 'dense')

        dataset = _get_prep_data(train_ds,
                                 val_ds,
                                 test_ds,
                                 flatten=flatten_input,
                                 keep_last_n_periods=backlooking_window)

        now = datetime.datetime.now()
        date_time = str(now.strftime("%y%m%d%H%M%S"))
        model_name = f"{date_time}_{model_name}_w{backlooking_window}_l{n_layers}_a{activation_func}"

        kwargs = dict(
            model_name=model_name,
            n_layers=n_layers,
            first_layer_nodes=first_layer_nodes,
            last_layer_nodes=last_layer_nodes,
            activation_func=activation_func,
            input_size=dataset['input_shape'] if layer_type == 'dense' else
            tuple(list(train_ds.element_spec[0].shape)[1:]),
            output_size=dataset['output_shape'],
            backlooking_window=backlooking_window,
            layer_type=layer_type)

        model = createmodel(**kwargs)
        history, mlflow_additional_params = compile_and_fit(
            model=model,
            train=dataset['train_ds'],
            val=dataset['val_ds'],
            MAX_EPOCHS=MAX_EPOCHS,
            patience=patience,
            model_name=model_name,
            verbose=verbose)

        # Get all data props for documentation in MLflow
        kwargs.update(_extract_just_important_data_props(data_props))
        kwargs['run'] = experiment_date_time
        mlflow_additional_params['kwargs'] = kwargs

        train_performance = dict(
            zip(model.metrics_names,
                evaluate_model(model=model, tf_data=dataset['train_ds'])))
        val_performance = dict(
            zip(model.metrics_names,
                evaluate_model(model=model, tf_data=dataset['val_ds'])))
        test_performance = dict(
            zip(
                model.metrics_names,
                evaluate_model(
                    model=model,
                    tf_data=dataset['test_ds'],
                    mlflow_additional_params=mlflow_additional_params)))
        mlflow_additional_params['data_props'] = data_props

        # Only save model if close to 15% best models
        try:
            best_loss = float(trials.best_trial['result']['loss'])
            current_loss = min(history.history['val_loss'])
            if current_loss <= best_loss * (1 + 0.15):
                save_model = True
            else:
                save_model = False
        except:
            save_model = True
        mlflow_saved = my_helpers.mlflow_last_run_add_param(
            param_dict=mlflow_additional_params, save_model=save_model)

        tf.keras.backend.clear_session()

        return_metrics = dict(loss=val_performance['loss'],
                              all_metrics={
                                  'train': train_performance,
                                  'val': val_performance,
                                  'test': test_performance
                              },
                              status=STATUS_OK,
                              mlflow=mlflow_saved,
                              model_name=model_name)

        if return_everything:
            return_metrics['model'] = model
            return_metrics['history'] = history

        return return_metrics

    ###### Get old best model records ######

    storage_file_path = os.path.join(
        my_helpers.get_project_directories(key='cache_dir'),
        'storage_best_model.json')
    if not os.path.exists(storage_file_path):
        best_model_storage = {}
    else:
        with open(storage_file_path) as json_file:
            best_model_storage = json.load(json_file)

    ######## Search for best model ########

    if redo_serach_best_model or model_name not in best_model_storage or data_props[
            'iter_step'] not in best_model_storage[model_name]:
        warnings.filterwarnings('ignore')
        trials = Trials()
        best = fmin(fn=_optimize_objective,
                    space=hp_param_dict,
                    algo=tpe.suggest,
                    max_evals=max_serach_iterations,
                    trials=trials,
                    early_stop_fn=no_progress_loss(iteration_stop_count=int(
                        max_serach_iterations / 4),
                                                   percent_increase=0.025))
        warnings.simplefilter('always')

        # getting all parameters for best model storage
        mlflow_best_model = trials.best_trial['result']['mlflow']
        best_params = {}
        for key, idx in best.items():
            best_params[key] = param_grid[key][idx]

        coef_names_ = list(
            data_props['look_ups']['out_lookup_col_name']['X'].keys())
        coef_names_ = coef_names_ + [
            col + f'_sft_{i}'
            for i in range(1, best_params['backlooking_window'])
            for col in coef_names_
        ]

        # Saving best model to storage
        if model_name not in best_model_storage:
            best_model_storage[model_name] = {}
        if data_props['iter_step'] not in best_model_storage[model_name]:
            best_model_storage[model_name][data_props['iter_step']] = {
                'best_model': {
                    'result': {
                        'loss': 10**10
                    }
                },
                'history': {}
            }

        best_model_param = dict(
            result={
                'loss': trials.best_trial['result']['loss'],
                'all_metrics': trials.best_trial['result']['all_metrics']
            },
            model_name=trials.best_trial['result']['model_name'],
            model_id=trials.best_trial['result']['mlflow']['model_id'],
            run_id=experiment_date_time,
            input_coefs=coef_names_,
            path_saved_model=trials.best_trial['result']['mlflow']
            ['saved_model_path'],
            status=trials.best_trial['result']['status'],
            params=best_params,
            data=_extract_just_important_data_props(data_props))

        best_model_storage[model_name][data_props['iter_step']]['history'][
            experiment_date_time] = best_model_param
        if trials.best_trial['result']['loss'] < best_model_storage[model_name][
                data_props['iter_step']]['best_model']['result']['loss']:
            best_model_storage[model_name][
                data_props['iter_step']]['best_model'] = best_model_param

        with open(storage_file_path, 'w') as outfile:
            json.dump(best_model_storage, outfile)

    else:
        # Get best model from storage
        best_model_param = best_model_storage[model_name][
            data_props['iter_step']]['best_model']

    ######## Get Best model again ########
    best_model = tf.keras.models.load_model(
        best_model_param['path_saved_model'])
    best_model.compile(loss=tf.losses.MeanAbsoluteError(),
                       optimizer=tf.optimizers.Adam(),
                       metrics=[
                           tf.metrics.MeanAbsoluteError(),
                           CustomMeanDirectionalAccuracy(),
                           tf.losses.Huber(),
                           tf.metrics.MeanAbsolutePercentageError(),
                           tf.metrics.MeanSquaredError(),
                           tf.metrics.MeanSquaredLogarithmicError()
                       ])
    print('Best model is:', best_model_param)

    out = dict(best_model_param)

    ####### Get examples for plotting #######
    if examples is not None:
        example_X = examples['X']
        periods = best_model_param['params']['backlooking_window']
        if layer_type == 'dense':
            example_X = tf.data.Dataset.from_tensors(
                np.reshape(example_X[:, -periods:, :],
                           (example_X.shape[0], -1)))
        else:
            example_X = tf.data.Dataset.from_tensors(example_X)
        out['examples_pred_y'] = best_model.predict(example_X)

    ###### For 1 layer dense/linear models get coef & p-values ######
    if NN_max_depth == 1 and isinstance(best_model.layers[0],
                                        tf.keras.layers.Dense):
        # Get coefs
        intercept_ = best_model.layers[0].bias.numpy()
        coef_ = best_model.layers[0].weights[0].numpy()
        out['coef_'] = pd.Series(
            dict(
                zip(['intercept_'] + best_model_param['input_coefs'],
                    intercept_.tolist() + coef_.squeeze().tolist())))

        dataset = _get_prep_data(train_ds,
                                 val_ds,
                                 test_ds,
                                 flatten=True,
                                 keep_last_n_periods=best_model_param['params']
                                 ['backlooking_window'])

        # get p-values
        import app.d_prediction.my_custom_pvalue_calc as my_p_lib

        out['p_values'] = {}
        for data_set in ['train', 'val', 'test']:
            y_pred = best_model.predict(dataset[f'{data_set}_X'])
            y_pred = np.reshape(y_pred, (-1, 1))
            try:
                p_values = my_p_lib.coef_pval(dataset[f'{data_set}_X'],
                                              dataset[f'{data_set}_y'], coef_,
                                              intercept_, y_pred)
                p_values = pd.Series(
                    dict(zip(best_model_param['input_coefs'], p_values)))
                out['p_values'][data_set] = p_values
            except:
                warnings.warn(
                    "P-Values: ValueError: Input contains infinity or nan.")
                out['p_values'][data_set] = pd.Series(
                    dict(
                        zip(best_model_param['input_coefs'],
                            ['error'] * len(best_model_param['input_coefs']))))
        out['p_values'] = pd.DataFrame(out['p_values'])

    ##### Get Column Feature Importance #####
    if return_permutation_importances:
        if 'feature_importance' in best_model_param:
            out['feature_importance'] = best_model_param['feature_importance']

        else:
            import eli5
            from eli5.sklearn import PermutationImportance

            sklearn_model = KerasRegressor(build_fn=best_model)
            sklearn_model.model = best_model

            dataset = _get_prep_data(
                train_ds,
                val_ds,
                test_ds,
                flatten=flatten_input,
                keep_last_n_periods=best_model_param['params']
                ['backlooking_window'])

            out['feature_importance'] = {}
            for data_set in ['train', 'val']:
                # Calculate actual feature importance
                try:
                    perm = PermutationImportance(
                        sklearn_model, cv='prefit').fit(
                            dataset[f'{data_set}_X'].numpy(),
                            np.reshape(dataset[f'{data_set}_y'].numpy(),
                                       (-1, 1)))
                    feature_importances = eli5.format_as_dataframe(
                        eli5.explain_weights(
                            perm,
                            feature_names=best_model_param['input_coefs'],
                            top=10**10))
                    out['feature_importance'][
                        data_set] = feature_importances.set_index(
                            'feature').to_dict()
                except:
                    warnings.warn(
                        "PermutationImportance: ValueError: Input contains infinity or a value too large for dtype('float16')."
                    )

            if out['feature_importance'] != {}:
                best_model_param['feature_importance'] = out[
                    'feature_importance']
                best_model_storage[model_name][
                    data_props['iter_step']]['best_model'][
                        'feature_importance'] = out['feature_importance']
                best_model_storage[model_name][
                    data_props['iter_step']]['history'][experiment_date_time][
                        'feature_importance'] = out['feature_importance']

                with open(storage_file_path, 'w') as outfile:
                    json.dump(best_model_storage, outfile)

    out['status'] = 'ok'
    return out
Code Example #21
    def process(self, inputs):

        max_features = self._campaign_configuration['FeatureSelection']['max_features']

    # setting parameters for XGBoost design space exploration
        xgboost_parameters = copy.deepcopy(self._campaign_configuration)

        xgboost_parameters['General']['techniques'] = ['XGBoost']

        xgboost_parameters['General']['run_num'] = 1

        local_root_directory = self._campaign_configuration['General']['output']
        for token in self._prefix:
            local_root_directory = os.path.join(local_root_directory, token)
        xgboost_parameters['General']['output'] = local_root_directory

        del xgboost_parameters['FeatureSelection']

        model_building_var = model_building.model_building.ModelBuilding(0)

        if 'XGBoost' not in xgboost_parameters:
            # default parameters if not provided in the ini file
            xgboost_parameters['XGBoost'] = {}
            xgboost_parameters['XGBoost']['min_child_weight'] = [1, 3]
            xgboost_parameters['XGBoost']['gamma'] = [0, 1]
            xgboost_parameters['XGBoost']['n_estimators'] = [50, 100, 150, 250]
            xgboost_parameters['XGBoost']['learning_rate'] = [0.01, 0.05, 0.1]
            xgboost_parameters['XGBoost']['max_depth'] = [1, 2, 3, 5, 9, 13]

        best_conf = model_building_var.process(xgboost_parameters, inputs, int(self._campaign_configuration['General']['j']))

    # best_conf is an XGBoost configuration experiment
        xgb_regressor = best_conf.get_regressor()

        # top = None means all
        expl = eli5.xgboost.explain_weights_xgboost(xgb_regressor, feature_names=inputs.x_columns, top=max_features, importance_type='gain')

        # text version
        expl_weights = eli5.format_as_text(expl)

        self._logger.debug("XGBoost feature scores:\n%s", str(expl_weights))

        df = eli5.format_as_dataframe(expl)  # data frame version

        xgb_sorted_features = df['feature'].values.tolist()  # features list

        features_sig = df['weight'].values.tolist()  # significance score weights

        cumulative_significance = 0

        tolerance = self._campaign_configuration['FeatureSelection']['XGBoost_tolerance']

        index = 0

        while cumulative_significance < tolerance and index < len(features_sig):
            cumulative_significance = cumulative_significance + features_sig[index]
            index = index + 1

        feat_res = xgb_sorted_features[0:index]

        self._logger.info("XGBoost selected features: %s", str(feat_res))

        data = inputs
        data.x_columns = feat_res

        return data
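The cumulative-significance loop above can be factored into a small standalone helper; a hedged sketch, assuming df is the frame returned by eli5.format_as_dataframe(expl) with 'feature' and 'weight' columns already sorted by importance:

import pandas as pd

def select_by_cumulative_weight(df: pd.DataFrame, tolerance: float) -> list:
    # Keep adding features until their cumulative weight reaches the tolerance.
    selected, cumulative = [], 0.0
    for feature, weight in zip(df['feature'], df['weight']):
        if cumulative >= tolerance:
            break
        selected.append(feature)
        cumulative += weight
    return selected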
Code Example #22
File: ml.py Project: lin-justin/cafe
def classify(features,
             labels,
             model='all',
             resample_method=None,
             scoring='roc_auc_ovo',
             cv=10,
             n_iter=10):
    '''
    A nested function to apply machine learning classification

    Args:
        features: A pandas dataframe containing the features
        labels: A pandas dataframe containing the labels
        model: Options are: 'rf' - Random Forest
                            'gbm' - Gradient Boosting
                            'dt' - Decision Tree
                            'et' - Extremely Randomized Tree 
                            'log_sgd' - Logistic Regression with Stochastic
                                        Gradient Descent learning
                            'all' - Tests out all five of the models and identifies
                                    which model is the best based on the cross-validation
                                    score
                Default is 'all'
        resample_method: Resampling to deal with imbalanced data
                         Reference: https://imbalanced-learn.readthedocs.io/en/stable/combine.html#bpm2004
                         Options are:
                                    'smote_tomek' - https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTETomek.html#imblearn.combine.SMOTETomek
                                    'smote_enn' - https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTEENN.html#imblearn.combine.SMOTEENN
                         Default is None
        scoring: The metric for evaluating model performance
                 Reference: https://scikit-learn.org/stable/modules/model_evaluation.html
                 Default is 'roc_auc_ovo'
        cv: The number of splits for cross-validation
            Default is 10
        n_iter: The number of parameter settings that are sampled
                Reference: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
                Default is 10

    Returns
            the tuned classifier
            a report containing the f1-score, precision, and recall for each class
            the Matthews Correlation Coefficient value
            the log loss (cross-entropy loss)
            a pandas dataframe of the features predicted to be of a certain class given its weight and value
            a confusion matrix figure
            a ROC curve figure
    '''

    # Make sure the features and labels are of type pandas dataframes
    if not isinstance(features, pd.DataFrame) and not isinstance(
            labels, pd.DataFrame):
        raise TypeError(
            'The features and labels are not of a pandas dataframe type.')

    # Make sure the number of rows are the same in features and labels
    assert features.shape[0] == labels.shape[0], 'Unequal number of rows.'

    # Get the names of the features
    feature_names = features.columns.values

    def standardize(X):
        '''
        Custom standardization function

        Args:
            X: the features as a numpy array

        Returns the standardized features
        '''
        return (X - np.mean(X)) / np.std(X)

    def which_model(X, y, model='all'):
        '''
        Using the baseline models (default parameters) of Random Forest, 
        Gradient Boosting, Decision Tree, Extremely Randomized Tree, and 
        Logistic Regression with Stochastic Gradient Descent learning on 
        the entire dataset (with cross-validation) to determine which model is best.
        
        The user can either test the 5 models individually or test all of them by
        setting model = 'all'
        
        Args:
            X: The numpy array containing the features
            y: The numpy array containing the labels
            model: Options are: 'rf' - Random Forest
                                'gbm' - Gradient Boosting
                                'dt' - Decision Tree
                                'et' - Extremely Randomized Tree 
                                'log_sgd' - Logistic Regression with Stochastic
                                            Gradient Descent learning
                                'all' - Tests out all five of the models and identifies
                                        which model is the best based on the cross-validation
                                        score
                    Default is 'all'
                
        Returns best model
        '''

        if model == 'all':
            # Pipelines help prevent data leakage
            pipelines = []
            pipelines.append(
                ('Random Forest',
                 skl_pipeline([
                     ('Standardization', FunctionTransformer(standardize)),
                     ('RF', RandomForestClassifier(random_state=9999))
                 ])))
            pipelines.append(
                ('Gradient Boosting',
                 skl_pipeline([
                     ('Standardization', FunctionTransformer(standardize)),
                     ('GBM', GradientBoostingClassifier(random_state=9999))
                 ])))
            pipelines.append(
                ('Decision Tree',
                 skl_pipeline([
                     ('Standardization', FunctionTransformer(standardize)),
                     ('DT', DecisionTreeClassifier(random_state=9999))
                 ])))
            pipelines.append(
                ('Extra Trees',
                 skl_pipeline([
                     ('Standardization', FunctionTransformer(standardize)),
                     ('ET', ExtraTreesClassifier(random_state=9999))
                 ])))
            pipelines.append(
                ('Logistic Regression (SGD)',
                 skl_pipeline([
                     ('Standardization', FunctionTransformer(standardize)),
                     ('LOGSGD', SGDClassifier(loss='log', random_state=9999))
                 ])))

            print('\nSelecting model...')
            print('\nModel\tScore')
            print('-------------')
            results = []
            names = []
            for name, model in pipelines:
                # Apply cross validation
                cv_results = cross_val_score(model,
                                             X,
                                             y,
                                             cv=cv,
                                             scoring=scoring)
                results.append(np.mean(cv_results))
                names.append(name)
                print('{}: {:.4f} ± {:.4f}'.format(name, np.mean(cv_results),
                                                   np.std(cv_results)))

            names_results = list(zip(names, results))
            # Pick the (name, score) pair with the highest mean CV score
            selected_model = max(names_results, key=lambda item: item[1])
            print('\nThe selected model is {} (score: {:.4f})'.format(
                *selected_model))

            if 'Gradient Boosting' in selected_model:
                return GradientBoostingClassifier()
            elif 'Random Forest' in selected_model:
                return RandomForestClassifier()
            elif 'Decision Tree' in selected_model:
                return DecisionTreeClassifier()
            elif 'Extra Trees' in selected_model:
                return ExtraTreesClassifier()
            elif 'Logistic Regression (SGD)' in selected_model:
                return SGDClassifier(loss='log')

        elif model == 'rf':
            rf_pipe = skl_pipeline([
                ('Standardization', FunctionTransformer(standardize)),
                ('RF', RandomForestClassifier(random_state=9999))
            ])
            rf_score = cross_val_score(rf_pipe, X, y, cv=cv, scoring=scoring)
            print('\nRandom Forest score: {:.4f} ± {:.4f}'.format(
                np.mean(rf_score), np.std(rf_score)))
            return RandomForestClassifier()

        elif model == 'gbm':
            gb_pipe = skl_pipeline([
                ('Standardization', FunctionTransformer(standardize)),
                ('GBM', GradientBoostingClassifier(random_state=9999))
            ])
            gb_score = cross_val_score(gb_pipe, X, y, cv=cv, scoring=scoring)
            print('\nGradient Boosting score: {:.4f} ± {:.4f}'.format(
                np.mean(gb_score), np.std(gb_score)))
            return GradientBoostingClassifier()

        elif model == 'dt':
            dt_pipe = skl_pipeline([
                ('Standardization', FunctionTransformer(standardize)),
                ('DT', DecisionTreeClassifier(random_state=9999))
            ])
            dt_score = cross_val_score(dt_pipe, X, y, cv=cv, scoring=scoring)
            print('\nDecision Tree score: {:.4f} ± {:.4f}'.format(
                np.mean(dt_score), np.std(dt_score)))
            return DecisionTreeClassifier()

        elif model == 'et':
            et_pipe = skl_pipeline([
                ('Standardization', FunctionTransformer(standardize)),
                ('ET', ExtraTreesClassifier(random_state=9999))
            ])
            et_score = cross_val_score(et_pipe, X, y, cv=cv, scoring=scoring)
            print('\nExtra Trees score: {:.4f} ± {:.4f}'.format(
                np.mean(et_score), np.std(et_score)))
            return ExtraTreesClassifier()

        elif model == 'log_sgd':
            log_pipe = skl_pipeline([
                ('Standardization', FunctionTransformer(standardize)),
                ('LOGSGD', SGDClassifier(loss='log', random_state=9999))
            ])
            log_score = cross_val_score(log_pipe, X, y, cv=cv, scoring=scoring)
            print('\nLogistic Regression (SGD) score: {:.4f} ± {:.4f}'.format(
                np.mean(log_score), np.std(log_score)))
            return SGDClassifier(loss='log')

    def train(selected_model, X_train, y_train, resample_method=None):
        '''
        Train and tune the hyperparameters of the selected model

        Random Search is used because the parameter search space is large;
        it typically finds parameters comparable to Grid Search at a
        fraction of the cost.

        Hyperparameter tuning is more art than science, as the search
        ranges below are based on expertise and experience.

        Args:
            selected_model: The model from which_model()
            X_train: The training features
            y_train: The training labels
            resample_method: Optional resampling applied inside the pipeline;
                             one of None (default), 'smote_tomek', or 'smote_enn'

        Returns the tuned classifier (a fitted RandomizedSearchCV)
        '''

        print('\nStarting training...')
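
        # Each branch below wraps standardization (plus optional resampling)
        # and the classifier in a pipeline and tunes it with RandomizedSearchCV;
        # cv, n_iter and scoring are presumably defined in the enclosing scope.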

        if resample_method is None:

            if selected_model.__class__.__name__ == 'GradientBoostingClassifier':

                start = time()

                pipe = skl_pipeline([('Standardization',
                                      FunctionTransformer(standardize)),
                                     ('clf', selected_model)])

                gb_grid = {
                    'clf__n_estimators': [
                        int(x)
                        for x in np.linspace(start=100, stop=1000, num=10)
                    ],
                    'clf__subsample': [0.7, 0.8],
                    'clf__learning_rate': [0.001, 0.01, 0.1],
                    'clf__max_depth':
                    [int(x) for x in np.linspace(10, 110, num=11)],
                    'clf__max_features': ['sqrt', 'log2'],
                    'clf__min_samples_split': [2, 5, 10],
                    'clf__min_samples_leaf': [1, 2, 4],
                    'clf__loss': ['deviance'],
                    'clf__random_state': [9999]
                }

                clf = RandomizedSearchCV(pipe,
                                         gb_grid,
                                         cv=cv,
                                         n_iter=n_iter,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         random_state=9999)

                clf.fit(X_train, y_train)

                end = time()

                time_elapsed = end - start

                print('\nThe best {}-fold cross-validation score is {:.4f}.'.
                      format(cv, clf.best_score_))
                print('The best parameters are:\n',
                      clf.best_estimator_.get_params()['clf'])
                print('Training took {:.0f}m {:.0f}s.'.format(
                    time_elapsed // 60, time_elapsed % 60))

                return clf

            elif selected_model.__class__.__name__ == 'RandomForestClassifier':

                start = time()

                pipe = skl_pipeline([('Standardization',
                                      FunctionTransformer(standardize)),
                                     ('clf', selected_model)])

                rf_grid = {
                    'clf__n_estimators': [
                        int(x)
                        for x in np.linspace(start=100, stop=1000, num=10)
                    ],
                    'clf__max_depth':
                    [int(x) for x in np.linspace(10, 110, num=11)],
                    'clf__max_features': ['sqrt', 'auto'],
                    'clf__min_samples_leaf': [1, 2, 4],
                    'clf__min_samples_split': [2, 5, 10],
                    'clf__bootstrap': [True],
                    'clf__class_weight': ['balanced', None],
                    'clf__random_state': [9999]
                }

                clf = RandomizedSearchCV(pipe,
                                         rf_grid,
                                         cv=cv,
                                         n_iter=n_iter,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         random_state=9999)

                clf.fit(X_train, y_train)

                end = time()

                time_elapsed = end - start

                print('\nThe best {}-fold cross-validation score is {:.4f}.'.
                      format(cv, clf.best_score_))
                print('The best parameters are:\n',
                      clf.best_estimator_.get_params()['clf'])
                print('Training took {:.0f}m {:.0f}s.'.format(
                    time_elapsed // 60, time_elapsed % 60))

                return clf

            elif selected_model.__class__.__name__ == 'DecisionTreeClassifier':

                start = time()

                pipe = skl_pipeline([('Standardization',
                                      FunctionTransformer(standardize)),
                                     ('clf', selected_model)])

                dt_grid = {
                    'clf__criterion': ['gini'],
                    'clf__splitter': ['best', 'random'],
                    'clf__max_depth':
                    [int(x) for x in np.linspace(10, 110, num=11)],
                    'clf__min_samples_leaf': [1, 2, 4],
                    'clf__max_features': ['sqrt', 'auto'],
                    'clf__min_samples_split': [2, 5, 10],
                    'clf__class_weight': ['balanced', None],
                    'clf__random_state': [9999]
                }

                clf = RandomizedSearchCV(pipe,
                                         dt_grid,
                                         cv=cv,
                                         n_iter=n_iter,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         random_state=9999)

                clf.fit(X_train, y_train)

                end = time()

                time_elapsed = end - start

                print('\nThe best {}-fold cross-validation score is {:.4f}.'.
                      format(cv, clf.best_score_))
                print('The best parameters are:\n',
                      clf.best_estimator_.get_params()['clf'])
                print('Training took {:.0f}m {:.0f}s.'.format(
                    time_elapsed // 60, time_elapsed % 60))

                return clf

            elif selected_model.__class__.__name__ == 'ExtraTreesClassifier':

                start = time()

                pipe = skl_pipeline([('Standardization',
                                      FunctionTransformer(standardize)),
                                     ('clf', selected_model)])

                et_grid = {
                    'clf__criterion': ['gini'],
                    'clf__n_estimators': [
                        int(x)
                        for x in np.linspace(start=100, stop=1000, num=10)
                    ],
                    'clf__bootstrap': [True, False],
                    'clf__max_depth':
                    [int(x) for x in np.linspace(10, 110, num=11)],
                    'clf__min_samples_leaf': [1, 2, 4],
                    'clf__max_features': ['sqrt', 'auto'],
                    'clf__min_samples_split': [2, 5, 10],
                    'clf__class_weight': ['balanced', None],
                    'clf__random_state': [9999]
                }

                clf = RandomizedSearchCV(pipe,
                                         et_grid,
                                         cv=cv,
                                         n_iter=n_iter,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         random_state=9999)

                clf.fit(X_train, y_train)

                end = time()

                time_elapsed = end - start

                print('\nThe best {}-fold cross-validation score is {:.4f}.'.
                      format(cv, clf.best_score_))
                print('The best parameters are:\n',
                      clf.best_estimator_.get_params()['clf'])
                print('Training took {:.0f}m {:.0f}s.'.format(
                    time_elapsed // 60, time_elapsed % 60))

                return clf

            elif 'log' in selected_model.get_params().values():

                start = time()

                pipe = skl_pipeline([('Standardization',
                                      FunctionTransformer(standardize)),
                                     ('clf', selected_model)])

                log_grid = {
                    'clf__loss': ['log'],
                    'clf__penalty': ['l2', 'l1', 'elasticnet'],
                    'clf__alpha': [0.01, 0.001, 0.0001],
                    'clf__max_iter': [1000, 5000],
                    'clf__class_weight': ['balanced', None],
                    'clf__random_state': [9999]
                }

                clf = RandomizedSearchCV(pipe,
                                         log_grid,
                                         cv=cv,
                                         n_iter=n_iter,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         random_state=9999)

                clf.fit(X_train, y_train)

                end = time()

                time_elapsed = end - start

                print('\nThe best {}-fold cross-validation score is {:.4f}.'.
                      format(cv, clf.best_score_))
                print('The best parameters are:\n',
                      clf.best_estimator_.get_params()['clf'])
                print('Training took {:.0f}m {:.0f}s.'.format(
                    time_elapsed // 60, time_elapsed % 60))

                return clf

        elif resample_method == 'smote_tomek':
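            # imblearn's Pipeline applies the SMOTETomek sampler only during
            # fitting, so inside RandomizedSearchCV each training fold is
            # resampled while its validation fold stays untouched.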

            if selected_model.__class__.__name__ == 'GradientBoostingClassifier':

                start = time()

                pipe = imbl_pipeline([('Standardization',
                                       FunctionTransformer(standardize)),
                                      ('SMOTETOMEK', SMOTETomek()),
                                      ('clf', selected_model)])

                gb_grid = {
                    'clf__n_estimators': [
                        int(x)
                        for x in np.linspace(start=100, stop=1000, num=10)
                    ],
                    'clf__subsample': [0.7, 0.8],
                    'clf__learning_rate': [0.001, 0.01, 0.1],
                    'clf__max_depth':
                    [int(x) for x in np.linspace(10, 110, num=11)],
                    'clf__max_features': ['sqrt', 'log2'],
                    'clf__min_samples_split': [2, 5, 10],
                    'clf__min_samples_leaf': [1, 2, 4],
                    'clf__loss': ['deviance'],
                    'clf__random_state': [9999],
                    'SMOTETOMEK__random_state': [9999]
                }

                clf = RandomizedSearchCV(pipe,
                                         gb_grid,
                                         cv=cv,
                                         n_iter=n_iter,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         random_state=9999)

                clf.fit(X_train, y_train)

                end = time()

                time_elapsed = end - start

                print('\nThe best {}-fold cross-validation score is {:.4f}.'.
                      format(cv, clf.best_score_))
                print('The best parameters are:\n',
                      clf.best_estimator_.get_params()['clf'])
                print('Training took {:.0f}m {:.0f}s.'.format(
                    time_elapsed // 60, time_elapsed % 60))

                return clf

            elif selected_model.__class__.__name__ == 'RandomForestClassifier':

                start = time()

                pipe = imbl_pipeline([('Standardization',
                                       FunctionTransformer(standardize)),
                                      ('SMOTETOMEK', SMOTETomek()),
                                      ('clf', selected_model)])

                rf_grid = {
                    'clf__n_estimators': [
                        int(x)
                        for x in np.linspace(start=100, stop=1000, num=10)
                    ],
                    'clf__max_depth':
                    [int(x) for x in np.linspace(10, 110, num=11)],
                    'clf__max_features': ['sqrt', 'auto'],
                    'clf__min_samples_leaf': [1, 2, 4],
                    'clf__min_samples_split': [2, 5, 10],
                    'clf__bootstrap': [True],
                    'clf__class_weight': ['balanced', None],
                    'clf__random_state': [9999],
                    'SMOTETOMEK__random_state': [9999]
                }

                clf = RandomizedSearchCV(pipe,
                                         rf_grid,
                                         cv=cv,
                                         n_iter=n_iter,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         random_state=9999)

                clf.fit(X_train, y_train)

                end = time()

                time_elapsed = end - start

                print('\nThe best {}-fold cross-validation score is {:.4f}.'.
                      format(cv, clf.best_score_))
                print('The best parameters are:\n',
                      clf.best_estimator_.get_params()['clf'])
                print('Training took {:.0f}m {:.0f}s.'.format(
                    time_elapsed // 60, time_elapsed % 60))

                return clf

            elif selected_model.__class__.__name__ == 'DecisionTreeClassifier':

                start = time()

                pipe = imbl_pipeline([('Standardization',
                                       FunctionTransformer(standardize)),
                                      ('SMOTETOMEK', SMOTETomek()),
                                      ('clf', selected_model)])

                dt_grid = {
                    'clf__criterion': ['gini'],
                    'clf__splitter': ['best', 'random'],
                    'clf__max_depth':
                    [int(x) for x in np.linspace(10, 110, num=11)],
                    'clf__min_samples_leaf': [1, 2, 4],
                    'clf__max_features': ['sqrt', 'auto'],
                    'clf__min_samples_split': [2, 5, 10],
                    'clf__class_weight': ['balanced', None],
                    'clf__random_state': [9999],
                    'SMOTETOMEK__random_state': [9999]
                }

                clf = RandomizedSearchCV(pipe,
                                         dt_grid,
                                         cv=cv,
                                         n_iter=n_iter,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         random_state=9999)

                clf.fit(X_train, y_train)

                end = time()

                time_elapsed = end - start

                print('\nThe best {}-fold cross-validation score is {:.4f}.'.
                      format(cv, clf.best_score_))
                print('The best parameters are:\n',
                      clf.best_estimator_.get_params()['clf'])
                print('Training took {:.0f}m {:.0f}s.'.format(
                    time_elapsed // 60, time_elapsed % 60))

                return clf

            elif selected_model.__class__.__name__ == 'ExtraTreesClassifier':

                start = time()

                pipe = imbl_pipeline([('Standardization',
                                       FunctionTransformer(standardize)),
                                      ('SMOTETOMEK', SMOTETomek()),
                                      ('clf', selected_model)])

                et_grid = {
                    'clf__criterion': ['gini'],
                    'clf__n_estimators': [
                        int(x)
                        for x in np.linspace(start=100, stop=1000, num=10)
                    ],
                    'clf__bootstrap': [True, False],
                    'clf__max_depth':
                    [int(x) for x in np.linspace(10, 110, num=11)],
                    'clf__min_samples_leaf': [1, 2, 4],
                    'clf__max_features': ['sqrt', 'auto'],
                    'clf__min_samples_split': [2, 5, 10],
                    'clf__class_weight': ['balanced', None],
                    'clf__random_state': [9999],
                    'SMOTETOMEK__random_state': [9999]
                }

                clf = RandomizedSearchCV(pipe,
                                         et_grid,
                                         cv=cv,
                                         n_iter=n_iter,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         random_state=9999)

                clf.fit(X_train, y_train)

                end = time()

                time_elapsed = end - start

                print('\nThe best {}-fold cross-validation score is {:.4f}.'.
                      format(cv, clf.best_score_))
                print('The best parameters are:\n',
                      clf.best_estimator_.get_params()['clf'])
                print('Training took {:.0f}m {:.0f}s.'.format(
                    time_elapsed // 60, time_elapsed % 60))

                return clf

            elif 'log' in selected_model.get_params().values():

                start = time()

                pipe = imbl_pipeline([('Standardization',
                                       FunctionTransformer(standardize)),
                                      ('SMOTETOMEK', SMOTETomek()),
                                      ('clf', selected_model)])

                log_grid = {
                    'clf__loss': ['log'],
                    'clf__penalty': ['l2', 'l1', 'elasticnet'],
                    'clf__alpha': [0.01, 0.001, 0.0001],
                    'clf__max_iter': [1000, 5000],
                    'clf__class_weight': ['balanced', None],
                    'clf__random_state': [9999],
                    'SMOTETOMEK__random_state': [9999]
                }

                clf = RandomizedSearchCV(pipe,
                                         log_grid,
                                         cv=cv,
                                         n_iter=n_iter,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         random_state=9999)

                clf.fit(X_train, y_train)

                end = time()

                time_elapsed = end - start

                print('\nThe best {}-fold cross-validation score is {:.4f}.'.
                      format(cv, clf.best_score_))
                print('The best parameters are:\n',
                      clf.best_estimator_.get_params()['clf'])
                print('Training took {:.0f}m {:.0f}s.'.format(
                    time_elapsed // 60, time_elapsed % 60))

                return clf

        elif resample_method == 'smote_enn':
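            # SMOTEENN combines SMOTE over-sampling with Edited Nearest
            # Neighbours cleaning; as above, it is applied only to the
            # training folds inside the pipeline.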

            if selected_model.__class__.__name__ == 'GradientBoostingClassifier':

                start = time()

                pipe = imbl_pipeline([('Standardization',
                                       FunctionTransformer(standardize)),
                                      ('SMOTENN', SMOTEENN()),
                                      ('clf', selected_model)])

                gb_grid = {
                    'clf__n_estimators': [
                        int(x)
                        for x in np.linspace(start=100, stop=1000, num=10)
                    ],
                    'clf__subsample': [0.7, 0.8],
                    'clf__learning_rate': [0.001, 0.01, 0.1],
                    'clf__max_depth':
                    [int(x) for x in np.linspace(10, 110, num=11)],
                    'clf__max_features': ['sqrt', 'log2'],
                    'clf__min_samples_split': [2, 5, 10],
                    'clf__min_samples_leaf': [1, 2, 4],
                    'clf__loss': ['deviance'],
                    'clf__random_state': [9999],
                    'SMOTENN__random_state': [9999]
                }

                clf = RandomizedSearchCV(pipe,
                                         gb_grid,
                                         cv=cv,
                                         n_iter=n_iter,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         random_state=9999)

                clf.fit(X_train, y_train)

                end = time()

                time_elapsed = end - start

                print('\nThe best {}-fold cross-validation score is {:.4f}.'.
                      format(cv, clf.best_score_))
                print('The best parameters are:\n',
                      clf.best_estimator_.get_params()['clf'])
                print('Training took {:.0f}m {:.0f}s.'.format(
                    time_elapsed // 60, time_elapsed % 60))

                return clf

            elif selected_model.__class__.__name__ == 'RandomForestClassifier':

                start = time()

                pipe = imbl_pipeline([('Standardization',
                                       FunctionTransformer(standardize)),
                                      ('SMOTENN', SMOTEENN()),
                                      ('clf', selected_model)])

                rf_grid = {
                    'clf__n_estimators': [
                        int(x)
                        for x in np.linspace(start=100, stop=1000, num=10)
                    ],
                    'clf__max_depth':
                    [int(x) for x in np.linspace(10, 110, num=11)],
                    'clf__max_features': ['sqrt', 'auto'],
                    'clf__min_samples_leaf': [1, 2, 4],
                    'clf__min_samples_split': [2, 5, 10],
                    'clf__bootstrap': [True],
                    'clf__class_weight': ['balanced', None],
                    'clf__random_state': [9999],
                    'SMOTENN__random_state': [9999]
                }

                clf = RandomizedSearchCV(pipe,
                                         rf_grid,
                                         cv=cv,
                                         n_iter=n_iter,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         random_state=9999)

                clf.fit(X_train, y_train)

                end = time()

                time_elapsed = end - start

                print('\nThe best {}-fold cross-validation score is {:.4f}.'.
                      format(cv, clf.best_score_))
                print('The best parameters are:\n',
                      clf.best_estimator_.get_params()['clf'])
                print('Training took {:.0f}m {:.0f}s.'.format(
                    time_elapsed // 60, time_elapsed % 60))

                return clf

            elif selected_model.__class__.__name__ == 'DecisionTreeClassifier':

                start = time()

                pipe = imbl_pipeline([('Standardization',
                                       FunctionTransformer(standardize)),
                                      ('SMOTENN', SMOTEENN()),
                                      ('clf', selected_model)])

                dt_grid = {
                    'clf__criterion': ['gini'],
                    'clf__splitter': ['best', 'random'],
                    'clf__max_depth':
                    [int(x) for x in np.linspace(10, 110, num=11)],
                    'clf__min_samples_leaf': [1, 2, 4],
                    'clf__max_features': ['sqrt', 'auto'],
                    'clf__min_samples_split': [2, 5, 10],
                    'clf__class_weight': ['balanced', None],
                    'clf__random_state': [9999],
                    'SMOTENN__random_state': [9999]
                }

                clf = RandomizedSearchCV(pipe,
                                         dt_grid,
                                         cv=cv,
                                         n_iter=n_iter,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         random_state=9999)

                clf.fit(X_train, y_train)

                end = time()

                time_elapsed = end - start

                print('\nThe best {}-fold cross-validation score is {:.4f}.'.
                      format(cv, clf.best_score_))
                print('The best parameters are:\n',
                      clf.best_estimator_.get_params()['clf'])
                print('Training took {:.0f}m {:.0f}s.'.format(
                    time_elapsed // 60, time_elapsed % 60))

                return clf

            elif selected_model.__class__.__name__ == 'ExtraTreesClassifier':

                start = time()

                pipe = imbl_pipeline([('Standardization',
                                       FunctionTransformer(standardize)),
                                      ('SMOTENN', SMOTEENN()),
                                      ('clf', selected_model)])

                et_grid = {
                    'clf__criterion': ['gini'],
                    'clf__n_estimators': [
                        int(x)
                        for x in np.linspace(start=100, stop=1000, num=10)
                    ],
                    'clf__bootstrap': [True, False],
                    'clf__max_depth':
                    [int(x) for x in np.linspace(10, 110, num=11)],
                    'clf__min_samples_leaf': [1, 2, 4],
                    'clf__max_features': ['sqrt', 'auto'],
                    'clf__min_samples_split': [2, 5, 10],
                    'clf__class_weight': ['balanced', None],
                    'clf__random_state': [9999],
                    'SMOTENN__random_state': [9999]
                }

                clf = RandomizedSearchCV(pipe,
                                         et_grid,
                                         cv=cv,
                                         n_iter=n_iter,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         random_state=9999)

                clf.fit(X_train, y_train)

                end = time()

                time_elapsed = end - start

                print('\nThe best {}-fold cross-validation score is {:.4f}.'.
                      format(cv, clf.best_score_))
                print('The best parameters are:\n',
                      clf.best_estimator_.get_params()['clf'])
                print('Training took {:.0f}m {:.0f}s.'.format(
                    time_elapsed // 60, time_elapsed % 60))

                return clf

            elif 'log' in selected_model.get_params().values():

                start = time()

                pipe = imbl_pipeline([('Standardization',
                                       FunctionTransformer(standardize)),
                                      ('SMOTENN', SMOTEENN()),
                                      ('clf', selected_model)])

                log_grid = {
                    'clf__loss': ['log'],
                    'clf__penalty': ['l2', 'l1', 'elasticnet'],
                    'clf__alpha': [0.01, 0.001, 0.0001],
                    'clf__max_iter': [1000, 5000],
                    'clf__class_weight': ['balanced', None],
                    'clf__random_state': [9999],
                    'SMOTENN__random_state': [9999]
                }

                clf = RandomizedSearchCV(pipe,
                                         log_grid,
                                         cv=cv,
                                         n_iter=n_iter,
                                         scoring=scoring,
                                         n_jobs=-1,
                                         random_state=9999)

                clf.fit(X_train, y_train)

                end = time()

                time_elapsed = end - start

                print('\nThe best {}-fold cross-validation score is {:.4f}.'.
                      format(cv, clf.best_score_))
                print('The best parameters are:\n',
                      clf.best_estimator_.get_params()['clf'])
                print('Training took {:.0f}m {:.0f}s.'.format(
                    time_elapsed // 60, time_elapsed % 60))

                return clf

    def evaluate(clf, X_test, y_test):
        '''
        Evaluate the tuned classifier's performance on the testing set

        Args:
            clf: The tuned classifier from train()
            X_test: The test data features
            y_test: The test data labels

        Returns
                a report containing the f1-score, precision, and recall of each class
                the Matthews Correlation Coefficient
                the log loss (cross-entropy loss)
                an eli5 explanation of a single prediction's feature contributions
                a confusion matrix figure
                a ROC curve figure
        '''
        def plot_confusion_matrix(y_test, y_pred):
            '''
            Confusion matrix

            Args:
                y_test: The test set labels
                y_pred: The predicted labels

            Returns confusion matrix figure
            '''
            cm = confusion_matrix(y_test, y_pred)
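            # Normalize each row so the entries are per-true-class fractions
            # (the diagonal then shows each class's recall).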
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

            df_cm = pd.DataFrame(cm,
                                 columns=np.unique(y_test),
                                 index=np.unique(y_test))
            df_cm.index.name = 'True Labels'
            df_cm.columns.name = 'Predicted Labels'

            cm_fig = sns.heatmap(df_cm, cmap='Blues', annot=True, cbar=False)
            for _, spine in cm_fig.spines.items():
                spine.set_visible(True)

            plt.title('{} Confusion Matrix'.format(model_name))
            plt.yticks(rotation=0)

            return cm_fig

        y_pred = clf.predict(X_test)
        report = classification_report(y_test, y_pred)

        mcc = matthews_corrcoef(y_test, y_pred)

        if 'log' in clf.best_estimator_.get_params().values():
            model_name = 'Logistic Regression (SGD)'
            y_probas = clf.best_estimator_['clf'].predict_proba(X_test)
        else:
            model_name = selected_model.__class__.__name__
            y_probas = clf.predict_proba(X_test)

        conf_mat = plot_confusion_matrix(y_test, y_pred)

        # Binary classification vs multi-class classification ROC curve
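        # (plot_roc is presumably scikit-plot's multi-class ROC helper, while
        # plot_roc_curve is scikit-learn's binary ROC display.)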
        if len(np.unique(y_test)) > 2:
            roc_curve = plot_roc(y_test,
                                 y_probas,
                                 title='{} ROC curve'.format(model_name))
        else:
            roc_curve = plot_roc_curve(clf, X_test, y_test, name=model_name)
            roc_curve.figure_.suptitle('{} ROC curve'.format(model_name))

        loss_score = log_loss(y_test, y_probas)

        # Explain a single test prediction (X_test[1]) in terms of the
        # per-feature contributions (weight * value) toward its predicted class
        if 'log' in selected_model.get_params().values():
            feat_imp = eli5.sklearn.explain_prediction_linear_classifier(
                clf.best_estimator_['clf'],
                X_test[1],
                feature_names=feature_names)
        else:
            feat_imp = eli5.sklearn.explain_prediction.explain_prediction_tree_classifier(
                clf.best_estimator_['clf'],
                X_test[1],
                feature_names=feature_names)

        return report, mcc, loss_score, feat_imp, conf_mat, roc_curve

    X = features.to_numpy()
    y = labels.to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        stratify=y,
                                                        test_size=0.2,
                                                        random_state=9999)

    selected_model = which_model(X, y, model=model)

    tuned_clf = train(selected_model,
                      X_train,
                      y_train,
                      resample_method=resample_method)

    report, mcc, loss_score, feat_imp, conf_mat, roc_curve = evaluate(
        tuned_clf, X_test, y_test)

    feat_imp = eli5.format_as_dataframe(feat_imp)

    return tuned_clf, report, mcc, loss_score, feat_imp, conf_mat, roc_curve
Code example #23
0
def main(input_file_path,
         output_file_path,
         tgt="Oil_norm",
         interim_file_path=None,
         n_splits=11):

    condition = condition_dict[tgt]

    input_file_name = os.path.join(input_file_path, "Train_final.pck")
    input_file_name_test = os.path.join(input_file_path, "Test_final.pck")
    input_file_name_val = os.path.join(input_file_path, "Validation_final.pck")
    exclude_cols = exclude_cols_dict.get(tgt)
    output_file_name = os.path.join(output_file_path, f"models_lgbm_{tgt}.pck")

    df = pd.read_pickle(input_file_name).drop(exclude_cols, axis=1)
    df_test = pd.read_pickle(input_file_name_test)
    df_val = pd.read_pickle(input_file_name_val).drop(exclude_cols, axis=1)
    df_all = pd.concat([df, df_val], axis=0)

    ids = df_test["EPAssetsId"].copy()

    ids_uwi = df_test["UWI"].copy()

    df_test = df_test.drop(exclude_cols, axis=1)

    cv = KFold(n_splits=n_splits, shuffle=False)
    models = []
    scores = []
    scores_dm = []

    y = df_all.loc[~df_all[tgt].isna(), tgt]
    id_X = df_all.loc[~df_all[tgt].isna(), ["EPAssetsId"]]

    X = df_all.loc[~df_all[tgt].isna(), :].drop(
        [
            "Oil_norm", "Gas_norm", "Water_norm", "EPAssetsId",
            "_Normalized`IP`BOE/d"
        ],
        axis=1,
    )
    # Filter large vals
    # condition = y < threshold_dict[tgt]
    # X = X.loc[condition,:]
    # y = y.loc[condition]

    X_test = df_test.copy().drop("EPAssetsId", axis=1)

    preds_test = np.zeros((n_splits, df_test.shape[0]))
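    # One row of test-set predictions per fold; these are combined later via
    # the project's mean_log helper (presumably a log-space average).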
    preds_holdout = []
    y_true = []
    id_list = []

    np.random.seed(123)

    best_params = pd.read_csv(
        os.path.join(output_file_path,
                     f'LGBM_{tgt}_feats_final_Trials.csv')).head(20)
    datasets = {}
    for k, (train_index, test_index) in enumerate(cv.split(X, y)):
        X_train, X_holdout = X.iloc[train_index, :], X.iloc[test_index, :]
        print(X_train.shape)
        id_X_holdout = id_X.iloc[test_index]

        # model = LGBMRegressor(num_leaves=16, learning_rate=0.1, n_estimators=300, reg_lambda=30, reg_alpha=30,
        # objective='mae',random_state=123)
        if tgt == 'Oil_norm':
            params = best_params.iloc[k, :].to_dict()
        else:
            params = best_params.iloc[0, :].to_dict()

        model = LogRF(target=tgt, max_depth=17, n_estimators=200)
        y_train, y_holdout = y.iloc[train_index], y.iloc[test_index]
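        # Keep only training rows whose target lies inside the allowed
        # (lower, upper) range for this target; the holdout fold is untouched.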
        idx = (y_train > condition[0]) & (y_train < condition[1])
        X_train, y_train = X_train.loc[idx, :], y_train.loc[idx]
        # Calculate a fill value:
        #target_log_mean = np.median(np.log(y_train[y_train > 0]))
        #target_fill_val = np.exp(target_log_mean)
        target_fill_val = 0
        y_train = y_train.fillna(value=target_fill_val)
        logging.info(f'Filling {tgt} with {target_fill_val}')

        y_holdout = y_holdout.fillna(value=0)
        geom_mean = gmean(y_train)
        dm = DummyRegressor(strategy="constant", constant=geom_mean)
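        # Baseline: a DummyRegressor that always predicts the geometric mean
        # of the training target, as a reference point for the MAE scores.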
        X_train = X_train.fillna(-999)
        X_holdout = X_holdout.fillna(-999)
        X_test = X_test.fillna(-999)

        model.fit(X_train, y_train)
        # model.fit(X_train, y_train)
        dm.fit(X_train, y_train)
        y_pred_holdout = model.predict(X_holdout)
        score = mean_absolute_error(y_holdout, y_pred_holdout)
        score_dm = mean_absolute_error(y_holdout, dm.predict(X_holdout))

        # logging.info(f' Score = {score}')
        models.append(model)
        scores.append(score)

        scores_dm.append(score_dm)
        logger.warning(f"Holdout score = {score}")
        preds_test[k, :] = model.predict(X_test)
        preds_holdout.append(y_pred_holdout.reshape(1, -1))
        y_true.append(y_holdout.values.reshape(1, -1))
        id_list.append(id_X_holdout.values.reshape(1, -1))

        print(
            mean_absolute_error(y_holdout.values.reshape(1, -1),
                                y_pred_holdout.reshape(1, -1)))

    with open(output_file_name, "wb") as f:
        pickle.dump(models, f)
    logger.info(scores)
    logger.info(f"Mean scores LGBM = {np.mean(scores)}")
    logger.info(f"Mean scores Dummy = {np.mean(scores_dm)}")

    # preds_df = pd.DataFrame(
    #     {"EPAssetsID": ids, "UWI": ids_uwi, tgt: preds_test.mean(axis=0)}
    # )
    preds_df = pd.DataFrame({
        "EPAssetsID": ids,
        "UWI": ids_uwi,
        tgt: mean_log(preds_test)
    })
    n_points = np.hstack(y_true).shape[0]
    preds_df_val = pd.DataFrame({
        tgt: np.hstack(preds_holdout)[0, :],
        f"gt_{tgt}": np.hstack(y_true)[0, :],
        'EPAssetsId': np.hstack(id_list)[0, :]
    })
    logger.warning(
        f"Final scores on holdout: {np.mean(scores)} +- {np.std(scores)}")
    logger.warning(
        f"Final scores on full holdout: {mean_absolute_error(preds_df_val[f'gt_{tgt}'], preds_df_val[tgt])}"
    )
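
    # Note: explain_weights is applied to `model`, i.e. the estimator fitted
    # in the last CV fold only, not an ensemble of all folds.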

    print(eli5.format_as_dataframe(eli5.explain_weights(model, top=60)))

    return preds_df, preds_df_val, np.mean(scores)