Ejemplo n.º 1
0
def run_cochran_q_test(y_test, *model_predictions, output_name):
    """
    Runs Cochran's Q test to determine if there is a statistically significant difference in more than two models' class
    predictions. Any number of prediction sets (three or more) is supported. The chi2 statistic and p-value are written
    to modeling/comparison_files/<output_name>_cochrans_q_test.csv.

    :param y_test: y_test series (its ``.values`` array is passed to the test)
    :param model_predictions: three or more series of model predictions (``.values`` is read from each)
    :param output_name: keyword-only name to prepend to the file name to identify models used in the test
    :raises ValueError: if fewer than three sets of predictions are supplied
    """
    n_models = len(model_predictions)
    if n_models < 3:
        # Validate before touching any input so the caller gets an accurate
        # message instead of a misleading "more than five" complaint.
        raise ValueError(
            'function requires at least three sets of predictions, '
            f'got {n_models}')

    # cochrans_q takes the predictions as variadic positional arguments, so
    # unpacking handles every model count without a per-count branch.
    chi2, p = cochrans_q(
        y_test.values,
        *(predictions.values for predictions in model_predictions))

    pd.DataFrame({
        'chi2': [chi2],
        'p': [p]
    }).to_csv(
        os.path.join('modeling', 'comparison_files',
                     f'{output_name}_cochrans_q_test.csv'))
Ejemplo n.º 2
0
def test_compare_to_mcnemar_on_2_models():
    """Check that Cochran's Q on two models agrees with uncorrected McNemar.

    The target is 100 zeros; each model's predictions are zeros except for a
    handful of positions set to one (i.e. positions where it differs from the
    target). The statistic and p-value from ``cochrans_q`` are compared with
    those from ``mcnemar`` (``corrected=False``, ``exact=False``) using a
    floating-point tolerance rather than exact equality, because the two
    values are computed by separate routines.
    """
    n_samples = 100
    y_true = np.zeros(n_samples, dtype=int)

    # Model 1 predicts 1 for the first 16 samples.
    ym1 = np.zeros(n_samples, dtype=int)
    ym1[:16] = 1

    # Model 2 predicts 1 for samples 0-5 and 20-21.
    ym2 = np.zeros(n_samples, dtype=int)
    ym2[:6] = 1
    ym2[20:22] = 1

    q, p = cochrans_q(y_true, ym1, ym2)

    mcn_q, mcn_p = mcnemar(mcnemar_table(y_true, ym1, ym2),
                           corrected=False,
                           exact=False)

    np.testing.assert_allclose(q, mcn_q)
    np.testing.assert_allclose(p, mcn_p)
Ejemplo n.º 3
0
def test_on_dataset():
    """Pin Cochran's Q and its p-value for three models on a fixed dataset.

    The target is 100 zeros; every model predicts zero everywhere except at
    the listed positions, where it predicts one.
    """
    n_samples = 100
    y_true = np.zeros(n_samples, dtype=int)

    def predictions_with_ones_at(positions):
        # Integer label vector of zeros with ones at the given indices.
        labels = np.zeros(n_samples, dtype=int)
        labels[positions] = 1
        return labels

    ym1 = predictions_with_ones_at(list(range(16)))
    ym2 = predictions_with_ones_at([0, 1, 2, 3, 4, 5, 20, 21])
    ym3 = predictions_with_ones_at([0, 1, 2, 6, 20, 21, 98, 99])

    statistic, p_value = cochrans_q(y_true, ym1, ym2, ym3)

    assert round(statistic, 3) == 7.529
    assert round(p_value, 3) == 0.023
Ejemplo n.º 4
0
def test_compare_to_mcnemar_on_2_models():
    """Verify that two-model Cochran's Q matches uncorrected McNemar's test.

    Uses a target of 100 zeros and two prediction vectors that are zero
    except at a few positions. The statistic and p-value returned by
    ``cochrans_q`` are checked against ``mcnemar`` (``corrected=False``,
    ``exact=False``) within a floating-point tolerance, since exact ``==``
    on floats produced by two different routines is fragile.
    """
    size = 100
    y_true = np.zeros(size, dtype=int)

    # First model: ones at positions 0..15.
    ym1 = np.zeros(size, dtype=int)
    ym1[:16] = 1

    # Second model: ones at positions 0..5 and 20..21.
    ym2 = np.zeros(size, dtype=int)
    ym2[:6] = 1
    ym2[20:22] = 1

    q, p = cochrans_q(y_true, ym1, ym2)

    mcn_q, mcn_p = mcnemar(mcnemar_table(y_true, ym1, ym2),
                           corrected=False,
                           exact=False)

    np.testing.assert_allclose(q, mcn_q)
    np.testing.assert_allclose(p, mcn_p)
Ejemplo n.º 5
0
def test_on_dataset():
    """Regression check of Cochran's Q for three models on fixed inputs.

    All vectors have 100 integer entries. The target is all zeros and each
    model vector is zero except at the indices listed below.
    """
    size = 100
    ones_by_model = (
        list(range(16)),                   # ym1
        [0, 1, 2, 3, 4, 5, 20, 21],        # ym2
        [0, 1, 2, 6, 20, 21, 98, 99],      # ym3
    )

    y_true = np.zeros(size, dtype=int)
    model_outputs = []
    for one_positions in ones_by_model:
        vector = np.zeros(size, dtype=int)
        vector[one_positions] = 1
        model_outputs.append(vector)
    ym1, ym2, ym3 = model_outputs

    q, p_value = cochrans_q(y_true, ym1, ym2, ym3)

    assert round(q, 3) == 7.529
    assert round(p_value, 3) == 0.023
Ejemplo n.º 6
0
def summarize_feature_comparisons(
        base_clf: BaseEstimator, comparison_clfs: Dict[str, BaseEstimator], X_test, y_test
):
    """Compare a base classifier's predictions with several alternatives.

    Runs Cochran's Q test across the predictions of the base classifier and
    every comparison classifier, then an exact McNemar test of the base
    classifier against each comparison classifier individually.

    :param base_clf: fitted classifier used as the reference model
    :param comparison_clfs: mapping of display name to fitted classifier
    :param X_test: test features passed to each classifier's ``predict``
    :param y_test: true labels for ``X_test``
    :return: ordered dict with keys ``cochrans_q``, ``cochrans_q_pval`` and,
        per comparison classifier, ``mcnemar_base vs <name>_chi2stat`` and
        ``mcnemar_base vs <name>_pval``
    """
    from mlxtend.evaluate import mcnemar, cochrans_q, mcnemar_table

    summary_dict = collections.OrderedDict()
    mcnemar_tbs = dict()

    # Collect predictions, starting with the base classifier.
    base_y_predict = base_clf.predict(X_test)
    y_predictions = [base_y_predict]
    for name, clf in comparison_clfs.items():
        y_predict = clf.predict(X_test)

        # Contingency table against the base classifier. mcnemar_table returns
        # an array that mcnemar consumes directly, so it is stored as-is.
        mcnemar_tbs[f"base vs {name}"] = mcnemar_table(
            y_test, base_y_predict, y_predict
        )

        # Keep the predictions for the joint Cochran's Q test below.
        y_predictions.append(y_predict)

    # Omnibus test over all classifiers at once.
    qstat, pval = cochrans_q(y_test, *y_predictions)
    summary_dict["cochrans_q"] = qstat
    summary_dict["cochrans_q_pval"] = pval

    # Pairwise exact McNemar tests of the base classifier vs. each comparison.
    for name, table in mcnemar_tbs.items():
        chi2stat, pval = mcnemar(table, exact=True)
        summary_dict[f"mcnemar_{name}_chi2stat"] = chi2stat
        summary_dict[f"mcnemar_{name}_pval"] = pval

    return summary_dict
Ejemplo n.º 7
0
# Grid-search the logistic regression hyper-parameters with cross-validation
model2 = GridSearchCV(lr, parameters_lr, cv=cv)
model2.fit(X_train, y_train)  # fit on the training split


# In[16]:


# Grid-search the MLP classifier hyper-parameters with cross-validation
model3 = GridSearchCV(mlp, parameters_mlp, cv=cv)
model3.fit(X_train, y_train)  # fit on the training split


# In[17]:


# Predict the test split with each model, in order model1, model2, model3
y_model1, y_model2, y_model3 = (
    fitted_model.predict(X_test)
    for fitted_model in (model1, model2, model3)
)
y_test = np.array(y_test)


# In[18]:


# Cochran's Q test across the three models' test-set predictions
q, p_value = cochrans_q(y_test, y_model1, y_model2, y_model3)

print('Q: %.3f' % (q,))
print('p-value: %.3f' % (p_value,))

Ejemplo n.º 8
0
    result = f_oneway(df_result['uni_R'].to_numpy(),
                      df_result['bi_R'].to_numpy(),
                      df_result['unibi_R'].to_numpy())
    print("ANNOVA R : %0.5f, %0.5f" % result)

    result = f_oneway(df_result['uni_F1'].to_numpy(),
                      df_result['bi_F1'].to_numpy(),
                      df_result['unibi_F1'].to_numpy())
    print("ANNOVA F1 : %0.5f, %0.5f" % result)

    # Cochran's Q analysis
    y_uni = sr_uni.to_numpy()
    y_bi = sr_bi.to_numpy()
    y_unibi = sr_unibi.to_numpy()
    q, p_value = cochrans_q(y, y_uni, y_bi, y_unibi)
    print("COHRAN Q-Test: q: %0.5f, p_value: %0.5f" % (q, p_value))

    l_grams = ['uni', 'bi', 'unibi']
    l_rslt = [y_uni, y_bi, y_unibi]
    l_pair = list(zip(l_grams, l_rslt))

    l_mcnemar_rslt = []
    for i, t0 in enumerate(l_pair):
        for j, t1 in enumerate(l_pair[i + 1:]):
            k0 = t0[0]
            k1 = t1[0]
            v0 = t0[1]
            v1 = t1[1]

            tb = mcnemar_table(y_target=y, y_model1=v0, y_model2=v1)
Ejemplo n.º 9
0
    print("ANNOVA R : %0.5f, %0.5f" % result)

    result = f_oneway(
        df_result['tc_lower_F1'].to_numpy(),
        df_result['tc_swrem_F1'].to_numpy(),
        df_result['tc_stem_F1'].to_numpy(),
        df_result['tc_swrem_stem_F1'].to_numpy(),
    )
    print("ANNOVA F1 : %0.5f, %0.5f" % result)

    # Cochran's Q test
    y_tc_lower = sr_tc_lower.to_numpy()
    y_tc_swrem = sr_tc_swrem.to_numpy()
    y_tc_stem = sr_tc_stem.to_numpy()
    y_tc_swrem_stem = sr_tc_swrem_stem.to_numpy()
    q, p_value = cochrans_q(y, y_tc_lower, y_tc_swrem, y_tc_stem,
                            y_tc_swrem_stem)
    print("COHRAN Q-Test: q: %0.5f, p_value: %0.5f" % (q, p_value))

    l_repr = ['tc_lower', 'tc_swrem', 'tc_stem', 'tc_swrem_stem']
    l_rslt = [y_tc_lower, y_tc_swrem, y_tc_stem, y_tc_swrem_stem]
    l_pair = list(zip(l_repr, l_rslt))

    l_mcnemar_rslt = []
    for i, t0 in enumerate(l_pair):
        for j, t1 in enumerate(l_pair[i + 1:]):
            k0 = t0[0]
            k1 = t1[0]
            v0 = t0[1]
            v1 = t1[1]

            tb = mcnemar_table(y_target=y, y_model1=v0, y_model2=v1)
Ejemplo n.º 10
0
    print("ANNOVA R : %0.5f, %0.5f" % result)

    result = f_oneway(df_result['multi_nb_F1'].to_numpy(),
                      df_result['svc_F1'].to_numpy(),
                      df_result['lsvc_F1'].to_numpy(),
                      df_result['rf_F1'].to_numpy(),
                      df_result['lr_F1'].to_numpy(),
                      df_result['ada_F1'].to_numpy())
    print("ANNOVA F1 : %0.5f, %0.5f" % result)

    # Cochran's Q analysis
    q, p_value = cochrans_q(
        y,
        sr_multi_nb.to_numpy(),
        sr_svc.to_numpy(),
        sr_lsvc.to_numpy(),
        sr_rf.to_numpy(),
        sr_lr.to_numpy(),
        sr_ada.to_numpy(),
    )
    print("COHRAN Q-Test: q: %0.5f, p_value: %0.5f" % (q, p_value))

    f_out = basename(f_in)
    df_result.to_excel(f_in.replace('.json', '_RESULT.xlsx'))

    l_algo = ['multi_nb', 'svc', 'lsvc', 'rf', 'lr', 'ada']

    l_rslt = [
        sr_multi_nb.to_numpy(),
        sr_svc.to_numpy(),
        sr_lsvc.to_numpy(),
Ejemplo n.º 11
0
    print("ANNOVA R : %0.5f, %0.5f" % result)
    str_annova_f += "%0.5f " % (result[0])
    str_annova_p += "%0.5f " % (result[1])

    result = f_oneway(df_result['t_lower_F1'].to_numpy(),
                      df_result['c_lower_F1'].to_numpy(),
                      df_result['tc_lower_F1'].to_numpy())
    print("ANNOVA F1 : %0.5f, %0.5f" % result)
    str_annova_f += "%0.5f " % (result[0])
    str_annova_p += "%0.5f " % (result[1])

    # Cochran's Q test
    y_tc_lower = sr_tc_lower.to_numpy()
    y_t_lower = sr_t_lower.to_numpy()
    y_c_lower = sr_c_lower.to_numpy()
    q, p_value = cochrans_q(y, y_tc_lower, y_t_lower, y_c_lower)
    print("COHRAN Q-Test: q: %0.5f, p_value: %0.5f" % (q, p_value))

    l_repr = ['t_lower', 'c_lower', 'tc_lower']
    l_rslt = [y_t_lower, y_c_lower, y_tc_lower]
    l_pair = list(zip(l_repr, l_rslt))

    str_result = "MCNEMAR RESULT: "

    for i, t0 in enumerate(l_pair):
        for j, t1 in enumerate(l_pair[i + 1:]):
            k0 = t0[0]
            k1 = t1[0]
            v0 = t0[1]
            v1 = t1[1]