def calc_results_for_ensamble(X, y, train_index, test_index, nrun, clf):
    """Score a majority-vote ensemble of ``nrun`` refits of ``clf`` on one fold.

    The classifier is refit ``nrun`` times, each time on a freshly balanced
    copy of the training fold, and every refit predicts the same balanced
    test fold. The final prediction for a test sample is the label most
    runs voted for.

    Args:
        X: Feature array, indexable by ``train_index`` / ``test_index``.
        y: Label array aligned with ``X``; recall is computed for labels
            0 and 1.
        train_index: Indices of the training samples of this fold.
        test_index: Indices of the test samples of this fold.
        nrun: Number of ensemble members (refits); must be at least 1.
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        Tuple ``(acc, recall_0, recall_1)`` of the voted predictions against
        the balanced test labels.

    Raises:
        ValueError: If ``nrun`` is smaller than 1.
    """
    if nrun < 1:
        raise ValueError("nrun must be at least 1")

    X_train_fold, y_train_fold = X[train_index], y[train_index]

    # Balance the test fold exactly once. Presumably random_upsample_balance
    # draws random duplicates (TODO confirm): re-balancing inside the loop
    # would give every run a different test set, so the per-position vote
    # below would mix unrelated samples and only the last run's y_test
    # would be scored.
    X_test, y_test = random_upsample_balance(X[test_index], y[test_index])

    all_rez = []
    for _ in range(nrun):
        # A fresh balancing of the training fold per run is what makes the
        # ensemble members differ from one another.
        X_train, y_train = random_upsample_balance(X_train_fold, y_train_fold)
        clf.fit(X_train, y_train)
        all_rez.append(clf.predict(X_test))

    # After transposing, each row holds every run's vote for one test sample.
    y_pred = np.array([
        Counter(votes).most_common(1)[0][0]
        for votes in np.array(all_rez).transpose()
    ])

    acc = np.mean(y_test == y_pred)
    recall_0 = recall_score(y_test, y_pred, pos_label=0)
    recall_1 = recall_score(y_test, y_pred, pos_label=1)

    return acc, recall_0, recall_1
Example #2
0
def calc_results_onlystudy(X, y, train_index, test_index, clf):
    """Evaluate ``clf`` on one fold using only this study's own samples.

    Both the training part and the test part of the fold are passed through
    ``random_upsample_balance`` before being handed to
    ``calc_results_simple``, whose result is returned unchanged.

    Args:
        X: Feature array, indexable by the fold indices.
        y: Label array aligned with ``X``.
        train_index: Indices selecting the training samples.
        test_index: Indices selecting the test samples.
        clf: Classifier forwarded to ``calc_results_simple``.
    """
    fold_train = (X[train_index], y[train_index])
    fold_test = (X[test_index], y[test_index])

    # Training data is balanced first, then the test data.
    X_tr, y_tr = random_upsample_balance(*fold_train)
    X_te, y_te = random_upsample_balance(*fold_test)

    return calc_results_simple(X_tr, X_te, y_tr, y_te, clf)
def get_balanced_split_for_study(full_dataset, study, y_field="posOutcome"):
    """Return one balanced train/test split for a single study.

    The study's data is obtained from ``prepare_dataset``; the first split
    produced by a 5-fold ``RepeatedStratifiedKFold`` is taken, and both of
    its halves are passed through ``random_upsample_balance``.

    Args:
        full_dataset: Dataset forwarded to ``prepare_dataset``.
        study: Study selector forwarded to ``prepare_dataset``.
        y_field: Name of the label field forwarded to ``prepare_dataset``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    features, labels = prepare_dataset(full_dataset, study, y_field=y_field)

    # Only the first generated split is consumed; the rest are never drawn.
    splitter = RepeatedStratifiedKFold(n_splits=5)
    first_split = next(splitter.split(features, labels))
    train_index, test_index = first_split

    X_train, y_train = random_upsample_balance(
        features[train_index], labels[train_index])
    X_test, y_test = random_upsample_balance(
        features[test_index], labels[test_index])

    return X_train, X_test, y_train, y_test
Example #4
0
def calc_results_withfull_simple(X, y, train_index, test_index, full_dataset, test_study, clf):
    """Evaluate ``clf`` on one fold, training on the fold plus other studies.

    The fold's training and test parts are each balanced with
    ``random_upsample_balance``. The samples returned by
    ``get_balanced_studies_except_test_study`` are then appended to the
    balanced training part, and the result of ``calc_results_simple`` on the
    combined training set and the balanced test part is returned.

    Args:
        X: Feature array of the test study, indexable by the fold indices.
        y: Label array aligned with ``X``.
        train_index: Indices selecting the in-study training samples.
        test_index: Indices selecting the test samples.
        full_dataset: Dataset forwarded to
            ``get_balanced_studies_except_test_study``.
        test_study: Study identifier forwarded to the same helper.
        clf: Classifier forwarded to ``calc_results_simple``.
    """
    study_train = (X[train_index], y[train_index])
    study_test = (X[test_index], y[test_index])

    X_own, y_own = random_upsample_balance(*study_train)
    X_eval, y_eval = random_upsample_balance(*study_test)

    X_extra, y_extra = get_balanced_studies_except_test_study(
        full_dataset, test_study)

    # In-study samples come first, followed by the additional samples.
    X_fit = np.concatenate([X_own, X_extra])
    y_fit = np.concatenate([y_own, y_extra])

    return calc_results_simple(X_fit, X_eval, y_fit, y_eval, clf)
def calc_results_for_fold(X, y, train_index, test_index, clf):
    """Fit ``clf`` on one balanced fold and score it on the balanced test part.

    Args:
        X: Feature array, indexable by the fold indices.
        y: Label array aligned with ``X``; recall is computed for labels
            0 and 1.
        train_index: Indices selecting the training samples.
        test_index: Indices selecting the test samples.
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        Tuple ``(acc, recall_0, recall_1)``: the fraction of matching
        predictions and the recall with label 0, respectively label 1,
        treated as the positive class.
    """
    fit_X, fit_y = random_upsample_balance(X[train_index], y[train_index])
    eval_X, eval_y = random_upsample_balance(X[test_index], y[test_index])

    predicted = clf.fit(fit_X, fit_y) and None
    predicted = clf.predict(eval_X)

    acc = np.mean(eval_y == predicted)
    recall_0, recall_1 = (
        recall_score(eval_y, predicted, pos_label=label) for label in (0, 1)
    )

    return acc, recall_0, recall_1
Example #6
0
def calc_results_withfull_balanced2(X, y, train_index, test_index, full_dataset, test_study, clf):
    """Evaluate ``clf`` on one fold, weighting in-study data against other studies.

    The fold's training and test parts are balanced with
    ``random_upsample_balance``. The balanced in-study training samples are
    then repeated so that their count approaches the number of samples
    returned by ``get_balanced_studies_except_test_study``, and both sets are
    concatenated to form the final training set.

    Args:
        X: Feature array of the test study, indexable by the fold indices.
        y: Label array aligned with ``X``.
        train_index: Indices selecting the in-study training samples.
        test_index: Indices selecting the test samples.
        full_dataset: Dataset forwarded to
            ``get_balanced_studies_except_test_study``.
        test_study: Study identifier forwarded to the same helper.
        clf: Classifier forwarded to ``calc_results_simple``.

    Returns:
        Whatever ``calc_results_simple`` returns for the combined training
        set and the balanced test part.
    """
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    X_train, y_train = random_upsample_balance(X_train, y_train)
    X_test, y_test = random_upsample_balance(X_test, y_test)

    X_train_other, y_train_other = get_balanced_studies_except_test_study(full_dataset, test_study)

    # Keep at least one copy of the in-study data: when the other studies
    # contribute fewer samples than the study itself, the plain ratio floors
    # to 0 and np.repeat(..., 0) would silently drop every in-study sample.
    N_rep = max(1, len(y_train_other) // len(y_train))

    # Each sample is duplicated N_rep times in place (a, a, ..., b, b, ...);
    # X and y are repeated identically so they stay aligned.
    X_train_rep = np.repeat(X_train, N_rep, axis=0)
    y_train_rep = np.repeat(y_train, N_rep, axis=0)

    X_train = np.concatenate([X_train_rep, X_train_other])
    y_train = np.concatenate([y_train_rep, y_train_other])

    return calc_results_simple(X_train, X_test, y_train, y_test, clf)
def get_balanced_study(full_dataset, study):
    """Return the ``(X, y)`` pair of one study after upsample-balancing.

    The study's data comes from ``prepare_dataset`` and is returned exactly
    as produced by ``random_upsample_balance``.
    """
    study_data = prepare_dataset(full_dataset, study)
    features, labels = study_data
    balanced = random_upsample_balance(features, labels)
    return balanced
from funs_balance import random_upsample_balance

# Small demo of random_upsample_balance on an imbalanced toy set:
# four samples carry label 1 and two samples carry label 0.
X = [
    [1, 1], [2, 1], [3, 1], [4, 1],
    [1, 0], [2, 0],
]
y = [1, 1, 1, 1, 0, 0]

Xb, yb = random_upsample_balance(X, y)

# Balanced features on the first line, balanced labels on the second.
print(Xb, yb, sep="\n")
Example #9
0
def print_results(dataset, set1, set2):
    """Cross-validate several classifiers on two patient sets and print scores.

    Rows of ``dataset`` whose ``patient_ID`` is in ``set1`` / ``set2`` are
    prepared separately, tagged via ``add_one_features`` with 0 / 1, and
    stacked. For every split of a 5-fold, 10-repeat stratified
    cross-validation the train and test parts are balanced, five experiments
    are run through ``calc_results_simple``, and the per-split results are
    printed. A summary line per experiment is printed at the end.

    Experiments (column 0 of the tagged matrix is treated as the set flag):
        genes         -- all columns except column 0, XGBClassifier
        genes_set     -- all columns, XGBClassifier
        genes_biased  -- all columns, BiasedXgboost
        genes_double  -- all columns, DoubleXgboost
        study         -- column 0 only, XGBClassifier

    Args:
        dataset: Table with a ``patient_ID`` column, supporting ``.loc`` and
            ``.isin``.
        set1: Patient IDs of the first set (tagged 0).
        set2: Patient IDs of the second set (tagged 1).
    """
    in_set1 = dataset['patient_ID'].isin(set1)
    X_set1, y_set1 = prepare_full_dataset(dataset.loc[in_set1])
    in_set2 = dataset['patient_ID'].isin(set2)
    X_set2, y_set2 = prepare_full_dataset(dataset.loc[in_set2])

    # Debug aid (disabled): replace the features with random noise.
    #    X_set1 = np.random.rand(*X_set1.shape)
    #    X_set2 = np.random.rand(*X_set2.shape)

    X_genes_wf = np.concatenate([
        add_one_features(X_set1, 0),
        add_one_features(X_set2, 1),
    ])
    y_all = np.concatenate([y_set1, y_set2])

    splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)
    print_order = [
        "genes", "genes_set", "genes_biased", "genes_double", "study"
    ]
    label_width = max(map(len, print_order))

    rez = defaultdict(list)

    for train_index, test_index in splitter.split(X_genes_wf, y_all):
        X_wf_train = X_genes_wf[train_index]
        X_wf_test = X_genes_wf[test_index]
        y_train = y_all[train_index]
        y_test = y_all[test_index]

        print("before balanced")
        print_count_two_sets(X_wf_train[:, 0], y_train)
        print_count_two_sets(X_wf_test[:, 0], y_test)

        X_wf_train, y_train = random_upsample_balance(X_wf_train, y_train)
        X_wf_test, y_test = random_upsample_balance(X_wf_test, y_test)

        print("after balanced")
        print_count_two_sets(X_wf_train[:, 0], y_train)
        print_count_two_sets(X_wf_test[:, 0], y_test)

        # (result key, train features, test features, classifier factory),
        # listed in the order the experiments are executed.
        experiments = (
            ("genes", X_wf_train[:, 1:], X_wf_test[:, 1:], XGBClassifier),
            ("genes_set", X_wf_train, X_wf_test, XGBClassifier),
            ("genes_biased", X_wf_train, X_wf_test, BiasedXgboost),
            ("genes_double", X_wf_train, X_wf_test, DoubleXgboost),
            ("study", X_wf_train[:, :1], X_wf_test[:, :1], XGBClassifier),
        )
        for key, feats_train, feats_test, make_clf in experiments:
            # A fresh classifier instance is built for every experiment.
            rez[key].append(
                calc_results_simple(feats_train, feats_test, y_train, y_test,
                                    make_clf()))

        # Results of the split that just finished, padded to aligned columns.
        for order in print_order:
            padding = " " * (label_width - len(order))
            print(order, padding, ": ", list_to_4g_str(rez[order][-1]))
        print("")

    # Summary over all splits.
    for order in print_order:
        padding = " " * (label_width - len(order))
        print("==> ", order, padding, ": ", list2d_to_4g_str_pm(rez[order]))