def get_balanced_split_for_study(full_dataset, study, y_field="posOutcome"):
    """Build one train/test split for ``study`` and rebalance both halves.

    The features and labels come from ``prepare_dataset``. The first split
    yielded by a 5-fold ``RepeatedStratifiedKFold`` is taken, and the train
    and test portions are each passed separately through
    ``random_upsample_balance``.

    Args:
        full_dataset: Dataset handed through to ``prepare_dataset``.
        study: Study identifier handed through to ``prepare_dataset``.
        y_field: Name of the label field, forwarded to ``prepare_dataset``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)`` after rebalancing.
    """
    features, labels = prepare_dataset(full_dataset, study, y_field=y_field)

    # The splitter is unseeded and only its first fold is consumed.
    splitter = RepeatedStratifiedKFold(n_splits=5)
    train_idx, test_idx = next(splitter.split(features, labels))

    train_part = (features[train_idx], labels[train_idx])
    test_part = (features[test_idx], labels[test_idx])

    # Each side is rebalanced on its own, train first and then test.
    # NOTE(review): presumably random upsampling of the minority class,
    # judging by the helper's name -- confirm against random_upsample_balance.
    X_train, y_train = random_upsample_balance(*train_part)
    X_test, y_test = random_upsample_balance(*test_part)

    return X_train, X_test, y_train, y_test
# Example #2
# 0
def get_balanced_study(full_dataset, study):
    """Prepare the data of ``study`` and return it after rebalancing.

    Args:
        full_dataset: Dataset handed through to ``prepare_dataset``.
        study: Study identifier handed through to ``prepare_dataset``.

    Returns:
        Whatever ``random_upsample_balance`` returns for the prepared
        features and labels (used elsewhere in this file as an ``(X, y)``
        pair).
    """
    features, labels = prepare_dataset(full_dataset, study)
    balanced = random_upsample_balance(features, labels)
    return balanced
# Derive the "notrea" variants by running both datasets through drop_trea.
# NOTE(review): presumably this removes treatment-related columns -- confirm
# against drop_trea.
full_notrea_dataset, mike_notrea_dataset = (
    drop_trea(dataset) for dataset in (full_dataset, mike_dataset)
)

# Distinct study identifiers; the order is whatever the set iteration yields.
all_studies = [*set(full_dataset['study'])]

# Labels in the order they should be printed.
print_order = [
    "full",
    "full_notrea",
    "full_pam50",
    "mike",
    "mike_svm",
    "mike_logi",
    "mike_notrea",
    "mike_notrea_svm",
    "mike_notrea_logi",
    "mike_pam50",
    "mike_pam50_svm",
    "mike_pam50_logi",
    "trea",
    "trea_svm",
    "trea_logi",
]
# Length of the longest label in print_order.
max_len_order = max(len(label) for label in print_order)

for study in ['study_20194_GPL96_all-bmc15']:
    #for study in ['study_17705_GPL96_MDACC_Tissue_BC_Tamoxifen-bmc15']:

    X_full, y_full = prepare_dataset(full_dataset, study)
    X_full_notrea, _ = prepare_dataset(full_notrea_dataset, study)
    X_mike, _ = prepare_dataset(mike_dataset, study)
    X_mike_notrea, _ = prepare_dataset(mike_notrea_dataset, study)
    X_trea, _ = prepare_dataset(treat_dataset, study)

    print("==>", study, count_to_str(y_full))
    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)

    rez = defaultdict(list)
    for i, (train_index, test_index) in enumerate(kf.split(X_full, y_full)):
        y_train, y_test = y_full[train_index], y_full[test_index]
        nrun = 11
        print("split: ", i, "train: ", count_to_str(y_train), "  test:",
              count_to_str(y_test))