def get_balanced_split_for_study(full_dataset, study, y_field="posOutcome",
                                 n_splits=5, random_state=None):
    """Return one stratified train/test split of a study, with both parts rebalanced.

    The data for ``study`` is obtained from ``prepare_dataset``; the first
    split produced by ``RepeatedStratifiedKFold`` is taken, and the train and
    test partitions are each passed separately through
    ``random_upsample_balance``.

    Args:
        full_dataset: Dataset handed to ``prepare_dataset``.
        study: Study identifier handed to ``prepare_dataset``.
        y_field: Name of the outcome field, forwarded to ``prepare_dataset``.
        n_splits: Number of folds for the splitter. With the default of 5 the
            first split holds out one fold out of five for testing.
        random_state: Forwarded to ``RepeatedStratifiedKFold`` so the split
            can be made reproducible. It seeds the fold splitter only.
            ``None`` (the default) keeps the previous unseeded behaviour.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X, y = prepare_dataset(full_dataset, study, y_field=y_field)

    # Only the first split is consumed; the generator is lazy, so the
    # remaining folds/repeats are never computed.
    kf = RepeatedStratifiedKFold(n_splits=n_splits, random_state=random_state)
    train_index, test_index = next(kf.split(X, y))

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Rebalancing happens after the split, so the two partitions are
    # processed independently of each other.
    X_train, y_train = random_upsample_balance(X_train, y_train)
    # NOTE(review): the test partition is rebalanced as well; presumably this
    # duplicates minority-class samples in the evaluation set -- confirm that
    # this is intended.
    X_test, y_test = random_upsample_balance(X_test, y_test)
    return X_train, X_test, y_train, y_test
def get_balanced_study(full_dataset, study):
    """Prepare the data of ``study`` and return it after rebalancing.

    The two values returned by ``prepare_dataset(full_dataset, study)`` are
    passed to ``random_upsample_balance``; whatever that helper returns is
    returned unchanged.
    """
    samples, labels = prepare_dataset(full_dataset, study)
    balanced = random_upsample_balance(samples, labels)
    return balanced
# Experiment driver: for each selected study, prepare every dataset variant
# and iterate over repeated stratified train/test splits of the labels.

# Copies of the two base datasets after drop_trea has been applied.
# NOTE(review): presumably drop_trea removes treatment-related columns --
# confirm against its definition.
full_notrea_dataset = drop_trea(full_dataset)
mike_notrea_dataset = drop_trea(mike_dataset)

# Distinct study identifiers found in the 'study' column (set order is
# arbitrary, so this list has no stable ordering).
all_studies = list(set(full_dataset['study']))

# Order in which result keys are meant to be reported.
print_order = [
    "full", "full_notrea", "full_pam50",
    "mike", "mike_svm", "mike_logi",
    "mike_notrea", "mike_notrea_svm", "mike_notrea_logi",
    "mike_pam50", "mike_pam50_svm", "mike_pam50_logi",
    "trea", "trea_svm", "trea_logi" ]
# Length of the longest key in print_order (presumably used to align
# printed columns -- confirm where it is consumed).
max_len_order = max(map(len, print_order))

# Only a single hard-coded study is processed.
# Alternative study that can be swapped in:
#for study in ['study_17705_GPL96_MDACC_Tissue_BC_Tamoxifen-bmc15']:
for study in ['study_20194_GPL96_all-bmc15']:
    # Prepare the same study from every dataset variant. The second return
    # value is kept only from the full dataset; for the other variants it is
    # discarded.
    X_full, y_full = prepare_dataset(full_dataset, study)
    X_full_notrea, _ = prepare_dataset(full_notrea_dataset, study)
    X_mike, _ = prepare_dataset(mike_dataset, study)
    X_mike_notrea, _ = prepare_dataset(mike_notrea_dataset, study)
    X_trea, _ = prepare_dataset(treat_dataset, study)

    # Header line: study name plus the count_to_str summary of its labels.
    print("==>", study, count_to_str(y_full))

    # 5 folds repeated 10 times; the splitter is not seeded here.
    kf = RepeatedStratifiedKFold(n_splits=5, n_repeats=10)
    # Per-study accumulator: missing keys start out as empty lists.
    rez = defaultdict(list)
    for i, (train_index, test_index) in enumerate(kf.split(X_full, y_full)):
        # The same index arrays select the labels for this split.
        y_train, y_test = y_full[train_index], y_full[test_index]
        # NOTE(review): nrun is reassigned on every split; presumably a run
        # count consumed further down -- confirm.
        nrun = 11
        print("split: ", i, "train: ", count_to_str(y_train), " test:", count_to_str(y_test))