Example #1
def test_oob_too_little_estimators(make_whas500):
    whas500 = make_whas500(to_numeric=True)

    forest = RandomSurvivalForest(n_estimators=3,
                                  oob_score=True,
                                  random_state=2)
    with pytest.warns(UserWarning,
                      match="Some inputs do not have OOB scores. "
                      "This probably means too few trees were used "
                      "to compute any reliable oob estimates."):
        forest.fit(whas500.x, whas500.y)
def test_fit_no_bootstrap(make_whas500):
    whas500 = make_whas500(to_numeric=True)

    forest = RandomSurvivalForest(n_estimators=10, bootstrap=False, random_state=2)
    forest.fit(whas500.x, whas500.y)

    pred = forest.predict(whas500.x)

    expected_c = (0.931881994437717, 70030, 5119, 0, 14)
    assert_cindex_almost_equal(
        whas500.y["fstat"], whas500.y["lenfol"], pred, expected_c)
def test_fit_with_small_max_samples(make_whas500):
    whas500 = make_whas500(to_numeric=True)

    # First fit with no restriction on max samples
    est1 = RandomSurvivalForest(
        n_estimators=1,
        random_state=1,
        max_samples=None,
    )

    # Second fit with max samples restricted to just 2
    est2 = RandomSurvivalForest(
        n_estimators=1,
        random_state=1,
        max_samples=2,
    )

    est1.fit(whas500.x, whas500.y)
    est2.fit(whas500.x, whas500.y)

    tree1 = est1.estimators_[0].tree_
    tree2 = est2.estimators_[0].tree_

    msg = "Tree without `max_samples` restriction should have more nodes"
    assert tree1.node_count > tree2.node_count, msg
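
# Note: here and in the other tests below that take a `func` argument, `func`
# is supplied by pytest parametrization; a plausible (assumed, not verbatim)
# decorator would be:
# @pytest.mark.parametrize("func", ["predict_survival_function",
#                                   "predict_cumulative_hazard_function"])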
def test_predict_step_function_warning(make_whas500, func):
    whas500 = make_whas500(to_numeric=True)

    forest = RandomSurvivalForest(n_estimators=3,
                                  oob_score=True,
                                  random_state=2)
    forest.fit(whas500.x, whas500.y)

    pred_fn = getattr(forest, func)

    with pytest.warns(
            FutureWarning,
            match="{} will return an array of StepFunction instances in 0.14".
            format(func)):
        pred_fn(whas500.x)
def test_pipeline_predict(breast_cancer, func):
    X_str, _ = load_breast_cancer()
    X_num, y = breast_cancer

    est = RandomSurvivalForest(n_estimators=10, random_state=1)
    est.fit(X_num[10:], y[10:])

    pipe = make_pipeline(OneHotEncoder(),
                         RandomSurvivalForest(n_estimators=10, random_state=1))
    pipe.fit(X_str[10:], y[10:])

    tree_pred = getattr(est, func)(X_num[:10], return_array=True)
    pipe_pred = getattr(pipe, func)(X_str[:10], return_array=True)

    assert_array_almost_equal(tree_pred, pipe_pred)
Example #6
def test_fit_predict(make_whas500):
    whas500 = make_whas500(to_numeric=True)

    forest = RandomSurvivalForest(random_state=2)
    forest.fit(whas500.x, whas500.y)

    assert len(forest.estimators_) == 100

    pred = forest.predict(whas500.x)
    assert numpy.isfinite(pred).all()
    assert numpy.all(pred >= 0)

    expected_c = (0.9026201280123488, 67831, 7318, 0, 14)
    assert_cindex_almost_equal(whas500.y["fstat"], whas500.y["lenfol"], pred,
                               expected_c)
Example #7
    def fit_regressors(self, features, performances, random_state):
        imputer = [SimpleImputer() for _ in range(self.num_algorithms)]
        scaler = [StandardScaler() for _ in range(self.num_algorithms)]
        models = [
            RandomSurvivalForest(
                n_estimators=self.n_estimators,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                max_features=self.max_features,
                bootstrap=self.bootstrap,
                oob_score=self.oob_score,
                n_jobs=1,
                random_state=random_state) for _ in range(self.num_algorithms)
        ]

        for alg_id in range(self.num_algorithms):
            # prepare the survival dataset for this algorithm, then impute and scale it
            X_train, Y_train = self.construct_dataset_for_algorithm_id(
                features, performances, alg_id, self.algorithm_cutoff_time)
            X_train = imputer[alg_id].fit_transform(X_train)
            X_train = scaler[alg_id].fit_transform(X_train)
            models[alg_id].fit(X_train, Y_train)

        return imputer, scaler, models
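
At prediction time, each algorithm's fitted imputer and scaler must be applied in the same order before its forest scores an instance. A minimal sketch of that counterpart (the `predict_regressors` name and signature are assumptions, not part of the original):

    def predict_regressors(self, features, imputer, scaler, models):
        # reuse the per-algorithm preprocessing fitted in fit_regressors
        predictions = []
        for alg_id in range(self.num_algorithms):
            X = imputer[alg_id].transform(features)
            X = scaler[alg_id].transform(X)
            predictions.append(models[alg_id].predict(X))
        return predictions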
def test_predict_step_function(make_whas500, func):
    whas500 = make_whas500(to_numeric=True)

    forest = RandomSurvivalForest(n_estimators=10, random_state=2)
    forest.fit(whas500.x[10:], whas500.y[10:])

    pred_fn = getattr(forest, func)

    ret_array = pred_fn(whas500.x[:10], return_array=True)
    fn_array = pred_fn(whas500.x[:10], return_array=False)

    assert ret_array.shape[0] == fn_array.shape[0]

    for fn, arr in zip(fn_array, ret_array):
        assert_array_almost_equal(fn.x, forest.event_times_)
        assert_array_almost_equal(fn.y, arr)
Example #9
def _random_survival_forest(formatted_x, formatted_y, test_size, n_estimators,
                            min_samples_split, min_samples_leaf, max_features,
                            random_state):

    x_train, x_test, y_train, y_test = train_test_split(
        formatted_x,
        formatted_y,
        test_size=test_size,
        random_state=random_state)

    cur_rsf = RandomSurvivalForest(n_estimators=n_estimators,
                                   min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf,
                                   max_features=max_features,
                                   n_jobs=-1,
                                   random_state=random_state)
    cur_rsf.fit(x_train, y_train)

    return cur_rsf.score(x_test, y_test)
Example #10
def test_fit_predict_chf(make_whas500):
    whas500 = make_whas500(to_numeric=True)

    forest = RandomSurvivalForest(n_estimators=10, random_state=2)
    forest.fit(whas500.x, whas500.y)

    assert len(forest.estimators_) == 10

    chf = forest.predict_cumulative_hazard_function(whas500.x)
    assert chf.shape == (500, forest.event_times_.shape[0])

    assert numpy.isfinite(chf).all()
    assert numpy.all(chf >= 0.0)

    vals, counts = numpy.unique(chf[:, 0], return_counts=True)
    assert vals[0] == 0.0
    assert numpy.max(counts) == counts[0]

    d = numpy.apply_along_axis(numpy.diff, 1, chf)
    assert (d >= 0).all()
Example #11
def test_fit_predict_surv(make_whas500):
    whas500 = make_whas500(to_numeric=True)

    forest = RandomSurvivalForest(n_estimators=10, random_state=2)
    forest.fit(whas500.x, whas500.y)

    assert len(forest.estimators_) == 10

    surv = forest.predict_survival_function(whas500.x)
    assert surv.shape == (500, forest.event_times_.shape[0])

    assert numpy.isfinite(surv).all()
    assert numpy.all(surv >= 0.0)
    assert numpy.all(surv <= 1.0)

    vals, counts = numpy.unique(surv[:, 0], return_counts=True)
    assert vals[-1] == 1.0
    assert numpy.max(counts) == counts[-1]

    d = numpy.apply_along_axis(numpy.diff, 1, surv)
    assert (d <= 0).all()
Example #12
def test_oob_score(make_whas500):
    whas500 = make_whas500(to_numeric=True)

    forest = RandomSurvivalForest(oob_score=True,
                                  bootstrap=False,
                                  random_state=2)
    with pytest.raises(ValueError,
                       match="Out of bag estimation only available "
                       "if bootstrap=True"):
        forest.fit(whas500.x, whas500.y)

    forest.set_params(bootstrap=True)
    forest.fit(whas500.x, whas500.y)

    assert forest.oob_prediction_.shape == (whas500.x.shape[0], )
    assert round(abs(forest.oob_score_ - 0.753010685), 6) == 0.0
    def __init__(self, n_estimators=100):

        super(RandomSurvForest, self).__init__()
        # super().__init__()

        self.n_estimators = n_estimators
        self.name = 'RandomSurvForest'

        self.model = RandomSurvivalForest(n_estimators=self.n_estimators)
        self.direction = 1
        self.prob_FLAG = True

        self.explained = "*Random Survival Forest"
        self.image_name = "RandomSurvForest.png"
        self.image_size = (500, 500)
def test_fit_int_time(make_whas500):
    whas500 = make_whas500(to_numeric=True)
    y = whas500.y
    y_int = numpy.empty(y.shape[0],
                        dtype=[(y.dtype.names[0], bool),
                               (y.dtype.names[1], int)])
    y_int[:] = y

    forest_f = RandomSurvivalForest(oob_score=True, random_state=2).fit(
        whas500.x[50:], y[50:])
    forest_i = RandomSurvivalForest(oob_score=True, random_state=2).fit(
        whas500.x[50:], y_int[50:])

    assert len(forest_f.estimators_) == len(forest_i.estimators_)
    assert forest_f.n_features_ == forest_i.n_features_
    assert forest_f.oob_score_ == forest_i.oob_score_
    assert_array_almost_equal(forest_f.event_times_, forest_i.event_times_)

    pred_f = forest_f.predict(whas500.x[:50])
    pred_i = forest_i.predict(whas500.x[:50])

    assert_array_almost_equal(pred_f, pred_i)
Example #15
          'T2w_contrast_mean']], data[['tum_vol', 'ADC_ave', 'T2w_ave']],
    data[['tum_area_from_vol', 'dens_ADCT2w']], data[['init_tum_area']],
    data[['TTum330_alphaG1120_vd02']]
]

y = np.zeros(76,
             dtype={
                 'names': ('bio_rec_6', 'bio_rec_6_delay'),
                 'formats': ('bool', 'int')
             })
y['bio_rec_6'] = data[['bio_rec_6']].to_numpy().ravel()
y['bio_rec_6_delay'] = data[['bio_rec_6_delay']].to_numpy().ravel()

#rsf = RandomSurvivalForest(n_estimators = 100, min_samples_split = 10,
#                           min_samples_leaf = 15, max_features = "sqrt")
rsf = RandomSurvivalForest(n_estimators=100, min_samples_leaf=15)

scores = np.zeros((N, len(tx)))

for j in range(N):
    cv = StratifiedKFold(n_splits=K, shuffle=True)
    for i, x in enumerate(tx):
        x = x.to_numpy()
        for k, (train, test) in enumerate(cv.split(x, y['bio_rec_6'])):
            xtrain, ytrain = x[train], y[train]
            xtest, ytest = x[test], y[test]
            rsf.fit(xtrain, ytrain)
            scores[j, i] += rsf.score(xtest, ytest)
scores /= K  # each entry summed K fold scores; divide once to get per-fold means

mean_scores = np.mean(scores, axis=0)
def test_fit_max_samples(make_whas500, max_samples, exc_type, exc_msg):
    whas500 = make_whas500(to_numeric=True)
    forest = RandomSurvivalForest(max_samples=max_samples)
    with pytest.raises(exc_type, match=exc_msg):
        forest.fit(whas500.x, whas500.y)
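
# `max_samples`, `exc_type`, and `exc_msg` are likewise supplied by pytest
# parametrization; an illustrative (assumed, not verbatim) parametrization that
# pairs invalid values with loose match patterns:
# @pytest.mark.parametrize("max_samples,exc_type,exc_msg", [
#     (0, ValueError, "max_samples"),    # integer below the valid range
#     (1.5, ValueError, "max_samples"),  # fraction outside (0.0, 1.0]
# ])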
Example #17
    def test_regression(self):
        seed = 15
        train_performances = self.train_performances
        train_features = self.train_inst
        test_performances = self.test_performances
        test_features = self.test_inst
        # print(train_performances)
        # melted_train_performances = pd.melt(train_performances.reset_index(
        # ), id_vars="index", value_name="performance", var_name="algorithm")
        # print(melted_train_performances)
        # joined_train_data = train_features.join(
        #     melted_train_performances.set_index("index"))
        # joined_train_data["algorithm"] = joined_train_data["algorithm"].astype(
        #     "category")
        # for index, algorithm in enumerate(self.algorithms):

        # train_data = encoder.fit_transform(joined_train_data)

        dataset = []
        indices = []
        for inst_index, row in train_performances.iterrows():
            for alg_index, algorithm in enumerate(self.algorithms):
                cur_features = self.train_inst.loc[inst_index]
                alg_enc = len(self.algorithms) * [0]
                alg_enc[alg_index] = 1
                alg_one_hot = pd.Series(alg_enc, index=self.algorithms)
                cur_performance = row.loc[algorithm]
                new_row = cur_features.append(alg_one_hot).append(
                    pd.Series(data=[cur_performance], index=["performance"]))
                dataset.append(new_row)
                indices.append(inst_index)

        df_train = pd.DataFrame(dataset, index=indices)
        print(df_train)

        X_train = df_train.iloc[:, :-1]
        y_train = df_train.iloc[:, -1]
        # X_test = test_data.iloc[:, :-1]
        # y_test = test_data.iloc[:, -1]

        X_train = X_train.to_numpy().astype(np.float64)
        y_train = y_train.to_numpy().astype(np.float64)

        mask = y_train <= 2500.0
        timeouted_runs = ~mask

        # the time at which the observation ends is actually the cutoff, not the par10
        y_train[timeouted_runs] = 700.0
        structured_y_train = np.rec.fromarrays([mask, y_train],
                                               names="terminated,runtime")
        print(structured_y_train)

        model = RandomSurvivalForest(n_estimators=100,
                                     n_jobs=1,
                                     random_state=seed)
        print("Starting to fit model")
        model.fit(X_train, structured_y_train)

        # evaluate model
        result_data_rsf = []
        for index, row in test_features.iterrows():
            predicted_performances = []
            # predicted_performances = [-1] * len(
            #     test_scenario.performance_data.columns)
            for alg_index, algorithm in enumerate(self.algorithms):
                # print(algorithm)
                cur_features = row
                alg_enc = len(self.algorithms) * [0]
                alg_enc[alg_index] = 1
                alg_one_hot = pd.Series(alg_enc, index=self.algorithms)
                new_row = cur_features.append(alg_one_hot)
                new_row_np = new_row.to_numpy().astype(np.float64).reshape(
                    1, -1)
                predicted_performance = model.predict(new_row_np)
                predicted_performances.append(predicted_performance[0])
            result_data_rsf.append([index, *predicted_performances])

        performance_cols = [x + "_performance" for x in self.algorithms]
        result_columns_rsf = ["problem_instance"]
        result_columns_rsf += performance_cols
        results_rsf = pd.DataFrame(data=result_data_rsf,
                                   columns=result_columns_rsf)
        print(results_rsf)

        taus = []
        for index, row in self.test_performances.iterrows():
            true_performances = row.to_numpy()
            true_ranking = np.argsort(true_performances)
            predicted_scores = results_rsf.loc[index].to_numpy()[1:]
            predicted_ranking = np.argsort(predicted_scores)[::-1]
            print("true", true_performances[true_ranking])
            print("predicted", predicted_scores[predicted_ranking])
            print(predicted_scores)
            print()
            print("argmax", np.argmax(predicted_scores))
            print("true ranking", true_ranking)
            print("predicted ranking", predicted_ranking)
            print("\n")
            # print("predicted scores", predicted_scores)
            taus.append(kendalltau(true_ranking, predicted_ranking)[0])

        print("taus", taus)
        print("Average Kendalls tau", np.mean(taus))
Example #18
                         sort=False).astype(np.float64)

output_train['SurvivalTime'] = output_train['SurvivalTime'].astype(np.float64)

random_state = 20
X_train, X_test, y_train, y_test = train_test_split(
    features_RTF,
    output_train.to_records(index=False),
    test_size=0.25,
    random_state=random_state)

# In[832]:

rsf = RandomSurvivalForest(n_estimators=100,
                           min_samples_split=10,
                           min_samples_leaf=15,
                           max_features="sqrt",
                           n_jobs=-1,
                           random_state=random_state)
rsf.fit(X_train, y_train)

# In[833]:

rsf.score(X_test, y_test)

# In[813]:


def get_predict_time_RSF(predict, feature, model, fit=True):
    median_survival_time = np.zeros(np.size(predict, 0))
    times = model.event_times_
    for i in range(np.size(predict, 0)):
Example #19
                   axis=1)
categorical_ix = [0, 2, 3, 4, 5, 6, 7, 8, 15, 16, 17]
categorical_columns = data_x.columns[categorical_ix].values
data_x_one_hot = pd.get_dummies(data_x, columns=categorical_columns)

data_y = data[['Mortality', 'SurvivalWeeks']].copy()  # copy to avoid a chained-assignment warning
data_y['Mortality'] = data_y['Mortality'].astype(bool)
data_y = np.array(list(data_y.to_records(index=False)))

X_train, X_test, y_train, y_test = train_test_split(data_x_one_hot,
                                                    data_y,
                                                    test_size=0.25,
                                                    random_state=369)

a = X_test[X_test['AF_1.0'] == 1].iloc[0:3, :]
demo_x = pd.concat([
    X_test[X_test['AF_1.0'] == 1].iloc[0:3, :],
    X_test[X_test['AF_1.0'] == 0].iloc[0:3, :]
])

rsf = RandomSurvivalForest(n_estimators=1000,
                           min_samples_split=10,
                           min_samples_leaf=15,
                           max_features="sqrt",
                           n_jobs=-1,
                           random_state=369)
rsf.fit(X_train, y_train)

print(rsf.score(X_test, y_test))

print('done')
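
demo_x is built above but never used in this excerpt; presumably it feeds a per-patient comparison of predicted survival curves. A sketch of that step (the plotting code is an assumption, not part of the original):

import matplotlib.pyplot as plt  # assumed available

surv = rsf.predict_survival_function(demo_x, return_array=True)
for curve in surv:
    plt.step(rsf.event_times_, curve, where="post")
plt.xlabel("Time")
plt.ylabel("Predicted survival probability")
plt.show()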
Example #20
for i in range(labels_crit_train.shape[0]):
    if label_train[i, 1] == 1:
        labels_crit_train[i] = (True,  label_train[i, 0])
    else:
        labels_crit_train[i] = (False, label_train[i, 0])

for i in range(labels_crit_test.shape[0]):
    if label_test[i, 1] == 1:
        labels_crit_test[i] = (True,  label_test[i, 0])
    else:
        labels_crit_test[i] = (False, label_test[i, 0])


# Define the parameters of the random survival forest
# (illustrative values; tune for your data)
n_estimators = 100        # number of trees in the forest
max_depth = None          # grow each tree fully unless a limit is set
min_samples_split = 10    # minimum samples required to split an internal node
min_samples_leaf = 15     # minimum samples required at a leaf node
max_features = "sqrt"     # number of features considered at each split
bootstrap = True
rsf = RandomSurvivalForest(n_estimators=n_estimators,
                           max_depth=max_depth,
                           max_features=max_features,
                           bootstrap=bootstrap,
                           min_samples_split=min_samples_split,
                           min_samples_leaf=min_samples_leaf)

# fit the model
rsf.fit(feat_train, labels_crit_train)

# obtain the c-index on test data
result_test = rsf.score(feat_test, labels_crit_test)

# or if you would like to know the predicted risk scores
risks_test = rsf.predict(feat_test)
Example #21
        Labels_Crit_Test[i] = (False, Time_Test[i, 0])

# The DL Feature based Prediction model
random_state = 50
n_estimators = 30
max_depth = None
min_samples_split = 8
min_samples_leaf = 6
max_features = 'sqrt'
n_jobs = 6
bootstrap = True

rsf = RandomSurvivalForest(n_estimators=n_estimators,
                           max_depth=max_depth,
                           max_features=max_features,
                           random_state=random_state,
                           bootstrap=bootstrap,
                           min_samples_split=min_samples_split,
                           min_samples_leaf=min_samples_leaf)
rsf.fit(Feat_Train, Labels_Crit_Train)

# The risk scores of each subject
Scores_Train_DL = rsf.predict(Feat_Train)
Scores_Test_DL = rsf.predict(Feat_Test)
Scores_Val_DL = rsf.predict(Feat_Val)
Scores_Train_DL = np.expand_dims(Scores_Train_DL, axis=1)
Scores_Test_DL = np.expand_dims(Scores_Test_DL, axis=1)
Scores_Val_DL = np.expand_dims(Scores_Val_DL, axis=1)

C_Ind_Test_DL_Best = rsf.score(Feat_Test, Labels_Crit_Test)
Example #22
y_struc = np.zeros(len(y),  # one structured record per sample (length assumed)
                   dtype={
                       'names': ('vital_status', 'os_time'),
                       'formats': ('?', 'f8')
                   })
y_struc['vital_status'] = y['event'].astype('bool')
y_struc['os_time'] = y['time']

X = data.copy()

train_scores = []
test_scores = []
for i in tqdm(range(1, 11)):
    random_state = i
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_struc, test_size=0.2, random_state=random_state)
    rsf = RandomSurvivalForest(n_estimators=200,
                               min_samples_split=20,
                               min_samples_leaf=25,
                               max_features="sqrt",
                               n_jobs=-1,
                               random_state=random_state)
    rsf.fit(X_train, y_train)
    train_scores.append(rsf.score(X_train, y_train))
    test_scores.append(rsf.score(X_test, y_test))

print('training', np.asarray(train_scores))
print('training_mean',
      np.asarray(train_scores).mean(), 'training_stdev',
      np.asarray(train_scores).std())
print('test', np.asarray(test_scores))
print('test_mean',
      np.asarray(test_scores).mean(), 'test_stdev',
      np.asarray(test_scores).std())
Example #23
import preprocessing
from sksurv.ensemble import RandomSurvivalForest
from sklearn.model_selection import cross_validate
"""
Reading data
"""


def load_data(features=None):
    X_df, y_df, _ = preprocessing.load_owkin_data()
    if features is not None:
        X_df = X_df[features]
    X = X_df.to_numpy()
    y = preprocessing.y_dataframe_to_rsf_input(y_df)
    return X_df, y_df, X, y


X_df, y_df, X, y = load_data()
feature_name = list(X_df.columns.values)
"""
Train model
"""
params = {'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 10}
rsf = RandomSurvivalForest(n_estimators=params['n_estimators'],
                           min_samples_split=params['min_samples_split'],
                           min_samples_leaf=params['min_samples_leaf'],
                           max_features="sqrt",
                           n_jobs=-1)
print(cross_validate(rsf, X, y, cv=5))
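
cross_validate returns a dict of arrays; the per-fold concordance indices (the estimator's default score) live under "test_score". A usage sketch:

cv_results = cross_validate(rsf, X, y, cv=5)
print(cv_results["test_score"], cv_results["test_score"].mean())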
Example #24
X_no_grade = X.drop("tgrade", axis=1)
Xt = OneHotEncoder().fit_transform(X_no_grade)
Xt = np.column_stack((Xt.values, grade_num))

feature_names = X_no_grade.columns.tolist() + ["tgrade"]

random_state = 20

X_train, X_test, y_train, y_test = train_test_split(Xt,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=random_state)

rsf = RandomSurvivalForest(n_estimators=1000,
                           min_samples_split=10,
                           min_samples_leaf=15,
                           max_features="sqrt",
                           n_jobs=-1,
                           random_state=random_state)

rsf.fit(X_train, y_train)

a = np.empty(X_test.shape[0], dtype=[("age", float), ("pnodes", float)])
a["age"] = X_test[:, 0]
a["pnodes"] = X_test[:, 4]

sort_idx = np.argsort(a, order=["pnodes", "age"])
X_test_sel = pd.DataFrame(X_test[np.concatenate(
    (sort_idx[:3], sort_idx[-3:]))],
                          columns=feature_names)

pd.Series(rsf.predict(X_test_sel))
Example #25
def test_fit_warm_start(make_whas500):
    whas500 = make_whas500(to_numeric=True)

    forest = RandomSurvivalForest(n_estimators=11, max_depth=2, random_state=2)
    forest.fit(whas500.x, whas500.y)

    assert len(forest.estimators_) == 11
    assert all((e.max_depth == 2 for e in forest.estimators_))

    forest.set_params(warm_start=True)
    with pytest.warns(UserWarning,
                      match="Warm-start fitting without increasing "
                      "n_estimators does not fit new trees."):
        forest.fit(whas500.x, whas500.y)

    forest.set_params(n_estimators=3)
    with pytest.raises(ValueError,
                       match="n_estimators=3 must be larger or equal to "
                       r"len\(estimators_\)=11 when warm_start==True"):
        forest.fit(whas500.x, whas500.y)

    forest.set_params(n_estimators=23)
    forest.fit(whas500.x, whas500.y)

    assert len(forest.estimators_) == 23
    assert all((e.max_depth == 2 for e in forest.estimators_))
Example #26
cph = CoxPHSurvivalAnalysis()
c_index['cph'] = cross_val_score(cph,
                                 X_ss,
                                 y_ss,
                                 cv=k_fold,
                                 scoring=c_index_scorer,
                                 verbose=1)

# %%
# Random Survival Forests (RSFs)

# %%
X_rsf = df.copy(deep=True)

rsf = RandomSurvivalForest(random_state=SEED, n_jobs=-1, verbose=True)
rsf.criterion = 'log_rank'

if not (PATH_PARAMETERS / 'param_search_results_rsf.pkl').exists():
    helpers.save_best_params(rsf,
                             X_rsf,
                             y_ss,
                             PATH_PARAMETERS,
                             k_fold=k_fold,
                             scorer=c_index_scorer,
                             n_iter=25)

param_search_results_rsf = load(PATH_PARAMETERS /
                                'param_search_results_rsf.pkl')
c_index['rsf'] = helpers.get_best_test_scores(param_search_results_rsf)
Example #27
    print('Testing on %d-----------------------------' % val_id)
    print(x_train.shape, x_val.shape)

    # special for RSF
    dt = np.dtype('bool,float')
    y_train_surv = np.array([(bool(e), y) for e, y in zip(e_train, y_train)],
                            dtype=dt)
    y_val_surv = np.array([(bool(e), y) for e, y in zip(e_val, y_val)],
                          dtype=dt)
    print(y_train_surv.shape, y_val_surv.shape)

    # train RSF
    rsf = RandomSurvivalForest(n_estimators=200,
                               min_samples_split=70,
                               min_samples_leaf=30,
                               max_features="log2",
                               oob_score=True,
                               n_jobs=-1,
                               random_state=20)
    rsf.fit(x_train, y_train_surv)

    preds = -rsf.predict(x_val)

    cindex_train = rsf.score(x_train, y_train_surv)
    cindex_oob = rsf.oob_score_
    cindex_val = rsf.score(x_val, y_val_surv)
    cindex_val_events = rsf.score(x_val[e_val == 1], y_val_surv[e_val == 1])

    rsf_cindex_trains.append(cindex_train)
    rsf_cindex_vals.append(cindex_val)
    rsf_cindex_vals_events.append(cindex_val_events)
def RSF_bootstrap(fp, num=False):
    df = pd.read_csv(fp, index_col=0)

    # configure bootstrap (sampling 50% of data)
    n_iterations = 100
    n_size = int(len(df) * 0.50)

    # parameters
    NUMESTIMATORS = 100
    TESTSIZE = 0.20
    random_state = 20

    # calculate population of statistics
    metrics = []
    for i in range(n_iterations):
        # prepare sample

        # if indicated, include number of mets (col 42)
        if num:
            sample = resample(df.iloc[:, np.r_[:20, 40, 41, 42]],
                              n_samples=n_size)
            # NumMets sits at position 22 of the 23-column resampled frame
            X = sample.iloc[:, np.r_[:20, 22]].copy()

        else:
            sample = resample(df.iloc[:, np.r_[:20, 40, 41]], n_samples=n_size)
            X = sample.iloc[:, :20].copy()

        X = X.to_numpy().astype('float64')
        y = sample[['Event', 'Time']].copy()
        y['Event'] = y['Event'].astype('bool')
        y['Time'] = y['Time'].astype('float64')
        y = y.to_records(index=False)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TESTSIZE, random_state=random_state)
        rsf = RandomSurvivalForest(n_estimators=NUMESTIMATORS,
                                   min_samples_split=15,
                                   min_samples_leaf=8,
                                   max_features="sqrt",
                                   n_jobs=-1,
                                   random_state=random_state)
        rsf.fit(X_train, y_train)

        score = rsf.score(X_test, y_test)
        metrics.append(score)

    # calculate confidence interval
    alpha = 0.95
    p = ((1.0 - alpha) / 2.0) * 100
    lower = max(0.0, np.percentile(metrics, p))
    p = (alpha + ((1.0 - alpha) / 2.0)) * 100
    upper = min(1.0, np.percentile(metrics, p))
    med = np.percentile(metrics, 50)

    # identify aggregation method name
    if num:
        name = fp.split('/')[-1].split('_')[0] + ' + NumMets'
    else:
        name = fp.split('/')[-1].split('_')[0]

    print(name, 'RSF', '%.3f (%.3f-%.3f)' % (med, lower, upper))
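
A hypothetical invocation (the CSV path is illustrative, not from the original):

RSF_bootstrap('path/to/aggregated_features.csv', num=True)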