def test_oob_too_little_estimators(make_whas500):
    whas500 = make_whas500(to_numeric=True)
    forest = RandomSurvivalForest(n_estimators=3, oob_score=True, random_state=2)
    with pytest.warns(UserWarning,
                      match="Some inputs do not have OOB scores. "
                            "This probably means too few trees were used "
                            "to compute any reliable oob estimates."):
        forest.fit(whas500.x, whas500.y)
def test_fit_no_bootstrap(make_whas500):
    whas500 = make_whas500(to_numeric=True)
    forest = RandomSurvivalForest(n_estimators=10, bootstrap=False, random_state=2)
    forest.fit(whas500.x, whas500.y)

    pred = forest.predict(whas500.x)

    expected_c = (0.931881994437717, 70030, 5119, 0, 14)
    assert_cindex_almost_equal(
        whas500.y["fstat"], whas500.y["lenfol"], pred, expected_c)
def test_fit_with_small_max_samples(make_whas500):
    whas500 = make_whas500(to_numeric=True)

    # First fit with no restriction on max samples
    est1 = RandomSurvivalForest(
        n_estimators=1,
        random_state=1,
        max_samples=None,
    )

    # Second fit with max samples restricted to just 2
    est2 = RandomSurvivalForest(
        n_estimators=1,
        random_state=1,
        max_samples=2,
    )

    est1.fit(whas500.x, whas500.y)
    est2.fit(whas500.x, whas500.y)

    tree1 = est1.estimators_[0].tree_
    tree2 = est2.estimators_[0].tree_

    msg = "Tree without `max_samples` restriction should have more nodes"
    assert tree1.node_count > tree2.node_count, msg
@pytest.mark.parametrize("func", ["predict_survival_function",
                                  "predict_cumulative_hazard_function"])
def test_predict_step_function_warning(make_whas500, func):
    whas500 = make_whas500(to_numeric=True)
    forest = RandomSurvivalForest(n_estimators=3, oob_score=True, random_state=2)
    forest.fit(whas500.x, whas500.y)

    pred_fn = getattr(forest, func)

    with pytest.warns(
            FutureWarning,
            match="{} will return an array of StepFunction instances in 0.14".format(func)):
        pred_fn(whas500.x)
@pytest.mark.parametrize("func", ["predict_survival_function",
                                  "predict_cumulative_hazard_function"])
def test_pipeline_predict(breast_cancer, func):
    X_str, _ = load_breast_cancer()
    X_num, y = breast_cancer

    est = RandomSurvivalForest(n_estimators=10, random_state=1)
    est.fit(X_num[10:], y[10:])

    pipe = make_pipeline(OneHotEncoder(),
                         RandomSurvivalForest(n_estimators=10, random_state=1))
    pipe.fit(X_str[10:], y[10:])

    tree_pred = getattr(est, func)(X_num[:10], return_array=True)
    pipe_pred = getattr(pipe, func)(X_str[:10], return_array=True)

    assert_array_almost_equal(tree_pred, pipe_pred)
def test_fit_predict(make_whas500):
    whas500 = make_whas500(to_numeric=True)
    forest = RandomSurvivalForest(random_state=2)
    forest.fit(whas500.x, whas500.y)

    assert len(forest.estimators_) == 100

    pred = forest.predict(whas500.x)
    assert numpy.isfinite(pred).all()
    assert numpy.all(pred >= 0)

    # (c-index, concordant, discordant, tied_risk, tied_time)
    expected_c = (0.9026201280123488, 67831, 7318, 0, 14)
    assert_cindex_almost_equal(whas500.y["fstat"], whas500.y["lenfol"],
                               pred, expected_c)
def fit_regressors(self, features, performances, random_state):
    imputer = [SimpleImputer() for _ in range(self.num_algorithms)]
    scaler = [StandardScaler() for _ in range(self.num_algorithms)]
    models = [
        RandomSurvivalForest(
            n_estimators=self.n_estimators,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            bootstrap=self.bootstrap,
            oob_score=self.oob_score,
            n_jobs=1,
            random_state=random_state)
        for _ in range(self.num_algorithms)
    ]

    for alg_id in range(self.num_algorithms):
        # prepare survival forest dataset and split the data accordingly
        X_train, Y_train = self.construct_dataset_for_algorithm_id(
            features, performances, alg_id, self.algorithm_cutoff_time)
        # note: the imputer is fit on the raw `features`, which overwrites
        # the X_train returned on the line above
        X_train = imputer[alg_id].fit_transform(features)
        X_train = scaler[alg_id].fit_transform(X_train)
        models[alg_id].fit(X_train, Y_train)

    return imputer, scaler, models
@pytest.mark.parametrize("func", ["predict_survival_function",
                                  "predict_cumulative_hazard_function"])
def test_predict_step_function(make_whas500, func):
    whas500 = make_whas500(to_numeric=True)
    forest = RandomSurvivalForest(n_estimators=10, random_state=2)
    forest.fit(whas500.x[10:], whas500.y[10:])

    pred_fn = getattr(forest, func)
    ret_array = pred_fn(whas500.x[:10], return_array=True)
    fn_array = pred_fn(whas500.x[:10], return_array=False)

    assert ret_array.shape[0] == fn_array.shape[0]
    for fn, arr in zip(fn_array, ret_array):
        assert_array_almost_equal(fn.x, forest.event_times_)
        assert_array_almost_equal(fn.y, arr)
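# Added sketch (not part of the original tests): the StepFunction instances
# returned with `return_array=False` are also callable, so a predicted curve
# can be evaluated directly at time points within the observed range.
# Assumes a fitted `forest`, the `pred_fn` bound method, and the `whas500`
# fixture from the test above.
def example_evaluate_step_function(pred_fn, whas500):
    fn = pred_fn(whas500.x[:1], return_array=False)[0]
    # evaluate the first sample's curve at the first, middle, and last event times
    times = fn.x[[0, len(fn.x) // 2, -1]]
    return fn(times)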
def _random_survival_forest(formatted_x, formatted_y, test_size, n_estimators,
                            min_samples_split, min_samples_leaf, max_features,
                            random_state):
    x_train, x_test, y_train, y_test = train_test_split(
        formatted_x, formatted_y, test_size=test_size, random_state=random_state)
    cur_rsf = RandomSurvivalForest(n_estimators=n_estimators,
                                   min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf,
                                   max_features=max_features,
                                   n_jobs=-1,
                                   random_state=random_state)
    cur_rsf.fit(x_train, y_train)
    # score() returns Harrell's concordance index on the held-out split
    return cur_rsf.score(x_test, y_test)
def test_fit_predict_chf(make_whas500):
    whas500 = make_whas500(to_numeric=True)
    forest = RandomSurvivalForest(n_estimators=10, random_state=2)
    forest.fit(whas500.x, whas500.y)

    assert len(forest.estimators_) == 10

    chf = forest.predict_cumulative_hazard_function(whas500.x)
    assert chf.shape == (500, forest.event_times_.shape[0])

    assert numpy.isfinite(chf).all()
    assert numpy.all(chf >= 0.0)

    vals, counts = numpy.unique(chf[:, 0], return_counts=True)
    assert vals[0] == 0.0
    assert numpy.max(counts) == counts[0]

    # the cumulative hazard must be non-decreasing over time
    d = numpy.apply_along_axis(numpy.diff, 1, chf)
    assert (d >= 0).all()
def test_fit_predict_surv(make_whas500):
    whas500 = make_whas500(to_numeric=True)
    forest = RandomSurvivalForest(n_estimators=10, random_state=2)
    forest.fit(whas500.x, whas500.y)

    assert len(forest.estimators_) == 10

    surv = forest.predict_survival_function(whas500.x)
    assert surv.shape == (500, forest.event_times_.shape[0])

    assert numpy.isfinite(surv).all()
    assert numpy.all(surv >= 0.0)
    assert numpy.all(surv <= 1.0)

    vals, counts = numpy.unique(surv[:, 0], return_counts=True)
    assert vals[-1] == 1.0
    assert numpy.max(counts) == counts[-1]

    # the survival function must be non-increasing over time
    d = numpy.apply_along_axis(numpy.diff, 1, surv)
    assert (d <= 0).all()
def test_oob_score(make_whas500):
    whas500 = make_whas500(to_numeric=True)
    forest = RandomSurvivalForest(oob_score=True, bootstrap=False, random_state=2)

    with pytest.raises(ValueError, match="Out of bag estimation only available "
                                         "if bootstrap=True"):
        forest.fit(whas500.x, whas500.y)

    forest.set_params(bootstrap=True)
    forest.fit(whas500.x, whas500.y)

    assert forest.oob_prediction_.shape == (whas500.x.shape[0],)
    assert round(abs(forest.oob_score_ - 0.753010685), 6) == 0.0
def __init__(self, n_estimators=100):
    super().__init__()
    self.n_estimators = n_estimators
    self.name = 'RandomSurvForest'
    self.model = RandomSurvivalForest(n_estimators=self.n_estimators)
    self.direction = 1
    self.prob_FLAG = True

    self.explained = "*Random Survival Forest"
    self.image_name = "RandomSurvForest.png"
    self.image_size = (500, 500)
def test_fit_int_time(make_whas500):
    whas500 = make_whas500(to_numeric=True)
    y = whas500.y
    y_int = numpy.empty(y.shape[0],
                        dtype=[(y.dtype.names[0], bool), (y.dtype.names[1], int)])
    y_int[:] = y

    forest_f = RandomSurvivalForest(oob_score=True, random_state=2).fit(
        whas500.x[50:], y[50:])
    forest_i = RandomSurvivalForest(oob_score=True, random_state=2).fit(
        whas500.x[50:], y_int[50:])

    assert len(forest_f.estimators_) == len(forest_i.estimators_)
    assert forest_f.n_features_ == forest_i.n_features_
    assert forest_f.oob_score_ == forest_i.oob_score_
    assert_array_almost_equal(forest_f.event_times_, forest_i.event_times_)

    pred_f = forest_f.predict(whas500.x[:50])
    pred_i = forest_i.predict(whas500.x[:50])

    assert_array_almost_equal(pred_f, pred_i)
        'T2w_contrast_mean']],
    data[['tum_vol', 'ADC_ave', 'T2w_ave']],
    data[['tum_area_from_vol', 'dens_ADCT2w']],
    data[['init_tum_area']],
    data[['TTum330_alphaG1120_vd02']]
]

y = np.zeros(76, dtype={
    'names': ('bio_rec_6', 'bio_rec_6_delay'),
    'formats': ('bool', 'int')
})
y['bio_rec_6'] = data[['bio_rec_6']].to_numpy().ravel()
y['bio_rec_6_delay'] = data[['bio_rec_6_delay']].to_numpy().ravel()

#rsf = RandomSurvivalForest(n_estimators=100, min_samples_split=10,
#                           min_samples_leaf=15, max_features="sqrt")
rsf = RandomSurvivalForest(n_estimators=100, min_samples_leaf=15)

scores = np.zeros((N, len(tx)))
for j in range(N):
    cv = StratifiedKFold(n_splits=K, shuffle=True)
    for i, x in enumerate(tx):
        x = x.to_numpy()
        for k, (train, test) in enumerate(cv.split(x, y['bio_rec_6'])):
            xtrain, ytrain = x[train], y[train]
            xtest, ytest = x[test], y[test]
            rsf.fit(xtrain, ytrain)
            scores[j, i] += rsf.score(xtest, ytest)
scores /= K

mean_scores = np.mean(scores, axis=0)
def test_fit_max_samples(make_whas500, max_samples, exc_type, exc_msg):
    whas500 = make_whas500(to_numeric=True)
    forest = RandomSurvivalForest(max_samples=max_samples)
    with pytest.raises(exc_type, match=exc_msg):
        forest.fit(whas500.x, whas500.y)
def test_regression(self):
    seed = 15
    train_performances = self.train_performances
    train_features = self.train_inst
    test_performances = self.test_performances
    test_features = self.test_inst

    # build one training row per (instance, algorithm) pair, with the
    # algorithm one-hot encoded next to the instance features
    dataset = []
    indices = []
    for inst_index, row in train_performances.iterrows():
        for alg_index, algorithm in enumerate(self.algorithms):
            cur_features = self.train_inst.loc[inst_index]
            alg_enc = len(self.algorithms) * [0]
            alg_enc[alg_index] = 1
            alg_one_hot = pd.Series(alg_enc, index=self.algorithms)
            cur_performance = row.loc[algorithm]
            new_row = cur_features.append(alg_one_hot).append(
                pd.Series(data=[cur_performance], index=["performance"]))
            dataset.append(new_row)
            indices.append(inst_index)
    df_train = pd.DataFrame(dataset, index=indices)
    print(df_train)

    X_train = df_train.iloc[:, :-1]
    y_train = df_train.iloc[:, -1]

    X_train = X_train.to_numpy().astype(np.float64)
    y_train = y_train.to_numpy().astype(np.float64)

    mask = y_train <= 2500.0
    timeouted_runs = ~mask
    # the time at which the observation ends is actually the cutoff, not the par10
    y_train[timeouted_runs] = 700.0
    structured_y_train = np.rec.fromarrays([mask, y_train],
                                           names="terminated,runtime")
    print(structured_y_train)

    model = RandomSurvivalForest(n_estimators=100, n_jobs=1, random_state=seed)
    print("Starting to fit model")
    model.fit(X_train, structured_y_train)

    # evaluate model
    result_data_rsf = []
    for index, row in test_features.iterrows():
        predicted_performances = []
        for alg_index, algorithm in enumerate(self.algorithms):
            cur_features = row
            alg_enc = len(self.algorithms) * [0]
            alg_enc[alg_index] = 1
            alg_one_hot = pd.Series(alg_enc, index=self.algorithms)
            new_row = cur_features.append(alg_one_hot)
            new_row_np = new_row.to_numpy().astype(np.float64).reshape(1, -1)
            predicted_performance = model.predict(new_row_np)
            predicted_performances.append(predicted_performance[0])
        result_data_rsf.append([index, *predicted_performances])

    performance_cols = [x + "_performance" for x in self.algorithms]
    result_columns_rsf = ["problem_instance"] + performance_cols
    results_rsf = pd.DataFrame(data=result_data_rsf, columns=result_columns_rsf)
    print(results_rsf)

    taus = []
    for index, row in self.test_performances.iterrows():
        true_performances = row.to_numpy()
        true_ranking = np.argsort(true_performances)
        predicted_scores = results_rsf.loc[index].to_numpy()[1:]
        predicted_ranking = np.argsort(predicted_scores)[::-1]
        print("true", true_performances[true_ranking])
        print("predicted", predicted_scores[predicted_ranking])
        print(predicted_scores)
        print()
        print("argmax", np.argmax(predicted_scores))
        print("true ranking", true_ranking)
        print("predicted ranking", predicted_ranking)
        print("\n")
        taus.append(kendalltau(true_ranking, predicted_ranking)[0])

    print("taus", taus)
    print("Average Kendalls tau", np.mean(taus))
                                 sort=False).astype(np.float64)
output_train['SurvivalTime'] = output_train['SurvivalTime'].astype(np.float64)

random_state = 20
X_train, X_test, y_train, y_test = train_test_split(
    features_RTF, output_train.to_records(index=False),
    test_size=0.25, random_state=random_state)

# In[832]:

rsf = RandomSurvivalForest(n_estimators=100,
                           min_samples_split=10,
                           min_samples_leaf=15,
                           max_features="sqrt",
                           n_jobs=-1,
                           random_state=random_state)
rsf.fit(X_train, y_train)

# In[833]:

rsf.score(X_test, y_test)

# In[813]:

def get_predict_time_RSF(predict, feature, model, fit=True):
    median_survival_time = np.zeros(np.size(predict, 0))
    times = model.event_times_
    for i in range(np.size(predict, 0)):
                   axis=1)

categorical_ix = [0, 2, 3, 4, 5, 6, 7, 8, 15, 16, 17]
categorical_columns = data_x.columns[categorical_ix].values
data_x_one_hot = pd.get_dummies(data_x, columns=categorical_columns)

data_y = data[['Mortality', 'SurvivalWeeks']]
data_y['Mortality'] = data_y['Mortality'].astype(bool)
data_y = np.array(list(data_y.to_records(index=False)))

X_train, X_test, y_train, y_test = train_test_split(data_x_one_hot, data_y,
                                                    test_size=0.25,
                                                    random_state=369)

a = X_test[X_test['AF_1.0'] == 1].iloc[0:3, :]
demo_x = pd.concat([
    X_test[X_test['AF_1.0'] == 1].iloc[0:3, :],
    X_test[X_test['AF_1.0'] == 0].iloc[0:3, :]
])

rsf = RandomSurvivalForest(n_estimators=1000,
                           min_samples_split=10,
                           min_samples_leaf=15,
                           max_features="sqrt",
                           n_jobs=-1,
                           random_state=369)
rsf.fit(X_train, y_train)
print(rsf.score(X_test, y_test))
print('done')
        labels_crit_train[i] = (True, label_train[i, 0])
    else:
        labels_crit_train[i] = (False, label_train[i, 0])

for i in range(labels_crit_test.shape[0]):
    if label_test[i, 1] == 1:
        labels_crit_test[i] = (True, label_test[i, 0])
    else:
        labels_crit_test[i] = (False, label_test[i, 0])

# Define the parameters of the random survival forest.
# The values below are example placeholders; tune them for your data.
n_estimators = 100        # number of trees
max_depth = None          # maximum depth of each tree
min_samples_split = 10    # minimum samples required to split a node
min_samples_leaf = 5      # minimum samples required at a leaf
max_features = "sqrt"     # number of features considered per split
bootstrap = True

rsf = RandomSurvivalForest(n_estimators=n_estimators,
                           max_depth=max_depth,
                           max_features=max_features,
                           bootstrap=bootstrap,
                           min_samples_split=min_samples_split,
                           min_samples_leaf=min_samples_leaf)

# fit the model
rsf.fit(feat_train, labels_crit_train)

# obtain the c-index on test data
result_test = rsf.score(feat_test, labels_crit_test)

# or if you would like to know the predicted risk scores
risks_test = rsf.predict(feat_test)
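# Alternative sketch (not from the original): the elementwise loops above can
# be replaced by sksurv's helper for building structured survival labels.
# Assumes label_train/label_test store time in column 0 and a 0/1 event
# indicator in column 1, as in the loops above.
from sksurv.util import Surv

labels_crit_train = Surv.from_arrays(event=label_train[:, 1] == 1,
                                     time=label_train[:, 0])
labels_crit_test = Surv.from_arrays(event=label_test[:, 1] == 1,
                                    time=label_test[:, 0])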
    Labels_Crit_Test[i] = (False, Time_Test[i, 0])

# The DL Feature based Prediction model
random_state = 50
n_estimators = 30
max_depth = None
min_samples_split = 8
min_samples_leaf = 6
max_features = 'sqrt'
n_jobs = 6
bootstrap = True

rsf = RandomSurvivalForest(n_estimators=n_estimators,
                           max_depth=max_depth,
                           max_features=max_features,
                           random_state=random_state,
                           bootstrap=bootstrap,
                           min_samples_split=min_samples_split,
                           min_samples_leaf=min_samples_leaf)
rsf.fit(Feat_Train, Labels_Crit_Train)

# The risk scores of each subject
Scores_Train_DL = rsf.predict(Feat_Train)
Scores_Test_DL = rsf.predict(Feat_Test)
Scores_Val_DL = rsf.predict(Feat_Val)

Scores_Train_DL = np.expand_dims(Scores_Train_DL, axis=1)
Scores_Test_DL = np.expand_dims(Scores_Test_DL, axis=1)
Scores_Val_DL = np.expand_dims(Scores_Val_DL, axis=1)

C_Ind_Test_DL_Best = rsf.score(Feat_Test, Labels_Crit_Test)
    'formats': ('?', 'f8')
})
y_struc['vital_status'] = y['event'].astype('bool')
y_struc['os_time'] = y['time']

X = data.copy()

train_scores = []
test_scores = []
for i in tqdm(range(1, 11)):
    random_state = i
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_struc, test_size=0.2, random_state=random_state)
    rsf = RandomSurvivalForest(n_estimators=200,
                               min_samples_split=20,
                               min_samples_leaf=25,
                               max_features="sqrt",
                               n_jobs=-1,
                               random_state=random_state)
    rsf.fit(X_train, y_train)
    train_scores.append(rsf.score(X_train, y_train))
    test_scores.append(rsf.score(X_test, y_test))

print('training', np.asarray(train_scores))
print('training_mean', np.asarray(train_scores).mean(),
      'training_stdev', np.asarray(train_scores).std())
print('test', np.asarray(test_scores))
print('test_mean', np.asarray(test_scores).mean(),
      'test_stdev', np.asarray(test_scores).std())
import preprocessing
from sksurv.ensemble import RandomSurvivalForest
from sklearn.model_selection import cross_validate

""" Reading data """

def load_data(features=None):
    X_df, y_df, _ = preprocessing.load_owkin_data()
    if features is not None:
        X_df = X_df[features]
    X = X_df.to_numpy()
    y = preprocessing.y_dataframe_to_rsf_input(y_df)
    return X_df, y_df, X, y

X_df, y_df, X, y = load_data()
feature_name = list(X_df.columns.values)

""" Train model """

params = {'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 10}
rsf = RandomSurvivalForest(n_estimators=params['n_estimators'],
                           min_samples_split=params['min_samples_split'],
                           min_samples_leaf=params['min_samples_leaf'],
                           max_features="sqrt",
                           n_jobs=-1)
print(cross_validate(rsf, X, y, cv=5))
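# Follow-up sketch (not in the original): cross_validate returns a dict of
# per-fold metrics; for sksurv estimators the default scorer delegates to
# rsf.score, i.e. Harrell's concordance index, so the folds can be
# summarized like this.
import numpy as np

cv_results = cross_validate(rsf, X, y, cv=5)
print("mean c-index: %.3f (+/- %.3f)" % (np.mean(cv_results["test_score"]),
                                         np.std(cv_results["test_score"])))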
X_no_grade = X.drop("tgrade", axis=1)
Xt = OneHotEncoder().fit_transform(X_no_grade)
Xt = np.column_stack((Xt.values, grade_num))

feature_names = X_no_grade.columns.tolist() + ["tgrade"]

random_state = 20
X_train, X_test, y_train, y_test = train_test_split(Xt, y,
                                                    test_size=0.25,
                                                    random_state=random_state)

rsf = RandomSurvivalForest(n_estimators=1000,
                           min_samples_split=10,
                           min_samples_leaf=15,
                           max_features="sqrt",
                           n_jobs=-1,
                           random_state=random_state)
rsf.fit(X_train, y_train)

a = np.empty(X_test.shape[0], dtype=[("age", float), ("pnodes", float)])
a["age"] = X_test[:, 0]
a["pnodes"] = X_test[:, 4]

sort_idx = np.argsort(a, order=["pnodes", "age"])
X_test_sel = pd.DataFrame(
    X_test[np.concatenate((sort_idx[:3], sort_idx[-3:]))],
    columns=feature_names)

pd.Series(rsf.predict(X_test_sel))
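# Follow-up sketch (an assumption, mirroring the scikit-survival GBSG2
# walkthrough this snippet resembles): plot the predicted survival curves for
# the six selected test samples, using the `rsf`/`X_test_sel` objects above
# and the older `return_array` API used elsewhere in these snippets.
import matplotlib.pyplot as plt

surv = rsf.predict_survival_function(X_test_sel, return_array=True)
for i, s in enumerate(surv):
    # one row of survival probabilities per sample, evaluated at event_times_
    plt.step(rsf.event_times_, s, where="post", label=str(i))
plt.ylabel("Survival probability")
plt.xlabel("Time")
plt.legend()
plt.grid(True)
plt.show()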
def test_fit_warm_start(make_whas500):
    whas500 = make_whas500(to_numeric=True)
    forest = RandomSurvivalForest(n_estimators=11, max_depth=2, random_state=2)
    forest.fit(whas500.x, whas500.y)

    assert len(forest.estimators_) == 11
    assert all(e.max_depth == 2 for e in forest.estimators_)

    forest.set_params(warm_start=True)
    with pytest.warns(UserWarning,
                      match="Warm-start fitting without increasing "
                            "n_estimators does not fit new trees."):
        forest.fit(whas500.x, whas500.y)

    forest.set_params(n_estimators=3)
    with pytest.raises(ValueError,
                       match="n_estimators=3 must be larger or equal to "
                             r"len\(estimators_\)=11 when warm_start==True"):
        forest.fit(whas500.x, whas500.y)

    forest.set_params(n_estimators=23)
    forest.fit(whas500.x, whas500.y)

    assert len(forest.estimators_) == 23
    assert all(e.max_depth == 2 for e in forest.estimators_)
cph = CoxPHSurvivalAnalysis()
c_index['cph'] = cross_val_score(cph, X_ss, y_ss,
                                 cv=k_fold,
                                 scoring=c_index_scorer,
                                 verbose=1)

# %%
# Random Survival Forests (RSFs)

# %%
X_rsf = df.copy(deep=True)

rsf = RandomSurvivalForest(random_state=SEED, n_jobs=-1, verbose=True)
rsf.criterion = 'log_rank'

if not (PATH_PARAMETERS / 'param_search_results_rsf.pkl').exists():
    helpers.save_best_params(rsf, X_rsf, y_ss, PATH_PARAMETERS,
                             k_fold=k_fold,
                             scorer=c_index_scorer,
                             n_iter=25)
param_search_results_rsf = load(PATH_PARAMETERS / 'param_search_results_rsf.pkl')
c_index['rsf'] = helpers.get_best_test_scores(param_search_results_rsf)
print('Testing on %d-----------------------------' % val_id)
print(x_train.shape, x_val.shape)

# special for RSF
dt = np.dtype('bool,float')
y_train_surv = np.array([(bool(e), y) for e, y in zip(e_train, y_train)], dtype=dt)
y_val_surv = np.array([(bool(e), y) for e, y in zip(e_val, y_val)], dtype=dt)
print(y_train_surv.shape, y_val_surv.shape)

# train RSF
rsf = RandomSurvivalForest(n_estimators=200,
                           min_samples_split=70,
                           min_samples_leaf=30,
                           max_features="log2",
                           oob_score=True,
                           n_jobs=-1,
                           random_state=20)
rsf.fit(x_train, y_train_surv)

preds = -rsf.predict(x_val)
cindex_train = rsf.score(x_train, y_train_surv)
cindex_oob = rsf.oob_score_
cindex_val = rsf.score(x_val, y_val_surv)
cindex_val_events = rsf.score(x_val[e_val == 1], y_val_surv[e_val == 1])

rsf_cindex_trains.append(cindex_train)
rsf_cindex_vals.append(cindex_val)
rsf_cindex_vals_events.append(cindex_val_events)
def RSF_bootstrap(fp, num=False):
    df = pd.read_csv(fp, index_col=0)

    # configure bootstrap (sampling 50% of data)
    n_iterations = 100
    n_size = int(len(df) * 0.50)

    # parameters
    NUMESTIMATORS = 100
    TESTSIZE = 0.20
    random_state = 20

    # calculate population of statistics
    metrics = []
    for i in range(n_iterations):
        # prepare sample
        # if indicated, include number of mets (col 42)
        if num:
            sample = resample(df.iloc[:, np.r_[:20, 40, 41, 42]], n_samples=n_size)
            # after subsetting, the number-of-mets column sits at position 22
            # (the original indexed position 42, which no longer exists here)
            X = sample.iloc[:, np.r_[:20, 22]].copy()
        else:
            sample = resample(df.iloc[:, np.r_[:20, 40, 41]], n_samples=n_size)
            X = sample.iloc[:, :20].copy()
        X = X.to_numpy().astype('float64')
        y = sample[['Event', 'Time']].copy()
        y['Event'] = y['Event'].astype('bool')
        y['Time'] = y['Time'].astype('float64')
        y = y.to_records(index=False)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TESTSIZE, random_state=random_state)
        rsf = RandomSurvivalForest(n_estimators=NUMESTIMATORS,
                                   min_samples_split=15,
                                   min_samples_leaf=8,
                                   max_features="sqrt",
                                   n_jobs=-1,
                                   random_state=random_state)
        rsf.fit(X_train, y_train)
        score = rsf.score(X_test, y_test)
        metrics.append(score)

    # calculate confidence interval
    alpha = 0.95
    p = ((1.0 - alpha) / 2.0) * 100
    lower = max(0.0, np.percentile(metrics, p))
    p = (alpha + ((1.0 - alpha) / 2.0)) * 100
    upper = min(1.0, np.percentile(metrics, p))
    med = np.percentile(metrics, 50)

    # identify aggregation method name
    if num:
        name = fp.split('/')[-1].split('_')[0] + ' + NumMets'
    else:
        name = fp.split('/')[-1].split('_')[0]

    print(name, 'RSF', '%.3f (%.3f-%.3f)' % (med, lower, upper))
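# Hypothetical usage (the CSV path is made up, not from the original): the
# function expects the first 20 columns to hold features, Event/Time at
# positions 40/41, and the number of mets at position 42.
# RSF_bootstrap('data/mean_aggregated.csv', num=True)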