def convert_dataset(dataset):
    """Split a survival frame into encoded features, targets and a timed copy.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``"status"`` and ``"time"``; every other
        column is treated as a feature.

    Returns
    -------
    tuple
        ``(data_x_numeric, pd_y_values, test_timed_data)`` where

        * ``data_x_numeric`` is the output of ``OneHotEncoder().fit_transform``
          on the feature columns (object columns are cast to ``category``
          first),
        * ``pd_y_values`` has the columns ``["time", "event"]`` with
          ``event`` being ``status`` cast to ``bool``,
        * ``test_timed_data`` is a copy of ``data_x_numeric`` with an extra
          ``"time"`` column taken from ``pd_y_values``.
    """
    # Explicit copy: the dtype conversion below must not write into a slice of
    # the caller's frame (that is what raises SettingWithCopyWarning).
    feature_mask = (dataset.columns != "status") & (dataset.columns != "time")
    data_x_numeric = dataset.loc[:, feature_mask].copy()

    # convert string columns to categorical type
    for col in data_x_numeric.columns:
        if str(data_x_numeric[col].dtype) == "object":
            data_x_numeric[col] = data_x_numeric[col].astype('category')

    data_x_numeric = OneHotEncoder().fit_transform(data_x_numeric)

    data_y = dataset[["status", "time"]].copy()
    data_y["status"] = data_y["status"].astype('bool')

    # `index=int` maps every index label through int(), as the original did.
    pd_y_values = data_y.rename(index=int, columns={"status": "event"})
    pd_y_values = pd_y_values.reindex(columns=["time", "event"])

    # test on sorted input data
    test_timed_data = data_x_numeric.copy()
    test_timed_data['time'] = pd_y_values["time"]

    return data_x_numeric, pd_y_values, test_timed_data
def fit_and_prepare(x_train, y_train, test_df):
    """Fit a Cox proportional hazards model and encode the test features.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features; object-dtype columns are treated as categorical.
    y_train : pandas.DataFrame
        Training targets; must contain a ``specific_death`` column, which is
        cast to ``bool`` before the frame is turned into a record array.
    test_df : pandas.DataFrame
        Test features with the same columns as ``x_train``.

    Returns
    -------
    tuple
        ``(estimator, x_test, x_train, y_train)``: the fitted
        ``CoxPHSurvivalAnalysis``, the encoded test features, the training
        features with categorical dtypes applied, and the targets as a
        record array.

    Notes
    -----
    The inputs are copied first, so the caller's frames are left untouched.
    """
    # Work on copies: the dtype conversions below would otherwise leak into the
    # caller's frames (or warn when those frames are slices of another frame).
    x_train = x_train.copy()
    y_train = y_train.copy()
    test_df = test_df.copy()

    # 3.1. Prepare Y-----
    # Bracket assignment is the supported way to replace a column.
    y_train["specific_death"] = y_train["specific_death"].astype(bool)
    # Transform it into a structured array
    y_train = y_train.to_records(index=False)

    # 3.2. Prepare X-----
    # Columns of x_train whose dtype is object are the categorical ones.
    categorical_cols = x_train.columns[x_train.dtypes == object].tolist()
    # Ensure categorical columns are category type in both frames
    for col in categorical_cols:
        x_train[col] = x_train[col].astype('category')
        test_df[col] = test_df[col].astype('category')

    # 3.3. Fit model-----
    encoder = OneHotEncoder()
    estimator = CoxPHSurvivalAnalysis()
    estimator.fit(encoder.fit_transform(x_train), y_train)

    # transform the test variables to match the train
    x_test = encoder.transform(test_df)

    return (estimator, x_test, x_train, y_train)
def test_transform(create_data):
    """A fitted encoder transforms new data, regardless of column order."""
    fit_frame, _ = create_data()
    encoder = OneHotEncoder().fit(fit_frame)

    sample, expected = create_data(165)
    tm.assert_frame_equal(encoder.transform(sample), expected)

    # Move columns 2-4 behind the remaining ones; the encoded result must
    # still equal the expected frame.
    leading = sample.iloc[:, :2]
    moved = sample.iloc[:, 2:5]
    trailing = sample.iloc[:, 5:]
    reordered = pd.concat((leading, trailing, moved), axis=1)
    tm.assert_frame_equal(encoder.transform(reordered), expected)
def test_transform_other_columns(create_data):
    """transform() raises ValueError naming every fitted column that is missing."""
    fit_frame, _ = create_data()
    encoder = OneHotEncoder().fit(fit_frame)

    frame, _ = create_data(125)

    # (frame lacking one or more fitted columns, expected error pattern)
    cases = (
        (
            frame.rename(columns={"binary_1": "renamed_1"}),
            r"1 features are missing from data: \['binary_1'\]",
        ),
        (
            frame.drop('trinary', axis=1),
            r"1 features are missing from data: \['trinary'\]",
        ),
        (
            frame.rename(columns={"binary_1": "renamed_1", "many": "too_many"}),
            r"2 features are missing from data: \['binary_1', 'many'\]",
        ),
    )

    for broken_frame, pattern in cases:
        with pytest.raises(ValueError, match=pattern):
            encoder.transform(broken_frame)
def test_fit_unpenalized():
    """Cox model with per-feature alpha: the first five columns are unpenalized.

    Rows whose grade equals "unkown" are removed, grade is re-encoded with
    three explicit categories, and the fitted coefficients are compared with
    reference values.
    """
    X, y = load_breast_cancer()

    keep_rows = X["grade"] != "unkown"
    X = X.loc[keep_rows, :]
    y = y[keep_rows.values]

    grade_values = pandas.Categorical(
        X["grade"].astype(object),
        categories=["intermediate", "poorly differentiated", "well differentiated"])
    X["grade"] = pandas.Series(grade_values, index=X.index, name="grade")

    X = OneHotEncoder().fit_transform(X)

    cols_unpen = ['age', 'size', 'grade=poorly differentiated',
                  'grade=well differentiated', 'er=positive']
    # Put the unpenalized columns first so they line up with the zero penalties.
    unpenalized_part = X.loc[:, cols_unpen]
    penalized_part = X.drop(cols_unpen, axis=1)
    X = pandas.concat((unpenalized_part, penalized_part), axis=1)

    n_unpen = len(cols_unpen)
    penalties = numpy.concatenate(
        (numpy.zeros(n_unpen), numpy.ones(X.shape[1] - n_unpen)))

    cph = CoxPHSurvivalAnalysis(alpha=penalties)
    cph.fit(X, y)

    expected_coef = numpy.array([
        -0.0228825990482334, 0.635554486750423, -0.242079636336473,
        -1.30197563647684, -2.27790151300312, 0.291950212930807,
        0.210861165049552, -0.612456645638769, -0.453414844486013,
        -0.1239424190253, 0.196855946938761, 1.08724198521351,
        -0.313645443818603, -0.660016141198812, 1.07104977404073,
        0.559632480471393, -0.47740746012516, -1.26199769642326,
        -1.40486191330444, -0.418517018253652, 0.284936091689505,
        -0.215531076378674, -0.200889269720281, 0.341231176941461,
        0.0307350667648337, -0.212527052910377, -0.3019678509188,
        0.54491723178866, -0.286914381308269, 0.370374100647823,
        -0.496258248067704, 0.624528657777646, 0.287884026214139,
        0.022095151910937, 0.910293732936019, -0.13076488639207,
        0.0857209529827562, -0.0922302696963889, 0.498136631416287,
        0.937133644376614, 0.395090607856869, -1.04727952099579,
        -0.54974694800345, 0.442372971174454, -0.745558450753062,
        -0.0920496108021893, 0.75549238586293, 0.562496351046743,
        0.259183349320614, 0.405816113039412, -0.0969485695700491,
        -0.507388915258978, -0.474246597197329, -0.209335517183595,
        0.187390427612498, -0.0522568530719332, 0.0806559868641646,
        -0.0397654339013217, -0.269582356665396, 0.791793553908743,
        0.344208857844796, -0.180165785909583, -0.7927695046551,
        0.0311635012097026, -0.579429950080662, -0.264770995160963,
        0.869512689697827, 0.765479119494175, -0.173588059680979,
        -0.199781736503338, -0.58712767650975, -0.457389854855,
        0.3891865514653, 0.707309743580534, -0.121997864690072,
        0.0447174402649954, 0.0319336975869795, 0.0117988435665652,
        -0.593691059339064, -0.838107176656365, -0.247955128152877
    ])

    assert_array_almost_equal(cph.coef_, expected_coef)
def test_alpha_too_small(self):
    """Fitting Coxnet with one small alpha on a row subset raises ArithmeticError."""
    X, y = load_breast_cancer()
    encoded = OneHotEncoder().fit_transform(X)

    # Row positions of the subset used for the fit.
    rows = numpy.array([
        0, 1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21,
        22, 23, 25, 26, 27, 28, 29, 30, 31, 33, 34, 36, 39, 40, 41, 42, 43,
        44, 45, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 60, 61, 62, 63,
        64, 65, 66, 68, 70, 71, 72, 75, 76, 78, 79, 80, 82, 84, 85, 86, 87,
        88, 90, 91, 92, 93, 94, 95, 98, 99, 100, 102, 103, 104, 105, 107,
        108, 109, 110, 111, 113, 114, 115, 116, 117, 118, 119, 120, 121, 124,
        125, 126, 127, 128, 130, 131, 132, 133, 135, 136, 137, 138, 139, 140,
        143, 144, 145, 147, 148, 150, 151, 153, 154, 155, 156, 157, 158, 160,
        161, 164, 165, 166, 167, 168, 169, 170, 171, 172, 174, 175, 177, 178,
        180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191, 192, 193, 194,
        195, 196, 197
    ])

    model = CoxnetSurvivalAnalysis(alphas=[0.007295025406624247], l1_ratio=1.0)
    subset_x = encoded.iloc[rows]
    subset_y = y[rows]

    with self.assertRaisesRegex(
            ArithmeticError,
            "Numerical error, because weights are too large. Consider increasing alpha."):
        model.fit(subset_x, subset_y)
def test_fit(create_data):
    """fit() records feature names, encoded column names and categories."""
    frame, expected = create_data()
    encoder = OneHotEncoder().fit(frame)

    categorical_columns = ['binary_1', 'binary_2', 'trinary', 'many']
    expected_categories = {
        name: frame[name].cat.categories for name in categorical_columns
    }

    assert encoder.feature_names_.tolist() == categorical_columns
    assert set(encoder.encoded_columns_) == set(expected.columns)
    assert encoder.categories_ == expected_categories
def test_fit(self):
    """fit() records feature names, encoded column names and categories."""
    frame = create_data()
    expected = encoded_data(frame)

    encoder = OneHotEncoder().fit(frame)

    categorical_columns = ['binary_1', 'binary_2', 'trinary', 'many']
    expected_categories = {
        name: frame[name].cat.categories for name in categorical_columns
    }

    self.assertListEqual(encoder.feature_names_.tolist(), categorical_columns)
    self.assertSetEqual(set(encoder.encoded_columns_), set(expected.columns))
    self.assertDictEqual(encoder.categories_, expected_categories)
def test_brier_coxph():
    """Brier score at t=1825 of an Efron-ties Cox model fitted on GBSG2.

    ``tgrade`` is replaced by the string length of its value before the
    one-hot encoding, and the model is evaluated on its own training data.
    """
    X, y = load_gbsg2()

    # Replace the column rather than writing through `.loc[:, "tgrade"]`:
    # under pandas >= 2 the `.loc` form sets values into the existing column
    # in place, which does not swap its dtype for the new integer values.
    X["tgrade"] = X["tgrade"].map(len).astype(int)

    Xt = OneHotEncoder().fit_transform(X)

    est = CoxPHSurvivalAnalysis(ties="efron").fit(Xt, y)
    survs = est.predict_survival_function(Xt)

    # Survival probability of every sample at the evaluation time.
    preds = [fn(1825) for fn in survs]

    _, score = brier_score(y, y, preds, 1825)

    assert round(abs(score[0] - 0.208817407492645), 5) == 0
def test_pipeline_predict(breast_cancer, name, func):
    """A forest inside a OneHotEncoder pipeline predicts like one fitted directly.

    ``name`` selects the estimator class from ``FORESTS`` and ``func`` is the
    name of the prediction method that is compared.
    """
    X_str, _ = load_breast_cancer()
    X_num, y = breast_cancer

    forest_cls = FORESTS[name]
    forest_kwargs = {"n_estimators": 10, "random_state": 1}

    # Reference: estimator fitted on the numeric features, first 10 rows held out.
    direct = forest_cls(**forest_kwargs)
    direct.fit(X_num[10:], y[10:])

    # Same estimator behind the encoder, fitted on the raw features.
    piped = make_pipeline(OneHotEncoder(), forest_cls(**forest_kwargs))
    piped.fit(X_str[10:], y[10:])

    direct_pred = getattr(direct, func)(X_num[:10], return_array=True)
    piped_pred = getattr(piped, func)(X_str[:10], return_array=True)

    assert_array_almost_equal(direct_pred, piped_pred)
def convert_dataset(dataset):
    """Split a GBSG2-style frame into encoded features, targets and a timed copy.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the feature columns ``horTh, age, menostat, tsize,
        tgrade, pnodes, progrec, estrec`` plus ``time`` and ``cens``.

    Returns
    -------
    tuple
        ``(data_x_numeric, pd_y_values, test_timed_data)`` where

        * ``data_x_numeric`` is the output of ``OneHotEncoder().fit_transform``
          on the feature columns (object columns are cast to ``category``
          first),
        * ``pd_y_values`` has the columns ``["time", "event"]`` with ``event``
          being ``cens`` cast to ``bool``,
        * ``test_timed_data`` is a copy of ``data_x_numeric`` with an extra
          ``"time"`` column taken from ``pd_y_values``.

    Notes
    -----
    ``dataset`` itself is not modified; the conversion runs on a copy of the
    feature columns.
    """
    feature_columns = [
        "horTh", "age", "menostat", "tsize", "tgrade", "pnodes", "progrec",
        "estrec"
    ]
    features = dataset[feature_columns].copy()

    # convert string columns to categorical type
    # Plain column assignment replaces the column and its dtype. The previous
    # `dataset.loc[:, col] = ...` form writes values into the existing object
    # column under pandas >= 2, so the category dtype was silently lost.
    for col in features.columns:
        if str(features[col].dtype) == "object":
            features[col] = features[col].astype('category')

    data_x_numeric = OneHotEncoder().fit_transform(features)

    data_y = dataset[["cens", "time"]].copy()
    data_y["cens"] = data_y["cens"].astype('bool')

    # `index=int` maps every index label through int(), as the original did.
    pd_y_values = data_y.rename(index=int, columns={"cens": "event"})
    pd_y_values = pd_y_values.reindex(columns=["time", "event"])

    # test on sorted input data
    test_timed_data = data_x_numeric.copy()
    test_timed_data['time'] = pd_y_values["time"]

    return data_x_numeric, pd_y_values, test_timed_data
def test_pipeline_predict(func):
    """RandomSurvivalForest in a OneHotEncoder pipeline matches a direct fit.

    ``func`` is the name of the prediction method that is compared.
    """
    X_str, y = load_breast_cancer()
    X_num = encode_categorical(X_str)

    forest_kwargs = {"n_estimators": 10, "random_state": 1}

    # Reference: forest fitted on pre-encoded features, first 10 rows held out.
    direct = RandomSurvivalForest(**forest_kwargs)
    direct.fit(X_num[10:], y[10:])

    # Same forest behind the encoder, fitted on the raw features.
    piped = make_pipeline(OneHotEncoder(), RandomSurvivalForest(**forest_kwargs))
    piped.fit(X_str[10:], y[10:])

    direct_pred = getattr(direct, func)(X_num[:10])
    piped_pred = getattr(piped, func)(X_str[:10])

    assert_array_almost_equal(direct_pred, piped_pred)
def test_transform_other_columns(self):
    """transform() raises ValueError naming every fitted column that is missing."""
    data = create_data()
    t = OneHotEncoder().fit(data)

    data = create_data(125)

    # The patterns are raw strings: "\[" and "\]" are regex escapes, and in a
    # normal string literal they are invalid escape sequences that Python warns
    # about. The runtime value of each pattern is unchanged.
    data_renamed = data.rename(columns={"binary_1": "renamed_1"})
    self.assertRaisesRegex(ValueError,
                           r"1 features are missing from data: \['binary_1'\]",
                           t.transform, data_renamed)

    data_dropped = data.drop('trinary', axis=1)
    self.assertRaisesRegex(ValueError,
                           r"1 features are missing from data: \['trinary'\]",
                           t.transform, data_dropped)

    data_renamed = data.rename(columns={"binary_1": "renamed_1", "many": "too_many"})
    self.assertRaisesRegex(ValueError,
                           r"2 features are missing from data: \['binary_1', 'many'\]",
                           t.transform, data_renamed)
def test_pipeline_predict(func):
    """Coxnet in a OneHotEncoder pipeline yields the same functions as a direct fit.

    ``func`` is the name of the prediction method; each returned object is
    compared through its ``x`` and ``y`` attributes.
    """
    X_str, y = load_breast_cancer()
    X_num = column.encode_categorical(X_str)

    coxnet_kwargs = {
        "alpha_min_ratio": 0.0001,
        "l1_ratio": 1.0,
        "fit_baseline_model": True,
    }

    # Reference: model fitted on pre-encoded features, first 10 rows held out.
    direct = CoxnetSurvivalAnalysis(**coxnet_kwargs)
    direct.fit(X_num[10:], y[10:])

    # Same model behind the encoder, fitted on the raw features.
    piped = make_pipeline(OneHotEncoder(), CoxnetSurvivalAnalysis(**coxnet_kwargs))
    piped.fit(X_str[10:], y[10:])

    direct_pred = getattr(direct, func)(X_num[:10])
    piped_pred = getattr(piped, func)(X_str[:10])

    for fn_direct, fn_piped in zip(direct_pred, piped_pred):
        assert_array_almost_equal(fn_direct.x, fn_piped.x)
        assert_array_almost_equal(fn_direct.y, fn_piped.y)
def test_fit_transform(create_data):
    """fit_transform() on the fixture data yields the fixture's expected frame."""
    frame, expected = create_data()

    encoded = OneHotEncoder().fit_transform(frame)

    tm.assert_frame_equal(encoded, expected)
plt.legend(loc="best")

# One Kaplan-Meier step curve per distinct value of the "Celltype" column.
for value in data_x["Celltype"].unique():
    mask = data_x["Celltype"] == value
    time_cell, survival_prob_cell = kaplan_meier_estimator(
        data_y["Status"][mask], data_y["Survival_in_days"][mask])
    plt.step(time_cell, survival_prob_cell, where="post",
             label="%s (n = %d)" % (value, mask.sum()))

# Raw string: "\h" is not a valid Python escape sequence, so the backslash of
# the "\hat" in the label must be kept literally.
plt.ylabel(r"est. probability of survival $\hat{S}(t)$")
plt.xlabel("time $t$")
plt.legend(loc="best")

from sksurv.preprocessing import OneHotEncoder

# One-hot encode the feature frame; the result is used as model input below.
data_x_numeric = OneHotEncoder().fit_transform(data_x)
data_x_numeric.head()

from sksurv.linear_model import CoxPHSurvivalAnalysis

estimator = CoxPHSurvivalAnalysis()
estimator.fit(data_x_numeric, data_y)

# Coefficients labelled with the encoded column names.
pd.Series(estimator.coef_, index=data_x_numeric.columns)

# Four hand-written rows (keys 1-4) laid out in the encoded column order.
x_new = pd.DataFrame.from_dict({
    1: [65, 0, 0, 1, 60, 1, 0, 1],
    2: [65, 0, 0, 1, 60, 1, 0, 0],
    3: [65, 0, 1, 0, 60, 1, 0, 0],
    4: [65, 0, 1, 0, 60, 1, 0, 1]},
    columns=data_x_numeric.columns, orient='index')
# Collect and drop every column whose `censored_percentage` entry exceeds 0.6.
for col in x.columns:
    if censored_percentage[col] > 0.6:
        not_enough_data.append(col)
x = x.drop(not_enough_data, axis=1)

# Impute missing values with mode
x = CustomImputer(strategy='mode').fit_transform(x)

# Removes low-variance categorical features
categorical = x.select_dtypes(['object']).columns
# `astype` on the selection returns a new frame, so nothing is assigned into
# a slice of `x` (the earlier chained assignment could warn or be lost).
cat = x[categorical].astype('category')
cat = OneHotEncoder().fit_transform(cat)

selector = VarianceThreshold(.8 * (1 - .8))
# Fit once and reuse the result for both the values and the kept labels.
selected_values = selector.fit_transform(cat)
columns = cat.columns
labels_c = [columns[index] for index in selector.get_support(indices=True)]
selected_categorical = pd.DataFrame(selected_values, columns=labels_c)

# Feature selection for numeric features
numeric = x.select_dtypes(['float64']).columns
num = x[numeric]
selector = SelectFpr(score_func=f_regression, alpha=0.05)
selected_numeric = selector.fit_transform(num, survival)
columns = num.columns
]] T = df['Duration'] E = df['LapseIndicator'].apply(lambda x: True if x == 1 else False) df2['E'] = E df2['T'] = T X, y = get_x_y(df2, ['E', 'T'], pos_label=True) for c in X.columns.values: if c != 'AGE AT DOC': X[c] = X[c].astype('category') data_x_numeric = OneHotEncoder().fit_transform(X) #%% estimator = CoxnetSurvivalAnalysis(verbose=True) estimator.fit(data_x_numeric, y) #%% print(estimator.score(data_x_numeric, y)) print() scores = fit_and_score_features(data_x_numeric.values, y) print( pd.Series(scores, index=data_x_numeric.columns).sort_values(ascending=False)) #%%
# Optional preprocessing steps, currently disabled:
#   intersection between two files
#df=intersection(df1, df2)
#   adding genes based on the sample, status and time
#df=merge_frames(df1,df2)

# Cast the Status column to boolean.
df["Status"] = df["Status"].astype(bool)

# Columns from position 3 onward are the features; columns at positions 1-2
# form the target (presumably time and status - verify the column order
# against what the estimator expects).
X = df.iloc[:, 3:]
data = df.iloc[:, 1:3]

# The target is passed to the estimator as a record array.
Y = data.to_records(index=False)
X = OneHotEncoder().fit_transform(X)

# Fit the model along a path of 100 penalty values (n_alphas=100).
estimator = CoxnetSurvivalAnalysis(
    n_alphas=100,
    l1_ratio=1,
    alpha_min_ratio=0.01,
    max_iter=10000,
)
estimator.fit(X, Y)

# One row per encoded feature, one column per alpha (rounded to 5 decimals).
alphas = estimator.alphas_
coefficients_lasso = pd.DataFrame(
    estimator.coef_,
    index=X.columns,
    columns=np.round(alphas, 5),
)
print(coefficients_lasso)
]] T = df['Duration'] E = df['LapseIndicator'].apply(lambda x: True if x == 1 else False) df2['E'] = E df2['T'] = T X, y = get_x_y(df2, ['E', 'T'], pos_label=True) for c in X.columns.values: if c != 'AGE AT DOC': X[c] = X[c].astype('category') data_x_numeric = OneHotEncoder().fit_transform(X) from sklearn.decomposition import NMF model = NMF(n_components=8) data_x_numeric = model.fit_transform(data_x_numeric) #%% estimator = CoxPHSurvivalAnalysis(verbose=True, n_iter=10000) estimator.fit(data_x_numeric, y) #%% print() print(pd.Series(estimator.coef_, index=data_x_numeric.columns)) print()
from sklearn.model_selection import train_test_split from sklearn.preprocessing import OrdinalEncoder from sksurv.datasets import load_gbsg2 from sksurv.preprocessing import OneHotEncoder from sksurv.ensemble import RandomSurvivalForest X, y = load_gbsg2() grade_str = X.loc[:, "tgrade"].astype(object).values[:, np.newaxis] grade_num = OrdinalEncoder( categories=[["I", "II", "III"]]).fit_transform(grade_str) X_no_grade = X.drop("tgrade", axis=1) Xt = OneHotEncoder().fit_transform(X_no_grade) Xt = np.column_stack((Xt.values, grade_num)) feature_names = X_no_grade.columns.tolist() + ["tgrade"] random_state = 20 X_train, X_test, y_train, y_test = train_test_split(Xt, y, test_size=0.25, random_state=random_state) rsf = RandomSurvivalForest(n_estimators=1000, min_samples_split=10, min_samples_leaf=15, max_features="sqrt",
def plot_cumulative_dynamic_auc(risk_score, label, color=None):
    """Plot the time-dependent AUC of ``risk_score`` and its mean.

    Reads the module-level ``y_train``, ``y_test`` and ``times``. The AUC
    values are drawn against ``times`` and the mean as a dashed horizontal
    line in the same colour.
    """
    auc_values, auc_mean = cumulative_dynamic_auc(y_train, y_test, risk_score, times)

    plt.plot(times, auc_values, marker="o", color=color, label=label)
    plt.xlabel("days from enrollment")
    plt.ylabel("time-dependent AUC")
    plt.axhline(auc_mean, color=color, linestyle="--")
    plt.legend()


# One curve per numeric column, using that test column itself as the risk score.
for i, col in enumerate(num_columns):
    plot_cumulative_dynamic_auc(x_test[:, i], col, color="C" + str(i))
    ret = concordance_index_ipcw(y_train, y_test, x_test[:, i], tau=times[-1])

from sksurv.datasets import load_veterans_lung_cancer

va_x, va_y = load_veterans_lung_cancer()

cph = make_pipeline(OneHotEncoder(), CoxPHSurvivalAnalysis())
cph.fit(va_x, va_y)

va_times = np.arange(7, 183, 7)
# estimate performance on training data, thus use `va_y` twice.
va_risk = cph.predict(va_x)
va_auc, va_mean_auc = cumulative_dynamic_auc(va_y, va_y, va_risk, va_times)

plt.plot(va_times, va_auc, marker="o")
plt.axhline(va_mean_auc, linestyle="--")
plt.xlabel("days from enrollment")
plt.ylabel("time-dependent AUC")
plt.grid(True)