Example #1
    def test_fit_custom_kernel(self):
        alphas = numpy.exp(numpy.linspace(numpy.log(0.001), numpy.log(0.5), 5))
        svm_grid = ParameterGrid({"alpha": alphas})

        transform = ClinicalKernelTransform(fit_once=True)
        transform.prepare(self.x)

        base_estimators = []
        for i, params in enumerate(svm_grid):
            model = FastSurvivalSVM(max_iter=100, random_state=0, **params)
            base_estimators.append(("svm_linear_%d" % i, model))

        for i, params in enumerate(svm_grid):
            model = FastKernelSurvivalSVM(kernel=transform.pairwise_kernel,
                                          max_iter=45,
                                          tol=1e-5,
                                          random_state=0,
                                          **params)
            base_estimators.append(("svm_kernel_%d" % i, model))

        cv = KFold(n_splits=3, shuffle=True, random_state=0)
        meta = EnsembleSelection(base_estimators,
                                 n_estimators=0.4,
                                 scorer=score_cindex,
                                 cv=cv,
                                 n_jobs=4)

        meta.fit(self.x.values, self.y)
        self.assertEqual(len(meta), 10)
        self.assertTupleEqual(meta.scores_.shape, (10, ))

        p = meta.predict(self.x.values)

        score = concordance_index_censored(self.y['fstat'], self.y['lenfol'],
                                           p)
        expected_score = numpy.array([0.7978084, 59938, 15178, 33, 119])
        assert_array_almost_equal(score, expected_score)
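This and the following test-suite snippets assume the imports of their surrounding test module; a reconstructed sketch of what they rely on (assumed, not copied from the original file):

import numpy
from numpy.testing import assert_array_almost_equal
from sklearn.model_selection import KFold, ParameterGrid
from sksurv.kernels import ClinicalKernelTransform
from sksurv.linear_model import IPCRidge
from sksurv.meta import EnsembleSelection, EnsembleSelectionRegressor
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastKernelSurvivalSVM, FastSurvivalSVM

`score_cindex` and `_score_rmse` are scorer helpers defined elsewhere in the same module.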
Example #2
def _create_regression_ensemble():
    aft_grid = ParameterGrid({"alpha": 2.**numpy.arange(-2, 12, 2)})
    svm_grid = ParameterGrid({"alpha": 2.**numpy.arange(-12, 0, 2)})

    base_estimators = []
    for i, params in enumerate(aft_grid):
        model = IPCRidge(max_iter=1000, **params)
        base_estimators.append(("aft_%d" % i, model))

    for i, params in enumerate(svm_grid):
        model = FastSurvivalSVM(rank_ratio=0,
                                fit_intercept=True,
                                max_iter=100,
                                random_state=1,
                                **params)
        base_estimators.append(("svm_%d" % i, model))

    cv = KFold(n_splits=4, shuffle=True, random_state=0)
    meta = EnsembleSelectionRegressor(base_estimators,
                                      n_estimators=0.4,
                                      scorer=_score_rmse,
                                      cv=cv,
                                      n_jobs=1)
    return meta
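A minimal usage sketch for this factory, with hypothetical `x` (feature matrix) and `y` (structured survival array):

meta = _create_regression_ensemble()
meta.fit(x, y)          # selects the best-scoring subset of base estimators via cross-validation
pred = meta.predict(x)  # ensemble prediction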
Example #3
    def _create_ensemble(self, **kwargs):
        boosting_grid = ParameterGrid({
            "n_estimators": [100, 250],
            "subsample": [1.0, 0.75, 0.5]
        })
        svm_grid = ParameterGrid({"alpha": 2.**numpy.arange(-9, 5, 2)})

        base_estimators = []
        for i, params in enumerate(boosting_grid):
            model = ComponentwiseGradientBoostingSurvivalAnalysis(
                random_state=0, **params)
            base_estimators.append(("gbm_%d" % i, model))

        for i, params in enumerate(svm_grid):
            model = FastSurvivalSVM(max_iter=100, random_state=0, **params)
            base_estimators.append(("svm_%d" % i, model))

        cv = KFold(n_splits=4, shuffle=True, random_state=0)
        meta = EnsembleSelection(base_estimators,
                                 n_estimators=0.4,
                                 scorer=score_cindex,
                                 cv=cv,
                                 **kwargs)
        return meta
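Usage sketch (hypothetical): the keyword arguments are forwarded to `EnsembleSelection`, e.g.

meta = self._create_ensemble(n_jobs=1)
meta.fit(self.x.values, self.y)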
Example #4
# clip negative event times to zero
for idx, item in enumerate(data_y['time_to_event']):
    if item < 0:
        data_y['time_to_event'][idx] = 0
# data_y
# df.groupby('status').count()

# Part 2: Fast Training of Support Vector Machines for Survival Analysis

from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM

## create estimator
estimator = FastSurvivalSVM(optimizer="rbtree",
                            max_iter=1000,
                            tol=1e-6,
                            random_state=0)

pd.DataFrame(data_y)['status'].count()


## define a function for evaluating the performance of models during grid search using Harrell's concordance index
def score_survival_model(model, X, y):
    prediction = model.predict(X)
    result = concordance_index_censored(y['status'], y['time_to_event'],
                                        prediction)
    # concordance_index_censored returns a 5-tuple; its first element is the c-index
    return result[0]


param_grid = {'alpha': [0.001, 0.01, 0.1, 0.5, 1, 10, 100, 1000]}
cv = ShuffleSplit(n_splits=200, test_size=0.3, random_state=0)
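The snippet stops before the search itself is run; a sketch of the presumable continuation, mirroring the grid-search calls elsewhere on this page (`x` and `y` are the encoded feature matrix and structured survival array, names assumed):

gcv = GridSearchCV(estimator,
                   param_grid,
                   scoring=score_survival_model,
                   n_jobs=4,  # assumed
                   cv=cv)
gcv.fit(x, y)
print(gcv.best_params_)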
Example #5
    def apply_survival_regression(col_list,
                                  sub_restricted_tmp_df,
                                  regression_df,
                                  folder,
                                  ith_method,
                                  additional_cols,
                                  loc,
                                  tcga_y,
                                  test_res_df,
                                  extra_pred_train=None,
                                  extra_pred_test=None,
                                  repeats=0):
        # one-hot encode categorical columns
        X = pd.get_dummies(sub_restricted_tmp_df[col_list])

        # train_index and test_index are free variables taken from the
        # enclosing scope; they are not arguments of this function
        X_train, X_test = (X.loc[train_index, X.columns[2:]],
                           X.loc[test_index, X.columns[2:]])
        y_train, y_test = tcga_y[train_index], tcga_y[test_index]

        # drop constant columns, then standardize with train-set statistics only
        X_train = X_train.loc[:, X_train.nunique() != 1]
        X_test = X_test.loc[:, X_train.columns]
        scaler = preprocessing.StandardScaler().fit(X_train)
        X_train_s = scaler.transform(X_train)
        X_test_s = scaler.transform(X_test)

        XX_train = X.loc[train_index]
        XX_test = X.loc[test_index]
        XX_train = XX_train.loc[:, XX_train.nunique() != 1]
        XX_test = XX_test.loc[:, XX_train.columns]

        # survival svm
        lin_svm = FastSurvivalSVM(rank_ratio=0.8,
                                  fit_intercept=True,
                                  max_iter=200)
        lin_svm.fit(X_train_s, y_train)
        T_pred_train = lin_svm.predict(X_train_s)
        ci_train, pval_train = my_ci_pvalue(XX_train['survival_days'],
                                            T_pred_train,
                                            XX_train['binary_vital_status'],
                                            repeats)
        T_pred_test = lin_svm.predict(X_test_s)
        ci_test, pval_test = my_ci_pvalue(XX_test['survival_days'],
                                          T_pred_test,
                                          XX_test['binary_vital_status'],
                                          repeats)
        variable_base_name = [
            r.split('_{}_'.format(folder))[1] if ith_method in r else r
            for r in X_train.columns
        ]
        coef_dict = dict(
            zip(['{}_coef'.format(c) for c in variable_base_name],
                lin_svm.coef_))
        t = pd.DataFrame(
            {
                'cancer_loc': loc,
                'ith_method': ith_method,
                'folder': folder,
                'additional_cols': additional_cols[0],
                'regression_method': 'linear_survival_svm',
                'train_score': ci_train,
                'pval_train': pval_train,
                'test_score': ci_test,
                'pval_test': pval_test,
                **coef_dict
            },
            index=[0])
        regression_df = pd.concat((regression_df, t), sort=True)
        t = pd.DataFrame(
            {
                'ith_method': ith_method,
                'folder': folder,
                'additional_cols': additional_cols[0],
                'train_score': ci_train,
                'pval_train': pval_train,
                'test_score': ci_test,
                'pval_test': pval_test,
                **{
                    obs_idx: T_pred_test[obs_idx]
                    for obs_idx in range(len(T_pred_test))
                }
            },
            index=[0])
        test_res_df = pd.concat((test_res_df, t), sort=True)
        if extra_pred_test is not None:
            for i, c in enumerate(extra_pred_test):
                ci_test, pval_test = my_ci_pvalue(
                    XX_test['survival_days'], T_pred_test + extra_pred_test[i],
                    XX_test['binary_vital_status'], 0)
                ci_train, pval_train = my_ci_pvalue(
                    XX_train['survival_days'],
                    T_pred_train + extra_pred_train[i],
                    XX_train['binary_vital_status'], 0)
                t = pd.DataFrame(
                    {
                        'cancer_loc': loc,
                        'ith_method': ith_method,
                        'folder': folder,
                        'additional_cols': additional_cols[i + 1],
                        'regression_method': 'linear_survival_svm',
                        'train_score': ci_train,
                        'pval_train': pval_train,
                        'test_score': ci_test,
                        'pval_test': pval_test
                    },
                    index=[0])
                regression_df = pd.concat((regression_df, t), sort=True)
                t = pd.DataFrame(
                    {
                        'ith_method': ith_method,
                        'folder': folder,
                        'additional_cols': additional_cols[i + 1],
                        'train_score': ci_train,
                        'pval_train': pval_train,
                        'test_score': ci_test,
                        'pval_test': pval_test,
                        **{
                            # parentheses added: each entry should be one element
                            # of the combined prediction, as in the block above
                            obs_idx: (T_pred_test + extra_pred_test[i])[obs_idx]
                            for obs_idx in range(len(T_pred_test))
                        }
                    },
                    index=[0])
                test_res_df = pd.concat((test_res_df, t), sort=True)
        return regression_df, test_res_df, T_pred_test, T_pred_train
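Besides `train_index`/`test_index`, the function relies on `my_ci_pvalue` (a helper returning a concordance index and a p-value) and scikit-learn's `preprocessing` module from the enclosing scope. A hypothetical sketch of how the split indices could be produced (not part of the original code):

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# hypothetical: split the rows the function later indexes with .loc
train_index, test_index = train_test_split(sub_restricted_tmp_df.index,
                                           test_size=0.3, random_state=0)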
Example #6
File: hpt.py Project: Letris/HS
def RandomGridSearchRFC_Fixed(X, Y, splits, model, survival):
    """
    This function looks for the best set o parameters for RFC method
    Input: 
        X: training set
        Y: labels of training set
        splits: cross validation splits, used to make sure the parameters are stable
    Output:
        clf.best_params_: dictionary with the parameters, to use: param_svm['kernel']
    """

    start_svm = time.time()

    if model == 'svm':
        clf = svm.SVC()

        tuned_parameters = {
            'C': ([0.01, 1, 10]),
            'kernel': (['rbf', 'linear']),
            # 'kernel': (['linear', 'rbf', 'sigmoid']),
            # 'degree': ([1,3,5,10]),
            # 'decision_function_shape' : (['ovo', 'ovr']),
            # 'cache_size': ([500,1000,1500,2000]),
            'shrinking': ([False, True]),
            # 'probability': ([False, True])
        }

    if model == 'cart':
        clf = tree.DecisionTreeClassifier()

        tuned_parameters = {
            'criterion': (['gini', 'entropy']),
            'max_depth': ([10, 20]),
            'min_samples_split': ([2, 3, 5]),
            'min_samples_leaf': ([2, 3, 5]),
        }

    if model == 'rf':
        clf = ensemble.RandomForestClassifier()

        tuned_parameters = {
            'n_estimators': ([200, 500, 1000]),
            # 'max_features': (['auto', 'sqrt', 'log2',1,4,8]),
            'max_depth': ([10, 20]),
            # 'criterion':    (['gini', 'entropy']),
            'min_samples_split': [2, 3, 5],
            'min_samples_leaf': [2, 3, 5],
        }

    if model == 'xgboost':
        clf = XGBClassifier()

        tuned_parameters = {
            'booster': (['gbtree']),
            'max_depth': ([5, 10, 20]),
            'reg_lambda': ([0, 1]),
            'reg_alpha': ([0, 1]),
            'subsample': ([0.5, 1])
        }

    if model == 'lr':
        clf = linear_model.LogisticRegression()

        tuned_parameters = {'solver': (['liblinear', 'sag', 'saga'])}

    if model == 'cox':

        clf = CoxnetSurvivalAnalysis()
        tuned_parameters = {
            'n_alphas': ([50, 100, 200]),
            'l1_ratio': ([0.1, 0.5, 1]),
        }

    if model == 'survSVM':
        clf = FastSurvivalSVM()

        tuned_parameters = {
            'alpha': ([0.5, 1]),
            'rank_ratio': ([0.5, 1]),
            'max_iter': ([20, 40, 80]),
            'optimizer': (['rbtree', 'avltree']),
        }

    if model == 'gb':
        clf = GradientBoostingSurvivalAnalysis()

        tuned_parameters = {
            'learning_rate': ([0.1, 0.3]),
            'n_estimators': ([100, 200, 400]),
            'max_depth': ([3, 6, 12])
        }

    if survival:
        # survival models are scored with the concordance index
        scoring = make_scorer(CI, greater_is_better=True)
        # stratify folds on the event indicator
        y_for_cv = np.array([t[0] for t in Y])
        cv = list(StratifiedKFold(n_splits=splits).split(X, y_for_cv))
    else:
        cv = StratifiedKFold(n_splits=splits)
        scoring = 'roc_auc'

    print('  ...performing x-validation')

    clf = GridSearchCV(clf,
                       tuned_parameters,
                       scoring=scoring,
                       cv=cv,
                       verbose=10)

    clf.fit(X, Y)

    end_svm = time.time()
    print("Total time to process: ", end_svm - start_svm)

    return (clf.best_params_, clf)
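A usage sketch with hypothetical data names (`CI` is the project's own concordance-index function, imported elsewhere in hpt.py):

best_params, fitted_search = RandomGridSearchRFC_Fixed(X_train, y_train,
                                                       splits=5,
                                                       model='survSVM',
                                                       survival=True)
print(best_params)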
Example #7
regr_best = CoxnetSurvivalAnalysis(alphas=gcv.best_params_["alphas"],
                                   l1_ratio=0.8,
                                   alpha_min_ratio=0.1,
                                   max_iter=300000).fit(X, Y)
y_regr = regr_best.predict(X_lb)

ci_lb = concordance_index_censored(Y_lb["vitalStatus"],
                                   Y_lb["overallSurvival"], y_regr)[0]
print("concordance index = %0.4f" % ci_lb)

# In[ ]
# exclude records with a survival time of zero: with rank_ratio < 1 the SVM
# also fits a regression objective on log survival time, which requires
# strictly positive times
zero_mask = np.array([Y[ii][1] == 0 for ii in range(len(Y))])

surv_mdl = FastSurvivalSVM(rank_ratio=0.8,
                           fit_intercept=True,
                           optimizer="rbtree",
                           tol=1e-4,
                           max_iter=100,
                           random_state=0)

param_grid = {'alpha': np.logspace(-2, 2, num=100)}
cv = KFold(n_splits=5, shuffle=True, random_state=0)
grid_cv = GridSearchCV(surv_mdl,
                       param_grid,
                       scoring=score_survival_model,
                       n_jobs=-1,
                       cv=cv)
grid_cv.fit(X[~zero_mask], Y[~zero_mask])

plot_gridcv_results(grid_cv, param_grid["alpha"])
surv_mdl_best = FastSurvivalSVM(alpha=grid_cv.best_params_["alpha"],
                                rank_ratio=0.8)  # snippet is truncated here in the source
Example #8
print("\n")
print("%.1f%% of records are censored" % (n_censored / y.shape[0] * 100))

# Plotting
plt.figure(figsize=(9, 6))
val, bins, patches = plt.hist(
    (y["Survival_in_days"][y["Status"]], y["Survival_in_days"][~y["Status"]]),
    bins=30,
    stacked=True)
plt.legend(patches, ["Time of Death", "Time of Censoring"])

# First, we need to create an initial model with default parameters
# that is subsequently used in the grid search.

estimator = FastSurvivalSVM(optimizer="rbtree",
                            max_iter=1000,
                            tol=1e-6,
                            random_state=0)


# Creating the metric
def score_survival_model(model, X, y):
    prediction = model.predict(X)
    result = concordance_index_censored(y['Status'], y['Survival_in_days'],
                                        prediction)
    return result[0]


param_grid = {'alpha': 2.**np.arange(-12, 13, 2)}
cv = ShuffleSplit(n_splits=200, test_size=0.5, random_state=0)
gcv = GridSearchCV(estimator,
                   param_grid,