def predict_cumulative_hazard(self, x, t=None, **kwargs):
    """ Computes the cumulative hazard function H(t, x) by accumulating
        the output of `predict_hazard` along axis 1.

    Parameters
    ----------
    * `x` : **array-like** *shape=(n_samples, n_features)* --
        array-like representing the datapoints. It is converted with
        `utils.check_data` and handed to `predict_hazard`.

    * `t`: **double** *(default=None)* --
        time at which the prediction should be performed; forwarded
        unchanged to `predict_hazard`.

    * `kwargs`: extra keyword arguments forwarded to `predict_hazard`.

    Returns
    -------
    * `cumulative_hazard`: **numpy.ndarray** --
        cumulative sum of the hazard values along axis 1.
    """
    # Normalising the input container before delegating
    samples = utils.check_data(x)

    # Hazard first, then its running sum over the second axis
    return np.cumsum(self.predict_hazard(samples, t, **kwargs), axis=1)
def predict(self, X, t=None, num_threads=-1):
    """ Returns the hazard, density and survival predictions of the
        underlying `self.model`.

    Parameters
    ----------
    * `X` : **array-like** -- datapoints; a 1-D input is treated as a
        single sample.

    * `t`: **double** *(default=None)* -- if given, only the column whose
        time bucket starts closest to `t` is returned.

    * `num_threads`: **int** *(default=-1)* -- forwarded to the
        underlying model's prediction calls.

    Returns
    -------
    * `(hazard, density, survival)` -- full arrays when `t` is None,
        otherwise the single selected column of each.
    """
    X = utils.check_data(X)
    if X.ndim == 1:
        X = X.reshape(1, -1)

    # The underlying model expects [time, event, features...] rows;
    # constant 1. placeholders fill the time and event columns.
    n_samples = X.shape[0]
    placeholder_times = np.array([1.] * n_samples)
    placeholder_events = np.array([1.] * n_samples)
    input_data = np.c_[placeholder_times, placeholder_events, X]

    # Loading the attributes of the model
    self.load_properties()

    # Survival is requested before hazard, as in the original call order
    survival = np.array(self.model.predict_survival(input_data, num_threads))
    hazard = np.array(self.model.predict_hazard(input_data, num_threads))
    density = hazard * survival

    if t is not None:
        # Picking the bucket whose lower bound is nearest to t
        gaps = [abs(start - t) for (start, _stop) in self.time_buckets]
        column = np.argmin(gaps)
        return hazard[:, column], density[:, column], survival[:, column]

    return hazard, density, survival
def predict_risk(self, x, **kwargs):
    """ Computes the Risk Score/Mortality function
        R(x) = sum( cumsum(hazard(t, x)) ) over all t.

        According to Random survival forests from Ishwaran H et al
        https://arxiv.org/pdf/0811.1645.pdf

    Parameters
    ----------
    * `x` : **array-like** *shape=(n_samples, n_features)* --
        array-like representing the datapoints.

    * `kwargs`: extra keyword arguments forwarded to
        `predict_cumulative_hazard`.

    Returns
    -------
    * `risk_score`: **numpy.ndarray** --
        sum of the cumulative hazard along axis 1.
    """
    samples = utils.check_data(x)

    # Cumulative hazard for every available time (t=None), then summed
    all_times_cumulative = self.predict_cumulative_hazard(samples, None,
                                                          **kwargs)
    return np.sum(all_times_cumulative, axis=1)
def predict_risk_chunk(self, X, num_threads=-1, printChunk=False):
    """ Predicts the risk scores in chunks of roughly 1000 rows, so that
        the underlying model is never asked to score the whole dataset
        in one call.

    Parameters
    ----------
    * `X` : **array-like** -- datapoints; a 1-D input is treated as a
        single sample.

    * `num_threads`: **int** *(default=-1)* -- forwarded to
        `self.model.predict_risk`.

    * `printChunk`: **bool** *(default=False)* -- accepted for backward
        compatibility; it currently has no effect.

    Returns
    -------
    * `risk`: **numpy.ndarray** *shape=(n_samples,)* -- the risk scores,
        in the same order as the rows of `X`.
    """
    # Checking if the data has the right format
    X = utils.check_data(X)
    if X.ndim == 1:
        X = X.reshape(1, -1)

    # The underlying model expects [time, event, features...] rows;
    # constant 1. placeholders fill the time and event columns.
    n_rows = X.shape[0]
    input_data = np.c_[np.ones(n_rows), np.ones(n_rows), X]

    # Loading the attributes of the model
    self.load_properties()

    # round() gives 0 for small inputs and np.array_split refuses
    # 0 sections, so at least one chunk is always used.
    n_chunks = max(1, int(round(n_rows / 1000)))

    risk = np.empty(n_rows)
    start = 0
    for chunk in np.array_split(input_data, n_chunks):
        stop = start + chunk.shape[0]
        risk[start:stop] = np.array(
            self.model.predict_risk(chunk, num_threads))
        start = stop

    return risk
def predict(self, x, t=None):
    """ Predicting the hazard, density and survival functions

    Arguments:
        * x: pd.Dataframe or np.ndarray or list
            x is the testing dataset containing the features
            x should not be standardized before, the model
            will take care of it

        * t: float (default=None)
            Time at which hazard, density and survival functions
            should be calculated. If None, the method returns
            the functions for all times t.

    Returns:
        * (hazard, density, survival): full arrays when t is None,
          otherwise the column whose time bucket starts closest to t.
    """
    x = utils.check_data(x)

    # Scaling the dataset; a 1-D input is a single sample
    if x.ndim == 1:
        x = x.reshape(1, -1)
        x = self.scaler.transform(x)
    elif x.ndim == 2:
        x = self.scaler.transform(x)

    # Per-sample multiplicative risk, kept as a column for broadcasting
    risk_column = np.exp(np.dot(x, self.weights)).reshape(-1, 1)

    hazard = self.baseline_hazard * risk_column
    survival = np.power(self.baseline_survival, risk_column)
    density = hazard * survival

    if t is not None:
        # Picking the bucket whose lower bound is nearest to t
        gaps = [abs(start - t) for (start, _stop) in self.time_buckets]
        column = np.argmin(gaps)
        return hazard[:, column], density[:, column], survival[:, column]

    return hazard, density, survival
def predict_risk(self, x):
    """ Predicting the risk score function

    Parameters:
    -----------
    * x: pd.Dataframe or np.ndarray or list
        x is the testing dataset containing the features
        x should not be standardized before, the model
        will take care of it

    Returns:
    --------
    * risk_score: the value returned by `self.risk_function` applied
        to the scaled data
    """
    # Convert x into the right format
    x = utils.check_data(x)

    # Scaling the dataset; a 1-D input is a single sample and is
    # promoted to a one-row matrix first. Inputs of any other rank are
    # passed through unscaled, exactly as before (the former `else`
    # branch re-tested `x.ndim == 1` and could never run).
    if x.ndim == 1:
        x = self.scaler.transform(x.reshape(1, -1))
    elif x.ndim == 2:
        x = self.scaler.transform(x)

    # Calculating risk_score
    return self.risk_function(x)
def predict_risk(self, x, use_log=False):
    """ Predicting the risk score functions

    Arguments:
        * x: pd.Dataframe or np.ndarray or list
            x is the testing dataset containing the features
            x should not be standardized before, the model
            will take care of it

        * use_log: bool (default=False)
            If True, the log-risk `x . weights` is returned;
            otherwise the risk `exp(x . weights)` is returned.

    Returns:
        * risk_score: np.ndarray with one score per sample
    """
    # Convert x into the right format
    x = utils.check_data(x)

    # Scaling the dataset; a 1-D input is a single sample
    if x.ndim == 1:
        x = self.scaler.transform(x.reshape(1, -1))
    elif x.ndim == 2:
        x = self.scaler.transform(x)

    # The exponential used to be applied unconditionally and then a
    # second time when use_log was False, so use_log=True returned
    # exp(x.w) and use_log=False returned exp(exp(x.w)). The linear
    # predictor is the log-risk; it is exponentiated exactly once.
    log_risk = np.dot(x, self.weights)
    if use_log:
        return log_risk
    return np.exp(log_risk)
def predict_risk(self, x, use_log=False):
    """ Predicting the risk score functions

    Arguments:
        * x: pd.Dataframe or np.ndarray or list
            x is the testing dataset containing the features
            x should not be standardized before, the model
            will take care of it

        * use_log: bool (default=False)
            If True, the raw output of `self.model` is returned;
            otherwise its exponential is returned.

    Returns:
        * score: flat np.ndarray of risk scores
    """
    # Convert x into the right format
    x = utils.check_data(x)

    # Scaling the data
    if self.auto_scaler:
        if x.ndim == 1:
            x = self.scaler.transform(x.reshape(1, -1))
        elif x.ndim == 2:
            x = self.scaler.transform(x)
    else:
        # Ensuring x has 2 dimensions
        if x.ndim == 1:
            x = np.reshape(x, (1, -1))

    # Transforming into pytorch objects. The tensor is only moved to
    # the GPU when CUDA is available, instead of unconditionally
    # building a CUDA tensor, which fails on CPU-only machines.
    # NOTE(review): this assumes self.model lives on the GPU whenever
    # CUDA is available, as the original CUDA-only code did.
    x = torch.FloatTensor(x)
    if torch.cuda.is_available():
        x = x.cuda()

    # Calculating risk_score
    score = self.model(x).data.cpu().numpy().flatten()
    if not use_log:
        score = np.exp(score)

    return score
def predict_cdf(self, x, t=None, **kwargs):
    """ Computes the cumulative density function F(t, x) as the
        complement of the survival function.

    Parameters
    ----------
    * `x` : **array-like** *shape=(n_samples, n_features)* --
        array-like representing the datapoints.

    * `t`: **double** *(default=None)* --
        time at which the prediction should be performed; forwarded
        unchanged to `predict_survival`.

    * `kwargs`: extra keyword arguments forwarded to `predict_survival`.

    Returns
    -------
    * `cdf`: **numpy.ndarray** -- `1 - survival`.
    """
    samples = utils.check_data(x)

    # F = 1 - S
    return 1. - self.predict_survival(samples, t, **kwargs)
def bootstrap_concordance_index_chunk(model, X, T, E, include_ties=True,
                                      additional_results=False,
                                      n_iterations=1000, n_size=1000,
                                      **kwargs):
    """ Bootstraps the concordance index of `model` and reports a 95%
        confidence interval.

        The risk scores are computed once with `model.predict_risk_chunk`;
        each iteration then resamples (risk, T, E) and evaluates
        `_concordance_index` on the resample.

    Parameters:
    -----------
    * model : object exposing `predict_risk_chunk(X, **kwargs)`

    * X : array-like -- the input samples

    * T : array-like -- times of event or censoring

    * E : array-like -- event indicators

    * include_ties : bool (default=True) -- forwarded to
        `_concordance_index`

    * additional_results : bool (default=False) -- accepted for
        signature compatibility; currently unused

    * n_iterations : int (default=1000) -- number of bootstrap resamples

    * n_size : int (default=1000) -- number of samples drawn in each
        resample

    * kwargs : forwarded to `model.predict_risk_chunk`

    Returns:
    --------
    * (mean, lower, upper) : average bootstrapped c-index and the bounds
        of the 95% percentile interval, clipped to [0, 1]. The function
        previously returned nothing.
    """
    stats = []

    # Scoring the whole dataset once; only the resampling is repeated
    risk = model.predict_risk_chunk(X, **kwargs)
    risk, T, E = utils.check_data(risk, T, E)

    for i in range(n_iterations):
        tempR, tempT, tempE = resample(risk, T, E, n_samples=n_size)

        # Ordering the resample by decreasing time
        order = np.argsort(-tempT)
        tempR, tempT, tempE = tempR[order], tempT[order], tempE[order]

        # Calculating the c-index (first element of the results)
        results = _concordance_index(tempR, tempT, tempE, include_ties)
        stats.append(results[0])

        # Progress indicator
        if i % 10 == 0:
            print(i)

    # Percentile confidence interval. This section used to appear twice
    # verbatim, printing the same two lines twice.
    alpha = 0.95
    p = ((1.0 - alpha) / 2.0) * 100
    lower = max(0.0, np.percentile(stats, p))
    p = (alpha + ((1.0 - alpha) / 2.0)) * 100
    upper = min(1.0, np.percentile(stats, p))
    mean = np.average(stats)

    print('%.1f confidence interval %.3f%% and %.3f%%' %
          (alpha * 100, lower, upper))
    print(mean)

    return mean, lower, upper
def predict(self, x, t=None):
    """ Predicting the hazard, density and survival functions

    Parameters:
    ----------
    * `x` : **array-like** *shape=(n_samples, n_features)* --
        array-like representing the datapoints.
        x should not be standardized before, the model
        will take care of it

    * `t`: **double** *(default=None)* --
        time at which the prediction should be performed.
        If None, then return the function for all available t.

    Returns:
    --------
    * (hazard, density, Survival): full arrays when t is None, otherwise
        the column whose time bucket starts closest to t.
    """
    x = utils.check_data(x)

    if self.auto_scaler:
        # Scaling; a 1-D input is a single sample
        if x.ndim == 1:
            x = self.scaler.transform(x.reshape(1, -1))
        elif x.ndim == 2:
            x = self.scaler.transform(x)
    elif x.ndim == 1:
        # No scaling requested: only ensuring x has 2 dimensions
        x = np.reshape(x, (1, -1))

    # Running the network and bringing the scores back to numpy
    network_output = self.model(torch.FloatTensor(x))
    score = network_output.data.numpy()

    # Lower-triangular matrices used to accumulate over the time axis
    n_times = self.num_times
    lower_rect = np.tri(n_times, n_times + 1)
    lower_square = np.tri(n_times + 1, n_times + 1)

    # Normalising the exponentiated scores row by row gives the density
    phi = np.exp(np.dot(score, lower_rect))
    density = phi / np.sum(phi, 1).reshape(-1, 1)

    Survival = np.dot(density, lower_square)
    hazard = density[:, :-1] / Survival[:, 1:]

    if t is not None:
        # Picking the bucket whose lower bound is nearest to t
        gaps = [abs(start - t) for (start, _stop) in self.time_buckets]
        column = np.argmin(gaps)
        return hazard[:, column], density[:, column], Survival[:, column]

    return hazard, density, Survival
def predict_risk(self, x, use_log=False):
    """ Predicting the risk score functions

    Arguments:
        * x: pd.Dataframe or np.ndarray or list
            x is the testing dataset containing the features.
            When `utils.check_data` returns a list, every element is
            prepared separately and the list of tensors is passed to
            the model.
            x should not be standardized before, the model
            will take care of it

        * use_log: bool (default=False)
            If True, the raw output of `self.model` is returned;
            otherwise its exponential is returned.

    Returns:
        * score: flat np.ndarray of risk scores
    """
    # Convert x into the right format
    x = utils.check_data(x)

    use_cuda = torch.cuda.is_available()

    def _prepare(array):
        # Promote a single sample to a one-row matrix, then scale with
        # the already-fitted scaler when auto-scaling is enabled.
        if array.ndim == 1:
            array = array.reshape(1, -1)
        if self.auto_scaler and array.ndim == 2:
            array = self.scaler.transform(array)
        return array

    def _to_tensor(array):
        # Tensors go to the GPU only when CUDA is available
        tensor = torch.FloatTensor(array)
        return tensor.cuda() if use_cuda else tensor

    if isinstance(x, list):
        # Building a new list rather than overwriting the entries of
        # the list in place
        x = [_to_tensor(_prepare(item)) for item in x]
    else:
        # This branch used to call scaler.fit_transform, refitting the
        # scaler on the data being scored; prediction must only apply
        # the scaling learnt during fit.
        x = _to_tensor(_prepare(x))

    # Calculating risk_score
    score = self.model(x).cpu().data.numpy().flatten()
    if not use_log:
        score = np.exp(score)

    return score
def predict_risk(self, X, num_threads=-1):
    """ Returns the risk scores computed by the underlying `self.model`.

    Parameters
    ----------
    * `X` : **array-like** -- datapoints; a 1-D input is treated as a
        single sample.

    * `num_threads`: **int** *(default=-1)* -- forwarded to
        `self.model.predict_risk`.

    Returns
    -------
    * `risk`: **numpy.ndarray** -- the model output converted to an array.
    """
    X = utils.check_data(X)
    if X.ndim == 1:
        X = X.reshape(1, -1)

    # The underlying model expects [time, event, features...] rows;
    # constant 1. placeholders fill the time and event columns.
    n_samples = X.shape[0]
    placeholder_times = np.array([1.] * n_samples)
    placeholder_events = np.array([1.] * n_samples)
    input_data = np.c_[placeholder_times, placeholder_events, X]

    # Loading the attributes of the model
    self.load_properties()

    return np.array(self.model.predict_risk(input_data, num_threads))
def predict(self, x, t=None):
    """ Predicting the hazard, density and survival functions

    Parameters:
    -----------
    * x: pd.Dataframe or np.ndarray or list
        x is the testing dataset containing the features
        x should not be standardized before, the model
        will take care of it

    * t: float (default=None)
        Time at which hazard, density and survival functions
        should be calculated. If None, the method returns
        the functions for all times t.

    Returns:
    --------
    * (hazard, density, survival): full arrays when t is None, otherwise
        the column whose time bucket starts closest to t.
    """
    # Convert x into the right format
    x = utils.check_data(x)

    # Scaling the dataset; a 1-D input is a single sample. Inputs of
    # any other rank are passed through unscaled, exactly as before
    # (the former `else` branch re-tested `x.ndim == 1` and could
    # never run).
    if x.ndim == 1:
        x = self.scaler.transform(x.reshape(1, -1))
    elif x.ndim == 2:
        x = self.scaler.transform(x)

    # Calculating risk_score, hazard, density and survival
    BX = self.risk_function(x)
    risk_column = BX.reshape(-1, 1)
    hazard = self.hazard_function(self.times, risk_column)
    survival = self.survival_function(self.times, risk_column)
    density = hazard * survival

    if t is None:
        return hazard, density, survival

    # Picking the bucket whose lower bound is nearest to t
    gaps = [abs(start - t) for (start, _stop) in self.time_buckets]
    index = np.argmin(gaps)
    return hazard[:, index], density[:, index], survival[:, index]
def predict_risk(self, x, use_log=False):
    """ Predicts the Risk Score

    Parameter
    ----------
    * `x`, np.ndarray
         array-like representing the datapoints

    * `use_log`: bool - (default=False)
        If True, the score returned by the model is given as-is
        (i.e. the log of the risk); otherwise its exponential
        is returned.

    Returns
    -------
    * `risk_score`, np.ndarray
        array-like representing the prediction of Risk Score function
    """
    # Ensuring that the C++ model has the fitted parameters
    self.load_properties()

    # Convert x into the right format
    x = utils.check_data(x)

    # Scaling the dataset; when the model uses a bias, a constant 1.
    # feature is appended before scaling
    if x.ndim == 1:
        if self.with_bias:
            x = np.r_[x, 1.]
        x = self.scaler.transform(x.reshape(1, -1))
    elif x.ndim == 2:
        n = x.shape[0]
        if self.with_bias:
            x = np.c_[x, [1.] * n]
        x = self.scaler.transform(x)

    # Calculating prediction. The log-risk is the score itself, so it
    # is returned directly instead of computing log(exp(score)), which
    # overflows to inf (or underflows to -inf) for large scores.
    # NOTE(review): the result is cast to float64 here -- confirm that
    # get_score does not return a narrower float type callers rely on.
    score = np.asarray(self.model.get_score(x), dtype=float)
    if use_log:
        return score
    return np.exp(score)
def predict_chunk(self, X, t=1, num_threads=-1):
    """ Predicts the survival value at time `t` in chunks of roughly
        1000 rows, so that the underlying model is never asked to score
        the whole dataset in one call.

    Parameters
    ----------
    * `X` : **array-like** -- datapoints; a 1-D input is treated as a
        single sample.

    * `t`: **double** *(default=1)* -- the survival column returned is
        the one whose time bucket starts closest to `t`.

    * `num_threads`: **int** *(default=-1)* -- forwarded to
        `self.model.predict_survival`.

    Returns
    -------
    * `survival`: **numpy.ndarray** *shape=(n_samples,)* -- survival
        values at the selected time, in the same order as the rows of `X`.
    """
    # Checking if the data has the right format
    X = utils.check_data(X)
    if X.ndim == 1:
        X = X.reshape(1, -1)

    # The underlying model expects [time, event, features...] rows;
    # constant 1. placeholders fill the time and event columns.
    n_rows = X.shape[0]
    input_data = np.c_[np.ones(n_rows), np.ones(n_rows), X]

    # round() gives 0 for small inputs and np.array_split refuses
    # 0 sections, so at least one chunk is always used.
    n_chunks = max(1, int(round(n_rows / 1000)))

    # Loading the attributes of the model before reading time_buckets
    self.load_properties()

    # Picking the bucket whose lower bound is nearest to t
    gaps = [abs(start - t) for (start, _stop) in self.time_buckets]
    index = np.argmin(gaps)

    survival = np.empty(n_rows)
    start = 0
    for chunk in np.array_split(input_data, n_chunks):
        stop = start + chunk.shape[0]
        survival[start:stop] = np.array(
            self.model.predict_survival(chunk, num_threads))[:, index]
        start = stop

    return survival
def fit(self, X, T, E, init_method='glorot_uniform', optimizer='adam',
        lr=1e-4, num_epochs=1000, l2_reg=1e-2, verbose=True,
        is_min_time_zero=True, extra_pct_time=0.1):
    """
    Fit the estimator based on the given parameters.

    Parameters:
    -----------
    * `X` : **array-like**, *shape=(n_samples, n_features)* --
        The input samples.

    * `T` : **array-like** --
        The target values describing when the event of interest or
        censoring occurred.

    * `E` : **array-like** --
        The values that indicate if the event of interest occurred i.e.:
        E[i]=1 corresponds to an event, and E[i] = 0 means censoring,
        for all i.

    * `init_method` : **str** *(default = 'glorot_uniform')* --
        Initialization method to use. Here are the possible options:

        * `glorot_uniform`: Glorot/Xavier uniform initializer
        * `he_uniform`: He uniform variance scaling initializer
        * `uniform`: Initializing tensors with uniform (-1, 1) distribution
        * `glorot_normal`: Glorot normal initializer,
        * `he_normal`: He normal initializer.
        * `normal`: Initializing tensors with standard normal distribution
        * `ones`: Initializing tensors to 1
        * `zeros`: Initializing tensors to 0
        * `orthogonal`: Initializing tensors with a orthogonal matrix,

    * `optimizer`: **str** *(default = 'adam')* --
        iterative method for optimizing a differentiable objective
        function. Here are the possible options:

        - `adadelta`
        - `adagrad`
        - `adam`
        - `adamax`
        - `rmsprop`
        - `sparseadam`
        - `sgd`

    * `lr`: **float** *(default=1e-4)* --
        learning rate used in the optimization

    * `num_epochs`: **int** *(default=1000)* --
        The number of iterations in the optimization

    * `l2_reg`: **float** *(default=1e-2)* --
        L2 regularization parameter for the model coefficients

    * `verbose`: **bool** *(default=True)* --
        Whether or not producing detailed logging about the modeling

    * `is_min_time_zero`: **bool** *(default=True)* --
        Whether the the time axis starts at 0

    * `extra_pct_time`: **float** *(default=0.1)* --
        Providing an extra fraction of time in the time axis

    Returns:
    --------
    * self : object

    Example:
    --------

    #### 1 - Importing packages
    import numpy as np
    import pandas as pd
    from matplotlib import pyplot as plt
    from sklearn.model_selection import train_test_split
    from pysurvival.models.simulations import SimulationModel
    from pysurvival.models.parametric import GompertzModel
    from pysurvival.utils.metrics import concordance_index
    from pysurvival.utils.display import integrated_brier_score
    #%matplotlib inline  # To use with Jupyter notebooks

    #### 2 - Generating the dataset from a Gompertz parametric model
    # Initializing the simulation model
    sim = SimulationModel( survival_distribution = 'Gompertz',
                           risk_type = 'linear',
                           censored_parameter = 10.0,
                           alpha = .01, beta = 3.0 )

    # Generating N random samples
    N = 1000
    dataset = sim.generate_data(num_samples = N, num_features = 3)

    # Showing a few data-points
    time_column = 'time'
    event_column = 'event'
    dataset.head(2)

    #### 3 - Creating the modeling dataset
    # Defining the features
    features = sim.features

    # Building training and testing sets #
    index_train, index_test = train_test_split( range(N), test_size = 0.2)
    data_train = dataset.loc[index_train].reset_index( drop = True )
    data_test  = dataset.loc[index_test].reset_index( drop = True )

    # Creating the X, T and E input
    X_train, X_test = data_train[features], data_test[features]
    T_train, T_test = data_train['time'].values, data_test['time'].values
    E_train, E_test = data_train['event'].values, data_test['event'].values

    #### 4 - Creating an instance of the Gompertz model and fitting the data
    # Building the model
    gomp_model = GompertzModel()
    gomp_model.fit(X_train, T_train, E_train, lr=1e-2, init_method='zeros',
        optimizer ='adam', l2_reg = 1e-3, num_epochs=2000)

    #### 5 - Cross Validation / Model Performances
    c_index = concordance_index(gomp_model, X_test, T_test, E_test) #0.8
    print('C-index: {:.2f}'.format(c_index))

    ibs = integrated_brier_score(gomp_model, X_test, T_test, E_test,
                t_max=30, figure_size=(20, 6.5) )
    """
    # Checking data format (i.e.: transforming into numpy array)
    X, T, E = utils.check_data(X, T, E)

    # Times are floored at 1e-6 before the time axis is built
    T = np.maximum(T, 1e-6)
    self.get_times(T, is_min_time_zero, extra_pct_time)

    # Extracting the number of features (X must be 2-dimensional)
    _, self.num_vars = X.shape

    # Scaling data
    if self.auto_scaler:
        X = self.scaler.fit_transform(X)

    # Every model but the exponential one uses a Beta parameter, and
    # the Gompertz model starts from a much larger alpha
    uses_beta = self.name != 'ExponentialModel'
    starting_alpha = 1000. if self.name == 'GompertzModel' else 1.

    # Initializing the model
    network = nn.ParametricNet(self.num_vars, init_method, starting_alpha,
                               uses_beta)

    # Transforming the inputs into tensors; T and E become columns
    features_tensor = torch.FloatTensor(X)
    times_tensor = torch.FloatTensor(T.reshape(-1, 1))
    events_tensor = torch.FloatTensor(E.reshape(-1, 1))

    # Performing order 1 optimization
    network, history = opt.optimize(self.loss_function, network, optimizer,
                                    lr, num_epochs, verbose,
                                    X=features_tensor, T=times_tensor,
                                    E=events_tensor, l2_reg=l2_reg)

    # Saving attributes
    self.model = network.eval()
    self.loss_values = history

    # Calculating the AIC from the last loss value.
    # NOTE(review): the parameter-count term is subtracted here, whereas
    # the textbook AIC adds it -- confirm the intended sign.
    parameter_term = self.num_vars + 1 + uses_beta * 1. - 1
    self.aic = 2 * self.loss_values[-1] - 2 * parameter_term

    return self
def create_risk_groups(model, X, use_log=True, num_bins=50,
                       figure_size=(20, 8), **kwargs):
    """
    Computing and displaying the histogram of the risk scores of the given
    model and test set X. If group definitions are provided as keyword
    arguments, a second histogram is drawn whose bins are colored by
    group, and the samples are assigned to the groups.

    Parameters:
    -----------

    * model : Pysurvival object
        Pysurvival model; only `model.predict_risk(X)` is used

    * X : array-like, shape=(n_samples, n_features)
        The input samples.

    * use_log: boolean (default=True)
        Whether applying the log function to the risk score

    * num_bins: int (default=50)
        The number of equal-width bins that will constitute the histogram

    * figure_size: tuple of double (default= (20, 8))
        width, height in inches representing the size of the chart

    * kwargs: dict (optional)
        one entry per risk group, each a dict with the keys
        'lower_bound', 'upper_bound' and 'color', e.g.:
        low_risk = {'lower_bound': 0, 'upper_bound': 20, 'color': 'red'},
        high_risk = {'lower_bound': 20, 'upper_bound': 120, 'color': 'blue'}
        Colors must be CSS4 color names.

    Returns:
    --------
    * risk_groups : None when no group is given, otherwise a dict
        {group_name: (color, [sample indexes])}. Samples falling in a
        bin that belongs to no group are left out of every group.

    Raises:
    -------
    * ValueError if every group lies entirely outside the range of the
        risk scores, or if a color used for a bin is not a CSS4 color.
    """
    # Ensuring that the input data has the right format
    X = utils.check_data(X)

    # Computing the risk scores
    risk = model.predict_risk(X)
    if use_log:
        risk = np.log(risk)

    if not kwargs:
        # Displaying simple histogram
        fig, ax1 = plt.subplots(figsize=figure_size)
        risk_groups = None

    else:
        # Applying the color coding
        risk_groups = {}
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figure_size)

        # Histogram that will carry the group colors
        _, bins, patches = ax2.hist(risk, bins=num_bins)
        ax2.set_title('Risk groups with colors', fontsize=15)

        # Extracting the group definitions and counting the groups that
        # lie entirely below or entirely above the histogram range
        bounds, colors_, indexes, group_names = {}, {}, {}, []
        min_bin, max_bin = min(bins), max(bins)
        is_not_valid = 0
        for group_name, group_def in kwargs.items():
            lower = group_def['lower_bound']
            upper = group_def['upper_bound']

            if (lower < min_bin and upper < min_bin) or \
               (lower > max_bin and upper > max_bin):
                is_not_valid += 1

            bounds[group_name] = (lower, upper)
            colors_[group_name] = group_def['color']
            indexes[group_name] = []
            group_names.append(group_name)

        if is_not_valid >= len(kwargs):
            error_msg = "The boundaries definitions {} do not match"
            error_msg += ", the values of the risk scores."
            error_msg = error_msg.format(list(bounds.values()))
            raise ValueError(error_msg)

        # Assigning each rectangle/bin to its group definition and color,
        # based on the left edge of the bin
        bin_index = {}
        for i, bin_, patch_ in zip(range(num_bins), bins, patches):
            for grp_name, bounds_ in bounds.items():
                if bounds_[0] <= bin_ < bounds_[-1]:
                    bin_index[i] = grp_name

                    color_ = colors_[grp_name]
                    if color_ not in colors.CSS4_COLORS:
                        error_msg = '{} is not a valid color'
                        error_msg = error_msg.format(colors_[grp_name])
                        raise ValueError(error_msg)

                    patch_.set_facecolor(color_)

        # Assigning each sample to the group of the histogram bin that
        # contains it. np.digitize returns the index of the right edge,
        # hence the -1; the previous right=True call without that shift
        # looked up the *next* bin. The clip keeps the maximum value
        # (equal to the last edge) in the last bin, as the histogram does.
        risk_bins = np.clip(np.digitize(risk, bins) - 1, 0, num_bins - 1)
        for i, r in enumerate(risk_bins):
            group_name = bin_index.get(r)
            # Bins not covered by any group used to raise a KeyError
            if group_name is not None:
                indexes[group_name].append(i)

    # Displaying the original distribution
    ax1.hist(risk, bins=num_bins, color='black', alpha=0.5)
    ax1.set_title('Risk Score Distribution', fontsize=15)

    # Show everything
    plt.show()

    # Returning results
    if risk_groups is not None:
        for group_name in group_names:
            risk_groups[group_name] = (colors_[group_name],
                                       indexes[group_name])

    return risk_groups
def fit(self, X, T, E, max_features='sqrt', max_depth=5, min_node_size=10,
        num_threads=-1, weights=None, sample_size_pct=0.63, alpha=0.5,
        minprop=0.1, num_random_splits=100,
        importance_mode='impurity_corrected', seed=None, save_memory=False):
    """
    Fits the forest on (X, T, E) through the underlying `self.model`
    and stores the variable importance.

    Arguments:
    ---------
    * X : array-like, shape=(n_samples, n_features)
        The input samples. Column names are used as feature names when
        X is a pandas DataFrame; otherwise x_0, x_1, ... are used.

    * T : array-like, shape = [n_samples]
        The target values describing when the event of interest or
        censoring occurred

    * E : array-like, shape = [n_samples]
        The Event indicator array such that E = 1. if the event occurred
        E = 0. if censoring occurred

    * max_features : int, float or string, optional (default="sqrt")
        The number of features to consider when looking for the best
        split:
        - If int, then consider `max_features` features at each split.
        - If float below 1, then `max_features` is a fraction and
          `int(max_features * n_features)` features are considered at
          each split.
        - If "sqrt", then `max_features=sqrt(n_features)`
        - If it contains "log", then `max_features=log(n_features)`
          (natural logarithm).
        - If "all", every feature is considered.
        NOTE(review): earlier documentation advertised "log2" as log2;
        the code has always used the natural logarithm -- confirm which
        one is intended.

    * max_depth : int (default=5)
        Forwarded to the underlying model; presumably the maximum depth
        of each tree -- confirm against the underlying implementation.

    * min_node_size : int(default=10)
        The minimum number of samples required to be at a leaf node

    * num_threads: int (Default: -1)
        The number of jobs to run in parallel for both fit and predict.
        If -1, then the number of jobs is set to the number of cores.

    * weights: array-like, shape = [n_samples] (default=None)
        Weights for sampling of training observations. They must sum
        to 1. Observations with larger weights will be selected with
        higher probability in the bootstrap. If None, uniform weights
        are used.

    * sample_size_pct: double (default = 0.63)
        Percentage of original samples used in each tree building.
        Must be a float that is > 0 and <= 1.

    * alpha: float
        For "maxstat" splitrule: Significance threshold to allow
        splitting. Ignored (set to 0) for the other split rules.

    * minprop: float
        For "maxstat" splitrule: Lower quantile of covariate
        distribution to be considered for splitting. Ignored (set to 0)
        for the other split rules.

    * num_random_splits: int (default=100)
        For "extratrees" splitrule, it is the Number of random splits
        to consider for each candidate splitting variable. Set to 1 for
        the other split rules.

    * importance_mode: (default=impurity_corrected)
        Variable importance mode. Here are the 3 options:
        - `impurity` or `impurity_corrected`:
            it's the unbiased heterogeneity reduction developed
            by Sandri & Zuccolotto (2008)
        - "permutation" it's unnormalized as recommended by Nicodemus
            et al.
        - "normalized_permutation" it's normalized version of the
            permutation importance computations by Breiman et al.

    * `seed`: int (default=None) --
        seed used by the random number generator. If None, 0 is passed
        to the underlying model (earlier documentation states that the
        current timestamp is then used; that happens, if at all, inside
        the underlying model).

    * save_memory:  bool (default=False) --
        Use memory saving splitting mode. This will slow down the model
        training. So, only set to `True` if you encounter memory
        problems.

    Returns:
    --------
    * self : object

    Raises:
    -------
    * ValueError for an invalid sample_size_pct, alpha, minprop,
        max_features, importance_mode or split rule.
    * Exception when the weights do not sum to 1 or have the wrong
        length.

    Example:
    --------

    #### 1 - Importing packages
    import numpy as np
    import pandas as pd
    from matplotlib import pyplot as plt
    from sklearn.model_selection import train_test_split
    from pysurvival.models.simulations import SimulationModel
    from pysurvival.models.survival_forest import ConditionalSurvivalForestModel
    from pysurvival.utils.metrics import concordance_index
    from pysurvival.utils.display import integrated_brier_score
    #%matplotlib inline  # To use with Jupyter notebooks

    #### 2 - Generating the dataset from a Exponential parametric model
    # Initializing the simulation model
    sim = SimulationModel( survival_distribution = 'exponential',
                           risk_type = 'linear',
                           censored_parameter = 1,
                           alpha = 3)

    # Generating N random samples
    N = 1000
    dataset = sim.generate_data(num_samples = N, num_features=4)

    # Showing a few data-points
    dataset.head(2)

    #### 3 - Creating the modeling dataset
    # Defining the features
    features = sim.features

    # Building training and testing sets #
    index_train, index_test = train_test_split( range(N), test_size = 0.2)
    data_train = dataset.loc[index_train].reset_index( drop = True )
    data_test  = dataset.loc[index_test].reset_index( drop = True )

    # Creating the X, T and E input
    X_train, X_test = data_train[features], data_test[features]
    T_train, T_test = data_train['time'].values, data_test['time'].values
    E_train, E_test = data_train['event'].values, data_test['event'].values

    #### 4 - Creating an instance of the model and fitting the data.
    # Building the model
    csf = ConditionalSurvivalForestModel(num_trees=200)
    csf.fit(X_train, T_train, E_train,
            max_features="sqrt", max_depth=5, min_node_size=20,
            alpha = 0.05, minprop=0.1)

    #### 5 - Cross Validation / Model Performances
    c_index = concordance_index(csf, X_test, T_test, E_test) #0.81
    print('C-index: {:.2f}'.format(c_index))

    ibs = integrated_brier_score(csf, X_test, T_test, E_test, t_max=30,
                figure_size=(20, 6.5) )
    """
    # Collecting features names
    N, self.num_variables = X.shape
    if isinstance(X, pd.DataFrame):
        features = X.columns.tolist()
    else:
        features = ['x_{}'.format(i) for i in range(self.num_variables)]
    all_data_features = ["time", "event"] + features

    # Transforming the strings into bytes
    all_data_features = utils.as_bytes(all_data_features,
                                       python_version=PYTHON_VERSION)

    # Checking the format of the data
    X, T, E = utils.check_data(X, T, E)
    if X.ndim == 1:
        X = X.reshape(1, -1)
        T = T.reshape(1, -1)
        E = E.reshape(1, -1)
    input_data = np.c_[T, E, X]

    # Number of trees
    num_trees = self.num_trees

    # Seed
    if seed is None:
        seed = 0

    # sample_size_pct
    if not isinstance(sample_size_pct, float):
        error = "Error: Invalid value for sample_size_pct, "
        error += "please provide a value that is > 0 and <= 1."
        raise ValueError(error)

    if (sample_size_pct <= 0 or sample_size_pct > 1):
        error = "Error: Invalid value for sample_size_pct, "
        error += "please provide a value that is > 0 and <= 1."
        raise ValueError(error)

    # Split Rule
    splitrule = self.splitrule.lower()
    if splitrule == 'logrank':
        split_mode = 1
        alpha = 0
        minprop = 0
        num_random_splits = 1

    elif splitrule == "maxstat":
        split_mode = 4
        num_random_splits = 1

        # Maxstat splitting
        if not isinstance(alpha, float):
            error = "Error: Invalid value for alpha, "
            error += "please provide a value that is > 0 and < 1."
            raise ValueError(error)

        if (alpha <= 0 or alpha >= 1):
            error = "Error: Invalid value for alpha, "
            error += "please provide a value between 0 and 1."
            raise ValueError(error)

        if not isinstance(minprop, float):
            error = "Error: Invalid value for minprop, "
            error += "please provide a value between 0 and 0.5"
            raise ValueError(error)

        if (minprop < 0 or minprop > 0.5):
            error = "Error: Invalid value for minprop, "
            error += "please provide a value between 0 and 0.5"
            raise ValueError(error)

    elif splitrule == 'extratrees':
        split_mode = 5
        alpha = 0
        minprop = 0

    else:
        # split_mode used to be left undefined here, which surfaced
        # later as a NameError when calling the underlying model
        error = "{} is not a valid split rule".format(self.splitrule)
        raise ValueError(error)

    # Number of variables to possibly split at in each node
    self.max_features = max_features
    if isinstance(self.max_features, str):

        if self.max_features.lower() == 'sqrt':
            num_variables_to_use = int(np.sqrt(self.num_variables))

        elif 'log' in self.max_features.lower():
            num_variables_to_use = int(np.log(self.num_variables))

        elif self.max_features.lower() == 'all':
            num_variables_to_use = self.num_variables

        else:
            raise ValueError("Unknown max features option")

    elif isinstance(self.max_features, (float, int)):

        if 0 < self.max_features < 1:
            num_variables_to_use = int(self.num_variables *
                                       self.max_features)

        elif self.max_features >= 1:
            # int() so that a float such as 3.0 is passed on as a count
            num_variables_to_use = int(min(self.num_variables,
                                           self.max_features))
            if self.max_features > self.num_variables:
                msg = "max features value is greater than the number of "
                msg += "variables ({num_variables}) of the input X. "
                msg += "So it was set to {num_variables}."
                msg = msg.format(num_variables=self.num_variables)
                warnings.warn(msg, UserWarning)

        else:
            # Zero, negative or NaN
            raise ValueError("max features is a positive value")

    else:
        raise ValueError("Unknown max features option")

    # Defining importance mode: translating the name into the integer
    # code passed to the underlying model
    mode_name = importance_mode.lower()
    if 'permutation' in mode_name:
        if 'scaled' in mode_name or 'normalized' in mode_name:
            importance_mode = 2
        else:
            importance_mode = 3
    elif 'impurity' in mode_name:
        importance_mode = 5
    else:
        error = "{} is not a valid importance mode".format(importance_mode)
        raise ValueError(error)

    # Weights
    if weights is None:
        case_weights = [1. / N] * N
    else:
        case_weights = utils.check_data(weights)

        if abs(sum(case_weights) - 1.) >= 1e-4:
            raise Exception(
                "The sum of the weights needs to be equal to 1.")

        if len(case_weights) != N:
            raise Exception("weights length needs to be {} ".format(N))

    # Fitting the model using the C++ object
    verbose = True
    self.model.fit(input_data, all_data_features, case_weights, num_trees,
                   num_variables_to_use, min_node_size, max_depth, alpha,
                   minprop, num_random_splits, sample_size_pct,
                   importance_mode, split_mode, verbose, seed, num_threads,
                   save_memory)

    # Saving the attributes
    self.save_properties()
    self.get_time_buckets()

    # Extracting the Variable Importance
    self.variable_importance = {}
    for i, value in enumerate(self.variable_importance_):
        self.variable_importance[features[i]] = value

    # Saving the importance in a dataframe, most important feature first
    self.variable_importance_table = pd.DataFrame(
        data={'feature': list(self.variable_importance.keys()),
              'importance': list(self.variable_importance.values())},
        columns=['feature', 'importance']).\
        sort_values('importance', ascending=0).reset_index(drop=True)

    # Share of the (non-negative) importance carried by each feature.
    # When no feature has a positive importance the shares are set to 0
    # instead of dividing by zero.
    importance = self.variable_importance_table['importance'].values
    importance = np.maximum(importance, 0.)
    sum_imp = sum(importance) * 1.
    if sum_imp > 0:
        pct_importance = importance / sum_imp
    else:
        pct_importance = np.zeros(len(importance))
    self.variable_importance_table['pct_importance'] = pct_importance

    return self
def generate_data(self, num_samples=100, num_features=3,
                  feature_weights=None):
    """
    Simulate a survival dataset (features, observed time, event flag)
    whose survival times are produced by the model's risk and time
    functions.

    Parameters:
    -----------
    * `num_samples`: **int** *(default=100)* --
        Number of samples to generate

    * `num_features`: **int** *(default=3)* --
        Number of features to generate

    * `feature_weights`: **array-like** *(default=None)* --
        Coefficients of the underlying Cox model, one per feature.
        Each feature column is drawn by `self.random_data`.
        If None, then feature_weights = [1.]*num_features

    Returns:
    --------
    * dataset: pandas.DataFrame
        Columns `x_1 ... x_p`, `time` and `event`. The same frame is also
        stored on `self.dataset`.

    Raises:
    -------
    * ValueError if `feature_weights` does not contain exactly
      `num_features` values.

    Example:
    --------
    from pysurvival.models.simulations import SimulationModel

    # Initializing the simulation model
    sim = SimulationModel( survival_distribution = 'gompertz',
                           risk_type = 'linear',
                           censored_parameter = 5.0,
                           alpha = 0.01,
                           beta = 5., )

    # Generating N Random samples
    N = 1000
    dataset = sim.generate_data(num_samples = N, num_features=5)

    # Showing a few data-points
    dataset.head()
    """

    # Resolving the coefficients of the underlying model
    self.num_variables = num_features
    if feature_weights is not None:
        feature_weights = utils.check_data(feature_weights)
        if num_features != len(feature_weights):
            template = "The length of feature_weights ({}) "
            template += "and num_features ({}) are not the same."
            raise ValueError(
                template.format(len(feature_weights), num_features))
        self.feature_weights = feature_weights
    else:
        self.feature_weights = [1.] * self.num_variables

    # Drawing every feature column, one random draw per column
    X = np.zeros((num_samples, self.num_variables))
    for col in range(self.num_variables):
        # random_data returns a pair; only the sampled values are kept
        _, column_values = self.random_data(num_samples)
        X[:, col] = column_values
    columns = ['x_' + str(col + 1) for col in range(self.num_variables)]

    # Risk scores are computed on the scaled features
    X_std = self.scaler.fit_transform(X)
    BX = self.risk_function(X_std)

    # Survival times, then censoring times truncated at zero
    T = self.time_function(BX)
    censoring = np.random.normal(loc=self.censored_parameter, scale=5,
                                 size=num_samples)
    censoring = np.maximum(censoring, 0.)

    # Observed time is whichever comes first; event = 1 when it is T
    time = np.minimum(T, censoring)
    E = 1. * (T == time)

    # Building dataset (raw, unscaled features are stored)
    self.features = columns
    self.dataset = pd.DataFrame(data=np.c_[X, time, E],
                                columns=columns + ['time', 'event'])

    # Building the time axis and time buckets
    last_time = max(self.dataset['time'])
    self.times = np.linspace(0., last_time, self.bins)
    self.get_time_buckets()

    # Building baseline functions
    self.baseline_hazard = self.hazard_function(self.times, 0)
    self.baseline_survival = self.survival_function(self.times, 0)

    # Printing summary message
    summary = "Number of data-points: {} - Number of events: {}"
    print(summary.format(num_samples, sum(E)))

    return self.dataset
def compare_to_actual(model, X, T, E, times=None, is_at_risk=False,
                      figure_size=(16, 6), metrics=['rmse', 'mean', 'median'],
                      **kwargs):
    """
    Comparing the actual and predicted number of units at risk and units
    experiencing an event at each time t, plotting both curves and
    optionally returning error metrics.

    The "actual" curve comes from a Kaplan-Meier model fitted on (T, E);
    the "predicted" curve is the column-wise sum of the model predictions.

    Parameters:
    -----------
    * model : pysurvival model
        The model that will be used for prediction. It must expose
        `predict_survival`, `predict_density` and `time_buckets`.

    * X : array-like, shape=(n_samples, n_features)
        The input samples.

    * T : array-like, shape = [n_samples]
        The target values describing when the event of interest or
        censoring occurred

    * E : array-like, shape = [n_samples]
        The Event indicator array such that E = 1. if the event occurred
        E = 0. if censoring occurred

    * times: array-like, (default=None)
        A vector of timepoints. If None, the Kaplan-Meier time axis is used.

    * is_at_risk: bool (default=False)
        If True, compares the expected number of units at risk
        (survival function); otherwise compares the expected number of
        units experiencing the event (density function).

    * figure_size: tuple of double (default= (16, 6))
        width, height in inches representing the size of the chart

    * metrics: str or list/tuple/array of str
        (default=['rmse', 'mean', 'median'])
        Indicates the performance metrics to compute:
            - if None, then no metric is computed
            - if str, then the metric is computed
            - if list of str, then the metrics are computed
        The available metrics are:
            - RMSE: root mean squared error ('rmse' or 'root')
            - Mean Abs Error: mean absolute error ('mean')
            - Median Abs Error: median absolute error ('median')
        The default list is only read, never mutated.

    * **kwargs:
        Forwarded to the model's prediction method.

    Returns:
    --------
    * results: float, dict or None
        A float when `metrics` is a str, a dict keyed by metric name when
        it is a sequence, None when `metrics` is None.

    Raises:
    -------
    * NotImplementedError if no requested metric is recognised.
    """

    # Initializing the Kaplan-Meier model
    X, T, E = utils.check_data(X, T, E)
    kmf = KaplanMeierModel()
    kmf.fit(T, E)

    # Creating actual vs predicted
    N = T.shape[0]

    # Defining the time axis
    if times is None:
        times = kmf.times

    # Model predictions summed over all units, one value per time bucket
    if is_at_risk:
        model_predicted = np.sum(model.predict_survival(X, **kwargs), 0)
    else:
        model_predicted = np.sum(model.predict_density(X, **kwargs), 0)

    # Lower bounds of the model's time buckets; invariant across the loop
    bucket_starts = np.array([start for (start, _) in model.time_buckets])

    actual, actual_upper, actual_lower, predicted = [], [], [], []
    for t in times:
        # Closest bucket (by lower bound) to the requested time
        index = np.argmin(np.abs(bucket_starts - t))

        if is_at_risk:
            actual.append(N * kmf.predict_survival(None, t))
            actual_upper.append(N * kmf.predict_survival_upper(None, t))
            actual_lower.append(N * kmf.predict_survival_lower(None, t))
        else:
            actual.append(N * kmf.predict_density(None, t))
            # Bounds on the density: survival bounds times the hazard
            h = kmf.predict_hazard(None, t)
            actual_upper.append(N * kmf.predict_survival_upper(None, t) * h)
            actual_lower.append(N * kmf.predict_survival_lower(None, t) * h)

        predicted.append(model_predicted[index])

    # Computing the performance metrics
    results = None
    title = 'Actual vs Predicted'
    if metrics is not None:
        rmse = np.sqrt(mean_squared_error(actual, predicted))
        med_ae = median_absolute_error(actual, predicted)
        mae = mean_absolute_error(actual, predicted)

        if isinstance(metrics, str):
            name = metrics.lower()
            # 'median' is tested before 'mean' to keep the original priority
            if 'rmse' in name or 'root' in name:
                results = rmse
                title += "\n"
                title += "RMSE = {:.3f}".format(rmse)
            elif 'median' in name:
                results = med_ae
                title += "\n"
                title += "Median Abs Error = {:.3f}".format(med_ae)
            elif 'mean' in name:
                results = mae
                title += "\n"
                title += "Mean Abs Error = {:.3f}".format(mae)
            else:
                raise NotImplementedError(
                    '{} is not a valid metric function.'.format(metrics))

        # `np` (not `numpy`) is the alias used throughout this function;
        # tuples are accepted alongside lists and arrays.
        elif isinstance(metrics, (list, tuple, np.ndarray)):
            names = [m.lower() for m in metrics]
            results = {}

            is_rmse = any('rmse' in m or 'root' in m for m in names)
            if is_rmse:
                results['root_mean_squared_error'] = rmse
                title += "\n"
                title += "RMSE = {:.3f}".format(rmse)

            is_med_ae = any('median' in m for m in names)
            if is_med_ae:
                results['median_absolute_error'] = med_ae
                title += "\n"
                title += "Median Abs Error = {:.3f}".format(med_ae)

            is_mae = any('mean' in m for m in names)
            if is_mae:
                results['mean_absolute_error'] = mae
                title += "\n"
                title += "Mean Abs Error = {:.3f}".format(mae)

            if not (is_mae or is_rmse or is_med_ae):
                error = 'The provided metrics are not available.'
                raise NotImplementedError(error)

    # Plotting
    fig, ax = plt.subplots(figsize=figure_size)
    ax.plot(times, actual, color='red', label='Actual', alpha=0.8, lw=3)
    ax.plot(times, predicted, color='blue', label='Predicted',
            alpha=0.8, lw=3)
    plt.xlim(0, max(T))

    # Filling the areas between the Survival and Confidence Intervals curves
    plt.fill_between(times, actual, actual_lower,
                     label='Confidence Intervals - Lower', color='red',
                     alpha=0.2)
    plt.fill_between(times, actual, actual_upper,
                     label='Confidence Intervals - Upper', color='red',
                     alpha=0.2)

    # Finalizing the chart
    plt.title(title, fontsize=15)
    plt.legend(fontsize=15)
    plt.show()

    return results
def fit(self, X, T, E, init_method='glorot_normal', lr=1e-2, max_iter=100,
        l2_reg=1e-2, alpha=0.95, tol=1e-3, verbose=True):
    """
    Fitting a proportional hazards regression model using the Efron's
    approximation method to take into account tied times.

    As the Hessian matrix of the log-likelihood can be
    calculated without too much effort, the model parameters are
    computed using the Newton_Raphson Optimization scheme:
            W_new = W_old - lr * Hessian^(-1) . gradient

    Arguments:
    ---------
    * `X` : **array-like**, *shape=(n_samples, n_features)* --
        The input samples. When a pandas DataFrame is given, its column
        names are used as variable names; otherwise `x_0, x_1, ...`.

    * `T` : **array-like** --
        The target values describing when the event of interest or
        censoring occurred.

    * `E` : **array-like** --
        The values that indicate if the event of interest occurred i.e.:
        E[i]=1 corresponds to an event, and E[i] = 0 means censoring,
        for all i.

    * `init_method` : **str** *(default = 'glorot_normal')* --
        Initialization method to use. Here are the possible options:

        * `glorot_uniform`: Glorot/Xavier uniform initializer
        * `he_uniform`: He uniform variance scaling initializer
        * `uniform`: Initializing tensors with uniform (-1, 1) distribution
        * `glorot_normal`: Glorot normal initializer,
        * `he_normal`: He normal initializer.
        * `normal`: Initializing tensors with standard normal distribution
        * `ones`: Initializing tensors to 1
        * `zeros`: Initializing tensors to 0
        * `orthogonal`: Initializing tensors with a orthogonal matrix,

    * `lr`: **float** *(default=1e-2)* --
        learning rate used in the optimization

    * `max_iter`: **int** *(default=100)* --
        The maximum number of iterations in the Newton optimization

    * `l2_reg`: **float** *(default=1e-2)* --
        L2 regularization parameter for the model coefficients

    * `alpha`: **float** *(default=0.95)* --
        Confidence level, passed to `get_summary`

    * `tol`: **float** *(default=1e-3)* --
        Tolerance for stopping criteria

    * `verbose`: **bool** *(default=True)* --
        Whether or not producing detailed logging about the modeling

    Returns:
    --------
    * self : object

    Example:
    --------

    #### 1 - Importing packages
    import numpy as np
    import pandas as pd
    from matplotlib import pyplot as plt
    from sklearn.model_selection import train_test_split
    from pysurvival.models.simulations import SimulationModel
    from pysurvival.models.semi_parametric import CoxPHModel
    from pysurvival.utils.metrics import concordance_index
    from pysurvival.utils.display import integrated_brier_score
    #%pylab inline  # To use with Jupyter notebooks

    #### 2 - Generating the dataset from a Log-Logistic parametric model
    # Initializing the simulation model
    sim = SimulationModel( survival_distribution = 'log-logistic',
                           risk_type = 'linear',
                           censored_parameter = 10.1,
                           alpha = 0.1, beta=1.2 )

    # Generating N random samples
    N = 1000
    dataset = sim.generate_data(num_samples = N, num_features = 3)

    #### 3 - Creating the modeling dataset
    # Defining the features
    features = sim.features

    # Building training and testing sets #
    index_train, index_test = train_test_split( range(N), test_size = 0.2)
    data_train = dataset.loc[index_train].reset_index( drop = True )
    data_test  = dataset.loc[index_test].reset_index( drop = True )

    # Creating the X, T and E input
    X_train, X_test = data_train[features], data_test[features]
    T_train, T_test = data_train['time'].values, data_test['time'].values
    E_train, E_test = data_train['event'].values, data_test['event'].values

    #### 4 - Creating an instance of the Cox PH model and fitting the data.
    # Building the model
    coxph = CoxPHModel()
    coxph.fit(X_train, T_train, E_train, lr=0.5, l2_reg=1e-2,
        init_method='zeros')

    #### 5 - Cross Validation / Model Performances
    c_index = concordance_index(coxph, X_test, T_test, E_test) #0.92
    print('C-index: {:.2f}'.format(c_index))

    ibs = integrated_brier_score(coxph, X_test, T_test, E_test, t_max=10,
                figure_size=(20, 6.5) )

    References:
    -----------
    * https://en.wikipedia.org/wiki/Proportional_hazards_model#Tied_times
    * Efron, Bradley (1977). "The Efficiency of Cox's Likelihood
      Function for Censored Data". Journal of the American Statistical
      Association. 72 (359): 557-565.
    """

    # Column names are captured first, in case check_data drops them
    if isinstance(X, pd.DataFrame):
        feature_names = X.columns.tolist()
    else:
        feature_names = None

    # Checking the format of the data; the shape is read afterwards so
    # that any input accepted by check_data (not only objects exposing
    # `.shape`) is supported.
    X, T, E = utils.check_data(X, T, E)
    _, self.num_vars = X.shape

    if feature_names is None:
        feature_names = ['x_{}'.format(i) for i in range(self.num_vars)]
    self.variables = feature_names

    # Sorting X, T, E in descending order according to T
    order = np.argsort(-T)
    T = T[order]
    E = E[order]
    X = self.scaler.fit_transform(X[order, :])
    self.std_scale = np.sqrt(self.scaler.var_)

    # Initializing the model
    self.model = _CoxPHModel()

    # Creating the time axis
    self.model.get_times(T, E)

    # Initializing the parameters
    W = np.zeros(self.num_vars)
    W = opt.initialization(init_method, W, False).flatten()
    W = W.astype(np.float64)

    # Optimizing to find best parameters
    epsilon = 1e-9
    self.model.newton_optimization(X, T, E, W, lr, l2_reg, tol, epsilon,
                                   max_iter, verbose)

    # Saving the Cython attributes in the Python object
    self.weights = np.array(self.model.W)
    self.loss = self.model.loss
    self.times = np.array(self.model.times)
    self.gradient = np.array(self.model.gradient)
    self.Hessian = np.array(self.model.Hessian)
    self.inv_Hessian = np.array(self.model.inv_Hessian)
    self.loss_values = np.array(self.model.loss_values)
    self.grad2_values = np.array(self.model.grad2_values)

    # Computing baseline functions
    score = np.exp(np.dot(X, self.weights))
    baselines = _baseline_functions(score, T, E)

    # Saving the baseline functions in the Python object
    self.baseline_hazard = np.array(baselines[1])
    self.baseline_survival = np.array(baselines[2])

    # The optimizer object is dropped once its attributes are copied
    del self.model
    self.get_time_buckets()

    # Calculating summary
    self.get_summary(alpha)

    return self
def fit(self, X, T, E, init_method='glorot_uniform', optimizer='adam',
        lr=1e-4, num_epochs=1000, dropout=0.2, batch_normalization=False,
        bn_and_dropout=False, l2_reg=1e-5, verbose=True):
    """
    Fit the estimator based on the given parameters.

    Parameters:
    -----------
    * `X` : **array-like**, *shape=(n_samples, n_features)* --
        The input samples.

    * `T` : **array-like** --
        The target values describing when the event of interest or
        censoring occurred.

    * `E` : **array-like** --
        The values that indicate if the event of interest occurred i.e.:
        E[i]=1 corresponds to an event, and E[i] = 0 means censoring,
        for all i.

    * `init_method` : **str** *(default = 'glorot_uniform')* --
        Initialization method to use. Here are the possible options:

        * `glorot_uniform`: Glorot/Xavier uniform initializer
        * `he_uniform`: He uniform variance scaling initializer
        * `uniform`: Initializing tensors with uniform (-1, 1) distribution
        * `glorot_normal`: Glorot normal initializer,
        * `he_normal`: He normal initializer.
        * `normal`: Initializing tensors with standard normal distribution
        * `ones`: Initializing tensors to 1
        * `zeros`: Initializing tensors to 0
        * `orthogonal`: Initializing tensors with a orthogonal matrix,

    * `optimizer`: **str** *(default = 'adam')* --
        iterative method for optimizing a differentiable objective
        function. Here are the possible options:

        - `adadelta`
        - `adagrad`
        - `adam`
        - `adamax`
        - `rmsprop`
        - `sparseadam`
        - `sgd`

    * `lr`: **float** *(default=1e-4)* --
        learning rate used in the optimization

    * `num_epochs`: **int** *(default=1000)* --
        The number of iterations in the optimization

    * `dropout`: **float** *(default=0.2)* --
        Randomly sets a fraction rate of input units to 0
        at each update during training time, which helps prevent
        overfitting.

    * `batch_normalization`: **bool** *(default=False)* --
        Applying Batch Normalization or not

    * `bn_and_dropout`: **bool** *(default=False)* --
        Applying Batch Normalization and Dropout at the same time

    * `l2_reg`: **float** *(default=1e-5)* --
        L2 regularization parameter for the model coefficients

    * `verbose`: **bool** *(default=True)* --
        Whether or not producing detailed logging about the modeling

    Returns:
    --------
    * self : object

    Example:
    --------

    #### 1 - Importing packages
    import numpy as np
    import pandas as pd
    from matplotlib import pyplot as plt
    from sklearn.model_selection import train_test_split
    from pysurvival.models.simulations import SimulationModel
    from pysurvival.models.semi_parametric import NonLinearCoxPHModel
    from pysurvival.utils.metrics import concordance_index
    from pysurvival.utils.display import integrated_brier_score
    #%matplotlib inline  # To use with Jupyter notebooks

    #### 2 - Generating the dataset from a nonlinear Weibull parametric model
    # Initializing the simulation model
    sim = SimulationModel( survival_distribution = 'weibull',
                           risk_type = 'Gaussian',
                           censored_parameter = 2.1,
                           alpha = 0.1, beta=3.2 )

    # Generating N random samples
    N = 1000
    dataset = sim.generate_data(num_samples = N, num_features=3)

    # Showing a few data-points
    dataset.head(2)

    #### 3 - Creating the modeling dataset
    # Defining the features
    features = sim.features

    # Building training and testing sets #
    index_train, index_test = train_test_split( range(N), test_size = 0.2)
    data_train = dataset.loc[index_train].reset_index( drop = True )
    data_test  = dataset.loc[index_test].reset_index( drop = True )

    # Creating the X, T and E input
    X_train, X_test = data_train[features], data_test[features]
    T_train, T_test = data_train['time'].values, data_test['time'].values
    E_train, E_test = data_train['event'].values, data_test['event'].values

    #### 4 - Creating an instance of the NonLinear CoxPH model and fitting
    # the data.

    # Defining the MLP structure. Here we will build a 1-hidden layer
    # with 150 units and `BentIdentity` as its activation function
    structure = [ {'activation': 'BentIdentity', 'num_units': 150},  ]

    # Building the model
    nonlinear_coxph = NonLinearCoxPHModel(structure=structure)
    nonlinear_coxph.fit(X_train, T_train, E_train, lr=1e-3,
        init_method='xav_uniform')

    #### 5 - Cross Validation / Model Performances
    c_index = concordance_index(nonlinear_coxph, X_test, T_test, E_test)
    print('C-index: {:.2f}'.format(c_index))

    ibs = integrated_brier_score(nonlinear_coxph, X_test, T_test, E_test,
                t_max=10, figure_size=(20, 6.5) )
    """

    # Checking data format (i.e.: transforming into numpy array)
    X, T, E = utils.check_data(X, T, E)

    # Extracting data parameters
    _, self.num_vars = X.shape
    input_shape = self.num_vars

    # Scaling data; the raw features are used when auto-scaling is off
    # (previously X_original was left undefined in that case).
    if self.auto_scaler:
        X_original = self.scaler.fit_transform(X)
    else:
        X_original = X

    # Sorting X, T, E in descending order according to T
    order = np.argsort(-T)
    T = T[order]
    E = E[order]
    X_original = X_original[order, :]

    # Time axis made of the distinct event times
    self.times = np.unique(T[E.astype(bool)])
    self.nb_times = len(self.times)
    self.get_time_buckets()

    # Using the GPU when one is available, the CPU otherwise
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def to_tensor(values):
        # float32 tensor on the selected device
        return torch.as_tensor(values, dtype=torch.float32, device=device)

    # Initializing the model on the same device as its inputs
    model = nn.NeuralNet(input_shape, 1, self.structure,
                         init_method, dropout, batch_normalization,
                         bn_and_dropout)
    model = model.to(device)

    # Input features used to calculate the loss
    X = to_tensor(X_original)

    # Computing the Risk and Fail tensors
    Risk, Fail = self.risk_fail_matrix(T, E)
    Risk = to_tensor(Risk)
    Fail = to_tensor(Fail)

    # Computing Efron's matrices
    Efron_coef, Efron_one, Efron_anti_one = self.efron_matrix()
    Efron_coef = to_tensor(Efron_coef)
    Efron_one = to_tensor(Efron_one)
    Efron_anti_one = to_tensor(Efron_anti_one)

    # Performing order 1 optimization
    model, loss_values = opt.optimize(self.loss_function, model, optimizer,
                                      lr, num_epochs, verbose, X=X,
                                      Risk=Risk, Fail=Fail,
                                      Efron_coef=Efron_coef,
                                      Efron_one=Efron_one,
                                      Efron_anti_one=Efron_anti_one,
                                      l2_reg=l2_reg)

    # Saving attributes
    self.model = model.eval()
    self.loss_values = loss_values

    # Calculating risk_score on the training inputs (X is already the
    # tensor version of X_original, so no second conversion is needed)
    score = np.exp(self.model(X).data.cpu().numpy().flatten())
    baselines = _baseline_functions(score, T, E)

    # Saving the baseline functions in the Python object
    self.times = np.array(baselines[0])
    self.baseline_hazard = np.array(baselines[1])
    self.baseline_survival = np.array(baselines[2])

    return self
def fit(self, T, E, weights=None, alpha=0.95):
    """
    Fitting the model according to the provided data.

    Parameters:
    -----------
    * `T` : **array-like** --
        The target values describing when the event of interest or
        censoring occurred.

    * `E` : **array-like** --
        The values that indicate if the event of interest occurred i.e.:
        E[i]=1 corresponds to an event, and E[i] = 0 means censoring,
        for all i.

    * `weights` : **array-like** *(default = None)* --
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given a unit weight.

    * `alpha`: **float** *(default = 0.95)* --
        Confidence level of the intervals; the normal quantile used is
        `norm.ppf((1 - alpha) / 2)`.

    Returns:
    --------
    * self : object

    Raises:
    -------
    * MemoryError if the fitted survival values sum to zero or less.

    Example:
    --------

    # Importing modules
    import numpy as np
    from matplotlib import pyplot as plt
    from pysurvival.utils.display import display_non_parametric
    # %matplotlib inline #Uncomment when using Jupyter

    # Generating random times and event indicators
    T = np.round(np.abs(np.random.normal(10, 10, 1000)), 1)
    E = np.random.binomial(1, 0.3, 1000)

    # Initializing the KaplanMeierModel
    from pysurvival.models.non_parametric import KaplanMeierModel
    km_model = KaplanMeierModel()

    # Fitting the model
    km_model.fit(T, E, alpha=0.95)

    # Displaying the survival function and confidence intervals
    display_non_parametric(km_model)

    # Initializing the SmoothKaplanMeierModel
    from pysurvival.models.non_parametric import SmoothKaplanMeierModel
    skm_model = SmoothKaplanMeierModel(bandwith=0.1, kernel='normal')

    # Fitting the model
    skm_model.fit(T, E)

    # Displaying the survival function and confidence intervals
    display_non_parametric(skm_model)
    """

    # Checking the format of the data
    T, E = utils.check_data(T, E)

    # Unit weight for every sample unless weights are supplied
    if weights is None:
        weights = [1.] * T.shape[0]

    # Normal quantile for the confidence intervals
    z = stats.norm.ppf((1. - alpha) / 2.)

    # Building the Kaplan-Meier model
    survival = self.model.fit(T, E, weights, z)

    # A non-positive total is reported as a failed kernel computation
    if sum(survival) <= 0.:
        mem_error = "The kernel matrix cannot fit in memory. "
        mem_error += "You should use a bigger bandwidth b"
        raise MemoryError(mem_error)

    # Saving all properties
    self.save_properties()

    # Generating the Survival table (skipped for the smooth variant)
    if 'smooth' not in self.name.lower():
        self.get_survival_table()

    # Returned for chaining, as documented and as the other fit methods do
    return self
def fit(self, X, T, E, with_bias=True, init_method='glorot_normal',
        lr=1e-2, max_iter=100, l2_reg=1e-4, tol=1e-3, verbose=True):
    """
    Fitting a Survival Support Vector Machine model.

    As the Hessian matrix of the log-likelihood can be
    calculated without too much effort, the model parameters are
    computed using the Newton_Raphson Optimization scheme:
            W_new = W_old - lr * Hessian^(-1) . gradient

    Arguments:
    ---------

    * `X` : array-like, shape=(n_samples, n_features)
        The input samples. When a pandas DataFrame is given, its column
        names are used as variable names; otherwise `x_0, x_1, ...`.

    * `T` : array-like, shape = [n_samples]
        The target values describing when the event of interest or
        censoring occurred

    * `E` : array-like, shape = [n_samples]
        The Event indicator array such that E = 1. if the event occurred
        E = 0. if censoring occurred

    * `with_bias`: bool (default=True)
        Whether a bias should be added

    * `init_method` : str (default = 'glorot_normal')
        Initialization method to use. Here are the possible options:
        * 'glorot_uniform': Glorot/Xavier uniform initializer,
        * 'he_uniform': He uniform variance scaling initializer
        * 'uniform': Initializing tensors with uniform (-1, 1) distribution
        * 'glorot_normal': Glorot normal initializer,
        * 'he_normal': He normal initializer.
        * 'normal': Initializing tensors with standard normal distribution
        * 'ones': Initializing tensors to 1
        * 'zeros': Initializing tensors to 0
        * 'orthogonal': Initializing tensors with a orthogonal matrix,

    * `lr`: float (default=1e-2)
        learning rate used in the optimization

    * `max_iter`: int (default=100)
        The maximum number of iterations in the Newton optimization

    * `l2_reg`: float (default=1e-4)
        L2 regularization parameter for the model coefficients

    * `tol`: float (default=1e-3)
        Tolerance for stopping criteria

    * `verbose`: bool (default=True)
        Whether or not producing detailed logging about the modeling

    Returns:
    --------
    * self : object

    Example:
    --------

    #### 1 - Importing packages
    import numpy as np
    import pandas as pd
    from pysurvival.models.svm import LinearSVMModel
    from pysurvival.models.svm import KernelSVMModel
    from pysurvival.models.simulations import SimulationModel
    from pysurvival.utils.metrics import concordance_index
    from sklearn.model_selection import train_test_split
    from scipy.stats import pearsonr
    # %pylab inline # to use in jupyter notebooks

    #### 2 - Generating the dataset from the parametric model
    # Initializing the simulation model
    sim = SimulationModel( survival_distribution = 'Log-Logistic',
                           risk_type = 'linear',
                           censored_parameter = 1.1,
                           alpha = 1.5, beta = 4)

    # Generating N Random samples
    N = 1000
    dataset = sim.generate_data(num_samples = N, num_features = 4)

    #### 3 - Splitting the dataset into training and testing sets
    # Defining the features
    features = sim.features

    # Building training and testing sets #
    index_train, index_test = train_test_split( range(N), test_size = 0.2)
    data_train = dataset.loc[index_train].reset_index( drop = True )
    data_test  = dataset.loc[index_test].reset_index( drop = True )

    # Creating the X, T and E input
    X_train, X_test = data_train[features], data_test[features]
    T_train, T_test = data_train['time'].values, data_test['time'].values
    E_train, E_test = data_train['event'].values, data_test['event'].values

    #### 4 - Creating an instance of the SVM model and fitting the data.
    svm_model = LinearSVMModel()
    svm_model = KernelSVMModel(kernel='Gaussian', scale=0.25)
    svm_model.fit(X_train, T_train, E_train, init_method='he_uniform',
        with_bias = True, lr = 0.5,  tol = 1e-3,  l2_reg = 1e-3)

    #### 5 - Cross Validation / Model Performances
    c_index = concordance_index(svm_model, X_test, T_test, E_test) #0.93
    print('C-index: {:.2f}'.format(c_index))

    #### 6 - Comparing the model predictions to Actual risk score
    # Comparing risk scores
    svm_risks = svm_model.predict_risk(X_test)
    actual_risks = sim.predict_risk(X_test).flatten()
    print("corr={:.4f}, p_value={:.5f}".format(
        *pearsonr(svm_risks, actual_risks)))  # corr=-0.9992, p_value=0.00000
    """

    # Column names are captured first, in case check_data drops them
    if isinstance(X, pd.DataFrame):
        feature_names = X.columns.tolist()
    else:
        feature_names = None

    # Checking the format of the data; the shape is read afterwards so
    # that any input accepted by check_data (not only objects exposing
    # `.shape`) is supported.
    X, T, E = utils.check_data(X, T, E)
    N, self.num_vars = X.shape

    if feature_names is None:
        feature_names = ['x_{}'.format(i) for i in range(self.num_vars)]
    self.variables = feature_names

    # Adding a bias or not
    self.with_bias = with_bias
    if with_bias:
        self.variables += ['intercept']
    p = int(self.num_vars + 1. * with_bias)

    if with_bias:
        # Adding the intercept
        X = np.c_[X, [1.] * N]

    # NOTE(review): the intercept column is appended before scaling; if
    # self.scaler centers each column, this constant column would become
    # all zeros -- confirm this is intended.
    X = self.scaler.fit_transform(X)

    # Initializing the parameters: one weight per variable for
    # kernel_type 0, one per sample otherwise
    if self.kernel_type == 0:
        W = np.zeros((p, 1))
    else:
        W = np.zeros((N, 1))
    W = opt.initialization(init_method, W, False).flatten()
    W = W.astype(np.float64)

    # Optimizing to find best parameters
    self.model.newton_optimization(X, T, E, W, lr, l2_reg, tol, max_iter,
                                   verbose)
    self.save_properties()

    return self
def concordance_index(model, X, T, E, include_ties=True,
                      additional_results=False, **kwargs):
    """
    Computing the C-index based on *On The C-Statistics For Evaluating Overall
    Adequacy Of Risk Prediction Procedures With Censored Survival Data* and
    *Estimating the Concordance Probability in a Survival Analysis
    with a Discrete Number of Risk Groups* and *Concordance for Survival
    Time Data: Fixed and Time-Dependent Covariates and Possible Ties in
    Predictor and Time*

    Similarly to the AUC, C-index = 1 corresponds to the best model
    prediction, and C-index = 0.5 represents a random prediction.

    Parameters:
    -----------
    * model : Pysurvival object
        Pysurvival model; its `predict_risk(X, **kwargs)` provides the
        risk scores.

    * X : array-like, shape=(n_samples, n_features)
        The input samples.

    * T : array-like, shape = [n_samples]
        The target values describing when the event of interest or
        censoring occurred

    * E : array-like, shape = [n_samples]
        The Event indicator array such that E = 1. if the event occurred
        E = 0. if censoring occurred

    * include_ties: bool (default=True)
        Specifies whether ties in risk score are included in calculations

    * additional_results: bool (default=False)
        Specifies whether only the c-index should be returned (False)
        or the full set of values (True), documented as:
            - c-index
            - nb_pairs
            - nb_concordant_pairs

    * **kwargs:
        Forwarded to `model.predict_risk`.

    Returns:
    --------
    * results: double or indexable collection
        - the c-index only (first element) if additional_results = False
        - the complete output of the underlying computation otherwise,
          documented as results[0] = C-index; results[1] = nb_pairs;
          results[2] = nb_concordant_pairs
    """

    # Risk scores first, then the format check on all three vectors
    predicted_risk = model.predict_risk(X, **kwargs)
    risk, T, E = utils.check_data(predicted_risk, T, E)

    # Re-indexing risk, T and E by decreasing T
    descending = np.argsort(-T)
    risk, T, E = risk[descending], T[descending], E[descending]

    # Calculating the c-index
    results = _concordance_index(risk, T, E, include_ties)

    return results if additional_results else results[0]
def fit(self, X, T, E, init_method='glorot_uniform', optimizer='adam',
        lr=1e-4, num_epochs=1000, dropout=0.2, l2_reg=1e-2, l2_smooth=1e-2,
        batch_normalization=False, bn_and_dropout=False, verbose=True,
        extra_pct_time=0.1, is_min_time_zero=True, max_norm=1.0,
        min_clamp_value=1e-8,
        max_clamp_value=torch.finfo(torch.float32).max - 1):
    """
    Fit the estimator based on the given parameters.

    Parameters:
    -----------
    * `X` : **array-like**, *shape=(n_samples, n_features)* --
        The input samples.

    * `T` : **array-like** --
        The target values describing when the event of interest or
        censoring occurred.

    * `E` : **array-like** --
        The values that indicate if the event of interest occurred i.e.:
        E[i]=1 corresponds to an event, and E[i] = 0 means censoring,
        for all i.

    * `init_method` : **str** *(default = 'glorot_uniform')* --
        Initialization method to use. Here are the possible options:

        * `glorot_uniform`: Glorot/Xavier uniform initializer
        * `he_uniform`: He uniform variance scaling initializer
        * `uniform`: Initializing tensors with uniform (-1, 1) distribution
        * `glorot_normal`: Glorot normal initializer,
        * `he_normal`: He normal initializer.
        * `normal`: Initializing tensors with standard normal distribution
        * `ones`: Initializing tensors to 1
        * `zeros`: Initializing tensors to 0
        * `orthogonal`: Initializing tensors with a orthogonal matrix,

    * `optimizer`: **str** *(default = 'adam')* --
        iterative method for optimizing a differentiable objective
        function. Here are the possible options:

        - `adadelta`
        - `adagrad`
        - `adam`
        - `adamax`
        - `rmsprop`
        - `sparseadam`
        - `sgd`

    * `lr`: **float** *(default=1e-4)* --
        learning rate used in the optimization

    * `num_epochs`: **int** *(default=1000)* --
        The number of iterations in the optimization

    * `dropout`: **float** *(default=0.2)* --
        Randomly sets a fraction rate of input units to 0
        at each update during training time, which helps prevent
        overfitting.

    * `l2_reg`: **float** *(default=1e-2)* --
        L2 regularization parameter for the model coefficients

    * `l2_smooth`: **float** *(default=1e-2)* --
        Second L2 regularizer that ensures the parameters vary smoothly
        across consecutive time points.

    * `batch_normalization`: **bool** *(default=False)* --
        Applying Batch Normalization or not

    * `bn_and_dropout`: **bool** *(default=False)* --
        Applying Batch Normalization and Dropout at the same time

    * `verbose`: **bool** *(default=True)* --
        Whether or not producing detailed logging about the modeling

    * `extra_pct_time`: **float** *(default=0.1)* --
        Providing an extra fraction of time in the time axis

    * `is_min_time_zero`: **bool** *(default=True)* --
        Whether the time axis starts at 0

    * `max_norm`: **float** *(default=1.0)* --
        Max l2 norm for gradient clipping

    * `min_clamp_value`, `max_clamp_value`: **float** --
        Forwarded to the loss function through the optimizer; presumably
        the bounds used when clamping values inside the loss (to be
        confirmed against the loss implementation).

    **Returns:**

    * self : object

    Example:
    --------

    #### 1 - Importing packages
    import numpy as np
    import pandas as pd
    from matplotlib import pyplot as plt
    from sklearn.model_selection import train_test_split
    from pysurvival.models.simulations import SimulationModel
    from pysurvival.models.multi_task import LinearMultiTaskModel
    from pysurvival.utils.metrics import concordance_index
    #%matplotlib inline  # To use with Jupyter notebooks

    #### 2 - Generating the dataset from a Weibull parametric model
    # Initializing the simulation model
    sim = SimulationModel( survival_distribution = 'Weibull',
                           risk_type = 'linear',
                           censored_parameter = 10.0,
                           alpha = .01, beta = 3.0 )

    # Generating N random samples
    N = 1000
    dataset = sim.generate_data(num_samples = N, num_features = 3)

    # Showing a few data-points
    time_column = 'time'
    event_column = 'event'
    dataset.head(2)

    #### 3 - Creating the modeling dataset
    # Defining the features
    features = sim.features

    # Building training and testing sets #
    index_train, index_test = train_test_split( range(N), test_size = 0.2)
    data_train = dataset.loc[index_train].reset_index( drop = True )
    data_test  = dataset.loc[index_test].reset_index( drop = True )

    # Creating the X, T and E input
    X_train, X_test = data_train[features], data_test[features]
    T_train, T_test = data_train['time'].values, data_test['time'].values
    E_train, E_test = data_train['event'].values, data_test['event'].values

    #### 4 - Initializing a MTLR model and fitting the data.
    # Building a Linear model
    mtlr = LinearMultiTaskModel(bins=50)
    mtlr.fit(X_train, T_train, E_train, lr=5e-3, init_method='orthogonal')

    # Building a Neural MTLR
    # structure = [ {'activation': 'Swish', 'num_units': 150},  ]
    # mtlr = NeuralMultiTaskModel(structure=structure, bins=150)
    # mtlr.fit(X_train, T_train, E_train, lr=5e-3, init_method='orthogonal')

    #### 5 - Cross Validation / Model Performances
    c_index = concordance_index(mtlr, X_test, T_test, E_test) #0.95
    print('C-index: {:.2f}'.format(c_index))
    """

    # Checking data format (i.e.: transforming into numpy array)
    X, T, E = utils.check_data(X, T, E)

    # Extracting data parameters and scaling, for one or several inputs
    input_shape = []
    if not isinstance(X, list):
        # Single input matrix
        _, self.num_vars = X.shape
        input_shape.append(self.num_vars)

        if self.auto_scaler:
            X = self.scaler.fit_transform(X)
    else:
        # Several input matrices: one entry of input_shape per matrix
        for matrix in X:
            _, n_features = matrix.shape
            input_shape.append(n_features)

        # Each matrix is scaled in place with the shared scaler
        if self.auto_scaler:
            for position, matrix in enumerate(X):
                X[position] = self.scaler.fit_transform(matrix)

    # Building the time axis, time buckets and output Y
    xy_parts = self.compute_XY(X, T, E, is_min_time_zero, extra_pct_time)
    X_cens, X_uncens, Y_cens, Y_uncens = xy_parts

    # Initializing the model
    model = nn.NeuralNet(input_shape, self.num_times, self.structure,
                         init_method, dropout, batch_normalization,
                         bn_and_dropout)

    # Creating the Triangular matrix
    lower_tri = np.tri(self.num_times, self.num_times + 1, dtype=np.float32)
    Triangle = torch.FloatTensor(lower_tri)

    # Moving model and matrix to the GPU when one is available
    if torch.cuda.is_available():
        model = model.cuda()
        Triangle = Triangle.cuda()

    # Performing order 1 optimization
    loss_arguments = dict(
        X_cens=X_cens, X_uncens=X_uncens,
        Y_cens=Y_cens, Y_uncens=Y_uncens,
        Triangle=Triangle,
        l2_reg=l2_reg, l2_smooth=l2_smooth,
        max_norm=max_norm,
        min_clamp_value=min_clamp_value,
        max_clamp_value=max_clamp_value,
    )
    model, loss_values = opt.optimize(self.loss_function, model, optimizer,
                                      lr, num_epochs, verbose,
                                      **loss_arguments)

    # Saving attributes
    self.model = model.eval()
    self.loss_values = loss_values

    return self
def brier_score(model, X, T, E, t_max=None, use_mean_point=True, **kwargs):
    """ Computes the Brier score curve of a survival model for t <= t_max.

        The Brier score is the average squared distance between the
        observed survival status and the predicted survival probability.
        With right censoring the squared distances have to be weighted to
        avoid bias; the inverse probability of censoring weights (IPCW)
        method (Graf et al. 1999; Gerds and Schumacher 2006) does so with
        a Kaplan-Meier estimate of the conditional survival function of
        the censoring times:

            BS(t) = 1/N * sum_{i=1..N} W_i(t) * (Y_i(t) - S_i(t))^2

        As a benchmark, a useful model has a Brier score below 0.25:
        predicting S(t, x_i) = 0.5 for every i gives BS(t) = 0.25.

        Parameters:
        -----------
        * model : Pysurvival object
            Pysurvival model; it must provide `predict_survival`, `times`
            and `time_buckets`.

        * X : array-like, shape=(n_samples, n_features)
            The input samples.

        * T : array-like, shape = [n_samples]
            The target values describing when the event of interest or
            censoring occurred.

        * E : array-like, shape = [n_samples]
            The event indicator array such that E = 1. if the event
            occurred and E = 0. if censoring occurred.

        * t_max : float (default=None)
            Maximal time for estimating the prediction error curves.
            If None or not strictly positive, the largest value of T
            is used.

        * use_mean_point : bool (default=True)
            Passed through unchanged to `_brier_score`.
            NOTE(review): presumably selects the representative time of
            each time bucket -- confirm against `_brier_score`.

        * **kwargs :
            Extra keyword arguments handed to `model.predict_survival`.

        Returns:
        --------
        * (times, brier_scores) : tuple of arrays
            - times represents the time axis at which the brier scores
              were computed
            - brier_scores represents the values of the brier scores
    """

    # Normalizing the format of the targets
    T, E = utils.check_data(T, E)

    # Predicted survival curves; no specific time is requested (None)
    predicted_survival = model.predict_survival(X, None, **kwargs)

    # Time axis and time buckets of the fitted model
    time_axis = model.times
    buckets = model.time_buckets

    # Re-arranging predictions and targets by decreasing T
    descending = np.argsort(-T)
    predicted_survival = predicted_survival[descending, :]
    T, E = T[descending], E[descending]

    # Falling back on the largest observed time when t_max is unusable
    if t_max is None or t_max <= 0.:
        t_max = max(T)

    # Brier scores at each t <= t_max
    scores = _brier_score(predicted_survival, T, E, t_max, time_axis,
                          buckets, use_mean_point)

    return (scores[0], scores[1])