def fit(self, X, T, E, init_method='glorot_normal', lr=1e-2, max_iter=100,
        l2_reg=1e-2, alpha=0.95, tol=1e-3, verbose=True):
    """ Fit a proportional hazards regression model, using Efron's
    approximation method to take tied event times into account.

    Because the Hessian matrix of the log-likelihood can be calculated
    without too much effort, the model parameters are computed with the
    Newton-Raphson optimization scheme:
        W_new = W_old - lr * <Hessian^(-1), gradient>

    Arguments:
    ---------
    * `X` : **array-like**, *shape=(n_samples, n_features)* --
        The input samples.

    * `T` : **array-like** --
        The target values describing when the event of interest or
        censoring occurred.

    * `E` : **array-like** --
        The values that indicate if the event of interest occurred i.e.:
        E[i]=1 corresponds to an event, and E[i] = 0 means censoring,
        for all i.

    * `init_method` : **str** *(default = 'glorot_normal')* --
        Initialization method to use. Here are the possible options:
            * `glorot_uniform`: Glorot/Xavier uniform initializer
            * `he_uniform`: He uniform variance scaling initializer
            * `uniform`: Initializing tensors with uniform (-1, 1) distribution
            * `glorot_normal`: Glorot normal initializer
            * `he_normal`: He normal initializer
            * `normal`: Initializing tensors with standard normal distribution
            * `ones`: Initializing tensors to 1
            * `zeros`: Initializing tensors to 0
            * `orthogonal`: Initializing tensors with an orthogonal matrix

    * `lr`: **float** *(default=1e-2)* --
        Learning rate used in the optimization.

    * `max_iter`: **int** *(default=100)* --
        The maximum number of iterations in the Newton optimization.

    * `l2_reg`: **float** *(default=1e-2)* --
        L2 regularization parameter for the model coefficients.

    * `alpha`: **float** *(default=0.95)* --
        Confidence interval.

    * `tol`: **float** *(default=1e-3)* --
        Tolerance for stopping criteria.

    * `verbose`: **bool** *(default=True)* --
        Whether or not to produce detailed logging about the modeling.

    Example:
    --------
        # Importing packages
        import numpy as np
        import pandas as pd
        from sklearn.model_selection import train_test_split
        from pysurvival.models.simulations import SimulationModel
        from pysurvival.models.semi_parametric import CoxPHModel
        from pysurvival.utils.metrics import concordance_index
        from pysurvival.utils.display import integrated_brier_score

        # Generating a dataset from a Log-Logistic parametric model
        sim = SimulationModel(survival_distribution='log-logistic',
                              risk_type='linear', censored_parameter=10.1,
                              alpha=0.1, beta=1.2)
        N = 1000
        dataset = sim.generate_data(num_samples=N, num_features=3)

        # Building training and testing sets
        features = sim.features
        index_train, index_test = train_test_split(range(N), test_size=0.2)
        data_train = dataset.loc[index_train].reset_index(drop=True)
        data_test = dataset.loc[index_test].reset_index(drop=True)
        X_train, X_test = data_train[features], data_test[features]
        T_train, T_test = data_train['time'].values, data_test['time'].values
        E_train, E_test = data_train['event'].values, data_test['event'].values

        # Fitting the Cox PH model
        coxph = CoxPHModel()
        coxph.fit(X_train, T_train, E_train, lr=0.5, l2_reg=1e-2,
                  init_method='zeros')

        # Cross Validation / Model Performances
        c_index = concordance_index(coxph, X_test, T_test, E_test)  # ~0.92
        print('C-index: {:.2f}'.format(c_index))
        ibs = integrated_brier_score(coxph, X_test, T_test, E_test,
                                     t_max=10, figure_size=(20, 6.5))

    References:
    -----------
    * https://en.wikipedia.org/wiki/Proportional_hazards_model#Tied_times
    * Efron, Bradley (1974). "The Efficiency of Cox's Likelihood Function
      for Censored Data". Journal of the American Statistical Association.
      72 (359): 557-565.
    """

    # Collecting the feature names (before check_data, which may turn a
    # DataFrame into a plain array and lose the column labels)
    _, self.num_vars = X.shape
    if isinstance(X, pd.DataFrame):
        self.variables = X.columns.tolist()
    else:
        self.variables = ['x_{}'.format(i) for i in range(self.num_vars)]

    # Checking the format of the data
    X, T, E = utils.check_data(X, T, E)

    # Sorting X, T, E in descending order according to T, then scaling X
    order = np.argsort(-T)
    T = T[order]
    E = E[order]
    X = self.scaler.fit_transform(X[order, :])
    self.std_scale = np.sqrt(self.scaler.var_)

    # Initializing the model
    self.model = _CoxPHModel()

    # Creating the time axis
    self.model.get_times(T, E)

    # Initializing the parameters
    W = np.zeros(self.num_vars)
    W = opt.initialization(init_method, W, False).flatten()
    W = W.astype(np.float64)

    # Optimizing to find the best parameters; epsilon guards divisions
    # inside the Newton step against zero denominators
    epsilon = 1e-9
    self.model.newton_optimization(X, T, E, W, lr, l2_reg, tol, epsilon,
        max_iter, verbose)

    # Saving the Cython attributes in the Python object
    self.weights = np.array(self.model.W)
    self.loss = self.model.loss
    self.times = np.array(self.model.times)
    self.gradient = np.array(self.model.gradient)
    self.Hessian = np.array(self.model.Hessian)
    self.inv_Hessian = np.array(self.model.inv_Hessian)
    self.loss_values = np.array(self.model.loss_values)
    self.grad2_values = np.array(self.model.grad2_values)

    # Computing the baseline functions from the fitted risk scores
    score = np.exp(np.dot(X, self.weights))
    baselines = _baseline_functions(score, T, E)

    # Saving the Cython attributes in the Python object
    self.baseline_hazard = np.array(baselines[1])
    self.baseline_survival = np.array(baselines[2])
    del self.model
    self.get_time_buckets()

    # Calculating the summary (confidence intervals at level `alpha`)
    self.get_summary(alpha)

    return self
def fit(self, X, T, E, init_method='glorot_uniform', optimizer='adam',
        lr=1e-4, num_epochs=1000, dropout=0.2, batch_normalization=False,
        bn_and_dropout=False, l2_reg=1e-5, verbose=True):
    """ Fit the estimator based on the given parameters.

    Parameters:
    -----------
    * `X` : **array-like**, *shape=(n_samples, n_features)* --
        The input samples.

    * `T` : **array-like** --
        The target values describing when the event of interest or
        censoring occurred.

    * `E` : **array-like** --
        The values that indicate if the event of interest occurred i.e.:
        E[i]=1 corresponds to an event, and E[i] = 0 means censoring,
        for all i.

    * `init_method` : **str** *(default = 'glorot_uniform')* --
        Initialization method to use. Here are the possible options:
            * `glorot_uniform`: Glorot/Xavier uniform initializer
            * `he_uniform`: He uniform variance scaling initializer
            * `uniform`: Initializing tensors with uniform (-1, 1) distribution
            * `glorot_normal`: Glorot normal initializer
            * `he_normal`: He normal initializer
            * `normal`: Initializing tensors with standard normal distribution
            * `ones`: Initializing tensors to 1
            * `zeros`: Initializing tensors to 0
            * `orthogonal`: Initializing tensors with an orthogonal matrix

    * `optimizer`: **str** *(default = 'adam')* --
        Iterative method for optimizing a differentiable objective
        function. Here are the possible options:
            - `adadelta`
            - `adagrad`
            - `adam`
            - `adamax`
            - `rmsprop`
            - `sparseadam`
            - `sgd`

    * `lr`: **float** *(default=1e-4)* --
        Learning rate used in the optimization.

    * `num_epochs`: **int** *(default=1000)* --
        The number of iterations in the optimization.

    * `dropout`: **float** *(default=0.2)* --
        Randomly sets a fraction rate of input units to 0 at each update
        during training time, which helps prevent overfitting.

    * `l2_reg`: **float** *(default=1e-5)* --
        L2 regularization parameter for the model coefficients.

    * `batch_normalization`: **bool** *(default=False)* --
        Applying Batch Normalization or not.

    * `bn_and_dropout`: **bool** *(default=False)* --
        Applying Batch Normalization and Dropout at the same time.

    * `verbose`: **bool** *(default=True)* --
        Whether or not to produce detailed logging about the modeling.

    Example:
    --------
        # Importing packages
        import numpy as np
        import pandas as pd
        from sklearn.model_selection import train_test_split
        from pysurvival.models.simulations import SimulationModel
        from pysurvival.models.semi_parametric import NonLinearCoxPHModel
        from pysurvival.utils.metrics import concordance_index
        from pysurvival.utils.display import integrated_brier_score

        # Generating a dataset from a nonlinear Weibull parametric model
        sim = SimulationModel(survival_distribution='weibull',
                              risk_type='Gaussian', censored_parameter=2.1,
                              alpha=0.1, beta=3.2)
        N = 1000
        dataset = sim.generate_data(num_samples=N, num_features=3)

        # Building training and testing sets
        features = sim.features
        index_train, index_test = train_test_split(range(N), test_size=0.2)
        data_train = dataset.loc[index_train].reset_index(drop=True)
        data_test = dataset.loc[index_test].reset_index(drop=True)
        X_train, X_test = data_train[features], data_test[features]
        T_train, T_test = data_train['time'].values, data_test['time'].values
        E_train, E_test = data_train['event'].values, data_test['event'].values

        # Defining the MLP structure: one hidden layer with 150 units and
        # `BentIdentity` as its activation function
        structure = [{'activation': 'BentIdentity', 'num_units': 150}, ]

        # Fitting the NonLinear CoxPH model
        nonlinear_coxph = NonLinearCoxPHModel(structure=structure)
        nonlinear_coxph.fit(X_train, T_train, E_train, lr=1e-3,
                            init_method='xav_uniform')

        # Cross Validation / Model Performances
        c_index = concordance_index(nonlinear_coxph, X_test, T_test, E_test)
        print('C-index: {:.2f}'.format(c_index))
        ibs = integrated_brier_score(nonlinear_coxph, X_test, T_test, E_test,
                                     t_max=10, figure_size=(20, 6.5))
    """

    # Checking data format (i.e.: transforming into numpy array)
    X, T, E = utils.check_data(X, T, E)

    # Extracting data parameters
    N, self.num_vars = X.shape
    input_shape = self.num_vars

    # Scaling data.
    # Bug fix: X_original used to be assigned only when auto_scaler was
    # True, causing a NameError below when scaling was disabled.
    if self.auto_scaler:
        X_original = self.scaler.fit_transform(X)
    else:
        X_original = X

    # Sorting X, T, E in descending order according to T
    order = np.argsort(-T)
    T = T[order]
    E = E[order]
    X_original = X_original[order, :]
    self.times = np.unique(T[E.astype(bool)])
    self.nb_times = len(self.times)
    self.get_time_buckets()

    # Initializing the model
    model = nn.NeuralNet(input_shape, 1, self.structure, init_method,
        dropout, batch_normalization, bn_and_dropout)

    # Selecting the computation device.
    # Bug fix: torch.cuda.FloatTensor crashed on CPU-only machines; the
    # tensors (and the model) are now placed on CUDA only when available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Building the input tensor used to calculate the loss
    X = torch.tensor(X_original, dtype=torch.float32, device=device)

    # Computing the Risk and Fail tensors
    Risk, Fail = self.risk_fail_matrix(T, E)
    Risk = torch.tensor(Risk, dtype=torch.float32, device=device)
    Fail = torch.tensor(Fail, dtype=torch.float32, device=device)

    # Computing Efron's matrices
    Efron_coef, Efron_one, Efron_anti_one = self.efron_matrix()
    Efron_coef = torch.tensor(Efron_coef, dtype=torch.float32, device=device)
    Efron_one = torch.tensor(Efron_one, dtype=torch.float32, device=device)
    Efron_anti_one = torch.tensor(Efron_anti_one, dtype=torch.float32,
        device=device)

    # Performing order-1 optimization
    model, loss_values = opt.optimize(self.loss_function, model, optimizer,
        lr, num_epochs, verbose, X=X, Risk=Risk, Fail=Fail,
        Efron_coef=Efron_coef, Efron_one=Efron_one,
        Efron_anti_one=Efron_anti_one, l2_reg=l2_reg)

    # Saving attributes
    self.model = model.eval()
    self.loss_values = loss_values

    # Computing the baseline functions from the risk scores on the
    # training data (the redundant double tensor wrap was removed)
    x = torch.tensor(X_original, dtype=torch.float32, device=device)
    score = np.exp(self.model(x).data.cpu().numpy().flatten())
    baselines = _baseline_functions(score, T, E)

    # Saving the Cython attributes in the Python object
    self.times = np.array(baselines[0])
    self.baseline_hazard = np.array(baselines[1])
    self.baseline_survival = np.array(baselines[2])

    return self