Example #1
    def predict_cumulative_hazard(self, x, t=None, **kwargs):
        """ Predicts the cumulative hazard function H(t, x)

            Parameters
            ----------
            * `x` : **array-like** *shape=(n_samples, n_features)* --
                array-like representing the datapoints. 
                x should not be standardized before, the model
                will take care of it

            * `t`: **double** *(default=None)* --
                time at which the prediction should be performed. 
                If None, then return the function for all available t.

            Returns
            -------
            * `cumulative_hazard`: **numpy.ndarray** --
                array-like representing the prediction of the cumulative_hazard
                function
        """

        # Checking if the data has the right format
        x = utils.check_data(x)

        # Calculating hazard/cumulative_hazard
        hazard = self.predict_hazard(x, t, **kwargs)
        cumulative_hazard = np.cumsum(hazard, 1)
        return cumulative_hazard
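A minimal, self-contained sketch of the cumulative-hazard computation used above, on synthetic numbers (the hazard values are illustrative, not the output of any fitted model):

import numpy as np

hazard = np.array([[0.10, 0.20, 0.15],           # shape=(n_samples, n_time_buckets)
                   [0.05, 0.05, 0.30]])
cumulative_hazard = np.cumsum(hazard, axis=1)    # H(t, x): running sum over the time axis
print(cumulative_hazard)                         # [[0.1  0.3  0.45] [0.05 0.1  0.4 ]]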
    def predict(self, X, t=None, num_threads=-1):

        # Checking if the data has the right format
        X = utils.check_data(X)
        if X.ndim == 1:
            X = X.reshape(1, -1)
        T = np.array([1.] * X.shape[0])
        E = np.array([1.] * X.shape[0])
        input_data = np.c_[T, E, X]

        # Loading the attributes of the model
        self.load_properties()

        # Computing Survival
        survival = np.array(
            self.model.predict_survival(input_data, num_threads))

        # Computing hazard
        hazard = np.array(self.model.predict_hazard(input_data, num_threads))

        # Computing density
        density = hazard * survival

        if t is None:
            return hazard, density, survival
        else:
            min_index = [abs(a_j_1 - t) for (a_j_1, a_j) in self.time_buckets]
            index = np.argmin(min_index)
            return hazard[:, index], density[:, index], survival[:, index]
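The `t is not None` branch above maps a requested time t to the closest time bucket; a self-contained sketch of that lookup with made-up buckets:

import numpy as np

time_buckets = [(0.0, 1.0), (1.0, 2.0), (2.0, 3.0)]   # illustrative (a_{j-1}, a_j) pairs
t = 1.8
index = np.argmin([abs(a_j_1 - t) for (a_j_1, a_j) in time_buckets])
print(index)   # 2 -> the bucket whose left edge is closest to t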
Example #3
    def predict_risk(self, x, **kwargs):
        """ Predicts the Risk Score/Mortality function for all t,
            R(x) = sum( cumsum(hazard(t, x)) )
            According to Random survival forests from Ishwaran H et al
            https://arxiv.org/pdf/0811.1645.pdf

            Parameters
            ----------
            * `x` : **array-like** *shape=(n_samples, n_features)* --
                array-like representing the datapoints. 
                x should not be standardized before, the model
                will take care of it

            Returns
            -------
            * `risk_score`: **numpy.ndarray** --
                array-like representing the prediction of Risk Score function
        """

        # Checking if the data has the right format
        x = utils.check_data(x)

        # Calculating cumulative_hazard/risk
        cumulative_hazard = self.predict_cumulative_hazard(x, None, **kwargs)
        risk_score = np.sum(cumulative_hazard, 1)
        return risk_score
    def predict_risk_chunk(self, X, num_threads=-1, printChunk=False):

        # Checking if the data has the right format
        X = utils.check_data(X)
        if X.ndim == 1:
            X = X.reshape(1, -1)
        T = np.array([1.] * X.shape[0])
        E = np.array([1.] * X.shape[0])
        input_data = np.c_[T, E, X]

        # Loading the attributes of the model
        self.load_properties()
        n_rows, n_cols = input_data.shape
        nparts = max(1, round(n_rows / 1000))  # avoid asking array_split for 0 parts
        split_array = np.array_split(input_data, nparts)
        survival = np.empty(n_rows)
        survIndex = 0
        for s in split_array:
            survival[survIndex:(survIndex + np.size(s, axis=0))] = np.array(
                self.model.predict_risk(s, num_threads))
            survIndex += np.size(s, axis=0)
            if printChunk:
                print(survIndex)
        # Computing risk
        return survival
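A self-contained sketch of the chunked-scoring pattern used above; a simple row sum stands in for the call to the C++ model:

import numpy as np

data = np.random.rand(2500, 4)
n_rows = data.shape[0]
nparts = max(1, round(n_rows / 1000))      # never ask array_split for 0 parts
out = np.empty(n_rows)
start = 0
for part in np.array_split(data, nparts):
    out[start:start + part.shape[0]] = part.sum(axis=1)   # stand-in for model.predict_risk
    start += part.shape[0]
print(out.shape)   # (2500,)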
    def predict(self, x, t=None):
        """ 
        Predicting the hazard, density and survival functions
        
        Arguments:
            * x: pd.Dataframe or np.ndarray or list
                x is the testing dataset containing the features
                x should not be standardized before, the model
                will take care of it
            * t: float (default=None)
                Time at which hazard, density and survival functions
                should be calculated. If None, the method returns 
                the functions for all times t. 
        """

        # Convert x into the right format
        x = utils.check_data(x)

        # Scaling the dataset
        if x.ndim == 1:
            x = self.scaler.transform(x.reshape(1, -1))
        elif x.ndim == 2:
            x = self.scaler.transform(x)

        # Calculating risk_score, hazard, density and survival
        phi = np.exp(np.dot(x, self.weights))
        hazard = self.baseline_hazard * phi.reshape(-1, 1)
        survival = np.power(self.baseline_survival, phi.reshape(-1, 1))
        density = hazard * survival
        if t is None:
            return hazard, density, survival
        else:
            min_index = [abs(a_j_1 - t) for (a_j_1, a_j) in self.time_buckets]
            index = np.argmin(min_index)
            return hazard[:, index], density[:, index], survival[:, index]
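A self-contained numeric sketch of the proportional-hazards relationships used above; the baseline curves and weights are made up, not taken from a fitted model:

import numpy as np

baseline_hazard = np.array([0.01, 0.02, 0.03])        # illustrative h_0(t) per time bucket
baseline_survival = np.array([0.99, 0.97, 0.94])      # illustrative S_0(t)
weights = np.array([0.5, -0.2])
x = np.array([[1.0, 2.0]])                            # one (already scaled) sample

phi = np.exp(np.dot(x, weights))                      # exp(x.w)
hazard = baseline_hazard * phi.reshape(-1, 1)         # h(t, x) = h_0(t) * exp(x.w)
survival = np.power(baseline_survival, phi.reshape(-1, 1))   # S(t, x) = S_0(t)^exp(x.w)
density = hazard * survival
print(hazard.shape, survival.shape, density.shape)    # (1, 3) (1, 3) (1, 3)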
Example #6
    def predict_risk(self, x):
        """
        Predicting the risk score function
        
        Parameters:
        -----------
            * x: pd.Dataframe or np.ndarray or list
                x is the testing dataset containing the features
                x should not be standardized before, the model
                will take care of it
        """

        # Convert x into the right format
        x = utils.check_data(x)

        # Scaling the dataset
        if x.ndim == 1:
            x = self.scaler.transform(x.reshape(1, -1))
        elif x.ndim == 2:
            x = self.scaler.transform(x)

        # Calculating risk_score
        risk_score = self.risk_function(x)

        return risk_score
    def predict_risk(self, x, use_log=False):
        """
        Predicting the risk score functions
        
        Arguments:
            * x: pd.Dataframe or np.ndarray or list
                x is the testing dataset containing the features
                x should not be standardized before, the model
                will take care of it
        """

        # Convert x into the right format
        x = utils.check_data(x)

        # Scaling the dataset
        if x.ndim == 1:
            x = self.scaler.transform(x.reshape(1, -1))
        elif x.ndim == 2:
            x = self.scaler.transform(x)

        # Calculating risk_score (the linear predictor is the log-risk;
        # exponentiate it unless use_log=True)
        risk_score = np.dot(x, self.weights)
        if not use_log:
            risk_score = np.exp(risk_score)

        return risk_score
    def predict_risk(self, x, use_log=False):
        """
        Predicting the risk score functions
        
        Arguments:
            * x: pd.Dataframe or np.ndarray or list
                x is the testing dataset containing the features
                x should not be standardized before, the model
                will take care of it
        """

        # Convert x into the right format
        x = utils.check_data(x)

        # Scaling the data
        if self.auto_scaler:
            if x.ndim == 1:
                x = self.scaler.transform(x.reshape(1, -1))
            elif x.ndim == 2:
                x = self.scaler.transform(x)
        else:
            # Ensuring x has 2 dimensions
            if x.ndim == 1:
                x = np.reshape(x, (1, -1))

        # Transforming into pytorch objects
        x = torch.FloatTensor(x)
        if torch.cuda.is_available():
            x = x.cuda()

        # Calculating risk_score
        score = self.model(x).data.cpu().numpy().flatten()
        if not use_log:
            score = np.exp(score)

        return score
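A self-contained sketch of the scoring step above, with an untrained torch.nn.Linear standing in for the fitted network (CPU only, so it runs without a GPU):

import numpy as np
import torch

model = torch.nn.Linear(3, 1)                     # stand-in for self.model
x = torch.FloatTensor(np.random.rand(5, 3))       # already-scaled inputs
score = model(x).data.cpu().numpy().flatten()     # log-risk, one value per sample
risk = np.exp(score)                              # risk score when use_log=False
print(risk.shape)                                 # (5,)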
Example #9
    def predict_cdf(self, x, t=None, **kwargs):
        """ Predicts the cumulative density function F(t, x)

            Parameters
            ----------
            * `x` : **array-like** *shape=(n_samples, n_features)* --
                array-like representing the datapoints. 
                x should not be standardized before, the model
                will take care of it

            * `t`: **double** *(default=None)* --
                time at which the prediction should be performed. 
                If None, then return the function for all available t.

            Returns
            -------
            * `cdf`: **numpy.ndarray** --
                array-like representing the prediction of the cumulative 
                density function 
        """

        # Checking if the data has the right format
        x = utils.check_data(x)

        # Calculating survival and cdf
        survival = self.predict_survival(x, t, **kwargs)
        cdf = 1. - survival
        return cdf
Example #10
def bootstrap_concordance_index_chunk(model, X, T, E, include_ties = True, additional_results = False, n_iterations = 1000, n_size = 1000, **kwargs):
    stats = list()
    risk = model.predict_risk_chunk(X, **kwargs)
    risk, T, E = utils.check_data(risk, T, E)

    for i in range(n_iterations):
        tempR, tempT, tempE = resample(risk, T, E, n_samples=n_size)
        order = np.argsort(-tempT)
        tempR = tempR[order]
        tempT = tempT[order]
        tempE = tempE[order]

        # Calculating the c-index
        results = _concordance_index(tempR, tempT, tempE, include_ties)
        stats.append(results[0])
        if i % 10 == 0:
            print(i)
            alpha = 0.95
            p = ((1.0 - alpha) / 2.0) * 100
            lower = max(0.0, np.percentile(stats, p))
            p = (alpha + ((1.0 - alpha) / 2.0)) * 100
            upper = min(1.0, np.percentile(stats, p))
            print('%.0f%% confidence interval: %.3f - %.3f' % (alpha * 100, lower, upper))
            print(np.average(stats))
    # confidence intervals
    alpha = 0.95
    p = ((1.0 - alpha) / 2.0) * 100
    lower = max(0.0, np.percentile(stats, p))
    p = (alpha + ((1.0 - alpha) / 2.0)) * 100
    upper = min(1.0, np.percentile(stats, p))
    print('%.0f%% confidence interval: %.3f - %.3f' % (alpha * 100, lower, upper))
    print(np.average(stats))
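A self-contained sketch of the percentile confidence interval computed above; synthetic c-index values stand in for the bootstrap results:

import numpy as np

stats = np.random.uniform(0.70, 0.80, size=200)    # stand-in bootstrap c-index values
alpha = 0.95
lower = max(0.0, np.percentile(stats, ((1.0 - alpha) / 2.0) * 100))
upper = min(1.0, np.percentile(stats, (alpha + (1.0 - alpha) / 2.0) * 100))
print('%.0f%% confidence interval: %.3f - %.3f' % (alpha * 100, lower, upper))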
Example #11
    def predict(self, x, t=None):
        """ Predicting the hazard, density and survival functions

        Parameters:
        ----------
        * `x` : **array-like** *shape=(n_samples, n_features)* --
            array-like representing the datapoints.
            x should not be standardized before, the model
            will take care of it

        * `t`: **double** *(default=None)* --
             time at which the prediction should be performed.
             If None, then return the function for all available t.
        """

        # Convert x into the right format
        x = utils.check_data(x)

        # Scaling the data
        if self.auto_scaler:
            if x.ndim == 1:
                x = self.scaler.transform(x.reshape(1, -1))
            elif x.ndim == 2:
                x = self.scaler.transform(x)
        else:
            # Ensuring x has 2 dimensions
            if x.ndim == 1:
                x = np.reshape(x, (1, -1))

        # Transforming into pytorch objects
        x = torch.FloatTensor(x)

        # Predicting using linear/nonlinear function
        score_torch = self.model(x)
        score = score_torch.data.numpy()

        # Creating the time triangles
        Triangle1 = np.tri(self.num_times, self.num_times + 1)
        Triangle2 = np.tri(self.num_times + 1, self.num_times + 1)

        # Calculating the score, density, hazard and Survival
        phi = np.exp(np.dot(score, Triangle1))
        div = np.repeat(np.sum(phi, 1).reshape(-1, 1), phi.shape[1], axis=1)
        density = (phi / div)
        Survival = np.dot(density, Triangle2)
        hazard = density[:, :-1] / Survival[:, 1:]

        # Returning the full functions or just one time point
        if t is None:
            return hazard, density, Survival
        else:
            min_abs_value = [
                abs(a_j_1 - t) for (a_j_1, a_j) in self.time_buckets
            ]
            index = np.argmin(min_abs_value)
            return hazard[:, index], density[:, index], Survival[:, index]
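A self-contained sketch of the triangular-matrix trick used above, with num_times=3 and a made-up score vector:

import numpy as np

num_times = 3
scores = np.array([[0.2, -0.1, 0.4]])                 # stand-in network output
Triangle1 = np.tri(num_times, num_times + 1)          # lower-triangular, shape (3, 4)
Triangle2 = np.tri(num_times + 1, num_times + 1)      # lower-triangular, shape (4, 4)

phi = np.exp(np.dot(scores, Triangle1))               # shape (1, 4)
density = phi / phi.sum(axis=1, keepdims=True)        # rows sum to 1
survival = np.dot(density, Triangle2)                 # reverse cumulative sums of density
hazard = density[:, :-1] / survival[:, 1:]
print(density.sum(axis=1), survival[:, 0])            # [1.] [1.]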
    def predict_risk(self, x, use_log=False):
        """
        Predicting the risk score functions
        
        Arguments:
            * x: pd.Dataframe or np.ndarray or list
                x is the testing dataset containing the features
                x should not be standardized before, the model
                will take care of it
        """

        # Convert x into the right format
        x = utils.check_data(x)

        if isinstance(x, list):
            # Scaling data
            if self.auto_scaler:
                for index, X in enumerate(x):
                    if X.ndim == 1:
                        X = self.scaler.transform(X.reshape(1, -1))
                    elif X.ndim == 2:
                        X = self.scaler.transform(X)
                    x[index] = X
            else:
                for index, X in enumerate(x):
                    # Ensuring x has 2 dimensions
                    if X.ndim == 1:
                        X = np.reshape(X, (1, -1))

                    x[index] = X
        else:
            # Scaling data
            if self.auto_scaler:
                x = self.scaler.transform(x)

        # Transform into torch.Tensor
        if isinstance(x, list):
            for j, input_ in enumerate(x):
                x[j] = torch.FloatTensor(input_)
                if torch.cuda.is_available():
                    x[j] = x[j].cuda()
        else:
            x = torch.FloatTensor(x)
            if torch.cuda.is_available():
                x = x.cuda()

        # Calculating risk_score
        score = self.model(x).cpu().data.numpy().flatten()
        if not use_log:
            score = np.exp(score)

        return score
    def predict_risk(self, X, num_threads=-1):

        # Checking if the data has the right format
        X = utils.check_data(X)
        if X.ndim == 1:
            X = X.reshape(1, -1)
        T = np.array([1.] * X.shape[0])
        E = np.array([1.] * X.shape[0])
        input_data = np.c_[T, E, X]

        # Loading the attributes of the model
        self.load_properties()

        # Computing risk
        risk = self.model.predict_risk(input_data, num_threads)
        return np.array(risk)
Example #14
    def predict(self, x, t=None):
        """ 
        Predicting the hazard, density and survival functions
        
        Parameters:
        -----------
            * x: pd.Dataframe or np.ndarray or list
                x is the testing dataset containing the features
                x should not be standardized before, the model
                will take care of it
            * t: float (default=None)
                Time at which hazard, density and survival functions
                should be calculated. If None, the method returns 
                the functions for all times t. 
        """

        # Convert x into the right format
        x = utils.check_data(x)

        # Scaling the dataset
        if x.ndim == 1:
            x = self.scaler.transform(x.reshape(1, -1))
        elif x.ndim == 2:
            x = self.scaler.transform(x)

        # Calculating risk_score, hazard, density and survival
        BX = self.risk_function(x)
        hazard = self.hazard_function(self.times, BX.reshape(-1, 1))
        survival = self.survival_function(self.times, BX.reshape(-1, 1))
        density = (hazard * survival)

        if t is None:
            return hazard, density, survival
        else:
            min_abs_value = [
                abs(a_j_1 - t) for (a_j_1, a_j) in self.time_buckets
            ]
            index = np.argmin(min_abs_value)
            return hazard[:, index], density[:, index], survival[:, index]
Example #15
    def predict_risk(self, x, use_log=False):
        """ Predicts the Risk Score
        
            Parameters
            ----------
            * `x`, np.ndarray
                 array-like representing the datapoints

            * `use_log`: bool - (default=False)
                Applies the log function to the risk values

            Returns
            -------
            * `risk_score`, np.ndarray
                array-like representing the prediction of Risk Score function
        """

        # Ensuring that the C++ model has the fitted parameters
        self.load_properties()

        # Convert x into the right format
        x = utils.check_data(x)

        # Scaling the dataset
        if x.ndim == 1:
            if self.with_bias:
                x = np.r_[x, 1.]
            x = self.scaler.transform(x.reshape(1, -1))
        elif x.ndim == 2:
            n = x.shape[0]
            if self.with_bias:
                x = np.c_[x, [1.] * n]
            x = self.scaler.transform(x)

        # Calculating prediction
        risk = np.exp(self.model.get_score(x))

        if use_log:
            return np.log(risk)
        else:
            return risk
    def predict_chunk(self, X, t=1, num_threads=-1):

        # Checking if the data has the right format
        X = utils.check_data(X)
        if X.ndim == 1:
            X = X.reshape(1, -1)
        T = np.array([1.] * X.shape[0])
        E = np.array([1.] * X.shape[0])
        input_data = np.c_[T, E, X]
        n_rows, n_cols = input_data.shape
        nparts = max(1, round(n_rows / 1000))  # avoid asking array_split for 0 parts
        split_array = np.array_split(input_data, nparts)
        survival = np.empty(n_rows)
        self.load_properties()
        min_index = [abs(a_j_1 - t) for (a_j_1, a_j) in self.time_buckets]
        index = np.argmin(min_index)
        survIndex = 0
        for s in split_array:
            survival[survIndex:(survIndex + np.size(s, axis=0))] = np.array(
                self.model.predict_survival(s, num_threads))[:, index]
            survIndex += np.size(s, axis=0)
            #print(survIndex)
        return survival
Example #17
    def fit(self,
            X,
            T,
            E,
            init_method='glorot_uniform',
            optimizer='adam',
            lr=1e-4,
            num_epochs=1000,
            l2_reg=1e-2,
            verbose=True,
            is_min_time_zero=True,
            extra_pct_time=0.1):
        """ 
        Fit the estimator based on the given parameters.

        Parameters:
        -----------
        * `X` : **array-like**, *shape=(n_samples, n_features)* --
            The input samples.

        * `T` : **array-like** -- 
            The target values describing when the event of interest or censoring
            occurred.

        * `E` : **array-like** --
            The values that indicate if the event of interest occurred i.e.: 
            E[i]=1 corresponds to an event, and E[i] = 0 means censoring, 
            for all i.

        * `init_method` : **str** *(default = 'glorot_uniform')* -- 
            Initialization method to use. Here are the possible options:

            * `glorot_uniform`:  Glorot/Xavier uniform initializer 
            * `he_uniform`:  He uniform variance scaling initializer 
            * `uniform`: Initializing tensors with uniform (-1, 1) distribution
            * `glorot_normal`: Glorot normal initializer,
            * `he_normal`: He normal initializer.
            * `normal`: Initializing tensors with standard normal distribution
            * `ones`: Initializing tensors to 1
            * `zeros`: Initializing tensors to 0
            * `orthogonal`: Initializing tensors with an orthogonal matrix

        * `optimizer`:  **str** *(default = 'adam')* -- 
            iterative method for optimizing a differentiable objective function.
            Here are the possible options:

            - `adadelta`
            - `adagrad`
            - `adam`
            - `adamax`
            - `rmsprop`
            - `sparseadam`
            - `sgd`

        * `lr`: **float** *(default=1e-4)* -- 
            learning rate used in the optimization

        * `num_epochs`: **int** *(default=1000)* -- 
            The number of iterations in the optimization

        * `l2_reg`: **float** *(default=1e-2)* -- 
            L2 regularization parameter for the model coefficients

        * `verbose`: **bool** *(default=True)* -- 
            Whether or not to produce detailed logging about the modeling

        * `extra_pct_time`: **float** *(default=0.1)* -- 
            Providing an extra fraction of time in the time axis

        * `is_min_time_zero`: **bool** *(default=True)* -- 
            Whether the time axis starts at 0

        Returns:
        --------
        * self : object


        Example:
        --------

        #### 1 - Importing packages
        import numpy as np
        import pandas as pd
        from matplotlib import pyplot as plt
        from sklearn.model_selection import train_test_split
        from pysurvival.models.simulations import SimulationModel
        from pysurvival.models.parametric import GompertzModel
        from pysurvival.utils.metrics import concordance_index
        from pysurvival.utils.display import integrated_brier_score
        #%matplotlib inline  # To use with Jupyter notebooks

        #### 2 - Generating the dataset from a Gompertz parametric model
        # Initializing the simulation model
        sim = SimulationModel( survival_distribution = 'Gompertz',  
                               risk_type = 'linear',
                               censored_parameter = 10.0, 
                               alpha = .01, beta = 3.0 )

        # Generating N random samples 
        N = 1000
        dataset = sim.generate_data(num_samples = N, num_features = 3)

        # Showing a few data-points 
        time_column = 'time'
        event_column = 'event'
        dataset.head(2)

        #### 3 - Creating the modeling dataset
        # Defining the features
        features = sim.features

        # Building training and testing sets #
        index_train, index_test = train_test_split( range(N), test_size = 0.2)
        data_train = dataset.loc[index_train].reset_index( drop = True )
        data_test  = dataset.loc[index_test].reset_index( drop = True )

        # Creating the X, T and E input
        X_train, X_test = data_train[features], data_test[features]
        T_train, T_test = data_train['time'].values, data_test['time'].values
        E_train, E_test = data_train['event'].values, data_test['event'].values

        #### 4 - Creating an instance of the Gompertz model and fitting the data
        # Building the model
        gomp_model = GompertzModel() 
        gomp_model.fit(X_train, T_train, E_train, lr=1e-2, init_method='zeros',
            optimizer ='adam', l2_reg = 1e-3, num_epochs=2000)

        #### 5 - Cross Validation / Model Performances
        c_index = concordance_index(gomp_model, X_test, T_test, E_test) #0.8
        print('C-index: {:.2f}'.format(c_index))

        ibs = integrated_brier_score(gomp_model, X_test, T_test, E_test, 
            t_max=30, figure_size=(20, 6.5) )

        """

        # Checking data format (i.e.: transforming into numpy array)
        X, T, E = utils.check_data(X, T, E)
        T = np.maximum(T, 1e-6)
        self.get_times(T, is_min_time_zero, extra_pct_time)

        # Extracting data parameters
        nb_units, self.num_vars = X.shape
        input_shape = self.num_vars

        # Scaling data
        if self.auto_scaler:
            X = self.scaler.fit_transform(X)

        # Does the model need a parameter called Beta
        is_beta_used = True
        init_alpha = 1.
        if self.name == 'ExponentialModel':
            is_beta_used = False
        if self.name == 'GompertzModel':
            init_alpha = 1000.

        # Initializing the model
        model = nn.ParametricNet(input_shape, init_method, init_alpha,
                                 is_beta_used)

        # Transforming the inputs into tensors
        X = torch.FloatTensor(X)
        T = torch.FloatTensor(T.reshape(-1, 1))
        E = torch.FloatTensor(E.reshape(-1, 1))

        # Performing order 1 optimization
        model, loss_values = opt.optimize(self.loss_function,
                                          model,
                                          optimizer,
                                          lr,
                                          num_epochs,
                                          verbose,
                                          X=X,
                                          T=T,
                                          E=E,
                                          l2_reg=l2_reg)

        # Saving attributes
        self.model = model.eval()
        self.loss_values = loss_values

        # Calculating the AIC
        self.aic = 2 * self.loss_values[-1]
        self.aic -= 2 * (self.num_vars + 1 + is_beta_used * 1. - 1)

        return self
Example #18
def create_risk_groups(model,
                       X,
                       use_log=True,
                       num_bins=50,
                       figure_size=(20, 8),
                       **kwargs):
    """
    Computing and displaying the histogram of the risk scores of the given
    model and test set X. If keyword arguments are provided, the scores that
    fall within the given threshold ranges are color-coded accordingly.

    Parameters:
    -----------
    
    * model : Pysurvival object
        Pysurvival model

    * X : array-like, shape=(n_samples, n_features)
        The input samples.
    
    * use_log: boolean (default=True)
        Whether to apply the log function to the risk score
        
    * num_bins: int (default=50)
        The number of equal-width bins that will constitute the histogram
        
    * figure_size: tuple of double (default=(20, 8))
        width, height in inches representing the size of the chart 

    * kwargs: dict (optional)
        Keyword arguments defining the risk groups, e.g.:
        low_risk  = {'lower_bound': 0,  'upper_bound': 20,  'color': 'red'},
        high_risk = {'lower_bound': 20, 'upper_bound': 120, 'color': 'blue'}
      
    """

    # Ensuring that the input data has the right format
    X = utils.check_data(X)

    # Computing the risk scores
    risk = model.predict_risk(X)
    if use_log:
        risk = np.log(risk)

    # Displaying simple histogram
    if len(kwargs) == 0:

        # Initializing the chart
        fig, ax1 = plt.subplots(figsize=figure_size)
        risk_groups = None

    # Applying any color coding
    else:
        # Initializing the results
        risk_groups = {}

        # Initializing the chart
        fig, ((ax1, ax2)) = plt.subplots(1, 2, figsize=figure_size)

        # Displaying simple histogram with risk groups
        nums_per_bins, bins, patches = ax2.hist(risk, bins=num_bins)
        ax2.set_title('Risk groups with colors', fontsize=15)

        # Number of group definitions
        num_group_def = len(kwargs.values())

        # Extracting the bounds values
        bounds = {}
        colors_ = {}
        indexes = {}
        group_names = []
        handles = []

        # we need to check that the boundaries match the bins
        is_not_valid = 0
        for group_name, group_def in kwargs.items():

            # by ensuring that the bounds are not outside
            # the bins values
            min_bin, max_bin = min(bins), max(bins)
            if (group_def['lower_bound'] < min_bin and \
                group_def['upper_bound'] < min_bin) or \
                    (group_def['lower_bound'] > max_bin and \
                     group_def['upper_bound'] > max_bin):
                is_not_valid += 1

            # Extracting the bounds
            bounds[group_name] = (group_def['lower_bound'],
                                  group_def['upper_bound'])

            # Extracting the colors
            colors_[group_name] = group_def['color']

            # Creating index placeholders
            indexes[group_name] = []
            group_names.append(group_name)
            color_indv = group_def['color']
            handles.append(Rectangle((0, 0), 1, 1, color=color_indv, ec="k"))

        if is_not_valid >= num_group_def:
            error_msg = "The boundaries definitions {} do not match"
            error_msg += ", the values of the risk scores."
            error_msg = error_msg.format(list(bounds.values()))
            raise ValueError(error_msg)

        # Assigning each rectangle/bin to its group definition
        # and color
        colored_patches = []
        bin_index = {}
        for i, bin_, patch_ in zip(range(num_bins), bins, patches):

            # Check if the bin belongs to this bound def
            for grp_name, bounds_ in bounds.items():

                if bounds_[0] <= bin_ < bounds_[-1]:
                    bin_index[i] = grp_name

                    # Extracting color
                    color_ = colors_[grp_name]
                    if color_ not in colors.CSS4_COLORS:
                        error_msg = '{} is not a valid color'
                        error_msg = error_msg.format(colors_[grp_name])
                        raise ValueError(error_msg)

                    patch_.set_facecolor(color_)

            # Saving the rectangles
            colored_patches.append(patch_)

        # Assigning each sample to its group
        risk_bins = np.minimum(np.digitize(risk, bins, True), num_bins - 1)
        for i, r in enumerate(risk_bins):
            # Extracting the right group_name
            group_name = bin_index[r]
            indexes[group_name].append(i)

    # Displaying the original distribution
    ax1.hist(risk, bins=num_bins, color='black', alpha=0.5)
    ax1.set_title('Risk Score Distribution', fontsize=15)

    # Show everything
    plt.show()

    # Returning results
    if risk_groups is not None:
        for group_name in group_names:
            result = (colors_[group_name], indexes[group_name])
            risk_groups[group_name] = result

    return risk_groups
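A hedged usage sketch for create_risk_groups; `model` and `X_test` are hypothetical (a fitted pysurvival model and a held-out feature matrix), and the thresholds are illustrative only:

risk_groups = create_risk_groups(
    model, X_test, use_log=True, num_bins=50,
    low_risk={'lower_bound': -5, 'upper_bound': 1, 'color': 'blue'},
    high_risk={'lower_bound': 1, 'upper_bound': 5, 'color': 'red'})
# risk_groups maps each group name to (color, list of sample indexes)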
    def fit(self,
            X,
            T,
            E,
            max_features='sqrt',
            max_depth=5,
            min_node_size=10,
            num_threads=-1,
            weights=None,
            sample_size_pct=0.63,
            alpha=0.5,
            minprop=0.1,
            num_random_splits=100,
            importance_mode='impurity_corrected',
            seed=None,
            save_memory=False):
        """
        Arguments:
        ---------
        * X : array-like, shape=(n_samples, n_features)
            The input samples.

        * T : array-like, shape = [n_samples] 
            The target values describing when the event of interest or censoring
            occurred

        * E : array-like, shape = [n_samples] 
            The Event indicator array such that E = 1. if the event occurred
            E = 0. if censoring occurred

        * max_features : int, float or string, optional (default="sqrt")
            The number of features to consider when looking for the best split:
            - If int, then consider `max_features` features at each split.
            - If float, then `max_features` is a fraction and
              `int(max_features * n_features)` features are considered at each
              split.
            - If "sqrt", then `max_features=sqrt(n_features)` 
            - If "log2", then `max_features=log2(n_features)`.

        * min_node_size : int(default=10)
            The minimum number of samples required to be at a leaf node

        * num_threads: int (Default: -1)
            The number of jobs to run in parallel for both fit and predict. 
            If -1, then the number of jobs is set to the number of cores.

        * weights: array-like, shape = [n_samples] (default=None)
            Weights for sampling of training observations. 
            Observations with larger weights will be selected with 
            higher probability in the bootstrap

        * sample_size_pct: double (default = 0.63)
            Fraction of the original samples used to build each tree

        * alpha: float
            For "maxstat" splitrule: Significance threshold to allow splitting.

        * minprop: float
            For "maxstat" splitrule: Lower quantile of covariate 
            distribution to be considered for splitting.

        * num_random_splits: int (default=100)
            For "extratrees" splitrule, it is the Number of random splits 
            to consider for each candidate splitting variable.

        * importance_mode: str (default='impurity_corrected')
            Variable importance mode. Here are the possible options:
            - `impurity` or `impurity_corrected`: 
                the unbiased heterogeneity reduction developed 
                by Sandri & Zuccolotto (2008)
            - `permutation`: the unnormalized permutation importance, 
                as recommended by Nicodemus et al.
            - `normalized_permutation`: the normalized version of the 
                permutation importance computations by Breiman et al.

        * `seed`: int (default=None) -- 
            seed used by the random number generator. If None, the current 
            UNIX timestamp is used.

        * save_memory:  bool (default=False) --
            Use memory saving splitting mode. This will slow down the model 
            training. So, only set to `True` if you encounter memory problems.


        Example:
        --------

        #### 1 - Importing packages
        import numpy as np
        import pandas as pd
        from matplotlib import pyplot as plt
        from sklearn.model_selection import train_test_split
        from pysurvival.models.simulations import SimulationModel
        from pysurvival.models.survival_forest import ConditionalSurvivalForestModel
        from pysurvival.utils.metrics import concordance_index
        from pysurvival.utils.display import integrated_brier_score
        #%matplotlib inline # To use with Jupyter notebooks

        #### 2 - Generating the dataset from a Exponential parametric model
        # Initializing the simulation model
        sim = SimulationModel( survival_distribution = 'exponential',  
                               risk_type = 'linear',
                               censored_parameter = 1, 
                               alpha = 3)

        # Generating N random samples 
        N = 1000
        dataset = sim.generate_data(num_samples = N, num_features=4)

        # Showing a few data-points 
        dataset.head(2)

        #### 3 - Creating the modeling dataset
        # Defining the features
        features = sim.features

        # Building training and testing sets #
        index_train, index_test = train_test_split( range(N), test_size = 0.2)
        data_train = dataset.loc[index_train].reset_index( drop = True )
        data_test  = dataset.loc[index_test].reset_index( drop = True )

        # Creating the X, T and E input
        X_train, X_test = data_train[features], data_test[features]
        T_train, T_test = data_train['time'].values, data_test['time'].values
        E_train, E_test = data_train['event'].values, data_test['event'].values


        #### 4 - Creating an instance of the model and fitting the data.
        # Building the model
        csf = ConditionalSurvivalForestModel(num_trees=200) 
        csf.fit(X_train, T_train, E_train, 
                max_features="sqrt", max_depth=5, min_node_size=20,
                alpha = 0.05, minprop=0.1)


        #### 5 - Cross Validation / Model Performances
        c_index = concordance_index(csf, X_test, T_test, E_test) #0.81
        print('C-index: {:.2f}'.format(c_index))

        ibs = integrated_brier_score(csf, X_test, T_test, E_test, t_max=30, 
                    figure_size=(20, 6.5) )
        """

        # Collecting features names
        N, self.num_variables = X.shape
        if isinstance(X, pd.DataFrame):
            features = X.columns.tolist()
        else:
            features = ['x_{}'.format(i) for i in range(self.num_variables)]
        all_data_features = ["time", "event"] + features

        # Transforming the strings into bytes
        all_data_features = utils.as_bytes(all_data_features,
                                           python_version=PYTHON_VERSION)

        # Checking the format of the data
        X, T, E = utils.check_data(X, T, E)
        if X.ndim == 1:
            X = X.reshape(1, -1)
            T = T.reshape(1, -1)
            E = E.reshape(1, -1)
        input_data = np.c_[T, E, X]

        # Number of trees
        num_trees = self.num_trees

        # Seed
        if seed is None:
            seed = 0

        # sample_size_pct
        if not isinstance(sample_size_pct, float):
            error = "Error: Invalid value for sample_size_pct, "
            error += "please provide a value that is > 0 and <= 1."
            raise ValueError(error)

        if (sample_size_pct <= 0 or sample_size_pct > 1):
            error = "Error: Invalid value for sample_size_pct, "
            error += "please provide a value that is > 0 and <= 1."
            raise ValueError(error)

        # Split Rule
        if self.splitrule.lower() == 'logrank':
            split_mode = 1
            alpha = 0
            minprop = 0
            num_random_splits = 1

        elif self.splitrule.lower() == "maxstat":
            split_mode = 4
            num_random_splits = 1

            # Maxstat splitting
            if not isinstance(alpha, float):
                error = "Error: Invalid value for alpha, "
                error += "please provide a value that is > 0 and < 1."
                raise ValueError(error)

            if (alpha <= 0 or alpha >= 1):
                error = "Error: Invalid value for alpha, "
                error += "please provide a value between 0 and 1."
                raise ValueError(error)

            if not isinstance(minprop, float):
                error = "Error: Invalid value for minprop, "
                error += "please provide a value between 0 and 0.5"
                raise ValueError(error)

            if (minprop < 0 or minprop > 0.5):
                error = "Error: Invalid value for minprop, "
                error += "please provide a value between 0 and 0.5"
                raise ValueError(error)

        elif self.splitrule.lower() == 'extratrees':
            split_mode = 5
            alpha = 0
            minprop = 0

        # Number of variables to possibly split at in each node
        self.max_features = max_features
        if isinstance(self.max_features, str):

            if self.max_features.lower() == 'sqrt':
                num_variables_to_use = int(np.sqrt(self.num_variables))

            elif 'log' in self.max_features.lower():
                num_variables_to_use = int(np.log2(self.num_variables))

            elif self.max_features.lower() == 'all':
                num_variables_to_use = self.num_variables

            else:
                raise ValueError("Unknown max features option")

        elif isinstance(self.max_features, float) or \
            isinstance(self.max_features, int):

            if 0 < self.max_features < 1:
                num_variables_to_use = int(self.num_variables *
                                           self.max_features)

            elif self.max_features >= 1:
                num_variables_to_use = min(self.num_variables,
                                           self.max_features)
                if self.max_features > self.num_variables:
                    msg = "max features value is greater than the number of "
                    msg += "variables ({num_variables}) of the input X. "
                    msg += "So it was set to {num_variables}."
                    msg = msg.format(num_variables=self.num_variables)
                    warnings.warn(msg, UserWarning)

            elif self.max_features <= 0:
                raise ValueError("max features is a positive value")

        else:
            raise ValueError("Unknown max features option")

        # Defining importance mode
        if 'permutation' in importance_mode.lower():

            if 'scaled' in importance_mode.lower() or \
            'normalized' in importance_mode.lower():
                importance_mode = 2
            else:
                importance_mode = 3

        elif 'impurity' in importance_mode.lower():
            importance_mode = 5

        else:
            error = "{} is not a valid importance mode".format(importance_mode)
            raise ValueError(error)

        # Weights
        if weights is None:
            case_weights = [1. / N] * N
        else:
            case_weights = utils.check_data(weights)

            if abs(sum(case_weights) - 1.) >= 1e-4:
                raise Exception(
                    "The sum of the weights needs to be equal to 1.")

            if len(case_weights) != N:
                raise Exception("weights length needs to be {} ".format(N))

        # Fitting the model using the C++ object
        verbose = True
        self.model.fit(input_data, all_data_features, case_weights, num_trees,
                       num_variables_to_use, min_node_size, max_depth, alpha,
                       minprop, num_random_splits, sample_size_pct,
                       importance_mode, split_mode, verbose, seed, num_threads,
                       save_memory)

        # Saving the attributes
        self.save_properties()
        self.get_time_buckets()

        # Extracting the Variable Importance
        self.variable_importance = {}
        for i, value in enumerate(self.variable_importance_):
            self.variable_importance[features[i]] = value

        # Saving the importance in a dataframe
        self.variable_importance_table = pd.DataFrame(
            data={'feature': list(self.variable_importance.keys()),
                  'importance': list(self.variable_importance.values())
                    },
            columns=['feature', 'importance']).\
            sort_values('importance', ascending=0).reset_index(drop=True)
        importance = self.variable_importance_table['importance'].values
        importance = np.maximum(importance, 0.)
        sum_imp = sum(importance) * 1.
        self.variable_importance_table['pct_importance'] = importance / sum_imp

        return self
Example #20
    def generate_data(self,
                      num_samples=100,
                      num_features=3,
                      feature_weights=None):
        """ 
        Generating a dataset of simulated survival times from a given 
        distribution through the hazard function using the Cox model  
        
        Parameters:
        -----------
        * `num_samples`: **int** *(default=100)* --
            Number of samples to generate

        * `num_features`: **int** *(default=3)* --
            Number of features to generate

        * `feature_weights`: **array-like** *(default=None)* -- 
            list of the coefficients of the underlying Cox model. 
            The features linked to each coefficient are generated 
            from a random distribution chosen from the following list:

            * binomial
            * chisquare
            * exponential
            * gamma
            * normal
            * uniform
            * laplace

            If None then feature_weights = [1.]*num_features

        Returns:
        --------
        * dataset: pandas.DataFrame
            dataset of simulated survival times, event status and features


        Example:
        --------
        from pysurvival.models.simulations import SimulationModel

        # Initializing the simulation model
        sim = SimulationModel( survival_distribution = 'gompertz',  
                               risk_type = 'linear',
                               censored_parameter = 5.0, 
                               alpha = 0.01, 
                               beta = 5., )

        # Generating N Random samples
        N = 1000
        dataset = sim.generate_data(num_samples = N, num_features=5)

        # Showing a few data-points
        dataset.head()
        """

        # Data parameters
        self.num_variables = num_features
        if feature_weights is None:
            self.feature_weights = [1.] * self.num_variables
            feature_weights = self.feature_weights

        else:
            feature_weights = utils.check_data(feature_weights)
            if num_features != len(feature_weights):
                error = "The length of feature_weights ({}) "
                error += "and num_features ({}) are not the same."
                error = error.format(len(feature_weights), num_features)
                raise ValueError(error)
            self.feature_weights = feature_weights

        # Generating random features
        # Creating the features
        X = np.zeros((num_samples, self.num_variables))
        columns = []
        for i in range(self.num_variables):
            key, X[:, i] = self.random_data(num_samples)
            columns.append('x_' + str(i + 1))
        X_std = self.scaler.fit_transform(X)
        BX = self.risk_function(X_std)

        # Building the survival times
        T = self.time_function(BX)
        C = np.random.normal(loc=self.censored_parameter,
                             scale=5,
                             size=num_samples)
        C = np.maximum(C, 0.)
        time = np.minimum(T, C)
        E = 1. * (T == time)

        # Building dataset
        self.features = columns
        self.dataset = pd.DataFrame(data=np.c_[X, time, E],
                                    columns=columns + ['time', 'event'])

        # Building the time axis and time buckets
        self.times = np.linspace(0., max(self.dataset['time']), self.bins)
        self.get_time_buckets()

        # Building baseline functions
        self.baseline_hazard = self.hazard_function(self.times, 0)
        self.baseline_survival = self.survival_function(self.times, 0)

        # Printing summary message
        message_to_print = "Number of data-points: {} - Number of events: {}"
        print(message_to_print.format(num_samples, sum(E)))

        return self.dataset
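A self-contained sketch of the censoring step used above: the observed time is the minimum of the event time T and the censoring time C, and E records whether the event was actually observed (all values here are made up):

import numpy as np

T = np.array([2.0, 5.0, 1.0])      # stand-in simulated event times
C = np.array([3.0, 4.0, 0.5])      # stand-in censoring times
time = np.minimum(T, C)            # observed time
E = 1. * (T == time)               # 1. if the event happened before censoring
print(time)   # [2.  4.  0.5]
print(E)      # [1. 0. 0.]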
Example #21
def compare_to_actual(model,
                      X,
                      T,
                      E,
                      times=None,
                      is_at_risk=False,
                      figure_size=(16, 6),
                      metrics=['rmse', 'mean', 'median'],
                      **kwargs):
    """
    Comparing the actual and predicted number of units at risk and units 
    experiencing an event at each time t.

    Parameters:
    -----------
    * model : pysurvival model
        The model that will be used for prediction

    * X : array-like, shape=(n_samples, n_features)
        The input samples.

    * T : array-like, shape = [n_samples] 
        The target values describing when the event of interest or censoring
        occurred

    * E : array-like, shape = [n_samples] 
        The Event indicator array such that E = 1. if the event occurred
        E = 0. if censoring occurred

    * times: array-like, (default=None)
        A vector of timepoints.

    * is_at_risk: bool (default=False)
        Whether the function returns the expected number of units at risk
        or the expected number of units experiencing the events.

    * figure_size: tuple of double (default= (16, 6))
        width, height in inches representing the size of the chart 

    * metrics: str or list of str (default=['rmse', 'mean', 'median'])
        Indicates the performance metrics to compute:
            - if None, then no metric is computed
            - if str, then the metric is computed
            - if list of str, then the metrics are computed

        The available metrics are:
            - RMSE: root mean squared error
            - Mean Abs Error: mean absolute error
            - Median Abs Error: median absolute error

    Returns:
    --------
    * results: float or dict
        Performance metrics   

    """

    # Initializing the Kaplan-Meier model
    X, T, E = utils.check_data(X, T, E)
    kmf = KaplanMeierModel()
    kmf.fit(T, E)

    # Creating actual vs predicted
    N = T.shape[0]

    # Defining the time axis
    if times is None:
        times = kmf.times

    # Number of Expected number of units at risk
    # or the Expected number of units experiencing the events
    actual = []
    actual_upper = []
    actual_lower = []
    predicted = []
    if is_at_risk:
        model_predicted = np.sum(model.predict_survival(X, **kwargs), 0)

        for t in times:
            min_index = [abs(a_j_1 - t) for (a_j_1, a_j) in model.time_buckets]
            index = np.argmin(min_index)
            actual.append(N * kmf.predict_survival(None, t))
            actual_upper.append(N * kmf.predict_survival_upper(None, t))
            actual_lower.append(N * kmf.predict_survival_lower(None, t))
            predicted.append(model_predicted[index])

    else:
        model_predicted = np.sum(model.predict_density(X, **kwargs), 0)

        for t in times:
            min_index = [abs(a_j_1 - t) for (a_j_1, a_j) in model.time_buckets]
            index = np.argmin(min_index)
            actual.append(N * kmf.predict_density(None, t))
            h = kmf.predict_hazard(None, t)
            actual_upper.append(N * kmf.predict_survival_upper(None, t) * h)
            actual_lower.append(N * kmf.predict_survival_lower(None, t) * h)
            predicted.append(model_predicted[index])

    # Computing the performance metrics
    results = None
    title = 'Actual vs Predicted'
    if metrics is not None:

        # RMSE
        rmse = np.sqrt(mean_squared_error(actual, predicted))

        # Median Abs Error
        med_ae = median_absolute_error(actual, predicted)

        # Mean Abs Error
        mae = mean_absolute_error(actual, predicted)

        if isinstance(metrics, str):

            # RMSE
            if 'rmse' in metrics.lower() or 'root' in metrics.lower():
                results = rmse
                title += "\n"
                title += "RMSE = {:.3f}".format(rmse)

            # Median Abs Error
            elif 'median' in metrics.lower():
                results = med_ae
                title += "\n"
                title += "Median Abs Error = {:.3f}".format(med_ae)

            # Mean Abs Error
            elif 'mean' in metrics.lower():
                results = mae
                title += "\n"
                title += "Mean Abs Error = {:.3f}".format(mae)

            else:
                raise NotImplementedError(
                    '{} is not a valid metric function.'.format(metrics))

        elif isinstance(metrics, (list, np.ndarray)):
            results = {}

            # RMSE
            is_rmse = False
            if any([('rmse' in m.lower() or 'root' in m.lower()) \
                    for m in metrics]):
                is_rmse = True
                results['root_mean_squared_error'] = rmse
                title += "\n"
                title += "RMSE = {:.3f}".format(rmse)

            # Median Abs Error
            is_med_ae = False
            if any(['median' in m.lower() for m in metrics]):
                is_med_ae = True
                results['median_absolute_error'] = med_ae
                title += "\n"
                title += "Median Abs Error = {:.3f}".format(med_ae)

            # Mean Abs Error
            is_mae = False
            if any(['mean' in m.lower() for m in metrics]):
                is_mae = True
                results['mean_absolute_error'] = mae
                title += "\n"
                title += "Mean Abs Error = {:.3f}".format(mae)

            if all([not is_mae, not is_rmse, not is_med_ae]):
                error = 'The provided metrics are not available.'
                raise NotImplementedError(error)

    # Plotting
    fig, ax = plt.subplots(figsize=figure_size)
    ax.plot(times, actual, color='red', label='Actual', alpha=0.8, lw=3)
    ax.plot(times, predicted, color='blue', label='Predicted', alpha=0.8, lw=3)
    plt.xlim(0, max(T))

    # Filling the areas between the Survival and Confidence Intervals curves
    plt.fill_between(times,
                     actual,
                     actual_lower,
                     label='Confidence Intervals - Lower',
                     color='red',
                     alpha=0.2)
    plt.fill_between(times,
                     actual,
                     actual_upper,
                     label='Confidence Intervals - Upper',
                     color='red',
                     alpha=0.2)

    # Finalizing the chart
    plt.title(title, fontsize=15)
    plt.legend(fontsize=15)
    plt.show()

    return results
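A hedged usage sketch for compare_to_actual; `model`, `X_test`, `T_test`, and `E_test` are hypothetical (a fitted pysurvival model and a held-out test set):

results = compare_to_actual(model, X_test, T_test, E_test,
                            is_at_risk=False,
                            metrics=['rmse', 'mean', 'median'])
# results is a dict such as {'root_mean_squared_error': ...,
#                            'median_absolute_error': ...,
#                            'mean_absolute_error': ...}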
    def fit(self,
            X,
            T,
            E,
            init_method='glorot_normal',
            lr=1e-2,
            max_iter=100,
            l2_reg=1e-2,
            alpha=0.95,
            tol=1e-3,
            verbose=True):
        """
        Fitting a proportional hazards regression model using
        Efron's approximation method to handle tied times.
        
        As the Hessian matrix of the log-likelihood can be 
        calculated without too much effort, the model parameters are 
        computed using the Newton-Raphson optimization scheme:
                W_new = W_old - lr*<Hessian^(-1), gradient>
        
        Arguments:
        ---------
        * `X` : **array-like**, *shape=(n_samples, n_features)* --
            The input samples.

        * `T` : **array-like** -- 
            The target values describing when the event of interest or 
            censoring occurred.

        * `E` : **array-like** --
            The values that indicate if the event of interest occurred 
            i.e.: E[i]=1 corresponds to an event, and E[i] = 0 means censoring, 
            for all i.

        * `init_method` : **str** *(default = 'glorot_normal')* -- 
            Initialization method to use. Here are the possible options:

            * `glorot_uniform`: Glorot/Xavier uniform initializer
            * `he_uniform`: He uniform variance scaling initializer
            * `uniform`: Initializing tensors with uniform (-1, 1) distribution
            * `glorot_normal`: Glorot normal initializer,
            * `he_normal`: He normal initializer.
            * `normal`: Initializing tensors with standard normal distribution
            * `ones`: Initializing tensors to 1
            * `zeros`: Initializing tensors to 0
            * `orthogonal`: Initializing tensors with an orthogonal matrix
            
        * `lr`: **float** *(default=1e-2)* -- 
            learning rate used in the optimization

        * `max_iter`: **int** *(default=100)* -- 
            The maximum number of iterations in the Newton optimization

        * `l2_reg`: **float** *(default=1e-2)* -- 
            L2 regularization parameter for the model coefficients

        * `alpha`: **float** *(default=0.95)* -- 
            Confidence interval

        * `tol`: **float** *(default=1e-3)* -- 
            Tolerance for stopping criteria

        * `verbose`: **bool** *(default=True)* -- 
            Whether or not to produce detailed logging about the modeling
 
        Example:
        --------

        #### 1 - Importing packages
        import numpy as np
        import pandas as pd
        from matplotlib import pyplot as plt
        from sklearn.model_selection import train_test_split
        from pysurvival.models.simulations import SimulationModel
        from pysurvival.models.semi_parametric import CoxPHModel
        from pysurvival.utils.metrics import concordance_index
        from pysurvival.utils.display import integrated_brier_score
        #%pylab inline  # To use with Jupyter notebooks


        #### 2 - Generating the dataset from a Log-Logistic parametric model
        # Initializing the simulation model
        sim = SimulationModel( survival_distribution = 'log-logistic',  
                               risk_type = 'linear',
                               censored_parameter = 10.1, 
                               alpha = 0.1, beta=1.2 )

        # Generating N random samples 
        N = 1000
        dataset = sim.generate_data(num_samples = N, num_features = 3)

        #### 3 - Creating the modeling dataset
        # Defining the features
        features = sim.features

        # Building training and testing sets #
        index_train, index_test = train_test_split( range(N), test_size = 0.2)
        data_train = dataset.loc[index_train].reset_index( drop = True )
        data_test  = dataset.loc[index_test].reset_index( drop = True )

        # Creating the X, T and E input
        X_train, X_test = data_train[features], data_test[features]
        T_train, T_test = data_train['time'].values, data_test['time'].values
        E_train, E_test = data_train['event'].values, data_test['event'].values


        #### 4 - Creating an instance of the Cox PH model and fitting the data.
        # Building the model
        coxph = CoxPHModel()
        coxph.fit(X_train, T_train, E_train, lr=0.5, l2_reg=1e-2, 
            init_method='zeros')


        #### 5 - Cross Validation / Model Performances
        c_index = concordance_index(coxph, X_test, T_test, E_test) #0.92
        print('C-index: {:.2f}'.format(c_index))

        ibs = integrated_brier_score(coxph, X_test, T_test, E_test, t_max=10, 
                    figure_size=(20, 6.5) )

        References:
        -----------
        * https://en.wikipedia.org/wiki/Proportional_hazards_model#Tied_times
        * Efron, Bradley (1977). "The Efficiency of Cox's Likelihood 
          Function for Censored Data". Journal of the American Statistical 
          Association. 72 (359): 557-565. 
        """

        # Collecting features names
        N, self.num_vars = X.shape
        if isinstance(X, pd.DataFrame):
            self.variables = X.columns.tolist()
        else:
            self.variables = ['x_{}'.format(i) for i in range(self.num_vars)]

        # Checking the format of the data
        X, T, E = utils.check_data(X, T, E)
        order = np.argsort(-T)
        T = T[order]
        E = E[order]
        X = self.scaler.fit_transform(X[order, :])
        self.std_scale = np.sqrt(self.scaler.var_)

        # Initializing the model
        self.model = _CoxPHModel()

        # Creating the time axis
        self.model.get_times(T, E)

        # Initializing the parameters
        W = np.zeros(self.num_vars)
        W = opt.initialization(init_method, W, False).flatten()
        W = W.astype(np.float64)

        # Optimizing to find best parameters
        epsilon = 1e-9
        self.model.newton_optimization(X, T, E, W, lr, l2_reg, tol, epsilon,
                                       max_iter, verbose)

        # Saving the Cython attributes in the Python object
        self.weights = np.array(self.model.W)
        self.loss = self.model.loss
        self.times = np.array(self.model.times)
        self.gradient = np.array(self.model.gradient)
        self.Hessian = np.array(self.model.Hessian)
        self.inv_Hessian = np.array(self.model.inv_Hessian)
        self.loss_values = np.array(self.model.loss_values)
        self.grad2_values = np.array(self.model.grad2_values)

        # Computing baseline functions
        score = np.exp(np.dot(X, self.weights))
        baselines = _baseline_functions(score, T, E)

        # Saving the Cython attributes in the Python object
        self.baseline_hazard = np.array(baselines[1])
        self.baseline_survival = np.array(baselines[2])
        del self.model
        self.get_time_buckets()

        # Calculating summary
        self.get_summary(alpha)

        return self
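
# Minimal sketch of the Newton-Raphson rule described in the docstring above,
# W_new = W_old - lr * <Hessian^(-1), gradient>, applied to a toy convex
# objective with made-up values; the actual fit relies on the Cython routine
# self.model.newton_optimization, not on this snippet.
import numpy as np

# Toy quadratic objective f(W) = 0.5*W'AW - b'W, so gradient = AW - b, Hessian = A
A = np.array([[3., 1.], [1., 2.]])
b = np.array([1., -1.])

W = np.zeros(2)                      # same spirit as init_method='zeros'
lr, tol, max_iter = 1.0, 1e-3, 100

for _ in range(max_iter):
    gradient = A.dot(W) - b
    Hessian = A
    step = np.linalg.solve(Hessian, gradient)   # Hessian^(-1) . gradient
    W = W - lr * step                           # Newton-Raphson update
    if np.linalg.norm(step) < tol:              # stop once the update is tiny
        break
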
    def fit(self,
            X,
            T,
            E,
            init_method='glorot_uniform',
            optimizer='adam',
            lr=1e-4,
            num_epochs=1000,
            dropout=0.2,
            batch_normalization=False,
            bn_and_dropout=False,
            l2_reg=1e-5,
            verbose=True):
        """ 
        Fit the estimator based on the given parameters.

        Parameters:
        -----------
        * `X` : **array-like**, *shape=(n_samples, n_features)* --
            The input samples.

        * `T` : **array-like** -- 
            The target values describing when the event of interest or censoring
            occurred.

        * `E` : **array-like** --
            The values that indicate if the event of interest occurred i.e.: 
            E[i]=1 corresponds to an event, and E[i] = 0 means censoring, 
            for all i.

        * `init_method` : **str** *(default = 'glorot_uniform')* -- 
            Initialization method to use. Here are the possible options:

            * `glorot_uniform`: Glorot/Xavier uniform initializer
            * `he_uniform`: He uniform variance scaling initializer 
            * `uniform`: Initializing tensors with uniform (-1, 1) distribution
            * `glorot_normal`: Glorot normal initializer,
            * `he_normal`: He normal initializer.
            * `normal`: Initializing tensors with standard normal distribution
            * `ones`: Initializing tensors to 1
            * `zeros`: Initializing tensors to 0
            * `orthogonal`: Initializing tensors with an orthogonal matrix

        * `optimizer`:  **str** *(default = 'adam')* -- 
            iterative method for optimizing a differentiable objective function.
            Here are the possible options:

            - `adadelta`
            - `adagrad`
            - `adam`
            - `adamax`
            - `rmsprop`
            - `sparseadam`
            - `sgd`

        * `lr`: **float** *(default=1e-4)* -- 
            learning rate used in the optimization

        * `num_epochs`: **int** *(default=1000)* -- 
            The number of iterations in the optimization

        * `dropout`: **float** *(default=0.2)* -- 
            Randomly sets a fraction rate of input units to 0 
            at each update during training time, which helps prevent overfitting.

        * `l2_reg`: **float** *(default=1e-5)* -- 
            L2 regularization parameter for the model coefficients

        * `batch_normalization`: **bool** *(default=False)* -- 
            Applying Batch Normalization or not

        * `bn_and_dropout`: **bool** *(default=False)* -- 
            Applying Batch Normalization and Dropout at the same time

        * `verbose`: **bool** *(default=True)* -- 
            Whether or not to produce detailed logging about the modeling
                

        Example:
        --------

        #### 1 - Importing packages
        import numpy as np
        import pandas as pd
        from matplotlib import pyplot as plt
        from sklearn.model_selection import train_test_split
        from pysurvival.models.simulations import SimulationModel
        from pysurvival.models.semi_parametric import NonLinearCoxPHModel
        from pysurvival.utils.metrics import concordance_index
        from pysurvival.utils.display import integrated_brier_score
        #%matplotlib inline  # To use with Jupyter notebooks

        #### 2 - Generating the dataset from a nonlinear Weibull parametric model
        # Initializing the simulation model
        sim = SimulationModel( survival_distribution = 'weibull',  
                               risk_type = 'Gaussian',
                               censored_parameter = 2.1, 
                               alpha = 0.1, beta=3.2 )

        # Generating N random samples 
        N = 1000
        dataset = sim.generate_data(num_samples = N, num_features=3)

        # Showing a few data-points 
        dataset.head(2)

        #### 3 - Creating the modeling dataset
        # Defining the features
        features = sim.features

        # Building training and testing sets #
        index_train, index_test = train_test_split( range(N), test_size = 0.2)
        data_train = dataset.loc[index_train].reset_index( drop = True )
        data_test  = dataset.loc[index_test].reset_index( drop = True )

        # Creating the X, T and E input
        X_train, X_test = data_train[features], data_test[features]
        T_train, T_test = data_train['time'].values, data_test['time'].values
        E_train, E_test = data_train['event'].values, data_test['event'].values


        #### 4 - Creating an instance of the NonLinear CoxPH model and fitting 
        # the data.

        # Defining the MLP structure. Here we will build a 1-hidden layer 
        # with 150 units and `BentIdentity` as its activation function
        structure = [ {'activation': 'BentIdentity', 'num_units': 150},  ]

        # Building the model
        nonlinear_coxph = NonLinearCoxPHModel(structure=structure) 
        nonlinear_coxph.fit(X_train, T_train, E_train, lr=1e-3, 
            init_method='glorot_uniform')


        #### 5 - Cross Validation / Model Performances
        c_index = concordance_index(nonlinear_coxph, X_test, T_test, E_test)
        print('C-index: {:.2f}'.format(c_index))

        ibs = integrated_brier_score(nonlinear_coxph, X_test, T_test, E_test, 
            t_max=10, figure_size=(20, 6.5) )

        """

        # Checking data format (i.e.: transforming into numpy array)
        X, T, E = utils.check_data(X, T, E)

        # Extracting data parameters
        N, self.num_vars = X.shape
        input_shape = self.num_vars

        # Scaling data
        if self.auto_scaler:
            X_original = self.scaler.fit_transform(X)
        else:
            X_original = X

        # Sorting X, T, E in descending order according to T
        order = np.argsort(-T)
        T = T[order]
        E = E[order]
        X_original = X_original[order, :]
        self.times = np.unique(T[E.astype(bool)])
        self.nb_times = len(self.times)
        self.get_time_buckets()

        # Initializing the model
        model = nn.NeuralNet(input_shape, 1, self.structure, init_method,
                             dropout, batch_normalization, bn_and_dropout)

        # Converting the training data into a tensor
        X = torch.cuda.FloatTensor(X_original)

        # Computing the Risk and Fail tensors
        Risk, Fail = self.risk_fail_matrix(T, E)
        Risk = torch.cuda.FloatTensor(Risk)
        Fail = torch.cuda.FloatTensor(Fail)

        # Computing Efron's matrices
        Efron_coef, Efron_one, Efron_anti_one = self.efron_matrix()
        Efron_coef = torch.cuda.FloatTensor(Efron_coef)
        Efron_one = torch.cuda.FloatTensor(Efron_one)
        Efron_anti_one = torch.cuda.FloatTensor(Efron_anti_one)

        # Performing order 1 optimization
        model, loss_values = opt.optimize(self.loss_function,
                                          model,
                                          optimizer,
                                          lr,
                                          num_epochs,
                                          verbose,
                                          X=X,
                                          Risk=Risk,
                                          Fail=Fail,
                                          Efron_coef=Efron_coef,
                                          Efron_one=Efron_one,
                                          Efron_anti_one=Efron_anti_one,
                                          l2_reg=l2_reg)

        # Saving attributes
        self.model = model.eval()
        self.loss_values = loss_values

        # Computing baseline functions
        x = torch.cuda.FloatTensor(X_original)

        # Calculating risk_score
        score = np.exp(self.model(x).data.cpu().numpy().flatten())
        baselines = _baseline_functions(score, T, E)

        # Saving the Cython attributes in the Python object
        self.times = np.array(baselines[0])
        self.baseline_hazard = np.array(baselines[1])
        self.baseline_survival = np.array(baselines[2])

        return self
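
# Minimal sketch of the proportional hazards relation that links the baseline
# functions saved above to an individual prediction, S(t | x) = S0(t) ** exp(f(x)).
# The values below are synthetic placeholders, not outputs of the fitted network,
# and this is not the library's predict_survival implementation.
import numpy as np

baseline_survival = np.linspace(1.0, 0.1, 50)   # stand-in for self.baseline_survival
risk_score = np.exp(0.7)                        # stand-in for exp(f(x)) of one sample
survival_curve = baseline_survival ** risk_score
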
    def fit(self, T, E, weights=None, alpha=0.95):
        """ Fitting the model according to the provided data.

        Parameters:
        -----------
        * `T` : **array-like** -- 
            The target values describing when the event of interest or censoring
            occurred.

        * `E` : **array-like** --
            The values that indicate if the event of interest occurred i.e.: E[i]=1
            corresponds to an event, and E[i] = 0 means censoring, for all i.

        * `weights` : **array-like** *(default = None)* -- 
            Array of weights that are assigned to individual samples.
            If not provided, then each sample is given a unit weight.

        * `alpha`: **float** *(default = 0.95)* --
            Confidence level used to compute the confidence intervals 
            of the survival function

        Returns:
        --------
        * self : object


        Example:
        --------

        # Importing modules
        import numpy as np
        from matplotlib import pyplot as plt
        from pysurvival.utils.display import display_non_parametric
        # %matplotlib inline #Uncomment when using Jupyter 

        # Generating random times and event indicators 
        T = np.round(np.abs(np.random.normal(10, 10, 1000)), 1)
        E = np.random.binomial(1, 0.3, 1000)

        # Initializing the KaplanMeierModel
        from pysurvival.models.non_parametric import KaplanMeierModel
        km_model = KaplanMeierModel()

        # Fitting the model 
        km_model.fit(T, E, alpha=0.95)

        # Displaying the survival function and confidence intervals
        display_non_parametric(km_model)

        # Initializing the SmoothKaplanMeierModel
        from pysurvival.models.non_parametric import SmoothKaplanMeierModel
        skm_model = SmoothKaplanMeierModel(bandwidth=0.1, kernel='normal')

        # Fitting the model
        skm_model.fit(T, E)

        # Displaying the survival function and confidence intervals
        display_non_parametric(skm_model)
        """

        # Checking the format of the data 
        T, E = utils.check_data(T, E)

        # weighting
        if weights is None:
            weights = [1.]*T.shape[0]

        # Confidence Intervals
        z = stats.norm.ppf((1. - alpha) / 2.)

        # Building the Kaplan-Meier model
        survival = self.model.fit(T, E, weights, z)
        if sum(survival) <= 0.:
            mem_error = "The kernel matrix cannot fit in memory. "
            mem_error += "You should use a bigger bandwidth b."
            raise MemoryError(mem_error)

        # Saving all properties
        self.save_properties()

        # Generating the Survival table
        if 'smooth' not in self.name.lower():
            self.get_survival_table()

        return self
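
# Minimal, unweighted sketch of the product-limit (Kaplan-Meier) estimator the
# fit computes internally, S(t) = prod_{t_i <= t} (1 - d_i / n_i); illustrative
# data only, not the library's C++ implementation.
import numpy as np

T = np.array([2., 4., 4., 5., 6., 6.])
E = np.array([1,  1,  0,  1,  0,  1])

S, survival = 1.0, []
for t in np.sort(np.unique(T[E == 1])):
    d = np.sum((T == t) & (E == 1))   # events occurring at time t
    n = np.sum(T >= t)                # subjects still at risk just before t
    S *= 1. - d / n
    survival.append((t, S))
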
Example #25
0
    def fit(self,
            X,
            T,
            E,
            with_bias=True,
            init_method='glorot_normal',
            lr=1e-2,
            max_iter=100,
            l2_reg=1e-4,
            tol=1e-3,
            verbose=True):
        """
        Fitting a Survival Support Vector Machine model.

        As the Hessian matrix of the log-likelihood can be 
        calculated without too much effort, the model parameters are 
        computed using the Newton-Raphson optimization scheme:
                W_new = W_old - lr*<Hessian^(-1), gradient>

        Arguments:
        ---------
        
        * `X` : array-like, shape=(n_samples, n_features)
            The input samples.

        * `T` : array-like, shape = [n_samples] 
            The target values describing when the event of interest or censoring
            occurred

        * `E` : array-like, shape = [n_samples] 
            The Event indicator array such that E = 1. if the event occurred
            E = 0. if censoring occurred

        * `with_bias`: bool (default=True)
            Whether a bias should be added 

        * `init_method` : str (default = 'glorot_normal')
            Initialization method to use. Here are the possible options:
                * 'glorot_uniform': Glorot/Xavier uniform initializer, 
                * 'he_uniform': He uniform variance scaling initializer
                * 'uniform': Initializing tensors with uniform (-1, 1) distribution
                * 'glorot_normal': Glorot normal initializer,
                * 'he_normal': He normal initializer.
                * 'normal': Initializing tensors with standard normal distribution
                * 'ones': Initializing tensors to 1
                * 'zeros': Initializing tensors to 0
                * 'orthogonal': Initializing tensors with an orthogonal matrix

        * `lr`: float (default=1e-2)
            learning rate used in the optimization

        * `max_iter`: int (default=100)
            The maximum number of iterations in the Newton optimization

        * `l2_reg`: float (default=1e-4)
            L2 regularization parameter for the model coefficients

        * `tol`: float (default=1e-3)
            Tolerance for stopping criteria

        * `verbose`: bool (default=True)
            Whether or not to produce detailed logging about the modeling


        Example:
        --------

        #### 1 - Importing packages
        import numpy as np
        import pandas as pd
        from pysurvival.models.svm import LinearSVMModel
        from pysurvival.models.svm import KernelSVMModel
        from pysurvival.models.simulations import SimulationModel
        from pysurvival.utils.metrics import concordance_index
        from sklearn.model_selection import train_test_split
        from scipy.stats.stats import pearsonr   
        # %pylab inline # to use in jupyter notebooks

        #### 2 - Generating the dataset from the parametric model
        # Initializing the simulation model
        sim = SimulationModel( survival_distribution = 'Log-Logistic',  
                               risk_type = 'linear',
                               censored_parameter = 1.1, 
                               alpha = 1.5, beta = 4)

        # Generating N Random samples
        N = 1000
        dataset = sim.generate_data(num_samples = N, num_features = 4)

        #### 3 - Splitting the dataset into training and testing sets
        # Defining the features
        features = sim.features

        # Building training and testing sets #
        index_train, index_test = train_test_split( range(N), test_size = 0.2)
        data_train = dataset.loc[index_train].reset_index( drop = True )
        data_test  = dataset.loc[index_test].reset_index( drop = True )

        # Creating the X, T and E input
        X_train, X_test = data_train[features], data_test[features]
        T_train, T_test = data_train['time'].values, data_test['time'].values
        E_train, E_test = data_train['event'].values, data_test['event'].values


        #### 4 - Creating an instance of the SVM model and fitting the data.
        svm_model = LinearSVMModel()
        # The kernel variant is used below; keep only the line above for a linear SVM
        svm_model = KernelSVMModel(kernel='Gaussian', scale=0.25)
        svm_model.fit(X_train, T_train, E_train, init_method='he_uniform', 
            with_bias = True, lr = 0.5,  tol = 1e-3,  l2_reg = 1e-3)

        #### 5 - Cross Validation / Model Performances
        c_index = concordance_index(svm_model, X_test, T_test, E_test) #0.93
        print('C-index: {:.2f}'.format(c_index))

        #### 6 - Comparing the model predictions to Actual risk score
        # Comparing risk scores
        svm_risks = svm_model.predict_risk(X_test)
        actual_risks = sim.predict_risk(X_test).flatten()
        print("corr={:.4f}, p_value={:.5f}".format(*pearsonr(svm_risks, 
            actual_risks)))# corr=-0.9992, p_value=0.00000

        """

        # Collecting features names
        N, self.num_vars = X.shape
        if isinstance(X, pd.DataFrame):
            self.variables = X.columns.tolist()
        else:
            self.variables = ['x_{}'.format(i) for i in range(self.num_vars)]

        # Adding a bias or not
        self.with_bias = with_bias
        if with_bias:
            self.variables += ['intercept']
        p = int(self.num_vars + 1. * with_bias)

        # Checking the format of the data
        X, T, E = utils.check_data(X, T, E)

        if with_bias:
            # Adding the intercept
            X = np.c_[X, [1.] * N]
        X = self.scaler.fit_transform(X)

        # Initializing the parameters
        if self.kernel_type == 0:
            W = np.zeros((p, 1))
        else:
            W = np.zeros((N, 1))
        W = opt.initialization(init_method, W, False).flatten()
        W = W.astype(np.float64)

        # Optimizing to find best parameters
        self.model.newton_optimization(X, T, E, W, lr, l2_reg, tol, max_iter,
                                       verbose)
        self.save_properties()

        return self
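
# Shape check only: as the initialization above shows, the linear SVM learns one
# coefficient per feature (plus the optional intercept), while the kernel SVM
# learns one coefficient per training sample. Sizes below are illustrative.
import numpy as np

N, num_vars, with_bias = 800, 4, True
p = num_vars + int(with_bias)

W_linear = np.zeros((p, 1))   # kernel_type == 0: shape (n_features + bias, 1)
W_kernel = np.zeros((N, 1))   # kernel model: shape (n_samples, 1)
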
Example #26
0
def concordance_index(model, X, T, E, include_ties = True,
    additional_results=False, **kwargs):
    """ 
    Computing the C-index based on *On The C-Statistics For Evaluating Overall
    Adequacy Of Risk Prediction Procedures With Censored Survival Data*,
    *Estimating the Concordance Probability in a Survival Analysis
    with a Discrete Number of Risk Groups*, and *Concordance for Survival 
    Time Data: Fixed and Time-Dependent Covariates and Possible Ties in
    Predictor and Time*.

    Similarly to the AUC, C-index = 1 corresponds to the best model 
    prediction, and C-index = 0.5 represents a random prediction.

    Parameters:
    -----------
    * model : Pysurvival object
        Pysurvival model

    * X : array-like, shape=(n_samples, n_features)
        The input samples.

    * T : array-like, shape = [n_samples] 
        The target values describing when the event of interest or censoring
        occurred

    * E : array-like, shape = [n_samples] 
        The Event indicator array such that E = 1. if the event occurred
        E = 0. if censoring occurred
    
    * include_ties: bool (default=True)
        Specifies whether ties in risk score are included in calculations

    * additional_results: bool (default=False)
        Specifies whether only the c-index should be returned (False)
        or if additional values should be returned as well. The values are:
            - c-index
            - nb_pairs
            - nb_concordant_pairs

    Returns:
    --------
        * results: double or dict (if additional_results = True)
            - results is the c-index (double) if additional_results = False
            - results is a dict if additional_results = True, such that
                results[0] = C-index;
                results[1] = nb_pairs;
                results[2] = nb_concordant_pairs;
                    
    Example:
    --------


    """

    # Checking the format of the data 
    risk = model.predict_risk(X, **kwargs)
    risk, T, E = utils.check_data(risk, T, E)

    # Ordering risk, T and E in descending order according to T
    order = np.argsort(-T)
    risk = risk[order]
    T = T[order]
    E = E[order]

    # Calculating the c-index
    results = _concordance_index(risk, T, E, include_ties)

    if not additional_results:
        return results[0]
    else:
        return results
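
# Minimal usage sketch for concordance_index, assuming a model and test split
# built as in the CoxPH example earlier in this listing (coxph, X_test, T_test,
# E_test); adjust the names to whatever model was actually fitted.
from pysurvival.utils.metrics import concordance_index

c_index = concordance_index(coxph, X_test, T_test, E_test)
print('C-index: {:.2f}'.format(c_index))

# Also retrieving the pair counts behind the score
results = concordance_index(coxph, X_test, T_test, E_test,
                            additional_results=True)
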
Example #27
0
    def fit(self,
            X,
            T,
            E,
            init_method='glorot_uniform',
            optimizer='adam',
            lr=1e-4,
            num_epochs=1000,
            dropout=0.2,
            l2_reg=1e-2,
            l2_smooth=1e-2,
            batch_normalization=False,
            bn_and_dropout=False,
            verbose=True,
            extra_pct_time=0.1,
            is_min_time_zero=True,
            max_norm=1.0,
            min_clamp_value=1e-8,
            max_clamp_value=torch.finfo(torch.float32).max - 1):
        """ Fit the estimator based on the given parameters.

        Parameters:
        -----------
        * `X` : **array-like**, *shape=(n_samples, n_features)* --
            The input samples.

        * `T` : **array-like** -- 
            The target values describing when the event of interest or censoring
            occurred.

        * `E` : **array-like** --
            The values that indicate if the event of interest occurred i.e.: 
            E[i]=1 corresponds to an event, and E[i] = 0 means censoring, 
            for all i.

        * `init_method` : **str** *(default = 'glorot_uniform')* -- 
            Initialization method to use. Here are the possible options:

            * `glorot_uniform`: Glorot/Xavier uniform initializer
            * `he_uniform`: He uniform variance scaling initializer
            * `uniform`: Initializing tensors with uniform (-1, 1) distribution
            * `glorot_normal`: Glorot normal initializer,
            * `he_normal`: He normal initializer.
            * `normal`: Initializing tensors with standard normal distribution
            * `ones`: Initializing tensors to 1
            * `zeros`: Initializing tensors to 0
            * `orthogonal`: Initializing tensors with an orthogonal matrix

        * `optimizer`:  **str** *(default = 'adam')* -- 
            iterative method for optimizing a differentiable objective function.
            Here are the possible options:

            - `adadelta`
            - `adagrad`
            - `adam`
            - `adamax`
            - `rmsprop`
            - `sparseadam`
            - `sgd`

        * `lr`: **float** *(default=1e-4)* -- 
            learning rate used in the optimization

        * `num_epochs`: **int** *(default=1000)* -- 
            The number of iterations in the optimization

        * `dropout`: **float** *(default=0.2)* -- 
            Randomly sets a fraction rate of input units to 0 
            at each update during training time, which helps prevent overfitting.

        * `l2_reg`: **float** *(default=1e-2)* -- 
            L2 regularization parameter for the model coefficients

        * `l2_smooth`: **float** *(default=1e-2)* -- 
            Second L2 regularizer that ensures the parameters vary smoothly 
            across consecutive time points.

        * `batch_normalization`: **bool** *(default=False)* -- 
            Applying Batch Normalization or not

        * `bn_and_dropout`: **bool** *(default=False)* -- 
            Applying Batch Normalization and Dropout at the same time

        * `verbose`: **bool** *(default=True)* -- 
            Whether or not to produce detailed logging about the modeling

        * `extra_pct_time`: **float** *(default=0.1)* -- 
            Providing an extra fraction of time in the time axis

        * `is_min_time_zero`: **bool** *(default=True)* -- 
            Whether the time axis starts at 0

        * `max_norm`: **float** *(default=1.0)* --
            Max l2 norm for gradient clipping

        * `min_clamp_value`: **float** *(default=1e-8)* --
            Lower bound used to clamp values during the optimization, 
            for numerical stability

        * `max_clamp_value`: **float** *(default=torch.finfo(torch.float32).max - 1)* --
            Upper bound used to clamp values during the optimization

        **Returns:**

        * self : object


        Example:
        --------
            
        #### 1 - Importing packages
        import numpy as np
        import pandas as pd
        from matplotlib import pyplot as plt
        from sklearn.model_selection import train_test_split
        from pysurvival.models.simulations import SimulationModel
        from pysurvival.models.multi_task import LinearMultiTaskModel
        from pysurvival.utils.metrics import concordance_index
        #%matplotlib inline  # To use with Jupyter notebooks


        #### 2 - Generating the dataset from a Weibull parametric model
        # Initializing the simulation model
        sim = SimulationModel( survival_distribution = 'Weibull',  
                               risk_type = 'linear',
                               censored_parameter = 10.0, 
                               alpha = .01, beta = 3.0 )

        # Generating N random samples 
        N = 1000
        dataset = sim.generate_data(num_samples = N, num_features = 3)

        # Showing a few data-points 
        time_column = 'time'
        event_column = 'event'
        dataset.head(2)

        #### 3 - Creating the modeling dataset
        # Defining the features
        features = sim.features

        # Building training and testing sets #
        index_train, index_test = train_test_split( range(N), test_size = 0.2)
        data_train = dataset.loc[index_train].reset_index( drop = True )
        data_test  = dataset.loc[index_test].reset_index( drop = True )

        # Creating the X, T and E input
        X_train, X_test = data_train[features], data_test[features]
        T_train, T_test = data_train['time'].values, data_test['time'].values
        E_train, E_test = data_train['event'].values, data_test['event'].values

        #### 4 - Initializing a MTLR model and fitting the data.
        # Building a Linear model
        mtlr = LinearMultiTaskModel(bins=50) 
        mtlr.fit(X_train, T_train, E_train, lr=5e-3, init_method='orthogonal')

        # Building a Neural MTLR
        # structure = [ {'activation': 'Swish', 'num_units': 150},  ]
        # mtlr = NeuralMultiTaskModel(structure=structure, bins=150) 
        # mtlr.fit(X_train, T_train, E_train, lr=5e-3, init_method='orthogonal')

        #### 5 - Cross Validation / Model Performances
        c_index = concordance_index(mtlr, X_test, T_test, E_test) #0.95
        print('C-index: {:.2f}'.format(c_index))

        """

        # Checking data format (i.e.: transforming into numpy array)
        X, T, E = utils.check_data(X, T, E)

        input_shape = []
        # Extracting data parameters
        if isinstance(X, list):
            nb_inputs = len(X)
            for data in X:
                nb_units, num_vars = data.shape
                input_shape.append(num_vars)
            # Scaling data
            if self.auto_scaler:
                for index, data in enumerate(X):
                    X[index] = self.scaler.fit_transform(data)
        else:
            nb_inputs = 1
            nb_units, self.num_vars = X.shape
            input_shape.append(self.num_vars)
            # Scaling data
            if self.auto_scaler:
                X = self.scaler.fit_transform(X)

        # Building the time axis, time buckets and output Y
        X_cens, X_uncens, Y_cens, Y_uncens \
            = self.compute_XY(X, T, E, is_min_time_zero, extra_pct_time)

        # Initializing the model
        model = nn.NeuralNet(input_shape, self.num_times, self.structure,
                             init_method, dropout, batch_normalization,
                             bn_and_dropout)

        # Creating the Triangular matrix
        Triangle = np.tri(self.num_times, self.num_times + 1, dtype=np.float32)
        Triangle = torch.FloatTensor(Triangle)

        if torch.cuda.is_available():
            model = model.cuda()
            Triangle = Triangle.cuda()

        # Performing order 1 optimization
        model, loss_values = opt.optimize(self.loss_function,
                                          model,
                                          optimizer,
                                          lr,
                                          num_epochs,
                                          verbose,
                                          X_cens=X_cens,
                                          X_uncens=X_uncens,
                                          Y_cens=Y_cens,
                                          Y_uncens=Y_uncens,
                                          Triangle=Triangle,
                                          l2_reg=l2_reg,
                                          l2_smooth=l2_smooth,
                                          max_norm=max_norm,
                                          min_clamp_value=min_clamp_value,
                                          max_clamp_value=max_clamp_value)

        # Saving attributes
        self.model = model.eval()
        self.loss_values = loss_values

        return self
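
# The Triangle matrix built above with np.tri is just a lower-triangular matrix
# of ones; multiplying by it accumulates per-bucket quantities (a cumulative sum),
# which is presumably how it enters the MTLR loss. Quick check with a small size:
import numpy as np

num_times = 3
Triangle = np.tri(num_times, num_times + 1, dtype=np.float32)
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]]

phi = np.array([0.2, 0.5, 0.3, 0.1], dtype=np.float32)
print(Triangle.dot(phi))   # partial sums over the buckets: ~[0.2, 0.7, 1.0]
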
Example #28
0
def brier_score(model, X, T, E, t_max=None, use_mean_point=True, **kwargs):
    """ 
    Computing the Brier score at all times t such that t <= t_max;
    it represents the average squared distances between 
    the observed survival status and the predicted
    survival probability.

    In the case of right censoring, it is necessary to adjust
    the score by weighting the squared distances to 
    avoid bias. It can be achieved by using 
    the inverse probability of censoring weights method (IPCW),
    (proposed by Graf et al. 1999; Gerds and Schumacher 2006)
    by using the estimator of the conditional survival function
    of the censoring times calculated using the Kaplan-Meier method,
    such that :
    BS(t) = 1/N*( W_1(t)*(Y_1(t) - S_1(t))^2 + ... + 
                  W_N(t)*(Y_N(t) - S_N(t))^2)

    In terms of benchmarks, a useful model will have a Brier score below 
    0.25. Indeed, it is easy to see that if S(t, x_i) = 0.5 for all i in [1, N], 
    then BS(t) = 0.25.

    Parameters:
    -----------
    * model : Pysurvival object
        Pysurvival model

    * X : array-like, shape=(n_samples, n_features)
        The input samples.

    * T : array-like, shape = [n_samples] 
        The target values describing when the event of interest or censoring
        occurred

    * E : array-like, shape = [n_samples] 
        The Event indicator array such that E = 1. if the event occurred
        E = 0. if censoring occurred
    
    * t_max: float 
        Maximal time for estimating the prediction error curves. 
        If missing, the largest value of the response variable is used.

    Returns:
    --------
        * (times, brier_scores): tuple of arrays
            - times represents the time axis at which the brier scores were 
              computed
            - brier_scores represents the values of the brier scores
                    
    Example:
    --------


    """
    # Checking the format of the data 
    T, E = utils.check_data(T, E)

    # computing the Survival function
    Survival = model.predict_survival(X, None, **kwargs)

    # Extracting the time buckets
    times = model.times
    time_buckets = model.time_buckets

    # Ordering Survival, T and E in descending order according to T
    order = np.argsort(-T)
    Survival = Survival[order, :]
    T = T[order]
    E = E[order]

    if t_max is None or t_max <= 0.:
        t_max = max(T)

    # Calculating the brier scores at each t <= t_max
    results = _brier_score(Survival, T, E, t_max, times, time_buckets,
        use_mean_point)
    times = results[0] 
    brier_scores = results[1] 

    return (times, brier_scores)
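
# Minimal usage sketch for brier_score, assuming it is exposed alongside
# concordance_index in pysurvival.utils.metrics and that a fitted model and test
# split exist as in the CoxPH example earlier in this listing.
from matplotlib import pyplot as plt
from pysurvival.utils.metrics import brier_score

times, scores = brier_score(coxph, X_test, T_test, E_test, t_max=10)

plt.plot(times, scores)
plt.axhline(0.25, ls='--', color='grey')   # the 0.25 benchmark mentioned above
plt.xlabel('time')
plt.ylabel('Brier score')
plt.show()
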