예제 #1
0
    def predict(self,
                neighbor_idxs,
                Survival_train,
                Censored_train,
                Survival_test=None,
                Censored_test=None,
                K=15,
                Method='non-cumulative'):
        """
        Predict testing set using 'prototype' (i.e. training) set using KNN.

        neighbor_idxs  - indices of nearest neighbors; (N_test, N_train).
                         Only the first K columns are used.
        Survival_train - training sample time-to-event; (N,) np array
        Censored_train - training sample censorship status; (N,) np array
        Survival_test  - optional testing time-to-event; when given, a
                         c-index is computed for the predictions
        Censored_test  - testing censorship status; required whenever
                         Survival_test is given
        K              - number of nearest-neighbours to use, int
        Method         - 'non-cumulative' or 'cumulative' probability

        Returns (T_test, CI): T_test is an (N_test,) array with one
        prediction per testing sample, CI is the value returned by
        sUtils.c_index (0 when Survival_test is not given).

        Raises ValueError when Method is neither 'cumulative' nor
        'non-cumulative'.
        """

        # Keep only desired K
        neighbor_idxs = neighbor_idxs[:, 0:K]

        # Initialize
        N_test = neighbor_idxs.shape[0]
        # The slice above silently yields fewer than K columns when fewer
        # neighbors are available, so count what was actually kept.
        n_neighbors = neighbor_idxs.shape[1]
        T_test = np.zeros([N_test])

        if Method == 'non-cumulative':

            # Convert outcomes to "alive status" at each time point
            alive_train = sUtils.getAliveStatus(Survival_train, Censored_train)

            # Get survival prediction for each patient
            for idx in range(N_test):

                # Indexing with an index array copies, so the in-place
                # edit below never touches alive_train itself.
                status = alive_train[neighbor_idxs[idx, :], :]

                # Entries < 0 are treated as "status unknown": count the
                # known ones per time point, then zero out the unknown ones
                # so they do not contribute to the sum.
                totalKnown = np.sum(status >= 0, axis=0)
                status[status < 0] = 0

                # remove timepoints where there are no known statuses
                status = status[:, totalKnown != 0]
                totalKnown = totalKnown[totalKnown != 0]

                # get "average" predicted survival time
                status = np.sum(status, axis=0) / totalKnown

                # now get overall time prediction
                T_test[idx] = np.sum(status)

        elif Method == 'cumulative':

            for idx in range(N_test):

                # Get at-risk groups for each time point for nearest neighbors
                nearest = neighbor_idxs[idx, :]
                T = Survival_train[nearest]
                O = 1 - Censored_train[nearest]
                T, O, at_risk, _ = sUtils.calc_at_risk(T, O)

                # Size of each at-risk group, relative to the neighbors that
                # were actually retrieved. Using K here inflated the counts
                # whenever fewer than K neighbors exist.
                # NOTE(review): assumes at_risk holds start offsets into the
                # time-sorted neighbors -- confirm in sUtils.calc_at_risk.
                N_at_risk = n_neighbors - at_risk

                # Calculate cumulative probability of survival
                P = np.cumprod((N_at_risk - O) / N_at_risk)

                # now get overall time prediction
                T_test[idx] = np.sum(P)

        else:
            raise ValueError(
                "Method is either 'cumulative' or 'non-cumulative'.")

        # Get c-index
        #======================================================================
        CI = 0
        if Survival_test is not None:
            assert Censored_test is not None, \
                "Censored_test is required when Survival_test is given."
            CI = sUtils.c_index(T_test,
                                Survival_test,
                                Censored_test,
                                prediction_type='survival_time')

        return T_test, CI
예제 #2
0
    def post_nca_bagging(self, X_test, X_train,
                         Survival_train,
                         Censored_train,
                         Survival_test=None,
                         Censored_test=None,
                         min_n_feats=10,
                         n_subspaces=20,
                         K=30,
                         Method="cumulative-time",
                         norm=2):

        """
        Get accuracy using bagged subspaces KNN approach
        following NCA and sorting features by absolute weight.

        Each subspace keeps the leading `c` columns of X_test / X_train,
        where the cutoff `c` is drawn without replacement from
        [min_n_feats, n_features). Per-subspace predictions are combined
        with a median.

        Args:
        ------
        X_test, X_train - training and testing set
                          IMPORTANT: Must be NCA-transformed
                          first and columns sorted by absolute
                          feature weight
        Survival_train, Censored_train - passed through to self.predict
        Survival_test, Censored_test - optional; when Survival_test is
                          given, Censored_test is required and a c-index
                          is computed for the aggregated prediction

        n_subspaces - no of subspaces to use; capped at the number of
                      available cutoffs (n_features - min_n_feats)
        min_n_feats - minimum no of features to use; lowered to
                      n_features - 1 when it would leave no cutoff
        K, Method   - passed through to self.predict
        norm        - passed through to self._get_neighbor_idxs

        Returns:
        ------
        (t_test, Ci) - median prediction per testing sample and the value
                       returned by sUtils.c_index (0 when Survival_test is
                       not given)

        Raises:
        ------
        ValueError - if n_subspaces is smaller than 1
        """

        if n_subspaces < 1:
            raise ValueError("n_subspaces must be at least 1.")

        if Method == "cumulative-hazard":
            prediction_type = "risk"
        else:
            prediction_type = "survival_time"

        # sanity checks
        n_feats = X_test.shape[1]
        # `>=` rather than `>`: min_n_feats == n_feats would leave the
        # cutoff range below empty and the median undefined.
        if min_n_feats >= n_feats:
            min_n_feats = n_feats - 1

        # Draw the cutoffs first and size the prediction table from what was
        # actually drawn. The count used to be `n_subspaces - min_n_feats`,
        # which mixes a subspace count with a feature count: it went negative
        # (np.zeros error) whenever n_subspaces < min_n_feats and otherwise
        # used fewer subspaces than requested.
        maxidxs = np.arange(min_n_feats, n_feats)
        np.random.shuffle(maxidxs)
        maxidxs = maxidxs[:n_subspaces]

        # initialize
        preds = np.zeros([X_test.shape[0], len(maxidxs)])

        for subspace, fidx_max in enumerate(maxidxs):

            # Get neighbor indices using only the leading fidx_max columns
            neighbor_idxs = self._get_neighbor_idxs(
                X_test[:, 0:fidx_max],
                X_train[:, 0:fidx_max],
                norm=norm)

            # Predict testing set; the per-subspace c-index is not needed
            t_test, _ = self.predict(neighbor_idxs,
                                     Survival_train, Censored_train,
                                     K=K, Method=Method)

            preds[:, subspace] = t_test

        # Aggregate prediction
        t_test = np.median(preds, axis=1)

        # Get Ci if survival data available
        Ci = 0
        if Survival_test is not None:
            assert Censored_test is not None, \
                "Censored_test is required when Survival_test is given."
            Ci = sUtils.c_index(t_test, Survival_test, Censored_test,
                                prediction_type=prediction_type)
        return t_test, Ci
예제 #3
0
    def predict(self, neighbor_idxs,
                Survival_train, Censored_train,
                Survival_test=None, Censored_test=None,
                K=30, Method="cumulative-time"):
        """
        KNN survival prediction of the testing set from the 'prototype'
        (i.e. training) set.

        neighbor_idxs  - indices of nearest neighbors; (N_test, N_train).
                         Only the first K columns are used.
        Survival_train - training sample time-to-event; (N,) np array
        Censored_train - training sample censorship status; (N,) np array
        Survival_test  - optional testing time-to-event; when given, a
                         c-index is computed for the predictions
        Censored_test  - testing censorship status; required whenever
                         Survival_test is given
        K              - number of nearest-neighbours to use, int
        Method         - 'non-cumulative', 'cumulative-time' or
                         'cumulative-hazard'

        Returns (T_test, Ci): one prediction per testing sample and the
        value returned by sUtils.c_index (0 when Survival_test is None).
        For 'cumulative-hazard' the prediction is scored as a risk,
        otherwise as a survival time.

        Raises ValueError for any other Method.
        """
        nearest = neighbor_idxs[:, 0:K]
        n_samples = nearest.shape[0]
        T_test = np.zeros([n_samples])
        hazard_mode = Method == "cumulative-hazard"

        if Method == 'non-cumulative':

            # "alive status" of every training sample at each time point
            alive = sUtils.getAliveStatus(Survival_train, Censored_train)

            for row in range(n_samples):
                # index-array lookup copies, so `alive` stays untouched
                votes = alive[nearest[row, :], :]

                # entries < 0 count as unknown: tally the known ones, then
                # zero the unknown ones so they add nothing to the sum
                n_known = np.sum(votes >= 0, axis=0)
                votes[votes < 0] = 0

                # drop time points without a single known status
                # (i.e. after the last neighbor dies or gets censored)
                has_known = n_known != 0
                alive_fraction = (np.sum(votes[:, has_known], axis=0)
                                  / n_known[has_known])

                # summing the per-time-point fractions gives the overall
                # time prediction
                T_test[row] = np.sum(alive_fraction)

        elif hazard_mode or Method == 'cumulative-time':

            for row in range(n_samples):
                picks = nearest[row, :]
                times = Survival_train[picks]
                censored = Censored_train[picks]

                if censored.min() == 1:
                    # every neighbor is censored: no curve is estimated
                    T_test[row] = 0 if hazard_mode else times.max()
                    continue

                # _na_estimator feeds the cumulative-hazard integral,
                # _km_estimator the mean survival time
                estimate = (self._na_estimator if hazard_mode
                            else self._km_estimator)
                t, f = estimate(times, censored)

                # area under the step curve
                T_test[row] = np.sum(np.diff(t) * f[:-1])

        else:
            raise ValueError("Method not implemented.")

        # Without testing outcomes there is nothing to score
        if Survival_test is None:
            return T_test, 0

        assert (Censored_test is not None)
        Ci = sUtils.c_index(
            T_test, Survival_test, Censored_test,
            prediction_type="risk" if hazard_mode else "survival_time")

        return T_test, Ci
예제 #4
0
    def predict_with_bagging(self, X_test, X_train,
                             Survival_train,
                             Censored_train,
                             Survival_test=None,
                             Censored_test=None,
                             n_bags=50,
                             feats_per_bag=None,
                             K=30,
                             Method="cumulative-time",
                             norm=2):

        """
        Predict survival with random subspace bagging.

        Each bag uses a random subset of feature columns; the per-bag
        predictions from self.predict are combined with a median.

        Args:
        ------
        X_test, X_train - testing and training feature matrices
        Survival_train, Censored_train - passed through to self.predict
        Survival_test, Censored_test - optional; when Survival_test is
                          given, Censored_test is required and a c-index
                          is computed for the aggregated prediction
        n_bags        - number of random subspaces, must be >= 1
        feats_per_bag - integer number of features per bag. None, or a
                        value outside [1, n_features], selects the default
                        of 75% of the features (at least one).
        K, Method     - passed through to self.predict
        norm          - passed through to self._get_neighbor_idxs

        Returns:
        ------
        (t_test, Ci) - median prediction per testing sample and the value
                       returned by sUtils.c_index (0 when Survival_test is
                       not given)

        Raises:
        ------
        ValueError - if n_bags is smaller than 1
        """

        if n_bags < 1:
            # zero bags would make the median below all-NaN
            raise ValueError("n_bags must be at least 1.")

        if Method == "cumulative-hazard":
            prediction_type = "risk"
        else:
            prediction_type = "survival_time"

        #
        # sanity checks and defaults
        #

        n_feats = X_test.shape[1]
        assign_defaults = feats_per_bag is None

        if not assign_defaults:
            assert isinstance(feats_per_bag, (int, np.integer)), \
                "feats_per_bag must be an integer."
            # out-of-range requests fall back to the default
            assign_defaults = not (1 <= feats_per_bag <= n_feats)

        if assign_defaults:
            # never drop to zero features (0.75 * 1 truncates to 0)
            feats_per_bag = max(1, int(0.75 * n_feats))

        #
        # initialize
        #

        preds = np.zeros([X_test.shape[0], n_bags])

        # Draw every feature subset up front. The same array is reshuffled
        # in place each round, so each bag needs its own copy of the prefix.
        feature_order = np.arange(X_train.shape[1])
        idx_shuffles = []
        for _ in range(n_bags):
            np.random.shuffle(feature_order)
            idx_shuffles.append(feature_order[:feats_per_bag].copy())

        #
        # predict using random subspaces
        #

        for bag, idxs in enumerate(idx_shuffles):

            # Get neighbor indices
            neighbor_idxs = self._get_neighbor_idxs(
                X_test[:, idxs],
                X_train[:, idxs],
                norm=norm)

            # Predict testing set; the per-bag c-index is not needed
            t_test, _ = self.predict(neighbor_idxs,
                                     Survival_train, Censored_train,
                                     K=K, Method=Method)

            preds[:, bag] = t_test

        # Aggregate prediction
        t_test = np.median(preds, axis=1)

        # Get Ci if survival data available
        Ci = 0
        if Survival_test is not None:
            assert Censored_test is not None, \
                "Censored_test is required when Survival_test is given."
            Ci = sUtils.c_index(t_test, Survival_test, Censored_test,
                                prediction_type=prediction_type)
        return t_test, Ci
예제 #5
0
            foldidx_val = [
                'fold_{}_'.format(fold + 1) in j for j in val_files
            ].index(True)
            foldidx_test = [
                'fold_{}_'.format(fold + 1) in j for j in test_files
            ].index(True)
            preds_val = read_table(pred_path + val_files[foldidx_val], sep=' ')
            preds_test = read_table(pred_path + test_files[foldidx_test],
                                    sep=' ')

            # Get validation set accuracy
            ci_val = []
            for hyperpars in range(preds_val.shape[1]):
                ci_val.append(
                    sUtils.c_index(preds_val.values[:, hyperpars],
                                   Survival[splitIdxs['valid'][fold]],
                                   Censored[splitIdxs['valid'][fold]],
                                   prediction_type='risk'))

            # Get testing set accuracy for optimal hyperparams
            ci_test.append(
                sUtils.c_index(preds_test.values[:, np.argmax(ci_val)],
                               Survival[splitIdxs['test'][fold]],
                               Censored[splitIdxs['test'][fold]],
                               prediction_type='risk'))

        # append summary stats
        ci_test.extend([np.median(ci_test), np.mean(ci_test), \
                        np.percentile(ci_test, 25), np.percentile(ci_test, 75), \
                        np.std(ci_test)])

        # append to final results table