import lifelines
import patsy


def psurvival(row, phenotype_df, duration_col='T', event_col='E', other_cols=[]):
    """
    duration_col: survival time
    event_col: whether an event (death or other) has ocured or not. 0 for no, 1 for yes
    other_cols: other variables to consider in the regression
    """
    # phenotype_df = phenotype_df.T
    phenotype_df = phenotype_df.join(row.astype(float))
    phenotype_df[duration_col] = phenotype_df[duration_col].astype(float)
    phenotype_df[event_col] = phenotype_df[event_col].astype(int)

    # The following lines deal with char conflicts in patsy formulas
    duration_col = duration_col.replace(' ','_').replace('.','_').replace('-','_')
    event_col = event_col.replace(' ','_').replace('.','_').replace('-','_')
    other_cols = [x.replace(' ','_').replace('.','_').replace('-','_') for x in other_cols]
    row.name = row.name.replace(' ','_').replace('.','_').replace('-','_')
    phenotype_df.columns = [x.replace(' ','_').replace('.','_').replace('-','_') for x in phenotype_df.columns]

    formula = row.name + ' + ' + duration_col + ' + ' + event_col
    if other_cols:
        formula = formula + ' + ' + ' + '.join(other_cols)
    X = patsy.dmatrix(formula_like = formula, data = phenotype_df, return_type = 'dataframe')
    X = X.drop(['Intercept'], axis = 1)
    cph = lifelines.CoxPHFitter()
    cph.fit(X, duration_col = duration_col, event_col = event_col)
    result = cph.summary.loc[row.name]
    return result
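
A minimal usage sketch for psurvival, assuming a hypothetical genes-by-samples expression matrix and a clinical table that share the same sample index (all names and data below are made up for illustration):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
samples = ['s%d' % i for i in range(30)]

expression_df = pd.DataFrame(rng.normal(size=(3, 30)),
                             index=['gene_a', 'gene_b', 'gene_c'],
                             columns=samples)
clinical_df = pd.DataFrame({'T': rng.exponential(12, 30),
                            'E': rng.integers(0, 2, 30),
                            'age': rng.normal(60, 10, 30)},
                           index=samples)

# One Cox regression per gene, adjusted for age; each result row is that gene's summary line
results = expression_df.apply(psurvival, axis=1,
                              phenotype_df=clinical_df, other_cols=['age'])
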
Example #2
import pandas as pd
from sklearn.model_selection import KFold


def _cv_coxph_c(
    z,
    survival,
    penalty,
    duration_column="duration",
    observed_column="observed",
    cv_folds=5,
):
    try:
        import lifelines
        import lifelines.utils
    except ImportError:
        raise ImportError(
            "The module ``lifelines`` was not found. It is required for this functionality. You may install it using `pip install lifelines`."
        )

    cph = lifelines.CoxPHFitter(penalizer=penalty)
    survdf = pd.concat([survival, z], axis=1, sort=False).dropna()

    kfold = KFold(cv_folds)
    scores = list()

    for train_index, test_index in kfold.split(survdf):
        x_train, x_test = survdf.iloc[train_index], survdf.iloc[test_index]

        cph.fit(x_train, duration_column, observed_column)
        cindex = lifelines.utils.concordance_index(
            x_test[duration_column],
            -cph.predict_partial_hazard(x_test),
            x_test[observed_column],
        )
        scores.append(cindex)

    return scores
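
A rough usage sketch for _cv_coxph_c, assuming z holds latent factors and survival uses the default 'duration'/'observed' column names (synthetic data, hypothetical names):

import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
idx = ['p%d' % i for i in range(50)]

z = pd.DataFrame(rng.normal(size=(50, 4)), index=idx,
                 columns=['LF1', 'LF2', 'LF3', 'LF4'])
survival = pd.DataFrame({'duration': rng.exponential(24, 50),
                         'observed': rng.integers(0, 2, 50)}, index=idx)

scores = _cv_coxph_c(z, survival, penalty=0.1)
print(np.mean(scores))  # mean out-of-fold concordance index
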
	def train(self, trainMatrix):
		# Center the feature columns (columns 3 onward) on their training-set mean
		self._trainFMean = trainMatrix[:, 3:].mean(0)
		# Survival frame: trainMatrix columns 1-2 followed by the centered features
		sur_matrix = np.concatenate((trainMatrix[:, 1:3], trainMatrix[:, 3:] - self._trainFMean), axis=1)
		data = pd.DataFrame(sur_matrix)
		self._cf = ll.CoxPHFitter()
		# Integer column labels: column 1 holds the durations, column 0 the event indicator
		self._cf.fit(data, 1, event_col=0)
		# Store (event time, cumulative baseline hazard) pairs for later use
		bh = self._cf.baseline_hazard_
		self._bh = np.zeros((np.shape(bh)[0], 2))
		self._bh[:, 0] = np.asarray(list(bh.index))
		self._bh[:, 1] = np.cumsum(np.asarray(bh))
Example #4
def _cph_coefs(z, survival, duration_column, observed_column, penalizer=0):
    """Compute one CPH model for each latent factor (column) in z.
    Return summaries (beta values, p values, confidence intervals)
    """
    try:
        import lifelines
    except ImportError:
        raise ImportError('The module ``lifelines`` was not found. It is required for this functionality. You may install it using `pip install lifelines`.')
    return pd.concat([
        lifelines.CoxPHFitter(penalizer=penalizer).fit(survival.assign(LF=z.loc[:,i]).dropna(),
            duration_column, observed_column).summary.loc['LF'].rename(i)
        for i in z.columns], axis=1)
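
A small sketch of calling _cph_coefs on synthetic latent factors; the row labels of the result are the lifelines summary fields (hypothetical data):

import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
z = pd.DataFrame(rng.normal(size=(60, 3)), columns=['LF1', 'LF2', 'LF3'])
survival = pd.DataFrame({'duration': rng.exponential(24, 60),
                         'observed': rng.integers(0, 2, 60)})

coefs = _cph_coefs(z, survival, 'duration', 'observed', penalizer=0.1)
print(coefs.loc['coef'])  # one Cox coefficient per latent factor
print(coefs.loc['p'])     # corresponding p-values
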
Example #5
def fit_model():
    df = get_df(inpath, filename)
    # dropping columns for reference categories
    df.drop(['Concrete', 'Urban'], axis=1, inplace=True)
    coxph = ll.CoxPHFitter()
    model = coxph.fit(df,
                      duration_col='duration',
                      event_col='degraded_obs',
                      cluster_col='id',
                      show_progress=True)
    pprint(model.summary)
    return model
Example #6
def _cv_coxph_c(z, survival, penalty,
    duration_column='duration', observed_column='observed', cv_folds=5):
    try:
        import lifelines
        import lifelines.utils
    except ImportError:
        raise ImportError('The module ``lifelines`` was not found. It is required for this functionality. You may install it using `pip install lifelines`.')

    cph = lifelines.CoxPHFitter(penalizer=penalty)
    survdf = pd.concat([survival, z], axis=1, sort=False).dropna()

    scores = lifelines.utils.k_fold_cross_validation(cph,
        survdf, duration_column, event_col=observed_column, k=cv_folds)
    return scores
Example #7
def coxph_model(formula, data, time_col, event_col, **kwargs):
    # pylint: disable=no-member
    # pylint gets confused by dmatrix
    sdata = patsy.dmatrix(formula, data=data, return_type="dataframe").join(
        data[[time_col, event_col]])
    sdata = sdata.loc[:, sdata.columns != "Intercept"]
    if "penalizer" not in kwargs:
        kwargs["penalizer"] = 0.1
    if "normalize" not in kwargs:
        # NOTE: 'normalize' is only accepted by older lifelines releases
        kwargs["normalize"] = False
    cf = ll.CoxPHFitter(**kwargs)
    cf.fit(sdata, time_col, event_col)
    cf.print_summary()
    return cf
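
As a rough illustration, coxph_model can be exercised on the Rossi recidivism dataset bundled with lifelines; note that the snippet defaults to a normalize keyword that only older lifelines releases accept, so treat this as a sketch against that older API:

import lifelines as ll
import patsy
from lifelines.datasets import load_rossi

rossi = load_rossi()
# 'week' is the follow-up time, 'arrest' the event indicator
cf = coxph_model("fin + age + prio", rossi, time_col="week", event_col="arrest")
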
def survival_npcs(row, phenotype_df, duration_col = 'T', event_col = 'E', other_cols = []):
    """
    duration_col: survival time
    event_col: whether an event (death or other) has ocured or not. 0 for no, 1 for yes
    other_cols: other variables to consider in the regression
    """

    row.name = row.name.replace(' ','_').replace('.','_').replace('-','_')
    row_npcs = row
    columns_names = []
    formula = ''
    for n in range(len(row_npcs.iloc[0])):  # each entry of row is a sequence of PC values
        pc_name = row.name + '_pc' + str(n+1)
        columns_names.append(pc_name)
        formula = formula + pc_name + ' + '

    row_npcs = pd.DataFrame(row_npcs.tolist(), index = row_npcs.index)
    row_npcs.columns = columns_names

    # phenotype_df = phenotype_df.join(row.astype(float))
    phenotype_df = phenotype_df.join(row_npcs.astype(float))

    phenotype_df[duration_col] = phenotype_df[duration_col].astype(float)
    phenotype_df[event_col] = phenotype_df[event_col].astype(int)

    # The following lines deal with char conflicts in patsy formulas
    duration_col = duration_col.replace(' ','_').replace('.','_').replace('-','_')
    event_col = event_col.replace(' ','_').replace('.','_').replace('-','_')
    other_cols = [x.replace(' ','_').replace('.','_').replace('-','_') for x in other_cols]
    # row.name = row.name.replace(' ','_').replace('.','_').replace('-','_')
    phenotype_df.columns = [x.replace(' ','_').replace('.','_').replace('-','_') for x in phenotype_df.columns]

    # formula = row.name + ' + ' + duration_col + ' + ' + event_col
    formula = formula + duration_col + ' + ' + event_col
    if other_cols:
        formula = formula + ' + ' + ' + '.join(other_cols)

    X = patsy.dmatrix(formula_like = formula, data = phenotype_df, return_type = 'dataframe')
    X = X.drop(['Intercept'], axis = 1)
    cph = lifelines.CoxPHFitter()
    cph.fit(X, duration_col = duration_col, event_col = event_col)
    result = cph.summary.loc[columns_names]
    return result
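
A usage sketch for survival_npcs that makes the expected input shape explicit: each entry of row is a sequence of principal-component values for one sample (hypothetical names, synthetic data):

import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
samples = ['s%d' % i for i in range(40)]

# One list of two principal-component values per sample
row = pd.Series([list(rng.normal(size=2)) for _ in samples],
                index=samples, name='pathway_x')
clinical_df = pd.DataFrame({'T': rng.exponential(18, 40),
                            'E': rng.integers(0, 2, 40)}, index=samples)

result = survival_npcs(row, clinical_df)
print(result[['coef', 'p']])
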
Example #9
    def test_fit_kwargs(self):
        ipw = IPW(learner=LogisticRegression(max_iter=1000))
        weighted_standardized_survival = WeightedStandardizedSurvival(
            survival_model=lifelines.CoxPHFitter(), weight_model=ipw)

        # Without fit_kwargs - should raise StatisticalWarning with a suggestion to pass robust=True in fit
        with self.assertWarns(lifelines.exceptions.StatisticalWarning):
            weighted_standardized_survival.fit(self.X, self.a, self.t, self.y)

        # With fit_kwargs - should not raise StatisticalWarning (might raise other warnings, though)
        with self.assertRaises(
                AssertionError
        ):  # negation workaround since there's no assertNotWarns
            with self.assertWarns(lifelines.exceptions.StatisticalWarning):
                weighted_standardized_survival.fit(self.X,
                                                   self.a,
                                                   self.t,
                                                   self.y,
                                                   fit_kwargs={'robust': True})
Example #10
    def coxph(self, **kwargs):
        """
        CoxPH plot using the baidutongji all_source dataframe as input.
        :param kwargs:
        :return:
        """
        title = kwargs['title']
        path = kwargs['path']
        df_raw = self.data_frame
        # Keep only values that look numeric; everything else becomes NaN
        df_raw = df_raw.applymap(lambda x: x if re.search(
            r"[-+]?[0-9]*\.?[0-9]+", str(x)) else np.nan)
        if kwargs['exclude']:
            df = df_raw.drop(kwargs['exclude'], axis=1)
        else:
            df = df_raw
        df = df.dropna(how='any')
        fig, ax = plt.subplots()
        cph = lifelines.CoxPHFitter()
        # No event column given: every observation is treated as an observed event
        cph.fit(df, 'avg_visit_time')
        cph.plot(hazard_ratios=True, ax=ax)
        plt.title(title)
        plt.tight_layout()
        plt.savefig(path)
        plt.close('all')
Example #11
def get_hazard_ratios(df_test):
    cph = lifelines.CoxPHFitter()
    cph.fit(df_test, duration_col=TIME, event_col=OBSERVED)
    return cph.summary
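
TIME and OBSERVED are module-level constants in the original source; a stand-in sketch using the Rossi dataset bundled with lifelines (the constant values below are assumptions, not taken from that module):

import lifelines
from lifelines.datasets import load_rossi

TIME, OBSERVED = 'week', 'arrest'  # assumed values for the module constants
summary = get_hazard_ratios(load_rossi())
print(summary['exp(coef)'])        # hazard ratio per covariate
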
Example #12
    def test_cox(self):
        standardized_survival_cox = StandardizedSurvival(
            survival_model=lifelines.CoxPHFitter())
        standardized_survival_cox.fit(self.X, self.a, self.t, self.y)
        _ = standardized_survival_cox.estimate_population_outcome(
            self.X, self.a, self.t, self.y)
def plot_single_SVR(prediction,
                    mutation_data,
                    label_type,
                    survival=False,
                    show_plots=False,
                    alpha=0.95):
    if type(prediction) is not pd.core.frame.DataFrame:
        if os.path.isfile(prediction):
            prediction = pd.read_hdf(prediction)

    keys = prediction.keys()
    SVRs = list()
    label = keys[0]
    SVRs = prediction[label]['classifiers']

    Y_test = prediction[label]['Y_test']
    X_test = prediction[label]['X_test']
    Y_train = prediction[label]['X_train']

    if survival:
        # Also extract time to event and if event occurs from mutation data
        labels = [[label_type], ['E'], ['T']]
    else:
        labels = [[label_type]]

    if type(mutation_data) is not dict:
        if os.path.isfile(mutation_data):
            mutation_data = gp.load_mutation_status(mutation_data, labels)

    patient_IDs = mutation_data['patient_IDs']
    mutation_label = mutation_data['mutation_label']

    # Initialize scoring metrics
    r2score = list()
    MSE = list()
    coefICC = list()
    PearsonC = list()
    PearsonP = list()
    SpearmanC = list()
    SpearmanP = list()

    if survival:
        cindex = list()
        coxp = list()
        coxcoef = list()

    patient_MSE = dict()

    for i in range(0, len(Y_test)):
        test_patient_IDs = prediction[label]['patient_ID_test'][i]

        # FIXME: Put some wrong patient IDs in test files
        for num in range(0, len(test_patient_IDs)):
            if 'features_' in test_patient_IDs[num]:
                test_patient_IDs[num] = test_patient_IDs[num][9::]

            if '__tpl.hdf5' in test_patient_IDs[num]:
                test_patient_IDs[num] = test_patient_IDs[num][0:-10]

        test_patient_IDs = np.asarray(test_patient_IDs)

        X_temp = X_test[i]

        test_indices = list()
        for i_ID in test_patient_IDs:
            # FIXME: Error in specific study
            if i_ID == '112_recurrence-preop':
                i_ID = '112_recurrence_preop'
            test_indices.append(np.where(patient_IDs == i_ID)[0][0])

        y_truth = [mutation_label[0][k][0] for k in test_indices]

        if type(SVRs) == list or type(SVRs) == tuple:
            estimator = SVRs[i]
        else:
            estimator = SVRs

        scaler = estimator.best_scaler
        try:
            y_prediction = estimator.predict(scaler.transform(X_temp))
        except ValueError:
            y_prediction = estimator.predict(X_temp)

        y_truth = np.asarray(y_truth)

        # if survival:
        #     # Normalize the scores
        #     y_prediction = np.subtract(1.01, np.divide(y_prediction, np.max(y_prediction)))

        print "Truth: ", y_truth
        print "Prediction: ", y_prediction

        # Compute error per patient
        for i_truth, i_predict, i_test_ID in zip(y_truth, y_prediction,
                                                 test_patient_IDs):
            if i_test_ID not in patient_MSE.keys():
                patient_MSE[i_test_ID] = list()
            patient_MSE[i_test_ID].append((i_truth - i_predict)**2)

        # Compute evaluation metrics
        r2score.append(r2_score(y_truth, y_prediction))
        MSE.append(mean_squared_error(y_truth, y_prediction))
        coefICC.append(ICC(np.column_stack((y_prediction, y_truth))))
        C = pearsonr(y_prediction, y_truth)
        PearsonC.append(C[0])
        PearsonP.append(C[1])
        C = spearmanr(y_prediction, y_truth)
        SpearmanC.append(C.correlation)
        SpearmanP.append(C.pvalue)

        if survival:
            # Extract time to event and event from label data
            E_truth = np.asarray(
                [mutation_label[1][k][0] for k in test_indices])
            T_truth = np.asarray(
                [mutation_label[2][k][0] for k in test_indices])

            # Concordance index
            cindex.append(
                1 - ll.utils.concordance_index(T_truth, y_prediction, E_truth))

            # Fit Cox model using SVR output, time to event and event
            data = {'predict': y_prediction, 'E': E_truth, 'T': T_truth}
            data = pd.DataFrame(data=data, index=test_patient_IDs)

            cph = ll.CoxPHFitter()
            cph.fit(data, duration_col='T', event_col='E')

            coxcoef.append(cph.summary['coef']['predict'])
            coxp.append(cph.summary['p']['predict'])

    # Compute confidence intervals for given metrics
    N_1 = float(len(Y_train[0]))
    N_2 = float(len(Y_test[0]))

    if len(r2score) == 1:
        # No confidence intervals, just take the scores
        stats = dict()
        stats["r2_score:"] = str(r2score[0])
        stats["MSE:"] = str(MSE[0])
        stats["ICC:"] = str(coefICC[0])
        stats["PearsonC:"] = str(PearsonC[0])
        stats["SpearmanC: "] = str(SpearmanC[0])
        stats["PearsonP:"] = str(PearsonP[0])
        stats["SpearmanP: "] = str(SpearmanP[0])

        if survival:
            stats["Concordance:"] = str(cindex[0])
            stats["Cox coef.:"] = str(coxcoef[0])
            stats["Cox p:"] = str(coxp[0])
    else:
        # Compute confidence intervals from cross validations
        stats = dict()
        stats["r2_score 95%:"] = str(
            compute_CI.compute_confidence(r2score, N_1, N_2, alpha))
        stats["MSE 95%:"] = str(
            compute_CI.compute_confidence(MSE, N_1, N_2, alpha))
        stats["ICC 95%:"] = str(
            compute_CI.compute_confidence(coefICC, N_1, N_2, alpha))
        stats["PearsonC 95%:"] = str(
            compute_CI.compute_confidence(PearsonC, N_1, N_2, alpha))
        stats["SpearmanC 95%: "] = str(
            compute_CI.compute_confidence(SpearmanC, N_1, N_2, alpha))
        stats["PearsonP 95%:"] = str(
            compute_CI.compute_confidence(PearsonP, N_1, N_2, alpha))
        stats["SpearmanP 95%: "] = str(
            compute_CI.compute_confidence(SpearmanP, N_1, N_2, alpha))

        if survival:
            stats["Concordance 95%:"] = str(
                compute_CI.compute_confidence(cindex, N_1, N_2, alpha))
            stats["Cox coef. 95%:"] = str(
                compute_CI.compute_confidence(coxcoef, N_1, N_2, alpha))
            stats["Cox p 95%:"] = str(
                compute_CI.compute_confidence(coxp, N_1, N_2, alpha))

    for k, v in stats.iteritems():
        print k, v

    # Calculate and sort individual patient MSE
    patient_MSE = {k: np.mean(v) for k, v in patient_MSE.iteritems()}
    order = np.argsort(patient_MSE.values())
    sortedkeys = np.asarray(patient_MSE.keys())[order].tolist()
    sortedvalues = np.asarray(patient_MSE.values())[order].tolist()
    patient_MSE = [(k, v) for k, v in zip(sortedkeys, sortedvalues)]

    for p in patient_MSE:
        print p[0], p[1]

    stats["Patient_MSE"] = patient_MSE

    if show_plots:
        # TODO: Plot metrics, see also plot_SVM
        pass

    return stats
def readcrossval(feat_m1,
                 config,
                 sinkfolder,
                 patientinfo,
                 outputfolder,
                 feat_m2=None,
                 feat_m3=None,
                 alpha=0.95,
                 label_type=None,
                 survival=False,
                 n_classifiers=[1, 5, 10]):
    # n_classifiers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20 ,25, 30, 40 , 50]
    n_classifiers = [1]
    config = config_io.load_config(config)
    sinks = glob.glob(sinkfolder + 'RS*.hdf5')

    # Sort sinks based on creation date
    sinktimes = [os.path.getmtime(f) for f in sinks]
    sinks = [s for _, s in sorted(zip(sinktimes, sinks))]

    if label_type is None:
        label_type = config['Genetics']['mutation_type']

    if survival:
        # Also extract time to event and if event occurs from mutation data
        labels = [label_type, ['E'], ['T']]
    else:
        labels = [[label_type]]

    if feat_m1:
        label_data, _ =\
            readdata(feat_m1, feat_m2, feat_m3, patientinfo,
                     labels)
    else:
        # No feature files found
        label_data, _ = findmutationdata(patientinfo, labels)

    for n_class in n_classifiers:
        output_json = os.path.join(
            outputfolder, ('performance_{}.json').format(str(n_class)))

        sensitivity = list()
        specificity = list()
        precision = list()
        accuracy = list()
        auc = list()
        # auc_train = list()
        f1_score_list = list()

        patient_classification_list = dict()

        patient_IDs = label_data['patient_IDs']
        mutation_label = label_data['mutation_label']

        trained_classifiers = list()

        y_score = list()
        y_test = list()
        pid_test = list()
        y_predict = list()

        # For SVR
        r2score = list()
        MSE = list()
        coefICC = list()
        PearsonC = list()
        PearsonP = list()
        SpearmanC = list()
        SpearmanP = list()

        if survival:
            cindex = list()
            coxp = list()
            coxcoef = list()

        patient_MSE = dict()

        csvfile = os.path.join(outputfolder, 'scores.csv')
        towrite = list()

        empty_scores = {k: '' for k in natsort.natsorted(patient_IDs)}
        empty_scores = collections.OrderedDict(sorted(empty_scores.items()))
        towrite.append(["Patient"] + empty_scores.keys())
        params = dict()
        for num, s in enumerate(sinks):
            scores = empty_scores.copy()
            print("Processing {} / {}.").format(str(num + 1), str(len(sinks)))
            with open(s, 'r') as fp:
                sr = pd.read_hdf(fp)
            sr = sr['Constructed crossvalidation']
            t = sr.trained_classifier
            trained_classifiers.append(sr.trained_classifier)

            # Extract test info
            test_patient_IDs = sr.patient_ID_test
            X_test = sr.X_test
            Y_test = sr.Y_test

            # Extract sample size
            N_1 = float(len(sr.patient_ID_train))
            N_2 = float(len(sr.patient_ID_test))

            test_indices = list()
            for i_ID in test_patient_IDs:
                test_indices.append(np.where(patient_IDs == i_ID)[0][0])

                if i_ID not in patient_classification_list:
                    patient_classification_list[i_ID] = dict()
                    patient_classification_list[i_ID]['N_test'] = 0
                    patient_classification_list[i_ID]['N_correct'] = 0
                    patient_classification_list[i_ID]['N_wrong'] = 0

                patient_classification_list[i_ID]['N_test'] += 1

            # y_truth = [mutation_label[0][k] for k in test_indices]
            # FIXME: order can be switched, need to find a smart fix
            # 1 for normal, 0 for KM
            y_truth = [mutation_label[0][k][0] for k in test_indices]

            # Predict using the top N classifiers
            results = t.cv_results_['rank_test_score']
            indices = range(0, len(results))
            sortedindices = [x for _, x in sorted(zip(results, indices))]
            sortedindices = sortedindices[0:n_class]
            y_prediction = np.zeros([n_class, len(y_truth)])
            y_score = np.zeros([n_class, len(y_truth)])

            # Get some base objects required
            feature_labels = pd.read_hdf(feat_m1[0]).feature_labels
            base_estimator = t.estimator
            X_train = [(x, feature_labels) for x in sr.X_train]
            y_train = sr.Y_train
            y_train_prediction = np.zeros([n_class, len(y_train)])
            scorer = t.scorer_
            train = np.asarray(range(0, len(y_train)))
            test = train
            del sr  # Save some memory
            # cv_iter = list(t.cv.iter(X_train, y_train))

            # NOTE: need to build this in the SearchCVFastr Object
            for i, index in enumerate(sortedindices):
                print("Processing number {} of {} classifiers.").format(
                    str(i + 1), str(n_class))
                X_testtemp = X_test[:]

                # Get the parameters from the index
                parameters_est = t.cv_results_['params'][index]
                parameters_all = t.cv_results_['params_all'][index]

                # NOTE: kernel parameter can be unicode
                kernel = str(parameters_est[u'kernel'])
                del parameters_est[u'kernel']
                del parameters_all[u'kernel']
                parameters_est['kernel'] = kernel
                parameters_all['kernel'] = kernel

                # Refit a classifier using the settings given
                print("Refitting classifier with best settings.")
                best_estimator = clone(base_estimator).set_params(
                    **parameters_est)

                ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler =\
                    fit_and_score(best_estimator, X_train, y_train, scorer,
                                  train, test, True, parameters_all,
                                  t.fit_params,
                                  t.return_train_score,
                                  True, True, True,
                                  t.error_score)

                X = [x[0] for x in X_train]
                if GroupSel is not None:
                    X = GroupSel.transform(X)
                    X_testtemp = GroupSel.transform(X_testtemp)

                if SelectModel is not None:
                    X = SelectModel.transform(X)
                    X_testtemp = SelectModel.transform(X_testtemp)

                if VarSel is not None:
                    X = VarSel.transform(X)
                    X_testtemp = VarSel.transform(X_testtemp)

                if scaler is not None:
                    X = scaler.transform(X)
                    X_testtemp = scaler.transform(X_testtemp)

                if y_train is not None:
                    best_estimator.fit(X, y_train, **t.fit_params)
                else:
                    best_estimator.fit(X, **t.fit_params)

                # Predict the posteriors using the fitted classifier for the training set
                print("Evaluating performance on training set.")
                if hasattr(best_estimator, 'predict_proba'):
                    probabilities = best_estimator.predict_proba(X)
                    y_train_prediction[i, :] = probabilities[:, 1]
                else:
                    # Regression has no probabilities
                    probabilities = best_estimator.predict(X)
                    y_train_prediction[i, :] = probabilities[:]

                # Predict the posteriors using the fitted classifier for the test set
                print("Evaluating performance on test set.")
                if hasattr(best_estimator, 'predict_proba'):
                    probabilities = best_estimator.predict_proba(X_testtemp)
                    y_prediction[i, :] = probabilities[:, 1]
                else:
                    # Regression has no probabilities
                    probabilities = best_estimator.predict(X_testtemp)
                    y_prediction[i, :] = probabilities[:]

                if type(t.estimator) == sklearn.svm.classes.SVC:
                    y_score[i, :] = best_estimator.decision_function(
                        X_testtemp)
                else:
                    y_score[i, :] = best_estimator.decision_function(
                        X_testtemp)[:, 0]

                # Add number parameter settings
                for k in parameters_all.keys():
                    if k not in params.keys():
                        params[k] = list()
                    params[k].append(parameters_all[k])

                # Save some memory
                del best_estimator, X, X_testtemp, ret, GroupSel, VarSel, SelectModel, scaler, parameters_est, parameters_all, probabilities

            # Take mean over posteriors of top n
            y_train_prediction_m = np.mean(y_train_prediction, axis=0)
            y_prediction_m = np.mean(y_prediction, axis=0)

            # NOTE: Not sure if this is best way to compute AUC
            y_score = y_prediction_m

            if type(t.estimator) == sklearn.svm.classes.SVC:
                # Look for optimal F1 performance on training set
                thresholds = np.arange(0, 1, 0.01)
                f1_scores = list()
                y_train_prediction = np.zeros(y_train_prediction_m.shape)
                for thresh in thresholds:  # do not reuse 't', which holds the trained classifier
                    for ip, y in enumerate(y_train_prediction_m):
                        if y > thresh:
                            y_train_prediction[ip] = 1
                        else:
                            y_train_prediction[ip] = 0

                    f1_scores.append(
                        f1_score(y_train_prediction,
                                 y_train,
                                 average='weighted'))

                # Use best threshold to determine test score
                best_index = np.argmax(f1_scores)
                best_thresh = thresholds[best_index]
                best_thresh = 0.5
                y_prediction = np.zeros(y_prediction_m.shape)
                for ip, y in enumerate(y_prediction_m):
                    if y > best_thresh:
                        y_prediction[ip] = 1
                    else:
                        y_prediction[ip] = 0

                # y_prediction = t.predict(X_temp)

                y_prediction = [min(max(y, 0), 1) for y in y_prediction]
            else:
                y_prediction = y_prediction_m
                y_prediction = [min(max(y, 0), 1) for y in y_prediction]

            print "Truth: ", y_truth
            print "Prediction: ", y_prediction

            for k, v in zip(test_patient_IDs, y_prediction):
                scores[k] = v

            # for k, v in scores.iteritems():
            #     print k, v
            #
            # raise IOError
            towrite.append(["Iteration " + str(num)] + scores.values())

            if type(t.estimator) == sklearn.svm.classes.SVC:
                for i_truth, i_predict, i_test_ID in zip(
                        y_truth, y_prediction, test_patient_IDs):
                    if i_truth == i_predict:
                        patient_classification_list[i_test_ID][
                            'N_correct'] += 1
                    else:
                        patient_classification_list[i_test_ID]['N_wrong'] += 1

            if type(t.estimator) == sklearn.svm.classes.SVC:
                c_mat = confusion_matrix(y_truth, y_prediction)
                TN = c_mat[0, 0]
                FN = c_mat[1, 0]
                TP = c_mat[1, 1]
                FP = c_mat[0, 1]

                if FN == 0 and TP == 0:
                    sensitivity.append(0)
                else:
                    sensitivity.append(float(TP) / (TP + FN))
                if FP == 0 and TN == 0:
                    specificity.append(0)
                else:
                    specificity.append(float(TN) / (FP + TN))
                if TP == 0 and FP == 0:
                    precision.append(0)
                else:
                    precision.append(float(TP) / (TP + FP))
                accuracy.append(accuracy_score(y_truth, y_prediction))
                # y_score = t.decision_function(X_temp)
                auc.append(roc_auc_score(y_truth, y_score))
                f1_score_list.append(
                    f1_score(y_truth, y_prediction, average='weighted'))
            # elif type(t.estimator) == sklearn.svm.classes.SVR:
            else:
                # y_score.extend(svm[k].ix('svms')[0].predict_proba(X_test))
                # y_predict.extend(svm[k].ix('svms')[0].predict(X_test))
                # y_test.extend(Y_test)
                # pid_test.extend(pidt)
                r2score.append(r2_score(y_truth, y_prediction))
                MSE.append(mean_squared_error(y_truth, y_prediction))
                coefICC.append(ICC(np.column_stack((y_prediction, y_truth))))
                C = pearsonr(y_prediction, y_truth)
                PearsonC.append(C[0])
                PearsonP.append(C[1])
                C = spearmanr(y_prediction, y_truth)
                SpearmanC.append(C.correlation)
                SpearmanP.append(C.pvalue)

                if survival:
                    # Extract time to event and event from label data
                    E_truth = np.asarray(
                        [mutation_label[1][k][0] for k in test_indices])
                    T_truth = np.asarray(
                        [mutation_label[2][k][0] for k in test_indices])

                    # Concordance index
                    cindex.append(1 - ll.utils.concordance_index(
                        T_truth, y_prediction, E_truth))

                    # Fit Cox model using SVR output, time to event and event
                    data = {
                        'predict': y_prediction,
                        'E': E_truth,
                        'T': T_truth
                    }
                    data = pd.DataFrame(data=data, index=test_patient_IDs)

                    try:
                        cph = ll.CoxPHFitter()
                        cph.fit(data, duration_col='T', event_col='E')

                        coxcoef.append(cph.summary['coef']['predict'])
                        coxp.append(cph.summary['p']['predict'])
                    except ValueError:
                        # Convergence halted, delta contains nan values?
                        coxcoef.append(1)
                        coxp.append(0)
                    except np.linalg.LinAlgError:
                        #FIXME: Singular matrix
                        coxcoef.append(1)
                        coxp.append(0)

        towrite = zip(*towrite)
        with open(csvfile, 'wb') as csv_file:
            writer = csv.writer(csv_file)
            for w in towrite:
                writer.writerow(w)

        # print(N_1)
        # print(N_2)

        if type(t.estimator) == sklearn.svm.classes.SVC:
            N_iterations = len(sinks)
            accuracy_mean = np.mean(accuracy)
            S_uj = 1.0 / max((N_iterations - 1), 1) * np.sum(
                (accuracy_mean - accuracy)**2.0)

            # print Y_test

            accuracy_var = np.sqrt((1.0 / N_iterations + N_2 / N_1) * S_uj)
            # print(accuracy_var)
            # print(np.sqrt(1/N_iterations*S_uj))
            # print(st.sem(accuracy))

            stats = dict()
            stats["Accuracy 95%:"] = str(CI(accuracy, N_1, N_2, alpha))

            stats["AUC 95%:"] = str(CI(auc, N_1, N_2, alpha))

            stats["F1-score 95%:"] = str(CI(f1_score_list, N_1, N_2, alpha))

            stats["Precision 95%:"] = str(CI(precision, N_1, N_2, alpha))

            stats["Sensitivity 95%: "] = str(CI(sensitivity, N_1, N_2, alpha))

            stats["Specificity 95%:"] = str(CI(specificity, N_1, N_2, alpha))

            print("Accuracy 95%:" + str(CI(accuracy, N_1, N_2, alpha)))

            print("AUC 95%:" + str(CI(auc, N_1, N_2, alpha)))

            print("F1-score 95%:" + str(CI(f1_score_list, N_1, N_2, alpha)))

            print("Precision 95%:" + str(CI(precision, N_1, N_2, alpha)))

            print("Sensitivity 95%: " + str(CI(sensitivity, N_1, N_2, alpha)))

            print("Specificity 95%:" + str(CI(specificity, N_1, N_2, alpha)))

            alwaysright = dict()
            alwayswrong = dict()
            for i_ID in patient_classification_list:
                percentage_right = patient_classification_list[i_ID][
                    'N_correct'] / float(
                        patient_classification_list[i_ID]['N_test'])

                # print(i_ID + ' , ' + str(patient_classification_list[i_ID]['N_test']) + ' : ' + str(percentage_right) + '\n')
                if percentage_right == 1.0:
                    label = mutation_label[0][np.where(i_ID == patient_IDs)]
                    label = label[0][0]
                    alwaysright[i_ID] = label
                    # alwaysright.append(('{} ({})').format(i_ID, label))
                    print(("Always Right: {}, label {}").format(i_ID, label))

                if percentage_right == 0:
                    label = mutation_label[0][np.where(
                        i_ID == patient_IDs)].tolist()
                    label = label[0][0]
                    alwayswrong[i_ID] = label
                    # alwayswrong.append(('{} ({})').format(i_ID, label))
                    print(("Always Wrong: {}, label {}").format(i_ID, label))

            stats["Always right"] = alwaysright
            stats["Always wrong"] = alwayswrong
            # Gather all scores for all patients and average
            pid_unique = list(set(pid_test))
            pid_unique = sorted(pid_unique)
            posteriors = dict()
            for pid in pid_unique:
                posteriors[pid] = list()

                counts = 0
                for num, allid in enumerate(pid_test):
                    if allid == pid:
                        counts += 1
                        posteriors[pid].append(y_score[num][0])
                        truelabel = y_test[num]

                posteriors[pid] = [np.mean(posteriors[pid]), truelabel, counts]
        # elif type(t.estimator) == sklearn.svm.classes.SVR:
        else:
            # Compute confidence intervals from cross validations
            stats = dict()
            stats["r2_score 95%:"] = str(CI(r2score, N_1, N_2, alpha))
            stats["MSE 95%:"] = str(CI(MSE, N_1, N_2, alpha))
            stats["ICC 95%:"] = str(CI(coefICC, N_1, N_2, alpha))
            stats["PearsonC 95%:"] = str(CI(PearsonC, N_1, N_2, alpha))
            stats["SpearmanC 95%: "] = str(CI(SpearmanC, N_1, N_2, alpha))
            stats["PearsonP 95%:"] = str(CI(PearsonP, N_1, N_2, alpha))
            stats["SpearmanP 95%: "] = str(CI(SpearmanP, N_1, N_2, alpha))

            if survival:
                stats["Concordance 95%:"] = str(CI(cindex, N_1, N_2, alpha))
                stats["Cox coef. 95%:"] = str(CI(coxcoef, N_1, N_2, alpha))
                stats["Cox p 95%:"] = str(CI(coxp, N_1, N_2, alpha))

            # Calculate and sort individual patient MSE
            patient_MSE = {k: np.mean(v) for k, v in patient_MSE.iteritems()}
            order = np.argsort(patient_MSE.values())
            sortedkeys = np.asarray(patient_MSE.keys())[order].tolist()
            sortedvalues = np.asarray(patient_MSE.values())[order].tolist()
            patient_MSE = [(k, v) for k, v in zip(sortedkeys, sortedvalues)]

            for p in patient_MSE:
                print p[0], p[1]

            stats["Patient_MSE"] = patient_MSE

            for k, v in stats.iteritems():
                print k, v

        # Check which parameters were most often used
        params = paracheck(params)
        # params = dict()
        # for num, classf in enumerate(trained_classifiers):
        #     params_temp = classf.best_params_
        #     if num == 0:
        #         for k in params_temp.keys():
        #             params[k] = list()
        #             params[k].append(params_temp[k])
        #     else:
        #         for k in params_temp.keys():
        #             params[k].append(params_temp[k])
        #
        # print params

        # # Make histograms or box plots of params
        # for k in params.keys():
        #     para = params[k]
        #     print k
        #     if type(para[0]) is unicode:
        #         letter_counts = Counter(para)
        #         values = letter_counts.values()
        #         keys = letter_counts.keys()
        #         print keys, values
        #         plt.bar(range(len(values)), values, align='center')
        #         plt.xticks(range(len(keys)), keys)
        #         plt.show()
        #     else:
        #         # Make a standard boxplot
        #         plt.figure()
        #         plt.boxplot(para, 0, 'gD')
        #         plt.show()

        # Save output
        savedict = dict()
        savedict["Statistics"] = stats
        savedict['Parameters'] = params

        if type(output_json) is list:
            output_json = ''.join(output_json)

        if not os.path.exists(os.path.dirname(output_json)):
            os.makedirs(os.path.dirname(output_json))

        with open(output_json, 'w') as fp:
            json.dump(savedict, fp, indent=4)

        print("Saved data!")
Example #15
    def _run(self):
        self._cf = lifelines.CoxPHFitter()
        self._cf.fit(self.df, self.survival_col, event_col=self.cens_col, include_likelihood=True)
def time_between(r):
    if r.isnull()['Loan Paid In Full Date']:
        return r['Loan Maturity Date'] - r['Funded Date']
    else:
        return r['Loan Paid In Full Date'] - r['Funded Date']
    #if r.isnull()['Charge Off Date']:
    #    return r['Loan Maturity Date'] - r['Funded Date']
    #else:
    #    return r['Charge Off Date'] - r['Funded Date']


T = df.apply(lambda r: time_between(r), axis=1).dt.days
E = df['Loan Status']

# survival analysis
cph = lifelines.CoxPHFitter()
X = df
X['T'] = T

# One Hot encode the categorical features
cat_vars = ['Grade', 'Loan Purpose', 'Housing Status']
for var in cat_vars:
    X = pd.concat((X, pd.get_dummies(X[var])), axis=1)
    X = X.drop(var, axis=1)

# remove unused datetime features
X = X.drop([
    'Charge Off Date', 'Funded Date', 'Loan Maturity Date',
    'Loan Paid In Full Date'
],
           axis=1)
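
The snippet stops before the actual fit; a minimal sketch of the step it builds towards, assuming 'Loan Status' is already coded 0/1 and the remaining columns of X are numeric:

# Assumption: 'Loan Status' is a 0/1 event indicator and all remaining columns are numeric
cph.fit(X, duration_col='T', event_col='Loan Status')
cph.print_summary()
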
# print(df.shape)
# df.head()

modelspec = 'manufacturer + capacity'

dft = pt.dmatrix(modelspec, df, return_type='dataframe')
design_info = dft.design_info
dft = dft.join(df[['maxhours', 'failed']])

## NOTE: CoxPHFitter expects reduced-rank design matrix WITHOUT intercept
## https://courses.nus.edu.sg/course/stacar/internet/st3242/handouts/notes3.pdf
del dft['Intercept']
dft.head().T

cx = sa.CoxPHFitter(normalize=False)
cx.fit(df=dft,
       duration_col='maxhours',
       event_col='failed',
       show_progress=True,
       include_likelihood=True)

fig, axes = plt.subplots(nrows=1, ncols=2, squeeze=False, sharex=True)
cx.baseline_cumulative_hazard_.plot(ax=axes[0, 0],
                                    legend=False,
                                    title='Baseline cumulative hazard rate')
cx.baseline_survival_.plot(ax=axes[0, 1],
                           legend=False,
                           title='Baseline survival rate')

cx.summary
Example #18
def lsReg(M, params):
    import lifelines as lf

    cph = lf.CoxPHFitter()
    return (cph.fit(M, duration_col=params['tmCol'],
                    event_col=params['ixCol']))
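
A quick sketch of calling lsReg on the Rossi dataset bundled with lifelines; the params keys follow the snippet's own 'tmCol'/'ixCol' convention:

from lifelines.datasets import load_rossi

params = {'tmCol': 'week', 'ixCol': 'arrest'}
fitted = lsReg(load_rossi(), params)
print(fitted.summary)
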
#define which cluster feature to examine
#options: [coexpression_cluster, functional_proteins_cluster, immunoregulatory_protein_cluster]
#can either examine each one-at-a-time or iterate through them
cluster_choice = "coexpression_cluster"

#Encode the Architecture column (originally dtype: str) as integer category codes
df["Architecture"] = df["Architecture"].astype("category").cat.codes

#Create two separate feature matrices for recurrence and survival
recurrence_df = df.drop(columns=["Survival", "Survival_time"])[[
    cluster_choice, "grade", "age", "Architecture", "Recurrence",
    "Recurrence_time"
]]
survival_df = df.drop(columns=["Recurrence", "Recurrence_time"])[[
    cluster_choice, "grade", "age", "Architecture", "Survival", "Survival_time"
]]

#Define and fit Cox PH Fitter for recurrence
recurrence_cph = lifelines.CoxPHFitter()
recurrence_cph.fit(recurrence_df,
                   duration_col='Recurrence_time',
                   event_col='Recurrence')
recurrence_cph.print_summary()  # prints the summary table (returns None)

#Define and fit Cox PH Fitter for survival
survival_cph = lifelines.CoxPHFitter()
survival_cph.fit(survival_df,
                 duration_col='Survival_time',
                 event_col='Survival')
survival_cph.print_summary()  # prints the summary table (returns None)
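
If the numbers are needed programmatically rather than only printed, the fitted models expose them through .summary; for example, the hazard ratio and p-value of the chosen cluster feature:

print(recurrence_cph.summary.loc[cluster_choice, ['exp(coef)', 'p']])
print(survival_cph.summary.loc[cluster_choice, ['exp(coef)', 'p']])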