Ejemplo n.º 1
0
def km_curve(labels_ids, survival_dataset, tested_gene_expression_headers_columns, gene_group , k=None, label_index=None):
    """Plot one Kaplan-Meier curve per cluster and log-rank test every cluster.

    Each non-empty cluster is tested against the union of all other clusters
    ("vs all") and against every earlier cluster; the p-values are printed.
    The figure is saved under ``constants.BASE_PROFILE/output`` and the axes
    are cleared afterwards.

    :param labels_ids: iterable of clusters, each an iterable of sample ids.
    :param survival_dataset: 2-D array; column 0 is matched against the sample
        ids, column 3 is used as duration and column 4 as event flag.
    :param tested_gene_expression_headers_columns: sample ids to keep; rows
        whose id is not listed here are ignored.
    :param gene_group: path-like string; its last "/" component goes into the
        output file name.
    :param k: only used in the output file name.
    :param label_index: only used in the output file name.
    :return: list with the "cluster vs all" p-value of every non-empty cluster.
    """
    ax = plt.subplot(111)

    kmf = KaplanMeierFitter()
    all_labels = np.array([y for x in labels_ids for y in x])
    sample_ids = survival_dataset[:, 0]
    # Loop-invariant masks: which dataset rows / cluster labels were tested.
    tested_rows = np.in1d(sample_ids, tested_gene_expression_headers_columns)
    tested_labels = np.in1d(all_labels, tested_gene_expression_headers_columns)
    label_event_list = []
    label_duration_list = []
    results = []
    for i, cur_labels in enumerate(labels_ids):
        cluster_rows = np.in1d(sample_ids, cur_labels) & tested_rows
        label_event = survival_dataset[cluster_rows, 4].astype(np.int32)
        label_duration = survival_dataset[cluster_rows, 3].astype(np.int32)
        # Always recorded (even when empty) so list positions match cluster indices.
        label_event_list.append(label_event)
        label_duration_list.append(label_duration)
        if len(label_duration) == 0:
            # Nothing to fit or test for an empty cluster.
            continue

        labels_c = all_labels[~np.in1d(all_labels, cur_labels) & tested_labels]
        complement_rows = np.in1d(sample_ids, labels_c)
        label_event_c = survival_dataset[complement_rows, 4].astype(np.int32)
        label_duration_c = survival_dataset[complement_rows, 3].astype(np.int32)

        lr_results = logrank_test(label_duration, label_duration_c, label_event, label_event_c, alpha=.95)
        kmf.fit(list(label_duration), event_observed=list(label_event), label="cluster {} n={}, logrank pval = {}".format(i,len(label_duration), '{0:1.3e}'.format(lr_results.p_value)))
        kmf.plot(ax=ax, show_censors=True)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("lrank cluster {} vs all: {}".format(i, lr_results.p_value))
        results.append(lr_results.p_value)
        # Pairwise comparison against every earlier cluster.
        for j in range(i):
            lr_results = logrank_test(label_duration, label_duration_list[j], label_event, label_event_list[j], alpha=.95)
            print("lrank cluster {} vs cluster {}: {}".format(i, j, lr_results.p_value))
    plt.ylim(0, 1)

    plt.title("clustering survival analysis")
    plt.savefig(os.path.join(constants.BASE_PROFILE,"output" ,"cluster_by_p_{}_{}_k={}_label_i={}_{}.png".format(constants.CANCER_TYPE, gene_group.split("/")[-1],k,label_index , time.time())))
    plt.cla()

    return results
Ejemplo n.º 2
0
def _write_logrank_row(f, group_a, group_b):
    """Run one pairwise log-rank test, write its report rows to ``f`` and return the result."""
    results = logrank_test(group_a['duration'], group_b['duration'], group_a['event_obs'], group_b['event_obs'], alpha=.99)
    f.write('__ p-value ___|__ test statistic __|____ test result ____|__ is significant __\n')
    f.write(str(results.p_value) + ' | ' + str(results.test_statistic)+' | '+str(results.test_result)+ ' | '+str(results.is_significant)+'\n')
    return results


def run_logrank(df1, df2, df3, f, cohorts):
    """Log-rank test the cohorts pairwise and build a title suffix from the p-values.

    With more than two cohorts the pairs (df1, df2), (df2, df3) and (df1, df3)
    are tested, otherwise only (df1, df2). Every test is reported to ``f``.

    :param df1, df2, df3: frames with ``'duration'`` and ``'event_obs'``
        columns; ``df3`` is only read when ``cohorts > 2``.
    :param f: writable text file object receiving the report rows.
    :param cohorts: number of cohorts being compared.
    :return: ``'\\n(p1[,p2,p3])'`` with p-values rounded to 4 decimals, followed
        by ``'*'`` when ``check_value`` flags at least one of the comparisons.
    """
    pairs = [(df1, df2)]
    if cohorts > 2:
        pairs.extend([(df2, df3), (df1, df3)])

    check = False
    rounded_p_values = []
    for group_a, group_b in pairs:
        results = _write_logrank_row(f, group_a, group_b)
        # Accumulate instead of overwrite: previously only the last pair
        # decided whether the title was starred.
        check = check_value(results.p_value) or check
        rounded_p_values.append(str(round(results.p_value, 4)))

    title = '\n(' + ','.join(rounded_p_values) + ')'
    if check:
        title += "*"
    return title
Ejemplo n.º 3
0
def test_logrank_test_is_symmetric():
    """Swapping the two samples must not change the log-rank outcome."""
    sample_a = np.random.exponential(5, size=(2000, 1)).astype(int)
    sample_b = np.random.exponential(1, size=(2000, 1)).astype(int)
    forward = stats.logrank_test(sample_a, sample_b)
    backward = stats.logrank_test(sample_b, sample_a)
    p_value_gap = abs(forward.p_value - backward.p_value)
    assert p_value_gap < 10e-8
    assert backward.is_significant == forward.is_significant
Ejemplo n.º 4
0
def test_log_rank_returns_None_if_equal_arrays():
    """Comparing a sample with itself must never be reported as significant."""
    T = np.random.exponential(5, size=200)
    result = stats.logrank_test(T, T, alpha=0.95)
    assert not result.is_significant

    # Event flags must be 0/1; binomial(2, ...) would also produce the value 2.
    C = np.random.binomial(1, 0.8, size=200)
    result = stats.logrank_test(T, T, C, C, alpha=0.95)
    assert not result.is_significant
def test_log_rank_returns_None_if_equal_arrays():
    """A sample compared with itself must yield p > 0.05, with and without event flags."""
    durations = np.random.exponential(5, size=200)
    without_flags = stats.logrank_test(durations, durations)
    assert without_flags.p_value > 0.05

    observed = np.random.binomial(1, 0.8, size=200)
    with_flags = stats.logrank_test(durations, durations, observed, observed)
    assert with_flags.p_value > 0.05
Ejemplo n.º 6
0
def test_log_rank_returns_None_if_equal_arrays():
    """Comparing a sample with itself must yield p > 0.05."""
    T = np.random.exponential(5, size=200)
    result = stats.logrank_test(T, T, alpha=0.95)
    assert result.p_value > 0.05

    # Event flags must be 0/1; binomial(2, ...) would also produce the value 2.
    C = np.random.binomial(1, 0.8, size=200)
    result = stats.logrank_test(T, T, C, C, alpha=0.95)
    assert result.p_value > 0.05
def survival_for_two(df, treat, ctrl, legends, title, figname):
    """Log-rank test and Kaplan-Meier plot of the ``treat`` group versus all other rows.

    :param df: DataFrame with ``'group'``, ``'time'`` and ``'status'`` columns.
    :param treat: value of ``df['group']`` selecting the first group; every
        other row forms the second group.
    :param ctrl: not read by this function (kept for interface compatibility).
    :param legends: legend labels passed to ``ax.legend``.
    :param title: plot title.
    :param figname: path the figure is saved to.
    :return: the result object returned by ``logrank_test``.
    """
    # select the time and status info for the treat group and for the rest
    ix = df['group'] == treat
    t1 = df.loc[ix, 'time']
    print(t1.shape)
    e1 = df.loc[ix, 'status']
    t2 = df.loc[~ix, 'time']
    print(t2.shape)
    e2 = df.loc[~ix, 'status']

    results = logrank_test(t1, t2, event_observed_A=e1, event_observed_B=e2)
    pvalue = results.p_value
    print('pvalue:\t{}'.format(pvalue))

    # survival curves
    plt.figure(figsize=(3., 3.))
    ax = plt.subplot(111)

    censor_styles = {'ms': 12, 'marker': '+'}
    KaplanMeierFitter().fit(t1, e1).plot(ax=ax, show_censors=True,
                                         censor_styles=censor_styles,
                                         ci_show=False, c='red', ls='-')
    KaplanMeierFitter().fit(t2, e2).plot(ax=ax, show_censors=True,
                                         censor_styles=censor_styles,
                                         ci_show=False, c='k', ls='--')

    handles, labels = ax.get_legend_handles_labels()
    print(labels)
    # NOTE(review): every second handle is kept, presumably to skip the
    # censor-marker artists; confirm against the lifelines version in use.
    ax.legend(handles[1::2],
              legends,
              loc='lower left',
              borderaxespad=-.15,
              handletextpad=.2,
              labelspacing=.3,
              handlelength=1,
              frameon=False)
    if pvalue < 1:
        # Annotate the axes holding the curves; plt.axes() may add a brand-new
        # Axes to the figure instead of returning this one.
        ax.text(df['time'].max() * 0.45,
                0.45,
                'p={:.2f}'.format(pvalue),
                fontsize=16,
                ha='center')

    plt.ylim([-0.02, 1.05])
    plt.title(title, fontsize=22)
    plt.xlabel('Days', fontsize=22)
    plt.ylabel('Survival probability', fontsize=22)
    plt.savefig(figname,
                bbox_inches='tight',
                pad_inches=.1,
                dpi=600,
                transparent=True)
    plt.close()
    return results
Ejemplo n.º 8
0
def test_unequal_intensity_event_observed():
    """Samples drawn with different exponential scales must differ (p < 0.05) despite random event flags."""
    shape = (2000, 1)
    data1 = np.random.exponential(5, size=shape)
    data2 = np.random.exponential(1, size=shape)
    eventA = np.random.binomial(1, 0.5, size=shape)
    eventB = np.random.binomial(1, 0.5, size=shape)
    outcome = stats.logrank_test(
        data1,
        data2,
        event_observed_A=eventA,
        event_observed_B=eventB,
    )
    assert outcome.p_value < 0.05
def test_unequal_intensity_event_observed():
    """Log-rank test with event flags must separate exponential(5) from exponential(1) samples."""
    n_rows = 2000
    durations_a = np.random.exponential(5, size=(n_rows, 1))
    durations_b = np.random.exponential(1, size=(n_rows, 1))
    observed_a = np.random.binomial(1, 0.5, size=(n_rows, 1))
    observed_b = np.random.binomial(1, 0.5, size=(n_rows, 1))
    p_value = stats.logrank_test(durations_a, durations_b, event_observed_A=observed_a, event_observed_B=observed_b).p_value
    assert p_value < 0.05
def test_log_rank_test_on_waltons_dataset():
    """The "miR-137" rows of the Waltons data must differ from the remaining rows (p < 0.05)."""
    waltons = load_waltons()
    is_mir137 = waltons["group"] == "miR-137"
    durations_mir137 = waltons.loc[is_mir137]["T"]
    durations_rest = waltons.loc[~is_mir137]["T"]
    outcome = stats.logrank_test(durations_mir137, durations_rest)
    assert outcome.p_value < 0.05
Ejemplo n.º 11
0
def test_waltons_dataset():
    """Log-rank test on the Waltons data: "miR-137" rows versus all other rows gives p < 0.05."""
    frame = load_waltons()
    selected = frame['group'] == 'miR-137'
    inside, outside = frame.loc[selected]['T'], frame.loc[~selected]['T']
    assert stats.logrank_test(inside, outside).p_value < 0.05
Ejemplo n.º 12
0
	def __KM_analysis(self,duration_table,expressed_array,unexpressed_array,freq_set):
		"""Log-rank test the expressed rows against the unexpressed rows.

		Rows of ``duration_table`` are ``(id, duration, event)``; the first row
		is skipped and rows with ``"NA"`` in the duration or event field are
		ignored. When the p-value is below .0006 both Kaplan-Meier curves are
		shown with ``plt.show()``.

		Returns the log-rank p-value.
		"""
		expressed_T, expressed_C = [], []
		unexpressed_T, unexpressed_C = [], []
		for idx, row in enumerate(duration_table):
			if idx == 0:
				# First row is not data (presumably a header line).
				continue
			if row[0] in unexpressed_array:
				if row[1] != "NA" and row[2] != "NA":
					unexpressed_T.append(float(row[1]))
					unexpressed_C.append(int(row[2]))
			elif row[0] in expressed_array:
				if row[1] != "NA" and row[2] != "NA":
					expressed_T.append(float(row[1]))
					expressed_C.append(int(row[2]))

		results = logrank_test(expressed_T, unexpressed_T, expressed_C, unexpressed_C, alpha=.95 )
		if results.p_value < .0006:
			ax = plt.subplot(111)
			kmf = KaplanMeierFitter()
			curves = (
				(expressed_T, expressed_C, "Satisfying"),
				(unexpressed_T, unexpressed_C, "None-Satisfying"),
			)
			# The same fitter is re-fitted and drawn once per group.
			for durations, events, curve_label in curves:
				kmf.fit(durations, event_observed=events, label=curve_label)
				kmf.plot(ax=ax, ci_force_lines=False)
			plt.ylim(0,1)
			plt.title("Lifespans ("+str(freq_set)+")")
			plt.show()
		return results.p_value
Ejemplo n.º 13
0
def logrank_pval(stime, censor, g1):
    """Return the log-rank p-value between the entries selected by ``g1`` and all others.

    ``g1`` must support ``~`` and be usable as an index into ``stime`` and
    ``censor`` (e.g. a boolean mask).
    """
    rest = ~g1
    outcome = logrank_test(stime[g1], stime[rest], censor[g1], censor[rest], alpha=.95)
    return outcome.p_value
Ejemplo n.º 14
0
Archivo: kme.py Proyecto: shmalex/pyjs
def plot_two_groups(data, t_col_name, e_col_name, g_name, alpha):
    '''
    Render the Kaplan-Meier curves of two groups and show the log-rank p-value.

    data       -- DataFrame holding durations, event flags and group labels
    t_col_name -- name of the duration column
    e_col_name -- name of the event-observed column
    g_name     -- name of the grouping column; its first unique value forms
                  one group and every other row forms the second group, which
                  is labelled with the second unique value
    alpha      -- forwarded to logrank_test and shown in the plot title
    '''
    T = data[t_col_name]
    E = data[e_col_name]

    # Read the grouping column from the `data` argument (the code used to
    # read an unrelated name `df` here).
    groups = data[g_name]

    # get unique groups to get 1st and 2nd groups names
    uniques = groups.unique()

    ix = (groups == uniques[0])

    kmf = KaplanMeierFitter()
    # plot the rows that are NOT in the first unique group
    kmf.fit(T[~ix], E[~ix], label=uniques[1])
    ax = kmf.plot()

    # plot the rows of the first unique group on the same axes
    kmf.fit(T[ix], E[ix], label=uniques[0])
    kmf.plot(ax=ax)
    # get results for p-values
    results = logrank_test(T[ix], T[~ix], E[ix], E[~ix], alpha=alpha)
    plt.title('p-value: {0:.4f}, alpha: {1:.2f}'.format(
        results.p_value, alpha))
Ejemplo n.º 15
0
def kmplot(df_high, df_low, ax):
    """Draw high/low Kaplan-Meier curves on ``ax`` and annotate the log-rank p-value.

    :param df_high: frame with ``duration`` and ``event`` attributes (high group).
    :param df_low: frame with ``duration`` and ``event`` attributes (low group).
    :param ax: matplotlib Axes receiving the curves, labels, text and legend.
    :return: ``(p_value, hm5, hm10, lm5, lm10)`` where the last four are the
        fitted estimates at times 60 and 120 for the high and low group; the
        tuple ``("NA", "0", "0", "0", "0")`` is returned instead when fitting
        raises ``ValueError``.
    """
    kmf_high = KaplanMeierFitter()
    kmf_low = KaplanMeierFitter()
    try:
        kmf_high.fit(durations=df_high.duration,
                     event_observed=df_high.event,
                     label='High: n = ' + str(len(df_high)))
        kmf_low.fit(durations=df_low.duration,
                    event_observed=df_low.event,
                    label="Low: n = " + str(len(df_low)))
    except ValueError:
        return ("NA", "0", "0", "0", "0")
    kmf_high.plot(ax=ax, color="red", show_censors=True, ci_show=False)
    kmf_low.plot(ax=ax, color="black", show_censors=True, ci_show=False)
    statistics_result = logrank_test(df_high.duration,
                                     df_low.duration,
                                     event_observed_A=df_high.event,
                                     event_observed_B=df_low.event)
    p_value = statistics_result.p_value
    ax.set_xlabel('Time (months)')
    ax.set_ylabel('Probability')
    ax.text(0.95,
            0.02,
            'logrank P = ' + '%.4f' % p_value,
            verticalalignment='bottom',
            horizontalalignment='right',
            transform=ax.transAxes,
            color='black',
            fontsize=11)
    # Put the legend on the axes that was drawn on; plt.legend would target
    # whatever axes happens to be current.
    ax.legend(loc=3)
    hm5 = kmf_high.predict(60)
    hm10 = kmf_high.predict(120)
    lm5 = kmf_low.predict(60)
    lm10 = kmf_low.predict(120)
    return (p_value, hm5, hm10, lm5, lm10)
Ejemplo n.º 16
0
def test_significance(df1, df2):
    """Return the log-rank p-value between two frames using ``'os_years'`` and ``'CENSOR'`` columns."""
    durations = (df1['os_years'], df2['os_years'])
    flags = (df1['CENSOR'], df2['CENSOR'])
    outcome = logrank_test(durations[0], durations[1], flags[0], flags[1], alpha=.99)
    return outcome.p_value
Ejemplo n.º 17
0
    def logrank_test(self, treatment_a, treatment_b, t1error=0.05):
        """Calculate a log rank test (Mantel-Cox) statistic between two treatments
        Calls lifelines.statistics.logrank_test for calculation.

        Arguments:

            treatment_a     -  The legend label for this group as used
                               with add_mean.

            treatment_b     -  The legend label for this group as used
                               with add_mean.

            t1error         -  probability of a type 1 error (alpha)
                               Default: 0.05

        Returns the lifelines result object after printing its summary.

        Raises ValueError when no endpoint or no volume data has been set.
        """
        if not self.endpoint or not self.volume_data:
            # Carry the explanation in the exception itself so callers that
            # catch or log it still see the reason.
            raise ValueError(
                'you need to add data with .add_mean() before using logrank_test'
            )
        survival_a = volume_to_survival(self.volume_data[treatment_a],
                                        endpoint=self.endpoint)
        survival_b = volume_to_survival(self.volume_data[treatment_b],
                                        endpoint=self.endpoint)
        # The bare name resolves to the module-level lifelines function, not
        # to this method.
        result = logrank_test(list(survival_a['Time']),
                              list(survival_b['Time']),
                              list(survival_a['Observed']),
                              list(survival_b['Observed']),
                              alpha=1 - t1error)
        result.print_summary()
        return result
Ejemplo n.º 18
0
def test_log_rank_test_on_waltons_dataset():
    """Waltons data: durations of the 'miR-137' rows differ from the other rows (p < 0.05)."""
    table = load_waltons()
    mir137_rows = table['group'] == 'miR-137'
    samples = [table.loc[mask]['T'] for mask in (mir137_rows, ~mir137_rows)]
    outcome = stats.logrank_test(samples[0], samples[1])
    assert outcome.p_value < 0.05
def logrank_statistics(x, y, feature, min_leaf):
    """
    Find the split of one feature that maximises the lifelines log-rank statistic.

    Every unique value of the feature is tried as threshold (``<=`` goes left,
    ``>`` goes right); candidates leaving fewer than ``min_leaf`` samples on
    either side are skipped.

    :param x: Input samples (DataFrame); the index is reset, so the returned
        indices are positional.
    :param y: Labels (DataFrame); column 0 is used as duration, column 1 as
        event flag.
    :param feature: Feature index (column position in ``x``)
    :param min_leaf: Minimum number of samples on each side of a split.
    :return: best score, best split value (both rounded to 3 decimals), left
        indices, right indices; ``(0, None, None, None)`` when no split qualifies.
    """
    x_feature = x.reset_index(drop=True).iloc[:, feature]
    # Track the unrounded best score: comparing candidates against an already
    # rounded value could reject a split that is actually (slightly) better.
    best_score = 0
    best_split = None
    lhs_idxs = None
    rhs_idxs = None

    for split_val in x_feature.sort_values(ascending=True, kind="quicksort").unique():
        feature1 = list(x_feature[x_feature <= split_val].index)
        feature2 = list(x_feature[x_feature > split_val].index)
        if len(feature1) < min_leaf or len(feature2) < min_leaf:
            continue
        results = logrank_test(durations_A=y.iloc[feature1, 0], durations_B=y.iloc[feature2, 0],
                               event_observed_A=y.iloc[feature1, 1], event_observed_B=y.iloc[feature2, 1])
        score = results.test_statistic

        if score > best_score:
            best_score = score
            best_split = split_val
            lhs_idxs = feature1
            rhs_idxs = feature2

    if lhs_idxs is None:
        return 0, None, None, None
    return round(best_score, 3), round(best_split, 3), lhs_idxs, rhs_idxs
Ejemplo n.º 20
0
def plot_survival_curves(rec_t, rec_e, antirec_t, antirec_e, experiment_name = '', output_file = None):
    """Plot Kaplan-Meier curves of two groups and annotate the log-rank p-value.

    :param rec_t: durations of the "Recommendation" group.
    :param rec_e: event-observed flags of the "Recommendation" group.
    :param antirec_t: durations of the "Anti-Recommendation" group.
    :param antirec_e: event-observed flags of the "Anti-Recommendation" group.
    :param experiment_name: prefix for both legend labels.
    :param output_file: when truthy, the figure is saved to this path.
    """
    # Set-up plots
    plt.figure(figsize=(12,3))
    ax = plt.subplot(111)

    # Fit survival curves (one fitter, re-fitted for the second group)
    kmf = KaplanMeierFitter()
    kmf.fit(rec_t, event_observed=rec_e, label=' '.join([experiment_name, "Recommendation"]))
    kmf.plot(ax=ax,linestyle="-")
    kmf.fit(antirec_t, event_observed=antirec_e, label=' '.join([experiment_name, "Anti-Recommendation"]))
    kmf.plot(ax=ax,linestyle="--")

    # Format graph
    plt.ylim(0,1)
    ax.set_xlabel('Timeline (months)',fontsize='large')
    ax.set_ylabel('Percentage of Population Alive',fontsize='large')

    # Calculate p-value
    results = logrank_test(rec_t, antirec_t, rec_e, antirec_e, alpha=.95)
    results.print_summary()

    # Place the label at one ninth of the longest observed duration
    xloc = max(np.max(rec_t),np.max(antirec_t)) / 9
    if results.p_value < 1e-5:
        # Raw string: "\m" is not a valid escape sequence in a normal literal.
        ax.text(xloc,.2,r'$p < 1\mathrm{e}{-5}$',fontsize=20)
    else:
        ax.text(xloc,.2,'$p=%f$' % results.p_value,fontsize=20)
    plt.legend(loc='best',prop={'size':15})

    if output_file:
        plt.tight_layout()
        pylab.savefig(output_file)
Ejemplo n.º 21
0
def compute_pval(preds, labels, threshold, alpha):
    """
    Log-rank p-value between the risk groups obtained by thresholding ``preds``.

    preds: np.array (1D) with predictions
    labels: np.array (2D), n_patients x 2; column 0 is passed as duration,
    column 1 as event flag
    threshold: predictions <= threshold form the low-risk group, the others
    the high-risk group
    alpha: forwarded to logrank_test

    Returns
    -------
    p_value of the difference between the two risk groups, or np.nan when the
    test raises any exception (the exception is printed)
    """
    low_risk_idx = np.nonzero(preds <= threshold)[0]
    high_risk_idx = np.nonzero(preds > threshold)[0]

    try:
        return logrank_test(labels[low_risk_idx, 0],
                            labels[high_risk_idx, 0],
                            event_observed_A=labels[low_risk_idx, 1],
                            event_observed_B=labels[high_risk_idx, 1],
                            alpha=alpha).p_value
    except Exception as e:
        print("WW: Caught exception in compute_pval", e)
        return np.nan
Ejemplo n.º 22
0
def test_waltons_dataset():
    """Waltons data: the 'miR-137' rows versus all other rows must be flagged significant."""
    waltons = load_waltons()
    in_group = waltons['group'] == 'miR-137'
    group_durations = waltons.loc[in_group]['T']
    other_durations = waltons.loc[~in_group]['T']
    outcome = stats.logrank_test(group_durations, other_durations)
    assert outcome.is_significant
Ejemplo n.º 23
0
def logrank(T, C):
    """Run one log-rank test per grouping and collect the p-values.

    Reads the module-level ``groups`` (one assignment sequence per grouping)
    and ``filenames`` (its length sets how many samples are scanned). Within
    a grouping, samples assigned 1 and samples assigned 2 are compared;
    samples whose duration is -1 are ignored.

    :param T: durations, indexable by sample position.
    :param C: event flags, indexable by sample position.
    :return: list with one entry per grouping: the p-value, or the sentinel
        100 when no sample was assigned to group 1.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Running logrank test...")

    p_values = []
    for assignment in groups:
        T1, T2, C1, C2 = [], [], [], []
        for j in range(len(filenames)):
            if T[j] == -1:
                continue
            if assignment[j] == 1:
                T1.append(T[j])
                C1.append(C[j])
            elif assignment[j] == 2:
                T2.append(T[j])
                C2.append(C[j])

        if T1:
            # Named so it does not shadow this function.
            test_result = logrank_test(T1, T2, C1, C2, alpha=0.99)
            p_values.append(test_result.p_value)
        else:
            p_values.append(100)

    return p_values
Ejemplo n.º 24
0
 def dichot(self, T, F, surv_prob, median):
     """Split ``T`` and ``F`` at ``median`` of ``surv_prob`` and log-rank test the halves.

     Entries with ``surv_prob >= median`` form group 1, entries with
     ``surv_prob < median`` form group 2.

     Returns ``(T1, T2, E1, E2, p)`` where ``p`` is the log-rank p-value.
     """
     at_or_above = surv_prob >= median
     below = surv_prob < median
     T1, E1 = T[at_or_above], F[at_or_above]
     T2, E2 = T[below], F[below]
     outcome = logrank_test(T1, T2, E1, E2)
     return T1, T2, E1, E2, outcome.p_value
Ejemplo n.º 25
0
def test_logrank_test_with_t_0():
    """Smoke test: logrank_test accepts the ``t_0`` keyword (no value is asserted)."""
    control_T = [1, 1, 2, 2, 3, 4, 4, 5, 5, 8, 8, 8, 8, 11, 11, 12, 12, 15, 17, 22, 23]
    treatment_T = [6, 6, 6, 7, 10, 13, 16, 22, 23, 6, 9, 10, 11, 17, 19, 20, 25, 32, 32, 34, 25]

    # Every control duration is an observed event.
    control_E = np.ones_like(control_T)
    # The first 9 treatment entries are observed events, the other 12 are not.
    treatment_E = [1] * 9 + [0] * 12

    stats.logrank_test(
        control_T,
        treatment_T,
        event_observed_A=control_E,
        event_observed_B=treatment_E,
        t_0=10,
    )
def test_logrank_test_output_against_R_1():
    """The p-value on the g3 data ("RIT" rows versus the rest) must match the reference 0.0138."""
    g3 = load_g3()
    is_rit = g3["group"] == "RIT"
    rit, rest = g3.loc[is_rit], g3.loc[~is_rit]

    outcome = stats.logrank_test(
        rit["time"],
        rest["time"],
        event_observed_A=rit["event"],
        event_observed_B=rest["event"],
    )
    reference_p_value = 0.0138
    assert abs(outcome.p_value - reference_p_value) < 0.0001
Ejemplo n.º 27
0
    def set_quality(self):
        """Store the log-rank test of the sub-group versus its complement and derive ``quality``.

        Sets ``self.logrank_test`` to the test result and ``self.quality`` to
        ``1 - p_value``; returns nothing.
        """
        inside = self.sub_group
        outside = self.sub_group_complement
        self.logrank_test = logrank_test(inside['survival_times'],
                                         outside['survival_times'],
                                         inside['events'],
                                         outside['events'])
        self.quality = 1 - self.logrank_test.p_value
def test_equal_intensity_with_negative_data():
    """Two standardised normal samples (which include negative values) must not differ (p > 0.05)."""

    def standardised_sample():
        # Draw, then centre and scale in place.
        sample = np.random.normal(0, size=(2000, 1))
        sample -= sample.mean()
        sample /= sample.std()
        return sample

    data1 = standardised_sample()
    data2 = standardised_sample()
    assert stats.logrank_test(data1, data2).p_value > 0.05
Ejemplo n.º 29
0
def test_logrank_test_output_against_R_1():
    """g3 data, 'RIT' rows versus the others: p-value must be within 1e-4 of 0.0138."""
    frame = load_g3()
    rit_mask = frame['group'] == 'RIT'
    rit_rows = frame.loc[rit_mask]
    other_rows = frame.loc[~rit_mask]
    d1, e1 = rit_rows['time'], rit_rows['event']
    d2, e2 = other_rows['time'], other_rows['event']

    outcome = stats.logrank_test(d1, d2, event_observed_A=e1, event_observed_B=e2)
    deviation = abs(outcome.p_value - 0.0138)
    assert deviation < 0.0001
Ejemplo n.º 30
0
def test_equal_intensity_with_negative_data():
    """Standardised normal samples contain negative values; the test must still report p > 0.05."""
    samples = []
    for _ in range(2):
        drawn = np.random.normal(0, size=(2000, 1))
        drawn -= drawn.mean()
        drawn /= drawn.std()
        samples.append(drawn)
    outcome = stats.logrank_test(samples[0], samples[1])
    assert outcome.p_value > 0.05
Ejemplo n.º 31
0
    def metrices(self, T, surv_prob, F, y, year, train_val, median):
        """Compute and print concordance index, Brier score and log-rank p-value.

        The Brier target is the cumulative product along axis 1 of the columns
        of ``y`` that precede the first entry of the module-level ``breaks``
        exceeding ``365 * year``. For the log-rank test the samples are split
        into ``surv_prob >= median`` and ``surv_prob < median``.

        As a side effect the matplotlib font family and tick label sizes are
        set through ``plt.rc``.

        Returns ``(conc, brier, p)``.
        """
        # NOTE(review): 365 * year suggests ``breaks`` is expressed in days - confirm.
        first_break_past_horizon = np.nonzero(breaks > 365 * year)[0][0]
        brier_true = np.cumprod(y[:, 0:first_break_past_horizon], axis=1)[:, -1]
        conc = concordance_index(T, surv_prob, F)
        brier = brier_score_loss(brier_true, surv_prob)

        at_or_above = surv_prob >= median
        below = surv_prob < median
        p = logrank_test(T[at_or_above], T[below], F[at_or_above], F[below]).p_value

        plt.rc('font', family='serif')
        for tick_group in ('xtick', 'ytick'):
            plt.rc(tick_group, labelsize='x-small')

        context = (str(year), train_val, str(self.omics))
        print("%s year %s concordance index for %s:" % context, conc)
        print("%s year %s brier score for %s:" % context, brier)
        print("P-value:", p)
        return conc, brier, p
Ejemplo n.º 32
0
def kaplanmeier_stats_two(unique_groups, grouped_data, analysis_type):
	"""Log-rank test the first two groups and write a one-row report.

	The report goes to ``Kaplan_<analysis_type>.txt`` as tab-separated text
	with the columns Test, P-Value and Significant.

	Returns the log-rank result object.
	"""
	name_a, name_b = unique_groups[0], unique_groups[1]
	results = logrank_test(
		grouped_data.get_group(name_a)['survival'],
		grouped_data.get_group(name_b)['survival'],
		grouped_data.get_group(name_a)['event'],
		grouped_data.get_group(name_b)['event'],
	)
	report_path = 'Kaplan_%s.txt' % (analysis_type)
	with open(report_path, 'w') as f:
		test_name = "%s vs %s" % (name_a, name_b)
		f.write('Test\tP-Value\tSignificant\n')
		row = '%s\t%4g\t%s\n' % (test_name, results.p_value, results.is_significant)
		f.write(row)
	return results
Ejemplo n.º 33
0
def main():
    """Log-rank test male versus female data row by row and plot the significant rows.

    Rows are read from two CSV files and processed in order until a male row
    whose second column is below 50 is reached. For every processed row the
    values from the third column onwards are expanded with ``make_data`` and
    compared; rows with p < 0.05 are plotted through ``make_kmf_plt`` into the
    ``logrank`` directory. Finally the number of tests and the number of
    significant tests are printed.
    """
    savedir = "logrank"
    os.makedirs(savedir, exist_ok=True)
    data_male = pd.read_csv("count_age_each_sex_male.csv").values.tolist()
    data_female = pd.read_csv("count_age_each_sex_female.csv").values.tolist()
    count = 0
    count_u = 0

    for i, male_row in enumerate(data_male):
        if male_row[1] < 50:
            # Processing stops at the first row below 50 in the second column.
            break
        d_male_list = make_data(male_row[2::])
        d_female_list = make_data(data_female[i][2::])
        diag = male_row[0]

        count += 1
        results = logrank_test(d_male_list, d_female_list)
        title = "".join([
            "logrank, p = ",
            '{:.3e}'.format(results.p_value),
            "m",
            str(len(d_male_list)),
            "_f",
            str(len(d_female_list)),
            "\n",
        ])
        if not results.p_value < 0.05:
            continue
        count_u += 1
        make_kmf_plt(d_male_list, d_female_list, diag, savedir, title)

    print("test")
    print(count)
    print("under 0.05")
    print(count_u)
Ejemplo n.º 34
0
def test_peto_weighted_logrank_on_leukemia_dataset():
    """
    Peto-weighted log-rank test on the leukemia data, Rx == 0 versus Rx == 1.

    Reference value from "Survival Analysis: A Self-learning Text" by Kleinbaum & Klein, 3rd edition, 2012.
    """
    leukemia = load_leukemia()
    rx0 = leukemia[leukemia["Rx"] == 0]
    rx1 = leukemia[leukemia["Rx"] == 1]

    outcome = stats.logrank_test(
        rx0["t"],
        rx1["t"],
        rx0["status"],
        rx1["status"],
        weightings="peto",
    )

    statistic_error = abs(outcome.test_statistic - 14.084139)
    assert statistic_error < 10e-6
    assert outcome.test_name == "Peto_test"
Ejemplo n.º 35
0
def test_logrank_test_output_against_R_1():
    """Check the g3 log-rank p-value ('RIT' rows versus all others) against the reference 0.0138."""
    g3 = load_g3()
    rit = (g3['group'] == 'RIT')
    columns = ('time', 'event')
    d1, e1 = (g3.loc[rit][name] for name in columns)
    d2, e2 = (g3.loc[~rit][name] for name in columns)

    outcome = stats.logrank_test(d1, d2, event_observed_A=e1, event_observed_B=e2)
    expected = 0.0138
    assert abs(outcome.p_value - expected) < 0.0001
Ejemplo n.º 36
0
def plotKaplanMeier(DF, GroupName):
    """Plot Kaplan-Meier curves per group and annotate the log-rank result.

    The log-rank test compares the first two values of ``GroupName`` in
    descending sort order; a curve is drawn for every unique value. The
    figure is written to ``KaplanMeierSurvival.png``.

    :param DF: DataFrame with ``"OS (months)"`` (durations) and
        ``"Patient Status"`` (event flags) columns plus the grouping column.
    :param GroupName: name of the grouping column; at least two distinct
        values are required.
    :return: the log-rank result object for the first two groups.
    """
    # Bring in matplotlib with the non-interactive Agg backend
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    try:
        plt.style.use('seaborn-talk')
    except OSError:
        # Newer Matplotlib releases only ship this style under its renamed form.
        plt.style.use('seaborn-v0_8-talk')
    import seaborn as sns
    sns.set(font_scale=.95)
    # get log rank results
    Unique = sorted(DF[GroupName].unique())[::-1]
    G0 = Unique[0]
    G1 = Unique[1]
    Group0 = DF[DF[GroupName] == G0]
    Group1 = DF[DF[GroupName] == G1]
    LR_Results = logrank_test(Group0["OS (months)"].astype(float),
                              Group1["OS (months)"].astype(float),
                              Group0["Patient Status"].astype(int),
                              Group1["Patient Status"].astype(int))
    # build figure
    fig, ax = plt.subplots(1, 1, figsize=(7, 5))
    # iterate over unique groups and plot KM curves
    Median_Survival_List = []
    for Group in Unique:
        TempDF = DF[DF[GroupName] == Group]
        KMF = KaplanMeierFitter()
        KMF.fit(TempDF["OS (months)"].astype(float),
                TempDF["Patient Status"].astype(int),
                label=Group).plot(ax=ax)
        print("KMF.median_survival_time_", KMF.median_survival_time_)
        Median_Survival_List.append(float(KMF.median_survival_time_))
    Median_Survival_List = sorted(Median_Survival_List)
    # add logrank to plot
    Annotation = "logrank: "+str(round(LR_Results.test_statistic,3))+"\n"+\
        "pvalue: "+str(round(LR_Results.p_value,5))+"\n"+\
        "median survival times: "+str(Median_Survival_List)
    ax.text(.05,
            .05,
            Annotation,
            horizontalalignment='left',
            verticalalignment='bottom',
            transform=ax.transAxes)
    # set labels
    ax.set_xlabel("Time (months)")
    ax.set_ylabel("Overall survival probability")
    plt.tight_layout()
    fig.savefig("KaplanMeierSurvival.png")
    plt.clf()
    plt.close()
    return (LR_Results)
Ejemplo n.º 37
0
def cox_log_rank(hazardsdata, labels, survtime_all):
    """Return the log-rank p-value between the two groups split at the median hazard.

    Samples whose hazard is not above the median form the first group, the
    remaining samples the second group. ``labels`` supplies the event flags
    and ``survtime_all`` the durations.
    """
    cutoff = np.median(hazardsdata)
    above_cutoff = np.zeros([len(hazardsdata)], dtype=int)
    above_cutoff[hazardsdata > cutoff] = 1
    first_group = above_cutoff == 0
    second_group = ~first_group
    outcome = logrank_test(survtime_all[first_group],
                           survtime_all[second_group],
                           event_observed_A=labels[first_group],
                           event_observed_B=labels[second_group])
    return outcome.p_value
def compare_kmc(data, factor, status, interval):
    """Plot one Kaplan-Meier step curve per level of ``factor`` and log-rank test binary factors.

    :param data: DataFrame holding the three columns named below.
    :param factor: name of the grouping column.
    :param status: name of the event column (each value is converted with ``bool``
        for the curves and passed as-is to the log-rank test).
    :param interval: name of the duration column.
    :return: None; the curves are drawn with pyplot and, for a factor with
        exactly two distinct values, the log-rank summary is printed.
    """
    levels = sorted(data[factor].drop_duplicates().tolist())

    for level in levels:
        group = data[data[factor] == level]
        # One extra unobserved entry at time 0 is appended to every group,
        # presumably so the step curve starts at t = 0.
        durations = group[interval].tolist() + [0]
        events = [bool(flag) for flag in group[status]] + [False]

        times, survival_prob = kaplan_meier_estimator(events, durations)
        plt.step(times,
                 survival_prob,
                 where='post',
                 label=str(factor) + '=%s' % level)
    plt.ylabel("est. probability of survival")
    plt.xlabel("time $(days)$")
    plt.legend(loc='best')

    if data[factor].nunique() != 2:
        print('The factor', factor,
              'is non-binary, so the Logrank statistics is not calculated')
        return

    # Select the groups from the `data` argument (the code used to read an
    # unrelated name `df` here).
    first = data[data[factor] == levels[0]]
    second = data[data[factor] == levels[1]]
    results = logrank_test(list(first[interval]), list(second[interval]),
                           list(first[status]), list(second[status]))
    results.print_summary()
Ejemplo n.º 39
0
def test_multivariate_log_rank_is_identital_to_log_rank_for_n_equals_2():
    """With two groups the multivariate log-rank test must give the same p-value as the pairwise test."""
    N = 200
    T1 = np.random.exponential(5, size=N)
    T2 = np.random.exponential(5, size=N)
    # Event flags must be 0/1; binomial(2, ...) would also produce the value 2.
    C1 = np.random.binomial(1, 0.9, size=N)
    C2 = np.random.binomial(1, 0.9, size=N)
    result = stats.logrank_test(T1, T2, C1, C2, alpha=0.95)

    T = np.r_[T1, T2]
    C = np.r_[C1, C2]
    # Group labels follow the concatenation order; sized from N, not a literal.
    G = np.array([1] * N + [2] * N)
    result_m = stats.multivariate_logrank_test(T, G, C, alpha=0.95)
    assert result.p_value == result_m.p_value
Ejemplo n.º 40
0
def test_logrank_test_output_against_R_2():
    """Compare ``logrank_test`` with reference figures for a two-arm data set.

    Data are taken from
    https://stat.ethz.ch/education/semesters/ss2011/seminar/contents/presentation_2.pdf
    """
    # Control arm: every duration has its event observed.
    durations_control = [1, 1, 2, 2, 3, 4, 4, 5, 5, 8, 8, 8, 8, 11, 11, 12, 12, 15, 17, 22, 23]
    observed_control = np.ones_like(durations_control)

    # Treatment arm: the first nine durations are events, the remaining
    # twelve are censored.
    durations_treatment = [6, 6, 6, 7, 10, 13, 16, 22, 23, 6, 9, 10, 11, 17, 19, 20, 25, 32, 32, 34, 25]
    observed_treatment = [1] * 9 + [0] * 12

    outcome = stats.logrank_test(
        durations_control,
        durations_treatment,
        event_observed_A=observed_control,
        event_observed_B=observed_treatment,
    )

    reference_p_value = 4.17e-05
    reference_statistic = 16.8
    assert abs(outcome.p_value - reference_p_value) < 0.0001
    assert abs(outcome.test_statistic - reference_statistic) < 0.1
def kmplot(df_high, df_low):
    """Fit Kaplan-Meier curves for two groups and compare them by logrank test.

    Both arguments must expose ``duration`` and ``event`` attributes and
    support ``len()``.

    Returns a 5-tuple: the logrank p-value, then ``predict(60)`` and
    ``predict(120)`` of the high-group fit, then the same two predictions of
    the low-group fit. If either fit raises ``ValueError`` the placeholder
    tuple ``("NA", "0", "0", "0", "0")`` is returned instead.
    """
    fitter_hi = KaplanMeierFitter()
    fitter_lo = KaplanMeierFitter()
    try:
        fitter_hi.fit(
            durations=df_high.duration,
            event_observed=df_high.event,
            label='High: n = ' + str(len(df_high)),
        )
        fitter_lo.fit(
            durations=df_low.duration,
            event_observed=df_low.event,
            label="Low: n = " + str(len(df_low)),
        )
    except ValueError:
        return ("NA", "0", "0", "0", "0")

    comparison = logrank_test(
        df_high.duration,
        df_low.duration,
        event_observed_A=df_high.event,
        event_observed_B=df_low.event,
    )

    return (
        comparison.p_value,
        fitter_hi.predict(60),
        fitter_hi.predict(120),
        fitter_lo.predict(60),
        fitter_lo.predict(120),
    )
Ejemplo n.º 42
0
def logrank(df,
            condition_col,
            censor_col,
            survival_col,
            threshold=None):
    """Run a logrank test between the rows of df with and without a condition.

    When ``threshold`` is None, ``df[condition_col]`` itself is used as the
    row mask. Otherwise the mask is ``df[condition_col] > threshold``, with
    the string 'median' standing for the column median.

    The rows failing the mask are passed as group A and the rows satisfying
    it as group B; ``censor_col`` is cast to bool to give the observed-event
    flags. Returns whatever ``logrank_test`` returns.
    """
    if threshold is None:
        mask = df[condition_col]
    else:
        if threshold == 'median':
            threshold = df[condition_col].median()
        mask = df[condition_col] > threshold

    rows_with = df[mask]
    rows_without = df[~mask]

    return logrank_test(
        rows_without[survival_col],
        rows_with[survival_col],
        event_observed_A=rows_without[censor_col].astype(bool),
        event_observed_B=rows_with[censor_col].astype(bool),
    )
def kmplot(df_high, df_low, ax):
    """Plot Kaplan-Meier curves for two groups on ``ax`` and compare them.

    Parameters
    ----------
    df_high, df_low :
        Objects exposing ``duration`` and ``event`` attributes and supporting
        ``len()`` (presumably pandas DataFrames -- confirm against callers).
    ax :
        Axes the curves, axis labels, p-value annotation and legend are
        drawn on.

    Returns
    -------
    tuple
        ``(p_value, hm5, hm10, lm5, lm10)``: the logrank p-value followed by
        ``predict(60)`` and ``predict(120)`` for the high group and then for
        the low group. If either fit raises ``ValueError`` nothing is drawn
        and the placeholder ``("NA", "0", "0", "0", "0")`` is returned.
    """
    kmf_high = KaplanMeierFitter()
    kmf_low = KaplanMeierFitter()
    try:
        kmf_high.fit(durations=df_high.duration, event_observed=df_high.event,
                     label='High: n = ' + str(len(df_high)))
        kmf_low.fit(durations=df_low.duration, event_observed=df_low.event,
                    label="Low: n = " + str(len(df_low)))
    except ValueError:
        return ("NA", "0", "0", "0", "0")

    kmf_high.plot(ax=ax, color="red", show_censors=True, ci_show=False)
    kmf_low.plot(ax=ax, color="black", show_censors=True, ci_show=False)

    statistics_result = logrank_test(df_high.duration, df_low.duration,
                                     event_observed_A=df_high.event,
                                     event_observed_B=df_low.event)
    p_value = statistics_result.p_value

    ax.set_xlabel('Time (months)')
    ax.set_ylabel('Probability')
    # Axes-relative coordinates: the annotation sits in the lower-right
    # corner regardless of the data range.
    ax.text(0.95, 0.02, 'logrank P = %.4f' % p_value,
            verticalalignment='bottom', horizontalalignment='right',
            transform=ax.transAxes, color='black', fontsize=11)
    # Attach the legend to the axes that was passed in; the pyplot-level
    # call targets whichever axes happens to be current, which is wrong
    # when `ax` is one of several subplots.
    ax.legend(loc=3)

    hm5 = kmf_high.predict(60)
    hm10 = kmf_high.predict(120)
    lm5 = kmf_low.predict(60)
    lm10 = kmf_low.predict(120)
    return (p_value, hm5, hm10, lm5, lm10)
Ejemplo n.º 44
0
def estCoxPHTE(df, treatment_col='treated', duration_col='dx', event_col='disease', covars=None):
    """Estimates treatment efficacy using proportional hazards (Cox model).

    Parameters
    ----------
    df : pandas.DataFrame

    treatment_col : string
        Column in df indicating treatment.
    duration_col : string
        Column in df indicating survival times.
    event_col : string
        Column in df indicating events (censored data are 0)
    covars : list, optional
        Other columns of df to include in the Cox model as covariates.
        Defaults to no extra covariates.

    Returns
    -------
    pandas.Series
        Indexed by:

        'TE' : 1 - exp(coef) for the treatment column
        'UB' : 1 - exp(upper bound of the coefficient's confidence interval)
        'LB' : 1 - exp(lower bound of the coefficient's confidence interval)
        'pvalue' : first p-value reported by the fitter; the treatment
            column is the first column of the fitted frame
        'logrank_pvalue' : p-value of a logrank test between the rows with
            treatment_col == 0 and the rows with treatment_col == 1
        'model' : the fitted CoxPHFitter instance
    """
    # A None default avoids sharing one list object between calls.
    covars = [] if covars is None else list(covars)

    coxphf = CoxPHFitter()

    coxphf.fit(df[[treatment_col, duration_col, event_col] + covars],
               duration_col=duration_col, event_col=event_col)

    te = 1 - np.exp(coxphf.hazards_.loc['coef', treatment_col])
    # NOTE(review): 1 - exp(x) is decreasing in x, so the value stored under
    # 'upper-bound' (and returned as 'UB') is numerically the smaller end of
    # the efficacy interval -- confirm the labels are what callers expect.
    ci = 1 - np.exp(coxphf.confidence_intervals_[treatment_col].loc[['upper-bound', 'lower-bound']])
    pvalue = coxphf._compute_p_values()[0]

    # Unadjusted two-group comparison alongside the Cox estimate.
    ind1 = df[treatment_col] == 0
    ind2 = df[treatment_col] == 1
    results = logrank_test(df[duration_col].loc[ind1], df[duration_col].loc[ind2],
                           event_observed_A=df[event_col].loc[ind1],
                           event_observed_B=df[event_col].loc[ind2])
    index = ['TE', 'UB', 'LB', 'pvalue', 'logrank_pvalue', 'model']
    return pd.Series([te, ci['upper-bound'], ci['lower-bound'], pvalue, results.p_value, coxphf], index=index)
Ejemplo n.º 45
0
def _write_logrank_pair(df_a, df_b, f):
    """Run one two-sample logrank test, write its report row to f, return the result."""
    results = logrank_test(df_a["duration"], df_b["duration"], df_a["event_obs"], df_b["event_obs"], alpha=0.99)
    f.write("__ p-value ___|__ test statistic __|____ test result ____|__ is significant __\n")
    f.write(
        str(results.p_value)
        + " | "
        + str(results.test_statistic)
        + " | "
        + str(results.test_result)
        + " | "
        + str(results.is_significant)
        + "\n"
    )
    return results


def run_logrank(df1, df2, df3, f, cohorts):
    """Run pairwise logrank tests between cohorts and summarise the p-values.

    Parameters
    ----------
    df1, df2, df3 :
        Tables with "duration" and "event_obs" columns. ``df3`` is only read
        when ``cohorts`` is greater than 2.
    f :
        Object with a ``write`` method; a header line and a result line are
        written for every comparison.
    cohorts : int
        Number of cohorts. With more than two, the comparisons are df1 vs
        df2, df2 vs df3 and df1 vs df3 (in that order); otherwise only df1
        vs df2.

    Returns
    -------
    str
        A newline followed by the comma-separated p-values (rounded to four
        decimals) in parentheses, with a trailing "*" when ``check_value``
        is truthy for the last comparison's p-value.
    """
    if cohorts > 2:
        pairs = [(df1, df2), (df2, df3), (df1, df3)]
    else:
        pairs = [(df1, df2)]

    check = False
    rounded = []
    for df_a, df_b in pairs:
        results = _write_logrank_pair(df_a, df_b, f)
        # NOTE(review): `check` is overwritten on every comparison, so only
        # the last pair decides the "*" marker -- confirm this is intended
        # rather than "any comparison".
        check = check_value(results.p_value)
        rounded.append(str(round(results.p_value, 4)))

    title = "\n(" + ",".join(rounded) + ")"
    if check:
        title += "*"
    return title
Ejemplo n.º 46
0
def test_unequal_intensity_with_random_data():
    """Exponential samples drawn with scales 5 and 1 should be told apart."""
    shape = (2000, 1)
    slow_decay = np.random.exponential(5, size=shape)
    fast_decay = np.random.exponential(1, size=shape)
    outcome = stats.logrank_test(slow_decay, fast_decay)
    assert outcome.p_value < 0.05
Ejemplo n.º 47
0
def test_valueerror_is_raised_if_alpha_out_of_bounds():
    """Passing alpha=95 to ``logrank_test`` must raise ValueError."""
    shape = (20, 1)
    first_sample = np.random.exponential(5, size=shape)
    second_sample = np.random.exponential(1, size=shape)
    with pytest.raises(ValueError):
        stats.logrank_test(first_sample, second_sample, alpha=95)
Ejemplo n.º 48
0
# 
# It turns out these two DNA types do not have significantly different survival rates.

# ### Using R

# In[31]:

get_ipython().run_cell_magic(u'R', u'', u'survdiff(Surv(time, delta) ~ type)')


# ### Using Python

# In[32]:

from lifelines.statistics import logrank_test
# alpha is expressed as a fraction; whole-number percentages such as 99 are
# out of range for logrank_test and are rejected with a ValueError.
summary_ = logrank_test(T, T2, C, C2, alpha=0.99)

# Parenthesised form prints the same thing under Python 2 and Python 3.
print(summary_)


# <hr>
# # Estimating Hazard Rates
# 
# ### Using R

# To estimate the hazard function, we compute the cumulative hazard function using the Nelson-Aalen estimator, defined as:
# 
# $$\hat{\Lambda} (t) = \sum_{t_i \leq t} \frac{d_i}{n_i}$$
# 
# where $d_i$ is the number of deaths at time $t_i$ and $n_i$ is the number of susceptible individuals. Both R and Python modules use the same estimator. However, in R we will use the `-log` of the Fleming and Harrington estimator, which is equivalent to the Nelson-Aalen.
Ejemplo n.º 49
0
def plot_kmf(df,
             condition_col,
             censor_col,
             survival_col,
             threshold=None,
             title=None,
             xlabel=None,
             ax=None,
             print_as_title=False):
    """
    Draw Kaplan-Meier curves for the two groups of df defined by
    condition_col and return the logrank comparison of those groups.

    Without a threshold, df[condition_col] itself is used as the row mask.
    With a threshold, the mask is df[condition_col] > threshold; the string
    'median' stands for the median of condition_col.

    Parameters
    ----------
        df: dataframe
        condition_col: string, column which contains the condition to split on
        censor_col: string, column cast to bool and used as the
                    observed-event flag
        survival_col: string, column which contains the survival time
        threshold: number or 'median', optional, default None
        title: title for the plot, default None
        xlabel: x-axis label, only applied when given, default None
        ax: an existing matplotlib ax, optional, default None; when omitted
            the axes returned by the first plot call is used
        print_as_title: bool, optional, whether the group sizes go into the
          plot's title instead of stdout (only used when no title is given),
          default False

    Returns
    -------
        The result of logrank_test with the rows failing the condition as
        group A and the rows satisfying it as group B.
    """
    if threshold is None:
        condition = df[condition_col]
        label = '{}'.format(condition_col)
    else:
        if threshold == 'median':
            threshold = df[condition_col].median()
        condition = df[condition_col] > threshold
        label = '{} > {}'.format(condition_col, threshold)

    rows_with = df[condition]
    rows_without = df[~condition]
    durations_without = rows_without[survival_col]
    durations_with = rows_with[survival_col]
    events_without = rows_without[censor_col].astype(bool)
    events_with = rows_with[censor_col].astype(bool)

    curve_style = dict(show_censors=True, ci_show=False)
    kmf = KaplanMeierFitter()

    # First curve: rows without the condition, drawn with an empty label.
    kmf.fit(durations_without, events_without, label="")
    if ax:
        kmf.plot(ax=ax, **curve_style)
    else:
        ax = kmf.plot(**curve_style)

    # Second curve: rows with the condition, on the same axes.
    kmf.fit(durations_with, events_with, label=(label))
    kmf.plot(ax=ax, **curve_style)

    # Survival probabilities are shown on a fixed 0..1 scale.
    ax.set_ylim(0, 1)

    group_sizes = (
        "# no condition {}".format(len(durations_without)),
        "# with condition {}".format(len(durations_with)),
    )
    if title:
        ax.set_title(title)
    elif print_as_title:
        ax.set_title("%s | %s" % group_sizes)
    else:
        for size_line in group_sizes:
            print(size_line)

    if xlabel:
        ax.set_xlabel(xlabel)

    return logrank_test(durations_without,
                        durations_with,
                        event_observed_A=events_without,
                        event_observed_B=events_with)
Ejemplo n.º 50
0
    def fit(self, df, duration_col, event_col):
        """
        Fit an rpart survival tree to a data frame and derive risk groups.

        The frame is converted and handed to R as ``r_df``, an
        ``rpart(Surv(...) ~ ...)`` model is fitted on every column other than
        ``duration_col`` and ``event_col``, and the tree is pruned when
        ``self.cp`` is positive. The distinct values of ``self.predict(df)``
        are then merged, starting from the lowest and from the highest, into
        a low and a high group: a neighbouring value is absorbed while the
        group is smaller than its size limit or a logrank test between the
        group and that value is not significant.

        Sets ``xcols``, ``myfit``, ``_low``, ``_high``, ``low_size`` and
        ``high_size`` on the instance.
        """
        # Covariates are every column except duration and event.
        # Index.difference is the supported spelling of the set difference
        # that used to be written as ``df.columns - [...]``.
        self.xcols = df.columns.difference([duration_col, event_col])
        r_df = _convert_to_dataframe(df)
        # Insert into namespace
        robjects.globalenv["r_df"] = r_df

        # Build options string from whichever control settings are set.
        # Joining a list avoids a leading comma inside rpart.control(...).
        control_args = []
        for opt_name in ("minsplit", "minbucket", "xval", "cp"):
            opt_value = getattr(self, opt_name)
            if opt_value is not None:
                control_args.append("{}={}".format(opt_name, opt_value))

        options = ""
        if control_args:
            options = ", control = rpart.control({})".format(", ".join(control_args))

        # Make the command
        cmd = ("myfit = rpart(Surv(r_df${time}, r_df${event}) ~ {incols}, " + "data=r_df {options})").format(
            time=duration_col, event=event_col, options=options, incols="+".join(self.xcols)
        )
        # Run the command
        self.myfit = r(cmd)
        # Prune it
        if self.cp is not None and self.cp > 0:
            cmd = "myfit <- prune(myfit, cp={})".format(self.cp)
            self.myfit = r(cmd)

        # Now divide into groups for future
        preds = self.predict(df)
        hazards = np.unique(preds)
        # Just to be safe
        hazards.sort()

        # Convert the limits into row counts by scaling with the frame size
        highlim = int(self.highlim * df.shape[0])
        lowlim = int(self.lowlim * df.shape[0])

        # Save subgroups here, initialize to outer groups
        self._high = [hazards[-1]]
        self._low = [hazards[0]]

        # Keep track of entire group here for logrank
        high = preds == hazards[-1]
        low = preds == hazards[0]

        # Low risk iterates forwards
        for g in hazards[1:]:
            if (
                np.sum(low) < lowlim
                or not logrank_test(
                    df.loc[low, duration_col],
                    df.loc[preds == g, duration_col],
                    df.loc[low, event_col],
                    df.loc[preds == g, event_col],
                ).is_significant
            ):
                # Append to group
                self._low.append(g)
                low |= preds == g
            else:
                break

        # Important to go backwards here of course
        for g in reversed(hazards[:-1]):
            # Stop as soon as the low group has already claimed this value.
            if g in self._low:
                break
            if (
                np.sum(high) < highlim
                or not logrank_test(
                    df.loc[high, duration_col],
                    df.loc[preds == g, duration_col],
                    df.loc[high, event_col],
                    df.loc[preds == g, event_col],
                ).is_significant
            ):
                # Append to group
                self._high.append(g)
                high |= preds == g
            else:
                break
        # Mid is the rest

        # Remember sizes for the benefit of others
        self.high_size = np.sum(high)
        self.low_size = np.sum(low)
Ejemplo n.º 51
0
def survival_plot(clinical, fitter, fitter_name, feature, time="life_duration", event="patient_death_date", axis=None):
    """
    Plot survival/hazard of all patients regardless of trait and dependent of trait.

    Parameters
    ----------
    clinical : table with one row per patient; ``clinical[time]`` must hold
        values with a ``.days`` attribute and ``clinical[event]`` holds
        ``pd.NaT`` where no event was observed.
    fitter : object with ``fit(durations, event_observed=..., label=...)``
        and ``plot(ax=..., show_censors=...)`` methods.
    fitter_name : string used as the y-axis label and in the output file
        name; the value "survival" additionally fixes the y-range to
        [0, 1.05].
    feature : column of ``clinical`` whose non-null unique values define the
        groups.
    time, event : column names for duration and event date.
    axis : existing axes to draw on. When None a new figure is created and
        saved as an SVG under ``plots_dir``.

    One curve is drawn for all patients and one per feature value. Logrank
    p-values (each value vs. all patients, then every pair of values) are
    added to the plot as text.
    """
    # duration of life, in units of 30 days
    T = [i.days / 30. for i in clinical[time]]
    # events:
    # True for observed event (death);
    # else False (this includes death not observed; death by other causes)
    C = [i is not pd.NaT for i in clinical[event]]

    # drop index (to have synchronised numbers between lists T, C and the index of clinical)
    # this is because we drop a few rows from clinical before during data cleanup
    clinical2 = clinical.reset_index(drop=True)

    # plot
    if axis is None:
        fig, axis = plt.subplots(1)
        save = True
    else:
        save = False

    # Plot survival of all patients regardless of trait
    fitter.fit(T, event_observed=C, label="all patients")
    fitter.plot(ax=axis, show_censors=True)

    # For each type: subset, fit, plot
    # Filter out the null feature value. pd.isnull is applied to the whole
    # array at once: wrapping map() in np.array only yields a boolean mask
    # on Python 2, where map returns a list.
    x = clinical2[feature].unique()
    x = x[~pd.isnull(x)]

    def rows_of(value):
        """Positional row numbers of the patients whose feature equals value."""
        return clinical2[clinical2[feature] == value].index.tolist()

    # for each class plot curve
    for value in x:
        # get patients from class
        s = rows_of(value)
        fitter.fit([T[i] for i in s], event_observed=[C[i] for i in s], label=str(value))
        fitter.plot(ax=axis, show_censors=True)
    if fitter_name == "survival":
        axis.set_ylim(0, 1.05)

    # Test pairwise differences
    p_values = list()
    # test each against all (the "all" group includes the class itself)
    for a in x:
        a_ = rows_of(a)
        b_ = clinical2.index.tolist()
        p = logrank_test(
            [T[i] for i in a_], [T[i] for i in b_],
            event_observed_A=[C[i] for i in a_],
            event_observed_B=[C[i] for i in b_]).p_value
        p_values.append(" vs ".join([str(a), "all"]) + ": %f" % p)
    # test each pairwise combination
    for a, b in itertools.combinations(x, 2):
        a_ = rows_of(a)
        b_ = rows_of(b)
        p = logrank_test(
            [T[i] for i in a_], [T[i] for i in b_],
            event_observed_A=[C[i] for i in a_],
            event_observed_B=[C[i] for i in b_]).p_value
        p_values.append(" vs ".join([str(a), str(b)]) + ": %f" % p)

    # Add p-values as anchored text
    try:  # problem with matplotlib < 1.4
        axis.add_artist(AnchoredText("\n".join(p_values), loc=8, frameon=False))
        axis.set_xlabel("time (months)")
    except Exception:
        # Fall back to putting the p-values in the x-label; catching
        # Exception rather than everything lets KeyboardInterrupt and
        # SystemExit propagate.
        axis.set_xlabel("time (months)\n%s" % "\n".join(p_values))

    axis.set_title("%s" % feature)
    axis.set_ylabel(fitter_name)
    sns.despine()
    if save:
        fig.savefig(os.path.join(plots_dir, "%s_%s.svg" % (feature, fitter_name)), bbox_inches="tight")
	males = df[df['gender']=='Male']
	females = df[df['gender']=='Female']

	T = df["lifetime"] #measured in days
	C = df["dead"]

	females_ = df["gender"] == "Female"
	males_ = df["gender"] == "Male"

	community_stats = {
		'community': community,
		'size': females.count()[0] + males.count()[0],

		'women_frequency_median' : females['activity_freq'].median(),
		'men_frequency_median' : males['activity_freq'].median(),
		'frequency_difference_median': females['activity_freq'].median() - males['activity_freq'].median(),
		'women_frequency_mean' : females['activity_freq'].mean(),
		'men_frequency_mean' : males['activity_freq'].mean(),
		'frequency_difference_mean': females['activity_freq'].mean() - males['activity_freq'].mean(),
		'frequency_pvalue': 2* stats.mannwhitneyu(females['activity_freq'], males['activity_freq'])[1],

		'women_lifetime_median':kmf.fit(T[females_], event_observed=C[females_], label="Female").median_,
		'men_lifetime_median':kmf.fit(T[males_], event_observed=C[males_], label="Male").median_,
		'lifetime_pvalue': logrank_test(T[females_], T[males_], C[females_], C[males_], alpha=.95 ).p_value

	}

	community_stats['lifetime_difference_median'] = community_stats["women_lifetime_median"] - community_stats["men_lifetime_median"]

	results_db.insert( community_stats )
Ejemplo n.º 53
0
def test_integer_times_logrank_test():
    """Durations truncated to integers must still separate scales 5 and 1."""
    shape = (2000, 1)
    long_lived = np.random.exponential(5, size=shape).astype(int)
    short_lived = np.random.exponential(1, size=shape).astype(int)
    outcome = stats.logrank_test(long_lived, short_lived)
    assert outcome.p_value < 0.05
# Show the first rows of the data set; the string below records the
# output this produced.
print(df.head())
'''
    T  E    group
0   6  1  miR-137
1  13  1  miR-137
2  13  1  miR-137
3  13  1  miR-137
4  19  1  miR-137
'''

# Columns passed to the fitter as durations (T) and event flags (E).
T = df['T']
E = df['E']

# Boolean mask: True for rows of the 'miR-137' group, False for all others.
groups = df['group']
ix = (groups == 'miR-137')

kmf = KaplanMeierFitter()

# Fit and plot the rows outside the miR-137 group first; the axes returned
# by this plot call is reused for the second curve.
kmf.fit(T[~ix], E[~ix], label='control')
ax = kmf.plot()

# Refit the same fitter on the miR-137 rows and draw on the same axes.
kmf.fit(T[ix], E[ix], label='miR-137')
kmf.plot(ax=ax)

plt.ylabel('Survival Probability')
plt.show()

# Compare the two curves
results = logrank_test(T[ix], T[~ix], event_observed_A=E[ix], event_observed_B=E[~ix])
results.print_summary()
Ejemplo n.º 55
0
def test_logrank_test_is_symmetric():
    """Swapping the two samples must leave the p-value (almost) unchanged."""
    shape = (2000, 1)
    sample_a = np.random.exponential(5, size=shape).astype(int)
    sample_b = np.random.exponential(1, size=shape).astype(int)
    forward = stats.logrank_test(sample_a, sample_b)
    backward = stats.logrank_test(sample_b, sample_a)
    tolerance = 10e-8
    assert abs(forward.p_value - backward.p_value) < tolerance
Ejemplo n.º 56
0
def _plot_kmf_single(df,
                     condition_col,
                     survival_col,
                     censor_col,
                     threshold,
                     title,
                     xlabel,
                     ylabel,
                     ax,
                     with_condition_color,
                     no_condition_color,
                     with_condition_label,
                     no_condition_label,
                     color_map,
                     label_map,
                     color_palette,
                     ci_show,
                     print_as_title):
    """
    Helper function to produce a single KM survival plot, among observations in df by groups defined by condition_col.

    All inputs are required - this function is intended to be called by `plot_kmf`.

    Groups are derived from `condition_col` in one of three ways:

    - `threshold` given: two groups, `condition_col > threshold` and the
      rest (`"median"` stands for the column median);
    - object or category dtype: one group per distinct value;
    - bool dtype: the True and False rows.

    Any other dtype raises ValueError.

    Returns the analytical result for the plotted groups: a logrank test for
    exactly two groups, a `NullSurvivalResults` for fewer than two, and a
    fitted `CoxPHFitter` otherwise. The per-group survival series, event
    series and size descriptions are attached to the returned object as
    `survival_data_series`, `event_data_series` and `desc`.
    """
    # make color inputs consistent hex format
    if colors.is_color_like(with_condition_color):
        with_condition_color = colors.to_hex(with_condition_color)
    if colors.is_color_like(no_condition_color):
        no_condition_color = colors.to_hex(no_condition_color)
    ## prepare data to be plotted; producing 3 outputs:
    # - `condition`, series containing category labels to be plotted
    # - `label_map` (mapping condition values to plot labels)
    # - `color_map` (mapping condition values to plotted colors)
    if threshold is not None:
        is_median = threshold == "median"
        if is_median:
            threshold = df[condition_col].median()
        label_suffix = float_str(threshold)
        condition = df[condition_col] > threshold
        default_label_no_condition = "%s ≤ %s" % (condition_col, label_suffix)
        if is_median:
            label_suffix += " (median)"
        default_label_with_condition = "%s > %s" % (condition_col, label_suffix)
        with_condition_label = with_condition_label or default_label_with_condition
        no_condition_label = no_condition_label or default_label_no_condition
        if not label_map:
            label_map = {False: no_condition_label,
                         True: with_condition_label}
        if not color_map:
            color_map = {False: no_condition_color,
                         True: with_condition_color}
    elif df[condition_col].dtype == 'O' or df[condition_col].dtype.name == "category":
        condition = df[condition_col].astype("category")
        if not label_map:
            label_map = {
                condition_value: '{} = {}'.format(condition_col, condition_value)
                for condition_value in condition.unique()
            }
        if not color_map:
            rgb_values = sb.color_palette(color_palette, len(label_map.keys()))
            hex_values = [colors.to_hex(col) for col in rgb_values]
            color_map = dict(zip(label_map.keys(), hex_values))
    elif df[condition_col].dtype == 'bool':
        condition = df[condition_col]
        default_label_with_condition = "= {}".format(condition_col)
        default_label_no_condition = "¬ {}".format(condition_col)
        with_condition_label = with_condition_label or default_label_with_condition
        no_condition_label = no_condition_label or default_label_no_condition
        if not label_map:
            label_map = {False: no_condition_label,
                         True: with_condition_label}
        if not color_map:
            color_map = {False: no_condition_color,
                         True: with_condition_color}
    else:
        # Single-line literal: a backslash continuation inside the string
        # would embed the next line's indentation in the message.
        raise ValueError(
            "Don't know how to plot data of type {}".format(df[condition_col].dtype))

    # produce kmf plot for each category (group) identified above
    kmf = KaplanMeierFitter()
    grp_desc = list()
    grp_survival_data = dict()
    grp_event_data = dict()
    grp_names = list(condition.unique())
    for grp_name, grp_df in df.groupby(condition):
        grp_survival = grp_df[survival_col]
        grp_event = (grp_df[censor_col].astype(bool))
        grp_label = label_map[grp_name]
        grp_color = color_map[grp_name]
        kmf.fit(grp_survival, grp_event, label=grp_label)
        desc_str = "# {}: {}".format(grp_label, len(grp_survival))
        grp_desc.append(desc_str)
        grp_survival_data[grp_name] = grp_survival
        grp_event_data[grp_name] = grp_event
        # Reuse the axes across groups so every curve lands on one plot.
        if ax:
            ax = kmf.plot(ax=ax, show_censors=True, ci_show=ci_show, color=grp_color)
        else:
            ax = kmf.plot(show_censors=True, ci_show=ci_show, color=grp_color)

    ## format the plot
    # Set the y-axis to range 0 to 1
    ax.set_ylim(0, 1)
    # Relabel the ticks as whole-number percentages.
    y_tick_vals = ax.get_yticks()
    ax.set_yticklabels(["%d" % int(y_tick_val * 100) for y_tick_val in y_tick_vals])
    # plot title
    if title:
        ax.set_title(title)
    elif print_as_title:
        ax.set_title(' | '.join(grp_desc))
    else:
        for desc in grp_desc:
            print(desc)
    # axis labels
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)

    ## summarize analytical version of results
    ## again using same groups as are plotted
    if len(grp_names) == 2:
        # use log-rank test for 2 groups
        results = logrank_test(grp_survival_data[grp_names[0]],
                               grp_survival_data[grp_names[1]],
                               event_observed_A=grp_event_data[grp_names[0]],
                               event_observed_B=grp_event_data[grp_names[1]])
    elif len(grp_names) < 2:
        # no analytical result for 1 or 0 groups
        results = NullSurvivalResults()
    else:
        # cox PH fitter for >2 groups
        cf = CoxPHFitter()
        cox_df = patsy.dmatrix('+'.join([condition_col, survival_col,
                                         censor_col]),
                               df, return_type='dataframe')
        del cox_df['Intercept']
        results = cf.fit(cox_df, survival_col, event_col=censor_col)
        results.print_summary()
    # add metadata to results object so caller can print them
    results.survival_data_series = grp_survival_data
    results.event_data_series = grp_event_data
    results.desc = grp_desc
    return results
Ejemplo n.º 57
0
def logrank(out1, time1, out2, time2, alpha=0.05):
    """Call ``logrank_test`` with (outcome, time) pairs given per group.

    The two time arrays are passed first, followed by the two outcome
    arrays, and ``1 - alpha`` is forwarded as the ``alpha`` argument.
    """
    forwarded_alpha = 1 - alpha
    return logrank_test(time1, time2, out1, out2, alpha=forwarded_alpha)
Ejemplo n.º 58
0
def logrank_pval(stime, censor, g1):
    """Return the logrank p-value between the entries selected by g1 and the rest.

    ``g1`` is used to index ``stime`` and ``censor`` directly and is inverted
    with ``~`` to select the complementary group.
    """
    rest = ~g1
    return logrank_test(stime[g1], stime[rest], censor[g1], censor[rest], alpha=.95).p_value
Ejemplo n.º 59
0
import pandas as pd
import sys,os
import json
import numpy as np
from lifelines.statistics import logrank_test
# The first command-line argument is handed to pd.read_json. The resulting
# frame's 'value' column is read positionally: entries 0 and 1 go to
# logrank_test as group 1's durations and event flags, entries 2 and 3 as
# group 2's.
data = sys.argv[1]
df = pd.read_json(data)
# Strip stray carriage returns from the column names.
df.columns = df.columns.str.replace('\r','')
values = df['value']
gp1_tm = np.array(values[0]).astype(float)
gp1_evt = np.array(values[1]).astype(float)
gp2_tm = np.array(values[2]).astype(float)
gp2_evt = np.array(values[3]).astype(float)
results = logrank_test(gp1_tm, gp2_tm, gp1_evt, gp2_evt)
pvalue = round(results.p_value, 6)
lst = {"logrank_p" : pvalue}
# ensure_ascii takes a boolean; the string 'false' is truthy and therefore
# never switched the option off.
json_last = json.dumps(lst, ensure_ascii=False)
# Parenthesised form prints the same thing under Python 2 and Python 3.
print(json_last)
Ejemplo n.º 60
0
		#for KM-plotting
		data_mat = clean_df.values[:,2:]
		coefs = [np.log(multi_result_dic['exp(coef)'][name]) for name in clean_df.columns[2:]]
		coefs = np.array(coefs).reshape(-1, 1)
		mat_for_plotting = np.concatenate([clean_df.values[:,:2], np.dot(data_mat, coefs)], axis=1)
		critical_val = np.median(mat_for_plotting, axis=0)[2]
		abv_median = mat_for_plotting[mat_for_plotting[:,-1] >= critical_val]
		abv_time = np.array(abv_median[:,0]).tolist()
		abv_event = np.array(abv_median[:,1]).tolist()
		below_median = mat_for_plotting[mat_for_plotting[:,-1] < critical_val]
		blw_time = np.array(below_median[:,0]).tolist()
		blw_event = np.array(below_median[:,1]).tolist()
		# import pdb; pdb.set_trace()
		multi_df = pd.DataFrame.from_dict(multi_result_dic)
		multi_df.columns.values[0] = 'multi Hazard Ratio'
		multi_df.columns.values[1] = 'multi lower'
		multi_df.columns.values[2] = 'multi upper'
		multi_df.columns.values[3] = 'multi p'
		col_list = ['multi Hazard Ratio', 'multi lower', 'multi upper', 'multi p']
		multi_df = multi_df[col_list]
		result = pd.concat([uni_df,multi_df], axis=1)
		result.index.name = "Variable"
		result = result.reset_index()
		result.to_csv("multi_output.csv",mode='w+',index=False)
		rst = logrank_test(abv_time, blw_time, abv_event, blw_event)
		# pvalue = rst.p_value
		pvalue = round(rst.p_value, 6)
		last = {"abv_time" : abv_time,"abv_event" :abv_event, "blw_time" : blw_time,"blw_event": blw_event, "pvalue":pvalue}
		json_last = json.dumps(last, ensure_ascii = 'false')
		print json_last