def km_curve(labels_ids, survival_dataset, tested_gene_expression_headers_columns, gene_group, k=None, label_index=None):
    ax = plt.subplot(111)
    kmf = KaplanMeierFitter()
    all_labels = np.array([y for x in labels_ids for y in x])
    label_event_list = []
    label_duration_list = []
    results = []
    for i, cur_labels in enumerate(labels_ids):
        label_event = survival_dataset[np.in1d(survival_dataset[:, 0], cur_labels) & np.in1d(survival_dataset[:, 0], tested_gene_expression_headers_columns), 4].astype(np.int32)
        label_duration = survival_dataset[np.in1d(survival_dataset[:, 0], cur_labels) & np.in1d(survival_dataset[:, 0], tested_gene_expression_headers_columns), 3].astype(np.int32)
        label_event_list.append(label_event)
        label_duration_list.append(label_duration)
        # complement group: every sample not in the current cluster
        labels_c = all_labels[~np.in1d(all_labels, cur_labels) & np.in1d(all_labels, tested_gene_expression_headers_columns)]
        label_event_c = survival_dataset[np.in1d(survival_dataset[:, 0], labels_c), 4].astype(np.int32)
        label_duration_c = survival_dataset[np.in1d(survival_dataset[:, 0], labels_c), 3].astype(np.int32)
        lr_results = logrank_test(label_duration, label_duration_c, label_event, label_event_c, alpha=.95)
        if len(label_duration) != 0:
            kmf.fit(list(label_duration), event_observed=list(label_event),
                    label="cluster {} n={}, logrank pval = {}".format(i, len(label_duration), '{0:1.3e}'.format(lr_results.p_value)))
            kmf.plot(ax=ax, show_censors=True)
            print("lrank cluster {} vs all: {}".format(i, lr_results.p_value))
            results.append(lr_results.p_value)
        # pairwise comparison of the current cluster against each earlier one
        for j, cur_duration in enumerate(label_duration_list[:-1]):
            lr_results = logrank_test(label_duration, label_duration_list[j], label_event, label_event_list[j], alpha=.95)
            print("lrank cluster {} vs cluster {}: {}".format(i, j, lr_results.p_value))
    plt.ylim(0, 1)
    plt.title("clustering survival analysis")
    plt.savefig(os.path.join(constants.BASE_PROFILE, "output",
                             "cluster_by_p_{}_{}_k={}_label_i={}_{}.png".format(
                                 constants.CANCER_TYPE, gene_group.split("/")[-1], k, label_index, time.time())))
    plt.cla()
    return results
def run_logrank(df1, df2, df3, f, cohorts):
    check = False
    title = '\n('

    def _write_result(da, db):
        # run the test and append one formatted row to the report file
        results = logrank_test(da['duration'], db['duration'], da['event_obs'], db['event_obs'], alpha=.99)
        f.write('__ p-value ___|__ test statistic __|____ test result ____|__ is significant __\n')
        f.write(str(results.p_value) + ' | ' + str(results.test_statistic) + ' | ' +
                str(results.test_result) + ' | ' + str(results.is_significant) + '\n')
        return results

    if cohorts > 2:
        for da, db, sep in ((df1, df2, ','), (df2, df3, ','), (df1, df3, ')')):
            results = _write_result(da, db)
            check = check_value(results.p_value)
            title += str(round(results.p_value, 4)) + sep
    else:
        results = _write_result(df1, df2)
        check = check_value(results.p_value)
        title += str(round(results.p_value, 4)) + ')'
    if check:
        title += "*"
    return title
def test_logrank_test_is_symmetric():
    data1 = np.random.exponential(5, size=(2000, 1)).astype(int)
    data2 = np.random.exponential(1, size=(2000, 1)).astype(int)
    result1 = stats.logrank_test(data1, data2)
    result2 = stats.logrank_test(data2, data1)
    assert abs(result1.p_value - result2.p_value) < 10e-8
    assert result2.is_significant == result1.is_significant
def test_log_rank_returns_None_if_equal_arrays():
    T = np.random.exponential(5, size=200)
    result = stats.logrank_test(T, T, alpha=0.95)
    assert not result.is_significant

    C = np.random.binomial(2, 0.8, size=200)
    result = stats.logrank_test(T, T, C, C, alpha=0.95)
    assert not result.is_significant
def test_log_rank_returns_None_if_equal_arrays():
    T = np.random.exponential(5, size=200)
    result = stats.logrank_test(T, T)
    assert result.p_value > 0.05

    C = np.random.binomial(1, 0.8, size=200)
    result = stats.logrank_test(T, T, C, C)
    assert result.p_value > 0.05
def test_log_rank_returns_None_if_equal_arrays():
    T = np.random.exponential(5, size=200)
    result = stats.logrank_test(T, T, alpha=0.95)
    assert result.p_value > 0.05

    C = np.random.binomial(2, 0.8, size=200)
    result = stats.logrank_test(T, T, C, C, alpha=0.95)
    assert result.p_value > 0.05
def survival_for_two(df, treat, ctrl, legends, title, figname):
    # select the time and status info for treat and control group
    ix = df['group'] == treat
    t1 = df.loc[ix]['time']
    print(t1.shape)
    e1 = df.loc[ix]['status']
    t2 = df.loc[~ix]['time']
    print(t2.shape)
    e2 = df.loc[~ix]['status']
    results = logrank_test(t1, t2, event_observed_A=e1, event_observed_B=e2)
    pvalue = results.p_value
    print('pvalue:\t{}'.format(pvalue))
    # survival curves
    plt.figure(figsize=(3., 3.))
    ax = plt.subplot(111)
    kmf_control = KaplanMeierFitter()
    # g1 = kmf_control.fit(t1, e1, label=legends[0]).plot(...)  # labelled variant
    g1 = kmf_control.fit(t1, e1).plot(ax=ax, show_censors=True,
                                      censor_styles={'ms': 12, 'marker': '+'},
                                      ci_show=False, c='red', ls='-')
    kmf_exp = KaplanMeierFitter()
    # g2 = kmf_exp.fit(t2, e2, label=legends[1]).plot(...)  # labelled variant
    g2 = kmf_exp.fit(t2, e2).plot(ax=ax, show_censors=True,
                                  censor_styles={'ms': 12, 'marker': '+'},
                                  ci_show=False, c='k', ls='--')
    handles, labels = ax.get_legend_handles_labels()
    print(labels)
    lg = ax.legend(handles[1::2], legends, loc='lower left', borderaxespad=-.15,
                   handletextpad=.2, labelspacing=.3, handlelength=1, frameon=False)
    if pvalue < 1:
        plt.axes().text(df['time'].max() * 0.45, 0.45, 'p={:.2f}'.format(pvalue), fontsize=16, ha='center')
        # plt.axes().text(df['time'].max()*0.45, 0.45, 'p={:.2e}'.format(pvalue), fontsize=16, ha='center')
    plt.ylim([-0.02, 1.05])
    # plt.xlim([0, max_val*1])
    plt.title(title, fontsize=22)
    plt.xlabel('Days', fontsize=22)
    plt.ylabel('Survival probability', fontsize=22)
    plt.savefig(figname, bbox_inches='tight', pad_inches=.1, dpi=600, transparent=True)
    plt.close()
    return results
def test_unequal_intensity_event_observed():
    data1 = np.random.exponential(5, size=(2000, 1))
    data2 = np.random.exponential(1, size=(2000, 1))
    eventA = np.random.binomial(1, 0.5, size=(2000, 1))
    eventB = np.random.binomial(1, 0.5, size=(2000, 1))
    result = stats.logrank_test(data1, data2, event_observed_A=eventA, event_observed_B=eventB)
    assert result.p_value < 0.05
def test_log_rank_test_on_waltons_dataset():
    df = load_waltons()
    ix = df["group"] == "miR-137"
    waltonT1 = df.loc[ix]["T"]
    waltonT2 = df.loc[~ix]["T"]
    result = stats.logrank_test(waltonT1, waltonT2)
    assert result.p_value < 0.05
def test_waltons_dataset():
    df = load_waltons()
    ix = df['group'] == 'miR-137'
    waltonT1 = df.loc[ix]['T']
    waltonT2 = df.loc[~ix]['T']
    result = stats.logrank_test(waltonT1, waltonT2)
    assert result.p_value < 0.05
def __KM_analysis(self, duration_table, expressed_array, unexpressed_array, freq_set):
    expressed_T = []
    expressed_C = []
    unexpressed_T = []
    unexpressed_C = []
    for idx, row in enumerate(duration_table):
        if idx > 0:  # skip the header row
            if row[0] in unexpressed_array and row[1] != "NA" and row[2] != "NA":
                unexpressed_T.append(float(row[1]))
                unexpressed_C.append(int(row[2]))
            elif row[0] in expressed_array and row[1] != "NA" and row[2] != "NA":
                expressed_T.append(float(row[1]))
                expressed_C.append(int(row[2]))
    results = logrank_test(expressed_T, unexpressed_T, expressed_C, unexpressed_C, alpha=.95)
    if results.p_value < .0006:
        ax = plt.subplot(111)
        kmf = KaplanMeierFitter()
        kmf.fit(expressed_T, event_observed=expressed_C, label="Satisfying")
        kmf.plot(ax=ax, ci_force_lines=False)
        kmf.fit(unexpressed_T, event_observed=unexpressed_C, label="Non-Satisfying")
        kmf.plot(ax=ax, ci_force_lines=False)
        plt.ylim(0, 1)
        plt.title("Lifespans (" + str(freq_set) + ")")
        plt.show()
    return results.p_value
def logrank_pval(stime, censor, g1):
    res = logrank_test(stime[g1], stime[~g1], censor[g1], censor[~g1], alpha=.95)
    return res.p_value
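# A minimal usage sketch for logrank_pval above. The arrays here are synthetic
# stand-ins (survival times, censoring flags, and a boolean group mask); only
# the call pattern is the point.
import numpy as np

stime = np.random.exponential(10, size=500)              # survival times
censor = np.random.binomial(1, 0.7, size=500)            # 1 = event observed
g1 = np.random.binomial(1, 0.5, size=500).astype(bool)   # group membership mask
print(logrank_pval(stime, censor, g1))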
def plot_two_groups(data, t_col_name, e_col_name, g_name, alpha):
    '''Render the two groups' KM curves and calculate the log-rank p-value.'''
    T = data[t_col_name]
    E = data[e_col_name]
    groups = data[g_name]  # was `df`, an undefined global; use the `data` argument
    # get unique groups to get 1st and 2nd groups names
    uniques = data[g_name].unique()
    ix = (groups == uniques[0])
    kmf = KaplanMeierFitter()
    # plot first group
    kmf.fit(T[~ix], E[~ix], label=uniques[1])
    ax = kmf.plot()
    # plot second group
    kmf.fit(T[ix], E[ix], label=uniques[0])
    kmf.plot(ax=ax)
    # get results for p-value
    results = logrank_test(T[ix], T[~ix], E[ix], E[~ix], alpha=alpha)
    plt.title('p-value: {0:.4f}, alpha: {1:.2f}'.format(results.p_value, alpha))
def kmplot(df_high, df_low, ax):
    kmf_high = KaplanMeierFitter()
    kmf_low = KaplanMeierFitter()
    try:
        kmf_high.fit(durations=df_high.duration, event_observed=df_high.event, label='High: n = ' + str(len(df_high)))
        kmf_low.fit(durations=df_low.duration, event_observed=df_low.event, label="Low: n = " + str(len(df_low)))
    except ValueError:
        return ("NA", "0", "0", "0", "0")
    kmf_high.plot(ax=ax, color="red", show_censors=True, ci_show=False)
    kmf_low.plot(ax=ax, color="black", show_censors=True, ci_show=False)
    statistics_result = logrank_test(df_high.duration, df_low.duration,
                                     event_observed_A=df_high.event, event_observed_B=df_low.event)
    p_value = statistics_result.p_value
    ax.set_xlabel('Time (months)')
    ax.set_ylabel('Probability')
    ax.text(0.95, 0.02, 'logrank P = ' + str('%.4f' % p_value), verticalalignment='bottom',
            horizontalalignment='right', transform=ax.transAxes, color='black', fontsize=11)
    plt.legend(loc=3)
    # KM survival estimates at 60 and 120 months (5- and 10-year survival)
    hm5 = kmf_high.predict(60)
    hm10 = kmf_high.predict(120)
    lm5 = kmf_low.predict(60)
    lm10 = kmf_low.predict(120)
    return (p_value, hm5, hm10, lm5, lm10)
def test_significance(df1, df2):
    results = logrank_test(df1['os_years'], df2['os_years'], df1['CENSOR'], df2['CENSOR'], alpha=.99)
    return results.p_value
def logrank_test(self, treatment_a, treatment_b, t1error=0.05):
    """Calculate a log rank test (Mantel-Cox) statistic between two treatments.

    Calls lifelines.statistics.logrank_test for calculation.

    Arguments:
        treatment_a - The legend label for this group as used with add_mean.
        treatment_b - The legend label for this group as used with add_mean.
        t1error - probability of a type 1 error (alpha). Default: 0.05
    """
    if not self.endpoint or not self.volume_data:
        print('you need to add data with .add_mean() before using logrank_test')
        raise ValueError
    survival_a = volume_to_survival(self.volume_data[treatment_a], endpoint=self.endpoint)
    survival_b = volume_to_survival(self.volume_data[treatment_b], endpoint=self.endpoint)
    result = logrank_test(list(survival_a['Time']),
                          list(survival_b['Time']),
                          list(survival_a['Observed']),
                          list(survival_b['Observed']),
                          alpha=1 - t1error)
    result.print_summary()
    return result
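# Hypothetical call of the method above, assuming a plot object whose volume
# data was registered via add_mean() under the two legend labels; both the
# object and the labels are invented for illustration.
result = growth_plot.logrank_test('vehicle', 'treatment A', t1error=0.05)
print(result.p_value)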
def logrank_statistics(x, y, feature, min_leaf):
    """
    Compute the logrank_test of the lifelines package over all candidate splits.

    :param x: Input samples
    :param y: Labels (column 0: durations, column 1: event indicators)
    :param feature: Feature index
    :param min_leaf: Minimum number of samples in each leaf of a split.
    :return: best score, best split value, left indices, right indices
    """
    x_feature = x.reset_index(drop=True).iloc[:, feature]
    score_opt = 0
    split_val_opt = None
    lhs_idxs = None
    rhs_idxs = None
    for split_val in x_feature.sort_values(ascending=True, kind="quicksort").unique():
        feature1 = list(x_feature[x_feature <= split_val].index)
        feature2 = list(x_feature[x_feature > split_val].index)
        if len(feature1) < min_leaf or len(feature2) < min_leaf:
            continue
        durations_a = y.iloc[feature1, 0]
        event_observed_a = y.iloc[feature1, 1]
        durations_b = y.iloc[feature2, 0]
        event_observed_b = y.iloc[feature2, 1]
        results = logrank_test(durations_A=durations_a,
                               durations_B=durations_b,
                               event_observed_A=event_observed_a,
                               event_observed_B=event_observed_b)
        score = results.test_statistic
        if score > score_opt:
            score_opt = round(score, 3)
            split_val_opt = round(split_val, 3)
            lhs_idxs = feature1
            rhs_idxs = feature2
    return score_opt, split_val_opt, lhs_idxs, rhs_idxs
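# A hedged usage sketch for logrank_statistics: the toy feature matrix and
# label frame below are invented, with y holding (duration, event) columns in
# the order the function expects.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x = pd.DataFrame({"biomarker": rng.normal(size=100),
                  "age": rng.integers(40, 80, size=100)})
y = pd.DataFrame({"duration": rng.exponential(10, size=100),
                  "event": rng.binomial(1, 0.7, size=100)})
score, split_val, lhs_idxs, rhs_idxs = logrank_statistics(x, y, feature=0, min_leaf=10)
print(score, split_val)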
def plot_survival_curves(rec_t, rec_e, antirec_t, antirec_e, experiment_name='', output_file=None):
    # Set up plots
    plt.figure(figsize=(12, 3))
    ax = plt.subplot(111)
    # Fit survival curves
    kmf = KaplanMeierFitter()
    kmf.fit(rec_t, event_observed=rec_e, label=' '.join([experiment_name, "Recommendation"]))
    kmf.plot(ax=ax, linestyle="-")
    kmf.fit(antirec_t, event_observed=antirec_e, label=' '.join([experiment_name, "Anti-Recommendation"]))
    kmf.plot(ax=ax, linestyle="--")
    # Format graph
    plt.ylim(0, 1)
    ax.set_xlabel('Timeline (months)', fontsize='large')
    ax.set_ylabel('Percentage of Population Alive', fontsize='large')
    # Calculate p-value
    results = logrank_test(rec_t, antirec_t, rec_e, antirec_e, alpha=.95)
    results.print_summary()
    # Locate the label at the 1st out of 9 tick marks
    xloc = max(np.max(rec_t), np.max(antirec_t)) / 9
    if results.p_value < 1e-5:
        ax.text(xloc, .2, r'$p < 1\mathrm{e}{-5}$', fontsize=20)
    else:
        ax.text(xloc, .2, '$p=%f$' % results.p_value, fontsize=20)
    plt.legend(loc='best', prop={'size': 15})
    if output_file:
        plt.tight_layout()
        pylab.savefig(output_file)
def compute_pval(preds, labels, threshold, alpha):
    """
    preds: np.array (1D) with predictions.
    labels: np.array (2D), n_patients x 2, where the first column is survival
        time and the second is event status.

    Returns
    -------
    p_value of the difference between the risk groups obtained by using the
    given threshold.
    """
    low_risk_idx = np.where(preds <= threshold)[0]
    high_risk_idx = np.where(preds > threshold)[0]
    try:
        test_res = logrank_test(labels[low_risk_idx, 0], labels[high_risk_idx, 0],
                                event_observed_A=labels[low_risk_idx, 1],
                                event_observed_B=labels[high_risk_idx, 1],
                                alpha=alpha)
        p_val = test_res.p_value
    except Exception as e:
        print("WW: Caught exception in compute_pval", e)
        p_val = np.nan
    return p_val
def test_waltons_dataset():
    df = load_waltons()
    ix = df['group'] == 'miR-137'
    waltonT1 = df.loc[ix]['T']
    waltonT2 = df.loc[~ix]['T']
    result = stats.logrank_test(waltonT1, waltonT2)
    assert result.is_significant
def logrank(T, C):
    print("Running logrank test...")
    p_values = []
    for i in range(0, len(groups)):
        T1 = []
        T2 = []
        C1 = []
        C2 = []
        for j in range(0, len(filenames)):
            if T[j] != -1:
                if groups[i][j] == 1:
                    T1.append(T[j])
                    C1.append(C[j])
                if groups[i][j] == 2:
                    T2.append(T[j])
                    C2.append(C[j])
        if len(T1) != 0:
            # renamed from `logrank` to avoid shadowing this function's own name
            result = logrank_test(T1, T2, C1, C2, alpha=0.99)
            p_values.append(result.p_value)
        else:
            # sentinel value marking groups that could not be tested
            p_values.append(100)
    return p_values
def dichot(self, T, F, surv_prob, median):
    T1 = T[surv_prob >= median]
    T2 = T[surv_prob < median]
    E1 = F[surv_prob >= median]
    E2 = F[surv_prob < median]
    result = logrank_test(T1, T2, E1, E2)
    p = result.p_value
    return T1, T2, E1, E2, p
def test_logrank_test_with_t_0():
    control_T = [1, 1, 2, 2, 3, 4, 4, 5, 5, 8, 8, 8, 8, 11, 11, 12, 12, 15, 17, 22, 23]
    control_E = np.ones_like(control_T)
    treatment_T = [6, 6, 6, 7, 10, 13, 16, 22, 23, 6, 9, 10, 11, 17, 19, 20, 25, 32, 32, 34, 25]
    treatment_E = [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    # smoke test: only checks that restricting the test to t_0=10 does not raise
    result = stats.logrank_test(control_T, treatment_T, event_observed_A=control_E,
                                event_observed_B=treatment_E, t_0=10)
def test_logrank_test_output_against_R_1():
    df = load_g3()
    ix = df["group"] == "RIT"
    d1, e1 = df.loc[ix]["time"], df.loc[ix]["event"]
    d2, e2 = df.loc[~ix]["time"], df.loc[~ix]["event"]
    expected = 0.0138
    result = stats.logrank_test(d1, d2, event_observed_A=e1, event_observed_B=e2)
    assert abs(result.p_value - expected) < 0.0001
def set_quality(self):
    self.logrank_test = logrank_test(
        self.sub_group['survival_times'],
        self.sub_group_complement['survival_times'],
        self.sub_group['events'],
        self.sub_group_complement['events'])
    self.quality = 1 - self.logrank_test.p_value
def test_equal_intensity_with_negative_data():
    data1 = np.random.normal(0, size=(2000, 1))
    data1 -= data1.mean()
    data1 /= data1.std()
    data2 = np.random.normal(0, size=(2000, 1))
    data2 -= data2.mean()
    data2 /= data2.std()
    result = stats.logrank_test(data1, data2)
    assert result.p_value > 0.05
def metrices(self, T, surv_prob, F, y, year, train_val, median):
    brier_true = np.cumprod(y[:, 0:np.nonzero(breaks > 365 * year)[0][0]], axis=1)[:, -1]
    conc = concordance_index(T, surv_prob, F)
    brier = brier_score_loss(brier_true, surv_prob)
    T1 = T[surv_prob >= median]
    T2 = T[surv_prob < median]
    E1 = F[surv_prob >= median]
    E2 = F[surv_prob < median]
    result = logrank_test(T1, T2, E1, E2)
    p = result.p_value
    plt.rc('font', family='serif')
    plt.rc('xtick', labelsize='x-small')
    plt.rc('ytick', labelsize='x-small')
    # fig, ax = plt.subplots(ncols=1, figsize=(8, 8))
    # days_plot = 9 * 365
    # kmf = KaplanMeierFitter()
    # for i in range(2):
    #     if i == 0:
    #         kmf.fit(T1, event_observed=E1)
    #     elif i == 1:
    #         kmf.fit(T2, event_observed=E2)
    #     kmf.plot()
    # N1 = 'N=' + str(len(T1))
    # N2 = 'N=' + str(len(T2))
    # ax.set_xticks(np.arange(0, days_plot, 365))
    # ax.set_yticks(np.arange(0, 1.125, 0.125))
    # ax.tick_params(axis='x', labelsize=12)
    # ax.tick_params(axis='y', labelsize=12)
    # ax.set_xlim([0, days_plot])
    # ax.set_ylim([0, 1])
    # ax.text(50, 0.025, 'logrank p-value = ' + str('%.3g' % p), bbox=dict(facecolor='red', alpha=0.3), fontsize=10)
    # ax.set_xlabel('Follow-up time (days)', fontsize=14)
    # ax.set_ylabel('Probability of survival', fontsize=14)
    # ax.legend(['Low Risk Individuals ' + N1, 'High Risk Individuals ' + N2])
    # ax.set_title('%s set Kaplan-Meier Curves' % train_val, fontweight='bold', fontsize=14)
    # ax.grid()
    # plt.show()
    print("%s year %s concordance index for %s:" % (str(year), train_val, str(self.omics)), conc)
    print("%s year %s brier score for %s:" % (str(year), train_val, str(self.omics)), brier)
    print("P-value:", p)
    return conc, brier, p
def kaplanmeier_stats_two(unique_groups, grouped_data, analysis_type):
    results = logrank_test(grouped_data.get_group(unique_groups[0])['survival'],
                           grouped_data.get_group(unique_groups[1])['survival'],
                           grouped_data.get_group(unique_groups[0])['event'],
                           grouped_data.get_group(unique_groups[1])['event'])
    with open('Kaplan_%s.txt' % (analysis_type), 'w') as f:
        test_name = "%s vs %s" % (unique_groups[0], unique_groups[1])
        f.write('Test\tP-Value\tSignificant\n')
        f.write('%s\t%4g\t%s\n' % (test_name, results.p_value, results.is_significant))
    return results
def main():
    savedir = "logrank"
    os.makedirs(savedir, exist_ok=True)
    file_male = "count_age_each_sex_male.csv"
    file_female = "count_age_each_sex_female.csv"
    data_male = (pd.read_csv(file_male)).values.tolist()
    data_female = (pd.read_csv(file_female)).values.tolist()
    count = 0
    count_u = 0
    result = []
    for i in range(len(data_male)):
        if data_male[i][1] < 50:
            break
        d_male = data_male[i][2::]
        d_male_list = make_data(d_male)
        d_female = data_female[i][2::]
        d_female_list = make_data(d_female)
        diag = data_male[i][0]
        # if len(d_male_list) < 20 or len(d_female_list) < 20:
        #     continue
        count += 1
        results = logrank_test(d_male_list, d_female_list)
        # results.print_summary()
        title = ("logrank, p = " + '{:.3e}'.format(results.p_value) +
                 "m" + str(len(d_male_list)) + "_f" + str(len(d_female_list)) + "\n")
        if results.p_value < 0.05:
            count_u += 1
            # cox(d_male_list, d_female_list)
            # make_graph(d_male, d_female, diag, savedir, results.p_value)
            make_kmf_plt(d_male_list, d_female_list, diag, savedir, title)
        # else:
        #     make_graph(d_male, d_female, diag, "logrank_false", title)
        #     make_kmf_plt(d_male_list, d_female_list, diag, "logrank_false", title)
        # result.append(results.p_value)
    # result.sort()
    # result = [math.log10(x) for x in result]
    # plt.plot(result)
    # plt.show()
    print("test")
    print(count)
    print("under 0.05")
    print(count_u)
def test_peto_weighted_logrank_on_leukemia_dataset():
    """
    Test against result from "Survival Analysis: A Self-learning Text" by
    Kleinbaum & Klein, 3rd edition, 2012.
    """
    data = load_leukemia()
    group_1 = data[data["Rx"] == 0]
    group_2 = data[data["Rx"] == 1]
    result = stats.logrank_test(group_1["t"], group_2["t"],
                                group_1["status"], group_2["status"], weightings="peto")
    assert abs(result.test_statistic - 14.084139) < 10e-6
    assert result.test_name == "Peto_test"
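# lifelines exposes other weighted variants of the test through the same
# `weightings` argument; a short sketch reusing the leukemia columns from the
# test above ("wilcoxon" and "tarone-ware" are documented options alongside "peto").
data = load_leukemia()
group_1 = data[data["Rx"] == 0]
group_2 = data[data["Rx"] == 1]
for w in ("wilcoxon", "tarone-ware", "peto"):
    res = stats.logrank_test(group_1["t"], group_2["t"],
                             group_1["status"], group_2["status"], weightings=w)
    print(w, res.test_statistic, res.p_value)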
def plotKaplanMeier(DF, GroupName):
    # Bring in matplotlib
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from matplotlib import colors
    matplotlib.style.use('seaborn-talk')
    import seaborn as sns
    sns.set(font_scale=.95)
    # plot baseline survival
    KMF = KaplanMeierFitter()
    # get log rank results
    Unique = sorted(DF[GroupName].unique())[::-1]
    G0 = Unique[0]
    G1 = Unique[1]
    Group0 = DF[DF[GroupName] == G0]
    Group1 = DF[DF[GroupName] == G1]
    LR_Results = logrank_test(Group0["OS (months)"].astype(float),
                              Group1["OS (months)"].astype(float),
                              Group0["Patient Status"].astype(int),
                              Group1["Patient Status"].astype(int))
    # build figure
    fig, ax = plt.subplots(1, 1, figsize=(7, 5))
    # iterate over unique groups and plot KM curves
    Median_Survival_List = []
    for Group in Unique:
        TempDF = DF[DF[GroupName] == Group]
        KMF = KaplanMeierFitter()
        KMF.fit(TempDF["OS (months)"].astype(float),
                TempDF["Patient Status"].astype(int),
                label=Group).plot(ax=ax)
        print("KMF.median_survival_time_", KMF.median_survival_time_)
        Median_Survival_List.append(float(KMF.median_survival_time_))
    Median_Survival_List = sorted(Median_Survival_List)
    # add logrank result to plot
    Annotation = ("logrank: " + str(round(LR_Results.test_statistic, 3)) + "\n" +
                  "pvalue: " + str(round(LR_Results.p_value, 5)) + "\n" +
                  "median survival times: " + str(Median_Survival_List))
    ax.text(.05, .05, Annotation, horizontalalignment='left',
            verticalalignment='bottom', transform=ax.transAxes)
    # set labels
    ax.set_xlabel("Time (months)")
    ax.set_ylabel("Overall survival probability")
    plt.tight_layout()
    fig.savefig("KaplanMeierSurvival.png")
    plt.clf()
    plt.close()
    return LR_Results
def cox_log_rank(hazardsdata, labels, survtime_all):
    median = np.median(hazardsdata)
    hazards_dichotomize = np.zeros([len(hazardsdata)], dtype=int)
    hazards_dichotomize[hazardsdata > median] = 1
    idx = hazards_dichotomize == 0
    T1 = survtime_all[idx]
    T2 = survtime_all[~idx]
    E1 = labels[idx]
    E2 = labels[~idx]
    results = logrank_test(T1, T2, event_observed_A=E1, event_observed_B=E2)
    pvalue_pred = results.p_value
    return pvalue_pred
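# A small invented example for cox_log_rank: dichotomise model risk scores at
# their median and test the two halves; every array below is synthetic.
import numpy as np

hazards = np.random.normal(size=300)       # predicted risk scores
events = np.random.binomial(1, 0.6, 300)   # 1 = death observed
times = np.random.exponential(30, 300)     # follow-up times
print(cox_log_rank(hazards, events, times))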
def compare_kmc(data, factor, status, interval):
    f = data[factor].drop_duplicates().tolist()
    f.sort()
    for i in f:
        group_i = data[data[factor] == i]
        interv_i = group_i[interval]
        interv_i.reset_index(drop=True)
        interv_i = interv_i.tolist()
        interv_i.append(0)
        sta_i = []
        for j in group_i[status]:
            c = bool(j)
            sta_i.append(c)
        sta_i.append(False)
        time, survival_prob = kaplan_meier_estimator(sta_i, interv_i)
        plt.step(time, survival_prob, where='post', label=str(factor) + '=%s' % i)
    plt.ylabel("est. probability of survival")
    plt.xlabel("time $(days)$")
    plt.legend(loc='best')
    if data[factor].nunique() != 2:
        print('The factor', factor, 'is non-binary, so the Logrank statistic is not calculated')
    else:
        f = data[factor].drop_duplicates().tolist()
        f.sort()
        time = []
        censor = []
        for i in f:
            interv_i = []
            group_i = data[data[factor] == i]  # was `df`, a leftover global; use the `data` argument
            for j in group_i[interval]:
                interv_i.append(j)
            time.append(interv_i)
            censor_i = []
            for k in group_i[status]:
                censor_i.append(k)
            censor.append(censor_i)
        T = time[0]
        T1 = time[1]
        E = censor[0]
        E1 = censor[1]
        results = logrank_test(T, T1, E, E1)
        results.print_summary()
def test_multivariate_log_rank_is_identical_to_log_rank_for_n_equals_2():
    N = 200
    T1 = np.random.exponential(5, size=N)
    T2 = np.random.exponential(5, size=N)
    C1 = np.random.binomial(2, 0.9, size=N)
    C2 = np.random.binomial(2, 0.9, size=N)
    result = stats.logrank_test(T1, T2, C1, C2, alpha=0.95)

    T = np.r_[T1, T2]
    C = np.r_[C1, C2]
    G = np.array([1] * N + [2] * N)
    result_m = stats.multivariate_logrank_test(T, G, C, alpha=0.95)
    assert result.p_value == result_m.p_value
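# For more than two groups, lifelines also provides pairwise_logrank_test; a
# minimal sketch with three synthetic groups (the third deliberately differs).
import numpy as np
from lifelines.statistics import multivariate_logrank_test, pairwise_logrank_test

N = 100
T = np.r_[np.random.exponential(5, N), np.random.exponential(5, N), np.random.exponential(1, N)]
E = np.random.binomial(1, 0.9, 3 * N)
G = np.repeat([1, 2, 3], N)
print(multivariate_logrank_test(T, G, E).p_value)  # joint k-sample test
print(pairwise_logrank_test(T, G, E).summary)      # one row per pair of groups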
def test_logrank_test_output_against_R_2():
    # from https://stat.ethz.ch/education/semesters/ss2011/seminar/contents/presentation_2.pdf
    control_T = [1, 1, 2, 2, 3, 4, 4, 5, 5, 8, 8, 8, 8, 11, 11, 12, 12, 15, 17, 22, 23]
    control_E = np.ones_like(control_T)
    treatment_T = [6, 6, 6, 7, 10, 13, 16, 22, 23, 6, 9, 10, 11, 17, 19, 20, 25, 32, 32, 34, 25]
    treatment_E = [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    result = stats.logrank_test(control_T, treatment_T, event_observed_A=control_E,
                                event_observed_B=treatment_E)
    expected_p_value = 4.17e-05
    assert abs(result.p_value - expected_p_value) < 0.0001
    assert abs(result.test_statistic - 16.8) < 0.1
def kmplot(df_high, df_low):
    kmf_high = KaplanMeierFitter()
    kmf_low = KaplanMeierFitter()
    try:
        kmf_high.fit(durations=df_high.duration, event_observed=df_high.event, label='High: n = ' + str(len(df_high)))
        kmf_low.fit(durations=df_low.duration, event_observed=df_low.event, label="Low: n = " + str(len(df_low)))
    except ValueError:
        return ("NA", "0", "0", "0", "0")
    statistics_result = logrank_test(df_high.duration, df_low.duration,
                                     event_observed_A=df_high.event, event_observed_B=df_low.event)
    p_value = statistics_result.p_value
    # KM survival estimates at 60 and 120 months (5- and 10-year survival)
    hm5 = kmf_high.predict(60)
    hm10 = kmf_high.predict(120)
    lm5 = kmf_low.predict(60)
    lm10 = kmf_low.predict(120)
    return (p_value, hm5, hm10, lm5, lm10)
def logrank(df, condition_col, censor_col, survival_col, threshold=None):
    if threshold is not None:
        if threshold == 'median':
            threshold = df[condition_col].median()
        condition = df[condition_col] > threshold
    else:
        condition = df[condition_col]
    df_with_condition = df[condition]
    df_no_condition = df[~condition]
    survival_no_condition = df_no_condition[survival_col]
    survival_with_condition = df_with_condition[survival_col]
    event_no_condition = (df_no_condition[censor_col].astype(bool))
    event_with_condition = (df_with_condition[censor_col].astype(bool))
    return logrank_test(survival_no_condition, survival_with_condition,
                        event_observed_A=event_no_condition,
                        event_observed_B=event_with_condition)
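# Hypothetical call to the logrank helper above; the data frame and column
# names are illustrative only.
result = logrank(df, condition_col='gene_expression', censor_col='observed',
                 survival_col='os_days', threshold='median')
print(result.p_value)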
def estCoxPHTE(df, treatment_col='treated', duration_col='dx', event_col='disease', covars=[]):
    """Estimates treatment efficacy using proportional hazards (Cox model).

    Parameters
    ----------
    df : pandas.DataFrame
    treatment_col : string
        Column in df indicating treatment.
    duration_col : string
        Column in df indicating survival times.
    event_col : string
        Column in df indicating events (censored data are 0)
    covars : list
        List of other columns to include in Cox model as covariates.

    Returns
    -------
    est : float
        Estimate of vaccine efficacy
    ci : vector, length 2
        95% confidence interval, [LL, UL]
    pvalue : float
        P-value for H0: VE=0"""
    coxphf = CoxPHFitter()
    coxphf.fit(df[[treatment_col, duration_col, event_col] + covars],
               duration_col=duration_col, event_col=event_col)
    te = 1 - np.exp(coxphf.hazards_.loc['coef', treatment_col])
    ci = 1 - np.exp(coxphf.confidence_intervals_[treatment_col].loc[['upper-bound', 'lower-bound']])
    pvalue = coxphf._compute_p_values()[0]
    ind1 = df[treatment_col] == 0
    ind2 = df[treatment_col] == 1
    results = logrank_test(df[duration_col].loc[ind1], df[duration_col].loc[ind2],
                           event_observed_A=df[event_col].loc[ind1],
                           event_observed_B=df[event_col].loc[ind2])
    index = ['TE', 'UB', 'LB', 'pvalue', 'logrank_pvalue', 'model']
    return pd.Series([te, ci['upper-bound'], ci['lower-bound'], pvalue, results.p_value, coxphf], index=index)
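# Invented usage of estCoxPHTE, assuming a trial frame with the default
# column names ('treated', 'dx', 'disease'); trial_df itself is hypothetical.
res = estCoxPHTE(trial_df)
print(res['TE'], res['logrank_pvalue'])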
def test_unequal_intensity_with_random_data():
    data1 = np.random.exponential(5, size=(2000, 1))
    data2 = np.random.exponential(1, size=(2000, 1))
    test_result = stats.logrank_test(data1, data2)
    assert test_result.p_value < 0.05
def test_valueerror_is_raised_if_alpha_out_of_bounds():
    data1 = np.random.exponential(5, size=(20, 1))
    data2 = np.random.exponential(1, size=(20, 1))
    with pytest.raises(ValueError):
        stats.logrank_test(data1, data2, alpha=95)
# It turns out these two DNA types do not have significantly different survival rates.

# ### Using R

# In[31]:

get_ipython().run_cell_magic('R', '', 'survdiff(Surv(time, delta) ~ type)')

# ### Using Python

# In[32]:

from lifelines.statistics import logrank_test

# alpha must lie in (0, 1]; the original `alpha=99` would raise a ValueError
summary_ = logrank_test(T, T2, C, C2, alpha=0.99)
print(summary_)

# <hr>
# # Estimating Hazard Rates
#
# ### Using R
# To estimate the hazard function, we compute the cumulative hazard function using the
# [Nelson-Aalen estimator](), defined as:
#
# $$\hat{\Lambda}(t) = \sum_{t_i \leq t} \frac{d_i}{n_i}$$
#
# where $d_i$ is the number of deaths at time $t_i$ and $n_i$ is the number of susceptible
# individuals. Both R and Python modules use the same estimator. However, in R we will use
# the `-log` of the Fleming and Harrington estimator, which is equivalent to the Nelson-Aalen.
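# A minimal Python counterpart for the estimator described above, assuming the
# same T (durations) and C (event indicators) arrays used earlier in the notebook.
from lifelines import NelsonAalenFitter

naf = NelsonAalenFitter()
naf.fit(T, event_observed=C)
print(naf.cumulative_hazard_.head())  # estimated cumulative hazard over time
naf.plot()                            # step plot of the cumulative hazard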
def plot_kmf(df, condition_col, censor_col, survival_col, threshold=None,
             title=None, xlabel=None, ax=None, print_as_title=False):
    """
    Plot survival curves by splitting the dataset into two groups based on
    condition_col.

    If threshold is defined, the groups are split based on being > or <
    condition_col. If threshold == 'median', the threshold is set to the median
    of condition_col.

    Parameters
    ----------
    df: dataframe
    condition_col: string, column which contains the condition to split on
    survival_col: string, column which contains the survival time
    censor_col: string,
    threshold: int or string, if int, condition_col is thresholded,
               if 'median', condition_col thresholded at its median
    title: Title for the plot, default None
    ax: an existing matplotlib ax, optional, default None
    print_as_title: bool, optional, whether or not to print text within the
                    plot's title vs. stdout, default False
    """
    kmf = KaplanMeierFitter()
    if threshold is not None:
        if threshold == 'median':
            threshold = df[condition_col].median()
        condition = df[condition_col] > threshold
        label = '{} > {}'.format(condition_col, threshold)
    else:
        condition = df[condition_col]
        label = '{}'.format(condition_col)
    df_with_condition = df[condition]
    df_no_condition = df[~condition]
    survival_no_condition = df_no_condition[survival_col]
    survival_with_condition = df_with_condition[survival_col]
    event_no_condition = (df_no_condition[censor_col].astype(bool))
    event_with_condition = (df_with_condition[censor_col].astype(bool))
    kmf.fit(survival_no_condition, event_no_condition, label="")
    if ax:
        kmf.plot(ax=ax, show_censors=True, ci_show=False)
    else:
        ax = kmf.plot(show_censors=True, ci_show=False)
    kmf.fit(survival_with_condition, event_with_condition, label=(label))
    kmf.plot(ax=ax, show_censors=True, ci_show=False)
    # Set the y-axis to range 0 to 1
    ax.set_ylim(0, 1)
    no_cond_str = "# no condition {}".format(len(survival_no_condition))
    cond_str = "# with condition {}".format(len(survival_with_condition))
    if title:
        ax.set_title(title)
    elif print_as_title:
        ax.set_title("%s | %s" % (no_cond_str, cond_str))
    else:
        print(no_cond_str)
        print(cond_str)
    if xlabel:
        ax.set_xlabel(xlabel)
    results = logrank_test(survival_no_condition, survival_with_condition,
                           event_observed_A=event_no_condition,
                           event_observed_B=event_with_condition)
    return results
def fit(self, df, duration_col, event_col):
    """
    Fit a data frame.
    """
    # `Index - list` subtraction was removed from pandas; use .difference()
    self.xcols = df.columns.difference([duration_col, event_col])
    r_df = _convert_to_dataframe(df)
    # Insert into namespace
    robjects.globalenv["r_df"] = r_df
    # Build options string
    options = ""
    if self.minsplit is not None:
        options = ", ".join([options, "minsplit={}".format(self.minsplit)])
    if self.minbucket is not None:
        options = ", ".join([options, "minbucket={}".format(self.minbucket)])
    if self.xval is not None:
        options = ", ".join([options, "xval={}".format(self.xval)])
    if self.cp is not None:
        options = ", ".join([options, "cp={}".format(self.cp)])
    if len(options) > 0:
        options = ", control = rpart.control({})".format(options)
    # Make the command
    cmd = ("myfit = rpart(Surv(r_df${time}, r_df${event}) ~ {incols}, " +
           "data=r_df {options})").format(time=duration_col, event=event_col,
                                          options=options, incols="+".join(self.xcols))
    # Run the command
    self.myfit = r(cmd)
    # Prune it
    if self.cp is not None and self.cp > 0:
        cmd = "myfit <- prune(myfit, cp={})".format(self.cp)
        self.myfit = r(cmd)
    # Now divide into groups for future
    preds = self.predict(df)
    hazards = np.unique(preds)
    # Just to be safe
    hazards.sort()
    # Convert to actual sizes
    highlim = int(self.highlim * df.shape[0])
    lowlim = int(self.lowlim * df.shape[0])
    # Save subgroups here, initialize to outer groups
    self._high = [hazards[-1]]
    self._low = [hazards[0]]
    # Keep track of entire group here for logrank
    high = preds == hazards[-1]
    low = preds == hazards[0]
    # Low risk iterates forwards
    for g in hazards[1:]:
        if (np.sum(low) < lowlim or
                not logrank_test(df.loc[low, duration_col],
                                 df.loc[preds == g, duration_col],
                                 df.loc[low, event_col],
                                 df.loc[preds == g, event_col]).is_significant):
            # Append to group
            self._low.append(g)
            low |= preds == g
        else:
            break
    # Important to go backwards here of course
    for g in reversed(hazards[:-1]):
        if g in self._low:
            break
        if (np.sum(high) < highlim or
                not logrank_test(df.loc[high, duration_col],
                                 df.loc[preds == g, duration_col],
                                 df.loc[high, event_col],
                                 df.loc[preds == g, event_col]).is_significant):
            # Append to group
            self._high.append(g)
            high |= preds == g
        else:
            break
    # Mid is the rest. Remember sizes for the benefit of others.
    self.high_size = np.sum(high)
    self.low_size = np.sum(low)
def survival_plot(clinical, fitter, fitter_name, feature, time="life_duration", event="patient_death_date", axis=None):
    """
    Plot survival/hazard of all patients regardless of trait and dependent of trait.
    """
    # duration of life
    T = [i.days / 30. for i in clinical[time]]
    # events:
    # True for observed event (death);
    # else False (this includes death not observed; death by other causes)
    C = [True if i is not pd.NaT else False for i in clinical[event]]
    # drop index (to have synchronised numbers between lists T, C and the index of clinical)
    # this is because we drop a few rows from clinical before during data cleanup
    clinical2 = clinical.reset_index(drop=True)
    # plot
    if axis is None:
        fig, axis = plt.subplots(1)
        save = True
    else:
        save = False
    # Plot survival of all patients regardless of trait
    fitter.fit(T, event_observed=C, label="all patients")
    fitter.plot(ax=axis, show_censors=True)
    # For each type: subset, fit, plot
    # Filter patients whose feature is nan
    x = clinical2[feature].unique()
    x = x[~pd.isnull(x)]  # pd.isnull vectorises directly; the py2 map() idiom breaks under Python 3
    # for each class plot curve
    for value in x:
        # get patients from class
        s = clinical2[clinical2[feature] == value].index.tolist()
        fitter.fit([T[i] for i in s], event_observed=[C[i] for i in s], label=str(value))
        fitter.plot(ax=axis, show_censors=True)
    if fitter_name == "survival":
        axis.set_ylim(0, 1.05)
    # Test pairwise differences
    p_values = list()
    # test each against all
    for a in x:
        a_ = clinical2[clinical2[feature] == a].index.tolist()
        b_ = clinical2.index.tolist()
        p = logrank_test(
            [T[i] for i in a_], [T[i] for i in b_],
            event_observed_A=[C[i] for i in a_],
            event_observed_B=[C[i] for i in b_]).p_value
        p_values.append(" vs ".join([str(a), "all"]) + ": %f" % p)
    # test each pairwise combination
    for a, b in itertools.combinations(x, 2):
        a_ = clinical2[clinical2[feature] == a].index.tolist()
        b_ = clinical2[clinical2[feature] == b].index.tolist()
        p = logrank_test(
            [T[i] for i in a_], [T[i] for i in b_],
            event_observed_A=[C[i] for i in a_],
            event_observed_B=[C[i] for i in b_]).p_value
        p_values.append(" vs ".join([str(a), str(b)]) + ": %f" % p)
    # Add p-values as anchored text
    try:
        # problem with matplotlib < 1.4
        axis.add_artist(AnchoredText("\n".join(p_values), loc=8, frameon=False))
        axis.set_xlabel("time (months)")
    except Exception:
        axis.set_xlabel("time (months)\n%s" % "\n".join(p_values))
    axis.set_title("%s" % feature)
    axis.set_ylabel(fitter_name)
    sns.despine()
    if save:
        fig.savefig(os.path.join(plots_dir, "%s_%s.svg" % (feature, fitter_name)), bbox_inches="tight")
males = df[df['gender'] == 'Male']
females = df[df['gender'] == 'Female']

T = df["lifetime"]  # measured in days
C = df["dead"]

females_ = df["gender"] == "Female"
males_ = df["gender"] == "Male"

community_stats = {
    'community': community,
    'size': females.count()[0] + males.count()[0],
    'women_frequency_median': females['activity_freq'].median(),
    'men_frequency_median': males['activity_freq'].median(),
    'frequency_difference_median': females['activity_freq'].median() - males['activity_freq'].median(),
    'women_frequency_mean': females['activity_freq'].mean(),
    'men_frequency_mean': males['activity_freq'].mean(),
    'frequency_difference_mean': females['activity_freq'].mean() - males['activity_freq'].mean(),
    # doubling the one-sided Mann-Whitney U p-value gives a two-sided test
    'frequency_pvalue': 2 * stats.mannwhitneyu(females['activity_freq'], males['activity_freq'])[1],
    'women_lifetime_median': kmf.fit(T[females_], event_observed=C[females_], label="Female").median_,
    'men_lifetime_median': kmf.fit(T[males_], event_observed=C[males_], label="Male").median_,
    'lifetime_pvalue': logrank_test(T[females_], T[males_], C[females_], C[males_], alpha=.95).p_value,
}
community_stats['lifetime_difference_median'] = (community_stats["women_lifetime_median"] -
                                                 community_stats["men_lifetime_median"])

results_db.insert(community_stats)
def test_integer_times_logrank_test():
    data1 = np.random.exponential(5, size=(2000, 1)).astype(int)
    data2 = np.random.exponential(1, size=(2000, 1)).astype(int)
    result = stats.logrank_test(data1, data2)
    assert result.p_value < 0.05
print(df.head())
'''
    T  E    group
0   6  1  miR-137
1  13  1  miR-137
2  13  1  miR-137
3  13  1  miR-137
4  19  1  miR-137
'''

T = df['T']
E = df['E']
groups = df['group']
ix = (groups == 'miR-137')

kmf = KaplanMeierFitter()
kmf.fit(T[~ix], E[~ix], label='control')
ax = kmf.plot()
kmf.fit(T[ix], E[ix], label='miR-137')
kmf.plot(ax=ax)
plt.ylabel('Survival Probability')
plt.show()

# Compare the two curves
results = logrank_test(T[ix], T[~ix], event_observed_A=E[ix], event_observed_B=E[~ix])
results.print_summary()
def _plot_kmf_single(df, condition_col, survival_col, censor_col, threshold,
                     title, xlabel, ylabel, ax, with_condition_color,
                     no_condition_color, with_condition_label, no_condition_label,
                     color_map, label_map, color_palette, ci_show, print_as_title):
    """
    Helper function to produce a single KM survival plot, among observations
    in df by groups defined by condition_col.

    All inputs are required - this function is intended to be called by `plot_kmf`.
    """
    # make color inputs consistent hex format
    if colors.is_color_like(with_condition_color):
        with_condition_color = colors.to_hex(with_condition_color)
    if colors.is_color_like(no_condition_color):
        no_condition_color = colors.to_hex(no_condition_color)
    # prepare data to be plotted; producing 3 outputs:
    # - `condition`, series containing category labels to be plotted
    # - `label_map` (mapping condition values to plot labels)
    # - `color_map` (mapping condition values to plotted colors)
    if threshold is not None:
        is_median = threshold == "median"
        if is_median:
            threshold = df[condition_col].median()
        label_suffix = float_str(threshold)
        condition = df[condition_col] > threshold
        default_label_no_condition = "%s ≤ %s" % (condition_col, label_suffix)
        if is_median:
            label_suffix += " (median)"
        default_label_with_condition = "%s > %s" % (condition_col, label_suffix)
        with_condition_label = with_condition_label or default_label_with_condition
        no_condition_label = no_condition_label or default_label_no_condition
        if not label_map:
            label_map = {False: no_condition_label, True: with_condition_label}
        if not color_map:
            color_map = {False: no_condition_color, True: with_condition_color}
    elif df[condition_col].dtype == 'O' or df[condition_col].dtype.name == "category":
        condition = df[condition_col].astype("category")
        if not label_map:
            label_map = dict()
            for condition_value in condition.unique():
                label_map[condition_value] = '{} = {}'.format(condition_col, condition_value)
        if not color_map:
            rgb_values = sb.color_palette(color_palette, len(label_map.keys()))
            hex_values = [colors.to_hex(col) for col in rgb_values]
            color_map = dict(zip(label_map.keys(), hex_values))
    elif df[condition_col].dtype == 'bool':
        condition = df[condition_col]
        default_label_with_condition = "= {}".format(condition_col)
        default_label_no_condition = "¬ {}".format(condition_col)
        with_condition_label = with_condition_label or default_label_with_condition
        no_condition_label = no_condition_label or default_label_no_condition
        if not label_map:
            label_map = {False: no_condition_label, True: with_condition_label}
        if not color_map:
            color_map = {False: no_condition_color, True: with_condition_color}
    else:
        raise ValueError('Don\'t know how to plot data of type {}'.format(df[condition_col].dtype))

    # produce kmf plot for each category (group) identified above
    kmf = KaplanMeierFitter()
    grp_desc = list()
    grp_survival_data = dict()
    grp_event_data = dict()
    grp_names = list(condition.unique())
    for grp_name, grp_df in df.groupby(condition):
        grp_survival = grp_df[survival_col]
        grp_event = (grp_df[censor_col].astype(bool))
        grp_label = label_map[grp_name]
        grp_color = color_map[grp_name]
        kmf.fit(grp_survival, grp_event, label=grp_label)
        desc_str = "# {}: {}".format(grp_label, len(grp_survival))
        grp_desc.append(desc_str)
        grp_survival_data[grp_name] = grp_survival
        grp_event_data[grp_name] = grp_event
        if ax:
            ax = kmf.plot(ax=ax, show_censors=True, ci_show=ci_show, color=grp_color)
        else:
            ax = kmf.plot(show_censors=True, ci_show=ci_show, color=grp_color)

    # format the plot: set the y-axis to range 0 to 1, ticks as percentages
    ax.set_ylim(0, 1)
    y_tick_vals = ax.get_yticks()
    ax.set_yticklabels(["%d" % int(y_tick_val * 100) for y_tick_val in y_tick_vals])
    # plot title
    if title:
        ax.set_title(title)
    elif print_as_title:
        ax.set_title(' | '.join(grp_desc))
    else:
        for desc in grp_desc:
            print(desc)
    # axis labels
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)

    # summarize analytical version of results, again using same groups as are plotted
    if len(grp_names) == 2:
        # use log-rank test for 2 groups
        results = logrank_test(grp_survival_data[grp_names[0]],
                               grp_survival_data[grp_names[1]],
                               event_observed_A=grp_event_data[grp_names[0]],
                               event_observed_B=grp_event_data[grp_names[1]])
    elif len(grp_names) == 1:
        # no analytical result for 1 or 0 groups
        results = NullSurvivalResults()
    else:
        # cox PH fitter for >2 groups
        cf = CoxPHFitter()
        cox_df = patsy.dmatrix('+'.join([condition_col, survival_col, censor_col]),
                               df, return_type='dataframe')
        del cox_df['Intercept']
        results = cf.fit(cox_df, survival_col, event_col=censor_col)
        results.print_summary()
    # add metadata to results object so caller can print them
    results.survival_data_series = grp_survival_data
    results.event_data_series = grp_event_data
    results.desc = grp_desc
    return results
def logrank(out1, time1, out2, time2, alpha=0.05):
    return logrank_test(time1, time2, out1, out2, alpha=1 - alpha)
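# Usage sketch: the wrapper above converts a significance level (alpha=0.05)
# into the confidence-level convention used by these snippets (alpha=0.95);
# the inputs below are synthetic.
import numpy as np

t1, t2 = np.random.exponential(5, 100), np.random.exponential(2, 100)
o1, o2 = np.ones(100), np.ones(100)  # all events observed
print(logrank(o1, t1, o2, t2, alpha=0.05).p_value)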
import pandas as pd
import sys, os
import json
import numpy as np
from lifelines.statistics import logrank_test

data = sys.argv[1]
df = pd.read_json(data)
df.columns = df.columns.str.replace('\r', '')
values = df['value']

gp1_tm = np.array(values[0]).astype(float)
gp1_evt = np.array(values[1]).astype(float)
gp2_tm = np.array(values[2]).astype(float)
gp2_evt = np.array(values[3]).astype(float)

results = logrank_test(gp1_tm, gp2_tm, gp1_evt, gp2_evt)
pvalue = round(results.p_value, 6)
# pvalue = results.p_value

lst = {"logrank_p": pvalue}
json_last = json.dumps(lst, ensure_ascii=False)  # was the truthy string 'false', which meant ensure_ascii=True
print(json_last)
# for KM plotting
data_mat = clean_df.values[:, 2:]
coefs = [np.log(multi_result_dic['exp(coef)'][name]) for name in clean_df.columns[2:]]
coefs = np.array(coefs).reshape(-1, 1)
mat_for_plotting = np.concatenate([clean_df.values[:, :2], np.dot(data_mat, coefs)], axis=1)

# split patients at the median of the linear risk score
critical_val = np.median(mat_for_plotting, axis=0)[2]
abv_median = mat_for_plotting[mat_for_plotting[:, -1] >= critical_val]
abv_time = np.array(abv_median[:, 0]).tolist()
abv_event = np.array(abv_median[:, 1]).tolist()
below_median = mat_for_plotting[mat_for_plotting[:, -1] < critical_val]
blw_time = np.array(below_median[:, 0]).tolist()
blw_event = np.array(below_median[:, 1]).tolist()

multi_df = pd.DataFrame.from_dict(multi_result_dic)
multi_df.columns.values[0] = 'multi Hazard Ratio'
multi_df.columns.values[1] = 'multi lower'
multi_df.columns.values[2] = 'multi upper'
multi_df.columns.values[3] = 'multi p'
col_list = ['multi Hazard Ratio', 'multi lower', 'multi upper', 'multi p']
multi_df = multi_df[col_list]

result = pd.concat([uni_df, multi_df], axis=1)
result.index.name = "Variable"
result = result.reset_index()
result.to_csv("multi_output.csv", mode='w+', index=False)

rst = logrank_test(abv_time, blw_time, abv_event, blw_event)
pvalue = round(rst.p_value, 6)

last = {"abv_time": abv_time, "abv_event": abv_event,
        "blw_time": blw_time, "blw_event": blw_event, "pvalue": pvalue}
json_last = json.dumps(last, ensure_ascii=False)  # was the truthy string 'false'
print(json_last)